author     dim <dim@FreeBSD.org>  2012-08-15 19:34:23 +0000
committer  dim <dim@FreeBSD.org>  2012-08-15 19:34:23 +0000
commit     721c201bd55ffb73cb2ba8d39e0570fa38c44e15 (patch)
tree       eacfc83d988e4b9d11114387ae7dc41243f2a363
parent     2b2816e083a455f7a656ae88b0fd059d1688bb36 (diff)
download   FreeBSD-src-721c201bd55ffb73cb2ba8d39e0570fa38c44e15.zip
           FreeBSD-src-721c201bd55ffb73cb2ba8d39e0570fa38c44e15.tar.gz
Vendor import of llvm trunk r161861:
http://llvm.org/svn/llvm-project/llvm/trunk@161861
-rw-r--r--  .gitignore | 4
-rw-r--r--  CMakeLists.txt | 33
-rw-r--r--  CODE_OWNERS.TXT | 51
-rw-r--r--  CREDITS.TXT | 8
-rw-r--r--  Makefile.common | 3
-rw-r--r--  Makefile.config.in | 14
-rw-r--r--  Makefile.rules | 36
-rw-r--r--  README.txt | 1
-rwxr-xr-x  autoconf/config.guess | 4
-rw-r--r--  autoconf/configure.ac | 100
-rw-r--r--  autoconf/ltmain.sh | 6
-rw-r--r--  autoconf/m4/func_isinf.m4 | 2
-rw-r--r--  autoconf/m4/huge_val.m4 | 2
-rw-r--r--  autoconf/m4/libtool.m4 | 10
-rw-r--r--  autoconf/m4/rand48.m4 | 2
-rw-r--r--  bindings/ocaml/bitreader/llvm_bitreader.mli | 5
-rw-r--r--  bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c | 2
-rwxr-xr-x  cmake/config-ix.cmake | 3
-rwxr-xr-x  cmake/modules/AddLLVM.cmake | 168
-rw-r--r--  cmake/modules/HandleLLVMOptions.cmake | 20
-rw-r--r--  cmake/modules/LLVMConfig.cmake.in | 2
-rw-r--r--  cmake/modules/LLVMProcessSources.cmake | 10
-rw-r--r--  cmake/modules/TableGen.cmake | 4
-rwxr-xr-x  configure | 505
-rw-r--r--  docs/AliasAnalysis.html | 1067
-rw-r--r--  docs/AliasAnalysis.rst | 702
-rw-r--r--  docs/Atomics.html | 569
-rw-r--r--  docs/Atomics.rst | 441
-rw-r--r--  docs/BitCodeFormat.html | 1470
-rw-r--r--  docs/BitCodeFormat.rst | 1045
-rw-r--r--  docs/BranchWeightMetadata.html | 164
-rw-r--r--  docs/BranchWeightMetadata.rst | 118
-rw-r--r--  docs/Bugpoint.html | 239
-rw-r--r--  docs/Bugpoint.rst | 218
-rw-r--r--  docs/CMake.html | 584
-rw-r--r--  docs/CMake.rst | 423
-rw-r--r--  docs/CodeGenerator.html | 3189
-rw-r--r--  docs/CodeGenerator.rst | 2428
-rw-r--r--  docs/CodingStandards.html | 1568
-rw-r--r--  docs/CodingStandards.rst | 1147
-rw-r--r--  docs/CommandGuide/FileCheck.rst (renamed from docs/CommandGuide/FileCheck.pod) | 259
-rw-r--r--  docs/CommandGuide/Makefile | 103
-rw-r--r--  docs/CommandGuide/bugpoint.pod | 186
-rw-r--r--  docs/CommandGuide/bugpoint.rst | 247
-rw-r--r--  docs/CommandGuide/html/manpage.css | 256
-rw-r--r--  docs/CommandGuide/index.html | 142
-rw-r--r--  docs/CommandGuide/index.rst | 53
-rw-r--r--  docs/CommandGuide/lit.pod | 404
-rw-r--r--  docs/CommandGuide/lit.rst | 474
-rw-r--r--  docs/CommandGuide/llc.pod | 201
-rw-r--r--  docs/CommandGuide/llc.rst | 251
-rw-r--r--  docs/CommandGuide/lli.pod | 219
-rw-r--r--  docs/CommandGuide/lli.rst | 300
-rw-r--r--  docs/CommandGuide/llvm-ar.pod | 406
-rw-r--r--  docs/CommandGuide/llvm-ar.rst | 458
-rw-r--r--  docs/CommandGuide/llvm-as.pod | 77
-rw-r--r--  docs/CommandGuide/llvm-as.rst | 56
-rw-r--r--  docs/CommandGuide/llvm-bcanalyzer.pod | 315
-rw-r--r--  docs/CommandGuide/llvm-bcanalyzer.rst | 424
-rw-r--r--  docs/CommandGuide/llvm-build.pod | 86
-rw-r--r--  docs/CommandGuide/llvm-build.rst | 102
-rw-r--r--  docs/CommandGuide/llvm-config.pod | 131
-rw-r--r--  docs/CommandGuide/llvm-config.rst | 176
-rw-r--r--  docs/CommandGuide/llvm-cov.pod | 45
-rw-r--r--  docs/CommandGuide/llvm-cov.rst | 51
-rw-r--r--  docs/CommandGuide/llvm-diff.rst (renamed from docs/CommandGuide/llvm-diff.pod) | 39
-rw-r--r--  docs/CommandGuide/llvm-dis.pod | 60
-rw-r--r--  docs/CommandGuide/llvm-dis.rst | 69
-rw-r--r--  docs/CommandGuide/llvm-extract.pod | 85
-rw-r--r--  docs/CommandGuide/llvm-extract.rst | 104
-rw-r--r--  docs/CommandGuide/llvm-ld.pod | 234
-rw-r--r--  docs/CommandGuide/llvm-link.pod | 79
-rw-r--r--  docs/CommandGuide/llvm-link.rst | 96
-rw-r--r--  docs/CommandGuide/llvm-nm.pod | 122
-rw-r--r--  docs/CommandGuide/llvm-nm.rst | 189
-rw-r--r--  docs/CommandGuide/llvm-prof.pod | 57
-rw-r--r--  docs/CommandGuide/llvm-prof.rst | 63
-rw-r--r--  docs/CommandGuide/llvm-ranlib.pod | 52
-rw-r--r--  docs/CommandGuide/llvm-ranlib.rst | 61
-rw-r--r--  docs/CommandGuide/llvm-stress.pod | 42
-rw-r--r--  docs/CommandGuide/llvm-stress.rst | 48
-rw-r--r--  docs/CommandGuide/manpage.css | 256
-rw-r--r--  docs/CommandGuide/opt.pod | 143
-rw-r--r--  docs/CommandGuide/opt.rst | 183
-rw-r--r--  docs/CommandGuide/tblgen.pod | 139
-rw-r--r--  docs/CommandGuide/tblgen.rst | 186
-rw-r--r--  docs/CommandLine.html | 1976
-rw-r--r--  docs/CommandLine.rst | 1615
-rw-r--r--  docs/CompilerWriterInfo.html | 4
-rw-r--r--  docs/DebuggingJITedCode.html | 6
-rw-r--r--  docs/DeveloperPolicy.html | 642
-rw-r--r--  docs/DeveloperPolicy.rst | 508
-rw-r--r--  docs/ExceptionHandling.html | 563
-rw-r--r--  docs/ExceptionHandling.rst | 367
-rw-r--r--  docs/ExtendingLLVM.html | 4
-rw-r--r--  docs/FAQ.html | 948
-rw-r--r--  docs/FAQ.rst | 464
-rw-r--r--  docs/GCCFEBuildInstrs.html | 4
-rw-r--r--  docs/GarbageCollection.html | 6
-rw-r--r--  docs/GetElementPtr.html | 753
-rw-r--r--  docs/GetElementPtr.rst | 538
-rw-r--r--  docs/GettingStarted.html | 21
-rw-r--r--  docs/GettingStartedVS.html | 368
-rw-r--r--  docs/GettingStartedVS.rst | 234
-rw-r--r--  docs/GoldPlugin.html | 2
-rw-r--r--  docs/HowToAddABuilder.html | 2
-rw-r--r--  docs/HowToReleaseLLVM.html | 6
-rw-r--r--  docs/HowToSubmitABug.html | 13
-rw-r--r--  docs/LLVMBuild.html | 2
-rw-r--r--  docs/LangRef.html | 271
-rw-r--r--  docs/Lexicon.html | 292
-rw-r--r--  docs/Lexicon.rst | 194
-rw-r--r--  docs/LinkTimeOptimization.html | 401
-rw-r--r--  docs/LinkTimeOptimization.rst | 298
-rw-r--r--  docs/Makefile | 7
-rw-r--r--  docs/Makefile.sphinx | 155
-rw-r--r--  docs/MakefileGuide.html | 1039
-rw-r--r--  docs/MakefileGuide.rst | 956
-rw-r--r--  docs/Packaging.html | 119
-rw-r--r--  docs/Packaging.rst | 75
-rw-r--r--  docs/Passes.html | 11
-rw-r--r--  docs/ProgrammersManual.html | 34
-rw-r--r--  docs/Projects.html | 489
-rw-r--r--  docs/Projects.rst | 327
-rw-r--r--  docs/README.txt | 12
-rw-r--r--  docs/ReleaseNotes.html | 347
-rw-r--r--  docs/SegmentedStacks.html | 93
-rw-r--r--  docs/SegmentedStacks.rst | 80
-rw-r--r--  docs/SourceLevelDebugging.html | 10
-rw-r--r--  docs/SystemLibrary.html | 4
-rw-r--r--  docs/TableGenFundamentals.html | 973
-rw-r--r--  docs/TableGenFundamentals.rst | 799
-rw-r--r--  docs/TestSuiteMakefileGuide.html | 6
-rw-r--r--  docs/TestingGuide.html | 13
-rw-r--r--  docs/WritingAnLLVMBackend.html | 4
-rw-r--r--  docs/WritingAnLLVMPass.html | 4
-rw-r--r--  docs/_static/lines.gif (renamed from docs/img/lines.gif) | bin 91 -> 91 bytes
-rw-r--r--  docs/_static/llvm.css (renamed from docs/llvm.css) | 4
-rw-r--r--  docs/_templates/indexsidebar.html | 7
-rw-r--r--  docs/_templates/layout.html | 13
-rw-r--r--  docs/conf.py | 263
-rw-r--r--  docs/design_and_overview.rst | 36
-rw-r--r--  docs/development_process.rst | 30
-rw-r--r--  docs/doxygen.css | 2
-rw-r--r--  docs/img/Debugging.gif | bin 20390 -> 0 bytes
-rw-r--r--  docs/img/libdeps.gif | bin 52679 -> 0 bytes
-rw-r--r--  docs/img/objdeps.gif | bin 16201 -> 0 bytes
-rw-r--r--  docs/img/venusflytrap.jpg | bin 56606 -> 0 bytes
-rw-r--r--  docs/index.html | 286
-rw-r--r--  docs/index.rst | 70
-rw-r--r--  docs/llvm-theme/layout.html | 23
-rw-r--r--  docs/llvm-theme/static/contents.png | bin 0 -> 202 bytes
-rw-r--r--  docs/llvm-theme/static/llvm-theme.css | 374
-rw-r--r--  docs/llvm-theme/static/logo.png | bin 0 -> 9864 bytes
-rw-r--r--  docs/llvm-theme/static/navigation.png | bin 0 -> 218 bytes
-rw-r--r--  docs/llvm-theme/theme.conf | 4
-rw-r--r--  docs/mailing_lists.rst | 35
-rw-r--r--  docs/make.bat | 190
-rw-r--r--  docs/programming.rst | 40
-rw-r--r--  docs/subsystems.rst | 91
-rw-r--r--  docs/tutorial/LangImpl1.html | 4
-rw-r--r--  docs/tutorial/LangImpl2.html | 4
-rw-r--r--  docs/tutorial/LangImpl3.html | 6
-rw-r--r--  docs/tutorial/LangImpl4.html | 13
-rw-r--r--  docs/tutorial/LangImpl5.html | 6
-rw-r--r--  docs/tutorial/LangImpl6.html | 8
-rw-r--r--  docs/tutorial/LangImpl7.html | 6
-rw-r--r--  docs/tutorial/LangImpl8.html | 6
-rw-r--r--  docs/tutorial/Makefile | 30
-rw-r--r--  docs/tutorial/OCamlLangImpl1.html | 4
-rw-r--r--  docs/tutorial/OCamlLangImpl2.html | 4
-rw-r--r--  docs/tutorial/OCamlLangImpl3.html | 4
-rw-r--r--  docs/tutorial/OCamlLangImpl4.html | 11
-rw-r--r--  docs/tutorial/OCamlLangImpl5.html | 4
-rw-r--r--  docs/tutorial/OCamlLangImpl6.html | 6
-rw-r--r--  docs/tutorial/OCamlLangImpl7.html | 4
-rw-r--r--  docs/tutorial/OCamlLangImpl8.html | 4
-rw-r--r--  docs/tutorial/index.html | 2
-rw-r--r--  docs/userguides.rst | 89
-rw-r--r--  docs/yaml2obj.rst | 222
-rw-r--r--  examples/BrainF/BrainF.h | 2
-rw-r--r--  examples/BrainF/BrainFDriver.cpp | 1
-rw-r--r--  examples/ExceptionDemo/ExceptionDemo.cpp | 4
-rw-r--r--  examples/Fibonacci/fibonacci.cpp | 6
-rw-r--r--  examples/HowToUseJIT/HowToUseJIT.cpp | 6
-rw-r--r--  examples/Kaleidoscope/Chapter3/toy.cpp | 2
-rw-r--r--  examples/Kaleidoscope/Chapter4/toy.cpp | 2
-rw-r--r--  examples/Kaleidoscope/Chapter5/toy.cpp | 2
-rw-r--r--  examples/Kaleidoscope/Chapter6/toy.cpp | 2
-rw-r--r--  examples/Kaleidoscope/Chapter7/toy.cpp | 2
-rw-r--r--  include/llvm-c/Core.h | 21
-rw-r--r--  include/llvm-c/Disassembler.h | 6
-rw-r--r--  include/llvm-c/Linker.h | 42
-rw-r--r--  include/llvm-c/Target.h | 12
-rw-r--r--  include/llvm/ADT/APFloat.h | 1
-rw-r--r--  include/llvm/ADT/APInt.h | 50
-rw-r--r--  include/llvm/ADT/APSInt.h | 48
-rw-r--r--  include/llvm/ADT/ArrayRef.h | 2
-rw-r--r--  include/llvm/ADT/BitVector.h | 56
-rw-r--r--  include/llvm/ADT/DenseMap.h | 882
-rw-r--r--  include/llvm/ADT/DepthFirstIterator.h | 2
-rw-r--r--  include/llvm/ADT/FoldingSet.h | 105
-rw-r--r--  include/llvm/ADT/Hashing.h | 4
-rw-r--r--  include/llvm/ADT/ImmutableSet.h | 4
-rw-r--r--  include/llvm/ADT/IndexedMap.h | 9
-rw-r--r--  include/llvm/ADT/IntrusiveRefCntPtr.h | 34
-rw-r--r--  include/llvm/ADT/PointerIntPair.h | 13
-rw-r--r--  include/llvm/ADT/PointerUnion.h | 4
-rw-r--r--  include/llvm/ADT/PostOrderIterator.h | 75
-rw-r--r--  include/llvm/ADT/STLExtras.h | 22
-rw-r--r--  include/llvm/ADT/SmallBitVector.h | 83
-rw-r--r--  include/llvm/ADT/SmallString.h | 7
-rw-r--r--  include/llvm/ADT/SmallVector.h | 298
-rw-r--r--  include/llvm/ADT/SparseSet.h | 112
-rw-r--r--  include/llvm/ADT/StringRef.h | 29
-rw-r--r--  include/llvm/ADT/StringSwitch.h | 4
-rw-r--r--  include/llvm/ADT/TinyPtrVector.h | 184
-rw-r--r--  include/llvm/ADT/Triple.h | 14
-rw-r--r--  include/llvm/ADT/ValueMap.h | 22
-rw-r--r--  include/llvm/Analysis/AliasAnalysis.h | 13
-rw-r--r--  include/llvm/Analysis/BlockFrequencyImpl.h | 2
-rw-r--r--  include/llvm/Analysis/CodeMetrics.h | 8
-rw-r--r--  include/llvm/Analysis/Dominators.h | 8
-rw-r--r--  include/llvm/Analysis/InlineCost.h | 4
-rw-r--r--  include/llvm/Analysis/LoopInfo.h | 485
-rw-r--r--  include/llvm/Analysis/LoopInfoImpl.h | 570
-rw-r--r--  include/llvm/Analysis/LoopIterator.h | 42
-rw-r--r--  include/llvm/Analysis/MemoryBuiltins.h | 189
-rw-r--r--  include/llvm/Analysis/MemoryDependenceAnalysis.h | 7
-rw-r--r--  include/llvm/Analysis/ProfileInfoLoader.h | 5
-rw-r--r--  include/llvm/Analysis/RegionInfo.h | 80
-rw-r--r--  include/llvm/Analysis/ScalarEvolution.h | 8
-rw-r--r--  include/llvm/Analysis/ScalarEvolutionExpander.h | 6
-rw-r--r--  include/llvm/Analysis/ScalarEvolutionExpressions.h | 69
-rw-r--r--  include/llvm/Analysis/ValueTracking.h | 8
-rw-r--r--  include/llvm/Attributes.h | 73
-rw-r--r--  include/llvm/Bitcode/Archive.h | 22
-rw-r--r--  include/llvm/Bitcode/ReaderWriter.h | 18
-rw-r--r--  include/llvm/CMakeLists.txt | 2
-rw-r--r--  include/llvm/CodeGen/AsmPrinter.h | 7
-rw-r--r--  include/llvm/CodeGen/DFAPacketizer.h | 44
-rw-r--r--  include/llvm/CodeGen/EdgeBundles.h | 2
-rw-r--r--  include/llvm/CodeGen/FastISel.h | 14
-rw-r--r--  include/llvm/CodeGen/GCMetadata.h | 19
-rw-r--r--  include/llvm/CodeGen/GCStrategy.h | 16
-rw-r--r--  include/llvm/CodeGen/ISDOpcodes.h | 591
-rw-r--r--  include/llvm/CodeGen/LexicalScopes.h | 7
-rw-r--r--  include/llvm/CodeGen/LiveInterval.h | 189
-rw-r--r--  include/llvm/CodeGen/LiveIntervalAnalysis.h | 230
-rw-r--r--  include/llvm/CodeGen/LiveRangeEdit.h | 55
-rw-r--r--  include/llvm/CodeGen/MachineBasicBlock.h | 77
-rw-r--r--  include/llvm/CodeGen/MachineFrameInfo.h | 16
-rw-r--r--  include/llvm/CodeGen/MachineFunction.h | 4
-rw-r--r--  include/llvm/CodeGen/MachineInstr.h | 28
-rw-r--r--  include/llvm/CodeGen/MachineInstrBuilder.h | 13
-rw-r--r--  include/llvm/CodeGen/MachineInstrBundle.h | 4
-rw-r--r--  include/llvm/CodeGen/MachineJumpTableInfo.h | 20
-rw-r--r--  include/llvm/CodeGen/MachineLoopInfo.h | 14
-rw-r--r--  include/llvm/CodeGen/MachineOperand.h | 66
-rw-r--r--  include/llvm/CodeGen/MachinePassRegistry.h | 17
-rw-r--r--  include/llvm/CodeGen/MachineRegisterInfo.h | 93
-rw-r--r--  include/llvm/CodeGen/MachineScheduler.h | 11
-rw-r--r--  include/llvm/CodeGen/Passes.h | 80
-rw-r--r--  include/llvm/CodeGen/ProcessImplicitDefs.h | 51
-rw-r--r--  include/llvm/CodeGen/RegisterClassInfo.h (renamed from lib/CodeGen/RegisterClassInfo.h) | 0
-rw-r--r--  include/llvm/CodeGen/RegisterPressure.h | 282
-rw-r--r--  include/llvm/CodeGen/ScheduleDAG.h | 24
-rw-r--r--  include/llvm/CodeGen/ScheduleDAGInstrs.h | 28
-rw-r--r--  include/llvm/CodeGen/ScheduleHazardRecognizer.h | 4
-rw-r--r--  include/llvm/CodeGen/SelectionDAG.h | 77
-rw-r--r--  include/llvm/CodeGen/SelectionDAGISel.h | 37
-rw-r--r--  include/llvm/CodeGen/SelectionDAGNodes.h | 27
-rw-r--r--  include/llvm/CodeGen/SlotIndexes.h | 49
-rw-r--r--  include/llvm/CodeGen/TargetLoweringObjectFileImpl.h | 3
-rw-r--r--  include/llvm/CodeGen/ValueTypes.h | 120
-rw-r--r--  include/llvm/CodeGen/ValueTypes.td | 30
-rw-r--r--  include/llvm/Config/config.h.cmake | 29
-rw-r--r--  include/llvm/Config/config.h.in | 11
-rw-r--r--  include/llvm/Config/llvm-config.h.cmake | 3
-rw-r--r--  include/llvm/Config/llvm-config.h.in | 3
-rw-r--r--  include/llvm/Constant.h | 4
-rw-r--r--  include/llvm/Constants.h | 11
-rw-r--r--  include/llvm/DIBuilder.h (renamed from include/llvm/Analysis/DIBuilder.h) | 28
-rw-r--r--  include/llvm/DebugInfo.h (renamed from include/llvm/Analysis/DebugInfo.h) | 110
-rw-r--r--  include/llvm/DebugInfo/DIContext.h | 42
-rw-r--r--  include/llvm/ExecutionEngine/ExecutionEngine.h | 2
-rw-r--r--  include/llvm/ExecutionEngine/Interpreter.h | 2
-rw-r--r--  include/llvm/ExecutionEngine/JIT.h | 2
-rw-r--r--  include/llvm/ExecutionEngine/MCJIT.h | 2
-rw-r--r--  include/llvm/ExecutionEngine/RuntimeDyld.h | 11
-rw-r--r--  include/llvm/Function.h | 4
-rw-r--r--  include/llvm/GlobalValue.h | 10
-rw-r--r--  include/llvm/GlobalVariable.h | 29
-rw-r--r--  include/llvm/IRBuilder.h (renamed from include/llvm/Support/IRBuilder.h) | 29
-rw-r--r--  include/llvm/InitializePasses.h | 7
-rw-r--r--  include/llvm/Instruction.h | 33
-rw-r--r--  include/llvm/Instructions.h | 397
-rw-r--r--  include/llvm/Intrinsics.h | 47
-rw-r--r--  include/llvm/Intrinsics.td | 32
-rw-r--r--  include/llvm/IntrinsicsHexagon.td | 2978
-rw-r--r--  include/llvm/IntrinsicsMips.td | 264
-rw-r--r--  include/llvm/IntrinsicsNVVM.td | 952
-rw-r--r--  include/llvm/IntrinsicsPTX.td | 92
-rw-r--r--  include/llvm/IntrinsicsX86.td | 518
-rw-r--r--  include/llvm/LinkAllPasses.h | 1
-rw-r--r--  include/llvm/MC/EDInstInfo.h | 4
-rw-r--r--  include/llvm/MC/MCAsmInfo.h | 59
-rw-r--r--  include/llvm/MC/MCAssembler.h | 44
-rw-r--r--  include/llvm/MC/MCContext.h | 4
-rw-r--r--  include/llvm/MC/MCDirectives.h | 8
-rw-r--r--  include/llvm/MC/MCDisassembler.h | 12
-rw-r--r--  include/llvm/MC/MCELFObjectWriter.h | 44
-rw-r--r--  include/llvm/MC/MCExpr.h | 6
-rw-r--r--  include/llvm/MC/MCFixupKindInfo.h | 2
-rw-r--r--  include/llvm/MC/MCInstrItineraries.h | 46
-rw-r--r--  include/llvm/MC/MCMachObjectWriter.h | 3
-rw-r--r--  include/llvm/MC/MCObjectFileInfo.h | 24
-rw-r--r--  include/llvm/MC/MCObjectWriter.h | 5
-rw-r--r--  include/llvm/MC/MCRegisterInfo.h | 303
-rw-r--r--  include/llvm/MC/MCSchedule.h | 114
-rw-r--r--  include/llvm/MC/MCStreamer.h | 57
-rw-r--r--  include/llvm/MC/MCSubtargetInfo.h | 15
-rw-r--r--  include/llvm/MC/MCTargetAsmLexer.h | 34
-rw-r--r--  include/llvm/MC/MCTargetAsmParser.h | 13
-rw-r--r--  include/llvm/MC/MachineLocation.h | 10
-rw-r--r--  include/llvm/MC/SubtargetFeature.h | 22
-rw-r--r--  include/llvm/MDBuilder.h (renamed from include/llvm/Support/MDBuilder.h) | 29
-rw-r--r--  include/llvm/Metadata.h | 5
-rw-r--r--  include/llvm/Module.h | 11
-rw-r--r--  include/llvm/Object/Binary.h | 2
-rw-r--r--  include/llvm/Object/COFF.h | 6
-rw-r--r--  include/llvm/Object/ELF.h | 145
-rw-r--r--  include/llvm/Object/MachOFormat.h | 18
-rw-r--r--  include/llvm/Object/MachOObject.h | 3
-rw-r--r--  include/llvm/Object/ObjectFile.h | 11
-rw-r--r--  include/llvm/PassManagers.h | 3
-rw-r--r--  include/llvm/Support/AlignOf.h | 92
-rw-r--r--  include/llvm/Support/COFF.h | 10
-rw-r--r--  include/llvm/Support/CallSite.h | 5
-rw-r--r--  include/llvm/Support/CommandLine.h | 8
-rw-r--r--  include/llvm/Support/Compiler.h | 21
-rw-r--r--  include/llvm/Support/ConstantRange.h | 4
-rw-r--r--  include/llvm/Support/DataTypes.h.cmake | 12
-rw-r--r--  include/llvm/Support/DataTypes.h.in | 12
-rw-r--r--  include/llvm/Support/Debug.h | 8
-rw-r--r--  include/llvm/Support/DebugLoc.h | 9
-rw-r--r--  include/llvm/Support/ELF.h | 130
-rw-r--r--  include/llvm/Support/Endian.h | 8
-rw-r--r--  include/llvm/Support/FileOutputBuffer.h | 97
-rw-r--r--  include/llvm/Support/FileSystem.h | 109
-rw-r--r--  include/llvm/Support/GCOV.h | 20
-rw-r--r--  include/llvm/Support/GraphWriter.h | 4
-rw-r--r--  include/llvm/Support/InstVisitor.h | 53
-rw-r--r--  include/llvm/Support/IntegersSubset.h | 541
-rw-r--r--  include/llvm/Support/IntegersSubsetMapping.h | 580
-rw-r--r--  include/llvm/Support/LEB128.h | 58
-rw-r--r--  include/llvm/Support/MachO.h | 82
-rw-r--r--  include/llvm/Support/MathExtras.h | 4
-rw-r--r--  include/llvm/Support/PathV2.h | 6
-rw-r--r--  include/llvm/Support/Process.h | 10
-rw-r--r--  include/llvm/Support/SMLoc.h | 3
-rw-r--r--  include/llvm/Support/SourceMgr.h | 13
-rw-r--r--  include/llvm/Support/TargetRegistry.h | 20
-rw-r--r--  include/llvm/Support/ThreadLocal.h | 11
-rw-r--r--  include/llvm/Support/ValueHandle.h | 11
-rw-r--r--  include/llvm/Support/YAMLParser.h | 33
-rw-r--r--  include/llvm/Support/raw_ostream.h | 7
-rw-r--r--  include/llvm/Support/type_traits.h | 15
-rw-r--r--  include/llvm/TableGen/Record.h | 10
-rw-r--r--  include/llvm/TableGen/StringMatcher.h (renamed from utils/TableGen/StringMatcher.h) | 0
-rw-r--r--  include/llvm/TableGen/TableGenBackend.h | 28
-rw-r--r--  include/llvm/Target/Target.td | 74
-rw-r--r--  include/llvm/Target/TargetCallingConv.h | 6
-rw-r--r--  include/llvm/Target/TargetData.h | 8
-rw-r--r--  include/llvm/Target/TargetELFWriterInfo.h | 3
-rw-r--r--  include/llvm/Target/TargetInstrInfo.h | 188
-rw-r--r--  include/llvm/Target/TargetItinerary.td | 136
-rw-r--r--  include/llvm/Target/TargetLibraryInfo.h | 187
-rw-r--r--  include/llvm/Target/TargetLowering.h | 152
-rw-r--r--  include/llvm/Target/TargetMachine.h | 9
-rw-r--r--  include/llvm/Target/TargetOptions.h | 51
-rw-r--r--  include/llvm/Target/TargetRegisterInfo.h | 189
-rw-r--r--  include/llvm/Target/TargetSchedule.td | 131
-rw-r--r--  include/llvm/Target/TargetSelectionDAG.td | 12
-rw-r--r--  include/llvm/Transforms/Instrumentation.h | 7
-rw-r--r--  include/llvm/Transforms/Scalar.h | 5
-rw-r--r--  include/llvm/Transforms/Utils/BasicBlockUtils.h | 4
-rw-r--r--  include/llvm/Transforms/Utils/BuildLibCalls.h | 49
-rw-r--r--  include/llvm/Transforms/Utils/CodeExtractor.h | 127
-rw-r--r--  include/llvm/Transforms/Utils/FunctionUtils.h | 45
-rw-r--r--  include/llvm/Transforms/Utils/Local.h | 64
-rw-r--r--  include/llvm/Transforms/Utils/PromoteMemToReg.h | 1
-rw-r--r--  include/llvm/Transforms/Vectorize.h | 9
-rw-r--r--  include/llvm/TypeBuilder.h (renamed from include/llvm/Support/TypeBuilder.h) | 6
-rw-r--r--  include/llvm/TypeFinder.h | 78
-rw-r--r--  include/llvm/User.h | 4
-rw-r--r--  lib/Analysis/AliasAnalysis.cpp | 83
-rw-r--r--  lib/Analysis/AliasSetTracker.cpp | 6
-rw-r--r--  lib/Analysis/BasicAliasAnalysis.cpp | 45
-rw-r--r--  lib/Analysis/CMakeLists.txt | 4
-rw-r--r--  lib/Analysis/CaptureTracking.cpp | 2
-rw-r--r--  lib/Analysis/CodeMetrics.cpp | 26
-rw-r--r--  lib/Analysis/ConstantFolding.cpp | 62
-rw-r--r--  lib/Analysis/DbgInfoPrinter.cpp | 6
-rw-r--r--  lib/Analysis/IPA/CMakeLists.txt | 2
-rw-r--r--  lib/Analysis/IPA/CallGraphSCCPass.cpp | 4
-rw-r--r--  lib/Analysis/IPA/GlobalsModRef.cpp | 20
-rw-r--r--  lib/Analysis/IVUsers.cpp | 7
-rw-r--r--  lib/Analysis/InlineCost.cpp | 82
-rw-r--r--  lib/Analysis/InstructionSimplify.cpp | 30
-rw-r--r--  lib/Analysis/LazyValueInfo.cpp | 129
-rw-r--r--  lib/Analysis/LoopInfo.cpp | 10
-rw-r--r--  lib/Analysis/LoopPass.cpp | 10
-rw-r--r--  lib/Analysis/MemDepPrinter.cpp | 4
-rw-r--r--  lib/Analysis/MemoryBuiltins.cpp | 639
-rw-r--r--  lib/Analysis/MemoryDependenceAnalysis.cpp | 102
-rw-r--r--  lib/Analysis/ModuleDebugInfoPrinter.cpp | 4
-rw-r--r--  lib/Analysis/PathNumbering.cpp | 2
-rw-r--r--  lib/Analysis/ProfileInfoLoader.cpp | 6
-rw-r--r--  lib/Analysis/ProfileInfoLoaderPass.cpp | 2
-rw-r--r--  lib/Analysis/RegionInfo.cpp | 14
-rw-r--r--  lib/Analysis/RegionPass.cpp | 3
-rw-r--r--  lib/Analysis/RegionPrinter.cpp | 8
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp | 302
-rw-r--r--  lib/Analysis/ScalarEvolutionExpander.cpp | 46
-rw-r--r--  lib/Analysis/ValueTracking.cpp | 33
-rw-r--r--  lib/Archive/ArchiveReader.cpp | 7
-rw-r--r--  lib/Archive/ArchiveWriter.cpp | 7
-rw-r--r--  lib/AsmParser/LLLexer.cpp | 10
-rw-r--r--  lib/AsmParser/LLParser.cpp | 57
-rw-r--r--  lib/AsmParser/LLParser.h | 3
-rw-r--r--  lib/AsmParser/LLToken.h | 4
-rw-r--r--  lib/Bitcode/Reader/BitcodeReader.cpp | 206
-rw-r--r--  lib/Bitcode/Reader/CMakeLists.txt | 2
-rw-r--r--  lib/Bitcode/Writer/BitcodeWriter.cpp | 148
-rw-r--r--  lib/CodeGen/AggressiveAntiDepBreaker.cpp | 43
-rw-r--r--  lib/CodeGen/AllocationOrder.cpp | 2
-rw-r--r--  lib/CodeGen/Analysis.cpp | 100
-rw-r--r--  lib/CodeGen/AsmPrinter/ARMException.cpp | 4
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 34
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 33
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 99
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 2
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 43
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.h | 22
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfException.h | 11
-rw-r--r--  lib/CodeGen/BranchFolding.cpp | 202
-rw-r--r--  lib/CodeGen/CMakeLists.txt | 7
-rw-r--r--  lib/CodeGen/CalcSpillWeights.cpp | 53
-rw-r--r--  lib/CodeGen/CallingConvLower.cpp | 8
-rw-r--r--  lib/CodeGen/CodeGen.cpp | 4
-rw-r--r--  lib/CodeGen/CodePlacementOpt.cpp | 2
-rw-r--r--  lib/CodeGen/CriticalAntiDepBreaker.cpp | 81
-rw-r--r--  lib/CodeGen/CriticalAntiDepBreaker.h | 2
-rw-r--r--  lib/CodeGen/DFAPacketizer.cpp | 91
-rw-r--r--  lib/CodeGen/DeadMachineInstructionElim.cpp | 11
-rw-r--r--  lib/CodeGen/DwarfEHPrepare.cpp | 12
-rw-r--r--  lib/CodeGen/EarlyIfConversion.cpp | 803
-rw-r--r--  lib/CodeGen/ExecutionDepsFix.cpp | 5
-rw-r--r--  lib/CodeGen/ExpandPostRAPseudos.cpp | 11
-rw-r--r--  lib/CodeGen/IfConversion.cpp | 45
-rw-r--r--  lib/CodeGen/InlineSpiller.cpp | 20
-rw-r--r--  lib/CodeGen/InterferenceCache.cpp | 93
-rw-r--r--  lib/CodeGen/InterferenceCache.h | 34
-rw-r--r--  lib/CodeGen/IntrinsicLowering.cpp | 6
-rw-r--r--  lib/CodeGen/LLVMTargetMachine.cpp | 70
-rw-r--r--  lib/CodeGen/LexicalScopes.cpp | 2
-rw-r--r--  lib/CodeGen/LiveDebugVariables.cpp | 39
-rw-r--r--  lib/CodeGen/LiveInterval.cpp | 286
-rw-r--r--  lib/CodeGen/LiveIntervalAnalysis.cpp | 738
-rw-r--r--  lib/CodeGen/LiveIntervalUnion.cpp | 24
-rw-r--r--  lib/CodeGen/LiveIntervalUnion.h | 26
-rw-r--r--  lib/CodeGen/LiveRangeCalc.cpp | 120
-rw-r--r--  lib/CodeGen/LiveRangeCalc.h | 63
-rw-r--r--  lib/CodeGen/LiveRangeEdit.cpp | 120
-rw-r--r--  lib/CodeGen/LiveRegMatrix.cpp | 152
-rw-r--r--  lib/CodeGen/LiveRegMatrix.h | 148
-rw-r--r--  lib/CodeGen/LiveVariables.cpp | 69
-rw-r--r--  lib/CodeGen/LocalStackSlotAllocation.cpp | 3
-rw-r--r--  lib/CodeGen/MachineBasicBlock.cpp | 80
-rw-r--r--  lib/CodeGen/MachineBlockPlacement.cpp | 111
-rw-r--r--  lib/CodeGen/MachineCSE.cpp | 69
-rw-r--r--  lib/CodeGen/MachineCopyPropagation.cpp | 50
-rw-r--r--  lib/CodeGen/MachineFunction.cpp | 41
-rw-r--r--  lib/CodeGen/MachineFunctionPrinterPass.cpp | 9
-rw-r--r--  lib/CodeGen/MachineInstr.cpp | 295
-rw-r--r--  lib/CodeGen/MachineInstrBundle.cpp | 4
-rw-r--r--  lib/CodeGen/MachineLICM.cpp | 18
-rw-r--r--  lib/CodeGen/MachineLoopInfo.cpp | 16
-rw-r--r--  lib/CodeGen/MachinePassRegistry.cpp | 13
-rw-r--r--  lib/CodeGen/MachineRegisterInfo.cpp | 103
-rw-r--r--  lib/CodeGen/MachineSSAUpdater.cpp | 47
-rw-r--r--  lib/CodeGen/MachineScheduler.cpp | 926
-rw-r--r--  lib/CodeGen/MachineSink.cpp | 17
-rw-r--r--  lib/CodeGen/MachineTraceMetrics.cpp | 1153
-rw-r--r--  lib/CodeGen/MachineTraceMetrics.h | 341
-rw-r--r--  lib/CodeGen/MachineVerifier.cpp | 746
-rw-r--r--  lib/CodeGen/PHIElimination.cpp | 184
-rw-r--r--  lib/CodeGen/Passes.cpp | 273
-rw-r--r--  lib/CodeGen/PeepholeOptimizer.cpp | 156
-rw-r--r--  lib/CodeGen/PostRASchedulerList.cpp | 39
-rw-r--r--  lib/CodeGen/ProcessImplicitDefs.cpp | 374
-rw-r--r--  lib/CodeGen/PrologEpilogInserter.cpp | 2
-rw-r--r--  lib/CodeGen/RegAllocBase.cpp | 161
-rw-r--r--  lib/CodeGen/RegAllocBase.h | 85
-rw-r--r--  lib/CodeGen/RegAllocBasic.cpp | 171
-rw-r--r--  lib/CodeGen/RegAllocFast.cpp | 55
-rw-r--r--  lib/CodeGen/RegAllocGreedy.cpp | 238
-rw-r--r--  lib/CodeGen/RegAllocPBQP.cpp | 187
-rw-r--r--  lib/CodeGen/RegisterClassInfo.cpp | 7
-rw-r--r--  lib/CodeGen/RegisterCoalescer.cpp | 1252
-rw-r--r--  lib/CodeGen/RegisterCoalescer.h | 29
-rw-r--r--  lib/CodeGen/RegisterPressure.cpp | 841
-rw-r--r--  lib/CodeGen/RegisterScavenging.cpp | 25
-rw-r--r--  lib/CodeGen/RenderMachineFunction.cpp | 1013
-rw-r--r--  lib/CodeGen/RenderMachineFunction.h | 338
-rw-r--r--  lib/CodeGen/ScheduleDAG.cpp | 23
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp | 405
-rw-r--r--  lib/CodeGen/ScoreboardHazardRecognizer.cpp | 26
-rw-r--r--  lib/CodeGen/SelectionDAG/CMakeLists.txt | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 336
-rw-r--r--  lib/CodeGen/SelectionDAG/FastISel.cpp | 55
-rw-r--r--  lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 58
-rw-r--r--  lib/CodeGen/SelectionDAG/InstrEmitter.h | 6
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 1007
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 33
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 20
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.h | 7
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 10
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 57
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 79
-rw-r--r--  lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp | 6
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 15
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 42
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h | 7
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 218
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 542
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 14
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 8
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 79
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/TargetLowering.cpp | 162
-rw-r--r--  lib/CodeGen/ShadowStackGC.cpp | 8
-rw-r--r--  lib/CodeGen/SjLjEHPrepare.cpp | 20
-rw-r--r--  lib/CodeGen/SlotIndexes.cpp | 3
-rw-r--r--  lib/CodeGen/SpillPlacement.cpp | 11
-rw-r--r--  lib/CodeGen/SplitKit.cpp | 27
-rw-r--r--  lib/CodeGen/StackProtector.cpp | 25
-rw-r--r--  lib/CodeGen/StackSlotColoring.cpp | 3
-rw-r--r--  lib/CodeGen/StrongPHIElimination.cpp | 4
-rw-r--r--  lib/CodeGen/TailDuplication.cpp | 34
-rw-r--r--  lib/CodeGen/TargetInstrInfoImpl.cpp | 210
-rw-r--r--  lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 62
-rw-r--r--  lib/CodeGen/TwoAddressInstructionPass.cpp | 752
-rw-r--r--  lib/CodeGen/VirtRegMap.cpp | 179
-rw-r--r--  lib/CodeGen/VirtRegMap.h | 7
-rw-r--r--  lib/DebugInfo/DWARFCompileUnit.cpp | 25
-rw-r--r--  lib/DebugInfo/DWARFCompileUnit.h | 9
-rw-r--r--  lib/DebugInfo/DWARFContext.cpp | 74
-rw-r--r--  lib/DebugInfo/DWARFContext.h | 3
-rw-r--r--  lib/DebugInfo/DWARFDebugAranges.cpp | 2
-rw-r--r--  lib/DebugInfo/DWARFDebugInfoEntry.cpp | 51
-rw-r--r--  lib/DebugInfo/DWARFDebugInfoEntry.h | 9
-rw-r--r--  lib/DebugInfo/DWARFDebugLine.cpp | 117
-rw-r--r--  lib/DebugInfo/DWARFDebugLine.h | 68
-rw-r--r--  lib/ExecutionEngine/EventListenerCommon.h | 2
-rw-r--r--  lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp | 6
-rw-r--r--  lib/ExecutionEngine/Interpreter/CMakeLists.txt | 2
-rw-r--r--  lib/ExecutionEngine/Interpreter/Execution.cpp | 37
-rw-r--r--  lib/ExecutionEngine/JIT/JIT.cpp | 11
-rw-r--r--  lib/ExecutionEngine/JIT/JITEmitter.cpp | 13
-rw-r--r--  lib/ExecutionEngine/JIT/JITMemoryManager.cpp | 2
-rw-r--r--  lib/ExecutionEngine/MCJIT/MCJIT.cpp | 55
-rw-r--r--  lib/ExecutionEngine/MCJIT/MCJIT.h | 19
-rw-r--r--  lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h | 14
-rw-r--r--  lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp | 2
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/ObjectImage.h | 2
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 123
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 52
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 3
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 143
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp | 97
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h | 3
-rw-r--r--  lib/ExecutionEngine/TargetSelect.cpp | 11
-rw-r--r--  lib/Linker/LinkModules.cpp | 28
-rw-r--r--  lib/MC/CMakeLists.txt | 1
-rw-r--r--  lib/MC/ELFObjectWriter.cpp | 18
-rw-r--r--  lib/MC/MCAsmBackend.cpp | 2
-rw-r--r--  lib/MC/MCAsmInfo.cpp | 13
-rw-r--r--  lib/MC/MCAsmInfoCOFF.cpp | 4
-rw-r--r--  lib/MC/MCAsmInfoDarwin.cpp | 15
-rw-r--r--  lib/MC/MCAsmStreamer.cpp | 24
-rw-r--r--  lib/MC/MCAssembler.cpp | 7
-rw-r--r--  lib/MC/MCContext.cpp | 10
-rw-r--r--  lib/MC/MCDisassembler/Disassembler.h | 8
-rw-r--r--  lib/MC/MCDisassembler/EDDisassembler.cpp | 4
-rw-r--r--  lib/MC/MCDisassembler/EDMain.cpp | 32
-rw-r--r--  lib/MC/MCDwarf.cpp | 40
-rw-r--r--  lib/MC/MCELFObjectTargetWriter.cpp | 6
-rw-r--r--  lib/MC/MCELFStreamer.cpp | 4
-rw-r--r--  lib/MC/MCExpr.cpp | 4
-rw-r--r--  lib/MC/MCMachOStreamer.cpp | 48
-rw-r--r--  lib/MC/MCNullStreamer.cpp | 8
-rw-r--r--  lib/MC/MCObjectFileInfo.cpp | 8
-rw-r--r--  lib/MC/MCObjectWriter.cpp | 34
-rw-r--r--  lib/MC/MCParser/AsmParser.cpp | 511
-rw-r--r--  lib/MC/MCParser/DarwinAsmParser.cpp | 83
-rw-r--r--  lib/MC/MCParser/ELFAsmParser.cpp | 28
-rw-r--r--  lib/MC/MCPureStreamer.cpp | 4
-rw-r--r--  lib/MC/MCRegisterInfo.cpp | 71
-rw-r--r--  lib/MC/MCSectionCOFF.cpp | 6
-rw-r--r--  lib/MC/MCSectionELF.cpp | 12
-rw-r--r--  lib/MC/MCStreamer.cpp | 94
-rw-r--r--  lib/MC/MCSubtargetInfo.cpp | 33
-rw-r--r--  lib/MC/MCSymbol.cpp | 4
-rw-r--r--  lib/MC/MCWin64EH.cpp | 6
-rw-r--r--  lib/MC/MachObjectWriter.cpp | 53
-rw-r--r--  lib/MC/SubtargetFeature.cpp | 16
-rw-r--r--  lib/MC/WinCOFFStreamer.cpp | 4
-rw-r--r--  lib/Object/Archive.cpp | 6
-rw-r--r--  lib/Object/COFFObjectFile.cpp | 36
-rw-r--r--  lib/Object/MachOObject.cpp | 13
-rw-r--r--  lib/Object/MachOObjectFile.cpp | 18
-rw-r--r--  lib/Support/APFloat.cpp | 38
-rw-r--r--  lib/Support/APInt.cpp | 4
-rw-r--r--  lib/Support/CMakeLists.txt | 1
-rw-r--r--  lib/Support/CommandLine.cpp | 8
-rw-r--r--  lib/Support/ConstantRange.cpp | 84
-rw-r--r--  lib/Support/CrashRecoveryContext.cpp | 2
-rw-r--r--  lib/Support/Debug.cpp | 10
-rw-r--r--  lib/Support/Errno.cpp | 2
-rw-r--r--  lib/Support/FileOutputBuffer.cpp | 148
-rw-r--r--  lib/Support/GraphWriter.cpp | 1
-rw-r--r--  lib/Support/Host.cpp | 198
-rw-r--r--  lib/Support/Memory.cpp | 11
-rw-r--r--  lib/Support/MemoryBuffer.cpp | 19
-rw-r--r--  lib/Support/Mutex.cpp | 3
-rw-r--r--  lib/Support/Path.cpp | 7
-rw-r--r--  lib/Support/PathV2.cpp | 2
-rw-r--r--  lib/Support/SourceMgr.cpp | 121
-rw-r--r--  lib/Support/StreamableMemoryObject.cpp | 2
-rw-r--r--  lib/Support/StringMap.cpp | 2
-rw-r--r--  lib/Support/StringRef.cpp | 44
-rw-r--r--  lib/Support/TargetRegistry.cpp | 41
-rw-r--r--  lib/Support/ThreadLocal.cpp | 28
-rw-r--r--  lib/Support/Triple.cpp | 68
-rw-r--r--  lib/Support/Unix/Path.inc | 7
-rw-r--r--  lib/Support/Unix/PathV2.inc | 134
-rw-r--r--  lib/Support/Unix/Process.inc | 53
-rw-r--r--  lib/Support/Unix/Signals.inc | 50
-rw-r--r--  lib/Support/Unix/Unix.h | 10
-rw-r--r--  lib/Support/Windows/Path.inc | 16
-rw-r--r--  lib/Support/Windows/PathV2.inc | 72
-rw-r--r--  lib/Support/Windows/Process.inc | 14
-rw-r--r--  lib/Support/Windows/RWMutex.inc | 6
-rw-r--r--  lib/Support/Windows/ThreadLocal.inc | 13
-rw-r--r--  lib/Support/YAMLParser.cpp | 50
-rw-r--r--  lib/Support/raw_ostream.cpp | 7
-rw-r--r--  lib/TableGen/CMakeLists.txt | 1
-rw-r--r--  lib/TableGen/Main.cpp | 9
-rw-r--r--  lib/TableGen/Record.cpp | 2
-rw-r--r--  lib/TableGen/StringMatcher.cpp (renamed from utils/TableGen/StringMatcher.cpp) | 10
-rw-r--r--  lib/TableGen/TGParser.cpp | 246
-rw-r--r--  lib/TableGen/TGParser.h | 20
-rw-r--r--  lib/TableGen/TableGenBackend.cpp | 30
-rw-r--r--  lib/Target/ARM/ARM.td | 10
-rw-r--r--  lib/Target/ARM/ARMAsmPrinter.cpp | 106
-rw-r--r--  lib/Target/ARM/ARMAsmPrinter.h | 6
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 540
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h | 29
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.cpp | 35
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.h | 5
-rw-r--r--  lib/Target/ARM/ARMCallingConv.td | 31
-rw-r--r--  lib/Target/ARM/ARMCodeEmitter.cpp | 76
-rw-r--r--  lib/Target/ARM/ARMConstantIslandPass.cpp | 76
-rw-r--r--  lib/Target/ARM/ARMExpandPseudoInsts.cpp | 15
-rw-r--r--  lib/Target/ARM/ARMFastISel.cpp | 362
-rw-r--r--  lib/Target/ARM/ARMFrameLowering.cpp | 26
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 417
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 669
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h | 30
-rw-r--r--  lib/Target/ARM/ARMInstrFormats.td | 3
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.cpp | 3
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td | 548
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td | 895
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td | 48
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 292
-rw-r--r--  lib/Target/ARM/ARMInstrVFP.td | 93
-rw-r--r--  lib/Target/ARM/ARMJITInfo.cpp | 6
-rw-r--r--  lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 12
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.td | 32
-rw-r--r--  lib/Target/ARM/ARMSchedule.td | 24
-rw-r--r--  lib/Target/ARM/ARMScheduleA8.td | 50
-rw-r--r--  lib/Target/ARM/ARMScheduleA9.td | 58
-rw-r--r--  lib/Target/ARM/ARMSelectionDAGInfo.cpp | 7
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 37
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 5
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 24
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.cpp | 43
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.h | 4
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 341
-rw-r--r--  lib/Target/ARM/CMakeLists.txt | 2
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 199
-rw-r--r--  lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 140
-rw-r--r--  lib/Target/ARM/InstPrinter/ARMInstPrinter.h | 1
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 177
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 106
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 8
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 28
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 114
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 49
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 4
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 13
-rw-r--r--  lib/Target/ARM/MLxExpansionPass.cpp | 4
-rw-r--r--  lib/Target/ARM/README.txt | 21
-rw-r--r--  lib/Target/ARM/Thumb1InstrInfo.cpp | 8
-rw-r--r--  lib/Target/ARM/Thumb1RegisterInfo.cpp | 11
-rw-r--r--  lib/Target/ARM/Thumb1RegisterInfo.h | 3
-rw-r--r--  lib/Target/ARM/Thumb2ITBlockPass.cpp | 8
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.cpp | 54
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.h | 5
-rw-r--r--  lib/Target/ARM/Thumb2SizeReduction.cpp | 1
-rw-r--r--  lib/Target/CellSPU/CMakeLists.txt | 2
-rw-r--r--  lib/Target/CellSPU/README.txt | 14
-rw-r--r--  lib/Target/CellSPU/SPUAsmPrinter.cpp | 4
-rw-r--r--  lib/Target/CellSPU/SPUHazardRecognizers.cpp | 6
-rw-r--r--  lib/Target/CellSPU/SPUHazardRecognizers.h | 6
-rw-r--r--  lib/Target/CellSPU/SPUISelLowering.cpp | 73
-rw-r--r--  lib/Target/CellSPU/SPUISelLowering.h | 9
-rw-r--r--  lib/Target/CellSPU/SPUInstrInfo.cpp | 94
-rw-r--r--  lib/Target/CellSPU/SPUInstrInfo.td | 4
-rw-r--r--  lib/Target/CellSPU/SPURegisterInfo.cpp | 3
-rw-r--r--  lib/Target/CellSPU/SPURegisterInfo.h | 7
-rw-r--r--  lib/Target/CellSPU/SPUTargetMachine.cpp | 6
-rw-r--r--  lib/Target/CppBackend/CPPBackend.cpp | 33
-rw-r--r--  lib/Target/CppBackend/CPPTargetMachine.h | 4
-rw-r--r--  lib/Target/Hexagon/CMakeLists.txt | 4
-rw-r--r--  lib/Target/Hexagon/Hexagon.h | 8
-rw-r--r--  lib/Target/Hexagon/Hexagon.td | 14
-rw-r--r--  lib/Target/Hexagon/HexagonAsmPrinter.cpp | 58
-rw-r--r--  lib/Target/Hexagon/HexagonCallingConv.td | 8
-rw-r--r--  lib/Target/Hexagon/HexagonCallingConvLower.cpp | 7
-rw-r--r--  lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp | 16
-rw-r--r--  lib/Target/Hexagon/HexagonFrameLowering.cpp | 44
-rw-r--r--  lib/Target/Hexagon/HexagonHardwareLoops.cpp | 9
-rw-r--r--  lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 54
-rw-r--r--  lib/Target/Hexagon/HexagonISelLowering.cpp | 408
-rw-r--r--  lib/Target/Hexagon/HexagonISelLowering.h | 12
-rw-r--r--  lib/Target/Hexagon/HexagonImmediates.td | 2
-rw-r--r--  lib/Target/Hexagon/HexagonInstrFormats.td | 136
-rw-r--r--  lib/Target/Hexagon/HexagonInstrFormatsV4.td | 27
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.cpp | 1253
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.h | 13
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.td | 2285
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoV3.td | 55
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoV4.td | 3306
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfoV5.td | 626
-rw-r--r--  lib/Target/Hexagon/HexagonIntrinsics.td | 1247
-rw-r--r--  lib/Target/Hexagon/HexagonIntrinsicsDerived.td | 34
-rw-r--r--  lib/Target/Hexagon/HexagonIntrinsicsV5.td | 395
-rw-r--r--  lib/Target/Hexagon/HexagonMCInst.h | 41
-rw-r--r--  lib/Target/Hexagon/HexagonMCInstLower.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonNewValueJump.cpp | 647
-rw-r--r--  lib/Target/Hexagon/HexagonRegisterInfo.cpp | 23
-rw-r--r--  lib/Target/Hexagon/HexagonRegisterInfo.h | 4
-rw-r--r--  lib/Target/Hexagon/HexagonRegisterInfo.td | 16
-rw-r--r--  lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonSchedule.td | 37
-rw-r--r--  lib/Target/Hexagon/HexagonScheduleV4.td | 41
-rw-r--r--  lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp | 146
-rw-r--r--  lib/Target/Hexagon/HexagonSubtarget.cpp | 28
-rw-r--r--  lib/Target/Hexagon/HexagonSubtarget.h | 8
-rw-r--r--  lib/Target/Hexagon/HexagonTargetMachine.cpp | 28
-rw-r--r--  lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 3646
-rw-r--r--  lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp | 67
-rw-r--r--  lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h | 13
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 31
-rw-r--r--  lib/Target/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/MBlaze/CMakeLists.txt | 2
-rw-r--r--  lib/Target/MBlaze/MBlaze.td | 2
-rw-r--r--  lib/Target/MBlaze/MBlazeAsmPrinter.cpp | 12
-rw-r--r--  lib/Target/MBlaze/MBlazeISelLowering.cpp | 60
-rw-r--r--  lib/Target/MBlaze/MBlazeISelLowering.h | 8
-rw-r--r--  lib/Target/MBlaze/MBlazeInstrInfo.cpp | 2
-rw-r--r--  lib/Target/MBlaze/MBlazeInstrInfo.td | 4
-rw-r--r--  lib/Target/MBlaze/MBlazeMCInstLower.h | 6
-rw-r--r--  lib/Target/MBlaze/MBlazeSchedule.td | 5
-rw-r--r--  lib/Target/MBlaze/MBlazeSubtarget.cpp | 7
-rw-r--r--  lib/Target/MBlaze/MBlazeTargetMachine.cpp | 4
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp | 1
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h | 2
-rw-r--r--  lib/Target/MSP430/CMakeLists.txt | 2
-rw-r--r--  lib/Target/MSP430/MSP430AsmPrinter.cpp | 2
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.cpp | 56
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.h | 8
-rw-r--r--  lib/Target/MSP430/MSP430InstrInfo.cpp | 2
-rw-r--r--  lib/Target/MSP430/MSP430InstrInfo.h | 1
-rw-r--r--  lib/Target/MSP430/MSP430InstrInfo.td | 6
-rw-r--r--  lib/Target/MSP430/MSP430MCInstLower.h | 6
-rw-r--r--  lib/Target/MSP430/MSP430RegisterInfo.cpp | 3
-rw-r--r--  lib/Target/MSP430/MSP430RegisterInfo.h | 3
-rw-r--r--  lib/Target/MSP430/MSP430RegisterInfo.td | 6
-rw-r--r--  lib/Target/MSP430/MSP430TargetMachine.cpp | 4
-rw-r--r--  lib/Target/Mips/AsmParser/CMakeLists.txt | 3
-rw-r--r--  lib/Target/Mips/CMakeLists.txt | 11
-rw-r--r--  lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 193
-rw-r--r--  lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp | 20
-rw-r--r--  lib/Target/Mips/InstPrinter/MipsInstPrinter.h | 2
-rw-r--r--  lib/Target/Mips/MCTargetDesc/Makefile | 1
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 39
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h | 7
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 44
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h | 21
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 29
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h | 3
-rw-r--r--  lib/Target/Mips/Mips.h | 4
-rw-r--r--  lib/Target/Mips/Mips.td | 4
-rw-r--r--  lib/Target/Mips/Mips16FrameLowering.cpp | 87
-rw-r--r--  lib/Target/Mips/Mips16FrameLowering.h | 43
-rw-r--r--  lib/Target/Mips/Mips16InstrFormats.td | 663
-rw-r--r--  lib/Target/Mips/Mips16InstrInfo.cpp | 132
-rw-r--r--  lib/Target/Mips/Mips16InstrInfo.h | 76
-rw-r--r--  lib/Target/Mips/Mips16InstrInfo.td | 419
-rw-r--r--  lib/Target/Mips/Mips16RegisterInfo.cpp | 111
-rw-r--r--  lib/Target/Mips/Mips16RegisterInfo.h | 37
-rw-r--r--  lib/Target/Mips/Mips64InstrInfo.td | 169
-rw-r--r--  lib/Target/Mips/MipsAsmPrinter.cpp | 254
-rw-r--r--  lib/Target/Mips/MipsCallingConv.td | 52
-rw-r--r--  lib/Target/Mips/MipsCodeEmitter.cpp | 6
-rw-r--r--  lib/Target/Mips/MipsCondMov.td | 94
-rw-r--r--  lib/Target/Mips/MipsDelaySlotFiller.cpp | 75
-rw-r--r--  lib/Target/Mips/MipsEmitGPRestore.cpp | 97
-rw-r--r--  lib/Target/Mips/MipsExpandPseudo.cpp | 123
-rw-r--r--  lib/Target/Mips/MipsFrameLowering.cpp | 244
-rw-r--r--  lib/Target/Mips/MipsFrameLowering.h | 20
-rw-r--r--  lib/Target/Mips/MipsISelDAGToDAG.cpp | 141
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 809
-rw-r--r--  lib/Target/Mips/MipsISelLowering.h | 42
-rw-r--r--  lib/Target/Mips/MipsInstrFPU.td | 184
-rw-r--r--  lib/Target/Mips/MipsInstrFormats.td | 37
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.cpp | 289
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.h | 105
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.td | 530
-rw-r--r--  lib/Target/Mips/MipsJITInfo.cpp | 53
-rw-r--r--  lib/Target/Mips/MipsJITInfo.h | 6
-rw-r--r--  lib/Target/Mips/MipsLongBranch.cpp | 419
-rw-r--r--  lib/Target/Mips/MipsMCInstLower.cpp | 226
-rw-r--r--  lib/Target/Mips/MipsMCInstLower.h | 9
-rw-r--r--  lib/Target/Mips/MipsMachineFunction.cpp | 16
-rw-r--r--  lib/Target/Mips/MipsMachineFunction.h | 24
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.cpp | 145
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.h | 15
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.td | 236
-rw-r--r--  lib/Target/Mips/MipsSEFrameLowering.cpp | 210
-rw-r--r--  lib/Target/Mips/MipsSEFrameLowering.h | 44
-rw-r--r--  lib/Target/Mips/MipsSEInstrInfo.cpp | 320
-rw-r--r--  lib/Target/Mips/MipsSEInstrInfo.h | 86
-rw-r--r--  lib/Target/Mips/MipsSERegisterInfo.cpp | 138
-rw-r--r--  lib/Target/Mips/MipsSERegisterInfo.h | 39
-rw-r--r--  lib/Target/Mips/MipsSubtarget.cpp | 8
-rw-r--r--  lib/Target/Mips/MipsSubtarget.h | 6
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.cpp | 49
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.h | 121
-rw-r--r--  lib/Target/NVPTX/CMakeLists.txt | 34
-rw-r--r--  lib/Target/NVPTX/InstPrinter/CMakeLists.txt | 7
-rw-r--r--  lib/Target/NVPTX/InstPrinter/LLVMBuild.txt (renamed from lib/Target/PTX/InstPrinter/LLVMBuild.txt) | 8
-rw-r--r--  lib/Target/NVPTX/InstPrinter/Makefile (renamed from lib/Target/PTX/InstPrinter/Makefile) | 5
-rw-r--r--  lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp | 1
-rw-r--r--  lib/Target/NVPTX/LLVMBuild.txt (renamed from lib/Target/PTX/LLVMBuild.txt) | 12
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt | 9
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt (renamed from lib/Target/PTX/MCTargetDesc/LLVMBuild.txt) | 10
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/Makefile (renamed from lib/Target/PTX/MCTargetDesc/Makefile) | 4
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h | 88
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 63
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h (renamed from lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h) | 24
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 91
-rw-r--r--  lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h (renamed from lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h) | 18
-rw-r--r--  lib/Target/NVPTX/Makefile (renamed from lib/Target/PTX/Makefile) | 16
-rw-r--r--  lib/Target/NVPTX/ManagedStringPool.h | 49
-rw-r--r--  lib/Target/NVPTX/NVPTX.h | 137
-rw-r--r--  lib/Target/NVPTX/NVPTX.td | 44
-rw-r--r--  lib/Target/NVPTX/NVPTXAllocaHoisting.cpp | 48
-rw-r--r--  lib/Target/NVPTX/NVPTXAllocaHoisting.h | 49
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 2064
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.h | 315
-rw-r--r--  lib/Target/NVPTX/NVPTXFrameLowering.cpp | 76
-rw-r--r--  lib/Target/NVPTX/NVPTXFrameLowering.h | 40
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 683
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 105
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.cpp | 1291
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.h | 144
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrFormats.td | 43
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.cpp | 326
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.h | 83
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.td | 2837
-rw-r--r--  lib/Target/NVPTX/NVPTXIntrinsics.td | 1675
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 208
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerAggrCopies.h | 47
-rw-r--r--  lib/Target/NVPTX/NVPTXNumRegisters.h (renamed from lib/Target/PTX/PTXMachineFunctionInfo.cpp) | 14
-rw-r--r--  lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 325
-rw-r--r--  lib/Target/NVPTX/NVPTXRegisterInfo.h | 92
-rw-r--r--  lib/Target/NVPTX/NVPTXRegisterInfo.td | 108
-rw-r--r--  lib/Target/NVPTX/NVPTXSection.h | 45
-rw-r--r--  lib/Target/NVPTX/NVPTXSplitBBatBar.cpp | 77
-rw-r--r--  lib/Target/NVPTX/NVPTXSplitBBatBar.h | 41
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.cpp | 57
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.h | 92
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.cpp | 133
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.h | 125
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetObjectFile.h | 105
-rw-r--r--  lib/Target/NVPTX/NVPTXUtilities.cpp | 514
-rw-r--r--  lib/Target/NVPTX/NVPTXUtilities.h | 94
-rw-r--r--  lib/Target/NVPTX/NVPTXVector.td | 1481
-rw-r--r--  lib/Target/NVPTX/NVPTXutil.cpp | 92
-rw-r--r--  lib/Target/NVPTX/NVPTXutil.h | 25
-rw-r--r--  lib/Target/NVPTX/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/NVPTX/TargetInfo/LLVMBuild.txt (renamed from lib/Target/PTX/TargetInfo/LLVMBuild.txt) | 8
-rw-r--r--  lib/Target/NVPTX/TargetInfo/Makefile (renamed from lib/Target/PTX/TargetInfo/Makefile) | 4
-rw-r--r--  lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp | 23
-rw-r--r--  lib/Target/NVPTX/VectorElementize.cpp | 1248
-rw-r--r--  lib/Target/NVPTX/cl_common_defines.h | 125
-rw-r--r--  lib/Target/NVPTX/gen-register-defs.py | 202
-rw-r--r--  lib/Target/PTX/CMakeLists.txt | 32
-rw-r--r--  lib/Target/PTX/InstPrinter/CMakeLists.txt | 8
-rw-r--r--  lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp | 249
-rw-r--r--  lib/Target/PTX/InstPrinter/PTXInstPrinter.h | 45
-rw-r--r--  lib/Target/PTX/MCTargetDesc/CMakeLists.txt | 6
-rw-r--r--  lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h | 134
-rw-r--r--  lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp | 37
-rw-r--r--  lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp | 98
-rw-r--r--  lib/Target/PTX/PTX.h | 43
-rw-r--r--  lib/Target/PTX/PTX.td | 141
-rw-r--r--  lib/Target/PTX/PTXAsmPrinter.cpp | 561
-rw-r--r--  lib/Target/PTX/PTXAsmPrinter.h | 57
-rw-r--r--  lib/Target/PTX/PTXFPRoundingModePass.cpp | 181
-rw-r--r--  lib/Target/PTX/PTXFrameLowering.cpp | 24
-rw-r--r--  lib/Target/PTX/PTXFrameLowering.h | 44
-rw-r--r--  lib/Target/PTX/PTXISelDAGToDAG.cpp | 356
-rw-r--r--  lib/Target/PTX/PTXISelLowering.cpp | 522
-rw-r--r--  lib/Target/PTX/PTXISelLowering.h | 82
-rw-r--r--  lib/Target/PTX/PTXInstrFormats.td | 51
-rw-r--r--  lib/Target/PTX/PTXInstrInfo.cpp | 359
-rw-r--r--  lib/Target/PTX/PTXInstrInfo.h | 133
-rw-r--r--  lib/Target/PTX/PTXInstrInfo.td | 1031
-rw-r--r--  lib/Target/PTX/PTXInstrLoadStore.td | 278
-rw-r--r--  lib/Target/PTX/PTXIntrinsicInstrInfo.td | 110
-rw-r--r--  lib/Target/PTX/PTXMCAsmStreamer.cpp | 556
-rw-r--r--  lib/Target/PTX/PTXMCInstLower.cpp | 32
-rw-r--r--  lib/Target/PTX/PTXMFInfoExtract.cpp | 85
-rw-r--r--  lib/Target/PTX/PTXMachineFunctionInfo.h | 202
-rw-r--r--  lib/Target/PTX/PTXParamManager.cpp | 73
-rw-r--r--  lib/Target/PTX/PTXParamManager.h | 87
-rw-r--r--  lib/Target/PTX/PTXRegAlloc.cpp | 53
-rw-r--r--  lib/Target/PTX/PTXRegisterInfo.cpp | 38
-rw-r--r--  lib/Target/PTX/PTXRegisterInfo.h | 56
-rw-r--r--  lib/Target/PTX/PTXRegisterInfo.td | 36
-rw-r--r--  lib/Target/PTX/PTXSelectionDAGInfo.cpp | 150
-rw-r--r--  lib/Target/PTX/PTXSelectionDAGInfo.h | 53
-rw-r--r--  lib/Target/PTX/PTXSubtarget.cpp | 68
-rw-r--r--  lib/Target/PTX/PTXSubtarget.h | 131
-rw-r--r--  lib/Target/PTX/PTXTargetMachine.cpp | 165
-rw-r--r--  lib/Target/PTX/PTXTargetMachine.h | 104
-rw-r--r--  lib/Target/PTX/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp | 25
-rw-r--r--  lib/Target/PowerPC/CMakeLists.txt | 3
-rw-r--r--  lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 27
-rw-r--r--  lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 2
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 1
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 2
-rw-r--r--  lib/Target/PowerPC/PPC.h | 19
-rw-r--r--  lib/Target/PowerPC/PPC.td | 43
-rw-r--r--  lib/Target/PowerPC/PPCAsmPrinter.cpp | 10
-rw-r--r--  lib/Target/PowerPC/PPCBranchSelector.cpp | 36
-rw-r--r--  lib/Target/PowerPC/PPCCTRLoops.cpp | 724
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.cpp | 40
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 61
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 228
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 14
-rw-r--r--  lib/Target/PowerPC/PPCInstr64Bit.td | 190
-rw-r--r--  lib/Target/PowerPC/PPCInstrAltivec.td | 10
-rw-r--r--  lib/Target/PowerPC/PPCInstrFormats.td | 6
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.cpp | 137
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.h | 3
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.td | 222
-rw-r--r--  lib/Target/PowerPC/PPCJITInfo.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCMCInstLower.cpp | 20
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 35
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.h | 7
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.td | 14
-rw-r--r--  lib/Target/PowerPC/PPCSchedule.td | 39
-rw-r--r--  lib/Target/PowerPC/PPCSchedule440.td | 29
-rw-r--r--  lib/Target/PowerPC/PPCScheduleA2.td | 66
-rw-r--r--  lib/Target/PowerPC/PPCScheduleG3.td | 1
-rw-r--r--  lib/Target/PowerPC/PPCScheduleG4.td | 1
-rw-r--r--  lib/Target/PowerPC/PPCScheduleG4Plus.td | 1
-rw-r--r--  lib/Target/PowerPC/PPCScheduleG5.td | 1
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.cpp | 62
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.h | 8
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.cpp | 27
-rw-r--r--  lib/Target/PowerPC/README.txt | 1
-rw-r--r--  lib/Target/PowerPC/TargetInfo/Makefile | 2
-rw-r--r--  lib/Target/README.txt | 6
-rw-r--r--  lib/Target/Sparc/CMakeLists.txt | 2
-rw-r--r--  lib/Target/Sparc/DelaySlotFiller.cpp | 11
-rw-r--r--  lib/Target/Sparc/SparcAsmPrinter.cpp | 4
-rw-r--r--  lib/Target/Sparc/SparcFrameLowering.h | 5
-rw-r--r--  lib/Target/Sparc/SparcISelLowering.cpp | 35
-rw-r--r--  lib/Target/Sparc/SparcISelLowering.h | 7
-rw-r--r--  lib/Target/Sparc/SparcInstrInfo.cpp | 12
-rw-r--r--  lib/Target/Sparc/SparcRegisterInfo.cpp | 3
-rw-r--r--  lib/Target/Sparc/SparcTargetMachine.cpp | 9
-rw-r--r--  lib/Target/Sparc/SparcTargetMachine.h | 2
-rw-r--r--  lib/Target/TargetData.cpp | 4
-rw-r--r--  lib/Target/TargetInstrInfo.cpp | 60
-rw-r--r--  lib/Target/TargetLibraryInfo.cpp | 105
-rw-r--r--  lib/Target/TargetLoweringObjectFile.cpp | 2
-rw-r--r--  lib/Target/TargetMachine.cpp | 54
-rw-r--r--  lib/Target/TargetRegisterInfo.cpp | 146
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 162
-rw-r--r--  lib/Target/X86/CMakeLists.txt | 2
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp | 67
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.h | 10
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.c | 16
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 74
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h | 25
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp | 34
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 71
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 24
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 68
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 1
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.cpp | 90
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.h | 18
-rw-r--r--  lib/Target/X86/X86.h | 7
-rw-r--r--  lib/Target/X86/X86.td | 52
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 10
-rw-r--r--  lib/Target/X86/X86AsmPrinter.h | 8
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.cpp | 1
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.h | 4
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 13
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp | 786
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 169
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 20
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 167
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 2
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 316
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 2939
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 73
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td | 6
-rw-r--r--  lib/Target/X86/X86InstrBuilder.h | 16
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 24
-rw-r--r--  lib/Target/X86/X86InstrControl.td | 48
-rw-r--r--  lib/Target/X86/X86InstrExtension.td | 8
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 326
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td | 185
-rw-r--r--  lib/Target/X86/X86InstrFormats.td | 33
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 46
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 1099
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 46
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 586
-rw-r--r--  lib/Target/X86/X86InstrMMX.td | 421
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 1390
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 293
-rw-r--r--  lib/Target/X86/X86InstrVMX.td | 8
-rw-r--r--  lib/Target/X86/X86InstrXOP.td | 109
-rw-r--r--  lib/Target/X86/X86JITInfo.h | 2
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 110
-rw-r--r--  lib/Target/X86/X86MCInstLower.h | 6
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.h | 20
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 111
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 14
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td | 111
-rw-r--r--  lib/Target/X86/X86Relocations.h | 2
-rw-r--r--  lib/Target/X86/X86Schedule.td | 218
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td | 229
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 8
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 89
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 18
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 23
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 13
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 10
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 6
-rw-r--r--  lib/Target/XCore/CMakeLists.txt | 2
-rw-r--r--  lib/Target/XCore/XCoreAsmPrinter.cpp | 14
-rw-r--r--  lib/Target/XCore/XCoreFrameLowering.cpp | 10
-rw-r--r--  lib/Target/XCore/XCoreFrameLowering.h | 3
-rw-r--r--  lib/Target/XCore/XCoreISelLowering.cpp | 50
-rw-r--r--  lib/Target/XCore/XCoreISelLowering.h | 11
-rw-r--r--  lib/Target/XCore/XCoreInstrInfo.td | 16
-rw-r--r--  lib/Target/XCore/XCoreRegisterInfo.cpp | 10
-rw-r--r--  lib/Target/XCore/XCoreRegisterInfo.h | 2
-rw-r--r--  lib/Target/XCore/XCoreTargetMachine.cpp | 2
-rw-r--r--  lib/Transforms/IPO/ArgumentPromotion.cpp | 14
-rw-r--r--  lib/Transforms/IPO/CMakeLists.txt | 2
-rw-r--r--  lib/Transforms/IPO/DeadArgumentElimination.cpp | 8
-rw-r--r--  lib/Transforms/IPO/ExtractGV.cpp | 20
-rw-r--r--  lib/Transforms/IPO/GlobalDCE.cpp | 6
-rw-r--r--  lib/Transforms/IPO/GlobalOpt.cpp | 221
-rw-r--r--  lib/Transforms/IPO/Inliner.cpp | 21
-rw-r--r--  lib/Transforms/IPO/LoopExtractor.cpp | 7
-rw-r--r--  lib/Transforms/IPO/MergeFunctions.cpp | 14
-rw-r--r--  lib/Transforms/IPO/PartialInlining.cpp | 5
-rw-r--r--  lib/Transforms/IPO/StripSymbols.cpp | 7
-rw-r--r--  lib/Transforms/InstCombine/CMakeLists.txt | 2
-rw-r--r--  lib/Transforms/InstCombine/InstCombine.h | 4
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAddSub.cpp | 88
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 17
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp | 164
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCasts.cpp | 22
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCompares.cpp | 21
-rw-r--r--  lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 121
-rw-r--r--  lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 5
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSelect.cpp | 23
-rw-r--r--  lib/Transforms/InstCombine/InstCombineShifts.cpp | 76
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 29
-rw-r--r--  lib/Transforms/InstCombine/InstructionCombining.cpp | 250
-rw-r--r--  lib/Transforms/Instrumentation/AddressSanitizer.cpp | 291
-rw-r--r--  lib/Transforms/Instrumentation/BoundsChecking.cpp | 209
-rw-r--r--  lib/Transforms/Instrumentation/CMakeLists.txt | 3
-rw-r--r--  lib/Transforms/Instrumentation/GCOVProfiling.cpp | 120
-rw-r--r--  lib/Transforms/Instrumentation/Instrumentation.cpp | 5
-rw-r--r--  lib/Transforms/Instrumentation/PathProfiling.cpp | 2
-rw-r--r--  lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 267
-rw-r--r--  lib/Transforms/Scalar/ADCE.cpp | 16
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt | 2
-rw-r--r--  lib/Transforms/Scalar/CodeGenPrepare.cpp | 226
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp | 131
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp | 80
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp | 391
-rw-r--r--  lib/Transforms/Scalar/GlobalMerge.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp | 74
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp | 8
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp | 42
-rw-r--r--  lib/Transforms/Scalar/LoopDeletion.cpp | 50
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 12
-rw-r--r--  lib/Transforms/Scalar/LoopInstSimplify.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp | 7
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp | 264
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/LowerAtomic.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp | 178
-rw-r--r--  lib/Transforms/Scalar/ObjCARC.cpp | 658
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp | 1426
-rw-r--r--  lib/Transforms/Scalar/Reg2Mem.cpp | 29
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/ScalarReplAggregates.cpp | 467
-rw-r--r--  lib/Transforms/Scalar/SimplifyCFGPass.cpp | 76
-rw-r--r--  lib/Transforms/Scalar/SimplifyLibCalls.cpp | 189
-rw-r--r--  lib/Transforms/Scalar/Sink.cpp | 174
-rw-r--r--  lib/Transforms/Scalar/TailRecursionElimination.cpp | 12
-rw-r--r--  lib/Transforms/Utils/BasicBlockUtils.cpp | 33
-rw-r--r--  lib/Transforms/Utils/BreakCriticalEdges.cpp | 9
-rw-r--r--  lib/Transforms/Utils/BuildLibCalls.cpp | 160
-rw-r--r--  lib/Transforms/Utils/CMakeLists.txt | 2
-rw-r--r--  lib/Transforms/Utils/CloneFunction.cpp | 2
-rw-r--r--  lib/Transforms/Utils/CloneModule.cpp | 2
-rw-r--r--  lib/Transforms/Utils/CodeExtractor.cpp | 356
-rw-r--r--  lib/Transforms/Utils/InlineFunction.cpp | 22
-rw-r--r--  lib/Transforms/Utils/Local.cpp | 41
-rw-r--r--  lib/Transforms/Utils/LoopUnroll.cpp | 52
-rw-r--r--  lib/Transforms/Utils/LoopUnrollRuntime.cpp | 4
-rw-r--r--  lib/Transforms/Utils/LowerExpectIntrinsic.cpp | 50
-rw-r--r--  lib/Transforms/Utils/LowerSwitch.cpp | 62
-rw-r--r--  lib/Transforms/Utils/ModuleUtils.cpp | 2
-rw-r--r--  lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 4
-rw-r--r--  lib/Transforms/Utils/SSAUpdater.cpp | 57
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp | 324
-rw-r--r--  lib/Transforms/Utils/SimplifyIndVar.cpp | 2
-rw-r--r--  lib/Transforms/Vectorize/BBVectorize.cpp | 813
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt | 2
-rw-r--r--  lib/VMCore/AsmWriter.cpp | 80
-rw-r--r--  lib/VMCore/Attributes.cpp | 32
-rw-r--r--  lib/VMCore/AutoUpgrade.cpp | 219
-rw-r--r--  lib/VMCore/CMakeLists.txt | 14
-rw-r--r--  lib/VMCore/ConstantFold.cpp | 69
-rw-r--r--  lib/VMCore/Constants.cpp | 315
-rw-r--r--  lib/VMCore/Core.cpp | 21
-rw-r--r--  lib/VMCore/DIBuilder.cpp (renamed from lib/Analysis/DIBuilder.cpp) | 136
-rw-r--r--  lib/VMCore/DebugInfo.cpp (renamed from lib/Analysis/DebugInfo.cpp) | 561
-rw-r--r--  lib/VMCore/DebugLoc.cpp | 42
-rw-r--r--  lib/VMCore/Dominators.cpp | 83
-rw-r--r--  lib/VMCore/Function.cpp | 243
-rw-r--r--  lib/VMCore/GCOV.cpp | 28
-rw-r--r--  lib/VMCore/Globals.cpp | 12
-rw-r--r--  lib/VMCore/IRBuilder.cpp | 14
-rw-r--r--  lib/VMCore/Instruction.cpp | 53
-rw-r--r--  lib/VMCore/Instructions.cpp | 40
-rw-r--r--  lib/VMCore/Metadata.cpp | 165
-rw-r--r--  lib/VMCore/Module.cpp | 162
-rw-r--r--  lib/VMCore/PassManager.cpp | 23
-rw-r--r--  lib/VMCore/Type.cpp | 30
-rw-r--r--  lib/VMCore/TypeFinder.cpp | 148
-rw-r--r--  lib/VMCore/Value.cpp | 9
-rw-r--r--  lib/VMCore/ValueTypes.cpp | 8
-rw-r--r--  lib/VMCore/Verifier.cpp | 488
-rw-r--r--  projects/CMakeLists.txt | 3
-rw-r--r--  projects/sample/Makefile.llvm.config.in | 2
-rw-r--r--  projects/sample/Makefile.llvm.rules | 16
-rwxr-xr-x  projects/sample/autoconf/config.guess | 998
-rwxr-xr-x  projects/sample/autoconf/config.sub | 475
-rw-r--r--  projects/sample/autoconf/configure.ac | 13
-rw-r--r--  projects/sample/autoconf/m4/config_makefile.m4 | 4
-rw-r--r--  projects/sample/autoconf/m4/func_isinf.m4 | 2
-rw-r--r--  projects/sample/autoconf/m4/huge_val.m4 | 4
-rw-r--r--  projects/sample/autoconf/m4/link_options.m4 | 2
-rw-r--r--  projects/sample/autoconf/m4/rand48.m4 | 2
-rw-r--r--  projects/sample/autoconf/m4/visibility_inlines_hidden.m4 | 6
-rwxr-xr-x  projects/sample/configure | 45
-rw-r--r--  runtime/libprofile/CommonProfiling.c | 10
-rw-r--r--  runtime/libprofile/GCDAProfiling.c | 43
-rw-r--r--  test/Analysis/BasicAA/2003-02-26-AccessSizeTest.ll | 22
-rw-r--r--  test/Analysis/BasicAA/2003-04-22-GEPProblem.ll | 19
-rw-r--r--  test/Analysis/BasicAA/2003-09-19-LocalArgument.ll | 5
-rw-r--r--  test/Analysis/BasicAA/2003-11-04-SimpleCases.ll | 6
-rw-r--r--  test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll | 6
-rw-r--r--  test/Analysis/BasicAA/2004-07-28-MustAliasbug.ll | 15
-rw-r--r--  test/Analysis/BasicAA/2006-03-03-BadArraySubscript.ll | 4
-rw-r--r--  test/Analysis/BasicAA/2007-01-13-BasePointerBadNoAlias.ll | 9
-rw-r--r--  test/Analysis/BasicAA/2007-08-01-NoAliasAndCalls.ll | 7
-rw-r--r--  test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll | 9
-rw-r--r--  test/Analysis/BasicAA/2007-08-05-GetOverloadedModRef.ll | 5
-rw-r--r--  test/Analysis/BasicAA/2007-10-24-ArgumentsGlobals.ll | 4
-rw-r--r--  test/Analysis/BasicAA/2008-04-15-Byval.ll | 3
-rw-r--r--  test/Analysis/BasicAA/2008-11-23-NoaliasRet.ll | 4
-rw-r--r--  test/Analysis/BasicAA/2009-03-04-GEPNoalias.ll | 3
-rw-r--r--  test/Analysis/BasicAA/2009-10-13-GEP-BaseNoAlias.ll | 5
-rw-r--r--  test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll | 4
-rw-r--r--  test/Analysis/BasicAA/args-rets-allocas-loads.ll | 2
-rw-r--r--  test/Analysis/BasicAA/byval.ll | 16
-rw-r--r--  test/Analysis/BasicAA/cas.ll | 3
-rw-r--r--  test/Analysis/BasicAA/constant-over-index.ll | 2
-rw-r--r--  test/Analysis/BasicAA/dag.ll | 2
-rw-r--r--  test/Analysis/BasicAA/empty.ll | 6
-rw-r--r--  test/Analysis/BasicAA/full-store-partial-alias.ll | 6
-rw-r--r--  test/Analysis/BasicAA/gcsetest.ll | 19
-rw-r--r--  test/Analysis/BasicAA/gep-alias.ll | 2
-rw-r--r--  test/Analysis/BasicAA/getmodrefinfo-cs-cs.ll | 2
-rw-r--r--  test/Analysis/BasicAA/must-and-partial.ll | 2
-rw-r--r--  test/Analysis/BasicAA/no-escape-call.ll | 3
-rw-r--r--  test/Analysis/BasicAA/nocapture.ll | 3
-rw-r--r--  test/Analysis/BasicAA/phi-aa.ll | 4
-rw-r--r--  test/Analysis/BasicAA/phi-and-select.ll | 13
-rw-r--r--  test/Analysis/BasicAA/pure-const-dce.ll | 26
-rw-r--r--  test/Analysis/BasicAA/tailcall-modref.ll | 12
-rw-r--r--  test/Analysis/CallGraph/2008-09-09-DirectCall.ll | 8
-rw-r--r--  test/Analysis/CallGraph/2008-09-09-UsedByGlobal.ll | 4
-rw-r--r--  test/Analysis/CallGraph/no-intrinsics.ll | 2
-rw-r--r--  test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll | 5
-rw-r--r--  test/Analysis/GlobalsModRef/2008-09-03-ReadGlobals.ll | 4
-rw-r--r--  test/Analysis/GlobalsModRef/aliastest.ll | 7
-rw-r--r--  test/Analysis/GlobalsModRef/chaining-analysis.ll | 6
-rw-r--r--  test/Analysis/GlobalsModRef/indirect-global.ll | 5
-rw-r--r--  test/Analysis/GlobalsModRef/modreftest.ll | 7
-rw-r--r--  test/Analysis/GlobalsModRef/purecse.ll | 8
-rw-r--r--  test/Analysis/GlobalsModRef/volatile-instrs.ll | 34
-rw-r--r--  test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll | 2
-rw-r--r--test/Analysis/RegionInfo/block_sort.ll6
-rw-r--r--test/Analysis/RegionInfo/cond_loop.ll6
-rw-r--r--test/Analysis/RegionInfo/condition_complicated.ll6
-rw-r--r--test/Analysis/RegionInfo/condition_complicated_2.ll6
-rw-r--r--test/Analysis/RegionInfo/condition_forward_edge.ll6
-rw-r--r--test/Analysis/RegionInfo/condition_same_exit.ll6
-rw-r--r--test/Analysis/RegionInfo/condition_simple.ll6
-rw-r--r--test/Analysis/RegionInfo/exit_in_condition.ll6
-rw-r--r--test/Analysis/RegionInfo/infinite_loop.ll2
-rw-r--r--test/Analysis/RegionInfo/infinite_loop_2.ll6
-rw-r--r--test/Analysis/RegionInfo/infinite_loop_3.ll6
-rw-r--r--test/Analysis/RegionInfo/infinite_loop_4.ll6
-rw-r--r--test/Analysis/RegionInfo/loop_with_condition.ll6
-rw-r--r--test/Analysis/RegionInfo/loops_1.ll6
-rw-r--r--test/Analysis/RegionInfo/loops_2.ll6
-rw-r--r--test/Analysis/RegionInfo/mix_1.ll6
-rw-r--r--test/Analysis/RegionInfo/multiple_exiting_edge.ll4
-rw-r--r--test/Analysis/RegionInfo/nested_loops.ll6
-rw-r--r--test/Analysis/RegionInfo/next.ll6
-rw-r--r--test/Analysis/RegionInfo/paper.ll6
-rw-r--r--test/Analysis/RegionInfo/two_loops_same_header.ll6
-rw-r--r--test/Analysis/ScalarEvolution/2007-07-15-NegativeStride.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll4
-rw-r--r--test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll4
-rw-r--r--test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll2
-rw-r--r--test/Analysis/ScalarEvolution/2012-05-18-LoopPredRecurse.ll30
-rw-r--r--test/Analysis/ScalarEvolution/2012-05-29-MulAddRec.ll45
-rw-r--r--test/Analysis/ScalarEvolution/SolveQuadraticEquation.ll21
-rw-r--r--test/Analysis/ScalarEvolution/and-xor.ll2
-rw-r--r--test/Analysis/ScalarEvolution/avoid-smax-0.ll2
-rw-r--r--test/Analysis/ScalarEvolution/div-overflow.ll2
-rw-r--r--test/Analysis/ScalarEvolution/how-far-to-zero.ll27
-rw-r--r--test/Analysis/ScalarEvolution/scev-aa.ll2
-rw-r--r--test/Analysis/ScalarEvolution/sext-inreg.ll4
-rw-r--r--test/Analysis/ScalarEvolution/sext-iv-1.ll2
-rw-r--r--test/Analysis/ScalarEvolution/smax.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count2.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count3.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count4.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count5.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count6.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count7.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count8.ll2
-rw-r--r--test/Analysis/ScalarEvolution/xor-and.ll2
-rw-r--r--test/Assembler/2003-04-15-ConstantInitAssertion.ll3
-rw-r--r--test/Assembler/2003-05-21-MalformedShiftCrash.ll3
-rw-r--r--test/Assembler/2003-05-21-MalformedStructCrash.ll3
-rw-r--r--test/Assembler/2003-06-17-InvokeDisassemble.ll13
-rw-r--r--test/Assembler/2003-11-12-ConstantExprCast.ll2
-rw-r--r--test/Assembler/2003-11-24-SymbolTableCrash.ll2
-rw-r--r--test/Assembler/2004-01-11-getelementptrfolding.ll2
-rw-r--r--test/Assembler/2004-03-30-UnclosedFunctionCrash.ll2
-rw-r--r--test/Assembler/2004-11-28-InvalidTypeCrash.ll2
-rw-r--r--test/Assembler/2006-09-28-CrashOnInvalid.ll3
-rw-r--r--test/Assembler/2007-01-02-Undefined-Arg-Type.ll2
-rw-r--r--test/Assembler/2007-01-16-CrashOnBadCast.ll2
-rw-r--r--test/Assembler/2007-01-16-CrashOnBadCast2.ll2
-rw-r--r--test/Assembler/2007-03-18-InvalidNumberedVar.ll3
-rw-r--r--test/Assembler/2007-03-19-NegValue.ll2
-rw-r--r--test/Assembler/2007-04-20-AlignedLoad.ll2
-rw-r--r--test/Assembler/2007-04-20-AlignedStore.ll2
-rw-r--r--test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll2
-rw-r--r--test/Assembler/2007-08-06-AliasInvalid.ll3
-rw-r--r--test/Assembler/2007-09-29-GC.ll4
-rw-r--r--test/Assembler/2007-12-11-AddressSpaces.ll10
-rw-r--r--test/Assembler/2008-02-18-IntPointerCrash.ll2
-rw-r--r--test/Assembler/2008-09-02-FunctionNotes2.ll2
-rw-r--r--test/Assembler/ConstantExprFold.ll2
-rw-r--r--test/Assembler/extractvalue-invalid-idx.ll2
-rw-r--r--test/Assembler/getelementptr_struct.ll3
-rw-r--r--test/Assembler/half-constprop.ll17
-rw-r--r--test/Assembler/half-conv.ll13
-rw-r--r--test/Assembler/half.ll8
-rw-r--r--test/Assembler/insertvalue-invalid-idx.ll2
-rw-r--r--test/Assembler/invalid_cast.ll2
-rw-r--r--test/Assembler/invalid_cast2.ll2
-rw-r--r--test/Assembler/tls-models.ll11
-rw-r--r--test/Bindings/Ocaml/vmcore.ml354
-rw-r--r--test/Bitcode/2012-05-07-SwitchInstRangesSupport.ll33
-rw-r--r--test/Bitcode/arm32_neon_vcnt_upgrade.ll21
-rw-r--r--test/Bitcode/attributes.ll164
-rw-r--r--test/Bitcode/null-type.ll3
-rw-r--r--test/Bitcode/ptest-new.ll22
-rw-r--r--test/Bitcode/ptest-old.ll22
-rw-r--r--test/CMakeLists.txt103
-rw-r--r--test/CodeGen/ARM/2007-03-13-InstrSched.ll2
-rw-r--r--test/CodeGen/ARM/2007-04-03-PEIBug.ll2
-rw-r--r--test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll2
-rw-r--r--test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll2
-rw-r--r--test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll2
-rw-r--r--test/CodeGen/ARM/2009-04-06-AsmModifier.ll2
-rw-r--r--test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll2
-rw-r--r--test/CodeGen/ARM/2011-12-14-machine-sink.ll2
-rw-r--r--test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll8
-rw-r--r--test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll71
-rw-r--r--test/CodeGen/ARM/2012-05-29-TailDupBug.ll140
-rw-r--r--test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll41
-rw-r--r--test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll174
-rw-r--r--test/CodeGen/ARM/2012-08-08-legalize-unaligned.ll12
-rw-r--r--test/CodeGen/ARM/2012-08-09-neon-extload.ll102
-rw-r--r--test/CodeGen/ARM/2012-08-13-bfi.ll17
-rw-r--r--test/CodeGen/ARM/addrmode.ll2
-rw-r--r--test/CodeGen/ARM/aliases.ll2
-rw-r--r--test/CodeGen/ARM/arm-modifier.ll9
-rw-r--r--test/CodeGen/ARM/bicZext.ll19
-rw-r--r--test/CodeGen/ARM/call_nolink.ll2
-rw-r--r--test/CodeGen/ARM/cmn.ll22
-rw-r--r--test/CodeGen/ARM/coalesce-subregs.ll68
-rw-r--r--test/CodeGen/ARM/crash-greedy.ll46
-rw-r--r--test/CodeGen/ARM/cse-libcalls.ll2
-rw-r--r--test/CodeGen/ARM/data-in-code-annotations.ll42
-rw-r--r--test/CodeGen/ARM/debug-info-branch-folding.ll13
-rw-r--r--test/CodeGen/ARM/divmod.ll16
-rw-r--r--test/CodeGen/ARM/fabss.ll10
-rw-r--r--test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll17
-rw-r--r--test/CodeGen/ARM/fast-isel-call.ll95
-rw-r--r--test/CodeGen/ARM/fast-isel-frameaddr.ll100
-rw-r--r--test/CodeGen/ARM/fast-isel-intrinsic.ll32
-rw-r--r--test/CodeGen/ARM/fast-isel-shifter.ll50
-rw-r--r--test/CodeGen/ARM/fast-isel.ll12
-rw-r--r--test/CodeGen/ARM/fcopysign.ll6
-rw-r--r--test/CodeGen/ARM/floorf.ll29
-rw-r--r--test/CodeGen/ARM/fmuls.ll9
-rw-r--r--test/CodeGen/ARM/fparith.ll6
-rw-r--r--test/CodeGen/ARM/fusedMAC.ll47
-rw-r--r--test/CodeGen/ARM/iabs.ll20
-rw-r--r--test/CodeGen/ARM/ldrd.ll5
-rw-r--r--test/CodeGen/ARM/lsr-scale-addr-mode.ll2
-rw-r--r--test/CodeGen/ARM/movt-movw-global.ll8
-rw-r--r--test/CodeGen/ARM/neon_div.ll6
-rw-r--r--test/CodeGen/ARM/opt-shuff-tstore.ll2
-rw-r--r--test/CodeGen/ARM/pr13249.ll27
-rw-r--r--test/CodeGen/ARM/select.ll26
-rw-r--r--test/CodeGen/ARM/smml.ll13
-rw-r--r--test/CodeGen/ARM/str_pre-2.ll9
-rw-r--r--test/CodeGen/ARM/str_pre.ll2
-rw-r--r--test/CodeGen/ARM/struct_byval.ll46
-rw-r--r--test/CodeGen/ARM/sub-cmp-peephole.ll65
-rw-r--r--test/CodeGen/ARM/sub.ll12
-rw-r--r--test/CodeGen/ARM/thread_pointer.ll2
-rw-r--r--test/CodeGen/ARM/thumb2-it-block.ll4
-rw-r--r--test/CodeGen/ARM/tls-models.ll117
-rw-r--r--test/CodeGen/ARM/tls1.ll6
-rw-r--r--test/CodeGen/ARM/tls3.ll2
-rw-r--r--test/CodeGen/ARM/twoaddrinstr.ll21
-rw-r--r--test/CodeGen/ARM/unsafe-fsub.ll18
-rw-r--r--test/CodeGen/ARM/vcnt.ll49
-rw-r--r--test/CodeGen/ARM/vector-extend-narrow.ll8
-rw-r--r--test/CodeGen/ARM/vfp.ll8
-rw-r--r--test/CodeGen/ARM/vlddup.ll32
-rw-r--r--test/CodeGen/ARM/vmul.ll74
-rw-r--r--test/CodeGen/ARM/vst3.ll2
-rw-r--r--test/CodeGen/CPP/2007-06-16-Funcname.ll1
-rw-r--r--test/CodeGen/CellSPU/fcmp32.ll6
-rw-r--r--test/CodeGen/CellSPU/fneg-fabs.ll4
-rw-r--r--test/CodeGen/CellSPU/icmp16.ll246
-rw-r--r--test/CodeGen/CellSPU/icmp32.ll247
-rw-r--r--test/CodeGen/CellSPU/icmp8.ll180
-rw-r--r--test/CodeGen/CellSPU/shift_ops.ll32
-rw-r--r--test/CodeGen/CellSPU/stores.ll6
-rw-r--r--test/CodeGen/CellSPU/trunc.ll28
-rw-r--r--test/CodeGen/Generic/2006-09-02-LocalAllocCrash.ll2
-rw-r--r--test/CodeGen/Generic/2009-06-03-UnreachableSplitPad.ll19
-rw-r--r--test/CodeGen/Generic/2012-06-08-APIntCrash.ll9
-rw-r--r--test/CodeGen/Generic/2012-07-15-BuildVectorPromote.ll8
-rw-r--r--test/CodeGen/Generic/asm-large-immediate.ll6
-rw-r--r--test/CodeGen/Generic/donothing.ll31
-rw-r--r--test/CodeGen/Generic/edge-bundles-blockIDs.ll2
-rw-r--r--test/CodeGen/Generic/print-after.ll6
-rw-r--r--test/CodeGen/Generic/print-machineinstrs.ll14
-rw-r--r--test/CodeGen/Generic/stop-after.ll10
-rw-r--r--test/CodeGen/Generic/undef-phi.ll26
-rw-r--r--test/CodeGen/Hexagon/args.ll3
-rw-r--r--test/CodeGen/Hexagon/combine.ll3
-rw-r--r--test/CodeGen/Hexagon/convertdptoint.ll26
-rw-r--r--test/CodeGen/Hexagon/convertdptoll.ll27
-rw-r--r--test/CodeGen/Hexagon/convertsptoint.ll26
-rw-r--r--test/CodeGen/Hexagon/convertsptoll.ll27
-rw-r--r--test/CodeGen/Hexagon/dadd.ll19
-rw-r--r--test/CodeGen/Hexagon/dmul.ll18
-rw-r--r--test/CodeGen/Hexagon/double.ll3
-rw-r--r--test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll26
-rw-r--r--test/CodeGen/Hexagon/dsub.ll18
-rw-r--r--test/CodeGen/Hexagon/dualstore.ll17
-rw-r--r--test/CodeGen/Hexagon/fadd.ll18
-rw-r--r--test/CodeGen/Hexagon/fcmp.ll37
-rw-r--r--test/CodeGen/Hexagon/float.ll3
-rw-r--r--test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll22
-rw-r--r--test/CodeGen/Hexagon/fmul.ll19
-rw-r--r--test/CodeGen/Hexagon/frame.ll3
-rw-r--r--test/CodeGen/Hexagon/fsub.ll18
-rw-r--r--test/CodeGen/Hexagon/fusedandshift.ll16
-rw-r--r--test/CodeGen/Hexagon/macint.ll14
-rw-r--r--test/CodeGen/Hexagon/mpy.ll3
-rw-r--r--test/CodeGen/Hexagon/newvaluejump.ll33
-rw-r--r--test/CodeGen/Hexagon/newvaluejump2.ll30
-rw-r--r--test/CodeGen/Hexagon/newvaluestore.ll22
-rw-r--r--test/CodeGen/Hexagon/opt-fabs.ll15
-rw-r--r--test/CodeGen/Hexagon/opt-fneg.ll26
-rw-r--r--test/CodeGen/Hexagon/simpletailcall.ll14
-rw-r--r--test/CodeGen/Hexagon/static.ll9
-rw-r--r--test/CodeGen/Hexagon/struct_args.ll6
-rw-r--r--test/CodeGen/Hexagon/struct_args_large.ll7
-rw-r--r--test/CodeGen/Hexagon/vaddh.ll3
-rw-r--r--test/CodeGen/MSP430/2009-12-21-FrameAddr.ll6
-rw-r--r--test/CodeGen/MSP430/Inst8rr.ll2
-rw-r--r--test/CodeGen/Mips/2008-07-23-fpcmp.ll4
-rw-r--r--test/CodeGen/Mips/2008-07-29-icmp.ll2
-rw-r--r--test/CodeGen/Mips/2010-07-20-Switch.ll30
-rw-r--r--test/CodeGen/Mips/alloca.ll21
-rw-r--r--test/CodeGen/Mips/analyzebranch.ll6
-rw-r--r--test/CodeGen/Mips/and1.ll17
-rw-r--r--test/CodeGen/Mips/asm-large-immediate.ll10
-rw-r--r--test/CodeGen/Mips/atomic.ll34
-rwxr-xr-xtest/CodeGen/Mips/cmov.ll38
-rw-r--r--test/CodeGen/Mips/cprestore.ll4
-rw-r--r--test/CodeGen/Mips/eh.ll2
-rw-r--r--test/CodeGen/Mips/fabs.ll10
-rw-r--r--test/CodeGen/Mips/fastcc.ll253
-rw-r--r--test/CodeGen/Mips/fp-indexed-ls.ll12
-rw-r--r--test/CodeGen/Mips/fp-spill-reload.ll39
-rw-r--r--test/CodeGen/Mips/global-pointer-reg.ll4
-rw-r--r--test/CodeGen/Mips/gprestore.ll4
-rw-r--r--test/CodeGen/Mips/helloworld.ll34
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll15
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll16
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-K.ll16
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll16
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll17
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll16
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll16
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll44
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll20
-rw-r--r--test/CodeGen/Mips/inlineasm-operand-code.ll153
-rw-r--r--test/CodeGen/Mips/inlineasm_constraint.ll55
-rw-r--r--test/CodeGen/Mips/inlineasmmemop.ll2
-rw-r--r--test/CodeGen/Mips/internalfunc.ll8
-rw-r--r--test/CodeGen/Mips/largeimmprinting.ll5
-rw-r--r--test/CodeGen/Mips/lb1.ll18
-rw-r--r--test/CodeGen/Mips/lbu1.ll19
-rw-r--r--test/CodeGen/Mips/lh1.ll18
-rw-r--r--test/CodeGen/Mips/lhu1.ll19
-rw-r--r--test/CodeGen/Mips/load-store-left-right.ll29
-rw-r--r--test/CodeGen/Mips/longbranch.ll25
-rw-r--r--test/CodeGen/Mips/machineverifier.ll21
-rw-r--r--test/CodeGen/Mips/memcpy.ll19
-rw-r--r--test/CodeGen/Mips/mips64-fp-indexed-ls.ll12
-rw-r--r--test/CodeGen/Mips/mips64load-store-left-right.ll73
-rw-r--r--test/CodeGen/Mips/neg1.ll15
-rw-r--r--test/CodeGen/Mips/not1.ll16
-rw-r--r--test/CodeGen/Mips/null.ll13
-rw-r--r--test/CodeGen/Mips/o32_cc_byval.ll60
-rw-r--r--test/CodeGen/Mips/o32_cc_vararg.ll1
-rw-r--r--test/CodeGen/Mips/or1.ll17
-rw-r--r--test/CodeGen/Mips/ra-allocatable.ll288
-rw-r--r--test/CodeGen/Mips/rdhwr-directives.ll15
-rw-r--r--test/CodeGen/Mips/return_address.ll23
-rw-r--r--test/CodeGen/Mips/sb1.ll20
-rw-r--r--test/CodeGen/Mips/selectcc.ll27
-rw-r--r--test/CodeGen/Mips/sh1.ll20
-rw-r--r--test/CodeGen/Mips/shift-parts.ll29
-rw-r--r--test/CodeGen/Mips/sitofp-selectcc-opt.ll22
-rw-r--r--test/CodeGen/Mips/sll1.ll19
-rw-r--r--test/CodeGen/Mips/sll2.ll19
-rw-r--r--test/CodeGen/Mips/sra1.ll15
-rw-r--r--test/CodeGen/Mips/sra2.ll17
-rw-r--r--test/CodeGen/Mips/srl1.ll18
-rw-r--r--test/CodeGen/Mips/srl2.ll20
-rw-r--r--test/CodeGen/Mips/stacksize.ll9
-rw-r--r--test/CodeGen/Mips/sub1.ll15
-rw-r--r--test/CodeGen/Mips/sub2.ll17
-rw-r--r--test/CodeGen/Mips/swzero.ll3
-rw-r--r--test/CodeGen/Mips/tls-alias.ll10
-rw-r--r--test/CodeGen/Mips/tls-models.ll113
-rw-r--r--test/CodeGen/Mips/tls.ll18
-rw-r--r--test/CodeGen/Mips/unalignedload.ll30
-rw-r--r--test/CodeGen/Mips/xor1.ll17
-rw-r--r--test/CodeGen/Mips/zeroreg.ll6
-rw-r--r--test/CodeGen/NVPTX/annotations.ll55
-rw-r--r--test/CodeGen/NVPTX/arithmetic-fp-sm10.ll72
-rw-r--r--test/CodeGen/NVPTX/arithmetic-fp-sm20.ll72
-rw-r--r--test/CodeGen/NVPTX/arithmetic-int.ll295
-rw-r--r--test/CodeGen/NVPTX/calling-conv.ll32
-rw-r--r--test/CodeGen/NVPTX/compare-int.ll389
-rw-r--r--test/CodeGen/NVPTX/convert-fp.ll146
-rw-r--r--test/CodeGen/NVPTX/convert-int-sm10.ll55
-rw-r--r--test/CodeGen/NVPTX/convert-int-sm20.ll64
-rw-r--r--test/CodeGen/NVPTX/fma-disable.ll24
-rw-r--r--test/CodeGen/NVPTX/fma.ll17
-rw-r--r--test/CodeGen/NVPTX/intrinsic-old.ll (renamed from test/CodeGen/PTX/intrinsic.ll)71
-rw-r--r--test/CodeGen/NVPTX/intrinsics.ll21
-rw-r--r--test/CodeGen/NVPTX/ld-addrspace.ll173
-rw-r--r--test/CodeGen/NVPTX/ld-generic.ll63
-rw-r--r--test/CodeGen/NVPTX/lit.local.cfg (renamed from test/CodeGen/PTX/lit.local.cfg)3
-rw-r--r--test/CodeGen/NVPTX/simple-call.ll26
-rw-r--r--test/CodeGen/NVPTX/st-addrspace.ll179
-rw-r--r--test/CodeGen/NVPTX/st-generic.ll69
-rw-r--r--test/CodeGen/PTX/20110926-sitofp.ll24
-rw-r--r--test/CodeGen/PTX/add.ll71
-rw-r--r--test/CodeGen/PTX/aggregates.ll24
-rw-r--r--test/CodeGen/PTX/bitwise.ll24
-rw-r--r--test/CodeGen/PTX/bra.ll24
-rw-r--r--test/CodeGen/PTX/cvt.ll290
-rw-r--r--test/CodeGen/PTX/exit.ll14
-rw-r--r--test/CodeGen/PTX/fdiv-sm10.ll15
-rw-r--r--test/CodeGen/PTX/fdiv-sm13.ll15
-rw-r--r--test/CodeGen/PTX/fneg.ll15
-rw-r--r--test/CodeGen/PTX/ld.ll382
-rw-r--r--test/CodeGen/PTX/llvm-intrinsic.ll56
-rw-r--r--test/CodeGen/PTX/mad-disabling.ll24
-rw-r--r--test/CodeGen/PTX/mad.ll17
-rw-r--r--test/CodeGen/PTX/mov.ll62
-rw-r--r--test/CodeGen/PTX/mul.ll39
-rw-r--r--test/CodeGen/PTX/options.ll13
-rw-r--r--test/CodeGen/PTX/parameter-order.ll8
-rw-r--r--test/CodeGen/PTX/printf.ll25
-rw-r--r--test/CodeGen/PTX/ret.ll7
-rw-r--r--test/CodeGen/PTX/selp.ll25
-rw-r--r--test/CodeGen/PTX/setp.ll206
-rw-r--r--test/CodeGen/PTX/shl.ll22
-rw-r--r--test/CodeGen/PTX/shr.ll43
-rw-r--r--test/CodeGen/PTX/simple-call.ll27
-rw-r--r--test/CodeGen/PTX/st.ll337
-rw-r--r--test/CodeGen/PTX/stack-object.ll19
-rw-r--r--test/CodeGen/PTX/sub.ll71
-rw-r--r--test/CodeGen/PowerPC/2005-09-02-LegalizeDuplicatesCalls.ll2
-rw-r--r--test/CodeGen/PowerPC/2006-01-11-darwin-fp-argument.ll2
-rw-r--r--test/CodeGen/PowerPC/2006-04-05-splat-ish.ll2
-rw-r--r--test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll4
-rw-r--r--test/CodeGen/PowerPC/2007-04-30-InlineAsmEarlyClobber.ll2
-rw-r--r--test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll10
-rw-r--r--test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll2
-rw-r--r--test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll2
-rw-r--r--test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll2
-rw-r--r--test/CodeGen/PowerPC/2008-02-09-LocalRegAllocAssert.ll2
-rw-r--r--test/CodeGen/PowerPC/2009-08-17-inline-asm-addr-mode-breakage.ll2
-rw-r--r--test/CodeGen/PowerPC/2010-03-09-indirect-call.ll6
-rw-r--r--test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll4
-rw-r--r--test/CodeGen/PowerPC/Frames-leaf.ll32
-rw-r--r--test/CodeGen/PowerPC/Frames-small.ll32
-rw-r--r--test/CodeGen/PowerPC/LargeAbsoluteAddr.ll6
-rw-r--r--test/CodeGen/PowerPC/a2-fp-basic.ll2
-rw-r--r--test/CodeGen/PowerPC/and-imm.ll2
-rw-r--r--test/CodeGen/PowerPC/big-endian-actual-args.ll4
-rw-r--r--test/CodeGen/PowerPC/big-endian-call-result.ll4
-rw-r--r--test/CodeGen/PowerPC/branch-opt.ll2
-rw-r--r--test/CodeGen/PowerPC/calls.ll6
-rw-r--r--test/CodeGen/PowerPC/coalesce-ext.ll17
-rw-r--r--test/CodeGen/PowerPC/compare-simm.ll2
-rw-r--r--test/CodeGen/PowerPC/constants.ll2
-rw-r--r--test/CodeGen/PowerPC/ctrloop-reg.ll87
-rw-r--r--test/CodeGen/PowerPC/ctrloop-s000.ll156
-rw-r--r--test/CodeGen/PowerPC/ctrloop-sums.ll134
-rw-r--r--test/CodeGen/PowerPC/ctrloops.ll79
-rw-r--r--test/CodeGen/PowerPC/darwin-labels.ll2
-rw-r--r--test/CodeGen/PowerPC/fabs.ll4
-rw-r--r--test/CodeGen/PowerPC/fma.ll4
-rw-r--r--test/CodeGen/PowerPC/fnabs.ll2
-rw-r--r--test/CodeGen/PowerPC/fsqrt.ll8
-rw-r--r--test/CodeGen/PowerPC/iabs.ll4
-rw-r--r--test/CodeGen/PowerPC/isel.ll23
-rw-r--r--test/CodeGen/PowerPC/ispositive.ll2
-rw-r--r--test/CodeGen/PowerPC/lbzux.ll49
-rw-r--r--test/CodeGen/PowerPC/long-compare.ll4
-rw-r--r--test/CodeGen/PowerPC/lsr-postinc-pos.ll2
-rw-r--r--test/CodeGen/PowerPC/mem_update.ll4
-rw-r--r--test/CodeGen/PowerPC/no-dead-strip.ll2
-rw-r--r--test/CodeGen/PowerPC/ppc440-fp-basic.ll2
-rw-r--r--test/CodeGen/PowerPC/ppc64-cyclecounter.ll15
-rw-r--r--test/CodeGen/PowerPC/retaddr.ll2
-rw-r--r--test/CodeGen/PowerPC/rlwimi-commute.ll2
-rw-r--r--test/CodeGen/PowerPC/rlwimi3.ll4
-rw-r--r--test/CodeGen/PowerPC/seteq-0.ll2
-rw-r--r--test/CodeGen/PowerPC/small-arguments.ll2
-rw-r--r--test/CodeGen/PowerPC/stack-protector.ll4
-rw-r--r--test/CodeGen/PowerPC/stwu-gta.ll22
-rw-r--r--test/CodeGen/PowerPC/stwu8.ll28
-rw-r--r--test/CodeGen/PowerPC/stwux.ll47
-rw-r--r--test/CodeGen/PowerPC/tls.ll16
-rw-r--r--test/CodeGen/PowerPC/trampoline.ll2
-rw-r--r--test/CodeGen/PowerPC/vec_buildvector_loadstore.ll2
-rw-r--r--test/CodeGen/SPARC/2012-05-01-LowerArguments.ll13
-rw-r--r--test/CodeGen/SPARC/private.ll12
-rw-r--r--test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll12
-rw-r--r--test/CodeGen/Thumb/asmprinter-bug.ll2
-rw-r--r--test/CodeGen/Thumb/frame_thumb.ll4
-rw-r--r--test/CodeGen/Thumb/iabs.ll4
-rw-r--r--test/CodeGen/Thumb2/2010-01-06-TailDuplicateLabels.ll2
-rw-r--r--test/CodeGen/Thumb2/constant-islands.ll4
-rw-r--r--test/CodeGen/Thumb2/inflate-regs.ll49
-rw-r--r--test/CodeGen/Thumb2/inlineasm.ll9
-rw-r--r--test/CodeGen/Thumb2/large-call.ll9
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmn.ll32
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmp.ll18
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmp2.ll18
-rw-r--r--test/CodeGen/Thumb2/thumb2-jtb.ll8
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_post.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_pre.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-rev16.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ror.ll6
-rw-r--r--test/CodeGen/Thumb2/thumb2-tbb.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-teq.ll16
-rw-r--r--test/CodeGen/Thumb2/thumb2-teq2.ll18
-rw-r--r--test/CodeGen/Thumb2/thumb2-tst.ll16
-rw-r--r--test/CodeGen/Thumb2/thumb2-tst2.ll18
-rw-r--r--test/CodeGen/Thumb2/thumb2-uxt_rot.ll21
-rw-r--r--test/CodeGen/Thumb2/tls1.ll6
-rw-r--r--test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll2
-rw-r--r--test/CodeGen/X86/2003-11-03-GlobalBool.ll2
-rw-r--r--test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll6
-rw-r--r--test/CodeGen/X86/2004-03-30-Select-Max.ll3
-rw-r--r--test/CodeGen/X86/2006-03-01-InstrSchedBug.ll2
-rw-r--r--test/CodeGen/X86/2006-03-02-InstrSchedBug.ll2
-rw-r--r--test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll4
-rw-r--r--test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll4
-rw-r--r--test/CodeGen/X86/2006-05-02-InstrSched1.ll2
-rw-r--r--test/CodeGen/X86/2006-05-02-InstrSched2.ll2
-rw-r--r--test/CodeGen/X86/2006-05-08-InstrSched.ll2
-rw-r--r--test/CodeGen/X86/2006-05-11-InstrSched.ll4
-rw-r--r--test/CodeGen/X86/2006-07-31-SingleRegClass.ll4
-rw-r--r--test/CodeGen/X86/2006-08-21-ExtraMovInst.ll2
-rw-r--r--test/CodeGen/X86/2006-11-12-CSRetCC.ll4
-rw-r--r--test/CodeGen/X86/2006-11-17-IllegalMove.ll4
-rw-r--r--test/CodeGen/X86/2007-01-13-StackPtrIndex.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll2
-rw-r--r--test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll2
-rw-r--r--test/CodeGen/X86/2007-05-07-InvokeSRet.ll2
-rw-r--r--test/CodeGen/X86/2007-08-10-SignExtSubreg.ll2
-rw-r--r--test/CodeGen/X86/2007-09-05-InvalidAsm.ll3
-rw-r--r--test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll2
-rw-r--r--test/CodeGen/X86/2007-12-18-LoadCSEBug.ll2
-rw-r--r--test/CodeGen/X86/2008-01-08-SchedulerCrash.ll6
-rw-r--r--test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll2
-rw-r--r--test/CodeGen/X86/2008-02-18-TailMergingBug.ll2
-rw-r--r--test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll4
-rw-r--r--test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll2
-rw-r--r--test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll2
-rw-r--r--test/CodeGen/X86/2008-04-16-ReMatBug.ll2
-rw-r--r--test/CodeGen/X86/2008-04-17-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2008-04-28-CoalescerBug.ll6
-rw-r--r--test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll2
-rw-r--r--test/CodeGen/X86/2008-08-06-CmpStride.ll2
-rw-r--r--test/CodeGen/X86/2008-08-31-EH_RETURN32.ll27
-rw-r--r--test/CodeGen/X86/2008-09-17-inline-asm-1.ll2
-rw-r--r--test/CodeGen/X86/2008-09-18-inline-asm-2.ll2
-rw-r--r--test/CodeGen/X86/2008-10-24-FlippedCompare.ll2
-rw-r--r--test/CodeGen/X86/2008-10-27-CoalescerBug.ll5
-rw-r--r--test/CodeGen/X86/2008-12-23-crazy-address.ll2
-rw-r--r--test/CodeGen/X86/2009-01-31-BigShift2.ll2
-rw-r--r--test/CodeGen/X86/2009-02-25-CommuteBug.ll2
-rw-r--r--test/CodeGen/X86/2009-02-26-MachineLICMBug.ll4
-rw-r--r--test/CodeGen/X86/2009-03-12-CPAlignBug.ll2
-rw-r--r--test/CodeGen/X86/2009-03-23-MultiUseSched.ll4
-rw-r--r--test/CodeGen/X86/2009-04-16-SpillerUnfold.ll2
-rw-r--r--test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll2
-rw-r--r--test/CodeGen/X86/2009-04-24.ll6
-rw-r--r--test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll2
-rw-r--r--test/CodeGen/X86/2009-05-30-ISelBug.ll2
-rw-r--r--test/CodeGen/X86/20090313-signext.ll4
-rw-r--r--test/CodeGen/X86/2010-01-19-OptExtBug.ll2
-rw-r--r--test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll2
-rw-r--r--test/CodeGen/X86/2010-05-12-FastAllocKills.ll2
-rw-r--r--test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll2
-rw-r--r--test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll4
-rw-r--r--test/CodeGen/X86/2011-04-19-sclr-bb.ll21
-rw-r--r--test/CodeGen/X86/2011-06-03-x87chain.ll2
-rw-r--r--test/CodeGen/X86/2011-06-12-FastAllocSpill.ll2
-rw-r--r--test/CodeGen/X86/2011-09-18-sse2cmp.ll2
-rw-r--r--test/CodeGen/X86/2011-09-21-setcc-bug.ll2
-rw-r--r--test/CodeGen/X86/2011-10-11-srl.ll2
-rw-r--r--test/CodeGen/X86/2011-12-15-vec_shift.ll4
-rw-r--r--test/CodeGen/X86/2012-02-20-MachineCPBug.ll2
-rw-r--r--test/CodeGen/X86/2012-03-26-PostRALICMBug.ll4
-rw-r--r--test/CodeGen/X86/2012-04-26-sdglue.ll3
-rw-r--r--test/CodeGen/X86/2012-05-17-TwoAddressBug.ll16
-rw-r--r--test/CodeGen/X86/2012-05-19-CoalescerCrash.ll122
-rw-r--r--test/CodeGen/X86/2012-05-19-avx2-store.ll13
-rw-r--r--test/CodeGen/X86/2012-07-10-extload64.ll32
-rw-r--r--test/CodeGen/X86/2012-07-10-shufnorm.ll17
-rw-r--r--test/CodeGen/X86/2012-07-15-broadcastfold.ll23
-rw-r--r--test/CodeGen/X86/2012-07-15-tconst_shl.ll9
-rw-r--r--test/CodeGen/X86/2012-07-15-vshl.ll31
-rw-r--r--test/CodeGen/X86/2012-07-16-LeaUndef.ll16
-rw-r--r--test/CodeGen/X86/2012-07-16-fp2ui-i1.ll12
-rw-r--r--test/CodeGen/X86/2012-07-17-vtrunc.ll16
-rw-r--r--test/CodeGen/X86/2012-07-23-select_cc.ll19
-rw-r--r--test/CodeGen/X86/2012-08-07-CmpISelBug.ll36
-rw-r--r--test/CodeGen/X86/4char-promote.ll9
-rw-r--r--test/CodeGen/X86/MachineSink-PHIUse.ll2
-rw-r--r--test/CodeGen/X86/add.ll10
-rw-r--r--test/CodeGen/X86/addr-label-difference.ll2
-rw-r--r--test/CodeGen/X86/aligned-comm.ll4
-rw-r--r--test/CodeGen/X86/alignment-2.ll4
-rw-r--r--test/CodeGen/X86/alloca-align-rounding-32.ll7
-rw-r--r--test/CodeGen/X86/alloca-align-rounding.ll7
-rw-r--r--test/CodeGen/X86/andimm8.ll2
-rw-r--r--test/CodeGen/X86/asm-reg-type-mismatch.ll (renamed from test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll)19
-rw-r--r--test/CodeGen/X86/atom-lea-sp.ll26
-rw-r--r--test/CodeGen/X86/atom-sched.ll3
-rw-r--r--test/CodeGen/X86/atomic_op.ll2
-rw-r--r--test/CodeGen/X86/avx-blend.ll2
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll57
-rw-r--r--test/CodeGen/X86/avx-minmax.ll2
-rwxr-xr-xtest/CodeGen/X86/avx-shuffle-x86_32.ll2
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll62
-rw-r--r--test/CodeGen/X86/avx-vbroadcast.ll29
-rwxr-xr-xtest/CodeGen/X86/avx2-conversions.ll68
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86.ll179
-rw-r--r--test/CodeGen/X86/avx2-shuffle.ll28
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll178
-rw-r--r--test/CodeGen/X86/basic-promote-integers.ll4
-rw-r--r--test/CodeGen/X86/bigstructret.ll29
-rw-r--r--test/CodeGen/X86/blend-msb.ll2
-rw-r--r--test/CodeGen/X86/block-placement.ll10
-rw-r--r--test/CodeGen/X86/bool-simplify.ll42
-rw-r--r--test/CodeGen/X86/br-fold.ll2
-rw-r--r--test/CodeGen/X86/break-anti-dependencies.ll14
-rw-r--r--test/CodeGen/X86/break-sse-dep.ll7
-rw-r--r--test/CodeGen/X86/call-imm.ll8
-rw-r--r--test/CodeGen/X86/cfstring.ll2
-rw-r--r--test/CodeGen/X86/cmov-into-branch.ll63
-rw-r--r--test/CodeGen/X86/cmov.ll10
-rw-r--r--test/CodeGen/X86/cmp.ll61
-rw-r--r--test/CodeGen/X86/coalesce-esp.ll2
-rw-r--r--test/CodeGen/X86/coalescer-commute2.ll13
-rw-r--r--test/CodeGen/X86/coalescer-dce2.ll118
-rw-r--r--test/CodeGen/X86/coalescer-identity.ll82
-rw-r--r--test/CodeGen/X86/constant-pool-sharing.ll4
-rw-r--r--test/CodeGen/X86/constructor.ll27
-rw-r--r--test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll4
-rw-r--r--test/CodeGen/X86/crash.ll55
-rw-r--r--test/CodeGen/X86/ctpop-combine.ll2
-rw-r--r--test/CodeGen/X86/dagcombine-cse.ll2
-rw-r--r--test/CodeGen/X86/dbg-merge-loc-entry.ll2
-rw-r--r--test/CodeGen/X86/dbg-value-range.ll1
-rw-r--r--test/CodeGen/X86/divide-by-constant.ll21
-rw-r--r--test/CodeGen/X86/dynamic-allocas-VLAs.ll237
-rw-r--r--test/CodeGen/X86/early-ifcvt.ll69
-rw-r--r--test/CodeGen/X86/epilogue.ll6
-rw-r--r--test/CodeGen/X86/extractps.ll4
-rw-r--r--test/CodeGen/X86/fabs.ll40
-rw-r--r--test/CodeGen/X86/fast-cc-merge-stack-adj.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-constpool.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-gv.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-mem.ll14
-rw-r--r--test/CodeGen/X86/fast-isel-x86.ll14
-rw-r--r--test/CodeGen/X86/fast-isel.ll12
-rw-r--r--test/CodeGen/X86/fastcc-byval.ll2
-rw-r--r--test/CodeGen/X86/fma.ll12
-rwxr-xr-xtest/CodeGen/X86/fma3-intrinsics.ll132
-rw-r--r--test/CodeGen/X86/fma4-intrinsics-x86_64.ll224
-rw-r--r--test/CodeGen/X86/fma_patterns.ll139
-rw-r--r--test/CodeGen/X86/fold-load.ll26
-rw-r--r--test/CodeGen/X86/fold-pcmpeqd-1.ll13
-rw-r--r--test/CodeGen/X86/force-align-stack-alloca.ll70
-rw-r--r--test/CodeGen/X86/fp-immediate-shorten.ll2
-rw-r--r--test/CodeGen/X86/fp-in-intregs.ll2
-rw-r--r--test/CodeGen/X86/fp-stack-compare-cmov.ll12
-rw-r--r--test/CodeGen/X86/fp-stack-compare.ll7
-rw-r--r--test/CodeGen/X86/fp-stack-ret.ll2
-rw-r--r--test/CodeGen/X86/fp_load_fold.ll2
-rw-r--r--test/CodeGen/X86/full-lsr.ll16
-rw-r--r--test/CodeGen/X86/gather-addresses.ll4
-rw-r--r--test/CodeGen/X86/gs-fold.ll20
-rw-r--r--test/CodeGen/X86/h-register-addressing-32.ll2
-rw-r--r--test/CodeGen/X86/h-register-addressing-64.ll2
-rw-r--r--test/CodeGen/X86/h-registers-1.ll4
-rw-r--r--test/CodeGen/X86/hoist-invariant-load.ll2
-rw-r--r--test/CodeGen/X86/iabs.ll16
-rw-r--r--test/CodeGen/X86/illegal-vector-args-return.ll8
-rw-r--r--test/CodeGen/X86/inline-asm-error.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-modifier-n.ll2
-rw-r--r--test/CodeGen/X86/inline-asm.ll9
-rw-r--r--test/CodeGen/X86/inreg.ll46
-rw-r--r--test/CodeGen/X86/isel-sink2.ll2
-rw-r--r--test/CodeGen/X86/ispositive.ll2
-rw-r--r--test/CodeGen/X86/jump_sign.ll221
-rw-r--r--test/CodeGen/X86/label-redefinition.ll2
-rw-r--r--test/CodeGen/X86/large-global.ll11
-rw-r--r--test/CodeGen/X86/lea-2.ll2
-rw-r--r--test/CodeGen/X86/liveness-local-regalloc.ll34
-rw-r--r--test/CodeGen/X86/loop-blocks.ll4
-rw-r--r--test/CodeGen/X86/lsr-loop-exit-cond.ll17
-rw-r--r--test/CodeGen/X86/lsr-reuse-trunc.ll4
-rw-r--r--test/CodeGen/X86/lsr-static-addr.ll12
-rw-r--r--test/CodeGen/X86/machine-cse.ll57
-rw-r--r--test/CodeGen/X86/mem-promote-integers.ll4
-rw-r--r--test/CodeGen/X86/memcmp.ll3
-rw-r--r--test/CodeGen/X86/mmx-punpckhdq.ll2
-rw-r--r--test/CodeGen/X86/movgs.ll16
-rw-r--r--test/CodeGen/X86/multiple-loop-post-inc.ll10
-rw-r--r--test/CodeGen/X86/neg_cmp.ll22
-rw-r--r--test/CodeGen/X86/opt-shuff-tstore.ll2
-rw-r--r--test/CodeGen/X86/overlap-shift.ll2
-rw-r--r--test/CodeGen/X86/pass-three.ll16
-rw-r--r--test/CodeGen/X86/peep-vector-extract-insert.ll2
-rw-r--r--test/CodeGen/X86/phi-immediate-factoring.ll2
-rw-r--r--test/CodeGen/X86/phielim-split.ll30
-rw-r--r--test/CodeGen/X86/phys-reg-local-regalloc.ll20
-rw-r--r--test/CodeGen/X86/phys_subreg_coalesce-3.ll6
-rw-r--r--test/CodeGen/X86/pmul.ll4
-rw-r--r--test/CodeGen/X86/pointer-vector.ll3
-rw-r--r--test/CodeGen/X86/pr11415.ll2
-rw-r--r--test/CodeGen/X86/pr11468.ll33
-rw-r--r--test/CodeGen/X86/pr12889.ll18
-rw-r--r--test/CodeGen/X86/pr13209.ll74
-rw-r--r--test/CodeGen/X86/pr13220.ll20
-rw-r--r--test/CodeGen/X86/pr13577.ll8
-rw-r--r--test/CodeGen/X86/pr2656.ll2
-rw-r--r--test/CodeGen/X86/pr3522.ll2
-rw-r--r--test/CodeGen/X86/promote-trunc.ll2
-rw-r--r--test/CodeGen/X86/rd-mod-wr-eflags.ll46
-rw-r--r--test/CodeGen/X86/rdrand.ll85
-rw-r--r--test/CodeGen/X86/regpressure.ll4
-rw-r--r--test/CodeGen/X86/remat-fold-load.ll143
-rw-r--r--test/CodeGen/X86/remat-scalar-zero.ll2
-rw-r--r--test/CodeGen/X86/reverse_branches.ll104
-rw-r--r--test/CodeGen/X86/rotate.ll2
-rw-r--r--test/CodeGen/X86/rounding-ops.ll4
-rw-r--r--test/CodeGen/X86/segmented-stacks-dynamic.ll12
-rw-r--r--test/CodeGen/X86/select.ll132
-rw-r--r--test/CodeGen/X86/selectiondag-cse.ll69
-rw-r--r--test/CodeGen/X86/sext-setcc-self.ll55
-rw-r--r--test/CodeGen/X86/shift-and.ll46
-rw-r--r--test/CodeGen/X86/shift-coalesce.ll4
-rw-r--r--test/CodeGen/X86/shift-double.ll2
-rw-r--r--test/CodeGen/X86/shift-folding.ll2
-rw-r--r--test/CodeGen/X86/shl_elim.ll6
-rw-r--r--test/CodeGen/X86/sincos.ll26
-rw-r--r--test/CodeGen/X86/sink-hoist.ll2
-rw-r--r--test/CodeGen/X86/sink-out-of-loop.ll54
-rw-r--r--test/CodeGen/X86/splat-scalar-load.ll2
-rw-r--r--test/CodeGen/X86/sse-align-12.ll2
-rw-r--r--test/CodeGen/X86/sse-domains.ll2
-rw-r--r--test/CodeGen/X86/sse-minmax.ll285
-rw-r--r--test/CodeGen/X86/sse3.ll7
-rw-r--r--test/CodeGen/X86/sse41-blend.ll2
-rw-r--r--test/CodeGen/X86/sse41.ll4
-rw-r--r--test/CodeGen/X86/sse4a.ll56
-rw-r--r--test/CodeGen/X86/sse_reload_fold.ll2
-rw-r--r--test/CodeGen/X86/stack-align.ll4
-rw-r--r--test/CodeGen/X86/stack-protector.ll (renamed from test/CodeGen/X86/stack-protector-linux.ll)4
-rw-r--r--test/CodeGen/X86/store_op_load_fold2.ll4
-rw-r--r--test/CodeGen/X86/subreg-to-reg-1.ll2
-rw-r--r--test/CodeGen/X86/subreg-to-reg-4.ll2
-rw-r--r--test/CodeGen/X86/switch-order-weight.ll37
-rw-r--r--test/CodeGen/X86/tailcall-64.ll96
-rw-r--r--test/CodeGen/X86/tailcall-cgp-dup.ll87
-rw-r--r--test/CodeGen/X86/tailcall-i1.ll6
-rw-r--r--test/CodeGen/X86/tailcall-largecode.ll10
-rw-r--r--test/CodeGen/X86/tailcall-void.ll6
-rw-r--r--test/CodeGen/X86/tailcall.ll (renamed from test/CodeGen/X86/tailcall1.ll)14
-rw-r--r--test/CodeGen/X86/tailcallbyval.ll2
-rw-r--r--test/CodeGen/X86/targetLoweringGeneric.ll38
-rw-r--r--test/CodeGen/X86/thiscall-struct-return.ll4
-rw-r--r--test/CodeGen/X86/tls-local-dynamic.ll59
-rw-r--r--test/CodeGen/X86/tls-models.ll166
-rw-r--r--test/CodeGen/X86/tls-pic.ll20
-rw-r--r--test/CodeGen/X86/tls-pie.ll20
-rw-r--r--test/CodeGen/X86/trap.ll16
-rw-r--r--test/CodeGen/X86/trunc-ext-ld-st.ll2
-rw-r--r--test/CodeGen/X86/twoaddr-coalesce-2.ll4
-rw-r--r--test/CodeGen/X86/twoaddr-pass-sink.ll2
-rw-r--r--test/CodeGen/X86/uint_to_fp.ll2
-rw-r--r--test/CodeGen/X86/umul-with-carry.ll2
-rw-r--r--test/CodeGen/X86/unwindraise.ll252
-rw-r--r--test/CodeGen/X86/v-binop-widen2.ll9
-rw-r--r--test/CodeGen/X86/vec_call.ll4
-rw-r--r--test/CodeGen/X86/vec_cast2.ll49
-rw-r--r--test/CodeGen/X86/vec_compare-2.ll3
-rw-r--r--test/CodeGen/X86/vec_compare.ll2
-rw-r--r--test/CodeGen/X86/vec_ins_extract-1.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-6.ll4
-rw-r--r--test/CodeGen/X86/vec_set-3.ll2
-rw-r--r--test/CodeGen/X86/vec_set-9.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-16.ll4
-rw-r--r--test/CodeGen/X86/vec_shuffle-19.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-27.ll4
-rw-r--r--test/CodeGen/X86/vec_shuffle-35.ll4
-rw-r--r--test/CodeGen/X86/vec_shuffle-36.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-37.ll4
-rw-r--r--test/CodeGen/X86/vec_shuffle-38.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-39.ll2
-rw-r--r--test/CodeGen/X86/vec_splat-2.ll2
-rw-r--r--test/CodeGen/X86/vec_splat-3.ll2
-rw-r--r--test/CodeGen/X86/vec_splat-4.ll2
-rw-r--r--test/CodeGen/X86/vec_splat.ll4
-rw-r--r--test/CodeGen/X86/vec_ss_load_fold.ll14
-rw-r--r--test/CodeGen/X86/vshift-1.ll4
-rw-r--r--test/CodeGen/X86/vshift-2.ll6
-rw-r--r--test/CodeGen/X86/vshift-3.ll4
-rw-r--r--test/CodeGen/X86/vshift-5.ll8
-rw-r--r--test/CodeGen/X86/widen_arith-3.ll1
-rw-r--r--test/CodeGen/X86/widen_cast-1.ll13
-rw-r--r--test/CodeGen/X86/widen_cast-2.ll2
-rw-r--r--test/CodeGen/X86/widen_cast-5.ll3
-rw-r--r--test/CodeGen/X86/widen_conv-4.ll2
-rw-r--r--test/CodeGen/X86/widen_extract-1.ll2
-rw-r--r--test/CodeGen/X86/widen_load-0.ll14
-rw-r--r--test/CodeGen/X86/win64_alloca_dynalloca.ll15
-rw-r--r--test/CodeGen/X86/x86-64-arg.ll2
-rw-r--r--test/CodeGen/X86/x86-64-dead-stack-adjust.ll4
-rw-r--r--test/CodeGen/X86/x86-64-pic-1.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-10.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-11.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-2.ll4
-rw-r--r--test/CodeGen/X86/x86-64-pic-3.ll4
-rw-r--r--test/CodeGen/X86/x86-64-pic-4.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-5.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-6.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-7.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-8.ll2
-rw-r--r--test/CodeGen/X86/x86-64-pic-9.ll2
-rw-r--r--test/CodeGen/X86/xop-intrinsics-x86_64.ll80
-rw-r--r--test/CodeGen/X86/xor.ll8
-rw-r--r--test/CodeGen/XCore/mkmsk.ll11
-rwxr-xr-xtest/DebugInfo/Inputs/dwarfdump-test.elf-x86-64 bin 0 -> 10174 bytes
-rwxr-xr-xtest/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64 bin 0 -> 7702 bytes
-rwxr-xr-xtest/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 bin 0 -> 7339 bytes
-rwxr-xr-xtest/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64 bin 0 -> 7689 bytes
-rw-r--r--test/DebugInfo/X86/DW_AT_location-reference.ll111
-rw-r--r--test/DebugInfo/X86/aligned_stack_var.ll42
-rw-r--r--test/DebugInfo/X86/enum-class.ll45
-rw-r--r--test/DebugInfo/X86/enum-fwd-decl.ll22
-rw-r--r--test/DebugInfo/X86/op_deref.ll89
-rw-r--r--test/DebugInfo/X86/pr12831.ll238
-rw-r--r--test/DebugInfo/X86/pr13303.ll28
-rw-r--r--test/DebugInfo/X86/rvalue-ref.ll40
-rw-r--r--test/DebugInfo/dwarfdump-test.test46
-rw-r--r--test/DebugInfo/inlined-vars.ll57
-rw-r--r--test/DebugInfo/lit.local.cfg2
-rw-r--r--test/DebugInfo/printdbginfo2.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll37
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll13
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll20
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll12
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll11
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll10
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll17
-rw-r--r--test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll19
-rw-r--r--test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll11
-rw-r--r--test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll15
-rw-r--r--test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll19
-rw-r--r--test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll17
-rw-r--r--test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll11
-rw-r--r--test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll21
-rw-r--r--test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll34
-rw-r--r--test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll23
-rw-r--r--test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll22
-rw-r--r--test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll19
-rw-r--r--test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll59
-rw-r--r--test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll8
-rw-r--r--test/ExecutionEngine/MCJIT/fpbitcast.ll20
-rw-r--r--test/ExecutionEngine/MCJIT/hello.ll11
-rw-r--r--test/ExecutionEngine/MCJIT/hello2.ll17
-rw-r--r--test/ExecutionEngine/MCJIT/lit.local.cfg20
-rw-r--r--test/ExecutionEngine/MCJIT/simplesttest.ll6
-rw-r--r--test/ExecutionEngine/MCJIT/simpletest.ll11
-rw-r--r--test/ExecutionEngine/MCJIT/stubs.ll35
-rw-r--r--test/ExecutionEngine/MCJIT/test-arith.ll34
-rw-r--r--test/ExecutionEngine/MCJIT/test-branch.ll12
-rw-r--r--test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll14
-rw-r--r--test/ExecutionEngine/MCJIT/test-call.ll21
-rw-r--r--test/ExecutionEngine/MCJIT/test-cast.ll109
-rw-r--r--test/ExecutionEngine/MCJIT/test-common-symbols.ll88
-rw-r--r--test/ExecutionEngine/MCJIT/test-constantexpr.ll12
-rw-r--r--test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll21
-rw-r--r--test/ExecutionEngine/MCJIT/test-fp.ll23
-rw-r--r--test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll34
-rw-r--r--test/ExecutionEngine/MCJIT/test-global.ll34
-rw-r--r--test/ExecutionEngine/MCJIT/test-loadstore.ll31
-rw-r--r--test/ExecutionEngine/MCJIT/test-local.ll34
-rw-r--r--test/ExecutionEngine/MCJIT/test-logical.ll18
-rw-r--r--test/ExecutionEngine/MCJIT/test-loop.ll14
-rw-r--r--test/ExecutionEngine/MCJIT/test-phi.ll34
-rw-r--r--test/ExecutionEngine/MCJIT/test-ret.ll46
-rw-r--r--test/ExecutionEngine/MCJIT/test-return.ll8
-rw-r--r--test/ExecutionEngine/MCJIT/test-setcond-fp.ll24
-rw-r--r--test/ExecutionEngine/MCJIT/test-setcond-int.ll69
-rw-r--r--test/ExecutionEngine/MCJIT/test-shift.ll32
-rw-r--r--test/Feature/globalredefinition3.ll2
-rw-r--r--test/Feature/intrinsics.ll10
-rw-r--r--test/Feature/load_module.ll2
-rw-r--r--test/Feature/packed_struct.ll2
-rw-r--r--test/Feature/vector-cast-constant-exprs.ll2
-rw-r--r--test/Instrumentation/AddressSanitizer/basic.ll73
-rw-r--r--test/Instrumentation/AddressSanitizer/test64.ll22
-rw-r--r--test/Instrumentation/BoundsChecking/lit.local.cfg1
-rw-r--r--test/Instrumentation/BoundsChecking/many-trap.ll16
-rw-r--r--test/Instrumentation/BoundsChecking/phi.ll52
-rw-r--r--test/Instrumentation/BoundsChecking/simple.ll128
-rw-r--r--test/Instrumentation/ThreadSanitizer/atomic.ll323
-rw-r--r--test/Integer/packed_struct_bt.ll2
-rw-r--r--test/Integer/varargs_bt.ll23
-rw-r--r--test/Integer/varargs_new_bt.ll32
-rw-r--r--test/Linker/2003-01-30-LinkerRename.ll4
-rw-r--r--test/Linker/2003-01-30-LinkerTypeRename.ll4
-rw-r--r--test/Linker/2003-04-21-Linkage.ll14
-rw-r--r--test/Linker/2003-04-23-LinkOnceLost.ll2
-rw-r--r--test/Linker/2003-04-26-NullPtrLinkProblem.ll2
-rw-r--r--test/Linker/2003-05-15-TypeProblem.ll2
-rw-r--r--test/Linker/2003-05-31-LinkerRename.ll4
-rw-r--r--test/Linker/2003-06-02-TypeResolveProblem.ll2
-rw-r--r--test/Linker/2003-06-02-TypeResolveProblem2.ll2
-rw-r--r--test/Linker/2003-08-20-OpaqueTypeResolve.ll2
-rw-r--r--test/Linker/2003-08-23-GlobalVarLinking.ll2
-rw-r--r--test/Linker/2003-08-24-InheritPtrSize.ll4
-rw-r--r--test/Linker/2004-12-03-DisagreeingType.ll4
-rw-r--r--test/Linker/2005-02-12-ConstantGlobals-2.ll4
-rw-r--r--test/Linker/2005-02-12-ConstantGlobals.ll4
-rw-r--r--test/Linker/2005-12-06-AppendingZeroLengthArrays.ll4
-rw-r--r--test/Linker/2006-06-15-GlobalVarAlignment.ll4
-rw-r--r--test/Linker/2008-03-07-DroppedSection_a.ll2
-rw-r--r--test/Linker/2008-03-07-DroppedSection_b.ll2
-rw-r--r--test/Linker/2008-06-13-LinkOnceRedefinition.ll2
-rw-r--r--test/Linker/2008-06-26-AddressSpace.ll4
-rw-r--r--test/Linker/AppendingLinkage.ll2
-rw-r--r--test/Linker/AppendingLinkage2.ll2
-rw-r--r--test/Linker/ConstantGlobals1.ll2
-rw-r--r--test/Linker/ConstantGlobals2.ll2
-rw-r--r--test/Linker/ConstantGlobals3.ll2
-rw-r--r--test/Linker/Inputs/PR11464.a.ll3
-rw-r--r--test/Linker/Inputs/PR11464.b.ll13
-rw-r--r--test/Linker/Inputs/PR8300.a.ll2
-rw-r--r--test/Linker/Inputs/PR8300.b.ll9
-rw-r--r--test/Linker/Inputs/basiclink.a.ll2
-rw-r--r--test/Linker/Inputs/basiclink.b.ll6
-rw-r--r--test/Linker/Inputs/linkage.a.ll2
-rw-r--r--test/Linker/Inputs/linkage.b.ll10
-rw-r--r--test/Linker/PR8300.ll14
-rw-r--r--test/Linker/basiclink.ll13
-rw-r--r--test/Linker/link-archive.ll16
-rw-r--r--test/Linker/link-global-to-func.ll2
-rw-r--r--test/Linker/link-messages.ll7
-rw-r--r--test/Linker/linkage.ll3
-rw-r--r--test/Linker/module-flags-4-a.ll2
-rw-r--r--test/Linker/module-flags-5-a.ll2
-rw-r--r--test/Linker/module-flags-6-a.ll2
-rw-r--r--test/Linker/multiple-merged-structs.ll19
-rw-r--r--test/Linker/redefinition.ll10
-rw-r--r--test/Linker/weakextern.ll6
-rw-r--r--test/MC/ARM/arm_fixups.s5
-rw-r--r--test/MC/ARM/arm_instructions.s3
-rw-r--r--test/MC/ARM/basic-arm-instructions.s149
-rw-r--r--test/MC/ARM/basic-thumb-instructions.s14
-rw-r--r--test/MC/ARM/basic-thumb2-instructions.s127
-rw-r--r--test/MC/ARM/diagnostics.s18
-rw-r--r--test/MC/ARM/elf-reloc-01.ll2
-rw-r--r--test/MC/ARM/elf-reloc-condcall.s12
-rw-r--r--test/MC/ARM/neon-bitwise-encoding.s11
-rw-r--r--test/MC/ARM/neon-shiftaccum-encoding.s209
-rw-r--r--test/MC/ARM/neon-sub-encoding.s15
-rw-r--r--test/MC/ARM/neont2-absdiff-encoding.s2
-rw-r--r--test/MC/ARM/neont2-dup-encoding.s2
-rw-r--r--test/MC/ARM/neont2-shiftaccum-encoding.s302
-rw-r--r--test/MC/ARM/simple-fp-encoding.s124
-rw-r--r--test/MC/ARM/thumb-diagnostics.s41
-rw-r--r--test/MC/ARM/thumb.s7
-rw-r--r--test/MC/ARM/thumb2-diagnostics.s4
-rw-r--r--test/MC/ARM/thumb2-mclass.s52
-rw-r--r--test/MC/ARM/thumb2-narrow-dp.ll807
-rw-r--r--test/MC/ARM/vfp4.s4
-rw-r--r--test/MC/ARM/vpush-vpop.s4
-rw-r--r--test/MC/AsmParser/extern.s4
-rw-r--r--test/MC/AsmParser/ifb.s67
-rw-r--r--test/MC/AsmParser/ifc.s65
-rw-r--r--test/MC/AsmParser/macro-args.s12
-rw-r--r--test/MC/AsmParser/macro-err1.s10
-rw-r--r--test/MC/AsmParser/macro-irp.s8
-rw-r--r--test/MC/AsmParser/macro-irpc.s9
-rw-r--r--test/MC/AsmParser/macro-rept-err1.s6
-rw-r--r--test/MC/AsmParser/macro-rept-err2.s7
-rw-r--r--test/MC/AsmParser/macro-rept.s22
-rw-r--r--test/MC/AsmParser/macros-parsing.s2
-rw-r--r--test/MC/AsmParser/macros.s4
-rw-r--r--test/MC/AsmParser/purgem.s12
-rw-r--r--test/MC/COFF/seh.s1
-rw-r--r--test/MC/Disassembler/ARM/arm-tests.txt2
-rw-r--r--test/MC/Disassembler/ARM/basic-arm-instructions.txt100
-rw-r--r--test/MC/Disassembler/ARM/fp-encoding.txt29
-rw-r--r--test/MC/Disassembler/ARM/invalid-BFI-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-CPS2p-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-CPS3p-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-DMB-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-DSB-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-IT-CBNZ-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-IT-CC15.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-IT-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-LDC-form-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-LDM-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-LDRrs-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-MCR-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-MOVr-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-MOVs-LSL-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-MOVs-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-MRRC2-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-MSRi-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-RFEorLDMIA-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-SBFX-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-SMLAD-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-SRS-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-SXTB-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-UMAAL-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-VLDMSDB_UPD-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-VQADD-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-VST2b32_UPD-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2PUSH-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt2
-rw-r--r--test/MC/Disassembler/ARM/ldrd-armv4.txt4
-rw-r--r--test/MC/Disassembler/ARM/neon-tests.txt2
-rw-r--r--test/MC/Disassembler/ARM/neon.txt74
-rw-r--r--test/MC/Disassembler/ARM/neont2.txt56
-rw-r--r--test/MC/Disassembler/ARM/thumb-tests.txt10
-rw-r--r--test/MC/Disassembler/ARM/thumb1.txt23
-rw-r--r--test/MC/Disassembler/ARM/thumb2.txt50
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-ADC-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-ADDREXT3-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-AExtI-arm.txt62
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-AI1cmp-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-LDR-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-LDRD-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-LSL-regform.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-MRRC2-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-MRS-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-MUL-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-RSC-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-SEL-arm.txt5
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-SHADD16-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-STRBrs-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-UQADD8-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictable-swp-arm.txt2
-rw-r--r--test/MC/Disassembler/ARM/unpredictables-thumb.txt2
-rw-r--r--test/MC/Disassembler/Mips/lit.local.cfg6
-rw-r--r--test/MC/Disassembler/Mips/mips32.txt365
-rw-r--r--test/MC/Disassembler/Mips/mips32_le.txt368
-rw-r--r--test/MC/Disassembler/Mips/mips32r2.txt377
-rw-r--r--test/MC/Disassembler/Mips/mips32r2_le.txt380
-rw-r--r--test/MC/Disassembler/Mips/mips64.txt48
-rw-r--r--test/MC/Disassembler/Mips/mips64_le.txt48
-rw-r--r--test/MC/Disassembler/Mips/mips64r2.txt64
-rw-r--r--test/MC/Disassembler/Mips/mips64r2_le.txt64
-rw-r--r--test/MC/Disassembler/X86/enhanced.txt10
-rw-r--r--test/MC/Disassembler/X86/intel-syntax.txt5
-rw-r--r--test/MC/Disassembler/X86/invalid-VEX-vvvv.txt2
-rw-r--r--test/MC/Disassembler/X86/invalid-cmp-imm.txt2
-rw-r--r--test/MC/Disassembler/X86/simple-tests.txt32
-rw-r--r--test/MC/Disassembler/X86/truncated-input.txt2
-rw-r--r--test/MC/Disassembler/X86/x86-32.txt26
-rw-r--r--test/MC/Disassembler/X86/x86-64.txt18
-rw-r--r--test/MC/ELF/fde.s28
-rw-r--r--test/MC/ELF/version.s17
-rw-r--r--test/MC/MachO/ARM/data-in-code.s33
-rw-r--r--test/MC/MachO/ARM/llvm-objdump-macho-stripped.s5
-rw-r--r--test/MC/MachO/ARM/llvm-objdump-macho.s20
-rw-r--r--test/MC/MachO/ARM/thumb-bl-jbits.s19
-rw-r--r--test/MC/MachO/ARM/thumb2-movw-fixup.s44
-rw-r--r--test/MC/MachO/previous.s13
-rw-r--r--test/MC/MachO/pushsection.s16
-rw-r--r--test/MC/Mips/elf-N64.ll39
-rw-r--r--test/MC/Mips/elf-bigendian.ll4
-rw-r--r--test/MC/Mips/elf-objdump.s11
-rw-r--r--test/MC/Mips/elf_basic.s3
-rw-r--r--test/MC/Mips/higher_highest.ll27
-rw-r--r--test/MC/Mips/lea_64.ll18
-rw-r--r--test/MC/Mips/mips64shift.ll45
-rw-r--r--test/MC/Mips/multi-64bit-func.ll23
-rw-r--r--test/MC/Mips/r-mips-got-disp.ll18
-rw-r--r--test/MC/Mips/sext_64_32.ll20
-rw-r--r--test/MC/Mips/sym-offset.ll4
-rw-r--r--test/MC/X86/intel-syntax.s2
-rw-r--r--test/MC/X86/x86-32-avx.s28
-rw-r--r--test/MC/X86/x86-32-coverage.s26
-rw-r--r--test/MC/X86/x86_64-avx-encoding.s116
-rw-r--r--test/MC/X86/x86_64-sse4a.s25
-rw-r--r--test/Makefile84
-rw-r--r--test/Object/Inputs/COFF/i386.yaml83
-rw-r--r--test/Object/Inputs/COFF/x86-64.yaml83
-rw-r--r--test/Object/Inputs/trivial-object-test.coff-i386 bin 346 -> 346 bytes
-rw-r--r--test/Object/Inputs/trivial-object-test.elf-hexagon bin 0 -> 800 bytes
-rw-r--r--test/Object/nm-trivial-object.test8
-rw-r--r--test/Object/objdump-file-header.test3
-rw-r--r--test/Object/objdump-relocations.test10
-rw-r--r--test/Object/objdump-section-content.test5
-rw-r--r--test/Object/objdump-symbol-table.test8
-rw-r--r--test/Other/2003-02-19-LoopInfoNestingBug.ll2
-rw-r--r--test/Other/2008-10-15-MissingSpace.ll2
-rw-r--r--test/Other/close-stderr.ll6
-rw-r--r--test/Other/constant-fold-gep.ll6
-rw-r--r--test/Other/invalid-commandline-option.ll2
-rw-r--r--test/Other/lint.ll2
-rw-r--r--test/Other/optimize-options.ll8
-rwxr-xr-xtest/Scripts/elf-dump38
-rw-r--r--test/TableGen/DefmInherit.td2
-rw-r--r--test/TableGen/ForeachLoop.td22
-rw-r--r--test/TableGen/LazyChange.td2
-rw-r--r--test/TableGen/ListOfList.td2
-rw-r--r--test/TableGen/MultiClass.td2
-rw-r--r--test/TableGen/MultiClassInherit.td2
-rw-r--r--test/TableGen/SetTheory.td2
-rw-r--r--test/TableGen/Slice.td4
-rw-r--r--test/TableGen/TargetInstrSpec.td4
-rw-r--r--test/TableGen/cast.td2
-rw-r--r--test/TableGen/foreach.td6
-rw-r--r--test/TableGen/lisp.td2
-rw-r--r--test/TableGen/subst.td12
-rw-r--r--test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll2
-rw-r--r--test/Transforms/ArgumentPromotion/byval-2.ll2
-rw-r--r--test/Transforms/ArgumentPromotion/control-flow.ll2
-rw-r--r--test/Transforms/ArgumentPromotion/control-flow2.ll2
-rw-r--r--test/Transforms/BBVectorize/metadata.ll49
-rw-r--r--test/Transforms/BBVectorize/simple-sel.ll29
-rw-r--r--test/Transforms/BBVectorize/simple-tst.ll18
-rw-r--r--test/Transforms/BBVectorize/simple.ll3
-rw-r--r--test/Transforms/BBVectorize/simple3.ll35
-rw-r--r--test/Transforms/CodeGenPrepare/basic.ll6
-rw-r--r--test/Transforms/ConstProp/2002-05-03-NotOperator.ll2
-rw-r--r--test/Transforms/ConstProp/2005-01-28-SetCCGEP.ll2
-rw-r--r--test/Transforms/ConstProp/2006-11-30-vector-cast.ll2
-rw-r--r--test/Transforms/ConstProp/2006-12-01-TruncBoolBug.ll2
-rw-r--r--test/Transforms/ConstProp/2006-12-01-bool-casts.ll4
-rw-r--r--test/Transforms/ConstProp/2007-02-23-sdiv.ll2
-rw-r--r--test/Transforms/ConstProp/2007-11-23-cttz.ll2
-rw-r--r--test/Transforms/ConstProp/div-zero.ll2
-rw-r--r--test/Transforms/CorrelatedValuePropagation/range.ll124
-rw-r--r--test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll2
-rw-r--r--test/Transforms/DeadArgElim/2007-10-18-VarargsReturn.ll2
-rw-r--r--test/Transforms/DeadArgElim/canon.ll4
-rw-r--r--test/Transforms/DeadArgElim/keepalive.ll4
-rw-r--r--test/Transforms/DeadStoreElimination/simple.ll54
-rw-r--r--test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll4
-rw-r--r--test/Transforms/GVN/2007-07-25-InfiniteLoop.ll2
-rw-r--r--test/Transforms/GVN/2007-07-31-NoDomInherit.ll2
-rw-r--r--test/Transforms/GVN/2007-07-31-RedundantPhi.ll2
-rw-r--r--test/Transforms/GVN/2008-07-02-Unreachable.ll2
-rw-r--r--test/Transforms/GVN/2012-05-22-PreCrash.ll33
-rw-r--r--test/Transforms/GVN/basic.ll2
-rw-r--r--test/Transforms/GVN/calls-readonly.ll2
-rw-r--r--test/Transforms/GVN/fpmath.ll45
-rw-r--r--test/Transforms/GVN/load-constant-mem.ll2
-rw-r--r--test/Transforms/GVN/local-pre.ll2
-rw-r--r--test/Transforms/GVN/nonescaping-malloc.ll2
-rw-r--r--test/Transforms/GVN/pr12979.ll79
-rw-r--r--test/Transforms/GVN/pre-basic-add.ll2
-rw-r--r--test/Transforms/GVN/range.ll101
-rw-r--r--test/Transforms/GVN/rle-must-alias.ll2
-rw-r--r--test/Transforms/GVN/rle-semidominated.ll2
-rw-r--r--test/Transforms/GVN/tbaa.ll81
-rw-r--r--test/Transforms/GlobalOpt/2008-01-13-OutOfRangeSROA.ll2
-rw-r--r--test/Transforms/GlobalOpt/2008-01-29-VolatileGlobal.ll2
-rw-r--r--test/Transforms/GlobalOpt/2008-04-26-SROA-Global-Align.ll6
-rw-r--r--test/Transforms/GlobalOpt/2009-01-13-phi-user.ll2
-rw-r--r--test/Transforms/GlobalOpt/2009-03-05-dbg.ll2
-rw-r--r--test/Transforms/GlobalOpt/2009-03-07-PromotePtrToBool.ll2
-rw-r--r--test/Transforms/GlobalOpt/2009-11-16-BrokenPerformHeapAllocSRoA.ll2
-rw-r--r--test/Transforms/GlobalOpt/2012-05-11-blockaddress.ll16
-rw-r--r--test/Transforms/GlobalOpt/cleanup-pointer-root-users.ll49
-rw-r--r--test/Transforms/GlobalOpt/constantexpr-dangle.ll2
-rw-r--r--test/Transforms/GlobalOpt/deadglobal.ll24
-rw-r--r--test/Transforms/GlobalOpt/globalsra-unknown-index.ll4
-rw-r--r--test/Transforms/GlobalOpt/heap-sra-phi.ll4
-rw-r--r--test/Transforms/GlobalOpt/integer-bool.ll2
-rw-r--r--test/Transforms/GlobalOpt/memcpy.ll2
-rw-r--r--test/Transforms/GlobalOpt/storepointer-compare.ll2
-rw-r--r--test/Transforms/GlobalOpt/unnamed-addr.ll16
-rw-r--r--test/Transforms/IPConstantProp/2008-06-09-WeakProp.ll2
-rw-r--r--test/Transforms/IPConstantProp/return-argument.ll4
-rw-r--r--test/Transforms/IPConstantProp/return-constant.ll2
-rw-r--r--test/Transforms/IPConstantProp/return-constants.ll4
-rw-r--r--test/Transforms/IndVarSimplify/2005-02-26-ExitValueCompute.ll2
-rw-r--r--test/Transforms/IndVarSimplify/2006-03-31-NegativeStride.ll2
-rw-r--r--test/Transforms/IndVarSimplify/2007-01-06-TripCount.ll2
-rw-r--r--test/Transforms/IndVarSimplify/2009-04-14-shorten_iv_vars.ll2
-rw-r--r--test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll2
-rw-r--r--test/Transforms/IndVarSimplify/2012-07-17-lftr-undef.ll22
-rw-r--r--test/Transforms/IndVarSimplify/eliminate-max.ll2
-rw-r--r--test/Transforms/IndVarSimplify/lftr-reuse.ll11
-rw-r--r--test/Transforms/IndVarSimplify/loop_evaluate10.ll2
-rw-r--r--test/Transforms/IndVarSimplify/loop_evaluate9.ll4
-rw-r--r--test/Transforms/IndVarSimplify/loop_evaluate_3.ll2
-rw-r--r--test/Transforms/IndVarSimplify/loop_evaluate_4.ll2
-rw-r--r--test/Transforms/IndVarSimplify/loop_evaluate_5.ll2
-rw-r--r--test/Transforms/IndVarSimplify/shrunk-constant.ll2
-rw-r--r--test/Transforms/IndVarSimplify/ult-sub-to-eq.ll42
-rw-r--r--test/Transforms/Inline/2007-04-15-InlineEH.ll2
-rw-r--r--test/Transforms/Inline/casts.ll2
-rw-r--r--test/Transforms/Inline/delete-call.ll4
-rw-r--r--test/Transforms/Inline/externally_available.ll2
-rw-r--r--test/Transforms/Inline/inline-byval-bonus.ll193
-rw-r--r--test/Transforms/Inline/inline-invoke-tail.ll2
-rw-r--r--test/Transforms/Inline/inline-optsize.ll33
-rw-r--r--test/Transforms/Inline/inline_constprop.ll62
-rw-r--r--test/Transforms/Inline/inline_prune.ll2
-rw-r--r--test/Transforms/Inline/invoke_test-1.ll2
-rw-r--r--test/Transforms/InstCombine/2004-08-10-BoolSetCC.ll2
-rw-r--r--test/Transforms/InstCombine/2004-09-20-BadLoadCombine.ll2
-rw-r--r--test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll2
-rw-r--r--test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll2
-rw-r--r--test/Transforms/InstCombine/2005-06-16-SetCCOrSetCCMiscompile.ll2
-rw-r--r--test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll2
-rw-r--r--test/Transforms/InstCombine/2006-12-15-Range-Test.ll2
-rw-r--r--test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll2
-rw-r--r--test/Transforms/InstCombine/2007-01-14-FcmpSelf.ll2
-rw-r--r--test/Transforms/InstCombine/2007-01-27-AndICmp.ll2
-rw-r--r--test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll4
-rw-r--r--test/Transforms/InstCombine/2007-03-13-CompareMerge.ll2
-rw-r--r--test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll2
-rw-r--r--test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll2
-rw-r--r--test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll2
-rw-r--r--test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll2
-rw-r--r--test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll2
-rw-r--r--test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll2
-rw-r--r--test/Transforms/InstCombine/2007-10-28-stacksave.ll2
-rw-r--r--test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll2
-rw-r--r--test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll2
-rw-r--r--test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll2
-rw-r--r--test/Transforms/InstCombine/2008-01-29-AddICmp.ll2
-rw-r--r--test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll2
-rw-r--r--test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll2
-rw-r--r--test/Transforms/InstCombine/2008-03-13-IntToPtr.ll2
-rw-r--r--test/Transforms/InstCombine/2008-04-28-VolatileStore.ll2
-rw-r--r--test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll2
-rw-r--r--test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll2
-rw-r--r--test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll2
-rw-r--r--test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll2
-rw-r--r--test/Transforms/InstCombine/2008-05-23-CompareFold.ll2
-rw-r--r--test/Transforms/InstCombine/2008-05-31-AddBool.ll2
-rw-r--r--test/Transforms/InstCombine/2008-05-31-Bools.ll6
-rw-r--r--test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll2
-rw-r--r--test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll2
-rw-r--r--test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll2
-rw-r--r--test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll2
-rw-r--r--test/Transforms/InstCombine/2008-06-24-StackRestore.ll2
-rw-r--r--test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll2
-rw-r--r--test/Transforms/InstCombine/2008-07-08-SubAnd.ll2
-rw-r--r--test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll2
-rw-r--r--test/Transforms/InstCombine/2008-07-09-SubAndError.ll2
-rw-r--r--test/Transforms/InstCombine/2008-07-10-CastSextBool.ll4
-rw-r--r--test/Transforms/InstCombine/2008-07-13-DivZero.ll4
-rw-r--r--test/Transforms/InstCombine/2008-07-16-sse2_storel_dq.ll2
-rw-r--r--test/Transforms/InstCombine/2008-09-29-FoldingOr.ll2
-rw-r--r--test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll2
-rw-r--r--test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll2
-rw-r--r--test/Transforms/InstCombine/2008-11-08-FCmp.ll9
-rw-r--r--test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll2
-rw-r--r--test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll4
-rw-r--r--test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll2
-rw-r--r--test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll4
-rw-r--r--test/Transforms/InstCombine/2009-01-31-Pressure.ll2
-rw-r--r--test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll2
-rw-r--r--test/Transforms/InstCombine/2009-02-21-LoadCST.ll2
-rw-r--r--test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll2
-rw-r--r--test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll2
-rw-r--r--test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll2
-rw-r--r--test/Transforms/InstCombine/2010-11-01-lshr-mask.ll8
-rw-r--r--test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll7
-rw-r--r--test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll68
-rw-r--r--test/Transforms/InstCombine/2012-04-24-vselect.ll13
-rw-r--r--test/Transforms/InstCombine/2012-05-27-Negative-Shift-Crash.ll61
-rw-r--r--test/Transforms/InstCombine/2012-05-28-select-hang.ll39
-rw-r--r--test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll162
-rw-r--r--test/Transforms/InstCombine/2012-07-25-LoadPart.ll12
-rw-r--r--test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll10
-rw-r--r--test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll11
-rw-r--r--test/Transforms/InstCombine/CPP_min_max.ll2
-rw-r--r--test/Transforms/InstCombine/JavaCompare.ll2
-rw-r--r--test/Transforms/InstCombine/add-shrink.ll2
-rw-r--r--test/Transforms/InstCombine/add-sitofp.ll2
-rw-r--r--test/Transforms/InstCombine/addnegneg.ll2
-rw-r--r--test/Transforms/InstCombine/adjust-for-sminmax.ll2
-rw-r--r--test/Transforms/InstCombine/align-2d-gep.ll2
-rw-r--r--test/Transforms/InstCombine/alloca.ll8
-rw-r--r--test/Transforms/InstCombine/and-fcmp.ll49
-rw-r--r--test/Transforms/InstCombine/and-not-or.ll4
-rw-r--r--test/Transforms/InstCombine/and-or-and.ll2
-rw-r--r--test/Transforms/InstCombine/and-or-not.ll2
-rw-r--r--test/Transforms/InstCombine/and-or.ll4
-rw-r--r--test/Transforms/InstCombine/and-xor-or.ll24
-rw-r--r--test/Transforms/InstCombine/apint-and-or-and.ll2
-rw-r--r--test/Transforms/InstCombine/apint-and1.ll2
-rw-r--r--test/Transforms/InstCombine/apint-and2.ll2
-rw-r--r--test/Transforms/InstCombine/apint-shift-simplify.ll2
-rw-r--r--test/Transforms/InstCombine/apint-shift.ll16
-rw-r--r--test/Transforms/InstCombine/apint-sub.ll2
-rw-r--r--test/Transforms/InstCombine/apint-xor1.ll2
-rw-r--r--test/Transforms/InstCombine/apint-xor2.ll2
-rw-r--r--test/Transforms/InstCombine/badmalloc.ll23
-rw-r--r--test/Transforms/InstCombine/bit-checks.ll2
-rw-r--r--test/Transforms/InstCombine/bitcount.ll2
-rw-r--r--test/Transforms/InstCombine/bittest.ll2
-rw-r--r--test/Transforms/InstCombine/bswap.ll2
-rw-r--r--test/Transforms/InstCombine/cast.ll23
-rw-r--r--test/Transforms/InstCombine/crash.ll12
-rw-r--r--test/Transforms/InstCombine/dce-iterate.ll2
-rw-r--r--test/Transforms/InstCombine/deadcode.ll2
-rw-r--r--test/Transforms/InstCombine/div-shift.ll23
-rw-r--r--test/Transforms/InstCombine/enforce-known-alignment.ll2
-rw-r--r--test/Transforms/InstCombine/fp-ret-bitcast.ll2
-rw-r--r--test/Transforms/InstCombine/icmp.ll22
-rw-r--r--test/Transforms/InstCombine/invoke.ll65
-rw-r--r--test/Transforms/InstCombine/known_align.ll2
-rw-r--r--test/Transforms/InstCombine/loadstore-alignment.ll2
-rw-r--r--test/Transforms/InstCombine/malloc-free-delete.ll53
-rw-r--r--test/Transforms/InstCombine/memcpy-to-load.ll2
-rw-r--r--test/Transforms/InstCombine/memmove.ll2
-rw-r--r--test/Transforms/InstCombine/memset.ll2
-rw-r--r--test/Transforms/InstCombine/mul.ll5
-rw-r--r--test/Transforms/InstCombine/multi-use-or.ll2
-rw-r--r--test/Transforms/InstCombine/narrow.ll2
-rw-r--r--test/Transforms/InstCombine/objsize-64.ll39
-rw-r--r--test/Transforms/InstCombine/objsize.ll92
-rw-r--r--test/Transforms/InstCombine/odr-linkage.ll2
-rw-r--r--test/Transforms/InstCombine/or-to-xor.ll4
-rw-r--r--test/Transforms/InstCombine/phi-merge-gep.ll4
-rw-r--r--test/Transforms/InstCombine/phi.ll10
-rw-r--r--test/Transforms/InstCombine/pr12338.ll24
-rw-r--r--test/Transforms/InstCombine/pr2645-0.ll2
-rw-r--r--test/Transforms/InstCombine/sdiv-shift.ll9
-rw-r--r--test/Transforms/InstCombine/select-crash.ll17
-rw-r--r--test/Transforms/InstCombine/select-load-call.ll2
-rw-r--r--test/Transforms/InstCombine/setcc-strength-reduce.ll2
-rw-r--r--test/Transforms/InstCombine/shift.ll59
-rw-r--r--test/Transforms/InstCombine/shufflemask-undef.ll2
-rw-r--r--test/Transforms/InstCombine/shufflevec-constant.ll2
-rw-r--r--test/Transforms/InstCombine/signed-comparison.ll2
-rw-r--r--test/Transforms/InstCombine/srem-simplify-bug.ll2
-rw-r--r--test/Transforms/InstCombine/stack-overalign.ll2
-rw-r--r--test/Transforms/InstCombine/stacksaverestore.ll2
-rw-r--r--test/Transforms/InstCombine/trunc.ll13
-rw-r--r--test/Transforms/InstCombine/udiv-simplify-bug-0.ll2
-rw-r--r--test/Transforms/InstCombine/urem-simplify-bug.ll2
-rw-r--r--test/Transforms/InstCombine/vec_demanded_elts.ll47
-rw-r--r--test/Transforms/InstCombine/vec_insertelt.ll2
-rw-r--r--test/Transforms/InstCombine/vec_narrow.ll2
-rw-r--r--test/Transforms/InstCombine/vector-srem.ll2
-rw-r--r--test/Transforms/InstCombine/volatile_store.ll4
-rw-r--r--test/Transforms/InstCombine/xor.ll2
-rw-r--r--test/Transforms/InstCombine/zeroext-and-reduce.ll2
-rw-r--r--test/Transforms/InstCombine/zext-bool-add-sub.ll39
-rw-r--r--test/Transforms/InstCombine/zext-fold.ll2
-rw-r--r--test/Transforms/JumpThreading/2008-11-27-EntryMunge.ll2
-rw-r--r--test/Transforms/JumpThreading/2012-07-19-NoSuccessorIndirectBr.ll8
-rw-r--r--test/Transforms/JumpThreading/compare.ll2
-rw-r--r--test/Transforms/JumpThreading/no-irreducible-loops.ll4
-rw-r--r--test/Transforms/JumpThreading/phi-eq.ll209
-rw-r--r--test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll2
-rw-r--r--test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll4
-rw-r--r--test/Transforms/LCSSA/basictest.ll4
-rw-r--r--test/Transforms/LCSSA/unreachable-use.ll2
-rw-r--r--test/Transforms/LCSSA/unused-phis.ll4
-rw-r--r--test/Transforms/LICM/2003-02-27-PreheaderProblem.ll2
-rw-r--r--test/Transforms/LICM/2007-05-22-VolatileSink.ll2
-rw-r--r--test/Transforms/LICM/hoist-invariant-load.ll2
-rw-r--r--test/Transforms/LICM/promote-order.ll41
-rw-r--r--test/Transforms/LICM/speculate.ll22
-rw-r--r--test/Transforms/LoopRotate/PhiRename-1.ll5
-rw-r--r--test/Transforms/LoopSimplify/indirectbr.ll2
-rw-r--r--test/Transforms/LoopSimplify/merge-exits.ll2
-rw-r--r--test/Transforms/LoopSimplify/preserve-scev.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/2012-07-13-ExpandUDiv.ll90
-rw-r--r--test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll517
-rw-r--r--test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll102
-rw-r--r--test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll56
-rw-r--r--test/Transforms/LoopStrengthReduce/dont-hoist-simple-loop-constants.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/dont_reverse.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/invariant_value_first.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/invariant_value_first_arg.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/pr2570.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/var_stride_used_by_compare.ll4
-rw-r--r--test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll2
-rw-r--r--test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll8
-rw-r--r--test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll10
-rw-r--r--test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll34
-rw-r--r--test/Transforms/LoopUnswitch/2012-05-20-Phi.ll25
-rw-r--r--test/Transforms/LowerSwitch/feature.ll97
-rw-r--r--test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll2
-rw-r--r--test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll2
-rw-r--r--test/Transforms/MemCpyOpt/memcpy.ll22
-rw-r--r--test/Transforms/MemCpyOpt/sret.ll2
-rw-r--r--test/Transforms/MergeFunc/fold-weak.ll4
-rw-r--r--test/Transforms/MergeFunc/phi-speculation1.ll2
-rw-r--r--test/Transforms/MergeFunc/phi-speculation2.ll2
-rw-r--r--test/Transforms/MergeFunc/vector.ll2
-rw-r--r--test/Transforms/ObjCARC/basic.ll24
-rw-r--r--test/Transforms/ObjCARC/contract-storestrong.ll110
-rw-r--r--test/Transforms/ObjCARC/contract-testcases.ll35
-rw-r--r--test/Transforms/ObjCARC/split-backedge.ll48
-rw-r--r--test/Transforms/ObjCARC/weak-dce.ll46
-rw-r--r--test/Transforms/PhaseOrdering/PR6627.ll93
-rw-r--r--test/Transforms/PhaseOrdering/basic.ll111
-rw-r--r--test/Transforms/PhaseOrdering/scev.ll64
-rw-r--r--test/Transforms/PruneEH/2003-09-14-ExternalCall.ll14
-rw-r--r--test/Transforms/PruneEH/simplenoreturntest.ll2
-rw-r--r--test/Transforms/Reassociate/2005-08-24-Crash.ll13
-rw-r--r--test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll2
-rw-r--r--test/Transforms/Reassociate/2012-05-08-UndefLeak.ll85
-rw-r--r--test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll21
-rw-r--r--test/Transforms/Reassociate/absorption.ll11
-rw-r--r--test/Transforms/Reassociate/crash.ll77
-rw-r--r--test/Transforms/Reassociate/fp-commute.ll18
-rw-r--r--test/Transforms/Reassociate/mightymul.ll35
-rw-r--r--test/Transforms/Reassociate/mulfactor.ll138
-rw-r--r--test/Transforms/Reassociate/mulfactor2.ll15
-rw-r--r--test/Transforms/Reassociate/multistep.ll31
-rw-r--r--test/Transforms/Reassociate/no-op.ll38
-rw-r--r--test/Transforms/Reassociate/repeats.ll252
-rw-r--r--test/Transforms/Reassociate/shifttest.ll2
-rw-r--r--test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll2
-rw-r--r--test/Transforms/SCCP/2006-12-19-UndefBug.ll2
-rw-r--r--test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll2
-rw-r--r--test/Transforms/SCCP/2008-05-23-UndefCallFold.ll2
-rw-r--r--test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll4
-rw-r--r--test/Transforms/SCCP/apint-array.ll2
-rw-r--r--test/Transforms/SCCP/apint-basictest4.ll2
-rw-r--r--test/Transforms/SCCP/apint-ipsccp1.ll4
-rw-r--r--test/Transforms/SCCP/apint-ipsccp2.ll4
-rw-r--r--test/Transforms/SCCP/logical-nuke.ll2
-rw-r--r--test/Transforms/SCCP/vector-bitcast.ll20
-rw-r--r--test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll2
-rw-r--r--test/Transforms/ScalarRepl/2003-10-29-ArrayProblem.ll2
-rw-r--r--test/Transforms/ScalarRepl/2008-01-29-PromoteBug.ll2
-rw-r--r--test/Transforms/ScalarRepl/2008-06-22-LargeArray.ll2
-rw-r--r--test/Transforms/ScalarRepl/2008-08-22-out-of-range-array-promote.ll2
-rw-r--r--test/Transforms/ScalarRepl/2009-02-02-ScalarPromoteOutOfRange.ll2
-rw-r--r--test/Transforms/ScalarRepl/2009-02-05-LoadFCA.ll2
-rw-r--r--test/Transforms/ScalarRepl/2009-03-04-MemCpyAlign.ll2
-rw-r--r--test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll25
-rw-r--r--test/Transforms/ScalarRepl/crash.ll22
-rw-r--r--test/Transforms/ScalarRepl/dynamic-vector-gep.ll167
-rw-r--r--test/Transforms/ScalarRepl/memcpy-from-global.ll36
-rw-r--r--test/Transforms/ScalarRepl/memset-aggregate-byte-leader.ll2
-rw-r--r--test/Transforms/ScalarRepl/memset-aggregate.ll4
-rw-r--r--test/Transforms/ScalarRepl/not-a-vector.ll4
-rw-r--r--test/Transforms/ScalarRepl/union-fp-int.ll2
-rw-r--r--test/Transforms/ScalarRepl/union-pointer.ll2
-rw-r--r--test/Transforms/ScalarRepl/vector_memcpy.ll4
-rw-r--r--test/Transforms/ScalarRepl/volatile.ll4
-rw-r--r--test/Transforms/SimplifyCFG/2002-05-05-EmptyBlockMerge.ll2
-rw-r--r--test/Transforms/SimplifyCFG/2003-08-05-MishandleInvoke.ll15
-rw-r--r--test/Transforms/SimplifyCFG/2003-08-17-BranchFold.ll2
-rw-r--r--test/Transforms/SimplifyCFG/2003-08-17-BranchFoldOrdering.ll2
-rw-r--r--test/Transforms/SimplifyCFG/2006-10-29-InvokeCrash.ll567
-rw-r--r--test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll2
-rw-r--r--test/Transforms/SimplifyCFG/2008-05-16-PHIBlockMerge.ll4
-rw-r--r--test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll2
-rw-r--r--test/Transforms/SimplifyCFG/2009-06-15-InvokeCrash.ll569
-rw-r--r--test/Transforms/SimplifyCFG/BrUnwind.ll2
-rw-r--r--test/Transforms/SimplifyCFG/DeadSetCC.ll2
-rw-r--r--test/Transforms/SimplifyCFG/UncondBranchToReturn.ll2
-rw-r--r--test/Transforms/SimplifyCFG/branch-fold.ll51
-rw-r--r--test/Transforms/SimplifyCFG/branch-phi-thread.ll2
-rw-r--r--test/Transforms/SimplifyCFG/duplicate-phis.ll2
-rw-r--r--test/Transforms/SimplifyCFG/invoke.ll139
-rw-r--r--test/Transforms/SimplifyCFG/switch_thread.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/2007-04-06-strchr-miscompile.ll6
-rw-r--r--test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll8
-rw-r--r--test/Transforms/SimplifyLibCalls/2009-05-30-memcmp-byte.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/FFS.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/FPrintF.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/FPuts.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/MemCpy.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/SPrintF.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/StpCpy.ll43
-rw-r--r--test/Transforms/SimplifyLibCalls/StrCat.ll4
-rw-r--r--test/Transforms/SimplifyLibCalls/StrLen.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/StrNCat.ll4
-rw-r--r--test/Transforms/SimplifyLibCalls/StrNCpy.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/ToAscii.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/abs.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/exp2.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/memmove.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/memset-64.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/memset.ll2
-rw-r--r--test/Transforms/SimplifyLibCalls/pow2.ll2
-rw-r--r--test/Transforms/Sink/basic.ll26
-rw-r--r--test/Transforms/TailCallElim/ackermann.ll2
-rw-r--r--test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll2
-rw-r--r--test/Transforms/TailCallElim/dup_tail.ll2
-rw-r--r--test/Transforms/TailCallElim/trivial_codegen_tailcall.ll2
-rw-r--r--test/Transforms/TailDup/2008-06-11-AvoidDupLoopHeader.ll2
-rw-r--r--test/Verifier/2002-04-13-RetTypes.ll4
-rw-r--r--test/Verifier/2002-11-05-GetelementptrPointers.ll2
-rw-r--r--test/Verifier/2006-07-11-StoreStruct.ll2
-rw-r--r--test/Verifier/2006-10-15-AddrLabel.ll3
-rw-r--r--test/Verifier/2006-12-12-IntrinsicDefine.ll2
-rw-r--r--test/Verifier/2008-03-01-AllocaSized.ll2
-rw-r--r--test/Verifier/2008-08-22-MemCpyAlignment.ll6
-rw-r--r--test/Verifier/2008-11-15-RetVoid.ll2
-rw-r--r--test/Verifier/2010-08-07-PointerIntrinsic.ll2
-rw-r--r--test/Verifier/AmbiguousPhi.ll2
-rw-r--r--test/Verifier/PhiGrouping.ll2
-rw-r--r--test/Verifier/SelfReferential.ll2
-rw-r--r--test/Verifier/aliasing-chain.ll2
-rw-r--r--test/Verifier/cttz-undef-arg.ll2
-rw-r--r--test/Verifier/dominates.ll57
-rw-r--r--test/Verifier/fpmath.ll2
-rw-r--r--test/Verifier/invoke-1.ll10
-rw-r--r--test/Verifier/invoke-2.ll14
-rw-r--r--test/Verifier/invoke.ll80
-rw-r--r--test/Verifier/range-1.ll66
-rw-r--r--test/Verifier/range-2.ll14
-rw-r--r--test/YAMLParser/spec-05-02-utf8.data2
-rw-r--r--test/YAMLParser/spec-05-10.data2
-rw-r--r--test/YAMLParser/spec-05-12.data2
-rw-r--r--test/YAMLParser/spec-05-15.data2
-rw-r--r--test/YAMLParser/spec-07-03.data2
-rw-r--r--test/YAMLParser/spec-07-05.data2
-rw-r--r--test/YAMLParser/spec-08-04.data2
-rw-r--r--test/YAMLParser/spec-08-06.data2
-rw-r--r--test/YAMLParser/spec-09-02.data2
-rw-r--r--test/YAMLParser/spec-09-14.data2
-rw-r--r--test/YAMLParser/spec-09-21.data2
-rw-r--r--test/YAMLParser/spec-10-08.data2
-rw-r--r--test/lit.cfg56
-rw-r--r--test/lit.site.cfg.in6
-rw-r--r--test/site.exp.in16
-rw-r--r--tools/CMakeLists.txt22
-rw-r--r--tools/LLVMBuild.txt2
-rw-r--r--tools/Makefile4
-rw-r--r--tools/bugpoint/BugDriver.cpp2
-rw-r--r--tools/bugpoint/ExtractFunction.cpp2
-rw-r--r--tools/bugpoint/ToolRunner.cpp2
-rw-r--r--tools/gold/gold-plugin.cpp2
-rw-r--r--tools/llc/llc.cpp169
-rw-r--r--tools/lli/lli.cpp210
-rw-r--r--tools/llvm-ar/llvm-ar.cpp19
-rw-r--r--tools/llvm-config/Makefile8
-rw-r--r--tools/llvm-config/llvm-config.cpp11
-rw-r--r--tools/llvm-diff/DiffConsumer.cpp2
-rw-r--r--tools/llvm-diff/DiffConsumer.h6
-rw-r--r--tools/llvm-diff/DifferenceEngine.cpp8
-rw-r--r--tools/llvm-diff/DifferenceEngine.h5
-rw-r--r--tools/llvm-diff/llvm-diff.cpp4
-rw-r--r--tools/llvm-dis/llvm-dis.cpp4
-rw-r--r--tools/llvm-dwarfdump/llvm-dwarfdump.cpp14
-rw-r--r--tools/llvm-ld/CMakeLists.txt8
-rw-r--r--tools/llvm-ld/LLVMBuild.txt22
-rw-r--r--tools/llvm-ld/Optimize.cpp130
-rw-r--r--tools/llvm-ld/llvm-ld.cpp732
-rw-r--r--tools/llvm-mc/llvm-mc.cpp42
-rw-r--r--tools/llvm-nm/llvm-nm.cpp10
-rw-r--r--tools/llvm-objdump/MachODump.cpp33
-rw-r--r--tools/llvm-objdump/llvm-objdump.cpp51
-rw-r--r--tools/llvm-prof/llvm-prof.cpp2
-rw-r--r--tools/llvm-ranlib/llvm-ranlib.cpp2
-rw-r--r--tools/llvm-rtdyld/llvm-rtdyld.cpp25
-rw-r--r--tools/llvm-shlib/Makefile2
-rw-r--r--tools/llvm-stress/llvm-stress.cpp29
-rw-r--r--tools/llvm-stub/CMakeLists.txt3
-rw-r--r--tools/llvm-stub/LLVMBuild.txt22
-rw-r--r--tools/llvm-stub/llvm-stub.c77
-rw-r--r--tools/lto/LTOCodeGenerator.cpp88
-rw-r--r--tools/lto/LTOCodeGenerator.h1
-rw-r--r--tools/lto/LTOModule.cpp198
-rw-r--r--tools/lto/LTOModule.h5
-rw-r--r--tools/macho-dump/macho-dump.cpp34
-rw-r--r--tools/opt/opt.cpp55
-rw-r--r--unittests/ADT/APIntTest.cpp28
-rw-r--r--unittests/ADT/BitVectorTest.cpp68
-rw-r--r--unittests/ADT/CMakeLists.txt33
-rw-r--r--unittests/ADT/DenseMapTest.cpp295
-rw-r--r--unittests/ADT/HashingTest.cpp2
-rw-r--r--unittests/ADT/SmallBitVectorTest.cpp196
-rw-r--r--unittests/ADT/SmallVectorTest.cpp329
-rw-r--r--unittests/ADT/SparseSetTest.cpp2
-rw-r--r--unittests/ADT/StringMapTest.cpp28
-rw-r--r--unittests/ADT/StringRefTest.cpp28
-rw-r--r--unittests/ADT/TinyPtrVectorTest.cpp448
-rw-r--r--unittests/ADT/TripleTest.cpp77
-rw-r--r--unittests/Analysis/CMakeLists.txt7
-rw-r--r--unittests/Analysis/Makefile2
-rw-r--r--unittests/Bitcode/CMakeLists.txt8
-rw-r--r--unittests/Bitcode/Makefile2
-rw-r--r--unittests/CMakeLists.txt184
-rw-r--r--unittests/ExecutionEngine/CMakeLists.txt9
-rw-r--r--unittests/ExecutionEngine/JIT/CMakeLists.txt57
-rw-r--r--unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp2
-rw-r--r--unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h14
-rw-r--r--unittests/ExecutionEngine/JIT/JITTest.cpp32
-rw-r--r--unittests/ExecutionEngine/JIT/Makefile2
-rw-r--r--unittests/ExecutionEngine/Makefile2
-rw-r--r--unittests/Support/AlignOfTest.cpp328
-rw-r--r--unittests/Support/BlockFrequencyTest.cpp2
-rw-r--r--unittests/Support/CMakeLists.txt27
-rw-r--r--unittests/Support/CommandLineTest.cpp11
-rw-r--r--unittests/Support/ConstantRangeTest.cpp78
-rw-r--r--unittests/Support/FileOutputBufferTest.cpp137
-rw-r--r--unittests/Support/IntegersSubsetTest.cpp328
-rw-r--r--unittests/Support/Path.cpp68
-rw-r--r--unittests/Support/YAMLParserTest.cpp6
-rw-r--r--unittests/Transforms/CMakeLists.txt1
-rw-r--r--unittests/Transforms/Utils/CMakeLists.txt8
-rw-r--r--unittests/Transforms/Utils/Cloning.cpp4
-rw-r--r--unittests/Transforms/Utils/Local.cpp5
-rw-r--r--unittests/Transforms/Utils/Makefile2
-rw-r--r--unittests/VMCore/CMakeLists.txt35
-rw-r--r--unittests/VMCore/IRBuilderTest.cpp (renamed from unittests/Support/IRBuilderTest.cpp)35
-rw-r--r--unittests/VMCore/InstructionsTest.cpp6
-rw-r--r--unittests/VMCore/MDBuilderTest.cpp (renamed from unittests/Support/MDBuilderTest.cpp)10
-rw-r--r--unittests/VMCore/Makefile2
-rw-r--r--unittests/VMCore/PassManagerTest.cpp2
-rw-r--r--unittests/VMCore/TypeBuilderTest.cpp (renamed from unittests/Support/TypeBuilderTest.cpp)8
-rw-r--r--unittests/VMCore/TypesTest.cpp30
-rwxr-xr-xutils/GetRepositoryPath2
-rwxr-xr-xutils/GetSourceVersion2
-rw-r--r--utils/Makefile2
-rw-r--r--utils/TableGen/AsmMatcherEmitter.cpp458
-rw-r--r--utils/TableGen/AsmMatcherEmitter.h31
-rw-r--r--utils/TableGen/AsmWriterEmitter.cpp46
-rw-r--r--utils/TableGen/AsmWriterEmitter.h54
-rw-r--r--utils/TableGen/CMakeLists.txt2
-rw-r--r--utils/TableGen/CallingConvEmitter.cpp28
-rw-r--r--utils/TableGen/CallingConvEmitter.h36
-rw-r--r--utils/TableGen/CodeEmitterGen.cpp37
-rw-r--r--utils/TableGen/CodeEmitterGen.h49
-rw-r--r--utils/TableGen/CodeGenDAGPatterns.cpp43
-rw-r--r--utils/TableGen/CodeGenInstruction.cpp28
-rw-r--r--utils/TableGen/CodeGenInstruction.h4
-rw-r--r--utils/TableGen/CodeGenIntrinsics.h5
-rw-r--r--utils/TableGen/CodeGenRegisters.cpp629
-rw-r--r--utils/TableGen/CodeGenRegisters.h194
-rw-r--r--utils/TableGen/CodeGenSchedule.cpp151
-rw-r--r--utils/TableGen/CodeGenSchedule.h172
-rw-r--r--utils/TableGen/CodeGenTarget.cpp21
-rw-r--r--utils/TableGen/CodeGenTarget.h8
-rw-r--r--utils/TableGen/DAGISelEmitter.cpp28
-rw-r--r--utils/TableGen/DAGISelEmitter.h37
-rw-r--r--utils/TableGen/DAGISelMatcher.h2
-rw-r--r--utils/TableGen/DAGISelMatcherEmitter.cpp4
-rw-r--r--utils/TableGen/DAGISelMatcherGen.cpp7
-rw-r--r--utils/TableGen/DFAPacketizerEmitter.cpp125
-rw-r--r--utils/TableGen/DFAPacketizerEmitter.h52
-rw-r--r--utils/TableGen/DisassemblerEmitter.cpp47
-rw-r--r--utils/TableGen/DisassemblerEmitter.h28
-rw-r--r--utils/TableGen/EDEmitter.cpp340
-rw-r--r--utils/TableGen/EDEmitter.h34
-rw-r--r--utils/TableGen/FastISelEmitter.cpp39
-rw-r--r--utils/TableGen/FastISelEmitter.h39
-rw-r--r--utils/TableGen/FixedLenDecoderEmitter.cpp85
-rw-r--r--utils/TableGen/FixedLenDecoderEmitter.h79
-rw-r--r--utils/TableGen/InstrInfoEmitter.cpp72
-rw-r--r--utils/TableGen/InstrInfoEmitter.h62
-rw-r--r--utils/TableGen/IntrinsicEmitter.cpp635
-rw-r--r--utils/TableGen/IntrinsicEmitter.h61
-rw-r--r--utils/TableGen/PseudoLoweringEmitter.cpp63
-rw-r--r--utils/TableGen/PseudoLoweringEmitter.h65
-rw-r--r--utils/TableGen/RegisterInfoEmitter.cpp626
-rw-r--r--utils/TableGen/RegisterInfoEmitter.h64
-rw-r--r--utils/TableGen/SequenceToOffsetTable.h2
-rw-r--r--utils/TableGen/SetTheory.cpp20
-rw-r--r--utils/TableGen/StringToOffsetTable.h1
-rw-r--r--utils/TableGen/SubtargetEmitter.cpp370
-rw-r--r--utils/TableGen/SubtargetEmitter.h72
-rw-r--r--utils/TableGen/TableGen.cpp51
-rw-r--r--utils/TableGen/TableGenBackends.h78
-rw-r--r--utils/TableGen/X86DisassemblerShared.h1
-rw-r--r--utils/TableGen/X86DisassemblerTables.cpp237
-rw-r--r--utils/TableGen/X86DisassemblerTables.h50
-rw-r--r--utils/TableGen/X86RecognizableInstr.cpp241
-rw-r--r--utils/TableGen/X86RecognizableInstr.h2
-rwxr-xr-xutils/UpdateCMakeLists.pl3
-rwxr-xr-xutils/buildit/build_llvm2
-rwxr-xr-xutils/findmisopt3
-rw-r--r--utils/lit/lit/TestRunner.py72
-rw-r--r--utils/llvm-build/llvmbuild/main.py2
-rw-r--r--utils/llvm-lit/llvm-lit.in5
-rw-r--r--utils/llvm.grm1
-rw-r--r--utils/obj2yaml/CMakeLists.txt7
-rw-r--r--utils/obj2yaml/Makefile (renamed from tools/llvm-ld/Makefile)18
-rw-r--r--utils/obj2yaml/coff2yaml.cpp362
-rw-r--r--utils/obj2yaml/obj2yaml.cpp89
-rw-r--r--utils/obj2yaml/obj2yaml.h35
-rwxr-xr-xutils/release/test-release.sh2
-rwxr-xr-xutils/test_debuginfo.pl4
-rw-r--r--utils/unittest/CMakeLists.txt6
-rw-r--r--utils/unittest/googletest/gtest.cc2
-rw-r--r--utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h1
-rw-r--r--utils/unittest/googletest/include/gtest/internal/gtest-internal.h3
-rw-r--r--utils/vim/llvm.vim4
-rw-r--r--utils/yaml2obj/CMakeLists.txt5
-rw-r--r--utils/yaml2obj/Makefile (renamed from tools/llvm-stub/Makefile)19
-rw-r--r--utils/yaml2obj/yaml2obj.cpp879
2874 files changed, 135059 insertions, 70684 deletions
diff --git a/.gitignore b/.gitignore
index b3d030e..ecf2e3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,7 @@ projects/*
tools/clang
# LLDB, which is tracked independently.
tools/lldb
+# lld, which is tracked independently.
+tools/lld
+# Sphinx build tree, if building in-source dir.
+docs/_build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 321023a..7b5240c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,11 +11,14 @@ set(CMAKE_MODULE_PATH
)
set(LLVM_VERSION_MAJOR 3)
-set(LLVM_VERSION_MINOR 1)
+set(LLVM_VERSION_MINOR 2)
set(PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}svn")
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+option(LLVM_USE_FOLDERS "Enable solution folders in Visual Studio. Disable for Express versions." ON)
+if ( LLVM_USE_FOLDERS )
+ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+endif()
include(VersionFromVCS)
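As a configure-time sketch of the new knob (generator name and path are examples only):

    cmake -G "Visual Studio 10" -DLLVM_USE_FOLDERS=OFF path/to/llvm

Express editions of Visual Studio reject solution folders, hence the OFF escape hatch.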
@@ -78,8 +81,8 @@ set(LLVM_ALL_TARGETS
Mips
MBlaze
MSP430
+ NVPTX
PowerPC
- PTX
Sparc
X86
XCore
@@ -96,6 +99,9 @@ else( MSVC )
CACHE STRING "Semicolon-separated list of targets to build, or \"all\".")
endif( MSVC )
+set(LLVM_EXPERIMENTAL_TARGETS_TO_BUILD ""
+ CACHE STRING "Semicolon-separated list of experimental targets to build.")
+
option(BUILD_SHARED_LIBS
"Build all libraries as shared libraries instead of static" OFF)
@@ -133,6 +139,11 @@ foreach(c ${LLVM_TARGETS_TO_BUILD})
endif()
endforeach(c)
+set(LLVM_TARGETS_TO_BUILD
+ ${LLVM_TARGETS_TO_BUILD}
+ ${LLVM_EXPERIMENTAL_TARGETS_TO_BUILD}
+ )
+
set(llvm_builded_incs_dir ${LLVM_BINARY_DIR}/include/llvm)
include(AddLLVMDefinitions)
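A usage sketch for the new cache variable; "Foo" stands in for an experimental backend living under lib/Target/Foo:

    cmake -DLLVM_TARGETS_TO_BUILD=X86 \
          -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=Foo path/to/llvm

Note that the experimental list is appended after the validity loop over LLVM_TARGETS_TO_BUILD, so experimental names bypass that check.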
@@ -398,6 +409,8 @@ add_subdirectory(utils/count)
add_subdirectory(utils/not)
add_subdirectory(utils/llvm-lit)
add_subdirectory(utils/yaml-bench)
+add_subdirectory(utils/obj2yaml)
+add_subdirectory(utils/yaml2obj)
add_subdirectory(projects)
@@ -422,6 +435,20 @@ if( LLVM_INCLUDE_TESTS )
# Windows.
add_subdirectory(utils/KillTheDoctor)
endif()
+
+ # Add a global check rule now that all subdirectories have been traversed
+ # and we know the total set of lit testsuites.
+ get_property(LLVM_LIT_TESTSUITES GLOBAL PROPERTY LLVM_LIT_TESTSUITES)
+ get_property(LLVM_LIT_PARAMS GLOBAL PROPERTY LLVM_LIT_PARAMS)
+ get_property(LLVM_LIT_DEPENDS GLOBAL PROPERTY LLVM_LIT_DEPENDS)
+ get_property(LLVM_LIT_EXTRA_ARGS GLOBAL PROPERTY LLVM_LIT_EXTRA_ARGS)
+ add_lit_target(check-all
+ "Running all regression tests"
+ ${LLVM_LIT_TESTSUITES}
+ PARAMS ${LLVM_LIT_PARAMS}
+ DEPENDS ${LLVM_LIT_DEPENDS}
+ ARGS ${LLVM_LIT_EXTRA_ARGS}
+ )
endif()
add_subdirectory(cmake/modules)
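With every subdirectory's lit suites collected into the global properties, the aggregate target can be driven from the build tree (make shown; any generator with an equivalent target works):

    make check-all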
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
new file mode 100644
index 0000000..fd7bcda
--- /dev/null
+++ b/CODE_OWNERS.TXT
@@ -0,0 +1,51 @@
+This file is a list of the people responsible for ensuring that patches for a
+particular part of LLVM are reviewed, either by themselves or by someone else.
+They are also the gatekeepers for their part of LLVM, with the final word on
+what goes in or not.
+
+The list is sorted by surname and formatted to allow easy grepping and
+beautification by scripts. The fields are: name (N), email (E), web-address
+(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
+(S).
+
+N: Evan Cheng
+E: evan.cheng@apple.com
+D: Code generator and all targets
+
+N: Greg Clayton
+D: LLDB
+
+N: Peter Collingbourne
+D: libclc
+
+N: Doug Gregor
+D: Clang Frontend Libraries
+
+N: Tobias Grosser
+D: Polly
+
+N: Howard Hinnant
+D: libc++
+
+N: Anton Korobeynikov
+E: asl@math.spbu.ru
+D: Exception handling, debug information, and Windows codegen
+
+N: Ted Kremenek
+D: Clang Static Analyzer
+
+N: Chris Lattner
+E: sabre@nondot.org
+W: http://nondot.org/~sabre/
+D: Everything not covered by someone else
+
+N: John McCall
+E: rjmccall@apple.com
+D: Clang LLVM IR generation
+
+N: Jakob Olesen
+D: Register allocators and TableGen
+
+N: Duncan Sands
+E: baldrick@free.fr
+D: DragonEgg
diff --git a/CREDITS.TXT b/CREDITS.TXT
index bf32a4c..f090ad7 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -22,6 +22,10 @@ D: GVNPRE pass, TargetData refactoring, random improvements
N: Henrik Bach
D: MingW Win32 API portability layer
+N: Aaron Ballman
+E: aaron@aaronballman.com
+D: __declspec attributes, Windows support, general bug fixing
+
N: Nate Begeman
E: natebegeman@mac.com
D: PowerPC backend developer
@@ -342,6 +346,10 @@ W: http://vladimir_prus.blogspot.com
E: ghost@cs.msu.su
D: Made inst_iterator behave like a proper iterator, LowerConstantExprs pass
+N: Kalle Raiskila
+E: kalle.rasikila@nokia.com
+D: Some bugfixes to CellSPU
+
N: Xerxes Ranby
E: xerxes@zafena.se
D: Cmake dependency chain and various bug fixes
diff --git a/Makefile.common b/Makefile.common
index e1f5203..55e2b63 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -30,8 +30,7 @@
#
# 4. Source - If specified, this sets the source code filenames. If this
# is not set, it defaults to be all of the .cpp, .c, .y, and .l files
-# in the current directory. Also, if you want to build files in addition
-# to the local files, you can use the ExtraSource variable
+# in the current directory.
#
# 5. SourceDir - If specified, this specifies a directory that the source files
# are in, if they are not in the current directory. This should include a
diff --git a/Makefile.config.in b/Makefile.config.in
index 2ffdacb..6d06f16 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -112,6 +112,8 @@ HOST_OS=@HOST_OS@
# Target operating system for which LLVM will compile for.
TARGET_OS=@TARGET_OS@
+# Host hardware architecture
+HOST_ARCH=@HOST_ARCH@
# Target hardware architecture
ARCH=@ARCH@
TARGET_NATIVE_ARCH := $(ARCH)
@@ -189,8 +191,6 @@ GAS := @GAS@
POD2HTML := @POD2HTML@
POD2MAN := @POD2MAN@
PDFROFF := @PDFROFF@
-RUNTEST := @RUNTEST@
-TCLSH := @TCLSH@
ZIP := @ZIP@
HAVE_PTHREAD := @HAVE_PTHREAD@
@@ -218,6 +218,12 @@ RDYNAMIC := @RDYNAMIC@
#ENABLE_LIBCPP = 0
ENABLE_LIBCPP = @ENABLE_LIBCPP@
+# When ENABLE_CXX11 is enabled, LLVM is built in C++11 mode by default.
+ENABLE_CXX11 = @ENABLE_CXX11@
+
+# When ENABLE_WERROR is enabled, we'll pass -Werror on the command line
+ENABLE_WERROR = @ENABLE_WERROR@
+
# When ENABLE_OPTIMIZED is enabled, LLVM code is optimized and output is put
# into the "Release" directories. Otherwise, LLVM code is not optimized and
# output is put in the "Debug" directories.
@@ -344,6 +350,10 @@ LLVM_HAS_POLLY = @LLVM_HAS_POLLY@
# bfd ld / gold --version-script=file
HAVE_LINK_VERSION_SCRIPT = @HAVE_LINK_VERSION_SCRIPT@
+# Flags to control using libxml2
+LIBXML2_LIBS := @LIBXML2_LIBS@
+LIBXML2_INC := @LIBXML2_INC@
+
# Flags to control building support for Intel JIT Events API
USE_INTEL_JITEVENTS := @USE_INTEL_JITEVENTS@
INTEL_JITEVENTS_INCDIR := @INTEL_JITEVENTS_INCDIR@
diff --git a/Makefile.rules b/Makefile.rules
index 70dd62d..289adc2 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -317,6 +317,15 @@ ifeq ($(ENABLE_LIBCPP),1)
LD.Flags += -stdlib=libc++
endif
+ifeq ($(ENABLE_CXX11),1)
+ CXX.Flags += -std=c++11
+endif
+
+ifeq ($(ENABLE_WERROR),1)
+ CXX.Flags += -Werror
+ C.Flags += -Werror
+endif
+
ifeq ($(ENABLE_PROFILING),1)
BuildMode := $(BuildMode)+Profile
CXX.Flags := $(filter-out -fomit-frame-pointer,$(CXX.Flags)) -pg -g
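The net effect, assuming an autoconf build configured with both new switches:

    ./configure --enable-cxx11 --enable-werror
    make VERBOSE=1    # compile lines now carry -std=c++11 and -Werror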
@@ -538,9 +547,6 @@ ifeq ($(LLVM_CROSS_COMPILING),1)
else
LLVM_CONFIG := $(LLVMToolDir)/llvm-config$(EXEEXT)
endif
-ifndef LLVMLD
-LLVMLD := $(LLVMToolDir)/llvm-ld$(EXEEXT)
-endif
ifndef LLVMDIS
LLVMDIS := $(LLVMToolDir)/llvm-dis$(EXEEXT)
endif
@@ -556,6 +562,9 @@ endif
ifndef LBUGPOINT
LBUGPOINT := $(LLVMToolDir)/bugpoint$(EXEEXT)
endif
+ifndef LLVMLINK
+LLVMLINK := $(LLVMToolDir)/llvm-link$(EXEEXT)
+endif
#--------------------------------------------------------------------
# Adjust to user's request
@@ -1134,14 +1143,14 @@ $(warning Modules require LLVM capable compiler but none is available ****)
else
Module := $(LibDir)/$(MODULE_NAME).bc
-LinkModule := $(LLVMLD) -r
+LinkModule := $(LLVMLINK)
ifdef EXPORTED_SYMBOL_FILE
LinkModule += -internalize-public-api-file=$(EXPORTED_SYMBOL_FILE)
endif
-$(Module): $(BUILT_SOURCES) $(ObjectsBC) $(LibDir)/.dir $(LLVMLD)
+$(Module): $(BUILT_SOURCES) $(ObjectsBC) $(LibDir)/.dir $(LLVMLINK)
$(Echo) Building $(BuildMode) Bytecode Module $(notdir $@)
$(Verb) $(LinkModule) -o $@ $(ObjectsBC)
@@ -1276,9 +1285,9 @@ else
all-local:: $(LibName.BCA)
ifdef EXPORTED_SYMBOL_FILE
-BCLinkLib = $(LLVMLD) -internalize-public-api-file=$(EXPORTED_SYMBOL_FILE)
+BCLinkLib = $(LLVMLINK) -internalize-public-api-file=$(EXPORTED_SYMBOL_FILE)
-$(LibName.BCA): $(ObjectsBC) $(LibDir)/.dir $(LLVMLD) \
+$(LibName.BCA): $(ObjectsBC) $(LibDir)/.dir $(LLVMLINK) \
$(LLVMToolDir)/llvm-ar
$(Echo) Building $(BuildMode) Bytecode Archive $(notdir $@) \
"(internalize)"
@@ -1964,20 +1973,9 @@ check::
$(EchoCmd) No test directory ; \
fi
+# An alias dating from when both lit and DejaGNU test runners were used.
check-lit:: check
-check-dg::
- $(Verb) if test -d "$(PROJ_OBJ_ROOT)/test" ; then \
- if test -f "$(PROJ_OBJ_ROOT)/test/Makefile" ; then \
- $(EchoCmd) Running test suite ; \
- $(MAKE) -C $(PROJ_OBJ_ROOT)/test check-local-dg ; \
- else \
- $(EchoCmd) No Makefile in test directory ; \
- fi ; \
- else \
- $(EchoCmd) No test directory ; \
- fi
-
check-all::
$(Verb) if test -d "$(PROJ_OBJ_ROOT)/test" ; then \
if test -f "$(PROJ_OBJ_ROOT)/test/Makefile" ; then \
diff --git a/README.txt b/README.txt
index 34a568f..0d39ed6 100644
--- a/README.txt
+++ b/README.txt
@@ -1,4 +1,3 @@
-
Low Level Virtual Machine (LLVM)
================================
diff --git a/autoconf/config.guess b/autoconf/config.guess
index f7dd69e..dd6dcb3 100755
--- a/autoconf/config.guess
+++ b/autoconf/config.guess
@@ -206,6 +206,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
exit ;;
+ *:Bitrig:*:*)
+ UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
+ echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE}
+ exit ;;
*:ekkoBSD:*:*)
echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
exit ;;
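On a Bitrig host this would print a triple of the form below (machine and release values are illustrative):

    $ ./config.guess
    amd64-unknown-bitrig1.0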
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index e487597..4dde3a6 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -31,9 +31,9 @@ dnl===
dnl===-----------------------------------------------------------------------===
dnl Initialize autoconf and define the package name, version number and
dnl address for reporting bugs.
-AC_INIT([LLVM],[3.1],[http://llvm.org/bugs/])
+AC_INIT([LLVM],[3.2svn],[http://llvm.org/bugs/])
AC_DEFINE([LLVM_VERSION_MAJOR], [3], [Major version of the LLVM API])
-AC_DEFINE([LLVM_VERSION_MINOR], [1], [Minor version of the LLVM API])
+AC_DEFINE([LLVM_VERSION_MINOR], [2], [Minor version of the LLVM API])
dnl Provide a copyright substitution and ensure the copyright notice is included
dnl in the output of --version option of the generated configure script.
@@ -369,7 +369,7 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
- ptx-*) llvm_cv_target_arch="PTX" ;;
+ nvptx-*) llvm_cv_target_arch="NVPTX" ;;
*) llvm_cv_target_arch="Unknown" ;;
esac])
@@ -387,6 +387,30 @@ esac
dnl Define a substitution, ARCH, for the target architecture
AC_SUBST(ARCH,$llvm_cv_target_arch)
+dnl Determine our host architecture.
+dnl This allows the MCJIT regression tests to run only on supported
+dnl platforms.
+case $host in
+ i?86-*) host_arch="x86" ;;
+ amd64-* | x86_64-*) host_arch="x86_64" ;;
+ sparc*-*) host_arch="Sparc" ;;
+ powerpc*-*) host_arch="PowerPC" ;;
+ arm*-*) host_arch="ARM" ;;
+ mips-*) host_arch="Mips" ;;
+ mipsel-*) host_arch="Mips" ;;
+ xcore-*) host_arch="XCore" ;;
+ msp430-*) host_arch="MSP430" ;;
+ hexagon-*) host_arch="Hexagon" ;;
+ mblaze-*) host_arch="MBlaze" ;;
+ *) host_arch="Unknown" ;;
+esac
+
+if test "$host_arch" = "Unknown" ; then
+ AC_MSG_WARN([Configuring LLVM for an unknown host architecture])
+fi
+
+AC_SUBST(HOST_ARCH,$host_arch)
+
dnl Check for the endianness of the target
AC_C_BIGENDIAN(AC_SUBST([ENDIAN],[big]),AC_SUBST([ENDIAN],[little]))
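A few worked mappings through the new case statement (the triples are examples):

    # $host                          ->  $host_arch
    # x86_64-unknown-freebsd9.0      ->  x86_64
    # armv6-unknown-linux-gnueabi    ->  ARM
    # ia64-hp-hpux11.31              ->  Unknown (configure warns)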
@@ -439,6 +463,18 @@ case "$enableval" in
*) AC_MSG_ERROR([Invalid setting for --enable-libcpp. Use "yes" or "no"]) ;;
esac
+dnl --enable-cxx11 : check whether or not to use -std=c++11 on the command line
+AC_ARG_ENABLE(cxx11,
+ AS_HELP_STRING([--enable-cxx11],
+ [Use c++11 if available (default is NO)]),,
+ enableval=default)
+case "$enableval" in
+ yes) AC_SUBST(ENABLE_CXX11,[1]) ;;
+ no) AC_SUBST(ENABLE_CXX11,[0]) ;;
+ default) AC_SUBST(ENABLE_CXX11,[0]);;
+ *) AC_MSG_ERROR([Invalid setting for --enable-cxx11. Use "yes" or "no"]) ;;
+esac
+
dnl --enable-optimized : check whether they want to do an optimized build:
AC_ARG_ENABLE(optimized, AS_HELP_STRING(
--enable-optimized,[Compile with optimizations enabled (default is NO)]),,enableval=$optimize)
@@ -466,6 +502,16 @@ else
AC_SUBST(DISABLE_ASSERTIONS,[[DISABLE_ASSERTIONS=1]])
fi
+dnl --enable-werror : check whether we want Werror on by default
+AC_ARG_ENABLE(werror,AS_HELP_STRING(
+ --enable-werror,[Compile with -Werror enabled (default is NO)]),, enableval="no")
+case "$enableval" in
+ yes) AC_SUBST(ENABLE_WERROR,[1]) ;;
+ no) AC_SUBST(ENABLE_WERROR,[0]) ;;
+ default) AC_SUBST(ENABLE_WERROR,[0]);;
+ *) AC_MSG_ERROR([Invalid setting for --enable-werror. Use "yes" or "no"]) ;;
+esac
+
dnl --enable-expensive-checks : check whether they want to turn on expensive debug checks:
AC_ARG_ENABLE(expensive-checks,AS_HELP_STRING(
--enable-expensive-checks,[Compile with expensive debug checks enabled (default is NO)]),, enableval="no")
@@ -516,7 +562,7 @@ else
MSP430) AC_SUBST(TARGET_HAS_JIT,0) ;;
Hexagon) AC_SUBST(TARGET_HAS_JIT,0) ;;
MBlaze) AC_SUBST(TARGET_HAS_JIT,0) ;;
- PTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
+ NVPTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
*) AC_SUBST(TARGET_HAS_JIT,0) ;;
esac
fi
@@ -621,20 +667,20 @@ case "$enableval" in
*) AC_MSG_ERROR([Invalid setting for --enable-timestamps. Use "yes" or "no"]) ;;
esac
AC_DEFINE_UNQUOTED([ENABLE_TIMESTAMPS],$ENABLE_TIMESTAMPS,
- [Define if timestamp information (e.g., __DATE___) is allowed])
+ [Define if timestamp information (e.g., __DATE__) is allowed])
dnl Allow specific targets to be specified for building (or not)
TARGETS_TO_BUILD=""
AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
[Build specific host targets: all or target1,target2,... Valid targets are:
host, x86, x86_64, sparc, powerpc, arm, mips, spu, hexagon,
- xcore, msp430, ptx, and cpp (default=all)]),,
+ xcore, msp430, nvptx, and cpp (default=all)]),,
enableval=all)
if test "$enableval" = host-only ; then
enableval=host
fi
case "$enableval" in
- all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze PTX Hexagon" ;;
+ all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze NVPTX Hexagon" ;;
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
case "$a_target" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -650,7 +696,7 @@ case "$enableval" in
cpp) TARGETS_TO_BUILD="CppBackend $TARGETS_TO_BUILD" ;;
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
- ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
host) case "$llvm_cv_target_arch" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -663,7 +709,7 @@ case "$enableval" in
XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;;
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
- PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
*) AC_MSG_ERROR([Can not set target to build]) ;;
esac ;;
*) AC_MSG_ERROR([Unrecognized target $a_target]) ;;
@@ -671,6 +717,17 @@ case "$enableval" in
done
;;
esac
+
+AC_ARG_ENABLE([experimental-targets],AS_HELP_STRING([--enable-experimental-targets],
+ [Build experimental host targets: disable or target1,target2,...
+ (default=disable)]),,
+ enableval=disable)
+
+if test ${enableval} != "disable"
+then
+ TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
+fi
+
AC_SUBST(TARGETS_TO_BUILD,$TARGETS_TO_BUILD)
dnl Determine whether we are building LLVM support for the native architecture.
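The autoconf counterpart of the CMake variable above; "Foo" again stands in for an experimental backend:

    ./configure --enable-targets=x86,arm --enable-experimental-targets=Foo

The value is prepended to TARGETS_TO_BUILD verbatim, so it must name a directory under lib/Target.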
@@ -1024,11 +1081,7 @@ AC_PATH_PROG(CAT, [cat])
AC_PATH_PROG(DOXYGEN, [doxygen])
AC_PATH_PROG(GROFF, [groff])
AC_PATH_PROG(GZIPBIN, [gzip])
-AC_PATH_PROG(POD2HTML, [pod2html])
-AC_PATH_PROG(POD2MAN, [pod2man])
AC_PATH_PROG(PDFROFF, [pdfroff])
-AC_PATH_PROG(RUNTEST, [runtest])
-DJ_AC_PATH_TCLSH
AC_PATH_PROG(ZIP, [zip])
AC_PATH_PROGS(OCAMLC, [ocamlc])
AC_PATH_PROGS(OCAMLOPT, [ocamlopt])
@@ -1260,6 +1313,23 @@ AC_ARG_WITH(intel-jitevents,
AC_DEFINE_UNQUOTED([LLVM_USE_INTEL_JITEVENTS],$USE_INTEL_JITEVENTS,
[Define if we have the Intel JIT API runtime support library])
+dnl Check for libxml2
+dnl Right now we're just checking for its existence; we could also check for a
+dnl particular version via --version on xml2-config.
+AC_CHECK_PROGS(XML2CONFIG, xml2-config)
+
+AC_MSG_CHECKING(for libxml2 includes)
+if test "x$XML2CONFIG" = "x"; then
+ AC_MSG_RESULT(xml2-config not found)
+else
+ LIBXML2_INC=`$XML2CONFIG --cflags`
+ AC_MSG_RESULT($LIBXML2_INC)
+ AC_CHECK_LIB(xml2, xmlReadFile,[AC_DEFINE([CLANG_HAVE_LIBXML],1,[Define if we have libxml2])
+ LIBXML2_LIBS="-lxml2"])
+fi
+AC_SUBST(LIBXML2_LIBS)
+AC_SUBST(LIBXML2_INC)
+
dnl===-----------------------------------------------------------------------===
dnl===
dnl=== SECTION 6: Check for header files
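Typical probe output when libxml2 is installed (install prefix varies by system):

    $ xml2-config --cflags
    -I/usr/include/libxml2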
@@ -1342,7 +1412,7 @@ AC_CHECK_FUNCS([powf fmodf strtof round ])
AC_CHECK_FUNCS([getpagesize getrusage getrlimit setrlimit gettimeofday ])
AC_CHECK_FUNCS([isatty mkdtemp mkstemp ])
AC_CHECK_FUNCS([mktemp posix_spawn pread realpath sbrk setrlimit strdup ])
-AC_CHECK_FUNCS([strerror strerror_r setenv ])
+AC_CHECK_FUNCS([strerror strerror_r setenv arc4random ])
AC_CHECK_FUNCS([strtoll strtoq sysconf malloc_zone_statistics ])
AC_CHECK_FUNCS([setjmp longjmp sigsetjmp siglongjmp writev])
AC_C_PRINTF_A
@@ -1518,6 +1588,8 @@ AC_DEFINE_UNQUOTED(LLVM_MANDIR, "$LLVM_MANDIR",
[Installation directory for man pages])
AC_DEFINE_UNQUOTED(LLVM_CONFIGTIME, "$LLVM_CONFIGTIME",
[Time at which LLVM was configured])
+AC_DEFINE_UNQUOTED(LLVM_HOSTTRIPLE, "$host",
+ [Host triple LLVM will be executed on])
AC_DEFINE_UNQUOTED(LLVM_DEFAULT_TARGET_TRIPLE, "$target",
[Target triple LLVM will generate code for by default])
diff --git a/autoconf/ltmain.sh b/autoconf/ltmain.sh
index 2455278..21ace01 100644
--- a/autoconf/ltmain.sh
+++ b/autoconf/ltmain.sh
@@ -1560,7 +1560,7 @@ EOF
# These systems don't actually have a C library (as such)
test "X$arg" = "X-lc" && continue
;;
- *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+ *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*)
# Do not include libc due to us having libc/libc_r.
test "X$arg" = "X-lc" && continue
;;
@@ -1580,7 +1580,7 @@ EOF
esac
elif test "X$arg" = "X-lc_r"; then
case $host in
- *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+ *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*)
# Do not include libc_r directly, use -pthread flag.
continue
;;
@@ -3464,7 +3464,7 @@ EOF
*-*-netbsd*)
# Don't link with libc until the a.out ld.so is fixed.
;;
- *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+ *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*)
# Do not include libc due to us having libc/libc_r.
;;
*-*-sco3.2v5* | *-*-sco5v6*)
diff --git a/autoconf/m4/func_isinf.m4 b/autoconf/m4/func_isinf.m4
index 5c000f8..22ba81d 100644
--- a/autoconf/m4/func_isinf.m4
+++ b/autoconf/m4/func_isinf.m4
@@ -1,5 +1,5 @@
#
-# This function determins if the the isinf function isavailable on this
+# This function determines if the isinf function is available on this
# platform.
#
AC_DEFUN([AC_FUNC_ISINF],[
diff --git a/autoconf/m4/huge_val.m4 b/autoconf/m4/huge_val.m4
index 9dc76f2..6c9a22e 100644
--- a/autoconf/m4/huge_val.m4
+++ b/autoconf/m4/huge_val.m4
@@ -1,5 +1,5 @@
#
-# This function determins if the the HUGE_VAL macro is compilable with the
+# This function determines if the HUGE_VAL macro is compilable with the
# -pedantic switch or not. XCode < 2.4.1 doesn't get it right.
#
AC_DEFUN([AC_HUGE_VAL_CHECK],[
diff --git a/autoconf/m4/libtool.m4 b/autoconf/m4/libtool.m4
index 36ac3d1..05af7a2 100644
--- a/autoconf/m4/libtool.m4
+++ b/autoconf/m4/libtool.m4
@@ -176,7 +176,7 @@ old_postuninstall_cmds=
if test -n "$RANLIB"; then
case $host_os in
- openbsd*)
+ openbsd* | bitrig*)
old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB -t \$oldlib"
;;
*)
@@ -729,7 +729,7 @@ AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
lt_cv_sys_max_cmd_len=8192;
;;
- netbsd* | freebsd* | openbsd* | darwin* | dragonfly*)
+ netbsd* | freebsd* | openbsd* | darwin* | dragonfly* | bitrig*)
# This has been around since 386BSD, at least. Likely further.
if test -x /sbin/sysctl; then
lt_cv_sys_max_cmd_len=`/sbin/sysctl -n kern.argmax`
@@ -1631,7 +1631,7 @@ nto-qnx*)
shlibpath_overrides_runpath=yes
;;
-openbsd*)
+openbsd* | bitrig*)
version_type=sunos
sys_lib_dlsearch_path_spec="/usr/lib"
need_lib_prefix=no
@@ -3382,7 +3382,7 @@ case $host_os in
# C++ shared libraries are fairly broken
_LT_AC_TAGVAR(ld_shlibs, $1)=no
;;
- openbsd*)
+ openbsd* | bitrig*)
_LT_AC_TAGVAR(hardcode_direct, $1)=yes
_LT_AC_TAGVAR(hardcode_shlibpath_var, $1)=no
_LT_AC_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
@@ -6003,7 +6003,7 @@ _LT_EOF
_LT_AC_TAGVAR(hardcode_shlibpath_var, $1)=no
;;
- openbsd*)
+ openbsd* | bitrig*)
_LT_AC_TAGVAR(hardcode_direct, $1)=yes
_LT_AC_TAGVAR(hardcode_shlibpath_var, $1)=no
if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
diff --git a/autoconf/m4/rand48.m4 b/autoconf/m4/rand48.m4
index 56705d8..76f08fa 100644
--- a/autoconf/m4/rand48.m4
+++ b/autoconf/m4/rand48.m4
@@ -1,5 +1,5 @@
#
-# This function determins if the the srand48,drand48,lrand48 functions are
+# This function determines if the srand48, drand48, and lrand48 functions are
# available on this platform.
#
AC_DEFUN([AC_FUNC_RAND48],[
diff --git a/bindings/ocaml/bitreader/llvm_bitreader.mli b/bindings/ocaml/bitreader/llvm_bitreader.mli
index 1d33319..573de5e 100644
--- a/bindings/ocaml/bitreader/llvm_bitreader.mli
+++ b/bindings/ocaml/bitreader/llvm_bitreader.mli
@@ -23,7 +23,6 @@ val get_module : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
(** [parse_bitcode context mb] parses the bitcode for a new module [m] from the
memory buffer [mb] in the context [context]. Returns [m] if successful, or
- raises [Error msg] otherwise, where [msg] is a description of the error
- encountered. See the function [llvm::ParseBitcodeFile]. *)
+ raises [Error msg] otherwise, where [msg] is a description of the error
+ encountered. See the function [llvm::ParseBitcodeFile]. *)
val parse_bitcode : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
-
diff --git a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c b/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
index 2db6456..7047ec0 100644
--- a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
+++ b/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
@@ -57,7 +57,7 @@ CAMLprim value llvm_add_scalar_repl_aggregation_ssa(LLVMPassManagerRef PM) {
/* [<Llvm.PassManager.any] Llvm.PassManager.t -> int -> unit */
CAMLprim value llvm_add_scalar_repl_aggregation_with_threshold(value threshold,
- LLVMPassManagerRef PM) {
+ LLVMPassManagerRef PM) {
LLVMAddScalarReplAggregatesPassWithThreshold(PM, Int_val(threshold));
return Val_unit;
}
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 57ae79a..25d6211 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -91,6 +91,7 @@ if( NOT PURE_WINDOWS )
endif()
# function checks
+check_symbol_exists(arc4random "stdlib.h" HAVE_ARC4RANDOM)
check_symbol_exists(getpagesize unistd.h HAVE_GETPAGESIZE)
check_symbol_exists(getrusage sys/resource.h HAVE_GETRUSAGE)
check_symbol_exists(setrlimit sys/resource.h HAVE_SETRLIMIT)
@@ -240,6 +241,7 @@ llvm_find_program(fdp)
llvm_find_program(dot)
llvm_find_program(dotty)
llvm_find_program(xdot.py)
+llvm_find_program(Graphviz)
if( LLVM_ENABLE_FFI )
find_path(FFI_INCLUDE_PATH ffi.h PATHS ${FFI_INCLUDE_DIR})
@@ -293,6 +295,7 @@ get_host_triple(LLVM_HOST_TRIPLE)
# By default, we target the host, but this can be overridden at CMake
# invocation time.
set(LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_HOST_TRIPLE}")
+set(LLVM_HOSTTRIPLE "${LLVM_HOST_TRIPLE}")
set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}")
# Determine the native architecture.
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 388208b..f44a27c 100755
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -1,3 +1,4 @@
+include(LLVMParseArguments)
include(LLVMProcessSources)
include(LLVM-Config)
@@ -65,8 +66,8 @@ ${name} ignored.")
set_target_properties( ${name} PROPERTIES EXCLUDE_FROM_ALL ON)
else()
install(TARGETS ${name}
- LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
- ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+ LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+ ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
endif()
endif()
@@ -82,7 +83,6 @@ macro(add_llvm_executable name)
add_executable(${name} ${ALL_FILES})
endif()
set(EXCLUDE_FROM_ALL OFF)
- target_link_libraries( ${name} ${LLVM_USED_LIBS} )
llvm_config( ${name} ${LLVM_LINK_COMPONENTS} )
if( LLVM_COMMON_DEPENDS )
add_dependencies( ${name} ${LLVM_COMMON_DEPENDS} )
@@ -130,3 +130,165 @@ macro(add_llvm_target target_name)
add_llvm_library(LLVM${target_name} ${ARGN} ${TABLEGEN_OUTPUT})
set( CURRENT_LLVM_TARGET LLVM${target_name} )
endmacro(add_llvm_target)
+
+# Add an external project that may want to be built as part of LLVM, such as
+# Clang, lld, and Polly. This adds two options: one for the source directory of
+# the project, which defaults to ${CMAKE_CURRENT_SOURCE_DIR}/${name}, and one to
+# enable or disable building it with everything else.
+macro(add_llvm_external_project name)
+ string(TOUPPER ${name} nameUPPER)
+ set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${name}"
+ CACHE PATH "Path to ${name} source directory")
+ if (NOT ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} STREQUAL ""
+ AND EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}/CMakeLists.txt)
+ option(LLVM_EXTERNAL_${nameUPPER}_BUILD
+ "Whether to build ${name} as part of LLVM" ON)
+ if (LLVM_EXTERNAL_${nameUPPER}_BUILD)
+ add_subdirectory(${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} ${name})
+ endif()
+ endif()
+endmacro(add_llvm_external_project)
+
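A configure-time sketch of the cache variables the macro above defines, with an invented source path:

    cmake -DLLVM_EXTERNAL_LLD_SOURCE_DIR=$HOME/src/lld \
          -DLLVM_EXTERNAL_LLD_BUILD=ON path/to/llvm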
+# Generic support for adding a unittest.
+function(add_unittest test_suite test_name)
+ if (CMAKE_BUILD_TYPE)
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY
+ ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
+ else()
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+ endif()
+ if( NOT LLVM_BUILD_TESTS )
+ set(EXCLUDE_FROM_ALL ON)
+ endif()
+
+ add_llvm_executable(${test_name} ${ARGN})
+ target_link_libraries(${test_name}
+ gtest
+ gtest_main
+ LLVMSupport # gtest needs it for raw_ostream.
+ )
+
+ add_dependencies(${test_suite} ${test_name})
+ get_target_property(test_suite_folder ${test_suite} FOLDER)
+ if (NOT ${test_suite_folder} STREQUAL "NOTFOUND")
+ set_property(TARGET ${test_name} PROPERTY FOLDER "${test_suite_folder}")
+ endif ()
+
+ # Visual Studio 2012 only supports up to 8 template parameters in
+ # std::tr1::tuple by default, but gtest requires 10
+ if (MSVC AND MSVC_VERSION EQUAL 1700)
+ set_property(TARGET ${test_name} APPEND PROPERTY COMPILE_DEFINITIONS _VARIADIC_MAX=10)
+ endif ()
+
+ include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
+ set_property(TARGET ${test_name} APPEND PROPERTY COMPILE_DEFINITIONS GTEST_HAS_RTTI=0)
+ if (NOT LLVM_ENABLE_THREADS)
+ set_property(TARGET ${test_name} APPEND PROPERTY COMPILE_DEFINITIONS GTEST_HAS_PTHREAD=0)
+ endif ()
+
+ get_property(target_compile_flags TARGET ${test_name} PROPERTY COMPILE_FLAGS)
+ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+ set(target_compile_flags "${target_compile_flags} -fno-rtti")
+ elseif (MSVC)
+ set(target_compile_flags "${target_compile_flags} /GR-")
+ endif ()
+
+ if (SUPPORTS_NO_VARIADIC_MACROS_FLAG)
+ set(target_compile_flags "${target_compile_flags} -Wno-variadic-macros")
+ endif ()
+ set_property(TARGET ${test_name} PROPERTY COMPILE_FLAGS "${target_compile_flags}")
+endfunction()
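
A usage sketch (the suite and test names and the source file are illustrative):

    # The suite is an ordinary target that aggregates individual test binaries.
    add_custom_target(UnitTests)
    set_target_properties(UnitTests PROPERTIES FOLDER "Tests")
    add_unittest(UnitTests ADTTests SmallVectorTest.cpp)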
+
+# This function provides an automatic, 'configure'-like way to generate a file
+# from a set of common and custom variables, specifically targeting the
+# variables needed for the 'lit.site.cfg' files. It bundles the common
+# variables that any Lit instance is likely to need, and custom variables
+# can be passed in.
+function(configure_lit_site_cfg input output)
+ foreach(c ${LLVM_TARGETS_TO_BUILD})
+ set(TARGETS_BUILT "${TARGETS_BUILT} ${c}")
+ endforeach(c)
+ set(TARGETS_TO_BUILD ${TARGETS_BUILT})
+
+ set(SHLIBEXT "${LTDL_SHLIB_EXT}")
+ set(SHLIBDIR "${LLVM_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}")
+
+ if(BUILD_SHARED_LIBS)
+ set(LLVM_SHARED_LIBS_ENABLED "1")
+ else()
+ set(LLVM_SHARED_LIBS_ENABLED "0")
+ endif(BUILD_SHARED_LIBS)
+
+ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+ set(SHLIBPATH_VAR "DYLD_LIBRARY_PATH")
+ else() # Default for all other Unix-like systems.
+ # CMake hardcodes the library location using rpath.
+ # Therefore LD_LIBRARY_PATH is not required to run binaries in the
+ # build dir. We pass it anyway.
+ set(SHLIBPATH_VAR "LD_LIBRARY_PATH")
+ endif()
+
+ # Configuration-time: See Unit/lit.site.cfg.in
+ set(LLVM_BUILD_MODE "%(build_mode)s")
+
+ set(LLVM_SOURCE_DIR ${LLVM_MAIN_SRC_DIR})
+ set(LLVM_BINARY_DIR ${LLVM_BINARY_DIR})
+ set(LLVM_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}/%(build_config)s")
+ set(LLVM_LIBS_DIR "${LLVM_BINARY_DIR}/lib/%(build_config)s")
+ set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
+ set(ENABLE_SHARED ${LLVM_SHARED_LIBS_ENABLED})
+ set(SHLIBPATH_VAR ${SHLIBPATH_VAR})
+
+ if(LLVM_ENABLE_ASSERTIONS AND NOT MSVC_IDE)
+ set(ENABLE_ASSERTIONS "1")
+ else()
+ set(ENABLE_ASSERTIONS "0")
+ endif()
+
+ set(HOST_OS ${CMAKE_HOST_SYSTEM_NAME})
+ set(HOST_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
+
+ configure_file(${input} ${output} @ONLY)
+endfunction()
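
A minimal call sketch, assuming a lit.site.cfg.in template next to the calling CMakeLists.txt:

    # Expands @TARGETS_TO_BUILD@, @LLVM_TOOLS_DIR@, @SHLIBPATH_VAR@, etc.
    configure_lit_site_cfg(
      ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
      ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
      )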
+
+# A raw function to create a lit target. This is used to implement the
+# testsuite management functions.
+function(add_lit_target target comment)
+ parse_arguments(ARG "PARAMS;DEPENDS;ARGS" "" ${ARGN})
+ set(LIT_ARGS "${ARG_ARGS} ${LLVM_LIT_ARGS}")
+ separate_arguments(LIT_ARGS)
+ set(LIT_COMMAND
+ ${PYTHON_EXECUTABLE}
+ ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py
+ --param build_config=${CMAKE_CFG_INTDIR}
+ --param build_mode=${RUNTIME_BUILD_MODE}
+ ${LIT_ARGS}
+ )
+ foreach(param ${ARG_PARAMS})
+ list(APPEND LIT_COMMAND --param ${param})
+ endforeach()
+ add_custom_target(${target}
+ COMMAND ${LIT_COMMAND} ${ARG_DEFAULT_ARGS}
+ COMMENT "${comment}"
+ )
+ add_dependencies(${target} ${ARG_DEPENDS})
+endfunction()
+
+# A function to add a set of lit test suites to be driven through 'check-*' targets.
+function(add_lit_testsuite target comment)
+ parse_arguments(ARG "PARAMS;DEPENDS;ARGS" "" ${ARGN})
+
+ # Register the testsuites, params and depends for the global check rule.
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_TESTSUITES ${ARG_DEFAULT_ARGS})
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_PARAMS ${ARG_PARAMS})
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_DEPENDS ${ARG_DEPENDS})
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_EXTRA_ARGS ${ARG_ARGS})
+
+ # Produce a specific suffixed check rule.
+ add_lit_target(${target} ${comment}
+ ${ARG_DEFAULT_ARGS}
+ PARAMS ${ARG_PARAMS}
+ DEPENDS ${ARG_DEPENDS}
+ ARGS ${ARG_ARGS}
+ )
+endfunction()
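
A usage sketch (the target name, parameter, and dependencies are illustrative):

    add_lit_testsuite(check-foo "Running the foo regression tests"
      ${CMAKE_CURRENT_BINARY_DIR}
      PARAMS foo_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
      DEPENDS FileCheck
      )

The leading unnamed arguments become the test directories handed to lit, while PARAMS, DEPENDS, and ARGS are split out by parse_arguments.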
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index d10e59a..b5f96e8 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -105,8 +105,9 @@ if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
if( LLVM_BUILD_32_BITS )
message(STATUS "Building 32 bits executables and libraries.")
add_llvm_definitions( -m32 )
- list(APPEND CMAKE_EXE_LINKER_FLAGS -m32)
- list(APPEND CMAKE_SHARED_LINKER_FLAGS -m32)
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -m32")
+ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -m32")
+ set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -m32")
endif( LLVM_BUILD_32_BITS )
endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
@@ -150,8 +151,11 @@ if( MSVC )
-D_SCL_SECURE_NO_DEPRECATE
-D_SCL_SECURE_NO_WARNINGS
+ # Disabled warnings.
+ -wd4065 # Suppress 'switch statement contains 'default' but no 'case' labels'
-wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned'
-wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored'
+ -wd4181 # Suppress 'qualifier applied to reference type; ignored'
-wd4224 # Suppress 'nonstandard extension used : formal parameter 'identifier' was previously defined as a type'
-wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
-wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
@@ -165,14 +169,18 @@ if( MSVC )
-wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
-wd4715 # Suppress ''function' : not all control paths return a value'
-wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
- -wd4065 # Suppress 'switch statement contains 'default' but no 'case' labels'
- -wd4181 # Suppress 'qualifier applied to reference type; ignored'
- -w14062 # Promote "enumerator in switch of enum is not handled" to level 1 warning.
+
+ # Promoted warnings.
+ -w14062 # Promote 'enumerator in switch of enum is not handled' to level 1 warning.
+
+ # Promoted warnings to errors.
+ -we4238 # Promote 'nonstandard extension used : class rvalue used as lvalue' to error.
+ -we4239 # Promote 'nonstandard extension used : 'token' : conversion from 'type' to 'type'' to error.
)
# Enable warnings
if (LLVM_ENABLE_WARNINGS)
- add_llvm_definitions( /W4 /Wall )
+ add_llvm_definitions( /W4 )
if (LLVM_ENABLE_PEDANTIC)
# No MSVC equivalent available
endif (LLVM_ENABLE_PEDANTIC)
diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index 443ec41..c43119e 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -31,7 +31,7 @@ set(HAVE_LIBPTHREAD @HAVE_LIBPTHREAD@)
set(LLVM_ON_UNIX @LLVM_ON_UNIX@)
set(LLVM_ON_WIN32 @LLVM_ON_WIN32@)
-set(LLVM_INSTALL_PREFIX @LLVM_INSTALL_PREFIX@)
+set(LLVM_INSTALL_PREFIX "@LLVM_INSTALL_PREFIX@")
set(LLVM_INCLUDE_DIRS ${LLVM_INSTALL_PREFIX}/include)
set(LLVM_LIBRARY_DIRS ${LLVM_INSTALL_PREFIX}/lib)
set(LLVM_DEFINITIONS "-D__STDC_LIMIT_MACROS" "-D__STDC_CONSTANT_MACROS")
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index 641f1b3..0e410ed 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -29,7 +29,7 @@ endmacro(add_td_sources)
macro(add_header_files srcs)
- file(GLOB hds *.h *.def)
+ file(GLOB hds *.h)
if( hds )
set_source_files_properties(${hds} PROPERTIES HEADER_FILE_ONLY ON)
list(APPEND ${srcs} ${hds})
@@ -50,6 +50,7 @@ function(llvm_process_sources OUT_VAR)
endforeach(s)
if( MSVC_IDE )
# This adds .td and .h files to the Visual Studio solution:
+ # FIXME: Shall we handle *.def here?
add_td_sources(sources)
add_header_files(sources)
endif()
@@ -81,10 +82,13 @@ function(llvm_check_source_file_list)
file(GLOB globbed *.cpp)
foreach(g ${globbed})
get_filename_component(fn ${g} NAME)
- list(FIND listed ${fn} idx)
+ list(FIND LLVM_OPTIONAL_SOURCES ${fn} idx)
if( idx LESS 0 )
- message(SEND_ERROR "Found unknown source file ${g}
+ list(FIND listed ${fn} idx)
+ if( idx LESS 0 )
+ message(SEND_ERROR "Found unknown source file ${g}
Please update ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt\n")
+ endif()
endif()
endforeach()
endfunction(llvm_check_source_file_list)
diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index 1b1b172..e3bdd9c 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake
@@ -50,7 +50,9 @@ function(add_public_tablegen_target target)
if( TABLEGEN_OUTPUT )
add_custom_target(${target}
DEPENDS ${TABLEGEN_OUTPUT})
- add_dependencies(${target} ${LLVM_COMMON_DEPENDS})
+ if (LLVM_COMMON_DEPENDS)
+ add_dependencies(${target} ${LLVM_COMMON_DEPENDS})
+ endif ()
set_target_properties(${target} PROPERTIES FOLDER "Tablegenning")
endif( TABLEGEN_OUTPUT )
endfunction()
diff --git a/configure b/configure
index a4c1592..62a6478 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.60 for LLVM 3.1.
+# Generated by GNU Autoconf 2.60 for LLVM 3.2svn.
#
# Report bugs to <http://llvm.org/bugs/>.
#
@@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='LLVM'
PACKAGE_TARNAME='llvm'
-PACKAGE_VERSION='3.1'
-PACKAGE_STRING='LLVM 3.1'
+PACKAGE_VERSION='3.2svn'
+PACKAGE_STRING='LLVM 3.2svn'
PACKAGE_BUGREPORT='http://llvm.org/bugs/'
ac_unique_file="lib/VMCore/Module.cpp"
@@ -674,6 +674,7 @@ NOLINKALL
LLVM_ON_UNIX
LLVM_ON_WIN32
ARCH
+HOST_ARCH
ENDIAN
GREP
EGREP
@@ -683,9 +684,11 @@ BUILD_EXEEXT
BUILD_CXX
CVSBUILD
ENABLE_LIBCPP
+ENABLE_CXX11
ENABLE_OPTIMIZED
ENABLE_PROFILING
DISABLE_ASSERTIONS
+ENABLE_WERROR
ENABLE_EXPENSIVE_CHECKS
EXPENSIVE_CHECKS
DEBUG_RUNTIME
@@ -743,11 +746,7 @@ CAT
DOXYGEN
GROFF
GZIPBIN
-POD2HTML
-POD2MAN
PDFROFF
-RUNTEST
-TCLSH
ZIP
OCAMLC
OCAMLOPT
@@ -768,6 +767,9 @@ USE_OPROFILE
USE_INTEL_JITEVENTS
INTEL_JITEVENTS_INCDIR
INTEL_JITEVENTS_LIBDIR
+XML2CONFIG
+LIBXML2_LIBS
+LIBXML2_INC
HAVE_PTHREAD
HUGE_VAL_SANITY
MMAP_FILE
@@ -1318,7 +1320,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures LLVM 3.1 to adapt to many kinds of systems.
+\`configure' configures LLVM 3.2svn to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1384,7 +1386,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of LLVM 3.1:";;
+ short | recursive ) echo "Configuration of LLVM 3.2svn:";;
esac
cat <<\_ACEOF
@@ -1393,10 +1395,12 @@ Optional Features:
--enable-FEATURE[=ARG] include FEATURE [ARG=yes]
--enable-polly Use polly if available (default is YES)
--enable-libcpp Use libc++ if available (default is NO)
+ --enable-cxx11 Use c++11 if available (default is NO)
--enable-optimized Compile with optimizations enabled (default is NO)
--enable-profiling Compile with profiling enabled (default is NO)
--enable-assertions Compile with assertion checks enabled (default is
YES)
+ --enable-werror Compile with -Werror enabled (default is NO)
--enable-expensive-checks
Compile with expensive debug checks enabled (default
is NO)
@@ -1420,7 +1424,10 @@ Optional Features:
--enable-targets Build specific host targets: all or
target1,target2,... Valid targets are: host, x86,
x86_64, sparc, powerpc, arm, mips, spu, hexagon,
- xcore, msp430, ptx, and cpp (default=all)
+ xcore, msp430, nvptx, and cpp (default=all)
+ --enable-experimental-targets
+ Build experimental host targets: disable or
+ target1,target2,... (default=disable)
--enable-bindings Build specific language bindings:
all,auto,none,{binding-name} (default=auto)
--enable-libffi Check for the presence of libffi (default is NO)
@@ -1448,7 +1455,6 @@ Optional Packages:
--with-bug-report-url Specify the URL where bug reports should be
submitted (default=http://llvm.org/bugs/)
--with-internal-prefix Installation directory for internal files
- --with-tclinclude directory where tcl headers are
--with-udis86=<path> Use udis86 external x86 disassembler library
--with-oprofile=<prefix>
Tell OProfile >= 0.9.4 how to symbolize JIT output
@@ -1532,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-LLVM configure 3.1
+LLVM configure 3.2svn
generated by GNU Autoconf 2.60
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1548,7 +1554,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by LLVM $as_me 3.1, which was
+It was created by LLVM $as_me 3.2svn, which was
generated by GNU Autoconf 2.60. Invocation command line was
$ $0 $@
@@ -1908,7 +1914,7 @@ _ACEOF
cat >>confdefs.h <<\_ACEOF
-#define LLVM_VERSION_MINOR 1
+#define LLVM_VERSION_MINOR 2
_ACEOF
@@ -3902,7 +3908,7 @@ else
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
- ptx-*) llvm_cv_target_arch="PTX" ;;
+ nvptx-*) llvm_cv_target_arch="NVPTX" ;;
*) llvm_cv_target_arch="Unknown" ;;
esac
fi
@@ -3923,6 +3929,29 @@ esac
ARCH=$llvm_cv_target_arch
+case $host in
+ i?86-*) host_arch="x86" ;;
+ amd64-* | x86_64-*) host_arch="x86_64" ;;
+ sparc*-*) host_arch="Sparc" ;;
+ powerpc*-*) host_arch="PowerPC" ;;
+ arm*-*) host_arch="ARM" ;;
+ mips-*) host_arch="Mips" ;;
+ mipsel-*) host_arch="Mips" ;;
+ xcore-*) host_arch="XCore" ;;
+ msp430-*) host_arch="MSP430" ;;
+ hexagon-*) host_arch="Hexagon" ;;
+ mblaze-*) host_arch="MBlaze" ;;
+ *) host_arch="Unknown" ;;
+esac
+
+if test "$host_arch" = "Unknown" ; then
+ { echo "$as_me:$LINENO: WARNING: Configuring LLVM for an unknown host archicture" >&5
+echo "$as_me: WARNING: Configuring LLVM for an unknown host archicture" >&2;}
+fi
+
+HOST_ARCH=$host_arch
+
+
{ echo "$as_me:$LINENO: checking for grep that handles long lines and -e" >&5
@@ -4997,6 +5026,25 @@ echo "$as_me: error: Invalid setting for --enable-libcpp. Use \"yes\" or \"no\""
{ (exit 1); exit 1; }; } ;;
esac
+# Check whether --enable-cxx11 was given.
+if test "${enable_cxx11+set}" = set; then
+ enableval=$enable_cxx11;
+else
+ enableval=default
+fi
+
+case "$enableval" in
+ yes) ENABLE_CXX11=1
+ ;;
+ no) ENABLE_CXX11=0
+ ;;
+ default) ENABLE_CXX11=0
+;;
+ *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-cxx11. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-cxx11. Use \"yes\" or \"no\"" >&2;}
+ { (exit 1); exit 1; }; } ;;
+esac
+
# Check whether --enable-optimized was given.
if test "${enable_optimized+set}" = set; then
enableval=$enable_optimized;
@@ -5042,6 +5090,25 @@ else
fi
+# Check whether --enable-werror was given.
+if test "${enable_werror+set}" = set; then
+ enableval=$enable_werror;
+else
+ enableval="no"
+fi
+
+case "$enableval" in
+ yes) ENABLE_WERROR=1
+ ;;
+ no) ENABLE_WERROR=0
+ ;;
+ default) ENABLE_WERROR=0
+;;
+ *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-werror. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-werror. Use \"yes\" or \"no\"" >&2;}
+ { (exit 1); exit 1; }; } ;;
+esac
+
# Check whether --enable-expensive-checks was given.
if test "${enable_expensive_checks+set}" = set; then
enableval=$enable_expensive_checks;
@@ -5124,7 +5191,7 @@ else
;;
MBlaze) TARGET_HAS_JIT=0
;;
- PTX) TARGET_HAS_JIT=0
+ NVPTX) TARGET_HAS_JIT=0
;;
*) TARGET_HAS_JIT=0
;;
@@ -5310,7 +5377,7 @@ if test "$enableval" = host-only ; then
enableval=host
fi
case "$enableval" in
- all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze PTX Hexagon" ;;
+ all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze NVPTX Hexagon" ;;
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
case "$a_target" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -5326,7 +5393,7 @@ case "$enableval" in
cpp) TARGETS_TO_BUILD="CppBackend $TARGETS_TO_BUILD" ;;
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
- ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
host) case "$llvm_cv_target_arch" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -5339,7 +5406,7 @@ case "$enableval" in
XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;;
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
- PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
*) { { echo "$as_me:$LINENO: error: Can not set target to build" >&5
echo "$as_me: error: Can not set target to build" >&2;}
{ (exit 1); exit 1; }; } ;;
@@ -5351,6 +5418,20 @@ echo "$as_me: error: Unrecognized target $a_target" >&2;}
done
;;
esac
+
+# Check whether --enable-experimental-targets was given.
+if test "${enable_experimental_targets+set}" = set; then
+ enableval=$enable_experimental_targets;
+else
+ enableval=disable
+fi
+
+
+if test ${enableval} != "disable"
+then
+ TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
+fi
+
TARGETS_TO_BUILD=$TARGETS_TO_BUILD
@@ -7194,86 +7275,6 @@ echo "${ECHO_T}no" >&6; }
fi
-# Extract the first word of "pod2html", so it can be a program name with args.
-set dummy pod2html; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_POD2HTML+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
- case $POD2HTML in
- [\\/]* | ?:[\\/]*)
- ac_cv_path_POD2HTML="$POD2HTML" # Let the user override the test with a path.
- ;;
- *)
- as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
- IFS=$as_save_IFS
- test -z "$as_dir" && as_dir=.
- for ac_exec_ext in '' $ac_executable_extensions; do
- if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
- ac_cv_path_POD2HTML="$as_dir/$ac_word$ac_exec_ext"
- echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
- break 2
- fi
-done
-done
-IFS=$as_save_IFS
-
- ;;
-esac
-fi
-POD2HTML=$ac_cv_path_POD2HTML
-if test -n "$POD2HTML"; then
- { echo "$as_me:$LINENO: result: $POD2HTML" >&5
-echo "${ECHO_T}$POD2HTML" >&6; }
-else
- { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-# Extract the first word of "pod2man", so it can be a program name with args.
-set dummy pod2man; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_POD2MAN+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
- case $POD2MAN in
- [\\/]* | ?:[\\/]*)
- ac_cv_path_POD2MAN="$POD2MAN" # Let the user override the test with a path.
- ;;
- *)
- as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
- IFS=$as_save_IFS
- test -z "$as_dir" && as_dir=.
- for ac_exec_ext in '' $ac_executable_extensions; do
- if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
- ac_cv_path_POD2MAN="$as_dir/$ac_word$ac_exec_ext"
- echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
- break 2
- fi
-done
-done
-IFS=$as_save_IFS
-
- ;;
-esac
-fi
-POD2MAN=$ac_cv_path_POD2MAN
-if test -n "$POD2MAN"; then
- { echo "$as_me:$LINENO: result: $POD2MAN" >&5
-echo "${ECHO_T}$POD2MAN" >&6; }
-else
- { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
# Extract the first word of "pdfroff", so it can be a program name with args.
set dummy pdfroff; ac_word=$2
{ echo "$as_me:$LINENO: checking for $ac_word" >&5
@@ -7314,136 +7315,6 @@ echo "${ECHO_T}no" >&6; }
fi
-# Extract the first word of "runtest", so it can be a program name with args.
-set dummy runtest; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_RUNTEST+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
- case $RUNTEST in
- [\\/]* | ?:[\\/]*)
- ac_cv_path_RUNTEST="$RUNTEST" # Let the user override the test with a path.
- ;;
- *)
- as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
- IFS=$as_save_IFS
- test -z "$as_dir" && as_dir=.
- for ac_exec_ext in '' $ac_executable_extensions; do
- if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
- ac_cv_path_RUNTEST="$as_dir/$ac_word$ac_exec_ext"
- echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
- break 2
- fi
-done
-done
-IFS=$as_save_IFS
-
- ;;
-esac
-fi
-RUNTEST=$ac_cv_path_RUNTEST
-if test -n "$RUNTEST"; then
- { echo "$as_me:$LINENO: result: $RUNTEST" >&5
-echo "${ECHO_T}$RUNTEST" >&6; }
-else
- { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-
-no_itcl=true
-{ echo "$as_me:$LINENO: checking for the tclsh program in tclinclude directory" >&5
-echo $ECHO_N "checking for the tclsh program in tclinclude directory... $ECHO_C" >&6; }
-
-# Check whether --with-tclinclude was given.
-if test "${with_tclinclude+set}" = set; then
- withval=$with_tclinclude; with_tclinclude=${withval}
-else
- with_tclinclude=''
-fi
-
-if test "${ac_cv_path_tclsh+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-
-if test x"${with_tclinclude}" != x ; then
- if test -f ${with_tclinclude}/tclsh ; then
- ac_cv_path_tclsh=`(cd ${with_tclinclude}; pwd)`
- elif test -f ${with_tclinclude}/src/tclsh ; then
- ac_cv_path_tclsh=`(cd ${with_tclinclude}/src; pwd)`
- else
- { { echo "$as_me:$LINENO: error: ${with_tclinclude} directory doesn't contain tclsh" >&5
-echo "$as_me: error: ${with_tclinclude} directory doesn't contain tclsh" >&2;}
- { (exit 1); exit 1; }; }
- fi
-fi
-fi
-
-
-if test x"${ac_cv_path_tclsh}" = x ; then
- { echo "$as_me:$LINENO: result: none" >&5
-echo "${ECHO_T}none" >&6; }
- for ac_prog in tclsh8.4 tclsh8.4.8 tclsh8.4.7 tclsh8.4.6 tclsh8.4.5 tclsh8.4.4 tclsh8.4.3 tclsh8.4.2 tclsh8.4.1 tclsh8.4.0 tclsh8.3 tclsh8.3.5 tclsh8.3.4 tclsh8.3.3 tclsh8.3.2 tclsh8.3.1 tclsh8.3.0 tclsh
-do
- # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_TCLSH+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
- case $TCLSH in
- [\\/]* | ?:[\\/]*)
- ac_cv_path_TCLSH="$TCLSH" # Let the user override the test with a path.
- ;;
- *)
- as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
- IFS=$as_save_IFS
- test -z "$as_dir" && as_dir=.
- for ac_exec_ext in '' $ac_executable_extensions; do
- if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
- ac_cv_path_TCLSH="$as_dir/$ac_word$ac_exec_ext"
- echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
- break 2
- fi
-done
-done
-IFS=$as_save_IFS
-
- ;;
-esac
-fi
-TCLSH=$ac_cv_path_TCLSH
-if test -n "$TCLSH"; then
- { echo "$as_me:$LINENO: result: $TCLSH" >&5
-echo "${ECHO_T}$TCLSH" >&6; }
-else
- { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
- test -n "$TCLSH" && break
-done
-
- if test x"${TCLSH}" = x ; then
- ac_cv_path_tclsh='';
- else
- ac_cv_path_tclsh="${TCLSH}";
- fi
-else
- { echo "$as_me:$LINENO: result: ${ac_cv_path_tclsh}" >&5
-echo "${ECHO_T}${ac_cv_path_tclsh}" >&6; }
- TCLSH="${ac_cv_path_tclsh}"
-
-fi
-
# Extract the first word of "zip", so it can be a program name with args.
set dummy zip; ac_word=$2
{ echo "$as_me:$LINENO: checking for $ac_word" >&5
@@ -9093,7 +8964,7 @@ nto-qnx*)
shlibpath_overrides_runpath=yes
;;
-openbsd*)
+openbsd* | bitrig*)
version_type=sunos
sys_lib_dlsearch_path_spec="/usr/lib"
need_lib_prefix=no
@@ -10401,7 +10272,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<EOF
-#line 10404 "configure"
+#line 10275 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -13976,6 +13847,148 @@ cat >>confdefs.h <<_ACEOF
_ACEOF
+for ac_prog in xml2-config
+do
+ # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_XML2CONFIG+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ if test -n "$XML2CONFIG"; then
+ ac_cv_prog_XML2CONFIG="$XML2CONFIG" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_XML2CONFIG="$ac_prog"
+ echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+XML2CONFIG=$ac_cv_prog_XML2CONFIG
+if test -n "$XML2CONFIG"; then
+ { echo "$as_me:$LINENO: result: $XML2CONFIG" >&5
+echo "${ECHO_T}$XML2CONFIG" >&6; }
+else
+ { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+ test -n "$XML2CONFIG" && break
+done
+
+
+{ echo "$as_me:$LINENO: checking for libxml2 includes" >&5
+echo $ECHO_N "checking for libxml2 includes... $ECHO_C" >&6; }
+if test "x$XML2CONFIG" = "x"; then
+ { echo "$as_me:$LINENO: result: xml2-config not found" >&5
+echo "${ECHO_T}xml2-config not found" >&6; }
+else
+ LIBXML2_INC=`$XML2CONFIG --cflags`
+ { echo "$as_me:$LINENO: result: $LIBXML2_INC" >&5
+echo "${ECHO_T}$LIBXML2_INC" >&6; }
+ { echo "$as_me:$LINENO: checking for xmlReadFile in -lxml2" >&5
+echo $ECHO_N "checking for xmlReadFile in -lxml2... $ECHO_C" >&6; }
+if test "${ac_cv_lib_xml2_xmlReadFile+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-lxml2 $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char xmlReadFile ();
+int
+main ()
+{
+return xmlReadFile ();
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_link") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } &&
+ { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; } &&
+ { ac_try='test -s conftest$ac_exeext'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ ac_cv_lib_xml2_xmlReadFile=yes
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_cv_lib_xml2_xmlReadFile=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_lib_xml2_xmlReadFile" >&5
+echo "${ECHO_T}$ac_cv_lib_xml2_xmlReadFile" >&6; }
+if test $ac_cv_lib_xml2_xmlReadFile = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define CLANG_HAVE_LIBXML 1
+_ACEOF
+
+ LIBXML2_LIBS="-lxml2"
+fi
+
+fi
+
+
+
@@ -17517,7 +17530,8 @@ done
-for ac_func in strerror strerror_r setenv
+
+for ac_func in strerror strerror_r setenv arc4random
do
as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh`
{ echo "$as_me:$LINENO: checking for $ac_func" >&5
@@ -21139,6 +21153,11 @@ _ACEOF
cat >>confdefs.h <<_ACEOF
+#define LLVM_HOSTTRIPLE "$host"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
#define LLVM_DEFAULT_TARGET_TRIPLE "$target"
_ACEOF
@@ -21785,7 +21804,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by LLVM $as_me 3.1, which was
+This file was extended by LLVM $as_me 3.2svn, which was
generated by GNU Autoconf 2.60. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -21838,7 +21857,7 @@ Report bugs to <bug-autoconf@gnu.org>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF
ac_cs_version="\\
-LLVM config.status 3.1
+LLVM config.status 3.2svn
configured by $0, generated by GNU Autoconf 2.60,
with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
@@ -22112,6 +22131,7 @@ NOLINKALL!$NOLINKALL$ac_delim
LLVM_ON_UNIX!$LLVM_ON_UNIX$ac_delim
LLVM_ON_WIN32!$LLVM_ON_WIN32$ac_delim
ARCH!$ARCH$ac_delim
+HOST_ARCH!$HOST_ARCH$ac_delim
ENDIAN!$ENDIAN$ac_delim
GREP!$GREP$ac_delim
EGREP!$EGREP$ac_delim
@@ -22121,9 +22141,11 @@ BUILD_EXEEXT!$BUILD_EXEEXT$ac_delim
BUILD_CXX!$BUILD_CXX$ac_delim
CVSBUILD!$CVSBUILD$ac_delim
ENABLE_LIBCPP!$ENABLE_LIBCPP$ac_delim
+ENABLE_CXX11!$ENABLE_CXX11$ac_delim
ENABLE_OPTIMIZED!$ENABLE_OPTIMIZED$ac_delim
ENABLE_PROFILING!$ENABLE_PROFILING$ac_delim
DISABLE_ASSERTIONS!$DISABLE_ASSERTIONS$ac_delim
+ENABLE_WERROR!$ENABLE_WERROR$ac_delim
ENABLE_EXPENSIVE_CHECKS!$ENABLE_EXPENSIVE_CHECKS$ac_delim
EXPENSIVE_CHECKS!$EXPENSIVE_CHECKS$ac_delim
DEBUG_RUNTIME!$DEBUG_RUNTIME$ac_delim
@@ -22134,9 +22156,6 @@ ENABLE_DOCS!$ENABLE_DOCS$ac_delim
ENABLE_DOXYGEN!$ENABLE_DOXYGEN$ac_delim
LLVM_ENABLE_THREADS!$LLVM_ENABLE_THREADS$ac_delim
ENABLE_PTHREADS!$ENABLE_PTHREADS$ac_delim
-ENABLE_PIC!$ENABLE_PIC$ac_delim
-ENABLE_SHARED!$ENABLE_SHARED$ac_delim
-ENABLE_EMBED_STDCXX!$ENABLE_EMBED_STDCXX$ac_delim
_ACEOF
if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
@@ -22178,6 +22197,9 @@ _ACEOF
ac_delim='%!_!# '
for ac_last_try in false false false false false :; do
cat >conf$$subs.sed <<_ACEOF
+ENABLE_PIC!$ENABLE_PIC$ac_delim
+ENABLE_SHARED!$ENABLE_SHARED$ac_delim
+ENABLE_EMBED_STDCXX!$ENABLE_EMBED_STDCXX$ac_delim
ENABLE_TIMESTAMPS!$ENABLE_TIMESTAMPS$ac_delim
TARGETS_TO_BUILD!$TARGETS_TO_BUILD$ac_delim
LLVM_ENUM_TARGETS!$LLVM_ENUM_TARGETS$ac_delim
@@ -22222,11 +22244,7 @@ CAT!$CAT$ac_delim
DOXYGEN!$DOXYGEN$ac_delim
GROFF!$GROFF$ac_delim
GZIPBIN!$GZIPBIN$ac_delim
-POD2HTML!$POD2HTML$ac_delim
-POD2MAN!$POD2MAN$ac_delim
PDFROFF!$PDFROFF$ac_delim
-RUNTEST!$RUNTEST$ac_delim
-TCLSH!$TCLSH$ac_delim
ZIP!$ZIP$ac_delim
OCAMLC!$OCAMLC$ac_delim
OCAMLOPT!$OCAMLOPT$ac_delim
@@ -22247,6 +22265,9 @@ USE_OPROFILE!$USE_OPROFILE$ac_delim
USE_INTEL_JITEVENTS!$USE_INTEL_JITEVENTS$ac_delim
INTEL_JITEVENTS_INCDIR!$INTEL_JITEVENTS_INCDIR$ac_delim
INTEL_JITEVENTS_LIBDIR!$INTEL_JITEVENTS_LIBDIR$ac_delim
+XML2CONFIG!$XML2CONFIG$ac_delim
+LIBXML2_LIBS!$LIBXML2_LIBS$ac_delim
+LIBXML2_INC!$LIBXML2_INC$ac_delim
HAVE_PTHREAD!$HAVE_PTHREAD$ac_delim
HUGE_VAL_SANITY!$HUGE_VAL_SANITY$ac_delim
MMAP_FILE!$MMAP_FILE$ac_delim
@@ -22272,7 +22293,7 @@ LIBOBJS!$LIBOBJS$ac_delim
LTLIBOBJS!$LTLIBOBJS$ac_delim
_ACEOF
- if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 92; then
+ if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 94; then
break
elif $ac_last_try; then
{ { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
diff --git a/docs/AliasAnalysis.html b/docs/AliasAnalysis.html
deleted file mode 100644
index c59f60d..0000000
--- a/docs/AliasAnalysis.html
+++ /dev/null
@@ -1,1067 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM Alias Analysis Infrastructure</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- LLVM Alias Analysis Infrastructure
-</h1>
-
-<ol>
- <li><a href="#introduction">Introduction</a></li>
-
- <li><a href="#overview"><tt>AliasAnalysis</tt> Class Overview</a>
- <ul>
- <li><a href="#pointers">Representation of Pointers</a></li>
- <li><a href="#alias">The <tt>alias</tt> method</a></li>
- <li><a href="#ModRefInfo">The <tt>getModRefInfo</tt> methods</a></li>
- <li><a href="#OtherItfs">Other useful <tt>AliasAnalysis</tt> methods</a></li>
- </ul>
- </li>
-
- <li><a href="#writingnew">Writing a new <tt>AliasAnalysis</tt> Implementation</a>
- <ul>
- <li><a href="#passsubclasses">Different Pass styles</a></li>
- <li><a href="#requiredcalls">Required initialization calls</a></li>
- <li><a href="#interfaces">Interfaces which may be specified</a></li>
- <li><a href="#chaining"><tt>AliasAnalysis</tt> chaining behavior</a></li>
- <li><a href="#updating">Updating analysis results for transformations</a></li>
- <li><a href="#implefficiency">Efficiency Issues</a></li>
- <li><a href="#limitations">Limitations</a></li>
- </ul>
- </li>
-
- <li><a href="#using">Using alias analysis results</a>
- <ul>
- <li><a href="#memdep">Using the <tt>MemoryDependenceAnalysis</tt> Pass</a></li>
- <li><a href="#ast">Using the <tt>AliasSetTracker</tt> class</a></li>
- <li><a href="#direct">Using the <tt>AliasAnalysis</tt> interface directly</a></li>
- </ul>
- </li>
-
- <li><a href="#exist">Existing alias analysis implementations and clients</a>
- <ul>
- <li><a href="#impls">Available <tt>AliasAnalysis</tt> implementations</a></li>
- <li><a href="#aliasanalysis-xforms">Alias analysis driven transformations</a></li>
- <li><a href="#aliasanalysis-debug">Clients for debugging and evaluation of
- implementations</a></li>
- </ul>
- </li>
- <li><a href="#memdep">Memory Dependence Analysis</a></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="introduction">Introduction</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Alias Analysis (aka Pointer Analysis) is a class of techniques which attempt
-to determine whether or not two pointers ever can point to the same object in
-memory. There are many different algorithms for alias analysis and many
-different ways of classifying them: flow-sensitive vs flow-insensitive,
-context-sensitive vs context-insensitive, field-sensitive vs field-insensitive,
-unification-based vs subset-based, etc. Traditionally, alias analyses respond
-to a query with a <a href="#MustMayNo">Must, May, or No</a> alias response,
-indicating that two pointers always point to the same object, might point to the
-same object, or are known to never point to the same object.</p>
-
-<p>The LLVM <a
-href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html"><tt>AliasAnalysis</tt></a>
-class is the primary interface used by clients and implementations of alias
-analyses in the LLVM system. This class is the common interface between clients
-of alias analysis information and the implementations providing it, and is
-designed to support a wide range of implementations and clients (but currently
-all clients are assumed to be flow-insensitive). In addition to simple alias
-analysis information, this class exposes Mod/Ref information from those
-implementations which can provide it, allowing for powerful analyses and
-transformations to work well together.</p>
-
-<p>This document contains information necessary to successfully implement this
-interface, use it, and to test both sides. It also explains some of the finer
-points about what exactly results mean. If you feel that something is unclear
-or should be added, please <a href="mailto:sabre@nondot.org">let me
-know</a>.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="overview"><tt>AliasAnalysis</tt> Class Overview</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The <a
-href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html"><tt>AliasAnalysis</tt></a>
-class defines the interface that the various alias analysis implementations
-should support. This class exports two important enums: <tt>AliasResult</tt>
-and <tt>ModRefResult</tt> which represent the result of an alias query or a
-mod/ref query, respectively.</p>
-
-<p>The <tt>AliasAnalysis</tt> interface exposes information about memory,
-represented in several different ways. In particular, memory objects are
-represented as a starting address and size, and function calls are represented
-as the actual <tt>call</tt> or <tt>invoke</tt> instructions that perform the
-call. The <tt>AliasAnalysis</tt> interface also exposes some helper methods
-which allow you to get mod/ref information for arbitrary instructions.</p>
-
-<p>All <tt>AliasAnalysis</tt> interfaces require that in queries involving
-multiple values, values which are not
-<a href="LangRef.html#constants">constants</a> are all defined within the
-same function.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="pointers">Representation of Pointers</a>
-</h3>
-
-<div>
-
-<p>Most importantly, the <tt>AliasAnalysis</tt> class provides several methods
-which are used to query whether or not two memory objects alias, whether
-function calls can modify or read a memory object, etc. For all of these
-queries, memory objects are represented as a pair of their starting address (a
-symbolic LLVM <tt>Value*</tt>) and a static size.</p>
-
-<p>Representing memory objects as a starting address and a size is critically
-important for correct Alias Analyses. For example, consider this (silly, but
-possible) C code:</p>
-
-<div class="doc_code">
-<pre>
-int i;
-char C[2];
-char A[10];
-/* ... */
-for (i = 0; i != 10; ++i) {
- C[0] = A[i]; /* One byte store */
- C[1] = A[9-i]; /* One byte store */
-}
-</pre>
-</div>
-
-<p>In this case, the <tt>basicaa</tt> pass will disambiguate the stores to
-<tt>C[0]</tt> and <tt>C[1]</tt> because they are accesses to two distinct
-locations one byte apart, and the accesses are each one byte. In this case, the
-LICM pass can use store motion to remove the stores from the loop. In
-contrast, the following code:</p>
-
-<div class="doc_code">
-<pre>
-int i;
-char C[2];
-char A[10];
-/* ... */
-for (i = 0; i != 10; ++i) {
- ((short*)C)[0] = A[i]; /* Two byte store! */
- C[1] = A[9-i]; /* One byte store */
-}
-</pre>
-</div>
-
-<p>In this case, the two stores to C do alias each other, because the access to
-the <tt>&amp;C[0]</tt> element is a two byte access. If size information wasn't
-available in the query, even the first case would have to conservatively assume
-that the accesses alias.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="alias">The <tt>alias</tt> method</a>
-</h3>
-
-<div>
-<p>The <tt>alias</tt> method is the primary interface used to determine whether
-or not two memory objects alias each other. It takes two memory objects as
-input and returns MustAlias, PartialAlias, MayAlias, or NoAlias as
-appropriate.</p>
-
-<p>Like all <tt>AliasAnalysis</tt> interfaces, the <tt>alias</tt> method requires
-that either the two pointer values be defined within the same function, or at
-least one of the values is a <a href="LangRef.html#constants">constant</a>.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="MustMayNo">Must, May, and No Alias Responses</a>
-</h4>
-
-<div>
-<p>The NoAlias response may be used when there is never an immediate dependence
-between any memory reference <i>based</i> on one pointer and any memory
-reference <i>based</i> on the other. The most obvious example is when the two
-pointers point to non-overlapping memory ranges. Another is when the two
-pointers are only ever used for reading memory. Another is when the memory is
-freed and reallocated between accesses through one pointer and accesses through
-the other -- in this case, there is a dependence, but it's mediated by the free
-and reallocation.</p>
-
-<p>An exception to this is the
-<a href="LangRef.html#noalias"><tt>noalias</tt></a> keyword, under which the
-"irrelevant" dependencies are ignored.</p>
-
-<p>The MayAlias response is used whenever the two pointers might refer to the
-same object.</p>
-
-<p>The PartialAlias response is used when the two memory objects are known
-to be overlapping in some way, but do not start at the same address.</p>
-
-<p>The MustAlias response may only be returned if the two memory objects are
-guaranteed to always start at exactly the same location. A MustAlias response
-implies that the pointers compare equal.</p>
-
-</div>
-
-</div>
-
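
A minimal caller's-eye sketch of these responses, assuming the era's interface in which memory objects are passed as a Value* plus a size in bytes (the helper name is illustrative):

    #include "llvm/Analysis/AliasAnalysis.h"
    using namespace llvm;

    // Two one-byte accesses may be freely reordered only on a NoAlias answer.
    static bool canReorderByteAccesses(AliasAnalysis &AA, Value *P1, Value *P2) {
      return AA.alias(P1, 1, P2, 1) == AliasAnalysis::NoAlias;
    }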
-<!-- ======================================================================= -->
-<h3>
- <a name="ModRefInfo">The <tt>getModRefInfo</tt> methods</a>
-</h3>
-
-<div>
-
-<p>The <tt>getModRefInfo</tt> methods return information about whether the
-execution of an instruction can read or modify a memory location. Mod/Ref
-information is always conservative: if an instruction <b>might</b> read or write
-a location, ModRef is returned.</p>
-
-<p>The <tt>AliasAnalysis</tt> class also provides a <tt>getModRefInfo</tt>
-method for testing dependencies between function calls. This method takes two
-call sites (CS1 &amp; CS2), returns NoModRef if neither call writes to memory
-read or written by the other, Ref if CS1 reads memory written by CS2, Mod if CS1
-writes to memory read or written by CS2, or ModRef if CS1 might read or write
-memory written to by CS2. Note that this relation is not commutative.</p>
-
-</div>
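
A sketch of a typical query, assuming the era's instruction/Location overload (the helper name is illustrative):

    #include "llvm/Analysis/AliasAnalysis.h"
    using namespace llvm;

    // Conservative test: might instruction I write to the location Loc?
    static bool mayWriteTo(AliasAnalysis &AA, const Instruction *I,
                           const AliasAnalysis::Location &Loc) {
      return AA.getModRefInfo(I, Loc) & AliasAnalysis::Mod;
    }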
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="OtherItfs">Other useful <tt>AliasAnalysis</tt> methods</a>
-</h3>
-
-<div>
-
-<p>
-Several other tidbits of information are often collected by various alias
-analysis implementations and can be put to good use by various clients.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- The <tt>pointsToConstantMemory</tt> method
-</h4>
-
-<div>
-
-<p>The <tt>pointsToConstantMemory</tt> method returns true if and only if the
-analysis can prove that the pointer only points to unchanging memory locations
-(functions, constant global variables, and the null pointer). This information
-can be used to refine mod/ref information: it is impossible for an unchanging
-memory location to be modified.</p>
-
-</div>
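
A sketch of that refinement, under the same API assumptions as above:

    // Constant memory can never be modified, so the Mod bit can be masked off.
    static AliasAnalysis::ModRefResult
    refineForConstMem(AliasAnalysis &AA, const Instruction *I,
                      const AliasAnalysis::Location &Loc) {
      AliasAnalysis::ModRefResult MR = AA.getModRefInfo(I, Loc);
      if (AA.pointsToConstantMemory(Loc))
        MR = AliasAnalysis::ModRefResult(MR & AliasAnalysis::Ref);
      return MR;
    }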
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="simplemodref">The <tt>doesNotAccessMemory</tt> and
- <tt>onlyReadsMemory</tt> methods</a>
-</h4>
-
-<div>
-
-<p>These methods are used to provide very simple mod/ref information for
-function calls. The <tt>doesNotAccessMemory</tt> method returns true for a
-function if the analysis can prove that the function never reads or writes to
-memory, or if the function only reads from constant memory. Functions with this
-property are side-effect free and only depend on their input arguments, allowing
-them to be eliminated if they form common subexpressions or be hoisted out of
-loops. Many common functions behave this way (e.g., <tt>sin</tt> and
-<tt>cos</tt>) but many others do not (e.g., <tt>acos</tt>, which modifies the
-<tt>errno</tt> variable).</p>
-
-<p>The <tt>onlyReadsMemory</tt> method returns true for a function if analysis
-can prove that (at most) the function only reads from non-volatile memory.
-Functions with this property are side-effect free, only depending on their input
-arguments and the state of memory when they are called. This property allows
-calls to these functions to be eliminated and moved around, as long as there is
-no store instruction that changes the contents of memory. Note that all
-functions that satisfy the <tt>doesNotAccessMemory</tt> method also satisfy
-<tt>onlyReadsMemory</tt>.</p>
-
-</div>
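
As a sketch of how a client might act on these predicates (hoisting policy reduced to a single flag; names are illustrative):

    // A call with no memory effects is loop-invariant by itself; a read-only
    // call is safe to hoist only if the loop contains no stores.
    static bool callIsHoistable(AliasAnalysis &AA, const Function *F,
                                bool LoopHasStores) {
      if (AA.doesNotAccessMemory(F))
        return true;
      if (AA.onlyReadsMemory(F))
        return !LoopHasStores;
      return false;
    }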
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="writingnew">Writing a new <tt>AliasAnalysis</tt> Implementation</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Writing a new alias analysis implementation for LLVM is quite
-straightforward. There are already several implementations that you can use
-as examples, and the following information should help fill in any details.
-For examples, take a look at the <a href="#impls">various alias analysis
-implementations</a> included with LLVM.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="passsubclasses">Different Pass styles</a>
-</h3>
-
-<div>
-
-<p>The first step is determining what type of <a
-href="WritingAnLLVMPass.html">LLVM pass</a> you need to use for your Alias
-Analysis. As is the case with most other analyses and transformations, the
-answer should be fairly obvious from what type of problem you are trying to
-solve:</p>
-
-<ol>
- <li>If you require interprocedural analysis, it should be a
- <tt>Pass</tt>.</li>
- <li>If you are a function-local analysis, subclass <tt>FunctionPass</tt>.</li>
- <li>If you don't need to look at the program at all, subclass
- <tt>ImmutablePass</tt>.</li>
-</ol>
-
-<p>In addition to the pass that you subclass, you should also inherit from the
-<tt>AliasAnalysis</tt> interface, of course, and use the
-<tt>RegisterAnalysisGroup</tt> template to register as an implementation of
-<tt>AliasAnalysis</tt>.</p>
-
-</div>
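
A skeleton of such a subclass, assuming the era's ImmutablePass style (the class name is illustrative; a real pass also overrides getAdjustedAnalysisPointer and registers with RegisterAnalysisGroup, omitted here):

    #include "llvm/Pass.h"
    #include "llvm/Analysis/AliasAnalysis.h"
    using namespace llvm;

    namespace {
      struct MyAA : public ImmutablePass, public AliasAnalysis {
        static char ID;
        MyAA() : ImmutablePass(ID) {}
        // Required initialization; see the next section.
        virtual void initializePass() { InitializeAliasAnalysis(this); }
        virtual void getAnalysisUsage(AnalysisUsage &AU) const {
          AliasAnalysis::getAnalysisUsage(AU);
        }
        // Override only the queries this analysis can improve.
      };
    }
    char MyAA::ID = 0;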
-
-<!-- ======================================================================= -->
-<h3>
- <a name="requiredcalls">Required initialization calls</a>
-</h3>
-
-<div>
-
-<p>Your subclass of <tt>AliasAnalysis</tt> is required to invoke two methods on
-the <tt>AliasAnalysis</tt> base class: <tt>getAnalysisUsage</tt> and
-<tt>InitializeAliasAnalysis</tt>. In particular, your implementation of
-<tt>getAnalysisUsage</tt> should explicitly call into the
-<tt>AliasAnalysis::getAnalysisUsage</tt> method in addition to declaring any
-pass dependencies your pass has. Thus you should have something
-like this:</p>
-
-<div class="doc_code">
-<pre>
-void getAnalysisUsage(AnalysisUsage &amp;AU) const {
- AliasAnalysis::getAnalysisUsage(AU);
- <i>// declare your dependencies here.</i>
-}
-</pre>
-</div>
-
-<p>Additionally, you must invoke the <tt>InitializeAliasAnalysis</tt> method
-from your analysis run method (<tt>run</tt> for a <tt>Pass</tt>,
-<tt>runOnFunction</tt> for a <tt>FunctionPass</tt>, or <tt>InitializePass</tt>
-for an <tt>ImmutablePass</tt>). For example (as part of a <tt>Pass</tt>):</p>
-
-<div class="doc_code">
-<pre>
-bool run(Module &amp;M) {
- InitializeAliasAnalysis(this);
- <i>// Perform analysis here...</i>
- return false;
-}
-</pre>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="interfaces">Interfaces which may be specified</a>
-</h3>
-
-<div>
-
-<p>All of the <a
-href="/doxygen/classllvm_1_1AliasAnalysis.html"><tt>AliasAnalysis</tt></a>
-virtual methods default to providing <a href="#chaining">chaining</a> to another
-alias analysis implementation, which ends up returning conservatively correct
-information (returning "May" Alias and "Mod/Ref" for alias and mod/ref queries
-respectively). Depending on the capabilities of the analysis you are
-implementing, you just override the interfaces you can improve.</p>
-
-</div>
-
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="chaining"><tt>AliasAnalysis</tt> chaining behavior</a>
-</h3>
-
-<div>
-
-<p>With only one special exception (the <a href="#no-aa"><tt>no-aa</tt></a>
-pass) every alias analysis pass chains to another alias analysis
-implementation (for example, the user can specify "<tt>-basicaa -ds-aa
--licm</tt>" to get the maximum benefit from both alias
-analyses). The alias analysis class automatically takes care of most of this
-for methods that you don't override. For methods that you do override, in code
-paths that return a conservative MayAlias or Mod/Ref result, simply return
-whatever the superclass computes. For example:</p>
-
-<div class="doc_code">
-<pre>
-AliasAnalysis::AliasResult alias(const Value *V1, unsigned V1Size,
- const Value *V2, unsigned V2Size) {
- if (...)
- return NoAlias;
- ...
-
- <i>// Couldn't determine a must or no-alias result.</i>
- return AliasAnalysis::alias(V1, V1Size, V2, V2Size);
-}
-</pre>
-</div>
-
-<p>In addition to analysis queries, you must make sure to unconditionally pass
-LLVM <a href="#updating">update notification</a> methods to the superclass as
-well if you override them, which allows all alias analyses in the chain to be
-updated.</p>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="updating">Updating analysis results for transformations</a>
-</h3>
-
-<div>
-<p>
-Alias analysis information is initially computed for a static snapshot of the
-program, but clients will use this information to make transformations to the
-code. All but the most trivial forms of alias analysis will need to have their
-analysis results updated to reflect the changes made by these transformations.
-</p>
-
-<p>
-The <tt>AliasAnalysis</tt> interface exposes four methods which are used to
-communicate program changes from the clients to the analysis implementations.
-Various alias analysis implementations should use these methods to ensure that
-their internal data structures are kept up-to-date as the program changes (for
-example, when an instruction is deleted), and clients of alias analysis must be
-sure to call these interfaces appropriately.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>The <tt>deleteValue</tt> method</h4>
-
-<div>
-The <tt>deleteValue</tt> method is called by transformations when they remove an
-instruction or any other value from the program (including values that do not
-use pointers). Typically alias analyses keep data structures that have entries
-for each value in the program. When this method is called, they should remove
-any entries for the specified value, if they exist.
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>The <tt>copyValue</tt> method</h4>
-
-<div>
-The <tt>copyValue</tt> method is used when a new value is introduced into the
-program. There is no way to introduce a value into the program that did not
-exist before (this doesn't make sense for a safe compiler transformation), so
-this is the only way to introduce a new value. This method indicates that the
-new value has exactly the same properties as the value being copied.
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>The <tt>replaceWithNewValue</tt> method</h4>
-
-<div>
-This method is a simple helper method that is provided to make clients easier
-to write. It is implemented by copying the old analysis information to the new
-value, then deleting the old value. This method cannot be overridden by alias
-analysis implementations.
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>The <tt>addEscapingUse</tt> method</h4>
-
-<div>
-<p>The <tt>addEscapingUse</tt> method is used when the uses of a pointer
-value have changed in ways that may invalidate precomputed analysis information.
-Implementations may either use this callback to provide conservative responses
-for pointers whose uses have changed since analysis time, or may recompute some
-or all of their internal state to continue providing accurate responses.</p>
-
-<p>In general, any new use of a pointer value is considered an escaping use,
-and must be reported through this callback, <em>except</em> for the
-uses below:</p>
-
-<ul>
- <li>A <tt>bitcast</tt> or <tt>getelementptr</tt> of the pointer</li>
- <li>A <tt>store</tt> through the pointer (but not a <tt>store</tt>
- <em>of</em> the pointer)</li>
- <li>A <tt>load</tt> through the pointer</li>
-</ul>
-</div>
-
-</div>
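
A sketch of a client keeping chained analyses consistent while rewriting code (era-appropriate hooks; the helper name is illustrative):

    #include "llvm/Instruction.h"
    #include "llvm/Analysis/AliasAnalysis.h"
    using namespace llvm;

    static void replaceAndErase(AliasAnalysis &AA, Instruction *Old, Value *New) {
      Old->replaceAllUsesWith(New);
      AA.replaceWithNewValue(Old, New); // copies info to New, then forgets Old
      Old->eraseFromParent();
    }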
-
-<!-- ======================================================================= -->
-<h3>
- <a name="implefficiency">Efficiency Issues</a>
-</h3>
-
-<div>
-
-<p>From the LLVM perspective, the only thing you need to do to provide an
-efficient alias analysis is to make sure that alias analysis <b>queries</b> are
-serviced quickly. The actual calculation of the alias analysis results (the
-"run" method) is only performed once, but many (perhaps duplicate) queries may
-be performed. Because of this, try to move as much computation to the run
-method as possible (within reason).</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="limitations">Limitations</a>
-</h3>
-
-<div>
-
-<p>The AliasAnalysis infrastructure has several limitations which make
-writing a new <tt>AliasAnalysis</tt> implementation difficult.</p>
-
-<p>There is no way to override the default alias analysis. It would
-be very useful to be able to do something like "opt -my-aa -O2" and
-have it use -my-aa for all passes which need AliasAnalysis, but there
-is currently no support for that, short of changing the source code
-and recompiling. Similarly, there is also no way of setting a chain
-of analyses as the default.</p>
-
-<p>There is no way for transform passes to declare that they preserve
-<tt>AliasAnalysis</tt> implementations. The <tt>AliasAnalysis</tt>
-interface includes <tt>deleteValue</tt> and <tt>copyValue</tt> methods
-which are intended to allow a pass to keep an AliasAnalysis consistent,
-however there's no way for a pass to declare in its
-<tt>getAnalysisUsage</tt> that it does so. Some passes attempt to use
-<tt>AU.addPreserved&lt;AliasAnalysis&gt;</tt>, however this doesn't
-actually have any effect.</p>
-
-<p><tt>AliasAnalysisCounter</tt> (<tt>-count-aa</tt>) and <tt>AliasDebugger</tt>
-(<tt>-debug-aa</tt>) are implemented as <tt>ModulePass</tt> classes, so if your
-alias analysis uses <tt>FunctionPass</tt>, it won't be able to use
-these utilities. If you try to use them, the pass manager will
-silently route alias analysis queries directly to
-<tt>BasicAliasAnalysis</tt> instead.</p>
-
-<p>Similarly, the <tt>opt -p</tt> option introduces <tt>ModulePass</tt>
-passes between each pass, which prevents the use of <tt>FunctionPass</tt>
-alias analysis passes.</p>
-
-<p>The <tt>AliasAnalysis</tt> API does have functions for notifying
-implementations when values are deleted or copied, however these
-aren't sufficient. There are many other ways that LLVM IR can be
-modified which could be relevant to <tt>AliasAnalysis</tt>
-implementations but which cannot be expressed through this API.</p>
-
-<p>The <tt>AliasAnalysisDebugger</tt> utility seems to suggest that
-<tt>AliasAnalysis</tt> implementations can expect that they will be
-informed of any relevant <tt>Value</tt> before it appears in an
-alias query. However, popular clients such as <tt>GVN</tt> don't
-support this, and are known to trigger errors when run with the
-<tt>AliasAnalysisDebugger</tt>.</p>
-
-<p>Due to several of the above limitations, the most obvious use for
-the <tt>AliasAnalysisCounter</tt> utility, collecting stats on all
-alias queries in a compilation, doesn't work, even if the
-<tt>AliasAnalysis</tt> implementations don't use <tt>FunctionPass</tt>.
-There's no way to set a default, much less a default sequence,
-and there's no way to preserve it.</p>
-
-<p>The <tt>AliasSetTracker</tt> class (which is used by <tt>LICM</tt>)
-makes a non-deterministic number of alias queries. This can cause stats
-collected by <tt>AliasAnalysisCounter</tt> to have fluctuations among
-identical runs, for example. Another consequence is that debugging
-techniques involving pausing execution after a predetermined number
-of queries can be unreliable.</p>
-
-<p>Many alias queries can be reformulated in terms of other alias
-queries. When multiple <tt>AliasAnalysis</tt> queries are chained together,
-it would make sense to start those queries from the beginning of the chain,
-with care taken to avoid infinite looping, however currently an
-implementation which wants to do this can only start such queries
-from itself.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="using">Using alias analysis results</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>There are several different ways to use alias analysis results. In order of
-preference, these are...</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="memdep">Using the <tt>MemoryDependenceAnalysis</tt> Pass</a>
-</h3>
-
-<div>
-
-<p>The <tt>memdep</tt> pass uses alias analysis to provide high-level dependence
-information about memory-using instructions. This will tell you which store
-feeds into a load, for example. It uses caching and other techniques to be
-efficient, and is used by Dead Store Elimination, GVN, and memcpy optimizations.
-</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="ast">Using the <tt>AliasSetTracker</tt> class</a>
-</h3>
-
-<div>
-
-<p>Many transformations need information about alias <b>sets</b> that are active
-in some scope, rather than information about pairwise aliasing. The <tt><a
-href="/doxygen/classllvm_1_1AliasSetTracker.html">AliasSetTracker</a></tt> class
-is used to efficiently build these Alias Sets from the pairwise alias analysis
-information provided by the <tt>AliasAnalysis</tt> interface.</p>
-
-<p>First you initialize the AliasSetTracker by using the "<tt>add</tt>" methods
-to add information about various potentially aliasing instructions in the scope
-you are interested in. Once all of the alias sets are completed, your pass
-should simply iterate through the constructed alias sets, using the
-<tt>AliasSetTracker</tt> <tt>begin()</tt>/<tt>end()</tt> methods.</p>
-
-<p>The <tt>AliasSet</tt>s formed by the <tt>AliasSetTracker</tt> are guaranteed
-to be disjoint, calculate mod/ref information and volatility for the set, and
-keep track of whether or not all of the pointers in the set are Must aliases.
-The AliasSetTracker also makes sure that sets are properly folded due to call
-instructions, and can provide a list of pointers in each set.</p>
-
-<p>As an example user of this, the <a href="/doxygen/structLICM.html">Loop
-Invariant Code Motion</a> pass uses <tt>AliasSetTracker</tt>s to calculate alias
-sets for each loop nest. If an <tt>AliasSet</tt> in a loop is not modified,
-then all load instructions from that set may be hoisted out of the loop. If any
-alias sets are stored to <b>and</b> are must alias sets, then the stores may be
-sunk to outside of the loop, promoting the memory location to a register for the
-duration of the loop nest. Both of these transformations only apply if the
-pointer argument is loop-invariant.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- The AliasSetTracker implementation
-</h4>
-
-<div>
-
-<p>The AliasSetTracker class is implemented to be as efficient as possible. It
-uses the union-find algorithm to efficiently merge AliasSets when a pointer is
-inserted into the AliasSetTracker that aliases multiple sets. The primary data
-structure is a hash table mapping pointers to the AliasSet they are in.</p>
-
-<p>The AliasSetTracker class must maintain a list of all of the LLVM Value*'s
-that are in each AliasSet. Since the hash table already has entries for each
-LLVM Value* of interest, the AliasSets thread the linked list through these
-hash-table nodes to avoid having to allocate memory unnecessarily, and to make
-merging alias sets extremely efficient (the linked list merge is constant time).
-</p>
-
-<p>You shouldn't need to understand these details if you are just a client of
-the AliasSetTracker, but if you look at the code, hopefully this brief
-description will help make sense of why things are designed the way they
-are.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="direct">Using the <tt>AliasAnalysis</tt> interface directly</a>
-</h3>
-
-<div>
-
-<p>If neither of these utility classes is what your pass needs, you should use
-the interfaces exposed by the <tt>AliasAnalysis</tt> class directly. Try to use
-the higher-level methods when possible (e.g., use mod/ref information instead of
-the <a href="#alias"><tt>alias</tt></a> method directly if possible) to get the
-best precision and efficiency.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="exist">Existing alias analysis implementations and clients</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>If you're going to be working with the LLVM alias analysis infrastructure,
-you should know what clients and implementations of alias analysis are
-available. In particular, if you are implementing an alias analysis, you should
-be aware of <a href="#aliasanalysis-debug">the clients</a> that are useful
-for monitoring and evaluating different implementations.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="impls">Available <tt>AliasAnalysis</tt> implementations</a>
-</h3>
-
-<div>
-
-<p>This section lists the various implementations of the <tt>AliasAnalysis</tt>
-interface. With the exception of the <a href="#no-aa"><tt>-no-aa</tt></a>
-implementation, all of these <a href="#chaining">chain</a> to other alias
-analysis implementations.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="no-aa">The <tt>-no-aa</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-no-aa</tt> pass is just like what it sounds: an alias analysis that
-never returns any useful information. This pass can be useful if you think that
-alias analysis is doing something wrong and are trying to narrow down a
-problem.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="basic-aa">The <tt>-basicaa</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-basicaa</tt> pass is an aggressive local analysis that "knows"
-many important facts:</p>
-
-<ul>
-<li>Distinct globals, stack allocations, and heap allocations can never
- alias.</li>
-<li>Globals, stack allocations, and heap allocations never alias the null
- pointer.</li>
-<li>Different fields of a structure do not alias.</li>
-<li>Indexes into arrays with statically differing subscripts cannot alias.</li>
-<li>Many common standard C library functions <a
- href="#simplemodref">never access memory or only read memory</a>.</li>
-<li>Pointers that obviously point to constant globals are identified via
-    "<tt>pointsToConstantMemory</tt>".</li>
-<li>Function calls cannot modify or reference stack allocations if they never
- escape from the function that allocates them (a common case for automatic
- arrays).</li>
-</ul>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="globalsmodref">The <tt>-globalsmodref-aa</tt> pass</a>
-</h4>
-
-<div>
-
-<p>This pass implements a simple context-sensitive mod/ref and alias analysis
-for internal global variables that don't "have their address taken". If a
-global does not have its address taken, the pass knows that no pointers alias
-the global. This pass also keeps track of functions that it knows never access
-memory or never read memory. This allows certain optimizations (e.g. GVN) to
-eliminate call instructions entirely.
-</p>
-
-<p>The real power of this pass is that it provides context-sensitive mod/ref
-information for call instructions. This allows the optimizer to know that
-calls to a function do not clobber or read the value of the global, allowing
-loads and stores to be eliminated.</p>
-
-<p>Note that this pass is somewhat limited in its scope (it only supports
-non-address-taken globals), but it is a very quick analysis.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="steens-aa">The <tt>-steens-aa</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-steens-aa</tt> pass implements a variation on the well-known
-"Steensgaard's algorithm" for interprocedural alias analysis. Steensgaard's
-algorithm is a unification-based, flow-insensitive, context-insensitive, and
-field-insensitive alias analysis that is also very scalable (effectively linear
-time).</p>
-
-<p>The LLVM <tt>-steens-aa</tt> pass implements a "speculatively
-field-<b>sensitive</b>" version of Steensgaard's algorithm using the Data
-Structure Analysis framework. This gives it substantially more precision than
-the standard algorithm while maintaining excellent analysis scalability.</p>
-
-<p>Note that <tt>-steens-aa</tt> is available in the optional "poolalloc"
-module; it is not part of the LLVM core.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ds-aa">The <tt>-ds-aa</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-ds-aa</tt> pass implements the full Data Structure Analysis
-algorithm. Data Structure Analysis is a modular unification-based,
-flow-insensitive, context-<b>sensitive</b>, and speculatively
-field-<b>sensitive</b> alias analysis that is also quite scalable, usually at
-O(n*log(n)).</p>
-
-<p>This algorithm is capable of responding to a full variety of alias analysis
-queries, and can provide context-sensitive mod/ref information as well. The
-only major facility not implemented so far is support for must-alias
-information.</p>
-
-<p>Note that <tt>-ds-aa</tt> is available in the optional "poolalloc"
-module; it is not part of the LLVM core.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="scev-aa">The <tt>-scev-aa</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-scev-aa</tt> pass implements AliasAnalysis queries by
-translating them into ScalarEvolution queries. This gives it a
-more complete understanding of <tt>getelementptr</tt> instructions
-and loop induction variables than other alias analyses have.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="aliasanalysis-xforms">Alias analysis driven transformations</a>
-</h3>
-
-<div>
-LLVM includes several alias-analysis driven transformations which can be used
-with any of the implementations above.
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="adce">The <tt>-adce</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-adce</tt> pass, which implements Aggressive Dead Code Elimination,
-uses the <tt>AliasAnalysis</tt> interface to delete calls to functions that do
-not have side-effects and are not used.</p>
-
-</div>
-
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="licm">The <tt>-licm</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-licm</tt> pass implements various Loop Invariant Code Motion related
-transformations. It uses the <tt>AliasAnalysis</tt> interface for several
-different transformations:</p>
-
-<ul>
-<li>It uses mod/ref information to hoist or sink load instructions out of loops
-if there are no instructions in the loop that modify the memory loaded.</li>
-
-<li>It uses mod/ref information to hoist function calls out of loops that do not
-write to memory and are loop-invariant.</li>
-
-<li>It uses alias information to promote memory objects that are loaded and
-stored to in loops to live in a register instead. It can do this if there are
-no may aliases to the loaded/stored memory location.</li>
-</ul>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="argpromotion">The <tt>-argpromotion</tt> pass</a>
-</h4>
-
-<div>
-<p>
-The <tt>-argpromotion</tt> pass promotes by-reference arguments to be passed in
-by-value instead. In particular, if a pointer argument is only loaded from, it
-passes in the loaded value instead of the pointer to the function. This pass
-uses alias information to make sure that the value loaded from the argument
-pointer is not modified between the entry of the function and any load of the
-pointer.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="gvn">The <tt>-gvn</tt>, <tt>-memcpyopt</tt>, and <tt>-dse</tt>
- passes</a>
-</h4>
-
-<div>
-
-<p>These passes use AliasAnalysis information to reason about loads and stores.
-</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="aliasanalysis-debug">Clients for debugging and evaluation of
- implementations</a>
-</h3>
-
-<div>
-
-<p>These passes are useful for evaluating the various alias analysis
-implementations. You can use them with commands like '<tt>opt -ds-aa
--aa-eval foo.bc -disable-output -stats</tt>'.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="print-alias-sets">The <tt>-print-alias-sets</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-print-alias-sets</tt> pass is exposed as part of the
-<tt>opt</tt> tool to print out the Alias Sets formed by the <a
-href="#ast"><tt>AliasSetTracker</tt></a> class. This is useful if you're using
-the <tt>AliasSetTracker</tt> class. To use it, use something like:</p>
-
-<div class="doc_code">
-<pre>
-% opt -ds-aa -print-alias-sets -disable-output
-</pre>
-</div>
-
-</div>
-
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="count-aa">The <tt>-count-aa</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-count-aa</tt> pass is useful to see how many queries a particular
-pass is making and what responses are returned by the alias analysis. As an
-example,</p>
-
-<div class="doc_code">
-<pre>
-% opt -basicaa -count-aa -ds-aa -count-aa -licm
-</pre>
-</div>
-
-<p>will print out how many queries the <tt>-licm</tt> pass makes of the
-<tt>-ds-aa</tt> pass (and what responses are returned), and how many queries
-the <tt>-ds-aa</tt> pass makes of the <tt>-basicaa</tt> pass. This can be useful
-when debugging a transformation or an alias analysis implementation.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="aa-eval">The <tt>-aa-eval</tt> pass</a>
-</h4>
-
-<div>
-
-<p>The <tt>-aa-eval</tt> pass simply iterates through all pairs of pointers in a
-function and asks an alias analysis whether or not the pointers alias. This
-gives an indication of the precision of the alias analysis. Statistics are
-printed indicating the percent of no/may/must aliases found (a more precise
-algorithm will have a lower number of may aliases).</p>
-
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="memdep">Memory Dependence Analysis</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>If you're just looking to be a client of alias analysis information, consider
-using the Memory Dependence Analysis interface instead. MemDep is a lazy,
-caching layer on top of alias analysis that is able to answer the question of
-what preceding memory operations a given instruction depends on, either at an
-intra- or inter-block level. Because of its laziness and caching
-policy, using MemDep can be a significant performance win over accessing alias
-analysis directly.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-01-31 00:05:41 +0100 (Tue, 31 Jan 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst
new file mode 100644
index 0000000..2d4f291
--- /dev/null
+++ b/docs/AliasAnalysis.rst
@@ -0,0 +1,702 @@
+.. _alias_analysis:
+
+==================================
+LLVM Alias Analysis Infrastructure
+==================================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+Alias Analysis (aka Pointer Analysis) is a class of techniques which attempt to
+determine whether or not two pointers ever can point to the same object in
+memory. There are many different algorithms for alias analysis and many
+different ways of classifying them: flow-sensitive vs. flow-insensitive,
+context-sensitive vs. context-insensitive, field-sensitive
+vs. field-insensitive, unification-based vs. subset-based, etc. Traditionally,
+alias analyses respond to a query with a `Must, May, or No`_ alias response,
+indicating that two pointers always point to the same object, might point to the
+same object, or are known to never point to the same object.
+
+The LLVM `AliasAnalysis
+<http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html>`__ class is the
+primary interface used by clients and implementations of alias analyses in the
+LLVM system. This class is the common interface between clients of alias
+analysis information and the implementations providing it, and is designed to
+support a wide range of implementations and clients (but currently all clients
+are assumed to be flow-insensitive). In addition to simple alias analysis
+information, this class exposes Mod/Ref information from those implementations
+which can provide it, allowing for powerful analyses and transformations to work
+well together.
+
+This document contains information necessary to successfully implement this
+interface, use it, and to test both sides. It also explains some of the finer
+points about what exactly results mean. If you feel that something is unclear
+or should be added, please `let me know <mailto:sabre@nondot.org>`_.
+
+``AliasAnalysis`` Class Overview
+================================
+
+The `AliasAnalysis <http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html>`__
+class defines the interface that the various alias analysis implementations
+should support. This class exports two important enums: ``AliasResult`` and
+``ModRefResult`` which represent the result of an alias query or a mod/ref
+query, respectively.
+
+The ``AliasAnalysis`` interface exposes information about memory, represented in
+several different ways. In particular, memory objects are represented as a
+starting address and size, and function calls are represented as the actual
+``call`` or ``invoke`` instructions that perform the call. The
+``AliasAnalysis`` interface also exposes some helper methods which allow you to
+get mod/ref information for arbitrary instructions.
+
+All ``AliasAnalysis`` interfaces require that in queries involving multiple
+values, values which are not `constants <LangRef.html#constants>`_ are all
+defined within the same function.
+
+Representation of Pointers
+--------------------------
+
+Most importantly, the ``AliasAnalysis`` class provides several methods which are
+used to query whether or not two memory objects alias, whether function calls
+can modify or read a memory object, etc. For all of these queries, memory
+objects are represented as a pair of their starting address (a symbolic LLVM
+``Value*``) and a static size.
+
+Representing memory objects as a starting address and a size is critically
+important for correct Alias Analyses. For example, consider this (silly, but
+possible) C code:
+
+.. code-block:: c++
+
+ int i;
+ char C[2];
+ char A[10];
+ /* ... */
+ for (i = 0; i != 10; ++i) {
+ C[0] = A[i]; /* One byte store */
+ C[1] = A[9-i]; /* One byte store */
+ }
+
+In this case, the ``basicaa`` pass will disambiguate the stores to ``C[0]`` and
+``C[1]`` because they are accesses to two distinct locations one byte apart, and
+the accesses are each one byte. In this case, the Loop Invariant Code Motion
+(LICM) pass can use store motion to remove the stores from the loop. In
+contrast, the following code:
+
+.. code-block:: c++
+
+ int i;
+ char C[2];
+ char A[10];
+ /* ... */
+ for (i = 0; i != 10; ++i) {
+ ((short*)C)[0] = A[i]; /* Two byte store! */
+ C[1] = A[9-i]; /* One byte store */
+ }
+
+In this case, the two stores to C do alias each other, because the access to the
+``&C[0]`` element is a two byte access. If size information wasn't available in
+the query, even the first case would have to conservatively assume that the
+accesses alias.
+
+.. _alias:
+
+The ``alias`` method
+--------------------
+
+The ``alias`` method is the primary interface used to determine whether or not
+two memory objects alias each other. It takes two memory objects as input and
+returns MustAlias, PartialAlias, MayAlias, or NoAlias as appropriate.
+
+Like all ``AliasAnalysis`` interfaces, the ``alias`` method requires that either
+the two pointer values be defined within the same function, or at least one of
+the values is a `constant <LangRef.html#constants>`_.
+
+.. _Must, May, or No:
+
+Must, May, and No Alias Responses
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``NoAlias`` response may be used when there is never an immediate dependence
+between any memory reference *based* on one pointer and any memory reference
+*based* on the other. The most obvious example is when the two pointers point to
+non-overlapping memory ranges. Another is when the two pointers are only ever
+used for reading memory. Another is when the memory is freed and reallocated
+between accesses through one pointer and accesses through the other --- in this
+case, there is a dependence, but it's mediated by the free and reallocation.
+
+An exception to this rule is the `noalias <LangRef.html#noalias>`_ keyword,
+for which such "irrelevant" dependencies are ignored.
+
+The ``MayAlias`` response is used whenever the two pointers might refer to the
+same object.
+
+The ``PartialAlias`` response is used when the two memory objects are known to
+be overlapping in some way, but do not start at the same address.
+
+The ``MustAlias`` response may only be returned if the two memory objects are
+guaranteed to always start at exactly the same location. A ``MustAlias``
+response implies that the pointers compare equal.
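+
+As a small illustration, a client holding an ``AliasAnalysis`` reference might
+wrap the ``alias`` method like this (a minimal sketch; the helper name and
+surrounding code are hypothetical, not part of the LLVM API):
+
+.. code-block:: c++
+
+  // True only when the analysis proves the two accesses are to entirely
+  // distinct memory, so (for example) the two stores could be reordered.
+  static bool provenDisjoint(AliasAnalysis &AA,
+                             const Value *P1, unsigned Size1,
+                             const Value *P2, unsigned Size2) {
+    return AA.alias(P1, Size1, P2, Size2) == AliasAnalysis::NoAlias;
+  }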
+
+The ``getModRefInfo`` methods
+-----------------------------
+
+The ``getModRefInfo`` methods return information about whether the execution of
+an instruction can read or modify a memory location. Mod/Ref information is
+always conservative: if an instruction **might** read or write a location,
+``ModRef`` is returned.
+
+The ``AliasAnalysis`` class also provides a ``getModRefInfo`` method for testing
+dependencies between function calls. This method takes two call sites (``CS1``
+& ``CS2``), returns ``NoModRef`` if neither call writes to memory read or
+written by the other, ``Ref`` if ``CS1`` reads memory written by ``CS2``,
+``Mod`` if ``CS1`` writes to memory read or written by ``CS2``, or ``ModRef`` if
+``CS1`` might read or write memory written to by ``CS2``. Note that this
+relation is not commutative.
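+
+For instance, a client could conservatively test whether a call might clobber
+the memory read by a load (a sketch only; ``callMayClobberLoad`` is a made-up
+helper, and ``getLocation`` is assumed to be available to build the memory
+location of the load):
+
+.. code-block:: c++
+
+  static bool callMayClobberLoad(AliasAnalysis &AA, ImmutableCallSite CS,
+                                 const LoadInst *L) {
+    // Mod/Ref answers are conservative, and ModRef already includes Mod.
+    return AA.getModRefInfo(CS, AA.getLocation(L)) & AliasAnalysis::Mod;
+  }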
+
+Other useful ``AliasAnalysis`` methods
+--------------------------------------
+
+Several other tidbits of information are often collected by various alias
+analysis implementations and can be put to good use by various clients.
+
+The ``pointsToConstantMemory`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``pointsToConstantMemory`` method returns true if and only if the analysis
+can prove that the pointer only points to unchanging memory locations
+(functions, constant global variables, and the null pointer). This information
+can be used to refine mod/ref information: it is impossible for an unchanging
+memory location to be modified.
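+
+A sketch of that refinement (``AA``, ``Loc``, and ``Result`` are assumed to be
+in scope; ``Loc`` names the queried location and ``Result`` a mod/ref answer):
+
+.. code-block:: c++
+
+  // Constant memory can never be written, so at worst it is read.
+  if (AA.pointsToConstantMemory(Loc))
+    Result = AliasAnalysis::ModRefResult(Result & ~AliasAnalysis::Mod);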
+
+.. _never access memory or only read memory:
+
+The ``doesNotAccessMemory`` and ``onlyReadsMemory`` methods
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These methods are used to provide very simple mod/ref information for function
+calls. The ``doesNotAccessMemory`` method returns true for a function if the
+analysis can prove that the function never reads or writes to memory, or if the
+function only reads from constant memory. Functions with this property are
+side-effect free and only depend on their input arguments, allowing them to be
+eliminated if they form common subexpressions or be hoisted out of loops. Many
+common functions behave this way (e.g., ``sin`` and ``cos``) but many others do
+not (e.g., ``acos``, which modifies the ``errno`` variable).
+
+The ``onlyReadsMemory`` method returns true for a function if analysis can prove
+that (at most) the function only reads from non-volatile memory. Functions with
+this property are side-effect free, only depending on their input arguments and
+the state of memory when they are called. This property allows calls to these
+functions to be eliminated and moved around, as long as there is no store
+instruction that changes the contents of memory. Note that all functions that
+satisfy the ``doesNotAccessMemory`` method also satisfy ``onlyReadsMemory``.
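+
+A small sketch of how a client might branch on these properties (``AA`` and a
+``const Function *F`` are assumed to be in scope):
+
+.. code-block:: c++
+
+  if (AA.doesNotAccessMemory(F)) {
+    // Side-effect free: calls to F can be CSE'd or hoisted freely.
+  } else if (AA.onlyReadsMemory(F)) {
+    // Calls to F may be moved or eliminated as long as no intervening
+    // store can change the memory F reads.
+  }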
+
+Writing a new ``AliasAnalysis`` Implementation
+==============================================
+
+Writing a new alias analysis implementation for LLVM is quite straightforward.
+There are already several implementations that you can use for examples, and the
+following information should help fill in any details. For examples, take a
+look at the `various alias analysis implementations`_ included with LLVM.
+
+Different Pass styles
+---------------------
+
+The first step is to determine what type of `LLVM pass
+<WritingAnLLVMPass.html>`_ you need to use for your Alias Analysis. As is the
+case with most other
+analyses and transformations, the answer should be fairly obvious from what type
+of problem you are trying to solve:
+
+#. If you require interprocedural analysis, it should be a ``Pass``.
+#. If you are a function-local analysis, subclass ``FunctionPass``.
+#. If you don't need to look at the program at all, subclass ``ImmutablePass``.
+
+In addition to the pass that you subclass, you should also inherit from the
+``AliasAnalysis`` interface, of course, and use the ``RegisterAnalysisGroup``
+template to register as an implementation of ``AliasAnalysis``.
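+
+For concreteness, a skeleton of an ``ImmutablePass``-style implementation
+might look like the following (the pass name and registration strings are
+hypothetical; see ``lib/Analysis/BasicAliasAnalysis.cpp`` for a complete,
+real example):
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+  #include "llvm/Analysis/AliasAnalysis.h"
+  using namespace llvm;
+
+  namespace {
+    struct MyAA : public ImmutablePass, public AliasAnalysis {
+      static char ID;
+      MyAA() : ImmutablePass(ID) {}
+
+      // Hand out the AliasAnalysis half of this object when the pass
+      // manager resolves a request for the analysis group.
+      virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+        if (PI == &AliasAnalysis::ID)
+          return (AliasAnalysis*)this;
+        return this;
+      }
+    };
+  }
+
+  char MyAA::ID = 0;
+  // Register MyAA as a member of the AliasAnalysis analysis group.
+  INITIALIZE_AG_PASS(MyAA, AliasAnalysis, "my-aa",
+                     "Hypothetical example alias analysis",
+                     false, true, false)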
+
+Required initialization calls
+-----------------------------
+
+Your subclass of ``AliasAnalysis`` is required to invoke two methods on the
+``AliasAnalysis`` base class: ``getAnalysisUsage`` and
+``InitializeAliasAnalysis``. In particular, your implementation of
+``getAnalysisUsage`` should explicitly call into the
+``AliasAnalysis::getAnalysisUsage`` method in addition to declaring any pass
+dependencies your pass has. Thus you should have something like this:
+
+.. code-block:: c++
+
+   void getAnalysisUsage(AnalysisUsage &AU) const {
+ AliasAnalysis::getAnalysisUsage(AU);
+ // declare your dependencies here.
+ }
+
+Additionally, you must invoke the ``InitializeAliasAnalysis`` method from your
+analysis run method (``run`` for a ``Pass``, ``runOnFunction`` for a
+``FunctionPass``, or ``InitializePass`` for an ``ImmutablePass``). For example
+(as part of a ``Pass``):
+
+.. code-block:: c++
+
+ bool run(Module &M) {
+ InitializeAliasAnalysis(this);
+ // Perform analysis here...
+ return false;
+ }
+
+Interfaces which may be specified
+---------------------------------
+
+All of the `AliasAnalysis
+<http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html>`__ virtual methods
+default to providing `chaining`_ to another alias analysis implementation, which
+ends up returning conservatively correct information (returning "May" Alias and
+"Mod/Ref" for alias and mod/ref queries respectively). Depending on the
+capabilities of the analysis you are implementing, you just override the
+interfaces you can improve.
+
+.. _chaining:
+.. _chain:
+
+``AliasAnalysis`` chaining behavior
+-----------------------------------
+
+With only one special exception (the `no-aa`_ pass) every alias analysis pass
+chains to another alias analysis implementation (for example, the user can
+specify "``-basicaa -ds-aa -licm``" to get the maximum benefit from both alias
+analyses). The alias analysis class automatically takes care of most of this
+for methods that you don't override. For methods that you do override, in code
+paths that return a conservative MayAlias or Mod/Ref result, simply return
+whatever the superclass computes. For example:
+
+.. code-block:: c++
+
+ AliasAnalysis::AliasResult alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ if (...)
+ return NoAlias;
+ ...
+
+ // Couldn't determine a must or no-alias result.
+ return AliasAnalysis::alias(V1, V1Size, V2, V2Size);
+ }
+
+In addition to analysis queries, if you override any of the LLVM
+`update notification`_ methods, you must unconditionally forward them to the
+superclass as well, which allows all alias analyses in a chain to be updated.
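+
+For example, an overridden update method should look something like this (a
+sketch of the forwarding pattern only):
+
+.. code-block:: c++
+
+  virtual void deleteValue(Value *V) {
+    // ... drop V from this analysis's own data structures ...
+    AliasAnalysis::deleteValue(V);  // unconditionally chain
+  }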
+
+.. _update notification:
+
+Updating analysis results for transformations
+---------------------------------------------
+
+Alias analysis information is initially computed for a static snapshot of the
+program, but clients will use this information to make transformations to the
+code. All but the most trivial forms of alias analysis will need to have their
+analysis results updated to reflect the changes made by these transformations.
+
+The ``AliasAnalysis`` interface exposes four methods which are used to
+communicate program changes from the clients to the analysis implementations.
+Various alias analysis implementations should use these methods to ensure that
+their internal data structures are kept up-to-date as the program changes (for
+example, when an instruction is deleted), and clients of alias analysis must be
+sure to call these interfaces appropriately.
+
+The ``deleteValue`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``deleteValue`` method is called by transformations when they remove an
+instruction or any other value from the program (including values that do not
+use pointers). Typically alias analyses keep data structures that have entries
+for each value in the program. When this method is called, they should remove
+any entries for the specified value, if they exist.
+
+The ``copyValue`` method
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``copyValue`` method is used when a new value is introduced into the
+program. There is no way to introduce a brand-new value into the program
+(this wouldn't make sense for a safe compiler transformation), so copying is
+the only way to introduce a new value. This method indicates that the
+new value has exactly the same properties as the value being copied.
+
+The ``replaceWithNewValue`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This method is a simple helper that is provided to make clients easier to
+write. It is implemented by copying the old analysis information to the new
+value, then deleting the old value. This method cannot be overridden by alias
+analysis implementations.
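+
+A sketch of a client using this helper while rewriting an instruction (``AA``,
+``Old``, and ``New`` are assumed to be in scope):
+
+.. code-block:: c++
+
+  Old->replaceAllUsesWith(New);
+  AA.replaceWithNewValue(Old, New);  // copy analysis info, then forget Old
+  Old->eraseFromParent();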
+
+The ``addEscapingUse`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``addEscapingUse`` method is used when the uses of a pointer value have
+changed in ways that may invalidate precomputed analysis information.
+Implementations may either use this callback to provide conservative responses
+for pointers whose uses have changed since analysis time, or may recompute some or
+all of their internal state to continue providing accurate responses.
+
+In general, any new use of a pointer value is considered an escaping use, and
+must be reported through this callback, *except* for the uses below:
+
+* A ``bitcast`` or ``getelementptr`` of the pointer
+* A ``store`` through the pointer (but not a ``store`` *of* the pointer)
+* A ``load`` through the pointer
+
+Efficiency Issues
+-----------------
+
+From the LLVM perspective, the only thing you need to do to provide an efficient
+alias analysis is to make sure that alias analysis **queries** are serviced
+quickly. The actual calculation of the alias analysis results (the "run"
+method) is only performed once, but many (perhaps duplicate) queries may be
+performed. Because of this, try to move as much computation to the run method
+as possible (within reason).
+
+Limitations
+-----------
+
+The AliasAnalysis infrastructure has several limitations which make writing a
+new ``AliasAnalysis`` implementation difficult.
+
+There is no way to override the default alias analysis. It would be very useful
+to be able to do something like "``opt -my-aa -O2``" and have it use ``-my-aa``
+for all passes which need AliasAnalysis, but there is currently no support for
+that, short of changing the source code and recompiling. Similarly, there is
+also no way of setting a chain of analyses as the default.
+
+There is no way for transform passes to declare that they preserve
+``AliasAnalysis`` implementations. The ``AliasAnalysis`` interface includes
+``deleteValue`` and ``copyValue`` methods which are intended to allow a pass to
+keep an AliasAnalysis consistent, however there's no way for a pass to declare
+in its ``getAnalysisUsage`` that it does so. Some passes attempt to use
+``AU.addPreserved<AliasAnalysis>``, however this doesn't actually have any
+effect.
+
+``AliasAnalysisCounter`` (``-count-aa``) and ``AliasDebugger`` (``-debug-aa``)
+are implemented as ``ModulePass`` classes, so if your alias analysis uses
+``FunctionPass``, it won't be able to use these utilities. If you try to use
+them, the pass manager will silently route alias analysis queries directly to
+``BasicAliasAnalysis`` instead.
+
+Similarly, the ``opt -p`` option introduces ``ModulePass`` passes between each
+pass, which prevents the use of ``FunctionPass`` alias analysis passes.
+
+The ``AliasAnalysis`` API does have functions for notifying implementations when
+values are deleted or copied, however these aren't sufficient. There are many
+other ways that LLVM IR can be modified which could be relevant to
+``AliasAnalysis`` implementations which can not be expressed.
+
+The ``AliasAnalysisDebugger`` utility seems to suggest that ``AliasAnalysis``
+implementations can expect that they will be informed of any relevant ``Value``
+before it appears in an alias query. However, popular clients such as ``GVN``
+don't support this, and are known to trigger errors when run with the
+``AliasAnalysisDebugger``.
+
+Due to several of the above limitations, the most obvious use for the
+``AliasAnalysisCounter`` utility, collecting stats on all alias queries in a
+compilation, doesn't work, even if the ``AliasAnalysis`` implementations don't
+use ``FunctionPass``. There's no way to set a default, much less a default
+sequence, and there's no way to preserve it.
+
+The ``AliasSetTracker`` class (which is used by ``LICM``) makes a
+non-deterministic number of alias queries. This can cause stats collected by
+``AliasAnalysisCounter`` to have fluctuations among identical runs, for
+example. Another consequence is that debugging techniques involving pausing
+execution after a predetermined number of queries can be unreliable.
+
+Many alias queries can be reformulated in terms of other alias queries. When
+multiple ``AliasAnalysis`` queries are chained together, it would make sense to
+start those queries from the beginning of the chain, with care taken to avoid
+infinite looping, however currently an implementation which wants to do this can
+only start such queries from itself.
+
+Using alias analysis results
+============================
+
+There are several different ways to use alias analysis results. In order of
+preference, these are:
+
+Using the ``MemoryDependenceAnalysis`` Pass
+-------------------------------------------
+
+The ``memdep`` pass uses alias analysis to provide high-level dependence
+information about memory-using instructions. This will tell you which store
+feeds into a load, for example. It uses caching and other techniques to be
+efficient, and is used by Dead Store Elimination, GVN, and memcpy optimizations.
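+
+A minimal sketch of querying MemDep from a ``FunctionPass`` for the local
+dependence of a load (``L`` is a ``LoadInst*`` assumed to be in scope, and the
+pass is assumed to declare ``AU.addRequired<MemoryDependenceAnalysis>()``):
+
+.. code-block:: c++
+
+  MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>();
+  MemDepResult Dep = MDA.getDependency(L);  // intra-block query
+  if (Dep.isDef())
+    if (StoreInst *SI = dyn_cast<StoreInst>(Dep.getInst())) {
+      // SI is the store that feeds the load L.
+    }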
+
+.. _AliasSetTracker:
+
+Using the ``AliasSetTracker`` class
+-----------------------------------
+
+Many transformations need information about alias **sets** that are active in
+some scope, rather than information about pairwise aliasing. The
+`AliasSetTracker <http://llvm.org/doxygen/classllvm_1_1AliasSetTracker.html>`__
+class is used to efficiently build these Alias Sets from the pairwise alias
+analysis information provided by the ``AliasAnalysis`` interface.
+
+First you initialize the AliasSetTracker by using the "``add``" methods to add
+information about various potentially aliasing instructions in the scope you are
+interested in. Once all of the alias sets are completed, your pass should
+simply iterate through the constructed alias sets, using the ``AliasSetTracker``
+``begin()``/``end()`` methods.
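+
+A sketch of this usage pattern (``AA`` and a ``BasicBlock &BB`` are assumed to
+be in scope):
+
+.. code-block:: c++
+
+  AliasSetTracker Tracker(AA);
+  Tracker.add(BB);  // add every instruction in the block
+  for (AliasSetTracker::iterator I = Tracker.begin(), E = Tracker.end();
+       I != E; ++I) {
+    AliasSet &AS = *I;
+    if (AS.isMod() && AS.isMustAlias()) {
+      // For example, a candidate set for store promotion.
+    }
+  }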
+
+The ``AliasSet``\s formed by the ``AliasSetTracker`` are guaranteed to be
+disjoint, calculate mod/ref information and volatility for the set, and keep
+track of whether or not all of the pointers in the set are Must aliases. The
+AliasSetTracker also makes sure that sets are properly folded due to call
+instructions, and can provide a list of pointers in each set.
+
+As an example user of this, the `Loop Invariant Code Motion
+<doxygen/structLICM.html>`_ pass uses ``AliasSetTracker``\s to calculate alias
+sets for each loop nest. If an ``AliasSet`` in a loop is not modified, then all
+load instructions from that set may be hoisted out of the loop. If any alias
+sets are stored to **and** are must alias sets, then the stores may be sunk
+to outside of the loop, promoting the memory location to a register for the
+duration of the loop nest. Both of these transformations only apply if the
+pointer argument is loop-invariant.
+
+The AliasSetTracker implementation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The AliasSetTracker class is implemented to be as efficient as possible. It
+uses the union-find algorithm to efficiently merge AliasSets when a pointer is
+inserted into the AliasSetTracker that aliases multiple sets. The primary data
+structure is a hash table mapping pointers to the AliasSet they are in.
+
+The AliasSetTracker class must maintain a list of all of the LLVM ``Value*``\s
+that are in each AliasSet. Since the hash table already has entries for each
+LLVM ``Value*`` of interest, the AliasSets thread the linked list through
+these hash-table nodes to avoid having to allocate memory unnecessarily, and to
+make merging alias sets extremely efficient (the linked list merge is constant
+time).
+
+You shouldn't need to understand these details if you are just a client of the
+AliasSetTracker, but if you look at the code, hopefully this brief description
+will help make sense of why things are designed the way they are.
+
+Using the ``AliasAnalysis`` interface directly
+----------------------------------------------
+
+If neither of these utility classes is what your pass needs, you should use the
+interfaces exposed by the ``AliasAnalysis`` class directly. Try to use the
+higher-level methods when possible (e.g., use mod/ref information instead of the
+`alias`_ method directly if possible) to get the best precision and efficiency.
+
+Existing alias analysis implementations and clients
+===================================================
+
+If you're going to be working with the LLVM alias analysis infrastructure, you
+should know what clients and implementations of alias analysis are available.
+In particular, if you are implementing an alias analysis, you should be aware of
+`the clients`_ that are useful for monitoring and evaluating different
+implementations.
+
+.. _various alias analysis implementations:
+
+Available ``AliasAnalysis`` implementations
+-------------------------------------------
+
+This section lists the various implementations of the ``AliasAnalysis``
+interface. With the exception of the `-no-aa`_ implementation, all of these
+`chain`_ to other alias analysis implementations.
+
+.. _no-aa:
+.. _-no-aa:
+
+The ``-no-aa`` pass
+^^^^^^^^^^^^^^^^^^^
+
+The ``-no-aa`` pass is just like what it sounds: an alias analysis that never
+returns any useful information. This pass can be useful if you think that alias
+analysis is doing something wrong and are trying to narrow down a problem.
+
+The ``-basicaa`` pass
+^^^^^^^^^^^^^^^^^^^^^
+
+The ``-basicaa`` pass is an aggressive local analysis that *knows* many
+important facts:
+
+* Distinct globals, stack allocations, and heap allocations can never alias.
+* Globals, stack allocations, and heap allocations never alias the null pointer.
+* Different fields of a structure do not alias.
+* Indexes into arrays with statically differing subscripts cannot alias.
+* Many common standard C library functions `never access memory or only read
+ memory`_.
+* Pointers that obviously point to constant globals are identified via
+  ``pointsToConstantMemory``.
+* Function calls cannot modify or reference stack allocations if they never
+ escape from the function that allocates them (a common case for automatic
+ arrays).
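+
+For instance (an illustrative C fragment, not taken from the LLVM sources),
+``basicaa`` can prove that these two stores never alias because they target
+distinct fields of the same structure:
+
+.. code-block:: c++
+
+  struct S { int a; int b; };
+  void f(struct S *s) {
+    s->a = 1;  // NoAlias with the store below: different fields
+    s->b = 2;
+  }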
+
+The ``-globalsmodref-aa`` pass
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This pass implements a simple context-sensitive mod/ref and alias analysis for
+internal global variables that don't "have their address taken". If a global
+does not have its address taken, the pass knows that no pointers alias the
+global. This pass also keeps track of functions that it knows never access
+memory or never read memory. This allows certain optimizations (e.g. GVN) to
+eliminate call instructions entirely.
+
+The real power of this pass is that it provides context-sensitive mod/ref
+information for call instructions. This allows the optimizer to know that calls
+to a function do not clobber or read the value of the global, allowing loads and
+stores to be eliminated.
+
+.. note::
+
+  This pass is somewhat limited in its scope (it only supports non-address-taken
+  globals), but it is a very quick analysis.
+
+The ``-steens-aa`` pass
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``-steens-aa`` pass implements a variation on the well-known "Steensgaard's
+algorithm" for interprocedural alias analysis. Steensgaard's algorithm is a
+unification-based, flow-insensitive, context-insensitive, and field-insensitive
+alias analysis that is also very scalable (effectively linear time).
+
+The LLVM ``-steens-aa`` pass implements a "speculatively field-**sensitive**"
+version of Steensgaard's algorithm using the Data Structure Analysis framework.
+This gives it substantially more precision than the standard algorithm while
+maintaining excellent analysis scalability.
+
+.. note::
+
+ ``-steens-aa`` is available in the optional "poolalloc" module. It is not part
+ of the LLVM core.
+
+The ``-ds-aa`` pass
+^^^^^^^^^^^^^^^^^^^
+
+The ``-ds-aa`` pass implements the full Data Structure Analysis algorithm. Data
+Structure Analysis is a modular unification-based, flow-insensitive,
+context-**sensitive**, and speculatively field-**sensitive** alias
+analysis that is also quite scalable, usually at ``O(n * log(n))``.
+
+This algorithm is capable of responding to a full variety of alias analysis
+queries, and can provide context-sensitive mod/ref information as well. The
+only major facility not implemented so far is support for must-alias
+information.
+
+.. note::
+
+ ``-ds-aa`` is available in the optional "poolalloc" module. It is not part of
+ the LLVM core.
+
+The ``-scev-aa`` pass
+^^^^^^^^^^^^^^^^^^^^^
+
+The ``-scev-aa`` pass implements AliasAnalysis queries by translating them into
+ScalarEvolution queries. This gives it a more complete understanding of
+``getelementptr`` instructions and loop induction variables than other alias
+analyses have.
+
+Alias analysis driven transformations
+-------------------------------------
+
+LLVM includes several alias-analysis driven transformations which can be used
+with any of the implementations above.
+
+The ``-adce`` pass
+^^^^^^^^^^^^^^^^^^
+
+The ``-adce`` pass, which implements Aggressive Dead Code Elimination, uses the
+``AliasAnalysis`` interface to delete calls to functions that do not have
+side-effects and are not used.
+
+The ``-licm`` pass
+^^^^^^^^^^^^^^^^^^
+
+The ``-licm`` pass implements various Loop Invariant Code Motion related
+transformations. It uses the ``AliasAnalysis`` interface for several different
+transformations:
+
+* It uses mod/ref information to hoist or sink load instructions out of loops if
+  there are no instructions in the loop that modify the memory loaded.
+
+* It uses mod/ref information to hoist function calls out of loops that do not
+ write to memory and are loop-invariant.
+
+* It uses alias information to promote memory objects that are loaded and stored
+ to in loops to live in a register instead. It can do this if there are no may
+ aliases to the loaded/stored memory location.
+
+The ``-argpromotion`` pass
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``-argpromotion`` pass promotes by-reference arguments to be passed in
+by-value instead. In particular, if a pointer argument is only loaded from, it
+passes in the loaded value instead of the pointer to the function. This pass
+uses alias information to make sure that the value loaded from the argument
+pointer is not modified between the entry of the function and any load of the
+pointer.
+
+The ``-gvn``, ``-memcpyopt``, and ``-dse`` passes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These passes use AliasAnalysis information to reason about loads and stores.
+
+.. _the clients:
+
+Clients for debugging and evaluation of implementations
+-------------------------------------------------------
+
+These passes are useful for evaluating the various alias analysis
+implementations. You can use them with commands like:
+
+.. code-block:: bash
+
+ % opt -ds-aa -aa-eval foo.bc -disable-output -stats
+
+The ``-print-alias-sets`` pass
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``-print-alias-sets`` pass is exposed as part of the ``opt`` tool to print
+out the Alias Sets formed by the `AliasSetTracker`_ class. This is useful if
+you're using the ``AliasSetTracker`` class. To use it, use something like:
+
+.. code-block:: bash
+
+ % opt -ds-aa -print-alias-sets -disable-output
+
+The ``-count-aa`` pass
+^^^^^^^^^^^^^^^^^^^^^^
+
+The ``-count-aa`` pass is useful to see how many queries a particular pass is
+making and what responses are returned by the alias analysis. As an example:
+
+.. code-block:: bash
+
+ % opt -basicaa -count-aa -ds-aa -count-aa -licm
+
+will print out how many queries the ``-licm`` pass makes of the ``-ds-aa``
+pass (and what responses are returned), and how many queries the ``-ds-aa``
+pass makes of the ``-basicaa`` pass. This can be useful when debugging a
+transformation or an alias analysis implementation.
+
+The ``-aa-eval`` pass
+^^^^^^^^^^^^^^^^^^^^^
+
+The ``-aa-eval`` pass simply iterates through all pairs of pointers in a
+function and asks an alias analysis whether or not the pointers alias. This
+gives an indication of the precision of the alias analysis. Statistics are
+printed indicating the percent of no/may/must aliases found (a more precise
+algorithm will have a lower number of may aliases).
+
+Memory Dependence Analysis
+==========================
+
+If you're just looking to be a client of alias analysis information, consider
+using the Memory Dependence Analysis interface instead. MemDep is a lazy,
+caching layer on top of alias analysis that is able to answer the question of
+what preceding memory operations a given instruction depends on, either at an
+intra- or inter-block level. Because of its laziness and caching policy, using
+MemDep can be a significant performance win over accessing alias analysis
+directly.
diff --git a/docs/Atomics.html b/docs/Atomics.html
deleted file mode 100644
index fc15e27..0000000
--- a/docs/Atomics.html
+++ /dev/null
@@ -1,569 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <title>LLVM Atomic Instructions and Concurrency Guide</title>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- LLVM Atomic Instructions and Concurrency Guide
-</h1>
-
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#outsideatomic">Optimization outside atomic</a></li>
- <li><a href="#atomicinst">Atomic instructions</a></li>
- <li><a href="#ordering">Atomic orderings</a></li>
- <li><a href="#iropt">Atomics and IR optimization</a></li>
- <li><a href="#codegen">Atomics and Codegen</a></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by Eli Friedman</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="introduction">Introduction</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Historically, LLVM has not had very strong support for concurrency; some
-minimal intrinsics were provided, and <code>volatile</code> was used in some
-cases to achieve rough semantics in the presence of concurrency. However, this
-is changing; there are now new instructions which are well-defined in the
-presence of threads and asynchronous signals, and the model for existing
-instructions has been clarified in the IR.</p>
-
-<p>The atomic instructions are designed specifically to provide readable IR and
- optimized code generation for the following:</p>
-<ul>
- <li>The new C++0x <code>&lt;atomic&gt;</code> header.
- (<a href="http://www.open-std.org/jtc1/sc22/wg21/">C++0x draft available here</a>.)
- (<a href="http://www.open-std.org/jtc1/sc22/wg14/">C1x draft available here</a>)</li>
- <li>Proper semantics for Java-style memory, for both <code>volatile</code> and
- regular shared variables.
- (<a href="http://java.sun.com/docs/books/jls/third_edition/html/memory.html">Java Specification</a>)</li>
- <li>gcc-compatible <code>__sync_*</code> builtins.
- (<a href="http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html">Description</a>)</li>
- <li>Other scenarios with atomic semantics, including <code>static</code>
- variables with non-trivial constructors in C++.</li>
-</ul>
-
-<p>Atomic and volatile in the IR are orthogonal; "volatile" is the C/C++
- volatile, which ensures that every volatile load and store happens and is
- performed in the stated order. A couple examples: if a
- SequentiallyConsistent store is immediately followed by another
- SequentiallyConsistent store to the same address, the first store can
- be erased. This transformation is not allowed for a pair of volatile
- stores. On the other hand, a non-volatile non-atomic load can be moved
- across a volatile load freely, but not an Acquire load.</p>
-
-<p>This document is intended to provide a guide to anyone either writing a
- frontend for LLVM or working on optimization passes for LLVM with a guide
- for how to deal with instructions with special semantics in the presence of
- concurrency. This is not intended to be a precise guide to the semantics;
- the details can get extremely complicated and unreadable, and are not
- usually necessary.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="outsideatomic">Optimization outside atomic</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The basic <code>'load'</code> and <code>'store'</code> instructions allow a variety of
- optimizations, but can lead to undefined results in a concurrent environment;
- see <a href="#o_nonatomic">NonAtomic</a>. This section specifically goes
- into the one optimizer restriction which applies in concurrent environments,
- which gets a bit more of an extended description because any optimization
- dealing with stores needs to be aware of it.</p>
-
-<p>From the optimizer's point of view, the rule is that if there
- are not any instructions with atomic ordering involved, concurrency does
- not matter, with one exception: if a variable might be visible to another
- thread or signal handler, a store cannot be inserted along a path where it
- might not execute otherwise. Take the following example:</p>
-
-<pre>
-/* C code, for readability; run through clang -O2 -S -emit-llvm to get
- equivalent IR */
-int x;
-void f(int* a) {
- for (int i = 0; i &lt; 100; i++) {
- if (a[i])
- x += 1;
- }
-}
-</pre>
-
-<p>The following is equivalent in non-concurrent situations:</p>
-
-<pre>
-int x;
-void f(int* a) {
- int xtemp = x;
- for (int i = 0; i &lt; 100; i++) {
- if (a[i])
- xtemp += 1;
- }
- x = xtemp;
-}
-</pre>
-
-<p>However, LLVM is not allowed to transform the former to the latter: it could
- indirectly introduce undefined behavior if another thread can access x at
- the same time. (This example is particularly of interest because before the
- concurrency model was implemented, LLVM would perform this
- transformation.)</p>
-
-<p>Note that speculative loads are allowed; a load which
- is part of a race returns <code>undef</code>, but does not have undefined
- behavior.</p>
-
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="atomicinst">Atomic instructions</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>For cases where simple loads and stores are not sufficient, LLVM provides
- various atomic instructions. The exact guarantees provided depend on the
-   ordering; see <a href="#ordering">Atomic orderings</a>.</p>
-
-<p><code>load atomic</code> and <code>store atomic</code> provide the same
- basic functionality as non-atomic loads and stores, but provide additional
- guarantees in situations where threads and signals are involved.</p>
-
-<p><code>cmpxchg</code> and <code>atomicrmw</code> are essentially like an
- atomic load followed by an atomic store (where the store is conditional for
- <code>cmpxchg</code>), but no other memory operation can happen on any thread
- between the load and store. Note that LLVM's cmpxchg does not provide quite
- as many options as the C++0x version.</p>
-
-<p>A <code>fence</code> provides Acquire and/or Release ordering which is not
- part of another operation; it is normally used along with Monotonic memory
- operations. A Monotonic load followed by an Acquire fence is roughly
- equivalent to an Acquire load.</p>
-
-<p>Frontends generating atomic instructions generally need to be aware of the
- target to some degree; atomic instructions are guaranteed to be lock-free,
- and therefore an instruction which is wider than the target natively supports
- can be impossible to generate.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="ordering">Atomic orderings</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>In order to achieve a balance between performance and necessary guarantees,
- there are six levels of atomicity. They are listed in order of strength;
- each level includes all the guarantees of the previous level except for
- Acquire/Release. (See also <a href="LangRef.html#ordering">LangRef</a>.)</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="o_notatomic">NotAtomic</a>
-</h3>
-
-<div>
-
-<p>NotAtomic is the obvious, a load or store which is not atomic. (This isn't
- really a level of atomicity, but is listed here for comparison.) This is
- essentially a regular load or store. If there is a race on a given memory
- location, loads from that location return undef.</p>
-
-<dl>
- <dt>Relevant standard</dt>
- <dd>This is intended to match shared variables in C/C++, and to be used
- in any other context where memory access is necessary, and
- a race is impossible. (The precise definition is in
- <a href="LangRef.html#memmodel">LangRef</a>.)
- <dt>Notes for frontends</dt>
- <dd>The rule is essentially that all memory accessed with basic loads and
- stores by multiple threads should be protected by a lock or other
- synchronization; otherwise, you are likely to run into undefined
- behavior. If your frontend is for a "safe" language like Java,
- use Unordered to load and store any shared variable. Note that NotAtomic
- volatile loads and stores are not properly atomic; do not try to use
- them as a substitute. (Per the C/C++ standards, volatile does provide
- some limited guarantees around asynchronous signals, but atomics are
- generally a better solution.)
- <dt>Notes for optimizers</dt>
- <dd>Introducing loads to shared variables along a codepath where they would
- not otherwise exist is allowed; introducing stores to shared variables
- is not. See <a href="#outsideatomic">Optimization outside
- atomic</a>.</dd>
- <dt>Notes for code generation</dt>
- <dd>The one interesting restriction here is that it is not allowed to write
- to bytes outside of the bytes relevant to a store. This is mostly
- relevant to unaligned stores: it is not allowed in general to convert
- an unaligned store into two aligned stores of the same width as the
- unaligned store. Backends are also expected to generate an i8 store
- as an i8 store, and not an instruction which writes to surrounding
- bytes. (If you are writing a backend for an architecture which cannot
- satisfy these restrictions and cares about concurrency, please send an
- email to llvmdev.)</dd>
-</dl>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="o_unordered">Unordered</a>
-</h3>
-
-<div>
-
-<p>Unordered is the lowest level of atomicity. It essentially guarantees that
- races produce somewhat sane results instead of having undefined behavior.
- It also guarantees the operation to be lock-free, so it does not depend on
- the data being part of a special atomic structure or depend on a separate
- per-process global lock. Note that code generation will fail for
- unsupported atomic operations; if you need such an operation, use explicit
- locking.</p>
-
-<dl>
- <dt>Relevant standard</dt>
- <dd>This is intended to match the Java memory model for shared
- variables.</dd>
- <dt>Notes for frontends</dt>
- <dd>This cannot be used for synchronization, but is useful for Java and
- other "safe" languages which need to guarantee that the generated
- code never exhibits undefined behavior. Note that this guarantee
- is cheap on common platforms for loads of a native width, but can
- be expensive or unavailable for wider loads, like a 64-bit store
- on ARM. (A frontend for Java or other "safe" languages would normally
- split a 64-bit store on ARM into two 32-bit unordered stores.)
- <dt>Notes for optimizers</dt>
- <dd>In terms of the optimizer, this prohibits any transformation that
- transforms a single load into multiple loads, transforms a store
- into multiple stores, narrows a store, or stores a value which
- would not be stored otherwise. Some examples of unsafe optimizations
- are narrowing an assignment into a bitfield, rematerializing
- a load, and turning loads and stores into a memcpy call. Reordering
- unordered operations is safe, though, and optimizers should take
- advantage of that because unordered operations are common in
- languages that need them.</dd>
- <dt>Notes for code generation</dt>
- <dd>These operations are required to be atomic in the sense that if you
- use unordered loads and unordered stores, a load cannot see a value
- which was never stored. A normal load or store instruction is usually
- sufficient, but note that an unordered load or store cannot
- be split into multiple instructions (or an instruction which
- does multiple memory operations, like <code>LDRD</code> on ARM).</dd>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="o_monotonic">Monotonic</a>
-</h3>
-
-<div>
-
-<p>Monotonic is the weakest level of atomicity that can be used in
- synchronization primitives, although it does not provide any general
- synchronization. It essentially guarantees that if you take all the
- operations affecting a specific address, a consistent ordering exists.
-
-<dl>
- <dt>Relevant standard</dt>
- <dd>This corresponds to the C++0x/C1x <code>memory_order_relaxed</code>;
- see those standards for the exact definition.
- <dt>Notes for frontends</dt>
- <dd>If you are writing a frontend which uses this directly, use with caution.
- The guarantees in terms of synchronization are very weak, so make
- sure these are only used in a pattern which you know is correct.
- Generally, these would either be used for atomic operations which
- do not protect other memory (like an atomic counter), or along with
- a <code>fence</code>.</dd>
- <dt>Notes for optimizers</dt>
- <dd>In terms of the optimizer, this can be treated as a read+write on the
- relevant memory location (and alias analysis will take advantage of
- that). In addition, it is legal to reorder non-atomic and Unordered
- loads around Monotonic loads. CSE/DSE and a few other optimizations
- are allowed, but Monotonic operations are unlikely to be used in ways
- which would make those optimizations useful.</dd>
- <dt>Notes for code generation</dt>
- <dd>Code generation is essentially the same as that for unordered for loads
- and stores. No fences are required. <code>cmpxchg</code> and
- <code>atomicrmw</code> are required to appear as a single operation.</dd>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="o_acquire">Acquire</a>
-</h3>
-
-<div>
-
-<p>Acquire provides a barrier of the sort necessary to acquire a lock to access
- other memory with normal loads and stores.
-
-<dl>
- <dt>Relevant standard</dt>
- <dd>This corresponds to the C++0x/C1x <code>memory_order_acquire</code>. It
- should also be used for C++0x/C1x <code>memory_order_consume</code>.
- <dt>Notes for frontends</dt>
- <dd>If you are writing a frontend which uses this directly, use with caution.
- Acquire only provides a semantic guarantee when paired with a Release
- operation.</dd>
- <dt>Notes for optimizers</dt>
- <dd>Optimizers not aware of atomics can treat this like a nothrow call.
- It is also possible to move stores from before an Acquire load
- or read-modify-write operation to after it, and move non-Acquire
- loads from before an Acquire operation to after it.</dd>
- <dt>Notes for code generation</dt>
- <dd>Architectures with weak memory ordering (essentially everything relevant
- today except x86 and SPARC) require some sort of fence to maintain
- the Acquire semantics. The precise fences required varies widely by
- architecture, but for a simple implementation, most architectures provide
- a barrier which is strong enough for everything (<code>dmb</code> on ARM,
- <code>sync</code> on PowerPC, etc.). Putting such a fence after the
- equivalent Monotonic operation is sufficient to maintain Acquire
- semantics for a memory operation.</dd>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="o_acquire">Release</a>
-</h3>
-
-<div>
-
-<p>Release is similar to Acquire, but with a barrier of the sort necessary to
- release a lock.
-
-<dl>
- <dt>Relevant standard</dt>
- <dd>This corresponds to the C++0x/C1x <code>memory_order_release</code>.</dd>
- <dt>Notes for frontends</dt>
- <dd>If you are writing a frontend which uses this directly, use with caution.
- Release only provides a semantic guarantee when paired with an Acquire
- operation.</dd>
- <dt>Notes for optimizers</dt>
- <dd>Optimizers not aware of atomics can treat this like a nothrow call.
- It is also possible to move loads from after a Release store
- or read-modify-write operation to before it, and move non-Release
- stores from after a Release operation to before it.</dd>
- <dt>Notes for code generation</dt>
- <dd>See the section on Acquire; a fence before the relevant operation is
- usually sufficient for Release. Note that a store-store fence is not
- sufficient to implement Release semantics; store-store fences are
- generally not exposed to IR because they are extremely difficult to
- use correctly.</dd>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="o_acqrel">AcquireRelease</a>
-</h3>
-
-<div>
-
-<p>AcquireRelease (<code>acq_rel</code> in IR) provides both an Acquire and a
- Release barrier (for fences and operations which both read and write memory).
-
-<dl>
- <dt>Relevant standard</dt>
- <dd>This corresponds to the C++0x/C1x <code>memory_order_acq_rel</code>.
- <dt>Notes for frontends</dt>
- <dd>If you are writing a frontend which uses this directly, use with caution.
- Acquire only provides a semantic guarantee when paired with a Release
- operation, and vice versa.</dd>
- <dt>Notes for optimizers</dt>
- <dd>In general, optimizers should treat this like a nothrow call; the
- possible optimizations are usually not interesting.</dd>
- <dt>Notes for code generation</dt>
- <dd>This operation has Acquire and Release semantics; see the sections on
- Acquire and Release.</dd>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="o_seqcst">SequentiallyConsistent</a>
-</h3>
-
-<div>
-
-<p>SequentiallyConsistent (<code>seq_cst</code> in IR) provides
- Acquire semantics for loads and Release semantics for
- stores. Additionally, it guarantees that a total ordering exists
- between all SequentiallyConsistent operations.
-
-<dl>
- <dt>Relevant standard</dt>
- <dd>This corresponds to the C++0x/C1x <code>memory_order_seq_cst</code>,
- Java volatile, and the gcc-compatible <code>__sync_*</code> builtins
- which do not specify otherwise.
- <dt>Notes for frontends</dt>
- <dd>If a frontend is exposing atomic operations, these are much easier to
- reason about for the programmer than other kinds of operations, and using
- them is generally a practical performance tradeoff.</dd>
- <dt>Notes for optimizers</dt>
- <dd>Optimizers not aware of atomics can treat this like a nothrow call.
- For SequentiallyConsistent loads and stores, the same reorderings are
- allowed as for Acquire loads and Release stores, except that
- SequentiallyConsistent operations may not be reordered.</dd>
- <dt>Notes for code generation</dt>
- <dd>SequentiallyConsistent loads minimally require the same barriers
- as Acquire operations and SequentiallyConsistent stores require
- Release barriers. Additionally, the code generator must enforce
- ordering between SequentiallyConsistent stores followed by
- SequentiallyConsistent loads. This is usually done by emitting
- either a full fence before the loads or a full fence after the
- stores; which is preferred varies by architecture.</dd>
-</dl>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="iropt">Atomics and IR optimization</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Predicates for optimizer writers to query:
-<ul>
- <li>isSimple(): A load or store which is not volatile or atomic. This is
- what, for example, memcpyopt would check for operations it might
- transform.</li>
- <li>isUnordered(): A load or store which is not volatile and at most
- Unordered. This would be checked, for example, by LICM before hoisting
- an operation.</li>
- <li>mayReadFromMemory()/mayWriteToMemory(): Existing predicate, but note
- that they return true for any operation which is volatile or at least
- Monotonic.</li>
- <li>Alias analysis: Note that AA will return ModRef for anything Acquire or
- Release, and for the address accessed by any Monotonic operation.</li>
-</ul>
-
-<p>To support optimizing around atomic operations, make sure you are using
- the right predicates; everything should work if that is done. If your
- pass should optimize some atomic operations (Unordered operations in
- particular), make sure it doesn't replace an atomic load or store with
- a non-atomic operation.</p>
-
-<p>Some examples of how optimizations interact with various kinds of atomic
- operations:
-<ul>
- <li>memcpyopt: An atomic operation cannot be optimized into part of a
- memcpy/memset, including unordered loads/stores. It can pull operations
- across some atomic operations.
- <li>LICM: Unordered loads/stores can be moved out of a loop. It just treats
- monotonic operations like a read+write to a memory location, and anything
- stricter than that like a nothrow call.
- <li>DSE: Unordered stores can be DSE'ed like normal stores. Monotonic stores
- can be DSE'ed in some cases, but it's tricky to reason about, and not
- especially important.
- <li>Folding a load: Any atomic load from a constant global can be
- constant-folded, because it cannot be observed. Similar reasoning allows
- scalarrepl with atomic loads and stores.
-</ul>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="codegen">Atomics and Codegen</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Atomic operations are represented in the SelectionDAG with
- <code>ATOMIC_*</code> opcodes. On architectures which use barrier
- instructions for all atomic ordering (like ARM), appropriate fences are
- split out as the DAG is built.</p>
-
-<p>The MachineMemOperand for all atomic operations is currently marked as
- volatile; this is not correct in the IR sense of volatile, but CodeGen
- handles anything marked volatile very conservatively. This should get
- fixed at some point.</p>
-
-<p>Common architectures have some way of representing at least a pointer-sized
- lock-free <code>cmpxchg</code>; such an operation can be used to implement
- all the other atomic operations which can be represented in IR up to that
- size. Backends are expected to implement all those operations, but not
- operations which cannot be implemented in a lock-free manner. It is
- expected that backends will give an error when given an operation which
- cannot be implemented. (The LLVM code generator is not very helpful here
- at the moment, but hopefully that will change.)</p>
-
-<p>The implementation of atomics on LL/SC architectures (like ARM) is currently
- a bit of a mess; there is a lot of copy-pasted code across targets, and
- the representation is relatively unsuited to optimization (it would be nice
- to be able to optimize loops involving cmpxchg etc.).</p>
-
-<p>On x86, all atomic loads generate a <code>MOV</code>.
- SequentiallyConsistent stores generate an <code>XCHG</code>, other stores
- generate a <code>MOV</code>. SequentiallyConsistent fences generate an
- <code>MFENCE</code>, other fences do not cause any code to be generated.
- cmpxchg uses the <code>LOCK CMPXCHG</code> instruction.
- <code>atomicrmw xchg</code> uses <code>XCHG</code>,
- <code>atomicrmw add</code> and <code>atomicrmw sub</code> use
- <code>XADD</code>, and all other <code>atomicrmw</code> operations generate
- a loop with <code>LOCK CMPXCHG</code>. Depending on the users of the
- result, some <code>atomicrmw</code> operations can be translated into
- operations like <code>LOCK AND</code>, but that does not work in
- general.</p>
-
-<p>On ARM, MIPS, and many other RISC architectures, Acquire, Release, and
- SequentiallyConsistent semantics require barrier instructions
- for every such operation. Loads and stores generate normal instructions.
- <code>cmpxchg</code> and <code>atomicrmw</code> can be represented using
- a loop with LL/SC-style instructions which take some sort of exclusive
- lock on a cache line (<code>LDREX</code> and <code>STREX</code> on
- ARM, etc.). At the moment, the IR does not provide any way to represent a
- weak <code>cmpxchg</code> which would not require a loop.</p>
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-08-09 02:07:00 -0700 (Tue, 09 Aug 2011) $
-</address>
-
-</body>
-</html>
diff --git a/docs/Atomics.rst b/docs/Atomics.rst
new file mode 100644
index 0000000..1bca53e
--- /dev/null
+++ b/docs/Atomics.rst
@@ -0,0 +1,441 @@
+.. _atomics:
+
+==============================================
+LLVM Atomic Instructions and Concurrency Guide
+==============================================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+Historically, LLVM has not had very strong support for concurrency; some minimal
+intrinsics were provided, and ``volatile`` was used in some cases to achieve
+rough semantics in the presence of concurrency. However, this is changing;
+there are now new instructions which are well-defined in the presence of threads
+and asynchronous signals, and the model for existing instructions has been
+clarified in the IR.
+
+The atomic instructions are designed specifically to provide readable IR and
+optimized code generation for the following:
+
+* The new C++0x ``<atomic>`` header. (`C++0x draft available here
+ <http://www.open-std.org/jtc1/sc22/wg21/>`_.) (`C1x draft available here
+ <http://www.open-std.org/jtc1/sc22/wg14/>`_.)
+
+* Proper semantics for Java-style memory, for both ``volatile`` and regular
+ shared variables. (`Java Specification
+ <http://java.sun.com/docs/books/jls/third_edition/html/memory.html>`_)
+
+* gcc-compatible ``__sync_*`` builtins. (`Description
+ <http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html>`_)
+
+* Other scenarios with atomic semantics, including ``static`` variables with
+ non-trivial constructors in C++.
+
+Atomic and volatile in the IR are orthogonal; "volatile" is the C/C++ volatile,
+which ensures that every volatile load and store happens and is performed in the
+stated order. A couple of examples: if a SequentiallyConsistent store is
+immediately followed by another SequentiallyConsistent store to the same
+address, the first store can be erased. This transformation is not allowed for a
+pair of volatile stores. On the other hand, a non-volatile non-atomic load can
+be moved across a volatile load freely, but not an Acquire load.
+
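+As an illustrative sketch of the store-erasure case (the tiny function below is
+hypothetical, not taken from a real module), the first of these two
+SequentiallyConsistent stores can be removed, whereas two volatile stores would
+both have to remain:
+
+.. code-block:: llvm
+
+  @x = global i32 0
+
+  define void @g() {
+    ; The first store is immediately overwritten, so it can be erased;
+    ; marking both stores volatile would forbid this.
+    store atomic i32 1, i32* @x seq_cst, align 4
+    store atomic i32 2, i32* @x seq_cst, align 4
+    ret void
+  }
+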
+This document is intended as a guide for anyone either writing a frontend for
+LLVM or working on optimization passes for LLVM, explaining how to deal with
+instructions with special semantics in the presence of concurrency. It is not
+intended to be a precise specification of the semantics; the details can get
+extremely complicated and unreadable, and are not usually necessary.
+
+.. _Optimization outside atomic:
+
+Optimization outside atomic
+===========================
+
+The basic ``'load'`` and ``'store'`` allow a variety of optimizations, but can
+lead to undefined results in a concurrent environment; see `NotAtomic`_. This
+section specifically covers the one optimizer restriction which applies in
+concurrent environments; it gets an extended description here because any
+optimization dealing with stores needs to be aware of it.
+
+From the optimizer's point of view, the rule is that if there are not any
+instructions with atomic ordering involved, concurrency does not matter, with
+one exception: if a variable might be visible to another thread or signal
+handler, a store cannot be inserted along a path where it might not execute
+otherwise. Take the following example:
+
+.. code-block:: c
+
+  /* C code, for readability; run through clang -O2 -S -emit-llvm to get
+     equivalent IR */
+  int x;
+  void f(int* a) {
+    for (int i = 0; i < 100; i++) {
+      if (a[i])
+        x += 1;
+    }
+  }
+
+The following is equivalent in non-concurrent situations:
+
+.. code-block:: c
+
+  int x;
+  void f(int* a) {
+    int xtemp = x;
+    for (int i = 0; i < 100; i++) {
+      if (a[i])
+        xtemp += 1;
+    }
+    x = xtemp;
+  }
+
+However, LLVM is not allowed to transform the former to the latter: it could
+indirectly introduce undefined behavior if another thread can access ``x`` at
+the same time. (This example is particularly of interest because before the
+concurrency model was implemented, LLVM would perform this transformation.)
+
+Note that speculative loads are allowed; a load which is part of a race returns
+``undef``, but does not have undefined behavior.
+
+Atomic instructions
+===================
+
+For cases where simple loads and stores are not sufficient, LLVM provides
+various atomic instructions. The exact guarantees provided depend on the
+ordering; see `Atomic orderings`_.
+
+``load atomic`` and ``store atomic`` provide the same basic functionality as
+non-atomic loads and stores, but provide additional guarantees in situations
+where threads and signals are involved.
+
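+A minimal sketch of the syntax (the function and pointer names here are
+hypothetical):
+
+.. code-block:: llvm
+
+  define i32 @read_flag(i32* %p) {
+    ; Atomic loads and stores take an explicit ordering and alignment.
+    %v = load atomic i32* %p acquire, align 4
+    ret i32 %v
+  }
+
+  define void @write_flag(i32* %p) {
+    store atomic i32 1, i32* %p release, align 4
+    ret void
+  }
+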
+``cmpxchg`` and ``atomicrmw`` are essentially like an atomic load followed by an
+atomic store (where the store is conditional for ``cmpxchg``), but no other
+memory operation can happen on any thread between the load and store. Note that
+LLVM's cmpxchg does not provide quite as many options as the C++0x version.
+
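+A rough sketch of both read-modify-write instructions (hypothetical functions):
+
+.. code-block:: llvm
+
+  define i32 @cas(i32* %p, i32 %old, i32 %new) {
+    ; Store %new if *%p equals %old; yields the value loaded, whether
+    ; or not the exchange happened.
+    %loaded = cmpxchg i32* %p, i32 %old, i32 %new seq_cst
+    ret i32 %loaded
+  }
+
+  define i32 @fetch_add(i32* %p) {
+    ; Atomically add 1 to *%p; yields the value before the addition.
+    %prev = atomicrmw add i32* %p, i32 1 seq_cst
+    ret i32 %prev
+  }
+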
+A ``fence`` provides Acquire and/or Release ordering which is not part of
+another operation; it is normally used along with Monotonic memory operations.
+A Monotonic load followed by an Acquire fence is roughly equivalent to an
+Acquire load.
+
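+For example, a sketch of the pattern just described (hypothetical function):
+
+.. code-block:: llvm
+
+  define i32 @monotonic_load_then_fence(i32* %p) {
+    ; Roughly equivalent to an Acquire load from %p.
+    %v = load atomic i32* %p monotonic, align 4
+    fence acquire
+    ret i32 %v
+  }
+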
+Frontends generating atomic instructions generally need to be aware of the
+target to some degree; atomic instructions are guaranteed to be lock-free, and
+therefore an instruction which is wider than the target natively supports can be
+impossible to generate.
+
+.. _Atomic orderings:
+
+Atomic orderings
+================
+
+In order to achieve a balance between performance and necessary guarantees,
+there are six levels of atomicity. They are listed in order of strength; each
+level includes all the guarantees of the previous level except for
+Acquire/Release. (See also `LangRef Ordering <LangRef.html#ordering>`_.)
+
+.. _NotAtomic:
+
+NotAtomic
+---------
+
+NotAtomic is the obvious case: a load or store which is not atomic. (This isn't
+really a level of atomicity, but is listed here for comparison.) This is
+essentially a regular load or store. If there is a race on a given memory
+location, loads from that location return undef.
+
+Relevant standard
+  This is intended to match shared variables in C/C++, and to be used in any
+  other context where memory access is necessary and a race is impossible. (The
+  precise definition is in `LangRef Memory Model <LangRef.html#memmodel>`_.)
+
+Notes for frontends
+ The rule is essentially that all memory accessed with basic loads and stores
+ by multiple threads should be protected by a lock or other synchronization;
+ otherwise, you are likely to run into undefined behavior. If your frontend is
+ for a "safe" language like Java, use Unordered to load and store any shared
+ variable. Note that NotAtomic volatile loads and stores are not properly
+ atomic; do not try to use them as a substitute. (Per the C/C++ standards,
+ volatile does provide some limited guarantees around asynchronous signals, but
+ atomics are generally a better solution.)
+
+Notes for optimizers
+ Introducing loads to shared variables along a codepath where they would not
+ otherwise exist is allowed; introducing stores to shared variables is not. See
+ `Optimization outside atomic`_.
+
+Notes for code generation
+ The one interesting restriction here is that it is not allowed to write to
+ bytes outside of the bytes relevant to a store. This is mostly relevant to
+ unaligned stores: it is not allowed in general to convert an unaligned store
+ into two aligned stores of the same width as the unaligned store. Backends are
+ also expected to generate an i8 store as an i8 store, and not an instruction
+ which writes to surrounding bytes. (If you are writing a backend for an
+ architecture which cannot satisfy these restrictions and cares about
+ concurrency, please send an email to llvmdev.)
+
+Unordered
+---------
+
+Unordered is the lowest level of atomicity. It essentially guarantees that races
+produce somewhat sane results instead of having undefined behavior. It also
+guarantees the operation to be lock-free, so it does not depend on the data
+being part of a special atomic structure or on a separate per-process global
+lock. Note that code generation will fail for unsupported atomic operations; if
+you need such an operation, use explicit locking.
+
+Relevant standard
+ This is intended to match the Java memory model for shared variables.
+
+Notes for frontends
+  This cannot be used for synchronization, but is useful for Java and other
+  "safe" languages which need to guarantee that the generated code never
+  exhibits undefined behavior. Note that this guarantee is cheap on common
+  platforms for loads and stores of a native width, but can be expensive or
+  unavailable for wider operations, like a 64-bit store on ARM. (A frontend for
+  Java or other "safe" languages would normally split a 64-bit store on ARM
+  into two 32-bit unordered stores, as sketched at the end of this section.)
+
+Notes for optimizers
+ In terms of the optimizer, this prohibits any transformation that transforms a
+ single load into multiple loads, transforms a store into multiple stores,
+ narrows a store, or stores a value which would not be stored otherwise. Some
+ examples of unsafe optimizations are narrowing an assignment into a bitfield,
+ rematerializing a load, and turning loads and stores into a memcpy
+ call. Reordering unordered operations is safe, though, and optimizers should
+ take advantage of that because unordered operations are common in languages
+ that need them.
+
+Notes for code generation
+ These operations are required to be atomic in the sense that if you use
+ unordered loads and unordered stores, a load cannot see a value which was
+ never stored. A normal load or store instruction is usually sufficient, but
+ note that an unordered load or store cannot be split into multiple
+ instructions (or an instruction which does multiple memory operations, like
+ ``LDRD`` on ARM).
+
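+As a sketch of the 64-bit split mentioned in the frontend notes above (all
+names, and the ARM-targeted scenario, are illustrative assumptions), a frontend
+might emit something like:
+
+.. code-block:: llvm
+
+  define void @store_i64(i64* %p, i64 %v) {
+    ; Split a 64-bit store into two 32-bit unordered stores when the
+    ; target has no lock-free 64-bit store.
+    %lo = trunc i64 %v to i32
+    %shifted = lshr i64 %v, 32
+    %hi = trunc i64 %shifted to i32
+    %plo = bitcast i64* %p to i32*
+    %phi = getelementptr i32* %plo, i32 1
+    store atomic i32 %lo, i32* %plo unordered, align 4
+    store atomic i32 %hi, i32* %phi unordered, align 4
+    ret void
+  }
+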
+Monotonic
+---------
+
+Monotonic is the weakest level of atomicity that can be used in synchronization
+primitives, although it does not provide any general synchronization. It
+essentially guarantees that if you take all the operations affecting a specific
+address, a consistent ordering exists.
+
+Relevant standard
+ This corresponds to the C++0x/C1x ``memory_order_relaxed``; see those
+ standards for the exact definition.
+
+Notes for frontends
+  If you are writing a frontend which uses this directly, use with caution. The
+  guarantees in terms of synchronization are very weak, so make sure these are
+  only used in a pattern which you know is correct. Generally, these would
+  either be used for atomic operations which do not protect other memory (like
+  the atomic counter sketched at the end of this section), or along with a
+  ``fence``.
+
+Notes for optimizers
+ In terms of the optimizer, this can be treated as a read+write on the relevant
+ memory location (and alias analysis will take advantage of that). In addition,
+ it is legal to reorder non-atomic and Unordered loads around Monotonic
+ loads. CSE/DSE and a few other optimizations are allowed, but Monotonic
+ operations are unlikely to be used in ways which would make those
+ optimizations useful.
+
+Notes for code generation
+  Code generation is essentially the same as for unordered loads and
+  stores. No fences are required. ``cmpxchg`` and ``atomicrmw`` are required
+  to appear as a single operation.
+
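+A minimal sketch of the atomic-counter pattern mentioned in the frontend notes
+(the global ``@counter`` is hypothetical):
+
+.. code-block:: llvm
+
+  @counter = global i32 0
+
+  define i32 @next_id() {
+    ; Monotonic suffices: the counter protects no other memory, so only
+    ; the increment itself needs to be atomic.
+    %old = atomicrmw add i32* @counter, i32 1 monotonic
+    ret i32 %old
+  }
+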
+Acquire
+-------
+
+Acquire provides a barrier of the sort necessary to acquire a lock to access
+other memory with normal loads and stores.
+
+Relevant standard
+ This corresponds to the C++0x/C1x ``memory_order_acquire``. It should also be
+ used for C++0x/C1x ``memory_order_consume``.
+
+Notes for frontends
+ If you are writing a frontend which uses this directly, use with caution.
+ Acquire only provides a semantic guarantee when paired with a Release
+ operation.
+
+Notes for optimizers
+ Optimizers not aware of atomics can treat this like a nothrow call. It is
+ also possible to move stores from before an Acquire load or read-modify-write
+ operation to after it, and move non-Acquire loads from before an Acquire
+ operation to after it.
+
+Notes for code generation
+  Architectures with weak memory ordering (essentially everything relevant
+  today except x86 and SPARC) require some sort of fence to maintain the
+  Acquire semantics. The precise fences required vary widely by architecture,
+  but for a simple implementation, most architectures provide a barrier which
+  is strong enough for everything (``dmb`` on ARM, ``sync`` on PowerPC, etc.).
+  Putting such a fence after the equivalent Monotonic operation is sufficient
+  to maintain Acquire semantics for a memory operation.
+
+Release
+-------
+
+Release is similar to Acquire, but with a barrier of the sort necessary to
+release a lock.
+
+Relevant standard
+ This corresponds to the C++0x/C1x ``memory_order_release``.
+
+Notes for frontends
+  If you are writing a frontend which uses this directly, use with caution.
+  Release only provides a semantic guarantee when paired with an Acquire
+  operation.
+
+Notes for optimizers
+  Optimizers not aware of atomics can treat this like a nothrow call. It is
+  also possible to move loads from after a Release store or read-modify-write
+  operation to before it, and move non-Release stores from after a Release
+  operation to before it.
+
+Notes for code generation
+ See the section on Acquire; a fence before the relevant operation is usually
+ sufficient for Release. Note that a store-store fence is not sufficient to
+ implement Release semantics; store-store fences are generally not exposed to
+ IR because they are extremely difficult to use correctly.
+
+AcquireRelease
+--------------
+
+AcquireRelease (``acq_rel`` in IR) provides both an Acquire and a Release
+barrier (for fences and operations which both read and write memory).
+
+Relevant standard
+ This corresponds to the C++0x/C1x ``memory_order_acq_rel``.
+
+Notes for frontends
+ If you are writing a frontend which uses this directly, use with caution.
+ Acquire only provides a semantic guarantee when paired with a Release
+ operation, and vice versa.
+
+Notes for optimizers
+ In general, optimizers should treat this like a nothrow call; the possible
+ optimizations are usually not interesting.
+
+Notes for code generation
+ This operation has Acquire and Release semantics; see the sections on Acquire
+ and Release.
+
+SequentiallyConsistent
+----------------------
+
+SequentiallyConsistent (``seq_cst`` in IR) provides Acquire semantics for loads
+and Release semantics for stores. Additionally, it guarantees that a total
+ordering exists between all SequentiallyConsistent operations.
+
+Relevant standard
+ This corresponds to the C++0x/C1x ``memory_order_seq_cst``, Java volatile, and
+ the gcc-compatible ``__sync_*`` builtins which do not specify otherwise.
+
+Notes for frontends
+ If a frontend is exposing atomic operations, these are much easier to reason
+ about for the programmer than other kinds of operations, and using them is
+ generally a practical performance tradeoff.
+
+Notes for optimizers
+ Optimizers not aware of atomics can treat this like a nothrow call. For
+ SequentiallyConsistent loads and stores, the same reorderings are allowed as
+ for Acquire loads and Release stores, except that SequentiallyConsistent
+ operations may not be reordered.
+
+Notes for code generation
+ SequentiallyConsistent loads minimally require the same barriers as Acquire
+ operations and SequentiallyConsistent stores require Release
+ barriers. Additionally, the code generator must enforce ordering between
+ SequentiallyConsistent stores followed by SequentiallyConsistent loads. This
+ is usually done by emitting either a full fence before the loads or a full
+ fence after the stores; which is preferred varies by architecture.
+
+Atomics and IR optimization
+===========================
+
+Predicates for optimizer writers to query (see the IR sketch after this list):
+
+* ``isSimple()``: A load or store which is not volatile or atomic. This is
+ what, for example, memcpyopt would check for operations it might transform.
+
+* ``isUnordered()``: A load or store which is not volatile and at most
+ Unordered. This would be checked, for example, by LICM before hoisting an
+ operation.
+
+* ``mayReadFromMemory()``/``mayWriteToMemory()``: Existing predicates, but note
+ that they return true for any operation which is volatile or at least
+ Monotonic.
+
+* Alias analysis: Note that AA will return ModRef for anything Acquire or
+ Release, and for the address accessed by any Monotonic operation.
+
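+As a rough illustration of how the first two predicates classify loads (a
+hypothetical snippet, not taken from a real pass):
+
+.. code-block:: llvm
+
+  %a = load i32* %p                            ; isSimple() and isUnordered()
+  %b = load atomic i32* %p unordered, align 4  ; isUnordered() only
+  %c = load atomic i32* %p monotonic, align 4  ; neither
+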
+To support optimizing around atomic operations, make sure you are using the
+right predicates; everything should work if that is done. If your pass should
+optimize some atomic operations (Unordered operations in particular), make sure
+it doesn't replace an atomic load or store with a non-atomic operation.
+
+Some examples of how optimizations interact with various kinds of atomic
+operations:
+
+* ``memcpyopt``: An atomic operation cannot be optimized into part of a
+ memcpy/memset, including unordered loads/stores. It can pull operations
+ across some atomic operations.
+
+* LICM: Unordered loads/stores can be moved out of a loop. It just treats
+ monotonic operations like a read+write to a memory location, and anything
+ stricter than that like a nothrow call.
+
+* DSE: Unordered stores can be DSE'ed like normal stores. Monotonic stores can
+ be DSE'ed in some cases, but it's tricky to reason about, and not especially
+ important.
+
+* Folding a load: Any atomic load from a constant global can be constant-folded,
+ because it cannot be observed. Similar reasoning allows scalarrepl with
+ atomic loads and stores.
+
+Atomics and Codegen
+===================
+
+Atomic operations are represented in the SelectionDAG with ``ATOMIC_*`` opcodes.
+On architectures which use barrier instructions for all atomic ordering (like
+ARM), appropriate fences are split out as the DAG is built.
+
+The MachineMemOperand for all atomic operations is currently marked as volatile;
+this is not correct in the IR sense of volatile, but CodeGen handles anything
+marked volatile very conservatively. This should get fixed at some point.
+
+Common architectures have some way of representing at least a pointer-sized
+lock-free ``cmpxchg``; such an operation can be used to implement all the other
+atomic operations which can be represented in IR up to that size. Backends are
+expected to implement all those operations, but not operations which cannot be
+implemented in a lock-free manner. It is expected that backends will give an
+error when given an operation which cannot be implemented. (The LLVM code
+generator is not very helpful here at the moment, but hopefully that will
+change.)
+
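+For instance, a sketch of how an ``atomicrmw max`` could be expanded into a
+``cmpxchg`` loop (hand-written IR for illustration, not actual backend output):
+
+.. code-block:: llvm
+
+  define i32 @rmw_max(i32* %p, i32 %v) {
+  entry:
+    %init = load atomic i32* %p monotonic, align 4
+    br label %loop
+  loop:
+    %old = phi i32 [ %init, %entry ], [ %loaded, %loop ]
+    %cmp = icmp sgt i32 %old, %v
+    %new = select i1 %cmp, i32 %old, i32 %v
+    ; If %loaded != %old, another thread intervened; retry with the
+    ; value we just observed.
+    %loaded = cmpxchg i32* %p, i32 %old, i32 %new seq_cst
+    %ok = icmp eq i32 %loaded, %old
+    br i1 %ok, label %done, label %loop
+  done:
+    ret i32 %old
+  }
+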
+The implementation of atomics on LL/SC architectures (like ARM) is currently a
+bit of a mess; there is a lot of copy-pasted code across targets, and the
+representation is relatively unsuited to optimization (it would be nice to be
+able to optimize loops involving cmpxchg etc.).
+
+On x86, all atomic loads generate a ``MOV``. SequentiallyConsistent stores
+generate an ``XCHG``, other stores generate a ``MOV``. SequentiallyConsistent
+fences generate an ``MFENCE``, other fences do not cause any code to be
+generated. ``cmpxchg`` uses the ``LOCK CMPXCHG`` instruction. ``atomicrmw xchg``
+uses ``XCHG``, ``atomicrmw add`` and ``atomicrmw sub`` use ``XADD``, and all
+other ``atomicrmw`` operations generate a loop with ``LOCK CMPXCHG``. Depending
+on the users of the result, some ``atomicrmw`` operations can be translated into
+operations like ``LOCK AND``, but that does not work in general.
+
+On ARM, MIPS, and many other RISC architectures, Acquire, Release, and
+SequentiallyConsistent semantics require barrier instructions for every such
+operation. Loads and stores generate normal instructions. ``cmpxchg`` and
+``atomicrmw`` can be represented using a loop with LL/SC-style instructions
+which take some sort of exclusive lock on a cache line (``LDREX`` and ``STREX``
+on ARM, etc.). At the moment, the IR does not provide any way to represent a
+weak ``cmpxchg`` which would not require a loop.
diff --git a/docs/BitCodeFormat.html b/docs/BitCodeFormat.html
deleted file mode 100644
index 9a042a0..0000000
--- a/docs/BitCodeFormat.html
+++ /dev/null
@@ -1,1470 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM Bitcode File Format</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-<h1> LLVM Bitcode File Format</h1>
-<ol>
- <li><a href="#abstract">Abstract</a></li>
- <li><a href="#overview">Overview</a></li>
- <li><a href="#bitstream">Bitstream Format</a>
- <ol>
- <li><a href="#magic">Magic Numbers</a></li>
- <li><a href="#primitives">Primitives</a></li>
- <li><a href="#abbrevid">Abbreviation IDs</a></li>
- <li><a href="#blocks">Blocks</a></li>
- <li><a href="#datarecord">Data Records</a></li>
- <li><a href="#abbreviations">Abbreviations</a></li>
- <li><a href="#stdblocks">Standard Blocks</a></li>
- </ol>
- </li>
- <li><a href="#wrapper">Bitcode Wrapper Format</a>
- </li>
- <li><a href="#llvmir">LLVM IR Encoding</a>
- <ol>
- <li><a href="#basics">Basics</a></li>
- <li><a href="#MODULE_BLOCK">MODULE_BLOCK Contents</a></li>
- <li><a href="#PARAMATTR_BLOCK">PARAMATTR_BLOCK Contents</a></li>
- <li><a href="#TYPE_BLOCK">TYPE_BLOCK Contents</a></li>
- <li><a href="#CONSTANTS_BLOCK">CONSTANTS_BLOCK Contents</a></li>
- <li><a href="#FUNCTION_BLOCK">FUNCTION_BLOCK Contents</a></li>
- <li><a href="#TYPE_SYMTAB_BLOCK">TYPE_SYMTAB_BLOCK Contents</a></li>
- <li><a href="#VALUE_SYMTAB_BLOCK">VALUE_SYMTAB_BLOCK Contents</a></li>
- <li><a href="#METADATA_BLOCK">METADATA_BLOCK Contents</a></li>
- <li><a href="#METADATA_ATTACHMENT">METADATA_ATTACHMENT Contents</a></li>
- </ol>
- </li>
-</ol>
-<div class="doc_author">
- <p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a>,
- <a href="http://www.reverberate.org">Joshua Haberman</a>,
- and <a href="mailto:housel@acm.org">Peter S. Housel</a>.
-</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="abstract">Abstract</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document describes the LLVM bitstream file format and the encoding of
-the LLVM IR into it.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="overview">Overview</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>
-What is commonly known as the LLVM bitcode file format (also, sometimes
-anachronistically known as bytecode) is actually two things: a <a
-href="#bitstream">bitstream container format</a>
-and an <a href="#llvmir">encoding of LLVM IR</a> into the container format.</p>
-
-<p>
-The bitstream format is an abstract encoding of structured data, very
-similar to XML in some ways. Like XML, bitstream files contain tags, and nested
-structures, and you can parse the file without having to understand the tags.
-Unlike XML, the bitstream format is a binary encoding, and unlike XML it
-provides a mechanism for the file to self-describe "abbreviations", which are
-effectively size optimizations for the content.</p>
-
-<p>LLVM IR files may be optionally embedded into a <a
-href="#wrapper">wrapper</a> structure that makes it easy to embed extra data
-along with LLVM IR files.</p>
-
-<p>This document first describes the LLVM bitstream format, describes the
-wrapper format, then describes the record structure used by LLVM IR files.
-</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="bitstream">Bitstream Format</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>
-The bitstream format is literally a stream of bits, with a very simple
-structure. This structure consists of the following concepts:
-</p>
-
-<ul>
-<li>A "<a href="#magic">magic number</a>" that identifies the contents of
- the stream.</li>
-<li>Encoding <a href="#primitives">primitives</a> like variable bit-rate
- integers.</li>
-<li><a href="#blocks">Blocks</a>, which define nested content.</li>
-<li><a href="#datarecord">Data Records</a>, which describe entities within the
- file.</li>
-<li>Abbreviations, which specify compression optimizations for the file.</li>
-</ul>
-
-<p>Note that the <a
-href="CommandGuide/html/llvm-bcanalyzer.html">llvm-bcanalyzer</a> tool can be
-used to dump and inspect arbitrary bitstreams, which is very useful for
-understanding the encoding.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="magic">Magic Numbers</a>
-</h3>
-
-<div>
-
-<p>The first two bytes of a bitcode file are 'BC' (0x42, 0x43).
-The second two bytes are an application-specific magic number. Generic
-bitcode tools can look at only the first two bytes to verify the file is
-bitcode, while application-specific programs will want to look at all four.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="primitives">Primitives</a>
-</h3>
-
-<div>
-
-<p>
-A bitstream literally consists of a stream of bits, which are read in order
-starting with the least significant bit of each byte. The stream is made up of a
-number of primitive values that encode a stream of unsigned integer values.
-These integers are encoded in two ways: either as <a href="#fixedwidth">Fixed
-Width Integers</a> or as <a href="#variablewidth">Variable Width
-Integers</a>.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="fixedwidth">Fixed Width Integers</a>
-</h4>
-
-<div>
-
-<p>Fixed-width integer values have their low bits emitted directly to the file.
- For example, a 3-bit integer value encodes 1 as 001. Fixed width integers
- are used when there are a well-known number of options for a field. For
- example, boolean values are usually encoded with a 1-bit wide integer.
-</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="variablewidth">Variable Width Integers</a>
-</h4>
-
-<div>
-
-<p>Variable-width integer (VBR) values encode values of arbitrary size,
-optimizing for the case where the values are small. Given a 4-bit VBR field,
-any 3-bit value (0 through 7) is encoded directly, with the high bit set to
-zero. Values larger than N-1 bits emit their bits in a series of N-1 bit
-chunks, where all but the last set the high bit.</p>
-
-<p>For example, the value 27 (0x1B) is encoded as 1011 0011 when emitted as a
-vbr4 value. The first set of four bits indicates the value 3 (011) with a
-continuation piece (indicated by a high bit of 1). The next word indicates a
-value of 24 (011 << 3) with no continuation. The sum (3+24) yields the value
-27.
-</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="char6">6-bit characters</a></h4>
-
-<div>
-
-<p>6-bit characters encode common characters into a fixed 6-bit field. They
-represent the following characters with the following 6-bit values:</p>
-
-<div class="doc_code">
-<pre>
-'a' .. 'z' &mdash; 0 .. 25
-'A' .. 'Z' &mdash; 26 .. 51
-'0' .. '9' &mdash; 52 .. 61
- '.' &mdash; 62
- '_' &mdash; 63
-</pre>
-</div>
-
-<p>This encoding is only suitable for encoding characters and strings that
-consist only of the above characters. It is completely incapable of encoding
-characters not in the set.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="wordalign">Word Alignment</a></h4>
-
-<div>
-
-<p>Occasionally, it is useful to emit zero bits until the bitstream is a
-multiple of 32 bits. This ensures that the bit position in the stream can be
-represented as a multiple of 32-bit words.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="abbrevid">Abbreviation IDs</a>
-</h3>
-
-<div>
-
-<p>
-A bitstream is a sequential series of <a href="#blocks">Blocks</a> and
-<a href="#datarecord">Data Records</a>. Both of these start with an
-abbreviation ID encoded as a fixed-bitwidth field. The width is specified by
-the current block, as described below. The value of the abbreviation ID
-specifies either a builtin ID (which have special meanings, defined below) or
-one of the abbreviation IDs defined for the current block by the stream itself.
-</p>
-
-<p>
-The set of builtin abbrev IDs is:
-</p>
-
-<ul>
-<li><tt>0 - <a href="#END_BLOCK">END_BLOCK</a></tt> &mdash; This abbrev ID marks
- the end of the current block.</li>
-<li><tt>1 - <a href="#ENTER_SUBBLOCK">ENTER_SUBBLOCK</a></tt> &mdash; This
- abbrev ID marks the beginning of a new block.</li>
-<li><tt>2 - <a href="#DEFINE_ABBREV">DEFINE_ABBREV</a></tt> &mdash; This defines
- a new abbreviation.</li>
-<li><tt>3 - <a href="#UNABBREV_RECORD">UNABBREV_RECORD</a></tt> &mdash; This ID
- specifies the definition of an unabbreviated record.</li>
-</ul>
-
-<p>Abbreviation IDs 4 and above are defined by the stream itself, and specify
-an <a href="#abbrev_records">abbreviated record encoding</a>.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="blocks">Blocks</a>
-</h3>
-
-<div>
-
-<p>
-Blocks in a bitstream denote nested regions of the stream, and are identified by
-a content-specific id number (for example, LLVM IR uses an ID of 12 to represent
-function bodies). Block IDs 0-7 are reserved for <a href="#stdblocks">standard blocks</a>
-whose meaning is defined by Bitcode; block IDs 8 and greater are
-application specific. Nested blocks capture the hierarchical structure of the data
-encoded in it, and various properties are associated with blocks as the file is
-parsed. Block definitions allow the reader to efficiently skip blocks
-in constant time if the reader wants a summary of blocks, or if it wants to
-efficiently skip data it does not understand. The LLVM IR reader uses this
-mechanism to skip function bodies, lazily reading them on demand.
-</p>
-
-<p>
-When reading and encoding the stream, several properties are maintained for the
-block. In particular, each block maintains:
-</p>
-
-<ol>
-<li>A current abbrev id width. This value starts at 2 at the beginning of
- the stream, and is set every time a
- block record is entered. The block entry specifies the abbrev id width for
- the body of the block.</li>
-
-<li>A set of abbreviations. Abbreviations may be defined within a block, in
- which case they are only defined in that block (neither subblocks nor
- enclosing blocks see the abbreviation). Abbreviations can also be defined
- inside a <tt><a href="#BLOCKINFO">BLOCKINFO</a></tt> block, in which case
- they are defined in all blocks that match the ID that the BLOCKINFO block is
- describing.
-</li>
-</ol>
-
-<p>
-As sub blocks are entered, these properties are saved and the new sub-block has
-its own set of abbreviations, and its own abbrev id width. When a sub-block is
-popped, the saved values are restored.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="ENTER_SUBBLOCK">ENTER_SUBBLOCK Encoding</a></h4>
-
-<div>
-
-<p><tt>[ENTER_SUBBLOCK, blockid<sub>vbr8</sub>, newabbrevlen<sub>vbr4</sub>,
- &lt;align32bits&gt;, blocklen<sub>32</sub>]</tt></p>
-
-<p>
-The <tt>ENTER_SUBBLOCK</tt> abbreviation ID specifies the start of a new block
-record. The <tt>blockid</tt> value is encoded as an 8-bit VBR identifier, and
-indicates the type of block being entered, which can be
-a <a href="#stdblocks">standard block</a> or an application-specific block.
-The <tt>newabbrevlen</tt> value is a 4-bit VBR, which specifies the abbrev id
-width for the sub-block. The <tt>blocklen</tt> value is a 32-bit aligned value
-that specifies the size of the subblock in 32-bit words. This value allows the
-reader to skip over the entire block in one jump.
-</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="END_BLOCK">END_BLOCK Encoding</a></h4>
-
-<div>
-
-<p><tt>[END_BLOCK, &lt;align32bits&gt;]</tt></p>
-
-<p>
-The <tt>END_BLOCK</tt> abbreviation ID specifies the end of the current block
-record. Its end is aligned to 32-bits to ensure that the size of the block is
-an even multiple of 32-bits.
-</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="datarecord">Data Records</a>
-</h3>
-
-<div>
-<p>
-Data records consist of a record code and a number of (up to) 64-bit
-integer values. The interpretation of the code and values is
-application specific and may vary between different block types.
-Records can be encoded either using an unabbrev record, or with an
-abbreviation. In the LLVM IR format, for example, there is a record
-which encodes the target triple of a module. The code is
-<tt>MODULE_CODE_TRIPLE</tt>, and the values of the record are the
-ASCII codes for the characters in the string.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="UNABBREV_RECORD">UNABBREV_RECORD Encoding</a></h4>
-
-<div>
-
-<p><tt>[UNABBREV_RECORD, code<sub>vbr6</sub>, numops<sub>vbr6</sub>,
- op0<sub>vbr6</sub>, op1<sub>vbr6</sub>, ...]</tt></p>
-
-<p>
-An <tt>UNABBREV_RECORD</tt> provides a default fallback encoding, which is both
-completely general and extremely inefficient. It can describe an arbitrary
-record by emitting the code and operands as VBRs.
-</p>
-
-<p>
-For example, emitting an LLVM IR target triple as an unabbreviated record
-requires emitting the <tt>UNABBREV_RECORD</tt> abbrevid, a vbr6 for the
-<tt>MODULE_CODE_TRIPLE</tt> code, a vbr6 for the length of the string, which is
-equal to the number of operands, and a vbr6 for each character. Because there
-are no letters with values less than 32, each letter would need to be emitted as
-at least a two-part VBR, which means that each letter would require at least 12
-bits. This is not an efficient encoding, but it is fully general.
-</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="abbrev_records">Abbreviated Record Encoding</a></h4>
-
-<div>
-
-<p><tt>[&lt;abbrevid&gt;, fields...]</tt></p>
-
-<p>
-An abbreviated record is an abbreviation id followed by a set of fields that are
-encoded according to the <a href="#abbreviations">abbreviation definition</a>.
-This allows records to be encoded significantly more densely than records
-encoded with the <tt><a href="#UNABBREV_RECORD">UNABBREV_RECORD</a></tt> type,
-and allows the abbreviation types to be specified in the stream itself, which
-allows the files to be completely self describing. The actual encoding of
-abbreviations is defined below.
-</p>
-
-<p>The record code, which is the first field of an abbreviated record,
-may be encoded in the abbreviation definition (as a literal
-operand) or supplied in the abbreviated record (as a Fixed or VBR
-operand value).</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="abbreviations">Abbreviations</a>
-</h3>
-
-<div>
-<p>
-Abbreviations are an important form of compression for bitstreams. The idea is
-to specify a dense encoding for a class of records once, then use that encoding
-to emit many records. It takes space to emit the encoding into the file, but
-the space is recouped (hopefully plus some) when the records that use it are
-emitted.
-</p>
-
-<p>
-Abbreviations can be determined dynamically per client, per file. Because the
-abbreviations are stored in the bitstream itself, different streams of the same
-format can contain different sets of abbreviations according to the needs
-of the specific stream.
-As a concrete example, LLVM IR files usually emit an abbreviation
-for binary operators. If a specific LLVM module contained no or few binary
-operators, the abbreviation does not need to be emitted.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="DEFINE_ABBREV">DEFINE_ABBREV Encoding</a></h4>
-
-<div>
-
-<p><tt>[DEFINE_ABBREV, numabbrevops<sub>vbr5</sub>, abbrevop0, abbrevop1,
- ...]</tt></p>
-
-<p>
-A <tt>DEFINE_ABBREV</tt> record adds an abbreviation to the list of currently
-defined abbreviations in the scope of this block. This definition only exists
-inside this immediate block &mdash; it is not visible in subblocks or enclosing
-blocks. Abbreviations are implicitly assigned IDs sequentially starting from 4
-(the first application-defined abbreviation ID). Any abbreviations defined in a
-<tt>BLOCKINFO</tt> record for the particular block type
-receive IDs first, in order, followed by any
-abbreviations defined within the block itself. Abbreviated data records
-reference this ID to indicate what abbreviation they are invoking.
-</p>
-
-<p>
-An abbreviation definition consists of the <tt>DEFINE_ABBREV</tt> abbrevid
-followed by a VBR that specifies the number of abbrev operands, then the abbrev
-operands themselves. Abbreviation operands come in three forms. They all start
-with a single bit that indicates whether the abbrev operand is a literal operand
-(when the bit is 1) or an encoding operand (when the bit is 0).
-</p>
-
-<ol>
-<li>Literal operands &mdash; <tt>[1<sub>1</sub>, litvalue<sub>vbr8</sub>]</tt>
-&mdash; Literal operands specify that the value in the result is always a single
-specific value. This specific value is emitted as a vbr8 after the bit
-indicating that it is a literal operand.</li>
-<li>Encoding info without data &mdash; <tt>[0<sub>1</sub>,
- encoding<sub>3</sub>]</tt> &mdash; Operand encodings that do not have extra
- data are just emitted as their code.
-</li>
-<li>Encoding info with data &mdash; <tt>[0<sub>1</sub>, encoding<sub>3</sub>,
-value<sub>vbr5</sub>]</tt> &mdash; Operand encodings that do have extra data are
-emitted as their code, followed by the extra data.
-</li>
-</ol>
-
-<p>The possible operand encodings are:</p>
-
-<ul>
-<li>Fixed (code 1): The field should be emitted as
- a <a href="#fixedwidth">fixed-width value</a>, whose width is specified by
- the operand's extra data.</li>
-<li>VBR (code 2): The field should be emitted as
- a <a href="#variablewidth">variable-width value</a>, whose width is
- specified by the operand's extra data.</li>
-<li>Array (code 3): This field is an array of values. The array operand
- has no extra data, but expects another operand to follow it, indicating
- the element type of the array. When reading an array in an abbreviated
- record, the first integer is a vbr6 that indicates the array length,
- followed by the encoded elements of the array. An array may only occur as
- the last operand of an abbreviation (except for the one final operand that
- gives the array's type).</li>
-<li>Char6 (code 4): This field should be emitted as
- a <a href="#char6">char6-encoded value</a>. This operand type takes no
- extra data. Char6 encoding is normally used as an array element type.
- </li>
-<li>Blob (code 5): This field is emitted as a vbr6, followed by padding to a
- 32-bit boundary (for alignment) and an array of 8-bit objects. The array of
- bytes is further followed by tail padding to ensure that its total length is
- a multiple of 4 bytes. This makes it very efficient for the reader to
- decode the data without having to make a copy of it: it can use a pointer to
- the data in the mapped in file and poke directly at it. A blob may only
- occur as the last operand of an abbreviation.</li>
-</ul>
-
-<p>
-For example, target triples in LLVM modules are encoded as a record of the
-form <tt>[TRIPLE, 'a', 'b', 'c', 'd']</tt>. Consider if the bitstream emitted
-the following abbrev entry:
-</p>
-
-<div class="doc_code">
-<pre>
-[0, Fixed, 4]
-[0, Array]
-[0, Char6]
-</pre>
-</div>
-
-<p>
-When emitting a record with this abbreviation, the above entry would be emitted
-as:
-</p>
-
-<div class="doc_code">
-<p>
-<tt>[4<sub>abbrevwidth</sub>, 2<sub>4</sub>, 4<sub>vbr6</sub>, 0<sub>6</sub>,
-1<sub>6</sub>, 2<sub>6</sub>, 3<sub>6</sub>]</tt>
-</p>
-</div>
-
-<p>These values are:</p>
-
-<ol>
-<li>The first value, 4, is the abbreviation ID for this abbreviation.</li>
-<li>The second value, 2, is the record code for <tt>TRIPLE</tt> records within LLVM IR file <tt>MODULE_BLOCK</tt> blocks.</li>
-<li>The third value, 4, is the length of the array.</li>
-<li>The rest of the values are the char6 encoded values
- for <tt>"abcd"</tt>.</li>
-</ol>
-
-<p>
-With this abbreviation, the triple is emitted with only 37 bits (assuming a
-abbrev id width of 3). Without the abbreviation, significantly more space would
-be required to emit the target triple. Also, because the <tt>TRIPLE</tt> value
-is not emitted as a literal in the abbreviation, the abbreviation can also be
-used for any other string value.
-</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="stdblocks">Standard Blocks</a>
-</h3>
-
-<div>
-
-<p>
-In addition to the basic block structure and record encodings, the bitstream
-also defines specific built-in block types. These block types specify how the
-stream is to be decoded or other metadata. In the future, new standard blocks
-may be added. Block IDs 0-7 are reserved for standard blocks.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="BLOCKINFO">#0 - BLOCKINFO Block</a></h4>
-
-<div>
-
-<p>
-The <tt>BLOCKINFO</tt> block allows the description of metadata for other
-blocks. The currently specified records are:
-</p>
-
-<div class="doc_code">
-<pre>
-[SETBID (#1), blockid]
-[DEFINE_ABBREV, ...]
-[BLOCKNAME, ...name...]
-[SETRECORDNAME, RecordID, ...name...]
-</pre>
-</div>
-
-<p>
-The <tt>SETBID</tt> record (code 1) indicates which block ID is being
-described. <tt>SETBID</tt> records can occur multiple times throughout the
-block to change which block ID is being described. There must be
-a <tt>SETBID</tt> record prior to any other records.
-</p>
-
-<p>
-Standard <tt>DEFINE_ABBREV</tt> records can occur inside <tt>BLOCKINFO</tt>
-blocks, but unlike their occurrence in normal blocks, the abbreviation is
-defined for blocks matching the block ID we are describing, <i>not</i> the
-<tt>BLOCKINFO</tt> block itself. The abbreviations defined
-in <tt>BLOCKINFO</tt> blocks receive abbreviation IDs as described
-in <tt><a href="#DEFINE_ABBREV">DEFINE_ABBREV</a></tt>.
-</p>
-
-<p>The <tt>BLOCKNAME</tt> record (code 2) can optionally occur in this block. The elements of
-the record are the bytes of the string name of the block. llvm-bcanalyzer can use
-this to dump out bitcode files symbolically.</p>
-
-<p>The <tt>SETRECORDNAME</tt> record (code 3) can also optionally occur in this block. The
-first operand value is a record ID number, and the rest of the elements of the record are
-the bytes for the string name of the record. llvm-bcanalyzer can use
-this to dump out bitcode files symbolically.</p>
-
-<p>
-Note that although the data in <tt>BLOCKINFO</tt> blocks is described as
-"metadata," the abbreviations they contain are essential for parsing records
-from the corresponding blocks. It is not safe to skip them.
-</p>
-
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="wrapper">Bitcode Wrapper Format</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>
-Bitcode files for LLVM IR may optionally be wrapped in a simple wrapper
-structure. This structure contains a simple header that indicates the offset
-and size of the embedded BC file. This allows additional information to be
-stored alongside the BC file. The structure of this file header is:
-</p>
-
-<div class="doc_code">
-<p>
-<tt>[Magic<sub>32</sub>, Version<sub>32</sub>, Offset<sub>32</sub>,
-Size<sub>32</sub>, CPUType<sub>32</sub>]</tt>
-</p>
-</div>
-
-<p>
-Each of the fields are 32-bit fields stored in little endian form (as with
-the rest of the bitcode file fields). The Magic number is always
-<tt>0x0B17C0DE</tt> and the version is currently always <tt>0</tt>. The Offset
-field is the offset in bytes to the start of the bitcode stream in the file, and
-the Size field is the size in bytes of the stream. CPUType is a target-specific
-value that can be used to encode the CPU of the target.
-</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="llvmir">LLVM IR Encoding</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>
-LLVM IR is encoded into a bitstream by defining blocks and records. It uses
-blocks for things like constant pools, functions, symbol tables, etc. It uses
-records for things like instructions, global variable descriptors, type
-descriptions, etc. This document does not describe the set of abbreviations
-that the writer uses, as these are fully self-described in the file, and the
-reader is not allowed to build in any knowledge of this.
-</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="basics">Basics</a>
-</h3>
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="ir_magic">LLVM IR Magic Number</a></h4>
-
-<div>
-
-<p>
-The magic number for LLVM IR files is:
-</p>
-
-<div class="doc_code">
-<p>
-<tt>[0x0<sub>4</sub>, 0xC<sub>4</sub>, 0xE<sub>4</sub>, 0xD<sub>4</sub>]</tt>
-</p>
-</div>
-
-<p>
-When combined with the bitcode magic number and viewed as bytes, this is
-<tt>"BC&nbsp;0xC0DE"</tt>.
-</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="ir_signed_vbr">Signed VBRs</a></h4>
-
-<div>
-
-<p>
-<a href="#variablewidth">Variable Width Integer</a> encoding is an efficient way to
-encode arbitrary sized unsigned values, but is an extremely inefficient for
-encoding signed values, as signed values are otherwise treated as maximally large
-unsigned values.
-</p>
-
-<p>
-As such, signed VBR values of a specific width are emitted as follows:
-</p>
-
-<ul>
-<li>Positive values are emitted as VBRs of the specified width, but with their
- value shifted left by one.</li>
-<li>Negative values are emitted as VBRs of the specified width, but the negated
- value is shifted left by one, and the low bit is set.</li>
-</ul>
-
-<p>
-With this encoding, small positive and small negative values can both
-be emitted efficiently. Signed VBR encoding is used in
-<tt>CST_CODE_INTEGER</tt> and <tt>CST_CODE_WIDE_INTEGER</tt> records
-within <tt>CONSTANTS_BLOCK</tt> blocks.
-</p>
-
-</div>
-
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="ir_blocks">LLVM IR Blocks</a></h4>
-
-<div>
-
-<p>
-LLVM IR is defined with the following blocks:
-</p>
-
-<ul>
-<li>8 &mdash; <a href="#MODULE_BLOCK"><tt>MODULE_BLOCK</tt></a> &mdash; This is the top-level block that
- contains the entire module, and describes a variety of per-module
- information.</li>
-<li>9 &mdash; <a href="#PARAMATTR_BLOCK"><tt>PARAMATTR_BLOCK</tt></a> &mdash; This enumerates the parameter
- attributes.</li>
-<li>10 &mdash; <a href="#TYPE_BLOCK"><tt>TYPE_BLOCK</tt></a> &mdash; This describes all of the types in
- the module.</li>
-<li>11 &mdash; <a href="#CONSTANTS_BLOCK"><tt>CONSTANTS_BLOCK</tt></a> &mdash; This describes constants for a
- module or function.</li>
-<li>12 &mdash; <a href="#FUNCTION_BLOCK"><tt>FUNCTION_BLOCK</tt></a> &mdash; This describes a function
- body.</li>
-<li>13 &mdash; <a href="#TYPE_SYMTAB_BLOCK"><tt>TYPE_SYMTAB_BLOCK</tt></a> &mdash; This describes the type symbol
- table.</li>
-<li>14 &mdash; <a href="#VALUE_SYMTAB_BLOCK"><tt>VALUE_SYMTAB_BLOCK</tt></a> &mdash; This describes a value symbol
- table.</li>
-<li>15 &mdash; <a href="#METADATA_BLOCK"><tt>METADATA_BLOCK</tt></a> &mdash; This describes metadata items.</li>
-<li>16 &mdash; <a href="#METADATA_ATTACHMENT"><tt>METADATA_ATTACHMENT</tt></a> &mdash; This contains records associating metadata with function instruction values.</li>
-</ul>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="MODULE_BLOCK">MODULE_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>MODULE_BLOCK</tt> block (id 8) is the top-level block for LLVM
-bitcode files, and each bitcode file must contain exactly one. In
-addition to records (described below) containing information
-about the module, a <tt>MODULE_BLOCK</tt> block may contain the
-following sub-blocks:
-</p>
-
-<ul>
-<li><a href="#BLOCKINFO"><tt>BLOCKINFO</tt></a></li>
-<li><a href="#PARAMATTR_BLOCK"><tt>PARAMATTR_BLOCK</tt></a></li>
-<li><a href="#TYPE_BLOCK"><tt>TYPE_BLOCK</tt></a></li>
-<li><a href="#TYPE_SYMTAB_BLOCK"><tt>TYPE_SYMTAB_BLOCK</tt></a></li>
-<li><a href="#VALUE_SYMTAB_BLOCK"><tt>VALUE_SYMTAB_BLOCK</tt></a></li>
-<li><a href="#CONSTANTS_BLOCK"><tt>CONSTANTS_BLOCK</tt></a></li>
-<li><a href="#FUNCTION_BLOCK"><tt>FUNCTION_BLOCK</tt></a></li>
-<li><a href="#METADATA_BLOCK"><tt>METADATA_BLOCK</tt></a></li>
-</ul>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_VERSION">MODULE_CODE_VERSION Record</a></h4>
-
-<div>
-
-<p><tt>[VERSION, version#]</tt></p>
-
-<p>The <tt>VERSION</tt> record (code 1) contains a single value
-indicating the format version. Only version 0 is supported at this
-time.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_TRIPLE">MODULE_CODE_TRIPLE Record</a></h4>
-
-<div>
-<p><tt>[TRIPLE, ...string...]</tt></p>
-
-<p>The <tt>TRIPLE</tt> record (code 2) contains a variable number of
-values representing the bytes of the <tt>target triple</tt>
-specification string.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_DATALAYOUT">MODULE_CODE_DATALAYOUT Record</a></h4>
-
-<div>
-<p><tt>[DATALAYOUT, ...string...]</tt></p>
-
-<p>The <tt>DATALAYOUT</tt> record (code 3) contains a variable number of
-values representing the bytes of the <tt>target datalayout</tt>
-specification string.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_ASM">MODULE_CODE_ASM Record</a></h4>
-
-<div>
-<p><tt>[ASM, ...string...]</tt></p>
-
-<p>The <tt>ASM</tt> record (code 4) contains a variable number of
-values representing the bytes of <tt>module asm</tt> strings, with
-individual assembly blocks separated by newline (ASCII 10) characters.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_SECTIONNAME">MODULE_CODE_SECTIONNAME Record</a></h4>
-
-<div>
-<p><tt>[SECTIONNAME, ...string...]</tt></p>
-
-<p>The <tt>SECTIONNAME</tt> record (code 5) contains a variable number
-of values representing the bytes of a single section name
-string. There should be one <tt>SECTIONNAME</tt> record for each
-section name referenced (e.g., in global variable or function
-<tt>section</tt> attributes) within the module. These records can be
-referenced by the 1-based index in the <i>section</i> fields of
-<tt>GLOBALVAR</tt> or <tt>FUNCTION</tt> records.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_DEPLIB">MODULE_CODE_DEPLIB Record</a></h4>
-
-<div>
-<p><tt>[DEPLIB, ...string...]</tt></p>
-
-<p>The <tt>DEPLIB</tt> record (code 6) contains a variable number of
-values representing the bytes of a single dependent library name
-string, one of the libraries mentioned in a <tt>deplibs</tt>
-declaration. There should be one <tt>DEPLIB</tt> record for each
-library name referenced.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_GLOBALVAR">MODULE_CODE_GLOBALVAR Record</a></h4>
-
-<div>
-<p><tt>[GLOBALVAR, pointer type, isconst, initid, linkage, alignment, section, visibility, threadlocal]</tt></p>
-
-<p>The <tt>GLOBALVAR</tt> record (code 7) marks the declaration or
-definition of a global variable. The operand fields are:</p>
-
-<ul>
-<li><i>pointer type</i>: The type index of the pointer type used to point to
-this global variable</li>
-
-<li><i>isconst</i>: Non-zero if the variable is treated as constant within
-the module, or zero if it is not</li>
-
-<li><i>initid</i>: If non-zero, the value index of the initializer for this
-variable, plus 1.</li>
-
-<li><a name="linkage"><i>linkage</i></a>: An encoding of the linkage
-type for this variable:
- <ul>
- <li><tt>external</tt>: code 0</li>
- <li><tt>weak</tt>: code 1</li>
- <li><tt>appending</tt>: code 2</li>
- <li><tt>internal</tt>: code 3</li>
- <li><tt>linkonce</tt>: code 4</li>
- <li><tt>dllimport</tt>: code 5</li>
- <li><tt>dllexport</tt>: code 6</li>
- <li><tt>extern_weak</tt>: code 7</li>
- <li><tt>common</tt>: code 8</li>
- <li><tt>private</tt>: code 9</li>
- <li><tt>weak_odr</tt>: code 10</li>
- <li><tt>linkonce_odr</tt>: code 11</li>
- <li><tt>available_externally</tt>: code 12</li>
- <li><tt>linker_private</tt>: code 13</li>
- </ul>
-</li>
-
-<li><i>alignment</i>: The logarithm base 2 of the variable's requested
-alignment, plus 1</li>
-
-<li><i>section</i>: If non-zero, the 1-based section index in the
-table of <a href="#MODULE_CODE_SECTIONNAME">MODULE_CODE_SECTIONNAME</a>
-entries.</li>
-
-<li><a name="visibility"><i>visibility</i></a>: If present, an
-encoding of the visibility of this variable:
- <ul>
- <li><tt>default</tt>: code 0</li>
- <li><tt>hidden</tt>: code 1</li>
- <li><tt>protected</tt>: code 2</li>
- </ul>
-</li>
-
-<li><i>threadlocal</i>: If present and non-zero, indicates that the variable
-is <tt>thread_local</tt></li>
-
-<li><i>unnamed_addr</i>: If present and non-zero, indicates that the variable
-has <tt>unnamed_addr</tt></li>
-
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_FUNCTION">MODULE_CODE_FUNCTION Record</a></h4>
-
-<div>
-
-<p><tt>[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc]</tt></p>
-
-<p>The <tt>FUNCTION</tt> record (code 8) marks the declaration or
-definition of a function. The operand fields are:</p>
-
-<ul>
-<li><i>type</i>: The type index of the function type describing this function</li>
-
-<li><i>callingconv</i>: The calling convention number:
- <ul>
- <li><tt>ccc</tt>: code 0</li>
- <li><tt>fastcc</tt>: code 8</li>
- <li><tt>coldcc</tt>: code 9</li>
- <li><tt>x86_stdcallcc</tt>: code 64</li>
- <li><tt>x86_fastcallcc</tt>: code 65</li>
- <li><tt>arm_apcscc</tt>: code 66</li>
- <li><tt>arm_aapcscc</tt>: code 67</li>
- <li><tt>arm_aapcs_vfpcc</tt>: code 68</li>
- </ul>
-</li>
-
-<li><i>isproto</i>: Non-zero if this entry represents a declaration
-rather than a definition</li>
-
-<li><i>linkage</i>: An encoding of the <a href="#linkage">linkage type</a>
-for this function</li>
-
-<li><i>paramattr</i>: If nonzero, the 1-based parameter attribute index
-into the table of <a href="#PARAMATTR_CODE_ENTRY">PARAMATTR_CODE_ENTRY</a>
-entries.</li>
-
-<li><i>alignment</i>: The logarithm base 2 of the function's requested
-alignment, plus 1</li>
-
-<li><i>section</i>: If non-zero, the 1-based section index in the
-table of <a href="#MODULE_CODE_SECTIONNAME">MODULE_CODE_SECTIONNAME</a>
-entries.</li>
-
-<li><i>visibility</i>: An encoding of the <a href="#visibility">visibility</a>
- of this function</li>
-
-<li><i>gc</i>: If present and nonzero, the 1-based garbage collector
-index in the table of
-<a href="#MODULE_CODE_GCNAME">MODULE_CODE_GCNAME</a> entries.</li>
-
-<li><i>unnamed_addr</i>: If present and non-zero, indicates that the function
-has <tt>unnamed_addr</tt></li>
-
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_ALIAS">MODULE_CODE_ALIAS Record</a></h4>
-
-<div>
-
-<p><tt>[ALIAS, alias type, aliasee val#, linkage, visibility]</tt></p>
-
-<p>The <tt>ALIAS</tt> record (code 9) marks the definition of an
-alias. The operand fields are</p>
-
-<ul>
-<li><i>alias type</i>: The type index of the alias</li>
-
-<li><i>aliasee val#</i>: The value index of the aliased value</li>
-
-<li><i>linkage</i>: An encoding of the <a href="#linkage">linkage type</a>
-for this alias</li>
-
-<li><i>visibility</i>: If present, an encoding of the
-<a href="#visibility">visibility</a> of the alias</li>
-
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_PURGEVALS">MODULE_CODE_PURGEVALS Record</a></h4>
-
-<div>
-<p><tt>[PURGEVALS, numvals]</tt></p>
-
-<p>The <tt>PURGEVALS</tt> record (code 10) resets the module-level
-value list to the size given by the single operand value. Module-level
-value list items are added by <tt>GLOBALVAR</tt>, <tt>FUNCTION</tt>,
-and <tt>ALIAS</tt> records. After a <tt>PURGEVALS</tt> record is seen,
-new value indices will start from the given <i>numvals</i> value.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="MODULE_CODE_GCNAME">MODULE_CODE_GCNAME Record</a></h4>
-
-<div>
-<p><tt>[GCNAME, ...string...]</tt></p>
-
-<p>The <tt>GCNAME</tt> record (code 11) contains a variable number of
-values representing the bytes of a single garbage collector name
-string. There should be one <tt>GCNAME</tt> record for each garbage
-collector name referenced in function <tt>gc</tt> attributes within
-the module. These records can be referenced by 1-based index in the <i>gc</i>
-fields of <tt>FUNCTION</tt> records.</p>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="PARAMATTR_BLOCK">PARAMATTR_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>PARAMATTR_BLOCK</tt> block (id 9) contains a table of
-entries describing the attributes of function parameters. These
-entries are referenced by 1-based index in the <i>paramattr</i> field
-of module block <a name="MODULE_CODE_FUNCTION"><tt>FUNCTION</tt></a>
-records, or within the <i>attr</i> field of function block <a
-href="#FUNC_CODE_INST_INVOKE"><tt>INST_INVOKE</tt></a> and <a
-href="#FUNC_CODE_INST_CALL"><tt>INST_CALL</tt></a> records.</p>
-
-<p>Entries within <tt>PARAMATTR_BLOCK</tt> are constructed to ensure
-that each is unique (i.e., no two indicies represent equivalent
-attribute lists). </p>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="PARAMATTR_CODE_ENTRY">PARAMATTR_CODE_ENTRY Record</a></h4>
-
-<div>
-
-<p><tt>[ENTRY, paramidx0, attr0, paramidx1, attr1...]</tt></p>
-
-<p>The <tt>ENTRY</tt> record (code 1) contains an even number of
-values describing a unique set of function parameter attributes. Each
-<i>paramidx</i> value indicates which set of attributes is
-represented, with 0 representing the return value attributes,
-0xFFFFFFFF representing function attributes, and other values
-representing 1-based function parameters. Each <i>attr</i> value is a
-bitmap with the following interpretation:
-</p>
-
-<ul>
-<li>bit 0: <tt>zeroext</tt></li>
-<li>bit 1: <tt>signext</tt></li>
-<li>bit 2: <tt>noreturn</tt></li>
-<li>bit 3: <tt>inreg</tt></li>
-<li>bit 4: <tt>sret</tt></li>
-<li>bit 5: <tt>nounwind</tt></li>
-<li>bit 6: <tt>noalias</tt></li>
-<li>bit 7: <tt>byval</tt></li>
-<li>bit 8: <tt>nest</tt></li>
-<li>bit 9: <tt>readnone</tt></li>
-<li>bit 10: <tt>readonly</tt></li>
-<li>bit 11: <tt>noinline</tt></li>
-<li>bit 12: <tt>alwaysinline</tt></li>
-<li>bit 13: <tt>optsize</tt></li>
-<li>bit 14: <tt>ssp</tt></li>
-<li>bit 15: <tt>sspreq</tt></li>
-<li>bits 16&ndash;31: <tt>align <var>n</var></tt></li>
-<li>bit 32: <tt>nocapture</tt></li>
-<li>bit 33: <tt>noredzone</tt></li>
-<li>bit 34: <tt>noimplicitfloat</tt></li>
-<li>bit 35: <tt>naked</tt></li>
-<li>bit 36: <tt>inlinehint</tt></li>
-<li>bits 37&ndash;39: <tt>alignstack <var>n</var></tt>, represented as
-the logarithm base 2 of the requested alignment, plus 1</li>
-</ul>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="TYPE_BLOCK">TYPE_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>TYPE_BLOCK</tt> block (id 10) contains records which
-constitute a table of type operator entries used to represent types
-referenced within an LLVM module. Each record (with the exception of
-<a href="#TYPE_CODE_NUMENTRY"><tt>NUMENTRY</tt></a>) generates a
-single type table entry, which may be referenced by 0-based index from
-instructions, constants, metadata, type symbol table entries, or other
-type operator records.
-</p>
-
-<p>Entries within <tt>TYPE_BLOCK</tt> are constructed to ensure that
-each entry is unique (i.e., no two indicies represent structurally
-equivalent types). </p>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_NUMENTRY">TYPE_CODE_NUMENTRY Record</a></h4>
-
-<div>
-
-<p><tt>[NUMENTRY, numentries]</tt></p>
-
-<p>The <tt>NUMENTRY</tt> record (code 1) contains a single value which
-indicates the total number of type code entries in the type table of
-the module. If present, <tt>NUMENTRY</tt> should be the first record
-in the block.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_VOID">TYPE_CODE_VOID Record</a></h4>
-
-<div>
-
-<p><tt>[VOID]</tt></p>
-
-<p>The <tt>VOID</tt> record (code 2) adds a <tt>void</tt> type to the
-type table.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_FLOAT">TYPE_CODE_FLOAT Record</a></h4>
-
-<div>
-
-<p><tt>[FLOAT]</tt></p>
-
-<p>The <tt>FLOAT</tt> record (code 3) adds a <tt>float</tt> (32-bit
-floating point) type to the type table.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_DOUBLE">TYPE_CODE_DOUBLE Record</a></h4>
-
-<div>
-
-<p><tt>[DOUBLE]</tt></p>
-
-<p>The <tt>DOUBLE</tt> record (code 4) adds a <tt>double</tt> (64-bit
-floating point) type to the type table.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_LABEL">TYPE_CODE_LABEL Record</a></h4>
-
-<div>
-
-<p><tt>[LABEL]</tt></p>
-
-<p>The <tt>LABEL</tt> record (code 5) adds a <tt>label</tt> type to
-the type table.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_OPAQUE">TYPE_CODE_OPAQUE Record</a></h4>
-
-<div>
-
-<p><tt>[OPAQUE]</tt></p>
-
-<p>The <tt>OPAQUE</tt> record (code 6) adds an <tt>opaque</tt> type to
-the type table. Note that distinct <tt>opaque</tt> types are not
-unified.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_INTEGER">TYPE_CODE_INTEGER Record</a></h4>
-
-<div>
-
-<p><tt>[INTEGER, width]</tt></p>
-
-<p>The <tt>INTEGER</tt> record (code 7) adds an integer type to the
-type table. The single <i>width</i> field indicates the width of the
-integer type.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_POINTER">TYPE_CODE_POINTER Record</a></h4>
-
-<div>
-
-<p><tt>[POINTER, pointee type, address space]</tt></p>
-
-<p>The <tt>POINTER</tt> record (code 8) adds a pointer type to the
-type table. The operand fields are</p>
-
-<ul>
-<li><i>pointee type</i>: The type index of the pointed-to type</li>
-
-<li><i>address space</i>: If supplied, the target-specific numbered
-address space where the pointed-to object resides. Otherwise, the
-default address space is zero.
-</li>
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_FUNCTION">TYPE_CODE_FUNCTION Record</a></h4>
-
-<div>
-
-<p><tt>[FUNCTION, vararg, ignored, retty, ...paramty... ]</tt></p>
-
-<p>The <tt>FUNCTION</tt> record (code 9) adds a function type to the
-type table. The operand fields are</p>
-
-<ul>
-<li><i>vararg</i>: Non-zero if the type represents a varargs function</li>
-
-<li><i>ignored</i>: This value field is present for backward
-compatibility only, and is ignored</li>
-
-<li><i>retty</i>: The type index of the function's return type</li>
-
-<li><i>paramty</i>: Zero or more type indices representing the
-parameter types of the function</li>
-</ul>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_STRUCT">TYPE_CODE_STRUCT Record</a></h4>
-
-<div>
-
-<p><tt>[STRUCT, ispacked, ...eltty...]</tt></p>
-
-<p>The <tt>STRUCT </tt> record (code 10) adds a struct type to the
-type table. The operand fields are</p>
-
-<ul>
-<li><i>ispacked</i>: Non-zero if the type represents a packed structure</li>
-
-<li><i>eltty</i>: Zero or more type indices representing the element
-types of the structure</li>
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_ARRAY">TYPE_CODE_ARRAY Record</a></h4>
-
-<div>
-
-<p><tt>[ARRAY, numelts, eltty]</tt></p>
-
-<p>The <tt>ARRAY</tt> record (code 11) adds an array type to the type
-table. The operand fields are</p>
-
-<ul>
-<li><i>numelts</i>: The number of elements in arrays of this type</li>
-
-<li><i>eltty</i>: The type index of the array element type</li>
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_VECTOR">TYPE_CODE_VECTOR Record</a></h4>
-
-<div>
-
-<p><tt>[VECTOR, numelts, eltty]</tt></p>
-
-<p>The <tt>VECTOR</tt> record (code 12) adds a vector type to the type
-table. The operand fields are</p>
-
-<ul>
-<li><i>numelts</i>: The number of elements in vectors of this type</li>
-
-<li><i>eltty</i>: The type index of the vector element type</li>
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_X86_FP80">TYPE_CODE_X86_FP80 Record</a></h4>
-
-<div>
-
-<p><tt>[X86_FP80]</tt></p>
-
-<p>The <tt>X86_FP80</tt> record (code 13) adds an <tt>x86_fp80</tt> (80-bit
-floating point) type to the type table.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_FP128">TYPE_CODE_FP128 Record</a></h4>
-
-<div>
-
-<p><tt>[FP128]</tt></p>
-
-<p>The <tt>FP128</tt> record (code 14) adds an <tt>fp128</tt> (128-bit
-floating point) type to the type table.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_PPC_FP128">TYPE_CODE_PPC_FP128 Record</a></h4>
-
-<div>
-
-<p><tt>[PPC_FP128]</tt></p>
-
-<p>The <tt>PPC_FP128</tt> record (code 15) adds a <tt>ppc_fp128</tt>
-(128-bit floating point) type to the type table.
-</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TYPE_CODE_METADATA">TYPE_CODE_METADATA Record</a></h4>
-
-<div>
-
-<p><tt>[METADATA]</tt></p>
-
-<p>The <tt>METADATA</tt> record (code 16) adds a <tt>metadata</tt>
-type to the type table.
-</p>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="CONSTANTS_BLOCK">CONSTANTS_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>CONSTANTS_BLOCK</tt> block (id 11) ...
-</p>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="FUNCTION_BLOCK">FUNCTION_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>FUNCTION_BLOCK</tt> block (id 12) ...
-</p>
-
-<p>In addition to the record types described below, a
-<tt>FUNCTION_BLOCK</tt> block may contain the following sub-blocks:
-</p>
-
-<ul>
-<li><a href="#CONSTANTS_BLOCK"><tt>CONSTANTS_BLOCK</tt></a></li>
-<li><a href="#VALUE_SYMTAB_BLOCK"><tt>VALUE_SYMTAB_BLOCK</tt></a></li>
-<li><a href="#METADATA_ATTACHMENT"><tt>METADATA_ATTACHMENT</tt></a></li>
-</ul>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="TYPE_SYMTAB_BLOCK">TYPE_SYMTAB_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>TYPE_SYMTAB_BLOCK</tt> block (id 13) contains entries which
-map between module-level named types and their corresponding type
-indices.
-</p>
-
-<!-- _______________________________________________________________________ -->
-<h4><a name="TST_CODE_ENTRY">TST_CODE_ENTRY Record</a></h4>
-
-<div>
-
-<p><tt>[ENTRY, typeid, ...string...]</tt></p>
-
-<p>The <tt>ENTRY</tt> record (code 1) contains a variable number of
-values, with the first giving the type index of the designated type,
-and the remaining values giving the character codes of the type
-name. Each entry corresponds to a single named type.
-</p>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="VALUE_SYMTAB_BLOCK">VALUE_SYMTAB_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>VALUE_SYMTAB_BLOCK</tt> block (id 14) ...
-</p>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="METADATA_BLOCK">METADATA_BLOCK Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>METADATA_BLOCK</tt> block (id 15) ...
-</p>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="METADATA_ATTACHMENT">METADATA_ATTACHMENT Contents</a>
-</h3>
-
-<div>
-
-<p>The <tt>METADATA_ATTACHMENT</tt> block (id 16) ...
-</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address> <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
-<a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
-Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
-</address>
-</body>
-</html>
diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst
new file mode 100644
index 0000000..d3995e7
--- /dev/null
+++ b/docs/BitCodeFormat.rst
@@ -0,0 +1,1045 @@
+.. _bitcode_format:
+
+.. role:: raw-html(raw)
+ :format: html
+
+========================
+LLVM Bitcode File Format
+========================
+
+.. contents::
+ :local:
+
+Abstract
+========
+
+This document describes the LLVM bitstream file format and the encoding of the
+LLVM IR into it.
+
+Overview
+========
+
+What is commonly known as the LLVM bitcode file format (also, sometimes
+anachronistically known as bytecode) is actually two things: a `bitstream
+container format`_ and an `encoding of LLVM IR`_ into the container format.
+
+The bitstream format is an abstract encoding of structured data, very similar to
+XML in some ways. Like XML, bitstream files contain tags, and nested
+structures, and you can parse the file without having to understand the tags.
+Unlike XML, the bitstream format is a binary encoding, and unlike XML it
+provides a mechanism for the file to self-describe "abbreviations", which are
+effectively size optimizations for the content.
+
+LLVM IR files may optionally be embedded into a `wrapper`_ structure that makes
+it easy to store extra data along with them.
+
+This document first describes the LLVM bitstream format, then the wrapper
+format, and finally the record structure used by LLVM IR files.
+
+.. _bitstream container format:
+
+Bitstream Format
+================
+
+The bitstream format is literally a stream of bits, with a very simple
+structure. This structure consists of the following concepts:
+
+* A "`magic number`_" that identifies the contents of the stream.
+
+* Encoding `primitives`_ like variable bit-rate integers.
+
+* `Blocks`_, which define nested content.
+
+* `Data Records`_, which describe entities within the file.
+
+* Abbreviations, which specify compression optimizations for the file.
+
+Note that the `llvm-bcanalyzer <CommandGuide/html/llvm-bcanalyzer.html>`_ tool
+can be used to dump and inspect arbitrary bitstreams, which is very useful for
+understanding the encoding.
+
+.. _magic number:
+
+Magic Numbers
+-------------
+
+The first two bytes of a bitcode file are 'BC' (``0x42``, ``0x43``). The second
+two bytes are an application-specific magic number. Generic bitcode tools can
+look at only the first two bytes to verify the file is bitcode, while
+application-specific programs will want to look at all four.
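+
+For illustration, a file can be classified by these leading bytes alone. The
+following sketch is not part of any LLVM tool; it simply applies the layout
+described above (the ``0xC0 0xDE`` application magic used by LLVM IR is
+covered later in this document):
+
+::
+
+  def sniff_bitcode(path):
+      """Minimal sketch: classify a file by its leading magic bytes."""
+      with open(path, "rb") as f:
+          head = f.read(4)
+      if head[:2] != b"BC":
+          return "not bitcode"
+      # 'BC' followed by 0xC0 0xDE is the LLVM IR application magic.
+      return "LLVM IR bitcode" if head[2:4] == b"\xc0\xde" else "other bitcode"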
+
+.. _primitives:
+
+Primitives
+----------
+
+A bitstream literally consists of a stream of bits, which are read in order
+starting with the least significant bit of each byte. The stream is made up of
+a number of primitive values that encode a stream of unsigned integer values.
+These integers are encoded in two ways: either as `Fixed Width Integers`_ or as
+`Variable Width Integers`_.
+
+.. _Fixed Width Integers:
+.. _fixed-width value:
+
+Fixed Width Integers
+^^^^^^^^^^^^^^^^^^^^
+
+Fixed-width integer values have their low bits emitted directly to the file.
+For example, a 3-bit integer value encodes 1 as 001. Fixed-width integers are
+used when the set of possible values for a field is known in advance. For
+example, boolean values are usually encoded with a 1-bit wide integer.
+
+.. _Variable Width Integers:
+.. _Variable Width Integer:
+.. _variable-width value:
+
+Variable Width Integers
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Variable-width integer (VBR) values encode values of arbitrary size, optimizing
+for the case where the values are small. Given an N-bit VBR field, any value of
+at most N-1 bits (0 through 7 for a vbr4 field) is encoded directly, with the
+high bit set to zero. Larger values emit their bits in a series of N-1 bit
+chunks, where all but the last chunk set the high bit.
+
+For example, the value 27 (0x1B) is encoded as 1011 0011 when emitted as a vbr4
+value. The first set of four bits indicates the value 3 (011) with a
+continuation piece (indicated by a high bit of 1). The next word indicates a
+value of 24 (011 << 3) with no continuation. The sum (3+24) yields the value
+27.
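+
+To make the fixed and variable width primitives concrete, here is a minimal
+LSB-first bit reader sketch (purely illustrative, not LLVM's implementation)
+that decodes both kinds of field from a byte string:
+
+::
+
+  class BitReader:
+      def __init__(self, data):
+          self.data = data
+          self.pos = 0  # current position, as a bit offset
+
+      def read_fixed(self, width):
+          # Read a fixed-width integer, least significant bit first.
+          value = 0
+          for i in range(width):
+              bit = (self.data[self.pos >> 3] >> (self.pos & 7)) & 1
+              value |= bit << i
+              self.pos += 1
+          return value
+
+      def read_vbr(self, width):
+          # Read chunks of `width` bits; the high bit of each chunk is the
+          # continuation flag, the low width-1 bits carry value bits.
+          value, shift = 0, 0
+          while True:
+              piece = self.read_fixed(width)
+              value |= (piece & ((1 << (width - 1)) - 1)) << shift
+              if not (piece >> (width - 1)):
+                  return value
+              shift += width - 1
+
+  # The vbr4 example above, packed LSB-first into a single byte:
+  assert BitReader(b"\x3b").read_vbr(4) == 27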
+
+.. _char6-encoded value:
+
+6-bit characters
+^^^^^^^^^^^^^^^^
+
+6-bit characters encode common characters into a fixed 6-bit field. They
+represent the following characters with the following 6-bit values:
+
+::
+
+ 'a' .. 'z' --- 0 .. 25
+ 'A' .. 'Z' --- 26 .. 51
+ '0' .. '9' --- 52 .. 61
+ '.' --- 62
+ '_' --- 63
+
+This encoding is only suitable for encoding characters and strings that consist
+only of the above characters. It is completely incapable of encoding characters
+not in the set.
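+
+A table-driven translation is enough to implement this encoding; the sketch
+below assumes only the character set shown above:
+
+::
+
+  CHAR6 = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._"
+
+  def char6_encode(s):
+      # Raises ValueError for any character outside the char6 set.
+      return [CHAR6.index(c) for c in s]
+
+  def char6_decode(codes):
+      return "".join(CHAR6[v] for v in codes)
+
+  assert char6_encode("abcd") == [0, 1, 2, 3]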
+
+Word Alignment
+^^^^^^^^^^^^^^
+
+Occasionally, it is useful to emit zero bits until the bitstream is a multiple
+of 32 bits. This ensures that the bit position in the stream lands on a 32-bit
+word boundary.
+
+Abbreviation IDs
+----------------
+
+A bitstream is a sequential series of `Blocks`_ and `Data Records`_. Both of
+these start with an abbreviation ID encoded as a fixed-bitwidth field. The
+width is specified by the current block, as described below. The value of the
+abbreviation ID specifies either a builtin ID (which has a special meaning,
+defined below) or one of the abbreviation IDs defined for the current block by
+the stream itself.
+
+The set of builtin abbrev IDs is:
+
+* 0 - `END_BLOCK`_ --- This abbrev ID marks the end of the current block.
+
+* 1 - `ENTER_SUBBLOCK`_ --- This abbrev ID marks the beginning of a new
+ block.
+
+* 2 - `DEFINE_ABBREV`_ --- This defines a new abbreviation.
+
+* 3 - `UNABBREV_RECORD`_ --- This ID specifies the definition of an
+ unabbreviated record.
+
+Abbreviation IDs 4 and above are defined by the stream itself, and specify an
+`abbreviated record encoding`_.
+
+.. _Blocks:
+
+Blocks
+------
+
+Blocks in a bitstream denote nested regions of the stream, and are identified by
+a content-specific id number (for example, LLVM IR uses an ID of 12 to represent
+function bodies). Block IDs 0-7 are reserved for `standard blocks`_ whose
+meaning is defined by Bitcode; block IDs 8 and greater are application
+specific. Nested blocks capture the hierarchical structure of the data encoded
+within them, and various properties are associated with blocks as the file is
+parsed. Block definitions allow the reader to skip blocks in constant time,
+either to gather a summary of the file or to pass over data it does not
+understand. The LLVM IR reader uses this mechanism to skip function
+bodies, lazily reading them on demand.
+
+When reading and encoding the stream, several properties are maintained for the
+block. In particular, each block maintains:
+
+#. A current abbrev id width. This value starts at 2 at the beginning of the
+ stream, and is set every time a block record is entered. The block entry
+ specifies the abbrev id width for the body of the block.
+
+#. A set of abbreviations. Abbreviations may be defined within a block, in
+ which case they are only defined in that block (neither subblocks nor
+ enclosing blocks see the abbreviation). Abbreviations can also be defined
+ inside a `BLOCKINFO`_ block, in which case they are defined in all blocks
+ that match the ID that the ``BLOCKINFO`` block is describing.
+
+As sub-blocks are entered, these properties are saved and the new sub-block has
+its own set of abbreviations, and its own abbrev id width. When a sub-block is
+popped, the saved values are restored.
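+
+A reader typically models this save/restore behavior with a stack; a sketch,
+under the same illustrative assumptions as the other snippets in this section:
+
+::
+
+  class BlockState:
+      def __init__(self, abbrev_width, abbrevs):
+          self.abbrev_width = abbrev_width
+          self.abbrevs = abbrevs          # block-local abbreviation list
+
+  stack = [BlockState(2, [])]             # abbrev ID width starts at 2
+
+  def enter_block(new_width, blockinfo_abbrevs):
+      # BLOCKINFO-defined abbreviations come first, then local definitions.
+      stack.append(BlockState(new_width, list(blockinfo_abbrevs)))
+
+  def end_block():
+      stack.pop()                         # restores the enclosing state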
+
+.. _ENTER_SUBBLOCK:
+
+ENTER_SUBBLOCK Encoding
+^^^^^^^^^^^^^^^^^^^^^^^
+
+:raw-html:`<tt>`
+[ENTER_SUBBLOCK, blockid\ :sub:`vbr8`, newabbrevlen\ :sub:`vbr4`, <align32bits>, blocklen_32]
+:raw-html:`</tt>`
+
+The ``ENTER_SUBBLOCK`` abbreviation ID specifies the start of a new block
+record. The ``blockid`` value is encoded as an 8-bit VBR identifier, and
+indicates the type of block being entered, which can be a `standard block`_ or
+an application-specific block. The ``newabbrevlen`` value is a 4-bit VBR, which
+specifies the abbrev id width for the sub-block. The ``blocklen`` value is a
+32-bit aligned value that specifies the size of the subblock in 32-bit
+words. This value allows the reader to skip over the entire block in one jump.
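+
+Continuing the hypothetical ``BitReader`` sketch from the primitives section,
+skipping a just-entered sub-block might look like this:
+
+::
+
+  def skip_block(r):
+      # Assumes the ENTER_SUBBLOCK abbreviation ID has just been read.
+      r.read_vbr(8)                  # blockid
+      r.read_vbr(4)                  # newabbrevlen
+      r.pos = (r.pos + 31) & ~31     # <align32bits>
+      blocklen = r.read_fixed(32)    # block size in 32-bit words
+      r.pos += blocklen * 32         # jump over the entire block body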
+
+.. _END_BLOCK:
+
+END_BLOCK Encoding
+^^^^^^^^^^^^^^^^^^
+
+``[END_BLOCK, <align32bits>]``
+
+The ``END_BLOCK`` abbreviation ID specifies the end of the current block record.
+Its end is aligned to 32 bits to ensure that the size of the block is an even
+multiple of 32 bits.
+
+.. _Data Records:
+
+Data Records
+------------
+
+Data records consist of a record code and a number of integer values, each up
+to 64 bits wide. The interpretation of the code and values is application
+specific and may vary between different block types. Records can be encoded
+either using an
+unabbrev record, or with an abbreviation. In the LLVM IR format, for example,
+there is a record which encodes the target triple of a module. The code is
+``MODULE_CODE_TRIPLE``, and the values of the record are the ASCII codes for the
+characters in the string.
+
+.. _UNABBREV_RECORD:
+
+UNABBREV_RECORD Encoding
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+:raw-html:`<tt>`
+[UNABBREV_RECORD, code\ :sub:`vbr6`, numops\ :sub:`vbr6`, op0\ :sub:`vbr6`, op1\ :sub:`vbr6`, ...]
+:raw-html:`</tt>`
+
+An ``UNABBREV_RECORD`` provides a default fallback encoding, which is both
+completely general and extremely inefficient. It can describe an arbitrary
+record by emitting the code and operands as VBRs.
+
+For example, emitting an LLVM IR target triple as an unabbreviated record
+requires emitting the ``UNABBREV_RECORD`` abbrevid, a vbr6 for the
+``MODULE_CODE_TRIPLE`` code, a vbr6 for the length of the string, which is equal
+to the number of operands, and a vbr6 for each character. Because every ASCII
+letter has a value of 32 or greater, each letter must be emitted as at least a
+two-part VBR, so each letter requires at least 12 bits. This is not an
+efficient encoding, but it is fully general.
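+
+The cost is easy to compute. The sketch below counts the bits for a sample
+triple (the string is arbitrary; ``MODULE_CODE_TRIPLE`` is code 2, as
+described later in this document), ignoring the abbreviation ID itself:
+
+::
+
+  def vbr_bits(value, width):
+      # Number of bits needed to emit `value` as a VBR of the given width.
+      chunks = 1
+      while value >> (width - 1):
+          value >>= width - 1
+          chunks += 1
+      return chunks * width
+
+  MODULE_CODE_TRIPLE = 2
+  triple = b"x86_64-unknown-linux-gnu"
+  bits = vbr_bits(MODULE_CODE_TRIPLE, 6) + vbr_bits(len(triple), 6)
+  bits += sum(vbr_bits(c, 6) for c in triple)  # 12 bits per character
+  print(bits)  # 300 bits for this 24-character triple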
+
+.. _abbreviated record encoding:
+
+Abbreviated Record Encoding
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[<abbrevid>, fields...]``
+
+An abbreviated record is an abbreviation ID followed by a set of fields that are
+encoded according to the `abbreviation definition`_. This allows records to be
+encoded significantly more densely than records encoded with the
+`UNABBREV_RECORD`_ type, and allows the abbreviation types to be specified in
+the stream itself, which allows the files to be completely self-describing. The
+actual encoding of abbreviations is defined below.
+
+The record code, which is the first field of an abbreviated record, may be
+encoded in the abbreviation definition (as a literal operand) or supplied in the
+abbreviated record (as a Fixed or VBR operand value).
+
+.. _abbreviation definition:
+
+Abbreviations
+-------------
+
+Abbreviations are an important form of compression for bitstreams. The idea is
+to specify a dense encoding for a class of records once, then use that encoding
+to emit many records. It takes space to emit the encoding into the file, but
+the space is recouped (hopefully plus some) when the records that use it are
+emitted.
+
+Abbreviations can be determined dynamically per client, per file. Because the
+abbreviations are stored in the bitstream itself, different streams of the same
+format can contain different sets of abbreviations according to the needs of the
+specific stream. As a concrete example, LLVM IR files usually emit an
+abbreviation for binary operators. If a specific LLVM module contains no or
+few binary operators, the abbreviation need not be emitted.
+
+.. _DEFINE_ABBREV:
+
+DEFINE_ABBREV Encoding
+^^^^^^^^^^^^^^^^^^^^^^
+
+:raw-html:`<tt>`
+[DEFINE_ABBREV, numabbrevops\ :sub:`vbr5`, abbrevop0, abbrevop1, ...]
+:raw-html:`</tt>`
+
+A ``DEFINE_ABBREV`` record adds an abbreviation to the list of currently defined
+abbreviations in the scope of this block. This definition only exists inside
+this immediate block --- it is not visible in subblocks or enclosing blocks.
+Abbreviations are implicitly assigned IDs sequentially starting from 4 (the
+first application-defined abbreviation ID). Any abbreviations defined in a
+``BLOCKINFO`` record for the particular block type receive IDs first, in order,
+followed by any abbreviations defined within the block itself. Abbreviated data
+records reference this ID to indicate what abbreviation they are invoking.
+
+An abbreviation definition consists of the ``DEFINE_ABBREV`` abbrevid followed
+by a VBR that specifies the number of abbrev operands, then the abbrev operands
+themselves. Abbreviation operands come in three forms. They all start with a
+single bit that indicates whether the abbrev operand is a literal operand (when
+the bit is 1) or an encoding operand (when the bit is 0).
+
+#. Literal operands --- :raw-html:`<tt>` [1\ :sub:`1`, litvalue\
+ :sub:`vbr8`] :raw-html:`</tt>` --- Literal operands specify that the value in
+ the result is always a single specific value. This specific value is emitted
+ as a vbr8 after the bit indicating that it is a literal operand.
+
+#. Encoding info without data --- :raw-html:`<tt>` [0\ :sub:`1`, encoding\
+ :sub:`3`] :raw-html:`</tt>` --- Operand encodings that do not have extra data
+ are just emitted as their code.
+
+#. Encoding info with data --- :raw-html:`<tt>` [0\ :sub:`1`, encoding\
+ :sub:`3`, value\ :sub:`vbr5`] :raw-html:`</tt>` --- Operand encodings that do
+ have extra data are emitted as their code, followed by the extra data.
+
+The possible operand encodings are:
+
+* Fixed (code 1): The field should be emitted as a `fixed-width value`_, whose
+ width is specified by the operand's extra data.
+
+* VBR (code 2): The field should be emitted as a `variable-width value`_, whose
+ width is specified by the operand's extra data.
+
+* Array (code 3): This field is an array of values. The array operand has no
+ extra data, but expects another operand to follow it, indicating the element
+ type of the array. When reading an array in an abbreviated record, the first
+ integer is a vbr6 that indicates the array length, followed by the encoded
+ elements of the array. An array may only occur as the last operand of an
+ abbreviation (except for the one final operand that gives the array's
+ type).
+
+* Char6 (code 4): This field should be emitted as a `char6-encoded value`_.
+ This operand type takes no extra data. Char6 encoding is normally used as an
+ array element type.
+
+* Blob (code 5): This field is emitted as a vbr6, followed by padding to a
+ 32-bit boundary (for alignment) and an array of 8-bit objects. The array of
+ bytes is further followed by tail padding to ensure that its total length is a
+ multiple of 4 bytes. This makes it very efficient for the reader to decode
+ the data without having to make a copy of it: it can use a pointer to the data
+  in the mapped-in file and poke directly at it. A blob may only occur as the
+ last operand of an abbreviation.
+
+For example, target triples in LLVM modules are encoded as a record of the form
+``[TRIPLE, 'a', 'b', 'c', 'd']``. Consider if the bitstream emitted the
+following abbrev entry:
+
+::
+
+ [0, Fixed, 4]
+ [0, Array]
+ [0, Char6]
+
+When emitting a record with this abbreviation, the above entry would be emitted
+as:
+
+:raw-html:`<tt><blockquote>`
+[4\ :sub:`abbrevwidth`, 2\ :sub:`4`, 4\ :sub:`vbr6`, 0\ :sub:`6`, 1\ :sub:`6`, 2\ :sub:`6`, 3\ :sub:`6`]
+:raw-html:`</blockquote></tt>`
+
+These values are:
+
+#. The first value, 4, is the abbreviation ID for this abbreviation.
+
+#. The second value, 2, is the record code for ``TRIPLE`` records within LLVM IR
+ file ``MODULE_BLOCK`` blocks.
+
+#. The third value, 4, is the length of the array.
+
+#. The rest of the values are the char6 encoded values for ``"abcd"``.
+
+With this abbreviation, the triple is emitted with only 37 bits (assuming an
+abbrev id width of 3). Without the abbreviation, significantly more space would
+be required to emit the target triple. Also, because the ``TRIPLE`` value is
+not emitted as a literal in the abbreviation, the abbreviation can also be used
+for any other string value.
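+
+The 37-bit figure is simple arithmetic over the encoding described above,
+assuming an abbrev ID width of 3:
+
+::
+
+  abbrev_id = 3       # abbreviation ID, at the current abbrev ID width
+  code      = 4       # record code, emitted as the Fixed(4) operand
+  length    = 6       # array length 4, emitted as a single-chunk vbr6
+  elements  = 4 * 6   # four char6-encoded array elements
+  assert abbrev_id + code + length + elements == 37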
+
+.. _standard blocks:
+.. _standard block:
+
+Standard Blocks
+---------------
+
+In addition to the basic block structure and record encodings, the bitstream
+also defines specific built-in block types. These block types specify how the
+stream is to be decoded or other metadata. In the future, new standard blocks
+may be added. Block IDs 0-7 are reserved for standard blocks.
+
+.. _BLOCKINFO:
+
+#0 - BLOCKINFO Block
+^^^^^^^^^^^^^^^^^^^^
+
+The ``BLOCKINFO`` block allows the description of metadata for other blocks.
+The currently specified records are:
+
+::
+
+ [SETBID (#1), blockid]
+ [DEFINE_ABBREV, ...]
+ [BLOCKNAME, ...name...]
+ [SETRECORDNAME, RecordID, ...name...]
+
+The ``SETBID`` record (code 1) indicates which block ID is being described.
+``SETBID`` records can occur multiple times throughout the block to change which
+block ID is being described. There must be a ``SETBID`` record prior to any
+other records.
+
+Standard ``DEFINE_ABBREV`` records can occur inside ``BLOCKINFO`` blocks, but
+unlike their occurrence in normal blocks, the abbreviation is defined for blocks
+matching the block ID we are describing, *not* the ``BLOCKINFO`` block
+itself. The abbreviations defined in ``BLOCKINFO`` blocks receive abbreviation
+IDs as described in `DEFINE_ABBREV`_.
+
+The ``BLOCKNAME`` record (code 2) can optionally occur in this block. The
+elements of the record are the bytes of the string name of the block.
+llvm-bcanalyzer can use this to dump out bitcode files symbolically.
+
+The ``SETRECORDNAME`` record (code 3) can also optionally occur in this block.
+The first operand value is a record ID number, and the rest of the elements of
+the record are the bytes for the string name of the record. llvm-bcanalyzer can
+use this to dump out bitcode files symbolically.
+
+Note that although the data in ``BLOCKINFO`` blocks is described as "metadata,"
+the abbreviations they contain are essential for parsing records from the
+corresponding blocks. It is not safe to skip them.
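+
+A reader can model this block as a simple state machine; a hypothetical sketch
+(record shapes as shown above, with ``DEFINE_ABBREV`` payloads handled by the
+generic abbreviation parser):
+
+::
+
+  def parse_blockinfo(entries):
+      # entries: ("SETBID", ops) / ("DEFINE_ABBREV", abbrev) pairs in stream
+      # order.  Returns {blockid: [abbrev, ...]}.
+      info, cur = {}, None
+      for kind, payload in entries:
+          if kind == "SETBID":
+              cur = payload[0]
+              info.setdefault(cur, [])
+          elif kind == "DEFINE_ABBREV":
+              assert cur is not None, "a SETBID record must come first"
+              info[cur].append(payload)
+          # BLOCKNAME / SETRECORDNAME only aid symbolic dumping; ignored here.
+      return info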
+
+.. _wrapper:
+
+Bitcode Wrapper Format
+======================
+
+Bitcode files for LLVM IR may optionally be wrapped in a simple wrapper
+structure. This structure contains a simple header that indicates the offset
+and size of the embedded BC file. This allows additional information to be
+stored alongside the BC file. The structure of this file header is:
+
+:raw-html:`<tt><blockquote>`
+[Magic\ :sub:`32`, Version\ :sub:`32`, Offset\ :sub:`32`, Size\ :sub:`32`, CPUType\ :sub:`32`]
+:raw-html:`</blockquote></tt>`
+
+Each field is a 32-bit value stored in little-endian form (as with the rest of
+the bitcode file fields). The Magic number is always ``0x0B17C0DE`` and
+the version is currently always ``0``. The Offset field is the offset in bytes
+to the start of the bitcode stream in the file, and the Size field is the size
+in bytes of the stream. CPUType is a target-specific value that can be used to
+encode the CPU of the target.
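+
+Reading the wrapper is a single fixed-layout decode. A minimal sketch, under
+the layout above:
+
+::
+
+  import struct
+
+  def unwrap_bitcode(data):
+      # Five little-endian 32-bit fields, as described above.
+      magic, version, offset, size, cputype = struct.unpack_from("<5I", data)
+      if magic != 0x0B17C0DE or version != 0:
+          raise ValueError("not a version-0 bitcode wrapper")
+      return data[offset:offset + size]  # the embedded bitcode stream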
+
+.. _encoding of LLVM IR:
+
+LLVM IR Encoding
+================
+
+LLVM IR is encoded into a bitstream by defining blocks and records. It uses
+blocks for things like constant pools, functions, symbol tables, etc. It uses
+records for things like instructions, global variable descriptors, type
+descriptions, etc. This document does not describe the set of abbreviations
+that the writer uses, as these are fully self-described in the file, and the
+reader is not allowed to build in any knowledge of this.
+
+Basics
+------
+
+LLVM IR Magic Number
+^^^^^^^^^^^^^^^^^^^^
+
+The magic number for LLVM IR files is:
+
+:raw-html:`<tt><blockquote>`
+[0x0\ :sub:`4`, 0xC\ :sub:`4`, 0xE\ :sub:`4`, 0xD\ :sub:`4`]
+:raw-html:`</blockquote></tt>`
+
+When combined with the bitcode magic number and viewed as bytes, this is
+``"BC 0xC0DE"``.
+
+Signed VBRs
+^^^^^^^^^^^
+
+`Variable Width Integer`_ encoding is an efficient way to encode arbitrarily
+sized unsigned values, but is extremely inefficient for encoding signed values,
+as signed values are otherwise treated as maximally large unsigned values.
+
+As such, signed VBR values of a specific width are emitted as follows:
+
+* Positive values are emitted as VBRs of the specified width, but with their
+ value shifted left by one.
+
+* Negative values are emitted as VBRs of the specified width, but the negated
+ value is shifted left by one, and the low bit is set.
+
+With this encoding, small positive and small negative values can both be emitted
+efficiently. Signed VBR encoding is used in ``CST_CODE_INTEGER`` and
+``CST_CODE_WIDE_INTEGER`` records within ``CONSTANTS_BLOCK`` blocks.
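+
+The mapping is a one-line transform in each direction; a sketch:
+
+::
+
+  def svbr_payload(value):
+      # Signed value -> unsigned payload emitted as a plain VBR.
+      return (value << 1) if value >= 0 else ((-value << 1) | 1)
+
+  def svbr_value(payload):
+      # Unsigned payload read from a plain VBR -> signed value.
+      return -(payload >> 1) if (payload & 1) else (payload >> 1)
+
+  assert svbr_payload(3) == 6 and svbr_payload(-3) == 7
+  assert svbr_value(6) == 3 and svbr_value(7) == -3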
+
+LLVM IR Blocks
+^^^^^^^^^^^^^^
+
+LLVM IR is defined with the following blocks:
+
+* 8 --- `MODULE_BLOCK`_ --- This is the top-level block that contains the entire
+ module, and describes a variety of per-module information.
+
+* 9 --- `PARAMATTR_BLOCK`_ --- This enumerates the parameter attributes.
+
+* 10 --- `TYPE_BLOCK`_ --- This describes all of the types in the module.
+
+* 11 --- `CONSTANTS_BLOCK`_ --- This describes constants for a module or
+ function.
+
+* 12 --- `FUNCTION_BLOCK`_ --- This describes a function body.
+
+* 13 --- `TYPE_SYMTAB_BLOCK`_ --- This describes the type symbol table.
+
+* 14 --- `VALUE_SYMTAB_BLOCK`_ --- This describes a value symbol table.
+
+* 15 --- `METADATA_BLOCK`_ --- This describes metadata items.
+
+* 16 --- `METADATA_ATTACHMENT`_ --- This contains records associating metadata
+ with function instruction values.
+
+.. _MODULE_BLOCK:
+
+MODULE_BLOCK Contents
+---------------------
+
+The ``MODULE_BLOCK`` block (id 8) is the top-level block for LLVM bitcode files,
+and each bitcode file must contain exactly one. In addition to records
+(described below) containing information about the module, a ``MODULE_BLOCK``
+block may contain the following sub-blocks:
+
+* `BLOCKINFO`_
+* `PARAMATTR_BLOCK`_
+* `TYPE_BLOCK`_
+* `TYPE_SYMTAB_BLOCK`_
+* `VALUE_SYMTAB_BLOCK`_
+* `CONSTANTS_BLOCK`_
+* `FUNCTION_BLOCK`_
+* `METADATA_BLOCK`_
+
+MODULE_CODE_VERSION Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[VERSION, version#]``
+
+The ``VERSION`` record (code 1) contains a single value indicating the format
+version. Only version 0 is supported at this time.
+
+MODULE_CODE_TRIPLE Record
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[TRIPLE, ...string...]``
+
+The ``TRIPLE`` record (code 2) contains a variable number of values representing
+the bytes of the ``target triple`` specification string.
+
+MODULE_CODE_DATALAYOUT Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[DATALAYOUT, ...string...]``
+
+The ``DATALAYOUT`` record (code 3) contains a variable number of values
+representing the bytes of the ``target datalayout`` specification string.
+
+MODULE_CODE_ASM Record
+^^^^^^^^^^^^^^^^^^^^^^
+
+``[ASM, ...string...]``
+
+The ``ASM`` record (code 4) contains a variable number of values representing
+the bytes of ``module asm`` strings, with individual assembly blocks separated
+by newline (ASCII 10) characters.
+
+.. _MODULE_CODE_SECTIONNAME:
+
+MODULE_CODE_SECTIONNAME Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[SECTIONNAME, ...string...]``
+
+The ``SECTIONNAME`` record (code 5) contains a variable number of values
+representing the bytes of a single section name string. There should be one
+``SECTIONNAME`` record for each section name referenced (e.g., in global
+variable or function ``section`` attributes) within the module. These records
+can be referenced by the 1-based index in the *section* fields of ``GLOBALVAR``
+or ``FUNCTION`` records.
+
+MODULE_CODE_DEPLIB Record
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[DEPLIB, ...string...]``
+
+The ``DEPLIB`` record (code 6) contains a variable number of values representing
+the bytes of a single dependent library name string, one of the libraries
+mentioned in a ``deplibs`` declaration. There should be one ``DEPLIB`` record
+for each library name referenced.
+
+MODULE_CODE_GLOBALVAR Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[GLOBALVAR, pointer type, isconst, initid, linkage, alignment, section, visibility, threadlocal, unnamed_addr]``
+
+The ``GLOBALVAR`` record (code 7) marks the declaration or definition of a
+global variable. The operand fields are:
+
+* *pointer type*: The type index of the pointer type used to point to this
+ global variable
+
+* *isconst*: Non-zero if the variable is treated as constant within the module,
+ or zero if it is not
+
+* *initid*: If non-zero, the value index of the initializer for this variable,
+ plus 1.
+
+.. _linkage type:
+
+* *linkage*: An encoding of the linkage type for this variable:
+
+ * ``external``: code 0
+ * ``weak``: code 1
+ * ``appending``: code 2
+ * ``internal``: code 3
+ * ``linkonce``: code 4
+ * ``dllimport``: code 5
+ * ``dllexport``: code 6
+ * ``extern_weak``: code 7
+ * ``common``: code 8
+ * ``private``: code 9
+ * ``weak_odr``: code 10
+ * ``linkonce_odr``: code 11
+ * ``available_externally``: code 12
+ * ``linker_private``: code 13
+
+* *alignment*: The logarithm base 2 of the variable's requested alignment,
+  plus 1 (see the sketch after this list)
+
+* *section*: If non-zero, the 1-based section index in the table of
+ `MODULE_CODE_SECTIONNAME`_ entries.
+
+.. _visibility:
+
+* *visibility*: If present, an encoding of the visibility of this variable:
+
+ * ``default``: code 0
+ * ``hidden``: code 1
+ * ``protected``: code 2
+
+* *threadlocal*: If present, an encoding of the thread local storage mode of the
+  variable:
+
+ * ``not thread local``: code 0
+ * ``thread local; default TLS model``: code 1
+ * ``localdynamic``: code 2
+ * ``initialexec``: code 3
+ * ``localexec``: code 4
+
+* *unnamed_addr*: If present and non-zero, indicates that the variable has
+ ``unnamed_addr``
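+
+As a worked illustration of the biased fields above, a hypothetical helper
+decoding the first few operands might look like:
+
+::
+
+  def decode_globalvar(ops):
+      # ops: the operand list of a GLOBALVAR record, in the order shown above.
+      ptr_ty, isconst, initid, linkage, alignment = ops[:5]
+      return {
+          "type": ptr_ty,
+          "constant": bool(isconst),
+          # initid is biased by one so that 0 can mean "no initializer".
+          "init": initid - 1 if initid else None,
+          # alignment is stored as log2(align) + 1; 0 means unspecified.
+          "align": (1 << (alignment - 1)) if alignment else None,
+          "linkage": linkage,
+      }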
+
+.. _FUNCTION:
+
+MODULE_CODE_FUNCTION Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc, unnamed_addr]``
+
+The ``FUNCTION`` record (code 8) marks the declaration or definition of a
+function. The operand fields are:
+
+* *type*: The type index of the function type describing this function
+
+* *callingconv*: The calling convention number:
+ * ``ccc``: code 0
+ * ``fastcc``: code 8
+ * ``coldcc``: code 9
+ * ``x86_stdcallcc``: code 64
+ * ``x86_fastcallcc``: code 65
+ * ``arm_apcscc``: code 66
+ * ``arm_aapcscc``: code 67
+ * ``arm_aapcs_vfpcc``: code 68
+
+* *isproto*: Non-zero if this entry represents a declaration rather than a
+  definition
+
+* *linkage*: An encoding of the `linkage type`_ for this function
+
+* *paramattr*: If nonzero, the 1-based parameter attribute index into the table
+ of `PARAMATTR_CODE_ENTRY`_ entries.
+
+* *alignment*: The logarithm base 2 of the function's requested alignment, plus
+ 1
+
+* *section*: If non-zero, the 1-based section index in the table of
+ `MODULE_CODE_SECTIONNAME`_ entries.
+
+* *visibility*: An encoding of the `visibility`_ of this function
+
+* *gc*: If present and nonzero, the 1-based garbage collector index in the table
+ of `MODULE_CODE_GCNAME`_ entries.
+
+* *unnamed_addr*: If present and non-zero, indicates that the function has
+ ``unnamed_addr``
+
+MODULE_CODE_ALIAS Record
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[ALIAS, alias type, aliasee val#, linkage, visibility]``
+
+The ``ALIAS`` record (code 9) marks the definition of an alias. The operand
+fields are
+
+* *alias type*: The type index of the alias
+
+* *aliasee val#*: The value index of the aliased value
+
+* *linkage*: An encoding of the `linkage type`_ for this alias
+
+* *visibility*: If present, an encoding of the `visibility`_ of the alias
+
+MODULE_CODE_PURGEVALS Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[PURGEVALS, numvals]``
+
+The ``PURGEVALS`` record (code 10) resets the module-level value list to the
+size given by the single operand value. Module-level value list items are added
+by ``GLOBALVAR``, ``FUNCTION``, and ``ALIAS`` records. After a ``PURGEVALS``
+record is seen, new value indices will start from the given *numvals* value.
+
+.. _MODULE_CODE_GCNAME:
+
+MODULE_CODE_GCNAME Record
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[GCNAME, ...string...]``
+
+The ``GCNAME`` record (code 11) contains a variable number of values
+representing the bytes of a single garbage collector name string. There should
+be one ``GCNAME`` record for each garbage collector name referenced in function
+``gc`` attributes within the module. These records can be referenced by 1-based
+index in the *gc* fields of ``FUNCTION`` records.
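+
+For example (a sketch), a module containing
+
+.. code-block:: llvm
+
+  define void @f() gc "shadow-stack" {
+    ret void
+  }
+
+would be expected to contain one ``GCNAME`` record for ``shadow-stack``.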
+
+.. _PARAMATTR_BLOCK:
+
+PARAMATTR_BLOCK Contents
+------------------------
+
+The ``PARAMATTR_BLOCK`` block (id 9) contains a table of entries describing the
+attributes of function parameters. These entries are referenced by 1-based index
+in the *paramattr* field of module block `FUNCTION`_ records, or within the
+*attr* field of function block ``INST_INVOKE`` and ``INST_CALL`` records.
+
+Entries within ``PARAMATTR_BLOCK`` are constructed to ensure that each is unique
+(i.e., no two indices represent equivalent attribute lists).
+
+.. _PARAMATTR_CODE_ENTRY:
+
+PARAMATTR_CODE_ENTRY Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[ENTRY, paramidx0, attr0, paramidx1, attr1...]``
+
+The ``ENTRY`` record (code 1) contains an even number of values describing a
+unique set of function parameter attributes. Each *paramidx* value indicates
+which set of attributes is represented, with 0 representing the return value
+attributes, 0xFFFFFFFF representing function attributes, and other values
+representing 1-based function parameters. Each *attr* value is a bitmap with the
+following interpretation:
+
+* bit 0: ``zeroext``
+* bit 1: ``signext``
+* bit 2: ``noreturn``
+* bit 3: ``inreg``
+* bit 4: ``sret``
+* bit 5: ``nounwind``
+* bit 6: ``noalias``
+* bit 7: ``byval``
+* bit 8: ``nest``
+* bit 9: ``readnone``
+* bit 10: ``readonly``
+* bit 11: ``noinline``
+* bit 12: ``alwaysinline``
+* bit 13: ``optsize``
+* bit 14: ``ssp``
+* bit 15: ``sspreq``
+* bits 16-31: ``align n``
+* bit 32: ``nocapture``
+* bit 33: ``noredzone``
+* bit 34: ``noimplicitfloat``
+* bit 35: ``naked``
+* bit 36: ``inlinehint``
+* bits 37-39: ``alignstack n``, represented as the logarithm
+ base 2 of the requested alignment, plus 1
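+
+For example, a first parameter carrying ``zeroext noalias`` would be expected
+to appear as the pair *paramidx* = 1, *attr* = 0x41 (bit 0 for ``zeroext`` and
+bit 6 for ``noalias``).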
+
+.. _TYPE_BLOCK:
+
+TYPE_BLOCK Contents
+-------------------
+
+The ``TYPE_BLOCK`` block (id 10) contains records which constitute a table of
+type operator entries used to represent types referenced within an LLVM
+module. Each record (with the exception of `NUMENTRY`_) generates a single type
+table entry, which may be referenced by 0-based index from instructions,
+constants, metadata, type symbol table entries, or other type operator records.
+
+Entries within ``TYPE_BLOCK`` are constructed to ensure that each entry is
+unique (i.e., no two indices represent structurally equivalent types).
+
+.. _TYPE_CODE_NUMENTRY:
+.. _NUMENTRY:
+
+TYPE_CODE_NUMENTRY Record
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[NUMENTRY, numentries]``
+
+The ``NUMENTRY`` record (code 1) contains a single value which indicates the
+total number of type code entries in the type table of the module. If present,
+``NUMENTRY`` should be the first record in the block.
+
+TYPE_CODE_VOID Record
+^^^^^^^^^^^^^^^^^^^^^
+
+``[VOID]``
+
+The ``VOID`` record (code 2) adds a ``void`` type to the type table.
+
+TYPE_CODE_HALF Record
+^^^^^^^^^^^^^^^^^^^^^
+
+``[HALF]``
+
+The ``HALF`` record (code 10) adds a ``half`` (16-bit floating point) type to
+the type table.
+
+TYPE_CODE_FLOAT Record
+^^^^^^^^^^^^^^^^^^^^^^
+
+``[FLOAT]``
+
+The ``FLOAT`` record (code 3) adds a ``float`` (32-bit floating point) type to
+the type table.
+
+TYPE_CODE_DOUBLE Record
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``[DOUBLE]``
+
+The ``DOUBLE`` record (code 4) adds a ``double`` (64-bit floating point) type to
+the type table.
+
+TYPE_CODE_LABEL Record
+^^^^^^^^^^^^^^^^^^^^^^
+
+``[LABEL]``
+
+The ``LABEL`` record (code 5) adds a ``label`` type to the type table.
+
+TYPE_CODE_OPAQUE Record
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``[OPAQUE]``
+
+The ``OPAQUE`` record (code 6) adds an ``opaque`` type to the type table. Note
+that distinct ``opaque`` types are not unified.
+
+TYPE_CODE_INTEGER Record
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[INTEGER, width]``
+
+The ``INTEGER`` record (code 7) adds an integer type to the type table. The
+single *width* field indicates the width of the integer type.
+
+TYPE_CODE_POINTER Record
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[POINTER, pointee type, address space]``
+
+The ``POINTER`` record (code 8) adds a pointer type to the type table. The
+operand fields are
+
+* *pointee type*: The type index of the pointed-to type
+
+* *address space*: If supplied, the target-specific numbered address space where
+ the pointed-to object resides. Otherwise, the default address space is zero.
+
+TYPE_CODE_FUNCTION Record
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[FUNCTION, vararg, ignored, retty, ...paramty...]``
+
+The ``FUNCTION`` record (code 9) adds a function type to the type table. The
+operand fields are
+
+* *vararg*: Non-zero if the type represents a varargs function
+
+* *ignored*: This value field is present for backward compatibility only, and is
+ ignored
+
+* *retty*: The type index of the function's return type
+
+* *paramty*: Zero or more type indices representing the parameter types of the
+ function
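+
+As a sketch (the actual indices depend on emission order), the type
+``i32 (i32*)`` could be represented by three consecutive table entries::
+
+  [INTEGER, 32]             ; entry 0: i32
+  [POINTER, 0, 0]           ; entry 1: i32*
+  [FUNCTION, 0, 0, 0, 1]    ; entry 2: i32 (i32*)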
+
+TYPE_CODE_STRUCT Record
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``[STRUCT, ispacked, ...eltty...]``
+
+The ``STRUCT`` record (code 10) adds a struct type to the type table. The
+operand fields are
+
+* *ispacked*: Non-zero if the type represents a packed structure
+
+* *eltty*: Zero or more type indices representing the element types of the
+ structure
+
+TYPE_CODE_ARRAY Record
+^^^^^^^^^^^^^^^^^^^^^^
+
+``[ARRAY, numelts, eltty]``
+
+The ``ARRAY`` record (code 11) adds an array type to the type table. The
+operand fields are
+
+* *numelts*: The number of elements in arrays of this type
+
+* *eltty*: The type index of the array element type
+
+TYPE_CODE_VECTOR Record
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``[VECTOR, numelts, eltty]``
+
+The ``VECTOR`` record (code 12) adds a vector type to the type table. The
+operand fields are
+
+* *numelts*: The number of elements in vectors of this type
+
+* *eltty*: The type index of the vector element type
+
+TYPE_CODE_X86_FP80 Record
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[X86_FP80]``
+
+The ``X86_FP80`` record (code 13) adds an ``x86_fp80`` (80-bit floating point)
+type to the type table.
+
+TYPE_CODE_FP128 Record
+^^^^^^^^^^^^^^^^^^^^^^
+
+``[FP128]``
+
+The ``FP128`` record (code 14) adds an ``fp128`` (128-bit floating point) type
+to the type table.
+
+TYPE_CODE_PPC_FP128 Record
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[PPC_FP128]``
+
+The ``PPC_FP128`` record (code 15) adds a ``ppc_fp128`` (128-bit floating point)
+type to the type table.
+
+TYPE_CODE_METADATA Record
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``[METADATA]``
+
+The ``METADATA`` record (code 16) adds a ``metadata`` type to the type table.
+
+.. _CONSTANTS_BLOCK:
+
+CONSTANTS_BLOCK Contents
+------------------------
+
+The ``CONSTANTS_BLOCK`` block (id 11) ...
+
+.. _FUNCTION_BLOCK:
+
+FUNCTION_BLOCK Contents
+-----------------------
+
+The ``FUNCTION_BLOCK`` block (id 12) ...
+
+In addition to the record types described below, a ``FUNCTION_BLOCK`` block may
+contain the following sub-blocks:
+
+* `CONSTANTS_BLOCK`_
+* `VALUE_SYMTAB_BLOCK`_
+* `METADATA_ATTACHMENT`_
+
+.. _TYPE_SYMTAB_BLOCK:
+
+TYPE_SYMTAB_BLOCK Contents
+--------------------------
+
+The ``TYPE_SYMTAB_BLOCK`` block (id 13) contains entries which map between
+module-level named types and their corresponding type indices.
+
+.. _TST_CODE_ENTRY:
+
+TST_CODE_ENTRY Record
+^^^^^^^^^^^^^^^^^^^^^
+
+``[ENTRY, typeid, ...string...]``
+
+The ``ENTRY`` record (code 1) contains a variable number of values, with the
+first giving the type index of the designated type, and the remaining values
+giving the character codes of the type name. Each entry corresponds to a single
+named type.
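+
+For example, a module defining ``%pair = type { i32, i32 }`` would be expected
+to contain an ``ENTRY`` record whose first value is the type index of that
+struct type and whose remaining values are the character codes of ``pair``.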
+
+.. _VALUE_SYMTAB_BLOCK:
+
+VALUE_SYMTAB_BLOCK Contents
+---------------------------
+
+The ``VALUE_SYMTAB_BLOCK`` block (id 14) ...
+
+.. _METADATA_BLOCK:
+
+METADATA_BLOCK Contents
+-----------------------
+
+The ``METADATA_BLOCK`` block (id 15) ...
+
+.. _METADATA_ATTACHMENT:
+
+METADATA_ATTACHMENT Contents
+----------------------------
+
+The ``METADATA_ATTACHMENT`` block (id 16) ...
diff --git a/docs/BranchWeightMetadata.html b/docs/BranchWeightMetadata.html
deleted file mode 100644
index 38b87ba..0000000
--- a/docs/BranchWeightMetadata.html
+++ /dev/null
@@ -1,164 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM Branch Weight Metadata</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- LLVM Branch Weight Metadata
-</h1>
-
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#supported_instructions">Supported Instructions</a></li>
- <li><a href="#builtin_expect">Built-in "expect" Instruction </a></li>
- <li><a href="#cfg_modifications">CFG Modifications</a></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by <a href="mailto:jstaszak@apple.com">Jakub Staszak</a></p>
-</div>
-
-<h2>
- <a name="introduction">Introduction</a>
-</h2>
-<div>
-<p>Branch Weight Metadata represents branch weights as its likeliness to
-be taken. Metadata is assigned to the <tt>TerminatorInst</tt> as a
-<tt>MDNode</tt> of the <tt>MD_prof</tt> kind. The first operator is always a
-<tt>MDString</tt> node with the string "branch_weights". Number of operators
-depends on the terminator type.</p>
-
-<p>Branch weights might be fetch from the profiling file, or generated based on
-<a href="#builtin_expect"><tt>__builtin_expect</tt></a> instruction.
-</p>
-
-<p>All weights are represented as an unsigned 32-bit values, where higher value
-indicates greater chance to be taken.</p>
-</div>
-
-<h2>
- <a name="supported_instructions">Supported Instructions</a>
-</h2>
-
-<div>
- <h4>BranchInst</h4>
- <div>
- <p>Metadata is only assign to the conditional branches. There are two extra
- operarands, for the true and the false branch.</p>
- </div>
- <div class="doc_code">
- <pre>
-!0 = metadata !{
- metadata !"branch_weights",
- i32 &lt;TRUE_BRANCH_WEIGHT&gt;,
- i32 &lt;FALSE_BRANCH_WEIGHT&gt;
-}
- </pre>
- </div>
-
- <h4>SwitchInst</h4>
- <div>
- <p>Branch weights are assign to every case (including <tt>default</tt> case
- which is always case #0).</p>
- </div>
- <div class="doc_code">
- <pre>
-!0 = metadata !{
- metadata !"branch_weights",
- i32 &lt;DEFAULT_BRANCH_WEIGHT&gt;
- [ , i32 &lt;CASE_BRANCH_WEIGHT&gt; ... ]
-}
- </pre>
- </div>
-
- <h4>IndirectBrInst</h4>
- <div>
- <p>Branch weights are assign to every destination.</p>
- </div>
- <div class="doc_code">
- <pre>
-!0 = metadata !{
- metadata !"branch_weights",
- i32 &lt;LABEL_BRANCH_WEIGHT&gt;
- [ , i32 &lt;LABEL_BRANCH_WEIGHT&gt; ... ]
-}
- </pre>
- </div>
-
- <h4>Other</h4>
- <div>
- <p>Other terminator instructions are not allowed to contain Branch Weight
- Metadata.</p>
- </div>
-</div>
-
-<h2>
- <a name="builtin_expect">Built-in "expect" Instructions</a>
-</h2>
-<div>
- <p><tt>__builtin_expect(long exp, long c)</tt> instruction provides branch
- prediction information. The return value is the value of <tt>exp</tt>.</p>
-
- <p>It is especially useful in conditional statements. Currently Clang supports
- two conditional statements:
- </p>
- <h4><tt>if</tt> statement</h4>
- <div>
- <p>The <tt>exp</tt> parameter is the condition. The <tt>c</tt> parameter is
- the expected comparision value. If it is equal to 1 (true), the condition is
- likely to be true, in other case condition is likely to be false. For example:
- </p>
- </div>
- <div class="doc_code">
- <pre>
- if (__builtin_expect(x &gt; 0, 1)) {
- // This block is likely to be taken.
- }
- </pre>
- </div>
-
- <h4><tt>switch</tt> statement</h4>
- <div>
- <p>The <tt>exp</tt> parameter is the value. The <tt>c</tt> parameter is the
- expected value. If the expected value doesn't show on the cases list, the
- <tt>default</tt> case is assumed to be likely taken.</p>
- </div>
- <div class="doc_code">
- <pre>
- switch (__builtin_expect(x, 5)) {
- default: break;
- case 0: // ...
- case 3: // ...
- case 5: // This case is likely to be taken.
- }
- </pre>
- </div>
-</div>
-
-<h2>
- <a name="cfg_modifications">CFG Modifications</a>
-</h2>
-<div>
-<p>Branch Weight Metatada is not proof against CFG changes. If terminator
-operands' are changed some action should be taken. In other case some
-misoptimizations may occur due to incorrent branch prediction information.</p>
-</div>
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:jstaszak@apple.com">Jakub Staszak</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
-</address>
-
-</body>
-</html>
diff --git a/docs/BranchWeightMetadata.rst b/docs/BranchWeightMetadata.rst
new file mode 100644
index 0000000..f0df971
--- /dev/null
+++ b/docs/BranchWeightMetadata.rst
@@ -0,0 +1,118 @@
+.. _branch_weight:
+
+===========================
+LLVM Branch Weight Metadata
+===========================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+Branch Weight Metadata represents the likeliness of a branch being taken.
+Metadata is assigned to the ``TerminatorInst`` as an ``MDNode`` of the
+``MD_prof`` kind. The first operand is always an ``MDString`` node with the
+string "branch_weights". The number of operands depends on the terminator
+type.
+
+Branch weights might be fetched from the profiling file, or generated based on
+the `__builtin_expect`_ instruction.
+
+All weights are represented as unsigned 32-bit values, where a higher value
+indicates a greater chance of the branch being taken.
+
+Supported Instructions
+======================
+
+``BranchInst``
+^^^^^^^^^^^^^^
+
+Metadata is only assigned to conditional branches. There are two extra
+operands, one for the true and one for the false branch.
+
+.. code-block:: llvm
+
+ !0 = metadata !{
+ metadata !"branch_weights",
+ i32 <TRUE_BRANCH_WEIGHT>,
+ i32 <FALSE_BRANCH_WEIGHT>
+ }
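+
+For example (a sketch; the weights are arbitrary), such a node is attached to a
+conditional branch through the ``!prof`` tag:
+
+.. code-block:: llvm
+
+  br i1 %cond, label %if.then, label %if.else, !prof !0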
+
+``SwitchInst``
+^^^^^^^^^^^^^^
+
+Branch weights are assigned to every case (including the ``default`` case,
+which is always case #0).
+
+.. code-block:: llvm
+
+ !0 = metadata !{
+ metadata !"branch_weights",
+ i32 <DEFAULT_BRANCH_WEIGHT>
+ [ , i32 <CASE_BRANCH_WEIGHT> ... ]
+ }
+
+``IndirectBrInst``
+^^^^^^^^^^^^^^^^^^
+
+Branch weights are assigned to every destination.
+
+.. code-block:: llvm
+
+ !0 = metadata !{
+ metadata !"branch_weights",
+ i32 <LABEL_BRANCH_WEIGHT>
+ [ , i32 <LABEL_BRANCH_WEIGHT> ... ]
+ }
+
+Other
+^^^^^
+
+Other terminator instructions are not allowed to contain Branch Weight Metadata.
+
+.. _\__builtin_expect:
+
+Built-in ``expect`` Instructions
+================================
+
+The ``__builtin_expect(long exp, long c)`` instruction provides branch
+prediction information. The return value is the value of ``exp``.
+
+It is especially useful in conditional statements. Currently Clang supports two
+conditional statements:
+
+``if`` statement
+^^^^^^^^^^^^^^^^
+
+The ``exp`` parameter is the condition. The ``c`` parameter is the expected
+comparison value. If it is equal to 1 (true), the condition is likely to be
+true; otherwise, the condition is likely to be false. For example:
+
+.. code-block:: c++
+
+ if (__builtin_expect(x > 0, 1)) {
+ // This block is likely to be taken.
+ }
+
+``switch`` statement
+^^^^^^^^^^^^^^^^^^^^
+
+The ``exp`` parameter is the value. The ``c`` parameter is the expected value.
+If the expected value doesn't appear in the case list, the ``default`` case is
+assumed to be the one likely taken.
+
+.. code-block:: c++
+
+ switch (__builtin_expect(x, 5)) {
+ default: break;
+ case 0: // ...
+ case 3: // ...
+ case 5: // This case is likely to be taken.
+ }
+
+CFG Modifications
+=================
+
+Branch Weight Metadata is not proof against CFG changes. If terminator
+operands are changed, some action should be taken. Otherwise, misoptimizations
+may occur due to incorrect branch prediction information.
diff --git a/docs/Bugpoint.html b/docs/Bugpoint.html
deleted file mode 100644
index d9cce0b..0000000
--- a/docs/Bugpoint.html
+++ /dev/null
@@ -1,239 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM bugpoint tool: design and usage</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-
-<h1>
- LLVM bugpoint tool: design and usage
-</h1>
-
-<ul>
- <li><a href="#desc">Description</a></li>
- <li><a href="#design">Design Philosophy</a>
- <ul>
- <li><a href="#autoselect">Automatic Debugger Selection</a></li>
- <li><a href="#crashdebug">Crash debugger</a></li>
- <li><a href="#codegendebug">Code generator debugger</a></li>
- <li><a href="#miscompilationdebug">Miscompilation debugger</a></li>
- </ul></li>
- <li><a href="#advice">Advice for using <tt>bugpoint</tt></a></li>
-</ul>
-
-<div class="doc_author">
-<p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
-<a name="desc">Description</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p><tt>bugpoint</tt> narrows down the source of problems in LLVM tools and
-passes. It can be used to debug three types of failures: optimizer crashes,
-miscompilations by optimizers, or bad native code generation (including problems
-in the static and JIT compilers). It aims to reduce large test cases to small,
-useful ones. For example, if <tt>opt</tt> crashes while optimizing a
-file, it will identify the optimization (or combination of optimizations) that
-causes the crash, and reduce the file down to a small example which triggers the
-crash.</p>
-
-<p>For detailed case scenarios, such as debugging <tt>opt</tt>,
-<tt>llvm-ld</tt>, or one of the LLVM code generators, see <a
-href="HowToSubmitABug.html">How To Submit a Bug Report document</a>.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
-<a name="design">Design Philosophy</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p><tt>bugpoint</tt> is designed to be a useful tool without requiring any
-hooks into the LLVM infrastructure at all. It works with any and all LLVM
-passes and code generators, and does not need to "know" how they work. Because
-of this, it may appear to do stupid things or miss obvious
-simplifications. <tt>bugpoint</tt> is also designed to trade off programmer
-time for computer time in the compiler-debugging process; consequently, it may
-take a long period of (unattended) time to reduce a test case, but we feel it
-is still worth it. Note that <tt>bugpoint</tt> is generally very quick unless
-debugging a miscompilation where each test of the program (which requires
-executing it) takes a long time.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="autoselect">Automatic Debugger Selection</a>
-</h3>
-
-<div>
-
-<p><tt>bugpoint</tt> reads each <tt>.bc</tt> or <tt>.ll</tt> file specified on
-the command line and links them together into a single module, called the test
-program. If any LLVM passes are specified on the command line, it runs these
-passes on the test program. If any of the passes crash, or if they produce
-malformed output (which causes the verifier to abort), <tt>bugpoint</tt> starts
-the <a href="#crashdebug">crash debugger</a>.</p>
-
-<p>Otherwise, if the <tt>-output</tt> option was not specified,
-<tt>bugpoint</tt> runs the test program with the C backend (which is assumed to
-generate good code) to generate a reference output. Once <tt>bugpoint</tt> has
-a reference output for the test program, it tries executing it with the
-selected code generator. If the selected code generator crashes,
-<tt>bugpoint</tt> starts the <a href="#crashdebug">crash debugger</a> on the
-code generator. Otherwise, if the resulting output differs from the reference
-output, it assumes the difference resulted from a code generator failure, and
-starts the <a href="#codegendebug">code generator debugger</a>.</p>
-
-<p>Finally, if the output of the selected code generator matches the reference
-output, <tt>bugpoint</tt> runs the test program after all of the LLVM passes
-have been applied to it. If its output differs from the reference output, it
-assumes the difference resulted from a failure in one of the LLVM passes, and
-enters the <a href="#miscompilationdebug">miscompilation debugger</a>.
-Otherwise, there is no problem <tt>bugpoint</tt> can debug.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="crashdebug">Crash debugger</a>
-</h3>
-
-<div>
-
-<p>If an optimizer or code generator crashes, <tt>bugpoint</tt> will try as hard
-as it can to reduce the list of passes (for optimizer crashes) and the size of
-the test program. First, <tt>bugpoint</tt> figures out which combination of
-optimizer passes triggers the bug. This is useful when debugging a problem
-exposed by <tt>opt</tt>, for example, because it runs over 38 passes.</p>
-
-<p>Next, <tt>bugpoint</tt> tries removing functions from the test program, to
-reduce its size. Usually it is able to reduce a test program to a single
-function, when debugging intraprocedural optimizations. Once the number of
-functions has been reduced, it attempts to delete various edges in the control
-flow graph, to reduce the size of the function as much as possible. Finally,
-<tt>bugpoint</tt> deletes any individual LLVM instructions whose absence does
-not eliminate the failure. At the end, <tt>bugpoint</tt> should tell you what
-passes crash, give you a bitcode file, and give you instructions on how to
-reproduce the failure with <tt>opt</tt> or <tt>llc</tt>.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="codegendebug">Code generator debugger</a>
-</h3>
-
-<div>
-
-<p>The code generator debugger attempts to narrow down the amount of code that
-is being miscompiled by the selected code generator. To do this, it takes the
-test program and partitions it into two pieces: one piece which it compiles
-with the C backend (into a shared object), and one piece which it runs with
-either the JIT or the static LLC compiler. It uses several techniques to
-reduce the amount of code pushed through the LLVM code generator, to reduce the
-potential scope of the problem. After it is finished, it emits two bitcode
-files (called "test" [to be compiled with the code generator] and "safe" [to be
-compiled with the C backend], respectively), and instructions for reproducing
-the problem. The code generator debugger assumes that the C backend produces
-good code.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="miscompilationdebug">Miscompilation debugger</a>
-</h3>
-
-<div>
-
-<p>The miscompilation debugger works similarly to the code generator debugger.
-It works by splitting the test program into two pieces, running the
-optimizations specified on one piece, linking the two pieces back together, and
-then executing the result. It attempts to narrow down the list of passes to
-the one (or few) which are causing the miscompilation, then reduce the portion
-of the test program which is being miscompiled. The miscompilation debugger
-assumes that the selected code generator is working properly.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="advice">Advice for using bugpoint</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<tt>bugpoint</tt> can be a remarkably useful tool, but it sometimes works in
-non-obvious ways. Here are some hints and tips:<p>
-
-<ol>
-<li>In the code generator and miscompilation debuggers, <tt>bugpoint</tt> only
- works with programs that have deterministic output. Thus, if the program
- outputs <tt>argv[0]</tt>, the date, time, or any other "random" data,
- <tt>bugpoint</tt> may misinterpret differences in these data, when output,
- as the result of a miscompilation. Programs should be temporarily modified
- to disable outputs that are likely to vary from run to run.
-
-<li>In the code generator and miscompilation debuggers, debugging will go
- faster if you manually modify the program or its inputs to reduce the
- runtime, but still exhibit the problem.
-
-<li><tt>bugpoint</tt> is extremely useful when working on a new optimization:
- it helps track down regressions quickly. To avoid having to relink
- <tt>bugpoint</tt> every time you change your optimization however, have
- <tt>bugpoint</tt> dynamically load your optimization with the
- <tt>-load</tt> option.
-
-<li><p><tt>bugpoint</tt> can generate a lot of output and run for a long period
- of time. It is often useful to capture the output of the program to file.
- For example, in the C shell, you can run:</p>
-
-<div class="doc_code">
-<p><tt>bugpoint ... |&amp; tee bugpoint.log</tt></p>
-</div>
-
- <p>to get a copy of <tt>bugpoint</tt>'s output in the file
- <tt>bugpoint.log</tt>, as well as on your terminal.</p>
-
-<li><tt>bugpoint</tt> cannot debug problems with the LLVM linker. If
- <tt>bugpoint</tt> crashes before you see its "All input ok" message,
- you might try <tt>llvm-link -v</tt> on the same set of input files. If
- that also crashes, you may be experiencing a linker bug.
-
-<li><tt>bugpoint</tt> is useful for proactively finding bugs in LLVM.
- Invoking <tt>bugpoint</tt> with the <tt>-find-bugs</tt> option will cause
- the list of specified optimizations to be randomized and applied to the
- program. This process will repeat until a bug is found or the user
- kills <tt>bugpoint</tt>.
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-31 12:21:59 +0100 (Mon, 31 Oct 2011) $
-</address>
-
-</body>
-</html>
diff --git a/docs/Bugpoint.rst b/docs/Bugpoint.rst
new file mode 100644
index 0000000..9ccf0cc
--- /dev/null
+++ b/docs/Bugpoint.rst
@@ -0,0 +1,218 @@
+.. _bugpoint:
+
+====================================
+LLVM bugpoint tool: design and usage
+====================================
+
+.. contents::
+ :local:
+
+Description
+===========
+
+``bugpoint`` narrows down the source of problems in LLVM tools and passes. It
+can be used to debug three types of failures: optimizer crashes, miscompilations
+by optimizers, or bad native code generation (including problems in the static
+and JIT compilers). It aims to reduce large test cases to small, useful ones.
+For example, if ``opt`` crashes while optimizing a file, it will identify the
+optimization (or combination of optimizations) that causes the crash, and reduce
+the file down to a small example which triggers the crash.
+
+For detailed case scenarios, such as debugging ``opt`` or one of the LLVM code
+generators, see the `How To Submit a Bug Report <HowToSubmitABug.html>`_
+document.
+
+Design Philosophy
+=================
+
+``bugpoint`` is designed to be a useful tool without requiring any hooks into
+the LLVM infrastructure at all. It works with any and all LLVM passes and code
+generators, and does not need to "know" how they work. Because of this, it may
+appear to do stupid things or miss obvious simplifications. ``bugpoint`` is
+also designed to trade off programmer time for computer time in the
+compiler-debugging process; consequently, it may take a long period of
+(unattended) time to reduce a test case, but we feel it is still worth it. Note
+that ``bugpoint`` is generally very quick unless debugging a miscompilation
+where each test of the program (which requires executing it) takes a long time.
+
+Automatic Debugger Selection
+----------------------------
+
+``bugpoint`` reads each ``.bc`` or ``.ll`` file specified on the command line
+and links them together into a single module, called the test program. If any
+LLVM passes are specified on the command line, it runs these passes on the test
+program. If any of the passes crash, or if they produce malformed output (which
+causes the verifier to abort), ``bugpoint`` starts the `crash debugger`_.
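+
+For example (a sketch, assuming an input file ``test.bc``), the passes under
+suspicion are simply named on the command line:
+
+.. code-block:: bash
+
+  bugpoint test.bc -instcombine -simplifycfg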
+
+Otherwise, if the ``-output`` option was not specified, ``bugpoint`` runs the
+test program with the "safe" backend (which is assumed to generate good code) to
+generate a reference output. Once ``bugpoint`` has a reference output for the
+test program, it tries executing it with the selected code generator. If the
+selected code generator crashes, ``bugpoint`` starts the `crash debugger`_ on
+the code generator. Otherwise, if the resulting output differs from the
+reference output, it assumes the difference resulted from a code generator
+failure, and starts the `code generator debugger`_.
+
+Finally, if the output of the selected code generator matches the reference
+output, ``bugpoint`` runs the test program after all of the LLVM passes have
+been applied to it. If its output differs from the reference output, it assumes
+the difference resulted from a failure in one of the LLVM passes, and enters the
+`miscompilation debugger`_. Otherwise, there is no problem ``bugpoint`` can
+debug.
+
+.. _crash debugger:
+
+Crash debugger
+--------------
+
+If an optimizer or code generator crashes, ``bugpoint`` will try as hard as it
+can to reduce the list of passes (for optimizer crashes) and the size of the
+test program. First, ``bugpoint`` figures out which combination of optimizer
+passes triggers the bug. This is useful when debugging a problem exposed by
+``opt``, for example, because it runs over 38 passes.
+
+Next, ``bugpoint`` tries removing functions from the test program, to reduce its
+size. Usually it is able to reduce a test program to a single function, when
+debugging intraprocedural optimizations. Once the number of functions has been
+reduced, it attempts to delete various edges in the control flow graph, to
+reduce the size of the function as much as possible. Finally, ``bugpoint``
+deletes any individual LLVM instructions whose absence does not eliminate the
+failure. At the end, ``bugpoint`` should tell you what passes crash, give you a
+bitcode file, and give you instructions on how to reproduce the failure with
+``opt`` or ``llc``.
+
+.. _code generator debugger:
+
+Code generator debugger
+-----------------------
+
+The code generator debugger attempts to narrow down the amount of code that is
+being miscompiled by the selected code generator. To do this, it takes the test
+program and partitions it into two pieces: one piece which it compiles with the
+"safe" backend (into a shared object), and one piece which it runs with either
+the JIT or the static LLC compiler. It uses several techniques to reduce the
+amount of code pushed through the LLVM code generator, to reduce the potential
+scope of the problem. After it is finished, it emits two bitcode files (called
+"test" [to be compiled with the code generator] and "safe" [to be compiled with
+the "safe" backend], respectively), and instructions for reproducing the
+problem. The code generator debugger assumes that the "safe" backend produces
+good code.
+
+.. _miscompilation debugger:
+
+Miscompilation debugger
+-----------------------
+
+The miscompilation debugger works similarly to the code generator debugger. It
+works by splitting the test program into two pieces, running the optimizations
+specified on one piece, linking the two pieces back together, and then executing
+the result. It attempts to narrow down the list of passes to the one (or few)
+which are causing the miscompilation, then reduce the portion of the test
+program which is being miscompiled. The miscompilation debugger assumes that
+the selected code generator is working properly.
+
+Advice for using bugpoint
+=========================
+
+``bugpoint`` can be a remarkably useful tool, but it sometimes works in
+non-obvious ways. Here are some hints and tips:
+
+* In the code generator and miscompilation debuggers, ``bugpoint`` only works
+ with programs that have deterministic output. Thus, if the program outputs
+ ``argv[0]``, the date, time, or any other "random" data, ``bugpoint`` may
+ misinterpret differences in these data, when output, as the result of a
+ miscompilation. Programs should be temporarily modified to disable outputs
+ that are likely to vary from run to run.
+
+* In the code generator and miscompilation debuggers, debugging will go faster
+ if you manually modify the program or its inputs to reduce the runtime, but
+ still exhibit the problem.
+
+* ``bugpoint`` is extremely useful when working on a new optimization: it helps
+ track down regressions quickly. To avoid having to relink ``bugpoint`` every
+ time you change your optimization however, have ``bugpoint`` dynamically load
+ your optimization with the ``-load`` option.
+
+* ``bugpoint`` can generate a lot of output and run for a long period of time.
+ It is often useful to capture the output of the program to file. For example,
+ in the C shell, you can run:
+
+ .. code-block:: bash
+
+ bugpoint ... |& tee bugpoint.log
+
+ to get a copy of ``bugpoint``'s output in the file ``bugpoint.log``, as well
+ as on your terminal.
+
+* ``bugpoint`` cannot debug problems with the LLVM linker. If ``bugpoint``
+ crashes before you see its "All input ok" message, you might try ``llvm-link
+ -v`` on the same set of input files. If that also crashes, you may be
+ experiencing a linker bug.
+
+* ``bugpoint`` is useful for proactively finding bugs in LLVM. Invoking
+ ``bugpoint`` with the ``-find-bugs`` option will cause the list of specified
+ optimizations to be randomized and applied to the program. This process will
+ repeat until a bug is found or the user kills ``bugpoint``.
+
+What to do when bugpoint isn't enough
+=====================================
+
+Sometimes, ``bugpoint`` is not enough. In particular, InstCombine and
+TargetLowering both have visitor-structured code with lots of potential
+transformations. If the process of using ``bugpoint`` has still left you with
+too much code to figure out and the problem seems to be in instcombine, the
+following steps may help. These same techniques are useful with TargetLowering
+as well.
+
+Turn on ``-debug-only=instcombine`` and see which transformations within
+instcombine are firing by selecting out lines with "``IC``" in them.
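+
+A sketch of doing this (assuming an assertions-enabled build of ``opt`` and an
+input file ``foo.ll``):
+
+.. code-block:: bash
+
+  opt -instcombine -debug-only=instcombine foo.ll -S -o /dev/null 2>&1 | grep "IC"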
+
+At this point, you have a decision to make. Is the number of transformations
+small enough to step through them using a debugger? If so, then try that.
+
+If there are too many transformations, then a source modification approach may
+be helpful. In this approach, you can modify the source code of instcombine to
+disable just those transformations that are being performed on your test input
+and perform a binary search over the set of transformations. One set of places
+to modify are the "``visit*``" methods of ``InstCombiner`` (*e.g.*
+``visitICmpInst``) by adding a "``return false``" as the first line of the
+method.
+
+If that still doesn't remove enough, then change
+``InstCombiner::runOnFunction`` (the caller of
+``InstCombiner::DoOneIteration``) to limit the number of iterations.
+
+You may also find it useful to use "``-stats``" now to see what parts of
+instcombine are firing. This can guide where to put additional reporting code.
+
+At this point, if the amount of transformations is still too large, then
+inserting code to limit whether or not to execute the body of the code in the
+visit function can be helpful. Add a static counter which is incremented on
+every invocation of the function. Then add code which simply returns false on
+desired ranges. For example:
+
+.. code-block:: c++
+
+ static int calledCount = 0;
+ calledCount++;
+ DEBUG(if (calledCount < 212) return false);
+ DEBUG(if (calledCount > 217) return false);
+ DEBUG(if (calledCount == 213) return false);
+ DEBUG(if (calledCount == 214) return false);
+ DEBUG(if (calledCount == 215) return false);
+ DEBUG(if (calledCount == 216) return false);
+ DEBUG(dbgs() << "visitXOR calledCount: " << calledCount << "\n");
+ DEBUG(dbgs() << "I: "; I->dump());
+
+could be added to ``visitXor`` to limit it to being applied only to calls 212
+and 217. This is from an actual test case and raises an important point: a
+simple binary search may not be sufficient, as transformations that
+interact may require isolating more than one call. In TargetLowering, use
+``return SDNode();`` instead of ``return false;``.
+
+Now that the number of transformations is down to a manageable number, try
+examining the output to see if you can figure out which transformations are
+being done. If that can be figured out, then do the usual debugging. If it
+isn't obvious which code corresponds to the transformation being performed, set
+a breakpoint after the call-count-based disabling and step through the code.
+Alternatively, you can use "``printf``" style debugging to report waypoints.
diff --git a/docs/CMake.html b/docs/CMake.html
deleted file mode 100644
index acc7fe9..0000000
--- a/docs/CMake.html
+++ /dev/null
@@ -1,584 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Building LLVM with CMake</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-
-<h1>
- Building LLVM with CMake
-</h1>
-
-<ul>
- <li><a href="#intro">Introduction</a></li>
- <li><a href="#quickstart">Quick start</a></li>
- <li><a href="#usage">Basic CMake usage</a>
- <li><a href="#options">Options and variables</a>
- <ul>
- <li><a href="#freccmake">Frequently-used CMake variables</a></li>
- <li><a href="#llvmvars">LLVM-specific variables</a></li>
- </ul></li>
- <li><a href="#testing">Executing the test suite</a>
- <li><a href="#cross">Cross compiling</a>
- <li><a href="#embedding">Embedding LLVM in your project</a>
- <ul>
- <li><a href="#passdev">Developing LLVM pass out of source</a></li>
- </ul></li>
- <li><a href="#specifics">Compiler/Platform specific topics</a>
- <ul>
- <li><a href="#msvc">Microsoft Visual C++</a></li>
- </ul></li>
-</ul>
-
-<div class="doc_author">
-<p>Written by <a href="mailto:ofv@wanadoo.es">Oscar Fuentes</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
-<a name="intro">Introduction</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
- <p><a href="http://www.cmake.org/">CMake</a> is a cross-platform
- build-generator tool. CMake does not build the project, it generates
- the files needed by your build tool (GNU make, Visual Studio, etc) for
- building LLVM.</p>
-
- <p>If you are really anxious about getting a functional LLVM build,
- go to the <a href="#quickstart">Quick start</a> section. If you
- are a CMake novice, start on <a href="#usage">Basic CMake
- usage</a> and then go back to the <a href="#quickstart">Quick
- start</a> once you know what you are
- doing. The <a href="#options">Options and variables</a> section
- is a reference for customizing your build. If you already have
- experience with CMake, this is the recommended starting point.
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
-<a name="quickstart">Quick start</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p> We use here the command-line, non-interactive CMake interface </p>
-
-<ol>
-
- <li><p><a href="http://www.cmake.org/cmake/resources/software.html">Download</a>
- and install CMake. Version 2.8 is the minimum required.</p>
-
- <li><p>Open a shell. Your development tools must be reachable from this
- shell through the PATH environment variable.</p>
-
- <li><p>Create a directory for containing the build. It is not
- supported to build LLVM on the source directory. cd to this
- directory:</p>
- <div class="doc_code">
- <p><tt>mkdir mybuilddir</tt></p>
- <p><tt>cd mybuilddir</tt></p>
- </div>
-
- <li><p>Execute this command on the shell
- replacing <i>path/to/llvm/source/root</i> with the path to the
- root of your LLVM source tree:</p>
- <div class="doc_code">
- <p><tt>cmake path/to/llvm/source/root</tt></p>
- </div>
-
- <p>CMake will detect your development environment, perform a
- series of test and generate the files required for building
- LLVM. CMake will use default values for all build
- parameters. See the <a href="#options">Options and variables</a>
- section for fine-tuning your build</p>
-
- <p>This can fail if CMake can't detect your toolset, or if it
- thinks that the environment is not sane enough. On this case
- make sure that the toolset that you intend to use is the only
- one reachable from the shell and that the shell itself is the
- correct one for you development environment. CMake will refuse
- to build MinGW makefiles if you have a POSIX shell reachable
- through the PATH environment variable, for instance. You can
- force CMake to use a given build tool, see
- the <a href="#usage">Usage</a> section.</p>
-
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="usage">Basic CMake usage</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
- <p>This section explains basic aspects of CMake, mostly for
- explaining those options which you may need on your day-to-day
- usage.</p>
-
- <p>CMake comes with extensive documentation in the form of html
- files and on the cmake executable itself. Execute <i>cmake
- --help</i> for further help options.</p>
-
- <p>CMake requires to know for which build tool it shall generate
- files (GNU make, Visual Studio, Xcode, etc). If not specified on
- the command line, it tries to guess it based on you
- environment. Once identified the build tool, CMake uses the
- corresponding <i>Generator</i> for creating files for your build
- tool. You can explicitly specify the generator with the command
- line option <i>-G "Name of the generator"</i>. For knowing the
- available generators on your platform, execute</p>
-
- <div class="doc_code">
- <p><tt>cmake --help</tt></p>
- </div>
-
- <p>This will list the generator's names at the end of the help
- text. Generator's names are case-sensitive. Example:</p>
-
- <div class="doc_code">
- <p><tt>cmake -G "Visual Studio 9 2008" path/to/llvm/source/root</tt></p>
- </div>
-
- <p>For a given development platform there can be more than one
- adequate generator. If you use Visual Studio "NMake Makefiles"
- is a generator you can use for building with NMake. By default,
- CMake chooses the more specific generator supported by your
- development environment. If you want an alternative generator,
- you must tell this to CMake with the <i>-G</i> option.</p>
-
- <p>TODO: explain variables and cache. Move explanation here from
- #options section.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="options">Options and variables</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
- <p>Variables customize how the build will be generated. Options are
- boolean variables, with possible values ON/OFF. Options and
- variables are defined on the CMake command line like this:</p>
-
- <div class="doc_code">
- <p><tt>cmake -DVARIABLE=value path/to/llvm/source</tt></p>
- </div>
-
- <p>You can set a variable after the initial CMake invocation for
- changing its value. You can also undefine a variable:</p>
-
- <div class="doc_code">
- <p><tt>cmake -UVARIABLE path/to/llvm/source</tt></p>
- </div>
-
- <p>Variables are stored on the CMake cache. This is a file
- named <tt>CMakeCache.txt</tt> on the root of the build
- directory. Do not hand-edit it.</p>
-
- <p>Variables are listed here appending its type after a colon. It is
- correct to write the variable and the type on the CMake command
- line:</p>
-
- <div class="doc_code">
- <p><tt>cmake -DVARIABLE:TYPE=value path/to/llvm/source</tt></p>
- </div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="freccmake">Frequently-used CMake variables</a>
-</h3>
-
-<div>
-
-<p>Here are listed some of the CMake variables that are used often,
- along with a brief explanation and LLVM-specific notes. For full
- documentation, check the CMake docs or execute <i>cmake
- --help-variable VARIABLE_NAME</i>.</p>
-
-<dl>
- <dt><b>CMAKE_BUILD_TYPE</b>:STRING</dt>
-
- <dd>Sets the build type for <i>make</i> based generators. Possible
- values are Release, Debug, RelWithDebInfo and MinSizeRel. On
- systems like Visual Studio the user sets the build type with the IDE
- settings.</dd>
-
- <dt><b>CMAKE_INSTALL_PREFIX</b>:PATH</dt>
- <dd>Path where LLVM will be installed if "make install" is invoked
- or the "INSTALL" target is built.</dd>
-
- <dt><b>LLVM_LIBDIR_SUFFIX</b>:STRING</dt>
- <dd>Extra suffix to append to the directory where libraries are to
- be installed. On a 64-bit architecture, one could use
- -DLLVM_LIBDIR_SUFFIX=64 to install libraries to /usr/lib64.</dd>
-
- <dt><b>CMAKE_C_FLAGS</b>:STRING</dt>
- <dd>Extra flags to use when compiling C source files.</dd>
-
- <dt><b>CMAKE_CXX_FLAGS</b>:STRING</dt>
- <dd>Extra flags to use when compiling C++ source files.</dd>
-
- <dt><b>BUILD_SHARED_LIBS</b>:BOOL</dt>
- <dd>Flag indicating is shared libraries will be built. Its default
- value is OFF. Shared libraries are not supported on Windows and
- not recommended in the other OSes.</dd>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="llvmvars">LLVM-specific variables</a>
-</h3>
-
-<div>
-
-<dl>
- <dt><b>LLVM_TARGETS_TO_BUILD</b>:STRING</dt>
- <dd>Semicolon-separated list of targets to build, or <i>all</i> for
- building all targets. Case-sensitive. For Visual C++ defaults
- to <i>X86</i>. On the other cases defaults to <i>all</i>. Example:
- <i>-DLLVM_TARGETS_TO_BUILD="X86;PowerPC"</i>.</dd>
-
- <dt><b>LLVM_BUILD_TOOLS</b>:BOOL</dt>
- <dd>Build LLVM tools. Defaults to ON. Targets for building each tool
- are generated in any case. You can build an tool separately by
- invoking its target. For example, you can build <i>llvm-as</i>
- with a makefile-based system executing <i>make llvm-as</i> on the
- root of your build directory.</dd>
-
- <dt><b>LLVM_INCLUDE_TOOLS</b>:BOOL</dt>
- <dd>Generate build targets for the LLVM tools. Defaults to
- ON. You can use that option for disabling the generation of build
- targets for the LLVM tools.</dd>
-
- <dt><b>LLVM_BUILD_EXAMPLES</b>:BOOL</dt>
- <dd>Build LLVM examples. Defaults to OFF. Targets for building each
- example are generated in any case. See documentation
- for <i>LLVM_BUILD_TOOLS</i> above for more details.</dd>
-
- <dt><b>LLVM_INCLUDE_EXAMPLES</b>:BOOL</dt>
- <dd>Generate build targets for the LLVM examples. Defaults to
- ON. You can use that option for disabling the generation of build
- targets for the LLVM examples.</dd>
-
- <dt><b>LLVM_BUILD_TESTS</b>:BOOL</dt>
- <dd>Build LLVM unit tests. Defaults to OFF. Targets for building
- each unit test are generated in any case. You can build a specific
- unit test with the target <i>UnitTestNameTests</i> (where at this
- time <i>UnitTestName</i> can be ADT, Analysis, ExecutionEngine,
- JIT, Support, Transform, VMCore; see the subdirectories
- of <i>unittests</i> for an updated list.) It is possible to build
- all unit tests with the target <i>UnitTests</i>.</dd>
-
- <dt><b>LLVM_INCLUDE_TESTS</b>:BOOL</dt>
- <dd>Generate build targets for the LLVM unit tests. Defaults to
- ON. You can use that option for disabling the generation of build
- targets for the LLVM unit tests.</dd>
-
- <dt><b>LLVM_APPEND_VC_REV</b>:BOOL</dt>
- <dd>Append version control revision info (svn revision number or git
- revision id) to LLVM version string (stored in the PACKAGE_VERSION
- macro). For this to work cmake must be invoked before the
- build. Defaults to OFF.</dd>
-
- <dt><b>LLVM_ENABLE_THREADS</b>:BOOL</dt>
- <dd>Build with threads support, if available. Defaults to ON.</dd>
-
- <dt><b>LLVM_ENABLE_ASSERTIONS</b>:BOOL</dt>
- <dd>Enables code assertions. Defaults to OFF if and only if
- CMAKE_BUILD_TYPE is <i>Release</i>.</dd>
-
- <dt><b>LLVM_ENABLE_PIC</b>:BOOL</dt>
- <dd>Add the <i>-fPIC</i> flag for the compiler command-line, if the
- compiler supports this flag. Some systems, like Windows, do not
- need this flag. Defaults to ON.</dd>
-
- <dt><b>LLVM_ENABLE_WARNINGS</b>:BOOL</dt>
- <dd>Enable all compiler warnings. Defaults to ON.</dd>
-
- <dt><b>LLVM_ENABLE_PEDANTIC</b>:BOOL</dt>
- <dd>Enable pedantic mode. This disable compiler specific extensions, is
- possible. Defaults to ON.</dd>
-
- <dt><b>LLVM_ENABLE_WERROR</b>:BOOL</dt>
- <dd>Stop and fail build, if a compiler warning is
- triggered. Defaults to OFF.</dd>
-
- <dt><b>LLVM_BUILD_32_BITS</b>:BOOL</dt>
- <dd>Build 32-bits executables and libraries on 64-bits systems. This
- option is available only on some 64-bits unix systems. Defaults to
- OFF.</dd>
-
- <dt><b>LLVM_TARGET_ARCH</b>:STRING</dt>
- <dd>LLVM target to use for native code generation. This is required
- for JIT generation. It defaults to "host", meaning that it shall
- pick the architecture of the machine where LLVM is being built. If
- you are cross-compiling, set it to the target architecture
- name.</dd>
-
- <dt><b>LLVM_TABLEGEN</b>:STRING</dt>
- <dd>Full path to a native TableGen executable (usually
- named <i>tblgen</i>). This is intented for cross-compiling: if the
- user sets this variable, no native TableGen will be created.</dd>
-
- <dt><b>LLVM_LIT_ARGS</b>:STRING</dt>
- <dd>Arguments given to lit.
- <tt>make check</tt> and <tt>make clang-test</tt> are affected.
- By default, <tt>&quot;-sv --no-progress-bar&quot;</tt>
- on Visual C++ and Xcode,
- <tt>&quot;-sv&quot;</tt> on others.</dd>
-
- <dt><b>LLVM_LIT_TOOLS_DIR</b>:PATH</dt>
- <dd>The path to GnuWin32 tools for tests. Valid on Windows host.
- Defaults to "", then Lit seeks tools according to %PATH%.
- Lit can find tools(eg. grep, sort, &amp;c) on LLVM_LIT_TOOLS_DIR at first,
- without specifying GnuWin32 to %PATH%.</dd>
-
- <dt><b>LLVM_ENABLE_FFI</b>:BOOL</dt>
- <dd>Indicates whether LLVM Interpreter will be linked with Foreign
- Function Interface library. If the library or its headers are
- installed on a custom location, you can set the variables
- FFI_INCLUDE_DIR and FFI_LIBRARY_DIR. Defaults to OFF.</dd>
-
- <dt><b>LLVM_CLANG_SOURCE_DIR</b>:PATH</dt>
- <dd>Path to Clang's source directory. Defaults to tools/clang.
- Clang will not be built when it is empty or it does not point valid
- path.</dd>
-
- <dt><b>LLVM_USE_OPROFILE</b>:BOOL</dt>
- <dd> Enable building OProfile JIT support. Defaults to OFF</dd>
-
- <dt><b>LLVM_USE_INTEL_JITEVENTS</b>:BOOL</dt>
- <dd> Enable building support for Intel JIT Events API. Defaults to OFF</dd>
-
- <dt><b>LLVM_INTEL_JITEVENTS_DIR</b>:PATH</dt>
- <dd> Path to installation of Intel(R) VTune(TM) Amplifier XE 2011,
- used to locate the <tt>jitprofiling</tt> library. Default =
- <tt>%VTUNE_AMPLIFIER_XE_2011_DIR%</tt> (Windows)
- | <tt>/opt/intel/vtune_amplifier_xe_2011</tt> (Linux) </dd>
-
-</dl>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="testing">Executing the test suite</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Testing is performed when the <i>check</i> target is built. For
- instance, if you are using makefiles, execute this command while on
- the top level of your build directory:</p>
-
-<div class="doc_code">
- <p><tt>make check</tt></p>
-</div>
-
-<p>On Visual Studio, you may run tests to build the project "check".</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="cross">Cross compiling</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>See <a href="http://www.vtk.org/Wiki/CMake_Cross_Compiling">this
- wiki page</a> for generic instructions on how to cross-compile
- with CMake. It goes into detailed explanations and may seem
- daunting, but it is not. On the wiki page there are several
- examples including toolchain files. Go directly to
- <a href="http://www.vtk.org/Wiki/CMake_Cross_Compiling#Information_how_to_set_up_various_cross_compiling_toolchains">this
- section</a> for a quick solution.</p>
-
-<p>Also see the <a href="#llvmvars">LLVM-specific variables</a>
- section for variables used when cross-compiling.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="embedding">Embedding LLVM in your project</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
- <p>The most difficult part of adding LLVM to the build of a project
- is to determine the set of LLVM libraries corresponding to the set
- of required LLVM features. What follows is an example of how to
- obtain this information:</p>
-
- <div class="doc_code">
- <pre>
- <b># A convenience variable:</b>
- set(LLVM_ROOT "" CACHE PATH "Root of LLVM install.")
- <b># A bit of a sanity check:</b>
- if( NOT EXISTS ${LLVM_ROOT}/include/llvm )
- message(FATAL_ERROR "LLVM_ROOT (${LLVM_ROOT}) is not a valid LLVM install")
- endif()
- <b># We incorporate the CMake features provided by LLVM:</b>
- set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${LLVM_ROOT}/share/llvm/cmake")
- include(LLVMConfig)
- <b># Now set the header and library paths:</b>
- include_directories( ${LLVM_INCLUDE_DIRS} )
- link_directories( ${LLVM_LIBRARY_DIRS} )
- add_definitions( ${LLVM_DEFINITIONS} )
- <b># Let's suppose we want to build a JIT compiler with support for
- # binary code (no interpreter):</b>
- llvm_map_components_to_libraries(REQ_LLVM_LIBRARIES jit native)
- <b># Finally, we link the LLVM libraries to our executable:</b>
- target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
- </pre>
- </div>
-
-  <p>This assumes that LLVM_ROOT points to an install of LLVM. The
-  procedure also works for uninstalled builds, although we need to take
-  care to add an <i>include_directories</i> for the location of the
-  headers in the LLVM source directory (if we are building
-  out-of-source).</p>
-
-  <p>Alternatively, you can use CMake's <i>find_package</i>
-  functionality. Here is an equivalent variant of the snippet shown above:</p>
-
- <div class="doc_code">
- <pre>
- find_package(LLVM)
-
- if( NOT LLVM_FOUND )
- message(FATAL_ERROR "LLVM package can't be found. Set CMAKE_PREFIX_PATH variable to LLVM's installation prefix.")
- endif()
-
- include_directories( ${LLVM_INCLUDE_DIRS} )
- link_directories( ${LLVM_LIBRARY_DIRS} )
-
- llvm_map_components_to_libraries(REQ_LLVM_LIBRARIES jit native)
-
- target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
- </pre>
- </div>
-
-<!-- ======================================================================= -->
-<h3>
-  <a name="passdev">Developing an LLVM pass out of source</a>
-</h3>
-
-<div>
-
-  <p>It is possible to develop LLVM passes against an installed LLVM.
-  An example project layout is provided below:</p>
-
- <div class="doc_code">
- <pre>
- &lt;project dir&gt;/
- |
- CMakeLists.txt
- &lt;pass name&gt;/
- |
- CMakeLists.txt
- Pass.cpp
- ...
- </pre>
- </div>
-
- <p>Contents of &lt;project dir&gt;/CMakeLists.txt:</p>
-
- <div class="doc_code">
- <pre>
- find_package(LLVM)
-
-  <b># Define add_llvm_* macros.</b>
- include(AddLLVM)
-
- add_definitions(${LLVM_DEFINITIONS})
- include_directories(${LLVM_INCLUDE_DIRS})
- link_directories(${LLVM_LIBRARY_DIRS})
-
- add_subdirectory(&lt;pass name&gt;)
- </pre>
- </div>
-
- <p>Contents of &lt;project dir&gt;/&lt;pass name&gt;/CMakeLists.txt:</p>
-
- <div class="doc_code">
- <pre>
- add_llvm_loadable_module(LLVMPassname
- Pass.cpp
- )
- </pre>
- </div>
-
-  <p>When you are done developing your pass, you may wish to integrate it
-  into the LLVM source tree. You can achieve this in two easy steps:<br>
-  1. Copy the &lt;pass name&gt; folder into the &lt;LLVM root&gt;/lib/Transforms directory.<br>
-  2. Add the line "add_subdirectory(&lt;pass name&gt;)" to &lt;LLVM root&gt;/lib/Transforms/CMakeLists.txt.</p>
-</div>
-<!-- *********************************************************************** -->
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="specifics">Compiler/Platform specific topics</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Notes for specific compilers and/or platforms.</p>
-
-<h3>
- <a name="msvc">Microsoft Visual C++</a>
-</h3>
-
-<div>
-
-<dl>
- <dt><b>LLVM_COMPILER_JOBS</b>:STRING</dt>
-  <dd>Specifies the maximum number of parallel compiler jobs to use
- per project when building with msbuild or Visual Studio. Only supported for
- Visual Studio 2008 and Visual Studio 2010 CMake generators. 0 means use all
- processors. Default is 0.</dd>
-</dl>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:ofv@wanadoo.es">Oscar Fuentes</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2010-08-09 03:59:36 +0100 (Mon, 9 Aug 2010) $
-</address>
-
-</body>
-</html>
diff --git a/docs/CMake.rst b/docs/CMake.rst
new file mode 100644
index 0000000..e1761c5
--- /dev/null
+++ b/docs/CMake.rst
@@ -0,0 +1,423 @@
+.. _building-with-cmake:
+
+========================
+Building LLVM with CMake
+========================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+`CMake <http://www.cmake.org/>`_ is a cross-platform build-generator tool. CMake
+does not build the project itself; it generates the files needed by your build
+tool (GNU make, Visual Studio, etc.) for building LLVM.
+
+If you are really anxious about getting a functional LLVM build, go to the
+`Quick start`_ section. If you are a CMake novice, start on `Basic CMake usage`_
+and then go back to the `Quick start`_ once you know what you are doing. The
+`Options and variables`_ section is a reference for customizing your build. If
+you already have experience with CMake, this is the recommended starting point.
+
+.. _Quick start:
+
+Quick start
+===========
+
+Here we use the command-line, non-interactive CMake interface.
+
+#. `Download <http://www.cmake.org/cmake/resources/software.html>`_ and install
+ CMake. Version 2.8 is the minimum required.
+
+#. Open a shell. Your development tools must be reachable from this shell
+ through the PATH environment variable.
+
+#. Create a directory to contain the build. Building LLVM in the source
+   directory is not supported. cd to this directory:
+
+ .. code-block:: bash
+
+ $ mkdir mybuilddir
+ $ cd mybuilddir
+
+#. Execute this command in the shell, replacing `path/to/llvm/source/root` with
+   the path to the root of your LLVM source tree:
+
+ .. code-block:: bash
+
+ $ cmake path/to/llvm/source/root
+
+   CMake will detect your development environment, perform a series of tests,
+   and generate the files required for building LLVM. CMake will use default
+   values for all build parameters. See the `Options and variables`_ section
+   for fine-tuning your build.
+
+   This can fail if CMake can't detect your toolset, or if it thinks that the
+   environment is not sane enough. In this case, make sure that the toolset that
+   you intend to use is the only one reachable from the shell and that the shell
+   itself is the correct one for your development environment. CMake will refuse
+   to build MinGW makefiles if you have a POSIX shell reachable through the PATH
+   environment variable, for instance. You can force CMake to use a given build
+   tool; see the `Usage`_ section.
+
+.. _Basic CMake usage:
+.. _Usage:
+
+Basic CMake usage
+=================
+
+This section explains basic aspects of CMake, mostly for explaining those
+options which you may need in your day-to-day usage.
+
+CMake comes with extensive documentation, in the form of HTML files and in the
+cmake executable itself. Execute ``cmake --help`` for further help options.
+
+CMake needs to know which build tool it should generate files for (GNU make,
+Visual Studio, Xcode, etc.). If not specified on the command line, it tries to
+guess based on your environment. Once the build tool is identified, CMake uses
+the corresponding *Generator* to create files for it. You can explicitly
+specify the generator with the command-line option ``-G "Name of the
+generator"``. To see the generators available on your platform, execute
+
+.. code-block:: bash
+
+ $ cmake --help
+
+This will list the generator names at the end of the help text. Generator
+names are case-sensitive. Example:
+
+.. code-block:: bash
+
+ $ cmake -G "Visual Studio 9 2008" path/to/llvm/source/root
+
+For a given development platform there can be more than one adequate
+generator. If you use Visual Studio, "NMake Makefiles" is a generator you can
+use for building with NMake. By default, CMake chooses the most specific
+generator supported by your development environment. If you want an alternative
+generator, you must tell CMake about it with the ``-G`` option, as shown below.
+
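+For instance, the following would select the NMake generator explicitly (a
+minimal sketch; the source path is a placeholder):
+
+.. code-block:: bash
+
+  $ cmake -G "NMake Makefiles" path/to/llvm/source/root
+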
+.. todo::
+
+ Explain variables and cache. Move explanation here from #options section.
+
+.. _Options and variables:
+
+Options and variables
+=====================
+
+Variables customize how the build will be generated. Options are boolean
+variables, with possible values ON/OFF. Options and variables are defined on the
+CMake command line like this:
+
+.. code-block:: bash
+
+ $ cmake -DVARIABLE=value path/to/llvm/source
+
+You can set a variable after the initial CMake invocation to change its
+value. You can also undefine a variable:
+
+.. code-block:: bash
+
+ $ cmake -UVARIABLE path/to/llvm/source
+
+Variables are stored in the CMake cache. This is a file named ``CMakeCache.txt``
+in the root of the build directory. Do not hand-edit it.
+
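+If you want to inspect the cached variables instead, you can ask CMake to list
+them from the build directory (``-LH`` also prints each variable's help
+string):
+
+.. code-block:: bash
+
+  $ cmake -LH .
+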
+Variables are listed below with their type appended after a colon. You can also
+specify the variable and its type on the CMake command line:
+
+.. code-block:: bash
+
+ $ cmake -DVARIABLE:TYPE=value path/to/llvm/source
+
+Frequently-used CMake variables
+-------------------------------
+
+Here are some of the CMake variables that are used often, along with a
+brief explanation and LLVM-specific notes. For full documentation, check the
+CMake docs or execute ``cmake --help-variable VARIABLE_NAME``.
+
+**CMAKE_BUILD_TYPE**:STRING
+  Sets the build type for ``make``-based generators. Possible values are
+ Release, Debug, RelWithDebInfo and MinSizeRel. On systems like Visual Studio
+ the user sets the build type with the IDE settings.
+
+**CMAKE_INSTALL_PREFIX**:PATH
+ Path where LLVM will be installed if "make install" is invoked or the
+ "INSTALL" target is built.
+
+**LLVM_LIBDIR_SUFFIX**:STRING
+ Extra suffix to append to the directory where libraries are to be
+ installed. On a 64-bit architecture, one could use ``-DLLVM_LIBDIR_SUFFIX=64``
+ to install libraries to ``/usr/lib64``.
+
+**CMAKE_C_FLAGS**:STRING
+ Extra flags to use when compiling C source files.
+
+**CMAKE_CXX_FLAGS**:STRING
+ Extra flags to use when compiling C++ source files.
+
+**BUILD_SHARED_LIBS**:BOOL
+  Flag indicating whether shared libraries will be built. Its default value is
+  OFF. Shared libraries are not supported on Windows and not recommended on the
+  other OSes.
+
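+For example, a release build that installs into a custom prefix could combine
+several of these variables in one invocation (the paths here are illustrative
+only):
+
+.. code-block:: bash
+
+  $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/opt/llvm \
+      path/to/llvm/source/root
+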
+.. _LLVM-specific variables:
+
+LLVM-specific variables
+-----------------------
+
+**LLVM_TARGETS_TO_BUILD**:STRING
+  Semicolon-separated list of targets to build, or *all* for building all
+  targets. Case-sensitive. For Visual C++ it defaults to *X86*; in all other
+  cases it defaults to *all*. Example: ``-DLLVM_TARGETS_TO_BUILD="X86;PowerPC"``.
+
+**LLVM_BUILD_TOOLS**:BOOL
+  Build LLVM tools. Defaults to ON. Targets for building each tool are generated
+  in any case. You can build a tool separately by invoking its target. For
+  example, you can build *llvm-as* with a makefile-based system by executing
+  *make llvm-as* in the root of your build directory.
+
+**LLVM_INCLUDE_TOOLS**:BOOL
+  Generate build targets for the LLVM tools. Defaults to ON. You can use this
+  option to disable the generation of build targets for the LLVM tools.
+
+**LLVM_BUILD_EXAMPLES**:BOOL
+ Build LLVM examples. Defaults to OFF. Targets for building each example are
+ generated in any case. See documentation for *LLVM_BUILD_TOOLS* above for more
+ details.
+
+**LLVM_INCLUDE_EXAMPLES**:BOOL
+  Generate build targets for the LLVM examples. Defaults to ON. You can use
+  this option to disable the generation of build targets for the LLVM examples.
+
+**LLVM_BUILD_TESTS**:BOOL
+ Build LLVM unit tests. Defaults to OFF. Targets for building each unit test
+ are generated in any case. You can build a specific unit test with the target
+ *UnitTestNameTests* (where at this time *UnitTestName* can be ADT, Analysis,
+ ExecutionEngine, JIT, Support, Transform, VMCore; see the subdirectories of
+ *unittests* for an updated list.) It is possible to build all unit tests with
+ the target *UnitTests*.
+
+**LLVM_INCLUDE_TESTS**:BOOL
+  Generate build targets for the LLVM unit tests. Defaults to ON. You can use
+  this option to disable the generation of build targets for the LLVM unit
+  tests.
+
+**LLVM_APPEND_VC_REV**:BOOL
+  Append version control revision info (svn revision number or git revision id)
+  to the LLVM version string (stored in the PACKAGE_VERSION macro). For this to
+  work, cmake must be invoked before the build. Defaults to OFF.
+
+**LLVM_ENABLE_THREADS**:BOOL
+ Build with threads support, if available. Defaults to ON.
+
+**LLVM_ENABLE_ASSERTIONS**:BOOL
+ Enables code assertions. Defaults to OFF if and only if ``CMAKE_BUILD_TYPE``
+ is *Release*.
+
+**LLVM_ENABLE_PIC**:BOOL
+  Add the ``-fPIC`` flag to the compiler command line, if the compiler supports
+ this flag. Some systems, like Windows, do not need this flag. Defaults to ON.
+
+**LLVM_ENABLE_WARNINGS**:BOOL
+ Enable all compiler warnings. Defaults to ON.
+
+**LLVM_ENABLE_PEDANTIC**:BOOL
+  Enable pedantic mode. This disables compiler-specific extensions, if
+  possible. Defaults to ON.
+
+**LLVM_ENABLE_WERROR**:BOOL
+  Stop and fail the build if a compiler warning is triggered. Defaults to OFF.
+
+**LLVM_BUILD_32_BITS**:BOOL
+  Build 32-bit executables and libraries on 64-bit systems. This option is
+  available only on some 64-bit Unix systems. Defaults to OFF.
+
+**LLVM_TARGET_ARCH**:STRING
+ LLVM target to use for native code generation. This is required for JIT
+  generation. It defaults to "host", meaning that it will pick the architecture
+ of the machine where LLVM is being built. If you are cross-compiling, set it
+ to the target architecture name.
+
+**LLVM_TABLEGEN**:STRING
+ Full path to a native TableGen executable (usually named ``tblgen``). This is
+ intended for cross-compiling: if the user sets this variable, no native
+ TableGen will be created.
+
+**LLVM_LIT_ARGS**:STRING
+ Arguments given to lit. ``make check`` and ``make clang-test`` are affected.
+ By default, ``'-sv --no-progress-bar'`` on Visual C++ and Xcode, ``'-sv'`` on
+ others.
+
+**LLVM_LIT_TOOLS_DIR**:PATH
+  The path to GnuWin32 tools for tests. Valid on a Windows host. Defaults to
+  "", in which case lit looks up the tools through %PATH%. When set, lit looks
+  for the tools (e.g. grep, sort, etc.) in LLVM_LIT_TOOLS_DIR first, without
+  requiring GnuWin32 to be on %PATH%.
+
+**LLVM_ENABLE_FFI**:BOOL
+  Indicates whether the LLVM Interpreter will be linked with the Foreign
+  Function Interface library. If the library or its headers are installed in a
+  custom location, you can set the variables FFI_INCLUDE_DIR and
+  FFI_LIBRARY_DIR. Defaults to OFF.
+
+**LLVM_EXTERNAL_{CLANG,LLD,POLLY}_SOURCE_DIR**:PATH
+  Path to ``{Clang,lld,Polly}``\'s source directory. Defaults to
+  ``tools/{clang,lld,polly}``. ``{Clang,lld,Polly}`` will not be built when
+  this is empty or does not point to a valid path.
+
+**LLVM_USE_OPROFILE**:BOOL
+  Enable building OProfile JIT support. Defaults to OFF.
+
+**LLVM_USE_INTEL_JITEVENTS**:BOOL
+  Enable building support for the Intel JIT Events API. Defaults to OFF.
+
+**LLVM_INTEL_JITEVENTS_DIR**:PATH
+ Path to installation of Intel(R) VTune(TM) Amplifier XE 2011, used to locate
+ the ``jitprofiling`` library. Default = ``%VTUNE_AMPLIFIER_XE_2011_DIR%``
+ (Windows) | ``/opt/intel/vtune_amplifier_xe_2011`` (Linux)
+
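+As an example of combining several LLVM-specific variables, the following
+configures a build restricted to the X86 and ARM backends with assertions
+enabled (an illustrative invocation, not a recommendation):
+
+.. code-block:: bash
+
+  $ cmake -DLLVM_TARGETS_TO_BUILD="X86;ARM" -DLLVM_ENABLE_ASSERTIONS=ON \
+      path/to/llvm/source/root
+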
+Executing the test suite
+========================
+
+Testing is performed when the *check* target is built. For instance, if you are
+using makefiles, execute this command while on the top level of your build
+directory:
+
+.. code-block:: bash
+
+ $ make check
+
+On Visual Studio, you may run the tests by building the project "check".
+
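+How the tests are run can be tuned at configure time through the
+``LLVM_LIT_ARGS`` variable described above. For example, to make the test
+output more verbose (a minimal sketch):
+
+.. code-block:: bash
+
+  $ cmake -DLLVM_LIT_ARGS="-v" path/to/llvm/source/root
+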
+Cross compiling
+===============
+
+See `this wiki page <http://www.vtk.org/Wiki/CMake_Cross_Compiling>`_ for
+generic instructions on how to cross-compile with CMake. It goes into detailed
+explanations and may seem daunting, but it is not as complicated as it
+looks. The wiki page contains several examples, including toolchain files. Go
+directly to `this section
+<http://www.vtk.org/Wiki/CMake_Cross_Compiling#Information_how_to_set_up_various_cross_compiling_toolchains>`_
+for a quick solution.
+
+Also see the `LLVM-specific variables`_ section for variables used when
+cross-compiling.
+
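+A minimal sketch of a cross-compiling invocation, assuming you already have a
+CMake toolchain file for the target and a native ``tblgen`` built on the host
+(all paths below are placeholders):
+
+.. code-block:: bash
+
+  $ cmake -DCMAKE_TOOLCHAIN_FILE=/path/to/Toolchain.cmake \
+      -DLLVM_TABLEGEN=/path/to/native/bin/tblgen \
+      path/to/llvm/source/root
+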
+Embedding LLVM in your project
+==============================
+
+The most difficult part of adding LLVM to the build of a project is to determine
+the set of LLVM libraries corresponding to the set of required LLVM
+features. What follows is an example of how to obtain this information:
+
+.. code-block:: cmake
+
+ # A convenience variable:
+ set(LLVM_ROOT "" CACHE PATH "Root of LLVM install.")
+
+ # A bit of a sanity check:
+ if( NOT EXISTS ${LLVM_ROOT}/include/llvm )
+ message(FATAL_ERROR "LLVM_ROOT (${LLVM_ROOT}) is not a valid LLVM install")
+ endif()
+
+ # We incorporate the CMake features provided by LLVM:
+ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${LLVM_ROOT}/share/llvm/cmake")
+ include(LLVMConfig)
+
+ # Now set the header and library paths:
+ include_directories( ${LLVM_INCLUDE_DIRS} )
+ link_directories( ${LLVM_LIBRARY_DIRS} )
+ add_definitions( ${LLVM_DEFINITIONS} )
+
+ # Let's suppose we want to build a JIT compiler with support for
+ # binary code (no interpreter):
+ llvm_map_components_to_libraries(REQ_LLVM_LIBRARIES jit native)
+
+ # Finally, we link the LLVM libraries to our executable:
+ target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
+
+This assumes that LLVM_ROOT points to an install of LLVM. The procedure also
+works for uninstalled builds, although we need to take care to add an
+`include_directories` for the location of the headers in the LLVM source
+directory (if we are building out-of-source).
+
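+For instance, when building against an LLVM build tree, you might add something
+like the following (the source path is an assumption for illustration):
+
+.. code-block:: cmake
+
+  # For uninstalled builds the headers live in the source tree:
+  include_directories( "/path/to/llvm/source/include" )
+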
+Alternatively, you can use CMake's ``find_package`` functionality. Here is an
+equivalent variant of the snippet shown above:
+
+.. code-block:: cmake
+
+ find_package(LLVM)
+
+ if( NOT LLVM_FOUND )
+ message(FATAL_ERROR "LLVM package can't be found. Set CMAKE_PREFIX_PATH variable to LLVM's installation prefix.")
+ endif()
+
+ include_directories( ${LLVM_INCLUDE_DIRS} )
+ link_directories( ${LLVM_LIBRARY_DIRS} )
+
+ llvm_map_components_to_libraries(REQ_LLVM_LIBRARIES jit native)
+
+ target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
+
+Developing an LLVM pass out of source
+-------------------------------------
+
+It is possible to develop LLVM passes against an installed LLVM. An example
+project layout is provided below:
+
+.. code-block:: bash
+
+ <project dir>/
+ |
+ CMakeLists.txt
+ <pass name>/
+ |
+ CMakeLists.txt
+ Pass.cpp
+ ...
+
+Contents of ``<project dir>/CMakeLists.txt``:
+
+.. code-block:: cmake
+
+ find_package(LLVM)
+
+  # Define add_llvm_* macros.
+ include(AddLLVM)
+
+ add_definitions(${LLVM_DEFINITIONS})
+ include_directories(${LLVM_INCLUDE_DIRS})
+ link_directories(${LLVM_LIBRARY_DIRS})
+
+ add_subdirectory(<pass name>)
+
+Contents of ``<project dir>/<pass name>/CMakeLists.txt``:
+
+.. code-block:: cmake
+
+ add_llvm_loadable_module(LLVMPassname
+ Pass.cpp
+ )
+
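+Once built, the module can be loaded and exercised with ``opt``. A sketch,
+assuming the pass registers itself under the command-line name ``passname``
+(both names here are placeholders):
+
+.. code-block:: bash
+
+  $ opt -load path/to/LLVMPassname.so -passname < input.bc > output.bc
+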
+When you are done developing your pass, you may wish to integrate it
+into the LLVM source tree. You can achieve this in two easy steps:
+
+#. Copy the ``<pass name>`` folder into the ``<LLVM root>/lib/Transforms``
+   directory.
+
+#. Add the line ``add_subdirectory(<pass name>)`` to
+   ``<LLVM root>/lib/Transforms/CMakeLists.txt``.
+
+Compiler/Platform specific topics
+=================================
+
+Notes for specific compilers and/or platforms.
+
+Microsoft Visual C++
+--------------------
+
+**LLVM_COMPILER_JOBS**:STRING
+  Specifies the maximum number of parallel compiler jobs to use per project
+ when building with msbuild or Visual Studio. Only supported for Visual Studio
+ 2008 and Visual Studio 2010 CMake generators. 0 means use all
+ processors. Default is 0.
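+
+For instance, to limit each project to four parallel compiler jobs (an
+illustrative value; "Visual Studio 10" is the CMake 2.8 generator name for
+Visual Studio 2010):
+
+.. code-block:: bash
+
+  $ cmake -G "Visual Studio 10" -DLLVM_COMPILER_JOBS=4 path/to/llvm/source/root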
diff --git a/docs/CodeGenerator.html b/docs/CodeGenerator.html
deleted file mode 100644
index 2dc22c1..0000000
--- a/docs/CodeGenerator.html
+++ /dev/null
@@ -1,3189 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="content-type" content="text/html; charset=utf-8">
- <title>The LLVM Target-Independent Code Generator</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-
- <style type="text/css">
- .unknown { background-color: #C0C0C0; text-align: center; }
- .unknown:before { content: "?" }
- .no { background-color: #C11B17 }
- .no:before { content: "N" }
- .partial { background-color: #F88017 }
- .yes { background-color: #0F0; }
- .yes:before { content: "Y" }
- </style>
-
-</head>
-<body>
-
-<h1>
- The LLVM Target-Independent Code Generator
-</h1>
-
-<ol>
- <li><a href="#introduction">Introduction</a>
- <ul>
- <li><a href="#required">Required components in the code generator</a></li>
- <li><a href="#high-level-design">The high-level design of the code
- generator</a></li>
- <li><a href="#tablegen">Using TableGen for target description</a></li>
- </ul>
- </li>
- <li><a href="#targetdesc">Target description classes</a>
- <ul>
- <li><a href="#targetmachine">The <tt>TargetMachine</tt> class</a></li>
- <li><a href="#targetdata">The <tt>TargetData</tt> class</a></li>
- <li><a href="#targetlowering">The <tt>TargetLowering</tt> class</a></li>
- <li><a href="#targetregisterinfo">The <tt>TargetRegisterInfo</tt> class</a></li>
- <li><a href="#targetinstrinfo">The <tt>TargetInstrInfo</tt> class</a></li>
- <li><a href="#targetframeinfo">The <tt>TargetFrameInfo</tt> class</a></li>
- <li><a href="#targetsubtarget">The <tt>TargetSubtarget</tt> class</a></li>
- <li><a href="#targetjitinfo">The <tt>TargetJITInfo</tt> class</a></li>
- </ul>
- </li>
- <li><a href="#codegendesc">The "Machine" Code Generator classes</a>
- <ul>
- <li><a href="#machineinstr">The <tt>MachineInstr</tt> class</a></li>
- <li><a href="#machinebasicblock">The <tt>MachineBasicBlock</tt>
- class</a></li>
- <li><a href="#machinefunction">The <tt>MachineFunction</tt> class</a></li>
- <li><a href="#machineinstrbundle"><tt>MachineInstr Bundles</tt></a></li>
- </ul>
- </li>
- <li><a href="#mc">The "MC" Layer</a>
- <ul>
- <li><a href="#mcstreamer">The <tt>MCStreamer</tt> API</a></li>
- <li><a href="#mccontext">The <tt>MCContext</tt> class</a>
- <li><a href="#mcsymbol">The <tt>MCSymbol</tt> class</a></li>
- <li><a href="#mcsection">The <tt>MCSection</tt> class</a></li>
- <li><a href="#mcinst">The <tt>MCInst</tt> class</a></li>
- </ul>
- </li>
- <li><a href="#codegenalgs">Target-independent code generation algorithms</a>
- <ul>
- <li><a href="#instselect">Instruction Selection</a>
- <ul>
- <li><a href="#selectiondag_intro">Introduction to SelectionDAGs</a></li>
- <li><a href="#selectiondag_process">SelectionDAG Code Generation
- Process</a></li>
- <li><a href="#selectiondag_build">Initial SelectionDAG
- Construction</a></li>
- <li><a href="#selectiondag_legalize_types">SelectionDAG LegalizeTypes Phase</a></li>
- <li><a href="#selectiondag_legalize">SelectionDAG Legalize Phase</a></li>
- <li><a href="#selectiondag_optimize">SelectionDAG Optimization
- Phase: the DAG Combiner</a></li>
- <li><a href="#selectiondag_select">SelectionDAG Select Phase</a></li>
- <li><a href="#selectiondag_sched">SelectionDAG Scheduling and Formation
- Phase</a></li>
- <li><a href="#selectiondag_future">Future directions for the
- SelectionDAG</a></li>
- </ul></li>
- <li><a href="#liveintervals">Live Intervals</a>
- <ul>
- <li><a href="#livevariable_analysis">Live Variable Analysis</a></li>
- <li><a href="#liveintervals_analysis">Live Intervals Analysis</a></li>
- </ul></li>
- <li><a href="#regalloc">Register Allocation</a>
- <ul>
- <li><a href="#regAlloc_represent">How registers are represented in
- LLVM</a></li>
- <li><a href="#regAlloc_howTo">Mapping virtual registers to physical
- registers</a></li>
- <li><a href="#regAlloc_twoAddr">Handling two address instructions</a></li>
- <li><a href="#regAlloc_ssaDecon">The SSA deconstruction phase</a></li>
- <li><a href="#regAlloc_fold">Instruction folding</a></li>
- <li><a href="#regAlloc_builtIn">Built in register allocators</a></li>
- </ul></li>
- <li><a href="#codeemit">Code Emission</a></li>
- <li><a href="#vliw_packetizer">VLIW Packetizer</a>
- <ul>
- <li><a href="#vliw_mapping">Mapping from instructions to functional
- units</a></li>
- <li><a href="#vliw_repr">How the packetization tables are
- generated and used</a></li>
- </ul>
- </li>
- </ul>
- </li>
- <li><a href="#nativeassembler">Implementing a Native Assembler</a></li>
-
- <li><a href="#targetimpls">Target-specific Implementation Notes</a>
- <ul>
- <li><a href="#targetfeatures">Target Feature Matrix</a></li>
- <li><a href="#tailcallopt">Tail call optimization</a></li>
- <li><a href="#sibcallopt">Sibling call optimization</a></li>
- <li><a href="#x86">The X86 backend</a></li>
- <li><a href="#ppc">The PowerPC backend</a>
- <ul>
- <li><a href="#ppc_abi">LLVM PowerPC ABI</a></li>
- <li><a href="#ppc_frame">Frame Layout</a></li>
- <li><a href="#ppc_prolog">Prolog/Epilog</a></li>
- <li><a href="#ppc_dynamic">Dynamic Allocation</a></li>
- </ul></li>
- <li><a href="#ptx">The PTX backend</a></li>
- </ul></li>
-
-</ol>
-
-<div class="doc_author">
- <p>Written by the LLVM Team.</p>
-</div>
-
-<div class="doc_warning">
- <p>Warning: This is a work in progress.</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="introduction">Introduction</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The LLVM target-independent code generator is a framework that provides a
- suite of reusable components for translating the LLVM internal representation
- to the machine code for a specified target&mdash;either in assembly form
- (suitable for a static compiler) or in binary machine code format (usable for
- a JIT compiler). The LLVM target-independent code generator consists of six
- main components:</p>
-
-<ol>
- <li><a href="#targetdesc">Abstract target description</a> interfaces which
- capture important properties about various aspects of the machine,
- independently of how they will be used. These interfaces are defined in
- <tt>include/llvm/Target/</tt>.</li>
-
- <li>Classes used to represent the <a href="#codegendesc">code being
- generated</a> for a target. These classes are intended to be abstract
- enough to represent the machine code for <i>any</i> target machine. These
- classes are defined in <tt>include/llvm/CodeGen/</tt>. At this level,
- concepts like "constant pool entries" and "jump tables" are explicitly
- exposed.</li>
-
-  <li>Classes and algorithms used to represent code at the object file level,
- the <a href="#mc">MC Layer</a>. These classes represent assembly level
- constructs like labels, sections, and instructions. At this level,
- concepts like "constant pool entries" and "jump tables" don't exist.</li>
-
- <li><a href="#codegenalgs">Target-independent algorithms</a> used to implement
- various phases of native code generation (register allocation, scheduling,
- stack frame representation, etc). This code lives
- in <tt>lib/CodeGen/</tt>.</li>
-
- <li><a href="#targetimpls">Implementations of the abstract target description
- interfaces</a> for particular targets. These machine descriptions make
- use of the components provided by LLVM, and can optionally provide custom
- target-specific passes, to build complete code generators for a specific
- target. Target descriptions live in <tt>lib/Target/</tt>.</li>
-
-  <li><a href="#jit">The target-independent JIT components</a>.  The LLVM JIT is
-      completely target independent (it uses the <tt>TargetJITInfo</tt>
-      structure to interface with target-specific code).  The code for the
-      target-independent JIT lives in <tt>lib/ExecutionEngine/JIT</tt>.</li>
-</ol>
-
-<p>Depending on which part of the code generator you are interested in working
- on, different pieces of this will be useful to you. In any case, you should
- be familiar with the <a href="#targetdesc">target description</a>
- and <a href="#codegendesc">machine code representation</a> classes. If you
- want to add a backend for a new target, you will need
- to <a href="#targetimpls">implement the target description</a> classes for
- your new target and understand the <a href="LangRef.html">LLVM code
- representation</a>. If you are interested in implementing a
- new <a href="#codegenalgs">code generation algorithm</a>, it should only
- depend on the target-description and machine code representation classes,
- ensuring that it is portable.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="required">Required components in the code generator</a>
-</h3>
-
-<div>
-
-<p>The two pieces of the LLVM code generator are the high-level interface to the
- code generator and the set of reusable components that can be used to build
- target-specific backends. The two most important interfaces
- (<a href="#targetmachine"><tt>TargetMachine</tt></a>
- and <a href="#targetdata"><tt>TargetData</tt></a>) are the only ones that are
- required to be defined for a backend to fit into the LLVM system, but the
- others must be defined if the reusable code generator components are going to
- be used.</p>
-
-<p>This design has two important implications. The first is that LLVM can
- support completely non-traditional code generation targets. For example, the
- C backend does not require register allocation, instruction selection, or any
- of the other standard components provided by the system. As such, it only
- implements these two interfaces, and does its own thing. Another example of
- a code generator like this is a (purely hypothetical) backend that converts
- LLVM to the GCC RTL form and uses GCC to emit machine code for a target.</p>
-
-<p>This design also implies that it is possible to design and implement
- radically different code generators in the LLVM system that do not make use
- of any of the built-in components. Doing so is not recommended at all, but
- could be required for radically different targets that do not fit into the
- LLVM machine description model: FPGAs for example.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="high-level-design">The high-level design of the code generator</a>
-</h3>
-
-<div>
-
-<p>The LLVM target-independent code generator is designed to support efficient
- and quality code generation for standard register-based microprocessors.
- Code generation in this model is divided into the following stages:</p>
-
-<ol>
- <li><b><a href="#instselect">Instruction Selection</a></b> &mdash; This phase
- determines an efficient way to express the input LLVM code in the target
- instruction set. This stage produces the initial code for the program in
- the target instruction set, then makes use of virtual registers in SSA
- form and physical registers that represent any required register
- assignments due to target constraints or calling conventions. This step
- turns the LLVM code into a DAG of target instructions.</li>
-
- <li><b><a href="#selectiondag_sched">Scheduling and Formation</a></b> &mdash;
- This phase takes the DAG of target instructions produced by the
- instruction selection phase, determines an ordering of the instructions,
- then emits the instructions
- as <tt><a href="#machineinstr">MachineInstr</a></tt>s with that ordering.
- Note that we describe this in the <a href="#instselect">instruction
- selection section</a> because it operates on
- a <a href="#selectiondag_intro">SelectionDAG</a>.</li>
-
- <li><b><a href="#ssamco">SSA-based Machine Code Optimizations</a></b> &mdash;
- This optional stage consists of a series of machine-code optimizations
- that operate on the SSA-form produced by the instruction selector.
- Optimizations like modulo-scheduling or peephole optimization work
- here.</li>
-
- <li><b><a href="#regalloc">Register Allocation</a></b> &mdash; The target code
- is transformed from an infinite virtual register file in SSA form to the
- concrete register file used by the target. This phase introduces spill
- code and eliminates all virtual register references from the program.</li>
-
- <li><b><a href="#proepicode">Prolog/Epilog Code Insertion</a></b> &mdash; Once
- the machine code has been generated for the function and the amount of
- stack space required is known (used for LLVM alloca's and spill slots),
- the prolog and epilog code for the function can be inserted and "abstract
- stack location references" can be eliminated. This stage is responsible
- for implementing optimizations like frame-pointer elimination and stack
- packing.</li>
-
- <li><b><a href="#latemco">Late Machine Code Optimizations</a></b> &mdash;
- Optimizations that operate on "final" machine code can go here, such as
- spill code scheduling and peephole optimizations.</li>
-
- <li><b><a href="#codeemit">Code Emission</a></b> &mdash; The final stage
- actually puts out the code for the current function, either in the target
- assembler format or in machine code.</li>
-</ol>
-
-<p>The code generator is based on the assumption that the instruction selector
- will use an optimal pattern matching selector to create high-quality
- sequences of native instructions. Alternative code generator designs based
- on pattern expansion and aggressive iterative peephole optimization are much
- slower. This design permits efficient compilation (important for JIT
- environments) and aggressive optimization (used when generating code offline)
- by allowing components of varying levels of sophistication to be used for any
- step of compilation.</p>
-
-<p>In addition to these stages, target implementations can insert arbitrary
- target-specific passes into the flow. For example, the X86 target uses a
- special pass to handle the 80x87 floating point stack architecture. Other
- targets with unusual requirements can be supported with custom passes as
- needed.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="tablegen">Using TableGen for target description</a>
-</h3>
-
-<div>
-
-<p>The target description classes require a detailed description of the target
- architecture. These target descriptions often have a large amount of common
- information (e.g., an <tt>add</tt> instruction is almost identical to a
- <tt>sub</tt> instruction). In order to allow the maximum amount of
- commonality to be factored out, the LLVM code generator uses
- the <a href="TableGenFundamentals.html">TableGen</a> tool to describe big
- chunks of the target machine, which allows the use of domain-specific and
- target-specific abstractions to reduce the amount of repetition.</p>
-
-<p>As LLVM continues to be developed and refined, we plan to move more and more
- of the target description to the <tt>.td</tt> form. Doing so gives us a
- number of advantages. The most important is that it makes it easier to port
- LLVM because it reduces the amount of C++ code that has to be written, and
- the surface area of the code generator that needs to be understood before
- someone can get something working. Second, it makes it easier to change
- things. In particular, if tables and other things are all emitted
- by <tt>tblgen</tt>, we only need a change in one place (<tt>tblgen</tt>) to
- update all of the targets to a new interface.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="targetdesc">Target description classes</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The LLVM target description classes (located in the
- <tt>include/llvm/Target</tt> directory) provide an abstract description of
- the target machine independent of any particular client. These classes are
- designed to capture the <i>abstract</i> properties of the target (such as the
- instructions and registers it has), and do not incorporate any particular
- pieces of code generation algorithms.</p>
-
-<p>All of the target description classes (except the
- <tt><a href="#targetdata">TargetData</a></tt> class) are designed to be
- subclassed by the concrete target implementation, and have virtual methods
- implemented. To get to these implementations, the
- <tt><a href="#targetmachine">TargetMachine</a></tt> class provides accessors
- that should be implemented by the target.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetmachine">The <tt>TargetMachine</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetMachine</tt> class provides virtual methods that are used to
- access the target-specific implementations of the various target description
- classes via the <tt>get*Info</tt> methods (<tt>getInstrInfo</tt>,
- <tt>getRegisterInfo</tt>, <tt>getFrameInfo</tt>, etc.). This class is
- designed to be specialized by a concrete target implementation
- (e.g., <tt>X86TargetMachine</tt>) which implements the various virtual
- methods. The only required target description class is
- the <a href="#targetdata"><tt>TargetData</tt></a> class, but if the code
- generator components are to be used, the other interfaces should be
- implemented as well.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetdata">The <tt>TargetData</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetData</tt> class is the only required target description class,
-   and it is the only class that is not extensible (you cannot derive a new
- class from it). <tt>TargetData</tt> specifies information about how the
- target lays out memory for structures, the alignment requirements for various
- data types, the size of pointers in the target, and whether the target is
- little-endian or big-endian.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetlowering">The <tt>TargetLowering</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetLowering</tt> class is used by SelectionDAG based instruction
- selectors primarily to describe how LLVM code should be lowered to
- SelectionDAG operations. Among other things, this class indicates:</p>
-
-<ul>
- <li>an initial register class to use for various <tt>ValueType</tt>s,</li>
-
- <li>which operations are natively supported by the target machine,</li>
-
- <li>the return type of <tt>setcc</tt> operations,</li>
-
- <li>the type to use for shift amounts, and</li>
-
- <li>various high-level characteristics, like whether it is profitable to turn
- division by a constant into a multiplication sequence</li>
-</ul>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetregisterinfo">The <tt>TargetRegisterInfo</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetRegisterInfo</tt> class is used to describe the register file
- of the target and any interactions between the registers.</p>
-
-<p>Registers are represented in the code generator by unsigned integers.
-   Physical registers (those that actually exist in the
- target description) are unique small numbers, and virtual registers are
- generally large. Note that register #0 is reserved as a flag value.</p>
-
-<p>Each register in the processor description has an associated
- <tt>TargetRegisterDesc</tt> entry, which provides a textual name for the
- register (used for assembly output and debugging dumps) and a set of aliases
- (used to indicate whether one register overlaps with another).</p>
-
-<p>In addition to the per-register description, the <tt>TargetRegisterInfo</tt>
- class exposes a set of processor specific register classes (instances of the
- <tt>TargetRegisterClass</tt> class). Each register class contains sets of
- registers that have the same properties (for example, they are all 32-bit
- integer registers). Each SSA virtual register created by the instruction
- selector has an associated register class. When the register allocator runs,
- it replaces virtual registers with a physical register in the set.</p>
-
-<p>The target-specific implementations of these classes are auto-generated from
- a <a href="TableGenFundamentals.html">TableGen</a> description of the
- register file.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetinstrinfo">The <tt>TargetInstrInfo</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetInstrInfo</tt> class is used to describe the machine
- instructions supported by the target. It is essentially an array of
- <tt>TargetInstrDescriptor</tt> objects, each of which describes one
- instruction the target supports. Descriptors define things like the mnemonic
- for the opcode, the number of operands, the list of implicit register uses
- and defs, whether the instruction has certain target-independent properties
- (accesses memory, is commutable, etc), and holds any target-specific
- flags.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetframeinfo">The <tt>TargetFrameInfo</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetFrameInfo</tt> class is used to provide information about the
- stack frame layout of the target. It holds the direction of stack growth, the
- known stack alignment on entry to each function, and the offset to the local
- area. The offset to the local area is the offset from the stack pointer on
- function entry to the first location where function data (local variables,
- spill locations) can be stored.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetsubtarget">The <tt>TargetSubtarget</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetSubtarget</tt> class is used to provide information about the
- specific chip set being targeted. A sub-target informs code generation of
- which instructions are supported, instruction latencies and instruction
- execution itinerary; i.e., which processing units are used, in what order,
- and for how long.</p>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetjitinfo">The <tt>TargetJITInfo</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>TargetJITInfo</tt> class exposes an abstract interface used by the
- Just-In-Time code generator to perform target-specific activities, such as
- emitting stubs. If a <tt>TargetMachine</tt> supports JIT code generation, it
- should provide one of these objects through the <tt>getJITInfo</tt>
- method.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="codegendesc">Machine code description classes</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>At the high-level, LLVM code is translated to a machine specific
- representation formed out of
- <a href="#machinefunction"><tt>MachineFunction</tt></a>,
- <a href="#machinebasicblock"><tt>MachineBasicBlock</tt></a>,
- and <a href="#machineinstr"><tt>MachineInstr</tt></a> instances (defined
- in <tt>include/llvm/CodeGen</tt>). This representation is completely target
- agnostic, representing instructions in their most abstract form: an opcode
- and a series of operands. This representation is designed to support both an
- SSA representation for machine code, as well as a register allocated, non-SSA
- form.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="machineinstr">The <tt>MachineInstr</tt> class</a>
-</h3>
-
-<div>
-
-<p>Target machine instructions are represented as instances of the
- <tt>MachineInstr</tt> class. This class is an extremely abstract way of
- representing machine instructions. In particular, it only keeps track of an
- opcode number and a set of operands.</p>
-
-<p>The opcode number is a simple unsigned integer that only has meaning to a
- specific backend. All of the instructions for a target should be defined in
- the <tt>*InstrInfo.td</tt> file for the target. The opcode enum values are
- auto-generated from this description. The <tt>MachineInstr</tt> class does
- not have any information about how to interpret the instruction (i.e., what
- the semantics of the instruction are); for that you must refer to the
- <tt><a href="#targetinstrinfo">TargetInstrInfo</a></tt> class.</p>
-
-<p>The operands of a machine instruction can be of several different types: a
- register reference, a constant integer, a basic block reference, etc. In
- addition, a machine operand should be marked as a def or a use of the value
- (though only registers are allowed to be defs).</p>
-
-<p>By convention, the LLVM code generator orders instruction operands so that
- all register definitions come before the register uses, even on architectures
- that are normally printed in other orders. For example, the SPARC add
-   instruction: "<tt>add %i1, %i2, %i3</tt>" adds the "%i1" and "%i2" registers
- and stores the result into the "%i3" register. In the LLVM code generator,
- the operands should be stored as "<tt>%i3, %i1, %i2</tt>": with the
- destination first.</p>
-
-<p>Keeping destination (definition) operands at the beginning of the operand
- list has several advantages. In particular, the debugging printer will print
- the instruction like this:</p>
-
-<div class="doc_code">
-<pre>
-%r3 = add %i1, %i2
-</pre>
-</div>
-
-<p>Also if the first operand is a def, it is easier to <a href="#buildmi">create
- instructions</a> whose only def is the first operand.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="buildmi">Using the <tt>MachineInstrBuilder.h</tt> functions</a>
-</h4>
-
-<div>
-
-<p>Machine instructions are created by using the <tt>BuildMI</tt> functions,
- located in the <tt>include/llvm/CodeGen/MachineInstrBuilder.h</tt> file. The
- <tt>BuildMI</tt> functions make it easy to build arbitrary machine
-   instructions.  Usage of the <tt>BuildMI</tt> functions looks like this:</p>
-
-<div class="doc_code">
-<pre>
-// Create a 'DestReg = mov 42' (rendered in X86 assembly as 'mov DestReg, 42')
-// instruction. The '1' specifies how many operands will be added.
-MachineInstr *MI = BuildMI(X86::MOV32ri, 1, DestReg).addImm(42);
-
-// Create the same instr, but insert it at the end of a basic block.
-MachineBasicBlock &amp;MBB = ...
-BuildMI(MBB, X86::MOV32ri, 1, DestReg).addImm(42);
-
-// Create the same instr, but insert it before a specified iterator point.
-MachineBasicBlock::iterator MBBI = ...
-BuildMI(MBB, MBBI, X86::MOV32ri, 1, DestReg).addImm(42);
-
-// Create a 'cmp Reg, 0' instruction, no destination reg.
-MI = BuildMI(X86::CMP32ri, 2).addReg(Reg).addImm(0);
-// Create an 'sahf' instruction which takes no operands and stores nothing.
-MI = BuildMI(X86::SAHF, 0);
-
-// Create a self looping branch instruction.
-BuildMI(MBB, X86::JNE, 1).addMBB(&amp;MBB);
-</pre>
-</div>
-
-<p>The key thing to remember with the <tt>BuildMI</tt> functions is that you
- have to specify the number of operands that the machine instruction will
-   take.  This allows for efficient memory allocation.  Note that operands
-   default to being uses of values, not definitions.  If you need to
- add a definition operand (other than the optional destination register), you
- must explicitly mark it as such:</p>
-
-<div class="doc_code">
-<pre>
-MI.addReg(Reg, RegState::Define);
-</pre>
-</div>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="fixedregs">Fixed (preassigned) registers</a>
-</h4>
-
-<div>
-
-<p>One important issue that the code generator needs to be aware of is the
- presence of fixed registers. In particular, there are often places in the
- instruction stream where the register allocator <em>must</em> arrange for a
- particular value to be in a particular register. This can occur due to
- limitations of the instruction set (e.g., the X86 can only do a 32-bit divide
- with the <tt>EAX</tt>/<tt>EDX</tt> registers), or external factors like
- calling conventions. In any case, the instruction selector should emit code
- that copies a virtual register into or out of a physical register when
- needed.</p>
-
-<p>For example, consider this simple LLVM example:</p>
-
-<div class="doc_code">
-<pre>
-define i32 @test(i32 %X, i32 %Y) {
- %Z = udiv i32 %X, %Y
- ret i32 %Z
-}
-</pre>
-</div>
-
-<p>The X86 instruction selector produces this machine code for the <tt>div</tt>
- and <tt>ret</tt> (use "<tt>llc X.bc -march=x86 -print-machineinstrs</tt>" to
- get this):</p>
-
-<div class="doc_code">
-<pre>
-;; Start of div
-%EAX = mov %reg1024 ;; Copy X (in reg1024) into EAX
-%reg1027 = sar %reg1024, 31
-%EDX = mov %reg1027 ;; Sign extend X into EDX
-idiv %reg1025 ;; Divide by Y (in reg1025)
-%reg1026 = mov %EAX ;; Read the result (Z) out of EAX
-
-;; Start of ret
-%EAX = mov %reg1026 ;; 32-bit return value goes in EAX
-ret
-</pre>
-</div>
-
-<p>By the end of code generation, the register allocator has coalesced the
- registers and deleted the resultant identity moves producing the following
- code:</p>
-
-<div class="doc_code">
-<pre>
-;; X is in EAX, Y is in ECX
-mov %EAX, %EDX
-sar %EDX, 31
-idiv %ECX
-ret
-</pre>
-</div>
-
-<p>This approach is extremely general (if it can handle the X86 architecture, it
- can handle anything!) and allows all of the target specific knowledge about
- the instruction stream to be isolated in the instruction selector. Note that
- physical registers should have a short lifetime for good code generation, and
- all physical registers are assumed dead on entry to and exit from basic
- blocks (before register allocation). Thus, if you need a value to be live
- across basic block boundaries, it <em>must</em> live in a virtual
- register.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="callclobber">Call-clobbered registers</a>
-</h4>
-
-<div>
-
-<p>Some machine instructions, like calls, clobber a large number of physical
- registers. Rather than adding <code>&lt;def,dead&gt;</code> operands for
- all of them, it is possible to use an <code>MO_RegisterMask</code> operand
- instead. The register mask operand holds a bit mask of preserved registers,
- and everything else is considered to be clobbered by the instruction. </p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ssa">Machine code in SSA form</a>
-</h4>
-
-<div>
-
-<p><tt>MachineInstr</tt>'s are initially selected in SSA-form, and are
- maintained in SSA-form until register allocation happens. For the most part,
- this is trivially simple since LLVM is already in SSA form; LLVM PHI nodes
- become machine code PHI nodes, and virtual registers are only allowed to have
- a single definition.</p>
-
-<p>After register allocation, machine code is no longer in SSA-form because
- there are no virtual registers left in the code.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="machinebasicblock">The <tt>MachineBasicBlock</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>MachineBasicBlock</tt> class contains a list of machine instructions
- (<tt><a href="#machineinstr">MachineInstr</a></tt> instances). It roughly
- corresponds to the LLVM code input to the instruction selector, but there can
- be a one-to-many mapping (i.e. one LLVM basic block can map to multiple
- machine basic blocks). The <tt>MachineBasicBlock</tt> class has a
- "<tt>getBasicBlock</tt>" method, which returns the LLVM basic block that it
- comes from.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="machinefunction">The <tt>MachineFunction</tt> class</a>
-</h3>
-
-<div>
-
-<p>The <tt>MachineFunction</tt> class contains a list of machine basic blocks
- (<tt><a href="#machinebasicblock">MachineBasicBlock</a></tt> instances). It
- corresponds one-to-one with the LLVM function input to the instruction
- selector. In addition to a list of basic blocks,
-   the <tt>MachineFunction</tt> contains a <tt>MachineConstantPool</tt>,
- a <tt>MachineFrameInfo</tt>, a <tt>MachineFunctionInfo</tt>, and a
- <tt>MachineRegisterInfo</tt>. See
- <tt>include/llvm/CodeGen/MachineFunction.h</tt> for more information.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="machineinstrbundle"><tt>MachineInstr Bundles</tt></a>
-</h3>
-
-<div>
-
-<p>The LLVM code generator can model sequences of instructions as MachineInstr
- bundles. A MI bundle can model a VLIW group / pack which contains an
- arbitrary number of parallel instructions. It can also be used to model
- a sequential list of instructions (potentially with data dependencies) that
- cannot be legally separated (e.g. ARM Thumb2 IT blocks).</p>
-
-<p>Conceptually a MI bundle is a MI with a number of other MIs nested within:
-</p>
-
-<div class="doc_code">
-<pre>
---------------
-| Bundle | ---------
--------------- \
- | ----------------
- | | MI |
- | ----------------
- | |
- | ----------------
- | | MI |
- | ----------------
- | |
- | ----------------
- | | MI |
- | ----------------
- |
---------------
-| Bundle | --------
--------------- \
- | ----------------
- | | MI |
- | ----------------
- | |
- | ----------------
- | | MI |
- | ----------------
- | |
- | ...
- |
---------------
-| Bundle | --------
--------------- \
- |
- ...
-</pre>
-</div>
-
-<p> MI bundle support does not change the physical representations of
- MachineBasicBlock and MachineInstr. All the MIs (including top level and
-    nested ones) are stored as a sequential list of MIs.  The "bundled" MIs are
-    marked with the 'InsideBundle' flag.  A top level MI with the special BUNDLE
-    opcode is used to represent the start of a bundle.  It's legal to mix BUNDLE
-    MIs with individual MIs that are neither inside bundles nor represent bundles.
-</p>
-
-<p> MachineInstr passes should operate on a MI bundle as a single unit. Member
- methods have been taught to correctly handle bundles and MIs inside bundles.
- The MachineBasicBlock iterator has been modified to skip over bundled MIs to
- enforce the bundle-as-a-single-unit concept. An alternative iterator
- instr_iterator has been added to MachineBasicBlock to allow passes to
- iterate over all of the MIs in a MachineBasicBlock, including those which
- are nested inside bundles. The top level BUNDLE instruction must have the
- correct set of register MachineOperand's that represent the cumulative
- inputs and outputs of the bundled MIs.</p>
-
-<p> Packing / bundling of MachineInstr's should be done as part of the register
- allocation super-pass. More specifically, the pass which determines what
-    MIs should be bundled together must be done after the code generator exits
-    SSA form (i.e. after two-address pass, PHI elimination, and copy coalescing).
- Bundles should only be finalized (i.e. adding BUNDLE MIs and input and
- output register MachineOperands) after virtual registers have been
- rewritten into physical registers. This requirement eliminates the need to
- add virtual register operands to BUNDLE instructions which would effectively
- double the virtual register def and use lists.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="mc">The "MC" Layer</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>
-The MC Layer is used to represent and process code at the raw machine code
-level, devoid of "high level" information like "constant pools", "jump tables",
-"global variables" or anything like that. At this level, LLVM handles things
-like label names, machine instructions, and sections in the object file. The
-code in this layer is used for a number of important purposes: the tail end of
-the code generator uses it to write a .s or .o file, and it is also used by the
-llvm-mc tool to implement standalone machine code assemblers and disassemblers.
-</p>
-
-<p>
-This section describes some of the important classes.  There are also a number
-of important subsystems that interact at this layer; they are described later
-in this manual.
-</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="mcstreamer">The <tt>MCStreamer</tt> API</a>
-</h3>
-
-<div>
-
-<p>
-MCStreamer is best thought of as an assembler API. It is an abstract API which
-is <em>implemented</em> in different ways (e.g. to output a .s file, output an
-ELF .o file, etc) but whose API corresponds directly to what you see in a .s
-file. MCStreamer has one method per directive, such as EmitLabel,
-EmitSymbolAttribute, SwitchSection, EmitValue (for .byte, .word), etc, which
-directly correspond to assembly level directives. It also has an
-EmitInstruction method, which is used to output an MCInst to the streamer.
-</p>
-
-<p>
-This API is most important for two clients: the llvm-mc stand-alone assembler,
-which is effectively a parser that parses a line and then invokes a method on
-MCStreamer; and the <a href="#codeemit">Code Emission</a> phase of the code
-generator, which lowers higher level LLVM IR and Machine* constructs down to
-the MC layer, emitting directives through MCStreamer.</p>
-
-<p>
-On the implementation side of MCStreamer, there are two major implementations:
-one for writing out a .s file (MCAsmStreamer), and one for writing out a .o
-file (MCObjectStreamer). MCAsmStreamer is a straight-forward implementation
-that prints out a directive for each method (e.g. EmitValue -&gt; .byte), but
-MCObjectStreamer implements a full assembler.
-</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="mccontext">The <tt>MCContext</tt> class</a>
-</h3>
-
-<div>
-
-<p>
-The MCContext class is the owner of a variety of uniqued data structures at the
-MC layer, including symbols, sections, etc. As such, this is the class that you
-interact with to create symbols and sections. This class cannot be subclassed.
-</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="mcsymbol">The <tt>MCSymbol</tt> class</a>
-</h3>
-
-<div>
-
-<p>
-The MCSymbol class represents a symbol (aka label) in the assembly file. There
-are two interesting kinds of symbols: assembler temporary symbols, and normal
-symbols. Assembler temporary symbols are used and processed by the assembler
-but are discarded when the object file is produced. The distinction is usually
-represented by adding a prefix to the label, for example "L" labels are
-assembler temporary labels in MachO.
-</p>
-
-<p>MCSymbols are created by MCContext and uniqued there. This means that
-MCSymbols can be compared for pointer equivalence to find out if they are the
-same symbol. Note that pointer inequality does not guarantee the labels will
-end up at different addresses though. It's perfectly legal to output something
-like this to the .s file:</p>
-
-<pre>
- foo:
- bar:
- .byte 4
-</pre>
-
-<p>In this case, both the foo and bar symbols will have the same address.</p>
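-
-<p>Because of the uniquing, asking for the same name twice hands back the same
-pointer. A small sketch (an <tt>MCContext &amp;Ctx</tt> is assumed to be in
-scope):</p>
-
-<div class="doc_code">
-<pre>
-MCSymbol *A = Ctx.GetOrCreateSymbol(StringRef("foo"));
-MCSymbol *B = Ctx.GetOrCreateSymbol(StringRef("foo"));
-assert(A == B &amp;&amp; "uniqued: same name yields the same MCSymbol");
-</pre>
-</div>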
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="mcsection">The <tt>MCSection</tt> class</a>
-</h3>
-
-<div>
-
-<p>
-The MCSection class represents an object-file specific section. It is subclassed
-by object file specific implementations (e.g. <tt>MCSectionMachO</tt>,
-<tt>MCSectionCOFF</tt>, <tt>MCSectionELF</tt>) and these are created and uniqued
-by MCContext. The MCStreamer has a notion of the current section, which can be
-changed with the SwitchSection method (which corresponds to a ".section"
-directive in a .s file).
-</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="mcinst">The <tt>MCInst</tt> class</a>
-</h3>
-
-<div>
-
-<p>
-The MCInst class is a target-independent representation of an instruction. It
-is a simple class (much more so than <a href="#machineinstr">MachineInstr</a>)
-that holds a target-specific opcode and a vector of MCOperands. MCOperand, in
-turn, is a simple discriminated union of three cases: 1) a simple immediate,
-2) a target register ID, 3) a symbolic expression (e.g. "Lfoo-Lbar+42") as an
-MCExpr.
-</p>
-
-<p>MCInst is the common currency used to represent machine instructions at the
-MC layer. It is the type used by the instruction encoder, the instruction
-printer, and the type generated by the assembly parser and disassembler.
-</p>
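-
-<p>A minimal sketch of building an MCInst by hand (the opcode and register
-number below are hypothetical placeholders, not real target values):</p>
-
-<div class="doc_code">
-<pre>
-MCInst Inst;
-Inst.setOpcode(Opcode);                        // a target-specific opcode
-Inst.addOperand(MCOperand::CreateReg(RegNo));  // a target register ID
-Inst.addOperand(MCOperand::CreateImm(42));     // a simple immediate
-</pre>
-</div>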
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="codegenalgs">Target-independent code generation algorithms</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This section documents the phases described in the
- <a href="#high-level-design">high-level design of the code generator</a>.
- It explains how they work and some of the rationale behind their design.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="instselect">Instruction Selection</a>
-</h3>
-
-<div>
-
-<p>Instruction Selection is the process of translating LLVM code presented to
- the code generator into target-specific machine instructions. There are
- several well-known ways to do this in the literature. LLVM uses a
- SelectionDAG based instruction selector.</p>
-
-<p>Portions of the DAG instruction selector are generated from the target
- description (<tt>*.td</tt>) files. Our goal is for the entire instruction
- selector to be generated from these <tt>.td</tt> files, though currently
- there are still things that require custom C++ code.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_intro">Introduction to SelectionDAGs</a>
-</h4>
-
-<div>
-
-<p>The SelectionDAG provides an abstraction for code representation in a way
- that is amenable to instruction selection using automatic techniques
- (e.g. dynamic-programming based optimal pattern matching selectors). It is
- also well-suited to other phases of code generation; in particular,
- instruction scheduling (SelectionDAG's are very close to scheduling DAGs
- post-selection). Additionally, the SelectionDAG provides a host
- representation where a large variety of very-low-level (but
- target-independent) <a href="#selectiondag_optimize">optimizations</a> may be
- performed; ones which require extensive information about the instructions
- efficiently supported by the target.</p>
-
-<p>The SelectionDAG is a Directed-Acyclic-Graph whose nodes are instances of the
- <tt>SDNode</tt> class. The primary payload of the <tt>SDNode</tt> is its
- operation code (Opcode) that indicates what operation the node performs and
- the operands to the operation. The various operation node types are
- described at the top of the <tt>include/llvm/CodeGen/SelectionDAGNodes.h</tt>
- file.</p>
-
-<p>Although most operations define a single value, each node in the graph may
- define multiple values. For example, a combined div/rem operation will
-  define both the quotient and the remainder. Many other situations require
- multiple values as well. Each node also has some number of operands, which
- are edges to the node defining the used value. Because nodes may define
- multiple values, edges are represented by instances of the <tt>SDValue</tt>
- class, which is a <tt>&lt;SDNode, unsigned&gt;</tt> pair, indicating the node
- and result value being used, respectively. Each value produced by
- an <tt>SDNode</tt> has an associated <tt>MVT</tt> (Machine Value Type)
- indicating what the type of the value is.</p>
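-
-<p>For example, a sketch of creating a node and naming one of its values
-  (a <tt>SelectionDAG &amp;DAG</tt>, a <tt>DebugLoc dl</tt>, and two i32
-  <tt>SDValue</tt>s <tt>LHS</tt>/<tt>RHS</tt> are assumed to be in scope):</p>
-
-<div class="doc_code">
-<pre>
-// Create an i32 add node; Sum names result #0 of the new node.
-SDValue Sum = DAG.getNode(ISD::ADD, dl, MVT::i32, LHS, RHS);
-SDNode *N = Sum.getNode();        // the underlying SDNode
-unsigned ResNo = Sum.getResNo();  // which of N's results is used (0 here)
-</pre>
-</div>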
-
-<p>SelectionDAGs contain two different kinds of values: those that represent
- data flow and those that represent control flow dependencies. Data values
- are simple edges with an integer or floating point value type. Control edges
- are represented as "chain" edges which are of type <tt>MVT::Other</tt>.
- These edges provide an ordering between nodes that have side effects (such as
- loads, stores, calls, returns, etc). All nodes that have side effects should
- take a token chain as input and produce a new one as output. By convention,
- token chain inputs are always operand #0, and chain results are always the
- last value produced by an operation.</p>
-
-<p>A SelectionDAG has designated "Entry" and "Root" nodes. The Entry node is
- always a marker node with an Opcode of <tt>ISD::EntryToken</tt>. The Root
- node is the final side-effecting node in the token chain. For example, in a
- single basic block function it would be the return node.</p>
-
-<p>One important concept for SelectionDAGs is the notion of a "legal" vs.
- "illegal" DAG. A legal DAG for a target is one that only uses supported
- operations and supported types. On a 32-bit PowerPC, for example, a DAG with
- a value of type i1, i8, i16, or i64 would be illegal, as would a DAG that
- uses a SREM or UREM operation. The
- <a href="#selectinodag_legalize_types">legalize types</a> and
- <a href="#selectiondag_legalize">legalize operations</a> phases are
- responsible for turning an illegal DAG into a legal DAG.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_process">SelectionDAG Instruction Selection Process</a>
-</h4>
-
-<div>
-
-<p>SelectionDAG-based instruction selection consists of the following steps:</p>
-
-<ol>
- <li><a href="#selectiondag_build">Build initial DAG</a> &mdash; This stage
- performs a simple translation from the input LLVM code to an illegal
- SelectionDAG.</li>
-
- <li><a href="#selectiondag_optimize">Optimize SelectionDAG</a> &mdash; This
- stage performs simple optimizations on the SelectionDAG to simplify it,
- and recognize meta instructions (like rotates
- and <tt>div</tt>/<tt>rem</tt> pairs) for targets that support these meta
- operations. This makes the resultant code more efficient and
- the <a href="#selectiondag_select">select instructions from DAG</a> phase
- (below) simpler.</li>
-
- <li><a href="#selectiondag_legalize_types">Legalize SelectionDAG Types</a>
- &mdash; This stage transforms SelectionDAG nodes to eliminate any types
- that are unsupported on the target.</li>
-
- <li><a href="#selectiondag_optimize">Optimize SelectionDAG</a> &mdash; The
- SelectionDAG optimizer is run to clean up redundancies exposed by type
- legalization.</li>
-
- <li><a href="#selectiondag_legalize">Legalize SelectionDAG Ops</a> &mdash;
- This stage transforms SelectionDAG nodes to eliminate any operations
- that are unsupported on the target.</li>
-
- <li><a href="#selectiondag_optimize">Optimize SelectionDAG</a> &mdash; The
- SelectionDAG optimizer is run to eliminate inefficiencies introduced by
- operation legalization.</li>
-
- <li><a href="#selectiondag_select">Select instructions from DAG</a> &mdash;
- Finally, the target instruction selector matches the DAG operations to
- target instructions. This process translates the target-independent input
- DAG into another DAG of target instructions.</li>
-
- <li><a href="#selectiondag_sched">SelectionDAG Scheduling and Formation</a>
- &mdash; The last phase assigns a linear order to the instructions in the
- target-instruction DAG and emits them into the MachineFunction being
- compiled. This step uses traditional prepass scheduling techniques.</li>
-</ol>
-
-<p>After all of these steps are complete, the SelectionDAG is destroyed and the
- rest of the code generation passes are run.</p>
-
-<p>One great way to visualize what is going on here is to take advantage of a
-  few <tt>llc</tt> command line options. The following options pop up a window
- displaying the SelectionDAG at specific times (if you only get errors printed
- to the console while using this, you probably
- <a href="ProgrammersManual.html#ViewGraph">need to configure your system</a>
- to add support for it).</p>
-
-<ul>
- <li><tt>-view-dag-combine1-dags</tt> displays the DAG after being built,
- before the first optimization pass.</li>
-
- <li><tt>-view-legalize-dags</tt> displays the DAG before Legalization.</li>
-
- <li><tt>-view-dag-combine2-dags</tt> displays the DAG before the second
- optimization pass.</li>
-
- <li><tt>-view-isel-dags</tt> displays the DAG before the Select phase.</li>
-
- <li><tt>-view-sched-dags</tt> displays the DAG before Scheduling.</li>
-</ul>
-
-<p>The <tt>-view-sunit-dags</tt> option displays the Scheduler's dependency graph.
- This graph is based on the final SelectionDAG, with nodes that must be
- scheduled together bundled into a single scheduling-unit node, and with
- immediate operands and other nodes that aren't relevant for scheduling
- omitted.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_build">Initial SelectionDAG Construction</a>
-</h4>
-
-<div>
-
-<p>The initial SelectionDAG is na&iuml;vely peephole expanded from the LLVM
- input by the <tt>SelectionDAGLowering</tt> class in the
- <tt>lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp</tt> file. The intent of
-  this pass is to expose as many low-level, target-specific details to the
- SelectionDAG as possible. This pass is mostly hard-coded (e.g. an
- LLVM <tt>add</tt> turns into an <tt>SDNode add</tt> while a
- <tt>getelementptr</tt> is expanded into the obvious arithmetic). This pass
- requires target-specific hooks to lower calls, returns, varargs, etc. For
- these features, the <tt><a href="#targetlowering">TargetLowering</a></tt>
- interface is used.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_legalize_types">SelectionDAG LegalizeTypes Phase</a>
-</h4>
-
-<div>
-
-<p>The LegalizeTypes phase is in charge of converting a DAG to use only the
-  types that are natively supported by the target.</p>
-
-<p>There are two main ways of converting values of unsupported scalar types to
- values of supported types: converting small types to larger types
- ("promoting"), and breaking up large integer types into smaller ones
- ("expanding"). For example, a target might require that all f32 values are
- promoted to f64 and that all i1/i8/i16 values are promoted to i32. The same
- target might require that all i64 values be expanded into pairs of i32
- values. These changes can insert sign and zero extensions as needed to make
- sure that the final code has the same behavior as the input.</p>
-
-<p>There are two main ways of converting values of unsupported vector types to
-  values of supported types: splitting vector types, multiple times if
- necessary, until a legal type is found, and extending vector types by adding
- elements to the end to round them out to legal types ("widening"). If a
- vector gets split all the way down to single-element parts with no supported
- vector type being found, the elements are converted to scalars
- ("scalarizing").</p>
-
-<p>A target implementation tells the legalizer which types are supported (and
- which register class to use for them) by calling the
- <tt>addRegisterClass</tt> method in its TargetLowering constructor.</p>
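-
-<p>For example, a hypothetical target whose 32-bit integers live in a register
-  class named <tt>GPRRegClass</tt> (both the target name <tt>Foo</tt> and the
-  class name are placeholders) might write, in its <tt>TargetLowering</tt>
-  constructor:</p>
-
-<div class="doc_code">
-<pre>
-// Declare i32 legal and pick the register class that holds it.
-addRegisterClass(MVT::i32, &amp;Foo::GPRRegClass);
-</pre>
-</div>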
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_legalize">SelectionDAG Legalize Phase</a>
-</h4>
-
-<div>
-
-<p>The Legalize phase is in charge of converting a DAG to use only the
-  operations that are natively supported by the target.</p>
-
-<p>Targets often have weird constraints, such as not supporting every operation
- on every supported datatype (e.g. X86 does not support byte conditional moves
- and PowerPC does not support sign-extending loads from a 16-bit memory
- location). Legalize takes care of this by open-coding another sequence of
- operations to emulate the operation ("expansion"), by promoting one type to a
- larger type that supports the operation ("promotion"), or by using a
- target-specific hook to implement the legalization ("custom").</p>
-
-<p>A target implementation tells the legalizer which operations are not
- supported (and which of the above three actions to take) by calling the
- <tt>setOperationAction</tt> method in its <tt>TargetLowering</tt>
- constructor.</p>
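-
-<p>For instance, a sketch of the three actions for a hypothetical target (the
-  specific choices below are illustrative assumptions, not from a real
-  backend):</p>
-
-<div class="doc_code">
-<pre>
-// In the TargetLowering constructor:
-setOperationAction(ISD::SREM,      MVT::i32, Expand);   // open-code a sequence
-setOperationAction(ISD::SELECT,    MVT::i8,  Promote);  // do it on i32 instead
-setOperationAction(ISD::FRAMEADDR, MVT::i32, Custom);   // use a target hook
-</pre>
-</div>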
-
-<p>Prior to the existence of the Legalize passes, we required that every target
-  <a href="#selectiondag_select">selector</a> supported and handled every
-  operator and type even if they were not natively supported. The introduction
- of the Legalize phases allows all of the canonicalization patterns to be
- shared across targets, and makes it very easy to optimize the canonicalized
- code because it is still in the form of a DAG.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_optimize">
- SelectionDAG Optimization Phase: the DAG Combiner
- </a>
-</h4>
-
-<div>
-
-<p>The SelectionDAG optimization phase is run multiple times for code
- generation, immediately after the DAG is built and once after each
- legalization. The first run of the pass allows the initial code to be
- cleaned up (e.g. performing optimizations that depend on knowing that the
- operators have restricted type inputs). Subsequent runs of the pass clean up
- the messy code generated by the Legalize passes, which allows Legalize to be
- very simple (it can focus on making code legal instead of focusing on
- generating <em>good</em> and legal code).</p>
-
-<p>One important class of optimizations performed is optimizing inserted sign
- and zero extension instructions. We currently use ad-hoc techniques, but
- could move to more rigorous techniques in the future. Here are some good
- papers on the subject:</p>
-
-<p>"<a href="http://www.eecs.harvard.edu/~nr/pubs/widen-abstract.html">Widening
- integer arithmetic</a>"<br>
- Kevin Redwine and Norman Ramsey<br>
- International Conference on Compiler Construction (CC) 2004</p>
-
-<p>"<a href="http://portal.acm.org/citation.cfm?doid=512529.512552">Effective
- sign extension elimination</a>"<br>
- Motohiro Kawahito, Hideaki Komatsu, and Toshio Nakatani<br>
- Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language Design
- and Implementation.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_select">SelectionDAG Select Phase</a>
-</h4>
-
-<div>
-
-<p>The Select phase is the bulk of the target-specific code for instruction
- selection. This phase takes a legal SelectionDAG as input, pattern matches
- the instructions supported by the target to this DAG, and produces a new DAG
- of target code. For example, consider the following LLVM fragment:</p>
-
-<div class="doc_code">
-<pre>
-%t1 = fadd float %W, %X
-%t2 = fmul float %t1, %Y
-%t3 = fadd float %t2, %Z
-</pre>
-</div>
-
-<p>This LLVM code corresponds to a SelectionDAG that looks basically like
- this:</p>
-
-<div class="doc_code">
-<pre>
-(fadd:f32 (fmul:f32 (fadd:f32 W, X), Y), Z)
-</pre>
-</div>
-
-<p>If a target supports floating point multiply-and-add (FMA) operations, one of
- the adds can be merged with the multiply. On the PowerPC, for example, the
- output of the instruction selector might look like this DAG:</p>
-
-<div class="doc_code">
-<pre>
-(FMADDS (FADDS W, X), Y, Z)
-</pre>
-</div>
-
-<p>The <tt>FMADDS</tt> instruction is a ternary instruction that multiplies its
-first two operands and adds the third (as single-precision floating-point
-numbers). The <tt>FADDS</tt> instruction is a simple binary single-precision
-add instruction. To perform this pattern match, the PowerPC backend includes
-the following instruction definitions:</p>
-
-<div class="doc_code">
-<pre>
-def FMADDS : AForm_1&lt;59, 29,
- (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
- "fmadds $FRT, $FRA, $FRC, $FRB",
- [<b>(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC),
- F4RC:$FRB))</b>]&gt;;
-def FADDS : AForm_2&lt;59, 21,
- (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB),
- "fadds $FRT, $FRA, $FRB",
- [<b>(set F4RC:$FRT, (fadd F4RC:$FRA, F4RC:$FRB))</b>]&gt;;
-</pre>
-</div>
-
-<p>The portion of the instruction definition in bold indicates the pattern used
- to match the instruction. The DAG operators
- (like <tt>fmul</tt>/<tt>fadd</tt>) are defined in
-  the <tt>include/llvm/Target/TargetSelectionDAG.td</tt> file.
-  "<tt>F4RC</tt>" is the register class of the input and result values.</p>
-
-<p>The TableGen DAG instruction selector generator reads the instruction
- patterns in the <tt>.td</tt> file and automatically builds parts of the
- pattern matching code for your target. It has the following strengths:</p>
-
-<ul>
- <li>At compiler-compiler time, it analyzes your instruction patterns and tells
- you if your patterns make sense or not.</li>
-
- <li>It can handle arbitrary constraints on operands for the pattern match. In
- particular, it is straight-forward to say things like "match any immediate
- that is a 13-bit sign-extended value". For examples, see the
- <tt>immSExt16</tt> and related <tt>tblgen</tt> classes in the PowerPC
- backend.</li>
-
- <li>It knows several important identities for the patterns defined. For
- example, it knows that addition is commutative, so it allows the
- <tt>FMADDS</tt> pattern above to match "<tt>(fadd X, (fmul Y, Z))</tt>" as
- well as "<tt>(fadd (fmul X, Y), Z)</tt>", without the target author having
- to specially handle this case.</li>
-
- <li>It has a full-featured type-inferencing system. In particular, you should
- rarely have to explicitly tell the system what type parts of your patterns
- are. In the <tt>FMADDS</tt> case above, we didn't have to tell
- <tt>tblgen</tt> that all of the nodes in the pattern are of type 'f32'.
- It was able to infer and propagate this knowledge from the fact that
- <tt>F4RC</tt> has type 'f32'.</li>
-
- <li>Targets can define their own (and rely on built-in) "pattern fragments".
- Pattern fragments are chunks of reusable patterns that get inlined into
- your patterns during compiler-compiler time. For example, the integer
- "<tt>(not x)</tt>" operation is actually defined as a pattern fragment
- that expands as "<tt>(xor x, -1)</tt>", since the SelectionDAG does not
- have a native '<tt>not</tt>' operation. Targets can define their own
- short-hand fragments as they see fit. See the definition of
- '<tt>not</tt>' and '<tt>ineg</tt>' for examples.</li>
-
- <li>In addition to instructions, targets can specify arbitrary patterns that
- map to one or more instructions using the 'Pat' class. For example, the
- PowerPC has no way to load an arbitrary integer immediate into a register
-      in one instruction. To handle this, the PowerPC backend defines:
- <br>
- <br>
-<div class="doc_code">
-<pre>
-// Arbitrary immediate support. Implement in terms of LIS/ORI.
-def : Pat&lt;(i32 imm:$imm),
- (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))&gt;;
-</pre>
-</div>
- <br>
- If none of the single-instruction patterns for loading an immediate into a
- register match, this will be used. This rule says "match an arbitrary i32
- immediate, turning it into an <tt>ORI</tt> ('or a 16-bit immediate') and
- an <tt>LIS</tt> ('load 16-bit immediate, where the immediate is shifted to
- the left 16 bits') instruction". To make this work, the
- <tt>LO16</tt>/<tt>HI16</tt> node transformations are used to manipulate
- the input immediate (in this case, take the high or low 16-bits of the
- immediate).</li>
-
- <li>While the system does automate a lot, it still allows you to write custom
- C++ code to match special cases if there is something that is hard to
- express.</li>
-</ul>
-
-<p>While it has many strengths, the system currently has some limitations,
- primarily because it is a work in progress and is not yet finished:</p>
-
-<ul>
- <li>Overall, there is no way to define or match SelectionDAG nodes that define
- multiple values (e.g. <tt>SMUL_LOHI</tt>, <tt>LOAD</tt>, <tt>CALL</tt>,
- etc). This is the biggest reason that you currently still <em>have
- to</em> write custom C++ code for your instruction selector.</li>
-
- <li>There is no great way to support matching complex addressing modes yet.
- In the future, we will extend pattern fragments to allow them to define
- multiple values (e.g. the four operands of the <a href="#x86_memory">X86
- addressing mode</a>, which are currently matched with custom C++ code).
- In addition, we'll extend fragments so that a fragment can match multiple
- different patterns.</li>
-
- <li>We don't automatically infer flags like isStore/isLoad yet.</li>
-
- <li>We don't automatically generate the set of supported registers and
- operations for the <a href="#selectiondag_legalize">Legalizer</a>
- yet.</li>
-
- <li>We don't have a way of tying in custom legalized nodes yet.</li>
-</ul>
-
-<p>Despite these limitations, the instruction selector generator is still quite
- useful for most of the binary and logical operations in typical instruction
- sets. If you run into any problems or can't figure out how to do something,
- please let Chris know!</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_sched">SelectionDAG Scheduling and Formation Phase</a>
-</h4>
-
-<div>
-
-<p>The scheduling phase takes the DAG of target instructions from the selection
-  phase and assigns an order. The scheduler can pick an order depending on
-  various constraints of the machine (e.g., an order that minimizes register
-  pressure, or one that covers instruction latencies). Once an order is
-  established, the
- DAG is converted to a list
- of <tt><a href="#machineinstr">MachineInstr</a></tt>s and the SelectionDAG is
- destroyed.</p>
-
-<p>Note that this phase is logically separate from the instruction selection
- phase, but is tied to it closely in the code because it operates on
- SelectionDAGs.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="selectiondag_future">Future directions for the SelectionDAG</a>
-</h4>
-
-<div>
-
-<ol>
- <li>Optional function-at-a-time selection.</li>
-
- <li>Auto-generate entire selector from <tt>.td</tt> file.</li>
-</ol>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="ssamco">SSA-based Machine Code Optimizations</a>
-</h3>
-<div><p>To Be Written</p></div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="liveintervals">Live Intervals</a>
-</h3>
-
-<div>
-
-<p>Live Intervals are the ranges (intervals) where a variable is <i>live</i>.
- They are used by some <a href="#regalloc">register allocator</a> passes to
- determine if two or more virtual registers which require the same physical
- register are live at the same point in the program (i.e., they conflict).
- When this situation occurs, one virtual register must be <i>spilled</i>.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="livevariable_analysis">Live Variable Analysis</a>
-</h4>
-
-<div>
-
-<p>The first step in determining the live intervals of variables is to calculate
- the set of registers that are immediately dead after the instruction (i.e.,
- the instruction calculates the value, but it is never used) and the set of
- registers that are used by the instruction, but are never used after the
- instruction (i.e., they are killed). Live variable information is computed
- for each <i>virtual</i> register and <i>register allocatable</i> physical
- register in the function. This is done in a very efficient manner because it
- uses SSA to sparsely compute lifetime information for virtual registers
- (which are in SSA form) and only has to track physical registers within a
- block. Before register allocation, LLVM can assume that physical registers
- are only live within a single basic block. This allows it to do a single,
- local analysis to resolve physical register lifetimes within each basic
- block. If a physical register is not register allocatable (e.g., a stack
- pointer or condition codes), it is not tracked.</p>
-
-<p>Physical registers may be live in to or out of a function. Live in values are
- typically arguments in registers. Live out values are typically return values
- in registers. Live in values are marked as such, and are given a dummy
- "defining" instruction during live intervals analysis. If the last basic
- block of a function is a <tt>return</tt>, then it's marked as using all live
- out values in the function.</p>
-
-<p><tt>PHI</tt> nodes need to be handled specially, because the calculation of
-  the live variable information from a depth-first traversal of the CFG of the
- function won't guarantee that a virtual register used by the <tt>PHI</tt>
- node is defined before it's used. When a <tt>PHI</tt> node is encountered,
- only the definition is handled, because the uses will be handled in other
- basic blocks.</p>
-
-<p>For each <tt>PHI</tt> node of the current basic block, we simulate an
- assignment at the end of the current basic block and traverse the successor
- basic blocks. If a successor basic block has a <tt>PHI</tt> node and one of
- the <tt>PHI</tt> node's operands is coming from the current basic block, then
- the variable is marked as <i>alive</i> within the current basic block and all
- of its predecessor basic blocks, until the basic block with the defining
- instruction is encountered.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="liveintervals_analysis">Live Intervals Analysis</a>
-</h4>
-
-<div>
-
-<p>We now have the information available to perform the live intervals analysis
- and build the live intervals themselves. We start off by numbering the basic
- blocks and machine instructions. We then handle the "live-in" values. These
- are in physical registers, so the physical register is assumed to be killed
- by the end of the basic block. Live intervals for virtual registers are
- computed for some ordering of the machine instructions <tt>[1, N]</tt>. A
- live interval is an interval <tt>[i, j)</tt>, where <tt>1 &lt;= i &lt;= j
- &lt; N</tt>, for which a variable is live.</p>
-
-<p><i><b>More to come...</b></i></p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="regalloc">Register Allocation</a>
-</h3>
-
-<div>
-
-<p>The <i>Register Allocation problem</i> consists of mapping a program
- <i>P<sub>v</sub></i>, that can use an unbounded number of virtual registers,
- to a program <i>P<sub>p</sub></i> that contains a finite (possibly small)
- number of physical registers. Each target architecture has a different number
- of physical registers. If the number of physical registers is not enough to
- accommodate all the virtual registers, some of them will have to be mapped
- into memory. These virtuals are called <i>spilled virtuals</i>.</p>
-
-<!-- _______________________________________________________________________ -->
-
-<h4>
- <a name="regAlloc_represent">How registers are represented in LLVM</a>
-</h4>
-
-<div>
-
-<p>In LLVM, physical registers are denoted by integer numbers that normally
- range from 1 to 1023. To see how this numbering is defined for a particular
-  architecture, you can read the <tt>*GenRegisterInfo.inc</tt> file for that
- architecture. For instance, by
- inspecting <tt>lib/Target/X86/X86GenRegisterInfo.inc</tt> we see that the
- 32-bit register <tt>EAX</tt> is denoted by 43, and the MMX register
- <tt>MM0</tt> is mapped to 65.</p>
-
-<p>Some architectures contain registers that share the same physical location. A
- notable example is the X86 platform. For instance, in the X86 architecture,
- the registers <tt>EAX</tt>, <tt>AX</tt> and <tt>AL</tt> share the first eight
- bits. These physical registers are marked as <i>aliased</i> in LLVM. Given a
- particular architecture, you can check which registers are aliased by
- inspecting its <tt>RegisterInfo.td</tt> file. Moreover, the method
- <tt>MCRegisterInfo::getAliasSet(p_reg)</tt> returns an array containing
- all the physical registers aliased to the register <tt>p_reg</tt>.</p>
-
-<p>Physical registers, in LLVM, are grouped in <i>Register Classes</i>.
-  Elements in the same register class are functionally equivalent, and can be
-  used interchangeably. Each virtual register can only be mapped to physical
- registers of a particular class. For instance, in the X86 architecture, some
- virtuals can only be allocated to 8 bit registers. A register class is
-  described by <tt>TargetRegisterClass</tt> objects. To discover if a virtual
-  register is compatible with a given physical register, this code can be
-  used:</p>
-
-<div class="doc_code">
-<pre>
-bool RegMapping_Fer::compatible_class(MachineFunction &amp;mf,
- unsigned v_reg,
- unsigned p_reg) {
- assert(TargetRegisterInfo::isPhysicalRegister(p_reg) &amp;&amp;
- "Target register must be physical");
- const TargetRegisterClass *trc = mf.getRegInfo().getRegClass(v_reg);
- return trc-&gt;contains(p_reg);
-}
-</pre>
-</div>
-
-<p>Sometimes, mostly for debugging purposes, it is useful to change the number
- of physical registers available in the target architecture. This must be done
-  statically, inside the <tt>TargetRegisterInfo.td</tt> file. Just <tt>grep</tt>
- for <tt>RegisterClass</tt>, the last parameter of which is a list of
- registers. Just commenting some out is one simple way to avoid them being
- used. A more polite way is to explicitly exclude some registers from
- the <i>allocation order</i>. See the definition of the <tt>GR8</tt> register
- class in <tt>lib/Target/X86/X86RegisterInfo.td</tt> for an example of this.
- </p>
-
-<p>Virtual registers are also denoted by integer numbers. Contrary to physical
- registers, different virtual registers never share the same number. Whereas
- physical registers are statically defined in a <tt>TargetRegisterInfo.td</tt>
- file and cannot be created by the application developer, that is not the case
- with virtual registers. In order to create new virtual registers, use the
- method <tt>MachineRegisterInfo::createVirtualRegister()</tt>. This method
- will return a new virtual register. Use an <tt>IndexedMap&lt;Foo,
- VirtReg2IndexFunctor&gt;</tt> to hold information per virtual register. If you
- need to enumerate all virtual registers, use the function
- <tt>TargetRegisterInfo::index2VirtReg()</tt> to find the virtual register
- numbers:</p>
-
-<div class="doc_code">
-<pre>
- for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned VirtReg = TargetRegisterInfo::index2VirtReg(i);
- stuff(VirtReg);
- }
-</pre>
-</div>
-
-<p>Before register allocation, the operands of an instruction are mostly virtual
-  registers, although physical registers may also be used. In order to check if
-  a given machine operand is a register, use the boolean
-  function <tt>MachineOperand::isReg()</tt>. To obtain the integer code of
-  a register, use <tt>MachineOperand::getReg()</tt>. An instruction may define
-  or use a register. For instance, <tt>ADD reg:1026 := reg:1025 reg:1024</tt>
-  defines register 1026, and uses registers 1024 and 1025. Given a
-  register operand, the method <tt>MachineOperand::isUse()</tt> informs whether
-  that register is being used by the instruction. The
-  method <tt>MachineOperand::isDef()</tt> informs whether that register is being
-  defined.</p>
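-
-<p>Putting those queries together, a sketch of scanning an instruction's
-  register operands (<tt>MI</tt> is assumed to be a <tt>MachineInstr*</tt>):</p>
-
-<div class="doc_code">
-<pre>
-for (unsigned i = 0, e = MI-&gt;getNumOperands(); i != e; ++i) {
-  const MachineOperand &amp;MO = MI-&gt;getOperand(i);
-  if (!MO.isReg())
-    continue;           // skip immediates, memory operands, etc.
-  unsigned Reg = MO.getReg();
-  if (MO.isDef()) {
-    // Reg is defined by this instruction.
-  } else if (MO.isUse()) {
-    // Reg is used by this instruction.
-  }
-}
-</pre>
-</div>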
-
-<p>We will call physical registers present in the LLVM bitcode before register
- allocation <i>pre-colored registers</i>. Pre-colored registers are used in
-  many different situations, for instance, to pass parameters of function
-  calls, and to store results of particular instructions. There are two types
- of pre-colored registers: the ones <i>implicitly</i> defined, and
- those <i>explicitly</i> defined. Explicitly defined registers are normal
- operands, and can be accessed
- with <tt>MachineInstr::getOperand(int)::getReg()</tt>. In order to check
- which registers are implicitly defined by an instruction, use
-  the <tt>TargetInstrInfo::get(opcode)::ImplicitDefs</tt> array,
- where <tt>opcode</tt> is the opcode of the target instruction. One important
- difference between explicit and implicit physical registers is that the
- latter are defined statically for each instruction, whereas the former may
- vary depending on the program being compiled. For example, an instruction
- that represents a function call will always implicitly define or use the same
- set of physical registers. To read the registers implicitly used by an
- instruction,
- use <tt>TargetInstrInfo::get(opcode)::ImplicitUses</tt>. Pre-colored
-  registers impose constraints on any register allocation algorithm. The
-  register allocator must make sure that none of them are overwritten by
-  the values of virtual registers while they are still alive.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-
-<h4>
- <a name="regAlloc_howTo">Mapping virtual registers to physical registers</a>
-</h4>
-
-<div>
-
-<p>There are two ways to map virtual registers to physical registers (or to
- memory slots). The first way, that we will call <i>direct mapping</i>, is
- based on the use of methods of the classes <tt>TargetRegisterInfo</tt>,
-  and <tt>MachineOperand</tt>. The second way, that we will call <i>indirect
-  mapping</i>, relies on the <tt>VirtRegMap</tt> class to insert loads and
-  stores that move values to and from memory.</p>
-
-<p>The direct mapping provides more flexibility to the developer of the register
- allocator; however, it is more error prone, and demands more implementation
- work. Basically, the programmer will have to specify where load and store
- instructions should be inserted in the target function being compiled in
- order to get and store values in memory. To assign a physical register to a
- virtual register present in a given operand,
- use <tt>MachineOperand::setReg(p_reg)</tt>. To insert a store instruction,
- use <tt>TargetInstrInfo::storeRegToStackSlot(...)</tt>, and to insert a
- load instruction, use <tt>TargetInstrInfo::loadRegFromStackSlot</tt>.</p>
-
-<p>The indirect mapping shields the application developer from the complexities
- of inserting load and store instructions. In order to map a virtual register
- to a physical one, use <tt>VirtRegMap::assignVirt2Phys(vreg, preg)</tt>. In
- order to map a certain virtual register to memory,
- use <tt>VirtRegMap::assignVirt2StackSlot(vreg)</tt>. This method will return
- the stack slot where <tt>vreg</tt>'s value will be located. If it is
- necessary to map another virtual register to the same stack slot,
- use <tt>VirtRegMap::assignVirt2StackSlot(vreg, stack_location)</tt>. One
-  important point to consider when using the indirect mapping is that even if
- a virtual register is mapped to memory, it still needs to be mapped to a
- physical register. This physical register is the location where the virtual
- register is supposed to be found before being stored or after being
- reloaded.</p>
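-
-<p>A sketch of the indirect mapping calls described above (<tt>VRM</tt> is
-  assumed to be a <tt>VirtRegMap&amp;</tt>; the register values are
-  placeholders):</p>
-
-<div class="doc_code">
-<pre>
-VRM.assignVirt2Phys(vreg, preg);             // vreg lives in preg
-int Slot = VRM.assignVirt2StackSlot(vreg2);  // spill vreg2; returns its slot
-VRM.assignVirt2StackSlot(vreg3, Slot);       // vreg3 shares the same slot
-</pre>
-</div>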
-
-<p>If the indirect strategy is used, after all the virtual registers have been
- mapped to physical registers or stack slots, it is necessary to use a spiller
- object to place load and store instructions in the code. Every virtual that
-  has been mapped to a stack slot will be stored to memory after being defined
- and will be loaded before being used. The implementation of the spiller tries
- to recycle load/store instructions, avoiding unnecessary instructions. For an
-  example of how the spiller places load and store instructions, see the
-  inline spiller implementation in <tt>lib/CodeGen/InlineSpiller.cpp</tt>.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="regAlloc_twoAddr">Handling two address instructions</a>
-</h4>
-
-<div>
-
-<p>With very rare exceptions (e.g., function calls), the LLVM machine code
- instructions are three address instructions. That is, each instruction is
- expected to define at most one register, and to use at most two registers.
- However, some architectures use two address instructions. In this case, the
-  defined register is also one of the used registers. For instance, an
- instruction such as <tt>ADD %EAX, %EBX</tt>, in X86 is actually equivalent
- to <tt>%EAX = %EAX + %EBX</tt>.</p>
-
-<p>In order to produce correct code, LLVM must convert three address
- instructions that represent two address instructions into true two address
- instructions. LLVM provides the pass <tt>TwoAddressInstructionPass</tt> for
- this specific purpose. It must be run before register allocation takes
- place. After its execution, the resulting code may no longer be in SSA
- form. This happens, for instance, in situations where an instruction such
- as <tt>%a = ADD %b %c</tt> is converted to two instructions such as:</p>
-
-<div class="doc_code">
-<pre>
-%a = MOVE %b
-%a = ADD %a %c
-</pre>
-</div>
-
-<p>Notice that, internally, the second instruction is represented as
- <tt>ADD %a[def/use] %c</tt>. I.e., the register operand <tt>%a</tt> is both
- used and defined by the instruction.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="regAlloc_ssaDecon">The SSA deconstruction phase</a>
-</h4>
-
-<div>
-
-<p>An important transformation that happens during register allocation is called
- the <i>SSA Deconstruction Phase</i>. The SSA form simplifies many analyses
- that are performed on the control flow graph of programs. However,
- traditional instruction sets do not implement PHI instructions. Thus, in
- order to generate executable code, compilers must replace PHI instructions
- with other instructions that preserve their semantics.</p>
-
-<p>There are many ways in which PHI instructions can safely be removed from the
- target code. The most traditional PHI deconstruction algorithm replaces PHI
- instructions with copy instructions. That is the strategy adopted by
- LLVM. The SSA deconstruction algorithm is implemented
- in <tt>lib/CodeGen/PHIElimination.cpp</tt>. In order to invoke this pass, the
- identifier <tt>PHIEliminationID</tt> must be marked as required in the code
- of the register allocator.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="regAlloc_fold">Instruction folding</a>
-</h4>
-
-<div>
-
-<p><i>Instruction folding</i> is an optimization performed during register
- allocation that removes unnecessary copy instructions. For instance, a
- sequence of instructions such as:</p>
-
-<div class="doc_code">
-<pre>
-%EBX = LOAD %mem_address
-%EAX = COPY %EBX
-</pre>
-</div>
-
-<p>can be safely substituted by the single instruction:</p>
-
-<div class="doc_code">
-<pre>
-%EAX = LOAD %mem_address
-</pre>
-</div>
-
-<p>Instructions can be folded with
-  the <tt>TargetInstrInfo::foldMemoryOperand(...)</tt> method. Care must be
- taken when folding instructions; a folded instruction can be quite different
- from the original
- instruction. See <tt>LiveIntervals::addIntervalsForSpills</tt>
- in <tt>lib/CodeGen/LiveIntervalAnalysis.cpp</tt> for an example of its
- use.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-
-<h4>
- <a name="regAlloc_builtIn">Built in register allocators</a>
-</h4>
-
-<div>
-
-<p>The LLVM infrastructure provides the application developer with three
- different register allocators:</p>
-
-<ul>
- <li><i>Fast</i> &mdash; This register allocator is the default for debug
- builds. It allocates registers on a basic block level, attempting to keep
- values in registers and reusing registers as appropriate.</li>
-
-  <li><i>Basic</i> &mdash; This is an incremental approach to register
-      allocation. Live ranges are assigned to registers one at a time in
-      an order that is driven by heuristics. Since code can be rewritten
-      on-the-fly during allocation, this framework allows interesting
-      allocators to be developed as extensions. It is not itself a
-      production register allocator but is a potentially useful
-      stand-alone mode for triaging bugs and as a performance baseline.</li>
-
-  <li><i>Greedy</i> &mdash; <i>The default allocator</i>. This is a
-      highly tuned implementation of the <i>Basic</i> allocator that
-      incorporates global live range splitting. This allocator works hard
-      to minimize the cost of spill code.</li>
-
- <li><i>PBQP</i> &mdash; A Partitioned Boolean Quadratic Programming (PBQP)
- based register allocator. This allocator works by constructing a PBQP
- problem representing the register allocation problem under consideration,
- solving this using a PBQP solver, and mapping the solution back to a
- register assignment.</li>
-</ul>
-
-<p>The type of register allocator used in <tt>llc</tt> can be chosen with the
- command line option <tt>-regalloc=...</tt>:</p>
-
-<div class="doc_code">
-<pre>
-$ llc -regalloc=basic file.bc -o ba.s
-$ llc -regalloc=fast file.bc -o fa.s
-$ llc -regalloc=greedy file.bc -o gr.s
-$ llc -regalloc=pbqp file.bc -o pbqp.s
-</pre>
-</div>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="proepicode">Prolog/Epilog Code Insertion</a>
-</h3>
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="compact_unwind">Compact Unwind</a>
-</h4>
-
-<div>
-
-<p>Throwing an exception requires <em>unwinding</em> out of a function. The
- information on how to unwind a given function is traditionally expressed in
- DWARF unwind (a.k.a. frame) info. But that format was originally developed
-  for debuggers to produce backtraces, and each Frame Description Entry (FDE) requires
- ~20-30 bytes per function. There is also the cost of mapping from an address
- in a function to the corresponding FDE at runtime. An alternative unwind
-  encoding is called <em>compact unwind</em> and requires just 4 bytes per
- function.</p>
-
-<p>The compact unwind encoding is a 32-bit value, which is encoded in an
- architecture-specific way. It specifies which registers to restore and from
- where, and how to unwind out of the function. When the linker creates a final
- linked image, it will create a <code>__TEXT,__unwind_info</code>
- section. This section is a small and fast way for the runtime to access
- unwind info for any given function. If we emit compact unwind info for the
- function, that compact unwind info will be encoded in
- the <code>__TEXT,__unwind_info</code> section. If we emit DWARF unwind info,
- the <code>__TEXT,__unwind_info</code> section will contain the offset of the
- FDE in the <code>__TEXT,__eh_frame</code> section in the final linked
- image.</p>
-
-<p>For X86, there are three modes for the compact unwind encoding:</p>
-
-<dl>
- <dt><i>Function with a Frame Pointer (<code>EBP</code> or <code>RBP</code>)</i></dt>
- <dd><p><code>EBP/RBP</code>-based frame, where <code>EBP/RBP</code> is pushed
- onto the stack immediately after the return address,
- then <code>ESP/RSP</code> is moved to <code>EBP/RBP</code>. Thus to
- unwind, <code>ESP/RSP</code> is restored with the
- current <code>EBP/RBP</code> value, then <code>EBP/RBP</code> is restored
- by popping the stack, and the return is done by popping the stack once
- more into the PC. All non-volatile registers that need to be restored must
-      have been saved in a small range on the stack that starts
-      at <code>EBP-4</code> and runs down to <code>EBP-1020</code> (<code>RBP-8</code>
- to <code>RBP-1020</code>). The offset (divided by 4 in 32-bit mode and 8
- in 64-bit mode) is encoded in bits 16-23 (mask: <code>0x00FF0000</code>).
- The registers saved are encoded in bits 0-14
- (mask: <code>0x00007FFF</code>) as five 3-bit entries from the following
- table:</p>
-<table border="1" cellspacing="0">
- <tr>
- <th>Compact Number</th>
- <th>i386 Register</th>
-    <th>x86-64 Register</th>
- </tr>
- <tr>
- <td>1</td>
- <td><code>EBX</code></td>
- <td><code>RBX</code></td>
- </tr>
- <tr>
- <td>2</td>
- <td><code>ECX</code></td>
- <td><code>R12</code></td>
- </tr>
- <tr>
- <td>3</td>
- <td><code>EDX</code></td>
- <td><code>R13</code></td>
- </tr>
- <tr>
- <td>4</td>
- <td><code>EDI</code></td>
- <td><code>R14</code></td>
- </tr>
- <tr>
- <td>5</td>
- <td><code>ESI</code></td>
- <td><code>R15</code></td>
- </tr>
- <tr>
- <td>6</td>
- <td><code>EBP</code></td>
- <td><code>RBP</code></td>
- </tr>
-</table>
-
-</dd>
-
- <dt><i>Frameless with a Small Constant Stack Size (<code>EBP</code>
- or <code>RBP</code> is not used as a frame pointer)</i></dt>
- <dd><p>To return, a constant (encoded in the compact unwind encoding) is added
- to the <code>ESP/RSP</code>. Then the return is done by popping the stack
- into the PC. All non-volatile registers that need to be restored must have
- been saved on the stack immediately after the return address. The stack
- size (divided by 4 in 32-bit mode and 8 in 64-bit mode) is encoded in bits
- 16-23 (mask: <code>0x00FF0000</code>). There is a maximum stack size of
- 1024 bytes in 32-bit mode and 2048 in 64-bit mode. The number of registers
-      saved is encoded in bits 10-12 (mask: <code>0x00001C00</code>). Bits 0-9
- (mask: <code>0x000003FF</code>) contain which registers were saved and
- their order. (See
- the <code>encodeCompactUnwindRegistersWithoutFrame()</code> function
-      in <code>lib/Target/X86/X86FrameLowering.cpp</code> for the encoding
- algorithm.)</p></dd>
-
- <dt><i>Frameless with a Large Constant Stack Size (<code>EBP</code>
- or <code>RBP</code> is not used as a frame pointer)</i></dt>
- <dd><p>This case is like the "Frameless with a Small Constant Stack Size"
- case, but the stack size is too large to encode in the compact unwind
- encoding. Instead it requires that the function contains "<code>subl
- $nnnnnn, %esp</code>" in its prolog. The compact encoding contains the
-      offset to the <code>$nnnnnn</code> value in the function in bits 10-12
- (mask: <code>0x00001C00</code>).</p></dd>
-</dl>
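-
-<p>To make the bit layout concrete, here is a sketch of pulling the fields out
-of a frame-based (<code>EBP/RBP</code>) encoding using the masks given above
-(illustrative code, not from the tree):</p>
-
-<div class="doc_code">
-<pre>
-#include &lt;stdint.h&gt;
-
-// Decode the frame-based compact unwind fields described above.
-void decodeFrameEncoding(uint32_t Encoding) {
-  // Offset of the saved-register area below EBP/RBP, in 4- (or 8-) byte units.
-  uint32_t Offset = (Encoding &amp; 0x00FF0000) &gt;&gt; 16;
-  // Five 3-bit entries naming the saved registers (0 means "none").
-  uint32_t Regs = Encoding &amp; 0x00007FFF;
-  for (int i = 0; i &lt; 5; ++i) {
-    uint32_t RegNo = (Regs &gt;&gt; (i * 3)) &amp; 0x7;
-    (void)RegNo;  // look RegNo up in the table above
-  }
-  (void)Offset;
-}
-</pre>
-</div>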
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="latemco">Late Machine Code Optimizations</a>
-</h3>
-<div><p>To Be Written</p></div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="codeemit">Code Emission</a>
-</h3>
-
-<div>
-
-<p>The code emission step of code generation is responsible for lowering from
-the code generator abstractions (like <a
-href="#machinefunction">MachineFunction</a>, <a
-href="#machineinstr">MachineInstr</a>, etc) down
-to the abstractions used by the MC layer (<a href="#mcinst">MCInst</a>,
-<a href="#mcstreamer">MCStreamer</a>, etc). This is
-done with a combination of several different classes: the (misnamed)
-target-independent AsmPrinter class, target-specific subclasses of AsmPrinter
-(such as SparcAsmPrinter), and the TargetLoweringObjectFile class.</p>
-
-<p>Since the MC layer works at the level of abstraction of object files, it
-doesn't have a notion of functions, global variables etc. Instead, it thinks
-about labels, directives, and instructions. A key class used at this time is
-the MCStreamer class. This is an abstract API that is implemented in different
-ways (e.g. to output a .s file, output an ELF .o file, etc) that is effectively
-an "assembler API". MCStreamer has one method per directive, such as EmitLabel,
-EmitSymbolAttribute, SwitchSection, etc, which directly correspond to assembly
-level directives.
-</p>
-
-<p>If you are interested in implementing a code generator for a target, there
-are three important things that you have to implement for your target:</p>
-
-<ol>
-<li>First, you need a subclass of AsmPrinter for your target. This class
-implements the general lowering process converting MachineFunction's into MC
-label constructs. The AsmPrinter base class provides a number of useful methods
-and routines, and also allows you to override the lowering process in some
-important ways. You should get much of the lowering for free if you are
-implementing an ELF, COFF, or MachO target, because the TargetLoweringObjectFile
-class implements much of the common logic.</li>
-
-<li>Second, you need to implement an instruction printer for your target. The
-instruction printer takes an <a href="#mcinst">MCInst</a> and renders it to a
-raw_ostream as text. Most of this is automatically generated from the .td file
-(when you specify something like "<tt>add $dst, $src1, $src2</tt>" in the
-instructions), but you need to implement routines to print operands.</li>
-
-<li>Third, you need to implement code that lowers a <a
-href="#machineinstr">MachineInstr</a> to an MCInst, usually implemented in
-"&lt;target&gt;MCInstLower.cpp" (see the sketch after this list). This
-lowering process is often target specific, and is responsible for turning jump
-table entries, constant pool indices, global variable addresses, etc into
-MCLabels as appropriate. This translation layer is also responsible for
-expanding pseudo ops used by the code generator into the actual machine
-instructions they correspond to. The MCInsts that are generated by this are
-fed into the instruction printer or the encoder.
-</li>
-
-</ol>
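-
-<p>As a rough sketch of that third step (the target name <tt>Foo</tt> and the
-class are hypothetical; real targets differ in the details):</p>
-
-<div class="doc_code">
-<pre>
-void FooMCInstLower::Lower(const MachineInstr *MI, MCInst &amp;OutMI) const {
-  OutMI.setOpcode(MI-&gt;getOpcode());
-  for (unsigned i = 0, e = MI-&gt;getNumOperands(); i != e; ++i) {
-    const MachineOperand &amp;MO = MI-&gt;getOperand(i);
-    MCOperand MCOp;
-    if (MO.isReg())
-      MCOp = MCOperand::CreateReg(MO.getReg());
-    else if (MO.isImm())
-      MCOp = MCOperand::CreateImm(MO.getImm());
-    else
-      continue;  // symbols, jump tables, etc. need target-specific lowering
-    OutMI.addOperand(MCOp);
-  }
-}
-</pre>
-</div>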
-
-<p>Finally, at your choosing, you can also implement a subclass of
-MCCodeEmitter which lowers MCInst's into machine code bytes and relocations.
-This is important if you want to support direct .o file emission, or would like
-to implement an assembler for your target.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="vliw_packetizer">VLIW Packetizer</a>
-</h3>
-
-<div>
-
-<p>In a Very Long Instruction Word (VLIW) architecture, the compiler is
-   responsible for mapping instructions to functional units available on
- the architecture. To that end, the compiler creates groups of instructions
- called <i>packets</i> or <i>bundles</i>. The VLIW packetizer in LLVM is
- a target-independent mechanism to enable the packetization of machine
- instructions.</p>
-
-<!-- _______________________________________________________________________ -->
-
-<h4>
- <a name="vliw_mapping">Mapping from instructions to functional units</a>
-</h4>
-
-<div>
-
-<p>Instructions in a VLIW target can typically be mapped to multiple functional
-units. During the process of packetizing, the compiler must be able to reason
-about whether an instruction can be added to a packet. This decision can be
-complex since the compiler has to examine all possible mappings of instructions
-to functional units. Therefore, to alleviate compilation-time complexity, the
-VLIW packetizer parses the instruction classes of a target and generates tables
-at compiler build time. These tables can then be queried by the provided
-machine-independent API to determine if an instruction can be accommodated in a
-packet.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="vliw_repr">
- How the packetization tables are generated and used
- </a>
-</h4>
-
-<div>
-
-<p>The packetizer reads instruction classes from a target's itineraries and
-creates a deterministic finite automaton (DFA) to represent the state of a
-packet. A DFA consists of three major elements: inputs, states, and
-transitions. The set of inputs for the generated DFA represents the instruction
-being added to a packet. The states represent the possible consumption
-of functional units by instructions in a packet. In the DFA, transitions from
-one state to another occur on the addition of an instruction to an existing
-packet. If there is a legal mapping of functional units to instructions, then
-the DFA contains a corresponding transition. The absence of a transition
-indicates that a legal mapping does not exist and that the instruction cannot
-be added to the packet.</p>
-
-<p>To generate tables for a VLIW target, add <i>Target</i>GenDFAPacketizer.inc
-as a target to the Makefile in the target directory. The exported API provides
-three functions: <tt>DFAPacketizer::clearResources()</tt>,
-<tt>DFAPacketizer::reserveResources(MachineInstr *MI)</tt>, and
-<tt>DFAPacketizer::canReserveResources(MachineInstr *MI)</tt>. These functions
-allow a target packetizer to add an instruction to an existing packet and to
-check whether an instruction can be added to a packet. See
-<tt>llvm/CodeGen/DFAPacketizer.h</tt> for more information.</p>
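-
-<p>A sketch of how a target packetizer might drive that API
-(<tt>Packetizer</tt> is assumed to be a <tt>DFAPacketizer*</tt> and
-<tt>MI</tt> a <tt>MachineInstr*</tt>):</p>
-
-<div class="doc_code">
-<pre>
-Packetizer-&gt;clearResources();             // start a fresh, empty packet
-if (Packetizer-&gt;canReserveResources(MI))  // would MI fit in this packet?
-  Packetizer-&gt;reserveResources(MI);       // commit MI to the packet
-</pre>
-</div>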
-
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="nativeassembler">Implementing a Native Assembler</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Though you're probably reading this because you want to write or maintain a
-compiler backend, LLVM also fully supports building native assemblers too.
-We've tried hard to automate the generation of the assembler from the .td files
-(in particular the instruction syntax and encodings), which means that a large
-part of the manual and repetitive data entry can be factored and shared with the
-compiler.</p>
-
-<!-- ======================================================================= -->
-<h3 id="na_instparsing">Instruction Parsing</h3>
-
-<div><p>To Be Written</p></div>
-
-
-<!-- ======================================================================= -->
-<h3 id="na_instaliases">
- Instruction Alias Processing
-</h3>
-
-<div>
-<p>Once the instruction is parsed, it enters the MatchInstructionImpl function.
-The MatchInstructionImpl function performs alias processing and then does
-actual matching.</p>
-
-<p>Alias processing is the phase that canonicalizes different lexical forms of
-the same instructions down to one representation. There are several different
-kinds of aliases that can be implemented; they are listed below in the order
-in which they are processed, from simplest/weakest to most complex/powerful.
-Generally you want to use the first alias mechanism that
-meets the needs of your instruction, because it will allow a more concise
-description.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>Mnemonic Aliases</h4>
-
-<div>
-
-<p>The first phase of alias processing is simple instruction mnemonic
-remapping for classes of instructions that can be written with two different
-mnemonics. This phase is a simple and unconditional remapping from one input
-mnemonic to one output mnemonic. This form of alias cannot
-look at the operands at all, so the remapping must apply to all forms of a
-given mnemonic. Mnemonic aliases are defined simply; for example, X86 has:
-</p>
-
-<div class="doc_code">
-<pre>
-def : MnemonicAlias&lt;"cbw", "cbtw"&gt;;
-def : MnemonicAlias&lt;"smovq", "movsq"&gt;;
-def : MnemonicAlias&lt;"fldcww", "fldcw"&gt;;
-def : MnemonicAlias&lt;"fucompi", "fucomip"&gt;;
-def : MnemonicAlias&lt;"ud2a", "ud2"&gt;;
-</pre>
-</div>
-
-<p>... and many others. With a MnemonicAlias definition, the mnemonic is
-remapped simply and directly. Though MnemonicAlias's can't look at any aspect
-of the instruction (such as the operands) they can depend on global modes (the
-same ones supported by the matcher), through a Requires clause:</p>
-
-<div class="doc_code">
-<pre>
-def : MnemonicAlias&lt;"pushf", "pushfq"&gt;, Requires&lt;[In64BitMode]&gt;;
-def : MnemonicAlias&lt;"pushf", "pushfl"&gt;, Requires&lt;[In32BitMode]&gt;;
-</pre>
-</div>
-
-<p>In this example, the mnemonic is mapped to a different one depending
-on the current instruction set.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>Instruction Aliases</h4>
-
-<div>
-
-<p>The most general phase of alias processing occurs while matching is
-happening: it provides new forms for the matcher to match along with a specific
-instruction to generate. An instruction alias has two parts: the string to
-match and the instruction to generate. For example:
-</p>
-
-<div class="doc_code">
-<pre>
-def : InstAlias&lt;"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8 :$src)&gt;;
-def : InstAlias&lt;"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src)&gt;;
-def : InstAlias&lt;"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8 :$src)&gt;;
-def : InstAlias&lt;"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16 :$src)&gt;;
-def : InstAlias&lt;"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8 :$src)&gt;;
-def : InstAlias&lt;"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16 :$src)&gt;;
-def : InstAlias&lt;"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32 :$src)&gt;;
-</pre>
-</div>
-
-<p>This shows a powerful example of instruction aliases, matching the
-same mnemonic in multiple different ways depending on what operands are present
-in the assembly. The result of instruction aliases can include operands in a
-different order than the destination instruction, and can use an input
-multiple times, for example:</p>
-
-<div class="doc_code">
-<pre>
-def : InstAlias&lt;"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)&gt;;
-def : InstAlias&lt;"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)&gt;;
-def : InstAlias&lt;"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)&gt;;
-def : InstAlias&lt;"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)&gt;;
-</pre>
-</div>
-
-<p>This example also shows that tied operands are only listed once. In the X86
-backend, XOR8rr has two input GR8's and one output GR8 (where an input is tied
-to the output). InstAliases take a flattened operand list without duplicates
-for tied operands. The result of an instruction alias can also use immediates
-and fixed physical registers which are added as simple immediate operands in the
-result, for example:</p>
-
-<div class="doc_code">
-<pre>
-// Fixed Immediate operand.
-def : InstAlias&lt;"aad", (AAD8i8 10)&gt;;
-
-// Fixed register operand.
-def : InstAlias&lt;"fcomi", (COM_FIr ST1)&gt;;
-
-// Simple alias.
-def : InstAlias&lt;"fcomi $reg", (COM_FIr RST:$reg)&gt;;
-</pre>
-</div>
-
-
-<p>Instruction aliases can also have a Requires clause to make them
-subtarget specific.</p>
-
-<p>If the back-end supports it, the instruction printer can automatically emit
- the alias rather than what's being aliased. It typically leads to better,
- more readable code. If it's better to print out what's being aliased, then
- pass a '0' as the third parameter to the InstAlias definition.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3 id="na_matching">Instruction Matching</h3>
-
-<div><p>To Be Written</p></div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="targetimpls">Target-specific Implementation Notes</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This section of the document explains features or design decisions that are
- specific to the code generator for a particular target. First we start
- with a table that summarizes what features are supported by each target.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="targetfeatures">Target Feature Matrix</a>
-</h3>
-
-<div>
-
-<p>Note that this table does not include the C or Cpp backends, since
-they do not use the target independent code generator infrastructure. It also
-doesn't list features that are not supported fully by any target yet. It
-considers a feature to be supported if at least one subtarget supports it. A
-feature being supported means that it is useful and works for most cases, it
-does not indicate that there are zero known bugs in the implementation. Here
-is the key:</p>
-
-
-<table border="1" cellspacing="0">
- <tr>
- <th>Unknown</th>
- <th>No support</th>
- <th>Partial Support</th>
- <th>Complete Support</th>
- </tr>
- <tr>
- <td class="unknown"></td>
- <td class="no"></td>
- <td class="partial"></td>
- <td class="yes"></td>
- </tr>
-</table>
-
-<p>Here is the table:</p>
-
-<table width="689" border="1" cellspacing="0">
-<tr><td></td>
-<td colspan="13" align="center" style="background-color:#ffc">Target</td>
-</tr>
- <tr>
- <th>Feature</th>
- <th>ARM</th>
- <th>CellSPU</th>
- <th>Hexagon</th>
- <th>MBlaze</th>
- <th>MSP430</th>
- <th>Mips</th>
- <th>PTX</th>
- <th>PowerPC</th>
- <th>Sparc</th>
- <th>X86</th>
- <th>XCore</th>
- </tr>
-
-<tr>
- <td><a href="#feat_reliable">is generally reliable</a></td>
- <td class="yes"></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="yes"></td> <!-- Hexagon -->
- <td class="no"></td> <!-- MBlaze -->
- <td class="unknown"></td> <!-- MSP430 -->
- <td class="yes"></td> <!-- Mips -->
- <td class="no"></td> <!-- PTX -->
- <td class="yes"></td> <!-- PowerPC -->
- <td class="yes"></td> <!-- Sparc -->
- <td class="yes"></td> <!-- X86 -->
- <td class="unknown"></td> <!-- XCore -->
-</tr>
-
-<tr>
- <td><a href="#feat_asmparser">assembly parser</a></td>
- <td class="no"></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="no"></td> <!-- Hexagon -->
- <td class="yes"></td> <!-- MBlaze -->
- <td class="no"></td> <!-- MSP430 -->
- <td class="no"></td> <!-- Mips -->
- <td class="no"></td> <!-- PTX -->
- <td class="no"></td> <!-- PowerPC -->
- <td class="no"></td> <!-- Sparc -->
- <td class="yes"></td> <!-- X86 -->
- <td class="no"></td> <!-- XCore -->
-</tr>
-
-<tr>
- <td><a href="#feat_disassembler">disassembler</a></td>
- <td class="yes"></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="no"></td> <!-- Hexagon -->
- <td class="yes"></td> <!-- MBlaze -->
- <td class="no"></td> <!-- MSP430 -->
- <td class="no"></td> <!-- Mips -->
- <td class="no"></td> <!-- PTX -->
- <td class="no"></td> <!-- PowerPC -->
- <td class="no"></td> <!-- Sparc -->
- <td class="yes"></td> <!-- X86 -->
- <td class="no"></td> <!-- XCore -->
-</tr>
-
-<tr>
- <td><a href="#feat_inlineasm">inline asm</a></td>
- <td class="yes"></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="yes"></td> <!-- Hexagon -->
- <td class="yes"></td> <!-- MBlaze -->
- <td class="unknown"></td> <!-- MSP430 -->
- <td class="no"></td> <!-- Mips -->
- <td class="unknown"></td> <!-- PTX -->
- <td class="yes"></td> <!-- PowerPC -->
- <td class="unknown"></td> <!-- Sparc -->
- <td class="yes"></td> <!-- X86 -->
- <td class="unknown"></td> <!-- XCore -->
-</tr>
-
-<tr>
- <td><a href="#feat_jit">jit</a></td>
- <td class="partial"><a href="#feat_jit_arm">*</a></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="no"></td> <!-- Hexagon -->
- <td class="no"></td> <!-- MBlaze -->
- <td class="unknown"></td> <!-- MSP430 -->
- <td class="yes"></td> <!-- Mips -->
- <td class="unknown"></td> <!-- PTX -->
- <td class="yes"></td> <!-- PowerPC -->
- <td class="unknown"></td> <!-- Sparc -->
- <td class="yes"></td> <!-- X86 -->
- <td class="unknown"></td> <!-- XCore -->
-</tr>
-
-<tr>
- <td><a href="#feat_objectwrite">.o&nbsp;file writing</a></td>
- <td class="no"></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="no"></td> <!-- Hexagon -->
- <td class="yes"></td> <!-- MBlaze -->
- <td class="no"></td> <!-- MSP430 -->
- <td class="no"></td> <!-- Mips -->
- <td class="no"></td> <!-- PTX -->
- <td class="no"></td> <!-- PowerPC -->
- <td class="no"></td> <!-- Sparc -->
- <td class="yes"></td> <!-- X86 -->
- <td class="no"></td> <!-- XCore -->
-</tr>
-
-<tr>
- <td><a href="#feat_tailcall">tail calls</a></td>
- <td class="yes"></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="yes"></td> <!-- Hexagon -->
- <td class="no"></td> <!-- MBlaze -->
- <td class="unknown"></td> <!-- MSP430 -->
- <td class="no"></td> <!-- Mips -->
- <td class="unknown"></td> <!-- PTX -->
- <td class="yes"></td> <!-- PowerPC -->
- <td class="unknown"></td> <!-- Sparc -->
- <td class="yes"></td> <!-- X86 -->
- <td class="unknown"></td> <!-- XCore -->
-</tr>
-
-<tr>
- <td><a href="#feat_segstacks">segmented stacks</a></td>
- <td class="no"></td> <!-- ARM -->
- <td class="no"></td> <!-- CellSPU -->
- <td class="no"></td> <!-- Hexagon -->
- <td class="no"></td> <!-- MBlaze -->
- <td class="no"></td> <!-- MSP430 -->
- <td class="no"></td> <!-- Mips -->
- <td class="no"></td> <!-- PTX -->
- <td class="no"></td> <!-- PowerPC -->
- <td class="no"></td> <!-- Sparc -->
- <td class="partial"><a href="#feat_segstacks_x86">*</a></td> <!-- X86 -->
- <td class="no"></td> <!-- XCore -->
-</tr>
-
-
-</table>
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_reliable">Is Generally Reliable</h4>
-
-<div>
-<p>This box indicates whether the target is considered to be production
-quality, meaning that the target has been used as a static compiler to
-compile large amounts of code by a variety of different people and is in
-continuous use.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_asmparser">Assembly Parser</h4>
-
-<div>
-<p>This box indicates whether the target supports parsing target specific .s
-files by implementing the MCAsmParser interface. This is required for llvm-mc
-to be able to act as a native assembler and is required for inline assembly
-support in the native .o file writer.</p>
-
-</div>
-
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_disassembler">Disassembler</h4>
-
-<div>
-<p>This box indicates whether the target supports the MCDisassembler API for
-disassembling machine opcode bytes into MCInst's.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_inlineasm">Inline Asm</h4>
-
-<div>
-<p>This box indicates whether the target supports most popular inline assembly
-constraints and modifiers.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_jit">JIT Support</h4>
-
-<div>
-<p>This box indicates whether the target supports the JIT compiler through
-the ExecutionEngine interface.</p>
-
-<p id="feat_jit_arm">The ARM backend has basic support for integer code
-in ARM codegen mode, but lacks NEON and full Thumb support.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_objectwrite">.o File Writing</h4>
-
-<div>
-
-<p>This box indicates whether the target supports writing .o files (e.g. MachO,
-ELF, and/or COFF) directly from the target. Note that the target also
-must include an assembly parser and general inline assembly support for full
-inline assembly support in the .o writer.</p>
-
-<p>Targets that don't support this feature can obviously still write out .o
-files; they just rely on having an external assembler to translate from a .s
-file to a .o file (as is the case for many C compilers).</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_tailcall">Tail Calls</h4>
-
-<div>
-
-<p>This box indicates whether the target supports guaranteed tail calls. These
-are calls marked "<a href="LangRef.html#i_call">tail</a>" and use the fastcc
-calling convention. Please see the <a href="#tailcallopt">tail call section
-for more details</a>.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4 id="feat_segstacks">Segmented Stacks</h4>
-
-<div>
-
-<p>This box indicates whether the target supports segmented stacks. This
-replaces the traditional large C stack with many linked segments. It
-is compatible with the <a href="http://gcc.gnu.org/wiki/SplitStacks">gcc
-implementation</a> used by the Go front end.</p>
-
-<p id="feat_segstacks_x86">Basic support exists on the X86 backend. Currently
-vararg doesn't work and the object files are not marked the way the gold
-linker expects, but simple Go programs can be built by dragonegg.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="tailcallopt">Tail call optimization</a>
-</h3>
-
-<div>
-
-<p>Tail call optimization, in which the callee reuses the stack of the caller,
-   is currently supported on x86/x86-64 and PowerPC. It is performed if:</p>
-
-<ul>
- <li>Caller and callee have the calling convention <tt>fastcc</tt> or
- <tt>cc 10</tt> (GHC call convention).</li>
-
- <li>The call is a tail call - in tail position (ret immediately follows call
- and ret uses value of call or is void).</li>
-
- <li>Option <tt>-tailcallopt</tt> is enabled.</li>
-
- <li>Platform specific constraints are met.</li>
-</ul>
-
-<p>x86/x86-64 constraints:</p>
-
-<ul>
- <li>No variable argument lists are used.</li>
-
-  <li>On x86-64, when generating GOT/PIC code, only module-local calls (visibility
- = hidden or protected) are supported.</li>
-</ul>
-
-<p>PowerPC constraints:</p>
-
-<ul>
- <li>No variable argument lists are used.</li>
-
- <li>No byval parameters are used.</li>
-
-  <li>On ppc32/64, when generating GOT/PIC code, only module-local calls (visibility = hidden or protected) are supported.</li>
-</ul>
-
-<p>Example:</p>
-
-<p>Call as <tt>llc -tailcallopt test.ll</tt>.</p>
-
-<div class="doc_code">
-<pre>
-declare fastcc i32 @tailcallee(i32 inreg %a1, i32 inreg %a2, i32 %a3, i32 %a4)
-
-define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
- %l1 = add i32 %in1, %in2
- %tmp = tail call fastcc i32 @tailcallee(i32 %in1 inreg, i32 %in2 inreg, i32 %in1, i32 %l1)
- ret i32 %tmp
-}
-</pre>
-</div>
-
-<p>Implications of <tt>-tailcallopt</tt>:</p>
-
-<p>To support tail call optimization in situations where the callee has more
-   arguments than the caller, a 'callee pops arguments' convention is used. This
- currently causes each <tt>fastcc</tt> call that is not tail call optimized
- (because one or more of above constraints are not met) to be followed by a
- readjustment of the stack. So performance might be worse in such cases.</p>
-
-</div>
-<!-- ======================================================================= -->
-<h3>
- <a name="sibcallopt">Sibling call optimization</a>
-</h3>
-
-<div>
-
-<p>Sibling call optimization is a restricted form of tail call optimization.
- Unlike tail call optimization described in the previous section, it can be
- performed automatically on any tail calls when <tt>-tailcallopt</tt> option
- is not specified.</p>
-
-<p>Sibling call optimization is currently performed on x86/x86-64 when the
- following constraints are met:</p>
-
-<ul>
-  <li>Caller and callee have the same calling convention. It can be either
-      <tt>c</tt> or <tt>fastcc</tt>.</li>
-
-  <li>The call is a tail call - in tail position (ret immediately follows call
-      and ret uses value of call or is void).</li>
-
-  <li>Caller and callee have matching return types, or the callee result is not
-      used.</li>
-
-  <li>If any of the callee arguments are passed on the stack, they must be
-      available in the caller's own incoming argument stack and the frame
-      offsets must be the same.</li>
-</ul>
-
-<p>Example:</p>
-<div class="doc_code">
-<pre>
-declare i32 @bar(i32, i32)
-
-define i32 @foo(i32 %a, i32 %b, i32 %c) {
-entry:
- %0 = tail call i32 @bar(i32 %a, i32 %b)
- ret i32 %0
-}
-</pre>
-</div>
-
-</div>
-<!-- ======================================================================= -->
-<h3>
- <a name="x86">The X86 backend</a>
-</h3>
-
-<div>
-
-<p>The X86 code generator lives in the <tt>lib/Target/X86</tt> directory. This
- code generator is capable of targeting a variety of x86-32 and x86-64
- processors, and includes support for ISA extensions such as MMX and SSE.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="x86_tt">X86 Target Triples supported</a>
-</h4>
-
-<div>
-
-<p>The following are the known target triples that are supported by the X86
- backend. This is not an exhaustive list, and it would be useful to add those
- that people test.</p>
-
-<ul>
- <li><b>i686-pc-linux-gnu</b> &mdash; Linux</li>
-
- <li><b>i386-unknown-freebsd5.3</b> &mdash; FreeBSD 5.3</li>
-
- <li><b>i686-pc-cygwin</b> &mdash; Cygwin on Win32</li>
-
- <li><b>i686-pc-mingw32</b> &mdash; MingW on Win32</li>
-
- <li><b>i386-pc-mingw32msvc</b> &mdash; MingW crosscompiler on Linux</li>
-
- <li><b>i686-apple-darwin*</b> &mdash; Apple Darwin on X86</li>
-
- <li><b>x86_64-unknown-linux-gnu</b> &mdash; Linux</li>
-</ul>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="x86_cc">X86 Calling Conventions supported</a>
-</h4>
-
-
-<div>
-
-<p>The following target-specific calling conventions are known to the backend:</p>
-
-<ul>
-<li><b>x86_StdCall</b> &mdash; stdcall calling convention seen on Microsoft
- Windows platform (CC ID = 64).</li>
-<li><b>x86_FastCall</b> &mdash; fastcall calling convention seen on Microsoft
- Windows platform (CC ID = 65).</li>
-<li><b>x86_ThisCall</b> &mdash; Similar to <b>x86_StdCall</b>. Passes the first argument
- in ECX, others via stack. Callee is responsible for stack cleaning. This
- convention is used by MSVC by default for methods in its ABI
- (CC ID = 70).</li>
-</ul>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="x86_memory">Representing X86 addressing modes in MachineInstrs</a>
-</h4>
-
-<div>
-
-<p>The x86 has a very flexible way of accessing memory. It is capable of
-   forming memory addresses of the following form directly in integer
-   instructions (which use ModR/M addressing):</p>
-
-<div class="doc_code">
-<pre>
-SegmentReg: Base + [1,2,4,8] * IndexReg + Disp32
-</pre>
-</div>
-
-<p>In order to represent this, LLVM tracks no less than 5 operands for each
- memory operand of this form. This means that the "load" form of
- '<tt>mov</tt>' has the following <tt>MachineOperand</tt>s in this order:</p>
-
-<div class="doc_code">
-<pre>
-Index:        0     |    1        2        3             4           5
-Meaning:   DestReg, | BaseReg,  Scale, IndexReg,  Displacement,  Segment
-OperandTy: VirtReg, | VirtReg, UnsImm,  VirtReg,   SignExtImm,   PhysReg
-</pre>
-</div>
-
-<p>Stores, and all other instructions, treat the four memory operands in the
- same way and in the same order. If the segment register is unspecified
- (regno = 0), then no segment override is generated. "Lea" operations do not
- have a segment register specified, so they only have 4 operands for their
- memory reference.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="x86_memory">X86 address spaces supported</a>
-</h4>
-
-<div>
-
-<p>The x86 architecture can perform loads and stores to different address
-   spaces via its segment registers. A segment override prefix byte on an
- instruction causes the instruction's memory access to go to the specified
- segment. LLVM address space 0 is the default address space, which includes
- the stack, and any unqualified memory accesses in a program. Address spaces
- 1-255 are currently reserved for user-defined code. The GS-segment is
- represented by address space 256, while the FS-segment is represented by
- address space 257. Other x86 segments have yet to be allocated address space
- numbers.</p>
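-
-<p>For illustration, a GS-relative load can be formed with the C++ IR-building
-   APIs by giving the pointer type address space 256 (a hedged sketch;
-   <tt>Builder</tt>, <tt>Ctx</tt>, and <tt>Offset</tt> are assumed to exist):</p>
-
-<div class="doc_code">
-<pre>
-// i32 addrspace(256)*: a pointer into the GS segment.
-PointerType *GSPtrTy = Type::getInt32PtrTy(Ctx, /*AddrSpace=*/256);
-Value *Ptr = Builder.CreateIntToPtr(Offset, GSPtrTy);
-Value *Val = Builder.CreateLoad(Ptr); // an ordinary load, qualified by type
-</pre>
-</div>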
-
-<p>While these address spaces may seem similar to TLS via the
- <tt>thread_local</tt> keyword, and often use the same underlying hardware,
- there are some fundamental differences.</p>
-
-<p>The <tt>thread_local</tt> keyword applies to global variables and
- specifies that they are to be allocated in thread-local memory. There are
- no type qualifiers involved, and these variables can be pointed to with
- normal pointers and accessed with normal loads and stores.
- The <tt>thread_local</tt> keyword is target-independent at the LLVM IR
- level (though LLVM doesn't yet have implementations of it for some
-   configurations).</p>
-
-<p>Special address spaces, in contrast, apply to static types. Every
- load and store has a particular address space in its address operand type,
- and this is what determines which address space is accessed.
- LLVM ignores these special address space qualifiers on global variables,
- and does not provide a way to directly allocate storage in them.
- At the LLVM IR level, the behavior of these special address spaces depends
- in part on the underlying OS or runtime environment, and they are specific
- to x86 (and LLVM doesn't yet handle them correctly in some cases).</p>
-
-<p>Some operating systems and runtime environments use (or may in the future
- use) the FS/GS-segment registers for various low-level purposes, so care
- should be taken when considering them.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="x86_names">Instruction naming</a>
-</h4>
-
-<div>
-
-<p>An instruction name consists of the base name, a default operand size, and
-   a character per operand with an optional special size. For example:</p>
-
-<div class="doc_code">
-<pre>
-ADD8rr -&gt; add, 8-bit register, 8-bit register
-IMUL16rmi -&gt; imul, 16-bit register, 16-bit memory, 16-bit immediate
-IMUL16rmi8 -&gt; imul, 16-bit register, 16-bit memory, 8-bit immediate
-MOVSX32rm16 -&gt; movsx, 32-bit register, 16-bit memory
-</pre>
-</div>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="ppc">The PowerPC backend</a>
-</h3>
-
-<div>
-
-<p>The PowerPC code generator lives in the lib/Target/PowerPC directory. The
- code generation is retargetable to several variations or <i>subtargets</i> of
-   the PowerPC ISA, including ppc32, ppc64 and Altivec.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ppc_abi">LLVM PowerPC ABI</a>
-</h4>
-
-<div>
-
-<p>LLVM follows the AIX PowerPC ABI, with two deviations. First, LLVM uses
-   PC-relative (PIC) or static addressing for accessing global values, so no TOC
- (r2) is used. Second, r31 is used as a frame pointer to allow dynamic growth
- of a stack frame. LLVM takes advantage of having no TOC to provide space to
- save the frame pointer in the PowerPC linkage area of the caller frame.
- Other details of PowerPC ABI can be found at <a href=
- "http://developer.apple.com/documentation/DeveloperTools/Conceptual/LowLevelABI/Articles/32bitPowerPC.html"
-   >PowerPC ABI</a>. Note: this link describes the 32 bit ABI. The 64 bit ABI
-   is similar, except that GPR slots are 8 bytes wide (not 4) and r13 is
-   reserved for system use.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ppc_frame">Frame Layout</a>
-</h4>
-
-<div>
-
-<p>The size of a PowerPC frame is usually fixed for the duration of a
- function's invocation. Since the frame is fixed size, all references
- into the frame can be accessed via fixed offsets from the stack pointer. The
-   exception to this is when dynamic alloca or variable sized arrays are
-   present; in that case, a base pointer (r31) is used as a proxy for the
-   stack pointer, and the stack pointer is free to grow or shrink. A base
-   pointer is also used if
- llvm-gcc is not passed the -fomit-frame-pointer flag. The stack pointer is
- always aligned to 16 bytes, so that space allocated for altivec vectors will
- be properly aligned.</p>
-
-<p>An invocation frame is laid out as follows (low memory at top):</p>
-
-<table class="layout">
- <tr>
- <td>Linkage<br><br></td>
- </tr>
- <tr>
- <td>Parameter area<br><br></td>
- </tr>
- <tr>
- <td>Dynamic area<br><br></td>
- </tr>
- <tr>
- <td>Locals area<br><br></td>
- </tr>
- <tr>
- <td>Saved registers area<br><br></td>
- </tr>
- <tr style="border-style: none hidden none hidden;">
- <td><br></td>
- </tr>
- <tr>
- <td>Previous Frame<br><br></td>
- </tr>
-</table>
-
-<p>The <i>linkage</i> area is used by a callee to save special registers prior
- to allocating its own frame. Only three entries are relevant to LLVM. The
-   first entry is the previous stack pointer (sp), also known as the link. This allows
- probing tools like gdb or exception handlers to quickly scan the frames in
- the stack. A function epilog can also use the link to pop the frame from the
- stack. The third entry in the linkage area is used to save the return
- address from the lr register. Finally, as mentioned above, the last entry is
-   used to save the previous frame pointer (r31). The entries in the linkage
- area are the size of a GPR, thus the linkage area is 24 bytes long in 32 bit
- mode and 48 bytes in 64 bit mode.</p>
-
-<p>32 bit linkage area</p>
-
-<table class="layout">
- <tr>
- <td>0</td>
- <td>Saved SP (r1)</td>
- </tr>
- <tr>
- <td>4</td>
- <td>Saved CR</td>
- </tr>
- <tr>
- <td>8</td>
- <td>Saved LR</td>
- </tr>
- <tr>
- <td>12</td>
- <td>Reserved</td>
- </tr>
- <tr>
- <td>16</td>
- <td>Reserved</td>
- </tr>
- <tr>
- <td>20</td>
- <td>Saved FP (r31)</td>
- </tr>
-</table>
-
-<p>64 bit linkage area</p>
-
-<table class="layout">
- <tr>
- <td>0</td>
- <td>Saved SP (r1)</td>
- </tr>
- <tr>
- <td>8</td>
- <td>Saved CR</td>
- </tr>
- <tr>
- <td>16</td>
- <td>Saved LR</td>
- </tr>
- <tr>
- <td>24</td>
- <td>Reserved</td>
- </tr>
- <tr>
- <td>32</td>
- <td>Reserved</td>
- </tr>
- <tr>
- <td>40</td>
- <td>Saved FP (r31)</td>
- </tr>
-</table>
-
-<p>The <i>parameter area</i> is used to store arguments being passed to a callee
- function. Following the PowerPC ABI, the first few arguments are actually
- passed in registers, with the space in the parameter area unused. However,
- if there are not enough registers or the callee is a thunk or vararg
- function, these register arguments can be spilled into the parameter area.
- Thus, the parameter area must be large enough to store all the parameters for
- the largest call sequence made by the caller. The size must also be
- minimally large enough to spill registers r3-r10. This allows callees blind
- to the call signature, such as thunks and vararg functions, enough space to
- cache the argument registers. Therefore, the parameter area is minimally 32
-   bytes (64 bytes in 64 bit mode). Also note that since the parameter area is
-   a fixed offset from the top of the frame, a callee can access its spilled
-   arguments using fixed offsets from the stack pointer (or base pointer).</p>
-
-<p>Combining the information about the linkage and parameter areas with the
-   alignment requirements, a stack frame is minimally 64 bytes in 32 bit mode
-   and 128 bytes in 64 bit mode.</p>
-
-<p>The <i>dynamic area</i> starts out as size zero. If a function uses dynamic
-   alloca, then space is added to the stack, the linkage and parameter areas
-   are shifted to the top of the stack, and the new space is available
-   immediately below the
- linkage and parameter areas. The cost of shifting the linkage and parameter
- areas is minor since only the link value needs to be copied. The link value
- can be easily fetched by adding the original frame size to the base pointer.
- Note that allocations in the dynamic space need to observe 16 byte
- alignment.</p>
-
-<p>The <i>locals area</i> is where the llvm compiler reserves space for local
- variables.</p>
-
-<p>The <i>saved registers area</i> is where the llvm compiler spills callee
- saved registers on entry to the callee.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ppc_prolog">Prolog/Epilog</a>
-</h4>
-
-<div>
-
-<p>The llvm prolog and epilog are the same as described in the PowerPC ABI, with
- the following exceptions. Callee saved registers are spilled after the frame
- is created. This allows the llvm epilog/prolog support to be common with
- other targets. The base pointer callee saved register r31 is saved in the
- TOC slot of linkage area. This simplifies allocation of space for the base
-   pointer and makes it convenient to locate programmatically and during
- debugging.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ppc_dynamic">Dynamic Allocation</a>
-</h4>
-
-<div>
-
-<p><i>TODO - More to come.</i></p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="ptx">The PTX backend</a>
-</h3>
-
-<div>
-
-<p>The PTX code generator lives in the lib/Target/PTX directory. It is
- currently a work-in-progress, but already supports most of the code
- generation functionality needed to generate correct PTX kernels for
- CUDA devices.</p>
-
-<p>The code generator can target PTX 2.0+ and shader model 1.0+. The
- PTX ISA Reference Manual is used as the primary source of ISA
- information, though an effort is made to make the output of the code
- generator match the output of the NVidia nvcc compiler, whenever
- possible.</p>
-
-<p>Code Generator Options:</p>
-<table border="1" cellspacing="0">
- <tr>
- <th>Option</th>
- <th>Description</th>
- </tr>
- <tr>
- <td><code>double</code></td>
- <td align="left">If enabled, the map_f64_to_f32 directive is
- disabled in the PTX output, allowing native double-precision
- arithmetic</td>
- </tr>
- <tr>
- <td><code>no-fma</code></td>
-    <td align="left">Disable generation of fused multiply-add (FMA)
-    instructions; disabling FMA may be beneficial for some devices</td>
- </tr>
- <tr>
- <td><code>smxy / computexy</code></td>
- <td align="left">Set shader model/compute capability to x.y,
- e.g. sm20 or compute13</td>
- </tr>
-</table>
-
-<p>Working:</p>
-<ul>
- <li>Arithmetic instruction selection (including combo FMA)</li>
- <li>Bitwise instruction selection</li>
- <li>Control-flow instruction selection</li>
- <li>Function calls (only on SM 2.0+ and no return arguments)</li>
-  <li>Address spaces (0 = global, 1 = constant, 2 = local, 4 =
- shared)</li>
- <li>Thread synchronization (bar.sync)</li>
- <li>Special register reads ([N]TID, [N]CTAID, PMx, CLOCK, etc.)</li>
-</ul>
-
-<p>In Progress:</p>
-<ul>
- <li>Robust call instruction selection</li>
- <li>Stack frame allocation</li>
- <li>Device-specific instruction scheduling optimizations</li>
-</ul>
-
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-04-15 22:22:36 +0200 (Sun, 15 Apr 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
new file mode 100644
index 0000000..d1d0231
--- /dev/null
+++ b/docs/CodeGenerator.rst
@@ -0,0 +1,2428 @@
+.. _code_generator:
+
+==========================================
+The LLVM Target-Independent Code Generator
+==========================================
+
+.. role:: raw-html(raw)
+ :format: html
+
+.. raw:: html
+
+ <style>
+ .unknown { background-color: #C0C0C0; text-align: center; }
+ .unknown:before { content: "?" }
+ .no { background-color: #C11B17 }
+ .no:before { content: "N" }
+ .partial { background-color: #F88017 }
+ .yes { background-color: #0F0; }
+ .yes:before { content: "Y" }
+ </style>
+
+.. contents::
+ :local:
+
+.. warning::
+ This is a work in progress.
+
+Introduction
+============
+
+The LLVM target-independent code generator is a framework that provides a suite
+of reusable components for translating the LLVM internal representation to the
+machine code for a specified target---either in assembly form (suitable for a
+static compiler) or in binary machine code format (usable for a JIT
+compiler). The LLVM target-independent code generator consists of six main
+components:
+
+1. `Abstract target description`_ interfaces which capture important properties
+ about various aspects of the machine, independently of how they will be used.
+ These interfaces are defined in ``include/llvm/Target/``.
+
+2. Classes used to represent the `code being generated`_ for a target. These
+ classes are intended to be abstract enough to represent the machine code for
+ *any* target machine. These classes are defined in
+ ``include/llvm/CodeGen/``. At this level, concepts like "constant pool
+ entries" and "jump tables" are explicitly exposed.
+
+3. Classes and algorithms used to represent code at the object file level, the
+ `MC Layer`_. These classes represent assembly level constructs like labels,
+ sections, and instructions. At this level, concepts like "constant pool
+ entries" and "jump tables" don't exist.
+
+4. `Target-independent algorithms`_ used to implement various phases of native
+ code generation (register allocation, scheduling, stack frame representation,
+ etc). This code lives in ``lib/CodeGen/``.
+
+5. `Implementations of the abstract target description interfaces`_ for
+ particular targets. These machine descriptions make use of the components
+ provided by LLVM, and can optionally provide custom target-specific passes,
+ to build complete code generators for a specific target. Target descriptions
+ live in ``lib/Target/``.
+
+6. The target-independent JIT components. The LLVM JIT is completely target
+   independent (it uses the ``TargetJITInfo`` structure to interface to
+   target-specific code). The code for the target-independent JIT lives in
+ ``lib/ExecutionEngine/JIT``.
+
+Depending on which part of the code generator you are interested in working on,
+different pieces of this will be useful to you. In any case, you should be
+familiar with the `target description`_ and `machine code representation`_
+classes. If you want to add a backend for a new target, you will need to
+`implement the target description`_ classes for your new target and understand
+the `LLVM code representation <LangRef.html>`_. If you are interested in
+implementing a new `code generation algorithm`_, it should only depend on the
+target-description and machine code representation classes, ensuring that it is
+portable.
+
+Required components in the code generator
+-----------------------------------------
+
+The two pieces of the LLVM code generator are the high-level interface to the
+code generator and the set of reusable components that can be used to build
+target-specific backends. The two most important interfaces (:raw-html:`<tt>`
+`TargetMachine`_ :raw-html:`</tt>` and :raw-html:`<tt>` `TargetData`_
+:raw-html:`</tt>`) are the only ones that are required to be defined for a
+backend to fit into the LLVM system, but the others must be defined if the
+reusable code generator components are going to be used.
+
+This design has two important implications. The first is that LLVM can support
+completely non-traditional code generation targets. For example, the C backend
+does not require register allocation, instruction selection, or any of the other
+standard components provided by the system. As such, it only implements these
+two interfaces, and does its own thing. Note that the C backend was removed
+from trunk after the LLVM 3.1 release. Another example of such a code generator is a
+(purely hypothetical) backend that converts LLVM to the GCC RTL form and uses
+GCC to emit machine code for a target.
+
+This design also implies that it is possible to design and implement radically
+different code generators in the LLVM system that do not make use of any of the
+built-in components. Doing so is not recommended at all, but could be required
+for radically different targets that do not fit into the LLVM machine
+description model: FPGAs for example.
+
+.. _high-level design of the code generator:
+
+The high-level design of the code generator
+-------------------------------------------
+
+The LLVM target-independent code generator is designed to support efficient and
+quality code generation for standard register-based microprocessors. Code
+generation in this model is divided into the following stages:
+
+1. `Instruction Selection`_ --- This phase determines an efficient way to
+ express the input LLVM code in the target instruction set. This stage
+ produces the initial code for the program in the target instruction set, then
+ makes use of virtual registers in SSA form and physical registers that
+ represent any required register assignments due to target constraints or
+ calling conventions. This step turns the LLVM code into a DAG of target
+ instructions.
+
+2. `Scheduling and Formation`_ --- This phase takes the DAG of target
+ instructions produced by the instruction selection phase, determines an
+ ordering of the instructions, then emits the instructions as :raw-html:`<tt>`
+ `MachineInstr`_\s :raw-html:`</tt>` with that ordering. Note that we
+ describe this in the `instruction selection section`_ because it operates on
+ a `SelectionDAG`_.
+
+3. `SSA-based Machine Code Optimizations`_ --- This optional stage consists of a
+ series of machine-code optimizations that operate on the SSA-form produced by
+ the instruction selector. Optimizations like modulo-scheduling or peephole
+ optimization work here.
+
+4. `Register Allocation`_ --- The target code is transformed from an infinite
+ virtual register file in SSA form to the concrete register file used by the
+ target. This phase introduces spill code and eliminates all virtual register
+ references from the program.
+
+5. `Prolog/Epilog Code Insertion`_ --- Once the machine code has been generated
+ for the function and the amount of stack space required is known (used for
+ LLVM alloca's and spill slots), the prolog and epilog code for the function
+ can be inserted and "abstract stack location references" can be eliminated.
+ This stage is responsible for implementing optimizations like frame-pointer
+ elimination and stack packing.
+
+6. `Late Machine Code Optimizations`_ --- Optimizations that operate on "final"
+ machine code can go here, such as spill code scheduling and peephole
+ optimizations.
+
+7. `Code Emission`_ --- The final stage actually puts out the code for the
+ current function, either in the target assembler format or in machine
+ code.
+
+The code generator is based on the assumption that the instruction selector will
+use an optimal pattern matching selector to create high-quality sequences of
+native instructions. Alternative code generator designs based on pattern
+expansion and aggressive iterative peephole optimization are much slower. This
+design permits efficient compilation (important for JIT environments) and
+aggressive optimization (used when generating code offline) by allowing
+components of varying levels of sophistication to be used for any step of
+compilation.
+
+In addition to these stages, target implementations can insert arbitrary
+target-specific passes into the flow. For example, the X86 target uses a
+special pass to handle the 80x87 floating point stack architecture. Other
+targets with unusual requirements can be supported with custom passes as needed.
+
+Using TableGen for target description
+-------------------------------------
+
+The target description classes require a detailed description of the target
+architecture. These target descriptions often have a large amount of common
+information (e.g., an ``add`` instruction is almost identical to a ``sub``
+instruction). In order to allow the maximum amount of commonality to be
+factored out, the LLVM code generator uses the
+`TableGen <TableGenFundamentals.html>`_ tool to describe big chunks of the
+target machine, which allows the use of domain-specific and target-specific
+abstractions to reduce the amount of repetition.
+
+As LLVM continues to be developed and refined, we plan to move more and more of
+the target description to the ``.td`` form. Doing so gives us a number of
+advantages. The most important is that it makes it easier to port LLVM because
+it reduces the amount of C++ code that has to be written, and the surface area
+of the code generator that needs to be understood before someone can get
+something working. Second, it makes it easier to change things. In particular,
+if tables and other things are all emitted by ``tblgen``, we only need a change
+in one place (``tblgen``) to update all of the targets to a new interface.
+
+.. _Abstract target description:
+.. _target description:
+
+Target description classes
+==========================
+
+The LLVM target description classes (located in the ``include/llvm/Target``
+directory) provide an abstract description of the target machine independent of
+any particular client. These classes are designed to capture the *abstract*
+properties of the target (such as the instructions and registers it has), and do
+not incorporate any particular pieces of code generation algorithms.
+
+All of the target description classes (except the :raw-html:`<tt>` `TargetData`_
+:raw-html:`</tt>` class) are designed to be subclassed by the concrete target
+implementation, and have virtual methods implemented. To get to these
+implementations, the :raw-html:`<tt>` `TargetMachine`_ :raw-html:`</tt>` class
+provides accessors that should be implemented by the target.
+
+.. _TargetMachine:
+
+The ``TargetMachine`` class
+---------------------------
+
+The ``TargetMachine`` class provides virtual methods that are used to access the
+target-specific implementations of the various target description classes via
+the ``get*Info`` methods (``getInstrInfo``, ``getRegisterInfo``,
+``getFrameInfo``, etc.). This class is designed to be specialized by a concrete
+target implementation (e.g., ``X86TargetMachine``) which implements the various
+virtual methods. The only required target description class is the
+:raw-html:`<tt>` `TargetData`_ :raw-html:`</tt>` class, but if the code
+generator components are to be used, the other interfaces should be implemented
+as well.
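+
+A minimal sketch of how clients reach the target-specific implementations
+(assuming an already-constructed ``TargetMachine`` pointer ``TM``):
+
+.. code-block:: c++
+
+  const TargetInstrInfo *TII = TM->getInstrInfo();       // instruction set info
+  const TargetRegisterInfo *TRI = TM->getRegisterInfo(); // register file info
+  const TargetData *TD = TM->getTargetData();            // memory layout info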
+
+.. _TargetData:
+
+The ``TargetData`` class
+------------------------
+
+The ``TargetData`` class is the only required target description class, and it
+is the only class that is not extensible (you cannot derive a new class from
+it). ``TargetData`` specifies information about how the target lays out memory
+for structures, the alignment requirements for various data types, the size of
+pointers in the target, and whether the target is little-endian or
+big-endian.
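+
+For illustration, a few of the queries ``TargetData`` answers (a hedged sketch;
+``TD`` is a ``TargetData`` pointer and ``Ty`` is some LLVM ``Type*``):
+
+.. code-block:: c++
+
+  unsigned PtrSize = TD->getPointerSize();      // pointer size in bytes
+  bool IsLittle = TD->isLittleEndian();         // byte order of the target
+  uint64_t Bytes = TD->getTypeAllocSize(Ty);    // in-memory size of Ty
+  unsigned Align = TD->getABITypeAlignment(Ty); // required ABI alignment of Ty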
+
+.. _targetlowering:
+
+The ``TargetLowering`` class
+----------------------------
+
+The ``TargetLowering`` class is used by SelectionDAG based instruction selectors
+primarily to describe how LLVM code should be lowered to SelectionDAG
+operations. Among other things, this class indicates:
+
+* an initial register class to use for various ``ValueType``\s,
+
+* which operations are natively supported by the target machine,
+
+* the return type of ``setcc`` operations,
+
+* the type to use for shift amounts, and
+
+* various high-level characteristics, like whether it is profitable to turn
+  division by a constant into a multiplication sequence; a configuration
+  sketch follows below.
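+
+A hedged sketch of how a hypothetical target might configure these properties
+in its ``TargetLowering`` constructor (``Foo`` and its register class are
+illustrative, not a real target):
+
+.. code-block:: c++
+
+  // Inside FooTargetLowering's constructor:
+  addRegisterClass(MVT::i32, &Foo::GPRRegClass);     // register class for i32
+  setOperationAction(ISD::SDIV, MVT::i32, Expand);   // no native sdiv; expand it
+  setOperationAction(ISD::SELECT, MVT::i32, Custom); // custom-lower select
+  computeRegisterProperties(); // derive remaining register/type information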
+
+The ``TargetRegisterInfo`` class
+--------------------------------
+
+The ``TargetRegisterInfo`` class is used to describe the register file of the
+target and any interactions between the registers.
+
+Registers in the code generator are represented by
+unsigned integers. Physical registers (those that actually exist in the target
+description) are unique small numbers, and virtual registers are generally
+large. Note that register ``#0`` is reserved as a flag value.
+
+Each register in the processor description has an associated
+``TargetRegisterDesc`` entry, which provides a textual name for the register
+(used for assembly output and debugging dumps) and a set of aliases (used to
+indicate whether one register overlaps with another).
+
+In addition to the per-register description, the ``TargetRegisterInfo`` class
+exposes a set of processor specific register classes (instances of the
+``TargetRegisterClass`` class). Each register class contains sets of registers
+that have the same properties (for example, they are all 32-bit integer
+registers). Each SSA virtual register created by the instruction selector has
+an associated register class. When the register allocator runs, it replaces
+virtual registers with a physical register in the set.
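+
+For example, the instruction selector obtains fresh virtual registers from a
+register class through ``MachineRegisterInfo`` (a sketch; ``MF`` and ``RC`` are
+an existing ``MachineFunction`` and ``TargetRegisterClass*``):
+
+.. code-block:: c++
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned VReg = MRI.createVirtualRegister(RC); // new SSA virtual register
+  assert(TargetRegisterInfo::isVirtualRegister(VReg) &&
+         "registers stay virtual until the allocator assigns physicals");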
+
+The target-specific implementations of these classes are auto-generated from a
+`TableGen <TableGenFundamentals.html>`_ description of the register file.
+
+.. _TargetInstrInfo:
+
+The ``TargetInstrInfo`` class
+-----------------------------
+
+The ``TargetInstrInfo`` class is used to describe the machine instructions
+supported by the target. It is essentially an array of ``TargetInstrDescriptor``
+objects, each of which describes one instruction the target
+supports. Descriptors define things like the mnemonic for the opcode, the number
+of operands, the list of implicit register uses and defs, whether the
+instruction has certain target-independent properties (accesses memory, is
+commutable, etc), and holds any target-specific flags.
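+
+In the current tree these descriptors are ``MCInstrDesc`` objects; a sketch of
+querying one (``TII`` is a ``TargetInstrInfo*`` and ``Opcode`` a generated
+opcode enum value):
+
+.. code-block:: c++
+
+  const MCInstrDesc &Desc = TII->get(Opcode);
+  unsigned NumOps = Desc.getNumOperands(); // declared operand count
+  bool Loads = Desc.mayLoad();             // target-independent property flags
+  bool Stores = Desc.mayStore();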
+
+The ``TargetFrameInfo`` class
+-----------------------------
+
+The ``TargetFrameInfo`` class is used to provide information about the stack
+frame layout of the target. It holds the direction of stack growth, the known
+stack alignment on entry to each function, and the offset to the local area.
+The offset to the local area is the offset from the stack pointer on function
+entry to the first location where function data (local variables, spill
+locations) can be stored.
+
+The ``TargetSubtarget`` class
+-----------------------------
+
+The ``TargetSubtarget`` class is used to provide information about the specific
+chip set being targeted. A sub-target informs code generation of which
+instructions are supported, their latencies, and their execution
+itinerary; i.e., which processing units are used, in what order, and for how
+long.
+
+The ``TargetJITInfo`` class
+---------------------------
+
+The ``TargetJITInfo`` class exposes an abstract interface used by the
+Just-In-Time code generator to perform target-specific activities, such as
+emitting stubs. If a ``TargetMachine`` supports JIT code generation, it should
+provide one of these objects through the ``getJITInfo`` method.
+
+.. _code being generated:
+.. _machine code representation:
+
+Machine code description classes
+================================
+
+At the high-level, LLVM code is translated to a machine specific representation
+formed out of :raw-html:`<tt>` `MachineFunction`_ :raw-html:`</tt>`,
+:raw-html:`<tt>` `MachineBasicBlock`_ :raw-html:`</tt>`, and :raw-html:`<tt>`
+`MachineInstr`_ :raw-html:`</tt>` instances (defined in
+``include/llvm/CodeGen``). This representation is completely target agnostic,
+representing instructions in their most abstract form: an opcode and a series of
+operands. This representation is designed to support both an SSA representation
+for machine code, as well as a register allocated, non-SSA form.
+
+.. _MachineInstr:
+
+The ``MachineInstr`` class
+--------------------------
+
+Target machine instructions are represented as instances of the ``MachineInstr``
+class. This class is an extremely abstract way of representing machine
+instructions. In particular, it only keeps track of an opcode number and a set
+of operands.
+
+The opcode number is a simple unsigned integer that only has meaning to a
+specific backend. All of the instructions for a target should be defined in the
+``*InstrInfo.td`` file for the target. The opcode enum values are auto-generated
+from this description. The ``MachineInstr`` class does not have any information
+about how to interpret the instruction (i.e., what the semantics of the
+instruction are); for that you must refer to the :raw-html:`<tt>`
+`TargetInstrInfo`_ :raw-html:`</tt>` class.
+
+The operands of a machine instruction can be of several different types: a
+register reference, a constant integer, a basic block reference, etc. In
+addition, a machine operand should be marked as a def or a use of the value
+(though only registers are allowed to be defs).
+
+By convention, the LLVM code generator orders instruction operands so that all
+register definitions come before the register uses, even on architectures that
+are normally printed in other orders. For example, the SPARC add instruction:
+"``add %i1, %i2, %i3``" adds the "%i1" and "%i2" registers and stores the
+result into the "%i3" register. In the LLVM code generator, the operands should
+be stored as "``%i3, %i1, %i2``": with the destination first.
+
+Keeping destination (definition) operands at the beginning of the operand list
+has several advantages. In particular, the debugging printer will print the
+instruction like this:
+
+.. code-block:: llvm
+
+  %i3 = add %i1, %i2
+
+Also if the first operand is a def, it is easier to `create instructions`_ whose
+only def is the first operand.
+
+.. _create instructions:
+
+Using the ``MachineInstrBuilder.h`` functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Machine instructions are created by using the ``BuildMI`` functions, located in
+the ``include/llvm/CodeGen/MachineInstrBuilder.h`` file. The ``BuildMI``
+functions make it easy to build arbitrary machine instructions. Usage of the
+``BuildMI`` functions looks like this:
+
+.. code-block:: c++
+
+ // Create a 'DestReg = mov 42' (rendered in X86 assembly as 'mov DestReg, 42')
+ // instruction. The '1' specifies how many operands will be added.
+ MachineInstr *MI = BuildMI(X86::MOV32ri, 1, DestReg).addImm(42);
+
+ // Create the same instr, but insert it at the end of a basic block.
+  MachineBasicBlock &MBB = ...
+ BuildMI(MBB, X86::MOV32ri, 1, DestReg).addImm(42);
+
+ // Create the same instr, but insert it before a specified iterator point.
+ MachineBasicBlock::iterator MBBI = ...
+ BuildMI(MBB, MBBI, X86::MOV32ri, 1, DestReg).addImm(42);
+
+ // Create a 'cmp Reg, 0' instruction, no destination reg.
+ MI = BuildMI(X86::CMP32ri, 2).addReg(Reg).addImm(0);
+
+ // Create an 'sahf' instruction which takes no operands and stores nothing.
+ MI = BuildMI(X86::SAHF, 0);
+
+ // Create a self looping branch instruction.
+  BuildMI(MBB, X86::JNE, 1).addMBB(&MBB);
+
+The key thing to remember with the ``BuildMI`` functions is that you have to
+specify the number of operands that the machine instruction will take. This
+allows for efficient memory allocation. Also note that operands are, by
+default, uses of values, not definitions. If you need to add a definition
+operand (other than the optional destination register), you must explicitly mark
+it as such:
+
+.. code-block:: c++
+
+ MI.addReg(Reg, RegState::Define);
+
+Fixed (preassigned) registers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One important issue that the code generator needs to be aware of is the presence
+of fixed registers. In particular, there are often places in the instruction
+stream where the register allocator *must* arrange for a particular value to be
+in a particular register. This can occur due to limitations of the instruction
+set (e.g., the X86 can only do a 32-bit divide with the ``EAX``/``EDX``
+registers), or external factors like calling conventions. In any case, the
+instruction selector should emit code that copies a virtual register into or out
+of a physical register when needed.
+
+For example, consider this simple LLVM example:
+
+.. code-block:: llvm
+
+ define i32 @test(i32 %X, i32 %Y) {
+ %Z = udiv i32 %X, %Y
+ ret i32 %Z
+ }
+
+The X86 instruction selector produces this machine code for the ``div`` and
+``ret`` (use "``llc X.bc -march=x86 -print-machineinstrs``" to get this):
+
+.. code-block:: llvm
+
+ ;; Start of div
+ %EAX = mov %reg1024 ;; Copy X (in reg1024) into EAX
+ %reg1027 = sar %reg1024, 31
+ %EDX = mov %reg1027 ;; Sign extend X into EDX
+ idiv %reg1025 ;; Divide by Y (in reg1025)
+ %reg1026 = mov %EAX ;; Read the result (Z) out of EAX
+
+ ;; Start of ret
+ %EAX = mov %reg1026 ;; 32-bit return value goes in EAX
+ ret
+
+By the end of code generation, the register allocator has coalesced the
+registers and deleted the resultant identity moves, producing the following
+code:
+
+.. code-block:: llvm
+
+ ;; X is in EAX, Y is in ECX
+ mov %EAX, %EDX
+ sar %EDX, 31
+ idiv %ECX
+ ret
+
+This approach is extremely general (if it can handle the X86 architecture, it
+can handle anything!) and allows all of the target specific knowledge about the
+instruction stream to be isolated in the instruction selector. Note that
+physical registers should have a short lifetime for good code generation, and
+all physical registers are assumed dead on entry to and exit from basic blocks
+(before register allocation). Thus, if you need a value to be live across basic
+block boundaries, it *must* live in a virtual register.
+
+Call-clobbered registers
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Some machine instructions, like calls, clobber a large number of physical
+registers. Rather than adding ``<def,dead>`` operands for all of them, it is
+possible to use an ``MO_RegisterMask`` operand instead. The register mask
+operand holds a bit mask of preserved registers, and everything else is
+considered to be clobbered by the instruction.
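+
+For instance, a call might be given its mask like this (a sketch; ``MIB`` is
+the ``MachineInstrBuilder`` for the call and ``CC`` its calling convention):
+
+.. code-block:: c++
+
+  // Everything not preserved by CC is treated as clobbered by the call.
+  MIB.addRegMask(TRI->getCallPreservedMask(CC));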
+
+Machine code in SSA form
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+``MachineInstr``'s are initially selected in SSA-form, and are maintained in
+SSA-form until register allocation happens. For the most part, this is
+trivially simple since LLVM is already in SSA form; LLVM PHI nodes become
+machine code PHI nodes, and virtual registers are only allowed to have a single
+definition.
+
+After register allocation, machine code is no longer in SSA-form because there
+are no virtual registers left in the code.
+
+.. _MachineBasicBlock:
+
+The ``MachineBasicBlock`` class
+-------------------------------
+
+The ``MachineBasicBlock`` class contains a list of machine instructions
+(:raw-html:`<tt>` `MachineInstr`_ :raw-html:`</tt>` instances). It roughly
+corresponds to the LLVM code input to the instruction selector, but there can be
+a one-to-many mapping (i.e. one LLVM basic block can map to multiple machine
+basic blocks). The ``MachineBasicBlock`` class has a "``getBasicBlock``" method,
+which returns the LLVM basic block that it comes from.
+
+.. _MachineFunction:
+
+The ``MachineFunction`` class
+-----------------------------
+
+The ``MachineFunction`` class contains a list of machine basic blocks
+(:raw-html:`<tt>` `MachineBasicBlock`_ :raw-html:`</tt>` instances). It
+corresponds one-to-one with the LLVM function input to the instruction selector.
+In addition to a list of basic blocks, the ``MachineFunction`` contains a
+``MachineConstantPool``, a ``MachineFrameInfo``, a ``MachineFunctionInfo``, and
+a ``MachineRegisterInfo``. See ``include/llvm/CodeGen/MachineFunction.h`` for
+more information.
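+
+As a quick orientation, this hedged sketch (the function name ``inspect`` is
+hypothetical) shows how a pass typically reaches those per-function side
+tables:
+
+.. code-block:: c++
+
+  #include "llvm/CodeGen/MachineConstantPool.h"
+  #include "llvm/CodeGen/MachineFrameInfo.h"
+  #include "llvm/CodeGen/MachineFunction.h"
+  #include "llvm/CodeGen/MachineRegisterInfo.h"
+
+  using namespace llvm;
+
+  void inspect(MachineFunction &MF) {
+    MachineFrameInfo *MFI = MF.getFrameInfo();       // stack frame layout
+    MachineConstantPool *MCP = MF.getConstantPool(); // constants in memory
+    MachineRegisterInfo &MRI = MF.getRegInfo();      // register def/use info
+    (void)MFI; (void)MCP; (void)MRI;
+  }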
+
+``MachineInstr Bundles``
+------------------------
+
+The LLVM code generator can model sequences of instructions as MachineInstr
+bundles. A MI bundle can model a VLIW group / pack which contains an arbitrary
+number of parallel instructions. It can also be used to model a sequential list
+of instructions (potentially with data dependencies) that cannot be legally
+separated (e.g. ARM Thumb2 IT blocks).
+
+Conceptually a MI bundle is a MI with a number of other MIs nested within:
+
+::
+
+   --------------
+   |   Bundle   | ---------
+   --------------          \
+          |           ----------------
+          |           |      MI      |
+          |           ----------------
+          |                  |
+          |           ----------------
+          |           |      MI      |
+          |           ----------------
+          |                  |
+          |           ----------------
+          |           |      MI      |
+          |           ----------------
+          |
+   --------------
+   |   Bundle   | --------
+   --------------         \
+          |           ----------------
+          |           |      MI      |
+          |           ----------------
+          |                  |
+          |           ----------------
+          |           |      MI      |
+          |           ----------------
+          |                  |
+          |                 ...
+          |
+   --------------
+   |   Bundle   | --------
+   --------------         \
+          |
+         ...
+
+MI bundle support does not change the physical representations of
+MachineBasicBlock and MachineInstr. All the MIs (including top level and nested
+ones) are stored as a sequential list of MIs. The "bundled" MIs are marked with
+the 'InsideBundle' flag. A top level MI with the special BUNDLE opcode is used
+to represent the start of a bundle. It's legal to mix BUNDLE MIs with
+individual MIs that are not inside bundles and do not themselves represent
+bundles.
+
+MachineInstr passes should operate on a MI bundle as a single unit. Member
+methods have been taught to correctly handle bundles and MIs inside bundles.
+The MachineBasicBlock iterator has been modified to skip over bundled MIs to
+enforce the bundle-as-a-single-unit concept. An alternative iterator
+instr_iterator has been added to MachineBasicBlock to allow passes to iterate
+over all of the MIs in a MachineBasicBlock, including those which are nested
+inside bundles. The top level BUNDLE instruction must have the correct set of
+register MachineOperand's that represent the cumulative inputs and outputs of
+the bundled MIs.
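+
+The difference between the two iterators can be seen in this minimal sketch
+(the function name ``countMIs`` is hypothetical):
+
+.. code-block:: c++
+
+  #include "llvm/CodeGen/MachineBasicBlock.h"
+
+  using namespace llvm;
+
+  // The normal iterator sees each bundle as a single unit, while
+  // instr_iterator also visits the MIs nested inside bundles.
+  void countMIs(MachineBasicBlock &MBB, unsigned &Units, unsigned &All) {
+    Units = All = 0;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I)
+      ++Units;  // one per top-level MI or BUNDLE
+    for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(),
+         E = MBB.instr_end(); I != E; ++I)
+      ++All;    // counts bundled MIs too
+  }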
+
+Packing / bundling of MachineInstr's should be done as part of the register
+allocation super-pass. More specifically, the pass which determines what MIs
+should be bundled together must be run after the code generator exits SSA form
+(i.e. after two-address pass, PHI elimination, and copy coalescing). Bundles
+should only be finalized (i.e. adding BUNDLE MIs and input and output register
+MachineOperands) after virtual registers have been rewritten into physical
+registers. This requirement eliminates the need to add virtual register operands
+to BUNDLE instructions which would effectively double the virtual register def
+and use lists.
+
+.. _MC Layer:
+
+The "MC" Layer
+==============
+
+The MC Layer is used to represent and process code at the raw machine code
+level, devoid of "high level" information like "constant pools", "jump tables",
+"global variables" or anything like that. At this level, LLVM handles things
+like label names, machine instructions, and sections in the object file. The
+code in this layer is used for a number of important purposes: the tail end of
+the code generator uses it to write a .s or .o file, and it is also used by the
+llvm-mc tool to implement standalone machine code assemblers and disassemblers.
+
+This section describes some of the important classes. There are also a number
+of important subsystems that interact at this layer; they are described later
+in this manual.
+
+.. _MCStreamer:
+
+The ``MCStreamer`` API
+----------------------
+
+MCStreamer is best thought of as an assembler API. It is an abstract API which
+is *implemented* in different ways (e.g. to output a .s file, output an ELF .o
+file, etc) but whose API corresponds directly to what you see in a .s file.
+MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute,
+SwitchSection, EmitValue (for .byte, .word), etc, which directly correspond to
+assembly level directives. It also has an EmitInstruction method, which is used
+to output an MCInst to the streamer.
+
+This API is most important for two clients: the llvm-mc stand-alone assembler is
+effectively a parser that parses a line, then invokes a method on MCStreamer. In
+the code generator, the `Code Emission`_ phase of the code generator lowers
+higher level LLVM IR and Machine* constructs down to the MC layer, emitting
+directives through MCStreamer.
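+
+As a flavor of the API, here is a hedged sketch (the function name ``emitFoo``
+is hypothetical; the caller is assumed to supply a fully configured streamer
+and context) that emits a label followed by a ``.byte`` directive:
+
+.. code-block:: c++
+
+  #include "llvm/MC/MCContext.h"
+  #include "llvm/MC/MCStreamer.h"
+
+  using namespace llvm;
+
+  // Emits the equivalent of:
+  //   foo:
+  //           .byte 4
+  void emitFoo(MCStreamer &Out, MCContext &Ctx) {
+    MCSymbol *Foo = Ctx.GetOrCreateSymbol(StringRef("foo"));
+    Out.EmitLabel(Foo);      // "foo:"
+    Out.EmitIntValue(4, 1);  // ".byte 4"
+  }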
+
+On the implementation side of MCStreamer, there are two major implementations:
+one for writing out a .s file (MCAsmStreamer), and one for writing out a .o
+file (MCObjectStreamer). MCAsmStreamer is a straight-forward implementation
+that prints out a directive for each method (e.g. ``EmitValue -> .byte``), but
+MCObjectStreamer implements a full assembler.
+
+The ``MCContext`` class
+-----------------------
+
+The MCContext class is the owner of a variety of uniqued data structures at the
+MC layer, including symbols, sections, etc. As such, this is the class that you
+interact with to create symbols and sections. This class cannot be subclassed.
+
+The ``MCSymbol`` class
+----------------------
+
+The MCSymbol class represents a symbol (aka label) in the assembly file. There
+are two interesting kinds of symbols: assembler temporary symbols, and normal
+symbols. Assembler temporary symbols are used and processed by the assembler
+but are discarded when the object file is produced. The distinction is usually
+represented by adding a prefix to the label; for example, "L" labels are
+assembler temporary labels in MachO.
+
+MCSymbols are created by MCContext and uniqued there. This means that MCSymbols
+can be compared for pointer equivalence to find out if they are the same symbol.
+Note that pointer inequality does not guarantee the labels will end up at
+different addresses though. It's perfectly legal to output something like this
+to the .s file:
+
+::
+
+ foo:
+ bar:
+ .byte 4
+
+In this case, both the foo and bar symbols will have the same address.
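+
+The uniquing behavior can be demonstrated with a small sketch (the function
+name ``sameSymbol`` is hypothetical):
+
+.. code-block:: c++
+
+  #include "llvm/MC/MCContext.h"
+
+  using namespace llvm;
+
+  // Because MCContext uniques symbols, asking for "foo" twice yields the
+  // same MCSymbol*, so pointer comparison is a valid identity test.
+  bool sameSymbol(MCContext &Ctx) {
+    MCSymbol *A = Ctx.GetOrCreateSymbol(StringRef("foo"));
+    MCSymbol *B = Ctx.GetOrCreateSymbol(StringRef("foo"));
+    return A == B;  // always true
+  }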
+
+The ``MCSection`` class
+-----------------------
+
+The ``MCSection`` class represents an object-file specific section. It is
+subclassed by object file specific implementations (e.g. ``MCSectionMachO``,
+``MCSectionCOFF``, ``MCSectionELF``) and these are created and uniqued by
+MCContext. The MCStreamer has a notion of the current section, which can be
+changed with the SwitchToSection method (which corresponds to a ".section"
+directive in a .s file).
+
+.. _MCInst:
+
+The ``MCInst`` class
+--------------------
+
+The ``MCInst`` class is a target-independent representation of an instruction.
+It is a simple class (much more so than `MachineInstr`_) that holds a
+target-specific opcode and a vector of MCOperands. MCOperand, in turn, is a
+simple discriminated union of three cases: 1) a simple immediate, 2) a target
+register ID, 3) a symbolic expression (e.g. "``Lfoo-Lbar+42``") as an MCExpr.
+
+MCInst is the common currency used to represent machine instructions at the MC
+layer. It is the type used by the instruction encoder, the instruction printer,
+and the type generated by the assembly parser and disassembler.
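+
+Building one by hand is straightforward; this hedged sketch uses placeholder
+opcode and register numbers (``Opc``, ``Dst``, ``Src``) rather than real
+target enums:
+
+.. code-block:: c++
+
+  #include "llvm/MC/MCInst.h"
+
+  using namespace llvm;
+
+  MCInst buildInst(unsigned Opc, unsigned Dst, unsigned Src) {
+    MCInst Inst;
+    Inst.setOpcode(Opc);                         // target-specific opcode
+    Inst.addOperand(MCOperand::CreateReg(Dst));  // destination register
+    Inst.addOperand(MCOperand::CreateReg(Src));  // source register
+    Inst.addOperand(MCOperand::CreateImm(42));   // immediate operand
+    return Inst;
+  }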
+
+.. _Target-independent algorithms:
+.. _code generation algorithm:
+
+Target-independent code generation algorithms
+=============================================
+
+This section documents the phases described in the `high-level design of the
+code generator`_. It explains how they work and some of the rationale behind
+their design.
+
+.. _Instruction Selection:
+.. _instruction selection section:
+
+Instruction Selection
+---------------------
+
+Instruction Selection is the process of translating LLVM code presented to the
+code generator into target-specific machine instructions. There are several
+well-known ways to do this in the literature. LLVM uses a SelectionDAG based
+instruction selector.
+
+Portions of the DAG instruction selector are generated from the target
+description (``*.td``) files. Our goal is for the entire instruction selector
+to be generated from these ``.td`` files, though currently there are still
+things that require custom C++ code.
+
+.. _SelectionDAG:
+
+Introduction to SelectionDAGs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The SelectionDAG provides an abstraction for code representation in a way that
+is amenable to instruction selection using automatic techniques
+(e.g. dynamic-programming based optimal pattern matching selectors). It is also
+well-suited to other phases of code generation; in particular, instruction
+scheduling (SelectionDAG's are very close to scheduling DAGs post-selection).
+Additionally, the SelectionDAG provides a host representation where a large
+variety of very-low-level (but target-independent) `optimizations`_ may be
+performed; ones which require extensive information about the instructions
+efficiently supported by the target.
+
+The SelectionDAG is a Directed-Acyclic-Graph whose nodes are instances of the
+``SDNode`` class. The primary payload of the ``SDNode`` is its operation code
+(Opcode), which indicates what operation the node performs, together with its
+operands. The various operation node types are described at the top of the
+``include/llvm/CodeGen/SelectionDAGNodes.h`` file.
+
+Although most operations define a single value, each node in the graph may
+define multiple values. For example, a combined div/rem operation will define
+both the quotient and the remainder. Many other situations require multiple
+values as well. Each node also has some number of operands, which are edges to
+the node defining the used value. Because nodes may define multiple values,
+edges are represented by instances of the ``SDValue`` class, which is a
+``<SDNode, unsigned>`` pair, indicating the node and result value being used,
+respectively. Each value produced by an ``SDNode`` has an associated ``MVT``
+(Machine Value Type) indicating what the type of the value is.
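+
+This pairing is easy to see in code; the following hedged sketch (the function
+name ``inspectResults`` is hypothetical) names each result of a node
+explicitly:
+
+.. code-block:: c++
+
+  #include "llvm/CodeGen/SelectionDAG.h"
+
+  using namespace llvm;
+
+  // For a node producing several values (e.g. a combined div/rem), result 0
+  // and result 1 are distinct SDValues sharing the same SDNode.
+  void inspectResults(SDNode *N) {
+    for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+      SDValue V(N, i);            // <node, result number> pair
+      EVT VT = V.getValueType();  // type of this particular result
+      (void)VT;
+    }
+  }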
+
+SelectionDAGs contain two different kinds of values: those that represent data
+flow and those that represent control flow dependencies. Data values are simple
+edges with an integer or floating point value type. Control edges are
+represented as "chain" edges which are of type ``MVT::Other``. These edges
+provide an ordering between nodes that have side effects (such as loads, stores,
+calls, returns, etc). All nodes that have side effects should take a token
+chain as input and produce a new one as output. By convention, token chain
+inputs are always operand #0, and chain results are always the last value
+produced by an operation.
+
+A SelectionDAG has designated "Entry" and "Root" nodes. The Entry node is
+always a marker node with an Opcode of ``ISD::EntryToken``. The Root node is
+the final side-effecting node in the token chain. For example, in a single basic
+block function it would be the return node.
+
+One important concept for SelectionDAGs is the notion of a "legal" vs.
+"illegal" DAG. A legal DAG for a target is one that only uses supported
+operations and supported types. On a 32-bit PowerPC, for example, a DAG with a
+value of type i1, i8, i16, or i64 would be illegal, as would a DAG that uses a
+SREM or UREM operation. The `legalize types`_ and `legalize operations`_ phases
+are responsible for turning an illegal DAG into a legal DAG.
+
+SelectionDAG Instruction Selection Process
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+SelectionDAG-based instruction selection consists of the following steps:
+
+#. `Build initial DAG`_ --- This stage performs a simple translation from the
+ input LLVM code to an illegal SelectionDAG.
+
+#. `Optimize SelectionDAG`_ --- This stage performs simple optimizations on the
+ SelectionDAG to simplify it, and recognize meta instructions (like rotates
+ and ``div``/``rem`` pairs) for targets that support these meta operations.
+ This makes the resultant code more efficient and the `select instructions
+ from DAG`_ phase (below) simpler.
+
+#. `Legalize SelectionDAG Types`_ --- This stage transforms SelectionDAG nodes
+ to eliminate any types that are unsupported on the target.
+
+#. `Optimize SelectionDAG`_ --- The SelectionDAG optimizer is run to clean up
+ redundancies exposed by type legalization.
+
+#. `Legalize SelectionDAG Ops`_ --- This stage transforms SelectionDAG nodes to
+ eliminate any operations that are unsupported on the target.
+
+#. `Optimize SelectionDAG`_ --- The SelectionDAG optimizer is run to eliminate
+ inefficiencies introduced by operation legalization.
+
+#. `Select instructions from DAG`_ --- Finally, the target instruction selector
+ matches the DAG operations to target instructions. This process translates
+ the target-independent input DAG into another DAG of target instructions.
+
+#. `SelectionDAG Scheduling and Formation`_ --- The last phase assigns a linear
+ order to the instructions in the target-instruction DAG and emits them into
+ the MachineFunction being compiled. This step uses traditional prepass
+ scheduling techniques.
+
+After all of these steps are complete, the SelectionDAG is destroyed and the
+rest of the code generation passes are run.
+
+One great way to visualize what is going on here is to take advantage of a few
+LLC command line options. The following options pop up a window displaying the
+SelectionDAG at specific times (if you only get errors printed to the console
+while using this, you probably `need to configure your
+system <ProgrammersManual.html#ViewGraph>`_ to add support for it).
+
+* ``-view-dag-combine1-dags`` displays the DAG after being built, before the
+ first optimization pass.
+
+* ``-view-legalize-dags`` displays the DAG before Legalization.
+
+* ``-view-dag-combine2-dags`` displays the DAG before the second optimization
+ pass.
+
+* ``-view-isel-dags`` displays the DAG before the Select phase.
+
+* ``-view-sched-dags`` displays the DAG before Scheduling.
+
+The ``-view-sunit-dags`` option displays the Scheduler's dependency graph.
+This graph is based on the final SelectionDAG, with nodes that must be
+scheduled together bundled into a single scheduling-unit node, and with
+immediate operands and other nodes that aren't relevant for scheduling omitted.
+
+.. _Build initial DAG:
+
+Initial SelectionDAG Construction
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The initial SelectionDAG is na\ :raw-html:`&iuml;`\ vely peephole expanded from
+the LLVM input by the ``SelectionDAGLowering`` class in the
+``lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp`` file. The intent of this pass
+is to expose as much low-level, target-specific detail to the SelectionDAG as
+possible. This pass is mostly hard-coded (e.g. an LLVM ``add`` turns into an
+``SDNode add`` while a ``getelementptr`` is expanded into the obvious
+arithmetic). This pass requires target-specific hooks to lower calls, returns,
+varargs, etc. For these features, the :raw-html:`<tt>` `TargetLowering`_
+:raw-html:`</tt>` interface is used.
+
+.. _legalize types:
+.. _Legalize SelectionDAG Types:
+
+SelectionDAG LegalizeTypes Phase
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The LegalizeTypes phase is in charge of converting a DAG to only use the types
+that are natively supported by the target.
+
+There are two main ways of converting values of unsupported scalar types to
+values of supported types: converting small types to larger types ("promoting"),
+and breaking up large integer types into smaller ones ("expanding"). For
+example, a target might require that all f32 values are promoted to f64 and that
+all i1/i8/i16 values are promoted to i32. The same target might require that
+all i64 values be expanded into pairs of i32 values. These changes can insert
+sign and zero extensions as needed to make sure that the final code has the same
+behavior as the input.
+
+There are two main ways of converting values of unsupported vector types to
+values of supported types: splitting vector types, multiple times if necessary,
+until a legal type is found, and extending vector types by adding elements to
+the end to round them out to legal types ("widening"). If a vector gets split
+all the way down to single-element parts with no supported vector type being
+found, the elements are converted to scalars ("scalarizing").
+
+A target implementation tells the legalizer which types are supported (and which
+register class to use for them) by calling the ``addRegisterClass`` method in
+its TargetLowering constructor.
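+
+A hedged sketch of what such a constructor might look like for a hypothetical
+target (``MyTargetLowering``, ``MyTarget::GR32RegClass``,
+``MyTarget::FPR64RegClass`` and ``createTLOF()`` are illustrative names, not
+real LLVM entities):
+
+.. code-block:: c++
+
+  MyTargetLowering::MyTargetLowering(TargetMachine &TM)
+      : TargetLowering(TM, createTLOF()) {  // createTLOF(): object-file info
+    addRegisterClass(MVT::i32, &MyTarget::GR32RegClass);   // i32 -> GR32
+    addRegisterClass(MVT::f64, &MyTarget::FPR64RegClass);  // f64 -> FPR64
+    computeRegisterProperties();  // derive the remaining legality tables
+  }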
+
+.. _Legalize SelectionDAG Ops:
+.. _legalize operations:
+.. _Legalizer:
+
+SelectionDAG Legalize Phase
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Legalize phase is in charge of converting a DAG to only use the operations
+that are natively supported by the target.
+
+Targets often have weird constraints, such as not supporting every operation on
+every supported datatype (e.g. X86 does not support byte conditional moves and
+PowerPC does not support sign-extending loads from a 16-bit memory location).
+Legalize takes care of this by open-coding another sequence of operations to
+emulate the operation ("expansion"), by promoting one type to a larger type that
+supports the operation ("promotion"), or by using a target-specific hook to
+implement the legalization ("custom").
+
+A target implementation tells the legalizer which operations are not supported
+(and which of the above three actions to take) by calling the
+``setOperationAction`` method in its ``TargetLowering`` constructor.
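+
+Continuing the hypothetical constructor sketched above, the calls might look
+like this (the opcode/type choices are illustrative):
+
+.. code-block:: c++
+
+  // Declare operations the target cannot do natively, and how Legalize
+  // should cope with each one.
+  setOperationAction(ISD::SREM,   MVT::i32,   Expand);   // open-code it
+  setOperationAction(ISD::SELECT, MVT::i8,    Promote);  // do it on i32
+  setOperationAction(ISD::BR_JT,  MVT::Other, Custom);   // target hook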
+
+Prior to the existence of the Legalize passes, we required that every target
+`selector`_ supported and handled every operator and type even if they were not
+natively supported. The introduction of the Legalize phases allows all of the
+canonicalization patterns to be shared across targets, and makes it very easy to
+optimize the canonicalized code because it is still in the form of a DAG.
+
+.. _optimizations:
+.. _Optimize SelectionDAG:
+.. _selector:
+
+SelectionDAG Optimization Phase: the DAG Combiner
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The SelectionDAG optimization phase is run multiple times for code generation,
+immediately after the DAG is built and once after each legalization. The first
+run of the pass allows the initial code to be cleaned up (e.g. performing
+optimizations that depend on knowing that the operators have restricted type
+inputs). Subsequent runs of the pass clean up the messy code generated by the
+Legalize passes, which allows Legalize to be very simple (it can focus on making
+code legal instead of focusing on generating *good* and legal code).
+
+One important class of optimizations performed is optimizing inserted sign and
+zero extension instructions. We currently use ad-hoc techniques, but could move
+to more rigorous techniques in the future. Here are some good papers on the
+subject:
+
+"`Widening integer arithmetic <http://www.eecs.harvard.edu/~nr/pubs/widen-abstract.html>`_" :raw-html:`<br>`
+Kevin Redwine and Norman Ramsey :raw-html:`<br>`
+International Conference on Compiler Construction (CC) 2004
+
+"`Effective sign extension elimination <http://portal.acm.org/citation.cfm?doid=512529.512552>`_" :raw-html:`<br>`
+Motohiro Kawahito, Hideaki Komatsu, and Toshio Nakatani :raw-html:`<br>`
+Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language Design
+and Implementation.
+
+.. _Select instructions from DAG:
+
+SelectionDAG Select Phase
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Select phase is the bulk of the target-specific code for instruction
+selection. This phase takes a legal SelectionDAG as input, pattern matches the
+instructions supported by the target to this DAG, and produces a new DAG of
+target code. For example, consider the following LLVM fragment:
+
+.. code-block:: llvm
+
+ %t1 = fadd float %W, %X
+ %t2 = fmul float %t1, %Y
+ %t3 = fadd float %t2, %Z
+
+This LLVM code corresponds to a SelectionDAG that looks basically like this:
+
+.. code-block:: llvm
+
+ (fadd:f32 (fmul:f32 (fadd:f32 W, X), Y), Z)
+
+If a target supports floating point multiply-and-add (FMA) operations, one of
+the adds can be merged with the multiply. On the PowerPC, for example, the
+output of the instruction selector might look like this DAG:
+
+::
+
+ (FMADDS (FADDS W, X), Y, Z)
+
+The ``FMADDS`` instruction is a ternary instruction that multiplies its first
+two operands and adds the third (as single-precision floating-point numbers).
+The ``FADDS`` instruction is a simple binary single-precision add instruction.
+To perform this pattern match, the PowerPC backend includes the following
+instruction definitions:
+
+::
+
+  def FMADDS : AForm_1<59, 29,
+                       (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                       "fmadds $FRT, $FRA, $FRC, $FRB",
+                       [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC),
+                                              F4RC:$FRB))]>;
+  def FADDS : AForm_2<59, 21,
+                      (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB),
+                      "fadds $FRT, $FRA, $FRB",
+                      [(set F4RC:$FRT, (fadd F4RC:$FRA, F4RC:$FRB))]>;
+
+The ``[(set ...)]`` list at the end of each instruction definition indicates
+the pattern used to match the instruction. The DAG operators (like
+``fmul``/``fadd``) are defined in the
+``include/llvm/Target/TargetSelectionDAG.td`` file. ``F4RC`` is the register
+class of the input and result values.
+
+The TableGen DAG instruction selector generator reads the instruction patterns
+in the ``.td`` file and automatically builds parts of the pattern matching code
+for your target. It has the following strengths:
+
+* At compiler-compiler time, it analyzes your instruction patterns and tells you
+ if your patterns make sense or not.
+
+* It can handle arbitrary constraints on operands for the pattern match. In
+ particular, it is straight-forward to say things like "match any immediate
+ that is a 13-bit sign-extended value". For examples, see the ``immSExt16``
+ and related ``tblgen`` classes in the PowerPC backend.
+
+* It knows several important identities for the patterns defined. For example,
+ it knows that addition is commutative, so it allows the ``FMADDS`` pattern
+ above to match "``(fadd X, (fmul Y, Z))``" as well as "``(fadd (fmul X, Y),
+ Z)``", without the target author having to specially handle this case.
+
+* It has a full-featured type-inferencing system. In particular, you should
+ rarely have to explicitly tell the system what type parts of your patterns
+ are. In the ``FMADDS`` case above, we didn't have to tell ``tblgen`` that all
+ of the nodes in the pattern are of type 'f32'. It was able to infer and
+ propagate this knowledge from the fact that ``F4RC`` has type 'f32'.
+
+* Targets can define their own (and rely on built-in) "pattern fragments".
+ Pattern fragments are chunks of reusable patterns that get inlined into your
+ patterns during compiler-compiler time. For example, the integer "``(not
+ x)``" operation is actually defined as a pattern fragment that expands as
+ "``(xor x, -1)``", since the SelectionDAG does not have a native '``not``'
+ operation. Targets can define their own short-hand fragments as they see fit.
+ See the definition of '``not``' and '``ineg``' for examples.
+
+* In addition to instructions, targets can specify arbitrary patterns that map
+ to one or more instructions using the 'Pat' class. For example, the PowerPC
+ has no way to load an arbitrary integer immediate into a register in one
+ instruction. To tell tblgen how to do this, it defines:
+
+ ::
+
+    // Arbitrary immediate support.  Implement in terms of LIS/ORI.
+    def : Pat<(i32 imm:$imm),
+              (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+ If none of the single-instruction patterns for loading an immediate into a
+ register match, this will be used. This rule says "match an arbitrary i32
+ immediate, turning it into an ``ORI`` ('or a 16-bit immediate') and an ``LIS``
+ ('load 16-bit immediate, where the immediate is shifted to the left 16 bits')
+ instruction". To make this work, the ``LO16``/``HI16`` node transformations
+ are used to manipulate the input immediate (in this case, take the high or low
+ 16-bits of the immediate).
+
+* While the system does automate a lot, it still allows you to write custom C++
+ code to match special cases if there is something that is hard to
+ express.
+
+While it has many strengths, the system currently has some limitations,
+primarily because it is a work in progress and is not yet finished:
+
+* Overall, there is no way to define or match SelectionDAG nodes that define
+ multiple values (e.g. ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc). This is the
+ biggest reason that you currently still *have to* write custom C++ code
+ for your instruction selector.
+
+* There is no great way to support matching complex addressing modes yet. In
+ the future, we will extend pattern fragments to allow them to define multiple
+ values (e.g. the four operands of the `X86 addressing mode`_, which are
+ currently matched with custom C++ code). In addition, we'll extend fragments
+ so that a fragment can match multiple different patterns.
+
+* We don't automatically infer flags like ``isStore``/``isLoad`` yet.
+
+* We don't automatically generate the set of supported registers and operations
+ for the `Legalizer`_ yet.
+
+* We don't have a way of tying in custom legalized nodes yet.
+
+Despite these limitations, the instruction selector generator is still quite
+useful for most of the binary and logical operations in typical instruction
+sets. If you run into any problems or can't figure out how to do something,
+please let Chris know!
+
+.. _Scheduling and Formation:
+.. _SelectionDAG Scheduling and Formation:
+
+SelectionDAG Scheduling and Formation Phase
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The scheduling phase takes the DAG of target instructions from the selection
+phase and assigns an order. The scheduler can pick an order depending on
+various constraints of the machine (e.g. ordering for minimal register
+pressure, or covering instruction latencies). Once an order is established, the DAG is
+converted to a list of :raw-html:`<tt>` `MachineInstr`_\s :raw-html:`</tt>` and
+the SelectionDAG is destroyed.
+
+Note that this phase is logically separate from the instruction selection phase,
+but is tied to it closely in the code because it operates on SelectionDAGs.
+
+Future directions for the SelectionDAG
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+#. Optional function-at-a-time selection.
+
+#. Auto-generate entire selector from ``.td`` file.
+
+.. _SSA-based Machine Code Optimizations:
+
+SSA-based Machine Code Optimizations
+------------------------------------
+
+.. note::
+
+  To Be Written
+
+Live Intervals
+--------------
+
+Live Intervals are the ranges (intervals) where a variable is *live*. They are
+used by some `register allocator`_ passes to determine if two or more virtual
+registers which require the same physical register are live at the same point in
+the program (i.e., they conflict). When this situation occurs, one virtual
+register must be *spilled*.
+
+Live Variable Analysis
+^^^^^^^^^^^^^^^^^^^^^^
+
+The first step in determining the live intervals of variables is to calculate
+the set of registers that are immediately dead after the instruction (i.e., the
+instruction calculates the value, but it is never used) and the set of registers
+that are used by the instruction, but are never used after the instruction
+(i.e., they are killed). Live variable information is computed for
+each *virtual* register and *register allocatable* physical register
+in the function. This is done in a very efficient manner because it uses SSA to
+sparsely compute lifetime information for virtual registers (which are in SSA
+form) and only has to track physical registers within a block. Before register
+allocation, LLVM can assume that physical registers are only live within a
+single basic block. This allows it to do a single, local analysis to resolve
+physical register lifetimes within each basic block. If a physical register is
+not register allocatable (e.g., a stack pointer or condition codes), it is not
+tracked.
+
+Physical registers may be live in to or out of a function. Live in values are
+typically arguments in registers. Live out values are typically return values in
+registers. Live in values are marked as such, and are given a dummy "defining"
+instruction during live intervals analysis. If the last basic block of a
+function is a ``return``, then it's marked as using all live out values in the
+function.
+
+``PHI`` nodes need to be handled specially, because the calculation of the live
+variable information from a depth first traversal of the CFG of the function
+won't guarantee that a virtual register used by the ``PHI`` node is defined
+before it's used. When a ``PHI`` node is encountered, only the definition is
+handled, because the uses will be handled in other basic blocks.
+
+For each ``PHI`` node of the current basic block, we simulate an assignment at
+the end of the current basic block and traverse the successor basic blocks. If a
+successor basic block has a ``PHI`` node and one of the ``PHI`` node's operands
+is coming from the current basic block, then the variable is marked as *alive*
+within the current basic block and all of its predecessor basic blocks, until
+the basic block with the defining instruction is encountered.
+
+Live Intervals Analysis
+^^^^^^^^^^^^^^^^^^^^^^^
+
+We now have the information available to perform the live intervals analysis and
+build the live intervals themselves. We start off by numbering the basic blocks
+and machine instructions. We then handle the "live-in" values. These are in
+physical registers, so the physical register is assumed to be killed by the end
+of the basic block. Live intervals for virtual registers are computed for some
+ordering of the machine instructions ``[1, N]``. A live interval is an interval
+``[i, j)``, where ``1 <= i <= j <= N``, for which a variable is live.
+
+.. note::
+ More to come...
+
+.. _Register Allocation:
+.. _register allocator:
+
+Register Allocation
+-------------------
+
+The *Register Allocation problem* consists of mapping a program
+:raw-html:`<b><tt>` P\ :sub:`v`\ :raw-html:`</tt></b>`, that can use an unbounded
+number of virtual registers, to a program :raw-html:`<b><tt>` P\ :sub:`p`\
+:raw-html:`</tt></b>` that contains a finite (possibly small) number of physical
+registers. Each target architecture has a different number of physical
+registers. If the number of physical registers is not enough to accommodate all
+the virtual registers, some of them will have to be mapped into memory. These
+virtuals are called *spilled virtuals*.
+
+How registers are represented in LLVM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In LLVM, physical registers are denoted by integer numbers that normally range
+from 1 to 1023. To see how this numbering is defined for a particular
+architecture, you can read the ``*GenRegisterInfo.inc`` file for that
+architecture. For instance, by inspecting
+``lib/Target/X86/X86GenRegisterInfo.inc`` we see that the 32-bit register
+``EAX`` is denoted by 43, and the MMX register ``MM0`` is mapped to 65.
+
+Some architectures contain registers that share the same physical location. A
+notable example is the X86 platform. For instance, in the X86 architecture, the
+registers ``EAX``, ``AX`` and ``AL`` share the first eight bits. These physical
+registers are marked as *aliased* in LLVM. Given a particular architecture, you
+can check which registers are aliased by inspecting its ``RegisterInfo.td``
+file. Moreover, the class ``MCRegAliasIterator`` enumerates all the physical
+registers aliased to a register.
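+
+For example, a hedged sketch (the function name ``printAliases`` is
+hypothetical) that prints every register overlapping a given one:
+
+.. code-block:: c++
+
+  #include "llvm/MC/MCRegisterInfo.h"
+  #include "llvm/Support/raw_ostream.h"
+  #include "llvm/Target/TargetRegisterInfo.h"
+
+  using namespace llvm;
+
+  // Walk every physical register that aliases Reg, including Reg itself.
+  void printAliases(unsigned Reg, const TargetRegisterInfo *TRI) {
+    for (MCRegAliasIterator AI(Reg, TRI, /*IncludeSelf=*/true); AI.isValid();
+         ++AI)
+      errs() << TRI->getName(*AI) << "\n";
+  }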
+
+Physical registers, in LLVM, are grouped in *Register Classes*. Elements in the
+same register class are functionally equivalent, and can be interchangeably
+used. Each virtual register can only be mapped to physical registers of a
+particular class. For instance, in the X86 architecture, some virtuals can only
+be allocated to 8 bit registers. A register class is described by
+``TargetRegisterClass`` objects. To discover if a virtual register is
+compatible with a given physical register, this code can be used:
+
+.. code-block:: c++
+
+  bool RegMapping_Fer::compatible_class(MachineFunction &mf,
+                                        unsigned v_reg,
+                                        unsigned p_reg) {
+    assert(TargetRegisterInfo::isPhysicalRegister(p_reg) &&
+           "Target register must be physical");
+    const TargetRegisterClass *trc = mf.getRegInfo().getRegClass(v_reg);
+    return trc->contains(p_reg);
+  }
+
+Sometimes, mostly for debugging purposes, it is useful to change the number of
+physical registers available in the target architecture. This must be done
+statically, inside the ``TargetRegisterInfo.td`` file. Just ``grep`` for
+``RegisterClass``, the last parameter of which is a list of registers. Simply
+commenting some out is one way to avoid them being used. A more polite
+way is to explicitly exclude some registers from the *allocation order*. See the
+definition of the ``GR8`` register class in
+``lib/Target/X86/X86RegisterInfo.td`` for an example of this.
+
+Virtual registers are also denoted by integer numbers. Unlike physical
+registers, different virtual registers never share the same number. Whereas
+physical registers are statically defined in a ``TargetRegisterInfo.td`` file
+and cannot be created by the application developer, that is not the case with
+virtual registers. In order to create new virtual registers, use the method
+``MachineRegisterInfo::createVirtualRegister()``. This method will return a new
+virtual register. Use an ``IndexedMap<Foo, VirtReg2IndexFunctor>`` to hold
+information per virtual register. If you need to enumerate all virtual
+registers, use the function ``TargetRegisterInfo::index2VirtReg()`` to find the
+virtual register numbers:
+
+.. code-block:: c++
+
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned VirtReg = TargetRegisterInfo::index2VirtReg(i);
+    stuff(VirtReg);
+  }
+
+Before register allocation, the operands of an instruction are mostly virtual
+registers, although physical registers may also be used. In order to check if a
+given machine operand is a register, use the boolean function
+``MachineOperand::isReg()``. To obtain the integer code of a register, use
+``MachineOperand::getReg()``. An instruction may define or use a register. For
+instance, ``ADD reg:1026 := reg:1025 reg:1024`` defines register 1026, and
+uses registers 1024 and 1025. Given a register operand, the method
+``MachineOperand::isUse()`` informs if that register is being used by the
+instruction. The method ``MachineOperand::isDef()`` informs if that register is
+being defined.
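+
+Putting those queries together, here is a hedged sketch (the function name
+``dumpRegRoles`` is hypothetical) that reports the role of each register
+operand:
+
+.. code-block:: c++
+
+  #include "llvm/CodeGen/MachineInstr.h"
+  #include "llvm/Support/raw_ostream.h"
+
+  using namespace llvm;
+
+  void dumpRegRoles(const MachineInstr &MI) {
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI.getOperand(i);
+      if (!MO.isReg())
+        continue;  // skip immediates, frame indices, etc.
+      errs() << "reg " << MO.getReg()
+             << (MO.isDef() ? " defined" : " used") << "\n";
+    }
+  }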
+
+We will call physical registers present in the LLVM bitcode before register
+allocation *pre-colored registers*. Pre-colored registers are used in many
+different situations, for instance, to pass parameters of function calls, and
+to store results of particular instructions. There are two types of pre-colored
+registers: the ones *implicitly* defined, and those *explicitly*
+defined. Explicitly defined registers are normal operands, and can be accessed
+with ``MachineInstr::getOperand(int)::getReg()``. In order to check which
+registers are implicitly defined by an instruction, use the
+``TargetInstrInfo::get(opcode)::ImplicitDefs``, where ``opcode`` is the opcode
+of the target instruction. One important difference between explicit and
+implicit physical registers is that the latter are defined statically for each
+instruction, whereas the former may vary depending on the program being
+compiled. For example, an instruction that represents a function call will
+always implicitly define or use the same set of physical registers. To read the
+registers implicitly used by an instruction, use
+``TargetInstrInfo::get(opcode)::ImplicitUses``. Pre-colored registers impose
+constraints on any register allocation algorithm. The register allocator must
+make sure that none of them are overwritten by the values of virtual registers
+while still alive.
+
+Mapping virtual registers to physical registers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are two ways to map virtual registers to physical registers (or to memory
+slots). The first way, that we will call *direct mapping*, is based on the use
+of methods of the classes ``TargetRegisterInfo``, and ``MachineOperand``. The
+second way, that we will call *indirect mapping*, relies on the ``VirtRegMap``
+class in order to insert loads and stores that move values to and from
+memory.
+
+The direct mapping provides more flexibility to the developer of the register
+allocator; however, it is more error prone, and demands more implementation
+work. Basically, the programmer will have to specify where load and store
+instructions should be inserted in the target function being compiled in order
+to get and store values in memory. To assign a physical register to a virtual
+register present in a given operand, use ``MachineOperand::setReg(p_reg)``. To
+insert a store instruction, use ``TargetInstrInfo::storeRegToStackSlot(...)``,
+and to insert a load instruction, use ``TargetInstrInfo::loadRegFromStackSlot``.
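+
+A hedged sketch of these primitives in use (the function name ``spillAround``
+is hypothetical, and the insertion points and frame index are assumed to have
+been chosen by the allocator beforehand):
+
+.. code-block:: c++
+
+  #include "llvm/CodeGen/MachineBasicBlock.h"
+  #include "llvm/Target/TargetInstrInfo.h"
+
+  using namespace llvm;
+
+  void spillAround(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator InsertPt, MachineOperand &MO,
+                   unsigned PReg, int FrameIndex, const TargetInstrInfo *TII,
+                   const TargetRegisterClass *RC,
+                   const TargetRegisterInfo *TRI) {
+    MO.setReg(PReg);  // rewrite the virtual register operand to PReg
+    // Spill PReg's value to the stack slot, then reload it before a use.
+    TII->storeRegToStackSlot(MBB, InsertPt, PReg, /*isKill=*/true,
+                             FrameIndex, RC, TRI);
+    TII->loadRegFromStackSlot(MBB, InsertPt, PReg, FrameIndex, RC, TRI);
+  }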
+
+The indirect mapping shields the application developer from the complexities of
+inserting load and store instructions. In order to map a virtual register to a
+physical one, use ``VirtRegMap::assignVirt2Phys(vreg, preg)``. In order to map
+a certain virtual register to memory, use
+``VirtRegMap::assignVirt2StackSlot(vreg)``. This method will return the stack
+slot where ``vreg``'s value will be located. If it is necessary to map another
+virtual register to the same stack slot, use
+``VirtRegMap::assignVirt2StackSlot(vreg, stack_location)``. One important point
+to consider when using the indirect mapping is that even if a virtual register
+is mapped to memory, it still needs to be mapped to a physical register. This
+physical register is the location where the virtual register is supposed to be
+found before being stored or after being reloaded.
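+
+A hedged sketch of the indirect mapping calls (the function name ``mapRegs``
+is hypothetical):
+
+.. code-block:: c++
+
+  #include "VirtRegMap.h"  // lives in lib/CodeGen
+
+  using namespace llvm;
+
+  void mapRegs(VirtRegMap &VRM, unsigned V1, unsigned V2, unsigned V3,
+               unsigned PReg) {
+    VRM.assignVirt2Phys(V1, PReg);            // V1 lives in PReg
+    int Slot = VRM.assignVirt2StackSlot(V2);  // V2 spills to a fresh slot
+    VRM.assignVirt2StackSlot(V3, Slot);       // V3 shares the same slot
+  }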
+
+If the indirect strategy is used, after all the virtual registers have been
+mapped to physical registers or stack slots, it is necessary to use a spiller
+object to place load and store instructions in the code. Every virtual that has
+been mapped to a stack slot will be stored to memory after being defined and will
+be loaded before being used. The implementation of the spiller tries to recycle
+load/store instructions, avoiding unnecessary instructions. For an example of
+how to invoke the spiller, see ``RegAllocLinearScan::runOnMachineFunction`` in
+``lib/CodeGen/RegAllocLinearScan.cpp``.
+
+Handling two address instructions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+With very rare exceptions (e.g., function calls), the LLVM machine code
+instructions are three address instructions. That is, each instruction is
+expected to define at most one register, and to use at most two registers.
+However, some architectures use two address instructions. In this case, the
+defined register is also one of the used registers. For instance, in X86 an
+instruction such as ``ADD %EAX, %EBX`` is actually equivalent to ``%EAX = %EAX +
+%EBX``.
+
+In order to produce correct code, LLVM must convert three address instructions
+that represent two address instructions into true two address instructions. LLVM
+provides the pass ``TwoAddressInstructionPass`` for this specific purpose. It
+must be run before register allocation takes place. After its execution, the
+resulting code may no longer be in SSA form. This happens, for instance, in
+situations where an instruction such as ``%a = ADD %b %c`` is converted to two
+instructions such as:
+
+::
+
+ %a = MOVE %b
+ %a = ADD %a %c
+
+Notice that, internally, the second instruction is represented as ``ADD
+%a[def/use] %c``. I.e., the register operand ``%a`` is both used and defined by
+the instruction.
+
+The SSA deconstruction phase
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An important transformation that happens during register allocation is called
+the *SSA Deconstruction Phase*. The SSA form simplifies many analyses that are
+performed on the control flow graph of programs. However, traditional
+instruction sets do not implement PHI instructions. Thus, in order to generate
+executable code, compilers must replace PHI instructions with other instructions
+that preserve their semantics.
+
+There are many ways in which PHI instructions can safely be removed from the
+target code. The most traditional PHI deconstruction algorithm replaces PHI
+instructions with copy instructions. That is the strategy adopted by LLVM. The
+SSA deconstruction algorithm is implemented in
+``lib/CodeGen/PHIElimination.cpp``. In order to invoke this pass, the identifier
+``PHIEliminationID`` must be marked as required in the code of the register
+allocator.
+
+Instruction folding
+^^^^^^^^^^^^^^^^^^^
+
+*Instruction folding* is an optimization performed during register allocation
+that removes unnecessary copy instructions. For instance, a sequence of
+instructions such as:
+
+::
+
+ %EBX = LOAD %mem_address
+ %EAX = COPY %EBX
+
+can be safely substituted by the single instruction:
+
+::
+
+ %EAX = LOAD %mem_address
+
+Instructions can be folded with the
+``TargetRegisterInfo::foldMemoryOperand(...)`` method. Care must be taken when
+folding instructions; a folded instruction can be quite different from the
+original instruction. See ``LiveIntervals::addIntervalsForSpills`` in
+``lib/CodeGen/LiveIntervalAnalysis.cpp`` for an example of its use.
+
+Built in register allocators
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The LLVM infrastructure provides the application developer with four different
+register allocators:
+
+* *Fast* --- This register allocator is the default for debug builds. It
+ allocates registers on a basic block level, attempting to keep values in
+ registers and reusing registers as appropriate.
+
+* *Basic* --- This is an incremental approach to register allocation. Live
+ ranges are assigned to registers one at a time in an order that is driven by
+ heuristics. Since code can be rewritten on-the-fly during allocation, this
+ framework allows interesting allocators to be developed as extensions. It is
+ not itself a production register allocator but is a potentially useful
+ stand-alone mode for triaging bugs and as a performance baseline.
+
+* *Greedy* --- *The default allocator*. This is a highly tuned implementation of
+ the *Basic* allocator that incorporates global live range splitting. This
+ allocator works hard to minimize the cost of spill code.
+
+* *PBQP* --- A Partitioned Boolean Quadratic Programming (PBQP) based register
+ allocator. This allocator works by constructing a PBQP problem representing
+ the register allocation problem under consideration, solving this using a PBQP
+ solver, and mapping the solution back to a register assignment.
+
+The type of register allocator used in ``llc`` can be chosen with the command
+line option ``-regalloc=...``:
+
+.. code-block:: bash
+
+ $ llc -regalloc=fast file.bc -o fa.s
+ $ llc -regalloc=greedy file.bc -o gr.s
+ $ llc -regalloc=pbqp file.bc -o pbqp.s
+
+.. _Prolog/Epilog Code Insertion:
+
+Prolog/Epilog Code Insertion
+----------------------------
+
+Compact Unwind
+^^^^^^^^^^^^^^
+
+Throwing an exception requires *unwinding* out of a function. The information on
+how to unwind a given function is traditionally expressed in DWARF unwind
+(a.k.a. frame) info. But that format was originally developed for debuggers to
+backtrace, and each Frame Description Entry (FDE) requires ~20-30 bytes per
+function. There is also the cost of mapping from an address in a function to the
+corresponding FDE at runtime. An alternative unwind encoding is called *compact
+unwind* and requires just 4 bytes per function.
+
+The compact unwind encoding is a 32-bit value, which is encoded in an
+architecture-specific way. It specifies which registers to restore and from
+where, and how to unwind out of the function. When the linker creates a final
+linked image, it will create a ``__TEXT,__unwind_info`` section. This section is
+a small and fast way for the runtime to access unwind info for any given
+function. If we emit compact unwind info for the function, that compact unwind
+info will be encoded in the ``__TEXT,__unwind_info`` section. If we emit DWARF
+unwind info, the ``__TEXT,__unwind_info`` section will contain the offset of the
+FDE in the ``__TEXT,__eh_frame`` section in the final linked image.
+
+For X86, there are three modes for the compact unwind encoding:
+
+*Function with a Frame Pointer (``EBP`` or ``RBP``)*
+ ``EBP/RBP``-based frame, where ``EBP/RBP`` is pushed onto the stack
+ immediately after the return address, then ``ESP/RSP`` is moved to
+ ``EBP/RBP``. Thus to unwind, ``ESP/RSP`` is restored with the current
+ ``EBP/RBP`` value, then ``EBP/RBP`` is restored by popping the stack, and the
+ return is done by popping the stack once more into the PC. All non-volatile
+ registers that need to be restored must have been saved in a small range on
+ the stack that starts at ``EBP-4`` and extends down to ``EBP-1020`` (``RBP-8``
+ to ``RBP-1020``). The offset (divided by 4 in 32-bit mode and 8 in 64-bit mode)
+ is encoded in bits 16-23 (mask: ``0x00FF0000``). The registers saved are
+ encoded in bits 0-14 (mask: ``0x00007FFF``) as five 3-bit entries from the
+ following table:
+
+ ==============  =============  ===============
+ Compact Number  i386 Register  x86-64 Register
+ ==============  =============  ===============
+ 1               ``EBX``        ``RBX``
+ 2               ``ECX``        ``R12``
+ 3               ``EDX``        ``R13``
+ 4               ``EDI``        ``R14``
+ 5               ``ESI``        ``R15``
+ 6               ``EBP``        ``RBP``
+ ==============  =============  ===============
+
+*Frameless with a Small Constant Stack Size (``EBP`` or ``RBP`` is not used as a frame pointer)*
+ To return, a constant (encoded in the compact unwind encoding) is added to the
+ ``ESP/RSP``. Then the return is done by popping the stack into the PC. All
+ non-volatile registers that need to be restored must have been saved on the
+ stack immediately after the return address. The stack size (divided by 4 in
+ 32-bit mode and 8 in 64-bit mode) is encoded in bits 16-23 (mask:
+ ``0x00FF0000``). There is a maximum stack size of 1024 bytes in 32-bit mode
+ and 2048 in 64-bit mode. The number of registers saved is encoded in bits
+ 10-12 (mask: ``0x00001C00``). Bits 0-9 (mask: ``0x000003FF``) contain which
+ registers were saved and their order. (See the
+ ``encodeCompactUnwindRegistersWithoutFrame()`` function in
+ ``lib/Target/X86/X86FrameLowering.cpp`` for the encoding algorithm.)
+
+*Frameless with a Large Constant Stack Size (``EBP`` or ``RBP`` is not used as a frame pointer)*
+ This case is like the "Frameless with a Small Constant Stack Size" case, but
+ the stack size is too large to encode in the compact unwind encoding. Instead
+ it requires that the function contains "``subl $nnnnnn, %esp``" in its
+ prolog. The compact encoding contains the offset to the ``$nnnnnn`` value in
+ the function in bits 10-12 (mask: ``0x00001C00``).
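+
+To make the bit layout concrete, here is a hedged sketch of extracting the
+frameless-mode fields using the masks quoted above (the function names are
+illustrative, not LLVM API):
+
+.. code-block:: c++
+
+  #include <stdint.h>
+
+  // Bits 16-23: stack size, in 4-byte (32-bit) or 8-byte (64-bit) units.
+  uint32_t stackSizeUnits(uint32_t Encoding) {
+    return (Encoding & 0x00FF0000) >> 16;
+  }
+
+  // Bits 10-12: number of saved registers.
+  uint32_t savedRegCount(uint32_t Encoding) {
+    return (Encoding & 0x00001C00) >> 10;
+  }
+
+  // Bits 0-9: which registers were saved, and in what order.
+  uint32_t savedRegPermutation(uint32_t Encoding) {
+    return Encoding & 0x000003FF;
+  }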
+
+.. _Late Machine Code Optimizations:
+
+Late Machine Code Optimizations
+-------------------------------
+
+.. note::
+
+ To Be Written
+
+.. _Code Emission:
+
+Code Emission
+-------------
+
+The code emission step of code generation is responsible for lowering from the
+code generator abstractions (like `MachineFunction`_, `MachineInstr`_, etc) down
+to the abstractions used by the MC layer (`MCInst`_, `MCStreamer`_, etc). This
+is done with a combination of several different classes: the (misnamed)
+target-independent AsmPrinter class, target-specific subclasses of AsmPrinter
+(such as SparcAsmPrinter), and the TargetLoweringObjectFile class.
+
+Since the MC layer works at the level of abstraction of object files, it doesn't
+have a notion of functions, global variables etc. Instead, it thinks about
+labels, directives, and instructions. A key class used at this time is the
+MCStreamer class. This is an abstract API that is implemented in different ways
+(e.g. to output a .s file, output an ELF .o file, etc) that is effectively an
+"assembler API". MCStreamer has one method per directive, such as EmitLabel,
+EmitSymbolAttribute, SwitchSection, etc, which directly correspond to assembly
+level directives.
+
+If you are interested in implementing a code generator for a target, there are
+three important things that you have to implement for your target:
+
+#. First, you need a subclass of AsmPrinter for your target. This class
+ implements the general lowering process converting MachineFunction's into MC
+ label constructs. The AsmPrinter base class provides a number of useful
+ methods and routines, and also allows you to override the lowering process in
+ some important ways. You should get much of the lowering for free if you are
+ implementing an ELF, COFF, or MachO target, because the
+ TargetLoweringObjectFile class implements much of the common logic.
+
+#. Second, you need to implement an instruction printer for your target. The
+ instruction printer takes an `MCInst`_ and renders it to a raw_ostream as
+ text. Most of this is automatically generated from the .td file (when you
+ specify something like "``add $dst, $src1, $src2``" in the instructions), but
+ you need to implement routines to print operands.
+
+#. Third, you need to implement code that lowers a `MachineInstr`_ to an MCInst,
+ usually implemented in "<target>MCInstLower.cpp". This lowering process is
+ often target specific, and is responsible for turning jump table entries,
+ constant pool indices, global variable addresses, etc into MCLabels as
+ appropriate. This translation layer is also responsible for expanding pseudo
+ ops used by the code generator into the actual machine instructions they
+ correspond to. The MCInsts that are generated by this are fed into the
+ instruction printer or the encoder.
+
+Finally, if you choose, you can also implement a subclass of MCCodeEmitter
+which lowers MCInst's into machine code bytes and relocations. This is
+important if you want to support direct .o file emission, or would like to
+implement an assembler for your target.
+
+VLIW Packetizer
+---------------
+
+In a Very Long Instruction Word (VLIW) architecture, the compiler is responsible
+for mapping instructions to the functional units available on the architecture. To
+that end, the compiler creates groups of instructions called *packets* or
+*bundles*. The VLIW packetizer in LLVM is a target-independent mechanism to
+enable the packetization of machine instructions.
+
+Mapping from instructions to functional units
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Instructions in a VLIW target can typically be mapped to multiple functional
+units. During the process of packetizing, the compiler must be able to reason
+about whether an instruction can be added to a packet. This decision can be
+complex since the compiler has to examine all possible mappings of instructions
+to functional units. Therefore to alleviate compilation-time complexity, the
+VLIW packetizer parses the instruction classes of a target and generates tables
+at compiler build time. These tables can then be queried by the provided
+machine-independent API to determine if an instruction can be accommodated in a
+packet.
+
+How the packetization tables are generated and used
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The packetizer reads instruction classes from a target's itineraries and creates
+a deterministic finite automaton (DFA) to represent the state of a packet. A DFA
+consists of three major elements: inputs, states, and transitions. The set of
+inputs for the generated DFA represents the instruction being added to a
+packet. The states represent the possible consumption of functional units by
+instructions in a packet. In the DFA, transitions from one state to another
+occur on the addition of an instruction to an existing packet. If there is a
+legal mapping of functional units to instructions, then the DFA contains a
+corresponding transition. The absence of a transition indicates that a legal
+mapping does not exist and that the instruction cannot be added to the packet.
+
+To generate tables for a VLIW target, add *Target*\ GenDFAPacketizer.inc as a
+target to the Makefile in the target directory. The exported API provides three
+functions: ``DFAPacketizer::clearResources()``,
+``DFAPacketizer::reserveResources(MachineInstr *MI)``, and
+``DFAPacketizer::canReserveResources(MachineInstr *MI)``. These functions allow
+a target packetizer to add an instruction to an existing packet and to check
+whether an instruction can be added to a packet. See
+``llvm/CodeGen/DFAPacketizer.h`` for more information.
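+
+A hedged sketch of the query loop a target packetizer might run (the function
+name ``packetize`` is hypothetical):
+
+.. code-block:: c++
+
+  #include "llvm/CodeGen/DFAPacketizer.h"
+  #include <vector>
+
+  using namespace llvm;
+
+  // Greedily add instructions to the current packet while the DFA still has
+  // a legal functional-unit assignment; otherwise close the packet.
+  void packetize(DFAPacketizer &DFA, std::vector<MachineInstr*> &MIs) {
+    DFA.clearResources();  // start with an empty packet
+    for (unsigned i = 0, e = MIs.size(); i != e; ++i) {
+      if (!DFA.canReserveResources(MIs[i]))
+        DFA.clearResources();        // no legal mapping: new packet
+      DFA.reserveResources(MIs[i]);  // commit MI to the current packet
+    }
+  }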
+
+Implementing a Native Assembler
+===============================
+
+Though you're probably reading this because you want to write or maintain a
+compiler backend, LLVM also fully supports building native assemblers.
+We've tried hard to automate the generation of the assembler from the .td files
+(in particular the instruction syntax and encodings), which means that a large
+part of the manual and repetitive data entry can be factored and shared with the
+compiler.
+
+Instruction Parsing
+-------------------
+
+.. note::
+
+ To Be Written
+
+
+Instruction Alias Processing
+----------------------------
+
+Once the instruction is parsed, it enters the MatchInstructionImpl function,
+which performs alias processing and then does the actual matching.
+
+Alias processing is the phase that canonicalizes different lexical forms of the
+same instructions down to one representation. There are several different kinds
+of alias that are possible to implement and they are listed below in the order
+that they are processed (which is in order from simplest/weakest to most
+complex/powerful). Generally you want to use the first alias mechanism that
+meets the needs of your instruction, because it will allow a more concise
+description.
+
+Mnemonic Aliases
+^^^^^^^^^^^^^^^^
+
+The first phase of alias processing is simple instruction mnemonic remapping for
+classes of instructions which are allowed with two different mnemonics. This
+phase is a simple and unconditional remapping from one input mnemonic to one
+output mnemonic. It isn't possible for this form of alias to look at the
+operands at all, so the remapping must apply for all forms of a given mnemonic.
+Mnemonic aliases are defined simply, for example X86 has:
+
+::
+
+ def : MnemonicAlias<"cbw", "cbtw">;
+ def : MnemonicAlias<"smovq", "movsq">;
+ def : MnemonicAlias<"fldcww", "fldcw">;
+ def : MnemonicAlias<"fucompi", "fucomip">;
+ def : MnemonicAlias<"ud2a", "ud2">;
+
+... and many others. With a MnemonicAlias definition, the mnemonic is remapped
+simply and directly. Though MnemonicAlias's can't look at any aspect of the
+instruction (such as the operands), they can depend on global modes (the same
+ones supported by the matcher), through a Requires clause:
+
+::
+
+ def : MnemonicAlias<"pushf", "pushfq">, Requires<[In64BitMode]>;
+ def : MnemonicAlias<"pushf", "pushfl">, Requires<[In32BitMode]>;
+
+In this example, the mnemonic gets mapped to a different one depending on
+the current instruction set.
+
+Instruction Aliases
+^^^^^^^^^^^^^^^^^^^
+
+The most general phase of alias processing occurs while matching is happening:
+it provides new forms for the matcher to match along with a specific instruction
+to generate. An instruction alias has two parts: the string to match and the
+instruction to generate. For example:
+
+::
+
+ def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8 :$src)>;
+ def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src)>;
+ def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8 :$src)>;
+ def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16 :$src)>;
+ def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8 :$src)>;
+ def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16 :$src)>;
+ def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32 :$src)>;
+
+This shows a powerful example of instruction aliases, matching the same
+mnemonic in multiple different ways depending on what operands are present in
+the assembly. The result of instruction aliases can include operands in a
+different order than the destination instruction, and can use an input multiple
+times, for example:
+
+::
+
+ def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>;
+ def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>;
+ def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>;
+ def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>;
+
+This example also shows that tied operands are only listed once. In the X86
+backend, XOR8rr has two input GR8's and one output GR8 (where an input is tied
+to the output). InstAliases take a flattened operand list without duplicates
+for tied operands. The result of an instruction alias can also use immediates
+and fixed physical registers which are added as simple immediate operands in the
+result, for example:
+
+::
+
+ // Fixed Immediate operand.
+ def : InstAlias<"aad", (AAD8i8 10)>;
+
+ // Fixed register operand.
+ def : InstAlias<"fcomi", (COM_FIr ST1)>;
+
+ // Simple alias.
+ def : InstAlias<"fcomi $reg", (COM_FIr RST:$reg)>;
+
+Instruction aliases can also have a Requires clause to make them
+subtarget-specific.
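+
+For instance, a subtarget-specific alias could be written as follows. This is
+a hypothetical sketch: the alias string, instruction, and operand types are
+illustrative rather than copied from the X86 backend.
+
+::
+
+ // Only match this alias when assembling for 64-bit mode (sketch).
+ def : InstAlias<"movq $imm, $dst", (MOV64ri GR64:$dst, i64imm:$imm)>,
+       Requires<[In64BitMode]>;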
+
+If the back-end supports it, the instruction printer can automatically emit the
+alias rather than what's being aliased. This typically leads to better, more
+readable code. If it's better to print out what's being aliased, pass a '0' as
+the third parameter to the InstAlias definition.
+
+Instruction Matching
+--------------------
+
+.. note::
+
+ To Be Written
+
+.. _Implementations of the abstract target description interfaces:
+.. _implement the target description:
+
+Target-specific Implementation Notes
+====================================
+
+This section of the document explains features or design decisions that are
+specific to the code generator for a particular target. First we start with a
+table that summarizes what features are supported by each target.
+
+Target Feature Matrix
+---------------------
+
+Note that this table does not include the C backend or the Cpp backend, since
+they do not use the target-independent code generator infrastructure. It also
+doesn't list features that are not supported fully by any target yet. A
+feature is considered supported if at least one subtarget supports it;
+"supported" means that the feature is useful and works for most cases, not
+that the implementation has zero known bugs. Here is the key:
+
+:raw-html:`<table border="1" cellspacing="0">`
+:raw-html:`<tr>`
+:raw-html:`<th>Unknown</th>`
+:raw-html:`<th>No support</th>`
+:raw-html:`<th>Partial Support</th>`
+:raw-html:`<th>Complete Support</th>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td class="unknown"></td>`
+:raw-html:`<td class="no"></td>`
+:raw-html:`<td class="partial"></td>`
+:raw-html:`<td class="yes"></td>`
+:raw-html:`</tr>`
+:raw-html:`</table>`
+
+Here is the table:
+
+:raw-html:`<table width="689" border="1" cellspacing="0">`
+:raw-html:`<tr><td></td>`
:raw-html:`<td colspan="11" align="center" style="background-color:#ffc">Target</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<th>Feature</th>`
+:raw-html:`<th>ARM</th>`
+:raw-html:`<th>CellSPU</th>`
+:raw-html:`<th>Hexagon</th>`
+:raw-html:`<th>MBlaze</th>`
+:raw-html:`<th>MSP430</th>`
+:raw-html:`<th>Mips</th>`
+:raw-html:`<th>PTX</th>`
+:raw-html:`<th>PowerPC</th>`
+:raw-html:`<th>Sparc</th>`
+:raw-html:`<th>X86</th>`
+:raw-html:`<th>XCore</th>`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
+:raw-html:`<td><a href="#feat_reliable">is generally reliable</a></td>`
+:raw-html:`<td class="yes"></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="yes"></td> <!-- Hexagon -->`
+:raw-html:`<td class="no"></td> <!-- MBlaze -->`
+:raw-html:`<td class="unknown"></td> <!-- MSP430 -->`
+:raw-html:`<td class="yes"></td> <!-- Mips -->`
+:raw-html:`<td class="no"></td> <!-- PTX -->`
+:raw-html:`<td class="yes"></td> <!-- PowerPC -->`
+:raw-html:`<td class="yes"></td> <!-- Sparc -->`
+:raw-html:`<td class="yes"></td> <!-- X86 -->`
+:raw-html:`<td class="unknown"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
+:raw-html:`<td><a href="#feat_asmparser">assembly parser</a></td>`
+:raw-html:`<td class="no"></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="no"></td> <!-- Hexagon -->`
+:raw-html:`<td class="yes"></td> <!-- MBlaze -->`
+:raw-html:`<td class="no"></td> <!-- MSP430 -->`
+:raw-html:`<td class="no"></td> <!-- Mips -->`
+:raw-html:`<td class="no"></td> <!-- PTX -->`
+:raw-html:`<td class="no"></td> <!-- PowerPC -->`
+:raw-html:`<td class="no"></td> <!-- Sparc -->`
+:raw-html:`<td class="yes"></td> <!-- X86 -->`
+:raw-html:`<td class="no"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
+:raw-html:`<td><a href="#feat_disassembler">disassembler</a></td>`
+:raw-html:`<td class="yes"></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="no"></td> <!-- Hexagon -->`
+:raw-html:`<td class="yes"></td> <!-- MBlaze -->`
+:raw-html:`<td class="no"></td> <!-- MSP430 -->`
+:raw-html:`<td class="no"></td> <!-- Mips -->`
+:raw-html:`<td class="no"></td> <!-- PTX -->`
+:raw-html:`<td class="no"></td> <!-- PowerPC -->`
+:raw-html:`<td class="no"></td> <!-- Sparc -->`
+:raw-html:`<td class="yes"></td> <!-- X86 -->`
+:raw-html:`<td class="no"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
+:raw-html:`<td><a href="#feat_inlineasm">inline asm</a></td>`
+:raw-html:`<td class="yes"></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="yes"></td> <!-- Hexagon -->`
+:raw-html:`<td class="yes"></td> <!-- MBlaze -->`
+:raw-html:`<td class="unknown"></td> <!-- MSP430 -->`
+:raw-html:`<td class="no"></td> <!-- Mips -->`
+:raw-html:`<td class="unknown"></td> <!-- PTX -->`
+:raw-html:`<td class="yes"></td> <!-- PowerPC -->`
+:raw-html:`<td class="unknown"></td> <!-- Sparc -->`
+:raw-html:`<td class="yes"></td> <!-- X86 -->`
+:raw-html:`<td class="unknown"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
+:raw-html:`<td><a href="#feat_jit">jit</a></td>`
+:raw-html:`<td class="partial"><a href="#feat_jit_arm">*</a></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="no"></td> <!-- Hexagon -->`
+:raw-html:`<td class="no"></td> <!-- MBlaze -->`
+:raw-html:`<td class="unknown"></td> <!-- MSP430 -->`
+:raw-html:`<td class="yes"></td> <!-- Mips -->`
+:raw-html:`<td class="unknown"></td> <!-- PTX -->`
+:raw-html:`<td class="yes"></td> <!-- PowerPC -->`
+:raw-html:`<td class="unknown"></td> <!-- Sparc -->`
+:raw-html:`<td class="yes"></td> <!-- X86 -->`
+:raw-html:`<td class="unknown"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
+:raw-html:`<td><a href="#feat_objectwrite">.o&nbsp;file writing</a></td>`
+:raw-html:`<td class="no"></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="no"></td> <!-- Hexagon -->`
+:raw-html:`<td class="yes"></td> <!-- MBlaze -->`
+:raw-html:`<td class="no"></td> <!-- MSP430 -->`
+:raw-html:`<td class="no"></td> <!-- Mips -->`
+:raw-html:`<td class="no"></td> <!-- PTX -->`
+:raw-html:`<td class="no"></td> <!-- PowerPC -->`
+:raw-html:`<td class="no"></td> <!-- Sparc -->`
+:raw-html:`<td class="yes"></td> <!-- X86 -->`
+:raw-html:`<td class="no"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
:raw-html:`<td><a href="#feat_tailcall">tail calls</a></td>`
+:raw-html:`<td class="yes"></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="yes"></td> <!-- Hexagon -->`
+:raw-html:`<td class="no"></td> <!-- MBlaze -->`
+:raw-html:`<td class="unknown"></td> <!-- MSP430 -->`
+:raw-html:`<td class="no"></td> <!-- Mips -->`
+:raw-html:`<td class="unknown"></td> <!-- PTX -->`
+:raw-html:`<td class="yes"></td> <!-- PowerPC -->`
+:raw-html:`<td class="unknown"></td> <!-- Sparc -->`
+:raw-html:`<td class="yes"></td> <!-- X86 -->`
+:raw-html:`<td class="unknown"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`<tr>`
+:raw-html:`<td><a href="#feat_segstacks">segmented stacks</a></td>`
+:raw-html:`<td class="no"></td> <!-- ARM -->`
+:raw-html:`<td class="no"></td> <!-- CellSPU -->`
+:raw-html:`<td class="no"></td> <!-- Hexagon -->`
+:raw-html:`<td class="no"></td> <!-- MBlaze -->`
+:raw-html:`<td class="no"></td> <!-- MSP430 -->`
+:raw-html:`<td class="no"></td> <!-- Mips -->`
+:raw-html:`<td class="no"></td> <!-- PTX -->`
+:raw-html:`<td class="no"></td> <!-- PowerPC -->`
+:raw-html:`<td class="no"></td> <!-- Sparc -->`
+:raw-html:`<td class="partial"><a href="#feat_segstacks_x86">*</a></td> <!-- X86 -->`
+:raw-html:`<td class="no"></td> <!-- XCore -->`
+:raw-html:`</tr>`
+
+:raw-html:`</table>`
+
+.. _feat_reliable:
+
+Is Generally Reliable
+^^^^^^^^^^^^^^^^^^^^^
+
+This box indicates whether the target is considered to be production quality.
+That is, the target has been used as a static compiler to compile large amounts
+of code by a variety of different people, and is in continuous use.
+
+.. _feat_asmparser:
+
+Assembly Parser
+^^^^^^^^^^^^^^^
+
+This box indicates whether the target supports parsing target-specific .s files
+by implementing the MCAsmParser interface. This is required for llvm-mc to be
+able to act as a native assembler, and is required for inline assembly support
+in the native .o file writer.
+
+.. _feat_disassembler:
+
+Disassembler
+^^^^^^^^^^^^
+
+This box indicates whether the target supports the MCDisassembler API for
+disassembling machine opcode bytes into MCInst's.
+
+.. _feat_inlineasm:
+
+Inline Asm
+^^^^^^^^^^
+
+This box indicates whether the target supports most popular inline assembly
+constraints and modifiers.
+
+.. _feat_jit:
+
+JIT Support
+^^^^^^^^^^^
+
+This box indicates whether the target supports the JIT compiler through the
+ExecutionEngine interface.
+
+.. _feat_jit_arm:
+
+The ARM backend has basic support for integer code in ARM codegen mode, but
+lacks NEON and full Thumb support.
+
+.. _feat_objectwrite:
+
+.o File Writing
+^^^^^^^^^^^^^^^
+
+This box indicates whether the target supports writing .o files (e.g. MachO,
+ELF, and/or COFF) directly from the target. Note that the target must also
+include an assembly parser and general inline assembly support for full inline
+assembly support in the .o writer.
+
+Targets that don't support this feature can obviously still write out .o files;
+they just rely on an external assembler to translate from a .s file to a .o
+file (as is the case for many C compilers).
+
+.. _feat_tailcall:
+
+Tail Calls
+^^^^^^^^^^
+
+This box indicates whether the target supports guaranteed tail calls. These
+are calls marked "`tail <LangRef.html#i_call>`_" that use the fastcc calling
+convention. Please see the `tail call section for more details`_.
+
+.. _feat_segstacks:
+
+Segmented Stacks
+^^^^^^^^^^^^^^^^
+
+This box indicates whether the target supports segmented stacks. This replaces
+the traditional large C stack with many linked segments. It is compatible with
+the `gcc implementation <http://gcc.gnu.org/wiki/SplitStacks>`_ used by the Go
+front end.
+
+.. _feat_segstacks_x86:
+
+Basic support exists on the X86 backend. Currently vararg doesn't work and the
+object files are not marked the way the gold linker expects, but simple Go
+programs can be built by dragonegg.
+
+.. _tail call section for more details:
+
+Tail call optimization
+----------------------
+
+Tail call optimization, in which the callee reuses the stack of the caller, is
+currently supported on x86/x86-64 and PowerPC. It is performed if:
+
+* Caller and callee have the calling convention ``fastcc`` or ``cc 10`` (the
+ GHC calling convention).
+
+* The call is a tail call - in tail position (the ``ret`` immediately follows
+ the ``call``, and the ``ret`` uses the value of the ``call`` or is void).
+
+* Option ``-tailcallopt`` is enabled.
+
+* Platform specific constraints are met.
+
+x86/x86-64 constraints:
+
+* No variable argument lists are used.
+
+* On x86-64, when generating GOT/PIC code, only module-local calls (visibility
+ = hidden or protected) are supported.
+
+PowerPC constraints:
+
+* No variable argument lists are used.
+
+* No byval parameters are used.
+
+* On ppc32/64, when generating GOT/PIC code, only module-local calls
+ (visibility = hidden or protected) are supported.
+
+Example:
+
+Compile with ``llc -tailcallopt test.ll``.
+
+.. code-block:: llvm
+
+ declare fastcc i32 @tailcallee(i32 inreg %a1, i32 inreg %a2, i32 %a3, i32 %a4)
+
+ define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
+ %l1 = add i32 %in1, %in2
+ %tmp = tail call fastcc i32 @tailcallee(i32 inreg %in1, i32 inreg %in2, i32 %in1, i32 %l1)
+ ret i32 %tmp
+ }
+
+Implications of ``-tailcallopt``:
+
+To support tail call optimization in situations where the callee has more
+arguments than the caller, a 'callee pops arguments' convention is used. This
+currently causes each ``fastcc`` call that is not tail call optimized (because
+one or more of the above constraints is not met) to be followed by a
+readjustment of the stack, so performance might be worse in such cases.
+
+Sibling call optimization
+-------------------------
+
+Sibling call optimization is a restricted form of tail call optimization.
+Unlike the tail call optimization described in the previous section, it can be
+performed automatically on any tail call when the ``-tailcallopt`` option is
+not specified.
+
+Sibling call optimization is currently performed on x86/x86-64 when the
+following constraints are met:
+
+* Caller and callee have the same calling convention. It can be either ``c`` or
+ ``fastcc``.
+
+* The call is a tail call - in tail position (the ``ret`` immediately follows
+ the ``call``, and the ``ret`` uses the value of the ``call`` or is void).
+
+* Caller and callee have matching return types, or the callee result is not
+ used.
+
+* If any of the callee arguments are passed on the stack, they must be
+ available in the caller's own incoming argument stack, and the frame offsets
+ must be the same.
+
+Example:
+
+.. code-block:: llvm
+
+ declare i32 @bar(i32, i32)
+
+ define i32 @foo(i32 %a, i32 %b, i32 %c) {
+ entry:
+ %0 = tail call i32 @bar(i32 %a, i32 %b)
+ ret i32 %0
+ }
+
+The X86 backend
+---------------
+
+The X86 code generator lives in the ``lib/Target/X86`` directory. This code
+generator is capable of targeting a variety of x86-32 and x86-64 processors, and
+includes support for ISA extensions such as MMX and SSE.
+
+X86 Target Triples supported
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following are the known target triples that are supported by the X86
+backend. This is not an exhaustive list, and it would be useful to add those
+that people test.
+
+* **i686-pc-linux-gnu** --- Linux
+
+* **i386-unknown-freebsd5.3** --- FreeBSD 5.3
+
+* **i686-pc-cygwin** --- Cygwin on Win32
+
+* **i686-pc-mingw32** --- MingW on Win32
+
+* **i386-pc-mingw32msvc** --- MingW crosscompiler on Linux
+
+* **i686-apple-darwin*** --- Apple Darwin on X86
+
+* **x86_64-unknown-linux-gnu** --- Linux
+
+X86 Calling Conventions supported
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following target-specific calling conventions are known to the backend:
+
+* **x86_StdCall** --- stdcall calling convention seen on Microsoft Windows
+ platform (CC ID = 64).
+
+* **x86_FastCall** --- fastcall calling convention seen on Microsoft Windows
+ platform (CC ID = 65).
+
+* **x86_ThisCall** --- Similar to x86_StdCall. Passes the first argument in
+ ECX, others via the stack. The callee is responsible for cleaning up the
+ stack. This convention is used by MSVC by default for methods in its ABI (CC
+ ID = 70).
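+
+At the LLVM IR level, these conventions appear as explicit calling convention
+markers on function definitions and call sites. A minimal sketch (the function
+name is illustrative):
+
+.. code-block:: llvm
+
+ define x86_stdcallcc i32 @add_stdcall(i32 %a, i32 %b) nounwind {
+   ; stdcall: arguments on the stack, callee cleans up the stack.
+   %r = add i32 %a, %b
+   ret i32 %r
+ }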
+
+.. _X86 addressing mode:
+
+Representing X86 addressing modes in MachineInstrs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The x86 has a very flexible way of accessing memory. It is capable of forming
+memory addresses of the following form directly in integer instructions (which
+use ModR/M addressing):
+
+::
+
+ SegmentReg: Base + [1,2,4,8] * IndexReg + Disp32
+
+In order to represent this, LLVM tracks no less than 5 operands for each memory
+operand of this form. This means that the "load" form of '``mov``' has the
+following ``MachineOperand``\s in this order:
+
+::
+
+ Index:        0    |    1        2        3            4          5
+ Meaning:   DestReg, | BaseReg,  Scale, IndexReg, Displacement, Segment
+ OperandTy: VirtReg, | VirtReg, UnsImm,  VirtReg,   SignExtImm, PhysReg
+
+Stores, and all other instructions, treat the five memory operands in the same
+way and in the same order. If the segment register is unspecified (regno = 0),
+then no segment override is generated. "Lea" operations do not have a segment
+register specified, so they only have 4 operands for their memory reference.
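+
+For example, the following (illustrative) AT&T-syntax load maps onto these
+operands as shown:
+
+::
+
+ movl 4(%eax,%ebx,2), %ecx      # load from %eax + 2*%ebx + 4
+ => DestReg=%ecx | BaseReg=%eax, Scale=2, IndexReg=%ebx, Displacement=4,
+                   Segment=0 (no override)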
+
+X86 address spaces supported
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+x86 has the ability to perform loads and stores to different address spaces via
+the x86 segment registers. A segment override prefix byte on an instruction
+causes the instruction's memory access to go to the specified segment. LLVM
+address space 0 is the default address space, which includes the stack and any
+unqualified memory accesses in a program. Address spaces 1-255 are currently
+reserved for user-defined code. The GS-segment is represented by address space
+256, while the FS-segment is represented by address space 257. Other x86
+segments have yet to be allocated address space numbers.
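+
+For example, a load through the GS segment can be expressed by qualifying the
+pointer type with address space 256. A minimal sketch (the function name is
+illustrative):
+
+.. code-block:: llvm
+
+ define i32 @gs_load(i32 addrspace(256)* %p) nounwind {
+   ; This load is performed relative to the GS segment base.
+   %v = load i32 addrspace(256)* %p
+   ret i32 %v
+ }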
+
+While these address spaces may seem similar to TLS via the ``thread_local``
+keyword, and often use the same underlying hardware, there are some fundamental
+differences.
+
+The ``thread_local`` keyword applies to global variables and specifies that
+they are to be allocated in thread-local memory. There are no type qualifiers
+involved, and these variables can be pointed to with normal pointers and
+accessed with normal loads and stores. The ``thread_local`` keyword is
+target-independent at the LLVM IR level (though LLVM doesn't yet have
+implementations of it for some configurations).
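+
+For example, a TLS global is declared once and then accessed like any other
+global (a minimal sketch; the names are illustrative):
+
+.. code-block:: llvm
+
+ @counter = thread_local global i32 0
+
+ define i32 @read_counter() nounwind {
+   ; An ordinary load; the TLS indirection is implicit in the global.
+   %v = load i32* @counter
+   ret i32 %v
+ }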
+
+Special address spaces, in contrast, apply to static types. Every load and store
+has a particular address space in its address operand type, and this is what
+determines which address space is accessed. LLVM ignores these special address
+space qualifiers on global variables, and does not provide a way to directly
+allocate storage in them. At the LLVM IR level, the behavior of these special
+address spaces depends in part on the underlying OS or runtime environment, and
+they are specific to x86 (and LLVM doesn't yet handle them correctly in some
+cases).
+
+Some operating systems and runtime environments use (or may in the future use)
+the FS/GS-segment registers for various low-level purposes, so care should be
+taken when considering them.
+
+Instruction naming
+^^^^^^^^^^^^^^^^^^
+
+An instruction name consists of the base name, a default operand size, and a
+character per operand with an optional special size. For example:
+
+::
+
+ ADD8rr -> add, 8-bit register, 8-bit register
+ IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
+ IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
+ MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
+
+The PowerPC backend
+-------------------
+
+The PowerPC code generator lives in the ``lib/Target/PowerPC`` directory. The
+code generation is retargetable to several variations or *subtargets* of the
+PowerPC ISA, including ppc32, ppc64, and Altivec.
+
+LLVM PowerPC ABI
+^^^^^^^^^^^^^^^^
+
+LLVM follows the AIX PowerPC ABI, with two deviations. First, LLVM uses
+PC-relative (PIC) or static addressing for accessing global values, so no TOC
+(r2) is used. Second, r31 is used as a frame pointer to allow dynamic growth of
+a stack frame. LLVM takes advantage of having no TOC to provide space to save
+the frame pointer in the PowerPC linkage area of the caller frame. Other
+details of the PowerPC ABI can be found at `PowerPC ABI
+<http://developer.apple.com/documentation/DeveloperTools/Conceptual/LowLevelABI/Articles/32bitPowerPC.html>`_\
+. Note: This link describes the 32 bit ABI. The 64 bit ABI is similar, except
+that space for GPRs is 8 bytes wide (not 4) and r13 is reserved for system use.
+
+Frame Layout
+^^^^^^^^^^^^
+
+The size of a PowerPC frame is usually fixed for the duration of a function's
+invocation. Since the frame is fixed size, all references into the frame can be
+accessed via fixed offsets from the stack pointer. The exception to this is
+when dynamic alloca or variable-sized arrays are present; then a base pointer
+(r31) is used as a proxy for the stack pointer, and the stack pointer is free
+to grow or shrink. A base pointer is also used if llvm-gcc is not passed the
+``-fomit-frame-pointer`` flag. The stack pointer is always aligned to 16 bytes,
+so that space allocated for Altivec vectors will be properly aligned.
+
+An invocation frame is laid out as follows (low memory at top):
+
+:raw-html:`<table border="1" cellspacing="0">`
+:raw-html:`<tr>`
+:raw-html:`<td>Linkage<br><br></td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>Parameter area<br><br></td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>Dynamic area<br><br></td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>Locals area<br><br></td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>Saved registers area<br><br></td>`
+:raw-html:`</tr>`
+:raw-html:`<tr style="border-style: none hidden none hidden;">`
+:raw-html:`<td><br></td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>Previous Frame<br><br></td>`
+:raw-html:`</tr>`
+:raw-html:`</table>`
+
+The *linkage* area is used by a callee to save special registers prior to
+allocating its own frame. Only three entries are relevant to LLVM. The first
+entry is the previous stack pointer (sp), aka link. This allows probing tools
+like gdb or exception handlers to quickly scan the frames in the stack. A
+function epilog can also use the link to pop the frame from the stack. The
+third entry in the linkage area is used to save the return address from the lr
+register. Finally, as mentioned above, the last entry is used to save the
+previous frame pointer (r31). The entries in the linkage area are the size of a
+GPR; thus the linkage area is 24 bytes long in 32 bit mode and 48 bytes in 64
+bit mode.
+
+32 bit linkage area:
+
+:raw-html:`<table border="1" cellspacing="0">`
+:raw-html:`<tr>`
+:raw-html:`<td>0</td>`
+:raw-html:`<td>Saved SP (r1)</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>4</td>`
+:raw-html:`<td>Saved CR</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>8</td>`
+:raw-html:`<td>Saved LR</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>12</td>`
+:raw-html:`<td>Reserved</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>16</td>`
+:raw-html:`<td>Reserved</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>20</td>`
+:raw-html:`<td>Saved FP (r31)</td>`
+:raw-html:`</tr>`
+:raw-html:`</table>`
+
+64 bit linkage area:
+
+:raw-html:`<table border="1" cellspacing="0">`
+:raw-html:`<tr>`
+:raw-html:`<td>0</td>`
+:raw-html:`<td>Saved SP (r1)</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>8</td>`
+:raw-html:`<td>Saved CR</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>16</td>`
+:raw-html:`<td>Saved LR</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>24</td>`
+:raw-html:`<td>Reserved</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>32</td>`
+:raw-html:`<td>Reserved</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>40</td>`
+:raw-html:`<td>Saved FP (r31)</td>`
+:raw-html:`</tr>`
+:raw-html:`</table>`
+
+The *parameter area* is used to store arguments being passed to a callee
+function. Following the PowerPC ABI, the first few arguments are actually
+passed in registers, with the space in the parameter area unused. However, if
+there are not enough registers or the callee is a thunk or vararg function,
+these register arguments can be spilled into the parameter area. Thus, the
+parameter area must be large enough to store all the parameters for the largest
+call sequence made by the caller. The size must also be minimally large enough
+to spill registers r3-r10. This gives callees that are blind to the call
+signature, such as thunks and vararg functions, enough space to cache the
+argument registers. Therefore, the parameter area is minimally 32 bytes (64
+bytes in 64 bit mode). Also note that since the parameter area is at a fixed
+offset from the top of the frame, a callee can access its spilled arguments
+using fixed offsets from the stack pointer (or base pointer).
+
+Combining the information about the linkage and parameter areas with the
+alignment requirements, a stack frame is minimally 64 bytes in 32 bit mode and
+128 bytes in 64 bit mode.
+
+The *dynamic area* starts out with size zero. If a function uses dynamic
+alloca, then space is added to the stack, the linkage and parameter areas are
+shifted to the top of the stack, and the new space is available immediately
+below the linkage and parameter areas. The cost of shifting the linkage and
+parameter areas is minor, since only the link value needs to be copied. The
+link value can be easily fetched by adding the original frame size to the base
+pointer. Note that allocations in the dynamic space need to observe 16 byte
+alignment.
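+
+For example, a function like the following (an illustrative sketch) requires a
+dynamic area, because the size of the alloca is not known at compile time:
+
+.. code-block:: llvm
+
+ declare void @use(i8*)
+
+ define void @make_buffer(i32 %n) nounwind {
+   ; Dynamically sized stack allocation; observes the required 16 byte
+   ; alignment.
+   %buf = alloca i8, i32 %n, align 16
+   call void @use(i8* %buf)
+   ret void
+ }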
+
+The *locals area* is where the LLVM compiler reserves space for local
+variables.
+
+The *saved registers area* is where the LLVM compiler spills callee-saved
+registers on entry to the callee.
+
+Prolog/Epilog
+^^^^^^^^^^^^^
+
+The LLVM prolog and epilog are the same as described in the PowerPC ABI, with
+the following exceptions. Callee saved registers are spilled after the frame is
+created. This allows the LLVM epilog/prolog support to be common with other
+targets. The base pointer callee saved register r31 is saved in the TOC slot of
+the linkage area. This simplifies allocation of space for the base pointer and
+makes it convenient to locate programmatically and during debugging.
+
+Dynamic Allocation
+^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+ TODO - More to come.
+
+The PTX backend
+---------------
+
+The PTX code generator lives in the ``lib/Target/PTX`` directory. It is
+currently a work-in-progress, but already supports most of the code generation
+functionality needed to generate correct PTX kernels for CUDA devices.
+
+The code generator can target PTX 2.0+ and shader model 1.0+. The PTX ISA
+Reference Manual is used as the primary source of ISA information, though an
+effort is made to make the output of the code generator match the output of the
+NVidia nvcc compiler, whenever possible.
+
+Code Generator Options:
+
+:raw-html:`<table border="1" cellspacing="0">`
+:raw-html:`<tr>`
+:raw-html:`<th>Option</th>`
+:raw-html:`<th>Description</th>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>``double``</td>`
+:raw-html:`<td align="left">If enabled, the map_f64_to_f32 directive is disabled in the PTX output, allowing native double-precision arithmetic</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>``no-fma``</td>`
+:raw-html:`<td align="left">Disable generation of Fused-Multiply Add instructions, which may be beneficial for some devices</td>`
+:raw-html:`</tr>`
+:raw-html:`<tr>`
+:raw-html:`<td>``smxy / computexy``</td>`
+:raw-html:`<td align="left">Set shader model/compute capability to x.y, e.g. sm20 or compute13</td>`
+:raw-html:`</tr>`
+:raw-html:`</table>`
+
+Working:
+
+* Arithmetic instruction selection (including combo FMA)
+
+* Bitwise instruction selection
+
+* Control-flow instruction selection
+
+* Function calls (only on SM 2.0+ and with no return arguments)
+
+* Address spaces (0 = global, 1 = constant, 2 = local, 4 = shared)
+
+* Thread synchronization (bar.sync)
+
+* Special register reads ([N]TID, [N]CTAID, PMx, CLOCK, etc.)
+
+In Progress:
+
+* Robust call instruction selection
+
+* Stack frame allocation
+
+* Device-specific instruction scheduling optimizations
diff --git a/docs/CodingStandards.html b/docs/CodingStandards.html
deleted file mode 100644
index 847ac4c..0000000
--- a/docs/CodingStandards.html
+++ /dev/null
@@ -1,1568 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <link rel="stylesheet" href="llvm.css" type="text/css">
- <title>LLVM Coding Standards</title>
-</head>
-<body>
-
-<h1>
- LLVM Coding Standards
-</h1>
-
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#mechanicalissues">Mechanical Source Issues</a>
- <ol>
- <li><a href="#sourceformating">Source Code Formatting</a>
- <ol>
- <li><a href="#scf_commenting">Commenting</a></li>
- <li><a href="#scf_commentformat">Comment Formatting</a></li>
- <li><a href="#scf_includes"><tt>#include</tt> Style</a></li>
- <li><a href="#scf_codewidth">Source Code Width</a></li>
- <li><a href="#scf_spacestabs">Use Spaces Instead of Tabs</a></li>
- <li><a href="#scf_indentation">Indent Code Consistently</a></li>
- </ol></li>
- <li><a href="#compilerissues">Compiler Issues</a>
- <ol>
- <li><a href="#ci_warningerrors">Treat Compiler Warnings Like
- Errors</a></li>
- <li><a href="#ci_portable_code">Write Portable Code</a></li>
- <li><a href="#ci_rtti_exceptions">Do not use RTTI or Exceptions</a></li>
- <li><a href="#ci_static_ctors">Do not use Static Constructors</a></li>
- <li><a href="#ci_class_struct">Use of <tt>class</tt>/<tt>struct</tt> Keywords</a></li>
- </ol></li>
- </ol></li>
- <li><a href="#styleissues">Style Issues</a>
- <ol>
- <li><a href="#macro">The High-Level Issues</a>
- <ol>
- <li><a href="#hl_module">A Public Header File <b>is</b> a
- Module</a></li>
- <li><a href="#hl_dontinclude"><tt>#include</tt> as Little as Possible</a></li>
- <li><a href="#hl_privateheaders">Keep "internal" Headers
- Private</a></li>
- <li><a href="#hl_earlyexit">Use Early Exits and <tt>continue</tt> to Simplify
- Code</a></li>
- <li><a href="#hl_else_after_return">Don't use <tt>else</tt> after a
- <tt>return</tt></a></li>
- <li><a href="#hl_predicateloops">Turn Predicate Loops into Predicate
- Functions</a></li>
- </ol></li>
- <li><a href="#micro">The Low-Level Issues</a>
- <ol>
- <li><a href="#ll_naming">Name Types, Functions, Variables, and Enumerators Properly</a></li>
- <li><a href="#ll_assert">Assert Liberally</a></li>
- <li><a href="#ll_ns_std">Do not use '<tt>using namespace std</tt>'</a></li>
- <li><a href="#ll_virtual_anch">Provide a virtual method anchor for
- classes in headers</a></li>
- <li><a href="#ll_end">Don't evaluate <tt>end()</tt> every time through a
- loop</a></li>
- <li><a href="#ll_iostream"><tt>#include &lt;iostream&gt;</tt> is
- <em>forbidden</em></a></li>
- <li><a href="#ll_raw_ostream">Use <tt>raw_ostream</tt></a></li>
- <li><a href="#ll_avoidendl">Avoid <tt>std::endl</tt></a></li>
- </ol></li>
-
- <li><a href="#nano">Microscopic Details</a>
- <ol>
- <li><a href="#micro_spaceparen">Spaces Before Parentheses</a></li>
- <li><a href="#micro_preincrement">Prefer Preincrement</a></li>
- <li><a href="#micro_namespaceindent">Namespace Indentation</a></li>
- <li><a href="#micro_anonns">Anonymous Namespaces</a></li>
- </ol></li>
-
-
- </ol></li>
- <li><a href="#seealso">See Also</a></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a></p>
-</div>
-
-
-<!-- *********************************************************************** -->
-<h2><a name="introduction">Introduction</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document attempts to describe a few coding standards that are being used
-in the LLVM source tree. Although no coding standards should be regarded as
-absolute requirements to be followed in all instances, coding standards are
-particularly important for large-scale code bases that follow a library-based
-design (like LLVM).</p>
-
-<p>This document intentionally does not prescribe fixed standards for religious
-issues such as brace placement and space usage. For issues like this, follow
-the golden rule:</p>
-
-<blockquote>
-
-<p><b><a name="goldenrule">If you are extending, enhancing, or bug fixing
-already implemented code, use the style that is already being used so that the
-source is uniform and easy to follow.</a></b></p>
-
-</blockquote>
-
-<p>Note that some code bases (e.g. libc++) have really good reasons to deviate
-from the coding standards. In the case of libc++, this is because the naming
-and other conventions are dictated by the C++ standard. If you think there is
-a specific good reason to deviate from the standards here, please bring it up
-on the LLVMdev mailing list.</p>
-
-<p>There are some conventions that are not uniformly followed in the code base
-(e.g. the naming convention). This is because they are relatively new, and a
-lot of code was written before they were put in place. Our long term goal is
-for the entire codebase to follow the convention, but we explicitly <em>do
-not</em> want patches that do large-scale reformating of existing code. OTOH,
-it is reasonable to rename the methods of a class if you're about to change it
-in some other way. Just do the reformating as a separate commit from the
-functionality change. </p>
-
-<p>The ultimate goal of these guidelines is the increase readability and
-maintainability of our common source base. If you have suggestions for topics to
-be included, please mail them to <a
-href="mailto:sabre@nondot.org">Chris</a>.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="mechanicalissues">Mechanical Source Issues</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="sourceformating">Source Code Formatting</a>
-</h3>
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="scf_commenting">Commenting</a>
-</h4>
-
-<div>
-
-<p>Comments are one critical part of readability and maintainability. Everyone
-knows they should comment their code, and so should you. When writing comments,
-write them as English prose, which means they should use proper capitalization,
-punctuation, etc. Aim to describe what a code is trying to do and why, not
-"how" it does it at a micro level. Here are a few critical things to
-document:</p>
-
-<h5>File Headers</h5>
-
-<div>
-
-<p>Every source file should have a header on it that describes the basic
-purpose of the file. If a file does not have a header, it should not be
-checked into the tree. The standard header looks like this:</p>
-
-<div class="doc_code">
-<pre>
-//===-- llvm/Instruction.h - Instruction class definition -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the Instruction class, which is the
-// base class for all of the VM instructions.
-//
-//===----------------------------------------------------------------------===//
-</pre>
-</div>
-
-<p>A few things to note about this particular format: The "<tt>-*- C++
--*-</tt>" string on the first line is there to tell Emacs that the source file
-is a C++ file, not a C file (Emacs assumes <tt>.h</tt> files are C files by default).
-Note that this tag is not necessary in <tt>.cpp</tt> files. The name of the file is also
-on the first line, along with a very short description of the purpose of the
-file. This is important when printing out code and flipping though lots of
-pages.</p>
-
-<p>The next section in the file is a concise note that defines the license
-that the file is released under. This makes it perfectly clear what terms the
-source code can be distributed under and should not be modified in any way.</p>
-
-<p>The main body of the description does not have to be very long in most cases.
-Here it's only two lines. If an algorithm is being implemented or something
-tricky is going on, a reference to the paper where it is published should be
-included, as well as any notes or "gotchas" in the code to watch out for.</p>
-
-</div>
-
-<h5>Class overviews</h5>
-
-<p>Classes are one fundamental part of a good object oriented design. As such,
-a class definition should have a comment block that explains what the class is
-used for and how it works. Every non-trivial class is expected to have a
-doxygen comment block.</p>
-
-
-<h5>Method information</h5>
-
-<div>
-
-<p>Methods defined in a class (as well as any global functions) should also be
-documented properly. A quick note about what it does and a description of the
-borderline behaviour is all that is necessary here (unless something
-particularly tricky or insidious is going on). The hope is that people can
-figure out how to use your interfaces without reading the code itself.</p>
-
-<p>Good things to talk about here are what happens when something unexpected
-happens: does the method return null? Abort? Format your hard disk?</p>
-
-</div>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="scf_commentformat">Comment Formatting</a>
-</h4>
-
-<div>
-
-<p>In general, prefer C++ style (<tt>//</tt>) comments. They take less space,
-require less typing, don't have nesting problems, etc. There are a few cases
-when it is useful to use C style (<tt>/* */</tt>) comments however:</p>
-
-<ol>
- <li>When writing C code: Obviously if you are writing C code, use C style
- comments.</li>
- <li>When writing a header file that may be <tt>#include</tt>d by a C source
- file.</li>
- <li>When writing a source file that is used by a tool that only accepts C
- style comments.</li>
-</ol>
-
-<p>To comment out a large block of code, use <tt>#if 0</tt> and <tt>#endif</tt>.
-These nest properly and are better behaved in general than C style comments.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="scf_includes"><tt>#include</tt> Style</a>
-</h4>
-
-<div>
-
-<p>Immediately after the <a href="#scf_commenting">header file comment</a> (and
-include guards if working on a header file), the <a
-href="#hl_dontinclude">minimal</a> list of <tt>#include</tt>s required by the
-file should be listed. We prefer these <tt>#include</tt>s to be listed in this
-order:</p>
-
-<ol>
- <li><a href="#mmheader">Main Module Header</a></li>
- <li><a href="#hl_privateheaders">Local/Private Headers</a></li>
- <li><tt>llvm/*</tt></li>
- <li><tt>llvm/Analysis/*</tt></li>
- <li><tt>llvm/Assembly/*</tt></li>
- <li><tt>llvm/Bitcode/*</tt></li>
- <li><tt>llvm/CodeGen/*</tt></li>
- <li>...</li>
- <li><tt>Support/*</tt></li>
- <li><tt>Config/*</tt></li>
- <li>System <tt>#includes</tt></li>
-</ol>
-
-<p>and each category should be sorted by name.</p>
-
-<p><a name="mmheader">The "Main Module Header"</a> file applies to <tt>.cpp</tt> files
-which implement an interface defined by a <tt>.h</tt> file. This <tt>#include</tt>
-should always be included <b>first</b> regardless of where it lives on the file
-system. By including a header file first in the <tt>.cpp</tt> files that implement the
-interfaces, we ensure that the header does not have any hidden dependencies
-which are not explicitly #included in the header, but should be. It is also a
-form of documentation in the <tt>.cpp</tt> file to indicate where the interfaces it
-implements are defined.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="scf_codewidth">Source Code Width</a>
-</h4>
-
-<div>
-
-<p>Write your code to fit within 80 columns of text. This helps those of us who
-like to print out code and look at your code in an xterm without resizing
-it.</p>
-
-<p>The longer answer is that there must be some limit to the width of the code
-in order to reasonably allow developers to have multiple files side-by-side in
-windows on a modest display. If you are going to pick a width limit, it is
-somewhat arbitrary but you might as well pick something standard. Going with
-90 columns (for example) instead of 80 columns wouldn't add any significant
-value and would be detrimental to printing out code. Also many other projects
-have standardized on 80 columns, so some people have already configured their
-editors for it (vs something else, like 90 columns).</p>
-
-<p>This is one of many contentious issues in coding standards, but it is not up
-for debate.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="scf_spacestabs">Use Spaces Instead of Tabs</a>
-</h4>
-
-<div>
-
-<p>In all cases, prefer spaces to tabs in source files. People have different
-preferred indentation levels, and different styles of indentation that they
-like; this is fine. What isn't fine is that different editors/viewers expand
-tabs out to different tab stops. This can cause your code to look completely
-unreadable, and it is not worth dealing with.</p>
-
-<p>As always, follow the <a href="#goldenrule">Golden Rule</a> above: follow the
-style of existing code if you are modifying and extending it. If you like four
-spaces of indentation, <b>DO NOT</b> do that in the middle of a chunk of code
-with two spaces of indentation. Also, do not reindent a whole source file: it
-makes for incredible diffs that are absolutely worthless.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="scf_indentation">Indent Code Consistently</a>
-</h4>
-
-<div>
-
-<p>Okay, in your first year of programming you were told that indentation is
-important. If you didn't believe and internalize this then, now is the time.
-Just do it.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="compilerissues">Compiler Issues</a>
-</h3>
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ci_warningerrors">Treat Compiler Warnings Like Errors</a>
-</h4>
-
-<div>
-
-<p>If your code has compiler warnings in it, something is wrong &mdash; you
-aren't casting values correctly, your have "questionable" constructs in your
-code, or you are doing something legitimately wrong. Compiler warnings can
-cover up legitimate errors in output and make dealing with a translation unit
-difficult.</p>
-
-<p>It is not possible to prevent all warnings from all compilers, nor is it
-desirable. Instead, pick a standard compiler (like <tt>gcc</tt>) that provides
-a good thorough set of warnings, and stick to it. At least in the case of
-<tt>gcc</tt>, it is possible to work around any spurious errors by changing the
-syntax of the code slightly. For example, a warning that annoys me occurs when
-I write code like this:</p>
-
-<div class="doc_code">
-<pre>
-if (V = getValue()) {
- ...
-}
-</pre>
-</div>
-
-<p><tt>gcc</tt> will warn me that I probably want to use the <tt>==</tt>
-operator, and that I probably mistyped it. In most cases, I haven't, and I
-really don't want the spurious errors. To fix this particular problem, I
-rewrite the code like this:</p>
-
-<div class="doc_code">
-<pre>
-if ((V = getValue())) {
- ...
-}
-</pre>
-</div>
-
-<p>which shuts <tt>gcc</tt> up. Any <tt>gcc</tt> warning that annoys you can
-be fixed by massaging the code appropriately.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ci_portable_code">Write Portable Code</a>
-</h4>
-
-<div>
-
-<p>In almost all cases, it is possible and within reason to write completely
-portable code. If there are cases where it isn't possible to write portable
-code, isolate it behind a well defined (and well documented) interface.</p>
-
-<p>In practice, this means that you shouldn't assume much about the host
-compiler, and Visual Studio tends to be the lowest common denominator.
-If advanced features are used, they should only be an implementation detail of
-a library which has a simple exposed API, and preferably be buried in
-libSystem.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
-<a name="ci_rtti_exceptions">Do not use RTTI or Exceptions</a>
-</h4>
-<div>
-
-<p>In an effort to reduce code and executable size, LLVM does not use RTTI
-(e.g. <tt>dynamic_cast&lt;&gt;</tt>) or exceptions. These two language features
-violate the general C++ principle of <i>"you only pay for what you use"</i>,
-causing executable bloat even if exceptions are never used in the code base, or
-if RTTI is never used for a class. Because of this, we turn them off globally
-in the code.</p>
-
-<p>That said, LLVM does make extensive use of a hand-rolled form of RTTI that
-use templates like <a href="ProgrammersManual.html#isa"><tt>isa&lt;&gt;</tt>,
-<tt>cast&lt;&gt;</tt>, and <tt>dyn_cast&lt;&gt;</tt></a>. This form of RTTI is
-opt-in and can be added to any class. It is also substantially more efficient
-than <tt>dynamic_cast&lt;&gt;</tt>.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
-<a name="ci_static_ctors">Do not use Static Constructors</a>
-</h4>
-<div>
-
-<p>Static constructors and destructors (e.g. global variables whose types have
-a constructor or destructor) should not be added to the code base, and should be
-removed wherever possible. Besides <a
-href="http://yosefk.com/c++fqa/ctors.html#fqa-10.12">well known problems</a>
-where the order of initialization is undefined between globals in different
-source files, the entire concept of static constructors is at odds with the
-common use case of LLVM as a library linked into a larger application.</p>
-
-<p>Consider the use of LLVM as a JIT linked into another application (perhaps
-for <a href="http://llvm.org/Users.html">OpenGL, custom languages</a>,
-<a href="http://llvm.org/devmtg/2010-11/Gritz-OpenShadingLang.pdf">shaders in
-movies</a>, etc). Due to the design of static constructors, they must be
-executed at startup time of the entire application, regardless of whether or
-how LLVM is used in that larger application. There are two problems with
-this:</p>
-
-<ol>
- <li>The time to run the static constructors impacts startup time of
- applications &mdash; a critical time for GUI apps, among others.</li>
-
- <li>The static constructors cause the app to pull many extra pages of memory
- off the disk: both the code for the constructor in each <tt>.o</tt> file and
- the small amount of data that gets touched. In addition, touched/dirty pages
- put more pressure on the VM system on low-memory machines.</li>
-</ol>
-
-<p>We would really like for there to be zero cost for linking in an additional
-LLVM target or other library into an application, but static constructors
-violate this goal.</p>
-
-<p>That said, LLVM unfortunately does contain static constructors. It would be
-a <a href="http://llvm.org/PR11944">great project</a> for someone to purge all
-static constructors from LLVM, and then enable the
-<tt>-Wglobal-constructors</tt> warning flag (when building with Clang) to ensure
-we do not regress in the future.
-</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
-<a name="ci_class_struct">Use of <tt>class</tt> and <tt>struct</tt> Keywords</a>
-</h4>
-<div>
-
-<p>In C++, the <tt>class</tt> and <tt>struct</tt> keywords can be used almost
-interchangeably. The only difference is when they are used to declare a class:
-<tt>class</tt> makes all members private by default while <tt>struct</tt> makes
-all members public by default.</p>
-
-<p>Unfortunately, not all compilers follow the rules and some will generate
-different symbols based on whether <tt>class</tt> or <tt>struct</tt> was used to
-declare the symbol. This can lead to problems at link time.</p>
-
-<p>So, the rule for LLVM is to always use the <tt>class</tt> keyword, unless
-<b>all</b> members are public and the type is a C++
-<a href="http://en.wikipedia.org/wiki/Plain_old_data_structure">POD</a> type, in
-which case <tt>struct</tt> is allowed.</p>
-
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="styleissues">Style Issues</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="macro">The High-Level Issues</a>
-</h3>
-<!-- ======================================================================= -->
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="hl_module">A Public Header File <b>is</b> a Module</a>
-</h4>
-
-<div>
-
-<p>C++ doesn't do too well in the modularity department. There is no real
-encapsulation or data hiding (unless you use expensive protocol classes), but it
-is what we have to work with. When you write a public header file (in the LLVM
-source tree, they live in the top level "<tt>include</tt>" directory), you are
-defining a module of functionality.</p>
-
-<p>Ideally, modules should be completely independent of each other, and their
-header files should only <tt>#include</tt> the absolute minimum number of
-headers possible. A module is not just a class, a function, or a
-namespace: <a href="http://www.cuj.com/articles/2000/0002/0002c/0002c.htm">it's
-a collection of these</a> that defines an interface. This interface may be
-several functions, classes, or data structures, but the important issue is how
-they work together.</p>
-
-<p>In general, a module should be implemented by one or more <tt>.cpp</tt>
-files. Each of these <tt>.cpp</tt> files should include the header that defines
-their interface first. This ensures that all of the dependences of the module
-header have been properly added to the module header itself, and are not
-implicit. System headers should be included after user headers for a
-translation unit.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="hl_dontinclude"><tt>#include</tt> as Little as Possible</a>
-</h4>
-
-<div>
-
-<p><tt>#include</tt> hurts compile time performance. Don't do it unless you
-have to, especially in header files.</p>
-
-<p>But wait! Sometimes you need to have the definition of a class to use it, or
-to inherit from it. In these cases go ahead and <tt>#include</tt> that header
-file. Be aware however that there are many cases where you don't need to have
-the full definition of a class. If you are using a pointer or reference to a
-class, you don't need the header file. If you are simply returning a class
-instance from a prototyped function or method, you don't need it. In fact, for
-most cases, you simply don't need the definition of a class. And not
-<tt>#include</tt>'ing speeds up compilation.</p>
-
-<p>It is easy to try to go too overboard on this recommendation, however. You
-<b>must</b> include all of the header files that you are using &mdash; you can
-include them either directly or indirectly (through another header file). To
-make sure that you don't accidentally forget to include a header file in your
-module header, make sure to include your module header <b>first</b> in the
-implementation file (as mentioned above). This way there won't be any hidden
-dependencies that you'll find out about later.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="hl_privateheaders">Keep "Internal" Headers Private</a>
-</h4>
-
-<div>
-
-<p>Many modules have a complex implementation that causes them to use more than
-one implementation (<tt>.cpp</tt>) file. It is often tempting to put the
-internal communication interface (helper classes, extra functions, etc) in the
-public module header file. Don't do this!</p>
-
-<p>If you really need to do something like this, put a private header file in
-the same directory as the source files, and include it locally. This ensures
-that your private interface remains private and undisturbed by outsiders.</p>
-
-<p>Note however, that it's okay to put extra implementation methods in a public
-class itself. Just make them private (or protected) and all is well.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="hl_earlyexit">Use Early Exits and <tt>continue</tt> to Simplify Code</a>
-</h4>
-
-<div>
-
-<p>When reading code, keep in mind how much state and how many previous
-decisions have to be remembered by the reader to understand a block of code.
-Aim to reduce indentation where possible when it doesn't make it more difficult
-to understand the code. One great way to do this is by making use of early
-exits and the <tt>continue</tt> keyword in long loops. As an example of using
-an early exit from a function, consider this "bad" code:</p>
-
-<div class="doc_code">
-<pre>
-Value *DoSomething(Instruction *I) {
- if (!isa&lt;TerminatorInst&gt;(I) &amp;&amp;
- I-&gt;hasOneUse() &amp;&amp; SomeOtherThing(I)) {
- ... some long code ....
- }
-
- return 0;
-}
-</pre>
-</div>
-
-<p>This code has several problems if the body of the '<tt>if</tt>' is large.
-When you're looking at the top of the function, it isn't immediately clear that
-this <em>only</em> does interesting things with non-terminator instructions, and
-only applies to things with the other predicates. Second, it is relatively
-difficult to describe (in comments) why these predicates are important because
-the <tt>if</tt> statement makes it difficult to lay out the comments. Third,
-when you're deep within the body of the code, it is indented an extra level.
-Finally, when reading the top of the function, it isn't clear what the result is
-if the predicate isn't true; you have to read to the end of the function to know
-that it returns null.</p>
-
-<p>It is much preferred to format the code like this:</p>
-
-<div class="doc_code">
-<pre>
-Value *DoSomething(Instruction *I) {
- // Terminators never need 'something' done to them because ...
- if (isa&lt;TerminatorInst&gt;(I))
- return 0;
-
- // We conservatively avoid transforming instructions with multiple uses
- // because goats like cheese.
- if (!I-&gt;hasOneUse())
- return 0;
-
- // This is really just here for example.
- if (!SomeOtherThing(I))
- return 0;
-
- ... some long code ....
-}
-</pre>
-</div>
-
-<p>This fixes these problems. A similar problem frequently happens in <tt>for</tt>
-loops. A silly example is something like this:</p>
-
-<div class="doc_code">
-<pre>
- for (BasicBlock::iterator II = BB-&gt;begin(), E = BB-&gt;end(); II != E; ++II) {
- if (BinaryOperator *BO = dyn_cast&lt;BinaryOperator&gt;(II)) {
- Value *LHS = BO-&gt;getOperand(0);
- Value *RHS = BO-&gt;getOperand(1);
- if (LHS != RHS) {
- ...
- }
- }
- }
-</pre>
-</div>
-
-<p>When you have very, very small loops, this sort of structure is fine. But if
-it exceeds more than 10-15 lines, it becomes difficult for people to read and
-understand at a glance. The problem with this sort of code is that it gets very
-nested very quickly. Meaning that the reader of the code has to keep a lot of
-context in their brain to remember what is going immediately on in the loop,
-because they don't know if/when the <tt>if</tt> conditions will have elses etc.
-It is strongly preferred to structure the loop like this:</p>
-
-<div class="doc_code">
-<pre>
- for (BasicBlock::iterator II = BB-&gt;begin(), E = BB-&gt;end(); II != E; ++II) {
- BinaryOperator *BO = dyn_cast&lt;BinaryOperator&gt;(II);
- if (!BO) continue;
-
- Value *LHS = BO-&gt;getOperand(0);
- Value *RHS = BO-&gt;getOperand(1);
- if (LHS == RHS) continue;
-
- ...
- }
-</pre>
-</div>
-
-<p>This has all the benefits of using early exits for functions: it reduces
-nesting of the loop, it makes it easier to describe why the conditions are true,
-and it makes it obvious to the reader that there is no <tt>else</tt> coming up
-that they have to push context into their brain for. If a loop is large, this
-can be a big understandability win.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="hl_else_after_return">Don't use <tt>else</tt> after a <tt>return</tt></a>
-</h4>
-
-<div>
-
-<p>For similar reasons above (reduction of indentation and easier reading),
-please do not use '<tt>else</tt>' or '<tt>else if</tt>' after something that
-interrupts control flow &mdash; like <tt>return</tt>, <tt>break</tt>,
-<tt>continue</tt>, <tt>goto</tt>, etc. For example, this is <em>bad</em>:</p>
-
-<div class="doc_code">
-<pre>
- case 'J': {
- if (Signed) {
- Type = Context.getsigjmp_bufType();
- if (Type.isNull()) {
- Error = ASTContext::GE_Missing_sigjmp_buf;
- return QualType();
- <b>} else {
- break;
- }</b>
- } else {
- Type = Context.getjmp_bufType();
- if (Type.isNull()) {
- Error = ASTContext::GE_Missing_jmp_buf;
- return QualType();
- <b>} else {
- break;
- }</b>
- }
- }
- }
-</pre>
-</div>
-
-<p>It is better to write it like this:</p>
-
-<div class="doc_code">
-<pre>
- case 'J':
- if (Signed) {
- Type = Context.getsigjmp_bufType();
- if (Type.isNull()) {
- Error = ASTContext::GE_Missing_sigjmp_buf;
- return QualType();
- }
- } else {
- Type = Context.getjmp_bufType();
- if (Type.isNull()) {
- Error = ASTContext::GE_Missing_jmp_buf;
- return QualType();
- }
- }
- <b>break;</b>
-</pre>
-</div>
-
-<p>Or better yet (in this case) as:</p>
-
-<div class="doc_code">
-<pre>
- case 'J':
- if (Signed)
- Type = Context.getsigjmp_bufType();
- else
- Type = Context.getjmp_bufType();
-
- if (Type.isNull()) {
- Error = Signed ? ASTContext::GE_Missing_sigjmp_buf :
- ASTContext::GE_Missing_jmp_buf;
- return QualType();
- }
- <b>break;</b>
-</pre>
-</div>
-
-<p>The idea is to reduce indentation and the amount of code you have to keep
-track of when reading the code.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="hl_predicateloops">Turn Predicate Loops into Predicate Functions</a>
-</h4>
-
-<div>
-
-<p>It is very common to write small loops that just compute a boolean value.
-There are a number of ways that people commonly write these, but an example of
-this sort of thing is:</p>
-
-<div class="doc_code">
-<pre>
- <b>bool FoundFoo = false;</b>
- for (unsigned i = 0, e = BarList.size(); i != e; ++i)
- if (BarList[i]-&gt;isFoo()) {
- <b>FoundFoo = true;</b>
- break;
- }
-
- <b>if (FoundFoo) {</b>
- ...
- }
-</pre>
-</div>
-
-<p>This sort of code is awkward to write, and is almost always a bad sign.
-Instead of this sort of loop, we strongly prefer to use a predicate function
-(which may be <a href="#micro_anonns">static</a>) that uses
-<a href="#hl_earlyexit">early exits</a> to compute the predicate. We prefer
-the code to be structured like this:</p>
-
-<div class="doc_code">
-<pre>
-/// ListContainsFoo - Return true if the specified list has an element that is
-/// a foo.
-static bool ListContainsFoo(const std::vector&lt;Bar*&gt; &amp;List) {
- for (unsigned i = 0, e = List.size(); i != e; ++i)
- if (List[i]-&gt;isFoo())
- return true;
- return false;
-}
-...
-
- <b>if (ListContainsFoo(BarList)) {</b>
- ...
- }
-</pre>
-</div>
-
-<p>There are many reasons for doing this: it reduces indentation and factors out
-code which can often be shared by other code that checks for the same predicate.
-More importantly, it <em>forces you to pick a name</em> for the function, and
-forces you to write a comment for it. In this silly example, this doesn't add
-much value. However, if the condition is complex, this can make it a lot easier
-for the reader to understand the code that queries for this predicate. Instead
-of being faced with the in-line details of how we check to see if the BarList
-contains a foo, we can trust the function name and continue reading with better
-locality.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="micro">The Low-Level Issues</a>
-</h3>
-<!-- ======================================================================= -->
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_naming">
- Name Types, Functions, Variables, and Enumerators Properly
- </a>
-</h4>
-
-<div>
-
-<p>Poorly-chosen names can mislead the reader and cause bugs. We cannot stress
-enough how important it is to use <em>descriptive</em> names. Pick names that
-match the semantics and role of the underlying entities, within reason. Avoid
-abbreviations unless they are well known. After picking a good name, make sure
-to use consistent capitalization for the name, as inconsistency requires clients
-to either memorize the APIs or to look it up to find the exact spelling.</p>
-
-<p>In general, names should be in camel case (e.g. <tt>TextFileReader</tt>
-and <tt>isLValue()</tt>). Different kinds of declarations have different
-rules:</p>
-
-<ul>
-<li><p><b>Type names</b> (including classes, structs, enums, typedefs, etc)
- should be nouns and start with an upper-case letter (e.g.
- <tt>TextFileReader</tt>).</p></li>
-
-<li><p><b>Variable names</b> should be nouns (as they represent state). The
- name should be camel case, and start with an upper case letter (e.g.
- <tt>Leader</tt> or <tt>Boats</tt>).</p></li>
-
-<li><p><b>Function names</b> should be verb phrases (as they represent
- actions), and command-like function should be imperative. The name should
- be camel case, and start with a lower case letter (e.g. <tt>openFile()</tt>
- or <tt>isFoo()</tt>).</p></li>
-
-<li><p><b>Enum declarations</b> (e.g. <tt>enum Foo {...}</tt>) are types, so
- they should follow the naming conventions for types. A common use for enums
- is as a discriminator for a union, or an indicator of a subclass. When an
- enum is used for something like this, it should have a <tt>Kind</tt> suffix
- (e.g. <tt>ValueKind</tt>).</p></li>
-
-<li><p><b>Enumerators</b> (e.g. <tt>enum { Foo, Bar }</tt>) and <b>public member
- variables</b> should start with an upper-case letter, just like types.
- Unless the enumerators are defined in their own small namespace or inside a
- class, enumerators should have a prefix corresponding to the enum
- declaration name. For example, <tt>enum ValueKind { ... };</tt> may contain
- enumerators like <tt>VK_Argument</tt>, <tt>VK_BasicBlock</tt>, etc.
- Enumerators that are just convenience constants are exempt from the
- requirement for a prefix. For instance:</p>
-
-<div class="doc_code">
-<pre>
-enum {
- MaxSize = 42,
- Density = 12
-};
-</pre>
-</div>
-</li>
-
-</ul>
-
-<p>As an exception, classes that mimic STL classes can have member names in
-STL's style of lower-case words separated by underscores (e.g. <tt>begin()</tt>,
-<tt>push_back()</tt>, and <tt>empty()</tt>).</p>
-
-<p>Here are some examples of good and bad names:</p>
-
-<div class="doc_code">
-<pre>
-class VehicleMaker {
- ...
- Factory&lt;Tire&gt; F; // Bad -- abbreviation and non-descriptive.
- Factory&lt;Tire&gt; Factory; // Better.
- Factory&lt;Tire&gt; TireFactory; // Even better -- if VehicleMaker has more than one
- // kind of factories.
-};
-
-Vehicle MakeVehicle(VehicleType Type) {
- VehicleMaker M; // Might be OK if having a short life-span.
- Tire tmp1 = M.makeTire(); // Bad -- 'tmp1' provides no information.
- Light headlight = M.makeLight("head"); // Good -- descriptive.
- ...
-}
-</pre>
-</div>
-
-</div>
-
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_assert">Assert Liberally</a>
-</h4>
-
-<div>
-
-<p>Use the "<tt>assert</tt>" macro to its fullest. Check all of your
-preconditions and assumptions, you never know when a bug (not necessarily even
-yours) might be caught early by an assertion, which reduces debugging time
-dramatically. The "<tt>&lt;cassert&gt;</tt>" header file is probably already
-included by the header files you are using, so it doesn't cost anything to use
-it.</p>
-
-<p>To further assist with debugging, make sure to put some kind of error message
-in the assertion statement, which is printed if the assertion is tripped. This
-helps the poor debugger make sense of why an assertion is being made and
-enforced, and hopefully what to do about it. Here is one complete example:</p>
-
-<div class="doc_code">
-<pre>
-inline Value *getOperand(unsigned i) {
- assert(i &lt; Operands.size() &amp;&amp; "getOperand() out of range!");
- return Operands[i];
-}
-</pre>
-</div>
-
-<p>Here are more examples:</p>
-
-<div class="doc_code">
-<pre>
-assert(Ty-&gt;isPointerType() &amp;&amp; "Can't allocate a non pointer type!");
-
-assert((Opcode == Shl || Opcode == Shr) &amp;&amp; "ShiftInst Opcode invalid!");
-
-assert(idx &lt; getNumSuccessors() &amp;&amp; "Successor # out of range!");
-
-assert(V1.getType() == V2.getType() &amp;&amp; "Constant types must be identical!");
-
-assert(isa&lt;PHINode&gt;(Succ-&gt;front()) &amp;&amp; "Only works on PHId BBs!");
-</pre>
-</div>
-
-<p>You get the idea.</p>
-
-<p>Please be aware that, when adding assert statements, not all compilers are aware of
-the semantics of the assert. In some places, asserts are used to indicate a piece of
-code that should not be reached. These are typically of the form:</p>
-
-<div class="doc_code">
-<pre>
-assert(0 &amp;&amp; "Some helpful error message");
-</pre>
-</div>
-
-<p>When used in a function that returns a value, they should be followed with a return
-statement and a comment indicating that this line is never reached. This will prevent
-a compiler which is unable to deduce that the assert statement never returns from
-generating a warning.</p>
-
-<div class="doc_code">
-<pre>
-assert(0 &amp;&amp; "Some helpful error message");
-// Not reached
-return 0;
-</pre>
-</div>
-
-<p>Another issue is that values used only by assertions will produce an "unused
-value" warning when assertions are disabled. For example, this code will
-warn:</p>
-
-<div class="doc_code">
-<pre>
-unsigned Size = V.size();
-assert(Size &gt; 42 &amp;&amp; "Vector smaller than it should be");
-
-bool NewToSet = Myset.insert(Value);
-assert(NewToSet &amp;&amp; "The value shouldn't be in the set yet");
-</pre>
-</div>
-
-<p>These are two interesting different cases. In the first case, the call to
-V.size() is only useful for the assert, and we don't want it executed when
-assertions are disabled. Code like this should move the call into the assert
-itself. In the second case, the side effects of the call must happen whether
-the assert is enabled or not. In this case, the value should be cast to void to
-disable the warning. To be specific, it is preferred to write the code like
-this:</p>
-
-<div class="doc_code">
-<pre>
-assert(V.size() &gt; 42 &amp;&amp; "Vector smaller than it should be");
-
-bool NewToSet = Myset.insert(Value); (void)NewToSet;
-assert(NewToSet &amp;&amp; "The value shouldn't be in the set yet");
-</pre>
-</div>
-
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_ns_std">Do Not Use '<tt>using namespace std</tt>'</a>
-</h4>
-
-<div>
-
-<p>In LLVM, we prefer to explicitly prefix all identifiers from the standard
-namespace with an "<tt>std::</tt>" prefix, rather than rely on
-"<tt>using namespace std;</tt>".</p>
-
-<p> In header files, adding a '<tt>using namespace XXX</tt>' directive pollutes
-the namespace of any source file that <tt>#include</tt>s the header. This is
-clearly a bad thing.</p>
-
-<p>In implementation files (e.g. <tt>.cpp</tt> files), the rule is more of a stylistic
-rule, but is still important. Basically, using explicit namespace prefixes
-makes the code <b>clearer</b>, because it is immediately obvious what facilities
-are being used and where they are coming from. And <b>more portable</b>, because
-namespace clashes cannot occur between LLVM code and other namespaces. The
-portability rule is important because different standard library implementations
-expose different symbols (potentially ones they shouldn't), and future revisions
-to the C++ standard will add more symbols to the <tt>std</tt> namespace. As
-such, we never use '<tt>using namespace std;</tt>' in LLVM.</p>
-
-<p>The exception to the general rule (i.e. it's not an exception for
-the <tt>std</tt> namespace) is for implementation files. For example, all of
-the code in the LLVM project implements code that lives in the 'llvm' namespace.
-As such, it is ok, and actually clearer, for the <tt>.cpp</tt> files to have a
-'<tt>using namespace llvm;</tt>' directive at the top, after the
-<tt>#include</tt>s. This reduces indentation in the body of the file for source
-editors that indent based on braces, and keeps the conceptual context cleaner.
-The general form of this rule is that any <tt>.cpp</tt> file that implements
-code in any namespace may use that namespace (and its parents'), but should not
-use any others.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_virtual_anch">
- Provide a Virtual Method Anchor for Classes in Headers
- </a>
-</h4>
-
-<div>
-
-<p>If a class is defined in a header file and has a v-table (either it has
-virtual methods or it derives from classes with virtual methods), it must
-always have at least one out-of-line virtual method in the class. Without
-this, the compiler will copy the vtable and RTTI into every <tt>.o</tt> file
-that <tt>#include</tt>s the header, bloating <tt>.o</tt> file sizes and
-increasing link times.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_end">Don't evaluate <tt>end()</tt> every time through a loop</a>
-</h4>
-
-<div>
-
-<p>Because C++ doesn't have a standard "<tt>foreach</tt>" loop (though it can be
-emulated with macros and may be coming in C++'0x) we end up writing a lot of
-loops that manually iterate from begin to end on a variety of containers or
-through other data structures. One common mistake is to write a loop in this
-style:</p>
-
-<div class="doc_code">
-<pre>
- BasicBlock *BB = ...
- for (BasicBlock::iterator I = BB->begin(); I != <b>BB->end()</b>; ++I)
- ... use I ...
-</pre>
-</div>
-
-<p>The problem with this construct is that it evaluates "<tt>BB->end()</tt>"
-every time through the loop. Instead of writing the loop like this, we strongly
-prefer loops to be written so that they evaluate it once before the loop starts.
-A convenient way to do this is like so:</p>
-
-<div class="doc_code">
-<pre>
- BasicBlock *BB = ...
- for (BasicBlock::iterator I = BB->begin(), E = <b>BB->end()</b>; I != E; ++I)
- ... use I ...
-</pre>
-</div>
-
-<p>The observant may quickly point out that these two loops may have different
-semantics: if the container (a basic block in this case) is being mutated, then
-"<tt>BB->end()</tt>" may change its value every time through the loop and the
-second loop may not in fact be correct. If you actually do depend on this
-behavior, please write the loop in the first form and add a comment indicating
-that you did it intentionally.</p>
-
-<p>Why do we prefer the second form (when correct)? Writing the loop in the
-first form has two problems. First it may be less efficient than evaluating it
-at the start of the loop. In this case, the cost is probably minor &mdash; a
-few extra loads every time through the loop. However, if the base expression is
-more complex, then the cost can rise quickly. I've seen loops where the end
-expression was actually something like: "<tt>SomeMap[x]->end()</tt>" and map
-lookups really aren't cheap. By writing it in the second form consistently, you
-eliminate the issue entirely and don't even have to think about it.</p>
-
-<p>The second (even bigger) issue is that writing the loop in the first form
-hints to the reader that the loop is mutating the container (a fact that a
-comment would handily confirm!). If you write the loop in the second form, it
-is immediately obvious without even looking at the body of the loop that the
-container isn't being modified, which makes it easier to read the code and
-understand what it does.</p>
-
-<p>While the second form of the loop is a few extra keystrokes, we do strongly
-prefer it.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_iostream"><tt>#include &lt;iostream&gt;</tt> is Forbidden</a>
-</h4>
-
-<div>
-
-<p>The use of <tt>#include &lt;iostream&gt;</tt> in library files is
-hereby <b><em>forbidden</em></b>, because many common implementations
-transparently inject a <a href="#ci_static_ctors">static constructor</a> into
-every translation unit that includes it.</p>
-
-<p>Note that using the other stream headers (<tt>&lt;sstream&gt;</tt> for
-example) is not problematic in this regard &mdash;
-just <tt>&lt;iostream&gt;</tt>. However, <tt>raw_ostream</tt> provides various
-APIs that are better performing for almost every use than <tt>std::ostream</tt>
-style APIs. <b>Therefore new code should always
-use <a href="#ll_raw_ostream"><tt>raw_ostream</tt></a> for writing, or
-the <tt>llvm::MemoryBuffer</tt> API for reading files.</b></p>
-
-</div>
-
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_raw_ostream">Use <tt>raw_ostream</tt></a>
-</h4>
-
-<div>
-
-<p>LLVM includes a lightweight, simple, and efficient stream implementation
-in <tt>llvm/Support/raw_ostream.h</tt>, which provides all of the common
-features of <tt>std::ostream</tt>. All new code should use <tt>raw_ostream</tt>
-instead of <tt>ostream</tt>.</p>
-
-<p>Unlike <tt>std::ostream</tt>, <tt>raw_ostream</tt> is not a template and can
-be forward declared as <tt>class raw_ostream</tt>. Public headers should
-generally not include the <tt>raw_ostream</tt> header, but use forward
-declarations and constant references to <tt>raw_ostream</tt> instances.</p>
-
-</div>
-
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="ll_avoidendl">Avoid <tt>std::endl</tt></a>
-</h4>
-
-<div>
-
-<p>The <tt>std::endl</tt> modifier, when used with <tt>iostreams</tt>, outputs a
-newline to the output stream specified. In addition to doing this, however, it
-also flushes the output stream. In other words, these are equivalent:</p>
-
-<div class="doc_code">
-<pre>
-std::cout &lt;&lt; std::endl;
-std::cout &lt;&lt; '\n' &lt;&lt; std::flush;
-</pre>
-</div>
-
-<p>Most of the time, you probably have no reason to flush the output stream, so
-it's better to use a literal <tt>'\n'</tt>.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="nano">Microscopic Details</a>
-</h3>
-<!-- ======================================================================= -->
-
-<div>
-
-<p>This section describes preferred low-level formatting guidelines along with
-reasoning on why we prefer them.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="micro_spaceparen">Spaces Before Parentheses</a>
-</h4>
-
-<div>
-
-<p>We prefer to put a space before an open parenthesis only in control flow
-statements, but not in normal function call expressions and function-like
-macros. For example, this is good:</p>
-
-<div class="doc_code">
-<pre>
-<b>if (</b>x) ...
-<b>for (</b>i = 0; i != 100; ++i) ...
-<b>while (</b>llvm_rocks) ...
-
-<b>somefunc(</b>42);
-<b><a href="#ll_assert">assert</a>(</b>3 != 4 &amp;&amp; "laws of math are failing me");
-
-a = <b>foo(</b>42, 92) + <b>bar(</b>x);
-</pre>
-</div>
-
-<p>and this is bad:</p>
-
-<div class="doc_code">
-<pre>
-<b>if(</b>x) ...
-<b>for(</b>i = 0; i != 100; ++i) ...
-<b>while(</b>llvm_rocks) ...
-
-<b>somefunc (</b>42);
-<b><a href="#ll_assert">assert</a> (</b>3 != 4 &amp;&amp; "laws of math are failing me");
-
-a = <b>foo (</b>42, 92) + <b>bar (</b>x);
-</pre>
-</div>
-
-<p>The reason for doing this is not completely arbitrary. This style makes
-control flow operators stand out more, and makes expressions flow better. The
-function call operator binds very tightly as a postfix operator. Putting a
-space after a function name (as in the last example) makes it appear that the
-code might bind the arguments of the left-hand-side of a binary operator with
-the argument list of a function and the name of the right side. More
-specifically, it is easy to misread the "a" example as:</p>
-
-<div class="doc_code">
-<pre>
-a = foo <b>(</b>(42, 92) + bar<b>)</b> (x);
-</pre>
-</div>
-
-<p>when skimming through the code. By avoiding a space in a function, we avoid
-this misinterpretation.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="micro_preincrement">Prefer Preincrement</a>
-</h4>
-
-<div>
-
-<p>Hard and fast rule: Preincrement (<tt>++X</tt>) may be no slower than
-postincrement (<tt>X++</tt>) and could very well be a lot faster than it. Use
-preincrement whenever possible.</p>
-
-<p>The semantics of postincrement include making a copy of the value being
-incremented, returning it, and then preincrementing the "work value". For
-primitive types, this isn't a big deal... but for iterators, it can be a huge
-issue (for example, some iterators contain stack and set objects in them...
-copying an iterator could invoke the copy ctors of these as well). In general,
-get in the habit of always using preincrement, and you won't have a problem.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="micro_namespaceindent">Namespace Indentation</a>
-</h4>
-
-<div>
-
-<p>
-In general, we strive to reduce indentation wherever possible. This is useful
-because we want code to <a href="#scf_codewidth">fit into 80 columns</a> without
-wrapping horribly, but also because it makes it easier to understand the code.
-Namespaces are a funny thing: they are often large, and we often desire to put
-lots of stuff into them (so they can be large). Other times they are tiny,
-because they just hold an enum or something similar. In order to balance this,
-we use different approaches for small versus large namespaces.
-</p>
-
-<p>
-If a namespace definition is small and <em>easily</em> fits on a screen (say,
-less than 35 lines of code), then you should indent its body. Here's an
-example:
-</p>
-
-<div class="doc_code">
-<pre>
-namespace llvm {
- namespace X86 {
- /// RelocationType - An enum for the x86 relocation codes. Note that
- /// the terminology here doesn't follow x86 convention - word means
- /// 32-bit and dword means 64-bit.
- enum RelocationType {
- /// reloc_pcrel_word - PC relative relocation, add the relocated value to
- /// the value already in memory, after we adjust it for where the PC is.
- reloc_pcrel_word = 0,
-
- /// reloc_picrel_word - PIC base relative relocation, add the relocated
- /// value to the value already in memory, after we adjust it for where the
- /// PIC base is.
- reloc_picrel_word = 1,
-
- /// reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
- /// add the relocated value to the value already in memory.
- reloc_absolute_word = 2,
- reloc_absolute_dword = 3
- };
- }
-}
-</pre>
-</div>
-
-<p>Since the body is small, indenting adds value because it makes it very clear
-where the namespace starts and ends, and it is easy to take the whole thing in
-in one "gulp" when reading the code. If the blob of code in the namespace is
-larger (as it typically is in a header in the <tt>llvm</tt> or <tt>clang</tt> namespaces), do not
-indent the code, and add a comment indicating what namespace is being closed.
-For example:</p>
-
-<div class="doc_code">
-<pre>
-namespace llvm {
-namespace knowledge {
-
-/// Grokable - This class represents things that Smith can have an intimate
-/// understanding of and contains the data associated with it.
-class Grokable {
-...
-public:
- explicit Grokable() { ... }
- virtual ~Grokable() = 0;
-
- ...
-
-};
-
-} // end namespace knowledge
-} // end namespace llvm
-</pre>
-</div>
-
-<p>Because the class is large, we don't expect that the reader can easily
-understand the entire concept in a glance, and the end of the file (where the
-namespaces end) may be a long way away from the place they open. As such,
-indenting the contents of the namespace doesn't add any value, and detracts from
-the readability of the class. In these cases it is best to <em>not</em> indent
-the contents of the namespace.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="micro_anonns">Anonymous Namespaces</a>
-</h4>
-
-<div>
-
-<p>After talking about namespaces in general, you may be wondering about
-anonymous namespaces in particular.
-Anonymous namespaces are a great language feature that tells the C++ compiler
-that the contents of the namespace are only visible within the current
-translation unit, allowing more aggressive optimization and eliminating the
-possibility of symbol name collisions. Anonymous namespaces are to C++ as
-"static" is to C functions and global variables. While "static" is available
-in C++, anonymous namespaces are more general: they can make entire classes
-private to a file.</p>
-
-<p>The problem with anonymous namespaces is that they naturally want to
-encourage indentation of their body, and they reduce locality of reference: if
-you see a random function definition in a C++ file, it is easy to see if it is
-marked static, but seeing if it is in an anonymous namespace requires scanning
-a big chunk of the file.</p>
-
-<p>Because of this, we have a simple guideline: make anonymous namespaces as
-small as possible, and only use them for class declarations. For example, this
-is good:</p>
-
-<div class="doc_code">
-<pre>
-<b>namespace {</b>
- class StringSort {
- ...
- public:
- StringSort(...)
- bool operator&lt;(const char *RHS) const;
- };
-<b>} // end anonymous namespace</b>
-
-static void Helper() {
- ...
-}
-
-bool StringSort::operator&lt;(const char *RHS) const {
- ...
-}
-
-</pre>
-</div>
-
-<p>This is bad:</p>
-
-
-<div class="doc_code">
-<pre>
-<b>namespace {</b>
-class StringSort {
-...
-public:
- StringSort(...)
- bool operator&lt;(const char *RHS) const;
-};
-
-void Helper() {
- ...
-}
-
-bool StringSort::operator&lt;(const char *RHS) const {
- ...
-}
-
-<b>} // end anonymous namespace</b>
-
-</pre>
-</div>
-
-
-<p>This is bad specifically because if you're looking at "Helper" in the middle
-of a large C++ file, you have no immediate way to tell if it is local to
-the file. When it is marked static explicitly, this is immediately obvious.
-Also, there is no reason to enclose the definition of "operator&lt;" in the
-namespace just because it was declared there.
-</p>
-
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="seealso">See Also</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>A lot of these comments and recommendations have been culled from other
-sources. Two particularly important books for our work are:</p>
-
-<ol>
-
-<li><a href="http://www.amazon.com/Effective-Specific-Addison-Wesley-Professional-Computing/dp/0321334876">Effective
-C++</a> by Scott Meyers. Also
-interesting and useful are "More Effective C++" and "Effective STL" by the same
-author.</li>
-
-<li>Large-Scale C++ Software Design by John Lakos</li>
-
-</ol>
-
-<p>If you get some free time, and you haven't read them: do so, you might learn
-something.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-03-27 13:25:16 +0200 (Tue, 27 Mar 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
new file mode 100644
index 0000000..a416a1e
--- /dev/null
+++ b/docs/CodingStandards.rst
@@ -0,0 +1,1147 @@
+.. _coding_standards:
+
+=====================
+LLVM Coding Standards
+=====================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+This document attempts to describe a few coding standards that are being used in
+the LLVM source tree. Although no coding standards should be regarded as
+absolute requirements to be followed in all instances, coding standards are
+particularly important for large-scale code bases that follow a library-based
+design (like LLVM).
+
+This document intentionally does not prescribe fixed standards for religious
+issues such as brace placement and space usage. For issues like this, follow
+the golden rule:
+
+.. _Golden Rule:
+
+  **If you are extending, enhancing, or bug fixing already implemented code,
+  use the style that is already being used so that the source is uniform and
+  easy to follow.**
+
+Note that some code bases (e.g. ``libc++``) have really good reasons to deviate
+from the coding standards. In the case of ``libc++``, this is because the
+naming and other conventions are dictated by the C++ standard. If you think
+there is a specific good reason to deviate from the standards here, please bring
+it up on the LLVMdev mailing list.
+
+There are some conventions that are not uniformly followed in the code base
+(e.g. the naming convention). This is because they are relatively new, and a
+lot of code was written before they were put in place. Our long term goal is
+for the entire codebase to follow the convention, but we explicitly *do not*
+want patches that do large-scale reformatting of existing code. On the other
+hand, it is reasonable to rename the methods of a class if you're about to
+change it in some other way. Just do the reformatting as a separate commit
+from the functionality change.
+
+The ultimate goal of these guidelines is to increase the readability and
+maintainability of our common source base. If you have suggestions for topics
+to be included, please mail them to `Chris <mailto:sabre@nondot.org>`_.
+
+Mechanical Source Issues
+========================
+
+Source Code Formatting
+----------------------
+
+Commenting
+^^^^^^^^^^
+
+Comments are one critical part of readability and maintainability. Everyone
+knows they should comment their code, and so should you. When writing comments,
+write them as English prose, which means they should use proper capitalization,
+punctuation, etc. Aim to describe what the code is trying to do and why, not
+*how* it does it at a micro level. Here are a few critical things to document:
+
+.. _header file comment:
+
+File Headers
+""""""""""""
+
+Every source file should have a header on it that describes the basic purpose of
+the file. If a file does not have a header, it should not be checked into the
+tree. The standard header looks like this:
+
+.. code-block:: c++
+
+ //===-- llvm/Instruction.h - Instruction class definition -------*- C++ -*-===//
+ //
+ // The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // This file contains the declaration of the Instruction class, which is the
+ // base class for all of the VM instructions.
+ //
+ //===----------------------------------------------------------------------===//
+
+A few things to note about this particular format: The "``-*- C++ -*-``" string
+on the first line is there to tell Emacs that the source file is a C++ file, not
+a C file (Emacs assumes ``.h`` files are C files by default).
+
+.. note::
+
+   This tag is not necessary in ``.cpp`` files.
+
+The name of the file is also on the first line, along with a very short
+description of the purpose of the file. This is important when printing out
+code and flipping through lots of pages.
+
+The next section in the file is a concise note that defines the license that
+the file is released under. This makes it perfectly clear what terms the
+source code can be distributed under, and it should not be modified in any way.
+
+The main body of the description does not have to be very long in most cases.
+Here it's only two lines. If an algorithm is being implemented or something
+tricky is going on, a reference to the paper where it is published should be
+included, as well as any notes or *gotchas* in the code to watch out for.
+
+Class overviews
+"""""""""""""""
+
+Classes are one fundamental part of a good object oriented design. As such, a
+class definition should have a comment block that explains what the class is
+used for and how it works. Every non-trivial class is expected to have a
+``doxygen`` comment block.
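+
+As a purely illustrative sketch (the class and its wording are invented, not
+taken from the LLVM sources), such a block might look like:
+
+.. code-block:: c++
+
+  /// CountingAllocator - A thin wrapper around an allocator that counts how
+  /// many allocations have been requested. It owns no memory itself; callers
+  /// remain responsible for freeing what they allocate.
+  class CountingAllocator {
+    ...
+  };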
+
+Method information
+""""""""""""""""""
+
+Methods defined in a class (as well as any global functions) should also be
+documented properly. A quick note about what it does and a description of the
+borderline behaviour is all that is necessary here (unless something
+particularly tricky or insidious is going on). The hope is that people can
+figure out how to use your interfaces without reading the code itself.
+
+Good things to talk about here are what happens when something unexpected
+happens: does the method return null? Abort? Format your hard disk?
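+
+For example, a method comment in this spirit (the method itself is invented
+for illustration) could read:
+
+.. code-block:: c++
+
+  /// lookupSymbol - Return the symbol with the specified name, or null if no
+  /// such symbol has been defined. This method never allocates.
+  Symbol *lookupSymbol(StringRef Name) const;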
+
+Comment Formatting
+^^^^^^^^^^^^^^^^^^
+
+In general, prefer C++ style (``//``) comments. They take less space, require
+less typing, don't have nesting problems, etc. There are a few cases when it is
+useful to use C style (``/* */``) comments however:
+
+#. When writing C code: Obviously if you are writing C code, use C style
+   comments.
+
+#. When writing a header file that may be ``#include``\d by a C source file.
+
+#. When writing a source file that is used by a tool that only accepts C style
+   comments.
+
+To comment out a large block of code, use ``#if 0`` and ``#endif``. These nest
+properly and are better behaved in general than C style comments.
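+
+For instance, disabling a block that already contains a C style comment is
+safe with ``#if 0``, whereas wrapping the same block in an enclosing ``/* */``
+would terminate early at the inner ``*/`` (the function below is invented for
+illustration):
+
+.. code-block:: c++
+
+  #if 0
+  int scaleValue(int X) {
+    return X * 2;  /* This inner comment would have ended an enclosing
+                      C style comment prematurely. */
+  }
+  #endif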
+
+``#include`` Style
+^^^^^^^^^^^^^^^^^^
+
+Immediately after the `header file comment`_ (and include guards if working on a
+header file), the `minimal list of #includes`_ required by the file should be
+listed. We prefer these ``#include``\s to be listed in this order:
+
+.. _Main Module Header:
+.. _Local/Private Headers:
+
+#. Main Module Header
+#. Local/Private Headers
+#. ``llvm/*``
+#. ``llvm/Analysis/*``
+#. ``llvm/Assembly/*``
+#. ``llvm/Bitcode/*``
+#. ``llvm/CodeGen/*``
+#. ...
+#. ``llvm/Support/*``
+#. ``llvm/Config/*``
+#. System ``#include``\s
+
+and each category should be sorted by name.
+
+The `Main Module Header`_ file applies to ``.cpp`` files which implement an
+interface defined by a ``.h`` file. This ``#include`` should always be included
+**first** regardless of where it lives on the file system. By including a
+header file first in the ``.cpp`` files that implement the interfaces, we ensure
+that the header does not have any hidden dependencies which are not explicitly
+``#include``\d in the header, but should be. It is also a form of documentation
+in the ``.cpp`` file to indicate where the interfaces it implements are defined.
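+
+As a sketch, a hypothetical ``lib/Analysis/FooAnalysis.cpp`` (the "Foo" file
+names are made up for illustration) would start like this:
+
+.. code-block:: c++
+
+  #include "llvm/Analysis/FooAnalysis.h"  // Main module header comes first.
+  #include "FooAnalysisInternals.h"       // Then local/private headers.
+  #include "llvm/Function.h"              // Then llvm/*, llvm/Analysis/*, ...
+  #include "llvm/Analysis/Dominators.h"
+  #include "llvm/Support/raw_ostream.h"   // ... down through llvm/Support/*.
+  #include <algorithm>                    // System headers come last.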
+
+.. _fit into 80 columns:
+
+Source Code Width
+^^^^^^^^^^^^^^^^^
+
+Write your code to fit within 80 columns of text. This helps those of us who
+like to print out code and look at your code in an ``xterm`` without resizing
+it.
+
+The longer answer is that there must be some limit to the width of the code in
+order to reasonably allow developers to have multiple files side-by-side in
+windows on a modest display. If you are going to pick a width limit, it is
+somewhat arbitrary but you might as well pick something standard. Going with 90
+columns (for example) instead of 80 columns wouldn't add any significant value
+and would be detrimental to printing out code. Also many other projects have
+standardized on 80 columns, so some people have already configured their editors
+for it (vs something else, like 90 columns).
+
+This is one of many contentious issues in coding standards, but it is not up for
+debate.
+
+Use Spaces Instead of Tabs
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In all cases, prefer spaces to tabs in source files. People have different
+preferred indentation levels, and different styles of indentation that they
+like; this is fine. What isn't fine is that different editors/viewers expand
+tabs out to different tab stops. This can cause your code to look completely
+unreadable, and it is not worth dealing with.
+
+As always, follow the `Golden Rule`_ above: follow the style of
+existing code if you are modifying and extending it. If you like four spaces of
+indentation, **DO NOT** do that in the middle of a chunk of code with two spaces
+of indentation. Also, do not reindent a whole source file: it makes for
+incredible diffs that are absolutely worthless.
+
+Indent Code Consistently
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Okay, in your first year of programming you were told that indentation is
+important. If you didn't believe and internalize this then, now is the time.
+Just do it.
+
+Compiler Issues
+---------------
+
+Treat Compiler Warnings Like Errors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If your code has compiler warnings in it, something is wrong --- you aren't
+casting values correctly, you have "questionable" constructs in your code, or
+you are doing something legitimately wrong. Compiler warnings can cover up
+legitimate errors in output and make dealing with a translation unit difficult.
+
+It is not possible to prevent all warnings from all compilers, nor is it
+desirable. Instead, pick a standard compiler (like ``gcc``) that provides a
+good thorough set of warnings, and stick to it. At least in the case of
+``gcc``, it is possible to work around any spurious warnings by changing the
+syntax of the code slightly. For example, a warning that annoys me occurs when
+I write code like this:
+
+.. code-block:: c++
+
+  if (V = getValue()) {
+    ...
+  }
+
+``gcc`` will warn me that I probably want to use the ``==`` operator, and that I
+probably mistyped it. In most cases, I haven't, and I really don't want the
+spurious warnings. To fix this particular problem, I rewrite the code like
+this:
+
+.. code-block:: c++
+
+  if ((V = getValue())) {
+    ...
+  }
+
+which shuts ``gcc`` up. Any ``gcc`` warning that annoys you can be fixed by
+massaging the code appropriately.
+
+Write Portable Code
+^^^^^^^^^^^^^^^^^^^
+
+In almost all cases, it is possible and within reason to write completely
+portable code. If there are cases where it isn't possible to write portable
+code, isolate it behind a well defined (and well documented) interface.
+
+In practice, this means that you shouldn't assume much about the host compiler
+(and Visual Studio tends to be the lowest common denominator). If advanced
+features are used, they should only be an implementation detail of a library
+which has a simple exposed API, and preferably be buried in ``libSystem``.
+
+Do not use RTTI or Exceptions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In an effort to reduce code and executable size, LLVM does not use RTTI
+(e.g. ``dynamic_cast<>``) or exceptions. These two language features violate
+the general C++ principle of *"you only pay for what you use"*, causing
+executable bloat even if exceptions are never used in the code base, or if RTTI
+is never used for a class. Because of this, we turn them off globally in the
+code.
+
+That said, LLVM does make extensive use of a hand-rolled form of RTTI that uses
+templates like `isa<>, cast<>, and dyn_cast<> <ProgrammersManual.html#isa>`_.
+This form of RTTI is opt-in and can be added to any class. It is also
+substantially more efficient than ``dynamic_cast<>``.
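+
+For example, a typical use of this machinery tests and converts in one step,
+yielding a null pointer when the value is not of the queried type:
+
+.. code-block:: c++
+
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+    // Only reached when V is actually an AllocaInst.
+    ...
+  }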
+
+.. _static constructor:
+
+Do not use Static Constructors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Static constructors and destructors (e.g. global variables whose types have a
+constructor or destructor) should not be added to the code base, and should be
+removed wherever possible. Besides `well known problems
+<http://yosefk.com/c++fqa/ctors.html#fqa-10.12>`_ where the order of
+initialization is undefined between globals in different source files, the
+entire concept of static constructors is at odds with the common use case of
+LLVM as a library linked into a larger application.
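+
+The pattern in question is a namespace-scope variable whose initialization has
+to run code, as in this sketch (the table and its builder function are made
+up):
+
+.. code-block:: c++
+
+  // Bad: createFooTable() runs at startup of every program that links in this
+  // object file, whether or not the table is ever used.
+  static std::map<std::string, int> FooTable = createFooTable();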
+
+Consider the use of LLVM as a JIT linked into another application (perhaps for
+`OpenGL, custom languages <http://llvm.org/Users.html>`_, `shaders in movies
+<http://llvm.org/devmtg/2010-11/Gritz-OpenShadingLang.pdf>`_, etc). Due to the
+design of static constructors, they must be executed at startup time of the
+entire application, regardless of whether or how LLVM is used in that larger
+application. There are two problems with this:
+
+* The time to run the static constructors impacts startup time of applications
+  --- a critical time for GUI apps, among others.
+
+* The static constructors cause the app to pull many extra pages of memory off
+  the disk: both the code for the constructor in each ``.o`` file and the small
+  amount of data that gets touched. In addition, touched/dirty pages put more
+  pressure on the VM system on low-memory machines.
+
+We would really like for there to be zero cost for linking in an additional LLVM
+target or other library into an application, but static constructors violate
+this goal.
+
+That said, LLVM unfortunately does contain static constructors. It would be a
+`great project <http://llvm.org/PR11944>`_ for someone to purge all static
+constructors from LLVM, and then enable the ``-Wglobal-constructors`` warning
+flag (when building with Clang) to ensure we do not regress in the future.
+
+Use of ``class`` and ``struct`` Keywords
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In C++, the ``class`` and ``struct`` keywords can be used almost
+interchangeably. The only difference is when they are used to declare a class:
+``class`` makes all members private by default while ``struct`` makes all
+members public by default.
+
+Unfortunately, not all compilers follow the rules and some will generate
+different symbols based on whether ``class`` or ``struct`` was used to declare
+the symbol. This can lead to problems at link time.
+
+So, the rule for LLVM is to always use the ``class`` keyword, unless **all**
+members are public and the type is a C++ `POD
+<http://en.wikipedia.org/wiki/Plain_old_data_structure>`_ type, in which case
+``struct`` is allowed.
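+
+A sketch of the rule in practice (both types are invented for illustration):
+
+.. code-block:: c++
+
+  // OK as a 'struct': every member is public and the type is a POD.
+  struct Coordinate {
+    unsigned X;
+    unsigned Y;
+  };
+
+  // Everything else is declared with 'class'.
+  class SymbolTable {
+    std::vector<std::string> Symbols;
+  public:
+    void addSymbol(const std::string &Name);
+  };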
+
+Style Issues
+============
+
+The High-Level Issues
+---------------------
+
+A Public Header File **is** a Module
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+C++ doesn't do too well in the modularity department. There is no real
+encapsulation or data hiding (unless you use expensive protocol classes), but it
+is what we have to work with. When you write a public header file (in the LLVM
+source tree, they live in the top level "``include``" directory), you are
+defining a module of functionality.
+
+Ideally, modules should be completely independent of each other, and their
+header files should only ``#include`` the absolute minimum number of headers
+possible. A module is not just a class, a function, or a namespace: it's a
+collection of these that defines an interface. This interface may be several
+functions, classes, or data structures, but the important issue is how they work
+together.
+
+In general, a module should be implemented by one or more ``.cpp`` files. Each
+of these ``.cpp`` files should include the header that defines their interface
+first. This ensures that all of the dependences of the module header have been
+properly added to the module header itself, and are not implicit. System
+headers should be included after user headers for a translation unit.
+
+.. _minimal list of #includes:
+
+``#include`` as Little as Possible
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``#include`` hurts compile time performance. Don't do it unless you have to,
+especially in header files.
+
+But wait! Sometimes you need to have the definition of a class to use it, or to
+inherit from it. In these cases go ahead and ``#include`` that header file. Be
+aware however that there are many cases where you don't need to have the full
+definition of a class. If you are using a pointer or reference to a class, you
+don't need the header file. If you are simply returning a class instance from a
+prototyped function or method, you don't need it. In fact, for most cases, you
+simply don't need the definition of a class. And not ``#include``\ing speeds up
+compilation.
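+
+A forward declaration is enough in those cases. As a sketch (the function is
+invented for illustration):
+
+.. code-block:: c++
+
+  class Function;  // Forward declaration; no #include required.
+
+  // Only a reference to Function is used here, so the full definition of the
+  // Function class is not needed in this header.
+  bool hasExternalCalls(const Function &F);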
+
+It is easy to go overboard on this recommendation, however. You
+**must** include all of the header files that you are using --- you can include
+them either directly or indirectly through another header file. To make sure
+that you don't accidentally forget to include a header file in your module
+header, make sure to include your module header **first** in the implementation
+file (as mentioned above). This way there won't be any hidden dependencies that
+you'll find out about later.
+
+Keep "Internal" Headers Private
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Many modules have a complex implementation that causes them to use more than one
+implementation (``.cpp``) file. It is often tempting to put the internal
+communication interface (helper classes, extra functions, etc) in the public
+module header file. Don't do this!
+
+If you really need to do something like this, put a private header file in the
+same directory as the source files, and include it locally. This ensures that
+your private interface remains private and undisturbed by outsiders.
+
+.. note::
+
+   It's okay to put extra implementation methods in a public class itself. Just
+   make them private (or protected) and all is well.
+
+.. _early exits:
+
+Use Early Exits and ``continue`` to Simplify Code
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When reading code, keep in mind how much state and how many previous decisions
+have to be remembered by the reader to understand a block of code. Aim to
+reduce indentation where possible when it doesn't make it more difficult to
+understand the code. One great way to do this is by making use of early exits
+and the ``continue`` keyword in long loops. As an example of using an early
+exit from a function, consider this "bad" code:
+
+.. code-block:: c++
+
+  Value *DoSomething(Instruction *I) {
+    if (!isa<TerminatorInst>(I) &&
+        I->hasOneUse() && SomeOtherThing(I)) {
+      ... some long code ....
+    }
+
+    return 0;
+  }
+
+This code has several problems if the body of the ``'if'`` is large. First,
+when you're looking at the top of the function, it isn't immediately clear that
+this *only* does interesting things with non-terminator instructions, and only
+applies to instructions that satisfy the other predicates. Second, it is
+relatively difficult to describe (in comments) why these predicates are
+important because the ``if`` statement makes it difficult to lay out the
+comments. Third, when you're deep within the body of the code, it is indented
+an extra level. Finally, when reading the top of the function, it isn't clear
+what the result is if the predicate isn't true; you have to read to the end of
+the function to know that it returns null.
+
+It is much preferred to format the code like this:
+
+.. code-block:: c++
+
+  Value *DoSomething(Instruction *I) {
+    // Terminators never need 'something' done to them because ...
+    if (isa<TerminatorInst>(I))
+      return 0;
+
+    // We conservatively avoid transforming instructions with multiple uses
+    // because goats like cheese.
+    if (!I->hasOneUse())
+      return 0;
+
+    // This is really just here for example.
+    if (!SomeOtherThing(I))
+      return 0;
+
+    ... some long code ....
+  }
+
+This fixes these problems. A similar problem frequently happens in ``for``
+loops. A silly example is something like this:
+
+.. code-block:: c++
+
+  for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(II)) {
+      Value *LHS = BO->getOperand(0);
+      Value *RHS = BO->getOperand(1);
+      if (LHS != RHS) {
+        ...
+      }
+    }
+  }
+
+When you have very, very small loops, this sort of structure is fine. But if it
+exceeds 10-15 lines, it becomes difficult for people to read and understand at
+a glance. The problem with this sort of code is that it gets very nested very
+quickly, meaning that the reader of the code has to keep a lot of context in
+their brain to remember what is going on in the loop, because they don't know
+if/when the ``if`` conditions will have ``else``\s etc. It is strongly
+preferred to structure the loop like this:
+
+.. code-block:: c++
+
+  for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(II);
+    if (!BO) continue;
+
+    Value *LHS = BO->getOperand(0);
+    Value *RHS = BO->getOperand(1);
+    if (LHS == RHS) continue;
+
+    ...
+  }
+
+This has all the benefits of using early exits for functions: it reduces nesting
+of the loop, it makes it easier to describe why the conditions are true, and it
+makes it obvious to the reader that there is no ``else`` coming up that they
+have to push context into their brain for. If a loop is large, this can be a
+big understandability win.
+
+Don't use ``else`` after a ``return``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For the same reasons given above (reduction of indentation and easier reading),
+please do not use ``'else'`` or ``'else if'`` after something that interrupts
+control flow --- like ``return``, ``break``, ``continue``, ``goto``, etc. For
+example, this is *bad*:
+
+.. code-block:: c++
+
+  case 'J': {
+    if (Signed) {
+      Type = Context.getsigjmp_bufType();
+      if (Type.isNull()) {
+        Error = ASTContext::GE_Missing_sigjmp_buf;
+        return QualType();
+      } else {
+        break;
+      }
+    } else {
+      Type = Context.getjmp_bufType();
+      if (Type.isNull()) {
+        Error = ASTContext::GE_Missing_jmp_buf;
+        return QualType();
+      } else {
+        break;
+      }
+    }
+  }
+
+It is better to write it like this:
+
+.. code-block:: c++
+
+  case 'J':
+    if (Signed) {
+      Type = Context.getsigjmp_bufType();
+      if (Type.isNull()) {
+        Error = ASTContext::GE_Missing_sigjmp_buf;
+        return QualType();
+      }
+    } else {
+      Type = Context.getjmp_bufType();
+      if (Type.isNull()) {
+        Error = ASTContext::GE_Missing_jmp_buf;
+        return QualType();
+      }
+    }
+    break;
+
+Or better yet (in this case) as:
+
+.. code-block:: c++
+
+  case 'J':
+    if (Signed)
+      Type = Context.getsigjmp_bufType();
+    else
+      Type = Context.getjmp_bufType();
+
+    if (Type.isNull()) {
+      Error = Signed ? ASTContext::GE_Missing_sigjmp_buf :
+                       ASTContext::GE_Missing_jmp_buf;
+      return QualType();
+    }
+    break;
+
+The idea is to reduce indentation and the amount of code you have to keep track
+of when reading the code.
+
+Turn Predicate Loops into Predicate Functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It is very common to write small loops that just compute a boolean value. There
+are a number of ways that people commonly write these, but an example of this
+sort of thing is:
+
+.. code-block:: c++
+
+  bool FoundFoo = false;
+  for (unsigned i = 0, e = BarList.size(); i != e; ++i)
+    if (BarList[i]->isFoo()) {
+      FoundFoo = true;
+      break;
+    }
+
+  if (FoundFoo) {
+    ...
+  }
+
+This sort of code is awkward to write, and is almost always a bad sign. Instead
+of this sort of loop, we strongly prefer to use a predicate function (which may
+be `static`_) that uses `early exits`_ to compute the predicate. We prefer the
+code to be structured like this:
+
+.. code-block:: c++
+
+  /// ListContainsFoo - Return true if the specified list has an element that
+  /// is a foo.
+  static bool ListContainsFoo(const std::vector<Bar*> &List) {
+    for (unsigned i = 0, e = List.size(); i != e; ++i)
+      if (List[i]->isFoo())
+        return true;
+    return false;
+  }
+  ...
+
+  if (ListContainsFoo(BarList)) {
+    ...
+  }
+
+There are many reasons for doing this: it reduces indentation and factors out
+code which can often be shared by other code that checks for the same predicate.
+More importantly, it *forces you to pick a name* for the function, and forces
+you to write a comment for it. In this silly example, this doesn't add much
+value. However, if the condition is complex, this can make it a lot easier for
+the reader to understand the code that queries for this predicate. Instead of
+being faced with the in-line details of how we check to see if the BarList
+contains a foo, we can trust the function name and continue reading with better
+locality.
+
+The Low-Level Issues
+--------------------
+
+Name Types, Functions, Variables, and Enumerators Properly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Poorly-chosen names can mislead the reader and cause bugs. We cannot stress
+enough how important it is to use *descriptive* names. Pick names that match
+the semantics and role of the underlying entities, within reason. Avoid
+abbreviations unless they are well known. After picking a good name, make sure
+to use consistent capitalization for the name, as inconsistency requires
+clients to either memorize the APIs or look them up to find the exact spelling.
+
+In general, names should be in camel case (e.g. ``TextFileReader`` and
+``isLValue()``). Different kinds of declarations have different rules:
+
+* **Type names** (including classes, structs, enums, typedefs, etc) should be
+  nouns and start with an upper-case letter (e.g. ``TextFileReader``).
+
+* **Variable names** should be nouns (as they represent state). The name should
+  be camel case, and start with an upper case letter (e.g. ``Leader`` or
+  ``Boats``).
+
+* **Function names** should be verb phrases (as they represent actions), and
+  command-like functions should be imperative. The name should be camel case,
+  and start with a lower case letter (e.g. ``openFile()`` or ``isFoo()``).
+
+* **Enum declarations** (e.g. ``enum Foo {...}``) are types, so they should
+  follow the naming conventions for types. A common use for enums is as a
+  discriminator for a union, or an indicator of a subclass. When an enum is
+  used for something like this, it should have a ``Kind`` suffix
+  (e.g. ``ValueKind``).
+
+* **Enumerators** (e.g. ``enum { Foo, Bar }``) and **public member variables**
+  should start with an upper-case letter, just like types. Unless the
+  enumerators are defined in their own small namespace or inside a class,
+  enumerators should have a prefix corresponding to the enum declaration name.
+  For example, ``enum ValueKind { ... };`` may contain enumerators like
+  ``VK_Argument``, ``VK_BasicBlock``, etc. Enumerators that are just
+  convenience constants are exempt from the requirement for a prefix. For
+  instance:
+
+  .. code-block:: c++
+
+    enum {
+      MaxSize = 42,
+      Density = 12
+    };
+
+As an exception, classes that mimic STL classes can have member names in STL's
+style of lower-case words separated by underscores (e.g. ``begin()``,
+``push_back()``, and ``empty()``).
+
+Here are some examples of good and bad names:
+
+.. code-block:: c++
+
+  class VehicleMaker {
+    ...
+    Factory<Tire> F;            // Bad -- abbreviation and non-descriptive.
+    Factory<Tire> Factory;      // Better.
+    Factory<Tire> TireFactory;  // Even better -- if VehicleMaker has more
+                                // than one kind of factory.
+  };
+
+  Vehicle MakeVehicle(VehicleType Type) {
+    VehicleMaker M;                         // Might be OK if it has a short
+                                            // life-span.
+    Tire tmp1 = M.makeTire();               // Bad -- 'tmp1' provides no
+                                            // information.
+    Light headlight = M.makeLight("head");  // Good -- descriptive.
+    ...
+  }
+
+Assert Liberally
+^^^^^^^^^^^^^^^^
+
+Use the "``assert``" macro to its fullest. Check all of your preconditions and
+assumptions; you never know when a bug (not necessarily even yours) might be
+caught early by an assertion, which reduces debugging time dramatically. The
+"``<cassert>``" header file is probably already included by the header files you
+are using, so it doesn't cost anything to use it.
+
+To further assist with debugging, make sure to put some kind of error message in
+the assertion statement, which is printed if the assertion is tripped. This
+helps the poor debugger make sense of why an assertion is being made and
+enforced, and hopefully what to do about it. Here is one complete example:
+
+.. code-block:: c++
+
+  inline Value *getOperand(unsigned i) {
+    assert(i < Operands.size() && "getOperand() out of range!");
+    return Operands[i];
+  }
+
+Here are more examples:
+
+.. code-block:: c++
+
+ assert(Ty->isPointerType() && "Can't allocate a non pointer type!");
+
+ assert((Opcode == Shl || Opcode == Shr) && "ShiftInst Opcode invalid!");
+
+ assert(idx < getNumSuccessors() && "Successor # out of range!");
+
+ assert(V1.getType() == V2.getType() && "Constant types must be identical!");
+
+ assert(isa<PHINode>(Succ->front()) && "Only works on PHId BBs!");
+
+You get the idea.
+
+Please be aware that, when adding assert statements, not all compilers are aware
+of the semantics of the assert. In some places, asserts are used to indicate a
+piece of code that should not be reached. These are typically of the form:
+
+.. code-block:: c++
+
+ assert(0 && "Some helpful error message");
+
+When used in a function that returns a value, they should be followed with a
+return statement and a comment indicating that this line is never reached. This
+will prevent a compiler which is unable to deduce that the assert statement
+never returns from generating a warning.
+
+.. code-block:: c++
+
+  assert(0 && "Some helpful error message");
+  // Not reached
+  return 0;
+
+Another issue is that values used only by assertions will produce an "unused
+value" warning when assertions are disabled. For example, this code will warn:
+
+.. code-block:: c++
+
+ unsigned Size = V.size();
+ assert(Size > 42 && "Vector smaller than it should be");
+
+ bool NewToSet = Myset.insert(Value);
+ assert(NewToSet && "The value shouldn't be in the set yet");
+
+These two cases are interestingly different. In the first case, the call to
+``V.size()`` is only useful for the assert, and we don't want it executed when
+assertions are disabled. Code like this should move the call into the assert
+itself. In the second case, the side effects of the call must happen whether
+the assert is enabled or not. In this case, the value should be cast to void to
+disable the warning. To be specific, it is preferred to write the code like
+this:
+
+.. code-block:: c++
+
+ assert(V.size() > 42 && "Vector smaller than it should be");
+
+ bool NewToSet = Myset.insert(Value); (void)NewToSet;
+ assert(NewToSet && "The value shouldn't be in the set yet");
+
+Do Not Use ``using namespace std``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In LLVM, we prefer to explicitly prefix all identifiers from the standard
+namespace with an "``std::``" prefix, rather than rely on "``using namespace
+std;``".
+
+In header files, adding a ``'using namespace XXX'`` directive pollutes the
+namespace of any source file that ``#include``\s the header. This is clearly a
+bad thing.
+
+In implementation files (e.g. ``.cpp`` files), the rule is more of a stylistic
+rule, but is still important. Basically, using explicit namespace prefixes
+makes the code **clearer**, because it is immediately obvious what facilities
+are being used and where they are coming from, and **more portable**, because
+namespace clashes cannot occur between LLVM code and other namespaces. The
+portability rule is important because different standard library implementations
+expose different symbols (potentially ones they shouldn't), and future revisions
+to the C++ standard will add more symbols to the ``std`` namespace. As such, we
+never use ``'using namespace std;'`` in LLVM.
+
+The exception to this general rule (it is not an exception for the ``std``
+namespace) is for implementation files. For example, all of the code in the
+LLVM project lives in the 'llvm' namespace. As such, it is
+ok, and actually clearer, for the ``.cpp`` files to have a ``'using namespace
+llvm;'`` directive at the top, after the ``#include``\s. This reduces
+indentation in the body of the file for source editors that indent based on
+braces, and keeps the conceptual context cleaner. The general form of this rule
+is that any ``.cpp`` file that implements code in any namespace may use that
+namespace (and its parents'), but should not use any others.
+
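+For example, a hypothetical ``.cpp`` file in the tree might begin like this (a
+sketch; the file contents and function name are invented):
+
+.. code-block:: c++
+
+  #include "llvm/ADT/StringRef.h"
+  #include <string>
+
+  using namespace llvm;   // OK: this file implements code in 'llvm'.
+
+  // Note that identifiers from std are still explicitly qualified.
+  std::string getGreeting(StringRef Name) {
+    return "Hello, " + Name.str();
+  }
+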
+Provide a Virtual Method Anchor for Classes in Headers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If a class is defined in a header file and has a vtable (either it has virtual
+methods or it derives from classes with virtual methods), it must always have at
+least one out-of-line virtual method in the class. Without this, the compiler
+will copy the vtable and RTTI into every ``.o`` file that ``#include``\s the
+header, bloating ``.o`` file sizes and increasing link times.
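+
+A minimal sketch of the idiom (the class is invented; by convention the
+out-of-line method is often named ``anchor()``):
+
+.. code-block:: c++
+
+  // In Widget.h:
+  class Widget {
+  public:
+    virtual void anchor();   // Declared here, defined out of line.
+    virtual bool isEnabled() const { return true; }
+  };
+
+  // In Widget.cpp -- this becomes the home of Widget's vtable and RTTI:
+  void Widget::anchor() {}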
+
+Don't evaluate ``end()`` every time through a loop
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Because C++ doesn't have a standard "``foreach``" loop (though it can be
+emulated with macros, and C++11 adds a range-based ``for``), we end up writing
+a lot of loops that manually iterate from begin to end on a variety of
+containers or through other data structures. One common mistake is to write a
+loop in this
+style:
+
+.. code-block:: c++
+
+ BasicBlock *BB = ...
+ for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+ ... use I ...
+
+The problem with this construct is that it evaluates "``BB->end()``" every time
+through the loop. Instead of writing the loop like this, we strongly prefer
+loops to be written so that they evaluate it once before the loop starts. A
+convenient way to do this is like so:
+
+.. code-block:: c++
+
+ BasicBlock *BB = ...
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ ... use I ...
+
+The observant may quickly point out that these two loops may have different
+semantics: if the container (a basic block in this case) is being mutated, then
+"``BB->end()``" may change its value every time through the loop and the second
+loop may not in fact be correct. If you actually do depend on this behavior,
+please write the loop in the first form and add a comment indicating that you
+did it intentionally.
+
+Why do we prefer the second form (when correct)? Writing the loop in the first
+form has two problems. First, it may be less efficient than evaluating
+``end()`` once at the start of the loop. In this case, the cost is probably
+minor --- a few extra
+loads every time through the loop. However, if the base expression is more
+complex, then the cost can rise quickly. I've seen loops where the end
+expression was actually something like: "``SomeMap[x]->end()``" and map lookups
+really aren't cheap. By writing it in the second form consistently, you
+eliminate the issue entirely and don't even have to think about it.
+
+The second (even bigger) issue is that writing the loop in the first form hints
+to the reader that the loop is mutating the container (a fact that a comment
+would handily confirm!). If you write the loop in the second form, it is
+immediately obvious without even looking at the body of the loop that the
+container isn't being modified, which makes it easier to read the code and
+understand what it does.
+
+While the second form of the loop is a few extra keystrokes, we do strongly
+prefer it.
+
+``#include <iostream>`` is Forbidden
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The use of ``#include <iostream>`` in library files is hereby **forbidden**,
+because many common implementations transparently inject a `static constructor`_
+into every translation unit that includes it.
+
+Note that using the other stream headers (``<sstream>`` for example) is not
+problematic in this regard --- just ``<iostream>``. However, ``raw_ostream``
+provides various APIs that perform better than the ``std::ostream`` style APIs
+for almost every use.
+
+.. note::
+
+ New code should always use `raw_ostream`_ for writing, or the
+ ``llvm::MemoryBuffer`` API for reading files.
+
+.. _raw_ostream:
+
+Use ``raw_ostream``
+^^^^^^^^^^^^^^^^^^^
+
+LLVM includes a lightweight, simple, and efficient stream implementation in
+``llvm/Support/raw_ostream.h``, which provides all of the common features of
+``std::ostream``. All new code should use ``raw_ostream`` instead of
+``ostream``.
+
+Unlike ``std::ostream``, ``raw_ostream`` is not a template and can be forward
+declared as ``class raw_ostream``. Public headers should generally not include
+the ``raw_ostream`` header, but use forward declarations and constant references
+to ``raw_ostream`` instances.
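+
+For example, a hypothetical header might be written like this (a sketch; the
+file and function names are invented):
+
+.. code-block:: c++
+
+  // StatsPrinter.h -- no #include of raw_ostream.h is needed.
+  namespace llvm {
+  class raw_ostream;   // Forward declaration is enough for a reference.
+
+  void printStats(raw_ostream &OS);
+  } // end namespace llvm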
+
+Avoid ``std::endl``
+^^^^^^^^^^^^^^^^^^^
+
+The ``std::endl`` modifier, when used with ``iostreams``, outputs a newline to
+the specified output stream. In addition to doing this, however, it also
+flushes the output stream. In other words, these are equivalent:
+
+.. code-block:: c++
+
+ std::cout << std::endl;
+ std::cout << '\n' << std::flush;
+
+Most of the time, you probably have no reason to flush the output stream, so
+it's better to use a literal ``'\n'``.
+
+Microscopic Details
+-------------------
+
+This section describes preferred low-level formatting guidelines along with
+reasoning on why we prefer them.
+
+Spaces Before Parentheses
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We prefer to put a space before an open parenthesis only in control flow
+statements, but not in normal function call expressions and function-like
+macros. For example, this is good:
+
+.. code-block:: c++
+
+ if (x) ...
+ for (i = 0; i != 100; ++i) ...
+ while (llvm_rocks) ...
+
+ somefunc(42);
+ assert(3 != 4 && "laws of math are failing me");
+
+ a = foo(42, 92) + bar(x);
+
+and this is bad:
+
+.. code-block:: c++
+
+ if(x) ...
+ for(i = 0; i != 100; ++i) ...
+ while(llvm_rocks) ...
+
+ somefunc (42);
+ assert (3 != 4 && "laws of math are failing me");
+
+ a = foo (42, 92) + bar (x);
+
+The reason for doing this is not completely arbitrary. This style makes control
+flow operators stand out more, and makes expressions flow better. The function
+call operator binds very tightly as a postfix operator. Putting a space after a
+function name (as in the last example) makes it easy to mis-group the argument
+list, reading it as an operand of a surrounding binary operator rather than as
+the function's arguments. More specifically, it is easy to
+misread the "``a``" example as:
+
+.. code-block:: c++
+
+ a = foo ((42, 92) + bar) (x);
+
+when skimming through the code. By avoiding a space in a function, we avoid
+this misinterpretation.
+
+Prefer Preincrement
+^^^^^^^^^^^^^^^^^^^
+
+Hard and fast rule: preincrement (``++X``) is at worst as fast as
+postincrement (``X++``) and can easily be much faster. Use preincrement
+whenever possible.
+
+The semantics of postincrement include making a copy of the value being
+incremented, returning it, and then preincrementing the "work value". For
+primitive types, this isn't a big deal. But for iterators, it can be a huge
+issue (for example, some iterators contain stack and set objects; copying such
+an iterator invokes the copy constructors of those members too). In general,
+get in the habit of always using preincrement, and you won't have a problem.
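+
+For example (a sketch; the payoff grows with the weight of the iterator):
+
+.. code-block:: c++
+
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    // ++I updates I in place; I++ would first copy the iterator (and any
+    // state it carries) just to throw the copy away.
+  }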
+
+
+Namespace Indentation
+^^^^^^^^^^^^^^^^^^^^^
+
+In general, we strive to reduce indentation wherever possible. This is useful
+because we want code to `fit into 80 columns`_ without wrapping horribly, but
+also because it makes it easier to understand the code. Namespaces are a funny
+thing: they are often large, and we often want to put lots of stuff into them.
+Other times they are tiny, because they just hold an enum or something similar.
+In order to balance this, we use different
+approaches for small versus large namespaces.
+
+If a namespace definition is small and *easily* fits on a screen (say, less than
+35 lines of code), then you should indent its body. Here's an example:
+
+.. code-block:: c++
+
+ namespace llvm {
+ namespace X86 {
+ /// RelocationType - An enum for the x86 relocation codes. Note that
+ /// the terminology here doesn't follow x86 convention - word means
+ /// 32-bit and dword means 64-bit.
+ enum RelocationType {
+ /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+ /// the value already in memory, after we adjust it for where the PC is.
+ reloc_pcrel_word = 0,
+
+ /// reloc_picrel_word - PIC base relative relocation, add the relocated
+ /// value to the value already in memory, after we adjust it for where the
+ /// PIC base is.
+ reloc_picrel_word = 1,
+
+ /// reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
+ /// add the relocated value to the value already in memory.
+ reloc_absolute_word = 2,
+ reloc_absolute_dword = 3
+ };
+ }
+ }
+
+Since the body is small, indenting adds value because it makes it very clear
+where the namespace starts and ends, and the whole thing can be taken in with
+one "gulp" when reading the code. If the blob of code in the namespace is
+larger (as it typically is in a header in the ``llvm`` or ``clang`` namespaces),
+do not indent the code, and add a comment indicating what namespace is being
+closed. For example:
+
+.. code-block:: c++
+
+ namespace llvm {
+ namespace knowledge {
+
+ /// Grokable - This class represents things that Smith can have an intimate
+ /// understanding of and contains the data associated with it.
+ class Grokable {
+ ...
+ public:
+ explicit Grokable() { ... }
+ virtual ~Grokable() = 0;
+
+ ...
+
+ };
+
+ } // end namespace knowledge
+ } // end namespace llvm
+
+Because the class is large, we don't expect that the reader can easily
+understand the entire concept at a glance, and the end of the file (where the
+namespaces end) may be a long way away from the place they open. As such,
+indenting the contents of the namespace doesn't add any value, and detracts from
+the readability of the class. In these cases it is best to *not* indent the
+contents of the namespace.
+
+.. _static:
+
+Anonymous Namespaces
+^^^^^^^^^^^^^^^^^^^^
+
+After talking about namespaces in general, you may be wondering about anonymous
+namespaces in particular. Anonymous namespaces are a great language feature
+that tells the C++ compiler that the contents of the namespace are only visible
+within the current translation unit, allowing more aggressive optimization and
+eliminating the possibility of symbol name collisions. Anonymous namespaces are
+to C++ as "static" is to C functions and global variables. While "``static``"
+is available in C++, anonymous namespaces are more general: they can make entire
+classes private to a file.
+
+The problem with anonymous namespaces is that they naturally want to encourage
+indentation of their body, and they reduce locality of reference: if you see a
+random function definition in a C++ file, it is easy to see if it is marked
+static, but seeing if it is in an anonymous namespace requires scanning a big
+chunk of the file.
+
+Because of this, we have a simple guideline: make anonymous namespaces as small
+as possible, and only use them for class declarations. For example, this is
+good:
+
+.. code-block:: c++
+
+ namespace {
+ class StringSort {
+ ...
+ public:
+ StringSort(...);
+ bool operator<(const char *RHS) const;
+ };
+ } // end anonymous namespace
+
+ static void Helper() {
+ ...
+ }
+
+ bool StringSort::operator<(const char *RHS) const {
+ ...
+ }
+
+This is bad:
+
+.. code-block:: c++
+
+ namespace {
+ class StringSort {
+ ...
+ public:
+ StringSort(...);
+ bool operator<(const char *RHS) const;
+ };
+
+ void Helper() {
+ ...
+ }
+
+ bool StringSort::operator<(const char *RHS) const {
+ ...
+ }
+
+ } // end anonymous namespace
+
+This is bad specifically because if you're looking at "``Helper``" in the
+middle of a large C++ file, you have no immediate way to tell whether it is
+local to the file. When it is marked static explicitly, this is immediately
+obvious.
+Also, there is no reason to enclose the definition of "``operator<``" in the
+namespace just because it was declared there.
+
+See Also
+========
+
+A lot of these comments and recommendations have been culled from other sources.
+Two particularly important books for our work are:
+
+#. `Effective C++
+ <http://www.amazon.com/Effective-Specific-Addison-Wesley-Professional-Computing/dp/0321334876>`_
+ by Scott Meyers. Also interesting and useful are "More Effective C++" and
+ "Effective STL" by the same author.
+
+#. `Large-Scale C++ Software Design
+ <http://www.amazon.com/Large-Scale-Software-Design-John-Lakos/dp/0201633620/ref=sr_1_1>`_
+ by John Lakos.
+
+If you get some free time and you haven't read them, do so; you might learn
+something.
diff --git a/docs/CommandGuide/FileCheck.pod b/docs/CommandGuide/FileCheck.rst
index 2662cc0..51a9bf6 100644
--- a/docs/CommandGuide/FileCheck.pod
+++ b/docs/CommandGuide/FileCheck.rst
@@ -1,67 +1,87 @@
+FileCheck - Flexible pattern matching file verifier
+===================================================
-=pod
-=head1 NAME
+SYNOPSIS
+--------
-FileCheck - Flexible pattern matching file verifier
-=head1 SYNOPSIS
+**FileCheck** *match-filename* [*--check-prefix=XXX*] [*--strict-whitespace*]
+
-B<FileCheck> I<match-filename> [I<--check-prefix=XXX>] [I<--strict-whitespace>]
+DESCRIPTION
+-----------
-=head1 DESCRIPTION
-B<FileCheck> reads two files (one from standard input, and one specified on the
+**FileCheck** reads two files (one from standard input, and one specified on the
command line) and uses one to verify the other. This behavior is particularly
useful for the testsuite, which wants to verify that the output of some tool
(e.g. llc) contains the expected information (for example, a movsd from esp or
whatever is interesting). This is similar to using grep, but it is optimized
for matching multiple different inputs in one file in a specific order.
-The I<match-filename> file specifies the file that contains the patterns to
+The *match-filename* file specifies the file that contains the patterns to
match. The file to verify is always read from standard input.
-=head1 OPTIONS
-=over
+OPTIONS
+-------
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**--check-prefix** *prefix*
+
+ FileCheck searches the contents of *match-filename* for patterns to match. By
+ default, these patterns are prefixed with "CHECK:". If you'd like to use a
+ different prefix (e.g. because the same input file is used to check multiple
+ different tools or options), the **--check-prefix** argument allows you to specify
+ a specific prefix to match.
+
-=item B<-help>
-Print a summary of command line options.
+**--strict-whitespace**
-=item B<--check-prefix> I<prefix>
+ By default, FileCheck canonicalizes input horizontal whitespace (spaces and
+ tabs), which causes it to ignore these differences (a space will match a tab).
+ The --strict-whitespace argument disables this behavior.
-FileCheck searches the contents of I<match-filename> for patterns to match. By
-default, these patterns are prefixed with "CHECK:". If you'd like to use a
-different prefix (e.g. because the same input file is checking multiple
-different tool or options), the B<--check-prefix> argument allows you to specify
-a specific prefix to match.
-=item B<--strict-whitespace>
-By default, FileCheck canonicalizes input horizontal whitespace (spaces and
-tabs) which causes it to ignore these differences (a space will match a tab).
-The --strict-whitespace argument disables this behavior.
+**-version**
-=item B<-version>
+ Show the version number of this program.
-Show the version number of this program.
-=back
-=head1 EXIT STATUS
-If B<FileCheck> verifies that the file matches the expected contents, it exits
+EXIT STATUS
+-----------
+
+
+If **FileCheck** verifies that the file matches the expected contents, it exits
with 0. Otherwise, if not, or if an error occurs, it will exit with a non-zero
value.
-=head1 TUTORIAL
+
+TUTORIAL
+--------
+
FileCheck is typically used from LLVM regression tests, being invoked on the RUN
line of the test. A simple example of using FileCheck from a RUN line looks
like this:
- ; RUN: llvm-as < %s | llc -march=x86-64 | FileCheck %s
+
+.. code-block:: llvm
+
+ ; RUN: llvm-as < %s | llc -march=x86-64 | FileCheck %s
+
This syntax says to pipe the current file ("%s") into llvm-as, pipe that into
llc, then pipe the output of llc into FileCheck. This means that FileCheck will
@@ -69,21 +89,25 @@ be verifying its standard input (the llc output) against the filename argument
specified (the original .ll file specified by "%s"). To see how this works,
let's look at the rest of the .ll file (after the RUN line):
- define void @sub1(i32* %p, i32 %v) {
- entry:
- ; CHECK: sub1:
- ; CHECK: subl
- %0 = tail call i32 @llvm.atomic.load.sub.i32.p0i32(i32* %p, i32 %v)
- ret void
- }
-
- define void @inc4(i64* %p) {
- entry:
- ; CHECK: inc4:
- ; CHECK: incq
- %0 = tail call i64 @llvm.atomic.load.add.i64.p0i64(i64* %p, i64 1)
- ret void
- }
+
+.. code-block:: llvm
+
+ define void @sub1(i32* %p, i32 %v) {
+ entry:
+ ; CHECK: sub1:
+ ; CHECK: subl
+ %0 = tail call i32 @llvm.atomic.load.sub.i32.p0i32(i32* %p, i32 %v)
+ ret void
+ }
+
+ define void @inc4(i64* %p) {
+ entry:
+ ; CHECK: inc4:
+ ; CHECK: incq
+ %0 = tail call i64 @llvm.atomic.load.add.i64.p0i64(i64* %p, i64 1)
+ ret void
+ }
+
Here you can see some "CHECK:" lines specified in comments. Now you can see
how the file is piped into llvm-as, then llc, and the machine code output is
@@ -102,35 +126,40 @@ is a "subl" in between those labels. If it existed somewhere else in the file,
that would not count: "grep subl" matches if subl exists anywhere in the
file.
+The FileCheck -check-prefix option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-=head2 The FileCheck -check-prefix option
-
The FileCheck -check-prefix option allows multiple test configurations to be
driven from one .ll file. This is useful in many circumstances, for example,
testing different architectural variants with llc. Here's a simple example:
- ; RUN: llvm-as < %s | llc -mtriple=i686-apple-darwin9 -mattr=sse41 \
- ; RUN: | FileCheck %s -check-prefix=X32>
- ; RUN: llvm-as < %s | llc -mtriple=x86_64-apple-darwin9 -mattr=sse41 \
- ; RUN: | FileCheck %s -check-prefix=X64>
-
- define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
- %tmp1 = insertelement <4 x i32>; %tmp, i32 %s, i32 1
- ret <4 x i32> %tmp1
- ; X32: pinsrd_1:
- ; X32: pinsrd $1, 4(%esp), %xmm0
-
- ; X64: pinsrd_1:
- ; X64: pinsrd $1, %edi, %xmm0
- }
+
+.. code-block:: llvm
+
+ ; RUN: llvm-as < %s | llc -mtriple=i686-apple-darwin9 -mattr=sse41 \
+ ; RUN: | FileCheck %s -check-prefix=X32
+ ; RUN: llvm-as < %s | llc -mtriple=x86_64-apple-darwin9 -mattr=sse41 \
+ ; RUN: | FileCheck %s -check-prefix=X64
+
+ define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
+ %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
+ ret <4 x i32> %tmp1
+ ; X32: pinsrd_1:
+ ; X32: pinsrd $1, 4(%esp), %xmm0
+
+ ; X64: pinsrd_1:
+ ; X64: pinsrd $1, %edi, %xmm0
+ }
+
In this case, we're testing that we get the expected code generation with
both 32-bit and 64-bit code generation.
+The "CHECK-NEXT:" directive
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
-=head2 The "CHECK-NEXT:" directive
Sometimes you want to match lines and would like to verify that matches
happen on exactly consecutive lines with no other lines in between them. In
@@ -138,64 +167,78 @@ this case, you can use CHECK: and CHECK-NEXT: directives to specify this. If
you specified a custom check prefix, just use "<PREFIX>-NEXT:". For
example, something like this works as you'd expect:
- define void @t2(<2 x double>* %r, <2 x double&gt;* %A, double %B) {
- %tmp3 = load <2 x double&gt;* %A, align 16
- %tmp7 = insertelement <2 x double&gt; undef, double %B, i32 0
- %tmp9 = shufflevector <2 x double&gt; %tmp3,
- <2 x double&gt; %tmp7,
- <2 x i32&gt; < i32 0, i32 2 &gt;
- store <2 x double&gt; %tmp9, <2 x double&gt;* %r, align 16
- ret void
-
- ; CHECK: t2:
- ; CHECK: movl 8(%esp), %eax
- ; CHECK-NEXT: movapd (%eax), %xmm0
- ; CHECK-NEXT: movhpd 12(%esp), %xmm0
- ; CHECK-NEXT: movl 4(%esp), %eax
- ; CHECK-NEXT: movapd %xmm0, (%eax)
- ; CHECK-NEXT: ret
- }
+
+.. code-block:: llvm
+
+ define void @t2(<2 x double>* %r, <2 x double>* %A, double %B) {
+ %tmp3 = load <2 x double>* %A, align 16
+ %tmp7 = insertelement <2 x double> undef, double %B, i32 0
+ %tmp9 = shufflevector <2 x double> %tmp3,
+ <2 x double> %tmp7,
+ <2 x i32> < i32 0, i32 2 >
+ store <2 x double> %tmp9, <2 x double>* %r, align 16
+ ret void
+
+ ; CHECK: t2:
+ ; CHECK: movl 8(%esp), %eax
+ ; CHECK-NEXT: movapd (%eax), %xmm0
+ ; CHECK-NEXT: movhpd 12(%esp), %xmm0
+ ; CHECK-NEXT: movl 4(%esp), %eax
+ ; CHECK-NEXT: movapd %xmm0, (%eax)
+ ; CHECK-NEXT: ret
+ }
+
CHECK-NEXT: directives reject the input unless there is exactly one newline
between it and the previous directive. A CHECK-NEXT cannot be the first
directive in a file.
+The "CHECK-NOT:" directive
+~~~~~~~~~~~~~~~~~~~~~~~~~~
-=head2 The "CHECK-NOT:" directive
The CHECK-NOT: directive is used to verify that a string doesn't occur
between two matches (or before the first match, or after the last match). For
example, to verify that a load is removed by a transformation, a test like this
can be used:
- define i8 @coerce_offset0(i32 %V, i32* %P) {
- store i32 %V, i32* %P
-
- %P2 = bitcast i32* %P to i8*
- %P3 = getelementptr i8* %P2, i32 2
- %A = load i8* %P3
- ret i8 %A
- ; CHECK: @coerce_offset0
- ; CHECK-NOT: load
- ; CHECK: ret i8
- }
+
+.. code-block:: llvm
+
+ define i8 @coerce_offset0(i32 %V, i32* %P) {
+ store i32 %V, i32* %P
+
+ %P2 = bitcast i32* %P to i8*
+ %P3 = getelementptr i8* %P2, i32 2
+
+ %A = load i8* %P3
+ ret i8 %A
+ ; CHECK: @coerce_offset0
+ ; CHECK-NOT: load
+ ; CHECK: ret i8
+ }
-=head2 FileCheck Pattern Matching Syntax
+FileCheck Pattern Matching Syntax
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
The CHECK: and CHECK-NOT: directives both take a pattern to match. For most
uses of FileCheck, fixed string matching is perfectly sufficient. For some
things, a more flexible form of matching is desired. To support this, FileCheck
allows you to specify regular expressions in matching strings, surrounded by
-double braces: B<{{yourregex}}>. Because we want to use fixed string
+double braces: **{{yourregex}}**. Because we want to use fixed string
matching for a majority of what we do, FileCheck has been designed to support
mixing and matching fixed string matching with regular expressions. This allows
you to write things like this:
- ; CHECK: movhpd {{[0-9]+}}(%esp), {{%xmm[0-7]}}
+
+.. code-block:: llvm
+
+ ; CHECK: movhpd {{[0-9]+}}(%esp), {{%xmm[0-7]}}
+
In this case, any offset from the ESP register will be allowed, and any xmm
register will be allowed.
@@ -204,11 +247,12 @@ Because regular expressions are enclosed with double braces, they are
visually distinct, and you don't need to use escape characters within the double
braces like you would in C. In the rare case that you want to match double
braces explicitly from the input, you can use something ugly like
-B<{{[{][{]}}> as your pattern.
+**{{[{][{]}}** as your pattern.
+FileCheck Variables
+~~~~~~~~~~~~~~~~~~~
-=head2 FileCheck Variables
It is often useful to match a pattern and then verify that it occurs again
later in the file. For codegen tests, this can be useful to allow any register,
@@ -216,30 +260,25 @@ but verify that that register is used consistently later. To do this, FileCheck
allows named variables to be defined and substituted into patterns. Here is a
simple example:
- ; CHECK: test5:
- ; CHECK: notw [[REGISTER:%[a-z]+]]
- ; CHECK: andw {{.*}}[REGISTER]]
-The first check line matches a regex (B<%[a-z]+>) and captures it into
+.. code-block:: llvm
+
+ ; CHECK: test5:
+ ; CHECK: notw [[REGISTER:%[a-z]+]]
+ ; CHECK: andw {{.*}}[[REGISTER]]
+
+
+The first check line matches a regex (**%[a-z]+**) and captures it into
the variable "REGISTER". The second line verifies that whatever is in REGISTER
occurs later in the file after an "andw". FileCheck variable references are
-always contained in B<[[ ]]> pairs, are named, and their names can be
-formed with the regex "B<[a-zA-Z_][a-zA-Z0-9_]*>". If a colon follows the
+always contained in **[[ ]]** pairs, are named, and their names can be
+formed with the regex ``[a-zA-Z_][a-zA-Z0-9_]*``. If a colon follows the
name, then it is a definition of the variable; if not, it is a use.
FileCheck variables can be defined multiple times, and uses always get the
latest value. Note that variables are all read at the start of a "CHECK" line
and are all defined at the end. This means that if you have something like
-"B<CHECK: [[XYZ:.*]]x[[XYZ]]>", the check line will read the previous
+"**CHECK: [[XYZ:.\\*]]x[[XYZ]]**", the check line will read the previous
value of the XYZ variable and define a new one after the match is performed. If
+you need to do something like this, you can probably take advantage of the fact
+that FileCheck is not actually line-oriented when it matches; this allows you to
define two separate CHECK lines that match on the same line.
-
-
-
-=head1 AUTHORS
-
-Maintained by The LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/Makefile b/docs/CommandGuide/Makefile
deleted file mode 100644
index 3f9f60b..0000000
--- a/docs/CommandGuide/Makefile
+++ /dev/null
@@ -1,103 +0,0 @@
-##===- docs/CommandGuide/Makefile --------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-ifdef BUILD_FOR_WEBSITE
-# This special case is for keeping the CommandGuide on the LLVM web site
-# up to date automatically as the documents are checked in. It must build
-# the POD files to HTML only and keep them in the src directories. It must also
-# build in an unconfigured tree, hence the ifdef. To use this, run
-# make -s BUILD_FOR_WEBSITE=1 inside the cvs commit script.
-SRC_DOC_DIR=
-DST_HTML_DIR=html/
-DST_MAN_DIR=man/man1/
-DST_PS_DIR=ps/
-
-# If we are in BUILD_FOR_WEBSITE mode, default to the all target.
-all:: html man ps
-
-clean:
- rm -f pod2htm*.*~~ $(HTML) $(MAN) $(PS)
-
-# To create other directories, as needed, and timestamp their creation
-%/.dir:
- -mkdir $* > /dev/null
- date > $@
-
-else
-
-# Otherwise, if not in BUILD_FOR_WEBSITE mode, use the project info.
-LEVEL := ../..
-include $(LEVEL)/Makefile.common
-
-SRC_DOC_DIR=$(PROJ_SRC_DIR)/
-DST_HTML_DIR=$(PROJ_OBJ_DIR)/
-DST_MAN_DIR=$(PROJ_OBJ_DIR)/
-DST_PS_DIR=$(PROJ_OBJ_DIR)/
-
-endif
-
-
-POD := $(wildcard $(SRC_DOC_DIR)*.pod)
-HTML := $(patsubst $(SRC_DOC_DIR)%.pod, $(DST_HTML_DIR)%.html, $(POD))
-MAN := $(patsubst $(SRC_DOC_DIR)%.pod, $(DST_MAN_DIR)%.1, $(POD))
-PS := $(patsubst $(SRC_DOC_DIR)%.pod, $(DST_PS_DIR)%.ps, $(POD))
-
-# The set of man pages we will not install
-NO_INSTALL_MANS = $(DST_MAN_DIR)FileCheck.1 $(DST_MAN_DIR)llvm-build.1
-
-# The set of man pages that we will install
-INSTALL_MANS = $(filter-out $(NO_INSTALL_MANS), $(MAN))
-
-.SUFFIXES:
-.SUFFIXES: .html .pod .1 .ps
-
-$(DST_HTML_DIR)%.html: %.pod $(DST_HTML_DIR)/.dir
- pod2html --css=manpage.css --htmlroot=. \
- --podpath=. --noindex --infile=$< --outfile=$@ --title=$*
-
-$(DST_MAN_DIR)%.1: %.pod $(DST_MAN_DIR)/.dir
- pod2man --release=CVS --center="LLVM Command Guide" $< $@
-
-$(DST_PS_DIR)%.ps: $(DST_MAN_DIR)%.1 $(DST_PS_DIR)/.dir
- groff -Tps -man $< > $@
-
-
-html: $(HTML)
-man: $(MAN)
-ps: $(PS)
-
-EXTRA_DIST := $(POD) index.html
-
-clean-local::
- $(Verb) $(RM) -f pod2htm*.*~~ $(HTML) $(MAN) $(PS)
-
-HTML_DIR := $(DESTDIR)$(PROJ_docsdir)/html/CommandGuide
-MAN_DIR := $(DESTDIR)$(PROJ_mandir)/man1
-PS_DIR := $(DESTDIR)$(PROJ_docsdir)/ps
-
-install-local:: $(HTML) $(INSTALL_MANS) $(PS)
- $(Echo) Installing HTML CommandGuide Documentation
- $(Verb) $(MKDIR) $(HTML_DIR)
- $(Verb) $(DataInstall) $(HTML) $(HTML_DIR)
- $(Verb) $(DataInstall) $(PROJ_SRC_DIR)/index.html $(HTML_DIR)
- $(Verb) $(DataInstall) $(PROJ_SRC_DIR)/manpage.css $(HTML_DIR)
- $(Echo) Installing MAN CommandGuide Documentation
- $(Verb) $(MKDIR) $(MAN_DIR)
- $(Verb) $(DataInstall) $(INSTALL_MANS) $(MAN_DIR)
- $(Echo) Installing PS CommandGuide Documentation
- $(Verb) $(MKDIR) $(PS_DIR)
- $(Verb) $(DataInstall) $(PS) $(PS_DIR)
-
-uninstall-local::
- $(Echo) Uninstalling CommandGuide Documentation
- $(Verb) $(RM) -rf $(HTML_DIR) $(MAN_DIR) $(PS_DIR)
-
-printvars::
- $(Echo) "POD : " '$(POD)'
- $(Echo) "HTML : " '$(HTML)'
diff --git a/docs/CommandGuide/bugpoint.pod b/docs/CommandGuide/bugpoint.pod
deleted file mode 100644
index 31db62f..0000000
--- a/docs/CommandGuide/bugpoint.pod
+++ /dev/null
@@ -1,186 +0,0 @@
-=pod
-
-=head1 NAME
-
-bugpoint - automatic test case reduction tool
-
-=head1 SYNOPSIS
-
-B<bugpoint> [I<options>] [I<input LLVM ll/bc files>] [I<LLVM passes>] B<--args>
-I<program arguments>
-
-=head1 DESCRIPTION
-
-B<bugpoint> narrows down the source of problems in LLVM tools and passes. It
-can be used to debug three types of failures: optimizer crashes, miscompilations
-by optimizers, or bad native code generation (including problems in the static
-and JIT compilers). It aims to reduce large test cases to small, useful ones.
-For more information on the design and inner workings of B<bugpoint>, as well as
-advice for using bugpoint, see F<llvm/docs/Bugpoint.html> in the LLVM
-distribution.
-
-=head1 OPTIONS
-
-=over
-
-=item B<--additional-so> F<library>
-
-Load the dynamic shared object F<library> into the test program whenever it is
-run. This is useful if you are debugging programs which depend on non-LLVM
-libraries (such as the X or curses libraries) to run.
-
-=item B<--append-exit-code>=I<{true,false}>
-
-Append the test programs exit code to the output file so that a change in exit
-code is considered a test failure. Defaults to false.
-
-=item B<--args> I<program args>
-
-Pass all arguments specified after -args to the test program whenever it runs.
-Note that if any of the I<program args> start with a '-', you should use:
-
- bugpoint [bugpoint args] --args -- [program args]
-
-The "--" right after the B<--args> option tells B<bugpoint> to consider any
-options starting with C<-> to be part of the B<--args> option, not as options to
-B<bugpoint> itself.
-
-=item B<--tool-args> I<tool args>
-
-Pass all arguments specified after --tool-args to the LLVM tool under test
-(B<llc>, B<lli>, etc.) whenever it runs. You should use this option in the
-following way:
-
- bugpoint [bugpoint args] --tool-args -- [tool args]
-
-The "--" right after the B<--tool-args> option tells B<bugpoint> to consider any
-options starting with C<-> to be part of the B<--tool-args> option, not as
-options to B<bugpoint> itself. (See B<--args>, above.)
-
-=item B<--safe-tool-args> I<tool args>
-
-Pass all arguments specified after B<--safe-tool-args> to the "safe" execution
-tool.
-
-=item B<--gcc-tool-args> I<gcc tool args>
-
-Pass all arguments specified after B<--gcc-tool-args> to the invocation of
-B<gcc>.
-
-=item B<--opt-args> I<opt args>
-
-Pass all arguments specified after B<--opt-args> to the invocation of B<opt>.
-
-=item B<--disable-{dce,simplifycfg}>
-
-Do not run the specified passes to clean up and reduce the size of the test
-program. By default, B<bugpoint> uses these passes internally when attempting to
-reduce test programs. If you're trying to find a bug in one of these passes,
-B<bugpoint> may crash.
-
-=item B<--enable-valgrind>
-
-Use valgrind to find faults in the optimization phase. This will allow
-bugpoint to find otherwise asymptomatic problems caused by memory
-mis-management.
-
-=item B<-find-bugs>
-
-Continually randomize the specified passes and run them on the test program
-until a bug is found or the user kills B<bugpoint>.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<--input> F<filename>
-
-Open F<filename> and redirect the standard input of the test program, whenever
-it runs, to come from that file.
-
-=item B<--load> F<plugin>
-
-Load the dynamic object F<plugin> into B<bugpoint> itself. This object should
-register new optimization passes. Once loaded, the object will add new command
-line options to enable various optimizations. To see the new complete list of
-optimizations, use the B<-help> and B<--load> options together; for example:
-
- bugpoint --load myNewPass.so -help
-
-=item B<--mlimit> F<megabytes>
-
-Specifies an upper limit on memory usage of the optimization and codegen. Set
-to zero to disable the limit.
-
-=item B<--output> F<filename>
-
-Whenever the test program produces output on its standard output stream, it
-should match the contents of F<filename> (the "reference output"). If you
-do not use this option, B<bugpoint> will attempt to generate a reference output
-by compiling the program with the "safe" backend and running it.
-
-=item B<--profile-info-file> F<filename>
-
-Profile file loaded by B<--profile-loader>.
-
-=item B<--run-{int,jit,llc,cbe,custom}>
-
-Whenever the test program is compiled, B<bugpoint> should generate code for it
-using the specified code generator. These options allow you to choose the
-interpreter, the JIT compiler, the static native code compiler, the C
-backend, or a custom command (see B<--exec-command>) respectively.
-
-=item B<--safe-{llc,cbe,custom}>
-
-When debugging a code generator, B<bugpoint> should use the specified code
-generator as the "safe" code generator. This is a known-good code generator
-used to generate the "reference output" if it has not been provided, and to
-compile portions of the program that as they are excluded from the testcase.
-These options allow you to choose the
-static native code compiler, the C backend, or a custom command,
-(see B<--exec-command>) respectively. The interpreter and the JIT backends
-cannot currently be used as the "safe" backends.
-
-=item B<--exec-command> I<command>
-
-This option defines the command to use with the B<--run-custom> and
-B<--safe-custom> options to execute the bitcode testcase. This can
-be useful for cross-compilation.
-
-=item B<--compile-command> I<command>
-
-This option defines the command to use with the B<--compile-custom>
-option to compile the bitcode testcase. This can be useful for
-testing compiler output without running any link or execute stages. To
-generate a reduced unit test, you may add CHECK directives to the
-testcase and pass the name of an executable compile-command script in this form:
-
- #!/bin/sh
- llc "$@"
- not FileCheck [bugpoint input file].ll < bugpoint-test-program.s
-
-This script will "fail" as long as FileCheck passes. So the result
-will be the minimum bitcode that passes FileCheck.
-
-=item B<--safe-path> I<path>
-
-This option defines the path to the command to execute with the
-B<--safe-{int,jit,llc,cbe,custom}>
-option.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<bugpoint> succeeds in finding a problem, it will exit with 0. Otherwise,
-if an error occurs, it will exit with a non-zero value.
-
-=head1 SEE ALSO
-
-L<opt|opt>
-
-=head1 AUTHOR
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/bugpoint.rst b/docs/CommandGuide/bugpoint.rst
new file mode 100644
index 0000000..c1b3b6e
--- /dev/null
+++ b/docs/CommandGuide/bugpoint.rst
@@ -0,0 +1,247 @@
+bugpoint - automatic test case reduction tool
+=============================================
+
+
+SYNOPSIS
+--------
+
+
+**bugpoint** [*options*] [*input LLVM ll/bc files*] [*LLVM passes*] **--args**
+*program arguments*
+
+
+DESCRIPTION
+-----------
+
+
+**bugpoint** narrows down the source of problems in LLVM tools and passes. It
+can be used to debug three types of failures: optimizer crashes, miscompilations
+by optimizers, or bad native code generation (including problems in the static
+and JIT compilers). It aims to reduce large test cases to small, useful ones.
+For more information on the design and inner workings of **bugpoint**, as well as
+advice for using bugpoint, see *llvm/docs/Bugpoint.html* in the LLVM
+distribution.
+
+
+OPTIONS
+-------
+
+
+
+**--additional-so** *library*
+
+ Load the dynamic shared object *library* into the test program whenever it is
+ run. This is useful if you are debugging programs which depend on non-LLVM
+ libraries (such as the X or curses libraries) to run.
+
+
+
+**--append-exit-code**\ =\ *{true,false}*
+
+ Append the test program's exit code to the output file so that a change in exit
+ code is considered a test failure. Defaults to false.
+
+
+
+**--args** *program args*
+
+ Pass all arguments specified after **--args** to the test program whenever it runs.
+ Note that if any of the *program args* start with a '-', you should use:
+
+
+ .. code-block:: perl
+
+ bugpoint [bugpoint args] --args -- [program args]
+
+
+ The "--" right after the **--args** option tells **bugpoint** to consider any
+ options starting with ``-`` to be part of the **--args** option, not as options to
+ **bugpoint** itself.
+
+
+
+**--tool-args** *tool args*
+
+ Pass all arguments specified after --tool-args to the LLVM tool under test
+ (**llc**, **lli**, etc.) whenever it runs. You should use this option in the
+ following way:
+
+
+ .. code-block:: perl
+
+ bugpoint [bugpoint args] --tool-args -- [tool args]
+
+
+ The "--" right after the **--tool-args** option tells **bugpoint** to consider any
+ options starting with ``-`` to be part of the **--tool-args** option, not as
+ options to **bugpoint** itself. (See **--args**, above.)
+
+
+
+**--safe-tool-args** *tool args*
+
+ Pass all arguments specified after **--safe-tool-args** to the "safe" execution
+ tool.
+
+
+
+**--gcc-tool-args** *gcc tool args*
+
+ Pass all arguments specified after **--gcc-tool-args** to the invocation of
+ **gcc**.
+
+
+
+**--opt-args** *opt args*
+
+ Pass all arguments specified after **--opt-args** to the invocation of **opt**.
+
+
+
+**--disable-{dce,simplifycfg}**
+
+ Do not run the specified passes to clean up and reduce the size of the test
+ program. By default, **bugpoint** uses these passes internally when attempting to
+ reduce test programs. If you're trying to find a bug in one of these passes,
+ **bugpoint** may crash.
+
+
+
+**--enable-valgrind**
+
+ Use valgrind to find faults in the optimization phase. This will allow
+ bugpoint to find otherwise asymptomatic problems caused by memory
+ mismanagement.
+
+
+
+**-find-bugs**
+
+ Continually randomize the specified passes and run them on the test program
+ until a bug is found or the user kills **bugpoint**.
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**--input** *filename*
+
+ Open *filename* and redirect the standard input of the test program, whenever
+ it runs, to come from that file.
+
+
+
+**--load** *plugin*
+
+ Load the dynamic object *plugin* into **bugpoint** itself. This object should
+ register new optimization passes. Once loaded, the object will add new command
+ line options to enable various optimizations. To see the new complete list of
+ optimizations, use the **-help** and **--load** options together; for example:
+
+
+ .. code-block:: perl
+
+ bugpoint --load myNewPass.so -help
+
+
+
+
+**--mlimit** *megabytes*
+
+ Specifies an upper limit on memory usage of the optimization and codegen. Set
+ to zero to disable the limit.
+
+
+
+**--output** *filename*
+
+ Whenever the test program produces output on its standard output stream, it
+ should match the contents of *filename* (the "reference output"). If you
+ do not use this option, **bugpoint** will attempt to generate a reference output
+ by compiling the program with the "safe" backend and running it.
+
+
+
+**--profile-info-file** *filename*
+
+ Profile file loaded by **--profile-loader**.
+
+
+
+**--run-{int,jit,llc,custom}**
+
+ Whenever the test program is compiled, **bugpoint** should generate code for it
+ using the specified code generator. These options allow you to choose the
+ interpreter, the JIT compiler, the static native code compiler, or a
+ custom command (see **--exec-command**) respectively.
+
+
+
+**--safe-{llc,custom}**
+
+ When debugging a code generator, **bugpoint** should use the specified code
+ generator as the "safe" code generator. This is a known-good code generator
+ used to generate the "reference output" if it has not been provided, and to
+ compile portions of the program that are excluded from the testcase.
+ These options allow you to choose the
+ static native code compiler, or a custom command, (see **--exec-command**)
+ respectively. The interpreter and the JIT backends cannot currently
+ be used as the "safe" backends.
+
+
+
+**--exec-command** *command*
+
+ This option defines the command to use with the **--run-custom** and
+ **--safe-custom** options to execute the bitcode testcase. This can
+ be useful for cross-compilation.
+
+
+
+**--compile-command** *command*
+
+ This option defines the command to use with the **--compile-custom**
+ option to compile the bitcode testcase. This can be useful for
+ testing compiler output without running any link or execute stages. To
+ generate a reduced unit test, you may add CHECK directives to the
+ testcase and pass the name of an executable compile-command script in this form:
+
+
+ .. code-block:: sh
+
+ #!/bin/sh
+ llc "$@"
+ not FileCheck [bugpoint input file].ll < bugpoint-test-program.s
+
+
+ This script will "fail" as long as FileCheck passes. So the result
+ will be the minimum bitcode that passes FileCheck.
+
+
+
+**--safe-path** *path*
+
+ This option defines the path to the command to execute with the
+ **--safe-{int,jit,llc,custom}**
+ option.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **bugpoint** succeeds in finding a problem, it will exit with 0. Otherwise,
+if an error occurs, it will exit with a non-zero value.
+
+
+SEE ALSO
+--------
+
+
+opt
diff --git a/docs/CommandGuide/html/manpage.css b/docs/CommandGuide/html/manpage.css
deleted file mode 100644
index b200343..0000000
--- a/docs/CommandGuide/html/manpage.css
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Based on http://www.perldoc.com/css/perldoc.css */
-
-@import url("../llvm.css");
-
-body { font-family: Arial,Helvetica; }
-
-blockquote { margin: 10pt; }
-
-h1, a { color: #336699; }
-
-
-/*** Top menu style ****/
-.mmenuon {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ff6600; font-size: 10pt;
- }
-.mmenuoff {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ffffff; font-size: 10pt;
-}
-.cpyright {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ffffff; font-size: xx-small;
-}
-.cpyrightText {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ffffff; font-size: xx-small;
-}
-.sections {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 11pt;
-}
-.dsections {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 12pt;
-}
-.slink {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #000000; font-size: 9pt;
-}
-
-.slink2 { font-family: Arial,Helvetica; text-decoration: none; color: #336699; }
-
-.maintitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 18pt;
-}
-.dblArrow {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: small;
-}
-.menuSec {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: small;
-}
-
-.newstext {
- font-family: Arial,Helvetica; font-size: small;
-}
-
-.linkmenu {
- font-family: Arial,Helvetica; color: #000000; font-weight: bold;
- text-decoration: none;
-}
-
-P {
- font-family: Arial,Helvetica;
-}
-
-PRE {
- font-size: 10pt;
-}
-.quote {
- font-family: Times; text-decoration: none;
- color: #000000; font-size: 9pt; font-style: italic;
-}
-.smstd { font-family: Arial,Helvetica; color: #000000; font-size: x-small; }
-.std { font-family: Arial,Helvetica; color: #000000; }
-.meerkatTitle {
- font-family: sans-serif; font-size: x-small; color: black; }
-
-.meerkatDescription { font-family: sans-serif; font-size: 10pt; color: black }
-.meerkatCategory {
- font-family: sans-serif; font-size: 9pt; font-weight: bold; font-style: italic;
- color: brown; }
-.meerkatChannel {
- font-family: sans-serif; font-size: 9pt; font-style: italic; color: brown; }
-.meerkatDate { font-family: sans-serif; font-size: xx-small; color: #336699; }
-
-.tocTitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #333333; font-size: 10pt;
-}
-
-.toc-item {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #336699; font-size: 10pt; text-decoration: underline;
-}
-
-.perlVersion {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #336699; font-size: 10pt; text-decoration: none;
-}
-
-.podTitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #000000;
-}
-
-.docTitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #000000; font-size: 10pt;
-}
-.dotDot {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #000000; font-size: 9pt;
-}
-
-.docSec {
- font-family: Arial,Helvetica; font-weight: normal;
- color: #333333; font-size: 9pt;
-}
-.docVersion {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 10pt;
-}
-
-.docSecs-on {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #ff0000; font-size: 10pt;
-}
-.docSecs-off {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #333333; font-size: 10pt;
-}
-
-h2 {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: medium;
-}
-h1 {
- font-family: Verdana,Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: large;
-}
-
-DL {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #333333; font-size: 10pt;
-}
-
-UL > LI > A {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #336699; font-size: 10pt;
-}
-
-.moduleInfo {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #333333; font-size: 11pt;
-}
-
-.moduleInfoSec {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 10pt;
-}
-
-.moduleInfoVal {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: underline;
- color: #000000; font-size: 10pt;
-}
-
-.cpanNavTitle {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #ffffff; font-size: 10pt;
-}
-.cpanNavLetter {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #333333; font-size: 9pt;
-}
-.cpanCat {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 9pt;
-}
-
-.bttndrkblue-bkgd-top {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgtop.gif);
-}
-.bttndrkblue-bkgd-left {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgleft.gif);
-}
-.bttndrkblue-bkgd {
- padding-top: 0px;
- padding-bottom: 0px;
- margin-bottom: 0px;
- margin-top: 0px;
- background-repeat: no-repeat;
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgmiddle.gif);
- vertical-align: top;
-}
-.bttndrkblue-bkgd-right {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgright.gif);
-}
-.bttndrkblue-bkgd-bottom {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgbottom.gif);
-}
-.bttndrkblue-text a {
- color: #ffffff;
- text-decoration: none;
-}
-a.bttndrkblue-text:hover {
- color: #ffDD3C;
- text-decoration: none;
-}
-.bg-ltblue {
- background-color: #f0f5fa;
-}
-
-.border-left-b {
- background: #f0f5fa url(/i/corner-leftline.gif) repeat-y;
-}
-
-.border-right-b {
- background: #f0f5fa url(/i/corner-rightline.gif) repeat-y;
-}
-
-.border-top-b {
- background: #f0f5fa url(/i/corner-topline.gif) repeat-x;
-}
-
-.border-bottom-b {
- background: #f0f5fa url(/i/corner-botline.gif) repeat-x;
-}
-
-.border-right-w {
- background: #ffffff url(/i/corner-rightline.gif) repeat-y;
-}
-
-.border-top-w {
- background: #ffffff url(/i/corner-topline.gif) repeat-x;
-}
-
-.border-bottom-w {
- background: #ffffff url(/i/corner-botline.gif) repeat-x;
-}
-
-.bg-white {
- background-color: #ffffff;
-}
-
-.border-left-w {
- background: #ffffff url(/i/corner-leftline.gif) repeat-y;
-}
diff --git a/docs/CommandGuide/index.html b/docs/CommandGuide/index.html
deleted file mode 100644
index 74ac004..0000000
--- a/docs/CommandGuide/index.html
+++ /dev/null
@@ -1,142 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <title>LLVM Command Guide</title>
- <link rel="stylesheet" href="../llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- LLVM Command Guide
-</h1>
-
-<div>
-
-<p>These documents are HTML versions of the <a href="man/man1/">man pages</a>
-for all of the LLVM tools. These pages describe how to use the LLVM commands
-and what their options are. Note that these pages do not describe all of the
-options available for all tools. To get a complete listing, pass the
-<tt>-help</tt> (general options) or <tt>-help-hidden</tt> (general+debugging
-options) arguments to the tool you are interested in.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="basic">Basic Commands</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<ul>
-
-<li><a href="/cmds/llvm-as.html"><b>llvm-as</b></a> -
- assemble a human-readable .ll file into bytecode</li>
-
-<li><a href="/cmds/llvm-dis.html"><b>llvm-dis</b></a> -
- disassemble a bytecode file into a human-readable .ll file</li>
-
-<li><a href="/cmds/opt.html"><b>opt</b></a> -
- run a series of LLVM-to-LLVM optimizations on a bytecode file</li>
-
-<li><a href="/cmds/llc.html"><b>llc</b></a> -
- generate native machine code for a bytecode file</li>
-
-<li><a href="/cmds/lli.html"><b>lli</b></a> -
- directly run a program compiled to bytecode using a JIT compiler or
- interpreter</li>
-
-<li><a href="/cmds/llvm-link.html"><b>llvm-link</b></a> -
- link several bytecode files into one</li>
-
-<li><a href="/cmds/llvm-ar.html"><b>llvm-ar</b></a> -
- archive bytecode files</li>
-
-<li><a href="/cmds/llvm-ranlib.html"><b>llvm-ranlib</b></a> -
- create an index for archives made with llvm-ar</li>
-
-<li><a href="/cmds/llvm-nm.html"><b>llvm-nm</b></a> -
- print out the names and types of symbols in a bytecode file</li>
-
-<li><a href="/cmds/llvm-prof.html"><b>llvm-prof</b></a> -
- format raw `<tt>llvmprof.out</tt>' data into a human-readable report</li>
-
-<li><a href="/cmds/llvm-ld.html"><b>llvm-ld</b></a> -
- general purpose linker with loadable runtime optimization support</li>
-
-<li><a href="/cmds/llvm-config.html"><b>llvm-config</b></a> -
- print out LLVM compilation options, libraries, etc. as configured</li>
-
-<li><a href="/cmds/llvm-diff.html"><b>llvm-diff</b></a> -
- structurally compare two modules</li>
-
-<li><a href="/cmds/llvm-cov.html"><b>llvm-cov</b></a> -
- emit coverage information</li>
-
-<li><a href="/cmds/llvm-stress.html"><b>llvm-stress</b></a> -
- generate random .ll files to fuzz different llvm components</li>
-
-</ul>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="debug">Debugging Tools</a>
-</h2>
-<!-- *********************************************************************** -->
-
-
-<div>
-
-<ul>
-
-<li><a href="/cmds/bugpoint.html"><b>bugpoint</b></a> -
- automatic test-case reducer</li>
-
-<li><a href="/cmds/llvm-extract.html"><b>llvm-extract</b></a> -
- extract a function from an LLVM bytecode file</li>
-
-<li><a href="/cmds/llvm-bcanalyzer.html"><b>llvm-bcanalyzer</b></a> -
- bytecode analyzer (analyzes the binary encoding itself, not the program it
- represents)</li>
-
-</ul>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="internal">Internal Tools</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-<ul>
-
-<li><a href="/cmds/FileCheck.html"><b>FileCheck</b></a> -
- Flexible file verifier used extensively by the testing harness</li>
-<li><a href="/cmds/tblgen.html"><b>tblgen</b></a> -
- target description reader and generator</li>
-<li><a href="/cmds/lit.html"><b>lit</b></a> -
- LLVM Integrated Tester, for running tests</li>
-
-</ul>
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-02-26 09:35:53 +0100 (Sun, 26 Feb 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst
new file mode 100644
index 0000000..73a4835
--- /dev/null
+++ b/docs/CommandGuide/index.rst
@@ -0,0 +1,53 @@
+.. _commands:
+
+LLVM Command Guide
+------------------
+
+The following documents are command descriptions for all of the LLVM tools.
+These pages describe how to use the LLVM commands and what their options are.
+Note that these pages do not describe all of the options available for all
+tools. To get a complete listing, pass the ``--help`` (general options) or
+``--help-hidden`` (general and debugging options) arguments to the tool you are
+interested in.
+
+Basic Commands
+~~~~~~~~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ llvm-as
+ llvm-dis
+ opt
+ llc
+ lli
+ llvm-link
+ llvm-ar
+ llvm-ranlib
+ llvm-nm
+ llvm-prof
+ llvm-config
+ llvm-diff
+ llvm-cov
+ llvm-stress
+
+Debugging Tools
+~~~~~~~~~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ bugpoint
+ llvm-extract
+ llvm-bcanalyzer
+
+Developer Tools
+~~~~~~~~~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ FileCheck
+ tblgen
+ lit
+ llvm-build
diff --git a/docs/CommandGuide/lit.pod b/docs/CommandGuide/lit.pod
deleted file mode 100644
index 81fc2c9..0000000
--- a/docs/CommandGuide/lit.pod
+++ /dev/null
@@ -1,404 +0,0 @@
-=pod
-
-=head1 NAME
-
-lit - LLVM Integrated Tester
-
-=head1 SYNOPSIS
-
-B<lit> [I<options>] [I<tests>]
-
-=head1 DESCRIPTION
-
-B<lit> is a portable tool for executing LLVM and Clang style test suites,
-summarizing their results, and providing indication of failures. B<lit> is
-designed to be a lightweight testing tool with as simple a user interface as
-possible.
-
-B<lit> should be run with one or more I<tests> specified on the command
-line. Tests can be either individual test files or directories to search for
-tests (see L<"TEST DISCOVERY">).
-
-Each specified test will be executed (potentially in parallel) and once all
-tests have been run B<lit> will print summary information on the number of tests
-which passed or failed (see L<"TEST STATUS RESULTS">). The B<lit> program will
-execute with a non-zero exit code if any tests fail.
-
-By default B<lit> will use a succinct progress display and will only print
-summary information for test failures. See L<"OUTPUT OPTIONS"> for options
-controlling the B<lit> progress display and output.
-
-B<lit> also includes a number of options for controlling how tests are executed
-(specific features may depend on the particular test format). See L<"EXECUTION
-OPTIONS"> for more information.
-
-Finally, B<lit> also supports additional options for only running a subset of
-the tests specified on the command line, see L<"SELECTION OPTIONS"> for
-more information.
-
-Users interested in the B<lit> architecture or designing a B<lit> testing
-implementation should see L<"LIT INFRASTRUCTURE">.
-
-=head1 GENERAL OPTIONS
-
-=over
-
-=item B<-h>, B<--help>
-
-Show the B<lit> help message.
-
-=item B<-j> I<N>, B<--threads>=I<N>
-
-Run I<N> tests in parallel. By default, this is automatically chosen to match
-the number of detected available CPUs.
-
-=item B<--config-prefix>=I<NAME>
-
-Search for I<NAME.cfg> and I<NAME.site.cfg> when searching for test suites,
-instead of I<lit.cfg> and I<lit.site.cfg>.
-
-=item B<--param> I<NAME>, B<--param> I<NAME>=I<VALUE>
-
-Add a user defined parameter I<NAME> with the given I<VALUE> (or the empty
-string if not given). The meaning and use of these parameters is test suite
-dependent.
-
-=back
-
-=head1 OUTPUT OPTIONS
-
-=over
-
-=item B<-q>, B<--quiet>
-
-Suppress any output except for test failures.
-
-=item B<-s>, B<--succinct>
-
-Show less output, for example don't show information on tests that pass.
-
-=item B<-v>, B<--verbose>
-
-Show more information on test failures, for example the entire test output
-instead of just the test result.
-
-=item B<--no-progress-bar>
-
-Do not use curses based progress bar.
-
-=back
-
-=head1 EXECUTION OPTIONS
-
-=over
-
-=item B<--path>=I<PATH>
-
-Specify an additional I<PATH> to use when searching for executables in tests.
-
-=item B<--vg>
-
-Run individual tests under valgrind (using the memcheck tool). The
-I<--error-exitcode> argument for valgrind is used so that valgrind failures will
-cause the program to exit with a non-zero status.
-
-=item B<--vg-arg>=I<ARG>
-
-When I<--vg> is used, specify an additional argument to pass to valgrind itself.
-
-=item B<--time-tests>
-
-Track the wall time individual tests take to execute and include the results in
-the summary output. This is useful for determining which tests in a test suite
-take the most time to execute. Note that this option is most useful with I<-j
-1>.
-
-=back
-
-=head1 SELECTION OPTIONS
-
-=over
-
-=item B<--max-tests>=I<N>
-
-Run at most I<N> tests and then terminate.
-
-=item B<--max-time>=I<N>
-
-Spend at most I<N> seconds (approximately) running tests and then terminate.
-
-=item B<--shuffle>
-
-Run the tests in a random order.
-
-=back
-
-=head1 ADDITIONAL OPTIONS
-
-=over
-
-=item B<--debug>
-
-Run B<lit> in debug mode, for debugging configuration issues and B<lit> itself.
-
-=item B<--show-suites>
-
-List the discovered test suites as part of the standard output.
-
-=item B<--no-tcl-as-sh>
-
-Run Tcl scripts internally (instead of converting to shell scripts).
-
-=item B<--repeat>=I<N>
-
-Run each test I<N> times. Currently this is primarily useful for timing tests,
-other results are not collated in any reasonable fashion.
-
-=back
-
-=head1 EXIT STATUS
-
-B<lit> will exit with an exit code of 1 if there are any FAIL or XPASS
-results. Otherwise, it will exit with the status 0. Other exit codes are used
-for non-test related failures (for example a user error or an internal program
-error).
-
-=head1 TEST DISCOVERY
-
-The inputs passed to B<lit> can be either individual tests, or entire
-directories or hierarchies of tests to run. When B<lit> starts up, the first
-thing it does is convert the inputs into a complete list of tests to run as part
-of I<test discovery>.
-
-In the B<lit> model, every test must exist inside some I<test suite>. B<lit>
-resolves the inputs specified on the command line to test suites by searching
-upwards from the input path until it finds a I<lit.cfg> or I<lit.site.cfg>
-file. These files serve as both a marker of test suites and as configuration
-files which B<lit> loads in order to understand how to find and run the tests
-inside the test suite.
-
-Once B<lit> has mapped the inputs into test suites it traverses the list of
-inputs adding tests for individual files and recursively searching for tests in
-directories.
-
-This behavior makes it easy to specify a subset of tests to run, while still
-allowing the test suite configuration to control exactly how tests are
-interpreted. In addition, B<lit> always identifies tests by the test suite they
-are in, and their relative path inside the test suite. For appropriately
-configured projects, this allows B<lit> to provide convenient and flexible
-support for out-of-tree builds.
-
-=head1 TEST STATUS RESULTS
-
-Each test ultimately produces one of the following six results:
-
-=over
-
-=item B<PASS>
-
-The test succeeded.
-
-=item B<XFAIL>
-
-The test failed, but that is expected. This is used for test formats which allow
-specifying that a test does not currently work, but wish to leave it in the test
-suite.
-
-=item B<XPASS>
-
-The test succeeded, but it was expected to fail. This is used for tests which
-were specified as expected to fail, but are now succeeding (generally because
-the feature they test was broken and has been fixed).
-
-=item B<FAIL>
-
-The test failed.
-
-=item B<UNRESOLVED>
-
-The test result could not be determined. For example, this occurs when the test
-could not be run, the test itself is invalid, or the test was interrupted.
-
-=item B<UNSUPPORTED>
-
-The test is not supported in this environment. This is used by test formats
-which can report unsupported tests.
-
-=back
-
-Depending on the test format tests may produce additional information about
-their status (generally only for failures). See the L<Output|"OUTPUT OPTIONS">
-section for more information.
-
-=head1 LIT INFRASTRUCTURE
-
-This section describes the B<lit> testing architecture for users interested in
-creating a new B<lit> testing implementation, or extending an existing one.
-
-B<lit> proper is primarily an infrastructure for discovering and running
-arbitrary tests, and for exposing a single convenient interface to these
-tests. B<lit> itself doesn't know how to run tests; rather, this logic is
-defined by I<test suites>.
-
-=head2 TEST SUITES
-
-As described in L<"TEST DISCOVERY">, tests are always located inside a I<test
-suite>. Test suites serve to define the format of the tests they contain, the
-logic for finding those tests, and any additional information to run the tests.
-
-B<lit> identifies test suites as directories containing I<lit.cfg> or
-I<lit.site.cfg> files (see also B<--config-prefix>). Test suites are initially
-discovered by recursively searching up the directory hierarchy for all the input
-files passed on the command line. You can use B<--show-suites> to display the
-discovered test suites at startup.
-
-Once a test suite is discovered, its config file is loaded. Config files
-themselves are Python modules which will be executed. When the config file is
-executed, two important global variables are predefined:
-
-=over
-
-=item B<lit>
-
-The global B<lit> configuration object (a I<LitConfig> instance), which defines
-the builtin test formats, global configuration parameters, and other helper
-routines for implementing test configurations.
-
-=item B<config>
-
-This is the config object (a I<TestingConfig> instance) for the test suite,
-which the config file is expected to populate. The following variables are also
-available on the I<config> object, some of which must be set by the config and
-others are optional or predefined:
-
-B<name> I<[required]> The name of the test suite, for use in reports and
-diagnostics.
-
-B<test_format> I<[required]> The test format object which will be used to
-discover and run tests in the test suite. Generally this will be a builtin test
-format available from the I<lit.formats> module.
-
-B<test_src_root> The filesystem path to the test suite root. For out-of-dir
-builds this is the directory that will be scanned for tests.
-
-B<test_exec_root> For out-of-dir builds, the path to the test suite root inside
-the object directory. This is where tests will be run and temporary output files
-placed.
-
-B<environment> A dictionary representing the environment to use when executing
-tests in the suite.
-
-B<suffixes> For B<lit> test formats which scan directories for tests, this
-variable is a list of suffixes to identify test files. Used by: I<ShTest>,
-I<TclTest>.
-
-B<substitutions> For B<lit> test formats which substitute variables into a test
-script, the list of substitutions to perform. Used by: I<ShTest>, I<TclTest>.
-
-B<unsupported> Mark an unsupported directory, all tests within it will be
-reported as unsupported. Used by: I<ShTest>, I<TclTest>.
-
-B<parent> The parent configuration, this is the config object for the directory
-containing the test suite, or None.
-
-B<root> The root configuration. This is the top-most B<lit> configuration in
-the project.
-
-B<on_clone> The config is actually cloned for every subdirectory inside a test
-suite, to allow local configuration on a per-directory basis. The I<on_clone>
-variable can be set to a Python function which will be called whenever a
-configuration is cloned (for a subdirectory). The function should take three
-arguments: (1) the parent configuration, (2) the new configuration (which the
-I<on_clone> function will generally modify), and (3) the test path to the new
-directory being scanned.
-
-=back
-
-=head2 TEST DISCOVERY
-
-Once test suites are located, B<lit> recursively traverses the source directory
-(following I<test_src_root>) looking for tests. When B<lit> enters a
-sub-directory, it first checks to see if a nested test suite is defined in that
-directory. If so, it loads that test suite recursively, otherwise it
-instantiates a local test config for the directory (see L<"LOCAL CONFIGURATION
-FILES">).
-
-Tests are identified by the test suite they are contained within, and the
-relative path inside that suite. Note that the relative path may not refer to an
-actual file on disk; some test formats (such as I<GoogleTest>) define "virtual
-tests" which have a path that contains both the path to the actual test file and
-a subpath to identify the virtual test.
-
-=head2 LOCAL CONFIGURATION FILES
-
-When B<lit> loads a subdirectory in a test suite, it instantiates a local test
-configuration by cloning the configuration for the parent directory -- the root
-of this configuration chain will always be a test suite. Once the test
-configuration is cloned B<lit> checks for a I<lit.local.cfg> file in the
-subdirectory. If present, this file will be loaded and can be used to specialize
-the configuration for each individual directory. This facility can be used to
-define subdirectories of optional tests, or to change other configuration
-parameters -- for example, to change the test format, or the suffixes which
-identify test files.
-
-=head2 TEST RUN OUTPUT FORMAT
-
-The B<lit> output for a test run conforms to the following schema, in both short
-and verbose modes (although in short mode no PASS lines will be shown). This
-schema has been chosen to be relatively easy to reliably parse by a machine (for
-example in buildbot log scraping), and for other tools to generate.
-
-Each test result is expected to appear on a line that matches:
-
-<result code>: <test name> (<progress info>)
-
-where <result code> is a standard test result such as PASS, FAIL, XFAIL, XPASS,
-UNRESOLVED, or UNSUPPORTED. The performance result codes of IMPROVED and
-REGRESSED are also allowed.
-
-The <test name> field can consist of an arbitrary string containing no newline.
-
-The <progress info> field can be used to report progress information such as
-(1/300) or can be empty, but even when empty the parentheses are required.
-
-Each test result may include additional (multiline) log information in the
-following format.
-
-<log delineator> TEST '(<test name>)' <trailing delineator>
-... log message ...
-<log delineator>
-
-where <test name> should be the name of a preceding reported test, <log
-delineator> is a string of '*' characters I<at least> four characters long (the
-recommended length is 20), and <trailing delineator> is an arbitrary (unparsed)
-string.
-
-The following is an example of a test run output which consists of four tests A,
-B, C, and D, and a log message for the failing test C.
-
-=head3 Example Test Run Output Listing
-
-PASS: A (1 of 4)
-PASS: B (2 of 4)
-FAIL: C (3 of 4)
-******************** TEST 'C' FAILED ********************
-Test 'C' failed as a result of exit code 1.
-********************
-PASS: D (4 of 4)
-
-
-=head2 LIT EXAMPLE TESTS
-
-The B<lit> distribution contains several example implementations of test suites
-in the I<ExampleTests> directory.
-
-=head1 SEE ALSO
-
-L<valgrind(1)>
-
-=head1 AUTHOR
-
-Written by Daniel Dunbar and maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
new file mode 100644
index 0000000..3eb0be9
--- /dev/null
+++ b/docs/CommandGuide/lit.rst
@@ -0,0 +1,474 @@
+lit - LLVM Integrated Tester
+============================
+
+
+SYNOPSIS
+--------
+
+
+**lit** [*options*] [*tests*]
+
+
+DESCRIPTION
+-----------
+
+
+**lit** is a portable tool for executing LLVM and Clang style test suites,
+summarizing their results, and providing indication of failures. **lit** is
+designed to be a lightweight testing tool with as simple a user interface as
+possible.
+
+**lit** should be run with one or more *tests* specified on the command
+line. Tests can be either individual test files or directories to search for
+tests (see "TEST DISCOVERY").
+
+Each specified test will be executed (potentially in parallel) and once all
+tests have been run **lit** will print summary information on the number of tests
+which passed or failed (see "TEST STATUS RESULTS"). The **lit** program will
+execute with a non-zero exit code if any tests fail.
+
+By default **lit** will use a succinct progress display and will only print
+summary information for test failures. See "OUTPUT OPTIONS" for options
+controlling the **lit** progress display and output.
+
+**lit** also includes a number of options for controlling how tests are executed
+(specific features may depend on the particular test format). See "EXECUTION
+OPTIONS" for more information.
+
+Finally, **lit** also supports additional options for only running a subset of
+the tests specified on the command line, see "SELECTION OPTIONS" for
+more information.
+
+Users interested in the **lit** architecture or designing a **lit** testing
+implementation should see "LIT INFRASTRUCTURE".
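+
+For example, the following (with illustrative test paths) runs one directory
+of tests and one individual test in verbose mode::
+
+  lit -v test/Transforms test/CodeGen/X86/example.ll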
+
+
+GENERAL OPTIONS
+---------------
+
+
+
+**-h**, **--help**
+
+ Show the **lit** help message.
+
+
+
+**-j** *N*, **--threads**\ =\ *N*
+
+ Run *N* tests in parallel. By default, this is automatically chosen to match
+ the number of detected available CPUs.
+
+
+
+**--config-prefix**\ =\ *NAME*
+
+ Search for *NAME.cfg* and *NAME.site.cfg* when searching for test suites,
+ instead of *lit.cfg* and *lit.site.cfg*.
+
+
+
+**--param** *NAME*, **--param** *NAME*\ =\ *VALUE*
+
+ Add a user defined parameter *NAME* with the given *VALUE* (or the empty
+ string if not given). The meaning and use of these parameters is test suite
+ dependent.
+
+
+
+
+OUTPUT OPTIONS
+--------------
+
+
+
+**-q**, **--quiet**
+
+ Suppress any output except for test failures.
+
+
+
+**-s**, **--succinct**
+
+ Show less output, for example don't show information on tests that pass.
+
+
+
+**-v**, **--verbose**
+
+ Show more information on test failures, for example the entire test output
+ instead of just the test result.
+
+
+
+**--no-progress-bar**
+
+ Do not use curses based progress bar.
+
+
+
+
+EXECUTION OPTIONS
+-----------------
+
+
+
+**--path**\ =\ *PATH*
+
+ Specify an additional *PATH* to use when searching for executables in tests.
+
+
+
+**--vg**
+
+ Run individual tests under valgrind (using the memcheck tool). The
+ *--error-exitcode* argument for valgrind is used so that valgrind failures will
+ cause the program to exit with a non-zero status.
+
+
+
+**--vg-arg**\ =\ *ARG*
+
+ When *--vg* is used, specify an additional argument to pass to valgrind itself.
+
+
+
+**--time-tests**
+
+ Track the wall time individual tests take to execute and include the results in
+ the summary output. This is useful for determining which tests in a test suite
+ take the most time to execute. Note that this option is most useful with *-j
+ 1*.
+
+
+
+
+SELECTION OPTIONS
+-----------------
+
+
+
+**--max-tests**\ =\ *N*
+
+ Run at most *N* tests and then terminate.
+
+
+
+**--max-time**\ =\ *N*
+
+ Spend at most *N* seconds (approximately) running tests and then terminate.
+
+
+
+**--shuffle**
+
+ Run the tests in a random order.
+
+
+
+
+ADDITIONAL OPTIONS
+------------------
+
+
+
+**--debug**
+
+ Run **lit** in debug mode, for debugging configuration issues and **lit** itself.
+
+
+
+**--show-suites**
+
+ List the discovered test suites as part of the standard output.
+
+
+
+**--no-tcl-as-sh**
+
+ Run Tcl scripts internally (instead of converting to shell scripts).
+
+
+
+**--repeat**\ =\ *N*
+
+ Run each test *N* times. Currently this is primarily useful for timing tests,
+ other results are not collated in any reasonable fashion.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+**lit** will exit with an exit code of 1 if there are any FAIL or XPASS
+results. Otherwise, it will exit with the status 0. Other exit codes are used
+for non-test related failures (for example a user error or an internal program
+error).
+
+
+TEST DISCOVERY
+--------------
+
+
+The inputs passed to **lit** can be either individual tests, or entire
+directories or hierarchies of tests to run. When **lit** starts up, the first
+thing it does is convert the inputs into a complete list of tests to run as part
+of *test discovery*.
+
+In the **lit** model, every test must exist inside some *test suite*. **lit**
+resolves the inputs specified on the command line to test suites by searching
+upwards from the input path until it finds a *lit.cfg* or *lit.site.cfg*
+file. These files serve as both a marker of test suites and as configuration
+files which **lit** loads in order to understand how to find and run the tests
+inside the test suite.
+
+Once **lit** has mapped the inputs into test suites it traverses the list of
+inputs adding tests for individual files and recursively searching for tests in
+directories.
+
+This behavior makes it easy to specify a subset of tests to run, while still
+allowing the test suite configuration to control exactly how tests are
+interpreted. In addition, **lit** always identifies tests by the test suite they
+are in, and their relative path inside the test suite. For appropriately
+configured projects, this allows **lit** to provide convenient and flexible
+support for out-of-tree builds.
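+
+For example, a test might be reported with a name such as (the suite name and
+relative path here are illustrative)::
+
+  LLVM :: CodeGen/X86/example.ll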
+
+
+TEST STATUS RESULTS
+-------------------
+
+
+Each test ultimately produces one of the following six results:
+
+
+**PASS**
+
+ The test succeeded.
+
+
+
+**XFAIL**
+
+ The test failed, but that is expected. This is used for test formats which allow
+ specifying that a test does not currently work, but wish to leave it in the test
+ suite.
+
+
+
+**XPASS**
+
+ The test succeeded, but it was expected to fail. This is used for tests which
+ were specified as expected to fail, but are now succeeding (generally because
+ the feature they test was broken and has been fixed).
+
+
+
+**FAIL**
+
+ The test failed.
+
+
+
+**UNRESOLVED**
+
+ The test result could not be determined. For example, this occurs when the test
+ could not be run, the test itself is invalid, or the test was interrupted.
+
+
+
+**UNSUPPORTED**
+
+ The test is not supported in this environment. This is used by test formats
+ which can report unsupported tests.
+
+
+
+Depending on the test format tests may produce additional information about
+their status (generally only for failures). See the "OUTPUT OPTIONS"
+section for more information.
+
+
+LIT INFRASTRUCTURE
+------------------
+
+
+This section describes the **lit** testing architecture for users interested in
+creating a new **lit** testing implementation, or extending an existing one.
+
+**lit** proper is primarily an infrastructure for discovering and running
+arbitrary tests, and for exposing a single convenient interface to these
+tests. **lit** itself doesn't know how to run tests; rather, this logic is
+defined by *test suites*.
+
+TEST SUITES
+~~~~~~~~~~~
+
+
+As described in "TEST DISCOVERY", tests are always located inside a *test
+suite*. Test suites serve to define the format of the tests they contain, the
+logic for finding those tests, and any additional information to run the tests.
+
+**lit** identifies test suites as directories containing *lit.cfg* or
+*lit.site.cfg* files (see also **--config-prefix**). Test suites are initially
+discovered by recursively searching up the directory hierarchy for all the input
+files passed on the command line. You can use **--show-suites** to display the
+discovered test suites at startup.
+
+Once a test suite is discovered, its config file is loaded. Config files
+themselves are Python modules which will be executed. When the config file is
+executed, two important global variables are predefined:
+
+
+**lit**
+
+ The global **lit** configuration object (a *LitConfig* instance), which defines
+ the builtin test formats, global configuration parameters, and other helper
+ routines for implementing test configurations.
+
+
+
+**config**
+
+ This is the config object (a *TestingConfig* instance) for the test suite,
+ which the config file is expected to populate. The following variables are also
+ available on the *config* object, some of which must be set by the config and
+ others are optional or predefined:
+
+ **name** *[required]* The name of the test suite, for use in reports and
+ diagnostics.
+
+ **test_format** *[required]* The test format object which will be used to
+ discover and run tests in the test suite. Generally this will be a builtin test
+ format available from the *lit.formats* module.
+
+ **test_src_root** The filesystem path to the test suite root. For out-of-dir
+ builds this is the directory that will be scanned for tests.
+
+ **test_exec_root** For out-of-dir builds, the path to the test suite root inside
+ the object directory. This is where tests will be run and temporary output files
+ placed.
+
+ **environment** A dictionary representing the environment to use when executing
+ tests in the suite.
+
+ **suffixes** For **lit** test formats which scan directories for tests, this
+ variable is a list of suffixes to identify test files. Used by: *ShTest*,
+ *TclTest*.
+
+ **substitutions** For **lit** test formats which substitute variables into a test
+ script, the list of substitutions to perform. Used by: *ShTest*, *TclTest*.
+
+ **unsupported** Mark an unsupported directory, all tests within it will be
+ reported as unsupported. Used by: *ShTest*, *TclTest*.
+
+ **parent** The parent configuration, this is the config object for the directory
+ containing the test suite, or None.
+
+ **root** The root configuration. This is the top-most **lit** configuration in
+ the project.
+
+ **on_clone** The config is actually cloned for every subdirectory inside a test
+ suite, to allow local configuration on a per-directory basis. The *on_clone*
+ variable can be set to a Python function which will be called whenever a
+ configuration is cloned (for a subdirectory). The function should take three
+ arguments: (1) the parent configuration, (2) the new configuration (which the
+ *on_clone* function will generally modify), and (3) the test path to the new
+ directory being scanned.
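+
+ As an illustrative sketch of these variables, a minimal *lit.cfg* for a
+ suite using the builtin *ShTest* format might look like the following (the
+ suite name, suffix, and environment variable are placeholders):
+
+ .. code-block:: python
+
+     # 'lit' and 'config' are predefined when lit executes this file.
+     config.name = 'Example'
+     config.test_format = lit.formats.ShTest()
+     config.suffixes = ['.ll']
+
+     # Hypothetical on_clone hook; called with the parent config, the
+     # cloned config, and the path of the subdirectory being scanned.
+     def on_clone(parent, cfg, path):
+         cfg.environment['EXAMPLE_VAR'] = path
+     config.on_clone = on_clone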
+
+
+
+
+TEST DISCOVERY
+~~~~~~~~~~~~~~
+
+
+Once test suites are located, **lit** recursively traverses the source directory
+(following *test_src_root*) looking for tests. When **lit** enters a
+sub-directory, it first checks to see if a nested test suite is defined in that
+directory. If so, it loads that test suite recursively, otherwise it
+instantiates a local test config for the directory (see "LOCAL CONFIGURATION
+FILES").
+
+Tests are identified by the test suite they are contained within, and the
+relative path inside that suite. Note that the relative path may not refer to an
+actual file on disk; some test formats (such as *GoogleTest*) define "virtual
+tests" which have a path that contains both the path to the actual test file and
+a subpath to identify the virtual test.
+
+
+LOCAL CONFIGURATION FILES
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+When **lit** loads a subdirectory in a test suite, it instantiates a local test
+configuration by cloning the configuration for the parent directory -- the root
+of this configuration chain will always be a test suite. Once the test
+configuration is cloned **lit** checks for a *lit.local.cfg* file in the
+subdirectory. If present, this file will be loaded and can be used to specialize
+the configuration for each individual directory. This facility can be used to
+define subdirectories of optional tests, or to change other configuration
+parameters -- for example, to change the test format, or the suffixes which
+identify test files.
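+
+For example, a sketch of a *lit.local.cfg* that narrows the test suffixes for
+a single directory (the suffix value is illustrative):
+
+.. code-block:: python
+
+    # Everything else is inherited from the enclosing test suite; only
+    # the suffixes that identify tests change in this directory.
+    config.suffixes = ['.s']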
+
+
+TEST RUN OUTPUT FORMAT
+~~~~~~~~~~~~~~~~~~~~~~
+
+
+The **lit** output for a test run conforms to the following schema, in both short
+and verbose modes (although in short mode no PASS lines will be shown). This
+schema has been chosen to be relatively easy to reliably parse by a machine (for
+example in buildbot log scraping), and for other tools to generate.
+
+Each test result is expected to appear on a line that matches::
+
+  <result code>: <test name> (<progress info>)
+
+where <result code> is a standard test result such as PASS, FAIL, XFAIL, XPASS,
+UNRESOLVED, or UNSUPPORTED. The performance result codes of IMPROVED and
+REGRESSED are also allowed.
+
+The <test name> field can consist of an arbitrary string containing no newline.
+
+The <progress info> field can be used to report progress information such as
+(1/300) or can be empty, but even when empty the parentheses are required.
+
+Each test result may include additional (multiline) log information in the
+following format::
+
+  <log delineator> TEST '(<test name>)' <trailing delineator>
+  ... log message ...
+  <log delineator>
+
+where <test name> should be the name of a preceding reported test, <log
+delineator> is a string of '\*' characters *at least* four characters long (the
+recommended length is 20), and <trailing delineator> is an arbitrary (unparsed)
+string.
+
+The following is an example of a test run output which consists of four tests A,
+B, C, and D, and a log message for the failing test C::
+
+  PASS: A (1 of 4)
+  PASS: B (2 of 4)
+  FAIL: C (3 of 4)
+  ******************** TEST 'C' FAILED ********************
+  Test 'C' failed as a result of exit code 1.
+  ********************
+  PASS: D (4 of 4)
+
+
+LIT EXAMPLE TESTS
+~~~~~~~~~~~~~~~~~
+
+
+The **lit** distribution contains several example implementations of test suites
+in the *ExampleTests* directory.
+
+
+SEE ALSO
+--------
+
+
+valgrind(1)
diff --git a/docs/CommandGuide/llc.pod b/docs/CommandGuide/llc.pod
deleted file mode 100644
index 35abdae..0000000
--- a/docs/CommandGuide/llc.pod
+++ /dev/null
@@ -1,201 +0,0 @@
-=pod
-
-=head1 NAME
-
-llc - LLVM static compiler
-
-=head1 SYNOPSIS
-
-B<llc> [I<options>] [I<filename>]
-
-=head1 DESCRIPTION
-
-The B<llc> command compiles LLVM source inputs into assembly language for a
-specified architecture. The assembly language output can then be passed through
-a native assembler and linker to generate a native executable.
-
-The choice of architecture for the output assembly code is automatically
-determined from the input file, unless the B<-march> option is used to override
-the default.
-
-=head1 OPTIONS
-
-If I<filename> is - or omitted, B<llc> reads from standard input. Otherwise, it
-will read from I<filename>. Inputs can be in either the LLVM assembly language
-format (.ll) or the LLVM bitcode format (.bc).
-
-If the B<-o> option is omitted, then B<llc> will send its output to standard
-output if the input is from standard input. If the B<-o> option specifies -,
-then the output will also be sent to standard output.
-
-If no B<-o> option is specified and an input file other than - is specified,
-then B<llc> creates the output filename by taking the input filename,
-removing any existing F<.bc> extension, and adding a F<.s> suffix.
-
-Other B<llc> options are as follows:
-
-=head2 End-user Options
-
-=over
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-O>=I<uint>
-
-Generate code at different optimization levels. These correspond to the I<-O0>,
-I<-O1>, I<-O2>, and I<-O3> optimization levels used by B<llvm-gcc> and
-B<clang>.
-
-=item B<-mtriple>=I<target triple>
-
-Override the target triple specified in the input file with the specified
-string.
-
-=item B<-march>=I<arch>
-
-Specify the architecture for which to generate assembly, overriding the target
-encoded in the input file. See the output of B<llc -help> for a list of
-valid architectures. By default this is inferred from the target triple or
-autodetected to the current architecture.
-
-=item B<-mcpu>=I<cpuname>
-
-Specify a specific chip in the current architecture to generate code for.
-By default this is inferred from the target triple and autodetected to
-the current architecture. For a list of available CPUs, use:
-B<llvm-as E<lt> /dev/null | llc -march=xyz -mcpu=help>
-
-=item B<-mattr>=I<a1,+a2,-a3,...>
-
-Override or control specific attributes of the target, such as whether SIMD
-operations are enabled or not. The default set of attributes is set by the
-current CPU. For a list of available attributes, use:
-B<llvm-as E<lt> /dev/null | llc -march=xyz -mattr=help>
-
-=item B<--disable-fp-elim>
-
-Disable frame pointer elimination optimization.
-
-=item B<--disable-excess-fp-precision>
-
-Disable optimizations that may produce excess precision for floating point.
-Note that this option can dramatically slow down code on some systems
-(e.g. X86).
-
-=item B<--enable-no-infs-fp-math>
-
-Enable optimizations that assume no Inf values.
-
-=item B<--enable-no-nans-fp-math>
-
-Enable optimizations that assume no NAN values.
-
-=item B<--enable-unsafe-fp-math>
-
-Enable optimizations that make unsafe assumptions about IEEE math (e.g. that
-addition is associative) or may not work for all input ranges. These
-optimizations allow the code generator to make use of some instructions which
-would otherwise not be usable (such as fsin on X86).
-
-=item B<--enable-correct-eh-support>
-
-Instruct the B<lowerinvoke> pass to insert code for correct exception handling
-support. This is expensive and is by default omitted for efficiency.
-
-=item B<--stats>
-
-Print statistics recorded by code-generation passes.
-
-=item B<--time-passes>
-
-Record the amount of time needed for each pass and print a report to standard
-error.
-
-=item B<--load>=F<dso_path>
-
-Dynamically load F<dso_path> (a path to a dynamically shared object) that
-implements an LLVM target. This will permit the target name to be used with the
-B<-march> option so that code can be generated for that target.
-
-=back
-
-=head2 Tuning/Configuration Options
-
-=over
-
-=item B<--print-machineinstrs>
-
-Print generated machine code between compilation phases (useful for debugging).
-
-=item B<--regalloc>=I<allocator>
-
-Specify the register allocator to use. The default I<allocator> is I<local>.
-Valid register allocators are:
-
-=over
-
-=item I<simple>
-
-Very simple "always spill" register allocator
-
-=item I<local>
-
-Local register allocator
-
-=item I<linearscan>
-
-Linear scan global register allocator
-
-=item I<iterativescan>
-
-Iterative scan global register allocator
-
-=back
-
-=item B<--spiller>=I<spiller>
-
-Specify the spiller to use for register allocators that support it. Currently
-this option is used only by the linear scan register allocator. The default
-I<spiller> is I<local>. Valid spillers are:
-
-=over
-
-=item I<simple>
-
-Simple spiller
-
-=item I<local>
-
-Local spiller
-
-=back
-
-=back
-
-=head2 Intel IA-32-specific Options
-
-=over
-
-=item B<--x86-asm-syntax=att|intel>
-
-Specify whether to emit assembly code in AT&T syntax (the default) or Intel
-syntax.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llc> succeeds, it will exit with 0. Otherwise, if an error occurs,
-it will exit with a non-zero value.
-
-=head1 SEE ALSO
-
-L<lli|lli>
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llc.rst b/docs/CommandGuide/llc.rst
new file mode 100644
index 0000000..6f1c486
--- /dev/null
+++ b/docs/CommandGuide/llc.rst
@@ -0,0 +1,251 @@
+llc - LLVM static compiler
+==========================
+
+
+SYNOPSIS
+--------
+
+
+**llc** [*options*] [*filename*]
+
+
+DESCRIPTION
+-----------
+
+
+The **llc** command compiles LLVM source inputs into assembly language for a
+specified architecture. The assembly language output can then be passed through
+a native assembler and linker to generate a native executable.
+
+The choice of architecture for the output assembly code is automatically
+determined from the input file, unless the **-march** option is used to override
+the default.
+
+
+OPTIONS
+-------
+
+
+If *filename* is - or omitted, **llc** reads from standard input. Otherwise, it
+will read from *filename*. Inputs can be in either the LLVM assembly language
+format (.ll) or the LLVM bitcode format (.bc).
+
+If the **-o** option is omitted, then **llc** will send its output to standard
+output if the input is from standard input. If the **-o** option specifies -,
+then the output will also be sent to standard output.
+
+If no **-o** option is specified and an input file other than - is specified,
+then **llc** creates the output filename by taking the input filename,
+removing any existing *.bc* extension, and adding a *.s* suffix.
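+
+For example, under these rules the following (with an illustrative input
+file) reads *foo.bc* and writes its assembly to *foo.s*::
+
+  llc foo.bc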
+
+Other **llc** options are as follows:
+
+End-user Options
+~~~~~~~~~~~~~~~~
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**-O**\ =\ *uint*
+
+ Generate code at different optimization levels. These correspond to the *-O0*,
+ *-O1*, *-O2*, and *-O3* optimization levels used by **llvm-gcc** and
+ **clang**.
+
+
+
+**-mtriple**\ =\ *target triple*
+
+ Override the target triple specified in the input file with the specified
+ string.
+
+
+
+**-march**\ =\ *arch*
+
+ Specify the architecture for which to generate assembly, overriding the target
+ encoded in the input file. See the output of **llc -help** for a list of
+ valid architectures. By default this is inferred from the target triple or
+ autodetected to the current architecture.
+
+
+
+**-mcpu**\ =\ *cpuname*
+
+ Specify a specific chip in the current architecture to generate code for.
+ By default this is inferred from the target triple and autodetected to
+ the current architecture. For a list of available CPUs, use:
+ **llvm-as < /dev/null | llc -march=xyz -mcpu=help**
+
+
+
+**-mattr**\ =\ *a1,+a2,-a3,...*
+
+ Override or control specific attributes of the target, such as whether SIMD
+ operations are enabled or not. The default set of attributes is set by the
+ current CPU. For a list of available attributes, use:
+ **llvm-as < /dev/null | llc -march=xyz -mattr=help**
+
+
+
+**--disable-fp-elim**
+
+ Disable frame pointer elimination optimization.
+
+
+
+**--disable-excess-fp-precision**
+
+ Disable optimizations that may produce excess precision for floating point.
+ Note that this option can dramatically slow down code on some systems
+ (e.g. X86).
+
+
+
+**--enable-no-infs-fp-math**
+
+ Enable optimizations that assume no Inf values.
+
+
+
+**--enable-no-nans-fp-math**
+
+ Enable optimizations that assume no NAN values.
+
+
+
+**--enable-unsafe-fp-math**
+
+ Enable optimizations that make unsafe assumptions about IEEE math (e.g. that
+ addition is associative) or may not work for all input ranges. These
+ optimizations allow the code generator to make use of some instructions which
+ would otherwise not be usable (such as fsin on X86).
+
+
+
+**--enable-correct-eh-support**
+
+ Instruct the **lowerinvoke** pass to insert code for correct exception handling
+ support. This is expensive and is by default omitted for efficiency.
+
+
+
+**--stats**
+
+ Print statistics recorded by code-generation passes.
+
+
+
+**--time-passes**
+
+ Record the amount of time needed for each pass and print a report to standard
+ error.
+
+
+
+**--load**\ =\ *dso_path*
+
+ Dynamically load *dso_path* (a path to a dynamically shared object) that
+ implements an LLVM target. This will permit the target name to be used with the
+ **-march** option so that code can be generated for that target.
+
+
+
+
+Tuning/Configuration Options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+
+**--print-machineinstrs**
+
+ Print generated machine code between compilation phases (useful for debugging).
+
+
+
+**--regalloc**\ =\ *allocator*
+
+ Specify the register allocator to use. The default *allocator* is *local*.
+ Valid register allocators are:
+
+
+ *simple*
+
+ Very simple "always spill" register allocator
+
+
+
+ *local*
+
+ Local register allocator
+
+
+
+ *linearscan*
+
+ Linear scan global register allocator
+
+
+
+ *iterativescan*
+
+ Iterative scan global register allocator
+
+
+
+
+
+**--spiller**\ =\ *spiller*
+
+ Specify the spiller to use for register allocators that support it. Currently
+ this option is used only by the linear scan register allocator. The default
+ *spiller* is *local*. Valid spillers are:
+
+
+ *simple*
+
+ Simple spiller
+
+
+
+ *local*
+
+ Local spiller
+
+
+
+
+
+
+Intel IA-32-specific Options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+
+**--x86-asm-syntax=att|intel**
+
+ Specify whether to emit assembly code in AT&T syntax (the default) or Intel
+ syntax.
+
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llc** succeeds, it will exit with 0. Otherwise, if an error occurs,
+it will exit with a non-zero value.
+
+
+SEE ALSO
+--------
+
+
+lli
diff --git a/docs/CommandGuide/lli.pod b/docs/CommandGuide/lli.pod
deleted file mode 100644
index a313a31..0000000
--- a/docs/CommandGuide/lli.pod
+++ /dev/null
@@ -1,219 +0,0 @@
-=pod
-
-=head1 NAME
-
-lli - directly execute programs from LLVM bitcode
-
-=head1 SYNOPSIS
-
-B<lli> [I<options>] [I<filename>] [I<program args>]
-
-=head1 DESCRIPTION
-
-B<lli> directly executes programs in LLVM bitcode format. It takes a program
-in LLVM bitcode format and executes it using a just-in-time compiler, if one is
-available for the current architecture, or an interpreter. B<lli> takes all of
-the same code generator options as L<llc|llc>, but they are only effective when
-B<lli> is using the just-in-time compiler.
-
-If I<filename> is not specified, then B<lli> reads the LLVM bitcode for the
-program from standard input.
-
-The optional I<args> specified on the command line are passed to the program as
-arguments.
-
-=head1 GENERAL OPTIONS
-
-=over
-
-=item B<-fake-argv0>=I<executable>
-
-Override the C<argv[0]> value passed into the executing program.
-
-=item B<-force-interpreter>=I<{false,true}>
-
-If set to true, use the interpreter even if a just-in-time compiler is available
-for this architecture. Defaults to false.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-load>=I<pluginfilename>
-
-Causes B<lli> to load the plugin (shared object) named I<pluginfilename> and use
-it for optimization.
-
-=item B<-stats>
-
-Print statistics from the code-generation passes. This is only meaningful for
-the just-in-time compiler, at present.
-
-=item B<-time-passes>
-
-Record the amount of time needed for each code-generation pass and print it to
-standard error.
-
-=item B<-version>
-
-Print out the version of B<lli> and exit without doing anything else.
-
-=back
-
-=head1 TARGET OPTIONS
-
-=over
-
-=item B<-mtriple>=I<target triple>
-
-Override the target triple specified in the input bitcode file with the
-specified string. This may result in a crash if you pick an
-architecture which is not compatible with the current system.
-
-=item B<-march>=I<arch>
-
-Specify the architecture for which to generate assembly, overriding the target
-encoded in the bitcode file. See the output of B<llc -help> for a list of
-valid architectures. By default this is inferred from the target triple or
-autodetected to the current architecture.
-
-=item B<-mcpu>=I<cpuname>
-
-Specify a specific chip in the current architecture to generate code for.
-By default this is inferred from the target triple and autodetected to
-the current architecture. For a list of available CPUs, use:
-B<llvm-as E<lt> /dev/null | llc -march=xyz -mcpu=help>
-
-=item B<-mattr>=I<a1,+a2,-a3,...>
-
-Override or control specific attributes of the target, such as whether SIMD
-operations are enabled or not. The default set of attributes is set by the
-current CPU. For a list of available attributes, use:
-B<llvm-as E<lt> /dev/null | llc -march=xyz -mattr=help>
-
-=back
-
-
-=head1 FLOATING POINT OPTIONS
-
-=over
-
-=item B<-disable-excess-fp-precision>
-
-Disable optimizations that may increase floating point precision.
-
-=item B<-enable-no-infs-fp-math>
-
-Enable optimizations that assume no Inf values.
-
-=item B<-enable-no-nans-fp-math>
-
-Enable optimizations that assume no NAN values.
-
-=item B<-enable-unsafe-fp-math>
-
-Causes B<lli> to enable optimizations that may decrease floating point
-precision.
-
-=item B<-soft-float>
-
-Causes B<lli> to generate software floating point library calls instead of
-equivalent hardware instructions.
-
-=back
-
-=head1 CODE GENERATION OPTIONS
-
-=over
-
-=item B<-code-model>=I<model>
-
-Choose the code model from:
-
- default: Target default code model
- small: Small code model
- kernel: Kernel code model
- medium: Medium code model
- large: Large code model
-
-=item B<-disable-post-RA-scheduler>
-
-Disable scheduling after register allocation.
-
-=item B<-disable-spill-fusing>
-
-Disable fusing of spill code into instructions.
-
-=item B<-enable-correct-eh-support>
-
-Make the -lowerinvoke pass insert expensive, but correct, EH code.
-
-=item B<-jit-enable-eh>
-
-Exception handling should be enabled in the just-in-time compiler.
-
-=item B<-join-liveintervals>
-
-Coalesce copies (default=true).
-
-=item B<-nozero-initialized-in-bss>
-
-Don't place zero-initialized symbols into the BSS section.
-
-=item B<-pre-RA-sched>=I<scheduler>
-
-Instruction schedulers available (before register allocation):
-
- =default: Best scheduler for the target
- =none: No scheduling: breadth first sequencing
- =simple: Simple two pass scheduling: minimize critical path and maximize processor utilization
- =simple-noitin: Simple two pass scheduling: Same as simple except using generic latency
- =list-burr: Bottom-up register reduction list scheduling
- =list-tdrr: Top-down register reduction list scheduling
- =list-td: Top-down list scheduler
-
-=item B<-regalloc>=I<allocator>
-
-Register allocator to use (default=linearscan)
-
- =bigblock: Big-block register allocator
- =linearscan: linear scan register allocator
- =local: local register allocator
- =simple: simple register allocator
-
-=item B<-relocation-model>=I<model>
-
-Choose relocation model from:
-
- =default: Target default relocation model
- =static: Non-relocatable code
- =pic: Fully relocatable, position independent code
- =dynamic-no-pic: Relocatable external references, non-relocatable code
-
-=item B<-spiller>
-
-Spiller to use (default=local)
-
- =simple: simple spiller
- =local: local spiller
-
-=item B<-x86-asm-syntax>=I<syntax>
-
-Choose style of code to emit from X86 backend:
-
- =att: Emit AT&T-style assembly
- =intel: Emit Intel-style assembly
-
-=back
-
-=head1 EXIT STATUS
-
-If B<lli> fails to load the program, it will exit with an exit code of 1.
-Otherwise, it will return the exit code of the program it executes.
-
-=head1 SEE ALSO
-
-L<llc|llc>
-
-=head1 AUTHOR
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/lli.rst b/docs/CommandGuide/lli.rst
new file mode 100644
index 0000000..7cc1284
--- /dev/null
+++ b/docs/CommandGuide/lli.rst
@@ -0,0 +1,300 @@
+lli - directly execute programs from LLVM bitcode
+=================================================
+
+
+SYNOPSIS
+--------
+
+
+**lli** [*options*] [*filename*] [*program args*]
+
+
+DESCRIPTION
+-----------
+
+
+**lli** directly executes programs in LLVM bitcode format. It takes a program
+in LLVM bitcode format and executes it using a just-in-time compiler, if one is
+available for the current architecture, or an interpreter. **lli** takes all of
+the same code generator options as **llc**, but they are only effective when
+**lli** is using the just-in-time compiler.
+
+If *filename* is not specified, then **lli** reads the LLVM bitcode for the
+program from standard input.
+
+The optional *args* specified on the command line are passed to the program as
+arguments.
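+
+For example, assuming a bitcode file named *hello.bc* (an illustrative name),
+the following executes it and passes it one program argument::
+
+  lli hello.bc some-argument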
+
+
+GENERAL OPTIONS
+---------------
+
+
+
+**-fake-argv0**\ =\ *executable*
+
+ Override the ``argv[0]`` value passed into the executing program.
+
+
+
+**-force-interpreter**\ =\ *{false,true}*
+
+ If set to true, use the interpreter even if a just-in-time compiler is available
+ for this architecture. Defaults to false.
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**-load**\ =\ *pluginfilename*
+
+ Causes **lli** to load the plugin (shared object) named *pluginfilename* and use
+ it for optimization.
+
+
+
+**-stats**
+
+ Print statistics from the code-generation passes. This is only meaningful for
+ the just-in-time compiler, at present.
+
+
+
+**-time-passes**
+
+ Record the amount of time needed for each code-generation pass and print it to
+ standard error.
+
+
+
+**-version**
+
+ Print out the version of **lli** and exit without doing anything else.
+
+
+
+
+TARGET OPTIONS
+--------------
+
+
+
+**-mtriple**\ =\ *target triple*
+
+ Override the target triple specified in the input bitcode file with the
+ specified string. This may result in a crash if you pick an
+ architecture which is not compatible with the current system.
+
+
+
+**-march**\ =\ *arch*
+
+ Specify the architecture for which to generate assembly, overriding the target
+ encoded in the bitcode file. See the output of **llc -help** for a list of
+ valid architectures. By default this is inferred from the target triple or
+ autodetected to the current architecture.
+
+
+
+**-mcpu**\ =\ *cpuname*
+
+ Specify a specific chip in the current architecture to generate code for.
+ By default this is inferred from the target triple and autodetected to
+ the current architecture. For a list of available CPUs, use:
+ **llvm-as < /dev/null | llc -march=xyz -mcpu=help**
+
+
+
+**-mattr**\ =\ *a1,+a2,-a3,...*
+
+ Override or control specific attributes of the target, such as whether SIMD
+ operations are enabled or not. The default set of attributes is set by the
+ current CPU. For a list of available attributes, use:
+ **llvm-as < /dev/null | llc -march=xyz -mattr=help**
+
+
+
+
+FLOATING POINT OPTIONS
+----------------------
+
+
+
+**-disable-excess-fp-precision**
+
+ Disable optimizations that may increase floating point precision.
+
+
+
+**-enable-no-infs-fp-math**
+
+ Enable optimizations that assume no Inf values.
+
+
+
+**-enable-no-nans-fp-math**
+
+ Enable optimizations that assume no NAN values.
+
+
+
+**-enable-unsafe-fp-math**
+
+ Causes **lli** to enable optimizations that may decrease floating point
+ precision.
+
+
+
+**-soft-float**
+
+ Causes **lli** to generate software floating point library calls instead of
+ equivalent hardware instructions.
+
+
+
+
+CODE GENERATION OPTIONS
+-----------------------
+
+
+
+**-code-model**\ =\ *model*
+
+ Choose the code model from:
+
+
+ .. code-block:: perl
+
+ default: Target default code model
+ small: Small code model
+ kernel: Kernel code model
+ medium: Medium code model
+ large: Large code model
+
+
+
+
+**-disable-post-RA-scheduler**
+
+ Disable scheduling after register allocation.
+
+
+
+**-disable-spill-fusing**
+
+ Disable fusing of spill code into instructions.
+
+
+
+**-enable-correct-eh-support**
+
+ Make the -lowerinvoke pass insert expensive, but correct, EH code.
+
+
+
+**-jit-enable-eh**
+
+ Exception handling should be enabled in the just-in-time compiler.
+
+
+
+**-join-liveintervals**
+
+ Coalesce copies (default=true).
+
+
+
+**-nozero-initialized-in-bss**
+
+ Don't place zero-initialized symbols into the BSS section.
+
+
+
+**-pre-RA-sched**\ =\ *scheduler*
+
+ Instruction schedulers available (before register allocation):
+
+
+ .. code-block:: perl
+
+ =default: Best scheduler for the target
+ =none: No scheduling: breadth first sequencing
+ =simple: Simple two pass scheduling: minimize critical path and maximize processor utilization
+ =simple-noitin: Simple two pass scheduling: Same as simple except using generic latency
+ =list-burr: Bottom-up register reduction list scheduling
+ =list-tdrr: Top-down register reduction list scheduling
+ =list-td: Top-down list scheduler
+
+
+
+
+**-regalloc**\ =\ *allocator*
+
+ Register allocator to use (default=linearscan)
+
+
+ .. code-block:: perl
+
+ =bigblock: Big-block register allocator
+ =linearscan: linear scan register allocator
+ =local: local register allocator
+ =simple: simple register allocator
+
+
+
+
+**-relocation-model**\ =\ *model*
+
+ Choose relocation model from:
+
+
+ .. code-block:: perl
+
+ =default: Target default relocation model
+ =static: Non-relocatable code
+ =pic: Fully relocatable, position independent code
+ =dynamic-no-pic: Relocatable external references, non-relocatable code
+
+
+
+
+**-spiller**
+
+ Spiller to use (default=local)
+
+
+ .. code-block:: perl
+
+ =simple: simple spiller
+ =local: local spiller
+
+
+
+
+**-x86-asm-syntax**\ =\ *syntax*
+
+ Choose style of code to emit from X86 backend:
+
+
+ .. code-block:: perl
+
+ =att: Emit AT&T-style assembly
+ =intel: Emit Intel-style assembly
+
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **lli** fails to load the program, it will exit with an exit code of 1.
+Otherwise, it will return the exit code of the program it executes.
+
+
+SEE ALSO
+--------
+
+
+llc
diff --git a/docs/CommandGuide/llvm-ar.pod b/docs/CommandGuide/llvm-ar.pod
deleted file mode 100644
index a8f01b0..0000000
--- a/docs/CommandGuide/llvm-ar.pod
+++ /dev/null
@@ -1,406 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-ar - LLVM archiver
-
-=head1 SYNOPSIS
-
-B<llvm-ar> [-]{dmpqrtx}[Rabfikouz] [relpos] [count] <archive> [files...]
-
-
-=head1 DESCRIPTION
-
-The B<llvm-ar> command is similar to the common Unix utility, C<ar>. It
-archives several files together into a single file. The intent for this is
-to produce archive libraries by LLVM bitcode that can be linked into an
-LLVM program. However, the archive can contain any kind of file. By default,
-B<llvm-ar> generates a symbol table that makes linking faster because
-only the symbol table needs to be consulted, not each individual file member
-of the archive.
-
-The B<llvm-ar> command can be used to I<read> both SVR4 and BSD style archive
-files. However, it cannot be used to write them. While the B<llvm-ar> command
-produces files that are I<almost> identical to the format used by other C<ar>
-implementations, it has two significant departures in order to make the
-archive appropriate for LLVM. The first departure is that B<llvm-ar> only
-uses BSD4.4 style long path names (stored immediately after the header) and
-never contains a string table for long names. The second departure is that the
-symbol table is formatted for efficient construction of an in-memory data
-structure that permits rapid (red-black tree) lookups. Consequently, archives
-produced with B<llvm-ar> usually won't be readable or editable with any
-C<ar> implementation or useful for linking. Using the C<f> modifier to flatten
-file names will make the archive readable by other C<ar> implementations
-but not for linking because the symbol table format for LLVM is unique. If an
-SVR4 or BSD style archive is used with the C<r> (replace) or C<q> (quick
-update) operations, the archive will be reconstructed in LLVM format. This
-means that the string table will be dropped (in deference to BSD 4.4 long names)
-and an LLVM symbol table will be added (by default). The system symbol table
-will be retained.
-
-Here's where B<llvm-ar> departs from previous C<ar> implementations:
-
-=over
-
-=item I<Symbol Table>
-
-Since B<llvm-ar> is intended to archive bitcode files, the symbol table
-won't make much sense to anything but LLVM. Consequently, the symbol table's
-format has been simplified. It consists simply of a sequence of pairs
-of a file member index number as a 4-byte LSB integer and a null-terminated
-string.
-
-=item I<Long Paths>
-
-Some C<ar> implementations (SVR4) use a separate file member to record long
-path names (> 15 characters). B<llvm-ar> takes the BSD 4.4 and Mac OS X
-approach which is to simply store the full path name immediately preceding
-the data for the file. The path name is null terminated and may contain the
-slash (/) character.
-
-=item I<Compression>
-
-B<llvm-ar> can compress the members of an archive to save space. The
-compression used depends on what's available on the platform and what choices
-the LLVM Compressor utility makes. It generally favors bzip2 but will select
-between "no compression" or bzip2 depending on what makes sense for the
-file's content.
-
-=item I<Directory Recursion>
-
-Most C<ar> implementations do not recurse through directories but simply
-ignore directories if they are presented to the program in the F<files>
-option. B<llvm-ar>, however, can recurse through directory structures and
-add all the files under a directory, if requested.
-
-=item I<TOC Verbose Output>
-
-When B<llvm-ar> prints out the verbose table of contents (C<tv> option), it
-precedes the usual output with a character indicating the basic kind of
-content in the file. A blank means the file is a regular file. A 'Z' means
-the file is compressed. A 'B' means the file is an LLVM bitcode file. An
-'S' means the file is the symbol table.
-
-=back
-
-=head1 OPTIONS
-
-The options to B<llvm-ar> are compatible with other C<ar> implementations.
-However, there are a few modifiers (F<zR>) that are not found in other
-C<ar>s. The options to B<llvm-ar> specify a single basic operation to
-perform on the archive, a variety of modifiers for that operation, the
-name of the archive file, and an optional list of file names. These options
-are used to determine how B<llvm-ar> should process the archive file.
-
-The Operations and Modifiers are explained in the sections below. The minimal
-set of options is at least one operation and the name of the archive. Typically
-archive files end with a C<.a> suffix, but this is not required. Following
-the F<archive-name> comes a list of F<files> that indicate the specific members
-of the archive to operate on. If the F<files> option is not specified, it
-generally means either "none" or "all" members, depending on the operation.
-
-=head2 Operations
-
-=over
-
-=item d
-
-Delete files from the archive. No modifiers are applicable to this operation.
-The F<files> options specify which members should be removed from the
-archive. It is not an error if a specified file does not appear in the archive.
-If no F<files> are specified, the archive is not modified.
-
-=item m[abi]
-
-Move files from one location in the archive to another. The F<a>, F<b>, and
-F<i> modifiers apply to this operation. The F<files> will all be moved
-to the location given by the modifiers. If no modifiers are used, the files
-will be moved to the end of the archive. If no F<files> are specified, the
-archive is not modified.
-
-=item p[k]
-
-Print files to the standard output. The F<k> modifier applies to this
-operation. This operation simply prints the F<files> indicated to the
-standard output. If no F<files> are specified, the entire archive is printed.
-Printing bitcode files is ill-advised as they might confuse your terminal
-settings. The F<p> operation never modifies the archive.
-
-=item q[Rfz]
-
-Quickly append files to the end of the archive. The F<R>, F<f>, and F<z>
-modifiers apply to this operation. This operation quickly adds the
-F<files> to the archive without checking for duplicates that should be
-removed first. If no F<files> are specified, the archive is not modified.
-Because of the way that B<llvm-ar> constructs the archive file, its dubious
-whether the F<q> operation is any faster than the F<r> operation.
-
-=item r[Rabfuz]
-
-Replace or insert file members. The F<R>, F<a>, F<b>, F<f>, F<u>, and F<z>
-modifiers apply to this operation. This operation will replace existing
-F<files> or insert them at the end of the archive if they do not exist. If no
-F<files> are specified, the archive is not modified.
-
-=item t[v]
-
-Print the table of contents. Without any modifiers, this operation just prints
-the names of the members to the standard output. With the F<v> modifier,
-B<llvm-ar> also prints out the file type (B=bitcode, Z=compressed, S=symbol
-table, blank=regular file), the permission mode, the owner and group, the
-size, and the date. If any F<files> are specified, the listing is only for
-those files. If no F<files> are specified, the table of contents for the
-whole archive is printed.
-
-=item x[oP]
-
-Extract archive members back to files. The F<o> modifier applies to this
-operation. This operation retrieves the indicated F<files> from the archive
-and writes them back to the operating system's file system. If no
-F<files> are specified, the entire archive is extract.
-
-=back
-
-=head2 Modifiers (operation specific)
-
-The modifiers below are specific to certain operations. See the Operations
-section (above) to determine which modifiers are applicable to which operations.
-
-=over
-
-=item [a]
-
-When inserting or moving member files, this option specifies the destination of
-the new files as being C<a>fter the F<relpos> member. If F<relpos> is not found,
-the files are placed at the end of the archive.
-
-=item [b]
-
-When inserting or moving member files, this option specifies the destination of
-the new files as being C<b>efore the F<relpos> member. If F<relpos> is not
-found, the files are placed at the end of the archive. This modifier is
-identical to the the F<i> modifier.
-
-=item [f]
-
-Normally, B<llvm-ar> stores the full path name to a file as presented to it on
-the command line. With this option, truncated (15 characters max) names are
-used. This ensures name compatibility with older versions of C<ar> but may also
-thwart correct extraction of the files (duplicates may overwrite). If used with
-the F<R> option, the directory recursion will be performed but the file names
-will all be C<f>lattened to simple file names.
-
-=item [i]
-
-A synonym for the F<b> option.
-
-=item [k]
-
-Normally, B<llvm-ar> will not print the contents of bitcode files when the
-F<p> operation is used. This modifier defeats the default and allows the
-bitcode members to be printed.
-
-=item [N]
-
-This option is ignored by B<llvm-ar> but provided for compatibility.
-
-=item [o]
-
-When extracting files, this option will cause B<llvm-ar> to preserve the
-original modification times of the files it writes.
-
-=item [P]
-
-use full path names when matching
-
-=item [R]
-
-This modifier instructions the F<r> option to recursively process directories.
-Without F<R>, directories are ignored and only those F<files> that refer to
-files will be added to the archive. When F<R> is used, any directories specified
-with F<files> will be scanned (recursively) to find files to be added to the
-archive. Any file whose name begins with a dot will not be added.
-
-=item [u]
-
-When replacing existing files in the archive, only replace those files that have
-a time stamp than the time stamp of the member in the archive.
-
-=item [z]
-
-When inserting or replacing any file in the archive, compress the file first.
-This
-modifier is safe to use when (previously) compressed bitcode files are added to
-the archive; the compressed bitcode files will not be doubly compressed.
-
-=back
-
-=head2 Modifiers (generic)
-
-The modifiers below may be applied to any operation.
-
-=over
-
-=item [c]
-
-For all operations, B<llvm-ar> will always create the archive if it doesn't
-exist. Normally, B<llvm-ar> will print a warning message indicating that the
-archive is being created. Using this modifier turns off that warning.
-
-=item [s]
-
-This modifier requests that an archive index (or symbol table) be added to the
-archive. This is the default mode of operation. The symbol table will contain
-all the externally visible functions and global variables defined by all the
-bitcode files in the archive. Using this modifier is more efficient that using
-L<llvm-ranlib|llvm-ranlib> which also creates the symbol table.
-
-=item [S]
-
-This modifier is the opposite of the F<s> modifier. It instructs B<llvm-ar> to
-not build the symbol table. If both F<s> and F<S> are used, the last modifier to
-occur in the options will prevail.
-
-=item [v]
-
-This modifier instructs B<llvm-ar> to be verbose about what it is doing. Each
-editing operation taken against the archive will produce a line of output saying
-what is being done.
-
-=back
-
-=head1 STANDARDS
-
-The B<llvm-ar> utility is intended to provide a superset of the IEEE Std 1003.2
-(POSIX.2) functionality for C<ar>. B<llvm-ar> can read both SVR4 and BSD4.4 (or
-Mac OS X) archives. If the C<f> modifier is given to the C<x> or C<r> operations
-then B<llvm-ar> will write SVR4 compatible archives. Without this modifier,
-B<llvm-ar> will write BSD4.4 compatible archives that have long names
-immediately after the header and indicated using the "#1/ddd" notation for the
-name in the header.
-
-=head1 FILE FORMAT
-
-The file format for LLVM Archive files is similar to that of BSD 4.4 or Mac OSX
-archive files. In fact, except for the symbol table, the C<ar> commands on those
-operating systems should be able to read LLVM archive files. The details of the
-file format follow.
-
-Each archive begins with the archive magic number which is the eight printable
-characters "!<arch>\n" where \n represents the newline character (0x0A).
-Following the magic number, the file is composed of even length members that
-begin with an archive header and end with a \n padding character if necessary
-(to make the length even). Each file member is composed of a header (defined
-below), an optional newline-terminated "long file name" and the contents of
-the file.
-
-The fields of the header are described in the items below. All fields of the
-header contain only ASCII characters, are left justified and are right padded
-with space characters.
-
-=over
-
-=item name - char[16]
-
-This field of the header provides the name of the archive member. If the name is
-longer than 15 characters or contains a slash (/) character, then this field
-contains C<#1/nnn> where C<nnn> provides the length of the name and the C<#1/>
-is literal. In this case, the actual name of the file is provided in the C<nnn>
-bytes immediately following the header. If the name is 15 characters or less, it
-is contained directly in this field and terminated with a slash (/) character.
-
-=item date - char[12]
-
-This field provides the date of modification of the file in the form of a
-decimal encoded number that provides the number of seconds since the epoch
-(since 00:00:00 Jan 1, 1970) per Posix specifications.
-
-=item uid - char[6]
-
-This field provides the user id of the file encoded as a decimal ASCII string.
-This field might not make much sense on non-Unix systems. On Unix, it is the
-same value as the st_uid field of the stat structure returned by the stat(2)
-operating system call.
-
-=item gid - char[6]
-
-This field provides the group id of the file encoded as a decimal ASCII string.
-This field might not make much sense on non-Unix systems. On Unix, it is the
-same value as the st_gid field of the stat structure returned by the stat(2)
-operating system call.
-
-=item mode - char[8]
-
-This field provides the access mode of the file encoded as an octal ASCII
-string. This field might not make much sense on non-Unix systems. On Unix, it
-is the same value as the st_mode field of the stat structure returned by the
-stat(2) operating system call.
-
-=item size - char[10]
-
-This field provides the size of the file, in bytes, encoded as a decimal ASCII
-string. If the size field is negative (starts with a minus sign, 0x02D), then
-the archive member is stored in compressed form. The first byte of the archive
-member's data indicates the compression type used. A value of 0 (0x30) indicates
-that no compression was used. A value of 2 (0x32) indicates that bzip2
-compression was used.
-
-=item fmag - char[2]
-
-This field is the archive file member magic number. Its content is always the
-two characters back tick (0x60) and newline (0x0A). This provides some measure
-utility in identifying archive files that have been corrupted.
-
-=back
-
-The LLVM symbol table has the special name "#_LLVM_SYM_TAB_#". It is presumed
-that no regular archive member file will want this name. The LLVM symbol table
-is simply composed of a sequence of triplets: byte offset, length of symbol,
-and the symbol itself. Symbols are not null or newline terminated. Here are
-the details on each of these items:
-
-=over
-
-=item offset - vbr encoded 32-bit integer
-
-The offset item provides the offset into the archive file where the bitcode
-member is stored that is associated with the symbol. The offset value is 0
-based at the start of the first "normal" file member. To derive the actual
-file offset of the member, you must add the number of bytes occupied by the file
-signature (8 bytes) and the symbol tables. The value of this item is encoded
-using variable bit rate encoding to reduce the size of the symbol table.
-Variable bit rate encoding uses the high bit (0x80) of each byte to indicate
-if there are more bytes to follow. The remaining 7 bits in each byte carry bits
-from the value. The final byte does not have the high bit set.
-
-=item length - vbr encoded 32-bit integer
-
-The length item provides the length of the symbol that follows. Like this
-I<offset> item, the length is variable bit rate encoded.
-
-=item symbol - character array
-
-The symbol item provides the text of the symbol that is associated with the
-I<offset>. The symbol is not terminated by any character. Its length is provided
-by the I<length> field. Note that is allowed (but unwise) to use non-printing
-characters (even 0x00) in the symbol. This allows for multiple encodings of
-symbol names.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-ar> succeeds, it will exit with 0. A usage error, results
-in an exit code of 1. A hard (file system typically) error results in an
-exit code of 2. Miscellaneous or unknown errors result in an
-exit code of 3.
-
-=head1 SEE ALSO
-
-L<llvm-ranlib|llvm-ranlib>, ar(1)
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-ar.rst b/docs/CommandGuide/llvm-ar.rst
new file mode 100644
index 0000000..8ff4192
--- /dev/null
+++ b/docs/CommandGuide/llvm-ar.rst
@@ -0,0 +1,458 @@
+llvm-ar - LLVM archiver
+=======================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-ar** [-]{dmpqrtx}[Rabfikou] [relpos] [count] <archive> [files...]
+
+
+DESCRIPTION
+-----------
+
+
+The **llvm-ar** command is similar to the common Unix utility, ``ar``. It
+archives several files together into a single file. The intent is to produce
+archive libraries of LLVM bitcode files that can be linked into an LLVM
+program. However, the archive can contain any kind of file. By default,
+**llvm-ar** generates a symbol table that makes linking faster because
+only the symbol table needs to be consulted, not each individual file member
+of the archive.
+
+The **llvm-ar** command can be used to *read* both SVR4 and BSD style archive
+files. However, it cannot be used to write them. While the **llvm-ar** command
+produces files that are *almost* identical to the format used by other ``ar``
+implementations, it has two significant departures in order to make the
+archive appropriate for LLVM. The first departure is that **llvm-ar** only
+uses BSD4.4 style long path names (stored immediately after the header) and
+never contains a string table for long names. The second departure is that the
+symbol table is formatted for efficient construction of an in-memory data
+structure that permits rapid (red-black tree) lookups. Consequently, archives
+produced with **llvm-ar** usually won't be readable or editable with any
+``ar`` implementation or useful for linking. Using the ``f`` modifier to
+flatten file names will make the archive readable by other ``ar``
+implementations, but not usable for linking, because the symbol table format
+for LLVM is unique. If an
+SVR4 or BSD style archive is used with the ``r`` (replace) or ``q`` (quick
+update) operations, the archive will be reconstructed in LLVM format. This
+means that the string table will be dropped (in deference to BSD 4.4 long names)
+and an LLVM symbol table will be added (by default). The system symbol table
+will be retained.
+
+Here's where **llvm-ar** departs from previous ``ar`` implementations:
+
+
+*Symbol Table*
+
+  Since **llvm-ar** is intended to archive bitcode files, the symbol table
+  won't make much sense to anything but LLVM. Consequently, the symbol table's
+  format has been simplified. It consists simply of a sequence of pairs: a
+  file member index number, as an LSB (least significant byte first) 4-byte
+  integer, and a null-terminated string.
+
+
+
+*Long Paths*
+
+ Some ``ar`` implementations (SVR4) use a separate file member to record long
+ path names (> 15 characters). **llvm-ar** takes the BSD 4.4 and Mac OS X
+ approach which is to simply store the full path name immediately preceding
+ the data for the file. The path name is null terminated and may contain the
+ slash (/) character.
+
+
+
+*Directory Recursion*
+
+ Most ``ar`` implementations do not recurse through directories but simply
+ ignore directories if they are presented to the program in the *files*
+ option. **llvm-ar**, however, can recurse through directory structures and
+ add all the files under a directory, if requested.
+
+
+
+*TOC Verbose Output*
+
+ When **llvm-ar** prints out the verbose table of contents (``tv`` option), it
+ precedes the usual output with a character indicating the basic kind of
+ content in the file. A blank means the file is a regular file. A 'B' means
+ the file is an LLVM bitcode file. An 'S' means the file is the symbol table.
+
+
+
+
+OPTIONS
+-------
+
+
+The options to **llvm-ar** are compatible with other ``ar`` implementations.
+However, there are a few modifiers (*R*) that are not found in other ``ar``
+implementations. The options to **llvm-ar** specify a single basic operation to
+perform on the archive, a variety of modifiers for that operation, the name of
+the archive file, and an optional list of file names. These options are used to
+determine how **llvm-ar** should process the archive file.
+
+The Operations and Modifiers are explained in the sections below. The minimal
+set of options is at least one operation and the name of the archive. Typically
+archive files end with a ``.a`` suffix, but this is not required. Following
+the *archive-name* comes a list of *files* that indicate the specific members
+of the archive to operate on. If the *files* option is not specified, it
+generally means either "none" or "all" members, depending on the operation.
+
+Operations
+~~~~~~~~~~
+
+
+
+d
+
+ Delete files from the archive. No modifiers are applicable to this operation.
+ The *files* options specify which members should be removed from the
+ archive. It is not an error if a specified file does not appear in the archive.
+ If no *files* are specified, the archive is not modified.
+
+
+
+m[abi]
+
+ Move files from one location in the archive to another. The *a*, *b*, and
+ *i* modifiers apply to this operation. The *files* will all be moved
+ to the location given by the modifiers. If no modifiers are used, the files
+ will be moved to the end of the archive. If no *files* are specified, the
+ archive is not modified.
+
+
+
+p[k]
+
+ Print files to the standard output. The *k* modifier applies to this
+ operation. This operation simply prints the *files* indicated to the
+ standard output. If no *files* are specified, the entire archive is printed.
+ Printing bitcode files is ill-advised as they might confuse your terminal
+ settings. The *p* operation never modifies the archive.
+
+
+
+q[Rf]
+
+  Quickly append files to the end of the archive. The *R* and *f*
+  modifiers apply to this operation. This operation quickly adds the
+  *files* to the archive without checking for duplicates that should be
+  removed first. If no *files* are specified, the archive is not modified.
+  Because of the way that **llvm-ar** constructs the archive file, it is
+  dubious whether the *q* operation is any faster than the *r* operation.
+
+
+
+r[Rabfu]
+
+ Replace or insert file members. The *R*, *a*, *b*, *f*, and *u*
+ modifiers apply to this operation. This operation will replace existing
+ *files* or insert them at the end of the archive if they do not exist. If no
+ *files* are specified, the archive is not modified.
+
+
+
+t[v]
+
+ Print the table of contents. Without any modifiers, this operation just prints
+ the names of the members to the standard output. With the *v* modifier,
+ **llvm-ar** also prints out the file type (B=bitcode, S=symbol
+ table, blank=regular file), the permission mode, the owner and group, the
+ size, and the date. If any *files* are specified, the listing is only for
+ those files. If no *files* are specified, the table of contents for the
+ whole archive is printed.
+
+
+
+x[oP]
+
+  Extract archive members back to files. The *o* and *P* modifiers apply to
+  this operation. This operation retrieves the indicated *files* from the
+  archive and writes them back to the operating system's file system. If no
+  *files* are specified, the entire archive is extracted.
+
+
+
+
+Modifiers (operation specific)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+The modifiers below are specific to certain operations. See the Operations
+section (above) to determine which modifiers are applicable to which operations.
+
+
+[a]
+
+ When inserting or moving member files, this option specifies the destination of
+ the new files as being after the *relpos* member. If *relpos* is not found,
+ the files are placed at the end of the archive.
+
+
+
+[b]
+
+ When inserting or moving member files, this option specifies the destination of
+ the new files as being before the *relpos* member. If *relpos* is not
+ found, the files are placed at the end of the archive. This modifier is
+ identical to the *i* modifier.
+
+
+
+[f]
+
+ Normally, **llvm-ar** stores the full path name to a file as presented to it on
+ the command line. With this option, truncated (15 characters max) names are
+ used. This ensures name compatibility with older versions of ``ar`` but may also
+ thwart correct extraction of the files (duplicates may overwrite). If used with
+ the *R* option, the directory recursion will be performed but the file names
+ will all be flattened to simple file names.
+
+
+
+[i]
+
+ A synonym for the *b* option.
+
+
+
+[k]
+
+ Normally, **llvm-ar** will not print the contents of bitcode files when the
+ *p* operation is used. This modifier defeats the default and allows the
+ bitcode members to be printed.
+
+
+
+[N]
+
+ This option is ignored by **llvm-ar** but provided for compatibility.
+
+
+
+[o]
+
+ When extracting files, this option will cause **llvm-ar** to preserve the
+ original modification times of the files it writes.
+
+
+
+[P]
+
+  Use full path names when matching.
+
+
+
+[R]
+
+  This modifier instructs the *r* option to recursively process directories.
+  Without *R*, directories are ignored and only those *files* that refer to
+  files will be added to the archive. When *R* is used, any directories specified
+  with *files* will be scanned (recursively) to find files to be added to the
+  archive. Any file whose name begins with a dot will not be added.
+
+
+
+[u]
+
+  When replacing existing files in the archive, only replace those files that
+  have a newer time stamp than the time stamp of the member in the archive.
+
+
+
+
+Modifiers (generic)
+~~~~~~~~~~~~~~~~~~~
+
+
+The modifiers below may be applied to any operation.
+
+
+[c]
+
+ For all operations, **llvm-ar** will always create the archive if it doesn't
+ exist. Normally, **llvm-ar** will print a warning message indicating that the
+ archive is being created. Using this modifier turns off that warning.
+
+
+
+[s]
+
+  This modifier requests that an archive index (or symbol table) be added to the
+  archive. This is the default mode of operation. The symbol table will contain
+  all the externally visible functions and global variables defined by all the
+  bitcode files in the archive. Using this modifier is more efficient than
+  using **llvm-ranlib**, which also creates the symbol table.
+
+
+
+[S]
+
+ This modifier is the opposite of the *s* modifier. It instructs **llvm-ar** to
+ not build the symbol table. If both *s* and *S* are used, the last modifier to
+ occur in the options will prevail.
+
+
+
+[v]
+
+ This modifier instructs **llvm-ar** to be verbose about what it is doing. Each
+ editing operation taken against the archive will produce a line of output saying
+ what is being done.
+
+
+
+
+
+STANDARDS
+---------
+
+
+The **llvm-ar** utility is intended to provide a superset of the IEEE Std 1003.2
+(POSIX.2) functionality for ``ar``. **llvm-ar** can read both SVR4 and BSD4.4 (or
+Mac OS X) archives. If the ``f`` modifier is given to the ``x`` or ``r``
+operations, then **llvm-ar** will write SVR4-compatible archives. Without this
+modifier, **llvm-ar** will write BSD4.4-compatible archives that store long
+names immediately after the header, indicated using the "#1/ddd" notation for
+the name in the header.
+
+
+FILE FORMAT
+-----------
+
+
+The file format for LLVM Archive files is similar to that of BSD 4.4 or Mac OSX
+archive files. In fact, except for the symbol table, the ``ar`` commands on those
+operating systems should be able to read LLVM archive files. The details of the
+file format follow.
+
+Each archive begins with the archive magic number, which is the eight printable
+characters ``!<arch>\n`` where ``\n`` represents the newline character (0x0A).
+Following the magic number, the file is composed of even-length members that
+begin with an archive header and end with a ``\n`` padding character if
+necessary (to make the length even). Each file member is composed of a header
+(defined below), an optional newline-terminated "long file name", and the
+contents of the file.
+
+The fields of the header are described in the items below. All fields of the
+header contain only ASCII characters, are left-justified, and are right-padded
+with space characters. A sketch of this layout in code follows the field
+descriptions.
+
+
+name - char[16]
+
+ This field of the header provides the name of the archive member. If the name is
+ longer than 15 characters or contains a slash (/) character, then this field
+ contains ``#1/nnn`` where ``nnn`` provides the length of the name and the ``#1/``
+ is literal. In this case, the actual name of the file is provided in the ``nnn``
+ bytes immediately following the header. If the name is 15 characters or less, it
+ is contained directly in this field and terminated with a slash (/) character.
+
+
+
+date - char[12]
+
+  This field provides the date of modification of the file in the form of a
+  decimal encoded number that provides the number of seconds since the epoch
+  (since 00:00:00 Jan 1, 1970) per POSIX specifications.
+
+
+
+uid - char[6]
+
+ This field provides the user id of the file encoded as a decimal ASCII string.
+ This field might not make much sense on non-Unix systems. On Unix, it is the
+ same value as the st_uid field of the stat structure returned by the stat(2)
+ operating system call.
+
+
+
+gid - char[6]
+
+ This field provides the group id of the file encoded as a decimal ASCII string.
+ This field might not make much sense on non-Unix systems. On Unix, it is the
+ same value as the st_gid field of the stat structure returned by the stat(2)
+ operating system call.
+
+
+
+mode - char[8]
+
+ This field provides the access mode of the file encoded as an octal ASCII
+ string. This field might not make much sense on non-Unix systems. On Unix, it
+ is the same value as the st_mode field of the stat structure returned by the
+ stat(2) operating system call.
+
+
+
+size - char[10]
+
+ This field provides the size of the file, in bytes, encoded as a decimal ASCII
+ string.
+
+
+
+fmag - char[2]
+
+  This field is the archive file member magic number. Its content is always the
+  two characters backtick (0x60) and newline (0x0A). This provides some measure
+  of utility in identifying archive files that have been corrupted.
+
+
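+Taken together, these fixed-width fields form a 60-byte header that maps
+directly onto a plain struct. The following C++ sketch is illustrative only,
+not code from **llvm-ar** itself (``ArchiveHeader``, ``checkMagic``, and
+``readMemberName`` are made-up names): it checks the archive magic number and
+reads one member name, resolving the ``#1/nnn`` long-name convention described
+above.
+
+.. code-block:: cpp
+
+   #include <cstdlib>
+   #include <cstring>
+   #include <fstream>
+   #include <string>
+
+   // Fixed 60-byte member header; all fields are ASCII character arrays,
+   // so the struct has no padding and sizeof(ArchiveHeader) == 60.
+   struct ArchiveHeader {
+     char name[16]; // short name ended by '/', or "#1/nnn" for long names
+     char date[12]; // decimal seconds since the epoch
+     char uid[6];   // decimal user id
+     char gid[6];   // decimal group id
+     char mode[8];  // octal access mode
+     char size[10]; // decimal member size in bytes
+     char fmag[2];  // always backtick (0x60) then newline (0x0A)
+   };
+
+   // Returns true if the stream begins with the archive magic number.
+   bool checkMagic(std::ifstream &in) {
+     char magic[8];
+     return in.read(magic, 8) && std::memcmp(magic, "!<arch>\n", 8) == 0;
+   }
+
+   // Reads one member header and returns the member's name, or an empty
+   // string on error.
+   std::string readMemberName(std::ifstream &in) {
+     ArchiveHeader hdr;
+     if (!in.read(reinterpret_cast<char *>(&hdr), sizeof hdr))
+       return "";
+     if (hdr.fmag[0] != 0x60 || hdr.fmag[1] != 0x0A)
+       return ""; // corrupted member
+     if (std::strncmp(hdr.name, "#1/", 3) == 0) {
+       // Long name: "nnn" is its length; the name itself is stored in the
+       // nnn bytes immediately following the header. (A full reader would
+       // bound this parse to the 16-byte field.)
+       unsigned long len = std::strtoul(hdr.name + 3, nullptr, 10);
+       std::string name(len, '\0');
+       if (!in.read(&name[0], static_cast<std::streamsize>(len)))
+         return "";
+       return name;
+     }
+     // Short name: stored in place, terminated by a slash.
+     const char *slash = static_cast<const char *>(
+         std::memchr(hdr.name, '/', sizeof hdr.name));
+     return std::string(hdr.name, slash ? slash - hdr.name : sizeof hdr.name);
+   }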
+
+The LLVM symbol table has the special name "#_LLVM_SYM_TAB_#". It is presumed
+that no regular archive member file will want this name. The LLVM symbol table
+is simply composed of a sequence of triplets: byte offset, length of symbol,
+and the symbol itself. Symbols are not null or newline terminated. Here are
+the details on each of these items:
+
+
+offset - vbr encoded 32-bit integer
+
+  The offset item provides the offset into the archive file where the bitcode
+  member associated with the symbol is stored. The offset value is 0-based,
+  starting at the first "normal" file member. To derive the actual file offset
+  of the member, you must add the number of bytes occupied by the file
+  signature (8 bytes) and the symbol tables. The value of this item is encoded
+  using variable bit rate encoding to reduce the size of the symbol table.
+  Variable bit rate encoding uses the high bit (0x80) of each byte to indicate
+  if there are more bytes to follow. The remaining 7 bits in each byte carry
+  bits from the value. The final byte does not have the high bit set. A sketch
+  of a decoder appears after these definitions.
+
+
+
+length - vbr encoded 32-bit integer
+
+  The length item provides the length of the symbol that follows. Like the
+  *offset* item, the length is variable bit rate encoded.
+
+
+
+symbol - character array
+
+  The symbol item provides the text of the symbol that is associated with the
+  *offset*. The symbol is not terminated by any character. Its length is provided
+  by the *length* field. Note that it is allowed (but unwise) to use non-printing
+  characters (even 0x00) in the symbol. This allows for multiple encodings of
+  symbol names.
+
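+The variable bit rate scheme above is simple to decode. Below is an
+illustrative C++ decoder, a sketch rather than **llvm-ar**'s own code; it
+assumes the least significant 7-bit group comes first, which the text does not
+state explicitly.
+
+.. code-block:: cpp
+
+   #include <cstdint>
+   #include <istream>
+
+   // Decodes one VBR-encoded 32-bit integer: the low 7 bits of each byte
+   // carry value bits, and the high bit (0x80) marks that more bytes
+   // follow. Returns false on a truncated or malformed encoding.
+   bool decodeVBR32(std::istream &in, uint32_t &value) {
+     value = 0;
+     unsigned shift = 0;
+     int c;
+     while ((c = in.get()) != EOF) {
+       value |= static_cast<uint32_t>(c & 0x7F) << shift;
+       if ((c & 0x80) == 0)
+         return true; // final byte has the high bit clear
+       shift += 7;
+       if (shift >= 32)
+         return false; // malformed: too many continuation bytes
+     }
+     return false; // stream ended mid-value
+   }
+
+A symbol table entry would then be read as two ``decodeVBR32`` calls (offset,
+then length) followed by reading *length* bytes of symbol text.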
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llvm-ar** succeeds, it will exit with 0. A usage error results in an
+exit code of 1. A hard error (typically in the file system) results in an
+exit code of 2. Miscellaneous or unknown errors result in an exit code of 3.
+
+
+SEE ALSO
+--------
+
+
+llvm-ranlib, ar(1)
diff --git a/docs/CommandGuide/llvm-as.pod b/docs/CommandGuide/llvm-as.pod
deleted file mode 100644
index cc81887..0000000
--- a/docs/CommandGuide/llvm-as.pod
+++ /dev/null
@@ -1,77 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-as - LLVM assembler
-
-=head1 SYNOPSIS
-
-B<llvm-as> [I<options>] [I<filename>]
-
-=head1 DESCRIPTION
-
-B<llvm-as> is the LLVM assembler. It reads a file containing human-readable
-LLVM assembly language, translates it to LLVM bitcode, and writes the result
-into a file or to standard output.
-
-If F<filename> is omitted or is C<->, then B<llvm-as> reads its input from
-standard input.
-
-If an output file is not specified with the B<-o> option, then
-B<llvm-as> sends its output to a file or standard output by following
-these rules:
-
-=over
-
-=item *
-
-If the input is standard input, then the output is standard output.
-
-=item *
-
-If the input is a file that ends with C<.ll>, then the output file is of
-the same name, except that the suffix is changed to C<.bc>.
-
-=item *
-
-If the input is a file that does not end with the C<.ll> suffix, then the
-output file has the same name as the input file, except that the C<.bc>
-suffix is appended.
-
-=back
-
-=head1 OPTIONS
-
-=over
-
-=item B<-f>
-
-Enable binary output on terminals. Normally, B<llvm-as> will refuse to
-write raw bitcode output if the output stream is a terminal. With this option,
-B<llvm-as> will write raw bitcode regardless of the output device.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-o> F<filename>
-
-Specify the output file name. If F<filename> is C<->, then B<llvm-as>
-sends its output to standard output.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-as> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value.
-
-=head1 SEE ALSO
-
-L<llvm-dis|llvm-dis>, L<gccas|gccas>
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-as.rst b/docs/CommandGuide/llvm-as.rst
new file mode 100644
index 0000000..1b499bb
--- /dev/null
+++ b/docs/CommandGuide/llvm-as.rst
@@ -0,0 +1,56 @@
+llvm-as - LLVM assembler
+========================
+
+SYNOPSIS
+--------
+
+**llvm-as** [*options*] [*filename*]
+
+DESCRIPTION
+-----------
+
+**llvm-as** is the LLVM assembler. It reads a file containing human-readable
+LLVM assembly language, translates it to LLVM bitcode, and writes the result
+into a file or to standard output.
+
+If *filename* is omitted or is ``-``, then **llvm-as** reads its input from
+standard input.
+
+If an output file is not specified with the **-o** option, then
+**llvm-as** sends its output to a file or standard output by following
+these rules (sketched in code after the list):
+
+* If the input is standard input, then the output is standard output.
+
+* If the input is a file that ends with ``.ll``, then the output file is of the
+ same name, except that the suffix is changed to ``.bc``.
+
+* If the input is a file that does not end with the ``.ll`` suffix, then the
+ output file has the same name as the input file, except that the ``.bc``
+ suffix is appended.
+
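+The naming rules above amount to a small string transformation. A minimal C++
+sketch, illustrative only (``defaultOutputName`` is a made-up helper, and the
+real tool also honors **-o**):
+
+.. code-block:: cpp
+
+   #include <string>
+
+   // Derives the default output file name from the input file name,
+   // following the three rules listed above.
+   std::string defaultOutputName(const std::string &input) {
+     if (input.empty() || input == "-")
+       return "-"; // standard input maps to standard output
+     const std::string suffix = ".ll";
+     if (input.size() >= suffix.size() &&
+         input.compare(input.size() - suffix.size(), suffix.size(),
+                       suffix) == 0)
+       return input.substr(0, input.size() - suffix.size()) + ".bc";
+     return input + ".bc"; // no .ll suffix: append .bc
+   }
+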
+OPTIONS
+-------
+
+**-f**
+ Enable binary output on terminals. Normally, **llvm-as** will refuse to
+ write raw bitcode output if the output stream is a terminal. With this option,
+ **llvm-as** will write raw bitcode regardless of the output device.
+
+**-help**
+ Print a summary of command line options.
+
+**-o** *filename*
+ Specify the output file name. If *filename* is ``-``, then **llvm-as**
+ sends its output to standard output.
+
+EXIT STATUS
+-----------
+
+If **llvm-as** succeeds, it will exit with 0. Otherwise, if an error occurs, it
+will exit with a non-zero value.
+
+SEE ALSO
+--------
+
+llvm-dis, gccas
diff --git a/docs/CommandGuide/llvm-bcanalyzer.pod b/docs/CommandGuide/llvm-bcanalyzer.pod
deleted file mode 100644
index 9c5021b..0000000
--- a/docs/CommandGuide/llvm-bcanalyzer.pod
+++ /dev/null
@@ -1,315 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-bcanalyzer - LLVM bitcode analyzer
-
-=head1 SYNOPSIS
-
-B<llvm-bcanalyzer> [I<options>] [F<filename>]
-
-=head1 DESCRIPTION
-
-The B<llvm-bcanalyzer> command is a small utility for analyzing bitcode files.
-The tool reads a bitcode file (such as generated with the B<llvm-as> tool) and
-produces a statistical report on the contents of the bitcode file. The tool
-can also dump a low level but human readable version of the bitcode file.
-This tool is probably not of much interest or utility except for those working
-directly with the bitcode file format. Most LLVM users can just ignore
-this tool.
-
-If F<filename> is omitted or is C<->, then B<llvm-bcanalyzer> reads its input
-from standard input. This is useful for combining the tool into a pipeline.
-Output is written to the standard output.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-nodetails>
-
-Causes B<llvm-bcanalyzer> to abbreviate its output by writing out only a module
-level summary. The details for individual functions are not displayed.
-
-=item B<-dump>
-
-Causes B<llvm-bcanalyzer> to dump the bitcode in a human readable format. This
-format is significantly different from LLVM assembly and provides details about
-the encoding of the bitcode file.
-
-=item B<-verify>
-
-Causes B<llvm-bcanalyzer> to verify the module produced by reading the
-bitcode. This ensures that the statistics generated are based on a consistent
-module.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-bcanalyzer> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value, usually 1.
-
-=head1 SUMMARY OUTPUT DEFINITIONS
-
-The following items are always printed by llvm-bcanalyzer. They comprize the
-summary output.
-
-=over
-
-=item B<Bitcode Analysis Of Module>
-
-This just provides the name of the module for which bitcode analysis is being
-generated.
-
-=item B<Bitcode Version Number>
-
-The bitcode version (not LLVM version) of the file read by the analyzer.
-
-=item B<File Size>
-
-The size, in bytes, of the entire bitcode file.
-
-=item B<Module Bytes>
-
-The size, in bytes, of the module block. Percentage is relative to File Size.
-
-=item B<Function Bytes>
-
-The size, in bytes, of all the function blocks. Percentage is relative to File
-Size.
-
-=item B<Global Types Bytes>
-
-The size, in bytes, of the Global Types Pool. Percentage is relative to File
-Size. This is the size of the definitions of all types in the bitcode file.
-
-=item B<Constant Pool Bytes>
-
-The size, in bytes, of the Constant Pool Blocks Percentage is relative to File
-Size.
-
-=item B<Module Globals Bytes>
-
-Ths size, in bytes, of the Global Variable Definitions and their initializers.
-Percentage is relative to File Size.
-
-=item B<Instruction List Bytes>
-
-The size, in bytes, of all the instruction lists in all the functions.
-Percentage is relative to File Size. Note that this value is also included in
-the Function Bytes.
-
-=item B<Compaction Table Bytes>
-
-The size, in bytes, of all the compaction tables in all the functions.
-Percentage is relative to File Size. Note that this value is also included in
-the Function Bytes.
-
-=item B<Symbol Table Bytes>
-
-The size, in bytes, of all the symbol tables in all the functions. Percentage is
-relative to File Size. Note that this value is also included in the Function
-Bytes.
-
-=item B<Dependent Libraries Bytes>
-
-The size, in bytes, of the list of dependent libraries in the module. Percentage
-is relative to File Size. Note that this value is also included in the Module
-Global Bytes.
-
-=item B<Number Of Bitcode Blocks>
-
-The total number of blocks of any kind in the bitcode file.
-
-=item B<Number Of Functions>
-
-The total number of function definitions in the bitcode file.
-
-=item B<Number Of Types>
-
-The total number of types defined in the Global Types Pool.
-
-=item B<Number Of Constants>
-
-The total number of constants (of any type) defined in the Constant Pool.
-
-=item B<Number Of Basic Blocks>
-
-The total number of basic blocks defined in all functions in the bitcode file.
-
-=item B<Number Of Instructions>
-
-The total number of instructions defined in all functions in the bitcode file.
-
-=item B<Number Of Long Instructions>
-
-The total number of long instructions defined in all functions in the bitcode
-file. Long instructions are those taking greater than 4 bytes. Typically long
-instructions are GetElementPtr with several indices, PHI nodes, and calls to
-functions with large numbers of arguments.
-
-=item B<Number Of Operands>
-
-The total number of operands used in all instructions in the bitcode file.
-
-=item B<Number Of Compaction Tables>
-
-The total number of compaction tables in all functions in the bitcode file.
-
-=item B<Number Of Symbol Tables>
-
-The total number of symbol tables in all functions in the bitcode file.
-
-=item B<Number Of Dependent Libs>
-
-The total number of dependent libraries found in the bitcode file.
-
-=item B<Total Instruction Size>
-
-The total size of the instructions in all functions in the bitcode file.
-
-=item B<Average Instruction Size>
-
-The average number of bytes per instruction across all functions in the bitcode
-file. This value is computed by dividing Total Instruction Size by Number Of
-Instructions.
-
-=item B<Maximum Type Slot Number>
-
-The maximum value used for a type's slot number. Larger slot number values take
-more bytes to encode.
-
-=item B<Maximum Value Slot Number>
-
-The maximum value used for a value's slot number. Larger slot number values take
-more bytes to encode.
-
-=item B<Bytes Per Value>
-
-The average size of a Value definition (of any type). This is computed by
-dividing File Size by the total number of values of any type.
-
-=item B<Bytes Per Global>
-
-The average size of a global definition (constants and global variables).
-
-=item B<Bytes Per Function>
-
-The average number of bytes per function definition. This is computed by
-dividing Function Bytes by Number Of Functions.
-
-=item B<# of VBR 32-bit Integers>
-
-The total number of 32-bit integers encoded using the Variable Bit Rate
-encoding scheme.
-
-=item B<# of VBR 64-bit Integers>
-
-The total number of 64-bit integers encoded using the Variable Bit Rate encoding
-scheme.
-
-=item B<# of VBR Compressed Bytes>
-
-The total number of bytes consumed by the 32-bit and 64-bit integers that use
-the Variable Bit Rate encoding scheme.
-
-=item B<# of VBR Expanded Bytes>
-
-The total number of bytes that would have been consumed by the 32-bit and 64-bit
-integers had they not been compressed with the Variable Bit Rage encoding
-scheme.
-
-=item B<Bytes Saved With VBR>
-
-The total number of bytes saved by using the Variable Bit Rate encoding scheme.
-The percentage is relative to # of VBR Expanded Bytes.
-
-=back
-
-=head1 DETAILED OUTPUT DEFINITIONS
-
-The following definitions occur only if the -nodetails option was not given.
-The detailed output provides additional information on a per-function basis.
-
-=over
-
-=item B<Type>
-
-The type signature of the function.
-
-=item B<Byte Size>
-
-The total number of bytes in the function's block.
-
-=item B<Basic Blocks>
-
-The number of basic blocks defined by the function.
-
-=item B<Instructions>
-
-The number of instructions defined by the function.
-
-=item B<Long Instructions>
-
-The number of instructions using the long instruction format in the function.
-
-=item B<Operands>
-
-The number of operands used by all instructions in the function.
-
-=item B<Instruction Size>
-
-The number of bytes consumed by instructions in the function.
-
-=item B<Average Instruction Size>
-
-The average number of bytes consumed by the instructions in the function. This
-value is computed by dividing Instruction Size by Instructions.
-
-=item B<Bytes Per Instruction>
-
-The average number of bytes used by the function per instruction. This value is
-computed by dividing Byte Size by Instructions. Note that this is not the same
-as Average Instruction Size. It computes a number relative to the total function
-size not just the size of the instruction list.
-
-=item B<Number of VBR 32-bit Integers>
-
-The total number of 32-bit integers found in this function (for any use).
-
-=item B<Number of VBR 64-bit Integers>
-
-The total number of 64-bit integers found in this function (for any use).
-
-=item B<Number of VBR Compressed Bytes>
-
-The total number of bytes in this function consumed by the 32-bit and 64-bit
-integers that use the Variable Bit Rate encoding scheme.
-
-=item B<Number of VBR Expanded Bytes>
-
-The total number of bytes in this function that would have been consumed by
-the 32-bit and 64-bit integers had they not been compressed with the Variable
-Bit Rate encoding scheme.
-
-=item B<Bytes Saved With VBR>
-
-The total number of bytes saved in this function by using the Variable Bit
-Rate encoding scheme. The percentage is relative to # of VBR Expanded Bytes.
-
-=back
-
-=head1 SEE ALSO
-
-L<llvm-dis|llvm-dis>, L<http://llvm.org/docs/BitCodeFormat.html>
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-bcanalyzer.rst b/docs/CommandGuide/llvm-bcanalyzer.rst
new file mode 100644
index 0000000..f1e4eac
--- /dev/null
+++ b/docs/CommandGuide/llvm-bcanalyzer.rst
@@ -0,0 +1,424 @@
+llvm-bcanalyzer - LLVM bitcode analyzer
+=======================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-bcanalyzer** [*options*] [*filename*]
+
+
+DESCRIPTION
+-----------
+
+
+The **llvm-bcanalyzer** command is a small utility for analyzing bitcode files.
+The tool reads a bitcode file (such as generated with the **llvm-as** tool) and
+produces a statistical report on the contents of the bitcode file. The tool
+can also dump a low level but human readable version of the bitcode file.
+This tool is probably not of much interest or utility except for those working
+directly with the bitcode file format. Most LLVM users can just ignore
+this tool.
+
+If *filename* is omitted or is ``-``, then **llvm-bcanalyzer** reads its input
+from standard input. This is useful for combining the tool into a pipeline.
+Output is written to the standard output.
+
+
+OPTIONS
+-------
+
+
+
+**-nodetails**
+
+ Causes **llvm-bcanalyzer** to abbreviate its output by writing out only a module
+ level summary. The details for individual functions are not displayed.
+
+
+
+**-dump**
+
+ Causes **llvm-bcanalyzer** to dump the bitcode in a human readable format. This
+ format is significantly different from LLVM assembly and provides details about
+ the encoding of the bitcode file.
+
+
+
+**-verify**
+
+ Causes **llvm-bcanalyzer** to verify the module produced by reading the
+ bitcode. This ensures that the statistics generated are based on a consistent
+ module.
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llvm-bcanalyzer** succeeds, it will exit with 0. Otherwise, if an error
+occurs, it will exit with a non-zero value, usually 1.
+
+
+SUMMARY OUTPUT DEFINITIONS
+--------------------------
+
+
+The following items are always printed by llvm-bcanalyzer. They comprise the
+summary output.
+
+
+**Bitcode Analysis Of Module**
+
+ This just provides the name of the module for which bitcode analysis is being
+ generated.
+
+
+
+**Bitcode Version Number**
+
+ The bitcode version (not LLVM version) of the file read by the analyzer.
+
+
+
+**File Size**
+
+ The size, in bytes, of the entire bitcode file.
+
+
+
+**Module Bytes**
+
+ The size, in bytes, of the module block. Percentage is relative to File Size.
+
+
+
+**Function Bytes**
+
+ The size, in bytes, of all the function blocks. Percentage is relative to File
+ Size.
+
+
+
+**Global Types Bytes**
+
+ The size, in bytes, of the Global Types Pool. Percentage is relative to File
+ Size. This is the size of the definitions of all types in the bitcode file.
+
+
+
+**Constant Pool Bytes**
+
+  The size, in bytes, of the Constant Pool Blocks. Percentage is relative to
+  File Size.
+
+
+
+**Module Globals Bytes**
+
+  The size, in bytes, of the Global Variable Definitions and their
+  initializers. Percentage is relative to File Size.
+
+
+
+**Instruction List Bytes**
+
+ The size, in bytes, of all the instruction lists in all the functions.
+ Percentage is relative to File Size. Note that this value is also included in
+ the Function Bytes.
+
+
+
+**Compaction Table Bytes**
+
+ The size, in bytes, of all the compaction tables in all the functions.
+ Percentage is relative to File Size. Note that this value is also included in
+ the Function Bytes.
+
+
+
+**Symbol Table Bytes**
+
+ The size, in bytes, of all the symbol tables in all the functions. Percentage is
+ relative to File Size. Note that this value is also included in the Function
+ Bytes.
+
+
+
+**Dependent Libraries Bytes**
+
+  The size, in bytes, of the list of dependent libraries in the module.
+  Percentage is relative to File Size. Note that this value is also included
+  in the Module Globals Bytes.
+
+
+
+**Number Of Bitcode Blocks**
+
+ The total number of blocks of any kind in the bitcode file.
+
+
+
+**Number Of Functions**
+
+ The total number of function definitions in the bitcode file.
+
+
+
+**Number Of Types**
+
+ The total number of types defined in the Global Types Pool.
+
+
+
+**Number Of Constants**
+
+ The total number of constants (of any type) defined in the Constant Pool.
+
+
+
+**Number Of Basic Blocks**
+
+ The total number of basic blocks defined in all functions in the bitcode file.
+
+
+
+**Number Of Instructions**
+
+ The total number of instructions defined in all functions in the bitcode file.
+
+
+
+**Number Of Long Instructions**
+
+  The total number of long instructions defined in all functions in the bitcode
+  file. Long instructions are those taking more than 4 bytes. Typically, long
+  instructions are GetElementPtr with several indices, PHI nodes, and calls to
+  functions with large numbers of arguments.
+
+
+
+**Number Of Operands**
+
+ The total number of operands used in all instructions in the bitcode file.
+
+
+
+**Number Of Compaction Tables**
+
+ The total number of compaction tables in all functions in the bitcode file.
+
+
+
+**Number Of Symbol Tables**
+
+ The total number of symbol tables in all functions in the bitcode file.
+
+
+
+**Number Of Dependent Libs**
+
+ The total number of dependent libraries found in the bitcode file.
+
+
+
+**Total Instruction Size**
+
+ The total size of the instructions in all functions in the bitcode file.
+
+
+
+**Average Instruction Size**
+
+ The average number of bytes per instruction across all functions in the bitcode
+ file. This value is computed by dividing Total Instruction Size by Number Of
+ Instructions.
+
+
+
+**Maximum Type Slot Number**
+
+ The maximum value used for a type's slot number. Larger slot number values take
+ more bytes to encode.
+
+
+
+**Maximum Value Slot Number**
+
+ The maximum value used for a value's slot number. Larger slot number values take
+ more bytes to encode.
+
+
+
+**Bytes Per Value**
+
+ The average size of a Value definition (of any type). This is computed by
+ dividing File Size by the total number of values of any type.
+
+
+
+**Bytes Per Global**
+
+ The average size of a global definition (constants and global variables).
+
+
+
+**Bytes Per Function**
+
+ The average number of bytes per function definition. This is computed by
+ dividing Function Bytes by Number Of Functions.
+
+
+
+**# of VBR 32-bit Integers**
+
+ The total number of 32-bit integers encoded using the Variable Bit Rate
+ encoding scheme.
+
+
+
+**# of VBR 64-bit Integers**
+
+ The total number of 64-bit integers encoded using the Variable Bit Rate encoding
+ scheme.
+
+
+
+**# of VBR Compressed Bytes**
+
+ The total number of bytes consumed by the 32-bit and 64-bit integers that use
+ the Variable Bit Rate encoding scheme.
+
+
+
+**# of VBR Expanded Bytes**
+
+  The total number of bytes that would have been consumed by the 32-bit and
+  64-bit integers had they not been compressed with the Variable Bit Rate
+  encoding scheme.
+
+
+
+**Bytes Saved With VBR**
+
+ The total number of bytes saved by using the Variable Bit Rate encoding scheme.
+ The percentage is relative to # of VBR Expanded Bytes.
+
+
+
+
+DETAILED OUTPUT DEFINITIONS
+---------------------------
+
+
+The following definitions occur only if the **-nodetails** option was not
+given. The detailed output provides additional information on a per-function
+basis.
+
+
+**Type**
+
+ The type signature of the function.
+
+
+
+**Byte Size**
+
+ The total number of bytes in the function's block.
+
+
+
+**Basic Blocks**
+
+ The number of basic blocks defined by the function.
+
+
+
+**Instructions**
+
+ The number of instructions defined by the function.
+
+
+
+**Long Instructions**
+
+ The number of instructions using the long instruction format in the function.
+
+
+
+**Operands**
+
+ The number of operands used by all instructions in the function.
+
+
+
+**Instruction Size**
+
+ The number of bytes consumed by instructions in the function.
+
+
+
+**Average Instruction Size**
+
+ The average number of bytes consumed by the instructions in the function. This
+ value is computed by dividing Instruction Size by Instructions.
+
+
+
+**Bytes Per Instruction**
+
+  The average number of bytes used by the function per instruction. This value
+  is computed by dividing Byte Size by Instructions. Note that this is not the
+  same as Average Instruction Size: it computes a number relative to the total
+  function size, not just the size of the instruction list. A worked example
+  follows.
+
+
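+The distinction between the two averages is easiest to see with numbers. A toy
+C++ computation (all figures are made up for illustration):
+
+.. code-block:: cpp
+
+   #include <cstdio>
+
+   int main() {
+     // Hypothetical per-function figures from a report.
+     double byteSize = 1200.0;       // total bytes in the function block
+     double instructionSize = 900.0; // bytes consumed by instructions only
+     double instructions = 300.0;    // instruction count
+
+     // Average Instruction Size: instruction bytes per instruction.
+     std::printf("Average Instruction Size: %.1f\n",
+                 instructionSize / instructions); // prints 3.0
+     // Bytes Per Instruction: the whole function block per instruction,
+     // so symbol tables etc. are charged to each instruction as well.
+     std::printf("Bytes Per Instruction: %.1f\n",
+                 byteSize / instructions); // prints 4.0
+     return 0;
+   }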
+
+**Number of VBR 32-bit Integers**
+
+ The total number of 32-bit integers found in this function (for any use).
+
+
+
+**Number of VBR 64-bit Integers**
+
+ The total number of 64-bit integers found in this function (for any use).
+
+
+
+**Number of VBR Compressed Bytes**
+
+ The total number of bytes in this function consumed by the 32-bit and 64-bit
+ integers that use the Variable Bit Rate encoding scheme.
+
+
+
+**Number of VBR Expanded Bytes**
+
+ The total number of bytes in this function that would have been consumed by
+ the 32-bit and 64-bit integers had they not been compressed with the Variable
+ Bit Rate encoding scheme.
+
+
+
+**Bytes Saved With VBR**
+
+ The total number of bytes saved in this function by using the Variable Bit
+ Rate encoding scheme. The percentage is relative to # of VBR Expanded Bytes.
+
+
+
+
+SEE ALSO
+--------
+
+
+llvm-dis, `http://llvm.org/docs/BitCodeFormat.html <http://llvm.org/docs/BitCodeFormat.html>`_
diff --git a/docs/CommandGuide/llvm-build.pod b/docs/CommandGuide/llvm-build.pod
deleted file mode 100644
index 14e08cb..0000000
--- a/docs/CommandGuide/llvm-build.pod
+++ /dev/null
@@ -1,86 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-build - LLVM Project Build Utility
-
-=head1 SYNOPSIS
-
-B<llvm-build> [I<options>]
-
-=head1 DESCRIPTION
-
-B<llvm-build> is a tool for working with LLVM projects that use the LLVMBuild
-system for describing their components.
-
-At heart, B<llvm-build> is responsible for loading, verifying, and manipulating
-the project's component data. The tool is primarily designed for use in
-implementing build systems and tools which need access to the project structure
-information.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-h>, B<--help>
-
-Print the builtin program help.
-
-=item B<--source-root>=I<PATH>
-
-If given, load the project at the given source root path. If this option is not
-given, the location of the project sources will be inferred from the location of
-the B<llvm-build> script itself.
-
-=item B<--print-tree>
-
-Print the component tree for the project.
-
-=item B<--write-library-table>
-
-Write out the C++ fragment which defines the components, library names, and
-required libraries. This C++ fragment is built into L<llvm-config|llvm-config>
-in order to provide clients with the list of required libraries for arbitrary
-component combinations.
-
-=item B<--write-llvmbuild>
-
-Write out new I<LLVMBuild.txt> files based on the loaded components. This is
-useful for auto-upgrading the schema of the files. B<llvm-build> will try to a
-limited extent to preserve the comments which were written in the original
-source file, although at this time it only preserves block comments that preceed
-the section names in the I<LLVMBuild> files.
-
-=item B<--write-cmake-fragment>
-
-Write out the LLVMBuild in the form of a CMake fragment, so it can easily be
-consumed by the CMake based build system. The exact contents and format of this
-file are closely tied to how LLVMBuild is integrated with CMake, see LLVM's
-top-level CMakeLists.txt.
-
-=item B<--write-make-fragment>
-
-Write out the LLVMBuild in the form of a Makefile fragment, so it can easily be
-consumed by a Make based build system. The exact contents and format of this
-file are closely tied to how LLVMBuild is integrated with the Makefiles, see
-LLVM's Makefile.rules.
-
-=item B<--llvmbuild-source-root>=I<PATH>
-
-If given, expect the I<LLVMBuild> files for the project to be rooted at the
-given path, instead of inside the source tree itself. This option is primarily
-designed for use in conjunction with B<--write-llvmbuild> to test changes to
-I<LLVMBuild> schema.
-
-=back
-
-=head1 EXIT STATUS
-
-B<llvm-build> exits with 0 if operation was successful. Otherwise, it will exist
-with a non-zero value.
-
-=head1 AUTHOR
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-build.rst b/docs/CommandGuide/llvm-build.rst
new file mode 100644
index 0000000..f788f7c
--- /dev/null
+++ b/docs/CommandGuide/llvm-build.rst
@@ -0,0 +1,102 @@
+llvm-build - LLVM Project Build Utility
+=======================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-build** [*options*]
+
+
+DESCRIPTION
+-----------
+
+
+**llvm-build** is a tool for working with LLVM projects that use the LLVMBuild
+system for describing their components.
+
+At heart, **llvm-build** is responsible for loading, verifying, and manipulating
+the project's component data. The tool is primarily designed for use in
+implementing build systems and tools which need access to the project structure
+information.
+
+
+OPTIONS
+-------
+
+
+
+**-h**, **--help**
+
+ Print the builtin program help.
+
+
+
+**--source-root**\ =\ *PATH*
+
+ If given, load the project at the given source root path. If this option is not
+ given, the location of the project sources will be inferred from the location of
+ the **llvm-build** script itself.
+
+
+
+**--print-tree**
+
+ Print the component tree for the project.
+
+
+
+**--write-library-table**
+
+  Write out the C++ fragment which defines the components, library names, and
+  required libraries. This C++ fragment is built into **llvm-config** in order
+  to provide clients with the list of required libraries for arbitrary
+  component combinations.
+
+
+
+**--write-llvmbuild**
+
+  Write out new *LLVMBuild.txt* files based on the loaded components. This is
+  useful for auto-upgrading the schema of the files. **llvm-build** will try,
+  to a limited extent, to preserve the comments which were written in the
+  original source file, although at this time it only preserves block comments
+  that precede the section names in the *LLVMBuild* files.
+
+
+
+**--write-cmake-fragment**
+
+ Write out the LLVMBuild in the form of a CMake fragment, so it can easily be
+ consumed by the CMake based build system. The exact contents and format of this
+ file are closely tied to how LLVMBuild is integrated with CMake, see LLVM's
+ top-level CMakeLists.txt.
+
+
+
+**--write-make-fragment**
+
+ Write out the LLVMBuild in the form of a Makefile fragment, so it can easily be
+ consumed by a Make based build system. The exact contents and format of this
+ file are closely tied to how LLVMBuild is integrated with the Makefiles, see
+ LLVM's Makefile.rules.
+
+
+
+**--llvmbuild-source-root**\ =\ *PATH*
+
+ If given, expect the *LLVMBuild* files for the project to be rooted at the
+ given path, instead of inside the source tree itself. This option is primarily
+ designed for use in conjunction with **--write-llvmbuild** to test changes to
+ *LLVMBuild* schema.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+**llvm-build** exits with 0 if the operation was successful. Otherwise, it
+will exit with a non-zero value.
diff --git a/docs/CommandGuide/llvm-config.pod b/docs/CommandGuide/llvm-config.pod
deleted file mode 100644
index 7d68564..0000000
--- a/docs/CommandGuide/llvm-config.pod
+++ /dev/null
@@ -1,131 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-config - Print LLVM compilation options
-
-=head1 SYNOPSIS
-
-B<llvm-config> I<option> [I<components>...]
-
-=head1 DESCRIPTION
-
-B<llvm-config> makes it easier to build applications that use LLVM. It can
-print the compiler flags, linker flags and object libraries needed to link
-against LLVM.
-
-=head1 EXAMPLES
-
-To link against the JIT:
-
- g++ `llvm-config --cxxflags` -o HowToUseJIT.o -c HowToUseJIT.cpp
- g++ `llvm-config --ldflags` -o HowToUseJIT HowToUseJIT.o \
- `llvm-config --libs engine bcreader scalaropts`
-
-=head1 OPTIONS
-
-=over
-
-=item B<--version>
-
-Print the version number of LLVM.
-
-=item B<-help>
-
-Print a summary of B<llvm-config> arguments.
-
-=item B<--prefix>
-
-Print the installation prefix for LLVM.
-
-=item B<--src-root>
-
-Print the source root from which LLVM was built.
-
-=item B<--obj-root>
-
-Print the object root used to build LLVM.
-
-=item B<--bindir>
-
-Print the installation directory for LLVM binaries.
-
-=item B<--includedir>
-
-Print the installation directory for LLVM headers.
-
-=item B<--libdir>
-
-Print the installation directory for LLVM libraries.
-
-=item B<--cxxflags>
-
-Print the C++ compiler flags needed to use LLVM headers.
-
-=item B<--ldflags>
-
-Print the flags needed to link against LLVM libraries.
-
-=item B<--libs>
-
-Print all the libraries needed to link against the specified LLVM
-I<components>, including any dependencies.
-
-=item B<--libnames>
-
-Similar to B<--libs>, but prints the bare filenames of the libraries
-without B<-l> or pathnames. Useful for linking against a not-yet-installed
-copy of LLVM.
-
-=item B<--libfiles>
-
-Similar to B<--libs>, but print the full path to each library file. This is
-useful when creating makefile dependencies, to ensure that a tool is relinked if
-any library it uses changes.
-
-=item B<--components>
-
-Print all valid component names.
-
-=item B<--targets-built>
-
-Print the component names for all targets supported by this copy of LLVM.
-
-=item B<--build-mode>
-
-Print the build mode used when LLVM was built (e.g. Debug or Release)
-
-=back
-
-=head1 COMPONENTS
-
-To print a list of all available components, run B<llvm-config
---components>. In most cases, components correspond directly to LLVM
-libraries. Useful "virtual" components include:
-
-=over
-
-=item B<all>
-
-Includes all LLVM libaries. The default if no components are specified.
-
-=item B<backend>
-
-Includes either a native backend or the C backend.
-
-=item B<engine>
-
-Includes either a native JIT or the bitcode interpreter.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-config> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value.
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-config.rst b/docs/CommandGuide/llvm-config.rst
new file mode 100644
index 0000000..0ebb344
--- /dev/null
+++ b/docs/CommandGuide/llvm-config.rst
@@ -0,0 +1,176 @@
+llvm-config - Print LLVM compilation options
+============================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-config** *option* [*components*...]
+
+
+DESCRIPTION
+-----------
+
+
+**llvm-config** makes it easier to build applications that use LLVM. It can
+print the compiler flags, linker flags and object libraries needed to link
+against LLVM.
+
+
+EXAMPLES
+--------
+
+
+To link against the JIT:
+
+
+.. code-block:: sh
+
+ g++ `llvm-config --cxxflags` -o HowToUseJIT.o -c HowToUseJIT.cpp
+ g++ `llvm-config --ldflags` -o HowToUseJIT HowToUseJIT.o \
+ `llvm-config --libs engine bcreader scalaropts`
+
+
+
+OPTIONS
+-------
+
+
+
+**--version**
+
+ Print the version number of LLVM.
+
+
+
+**-help**
+
+ Print a summary of **llvm-config** arguments.
+
+
+
+**--prefix**
+
+ Print the installation prefix for LLVM.
+
+
+
+**--src-root**
+
+ Print the source root from which LLVM was built.
+
+
+
+**--obj-root**
+
+ Print the object root used to build LLVM.
+
+
+
+**--bindir**
+
+ Print the installation directory for LLVM binaries.
+
+
+
+**--includedir**
+
+ Print the installation directory for LLVM headers.
+
+
+
+**--libdir**
+
+ Print the installation directory for LLVM libraries.
+
+
+
+**--cxxflags**
+
+ Print the C++ compiler flags needed to use LLVM headers.
+
+
+
+**--ldflags**
+
+ Print the flags needed to link against LLVM libraries.
+
+
+
+**--libs**
+
+ Print all the libraries needed to link against the specified LLVM
+ *components*, including any dependencies.
+
+
+
+**--libnames**
+
+ Similar to **--libs**, but prints the bare filenames of the libraries
+ without **-l** or pathnames. Useful for linking against a not-yet-installed
+ copy of LLVM.
+
+
+
+**--libfiles**
+
+ Similar to **--libs**, but print the full path to each library file. This is
+ useful when creating makefile dependencies, to ensure that a tool is relinked if
+ any library it uses changes.
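+
+ For example (the exact paths depend on the installation), the output can be
+ dropped straight into a makefile dependency list:
+
+ .. code-block:: sh
+
+    llvm-config --libfiles core
+    # e.g. /usr/local/lib/libLLVMCore.a /usr/local/lib/libLLVMSupport.a ...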
+
+
+
+**--components**
+
+ Print all valid component names.
+
+
+
+**--targets-built**
+
+ Print the component names for all targets supported by this copy of LLVM.
+
+
+
+**--build-mode**
+
+ Print the build mode used when LLVM was built (e.g. Debug or Release).
+
+
+
+
+COMPONENTS
+----------
+
+
+To print a list of all available components, run **llvm-config
+--components**. In most cases, components correspond directly to LLVM
+libraries. Useful "virtual" components include:
+
+
+**all**
+
+ Includes all LLVM libraries. The default if no components are specified.
+
+
+
+**backend**
+
+ Includes either a native backend or the C backend.
+
+
+
+**engine**
+
+ Includes either a native JIT or the bitcode interpreter.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llvm-config** succeeds, it will exit with 0. Otherwise, if an error
+occurs, it will exit with a non-zero value.
diff --git a/docs/CommandGuide/llvm-cov.pod b/docs/CommandGuide/llvm-cov.pod
deleted file mode 100644
index e8ff683..0000000
--- a/docs/CommandGuide/llvm-cov.pod
+++ /dev/null
@@ -1,45 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-cov - emit coverage information
-
-=head1 SYNOPSIS
-
-B<llvm-cov> [-gcno=filename] [-gcda=filename] [dump]
-
-=head1 DESCRIPTION
-
-The experimental B<llvm-cov> tool reads in description file generated by compiler
-and coverage data file generated by instrumented program. This program assumes
-that the description and data file uses same format as gcov files.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-gcno=filename]
-
-This option selects input description file generated by compiler while instrumenting
-program.
-
-=item B<-gcda=filename]
-
-This option selects coverage data file generated by instrumented compiler.
-
-=item B<-dump>
-
-This options enables output dump that is suitable for a developer to help debug
-B<llvm-cov> itself.
-
-=back
-
-=head1 EXIT STATUS
-
-B<llvm-cov> returns 1 if it cannot read input files. Otherwise, it exits with zero.
-
-=head1 AUTHOR
-
-B<llvm-cov> is maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-cov.rst b/docs/CommandGuide/llvm-cov.rst
new file mode 100644
index 0000000..09275f6
--- /dev/null
+++ b/docs/CommandGuide/llvm-cov.rst
@@ -0,0 +1,51 @@
+llvm-cov - emit coverage information
+====================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-cov** [-gcno=filename] [-gcda=filename] [-dump]
+
+
+DESCRIPTION
+-----------
+
+
+The experimental **llvm-cov** tool reads in a description file generated by the
+compiler and a coverage data file generated by an instrumented program. It
+assumes that the description and data files use the same format as gcov files.
+
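+For example (file names illustrative):
+
+.. code-block:: sh
+
+   llvm-cov -gcno=test.gcno -gcda=test.gcda
+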
+
+OPTIONS
+-------
+
+
+
+**-gcno=filename**
+
+ This option selects the input description file generated by the compiler while
+ instrumenting the program.
+
+
+
+**-gcda=filename**
+
+ This option selects the coverage data file generated by the instrumented
+ program.
+
+
+
+**-dump**
+
+ This option enables an output dump that is suitable for helping a developer
+ debug **llvm-cov** itself.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+**llvm-cov** returns 1 if it cannot read input files. Otherwise, it exits with zero.
diff --git a/docs/CommandGuide/llvm-diff.pod b/docs/CommandGuide/llvm-diff.rst
index ffe0b48..991d4fe 100644
--- a/docs/CommandGuide/llvm-diff.pod
+++ b/docs/CommandGuide/llvm-diff.rst
@@ -1,16 +1,19 @@
-=pod
+llvm-diff - LLVM structural 'diff'
+==================================
-=head1 NAME
-llvm-diff - LLVM structural 'diff'
+SYNOPSIS
+--------
+
-=head1 SYNOPSIS
+**llvm-diff** [*options*] *module 1* *module 2* [*global name ...*]
-B<llvm-diff> [I<options>] I<module 1> I<module 2> [I<global name ...>]
-=head1 DESCRIPTION
+DESCRIPTION
+-----------
-B<llvm-diff> compares the structure of two LLVM modules, primarily
+
+**llvm-diff** compares the structure of two LLVM modules, primarily
focusing on differences in function definitions. Insignificant
differences, such as changes in the ordering of globals or in the
names of local values, are ignored.
@@ -23,31 +26,31 @@ are compared; otherwise, all global values are compared, and
diagnostics are produced for globals which only appear in one module
or the other.
-B<llvm-diff> compares two functions by comparing their basic blocks,
+**llvm-diff** compares two functions by comparing their basic blocks,
beginning with the entry blocks. If the terminators seem to match,
then the corresponding successors are compared; otherwise they are
ignored. This algorithm is very sensitive to changes in control flow,
which tend to stop any downstream changes from being detected.
-B<llvm-diff> is intended as a debugging tool for writers of LLVM
+**llvm-diff** is intended as a debugging tool for writers of LLVM
passes and frontends. It does not have a stable output format.
-=head1 EXIT STATUS
-If B<llvm-diff> finds no differences between the modules, it will exit
+EXIT STATUS
+-----------
+
+
+If **llvm-diff** finds no differences between the modules, it will exit
with 0 and produce no output. Otherwise it will exit with a non-zero
value.
-=head1 BUGS
+
+BUGS
+----
+
Many important differences, like changes in linkage or function
attributes, are not diagnosed.
Changes in memory behavior (for example, coalescing loads) can cause
massive detected differences in blocks.
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-dis.pod b/docs/CommandGuide/llvm-dis.pod
deleted file mode 100644
index 9f4026c..0000000
--- a/docs/CommandGuide/llvm-dis.pod
+++ /dev/null
@@ -1,60 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-dis - LLVM disassembler
-
-=head1 SYNOPSIS
-
-B<llvm-dis> [I<options>] [I<filename>]
-
-=head1 DESCRIPTION
-
-The B<llvm-dis> command is the LLVM disassembler. It takes an LLVM
-bitcode file and converts it into human-readable LLVM assembly language.
-
-If filename is omitted or specified as C<->, B<llvm-dis> reads its
-input from standard input.
-
-If the input is being read from standard input, then B<llvm-dis>
-will send its output to standard output by default. Otherwise, the
-output will be written to a file named after the input file, with
-a C<.ll> suffix added (any existing C<.bc> suffix will first be
-removed). You can override the choice of output file using the
-B<-o> option.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-f>
-
-Enable binary output on terminals. Normally, B<llvm-dis> will refuse to
-write raw bitcode output if the output stream is a terminal. With this option,
-B<llvm-dis> will write raw bitcode regardless of the output device.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-o> F<filename>
-
-Specify the output file name. If F<filename> is -, then the output is sent
-to standard output.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-dis> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value.
-
-=head1 SEE ALSO
-
-L<llvm-as|llvm-as>
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-dis.rst b/docs/CommandGuide/llvm-dis.rst
new file mode 100644
index 0000000..85cdca8
--- /dev/null
+++ b/docs/CommandGuide/llvm-dis.rst
@@ -0,0 +1,69 @@
+llvm-dis - LLVM disassembler
+============================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-dis** [*options*] [*filename*]
+
+
+DESCRIPTION
+-----------
+
+
+The **llvm-dis** command is the LLVM disassembler. It takes an LLVM
+bitcode file and converts it into human-readable LLVM assembly language.
+
+If filename is omitted or specified as ``-``, **llvm-dis** reads its
+input from standard input.
+
+If the input is being read from standard input, then **llvm-dis**
+will send its output to standard output by default. Otherwise, the
+output will be written to a file named after the input file, with
+a ``.ll`` suffix added (any existing ``.bc`` suffix will first be
+removed). You can override the choice of output file using the
+**-o** option.
+
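+For example (file name illustrative):
+
+.. code-block:: sh
+
+   llvm-dis code.bc     # writes LLVM assembly to code.ll
+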
+
+OPTIONS
+-------
+
+
+
+**-f**
+
+ Enable binary output on terminals. Normally, **llvm-dis** will refuse to
+ write raw bitcode output if the output stream is a terminal. With this option,
+ **llvm-dis** will write raw bitcode regardless of the output device.
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**-o** *filename*
+
+ Specify the output file name. If *filename* is -, then the output is sent
+ to standard output.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llvm-dis** succeeds, it will exit with 0. Otherwise, if an error
+occurs, it will exit with a non-zero value.
+
+
+SEE ALSO
+--------
+
+
+llvm-as
diff --git a/docs/CommandGuide/llvm-extract.pod b/docs/CommandGuide/llvm-extract.pod
deleted file mode 100644
index 67f00f0..0000000
--- a/docs/CommandGuide/llvm-extract.pod
+++ /dev/null
@@ -1,85 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-extract - extract a function from an LLVM module
-
-=head1 SYNOPSIS
-
-B<llvm-extract> [I<options>] B<--func> I<function-name> [I<filename>]
-
-=head1 DESCRIPTION
-
-The B<llvm-extract> command takes the name of a function and extracts it from
-the specified LLVM bitcode file. It is primarily used as a debugging tool to
-reduce test cases from larger programs that are triggering a bug.
-
-In addition to extracting the bitcode of the specified function,
-B<llvm-extract> will also remove unreachable global variables, prototypes, and
-unused types.
-
-The B<llvm-extract> command reads its input from standard input if filename is
-omitted or if filename is -. The output is always written to standard output,
-unless the B<-o> option is specified (see below).
-
-=head1 OPTIONS
-
-=over
-
-=item B<-f>
-
-Enable binary output on terminals. Normally, B<llvm-extract> will refuse to
-write raw bitcode output if the output stream is a terminal. With this option,
-B<llvm-extract> will write raw bitcode regardless of the output device.
-
-=item B<--func> I<function-name>
-
-Extract the function named I<function-name> from the LLVM bitcode. May be
-specified multiple times to extract multiple functions at once.
-
-=item B<--rfunc> I<function-regular-expr>
-
-Extract the function(s) matching I<function-regular-expr> from the LLVM bitcode.
-All functions matching the regular expression will be extracted. May be
-specified multiple times.
-
-=item B<--glob> I<global-name>
-
-Extract the global variable named I<global-name> from the LLVM bitcode. May be
-specified multiple times to extract multiple global variables at once.
-
-=item B<--rglob> I<glob-regular-expr>
-
-Extract the global variable(s) matching I<global-regular-expr> from the LLVM
-bitcode. All global variables matching the regular expression will be extracted.
-May be specified multiple times.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-o> I<filename>
-
-Specify the output filename. If filename is "-" (the default), then
-B<llvm-extract> sends its output to standard output.
-
-=item B<-S>
-
-Write output in LLVM intermediate language (instead of bitcode).
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-extract> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value.
-
-=head1 SEE ALSO
-
-L<bugpoint|bugpoint>
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-extract.rst b/docs/CommandGuide/llvm-extract.rst
new file mode 100644
index 0000000..d569e35
--- /dev/null
+++ b/docs/CommandGuide/llvm-extract.rst
@@ -0,0 +1,104 @@
+llvm-extract - extract a function from an LLVM module
+=====================================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-extract** [*options*] **--func** *function-name* [*filename*]
+
+
+DESCRIPTION
+-----------
+
+
+The **llvm-extract** command takes the name of a function and extracts it from
+the specified LLVM bitcode file. It is primarily used as a debugging tool to
+reduce test cases from larger programs that are triggering a bug.
+
+In addition to extracting the bitcode of the specified function,
+**llvm-extract** will also remove unreachable global variables, prototypes, and
+unused types.
+
+The **llvm-extract** command reads its input from standard input if filename is
+omitted or if filename is -. The output is always written to standard output,
+unless the **-o** option is specified (see below).
+
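+For example, to pull one function out of a module (names illustrative):
+
+.. code-block:: sh
+
+   llvm-extract --func foo program.bc -o foo.bc
+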
+
+OPTIONS
+-------
+
+
+
+**-f**
+
+ Enable binary output on terminals. Normally, **llvm-extract** will refuse to
+ write raw bitcode output if the output stream is a terminal. With this option,
+ **llvm-extract** will write raw bitcode regardless of the output device.
+
+
+
+**--func** *function-name*
+
+ Extract the function named *function-name* from the LLVM bitcode. May be
+ specified multiple times to extract multiple functions at once.
+
+
+
+**--rfunc** *function-regular-expr*
+
+ Extract the function(s) matching *function-regular-expr* from the LLVM bitcode.
+ All functions matching the regular expression will be extracted. May be
+ specified multiple times.
+
+
+
+**--glob** *global-name*
+
+ Extract the global variable named *global-name* from the LLVM bitcode. May be
+ specified multiple times to extract multiple global variables at once.
+
+
+
+**--rglob** *glob-regular-expr*
+
+ Extract the global variable(s) matching *glob-regular-expr* from the LLVM
+ bitcode. All global variables matching the regular expression will be
+ extracted. May be specified multiple times.
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**-o** *filename*
+
+ Specify the output filename. If filename is "-" (the default), then
+ **llvm-extract** sends its output to standard output.
+
+
+
+**-S**
+
+ Write output in LLVM intermediate language (instead of bitcode).
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llvm-extract** succeeds, it will exit with 0. Otherwise, if an error
+occurs, it will exit with a non-zero value.
+
+
+SEE ALSO
+--------
+
+
+bugpoint
diff --git a/docs/CommandGuide/llvm-ld.pod b/docs/CommandGuide/llvm-ld.pod
deleted file mode 100644
index efa9ebd..0000000
--- a/docs/CommandGuide/llvm-ld.pod
+++ /dev/null
@@ -1,234 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-ld - LLVM linker
-
-=head1 SYNOPSIS
-
-B<llvm-ld> <options> <files>
-
-=head1 DESCRIPTION
-
-The B<llvm-ld> tool takes a set of LLVM bitcode files and links them
-together into a single LLVM bitcode file. The output bitcode file can be
-another bitcode file or an executable bitcode program. Using additional
-options, B<llvm-ld> is able to produce native code executables.
-
-The B<llvm-ld> tool is the main linker for LLVM. It is used to link together
-the output of LLVM front-end compilers and run "link time" optimizations (mostly
-the inter-procedural kind).
-
-The B<llvm-ld> tools attempts to mimic the interface provided by the default
-system linker so that it can act as a I<drop-in> replacement.
-
-=head2 Search Order
-
-When looking for objects specified on the command line, B<llvm-ld> will search
-for the object first in the current directory and then in the directory
-specified by the B<LLVM_LIB_SEARCH_PATH> environment variable. If it cannot
-find the object, it fails.
-
-When looking for a library specified with the B<-l> option, B<llvm-ld> first
-attempts to load a file with that name from the current directory. If that
-fails, it looks for libI<library>.bc, libI<library>.a, or libI<library>.I<shared
-library extension>, in that order, in each directory added to the library search
-path with the B<-L> option. These directories are searched in the order they
-are specified. If the library cannot be located, then B<llvm-ld> looks in the
-directory specified by the B<LLVM_LIB_SEARCH_PATH> environment variable. If it
-does not find a library there, it fails.
-
-The I<shared library extension> may be I<.so>, I<.dyld>, I<.dll>, or something
-different, depending upon the system.
-
-The B<-L> option is global. It does not matter where it is specified in the
-list of command line arguments; the directory is simply added to the search path
-and is applied to all libraries, preceding or succeeding, in the command line.
-
-=head2 Link order
-
-All object and bitcode files are linked first in the order they were
-specified on the command line. All library files are linked next.
-Some libraries may not be linked into the object program; see below.
-
-=head2 Library Linkage
-
-Object files and static bitcode objects are always linked into the output
-file. Library archives (.a files) load only the objects within the archive
-that define symbols needed by the output file. Hence, libraries should be
-listed after the object files and libraries which need them; otherwise, the
-library may not be linked in, and the dependent library will not have its
-undefined symbols defined.
-
-=head2 Native code generation
-
-The B<llvm-ld> program has limited support for native code generation, when
-using the B<-native> or B<-native-cbe> options. Native code generation is
-performed by converting the linked bitcode into native assembly (.s) or C code
-and running the system compiler (typically gcc) on the result.
-
-=head1 OPTIONS
-
-=head2 General Options
-
-=over
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-v>
-
-Specifies verbose mode. In this mode the linker will print additional
-information about the actions it takes, programs it executes, etc.
-
-=item B<-stats>
-
-Print statistics.
-
-=item B<-time-passes>
-
-Record the amount of time needed for each pass and print it to standard
-error.
-
-=back
-
-=head2 Input/Output Options
-
-=over
-
-=item B<-o> F<filename>
-
-This overrides the default output file and specifies the name of the file that
-should be generated by the linker. By default, B<llvm-ld> generates a file named
-F<a.out> for compatibility with B<ld>. The output will be written to
-F<filename>.
-
-=item B<-b> F<filename>
-
-This option can be used to override the output bitcode file name. By default,
-the name of the bitcode output file is one more ".bc" suffix added to the name
-specified by B<-o filename> option.
-
-=item B<-l>F<name>
-
-This option specifies the F<name> of a library to search when resolving symbols
-for the program. Only the base name should be specified as F<name>, without a
-F<lib> prefix or any suffix.
-
-=item B<-L>F<Path>
-
-This option tells B<llvm-ld> to look in F<Path> to find any library subsequently
-specified with the B<-l> option. The paths will be searched in the order in
-which they are specified on the command line. If the library is still not found,
-a small set of system specific directories will also be searched. Note that
-libraries specified with the B<-l> option that occur I<before> any B<-L> options
-will not search the paths given by the B<-L> options following it.
-
-=item B<-link-as-library>
-
-Link the bitcode files together as a library, not an executable. In this mode,
-undefined symbols will be permitted.
-
-=item B<-r>
-
-An alias for -link-as-library.
-
-=item B<-native>
-
-Generate a native machine code executable.
-
-When generating native executables, B<llvm-ld> first checks for a bitcode
-version of the library and links it in, if necessary. If the library is
-missing, B<llvm-ld> skips it. Then, B<llvm-ld> links in the same
-libraries as native code.
-
-In this way, B<llvm-ld> should be able to link in optimized bitcode
-subsets of common libraries and then link in any part of the library that
-hasn't been converted to bitcode.
-
-=item B<-native-cbe>
-
-Generate a native machine code executable with the LLVM C backend.
-
-This option is identical to the B<-native> option, but uses the
-C backend to generate code for the program instead of an LLVM native
-code generator.
-
-=back
-
-=head2 Optimization Options
-
-=over
-
-=item B<-disable-inlining>
-
-Do not run the inlining pass. Functions will not be inlined into other
-functions.
-
-=item B<-disable-opt>
-
-Completely disable optimization.
-
-=item B<-disable-internalize>
-
-Do not mark all symbols as internal.
-
-=item B<-verify-each>
-
-Run the verification pass after each of the passes to verify intermediate
-results.
-
-=item B<-strip-all>
-
-Strip all debug and symbol information from the executable to make it smaller.
-
-=item B<-strip-debug>
-
-Strip all debug information from the executable to make it smaller.
-
-=item B<-s>
-
-An alias for B<-strip-all>.
-
-=item B<-S>
-
-An alias for B<-strip-debug>.
-
-=item B<-export-dynamic>
-
-An alias for B<-disable-internalize>
-
-=item B<-post-link-opt>F<Path>
-
-Run post-link optimization program. After linking is completed a bitcode file
-will be generated. It will be passed to the program specified by F<Path> as the
-first argument. The second argument to the program will be the name of a
-temporary file into which the program should place its optimized output. For
-example, the "no-op optimization" would be a simple shell script:
-
- #!/bin/bash
- cp $1 $2
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-ld> succeeds, it will exit with 0 return code. If an error occurs,
-it will exit with a non-zero return code.
-
-=head1 ENVIRONMENT
-
-The C<LLVM_LIB_SEARCH_PATH> environment variable is used to find bitcode
-libraries. Any paths specified in this variable will be searched after the C<-L>
-options.
-
-=head1 SEE ALSO
-
-L<llvm-link|llvm-link>
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-link.pod b/docs/CommandGuide/llvm-link.pod
deleted file mode 100644
index 1e466a5..0000000
--- a/docs/CommandGuide/llvm-link.pod
+++ /dev/null
@@ -1,79 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-link - LLVM linker
-
-=head1 SYNOPSIS
-
-B<llvm-link> [I<options>] I<filename ...>
-
-=head1 DESCRIPTION
-
-B<llvm-link> takes several LLVM bitcode files and links them together into a
-single LLVM bitcode file. It writes the output file to standard output, unless
-the B<-o> option is used to specify a filename.
-
-B<llvm-link> attempts to load the input files from the current directory. If
-that fails, it looks for each file in each of the directories specified by the
-B<-L> options on the command line. The library search paths are global; each
-one is searched for every input file if necessary. The directories are searched
-in the order they were specified on the command line.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-L> F<directory>
-
-Add the specified F<directory> to the library search path. When looking for
-libraries, B<llvm-link> will look in path name for libraries. This option can be
-specified multiple times; B<llvm-link> will search inside these directories in
-the order in which they were specified on the command line.
-
-=item B<-f>
-
-Enable binary output on terminals. Normally, B<llvm-link> will refuse to
-write raw bitcode output if the output stream is a terminal. With this option,
-B<llvm-link> will write raw bitcode regardless of the output device.
-
-=item B<-o> F<filename>
-
-Specify the output file name. If F<filename> is C<->, then B<llvm-link> will
-write its output to standard output.
-
-=item B<-S>
-
-Write output in LLVM intermediate language (instead of bitcode).
-
-=item B<-d>
-
-If specified, B<llvm-link> prints a human-readable version of the output
-bitcode file to standard error.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-v>
-
-Verbose mode. Print information about what B<llvm-link> is doing. This
-typically includes a message for each bitcode file linked in and for each
-library found.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-link> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value.
-
-=head1 SEE ALSO
-
-L<gccld|gccld>
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-link.rst b/docs/CommandGuide/llvm-link.rst
new file mode 100644
index 0000000..63019d7
--- /dev/null
+++ b/docs/CommandGuide/llvm-link.rst
@@ -0,0 +1,96 @@
+llvm-link - LLVM linker
+=======================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-link** [*options*] *filename ...*
+
+
+DESCRIPTION
+-----------
+
+
+**llvm-link** takes several LLVM bitcode files and links them together into a
+single LLVM bitcode file. It writes the output file to standard output, unless
+the **-o** option is used to specify a filename.
+
+**llvm-link** attempts to load the input files from the current directory. If
+that fails, it looks for each file in each of the directories specified by the
+**-L** options on the command line. The library search paths are global; each
+one is searched for every input file if necessary. The directories are searched
+in the order they were specified on the command line.
+
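+For example (file names illustrative):
+
+.. code-block:: sh
+
+   llvm-link a.bc b.bc -o program.bc
+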
+
+OPTIONS
+-------
+
+
+
+**-L** *directory*
+
+ Add the specified *directory* to the library search path. When looking for
+ libraries, **llvm-link** will look in these directories. This option can be
+ specified multiple times; **llvm-link** will search the directories in the
+ order in which they were specified on the command line.
+
+
+
+**-f**
+
+ Enable binary output on terminals. Normally, **llvm-link** will refuse to
+ write raw bitcode output if the output stream is a terminal. With this option,
+ **llvm-link** will write raw bitcode regardless of the output device.
+
+
+
+**-o** *filename*
+
+ Specify the output file name. If *filename* is ``-``, then **llvm-link** will
+ write its output to standard output.
+
+
+
+**-S**
+
+ Write output in LLVM intermediate language (instead of bitcode).
+
+
+
+**-d**
+
+ If specified, **llvm-link** prints a human-readable version of the output
+ bitcode file to standard error.
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**-v**
+
+ Verbose mode. Print information about what **llvm-link** is doing. This
+ typically includes a message for each bitcode file linked in and for each
+ library found.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llvm-link** succeeds, it will exit with 0. Otherwise, if an error
+occurs, it will exit with a non-zero value.
+
+
+SEE ALSO
+--------
+
+
+gccld
diff --git a/docs/CommandGuide/llvm-nm.pod b/docs/CommandGuide/llvm-nm.pod
deleted file mode 100644
index a6dc490..0000000
--- a/docs/CommandGuide/llvm-nm.pod
+++ /dev/null
@@ -1,122 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-nm - list LLVM bitcode file's symbol table
-
-=head1 SYNOPSIS
-
-B<llvm-nm> [I<options>] [I<filenames...>]
-
-=head1 DESCRIPTION
-
-The B<llvm-nm> utility lists the names of symbols from the LLVM bitcode files,
-or B<ar> archives containing LLVM bitcode files, named on the command line.
-Each symbol is listed along with some simple information about its provenance.
-If no file name is specified, or I<-> is used as a file name, B<llvm-nm> will
-process a bitcode file on its standard input stream.
-
-B<llvm-nm>'s default output format is the traditional BSD B<nm> output format.
-Each such output record consists of an (optional) 8-digit hexadecimal address,
-followed by a type code character, followed by a name, for each symbol. One
-record is printed per line; fields are separated by spaces. When the address is
-omitted, it is replaced by 8 spaces.
-
-Type code characters currently supported, and their meanings, are as follows:
-
-=over
-
-=item U
-
-Named object is referenced but undefined in this bitcode file
-
-=item C
-
-Common (multiple definitions link together into one def)
-
-=item W
-
-Weak reference (multiple definitions link together into zero or one definitions)
-
-=item t
-
-Local function (text) object
-
-=item T
-
-Global function (text) object
-
-=item d
-
-Local data object
-
-=item D
-
-Global data object
-
-=item ?
-
-Something unrecognizable
-
-=back
-
-Because LLVM bitcode files typically contain objects that are not considered to
-have addresses until they are linked into an executable image or dynamically
-compiled "just-in-time", B<llvm-nm> does not print an address for any symbol,
-even symbols which are defined in the bitcode file.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-P>
-
-Use POSIX.2 output format. Alias for B<--format=posix>.
-
-=item B<-B> (default)
-
-Use BSD output format. Alias for B<--format=bsd>.
-
-=item B<-help>
-
-Print a summary of command-line options and their meanings.
-
-=item B<--defined-only>
-
-Print only symbols defined in this bitcode file (as opposed to
-symbols which may be referenced by objects in this file, but not
-defined in this file.)
-
-=item B<--extern-only>, B<-g>
-
-Print only symbols whose definitions are external; that is, accessible
-from other bitcode files.
-
-=item B<--undefined-only>, B<-u>
-
-Print only symbols referenced but not defined in this bitcode file.
-
-=item B<--format=>I<fmt>, B<-f>
-
-Select an output format; I<fmt> may be I<sysv>, I<posix>, or I<bsd>. The
-default is I<bsd>.
-
-=back
-
-=head1 BUGS
-
-B<llvm-nm> cannot demangle C++ mangled names, like GNU B<nm> can.
-
-=head1 EXIT STATUS
-
-B<llvm-nm> exits with an exit code of zero.
-
-=head1 SEE ALSO
-
-L<llvm-dis|llvm-dis>, ar(1), nm(1)
-
-=head1 AUTHOR
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-nm.rst b/docs/CommandGuide/llvm-nm.rst
new file mode 100644
index 0000000..cbc7af2
--- /dev/null
+++ b/docs/CommandGuide/llvm-nm.rst
@@ -0,0 +1,189 @@
+llvm-nm - list LLVM bitcode and object files' symbol tables
+============================================================
+
+
+SYNOPSIS
+--------
+
+
+:program:`llvm-nm` [*options*] [*filenames...*]
+
+
+DESCRIPTION
+-----------
+
+
+The :program:`llvm-nm` utility lists the names of symbols from the LLVM bitcode
+files, object files, or :program:`ar` archives containing them, named on the
+command line. Each symbol is listed along with some simple information about its
+provenance. If no file name is specified, or *-* is used as a file name,
+:program:`llvm-nm` will process a file on its standard input stream.
+
+:program:`llvm-nm`'s default output format is the traditional BSD :program:`nm`
+output format. Each such output record consists of an (optional) 8-digit
+hexadecimal address, followed by a type code character, followed by a name, for
+each symbol. One record is printed per line; fields are separated by spaces.
+When the address is omitted, it is replaced by 8 spaces.
+
+Type code characters currently supported, and their meanings, are as follows:
+
+
+U
+
+ Named object is referenced but undefined in this bitcode file
+
+
+
+C
+
+ Common (multiple definitions link together into one def)
+
+
+
+W
+
+ Weak reference (multiple definitions link together into zero or one definitions)
+
+
+
+t
+
+ Local function (text) object
+
+
+
+T
+
+ Global function (text) object
+
+
+
+d
+
+ Local data object
+
+
+
+D
+
+ Global data object
+
+
+
+?
+
+ Something unrecognizable
+
+
+
+Because LLVM bitcode files typically contain objects that are not considered to
+have addresses until they are linked into an executable image or dynamically
+compiled "just-in-time", :program:`llvm-nm` does not print an address for any
+symbol in an LLVM bitcode file, even symbols which are defined in the bitcode
+file.
+
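+As an illustration (the symbol names are hypothetical), a bitcode file might
+list as follows, with eight spaces standing in for each omitted address:
+
+.. code-block:: sh
+
+   $ llvm-nm example.bc
+            T main
+            t helper
+            U printf
+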
+
+OPTIONS
+-------
+
+
+.. program:: llvm-nm
+
+
+.. option:: -B (default)
+
+ Use BSD output format. Alias for :option:`--format=bsd`.
+
+
+.. option:: -P
+
+ Use POSIX.2 output format. Alias for :option:`--format=posix`.
+
+
+.. option:: --debug-syms, -a
+
+ Show all symbols, even debugger-only symbols.
+
+
+.. option:: --defined-only
+
+ Print only symbols defined in this file (as opposed to
+ symbols which may be referenced by objects in this file, but not
+ defined in this file.)
+
+
+.. option:: --dynamic, -D
+
+ Display dynamic symbols instead of normal symbols.
+
+
+.. option:: --extern-only, -g
+
+ Print only symbols whose definitions are external; that is, accessible
+ from other files.
+
+
+.. option:: --format=format, -f format
+
+ Select an output format; *format* may be *sysv*, *posix*, or *bsd*. The default
+ is *bsd*.
+
+
+.. option:: -help
+
+ Print a summary of command-line options and their meanings.
+
+
+.. option:: --no-sort, -p
+
+ Show symbols in the order encountered.
+
+
+.. option:: --numeric-sort, -n, -v
+
+ Sort symbols by address.
+
+
+.. option:: --print-file-name, -A, -o
+
+ Precede each symbol with the file it came from.
+
+
+.. option:: --print-size, -S
+
+ Show symbol size instead of address.
+
+
+.. option:: --size-sort
+
+ Sort symbols by size.
+
+
+.. option:: --undefined-only, -u
+
+ Print only symbols referenced but not defined in this file.
+
+
+BUGS
+----
+
+
+ * Unlike GNU :program:`nm`, :program:`llvm-nm` cannot demangle C++ mangled
+   names.
+
+ * :program:`llvm-nm` does not support the full set of arguments that GNU
+ :program:`nm` does.
+
+
+EXIT STATUS
+-----------
+
+
+:program:`llvm-nm` exits with an exit code of zero.
+
+
+SEE ALSO
+--------
+
+
+llvm-dis|llvm-dis, ar(1), nm(1)
diff --git a/docs/CommandGuide/llvm-prof.pod b/docs/CommandGuide/llvm-prof.pod
deleted file mode 100644
index 4b2e09d..0000000
--- a/docs/CommandGuide/llvm-prof.pod
+++ /dev/null
@@ -1,57 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-prof - print execution profile of LLVM program
-
-=head1 SYNOPSIS
-
-B<llvm-prof> [I<options>] [I<bitcode file>] [I<llvmprof.out>]
-
-=head1 DESCRIPTION
-
-The B<llvm-prof> tool reads in an F<llvmprof.out> file (which can
-optionally use a specific file with the third program argument), a bitcode file
-for the program, and produces a human readable report, suitable for determining
-where the program hotspots are.
-
-This program is often used in conjunction with the F<utils/profile.pl>
-script. This script automatically instruments a program, runs it with the JIT,
-then runs B<llvm-prof> to format a report. To get more information about
-F<utils/profile.pl>, execute it with the B<-help> option.
-
-=head1 OPTIONS
-
-=over
-
-=item B<--annotated-llvm> or B<-A>
-
-In addition to the normal report printed, print out the code for the
-program, annotated with execution frequency information. This can be
-particularly useful when trying to visualize how frequently basic blocks
-are executed. This is most useful with basic block profiling
-information or better.
-
-=item B<--print-all-code>
-
-Using this option enables the B<--annotated-llvm> option, but it
-prints the entire module, instead of just the most commonly executed
-functions.
-
-=item B<--time-passes>
-
-Record the amount of time needed for each pass and print it to standard
-error.
-
-=back
-
-=head1 EXIT STATUS
-
-B<llvm-prof> returns 1 if it cannot load the bitcode file or the profile
-information. Otherwise, it exits with zero.
-
-=head1 AUTHOR
-
-B<llvm-prof> is maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-prof.rst b/docs/CommandGuide/llvm-prof.rst
new file mode 100644
index 0000000..e8d0b19
--- /dev/null
+++ b/docs/CommandGuide/llvm-prof.rst
@@ -0,0 +1,63 @@
+llvm-prof - print execution profile of LLVM program
+===================================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-prof** [*options*] [*bitcode file*] [*llvmprof.out*]
+
+
+DESCRIPTION
+-----------
+
+
+The **llvm-prof** tool reads in an *llvmprof.out* file (a different file may
+optionally be given as the third program argument), together with a bitcode
+file for the program, and produces a human-readable report suitable for
+determining where the program hotspots are.
+
+This program is often used in conjunction with the *utils/profile.pl*
+script. This script automatically instruments a program, runs it with the JIT,
+then runs **llvm-prof** to format a report. To get more information about
+*utils/profile.pl*, execute it with the **-help** option.
+
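+A typical direct invocation (file names illustrative) looks like:
+
+.. code-block:: sh
+
+   llvm-prof program.bc llvmprof.out
+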
+
+OPTIONS
+-------
+
+
+
+**--annotated-llvm** or **-A**
+
+ In addition to the normal report printed, print out the code for the
+ program, annotated with execution frequency information. This can be
+ particularly useful when trying to visualize how frequently basic blocks
+ are executed. This is most useful with basic block profiling
+ information or better.
+
+
+
+**--print-all-code**
+
+ Using this option enables the **--annotated-llvm** option, but it
+ prints the entire module, instead of just the most commonly executed
+ functions.
+
+
+
+**--time-passes**
+
+ Record the amount of time needed for each pass and print it to standard
+ error.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+**llvm-prof** returns 1 if it cannot load the bitcode file or the profile
+information. Otherwise, it exits with zero.
diff --git a/docs/CommandGuide/llvm-ranlib.pod b/docs/CommandGuide/llvm-ranlib.pod
deleted file mode 100644
index 431bc55..0000000
--- a/docs/CommandGuide/llvm-ranlib.pod
+++ /dev/null
@@ -1,52 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-ranlib - Generate index for LLVM archive
-
-=head1 SYNOPSIS
-
-B<llvm-ranlib> [--version] [-help] <archive-file>
-
-=head1 DESCRIPTION
-
-The B<llvm-ranlib> command is similar to the common Unix utility, C<ranlib>. It
-adds or updates the symbol table in an LLVM archive file. Note that using the
-B<llvm-ar> modifier F<s> is usually more efficient than running B<llvm-ranlib>
-which is only provided only for completness and compatibility. Unlike other
-implementations of C<ranlib>, B<llvm-ranlib> indexes LLVM bitcode files, not
-native object modules. You can list the contents of the symbol table with the
-C<llvm-nm -s> command.
-
-=head1 OPTIONS
-
-=over
-
-=item F<archive-file>
-
-Specifies the archive-file to which the symbol table is added or updated.
-
-=item F<--version>
-
-Print the version of B<llvm-ranlib> and exit without building a symbol table.
-
-=item F<-help>
-
-Print usage help for B<llvm-ranlib> and exit without building a symbol table.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<llvm-ranlib> succeeds, it will exit with 0. If an error occurs, a non-zero
-exit code will be returned.
-
-=head1 SEE ALSO
-
-L<llvm-ar|llvm-ar>, ranlib(1)
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-ranlib.rst b/docs/CommandGuide/llvm-ranlib.rst
new file mode 100644
index 0000000..6658818
--- /dev/null
+++ b/docs/CommandGuide/llvm-ranlib.rst
@@ -0,0 +1,61 @@
+llvm-ranlib - Generate index for LLVM archive
+=============================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-ranlib** [--version] [-help] <archive-file>
+
+
+DESCRIPTION
+-----------
+
+
+The **llvm-ranlib** command is similar to the common Unix utility ``ranlib``. It
+adds or updates the symbol table in an LLVM archive file. Note that using the
+**llvm-ar** modifier *s* is usually more efficient than running **llvm-ranlib**,
+which is provided only for completeness and compatibility. Unlike other
+implementations of ``ranlib``, **llvm-ranlib** indexes LLVM bitcode files, not
+native object modules. You can list the contents of the symbol table with the
+``llvm-nm -s`` command.
+
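+For example (archive name illustrative):
+
+.. code-block:: sh
+
+   llvm-ranlib libfoo.a    # add or update the archive's symbol table
+   llvm-nm -s libfoo.a     # list the symbol table contents
+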
+
+OPTIONS
+-------
+
+
+
+*archive-file*
+
+ Specifies the archive-file to which the symbol table is added or updated.
+
+
+
+**--version**
+
+ Print the version of **llvm-ranlib** and exit without building a symbol table.
+
+
+
+**-help**
+
+ Print usage help for **llvm-ranlib** and exit without building a symbol table.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **llvm-ranlib** succeeds, it will exit with 0. If an error occurs, a non-zero
+exit code will be returned.
+
+
+SEE ALSO
+--------
+
+
+llvm-ar, ranlib(1)
diff --git a/docs/CommandGuide/llvm-stress.pod b/docs/CommandGuide/llvm-stress.pod
deleted file mode 100644
index 92083d2..0000000
--- a/docs/CommandGuide/llvm-stress.pod
+++ /dev/null
@@ -1,42 +0,0 @@
-=pod
-
-=head1 NAME
-
-llvm-stress - generate random .ll files
-
-=head1 SYNOPSIS
-
-B<llvm-cov> [-gcno=filename] [-gcda=filename] [dump]
-
-=head1 DESCRIPTION
-
-The B<llvm-stress> tool is used to generate random .ll files that can be used to
-test different components of LLVM.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-o> I<filename>
-
-Specify the output filename.
-
-=item B<-size> I<size>
-
-Specify the size of the generated .ll file.
-
-=item B<-seed> I<seed>
-
-Specify the seed to be used for the randomly generated instructions.
-
-=back
-
-=head1 EXIT STATUS
-
-B<llvm-stress> returns 0.
-
-=head1 AUTHOR
-
-B<llvm-stress> is maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/llvm-stress.rst b/docs/CommandGuide/llvm-stress.rst
new file mode 100644
index 0000000..44aa32c
--- /dev/null
+++ b/docs/CommandGuide/llvm-stress.rst
@@ -0,0 +1,48 @@
+llvm-stress - generate random .ll files
+=======================================
+
+
+SYNOPSIS
+--------
+
+
+**llvm-stress** [-size=filesize] [-seed=initialseed] [-o=outfile]
+
+
+DESCRIPTION
+-----------
+
+
+The **llvm-stress** tool is used to generate random .ll files that can be used to
+test different components of LLVM.
+
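+For example (size and seed values illustrative):
+
+.. code-block:: sh
+
+   llvm-stress -size=1024 -seed=42 -o random.ll
+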
+
+OPTIONS
+-------
+
+
+
+**-o** *filename*
+
+ Specify the output filename.
+
+
+
+**-size** *size*
+
+ Specify the size of the generated .ll file.
+
+
+
+**-seed** *seed*
+
+ Specify the seed to be used for the randomly generated instructions.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+**llvm-stress** returns 0.
diff --git a/docs/CommandGuide/manpage.css b/docs/CommandGuide/manpage.css
deleted file mode 100644
index c922564..0000000
--- a/docs/CommandGuide/manpage.css
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Based on http://www.perldoc.com/css/perldoc.css */
-
-@import url("../llvm.css");
-
-body { font-family: Arial,Helvetica; }
-
-blockquote { margin: 10pt; }
-
-h1, a { color: #336699; }
-
-
-/*** Top menu style ****/
-.mmenuon {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ff6600; font-size: 10pt;
-}
-.mmenuoff {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ffffff; font-size: 10pt;
-}
-.cpyright {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ffffff; font-size: xx-small;
-}
-.cpyrightText {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #ffffff; font-size: xx-small;
-}
-.sections {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 11pt;
-}
-.dsections {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 12pt;
-}
-.slink {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #000000; font-size: 9pt;
-}
-
-.slink2 { font-family: Arial,Helvetica; text-decoration: none; color: #336699; }
-
-.maintitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 18pt;
-}
-.dblArrow {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: small;
-}
-.menuSec {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: small;
-}
-
-.newstext {
- font-family: Arial,Helvetica; font-size: small;
-}
-
-.linkmenu {
- font-family: Arial,Helvetica; color: #000000; font-weight: bold;
- text-decoration: none;
-}
-
-P {
- font-family: Arial,Helvetica;
-}
-
-PRE {
- font-size: 10pt;
-}
-.quote {
- font-family: Times; text-decoration: none;
- color: #000000; font-size: 9pt; font-style: italic;
-}
-.smstd { font-family: Arial,Helvetica; color: #000000; font-size: x-small; }
-.std { font-family: Arial,Helvetica; color: #000000; }
-.meerkatTitle {
- font-family: sans-serif; font-size: x-small; color: black; }
-
-.meerkatDescription { font-family: sans-serif; font-size: 10pt; color: black }
-.meerkatCategory {
- font-family: sans-serif; font-size: 9pt; font-weight: bold; font-style: italic;
- color: brown; }
-.meerkatChannel {
- font-family: sans-serif; font-size: 9pt; font-style: italic; color: brown; }
-.meerkatDate { font-family: sans-serif; font-size: xx-small; color: #336699; }
-
-.tocTitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #333333; font-size: 10pt;
-}
-
-.toc-item {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #336699; font-size: 10pt; text-decoration: underline;
-}
-
-.perlVersion {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #336699; font-size: 10pt; text-decoration: none;
-}
-
-.podTitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #000000;
-}
-
-.docTitle {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #000000; font-size: 10pt;
-}
-.dotDot {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #000000; font-size: 9pt;
-}
-
-.docSec {
- font-family: Arial,Helvetica; font-weight: normal;
- color: #333333; font-size: 9pt;
-}
-.docVersion {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 10pt;
-}
-
-.docSecs-on {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #ff0000; font-size: 10pt;
-}
-.docSecs-off {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #333333; font-size: 10pt;
-}
-
-h2 {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: medium;
-}
-h1 {
- font-family: Verdana,Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: large;
-}
-
-DL {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: none;
- color: #333333; font-size: 10pt;
-}
-
-UL > LI > A {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #336699; font-size: 10pt;
-}
-
-.moduleInfo {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #333333; font-size: 11pt;
-}
-
-.moduleInfoSec {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 10pt;
-}
-
-.moduleInfoVal {
- font-family: Arial,Helvetica; font-weight: normal; text-decoration: underline;
- color: #000000; font-size: 10pt;
-}
-
-.cpanNavTitle {
- font-family: Arial,Helvetica; font-weight: bold;
- color: #ffffff; font-size: 10pt;
-}
-.cpanNavLetter {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #333333; font-size: 9pt;
-}
-.cpanCat {
- font-family: Arial,Helvetica; font-weight: bold; text-decoration: none;
- color: #336699; font-size: 9pt;
-}
-
-.bttndrkblue-bkgd-top {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgtop.gif);
-}
-.bttndrkblue-bkgd-left {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgleft.gif);
-}
-.bttndrkblue-bkgd {
- padding-top: 0px;
- padding-bottom: 0px;
- margin-bottom: 0px;
- margin-top: 0px;
- background-repeat: no-repeat;
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgmiddle.gif);
- vertical-align: top;
-}
-.bttndrkblue-bkgd-right {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgright.gif);
-}
-.bttndrkblue-bkgd-bottom {
- background-color: #225688;
- background-image: url(/global/mvc_objects/images/bttndrkblue_bgbottom.gif);
-}
-.bttndrkblue-text a {
- color: #ffffff;
- text-decoration: none;
-}
-a.bttndrkblue-text:hover {
- color: #ffDD3C;
- text-decoration: none;
-}
-.bg-ltblue {
- background-color: #f0f5fa;
-}
-
-.border-left-b {
- background: #f0f5fa url(/i/corner-leftline.gif) repeat-y;
-}
-
-.border-right-b {
- background: #f0f5fa url(/i/corner-rightline.gif) repeat-y;
-}
-
-.border-top-b {
- background: #f0f5fa url(/i/corner-topline.gif) repeat-x;
-}
-
-.border-bottom-b {
- background: #f0f5fa url(/i/corner-botline.gif) repeat-x;
-}
-
-.border-right-w {
- background: #ffffff url(/i/corner-rightline.gif) repeat-y;
-}
-
-.border-top-w {
- background: #ffffff url(/i/corner-topline.gif) repeat-x;
-}
-
-.border-bottom-w {
- background: #ffffff url(/i/corner-botline.gif) repeat-x;
-}
-
-.bg-white {
- background-color: #ffffff;
-}
-
-.border-left-w {
- background: #ffffff url(/i/corner-leftline.gif) repeat-y;
-}
diff --git a/docs/CommandGuide/opt.pod b/docs/CommandGuide/opt.pod
deleted file mode 100644
index f5f4968..0000000
--- a/docs/CommandGuide/opt.pod
+++ /dev/null
@@ -1,143 +0,0 @@
-=pod
-
-=head1 NAME
-
-opt - LLVM optimizer
-
-=head1 SYNOPSIS
-
-B<opt> [I<options>] [I<filename>]
-
-=head1 DESCRIPTION
-
-The B<opt> command is the modular LLVM optimizer and analyzer. It takes LLVM
-source files as input, runs the specified optimizations or analyses on it, and then
-outputs the optimized file or the analysis results. The function of
-B<opt> depends on whether the B<-analyze> option is given.
-
-When B<-analyze> is specified, B<opt> performs various analyses of the input
-source. It will usually print the results on standard output, but in a few
-cases, it will print output to standard error or generate a file with the
-analysis output, which is usually done when the output is meant for another
-program.
-
-While B<-analyze> is I<not> given, B<opt> attempts to produce an optimized
-output file. The optimizations available via B<opt> depend upon what
-libraries were linked into it as well as any additional libraries that have
-been loaded with the B<-load> option. Use the B<-help> option to determine
-what optimizations you can use.
-
-If I<filename> is omitted from the command line or is I<->, B<opt> reads its
-input from standard input. Inputs can be in either the LLVM assembly language
-format (.ll) or the LLVM bitcode format (.bc).
-
-If an output filename is not specified with the B<-o> option, B<opt>
-writes its output to the standard output.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-f>
-
-Enable binary output on terminals. Normally, B<opt> will refuse to
-write raw bitcode output if the output stream is a terminal. With this option,
-B<opt> will write raw bitcode regardless of the output device.
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-o> I<filename>
-
-Specify the output filename.
-
-=item B<-S>
-
-Write output in LLVM intermediate language (instead of bitcode).
-
-=item B<-{passname}>
-
-B<opt> provides the ability to run any of LLVM's optimization or analysis passes
-in any order. The B<-help> option lists all the passes available. The order in
-which the options occur on the command line are the order in which they are
-executed (within pass constraints).
-
-=item B<-std-compile-opts>
-
-This is short hand for a standard list of I<compile time optimization> passes.
-This is typically used to optimize the output from the llvm-gcc front end. It
-might be useful for other front end compilers as well. To discover the full set
-of options available, use the following command:
-
- llvm-as < /dev/null | opt -std-compile-opts -disable-output -debug-pass=Arguments
-
-=item B<-disable-inlining>
-
-This option is only meaningful when B<-std-compile-opts> is given. It simply
-removes the inlining pass from the standard list.
-
-=item B<-disable-opt>
-
-This option is only meaningful when B<-std-compile-opts> is given. It disables
-most, but not all, of the B<-std-compile-opts>. The ones that remain are
-B<-verify>, B<-lower-setjmp>, and B<-funcresolve>.
-
-=item B<-strip-debug>
-
-This option causes opt to strip debug information from the module before
-applying other optimizations. It is essentially the same as B<-strip> but it
-ensures that stripping of debug information is done first.
-
-=item B<-verify-each>
-
-This option causes opt to add a verify pass after every pass otherwise specified
-on the command line (including B<-verify>). This is useful for cases where it
-is suspected that a pass is creating an invalid module but it is not clear which
-pass is doing it. The combination of B<-std-compile-opts> and B<-verify-each>
-can quickly track down this kind of problem.
-
-=item B<-profile-info-file> I<filename>
-
-Specify the name of the file loaded by the -profile-loader option.
-
-=item B<-stats>
-
-Print statistics.
-
-=item B<-time-passes>
-
-Record the amount of time needed for each pass and print it to standard
-error.
-
-=item B<-debug>
-
-If this is a debug build, this option will enable debug printouts
-from passes which use the I<DEBUG()> macro. See the B<LLVM Programmer's
-Manual>, section I<#DEBUG> for more information.
-
-=item B<-load>=I<plugin>
-
-Load the dynamic object I<plugin>. This object should register new optimization
-or analysis passes. Once loaded, the object will add new command line options to
-enable various optimizations or analyses. To see the new complete list of
-optimizations, use the B<-help> and B<-load> options together. For example:
-
- opt -load=plugin.so -help
-
-=item B<-p>
-
-Print module after each transformation.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<opt> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value.
-
-=head1 AUTHORS
-
-Maintained by the LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/opt.rst b/docs/CommandGuide/opt.rst
new file mode 100644
index 0000000..72f1903
--- /dev/null
+++ b/docs/CommandGuide/opt.rst
@@ -0,0 +1,183 @@
+opt - LLVM optimizer
+====================
+
+
+SYNOPSIS
+--------
+
+
+**opt** [*options*] [*filename*]
+
+
+DESCRIPTION
+-----------
+
+
+The **opt** command is the modular LLVM optimizer and analyzer. It takes LLVM
+source files as input, runs the specified optimizations or analyses on them,
+and then outputs the optimized file or the analysis results. The function of
+**opt** depends on whether the **-analyze** option is given.
+
+When **-analyze** is specified, **opt** performs various analyses of the input
+source. It will usually print the results on standard output, but in a few
+cases, it will print output to standard error or generate a file with the
+analysis output, which is usually done when the output is meant for another
+program.
+
+When **-analyze** is *not* given, **opt** attempts to produce an optimized
+output file. The optimizations available via **opt** depend upon what
+libraries were linked into it as well as any additional libraries that have
+been loaded with the **-load** option. Use the **-help** option to determine
+what optimizations you can use.
+
+If *filename* is omitted from the command line or is *-*, **opt** reads its
+input from standard input. Inputs can be in either the LLVM assembly language
+format (.ll) or the LLVM bitcode format (.bc).
+
+If an output filename is not specified with the **-o** option, **opt**
+writes its output to the standard output.
+
+
+OPTIONS
+-------
+
+
+
+**-f**
+
+ Enable binary output on terminals. Normally, **opt** will refuse to
+ write raw bitcode output if the output stream is a terminal. With this option,
+ **opt** will write raw bitcode regardless of the output device.
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**-o** *filename*
+
+ Specify the output filename.
+
+
+
+**-S**
+
+ Write output in LLVM intermediate language (instead of bitcode).
+
+
+
+**-{passname}**
+
+ **opt** provides the ability to run any of LLVM's optimization or analysis passes
+ in any order. The **-help** option lists all the passes available. The order in
+ which the options occur on the command line is the order in which they are
+ executed (within pass constraints).
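+
+ For example, a hypothetical invocation (the filenames are illustrative) that
+ runs the *mem2reg* pass followed by *gvn*, emitting readable IR:
+
+
+ .. code-block:: sh
+
+     opt -mem2reg -gvn -S input.ll -o output.ll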
+
+
+
+**-std-compile-opts**
+
+ This is shorthand for a standard list of *compile time optimization* passes.
+ This is typically used to optimize the output from the llvm-gcc front end. It
+ might be useful for other front end compilers as well. To discover the full set
+ of options available, use the following command:
+
+
+ .. code-block:: sh
+
+ llvm-as < /dev/null | opt -std-compile-opts -disable-output -debug-pass=Arguments
+
+
+
+
+**-disable-inlining**
+
+ This option is only meaningful when **-std-compile-opts** is given. It simply
+ removes the inlining pass from the standard list.
+
+
+
+**-disable-opt**
+
+ This option is only meaningful when **-std-compile-opts** is given. It disables
+ most, but not all, of the **-std-compile-opts**. The ones that remain are
+ **-verify**, **-lower-setjmp**, and **-funcresolve**.
+
+
+
+**-strip-debug**
+
+ This option causes **opt** to strip debug information from the module before
+ applying other optimizations. It is essentially the same as **-strip** but it
+ ensures that stripping of debug information is done first.
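+
+ An illustrative sketch (the follow-on pass and the filenames are placeholders):
+
+
+ .. code-block:: sh
+
+     opt -strip-debug -gvn input.bc -o output.bc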
+
+
+
+**-verify-each**
+
+ This option causes **opt** to add a verify pass after every pass otherwise specified
+ on the command line (including **-verify**). This is useful for cases where it
+ is suspected that a pass is creating an invalid module but it is not clear which
+ pass is doing it. The combination of **-std-compile-opts** and **-verify-each**
+ can quickly track down this kind of problem.
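+
+ A minimal sketch of that combination (the input filename is illustrative):
+
+
+ .. code-block:: sh
+
+     opt -std-compile-opts -verify-each -disable-output input.ll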
+
+
+
+**-profile-info-file** *filename*
+
+ Specify the name of the file loaded by the **-profile-loader** option.
+
+
+
+**-stats**
+
+ Print statistics.
+
+
+
+**-time-passes**
+
+ Record the amount of time needed for each pass and print it to standard
+ error.
+
+
+
+**-debug**
+
+ If this is a debug build, this option will enable debug printouts
+ from passes which use the *DEBUG()* macro. See the **LLVM Programmer's
+ Manual**, section *#DEBUG* for more information.
+
+
+
+**-load**\ =\ *plugin*
+
+ Load the dynamic object *plugin*. This object should register new optimization
+ or analysis passes. Once loaded, the object will add new command line options to
+ enable various optimizations or analyses. To see the new complete list of
+ optimizations, use the **-help** and **-load** options together. For example:
+
+
+ .. code-block:: sh
+
+ opt -load=plugin.so -help
+
+
+
+
+**-p**
+
+ Print module after each transformation.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **opt** succeeds, it will exit with 0. Otherwise, if an error
+occurs, it will exit with a non-zero value.
diff --git a/docs/CommandGuide/tblgen.pod b/docs/CommandGuide/tblgen.pod
deleted file mode 100644
index 180bcc1..0000000
--- a/docs/CommandGuide/tblgen.pod
+++ /dev/null
@@ -1,139 +0,0 @@
-
-=pod
-
-=head1 NAME
-
-tblgen - Target Description To C++ Code Generator
-
-=head1 SYNOPSIS
-
-B<tblgen> [I<options>] [I<filename>]
-
-=head1 DESCRIPTION
-
-B<tblgen> translates from target description (.td) files into C++ code that can
-be included in the definition of an LLVM target library. Most users of LLVM will
-not need to use this program. It is only for assisting with writing an LLVM
-target backend.
-
-The input and output of B<tblgen> are beyond the scope of this short
-introduction. Please see the I<CodeGeneration> page in the LLVM documentation.
-
-The F<filename> argument specifies the name of a Target Description (.td) file
-to read as input.
-
-=head1 OPTIONS
-
-=over
-
-=item B<-help>
-
-Print a summary of command line options.
-
-=item B<-o> F<filename>
-
-Specify the output file name. If F<filename> is C<->, then B<tblgen>
-sends its output to standard output.
-
-=item B<-I> F<directory>
-
-Specify where to find other target description files for inclusion. The
-F<directory> value should be a full or partial path to a directory that contains
-target description files.
-
-=item B<-asmparsernum> F<N>
-
-Make -gen-asm-parser emit assembly parser number F<N>.
-
-=item B<-asmwriternum> F<N>
-
-Make -gen-asm-writer emit assembly writer number F<N>.
-
-=item B<-class> F<class Name>
-
-Print the enumeration list for this class.
-
-=item B<-print-records>
-
-Print all records to standard output (default).
-
-=item B<-print-enums>
-
-Print enumeration values for a class.
-
-=item B<-print-sets>
-
-Print expanded sets for testing DAG exprs.
-
-=item B<-gen-emitter>
-
-Generate machine code emitter.
-
-=item B<-gen-register-info>
-
-Generate registers and register classes info.
-
-=item B<-gen-instr-info>
-
-Generate instruction descriptions.
-
-=item B<-gen-asm-writer>
-
-Generate the assembly writer.
-
-=item B<-gen-disassembler>
-
-Generate disassembler.
-
-=item B<-gen-pseudo-lowering>
-
-Generate pseudo instruction lowering.
-
-=item B<-gen-dag-isel>
-
-Generate a DAG (Directed Acyclic Graph) instruction selector.
-
-=item B<-gen-asm-matcher>
-
-Generate assembly instruction matcher.
-
-=item B<-gen-dfa-packetizer>
-
-Generate DFA Packetizer for VLIW targets.
-
-=item B<-gen-fast-isel>
-
-Generate a "fast" instruction selector.
-
-=item B<-gen-subtarget>
-
-Generate subtarget enumerations.
-
-=item B<-gen-intrinsic>
-
-Generate intrinsic information.
-
-=item B<-gen-tgt-intrinsic>
-
-Generate target intrinsic information.
-
-=item B<-gen-enhanced-disassembly-info>
-
-Generate enhanced disassembly info.
-
-=item B<-version>
-
-Show the version number of this program.
-
-=back
-
-=head1 EXIT STATUS
-
-If B<tblgen> succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value.
-
-=head1 AUTHORS
-
-Maintained by The LLVM Team (L<http://llvm.org/>).
-
-=cut
diff --git a/docs/CommandGuide/tblgen.rst b/docs/CommandGuide/tblgen.rst
new file mode 100644
index 0000000..2d19167
--- /dev/null
+++ b/docs/CommandGuide/tblgen.rst
@@ -0,0 +1,186 @@
+tblgen - Target Description To C++ Code Generator
+=================================================
+
+
+SYNOPSIS
+--------
+
+
+**tblgen** [*options*] [*filename*]
+
+
+DESCRIPTION
+-----------
+
+
+**tblgen** translates from target description (.td) files into C++ code that can
+be included in the definition of an LLVM target library. Most users of LLVM will
+not need to use this program. It is only for assisting with writing an LLVM
+target backend.
+
+The input and output of **tblgen** are beyond the scope of this short
+introduction. Please see the *CodeGeneration* page in the LLVM documentation.
+
+The *filename* argument specifies the name of a Target Description (.td) file
+to read as input.
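+
+For example, an illustrative invocation (the include path and the target file
+are placeholders) that generates register information might look like:
+
+
+.. code-block:: sh
+
+    tblgen -gen-register-info -I /path/to/llvm/include X86.td -o X86GenRegisterInfo.inc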
+
+
+OPTIONS
+-------
+
+
+
+**-help**
+
+ Print a summary of command line options.
+
+
+
+**-o** *filename*
+
+ Specify the output file name. If *filename* is ``-``, then **tblgen**
+ sends its output to standard output.
+
+
+
+**-I** *directory*
+
+ Specify where to find other target description files for inclusion. The
+ *directory* value should be a full or partial path to a directory that contains
+ target description files.
+
+
+
+**-asmparsernum** *N*
+
+ Make -gen-asm-parser emit assembly parser number *N*.
+
+
+
+**-asmwriternum** *N*
+
+ Make -gen-asm-writer emit assembly writer number *N*.
+
+
+
+**-class** *class Name*
+
+ Print the enumeration list for this class.
+
+
+
+**-print-records**
+
+ Print all records to standard output (default).
+
+
+
+**-print-enums**
+
+ Print enumeration values for a class.
+
+
+
+**-print-sets**
+
+ Print expanded sets for testing DAG exprs.
+
+
+
+**-gen-emitter**
+
+ Generate machine code emitter.
+
+
+
+**-gen-register-info**
+
+ Generate registers and register classes info.
+
+
+
+**-gen-instr-info**
+
+ Generate instruction descriptions.
+
+
+
+**-gen-asm-writer**
+
+ Generate the assembly writer.
+
+
+
+**-gen-disassembler**
+
+ Generate disassembler.
+
+
+
+**-gen-pseudo-lowering**
+
+ Generate pseudo instruction lowering.
+
+
+
+**-gen-dag-isel**
+
+ Generate a DAG (Directed Acyclic Graph) instruction selector.
+
+
+
+**-gen-asm-matcher**
+
+ Generate assembly instruction matcher.
+
+
+
+**-gen-dfa-packetizer**
+
+ Generate DFA Packetizer for VLIW targets.
+
+
+
+**-gen-fast-isel**
+
+ Generate a "fast" instruction selector.
+
+
+
+**-gen-subtarget**
+
+ Generate subtarget enumerations.
+
+
+
+**-gen-intrinsic**
+
+ Generate intrinsic information.
+
+
+
+**-gen-tgt-intrinsic**
+
+ Generate target intrinsic information.
+
+
+
+**-gen-enhanced-disassembly-info**
+
+ Generate enhanced disassembly info.
+
+
+
+**-version**
+
+ Show the version number of this program.
+
+
+
+
+EXIT STATUS
+-----------
+
+
+If **tblgen** succeeds, it will exit with 0. Otherwise, if an error
+occurs, it will exit with a non-zero value.
diff --git a/docs/CommandLine.html b/docs/CommandLine.html
deleted file mode 100644
index 7535ca4..0000000
--- a/docs/CommandLine.html
+++ /dev/null
@@ -1,1976 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>CommandLine 2.0 Library Manual</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- CommandLine 2.0 Library Manual
-</h1>
-
-<ol>
- <li><a href="#introduction">Introduction</a></li>
-
- <li><a href="#quickstart">Quick Start Guide</a>
- <ol>
- <li><a href="#bool">Boolean Arguments</a></li>
- <li><a href="#alias">Argument Aliases</a></li>
- <li><a href="#onealternative">Selecting an alternative from a
- set of possibilities</a></li>
- <li><a href="#namedalternatives">Named alternatives</a></li>
- <li><a href="#list">Parsing a list of options</a></li>
- <li><a href="#bits">Collecting options as a set of flags</a></li>
- <li><a href="#description">Adding freeform text to help output</a></li>
- </ol></li>
-
- <li><a href="#referenceguide">Reference Guide</a>
- <ol>
- <li><a href="#positional">Positional Arguments</a>
- <ul>
- <li><a href="#--">Specifying positional options with hyphens</a></li>
- <li><a href="#getPosition">Determining absolute position with
- getPosition</a></li>
- <li><a href="#cl::ConsumeAfter">The <tt>cl::ConsumeAfter</tt>
- modifier</a></li>
- </ul></li>
-
- <li><a href="#storage">Internal vs External Storage</a></li>
-
- <li><a href="#attributes">Option Attributes</a></li>
-
- <li><a href="#modifiers">Option Modifiers</a>
- <ul>
- <li><a href="#hiding">Hiding an option from <tt>-help</tt>
- output</a></li>
- <li><a href="#numoccurrences">Controlling the number of occurrences
- required and allowed</a></li>
- <li><a href="#valrequired">Controlling whether or not a value must be
- specified</a></li>
- <li><a href="#formatting">Controlling other formatting options</a></li>
- <li><a href="#misc">Miscellaneous option modifiers</a></li>
- <li><a href="#response">Response files</a></li>
- </ul></li>
-
- <li><a href="#toplevel">Top-Level Classes and Functions</a>
- <ul>
- <li><a href="#cl::ParseCommandLineOptions">The
- <tt>cl::ParseCommandLineOptions</tt> function</a></li>
- <li><a href="#cl::ParseEnvironmentOptions">The
- <tt>cl::ParseEnvironmentOptions</tt> function</a></li>
- <li><a href="#cl::SetVersionPrinter">The <tt>cl::SetVersionPrinter</tt>
- function</a></li>
- <li><a href="#cl::opt">The <tt>cl::opt</tt> class</a></li>
- <li><a href="#cl::list">The <tt>cl::list</tt> class</a></li>
- <li><a href="#cl::bits">The <tt>cl::bits</tt> class</a></li>
- <li><a href="#cl::alias">The <tt>cl::alias</tt> class</a></li>
- <li><a href="#cl::extrahelp">The <tt>cl::extrahelp</tt> class</a></li>
- </ul></li>
-
- <li><a href="#builtinparsers">Builtin parsers</a>
- <ul>
- <li><a href="#genericparser">The Generic <tt>parser&lt;t&gt;</tt>
- parser</a></li>
- <li><a href="#boolparser">The <tt>parser&lt;bool&gt;</tt>
- specialization</a></li>
- <li><a href="#boolOrDefaultparser">The <tt>parser&lt;boolOrDefault&gt;</tt>
- specialization</a></li>
- <li><a href="#stringparser">The <tt>parser&lt;string&gt;</tt>
- specialization</a></li>
- <li><a href="#intparser">The <tt>parser&lt;int&gt;</tt>
- specialization</a></li>
- <li><a href="#doubleparser">The <tt>parser&lt;double&gt;</tt> and
- <tt>parser&lt;float&gt;</tt> specializations</a></li>
- </ul></li>
- </ol></li>
- <li><a href="#extensionguide">Extension Guide</a>
- <ol>
- <li><a href="#customparser">Writing a custom parser</a></li>
- <li><a href="#explotingexternal">Exploiting external storage</a></li>
- <li><a href="#dynamicopts">Dynamically adding command line
- options</a></li>
- </ol></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="introduction">Introduction</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document describes the CommandLine argument processing library. It will
-show you how to use it, and what it can do. The CommandLine library uses a
-declarative approach to specifying the command line options that your program
-takes. By default, these options declarations implicitly hold the value parsed
-for the option declared (of course this <a href="#storage">can be
-changed</a>).</p>
-
-<p>Although there are a <b>lot</b> of command line argument parsing libraries
-out there in many different languages, none of them fit well with what I needed.
-By looking at the features and problems of other libraries, I designed the
-CommandLine library to have the following features:</p>
-
-<ol>
-<li>Speed: The CommandLine library is very quick and uses little resources. The
-parsing time of the library is directly proportional to the number of arguments
-parsed, not the number of options recognized. Additionally, command line
-argument values are captured transparently into user defined global variables,
-which can be accessed like any other variable (and with the same
-performance).</li>
-
-<li>Type Safe: As a user of CommandLine, you don't have to worry about
-remembering the type of arguments that you want (is it an int? a string? a
-bool? an enum?) and keep casting it around. Not only does this help prevent
-error-prone constructs, it also leads to dramatically cleaner source code.</li>
-
-<li>No subclasses required: To use CommandLine, you instantiate variables that
-correspond to the arguments that you would like to capture, you don't subclass a
-parser. This means that you don't have to write <b>any</b> boilerplate
-code.</li>
-
-<li>Globally accessible: Libraries can specify command line arguments that are
-automatically enabled in any tool that links to the library. This is possible
-because the application doesn't have to keep a list of arguments to pass to
-the parser. This also makes supporting <a href="#dynamicopts">dynamically
-loaded options</a> trivial.</li>
-
-<li>Cleaner: CommandLine supports enum and other types directly, meaning that
-there is less error and more security built into the library. You don't have to
-worry about whether your integral command line argument accidentally got
-assigned a value that is not valid for your enum type.</li>
-
-<li>Powerful: The CommandLine library supports many different types of
-arguments, from simple <a href="#boolparser">boolean flags</a> to <a
-href="#cl::opt">scalars arguments</a> (<a href="#stringparser">strings</a>, <a
-href="#intparser">integers</a>, <a href="#genericparser">enums</a>, <a
-href="#doubleparser">doubles</a>), to <a href="#cl::list">lists of
-arguments</a>. This is possible because CommandLine is...</li>
-
-<li>Extensible: It is very simple to add a new argument type to CommandLine.
-Simply specify the parser that you want to use with the command line option when
-you declare it. <a href="#customparser">Custom parsers</a> are no problem.</li>
-
-<li>Labor Saving: The CommandLine library cuts down on the amount of grunt work
-that you, the user, have to do. For example, it automatically provides a
-<tt>-help</tt> option that shows the available command line options for your
-tool. Additionally, it does most of the basic correctness checking for
-you.</li>
-
-<li>Capable: The CommandLine library can handle lots of different forms of
-options often found in real programs. For example, <a
-href="#positional">positional</a> arguments, <tt>ls</tt> style <a
-href="#cl::Grouping">grouping</a> options (to allow processing '<tt>ls
--lad</tt>' naturally), <tt>ld</tt> style <a href="#cl::Prefix">prefix</a>
-options (to parse '<tt>-lmalloc -L/usr/lib</tt>'), and <a
-href="#cl::ConsumeAfter">interpreter style options</a>.</li>
-
-</ol>
-
-<p>This document will hopefully let you jump in and start using CommandLine in
-your utility quickly and painlessly. Additionally it should be a simple
-reference manual to figure out how stuff works. If it is failing in some area
-(or you want an extension to the library), nag the author, <a
-href="mailto:sabre@nondot.org">Chris Lattner</a>.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="quickstart">Quick Start Guide</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This section of the manual runs through a simple CommandLine'ification of a
-basic compiler tool. This is intended to show you how to jump into using the
-CommandLine library in your own program, and show you some of the cool things it
-can do.</p>
-
-<p>To start out, you need to include the CommandLine header file into your
-program:</p>
-
-<div class="doc_code"><pre>
- #include "llvm/Support/CommandLine.h"
-</pre></div>
-
-<p>Additionally, you need to add this as the first line of your main
-program:</p>
-
-<div class="doc_code"><pre>
-int main(int argc, char **argv) {
- <a href="#cl::ParseCommandLineOptions">cl::ParseCommandLineOptions</a>(argc, argv);
- ...
-}
-</pre></div>
-
-<p>... which actually parses the arguments and fills in the variable
-declarations.</p>
-
-<p>Now that you are ready to support command line arguments, we need to tell the
-system which ones we want, and what type of arguments they are. The CommandLine
-library uses a declarative syntax to model command line arguments with the
-global variable declarations that capture the parsed values. This means that
-for every command line option that you would like to support, there should be a
-global variable declaration to capture the result. For example, in a compiler,
-we would like to support the Unix-standard '<tt>-o &lt;filename&gt;</tt>' option
-to specify where to put the output. With the CommandLine library, this is
-represented like this:</p>
-
-<a name="value_desc_example"></a>
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;string&gt; OutputFilename("<i>o</i>", <a href="#cl::desc">cl::desc</a>("<i>Specify output filename</i>"), <a href="#cl::value_desc">cl::value_desc</a>("<i>filename</i>"));
-</pre></div>
-
-<p>This declares a global variable &quot;<tt>OutputFilename</tt>&quot; that is used to
-capture the result of the &quot;<tt>o</tt>&quot; argument (first parameter). We specify
-that this is a simple scalar option by using the &quot;<tt><a
-href="#cl::opt">cl::opt</a></tt>&quot; template (as opposed to the <a
-href="#list">&quot;<tt>cl::list</tt> template</a>), and tell the CommandLine library
-that the data type that we are parsing is a string.</p>
-
-<p>The second and third parameters (which are optional) are used to specify what
-to output for the "<tt>-help</tt>" option. In this case, we get a line that
-looks like this:</p>
-
-<div class="doc_code"><pre>
-USAGE: compiler [options]
-
-OPTIONS:
- -help - display available options (-help-hidden for more)
- <b>-o &lt;filename&gt; - Specify output filename</b>
-</pre></div>
-
-<p>Because we specified that the command line option should parse using the
-<tt>string</tt> data type, the variable declared is automatically usable as a
-real string in all contexts that a normal C++ string object may be used. For
-example:</p>
-
-<div class="doc_code"><pre>
- ...
- std::ofstream Output(OutputFilename.c_str());
- if (Output.good()) ...
- ...
-</pre></div>
-
-<p>There are many different options that you can use to customize the command
-line option handling library, but the above example shows the general interface
-to these options. The options can be specified in any order, and are specified
-with helper functions like <a href="#cl::desc"><tt>cl::desc(...)</tt></a>, so
-there are no positional dependencies to remember. The available options are
-discussed in detail in the <a href="#referenceguide">Reference Guide</a>.</p>
-
-<p>Continuing the example, we would like to have our compiler take an input
-filename as well as an output filename, but we do not want the input filename to
-be specified with a hyphen (i.e., not <tt>-filename.c</tt>). To support this
-style of argument, the CommandLine library allows for <a
-href="#positional">positional</a> arguments to be specified for the program.
-These positional arguments are filled with command line parameters that are not
-in option form. We use this feature like this:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;string&gt; InputFilename(<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;input file&gt;</i>"), <a href="#cl::init">cl::init</a>("<i>-</i>"));
-</pre></div>
-
-<p>This declaration indicates that the first positional argument should be
-treated as the input filename. Here we use the <tt><a
-href="#cl::init">cl::init</a></tt> option to specify an initial value for the
-command line option, which is used if the option is not specified (if you do not
-specify a <tt><a href="#cl::init">cl::init</a></tt> modifier for an option, then
-the default constructor for the data type is used to initialize the value).
-Command line options default to being optional, so if we would like to require
-that the user always specify an input filename, we would add the <tt><a
-href="#cl::Required">cl::Required</a></tt> flag, and we could eliminate the
-<tt><a href="#cl::init">cl::init</a></tt> modifier, like this:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;string&gt; InputFilename(<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;input file&gt;</i>"), <b><a href="#cl::Required">cl::Required</a></b>);
-</pre></div>
-
-<p>Again, the CommandLine library does not require the options to be specified
-in any particular order, so the above declaration is equivalent to:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;string&gt; InputFilename(<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::Required">cl::Required</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;input file&gt;</i>"));
-</pre></div>
-
-<p>By simply adding the <tt><a href="#cl::Required">cl::Required</a></tt> flag,
-the CommandLine library will automatically issue an error if the argument is not
-specified, which shifts all of the command line option verification code out of
-your application into the library. This is just one example of how using flags
-can alter the default behaviour of the library, on a per-option basis. By
-adding one of the declarations above, the <tt>-help</tt> option synopsis is now
-extended to:</p>
-
-<div class="doc_code"><pre>
-USAGE: compiler [options] <b>&lt;input file&gt;</b>
-
-OPTIONS:
- -help - display available options (-help-hidden for more)
- -o &lt;filename&gt; - Specify output filename
-</pre></div>
-
-<p>... indicating that an input filename is expected.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="bool">Boolean Arguments</a>
-</h3>
-
-<div>
-
-<p>In addition to input and output filenames, we would like the compiler example
-to support three boolean flags: "<tt>-f</tt>" to force writing binary output to
-a terminal, "<tt>--quiet</tt>" to enable quiet mode, and "<tt>-q</tt>" for
-backwards compatibility with some of our users. We can support these by
-declaring options of boolean type like this:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;bool&gt; Force ("<i>f</i>", <a href="#cl::desc">cl::desc</a>("<i>Enable binary output on terminals</i>"));
-<a href="#cl::opt">cl::opt</a>&lt;bool&gt; Quiet ("<i>quiet</i>", <a href="#cl::desc">cl::desc</a>("<i>Don't print informational messages</i>"));
-<a href="#cl::opt">cl::opt</a>&lt;bool&gt; Quiet2("<i>q</i>", <a href="#cl::desc">cl::desc</a>("<i>Don't print informational messages</i>"), <a href="#cl::Hidden">cl::Hidden</a>);
-</pre></div>
-
-<p>This does what you would expect: it declares three boolean variables
-("<tt>Force</tt>", "<tt>Quiet</tt>", and "<tt>Quiet2</tt>") to recognize these
-options. Note that the "<tt>-q</tt>" option is specified with the "<a
-href="#cl::Hidden"><tt>cl::Hidden</tt></a>" flag. This modifier prevents it
-from being shown by the standard "<tt>-help</tt>" output (note that it is still
-shown in the "<tt>-help-hidden</tt>" output).</p>
-
-<p>The CommandLine library uses a <a href="#builtinparsers">different parser</a>
-for different data types. For example, in the string case, the argument passed
-to the option is copied literally into the content of the string variable... we
-obviously cannot do that in the boolean case, however, so we must use a smarter
-parser. In the case of the boolean parser, it allows no value (in which case
-it assigns the value of true to the variable), or it allows the values
-"<tt>true</tt>" or "<tt>false</tt>" to be specified, allowing any of the
-following inputs:</p>
-
-<div class="doc_code"><pre>
- compiler -f # No value, 'Force' == true
- compiler -f=true # Value specified, 'Force' == true
- compiler -f=TRUE # Value specified, 'Force' == true
- compiler -f=FALSE # Value specified, 'Force' == false
-</pre></div>
-
-<p>... you get the idea. The <a href="#boolparser">bool parser</a> just turns
-the string values into boolean values, and rejects things like '<tt>compiler
--f=foo</tt>'. Similarly, the <a href="#doubleparser">float</a>, <a
-href="#doubleparser">double</a>, and <a href="#intparser">int</a> parsers work
-like you would expect, using the '<tt>strtol</tt>' and '<tt>strtod</tt>' C
-library calls to parse the string value into the specified data type.</p>
-
-<p>With the declarations above, "<tt>compiler -help</tt>" emits this:</p>
-
-<div class="doc_code"><pre>
-USAGE: compiler [options] &lt;input file&gt;
-
-OPTIONS:
- <b>-f - Enable binary output on terminals</b>
- -o - Override output filename
- <b>-quiet - Don't print informational messages</b>
- -help - display available options (-help-hidden for more)
-</pre></div>
-
-<p>and "<tt>compiler -help-hidden</tt>" prints this:</p>
-
-<div class="doc_code"><pre>
-USAGE: compiler [options] &lt;input file&gt;
-
-OPTIONS:
- -f - Enable binary output on terminals
- -o - Override output filename
- <b>-q - Don't print informational messages</b>
- -quiet - Don't print informational messages
- -help - display available options (-help-hidden for more)
-</pre></div>
-
-<p>This brief example has shown you how to use the '<tt><a
-href="#cl::opt">cl::opt</a></tt>' class to parse simple scalar command line
-arguments. In addition to simple scalar arguments, the CommandLine library also
-provides primitives to support CommandLine option <a href="#alias">aliases</a>,
-and <a href="#list">lists</a> of options.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="alias">Argument Aliases</a>
-</h3>
-
-<div>
-
-<p>So far, the example works well, except for the fact that we need to check the
-quiet condition like this now:</p>
-
-<div class="doc_code"><pre>
-...
- if (!Quiet &amp;&amp; !Quiet2) printInformationalMessage(...);
-...
-</pre></div>
-
-<p>... which is a real pain! Instead of defining two values for the same
-condition, we can use the "<tt><a href="#cl::alias">cl::alias</a></tt>" class to make the "<tt>-q</tt>"
-option an <b>alias</b> for the "<tt>-quiet</tt>" option, instead of providing
-a value itself:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;bool&gt; Force ("<i>f</i>", <a href="#cl::desc">cl::desc</a>("<i>Overwrite output files</i>"));
-<a href="#cl::opt">cl::opt</a>&lt;bool&gt; Quiet ("<i>quiet</i>", <a href="#cl::desc">cl::desc</a>("<i>Don't print informational messages</i>"));
-<a href="#cl::alias">cl::alias</a> QuietA("<i>q</i>", <a href="#cl::desc">cl::desc</a>("<i>Alias for -quiet</i>"), <a href="#cl::aliasopt">cl::aliasopt</a>(Quiet));
-</pre></div>
-
-<p>The third line (which is the only one we modified from above) defines a
-"<tt>-q</tt>" alias that updates the "<tt>Quiet</tt>" variable (as specified by
-the <tt><a href="#cl::aliasopt">cl::aliasopt</a></tt> modifier) whenever it is
-specified. Because aliases do not hold state, the only thing the program has to
-query is the <tt>Quiet</tt> variable now. Another nice feature of aliases is
-that they automatically hide themselves from the <tt>-help</tt> output
-(although, again, they are still visible in the <tt>-help-hidden</tt>
-output).</p>
-
-<p>Now the application code can simply use:</p>
-
-<div class="doc_code"><pre>
-...
- if (!Quiet) printInformationalMessage(...);
-...
-</pre></div>
-
-<p>... which is much nicer! The "<tt><a href="#cl::alias">cl::alias</a></tt>"
-can be used to specify an alternative name for any variable type, and has many
-uses.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="onealternative">Selecting an alternative from a set of
- possibilities</a>
-</h3>
-
-<div>
-
-<p>So far we have seen how the CommandLine library handles builtin types like
-<tt>std::string</tt>, <tt>bool</tt> and <tt>int</tt>, but how does it handle
-things it doesn't know about, like enums or '<tt>int*</tt>'s?</p>
-
-<p>The answer is that it uses a table-driven generic parser (unless you specify
-your own parser, as described in the <a href="#extensionguide">Extension
-Guide</a>). This parser maps literal strings to whatever type is required, and
-requires you to tell it what this mapping should be.</p>
-
-<p>Let's say that we would like to add four optimization levels to our
-optimizer, using the standard flags "<tt>-g</tt>", "<tt>-O1</tt>",
-"<tt>-O2</tt>", and "<tt>-O3</tt>". We could easily implement this with boolean
-options like above, but there are several problems with this strategy:</p>
-
-<ol>
-<li>A user could specify more than one of the options at a time, for example,
-"<tt>compiler -O3 -O2</tt>". The CommandLine library would not be able to
-catch this erroneous input for us.</li>
-
-<li>We would have to test 4 different variables to see which ones are set.</li>
-
-<li>This doesn't map to the numeric levels that we want... so we cannot easily
-see if some level &gt;= "<tt>-O1</tt>" is enabled.</li>
-
-</ol>
-
-<p>To cope with these problems, we can use an enum value, and have the
-CommandLine library fill it in with the appropriate level directly, which is
-used like this:</p>
-
-<div class="doc_code"><pre>
-enum OptLevel {
- g, O1, O2, O3
-};
-
-<a href="#cl::opt">cl::opt</a>&lt;OptLevel&gt; OptimizationLevel(<a href="#cl::desc">cl::desc</a>("<i>Choose optimization level:</i>"),
- <a href="#cl::values">cl::values</a>(
- clEnumVal(g , "<i>No optimizations, enable debugging</i>"),
- clEnumVal(O1, "<i>Enable trivial optimizations</i>"),
- clEnumVal(O2, "<i>Enable default optimizations</i>"),
- clEnumVal(O3, "<i>Enable expensive optimizations</i>"),
- clEnumValEnd));
-
-...
- if (OptimizationLevel &gt;= O2) doPartialRedundancyElimination(...);
-...
-</pre></div>
-
-<p>This declaration defines a variable "<tt>OptimizationLevel</tt>" of the
-"<tt>OptLevel</tt>" enum type. This variable can be assigned any of the values
-that are listed in the declaration (Note that the declaration list must be
-terminated with the "<tt>clEnumValEnd</tt>" argument!). The CommandLine
-library enforces that the user can only specify one of the options, and it
-ensures that only valid enum values can be specified. The "<tt>clEnumVal</tt>"
-macros ensure that the command line arguments match the enum values. With this
-option added, our
-help output now is:</p>
-
-<div class="doc_code"><pre>
-USAGE: compiler [options] &lt;input file&gt;
-
-OPTIONS:
- <b>Choose optimization level:
- -g - No optimizations, enable debugging
- -O1 - Enable trivial optimizations
- -O2 - Enable default optimizations
- -O3 - Enable expensive optimizations</b>
- -f - Enable binary output on terminals
- -help - display available options (-help-hidden for more)
- -o &lt;filename&gt; - Specify output filename
- -quiet - Don't print informational messages
-</pre></div>
-
-<p>In this case, it is sort of awkward that flag names correspond directly to
-enum names, because we probably don't want an enum definition named "<tt>g</tt>"
-in our program. Because of this, we can alternatively write this example like
-this:</p>
-
-<div class="doc_code"><pre>
-enum OptLevel {
- Debug, O1, O2, O3
-};
-
-<a href="#cl::opt">cl::opt</a>&lt;OptLevel&gt; OptimizationLevel(<a href="#cl::desc">cl::desc</a>("<i>Choose optimization level:</i>"),
- <a href="#cl::values">cl::values</a>(
- clEnumValN(Debug, "g", "<i>No optimizations, enable debugging</i>"),
- clEnumVal(O1 , "<i>Enable trivial optimizations</i>"),
- clEnumVal(O2 , "<i>Enable default optimizations</i>"),
- clEnumVal(O3 , "<i>Enable expensive optimizations</i>"),
- clEnumValEnd));
-
-...
- if (OptimizationLevel == Debug) outputDebugInfo(...);
-...
-</pre></div>
-
-<p>By using the "<tt>clEnumValN</tt>" macro instead of "<tt>clEnumVal</tt>", we
-can directly specify the name that the flag should get. In general a direct
-mapping is nice, but sometimes you can't or don't want to preserve the mapping,
-which is when you would use it.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="namedalternatives">Named Alternatives</a>
-</h3>
-
-<div>
-
-<p>Another useful argument form is a named alternative style. We shall use this
-style in our compiler to specify different debug levels that can be used.
-Instead of each debug level being its own switch, we want to support the
-following options, of which only one can be specified at a time:
-"<tt>--debug-level=none</tt>", "<tt>--debug-level=quick</tt>",
-"<tt>--debug-level=detailed</tt>". To do this, we use the exact same format as
-our optimization level flags, but we also specify an option name. For this
-case, the code looks like this:</p>
-
-<div class="doc_code"><pre>
-enum DebugLev {
- nodebuginfo, quick, detailed
-};
-
-// Enable Debug Options to be specified on the command line
-<a href="#cl::opt">cl::opt</a>&lt;DebugLev&gt; DebugLevel("<i>debug_level</i>", <a href="#cl::desc">cl::desc</a>("<i>Set the debugging level:</i>"),
- <a href="#cl::values">cl::values</a>(
- clEnumValN(nodebuginfo, "none", "<i>disable debug information</i>"),
- clEnumVal(quick, "<i>enable quick debug information</i>"),
- clEnumVal(detailed, "<i>enable detailed debug information</i>"),
- clEnumValEnd));
-</pre></div>
-
-<p>This definition defines an enumerated command line variable of type "<tt>enum
-DebugLev</tt>", which works exactly the same way as before. The difference here
-is just the interface exposed to the user of your program and the help output by
-the "<tt>-help</tt>" option:</p>
-
-<div class="doc_code"><pre>
-USAGE: compiler [options] &lt;input file&gt;
-
-OPTIONS:
- Choose optimization level:
- -g - No optimizations, enable debugging
- -O1 - Enable trivial optimizations
- -O2 - Enable default optimizations
- -O3 - Enable expensive optimizations
- <b>-debug_level - Set the debugging level:
- =none - disable debug information
- =quick - enable quick debug information
- =detailed - enable detailed debug information</b>
- -f - Enable binary output on terminals
- -help - display available options (-help-hidden for more)
- -o &lt;filename&gt; - Specify output filename
- -quiet - Don't print informational messages
-</pre></div>
-
-<p>Again, the only structural difference between the debug level declaration and
-the optimization level declaration is that the debug level declaration includes
-an option name (<tt>"debug_level"</tt>), which automatically changes how the
-library processes the argument. The CommandLine library supports both forms so
-that you can choose the form most appropriate for your application.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="list">Parsing a list of options</a>
-</h3>
-
-<div>
-
-<p>Now that we have the standard run-of-the-mill argument types out of the way,
-let's get a little wild and crazy. Let's say that we want our optimizer to accept
-a <b>list</b> of optimizations to perform, allowing duplicates. For example, we
-might want to run: "<tt>compiler -dce -constprop -inline -dce -strip</tt>". In
-this case, the order of the arguments and the number of appearances are very
-important. This is what the "<tt><a href="#cl::list">cl::list</a></tt>"
-template is for. First, start by defining an enum of the optimizations that you
-would like to perform:</p>
-
-<div class="doc_code"><pre>
-enum Opts {
- // 'inline' is a C++ keyword, so name it 'inlining'
- dce, constprop, inlining, strip
-};
-</pre></div>
-
-<p>Then define your "<tt><a href="#cl::list">cl::list</a></tt>" variable:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::list">cl::list</a>&lt;Opts&gt; OptimizationList(<a href="#cl::desc">cl::desc</a>("<i>Available Optimizations:</i>"),
- <a href="#cl::values">cl::values</a>(
- clEnumVal(dce , "<i>Dead Code Elimination</i>"),
- clEnumVal(constprop , "<i>Constant Propagation</i>"),
- clEnumValN(inlining, "<i>inline</i>", "<i>Procedure Integration</i>"),
- clEnumVal(strip , "<i>Strip Symbols</i>"),
- clEnumValEnd));
-</pre></div>
-
-<p>This defines a variable that is conceptually of the type
-"<tt>std::vector&lt;enum Opts&gt;</tt>". Thus, you can access it with standard
-vector methods:</p>
-
-<div class="doc_code"><pre>
- for (unsigned i = 0; i != OptimizationList.size(); ++i)
- switch (OptimizationList[i])
- ...
-</pre></div>
-
-<p>... to iterate through the list of options specified.</p>
-
-<p>Note that the "<tt><a href="#cl::list">cl::list</a></tt>" template is
-completely general and may be used with any data types or other arguments that
-you can use with the "<tt><a href="#cl::opt">cl::opt</a></tt>" template. One
-especially useful way to use a list is to capture all of the positional
-arguments together if there may be more than one specified. In the case of a
-linker, for example, the linker takes several '<tt>.o</tt>' files, and needs to
-capture them into a list. This is naturally specified as:</p>
-
-<div class="doc_code"><pre>
-...
-<a href="#cl::list">cl::list</a>&lt;std::string&gt; InputFilenames(<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("&lt;Input files&gt;"), <a href="#cl::OneOrMore">cl::OneOrMore</a>);
-...
-</pre></div>
-
-<p>This variable works just like a "<tt>vector&lt;string&gt;</tt>" object. As
-such, accessing the list is simple, just like above. In this example, we used
-the <tt><a href="#cl::OneOrMore">cl::OneOrMore</a></tt> modifier to inform the
-CommandLine library that it is an error if the user does not specify any
-<tt>.o</tt> files on our command line. Again, this just reduces the amount of
-checking we have to do.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="bits">Collecting options as a set of flags</a>
-</h3>
-
-<div>
-
-<p>Instead of collecting sets of options in a list, it is also possible to
-gather information for enum values in a <b>bit vector</b>. The representation used by
-the <a href="#bits"><tt>cl::bits</tt></a> class is an <tt>unsigned</tt>
-integer. An enum value is represented by a 0/1 in the enum's ordinal value bit
-position, with 1 indicating that the enum was specified and 0 otherwise. As each
-specified value is parsed, the resulting enum's bit is set in the option's bit
-vector:</p>
-
-<div class="doc_code"><pre>
- <i>bits</i> |= 1 << (unsigned)<i>enum</i>;
-</pre></div>
-
-<p>Options that are specified multiple times are redundant. Any instances after
-the first are discarded.</p>
-
-<p>Reworking the above list example, we could replace <a href="#list">
-<tt>cl::list</tt></a> with <a href="#bits"><tt>cl::bits</tt></a>:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::bits">cl::bits</a>&lt;Opts&gt; OptimizationBits(<a href="#cl::desc">cl::desc</a>("<i>Available Optimizations:</i>"),
- <a href="#cl::values">cl::values</a>(
- clEnumVal(dce , "<i>Dead Code Elimination</i>"),
- clEnumVal(constprop , "<i>Constant Propagation</i>"),
- clEnumValN(inlining, "<i>inline</i>", "<i>Procedure Integration</i>"),
- clEnumVal(strip , "<i>Strip Symbols</i>"),
- clEnumValEnd));
-</pre></div>
-
-<p>To test to see if <tt>constprop</tt> was specified, we can use the
-<tt>cl::bits::isSet</tt> function:</p>
-
-<div class="doc_code"><pre>
- if (OptimizationBits.isSet(constprop)) {
- ...
- }
-</pre></div>
-
-<p>It's also possible to get the raw bit vector using the
-<tt>cl::bits::getBits</tt> function:</p>
-
-<div class="doc_code"><pre>
- unsigned bits = OptimizationBits.getBits();
-</pre></div>
-
-<p>Finally, if external storage is used, then the location specified must be of
-<b>type</b> <tt>unsigned</tt>. In all other ways a <a
-href="#bits"><tt>cl::bits</tt></a> option is equivalent to a <a
-href="#list"> <tt>cl::list</tt></a> option.</p>
-
-</div>
-
-
-<!-- ======================================================================= -->
-<h3>
- <a name="description">Adding freeform text to help output</a>
-</h3>
-
-<div>
-
-<p>As our program grows and becomes more mature, we may decide to put summary
-information about what it does into the help output. The help output is styled
-to look similar to a Unix <tt>man</tt> page, providing concise information about
-a program. Unix <tt>man</tt> pages, however, often have a description of what
-the program does. To add this to your CommandLine program, simply pass a third
-argument to the <a
-href="#cl::ParseCommandLineOptions"><tt>cl::ParseCommandLineOptions</tt></a>
-call in main. This additional argument is then printed as the overview
-information for your program, allowing you to include any additional information
-that you want. For example:</p>
-
-<div class="doc_code"><pre>
-int main(int argc, char **argv) {
- <a href="#cl::ParseCommandLineOptions">cl::ParseCommandLineOptions</a>(argc, argv, " CommandLine compiler example\n\n"
- " This program blah blah blah...\n");
- ...
-}
-</pre></div>
-
-<p>would yield the help output:</p>
-
-<div class="doc_code"><pre>
-<b>OVERVIEW: CommandLine compiler example
-
- This program blah blah blah...</b>
-
-USAGE: compiler [options] &lt;input file&gt;
-
-OPTIONS:
- ...
- -help - display available options (-help-hidden for more)
- -o &lt;filename&gt; - Specify output filename
-</pre></div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="referenceguide">Reference Guide</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Now that you know the basics of how to use the CommandLine library, this
-section will give you the detailed information you need to tune how command line
-options work, as well as information on more "advanced" command line option
-processing capabilities.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="positional">Positional Arguments</a>
-</h3>
-
-<div>
-
-<p>Positional arguments are those arguments that are not named, and are not
-specified with a hyphen. Positional arguments should be used when an option is
-specified by its position alone. For example, the standard Unix <tt>grep</tt>
-tool takes a regular expression argument, and an optional filename to search
-through (which defaults to standard input if a filename is not specified).
-Using the CommandLine library, this would be specified as:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;string&gt; Regex (<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;regular expression&gt;</i>"), <a href="#cl::Required">cl::Required</a>);
-<a href="#cl::opt">cl::opt</a>&lt;string&gt; Filename(<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;input file&gt;</i>"), <a href="#cl::init">cl::init</a>("<i>-</i>"));
-</pre></div>
-
-<p>Given these two option declarations, the <tt>-help</tt> output for our grep
-replacement would look like this:</p>
-
-<div class="doc_code"><pre>
-USAGE: spiffygrep [options] <b>&lt;regular expression&gt; &lt;input file&gt;</b>
-
-OPTIONS:
- -help - display available options (-help-hidden for more)
-</pre></div>
-
-<p>... and the resultant program could be used just like the standard
-<tt>grep</tt> tool.</p>
-
-<p>Positional arguments are sorted by their order of construction. This means
-that command line options will be ordered according to how they are listed in a
-.cpp file, but will not have an ordering defined if the positional arguments
-are defined in multiple .cpp files. The fix for this problem is simply to
-define all of your positional arguments in one .cpp file.</p>
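-
-<p>For example, a minimal sketch (the declarations here simply restate the
-<tt>spiffygrep</tt> example above) that keeps both positional arguments in one
-file, fixing their relative order:</p>
-
-<div class="doc_code"><pre>
-<i>// Tool.cpp - declare all positional arguments together so their order is fixed.</i>
-static <a href="#cl::opt">cl::opt</a>&lt;string&gt; Regex (<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;regular expression&gt;</i>"));
-static <a href="#cl::opt">cl::opt</a>&lt;string&gt; Filename(<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;input file&gt;</i>"));
-</pre></div>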
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="--">Specifying positional options with hyphens</a>
-</h4>
-
-<div>
-
-<p>Sometimes you may want to specify a value to your positional argument that
-starts with a hyphen (for example, searching for '<tt>-foo</tt>' in a file). At
-first, you will have trouble doing this, because it will try to find an argument
-named '<tt>-foo</tt>', and will fail (and single quotes will not save you).
-Note that the system <tt>grep</tt> has the same problem:</p>
-
-<div class="doc_code"><pre>
- $ spiffygrep '-foo' test.txt
-  Unknown command line argument '-foo'. Try: 'spiffygrep -help'
-
- $ grep '-foo' test.txt
- grep: illegal option -- f
- grep: illegal option -- o
- grep: illegal option -- o
- Usage: grep -hblcnsviw pattern file . . .
-</pre></div>
-
-<p>The solution for this problem is the same for both your tool and the system
-version: use the '<tt>--</tt>' marker. When the user specifies '<tt>--</tt>' on
-the command line, it is telling the program that all options after the
-'<tt>--</tt>' should be treated as positional arguments, not options. Thus, we
-can use it like this:</p>
-
-<div class="doc_code"><pre>
- $ spiffygrep -- -foo test.txt
- ...output...
-</pre></div>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="getPosition">Determining absolute position with getPosition()</a>
-</h4>
-<div>
- <p>Sometimes an option can affect or modify the meaning of another option. For
- example, consider <tt>gcc</tt>'s <tt>-x LANG</tt> option. This tells
- <tt>gcc</tt> to ignore the suffix of subsequent positional arguments and force
- the file to be interpreted as if it contained source code in language
- <tt>LANG</tt>. In order to handle this properly, you need to know the
- absolute position of each argument, especially those in lists, so their
- interaction(s) can be applied correctly. This is also useful for options like
- <tt>-llibname</tt> which is actually a positional argument that starts with
- a dash.</p>
- <p>So, generally, the problem is that you have two <tt>cl::list</tt> variables
- that interact in some way. To ensure the correct interaction, you can use the
- <tt>cl::list::getPosition(optnum)</tt> method. This method returns the
- absolute position (as found on the command line) of the <tt>optnum</tt>
- item in the <tt>cl::list</tt>.</p>
- <p>The idiom for usage is like this:</p>
-
- <div class="doc_code"><pre>
- static cl::list&lt;std::string&gt; Files(cl::Positional, cl::OneOrMore);
- static cl::list&lt;std::string&gt; Libraries("l", cl::ZeroOrMore);
-
- int main(int argc, char**argv) {
- // ...
- std::vector&lt;std::string&gt;::iterator fileIt = Files.begin();
- std::vector&lt;std::string&gt;::iterator libIt = Libraries.begin();
- unsigned libPos = 0, filePos = 0;
- while ( 1 ) {
- if ( libIt != Libraries.end() )
- libPos = Libraries.getPosition( libIt - Libraries.begin() );
- else
- libPos = 0;
- if ( fileIt != Files.end() )
- filePos = Files.getPosition( fileIt - Files.begin() );
- else
- filePos = 0;
-
- if ( filePos != 0 &amp;&amp; (libPos == 0 || filePos &lt; libPos) ) {
- // Source File Is next
- ++fileIt;
- }
- else if ( libPos != 0 &amp;&amp; (filePos == 0 || libPos &lt; filePos) ) {
- // Library is next
- ++libIt;
- }
- else
- break; // we're done with the list
- }
- }</pre></div>
-
- <p>Note that, for compatibility reasons, the <tt>cl::opt</tt> also supports an
- <tt>unsigned getPosition()</tt> option that will provide the absolute position
- of that option. You can apply the same approach as above with a
- <tt>cl::opt</tt> and a <tt>cl::list</tt> option as you can with two lists.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::ConsumeAfter">The <tt>cl::ConsumeAfter</tt> modifier</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::ConsumeAfter</tt> <a href="#formatting">formatting option</a> is
-used to construct programs that use "interpreter style" option processing. With
-this style of option processing, all arguments specified after the last
-positional argument are treated as special interpreter arguments that are not
-interpreted by the command line argument processor.
-
-<p>As a concrete example, let's say we are developing a replacement for the
-standard Unix Bourne shell (<tt>/bin/sh</tt>). To run <tt>/bin/sh</tt>, first
-you specify options to the shell itself (like <tt>-x</tt> which turns on trace
-output), then you specify the name of the script to run, then you specify
-arguments to the script. These arguments to the script are parsed by the Bourne
-shell command line option processor, but are not interpreted as options to the
-shell itself. Using the CommandLine library, we would specify this as:</p>
-
-<div class="doc_code"><pre>
-<a href="#cl::opt">cl::opt</a>&lt;string&gt; Script(<a href="#cl::Positional">cl::Positional</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;input script&gt;</i>"), <a href="#cl::init">cl::init</a>("-"));
-<a href="#cl::list">cl::list</a>&lt;string&gt; Argv(<a href="#cl::ConsumeAfter">cl::ConsumeAfter</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;program arguments&gt;...</i>"));
-<a href="#cl::opt">cl::opt</a>&lt;bool&gt; Trace("<i>x</i>", <a href="#cl::desc">cl::desc</a>("<i>Enable trace output</i>"));
-</pre></div>
-
-<p>which automatically provides the help output:</p>
-
-<div class="doc_code"><pre>
-USAGE: spiffysh [options] <b>&lt;input script&gt; &lt;program arguments&gt;...</b>
-
-OPTIONS:
- -help - display available options (-help-hidden for more)
- <b>-x - Enable trace output</b>
-</pre></div>
-
-<p>At runtime, if we run our new shell replacement as `<tt>spiffysh -x test.sh
--a -x -y bar</tt>', the <tt>Trace</tt> variable will be set to true, the
-<tt>Script</tt> variable will be set to "<tt>test.sh</tt>", and the
-<tt>Argv</tt> list will contain <tt>["-a", "-x", "-y", "bar"]</tt>, because they
-were specified after the last positional argument (which is the script
-name).</p>
-
-<p>There are several limitations to when <tt>cl::ConsumeAfter</tt> options can
-be specified. For example, only one <tt>cl::ConsumeAfter</tt> can be specified
-per program, there must be at least one <a href="#positional">positional
-argument</a> specified, there must not be any <a href="#cl::list">cl::list</a>
-positional arguments, and the <tt>cl::ConsumeAfter</tt> option should be a <a
-href="#cl::list">cl::list</a> option.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="storage">Internal vs External Storage</a>
-</h3>
-
-<div>
-
-<p>By default, all command line options automatically hold the value that they
-parse from the command line. This is very convenient in the common case,
-especially when combined with the ability to define command line options in the
-files that use them. This is called the internal storage model.</p>
-
-<p>Sometimes, however, it is nice to separate the command line option processing
-code from the storage of the value parsed. For example, let's say that we have a
-'<tt>-debug</tt>' option that we would like to use to enable debug information
-across the entire body of our program. In this case, the boolean value
-controlling the debug code should be globally accessible (in a header file, for
-example) yet the command line option processing code should not be exposed to
-all of these clients (requiring lots of .cpp files to #include
-<tt>CommandLine.h</tt>).</p>
-
-<p>To do this, set up your .h file with your option, like this for example:</p>
-
-<div class="doc_code">
-<pre>
-<i>// DebugFlag.h - Get access to the '-debug' command line option
-//
-
-// DebugFlag - This boolean is set to true if the '-debug' command line option
-// is specified. This should probably not be referenced directly; instead, use
-// the DEBUG macro below.
-//</i>
-extern bool DebugFlag;
-
-<i>// DEBUG macro - This macro should be used by code to emit debug information.
-// If the '-debug' option is specified on the command line, and if this is a
-// debug build, then the code specified as the option to the macro will be
-// executed. Otherwise it will not be.</i>
-<span class="doc_hilite">#ifdef NDEBUG
-#define DEBUG(X)
-#else
-#define DEBUG(X)</span> do { if (DebugFlag) { X; } } while (0)
-<span class="doc_hilite">#endif</span>
-</pre>
-</div>
-
-<p>This allows clients to blissfully use the <tt>DEBUG()</tt> macro, or the
-<tt>DebugFlag</tt> explicitly if they want to. Now we just need to be able to
-set the <tt>DebugFlag</tt> boolean when the option is set. To do this, we pass
-an additional argument to our command line argument processor, and we specify
-where to fill in with the <a href="#cl::location">cl::location</a>
-attribute:</p>
-
-<div class="doc_code">
-<pre>
-bool DebugFlag; <i>// the actual value</i>
-static <a href="#cl::opt">cl::opt</a>&lt;bool, true&gt; <i>// The parser</i>
-Debug("<i>debug</i>", <a href="#cl::desc">cl::desc</a>("<i>Enable debug output</i>"), <a href="#cl::Hidden">cl::Hidden</a>, <a href="#cl::location">cl::location</a>(DebugFlag));
-</pre>
-</div>
-
-<p>In the above example, we specify "<tt>true</tt>" as the second argument to
-the <tt><a href="#cl::opt">cl::opt</a></tt> template, indicating that the
-template should not maintain a copy of the value itself. In addition to this,
-we specify the <tt><a href="#cl::location">cl::location</a></tt> attribute, so
-that <tt>DebugFlag</tt> is automatically set.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="attributes">Option Attributes</a>
-</h3>
-
-<div>
-
-<p>This section describes the basic attributes that you can specify on
-options.</p>
-
-<ul>
-
-<li>The option name attribute (which is required for all options, except <a
-href="#positional">positional options</a>) specifies what the option name is.
-The option name is specified as a simple double-quoted string:
-
-<pre>
-<a href="#cl::opt">cl::opt</a>&lt;<b>bool</b>&gt; Quiet("<i>quiet</i>");
-</pre>
-
-</li>
-
-<li><a name="cl::desc">The <b><tt>cl::desc</tt></b></a> attribute specifies a
-description for the option to be shown in the <tt>-help</tt> output for the
-program.</li>
-
-<li><a name="cl::value_desc">The <b><tt>cl::value_desc</tt></b></a> attribute
-specifies a string that can be used to fine tune the <tt>-help</tt> output for
-a command line option. Look <a href="#value_desc_example">here</a> for an
-example.</li>
-
-<li><a name="cl::init">The <b><tt>cl::init</tt></b></a> attribute specifies an
-initial value for a <a href="#cl::opt">scalar</a> option. If this attribute is
-not specified then the command line option value defaults to the value created
-by the default constructor for the type. <b>Warning</b>: If you specify both
-<b><tt>cl::init</tt></b> and <b><tt>cl::location</tt></b> for an option,
-you must specify <b><tt>cl::location</tt></b> first, so that when the
-command-line parser sees <b><tt>cl::init</tt></b>, it knows where to put the
-initial value. (You will get an error at runtime if you don't put them in
-the right order.)</li>
-
-<li><a name="cl::location">The <b><tt>cl::location</tt></b></a> attribute where
-to store the value for a parsed command line option if using external storage.
-See the section on <a href="#storage">Internal vs External Storage</a> for more
-information.</li>
-
-<li><a name="cl::aliasopt">The <b><tt>cl::aliasopt</tt></b></a> attribute
-specifies which option a <tt><a href="#cl::alias">cl::alias</a></tt> option is
-an alias for.</li>
-
-<li><a name="cl::values">The <b><tt>cl::values</tt></b></a> attribute specifies
-the string-to-value mapping to be used by the generic parser. It takes a
-<b>clEnumValEnd terminated</b> list of (option, value, description) triplets
-that
-specify the option name, the value mapped to, and the description shown in the
-<tt>-help</tt> for the tool. Because the generic parser is used most
-frequently with enum values, two macros are often useful:
-
-<ol>
-
-<li><a name="clEnumVal">The <b><tt>clEnumVal</tt></b></a> macro is used as a
-nice simple way to specify a triplet for an enum. This macro automatically
-makes the option name be the same as the enum name. The first option to the
-macro is the enum, the second is the description for the command line
-option.</li>
-
-<li><a name="clEnumValN">The <b><tt>clEnumValN</tt></b></a> macro is used to
-specify macro options where the option name doesn't equal the enum name. For
-this macro, the first argument is the enum value, the second is the flag name,
-and the second is the description.</li>
-
-</ol>
-
-You will get a compile time error if you try to use cl::values with a parser
-that does not support it.</li>
-
-<li><a name="cl::multi_val">The <b><tt>cl::multi_val</tt></b></a>
-attribute specifies that this option takes has multiple values
-(example: <tt>-sectalign segname sectname sectvalue</tt>). This
-attribute takes one unsigned argument - the number of values for the
-option. This attribute is valid only on <tt>cl::list</tt> options (and
-will fail with compile error if you try to use it with other option
-types). It is allowed to use all of the usual modifiers on
-multi-valued options (besides <tt>cl::ValueDisallowed</tt>,
-obviously).</li>
-
-</ul>
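-
-<p>As the sketch promised above (the option name here is hypothetical), several
-of these attributes might be combined on one option like this:</p>
-
-<div class="doc_code"><pre>
-<i>// Accept an output path, defaulting to "a.out" if the option is absent.</i>
-<b>static</b> <a href="#cl::opt">cl::opt</a>&lt;std::string&gt;
-OutputPath("<i>output-path</i>", <a href="#cl::desc">cl::desc</a>("<i>Where to write the result</i>"),
-           <a href="#cl::value_desc">cl::value_desc</a>("<i>path</i>"), <a href="#cl::init">cl::init</a>("<i>a.out</i>"));
-</pre></div>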
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="modifiers">Option Modifiers</a>
-</h3>
-
-<div>
-
-<p>Option modifiers are the flags and expressions that you pass into the
-constructors for <tt><a href="#cl::opt">cl::opt</a></tt> and <tt><a
-href="#cl::list">cl::list</a></tt>. These modifiers give you the ability to
-tweak how options are parsed and how <tt>-help</tt> output is generated to fit
-your application well.</p>
-
-<p>These options fall into five main categories:</p>
-
-<ol>
-<li><a href="#hiding">Hiding an option from <tt>-help</tt> output</a></li>
-<li><a href="#numoccurrences">Controlling the number of occurrences
- required and allowed</a></li>
-<li><a href="#valrequired">Controlling whether or not a value must be
- specified</a></li>
-<li><a href="#formatting">Controlling other formatting options</a></li>
-<li><a href="#misc">Miscellaneous option modifiers</a></li>
-</ol>
-
-<p>It is not possible to apply two modifiers from the same category to a single
-option (you'll get a runtime error if you try), except for modifiers in the
-miscellaneous category. The CommandLine library specifies defaults for all of
-these settings that are the most useful in practice and the most common, which
-means that you usually shouldn't have to worry about these.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="hiding">Hiding an option from <tt>-help</tt> output</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::NotHidden</tt>, <tt>cl::Hidden</tt>, and
-<tt>cl::ReallyHidden</tt> modifiers are used to control whether or not an option
-appears in the <tt>-help</tt> and <tt>-help-hidden</tt> output for the
-compiled program:</p>
-
-<ul>
-
-<li><a name="cl::NotHidden">The <b><tt>cl::NotHidden</tt></b></a> modifier
-(which is the default for <tt><a href="#cl::opt">cl::opt</a></tt> and <tt><a
-href="#cl::list">cl::list</a></tt> options) indicates the option is to appear
-in both help listings.</li>
-
-<li><a name="cl::Hidden">The <b><tt>cl::Hidden</tt></b></a> modifier (which is the
-default for <tt><a href="#cl::alias">cl::alias</a></tt> options) indicates that
-the option should not appear in the <tt>-help</tt> output, but should appear in
-the <tt>-help-hidden</tt> output.</li>
-
-<li><a name="cl::ReallyHidden">The <b><tt>cl::ReallyHidden</tt></b></a> modifier
-indicates that the option should not appear in any help output.</li>
-
-</ul>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="numoccurrences">Controlling the number of occurrences required and
- allowed</a>
-</h4>
-
-<div>
-
-<p>This group of options is used to control how many times an option is allowed
-(or required) to be specified on the command line of your program. Specifying a
-value for this setting allows the CommandLine library to do error checking for
-you.</p>
-
-<p>The allowed values for this option group are:</p>
-
-<ul>
-
-<li><a name="cl::Optional">The <b><tt>cl::Optional</tt></b></a> modifier (which
-is the default for the <tt><a href="#cl::opt">cl::opt</a></tt> and <tt><a
-href="#cl::alias">cl::alias</a></tt> classes) indicates that your program will
-allow either zero or one occurrence of the option to be specified.</li>
-
-<li><a name="cl::ZeroOrMore">The <b><tt>cl::ZeroOrMore</tt></b></a> modifier
-(which is the default for the <tt><a href="#cl::list">cl::list</a></tt> class)
-indicates that your program will allow the option to be specified zero or more
-times.</li>
-
-<li><a name="cl::Required">The <b><tt>cl::Required</tt></b></a> modifier
-indicates that the specified option must be specified exactly one time.</li>
-
-<li><a name="cl::OneOrMore">The <b><tt>cl::OneOrMore</tt></b></a> modifier
-indicates that the option must be specified at least one time.</li>
-
-<li>The <b><tt>cl::ConsumeAfter</tt></b> modifier is described in the <a
-href="#positional">Positional arguments section</a>.</li>
-
-</ul>
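-
-<p>As a short sketch (the option names are hypothetical):</p>
-
-<div class="doc_code"><pre>
-<i>// Exactly one input file must be given.</i>
-<b>static</b> <a href="#cl::opt">cl::opt</a>&lt;std::string&gt; Input(<a href="#positional">cl::Positional</a>, <a href="#cl::Required">cl::Required</a>, <a href="#cl::desc">cl::desc</a>("<i>&lt;input&gt;</i>"));
-
-<i>// Any number of -plugin options may be given, including none.</i>
-<b>static</b> <a href="#cl::list">cl::list</a>&lt;std::string&gt; Plugins("<i>plugin</i>", <a href="#cl::ZeroOrMore">cl::ZeroOrMore</a>, <a href="#cl::desc">cl::desc</a>("<i>Plugins to load</i>"));
-</pre></div>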
-
-<p>If an option is not specified, then the value of the option is equal to the
-value specified by the <tt><a href="#cl::init">cl::init</a></tt> attribute. If
-the <tt><a href="#cl::init">cl::init</a></tt> attribute is not specified, the
-option value is initialized with the default constructor for the data type.</p>
-
-<p>If an option of the <tt><a href="#cl::opt">cl::opt</a></tt> class is
-specified multiple times, only the last value will be retained.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="valrequired">Controlling whether or not a value must be specified</a>
-</h4>
-
-<div>
-
-<p>This group of options is used to control whether or not the option allows a
-value to be present. In the case of the CommandLine library, a value is either
-specified with an equal sign (e.g. '<tt>-index-depth=17</tt>') or as a trailing
-string (e.g. '<tt>-o a.out</tt>').</p>
-
-<p>The allowed values for this option group are:</p>
-
-<ul>
-
-<li><a name="cl::ValueOptional">The <b><tt>cl::ValueOptional</tt></b></a> modifier
-(which is the default for <tt>bool</tt> typed options) specifies that it is
-acceptable to have a value, or not. A boolean argument can be enabled just by
-appearing on the command line, or it can have an explicit '<tt>-foo=true</tt>'.
-If an option is specified with this mode, it is illegal for the value to be
-provided without the equal sign. Therefore '<tt>-foo true</tt>' is illegal. To
-get this behavior, you must use the <a
-href="#cl::ValueRequired">cl::ValueRequired</a> modifier.</li>
-
-<li><a name="cl::ValueRequired">The <b><tt>cl::ValueRequired</tt></b></a> modifier
-(which is the default for all other types except for <a
-href="#onealternative">unnamed alternatives using the generic parser</a>)
-specifies that a value must be provided. This mode informs the command line
-library that if an option is not provided with an equal sign, the next
-argument provided must be the value. This allows things like '<tt>-o
-a.out</tt>' to work.</li>
-
-<li><a name="cl::ValueDisallowed">The <b><tt>cl::ValueDisallowed</tt></b></a>
-modifier (which is the default for <a href="#onealternative">unnamed
-alternatives using the generic parser</a>) indicates that it is a runtime error
-for the user to specify a value. This can be provided to disallow users from
-providing options to boolean options (like '<tt>-foo=true</tt>').</li>
-
-</ul>
-
-<p>In general, the default values for this option group work just like you would
-want them to. As mentioned above, you can specify the <a
-href="#cl::ValueDisallowed">cl::ValueDisallowed</a> modifier to a boolean
-argument to restrict your command line parser. These options are mostly useful
-when <a href="#extensionguide">extending the library</a>.</p>
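-
-<p>For instance, a minimal sketch (the option name is hypothetical) that
-forbids a value on a boolean flag:</p>
-
-<div class="doc_code"><pre>
-<i>// '-strict' is accepted; '-strict=true' becomes a runtime error.</i>
-<b>static</b> <a href="#cl::opt">cl::opt</a>&lt;<b>bool</b>&gt; Strict("<i>strict</i>", <a href="#cl::desc">cl::desc</a>("<i>Enable strict checking</i>"), <a href="#cl::ValueDisallowed">cl::ValueDisallowed</a>);
-</pre></div>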
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="formatting">Controlling other formatting options</a>
-</h4>
-
-<div>
-
-<p>The formatting option group is used to specify that the command line option
-has special abilities and is otherwise different from other command line
-arguments. As usual, you can only specify one of these arguments at most.</p>
-
-<ul>
-
-<li><a name="cl::NormalFormatting">The <b><tt>cl::NormalFormatting</tt></b></a>
-modifier (which is the default all options) specifies that this option is
-"normal".</li>
-
-<li><a name="cl::Positional">The <b><tt>cl::Positional</tt></b></a> modifier
-specifies that this is a positional argument that does not have a command line
-option associated with it. See the <a href="#positional">Positional
-Arguments</a> section for more information.</li>
-
-<li>The <b><a href="#cl::ConsumeAfter"><tt>cl::ConsumeAfter</tt></a></b> modifier
-specifies that this option is used to capture "interpreter style" arguments. See <a href="#cl::ConsumeAfter">this section for more information</a>.</li>
-
-<li><a name="cl::Prefix">The <b><tt>cl::Prefix</tt></b></a> modifier specifies
-that this option prefixes its value. With 'Prefix' options, the equal sign does
-not separate the value from the option name specified. Instead, the value is
-everything after the prefix, including any equal sign if present. This is useful
-for processing odd arguments like <tt>-lmalloc</tt> and <tt>-L/usr/lib</tt> in a
-linker tool or <tt>-DNAME=value</tt> in a compiler tool. Here, the
-'<tt>l</tt>', '<tt>D</tt>' and '<tt>L</tt>' options are normal string (or list)
-options, that have the <b><tt><a href="#cl::Prefix">cl::Prefix</a></tt></b>
-modifier added to allow the CommandLine library to recognize them. Note that
-<b><tt><a href="#cl::Prefix">cl::Prefix</a></tt></b> options must not have the
-<b><tt><a href="#cl::ValueDisallowed">cl::ValueDisallowed</a></tt></b> modifier
-specified.</li>
-
-<li><a name="cl::Grouping">The <b><tt>cl::Grouping</tt></b></a> modifier is used
-to implement Unix-style tools (like <tt>ls</tt>) that have lots of single letter
-arguments, but only require a single dash. For example, the '<tt>ls -labF</tt>'
-command actually enables four different options, all of which are single
-letters. Note that <b><tt><a href="#cl::Grouping">cl::Grouping</a></tt></b>
-options cannot have values.</li>
-
-</ul>
-
-<p>The CommandLine library does not restrict how you use the <b><tt><a
-href="#cl::Prefix">cl::Prefix</a></tt></b> or <b><tt><a
-href="#cl::Grouping">cl::Grouping</a></tt></b> modifiers, even though it is
-possible to specify ambiguous argument settings. In particular, it is possible
-to have multiple-letter options that are prefix or grouping options, and they
-will still work as designed.</p>
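-
-<p>A brief sketch of the linker-style usage described above (assuming a tool
-that wants <tt>-l</tt> and <tt>-D</tt> style arguments):</p>
-
-<div class="doc_code"><pre>
-<i>// '-lmalloc' stores the value "malloc" in Libraries.</i>
-<b>static</b> <a href="#cl::list">cl::list</a>&lt;std::string&gt; Libraries("<i>l</i>", <a href="#cl::Prefix">cl::Prefix</a>, <a href="#cl::desc">cl::desc</a>("<i>Libraries to link</i>"));
-
-<i>// '-DNAME=value' stores the value "NAME=value" in Defines.</i>
-<b>static</b> <a href="#cl::list">cl::list</a>&lt;std::string&gt; Defines("<i>D</i>", <a href="#cl::Prefix">cl::Prefix</a>, <a href="#cl::desc">cl::desc</a>("<i>Preprocessor defines</i>"));
-</pre></div>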
-
-<p>To do this, the CommandLine library uses a greedy algorithm to parse the
-input option into (potentially multiple) prefix and grouping options. The
-strategy basically looks like this:</p>
-
-<div class="doc_code"><tt>parse(string OrigInput) {</tt>
-
-<ol>
-<li><tt>string input = OrigInput;</tt>
-<li><tt>if (isOption(input)) return getOption(input).parse();</tt>&nbsp;&nbsp;&nbsp;&nbsp;<i>// Normal option</i>
-<li><tt>while (!isOption(input) &amp;&amp; !input.empty()) input.pop_back();</tt>&nbsp;&nbsp;&nbsp;&nbsp;<i>// Remove the last letter</i>
-<li><tt>if (input.empty()) return error();</tt>&nbsp;&nbsp;&nbsp;&nbsp;<i>// No matching option</i>
-<li><tt>if (getOption(input).isPrefix())<br>
-&nbsp;&nbsp;return getOption(input).parse(input);</tt>
-<li><tt>while (!input.empty()) {&nbsp;&nbsp;&nbsp;&nbsp;<i>// Must be grouping options</i><br>
-&nbsp;&nbsp;getOption(input).parse();<br>
-&nbsp;&nbsp;OrigInput.erase(OrigInput.begin(), OrigInput.begin()+input.length());<br>
-&nbsp;&nbsp;input = OrigInput;<br>
-&nbsp;&nbsp;while (!isOption(input) &amp;&amp; !input.empty()) input.pop_back();<br>
-}</tt>
-<li><tt>if (!OrigInput.empty()) error();</tt></li>
-</ol>
-
-<p><tt>}</tt></p>
-</div>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="misc">Miscellaneous option modifiers</a>
-</h4>
-
-<div>
-
-<p>The miscellaneous option modifiers are the only flags where you can specify
-more than one flag from the set: they are not mutually exclusive. These flags
-specify boolean properties that modify the option.</p>
-
-<ul>
-
-<li><a name="cl::CommaSeparated">The <b><tt>cl::CommaSeparated</tt></b></a> modifier
-indicates that any commas specified for an option's value should be used to
-split the value up into multiple values for the option. For example, these two
-options are equivalent when <tt>cl::CommaSeparated</tt> is specified:
-"<tt>-foo=a -foo=b -foo=c</tt>" and "<tt>-foo=a,b,c</tt>". This option only
-makes sense to be used in a case where the option is allowed to accept one or
-more values (i.e. it is a <a href="#cl::list">cl::list</a> option).</li>
-
-<li><a name="cl::PositionalEatsArgs">The
-<b><tt>cl::PositionalEatsArgs</tt></b></a> modifier (which only applies to
-positional arguments, and only makes sense for lists) indicates that positional
-argument should consume any strings after it (including strings that start with
-a "-") up until another recognized positional argument. For example, if you
-have two "eating" positional arguments, "<tt>pos1</tt>" and "<tt>pos2</tt>", the
-string "<tt>-pos1 -foo -bar baz -pos2 -bork</tt>" would cause the "<tt>-foo -bar
--baz</tt>" strings to be applied to the "<tt>-pos1</tt>" option and the
-"<tt>-bork</tt>" string to be applied to the "<tt>-pos2</tt>" option.</li>
-
-<li><a name="cl::Sink">The <b><tt>cl::Sink</tt></b></a> modifier is
-used to handle unknown options. If there is at least one option with
-<tt>cl::Sink</tt> modifier specified, the parser passes
-unrecognized option strings to it as values instead of signaling an
-error. As with <tt>cl::CommaSeparated</tt>, this modifier
-only makes sense with a <a href="#cl::list">cl::list</a> option.</li>
-
-</ul>
-
-<p>So far, these are the only three miscellaneous option modifiers; a brief
-sketch of the first and third follows.</p>
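-
-<p>As a sketch (the option names are hypothetical):</p>
-
-<div class="doc_code"><pre>
-<i>// '-langs=c,cpp' is equivalent to '-langs=c -langs=cpp'.</i>
-<b>static</b> <a href="#cl::list">cl::list</a>&lt;std::string&gt; Langs("<i>langs</i>", <a href="#cl::CommaSeparated">cl::CommaSeparated</a>, <a href="#cl::desc">cl::desc</a>("<i>Languages</i>"));
-
-<i>// Unrecognized options are collected here instead of causing an error.</i>
-<b>static</b> <a href="#cl::list">cl::list</a>&lt;std::string&gt; Unknown(<a href="#cl::Sink">cl::Sink</a>);
-</pre></div>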
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="response">Response files</a>
-</h4>
-
-<div>
-
-<p>Some systems, such as certain variants of Microsoft Windows and
-some older Unices, have a relatively low limit on command-line
-length. It is therefore customary to use so-called 'response
-files' to circumvent this restriction. These files are mentioned on
-the command-line (using the "@file" syntax). The program reads these
-files and inserts the contents into argv, thereby working around the
-command-line length limits. Response files are enabled by an optional
-fourth argument to
-<a href="#cl::ParseEnvironmentOptions"><tt>cl::ParseEnvironmentOptions</tt></a>
-and
-<a href="#cl::ParseCommandLineOptions"><tt>cl::ParseCommandLineOptions</tt></a>.
-</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="toplevel">Top-Level Classes and Functions</a>
-</h3>
-
-<div>
-
-<p>Despite all of the built-in flexibility, the CommandLine option library
-really only consists of one function (<a
-href="#cl::ParseCommandLineOptions"><tt>cl::ParseCommandLineOptions</tt></a>)
-and three main classes: <a href="#cl::opt"><tt>cl::opt</tt></a>, <a
-href="#cl::list"><tt>cl::list</tt></a>, and <a
-href="#cl::alias"><tt>cl::alias</tt></a>. This section describes these three
-classes in detail.</p>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::ParseCommandLineOptions">The <tt>cl::ParseCommandLineOptions</tt>
- function</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::ParseCommandLineOptions</tt> function is designed to be called
-directly from <tt>main</tt>, and is used to fill in the values of all of the
-command line option variables once <tt>argc</tt> and <tt>argv</tt> are
-available.</p>
-
-<p>The <tt>cl::ParseCommandLineOptions</tt> function requires two parameters
-(<tt>argc</tt> and <tt>argv</tt>), but may also take an optional third parameter
-which holds <a href="#description">additional extra text</a> to emit when the
-<tt>-help</tt> option is invoked, and a fourth boolean parameter that enables
-<a href="#response">response files</a>.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::ParseEnvironmentOptions">The <tt>cl::ParseEnvironmentOptions</tt>
- function</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::ParseEnvironmentOptions</tt> function has mostly the same effects
-as <a
-href="#cl::ParseCommandLineOptions"><tt>cl::ParseCommandLineOptions</tt></a>,
-except that it is designed to take values for options from an environment
-variable, for those cases in which reading the command line is not convenient or
-desired. It fills in the values of all the command line option variables just
-like <a
-href="#cl::ParseCommandLineOptions"><tt>cl::ParseCommandLineOptions</tt></a>
-does.</p>
-
-<p>It takes four parameters: the name of the program (since <tt>argv</tt> may
-not be available, it can't just look in <tt>argv[0]</tt>), the name of the
-environment variable to examine, the optional
-<a href="#description">additional extra text</a> to emit when the
-<tt>-help</tt> option is invoked, and the boolean
-switch that controls whether <a href="#response">response files</a>
-should be read.</p>
-
-<p><tt>cl::ParseEnvironmentOptions</tt> will break the environment
-variable's value up into words and then process them using
-<a href="#cl::ParseCommandLineOptions"><tt>cl::ParseCommandLineOptions</tt></a>.
-<b>Note:</b> Currently <tt>cl::ParseEnvironmentOptions</tt> does not support
-quoting, so an environment variable containing <tt>-option "foo bar"</tt> will
-be parsed as three words, <tt>-option</tt>, <tt>"foo</tt>, and <tt>bar"</tt>,
-which is different from what you would get from the shell with the same
-input.</p>
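-
-<p>A short sketch (the program and environment variable names are
-hypothetical):</p>
-
-<div class="doc_code"><pre>
-<i>// Reads options from the MYTOOL_OPTIONS environment variable.</i>
-cl::ParseEnvironmentOptions("<i>mytool</i>", "<i>MYTOOL_OPTIONS</i>");
-</pre></div>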
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::SetVersionPrinter">The <tt>cl::SetVersionPrinter</tt>
- function</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::SetVersionPrinter</tt> function is designed to be called
-directly from <tt>main</tt> and <i>before</i>
-<tt>cl::ParseCommandLineOptions</tt>. Its use is optional. It simply arranges
-for a function to be called in response to the <tt>--version</tt> option instead
-of having the <tt>CommandLine</tt> library print out the usual version string
-for LLVM. This is useful for programs that are not part of LLVM but wish to use
-the <tt>CommandLine</tt> facilities. Such programs should just define a small
-function that takes no arguments and returns <tt>void</tt> and that prints out
-whatever version information is appropriate for the program. Pass the address
-of that function to <tt>cl::SetVersionPrinter</tt> to arrange for it to be
-called when the <tt>--version</tt> option is given by the user.</p>
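-
-<p>A minimal sketch of this arrangement (the version text is hypothetical):</p>
-
-<div class="doc_code"><pre>
-<b>void</b> printMyVersion() {
-  std::cout &lt;&lt; "<i>mytool version 1.0\n</i>";  <i>// whatever is appropriate</i>
-}
-
-<b>int</b> main(<b>int</b> argc, <b>char</b> **argv) {
-  cl::SetVersionPrinter(printMyVersion);  <i>// must come before the parse call</i>
-  cl::ParseCommandLineOptions(argc, argv);
-  ...
-}
-</pre></div>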
-
-</div>
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::opt">The <tt>cl::opt</tt> class</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::opt</tt> class is the class used to represent scalar command line
-options, and is the one used most of the time. It is a templated class which
-can take up to three arguments (all but the first have default values):</p>
-
-<div class="doc_code"><pre>
-<b>namespace</b> cl {
- <b>template</b> &lt;<b>class</b> DataType, <b>bool</b> ExternalStorage = <b>false</b>,
- <b>class</b> ParserClass = parser&lt;DataType&gt; &gt;
- <b>class</b> opt;
-}
-</pre></div>
-
-<p>The first template argument specifies what underlying data type the command
-line argument is, and is used to select a default parser implementation. The
-second template argument is used to specify whether the option should contain
-the storage for the option (the default) or whether external storage should be
-used to contain the value parsed for the option (see <a href="#storage">Internal
-vs External Storage</a> for more information).</p>
-
-<p>The third template argument specifies which parser to use. The default value
-selects an instantiation of the <tt>parser</tt> class based on the underlying
-data type of the option. In general, this default works well for most
-applications, so this option is only used when using a <a
-href="#customparser">custom parser</a>.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::list">The <tt>cl::list</tt> class</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::list</tt> class is the class used to represent a list of command
-line options. It too is a templated class which can take up to three
-arguments:</p>
-
-<div class="doc_code"><pre>
-<b>namespace</b> cl {
- <b>template</b> &lt;<b>class</b> DataType, <b>class</b> Storage = <b>bool</b>,
- <b>class</b> ParserClass = parser&lt;DataType&gt; &gt;
- <b>class</b> list;
-}
-</pre></div>
-
-<p>This class works exactly the same as the <a
-href="#cl::opt"><tt>cl::opt</tt></a> class, except that the second argument is
-the <b>type</b> of the external storage, not a boolean value. For this class,
-the marker type '<tt>bool</tt>' is used to indicate that internal storage should
-be used.</p>
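-
-<p>For example, a sketch using external storage (assuming the external
-container is wired up with the <a href="#cl::location">cl::location</a>
-attribute, just as in the scalar case):</p>
-
-<div class="doc_code"><pre>
-std::vector&lt;std::string&gt; InputList;  <i>// the actual storage</i>
-<b>static</b> <a href="#cl::list">cl::list</a>&lt;std::string, std::vector&lt;std::string&gt; &gt;
-Inputs("<i>input</i>", <a href="#cl::location">cl::location</a>(InputList), <a href="#cl::desc">cl::desc</a>("<i>Input files</i>"));
-</pre></div>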
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::bits">The <tt>cl::bits</tt> class</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::bits</tt> class is the class used to represent a list of command
-line options in the form of a bit vector. It is also a templated class which
-can take up to three arguments:</p>
-
-<div class="doc_code"><pre>
-<b>namespace</b> cl {
- <b>template</b> &lt;<b>class</b> DataType, <b>class</b> Storage = <b>bool</b>,
- <b>class</b> ParserClass = parser&lt;DataType&gt; &gt;
- <b>class</b> bits;
-}
-</pre></div>
-
-<p>This class works exactly the same as the <a
-href="#cl::list"><tt>cl::list</tt></a> class, except that the second argument
-must be of <b>type</b> <tt>unsigned</tt> if external storage is used.</p>
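-
-<p>A brief sketch, reusing the <a href="#cl::values">cl::values</a> machinery
-described earlier (the enum and option names are hypothetical):</p>
-
-<div class="doc_code"><pre>
-<b>enum</b> DebugBits { timing, memory };
-<b>static</b> cl::bits&lt;DebugBits&gt; DebugWhat(<a href="#cl::desc">cl::desc</a>("<i>Debug categories:</i>"),
-  <a href="#cl::values">cl::values</a>(clEnumVal(timing, "<i>Time each pass</i>"),
-             clEnumVal(memory, "<i>Track memory usage</i>"),
-             clEnumValEnd));
-</pre></div>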
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::alias">The <tt>cl::alias</tt> class</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::alias</tt> class is a nontemplated class that is used to form
-aliases for other arguments.</p>
-
-<div class="doc_code"><pre>
-<b>namespace</b> cl {
- <b>class</b> alias;
-}
-</pre></div>
-
-<p>The <a href="#cl::aliasopt"><tt>cl::aliasopt</tt></a> attribute should be
-used to specify which option this is an alias for. Alias arguments default to
-being <a href="#cl::Hidden">Hidden</a>, and use the aliased options parser to do
-the conversion from string to data.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>
- <a name="cl::extrahelp">The <tt>cl::extrahelp</tt> class</a>
-</h4>
-
-<div>
-
-<p>The <tt>cl::extrahelp</tt> class is a nontemplated class that allows extra
-help text to be printed out for the <tt>-help</tt> option.</p>
-
-<div class="doc_code"><pre>
-<b>namespace</b> cl {
- <b>struct</b> extrahelp;
-}
-</pre></div>
-
-<p>To use the extrahelp, simply construct one with a <tt>const char*</tt>
-parameter to the constructor. The text passed to the constructor will be printed
-at the bottom of the help message, verbatim. Note that multiple
-<tt>cl::extrahelp</tt> <b>can</b> be used, but this practice is discouraged. If
-your tool needs to print additional help information, put all that help into a
-single <tt>cl::extrahelp</tt> instance.</p>
-<p>For example:</p>
-<div class="doc_code"><pre>
- cl::extrahelp("\nADDITIONAL HELP:\n\n This is the extra help\n");
-</pre></div>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="builtinparsers">Builtin parsers</a>
-</h3>
-
-<div>
-
-<p>Parsers control how the string value taken from the command line is
-translated into a typed value, suitable for use in a C++ program. By default,
-the CommandLine library uses an instance of <tt>parser&lt;type&gt;</tt> if the
-command line option specifies that it uses values of type '<tt>type</tt>'.
-Because of this, custom option processing is specified with specializations of
-the '<tt>parser</tt>' class.</p>
-
-<p>The CommandLine library provides the following builtin parser
-specializations, which are sufficient for most applications. It can, however,
-also be extended to work with new data types and new ways of interpreting the
-same data. See the <a href="#customparser">Writing a Custom Parser</a> section
-for more details on this type of library extension.</p>
-
-<ul>
-
-<li><a name="genericparser">The <b>generic <tt>parser&lt;t&gt;</tt> parser</b></a>
-can be used to map string values to any data type, through the use of the <a
-href="#cl::values">cl::values</a> property, which specifies the mapping
-information. The most common use of this parser is for parsing enum values,
-which allows you to use the CommandLine library for all of the error checking to
-make sure that only valid enum values are specified (as opposed to accepting
-arbitrary strings). Despite this, however, the generic parser class can be used
-for any data type.</li>
-
-<li><a name="boolparser">The <b><tt>parser&lt;bool&gt;</tt> specialization</b></a>
-is used to convert boolean strings to a boolean value. Currently accepted
-strings are "<tt>true</tt>", "<tt>TRUE</tt>", "<tt>True</tt>", "<tt>1</tt>",
-"<tt>false</tt>", "<tt>FALSE</tt>", "<tt>False</tt>", and "<tt>0</tt>".</li>
-
-<li><a name="boolOrDefaultparser">The <b><tt>parser&lt;boolOrDefault&gt;</tt>
- specialization</b></a> is used for cases where the value is boolean,
-but we also need to know whether the option was specified at all. boolOrDefault
-is an enum with 3 values, BOU_UNSET, BOU_TRUE and BOU_FALSE. This parser accepts
-the same strings as <b><tt>parser&lt;bool&gt;</tt></b>.</li>
-
-<li><a name="stringparser">The <b><tt>parser&lt;string&gt;</tt>
-specialization</b></a> simply stores the parsed string into the string value
-specified. No conversion or modification of the data is performed.</li>
-
-<li><a name="intparser">The <b><tt>parser&lt;int&gt;</tt> specialization</b></a>
-uses the C <tt>strtol</tt> function to parse the string input. As such, it will
-accept a decimal number (with an optional '+' or '-' prefix) which must start
-with a non-zero digit. It accepts octal numbers, which are identified with a
-'<tt>0</tt>' prefix digit, and hexadecimal numbers with a prefix of
-'<tt>0x</tt>' or '<tt>0X</tt>'.</li>
-
-<li><a name="doubleparser">The <b><tt>parser&lt;double&gt;</tt></b></a> and
-<b><tt>parser&lt;float&gt;</tt> specializations</b> use the standard C
-<tt>strtod</tt> function to convert floating point strings into floating point
-values. As such, a broad range of string formats is supported, including
-exponential notation (ex: <tt>1.7e15</tt>) and properly supports locales.
-</li>
-
-</ul>
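-
-<p>As a short sketch (the option names here are hypothetical), declaring
-options that use the numeric parsers described above:</p>
-
-<div class="doc_code"><pre>
-<i>// Parsed with parser&lt;int&gt; and parser&lt;double&gt; respectively.</i>
-<b>static</b> <a href="#cl::opt">cl::opt</a>&lt;<b>int</b>&gt; Threads("<i>threads</i>", <a href="#cl::desc">cl::desc</a>("<i>Number of worker threads</i>"), <a href="#cl::init">cl::init</a>(1));
-<b>static</b> <a href="#cl::opt">cl::opt</a>&lt;<b>double</b>&gt; Scale("<i>scale</i>", <a href="#cl::desc">cl::desc</a>("<i>Scaling factor</i>"), <a href="#cl::init">cl::init</a>(1.0));
-</pre></div>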
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="extensionguide">Extension Guide</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Although the CommandLine library has a lot of functionality built into it
-already (as discussed previously), one of its true strengths lies in its
-extensibility. This section discusses how the CommandLine library works under
-the covers and illustrates how to do some simple, common, extensions.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="customparser">Writing a custom parser</a>
-</h3>
-
-<div>
-
-<p>One of the simplest and most common extensions is the use of a custom parser.
-As <a href="#builtinparsers">discussed previously</a>, parsers are the portion
-of the CommandLine library that turns string input from the user into a
-particular parsed data type, validating the input in the process.</p>
-
-<p>There are two ways to use a new parser:</p>
-
-<ol>
-
-<li>
-
-<p>Specialize the <a href="#genericparser"><tt>cl::parser</tt></a> template for
-your custom data type.</p>
-
-<p>This approach has the advantage that users of your custom data type will
-automatically use your custom parser whenever they define an option with a value
-type of your data type. The disadvantage of this approach is that it doesn't
-work if your fundamental data type is something that is already supported.</p>
-
-</li>
-
-<li>
-
-<p>Write an independent class, using it explicitly from options that need
-it.</p>
-
-<p>This approach works well in situations where you would like to parse an
-option using special syntax for a not-very-special data type. The drawback of
-this approach is that users of your parser have to be aware that they are using
-your parser instead of the builtin ones.</p>
-
-</li>
-
-</ol>
-
-<p>To guide the discussion, we will discuss a custom parser that accepts file
-sizes, specified with an optional unit after the numeric size. For example, we
-would like to parse "102kb", "41M", "1G" into the appropriate integer value. In
-this case, the underlying data type we want to parse into is
-'<tt>unsigned</tt>'. We choose approach #2 above because we don't want to make
-this the default for all <tt>unsigned</tt> options.</p>
-
-<p>To start out, we declare our new <tt>FileSizeParser</tt> class:</p>
-
-<div class="doc_code"><pre>
-<b>struct</b> FileSizeParser : <b>public</b> cl::basic_parser&lt;<b>unsigned</b>&gt; {
- <i>// parse - Return true on error.</i>
- <b>bool</b> parse(cl::Option &amp;O, <b>const char</b> *ArgName, <b>const</b> std::string &amp;ArgValue,
- <b>unsigned</b> &amp;Val);
-};
-</pre></div>
-
-<p>Our new class inherits from the <tt>cl::basic_parser</tt> template class to
-fill in the default, boilerplate code for us. We give it the data type that
-we parse into, which is also the type of the last argument to the
-<tt>parse</tt> method, so that clients of our custom parser know what object
-type to pass in. (Here we declare that we parse into '<tt>unsigned</tt>'
-variables.)</p>
-
-<p>For most purposes, the only method that must be implemented in a custom
-parser is the <tt>parse</tt> method. The <tt>parse</tt> method is called
-whenever the option is invoked, passing in the option itself, the option name,
-the string to parse, and a reference to a return value. If the string to parse
-is not well-formed, the parser should output an error message and return true.
-Otherwise it should return false and set '<tt>Val</tt>' to the parsed value. In
-our example, we implement <tt>parse</tt> as:</p>
-
-<div class="doc_code"><pre>
-<b>bool</b> FileSizeParser::parse(cl::Option &amp;O, <b>const char</b> *ArgName,
- <b>const</b> std::string &amp;Arg, <b>unsigned</b> &amp;Val) {
- <b>const char</b> *ArgStart = Arg.c_str();
- <b>char</b> *End;
-
- <i>// Parse integer part, leaving 'End' pointing to the first non-integer char</i>
- Val = (unsigned)strtol(ArgStart, &amp;End, 0);
-
- <b>while</b> (1) {
- <b>switch</b> (*End++) {
- <b>case</b> 0: <b>return</b> false; <i>// No error</i>
- <b>case</b> 'i': <i>// Ignore the 'i' in KiB if people use that</i>
- <b>case</b> 'b': <b>case</b> 'B': <i>// Ignore B suffix</i>
- <b>break</b>;
-
- <b>case</b> 'g': <b>case</b> 'G': Val *= 1024*1024*1024; <b>break</b>;
- <b>case</b> 'm': <b>case</b> 'M': Val *= 1024*1024; <b>break</b>;
- <b>case</b> 'k': <b>case</b> 'K': Val *= 1024; <b>break</b>;
-
- default:
- <i>// Print an error message if unrecognized character!</i>
- <b>return</b> O.error("'" + Arg + "' value invalid for file size argument!");
- }
- }
-}
-</pre></div>
-
-<p>This function implements a very simple parser for the kinds of strings we are
-interested in. Although it has some holes (it allows "<tt>123KKK</tt>" for
-example), it is good enough for this example. Note that we use the option
-itself to print out the error message (the <tt>error</tt> method always returns
-true) in order to get a nice error message (shown below). Now that we have our
-parser class, we can use it like this:</p>
-
-<div class="doc_code"><pre>
-<b>static</b> <a href="#cl::opt">cl::opt</a>&lt;<b>unsigned</b>, <b>false</b>, FileSizeParser&gt;
-MFS(<i>"max-file-size"</i>, <a href="#cl::desc">cl::desc</a>(<i>"Maximum file size to accept"</i>),
- <a href="#cl::value_desc">cl::value_desc</a>("<i>size</i>"));
-</pre></div>
-
-<p>Which adds this to the output of our program:</p>
-
-<div class="doc_code"><pre>
-OPTIONS:
- -help - display available options (-help-hidden for more)
- ...
- <b>-max-file-size=&lt;size&gt; - Maximum file size to accept</b>
-</pre></div>
-
-<p>And we can test that our parser works correctly now (the test program just
-prints out the max-file-size argument value):</p>
-
-<div class="doc_code"><pre>
-$ ./test
-MFS: 0
-$ ./test -max-file-size=123MB
-MFS: 128974848
-$ ./test -max-file-size=3G
-MFS: 3221225472
-$ ./test -max-file-size=dog
--max-file-size option: 'dog' value invalid for file size argument!
-</pre></div>
-
-<p>It looks like it works. The error message that we get is nice and helpful,
-and we seem to accept reasonable file sizes. This wraps up the "custom parser"
-tutorial.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="explotingexternal">Exploiting external storage</a>
-</h3>
-
-<div>
- <p>Several of the LLVM libraries define static <tt>cl::opt</tt> instances that
- will automatically be included in any program that links with that library.
- This is a feature. However, sometimes it is necessary to know the value of the
- command line option outside of the library. In these cases the library does or
- should provide an external storage location that is accessible to users of the
- library. Examples of this include the <tt>llvm::DebugFlag</tt> exported by the
- <tt>lib/Support/Debug.cpp</tt> file and the <tt>llvm::TimePassesIsEnabled</tt>
- flag exported by the <tt>lib/VMCore/Pass.cpp</tt> file.</p>
-
-<p>TODO: complete this section</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="dynamicopts">Dynamically adding command line options</a>
-</h3>
-
-<div>
-
-<p>TODO: fill in this section</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
-</address>
-
-</body>
-</html>
diff --git a/docs/CommandLine.rst b/docs/CommandLine.rst
new file mode 100644
index 0000000..302f5a4
--- /dev/null
+++ b/docs/CommandLine.rst
@@ -0,0 +1,1615 @@
+.. _commandline:
+
+==============================
+CommandLine 2.0 Library Manual
+==============================
+
+Introduction
+============
+
+This document describes the CommandLine argument processing library. It will
+show you how to use it, and what it can do. The CommandLine library uses a
+declarative approach to specifying the command line options that your program
+takes. By default, these options declarations implicitly hold the value parsed
+for the option declared (of course this `can be changed`_).
+
+Although there are a **lot** of command line argument parsing libraries out
+there in many different languages, none of them fit well with what I needed. By
+looking at the features and problems of other libraries, I designed the
+CommandLine library to have the following features:
+
+#. Speed: The CommandLine library is very quick and uses little resources. The
+ parsing time of the library is directly proportional to the number of
+ arguments parsed, not the number of options recognized. Additionally,
+ command line argument values are captured transparently into user defined
+ global variables, which can be accessed like any other variable (and with the
+ same performance).
+
+#. Type Safe: As a user of CommandLine, you don't have to worry about
+ remembering the type of arguments that you want (is it an int? a string? a
+ bool? an enum?) and keep casting it around. Not only does this help prevent
+   error-prone constructs, it also leads to dramatically cleaner source code.
+
+#. No subclasses required: To use CommandLine, you instantiate variables that
+ correspond to the arguments that you would like to capture, you don't
+ subclass a parser. This means that you don't have to write **any**
+ boilerplate code.
+
+#. Globally accessible: Libraries can specify command line arguments that are
+ automatically enabled in any tool that links to the library. This is
+ possible because the application doesn't have to keep a list of arguments to
+ pass to the parser. This also makes supporting `dynamically loaded options`_
+ trivial.
+
+#. Cleaner: CommandLine supports enum and other types directly, meaning that
+ there is less error and more security built into the library. You don't have
+   there is less room for error and more type safety built into the library. You don't
+ assigned a value that is not valid for your enum type.
+
+#. Powerful: The CommandLine library supports many different types of arguments,
+ from simple `boolean flags`_ to `scalars arguments`_ (`strings`_,
+ `integers`_, `enums`_, `doubles`_), to `lists of arguments`_. This is
+ possible because CommandLine is...
+
+#. Extensible: It is very simple to add a new argument type to CommandLine.
+ Simply specify the parser that you want to use with the command line option
+ when you declare it. `Custom parsers`_ are no problem.
+
+#. Labor Saving: The CommandLine library cuts down on the amount of grunt work
+ that you, the user, have to do. For example, it automatically provides a
+ ``-help`` option that shows the available command line options for your tool.
+ Additionally, it does most of the basic correctness checking for you.
+
+#. Capable: The CommandLine library can handle lots of different forms of
+ options often found in real programs. For example, `positional`_ arguments,
+ ``ls`` style `grouping`_ options (to allow processing '``ls -lad``'
+ naturally), ``ld`` style `prefix`_ options (to parse '``-lmalloc
+ -L/usr/lib``'), and interpreter style options.
+
+This document will hopefully let you jump in and start using CommandLine in your
+utility quickly and painlessly. Additionally it should be a simple reference
+manual to figure out how stuff works. If it is failing in some area (or you
+want an extension to the library), nag the author, `Chris
+Lattner <mailto:sabre@nondot.org>`_.
+
+Quick Start Guide
+=================
+
+This section of the manual runs through a simple CommandLine'ification of a
+basic compiler tool. This is intended to show you how to jump into using the
+CommandLine library in your own program, and show you some of the cool things it
+can do.
+
+To start out, you need to include the CommandLine header file into your program:
+
+.. code-block:: c++
+
+ #include "llvm/Support/CommandLine.h"
+
+Additionally, you need to add this as the first line of your main program:
+
+.. code-block:: c++
+
+ int main(int argc, char **argv) {
+ cl::ParseCommandLineOptions(argc, argv);
+ ...
+ }
+
+... which actually parses the arguments and fills in the variable declarations.
+
+Now that you are ready to support command line arguments, we need to tell the
+system which ones we want, and what type of arguments they are. The CommandLine
+library uses a declarative syntax to model command line arguments with the
+global variable declarations that capture the parsed values. This means that
+for every command line option that you would like to support, there should be a
+global variable declaration to capture the result. For example, in a compiler,
+we would like to support the Unix-standard '``-o <filename>``' option to specify
+where to put the output. With the CommandLine library, this is represented like
+this:
+
+.. _scalars arguments:
+.. _here:
+
+.. code-block:: c++
+
+ cl::opt<string> OutputFilename("o", cl::desc("Specify output filename"), cl::value_desc("filename"));
+
+This declares a global variable "``OutputFilename``" that is used to capture the
+result of the "``o``" argument (first parameter). We specify that this is a
+simple scalar option by using the "``cl::opt``" template (as opposed to the
+"``cl::list``" template), and tell the CommandLine library that the data
+type that we are parsing is a string.
+
+The second and third parameters (which are optional) are used to specify what to
+output for the "``-help``" option. In this case, we get a line that looks like
+this:
+
+::
+
+ USAGE: compiler [options]
+
+ OPTIONS:
+ -help - display available options (-help-hidden for more)
+ -o <filename> - Specify output filename
+
+Because we specified that the command line option should parse using the
+``string`` data type, the variable declared is automatically usable as a real
+string in all contexts that a normal C++ string object may be used. For
+example:
+
+.. code-block:: c++
+
+ ...
+ std::ofstream Output(OutputFilename.c_str());
+ if (Output.good()) ...
+ ...
+
+There are many different options that you can use to customize the command line
+option handling library, but the above example shows the general interface to
+these options. The options can be specified in any order, and are specified
+with helper functions like `cl::desc(...)`_, so there are no positional
+dependencies to remember. The available options are discussed in detail in the
+`Reference Guide`_.
+
+Continuing the example, we would like to have our compiler take an input
+filename as well as an output filename, but we do not want the input filename to
+be specified with a hyphen (i.e., not ``-filename.c``). To support this style of
+argument, the CommandLine library allows for `positional`_ arguments to be
+specified for the program. These positional arguments are filled with command
+line parameters that are not in option form. We use this feature like this:
+
+.. code-block:: c++
+
+ cl::opt<string> InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+This declaration indicates that the first positional argument should be treated
+as the input filename. Here we use the `cl::init`_ option to specify an initial
+value for the command line option, which is used if the option is not specified
+(if you do not specify a `cl::init`_ modifier for an option, then the default
+constructor for the data type is used to initialize the value). Command line
+options default to being optional, so if we would like to require that the user
+always specify an input filename, we would add the `cl::Required`_ flag, and we
+could eliminate the `cl::init`_ modifier, like this:
+
+.. code-block:: c++
+
+ cl::opt<string> InputFilename(cl::Positional, cl::desc("<input file>"), cl::Required);
+
+Again, the CommandLine library does not require the options to be specified in
+any particular order, so the above declaration is equivalent to:
+
+.. code-block:: c++
+
+ cl::opt<string> InputFilename(cl::Positional, cl::Required, cl::desc("<input file>"));
+
+By simply adding the `cl::Required`_ flag, the CommandLine library will
+automatically issue an error if the argument is not specified, which shifts all
+of the command line option verification code out of your application into the
+library. This is just one example of how using flags can alter the default
+behaviour of the library, on a per-option basis. By adding one of the
+declarations above, the ``-help`` option synopsis is now extended to:
+
+::
+
+ USAGE: compiler [options] <input file>
+
+ OPTIONS:
+ -help - display available options (-help-hidden for more)
+ -o <filename> - Specify output filename
+
+... indicating that an input filename is expected.
+
+Boolean Arguments
+-----------------
+
+In addition to input and output filenames, we would like the compiler example to
+support three boolean flags: "``-f``" to force writing binary output to a
+terminal, "``--quiet``" to enable quiet mode, and "``-q``" for backwards
+compatibility with some of our users. We can support these by declaring options
+of boolean type like this:
+
+.. code-block:: c++
+
+ cl::opt<bool> Force ("f", cl::desc("Enable binary output on terminals"));
+ cl::opt<bool> Quiet ("quiet", cl::desc("Don't print informational messages"));
+ cl::opt<bool> Quiet2("q", cl::desc("Don't print informational messages"), cl::Hidden);
+
+This does what you would expect: it declares three boolean variables
+("``Force``", "``Quiet``", and "``Quiet2``") to recognize these options. Note
+that the "``-q``" option is specified with the "`cl::Hidden`_" flag. This
+modifier prevents it from being shown by the standard "``-help``" output (note
+that it is still shown in the "``-help-hidden``" output).
+
+The CommandLine library uses a `different parser`_ for different data types.
+For example, in the string case, the argument passed to the option is copied
+literally into the content of the string variable... we obviously cannot do that
+in the boolean case, however, so we must use a smarter parser. In the case of
+the boolean parser, it allows either no value (in which case it assigns the
+value of true to the variable) or an explicit "``true``" or "``false``" to be
+specified, allowing any of the following inputs:
+
+::
+
+ compiler -f # No value, 'Force' == true
+ compiler -f=true # Value specified, 'Force' == true
+ compiler -f=TRUE # Value specified, 'Force' == true
+ compiler -f=FALSE # Value specified, 'Force' == false
+
+... you get the idea. The `bool parser`_ just turns the string values into
+boolean values, and rejects things like '``compiler -f=foo``'. Similarly, the
+`float`_, `double`_, and `int`_ parsers work like you would expect, using the
+'``strtol``' and '``strtod``' C library calls to parse the string value into the
+specified data type.
+
+With the declarations above, "``compiler -help``" emits this:
+
+::
+
+ USAGE: compiler [options] <input file>
+
+ OPTIONS:
+ -f - Enable binary output on terminals
+    -o <filename> - Specify output filename
+ -quiet - Don't print informational messages
+ -help - display available options (-help-hidden for more)
+
+and "``compiler -help-hidden``" prints this:
+
+::
+
+ USAGE: compiler [options] <input file>
+
+ OPTIONS:
+ -f - Enable binary output on terminals
+    -o <filename> - Specify output filename
+ -q - Don't print informational messages
+ -quiet - Don't print informational messages
+ -help - display available options (-help-hidden for more)
+
+This brief example has shown you how to use the '`cl::opt`_' class to parse
+simple scalar command line arguments. In addition to simple scalar arguments,
+the CommandLine library also provides primitives to support CommandLine option
+`aliases`_, and `lists`_ of options.
+
+.. _aliases:
+
+Argument Aliases
+----------------
+
+So far, the example works well, except for the fact that we need to check the
+quiet condition like this now:
+
+.. code-block:: c++
+
+ ...
+ if (!Quiet && !Quiet2) printInformationalMessage(...);
+ ...
+
+... which is a real pain! Instead of defining two values for the same
+condition, we can use the "`cl::alias`_" class to make the "``-q``" option an
+**alias** for the "``-quiet``" option, instead of providing a value itself:
+
+.. code-block:: c++
+
+  cl::opt<bool> Force ("f", cl::desc("Enable binary output on terminals"));
+ cl::opt<bool> Quiet ("quiet", cl::desc("Don't print informational messages"));
+ cl::alias QuietA("q", cl::desc("Alias for -quiet"), cl::aliasopt(Quiet));
+
+The third line (which is the only one we modified from above) defines a "``-q``"
+alias that updates the "``Quiet``" variable (as specified by the `cl::aliasopt`_
+modifier) whenever it is specified. Because aliases do not hold state, the only
+thing the program has to query is the ``Quiet`` variable now. Another nice
+feature of aliases is that they automatically hide themselves from the ``-help``
+output (although, again, they are still visible in the ``-help-hidden`` output).
+
+Now the application code can simply use:
+
+.. code-block:: c++
+
+ ...
+ if (!Quiet) printInformationalMessage(...);
+ ...
+
+... which is much nicer! The "`cl::alias`_" can be used to specify an
+alternative name for any variable type, and has many uses.
+
+.. _unnamed alternatives using the generic parser:
+
+Selecting an alternative from a set of possibilities
+----------------------------------------------------
+
+So far we have seen how the CommandLine library handles builtin types like
+``std::string``, ``bool`` and ``int``, but how does it handle things it doesn't
+know about, like enums or '``int*``'s?
+
+The answer is that it uses a table-driven generic parser (unless you specify
+your own parser, as described in the `Extension Guide`_). This parser maps
+literal strings to whatever type is required, and requires you to tell it what
+this mapping should be.
+
+Let's say that we would like to add four optimization levels to our optimizer,
+using the standard flags "``-g``", "``-O1``", "``-O2``", and "``-O3``". We
+could easily implement this with boolean options like above, but there are
+several problems with this strategy:
+
+#. A user could specify more than one of the options at a time, for example,
+ "``compiler -O3 -O2``". The CommandLine library would not be able to catch
+ this erroneous input for us.
+
+#. We would have to test 4 different variables to see which ones are set.
+
+#. This doesn't map to the numeric levels that we want... so we cannot easily
+ see if some level >= "``-O1``" is enabled.
+
+To cope with these problems, we can use an enum value, and have the CommandLine
+library fill it in with the appropriate level directly, which is used like this:
+
+.. code-block:: c++
+
+ enum OptLevel {
+ g, O1, O2, O3
+ };
+
+ cl::opt<OptLevel> OptimizationLevel(cl::desc("Choose optimization level:"),
+ cl::values(
+ clEnumVal(g , "No optimizations, enable debugging"),
+ clEnumVal(O1, "Enable trivial optimizations"),
+ clEnumVal(O2, "Enable default optimizations"),
+ clEnumVal(O3, "Enable expensive optimizations"),
+ clEnumValEnd));
+
+ ...
+ if (OptimizationLevel >= O2) doPartialRedundancyElimination(...);
+ ...
+
+This declaration defines a variable "``OptimizationLevel``" of the
+"``OptLevel``" enum type. This variable can be assigned any of the values that
+are listed in the declaration (Note that the declaration list must be terminated
+with the "``clEnumValEnd``" argument!). The CommandLine library enforces that
+the user can only specify one of the options, and it ensures that only valid
+enum values can be specified. The "``clEnumVal``" macros ensure that the
+command line arguments match the enum values. With this option added, our help output
+now is:
+
+::
+
+ USAGE: compiler [options] <input file>
+
+ OPTIONS:
+ Choose optimization level:
+ -g - No optimizations, enable debugging
+ -O1 - Enable trivial optimizations
+ -O2 - Enable default optimizations
+ -O3 - Enable expensive optimizations
+ -f - Enable binary output on terminals
+ -help - display available options (-help-hidden for more)
+ -o <filename> - Specify output filename
+ -quiet - Don't print informational messages
+
+In this case, it is sort of awkward that flag names correspond directly to enum
+names, because we probably don't want an enum definition named "``g``" in our
+program. Because of this, we can alternatively write this example like this:
+
+.. code-block:: c++
+
+ enum OptLevel {
+ Debug, O1, O2, O3
+ };
+
+ cl::opt<OptLevel> OptimizationLevel(cl::desc("Choose optimization level:"),
+ cl::values(
+ clEnumValN(Debug, "g", "No optimizations, enable debugging"),
+ clEnumVal(O1 , "Enable trivial optimizations"),
+ clEnumVal(O2 , "Enable default optimizations"),
+ clEnumVal(O3 , "Enable expensive optimizations"),
+ clEnumValEnd));
+
+ ...
+ if (OptimizationLevel == Debug) outputDebugInfo(...);
+ ...
+
+By using the "``clEnumValN``" macro instead of "``clEnumVal``", we can directly
+specify the name that the flag should get. In general a direct mapping is nice,
+but sometimes you can't or don't want to preserve the mapping, which is when you
+would use it.
+
+Named Alternatives
+------------------
+
+Another useful argument form is a named alternative style. We shall use this
+style in our compiler to specify different debug levels that can be used.
+Instead of each debug level being its own switch, we want to support the
+following options, of which only one can be specified at a time:
+"``--debug-level=none``", "``--debug-level=quick``",
+"``--debug-level=detailed``". To do this, we use the exact same format as our
+optimization level flags, but we also specify an option name. For this case,
+the code looks like this:
+
+.. code-block:: c++
+
+ enum DebugLev {
+ nodebuginfo, quick, detailed
+ };
+
+ // Enable Debug Options to be specified on the command line
+ cl::opt<DebugLev> DebugLevel("debug_level", cl::desc("Set the debugging level:"),
+ cl::values(
+ clEnumValN(nodebuginfo, "none", "disable debug information"),
+ clEnumVal(quick, "enable quick debug information"),
+ clEnumVal(detailed, "enable detailed debug information"),
+ clEnumValEnd));
+
+This definition defines an enumerated command line variable of type "``enum
+DebugLev``", which works exactly the same way as before. The difference here is
+just the interface exposed to the user of your program and the help output by
+the "``-help``" option:
+
+::
+
+ USAGE: compiler [options] <input file>
+
+ OPTIONS:
+ Choose optimization level:
+ -g - No optimizations, enable debugging
+ -O1 - Enable trivial optimizations
+ -O2 - Enable default optimizations
+ -O3 - Enable expensive optimizations
+ -debug_level - Set the debugging level:
+ =none - disable debug information
+ =quick - enable quick debug information
+ =detailed - enable detailed debug information
+ -f - Enable binary output on terminals
+ -help - display available options (-help-hidden for more)
+ -o <filename> - Specify output filename
+ -quiet - Don't print informational messages
+
+Again, the only structural difference between the debug level declaration and
+the optimization level declaration is that the debug level declaration includes
+an option name (``"debug_level"``), which automatically changes how the library
+processes the argument. The CommandLine library supports both forms so that you
+can choose the form most appropriate for your application.
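+
+Just as before, the parsed value is available directly through the variable;
+for example (``outputDetailedDebugInfo`` is a hypothetical helper, not part of
+the library):
+
+.. code-block:: c++
+
+  if (DebugLevel >= detailed) outputDetailedDebugInfo(...);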
+
+.. _lists:
+
+Parsing a list of options
+-------------------------
+
+Now that we have the standard run-of-the-mill argument types out of the way,
+let's get a little wild and crazy. Let's say that we want our optimizer to
+accept a **list** of optimizations to perform, allowing duplicates. For
+example, we
+might want to run: "``compiler -dce -constprop -inline -dce -strip``". In this
+case, the order of the arguments and the number of appearances is very
+important. This is what the "``cl::list``" template is for. First, start by
+defining an enum of the optimizations that you would like to perform:
+
+.. code-block:: c++
+
+ enum Opts {
+ // 'inline' is a C++ keyword, so name it 'inlining'
+ dce, constprop, inlining, strip
+ };
+
+Then define your "``cl::list``" variable:
+
+.. code-block:: c++
+
+ cl::list<Opts> OptimizationList(cl::desc("Available Optimizations:"),
+ cl::values(
+ clEnumVal(dce , "Dead Code Elimination"),
+ clEnumVal(constprop , "Constant Propagation"),
+ clEnumValN(inlining, "inline", "Procedure Integration"),
+ clEnumVal(strip , "Strip Symbols"),
+ clEnumValEnd));
+
+This defines a variable that is conceptually of the type
+"``std::vector<enum Opts>``". Thus, you can access it with standard vector
+methods:
+
+.. code-block:: c++
+
+ for (unsigned i = 0; i != OptimizationList.size(); ++i)
+ switch (OptimizationList[i])
+ ...
+
+... to iterate through the list of options specified.
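+
+For instance, a fleshed-out version of that loop might look like the following
+sketch, where the ``run*`` pass entry points are hypothetical placeholders
+rather than part of the CommandLine library:
+
+.. code-block:: c++
+
+  for (unsigned i = 0; i != OptimizationList.size(); ++i) {
+    switch (OptimizationList[i]) {
+    case dce:       runDeadCodeElimination();  break;  // hypothetical pass
+    case constprop: runConstantPropagation();  break;  // hypothetical pass
+    case inlining:  runProcedureIntegration(); break;  // hypothetical pass
+    case strip:     runStripSymbols();         break;  // hypothetical pass
+    }
+  }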
+
+Note that the "``cl::list``" template is completely general and may be used with
+any data types or other arguments that you can use with the "``cl::opt``"
+template. One especially useful way to use a list is to capture all of the
+positional arguments together if there may be more than one specified. In the
+case of a linker, for example, the linker takes several '``.o``' files, and
+needs to capture them into a list. This is naturally specified as:
+
+.. code-block:: c++
+
+ ...
+ cl::list<std::string> InputFilenames(cl::Positional, cl::desc("<Input files>"), cl::OneOrMore);
+ ...
+
+This variable works just like a "``vector<string>``" object. As such, accessing
+the list is simple, just like above. In this example, we used the
+`cl::OneOrMore`_ modifier to inform the CommandLine library that it is an error
+if the user does not specify any ``.o`` files on our command line. Again, this
+just reduces the amount of checking we have to do.
+
+Collecting options as a set of flags
+------------------------------------
+
+Instead of collecting sets of options in a list, it is also possible to gather
+information for enum values in a **bit vector**. The representation used by the
+`cl::bits`_ class is an ``unsigned`` integer. An enum value is represented by a
+0/1 in the enum's ordinal value bit position, with 1 indicating that the enum
+was specified and 0 otherwise. As each specified value is parsed, the
+resulting enum's
+bit is set in the option's bit vector:
+
+.. code-block:: c++
+
+ bits |= 1 << (unsigned)enum;
+
+Options that are specified multiple times are redundant. Any instances after
+the first are discarded.
+
+Reworking the above list example, we could replace `cl::list`_ with `cl::bits`_:
+
+.. code-block:: c++
+
+ cl::bits<Opts> OptimizationBits(cl::desc("Available Optimizations:"),
+ cl::values(
+ clEnumVal(dce , "Dead Code Elimination"),
+ clEnumVal(constprop , "Constant Propagation"),
+ clEnumValN(inlining, "inline", "Procedure Integration"),
+ clEnumVal(strip , "Strip Symbols"),
+ clEnumValEnd));
+
+To test whether ``constprop`` was specified, we can use the ``cl::bits::isSet``
+function:
+
+.. code-block:: c++
+
+ if (OptimizationBits.isSet(constprop)) {
+ ...
+ }
+
+It's also possible to get the raw bit vector using the ``cl::bits::getBits``
+function:
+
+.. code-block:: c++
+
+ unsigned bits = OptimizationBits.getBits();
+
+Finally, if external storage is used, then the location specified must be of
+**type** ``unsigned``. In all other ways a `cl::bits`_ option is equivalent to a
+`cl::list`_ option.
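+
+For example, a sketch of a `cl::bits`_ option using external storage (reusing
+the ``Opts`` enum and the same ``cl::values`` list as above) might look like
+this:
+
+.. code-block:: c++
+
+  unsigned OptBits;  // the external bit vector
+  static cl::bits<Opts, unsigned>
+  OptimizationBits(cl::desc("Available Optimizations:"),
+                   cl::location(OptBits),
+                   cl::values(...));  // same value list as above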
+
+.. _additional extra text:
+
+Adding freeform text to help output
+-----------------------------------
+
+As our program grows and becomes more mature, we may decide to put summary
+information about what it does into the help output. The help output is styled
+to look similar to a Unix ``man`` page, providing concise information about a
+program. Unix ``man`` pages, however, often have a description of what the
+program does. To add this to your CommandLine program, simply pass a third
+argument to the `cl::ParseCommandLineOptions`_ call in main. This additional
+argument is then printed as the overview information for your program, allowing
+you to include any additional information that you want. For example:
+
+.. code-block:: c++
+
+ int main(int argc, char **argv) {
+ cl::ParseCommandLineOptions(argc, argv, " CommandLine compiler example\n\n"
+ " This program blah blah blah...\n");
+ ...
+ }
+
+would yield the help output:
+
+::
+
+  OVERVIEW: CommandLine compiler example
+
+  This program blah blah blah...
+
+ USAGE: compiler [options] <input file>
+
+ OPTIONS:
+ ...
+ -help - display available options (-help-hidden for more)
+ -o <filename> - Specify output filename
+
+.. _Reference Guide:
+
+Reference Guide
+===============
+
+Now that you know the basics of how to use the CommandLine library, this section
+will give you the detailed information you need to tune how command line options
+work, as well as information on more "advanced" command line option processing
+capabilities.
+
+.. _positional:
+.. _positional argument:
+.. _Positional Arguments:
+.. _Positional arguments section:
+.. _positional options:
+
+Positional Arguments
+--------------------
+
+Positional arguments are those arguments that are not named, and are not
+specified with a hyphen. Positional arguments should be used when an option is
+specified by its position alone. For example, the standard Unix ``grep`` tool
+takes a regular expression argument, and an optional filename to search through
+(which defaults to standard input if a filename is not specified). Using the
+CommandLine library, this would be specified as:
+
+.. code-block:: c++
+
+ cl::opt<string> Regex (cl::Positional, cl::desc("<regular expression>"), cl::Required);
+ cl::opt<string> Filename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+Given these two option declarations, the ``-help`` output for our grep
+replacement would look like this:
+
+::
+
+ USAGE: spiffygrep [options] <regular expression> <input file>
+
+ OPTIONS:
+ -help - display available options (-help-hidden for more)
+
+... and the resultant program could be used just like the standard ``grep``
+tool.
+
+Positional arguments are sorted by their order of construction. This means that
+command line options will be ordered according to how they are listed in a .cpp
+file, but will not have an ordering defined if the positional arguments are
+defined in multiple .cpp files. The fix for this problem is simply to define
+all of your positional arguments in one .cpp file.
+
+Specifying positional options with hyphens
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sometimes you may want to specify a value to your positional argument that
+starts with a hyphen (for example, searching for '``-foo``' in a file). At
+first, you will have trouble doing this, because the parser will try to find
+an argument named '``-foo``', and will fail (and single quotes will not save
+you). Note
+that the system ``grep`` has the same problem:
+
+::
+
+ $ spiffygrep '-foo' test.txt
+  Unknown command line argument '-foo'. Try: 'spiffygrep -help'
+
+ $ grep '-foo' test.txt
+ grep: illegal option -- f
+ grep: illegal option -- o
+ grep: illegal option -- o
+ Usage: grep -hblcnsviw pattern file . . .
+
+The solution for this problem is the same for both your tool and the system
+version: use the '``--``' marker. When the user specifies '``--``' on the
+command line, it is telling the program that all options after the '``--``'
+should be treated as positional arguments, not options. Thus, we can use it
+like this:
+
+::
+
+ $ spiffygrep -- -foo test.txt
+ ...output...
+
+Determining absolute position with getPosition()
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sometimes an option can affect or modify the meaning of another option. For
+example, consider ``gcc``'s ``-x LANG`` option. This tells ``gcc`` to ignore the
+suffix of subsequent positional arguments and force the file to be interpreted
+as if it contained source code in language ``LANG``. In order to handle this
+properly, you need to know the absolute position of each argument, especially
+those in lists, so their interaction(s) can be applied correctly. This is also
+useful for options like ``-llibname`` which is actually a positional argument
+that starts with a dash.
+
+So, generally, the problem is that you have two ``cl::list`` variables that
+interact in some way. To ensure the correct interaction, you can use the
+``cl::list::getPosition(optnum)`` method. This method returns the absolute
+position (as found on the command line) of the ``optnum`` item in the
+``cl::list``.
+
+The idiom for usage is like this:
+
+.. code-block:: c++
+
+ static cl::list<std::string> Files(cl::Positional, cl::OneOrMore);
+ static cl::list<std::string> Libraries("l", cl::ZeroOrMore);
+
+ int main(int argc, char**argv) {
+ // ...
+ std::vector<std::string>::iterator fileIt = Files.begin();
+ std::vector<std::string>::iterator libIt = Libraries.begin();
+ unsigned libPos = 0, filePos = 0;
+ while ( 1 ) {
+ if ( libIt != Libraries.end() )
+ libPos = Libraries.getPosition( libIt - Libraries.begin() );
+ else
+ libPos = 0;
+ if ( fileIt != Files.end() )
+ filePos = Files.getPosition( fileIt - Files.begin() );
+ else
+ filePos = 0;
+
+ if ( filePos != 0 && (libPos == 0 || filePos < libPos) ) {
+ // Source File Is next
+ ++fileIt;
+ }
+ else if ( libPos != 0 && (filePos == 0 || libPos < filePos) ) {
+ // Library is next
+ ++libIt;
+ }
+ else
+ break; // we're done with the list
+ }
+ }
+
+Note that, for compatibility reasons, the ``cl::opt`` also supports an
+``unsigned getPosition()`` option that will provide the absolute position of
+that option. You can apply the same approach as above with a ``cl::opt`` and a
+``cl::list`` option as you can with two lists.
+
+.. _interpreter style options:
+.. _cl::ConsumeAfter:
+.. _this section for more information:
+
+The ``cl::ConsumeAfter`` modifier
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::ConsumeAfter`` `formatting option`_ is used to construct programs that
+use "interpreter style" option processing. With this style of option
+processing, all arguments specified after the last positional argument are
+treated as special interpreter arguments that are not interpreted by the
+command line library.
+
+As a concrete example, let's say we are developing a replacement for the standard
+Unix Bourne shell (``/bin/sh``). To run ``/bin/sh``, first you specify options
+to the shell itself (like ``-x`` which turns on trace output), then you specify
+the name of the script to run, then you specify arguments to the script. These
+arguments to the script are parsed by the Bourne shell command line option
+processor, but are not interpreted as options to the shell itself. Using the
+CommandLine library, we would specify this as:
+
+.. code-block:: c++
+
+ cl::opt<string> Script(cl::Positional, cl::desc("<input script>"), cl::init("-"));
+ cl::list<string> Argv(cl::ConsumeAfter, cl::desc("<program arguments>..."));
+ cl::opt<bool> Trace("x", cl::desc("Enable trace output"));
+
+which automatically provides the help output:
+
+::
+
+ USAGE: spiffysh [options] <input script> <program arguments>...
+
+ OPTIONS:
+ -help - display available options (-help-hidden for more)
+ -x - Enable trace output
+
+At runtime, if we run our new shell replacement as '``spiffysh -x test.sh -a -x
+-y bar``', the ``Trace`` variable will be set to true, the ``Script`` variable
+will be set to "``test.sh``", and the ``Argv`` list will contain ``["-a", "-x",
+"-y", "bar"]``, because they were specified after the last positional argument
+(which is the script name).
+
+There are several limitations to when ``cl::ConsumeAfter`` options can be
+specified. For example, only one ``cl::ConsumeAfter`` can be specified per
+program, there must be at least one `positional argument`_ specified, there must
+not be any `cl::list`_ positional arguments, and the ``cl::ConsumeAfter`` option
+should be a `cl::list`_ option.
+
+.. _can be changed:
+.. _Internal vs External Storage:
+
+Internal vs External Storage
+----------------------------
+
+By default, all command line options automatically hold the value that they
+parse from the command line. This is very convenient in the common case,
+especially when combined with the ability to define command line options in the
+files that use them. This is called the internal storage model.
+
+Sometimes, however, it is nice to separate the command line option processing
+code from the storage of the value parsed. For example, let's say that we have a
+'``-debug``' option that we would like to use to enable debug information across
+the entire body of our program. In this case, the boolean value controlling the
+debug code should be globally accessible (in a header file, for example) yet the
+command line option processing code should not be exposed to all of these
+clients (requiring lots of .cpp files to ``#include CommandLine.h``).
+
+To do this, first set up your ``.h`` file to declare the flag, for example:
+
+.. code-block:: c++
+
+ // DebugFlag.h - Get access to the '-debug' command line option
+ //
+
+ // DebugFlag - This boolean is set to true if the '-debug' command line option
+ // is specified. This should probably not be referenced directly, instead, use
+ // the DEBUG macro below.
+ //
+ extern bool DebugFlag;
+
+ // DEBUG macro - This macro should be used by code to emit debug information.
+  // If the '-debug' option is specified on the command line, and if this is a
+ // debug build, then the code specified as the option to the macro will be
+ // executed. Otherwise it will not be.
+ #ifdef NDEBUG
+ #define DEBUG(X)
+ #else
+ #define DEBUG(X) do { if (DebugFlag) { X; } } while (0)
+ #endif
+
+This allows clients to blissfully use the ``DEBUG()`` macro, or the
+``DebugFlag`` explicitly if they want to. Now we just need to be able to set
+the ``DebugFlag`` boolean when the option is set. To do this, we pass an
+additional template argument to our command line option declaration, and we
+specify where to fill in the value with the `cl::location`_ attribute:
+
+.. code-block:: c++
+
+ bool DebugFlag; // the actual value
+ static cl::opt<bool, true> // The parser
+ Debug("debug", cl::desc("Enable debug output"), cl::Hidden, cl::location(DebugFlag));
+
+In the above example, we specify "``true``" as the second argument to the
+`cl::opt`_ template, indicating that the template should not maintain a copy of
+the value itself. In addition to this, we specify the `cl::location`_
+attribute, so that ``DebugFlag`` is automatically set.
+
+Option Attributes
+-----------------
+
+This section describes the basic attributes that you can specify on options.
+
+* The option name attribute (which is required for all options, except
+  `positional options`_) specifies what the option name is. The name is
+  specified as a simple string in double quotes:
+
+ .. code-block:: c++
+
+     cl::opt<bool> Quiet("quiet");
+
+.. _cl::desc(...):
+
+* The **cl::desc** attribute specifies a description for the option to be
+ shown in the ``-help`` output for the program.
+
+.. _cl::value_desc:
+
+* The **cl::value_desc** attribute specifies a string that can be used to
+ fine tune the ``-help`` output for a command line option. Look `here`_ for an
+ example.
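+
+  For example, the ``-o`` option from the beginning of this manual uses
+  ``cl::value_desc`` so that the help output reads "``-o <filename>``" instead
+  of just "``-o``":
+
+  .. code-block:: c++
+
+    cl::opt<string> OutputFilename("o", cl::desc("Specify output filename"),
+                                   cl::value_desc("filename"));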
+
+.. _cl::init:
+
+* The **cl::init** attribute specifies an initial value for a `scalar`_
+ option. If this attribute is not specified then the command line option value
+ defaults to the value created by the default constructor for the
+ type.
+
+ .. warning::
+
+ If you specify both **cl::init** and **cl::location** for an option, you
+ must specify **cl::location** first, so that when the command-line parser
+ sees **cl::init**, it knows where to put the initial value. (You will get an
+ error at runtime if you don't put them in the right order.)
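+
+  For example, a sketch that obeys this ordering might look like this:
+
+  .. code-block:: c++
+
+    bool DebugFlag;  // the external storage
+    static cl::opt<bool, true>
+    Debug("debug", cl::desc("Enable debug output"),
+          cl::location(DebugFlag), cl::init(false));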
+
+.. _cl::location:
+
+* The **cl::location** attribute specifies where to store the value for a
+  parsed command line option if using external storage. See the section on
+  `Internal vs
+ External Storage`_ for more information.
+
+.. _cl::aliasopt:
+
+* The **cl::aliasopt** attribute specifies which option a `cl::alias`_ option is
+ an alias for.
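+
+  For example, a sketch of a short alias for a longer option might look like
+  this:
+
+  .. code-block:: c++
+
+    static cl::opt<bool> Quiet("quiet", cl::desc("Don't print messages"));
+    static cl::alias QuietA("q", cl::desc("Alias for -quiet"),
+                            cl::aliasopt(Quiet));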
+
+.. _cl::values:
+
+* The **cl::values** attribute specifies the string-to-value mapping to be used
+ by the generic parser. It takes a **clEnumValEnd terminated** list of
+ (option, value, description) triplets that specify the option name, the value
+ mapped to, and the description shown in the ``-help`` for the tool. Because
+ the generic parser is used most frequently with enum values, two macros are
+ often useful:
+
+ #. The **clEnumVal** macro is used as a nice simple way to specify a triplet
+ for an enum. This macro automatically makes the option name be the same as
+ the enum name. The first option to the macro is the enum, the second is
+ the description for the command line option.
+
+  #. The **clEnumValN** macro is used to specify options where the option
+     name doesn't equal the enum name. For this macro, the first argument is
+     the enum value, the second is the flag name, and the third is the
+     description.
+
+  You will get a compile-time error if you try to use ``cl::values`` with a
+  parser that does not support it.
+
+.. _cl::multi_val:
+
+* The **cl::multi_val** attribute specifies that this option takes multiple
+  values (example: ``-sectalign segname sectname sectvalue``). This attribute
+  takes one unsigned argument - the number of values for the option. This
+  attribute is valid only on ``cl::list`` options (and will fail with a
+  compile-time error if you try to use it with other option types). It is
+  allowed to use all of the usual modifiers on multi-valued options (besides
+  ``cl::ValueDisallowed``, obviously).
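+
+  As a sketch, the ``-sectalign`` example above might be declared like this:
+
+  .. code-block:: c++
+
+    static cl::list<std::string>
+    SectAlign("sectalign", cl::desc("Set section alignment"),
+              cl::multi_val(3), cl::value_desc("segname sectname value"));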
+
+Option Modifiers
+----------------
+
+Option modifiers are the flags and expressions that you pass into the
+constructors for `cl::opt`_ and `cl::list`_. These modifiers give you the
+ability to tweak how options are parsed and how ``-help`` output is generated to
+fit your application well.
+
+These options fall into five main categories:
+
+#. Hiding an option from ``-help`` output
+
+#. Controlling the number of occurrences required and allowed
+
+#. Controlling whether or not a value must be specified
+
+#. Controlling other formatting options
+
+#. Miscellaneous option modifiers
+
+It is not possible to specify two modifiers from the same category to a single
+option (you'll get a runtime error), except for modifiers in the miscellaneous
+category. The CommandLine library specifies defaults for all of these settings
+that are the most useful in practice and the most common, which means that you
+usually shouldn't have to worry about these.
+
+Hiding an option from ``-help`` output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::NotHidden``, ``cl::Hidden``, and ``cl::ReallyHidden`` modifiers are
+used to control whether or not an option appears in the ``-help`` and
+``-help-hidden`` output for the compiled program:
+
+.. _cl::NotHidden:
+
+* The **cl::NotHidden** modifier (which is the default for `cl::opt`_ and
+ `cl::list`_ options) indicates the option is to appear in both help
+ listings.
+
+.. _cl::Hidden:
+
+* The **cl::Hidden** modifier (which is the default for `cl::alias`_ options)
+ indicates that the option should not appear in the ``-help`` output, but
+ should appear in the ``-help-hidden`` output.
+
+.. _cl::ReallyHidden:
+
+* The **cl::ReallyHidden** modifier indicates that the option should not appear
+ in any help output.
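+
+For example, a sketch of an option shown only by ``-help-hidden`` might look
+like this (the option name is invented for illustration):
+
+.. code-block:: c++
+
+  static cl::opt<bool> DebugParser("debug-parser",
+                                   cl::desc("Print parser debug output"),
+                                   cl::Hidden);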
+
+Controlling the number of occurrences required and allowed
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This group of options is used to control how many times an option is allowed (or
+required) to be specified on the command line of your program. Specifying a
+value for this setting allows the CommandLine library to do error checking for
+you.
+
+The allowed values for this option group are:
+
+.. _cl::Optional:
+
+* The **cl::Optional** modifier (which is the default for the `cl::opt`_ and
+ `cl::alias`_ classes) indicates that your program will allow either zero or
+ one occurrence of the option to be specified.
+
+.. _cl::ZeroOrMore:
+
+* The **cl::ZeroOrMore** modifier (which is the default for the `cl::list`_
+ class) indicates that your program will allow the option to be specified zero
+ or more times.
+
+.. _cl::Required:
+
+* The **cl::Required** modifier indicates that the specified option must be
+ specified exactly one time.
+
+.. _cl::OneOrMore:
+
+* The **cl::OneOrMore** modifier indicates that the option must be specified at
+ least one time.
+
+* The **cl::ConsumeAfter** modifier is described in the `Positional arguments
+ section`_.
+
+If an option is not specified, then the value of the option is equal to the
+value specified by the `cl::init`_ attribute. If the ``cl::init`` attribute is
+not specified, the option value is initialized with the default constructor for
+the data type.
+
+If an option of the `cl::opt`_ class is specified multiple times, only the
+last value will be retained.
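+
+As an illustrative sketch, these modifiers are simply passed to the option
+constructors:
+
+.. code-block:: c++
+
+  // Exactly one input file must be specified.
+  static cl::opt<std::string> Input(cl::Positional, cl::desc("<input file>"),
+                                    cl::Required);
+
+  // The -I option may be given any number of times.
+  static cl::list<std::string> IncludeDirs("I", cl::desc("Include search path"),
+                                           cl::ZeroOrMore);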
+
+Controlling whether or not a value must be specified
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This group of options is used to control whether or not the option allows a
+value to be present. In the case of the CommandLine library, a value is either
+specified with an equal sign (e.g. '``-index-depth=17``') or as a trailing
+string (e.g. '``-o a.out``').
+
+The allowed values for this option group are:
+
+.. _cl::ValueOptional:
+
+* The **cl::ValueOptional** modifier (which is the default for ``bool`` typed
+ options) specifies that it is acceptable to have a value, or not. A boolean
+ argument can be enabled just by appearing on the command line, or it can have
+ an explicit '``-foo=true``'. If an option is specified with this mode, it is
+ illegal for the value to be provided without the equal sign. Therefore
+ '``-foo true``' is illegal. To get this behavior, you must use
+ the `cl::ValueRequired`_ modifier.
+
+.. _cl::ValueRequired:
+
+* The **cl::ValueRequired** modifier (which is the default for all other types
+ except for `unnamed alternatives using the generic parser`_) specifies that a
+  value must be provided. This mode informs the command line library that if
+  an option is not provided with an equal sign, the next argument provided
+  must be the value. This allows things like '``-o a.out``' to work.
+
+.. _cl::ValueDisallowed:
+
+* The **cl::ValueDisallowed** modifier (which is the default for `unnamed
+ alternatives using the generic parser`_) indicates that it is a runtime error
+ for the user to specify a value. This can be provided to disallow users from
+ providing options to boolean options (like '``-foo=true``').
+
+In general, the default values for this option group work just like you would
+want them to. As mentioned above, you can specify the `cl::ValueDisallowed`_
+modifier to a boolean argument to restrict your command line parser. These
+options are mostly useful when `extending the library`_.
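+
+For example, a sketch contrasting the two common modes might look like this
+(option names invented for illustration):
+
+.. code-block:: c++
+
+  // bool defaults to cl::ValueOptional: "-quick" and "-quick=true" both work.
+  static cl::opt<bool> Quick("quick", cl::desc("Use the quick method"));
+
+  // string defaults to cl::ValueRequired: "-o a.out" and "-o=a.out" both work.
+  static cl::opt<std::string> Output("o", cl::desc("Output file"),
+                                     cl::value_desc("filename"));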
+
+.. _formatting option:
+
+Controlling other formatting options
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The formatting option group is used to specify that the command line option has
+special abilities and is otherwise different from other command line arguments.
+As usual, you can only specify one of these arguments at most.
+
+.. _cl::NormalFormatting:
+
+* The **cl::NormalFormatting** modifier (which is the default for all options)
+ specifies that this option is "normal".
+
+.. _cl::Positional:
+
+* The **cl::Positional** modifier specifies that this is a positional argument
+ that does not have a command line option associated with it. See the
+ `Positional Arguments`_ section for more information.
+
+* The **cl::ConsumeAfter** modifier specifies that this option is used to
+ capture "interpreter style" arguments. See `this section for more
+ information`_.
+
+.. _prefix:
+.. _cl::Prefix:
+
+* The **cl::Prefix** modifier specifies that this option prefixes its value.
+ With 'Prefix' options, the equal sign does not separate the value from the
+ option name specified. Instead, the value is everything after the prefix,
+ including any equal sign if present. This is useful for processing odd
+ arguments like ``-lmalloc`` and ``-L/usr/lib`` in a linker tool or
+ ``-DNAME=value`` in a compiler tool. Here, the '``l``', '``D``' and '``L``'
+ options are normal string (or list) options, that have the **cl::Prefix**
+ modifier added to allow the CommandLine library to recognize them. Note that
+ **cl::Prefix** options must not have the **cl::ValueDisallowed** modifier
+ specified.
+
+.. _grouping:
+.. _cl::Grouping:
+
+* The **cl::Grouping** modifier is used to implement Unix-style tools (like
+ ``ls``) that have lots of single letter arguments, but only require a single
+ dash. For example, the '``ls -labF``' command actually enables four different
+ options, all of which are single letters. Note that **cl::Grouping** options
+ cannot have values.
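+
+For example, sketches of a prefix option and of grouped single-letter flags
+might look like this (the option names are invented for illustration):
+
+.. code-block:: c++
+
+  // "-DNAME=value": 'D' prefixes its value (cl::Prefix).
+  static cl::list<std::string> Defines("D", cl::Prefix,
+                                       cl::desc("Define a macro"));
+
+  // "ls"-style flags: "-la" enables both options (cl::Grouping).
+  static cl::opt<bool> LongListing("l", cl::Grouping, cl::desc("Long listing"));
+  static cl::opt<bool> ShowAll("a", cl::Grouping, cl::desc("Show all files"));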
+
+The CommandLine library does not restrict how you use the **cl::Prefix** or
+**cl::Grouping** modifiers, but it is possible to specify ambiguous argument
+settings. Thus, it is possible to have multiple letter options that are prefix
+or grouping options, and they will still work as designed.
+
+To do this, the CommandLine library uses a greedy algorithm to parse the input
+option into (potentially multiple) prefix and grouping options. The strategy
+basically looks like this:
+
+::
+
+ parse(string OrigInput) {
+
+ 1. string input = OrigInput;
+ 2. if (isOption(input)) return getOption(input).parse(); // Normal option
+ 3. while (!isOption(input) && !input.empty()) input.pop_back(); // Remove the last letter
+ 4. if (input.empty()) return error(); // No matching option
+ 5. if (getOption(input).isPrefix())
+ return getOption(input).parse(input);
+ 6. while (!input.empty()) { // Must be grouping options
+ getOption(input).parse();
+ OrigInput.erase(OrigInput.begin(), OrigInput.begin()+input.length());
+ input = OrigInput;
+ while (!isOption(input) && !input.empty()) input.pop_back();
+ }
+ 7. if (!OrigInput.empty()) error();
+
+ }
+
+Miscellaneous option modifiers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The miscellaneous option modifiers are the only flags where you can specify more
+than one flag from the set: they are not mutually exclusive. These flags
+specify boolean properties that modify the option.
+
+.. _cl::CommaSeparated:
+
+* The **cl::CommaSeparated** modifier indicates that any commas specified for an
+ option's value should be used to split the value up into multiple values for
+ the option. For example, these two options are equivalent when
+ ``cl::CommaSeparated`` is specified: "``-foo=a -foo=b -foo=c``" and
+ "``-foo=a,b,c``". This option only makes sense to be used in a case where the
+ option is allowed to accept one or more values (i.e. it is a `cl::list`_
+ option).
+
+.. _cl::PositionalEatsArgs:
+
+* The **cl::PositionalEatsArgs** modifier (which only applies to positional
+ arguments, and only makes sense for lists) indicates that positional argument
+ should consume any strings after it (including strings that start with a "-")
+ up until another recognized positional argument. For example, if you have two
+ "eating" positional arguments, "``pos1``" and "``pos2``", the string "``-pos1
+  -foo -bar baz -pos2 -bork``" would cause the "``-foo -bar baz``" strings to
+ be applied to the "``-pos1``" option and the "``-bork``" string to be applied
+ to the "``-pos2``" option.
+
+.. _cl::Sink:
+
+* The **cl::Sink** modifier is used to handle unknown options. If there is at
+ least one option with ``cl::Sink`` modifier specified, the parser passes
+ unrecognized option strings to it as values instead of signaling an error. As
+ with ``cl::CommaSeparated``, this modifier only makes sense with a `cl::list`_
+ option.
+
+So far, these are the only three miscellaneous option modifiers.
+
+.. _response files:
+
+Response files
+^^^^^^^^^^^^^^
+
+Some systems, such as certain variants of Microsoft Windows and some older
+Unices, have a relatively low limit on command-line length. It is therefore
+customary to use the so-called 'response files' to circumvent this
+restriction. These files are mentioned on the command-line (using the "@file")
+syntax. The program reads these files and inserts the contents into argv,
+thereby working around the command-line length limits. Response files are
+enabled by an optional fourth argument to `cl::ParseEnvironmentOptions`_ and
+`cl::ParseCommandLineOptions`_.
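+
+For example, a sketch of enabling and using response files might look like
+this (file and program names invented for illustration):
+
+.. code-block:: c++
+
+  // Passing 'true' as the fourth argument enables @file expansion.
+  cl::ParseCommandLineOptions(argc, argv, " CommandLine compiler example\n",
+                              true);
+
+::
+
+  $ echo "-O2 -quiet input.c" > args.txt
+  $ compiler @args.txt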
+
+Top-Level Classes and Functions
+-------------------------------
+
+Despite all of the built-in flexibility, the CommandLine option library really
+only consists of one function (`cl::ParseCommandLineOptions`_) and three main
+classes: `cl::opt`_, `cl::list`_, and `cl::alias`_. This section describes
+these three classes in detail.
+
+.. _cl::ParseCommandLineOptions:
+
+The ``cl::ParseCommandLineOptions`` function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::ParseCommandLineOptions`` function is designed to be called directly
+from ``main``, and is used to fill in the values of all of the command line
+option variables once ``argc`` and ``argv`` are available.
+
+The ``cl::ParseCommandLineOptions`` function requires two parameters (``argc``
+and ``argv``), but may also take an optional third parameter which holds
+`additional extra text`_ to emit when the ``-help`` option is invoked, and a
+fourth boolean parameter that enables `response files`_.
+
+.. _cl::ParseEnvironmentOptions:
+
+The ``cl::ParseEnvironmentOptions`` function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::ParseEnvironmentOptions`` function has mostly the same effects as
+`cl::ParseCommandLineOptions`_, except that it is designed to take values for
+options from an environment variable, for those cases in which reading the
+command line is not convenient or desired. It fills in the values of all the
+command line option variables just like `cl::ParseCommandLineOptions`_ does.
+
+It takes four parameters: the name of the program (since ``argv`` may not be
+available, it can't just look in ``argv[0]``), the name of the environment
+variable to examine, the optional `additional extra text`_ to emit when the
+``-help`` option is invoked, and the boolean switch that controls whether
+`response files`_ should be read.
+
+``cl::ParseEnvironmentOptions`` will break the environment variable's value up
+into words and then process them using `cl::ParseCommandLineOptions`_.
+**Note:** Currently ``cl::ParseEnvironmentOptions`` does not support quoting, so
+an environment variable containing ``-option "foo bar"`` will be parsed as three
+words, ``-option``, ``"foo``, and ``bar"``, which is different from what you
+would get from the shell with the same input.
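+
+For example, a sketch of a typical call might look like this (the program and
+environment variable names are invented for illustration):
+
+.. code-block:: c++
+
+  int main(int argc, char **argv) {
+    // Read options from the COMPILER_FLAGS environment variable.
+    cl::ParseEnvironmentOptions("compiler", "COMPILER_FLAGS");
+    ...
+  }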
+
+The ``cl::SetVersionPrinter`` function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::SetVersionPrinter`` function is designed to be called directly from
+``main`` and *before* ``cl::ParseCommandLineOptions``. Its use is optional. It
+simply arranges for a function to be called in response to the ``--version``
+option instead of having the ``CommandLine`` library print out the usual version
+string for LLVM. This is useful for programs that are not part of LLVM but wish
+to use the ``CommandLine`` facilities. Such programs should just define a small
+function that takes no arguments and returns ``void`` and that prints out
+whatever version information is appropriate for the program. Pass the address of
+that function to ``cl::SetVersionPrinter`` to arrange for it to be called when
+the ``--version`` option is given by the user.
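+
+For example, a minimal sketch might look like this:
+
+.. code-block:: c++
+
+  static void PrintMyToolVersion() {
+    // Print whatever version information is appropriate for this program.
+    printf("mytool version 1.0\n");
+  }
+
+  int main(int argc, char **argv) {
+    cl::SetVersionPrinter(PrintMyToolVersion);
+    cl::ParseCommandLineOptions(argc, argv);
+    ...
+  }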
+
+.. _cl::opt:
+.. _scalar:
+
+The ``cl::opt`` class
+^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::opt`` class is the class used to represent scalar command line
+options, and is the one used most of the time. It is a templated class which
+can take up to three arguments (all but the first have default values):
+
+.. code-block:: c++
+
+ namespace cl {
+ template <class DataType, bool ExternalStorage = false,
+ class ParserClass = parser<DataType> >
+ class opt;
+ }
+
+The first template argument specifies what underlying data type the command line
+argument is, and is used to select a default parser implementation. The second
+template argument is used to specify whether the option should contain the
+storage for the option (the default) or whether external storage should be used
+to contain the value parsed for the option (see `Internal vs External Storage`_
+for more information).
+
+The third template argument specifies which parser to use. The default value
+selects an instantiation of the ``parser`` class based on the underlying data
+type of the option. In general, this default works well for most applications,
+so this option is only used when using a `custom parser`_.
+
+.. _lists of arguments:
+.. _cl::list:
+
+The ``cl::list`` class
+^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::list`` class is the class used to represent a list of command line
+options. It too is a templated class which can take up to three arguments:
+
+.. code-block:: c++
+
+ namespace cl {
+ template <class DataType, class Storage = bool,
+ class ParserClass = parser<DataType> >
+ class list;
+ }
+
+This class works exactly the same as the `cl::opt`_ class, except that the second
+argument is the **type** of the external storage, not a boolean value. For this
+class, the marker type '``bool``' is used to indicate that internal storage
+should be used.
+
+.. _cl::bits:
+
+The ``cl::bits`` class
+^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::bits`` class is the class used to represent a list of command line
+options in the form of a bit vector. It is also a templated class which can
+take up to three arguments:
+
+.. code-block:: c++
+
+ namespace cl {
+ template <class DataType, class Storage = bool,
+ class ParserClass = parser<DataType> >
+ class bits;
+ }
+
+This class works exactly the same as the `cl::list`_ class, except that the second
+argument must be of **type** ``unsigned`` if external storage is used.
+
+.. _cl::alias:
+
+The ``cl::alias`` class
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::alias`` class is a nontemplated class that is used to form aliases for
+other arguments.
+
+.. code-block:: c++
+
+ namespace cl {
+ class alias;
+ }
+
+The `cl::aliasopt`_ attribute should be used to specify which option this is an
+alias for. Alias arguments default to being `cl::Hidden`_, and use the aliased
+option's parser to do the conversion from string to data.
+
+.. _cl::extrahelp:
+
+The ``cl::extrahelp`` class
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cl::extrahelp`` class is a nontemplated class that allows extra help text
+to be printed out for the ``-help`` option.
+
+.. code-block:: c++
+
+ namespace cl {
+ struct extrahelp;
+ }
+
+To use the extrahelp, simply construct one with a ``const char*`` parameter to
+the constructor. The text passed to the constructor will be printed at the
+bottom of the help message, verbatim. Note that multiple ``cl::extrahelp``
+**can** be used, but this practice is discouraged. If your tool needs to print
+additional help information, put all that help into a single ``cl::extrahelp``
+instance.
+
+For example:
+
+.. code-block:: c++
+
+ cl::extrahelp("\nADDITIONAL HELP:\n\n This is the extra help\n");
+
+.. _different parser:
+.. _discussed previously:
+
+Builtin parsers
+---------------
+
+Parsers control how the string value taken from the command line is translated
+into a typed value, suitable for use in a C++ program. By default, the
+CommandLine library uses an instance of ``parser<type>`` if the command line
+option specifies that it uses values of type '``type``'. Because of this,
+custom option processing is specified with specializations of the '``parser``'
+class.
+
+The CommandLine library provides the following builtin parser specializations,
+which are sufficient for most applications. It can, however, also be extended to
+work with new data types and new ways of interpreting the same data. See the
+`Writing a Custom Parser`_ section for more details on this type of library extension.
+
+.. _enums:
+.. _cl::parser:
+
+* The generic ``parser<t>`` parser can be used to map string values to any data
+ type, through the use of the `cl::values`_ property, which specifies the
+ mapping information. The most common use of this parser is for parsing enum
+ values, which allows you to use the CommandLine library for all of the error
+ checking to make sure that only valid enum values are specified (as opposed to
+ accepting arbitrary strings). Despite this, however, the generic parser class
+ can be used for any data type.
+
+.. _boolean flags:
+.. _bool parser:
+
+* The **parser<bool> specialization** is used to convert boolean strings to a
+ boolean value. Currently accepted strings are "``true``", "``TRUE``",
+ "``True``", "``1``", "``false``", "``FALSE``", "``False``", and "``0``".
+
+* The **parser<boolOrDefault> specialization** is used for cases where the value
+ is boolean, but we also need to know whether the option was specified at all.
+  ``boolOrDefault`` is an enum with 3 values, ``BOU_UNSET``, ``BOU_TRUE`` and
+  ``BOU_FALSE``. This parser accepts the same strings as ``parser<bool>``.
+
+.. _strings:
+
+* The **parser<string> specialization** simply stores the parsed string into the
+ string value specified. No conversion or modification of the data is
+ performed.
+
+.. _integers:
+.. _int:
+
+* The **parser<int> specialization** uses the C ``strtol`` function to parse the
+ string input. As such, it will accept a decimal number (with an optional '+'
+ or '-' prefix) which must start with a non-zero digit. It accepts octal
+ numbers, which are identified with a '``0``' prefix digit, and hexadecimal
+ numbers with a prefix of '``0x``' or '``0X``'.
+
+.. _doubles:
+.. _float:
+.. _double:
+
+* The **parser<double>** and **parser<float> specializations** use the standard
+ C ``strtod`` function to convert floating point strings into floating point
+ values. As such, a broad range of string formats is supported, including
+  exponential notation (ex: ``1.7e15``); locales are supported as well.
+
+.. _Extension Guide:
+.. _extending the library:
+
+Extension Guide
+===============
+
+Although the CommandLine library has a lot of functionality built into it
+already (as discussed previously), one of its true strengths lies in its
+extensibility. This section discusses how the CommandLine library works under
+the covers and illustrates how to do some simple, common extensions.
+
+.. _Custom parsers:
+.. _custom parser:
+.. _Writing a Custom Parser:
+
+Writing a custom parser
+-----------------------
+
+One of the simplest and most common extensions is the use of a custom parser.
+As `discussed previously`_, parsers are the portion of the CommandLine library
+that turns string input from the user into a particular parsed data type,
+validating the input in the process.
+
+There are two ways to use a new parser:
+
+#. Specialize the `cl::parser`_ template for your custom data type.
+
+ This approach has the advantage that users of your custom data type will
+ automatically use your custom parser whenever they define an option with a
+ value type of your data type. The disadvantage of this approach is that it
+ doesn't work if your fundamental data type is something that is already
+ supported.
+
+#. Write an independent class, using it explicitly from options that need it.
+
+   This approach works well in situations where you would like to parse an
+   option using special syntax for a not-very-special data type. The drawback
+ of this approach is that users of your parser have to be aware that they are
+ using your parser instead of the builtin ones.
+
+To guide the discussion, we will discuss a custom parser that accepts file
+sizes, specified with an optional unit after the numeric size. For example, we
+would like to parse "102kb", "41M", "1G" into the appropriate integer value. In
+this case, the underlying data type we want to parse into is '``unsigned``'. We
+choose approach #2 above because we don't want to make this the default for all
+``unsigned`` options.
+
+To start out, we declare our new ``FileSizeParser`` class:
+
+.. code-block:: c++
+
+ struct FileSizeParser : public cl::basic_parser<unsigned> {
+ // parse - Return true on error.
+ bool parse(cl::Option &O, const char *ArgName, const std::string &ArgValue,
+ unsigned &Val);
+ };
+
+Our new class inherits from the ``cl::basic_parser`` template class to fill in
+the default, boilerplate code for us. We give it the data type that we parse
+into (which becomes the type of the last argument to the ``parse`` method), so
+that clients of our custom parser know what object type to pass in. (Here we
+declare
+that we parse into '``unsigned``' variables.)
+
+For most purposes, the only method that must be implemented in a custom parser
+is the ``parse`` method. The ``parse`` method is called whenever the option is
+invoked, passing in the option itself, the option name, the string to parse, and
+a reference to a return value. If the string to parse is not well-formed, the
+parser should output an error message and return true. Otherwise it should
+return false and set '``Val``' to the parsed value. In our example, we
+implement ``parse`` as:
+
+.. code-block:: c++
+
+ bool FileSizeParser::parse(cl::Option &O, const char *ArgName,
+ const std::string &Arg, unsigned &Val) {
+ const char *ArgStart = Arg.c_str();
+ char *End;
+
+ // Parse integer part, leaving 'End' pointing to the first non-integer char
+ Val = (unsigned)strtol(ArgStart, &End, 0);
+
+ while (1) {
+ switch (*End++) {
+ case 0: return false; // No error
+ case 'i': // Ignore the 'i' in KiB if people use that
+ case 'b': case 'B': // Ignore B suffix
+ break;
+
+ case 'g': case 'G': Val *= 1024*1024*1024; break;
+ case 'm': case 'M': Val *= 1024*1024; break;
+ case 'k': case 'K': Val *= 1024; break;
+
+ default:
+ // Print an error message if unrecognized character!
+ return O.error("'" + Arg + "' value invalid for file size argument!");
+ }
+ }
+ }
+
+This function implements a very simple parser for the kinds of strings we are
+interested in. Although it has some holes (it allows "``123KKK``" for example),
+it is good enough for this example. Note that we use the option itself to print
+out the error message (the ``error`` method always returns true) in order to get
+a nice error message (shown below). Now that we have our parser class, we can
+use it like this:
+
+.. code-block:: c++
+
+ static cl::opt<unsigned, false, FileSizeParser>
+ MFS("max-file-size", cl::desc("Maximum file size to accept"),
+ cl::value_desc("size"));
+
+Which adds this to the output of our program:
+
+::
+
+ OPTIONS:
+ -help - display available options (-help-hidden for more)
+ ...
+ -max-file-size=<size> - Maximum file size to accept
+
+And we can test that our parser works correctly now (the test program just prints
+out the max-file-size argument value):
+
+::
+
+ $ ./test
+ MFS: 0
+ $ ./test -max-file-size=123MB
+ MFS: 128974848
+ $ ./test -max-file-size=3G
+ MFS: 3221225472
+ $ ./test -max-file-size=dog
+ -max-file-size option: 'dog' value invalid for file size argument!
+
+It looks like it works. The error message that we get is nice and helpful, and
+we seem to accept reasonable file sizes. This wraps up the "custom parser"
+tutorial.
+
+Exploiting external storage
+---------------------------
+
+Several of the LLVM libraries define static ``cl::opt`` instances that will
+automatically be included in any program that links with that library. This is
+a feature. However, sometimes it is necessary to know the value of the command
+line option outside of the library. In these cases the library does or should
+provide an external storage location that is accessible to users of the
+library. Examples of this include the ``llvm::DebugFlag`` exported by the
+``lib/Support/Debug.cpp`` file and the ``llvm::TimePassesIsEnabled`` flag
+exported by the ``lib/VMCore/PassManager.cpp`` file.
+
+.. todo::
+
+ TODO: complete this section
+
+.. _dynamically loaded options:
+
+Dynamically adding command line options
+---------------------------------------
+
+.. todo::
+
+ TODO: fill in this section
diff --git a/docs/CompilerWriterInfo.html b/docs/CompilerWriterInfo.html
index 5fdb4fc..67da783 100644
--- a/docs/CompilerWriterInfo.html
+++ b/docs/CompilerWriterInfo.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Architecture/platform information for compiler writers</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -260,7 +260,7 @@ processors.</li>
<a href="http://misha.brukman.net">Misha Brukman</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-28 00:56:32 +0200 (Fri, 28 Oct 2011) $
+ Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
</address>
</body>
diff --git a/docs/DebuggingJITedCode.html b/docs/DebuggingJITedCode.html
index ffb4cd9..652572c 100644
--- a/docs/DebuggingJITedCode.html
+++ b/docs/DebuggingJITedCode.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Debugging JITed Code With GDB</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -51,7 +51,7 @@ necessary debug information.
<p>In order to debug code JIT-ed by LLVM, you need GDB 7.0 or newer, which is
available on most modern distributions of Linux. The version of GDB that Apple
-ships with XCode has been frozen at 6.3 for a while. LLDB may be a better
+ships with Xcode has been frozen at 6.3 for a while. LLDB may be a better
option for debugging JIT-ed code on Mac OS X.
</p>
@@ -178,7 +178,7 @@ Program exited with code 0170.
<a href="mailto:reid.kleckner@gmail.com">Reid Kleckner</a>,
<a href="mailto:eliben@gmail.com">Eli Bendersky</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-05-01 09:58:54 +0200 (Tue, 01 May 2012) $
+ Last modified: $Date: 2012-05-13 16:36:15 +0200 (Sun, 13 May 2012) $
</address>
</body>
</html>
diff --git a/docs/DeveloperPolicy.html b/docs/DeveloperPolicy.html
deleted file mode 100644
index 264975e..0000000
--- a/docs/DeveloperPolicy.html
+++ /dev/null
@@ -1,642 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM Developer Policy</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>LLVM Developer Policy</h1>
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#policies">Developer Policies</a>
- <ol>
- <li><a href="#informed">Stay Informed</a></li>
- <li><a href="#patches">Making a Patch</a></li>
- <li><a href="#reviews">Code Reviews</a></li>
- <li><a href="#owners">Code Owners</a></li>
- <li><a href="#testcases">Test Cases</a></li>
- <li><a href="#quality">Quality</a></li>
- <li><a href="#commitaccess">Obtaining Commit Access</a></li>
- <li><a href="#newwork">Making a Major Change</a></li>
- <li><a href="#incremental">Incremental Development</a></li>
- <li><a href="#attribution">Attribution of Changes</a></li>
- </ol></li>
- <li><a href="#clp">Copyright, License, and Patents</a>
- <ol>
- <li><a href="#copyright">Copyright</a></li>
- <li><a href="#license">License</a></li>
- <li><a href="#patents">Patents</a></li>
- </ol></li>
-</ol>
-<div class="doc_author">Written by the LLVM Oversight Team</div>
-
-<!--=========================================================================-->
-<h2><a name="introduction">Introduction</a></h2>
-<!--=========================================================================-->
-<div>
-<p>This document contains the LLVM Developer Policy which defines the project's
- policy towards developers and their contributions. The intent of this policy
- is to eliminate miscommunication, rework, and confusion that might arise from
- the distributed nature of LLVM's development. By stating the policy in clear
- terms, we hope each developer can know ahead of time what to expect when
- making LLVM contributions. This policy covers all llvm.org subprojects,
- including Clang, LLDB, libc++, etc.</p>
-<p>This policy is also designed to accomplish the following objectives:</p>
-
-<ol>
- <li>Attract both users and developers to the LLVM project.</li>
-
- <li>Make life as simple and easy for contributors as possible.</li>
-
- <li>Keep the top of Subversion trees as stable as possible.</li>
-
- <li>Establish awareness of the project's <a href="#clp">copyright,
- license, and patent policies</a> with contributors to the project.</li>
-</ol>
-
-<p>This policy is aimed at frequent contributors to LLVM. People interested in
- contributing one-off patches can do so in an informal way by sending them to
- the
- <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits">llvm-commits
- mailing list</a> and engaging another developer to see it through the
- process.</p>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="policies">Developer Policies</a></h2>
-<!--=========================================================================-->
-<div>
-<p>This section contains policies that pertain to frequent LLVM developers. We
- always welcome <a href="#patches">one-off patches</a> from people who do not
- routinely contribute to LLVM, but we expect more from frequent contributors
- to keep the system as efficient as possible for everyone. Frequent LLVM
- contributors are expected to meet the following requirements in order for
- LLVM to maintain a high standard of quality.<p>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="informed">Stay Informed</a></h3>
-<div>
-<p>Developers should stay informed by reading at least the "dev" mailing list
- for the projects you are interested in, such as
- <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">llvmdev</a> for
- LLVM, <a href="http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev">cfe-dev</a>
- for Clang, or <a
- href="http://lists.cs.uiuc.edu/mailman/listinfo/lldb-dev">lldb-dev</a>
- for LLDB. If you are doing anything more than just casual work on LLVM, it
- is suggested that you also subscribe to the "commits" mailing list for the
- subproject you're interested in, such as
- <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits">llvm-commits</a>,
- <a href="http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits">cfe-commits</a>,
- or <a href="http://lists.cs.uiuc.edu/mailman/listinfo/lldb-commits">lldb-commits</a>.
- Reading the "commits" list and paying attention to changes being made by
- others is a good way to see what other people are interested in and watching
- the flow of the project as a whole.</p>
-
-<p>We recommend that active developers register an email account with
- <a href="http://llvm.org/bugs/">LLVM Bugzilla</a> and preferably subscribe to
- the <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmbugs">llvm-bugs</a>
- email list to keep track of bugs and enhancements occurring in LLVM. We
- really appreciate people who are proactive at catching incoming bugs in their
- components and dealing with them promptly.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="patches">Making a Patch</a></h3>
-
-<div>
-<p>When making a patch for review, the goal is to make it as easy for the
- reviewer to read it as possible. As such, we recommend that you:</p>
-
-<ol>
- <li>Make your patch against the Subversion trunk, not a branch, and not an old
- version of LLVM. This makes it easy to apply the patch. For information
- on how to check out SVN trunk, please see the <a
- href="GettingStarted.html#checkout">Getting Started Guide</a>.</li>
-
- <li>Similarly, patches should be submitted soon after they are generated. Old
- patches may not apply correctly if the underlying code changes between the
- time the patch was created and the time it is applied.</li>
-
- <li>Patches should be made with <tt>svn diff</tt>, or similar. If you use
- a different tool, make sure it uses the <tt>diff -u</tt> format and
- that it doesn't contain clutter which makes it hard to read.</li>
-
- <li>If you are modifying generated files, such as the top-level
- <tt>configure</tt> script, please separate out those changes into
- a separate patch from the rest of your changes.</li>
-</ol>
-
-<p>When sending a patch to a mailing list, it is a good idea to send it as an
- <em>attachment</em> to the message, not embedded into the text of the
- message. This ensures that your mailer will not mangle the patch when it
- sends it (e.g. by making whitespace changes or by wrapping lines).</p>
-
-<p><em>For Thunderbird users:</em> Before submitting a patch, please open
- <em>Preferences &#8594; Advanced &#8594; General &#8594; Config Editor</em>,
- find the key <tt>mail.content_disposition_type</tt>, and set its value to
- <tt>1</tt>. Without this setting, Thunderbird sends your attachment using
- <tt>Content-Disposition: inline</tt> rather than <tt>Content-Disposition:
- attachment</tt>. Apple Mail gamely displays such a file inline, making it
- difficult to work with for reviewers using that program.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="reviews">Code Reviews</a></h3>
-<div>
-<p>LLVM has a code review policy. Code review is one way to increase the quality
- of software. We generally follow these policies:</p>
-
-<ol>
- <li>All developers are required to have significant changes reviewed before
- they are committed to the repository.</li>
-
- <li>Code reviews are conducted by email, usually on the llvm-commits
- list.</li>
-
- <li>Code can be reviewed either before it is committed or after. We expect
- major changes to be reviewed before being committed, but smaller changes
- (or changes where the developer owns the component) can be reviewed after
- commit.</li>
-
- <li>The developer responsible for a code change is also responsible for making
- all necessary review-related changes.</li>
-
- <li>Code review can be an iterative process, which continues until the patch
- is ready to be committed.</li>
-</ol>
-
-<p>Developers should participate in code reviews as both reviewers and
- reviewees. If someone is kind enough to review your code, you should return
- the favor for someone else. Note that anyone is welcome to review and give
- feedback on a patch, but only people with Subversion write access can approve
- it.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="owners">Code Owners</a></h3>
-<div>
-
-<p>The LLVM Project relies on two features of its process to maintain rapid
- development in addition to the high quality of its source base: the
- combination of code review plus post-commit review for trusted maintainers.
- Having both is a great way for the project to take advantage of the fact that
- most people do the right thing most of the time, and only commit patches
- without pre-commit review when they are confident they are right.</p>
-
-<p>The trick to this is that the project has to guarantee that all patches that
- are committed are reviewed after they go in: you don't want everyone to
- assume someone else will review it, allowing the patch to go unreviewed. To
- solve this problem, we have a notion of an 'owner' for a piece of the code.
- The sole responsibility of a code owner is to ensure that a commit to their
- area of the code is appropriately reviewed, either by themself or by someone
- else. The current code owners are:</p>
-
-<ol>
- <li><b>Evan Cheng</b>: Code generator and all targets.</li>
-
- <li><b>Greg Clayton</b>: LLDB.</li>
-
- <li><b>Doug Gregor</b>: Clang Frontend Libraries.</li>
-
- <li><b>Howard Hinnant</b>: libc++.</li>
-
- <li><b>Anton Korobeynikov</b>: Exception handling, debug information, and
- Windows codegen.</li>
-
- <li><b>Ted Kremenek</b>: Clang Static Analyzer.</li>
-
- <li><b>Chris Lattner</b>: Everything not covered by someone else.</li>
-
- <li><b>John McCall</b>: Clang LLVM IR generation.</li>
-
- <li><b>Jakob Olesen</b>: Register allocators and TableGen.</li>
-
- <li><b>Duncan Sands</b>: dragonegg and llvm-gcc 4.2.</li>
-
- <li><b>Peter Collingbourne</b>: libclc.</li>
-
- <li><b>Tobias Grosser</b>: polly.</li>
-</ol>
-
-<p>Note that code ownership is completely different than reviewers: anyone can
- review a piece of code, and we welcome code review from anyone who is
- interested. Code owners are the "last line of defense" to guarantee that all
- patches that are committed are actually reviewed.</p>
-
-<p>Being a code owner is a somewhat unglamorous position, but it is incredibly
- important for the ongoing success of the project. Because people get busy,
- interests change, and unexpected things happen, code ownership is purely
- opt-in, and anyone can choose to resign their "title" at any time. For now,
- we do not have an official policy on how one gets elected to be a code
- owner.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="testcases">Test Cases</a></h3>
-<div>
-<p>Developers are required to create test cases for any bugs fixed and any new
- features added. Some tips for getting your testcase approved:</p>
-
-<ol>
- <li>All feature and regression test cases are added to the
- <tt>llvm/test</tt> directory. The appropriate sub-directory should be
- selected (see the <a href="TestingGuide.html">Testing Guide</a> for
- details).</li>
-
- <li>Test cases should be written in <a href="LangRef.html">LLVM assembly
- language</a> unless the feature or regression being tested requires
- another language (e.g. the bug being fixed or feature being implemented is
- in the llvm-gcc C++ front-end, in which case it must be written in
- C++).</li>
-
- <li>Test cases, especially for regressions, should be reduced as much as
- possible, by <a href="Bugpoint.html">bugpoint</a> or manually. It is
- unacceptable to place an entire failing program into <tt>llvm/test</tt> as
- this creates a <i>time-to-test</i> burden on all developers. Please keep
- them short.</li>
-</ol>
-
-<p>Note that llvm/test and clang/test are designed for regression and small
- feature tests only. More extensive test cases (e.g., entire applications,
- benchmarks, etc)
- should be added to the <tt>llvm-test</tt> test suite. The llvm-test suite is
- for coverage (correctness, performance, etc) testing, not feature or
- regression testing.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="quality">Quality</a></h3>
-<div>
-<p>The minimum quality standards that any change must satisfy before being
- committed to the main development branch are:</p>
-
-<ol>
- <li>Code must adhere to the <a href="CodingStandards.html">LLVM Coding
- Standards</a>.</li>
-
- <li>Code must compile cleanly (no errors, no warnings) on at least one
- platform.</li>
-
- <li>Bug fixes and new features should <a href="#testcases">include a
- testcase</a> so we know if the fix/feature ever regresses in the
- future.</li>
-
- <li>Code must pass the <tt>llvm/test</tt> test suite.</li>
-
- <li>The code must not cause regressions on a reasonable subset of llvm-test,
- where "reasonable" depends on the contributor's judgement and the scope of
- the change (more invasive changes require more testing). A reasonable
- subset might be something like
- "<tt>llvm-test/MultiSource/Benchmarks</tt>".</li>
-</ol>
-
-<p>Additionally, the committer is responsible for addressing any problems found
- in the future that the change is responsible for. For example:</p>
-
-<ul>
- <li>The code should compile cleanly on all supported platforms.</li>
-
- <li>The changes should not cause any correctness regressions in the
- <tt>llvm-test</tt> suite and must not cause any major performance
- regressions.</li>
-
- <li>The change set should not cause performance or correctness regressions for
- the LLVM tools.</li>
-
- <li>The changes should not cause performance or correctness regressions in
- code compiled by LLVM on all applicable targets.</li>
-
- <li>You are expected to address any <a href="http://llvm.org/bugs/">bugzilla
- bugs</a> that result from your change.</li>
-</ul>
-
-<p>We prefer for this to be handled before submission but understand that it
- isn't possible to test all of this for every submission. Our build bots and
- nightly testing infrastructure normally finds these problems. A good rule of
- thumb is to check the nightly testers for regressions the day after your
- change. Build bots will directly email you if a group of commits that
- included yours caused a failure. You are expected to check the build bot
- messages to see if they are your fault and, if so, fix the breakage.</p>
-
-<p>Commits that violate these quality standards (e.g. are very broken) may be
- reverted. This is necessary when the change blocks other developers from
- making progress. The developer is welcome to re-commit the change after the
- problem has been fixed.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="commitaccess">Obtaining Commit Access</a></h3>
-<div>
-
-<p>We grant commit access to contributors with a track record of submitting high
- quality patches. If you would like commit access, please send an email to
- <a href="mailto:sabre@nondot.org">Chris</a> with the following
- information:</p>
-
-<ol>
- <li>The user name you want to commit with, e.g. "hacker".</li>
-
-  <li>The full name and email address you want messages to llvm-commits to
-      come from, e.g. "J. Random Hacker &lt;hacker@yoyodyne.com&gt;".</li>
-
- <li>A "password hash" of the password you want to use, e.g. "2ACR96qjUqsyM".
- Note that you don't ever tell us what your password is, you just give it
- to us in an encrypted form. To get this, run "htpasswd" (a utility that
- comes with apache) in crypt mode (often enabled with "-d"), or find a web
- page that will do it for you.</li>
-</ol>
-
-<p>Once you've been granted commit access, you should be able to check out an
- LLVM tree with an SVN URL of "https://username@llvm.org/..." instead of the
- normal anonymous URL of "http://llvm.org/...". The first time you commit
- you'll have to type in your password. Note that you may get a warning from
- SVN about an untrusted key, you can ignore this. To verify that your commit
- access works, please do a test commit (e.g. change a comment or add a blank
- line). Your first commit to a repository may require the autogenerated email
- to be approved by a mailing list. This is normal, and will be done when
- the mailing list owner has time.</p>
-
-<p>If you have recently been granted commit access, these policies apply:</p>
-
-<ol>
- <li>You are granted <i>commit-after-approval</i> to all parts of LLVM. To get
- approval, submit a <a href="#patches">patch</a> to
- <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits">llvm-commits</a>.
- When approved you may commit it yourself.</li>
-
- <li>You are allowed to commit patches without approval which you think are
- obvious. This is clearly a subjective decision &mdash; we simply expect
- you to use good judgement. Examples include: fixing build breakage,
- reverting obviously broken patches, documentation/comment changes, any
- other minor changes.</li>
-
- <li>You are allowed to commit patches without approval to those portions of
- LLVM that you have contributed or maintain (i.e., have been assigned
- responsibility for), with the proviso that such commits must not break the
- build. This is a "trust but verify" policy and commits of this nature are
- reviewed after they are committed.</li>
-
- <li>Multiple violations of these policies or a single egregious violation may
- cause commit access to be revoked.</li>
-</ol>
-
-<p>In any case, your changes are still subject to <a href="#reviews">code
- review</a> (either before or after they are committed, depending on the
- nature of the change). You are encouraged to review other peoples' patches
- as well, but you aren't required to.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="newwork">Making a Major Change</a></h3>
-<div>
-<p>When a developer begins a major new project with the aim of contributing it
- back to LLVM, s/he should inform the community with an email to
- the <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">llvmdev</a>
- email list, to the extent possible. The reason for this is to:
-
-<ol>
- <li>keep the community informed about future changes to LLVM, </li>
-
- <li>avoid duplication of effort by preventing multiple parties working on the
- same thing and not knowing about it, and</li>
-
- <li>ensure that any technical issues around the proposed work are discussed
- and resolved before any significant work is done.</li>
-</ol>
-
-<p>The design of LLVM is carefully controlled to ensure that all the pieces fit
- together well and are as consistent as possible. If you plan to make a major
- change to the way LLVM works or want to add a major new extension, it is a
- good idea to get consensus with the development community before you start
- working on it.</p>
-
-<p>Once the design of the new feature is finalized, the work itself should be
- done as a series of <a href="#incremental">incremental changes</a>, not as a
- long-term development branch.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="incremental">Incremental Development</a></h3>
-<div>
-<p>In the LLVM project, we do all significant changes as a series of incremental
- patches. We have a strong dislike for huge changes or long-term development
- branches. Long-term development branches have a number of drawbacks:</p>
-
-<ol>
- <li>Branches must have mainline merged into them periodically. If the branch
- development and mainline development occur in the same pieces of code,
- resolving merge conflicts can take a lot of time.</li>
-
- <li>Other people in the community tend to ignore work on branches.</li>
-
- <li>Huge changes (produced when a branch is merged back onto mainline) are
- extremely difficult to <a href="#reviews">code review</a>.</li>
-
- <li>Branches are not routinely tested by our nightly tester
- infrastructure.</li>
-
- <li>Changes developed as monolithic large changes often don't work until the
- entire set of changes is done. Breaking it down into a set of smaller
- changes increases the odds that any of the work will be committed to the
- main repository.</li>
-</ol>
-
-<p>To address these problems, LLVM uses an incremental development style and we
- require contributors to follow this practice when making a large/invasive
- change. Some tips:</p>
-
-<ul>
- <li>Large/invasive changes usually have a number of secondary changes that are
- required before the big change can be made (e.g. API cleanup, etc). These
- sorts of changes can often be done before the major change is done,
- independently of that work.</li>
-
- <li>The remaining inter-related work should be decomposed into unrelated sets
- of changes if possible. Once this is done, define the first increment and
- get consensus on what the end goal of the change is.</li>
-
- <li>Each change in the set can be stand alone (e.g. to fix a bug), or part of
- a planned series of changes that works towards the development goal.</li>
-
- <li>Each change should be kept as small as possible. This simplifies your work
- (into a logical progression), simplifies code review and reduces the
- chance that you will get negative feedback on the change. Small increments
- also facilitate the maintenance of a high quality code base.</li>
-
- <li>Often, an independent precursor to a big change is to add a new API and
- slowly migrate clients to use the new API. Each change to use the new API
- is often "obvious" and can be committed without review. Once the new API
- is in place and used, it is much easier to replace the underlying
- implementation of the API. This implementation change is logically
- separate from the API change.</li>
-</ul>
-
-<p>If you are interested in making a large change, and this scares you, please
- make sure to first <a href="#newwork">discuss the change/gather consensus</a>
- then ask about the best way to go about making the change.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="attribution">Attribution of Changes</a></h3>
-<div>
-<p>We believe in correct attribution of contributions to their contributors.
- However, we do not want the source code to be littered with random
- attributions "this code written by J. Random Hacker" (this is noisy and
- distracting). In practice, the revision control system keeps a perfect
- history of who changed what, and the CREDITS.txt file describes higher-level
- contributions. If you commit a patch for someone else, please say "patch
- contributed by J. Random Hacker!" in the commit message.</p>
-
-<p>Overall, please do not add contributor names to the source code.</p>
-</div>
-
-</div>
-
-<!--=========================================================================-->
-<h2>
- <a name="clp">Copyright, License, and Patents</a>
-</h2>
-<!--=========================================================================-->
-
-<div>
-
-<div class="doc_notes">
-<p style="text-align:center;font-weight:bold">NOTE: This section deals with
- legal matters but does not provide legal advice. We are not lawyers &mdash;
- please seek legal counsel from an attorney.</p>
-</div>
-
-<div>
-<p>This section addresses the issues of copyright, license and patents for the
- LLVM project. The copyright for the code is held by the individual
- contributors of the code and the terms of its license to LLVM users and
- developers is the
- <a href="http://www.opensource.org/licenses/UoI-NCSA.php">University of
- Illinois/NCSA Open Source License</a> (with portions dual licensed under the
- <a href="http://www.opensource.org/licenses/mit-license.php">MIT License</a>,
-   see below). As a contributor to the LLVM project, you agree to allow any
-   contributions to the project to be licensed under these terms.</p>
-
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="copyright">Copyright</a></h3>
-<div>
-
-<p>The LLVM project does not require copyright assignments, which means that the
- copyright for the code in the project is held by its respective contributors
- who have each agreed to release their contributed code under the terms of the
- <a href="#license">LLVM License</a>.</p>
-
-<p>An implication of this is that the LLVM license is unlikely to ever change:
- changing it would require tracking down all the contributors to LLVM and
- getting them to agree that a license change is acceptable for their
- contribution. Since there are no plans to change the license, this is not a
- cause for concern.</p>
-
-<p>As a contributor to the project, this means that you (or your company) retain
- ownership of the code you contribute, that it cannot be used in a way that
- contradicts the license (which is a liberal BSD-style license), and that the
- license for your contributions won't change without your approval in the
- future.</p>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="license">License</a></h3>
-<div>
-<p>We intend to keep LLVM perpetually open source and to use a liberal open
- source license. <b>As a contributor to the project, you agree that any
- contributions be licensed under the terms of the corresponding
- subproject.</b>
- All of the code in LLVM is available under the
- <a href="http://www.opensource.org/licenses/UoI-NCSA.php">University of
- Illinois/NCSA Open Source License</a>, which boils down to this:</p>
-
-<ul>
- <li>You can freely distribute LLVM.</li>
- <li>You must retain the copyright notice if you redistribute LLVM.</li>
- <li>Binaries derived from LLVM must reproduce the copyright notice (e.g. in an
- included readme file).</li>
- <li>You can't use our names to promote your LLVM derived products.</li>
- <li>There's no warranty on LLVM at all.</li>
-</ul>
-
-<p>We believe this fosters the widest adoption of LLVM because it <b>allows
- commercial products to be derived from LLVM</b> with few restrictions and
- without a requirement for making any derived works also open source (i.e.
- LLVM's license is not a "copyleft" license like the GPL). We suggest that you
- read the <a href="http://www.opensource.org/licenses/UoI-NCSA.php">License</a>
- if further clarification is needed.</p>
-
-<p>In addition to the UIUC license, the runtime library components of LLVM
- (<b>compiler_rt, libc++, and libclc</b>) are also licensed under the <a
- href="http://www.opensource.org/licenses/mit-license.php">MIT license</a>,
- which does not contain the binary redistribution clause. As a user of these
- runtime libraries, it means that you can choose to use the code under either
- license (and thus don't need the binary redistribution clause), and as a
- contributor to the code that you agree that any contributions to these
- libraries be licensed under both licenses. We feel that this is important
- for runtime libraries, because they are implicitly linked into applications
- and therefore should not subject those applications to the binary
- redistribution clause. This also means that it is ok to move code from (e.g.)
- libc++ to the LLVM core without concern, but that code cannot be moved from
- the LLVM core to libc++ without the copyright owner's permission.
-</p>
-
-<p>Note that the LLVM Project does distribute llvm-gcc and dragonegg, <b>which
- are GPL.</b>
- This means that anything "linked" into llvm-gcc must itself be compatible
- with the GPL, and must be releasable under the terms of the GPL. This
- implies that <b>any code linked into llvm-gcc and distributed to others may
- be subject to the viral aspects of the GPL</b> (for example, a proprietary
- code generator linked into llvm-gcc must be made available under the GPL).
- This is not a problem for code already distributed under a more liberal
- license (like the UIUC license), and GPL-containing subprojects are kept
- in separate SVN repositories whose LICENSE.txt files specifically indicate
- that they contain GPL code.</p>
-
-<p>We have no plans to change the license of LLVM. If you have questions or
- comments about the license, please contact the
- <a href="mailto:llvmdev@cs.uiuc.edu">LLVM Developer's Mailing List</a>.</p>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="patents">Patents</a></h3>
-<div>
-<p>To the best of our knowledge, LLVM does not infringe on any patents (we have
- actually removed code from LLVM in the past that was found to infringe).
- Having code in LLVM that infringes on patents would violate an important goal
- of the project by making it hard or impossible to reuse the code for
- arbitrary purposes (including commercial use).</p>
-
-<p>When contributing code, we expect contributors to notify us of any potential
- for patent-related trouble with their changes (including from third parties).
- If you or your employer own
- the rights to a patent and would like to contribute code to LLVM that relies
- on it, we require that the copyright owner sign an agreement that allows any
- other user of LLVM to freely use your patent. Please contact
- the <a href="mailto:llvm-oversight@cs.uiuc.edu">oversight group</a> for more
- details.</p>
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- Written by the
- <a href="mailto:llvm-oversight@cs.uiuc.edu">LLVM Oversight Group</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-03-27 13:25:16 +0200 (Tue, 27 Mar 2012) $
-</address>
-</body>
-</html>
diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
new file mode 100644
index 0000000..cda281a
--- /dev/null
+++ b/docs/DeveloperPolicy.rst
@@ -0,0 +1,508 @@
+.. _developer_policy:
+
+=====================
+LLVM Developer Policy
+=====================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+This document contains the LLVM Developer Policy which defines the project's
+policy towards developers and their contributions. The intent of this policy is
+to eliminate miscommunication, rework, and confusion that might arise from the
+distributed nature of LLVM's development. By stating the policy in clear terms,
+we hope each developer can know ahead of time what to expect when making LLVM
+contributions. This policy covers all llvm.org subprojects, including Clang,
+LLDB, libc++, etc.
+
+This policy is also designed to accomplish the following objectives:
+
+#. Attract both users and developers to the LLVM project.
+
+#. Make life as simple and easy for contributors as possible.
+
+#. Keep the top of Subversion trees as stable as possible.
+
+#. Establish awareness of the project's `copyright, license, and patent
+ policies`_ with contributors to the project.
+
+This policy is aimed at frequent contributors to LLVM. People interested in
+contributing one-off patches can do so in an informal way by sending them to the
+`llvm-commits mailing list
+<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ and engaging another
+developer to see it through the process.
+
+Developer Policies
+==================
+
+This section contains policies that pertain to frequent LLVM developers. We
+always welcome `one-off patches`_ from people who do not routinely contribute to
+LLVM, but we expect more from frequent contributors to keep the system as
+efficient as possible for everyone. Frequent LLVM contributors are expected to
+meet the following requirements in order for LLVM to maintain a high standard of
+quality.
+
+Stay Informed
+-------------
+
+Developers should stay informed by reading at least the "dev" mailing list for
+the projects they are interested in, such as `llvmdev
+<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ for LLVM, `cfe-dev
+<http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev>`_ for Clang, or `lldb-dev
+<http://lists.cs.uiuc.edu/mailman/listinfo/lldb-dev>`_ for LLDB. If you are
+doing anything more than just casual work on LLVM, it is suggested that you also
+subscribe to the "commits" mailing list for the subproject you're interested in,
+such as `llvm-commits
+<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_, `cfe-commits
+<http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits>`_, or `lldb-commits
+<http://lists.cs.uiuc.edu/mailman/listinfo/lldb-commits>`_. Reading the
+"commits" list and paying attention to changes being made by others is a good
+way to see what other people are interested in and to watch the flow of the
+project as a whole.
+
+We recommend that active developers register an email account with `LLVM
+Bugzilla <http://llvm.org/bugs/>`_ and preferably subscribe to the `llvm-bugs
+<http://lists.cs.uiuc.edu/mailman/listinfo/llvmbugs>`_ email list to keep track
+of bugs and enhancements occurring in LLVM. We really appreciate people who are
+proactive at catching incoming bugs in their components and dealing with them
+promptly.
+
+.. _patch:
+.. _one-off patches:
+
+Making a Patch
+--------------
+
+When making a patch for review, the goal is to make it as easy for the reviewer
+to read it as possible. As such, we recommend that you:
+
+#. Make your patch against the Subversion trunk, not a branch, and not an old
+ version of LLVM. This makes it easy to apply the patch. For information on
+ how to check out SVN trunk, please see the `Getting Started
+ Guide <GettingStarted.html#checkout>`_.
+
+#. Similarly, patches should be submitted soon after they are generated. Old
+ patches may not apply correctly if the underlying code changes between the
+ time the patch was created and the time it is applied.
+
+#. Patches should be made with ``svn diff``, or similar. If you use a
+   different tool, make sure it uses the ``diff -u`` format and that it
+   doesn't contain clutter that makes it hard to read; a sample workflow is
+   sketched after this list.
+
+#. If you are modifying generated files, such as the top-level ``configure``
+ script, please separate out those changes into a separate patch from the rest
+ of your changes.
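+
+For example, a minimal patch-creation workflow might look like the following
+(the output file name is just an illustration)::
+
+  % svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm
+  % cd llvm
+  ... hack, hack, hack ...
+  % svn up                  # re-sync with trunk so the patch still applies
+  % svn diff > patch.diff   # produces a unified diff suitable for review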
+
+When sending a patch to a mailing list, it is a good idea to send it as an
+*attachment* to the message, not embedded into the text of the message. This
+ensures that your mailer will not mangle the patch when it sends it (e.g. by
+making whitespace changes or by wrapping lines).
+
+*For Thunderbird users:* Before submitting a patch, please open *Preferences >
+Advanced > General > Config Editor*, find the key
+``mail.content_disposition_type``, and set its value to ``1``. Without this
+setting, Thunderbird sends your attachment using ``Content-Disposition: inline``
+rather than ``Content-Disposition: attachment``. Apple Mail gamely displays such
+a file inline, making it difficult to work with for reviewers using that
+program.
+
+.. _code review:
+
+Code Reviews
+------------
+
+LLVM has a code review policy. Code review is one way to increase the quality of
+software. We generally follow these policies:
+
+#. All developers are required to have significant changes reviewed before they
+ are committed to the repository.
+
+#. Code reviews are conducted by email, usually on the llvm-commits list.
+
+#. Code can be reviewed either before it is committed or after. We expect major
+ changes to be reviewed before being committed, but smaller changes (or
+ changes where the developer owns the component) can be reviewed after commit.
+
+#. The developer responsible for a code change is also responsible for making
+ all necessary review-related changes.
+
+#. Code review can be an iterative process, which continues until the patch is
+ ready to be committed.
+
+Developers should participate in code reviews as both reviewers and
+reviewees. If someone is kind enough to review your code, you should return the
+favor for someone else. Note that anyone is welcome to review and give feedback
+on a patch, but only people with Subversion write access can approve it.
+
+Code Owners
+-----------
+
+The LLVM Project relies on two features of its process to maintain rapid
+development in addition to the high quality of its source base: the combination
+of code review plus post-commit review for trusted maintainers. Having both is
+a great way for the project to take advantage of the fact that most people do
+the right thing most of the time, and only commit patches without pre-commit
+review when they are confident they are right.
+
+The trick to this is that the project has to guarantee that all patches that are
+committed are reviewed after they go in: you don't want everyone to assume
+someone else will review it, allowing the patch to go unreviewed. To solve this
+problem, we have a notion of an 'owner' for a piece of the code. The sole
+responsibility of a code owner is to ensure that a commit to their area of the
+code is appropriately reviewed, either by themselves or by someone else. The list
+of current code owners can be found in the file
+`CODE_OWNERS.TXT <http://llvm.org/viewvc/llvm-project/llvm/trunk/CODE_OWNERS.TXT?view=markup>`_
+in the root of the LLVM source tree.
+
+Note that code ownership is completely different from being a reviewer: anyone can
+review a piece of code, and we welcome code review from anyone who is
+interested. Code owners are the "last line of defense" to guarantee that all
+patches that are committed are actually reviewed.
+
+Being a code owner is a somewhat unglamorous position, but it is incredibly
+important for the ongoing success of the project. Because people get busy,
+interests change, and unexpected things happen, code ownership is purely opt-in,
+and anyone can choose to resign their "title" at any time. For now, we do not
+have an official policy on how one gets elected to be a code owner.
+
+.. _include a testcase:
+
+Test Cases
+----------
+
+Developers are required to create test cases for any bugs fixed and any new
+features added. Some tips for getting your testcase approved:
+
+* All feature and regression test cases are added to the ``llvm/test``
+ directory. The appropriate sub-directory should be selected (see the `Testing
+ Guide <TestingGuide.html>`_ for details).
+
+* Test cases should be written in `LLVM assembly language <LangRef.html>`_
+  unless the feature or regression being tested requires another language
+  (e.g. the bug being fixed or feature being implemented is in the llvm-gcc C++
+  front-end, in which case it must be written in C++); a sketch of such a test
+  follows this list.
+
+* Test cases, especially for regressions, should be reduced as much as possible,
+ by `bugpoint <Bugpoint.html>`_ or manually. It is unacceptable to place an
+ entire failing program into ``llvm/test`` as this creates a *time-to-test*
+ burden on all developers. Please keep them short.
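+
+As an illustration only (not a real regression test; the pass and CHECK lines
+are hypothetical), a reduced test case in LLVM assembly might look like::
+
+  ; RUN: opt < %s -instcombine -S | FileCheck %s
+
+  ; The redundant adds should be folded away entirely.
+  define i32 @test(i32 %x) {
+  ; CHECK: ret i32 %x
+    %y = add i32 %x, 0
+    %z = add i32 %y, 0
+    ret i32 %z
+  }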
+
+Note that ``llvm/test`` and ``clang/test`` are designed for regression and small
+feature tests only. More extensive test cases (e.g., entire applications,
+benchmarks, etc.) should be added to the ``llvm-test`` test suite. The
+``llvm-test`` suite is for coverage (correctness, performance, etc.) testing,
+not feature or regression testing.
+
+Quality
+-------
+
+The minimum quality standards that any change must satisfy before being
+committed to the main development branch are:
+
+#. Code must adhere to the `LLVM Coding Standards <CodingStandards.html>`_.
+
+#. Code must compile cleanly (no errors, no warnings) on at least one platform.
+
+#. Bug fixes and new features should `include a testcase`_ so we know if the
+ fix/feature ever regresses in the future.
+
+#. Code must pass the ``llvm/test`` test suite (one way to run it is shown
+   after this list).
+
+#. The code must not cause regressions on a reasonable subset of llvm-test,
+ where "reasonable" depends on the contributor's judgement and the scope of
+ the change (more invasive changes require more testing). A reasonable subset
+ might be something like "``llvm-test/MultiSource/Benchmarks``".
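+
+For instance, with a configured build tree, the regression suite can typically
+be run from the top of the object directory with::
+
+  % make check
+
+(some build configurations also provide a broader ``check-all`` target).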
+
+Additionally, the committer is responsible for addressing any problems found in
+the future that the change is responsible for. For example:
+
+* The code should compile cleanly on all supported platforms.
+
+* The changes should not cause any correctness regressions in the ``llvm-test``
+ suite and must not cause any major performance regressions.
+
+* The change set should not cause performance or correctness regressions for the
+ LLVM tools.
+
+* The changes should not cause performance or correctness regressions in code
+ compiled by LLVM on all applicable targets.
+
+* You are expected to address any `Bugzilla bugs <http://llvm.org/bugs/>`_ that
+ result from your change.
+
+We prefer for this to be handled before submission but understand that it isn't
+possible to test all of this for every submission. Our build bots and nightly
+testing infrastructure normally finds these problems. A good rule of thumb is
+to check the nightly testers for regressions the day after your change. Build
+bots will directly email you if a group of commits that included yours caused a
+failure. You are expected to check the build bot messages to see if they are
+your fault and, if so, fix the breakage.
+
+Commits that violate these quality standards (e.g. are very broken) may be
+reverted. This is necessary when the change blocks other developers from making
+progress. The developer is welcome to re-commit the change after the problem has
+been fixed.
+
+Obtaining Commit Access
+-----------------------
+
+We grant commit access to contributors with a track record of submitting high
+quality patches. If you would like commit access, please send an email to
+`Chris <mailto:sabre@nondot.org>`_ with the following information:
+
+#. The user name you want to commit with, e.g. "hacker".
+
+#. The full name and email address you want messages to llvm-commits to come
+   from, e.g. "J. Random Hacker <hacker@yoyodyne.com>".
+
+#. A "password hash" of the password you want to use, e.g. "``2ACR96qjUqsyM``".
+   Note that you don't ever tell us what your password is; you just give it to
+   us in an encrypted form. To get this, run "``htpasswd``" (a utility that
+   comes with Apache) in crypt mode (often enabled with "``-d``"), or find a web
+   page that will do it for you. A sample invocation is shown after this list.
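+
+For example (the user name is hypothetical; ``-n`` prints the entry to stdout
+instead of updating a password file)::
+
+  % htpasswd -nd hacker
+  New password:
+  Re-type new password:
+  hacker:2ACR96qjUqsyM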
+
+Once you've been granted commit access, you should be able to check out an LLVM
+tree with an SVN URL of "https://username@llvm.org/..." instead of the normal
+anonymous URL of "http://llvm.org/...". The first time you commit you'll have
+to type in your password. Note that you may get a warning from SVN about an
+untrusted key, you can ignore this. To verify that your commit access works,
+please do a test commit (e.g. change a comment or add a blank line). Your first
+commit to a repository may require the autogenerated email to be approved by a
+mailing list. This is normal, and will be done when the mailing list owner has
+time.
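+
+Concretely, with a hypothetical user name, the authenticated checkout might
+look like::
+
+  % svn co https://hacker@llvm.org/svn/llvm-project/llvm/trunk llvm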
+
+If you have recently been granted commit access, these policies apply:
+
+#. You are granted *commit-after-approval* to all parts of LLVM. To get
+ approval, submit a `patch`_ to `llvm-commits
+ <http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_. When approved
+   you may commit it yourself.
+
+#. You are allowed to commit patches that you think are obvious without prior
+   approval. This is clearly a subjective decision --- we simply expect you to
+ use good judgement. Examples include: fixing build breakage, reverting
+ obviously broken patches, documentation/comment changes, any other minor
+ changes.
+
+#. You are allowed to commit patches without approval to those portions of LLVM
+ that you have contributed or maintain (i.e., have been assigned
+ responsibility for), with the proviso that such commits must not break the
+ build. This is a "trust but verify" policy and commits of this nature are
+ reviewed after they are committed.
+
+#. Multiple violations of these policies or a single egregious violation may
+ cause commit access to be revoked.
+
+In any case, your changes are still subject to `code review`_ (either before or
+after they are committed, depending on the nature of the change). You are
+encouraged to review other peoples' patches as well, but you aren't required
+to.
+
+.. _discuss the change/gather consensus:
+
+Making a Major Change
+---------------------
+
+When a developer begins a major new project with the aim of contributing it back
+to LLVM, they should inform the community with an email to the `llvmdev
+<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ email list, to the extent
+possible. The reason for this is to:
+
+#. keep the community informed about future changes to LLVM,
+
+#. avoid duplication of effort by preventing multiple parties working on the
+ same thing and not knowing about it, and
+
+#. ensure that any technical issues around the proposed work are discussed and
+ resolved before any significant work is done.
+
+The design of LLVM is carefully controlled to ensure that all the pieces fit
+together well and are as consistent as possible. If you plan to make a major
+change to the way LLVM works or want to add a major new extension, it is a good
+idea to get consensus with the development community before you start working on
+it.
+
+Once the design of the new feature is finalized, the work itself should be done
+as a series of `incremental changes`_, not as a long-term development branch.
+
+.. _incremental changes:
+
+Incremental Development
+-----------------------
+
+In the LLVM project, we do all significant changes as a series of incremental
+patches. We have a strong dislike for huge changes or long-term development
+branches. Long-term development branches have a number of drawbacks:
+
+#. Branches must have mainline merged into them periodically. If the branch
+ development and mainline development occur in the same pieces of code,
+ resolving merge conflicts can take a lot of time.
+
+#. Other people in the community tend to ignore work on branches.
+
+#. Huge changes (produced when a branch is merged back onto mainline) are
+ extremely difficult to `code review`_.
+
+#. Branches are not routinely tested by our nightly tester infrastructure.
+
+#. Changes developed as monolithic large changes often don't work until the
+ entire set of changes is done. Breaking it down into a set of smaller
+ changes increases the odds that any of the work will be committed to the main
+ repository.
+
+To address these problems, LLVM uses an incremental development style and we
+require contributors to follow this practice when making a large/invasive
+change. Some tips:
+
+* Large/invasive changes usually have a number of secondary changes that are
+ required before the big change can be made (e.g. API cleanup, etc). These
+ sorts of changes can often be done before the major change is done,
+ independently of that work.
+
+* The remaining inter-related work should be decomposed into unrelated sets of
+ changes if possible. Once this is done, define the first increment and get
+ consensus on what the end goal of the change is.
+
+* Each change in the set can stand alone (e.g. to fix a bug), or be part of a
+  planned series of changes that works towards the development goal.
+
+* Each change should be kept as small as possible. This simplifies your work
+ (into a logical progression), simplifies code review and reduces the chance
+ that you will get negative feedback on the change. Small increments also
+ facilitate the maintenance of a high quality code base.
+
+* Often, an independent precursor to a big change is to add a new API and slowly
+ migrate clients to use the new API. Each change to use the new API is often
+ "obvious" and can be committed without review. Once the new API is in place
+ and used, it is much easier to replace the underlying implementation of the
+ API. This implementation change is logically separate from the API
+ change.
+
+If you are interested in making a large change, and this scares you, please make
+sure to first `discuss the change/gather consensus`_ and then ask about the
+best way to go about making the change.
+
+Attribution of Changes
+----------------------
+
+We believe in correct attribution of contributions to their contributors.
+However, we do not want the source code to be littered with random attributions
+like "this code written by J. Random Hacker" (this is noisy and distracting). In
+practice, the revision control system keeps a perfect history of who changed
+what, and the CREDITS.txt file describes higher-level contributions. If you
+commit a patch for someone else, please say "patch contributed by J. Random
+Hacker!" in the commit message.
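+
+For example, an illustrative commit on someone else's behalf (the message text
+is hypothetical) might look like::
+
+  % svn commit -m "Fix a typo in the docs. Patch contributed by J. Random Hacker!"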
+
+Overall, please do not add contributor names to the source code.
+
+.. _copyright, license, and patent policies:
+
+Copyright, License, and Patents
+===============================
+
+.. note::
+
+ This section deals with legal matters but does not provide legal advice. We
+ are not lawyers --- please seek legal counsel from an attorney.
+
+This section addresses the issues of copyright, license, and patents for the
+LLVM project. The copyright for the code is held by the individual contributors
+of the code, and it is licensed to LLVM users and developers under the
+`University of Illinois/NCSA Open Source License
+<http://www.opensource.org/licenses/UoI-NCSA.php>`_ (with portions dual licensed
+under the `MIT License <http://www.opensource.org/licenses/mit-license.php>`_,
+see below). As a contributor to the LLVM project, you agree to allow any
+contributions to the project to be licensed under these terms.
+
+Copyright
+---------
+
+The LLVM project does not require copyright assignments, which means that the
+copyright for the code in the project is held by its respective contributors who
+have each agreed to release their contributed code under the terms of the `LLVM
+License`_.
+
+An implication of this is that the LLVM license is unlikely to ever change:
+changing it would require tracking down all the contributors to LLVM and getting
+them to agree that a license change is acceptable for their contribution. Since
+there are no plans to change the license, this is not a cause for concern.
+
+As a contributor to the project, this means that you (or your company) retain
+ownership of the code you contribute, that it cannot be used in a way that
+contradicts the license (which is a liberal BSD-style license), and that the
+license for your contributions won't change without your approval in the
+future.
+
+.. _LLVM License:
+
+License
+-------
+
+We intend to keep LLVM perpetually open source and to use a liberal open source
+license. **As a contributor to the project, you agree that any contributions be
+licensed under the terms of the corresponding subproject.** All of the code in
+LLVM is available under the `University of Illinois/NCSA Open Source License
+<http://www.opensource.org/licenses/UoI-NCSA.php>`_, which boils down to
+this:
+
+* You can freely distribute LLVM.
+* You must retain the copyright notice if you redistribute LLVM.
+* Binaries derived from LLVM must reproduce the copyright notice (e.g. in an
+ included readme file).
+* You can't use our names to promote your LLVM derived products.
+* There's no warranty on LLVM at all.
+
+We believe this fosters the widest adoption of LLVM because it **allows
+commercial products to be derived from LLVM** with few restrictions and without
+a requirement for making any derived works also open source (i.e. LLVM's
+license is not a "copyleft" license like the GPL). We suggest that you read the
+`License <http://www.opensource.org/licenses/UoI-NCSA.php>`_ if further
+clarification is needed.
+
+In addition to the UIUC license, the runtime library components of LLVM
+(**compiler_rt, libc++, and libclc**) are also licensed under the `MIT License
+<http://www.opensource.org/licenses/mit-license.php>`_, which does not contain
+the binary redistribution clause. As a user of these runtime libraries, it
+means that you can choose to use the code under either license (and thus don't
+need the binary redistribution clause), and as a contributor to the code that
+you agree that any contributions to these libraries be licensed under both
+licenses. We feel that this is important for runtime libraries, because they
+are implicitly linked into applications and therefore should not subject those
+applications to the binary redistribution clause. This also means that it is ok
+to move code from (e.g.) libc++ to the LLVM core without concern, but that code
+cannot be moved from the LLVM core to libc++ without the copyright owner's
+permission.
+
+Note that the LLVM Project does distribute llvm-gcc and dragonegg, **which are
+GPL.** This means that anything "linked" into llvm-gcc must itself be compatible
+with the GPL, and must be releasable under the terms of the GPL. This implies
+that **any code linked into llvm-gcc and distributed to others may be subject to
+the viral aspects of the GPL** (for example, a proprietary code generator linked
+into llvm-gcc must be made available under the GPL). This is not a problem for
+code already distributed under a more liberal license (like the UIUC license),
+and GPL-containing subprojects are kept in separate SVN repositories whose
+LICENSE.txt files specifically indicate that they contain GPL code.
+
+We have no plans to change the license of LLVM. If you have questions or
+comments about the license, please contact the `LLVM Developer's Mailing
+List <mailto:llvmdev@cs.uiuc.edu>`_.
+
+Patents
+-------
+
+To the best of our knowledge, LLVM does not infringe on any patents (we have
+actually removed code from LLVM in the past that was found to infringe). Having
+code in LLVM that infringes on patents would violate an important goal of the
+project by making it hard or impossible to reuse the code for arbitrary purposes
+(including commercial use).
+
+When contributing code, we expect contributors to notify us of any potential for
+patent-related trouble with their changes (including from third parties). If
+you or your employer own the rights to a patent and would like to contribute
+code to LLVM that relies on it, we require that the copyright owner sign an
+agreement that allows any other user of LLVM to freely use your patent. Please
+contact the `oversight group <mailto:llvm-oversight@cs.uiuc.edu>`_ for more
+details.
diff --git a/docs/ExceptionHandling.html b/docs/ExceptionHandling.html
deleted file mode 100644
index 49e6b01..0000000
--- a/docs/ExceptionHandling.html
+++ /dev/null
@@ -1,563 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <title>Exception Handling in LLVM</title>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <meta name="description"
- content="Exception Handling in LLVM.">
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-
-<body>
-
-<h1>Exception Handling in LLVM</h1>
-
-<table class="layout" style="width:100%">
- <tr class="layout">
- <td class="left">
-<ul>
- <li><a href="#introduction">Introduction</a>
- <ol>
- <li><a href="#itanium">Itanium ABI Zero-cost Exception Handling</a></li>
- <li><a href="#sjlj">Setjmp/Longjmp Exception Handling</a></li>
- <li><a href="#overview">Overview</a></li>
- </ol></li>
- <li><a href="#codegen">LLVM Code Generation</a>
- <ol>
- <li><a href="#throw">Throw</a></li>
- <li><a href="#try_catch">Try/Catch</a></li>
- <li><a href="#cleanups">Cleanups</a></li>
- <li><a href="#throw_filters">Throw Filters</a></li>
- <li><a href="#restrictions">Restrictions</a></li>
- </ol></li>
- <li><a href="#format_common_intrinsics">Exception Handling Intrinsics</a>
- <ol>
- <li><a href="#llvm_eh_typeid_for"><tt>llvm.eh.typeid.for</tt></a></li>
- <li><a href="#llvm_eh_sjlj_setjmp"><tt>llvm.eh.sjlj.setjmp</tt></a></li>
- <li><a href="#llvm_eh_sjlj_longjmp"><tt>llvm.eh.sjlj.longjmp</tt></a></li>
- <li><a href="#llvm_eh_sjlj_lsda"><tt>llvm.eh.sjlj.lsda</tt></a></li>
- <li><a href="#llvm_eh_sjlj_callsite"><tt>llvm.eh.sjlj.callsite</tt></a></li>
- </ol></li>
- <li><a href="#asm">Asm Table Formats</a>
- <ol>
- <li><a href="#unwind_tables">Exception Handling Frame</a></li>
- <li><a href="#exception_tables">Exception Tables</a></li>
- </ol></li>
-</ul>
-</td>
-</tr></table>
-
-<div class="doc_author">
- <p>Written by the <a href="http://llvm.org/">LLVM Team</a></p>
-</div>
-
-
-<!-- *********************************************************************** -->
-<h2><a name="introduction">Introduction</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document is the central repository for all information pertaining to
- exception handling in LLVM. It describes the format that LLVM exception
- handling information takes, which is useful for those interested in creating
- front-ends or dealing directly with the information. Further, this document
- provides specific examples of what exception handling information is used for
- in C and C++.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="itanium">Itanium ABI Zero-cost Exception Handling</a>
-</h3>
-
-<div>
-
-<p>Exception handling for most programming languages is designed to recover from
- conditions that rarely occur during general use of an application. To that
- end, exception handling should not interfere with the main flow of an
- application's algorithm by performing checkpointing tasks, such as saving the
- current pc or register state.</p>
-
-<p>The Itanium ABI Exception Handling Specification defines a methodology for
- providing outlying data in the form of exception tables without inlining
- speculative exception handling code in the flow of an application's main
- algorithm. Thus, the specification is said to add "zero-cost" to the normal
- execution of an application.</p>
-
-<p>A more complete description of the Itanium ABI exception handling runtime
-   support can be found at
- <a href="http://www.codesourcery.com/cxx-abi/abi-eh.html">Itanium C++ ABI:
- Exception Handling</a>. A description of the exception frame format can be
- found at
- <a href="http://refspecs.freestandards.org/LSB_3.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html">Exception
- Frames</a>, with details of the DWARF 4 specification at
- <a href="http://dwarfstd.org/Dwarf4Std.php">DWARF 4 Standard</a>.
- A description for the C++ exception table formats can be found at
- <a href="http://www.codesourcery.com/cxx-abi/exceptions.pdf">Exception Handling
- Tables</a>.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="sjlj">Setjmp/Longjmp Exception Handling</a>
-</h3>
-
-<div>
-
-<p>Setjmp/Longjmp (SJLJ) based exception handling uses LLVM intrinsics
- <a href="#llvm_eh_sjlj_setjmp"><tt>llvm.eh.sjlj.setjmp</tt></a> and
- <a href="#llvm_eh_sjlj_longjmp"><tt>llvm.eh.sjlj.longjmp</tt></a> to
- handle control flow for exception handling.</p>
-
-<p>Every function that does exception processing &mdash; be
-   it <tt>try</tt>/<tt>catch</tt> blocks or cleanups &mdash; registers itself
-   on a global frame list. When exceptions are unwinding, the runtime uses
-   this list to identify which functions need processing.</p>
-
-<p>Landing pad selection is encoded in the call site entry of the function
- context. The runtime returns to the function via
- <a href="#llvm_eh_sjlj_longjmp"><tt>llvm.eh.sjlj.longjmp</tt></a>, where
- a switch table transfers control to the appropriate landing pad based on
- the index stored in the function context.</p>
-
-<p>In contrast to DWARF exception handling, which encodes exception regions
- and frame information in out-of-line tables, SJLJ exception handling
- builds and removes the unwind frame context at runtime. This results in
- faster exception handling at the expense of slower execution when no
- exceptions are thrown. As exceptions are, by their nature, intended for
- uncommon code paths, DWARF exception handling is generally preferred to
- SJLJ.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="overview">Overview</a>
-</h3>
-
-<div>
-
-<p>When an exception is thrown in LLVM code, the runtime does its best to find a
- handler suited to processing the circumstance.</p>
-
-<p>The runtime first attempts to find an <i>exception frame</i> corresponding to
- the function where the exception was thrown. If the programming language
- supports exception handling (e.g. C++), the exception frame contains a
- reference to an exception table describing how to process the exception. If
- the language does not support exception handling (e.g. C), or if the
- exception needs to be forwarded to a prior activation, the exception frame
- contains information about how to unwind the current activation and restore
- the state of the prior activation. This process is repeated until the
- exception is handled. If the exception is not handled and no activations
- remain, then the application is terminated with an appropriate error
- message.</p>
-
-<p>Because different programming languages have different behaviors when
- handling exceptions, the exception handling ABI provides a mechanism for
- supplying <i>personalities</i>. An exception handling personality is defined
- by way of a <i>personality function</i> (e.g. <tt>__gxx_personality_v0</tt>
- in C++), which receives the context of the exception, an <i>exception
- structure</i> containing the exception object type and value, and a reference
- to the exception table for the current function. The personality function
- for the current compile unit is specified in a <i>common exception
- frame</i>.</p>
-
-<p>The organization of an exception table is language dependent. For C++, an
- exception table is organized as a series of code ranges defining what to do
- if an exception occurs in that range. Typically, the information associated
-   with a range defines which types of exception objects (using C++ <i>type
-   info</i>) are handled in that range, and an associated action that
- should take place. Actions typically pass control to a <i>landing
- pad</i>.</p>
-
-<p>A landing pad corresponds roughly to the code found in the <tt>catch</tt>
- portion of a <tt>try</tt>/<tt>catch</tt> sequence. When execution resumes at
- a landing pad, it receives an <i>exception structure</i> and a
- <i>selector value</i> corresponding to the <i>type</i> of exception
- thrown. The selector is then used to determine which <i>catch</i> should
- actually process the exception.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h2>
- <a name="codegen">LLVM Code Generation</a>
-</h2>
-
-<div>
-
-<p>From a C++ developer's perspective, exceptions are defined in terms of the
- <tt>throw</tt> and <tt>try</tt>/<tt>catch</tt> statements. In this section
- we will describe the implementation of LLVM exception handling in terms of
- C++ examples.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="throw">Throw</a>
-</h3>
-
-<div>
-
-<p>Languages that support exception handling typically provide a <tt>throw</tt>
- operation to initiate the exception process. Internally, a <tt>throw</tt>
- operation breaks down into two steps.</p>
-
-<ol>
- <li>A request is made to allocate exception space for an exception structure.
- This structure needs to survive beyond the current activation. This
- structure will contain the type and value of the object being thrown.</li>
-
- <li>A call is made to the runtime to raise the exception, passing the
- exception structure as an argument.</li>
-</ol>
-
-<p>In C++, the allocation of the exception structure is done by the
- <tt>__cxa_allocate_exception</tt> runtime function. The exception raising is
- handled by <tt>__cxa_throw</tt>. The type of the exception is represented
- using a C++ RTTI structure.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="try_catch">Try/Catch</a>
-</h3>
-
-<div>
-
-<p>A call within the scope of a <i>try</i> statement can potentially raise an
- exception. In those circumstances, the LLVM C++ front-end replaces the call
- with an <tt>invoke</tt> instruction. Unlike a call, the <tt>invoke</tt> has
- two potential continuation points:</p>
-
-<ol>
- <li>where to continue when the call succeeds as per normal, and</li>
-
- <li>where to continue if the call raises an exception, either by a throw or
- the unwinding of a throw</li>
-</ol>
-
-<p>The place where an <tt>invoke</tt> continues after an exception is raised
-   is called a <i>landing pad</i>. LLVM landing pads are
- conceptually alternative function entry points where an exception structure
- reference and a type info index are passed in as arguments. The landing pad
- saves the exception structure reference and then proceeds to select the catch
- block that corresponds to the type info of the exception object.</p>
-
-<p>The LLVM <a href="LangRef.html#i_landingpad"><tt>landingpad</tt>
- instruction</a> is used to convey information about the landing pad to the
- back end. For C++, the <tt>landingpad</tt> instruction returns a pointer and
- integer pair corresponding to the pointer to the <i>exception structure</i>
- and the <i>selector value</i> respectively.</p>
-
-<p>The <tt>landingpad</tt> instruction takes a reference to the personality
- function to be used for this <tt>try</tt>/<tt>catch</tt> sequence. The
- remainder of the instruction is a list of <i>cleanup</i>, <i>catch</i>,
- and <i>filter</i> clauses. The exception is tested against the clauses
- sequentially from first to last. The selector value is a positive number if
- the exception matched a type info, a negative number if it matched a filter,
- and zero if it matched a cleanup. If nothing is matched, the behavior of
- the program is <a href="#restrictions">undefined</a>. If a type info matched,
- then the selector value is the index of the type info in the exception table,
- which can be obtained using the
- <a href="#llvm_eh_typeid_for"><tt>llvm.eh.typeid.for</tt></a> intrinsic.</p>
-
-<p>Once the landing pad has the type info selector, the code branches to the
- code for the first catch. The catch then checks the value of the type info
- selector against the index of type info for that catch. Since the type info
- index is not known until all the type infos have been gathered in the
- backend, the catch code must call the
- <a href="#llvm_eh_typeid_for"><tt>llvm.eh.typeid.for</tt></a> intrinsic to
- determine the index for a given type info. If the catch fails to match the
- selector then control is passed on to the next catch.</p>
-
-<p>Finally, the entry and exit of catch code is bracketed with calls to
- <tt>__cxa_begin_catch</tt> and <tt>__cxa_end_catch</tt>.</p>
-
-<ul>
- <li><tt>__cxa_begin_catch</tt> takes an exception structure reference as an
- argument and returns the value of the exception object.</li>
-
- <li><tt>__cxa_end_catch</tt> takes no arguments. This function:<br><br>
- <ol>
- <li>Locates the most recently caught exception and decrements its handler
- count,</li>
- <li>Removes the exception from the <i>caught</i> stack if the handler
- count goes to zero, and</li>
- <li>Destroys the exception if the handler count goes to zero and the
- exception was not re-thrown by throw.</li>
- </ol>
- <p><b>Note:</b> a rethrow from within the catch may replace this call with
- a <tt>__cxa_rethrow</tt>.</p></li>
-</ul>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="cleanups">Cleanups</a>
-</h3>
-
-<div>
-
-<p>A cleanup is extra code which needs to be run as part of unwinding a scope.
- C++ destructors are a typical example, but other languages and language
- extensions provide a variety of different kinds of cleanups. In general, a
- landing pad may need to run arbitrary amounts of cleanup code before actually
- entering a catch block. To indicate the presence of cleanups, a
- <a href="LangRef.html#i_landingpad"><tt>landingpad</tt> instruction</a>
- should have a <i>cleanup</i> clause. Otherwise, the unwinder will not stop at
- the landing pad if there are no catches or filters that require it to.</p>
-
-<p><b>Note:</b> Do not allow a new exception to propagate out of the execution
- of a cleanup. This can corrupt the internal state of the unwinder.
- Different languages describe different high-level semantics for these
- situations: for example, C++ requires that the process be terminated, whereas
- Ada cancels both exceptions and throws a third.</p>
-
-<p>When all cleanups are finished, if the exception is not handled by the
- current function, resume unwinding by calling the
- <a href="LangRef.html#i_resume"><tt>resume</tt> instruction</a>, passing in
- the result of the <tt>landingpad</tt> instruction for the original landing
- pad.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="throw_filters">Throw Filters</a>
-</h3>
-
-<div>
-
-<p>C++ allows the specification of which exception types may be thrown from a
- function. To represent this, a top level landing pad may exist to filter out
- invalid types. To express this in LLVM code the
- <a href="LangRef.html#i_landingpad"><tt>landingpad</tt> instruction</a> will
- have a filter clause. The clause consists of an array of type infos.
- <tt>landingpad</tt> will return a negative value if the exception does not
- match any of the type infos. If no match is found then a call
- to <tt>__cxa_call_unexpected</tt> should be made, otherwise
- <tt>_Unwind_Resume</tt>. Each of these functions requires a reference to the
- exception structure. Note that the most general form of a
- <a href="LangRef.html#i_landingpad"><tt>landingpad</tt> instruction</a> can
- have any number of catch, cleanup, and filter clauses (though having more
- than one cleanup is pointless). The LLVM C++ front-end can generate such
- <a href="LangRef.html#i_landingpad"><tt>landingpad</tt> instructions</a> due
- to inlining creating nested exception handling scopes.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="restrictions">Restrictions</a>
-</h3>
-
-<div>
-
-<p>The unwinder delegates the decision of whether to stop in a call frame to
- that call frame's language-specific personality function. Not all unwinders
- guarantee that they will stop to perform cleanups. For example, the GNU C++
- unwinder doesn't do so unless the exception is actually caught somewhere
- further up the stack.</p>
-
-<p>In order for inlining to behave correctly, landing pads must be prepared to
- handle selector results that they did not originally advertise. Suppose that
- a function catches exceptions of type <tt>A</tt>, and it's inlined into a
- function that catches exceptions of type <tt>B</tt>. The inliner will update
- the <tt>landingpad</tt> instruction for the inlined landing pad to include
- the fact that <tt>B</tt> is also caught. If that landing pad assumes that it
- will only be entered to catch an <tt>A</tt>, it's in for a rude awakening.
- Consequently, landing pads must test for the selector results they understand
- and then resume exception propagation with the
- <a href="LangRef.html#i_resume"><tt>resume</tt> instruction</a> if none of
- the conditions match.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h2>
- <a name="format_common_intrinsics">Exception Handling Intrinsics</a>
-</h2>
-
-<div>
-
-<p>In addition to the
- <a href="LangRef.html#i_landingpad"><tt>landingpad</tt></a> and
- <a href="LangRef.html#i_resume"><tt>resume</tt></a> instructions, LLVM uses
- several intrinsic functions (name prefixed with <i><tt>llvm.eh</tt></i>) to
- provide exception handling information at various points in generated
- code.</p>
-
-<!-- ======================================================================= -->
-<h4>
- <a name="llvm_eh_typeid_for">llvm.eh.typeid.for</a>
-</h4>
-
-<div>
-
-<pre>
- i32 @llvm.eh.typeid.for(i8* %type_info)
-</pre>
-
-<p>This intrinsic returns the type info index in the exception table of the
- current function. This value can be used to compare against the result
- of <a href="LangRef.html#i_landingpad"><tt>landingpad</tt> instruction</a>.
- The single argument is a reference to a type info.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h4>
- <a name="llvm_eh_sjlj_setjmp">llvm.eh.sjlj.setjmp</a>
-</h4>
-
-<div>
-
-<pre>
- i32 @llvm.eh.sjlj.setjmp(i8* %setjmp_buf)
-</pre>
-
-<p>For SJLJ based exception handling, this intrinsic forces register saving for
- the current function and stores the address of the following instruction for
- use as a destination address
- by <a href="#llvm_eh_sjlj_longjmp"><tt>llvm.eh.sjlj.longjmp</tt></a>. The
- buffer format and the overall functioning of this intrinsic is compatible
- with the GCC <tt>__builtin_setjmp</tt> implementation allowing code built
- with the clang and GCC to interoperate.</p>
-
-<p>The single parameter is a pointer to a five word buffer in which the calling
- context is saved. The front end places the frame pointer in the first word,
- and the target implementation of this intrinsic should place the destination
- address for a
- <a href="#llvm_eh_sjlj_longjmp"><tt>llvm.eh.sjlj.longjmp</tt></a> in the
- second word. The following three words are available for use in a
- target-specific manner.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h4>
- <a name="llvm_eh_sjlj_longjmp">llvm.eh.sjlj.longjmp</a>
-</h4>
-
-<div>
-
-<pre>
- void @llvm.eh.sjlj.longjmp(i8* %setjmp_buf)
-</pre>
-
-<p>For SJLJ based exception handling, the <tt>llvm.eh.sjlj.longjmp</tt>
- intrinsic is used to implement <tt>__builtin_longjmp()</tt>. The single
- parameter is a pointer to a buffer populated
- by <a href="#llvm_eh_sjlj_setjmp"><tt>llvm.eh.sjlj.setjmp</tt></a>. The frame
- pointer and stack pointer are restored from the buffer, then control is
- transferred to the destination address.</p>
-
-</div>
-<!-- ======================================================================= -->
-<h4>
- <a name="llvm_eh_sjlj_lsda">llvm.eh.sjlj.lsda</a>
-</h4>
-
-<div>
-
-<pre>
- i8* @llvm.eh.sjlj.lsda()
-</pre>
-
-<p>For SJLJ based exception handling, the <tt>llvm.eh.sjlj.lsda</tt> intrinsic
- returns the address of the Language Specific Data Area (LSDA) for the current
- function. The SJLJ front-end code stores this address in the exception
- handling function context for use by the runtime.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h4>
- <a name="llvm_eh_sjlj_callsite">llvm.eh.sjlj.callsite</a>
-</h4>
-
-<div>
-
-<pre>
- void @llvm.eh.sjlj.callsite(i32 %call_site_num)
-</pre>
-
-<p>For SJLJ based exception handling, the <tt>llvm.eh.sjlj.callsite</tt>
- intrinsic identifies the callsite value associated with the
- following <tt>invoke</tt> instruction. This is used to ensure that landing
- pad entries in the LSDA are generated in matching order.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h2>
- <a name="asm">Asm Table Formats</a>
-</h2>
-
-<div>
-
-<p>There are two tables that are used by the exception handling runtime to
- determine which actions should be taken when an exception is thrown.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="unwind_tables">Exception Handling Frame</a>
-</h3>
-
-<div>
-
-<p>An exception handling frame <tt>eh_frame</tt> is very similar to the unwind
- frame used by DWARF debug info. The frame contains all the information
- necessary to tear down the current frame and restore the state of the prior
- frame. There is an exception handling frame for each function in a compile
- unit, plus a common exception handling frame that defines information common
- to all functions in the unit.</p>
-
-<!-- Todo - Table details here. -->
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="exception_tables">Exception Tables</a>
-</h3>
-
-<div>
-
-<p>An exception table contains information about what actions to take when an
- exception is thrown in a particular part of a function's code. There is one
- exception table per function, except leaf functions and functions that have
- calls only to non-throwing functions. They do not need an exception
- table.</p>
-
-<!-- Todo - Table details here. -->
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-03-27 13:25:16 +0200 (Tue, 27 Mar 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/ExceptionHandling.rst b/docs/ExceptionHandling.rst
new file mode 100644
index 0000000..190f182
--- /dev/null
+++ b/docs/ExceptionHandling.rst
@@ -0,0 +1,367 @@
+.. _exception_handling:
+
+==========================
+Exception Handling in LLVM
+==========================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+This document is the central repository for all information pertaining to
+exception handling in LLVM. It describes the format that LLVM exception
+handling information takes, which is useful for those interested in creating
+front-ends or dealing directly with the information. Further, this document
+provides specific examples of what exception handling information is used for in
+C and C++.
+
+Itanium ABI Zero-cost Exception Handling
+----------------------------------------
+
+Exception handling for most programming languages is designed to recover from
+conditions that rarely occur during general use of an application. To that end,
+exception handling should not interfere with the main flow of an application's
+algorithm by performing checkpointing tasks, such as saving the current pc or
+register state.
+
+The Itanium ABI Exception Handling Specification defines a methodology for
+providing outlying data in the form of exception tables without inlining
+speculative exception handling code in the flow of an application's main
+algorithm. Thus, the specification is said to add "zero-cost" to the normal
+execution of an application.
+
+A more complete description of the Itanium ABI exception handling runtime
+support can be found at `Itanium C++ ABI: Exception Handling
+<http://www.codesourcery.com/cxx-abi/abi-eh.html>`_. A description of the
+exception frame format can be found at `Exception Frames
+<http://refspecs.freestandards.org/LSB_3.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html>`_,
+with details of the DWARF 4 specification at `DWARF 4 Standard
+<http://dwarfstd.org/Dwarf4Std.php>`_. A description for the C++ exception
+table formats can be found at `Exception Handling Tables
+<http://www.codesourcery.com/cxx-abi/exceptions.pdf>`_.
+
+Setjmp/Longjmp Exception Handling
+---------------------------------
+
+Setjmp/Longjmp (SJLJ) based exception handling uses LLVM intrinsics
+`llvm.eh.sjlj.setjmp`_ and `llvm.eh.sjlj.longjmp`_ to handle control flow for
+exception handling.
+
+Each function that does exception processing, be it ``try``/``catch``
+blocks or cleanups, registers itself on a global frame
+list. When exceptions are unwinding, the runtime uses this list to identify
+which functions need processing.
+
+Landing pad selection is encoded in the call site entry of the function
+context. The runtime returns to the function via `llvm.eh.sjlj.longjmp`_, where
+a switch table transfers control to the appropriate landing pad based on the
+index stored in the function context.
+
+In contrast to DWARF exception handling, which encodes exception regions and
+frame information in out-of-line tables, SJLJ exception handling builds and
+removes the unwind frame context at runtime. This results in faster exception
+handling at the expense of slower execution when no exceptions are thrown. As
+exceptions are, by their nature, intended for uncommon code paths, DWARF
+exception handling is generally preferred to SJLJ.
+
+Overview
+--------
+
+When an exception is thrown in LLVM code, the runtime does its best to find a
+handler suited to processing the circumstance.
+
+The runtime first attempts to find an *exception frame* corresponding to the
+function where the exception was thrown. If the programming language supports
+exception handling (e.g. C++), the exception frame contains a reference to an
+exception table describing how to process the exception. If the language does
+not support exception handling (e.g. C), or if the exception needs to be
+forwarded to a prior activation, the exception frame contains information about
+how to unwind the current activation and restore the state of the prior
+activation. This process is repeated until the exception is handled. If the
+exception is not handled and no activations remain, then the application is
+terminated with an appropriate error message.
+
+Because different programming languages have different behaviors when handling
+exceptions, the exception handling ABI provides a mechanism for
+supplying *personalities*. An exception handling personality is defined by
+way of a *personality function* (e.g. ``__gxx_personality_v0`` in C++),
+which receives the context of the exception, an *exception structure*
+containing the exception object type and value, and a reference to the exception
+table for the current function. The personality function for the current
+compile unit is specified in a *common exception frame*.
+
+The organization of an exception table is language dependent. For C++, an
+exception table is organized as a series of code ranges defining what to do if
+an exception occurs in that range. Typically, the information associated with a
+range defines which types of exception objects (using C++ *type info*) are
+handled in that range, and an associated action that should take place. Actions
+typically pass control to a *landing pad*.
+
+A landing pad corresponds roughly to the code found in the ``catch`` portion of
+a ``try``/``catch`` sequence. When execution resumes at a landing pad, it
+receives an *exception structure* and a *selector value* corresponding to the
+*type* of exception thrown. The selector is then used to determine which *catch*
+should actually process the exception.
+
+LLVM Code Generation
+====================
+
+From a C++ developer's perspective, exceptions are defined in terms of the
+``throw`` and ``try``/``catch`` statements. In this section we will describe the
+implementation of LLVM exception handling in terms of C++ examples.
+
+Throw
+-----
+
+Languages that support exception handling typically provide a ``throw``
+operation to initiate the exception process. Internally, a ``throw`` operation
+breaks down into two steps.
+
+#. A request is made to allocate exception space for an exception structure.
+ This structure needs to survive beyond the current activation. This structure
+ will contain the type and value of the object being thrown.
+
+#. A call is made to the runtime to raise the exception, passing the exception
+ structure as an argument.
+
+In C++, the allocation of the exception structure is done by the
+``__cxa_allocate_exception`` runtime function. The exception raising is handled
+by ``__cxa_throw``. The type of the exception is represented using a C++ RTTI
+structure.
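+
+As an illustration, ``throw 1;`` in C++ might lower to IR along these lines
+(a sketch; the ``@_ZTIi`` type info name and the exact pointer casts are
+illustrative):
+
+.. code-block:: llvm
+
+ ; allocate 4 bytes that outlive this activation, store the thrown value,
+ ; then hand the exception to the runtime; __cxa_throw does not return
+ %exn = call i8* @__cxa_allocate_exception(i64 4)
+ %payload = bitcast i8* %exn to i32*
+ store i32 1, i32* %payload
+ call void @__cxa_throw(i8* %exn, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+ unreachable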
+
+Try/Catch
+---------
+
+A call within the scope of a *try* statement can potentially raise an
+exception. In those circumstances, the LLVM C++ front-end replaces the call with
+an ``invoke`` instruction. Unlike a call, the ``invoke`` has two potential
+continuation points:
+
+#. where to continue when the call succeeds as per normal, and
+
+#. where to continue if the call raises an exception, either by a direct throw
+ or by the unwinding of a throw from deeper in the call chain
+
+The place where an ``invoke`` continues after an exception is raised is
+called a *landing pad*. LLVM landing pads are conceptually
+alternative function entry points where an exception structure reference and a
+type info index are passed in as arguments. The landing pad saves the exception
+structure reference and then proceeds to select the catch block that corresponds
+to the type info of the exception object.
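+
+For example, a call inside a ``try`` block might be emitted as (block names
+are illustrative):
+
+.. code-block:: llvm
+
+ ; unlike call, invoke names both a normal and an unwind successor
+ invoke void @may_throw() to label %invoke.cont unwind label %lpad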
+
+The LLVM `landingpad instruction <LangRef.html#i_landingpad>`_ is used to convey
+information about the landing pad to the back end. For C++, the ``landingpad``
+instruction returns a pointer and integer pair corresponding to the pointer to
+the *exception structure* and the *selector value* respectively.
+
+The ``landingpad`` instruction takes a reference to the personality function to
+be used for this ``try``/``catch`` sequence. The remainder of the instruction is
+a list of *cleanup*, *catch*, and *filter* clauses. The exception is tested
+against the clauses sequentially from first to last. The selector value is a
+positive number if the exception matched a type info, a negative number if it
+matched a filter, and zero if it matched a cleanup. If nothing is matched, the
+behavior of the program is `undefined`_. If a type info matched, then the
+selector value is the index of the type info in the exception table, which can
+be obtained using the `llvm.eh.typeid.for`_ intrinsic.
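+
+A landing pad for a ``catch (int)`` clause might look like this sketch
+(the personality reference and type info casts are illustrative):
+
+.. code-block:: llvm
+
+ lpad:
+   %lp = landingpad { i8*, i32 }
+           personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+           catch i8* bitcast (i8** @_ZTIi to i8*)
+   %exn = extractvalue { i8*, i32 } %lp, 0    ; exception structure pointer
+   %sel = extractvalue { i8*, i32 } %lp, 1    ; selector value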
+
+Once the landing pad has the type info selector, the code branches to the code
+for the first catch. The catch then checks the value of the type info selector
+against the index of type info for that catch. Since the type info index is not
+known until all the type infos have been gathered in the backend, the catch code
+must call the `llvm.eh.typeid.for`_ intrinsic to determine the index for a given
+type info. If the catch fails to match the selector then control is passed on to
+the next catch.
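+
+A minimal sketch of this dispatch, reusing the ``%sel`` value extracted
+above:
+
+.. code-block:: llvm
+
+ %tid = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+ %matches = icmp eq i32 %sel, %tid    ; does the selector match this catch?
+ br i1 %matches, label %catch.int, label %eh.resume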
+
+Finally, the entry and exit of catch code are bracketed with calls to
+``__cxa_begin_catch`` and ``__cxa_end_catch`` (a sketch follows the list
+below).
+
+* ``__cxa_begin_catch`` takes an exception structure reference as an argument
+ and returns the value of the exception object.
+
+* ``__cxa_end_catch`` takes no arguments. This function:
+
+ #. Locates the most recently caught exception and decrements its handler
+ count,
+
+ #. Removes the exception from the *caught* stack if the handler count goes to
+ zero, and
+
+ #. Destroys the exception if the handler count goes to zero and the exception
+ was not re-thrown by throw.
+
+ .. note::
+
+ a rethrow from within the catch may replace this call with a
+ ``__cxa_rethrow``.
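+
+A sketch of the bracketing (block and value names are illustrative):
+
+.. code-block:: llvm
+
+ catch.int:
+   %obj = call i8* @__cxa_begin_catch(i8* %exn) ; returns the exception object
+   ; ... user code for the catch body ...
+   call void @__cxa_end_catch()
+   br label %try.cont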
+
+Cleanups
+--------
+
+A cleanup is extra code which needs to be run as part of unwinding a scope. C++
+destructors are a typical example, but other languages and language extensions
+provide a variety of different kinds of cleanups. In general, a landing pad may
+need to run arbitrary amounts of cleanup code before actually entering a catch
+block. To indicate the presence of cleanups, a `landingpad
+instruction <LangRef.html#i_landingpad>`_ should have a *cleanup*
+clause. Otherwise, the unwinder will not stop at the landing pad if there are no
+catches or filters that require it to.
+
+.. note::
+
+ Do not allow a new exception to propagate out of the execution of a
+ cleanup. This can corrupt the internal state of the unwinder. Different
+ languages describe different high-level semantics for these situations: for
+ example, C++ requires that the process be terminated, whereas Ada cancels both
+ exceptions and throws a third.
+
+When all cleanups are finished, if the exception is not handled by the current
+function, resume unwinding by calling the `resume
+instruction <LangRef.html#i_resume>`_, passing in the result of the
+``landingpad`` instruction for the original landing pad.
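+
+A cleanup-only landing pad might therefore look like this sketch:
+
+.. code-block:: llvm
+
+ lpad:
+   %lp = landingpad { i8*, i32 }
+           personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+           cleanup
+   ; ... run destructors for objects in scope ...
+   resume { i8*, i32 } %lp    ; continue unwinding in the caller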
+
+Throw Filters
+-------------
+
+C++ allows the specification of which exception types may be thrown from a
+function. To represent this, a top level landing pad may exist to filter out
+invalid types. To express this in LLVM code the `landingpad
+instruction <LangRef.html#i_landingpad>`_ will have a filter clause. The clause
+consists of an array of type infos. ``landingpad`` will return a negative value
+if the exception does not match any of the type infos. If no match is found then
+a call to ``__cxa_call_unexpected`` should be made, otherwise
+``_Unwind_Resume``. Each of these functions requires a reference to the
+exception structure. Note that the most general form of a ``landingpad``
+instruction can have any number of catch, cleanup, and filter clauses (though
+having more than one cleanup is pointless). The LLVM C++ front-end can generate
+such ``landingpad`` instructions due to inlining creating nested exception
+handling scopes.
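+
+A filter for a function declared ``throw (int)`` might be expressed as this
+sketch:
+
+.. code-block:: llvm
+
+ lpad:
+   %lp = landingpad { i8*, i32 }
+           personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+           filter [1 x i8*] [i8* bitcast (i8** @_ZTIi to i8*)]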
+
+.. _undefined:
+
+Restrictions
+------------
+
+The unwinder delegates the decision of whether to stop in a call frame to that
+call frame's language-specific personality function. Not all unwinders guarantee
+that they will stop to perform cleanups. For example, the GNU C++ unwinder
+doesn't do so unless the exception is actually caught somewhere further up the
+stack.
+
+In order for inlining to behave correctly, landing pads must be prepared to
+handle selector results that they did not originally advertise. Suppose that a
+function catches exceptions of type ``A``, and it's inlined into a function that
+catches exceptions of type ``B``. The inliner will update the ``landingpad``
+instruction for the inlined landing pad to include the fact that ``B`` is also
+caught. If that landing pad assumes that it will only be entered to catch an
+``A``, it's in for a rude awakening. Consequently, landing pads must test for
+the selector results they understand and then resume exception propagation with
+the `resume instruction <LangRef.html#i_resume>`_ if none of the conditions
+match.
+
+Exception Handling Intrinsics
+=============================
+
+In addition to the ``landingpad`` and ``resume`` instructions, LLVM uses several
+intrinsic functions (name prefixed with ``llvm.eh``) to provide exception
+handling information at various points in generated code.
+
+.. _llvm.eh.typeid.for:
+
+llvm.eh.typeid.for
+------------------
+
+.. code-block:: llvm
+
+ i32 @llvm.eh.typeid.for(i8* %type_info)
+
+This intrinsic returns the type info index in the exception table of the current
+function. This value can be used to compare against the result of the
+``landingpad`` instruction. The single argument is a reference to a type info.
+
+.. _llvm.eh.sjlj.setjmp:
+
+llvm.eh.sjlj.setjmp
+-------------------
+
+.. code-block:: llvm
+
+ i32 @llvm.eh.sjlj.setjmp(i8* %setjmp_buf)
+
+For SJLJ based exception handling, this intrinsic forces register saving for the
+current function and stores the address of the following instruction for use as
+a destination address by `llvm.eh.sjlj.longjmp`_. The buffer format and the
+overall functioning of this intrinsic are compatible with the GCC
+``__builtin_setjmp`` implementation, allowing code built with clang and GCC
+to interoperate.
+
+The single parameter is a pointer to a five word buffer in which the calling
+context is saved. The front end places the frame pointer in the first word, and
+the target implementation of this intrinsic should place the destination address
+for a `llvm.eh.sjlj.longjmp`_ in the second word. The following three words are
+available for use in a target-specific manner.
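+
+A front end might therefore emit something like the following sketch (the
+buffer layout beyond the first two words is target-specific, and the frame
+pointer store is shown only schematically):
+
+.. code-block:: llvm
+
+ %buf = alloca [5 x i8*]    ; five-word setjmp buffer; word 0 holds the
+                            ; frame pointer, word 1 the longjmp destination
+ %ptr = bitcast [5 x i8*]* %buf to i8*
+ %r = call i32 @llvm.eh.sjlj.setjmp(i8* %ptr)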
+
+.. _llvm.eh.sjlj.longjmp:
+
+llvm.eh.sjlj.longjmp
+--------------------
+
+.. code-block:: llvm
+
+ void @llvm.eh.sjlj.longjmp(i8* %setjmp_buf)
+
+For SJLJ based exception handling, the ``llvm.eh.sjlj.longjmp`` intrinsic is
+used to implement ``__builtin_longjmp()``. The single parameter is a pointer to
+a buffer populated by `llvm.eh.sjlj.setjmp`_. The frame pointer and stack
+pointer are restored from the buffer, then control is transferred to the
+destination address.
+
+llvm.eh.sjlj.lsda
+-----------------
+
+.. code-block:: llvm
+
+ i8* @llvm.eh.sjlj.lsda()
+
+For SJLJ based exception handling, the ``llvm.eh.sjlj.lsda`` intrinsic returns
+the address of the Language Specific Data Area (LSDA) for the current
+function. The SJLJ front-end code stores this address in the exception handling
+function context for use by the runtime.
+
+llvm.eh.sjlj.callsite
+---------------------
+
+.. code-block:: llvm
+
+ void @llvm.eh.sjlj.callsite(i32 %call_site_num)
+
+For SJLJ based exception handling, the ``llvm.eh.sjlj.callsite`` intrinsic
+identifies the callsite value associated with the following ``invoke``
+instruction. This is used to ensure that landing pad entries in the LSDA are
+generated in matching order.
+
+Asm Table Formats
+=================
+
+There are two tables that are used by the exception handling runtime to
+determine which actions should be taken when an exception is thrown.
+
+Exception Handling Frame
+------------------------
+
+An exception handling frame ``eh_frame`` is very similar to the unwind frame
+used by DWARF debug info. The frame contains all the information necessary to
+tear down the current frame and restore the state of the prior frame. There is
+an exception handling frame for each function in a compile unit, plus a common
+exception handling frame that defines information common to all functions in the
+unit.
+
+Exception Tables
+----------------
+
+An exception table contains information about what actions to take when an
+exception is thrown in a particular part of a function's code. There is one
+exception table per function, except for leaf functions and functions that
+call only non-throwing functions; they do not need an exception table.
diff --git a/docs/ExtendingLLVM.html b/docs/ExtendingLLVM.html
index a0cc4ea..6782787 100644
--- a/docs/ExtendingLLVM.html
+++ b/docs/ExtendingLLVM.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Extending LLVM: Adding instructions, intrinsics, types, etc.</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -372,7 +372,7 @@ void calcTypeName(const Type *Ty,
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
<br>
- Last modified: $Date: 2012-03-23 06:50:46 +0100 (Fri, 23 Mar 2012) $
+ Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
</address>
</body>
diff --git a/docs/FAQ.html b/docs/FAQ.html
deleted file mode 100644
index 78c0268..0000000
--- a/docs/FAQ.html
+++ /dev/null
@@ -1,948 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM: Frequently Asked Questions</title>
- <style type="text/css">
- @import url("llvm.css");
- .question { font-weight: bold }
- .answer { margin-left: 2em }
- </style>
-</head>
-<body>
-
-<h1>
- LLVM: Frequently Asked Questions
-</h1>
-
-<ol>
- <li><a href="#license">License</a>
- <ol>
- <li>Why are the LLVM source code and the front-end distributed under
- different licenses?</li>
-
- <li>Does the University of Illinois Open Source License really qualify as an
- "open source" license?</li>
-
- <li>Can I modify LLVM source code and redistribute the modified source?</li>
-
- <li>Can I modify LLVM source code and redistribute binaries or other tools
- based on it, without redistributing the source?</li>
- </ol></li>
-
- <li><a href="#source">Source code</a>
- <ol>
- <li>In what language is LLVM written?</li>
-
- <li>How portable is the LLVM source code?</li>
- </ol></li>
-
- <li><a href="#build">Build Problems</a>
- <ol>
- <li>When I run configure, it finds the wrong C compiler.</li>
-
- <li>The <tt>configure</tt> script finds the right C compiler, but it uses
- the LLVM linker from a previous build. What do I do?</li>
-
- <li>When creating a dynamic library, I get a strange GLIBC error.</li>
-
- <li>I've updated my source tree from Subversion, and now my build is trying
- to use a file/directory that doesn't exist.</li>
-
- <li>I've modified a Makefile in my source tree, but my build tree keeps
- using the old version. What do I do?</li>
-
- <li>I've upgraded to a new version of LLVM, and I get strange build
- errors.</li>
-
- <li>I've built LLVM and am testing it, but the tests freeze.</li>
-
- <li>Why do test results differ when I perform different types of
- builds?</li>
-
- <li>Compiling LLVM with GCC 3.3.2 fails, what should I do?</li>
-
- <li>Compiling LLVM with GCC succeeds, but the resulting tools do not work,
- what can be wrong?</li>
-
- <li>When I use the test suite, all of the C Backend tests fail. What is
- wrong?</li>
-
- <li>After Subversion update, rebuilding gives the error "No rule to make
- target".</li>
-
- <li><a href="#srcdir-objdir">When I compile LLVM-GCC with srcdir == objdir,
- it fails. Why?</a></li>
- </ol></li>
-
- <li><a href="#felangs">Source Languages</a>
- <ol>
- <li><a href="#langs">What source languages are supported?</a></li>
-
- <li><a href="#langirgen">I'd like to write a self-hosting LLVM compiler. How
- should I interface with the LLVM middle-end optimizers and back-end code
- generators?</a></li>
-
- <li><a href="#langhlsupp">What support is there for higher level source
- language constructs for building a compiler?</a></li>
-
- <li><a href="GetElementPtr.html">I don't understand the GetElementPtr
- instruction. Help!</a></li>
- </ol>
-
- <li><a href="#cfe">Using the GCC Front End</a>
- <ol>
- <li>When I compile software that uses a configure script, the configure
- script thinks my system has all of the header files and libraries it is
- testing for. How do I get configure to work correctly?</li>
-
- <li>When I compile code using the LLVM GCC front end, it complains that it
- cannot find libcrtend.a?</li>
-
- <li>How can I disable all optimizations when compiling code using the LLVM
- GCC front end?</li>
-
- <li><a href="#translatecxx">Can I use LLVM to convert C++ code to C
- code?</a></li>
-
- <li><a href="#platformindependent">Can I compile C or C++ code to
- platform-independent LLVM bitcode?</a></li>
- </ol>
- </li>
-
- <li><a href="#cfe_code">Questions about code generated by the GCC front-end</a>
- <ol>
- <li><a href="#iosinit">What is this <tt>llvm.global_ctors</tt> and
- <tt>_GLOBAL__I__tmp_webcompile...</tt> stuff that happens when I
- #include &lt;iostream&gt;?</a></li>
-
- <li><a href="#codedce">Where did all of my code go??</a></li>
-
- <li><a href="#undef">What is this "<tt>undef</tt>" thing that shows up in
- my code?</a></li>
-
- <li><a href="#callconvwrong">Why does instcombine + simplifycfg turn
- a call to a function with a mismatched calling convention into "unreachable"?
- Why not make the verifier reject it?</a></li>
- </ol>
- </li>
-</ol>
-
-<div class="doc_author">
- <p>Written by <a href="http://llvm.org/">The LLVM Team</a></p>
-</div>
-
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="license">License</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<div class="question">
-<p>Why are the LLVM source code and the front-end distributed under different
- licenses?</p>
-</div>
-
-<div class="answer">
-<p>The C/C++ front-ends are based on GCC and must be distributed under the GPL.
- Our aim is to distribute LLVM source code under a <em>much less
- restrictive</em> license, in particular one that does not compel users who
- distribute tools based on modifying the source to redistribute the modified
- source code as well.</p>
-</div>
-
-<div class="question">
-<p>Does the University of Illinois Open Source License really qualify as an
- "open source" license?</p>
-</div>
-
-<div class="answer">
-<p>Yes, the license
- is <a href="http://www.opensource.org/licenses/UoI-NCSA.php">certified</a> by
- the Open Source Initiative (OSI).</p>
-</div>
-
-<div class="question">
-<p>Can I modify LLVM source code and redistribute the modified source?</p>
-</div>
-
-<div class="answer">
-<p>Yes. The modified source distribution must retain the copyright notice and
- follow the three bulletted conditions listed in
- the <a href="http://llvm.org/svn/llvm-project/llvm/trunk/LICENSE.TXT">LLVM
- license</a>.</p>
-</div>
-
-<div class="question">
-<p>Can I modify LLVM source code and redistribute binaries or other tools based
- on it, without redistributing the source?</p>
-</div>
-
-<div class="answer">
-<p>Yes. This is why we distribute LLVM under a less restrictive license than
- GPL, as explained in the first question above.</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="source">Source Code</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<div class="question">
-<p>In what language is LLVM written?</p>
-</div>
-
-<div class="answer">
-<p>All of the LLVM tools and libraries are written in C++ with extensive use of
- the STL.</p>
-</div>
-
-<div class="question">
-<p>How portable is the LLVM source code?</p>
-</div>
-
-<div class="answer">
-<p>The LLVM source code should be portable to most modern UNIX-like operating
-systems. Most of the code is written in standard C++ with operating system
-services abstracted to a support library. The tools required to build and test
-LLVM have been ported to a plethora of platforms.</p>
-
-<p>Some porting problems may exist in the following areas:</p>
-
-<ul>
- <li>The GCC front end code is not as portable as the LLVM suite, so it may not
- compile as well on unsupported platforms.</li>
-
- <li>The LLVM build system relies heavily on UNIX shell tools, like the Bourne
- Shell and sed. Porting to systems without these tools (MacOS 9, Plan 9)
- will require more effort.</li>
-</ul>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="build">Build Problems</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<div class="question">
-<p>When I run configure, it finds the wrong C compiler.</p>
-</div>
-
-<div class="answer">
-<p>The <tt>configure</tt> script attempts to locate first <tt>gcc</tt> and then
- <tt>cc</tt>, unless it finds compiler paths set in <tt>CC</tt>
- and <tt>CXX</tt> for the C and C++ compiler, respectively.</p>
-
-<p>If <tt>configure</tt> finds the wrong compiler, either adjust your
- <tt>PATH</tt> environment variable or set <tt>CC</tt> and <tt>CXX</tt>
- explicitly.</p>
-
-</div>
-
-<div class="question">
-<p>The <tt>configure</tt> script finds the right C compiler, but it uses the
- LLVM linker from a previous build. What do I do?</p>
-</div>
-
-<div class="answer">
-<p>The <tt>configure</tt> script uses the <tt>PATH</tt> to find executables, so
- if it's grabbing the wrong linker/assembler/etc, there are two ways to fix
- it:</p>
-
-<ol>
- <li><p>Adjust your <tt>PATH</tt> environment variable so that the correct
- program appears first in the <tt>PATH</tt>. This may work, but may not be
- convenient when you want them <i>first</i> in your path for other
- work.</p></li>
-
- <li><p>Run <tt>configure</tt> with an alternative <tt>PATH</tt> that is
- correct. In a Borne compatible shell, the syntax would be:</p>
-
-<pre class="doc_code">
-% PATH=[the path without the bad program] ./configure ...
-</pre>
-
- <p>This is still somewhat inconvenient, but it allows <tt>configure</tt>
- to do its work without having to adjust your <tt>PATH</tt>
- permanently.</p></li>
-</ol>
-</div>
-
-<div class="question">
-<p>When creating a dynamic library, I get a strange GLIBC error.</p>
-</div>
-
-<div class="answer">
-<p>Under some operating systems (i.e. Linux), libtool does not work correctly if
- GCC was compiled with the --disable-shared option. To work around this,
- install your own version of GCC that has shared libraries enabled by
- default.</p>
-</div>
-
-<div class="question">
-<p>I've updated my source tree from Subversion, and now my build is trying to
- use a file/directory that doesn't exist.</p>
-</div>
-
-<div class="answer">
-<p>You need to re-run configure in your object directory. When new Makefiles
- are added to the source tree, they have to be copied over to the object tree
- in order to be used by the build.</p>
-</div>
-
-<div class="question">
-<p>I've modified a Makefile in my source tree, but my build tree keeps using the
- old version. What do I do?</p>
-</div>
-
-<div class="answer">
-<p>If the Makefile already exists in your object tree, you can just run the
- following command in the top level directory of your object tree:</p>
-
-<pre class="doc_code">
-% ./config.status &lt;relative path to Makefile&gt;
-</pre>
-
-<p>If the Makefile is new, you will have to modify the configure script to copy
- it over.</p>
-</div>
-
-<div class="question">
-<p>I've upgraded to a new version of LLVM, and I get strange build errors.</p>
-</div>
-
-<div class="answer">
-
-<p>Sometimes, changes to the LLVM source code alters how the build system works.
- Changes in libtool, autoconf, or header file dependencies are especially
- prone to this sort of problem.</p>
-
-<p>The best thing to try is to remove the old files and re-build. In most
- cases, this takes care of the problem. To do this, just type <tt>make
- clean</tt> and then <tt>make</tt> in the directory that fails to build.</p>
-</div>
-
-<div class="question">
-<p>I've built LLVM and am testing it, but the tests freeze.</p>
-</div>
-
-<div class="answer">
-<p>This is most likely occurring because you built a profile or release
- (optimized) build of LLVM and have not specified the same information on the
- <tt>gmake</tt> command line.</p>
-
-<p>For example, if you built LLVM with the command:</p>
-
-<pre class="doc_code">
-% gmake ENABLE_PROFILING=1
-</pre>
-
-<p>...then you must run the tests with the following commands:</p>
-
-<pre class="doc_code">
-% cd llvm/test
-% gmake ENABLE_PROFILING=1
-</pre>
-</div>
-
-<div class="question">
-<p>Why do test results differ when I perform different types of builds?</p>
-</div>
-
-<div class="answer">
-<p>The LLVM test suite is dependent upon several features of the LLVM tools and
- libraries.</p>
-
-<p>First, the debugging assertions in code are not enabled in optimized or
- profiling builds. Hence, tests that used to fail may pass.</p>
-
-<p>Second, some tests may rely upon debugging options or behavior that is only
- available in the debug build. These tests will fail in an optimized or
- profile build.</p>
-</div>
-
-<div class="question">
-<p>Compiling LLVM with GCC 3.3.2 fails, what should I do?</p>
-</div>
-
-<div class="answer">
-<p>This is <a href="http://gcc.gnu.org/bugzilla/show_bug.cgi?id=13392">a bug in
- GCC</a>, and affects projects other than LLVM. Try upgrading or downgrading
- your GCC.</p>
-</div>
-
-<div class="question">
-<p>Compiling LLVM with GCC succeeds, but the resulting tools do not work, what
- can be wrong?</p>
-</div>
-
-<div class="answer">
-<p>Several versions of GCC have shown a weakness in miscompiling the LLVM
- codebase. Please consult your compiler version (<tt>gcc --version</tt>) to
- find out whether it is <a href="GettingStarted.html#brokengcc">broken</a>.
- If so, your only option is to upgrade GCC to a known good version.</p>
-</div>
-
-<div class="question">
-<p>After Subversion update, rebuilding gives the error "No rule to make
- target".</p>
-</div>
-
-<div class="answer">
-<p>If the error is of the form:</p>
-
-<pre class="doc_code">
-gmake[2]: *** No rule to make target `/path/to/somefile', needed by
-`/path/to/another/file.d'.<br>
-Stop.
-</pre>
-
-<p>This may occur anytime files are moved within the Subversion repository or
- removed entirely. In this case, the best solution is to erase all
- <tt>.d</tt> files, which list dependencies for source files, and rebuild:</p>
-
-<pre class="doc_code">
-% cd $LLVM_OBJ_DIR
-% rm -f `find . -name \*\.d`
-% gmake
-</pre>
-
-<p>In other cases, it may be necessary to run <tt>make clean</tt> before
- rebuilding.</p>
-</div>
-
-<div class="question">
-<p><a name="srcdir-objdir">When I compile LLVM-GCC with srcdir == objdir, it
- fails. Why?</a></p>
-</div>
-
-<div class="answer">
-<p>The <tt>GNUmakefile</tt> in the top-level directory of LLVM-GCC is a special
- <tt>Makefile</tt> used by Apple to invoke the <tt>build_gcc</tt> script after
- setting up a special environment. This has the unfortunate side-effect that
- trying to build LLVM-GCC with srcdir == objdir in a "non-Apple way" invokes
- the <tt>GNUmakefile</tt> instead of <tt>Makefile</tt>. Because the
- environment isn't set up correctly to do this, the build fails.</p>
-
-<p>People not building LLVM-GCC the "Apple way" need to build LLVM-GCC with
- srcdir != objdir, or simply remove the GNUmakefile entirely.</p>
-
-<p>We regret the inconvenience.</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="felangs">Source Languages</a>
-</h2>
-
-<div>
-
-<div class="question">
-<p><a name="langs">What source languages are supported?</a></p>
-</div>
-
-<div class="answer">
-<p>LLVM currently has full support for C and C++ source languages. These are
- available through a special version of GCC that LLVM calls the
- <a href="#cfe">C Front End</a></p>
-
-<p>There is an incomplete version of a Java front end available in the
- <tt>java</tt> module. There is no documentation on this yet so you'll need to
- download the code, compile it, and try it.</p>
-
-<p>The PyPy developers are working on integrating LLVM into the PyPy backend so
- that PyPy language can translate to LLVM.</p>
-</div>
-
-<div class="question">
-<p><a name="langirgen">I'd like to write a self-hosting LLVM compiler. How
- should I interface with the LLVM middle-end optimizers and back-end code
- generators?</a></p>
-</div>
-
-<div class="answer">
-<p>Your compiler front-end will communicate with LLVM by creating a module in
- the LLVM intermediate representation (IR) format. Assuming you want to write
- your language's compiler in the language itself (rather than C++), there are
- 3 major ways to tackle generating LLVM IR from a front-end:</p>
-
-<ul>
- <li><strong>Call into the LLVM libraries code using your language's FFI
- (foreign function interface).</strong>
-
- <ul>
- <li><em>for:</em> best tracks changes to the LLVM IR, .ll syntax, and .bc
- format</li>
-
- <li><em>for:</em> enables running LLVM optimization passes without a
- emit/parse overhead</li>
-
- <li><em>for:</em> adapts well to a JIT context</li>
-
- <li><em>against:</em> lots of ugly glue code to write</li>
- </ul></li>
-
- <li> <strong>Emit LLVM assembly from your compiler's native language.</strong>
- <ul>
- <li><em>for:</em> very straightforward to get started</li>
-
- <li><em>against:</em> the .ll parser is slower than the bitcode reader
- when interfacing to the middle end</li>
-
- <li><em>against:</em> you'll have to re-engineer the LLVM IR object model
- and asm writer in your language</li>
-
- <li><em>against:</em> it may be harder to track changes to the IR</li>
- </ul></li>
-
- <li><strong>Emit LLVM bitcode from your compiler's native language.</strong>
-
- <ul>
- <li><em>for:</em> can use the more-efficient bitcode reader when
- interfacing to the middle end</li>
-
- <li><em>against:</em> you'll have to re-engineer the LLVM IR object
- model and bitcode writer in your language</li>
-
- <li><em>against:</em> it may be harder to track changes to the IR</li>
- </ul></li>
-</ul>
-
-<p>If you go with the first option, the C bindings in include/llvm-c should help
- a lot, since most languages have strong support for interfacing with C. The
- most common hurdle with calling C from managed code is interfacing with the
- garbage collector. The C interface was designed to require very little memory
- management, and so is straightforward in this regard.</p>
-</div>
-
-<div class="question">
-<p><a name="langhlsupp">What support is there for a higher level source language
- constructs for building a compiler?</a></p>
-</div>
-
-<div class="answer">
-<p>Currently, there isn't much. LLVM supports an intermediate representation
- which is useful for code representation but will not support the high level
- (abstract syntax tree) representation needed by most compilers. There are no
- facilities for lexical nor semantic analysis.</p>
-</div>
-
-<div class="question">
-<p><a name="getelementptr">I don't understand the GetElementPtr
- instruction. Help!</a></p>
-</div>
-
-<div class="answer">
-<p>See <a href="GetElementPtr.html">The Often Misunderstood GEP
- Instruction</a>.</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="cfe">Using the GCC Front End</a>
-</h2>
-
-<div>
-
-<div class="question">
-<p>When I compile software that uses a configure script, the configure script
- thinks my system has all of the header files and libraries it is testing for.
- How do I get configure to work correctly?</p>
-</div>
-
-<div class="answer">
-<p>The configure script is getting things wrong because the LLVM linker allows
- symbols to be undefined at link time (so that they can be resolved during JIT
- or translation to the C back end). That is why configure thinks your system
- "has everything."</p>
-
-<p>To work around this, perform the following steps:</p>
-
-<ol>
- <li>Make sure the CC and CXX environment variables contains the full path to
- the LLVM GCC front end.</li>
-
- <li>Make sure that the regular C compiler is first in your PATH. </li>
-
- <li>Add the string "-Wl,-native" to your CFLAGS environment variable.</li>
-</ol>
-
-<p>This will allow the <tt>llvm-ld</tt> linker to create a native code
- executable instead of shell script that runs the JIT. Creating native code
- requires standard linkage, which in turn will allow the configure script to
- find out if code is not linking on your system because the feature isn't
- available on your system.</p>
-</div>
-
-<div class="question">
-<p>When I compile code using the LLVM GCC front end, it complains that it cannot
- find libcrtend.a.
-</p>
-</div>
-
-<div class="answer">
-<p>The only way this can happen is if you haven't installed the runtime
- library. To correct this, do:</p>
-
-<pre class="doc_code">
-% cd llvm/runtime
-% make clean ; make install-bytecode
-</pre>
-</div>
-
-<div class="question">
-<p>How can I disable all optimizations when compiling code using the LLVM GCC
- front end?</p>
-</div>
-
-<div class="answer">
-<p>Passing "-Wa,-disable-opt -Wl,-disable-opt" will disable *all* cleanup and
- optimizations done at the llvm level, leaving you with the truly horrible
- code that you desire.</p>
-</div>
-
-
-<div class="question">
-<p><a name="translatecxx">Can I use LLVM to convert C++ code to C code?</a></p>
-</div>
-
-<div class="answer">
-<p>Yes, you can use LLVM to convert code from any language LLVM supports to C.
- Note that the generated C code will be very low level (all loops are lowered
- to gotos, etc) and not very pretty (comments are stripped, original source
- formatting is totally lost, variables are renamed, expressions are
- regrouped), so this may not be what you're looking for. Also, there are
- several limitations noted below.<p>
-
-<p>Use commands like this:</p>
-
-<ol>
- <li><p>Compile your program with llvm-g++:</p>
-
-<pre class="doc_code">
-% llvm-g++ -emit-llvm x.cpp -o program.bc -c
-</pre>
-
- <p>or:</p>
-
-<pre class="doc_code">
-% llvm-g++ a.cpp -c -emit-llvm
-% llvm-g++ b.cpp -c -emit-llvm
-% llvm-ld a.o b.o -o program
-</pre>
-
- <p>This will generate program and program.bc. The .bc
- file is the LLVM version of the program all linked together.</p></li>
-
- <li><p>Convert the LLVM code to C code, using the LLC tool with the C
- backend:</p>
-
-<pre class="doc_code">
-% llc -march=c program.bc -o program.c
-</pre></li>
-
- <li><p>Finally, compile the C file:</p>
-
-<pre class="doc_code">
-% cc x.c -lstdc++
-</pre></li>
-
-</ol>
-
-<p>Using LLVM does not eliminate the need for C++ library support. If you use
- the llvm-g++ front-end, the generated code will depend on g++'s C++ support
- libraries in the same way that code generated from g++ would. If you use
- another C++ front-end, the generated code will depend on whatever library
- that front-end would normally require.</p>
-
-<p>If you are working on a platform that does not provide any C++ libraries, you
- may be able to manually compile libstdc++ to LLVM bitcode, statically link it
- into your program, then use the commands above to convert the whole result
- into C code. Alternatively, you might compile the libraries and your
- application into two different chunks of C code and link them.</p>
-
-<p>Note that, by default, the C back end does not support exception handling.
- If you want/need it for a certain program, you can enable it by passing
- "-enable-correct-eh-support" to the llc program. The resultant code will use
- setjmp/longjmp to implement exception support that is relatively slow, and
- not C++-ABI-conforming on most platforms, but otherwise correct.</p>
-
-<p>Also, there are a number of other limitations of the C backend that cause it
- to produce code that does not fully conform to the C++ ABI on most
- platforms. Some of the C++ programs in LLVM's test suite are known to fail
- when compiled with the C back end because of ABI incompatibilities with
- standard C++ libraries.</p>
-</div>
-
-<div class="question">
-<p><a name="platformindependent">Can I compile C or C++ code to
- platform-independent LLVM bitcode?</a></p>
-</div>
-
-<div class="answer">
-<p>No. C and C++ are inherently platform-dependent languages. The most obvious
- example of this is the preprocessor. A very common way that C code is made
- portable is by using the preprocessor to include platform-specific code. In
- practice, information about other platforms is lost after preprocessing, so
- the result is inherently dependent on the platform that the preprocessing was
- targeting.</p>
-
-<p>Another example is <tt>sizeof</tt>. It's common for <tt>sizeof(long)</tt> to
- vary between platforms. In most C front-ends, <tt>sizeof</tt> is expanded to
- a constant immediately, thus hard-wiring a platform-specific detail.</p>
-
-<p>Also, since many platforms define their ABIs in terms of C, and since LLVM is
- lower-level than C, front-ends currently must emit platform-specific IR in
- order to have the result conform to the platform ABI.</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="cfe_code">Questions about code generated by the GCC front-end</a>
-</h2>
-
-<div>
-
-<div class="question">
-<p><a name="iosinit">What is this <tt>llvm.global_ctors</tt> and
- <tt>_GLOBAL__I__tmp_webcompile...</tt> stuff that happens when I <tt>#include
- &lt;iostream&gt;</tt>?</a></p>
-</div>
-
-<div class="answer">
-<p>If you <tt>#include</tt> the <tt>&lt;iostream&gt;</tt> header into a C++
- translation unit, the file will probably use
- the <tt>std::cin</tt>/<tt>std::cout</tt>/... global objects. However, C++
- does not guarantee an order of initialization between static objects in
- different translation units, so if a static ctor/dtor in your .cpp file
- used <tt>std::cout</tt>, for example, the object would not necessarily be
- automatically initialized before your use.</p>
-
-<p>To make <tt>std::cout</tt> and friends work correctly in these scenarios, the
- STL that we use declares a static object that gets created in every
- translation unit that includes <tt>&lt;iostream&gt;</tt>. This object has a
- static constructor and destructor that initializes and destroys the global
- iostream objects before they could possibly be used in the file. The code
- that you see in the .ll file corresponds to the constructor and destructor
- registration code.
-</p>
-
-<p>If you would like to make it easier to <b>understand</b> the LLVM code
- generated by the compiler in the demo page, consider using <tt>printf()</tt>
- instead of <tt>iostream</tt>s to print values.</p>
-</div>
-
-<!--=========================================================================-->
-
-<div class="question">
-<p><a name="codedce">Where did all of my code go??</a></p>
-</div>
-
-<div class="answer">
-<p>If you are using the LLVM demo page, you may often wonder what happened to
- all of the code that you typed in. Remember that the demo script is running
- the code through the LLVM optimizers, so if your code doesn't actually do
- anything useful, it might all be deleted.</p>
-
-<p>To prevent this, make sure that the code is actually needed. For example, if
- you are computing some expression, return the value from the function instead
- of leaving it in a local variable. If you really want to constrain the
- optimizer, you can read from and assign to <tt>volatile</tt> global
- variables.</p>
-</div>
-
-<!--=========================================================================-->
-
-<div class="question">
-<p><a name="undef">What is this "<tt>undef</tt>" thing that shows up in my
- code?</a></p>
-</div>
-
-<div class="answer">
-<p><a href="LangRef.html#undef"><tt>undef</tt></a> is the LLVM way of
- representing a value that is not defined. You can get these if you do not
- initialize a variable before you use it. For example, the C function:</p>
-
-<pre class="doc_code">
-int X() { int i; return i; }
-</pre>
-
-<p>Is compiled to "<tt>ret i32 undef</tt>" because "<tt>i</tt>" never has a
- value specified for it.</p>
-</div>
-
-<!--=========================================================================-->
-
-<div class="question">
-<p><a name="callconvwrong">Why does instcombine + simplifycfg turn
- a call to a function with a mismatched calling convention into "unreachable"?
- Why not make the verifier reject it?</a></p>
-</div>
-
-<div class="answer">
-<p>This is a common problem run into by authors of front-ends that are using
-custom calling conventions: you need to make sure to set the right calling
-convention on both the function and on each call to the function. For example,
-this code:</p>
-
-<pre class="doc_code">
-define fastcc void @foo() {
- ret void
-}
-define void @bar() {
- call void @foo()
- ret void
-}
-</pre>
-
-<p>Is optimized to:</p>
-
-<pre class="doc_code">
-define fastcc void @foo() {
- ret void
-}
-define void @bar() {
- unreachable
-}
-</pre>
-
-<p>... with "opt -instcombine -simplifycfg". This often bites people because
-"all their code disappears". Setting the calling convention on the caller and
-callee is required for indirect calls to work, so people often ask why not make
-the verifier reject this sort of thing.</p>
-
-<p>The answer is that this code has undefined behavior, but it is not illegal.
-If we made it illegal, then every transformation that could potentially create
-this would have to ensure that it doesn't, and there is valid code that can
-create this sort of construct (in dead code). The sorts of things that can
-cause this to happen are fairly contrived, but we still need to accept them.
-Here's an example:</p>
-
-<pre class="doc_code">
-define fastcc void @foo() {
- ret void
-}
-define internal void @bar(void()* %FP, i1 %cond) {
- br i1 %cond, label %T, label %F
-T:
- call void %FP()
- ret void
-F:
- call fastcc void %FP()
- ret void
-}
-define void @test() {
- %X = or i1 false, false
- call void @bar(void()* @foo, i1 %X)
- ret void
-}
-</pre>
-
-<p>In this example, "test" always passes @foo/false into bar, which ensures that
- it is dynamically called with the right calling conv (thus, the code is
- perfectly well defined). If you run this through the inliner, you get this
- (the explicit "or" is there so that the inliner doesn't dead code eliminate
- a bunch of stuff):
-</p>
-
-<pre class="doc_code">
-define fastcc void @foo() {
- ret void
-}
-define void @test() {
- %X = or i1 false, false
- br i1 %X, label %T.i, label %F.i
-T.i:
- call void @foo()
- br label %bar.exit
-F.i:
- call fastcc void @foo()
- br label %bar.exit
-bar.exit:
- ret void
-}
-</pre>
-
-<p>Here you can see that the inlining pass made an undefined call to @foo with
- the wrong calling convention. We really don't want to make the inliner have
- to know about this sort of thing, so it needs to be valid code. In this case,
- dead code elimination can trivially remove the undefined code. However, if %X
- was an input argument to @test, the inliner would produce this:
-</p>
-
-<pre class="doc_code">
-define fastcc void @foo() {
- ret void
-}
-
-define void @test(i1 %X) {
- br i1 %X, label %T.i, label %F.i
-T.i:
- call void @foo()
- br label %bar.exit
-F.i:
- call fastcc void @foo()
- br label %bar.exit
-bar.exit:
- ret void
-}
-</pre>
-
-<p>The interesting thing about this is that %X <em>must</em> be false for the
-code to be well-defined, but no amount of dead code elimination will be able to
-delete the broken call as unreachable. However, since instcombine/simplifycfg
-turns the undefined call into unreachable, we end up with a branch on a
-condition that goes to unreachable: a branch to unreachable can never happen, so
-"-inline -instcombine -simplifycfg" is able to produce:</p>
-
-<pre class="doc_code">
-define fastcc void @foo() {
- ret void
-}
-define void @test(i1 %X) {
-F.i:
- call fastcc void @foo()
- ret void
-}
-</pre>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-03-27 13:25:16 +0200 (Tue, 27 Mar 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/FAQ.rst b/docs/FAQ.rst
new file mode 100644
index 0000000..b0e3ca0
--- /dev/null
+++ b/docs/FAQ.rst
@@ -0,0 +1,464 @@
+.. _faq:
+
+================================
+Frequently Asked Questions (FAQ)
+================================
+
+.. contents::
+ :local:
+
+
+License
+=======
+
+Does the University of Illinois Open Source License really qualify as an "open source" license?
+-----------------------------------------------------------------------------------------------
+Yes, the license is `certified
+<http://www.opensource.org/licenses/UoI-NCSA.php>`_ by the Open Source
+Initiative (OSI).
+
+
+Can I modify LLVM source code and redistribute the modified source?
+-------------------------------------------------------------------
+Yes. The modified source distribution must retain the copyright notice and
+follow the three bulleted conditions listed in the `LLVM license
+<http://llvm.org/svn/llvm-project/llvm/trunk/LICENSE.TXT>`_.
+
+
+Can I modify the LLVM source code and redistribute binaries or other tools based on it, without redistributing the source?
+--------------------------------------------------------------------------------------------------------------------------
+Yes. This is why we distribute LLVM under a less restrictive license than GPL,
+as explained in the first question above.
+
+
+Source Code
+===========
+
+In what language is LLVM written?
+---------------------------------
+All of the LLVM tools and libraries are written in C++ with extensive use of
+the STL.
+
+
+How portable is the LLVM source code?
+-------------------------------------
+The LLVM source code should be portable to most modern Unix-like operating
+systems. Most of the code is written in standard C++ with operating system
+services abstracted to a support library. The tools required to build and
+test LLVM have been ported to a plethora of platforms.
+
+Some porting problems may exist in the following areas:
+
+* The autoconf/makefile build system relies heavily on UNIX shell tools,
+ like the Bourne Shell and sed. Porting to systems without these tools
+ (MacOS 9, Plan 9) will require more effort.
+
+
+Build Problems
+==============
+
+When I run configure, it finds the wrong C compiler.
+----------------------------------------------------
+The ``configure`` script first attempts to locate ``gcc`` and then ``cc``,
+unless it finds compiler paths set in ``CC`` and ``CXX`` for the C and C++
+compiler, respectively.
+
+If ``configure`` finds the wrong compiler, either adjust your ``PATH``
+environment variable or set ``CC`` and ``CXX`` explicitly.
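+
+For example, a minimal sketch (the compiler paths here are placeholders;
+substitute the ones you actually want):
+
+.. code-block:: bash
+
+ % ./configure CC=/usr/bin/gcc CXX=/usr/bin/g++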
+
+
+The ``configure`` script finds the right C compiler, but it uses the LLVM tools from a previous build. What do I do?
+---------------------------------------------------------------------------------------------------------------------
+The ``configure`` script uses the ``PATH`` to find executables, so if it's
+grabbing the wrong linker/assembler/etc, there are two ways to fix it:
+
+#. Adjust your ``PATH`` environment variable so that the correct program
+ appears first in the ``PATH``. This may work, but may not be convenient
+ when you want them *first* in your path for other work.
+
+#. Run ``configure`` with an alternative ``PATH`` that is correct. In a
+ Bourne compatible shell, the syntax would be:
+
+.. code-block:: bash
+
+ % PATH=[the path without the bad program] ./configure ...
+
+This is still somewhat inconvenient, but it allows ``configure`` to do its
+work without having to adjust your ``PATH`` permanently.
+
+
+When creating a dynamic library, I get a strange GLIBC error.
+-------------------------------------------------------------
+Under some operating systems (e.g. Linux), libtool does not work correctly if
+GCC was compiled with the ``--disable-shared`` option. To work around this,
+install your own version of GCC that has shared libraries enabled by default.
+
+
+I've updated my source tree from Subversion, and now my build is trying to use a file/directory that doesn't exist.
+-------------------------------------------------------------------------------------------------------------------
+You need to re-run configure in your object directory. When new Makefiles
+are added to the source tree, they have to be copied over to the object tree
+in order to be used by the build.
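+
+Assuming a standard autoconf setup, ``config.status`` can re-run ``configure``
+for you with the same arguments you originally gave it; a sketch:
+
+.. code-block:: bash
+
+ % cd $LLVM_OBJ_DIR
+ % ./config.status --recheck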
+
+
+I've modified a Makefile in my source tree, but my build tree keeps using the old version. What do I do?
+---------------------------------------------------------------------------------------------------------
+If the Makefile already exists in your object tree, you can just run the
+following command in the top level directory of your object tree:
+
+.. code-block:: bash
+
+ % ./config.status <relative path to Makefile>
+
+If the Makefile is new, you will have to modify the configure script to copy
+it over.
+
+
+I've upgraded to a new version of LLVM, and I get strange build errors.
+-----------------------------------------------------------------------
+Sometimes, changes to the LLVM source code alter how the build system works.
+Changes in ``libtool``, ``autoconf``, or header file dependencies are
+especially prone to this sort of problem.
+
+The best thing to try is to remove the old files and re-build. In most cases,
+this takes care of the problem. To do this, just type ``make clean`` and then
+``make`` in the directory that fails to build.
+
+
+I've built LLVM and am testing it, but the tests freeze.
+--------------------------------------------------------
+This is most likely occurring because you built a profile or release
+(optimized) build of LLVM and have not specified the same information on the
+``gmake`` command line.
+
+For example, if you built LLVM with the command:
+
+.. code-block:: bash
+
+ % gmake ENABLE_PROFILING=1
+
+...then you must run the tests with the following commands:
+
+.. code-block:: bash
+
+ % cd llvm/test
+ % gmake ENABLE_PROFILING=1
+
+Why do test results differ when I perform different types of builds?
+--------------------------------------------------------------------
+The LLVM test suite is dependent upon several features of the LLVM tools and
+libraries.
+
+First, the debugging assertions in code are not enabled in optimized or
+profiling builds. Hence, tests that used to fail may pass.
+
+Second, some tests may rely upon debugging options or behavior that is only
+available in the debug build. These tests will fail in an optimized or
+profile build.
+
+
+Compiling LLVM with GCC 3.3.2 fails, what should I do?
+------------------------------------------------------
+This is `a bug in GCC <http://gcc.gnu.org/bugzilla/show_bug.cgi?id=13392>`_,
+and affects projects other than LLVM. Try upgrading or downgrading your GCC.
+
+
+Compiling LLVM with GCC succeeds, but the resulting tools do not work, what can be wrong?
+-----------------------------------------------------------------------------------------
+Several versions of GCC are known to miscompile the LLVM codebase. Check your
+compiler version (``gcc --version``) to find out whether it is
+`broken <GettingStarted.html#brokengcc>`_. If so, your only
+option is to upgrade GCC to a known good version.
+
+
+After Subversion update, rebuilding gives the error "No rule to make target".
+-----------------------------------------------------------------------------
+If the error is of the form:
+
+.. code-block:: bash
+
+ gmake[2]: *** No rule to make target `/path/to/somefile',
+ needed by `/path/to/another/file.d'.
+ Stop.
+
+This may occur anytime files are moved within the Subversion repository or
+removed entirely. In this case, the best solution is to erase all ``.d``
+files, which list dependencies for source files, and rebuild:
+
+.. code-block:: bash
+
+ % cd $LLVM_OBJ_DIR
+ % rm -f `find . -name \*\.d`
+ % gmake
+
+In other cases, it may be necessary to run ``make clean`` before rebuilding.
+
+
+Source Languages
+================
+
+What source languages are supported?
+------------------------------------
+LLVM currently has full support for C and C++ source languages. These are
+available through both `Clang <http://clang.llvm.org/>`_ and `DragonEgg
+<http://dragonegg.llvm.org/>`_.
+
+The PyPy developers are working on integrating LLVM into the PyPy backend so
+that PyPy can translate to LLVM.
+
+
+I'd like to write a self-hosting LLVM compiler. How should I interface with the LLVM middle-end optimizers and back-end code generators?
+----------------------------------------------------------------------------------------------------------------------------------------
+Your compiler front-end will communicate with LLVM by creating a module in the
+LLVM intermediate representation (IR) format. Assuming you want to write your
+language's compiler in the language itself (rather than C++), there are three
+major ways to tackle generating LLVM IR from a front-end:
+
+1. **Call into the LLVM libraries code using your language's FFI (foreign
+ function interface).**
+
+ * *for:* best tracks changes to the LLVM IR, .ll syntax, and .bc format
+
+ * *for:* enables running LLVM optimization passes without an emit/parse
+ overhead
+
+ * *for:* adapts well to a JIT context
+
+ * *against:* lots of ugly glue code to write
+
+2. **Emit LLVM assembly from your compiler's native language.**
+
+ * *for:* very straightforward to get started
+
+ * *against:* the .ll parser is slower than the bitcode reader when
+ interfacing to the middle end
+
+ * *against:* it may be harder to track changes to the IR
+
+3. **Emit LLVM bitcode from your compiler's native language.**
+
+ * *for:* can use the more-efficient bitcode reader when interfacing to the
+ middle end
+
+ * *against:* you'll have to re-engineer the LLVM IR object model and bitcode
+ writer in your language
+
+ * *against:* it may be harder to track changes to the IR
+
+If you go with the first option, the C bindings in include/llvm-c should help
+a lot, since most languages have strong support for interfacing with C. The
+most common hurdle with calling C from managed code is interfacing with the
+garbage collector. The C interface was designed to require very little memory
+management, and so is straightforward in this regard.
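+
+As a minimal sketch of option 1 (the module and function names here are just
+illustrative), building a trivial function through the C bindings looks like:
+
+.. code-block:: c
+
+ #include <llvm-c/Core.h>
+
+ int main(void) {
+   /* Create a module and declare "i32 @answer()". */
+   LLVMModuleRef M = LLVMModuleCreateWithName("demo");
+   LLVMTypeRef FnTy = LLVMFunctionType(LLVMInt32Type(), NULL, 0, 0);
+   LLVMValueRef F = LLVMAddFunction(M, "answer", FnTy);
+
+   /* Emit "ret i32 42" into the entry block. */
+   LLVMBuilderRef B = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(B, LLVMAppendBasicBlock(F, "entry"));
+   LLVMBuildRet(B, LLVMConstInt(LLVMInt32Type(), 42, 0));
+
+   LLVMDumpModule(M);           /* Prints the textual IR. */
+   LLVMDisposeBuilder(B);
+   LLVMDisposeModule(M);
+   return 0;
+ }
+
+A front-end written in another language would make these same calls through
+its FFI instead of from C.
+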
+
+What support is there for higher level source language constructs for building a compiler?
+--------------------------------------------------------------------------------------------
+Currently, there isn't much. LLVM supports an intermediate representation
+which is useful for code representation but does not support the high level
+(abstract syntax tree) representation needed by most compilers. There are no
+facilities for lexical or semantic analysis.
+
+
+I don't understand the ``GetElementPtr`` instruction. Help!
+-----------------------------------------------------------
+See `The Often Misunderstood GEP Instruction <GetElementPtr.html>`_.
+
+
+Using the C and C++ Front Ends
+==============================
+
+Can I compile C or C++ code to platform-independent LLVM bitcode?
+-----------------------------------------------------------------
+No. C and C++ are inherently platform-dependent languages. The most obvious
+example of this is the preprocessor. A very common way that C code is made
+portable is by using the preprocessor to include platform-specific code. In
+practice, information about other platforms is lost after preprocessing, so
+the result is inherently dependent on the platform that the preprocessing was
+targeting.
+
+Another example is ``sizeof``. It's common for ``sizeof(long)`` to vary
+between platforms. In most C front-ends, ``sizeof`` is expanded to a
+constant immediately, thus hard-wiring a platform-specific detail.
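+
+For instance (a sketch), the IR emitted for this C function already bakes in
+the target's ``long`` width:
+
+.. code-block:: c
+
+ /* Compiles to "ret i32 4" on a typical ILP32 target and to
+    "ret i32 8" on a typical LP64 target. */
+ unsigned long_size(void) { return sizeof(long); }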
+
+Also, since many platforms define their ABIs in terms of C, and since LLVM is
+lower-level than C, front-ends currently must emit platform-specific IR in
+order to have the result conform to the platform ABI.
+
+
+Questions about code generated by the demo page
+===============================================
+
+What is this ``llvm.global_ctors`` and ``_GLOBAL__I_a...`` stuff that happens when I ``#include <iostream>``?
+-------------------------------------------------------------------------------------------------------------
+If you ``#include`` the ``<iostream>`` header into a C++ translation unit,
+the file will probably use the ``std::cin``/``std::cout``/... global objects.
+However, C++ does not guarantee an order of initialization between static
+objects in different translation units, so if a static ctor/dtor in your .cpp
+file used ``std::cout``, for example, the object would not necessarily be
+automatically initialized before your use.
+
+To make ``std::cout`` and friends work correctly in these scenarios, the STL
+that we use declares a static object that gets created in every translation
+unit that includes ``<iostream>``. This object has a static constructor
+and destructor that initialize and destroy the global iostream objects
+before they could possibly be used in the file. The code that you see in the
+``.ll`` file corresponds to the constructor and destructor registration code.
+
+If you would like to make it easier to *understand* the LLVM code generated
+by the compiler in the demo page, consider using ``printf()`` instead of
+``iostream``\s to print values.
+
+
+Where did all of my code go??
+-----------------------------
+If you are using the LLVM demo page, you may often wonder what happened to
+all of the code that you typed in. Remember that the demo script is running
+the code through the LLVM optimizers, so if your code doesn't actually do
+anything useful, it might all be deleted.
+
+To prevent this, make sure that the code is actually needed. For example, if
+you are computing some expression, return the value from the function instead
+of leaving it in a local variable. If you really want to constrain the
+optimizer, you can read from and assign to ``volatile`` global variables.
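+
+As a sketch (the function names are illustrative), either of these keeps a
+computation alive through the optimizers:
+
+.. code-block:: c
+
+ volatile int Sink;            /* reads and writes here cannot be removed */
+
+ int keep_by_returning(int a, int b) {
+   return a * b + a;           /* the result escapes via the return value */
+ }
+
+ void keep_by_volatile(int a, int b) {
+   Sink = a * b + a;           /* a volatile store constrains the optimizer */
+ }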
+
+
+What is this "``undef``" thing that shows up in my code?
+--------------------------------------------------------
+``undef`` is the LLVM way of representing a value that is not defined. You
+can get these if you do not initialize a variable before you use it. For
+example, the C function:
+
+.. code-block:: c
+
+ int X() { int i; return i; }
+
+Is compiled to "``ret i32 undef``" because "``i``" never has a value specified
+for it.
+
+
+Why does instcombine + simplifycfg turn a call to a function with a mismatched calling convention into "unreachable"? Why not make the verifier reject it?
+----------------------------------------------------------------------------------------------------------------------------------------------------------
+This is a common problem run into by authors of front-ends that are using
+custom calling conventions: you need to make sure to set the right calling
+convention on both the function and on each call to the function. For
+example, this code:
+
+.. code-block:: llvm
+
+ define fastcc void @foo() {
+ ret void
+ }
+ define void @bar() {
+ call void @foo()
+ ret void
+ }
+
+Is optimized to:
+
+.. code-block:: llvm
+
+ define fastcc void @foo() {
+ ret void
+ }
+ define void @bar() {
+ unreachable
+ }
+
+... with "``opt -instcombine -simplifycfg``". This often bites people because
+"all their code disappears". Setting the calling convention on the caller and
+callee is required for indirect calls to work, so people often ask why not
+make the verifier reject this sort of thing.
+
+The answer is that this code has undefined behavior, but it is not illegal.
+If we made it illegal, then every transformation that could potentially create
+this would have to ensure that it doesn't, and there is valid code that can
+create this sort of construct (in dead code). The sorts of things that can
+cause this to happen are fairly contrived, but we still need to accept them.
+Here's an example:
+
+.. code-block:: llvm
+
+ define fastcc void @foo() {
+ ret void
+ }
+ define internal void @bar(void()* %FP, i1 %cond) {
+ br i1 %cond, label %T, label %F
+ T:
+ call void %FP()
+ ret void
+ F:
+ call fastcc void %FP()
+ ret void
+ }
+ define void @test() {
+ %X = or i1 false, false
+ call void @bar(void()* @foo, i1 %X)
+ ret void
+ }
+
+In this example, "test" always passes ``@foo``/``false`` into ``bar``, which
+ensures that it is dynamically called with the right calling conv (thus, the
+code is perfectly well defined). If you run this through the inliner, you
+get this (the explicit "or" is there so that the inliner doesn't dead code
+eliminate a bunch of stuff):
+
+.. code-block:: llvm
+
+ define fastcc void @foo() {
+ ret void
+ }
+ define void @test() {
+ %X = or i1 false, false
+ br i1 %X, label %T.i, label %F.i
+ T.i:
+ call void @foo()
+ br label %bar.exit
+ F.i:
+ call fastcc void @foo()
+ br label %bar.exit
+ bar.exit:
+ ret void
+ }
+
+Here you can see that the inlining pass made an undefined call to ``@foo``
+with the wrong calling convention. We really don't want to make the inliner
+have to know about this sort of thing, so it needs to be valid code. In this
+case, dead code elimination can trivially remove the undefined code. However,
+if ``%X`` was an input argument to ``@test``, the inliner would produce this:
+
+.. code-block:: llvm
+
+ define fastcc void @foo() {
+ ret void
+ }
+
+ define void @test(i1 %X) {
+ br i1 %X, label %T.i, label %F.i
+ T.i:
+ call void @foo()
+ br label %bar.exit
+ F.i:
+ call fastcc void @foo()
+ br label %bar.exit
+ bar.exit:
+ ret void
+ }
+
+The interesting thing about this is that ``%X`` *must* be false for the
+code to be well-defined, but no amount of dead code elimination will be able
+to delete the broken call as unreachable. However, since
+``instcombine``/``simplifycfg`` turns the undefined call into unreachable, we
+end up with a branch on a condition that goes to unreachable: a branch to
+unreachable can never happen, so "``-inline -instcombine -simplifycfg``" is
+able to produce:
+
+.. code-block:: llvm
+
+ define fastcc void @foo() {
+ ret void
+ }
+ define void @test(i1 %X) {
+ F.i:
+ call fastcc void @foo()
+ ret void
+ }
diff --git a/docs/GCCFEBuildInstrs.html b/docs/GCCFEBuildInstrs.html
index f502481..37800c8 100644
--- a/docs/GCCFEBuildInstrs.html
+++ b/docs/GCCFEBuildInstrs.html
@@ -3,7 +3,7 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
- <link rel="stylesheet" href="llvm.css" type="text/css" media="screen">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css" media="screen">
<title>Building the LLVM GCC Front-End</title>
</head>
<body>
@@ -272,7 +272,7 @@ More information is <a href="FAQ.html#license">available in the FAQ</a>.
src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
</address>
</body>
diff --git a/docs/GarbageCollection.html b/docs/GarbageCollection.html
index 9463eaa..0b8f588 100644
--- a/docs/GarbageCollection.html
+++ b/docs/GarbageCollection.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" Content="text/html; charset=UTF-8" >
<title>Accurate Garbage Collection with LLVM</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
<style type="text/css">
.rowhead { text-align: left; background: inherit; }
.indent { padding-left: 1em; }
@@ -475,7 +475,7 @@ Entry:
;; Tell LLVM that the stack space is a stack root.
;; Java has type-tags on objects, so we pass null as metadata.
%tmp = bitcast %Object** %X to i8**
- call void @llvm.gcroot(i8** %X, i8* null)
+ call void @llvm.gcroot(i8** %tmp, i8* null)
...
;; "CodeBlock" is the block corresponding to the start
@@ -1382,7 +1382,7 @@ Fergus Henderson. International Symposium on Memory Management 2002.</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-03-03 05:32:33 +0100 (Sat, 03 Mar 2012) $
+ Last modified: $Date: 2012-05-03 17:25:19 +0200 (Thu, 03 May 2012) $
</address>
</body>
diff --git a/docs/GetElementPtr.html b/docs/GetElementPtr.html
deleted file mode 100644
index 17a93f5..0000000
--- a/docs/GetElementPtr.html
+++ /dev/null
@@ -1,753 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>The Often Misunderstood GEP Instruction</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
- <style type="text/css">
- TABLE { text-align: left; border: 1px solid black; border-collapse: collapse; margin: 0 0 0 0; }
- </style>
-</head>
-<body>
-
-<h1>
- The Often Misunderstood GEP Instruction
-</h1>
-
-<ol>
- <li><a href="#intro">Introduction</a></li>
- <li><a href="#addresses">Address Computation</a>
- <ol>
- <li><a href="#extra_index">Why is the extra 0 index required?</a></li>
- <li><a href="#deref">What is dereferenced by GEP?</a></li>
- <li><a href="#firstptr">Why can you index through the first pointer but not
- subsequent ones?</a></li>
- <li><a href="#lead0">Why don't GEP x,0,0,1 and GEP x,1 alias? </a></li>
- <li><a href="#trail0">Why do GEP x,1,0,0 and GEP x,1 alias? </a></li>
- <li><a href="#vectors">Can GEP index into vector elements?</a>
- <li><a href="#addrspace">What effect do address spaces have on GEPs?</a>
- <li><a href="#int">How is GEP different from ptrtoint, arithmetic, and inttoptr?</a></li>
- <li><a href="#be">I'm writing a backend for a target which needs custom lowering for GEP. How do I do this?</a>
- <li><a href="#vla">How does VLA addressing work with GEPs?</a>
- </ol></li>
- <li><a href="#rules">Rules</a>
- <ol>
- <li><a href="#bounds">What happens if an array index is out of bounds?</a>
- <li><a href="#negative">Can array indices be negative?</a>
- <li><a href="#compare">Can I compare two values computed with GEPs?</a>
- <li><a href="#types">Can I do GEP with a different pointer type than the type of the underlying object?</a>
- <li><a href="#null">Can I cast an object's address to integer and add it to null?</a>
- <li><a href="#ptrdiff">Can I compute the distance between two objects, and add that value to one address to compute the other address?</a>
- <li><a href="#tbaa">Can I do type-based alias analysis on LLVM IR?</a>
- <li><a href="#overflow">What happens if a GEP computation overflows?</a>
- <li><a href="#check">How can I tell if my front-end is following the rules?</a>
- </ol></li>
- <li><a href="#rationale">Rationale</a>
- <ol>
- <li><a href="#goals">Why is GEP designed this way?</a></li>
- <li><a href="#i32">Why do struct member indices always use i32?</a></li>
- <li><a href="#uglygep">What's an uglygep?</a>
- </ol></li>
- <li><a href="#summary">Summary</a></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by: <a href="mailto:rspencer@reidspencer.com">Reid Spencer</a>.</p>
-</div>
-
-
-<!-- *********************************************************************** -->
-<h2><a name="intro">Introduction</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
- <p>This document seeks to dispel the mystery and confusion surrounding LLVM's
- <a href="LangRef.html#i_getelementptr">GetElementPtr</a> (GEP) instruction.
- Questions about the wily GEP instruction are
- probably the most frequently occurring questions once a developer gets down to
- coding with LLVM. Here we lay out the sources of confusion and show that the
- GEP instruction is really quite simple.
- </p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="addresses">Address Computation</a></h2>
-<!-- *********************************************************************** -->
-<div>
- <p>When people are first confronted with the GEP instruction, they tend to
- relate it to known concepts from other programming paradigms, most notably C
- array indexing and field selection. GEP closely resembles C array indexing
- and field selection, however it's is a little different and this leads to
- the following questions.</p>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="firstptr">What is the first index of the GEP instruction?</a>
-</h3>
-<div>
- <p>Quick answer: The index stepping through the first operand.</p>
- <p>The confusion with the first index usually arises from thinking about
- the GetElementPtr instruction as if it was a C index operator. They aren't the
- same. For example, when we write, in "C":</p>
-
-<div class="doc_code">
-<pre>
-AType *Foo;
-...
-X = &amp;Foo-&gt;F;
-</pre>
-</div>
-
- <p>it is natural to think that there is only one index, the selection of the
- field <tt>F</tt>. However, in this example, <tt>Foo</tt> is a pointer. That
- pointer must be indexed explicitly in LLVM. C, on the other hand, indices
- through it transparently. To arrive at the same address location as the C
- code, you would provide the GEP instruction with two index operands. The
- first operand indexes through the pointer; the second operand indexes the
- field <tt>F</tt> of the structure, just as if you wrote:</p>
-
-<div class="doc_code">
-<pre>
-X = &amp;Foo[0].F;
-</pre>
-</div>
-
- <p>Sometimes this question gets rephrased as:</p>
- <blockquote><p><i>Why is it okay to index through the first pointer, but
- subsequent pointers won't be dereferenced?</i></p></blockquote>
- <p>The answer is simply because memory does not have to be accessed to
- perform the computation. The first operand to the GEP instruction must be a
- value of a pointer type. The value of the pointer is provided directly to
- the GEP instruction as an operand without any need for accessing memory. It
- must, therefore be indexed and requires an index operand. Consider this
- example:</p>
-
-<div class="doc_code">
-<pre>
-struct munger_struct {
- int f1;
- int f2;
-};
-void munge(struct munger_struct *P) {
- P[0].f1 = P[1].f1 + P[2].f2;
-}
-...
-munger_struct Array[3];
-...
-munge(Array);
-</pre>
-</div>
-
- <p>In this "C" example, the front end compiler (llvm-gcc) will generate three
- GEP instructions for the three indices through "P" in the assignment
- statement. The function argument <tt>P</tt> will be the first operand of each
- of these GEP instructions. The second operand indexes through that pointer.
- The third operand will be the field offset into the
- <tt>struct munger_struct</tt> type, for either the <tt>f1</tt> or
- <tt>f2</tt> field. So, in LLVM assembly the <tt>munge</tt> function looks
- like:</p>
-
-<div class="doc_code">
-<pre>
-void %munge(%struct.munger_struct* %P) {
-entry:
- %tmp = getelementptr %struct.munger_struct* %P, i32 1, i32 0
- %tmp = load i32* %tmp
- %tmp6 = getelementptr %struct.munger_struct* %P, i32 2, i32 1
- %tmp7 = load i32* %tmp6
- %tmp8 = add i32 %tmp7, %tmp
- %tmp9 = getelementptr %struct.munger_struct* %P, i32 0, i32 0
- store i32 %tmp8, i32* %tmp9
- ret void
-}
-</pre>
-</div>
-
- <p>In each case the first operand is the pointer through which the GEP
- instruction starts. The same is true whether the first operand is an
- argument, allocated memory, or a global variable. </p>
- <p>To make this clear, let's consider a more obtuse example:</p>
-
-<div class="doc_code">
-<pre>
-%MyVar = uninitialized global i32
-...
-%idx1 = getelementptr i32* %MyVar, i64 0
-%idx2 = getelementptr i32* %MyVar, i64 1
-%idx3 = getelementptr i32* %MyVar, i64 2
-</pre>
-</div>
-
- <p>These GEP instructions are simply making address computations from the
- base address of <tt>MyVar</tt>. They compute, as follows (using C syntax):
- </p>
-
-<div class="doc_code">
-<pre>
-idx1 = (char*) &amp;MyVar + 0
-idx2 = (char*) &amp;MyVar + 4
-idx3 = (char*) &amp;MyVar + 8
-</pre>
-</div>
-
- <p>Since the type <tt>i32</tt> is known to be four bytes long, the indices
- 0, 1 and 2 translate into memory offsets of 0, 4, and 8, respectively. No
- memory is accessed to make these computations because the address of
- <tt>%MyVar</tt> is passed directly to the GEP instructions.</p>
- <p>The obtuse part of this example is in the cases of <tt>%idx2</tt> and
- <tt>%idx3</tt>. They result in the computation of addresses that point to
- memory past the end of the <tt>%MyVar</tt> global, which is only one
- <tt>i32</tt> long, not three <tt>i32</tt>s long. While this is legal in LLVM,
- it is inadvisable because any load or store with the pointer that results
- from these GEP instructions would produce undefined results.</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="extra_index">Why is the extra 0 index required?</a>
-</h3>
-<!-- *********************************************************************** -->
-<div>
- <p>Quick answer: there are no superfluous indices.</p>
- <p>This question arises most often when the GEP instruction is applied to a
- global variable which is always a pointer type. For example, consider
- this:</p>
-
-<div class="doc_code">
-<pre>
-%MyStruct = uninitialized global { float*, i32 }
-...
-%idx = getelementptr { float*, i32 }* %MyStruct, i64 0, i32 1
-</pre>
-</div>
-
- <p>The GEP above yields an <tt>i32*</tt> by indexing the <tt>i32</tt> typed
- field of the structure <tt>%MyStruct</tt>. When people first look at it, they
- wonder why the <tt>i64 0</tt> index is needed. However, a closer inspection
- of how globals and GEPs work reveals the need. Becoming aware of the following
- facts will dispel the confusion:</p>
- <ol>
- <li>The type of <tt>%MyStruct</tt> is <i>not</i> <tt>{ float*, i32 }</tt>
- but rather <tt>{ float*, i32 }*</tt>. That is, <tt>%MyStruct</tt> is a
- pointer to a structure containing a pointer to a <tt>float</tt> and an
- <tt>i32</tt>.</li>
- <li>Point #1 is evidenced by noticing the type of the first operand of
- the GEP instruction (<tt>%MyStruct</tt>) which is
- <tt>{ float*, i32 }*</tt>.</li>
- <li>The first index, <tt>i64 0</tt> is required to step over the global
- variable <tt>%MyStruct</tt>. Since the first argument to the GEP
- instruction must always be a value of pointer type, the first index
- steps through that pointer. A value of 0 means 0 elements offset from that
- pointer.</li>
- <li>The second index, <tt>i32 1</tt> selects the second field of the
- structure (the <tt>i32</tt>). </li>
- </ol>
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="deref">What is dereferenced by GEP?</a>
-</h3>
-<div>
- <p>Quick answer: nothing.</p>
- <p>The GetElementPtr instruction dereferences nothing. That is, it doesn't
- access memory in any way. That's what the Load and Store instructions are for.
- GEP is only involved in the computation of addresses. For example, consider
- this:</p>
-
-<div class="doc_code">
-<pre>
-%MyVar = uninitialized global { [40 x i32 ]* }
-...
-%idx = getelementptr { [40 x i32]* }* %MyVar, i64 0, i32 0, i64 0, i64 17
-</pre>
-</div>
-
- <p>In this example, we have a global variable, <tt>%MyVar</tt> that is a
- pointer to a structure containing a pointer to an array of 40 ints. The
- GEP instruction seems to be accessing the 18th integer of the structure's
- array of ints. However, this is actually an illegal GEP instruction. It
- won't compile. The reason is that the pointer in the structure <i>must</i>
- be dereferenced in order to index into the array of 40 ints. Since the
- GEP instruction never accesses memory, it is illegal.</p>
- <p>In order to access the 18th integer in the array, you would need to do the
- following:</p>
-
-<div class="doc_code">
-<pre>
-%idx = getelementptr { [40 x i32]* }* %, i64 0, i32 0
-%arr = load [40 x i32]** %idx
-%idx = getelementptr [40 x i32]* %arr, i64 0, i64 17
-</pre>
-</div>
-
- <p>In this case, we have to load the pointer in the structure with a load
- instruction before we can index into the array. If the example was changed
- to:</p>
-
-<div class="doc_code">
-<pre>
-%MyVar = uninitialized global { [40 x i32 ] }
-...
-%idx = getelementptr { [40 x i32] }*, i64 0, i32 0, i64 17
-</pre>
-</div>
-
- <p>then everything works fine. In this case, the structure does not contain a
- pointer and the GEP instruction can index through the global variable,
- into the first field of the structure and access the 18th <tt>i32</tt> in the
- array there.</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="lead0">Why don't GEP x,0,0,1 and GEP x,1 alias?</a>
-</h3>
-<div>
- <p>Quick Answer: They compute different address locations.</p>
- <p>If you look at the first indices in these GEP
- instructions you find that they are different (0 and 1), therefore the address
- computation diverges with that index. Consider this example:</p>
-
-<div class="doc_code">
-<pre>
-%MyVar = global { [10 x i32 ] }
-%idx1 = getelementptr { [10 x i32 ] }* %MyVar, i64 0, i32 0, i64 1
-%idx2 = getelementptr { [10 x i32 ] }* %MyVar, i64 1
-</pre>
-</div>
-
- <p>In this example, <tt>idx1</tt> computes the address of the second integer
- in the array that is in the structure in <tt>%MyVar</tt>, that is
- <tt>MyVar+4</tt>. The type of <tt>idx1</tt> is <tt>i32*</tt>. However,
- <tt>idx2</tt> computes the address of <i>the next</i> structure after
- <tt>%MyVar</tt>. The type of <tt>idx2</tt> is <tt>{ [10 x i32] }*</tt> and its
- value is equivalent to <tt>MyVar + 40</tt> because it indexes past the ten
- 4-byte integers in <tt>MyVar</tt>. Obviously, in such a situation, the
- pointers don't alias.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="trail0">Why do GEP x,1,0,0 and GEP x,1 alias?</a>
-</h3>
-<div>
- <p>Quick Answer: They compute the same address location.</p>
- <p>These two GEP instructions will compute the same address because indexing
- through the 0th element does not change the address. However, it does change
- the type. Consider this example:</p>
-
-<div class="doc_code">
-<pre>
-%MyVar = global { [10 x i32 ] }
-%idx1 = getelementptr { [10 x i32 ] }* %MyVar, i64 1, i32 0, i64 0
-%idx2 = getelementptr { [10 x i32 ] }* %MyVar, i64 1
-</pre>
-</div>
-
- <p>In this example, the value of <tt>%idx1</tt> is <tt>%MyVar+40</tt> and
- its type is <tt>i32*</tt>. The value of <tt>%idx2</tt> is also
- <tt>MyVar+40</tt> but its type is <tt>{ [10 x i32] }*</tt>.</p>
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="vectors">Can GEP index into vector elements?</a>
-</h3>
-<div>
- <p>This hasn't always been forcefully disallowed, though it's not recommended.
- It leads to awkward special cases in the optimizers, and fundamental
- inconsistency in the IR. In the future, it will probably be outright
- disallowed.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="addrspace">What effect do address spaces have on GEPs?</a>
-</h3>
-<div>
- <p>None, except that the address space qualifier on the first operand pointer
- type always matches the address space qualifier on the result type.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="int">
- How is GEP different from ptrtoint, arithmetic, and inttoptr?
- </a>
-</h3>
-<div>
- <p>It's very similar; there are only subtle differences.</p>
-
- <p>With ptrtoint, you have to pick an integer type. One approach is to pick i64;
- this is safe on everything LLVM supports (LLVM internally assumes pointers
- are never wider than 64 bits in many places), and the optimizer will actually
- narrow the i64 arithmetic down to the actual pointer size on targets which
- don't support 64-bit arithmetic in most cases. However, there are some cases
- where it doesn't do this. With GEP you can avoid this problem.
-
- <p>Also, GEP carries additional pointer aliasing rules. It's invalid to take a
- GEP from one object, address into a different separately allocated
- object, and dereference it. IR producers (front-ends) must follow this rule,
- and consumers (optimizers, specifically alias analysis) benefit from being
- able to rely on it. See the <a href="#rules">Rules</a> section for more
- information.</p>
-
- <p>And, GEP is more concise in common cases.</p>
-
- <p>However, for the underlying integer computation implied, there
- is no difference.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="be">
- I'm writing a backend for a target which needs custom lowering for GEP.
- How do I do this?
- </a>
-</h3>
-<div>
- <p>You don't. The integer computation implied by a GEP is target-independent.
- Typically what you'll need to do is make your backend pattern-match
- expressions trees involving ADD, MUL, etc., which are what GEP is lowered
- into. This has the advantage of letting your code work correctly in more
- cases.</p>
-
- <p>GEP does use target-dependent parameters for the size and layout of data
- types, which targets can customize.</p>
-
- <p>If you require support for addressing units which are not 8 bits, you'll
- need to fix a lot of code in the backend, with GEP lowering being only a
- small piece of the overall picture.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="vla">How does VLA addressing work with GEPs?</a>
-</h3>
-<div>
- <p>GEPs don't natively support VLAs. LLVM's type system is entirely static,
- and GEP address computations are guided by an LLVM type.</p>
-
- <p>VLA indices can be implemented as linearized indices. For example, an
- expression like X[a][b][c], must be effectively lowered into a form
- like X[a*m+b*n+c], so that it appears to the GEP as a single-dimensional
- array reference.</p>
-
- <p>This means if you want to write an analysis which understands array
- indices and you want to support VLAs, your code will have to be
- prepared to reverse-engineer the linearization. One way to solve this
- problem is to use the ScalarEvolution library, which always presents
- VLA and non-VLA indexing in the same manner.</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="rules">Rules</a></h2>
-<!-- *********************************************************************** -->
-<div>
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="bounds">What happens if an array index is out of bounds?</a>
-</h3>
-<div>
- <p>There are two senses in which an array index can be out of bounds.</p>
-
- <p>First, there's the array type which comes from the (static) type of
- the first operand to the GEP. Indices greater than the number of elements
- in the corresponding static array type are valid. There is no problem with
- out of bounds indices in this sense. Indexing into an array only depends
- on the size of the array element, not the number of elements.</p>
-
- <p>A common example of how this is used is arrays where the size is not known.
- It's common to use array types with zero length to represent these. The
- fact that the static type says there are zero elements is irrelevant; it's
- perfectly valid to compute arbitrary element indices, as the computation
- only depends on the size of the array element, not the number of
- elements. Note that zero-sized arrays are not a special case here.</p>
-
- <p>This sense is unconnected with <tt>inbounds</tt> keyword. The
- <tt>inbounds</tt> keyword is designed to describe low-level pointer
- arithmetic overflow conditions, rather than high-level array
- indexing rules.
-
- <p>Analysis passes which wish to understand array indexing should not
- assume that the static array type bounds are respected.</p>
-
- <p>The second sense of being out of bounds is computing an address that's
- beyond the actual underlying allocated object.</p>
-
- <p>With the <tt>inbounds</tt> keyword, the result value of the GEP is
- undefined if the address is outside the actual underlying allocated
- object and not the address one-past-the-end.</p>
-
- <p>Without the <tt>inbounds</tt> keyword, there are no restrictions
- on computing out-of-bounds addresses. Obviously, performing a load or
- a store requires an address of allocated and sufficiently aligned
- memory. But the GEP itself is only concerned with computing addresses.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="negative">Can array indices be negative?</a>
-</h3>
-<div>
- <p>Yes. This is basically a special case of array indices being out
- of bounds.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="compare">Can I compare two values computed with GEPs?</a>
-</h3>
-<div>
- <p>Yes. If both addresses are within the same allocated object, or
- one-past-the-end, you'll get the comparison result you expect. If either
- is outside of it, integer arithmetic wrapping may occur, so the
- comparison may not be meaningful.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="types">
- Can I do GEP with a different pointer type than the type of
- the underlying object?
- </a>
-</h3>
-<div>
- <p>Yes. There are no restrictions on bitcasting a pointer value to an arbitrary
- pointer type. The types in a GEP serve only to define the parameters for the
- underlying integer computation. They need not correspond with the actual
- type of the underlying object.</p>
-
- <p>Furthermore, loads and stores don't have to use the same types as the type
- of the underlying object. Types in this context serve only to specify
- memory size and alignment. Beyond that there are merely a hint to the
- optimizer indicating how the value will likely be used.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="null">
- Can I cast an object's address to integer and add it to null?
- </a>
-</h3>
-<div>
- <p>You can compute an address that way, but if you use GEP to do the add,
- you can't use that pointer to actually access the object, unless the
- object is managed outside of LLVM.</p>
-
- <p>The underlying integer computation is sufficiently defined; null has a
- defined value -- zero -- and you can add whatever value you want to it.</p>
-
- <p>However, it's invalid to access (load from or store to) an LLVM-aware
- object with such a pointer. This includes GlobalVariables, Allocas, and
- objects pointed to by noalias pointers.</p>
-
- <p>If you really need this functionality, you can do the arithmetic with
- explicit integer instructions, and use inttoptr to convert the result to
- an address. Most of GEP's special aliasing rules do not apply to pointers
- computed from ptrtoint, arithmetic, and inttoptr sequences.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="ptrdiff">
- Can I compute the distance between two objects, and add
- that value to one address to compute the other address?
- </a>
-</h3>
-<div>
- <p>As with arithmetic on null, You can use GEP to compute an address that
- way, but you can't use that pointer to actually access the object if you
- do, unless the object is managed outside of LLVM.</p>
-
- <p>Also as above, ptrtoint and inttoptr provide an alternative way to do this
- which do not have this restriction.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="tbaa">Can I do type-based alias analysis on LLVM IR?</a>
-</h3>
-<div>
- <p>You can't do type-based alias analysis using LLVM's built-in type system,
- because LLVM has no restrictions on mixing types in addressing, loads or
- stores.</p>
-
- <p>LLVM's type-based alias analysis pass uses metadata to describe a different
- type system (such as the C type system), and performs type-based aliasing
- on top of that. Further details are in the
- <a href="LangRef.html#tbaa">language reference</a>.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="overflow">What happens if a GEP computation overflows?</a>
-</h3>
-<div>
- <p>If the GEP lacks the <tt>inbounds</tt> keyword, the value is the result
- from evaluating the implied two's complement integer computation. However,
- since there's no guarantee of where an object will be allocated in the
- address space, such values have limited meaning.</p>
-
- <p>If the GEP has the <tt>inbounds</tt> keyword, the result value is
- undefined (a "<a href="LangRef.html#trapvalues">trap value</a>") if the GEP
- overflows (i.e. wraps around the end of the address space).</p>
-
- <p>As such, there are some ramifications of this for inbounds GEPs: scales
- implied by array/vector/pointer indices are always known to be "nsw" since
- they are signed values that are scaled by the element size. These values
- are also allowed to be negative (e.g. "gep i32 *%P, i32 -1") but the
- pointer itself is logically treated as an unsigned value. This means that
- GEPs have an asymmetric relation between the pointer base (which is treated
- as unsigned) and the offset applied to it (which is treated as signed). The
- result of the additions within the offset calculation cannot have signed
- overflow, but when applied to the base pointer, there can be signed
- overflow.
- </p>
-
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="check">
- How can I tell if my front-end is following the rules?
- </a>
-</h3>
-<div>
- <p>There is currently no checker for the getelementptr rules. Currently,
- the only way to do this is to manually check each place in your front-end
- where GetElementPtr operators are created.</p>
-
- <p>It's not possible to write a checker which could find all rule
- violations statically. It would be possible to write a checker which
- works by instrumenting the code with dynamic checks though. Alternatively,
- it would be possible to write a static checker which catches a subset of
- possible problems. However, no such checker exists today.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="rationale">Rationale</a></h2>
-<!-- *********************************************************************** -->
-<div>
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="goals">Why is GEP designed this way?</a>
-</h3>
-<div>
- <p>The design of GEP has the following goals, in rough unofficial
- order of priority:</p>
- <ul>
- <li>Support C, C-like languages, and languages which can be
- conceptually lowered into C (this covers a lot).</li>
- <li>Support optimizations such as those that are common in
- C compilers. In particular, GEP is a cornerstone of LLVM's
- <a href="LangRef.html#pointeraliasing">pointer aliasing model</a>.</li>
- <li>Provide a consistent method for computing addresses so that
- address computations don't need to be a part of load and
- store instructions in the IR.</li>
- <li>Support non-C-like languages, to the extent that it doesn't
- interfere with other goals.</li>
- <li>Minimize target-specific information in the IR.</li>
- </ul>
-</div>
-
-<!-- *********************************************************************** -->
-<h3>
- <a name="i32">Why do struct member indices always use i32?</a>
-</h3>
-<div>
- <p>The specific type i32 is probably just a historical artifact, however it's
- wide enough for all practical purposes, so there's been no need to change it.
- It doesn't necessarily imply i32 address arithmetic; it's just an identifier
- which identifies a field in a struct. Requiring that all struct indices be
- the same reduces the range of possibilities for cases where two GEPs are
- effectively the same but have distinct operand types.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<h3>
- <a name="uglygep">What's an uglygep?</a>
-</h3>
-<div>
- <p>Some LLVM optimizers operate on GEPs by internally lowering them into
- more primitive integer expressions, which allows them to be combined
- with other integer expressions and/or split into multiple separate
- integer expressions. If they've made non-trivial changes, translating
- back into LLVM IR can involve reverse-engineering the structure of
- the addressing in order to fit it into the static type of the original
- first operand. It isn't always possibly to fully reconstruct this
- structure; sometimes the underlying addressing doesn't correspond with
- the static type at all. In such cases the optimizer instead will emit
- a GEP with the base pointer casted to a simple address-unit pointer,
- using the name "uglygep". This isn't pretty, but it's just as
- valid, and it's sufficient to preserve the pointer aliasing guarantees
- that GEP provides.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="summary">Summary</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
- <p>In summary, here's some things to always remember about the GetElementPtr
- instruction:</p>
- <ol>
- <li>The GEP instruction never accesses memory, it only provides pointer
- computations.</li>
- <li>The first operand to the GEP instruction is always a pointer and it must
- be indexed.</li>
- <li>There are no superfluous indices for the GEP instruction.</li>
- <li>Trailing zero indices are superfluous for pointer aliasing, but not for
- the types of the pointers.</li>
- <li>Leading zero indices are not superfluous for pointer aliasing nor the
- types of the pointers.</li>
- </ol>
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-31 14:04:26 +0100 (Mon, 31 Oct 2011) $
-</address>
-</body>
-</html>
diff --git a/docs/GetElementPtr.rst b/docs/GetElementPtr.rst
new file mode 100644
index 0000000..f6f904b
--- /dev/null
+++ b/docs/GetElementPtr.rst
@@ -0,0 +1,538 @@
+.. _gep:
+
+=======================================
+The Often Misunderstood GEP Instruction
+=======================================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+This document seeks to dispel the mystery and confusion surrounding LLVM's
+`GetElementPtr <LangRef.html#i_getelementptr>`_ (GEP) instruction. Questions
+about the wily GEP instruction are probably the most frequently occurring
+questions once a developer gets down to coding with LLVM. Here we lay out the
+sources of confusion and show that the GEP instruction is really quite simple.
+
+Address Computation
+===================
+
+When people are first confronted with the GEP instruction, they tend to relate
+it to known concepts from other programming paradigms, most notably C array
+indexing and field selection. GEP closely resembles C array indexing and field
+selection; however, it is a little different, and this leads to the following
+questions.
+
+What is the first index of the GEP instruction?
+-----------------------------------------------
+
+Quick answer: The index stepping through the first operand.
+
+The confusion with the first index usually arises from thinking about the
+GetElementPtr instruction as if it were a C index operator. They aren't the
+same. For example, when we write, in "C":
+
+.. code-block:: c++
+
+ AType *Foo;
+ ...
+ X = &Foo->F;
+
+it is natural to think that there is only one index, the selection of the field
+``F``. However, in this example, ``Foo`` is a pointer. That pointer
+must be indexed explicitly in LLVM. C, on the other hand, indexes through it
+transparently. To arrive at the same address location as the C code, you would
+provide the GEP instruction with two index operands. The first operand indexes
+through the pointer; the second operand indexes the field ``F`` of the
+structure, just as if you wrote:
+
+.. code-block:: c++
+
+ X = &Foo[0].F;
+
+Sometimes this question gets rephrased as:
+
+.. _GEP index through first pointer:
+
+ *Why is it okay to index through the first pointer, but subsequent pointers
+ won't be dereferenced?*
+
+The answer is simply because memory does not have to be accessed to perform the
+computation. The first operand to the GEP instruction must be a value of a
+pointer type. The value of the pointer is provided directly to the GEP
+instruction as an operand without any need for accessing memory. It must,
+therefore, be indexed and requires an index operand. Consider this example:
+
+.. code-block:: c++
+
+ struct munger_struct {
+ int f1;
+ int f2;
+ };
+ void munge(struct munger_struct *P) {
+ P[0].f1 = P[1].f1 + P[2].f2;
+ }
+ ...
+ munger_struct Array[3];
+ ...
+ munge(Array);
+
+In this "C" example, the front end compiler (llvm-gcc) will generate three GEP
+instructions for the three indices through ``P`` in the assignment statement.
+The
+function argument ``P`` will be the first operand of each of these GEP
+instructions. The second operand indexes through that pointer. The third
+operand will be the field offset into the ``struct munger_struct`` type, for
+either the ``f1`` or ``f2`` field. So, in LLVM assembly the ``munge`` function
+looks like:
+
+.. code-block:: llvm
+
+   define void @munge(%struct.munger_struct* %P) {
+   entry:
+     %tmp = getelementptr %struct.munger_struct* %P, i32 1, i32 0
+     %tmp1 = load i32* %tmp
+     %tmp6 = getelementptr %struct.munger_struct* %P, i32 2, i32 1
+     %tmp7 = load i32* %tmp6
+     %tmp8 = add i32 %tmp7, %tmp1
+ %tmp9 = getelementptr %struct.munger_struct* %P, i32 0, i32 0
+ store i32 %tmp8, i32* %tmp9
+ ret void
+ }
+
+In each case the first operand is the pointer through which the GEP instruction
+starts. The same is true whether the first operand is an argument, allocated
+memory, or a global variable.
+
+To make this clear, let's consider a more obtuse example:
+
+.. code-block:: llvm
+
+ %MyVar = uninitialized global i32
+ ...
+ %idx1 = getelementptr i32* %MyVar, i64 0
+ %idx2 = getelementptr i32* %MyVar, i64 1
+ %idx3 = getelementptr i32* %MyVar, i64 2
+
+These GEP instructions are simply making address computations from the base
+address of ``%MyVar``. They compute as follows (using C syntax):
+
+.. code-block:: c++
+
+ idx1 = (char*) &MyVar + 0
+ idx2 = (char*) &MyVar + 4
+ idx3 = (char*) &MyVar + 8
+
+Since the type ``i32`` is known to be four bytes long, the indices 0, 1 and 2
+translate into memory offsets of 0, 4, and 8, respectively. No memory is
+accessed to make these computations because the address of ``%MyVar`` is passed
+directly to the GEP instructions.
+
+The obtuse part of this example is in the cases of ``%idx2`` and ``%idx3``. They
+result in the computation of addresses that point to memory past the end of the
+``%MyVar`` global, which is only one ``i32`` long, not three ``i32``\s long.
+While this is legal in LLVM, it is inadvisable because any load or store with
+the pointer that results from these GEP instructions would produce undefined
+results.
+
+Why is the extra 0 index required?
+----------------------------------
+
+Quick answer: there are no superfluous indices.
+
+This question arises most often when the GEP instruction is applied to a global
+variable, which is always of pointer type. For example, consider this:
+
+.. code-block:: llvm
+
+ %MyStruct = uninitialized global { float*, i32 }
+ ...
+ %idx = getelementptr { float*, i32 }* %MyStruct, i64 0, i32 1
+
+The GEP above yields an ``i32*`` by indexing the ``i32`` typed field of the
+structure ``%MyStruct``. When people first look at it, they wonder why the ``i64
+0`` index is needed. However, a closer inspection of how globals and GEPs work
+reveals the need. Becoming aware of the following facts will dispel the
+confusion:
+
+#. The type of ``%MyStruct`` is *not* ``{ float*, i32 }`` but rather ``{ float*,
+ i32 }*``. That is, ``%MyStruct`` is a pointer to a structure containing a
+ pointer to a ``float`` and an ``i32``.
+
+#. Point #1 is evidenced by noticing the type of the first operand of the GEP
+ instruction (``%MyStruct``) which is ``{ float*, i32 }*``.
+
+#. The first index, ``i64 0``, is required to step over the global variable
+ ``%MyStruct``. Since the first argument to the GEP instruction must always
+ be a value of pointer type, the first index steps through that pointer. A
+ value of 0 means 0 elements offset from that pointer.
+
+#. The second index, ``i32 1``, selects the second field of the structure (the
+ ``i32``).
+
+What is dereferenced by GEP?
+----------------------------
+
+Quick answer: nothing.
+
+The GetElementPtr instruction dereferences nothing. That is, it doesn't access
+memory in any way. That's what the Load and Store instructions are for. GEP is
+only involved in the computation of addresses. For example, consider this:
+
+.. code-block:: llvm
+
+ %MyVar = uninitialized global { [40 x i32]* }
+ ...
+ %idx = getelementptr { [40 x i32]* }* %MyVar, i64 0, i32 0, i64 0, i64 17
+
+In this example, we have a global variable, ``%MyVar``, that is a pointer to a
+structure containing a pointer to an array of 40 ints. The GEP instruction seems
+to be accessing the 18th integer of the structure's array of ints. However, this
+is actually an illegal GEP instruction. It won't compile. The reason is that the
+pointer in the structure *must* be dereferenced in order to index into the
+array of 40 ints. Since the GEP instruction never accesses memory, it is
+illegal.
+
+In order to access the 18th integer in the array, you would need to do the
+following:
+
+.. code-block:: llvm
+
+ %idx1 = getelementptr { [40 x i32]* }* %MyVar, i64 0, i32 0
+ %arr = load [40 x i32]** %idx1
+ %idx2 = getelementptr [40 x i32]* %arr, i64 0, i64 17
+
+In this case, we have to load the pointer in the structure with a load
+instruction before we can index into the array. If the example were changed to:
+
+.. code-block:: llvm
+
+ %MyVar = uninitialized global { [40 x i32] }
+ ...
+ %idx = getelementptr { [40 x i32] }* %MyVar, i64 0, i32 0, i64 17
+
+then everything works fine. In this case, the structure does not contain a
+pointer and the GEP instruction can index through the global variable, into the
+first field of the structure and access the 18th ``i32`` in the array there.
+
+Why don't GEP x,0,0,1 and GEP x,1 alias?
+----------------------------------------
+
+Quick Answer: They compute different address locations.
+
+If you look at the first indices in these GEP instructions you find that they
+are different (0 and 1), therefore the address computation diverges with that
+index. Consider this example:
+
+.. code-block:: llvm
+
+ %MyVar = global { [10 x i32] }
+ %idx1 = getelementptr { [10 x i32] }* %MyVar, i64 0, i32 0, i64 1
+ %idx2 = getelementptr { [10 x i32] }* %MyVar, i64 1
+
+In this example, ``%idx1`` computes the address of the second integer in the
+array that is in the structure in ``%MyVar``, that is ``%MyVar+4``. The type of
+``%idx1`` is ``i32*``. However, ``%idx2`` computes the address of *the next*
+structure after ``%MyVar``. The type of ``%idx2`` is ``{ [10 x i32] }*`` and
+its value is equivalent to ``%MyVar+40`` because it indexes past the ten
+4-byte integers in ``%MyVar``. Obviously, in such a situation, the pointers
+don't alias.
+
+Why do GEP x,1,0,0 and GEP x,1 alias?
+-------------------------------------
+
+Quick Answer: They compute the same address location.
+
+These two GEP instructions will compute the same address because indexing
+through the 0th element does not change the address. However, it does change the
+type. Consider this example:
+
+.. code-block:: llvm
+
+ %MyVar = global { [10 x i32] }
+ %idx1 = getelementptr { [10 x i32] }* %MyVar, i64 1, i32 0, i64 0
+ %idx2 = getelementptr { [10 x i32] }* %MyVar, i64 1
+
+In this example, the value of ``%idx1`` is ``%MyVar+40`` and its type is
+``i32*``. The value of ``%idx2`` is also ``%MyVar+40`` but its type is ``{ [10 x
+i32] }*``.
+
+Can GEP index into vector elements?
+-----------------------------------
+
+This hasn't always been forcefully disallowed, though it's not recommended. It
+leads to awkward special cases in the optimizers, and fundamental inconsistency
+in the IR. In the future, it will probably be outright disallowed.
+
+What effect do address spaces have on GEPs?
+-------------------------------------------
+
+None, except that the address space qualifier on the first operand pointer type
+always matches the address space qualifier on the result type.
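+
+As a small sketch of this (the value name ``%q`` and address space 5 are
+assumed for illustration), the result pointer simply inherits the operand's
+address space:
+
+.. code-block:: llvm
+
+  %p = getelementptr i32 addrspace(5)* %q, i64 1  ; %p : i32 addrspace(5)*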
+
+How is GEP different from ``ptrtoint``, arithmetic, and ``inttoptr``?
+---------------------------------------------------------------------
+
+It's very similar; there are only subtle differences.
+
+With ptrtoint, you have to pick an integer type. One approach is to pick i64;
+this is safe on everything LLVM supports (LLVM internally assumes pointers are
+never wider than 64 bits in many places), and in most cases the optimizer will
+narrow the i64 arithmetic down to the actual pointer size on targets which
+don't support 64-bit arithmetic. However, there are some cases where it doesn't
+do this. With GEP you can avoid this problem.
+
+Also, GEP carries additional pointer aliasing rules. It's invalid to take a GEP
+from one object, address into a different separately allocated object, and
+dereference it. IR producers (front-ends) must follow this rule, and consumers
+(optimizers, specifically alias analysis) benefit from being able to rely on
+it. See the `Rules`_ section for more information.
+
+GEP is also more concise in common cases.
+
+However, for the underlying integer computation implied, there is no
+difference.
+
+
+I'm writing a backend for a target which needs custom lowering for GEP. How do I do this?
+-----------------------------------------------------------------------------------------
+
+You don't. The integer computation implied by a GEP is target-independent.
+Typically what you'll need to do is make your backend pattern-match expression
+trees involving ADD, MUL, etc., which are what GEP is lowered into. This has the
+advantage of letting your code work correctly in more cases.
+
+GEP does use target-dependent parameters for the size and layout of data types,
+which targets can customize.
+
+If you require support for addressing units which are not 8 bits, you'll need to
+fix a lot of code in the backend, with GEP lowering being only a small piece of
+the overall picture.
+
+How does VLA addressing work with GEPs?
+---------------------------------------
+
+GEPs don't natively support VLAs. LLVM's type system is entirely static, and GEP
+address computations are guided by an LLVM type.
+
+VLA indices can be implemented as linearized indices. For example, an expression
+like ``X[a][b][c]`` must be effectively lowered into a form like
+``X[a*m+b*n+c]``, so that it appears to the GEP as a single-dimensional array
+reference.
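+
+For instance, a hypothetical front end might emit something like the following
+for ``X[a][b][c]`` over an ``i32`` VLA, where ``%m`` and ``%n`` hold the
+run-time strides of the first two dimensions (all value names here are assumed
+for illustration):
+
+.. code-block:: llvm
+
+  %am  = mul i64 %a, %m       ; a scaled by the first stride
+  %bn  = mul i64 %b, %n       ; b scaled by the second stride
+  %sum = add i64 %am, %bn
+  %idx = add i64 %sum, %c     ; the linearized index a*m + b*n + c
+  %ptr = getelementptr i32* %X, i64 %idx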
+
+This means if you want to write an analysis which understands array indices and
+you want to support VLAs, your code will have to be prepared to reverse-engineer
+the linearization. One way to solve this problem is to use the ScalarEvolution
+library, which always presents VLA and non-VLA indexing in the same manner.
+
+.. _Rules:
+
+Rules
+=====
+
+What happens if an array index is out of bounds?
+------------------------------------------------
+
+There are two senses in which an array index can be out of bounds.
+
+First, there's the array type which comes from the (static) type of the first
+operand to the GEP. Indices greater than the number of elements in the
+corresponding static array type are valid. There is no problem with out of
+bounds indices in this sense. Indexing into an array only depends on the size of
+the array element, not the number of elements.
+
+A common example of how this is used is arrays whose size is not known; such
+arrays are often represented with zero-length array types. The fact
+that the static type says there are zero elements is irrelevant; it's perfectly
+valid to compute arbitrary element indices, as the computation only depends on
+the size of the array element, not the number of elements. Note that zero-sized
+arrays are not a special case here.
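+
+For example, this sketch (assuming a value ``%p`` of type ``[0 x i32]*``) is
+valid even though the static type says there are zero elements; the computed
+offset is simply 5 times the ``i32`` element size:
+
+.. code-block:: llvm
+
+  %elt = getelementptr [0 x i32]* %p, i64 0, i64 5  ; byte offset 20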
+
+This sense is unconnected with the ``inbounds`` keyword, which is
+designed to describe low-level pointer arithmetic overflow conditions, rather
+than high-level array indexing rules.
+
+Analysis passes which wish to understand array indexing should not assume that
+the static array type bounds are respected.
+
+The second sense of being out of bounds is computing an address that's beyond
+the actual underlying allocated object.
+
+With the ``inbounds`` keyword, the result value of the GEP is undefined if the
+address is outside the actual underlying allocated object and not the address
+one-past-the-end.
+
+Without the ``inbounds`` keyword, there are no restrictions on computing
+out-of-bounds addresses. Obviously, performing a load or a store requires an
+address of allocated and sufficiently aligned memory. But the GEP itself is only
+concerned with computing addresses.
+
+Can array indices be negative?
+------------------------------
+
+Yes. This is basically a special case of array indices being out of bounds.
+
+Can I compare two values computed with GEPs?
+--------------------------------------------
+
+Yes. If both addresses are within the same allocated object, or
+one-past-the-end, you'll get the comparison result you expect. If either is
+outside of it, integer arithmetic wrapping may occur, so the comparison may not
+be meaningful.
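+
+For instance, comparing two addresses derived from the same base is a common
+pattern (a sketch; the value names are assumed):
+
+.. code-block:: llvm
+
+  %p1 = getelementptr i32* %base, i64 %i
+  %p2 = getelementptr i32* %base, i64 %j
+  %lt = icmp ult i32* %p1, %p2  ; meaningful while both stay in the object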
+
+Can I do GEP with a different pointer type than the type of the underlying object?
+----------------------------------------------------------------------------------
+
+Yes. There are no restrictions on bitcasting a pointer value to an arbitrary
+pointer type. The types in a GEP serve only to define the parameters for the
+underlying integer computation. They need not correspond with the actual type of
+the underlying object.
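+
+As an illustrative sketch (the value name ``%obj`` is assumed), one can
+address a single byte of a two-field struct through an ``i8*`` view of it:
+
+.. code-block:: llvm
+
+  %raw = bitcast { i32, i32 }* %obj to i8*
+  %p   = getelementptr i8* %raw, i64 6  ; byte 6 of the 8-byte object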
+
+Furthermore, loads and stores don't have to use the same types as the type of
+the underlying object. Types in this context serve only to specify memory size
+and alignment. Beyond that, they are merely a hint to the optimizer indicating
+how the value will likely be used.
+
+Can I cast an object's address to integer and add it to null?
+-------------------------------------------------------------
+
+You can compute an address that way, but if you use GEP to do the add, you can't
+use that pointer to actually access the object, unless the object is managed
+outside of LLVM.
+
+The underlying integer computation is sufficiently defined; null has a defined
+value --- zero --- and you can add whatever value you want to it.
+
+However, it's invalid to access (load from or store to) an LLVM-aware object
+with such a pointer. This includes ``GlobalVariables``, ``Allocas``, and objects
+pointed to by noalias pointers.
+
+If you really need this functionality, you can do the arithmetic with explicit
+integer instructions, and use inttoptr to convert the result to an address. Most
+of GEP's special aliasing rules do not apply to pointers computed from ptrtoint,
+arithmetic, and inttoptr sequences.
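+
+A minimal sketch of that alternative (the value names are assumed):
+
+.. code-block:: llvm
+
+  %int = ptrtoint i32* %obj to i64   ; object address as an integer
+  %sum = add i64 %int, %offset       ; plain integer arithmetic
+  %ptr = inttoptr i64 %sum to i32*   ; convert the result back to an address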
+
+Can I compute the distance between two objects, and add that value to one address to compute the other address?
+---------------------------------------------------------------------------------------------------------------
+
+As with arithmetic on null, you can use GEP to compute an address that way, but
+if you do, you can't use that pointer to actually access the object, unless the
+object is managed outside of LLVM.
+
+Also as above, ptrtoint and inttoptr provide an alternative way to do this
+which does not have this restriction.
+
+Can I do type-based alias analysis on LLVM IR?
+----------------------------------------------
+
+You can't do type-based alias analysis using LLVM's built-in type system,
+because LLVM has no restrictions on mixing types in addressing, loads or stores.
+
+LLVM's type-based alias analysis pass uses metadata to describe a different type
+system (such as the C type system), and performs type-based aliasing on top of
+that. Further details are in the `language reference <LangRef.html#tbaa>`_.
+
+What happens if a GEP computation overflows?
+--------------------------------------------
+
+If the GEP lacks the ``inbounds`` keyword, the value is the result from
+evaluating the implied two's complement integer computation. However, since
+there's no guarantee of where an object will be allocated in the address space,
+such values have limited meaning.
+
+If the GEP has the ``inbounds`` keyword, the result value is undefined (a "trap
+value") if the GEP overflows (i.e. wraps around the end of the address space).
+
+As such, there are some ramifications of this for inbounds GEPs: scales implied
+by array/vector/pointer indices are always known to be "nsw" since they are
+signed values that are scaled by the element size. These values are also
+allowed to be negative (e.g. "``gep i32* %P, i32 -1``") but the pointer itself
+is logically treated as an unsigned value. This means that GEPs have an
+asymmetric relation between the pointer base (which is treated as unsigned) and
+the offset applied to it (which is treated as signed). The result of the
+additions within the offset calculation cannot have signed overflow, but when
+applied to the base pointer, there can be signed overflow.
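+
+To illustrate, a sketch contrasting the two forms (value names assumed); only
+the first yields an undefined value if the computation wraps:
+
+.. code-block:: llvm
+
+  %a = getelementptr inbounds i32* %P, i64 %i  ; trap value on overflow
+  %b = getelementptr i32* %P, i64 %i           ; two's complement wrap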
+
+How can I tell if my front-end is following the rules?
+------------------------------------------------------
+
+There is currently no checker for the getelementptr rules. The only way to
+check them is to manually inspect each place in your front-end where
+GetElementPtr operators are created.
+
+It's not possible to write a checker which could find all rule violations
+statically. It would be possible to write a checker which works by instrumenting
+the code with dynamic checks though. Alternatively, it would be possible to
+write a static checker which catches a subset of possible problems. However, no
+such checker exists today.
+
+Rationale
+=========
+
+Why is GEP designed this way?
+-----------------------------
+
+The design of GEP has the following goals, in rough unofficial order of
+priority:
+
+* Support C, C-like languages, and languages which can be conceptually lowered
+ into C (this covers a lot).
+
+* Support optimizations such as those that are common in C compilers. In
+ particular, GEP is a cornerstone of LLVM's `pointer aliasing
+ model <LangRef.html#pointeraliasing>`_.
+
+* Provide a consistent method for computing addresses so that address
+ computations don't need to be a part of load and store instructions in the IR.
+
+* Support non-C-like languages, to the extent that it doesn't interfere with
+ other goals.
+
+* Minimize target-specific information in the IR.
+
+Why do struct member indices always use ``i32``?
+------------------------------------------------
+
+The specific type i32 is probably just a historical artifact; however, it's
+wide enough for all practical purposes, so there's been no need to change it.
+It doesn't necessarily imply i32 address arithmetic; it's just a label that
+identifies a field in a struct. Requiring that all struct indices have the
+same type
+reduces the range of possibilities for cases where two GEPs are effectively the
+same but have distinct operand types.
+
+What's an uglygep?
+------------------
+
+Some LLVM optimizers operate on GEPs by internally lowering them into more
+primitive integer expressions, which allows them to be combined with other
+integer expressions and/or split into multiple separate integer expressions. If
+they've made non-trivial changes, translating back into LLVM IR can involve
+reverse-engineering the structure of the addressing in order to fit it into the
+static type of the original first operand. It isn't always possible to fully
+reconstruct this structure; sometimes the underlying addressing doesn't
+correspond with the static type at all. In such cases the optimizer will
+instead emit a GEP with the base pointer cast to a simple address-unit
+pointer, using the name "uglygep". This isn't pretty, but it's just as valid,
+and it's
+sufficient to preserve the pointer aliasing guarantees that GEP provides.
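+
+The emitted pattern looks roughly like this sketch (the struct type and the
+byte offset are assumed for illustration):
+
+.. code-block:: llvm
+
+  %base    = bitcast %struct.ty* %p to i8*            ; address-unit pointer
+  %uglygep = getelementptr i8* %base, i64 %byteoffset ; raw byte addressing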
+
+Summary
+=======
+
+In summary, here are some things to always remember about the GetElementPtr
+instruction:
+
+#. The GEP instruction never accesses memory; it only provides pointer
+ computations.
+
+#. The first operand to the GEP instruction is always a pointer and it must be
+ indexed.
+
+#. There are no superfluous indices for the GEP instruction.
+
+#. Trailing zero indices are superfluous for pointer aliasing, but not for the
+ types of the pointers.
+
+#. Leading zero indices are not superfluous for pointer aliasing nor the types
+ of the pointers.
diff --git a/docs/GettingStarted.html b/docs/GettingStarted.html
index 52baf90..61335af 100644
--- a/docs/GettingStarted.html
+++ b/docs/GettingStarted.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Getting Started with LLVM System</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -344,7 +344,7 @@ up</a></li>
<li><a name="pf_7">Native code generation exists but is not complete.</a></li>
<li><a name="pf_8">Binutils 2.20 or later is required to build the assembler
generated by LLVM properly.</a></li>
-<li><a name="pf_9">XCode 2.5 and gcc 4.0.1</a> (Apple Build 5370) will trip
+<li><a name="pf_9">Xcode 2.5 and gcc 4.0.1</a> (Apple Build 5370) will trip
internal LLVM assert messages when compiled for Release at optimization
levels greater than 0 (i.e., <i>"-O1"</i> and higher).
Add <i>OPTIMIZE_OPTION="-O0"</i> to the build command line
@@ -590,6 +590,7 @@ as the previous one. It appears to work with ENABLE_OPTIMIZED=0 (the default).</
<p><b>GCC 4.3.3 (Debian 4.3.3-10) on ARM</b>: Miscompiles parts of LLVM 2.6
when optimizations are turned on. The symptom is an infinite loop in
FoldingSetImpl::RemoveNode while running the code generator.</p>
+<p><b>SUSE 11 GCC 4.3.4</b>: Miscompiles LLVM, causing crashes in ValueHandle logic.</p>
<p><b>GCC 4.3.5 and GCC 4.4.5 on ARM</b>: These can miscompile <tt>value >>
1</tt> even at -O0. A test failure in <tt>test/Assembler/alignstack.ll</tt> is
one symptom of the problem.
@@ -626,7 +627,7 @@ upgrading to a newer version of Gold.</p>
LLVM and to give you some basic information about the LLVM environment.</p>
<p>The later sections of this guide describe the <a
-href="#layout">general layout</a> of the the LLVM source tree, a <a
+href="#layout">general layout</a> of the LLVM source tree, a <a
href="#tutorial">simple example</a> using the LLVM tool chain, and <a
href="#links">links</a> to find more information about LLVM or to get
help via e-mail.</p>
@@ -748,6 +749,8 @@ revision), you can checkout it from the '<tt>tags</tt>' directory (instead of
subdirectories of the '<tt>tags</tt>' directory:</p>
<ul>
+<li>Release 3.1: <b>RELEASE_31/final</b></li>
+<li>Release 3.0: <b>RELEASE_30/final</b></li>
<li>Release 2.9: <b>RELEASE_29/final</b></li>
<li>Release 2.8: <b>RELEASE_28</b></li>
<li>Release 2.7: <b>RELEASE_27</b></li>
@@ -1015,7 +1018,7 @@ script to configure the build system:</p>
selected as the target of the build host. You can also specify a comma
separated list of target names that you want available in llc. The target
names use all lower case. The current set of targets is: <br>
- <tt>arm, cbe, cpp, hexagon, mblaze, mips, mipsel, msp430, powerpc, ptx, sparc, spu, x86, x86_64, xcore</tt>.
+ <tt>arm, cpp, hexagon, mblaze, mips, mipsel, msp430, powerpc, ptx, sparc, spu, x86, x86_64, xcore</tt>.
<br><br></dd>
<dt><i>--enable-doxygen</i></dt>
<dd>Look for the doxygen program and enable construction of doxygen based
@@ -1510,12 +1513,6 @@ information is in the <a href="CommandGuide/index.html">Command Guide</a>.</p>
<dd>The disassembler transforms the LLVM bitcode to human readable
LLVM assembly.</dd>
- <dt><tt><b>llvm-ld</b></tt></dt>
- <dd><tt>llvm-ld</tt> is a general purpose and extensible linker for LLVM.
- It performs standard link time optimizations and allows optimization
- modules to be loaded and run so that language specific optimizations can
- be applied at link time.</dd>
-
<dt><tt><b>llvm-link</b></tt></dt>
<dd><tt>llvm-link</tt>, not surprisingly, links multiple LLVM modules into
a single program.</dd>
@@ -1538,7 +1535,7 @@ information is in the <a href="CommandGuide/index.html">Command Guide</a>.</p>
bitcode or assembly (with the <tt>-emit-llvm</tt> option) instead of the
usual machine code output. It works just like any other GCC compiler,
taking the typical <tt>-c, -S, -E, -o</tt> options that are typically used.
- Additionally, the the source code for <tt>llvm-gcc</tt> is available as a
+ Additionally, the source code for <tt>llvm-gcc</tt> is available as a
separate Subversion module.</dd>
<dt><tt><b>opt</b></tt></dt>
@@ -1757,7 +1754,7 @@ out:</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.x10sys.com/rspencer/">Reid Spencer</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-03-27 13:25:16 +0200 (Tue, 27 Mar 2012) $
+ Last modified: $Date: 2012-07-23 10:51:15 +0200 (Mon, 23 Jul 2012) $
</address>
</body>
</html>
diff --git a/docs/GettingStartedVS.html b/docs/GettingStartedVS.html
deleted file mode 100644
index beadd0b..0000000
--- a/docs/GettingStartedVS.html
+++ /dev/null
@@ -1,368 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Getting Started with LLVM System for Microsoft Visual Studio</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- Getting Started with the LLVM System using Microsoft Visual Studio
-</h1>
-
-<ul>
- <li><a href="#overview">Overview</a>
- <li><a href="#requirements">Requirements</a>
- <ol>
- <li><a href="#hardware">Hardware</a>
- <li><a href="#software">Software</a>
- </ol></li>
- <li><a href="#quickstart">Getting Started</a>
- <li><a href="#tutorial">An Example Using the LLVM Tool Chain</a>
- <li><a href="#problems">Common Problems</a>
- <li><a href="#links">Links</a>
-</ul>
-
-<div class="doc_author">
- <p>Written by: <a href="http://llvm.org/">The LLVM Team</a></p>
-</div>
-
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="overview"><b>Overview</b></a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
- <p>Welcome to LLVM on Windows! This document only covers LLVM on Windows using
- Visual Studio, not mingw or cygwin. In order to get started, you first need to
- know some basic information.</p>
-
- <p>There are many different projects that compose LLVM. The first is the LLVM
- suite. This contains all of the tools, libraries, and header files needed to
- use LLVM. It contains an assembler, disassembler,
- bitcode analyzer and bitcode optimizer. It also contains a test suite that can
- be used to test the LLVM tools.</p>
-
- <p>Another useful project on Windows is
- <a href="http://clang.llvm.org/">clang</a>. Clang is a C family
- ([Objective]C/C++) compiler. Clang mostly works on Windows, but does not
- currently understand all of the Microsoft extensions to C and C++. Because of
- this, clang cannot parse the C++ standard library included with Visual Studio,
- nor parts of the Windows Platform SDK. However, most standard C programs do
- compile. Clang can be used to emit bitcode, directly emit object files or
- even linked executables using Visual Studio's <tt>link.exe</tt></p>
-
- <p>The large LLVM test suite cannot be run on the Visual Studio port at this
- time.</p>
-
- <p>Most of the tools build and work. <tt>bugpoint</tt> does build, but does
- not work.</p>
-
- <p>Additional information about the LLVM directory structure and tool chain
- can be found on the main <a href="GettingStarted.html">Getting Started</a>
- page.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="requirements"><b>Requirements</b></a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
- <p>Before you begin to use the LLVM system, review the requirements given
- below. This may save you some trouble by knowing ahead of time what hardware
- and software you will need.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="hardware"><b>Hardware</b></a>
-</h3>
-
-<div>
-
- <p>Any system that can adequately run Visual Studio 2008 is fine. The LLVM
- source tree and object files, libraries and executables will consume
- approximately 3GB.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="software"><b>Software</b></a></h3>
-<div>
-
- <p>You will need Visual Studio 2008 or higher. Earlier versions of Visual
- Studio have bugs, are not completely compatible, or do not support the C++
- standard well enough.</p>
-
- <p>You will also need the <a href="http://www.cmake.org/">CMake</a> build
- system since it generates the project files you will use to build with.</p>
-
- <p>If you would like to run the LLVM tests you will need
- <a href="http://www.python.org/">Python</a>. Versions 2.4-2.7 are known to
- work. You will need <a href="http://gnuwin32.sourceforge.net/">"GnuWin32"</a>
- tools, too.</p>
-
- <p>Do not install the LLVM directory tree into a path containing spaces (e.g.
- C:\Documents and Settings\...) as the configure step will fail.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="quickstart"><b>Getting Started</b></a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Here's the short story for getting up and running quickly with LLVM:</p>
-
-<ol>
- <li>Read the documentation.</li>
- <li>Seriously, read the documentation.</li>
- <li>Remember that you were warned twice about reading the documentation.</li>
-
- <li>Get the Source Code
- <ul>
- <li>With the distributed files:
- <ol>
- <li><tt>cd <i>where-you-want-llvm-to-live</i></tt>
- <li><tt>gunzip --stdout llvm-<i>version</i>.tar.gz | tar -xvf -</tt>
- <i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;or use WinZip</i>
- <li><tt>cd llvm</tt></li>
- </ol></li>
-
- <li>With anonymous Subversion access:
- <ol>
- <li><tt>cd <i>where-you-want-llvm-to-live</i></tt></li>
- <li><tt>svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm</tt></li>
- <li><tt>cd llvm</tt></li>
- </ol></li>
- </ul></li>
-
- <li> Use <a href="http://www.cmake.org/">CMake</a> to generate up-to-date
- project files:
- <ul>
- <li>Once CMake is installed then the simplest way is to just start the
- CMake GUI, select the directory where you have LLVM extracted to, and the
- default options should all be fine. One option you may really want to
- change, regardless of anything else, might be the CMAKE_INSTALL_PREFIX
- setting to select a directory to INSTALL to once compiling is complete,
- although installation is not mandatory for using LLVM. Another important
- option is LLVM_TARGETS_TO_BUILD, which controls the LLVM target
- architectures that are included on the build.
- <li>See the <a href="CMake.html">LLVM CMake guide</a> for
- detailed information about how to configure the LLVM
- build.</li>
- </ul>
- </li>
-
- <li>Start Visual Studio
- <ul>
- <li>In the directory you created the project files will have
- an <tt>llvm.sln</tt> file, just double-click on that to open
- Visual Studio.</li>
- </ul></li>
-
- <li>Build the LLVM Suite:
- <ul>
- <li>The projects may still be built individually, but
- to build them all do not just select all of them in batch build (as some
- are meant as configuration projects), but rather select and build just
- the ALL_BUILD project to build everything, or the INSTALL project, which
- first builds the ALL_BUILD project, then installs the LLVM headers, libs,
- and other useful things to the directory set by the CMAKE_INSTALL_PREFIX
- setting when you first configured CMake.</li>
- <li>The Fibonacci project is a sample program that uses the JIT.
- Modify the project's debugging properties to provide a numeric
- command line argument or run it from the command line. The
- program will print the corresponding fibonacci value.</li>
- </ul></li>
-
- <li>Test LLVM on Visual Studio:
- <ul>
- <li>If %PATH% does not contain GnuWin32, you may specify LLVM_LIT_TOOLS_DIR
- on CMake for the path to GnuWin32.</li>
- <li>You can run LLVM tests by merely building the project
- "check". The test results will be shown in the VS output
- window.</li>
- </ul>
- </li>
-
- <!-- FIXME: Is it up-to-date? -->
- <li>Test LLVM:
- <ul>
- <li>The LLVM tests can be run by <tt>cd</tt>ing to the llvm source directory
- and running:
-
-<div class="doc_code">
-<pre>
-% llvm-lit test
-</pre>
-</div>
-
- <p>Note that quite a few of these test will fail.</p>
- </li>
-
- <li>A specific test or test directory can be run with:
-
-<div class="doc_code">
-<pre>
-% llvm-lit test/path/to/test
-</pre>
-</div>
- </li>
- </ul>
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="tutorial">An Example Using the LLVM Tool Chain</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<ol>
- <li><p>First, create a simple C file, name it 'hello.c':</p>
-
-<div class="doc_code">
-<pre>
-#include &lt;stdio.h&gt;
-int main() {
- printf("hello world\n");
- return 0;
-}
-</pre></div></li>
-
- <li><p>Next, compile the C file into a LLVM bitcode file:</p>
-
-<div class="doc_code">
-<pre>
-% clang -c hello.c -emit-llvm -o hello.bc
-</pre>
-</div>
-
- <p>This will create the result file <tt>hello.bc</tt> which is the LLVM
- bitcode that corresponds the the compiled program and the library
- facilities that it required. You can execute this file directly using
- <tt>lli</tt> tool, compile it to native assembly with the <tt>llc</tt>,
- optimize or analyze it further with the <tt>opt</tt> tool, etc.</p>
-
- <p>Alternatively you can directly output an executable with clang with:
- </p>
-
-<div class="doc_code">
-<pre>
-% clang hello.c -o hello.exe
-</pre>
-</div>
-
- <p>The <tt>-o hello.exe</tt> is required because clang currently outputs
- <tt>a.out</tt> when neither <tt>-o</tt> nor <tt>-c</tt> are given.</p>
-
- <li><p>Run the program using the just-in-time compiler:</p>
-
-<div class="doc_code">
-<pre>
-% lli hello.bc
-</pre>
-</div>
-
- <li><p>Use the <tt>llvm-dis</tt> utility to take a look at the LLVM assembly
- code:</p>
-
-<div class="doc_code">
-<pre>
-% llvm-dis &lt; hello.bc | more
-</pre>
-</div></li>
-
- <li><p>Compile the program to object code using the LLC code generator:</p>
-
-<div class="doc_code">
-<pre>
-% llc -filetype=obj hello.bc
-</pre>
-</div></li>
-
- <li><p>Link to binary using Microsoft link:</p>
-
-<div class="doc_code">
-<pre>
-% link hello.obj -defaultlib:libcmt
-</pre>
-</div>
-
- <li><p>Execute the native code program:</p>
-
-<div class="doc_code">
-<pre>
-% hello.exe
-</pre>
-</div></li>
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="problems">Common Problems</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>If you are having problems building or using LLVM, or if you have any other
-general questions about LLVM, please consult the <a href="FAQ.html">Frequently
-Asked Questions</a> page.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="links">Links</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document is just an <b>introduction</b> to how to use LLVM to do
-some simple things... there are many more interesting and complicated things
-that you can do that aren't documented here (but we'll gladly accept a patch
-if you want to write something up!). For more information about LLVM, check
-out:</p>
-
-<ul>
- <li><a href="http://llvm.org/">LLVM homepage</a></li>
- <li><a href="http://llvm.org/doxygen/">LLVM doxygen tree</a></li>
-</ul>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-01-25 23:00:23 +0100 (Wed, 25 Jan 2012) $
-</address>
-</body>
-</html>
diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst
new file mode 100644
index 0000000..35f97f0
--- /dev/null
+++ b/docs/GettingStartedVS.rst
@@ -0,0 +1,234 @@
+.. _winvs:
+
+==================================================================
+Getting Started with the LLVM System using Microsoft Visual Studio
+==================================================================
+
+.. contents::
+ :local:
+
+
+Overview
+========
+Welcome to LLVM on Windows! This document only covers LLVM on Windows using
+Visual Studio, not mingw or cygwin. In order to get started, you first need to
+know some basic information.
+
+There are many different projects that compose LLVM. The first is the LLVM
+suite. This contains all of the tools, libraries, and header files needed to
+use LLVM. It contains an assembler, disassembler,
+bitcode analyzer and bitcode optimizer. It also contains a test suite that can
+be used to test the LLVM tools.
+
+Another useful project on Windows is `Clang <http://clang.llvm.org/>`_.
+Clang is a C family ([Objective]C/C++) compiler. Clang mostly works on
+Windows, but does not currently understand all of the Microsoft extensions
+to C and C++. Because of this, clang cannot parse the C++ standard library
+included with Visual Studio, nor parts of the Windows Platform SDK. However,
+most standard C programs do compile. Clang can be used to emit bitcode,
+directly emit object files or even linked executables using Visual Studio's
+``link.exe``.
+
+The large LLVM test suite cannot be run on the Visual Studio port at this
+time.
+
+Most of the tools build and work. ``bugpoint`` does build, but does
+not work.
+
+Additional information about the LLVM directory structure and tool chain
+can be found on the main `Getting Started <GettingStarted.html>`_ page.
+
+
+Requirements
+============
+Before you begin to use the LLVM system, review the requirements given
+below. This may save you some trouble by knowing ahead of time what hardware
+and software you will need.
+
+Hardware
+--------
+Any system that can adequately run Visual Studio 2008 is fine. The LLVM
+source tree and object files, libraries and executables will consume
+approximately 3GB.
+
+Software
+--------
+You will need Visual Studio 2008 or higher. Earlier versions of Visual
+Studio have bugs, are not completely compatible, or do not support the C++
+standard well enough.
+
+You will also need the `CMake <http://www.cmake.org/>`_ build system since it
+generates the project files you will use to build with.
+
+If you would like to run the LLVM tests you will need `Python
+<http://www.python.org/>`_. Versions 2.4-2.7 are known to work. You will need
+`GnuWin32 <http://gnuwin32.sourceforge.net/>`_ tools, too.
+
+Do not install the LLVM directory tree into a path containing spaces (e.g.
+``C:\Documents and Settings\...``) as the configure step will fail.
+
+
+Getting Started
+===============
+Here's the short story for getting up and running quickly with LLVM:
+
+1. Read the documentation.
+2. Seriously, read the documentation.
+3. Remember that you were warned twice about reading the documentation.
+4. Get the Source Code
+
+ * With the distributed files:
+
+ 1. ``cd <where-you-want-llvm-to-live>``
+ 2. ``gunzip --stdout llvm-VERSION.tar.gz | tar -xvf -``
+ (*or use WinZip*)
+ 3. ``cd llvm``
+
+ * With anonymous Subversion access:
+
+ 1. ``cd <where-you-want-llvm-to-live>``
+ 2. ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm``
+ 3. ``cd llvm``
+
+5. Use `CMake <http://www.cmake.org/>`_ to generate up-to-date project files:
+
+   * Once CMake is installed, the simplest way is to just start the
+     CMake GUI, select the directory where you extracted LLVM, and
+ the default options should all be fine. One option you may really
+ want to change, regardless of anything else, might be the
+ ``CMAKE_INSTALL_PREFIX`` setting to select a directory to INSTALL to
+ once compiling is complete, although installation is not mandatory for
+ using LLVM. Another important option is ``LLVM_TARGETS_TO_BUILD``,
+ which controls the LLVM target architectures that are included on the
+ build.
+ * See the `LLVM CMake guide <CMake.html>`_ for detailed information about
+ how to configure the LLVM build.
+
+6. Start Visual Studio
+
+   * The directory in which you created the project files will have an
+     ``llvm.sln`` file; just double-click on that to open Visual Studio.
+
+7. Build the LLVM Suite:
+
+   * The projects may still be built individually, but to build them all, do
+ not just select all of them in batch build (as some are meant as
+ configuration projects), but rather select and build just the
+ ``ALL_BUILD`` project to build everything, or the ``INSTALL`` project,
+ which first builds the ``ALL_BUILD`` project, then installs the LLVM
+ headers, libs, and other useful things to the directory set by the
+ ``CMAKE_INSTALL_PREFIX`` setting when you first configured CMake.
+ * The Fibonacci project is a sample program that uses the JIT. Modify the
+ project's debugging properties to provide a numeric command line argument
+ or run it from the command line. The program will print the
+     corresponding Fibonacci value.
+
+8. Test LLVM on Visual Studio:
+
+ * If ``%PATH%`` does not contain GnuWin32, you may specify
+ ``LLVM_LIT_TOOLS_DIR`` on CMake for the path to GnuWin32.
+ * You can run LLVM tests by merely building the project "check". The test
+ results will be shown in the VS output window.
+
+.. FIXME: Is it up-to-date?
+
+9. Test LLVM:
+
+ * The LLVM tests can be run by changing directory to the llvm source
+ directory and running:
+
+ .. code-block:: bat
+
+ C:\..\llvm> llvm-lit test
+
+   Note that quite a few of these tests will fail.
+
+ A specific test or test directory can be run with:
+
+ .. code-block:: bat
+
+ C:\..\llvm> llvm-lit test/path/to/test
+
+
+An Example Using the LLVM Tool Chain
+====================================
+
+1. First, create a simple C file, name it '``hello.c``':
+
+ .. code-block:: c
+
+ #include <stdio.h>
+ int main() {
+ printf("hello world\n");
+ return 0;
+ }
+
+2. Next, compile the C file into an LLVM bitcode file:
+
+ .. code-block:: bat
+
+ C:\..> clang -c hello.c -emit-llvm -o hello.bc
+
+   This will create the result file ``hello.bc``, which is the LLVM bitcode
+   that corresponds to the compiled program and the library facilities that
+   it required. You can execute this file directly using the ``lli`` tool,
+   compile it to native assembly with ``llc``, optimize or analyze it
+   further with the ``opt`` tool, etc.
+
+ Alternatively you can directly output an executable with clang with:
+
+ .. code-block:: bat
+
+ C:\..> clang hello.c -o hello.exe
+
+ The ``-o hello.exe`` is required because clang currently outputs ``a.out``
+   when neither ``-o`` nor ``-c`` is given.
+
+3. Run the program using the just-in-time compiler:
+
+ .. code-block:: bat
+
+ C:\..> lli hello.bc
+
+4. Use the ``llvm-dis`` utility to take a look at the LLVM assembly code:
+
+ .. code-block:: bat
+
+ C:\..> llvm-dis < hello.bc | more
+
+5. Compile the program to object code using the LLC code generator:
+
+ .. code-block:: bat
+
+ C:\..> llc -filetype=obj hello.bc
+
+6. Link to binary using Microsoft link:
+
+ .. code-block:: bat
+
+ C:\..> link hello.obj -defaultlib:libcmt
+
+7. Execute the native code program:
+
+ .. code-block:: bat
+
+ C:\..> hello.exe
+
+
+Common Problems
+===============
+If you are having problems building or using LLVM, or if you have any other
+general questions about LLVM, please consult the `Frequently Asked Questions
+<FAQ.html>`_ page.
+
+
+Links
+=====
+This document is just an **introduction** to how to use LLVM to do some simple
+things... there are many more interesting and complicated things that you can
+do that aren't documented here (but we'll gladly accept a patch if you want to
+write something up!). For more information about LLVM, check out:
+
+* `LLVM homepage <http://llvm.org/>`_
+* `LLVM doxygen tree <http://llvm.org/doxygen/>`_
+
diff --git a/docs/GoldPlugin.html b/docs/GoldPlugin.html
index 2c08bd0..1e99a5a 100644
--- a/docs/GoldPlugin.html
+++ b/docs/GoldPlugin.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>LLVM gold plugin</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
diff --git a/docs/HowToAddABuilder.html b/docs/HowToAddABuilder.html
index 0de2dac..985b30e 100644
--- a/docs/HowToAddABuilder.html
+++ b/docs/HowToAddABuilder.html
@@ -6,7 +6,7 @@
<title>
How To Add Your Build Configuration To LLVM Buildbot Infrastructure
</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
diff --git a/docs/HowToReleaseLLVM.html b/docs/HowToReleaseLLVM.html
index 30c4f0c..6fdec2c 100644
--- a/docs/HowToReleaseLLVM.html
+++ b/docs/HowToReleaseLLVM.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>How To Release LLVM To The Public</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -476,7 +476,7 @@ $ tar -cvf - llvm-test-<i>X.Y</i>rc1 | gzip &gt; llvm-test-<i>X.Y</i>rc1.src.t
<p>Review the documentation and ensure that it is up to date. The "Release
Notes" must be updated to reflect new features, bug fixes, new known issues,
and changes in the list of supported platforms. The "Getting Started Guide"
- should be updated to reflect the new release version number tag avaiable from
+ should be updated to reflect the new release version number tag available from
Subversion and changes in basic system requirements. Merge both changes from
mainline into the release branch.</p>
@@ -575,7 +575,7 @@ $ svn copy https://llvm.org/svn/llvm-project/test-suite/branches/release_XY \
src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
<br>
- Last modified: $Date: 2011-10-31 12:21:59 +0100 (Mon, 31 Oct 2011) $
+ Last modified: $Date: 2012-07-31 09:05:57 +0200 (Tue, 31 Jul 2012) $
</address>
</body>
</html>
diff --git a/docs/HowToSubmitABug.html b/docs/HowToSubmitABug.html
index 0071ec6..ef7cf9e 100644
--- a/docs/HowToSubmitABug.html
+++ b/docs/HowToSubmitABug.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>How to submit an LLVM bug report</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -31,9 +31,6 @@
<a href="http://misha.brukman.net">Misha Brukman</a></p>
</div>
</td>
-<td class="right">
- <img src="img/Debugging.gif" alt="Debugging" width="444" height="314">
-</td>
</tr>
</table>
@@ -226,12 +223,12 @@ we have chased down ended up being bugs in the program being compiled, not
LLVM.</p>
<p>Once you determine that the program itself is not buggy, you should choose
-which code generator you wish to compile the program with (e.g. C backend, the
-JIT, or LLC) and optionally a series of LLVM passes to run. For example:</p>
+which code generator you wish to compile the program with (e.g. LLC or the JIT)
+and optionally a series of LLVM passes to run. For example:</p>
<div class="doc_code">
<p><tt>
-<b>bugpoint</b> -run-cbe [... optzn passes ...] file-to-test.bc --args -- [program arguments]</tt></p>
+<b>bugpoint</b> -run-llc [... optzn passes ...] file-to-test.bc --args -- [program arguments]</tt></p>
</div>
<p><tt>bugpoint</tt> will try to narrow down your list of passes to the one pass
@@ -341,7 +338,7 @@ the following:</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
<br>
- Last modified: $Date: 2011-10-31 12:21:59 +0100 (Mon, 31 Oct 2011) $
+ Last modified: $Date: 2012-06-14 18:52:55 +0200 (Thu, 14 Jun 2012) $
</address>
</body>
diff --git a/docs/LLVMBuild.html b/docs/LLVMBuild.html
index a8420dd..9e7f8c7 100644
--- a/docs/LLVMBuild.html
+++ b/docs/LLVMBuild.html
@@ -3,7 +3,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>LLVMBuild Documentation</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
diff --git a/docs/LangRef.html b/docs/LangRef.html
index 5261230..946380e 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -7,7 +7,7 @@
<meta name="author" content="Chris Lattner">
<meta name="description"
content="LLVM Assembly Language Reference Manual.">
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -257,6 +257,8 @@
<li><a href="#int_exp">'<tt>llvm.exp.*</tt>' Intrinsic</a></li>
<li><a href="#int_log">'<tt>llvm.log.*</tt>' Intrinsic</a></li>
<li><a href="#int_fma">'<tt>llvm.fma.*</tt>' Intrinsic</a></li>
+ <li><a href="#int_fabs">'<tt>llvm.fabs.*</tt>' Intrinsic</a></li>
+ <li><a href="#int_floor">'<tt>llvm.floor.*</tt>' Intrinsic</a></li>
</ol>
</li>
<li><a href="#int_manip">Bit Manipulation Intrinsics</a>
@@ -277,6 +279,11 @@
<li><a href="#int_umul_overflow">'<tt>llvm.umul.with.overflow.*</tt> Intrinsics</a></li>
</ol>
</li>
+ <li><a href="#spec_arithmetic">Specialised Arithmetic Intrinsics</a>
+ <ol>
+ <li><a href="#fmuladd">'<tt>llvm.fmuladd</tt> Intrinsic</a></li>
+ </ol>
+ </li>
<li><a href="#int_fp16">Half Precision Floating Point Intrinsics</a>
<ol>
<li><a href="#int_convert_to_fp16">'<tt>llvm.convert.to.fp16</tt>' Intrinsic</a></li>
@@ -307,12 +314,16 @@
'<tt>llvm.annotation.*</tt>' Intrinsic</a></li>
<li><a href="#int_trap">
'<tt>llvm.trap</tt>' Intrinsic</a></li>
+ <li><a href="#int_debugtrap">
+ '<tt>llvm.debugtrap</tt>' Intrinsic</a></li>
<li><a href="#int_stackprotector">
'<tt>llvm.stackprotector</tt>' Intrinsic</a></li>
- <li><a href="#int_objectsize">
+ <li><a href="#int_objectsize">
'<tt>llvm.objectsize</tt>' Intrinsic</a></li>
- <li><a href="#int_expect">
+ <li><a href="#int_expect">
'<tt>llvm.expect</tt>' Intrinsic</a></li>
+ <li><a href="#int_donothing">
+ '<tt>llvm.donothing</tt>' Intrinsic</a></li>
</ol>
</li>
</ol>
@@ -831,9 +842,32 @@ define i32 @main() { <i>; i32()* </i>&nbsp;
<p>Global variables define regions of memory allocated at compilation time
instead of run-time. Global variables may optionally be initialized, may
have an explicit section to be placed in, and may have an optional explicit
- alignment specified. A variable may be defined as "thread_local", which
+ alignment specified.</p>
+
+<p>A variable may be defined as <tt>thread_local</tt>, which
means that it will not be shared by threads (each thread will have a
- separated copy of the variable). A variable may be defined as a global
+ separated copy of the variable). Not all targets support thread-local
+ variables. Optionally, a TLS model may be specified:</p>
+
+<dl>
+ <dt><b><tt>localdynamic</tt></b>:</dt>
+ <dd>For variables that are only used within the current shared library.</dd>
+
+ <dt><b><tt>initialexec</tt></b>:</dt>
+ <dd>For variables in modules that will not be loaded dynamically.</dd>
+
+ <dt><b><tt>localexec</tt></b>:</dt>
+ <dd>For variables defined in the executable and only used within it.</dd>
+</dl>
+
+<p>The models correspond to the ELF TLS models; see
+ <a href="http://people.redhat.com/drepper/tls.pdf">ELF
+ Handling For Thread-Local Storage</a> for more information on under which
+ circumstances the different models may be used. The target may choose a
+ different TLS model if the specified model is not supported, or if a better
+ choice of model can be made.</p>
+
+<p>A variable may be defined as a global
"constant," which indicates that the contents of the variable
will <b>never</b> be modified (enabling better optimization, allowing the
global data to be placed in the read-only section of an executable, etc).
@@ -886,6 +920,13 @@ define i32 @main() { <i>; i32()* </i>&nbsp;
@G = addrspace(5) constant float 1.0, section "foo", align 4
</pre>
+<p>The following example defines a thread-local global with
+ the <tt>initialexec</tt> TLS model:</p>
+
+<pre class="doc_code">
+@G = thread_local(initialexec) global i32 0, align 4
+</pre>
+
</div>
@@ -1048,7 +1089,7 @@ declare signext i8 @returns_signed_char()
value to the function. The attribute implies that a hidden copy of the
pointee
is made between the caller and the callee, so the callee is unable to
- modify the value in the callee. This attribute is only valid on LLVM
+ modify the value in the caller. This attribute is only valid on LLVM
pointer arguments. It is generally used to pass structs and arrays by
value, but is also valid on pointers to scalars. The copy is considered
to belong to the caller not the callee (for example,
@@ -1167,6 +1208,13 @@ define void @f() optsize { ... }
may make calls to the function faster, at the cost of extra program
startup time if the function is not called during program startup.</dd>
+ <dt><tt><b>ia_nsdialect</b></tt></dt>
+ <dd>This attribute indicates the associated inline assembly call is using a
+ non-standard assembly dialect. The standard dialect is ATT, which is
+ assumed when this attribute is not present. When present, the dialect
+ is assumed to be Intel. Currently, ATT and Intel are the only supported
+ dialects.</dd>
+
<dt><tt><b>inlinehint</b></tt></dt>
<dd>This attribute indicates that the source code contained a hint that inlining
this function is desirable (such as the "inline" keyword in C/C++). It
@@ -1392,7 +1440,7 @@ target datalayout = "<i>layout specification</i>"
<li>If no match is found, and the type sought is an integer type, then the
smallest integer type that is larger than the bitwidth of the sought type
is used. If none of the specifications are larger than the bitwidth then
- the the largest integer type is used. For example, given the default
+ the largest integer type is used. For example, given the default
specifications above, the i7 type will use the alignment of i8 (next
largest) while both i65 and i256 will use the alignment of i64 (largest
specified).</li>
@@ -2287,8 +2335,9 @@ in signal handlers).</p>
by <tt>0xM</tt> followed by 32 hexadecimal digits. The IEEE 128-bit format
is represented by <tt>0xL</tt> followed by 32 hexadecimal digits; no
currently supported target uses this format. Long doubles will only work if
- they match the long double format on your target. All hexadecimal formats
- are big-endian (sign bit at the left).</p>
+ they match the long double format on your target. The IEEE 16-bit format
+ (half precision) is represented by <tt>0xH</tt> followed by 4 hexadecimal
+ digits. All hexadecimal formats are big-endian (sign bit at the left).</p>
<p>There are no constants of type x86mmx.</p>
</div>
@@ -2739,7 +2788,7 @@ second_end:
make it fit in <tt>TYPE</tt>.</dd>
<dt><b><tt>inttoptr (CST to TYPE)</tt></b></dt>
- <dd>Convert a integer constant to a pointer constant. TYPE must be a pointer
+ <dd>Convert an integer constant to a pointer constant. TYPE must be a pointer
type. CST must be of integer type. The CST value is zero extended,
truncated, or unchanged to make it fit in a pointer size. This one is
<i>really</i> dangerous!</dd>
@@ -2826,8 +2875,9 @@ i32 (i32) asm "bswap $0", "=r,r"
</pre>
<p>Inline assembler expressions may <b>only</b> be used as the callee operand of
- a <a href="#i_call"><tt>call</tt> instruction</a>. Thus, typically we
- have:</p>
+ a <a href="#i_call"><tt>call</tt></a> or an
+ <a href="#i_invoke"><tt>invoke</tt></a> instruction.
+ Thus, typically we have:</p>
<pre class="doc_code">
%X = call i32 asm "<a href="#int_bswap">bswap</a> $0", "=r,r"(i32 %Y)
@@ -3051,6 +3101,8 @@ call void @llvm.dbg.value(metadata !24, i64 0, metadata !25)
<li>The range should not represent the full or empty set. That is,
<tt>a!=b</tt>. </li>
</ul>
+<p>In addition, the pairs must be in signed order of the lower bound and
+ they must be non-contiguous.</p>
<p>Examples:</p>
<div class="doc_code">
@@ -3058,10 +3110,12 @@ call void @llvm.dbg.value(metadata !24, i64 0, metadata !25)
%a = load i8* %x, align 1, !range !0 ; Can only be 0 or 1
%b = load i8* %y, align 1, !range !1 ; Can only be 255 (-1), 0 or 1
%c = load i8* %z, align 1, !range !2 ; Can only be 0, 1, 3, 4 or 5
+ %d = load i8* %z, align 1, !range !3 ; Can only be -2, -1, 3, 4 or 5
...
!0 = metadata !{ i8 0, i8 2 }
!1 = metadata !{ i8 255, i8 2 }
!2 = metadata !{ i8 0, i8 2, i8 3, i8 6 }
+!3 = metadata !{ i8 -2, i8 0, i8 3, i8 6 }
</pre>
</div>
</div>
@@ -4727,7 +4781,7 @@ IfUnequal:
<h5>Arguments:</h5>
<p>The first two operands of a '<tt>shufflevector</tt>' instruction are vectors
- with types that match each other. The third argument is a shuffle mask whose
+ with the same type. The third argument is a shuffle mask whose
element type is always 'i32'. The result of the instruction is a vector
whose length is the same as the shuffle mask and whose element type is the
same as the element type of the first two operands.</p>
@@ -7464,6 +7518,74 @@ LLVM</a>.</p>
</div>
+<!-- _______________________________________________________________________ -->
+<h4>
+ <a name="int_fabs">'<tt>llvm.fabs.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<p>This is an overloaded intrinsic. You can use <tt>llvm.fabs</tt> on any
+ floating point or vector of floating point type. Not all targets support all
+ types, however.</p>
+
+<pre>
+ declare float @llvm.fabs.f32(float %Val)
+ declare double @llvm.fabs.f64(double %Val)
+ declare x86_fp80 @llvm.fabs.f80(x86_fp80 %Val)
+ declare fp128 @llvm.fabs.f128(fp128 %Val)
+ declare ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128 %Val)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.fabs.*</tt>' intrinsics return the absolute value of
+ the operand.</p>
+
+<h5>Arguments:</h5>
+<p>The argument and return value are floating point numbers of the same
+ type.</p>
+
+<h5>Semantics:</h5>
+<p>This function returns the same values as the libm <tt>fabs</tt> functions
+ would, and handles error conditions in the same way.</p>
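+
+<h5>Example:</h5>
+<p>A minimal sketch (<tt>%x</tt> is an arbitrary <tt>float</tt> value):</p>
+<pre>
+  %abs = call float @llvm.fabs.f32(float %x)    <i>; yields |%x|</i>
+</pre>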
+
+</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
+ <a name="int_floor">'<tt>llvm.floor.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<p>This is an overloaded intrinsic. You can use <tt>llvm.floor</tt> on any
+ floating point or vector of floating point type. Not all targets support all
+ types, however.</p>
+
+<pre>
+ declare float @llvm.floor.f32(float %Val)
+ declare double @llvm.floor.f64(double %Val)
+ declare x86_fp80 @llvm.floor.f80(x86_fp80 %Val)
+ declare fp128 @llvm.floor.f128(fp128 %Val)
+ declare ppc_fp128 @llvm.floor.ppcf128(ppc_fp128 %Val)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.floor.*</tt>' intrinsics return the floor of
+ the operand.</p>
+
+<h5>Arguments:</h5>
+<p>The argument and return value are floating point numbers of the same
+ type.</p>
+
+<h5>Semantics:</h5>
+<p>This function returns the same values as the libm <tt>floor</tt> functions
+ would, and handles error conditions in the same way.</p>
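+
+<h5>Example:</h5>
+<p>A minimal sketch (<tt>%x</tt> is an arbitrary <tt>double</tt> value):</p>
+<pre>
+  %f = call double @llvm.floor.f64(double %x)    <i>; e.g. 2.7 yields 2.0</i>
+</pre>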
+
+</div>
+
</div>
<!-- ======================================================================= -->
@@ -7940,12 +8062,59 @@ LLVM</a>.</p>
<!-- ======================================================================= -->
<h3>
+ <a name="spec_arithmetic">Specialised Arithmetic Intrinsics</a>
+</h3>
+
+<!-- _______________________________________________________________________ -->
+
+<h4>
+ <a name="fmuladd">'<tt>llvm.fmuladd.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<pre>
+ declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
+ declare double @llvm.fmuladd.f64(double %a, double %b, double %c)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.fmuladd.*</tt>' intrinsic functions represent multiply-add
+expressions that can be fused if the code generator determines that the fused
+expression would be legal and efficient.</p>
+
+<h5>Arguments:</h5>
+<p>The '<tt>llvm.fmuladd.*</tt>' intrinsics each take three arguments: two
+multiplicands, a and b, and an addend c.</p>
+
+<h5>Semantics:</h5>
+<p>The expression:</p>
+<pre>
+ %0 = call float @llvm.fmuladd.f32(%a, %b, %c)
+</pre>
+<p>is equivalent to the expression a * b + c, except that rounding will not be
+performed between the multiplication and addition steps if the code generator
+fuses the operations. Fusion is not guaranteed, even if the target platform
+supports it. If a fused multiply-add is required, the corresponding llvm.fma.*
+intrinsic function should be used instead.</p>
+
+<h5>Examples:</h5>
+<pre>
+ %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields {float}:r2 = (a * b) + c
+</pre>
+
+</div>
+
+<!-- ======================================================================= -->
+<h3>
<a name="int_fp16">Half Precision Floating Point Intrinsics</a>
</h3>
<div>
-<p>Half precision floating point is a storage-only format. This means that it is
+<p>For most target platforms, half precision floating point is a storage-only
+ format. This means that it is
a dense encoding (in memory) but does not support computation in the
format.</p>
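+
+<p>For example, a value is typically converted to its <tt>i16</tt> storage form
+ and converted back before any computation (a sketch using the conversion
+ intrinsics in this section; <tt>%f</tt> is an arbitrary <tt>float</tt>):</p>
+
+<pre class="doc_code">
+  %h = call i16 @llvm.convert.to.fp16(float %f)       <i>; store as half</i>
+  %g = call float @llvm.convert.from.fp16(i16 %h)     <i>; reload to compute</i>
+</pre>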
@@ -8382,7 +8551,7 @@ LLVM</a>.</p>
<h5>Syntax:</h5>
<pre>
- declare void @llvm.trap()
+ declare void @llvm.trap() noreturn nounwind
</pre>
<h5>Overview:</h5>
@@ -8392,9 +8561,33 @@ LLVM</a>.</p>
<p>None.</p>
<h5>Semantics:</h5>
-<p>This intrinsics is lowered to the target dependent trap instruction. If the
+<p>This intrinsic is lowered to the target dependent trap instruction. If the
target does not have a trap instruction, this intrinsic will be lowered to
- the call of the <tt>abort()</tt> function.</p>
+ a call of the <tt>abort()</tt> function.</p>
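+
+<h5>Example:</h5>
+<p>A typical sketch, terminating a path the programmer knows is impossible:</p>
+<pre>
+  call void @llvm.trap()
+  unreachable
+</pre>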
+
+</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
+ <a name="int_debugtrap">'<tt>llvm.debugtrap</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<pre>
+ declare void @llvm.debugtrap() nounwind
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.debugtrap</tt>' intrinsic causes an execution trap intended
+ to alert an attached debugger.</p>
+
+<h5>Arguments:</h5>
+<p>None.</p>
+
+<h5>Semantics:</h5>
+<p>This intrinsic is lowered to code which is intended to cause an execution
+ trap, requesting the attention of a debugger.</p>
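+
+<h5>Example:</h5>
+<p>A minimal sketch (the instruction actually emitted is target dependent,
+ e.g. a breakpoint trap):</p>
+<pre>
+  call void @llvm.debugtrap()
+</pre>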
</div>
@@ -8441,8 +8634,8 @@ LLVM</a>.</p>
<h5>Syntax:</h5>
<pre>
- declare i32 @llvm.objectsize.i32(i8* &lt;object&gt;, i1 &lt;type&gt;)
- declare i64 @llvm.objectsize.i64(i8* &lt;object&gt;, i1 &lt;type&gt;)
+ declare i32 @llvm.objectsize.i32(i8* &lt;object&gt;, i1 &lt;min&gt;)
+ declare i64 @llvm.objectsize.i64(i8* &lt;object&gt;, i1 &lt;min&gt;)
</pre>
<h5>Overview:</h5>
@@ -8455,15 +8648,15 @@ LLVM</a>.</p>
<h5>Arguments:</h5>
<p>The <tt>llvm.objectsize</tt> intrinsic takes two arguments. The first
argument is a pointer to or into the <tt>object</tt>. The second argument
- is a boolean 0 or 1. This argument determines whether you want the
- maximum (0) or minimum (1) bytes remaining. This needs to be a literal 0 or
- 1, variables are not allowed.</p>
+ is a boolean and determines whether <tt>llvm.objectsize</tt> returns 0 (if
+ true) or -1 (if false) when the object size is unknown.
+ The second argument only accepts constants.</p>
<h5>Semantics:</h5>
-<p>The <tt>llvm.objectsize</tt> intrinsic is lowered to either a constant
- representing the size of the object concerned, or <tt>i32/i64 -1 or 0</tt>,
- depending on the <tt>type</tt> argument, if the size cannot be determined at
- compile time.</p>
+<p>The <tt>llvm.objectsize</tt> intrinsic is lowered to a constant representing
+ the size of the object concerned. If the size cannot be determined at compile
+ time, <tt>llvm.objectsize</tt> returns <tt>i32/i64 -1 or 0</tt>
+ (depending on the <tt>min</tt> argument).</p>
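+
+<h5>Example:</h5>
+<p>A sketch in which <tt>%ptr</tt> points into some object; because
+ <tt>min</tt> is true, an unknown size yields 0:</p>
+<pre>
+  %size = call i32 @llvm.objectsize.i32(i8* %ptr, i1 true)
+</pre>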
</div>
<!-- _______________________________________________________________________ -->
@@ -8492,6 +8685,30 @@ LLVM</a>.</p>
<p>This intrinsic is lowered to the <tt>val</tt>.</p>
</div>
+<!-- _______________________________________________________________________ -->
+<h4>
+ <a name="int_donothing">'<tt>llvm.donothing</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<pre>
+ declare void @llvm.donothing() nounwind readnone
+</pre>
+
+<h5>Overview:</h5>
+<p>The <tt>llvm.donothing</tt> intrinsic doesn't perform any operation. It's the
+only intrinsic that can be called with an invoke instruction.</p>
+
+<h5>Arguments:</h5>
+<p>None.</p>
+
+<h5>Semantics:</h5>
+<p>This intrinsic does nothing, and it's removed by optimizers and ignored by
+codegen.</p>
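+
+<h5>Example:</h5>
+<p>A sketch of the one permitted <tt>invoke</tt> use:</p>
+<pre>
+  invoke void @llvm.donothing() to label %cont unwind label %lpad
+</pre>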
+</div>
+
</div>
</div>
@@ -8505,7 +8722,7 @@ LLVM</a>.</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-04-16 21:39:33 +0200 (Mon, 16 Apr 2012) $
+ Last modified: $Date: 2012-08-10 02:00:22 +0200 (Fri, 10 Aug 2012) $
</address>
</body>
diff --git a/docs/Lexicon.html b/docs/Lexicon.html
deleted file mode 100644
index dbb7f9b..0000000
--- a/docs/Lexicon.html
+++ /dev/null
@@ -1,292 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>The LLVM Lexicon</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
- <meta name="author" content="Various">
- <meta name="description"
- content="A glossary of terms used with the LLVM project.">
-</head>
-<body>
-<h1>The LLVM Lexicon</h1>
-<p class="doc_warning">NOTE: This document is a work in progress!</p>
-<!-- *********************************************************************** -->
-<h2>Table Of Contents</h2>
-<!-- *********************************************************************** -->
-<div>
- <table>
- <tr><th colspan="8"><b>- <a href="#A">A</a> -</b></th></tr>
- <tr>
- <td><a href="#ADCE">ADCE</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#B">B</a> -</b></th></tr>
- <tr>
- <td><a href="#BURS">BURS</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#C">C</a> -</b></th></tr>
- <tr>
- <td><a href="#CSE">CSE</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#D">D</a> -</b></th></tr>
- <tr>
- <td><a href="#DAG">DAG</a></td>
- <td><a href="#Derived_Pointer">Derived Pointer</a></td>
- <td><a href="#DSA">DSA</a></td>
- <td><a href="#DSE">DSE</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#F">F</a> -</b></th></tr>
- <tr>
- <td><a href="#FCA">FCA</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#G">G</a> -</b></th></tr>
- <tr>
- <td><a href="#GC">GC</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#I">I</a> -</b></th></tr>
- <tr>
- <td><a href="#IPA">IPA</a></td>
- <td><a href="#IPO">IPO</a></td>
- <td><a href="#ISel">ISel</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#L">L</a> -</b></th></tr>
- <tr>
- <td><a href="#LCSSA">LCSSA</a></td>
- <td><a href="#LICM">LICM</a></td>
- <td><a href="#Load-VN">Load-VN</a></td>
- <td><a href="#LTO">LTO</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#M">M</a> -</b></th></tr>
- <tr>
- <td><a href="#MC">MC</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#O">O</a> -</b></th></tr>
- <tr>
- <td><a href="#Object_Pointer">Object Pointer</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#P">P</a> -</b></th></tr>
- <tr>
- <td><a href="#PRE">PRE</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#R">R</a> -</b></th></tr>
- <tr>
- <td><a href="#RAUW">RAUW</a></td>
- <td><a href="#Reassociation">Reassociation</a></td>
- <td><a href="#Root">Root</a></td>
- </tr>
- <tr><th colspan="8"><b>- <a href="#S">S</a> -</b></th></tr>
- <tr>
- <td><a href="#Safe_Point">Safe Point</a></td>
- <td><a href="#SCC">SCC</a></td>
- <td><a href="#SCCP">SCCP</a></td>
- <td><a href="#SDISel">SDISel</a></td>
- <td><a href="#SRoA">SRoA</a></td>
- <td><a href="#Stack_Map">Stack Map</a></td>
- </tr>
- </table>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>Definitions</h2>
-<!-- *********************************************************************** -->
-<div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="A">- A -</a></h3>
-<div>
- <dl>
- <dt><a name="ADCE"><b>ADCE</b></a></dt>
- <dd>Aggressive Dead Code Elimination</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="B">- B -</a></h3>
-<div>
- <dl>
- <dt><a name="BURS"><b>BURS</b></a></dt>
- <dd>Bottom Up Rewriting System&mdash;A method of instruction selection for
- code generation. An example is the <a
-href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="C">- C -</a></h3>
-<div>
- <dl>
- <dt><a name="CSE"><b>CSE</b></a></dt>
- <dd>Common Subexpression Elimination. An optimization that removes common
- subexpression compuation. For example <tt>(a+b)*(a+b)</tt> has two
- subexpressions that are the same: <tt>(a+b)</tt>. This optimization would
- perform the addition only once and then perform the multiply (but only if
- it's compulationally correct/safe).
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="D">- D -</a></h3>
-<div>
- <dl>
- <dt><a name="DAG"><b>DAG</b></a></dt>
- <dd>Directed Acyclic Graph</dd>
- <dt><a name="Derived_Pointer"><b>Derived Pointer</b></a></dt>
- <dd>A pointer to the interior of an object, such that a garbage collector
- is unable to use the pointer for reachability analysis. While a derived
- pointer is live, the corresponding object pointer must be kept in a root,
- otherwise the collector might free the referenced object. With copying
- collectors, derived pointers pose an additional hazard that they may be
- invalidated at any <a href="Safe_Point">safe point</a>. This term is used in
- opposition to <a href="#Object_Pointer">object pointer</a>.</dd>
- <dt><a name="DSA"><b>DSA</b></a></dt>
- <dd>Data Structure Analysis</dd>
- <dt><a name="DSE"><b>DSE</b></a></dt>
- <dd>Dead Store Elimination</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="F">- F -</a></h3>
-<div>
- <dl>
- <dt><a name="FCA"><b>FCA</b></a></dt>
- <dd>First Class Aggregate</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="G">- G -</a></h3>
-<div>
- <dl>
- <dt><a name="GC"><b>GC</b></a></dt>
- <dd>Garbage Collection. The practice of using reachability analysis instead
- of explicit memory management to reclaim unused memory.</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="H">- H -</a></h3>
-<div>
- <dl>
- <dt><a name="Heap"><b>Heap</b></a></dt>
- <dd>In garbage collection, the region of memory which is managed using
- reachability analysis.</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="I">- I -</a></h3>
-<div>
- <dl>
- <dt><a name="IPA"><b>IPA</b></a></dt>
- <dd>Inter-Procedural Analysis. Refers to any variety of code analysis that
- occurs between procedures, functions or compilation units (modules).</dd>
- <dt><a name="IPO"><b>IPO</b></a></dt>
- <dd>Inter-Procedural Optimization. Refers to any variety of code
- optimization that occurs between procedures, functions or compilation units
- (modules).</dd>
- <dt><a name="ISel"><b>ISel</b></a></dt>
- <dd>Instruction Selection.</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="L">- L -</a></h3>
-<div>
- <dl>
- <dt><a name="LCSSA"><b>LCSSA</b></a></dt>
- <dd>Loop-Closed Static Single Assignment Form</dd>
- <dt><a name="LICM"><b>LICM</b></a></dt>
- <dd>Loop Invariant Code Motion</dd>
- <dt><a name="Load-VN"><b>Load-VN</b></a></dt>
- <dd>Load Value Numbering</dd>
- <dt><a name="LTO"><b>LTO</b></a></dt>
- <dd>Link-Time Optimization</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="M">- M -</a></h3>
-<div>
- <dl>
- <dt><a name="MC"><b>MC</b></a></dt>
- <dd>Machine Code</dd>
- </dl>
-</div>
-<!-- _______________________________________________________________________ -->
-<h3><a name="O">- O -</a></h3>
-<div>
- <dl>
- <dt><a name="Object_Pointer"><b>Object Pointer</b></a></dt>
- <dd>A pointer to an object such that the garbage collector is able to trace
- references contained within the object. This term is used in opposition to
- <a href="#Derived_Pointer">derived pointer</a>.</dd>
- </dl>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="P">- P -</a></h3>
-<div>
- <dl>
- <dt><a name="PRE"><b>PRE</b></a></dt>
- <dd>Partial Redundancy Elimination</dd>
- </dl>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="R">- R -</a></h3>
-<div>
- <dl>
- <dt><a name="RAUW"><b>RAUW</b></a></dt> <dd>An abbreviation for Replace
- All Uses With. The functions User::replaceUsesOfWith(),
- Value::replaceAllUsesWith(), and Constant::replaceUsesOfWithOnConstant()
- implement the replacement of one Value with another by iterating over its
- def/use chain and fixing up all of the pointers to point to the new value.
- See also <a href="ProgrammersManual.html#iterate_chains">def/use chains</a>.
- </dd>
- <dt><a name="Reassociation"><b>Reassociation</b></a></dt> <dd>Rearranging
- associative expressions to promote better redundancy elimination and other
- optimization. For example, changing (A+B-A) into (B+A-A), permitting it to
- be optimized into (B+0) then (B).</dd>
- <dt><a name="Root"><b>Root</b></a></dt> <dd>In garbage collection, a
- pointer variable lying outside of the <a href="#Heap">heap</a> from which
- the collector begins its reachability analysis. In the context of code
- generation, "root" almost always refers to a "stack root" -- a local or
- temporary variable within an executing function.</dd>
- </dl>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h3><a name="S">- S -</a></h3>
-<div>
- <dl>
- <dt><a name="Safe_Point"><b>Safe Point</b></a></dt>
- <dd>In garbage collection, it is necessary to identify <a href="#Root">stack
- roots</a> so that reachability analysis may proceed. It may be infeasible to
- provide this information for every instruction, so instead the information
- may is calculated only at designated safe points. With a copying collector,
- <a href="#Derived_Pointers">derived pointers</a> must not be retained across
- safe points and <a href="#Object_Pointers">object pointers</a> must be
- reloaded from stack roots.</dd>
- <dt><a name="SDISel"><b>SDISel</b></a></dt>
- <dd>Selection DAG Instruction Selection.</dd>
- <dt><a name="SCC"><b>SCC</b></a></dt>
- <dd>Strongly Connected Component</dd>
- <dt><a name="SCCP"><b>SCCP</b></a></dt>
- <dd>Sparse Conditional Constant Propagation</dd>
- <dt><a name="SRoA"><b>SRoA</b></a></dt>
- <dd>Scalar Replacement of Aggregates</dd>
- <dt><a name="SSA"><b>SSA</b></a></dt>
- <dd>Static Single Assignment</dd>
- <dt><a name="Stack_Map"><b>Stack Map</b></a></dt>
- <dd>In garbage collection, metadata emitted by the code generator which
- identifies <a href="#Root">roots</a> within the stack frame of an executing
- function.</dd>
- </dl>
-</div>
-
-</div>
-<!-- *********************************************************************** -->
-<hr>
-<address> <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a><a
- href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a><a
- href="http://llvm.org/">The LLVM Team</a><br>
-<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
-Last modified: $Date: 2012-01-05 09:18:41 +0100 (Thu, 05 Jan 2012) $
-</address>
-<!-- vim: sw=2
--->
-</body>
-</html>
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
new file mode 100644
index 0000000..6ebe614
--- /dev/null
+++ b/docs/Lexicon.rst
@@ -0,0 +1,194 @@
+.. _lexicon:
+
+================
+The LLVM Lexicon
+================
+
+.. note::
+
+ This document is a work in progress!
+
+Definitions
+===========
+
+A
+-
+
+**ADCE**
+ Aggressive Dead Code Elimination
+
+B
+-
+
+**BURS**
+ Bottom Up Rewriting System --- A method of instruction selection for code
+ generation. An example is the `BURG
+ <http://www.program-transformation.org/Transform/BURG>`_ tool.
+
+C
+-
+
+**CSE**
+ Common Subexpression Elimination. An optimization that removes common
+ subexpression computation. For example ``(a+b)*(a+b)`` has two subexpressions
+ that are the same: ``(a+b)``. This optimization would perform the addition
+ only once and then perform the multiply (but only if it's computationally
+ correct/safe).
+
+D
+-
+
+**DAG**
+ Directed Acyclic Graph
+
+.. _derived pointer:
+.. _derived pointers:
+
+**Derived Pointer**
+ A pointer to the interior of an object, such that a garbage collector is
+ unable to use the pointer for reachability analysis. While a derived pointer
+ is live, the corresponding object pointer must be kept in a root, otherwise
+ the collector might free the referenced object. With copying collectors,
+ derived pointers pose an additional hazard that they may be invalidated at
+ any `safe point`_. This term is used in opposition to `object pointer`_.
+
+**DSA**
+ Data Structure Analysis
+
+**DSE**
+ Dead Store Elimination
+
+F
+-
+
+**FCA**
+ First Class Aggregate
+
+G
+-
+
+**GC**
+ Garbage Collection. The practice of using reachability analysis instead of
+ explicit memory management to reclaim unused memory.
+
+H
+-
+
+.. _heap:
+
+**Heap**
+ In garbage collection, the region of memory which is managed using
+ reachability analysis.
+
+I
+-
+
+**IPA**
+ Inter-Procedural Analysis. Refers to any variety of code analysis that
+ occurs between procedures, functions or compilation units (modules).
+
+**IPO**
+ Inter-Procedural Optimization. Refers to any variety of code optimization
+ that occurs between procedures, functions or compilation units (modules).
+
+**ISel**
+ Instruction Selection
+
+L
+-
+
+**LCSSA**
+ Loop-Closed Static Single Assignment Form
+
+**LICM**
+ Loop Invariant Code Motion
+
+**Load-VN**
+ Load Value Numbering
+
+**LTO**
+ Link-Time Optimization
+
+M
+-
+
+**MC**
+ Machine Code
+
+O
+-
+
+.. _object pointer:
+.. _object pointers:
+
+**Object Pointer**
+ A pointer to an object such that the garbage collector is able to trace
+ references contained within the object. This term is used in opposition to
+ `derived pointer`_.
+
+P
+-
+
+**PRE**
+ Partial Redundancy Elimination
+
+R
+-
+
+**RAUW**
+ Replace All Uses With. The functions ``User::replaceUsesOfWith()``,
+ ``Value::replaceAllUsesWith()``, and
+ ``Constant::replaceUsesOfWithOnConstant()`` implement the replacement of one
+ Value with another by iterating over its def/use chain and fixing up all of
+ the pointers to point to the new value. See
+ also `def/use chains <ProgrammersManual.html#iterate_chains>`_.
+
+**Reassociation**
+ Rearranging associative expressions to promote better redundancy elimination
+ and other optimization. For example, changing ``(A+B-A)`` into ``(B+A-A)``,
+ permitting it to be optimized into ``(B+0)`` then ``(B)``.
+
+.. _roots:
+.. _stack roots:
+
+**Root**
+ In garbage collection, a pointer variable lying outside of the `heap`_ from
+ which the collector begins its reachability analysis. In the context of code
+ generation, "root" almost always refers to a "stack root" --- a local or
+ temporary variable within an executing function.
+
+**RPO**
+ Reverse postorder
+
+S
+-
+
+.. _safe point:
+
+**Safe Point**
+ In garbage collection, it is necessary to identify `stack roots`_ so that
+ reachability analysis may proceed. It may be infeasible to provide this
+ information for every instruction, so instead the information is
+ calculated only at designated safe points. With a copying collector,
+ `derived pointers`_ must not be retained across safe points and `object
+ pointers`_ must be reloaded from stack roots.
+
+**SDISel**
+ Selection DAG Instruction Selection.
+
+**SCC**
+ Strongly Connected Component
+
+**SCCP**
+ Sparse Conditional Constant Propagation
+
+**SRoA**
+ Scalar Replacement of Aggregates
+
+**SSA**
+ Static Single Assignment
+
+**Stack Map**
+ In garbage collection, metadata emitted by the code generator which
+ identifies `roots`_ within the stack frame of an executing function.
diff --git a/docs/LinkTimeOptimization.html b/docs/LinkTimeOptimization.html
deleted file mode 100644
index 5652555..0000000
--- a/docs/LinkTimeOptimization.html
+++ /dev/null
@@ -1,401 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM Link Time Optimization: Design and Implementation</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-
-<h1>
- LLVM Link Time Optimization: Design and Implementation
-</h1>
-
-<ul>
- <li><a href="#desc">Description</a></li>
- <li><a href="#design">Design Philosophy</a>
- <ul>
- <li><a href="#example1">Example of link time optimization</a></li>
- <li><a href="#alternative_approaches">Alternative Approaches</a></li>
- </ul></li>
- <li><a href="#multiphase">Multi-phase communication between LLVM and linker</a>
- <ul>
- <li><a href="#phase1">Phase 1 : Read LLVM Bitcode Files</a></li>
- <li><a href="#phase2">Phase 2 : Symbol Resolution</a></li>
- <li><a href="#phase3">Phase 3 : Optimize Bitcode Files</a></li>
- <li><a href="#phase4">Phase 4 : Symbol Resolution after optimization</a></li>
- </ul></li>
- <li><a href="#lto">libLTO</a>
- <ul>
- <li><a href="#lto_module_t">lto_module_t</a></li>
- <li><a href="#lto_code_gen_t">lto_code_gen_t</a></li>
- </ul>
-</ul>
-
-<div class="doc_author">
-<p>Written by Devang Patel and Nick Kledzik</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
-<a name="desc">Description</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-<p>
-LLVM features powerful intermodular optimizations which can be used at link
-time. Link Time Optimization (LTO) is another name for intermodular optimization
-when performed during the link stage. This document describes the interface
-and design between the LTO optimizer and the linker.</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
-<a name="design">Design Philosophy</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-<p>
-The LLVM Link Time Optimizer provides complete transparency, while doing
-intermodular optimization, in the compiler tool chain. Its main goal is to let
-the developer take advantage of intermodular optimizations without making any
-significant changes to the developer's makefiles or build system. This is
-achieved through tight integration with the linker. In this model, the linker
-treates LLVM bitcode files like native object files and allows mixing and
-matching among them. The linker uses <a href="#lto">libLTO</a>, a shared
-object, to handle LLVM bitcode files. This tight integration between
-the linker and LLVM optimizer helps to do optimizations that are not possible
-in other models. The linker input allows the optimizer to avoid relying on
-conservative escape analysis.
-</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="example1">Example of link time optimization</a>
-</h3>
-
-<div>
- <p>The following example illustrates the advantages of LTO's integrated
- approach and clean interface. This example requires a system linker which
- supports LTO through the interface described in this document. Here,
- clang transparently invokes system linker. </p>
- <ul>
- <li> Input source file <tt>a.c</tt> is compiled into LLVM bitcode form.
- <li> Input source file <tt>main.c</tt> is compiled into native object code.
- </ul>
-<pre class="doc_code">
---- a.h ---
-extern int foo1(void);
-extern void foo2(void);
-extern void foo4(void);
-
---- a.c ---
-#include "a.h"
-
-static signed int i = 0;
-
-void foo2(void) {
- i = -1;
-}
-
-static int foo3() {
- foo4();
- return 10;
-}
-
-int foo1(void) {
- int data = 0;
-
- if (i &lt; 0)
- data = foo3();
-
- data = data + 42;
- return data;
-}
-
---- main.c ---
-#include &lt;stdio.h&gt;
-#include "a.h"
-
-void foo4(void) {
- printf("Hi\n");
-}
-
-int main() {
- return foo1();
-}
-
---- command lines ---
-$ clang -emit-llvm -c a.c -o a.o # &lt;-- a.o is LLVM bitcode file
-$ clang -c main.c -o main.o # &lt;-- main.o is native object file
-$ clang a.o main.o -o main # &lt;-- standard link command without any modifications
-</pre>
-
-<ul>
- <li>In this example, the linker recognizes that <tt>foo2()</tt> is an
- externally visible symbol defined in LLVM bitcode file. The linker
- completes its usual symbol resolution pass and finds that <tt>foo2()</tt>
- is not used anywhere. This information is used by the LLVM optimizer and
- it removes <tt>foo2()</tt>.</li>
- <li>As soon as <tt>foo2()</tt> is removed, the optimizer recognizes that condition
- <tt>i &lt; 0</tt> is always false, which means <tt>foo3()</tt> is never
- used. Hence, the optimizer also removes <tt>foo3()</tt>.</li>
- <li>And this in turn, enables linker to remove <tt>foo4()</tt>.</li>
-</ul>
-
-<p>This example illustrates the advantage of tight integration with the
- linker. Here, the optimizer can not remove <tt>foo3()</tt> without the
- linker's input.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="alternative_approaches">Alternative Approaches</a>
-</h3>
-
-<div>
- <dl>
- <dt><b>Compiler driver invokes link time optimizer separately.</b></dt>
- <dd>In this model the link time optimizer is not able to take advantage of
- information collected during the linker's normal symbol resolution phase.
- In the above example, the optimizer can not remove <tt>foo2()</tt> without
- the linker's input because it is externally visible. This in turn prohibits
- the optimizer from removing <tt>foo3()</tt>.</dd>
- <dt><b>Use separate tool to collect symbol information from all object
- files.</b></dt>
- <dd>In this model, a new, separate, tool or library replicates the linker's
- capability to collect information for link time optimization. Not only is
- this code duplication difficult to justify, but it also has several other
- disadvantages. For example, the linking semantics and the features
- provided by the linker on various platform are not unique. This means,
- this new tool needs to support all such features and platforms in one
- super tool or a separate tool per platform is required. This increases
- maintenance cost for link time optimizer significantly, which is not
- necessary. This approach also requires staying synchronized with linker
- developements on various platforms, which is not the main focus of the link
- time optimizer. Finally, this approach increases end user's build time due
- to the duplication of work done by this separate tool and the linker itself.
- </dd>
- </dl>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="multiphase">Multi-phase communication between libLTO and linker</a>
-</h2>
-
-<div>
- <p>The linker collects information about symbol defininitions and uses in
- various link objects which is more accurate than any information collected
- by other tools during typical build cycles. The linker collects this
- information by looking at the definitions and uses of symbols in native .o
- files and using symbol visibility information. The linker also uses
- user-supplied information, such as a list of exported symbols. LLVM
- optimizer collects control flow information, data flow information and knows
- much more about program structure from the optimizer's point of view.
- Our goal is to take advantage of tight integration between the linker and
- the optimizer by sharing this information during various linking phases.
-</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="phase1">Phase 1 : Read LLVM Bitcode Files</a>
-</h3>
-
-<div>
- <p>The linker first reads all object files in natural order and collects
- symbol information. This includes native object files as well as LLVM bitcode
- files. To minimize the cost to the linker in the case that all .o files
- are native object files, the linker only calls <tt>lto_module_create()</tt>
- when a supplied object file is found to not be a native object file. If
- <tt>lto_module_create()</tt> returns that the file is an LLVM bitcode file,
- the linker
- then iterates over the module using <tt>lto_module_get_symbol_name()</tt> and
- <tt>lto_module_get_symbol_attribute()</tt> to get all symbols defined and
- referenced.
- This information is added to the linker's global symbol table.
-</p>
- <p>The lto* functions are all implemented in a shared object libLTO. This
- allows the LLVM LTO code to be updated independently of the linker tool.
- On platforms that support it, the shared object is lazily loaded.
-</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="phase2">Phase 2 : Symbol Resolution</a>
-</h3>
-
-<div>
- <p>In this stage, the linker resolves symbols using global symbol table.
- It may report undefined symbol errors, read archive members, replace
- weak symbols, etc. The linker is able to do this seamlessly even though it
- does not know the exact content of input LLVM bitcode files. If dead code
- stripping is enabled then the linker collects the list of live symbols.
- </p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="phase3">Phase 3 : Optimize Bitcode Files</a>
-</h3>
-<div>
- <p>After symbol resolution, the linker tells the LTO shared object which
- symbols are needed by native object files. In the example above, the linker
- reports that only <tt>foo1()</tt> is used by native object files using
- <tt>lto_codegen_add_must_preserve_symbol()</tt>. Next the linker invokes
- the LLVM optimizer and code generators using <tt>lto_codegen_compile()</tt>
- which returns a native object file creating by merging the LLVM bitcode files
- and applying various optimization passes.
-</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="phase4">Phase 4 : Symbol Resolution after optimization</a>
-</h3>
-
-<div>
- <p>In this phase, the linker reads optimized a native object file and
- updates the internal global symbol table to reflect any changes. The linker
- also collects information about any changes in use of external symbols by
- LLVM bitcode files. In the example above, the linker notes that
- <tt>foo4()</tt> is not used any more. If dead code stripping is enabled then
- the linker refreshes the live symbol information appropriately and performs
- dead code stripping.</p>
- <p>After this phase, the linker continues linking as if it never saw LLVM
- bitcode files.</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
-<a name="lto">libLTO</a>
-</h2>
-
-<div>
- <p><tt>libLTO</tt> is a shared object that is part of the LLVM tools, and
- is intended for use by a linker. <tt>libLTO</tt> provides an abstract C
- interface to use the LLVM interprocedural optimizer without exposing details
- of LLVM's internals. The intention is to keep the interface as stable as
- possible even when the LLVM optimizer continues to evolve. It should even
- be possible for a completely different compilation technology to provide
- a different libLTO that works with their object files and the standard
- linker tool.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="lto_module_t">lto_module_t</a>
-</h3>
-
-<div>
-
-<p>A non-native object file is handled via an <tt>lto_module_t</tt>.
-The following functions allow the linker to check if a file (on disk
-or in a memory buffer) is a file which libLTO can process:</p>
-
-<pre class="doc_code">
-lto_module_is_object_file(const char*)
-lto_module_is_object_file_for_target(const char*, const char*)
-lto_module_is_object_file_in_memory(const void*, size_t)
-lto_module_is_object_file_in_memory_for_target(const void*, size_t, const char*)
-</pre>
-
-<p>If the object file can be processed by libLTO, the linker creates a
-<tt>lto_module_t</tt> by using one of</p>
-
-<pre class="doc_code">
-lto_module_create(const char*)
-lto_module_create_from_memory(const void*, size_t)
-</pre>
-
-<p>and when done, the handle is released via</p>
-
-<pre class="doc_code">
-lto_module_dispose(lto_module_t)
-</pre>
-
-<p>The linker can introspect the non-native object file by getting the number of
-symbols and getting the name and attributes of each symbol via:</p>
-
-<pre class="doc_code">
-lto_module_get_num_symbols(lto_module_t)
-lto_module_get_symbol_name(lto_module_t, unsigned int)
-lto_module_get_symbol_attribute(lto_module_t, unsigned int)
-</pre>
-
-<p>The attributes of a symbol include the alignment, visibility, and kind.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="lto_code_gen_t">lto_code_gen_t</a>
-</h3>
-
-<div>
-
-<p>Once the linker has loaded each non-native object files into an
-<tt>lto_module_t</tt>, it can request libLTO to process them all and
-generate a native object file. This is done in a couple of steps.
-First, a code generator is created with:</p>
-
-<pre class="doc_code">lto_codegen_create()</pre>
-
-<p>Then, each non-native object file is added to the code generator with:</p>
-
-<pre class="doc_code">
-lto_codegen_add_module(lto_code_gen_t, lto_module_t)
-</pre>
-
-<p>The linker then has the option of setting some codegen options. Whether or
-not to generate DWARF debug info is set with:</p>
-
-<pre class="doc_code">lto_codegen_set_debug_model(lto_code_gen_t)</pre>
-
-<p>Which kind of position independence is set with:</p>
-
-<pre class="doc_code">lto_codegen_set_pic_model(lto_code_gen_t) </pre>
-
-<p>And each symbol that is referenced by a native object file or otherwise must
-not be optimized away is set with:</p>
-
-<pre class="doc_code">
-lto_codegen_add_must_preserve_symbol(lto_code_gen_t, const char*)
-</pre>
-
-<p>After all these settings are done, the linker requests that a native object
-file be created from the modules with the settings using:</p>
-
-<pre class="doc_code">lto_codegen_compile(lto_code_gen_t, size*)</pre>
-
-<p>which returns a pointer to a buffer containing the generated native
-object file. The linker then parses that and links it with the rest
-of the native object files.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- Devang Patel and Nick Kledzik<br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-31 12:21:59 +0100 (Mon, 31 Oct 2011) $
-</address>
-
-</body>
-</html>
-
diff --git a/docs/LinkTimeOptimization.rst b/docs/LinkTimeOptimization.rst
new file mode 100644
index 0000000..53d673e
--- /dev/null
+++ b/docs/LinkTimeOptimization.rst
@@ -0,0 +1,298 @@
+.. _lto:
+
+======================================================
+LLVM Link Time Optimization: Design and Implementation
+======================================================
+
+.. contents::
+ :local:
+
+Description
+===========
+
+LLVM features powerful intermodular optimizations which can be used at link
+time. Link Time Optimization (LTO) is another name for intermodular
+optimization when performed during the link stage. This document describes the
+interface and design between the LTO optimizer and the linker.
+
+Design Philosophy
+=================
+
+The LLVM Link Time Optimizer provides complete transparency, while doing
+intermodular optimization, in the compiler tool chain. Its main goal is to let
+the developer take advantage of intermodular optimizations without making any
+significant changes to the developer's makefiles or build system. This is
+achieved through tight integration with the linker. In this model, the linker
+treats LLVM bitcode files like native object files and allows mixing and
+matching among them. The linker uses `libLTO`_, a shared object, to handle LLVM
+bitcode files. This tight integration between the linker and LLVM optimizer
+helps to do optimizations that are not possible in other models. The linker
+input allows the optimizer to avoid relying on conservative escape analysis.
+
+Example of link time optimization
+---------------------------------
+
+The following example illustrates the advantages of LTO's integrated approach
+and clean interface. This example requires a system linker which supports LTO
+through the interface described in this document. Here, clang transparently
+invokes the system linker.
+
+* Input source file ``a.c`` is compiled into LLVM bitcode form.
+* Input source file ``main.c`` is compiled into native object code.
+
+.. code-block:: c++
+
+ --- a.h ---
+ extern int foo1(void);
+ extern void foo2(void);
+ extern void foo4(void);
+
+ --- a.c ---
+ #include "a.h"
+
+ static signed int i = 0;
+
+ void foo2(void) {
+ i = -1;
+ }
+
+ static int foo3() {
+ foo4();
+ return 10;
+ }
+
+ int foo1(void) {
+ int data = 0;
+
+ if (i < 0)
+ data = foo3();
+
+ data = data + 42;
+ return data;
+ }
+
+ --- main.c ---
+ #include <stdio.h>
+ #include "a.h"
+
+ void foo4(void) {
+ printf("Hi\n");
+ }
+
+ int main() {
+ return foo1();
+ }
+
+.. code-block:: bash
+
+ --- command lines ---
+ % clang -emit-llvm -c a.c -o a.o # <-- a.o is LLVM bitcode file
+ % clang -c main.c -o main.o # <-- main.o is native object file
+ % clang a.o main.o -o main # <-- standard link command without modifications
+
+* In this example, the linker recognizes that ``foo2()`` is an externally
+ visible symbol defined in an LLVM bitcode file. The linker completes its usual
+ symbol resolution pass and finds that ``foo2()`` is not used
+ anywhere. This information is used by the LLVM optimizer and it
+ removes ``foo2()``.
+
+* As soon as ``foo2()`` is removed, the optimizer recognizes that condition ``i
+ < 0`` is always false, which means ``foo3()`` is never used. Hence, the
+ optimizer also removes ``foo3()``.
+
+* This, in turn, enables the linker to remove ``foo4()``.
+
+This example illustrates the advantage of tight integration with the
+linker. Here, the optimizer cannot remove ``foo3()`` without the linker's
+input.
+
+Alternative Approaches
+----------------------
+
+**Compiler driver invokes link time optimizer separately.**
+ In this model the link time optimizer is not able to take advantage of
+ information collected during the linker's normal symbol resolution phase.
+ In the above example, the optimizer cannot remove ``foo2()`` without the
+ linker's input because it is externally visible. This in turn prohibits the
+ optimizer from removing ``foo3()``.
+
+**Use separate tool to collect symbol information from all object files.**
+ In this model, a new, separate tool or library replicates the linker's
+ capability to collect information for link time optimization. Not only is
+ this code duplication difficult to justify, but it also has several other
+ disadvantages. For example, the linking semantics and the features provided
+ by the linker on various platforms are not unique. This means the new tool
+ needs to support all such features and platforms in one super tool, or a
+ separate tool per platform is required. This significantly increases the
+ link time optimizer's maintenance cost, which is not necessary. This approach
+ also requires staying synchronized with linker developments on various
+ platforms, which is not the main focus of the link time optimizer. Finally,
+ this approach increases the end user's build time due to the duplication of
+ work done by this separate tool and the linker itself.
+
+Multi-phase communication between ``libLTO`` and linker
+=======================================================
+
+The linker collects information about symbol definitions and uses in various
+link objects which is more accurate than any information collected by other
+tools during typical build cycles. The linker collects this information by
+looking at the definitions and uses of symbols in native .o files and using
+symbol visibility information. The linker also uses user-supplied information,
+such as a list of exported symbols. The LLVM optimizer collects control flow
+information, data flow information and knows much more about program structure
+from the optimizer's point of view. Our goal is to take advantage of tight
+integration between the linker and the optimizer by sharing this information
+during various linking phases.
+
+Phase 1 : Read LLVM Bitcode Files
+---------------------------------
+
+The linker first reads all object files in natural order and collects symbol
+information. This includes native object files as well as LLVM bitcode files.
+To minimize the cost to the linker in the case that all .o files are native
+object files, the linker only calls ``lto_module_create()`` when a supplied
+object file is found to not be a native object file. If ``lto_module_create()``
+returns that the file is an LLVM bitcode file, the linker then iterates over the
+module using ``lto_module_get_symbol_name()`` and
+``lto_module_get_symbol_attribute()`` to get all symbols defined and referenced.
+This information is added to the linker's global symbol table.
+
+The ``lto_*`` functions are all implemented in a shared object, ``libLTO``.
+This allows
+the LLVM LTO code to be updated independently of the linker tool. On platforms
+that support it, the shared object is lazily loaded.
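+
+A minimal sketch of this phase from the linker's side (error handling is
+omitted, and ``add_to_symbol_table()`` is a hypothetical linker routine, not
+part of ``libLTO``):
+
+.. code-block:: c
+
+ #include "llvm-c/lto.h"
+
+ /* Hypothetical linker routine; not part of libLTO. */
+ extern void add_to_symbol_table(const char *name, lto_symbol_attributes attrs);
+
+ static void read_file(const char *path) {
+   if (!lto_module_is_object_file(path))
+     return; /* a native object file; the linker handles it directly */
+   lto_module_t m = lto_module_create(path);
+   unsigned n = lto_module_get_num_symbols(m);
+   for (unsigned i = 0; i < n; ++i)
+     add_to_symbol_table(lto_module_get_symbol_name(m, i),
+                         lto_module_get_symbol_attribute(m, i));
+ }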
+
+Phase 2 : Symbol Resolution
+---------------------------
+
+In this stage, the linker resolves symbols using the global symbol table. It may
+report undefined symbol errors, read archive members, replace weak symbols, etc.
+The linker is able to do this seamlessly even though it does not know the exact
+content of input LLVM bitcode files. If dead code stripping is enabled then the
+linker collects the list of live symbols.
+
+Phase 3 : Optimize Bitcode Files
+--------------------------------
+
+After symbol resolution, the linker tells the LTO shared object which symbols
+are needed by native object files. In the example above, the linker reports
+that only ``foo1()`` is used by native object files using
+``lto_codegen_add_must_preserve_symbol()``. Next the linker invokes the LLVM
+optimizer and code generators using ``lto_codegen_compile()`` which returns a
+native object file created by merging the LLVM bitcode files and applying
+various optimization passes.
+
+Phase 4 : Symbol Resolution after optimization
+----------------------------------------------
+
+In this phase, the linker reads the optimized native object file and updates the
+internal global symbol table to reflect any changes. The linker also collects
+information about any changes in use of external symbols by LLVM bitcode
+files. In the example above, the linker notes that ``foo4()`` is not used any
+more. If dead code stripping is enabled then the linker refreshes the live
+symbol information appropriately and performs dead code stripping.
+
+After this phase, the linker continues linking as if it never saw LLVM bitcode
+files.
+
+.. _libLTO:
+
+``libLTO``
+==========
+
+``libLTO`` is a shared object that is part of the LLVM tools, and is intended
+for use by a linker. ``libLTO`` provides an abstract C interface to use the LLVM
+interprocedural optimizer without exposing details of LLVM's internals. The
+intention is to keep the interface as stable as possible even when the LLVM
+optimizer continues to evolve. It should even be possible for a completely
+different compilation technology to provide a different libLTO that works with
+their object files and the standard linker tool.
+
+``lto_module_t``
+----------------
+
+A non-native object file is handled via an ``lto_module_t``. The following
+functions allow the linker to check if a file (on disk or in a memory buffer) is
+a file which libLTO can process:
+
+.. code-block:: c
+
+ lto_module_is_object_file(const char*)
+ lto_module_is_object_file_for_target(const char*, const char*)
+ lto_module_is_object_file_in_memory(const void*, size_t)
+ lto_module_is_object_file_in_memory_for_target(const void*, size_t, const char*)
+
+If the object file can be processed by ``libLTO``, the linker creates a
+``lto_module_t`` by using one of:
+
+.. code-block:: c
+
+ lto_module_create(const char*)
+ lto_module_create_from_memory(const void*, size_t)
+
+and when done, the handle is released via
+
+.. code-block:: c
+
+ lto_module_dispose(lto_module_t)
+
+The linker can introspect the non-native object file by getting the number of
+symbols and getting the name and attributes of each symbol via:
+
+.. code-block:: c
+
+ lto_module_get_num_symbols(lto_module_t)
+ lto_module_get_symbol_name(lto_module_t, unsigned int)
+ lto_module_get_symbol_attribute(lto_module_t, unsigned int)
+
+The attributes of a symbol include the alignment, visibility, and kind.
+
+``lto_code_gen_t``
+------------------
+
+Once the linker has loaded each non-native object file into an
+``lto_module_t``, it can request ``libLTO`` to process them all and generate a
+native object file. This is done in a couple of steps. First, a code generator
+is created with:
+
+.. code-block:: c
+
+ lto_codegen_create()
+
+Then, each non-native object file is added to the code generator with:
+
+.. code-block:: c
+
+ lto_codegen_add_module(lto_code_gen_t, lto_module_t)
+
+The linker then has the option of setting some codegen options. Whether or not
+to generate DWARF debug info is set with:
+
+.. code-block:: c
+
+ lto_codegen_set_debug_model(lto_code_gen_t)
+
+The kind of position independence to use is set with:
+
+.. code-block:: c
+
+ lto_codegen_set_pic_model(lto_code_gen_t)
+
+Each symbol that is referenced by a native object file, or that otherwise must
+not be optimized away, is registered with:
+
+.. code-block:: c
+
+ lto_codegen_add_must_preserve_symbol(lto_code_gen_t, const char*)
+
+After all these settings are done, the linker requests that a native object file
+be created from the modules with the settings using:
+
+.. code-block:: c
+
+ lto_codegen_compile(lto_code_gen_t, size_t*)
+
+which returns a pointer to a buffer containing the generated native object file.
+The linker then parses that and links it with the rest of the native object
+files.
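+
+Putting these calls together, a minimal sketch of the codegen phase
+(``modules`` and ``nmodules`` are assumed to hold the handles collected during
+phase 1, and the preserved symbol name is taken from the example above):
+
+.. code-block:: c
+
+ #include "llvm-c/lto.h"
+
+ extern lto_module_t modules[]; /* collected in phase 1 (assumed) */
+ extern unsigned nmodules;
+
+ const void *optimize_and_compile(size_t *len) {
+   lto_code_gen_t cg = lto_codegen_create();
+   for (unsigned i = 0; i < nmodules; ++i)
+     lto_codegen_add_module(cg, modules[i]);
+   /* symbols still referenced by native objects must survive LTO */
+   lto_codegen_add_must_preserve_symbol(cg, "foo1");
+   /* the returned buffer is owned by cg; it stays valid until cg is disposed */
+   return lto_codegen_compile(cg, len);
+ }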
diff --git a/docs/Makefile b/docs/Makefile
index 389fd90..122c4b8 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -8,7 +8,7 @@
##===----------------------------------------------------------------------===##
LEVEL := ..
-DIRS := CommandGuide tutorial
+DIRS :=
ifdef BUILD_FOR_WEBSITE
PROJ_OBJ_DIR = .
@@ -26,10 +26,9 @@ include $(LEVEL)/Makefile.common
HTML := $(wildcard $(PROJ_SRC_DIR)/*.html) \
$(wildcard $(PROJ_SRC_DIR)/*.css)
-IMAGES := $(wildcard $(PROJ_SRC_DIR)/img/*.*)
DOXYFILES := doxygen.cfg.in doxygen.css doxygen.footer doxygen.header \
doxygen.intro
-EXTRA_DIST := $(HTML) $(DOXYFILES) llvm.css CommandGuide img
+EXTRA_DIST := $(HTML) $(DOXYFILES) llvm.css CommandGuide
.PHONY: install-html install-doxygen doxygen install-ocamldoc ocamldoc generated
@@ -56,9 +55,7 @@ generated:: $(generated_targets)
install-html: $(PROJ_OBJ_DIR)/html.tar.gz
$(Echo) Installing HTML documentation
$(Verb) $(MKDIR) $(DESTDIR)$(PROJ_docsdir)/html
- $(Verb) $(MKDIR) $(DESTDIR)$(PROJ_docsdir)/html/img
$(Verb) $(DataInstall) $(HTML) $(DESTDIR)$(PROJ_docsdir)/html
- $(Verb) $(DataInstall) $(IMAGES) $(DESTDIR)$(PROJ_docsdir)/html/img
$(Verb) $(DataInstall) $(PROJ_OBJ_DIR)/html.tar.gz $(DESTDIR)$(PROJ_docsdir)
$(PROJ_OBJ_DIR)/html.tar.gz: $(HTML)
diff --git a/docs/Makefile.sphinx b/docs/Makefile.sphinx
new file mode 100644
index 0000000..21f6648
--- /dev/null
+++ b/docs/Makefile.sphinx
@@ -0,0 +1,155 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+all: html
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ -rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/llvm.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/llvm.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/llvm"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/llvm"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/docs/MakefileGuide.html b/docs/MakefileGuide.html
deleted file mode 100644
index 1e7c3469..0000000
--- a/docs/MakefileGuide.html
+++ /dev/null
@@ -1,1039 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM Makefile Guide</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>LLVM Makefile Guide</h1>
-
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#general">General Concepts</a>
- <ol>
- <li><a href="#projects">Projects</a></li>
- <li><a href="#varvals">Variable Values</a></li>
- <li><a href="#including">Including Makefiles</a>
- <ol>
- <li><a href="#Makefile">Makefile</a></li>
- <li><a href="#Makefile.common">Makefile.common</a></li>
- <li><a href="#Makefile.config">Makefile.config</a></li>
- <li><a href="#Makefile.rules">Makefile.rules</a></li>
- </ol>
- </li>
- <li><a href="#Comments">Comments</a></li>
- </ol>
- </li>
- <li><a href="#tutorial">Tutorial</a>
- <ol>
- <li><a href="#libraries">Libraries</a>
- <ol>
- <li><a href="#BCModules">Bitcode Modules</a></li>
- <li><a href="#LoadableModules">Loadable Modules</a></li>
- </ol>
- </li>
- <li><a href="#tools">Tools</a>
- <ol>
- <li><a href="#JIT">JIT Tools</a></li>
- </ol>
- </li>
- <li><a href="#projects">Projects</a></li>
- </ol>
- </li>
- <li><a href="#targets">Targets Supported</a>
- <ol>
- <li><a href="#all">all</a></li>
- <li><a href="#all-local">all-local</a></li>
- <li><a href="#check">check</a></li>
- <li><a href="#check-local">check-local</a></li>
- <li><a href="#clean">clean</a></li>
- <li><a href="#clean-local">clean-local</a></li>
- <li><a href="#dist">dist</a></li>
- <li><a href="#dist-check">dist-check</a></li>
- <li><a href="#dist-clean">dist-clean</a></li>
- <li><a href="#install">install</a></li>
- <li><a href="#preconditions">preconditions</a></li>
- <li><a href="#printvars">printvars</a></li>
- <li><a href="#reconfigure">reconfigure</a></li>
- <li><a href="#spotless">spotless</a></li>
- <li><a href="#tags">tags</a></li>
- <li><a href="#uninstall">uninstall</a></li>
- </ol>
- </li>
- <li><a href="#variables">Using Variables</a>
- <ol>
- <li><a href="#setvars">Control Variables</a></li>
- <li><a href="#overvars">Override Variables</a></li>
- <li><a href="#getvars">Readable Variables</a></li>
- <li><a href="#intvars">Internal Variables</a></li>
- </ol>
- </li>
-</ol>
-
-<div class="doc_author">
- <p>Written by <a href="mailto:reid@x10sys.com">Reid Spencer</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="introduction">Introduction</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
- <p>This document provides <em>usage</em> information about the LLVM makefile
- system. While loosely patterned after the BSD makefile system, LLVM has taken
- a departure from BSD in order to implement additional features needed by LLVM.
- Although makefile systems such as automake were attempted at one point, it
-  has become clear that the gap between the features needed by LLVM and the
-  Makefile norm is too great for a more limited tool. Consequently, LLVM
-  requires only GNU Make 3.79, a widely portable makefile processor. LLVM
-  unabashedly makes heavy use of the features of GNU Make, so the dependency
-  on GNU Make is firm. If
- you're not familiar with <tt>make</tt>, it is recommended that you read the
- <a href="http://www.gnu.org/software/make/manual/make.html">GNU Makefile
- Manual</a>.</p>
- <p>While this document is rightly part of the
- <a href="ProgrammersManual.html">LLVM Programmer's Manual</a>, it is treated
- separately here because of the volume of content and because it is often an
- early source of bewilderment for new developers.</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="general">General Concepts</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
- <p>The LLVM Makefile System is the component of LLVM that is responsible for
- building the software, testing it, generating distributions, checking those
-  distributions, installing and uninstalling, etc. It consists of several
- files throughout the source tree. These files and other general concepts are
- described in this section.</p>
-
-<!-- ======================================================================= -->
-<h3><a name="projects">Projects</a></h3>
-<div>
- <p>The LLVM Makefile System is quite generous. It not only builds its own
- software, but it can build yours too. Built into the system is knowledge of
- the <tt>llvm/projects</tt> directory. Any directory under <tt>projects</tt>
- that has both a <tt>configure</tt> script and a <tt>Makefile</tt> is assumed
- to be a project that uses the LLVM Makefile system. Building software that
- uses LLVM does not require the LLVM Makefile System nor even placement in the
- <tt>llvm/projects</tt> directory. However, doing so will allow your project
- to get up and running quickly by utilizing the built-in features that are used
- to compile LLVM. LLVM compiles itself using the same features of the makefile
- system as used for projects.</p>
-  <p>For complete details on setting up your project's configuration, simply
-  mimic the <tt>llvm/projects/sample</tt> project or, for further details,
- consult the <a href="Projects.html">Projects.html</a> page.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="varvals">Variable Values</a></h3>
-<div>
- <p>To use the makefile system, you simply create a file named
- <tt>Makefile</tt> in your directory and declare values for certain variables.
- The variables and values that you select determine what the makefile system
- will do. These variables enable rules and processing in the makefile system
-  that automatically Do The Right Thing&trade;.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="including">Including Makefiles</a></h3>
-<div>
- <p>Setting variables alone is not enough. You must include into your Makefile
- additional files that provide the rules of the LLVM Makefile system. The
- various files involved are described in the sections that follow.</p>
-
-<!-- ======================================================================= -->
-<h4><a name="Makefile">Makefile</a></h4>
-<div>
- <p>Each directory to participate in the build needs to have a file named
- <tt>Makefile</tt>. This is the file first read by <tt>make</tt>. It has three
- sections:</p>
- <ol>
-    <li><a href="#setvars">Settable Variables</a> - Required variables that
-    must be set first.</li>
- <li><a href="#Makefile.common">include <tt>$(LEVEL)/Makefile.common</tt></a>
- - include the LLVM Makefile system.
- <li><a href="#overvars">Override Variables</a> - Override variables set by
- the LLVM Makefile system.
- </ol>
-</div>
-
-<!-- ======================================================================= -->
-<h4><a name="Makefile.common">Makefile.common</a></h4>
-<div>
- <p>Every project must have a <tt>Makefile.common</tt> file at its top source
- directory. This file serves three purposes:</p>
- <ol>
- <li>It includes the project's configuration makefile to obtain values
- determined by the <tt>configure</tt> script. This is done by including the
- <a href="#Makefile.config"><tt>$(LEVEL)/Makefile.config</tt></a> file.</li>
- <li>It specifies any other (static) values that are needed throughout the
- project. Only values that are used in all or a large proportion of the
- project's directories should be placed here.</li>
- <li>It includes the standard rules for the LLVM Makefile system,
- <a href="#Makefile.rules"><tt>$(LLVM_SRC_ROOT)/Makefile.rules</tt></a>.
- This file is the "guts" of the LLVM Makefile system.</li>
- </ol>
-</div>
-
-<!-- ======================================================================= -->
-<h4><a name="Makefile.config">Makefile.config</a></h4>
-<div>
- <p>Every project must have a <tt>Makefile.config</tt> at the top of its
- <em>build</em> directory. This file is <b>generated</b> by the
- <tt>configure</tt> script from the pattern provided by the
- <tt>Makefile.config.in</tt> file located at the top of the project's
- <em>source</em> directory. The contents of this file depend largely on what
-  configuration items the project uses; however, most projects can get what they
-  need by just relying on LLVM's configuration found in
-  <tt>$(LLVM_OBJ_ROOT)/Makefile.config</tt>.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h4><a name="Makefile.rules">Makefile.rules</a></h4>
-<div>
- <p>This file, located at <tt>$(LLVM_SRC_ROOT)/Makefile.rules</tt> is the heart
- of the LLVM Makefile System. It provides all the logic, dependencies, and
- rules for building the targets supported by the system. What it does largely
- depends on the values of <tt>make</tt> <a href="#variables">variables</a> that
-  have been set <em>before</em> <tt>Makefile.rules</tt> is included.</p>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="Comments">Comments</a></h3>
-<div>
- <p>User Makefiles need not have comments in them unless the construction is
- unusual or it does not strictly follow the rules and patterns of the LLVM
- makefile system. Makefile comments are invoked with the pound (#) character.
- The # character and any text following it, to the end of the line, are ignored
- by <tt>make</tt>.</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="tutorial">Tutorial</a></h2>
-<!-- *********************************************************************** -->
-<div>
- <p>This section provides some examples of the different kinds of modules you
- can build with the LLVM makefile system. In general, each directory you
- provide will build a single object although that object may be composed of
- additionally compiled components.</p>
-
-<!-- ======================================================================= -->
-<h3><a name="libraries">Libraries</a></h3>
-<div>
- <p>Only a few variable definitions are needed to build a regular library.
- Normally, the makefile system will build all the software into a single
- <tt>libname.o</tt> (pre-linked) object. This means the library is not
- searchable and that the distinction between compilation units has been
- dissolved. Optionally, you can ask for a shared library (.so) or archive
-  library (.a) to be built. Archive libraries are the default. For example:</p>
- <pre><tt>
- LIBRARYNAME = mylib
- SHARED_LIBRARY = 1
- ARCHIVE_LIBRARY = 1
- </tt></pre>
- <p>says to build a library named "mylib" with both a shared library
- (<tt>mylib.so</tt>) and an archive library (<tt>mylib.a</tt>) version. The
- contents of all the
-  libraries produced will be the same; they are just constructed differently.
- Note that you normally do not need to specify the sources involved. The LLVM
- Makefile system will infer the source files from the contents of the source
- directory.</p>
- <p>The <tt>LOADABLE_MODULE=1</tt> directive can be used in conjunction with
- <tt>SHARED_LIBRARY=1</tt> to indicate that the resulting shared library should
- be openable with the <tt>dlopen</tt> function and searchable with the
- <tt>dlsym</tt> function (or your operating system's equivalents). While this
- isn't strictly necessary on Linux and a few other platforms, it is required
- on systems like HP-UX and Darwin. You should use <tt>LOADABLE_MODULE</tt> for
-  any shared library that you intend to be loaded into a tool via the
- <tt>-load</tt> option. See the
- <a href="WritingAnLLVMPass.html#makefile">WritingAnLLVMPass.html</a> document
-  for an example of why you might want to do this.</p>
-
-<!-- ======================================================================= -->
-<h4><a name="BCModules">Bitcode Modules</a></h4>
-<div>
- <p>In some situations, it is desirable to build a single bitcode module from
- a variety of sources, instead of an archive, shared library, or bitcode
- library. Bitcode modules can be specified in addition to any of the other
- types of libraries by defining the <a href="#MODULE_NAME">MODULE_NAME</a>
- variable. For example:</p>
- <pre><tt>
- LIBRARYNAME = mylib
- BYTECODE_LIBRARY = 1
- MODULE_NAME = mymod
- </tt></pre>
- <p>will build a module named <tt>mymod.bc</tt> from the sources in the
- directory. This module will be an aggregation of all the bitcode modules
- derived from the sources. The example will also build a bitcode archive
- containing a bitcode module for each compiled source file. The difference is
- subtle, but important depending on how the module or library is to be linked.
- </p>
-</div>
-
-<!-- ======================================================================= -->
-<h4>
- <a name="LoadableModules">Loadable Modules</a>
-</h4>
-<div>
- <p>In some situations, you need to create a loadable module. Loadable modules
- can be loaded into programs like <tt>opt</tt> or <tt>llc</tt> to specify
- additional passes to run or targets to support. Loadable modules are also
- useful for debugging a pass or providing a pass with another package if that
- pass can't be included in LLVM.</p>
- <p>LLVM provides complete support for building such a module. All you need to
- do is use the LOADABLE_MODULE variable in your Makefile. For example, to
- build a loadable module named <tt>MyMod</tt> that uses the LLVM libraries
- <tt>LLVMSupport.a</tt> and <tt>LLVMSystem.a</tt>, you would specify:</p>
- <pre><tt>
- LIBRARYNAME := MyMod
- LOADABLE_MODULE := 1
- LINK_COMPONENTS := support system
- </tt></pre>
- <p>Use of the <tt>LOADABLE_MODULE</tt> facility implies several things:</p>
- <ol>
- <li>There will be no "lib" prefix on the module. This differentiates it from
- a standard shared library of the same name.</li>
- <li>The <a href="#SHARED_LIBRARY">SHARED_LIBRARY</a> variable is turned
- on.</li>
- <li>The <a href="#LINK_LIBS_IN_SHARED">LINK_LIBS_IN_SHARED</a> variable
- is turned on.</li>
- </ol>
- <p>A loadable module is loaded by LLVM via the facilities of libtool's libltdl
-  library, which is part of the <tt>lib/System</tt> implementation.</p>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="tools">Tools</a></h3>
-<div>
- <p>For building executable programs (tools), you must provide the name of the
- tool and the names of the libraries you wish to link with the tool. For
- example:</p>
- <pre><tt>
- TOOLNAME = mytool
- USEDLIBS = mylib
- LINK_COMPONENTS = support system
- </tt></pre>
-  <p>says that we are to build a tool named <tt>mytool</tt> and that it requires
- three libraries: <tt>mylib</tt>, <tt>LLVMSupport.a</tt> and
- <tt>LLVMSystem.a</tt>.</p>
-  <p>Note that two different variables are used to indicate which libraries are
- linked: <tt>USEDLIBS</tt> and <tt>LLVMLIBS</tt>. This distinction is necessary
- to support projects. <tt>LLVMLIBS</tt> refers to the LLVM libraries found in
- the LLVM object directory. <tt>USEDLIBS</tt> refers to the libraries built by
- your project. In the case of building LLVM tools, <tt>USEDLIBS</tt> and
- <tt>LLVMLIBS</tt> can be used interchangeably since the "project" is LLVM
- itself and <tt>USEDLIBS</tt> refers to the same place as <tt>LLVMLIBS</tt>.
- </p>
- <p>Also note that there are two different ways of specifying a library: with a
- <tt>.a</tt> suffix and without. Without the suffix, the entry refers to the
- re-linked (.o) file which will include <em>all</em> symbols of the library.
- This is useful, for example, to include all passes from a library of passes.
- If the <tt>.a</tt> suffix is used then the library is linked as a searchable
- library (with the <tt>-l</tt> option). In this case, only the symbols that are
- unresolved <em>at that point</em> will be resolved from the library, if they
- exist. Other (unreferenced) symbols will not be included when the <tt>.a</tt>
- syntax is used. Note that in order to use the <tt>.a</tt> suffix, the library
- in question must have been built with the <tt>ARCHIVE_LIBRARY</tt> option set.
- </p>
-
-<!-- ======================================================================= -->
-<h4><a name="JIT">JIT Tools</a></h4>
-<div>
- <p>Many tools will want to use the JIT features of LLVM. To do this, you
- simply specify that you want an execution 'engine', and the makefiles will
- automatically link in the appropriate JIT for the host or an interpreter
- if none is available:</p>
- <pre><tt>
- TOOLNAME = my_jit_tool
- USEDLIBS = mylib
- LINK_COMPONENTS = engine
- </tt></pre>
- <p>Of course, any additional libraries may be listed as other components. To
- get a full understanding of how this changes the linker command, it is
- recommended that you:</p>
- <pre><tt>
- cd examples/Fibonacci
- make VERBOSE=1
- </tt></pre>
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="targets">Targets Supported</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
- <p>This section describes each of the targets that can be built using the LLVM
- Makefile system. Any target can be invoked from any directory but not all are
- applicable to a given directory (e.g. "check", "dist" and "install" will
- always operate as if invoked from the top level directory).</p>
-
- <table style="text-align:left">
- <tr>
- <th>Target Name</th><th>Implied Targets</th><th>Target Description</th>
- </tr>
- <tr><td><a href="#all"><tt>all</tt></a></td><td></td>
- <td>Compile the software recursively. Default target.
- </td></tr>
- <tr><td><a href="#all-local"><tt>all-local</tt></a></td><td></td>
- <td>Compile the software in the local directory only.
- </td></tr>
- <tr><td><a href="#check"><tt>check</tt></a></td><td></td>
- <td>Change to the <tt>test</tt> directory in a project and run the
- test suite there.
- </td></tr>
- <tr><td><a href="#check-local"><tt>check-local</tt></a></td><td></td>
- <td>Run a local test suite. Generally this is only defined in the
- <tt>Makefile</tt> of the project's <tt>test</tt> directory.
- </td></tr>
- <tr><td><a href="#clean"><tt>clean</tt></a></td><td></td>
- <td>Remove built objects recursively.
- </td></tr>
- <tr><td><a href="#clean-local"><tt>clean-local</tt></a></td><td></td>
- <td>Remove built objects from the local directory only.
- </td></tr>
- <tr><td><a href="#dist"><tt>dist</tt></a></td><td>all</td>
- <td>Prepare a source distribution tarball.
- </td></tr>
- <tr><td><a href="#dist-check"><tt>dist-check</tt></a></td><td>all</td>
- <td>Prepare a source distribution tarball and check that it builds.
- </td></tr>
- <tr><td><a href="#dist-clean"><tt>dist-clean</tt></a></td><td>clean</td>
- <td>Clean source distribution tarball temporary files.
- </td></tr>
- <tr><td><a href="#install"><tt>install</tt></a></td><td>all</td>
- <td>Copy built objects to installation directory.
- </td></tr>
- <tr><td><a href="#preconditions"><tt>preconditions</tt></a></td><td>all</td>
- <td>Check to make sure configuration and makefiles are up to date.
- </td></tr>
- <tr><td><a href="#printvars"><tt>printvars</tt></a></td><td>all</td>
- <td>Prints variables defined by the makefile system (for debugging).
- </td></tr>
- <tr><td><a href="#tags"><tt>tags</tt></a></td><td></td>
- <td>Make C and C++ tags files for emacs and vi.
- </td></tr>
- <tr><td><a href="#uninstall"><tt>uninstall</tt></a></td><td></td>
- <td>Remove built objects from installation directory.
- </td></tr>
- </table>
-
-<!-- ======================================================================= -->
-<h3><a name="all">all (default)</a></h3>
-<div>
- <p>When you invoke <tt>make</tt> with no arguments, you are implicitly
- instructing it to seek the "all" target (goal). This target is used for
- building the software recursively and will do different things in different
- directories. For example, in a <tt>lib</tt> directory, the "all" target will
- compile source files and generate libraries. But, in a <tt>tools</tt>
- directory, it will link libraries and generate executables.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="all-local">all-local</a></h3>
-<div>
- <p>This target is the same as <a href="#all">all</a> but it operates only on
- the current directory instead of recursively.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="check">check</a></h3>
-<div>
- <p>This target can be invoked from anywhere within a project's directories
- but always invokes the <a href="#check-local"><tt>check-local</tt></a> target
- in the project's <tt>test</tt> directory, if it exists and has a
- <tt>Makefile</tt>. A warning is produced otherwise. If
- <a href="#TESTSUITE"><tt>TESTSUITE</tt></a> is defined on the <tt>make</tt>
- command line, it will be passed down to the invocation of
- <tt>make check-local</tt> in the <tt>test</tt> directory. The intended usage
- for this is to assist in running specific suites of tests. If
- <tt>TESTSUITE</tt> is not set, the implementation of <tt>check-local</tt>
- should run all normal tests. It is up to the project to define what
-  different values for <tt>TESTSUITE</tt> will do. See the
- <a href="TestingGuide.html">TestingGuide</a> for further details.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="check-local">check-local</a></h3>
-<div>
- <p>This target should be implemented by the <tt>Makefile</tt> in the project's
- <tt>test</tt> directory. It is invoked by the <tt>check</tt> target elsewhere.
- Each project is free to define the actions of <tt>check-local</tt> as
- appropriate for that project. The LLVM project itself uses dejagnu to run a
-  suite of feature and regression tests. Other projects may choose to use
- dejagnu or any other testing mechanism.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="clean">clean</a></h3>
-<div>
- <p>This target cleans the build directory, recursively removing all things
-  that the Makefile builds. The cleaning rules have been made guarded so they
-  shouldn't go awry (e.g. via <tt>rm -f $(UNSET_VARIABLE)/*</tt>, which would
-  attempt to erase the entire directory structure).</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="clean-local">clean-local</a></h3>
-<div>
- <p>This target does the same thing as <tt>clean</tt> but only for the current
- (local) directory.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="dist">dist</a></h3>
-<div>
- <p>This target builds a distribution tarball. It first builds the entire
- project using the <tt>all</tt> target and then tars up the necessary files and
- compresses it. The generated tarball is sufficient for a casual source
- distribution, but probably not for a release (see <tt>dist-check</tt>).</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="dist-check">dist-check</a></h3>
-<div>
- <p>This target does the same thing as the <tt>dist</tt> target but also checks
- the distribution tarball. The check is made by unpacking the tarball to a new
- directory, configuring it, building it, installing it, and then verifying that
- the installation results are correct (by comparing to the original build).
- This target can take a long time to run but should be done before a release
- goes out to make sure that the distributed tarball can actually be built into
- a working release.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="dist-clean">dist-clean</a></h3>
-<div>
-  <p>This is a special form of the <tt>clean</tt> target. It performs a
- normal <tt>clean</tt> but also removes things pertaining to building the
- distribution.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="install">install</a></h3>
-<div>
- <p>This target finalizes shared objects and executables and copies all
- libraries, headers, executables and documentation to the directory given
- with the <tt>--prefix</tt> option to <tt>configure</tt>. When completed,
- the prefix directory will have everything needed to <b>use</b> LLVM. </p>
- <p>The LLVM makefiles can generate complete <b>internal</b> documentation
- for all the classes by using <tt>doxygen</tt>. By default, this feature is
- <b>not</b> enabled because it takes a long time and generates a massive
- amount of data (>100MB). If you want this feature, you must configure LLVM
- with the --enable-doxygen switch and ensure that a modern version of doxygen
- (1.3.7 or later) is available in your <tt>PATH</tt>. You can download
- doxygen from
- <a href="http://www.stack.nl/~dimitri/doxygen/download.html#latestsrc">
-  here</a>.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="preconditions">preconditions</a></h3>
-<div>
- <p>This utility target checks to see if the <tt>Makefile</tt> in the object
- directory is older than the <tt>Makefile</tt> in the source directory and
- copies it if so. It also reruns the <tt>configure</tt> script if that needs to
- be done and rebuilds the <tt>Makefile.config</tt> file similarly. Users may
- overload this target to ensure that sanity checks are run <em>before</em> any
- building of targets as all the targets depend on <tt>preconditions</tt>.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="printvars">printvars</a></h3>
-<div>
- <p>This utility target just causes the LLVM makefiles to print out some of
- the makefile variables so that you can double check how things are set. </p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="reconfigure">reconfigure</a></h3>
-<div>
- <p>This utility target will force a reconfigure of LLVM or your project. It
- simply runs <tt>$(PROJ_OBJ_ROOT)/config.status --recheck</tt> to rerun the
- configuration tests and rebuild the configured files. This isn't generally
-  useful as the makefiles will reconfigure themselves whenever it is necessary.
- </p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="spotless">spotless</a></h3>
-<div>
- <p>This utility target, only available when <tt>$(PROJ_OBJ_ROOT)</tt> is not
- the same as <tt>$(PROJ_SRC_ROOT)</tt>, will completely clean the
- <tt>$(PROJ_OBJ_ROOT)</tt> directory by removing its content entirely and
- reconfiguring the directory. This returns the <tt>$(PROJ_OBJ_ROOT)</tt>
- directory to a completely fresh state. All content in the directory except
- configured files and top-level makefiles will be lost.</p>
- <div class="doc_warning"><p>Use with caution.</p></div>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="tags">tags</a></h3>
-<div>
- <p>This target will generate a <tt>TAGS</tt> file in the top-level source
-  directory. It is meant for use with Emacs, XEmacs, or Vim. The TAGS file
- provides an index of symbol definitions so that the editor can jump you to the
- definition quickly. </p>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="uninstall">uninstall</a></h3>
-<div>
- <p>This target is the opposite of the <tt>install</tt> target. It removes the
- header, library and executable files from the installation directories. Note
- that the directories themselves are not removed because it is not guaranteed
- that LLVM is the only thing installing there (e.g. --prefix=/usr).</p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="variables">Variables</a></h2>
-<!-- *********************************************************************** -->
-<div>
- <p>Variables are used to tell the LLVM Makefile System what to do and to
- obtain information from it. Variables are also used internally by the LLVM
-  Makefile System. Variable names that contain only upper case alphabetic
-  letters and underscores are intended for use by the end user. All other
- variables are internal to the LLVM Makefile System and should not be relied
- upon nor modified. The sections below describe how to use the LLVM Makefile
- variables.</p>
-
-<!-- ======================================================================= -->
-<h3><a name="setvars">Control Variables</a></h3>
-<div>
- <p>Variables listed in the table below should be set <em>before</em> the
- inclusion of <a href="#Makefile.common"><tt>$(LEVEL)/Makefile.common</tt></a>.
- These variables provide input to the LLVM make system that tell it what to do
- for the current directory.</p>
- <dl>
- <dt><a name="BUILD_ARCHIVE"><tt>BUILD_ARCHIVE</tt></a></dt>
- <dd>If set to any value, causes an archive (.a) library to be built.</dd>
- <dt><a name="BUILT_SOURCES"><tt>BUILT_SOURCES</tt></a></dt>
- <dd>Specifies a set of source files that are generated from other source
- files. These sources will be built before any other target processing to
- ensure they are present.</dd>
- <dt><a name="BYTECODE_LIBRARY"><tt>BYTECODE_LIBRARY</tt></a></dt>
- <dd>If set to any value, causes a bitcode library (.bc) to be built.</dd>
- <dt><a name="CONFIG_FILES"><tt>CONFIG_FILES</tt></a></dt>
- <dd>Specifies a set of configuration files to be installed.</dd>
- <dt><a name="DEBUG_SYMBOLS"><tt>DEBUG_SYMBOLS</tt></a></dt>
- <dd>If set to any value, causes the build to include debugging
- symbols even in optimized objects, libraries and executables. This
- alters the flags specified to the compilers and linkers. Debugging
- isn't fun in an optimized build, but it is possible.</dd>
- <dt><a name="DIRS"><tt>DIRS</tt></a></dt>
- <dd>Specifies a set of directories, usually children of the current
- directory, that should also be made using the same goal. These directories
- will be built serially.</dd>
- <dt><a name="DISABLE_AUTO_DEPENDENCIES"><tt>DISABLE_AUTO_DEPENDENCIES</tt></a></dt>
- <dd>If set to any value, causes the makefiles to <b>not</b> automatically
- generate dependencies when running the compiler. Use of this feature is
- discouraged and it may be removed at a later date.</dd>
- <dt><a name="ENABLE_OPTIMIZED"><tt>ENABLE_OPTIMIZED</tt></a></dt>
- <dd>If set to 1, causes the build to generate optimized objects,
- libraries and executables. This alters the flags specified to the compilers
- and linkers. Generally debugging won't be a fun experience with an optimized
- build.</dd>
- <dt><a name="ENABLE_PROFILING"><tt>ENABLE_PROFILING</tt></a></dt>
- <dd>If set to 1, causes the build to generate both optimized and
- profiled objects, libraries and executables. This alters the flags specified
- to the compilers and linkers to ensure that profile data can be collected
- from the tools built. Use the <tt>gprof</tt> tool to analyze the output from
- the profiled tools (<tt>gmon.out</tt>).</dd>
- <dt><a name="DISABLE_ASSERTIONS"><tt>DISABLE_ASSERTIONS</tt></a></dt>
- <dd>If set to 1, causes the build to disable assertions, even if
- building a debug or profile build. This will exclude all assertion check
- code from the build. LLVM will execute faster, but with little help when
- things go wrong.</dd>
- <dt><a name="EXPERIMENTAL_DIRS"><tt>EXPERIMENTAL_DIRS</tt></a></dt>
- <dd>Specify a set of directories that should be built, but if they fail, it
- should not cause the build to fail. Note that this should only be used
- temporarily while code is being written.</dd>
- <dt><a name="EXPORTED_SYMBOL_FILE"><tt>EXPORTED_SYMBOL_FILE</tt></a></dt>
- <dd>Specifies the name of a single file that contains a list of the
- symbols to be exported by the linker. One symbol per line.</dd>
- <dt><a name="EXPORTED_SYMBOL_LIST"><tt>EXPORTED_SYMBOL_LIST</tt></a></dt>
- <dd>Specifies a set of symbols to be exported by the linker.</dd>
- <dt><a name="EXTRA_DIST"><tt>EXTRA_DIST</tt></a></dt>
- <dd>Specifies additional files that should be distributed with LLVM. All
- source files, all built sources, all Makefiles, and most documentation files
- will be automatically distributed. Use this variable to distribute any
- files that are not automatically distributed.</dd>
- <dt><a name="KEEP_SYMBOLS"><tt>KEEP_SYMBOLS</tt></a></dt>
- <dd>If set to any value, specifies that when linking executables the
- makefiles should retain debug symbols in the executable. Normally, symbols
- are stripped from the executable.</dd>
- <dt><a name="LEVEL"><tt>LEVEL</tt></a><small>(required)</small></dt>
- <dd>Specify the level of nesting from the top level. This variable must be
- set in each makefile as it is used to find the top level and thus the other
- makefiles.</dd>
- <dt><a name="LIBRARYNAME"><tt>LIBRARYNAME</tt></a></dt>
- <dd>Specify the name of the library to be built. (Required For
- Libraries)</dd>
- <dt><a name="LINK_COMPONENTS"><tt>LINK_COMPONENTS</tt></a></dt>
- <dd>When specified for building a tool, the value of this variable will be
- passed to the <tt>llvm-config</tt> tool to generate a link line for the
- tool. Unlike <tt>USEDLIBS</tt> and <tt>LLVMLIBS</tt>, not all libraries need
- to be specified. The <tt>llvm-config</tt> tool will figure out the library
- dependencies and add any libraries that are needed. The <tt>USEDLIBS</tt>
- variable can still be used in conjunction with <tt>LINK_COMPONENTS</tt> so
- that additional project-specific libraries can be linked with the LLVM
-    libraries specified by <tt>LINK_COMPONENTS</tt>.</dd>
- <dt><a name="LINK_LIBS_IN_SHARED"><tt>LINK_LIBS_IN_SHARED</tt></a></dt>
- <dd>By default, shared library linking will ignore any libraries specified
-    with the <a href="#LLVMLIBS">LLVMLIBS</a> or <a href="#USEDLIBS">USEDLIBS</a> variables.
- This prevents shared libs from including things that will be in the LLVM
- tool the shared library will be loaded into. However, sometimes it is useful
- to link certain libraries into your shared library and this option enables
- that feature.</dd>
- <dt><a name="LLVMLIBS"><tt>LLVMLIBS</tt></a></dt>
- <dd>Specifies the set of libraries from the LLVM $(ObjDir) that will be
- linked into the tool or library.</dd>
- <dt><a name="LOADABLE_MODULE"><tt>LOADABLE_MODULE</tt></a></dt>
- <dd>If set to any value, causes the shared library being built to also be
- a loadable module. Loadable modules can be opened with the dlopen() function
- and searched with dlsym (or the operating system's equivalent). Note that
- setting this variable without also setting <tt>SHARED_LIBRARY</tt> will have
- no effect.</dd>
- <dt><a name="MODULE_NAME"><tt>MODULE_NAME</tt></a></dt>
- <dd>Specifies the name of a bitcode module to be created. A bitcode
- module can be specified in conjunction with other kinds of library builds
- or by itself. It constructs from the sources a single linked bitcode
- file.</dd>
- <dt><a name="NO_INSTALL"><tt>NO_INSTALL</tt></a></dt>
- <dd>Specifies that the build products of the directory should not be
- installed but should be built even if the <tt>install</tt> target is given.
- This is handy for directories that build libraries or tools that are only
- used as part of the build process, such as code generators (e.g.
- <tt>tblgen</tt>).</dd>
- <dt><a name="OPTIONAL_DIRS"><tt>OPTIONAL_DIRS</tt></a></dt>
-    <dd>Specify a set of directories that may be built, if they exist, but it
-    is not an error if they do not exist.</dd>
- <dt><a name="PARALLEL_DIRS"><tt>PARALLEL_DIRS</tt></a></dt>
- <dd>Specify a set of directories to build recursively and in parallel if
- the -j option was used with <tt>make</tt>.</dd>
- <dt><a name="SHARED_LIBRARY"><tt>SHARED_LIBRARY</tt></a></dt>
- <dd>If set to any value, causes a shared library (.so) to be built in
- addition to any other kinds of libraries. Note that this option will cause
- all source files to be built twice: once with options for position
- independent code and once without. Use it only where you really need a
- shared library.</dd>
- <dt><a name="SOURCES"><tt>SOURCES</tt><small>(optional)</small></a></dt>
- <dd>Specifies the list of source files in the current directory to be
- built. Source files of any type may be specified (programs, documentation,
- config files, etc.). If not specified, the makefile system will infer the
- set of source files from the files present in the current directory.</dd>
- <dt><a name="SUFFIXES"><tt>SUFFIXES</tt></a></dt>
- <dd>Specifies a set of filename suffixes that occur in suffix match rules.
- Only set this if your local <tt>Makefile</tt> specifies additional suffix
- match rules.</dd>
- <dt><a name="TARGET"><tt>TARGET</tt></a></dt>
- <dd>Specifies the name of the LLVM code generation target that the
- current directory builds. Setting this variable enables additional rules to
- build <tt>.inc</tt> files from <tt>.td</tt> files. </dd>
- <dt><a name="TESTSUITE"><tt>TESTSUITE</tt></a></dt>
- <dd>Specifies the directory of tests to run in <tt>llvm/test</tt>.</dd>
- <dt><a name="TOOLNAME"><tt>TOOLNAME</tt></a></dt>
- <dd>Specifies the name of the tool that the current directory should
- build.</dd>
- <dt><a name="TOOL_VERBOSE"><tt>TOOL_VERBOSE</tt></a></dt>
- <dd>Implies VERBOSE and also tells each tool invoked to be verbose. This is
- handy when you're trying to see the sub-tools invoked by each tool invoked
- by the makefile. For example, this will pass <tt>-v</tt> to the GCC
-    compilers, which causes them to print out the command lines they use to
-    invoke sub-tools (compiler, assembler, linker).</dd>
- <dt><a name="USEDLIBS"><tt>USEDLIBS</tt></a></dt>
- <dd>Specifies the list of project libraries that will be linked into the
- tool or library.</dd>
- <dt><a name="VERBOSE"><tt>VERBOSE</tt></a></dt>
- <dd>Tells the Makefile system to produce detailed output of what it is doing
- instead of just summary comments. This will generate a LOT of output.</dd>
- </dl>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="overvars">Override Variables</a></h3>
-<div>
- <p>Override variables can be used to override the default
- values provided by the LLVM makefile system. These variables can be set in
- several ways:</p>
- <ul>
- <li>In the environment (e.g. setenv, export) -- not recommended.</li>
- <li>On the <tt>make</tt> command line -- recommended.</li>
- <li>On the <tt>configure</tt> command line</li>
- <li>In the Makefile (only <em>after</em> the inclusion of <a
- href="#Makefile.common"><tt>$(LEVEL)/Makefile.common</tt></a>).</li>
- </ul>
- <p>The override variables are given below:</p>
- <dl>
- <dt><a name="AR"><tt>AR</tt></a> <small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>ar</tt> tool.</dd>
- <dt><a name="PROJ_OBJ_DIR"><tt>PROJ_OBJ_DIR</tt></a></dt>
- <dd>The directory into which the products of build rules will be placed.
- This might be the same as
- <a href="#PROJ_SRC_DIR"><tt>PROJ_SRC_DIR</tt></a> but typically is
- not.</dd>
- <dt><a name="PROJ_SRC_DIR"><tt>PROJ_SRC_DIR</tt></a></dt>
- <dd>The directory which contains the source files to be built.</dd>
- <dt><a name="BUILD_EXAMPLES"><tt>BUILD_EXAMPLES</tt></a></dt>
- <dd>If set to 1, build examples in <tt>examples</tt> and (if building
- Clang) <tt>tools/clang/examples</tt> directories.</dd>
- <dt><a name="BZIP2"><tt>BZIP2</tt></a><small>(configured)</small></dt>
- <dd>The path to the <tt>bzip2</tt> tool.</dd>
- <dt><a name="CC"><tt>CC</tt></a><small>(configured)</small></dt>
- <dd>The path to the 'C' compiler.</dd>
- <dt><a name="CFLAGS"><tt>CFLAGS</tt></a></dt>
- <dd>Additional flags to be passed to the 'C' compiler.</dd>
- <dt><a name="CXX"><tt>CXX</tt></a></dt>
- <dd>Specifies the path to the C++ compiler.</dd>
- <dt><a name="CXXFLAGS"><tt>CXXFLAGS</tt></a></dt>
- <dd>Additional flags to be passed to the C++ compiler.</dd>
-    <dt><a name="DATE"><tt>DATE</tt></a><small>(configured)</small></dt>
-    <dd>Specifies the path to the <tt>date</tt> program or any program that can
-    generate the current date and time on its standard output.</dd>
- <dt><a name="DOT"><tt>DOT</tt></a><small>(configured)</small></dt>
- <dd>Specifies the path to the <tt>dot</tt> tool or <tt>false</tt> if there
- isn't one.</dd>
- <dt><a name="ECHO"><tt>ECHO</tt></a><small>(configured)</small></dt>
- <dd>Specifies the path to the <tt>echo</tt> tool for printing output.</dd>
- <dt><a name="EXEEXT"><tt>EXEEXT</tt></a><small>(configured)</small></dt>
- <dd>Provides the extension to be used on executables built by the makefiles.
- The value may be empty on platforms that do not use file extensions for
- executables (e.g. Unix).</dd>
- <dt><a name="INSTALL"><tt>INSTALL</tt></a><small>(configured)</small></dt>
- <dd>Specifies the path to the <tt>install</tt> tool.</dd>
- <dt><a name="LDFLAGS"><tt>LDFLAGS</tt></a><small>(configured)</small></dt>
- <dd>Allows users to specify additional flags to pass to the linker.</dd>
- <dt><a name="LIBS"><tt>LIBS</tt></a><small>(configured)</small></dt>
- <dd>The list of libraries that should be linked with each tool.</dd>
- <dt><a name="LIBTOOL"><tt>LIBTOOL</tt></a><small>(configured)</small></dt>
- <dd>Specifies the path to the <tt>libtool</tt> tool. This tool is renamed
-    <tt>mklib</tt> by the <tt>configure</tt> script.</dd>
- <dt><a name="LLVMAS"><tt>LLVMAS</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>llvm-as</tt> tool.</dd>
- <dt><a name="LLVMCC"><tt>LLVMCC</tt></a></dt>
- <dd>Specifies the path to the LLVM capable compiler.</dd>
- <dt><a name="LLVMCXX"><tt>LLVMCXX</tt></a></dt>
- <dd>Specifies the path to the LLVM C++ capable compiler.</dd>
- <dt><a name="LLVMGCC"><tt>LLVMGCC</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the LLVM version of the GCC 'C' Compiler</dd>
- <dt><a name="LLVMGXX"><tt>LLVMGXX</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the LLVM version of the GCC C++ Compiler</dd>
- <dt><a name="LLVMLD"><tt>LLVMLD</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the LLVM bitcode linker tool</dd>
- <dt><a name="LLVM_OBJ_ROOT"><tt>LLVM_OBJ_ROOT</tt></a><small>(configured)
- </small></dt>
- <dd>Specifies the top directory into which the output of the build is
- placed.</dd>
- <dt><a name="LLVM_SRC_ROOT"><tt>LLVM_SRC_ROOT</tt></a><small>(configured)
- </small></dt>
- <dd>Specifies the top directory in which the sources are found.</dd>
- <dt><a name="LLVM_TARBALL_NAME"><tt>LLVM_TARBALL_NAME</tt></a>
- <small>(configured)</small></dt>
- <dd>Specifies the name of the distribution tarball to create. This is
- configured from the name of the project and its version number.</dd>
- <dt><a name="MKDIR"><tt>MKDIR</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>mkdir</tt> tool that creates
- directories.</dd>
- <dt><a name="ONLY_TOOLS"><tt>ONLY_TOOLS</tt></a></dt>
- <dd>If set, specifies the list of tools to build.</dd>
- <dt><a name="PLATFORMSTRIPOPTS"><tt>PLATFORMSTRIPOPTS</tt></a></dt>
- <dd>The options to provide to the linker to specify that a stripped (no
- symbols) executable should be built.</dd>
- <dt><a name="RANLIB"><tt>RANLIB</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>ranlib</tt> tool.</dd>
- <dt><a name="RM"><tt>RM</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>rm</tt> tool.</dd>
- <dt><a name="SED"><tt>SED</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>sed</tt> tool.</dd>
- <dt><a name="SHLIBEXT"><tt>SHLIBEXT</tt></a><small>(configured)</small></dt>
- <dd>Provides the filename extension to use for shared libraries.</dd>
- <dt><a name="TBLGEN"><tt>TBLGEN</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>tblgen</tt> tool.</dd>
- <dt><a name="TAR"><tt>TAR</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>tar</tt> tool.</dd>
- <dt><a name="ZIP"><tt>ZIP</tt></a><small>(defaulted)</small></dt>
- <dd>Specifies the path to the <tt>zip</tt> tool.</dd>
- </dl>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="getvars">Readable Variables</a></h3>
-<div>
- <p>Variables listed in the table below can be used by the user's Makefile but
- should not be changed. Changing the value will generally cause the build to go
- wrong, so don't do it.</p>
- <dl>
- <dt><a name="bindir"><tt>bindir</tt></a></dt>
- <dd>The directory into which executables will ultimately be installed. This
- value is derived from the <tt>--prefix</tt> option given to
- <tt>configure</tt>.</dd>
- <dt><a name="BuildMode"><tt>BuildMode</tt></a></dt>
- <dd>The name of the type of build being performed: Debug, Release, or
- Profile</dd>
- <dt><a name="bitcode_libdir"><tt>bytecode_libdir</tt></a></dt>
- <dd>The directory into which bitcode libraries will ultimately be
- installed. This value is derived from the <tt>--prefix</tt> option given to
- <tt>configure</tt>.</dd>
- <dt><a name="ConfigureScriptFLAGS"><tt>ConfigureScriptFLAGS</tt></a></dt>
- <dd>Additional flags given to the <tt>configure</tt> script when
- reconfiguring.</dd>
- <dt><a name="DistDir"><tt>DistDir</tt></a></dt>
- <dd>The <em>current</em> directory for which a distribution copy is being
- made.</dd>
- <dt><a name="Echo"><tt>Echo</tt></a></dt>
- <dd>The LLVM Makefile System output command. This provides the
- <tt>llvm[n]</tt> prefix and starts with @ so the command itself is not
- printed by <tt>make</tt>.</dd>
- <dt><a name="EchoCmd"><tt>EchoCmd</tt></a></dt>
- <dd> Same as <a href="#Echo"><tt>Echo</tt></a> but without the leading @.
- </dd>
- <dt><a name="includedir"><tt>includedir</tt></a></dt>
- <dd>The directory into which include files will ultimately be installed.
- This value is derived from the <tt>--prefix</tt> option given to
- <tt>configure</tt>.</dd>
-    <dt><a name="libdir"><tt>libdir</tt></a></dt>
- <dd>The directory into which native libraries will ultimately be installed.
- This value is derived from the <tt>--prefix</tt> option given to
- <tt>configure</tt>.</dd>
- <dt><a name="LibDir"><tt>LibDir</tt></a></dt>
- <dd>The configuration specific directory into which libraries are placed
- before installation.</dd>
- <dt><a name="MakefileConfig"><tt>MakefileConfig</tt></a></dt>
- <dd>Full path of the <tt>Makefile.config</tt> file.</dd>
- <dt><a name="MakefileConfigIn"><tt>MakefileConfigIn</tt></a></dt>
- <dd>Full path of the <tt>Makefile.config.in</tt> file.</dd>
- <dt><a name="ObjDir"><tt>ObjDir</tt></a></dt>
- <dd>The configuration and directory specific directory where build objects
- (compilation results) are placed.</dd>
- <dt><a name="SubDirs"><tt>SubDirs</tt></a></dt>
- <dd>The complete list of sub-directories of the current directory as
- specified by other variables.</dd>
- <dt><a name="Sources"><tt>Sources</tt></a></dt>
- <dd>The complete list of source files.</dd>
- <dt><a name="sysconfdir"><tt>sysconfdir</tt></a></dt>
- <dd>The directory into which configuration files will ultimately be
- installed. This value is derived from the <tt>--prefix</tt> option given to
- <tt>configure</tt>.</dd>
- <dt><a name="ToolDir"><tt>ToolDir</tt></a></dt>
- <dd>The configuration specific directory into which executables are placed
- before they are installed.</dd>
- <dt><a name="TopDistDir"><tt>TopDistDir</tt></a></dt>
-    <dd>The topmost directory into which the distribution files are copied.
- </dd>
- <dt><a name="Verb"><tt>Verb</tt></a></dt>
- <dd>Use this as the first thing on your build script lines to enable or
- disable verbose mode. It expands to either an @ (quiet mode) or nothing
- (verbose mode). </dd>
- </dl>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="intvars">Internal Variables</a></h3>
-<div>
- <p>Variables listed below are used by the LLVM Makefile System
- and considered internal. You should not use these variables under any
- circumstances.</p>
- <p><tt>
- Archive
- AR.Flags
- BaseNameSources
- BCCompile.C
- BCCompile.CXX
- BCLinkLib
- C.Flags
- Compile.C
- CompileCommonOpts
- Compile.CXX
- ConfigStatusScript
- ConfigureScript
-    CPP.Flags
- CXX.Flags
- DependFiles
- DestArchiveLib
- DestBitcodeLib
- DestModule
- DestSharedLib
- DestTool
- DistAlways
- DistCheckDir
- DistCheckTop
- DistFiles
- DistName
- DistOther
- DistSources
- DistSubDirs
- DistTarBZ2
- DistTarGZip
- DistZip
- ExtraLibs
- FakeSources
- INCFiles
- InternalTargets
- LD.Flags
- LibName.A
- LibName.BC
- LibName.LA
- LibName.O
- LibTool.Flags
- Link
- LinkModule
- LLVMLibDir
- LLVMLibsOptions
- LLVMLibsPaths
- LLVMToolDir
- LLVMUsedLibs
- LocalTargets
- Module
- ObjectsBC
- ObjectsLO
- ObjectsO
- ObjMakefiles
- ParallelTargets
- PreConditions
- ProjLibsOptions
- ProjLibsPaths
- ProjUsedLibs
- Ranlib
- RecursiveTargets
- SrcMakefiles
- Strip
- StripWarnMsg
- TableGen
- TDFiles
- ToolBuildPath
- TopLevelTargets
- UserTargets
- </tt></p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
-</address>
-</body>
-</html>
diff --git a/docs/MakefileGuide.rst b/docs/MakefileGuide.rst
new file mode 100644
index 0000000..d2bdd24
--- /dev/null
+++ b/docs/MakefileGuide.rst
@@ -0,0 +1,956 @@
+.. _makefile_guide:
+
+===================
+LLVM Makefile Guide
+===================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+This document provides *usage* information about the LLVM makefile system. While
+loosely patterned after the BSD makefile system, LLVM has taken a departure from
+BSD in order to implement additional features needed by LLVM. Although makefile
+systems, such as ``automake``, were attempted at one point, it has become clear
+that the gap between the features needed by LLVM and the ``Makefile`` norm is
+too great for a more limited tool. Consequently, LLVM requires only GNU Make
+3.79, a widely portable makefile processor. LLVM unabashedly makes heavy use of
+the features of GNU Make, so the dependency on GNU Make is firm. If you're not
+familiar with
+``make``, it is recommended that you read the `GNU Makefile Manual
+<http://www.gnu.org/software/make/manual/make.html>`_.
+
+While this document is rightly part of the `LLVM Programmer's
+Manual <ProgrammersManual.html>`_, it is treated separately here because of the
+volume of content and because it is often an early source of bewilderment for
+new developers.
+
+General Concepts
+================
+
+The LLVM Makefile System is the component of LLVM that is responsible for
+building the software, testing it, generating distributions, checking those
+distributions, installing and uninstalling, etc. It consists of several files
+throughout the source tree. These files and other general concepts are described
+in this section.
+
+Projects
+--------
+
+The LLVM Makefile System is quite generous. It not only builds its own software,
+but it can build yours too. Built into the system is knowledge of the
+``llvm/projects`` directory. Any directory under ``projects`` that has both a
+``configure`` script and a ``Makefile`` is assumed to be a project that uses the
+LLVM Makefile system. Building software that uses LLVM does not require the
+LLVM Makefile System nor even placement in the ``llvm/projects``
+directory. However, doing so will allow your project to get up and running
+quickly by utilizing the built-in features that are used to compile LLVM. LLVM
+compiles itself using the same features of the makefile system as used for
+projects.
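+
+Under this convention, the top-level ``Makefile`` of a project dropped into
+``llvm/projects`` can be as small as the following sketch (the subdirectory
+names are illustrative):
+
+.. code-block:: makefile
+
+   # Top of the project, so the nesting level is zero.
+   LEVEL = .
+
+   # Hypothetical subdirectories to build recursively.
+   DIRS = lib tools
+
+   include $(LEVEL)/Makefile.common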
+
+For complete details on setting up your project's configuration, simply mimic
+the ``llvm/projects/sample`` project or, for further details, consult the
+`Projects <Projects.html>`_ page.
+
+Variable Values
+---------------
+
+To use the makefile system, you simply create a file named ``Makefile`` in your
+directory and declare values for certain variables. The variables and values
+that you select determine what the makefile system will do. These variables
+enable rules and processing in the makefile system that automatically Do The
+Right Thing™.
+
+Including Makefiles
+-------------------
+
+Setting variables alone is not enough. You must include into your Makefile
+additional files that provide the rules of the LLVM Makefile system. The various
+files involved are described in the sections that follow.
+
+``Makefile``
+^^^^^^^^^^^^
+
+Each directory to participate in the build needs to have a file named
+``Makefile``. This is the file first read by ``make``. It has three
+sections:
+
+#. Settable Variables --- Required variables that must be set first.
+#. ``include $(LEVEL)/Makefile.common`` --- include the LLVM Makefile system.
+#. Override Variables --- Override variables set by the LLVM Makefile system.
+
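+For illustration, a directory ``Makefile`` following this three-section layout
+might look like the sketch below (the library name is hypothetical):
+
+.. code-block:: makefile
+
+   # Section 1: settable variables, declared before the include.
+   LEVEL = ../..
+   LIBRARYNAME = mylib
+
+   # Section 2: pull in the LLVM Makefile system.
+   include $(LEVEL)/Makefile.common
+
+   # Section 3: overrides applied after the include, if any are needed.
+   CXXFLAGS += -Wextra
+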
+.. _$(LEVEL)/Makefile.common:
+
+``Makefile.common``
+^^^^^^^^^^^^^^^^^^^
+
+Every project must have a ``Makefile.common`` file at its top source
+directory. This file serves three purposes:
+
+#. It includes the project's configuration makefile to obtain values determined
+ by the ``configure`` script. This is done by including the
+ `$(LEVEL)/Makefile.config`_ file.
+
+#. It specifies any other (static) values that are needed throughout the
+ project. Only values that are used in all or a large proportion of the
+ project's directories should be placed here.
+
+#. It includes the standard rules for the LLVM Makefile system,
+ `$(LLVM_SRC_ROOT)/Makefile.rules`_. This file is the *guts* of the LLVM
+ ``Makefile`` system.
+
+.. _$(LEVEL)/Makefile.config:
+
+``Makefile.config``
+^^^^^^^^^^^^^^^^^^^
+
+Every project must have a ``Makefile.config`` at the top of its *build*
+directory. This file is **generated** by the ``configure`` script from the
+pattern provided by the ``Makefile.config.in`` file located at the top of the
+project's *source* directory. The contents of this file depend largely on what
+configuration items the project uses; however, most projects can get what they
+need by just relying on LLVM's configuration found in
+``$(LLVM_OBJ_ROOT)/Makefile.config``.
+
+.. _$(LLVM_SRC_ROOT)/Makefile.rules:
+
+``Makefile.rules``
+^^^^^^^^^^^^^^^^^^
+
+This file, located at ``$(LLVM_SRC_ROOT)/Makefile.rules``, is the heart of the
+LLVM Makefile System. It provides all the logic, dependencies, and rules for
+building the targets supported by the system. What it does largely depends on
+the values of ``make`` `variables`_ that have been set *before*
+``Makefile.rules`` is included.
+
+Comments
+^^^^^^^^
+
+User ``Makefile``\s need not have comments in them unless the construction is
+unusual or it does not strictly follow the rules and patterns of the LLVM
+makefile system. Makefile comments begin with the pound (``#``) character.
+The ``#`` character and any text following it, to the end of the line, are
+ignored by ``make``.
+
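+For example, in a hypothetical user ``Makefile``:
+
+.. code-block:: makefile
+
+  # Build the utility library in this directory.
+  LIBRARYNAME = myutil   # everything after '#' is ignored by make
+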
+Tutorial
+========
+
+This section provides some examples of the different kinds of modules you can
+build with the LLVM makefile system. In general, each directory you provide will
+build a single object, although that object may be composed of additionally
+compiled components.
+
+Libraries
+---------
+
+Only a few variable definitions are needed to build a regular library.
+Normally, the makefile system will build all the software into a single
+``libname.o`` (pre-linked) object. This means the library is not searchable and
+that the distinction between compilation units has been dissolved. Optionally,
+you can ask for a shared library (``.so``) or archive library (``.a``) to be
+built. Archive libraries are the default. For example:
+
+.. code-block:: makefile
+
+ LIBRARYNAME = mylib
+ SHARED_LIBRARY = 1
+ ARCHIVE_LIBRARY = 1
+
+says to build a library named ``mylib`` with both a shared library
+(``mylib.so``) and an archive library (``mylib.a``) version. The contents of all
+the libraries produced will be the same; they are just constructed differently.
+Note that you normally do not need to specify the sources involved. The LLVM
+Makefile system will infer the source files from the contents of the source
+directory.
+
+The ``LOADABLE_MODULE=1`` directive can be used in conjunction with
+``SHARED_LIBRARY=1`` to indicate that the resulting shared library should be
+openable with the ``dlopen`` function and searchable with the ``dlsym`` function
+(or your operating system's equivalents). While this isn't strictly necessary on
+Linux and a few other platforms, it is required on systems like HP-UX and
+Darwin. You should use ``LOADABLE_MODULE`` for any shared library that you
+intend to be loaded into a tool via the ``-load`` option. See the
+`WritingAnLLVMPass.html <WritingAnLLVMPass.html#makefile>`_ document for an
+example of why you might want to do this.
+
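+As a sketch (the name ``myplugin`` is illustrative), such a library would be
+requested with:
+
+.. code-block:: makefile
+
+  LIBRARYNAME = myplugin
+  SHARED_LIBRARY = 1
+  LOADABLE_MODULE = 1
+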
+Bitcode Modules
+^^^^^^^^^^^^^^^
+
+In some situations, it is desirable to build a single bitcode module from a
+variety of sources, instead of an archive, shared library, or bitcode
+library. Bitcode modules can be specified in addition to any of the other types
+of libraries by defining the `MODULE_NAME`_ variable. For example:
+
+.. code-block:: makefile
+
+ LIBRARYNAME = mylib
+ BYTECODE_LIBRARY = 1
+ MODULE_NAME = mymod
+
+will build a module named ``mymod.bc`` from the sources in the directory. This
+module will be an aggregation of all the bitcode modules derived from the
+sources. The example will also build a bitcode archive containing a bitcode
+module for each compiled source file. The difference is subtle, but important
+depending on how the module or library is to be linked.
+
+Loadable Modules
+^^^^^^^^^^^^^^^^
+
+In some situations, you need to create a loadable module. Loadable modules can
+be loaded into programs like ``opt`` or ``llc`` to specify additional passes to
+run or targets to support. Loadable modules are also useful for debugging a
+pass or for distributing a pass in a separate package when that pass can't be
+included in LLVM.
+
+LLVM provides complete support for building such a module. All you need to do is
+use the ``LOADABLE_MODULE`` variable in your ``Makefile``. For example, to build
+a loadable module named ``MyMod`` that uses the LLVM libraries ``LLVMSupport.a``
+and ``LLVMSystem.a``, you would specify:
+
+.. code-block:: makefile
+
+ LIBRARYNAME := MyMod
+ LOADABLE_MODULE := 1
+ LINK_COMPONENTS := support system
+
+Use of the ``LOADABLE_MODULE`` facility implies several things:
+
+#. There will be no "``lib``" prefix on the module. This differentiates it from
+ a standard shared library of the same name.
+
+#. The `SHARED_LIBRARY`_ variable is turned on.
+
+#. The `LINK_LIBS_IN_SHARED`_ variable is turned on.
+
+A loadable module is loaded by LLVM via the facilities of libtool's ``libltdl``
+library, which is part of the ``lib/System`` implementation.
+
+Tools
+-----
+
+For building executable programs (tools), you must provide the name of the tool
+and the names of the libraries you wish to link with the tool. For example:
+
+.. code-block:: makefile
+
+ TOOLNAME = mytool
+ USEDLIBS = mylib
+ LINK_COMPONENTS = support system
+
+says that we are to build a tool named ``mytool`` and that it requires three
+libraries: ``mylib``, ``LLVMSupport.a`` and ``LLVMSystem.a``.
+
+Note that two different variables are used to indicate which libraries are
+linked: ``USEDLIBS`` and ``LLVMLIBS``. This distinction is necessary to support
+projects. ``LLVMLIBS`` refers to the LLVM libraries found in the LLVM object
+directory. ``USEDLIBS`` refers to the libraries built by your project. In the
+case of building LLVM tools, ``USEDLIBS`` and ``LLVMLIBS`` can be used
+interchangeably since the "project" is LLVM itself and ``USEDLIBS`` refers to
+the same place as ``LLVMLIBS``.
+
+Also note that there are two different ways of specifying a library: with a
+``.a`` suffix and without. Without the suffix, the entry refers to the re-linked
+(.o) file which will include *all* symbols of the library. This is
+useful, for example, to include all passes from a library of passes. If the
+``.a`` suffix is used then the library is linked as a searchable library (with
+the ``-l`` option). In this case, only the symbols that are unresolved *at
+that point* will be resolved from the library, if they exist. Other
+(unreferenced) symbols will not be included when the ``.a`` syntax is used. Note
+that in order to use the ``.a`` suffix, the library in question must have been
+built with the ``ARCHIVE_LIBRARY`` option set.
+
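+A brief sketch of the difference (the library names are illustrative):
+
+.. code-block:: makefile
+
+  # mypasses: re-linked .o form, pulls in *all* symbols (e.g. pass registrations)
+  # myutils.a: searchable archive, pulls in only symbols still unresolved
+  USEDLIBS = mypasses myutils.a
+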
+JIT Tools
+^^^^^^^^^
+
+Many tools will want to use the JIT features of LLVM. To do this, you simply
+specify that you want an execution 'engine', and the makefiles will
+automatically link in the appropriate JIT for the host or an interpreter if none
+is available:
+
+.. code-block:: makefile
+
+ TOOLNAME = my_jit_tool
+ USEDLIBS = mylib
+ LINK_COMPONENTS = engine
+
+Of course, any additional libraries may be listed as other components. To get a
+full understanding of how this changes the linker command, it is recommended
+that you:
+
+.. code-block:: bash
+
+ % cd examples/Fibonacci
+ % make VERBOSE=1
+
+Targets Supported
+=================
+
+This section describes each of the targets that can be built using the LLVM
+Makefile system. Any target can be invoked from any directory but not all are
+applicable to a given directory (e.g. "check", "dist" and "install" will always
+operate as if invoked from the top level directory).
+
+================= =============== ==================
+Target Name Implied Targets Target Description
+================= =============== ==================
+``all`` \ Compile the software recursively. Default target.
+``all-local`` \ Compile the software in the local directory only.
+``check`` \ Change to the ``test`` directory in a project and run the test suite there.
+``check-local`` \ Run a local test suite. Generally this is only defined in the ``Makefile`` of the project's ``test`` directory.
+``clean`` \ Remove built objects recursively.
+``clean-local`` \ Remove built objects from the local directory only.
+``dist`` ``all`` Prepare a source distribution tarball.
+``dist-check`` ``all`` Prepare a source distribution tarball and check that it builds.
+``dist-clean`` ``clean`` Clean source distribution tarball temporary files.
+``install`` ``all`` Copy built objects to installation directory.
+``preconditions`` ``all`` Check to make sure configuration and makefiles are up to date.
+``printvars`` ``all`` Prints variables defined by the makefile system (for debugging).
+``tags`` \ Make C and C++ tags files for emacs and vi.
+``uninstall`` \ Remove built objects from installation directory.
+================= =============== ==================
+
+.. _all:
+
+``all`` (default)
+-----------------
+
+When you invoke ``make`` with no arguments, you are implicitly instructing it to
+seek the ``all`` target (goal). This target is used for building the software
+recursively and will do different things in different directories. For example,
+in a ``lib`` directory, the ``all`` target will compile source files and
+generate libraries. But, in a ``tools`` directory, it will link libraries and
+generate executables.
+
+``all-local``
+-------------
+
+This target is the same as `all`_ but it operates only on the current directory
+instead of recursively.
+
+``check``
+---------
+
+This target can be invoked from anywhere within a project's directories but
+always invokes the `check-local`_ target in the project's ``test`` directory, if
+it exists and has a ``Makefile``. A warning is produced otherwise. If
+`TESTSUITE`_ is defined on the ``make`` command line, it will be passed down to
+the invocation of ``make check-local`` in the ``test`` directory. The intended
+usage for this is to assist in running specific suites of tests. If
+``TESTSUITE`` is not set, the implementation of ``check-local`` should run all
+normal tests. It is up to the project to define what different values for
+``TESTSUITE`` will do. See the `Testing Guide <TestingGuide.html>`_ for further
+details.
+
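+For example, assuming a test suite subdirectory named ``Regression`` exists,
+one might run only those tests with:
+
+.. code-block:: bash
+
+  % make check TESTSUITE=Regression
+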
+``check-local``
+---------------
+
+This target should be implemented by the ``Makefile`` in the project's ``test``
+directory. It is invoked by the ``check`` target elsewhere. Each project is
+free to define the actions of ``check-local`` as appropriate for that
+project. The LLVM project itself uses DejaGNU to run a suite of feature and
+regression tests. Other projects may choose to use DejaGNU or any other testing
+mechanism.
+
+``clean``
+---------
+
+This target cleans the build directory, recursively removing all things that the
+Makefile builds. The cleaning rules have been made guarded so they shouldn't go
+awry (e.g. via ``rm -f $(UNSET_VARIABLE)/*``, which would attempt to erase the
+entire directory structure).
+
+``clean-local``
+---------------
+
+This target does the same thing as ``clean`` but only for the current (local)
+directory.
+
+``dist``
+--------
+
+This target builds a distribution tarball. It first builds the entire project
+using the ``all`` target and then tars up the necessary files and compresses
+it. The generated tarball is sufficient for a casual source distribution, but
+probably not for a release (see ``dist-check``).
+
+``dist-check``
+--------------
+
+This target does the same thing as the ``dist`` target but also checks the
+distribution tarball. The check is made by unpacking the tarball to a new
+directory, configuring it, building it, installing it, and then verifying that
+the installation results are correct (by comparing to the original build). This
+target can take a long time to run but should be done before a release goes out
+to make sure that the distributed tarball can actually be built into a working
+release.
+
+``dist-clean``
+--------------
+
+This is a special form of the ``clean`` target. It performs a normal
+``clean`` but also removes things pertaining to building the distribution.
+
+``install``
+-----------
+
+This target finalizes shared objects and executables and copies all libraries,
+headers, executables and documentation to the directory given with the
+``--prefix`` option to ``configure``. When completed, the prefix directory will
+have everything needed to **use** LLVM.
+
+The LLVM makefiles can generate complete **internal** documentation for all the
+classes by using ``doxygen``. By default, this feature is **not** enabled
+because it takes a long time and generates a massive amount of data (>100MB). If
+you want this feature, you must configure LLVM with the ``--enable-doxygen``
+switch and ensure that a modern version of doxygen (1.3.7 or later) is available in
+your ``PATH``. You can download doxygen from `here
+<http://www.stack.nl/~dimitri/doxygen/download.html#latestsrc>`_.
+
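+A minimal sketch of an install flow (the prefix path is illustrative):
+
+.. code-block:: bash
+
+  % ./configure --prefix=/usr/local/llvm
+  % make
+  % make install
+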
+``preconditions``
+-----------------
+
+This utility target checks to see if the ``Makefile`` in the object directory is
+older than the ``Makefile`` in the source directory and copies it if so. It also
+reruns the ``configure`` script if that needs to be done and rebuilds the
+``Makefile.config`` file similarly. Users may overload this target to ensure
+that sanity checks are run *before* any building of targets, as all the
+targets depend on ``preconditions``.
+
+``printvars``
+-------------
+
+This utility target just causes the LLVM makefiles to print out some of the
+makefile variables so that you can double check how things are set.
+
+``reconfigure``
+---------------
+
+This utility target will force a reconfigure of LLVM or your project. It simply
+runs ``$(PROJ_OBJ_ROOT)/config.status --recheck`` to rerun the configuration
+tests and rebuild the configured files. This isn't generally useful as the
+makefiles will reconfigure themselves whenever it is necessary.
+
+``spotless``
+------------
+
+.. warning::
+
+ Use with caution!
+
+This utility target, only available when ``$(PROJ_OBJ_ROOT)`` is not the same as
+``$(PROJ_SRC_ROOT)``, will completely clean the ``$(PROJ_OBJ_ROOT)`` directory
+by removing its content entirely and reconfiguring the directory. This returns
+the ``$(PROJ_OBJ_ROOT)`` directory to a completely fresh state. All content in
+the directory except configured files and top-level makefiles will be lost.
+
+``tags``
+--------
+
+This target will generate a ``TAGS`` file in the top-level source directory. It
+is meant for use with Emacs, XEmacs, or Vim. The ``TAGS`` file provides an index of
+symbol definitions so that the editor can jump you to the definition
+quickly.
+
+``uninstall``
+-------------
+
+This target is the opposite of the ``install`` target. It removes the header,
+library and executable files from the installation directories. Note that the
+directories themselves are not removed because it is not guaranteed that LLVM is
+the only thing installing there (e.g. ``--prefix=/usr``).
+
+.. _variables:
+
+Variables
+=========
+
+Variables are used to tell the LLVM Makefile System what to do and to obtain
+information from it. Variables are also used internally by the LLVM Makefile
+System. Variable names that contain only upper-case letters and underscores
+are intended for use by the end user. All other variables are
+internal to the LLVM Makefile System and should not be relied upon nor
+modified. The sections below describe how to use the LLVM Makefile
+variables.
+
+Control Variables
+-----------------
+
+Variables listed in the table below should be set *before* the inclusion of
+`$(LEVEL)/Makefile.common`_. These variables provide input to the LLVM make
+system, telling it what to do for the current directory.
+
+``BUILD_ARCHIVE``
+ If set to any value, causes an archive (``.a``) library to be built.
+
+``BUILT_SOURCES``
+ Specifies a set of source files that are generated from other source
+ files. These sources will be built before any other target processing to
+ ensure they are present.
+
+``BYTECODE_LIBRARY``
+ If set to any value, causes a bitcode library (``.bc``) to be built.
+
+``CONFIG_FILES``
+ Specifies a set of configuration files to be installed.
+
+``DEBUG_SYMBOLS``
+ If set to any value, causes the build to include debugging symbols even in
+ optimized objects, libraries and executables. This alters the flags
+ specified to the compilers and linkers. Debugging isn't fun in an optimized
+ build, but it is possible.
+
+``DIRS``
+ Specifies a set of directories, usually children of the current directory,
+ that should also be made using the same goal. These directories will be
+ built serially.
+
+``DISABLE_AUTO_DEPENDENCIES``
+ If set to any value, causes the makefiles to **not** automatically generate
+ dependencies when running the compiler. Use of this feature is discouraged
+ and it may be removed at a later date.
+
+``ENABLE_OPTIMIZED``
+ If set to 1, causes the build to generate optimized objects, libraries and
+ executables. This alters the flags specified to the compilers and
+ linkers. Generally debugging won't be a fun experience with an optimized
+ build.
+
+``ENABLE_PROFILING``
+ If set to 1, causes the build to generate both optimized and profiled
+ objects, libraries and executables. This alters the flags specified to the
+ compilers and linkers to ensure that profile data can be collected from the
+ tools built. Use the ``gprof`` tool to analyze the output from the profiled
+ tools (``gmon.out``).
+
+``DISABLE_ASSERTIONS``
+ If set to 1, causes the build to disable assertions, even if building a
+ debug or profile build. This will exclude all assertion check code from the
+ build. LLVM will execute faster, but with little help when things go
+ wrong.
+
+``EXPERIMENTAL_DIRS``
+ Specify a set of directories that should be built, but whose failure should
+ not cause the build to fail. Note that this should only be used
+ temporarily while code is being written.
+
+``EXPORTED_SYMBOL_FILE``
+ Specifies the name of a single file that contains a list of the symbols to
+ be exported by the linker. One symbol per line.
+
+``EXPORTED_SYMBOL_LIST``
+ Specifies a set of symbols to be exported by the linker.
+
+``EXTRA_DIST``
+ Specifies additional files that should be distributed with LLVM. All source
+ files, all built sources, all Makefiles, and most documentation files will
+ be automatically distributed. Use this variable to distribute any files that
+ are not automatically distributed.
+
+``KEEP_SYMBOLS``
+ If set to any value, specifies that when linking executables the makefiles
+ should retain debug symbols in the executable. Normally, symbols are
+ stripped from the executable.
+
+``LEVEL`` (required)
+ Specify the level of nesting from the top level. This variable must be set
+ in each makefile as it is used to find the top level and thus the other
+ makefiles.
+
+``LIBRARYNAME``
+ Specify the name of the library to be built. (Required for libraries.)
+
+``LINK_COMPONENTS``
+ When specified for building a tool, the value of this variable will be
+ passed to the ``llvm-config`` tool to generate a link line for the
+ tool. Unlike ``USEDLIBS`` and ``LLVMLIBS``, not all libraries need to be
+ specified. The ``llvm-config`` tool will figure out the library dependencies
+ and add any libraries that are needed. The ``USEDLIBS`` variable can still
+ be used in conjunction with ``LINK_COMPONENTS`` so that additional
+ project-specific libraries can be linked with the LLVM libraries specified
+ by ``LINK_COMPONENTS``.
+
+.. _LINK_LIBS_IN_SHARED:
+
+``LINK_LIBS_IN_SHARED``
+ By default, shared library linking will ignore any libraries specified with
+ the `LLVMLIBS`_ or `USEDLIBS`_ variables. This prevents shared libraries from
+ including things that will be in the LLVM tool into which the shared library
+ will be loaded. However, sometimes it is useful to link certain libraries
+ into your shared library, and this option enables that feature.
+
+.. _LLVMLIBS:
+
+``LLVMLIBS``
+ Specifies the set of libraries from the LLVM ``$(ObjDir)`` that will be
+ linked into the tool or library.
+
+``LOADABLE_MODULE``
+ If set to any value, causes the shared library being built to also be a
+ loadable module. Loadable modules can be opened with the ``dlopen()`` function
+ and searched with ``dlsym()`` (or the operating system's equivalent). Note that
+ setting this variable without also setting ``SHARED_LIBRARY`` will have no
+ effect.
+
+.. _MODULE_NAME:
+
+``MODULE_NAME``
+ Specifies the name of a bitcode module to be created. A bitcode module can
+ be specified in conjunction with other kinds of library builds or by
+ itself. It constructs a single linked bitcode file from the sources.
+
+``NO_INSTALL``
+ Specifies that the build products of the directory should not be installed
+ but should be built even if the ``install`` target is given. This is handy
+ for directories that build libraries or tools that are only used as part of
+ the build process, such as code generators (e.g. ``tblgen``).
+
+``OPTIONAL_DIRS``
+ Specify a set of directories that may be built, if they exist, but it is not
+ an error for them not to exist.
+
+``PARALLEL_DIRS``
+ Specify a set of directories to build recursively and in parallel if the
+ ``-j`` option was used with ``make``.
+
+.. _SHARED_LIBRARY:
+
+``SHARED_LIBRARY``
+ If set to any value, causes a shared library (``.so``) to be built in
+ addition to any other kinds of libraries. Note that this option will cause
+ all source files to be built twice: once with options for position
+ independent code and once without. Use it only where you really need a
+ shared library.
+
+``SOURCES`` (optional)
+ Specifies the list of source files in the current directory to be
+ built. Source files of any type may be specified (programs, documentation,
+ config files, etc.). If not specified, the makefile system will infer the
+ set of source files from the files present in the current directory.
+
+``SUFFIXES``
+ Specifies a set of filename suffixes that occur in suffix match rules. Only
+ set this if your local ``Makefile`` specifies additional suffix match
+ rules.
+
+``TARGET``
+ Specifies the name of the LLVM code generation target that the current
+ directory builds. Setting this variable enables additional rules to build
+ ``.inc`` files from ``.td`` files.
+
+.. _TESTSUITE:
+
+``TESTSUITE``
+ Specifies the directory of tests to run in ``llvm/test``.
+
+``TOOLNAME``
+ Specifies the name of the tool that the current directory should build.
+
+``TOOL_VERBOSE``
+ Implies ``VERBOSE`` and also tells each tool invoked to be verbose. This is
+ handy when you're trying to see the sub-tools invoked by each tool invoked
+ by the makefile. For example, this will pass ``-v`` to the GCC compilers,
+ which causes them to print out the command lines they use to invoke sub-tools
+ (compiler, assembler, linker).
+
+.. _USEDLIBS:
+
+``USEDLIBS``
+ Specifies the list of project libraries that will be linked into the tool or
+ library.
+
+``VERBOSE``
+ Tells the Makefile system to produce detailed output of what it is doing
+ instead of just summary comments. This will generate a LOT of output.
+
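+A short sketch tying several of these control variables together (the
+directory names are illustrative):
+
+.. code-block:: makefile
+
+  LEVEL = ../..
+  DIRS = utils lib          # built serially, in order
+  PARALLEL_DIRS = tools     # built in parallel under make -j
+  include $(LEVEL)/Makefile.common
+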
+Override Variables
+------------------
+
+Override variables can be used to override the default values provided by the
+LLVM makefile system. These variables can be set in several ways:
+
+* In the environment (e.g. setenv, export) --- not recommended.
+* On the ``make`` command line --- recommended.
+* On the ``configure`` command line.
+* In the Makefile (only *after* the inclusion of `$(LEVEL)/Makefile.common`_).
+
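+For example, a command-line override might look like the following sketch (the
+compiler path and flag values are illustrative):
+
+.. code-block:: bash
+
+  % make CC=/usr/bin/gcc-4.2 CXXFLAGS=-Wextra
+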
+The override variables are given below:
+
+``AR`` (defaulted)
+ Specifies the path to the ``ar`` tool.
+
+``PROJ_OBJ_DIR``
+ The directory into which the products of build rules will be placed. This
+ might be the same as `PROJ_SRC_DIR`_ but typically is not.
+
+.. _PROJ_SRC_DIR:
+
+``PROJ_SRC_DIR``
+ The directory which contains the source files to be built.
+
+``BUILD_EXAMPLES``
+ If set to 1, build examples in ``examples`` and (if building Clang)
+ ``tools/clang/examples`` directories.
+
+``BZIP2`` (configured)
+ The path to the ``bzip2`` tool.
+
+``CC`` (configured)
+ The path to the 'C' compiler.
+
+``CFLAGS``
+ Additional flags to be passed to the 'C' compiler.
+
+``CXX``
+ Specifies the path to the C++ compiler.
+
+``CXXFLAGS``
+ Additional flags to be passed to the C++ compiler.
+
+``DATE`` (configured)
+ Specifies the path to the ``date`` program or any program that can generate
+ the current date and time on its standard output.
+
+``DOT`` (configured)
+ Specifies the path to the ``dot`` tool or ``false`` if there isn't one.
+
+``ECHO`` (configured)
+ Specifies the path to the ``echo`` tool for printing output.
+
+``EXEEXT`` (configured)
+ Provides the extension to be used on executables built by the makefiles.
+ The value may be empty on platforms that do not use file extensions for
+ executables (e.g. Unix).
+
+``INSTALL`` (configured)
+ Specifies the path to the ``install`` tool.
+
+``LDFLAGS`` (configured)
+ Allows users to specify additional flags to pass to the linker.
+
+``LIBS`` (configured)
+ The list of libraries that should be linked with each tool.
+
+``LIBTOOL`` (configured)
+ Specifies the path to the ``libtool`` tool. This tool is renamed ``mklib``
+ by the ``configure`` script.
+
+``LLVMAS`` (defaulted)
+ Specifies the path to the ``llvm-as`` tool.
+
+``LLVMCC``
+ Specifies the path to the LLVM capable compiler.
+
+``LLVMCXX``
+ Specifies the path to the LLVM C++ capable compiler.
+
+``LLVMGCC`` (defaulted)
+ Specifies the path to the LLVM version of the GCC 'C' Compiler.
+
+``LLVMGXX`` (defaulted)
+ Specifies the path to the LLVM version of the GCC C++ Compiler.
+
+``LLVMLD`` (defaulted)
+ Specifies the path to the LLVM bitcode linker tool.
+
+``LLVM_OBJ_ROOT`` (configured)
+ Specifies the top directory into which the output of the build is placed.
+
+``LLVM_SRC_ROOT`` (configured)
+ Specifies the top directory in which the sources are found.
+
+``LLVM_TARBALL_NAME`` (configured)
+ Specifies the name of the distribution tarball to create. This is configured
+ from the name of the project and its version number.
+
+``MKDIR`` (defaulted)
+ Specifies the path to the ``mkdir`` tool that creates directories.
+
+``ONLY_TOOLS``
+ If set, specifies the list of tools to build.
+
+``PLATFORMSTRIPOPTS``
+ The options to provide to the linker to specify that a stripped (no symbols)
+ executable should be built.
+
+``RANLIB`` (defaulted)
+ Specifies the path to the ``ranlib`` tool.
+
+``RM`` (defaulted)
+ Specifies the path to the ``rm`` tool.
+
+``SED`` (defaulted)
+ Specifies the path to the ``sed`` tool.
+
+``SHLIBEXT`` (configured)
+ Provides the filename extension to use for shared libraries.
+
+``TBLGEN`` (defaulted)
+ Specifies the path to the ``tblgen`` tool.
+
+``TAR`` (defaulted)
+ Specifies the path to the ``tar`` tool.
+
+``ZIP`` (defaulted)
+ Specifies the path to the ``zip`` tool.
+
+Readable Variables
+------------------
+
+Variables listed in the table below can be used by the user's Makefile but
+should not be changed. Changing the value will generally cause the build to go
+wrong, so don't do it.
+
+``bindir``
+ The directory into which executables will ultimately be installed. This
+ value is derived from the ``--prefix`` option given to ``configure``.
+
+``BuildMode``
+ The name of the type of build being performed: Debug, Release, or
+ Profile.
+
+``bytecode_libdir``
+ The directory into which bitcode libraries will ultimately be installed.
+ This value is derived from the ``--prefix`` option given to ``configure``.
+
+``ConfigureScriptFLAGS``
+ Additional flags given to the ``configure`` script when reconfiguring.
+
+``DistDir``
+ The *current* directory for which a distribution copy is being made.
+
+.. _Echo:
+
+``Echo``
+ The LLVM Makefile System output command. This provides the ``llvm[n]``
+ prefix and starts with ``@`` so the command itself is not printed by
+ ``make``.
+
+``EchoCmd``
+ Same as `Echo`_ but without the leading ``@``.
+
+``includedir``
+ The directory into which include files will ultimately be installed. This
+ value is derived from the ``--prefix`` option given to ``configure``.
+
+``libdir``
+ The directory into which native libraries will ultimately be installed.
+ This value is derived from the ``--prefix`` option given to
+ ``configure``.
+
+``LibDir``
+ The configuration specific directory into which libraries are placed before
+ installation.
+
+``MakefileConfig``
+ Full path of the ``Makefile.config`` file.
+
+``MakefileConfigIn``
+ Full path of the ``Makefile.config.in`` file.
+
+``ObjDir``
+ The configuration- and directory-specific directory where build objects
+ (compilation results) are placed.
+
+``SubDirs``
+ The complete list of sub-directories of the current directory as
+ specified by other variables.
+
+``Sources``
+ The complete list of source files.
+
+``sysconfdir``
+ The directory into which configuration files will ultimately be
+ installed. This value is derived from the ``--prefix`` option given to
+ ``configure``.
+
+``ToolDir``
+ The configuration specific directory into which executables are placed
+ before they are installed.
+
+``TopDistDir``
+ The topmost directory into which the distribution files are copied.
+
+``Verb``
+ Use this as the first thing on your build script lines to enable or disable
+ verbose mode. It expands to either an ``@`` (quiet mode) or nothing (verbose
+ mode).
+
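+As a hedged sketch (the rule, target name, and message are illustrative),
+``Echo`` and ``Verb`` are typically used at the start of recipe lines:
+
+.. code-block:: makefile
+
+  docs-dir:
+  	$(Echo) Creating documentation directory
+  	$(Verb) $(MKDIR) $(ObjDir)/docs
+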
+Internal Variables
+------------------
+
+Variables listed below are used by the LLVM Makefile System and considered
+internal. You should not use these variables under any circumstances.
+
+.. code-block:: makefile
+
+ Archive
+ AR.Flags
+ BaseNameSources
+ BCCompile.C
+ BCCompile.CXX
+ BCLinkLib
+ C.Flags
+ Compile.C
+ CompileCommonOpts
+ Compile.CXX
+ ConfigStatusScript
+ ConfigureScript
+ CPP.Flags
+ CXX.Flags
+ DependFiles
+ DestArchiveLib
+ DestBitcodeLib
+ DestModule
+ DestSharedLib
+ DestTool
+ DistAlways
+ DistCheckDir
+ DistCheckTop
+ DistFiles
+ DistName
+ DistOther
+ DistSources
+ DistSubDirs
+ DistTarBZ2
+ DistTarGZip
+ DistZip
+ ExtraLibs
+ FakeSources
+ INCFiles
+ InternalTargets
+ LD.Flags
+ LibName.A
+ LibName.BC
+ LibName.LA
+ LibName.O
+ LibTool.Flags
+ Link
+ LinkModule
+ LLVMLibDir
+ LLVMLibsOptions
+ LLVMLibsPaths
+ LLVMToolDir
+ LLVMUsedLibs
+ LocalTargets
+ Module
+ ObjectsBC
+ ObjectsLO
+ ObjectsO
+ ObjMakefiles
+ ParallelTargets
+ PreConditions
+ ProjLibsOptions
+ ProjLibsPaths
+ ProjUsedLibs
+ Ranlib
+ RecursiveTargets
+ SrcMakefiles
+ Strip
+ StripWarnMsg
+ TableGen
+ TDFiles
+ ToolBuildPath
+ TopLevelTargets
+ UserTargets
diff --git a/docs/Packaging.html b/docs/Packaging.html
deleted file mode 100644
index ac4dcf0..0000000
--- a/docs/Packaging.html
+++ /dev/null
@@ -1,119 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Advice on Packaging LLVM</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>Advice on Packaging LLVM</h1>
-<ol>
- <li><a href="#overview">Overview</a></li>
- <li><a href="#compilation">Compile Flags</a></li>
- <li><a href="#cxx-features">C++ Features</a></li>
- <li><a href="#shared-library">Shared Library</a></li>
- <li><a href="#deps">Dependencies</a></li>
-</ol>
-
-<!--=========================================================================-->
-<h2><a name="overview">Overview</a></h2>
-<!--=========================================================================-->
-<div>
-
-<p>LLVM sets certain default configure options to make sure our developers don't
-break things for constrained platforms. These settings are not optimal for most
-desktop systems, and we hope that packagers (e.g., Redhat, Debian, MacPorts,
-etc.) will tweak them. This document lists settings we suggest you tweak.
-</p>
-
-<p>LLVM's API changes with each release, so users are likely to want, for
-example, both LLVM-2.6 and LLVM-2.7 installed at the same time to support apps
-developed against each.
-</p>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="compilation">Compile Flags</a></h2>
-<!--=========================================================================-->
-<div>
-
-<p>LLVM runs much more quickly when it's optimized and assertions are removed.
-However, such a build is currently incompatible with users who build without
-defining NDEBUG, and the lack of assertions makes it hard to debug problems in
-user code. We recommend allowing users to install both optimized and debug
-versions of LLVM in parallel. The following configure flags are relevant:
-</p>
-
-<dl>
- <dt><tt>--disable-assertions</tt></dt><dd>Builds LLVM with <tt>NDEBUG</tt>
- defined. Changes the LLVM ABI. Also available by setting
- <tt>DISABLE_ASSERTIONS=0|1</tt> in <tt>make</tt>'s environment. This defaults
- to enabled regardless of the optimization setting, but it slows things
- down.</dd>
-
- <dt><tt>--enable-debug-symbols</tt></dt><dd>Builds LLVM with <tt>-g</tt>.
- Also available by setting <tt>DEBUG_SYMBOLS=0|1</tt> in <tt>make</tt>'s
- environment. This defaults to disabled when optimizing, so you should turn it
- back on to let users debug their programs.</dd>
-
- <dt><tt>--enable-optimized</tt></dt><dd>(For svn checkouts) Builds LLVM with
- <tt>-O2</tt> and, by default, turns off debug symbols. Also available by
- setting <tt>ENABLE_OPTIMIZED=0|1</tt> in <tt>make</tt>'s environment. This
- defaults to enabled when not in a checkout.</dd>
-</dl>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="cxx-features">C++ Features</a></h2>
-<!--=========================================================================-->
-<div>
-
-<dl>
- <dt>RTTI</dt><dd>LLVM disables RTTI by default. Add <tt>REQUIRES_RTTI=1</tt>
- to your environment while running <tt>make</tt> to re-enable it. This will
- allow users to build with RTTI enabled and still inherit from LLVM
- classes.</dd>
-</dl>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="shared-library">Shared Library</a></h2>
-<!--=========================================================================-->
-<div>
-
-<p>Configure with <tt>--enable-shared</tt> to build
-<tt>libLLVM-<var>major</var>.<var>minor</var>.(so|dylib)</tt> and link the tools
-against it. This saves lots of binary size at the cost of some startup time.
-</p>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="deps">Dependencies</a></h2>
-<!--=========================================================================-->
-<div>
-
-<dl>
-<dt><tt>--enable-libffi</tt></dt><dd>Depend on <a
-href="http://sources.redhat.com/libffi/">libffi</a> to allow the LLVM
-interpreter to call external functions.</dd>
-<dt><tt>--with-oprofile</tt></dt><dd>Depend on <a
-href="http://oprofile.sourceforge.net/doc/devel/index.html">libopagent</a>
-(>=version 0.9.4) to let the LLVM JIT tell oprofile about function addresses and
-line numbers.</dd>
-</dl>
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-31 12:21:59 +0100 (Mon, 31 Oct 2011) $
-</address>
-</body>
-</html>
diff --git a/docs/Packaging.rst b/docs/Packaging.rst
new file mode 100644
index 0000000..6e74158
--- /dev/null
+++ b/docs/Packaging.rst
@@ -0,0 +1,75 @@
+.. _packaging:
+
+========================
+Advice on Packaging LLVM
+========================
+
+.. contents::
+ :local:
+
+Overview
+========
+
+LLVM sets certain default configure options to make sure our developers don't
+break things for constrained platforms. These settings are not optimal for most
+desktop systems, and we hope that packagers (e.g., Redhat, Debian, MacPorts,
+etc.) will tweak them. This document lists settings we suggest you tweak.
+
+LLVM's API changes with each release, so users are likely to want, for example,
+both LLVM-2.6 and LLVM-2.7 installed at the same time to support apps developed
+against each.
+
+Compile Flags
+=============
+
+LLVM runs much more quickly when it's optimized and assertions are removed.
+However, such a build is currently incompatible with users who build without
+defining ``NDEBUG``, and the lack of assertions makes it hard to debug problems
+in user code. We recommend allowing users to install both optimized and debug
+versions of LLVM in parallel. The following configure flags are relevant:
+
+``--disable-assertions``
+ Builds LLVM with ``NDEBUG`` defined. Changes the LLVM ABI. Also available
+ by setting ``DISABLE_ASSERTIONS=0|1`` in ``make``'s environment. This
+ defaults to enabled regardless of the optimization setting, but it slows
+ things down.
+
+``--enable-debug-symbols``
+ Builds LLVM with ``-g``. Also available by setting ``DEBUG_SYMBOLS=0|1`` in
+ ``make``'s environment. This defaults to disabled when optimizing, so you
+ should turn it back on to let users debug their programs.
+
+``--enable-optimized``
+ (For svn checkouts) Builds LLVM with ``-O2`` and, by default, turns off
+ debug symbols. Also available by setting ``ENABLE_OPTIMIZED=0|1`` in
+ ``make``'s environment. This defaults to enabled when not in a
+ checkout.
+
+C++ Features
+============
+
+RTTI
+ LLVM disables RTTI by default. Add ``REQUIRES_RTTI=1`` to your environment
+ while running ``make`` to re-enable it. This will allow users to build with
+ RTTI enabled and still inherit from LLVM classes.
+
+Shared Library
+==============
+
+Configure with ``--enable-shared`` to build
+``libLLVM-<major>.<minor>.(so|dylib)`` and link the tools against it. This
+saves lots of binary size at the cost of some startup time.
+
+Dependencies
+============
+
+``--enable-libffi``
+ Depend on `libffi <http://sources.redhat.com/libffi/>`_ to allow the LLVM
+ interpreter to call external functions.
+
+``--with-oprofile``
+ Depend on `libopagent
+ <http://oprofile.sourceforge.net/doc/devel/index.html>`_ (>=version 0.9.4)
+ to let the LLVM JIT tell oprofile about function addresses and line
+ numbers.
diff --git a/docs/Passes.html b/docs/Passes.html
index 37a304d..e8048d5 100644
--- a/docs/Passes.html
+++ b/docs/Passes.html
@@ -3,7 +3,7 @@
<html>
<head>
<title>LLVM's Analysis and Transform Passes</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
@@ -100,7 +100,6 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
<tr><td><a href="#module-debuginfo">-module-debuginfo</a></td><td>Decodes module-level debug info</td></tr>
<tr><td><a href="#no-aa">-no-aa</a></td><td>No Alias Analysis (always returns 'may' alias)</td></tr>
<tr><td><a href="#no-profile">-no-profile</a></td><td>No Profile Information</td></tr>
-<tr><td><a href="#postdomfrontier">-postdomfrontier</a></td><td>Post-Dominance Frontier Construction</td></tr>
<tr><td><a href="#postdomtree">-postdomtree</a></td><td>Post-Dominator Tree Construction</td></tr>
<tr><td><a href="#print-alias-sets">-print-alias-sets</a></td><td>Alias Set Printer</td></tr>
<tr><td><a href="#print-callgraph">-print-callgraph</a></td><td>Print a call graph</td></tr>
@@ -755,7 +754,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
</h3>
<div>
<p>Provides other passes access to information on how the size and alignment
- required by the the target ABI for various data types.</p>
+ required by the target ABI for various data types.</p>
</div>
</div>
@@ -1617,7 +1616,7 @@ if (X &lt; 3) {</pre>
</h3>
<div>
<p>
- This file demotes all registers to memory references. It is intented to be
+ This file demotes all registers to memory references. It is intended to be
the inverse of <a href="#mem2reg"><tt>-mem2reg</tt></a>. By converting to
<tt>load</tt> instructions, the only values live across basic blocks are
<tt>alloca</tt> instructions and <tt>load</tt> instructions before
@@ -1971,7 +1970,7 @@ if (X &lt; 3) {</pre>
<li>Verify that a function's argument list agrees with its declared
type.</li>
<li>It is illegal to specify a name for a void value.</li>
- <li>It is illegal to have a internal global value with no initializer.</li>
+ <li>It is illegal to have an internal global value with no initializer.</li>
<li>It is illegal to have a ret instruction that returns a value that does
not agree with the function return value type.</li>
<li>Function call argument types match the function prototype.</li>
@@ -2060,7 +2059,7 @@ if (X &lt; 3) {</pre>
<a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-02-01 04:51:43 +0100 (Wed, 01 Feb 2012) $
+ Last modified: $Date: 2012-07-26 00:01:31 +0200 (Thu, 26 Jul 2012) $
</address>
</body>
diff --git a/docs/ProgrammersManual.html b/docs/ProgrammersManual.html
index 625ef9a..5bf499b 100644
--- a/docs/ProgrammersManual.html
+++ b/docs/ProgrammersManual.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-type" content="text/html;charset=UTF-8">
<title>LLVM Programmer's Manual</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -507,8 +507,9 @@ small and pervasive enough in LLVM that it should always be passed by value.</p>
<div>
-<p>The <tt>Twine</tt> class is an efficient way for APIs to accept concatenated
-strings. For example, a common LLVM paradigm is to name one instruction based on
+<p>The <tt><a href="/doxygen/classllvm_1_1Twine.html">Twine</a></tt> class is an
+efficient way for APIs to accept concatenated strings. For example, a common
+LLVM paradigm is to name one instruction based on
the name of another instruction with a suffix, for example:</p>
<div class="doc_code">
@@ -517,17 +518,17 @@ the name of another instruction with a suffix, for example:</p>
</pre>
</div>
-<p>The <tt>Twine</tt> class is effectively a
-lightweight <a href="http://en.wikipedia.org/wiki/Rope_(computer_science)">rope</a>
+<p>The <tt>Twine</tt> class is effectively a lightweight
+<a href="http://en.wikipedia.org/wiki/Rope_(computer_science)">rope</a>
which points to temporary (stack allocated) objects. Twines can be implicitly
constructed as the result of the plus operator applied to strings (i.e., a C
-strings, an <tt>std::string</tt>, or a <tt>StringRef</tt>). The twine delays the
-actual concatenation of strings until it is actually required, at which point
-it can be efficiently rendered directly into a character array. This avoids
-unnecessary heap allocation involved in constructing the temporary results of
-string concatenation. See
-"<tt><a href="/doxygen/classllvm_1_1Twine_8h-source.html">llvm/ADT/Twine.h</a></tt>"
-for more information.</p>
+strings, an <tt>std::string</tt>, or a <tt>StringRef</tt>). The twine delays
+the actual concatenation of strings until it is actually required, at which
+point it can be efficiently rendered directly into a character array. This
+avoids unnecessary heap allocation involved in constructing the temporary
+results of string concatenation. See
+"<tt><a href="/doxygen/Twine_8h_source.html">llvm/ADT/Twine.h</a></tt>"
+and <a href="#dss_twine">here</a> for more information.</p>
<p>As with a <tt>StringRef</tt>, <tt>Twine</tt> objects point to external memory
and should almost never be stored or mentioned directly. They are intended
@@ -3374,8 +3375,9 @@ provide a name for it (probably based on the name of the translation unit).</p>
<hr>
<ul>
- <li><tt><a href="#Function">Function</a> *getFunction(const std::string
- &amp;Name, const <a href="#FunctionType">FunctionType</a> *Ty)</tt>
+
+ <li><tt><a href="#Function">Function</a> *getFunction(StringRef Name) const
+ </tt>
<p>Look up the specified function in the <tt>Module</tt> <a
href="#SymbolTable"><tt>SymbolTable</tt></a>. If it does not exist, return
@@ -3863,7 +3865,7 @@ is its address (after linking) which is guaranteed to be constant.</p>
*Ty, LinkageTypes Linkage, const std::string &amp;N = "", Module* Parent = 0)</tt>
<p>Constructor used when you need to create new <tt>Function</tt>s to add
- the the program. The constructor must specify the type of the function to
+ the program. The constructor must specify the type of the function to
create and what type of linkage the function should have. The <a
href="#FunctionType"><tt>FunctionType</tt></a> argument
specifies the formal arguments and return value for the function. The same
@@ -4128,7 +4130,7 @@ arguments. An argument has a pointer to the parent Function.</p>
<a href="mailto:dhurjati@cs.uiuc.edu">Dinakar Dhurjati</a> and
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-04-18 22:28:55 +0200 (Wed, 18 Apr 2012) $
+ Last modified: $Date: 2012-07-25 15:46:11 +0200 (Wed, 25 Jul 2012) $
</address>
</body>
diff --git a/docs/Projects.html b/docs/Projects.html
deleted file mode 100644
index ebd7203..0000000
--- a/docs/Projects.html
+++ /dev/null
@@ -1,489 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Creating an LLVM Project</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>Creating an LLVM Project</h1>
-
-<ol>
-<li><a href="#overview">Overview</a></li>
-<li><a href="#create">Create a project from the Sample Project</a></li>
-<li><a href="#source">Source tree layout</a></li>
-<li><a href="#makefiles">Writing LLVM-style Makefiles</a>
- <ol>
- <li><a href="#reqVars">Required Variables</a></li>
- <li><a href="#varsBuildDir">Variables for Building Subdirectories</a></li>
- <li><a href="#varsBuildLib">Variables for Building Libraries</a></li>
- <li><a href="#varsBuildProg">Variables for Building Programs</a></li>
- <li><a href="#miscVars">Miscellaneous Variables</a></li>
- </ol></li>
-<li><a href="#objcode">Placement of object code</a></li>
-<li><a href="#help">Further help</a></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by John Criswell</p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="overview">Overview</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The LLVM build system is designed to facilitate the building of third party
-projects that use LLVM header files, libraries, and tools. In order to use
-these facilities, a Makefile from a project must do the following things:</p>
-
-<ol>
- <li>Set <tt>make</tt> variables. There are several variables that a Makefile
- needs to set to use the LLVM build system:
- <ul>
- <li><tt>PROJECT_NAME</tt> - The name by which your project is known.</li>
- <li><tt>LLVM_SRC_ROOT</tt> - The root of the LLVM source tree.</li>
- <li><tt>LLVM_OBJ_ROOT</tt> - The root of the LLVM object tree.</li>
- <li><tt>PROJ_SRC_ROOT</tt> - The root of the project's source tree.</li>
- <li><tt>PROJ_OBJ_ROOT</tt> - The root of the project's object tree.</li>
- <li><tt>PROJ_INSTALL_ROOT</tt> - The root installation directory.</li>
- <li><tt>LEVEL</tt> - The relative path from the current directory to the
- project's root ($PROJ_OBJ_ROOT).</li>
- </ul></li>
- <li>Include <tt>Makefile.config</tt> from <tt>$(LLVM_OBJ_ROOT)</tt>.</li>
- <li>Include <tt>Makefile.rules</tt> from <tt>$(LLVM_SRC_ROOT)</tt>.</li>
-</ol>
-
-<p>There are two ways that you can set all of these variables:</p>
-<ol>
- <li>You can write your own Makefiles which hard-code these values.</li>
- <li>You can use the pre-made LLVM sample project. This sample project
- includes Makefiles, a configure script that can be used to configure the
- location of LLVM, and the ability to support multiple object directories
- from a single source directory.</li>
-</ol>
-
-<p>This document assumes that you will base your project on the LLVM sample
-project found in <tt>llvm/projects/sample</tt>. If you want to devise your own
-build system, studying the sample project and LLVM Makefiles will probably
-provide enough information on how to write your own Makefiles.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="create">Create a Project from the Sample Project</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Follow these simple steps to start your project:</p>
-
-<ol>
-<li>Copy the <tt>llvm/projects/sample</tt> directory to any place of your
-choosing. You can place it anywhere you like. Rename the directory to match
-the name of your project.</li>
-
-<li>
-If you downloaded LLVM using Subversion, remove all the directories named .svn
-(and all the files therein) from your project's new source tree. This will
-keep Subversion from thinking that your project is inside
-<tt>llvm/trunk/projects/sample</tt>.</li>
-
-<li>Add your source code and Makefiles to your source tree.</li>
-
-<li>If you want your project to be configured with the <tt>configure</tt> script
-then you need to edit <tt>autoconf/configure.ac</tt> as follows:
- <ul>
- <li><b>AC_INIT</b>. Place the name of your project, its version number and
- a contact email address for your project as the arguments to this macro</li>
- <li><b>AC_CONFIG_AUX_DIR</b>. If your project isn't in the
- <tt>llvm/projects</tt> directory then you might need to adjust this so that
- it specifies a relative path to the <tt>llvm/autoconf</tt> directory.</li>
- <li><b>LLVM_CONFIG_PROJECT</b>. Just leave this alone.</li>
- <li><b>AC_CONFIG_SRCDIR</b>. Specify a path to a file name that identifies
- your project; or just leave it at <tt>Makefile.common.in</tt></li>
- <li><b>AC_CONFIG_FILES</b>. Do not change.</li>
- <li><b>AC_CONFIG_MAKEFILE</b>. Use one of these macros for each Makefile
- that your project uses. This macro arranges for your makefiles to be copied
- from the source directory, unmodified, to the build directory.</li>
- </ul>
-</li>
-
-<li>After updating <tt>autoconf/configure.ac</tt>, regenerate the
-configure script with these commands:
-
-<div class="doc_code">
-<p><tt>% cd autoconf<br>
- % ./AutoRegen.sh</tt></p>
-</div>
-
-<p>You must be using Autoconf version 2.59 or later and your aclocal version
-should be 1.9 or later.</p></li>
-
-<li>Run <tt>configure</tt> in the directory in which you want to place
-object code. Use the following options to tell your project where it
-can find LLVM:
-
- <dl>
- <dt><tt>--with-llvmsrc=&lt;directory&gt;</tt></dt>
- <dd>Tell your project where the LLVM source tree is located.</dd>
- <dt><br><tt>--with-llvmobj=&lt;directory&gt;</tt></dt>
- <dd>Tell your project where the LLVM object tree is located.</dd>
- <dt><br><tt>--prefix=&lt;directory&gt;</tt></dt>
- <dd>Tell your project where it should get installed.</dd>
- </dl>
-</ol>
-
-<p>That's it! Now all you have to do is type <tt>gmake</tt> (or <tt>make</tt>
-if your on a GNU/Linux system) in the root of your object directory, and your
-project should build.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="source">Source Tree Layout</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>In order to use the LLVM build system, you will want to organize your
-source code so that it can benefit from the build system's features.
-Mainly, you want your source tree layout to look similar to the LLVM
-source tree layout. The best way to do this is to just copy the
-project tree from <tt>llvm/projects/sample</tt> and modify it to meet
-your needs, but you can certainly add to it if you want.</p>
-
-<p>Underneath your top level directory, you should have the following
-directories:</p>
-
-<dl>
- <dt><b>lib</b>
- <dd>
- This subdirectory should contain all of your library source
- code. For each library that you build, you will have one
- directory in <b>lib</b> that will contain that library's source
- code.
-
- <p>
- Libraries can be object files, archives, or dynamic libraries.
- The <b>lib</b> directory is just a convenient place for libraries
- as it places them all in a directory from which they can be linked
- later.
-
- <dt><b>include</b>
- <dd>
- This subdirectory should contain any header files that are
- global to your project. By global, we mean that they are used
- by more than one library or executable of your project.
- <p>
- By placing your header files in <b>include</b>, they will be
- found automatically by the LLVM build system. For example, if
- you have a file <b>include/jazz/note.h</b>, then your source
- files can include it simply with <b>#include "jazz/note.h"</b>.
-
- <dt><b>tools</b>
- <dd>
- This subdirectory should contain all of your source
- code for executables. For each program that you build, you
- will have one directory in <b>tools</b> that will contain that
- program's source code.
- <p>
-
- <dt><b>test</b>
- <dd>
- This subdirectory should contain tests that verify that your code
- works correctly. Automated tests are especially useful.
- <p>
- Currently, the LLVM build system provides basic support for tests.
- The LLVM system provides the following:
- <ul>
- <li>
- LLVM provides a tcl procedure that is used by Dejagnu to run
- tests. It can be found in <tt>llvm/lib/llvm-dg.exp</tt>. This
- test procedure uses RUN lines in the actual test case to determine
- how to run the test. See the <a
- href="TestingGuide.html">TestingGuide</a> for more details. You
- can easily write Makefile support similar to the Makefiles in
- <tt>llvm/test</tt> to use Dejagnu to run your project's tests.<br></li>
- <li>
- LLVM contains an optional package called <tt>llvm-test</tt>
- which provides benchmarks and programs that are known to compile with the
- LLVM GCC front ends. You can use these
- programs to test your code, gather statistics information, and
- compare it to the current LLVM performance statistics.
- <br>Currently, there is no way to hook your tests directly into the
- <tt>llvm/test</tt> testing harness. You will simply
- need to find a way to use the source provided within that directory
- on your own.
- </ul>
-</dl>
-
-<p>Typically, you will want to build your <b>lib</b> directory first followed by
-your <b>tools</b> directory.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="makefiles">Writing LLVM Style Makefiles</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The LLVM build system provides a convenient way to build libraries and
-executables. Most of your project Makefiles will only need to define a few
-variables. Below is a list of the variables one can set and what they can
-do:</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="reqVars">Required Variables</a>
-</h3>
-
-<div>
-
-<dl>
- <dt>LEVEL
- <dd>
- This variable is the relative path from this Makefile to the
- top directory of your project's source code. For example, if
- your source code is in <tt>/tmp/src</tt>, then the Makefile in
- <tt>/tmp/src/jump/high</tt> would set <tt>LEVEL</tt> to <tt>"../.."</tt>.
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="varsBuildDir">Variables for Building Subdirectories</a>
-</h3>
-
-<div>
-
-<dl>
- <dt>DIRS
- <dd>
- This is a space separated list of subdirectories that should be
- built. They will be built, one at a time, in the order
- specified.
- <p>
-
- <dt>PARALLEL_DIRS
- <dd>
- This is a list of directories that can be built in parallel.
- These will be built after the directories in DIRS have been
- built.
- <p>
-
- <dt>OPTIONAL_DIRS
- <dd>
- This is a list of directories that can be built if they exist,
- but will not cause an error if they do not exist. They are
- built serially in the order in which they are listed.
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="varsBuildLib">Variables for Building Libraries</a>
-</h3>
-
-<div>
-
-<dl>
- <dt>LIBRARYNAME
- <dd>
- This variable contains the base name of the library that will
- be built. For example, to build a library named
- <tt>libsample.a</tt>, LIBRARYNAME should be set to
- <tt>sample</tt>.
- <p>
-
- <dt>BUILD_ARCHIVE
- <dd>
- By default, a library is a <tt>.o</tt> file that is linked
- directly into a program. To build an archive (also known as
- a static library), set the BUILD_ARCHIVE variable.
- <p>
-
- <dt>SHARED_LIBRARY
- <dd>
- If SHARED_LIBRARY is defined in your Makefile, a shared
- (or dynamic) library will be built.
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="varsBuildProg">Variables for Building Programs</a>
-</h3>
-
-<div>
-
-<dl>
- <dt>TOOLNAME
- <dd>
- This variable contains the name of the program that will
- be built. For example, to build an executable named
- <tt>sample</tt>, TOOLNAME should be set to <tt>sample</tt>.
- <p>
-
- <dt>USEDLIBS
- <dd>
- This variable holds a space separated list of libraries that should
- be linked into the program. These libraries must be libraries that
- come from your <b>lib</b> directory. The libraries must be
- specified without their "lib" prefix. For example, to link
- libsample.a, you would set USEDLIBS to
- <tt>sample.a</tt>.
- <p>
- Note that this works only for statically linked libraries.
- <p>
-
- <dt>LLVMLIBS
- <dd>
- This variable holds a space separated list of libraries that should
- be linked into the program. These libraries must be LLVM libraries.
- The libraries must be specified without their "lib" prefix. For
- example, to link with a driver that performs an IR transformation
- you might set LLVMLIBS to this minimal set of libraries
- <tt>LLVMSupport.a LLVMCore.a LLVMBitReader.a LLVMAsmParser.a LLVMAnalysis.a LLVMTransformUtils.a LLVMScalarOpts.a LLVMTarget.a</tt>.
- <p>
- Note that this works only for statically linked libraries. LLVM is
- split into a large number of static libraries, and the list of libraries you
- require may be much longer than the list above. To see a full list
- of libraries use:
- <tt>llvm-config --libs all</tt>.
- Using LINK_COMPONENTS as described below, obviates the need to set LLVMLIBS.
- <p>
-
- <dt>LINK_COMPONENTS
- <dd>This variable holds a space separated list of components that
- the LLVM Makefiles pass to the <tt>llvm-config</tt> tool to generate
- a link line for the program. For example, to link with all LLVM
- libraries use
- <tt>LINK_COMPONENTS = all</tt>.
- <p>
-
- <dt>LIBS
- <dd>
- To link dynamic libraries, add <tt>-l&lt;library base name&gt;</tt> to
- the LIBS variable. The LLVM build system will look in the same places
- for dynamic libraries as it does for static libraries.
- <p>
- For example, to link <tt>libsample.so</tt>, you would have the
- following line in your <tt>Makefile</tt>:
- <p>
- <tt>
- LIBS += -lsample
- </tt>
- <p>
- Note that LIBS must occur in the Makefile after the inclusion of Makefile.common.
- <p>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="miscVars">Miscellaneous Variables</a>
-</h3>
-
-<div>
-
-<dl>
- <dt>ExtraSource
- <dd>
- This variable contains a space separated list of extra source
- files that need to be built. It is useful for including the
- output of Lex and Yacc programs.
- <p>
-
- <dt>CFLAGS
- <dt>CPPFLAGS
- <dd>
- This variable can be used to add options to the C and C++
- compiler, respectively. It is typically used to add options
- that tell the compiler the location of additional directories
- to search for header files.
- <p>
- It is highly suggested that you append to CFLAGS and CPPFLAGS as
- opposed to overwriting them. The master Makefiles may already
- have useful options in them that you may not want to overwrite.
- <p>
-</dl>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="objcode">Placement of Object Code</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The final location of built libraries and executables will depend upon
-whether you do a Debug, Release, or Profile build.</p>
-
-<dl>
- <dt>Libraries
- <dd>
- All libraries (static and dynamic) will be stored in
- <tt>PROJ_OBJ_ROOT/&lt;type&gt;/lib</tt>, where type is <tt>Debug</tt>,
- <tt>Release</tt>, or <tt>Profile</tt> for a debug, optimized, or
- profiled build, respectively.<p>
-
- <dt>Executables
- <dd>All executables will be stored in
- <tt>PROJ_OBJ_ROOT/&lt;type&gt;/bin</tt>, where type is <tt>Debug</tt>,
- <tt>Release</tt>, or <tt>Profile</tt> for a debug, optimized, or profiled
- build, respectively.
-</dl>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="help">Further Help</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>If you have any questions or need any help creating an LLVM project,
-the LLVM team would be more than happy to help. You can always post your
-questions to the <a
-href="http://mail.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVM Developers
-Mailing List</a>.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:criswell@uiuc.edu">John Criswell</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
- <br>
- Last modified: $Date: 2011-10-31 12:21:59 +0100 (Mon, 31 Oct 2011) $
-</address>
-
-</body>
-</html>
diff --git a/docs/Projects.rst b/docs/Projects.rst
new file mode 100644
index 0000000..6313288
--- /dev/null
+++ b/docs/Projects.rst
@@ -0,0 +1,327 @@
+.. _projects:
+
+========================
+Creating an LLVM Project
+========================
+
+.. contents::
+ :local:
+
+Overview
+========
+
+The LLVM build system is designed to facilitate the building of third party
+projects that use LLVM header files, libraries, and tools. In order to use
+these facilities, a ``Makefile`` from a project must do the following things:
+
+* Set ``make`` variables. There are several variables that a ``Makefile`` needs
+ to set to use the LLVM build system:
+
+ * ``PROJECT_NAME`` - The name by which your project is known.
+ * ``LLVM_SRC_ROOT`` - The root of the LLVM source tree.
+ * ``LLVM_OBJ_ROOT`` - The root of the LLVM object tree.
+ * ``PROJ_SRC_ROOT`` - The root of the project's source tree.
+ * ``PROJ_OBJ_ROOT`` - The root of the project's object tree.
+ * ``PROJ_INSTALL_ROOT`` - The root installation directory.
+  * ``LEVEL`` - The relative path from the current directory to the
+    project's root ``$(PROJ_OBJ_ROOT)``.
+
+* Include ``Makefile.config`` from ``$(LLVM_OBJ_ROOT)``.
+
+* Include ``Makefile.rules`` from ``$(LLVM_SRC_ROOT)``.
+
+There are two ways that you can set all of these variables:
+
+* You can write your own ``Makefiles`` which hard-code these values.
+
+* You can use the pre-made LLVM sample project. This sample project includes
+ ``Makefiles``, a configure script that can be used to configure the location
+ of LLVM, and the ability to support multiple object directories from a single
+ source directory.
+
+This document assumes that you will base your project on the LLVM sample project
+found in ``llvm/projects/sample``. If you want to devise your own build system,
+studying the sample project and LLVM ``Makefiles`` will probably provide enough
+information on how to write your own ``Makefiles``.
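+
+If you do write your own ``Makefiles``, the following is a minimal sketch of
+the required boilerplate (the project name and paths are hypothetical
+placeholders):
+
+.. code-block:: makefile
+
+    # Hard-coded values; the sample project's configure script would
+    # normally fill these in.
+    PROJECT_NAME      := myproject
+    LLVM_SRC_ROOT     := /proj/llvm
+    LLVM_OBJ_ROOT     := /proj/llvm-build
+    PROJ_SRC_ROOT     := /proj/myproject
+    PROJ_OBJ_ROOT     := /proj/myproject-build
+    PROJ_INSTALL_ROOT := /usr/local
+    LEVEL             := .
+
+    # Pull in the LLVM build system.
+    include $(LLVM_OBJ_ROOT)/Makefile.config
+    include $(LLVM_SRC_ROOT)/Makefile.rules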
+
+Create a Project from the Sample Project
+========================================
+
+Follow these simple steps to start your project:
+
+1. Copy the ``llvm/projects/sample`` directory to any place of your choosing.
+   Rename the directory to match the name of your project.
+
+2. If you downloaded LLVM using Subversion, remove all the directories named
+ ``.svn`` (and all the files therein) from your project's new source tree.
+ This will keep Subversion from thinking that your project is inside
+ ``llvm/trunk/projects/sample``.
+
+3. Add your source code and Makefiles to your source tree.
+
+4. If you want your project to be configured with the ``configure`` script then
+ you need to edit ``autoconf/configure.ac`` as follows:
+
+   * **AC_INIT** - Place the name of your project, its version number and a
+     contact email address for your project as the arguments to this macro.
+
+ * **AC_CONFIG_AUX_DIR** - If your project isn't in the ``llvm/projects``
+ directory then you might need to adjust this so that it specifies a
+ relative path to the ``llvm/autoconf`` directory.
+
+ * **LLVM_CONFIG_PROJECT** - Just leave this alone.
+
+   * **AC_CONFIG_SRCDIR** - Specify the path to a file that identifies your
+     project; or just leave it at ``Makefile.common.in``.
+
+ * **AC_CONFIG_FILES** - Do not change.
+
+ * **AC_CONFIG_MAKEFILE** - Use one of these macros for each Makefile that
+ your project uses. This macro arranges for your makefiles to be copied from
+ the source directory, unmodified, to the build directory.
+
+5. After updating ``autoconf/configure.ac``, regenerate the configure script
+ with these commands. (You must be using ``Autoconf`` version 2.59 or later
+ and your ``aclocal`` version should be 1.9 or later.)
+
+ .. code-block:: bash
+
+ % cd autoconf
+ % ./AutoRegen.sh
+
+6. Run ``configure`` in the directory in which you want to place object code.
+ Use the following options to tell your project where it can find LLVM:
+
+ ``--with-llvmsrc=<directory>``
+ Tell your project where the LLVM source tree is located.
+
+ ``--with-llvmobj=<directory>``
+ Tell your project where the LLVM object tree is located.
+
+ ``--prefix=<directory>``
+ Tell your project where it should get installed.
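+
+   For example, assuming hypothetical source and build locations, the
+   invocation might look like:
+
+   .. code-block:: bash
+
+     % cd /proj/myproject-build
+     % /proj/myproject/configure --with-llvmsrc=/proj/llvm \
+                                 --with-llvmobj=/proj/llvm-build \
+                                 --prefix=/usr/local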
+
+That's it! Now all you have to do is type ``gmake`` (or ``make`` if you're on a
+GNU/Linux system) in the root of your object directory, and your project should
+build.
+
+Source Tree Layout
+==================
+
+In order to use the LLVM build system, you will want to organize your source
+code so that it can benefit from the build system's features. Mainly, you want
+your source tree layout to look similar to the LLVM source tree layout. The
+best way to do this is to just copy the project tree from
+``llvm/projects/sample`` and modify it to meet your needs, but you can certainly
+add to it if you want.
+
+Underneath your top level directory, you should have the following directories:
+
+**lib**
+
+ This subdirectory should contain all of your library source code. For each
+ library that you build, you will have one directory in **lib** that will
+ contain that library's source code.
+
+ Libraries can be object files, archives, or dynamic libraries. The **lib**
+ directory is just a convenient place for libraries as it places them all in
+ a directory from which they can be linked later.
+
+**include**
+
+ This subdirectory should contain any header files that are global to your
+ project. By global, we mean that they are used by more than one library or
+ executable of your project.
+
+ By placing your header files in **include**, they will be found
+ automatically by the LLVM build system. For example, if you have a file
+ **include/jazz/note.h**, then your source files can include it simply with
+ **#include "jazz/note.h"**.
+
+**tools**
+
+ This subdirectory should contain all of your source code for executables.
+ For each program that you build, you will have one directory in **tools**
+ that will contain that program's source code.
+
+**test**
+
+ This subdirectory should contain tests that verify that your code works
+ correctly. Automated tests are especially useful.
+
+ Currently, the LLVM build system provides basic support for tests. The LLVM
+ system provides the following:
+
+* LLVM provides a ``tcl`` procedure that is used by ``Dejagnu`` to run tests.
+ It can be found in ``llvm/lib/llvm-dg.exp``. This test procedure uses ``RUN``
+ lines in the actual test case to determine how to run the test. See the
+ `TestingGuide <TestingGuide.html>`_ for more details. You can easily write
+ Makefile support similar to the Makefiles in ``llvm/test`` to use ``Dejagnu``
+ to run your project's tests.
+
+* LLVM contains an optional package called ``llvm-test``, which provides
+ benchmarks and programs that are known to compile with the Clang front
+ end. You can use these programs to test your code, gather statistical
+ information, and compare it to the current LLVM performance statistics.
+
+ Currently, there is no way to hook your tests directly into the ``llvm/test``
+ testing harness. You will simply need to find a way to use the source
+ provided within that directory on your own.
+
+Typically, you will want to build your **lib** directory first followed by your
+**tools** directory.
+
+Writing LLVM Style Makefiles
+============================
+
+The LLVM build system provides a convenient way to build libraries and
+executables. Most of your project Makefiles will only need to define a few
+variables. Below is a list of the variables one can set and what they can
+do:
+
+Required Variables
+------------------
+
+``LEVEL``
+
+ This variable is the relative path from this ``Makefile`` to the top
+ directory of your project's source code. For example, if your source code
+ is in ``/tmp/src``, then the ``Makefile`` in ``/tmp/src/jump/high``
+ would set ``LEVEL`` to ``"../.."``.
+
+Variables for Building Subdirectories
+-------------------------------------
+
+``DIRS``
+
+ This is a space separated list of subdirectories that should be built. They
+ will be built, one at a time, in the order specified.
+
+``PARALLEL_DIRS``
+
+  This is a space separated list of directories that can be built in parallel.
+  These will be built after the directories in ``DIRS`` have been built (see
+  the example below).
+
+``OPTIONAL_DIRS``
+
+ This is a list of directories that can be built if they exist, but will not
+ cause an error if they do not exist. They are built serially in the order
+ in which they are listed.
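+
+For example, a project that must build its libraries before its tools, where
+the tools can build concurrently, might use the following (the directory
+names are illustrative):
+
+.. code-block:: makefile
+
+    DIRS = lib
+    PARALLEL_DIRS = tools/analyzer tools/printer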
+
+Variables for Building Libraries
+--------------------------------
+
+``LIBRARYNAME``
+
+ This variable contains the base name of the library that will be built. For
+ example, to build a library named ``libsample.a``, ``LIBRARYNAME`` should
+ be set to ``sample``.
+
+``BUILD_ARCHIVE``
+
+ By default, a library is a ``.o`` file that is linked directly into a
+ program. To build an archive (also known as a static library), set the
+ ``BUILD_ARCHIVE`` variable.
+
+``SHARED_LIBRARY``
+
+ If ``SHARED_LIBRARY`` is defined in your Makefile, a shared (or dynamic)
+ library will be built.
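+
+Putting these together, a minimal sketch of a ``Makefile`` that builds a
+static library from the sources in its directory might look like:
+
+.. code-block:: makefile
+
+    LEVEL = ../..
+    LIBRARYNAME = sample
+    BUILD_ARCHIVE = 1
+    include $(LEVEL)/Makefile.common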
+
+Variables for Building Programs
+-------------------------------
+
+``TOOLNAME``
+
+ This variable contains the name of the program that will be built. For
+ example, to build an executable named ``sample``, ``TOOLNAME`` should be set
+ to ``sample``.
+
+``USEDLIBS``
+
+ This variable holds a space separated list of libraries that should be
+ linked into the program. These libraries must be libraries that come from
+ your **lib** directory. The libraries must be specified without their
+ ``lib`` prefix. For example, to link ``libsample.a``, you would set
+ ``USEDLIBS`` to ``sample.a``.
+
+ Note that this works only for statically linked libraries.
+
+``LLVMLIBS``
+
+ This variable holds a space separated list of libraries that should be
+ linked into the program. These libraries must be LLVM libraries. The
+ libraries must be specified without their ``lib`` prefix. For example, to
+  link with a driver that performs an IR transformation, you might set
+  ``LLVMLIBS`` to this minimal set of libraries: ``LLVMSupport.a LLVMCore.a
+ LLVMBitReader.a LLVMAsmParser.a LLVMAnalysis.a LLVMTransformUtils.a
+ LLVMScalarOpts.a LLVMTarget.a``.
+
+ Note that this works only for statically linked libraries. LLVM is split
+ into a large number of static libraries, and the list of libraries you
+ require may be much longer than the list above. To see a full list of
+  libraries, use ``llvm-config --libs all``. Using ``LINK_COMPONENTS``, as
+  described below, obviates the need to set ``LLVMLIBS``.
+
+``LINK_COMPONENTS``
+
+ This variable holds a space separated list of components that the LLVM
+ ``Makefiles`` pass to the ``llvm-config`` tool to generate a link line for
+ the program. For example, to link with all LLVM libraries use
+ ``LINK_COMPONENTS = all``.
+
+``LIBS``
+
+ To link dynamic libraries, add ``-l<library base name>`` to the ``LIBS``
+ variable. The LLVM build system will look in the same places for dynamic
+ libraries as it does for static libraries.
+
+ For example, to link ``libsample.so``, you would have the following line in
+ your ``Makefile``:
+
+ .. code-block:: makefile
+
+ LIBS += -lsample
+
+Note that ``LIBS`` must occur in the Makefile after the inclusion of
+``Makefile.common``.
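+
+Putting these together, a sketch of a ``Makefile`` for a tool that links one
+project library and a couple of LLVM components might look like (the names
+are illustrative):
+
+.. code-block:: makefile
+
+    LEVEL = ../..
+    TOOLNAME = sample
+    USEDLIBS = sample.a
+    LINK_COMPONENTS = core support
+    include $(LEVEL)/Makefile.common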
+
+Miscellaneous Variables
+-----------------------
+
+``CFLAGS`` & ``CPPFLAGS``
+
+  These variables can be used to add options to the C and C++ compilers,
+  respectively. They are typically used to add options that tell the
+  compilers the location of additional directories to search for header
+  files.
+
+  It is strongly recommended that you append to ``CFLAGS`` and ``CPPFLAGS``
+  rather than overwriting them. The master ``Makefiles`` may already have
+  useful options in them that you may not want to overwrite.
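+
+  For example, to add a project-specific include directory (the path is
+  illustrative), append rather than assign:
+
+  .. code-block:: makefile
+
+      CPPFLAGS += -I$(PROJ_SRC_ROOT)/include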
+
+Placement of Object Code
+========================
+
+The final location of built libraries and executables will depend upon whether
+you do a ``Debug``, ``Release``, or ``Profile`` build.
+
+Libraries
+
+ All libraries (static and dynamic) will be stored in
+ ``PROJ_OBJ_ROOT/<type>/lib``, where *type* is ``Debug``, ``Release``, or
+ ``Profile`` for a debug, optimized, or profiled build, respectively.
+
+Executables
+
+ All executables will be stored in ``PROJ_OBJ_ROOT/<type>/bin``, where *type*
+ is ``Debug``, ``Release``, or ``Profile`` for a debug, optimized, or
+ profiled build, respectively.
+
+Further Help
+============
+
+If you have any questions or need any help creating an LLVM project, the LLVM
+team would be more than happy to help. You can always post your questions to
+the `LLVM Developers Mailing List
+<http://lists.cs.uiuc.edu/pipermail/llvmdev/>`_.
diff --git a/docs/README.txt b/docs/README.txt
new file mode 100644
index 0000000..2fbbf98
--- /dev/null
+++ b/docs/README.txt
@@ -0,0 +1,12 @@
+LLVM Documentation
+==================
+
+The LLVM documentation is currently written in two formats:
+
+ * Plain HTML documentation.
+
+ * reStructuredText documentation using the Sphinx documentation generator.
+   It is currently tested with Sphinx 1.1.3.
+
+ For more information, see the "Sphinx Introduction for LLVM Developers"
+ document.
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 71f2cea..85448a5 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -4,11 +4,11 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<link rel="stylesheet" href="_static/llvm.css" type="text/css">
- <title>LLVM 3.1 Release Notes</title>
+ <title>LLVM 3.2 Release Notes</title>
</head>
<body>
-<h1>LLVM 3.1 Release Notes</h1>
+<h1>LLVM 3.2 Release Notes</h1>
<div>
<img style="float:right" src="http://llvm.org/img/DragonSmall.png"
@@ -18,7 +18,7 @@
<ol>
<li><a href="#intro">Introduction</a></li>
<li><a href="#subproj">Sub-project Status Update</a></li>
- <li><a href="#externalproj">External Projects Using LLVM 3.1</a></li>
+ <li><a href="#externalproj">External Projects Using LLVM 3.2</a></li>
<li><a href="#whatsnew">What's New in LLVM?</a></li>
<li><a href="GettingStarted.html">Installation Instructions</a></li>
<li><a href="#knownproblems">Known Problems</a></li>
@@ -29,6 +29,12 @@
<p>Written by the <a href="http://llvm.org/">LLVM Team</a></p>
</div>
+<h1 style="color:red">These are in-progress notes for the upcoming LLVM 3.2
+release.<br>
+You may prefer the
+<a href="http://llvm.org/releases/3.1/docs/ReleaseNotes.html">LLVM 3.1
+Release Notes</a>.</h1>
+
<!-- *********************************************************************** -->
<h2>
<a name="intro">Introduction</a>
@@ -38,11 +44,11 @@
<div>
<p>This document contains the release notes for the LLVM Compiler
- Infrastructure, release 3.1. Here we describe the status of LLVM, including
+ Infrastructure, release 3.2. Here we describe the status of LLVM, including
major improvements from the previous release, improvements in various
- subprojects of LLVM, and some of the current users of the code.
- All LLVM releases may be downloaded from
- the <a href="http://llvm.org/releases/">LLVM releases web site</a>.</p>
+ subprojects of LLVM, and some of the current users of the code. All LLVM
+ releases may be downloaded from the <a href="http://llvm.org/releases/">LLVM
+ releases web site</a>.</p>
<p>For more information about LLVM, including information about the latest
release, please check out the <a href="http://llvm.org/">main LLVM web
@@ -66,10 +72,10 @@
<div>
-<p>The LLVM 3.1 distribution currently consists of code from the core LLVM
- repository (which roughly includes the LLVM optimizers, code generators and
- supporting tools), and the Clang repository. In addition to this code, the
- LLVM Project includes other sub-projects that are in development. Here we
+<p>The LLVM 3.2 distribution currently consists of code from the core LLVM
+ repository, which roughly includes the LLVM optimizers, code generators and
+ supporting tools, and the Clang repository. In addition to this code, the
+ LLVM Project includes other sub-projects that are in development. Here we
include updates on these subprojects.</p>
<!--=========================================================================-->
@@ -88,20 +94,13 @@
production-quality compiler for C, Objective-C, C++ and Objective-C++ on x86
(32- and 64-bit), and for Darwin/ARM targets.</p>
-<p>In the LLVM 3.1 time-frame, the Clang team has made many improvements.
+<p>In the LLVM 3.2 time-frame, the Clang team has made many improvements.
Highlights include:</p>
<ul>
- <li>Greatly expanded <a href="http://clang.llvm.org/cxx_status.html">C++11
- support</a> including lambdas, initializer lists, constexpr, user-defined
- literals, and atomics.</li>
- <li>A new <a href="http://clang.llvm.org/docs/Tooling.html">tooling</a>
- library to ease building of clang-based standalone tools.</li>
- <li>Extended support for
- <a href="http://clang.llvm.org/docs/ObjectiveCLiterals.html">literals in
- Objective C</a>.</li>
+ <li>...</li>
</ul>
-<p>For more details about the changes to Clang since the 3.0 release, see the
+<p>For more details about the changes to Clang since the 3.1 release, see the
<a href="http://clang.llvm.org/docs/ReleaseNotes.html">Clang release
notes.</a></p>
@@ -127,23 +126,10 @@
Linux and OpenBSD platforms. It fully supports Ada, C, C++ and Fortran. It
has partial support for Go, Java, Obj-C and Obj-C++.</p>
-<p>The 3.1 release has the following notable changes:</p>
+<p>The 3.2 release has the following notable changes:</p>
<ul>
- <li>Partial support for gcc-4.7. Ada support is poor, but other languages work
- fairly well.</li>
-
- <li>Support for ARM processors. Some essential gcc headers that are needed to
- build DragonEgg for ARM are not installed by gcc. To work around this,
- copy the missing headers from the gcc source tree.</li>
-
- <li>Better optimization for Fortran by exploiting the fact that Fortran scalar
- arguments have 'restrict' semantics.</li>
-
- <li>Better optimization for all languages by passing information about type
- aliasing and type ranges to the LLVM optimizers.</li>
-
- <li>A regression test-suite was added.</li>
+ <li>...</li>
</ul>
</div>
@@ -160,13 +146,15 @@
target-specific hooks required by code generation and other runtime
components. For example, when compiling for a 32-bit target, converting a
double to a 64-bit unsigned integer is compiled into a runtime call to the
- "__fixunsdfdi" function. The compiler-rt library provides highly optimized
- implementations of this and other low-level routines (some are 3x faster than
- the equivalent libgcc routines).</p>
+ <code>__fixunsdfdi</code> function. The compiler-rt library provides highly
+ optimized implementations of this and other low-level routines (some are 3x
+ faster than the equivalent libgcc routines).</p>
-<p>As of 3.1, compiler-rt includes the helper functions for atomic operations,
- allowing atomic operations on arbitrary-sized quantities to work. These
- functions follow the specification defined by gcc and are used by clang.</p>
+<p>The 3.2 release has the following notable changes:</p>
+
+<ul>
+ <li>...</li>
+</ul>
</div>
@@ -183,6 +171,12 @@
expression parsing (particularly for C++) and uses the LLVM JIT for target
support.</p>
+<p>The 3.2 release has the following notable changes:</p>
+
+<ul>
+ <li>...</li>
+</ul>
+
</div>
<!--=========================================================================-->
@@ -196,15 +190,10 @@
licensed</a> under the MIT and UIUC license, allowing it to be used more
permissively.</p>
-<p>Within the LLVM 3.1 time-frame there were the following highlights:</p>
+<p>Within the LLVM 3.2 time-frame there were the following highlights:</p>
<ul>
- <li>The <code>&lt;atomic&gt;</code> header is now passing all tests, when
- compiling with clang and linking against the support code from
- compiler-rt.</li>
- <li>FreeBSD now includes libc++ as part of the base system.</li>
- <li>libc++ has been ported to Solaris and, in combination with libcxxrt and
- clang, is working with a large body of existing code.</li>
+ <li>...</li>
</ul>
</div>
@@ -220,8 +209,11 @@
of a Java Virtual Machine (Java VM or JVM) that uses LLVM for static and
just-in-time compilation.</p>
-<p>In the LLVM 3.1 time-frame, VMKit has had significant improvements on both
- runtime and startup performance.</p>
+<p>The 3.2 release has the following notable changes:</p>
+
+<ul>
+ <li>...</li>
+</ul>
</div>
@@ -239,16 +231,10 @@
Work in the area of automatic SIMD and accelerator code generation was
started.</p>
-<p>Within the LLVM 3.1 time-frame there were the following highlights:</p>
+<p>Within the LLVM 3.2 time-frame there were the following highlights:</p>
<ul>
- <li>Polly became an official LLVM project</li>
- <li>Polly can be loaded directly into clang (enabled by '-O3 -mllvm -polly')</li>
- <li>An automatic scheduling optimizer (derived
- from <a href="http://pluto-compiler.sourceforge.net/">Pluto</a>) was
- integrated. It performs loop transformations to optimize for data-locality
- and parallelism. The transformations include, but are not limited to
- interchange, fusion, fission, skewing and tiling.</li>
+ <li>...</li>
</ul>
</div>
@@ -257,15 +243,15 @@
<!-- *********************************************************************** -->
<h2>
- <a name="externalproj">External Open Source Projects Using LLVM 3.1</a>
+ <a name="externalproj">External Open Source Projects Using LLVM 3.2</a>
</h2>
<!-- *********************************************************************** -->
<div>
<p>An exciting aspect of LLVM is that it is used as an enabling technology for
- a lot of other language and tools projects. This section lists some of the
- projects that have already been updated to work with LLVM 3.1.</p>
+ a lot of other language and tools projects. This section lists some of the
+ projects that have already been updated to work with LLVM 3.2.</p>
<h3>Crack</h3>
@@ -409,14 +395,14 @@
<!-- *********************************************************************** -->
<h2>
- <a name="whatsnew">What's New in LLVM 3.1?</a>
+ <a name="whatsnew">What's New in LLVM 3.2?</a>
</h2>
<!-- *********************************************************************** -->
<div>
<p>This release includes a huge number of bug fixes, performance tweaks and
- minor improvements. Some of the major improvements and new features are
+ minor improvements. Some of the major improvements and new features are
listed in this section.</p>
<!--=========================================================================-->
@@ -426,13 +412,13 @@
<div>
- <!-- Features that need text if they're finished for 3.1:
+ <!-- Features that need text if they're finished for 3.2:
ARM EHABI
combiner-aa?
strong phi elim
loop dependence analysis
CorrelatedValuePropagation
- lib/Transforms/IPO/MergeFunctions.cpp => consider for 3.1.
+ lib/Transforms/IPO/MergeFunctions.cpp => consider for 3.2.
Integrated assembler on by default for arm/thumb?
-->
@@ -443,17 +429,10 @@
llvm/lib/Archive - replace with lib object?
-->
-<p>LLVM 3.1 includes several major changes and big features:</p>
+<p>LLVM 3.2 includes several major changes and big features:</p>
<ul>
- <li><a href="../tools/clang/docs/AddressSanitizer.html">AddressSanitizer</a>,
- a fast memory error detector.</li>
- <li><a href="CodeGenerator.html#machineinstrbundle">MachineInstr Bundles</a>,
- Support to model instruction bundling / packing.</li>
- <li><a href="#armintegratedassembler">ARM Integrated Assembler</a>,
- A full featured assembler and direct-to-object support for ARM.</li>
- <li><a href="#blockplacement">Basic Block Placement</a>
- Probability driven basic block placement.</li>
+ <li>...</li>
</ul>
</div>
@@ -470,19 +449,9 @@
expose new optimization opportunities:</p>
<ul>
- <li>A new type representing 16 bit <i>half</i> floating point values has
- been added.</li>
- <li>IR now supports vectors of pointers, including vector GEPs.</li>
- <li>Module flags have been introduced. They convey information about the
- module as a whole to LLVM subsystems. This is currently used to encode
- Objective C ABI information.</li>
- <li>Loads can now have range metadata attached to them to describe the
- possible values being loaded.</li>
- <li>The <tt>llvm.ctlz</tt> and <tt>llvm.cttz</tt> intrinsics now have an
- additional argument which indicates whether the behavior of the intrinsic
- is undefined on a zero input. This can be used to generate more efficient
- code on platforms that only have instructions which don't return the type
- size when counting bits in 0.</li>
+ <li>Thread local variables may have a specified TLS model. See the
+ <a href="LangRef.html#globalvars">Language Reference Manual</a>.</li>
+ <li>...</li>
</ul>
</div>
@@ -494,22 +463,11 @@
<div>
-<p>In addition to many minor performance tweaks and bug fixes, this
- release includes a few major enhancements and additions to the
- optimizers:</p>
+<p>In addition to many minor performance tweaks and bug fixes, this release
+ includes a few major enhancements and additions to the optimizers:</p>
<ul>
- <li>The loop unroll pass now is able to unroll loops with run-time trip counts.
- This feature is turned off by default, and is enabled with the
- <code>-unroll-runtime</code> flag.</li>
- <li>A new basic-block autovectorization pass is available. Pass
- <code>-vectorize</code> to run this pass along with some associated
- post-vectorization cleanup passes. For more information, see the EuroLLVM
- 2012 slides: <a href="http://llvm.org/devmtg/2012-04-12/Slides/Hal_Finkel.pdf">
- Autovectorization with LLVM</a>.</li>
- <li>Inline cost heuristics have been completely overhauled and now closely
- model constant propagation through call sites, disregard trivially dead
- code costs, and can model C++ STL iterator patterns.</li>
+ <li>...</li>
</ul>
</div>
@@ -524,14 +482,12 @@
<p>The LLVM Machine Code (aka MC) subsystem was created to solve a number of
problems in the realm of assembly, disassembly, object file format handling,
and a number of other related areas that CPU instruction-set level tools work
- in. For more information, please see
- the <a href="http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html">Intro
- to the LLVM MC Project Blog Post</a>.</p>
+ in. For more information, please see the
+ <a href="http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html">Intro
+ to the LLVM MC Project Blog Post</a>.</p>
<ul>
- <li>The integrated assembler can optionally emit debug information when
- assembling a </tt>.s</tt> file. It can be enabled by passing the
- <tt>-g</tt> option to <tt>llvm-mc</tt>.</li>
+ <li>...</li>
</ul>
</div>
@@ -556,21 +512,7 @@
make it run faster:</p>
<ul>
- <li>TableGen can now synthesize register classes that are only needed to
- represent combinations of constraints from instructions and sub-registers.
- The synthetic register classes inherit most of their properties form their
- closest user-defined super-class.</li>
- <li><code>MachineRegisterInfo</code> now allows the reserved registers to be
- frozen when register allocation starts. Target hooks should use the
- <code>MRI-&gt;canReserveReg(FramePtr)</code> method to avoid accidentally
- disabling frame pointer elimination during register allocation.</li>
- <li>A new kind of <code>MachineOperand</code> provides a compact
- representation of large clobber lists on call instructions. The register
- mask operand references a bit mask of preserved registers. Everything else
- is clobbered.</li>
- <li>The DWARF debug info writer gained support for emitting data for the
- <a href="SourceLevelDebugging.html#acceltable">name accelerator tables
- DWARF extension</a>. It is used by LLDB to speed up name lookup.</li>
+ <li>...</li>
</ul>
<p> We added new TableGen infrastructure to support bundling for
@@ -587,11 +529,14 @@
<h4>
<a name="blockplacement">Basic Block Placement</a>
</h4>
+
<div>
+
<p>A probability based block placement and code layout algorithm was added to
-LLVM's code generator. This layout pass supports probabilities derived from
-static heuristics as well as source code annotations such as
-<code>__builtin_expect</code>.</p>
+ LLVM's code generator. This layout pass supports probabilities derived from
+ static heuristics as well as source code annotations such as
+ <code>__builtin_expect</code>.</p>
+
</div>
<!--=========================================================================-->
@@ -604,14 +549,7 @@ static heuristics as well as source code annotations such as
<p>New features and major changes in the X86 target include:</p>
<ul>
- <li>Greatly improved support for AVX2.</li>
- <li>Lots of bug fixes and improvements for AVX1.</li>
- <li>Support for the FMA4 and XOP instruction set extensions.</li>
- <li>Call instructions use the new register mask operands for faster compile
- times and better support for different calling conventions. The old WINCALL
- instructions are no longer needed.</li>
- <li>DW2 Exception Handling is enabled on Cygwin and MinGW.</li>
- <li>Support for implicit TLS model used with MSVC runtime.</li>
+ <li>...</li>
</ul>
</div>
@@ -626,65 +564,45 @@ static heuristics as well as source code annotations such as
<p>New features of the ARM target include:</p>
<ul>
- <li>The constant island pass now supports basic block and constant pool entry
- alignments greater than 4 bytes.</li>
- <li>On Darwin, the ARM target now has a full-featured integrated assembler.
- </li>
+ <li>...</li>
</ul>
+<!--_________________________________________________________________________-->
+
<h4>
<a name="armintegratedassembler">ARM Integrated Assembler</a>
</h4>
+
<div>
+
<p>The ARM target now includes a full featured macro assembler, including
-direct-to-object module support for clang. The assembler is currently enabled
-by default for Darwin only pending testing and any additional necessary
-platform specific support for Linux.</p>
+ direct-to-object module support for clang. The assembler is currently enabled
+ by default for Darwin only pending testing and any additional necessary
+ platform specific support for Linux.</p>
<p>Full support is included for Thumb1, Thumb2 and ARM modes, along with
-subtarget and CPU specific extensions for VFP2, VFP3 and NEON.</p>
+ subtarget and CPU specific extensions for VFP2, VFP3 and NEON.</p>
 <p>The assembler is Unified Syntax only (see ARM Architectural Reference Manual
-for details). While there is some, and growing, support for pre-unfied (divided)
-syntax, there are still significant gaps in that support.</p>
-</div>
+  for details). While there is some, and growing, support for pre-unified
+ (divided) syntax, there are still significant gaps in that support.</p>
</div>
-<!--=========================================================================-->
-<h3>
-<a name="MIPS">MIPS Target Improvements</a>
-</h3>
-<div>
-New features and major changes in the MIPS target include:</p>
-
-<ul>
- <li>MIPS32 little-endian direct object code emission is functional.</li>
- <li>MIPS64 little-endian code generation is largely functional for N64 ABI in assembly printing mode with the exception of handling of long double (f128) type.</li>
- <li>Support for new instructions has been added, which includes swap-bytes
- instructions (WSBH and DSBH), floating point multiply-add/subtract and
- negative multiply-add/subtract instructions, and floating
- point load/store instructions with reg+reg addressing (LWXC1, etc.)</li>
- <li>Various fixes to improve performance have been implemented.</li>
- <li>Post-RA scheduling is now enabled at -O3.</li>
- <li>Support for soft-float code generation has been added.</li>
- <li>clang driver's support for MIPS 64-bits targets.</li>
- <li>Support for MIPS floating point ABI option in clang driver.</li>
-</ul>
</div>
<!--=========================================================================-->
<h3>
-<a name="PTX">PTX Target Improvements</a>
+<a name="MIPS">MIPS Target Improvements</a>
</h3>
<div>
-<p>An outstanding conditional inversion bug was fixed in this release.</p>
+<p>New features and major changes in the MIPS target include:</p>
-<p><b>NOTE</b>: LLVM 3.1 marks the last release of the PTX back-end, in its
- current form. The back-end is currently being replaced by the NVPTX
- back-end, currently in SVN ToT.</p>
+<ul>
+ <li>...</li>
+</ul>
</div>
@@ -696,7 +614,7 @@ New features and major changes in the MIPS target include:</p>
<div>
<ul>
- <li>Support for Qualcomm's Hexagon VLIW processor has been added.</li>
+ <li>...</li>
</ul>
</div>
@@ -709,25 +627,11 @@ New features and major changes in the MIPS target include:</p>
<div>
<p>If you're already an LLVM user or developer with out-of-tree changes based on
- LLVM 3.1, this section lists some "gotchas" that you may run into upgrading
+ LLVM 3.2, this section lists some "gotchas" that you may run into upgrading
from the previous release.</p>
<ul>
- <li>LLVM's build system now requires a python 2 interpreter to be present at
- build time. A perl interpreter is no longer required.</li>
- <li>The C backend has been removed. It had numerous problems, to the point of
- not being able to compile any nontrivial program.</li>
- <li>The Alpha, Blackfin and SystemZ targets have been removed due to lack of
- maintenance.</li>
- <li>LLVM 3.1 removes support for reading LLVM 2.9 bitcode files. Going
- forward, we aim for all future versions of LLVM to read bitcode files and
- <tt>.ll</tt> files produced by LLVM 3.0 and later.</li>
- <li>The <tt>unwind</tt> instruction is now gone. With the introduction of the
- new exception handling system in LLVM 3.0, the <tt>unwind</tt> instruction
- became obsolete.</li>
- <li>LLVM 3.0 and earlier automatically added the returns_twice fo functions
- like setjmp based on the name. This functionality was removed in 3.1.
- This affects Clang users, if -ffreestanding is used.</li>
+ <li>...</li>
</ul>
</div>
@@ -743,40 +647,7 @@ New features and major changes in the MIPS target include:</p>
LLVM API changes are:</p>
<ul>
- <li>Target specific options have been moved from global variables to members
- on the new <code>TargetOptions</code> class, which is local to each
- <code>TargetMachine</code>. As a consequence, the associated flags will
- no longer be accepted by <tt>clang -mllvm</tt>. This includes:
-<ul>
-<li><code>llvm::PrintMachineCode</code></li>
-<li><code>llvm::NoFramePointerElim</code></li>
-<li><code>llvm::NoFramePointerElimNonLeaf</code></li>
-<li><code>llvm::DisableFramePointerElim(const MachineFunction &)</code></li>
-<li><code>llvm::LessPreciseFPMADOption</code></li>
-<li><code>llvm::LessPrecideFPMAD()</code></li>
-<li><code>llvm::NoExcessFPPrecision</code></li>
-<li><code>llvm::UnsafeFPMath</code></li>
-<li><code>llvm::NoInfsFPMath</code></li>
-<li><code>llvm::NoNaNsFPMath</code></li>
-<li><code>llvm::HonorSignDependentRoundingFPMathOption</code></li>
-<li><code>llvm::HonorSignDependentRoundingFPMath()</code></li>
-<li><code>llvm::UseSoftFloat</code></li>
-<li><code>llvm::FloatABIType</code></li>
-<li><code>llvm::NoZerosInBSS</code></li>
-<li><code>llvm::JITExceptionHandling</code></li>
-<li><code>llvm::JITEmitDebugInfo</code></li>
-<li><code>llvm::JITEmitDebugInfoToDisk</code></li>
-<li><code>llvm::GuaranteedTailCallOpt</code></li>
-<li><code>llvm::StackAlignmentOverride</code></li>
-<li><code>llvm::RealignStack</code></li>
-<li><code>llvm::DisableJumpTables</code></li>
-<li><code>llvm::EnableFastISel</code></li>
-<li><code>llvm::getTrapFunctionName()</code></li>
-<li><code>llvm::EnableSegmentedStacks</code></li>
-</ul></li>
-
- <li>The <code>MDBuilder</code> class has been added to simplify the creation
- of metadata.</li>
+ <li>...</li>
</ul>
</div>
@@ -791,13 +662,8 @@ New features and major changes in the MIPS target include:</p>
<p>In addition, some tools have changed in this release. Some of the changes
are:</p>
-
<ul>
- <li><tt>llvm-stress</tt> is a command line tool for generating random
- <tt>.ll</tt> files to fuzz different LLVM components. </li>
- <li>The <tt>llvm-ld</tt> tool has been removed. The clang driver provides a
- more reliable solution for turning a set of bitcode files into a binary.
- To merge bitcode files <tt>llvm-link</tt> can be used instead.</li>
+ <li>...</li>
</ul>
</div>
@@ -811,19 +677,12 @@ New features and major changes in the MIPS target include:</p>
<div>
<p>Officially supported Python bindings have been added! Feature support is far
-from complete. The current bindings support interfaces to:</p>
+ from complete. The current bindings support interfaces to:</p>
+
<ul>
- <li>Object File Interface</li>
- <li>Disassembler</li>
+ <li>...</li>
</ul>
-<p>Using the Object File Interface, it is possible to inspect binary object files.
-Think of it as a Python version of readelf or llvm-objdump.</p>
-
-<p>Support for additional features is currently being developed by community
-contributors. If you are interested in shaping the direction of the Python
-bindings, please express your intent on IRC or the developers list.</p>
-
</div>
</div>
@@ -839,11 +698,11 @@ bindings, please express your intent on IRC or the developers list.</p>
<p>LLVM is generally a production quality compiler, and is used by a broad range
of applications and shipping in many products. That said, not every
subsystem is as mature as the aggregate, particularly the more obscure
- targets. If you run into a problem, please check the <a
- href="http://llvm.org/bugs/">LLVM bug database</a> and submit a bug if
- there isn't already one or ask on the <a
- href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVMdev
- list</a>.</p>
+ targets. If you run into a problem, please check
+ the <a href="http://llvm.org/bugs/">LLVM bug database</a> and submit a bug if
+ there isn't already one or ask on
+ the <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVMdev
+ list</a>.</p>
<p>Known problem areas include:</p>
@@ -851,7 +710,7 @@ bindings, please express your intent on IRC or the developers list.</p>
<li>The CellSPU, MSP430, PTX and XCore backends are experimental.</li>
<li>The integrated assembler, disassembler, and JIT is not supported by
- several targets. If an integrated assembler is not supported, then a
+ several targets. If an integrated assembler is not supported, then a
system assembler is required. For more details, see the <a
href="CodeGenerator.html#targetfeatures">Target Features Matrix</a>.
</li>
@@ -890,7 +749,7 @@ bindings, please express your intent on IRC or the developers list.</p>
src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-05-15 23:58:06 +0200 (Tue, 15 May 2012) $
+ Last modified: $Date: 2012-07-13 14:44:23 +0200 (Fri, 13 Jul 2012) $
</address>
</body>
diff --git a/docs/SegmentedStacks.html b/docs/SegmentedStacks.html
deleted file mode 100644
index 16f5507..0000000
--- a/docs/SegmentedStacks.html
+++ /dev/null
@@ -1,93 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html>
- <head>
- <title>Segmented Stacks in LLVM</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
- </head>
-
- <body>
- <h1>Segmented Stacks in LLVM</h1>
- <div class="doc_author">
- <p>Written by <a href="mailto:sanjoy@playingwithpointers.com">Sanjoy Das</a></p>
- </div>
-
- <ol>
- <li><a href="#intro">Introduction</a></li>
- <li><a href="#implementation">Implementation Details</a>
- <ol>
- <li><a href="#morestack">Allocating Stacklets</a></li>
- <li><a href="#alloca">Variable Sized Allocas</a></li>
- </ol>
- </li>
- </ol>
-
- <h2><a name="intro">Introduction</a></h2>
- <div>
- <p>
- Segmented stack allows stack space to be allocated incrementally than as a monolithic chunk (of some worst case size) at thread initialization. This is done by allocating stack blocks (henceforth called <em>stacklets</em>) and linking them into a doubly linked list. The function prologue is responsible for checking if the current stacklet has enough space for the function to execute; and if not, call into the libgcc runtime to allocate more stack space. When using <tt>llc</tt>, segmented stacks can be enabled by adding <tt>-segmented-stacks</tt> to the command line.
- </p>
- <p>
- The runtime functionality is <a href="http://gcc.gnu.org/wiki/SplitStacks">already there in libgcc</a>.
- </p>
- </div>
-
- <h2><a name="implementation">Implementation Details</a></h2>
- <div>
- <h3><a name="morestack">Allocating Stacklets</a></h3>
- <div>
- <p>
- As mentioned above, the function prologue checks if the current stacklet has enough space. The current approach is to use a slot in the TCB to store the current stack limit (minus the amount of space needed to allocate a new block) - this slot's offset is again dictated by <code>libgcc</code>. The generated assembly looks like this on x86-64:
- </p>
- <pre>
- leaq -8(%rsp), %r10
- cmpq %fs:112, %r10
- jg .LBB0_2
-
- # More stack space needs to be allocated
- movabsq $8, %r10 # The amount of space needed
- movabsq $0, %r11 # The total size of arguments passed on stack
- callq __morestack
- ret # The reason for this extra return is explained below
- .LBB0_2:
- # Usual prologue continues here
- </pre>
- <p>
- The size of function arguments on the stack needs to be passed to <code> __morestack</code> (this function is implemented in <code>libgcc</code>) since that number of bytes has to be copied from the previous stacklet to the current one. This is so that SP (and FP) relative addressing of function arguments work as expected.
- </p>
- <p>
- The unusual <code>ret</code> is needed to have the function which made a call to <code>__morestack</code> return correctly. <code>__morestack</code>, instead of returning, calls into <code>.LBB0_2</code>. This is possible since both, the size of the <code>ret</code> instruction and the PC of call to <code>__morestack</code> are known. When the function body returns, control is transferred back to <code>__morestack</code>. <code>__morestack</code> then de-allocates the new stacklet, restores the correct SP value, and does a second return, which returns control to the correct caller.
- </p>
- </div>
-
- <h3><a name="alloca">Variable Sized Allocas</a></h3>
- <div>
- <p>
- The section on <a href="#morestack">allocating stacklets</a> automatically assumes that every stack frame will be of fixed size. However, LLVM allows the use of the <code>llvm.alloca</code> intrinsic to allocate dynamically sized blocks of memory on the stack. When faced with such a variable-sized alloca, code is generated to
- </p>
- <ul>
- <li>Check if the current stacklet has enough space. If yes, just bump the SP, like in the normal case.</li>
- <li>If not, generate a call to <code>libgcc</code>, which allocates the memory from the heap.</li>
- </ul>
- <p>
- The memory allocated from the heap is linked into a list in the current stacklet, and freed along with the same. This prevents a memory leak.
- </p>
- </div>
-
- </div>
-
- <hr>
- <address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer">
- <img src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS">
- </a>
- <a href="http://validator.w3.org/check/referer">
- <img src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01">
- </a>
- <a href="mailto:sanjoy@playingwithpointers.com">Sanjoy Das</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date$
- </address>
- </body>
-</html>
-
diff --git a/docs/SegmentedStacks.rst b/docs/SegmentedStacks.rst
new file mode 100644
index 0000000..f97d62a
--- /dev/null
+++ b/docs/SegmentedStacks.rst
@@ -0,0 +1,80 @@
+.. _segmented_stacks:
+
+========================
+Segmented Stacks in LLVM
+========================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+Segmented stacks allow stack space to be allocated incrementally, rather than
+as a monolithic chunk (of some worst case size) at thread initialization. This
+is done by allocating stack blocks (henceforth called *stacklets*) and linking
+them into a doubly linked list. The function prologue is responsible for
+checking if the current stacklet has enough space for the function to execute
+and, if not, for calling into the libgcc runtime to allocate more stack space.
+When using ``llc``, segmented stacks can be enabled by adding
+``-segmented-stacks`` to the command line.
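+
+For example (the input file name is illustrative):
+
+.. code-block:: bash
+
+  % llc -segmented-stacks foo.ll -o foo.s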
+
+The runtime functionality is `already there in libgcc
+<http://gcc.gnu.org/wiki/SplitStacks>`_.
+
+Implementation Details
+======================
+
+.. _allocating stacklets:
+
+Allocating Stacklets
+--------------------
+
+As mentioned above, the function prologue checks if the current stacklet has
+enough space. The current approach is to use a slot in the TCB to store the
+current stack limit (minus the amount of space needed to allocate a new
+block); this slot's offset is again dictated by ``libgcc``. The generated
+assembly looks like this on x86-64:
+
+.. code-block:: nasm
+
+ leaq -8(%rsp), %r10
+ cmpq %fs:112, %r10
+ jg .LBB0_2
+
+ # More stack space needs to be allocated
+ movabsq $8, %r10 # The amount of space needed
+ movabsq $0, %r11 # The total size of arguments passed on stack
+ callq __morestack
+ ret # The reason for this extra return is explained below
+ .LBB0_2:
+ # Usual prologue continues here
+
+The size of function arguments on the stack needs to be passed to
+``__morestack`` (this function is implemented in ``libgcc``) since that number
+of bytes has to be copied from the previous stacklet to the current one. This is
+so that SP (and FP) relative addressing of function arguments work as expected.
+
+The unusual ``ret`` is needed to have the function which made a call to
+``__morestack`` return correctly. ``__morestack``, instead of returning, calls
+into ``.LBB0_2``. This is possible since both the size of the ``ret``
+instruction and the PC of the call to ``__morestack`` are known. When the function
+body returns, control is transferred back to ``__morestack``. ``__morestack``
+then de-allocates the new stacklet, restores the correct SP value, and does a
+second return, which returns control to the correct caller.
+
+Variable Sized Allocas
+----------------------
+
+The scheme described in `allocating stacklets`_ assumes that every stack
+frame will be of fixed size. However, LLVM allows the use of the ``alloca``
+instruction to allocate dynamically sized blocks of memory on the stack. When
+faced with such a variable-sized alloca, code is generated to:
+
+* Check if the current stacklet has enough space. If yes, just bump the SP, like
+ in the normal case.
+* If not, generate a call to ``libgcc``, which allocates the memory from the
+ heap.
+
+The memory allocated from the heap is linked into a list in the current
+stacklet and freed when the stacklet itself is deallocated. This prevents a
+memory leak.
diff --git a/docs/SourceLevelDebugging.html b/docs/SourceLevelDebugging.html
index 259a259..bb72bf3 100644
--- a/docs/SourceLevelDebugging.html
+++ b/docs/SourceLevelDebugging.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Source Level Debugging with LLVM</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -77,10 +77,6 @@
</li>
</ul>
</td>
-<td class="right">
-<img src="img/venusflytrap.jpg" alt="A leafy and green bug eater" width="247"
-height="369">
-</td>
</tr></table>
<div class="doc_author">
@@ -2716,7 +2712,7 @@ HashData[hash_data_count]
has address attributes: DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges or
DW_AT_entry_pc. It also contains DW_TAG_variable DIEs that have a DW_OP_addr
in the location (global and static variables). All global and static variables
- should be included, including those scoped withing functions and classes. For
+ should be included, including those scoped within functions and classes. For
example using the following code:</p>
<div class="doc_code">
<pre>
@@ -2855,7 +2851,7 @@ int main ()
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-04-03 02:43:49 +0200 (Tue, 03 Apr 2012) $
+ Last modified: $Date: 2012-06-02 12:20:22 +0200 (Sat, 02 Jun 2012) $
</address>
</body>
diff --git a/docs/SystemLibrary.html b/docs/SystemLibrary.html
index 7cafedf..4b09e7c 100644
--- a/docs/SystemLibrary.html
+++ b/docs/SystemLibrary.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>System Library</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -310,7 +310,7 @@
<a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-31 12:21:59 +0100 (Mon, 31 Oct 2011) $
+ Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
</address>
</body>
</html>
diff --git a/docs/TableGenFundamentals.html b/docs/TableGenFundamentals.html
deleted file mode 100644
index a211389..0000000
--- a/docs/TableGenFundamentals.html
+++ /dev/null
@@ -1,973 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>TableGen Fundamentals</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>TableGen Fundamentals</h1>
-
-<div>
-<ul>
- <li><a href="#introduction">Introduction</a>
- <ol>
- <li><a href="#concepts">Basic concepts</a></li>
- <li><a href="#example">An example record</a></li>
- <li><a href="#running">Running TableGen</a></li>
- </ol></li>
- <li><a href="#syntax">TableGen syntax</a>
- <ol>
- <li><a href="#primitives">TableGen primitives</a>
- <ol>
- <li><a href="#comments">TableGen comments</a></li>
- <li><a href="#types">The TableGen type system</a></li>
- <li><a href="#values">TableGen values and expressions</a></li>
- </ol></li>
- <li><a href="#classesdefs">Classes and definitions</a>
- <ol>
- <li><a href="#valuedef">Value definitions</a></li>
- <li><a href="#recordlet">'let' expressions</a></li>
- <li><a href="#templateargs">Class template arguments</a></li>
- <li><a href="#multiclass">Multiclass definitions and instances</a></li>
- </ol></li>
- <li><a href="#filescope">File scope entities</a>
- <ol>
- <li><a href="#include">File inclusion</a></li>
- <li><a href="#globallet">'let' expressions</a></li>
- <li><a href="#foreach">'foreach' blocks</a></li>
- </ol></li>
- </ol></li>
- <li><a href="#backends">TableGen backends</a>
- <ol>
- <li><a href="#">todo</a></li>
- </ol></li>
-</ul>
-</div>
-
-<div class="doc_author">
- <p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="introduction">Introduction</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>TableGen's purpose is to help a human develop and maintain records of
-domain-specific information. Because there may be a large number of these
-records, it is specifically designed to allow writing flexible descriptions and
-for common features of these records to be factored out. This reduces the
-amount of duplication in the description, reduces the chance of error, and
-makes it easier to structure domain specific information.</p>
-
-<p>The core part of TableGen <a href="#syntax">parses a file</a>, instantiates
-the declarations, and hands the result off to a domain-specific "<a
-href="#backends">TableGen backend</a>" for processing. The current major user
-of TableGen is the <a href="CodeGenerator.html">LLVM code generator</a>.</p>
-
-<p>Note that if you work on TableGen much, and use emacs or vim, that you can
-find an emacs "TableGen mode" and a vim language file in the
-<tt>llvm/utils/emacs</tt> and <tt>llvm/utils/vim</tt> directories of your LLVM
-distribution, respectively.</p>
-
-<!-- ======================================================================= -->
-<h3><a name="concepts">Basic concepts</a></h3>
-
-<div>
-
-<p>TableGen files consist of two key parts: 'classes' and 'definitions', both
-of which are considered 'records'.</p>
-
-<p><b>TableGen records</b> have a unique name, a list of values, and a list of
-superclasses. The list of values is the main data that TableGen builds for each
-record; it is this that holds the domain specific information for the
-application. The interpretation of this data is left to a specific <a
-href="#backends">TableGen backend</a>, but the structure and format rules are
-taken care of and are fixed by TableGen.</p>
-
-<p><b>TableGen definitions</b> are the concrete form of 'records'. These
-generally do not have any undefined values, and are marked with the
-'<tt>def</tt>' keyword.</p>
-
-<p><b>TableGen classes</b> are abstract records that are used to build and
-describe other records. These 'classes' allow the end-user to build
-abstractions for either the domain they are targeting (such as "Register",
-"RegisterClass", and "Instruction" in the LLVM code generator) or for the
-implementor to help factor out common properties of records (such as "FPInst",
-which is used to represent floating point instructions in the X86 backend).
-TableGen keeps track of all of the classes that are used to build up a
-definition, so the backend can find all definitions of a particular class, such
-as "Instruction".</p>
-
-<p><b>TableGen multiclasses</b> are groups of abstract records that are
-instantiated all at once. Each instantiation can result in multiple
-TableGen definitions. If a multiclass inherits from another multiclass,
-the definitions in the sub-multiclass become part of the current
-multiclass, as if they were declared in the current multiclass.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="example">An example record</a></h3>
-
-<div>
-
-<p>With no other arguments, TableGen parses the specified file and prints out
-all of the classes, then all of the definitions. This is a good way to see what
-the various definitions expand to fully. Running this on the <tt>X86.td</tt>
-file prints this (at the time of this writing):</p>
-
-<div class="doc_code">
-<pre>
-...
-<b>def</b> ADD32rr { <i>// Instruction X86Inst I</i>
- <b>string</b> Namespace = "X86";
- <b>dag</b> OutOperandList = (outs GR32:$dst);
- <b>dag</b> InOperandList = (ins GR32:$src1, GR32:$src2);
- <b>string</b> AsmString = "add{l}\t{$src2, $dst|$dst, $src2}";
- <b>list</b>&lt;dag&gt; Pattern = [(set GR32:$dst, (add GR32:$src1, GR32:$src2))];
- <b>list</b>&lt;Register&gt; Uses = [];
- <b>list</b>&lt;Register&gt; Defs = [EFLAGS];
- <b>list</b>&lt;Predicate&gt; Predicates = [];
- <b>int</b> CodeSize = 3;
- <b>int</b> AddedComplexity = 0;
- <b>bit</b> isReturn = 0;
- <b>bit</b> isBranch = 0;
- <b>bit</b> isIndirectBranch = 0;
- <b>bit</b> isBarrier = 0;
- <b>bit</b> isCall = 0;
- <b>bit</b> canFoldAsLoad = 0;
- <b>bit</b> mayLoad = 0;
- <b>bit</b> mayStore = 0;
- <b>bit</b> isImplicitDef = 0;
- <b>bit</b> isConvertibleToThreeAddress = 1;
- <b>bit</b> isCommutable = 1;
- <b>bit</b> isTerminator = 0;
- <b>bit</b> isReMaterializable = 0;
- <b>bit</b> isPredicable = 0;
- <b>bit</b> hasDelaySlot = 0;
- <b>bit</b> usesCustomInserter = 0;
- <b>bit</b> hasCtrlDep = 0;
- <b>bit</b> isNotDuplicable = 0;
- <b>bit</b> hasSideEffects = 0;
- <b>bit</b> neverHasSideEffects = 0;
- InstrItinClass Itinerary = NoItinerary;
- <b>string</b> Constraints = "";
- <b>string</b> DisableEncoding = "";
- <b>bits</b>&lt;8&gt; Opcode = { 0, 0, 0, 0, 0, 0, 0, 1 };
- Format Form = MRMDestReg;
- <b>bits</b>&lt;6&gt; FormBits = { 0, 0, 0, 0, 1, 1 };
- ImmType ImmT = NoImm;
- <b>bits</b>&lt;3&gt; ImmTypeBits = { 0, 0, 0 };
- <b>bit</b> hasOpSizePrefix = 0;
- <b>bit</b> hasAdSizePrefix = 0;
- <b>bits</b>&lt;4&gt; Prefix = { 0, 0, 0, 0 };
- <b>bit</b> hasREX_WPrefix = 0;
- FPFormat FPForm = ?;
- <b>bits</b>&lt;3&gt; FPFormBits = { 0, 0, 0 };
-}
-...
-</pre>
-</div>
-
-<p>This definition corresponds to a 32-bit register-register add instruction in
-the X86. The string after the '<tt>def</tt>' string indicates the name of the
-record&mdash;"<tt>ADD32rr</tt>" in this case&mdash;and the comment at the end of
-the line indicates the superclasses of the definition. The body of the record
-contains all of the data that TableGen assembled for the record, indicating that
-the instruction is part of the "X86" namespace, the pattern indicating how the
-the instruction should be emitted into the assembly file, that it is a
-two-address instruction, has a particular encoding, etc. The contents and
-semantics of the information in the record is specific to the needs of the X86
-backend, and is only shown as an example.</p>
-
-<p>As you can see, a lot of information is needed for every instruction
-supported by the code generator, and specifying it all manually would be
-unmaintainable, prone to bugs, and tiring to do in the first place. Because we
-are using TableGen, all of the information was derived from the following
-definition:</p>
-
-<div class="doc_code">
-<pre>
-let Defs = [EFLAGS],
- isCommutable = 1, <i>// X = ADD Y,Z --&gt; X = ADD Z,Y</i>
- isConvertibleToThreeAddress = 1 <b>in</b> <i>// Can transform into LEA.</i>
-def ADD32rr : I&lt;0x01, MRMDestReg, (outs GR32:$dst),
- (ins GR32:$src1, GR32:$src2),
- "add{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (add GR32:$src1, GR32:$src2))]&gt;;
-</pre>
-</div>
-
-<p>This definition makes use of the custom class <tt>I</tt> (extended from the
-custom class <tt>X86Inst</tt>), which is defined in the X86-specific TableGen
-file, to factor out the common features that instructions of its class share. A
-key feature of TableGen is that it allows the end-user to define the
-abstractions they prefer to use when describing their information.</p>
-
-<p>Each def record has a special entry called "NAME." This is the
-name of the def ("ADD32rr" above). In the general case def names can
-be formed from various kinds of string processing expressions and NAME
-resolves to the final value obtained after resolving all of those
-expressions. The user may refer to NAME anywhere she desires to use
-the ultimate name of the def. NAME should not be defined anywhere
-else in user code to avoid conflict problems.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="running">Running TableGen</a></h3>
-
-<div>
-
-<p>TableGen runs just like any other LLVM tool. The first (optional) argument
-specifies the file to read. If a filename is not specified, <tt>tblgen</tt>
-reads from standard input.</p>
-
-<p>To be useful, one of the <a href="#backends">TableGen backends</a> must be
-used. These backends are selectable on the command line (type '<tt>tblgen
--help</tt>' for a list). For example, to get a list of all of the definitions
-that subclass a particular type (which can be useful for building up an enum
-list of these records), use the <tt>-print-enums</tt> option:</p>
-
-<div class="doc_code">
-<pre>
-$ tblgen X86.td -print-enums -class=Register
-AH, AL, AX, BH, BL, BP, BPL, BX, CH, CL, CX, DH, DI, DIL, DL, DX, EAX, EBP, EBX,
-ECX, EDI, EDX, EFLAGS, EIP, ESI, ESP, FP0, FP1, FP2, FP3, FP4, FP5, FP6, IP,
-MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, R10, R10B, R10D, R10W, R11, R11B, R11D,
-R11W, R12, R12B, R12D, R12W, R13, R13B, R13D, R13W, R14, R14B, R14D, R14W, R15,
-R15B, R15D, R15W, R8, R8B, R8D, R8W, R9, R9B, R9D, R9W, RAX, RBP, RBX, RCX, RDI,
-RDX, RIP, RSI, RSP, SI, SIL, SP, SPL, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
-XMM0, XMM1, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5,
-XMM6, XMM7, XMM8, XMM9,
-
-$ tblgen X86.td -print-enums -class=Instruction
-ABS_F, ABS_Fp32, ABS_Fp64, ABS_Fp80, ADC32mi, ADC32mi8, ADC32mr, ADC32ri,
-ADC32ri8, ADC32rm, ADC32rr, ADC64mi32, ADC64mi8, ADC64mr, ADC64ri32, ADC64ri8,
-ADC64rm, ADC64rr, ADD16mi, ADD16mi8, ADD16mr, ADD16ri, ADD16ri8, ADD16rm,
-ADD16rr, ADD32mi, ADD32mi8, ADD32mr, ADD32ri, ADD32ri8, ADD32rm, ADD32rr,
-ADD64mi32, ADD64mi8, ADD64mr, ADD64ri32, ...
-</pre>
-</div>
-
-<p>The default backend prints out all of the records, as described <a
-href="#example">above</a>.</p>
-
-<p>If you plan to use TableGen, you will most likely have to <a
-href="#backends">write a backend</a> that extracts the information specific to
-what you need and formats it in the appropriate way.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="syntax">TableGen syntax</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>TableGen doesn't care about the meaning of data (that is up to the backend to
-define), but it does care about syntax, and it enforces a simple type system.
-This section describes the syntax and the constructs allowed in a TableGen file.
-</p>
-
-<!-- ======================================================================= -->
-<h3><a name="primitives">TableGen primitives</a></h3>
-
-<div>
-
-<!-- -------------------------------------------------------------------------->
-<h4><a name="comments">TableGen comments</a></h4>
-
-<div>
-
-<p>TableGen supports BCPL style "<tt>//</tt>" comments, which run to the end of
-the line, and it also supports <b>nestable</b> "<tt>/* */</tt>" comments.</p>
-
-</div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="types">The TableGen type system</a>
-</h4>
-
-<div>
-
-<p>TableGen files are strongly typed, in a simple (but complete) type-system.
-These types are used to perform automatic conversions, check for errors, and to
-help interface designers constrain the input that they allow. Every <a
-href="#valuedef">value definition</a> is required to have an associated type.
-</p>
-
-<p>TableGen supports a mixture of very low-level types (such as <tt>bit</tt>)
-and very high-level types (such as <tt>dag</tt>). This flexibility is what
-allows it to describe a wide range of information conveniently and compactly.
-The TableGen types are:</p>
-
-<dl>
-<dt><tt><b>bit</b></tt></dt>
- <dd>A 'bit' is a boolean value that can hold either 0 or 1.</dd>
-
-<dt><tt><b>int</b></tt></dt>
- <dd>The 'int' type represents a simple 32-bit integer value, such as 5.</dd>
-
-<dt><tt><b>string</b></tt></dt>
- <dd>The 'string' type represents an ordered sequence of characters of
- arbitrary length.</dd>
-
-<dt><tt><b>bits</b>&lt;n&gt;</tt></dt>
- <dd>A 'bits' type is an arbitrary, but fixed, size integer that is broken up
- into individual bits. This type is useful because it can handle some bits
- being defined while others are undefined.</dd>
-
-<dt><tt><b>list</b>&lt;ty&gt;</tt></dt>
- <dd>This type represents a list whose elements are some other type. The
- contained type is arbitrary: it can even be another list type.</dd>
-
-<dt>Class type</dt>
- <dd>Specifying a class name in a type context means that the defined value
- must be a subclass of the specified class. This is useful in conjunction with
- the <b><tt>list</tt></b> type, for example, to constrain the elements of the
- list to a common base class (e.g., a <tt><b>list</b>&lt;Register&gt;</tt> can
- only contain definitions derived from the "<tt>Register</tt>" class).</dd>
-
-<dt><tt><b>dag</b></tt></dt>
- <dd>This type represents a nestable directed graph of elements.</dd>
-
-<dt><tt><b>code</b></tt></dt>
- <dd>This represents a big hunk of text. This is lexically distinct from
- string values because it doesn't require escapeing double quotes and other
- common characters that occur in code.</dd>
-</dl>
-
-<p>To date, these types have been sufficient for describing things that
-TableGen has been used for, but it is straight-forward to extend this list if
-needed.</p>
-
-</div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="values">TableGen values and expressions</a>
-</h4>
-
-<div>
-
-<p>TableGen allows for a pretty reasonable number of different expression forms
-when building up values. These forms allow the TableGen file to be written in a
-natural syntax and flavor for the application. The current expression forms
-supported include:</p>
-
-<dl>
-<dt><tt>?</tt></dt>
- <dd>uninitialized field</dd>
-<dt><tt>0b1001011</tt></dt>
- <dd>binary integer value</dd>
-<dt><tt>07654321</tt></dt>
- <dd>octal integer value (indicated by a leading 0)</dd>
-<dt><tt>7</tt></dt>
- <dd>decimal integer value</dd>
-<dt><tt>0x7F</tt></dt>
- <dd>hexadecimal integer value</dd>
-<dt><tt>"foo"</tt></dt>
- <dd>string value</dd>
-<dt><tt>[{ ... }]</tt></dt>
- <dd>code fragment</dd>
-<dt><tt>[ X, Y, Z ]&lt;type&gt;</tt></dt>
- <dd>list value. &lt;type&gt; is the type of the list
-element and is usually optional. In rare cases,
-TableGen is unable to deduce the element type in
-which case the user must specify it explicitly.</dd>
-<dt><tt>{ a, b, c }</tt></dt>
- <dd>initializer for a "bits&lt;3&gt;" value</dd>
-<dt><tt>value</tt></dt>
- <dd>value reference</dd>
-<dt><tt>value{17}</tt></dt>
- <dd>access to one bit of a value</dd>
-<dt><tt>value{15-17}</tt></dt>
- <dd>access to multiple bits of a value</dd>
-<dt><tt>DEF</tt></dt>
- <dd>reference to a record definition</dd>
-<dt><tt>CLASS&lt;val list&gt;</tt></dt>
- <dd>reference to a new anonymous definition of CLASS with the specified
- template arguments.</dd>
-<dt><tt>X.Y</tt></dt>
- <dd>reference to the subfield of a value</dd>
-<dt><tt>list[4-7,17,2-3]</tt></dt>
- <dd>A slice of the 'list' list, including elements 4,5,6,7,17,2, and 3 from
- it. Elements may be included multiple times.</dd>
-<dt><tt>foreach &lt;var&gt; = &lt;list&gt; in { &lt;body&gt; }</tt></dt>
-<dt><tt>foreach &lt;var&gt; = &lt;list&gt; in &lt;def&gt;</tt></dt>
- <dd> Replicate &lt;body&gt; or &lt;def&gt;, replacing instances of
- &lt;var&gt; with each value in &lt;list&gt;. &lt;var&gt; is scoped at the
- level of the <tt>foreach</tt> loop and must not conflict with any other object
- introduced in &lt;body&gt; or &lt;def&gt;. Currently only <tt>def</tt>s are
- expanded within &lt;body&gt;.
- </dd>
-<dt><tt>(DEF a, b)</tt></dt>
- <dd>a dag value. The first element is required to be a record definition, the
- remaining elements in the list may be arbitrary other values, including nested
- `<tt>dag</tt>' values.</dd>
-<dt><tt>!strconcat(a, b)</tt></dt>
- <dd>A string value that is the result of concatenating the 'a' and 'b'
- strings.</dd>
-<dt><tt>str1#str2</tt></dt>
- <dd>"#" (paste) is a shorthand for !strconcat. It may concatenate
- things that are not quoted strings, in which case an implicit
- !cast&lt;string&gt; is done on the operand of the paste.</dd>
-<dt><tt>!cast&lt;type&gt;(a)</tt></dt>
- <dd>A symbol of type <em>type</em> obtained by looking up the string 'a' in
-the symbol table. If the type of 'a' does not match <em>type</em>, TableGen
-aborts with an error. !cast&lt;string&gt; is a special case in that the argument must
-be an object defined by a 'def' construct.</dd>
-<dt><tt>!subst(a, b, c)</tt></dt>
- <dd>If 'a' and 'b' are of string type or are symbol references, substitute
-'b' for 'a' in 'c.' This operation is analogous to $(subst) in GNU make.</dd>
-<dt><tt>!foreach(a, b, c)</tt></dt>
- <dd>For each member 'b' of dag or list 'a' apply operator 'c.' 'b' is a
-dummy variable that should be declared as a member variable of an instantiated
-class. This operation is analogous to $(foreach) in GNU make.</dd>
-<dt><tt>!head(a)</tt></dt>
- <dd>The first element of list 'a.'</dd>
-<dt><tt>!tail(a)</tt></dt>
- <dd>The 2nd-N elements of list 'a.'</dd>
-<dt><tt>!empty(a)</tt></dt>
- <dd>An integer {0,1} indicating whether list 'a' is empty.</dd>
-<dt><tt>!if(a,b,c)</tt></dt>
- <dd>'b' if the result of 'int' or 'bit' operator 'a' is nonzero,
- 'c' otherwise.</dd>
-<dt><tt>!eq(a,b)</tt></dt>
- <dd>'bit 1' if string a is equal to string b, 0 otherwise. This
- only operates on string, int and bit objects. Use !cast&lt;string&gt; to
- compare other types of objects.</dd>
-</dl>
-
-<p>Note that all of the values have rules specifying how they convert to values
-for different types. These rules allow you to assign a value like "<tt>7</tt>"
-to a "<tt>bits&lt;4&gt;</tt>" value, for example.</p>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="classesdefs">Classes and definitions</a>
-</h3>
-
-<div>
-
-<p>As mentioned in the <a href="#concepts">intro</a>, classes and definitions
-(collectively known as 'records') in TableGen are the main high-level unit of
-information that TableGen collects. Records are defined with a <tt>def</tt> or
-<tt>class</tt> keyword, the record name, and an optional list of "<a
-href="#templateargs">template arguments</a>". If the record has superclasses,
-they are specified as a comma separated list that starts with a colon character
-("<tt>:</tt>"). If <a href="#valuedef">value definitions</a> or <a
-href="#recordlet">let expressions</a> are needed for the class, they are
-enclosed in curly braces ("<tt>{}</tt>"); otherwise, the record ends with a
-semicolon.</p>
-
-<p>Here is a simple TableGen file:</p>
-
-<div class="doc_code">
-<pre>
-<b>class</b> C { <b>bit</b> V = 1; }
-<b>def</b> X : C;
-<b>def</b> Y : C {
- <b>string</b> Greeting = "hello";
-}
-</pre>
-</div>
-
-<p>This example defines two definitions, <tt>X</tt> and <tt>Y</tt>, both of
-which derive from the <tt>C</tt> class. Because of this, they both get the
-<tt>V</tt> bit value. The <tt>Y</tt> definition also gets the Greeting member
-as well.</p>
-
-<p>In general, classes are useful for collecting together the commonality
-between a group of records and isolating it in a single place. Also, classes
-permit the specification of default values for their subclasses, allowing the
-subclasses to override them as they wish.</p>
-
-<!---------------------------------------------------------------------------->
-<h4>
- <a name="valuedef">Value definitions</a>
-</h4>
-
-<div>
-
-<p>Value definitions define named entries in records. A value must be defined
-before it can be referred to as the operand for another value definition or
-before the value is reset with a <a href="#recordlet">let expression</a>. A
-value is defined by specifying a <a href="#types">TableGen type</a> and a name.
-If an initial value is available, it may be specified after the type with an
-equal sign. Value definitions require terminating semicolons.</p>
-
-</div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="recordlet">'let' expressions</a>
-</h4>
-
-<div>
-
-<p>A record-level let expression is used to change the value of a value
-definition in a record. This is primarily useful when a superclass defines a
-value that a derived class or definition wants to override. Let expressions
-consist of the '<tt>let</tt>' keyword followed by a value name, an equal sign
-("<tt>=</tt>"), and a new value. For example, a new class could be added to the
-example above, redefining the <tt>V</tt> field for all of its subclasses:</p>
-
-<div class="doc_code">
-<pre>
-<b>class</b> D : C { let V = 0; }
-<b>def</b> Z : D;
-</pre>
-</div>
-
-<p>In this case, the <tt>Z</tt> definition will have a zero value for its "V"
-value, despite the fact that it derives (indirectly) from the <tt>C</tt> class,
-because the <tt>D</tt> class overrode its value.</p>
-
-</div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="templateargs">Class template arguments</a>
-</h4>
-
-<div>
-
-<p>TableGen permits the definition of parameterized classes as well as normal
-concrete classes. Parameterized TableGen classes specify a list of variable
-bindings (which may optionally have defaults) that are bound when used. Here is
-a simple example:</p>
-
-<div class="doc_code">
-<pre>
-<b>class</b> FPFormat&lt;<b>bits</b>&lt;3&gt; val&gt; {
- <b>bits</b>&lt;3&gt; Value = val;
-}
-<b>def</b> NotFP : FPFormat&lt;0&gt;;
-<b>def</b> ZeroArgFP : FPFormat&lt;1&gt;;
-<b>def</b> OneArgFP : FPFormat&lt;2&gt;;
-<b>def</b> OneArgFPRW : FPFormat&lt;3&gt;;
-<b>def</b> TwoArgFP : FPFormat&lt;4&gt;;
-<b>def</b> CompareFP : FPFormat&lt;5&gt;;
-<b>def</b> CondMovFP : FPFormat&lt;6&gt;;
-<b>def</b> SpecialFP : FPFormat&lt;7&gt;;
-</pre>
-</div>
-
-<p>In this case, template arguments are used as a space efficient way to specify
-a list of "enumeration values", each with a "<tt>Value</tt>" field set to the
-specified integer.</p>
-
-<p>The more esoteric forms of <a href="#values">TableGen expressions</a> are
-useful in conjunction with template arguments. As an example:</p>
-
-<div class="doc_code">
-<pre>
-<b>class</b> ModRefVal&lt;<b>bits</b>&lt;2&gt; val&gt; {
- <b>bits</b>&lt;2&gt; Value = val;
-}
-
-<b>def</b> None : ModRefVal&lt;0&gt;;
-<b>def</b> Mod : ModRefVal&lt;1&gt;;
-<b>def</b> Ref : ModRefVal&lt;2&gt;;
-<b>def</b> ModRef : ModRefVal&lt;3&gt;;
-
-<b>class</b> Value&lt;ModRefVal MR&gt; {
- <i>// Decode some information into a more convenient format, while providing
- // a nice interface to the user of the "Value" class.</i>
- <b>bit</b> isMod = MR.Value{0};
- <b>bit</b> isRef = MR.Value{1};
-
- <i>// other stuff...</i>
-}
-
-<i>// Example uses</i>
-<b>def</b> bork : Value&lt;Mod&gt;;
-<b>def</b> zork : Value&lt;Ref&gt;;
-<b>def</b> hork : Value&lt;ModRef&gt;;
-</pre>
-</div>
-
-<p>This is obviously a contrived example, but it shows how template arguments
-can be used to decouple the interface provided to the user of the class from the
-actual internal data representation expected by the class. In this case,
-running <tt>tblgen</tt> on the example prints the following definitions:</p>
-
-<div class="doc_code">
-<pre>
-<b>def</b> bork { <i>// Value</i>
- <b>bit</b> isMod = 1;
- <b>bit</b> isRef = 0;
-}
-<b>def</b> hork { <i>// Value</i>
- <b>bit</b> isMod = 1;
- <b>bit</b> isRef = 1;
-}
-<b>def</b> zork { <i>// Value</i>
- <b>bit</b> isMod = 0;
- <b>bit</b> isRef = 1;
-}
-</pre>
-</div>
-
-<p> This shows that TableGen was able to dig into the argument and extract a
-piece of information that was requested by the designer of the "Value" class.
-For more realistic examples, please see existing users of TableGen, such as the
-X86 backend.</p>
-
-</div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="multiclass">Multiclass definitions and instances</a>
-</h4>
-
-<div>
-
-<p>
-While classes with template arguments are a good way to factor commonality
-between two instances of a definition, multiclasses allow a convenient notation
-for defining multiple definitions at once (instances of implicitly constructed
-classes). For example, consider an 3-address instruction set whose instructions
-come in two forms: "<tt>reg = reg op reg</tt>" and "<tt>reg = reg op imm</tt>"
-(e.g. SPARC). In this case, you'd like to specify in one place that this
-commonality exists, then in a separate place indicate what all the ops are.
-</p>
-
-<p>
-Here is an example TableGen fragment that shows this idea:
-</p>
-
-<div class="doc_code">
-<pre>
-<b>def</b> ops;
-<b>def</b> GPR;
-<b>def</b> Imm;
-<b>class</b> inst&lt;<b>int</b> opc, <b>string</b> asmstr, <b>dag</b> operandlist&gt;;
-
-<b>multiclass</b> ri_inst&lt;<b>int</b> opc, <b>string</b> asmstr&gt; {
- def _rr : inst&lt;opc, !strconcat(asmstr, " $dst, $src1, $src2"),
- (ops GPR:$dst, GPR:$src1, GPR:$src2)&gt;;
- def _ri : inst&lt;opc, !strconcat(asmstr, " $dst, $src1, $src2"),
- (ops GPR:$dst, GPR:$src1, Imm:$src2)&gt;;
-}
-
-<i>// Instantiations of the ri_inst multiclass.</i>
-<b>defm</b> ADD : ri_inst&lt;0b111, "add"&gt;;
-<b>defm</b> SUB : ri_inst&lt;0b101, "sub"&gt;;
-<b>defm</b> MUL : ri_inst&lt;0b100, "mul"&gt;;
-...
-</pre>
-</div>
-
-<p>The name of the resultant definitions has the multidef fragment names
- appended to them, so this defines <tt>ADD_rr</tt>, <tt>ADD_ri</tt>,
- <tt>SUB_rr</tt>, etc. A defm may inherit from multiple multiclasses,
- instantiating definitions from each multiclass. Using a multiclass
- this way is exactly equivalent to instantiating the classes multiple
- times yourself, e.g. by writing:</p>
-
-<div class="doc_code">
-<pre>
-<b>def</b> ops;
-<b>def</b> GPR;
-<b>def</b> Imm;
-<b>class</b> inst&lt;<b>int</b> opc, <b>string</b> asmstr, <b>dag</b> operandlist&gt;;
-
-<b>class</b> rrinst&lt;<b>int</b> opc, <b>string</b> asmstr&gt;
- : inst&lt;opc, !strconcat(asmstr, " $dst, $src1, $src2"),
- (ops GPR:$dst, GPR:$src1, GPR:$src2)&gt;;
-
-<b>class</b> riinst&lt;<b>int</b> opc, <b>string</b> asmstr&gt;
- : inst&lt;opc, !strconcat(asmstr, " $dst, $src1, $src2"),
- (ops GPR:$dst, GPR:$src1, Imm:$src2)&gt;;
-
-<i>// Instantiations of the ri_inst multiclass.</i>
-<b>def</b> ADD_rr : rrinst&lt;0b111, "add"&gt;;
-<b>def</b> ADD_ri : riinst&lt;0b111, "add"&gt;;
-<b>def</b> SUB_rr : rrinst&lt;0b101, "sub"&gt;;
-<b>def</b> SUB_ri : riinst&lt;0b101, "sub"&gt;;
-<b>def</b> MUL_rr : rrinst&lt;0b100, "mul"&gt;;
-<b>def</b> MUL_ri : riinst&lt;0b100, "mul"&gt;;
-...
-</pre>
-</div>
-
-<p>
-A defm can also be used inside a multiclass providing several levels of
-multiclass instanciations.
-</p>
-
-<div class="doc_code">
-<pre>
-<b>class</b> Instruction&lt;bits&lt;4&gt; opc, string Name&gt; {
- bits&lt;4&gt; opcode = opc;
- string name = Name;
-}
-
-<b>multiclass</b> basic_r&lt;bits&lt;4&gt; opc&gt; {
- <b>def</b> rr : Instruction&lt;opc, "rr"&gt;;
- <b>def</b> rm : Instruction&lt;opc, "rm"&gt;;
-}
-
-<b>multiclass</b> basic_s&lt;bits&lt;4&gt; opc&gt; {
- <b>defm</b> SS : basic_r&lt;opc&gt;;
- <b>defm</b> SD : basic_r&lt;opc&gt;;
- <b>def</b> X : Instruction&lt;opc, "x"&gt;;
-}
-
-<b>multiclass</b> basic_p&lt;bits&lt;4&gt; opc&gt; {
- <b>defm</b> PS : basic_r&lt;opc&gt;;
- <b>defm</b> PD : basic_r&lt;opc&gt;;
- <b>def</b> Y : Instruction&lt;opc, "y"&gt;;
-}
-
-<b>defm</b> ADD : basic_s&lt;0xf&gt;, basic_p&lt;0xf&gt;;
-...
-
-<i>// Results</i>
-<b>def</b> ADDPDrm { ...
-<b>def</b> ADDPDrr { ...
-<b>def</b> ADDPSrm { ...
-<b>def</b> ADDPSrr { ...
-<b>def</b> ADDSDrm { ...
-<b>def</b> ADDSDrr { ...
-<b>def</b> ADDY { ...
-<b>def</b> ADDX { ...
-</pre>
-</div>
-
-<p>
-defm declarations can inherit from classes too, the
-rule to follow is that the class list must start after the
-last multiclass, and there must be at least one multiclass
-before them.
-</p>
-
-<div class="doc_code">
-<pre>
-<b>class</b> XD { bits&lt;4&gt; Prefix = 11; }
-<b>class</b> XS { bits&lt;4&gt; Prefix = 12; }
-
-<b>class</b> I&lt;bits<4&gt; op> {
- bits&lt;4&gt; opcode = op;
-}
-
-<b>multiclass</b> R {
- <b>def</b> rr : I&lt;4&gt;;
- <b>def</b> rm : I&lt;2&gt;;
-}
-
-<b>multiclass</b> Y {
- <b>defm</b> SS : R, XD;
- <b>defm</b> SD : R, XS;
-}
-
-<b>defm</b> Instr : Y;
-
-<i>// Results</i>
-<b>def</b> InstrSDrm {
- bits&lt;4&gt; opcode = { 0, 0, 1, 0 };
- bits&lt;4&gt; Prefix = { 1, 1, 0, 0 };
-}
-...
-<b>def</b> InstrSSrr {
- bits&lt;4&gt; opcode = { 0, 1, 0, 0 };
- bits&lt;4&gt; Prefix = { 1, 0, 1, 1 };
-}
-</pre>
-</div>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="filescope">File scope entities</a>
-</h3>
-
-<div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="include">File inclusion</a>
-</h4>
-
-<div>
-<p>TableGen supports the '<tt>include</tt>' token, which textually substitutes
-the specified file in place of the include directive. The filename should be
-specified as a double quoted string immediately after the '<tt>include</tt>'
-keyword. Example:</p>
-
-<div class="doc_code">
-<pre>
-<b>include</b> "foo.td"
-</pre>
-</div>
-
-</div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="globallet">'let' expressions</a>
-</h4>
-
-<div>
-
-<p>"Let" expressions at file scope are similar to <a href="#recordlet">"let"
-expressions within a record</a>, except they can specify a value binding for
-multiple records at a time, and may be useful in certain other cases.
-File-scope let expressions are really just another way that TableGen allows the
-end-user to factor out commonality from the records.</p>
-
-<p>File-scope "let" expressions take a comma-separated list of bindings to
-apply, and one or more records to bind the values in. Here are some
-examples:</p>
-
-<div class="doc_code">
-<pre>
-<b>let</b> isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 <b>in</b>
- <b>def</b> RET : I&lt;0xC3, RawFrm, (outs), (ins), "ret", [(X86retflag 0)]&gt;;
-
-<b>let</b> isCall = 1 <b>in</b>
- <i>// All calls clobber the non-callee saved registers...</i>
- <b>let</b> Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, EFLAGS] <b>in</b> {
- <b>def</b> CALLpcrel32 : Ii32&lt;0xE8, RawFrm, (outs), (ins i32imm:$dst,variable_ops),
- "call\t${dst:call}", []&gt;;
- <b>def</b> CALL32r : I&lt;0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
- "call\t{*}$dst", [(X86call GR32:$dst)]&gt;;
- <b>def</b> CALL32m : I&lt;0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
- "call\t{*}$dst", []&gt;;
- }
-</pre>
-</div>
-
-<p>File-scope "let" expressions are often useful when a couple of definitions
-need to be added to several records, and the records do not otherwise need to be
-opened, as in the case with the <tt>CALL*</tt> instructions above.</p>
-
-<p>It's also possible to use "let" expressions inside multiclasses, providing
-more ways to factor out commonality from the records, specially if using
-several levels of multiclass instanciations. This also avoids the need of using
-"let" expressions within subsequent records inside a multiclass.</p>
-
-<pre class="doc_code">
-<b>multiclass </b>basic_r&lt;bits&lt;4&gt; opc&gt; {
- <b>let </b>Predicates = [HasSSE2] in {
- <b>def </b>rr : Instruction&lt;opc, "rr"&gt;;
- <b>def </b>rm : Instruction&lt;opc, "rm"&gt;;
- }
- <b>let </b>Predicates = [HasSSE3] in
- <b>def </b>rx : Instruction&lt;opc, "rx"&gt;;
-}
-
-<b>multiclass </b>basic_ss&lt;bits&lt;4&gt; opc&gt; {
- <b>let </b>IsDouble = 0 in
- <b>defm </b>SS : basic_r&lt;opc&gt;;
-
- <b>let </b>IsDouble = 1 in
- <b>defm </b>SD : basic_r&lt;opc&gt;;
-}
-
-<b>defm </b>ADD : basic_ss&lt;0xf&gt;;
-</pre>
-</div>
-
-<!-- -------------------------------------------------------------------------->
-<h4>
- <a name="foreach">Looping</a>
-</h4>
-
-<div>
-<p>TableGen supports the '<tt>foreach</tt>' block, which textually replicates
-the loop body, substituting iterator values for iterator references in the
-body. Example:</p>
-
-<div class="doc_code">
-<pre>
-<b>foreach</b> i = [0, 1, 2, 3] in {
- <b>def</b> R#i : Register&lt;...&gt;;
- <b>def</b> F#i : Register&lt;...&gt;;
-}
-</pre>
-</div>
-
-<p>This will create objects <tt>R0</tt>, <tt>R1</tt>, <tt>R2</tt> and
-<tt>R3</tt>. <tt>foreach</tt> blocks may be nested. If there is only
-one item in the body the braces may be elided:</p>
-
-<div class="doc_code">
-<pre>
-<b>foreach</b> i = [0, 1, 2, 3] in
- <b>def</b> R#i : Register&lt;...&gt;;
-
-</pre>
-</div>
-
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="codegen">Code Generator backend info</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Expressions used by code generator to describe instructions and isel
-patterns:</p>
-
-<dl>
-<dt><tt>(implicit a)</tt></dt>
- <dd>an implicitly defined physical register. This tells the dag instruction
- selection emitter the input pattern's extra definitions matches implicit
- physical register definitions.</dd>
-</dl>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="backends">TableGen backends</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>TODO: How they work, how to write one. This section should not contain
-details about any particular backend, except maybe -print-enums as an example.
-This should highlight the APIs in <tt>TableGen/Record.h</tt>.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-03-27 13:25:16 +0200 (Tue, 27 Mar 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/TableGenFundamentals.rst b/docs/TableGenFundamentals.rst
new file mode 100644
index 0000000..bfb2618
--- /dev/null
+++ b/docs/TableGenFundamentals.rst
@@ -0,0 +1,799 @@
+.. _tablegen:
+
+=====================
+TableGen Fundamentals
+=====================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+TableGen's purpose is to help a human develop and maintain records of
+domain-specific information. Because there may be a large number of these
+records, it is specifically designed to allow writing flexible descriptions and
+for common features of these records to be factored out. This reduces the
+amount of duplication in the description, reduces the chance of error, and makes
+it easier to structure domain specific information.
+
+The core part of TableGen `parses a file`_, instantiates the declarations, and
+hands the result off to a domain-specific `TableGen backend`_ for processing.
+The current major user of TableGen is the `LLVM code
+generator <CodeGenerator.html>`_.
+
+Note that if you work on TableGen much and use emacs or vim, you can find an
+emacs "TableGen mode" and a vim language file in the ``llvm/utils/emacs`` and
+``llvm/utils/vim`` directories of your LLVM distribution, respectively.
+
+.. _intro:
+
+Basic concepts
+--------------
+
+TableGen files consist of two key parts: 'classes' and 'definitions', both of
+which are considered 'records'.
+
+**TableGen records** have a unique name, a list of values, and a list of
+superclasses. The list of values is the main data that TableGen builds for each
+record; it is this that holds the domain specific information for the
+application. The interpretation of this data is left to a specific `TableGen
+backend`_, but the structure and format rules are taken care of and are fixed by
+TableGen.
+
+**TableGen definitions** are the concrete form of 'records'. These generally do
+not have any undefined values, and are marked with the '``def``' keyword.
+
+**TableGen classes** are abstract records that are used to build and describe
+other records. These 'classes' allow the end-user to build abstractions for
+either the domain they are targeting (such as "Register", "RegisterClass", and
+"Instruction" in the LLVM code generator) or for the implementor to help factor
+out common properties of records (such as "FPInst", which is used to represent
+floating point instructions in the X86 backend). TableGen keeps track of all of
+the classes that are used to build up a definition, so the backend can find all
+definitions of a particular class, such as "Instruction".
+
+**TableGen multiclasses** are groups of abstract records that are instantiated
+all at once. Each instantiation can result in multiple TableGen definitions.
+If a multiclass inherits from another multiclass, the definitions in the
+sub-multiclass become part of the current multiclass, as if they were declared
+in the current multiclass.
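+
+As a minimal sketch (the multiclass and record names here are made up), a
+multiclass that stamps out two definitions per instantiation looks like:
+
+.. code-block:: llvm
+
+  multiclass MyPair {
+    def _a;  // instantiated as <defm name>_a
+    def _b;  // instantiated as <defm name>_b
+  }
+  defm Foo : MyPair;  // creates the records Foo_a and Foo_b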
+
+.. _described above:
+
+An example record
+-----------------
+
+With no other arguments, TableGen parses the specified file and prints out all
+of the classes, then all of the definitions. This is a good way to see what the
+various definitions expand to fully. Running this on the ``X86.td`` file prints
+this (at the time of this writing):
+
+.. code-block:: llvm
+
+ ...
+ def ADD32rr { // Instruction X86Inst I
+ string Namespace = "X86";
+ dag OutOperandList = (outs GR32:$dst);
+ dag InOperandList = (ins GR32:$src1, GR32:$src2);
+ string AsmString = "add{l}\t{$src2, $dst|$dst, $src2}";
+ list<dag> Pattern = [(set GR32:$dst, (add GR32:$src1, GR32:$src2))];
+ list<Register> Uses = [];
+ list<Register> Defs = [EFLAGS];
+ list<Predicate> Predicates = [];
+ int CodeSize = 3;
+ int AddedComplexity = 0;
+ bit isReturn = 0;
+ bit isBranch = 0;
+ bit isIndirectBranch = 0;
+ bit isBarrier = 0;
+ bit isCall = 0;
+ bit canFoldAsLoad = 0;
+ bit mayLoad = 0;
+ bit mayStore = 0;
+ bit isImplicitDef = 0;
+ bit isConvertibleToThreeAddress = 1;
+ bit isCommutable = 1;
+ bit isTerminator = 0;
+ bit isReMaterializable = 0;
+ bit isPredicable = 0;
+ bit hasDelaySlot = 0;
+ bit usesCustomInserter = 0;
+ bit hasCtrlDep = 0;
+ bit isNotDuplicable = 0;
+ bit hasSideEffects = 0;
+ bit neverHasSideEffects = 0;
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "";
+ string DisableEncoding = "";
+ bits<8> Opcode = { 0, 0, 0, 0, 0, 0, 0, 1 };
+ Format Form = MRMDestReg;
+ bits<6> FormBits = { 0, 0, 0, 0, 1, 1 };
+ ImmType ImmT = NoImm;
+ bits<3> ImmTypeBits = { 0, 0, 0 };
+ bit hasOpSizePrefix = 0;
+ bit hasAdSizePrefix = 0;
+ bits<4> Prefix = { 0, 0, 0, 0 };
+ bit hasREX_WPrefix = 0;
+ FPFormat FPForm = ?;
+ bits<3> FPFormBits = { 0, 0, 0 };
+ }
+ ...
+
+This definition corresponds to a 32-bit register-register add instruction on
+X86. The string after the '``def``' string indicates the name of the
+record---"``ADD32rr``" in this case---and the comment at the end of the line
+indicates the superclasses of the definition. The body of the record contains
+all of the data that TableGen assembled for the record, indicating that the
+instruction is part of the "X86" namespace, the pattern indicating how the
+instruction should be emitted into the assembly file, that it is a two-address
+instruction, has a particular encoding, etc. The contents and semantics of the
+information in the record are specific to the needs of the X86 backend, and are
+only shown as an example.
+
+As you can see, a lot of information is needed for every instruction supported
+by the code generator, and specifying it all manually would be unmaintainable,
+prone to bugs, and tiring to do in the first place. Because we are using
+TableGen, all of the information was derived from the following definition:
+
+.. code-block:: llvm
+
+ let Defs = [EFLAGS],
+ isCommutable = 1, // X = ADD Y,Z --> X = ADD Z,Y
+ isConvertibleToThreeAddress = 1 in // Can transform into LEA.
+ def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, GR32:$src2))]>;
+
+This definition makes use of the custom class ``I`` (extended from the custom
+class ``X86Inst``), which is defined in the X86-specific TableGen file, to
+factor out the common features that instructions of its class share. A key
+feature of TableGen is that it allows the end-user to define the abstractions
+they prefer to use when describing their information.
+
+Each def record has a special entry called "``NAME``." This is the name of the
+def ("``ADD32rr``" above). In the general case def names can be formed from
+various kinds of string processing expressions and ``NAME`` resolves to the
+final value obtained after resolving all of those expressions. The user may
+refer to ``NAME`` anywhere she desires to use the ultimate name of the def.
+``NAME`` should not be defined anywhere else in user code to avoid conflict
+problems.
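+
+As a minimal sketch (the record and field names here are made up):
+
+.. code-block:: llvm
+
+  def MyOp {
+    string SelfName = NAME;  // resolves to "MyOp", the final name of this def
+  }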
+
+Running TableGen
+----------------
+
+TableGen runs just like any other LLVM tool. The first (optional) argument
+specifies the file to read. If a filename is not specified, ``llvm-tblgen``
+reads from standard input.
+
+To be useful, one of the `TableGen backends`_ must be used. These backends are
+selectable on the command line (type '``llvm-tblgen -help``' for a list). For
+example, to get a list of all of the definitions that subclass a particular type
+(which can be useful for building up an enum list of these records), use the
+``-print-enums`` option:
+
+.. code-block:: bash
+
+ $ llvm-tblgen X86.td -print-enums -class=Register
+ AH, AL, AX, BH, BL, BP, BPL, BX, CH, CL, CX, DH, DI, DIL, DL, DX, EAX, EBP, EBX,
+ ECX, EDI, EDX, EFLAGS, EIP, ESI, ESP, FP0, FP1, FP2, FP3, FP4, FP5, FP6, IP,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, R10, R10B, R10D, R10W, R11, R11B, R11D,
+ R11W, R12, R12B, R12D, R12W, R13, R13B, R13D, R13W, R14, R14B, R14D, R14W, R15,
+ R15B, R15D, R15W, R8, R8B, R8D, R8W, R9, R9B, R9D, R9W, RAX, RBP, RBX, RCX, RDI,
+ RDX, RIP, RSI, RSP, SI, SIL, SP, SPL, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+ XMM0, XMM1, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5,
+ XMM6, XMM7, XMM8, XMM9,
+
+ $ llvm-tblgen X86.td -print-enums -class=Instruction
+ ABS_F, ABS_Fp32, ABS_Fp64, ABS_Fp80, ADC32mi, ADC32mi8, ADC32mr, ADC32ri,
+ ADC32ri8, ADC32rm, ADC32rr, ADC64mi32, ADC64mi8, ADC64mr, ADC64ri32, ADC64ri8,
+ ADC64rm, ADC64rr, ADD16mi, ADD16mi8, ADD16mr, ADD16ri, ADD16ri8, ADD16rm,
+ ADD16rr, ADD32mi, ADD32mi8, ADD32mr, ADD32ri, ADD32ri8, ADD32rm, ADD32rr,
+ ADD64mi32, ADD64mi8, ADD64mr, ADD64ri32, ...
+
+The default backend prints out all of the records, as `described above`_.
+
+If you plan to use TableGen, you will most likely have to `write a backend`_
+that extracts the information specific to what you need and formats it in the
+appropriate way.
+
+.. _parses a file:
+
+TableGen syntax
+===============
+
+TableGen doesn't care about the meaning of data (that is up to the backend to
+define), but it does care about syntax, and it enforces a simple type system.
+This section describes the syntax and the constructs allowed in a TableGen file.
+
+TableGen primitives
+-------------------
+
+TableGen comments
+^^^^^^^^^^^^^^^^^
+
+TableGen supports BCPL style "``//``" comments, which run to the end of the
+line, and it also supports **nestable** "``/* */``" comments.
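+
+For example, both comment forms are legal, and block comments may nest:
+
+.. code-block:: llvm
+
+  // This comment runs to the end of the line.
+  /* A block comment /* containing a nested block comment */ ends here. */
+  def X;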
+
+.. _TableGen type:
+
+The TableGen type system
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+TableGen files are strongly typed, in a simple (but complete) type system.
+These types are used to perform automatic conversions, check for errors, and
+help interface designers constrain the input that they allow. Every `value
+definition`_ is required to have an associated type.
+
+TableGen supports a mixture of very low-level types (such as ``bit``) and very
+high-level types (such as ``dag``). This flexibility is what allows it to
+describe a wide range of information conveniently and compactly. The TableGen
+types are:
+
+``bit``
+ A 'bit' is a boolean value that can hold either 0 or 1.
+
+``int``
+ The 'int' type represents a simple 32-bit integer value, such as 5.
+
+``string``
+ The 'string' type represents an ordered sequence of characters of arbitrary
+ length.
+
+``bits<n>``
+  A 'bits' type is an integer of arbitrary, but fixed, size that is broken up
+  into individual bits. This type is useful because it can handle some bits
+  being defined while others are undefined.
+
+``list<ty>``
+ This type represents a list whose elements are some other type. The
+ contained type is arbitrary: it can even be another list type.
+
+Class type
+ Specifying a class name in a type context means that the defined value must
+ be a subclass of the specified class. This is useful in conjunction with
+ the ``list`` type, for example, to constrain the elements of the list to a
+ common base class (e.g., a ``list<Register>`` can only contain definitions
+ derived from the "``Register``" class).
+
+``dag``
+ This type represents a nestable directed graph of elements.
+
+``code``
+ This represents a big hunk of text. This is lexically distinct from string
+ values because it doesn't require escaping double quotes and other common
+ characters that occur in code.
+
+To date, these types have been sufficient for describing things that TableGen
+has been used for, but it is straightforward to extend this list if needed.
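+
+As an illustration (the class and field names are made up), a record using
+several of these types might look like:
+
+.. code-block:: llvm
+
+  class Example {
+    bit Flag = 1;                     // a boolean bit
+    int Size = 42;                    // a 32-bit integer
+    string Name = "example";          // an arbitrary-length string
+    bits<4> Opcode = { 0, 1, 0, 1 };  // four individually accessible bits
+    list<int> Values = [1, 2, 3];     // a list of ints
+  }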
+
+.. _TableGen expressions:
+
+TableGen values and expressions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+TableGen allows for a pretty reasonable number of different expression forms
+when building up values. These forms allow the TableGen file to be written in a
+natural syntax and flavor for the application. The current expression forms
+supported include:
+
+``?``
+ uninitialized field
+
+``0b1001011``
+ binary integer value
+
+``07654321``
+ octal integer value (indicated by a leading 0)
+
+``7``
+ decimal integer value
+
+``0x7F``
+ hexadecimal integer value
+
+``"foo"``
+ string value
+
+``[{ ... }]``
+ code fragment
+
+``[ X, Y, Z ]<type>``
+ list value. <type> is the type of the list element and is usually optional.
+ In rare cases, TableGen is unable to deduce the element type in which case
+ the user must specify it explicitly.
+
+``{ a, b, c }``
+ initializer for a "bits<3>" value
+
+``value``
+ value reference
+
+``value{17}``
+ access to one bit of a value
+
+``value{15-17}``
+ access to multiple bits of a value
+
+``DEF``
+ reference to a record definition
+
+``CLASS<val list>``
+ reference to a new anonymous definition of CLASS with the specified template
+ arguments.
+
+``X.Y``
+ reference to the subfield of a value
+
+``list[4-7,17,2-3]``
+ A slice of the 'list' list, including elements 4,5,6,7,17,2, and 3 from it.
+ Elements may be included multiple times.
+
+``foreach <var> = [ <list> ] in { <body> }``
+
+``foreach <var> = [ <list> ] in <def>``
+ Replicate <body> or <def>, replacing instances of <var> with each value
+ in <list>. <var> is scoped at the level of the ``foreach`` loop and must
+ not conflict with any other object introduced in <body> or <def>. Currently
+ only ``def``\s are expanded within <body>.
+
+``foreach <var> = 0-15 in ...``
+
+``foreach <var> = {0-15,32-47} in ...``
+  Loop over ranges of integers. The braces are required for multiple ranges
+  (see the sketch after this list).
+
+``(DEF a, b)``
+  a dag value. The first element is required to be a record definition, the
+  remaining elements in the list may be arbitrary other values, including
+  nested ``dag`` values.
+
+``!strconcat(a, b)``
+ A string value that is the result of concatenating the 'a' and 'b' strings.
+
+``str1#str2``
+ "#" (paste) is a shorthand for !strconcat. It may concatenate things that
+ are not quoted strings, in which case an implicit !cast<string> is done on
+ the operand of the paste.
+
+``!cast<type>(a)``
+ A symbol of type *type* obtained by looking up the string 'a' in the symbol
+ table. If the type of 'a' does not match *type*, TableGen aborts with an
+ error. !cast<string> is a special case in that the argument must be an
+ object defined by a 'def' construct.
+
+``!subst(a, b, c)``
+ If 'a' and 'b' are of string type or are symbol references, substitute 'b'
+ for 'a' in 'c.' This operation is analogous to $(subst) in GNU make.
+
+``!foreach(a, b, c)``
+ For each member 'b' of dag or list 'a' apply operator 'c.' 'b' is a dummy
+ variable that should be declared as a member variable of an instantiated
+ class. This operation is analogous to $(foreach) in GNU make.
+
+``!head(a)``
+ The first element of list 'a.'
+
+``!tail(a)``
+ The 2nd-N elements of list 'a.'
+
+``!empty(a)``
+ An integer {0,1} indicating whether list 'a' is empty.
+
+``!if(a,b,c)``
+ 'b' if the result of 'int' or 'bit' operator 'a' is nonzero, 'c' otherwise.
+
+``!eq(a,b)``
+ 'bit 1' if string a is equal to string b, 0 otherwise. This only operates
+ on string, int and bit objects. Use !cast<string> to compare other types of
+ objects.
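+
+As a sketch of the ``foreach`` range forms mentioned above (the def names are
+made up):
+
+.. code-block:: llvm
+
+  foreach i = 0-3 in
+    def Reg#i;    // defines Reg0, Reg1, Reg2, and Reg3
+
+  foreach j = {0-1,6-7} in
+    def Unit#j;   // defines Unit0, Unit1, Unit6, and Unit7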
+
+Note that all of the values have rules specifying how they convert to values
+for different types. These rules allow you to assign a value like "``7``"
+to a "``bits<4>``" value, for example.
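+
+For instance, in this made-up record the decimal integer 7 is converted to a
+four-bit value:
+
+.. code-block:: llvm
+
+  class Widget {
+    bits<4> Mask = 7;  // stored as the bits { 0, 1, 1, 1 }
+  }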
+
+Classes and definitions
+-----------------------
+
+As mentioned in the `intro`_, classes and definitions (collectively known as
+'records') in TableGen are the main high-level unit of information that TableGen
+collects. Records are defined with a ``def`` or ``class`` keyword, the record
+name, and an optional list of "`template arguments`_". If the record has
+superclasses, they are specified as a comma-separated list that starts with a
+colon character ("``:``"). If `value definitions`_ or `let expressions`_ are
+needed for the class, they are enclosed in curly braces ("``{}``"); otherwise,
+the record ends with a semicolon.
+
+Here is a simple TableGen file:
+
+.. code-block:: llvm
+
+ class C { bit V = 1; }
+ def X : C;
+ def Y : C {
+ string Greeting = "hello";
+ }
+
+This example defines two definitions, ``X`` and ``Y``, both of which derive from
+the ``C`` class. Because of this, they both get the ``V`` bit value. The ``Y``
+definition also gets the ``Greeting`` member.
+
+In general, classes are useful for collecting the commonality between a group
+of records and isolating it in a single place. Also, classes permit the
+specification of default values for their subclasses, allowing the subclasses
+to override them as they wish.
+
+.. _value definition:
+.. _value definitions:
+
+Value definitions
+^^^^^^^^^^^^^^^^^
+
+Value definitions define named entries in records. A value must be defined
+before it can be referred to as the operand for another value definition or
+before the value is reset with a `let expression`_. A value is defined by
+specifying a `TableGen type`_ and a name. If an initial value is available, it
+may be specified after the type with an equal sign. Value definitions require
+terminating semicolons.
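+
+A brief sketch of the forms a value definition can take (the names are made
+up):
+
+.. code-block:: llvm
+
+  class Datum {
+    int A;      // declared with no initial value
+    int B = 5;  // declared with an initial value
+    int C = B;  // refers to B, which was defined earlier in the record
+  }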
+
+.. _let expression:
+.. _let expressions:
+.. _"let" expressions within a record:
+
+'let' expressions
+^^^^^^^^^^^^^^^^^
+
+A record-level let expression is used to change the value of a value definition
+in a record. This is primarily useful when a superclass defines a value that a
+derived class or definition wants to override. Let expressions consist of the
+'``let``' keyword followed by a value name, an equal sign ("``=``"), and a new
+value. For example, a new class could be added to the example above, redefining
+the ``V`` field for all of its subclasses:
+
+.. code-block:: llvm
+
+ class D : C { let V = 0; }
+ def Z : D;
+
+In this case, the ``Z`` definition will have a zero value for its ``V`` value,
+despite the fact that it derives (indirectly) from the ``C`` class, because the
+``D`` class overrode its value.
+
+.. _template arguments:
+
+Class template arguments
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+TableGen permits the definition of parameterized classes as well as normal
+concrete classes. Parameterized TableGen classes specify a list of variable
+bindings (which may optionally have defaults) that are bound when used. Here is
+a simple example:
+
+.. code-block:: llvm
+
+ class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+ }
+ def NotFP : FPFormat<0>;
+ def ZeroArgFP : FPFormat<1>;
+ def OneArgFP : FPFormat<2>;
+ def OneArgFPRW : FPFormat<3>;
+ def TwoArgFP : FPFormat<4>;
+ def CompareFP : FPFormat<5>;
+ def CondMovFP : FPFormat<6>;
+ def SpecialFP : FPFormat<7>;
+
+In this case, template arguments are used as a space-efficient way to specify a
+list of "enumeration values", each with a "``Value``" field set to the specified
+integer.
+
+The more esoteric forms of `TableGen expressions`_ are useful in conjunction
+with template arguments. As an example:
+
+.. code-block:: llvm
+
+ class ModRefVal<bits<2> val> {
+ bits<2> Value = val;
+ }
+
+ def None : ModRefVal<0>;
+ def Mod : ModRefVal<1>;
+ def Ref : ModRefVal<2>;
+ def ModRef : ModRefVal<3>;
+
+ class Value<ModRefVal MR> {
+ // Decode some information into a more convenient format, while providing
+ // a nice interface to the user of the "Value" class.
+ bit isMod = MR.Value{0};
+ bit isRef = MR.Value{1};
+
+ // other stuff...
+ }
+
+ // Example uses
+ def bork : Value<Mod>;
+ def zork : Value<Ref>;
+ def hork : Value<ModRef>;
+
+This is obviously a contrived example, but it shows how template arguments can
+be used to decouple the interface provided to the user of the class from the
+actual internal data representation expected by the class. In this case,
+running ``llvm-tblgen`` on the example prints the following definitions:
+
+.. code-block:: llvm
+
+ def bork { // Value
+ bit isMod = 1;
+ bit isRef = 0;
+ }
+ def hork { // Value
+ bit isMod = 1;
+ bit isRef = 1;
+ }
+ def zork { // Value
+ bit isMod = 0;
+ bit isRef = 1;
+ }
+
+This shows that TableGen was able to dig into the argument and extract a piece
+of information that was requested by the designer of the "Value" class. For
+more realistic examples, please see existing users of TableGen, such as the X86
+backend.
+
+Multiclass definitions and instances
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+While classes with template arguments are a good way to factor commonality
+between two instances of a definition, multiclasses allow a convenient notation
+for defining multiple definitions at once (instances of implicitly constructed
+classes). For example, consider a 3-address instruction set whose instructions
+come in two forms: "``reg = reg op reg``" and "``reg = reg op imm``"
+(e.g. SPARC). In this case, you'd like to specify in one place that this
+commonality exists, then in a separate place indicate what all the ops are.
+
+Here is an example TableGen fragment that shows this idea:
+
+.. code-block:: llvm
+
+ def ops;
+ def GPR;
+ def Imm;
+ class inst<int opc, string asmstr, dag operandlist>;
+
+ multiclass ri_inst<int opc, string asmstr> {
+ def _rr : inst<opc, !strconcat(asmstr, " $dst, $src1, $src2"),
+ (ops GPR:$dst, GPR:$src1, GPR:$src2)>;
+ def _ri : inst<opc, !strconcat(asmstr, " $dst, $src1, $src2"),
+ (ops GPR:$dst, GPR:$src1, Imm:$src2)>;
+ }
+
+ // Instantiations of the ri_inst multiclass.
+ defm ADD : ri_inst<0b111, "add">;
+ defm SUB : ri_inst<0b101, "sub">;
+ defm MUL : ri_inst<0b100, "mul">;
+ ...
+
+The names of the resultant definitions have the multiclass's def fragment names
+appended to them, so this defines ``ADD_rr``, ``ADD_ri``, ``SUB_rr``, etc. A
+``defm`` may inherit from multiple multiclasses, instantiating definitions from
+each multiclass. Using a multiclass this way is exactly equivalent to
+instantiating the classes multiple times yourself, e.g. by writing:
+
+.. code-block:: llvm
+
+ def ops;
+ def GPR;
+ def Imm;
+ class inst<int opc, string asmstr, dag operandlist>;
+
+ class rrinst<int opc, string asmstr>
+ : inst<opc, !strconcat(asmstr, " $dst, $src1, $src2"),
+ (ops GPR:$dst, GPR:$src1, GPR:$src2)>;
+
+ class riinst<int opc, string asmstr>
+ : inst<opc, !strconcat(asmstr, " $dst, $src1, $src2"),
+ (ops GPR:$dst, GPR:$src1, Imm:$src2)>;
+
+ // Instantiations of the ri_inst multiclass.
+ def ADD_rr : rrinst<0b111, "add">;
+ def ADD_ri : riinst<0b111, "add">;
+ def SUB_rr : rrinst<0b101, "sub">;
+ def SUB_ri : riinst<0b101, "sub">;
+ def MUL_rr : rrinst<0b100, "mul">;
+ def MUL_ri : riinst<0b100, "mul">;
+ ...
+
+A ``defm`` can also be used inside a multiclass, providing several levels of
+multiclass instantiation.
+
+.. code-block:: llvm
+
+ class Instruction<bits<4> opc, string Name> {
+ bits<4> opcode = opc;
+ string name = Name;
+ }
+
+ multiclass basic_r<bits<4> opc> {
+ def rr : Instruction<opc, "rr">;
+ def rm : Instruction<opc, "rm">;
+ }
+
+ multiclass basic_s<bits<4> opc> {
+ defm SS : basic_r<opc>;
+ defm SD : basic_r<opc>;
+ def X : Instruction<opc, "x">;
+ }
+
+ multiclass basic_p<bits<4> opc> {
+ defm PS : basic_r<opc>;
+ defm PD : basic_r<opc>;
+ def Y : Instruction<opc, "y">;
+ }
+
+ defm ADD : basic_s<0xf>, basic_p<0xf>;
+ ...
+
+ // Results
+ def ADDPDrm { ...
+ def ADDPDrr { ...
+ def ADDPSrm { ...
+ def ADDPSrr { ...
+  def ADDSDrm { ...
+  def ADDSDrr { ...
+  def ADDSSrm { ...
+  def ADDSSrr { ...
+  def ADDX { ...
+  def ADDY { ...
+
+``defm`` declarations can inherit from classes too. The rule to follow is that
+the class list must start after the last multiclass, and there must be at least
+one multiclass before it.
+
+.. code-block:: llvm
+
+ class XD { bits<4> Prefix = 11; }
+ class XS { bits<4> Prefix = 12; }
+
+ class I<bits<4> op> {
+ bits<4> opcode = op;
+ }
+
+ multiclass R {
+ def rr : I<4>;
+ def rm : I<2>;
+ }
+
+ multiclass Y {
+ defm SS : R, XD;
+ defm SD : R, XS;
+ }
+
+ defm Instr : Y;
+
+ // Results
+ def InstrSDrm {
+ bits<4> opcode = { 0, 0, 1, 0 };
+ bits<4> Prefix = { 1, 1, 0, 0 };
+ }
+ ...
+ def InstrSSrr {
+ bits<4> opcode = { 0, 1, 0, 0 };
+ bits<4> Prefix = { 1, 0, 1, 1 };
+ }
+
+File scope entities
+-------------------
+
+File inclusion
+^^^^^^^^^^^^^^
+
+TableGen supports the '``include``' token, which textually substitutes the
+specified file in place of the include directive. The filename should be
+specified as a double quoted string immediately after the '``include``' keyword.
+Example:
+
+.. code-block:: llvm
+
+ include "foo.td"
+
+'let' expressions
+^^^^^^^^^^^^^^^^^
+
+"Let" expressions at file scope are similar to `"let" expressions within a
+record`_, except they can specify a value binding for multiple records at a
+time, and may be useful in certain other cases. File-scope let expressions are
+really just another way that TableGen allows the end-user to factor out
+commonality from the records.
+
+File-scope "let" expressions take a comma-separated list of bindings to apply,
+and one or more records to bind the values in. Here are some examples:
+
+.. code-block:: llvm
+
+ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in
+ def RET : I<0xC3, RawFrm, (outs), (ins), "ret", [(X86retflag 0)]>;
+
+ let isCall = 1 in
+ // All calls clobber the non-callee saved registers...
+ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, EFLAGS] in {
+ def CALLpcrel32 : Ii32<0xE8, RawFrm, (outs), (ins i32imm:$dst,variable_ops),
+ "call\t${dst:call}", []>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call GR32:$dst)]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
+ "call\t{*}$dst", []>;
+ }
+
+File-scope "let" expressions are often useful when a couple of definitions need
+to be added to several records, and the records do not otherwise need to be
+opened, as is the case with the ``CALL*`` instructions above.
+
+It's also possible to use "let" expressions inside multiclasses, providing more
+ways to factor out commonality from the records, especially when using several
+levels of multiclass instantiation. This also avoids the need to use "let"
+expressions within each subsequent record inside a multiclass.
+
+.. code-block:: llvm
+
+ multiclass basic_r<bits<4> opc> {
+ let Predicates = [HasSSE2] in {
+ def rr : Instruction<opc, "rr">;
+ def rm : Instruction<opc, "rm">;
+ }
+ let Predicates = [HasSSE3] in
+ def rx : Instruction<opc, "rx">;
+ }
+
+ multiclass basic_ss<bits<4> opc> {
+ let IsDouble = 0 in
+ defm SS : basic_r<opc>;
+
+ let IsDouble = 1 in
+ defm SD : basic_r<opc>;
+ }
+
+ defm ADD : basic_ss<0xf>;
+
+Looping
+^^^^^^^
+
+TableGen supports the '``foreach``' block, which textually replicates the loop
+body, substituting iterator values for iterator references in the body.
+Example:
+
+.. code-block:: llvm
+
+ foreach i = [0, 1, 2, 3] in {
+ def R#i : Register<...>;
+ def F#i : Register<...>;
+ }
+
+This will create objects ``R0``, ``R1``, ``R2`` and ``R3``. ``foreach`` blocks
+may also be nested, as shown in the sketch after the next example. If there is
+only one item in the body, the braces may be
+elided:
+
+.. code-block:: llvm
+
+ foreach i = [0, 1, 2, 3] in
+ def R#i : Register<...>;
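+
+As a minimal sketch of nesting two ``foreach`` blocks (the ``MyReg`` class here
+is a hypothetical stand-in for a real register class, used only for
+illustration):
+
+.. code-block:: llvm
+
+  // Hypothetical record class; a real target would use its own classes.
+  class MyReg<int idx> {
+    int Index = idx;
+  }
+
+  foreach i = [0, 1] in
+    foreach j = [0, 1] in
+      def R#i#j : MyReg<i>;
+
+This would create objects ``R00``, ``R01``, ``R10`` and ``R11``.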
+
+Code Generator backend info
+===========================
+
+Expressions used by the code generator to describe instructions and isel
+patterns:
+
+``(implicit a)``
+  An implicitly defined physical register. This tells the DAG instruction
+  selection emitter that the input pattern's extra definitions match implicit
+  physical register definitions.
+
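+For example, a pattern for an add instruction that also defines a
+condition-flags register might look roughly like the following (a hypothetical
+sketch; ``I``, ``GR32``, ``add`` and ``EFLAGS`` stand in for real target
+definitions):
+
+.. code-block:: llvm
+
+  // Hypothetical sketch: I, GR32, add and EFLAGS are assumed to be
+  // defined elsewhere by the target description.
+  def ADDrr : I<(outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                "add $dst, $src1, $src2",
+                [(set GR32:$dst, (add GR32:$src1, GR32:$src2)),
+                 (implicit EFLAGS)]>;
+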
+.. _TableGen backend:
+.. _TableGen backends:
+.. _write a backend:
+
+TableGen backends
+=================
+
+TODO: How they work, how to write one. This section should not contain details
+about any particular backend, except maybe ``-print-enums`` as an example. This
+should highlight the APIs in ``TableGen/Record.h``.
diff --git a/docs/TestSuiteMakefileGuide.html b/docs/TestSuiteMakefileGuide.html
index 876fe42..1b24250 100644
--- a/docs/TestSuiteMakefileGuide.html
+++ b/docs/TestSuiteMakefileGuide.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>LLVM test-suite Makefile Guide</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -238,12 +238,12 @@ LLVM.</p>
simple one is simply running <tt>gmake</tt> with no arguments. This will
compile and run all programs in the tree using a number of different methods
and compare results. Any failures are reported in the output, but are likely
- drowned in the other output. Passes are not reported explicitely.</p>
+ drowned in the other output. Passes are not reported explicitly.</p>
<p>Somewhat better is running <tt>gmake TEST=sometest test</tt>, which runs
the specified test and usually adds per-program summaries to the output
(depending on which sometest you use). For example, the <tt>nightly</tt> test
- explicitely outputs TEST-PASS or TEST-FAIL for every test after each program.
+ explicitly outputs TEST-PASS or TEST-FAIL for every test after each program.
Though these lines are still drowned in the output, it's easy to grep the
output logs in the Output directories.</p>
diff --git a/docs/TestingGuide.html b/docs/TestingGuide.html
index 33ce793..804e929 100644
--- a/docs/TestingGuide.html
+++ b/docs/TestingGuide.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>LLVM Testing Infrastructure Guide</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -626,6 +626,8 @@ define i8 @coerce_offset0(i32 %V, i32* %P) {
<div>
+<!-- {% raw %} -->
+
<p>The CHECK: and CHECK-NOT: directives both take a pattern to match. For most
uses of FileCheck, fixed string matching is perfectly sufficient. For some
things, a more flexible form of matching is desired. To support this, FileCheck
@@ -650,6 +652,8 @@ braces like you would in C. In the rare case that you want to match double
braces explicitly from the input, you can use something ugly like
<b>{{[{][{]}}</b> as your pattern.</p>
+<!-- {% endraw %} -->
+
</div>
<!-- _______________________________________________________________________ -->
@@ -659,6 +663,9 @@ braces explicitly from the input, you can use something ugly like
<div>
+
+<!-- {% raw %} -->
+
<p>It is often useful to match a pattern and then verify that it occurs again
later in the file. For codegen tests, this can be useful to allow any register,
but verify that that register is used consistently later. To do this, FileCheck
@@ -690,6 +697,8 @@ that FileCheck is not actually line-oriented when it matches, this allows you to
define two separate CHECK lines that match on the same line.
</p>
+<!-- {% endraw %} -->
+
</div>
</div>
@@ -900,7 +909,7 @@ the <a href="TestSuiteMakefileGuide.html">Test Suite Makefile Guide.</a></p>
John T. Criswell, Daniel Dunbar, Reid Spencer, and Tanya Lattner<br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-04-18 10:02:25 +0200 (Wed, 18 Apr 2012) $
+ Last modified: $Date: 2012-05-08 20:26:07 +0200 (Tue, 08 May 2012) $
</address>
</body>
</html>
diff --git a/docs/WritingAnLLVMBackend.html b/docs/WritingAnLLVMBackend.html
index 85548ea..11517c2 100644
--- a/docs/WritingAnLLVMBackend.html
+++ b/docs/WritingAnLLVMBackend.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Writing an LLVM Compiler Backend</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -2526,7 +2526,7 @@ with assembler.
<a href="http://www.woo.com">Mason Woo</a> and <a href="http://misha.brukman.net">Misha Brukman</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
<br>
- Last modified: $Date: 2012-03-01 16:14:19 +0100 (Thu, 01 Mar 2012) $
+ Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
</address>
</body>
diff --git a/docs/WritingAnLLVMPass.html b/docs/WritingAnLLVMPass.html
index 5dc67ae..149b103 100644
--- a/docs/WritingAnLLVMPass.html
+++ b/docs/WritingAnLLVMPass.html
@@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Writing an LLVM Pass</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
+ <link rel="stylesheet" href="_static/llvm.css" type="text/css">
</head>
<body>
@@ -1947,7 +1947,7 @@ Despite that, we have kept the LLVM passes SMP ready, and you should too.</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-04-08 13:52:52 +0200 (Sun, 08 Apr 2012) $
+ Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
</address>
</body>
diff --git a/docs/img/lines.gif b/docs/_static/lines.gif
index 88f491e..88f491e 100644
--- a/docs/img/lines.gif
+++ b/docs/_static/lines.gif
Binary files differ
diff --git a/docs/llvm.css b/docs/_static/llvm.css
index e3e6351..d7b5dae 100644
--- a/docs/llvm.css
+++ b/docs/_static/llvm.css
@@ -16,7 +16,7 @@ table { text-align: center; border: 2px solid black;
margin-right: 1em; margin-bottom: 1em; }
tr, td { border: 2px solid gray; padding: 4pt 4pt 2pt 2pt; }
th { border: 2px solid gray; font-weight: bold; font-size: 105%;
- background: url("img/lines.gif");
+ background: url("lines.gif");
font-family: "Georgia,Palatino,Times,Roman,SanSerif";
text-align: center; vertical-align: middle; }
/*
@@ -24,7 +24,7 @@ th { border: 2px solid gray; font-weight: bold; font-size: 105%;
*/
/* Common for title and header */
.doc_title, .doc_section, .doc_subsection, h1, h2, h3 {
- color: black; background: url("img/lines.gif");
+ color: black; background: url("lines.gif");
font-family: "Georgia,Palatino,Times,Roman,SanSerif"; font-weight: bold;
border-width: 1px;
border-style: solid none solid none;
diff --git a/docs/_templates/indexsidebar.html b/docs/_templates/indexsidebar.html
new file mode 100644
index 0000000..4161742
--- /dev/null
+++ b/docs/_templates/indexsidebar.html
@@ -0,0 +1,7 @@
+{# This template defines a sidebar which can be used to provide common links on
+ all documentation pages. #}
+
+<h3>Bugs</h3>
+
+<p>LLVM bugs should be reported to
+ <a href="http://llvm.org/bugs">Bugzilla</a>.</p>
diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html
new file mode 100644
index 0000000..de5db5c
--- /dev/null
+++ b/docs/_templates/layout.html
@@ -0,0 +1,13 @@
+{% extends "!layout.html" %}
+
+{% block extrahead %}
+<style type="text/css">
+ table.right { float: right; margin-left: 20px; }
+ table.right td { border: 1px solid #ccc; }
+</style>
+{% endblock %}
+
+{% block rootrellink %}
+ <li><a href="http://llvm.org/">LLVM Home</a>&nbsp;|&nbsp;</li>
+ <li><a href="{{ pathto('index') }}">Documentation</a>&raquo;</li>
+{% endblock %}
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..de0585d
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,263 @@
+# -*- coding: utf-8 -*-
+#
+# LLVM documentation build configuration file.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.intersphinx', 'sphinx.ext.todo']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'LLVM'
+copyright = u'2012, LLVM Project'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '3.2'
+# The full version, including alpha/beta/rc tags.
+release = '3.2'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+today_fmt = '%Y-%m-%d'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+show_authors = True
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'friendly'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'llvm-theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = ["."]
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+html_last_updated_fmt = '%Y-%m-%d'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+html_sidebars = {'index': 'indexsidebar.html'}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# We load all the old-school HTML documentation pages into Sphinx here.
+basedir = os.path.dirname(__file__)
+html_additional_pages = {}
+for directory in ('', 'tutorial'):
+ for file in os.listdir(os.path.join(basedir, directory)):
+ if not file.endswith('.html'):
+ continue
+
+ subpath = os.path.join(directory, file)
+ name,_ = os.path.splitext(subpath)
+ html_additional_pages[name] = subpath
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'LLVMdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'LLVM.tex', u'LLVM Documentation',
+ u'LLVM project', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = []
+
+# Automatically derive the list of man pages from the contents of the command
+# guide subdirectory.
+man_page_authors = "Maintained by The LLVM Team (http://llvm.org/)."
+command_guide_subpath = 'CommandGuide'
+command_guide_path = os.path.join(basedir, command_guide_subpath)
+for name in os.listdir(command_guide_path):
+ # Ignore non-ReST files and the index page.
+ if not name.endswith('.rst') or name in ('index.rst',):
+ continue
+
+ # Otherwise, automatically extract the description.
+ file_subpath = os.path.join(command_guide_subpath, name)
+ with open(os.path.join(command_guide_path, name)) as f:
+ it = iter(f)
+ title = it.next()[:-1]
+ header = it.next()[:-1]
+
+ if len(header) != len(title):
+ print >>sys.stderr, (
+ "error: invalid header in %r (does not match title)" % (
+ file_subpath,))
+ if ' - ' not in title:
+ print >>sys.stderr, (
+ ("error: invalid title in %r "
+ "(expected '<name> - <description>')") % (
+ file_subpath,))
+
+ # Split the name out of the title.
+ name,description = title.split(' - ', 1)
+ man_pages.append((file_subpath.replace('.rst',''), name,
+ description, man_page_authors, 1))
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+# FIXME: Define intersphinx configuration.
+intersphinx_mapping = {}
diff --git a/docs/design_and_overview.rst b/docs/design_and_overview.rst
new file mode 100644
index 0000000..ea68415
--- /dev/null
+++ b/docs/design_and_overview.rst
@@ -0,0 +1,36 @@
+.. _design_and_overview:
+
+LLVM Design & Overview
+======================
+
+.. toctree::
+ :hidden:
+
+ GetElementPtr
+
+* `LLVM Language Reference Manual <LangRef.html>`_
+
+ Defines the LLVM intermediate representation.
+
+* `Introduction to the LLVM Compiler <http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html>`_
+
+  Presentation providing a user's introduction to LLVM.
+
+* `Intro to LLVM <http://www.aosabook.org/en/llvm.html>`_
+
+ Book chapter providing a compiler hacker's introduction to LLVM.
+
+* `LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation
+ <http://llvm.org/pubs/2004-01-30-CGO-LLVM.html>`_
+
+ Design overview.
+
+* `LLVM: An Infrastructure for Multi-Stage Optimization
+ <http://llvm.org/pubs/2002-12-LattnerMSThesis.html>`_
+
+ More details (quite old now).
+
+* :ref:`gep`
+
+ Answers to some very frequent questions about LLVM's most frequently
+ misunderstood instruction.
diff --git a/docs/development_process.rst b/docs/development_process.rst
new file mode 100644
index 0000000..4fc20b3
--- /dev/null
+++ b/docs/development_process.rst
@@ -0,0 +1,30 @@
+.. _development_process:
+
+Development Process Documentation
+=================================
+
+.. toctree::
+ :hidden:
+
+ MakefileGuide
+ Projects
+
+* :ref:`projects`
+
+ How-to guide and templates for new projects that *use* the LLVM
+ infrastructure. The templates (directory organization, Makefiles, and test
+ tree) allow the project code to be located outside (or inside) the ``llvm/``
+ tree, while using LLVM header files and libraries.
+
+* `LLVMBuild Documentation <LLVMBuild.html>`_
+
+ Describes the LLVMBuild organization and files used by LLVM to specify
+ component descriptions.
+
+* :ref:`makefile_guide`
+
+ Describes how the LLVM makefiles work and how to use them.
+
+* `How To Release LLVM To The Public <HowToReleaseLLVM.html>`_
+
+ This is a guide to preparing LLVM releases. Most developers can ignore it.
diff --git a/docs/doxygen.css b/docs/doxygen.css
index 80c6cad..83951f6 100644
--- a/docs/doxygen.css
+++ b/docs/doxygen.css
@@ -327,7 +327,7 @@ HR { height: 1px;
}
.title {
font-size: 25pt;
- color: black; background: url("../img/lines.gif");
+ color: black;
font-weight: bold;
border-width: 1px;
border-style: solid none solid none;
diff --git a/docs/img/Debugging.gif b/docs/img/Debugging.gif
deleted file mode 100644
index 662d35a..0000000
--- a/docs/img/Debugging.gif
+++ /dev/null
Binary files differ
diff --git a/docs/img/libdeps.gif b/docs/img/libdeps.gif
deleted file mode 100644
index c5c0ed4..0000000
--- a/docs/img/libdeps.gif
+++ /dev/null
Binary files differ
diff --git a/docs/img/objdeps.gif b/docs/img/objdeps.gif
deleted file mode 100644
index 57c3e2e..0000000
--- a/docs/img/objdeps.gif
+++ /dev/null
Binary files differ
diff --git a/docs/img/venusflytrap.jpg b/docs/img/venusflytrap.jpg
deleted file mode 100644
index 59340ef..0000000
--- a/docs/img/venusflytrap.jpg
+++ /dev/null
Binary files differ
diff --git a/docs/index.html b/docs/index.html
deleted file mode 100644
index edd476d..0000000
--- a/docs/index.html
+++ /dev/null
@@ -1,286 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Documentation for the LLVM System at SVN head</title>
- <link rel="stylesheet" href="llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>Documentation for the LLVM System at SVN head</h1>
-
-<p class="doc_warning">If you are using a released version of LLVM,
-see <a href="http://llvm.org/releases/">the download page</a> to find
-your documentation.</p>
-
-<table class="layout" width="95%"><tr class="layout"><td class="left">
-<ul>
- <li><a href="#llvmdesign">LLVM Design</a></li>
- <li><a href="/pubs/">LLVM Publications</a></li>
- <li><a href="#userguide">LLVM User Guides</a></li>
- <li><a href="#llvmprog">LLVM Programming Documentation</a></li>
- <li><a href="#subsystems">LLVM Subsystem Documentation</a></li>
- <li><a href="#develprocess">LLVM Development Process Documentation</a></li>
- <li><a href="#maillist">LLVM Mailing Lists</a></li>
-</ul>
-</td><td class="right">
- <form action="http://www.google.com/search" method=get>
- <p>
- <input type="hidden" name="sitesearch" value="llvm.org/docs">
- <input type=text name=q size=25><br>
- <input type=submit value="Search the LLVM Docs" name="submit">
- </p>
- </form>
-</td></tr></table>
-
-<div class="doc_author">
- <p>Written by <a href="http://llvm.org/">The LLVM Team</a></p>
-</div>
-
-<!--=======================================================================-->
-<h2><a name="llvmdesign">LLVM Design &amp; Overview</a></h2>
-<!--=======================================================================-->
-
-<ul>
-<li><a href="LangRef.html">LLVM Language Reference Manual</a> - Defines the LLVM
-intermediate representation.</li>
-<li><a href="http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html">Introduction to the LLVM Compiler </a> - Presentation providing a users introduction to LLVM.</li>
-<li><a href="http://www.aosabook.org/en/llvm.html">Intro to LLVM</a> - book chapter providing a compiler hacker's introduction to LLVM.</li>
-<li><a href="http://llvm.org/pubs/2004-01-30-CGO-LLVM.html">LLVM: A Compilation Framework for
-Lifelong Program Analysis &amp; Transformation</a> - Design overview.</li>
-<li><a href="http://llvm.org/pubs/2002-12-LattnerMSThesis.html">LLVM: An Infrastructure for
-Multi-Stage Optimization</a> - More details (quite old now).</li>
-<li><a href="GetElementPtr.html">GetElementPtr FAQ</a> - Answers to some very
-frequent questions about LLVM's most frequently misunderstood instruction.</li>
-</ul>
-
-<!--=======================================================================-->
-<h2><a name="userguide">LLVM User Guides</a></h2>
-<!--=======================================================================-->
-
-<ul>
-<li><a href="GettingStarted.html">The LLVM Getting Started Guide</a> -
-Discusses how to get up and running quickly with the LLVM infrastructure.
-Everything from unpacking and compilation of the distribution to execution of
-some tools.</li>
-
-<li><a href="CMake.html">LLVM CMake guide</a> - An addendum to the main Getting
-Started guide for those using the <a href="http://www.cmake.org/">CMake build
-system</a>.
-</li>
-
-<li><a href="GettingStartedVS.html">Getting Started with the LLVM System using
-Microsoft Visual Studio</a> - An addendum to the main Getting Started guide for
-those using Visual Studio on Windows.</li>
-
-<li><a href="tutorial/">LLVM Tutorial</a> - A walk through the process of using
-LLVM for a custom language, and the facilities LLVM offers in tutorial form.</li>
-<li><a href="DeveloperPolicy.html">Developer Policy</a> - The LLVM project's
-policy towards developers and their contributions.</li>
-
-<li><a href="CommandGuide/index.html">LLVM Command Guide</a> - A reference
-manual for the LLVM command line utilities ("man" pages for LLVM tools).</li>
-
-<li><a href="Passes.html">LLVM's Analysis and Transform Passes</a> - A list of
-optimizations and analyses implemented in LLVM.</li>
-
-<li><a href="FAQ.html">Frequently Asked Questions</a> - A list of common
-questions and problems and their solutions.</li>
-
-<li><a href="ReleaseNotes.html">Release notes for the current release</a>
-- This describes new features, known bugs, and other limitations.</li>
-
-<li><a href="HowToSubmitABug.html">How to Submit A Bug Report</a> -
-Instructions for properly submitting information about any bugs you run into in
-the LLVM system.</li>
-
-<li><a href="TestingGuide.html">LLVM Testing Infrastructure Guide</a> - A reference
-manual for using the LLVM testing infrastructure.</li>
-
-<li><a href="http://clang.llvm.org/get_started.html">How to build the C, C++, ObjC,
-and ObjC++ front end</a> - Instructions for building the clang front-end from
-source.</li>
-
-<li><a href="Packaging.html">Packaging guide</a> - Advice on packaging
-LLVM into a distribution.</li>
-
-<li><a href="Lexicon.html">The LLVM Lexicon</a> - Definition of acronyms, terms
-and concepts used in LLVM.</li>
-
-<li><a name="irc">You can probably find help on the unofficial LLVM IRC
-channel</a>. We often are on irc.oftc.net in the #llvm channel. If you are
-using the mozilla browser, and have chatzilla installed, you can <a
-href="irc://irc.oftc.net/llvm">join #llvm on irc.oftc.net</a> directly.</li>
-
-<li><a href="HowToAddABuilder.html">How To Add Your Build Configuration
-To LLVM Buildbot Infrastructure</a> - Instructions for adding new builder to
-LLVM buildbot master.</li>
-
-</ul>
-
-
-<!--=======================================================================-->
-<h2><a name="llvmprog">LLVM Programming Documentation</a></h2>
-<!--=======================================================================-->
-
-<ul>
-<li><a href="LangRef.html">LLVM Language Reference Manual</a> - Defines the LLVM
-intermediate representation and the assembly form of the different nodes.</li>
-
-<li><a href="ProgrammersManual.html">The LLVM Programmers Manual</a> -
-Introduction to the general layout of the LLVM sourcebase, important classes
-and APIs, and some tips &amp; tricks.</li>
-
-<li><a href="CommandLine.html">CommandLine library Reference Manual</a> -
-Provides information on using the command line parsing library.</li>
-
-<li><a href="CodingStandards.html">LLVM Coding standards</a> -
-Details the LLVM coding standards and provides useful information on writing
-efficient C++ code.</li>
-
-<li><a href="ExtendingLLVM.html">Extending LLVM</a> - Look here to see how
-to add instructions and intrinsics to LLVM.</li>
-
-<li><a href="http://llvm.org/doxygen/">Doxygen generated
-documentation</a> (<a
-href="http://llvm.org/doxygen/inherits.html">classes</a>)
-
-(<a href="http://llvm.org/doxygen/doxygen.tar.gz">tarball</a>)
-</li>
-
-<li><a href="http://llvm.org/viewvc/">ViewVC Repository Browser</a></li>
-
-</ul>
-
-<!--=======================================================================-->
-<h2><a name="subsystems">LLVM Subsystem Documentation</a></h2>
-<!--=======================================================================-->
-
-<ul>
-
-<li><a href="WritingAnLLVMPass.html">Writing an LLVM Pass</a> - Information
-on how to write LLVM transformations and analyses.</li>
-
-<li><a href="WritingAnLLVMBackend.html">Writing an LLVM Backend</a> - Information
-on how to write LLVM backends for machine targets.</li>
-
-<li><a href="CodeGenerator.html">The LLVM Target-Independent Code
-Generator</a> - The design and implementation of the LLVM code generator.
-Useful if you are working on retargetting LLVM to a new architecture, designing
-a new codegen pass, or enhancing existing components.</li>
-
-<li><a href="TableGenFundamentals.html">TableGen Fundamentals</a> -
-Describes the TableGen tool, which is used heavily by the LLVM code
-generator.</li>
-
-<li><a href="AliasAnalysis.html">Alias Analysis in LLVM</a> - Information
-on how to write a new alias analysis implementation or how to use existing
-analyses.</li>
-
-<li><a href="GarbageCollection.html">Accurate Garbage Collection with
-LLVM</a> - The interfaces source-language compilers should use for compiling
-GC'd programs.</li>
-
-<li><a href="SourceLevelDebugging.html">Source Level Debugging with
-LLVM</a> - This document describes the design and philosophy behind the LLVM
-source-level debugger.</li>
-
-<li><a href="ExceptionHandling.html">Zero Cost Exception handling in LLVM</a>
-- This document describes the design and implementation of exception handling
-in LLVM.</li>
-
-<li><a href="Bugpoint.html">Bugpoint</a> - automatic bug finder and test-case
-reducer description and usage information.</li>
-
-<li><a href="BitCodeFormat.html">LLVM Bitcode File Format</a> - This describes
-the file format and encoding used for LLVM "bc" files.</li>
-
-<li><a href="SystemLibrary.html">System Library</a> - This document describes
-the LLVM System Library (<tt>lib/System</tt>) and how to keep LLVM source code
-portable</li>
-
-<li><a href="LinkTimeOptimization.html">Link Time Optimization</a> - This
-document describes the interface between LLVM intermodular optimizer and
-the linker and its design</li>
-
-<li><a href="GoldPlugin.html">The LLVM gold plugin</a> - How to build your
-programs with link-time optimization on Linux.</li>
-
-<li><a href="DebuggingJITedCode.html">The GDB JIT interface</a> - How to debug
-JITed code with GDB.</li>
-
-<li><a href="BranchWeightMetadata.html">Branch Weight Metadata</a> - Provides
-information about Branch Prediction Information.</li>
-
-</ul>
-
-<!--=======================================================================-->
-<h2><a name="develprocess">LLVM Development Process Documentation</a></h2>
-<!--=======================================================================-->
-
-<ul>
-
-<li><a href="Projects.html">LLVM Project Guide</a> - How-to guide and
-templates for new projects that <em>use</em> the LLVM infrastructure. The
-templates (directory organization, Makefiles, and test tree) allow the project
-code to be located outside (or inside) the <tt>llvm/</tt> tree, while using LLVM
-header files and libraries.</li>
-
-<li><a href="LLVMBuild.html">LLVMBuild Documentation</a> - Describes the
-LLVMBuild organization and files used by LLVM to specify component
-descriptions.</li>
-
-<li><a href="MakefileGuide.html">LLVM Makefile Guide</a> - Describes how the
-LLVM makefiles work and how to use them.</li>
-
-<li><a href="HowToReleaseLLVM.html">How To Release LLVM To The Public</a> - This
-is a guide to preparing LLVM releases. Most developers can ignore it.</li>
-
-</ul>
-
-<!--=======================================================================-->
-<h2><a name="maillist">LLVM Mailing Lists</a></h2>
-<!--=======================================================================-->
-
-<ul>
-<li>The <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvm-announce">
-LLVM Announcements List</a>: This is a low volume list that provides important
-announcements regarding LLVM. It gets email about once a month.</li>
-
-<li>The <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">Developer's
-List</a>: This list is for people who want to be included in technical
-discussions of LLVM. People post to this list when they have questions about
-writing code for or using the LLVM tools. It is relatively low volume.</li>
-
-<li>The <a href="http://lists.cs.uiuc.edu/pipermail/llvmbugs/">Bugs &amp;
-Patches Archive</a>: This list gets emailed every time a bug is opened and
-closed, and when people submit patches to be included in LLVM. It is higher
-volume than the LLVMdev list.</li>
-
-<li>The <a href="http://lists.cs.uiuc.edu/pipermail/llvm-commits/">Commits
-Archive</a>: This list contains all commit messages that are made when LLVM
-developers commit code changes to the repository. It is useful for those who
-want to stay on the bleeding edge of LLVM development. This list is very high
-volume.</li>
-
-<li>The <a href="http://lists.cs.uiuc.edu/pipermail/llvm-testresults/">
-Test Results Archive</a>: A message is automatically sent to this list by every
-active nightly tester when it completes. As such, this list gets email several
-times each day, making it a high volume list.</li>
-
-</ul>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-02-26 23:26:37 +0100 (Sun, 26 Feb 2012) $
-</address>
-</body></html>
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..53d3e7c
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,70 @@
+.. _contents:
+
+Overview
+========
+
+.. warning::
+
+ If you are using a released version of LLVM, see `the download page
+ <http://llvm.org/releases/>`_ to find your documentation.
+
+The LLVM compiler infrastructure supports a wide range of projects, from
+industrial strength compilers to specialized JIT applications to small
+research projects.
+
+Similarly, documentation is broken down into several high-level groupings
+targeted at different audiences:
+
+ * **Design & Overview**
+
+ Several introductory papers and presentations are available at
+ :ref:`design_and_overview`.
+
+ * **Publications**
+
+ The list of `publications <http://llvm.org/pubs>`_ based on LLVM.
+
+ * **User Guides**
+
+  Those new to the LLVM system should first visit the :ref:`userguides`.
+
+ NOTE: If you are a user who is only interested in using LLVM-based
+ compilers, you should look into `Clang <http://clang.llvm.org>`_ or
+ `DragonEgg <http://dragonegg.llvm.org>`_ instead. The documentation here is
+ intended for users who have a need to work with the intermediate LLVM
+ representation.
+
+ * **API Clients**
+
+ Developers of applications which use LLVM as a library should visit the
+ :ref:`programming`.
+
+ * **Subsystems**
+
+ API clients and LLVM developers may be interested in the
+ :ref:`subsystems` documentation.
+
+ * **Development Process**
+
+ Additional documentation on the LLVM project can be found at
+ :ref:`development_process`.
+
+ * **Mailing Lists**
+
+ For more information, consider consulting the LLVM :ref:`mailing_lists`.
+
+.. toctree::
+ :maxdepth: 2
+
+ design_and_overview
+ userguides
+ programming
+ subsystems
+ development_process
+ mailing_lists
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/docs/llvm-theme/layout.html b/docs/llvm-theme/layout.html
new file mode 100644
index 0000000..746c2f5
--- /dev/null
+++ b/docs/llvm-theme/layout.html
@@ -0,0 +1,23 @@
+{#
+ sphinxdoc/layout.html
+ ~~~~~~~~~~~~~~~~~~~~~
+
+ Sphinx layout template for the sphinxdoc theme.
+
+ :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
+ :license: BSD, see LICENSE for details.
+#}
+{% extends "basic/layout.html" %}
+
+{% block relbar1 %}
+<div class="logo">
+ <a href="{{ pathto('index') }}">
+ <img src="{{pathto("_static/logo.png", 1) }}"
+ alt="LLVM Logo" width="250" height="88"/></a>
+</div>
+{{ super() }}
+{% endblock %}
+
+{# put the sidebar before the body #}
+{% block sidebar1 %}{{ sidebar() }}{% endblock %}
+{% block sidebar2 %}{% endblock %}
diff --git a/docs/llvm-theme/static/contents.png b/docs/llvm-theme/static/contents.png
new file mode 100644
index 0000000..7fb8215
--- /dev/null
+++ b/docs/llvm-theme/static/contents.png
Binary files differ
diff --git a/docs/llvm-theme/static/llvm-theme.css b/docs/llvm-theme/static/llvm-theme.css
new file mode 100644
index 0000000..f684d00
--- /dev/null
+++ b/docs/llvm-theme/static/llvm-theme.css
@@ -0,0 +1,374 @@
+/*
+ * sphinxdoc.css_t
+ * ~~~~~~~~~~~~~~~
+ *
+ * Sphinx stylesheet -- sphinxdoc theme. Originally created by
+ * Armin Ronacher for Werkzeug.
+ *
+ * :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+@import url("basic.css");
+
+/* -- page layout ----------------------------------------------------------- */
+
+body {
+ font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva',
+ 'Verdana', sans-serif;
+ font-size: 14px;
+ letter-spacing: -0.01em;
+ line-height: 150%;
+ text-align: center;
+ background-color: #BFD1D4;
+ color: black;
+ padding: 0;
+ border: 1px solid #aaa;
+
+ margin: 0px 80px 0px 80px;
+ min-width: 740px;
+}
+
+div.logo {
+ background-color: white;
+ text-align: left;
+ padding: 10px 10px 15px 15px;
+}
+
+div.document {
+ background-color: white;
+ text-align: left;
+ background-image: url(contents.png);
+ background-repeat: repeat-x;
+}
+
+div.bodywrapper {
+ margin: 0 240px 0 0;
+ border-right: 1px solid #ccc;
+}
+
+div.body {
+ margin: 0;
+ padding: 0.5em 20px 20px 20px;
+}
+
+div.related {
+ font-size: 1em;
+}
+
+div.related ul {
+ background-image: url(navigation.png);
+ height: 2em;
+ border-top: 1px solid #ddd;
+ border-bottom: 1px solid #ddd;
+}
+
+div.related ul li {
+ margin: 0;
+ padding: 0;
+ height: 2em;
+ float: left;
+}
+
+div.related ul li.right {
+ float: right;
+ margin-right: 5px;
+}
+
+div.related ul li a {
+ margin: 0;
+ padding: 0 5px 0 5px;
+ line-height: 1.75em;
+ color: #EE9816;
+}
+
+div.related ul li a:hover {
+ color: #3CA8E7;
+}
+
+div.sphinxsidebarwrapper {
+ padding: 0;
+}
+
+div.sphinxsidebar {
+ margin: 0;
+ padding: 0.5em 15px 15px 0;
+ width: 210px;
+ float: right;
+ font-size: 1em;
+ text-align: left;
+}
+
+div.sphinxsidebar h3, div.sphinxsidebar h4 {
+ margin: 1em 0 0.5em 0;
+ font-size: 1em;
+ padding: 0.1em 0 0.1em 0.5em;
+ color: white;
+ border: 1px solid #86989B;
+ background-color: #AFC1C4;
+}
+
+div.sphinxsidebar h3 a {
+ color: white;
+}
+
+div.sphinxsidebar ul {
+ padding-left: 1.5em;
+ margin-top: 7px;
+ padding: 0;
+ line-height: 130%;
+}
+
+div.sphinxsidebar ul ul {
+ margin-left: 20px;
+}
+
+div.footer {
+ background-color: #E3EFF1;
+ color: #86989B;
+ padding: 3px 8px 3px 0;
+ clear: both;
+ font-size: 0.8em;
+ text-align: right;
+}
+
+div.footer a {
+ color: #86989B;
+ text-decoration: underline;
+}
+
+/* -- body styles ----------------------------------------------------------- */
+
+p {
+ margin: 0.8em 0 0.5em 0;
+}
+
+a {
+ color: #CA7900;
+ text-decoration: none;
+}
+
+a:hover {
+ color: #2491CF;
+}
+
+div.body p a{
+ text-decoration: underline;
+}
+
+h1 {
+ margin: 0;
+ padding: 0.7em 0 0.3em 0;
+ font-size: 1.5em;
+ color: #11557C;
+}
+
+h2 {
+ margin: 1.3em 0 0.2em 0;
+ font-size: 1.35em;
+ padding: 0;
+}
+
+h3 {
+ margin: 1em 0 -0.3em 0;
+ font-size: 1.2em;
+}
+
+h3 a:hover {
+ text-decoration: underline;
+}
+
+div.body h1 a, div.body h2 a, div.body h3 a, div.body h4 a, div.body h5 a, div.body h6 a {
+ color: black!important;
+}
+
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+ background-color: #f2f2f2;
+ font-weight: normal;
+ color: #20435c;
+ border-bottom: 1px solid #ccc;
+ margin: 20px -20px 10px -20px;
+ padding: 3px 0 3px 10px;
+}
+
+div.body h1 { margin-top: 0; font-size: 200%; }
+div.body h2 { font-size: 160%; }
+div.body h3 { font-size: 140%; }
+div.body h4 { font-size: 120%; }
+div.body h5 { font-size: 110%; }
+div.body h6 { font-size: 100%; }
+
+h1 a.anchor, h2 a.anchor, h3 a.anchor, h4 a.anchor, h5 a.anchor, h6 a.anchor {
+ display: none;
+ margin: 0 0 0 0.3em;
+ padding: 0 0.2em 0 0.2em;
+ color: #aaa!important;
+}
+
+h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor,
+h5:hover a.anchor, h6:hover a.anchor {
+ display: inline;
+}
+
+h1 a.anchor:hover, h2 a.anchor:hover, h3 a.anchor:hover, h4 a.anchor:hover,
+h5 a.anchor:hover, h6 a.anchor:hover {
+ color: #777;
+ background-color: #eee;
+}
+
+a.headerlink {
+ color: #c60f0f!important;
+ font-size: 1em;
+ margin-left: 6px;
+ padding: 0 4px 0 4px;
+ text-decoration: none!important;
+}
+
+a.headerlink:hover {
+ background-color: #ccc;
+ color: white!important;
+}
+
+cite, code, tt {
+ font-family: 'Consolas', 'Deja Vu Sans Mono',
+ 'Bitstream Vera Sans Mono', monospace;
+ font-size: 0.95em;
+ letter-spacing: 0.01em;
+}
+
+:not(a.reference) > tt {
+ background-color: #f2f2f2;
+ border-bottom: 1px solid #ddd;
+ color: #333;
+}
+
+tt.descname, tt.descclassname, tt.xref {
+ border: 0;
+}
+
+hr {
+ border: 1px solid #abc;
+ margin: 2em;
+}
+
+p a tt {
+ border: 0;
+ color: #CA7900;
+}
+
+p a tt:hover {
+ color: #2491CF;
+}
+
+a tt {
+ border: none;
+}
+
+pre {
+ font-family: 'Consolas', 'Deja Vu Sans Mono',
+ 'Bitstream Vera Sans Mono', monospace;
+ font-size: 0.95em;
+ letter-spacing: 0.015em;
+ line-height: 120%;
+ padding: 0.5em;
+ border: 1px solid #ccc;
+ background-color: #f8f8f8;
+}
+
+pre a {
+ color: inherit;
+ text-decoration: underline;
+}
+
+td.linenos pre {
+ padding: 0.5em 0;
+}
+
+div.quotebar {
+ background-color: #f8f8f8;
+ max-width: 250px;
+ float: right;
+ padding: 2px 7px;
+ border: 1px solid #ccc;
+}
+
+div.topic {
+ background-color: #f8f8f8;
+}
+
+table {
+ border-collapse: collapse;
+ margin: 0 -0.5em 0 -0.5em;
+}
+
+table td, table th {
+ padding: 0.2em 0.5em 0.2em 0.5em;
+}
+
+div.admonition, div.warning {
+ font-size: 0.9em;
+ margin: 1em 0 1em 0;
+ border: 1px solid #86989B;
+ background-color: #f7f7f7;
+ padding: 0;
+}
+
+div.admonition p, div.warning p {
+ margin: 0.5em 1em 0.5em 1em;
+ padding: 0;
+}
+
+div.admonition pre, div.warning pre {
+ margin: 0.4em 1em 0.4em 1em;
+}
+
+div.admonition p.admonition-title,
+div.warning p.admonition-title {
+ margin: 0;
+ padding: 0.1em 0 0.1em 0.5em;
+ color: white;
+ border-bottom: 1px solid #86989B;
+ font-weight: bold;
+ background-color: #AFC1C4;
+}
+
+div.warning {
+ border: 1px solid #940000;
+}
+
+div.warning p.admonition-title {
+ background-color: #CF0000;
+ border-bottom-color: #940000;
+}
+
+div.admonition ul, div.admonition ol,
+div.warning ul, div.warning ol {
+ margin: 0.1em 0.5em 0.5em 3em;
+ padding: 0;
+}
+
+div.versioninfo {
+ margin: 1em 0 0 0;
+ border: 1px solid #ccc;
+ background-color: #DDEAF0;
+ padding: 8px;
+ line-height: 1.3em;
+ font-size: 0.9em;
+}
+
+.viewcode-back {
+ font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva',
+ 'Verdana', sans-serif;
+}
+
+div.viewcode-block:target {
+ background-color: #f4debf;
+ border-top: 1px solid #ac9;
+ border-bottom: 1px solid #ac9;
+}
diff --git a/docs/llvm-theme/static/logo.png b/docs/llvm-theme/static/logo.png
new file mode 100644
index 0000000..18d424c
--- /dev/null
+++ b/docs/llvm-theme/static/logo.png
Binary files differ
diff --git a/docs/llvm-theme/static/navigation.png b/docs/llvm-theme/static/navigation.png
new file mode 100644
index 0000000..1081dc1
--- /dev/null
+++ b/docs/llvm-theme/static/navigation.png
Binary files differ
diff --git a/docs/llvm-theme/theme.conf b/docs/llvm-theme/theme.conf
new file mode 100644
index 0000000..573fd78
--- /dev/null
+++ b/docs/llvm-theme/theme.conf
@@ -0,0 +1,4 @@
+[theme]
+inherit = basic
+stylesheet = llvm-theme.css
+pygments_style = friendly
diff --git a/docs/mailing_lists.rst b/docs/mailing_lists.rst
new file mode 100644
index 0000000..106f1da
--- /dev/null
+++ b/docs/mailing_lists.rst
@@ -0,0 +1,35 @@
+.. _mailing_lists:
+
+Mailing Lists
+=============
+
+ * `LLVM Announcements List
+ <http://lists.cs.uiuc.edu/mailman/listinfo/llvm-announce>`_
+
+ This is a low volume list that provides important announcements regarding
+ LLVM. It gets email about once a month.
+
+ * `Developer's List <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_
+
+ This list is for people who want to be included in technical discussions of
+ LLVM. People post to this list when they have questions about writing code
+ for or using the LLVM tools. It is relatively low volume.
+
+ * `Bugs & Patches Archive <http://lists.cs.uiuc.edu/pipermail/llvmbugs/>`_
+
+ This list gets emailed every time a bug is opened and closed, and when people
+ submit patches to be included in LLVM. It is higher volume than the LLVMdev
+ list.
+
+ * `Commits Archive <http://lists.cs.uiuc.edu/pipermail/llvm-commits/>`_
+
+ This list contains all commit messages that are made when LLVM developers
+ commit code changes to the repository. It is useful for those who want to
+ stay on the bleeding edge of LLVM development. This list is very high volume.
+
+ * `Test Results Archive
+ <http://lists.cs.uiuc.edu/pipermail/llvm-testresults/>`_
+
+ A message is automatically sent to this list by every active nightly tester
+ when it completes. As such, this list gets email several times each day,
+ making it a high volume list.
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..8dfec03
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,190 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\llvm.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\llvm.qhc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+:end
diff --git a/docs/programming.rst b/docs/programming.rst
new file mode 100644
index 0000000..27e4301
--- /dev/null
+++ b/docs/programming.rst
@@ -0,0 +1,40 @@
+.. _programming:
+
+Programming Documentation
+=========================
+
+.. toctree::
+ :hidden:
+
+ CodingStandards
+ CommandLine
+
+* `LLVM Language Reference Manual <LangRef.html>`_
+
+ Defines the LLVM intermediate representation and the assembly form of the
+ different nodes.
+
+* `The LLVM Programmers Manual <ProgrammersManual.html>`_
+
+ Introduction to the general layout of the LLVM sourcebase, important classes
+ and APIs, and some tips & tricks.
+
+* :ref:`commandline`
+
+ Provides information on using the command line parsing library.
+
+* :ref:`coding_standards`
+
+ Details the LLVM coding standards and provides useful information on writing
+ efficient C++ code.
+
+* `Extending LLVM <ExtendingLLVM.html>`_
+
+ Look here to see how to add instructions and intrinsics to LLVM.
+
+* `Doxygen generated documentation <http://llvm.org/doxygen/>`_
+
+ (`classes <http://llvm.org/doxygen/inherits.html>`_)
+ (`tarball <http://llvm.org/doxygen/doxygen.tar.gz>`_)
+
+* `ViewVC Repository Browser <http://llvm.org/viewvc/>`_
diff --git a/docs/subsystems.rst b/docs/subsystems.rst
new file mode 100644
index 0000000..be33295
--- /dev/null
+++ b/docs/subsystems.rst
@@ -0,0 +1,91 @@
+.. _subsystems:
+
+Subsystem Documentation
+=======================
+
+.. toctree::
+ :hidden:
+
+ AliasAnalysis
+ BitCodeFormat
+ BranchWeightMetadata
+ Bugpoint
+ CodeGenerator
+ ExceptionHandling
+ LinkTimeOptimization
+ SegmentedStacks
+ TableGenFundamentals
+
+* `Writing an LLVM Pass <WritingAnLLVMPass.html>`_
+
+ Information on how to write LLVM transformations and analyses.
+
+* `Writing an LLVM Backend <WritingAnLLVMBackend.html>`_
+
+ Information on how to write LLVM backends for machine targets.
+
+* :ref:`code_generator`
+
+ The design and implementation of the LLVM code generator. Useful if you are
+ working on retargetting LLVM to a new architecture, designing a new codegen
+ pass, or enhancing existing components.
+
+* :ref:`tablegen`
+
+ Describes the TableGen tool, which is used heavily by the LLVM code
+ generator.
+
+* :ref:`alias_analysis`
+
+ Information on how to write a new alias analysis implementation or how to
+ use existing analyses.
+
+* `Accurate Garbage Collection with LLVM <GarbageCollection.html>`_
+
+ The interfaces source-language compilers should use for compiling GC'd
+ programs.
+
+* `Source Level Debugging with LLVM <SourceLevelDebugging.html>`_
+
+ This document describes the design and philosophy behind the LLVM
+ source-level debugger.
+
+* :ref:`exception_handling`
+
+ This document describes the design and implementation of exception handling
+ in LLVM.
+
+* :ref:`bugpoint`
+
+ Description and usage information for the automatic bug finder and
+ test-case reducer.
+
+* :ref:`bitcode_format`
+
+ This describes the file format and encoding used for LLVM "bc" files.
+
+* `System Library <SystemLibrary.html>`_
+
+ This document describes the LLVM System Library (``lib/System``) and how to
+ keep LLVM source code portable.
+
+* :ref:`lto`
+
+ This document describes the interface between the LLVM intermodular
+ optimizer and the linker, and its design.
+
+* `The LLVM gold plugin <GoldPlugin.html>`_
+
+ How to build your programs with link-time optimization on Linux.
+
+* `The GDB JIT interface <DebuggingJITedCode.html>`_
+
+ How to debug JITed code with GDB.
+
+* :ref:`branch_weight`
+
+ Describes the branch weight metadata used to encode branch prediction
+ information.
+
+* :ref:`segmented_stacks`
+
+ This document describes segmented stacks and how they are used in LLVM.
diff --git a/docs/tutorial/LangImpl1.html b/docs/tutorial/LangImpl1.html
index 22a2b12..717454f 100644
--- a/docs/tutorial/LangImpl1.html
+++ b/docs/tutorial/LangImpl1.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Tutorial Introduction and the Lexer</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -342,7 +342,7 @@ so that you can use the lexer and parser together.
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl2.html b/docs/tutorial/LangImpl2.html
index e4707b3..694f734 100644
--- a/docs/tutorial/LangImpl2.html
+++ b/docs/tutorial/LangImpl2.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Implementing a Parser and AST</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -1225,7 +1225,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-16 10:06:54 +0200 (Sun, 16 Oct 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl3.html b/docs/tutorial/LangImpl3.html
index 9647b43..1390153 100644
--- a/docs/tutorial/LangImpl3.html
+++ b/docs/tutorial/LangImpl3.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Implementing code generation to LLVM IR</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -685,10 +685,10 @@ clang++ -g -O3 toy.cpp `llvm-config --cppflags --ldflags --libs core` -o toy
// See example below.
#include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/Support/IRBuilder.h"
#include &lt;cstdio&gt;
#include &lt;string&gt;
#include &lt;map&gt;
@@ -1262,7 +1262,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-16 10:06:54 +0200 (Sun, 16 Oct 2011) $
+ Last modified: $Date: 2012-06-29 14:38:19 +0200 (Fri, 29 Jun 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl4.html b/docs/tutorial/LangImpl4.html
index 06a8a13..3f8d4a4 100644
--- a/docs/tutorial/LangImpl4.html
+++ b/docs/tutorial/LangImpl4.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Adding JIT and Optimizer Support</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -253,10 +253,9 @@ add instruction from every execution of this function.</p>
<p>LLVM provides a wide variety of optimizations that can be used in certain
circumstances. Some <a href="../Passes.html">documentation about the various
passes</a> is available, but it isn't very complete. Another good source of
-ideas can come from looking at the passes that <tt>llvm-gcc</tt> or
-<tt>llvm-ld</tt> run to get started. The "<tt>opt</tt>" tool allows you to
-experiment with passes from the command line, so you can see if they do
-anything.</p>
+ideas can come from looking at the passes that <tt>Clang</tt> runs to get
+started. The "<tt>opt</tt>" tool allows you to experiment with passes from the
+command line, so you can see if they do anything.</p>
<p>Now that we have reasonable code coming out of our front-end, lets talk about
executing it!</p>
@@ -518,6 +517,7 @@ at runtime.</p>
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -525,7 +525,6 @@ at runtime.</p>
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
#include &lt;string&gt;
@@ -1147,7 +1146,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-16 10:06:54 +0200 (Sun, 16 Oct 2011) $
+ Last modified: $Date: 2012-06-29 14:38:19 +0200 (Fri, 29 Jun 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl5.html b/docs/tutorial/LangImpl5.html
index 0164ca3..a7a3737 100644
--- a/docs/tutorial/LangImpl5.html
+++ b/docs/tutorial/LangImpl5.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Extending the Language: Control Flow</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -895,6 +895,7 @@ clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -902,7 +903,6 @@ clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
#include &lt;string&gt;
@@ -1766,7 +1766,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-16 10:06:54 +0200 (Sun, 16 Oct 2011) $
+ Last modified: $Date: 2012-06-29 14:38:19 +0200 (Fri, 29 Jun 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl6.html b/docs/tutorial/LangImpl6.html
index 4fcf109..1128893 100644
--- a/docs/tutorial/LangImpl6.html
+++ b/docs/tutorial/LangImpl6.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Extending the Language: User-defined Operators</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -636,7 +636,7 @@ def mandelhelp(xmin xmax xstep ymin ymax ystep)
: putchard(10)
)
-# mandel - This is a convenient helper function for ploting the mandelbrot set
+# mandel - This is a convenient helper function for plotting the mandelbrot set
# from the specified position with the specified Magnification.
def mandel(realstart imagstart realmag imagmag)
mandelhelp(realstart, realstart+realmag*78, realmag,
@@ -834,6 +834,7 @@ library, although doing that will cause problems on Windows.</p>
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -841,7 +842,6 @@ library, although doing that will cause problems on Windows.</p>
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
#include &lt;string&gt;
@@ -1823,7 +1823,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-16 10:06:54 +0200 (Sun, 16 Oct 2011) $
+ Last modified: $Date: 2012-07-31 09:05:57 +0200 (Tue, 31 Jul 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl7.html b/docs/tutorial/LangImpl7.html
index ebf6514..f1fe404 100644
--- a/docs/tutorial/LangImpl7.html
+++ b/docs/tutorial/LangImpl7.html
@@ -7,7 +7,7 @@
construction</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -1002,6 +1002,7 @@ clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -1009,7 +1010,6 @@ clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
#include &lt;string&gt;
@@ -2158,7 +2158,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-10-16 10:06:54 +0200 (Sun, 16 Oct 2011) $
+ Last modified: $Date: 2012-06-29 14:38:19 +0200 (Fri, 29 Jun 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl8.html b/docs/tutorial/LangImpl8.html
index cc55d40..50fcd8c 100644
--- a/docs/tutorial/LangImpl8.html
+++ b/docs/tutorial/LangImpl8.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Conclusion and other useful LLVM tidbits</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -47,7 +47,7 @@
<div>
-<p>Welcome to the the final chapter of the "<a href="index.html">Implementing a
+<p>Welcome to the final chapter of the "<a href="index.html">Implementing a
language with LLVM</a>" tutorial. In the course of this tutorial, we have grown
our little Kaleidoscope language from being a useless toy, to being a
semi-interesting (but probably still useless) toy. :)</p>
@@ -353,7 +353,7 @@ Passing Style</a> and the use of tail calls (which LLVM also supports).</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-07-23 10:51:15 +0200 (Mon, 23 Jul 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/Makefile b/docs/tutorial/Makefile
deleted file mode 100644
index fdf1bb6..0000000
--- a/docs/tutorial/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-##===- docs/tutorial/Makefile ------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL := ../..
-include $(LEVEL)/Makefile.common
-
-HTML := $(wildcard $(PROJ_SRC_DIR)/*.html)
-PNG := $(wildcard $(PROJ_SRC_DIR)/*.png)
-EXTRA_DIST := $(HTML) index.html
-HTML_DIR := $(DESTDIR)$(PROJ_docsdir)/html/tutorial
-
-install-local:: $(HTML)
- $(Echo) Installing HTML Tutorial Documentation
- $(Verb) $(MKDIR) $(HTML_DIR)
- $(Verb) $(DataInstall) $(HTML) $(HTML_DIR)
- $(Verb) $(DataInstall) $(PNG) $(HTML_DIR)
- $(Verb) $(DataInstall) $(PROJ_SRC_DIR)/index.html $(HTML_DIR)
-
-uninstall-local::
- $(Echo) Uninstalling Tutorial Documentation
- $(Verb) $(RM) -rf $(HTML_DIR)
-
-printvars::
- $(Echo) "HTML : " '$(HTML)'
diff --git a/docs/tutorial/OCamlLangImpl1.html b/docs/tutorial/OCamlLangImpl1.html
index 7cae68c..86a395a 100644
--- a/docs/tutorial/OCamlLangImpl1.html
+++ b/docs/tutorial/OCamlLangImpl1.html
@@ -7,7 +7,7 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
<meta name="author" content="Erick Tryzelaar">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -359,7 +359,7 @@ include a driver so that you can use the lexer and parser together.
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl2.html b/docs/tutorial/OCamlLangImpl2.html
index e1bb871..9bb4c40 100644
--- a/docs/tutorial/OCamlLangImpl2.html
+++ b/docs/tutorial/OCamlLangImpl2.html
@@ -7,7 +7,7 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
<meta name="author" content="Erick Tryzelaar">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -1037,7 +1037,7 @@ main ()
<a href="mailto:sabre@nondot.org">Chris Lattner</a>
<a href="mailto:erickt@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl3.html b/docs/tutorial/OCamlLangImpl3.html
index c240bb9..e6105e8 100644
--- a/docs/tutorial/OCamlLangImpl3.html
+++ b/docs/tutorial/OCamlLangImpl3.html
@@ -7,7 +7,7 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
<meta name="author" content="Erick Tryzelaar">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -1087,7 +1087,7 @@ main ()
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-07-15 22:03:30 +0200 (Fri, 15 Jul 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl4.html b/docs/tutorial/OCamlLangImpl4.html
index db164d5..e3e2469 100644
--- a/docs/tutorial/OCamlLangImpl4.html
+++ b/docs/tutorial/OCamlLangImpl4.html
@@ -7,7 +7,7 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
<meta name="author" content="Erick Tryzelaar">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -270,10 +270,9 @@ add instruction from every execution of this function.</p>
<p>LLVM provides a wide variety of optimizations that can be used in certain
circumstances. Some <a href="../Passes.html">documentation about the various
passes</a> is available, but it isn't very complete. Another good source of
-ideas can come from looking at the passes that <tt>llvm-gcc</tt> or
-<tt>llvm-ld</tt> run to get started. The "<tt>opt</tt>" tool allows you to
-experiment with passes from the command line, so you can see if they do
-anything.</p>
+ideas can come from looking at the passes that <tt>Clang</tt> runs to get
+started. The "<tt>opt</tt>" tool allows you to experiment with passes from the
+command line, so you can see if they do anything.</p>
<p>Now that we have reasonable code coming out of our front-end, lets talk about
executing it!</p>
@@ -1021,7 +1020,7 @@ extern double putchard(double X) {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl5.html b/docs/tutorial/OCamlLangImpl5.html
index ca79691..994957e 100644
--- a/docs/tutorial/OCamlLangImpl5.html
+++ b/docs/tutorial/OCamlLangImpl5.html
@@ -7,7 +7,7 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
<meta name="author" content="Erick Tryzelaar">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -1554,7 +1554,7 @@ operators</a>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl6.html b/docs/tutorial/OCamlLangImpl6.html
index bde429b..cef3884 100644
--- a/docs/tutorial/OCamlLangImpl6.html
+++ b/docs/tutorial/OCamlLangImpl6.html
@@ -7,7 +7,7 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
<meta name="author" content="Erick Tryzelaar">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -611,7 +611,7 @@ def mandelhelp(xmin xmax xstep ymin ymax ystep)
: putchard(10)
)
-# mandel - This is a convenient helper function for ploting the mandelbrot set
+# mandel - This is a convenient helper function for plotting the mandelbrot set
# from the specified position with the specified Magnification.
def mandel(realstart imagstart realmag imagmag)
mandelhelp(realstart, realstart+realmag*78, realmag,
@@ -1568,7 +1568,7 @@ SSA construction</a>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-07-31 09:05:57 +0200 (Tue, 31 Jul 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl7.html b/docs/tutorial/OCamlLangImpl7.html
index a48e679..abe8913 100644
--- a/docs/tutorial/OCamlLangImpl7.html
+++ b/docs/tutorial/OCamlLangImpl7.html
@@ -8,7 +8,7 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
<meta name="author" content="Erick Tryzelaar">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -1898,7 +1898,7 @@ extern double printd(double X) {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
- Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+ Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl8.html b/docs/tutorial/OCamlLangImpl8.html
index eed8c03..7c1a500 100644
--- a/docs/tutorial/OCamlLangImpl8.html
+++ b/docs/tutorial/OCamlLangImpl8.html
@@ -6,7 +6,7 @@
<title>Kaleidoscope: Conclusion and other useful LLVM tidbits</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="author" content="Chris Lattner">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
@@ -47,7 +47,7 @@
<div>
-<p>Welcome to the the final chapter of the "<a href="index.html">Implementing a
+<p>Welcome to the final chapter of the "<a href="index.html">Implementing a
language with LLVM</a>" tutorial. In the course of this tutorial, we have grown
our little Kaleidoscope language from being a useless toy, to being a
semi-interesting (but probably still useless) toy. :)</p>
diff --git a/docs/tutorial/index.html b/docs/tutorial/index.html
index 0a8cae2..2c11a9a 100644
--- a/docs/tutorial/index.html
+++ b/docs/tutorial/index.html
@@ -7,7 +7,7 @@
<meta name="author" content="Owen Anderson">
<meta name="description"
content="LLVM Tutorial: Table of Contents.">
- <link rel="stylesheet" href="../llvm.css" type="text/css">
+ <link rel="stylesheet" href="../_static/llvm.css" type="text/css">
</head>
<body>
diff --git a/docs/userguides.rst b/docs/userguides.rst
new file mode 100644
index 0000000..26a5a8c
--- /dev/null
+++ b/docs/userguides.rst
@@ -0,0 +1,89 @@
+.. _userguides:
+
+User Guides
+===========
+
+.. toctree::
+ :hidden:
+
+ CMake
+ CommandGuide/index
+ DeveloperPolicy
+ GettingStartedVS
+ FAQ
+ Lexicon
+ Packaging
+
+* `The LLVM Getting Started Guide <GettingStarted.html>`_
+
+ Discusses how to get up and running quickly with the LLVM infrastructure,
+ covering everything from unpacking and compiling the distribution to
+ running some of the tools.
+
+* :ref:`building-with-cmake`
+
+ An addendum to the main Getting Started guide for those using the `CMake
+ build system <http://www.cmake.org>`_.
+
+* `Getting Started with the LLVM System using Microsoft Visual Studio
+ <GettingStartedVS.html>`_
+
+ An addendum to the main Getting Started guide for those using Visual Studio
+ on Windows.
+
+* `LLVM Tutorial <tutorial/>`_
+
+ A tutorial-style walk through the process of using LLVM for a custom
+ language, and the facilities LLVM offers.
+
+* :ref:`developer_policy`
+
+ The LLVM project's policy towards developers and their contributions.
+
+* :ref:`LLVM Command Guide <commands>`
+
+ A reference manual for the LLVM command line utilities ("man" pages for LLVM
+ tools).
+
+* `LLVM's Analysis and Transform Passes <Passes.html>`_
+
+ A list of optimizations and analyses implemented in LLVM.
+
+* :ref:`faq`
+
+ A list of common questions and problems and their solutions.
+
+* `Release notes for the current release <ReleaseNotes.html>`_
+
+ This describes new features, known bugs, and other limitations.
+
+* `How to Submit A Bug Report <HowToSubmitABug.html>`_
+
+ Instructions for properly submitting information about any bugs you run into
+ in the LLVM system.
+
+* `LLVM Testing Infrastructure Guide <TestingGuide.html>`_
+
+ A reference manual for using the LLVM testing infrastructure.
+
+* `How to build the C, C++, ObjC, and ObjC++ front end <http://clang.llvm.org/get_started.html>`_
+
+ Instructions for building the clang front-end from source.
+
+* :ref:`packaging`
+
+ Advice on packaging LLVM into a distribution.
+
+* :ref:`lexicon`
+
+ Definition of acronyms, terms and concepts used in LLVM.
+
+* `How To Add Your Build Configuration To LLVM Buildbot Infrastructure <HowToAddABuilder.html>`_
+
+ Instructions for adding a new builder to the LLVM buildbot master.
+
+* **IRC** -- You can probably find help on the unofficial LLVM IRC.
+
+ We are often on irc.oftc.net in the #llvm channel. If you are using a
+ Mozilla browser with ChatZilla installed, you can `join #llvm on
+ irc.oftc.net <irc://irc.oftc.net/llvm>`_.
diff --git a/docs/yaml2obj.rst b/docs/yaml2obj.rst
new file mode 100644
index 0000000..cb59162
--- /dev/null
+++ b/docs/yaml2obj.rst
@@ -0,0 +1,222 @@
+.. _yaml2obj:
+
+yaml2obj
+========
+
+yaml2obj takes a YAML description of an object file and converts it to a binary
+file.
+
+ $ yaml2obj input-file
+
+.. program:: yaml2obj
+
+Outputs the binary to stdout.
+
+COFF Syntax
+-----------
+
+Here's a sample COFF file.
+
+.. code-block:: yaml
+
+ header:
+ Machine: IMAGE_FILE_MACHINE_I386 # (0x14C)
+
+ sections:
+ - Name: .text
+ Characteristics: [ IMAGE_SCN_CNT_CODE
+ , IMAGE_SCN_ALIGN_16BYTES
+ , IMAGE_SCN_MEM_EXECUTE
+ , IMAGE_SCN_MEM_READ
+ ] # 0x60500020
+ SectionData:
+ "\x83\xEC\x0C\xC7\x44\x24\x08\x00\x00\x00\x00\xC7\x04\x24\x00\x00\x00\x00\xE8\x00\x00\x00\x00\xE8\x00\x00\x00\x00\x8B\x44\x24\x08\x83\xC4\x0C\xC3" # |....D$.......$...............D$.....|
+
+ symbols:
+ - Name: .text
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
+ NumberOfAuxSymbols: 1
+ AuxillaryData:
+ "\x24\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00" # |$.................|
+
+ - Name: _main
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+
+Here's a simplified Kwalify_ schema with an extension to allow alternate types.
+
+.. _Kwalify: http://www.kuwata-lab.com/kwalify/ruby/users-guide.html
+
+.. code-block:: yaml
+
+ type: map
+ mapping:
+ header:
+ type: map
+ mapping:
+ Machine: [ {type: str, enum:
+ [ IMAGE_FILE_MACHINE_UNKNOWN
+ , IMAGE_FILE_MACHINE_AM33
+ , IMAGE_FILE_MACHINE_AMD64
+ , IMAGE_FILE_MACHINE_ARM
+ , IMAGE_FILE_MACHINE_ARMV7
+ , IMAGE_FILE_MACHINE_EBC
+ , IMAGE_FILE_MACHINE_I386
+ , IMAGE_FILE_MACHINE_IA64
+ , IMAGE_FILE_MACHINE_M32R
+ , IMAGE_FILE_MACHINE_MIPS16
+ , IMAGE_FILE_MACHINE_MIPSFPU
+ , IMAGE_FILE_MACHINE_MIPSFPU16
+ , IMAGE_FILE_MACHINE_POWERPC
+ , IMAGE_FILE_MACHINE_POWERPCFP
+ , IMAGE_FILE_MACHINE_R4000
+ , IMAGE_FILE_MACHINE_SH3
+ , IMAGE_FILE_MACHINE_SH3DSP
+ , IMAGE_FILE_MACHINE_SH4
+ , IMAGE_FILE_MACHINE_SH5
+ , IMAGE_FILE_MACHINE_THUMB
+ , IMAGE_FILE_MACHINE_WCEMIPSV2
+ ]}
+ , {type: int}
+ ]
+ Characteristics:
+ - type: seq
+ sequence:
+ - type: str
+ enum: [ IMAGE_FILE_RELOCS_STRIPPED
+ , IMAGE_FILE_EXECUTABLE_IMAGE
+ , IMAGE_FILE_LINE_NUMS_STRIPPED
+ , IMAGE_FILE_LOCAL_SYMS_STRIPPED
+ , IMAGE_FILE_AGGRESSIVE_WS_TRIM
+ , IMAGE_FILE_LARGE_ADDRESS_AWARE
+ , IMAGE_FILE_BYTES_REVERSED_LO
+ , IMAGE_FILE_32BIT_MACHINE
+ , IMAGE_FILE_DEBUG_STRIPPED
+ , IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP
+ , IMAGE_FILE_NET_RUN_FROM_SWAP
+ , IMAGE_FILE_SYSTEM
+ , IMAGE_FILE_DLL
+ , IMAGE_FILE_UP_SYSTEM_ONLY
+ , IMAGE_FILE_BYTES_REVERSED_HI
+ ]
+ - type: int
+ sections:
+ type: seq
+ sequence:
+ - type: map
+ mapping:
+ Name: {type: str}
+ Characteristics:
+ - type: seq
+ sequence:
+ - type: str
+ enum: [ IMAGE_SCN_TYPE_NO_PAD
+ , IMAGE_SCN_CNT_CODE
+ , IMAGE_SCN_CNT_INITIALIZED_DATA
+ , IMAGE_SCN_CNT_UNINITIALIZED_DATA
+ , IMAGE_SCN_LNK_OTHER
+ , IMAGE_SCN_LNK_INFO
+ , IMAGE_SCN_LNK_REMOVE
+ , IMAGE_SCN_LNK_COMDAT
+ , IMAGE_SCN_GPREL
+ , IMAGE_SCN_MEM_PURGEABLE
+ , IMAGE_SCN_MEM_16BIT
+ , IMAGE_SCN_MEM_LOCKED
+ , IMAGE_SCN_MEM_PRELOAD
+ , IMAGE_SCN_ALIGN_1BYTES
+ , IMAGE_SCN_ALIGN_2BYTES
+ , IMAGE_SCN_ALIGN_4BYTES
+ , IMAGE_SCN_ALIGN_8BYTES
+ , IMAGE_SCN_ALIGN_16BYTES
+ , IMAGE_SCN_ALIGN_32BYTES
+ , IMAGE_SCN_ALIGN_64BYTES
+ , IMAGE_SCN_ALIGN_128BYTES
+ , IMAGE_SCN_ALIGN_256BYTES
+ , IMAGE_SCN_ALIGN_512BYTES
+ , IMAGE_SCN_ALIGN_1024BYTES
+ , IMAGE_SCN_ALIGN_2048BYTES
+ , IMAGE_SCN_ALIGN_4096BYTES
+ , IMAGE_SCN_ALIGN_8192BYTES
+ , IMAGE_SCN_LNK_NRELOC_OVFL
+ , IMAGE_SCN_MEM_DISCARDABLE
+ , IMAGE_SCN_MEM_NOT_CACHED
+ , IMAGE_SCN_MEM_NOT_PAGED
+ , IMAGE_SCN_MEM_SHARED
+ , IMAGE_SCN_MEM_EXECUTE
+ , IMAGE_SCN_MEM_READ
+ , IMAGE_SCN_MEM_WRITE
+ ]
+ - type: int
+ SectionData: {type: str}
+ symbols:
+ type: seq
+ sequence:
+ - type: map
+ mapping:
+ Name: {type: str}
+ Value: {type: int}
+ SectionNumber: {type: int}
+ SimpleType: [ {type: str, enum: [ IMAGE_SYM_TYPE_NULL
+ , IMAGE_SYM_TYPE_VOID
+ , IMAGE_SYM_TYPE_CHAR
+ , IMAGE_SYM_TYPE_SHORT
+ , IMAGE_SYM_TYPE_INT
+ , IMAGE_SYM_TYPE_LONG
+ , IMAGE_SYM_TYPE_FLOAT
+ , IMAGE_SYM_TYPE_DOUBLE
+ , IMAGE_SYM_TYPE_STRUCT
+ , IMAGE_SYM_TYPE_UNION
+ , IMAGE_SYM_TYPE_ENUM
+ , IMAGE_SYM_TYPE_MOE
+ , IMAGE_SYM_TYPE_BYTE
+ , IMAGE_SYM_TYPE_WORD
+ , IMAGE_SYM_TYPE_UINT
+ , IMAGE_SYM_TYPE_DWORD
+ ]}
+ , {type: int}
+ ]
+ ComplexType: [ {type: str, enum: [ IMAGE_SYM_DTYPE_NULL
+ , IMAGE_SYM_DTYPE_POINTER
+ , IMAGE_SYM_DTYPE_FUNCTION
+ , IMAGE_SYM_DTYPE_ARRAY
+ ]}
+ , {type: int}
+ ]
+ StorageClass: [ {type: str, enum:
+ [ IMAGE_SYM_CLASS_END_OF_FUNCTION
+ , IMAGE_SYM_CLASS_NULL
+ , IMAGE_SYM_CLASS_AUTOMATIC
+ , IMAGE_SYM_CLASS_EXTERNAL
+ , IMAGE_SYM_CLASS_STATIC
+ , IMAGE_SYM_CLASS_REGISTER
+ , IMAGE_SYM_CLASS_EXTERNAL_DEF
+ , IMAGE_SYM_CLASS_LABEL
+ , IMAGE_SYM_CLASS_UNDEFINED_LABEL
+ , IMAGE_SYM_CLASS_MEMBER_OF_STRUCT
+ , IMAGE_SYM_CLASS_ARGUMENT
+ , IMAGE_SYM_CLASS_STRUCT_TAG
+ , IMAGE_SYM_CLASS_MEMBER_OF_UNION
+ , IMAGE_SYM_CLASS_UNION_TAG
+ , IMAGE_SYM_CLASS_TYPE_DEFINITION
+ , IMAGE_SYM_CLASS_UNDEFINED_STATIC
+ , IMAGE_SYM_CLASS_ENUM_TAG
+ , IMAGE_SYM_CLASS_MEMBER_OF_ENUM
+ , IMAGE_SYM_CLASS_REGISTER_PARAM
+ , IMAGE_SYM_CLASS_BIT_FIELD
+ , IMAGE_SYM_CLASS_BLOCK
+ , IMAGE_SYM_CLASS_FUNCTION
+ , IMAGE_SYM_CLASS_END_OF_STRUCT
+ , IMAGE_SYM_CLASS_FILE
+ , IMAGE_SYM_CLASS_SECTION
+ , IMAGE_SYM_CLASS_WEAK_EXTERNAL
+ , IMAGE_SYM_CLASS_CLR_TOKEN
+ ]}
+ , {type: int}
+ ]
diff --git a/examples/BrainF/BrainF.h b/examples/BrainF/BrainF.h
index add0687..c069feb 100644
--- a/examples/BrainF/BrainF.h
+++ b/examples/BrainF/BrainF.h
@@ -15,9 +15,9 @@
#ifndef BRAINF_H
#define BRAINF_H
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
-#include "llvm/Support/IRBuilder.h"
using namespace llvm;
diff --git a/examples/BrainF/BrainFDriver.cpp b/examples/BrainF/BrainFDriver.cpp
index cdbf02a..58617b7 100644
--- a/examples/BrainF/BrainFDriver.cpp
+++ b/examples/BrainF/BrainFDriver.cpp
@@ -21,7 +21,6 @@
// ./BrainF prog.bf #Write as BitCode
//
// lli prog.bf.bc #Run generated BitCode
-// llvm-ld -native -o=prog prog.bf.bc #Compile BitCode into native executable
//
//===--------------------------------------------------------------------===//
diff --git a/examples/ExceptionDemo/ExceptionDemo.cpp b/examples/ExceptionDemo/ExceptionDemo.cpp
index cf078bb..6dbd662 100644
--- a/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/examples/ExceptionDemo/ExceptionDemo.cpp
@@ -52,6 +52,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Intrinsics.h"
@@ -59,7 +60,6 @@
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/TargetSelect.h"
@@ -1257,7 +1257,7 @@ llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
// FIXME: Redundant storage which, beyond utilizing value of
// caughtResultStore for unwindException storage, may be alleviated
- // alltogether with a block rearrangement
+ // altogether with a block rearrangement
builder.CreateStore(caughtResult, caughtResultStorage);
builder.CreateStore(unwindException, exceptionStorage);
builder.CreateStore(ourExceptionThrownState, exceptionCaughtFlag);
diff --git a/examples/Fibonacci/fibonacci.cpp b/examples/Fibonacci/fibonacci.cpp
index a7d1ca8..cfd9b1e 100644
--- a/examples/Fibonacci/fibonacci.cpp
+++ b/examples/Fibonacci/fibonacci.cpp
@@ -40,7 +40,7 @@ static Function *CreateFibFunction(Module *M, LLVMContext &Context) {
// Create the fib function and insert it into module M. This function is said
// to return an int and take an int parameter.
Function *FibF =
- cast<Function>(M->getOrInsertFunction("fib", Type::getInt32Ty(Context),
+ cast<Function>(M->getOrInsertFunction("fib", Type::getInt32Ty(Context),
Type::getInt32Ty(Context),
(Type *)0));
@@ -94,7 +94,7 @@ int main(int argc, char **argv) {
InitializeNativeTarget();
LLVMContext Context;
-
+
// Create some module to put our function into it.
OwningPtr<Module> M(new Module("test", Context));
@@ -132,6 +132,6 @@ int main(int argc, char **argv) {
// import result of execution
outs() << "Result: " << GV.IntVal << "\n";
-
+
return 0;
}
diff --git a/examples/HowToUseJIT/HowToUseJIT.cpp b/examples/HowToUseJIT/HowToUseJIT.cpp
index 92b2860..5588e92 100644
--- a/examples/HowToUseJIT/HowToUseJIT.cpp
+++ b/examples/HowToUseJIT/HowToUseJIT.cpp
@@ -34,18 +34,18 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
#include "llvm/ExecutionEngine/JIT.h"
#include "llvm/ExecutionEngine/Interpreter.h"
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/IRBuilder.h"
using namespace llvm;
diff --git a/examples/Kaleidoscope/Chapter3/toy.cpp b/examples/Kaleidoscope/Chapter3/toy.cpp
index 33980f5..c1e34b2 100644
--- a/examples/Kaleidoscope/Chapter3/toy.cpp
+++ b/examples/Kaleidoscope/Chapter3/toy.cpp
@@ -1,8 +1,8 @@
#include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/Support/IRBuilder.h"
#include <cstdio>
#include <string>
#include <map>
diff --git a/examples/Kaleidoscope/Chapter4/toy.cpp b/examples/Kaleidoscope/Chapter4/toy.cpp
index 9a28322..cce4466 100644
--- a/examples/Kaleidoscope/Chapter4/toy.cpp
+++ b/examples/Kaleidoscope/Chapter4/toy.cpp
@@ -1,6 +1,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -8,7 +9,6 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
#include <string>
diff --git a/examples/Kaleidoscope/Chapter5/toy.cpp b/examples/Kaleidoscope/Chapter5/toy.cpp
index adfbad5..36dd760 100644
--- a/examples/Kaleidoscope/Chapter5/toy.cpp
+++ b/examples/Kaleidoscope/Chapter5/toy.cpp
@@ -1,6 +1,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -8,7 +9,6 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
#include <string>
diff --git a/examples/Kaleidoscope/Chapter6/toy.cpp b/examples/Kaleidoscope/Chapter6/toy.cpp
index c16d6bd..db3495d 100644
--- a/examples/Kaleidoscope/Chapter6/toy.cpp
+++ b/examples/Kaleidoscope/Chapter6/toy.cpp
@@ -1,6 +1,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -8,7 +9,6 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
#include <string>
diff --git a/examples/Kaleidoscope/Chapter7/toy.cpp b/examples/Kaleidoscope/Chapter7/toy.cpp
index 87b28c3..143b30b 100644
--- a/examples/Kaleidoscope/Chapter7/toy.cpp
+++ b/examples/Kaleidoscope/Chapter7/toy.cpp
@@ -1,6 +1,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
@@ -8,7 +9,6 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
#include <string>
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 7774606..0bd5db3 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -21,9 +21,9 @@
/* Need these includes to support the LLVM 'cast' template for the C++ 'wrap'
and 'unwrap' conversion functions. */
+#include "llvm/IRBuilder.h"
#include "llvm/Module.h"
#include "llvm/PassRegistry.h"
-#include "llvm/Support/IRBuilder.h"
extern "C" {
#endif
@@ -53,7 +53,7 @@ extern "C" {
* The declared parameter names are descriptive and specify which type is
* required. Additionally, each type hierarchy is documented along with the
* functions that operate upon it. For more detail, refer to LLVM's C++ code.
- * If in doubt, refer to Core.cpp, which performs paramter downcasts in the
+ * If in doubt, refer to Core.cpp, which performs parameter downcasts in the
* form unwrap<RequiredType>(Param).
*
* Many exotic languages can interoperate with C code but have a harder time
@@ -106,7 +106,7 @@ typedef struct LLVMOpaqueType *LLVMTypeRef;
typedef struct LLVMOpaqueValue *LLVMValueRef;
/**
- * Represents a basic block of instruction in LLVM IR.
+ * Represents a basic block of instructions in LLVM IR.
*
* This models llvm::BasicBlock.
*/
@@ -478,6 +478,15 @@ void LLVMSetTarget(LLVMModuleRef M, const char *Triple);
void LLVMDumpModule(LLVMModuleRef M);
/**
+ * Print a representation of a module to a file. The ErrorMessage needs to be
+ * disposed with LLVMDisposeMessage. Returns 0 on success, 1 otherwise.
+ *
+ * @see Module::print()
+ */
+LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
+ char **ErrorMessage);
+
+/**
* Set inline assembly for a module.
*
* @see Module::setModuleInlineAsm()
@@ -977,7 +986,7 @@ LLVMTypeRef LLVMX86MMXType(void);
*
* LLVMValueRef essentially represents llvm::Value. There is a rich
* hierarchy of classes within this type. Depending on the instance
- * obtain, not all APIs are available.
+ * obtained, not all APIs are available.
*
* Callers can determine the type of a LLVMValueRef by calling the
* LLVMIsA* family of functions (e.g. LLVMIsAArgument()). These
@@ -1153,7 +1162,7 @@ LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DECLARE_VALUE_CAST)
*
* Uses are obtained in an iterator fashion. First, call this function
* to obtain a reference to the first use. Then, call LLVMGetNextUse()
- * on that instance and all subsequently obtained instances untl
+ * on that instance and all subsequently obtained instances until
* LLVMGetNextUse() returns NULL.
*
* @see llvm::Value::use_begin()
@@ -2106,7 +2115,7 @@ LLVMBasicBlockRef LLVMGetInstructionParent(LLVMValueRef Inst);
LLVMValueRef LLVMGetNextInstruction(LLVMValueRef Inst);
/**
- * Obtain the instruction that occured before this one.
+ * Obtain the instruction that occurred before this one.
*
* If the instruction is the first instruction in a basic block, NULL
* will be returned.
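
A minimal sketch of the LLVMPrintModuleToFile entry point added above; the
helper name, the module M, and the output path are assumptions for
illustration:

  #include <llvm-c/Core.h>
  #include <stdio.h>

  /* Sketch: write a textual representation of M to "out.ll".
   * A non-zero return signals failure, and the error string must be
   * released with LLVMDisposeMessage. */
  static int writeModule(LLVMModuleRef M) {
    char *ErrorMessage = NULL;
    if (LLVMPrintModuleToFile(M, "out.ll", &ErrorMessage)) {
      fprintf(stderr, "error: %s\n", ErrorMessage);
      LLVMDisposeMessage(ErrorMessage);
      return 1;
    }
    return 0;
  }
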
diff --git a/include/llvm-c/Disassembler.h b/include/llvm-c/Disassembler.h
index a676e37..69fdc64 100644
--- a/include/llvm-c/Disassembler.h
+++ b/include/llvm-c/Disassembler.h
@@ -109,9 +109,9 @@ struct LLVMOpInfo1 {
*/
typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
uint64_t ReferenceValue,
- uint64_t *ReferenceType,
- uint64_t ReferencePC,
- const char **ReferenceName);
+ uint64_t *ReferenceType,
+ uint64_t ReferencePC,
+ const char **ReferenceName);
/**
* The reference types on input and output.
*/
diff --git a/include/llvm-c/Linker.h b/include/llvm-c/Linker.h
new file mode 100644
index 0000000..9f337cf
--- /dev/null
+++ b/include/llvm-c/Linker.h
@@ -0,0 +1,42 @@
+/*===-- llvm-c/Linker.h - Module Linker C Interface -------------*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file defines the C interface to the module/file/archive linker. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_LINKER_H
+#define LLVM_C_LINKER_H
+
+#include "llvm-c/Core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef enum {
+ LLVMLinkerDestroySource = 0, /* Allow source module to be destroyed. */
+ LLVMLinkerPreserveSource = 1 /* Preserve the source module. */
+} LLVMLinkerMode;
+
+
+/* Links the source module into the destination module, taking ownership
+ * of the source module away from the caller. Optionally returns a
+ * human-readable description of any errors that occurred in linking.
+ * OutMessage must be disposed with LLVMDisposeMessage. The return value
+ * is true if an error occurred, false otherwise. */
+LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
+ LLVMLinkerMode Mode, char **OutMessage);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
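
A minimal usage sketch for the new C linker interface; the two module refs
and the helper name are assumed for illustration:

  #include <llvm-c/Linker.h>
  #include <stdio.h>

  /* Sketch: link Src into Dest. LLVMLinkerDestroySource hands ownership
   * of Src to the linker; a true return value indicates an error. */
  static int linkInto(LLVMModuleRef Dest, LLVMModuleRef Src) {
    char *OutMessage = NULL;
    if (LLVMLinkModules(Dest, Src, LLVMLinkerDestroySource, &OutMessage)) {
      fprintf(stderr, "link error: %s\n", OutMessage);
      LLVMDisposeMessage(OutMessage); /* caller must dispose the message */
      return 1;
    }
    return 0;
  }
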
diff --git a/include/llvm-c/Target.h b/include/llvm-c/Target.h
index 568e60d..8915040 100644
--- a/include/llvm-c/Target.h
+++ b/include/llvm-c/Target.h
@@ -56,19 +56,19 @@ typedef struct LLVMStructLayout *LLVMStructLayoutRef;
/* Declare all of the available assembly printer initialization functions. */
#define LLVM_ASM_PRINTER(TargetName) \
- void LLVMInitialize##TargetName##AsmPrinter();
+ void LLVMInitialize##TargetName##AsmPrinter(void);
#include "llvm/Config/AsmPrinters.def"
#undef LLVM_ASM_PRINTER /* Explicit undef to make SWIG happier */
/* Declare all of the available assembly parser initialization functions. */
#define LLVM_ASM_PARSER(TargetName) \
- void LLVMInitialize##TargetName##AsmParser();
+ void LLVMInitialize##TargetName##AsmParser(void);
#include "llvm/Config/AsmParsers.def"
#undef LLVM_ASM_PARSER /* Explicit undef to make SWIG happier */
/* Declare all of the available disassembler initialization functions. */
#define LLVM_DISASSEMBLER(TargetName) \
- void LLVMInitialize##TargetName##Disassembler();
+ void LLVMInitialize##TargetName##Disassembler(void);
#include "llvm/Config/Disassemblers.def"
#undef LLVM_DISASSEMBLER /* Explicit undef to make SWIG happier */
@@ -102,7 +102,7 @@ static inline void LLVMInitializeAllTargetMCs(void) {
/** LLVMInitializeAllAsmPrinters - The main program should call this function if
it wants all asm printers that LLVM is configured to support, to make them
available via the TargetRegistry. */
-static inline void LLVMInitializeAllAsmPrinters() {
+static inline void LLVMInitializeAllAsmPrinters(void) {
#define LLVM_ASM_PRINTER(TargetName) LLVMInitialize##TargetName##AsmPrinter();
#include "llvm/Config/AsmPrinters.def"
#undef LLVM_ASM_PRINTER /* Explicit undef to make SWIG happier */
@@ -111,7 +111,7 @@ static inline void LLVMInitializeAllAsmPrinters() {
/** LLVMInitializeAllAsmParsers - The main program should call this function if
it wants all asm parsers that LLVM is configured to support, to make them
available via the TargetRegistry. */
-static inline void LLVMInitializeAllAsmParsers() {
+static inline void LLVMInitializeAllAsmParsers(void) {
#define LLVM_ASM_PARSER(TargetName) LLVMInitialize##TargetName##AsmParser();
#include "llvm/Config/AsmParsers.def"
#undef LLVM_ASM_PARSER /* Explicit undef to make SWIG happier */
@@ -120,7 +120,7 @@ static inline void LLVMInitializeAllAsmParsers() {
/** LLVMInitializeAllDisassemblers - The main program should call this function
if it wants all disassemblers that LLVM is configured to support, to make
them available via the TargetRegistry. */
-static inline void LLVMInitializeAllDisassemblers() {
+static inline void LLVMInitializeAllDisassemblers(void) {
#define LLVM_DISASSEMBLER(TargetName) \
LLVMInitialize##TargetName##Disassembler();
#include "llvm/Config/Disassemblers.def"
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 2b466f9..5a625a4 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -274,6 +274,7 @@ namespace llvm {
/* C fmod, or llvm frem. */
opStatus mod(const APFloat &, roundingMode);
opStatus fusedMultiplyAdd(const APFloat &, const APFloat &, roundingMode);
+ opStatus roundToIntegral(roundingMode);
/* Sign operations. */
void changeSign();
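
A sketch of the new APFloat::roundToIntegral; the value and rounding mode are
chosen for illustration:

  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  // Sketch: round to an integral value under round-to-nearest-ties-to-even;
  // 2.5 rounds to 2.0 because ties go to the even neighbor.
  void demo() {
    APFloat F(2.5);
    F.roundToIntegral(APFloat::rmNearestTiesToEven); // F is now 2.0
  }
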
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 4101989..f30a6e3 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -16,6 +16,7 @@
#define LLVM_APINT_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <climits>
@@ -273,6 +274,13 @@ public:
initSlowCase(that);
}
+#if LLVM_USE_RVALUE_REFERENCES
+ /// @brief Move Constructor.
+ APInt(APInt&& that) : BitWidth(that.BitWidth), VAL(that.VAL) {
+ that.BitWidth = 0;
+ }
+#endif
+
/// @brief Destructor.
~APInt() {
if (!isSingleWord())
@@ -349,13 +357,7 @@ public:
/// @brief Check if this APInt has an N-bits unsigned integer value.
bool isIntN(unsigned N) const {
assert(N && "N == 0 ???");
- if (N >= getBitWidth())
- return true;
-
- if (isSingleWord())
- return isUIntN(N, VAL);
- return APInt(N, makeArrayRef(pVal, getNumWords())).zext(getBitWidth())
- == (*this);
+ return getActiveBits() <= N;
}
/// @brief Check if this APInt has an N-bits signed integer value.
@@ -503,6 +505,18 @@ public:
return getAllOnesValue(numBits).lshr(numBits - loBitsSet);
}
+ /// \brief Determine if two APInts have the same value, after zero-extending
+ /// one of them (if needed!) to ensure that the bit-widths match.
+ static bool isSameValue(const APInt &I1, const APInt &I2) {
+ if (I1.getBitWidth() == I2.getBitWidth())
+ return I1 == I2;
+
+ if (I1.getBitWidth() > I2.getBitWidth())
+ return I1 == I2.zext(I1.getBitWidth());
+
+ return I1.zext(I2.getBitWidth()) == I2;
+ }
+
/// \brief Overload to compute a hash_code for an APInt value.
friend hash_code hash_value(const APInt &Arg);
@@ -587,6 +601,21 @@ public:
return AssignSlowCase(RHS);
}
+#if LLVM_USE_RVALUE_REFERENCES
+ /// @brief Move assignment operator.
+ APInt& operator=(APInt&& that) {
+ if (!isSingleWord())
+ delete [] pVal;
+
+ BitWidth = that.BitWidth;
+ VAL = that.VAL;
+
+ that.BitWidth = 0;
+
+ return *this;
+ }
+#endif
+
/// The RHS value is assigned to *this. If the significant bits in RHS exceed
/// the bit width, the excess bits are truncated. If the bit width is larger
/// than 64, the value is zero filled in the unspecified high order bits.
@@ -817,9 +846,10 @@ public:
if (LHS.isNegative()) {
if (RHS.isNegative())
APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
- else
+ else {
APInt::udivrem(-LHS, RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient = -Quotient;
+ }
Remainder = -Remainder;
} else if (RHS.isNegative()) {
APInt::udivrem(LHS, -RHS, Quotient, Remainder);
@@ -1087,7 +1117,7 @@ public:
else {
// Set all the bits in all the words.
for (unsigned i = 0; i < getNumWords(); ++i)
- pVal[i] = -1ULL;
+ pVal[i] = -1ULL;
}
// Clear the unused ones
clearUnusedBits();
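
A sketch of the new APInt::isSameValue, which zero-extends the narrower
operand before comparing; the widths and values are illustrative:

  #include "llvm/ADT/APInt.h"
  #include <cassert>
  using namespace llvm;

  // Sketch: values compare equal across bit widths once the narrower
  // operand is zero-extended.
  void demo() {
    APInt A(8, 200);                                // 8-bit 200
    APInt B(16, 200);                               // 16-bit 200
    assert(APInt::isSameValue(A, B));               // equal after zext
    assert(!APInt::isSameValue(A, APInt(16, 456))); // values differ
  }
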
diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h
index 54a7b60..048c65c 100644
--- a/include/llvm/ADT/APSInt.h
+++ b/include/llvm/ADT/APSInt.h
@@ -135,6 +135,19 @@ public:
assert(IsUnsigned == RHS.IsUnsigned && "Signedness mismatch!");
return IsUnsigned ? uge(RHS) : sge(RHS);
}
+ inline bool operator==(const APSInt& RHS) const {
+ assert(IsUnsigned == RHS.IsUnsigned && "Signedness mismatch!");
+ return eq(RHS);
+ }
+ inline bool operator==(int64_t RHS) const {
+ return isSameValue(*this, APSInt(APInt(64, RHS), true));
+ }
+ inline bool operator!=(const APSInt& RHS) const {
+ return !((*this) == RHS);
+ }
+ inline bool operator!=(int64_t RHS) const {
+ return !((*this) == RHS);
+ }
// The remaining operators just wrap the logic of APInt, but retain the
// signedness information.
@@ -250,17 +263,50 @@ public:
: APInt::getSignedMinValue(numBits), Unsigned);
}
+ /// \brief Determine if two APSInts have the same value, zero- or
+ /// sign-extending as needed.
+ static bool isSameValue(const APSInt &I1, const APSInt &I2) {
+ if (I1.getBitWidth() == I2.getBitWidth() && I1.isSigned() == I2.isSigned())
+ return I1 == I2;
+
+ // Check for a bit-width mismatch.
+ if (I1.getBitWidth() > I2.getBitWidth())
+ return isSameValue(I1, I2.extend(I1.getBitWidth()));
+ else if (I2.getBitWidth() > I1.getBitWidth())
+ return isSameValue(I1.extend(I2.getBitWidth()), I2);
+
+ // We have a signedness mismatch. Turn the signed value into an unsigned
+ // value.
+ if (I1.isSigned()) {
+ if (I1.isNegative())
+ return false;
+
+ return APSInt(I1, true) == I2;
+ }
+
+ if (I2.isNegative())
+ return false;
+
+ return I1 == APSInt(I2, true);
+ }
+
/// Profile - Used to insert APSInt objects, or objects that contain APSInt
/// objects, into FoldingSets.
void Profile(FoldingSetNodeID& ID) const;
};
+inline bool operator==(int64_t V1, const APSInt& V2) {
+ return V2 == V1;
+}
+inline bool operator!=(int64_t V1, const APSInt& V2) {
+ return V2 != V1;
+}
+
inline raw_ostream &operator<<(raw_ostream &OS, const APSInt &I) {
I.print(OS, I.isSigned());
return OS;
}
-
} // end namespace llvm
#endif
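
A sketch of the new APSInt comparisons, which reconcile bit width and
signedness before comparing; all values are illustrative:

  #include "llvm/ADT/APSInt.h"
  #include <cassert>
  using namespace llvm;

  // Sketch: the int64_t overloads and isSameValue compare values,
  // not raw bit patterns.
  void demo() {
    APSInt U(APInt(8, 255), /*isUnsigned=*/true);                    // 255
    APSInt S(APInt(8, -1, /*isSigned=*/true), /*isUnsigned=*/false); // -1
    assert(U == 255);                   // new int64_t overload
    assert(!APSInt::isSameValue(S, U)); // -1 and 255 differ as values
  }
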
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index f4c8e55..cf55aad 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -60,7 +60,7 @@ namespace llvm {
: Data(begin), Length(end - begin) {}
/// Construct an ArrayRef from a SmallVector.
- /*implicit*/ ArrayRef(const SmallVectorImpl<T> &Vec)
+ /*implicit*/ ArrayRef(const SmallVectorTemplateCommon<T> &Vec)
: Data(Vec.data()), Length(Vec.size()) {}
/// Construct an ArrayRef from a std::vector.
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index 7e0b5ba..3e2e5f2 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -14,6 +14,7 @@
#ifndef LLVM_ADT_BITVECTOR_H
#define LLVM_ADT_BITVECTOR_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
@@ -97,6 +98,13 @@ public:
std::memcpy(Bits, RHS.Bits, Capacity * sizeof(BitWord));
}
+#if LLVM_USE_RVALUE_REFERENCES
+ BitVector(BitVector &&RHS)
+ : Bits(RHS.Bits), Size(RHS.Size), Capacity(RHS.Capacity) {
+ RHS.Bits = 0;
+ }
+#endif
+
~BitVector() {
std::free(Bits);
}
@@ -251,11 +259,6 @@ public:
return *this;
}
- // No argument flip.
- BitVector operator~() const {
- return BitVector(*this).flip();
- }
-
// Indexing.
reference operator[](unsigned Idx) {
assert (Idx < Size && "Out-of-bounds Bit access.");
@@ -272,6 +275,16 @@ public:
return (*this)[Idx];
}
+ /// Test if any common bits are set.
+ bool anyCommon(const BitVector &RHS) const {
+ unsigned ThisWords = NumBitWords(size());
+ unsigned RHSWords = NumBitWords(RHS.size());
+ for (unsigned i = 0, e = std::min(ThisWords, RHSWords); i != e; ++i)
+ if (Bits[i] & RHS.Bits[i])
+ return true;
+ return false;
+ }
+
// Comparison operators.
bool operator==(const BitVector &RHS) const {
unsigned ThisWords = NumBitWords(size());
@@ -366,6 +379,21 @@ public:
return *this;
}
+#if LLVM_USE_RVALUE_REFERENCES
+ const BitVector &operator=(BitVector &&RHS) {
+ if (this == &RHS) return *this;
+
+ std::free(Bits);
+ Bits = RHS.Bits;
+ Size = RHS.Size;
+ Capacity = RHS.Capacity;
+
+ RHS.Bits = 0;
+
+ return *this;
+ }
+#endif
+
void swap(BitVector &RHS) {
std::swap(Bits, RHS.Bits);
std::swap(Size, RHS.Size);
@@ -472,24 +500,6 @@ private:
}
};
-inline BitVector operator&(const BitVector &LHS, const BitVector &RHS) {
- BitVector Result(LHS);
- Result &= RHS;
- return Result;
-}
-
-inline BitVector operator|(const BitVector &LHS, const BitVector &RHS) {
- BitVector Result(LHS);
- Result |= RHS;
- return Result;
-}
-
-inline BitVector operator^(const BitVector &LHS, const BitVector &RHS) {
- BitVector Result(LHS);
- Result ^= RHS;
- return Result;
-}
-
} // End llvm namespace
namespace std {
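
With the non-member operator&/|/^ overloads removed above, overlap tests no
longer need a temporary vector; a sketch of the replacement idiom:

  #include "llvm/ADT/BitVector.h"
  using namespace llvm;

  // Sketch: anyCommon() answers "do A and B share a set bit?" by scanning
  // words in place, where (A & B).any() previously allocated a temporary.
  bool overlaps(const BitVector &A, const BitVector &B) {
    return A.anyCommon(B);
  }
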
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index 8d4a19d..65a70fb 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -14,6 +14,8 @@
#ifndef LLVM_ADT_DENSEMAP_H
#define LLVM_ADT_DENSEMAP_H
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/AlignOf.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/PointerLikeTypeTraits.h"
#include "llvm/Support/type_traits.h"
@@ -23,6 +25,7 @@
#include <new>
#include <utility>
#include <cassert>
+#include <climits>
#include <cstddef>
#include <cstring>
@@ -33,116 +36,83 @@ template<typename KeyT, typename ValueT,
bool IsConst = false>
class DenseMapIterator;
-template<typename KeyT, typename ValueT,
- typename KeyInfoT = DenseMapInfo<KeyT> >
-class DenseMap {
+template<typename DerivedT,
+ typename KeyT, typename ValueT, typename KeyInfoT>
+class DenseMapBase {
+protected:
typedef std::pair<KeyT, ValueT> BucketT;
- unsigned NumBuckets;
- BucketT *Buckets;
- unsigned NumEntries;
- unsigned NumTombstones;
public:
typedef KeyT key_type;
typedef ValueT mapped_type;
typedef BucketT value_type;
- DenseMap(const DenseMap &other) {
- NumBuckets = 0;
- CopyFrom(other);
- }
-
- explicit DenseMap(unsigned NumInitBuckets = 0) {
- init(NumInitBuckets);
- }
-
- template<typename InputIt>
- DenseMap(const InputIt &I, const InputIt &E) {
- init(NextPowerOf2(std::distance(I, E)));
- insert(I, E);
- }
-
- ~DenseMap() {
- const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
- for (BucketT *P = Buckets, *E = Buckets+NumBuckets; P != E; ++P) {
- if (!KeyInfoT::isEqual(P->first, EmptyKey) &&
- !KeyInfoT::isEqual(P->first, TombstoneKey))
- P->second.~ValueT();
- P->first.~KeyT();
- }
-#ifndef NDEBUG
- if (NumBuckets)
- memset((void*)Buckets, 0x5a, sizeof(BucketT)*NumBuckets);
-#endif
- operator delete(Buckets);
- }
-
typedef DenseMapIterator<KeyT, ValueT, KeyInfoT> iterator;
typedef DenseMapIterator<KeyT, ValueT,
KeyInfoT, true> const_iterator;
inline iterator begin() {
// When the map is empty, avoid the overhead of AdvancePastEmptyBuckets().
- return empty() ? end() : iterator(Buckets, Buckets+NumBuckets);
+ return empty() ? end() : iterator(getBuckets(), getBucketsEnd());
}
inline iterator end() {
- return iterator(Buckets+NumBuckets, Buckets+NumBuckets, true);
+ return iterator(getBucketsEnd(), getBucketsEnd(), true);
}
inline const_iterator begin() const {
- return empty() ? end() : const_iterator(Buckets, Buckets+NumBuckets);
+ return empty() ? end() : const_iterator(getBuckets(), getBucketsEnd());
}
inline const_iterator end() const {
- return const_iterator(Buckets+NumBuckets, Buckets+NumBuckets, true);
+ return const_iterator(getBucketsEnd(), getBucketsEnd(), true);
}
- bool empty() const { return NumEntries == 0; }
- unsigned size() const { return NumEntries; }
+ bool empty() const { return getNumEntries() == 0; }
+ unsigned size() const { return getNumEntries(); }
/// Grow the densemap so that it has at least Size buckets. Does not shrink
void resize(size_t Size) {
- if (Size > NumBuckets)
+ if (Size > getNumBuckets())
grow(Size);
}
void clear() {
- if (NumEntries == 0 && NumTombstones == 0) return;
+ if (getNumEntries() == 0 && getNumTombstones() == 0) return;
// If the capacity of the array is huge, and the # elements used is small,
// shrink the array.
- if (NumEntries * 4 < NumBuckets && NumBuckets > 64) {
+ if (getNumEntries() * 4 < getNumBuckets() && getNumBuckets() > 64) {
shrink_and_clear();
return;
}
const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
- for (BucketT *P = Buckets, *E = Buckets+NumBuckets; P != E; ++P) {
+ for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
if (!KeyInfoT::isEqual(P->first, EmptyKey)) {
if (!KeyInfoT::isEqual(P->first, TombstoneKey)) {
P->second.~ValueT();
- --NumEntries;
+ decrementNumEntries();
}
P->first = EmptyKey;
}
}
- assert(NumEntries == 0 && "Node count imbalance!");
- NumTombstones = 0;
+ assert(getNumEntries() == 0 && "Node count imbalance!");
+ setNumTombstones(0);
}
/// count - Return true if the specified key is in the map.
bool count(const KeyT &Val) const {
- BucketT *TheBucket;
+ const BucketT *TheBucket;
return LookupBucketFor(Val, TheBucket);
}
iterator find(const KeyT &Val) {
BucketT *TheBucket;
if (LookupBucketFor(Val, TheBucket))
- return iterator(TheBucket, Buckets+NumBuckets, true);
+ return iterator(TheBucket, getBucketsEnd(), true);
return end();
}
const_iterator find(const KeyT &Val) const {
- BucketT *TheBucket;
+ const BucketT *TheBucket;
if (LookupBucketFor(Val, TheBucket))
- return const_iterator(TheBucket, Buckets+NumBuckets, true);
+ return const_iterator(TheBucket, getBucketsEnd(), true);
return end();
}
@@ -155,21 +125,21 @@ public:
iterator find_as(const LookupKeyT &Val) {
BucketT *TheBucket;
if (LookupBucketFor(Val, TheBucket))
- return iterator(TheBucket, Buckets+NumBuckets, true);
+ return iterator(TheBucket, getBucketsEnd(), true);
return end();
}
template<class LookupKeyT>
const_iterator find_as(const LookupKeyT &Val) const {
- BucketT *TheBucket;
+ const BucketT *TheBucket;
if (LookupBucketFor(Val, TheBucket))
- return const_iterator(TheBucket, Buckets+NumBuckets, true);
+ return const_iterator(TheBucket, getBucketsEnd(), true);
return end();
}
/// lookup - Return the entry for the specified key, or a default
/// constructed value if no such entry exists.
ValueT lookup(const KeyT &Val) const {
- BucketT *TheBucket;
+ const BucketT *TheBucket;
if (LookupBucketFor(Val, TheBucket))
return TheBucket->second;
return ValueT();
@@ -181,12 +151,12 @@ public:
std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &KV) {
BucketT *TheBucket;
if (LookupBucketFor(KV.first, TheBucket))
- return std::make_pair(iterator(TheBucket, Buckets+NumBuckets, true),
+ return std::make_pair(iterator(TheBucket, getBucketsEnd(), true),
false); // Already in map.
// Otherwise, insert the new element.
TheBucket = InsertIntoBucket(KV.first, KV.second, TheBucket);
- return std::make_pair(iterator(TheBucket, Buckets+NumBuckets, true), true);
+ return std::make_pair(iterator(TheBucket, getBucketsEnd(), true), true);
}
/// insert - Range insertion of pairs.
@@ -204,23 +174,16 @@ public:
TheBucket->second.~ValueT();
TheBucket->first = getTombstoneKey();
- --NumEntries;
- ++NumTombstones;
+ decrementNumEntries();
+ incrementNumTombstones();
return true;
}
void erase(iterator I) {
BucketT *TheBucket = &*I;
TheBucket->second.~ValueT();
TheBucket->first = getTombstoneKey();
- --NumEntries;
- ++NumTombstones;
- }
-
- void swap(DenseMap& RHS) {
- std::swap(NumBuckets, RHS.NumBuckets);
- std::swap(Buckets, RHS.Buckets);
- std::swap(NumEntries, RHS.NumEntries);
- std::swap(NumTombstones, RHS.NumTombstones);
+ decrementNumEntries();
+ incrementNumTombstones();
}
value_type& FindAndConstruct(const KeyT &Key) {
@@ -235,68 +198,211 @@ public:
return FindAndConstruct(Key).second;
}
- DenseMap& operator=(const DenseMap& other) {
- CopyFrom(other);
- return *this;
+#if LLVM_USE_RVALUE_REFERENCES
+ value_type& FindAndConstruct(KeyT &&Key) {
+ BucketT *TheBucket;
+ if (LookupBucketFor(Key, TheBucket))
+ return *TheBucket;
+
+ return *InsertIntoBucket(Key, ValueT(), TheBucket);
}
+ ValueT &operator[](KeyT &&Key) {
+ return FindAndConstruct(Key).second;
+ }
+#endif
+
/// isPointerIntoBucketsArray - Return true if the specified pointer points
/// somewhere into the DenseMap's array of buckets (i.e. either to a key or
/// value in the DenseMap).
bool isPointerIntoBucketsArray(const void *Ptr) const {
- return Ptr >= Buckets && Ptr < Buckets+NumBuckets;
+ return Ptr >= getBuckets() && Ptr < getBucketsEnd();
}
/// getPointerIntoBucketsArray() - Return an opaque pointer into the buckets
/// array. In conjunction with the previous method, this can be used to
/// determine whether an insertion caused the DenseMap to reallocate.
- const void *getPointerIntoBucketsArray() const { return Buckets; }
+ const void *getPointerIntoBucketsArray() const { return getBuckets(); }
-private:
- void CopyFrom(const DenseMap& other) {
- if (NumBuckets != 0 &&
- (!isPodLike<KeyT>::value || !isPodLike<ValueT>::value)) {
- const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
- for (BucketT *P = Buckets, *E = Buckets+NumBuckets; P != E; ++P) {
- if (!KeyInfoT::isEqual(P->first, EmptyKey) &&
- !KeyInfoT::isEqual(P->first, TombstoneKey))
- P->second.~ValueT();
- P->first.~KeyT();
- }
- }
+protected:
+ DenseMapBase() {}
- NumEntries = other.NumEntries;
- NumTombstones = other.NumTombstones;
+ void destroyAll() {
+ if (getNumBuckets() == 0) // Nothing to do.
+ return;
+
+ const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+ for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
+ if (!KeyInfoT::isEqual(P->first, EmptyKey) &&
+ !KeyInfoT::isEqual(P->first, TombstoneKey))
+ P->second.~ValueT();
+ P->first.~KeyT();
+ }
- if (NumBuckets) {
#ifndef NDEBUG
- memset((void*)Buckets, 0x5a, sizeof(BucketT)*NumBuckets);
+ memset((void*)getBuckets(), 0x5a, sizeof(BucketT)*getNumBuckets());
#endif
- operator delete(Buckets);
- }
+ }
- NumBuckets = other.NumBuckets;
+ void initEmpty() {
+ setNumEntries(0);
+ setNumTombstones(0);
- if (NumBuckets == 0) {
- Buckets = 0;
- return;
+ assert((getNumBuckets() & (getNumBuckets()-1)) == 0 &&
+ "# initial buckets must be a power of two!");
+ const KeyT EmptyKey = getEmptyKey();
+ for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B)
+ new (&B->first) KeyT(EmptyKey);
+ }
+
+ void moveFromOldBuckets(BucketT *OldBucketsBegin, BucketT *OldBucketsEnd) {
+ initEmpty();
+
+ // Insert all the old elements.
+ const KeyT EmptyKey = getEmptyKey();
+ const KeyT TombstoneKey = getTombstoneKey();
+ for (BucketT *B = OldBucketsBegin, *E = OldBucketsEnd; B != E; ++B) {
+ if (!KeyInfoT::isEqual(B->first, EmptyKey) &&
+ !KeyInfoT::isEqual(B->first, TombstoneKey)) {
+ // Insert the key/value into the new table.
+ BucketT *DestBucket;
+ bool FoundVal = LookupBucketFor(B->first, DestBucket);
+ (void)FoundVal; // silence warning.
+ assert(!FoundVal && "Key already in new map?");
+ DestBucket->first = llvm_move(B->first);
+ new (&DestBucket->second) ValueT(llvm_move(B->second));
+ incrementNumEntries();
+
+ // Free the value.
+ B->second.~ValueT();
+ }
+ B->first.~KeyT();
}
- Buckets = static_cast<BucketT*>(operator new(sizeof(BucketT) * NumBuckets));
+#ifndef NDEBUG
+ if (OldBucketsBegin != OldBucketsEnd)
+ memset((void*)OldBucketsBegin, 0x5a,
+ sizeof(BucketT) * (OldBucketsEnd - OldBucketsBegin));
+#endif
+ }
+
+ template <typename OtherBaseT>
+ void copyFrom(const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT>& other) {
+ assert(getNumBuckets() == other.getNumBuckets());
+
+ setNumEntries(other.getNumEntries());
+ setNumTombstones(other.getNumTombstones());
if (isPodLike<KeyT>::value && isPodLike<ValueT>::value)
- memcpy(Buckets, other.Buckets, NumBuckets * sizeof(BucketT));
+ memcpy(getBuckets(), other.getBuckets(),
+ getNumBuckets() * sizeof(BucketT));
else
- for (size_t i = 0; i < NumBuckets; ++i) {
- new (&Buckets[i].first) KeyT(other.Buckets[i].first);
- if (!KeyInfoT::isEqual(Buckets[i].first, getEmptyKey()) &&
- !KeyInfoT::isEqual(Buckets[i].first, getTombstoneKey()))
- new (&Buckets[i].second) ValueT(other.Buckets[i].second);
+ for (size_t i = 0; i < getNumBuckets(); ++i) {
+ new (&getBuckets()[i].first) KeyT(other.getBuckets()[i].first);
+ if (!KeyInfoT::isEqual(getBuckets()[i].first, getEmptyKey()) &&
+ !KeyInfoT::isEqual(getBuckets()[i].first, getTombstoneKey()))
+ new (&getBuckets()[i].second) ValueT(other.getBuckets()[i].second);
}
}
+ void swap(DenseMapBase& RHS) {
+ std::swap(getNumEntries(), RHS.getNumEntries());
+ std::swap(getNumTombstones(), RHS.getNumTombstones());
+ }
+
+ static unsigned getHashValue(const KeyT &Val) {
+ return KeyInfoT::getHashValue(Val);
+ }
+ template<typename LookupKeyT>
+ static unsigned getHashValue(const LookupKeyT &Val) {
+ return KeyInfoT::getHashValue(Val);
+ }
+ static const KeyT getEmptyKey() {
+ return KeyInfoT::getEmptyKey();
+ }
+ static const KeyT getTombstoneKey() {
+ return KeyInfoT::getTombstoneKey();
+ }
+
+private:
+ unsigned getNumEntries() const {
+ return static_cast<const DerivedT *>(this)->getNumEntries();
+ }
+ void setNumEntries(unsigned Num) {
+ static_cast<DerivedT *>(this)->setNumEntries(Num);
+ }
+ void incrementNumEntries() {
+ setNumEntries(getNumEntries() + 1);
+ }
+ void decrementNumEntries() {
+ setNumEntries(getNumEntries() - 1);
+ }
+ unsigned getNumTombstones() const {
+ return static_cast<const DerivedT *>(this)->getNumTombstones();
+ }
+ void setNumTombstones(unsigned Num) {
+ static_cast<DerivedT *>(this)->setNumTombstones(Num);
+ }
+ void incrementNumTombstones() {
+ setNumTombstones(getNumTombstones() + 1);
+ }
+ void decrementNumTombstones() {
+ setNumTombstones(getNumTombstones() - 1);
+ }
+ const BucketT *getBuckets() const {
+ return static_cast<const DerivedT *>(this)->getBuckets();
+ }
+ BucketT *getBuckets() {
+ return static_cast<DerivedT *>(this)->getBuckets();
+ }
+ unsigned getNumBuckets() const {
+ return static_cast<const DerivedT *>(this)->getNumBuckets();
+ }
+ BucketT *getBucketsEnd() {
+ return getBuckets() + getNumBuckets();
+ }
+ const BucketT *getBucketsEnd() const {
+ return getBuckets() + getNumBuckets();
+ }
+
+ void grow(unsigned AtLeast) {
+ static_cast<DerivedT *>(this)->grow(AtLeast);
+ }
+
+ void shrink_and_clear() {
+ static_cast<DerivedT *>(this)->shrink_and_clear();
+ }
+
+
BucketT *InsertIntoBucket(const KeyT &Key, const ValueT &Value,
BucketT *TheBucket) {
+ TheBucket = InsertIntoBucketImpl(Key, TheBucket);
+
+ TheBucket->first = Key;
+ new (&TheBucket->second) ValueT(Value);
+ return TheBucket;
+ }
+
+#if LLVM_USE_RVALUE_REFERENCES
+ BucketT *InsertIntoBucket(const KeyT &Key, ValueT &&Value,
+ BucketT *TheBucket) {
+ TheBucket = InsertIntoBucketImpl(Key, TheBucket);
+
+ TheBucket->first = Key;
+ new (&TheBucket->second) ValueT(std::move(Value));
+ return TheBucket;
+ }
+
+ BucketT *InsertIntoBucket(KeyT &&Key, ValueT &&Value, BucketT *TheBucket) {
+ TheBucket = InsertIntoBucketImpl(Key, TheBucket);
+
+ TheBucket->first = std::move(Key);
+ new (&TheBucket->second) ValueT(std::move(Value));
+ return TheBucket;
+ }
+#endif
+
+ BucketT *InsertIntoBucketImpl(const KeyT &Key, BucketT *TheBucket) {
// If the load of the hash table is more than 3/4, or if fewer than 1/8 of
// the buckets are empty (meaning that many are filled with tombstones),
// grow the table.
@@ -306,48 +412,38 @@ private:
// probe almost the entire table until it found the empty bucket. If the
// table completely filled with tombstones, no lookup would ever succeed,
// causing infinite loops in lookup.
- ++NumEntries;
- if (NumEntries*4 >= NumBuckets*3) {
+ unsigned NewNumEntries = getNumEntries() + 1;
+ unsigned NumBuckets = getNumBuckets();
+ if (NewNumEntries*4 >= NumBuckets*3) {
this->grow(NumBuckets * 2);
LookupBucketFor(Key, TheBucket);
+ NumBuckets = getNumBuckets();
}
- if (NumBuckets-(NumEntries+NumTombstones) < NumBuckets/8) {
+ if (NumBuckets-(NewNumEntries+getNumTombstones()) <= NumBuckets/8) {
this->grow(NumBuckets);
LookupBucketFor(Key, TheBucket);
}
+ // Only update the state after we've grown our bucket space appropriately
+ // so that when growing buckets we have self-consistent entry count.
+ incrementNumEntries();
+
// If we are writing over a tombstone, remember this.
if (!KeyInfoT::isEqual(TheBucket->first, getEmptyKey()))
- --NumTombstones;
+ decrementNumTombstones();
- TheBucket->first = Key;
- new (&TheBucket->second) ValueT(Value);
return TheBucket;
}
- static unsigned getHashValue(const KeyT &Val) {
- return KeyInfoT::getHashValue(Val);
- }
- template<typename LookupKeyT>
- static unsigned getHashValue(const LookupKeyT &Val) {
- return KeyInfoT::getHashValue(Val);
- }
- static const KeyT getEmptyKey() {
- return KeyInfoT::getEmptyKey();
- }
- static const KeyT getTombstoneKey() {
- return KeyInfoT::getTombstoneKey();
- }
-
/// LookupBucketFor - Lookup the appropriate bucket for Val, returning it in
/// FoundBucket. If the bucket contains the key and a value, this returns
/// true, otherwise it returns a bucket with an empty marker or tombstone and
/// returns false.
template<typename LookupKeyT>
- bool LookupBucketFor(const LookupKeyT &Val, BucketT *&FoundBucket) const {
- unsigned BucketNo = getHashValue(Val);
- unsigned ProbeAmt = 1;
- BucketT *BucketsPtr = Buckets;
+ bool LookupBucketFor(const LookupKeyT &Val,
+ const BucketT *&FoundBucket) const {
+ const BucketT *BucketsPtr = getBuckets();
+ const unsigned NumBuckets = getNumBuckets();
if (NumBuckets == 0) {
FoundBucket = 0;
@@ -355,15 +451,17 @@ private:
}
// FoundTombstone - Keep track of whether we find a tombstone while probing.
- BucketT *FoundTombstone = 0;
+ const BucketT *FoundTombstone = 0;
const KeyT EmptyKey = getEmptyKey();
const KeyT TombstoneKey = getTombstoneKey();
assert(!KeyInfoT::isEqual(Val, EmptyKey) &&
!KeyInfoT::isEqual(Val, TombstoneKey) &&
"Empty/Tombstone value shouldn't be inserted into map!");
+ unsigned BucketNo = getHashValue(Val) & (NumBuckets-1);
+ unsigned ProbeAmt = 1;
while (1) {
- BucketT *ThisBucket = BucketsPtr + (BucketNo & (NumBuckets-1));
+ const BucketT *ThisBucket = BucketsPtr + BucketNo;
// Found Val's bucket? If so, return it.
if (KeyInfoT::isEqual(Val, ThisBucket->first)) {
FoundBucket = ThisBucket;
@@ -388,115 +486,481 @@ private:
// Otherwise, it's a hash collision or a tombstone, continue quadratic
// probing.
BucketNo += ProbeAmt++;
+ BucketNo &= (NumBuckets-1);
}
}
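
The loop above is quadratic probing over a power-of-two table: the probe distance grows by one each step, and the index wraps with a mask instead of a modulo, which for power-of-two sizes still visits every bucket before repeating. A minimal standalone sketch of the same sequence, with all names hypothetical:

#include <cstdio>

int main() {
  const unsigned NumBuckets = 8;             // must be a power of two
  unsigned BucketNo = 13 & (NumBuckets - 1); // hash value masked into range
  unsigned ProbeAmt = 1;
  for (unsigned i = 0; i != NumBuckets; ++i) {
    std::printf("probe %u -> bucket %u\n", i, BucketNo);
    BucketNo += ProbeAmt++;                  // quadratic (triangular) step
    BucketNo &= (NumBuckets - 1);            // wrap with a mask, as above
  }
  return 0;
}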
- void init(unsigned InitBuckets) {
- NumEntries = 0;
- NumTombstones = 0;
- NumBuckets = InitBuckets;
+ template <typename LookupKeyT>
+ bool LookupBucketFor(const LookupKeyT &Val, BucketT *&FoundBucket) {
+ const BucketT *ConstFoundBucket;
+ bool Result = const_cast<const DenseMapBase *>(this)
+ ->LookupBucketFor(Val, ConstFoundBucket);
+ FoundBucket = const_cast<BucketT *>(ConstFoundBucket);
+ return Result;
+ }
- if (InitBuckets == 0) {
- Buckets = 0;
- return;
+public:
+ /// Return the approximate size (in bytes) of the actual map.
+ /// This is just the raw memory used by DenseMap.
+ /// If entries are pointers to objects, the size of the referenced objects
+ /// are not included.
+ size_t getMemorySize() const {
+ return getNumBuckets() * sizeof(BucketT);
+ }
+};
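
DenseMapBase is a classic CRTP base: it owns all of the lookup and insertion logic but holds no storage, reaching the derived class's fields through static_casts so that the two derived maps below can lay out their buckets differently without any virtual dispatch. A minimal sketch of that dispatch pattern, with hypothetical class names:

#include <cstdio>

template <typename DerivedT> class TableBase {
public:
  unsigned capacity() const {
    // Statically dispatch to whatever storage the derived class defines.
    return static_cast<const DerivedT *>(this)->getNumBuckets();
  }
};

class HeapTable : public TableBase<HeapTable> {
  friend class TableBase<HeapTable>;
  unsigned NumBuckets;
  unsigned getNumBuckets() const { return NumBuckets; }
public:
  HeapTable() : NumBuckets(64) {}
};

int main() {
  HeapTable T;
  std::printf("%u\n", T.capacity()); // prints 64; resolved at compile time
  return 0;
}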
+
+template<typename KeyT, typename ValueT,
+ typename KeyInfoT = DenseMapInfo<KeyT> >
+class DenseMap
+ : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT>,
+ KeyT, ValueT, KeyInfoT> {
+ // Lift some types from the dependent base class into this class for
+ // simplicity of referring to them.
+ typedef DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT> BaseT;
+ typedef typename BaseT::BucketT BucketT;
+ friend class DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT>;
+
+ BucketT *Buckets;
+ unsigned NumEntries;
+ unsigned NumTombstones;
+ unsigned NumBuckets;
+
+public:
+ explicit DenseMap(unsigned NumInitBuckets = 0) {
+ init(NumInitBuckets);
+ }
+
+ DenseMap(const DenseMap &other) {
+ init(0);
+ copyFrom(other);
+ }
+
+#if LLVM_USE_RVALUE_REFERENCES
+ DenseMap(DenseMap &&other) {
+ init(0);
+ swap(other);
+ }
+#endif
+
+ template<typename InputIt>
+ DenseMap(const InputIt &I, const InputIt &E) {
+ init(NextPowerOf2(std::distance(I, E)));
+ this->insert(I, E);
+ }
+
+ ~DenseMap() {
+ this->destroyAll();
+ operator delete(Buckets);
+ }
+
+ void swap(DenseMap& RHS) {
+ std::swap(Buckets, RHS.Buckets);
+ std::swap(NumEntries, RHS.NumEntries);
+ std::swap(NumTombstones, RHS.NumTombstones);
+ std::swap(NumBuckets, RHS.NumBuckets);
+ }
+
+ DenseMap& operator=(const DenseMap& other) {
+ copyFrom(other);
+ return *this;
+ }
+
+#if LLVM_USE_RVALUE_REFERENCES
+ DenseMap& operator=(DenseMap &&other) {
+ this->destroyAll();
+ operator delete(Buckets);
+ init(0);
+ swap(other);
+ return *this;
+ }
+#endif
+
+ void copyFrom(const DenseMap& other) {
+ this->destroyAll();
+ operator delete(Buckets);
+ if (allocateBuckets(other.NumBuckets)) {
+ this->BaseT::copyFrom(other);
+ } else {
+ NumEntries = 0;
+ NumTombstones = 0;
}
+ }
- assert(InitBuckets && (InitBuckets & (InitBuckets-1)) == 0 &&
- "# initial buckets must be a power of two!");
- Buckets = static_cast<BucketT*>(operator new(sizeof(BucketT)*InitBuckets));
- // Initialize all the keys to EmptyKey.
- const KeyT EmptyKey = getEmptyKey();
- for (unsigned i = 0; i != InitBuckets; ++i)
- new (&Buckets[i].first) KeyT(EmptyKey);
+ void init(unsigned InitBuckets) {
+ if (allocateBuckets(InitBuckets)) {
+ this->BaseT::initEmpty();
+ } else {
+ NumEntries = 0;
+ NumTombstones = 0;
+ }
}
void grow(unsigned AtLeast) {
unsigned OldNumBuckets = NumBuckets;
BucketT *OldBuckets = Buckets;
- if (NumBuckets < 64)
- NumBuckets = 64;
+ allocateBuckets(std::max<unsigned>(64, NextPowerOf2(AtLeast)));
+ assert(Buckets);
+ if (!OldBuckets) {
+ this->BaseT::initEmpty();
+ return;
+ }
- // Double the number of buckets.
- while (NumBuckets < AtLeast)
- NumBuckets <<= 1;
- NumTombstones = 0;
- Buckets = static_cast<BucketT*>(operator new(sizeof(BucketT)*NumBuckets));
+ this->moveFromOldBuckets(OldBuckets, OldBuckets+OldNumBuckets);
- // Initialize all the keys to EmptyKey.
- const KeyT EmptyKey = getEmptyKey();
- for (unsigned i = 0, e = NumBuckets; i != e; ++i)
- new (&Buckets[i].first) KeyT(EmptyKey);
+ // Free the old table.
+ operator delete(OldBuckets);
+ }
- // Insert all the old elements.
- const KeyT TombstoneKey = getTombstoneKey();
- for (BucketT *B = OldBuckets, *E = OldBuckets+OldNumBuckets; B != E; ++B) {
- if (!KeyInfoT::isEqual(B->first, EmptyKey) &&
- !KeyInfoT::isEqual(B->first, TombstoneKey)) {
- // Insert the key/value into the new table.
- BucketT *DestBucket;
- bool FoundVal = LookupBucketFor(B->first, DestBucket);
- (void)FoundVal; // silence warning.
- assert(!FoundVal && "Key already in new map?");
- DestBucket->first = B->first;
- new (&DestBucket->second) ValueT(B->second);
+ void shrink_and_clear() {
+ unsigned OldNumEntries = NumEntries;
+ this->destroyAll();
- // Free the value.
- B->second.~ValueT();
- }
- B->first.~KeyT();
+ // Reduce the number of buckets.
+ unsigned NewNumBuckets = 0;
+ if (OldNumEntries)
+ NewNumBuckets = std::max(64, 1 << (Log2_32_Ceil(OldNumEntries) + 1));
+ if (NewNumBuckets == NumBuckets) {
+ this->BaseT::initEmpty();
+ return;
}
-#ifndef NDEBUG
- if (OldNumBuckets)
- memset((void*)OldBuckets, 0x5a, sizeof(BucketT)*OldNumBuckets);
+ operator delete(Buckets);
+ init(NewNumBuckets);
+ }
+
+private:
+ unsigned getNumEntries() const {
+ return NumEntries;
+ }
+ void setNumEntries(unsigned Num) {
+ NumEntries = Num;
+ }
+
+ unsigned getNumTombstones() const {
+ return NumTombstones;
+ }
+ void setNumTombstones(unsigned Num) {
+ NumTombstones = Num;
+ }
+
+ BucketT *getBuckets() const {
+ return Buckets;
+ }
+
+ unsigned getNumBuckets() const {
+ return NumBuckets;
+ }
+
+ bool allocateBuckets(unsigned Num) {
+ NumBuckets = Num;
+ if (NumBuckets == 0) {
+ Buckets = 0;
+ return false;
+ }
+
+ Buckets = static_cast<BucketT*>(operator new(sizeof(BucketT) * NumBuckets));
+ return true;
+ }
+};
+
+template<typename KeyT, typename ValueT,
+ unsigned InlineBuckets = 4,
+ typename KeyInfoT = DenseMapInfo<KeyT> >
+class SmallDenseMap
+ : public DenseMapBase<SmallDenseMap<KeyT, ValueT, InlineBuckets, KeyInfoT>,
+ KeyT, ValueT, KeyInfoT> {
+ // Lift some types from the dependent base class into this class for
+ // simplicity of referring to them.
+ typedef DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT> BaseT;
+ typedef typename BaseT::BucketT BucketT;
+ friend class DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT>;
+
+ unsigned Small : 1;
+ unsigned NumEntries : 31;
+ unsigned NumTombstones;
+
+ struct LargeRep {
+ BucketT *Buckets;
+ unsigned NumBuckets;
+ };
+
+ /// A "union" of an inline bucket array and the struct representing
+ /// a large bucket. This union will be discriminated by the 'Small' bit.
+ typename AlignedCharArray<BucketT[InlineBuckets], LargeRep>::union_type
+ storage;
+
+public:
+ explicit SmallDenseMap(unsigned NumInitBuckets = 0) {
+ init(NumInitBuckets);
+ }
+
+ SmallDenseMap(const SmallDenseMap &other) {
+ init(0);
+ copyFrom(other);
+ }
+
+#if LLVM_USE_RVALUE_REFERENCES
+ SmallDenseMap(SmallDenseMap &&other) {
+ init(0);
+ swap(other);
+ }
#endif
- // Free the old table.
- operator delete(OldBuckets);
+
+ template<typename InputIt>
+ SmallDenseMap(const InputIt &I, const InputIt &E) {
+ init(NextPowerOf2(std::distance(I, E)));
+ this->insert(I, E);
}
- void shrink_and_clear() {
- unsigned OldNumBuckets = NumBuckets;
- BucketT *OldBuckets = Buckets;
+ ~SmallDenseMap() {
+ this->destroyAll();
+ deallocateBuckets();
+ }
- // Reduce the number of buckets.
- NumBuckets = NumEntries > 32 ? 1 << (Log2_32_Ceil(NumEntries) + 1)
- : 64;
- NumTombstones = 0;
- Buckets = static_cast<BucketT*>(operator new(sizeof(BucketT)*NumBuckets));
+ void swap(SmallDenseMap& RHS) {
+ unsigned TmpNumEntries = RHS.NumEntries;
+ RHS.NumEntries = NumEntries;
+ NumEntries = TmpNumEntries;
+ std::swap(NumTombstones, RHS.NumTombstones);
- // Initialize all the keys to EmptyKey.
- const KeyT EmptyKey = getEmptyKey();
- for (unsigned i = 0, e = NumBuckets; i != e; ++i)
- new (&Buckets[i].first) KeyT(EmptyKey);
+ const KeyT EmptyKey = this->getEmptyKey();
+ const KeyT TombstoneKey = this->getTombstoneKey();
+ if (Small && RHS.Small) {
+ // If we're swapping inline bucket arrays, we have to cope with some of
+ // the tricky bits of DenseMap's storage system: the buckets are not
+ // fully initialized. Thus we swap every key, but we may have
+ // a one-directional move of the value.
+ for (unsigned i = 0, e = InlineBuckets; i != e; ++i) {
+ BucketT *LHSB = &getInlineBuckets()[i],
+ *RHSB = &RHS.getInlineBuckets()[i];
+ bool hasLHSValue = (!KeyInfoT::isEqual(LHSB->first, EmptyKey) &&
+ !KeyInfoT::isEqual(LHSB->first, TombstoneKey));
+ bool hasRHSValue = (!KeyInfoT::isEqual(RHSB->first, EmptyKey) &&
+ !KeyInfoT::isEqual(RHSB->first, TombstoneKey));
+ if (hasLHSValue && hasRHSValue) {
+ // Swap together if we can...
+ std::swap(*LHSB, *RHSB);
+ continue;
+ }
+ // Swap separately and handle any asymmetry.
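+ // (Only the side that has a live value move-constructs into the other.)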
+ std::swap(LHSB->first, RHSB->first);
+ if (hasLHSValue) {
+ new (&RHSB->second) ValueT(llvm_move(LHSB->second));
+ LHSB->second.~ValueT();
+ } else if (hasRHSValue) {
+ new (&LHSB->second) ValueT(llvm_move(RHSB->second));
+ RHSB->second.~ValueT();
+ }
+ }
+ return;
+ }
+ if (!Small && !RHS.Small) {
+ std::swap(getLargeRep()->Buckets, RHS.getLargeRep()->Buckets);
+ std::swap(getLargeRep()->NumBuckets, RHS.getLargeRep()->NumBuckets);
+ return;
+ }
- // Free the old buckets.
- const KeyT TombstoneKey = getTombstoneKey();
- for (BucketT *B = OldBuckets, *E = OldBuckets+OldNumBuckets; B != E; ++B) {
- if (!KeyInfoT::isEqual(B->first, EmptyKey) &&
- !KeyInfoT::isEqual(B->first, TombstoneKey)) {
- // Free the value.
- B->second.~ValueT();
+ SmallDenseMap &SmallSide = Small ? *this : RHS;
+ SmallDenseMap &LargeSide = Small ? RHS : *this;
+
+ // First stash the large side's rep and move the small side across.
+ LargeRep TmpRep = llvm_move(*LargeSide.getLargeRep());
+ LargeSide.getLargeRep()->~LargeRep();
+ LargeSide.Small = true;
+ // This is similar to the standard move-from-old-buckets, but the bucket
+ // count hasn't actually rotated in this case. So we have to carefully
+ // move construct the keys and values into their new locations, but there
+ // is no need to re-hash things.
+ for (unsigned i = 0, e = InlineBuckets; i != e; ++i) {
+ BucketT *NewB = &LargeSide.getInlineBuckets()[i],
+ *OldB = &SmallSide.getInlineBuckets()[i];
+ new (&NewB->first) KeyT(llvm_move(OldB->first));
+ OldB->first.~KeyT();
+ if (!KeyInfoT::isEqual(NewB->first, EmptyKey) &&
+ !KeyInfoT::isEqual(NewB->first, TombstoneKey)) {
+ new (&NewB->second) ValueT(llvm_move(OldB->second));
+ OldB->second.~ValueT();
}
- B->first.~KeyT();
}
-#ifndef NDEBUG
- memset((void*)OldBuckets, 0x5a, sizeof(BucketT)*OldNumBuckets);
+ // The hard part of moving the small buckets across is done, just move
+ // the TmpRep into its new home.
+ SmallSide.Small = false;
+ new (SmallSide.getLargeRep()) LargeRep(llvm_move(TmpRep));
+ }
+
+ SmallDenseMap& operator=(const SmallDenseMap& other) {
+ copyFrom(other);
+ return *this;
+ }
+
+#if LLVM_USE_RVALUE_REFERENCES
+ SmallDenseMap& operator=(SmallDenseMap &&other) {
+ this->destroyAll();
+ deallocateBuckets();
+ init(0);
+ swap(other);
+ return *this;
+ }
#endif
+
+ void copyFrom(const SmallDenseMap& other) {
+ this->destroyAll();
+ deallocateBuckets();
+ Small = true;
+ if (other.getNumBuckets() > InlineBuckets) {
+ Small = false;
+ allocateBuckets(other.getNumBuckets());
+ }
+ this->BaseT::copyFrom(other);
+ }
+
+ void init(unsigned InitBuckets) {
+ Small = true;
+ if (InitBuckets > InlineBuckets) {
+ Small = false;
+ new (getLargeRep()) LargeRep(allocateBuckets(InitBuckets));
+ }
+ this->BaseT::initEmpty();
+ }
+
+ void grow(unsigned AtLeast) {
+ if (AtLeast > InlineBuckets)
+ AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast));
+
+ if (Small) {
+ if (AtLeast <= InlineBuckets)
+ return; // Nothing to do.
+
+ // First move the inline buckets into a temporary storage.
+ typename AlignedCharArray<BucketT[InlineBuckets]>::union_type
+ TmpStorage;
+ BucketT *TmpBegin = reinterpret_cast<BucketT *>(TmpStorage.buffer);
+ BucketT *TmpEnd = TmpBegin;
+
+ // Loop over the buckets, moving non-empty, non-tombstones into the
+ // temporary storage. Have the loop move the TmpEnd forward as it goes.
+ const KeyT EmptyKey = this->getEmptyKey();
+ const KeyT TombstoneKey = this->getTombstoneKey();
+ for (BucketT *P = getBuckets(), *E = P + InlineBuckets; P != E; ++P) {
+ if (!KeyInfoT::isEqual(P->first, EmptyKey) &&
+ !KeyInfoT::isEqual(P->first, TombstoneKey)) {
+ assert(size_t(TmpEnd - TmpBegin) < InlineBuckets &&
+ "Too many inline buckets!");
+ new (&TmpEnd->first) KeyT(llvm_move(P->first));
+ new (&TmpEnd->second) ValueT(llvm_move(P->second));
+ ++TmpEnd;
+ P->second.~ValueT();
+ }
+ P->first.~KeyT();
+ }
+
+ // Now make this map use the large rep, and move all the entries back
+ // into it.
+ Small = false;
+ new (getLargeRep()) LargeRep(allocateBuckets(AtLeast));
+ this->moveFromOldBuckets(TmpBegin, TmpEnd);
+ return;
+ }
+
+ LargeRep OldRep = llvm_move(*getLargeRep());
+ getLargeRep()->~LargeRep();
+ if (AtLeast <= InlineBuckets) {
+ Small = true;
+ } else {
+ new (getLargeRep()) LargeRep(allocateBuckets(AtLeast));
+ }
+
+ this->moveFromOldBuckets(OldRep.Buckets, OldRep.Buckets+OldRep.NumBuckets);
+
// Free the old table.
- operator delete(OldBuckets);
+ operator delete(OldRep.Buckets);
+ }
+
+ void shrink_and_clear() {
+ unsigned OldSize = this->size();
+ this->destroyAll();
+
+ // Reduce the number of buckets.
+ unsigned NewNumBuckets = 0;
+ if (OldSize) {
+ NewNumBuckets = 1 << (Log2_32_Ceil(OldSize) + 1);
+ if (NewNumBuckets > InlineBuckets && NewNumBuckets < 64u)
+ NewNumBuckets = 64;
+ }
+ if ((Small && NewNumBuckets <= InlineBuckets) ||
+ (!Small && NewNumBuckets == getLargeRep()->NumBuckets)) {
+ this->BaseT::initEmpty();
+ return;
+ }
- NumEntries = 0;
+ deallocateBuckets();
+ init(NewNumBuckets);
}
-
-public:
- /// Return the approximate size (in bytes) of the actual map.
- /// This is just the raw memory used by DenseMap.
- /// If entries are pointers to objects, the size of the referenced objects
- /// are not included.
- size_t getMemorySize() const {
- return NumBuckets * sizeof(BucketT);
+
+private:
+ unsigned getNumEntries() const {
+ return NumEntries;
+ }
+ void setNumEntries(unsigned Num) {
+ assert(Num < INT_MAX && "Cannot support more than INT_MAX entries");
+ NumEntries = Num;
+ }
+
+ unsigned getNumTombstones() const {
+ return NumTombstones;
+ }
+ void setNumTombstones(unsigned Num) {
+ NumTombstones = Num;
+ }
+
+ const BucketT *getInlineBuckets() const {
+ assert(Small);
+ // Note that this cast does not violate aliasing rules as we assert that
+ // the memory's dynamic type is the small, inline bucket buffer, and the
+ // 'storage.buffer' static type is 'char *'.
+ return reinterpret_cast<const BucketT *>(storage.buffer);
+ }
+ BucketT *getInlineBuckets() {
+ return const_cast<BucketT *>(
+ const_cast<const SmallDenseMap *>(this)->getInlineBuckets());
+ }
+ const LargeRep *getLargeRep() const {
+ assert(!Small);
+ // Note, same rule about aliasing as with getInlineBuckets.
+ return reinterpret_cast<const LargeRep *>(storage.buffer);
+ }
+ LargeRep *getLargeRep() {
+ return const_cast<LargeRep *>(
+ const_cast<const SmallDenseMap *>(this)->getLargeRep());
+ }
+
+ const BucketT *getBuckets() const {
+ return Small ? getInlineBuckets() : getLargeRep()->Buckets;
+ }
+ BucketT *getBuckets() {
+ return const_cast<BucketT *>(
+ const_cast<const SmallDenseMap *>(this)->getBuckets());
+ }
+ unsigned getNumBuckets() const {
+ return Small ? InlineBuckets : getLargeRep()->NumBuckets;
+ }
+
+ void deallocateBuckets() {
+ if (Small)
+ return;
+
+ operator delete(getLargeRep()->Buckets);
+ getLargeRep()->~LargeRep();
+ }
+
+ LargeRep allocateBuckets(unsigned Num) {
+ assert(Num > InlineBuckets && "Must allocate more buckets than are inline");
+ LargeRep Rep = {
+ static_cast<BucketT*>(operator new(sizeof(BucketT) * Num)), Num
+ };
+ return Rep;
}
};
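
Taken together, SmallDenseMap offers the usual DenseMap interface but keeps its first InlineBuckets buckets inside the object itself, only switching to the heap-allocated LargeRep once growth forces it. A usage sketch, assuming the header above is on the include path:

#include "llvm/ADT/DenseMap.h"

void example() {
  llvm::SmallDenseMap<int, int, 4> M; // 4 buckets stored inline in M
  M[1] = 10;                          // fits inline, no heap allocation
  M[2] = 20;
  for (int i = 3; i < 32; ++i)        // pushing past 3/4 load grows the
    M[i] = i * 10;                    // map into a heap-backed LargeRep
}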
diff --git a/include/llvm/ADT/DepthFirstIterator.h b/include/llvm/ADT/DepthFirstIterator.h
index dd13a2c..519b180 100644
--- a/include/llvm/ADT/DepthFirstIterator.h
+++ b/include/llvm/ADT/DepthFirstIterator.h
@@ -187,7 +187,7 @@ public:
/// current node, counting both nodes.
unsigned getPathLength() const { return VisitStack.size(); }
- /// getPath - Return the n'th node in the path from the the entry node to the
+ /// getPath - Return the n'th node in the path from the entry node to the
/// current node.
NodeType *getPath(unsigned n) const {
return VisitStack[n].first.getPointer();
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index 7d7c777..ba415ac 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -518,6 +518,111 @@ public:
};
//===----------------------------------------------------------------------===//
+/// FoldingSetVectorIterator - This implements an iterator for
+/// FoldingSetVector. It is only necessary because FoldingSetIterator provides
+/// a value_type of T, while the vector in FoldingSetVector exposes
+/// a value_type of T*. Fortunately, FoldingSetIterator doesn't expose very
+/// much besides operator* and operator->, so we just wrap the inner vector
+/// iterator and perform the extra dereference.
+template <class T, class VectorIteratorT>
+class FoldingSetVectorIterator {
+ // Provide a typedef to workaround the lack of correct injected class name
+ // support in older GCCs.
+ typedef FoldingSetVectorIterator<T, VectorIteratorT> SelfT;
+
+ VectorIteratorT Iterator;
+
+public:
+ FoldingSetVectorIterator(VectorIteratorT I) : Iterator(I) {}
+
+ bool operator==(const SelfT &RHS) const {
+ return Iterator == RHS.Iterator;
+ }
+ bool operator!=(const SelfT &RHS) const {
+ return Iterator != RHS.Iterator;
+ }
+
+ T &operator*() const { return **Iterator; }
+
+ T *operator->() const { return *Iterator; }
+
+ inline SelfT &operator++() {
+ ++Iterator;
+ return *this;
+ }
+ SelfT operator++(int) {
+ SelfT tmp = *this;
+ ++*this;
+ return tmp;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// FoldingSetVector - This template class combines a FoldingSet and a vector
+/// to provide the interface of FoldingSet but with deterministic iteration
+/// order based on the insertion order. T must be a subclass of FoldingSetNode
+/// and implement a Profile function.
+template <class T, class VectorT = SmallVector<T*, 8> >
+class FoldingSetVector {
+ FoldingSet<T> Set;
+ VectorT Vector;
+
+public:
+ explicit FoldingSetVector(unsigned Log2InitSize = 6)
+ : Set(Log2InitSize) {
+ }
+
+ typedef FoldingSetVectorIterator<T, typename VectorT::iterator> iterator;
+ iterator begin() { return Vector.begin(); }
+ iterator end() { return Vector.end(); }
+
+ typedef FoldingSetVectorIterator<const T, typename VectorT::const_iterator>
+ const_iterator;
+ const_iterator begin() const { return Vector.begin(); }
+ const_iterator end() const { return Vector.end(); }
+
+ /// clear - Remove all nodes from the folding set.
+ void clear() { Set.clear(); Vector.clear(); }
+
+ /// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
+ /// return it. If not, return the insertion token that will make insertion
+ /// faster.
+ T *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) {
+ return Set.FindNodeOrInsertPos(ID, InsertPos);
+ }
+
+ /// GetOrInsertNode - If there is an existing simple Node exactly
+ /// equal to the specified node, return it. Otherwise, insert 'N' and
+ /// return it instead.
+ T *GetOrInsertNode(T *N) {
+ T *Result = Set.GetOrInsertNode(N);
+ if (Result == N) Vector.push_back(N);
+ return Result;
+ }
+
+ /// InsertNode - Insert the specified node into the folding set, knowing that
+ /// it is not already in the folding set. InsertPos must be obtained from
+ /// FindNodeOrInsertPos.
+ void InsertNode(T *N, void *InsertPos) {
+ Set.InsertNode(N, InsertPos);
+ Vector.push_back(N);
+ }
+
+ /// InsertNode - Insert the specified node into the folding set, knowing that
+ /// it is not already in the folding set.
+ void InsertNode(T *N) {
+ Set.InsertNode(N);
+ Vector.push_back(N);
+ }
+
+ /// size - Returns the number of nodes in the folding set.
+ unsigned size() const { return Set.size(); }
+
+ /// empty - Returns true if there are no nodes in the folding set.
+ bool empty() const { return Set.empty(); }
+};
+
+//===----------------------------------------------------------------------===//
/// FoldingSetIteratorImpl - This is the common iterator support shared by all
/// folding sets, which knows how to walk the folding set hash table.
class FoldingSetIteratorImpl {
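
A FoldingSetVector client looks just like a FoldingSet client: nodes derive from FoldingSetNode and describe themselves via Profile(), and uniquing goes through GetOrInsertNode(); the only visible difference is insertion-ordered iteration. A sketch, assuming the usual FoldingSetNode/Profile protocol:

#include "llvm/ADT/FoldingSet.h"

struct MyNode : llvm::FoldingSetNode {
  int Value;
  explicit MyNode(int V) : Value(V) {}
  // Structurally equal nodes profile alike and therefore get uniqued.
  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Value); }
};

void example() {
  llvm::FoldingSetVector<MyNode> Uniqued;
  MyNode *A = new MyNode(42);
  MyNode *B = Uniqued.GetOrInsertNode(A); // B == A on first insertion
  (void)B;
  // Iterating Uniqued now visits nodes in insertion order, not hash order.
}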
diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h
index 53032ee..6ab0725 100644
--- a/include/llvm/ADT/Hashing.h
+++ b/include/llvm/ADT/Hashing.h
@@ -76,10 +76,6 @@ namespace llvm {
/// using llvm::hash_value;
/// llvm::hash_code code = hash_value(x);
/// \endcode
-///
-/// Also note that there are two numerical values which are reserved, and the
-/// implementation ensures will never be produced for real hash_codes. These
-/// can be used as sentinels within hashing data structures.
class hash_code {
size_t value;
diff --git a/include/llvm/ADT/ImmutableSet.h b/include/llvm/ADT/ImmutableSet.h
index 89b1648..949dc44 100644
--- a/include/llvm/ADT/ImmutableSet.h
+++ b/include/llvm/ADT/ImmutableSet.h
@@ -431,7 +431,7 @@ protected:
// Make sure the index is not the Tombstone or Entry key of the DenseMap.
static inline unsigned maskCacheIndex(unsigned I) {
- return (I & ~0x02);
+ return (I & ~0x02);
}
unsigned incrementHeight(TreeTy* L, TreeTy* R) const {
@@ -667,7 +667,7 @@ public:
return reinterpret_cast<TreeTy*>(stack.back() & ~Flags);
}
- uintptr_t getVisitState() {
+ uintptr_t getVisitState() const {
assert(!stack.empty());
return stack.back() & Flags;
}
diff --git a/include/llvm/ADT/IndexedMap.h b/include/llvm/ADT/IndexedMap.h
index 87126ea..2ffb505 100644
--- a/include/llvm/ADT/IndexedMap.h
+++ b/include/llvm/ADT/IndexedMap.h
@@ -20,19 +20,14 @@
#ifndef LLVM_ADT_INDEXEDMAP_H
#define LLVM_ADT_INDEXEDMAP_H
+#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <functional>
#include <vector>
namespace llvm {
- struct IdentityFunctor : public std::unary_function<unsigned, unsigned> {
- unsigned operator()(unsigned Index) const {
- return Index;
- }
- };
-
- template <typename T, typename ToIndexT = IdentityFunctor>
+template <typename T, typename ToIndexT = llvm::identity<unsigned> >
class IndexedMap {
typedef typename ToIndexT::argument_type IndexT;
typedef std::vector<T> StorageT;
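
With llvm::identity as the default ToIndexT, IndexedMap keys are used directly as vector indices. A sketch, assuming IndexedMap's usual grow()/operator[] interface (not shown in this hunk):

#include "llvm/ADT/IndexedMap.h"

void example() {
  llvm::IndexedMap<const char *> Names; // unsigned keys via llvm::identity
  Names.grow(10);                       // make indices 0..10 addressable
  Names[3] = "third";                   // plain vector indexing underneath
}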
diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h
index 3a1a3f4..a9724ee 100644
--- a/include/llvm/ADT/IntrusiveRefCntPtr.h
+++ b/include/llvm/ADT/IntrusiveRefCntPtr.h
@@ -21,9 +21,9 @@
#ifndef LLVM_ADT_INTRUSIVE_REF_CNT_PTR
#define LLVM_ADT_INTRUSIVE_REF_CNT_PTR
-#include <cassert>
-
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include <memory>
namespace llvm {
@@ -34,7 +34,7 @@ namespace llvm {
/// RefCountedBase - A generic base class for objects that wish to
/// have their lifetimes managed using reference counts. Classes
/// subclass RefCountedBase to obtain such functionality, and are
-/// typically handled with IntrusivePtr "smart pointers" (see below)
+/// typically handled with IntrusiveRefCntPtr "smart pointers" (see below)
/// which automatically handle the management of reference counts.
/// Objects that subclass RefCountedBase should not be allocated on
/// the stack, as invoking "delete" (which is called when the
@@ -123,25 +123,25 @@ namespace llvm {
retain();
}
- template <class X>
- IntrusiveRefCntPtr(const IntrusiveRefCntPtr<X>& S)
- : Obj(S.getPtr()) {
- retain();
+#if LLVM_USE_RVALUE_REFERENCES
+ IntrusiveRefCntPtr(IntrusiveRefCntPtr&& S) : Obj(S.Obj) {
+ S.Obj = 0;
}
- IntrusiveRefCntPtr& operator=(const IntrusiveRefCntPtr& S) {
- replace(S.getPtr());
- return *this;
+ template <class X>
+ IntrusiveRefCntPtr(IntrusiveRefCntPtr<X>&& S) : Obj(S.getPtr()) {
+ S.Obj = 0;
}
+#endif
template <class X>
- IntrusiveRefCntPtr& operator=(const IntrusiveRefCntPtr<X>& S) {
- replace(S.getPtr());
- return *this;
+ IntrusiveRefCntPtr(const IntrusiveRefCntPtr<X>& S)
+ : Obj(S.getPtr()) {
+ retain();
}
- IntrusiveRefCntPtr& operator=(T * S) {
- replace(S);
+ IntrusiveRefCntPtr& operator=(IntrusiveRefCntPtr S) {
+ swap(S);
return *this;
}
@@ -176,10 +176,6 @@ namespace llvm {
private:
void retain() { if (Obj) IntrusiveRefCntPtrInfo<T>::retain(Obj); }
void release() { if (Obj) IntrusiveRefCntPtrInfo<T>::release(Obj); }
-
- void replace(T* S) {
- this_type(S).swap(*this);
- }
};
template<class T, class U>
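
The rewritten assignment operator takes its argument by value and swaps with it, which folds copy-assignment, move-assignment, and self-assignment into a single exception-safe operator. A minimal sketch of the idiom on a hypothetical Handle class:

#include <algorithm>

class Handle {
  int *Obj;
public:
  Handle(int *O = 0) : Obj(O) {}
  Handle(const Handle &S) : Obj(S.Obj ? new int(*S.Obj) : 0) {}
  ~Handle() { delete Obj; }
  void swap(Handle &S) { std::swap(Obj, S.Obj); }
  // The copy (or move) happens at the call site when S is constructed;
  // we steal its guts and let the temporary destroy our old state.
  Handle &operator=(Handle S) {
    swap(S);
    return *this;
  }
};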
diff --git a/include/llvm/ADT/PointerIntPair.h b/include/llvm/ADT/PointerIntPair.h
index ccdcd1a..fcc758b 100644
--- a/include/llvm/ADT/PointerIntPair.h
+++ b/include/llvm/ADT/PointerIntPair.h
@@ -108,7 +108,14 @@ public:
static PointerIntPair getFromOpaqueValue(void *V) {
PointerIntPair P; P.setFromOpaqueValue(V); return P;
}
-
+
+ // Allow PointerIntPairs to be created from const void * if and only if the
+ // pointer type could be created from a const void *.
+ static PointerIntPair getFromOpaqueValue(const void *V) {
+ (void)PtrTraits::getFromVoidPointer(V);
+ return getFromOpaqueValue(const_cast<void *>(V));
+ }
+
bool operator==(const PointerIntPair &RHS) const {return Value == RHS.Value;}
bool operator!=(const PointerIntPair &RHS) const {return Value != RHS.Value;}
bool operator<(const PointerIntPair &RHS) const {return Value < RHS.Value;}
@@ -158,6 +165,10 @@ public:
getFromVoidPointer(void *P) {
return PointerIntPair<PointerTy, IntBits, IntType>::getFromOpaqueValue(P);
}
+ static inline PointerIntPair<PointerTy, IntBits, IntType>
+ getFromVoidPointer(const void *P) {
+ return PointerIntPair<PointerTy, IntBits, IntType>::getFromOpaqueValue(P);
+ }
enum {
NumLowBitsAvailable = PtrTraits::NumLowBitsAvailable - IntBits
};
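
PointerIntPair stores the integer in the low, alignment-guaranteed-zero bits of the pointer, so the whole pair round-trips through a single opaque void*; the new overload merely extends that round trip to const void* where the pointee type permits. A sketch, assuming PointerIntPair's usual constructor and accessors:

#include "llvm/ADT/PointerIntPair.h"

void example(int *P) {
  llvm::PointerIntPair<int *, 1, bool> Pair(P, true);
  void *Opaque = Pair.getOpaqueValue(); // pointer and flag in one word
  llvm::PointerIntPair<int *, 1, bool> Copy =
      llvm::PointerIntPair<int *, 1, bool>::getFromOpaqueValue(Opaque);
  (void)Copy; // Copy.getPointer() == P and Copy.getInt() == true
}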
diff --git a/include/llvm/ADT/PointerUnion.h b/include/llvm/ADT/PointerUnion.h
index 614b59c..a9e86d2 100644
--- a/include/llvm/ADT/PointerUnion.h
+++ b/include/llvm/ADT/PointerUnion.h
@@ -54,8 +54,8 @@ namespace llvm {
static inline void *getAsVoidPointer(void *P) { return P; }
static inline void *getFromVoidPointer(void *P) { return P; }
enum {
- PT1BitsAv = PointerLikeTypeTraits<PT1>::NumLowBitsAvailable,
- PT2BitsAv = PointerLikeTypeTraits<PT2>::NumLowBitsAvailable,
+ PT1BitsAv = (int)(PointerLikeTypeTraits<PT1>::NumLowBitsAvailable),
+ PT2BitsAv = (int)(PointerLikeTypeTraits<PT2>::NumLowBitsAvailable),
NumLowBitsAvailable = PT1BitsAv < PT2BitsAv ? PT1BitsAv : PT2BitsAv
};
};
diff --git a/include/llvm/ADT/PostOrderIterator.h b/include/llvm/ADT/PostOrderIterator.h
index 63a2b52..7f6350e 100644
--- a/include/llvm/ADT/PostOrderIterator.h
+++ b/include/llvm/ADT/PostOrderIterator.h
@@ -23,26 +23,65 @@
namespace llvm {
-template<class SetType, bool External> // Non-external set
+// The po_iterator_storage template provides access to the set of already
+// visited nodes during the po_iterator's depth-first traversal.
+//
+// The default implementation simply contains a set of visited nodes, while
+// the External=true version uses a reference to an external set.
+//
+// It is possible to prune the depth-first traversal in several ways:
+//
+// - When providing an external set that already contains some graph nodes,
+// those nodes won't be visited again. This is useful for restarting a
+// post-order traversal on a graph with nodes that aren't dominated by a
+// single node.
+//
+// - By providing a custom SetType class, unwanted graph nodes can be excluded
+// by having the insert() function return false. This could for example
+// confine a CFG traversal to blocks in a specific loop.
+//
+// - Finally, by specializing the po_iterator_storage template itself, graph
+// edges can be pruned by returning false in the insertEdge() function. This
+// could be used to remove loop back-edges from the CFG seen by po_iterator.
+//
+// A specialized po_iterator_storage class can observe both the pre-order and
+// the post-order. The insertEdge() function is called in a pre-order, while
+// the finishPostorder() function is called just before the po_iterator moves
+// on to the next node.
+
+/// Default po_iterator_storage implementation with an internal set object.
+template<class SetType, bool External>
class po_iterator_storage {
-public:
SetType Visited;
-};
+public:
+ // Return true if edge destination should be visited.
+ template<typename NodeType>
+ bool insertEdge(NodeType *From, NodeType *To) {
+ return Visited.insert(To);
+ }
-/// DFSetTraits - Allow the SetType used to record depth-first search results to
-/// optionally record node postorder.
-template<class SetType>
-struct DFSetTraits {
- static void finishPostorder(
- typename SetType::iterator::value_type, SetType &) {}
+ // Called after all children of BB have been visited.
+ template<typename NodeType>
+ void finishPostorder(NodeType *BB) {}
};
+/// Specialization of po_iterator_storage that references an external set.
template<class SetType>
class po_iterator_storage<SetType, true> {
+ SetType &Visited;
public:
po_iterator_storage(SetType &VSet) : Visited(VSet) {}
po_iterator_storage(const po_iterator_storage &S) : Visited(S.Visited) {}
- SetType &Visited;
+
+ // Return true if edge destination should be visited, called with From = 0 for
+ // the root node.
+ // Graph edges can be pruned by specializing this function.
+ template<class NodeType>
+ bool insertEdge(NodeType *From, NodeType *To) { return Visited.insert(To); }
+
+ // Called after all children of BB have been visited.
+ template<class NodeType>
+ void finishPostorder(NodeType *BB) {}
};
template<class GraphT,
@@ -64,14 +103,15 @@ class po_iterator : public std::iterator<std::forward_iterator_tag,
void traverseChild() {
while (VisitStack.back().second != GT::child_end(VisitStack.back().first)) {
NodeType *BB = *VisitStack.back().second++;
- if (this->Visited.insert(BB)) { // If the block is not visited...
+ if (this->insertEdge(VisitStack.back().first, BB)) {
+ // If the block is not visited...
VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB)));
}
}
}
inline po_iterator(NodeType *BB) {
- this->Visited.insert(BB);
+ this->insertEdge((NodeType*)0, BB);
VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB)));
traverseChild();
}
@@ -79,7 +119,7 @@ class po_iterator : public std::iterator<std::forward_iterator_tag,
inline po_iterator(NodeType *BB, SetType &S) :
po_iterator_storage<SetType, ExtStorage>(S) {
- if (this->Visited.insert(BB)) {
+ if (this->insertEdge((NodeType*)0, BB)) {
VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB)));
traverseChild();
}
@@ -117,8 +157,7 @@ public:
inline NodeType *operator->() const { return operator*(); }
inline _Self& operator++() { // Preincrement
- DFSetTraits<SetType>::finishPostorder(VisitStack.back().first,
- this->Visited);
+ this->finishPostorder(VisitStack.back().first);
VisitStack.pop_back();
if (!VisitStack.empty())
traverseChild();
@@ -173,14 +212,14 @@ ipo_iterator<T> ipo_end(T G){
return ipo_iterator<T>::end(G);
}
-//Provide global definitions of external inverse postorder iterators...
+// Provide global definitions of external inverse postorder iterators...
template <class T,
class SetType = std::set<typename GraphTraits<T>::NodeType*> >
struct ipo_ext_iterator : public ipo_iterator<T, SetType, true> {
ipo_ext_iterator(const ipo_iterator<T, SetType, true> &V) :
- ipo_iterator<T, SetType, true>(&V) {}
+ ipo_iterator<T, SetType, true>(V) {}
ipo_ext_iterator(const po_iterator<Inverse<T>, SetType, true> &V) :
- ipo_iterator<T, SetType, true>(&V) {}
+ ipo_iterator<T, SetType, true>(V) {}
};
template <class T, class SetType>
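
As the comment block above describes, the second pruning mechanism needs nothing more than a SetType whose insert() returns false for nodes the traversal should skip. A sketch of such a set, with BlockT and the region predicate purely hypothetical:

#include <set>

template <typename BlockT> class RegionSet {
  std::set<BlockT *> Visited;
  bool (*InRegion)(BlockT *);
public:
  explicit RegionSet(bool (*Pred)(BlockT *)) : InRegion(Pred) {}
  // po_iterator only calls insert(); reporting "already seen" for
  // out-of-region blocks keeps their subtrees out of the traversal.
  bool insert(BlockT *B) {
    if (!InRegion(B))
      return false;
    return Visited.insert(B).second;
  }
  size_t count(BlockT *B) const { return Visited.count(B); }
};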
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 5da906d..aee500d 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -30,6 +30,16 @@ namespace llvm {
//===----------------------------------------------------------------------===//
template<class Ty>
+struct identity : public std::unary_function<Ty, Ty> {
+ Ty &operator()(Ty &self) const {
+ return self;
+ }
+ const Ty &operator()(const Ty &self) const {
+ return self;
+ }
+};
+
+template<class Ty>
struct less_ptr : public std::binary_function<Ty, Ty, bool> {
bool operator()(const Ty* left, const Ty* right) const {
return *left < *right;
@@ -49,7 +59,7 @@ struct greater_ptr : public std::binary_function<Ty, Ty, bool> {
// for_each(V.begin(), V.end(), deleter<Interval>);
//
template <class T>
-static inline void deleter(T *Ptr) {
+inline void deleter(T *Ptr) {
delete Ptr;
}
@@ -228,7 +238,7 @@ inline size_t array_lengthof(T (&)[N]) {
/// array_pod_sort_comparator - This is helper function for array_pod_sort,
/// which just uses operator< on T.
template<typename T>
-static inline int array_pod_sort_comparator(const void *P1, const void *P2) {
+inline int array_pod_sort_comparator(const void *P1, const void *P2) {
if (*reinterpret_cast<const T*>(P1) < *reinterpret_cast<const T*>(P2))
return -1;
if (*reinterpret_cast<const T*>(P2) < *reinterpret_cast<const T*>(P1))
@@ -239,7 +249,7 @@ static inline int array_pod_sort_comparator(const void *P1, const void *P2) {
/// get_array_pad_sort_comparator - This is an internal helper function used to
/// get type deduction of T right.
template<typename T>
-static int (*get_array_pad_sort_comparator(const T &))
+inline int (*get_array_pad_sort_comparator(const T &))
(const void*, const void*) {
return array_pod_sort_comparator<T>;
}
@@ -260,7 +270,7 @@ static int (*get_array_pad_sort_comparator(const T &))
/// NOTE: If qsort_r were portable, we could allow a custom comparator and
/// default to std::less.
template<class IteratorTy>
-static inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
+inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
// Don't dereference start iterator of empty sequence.
if (Start == End) return;
qsort(&*Start, End-Start, sizeof(*Start),
@@ -268,13 +278,13 @@ static inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
}
template<class IteratorTy>
-static inline void array_pod_sort(IteratorTy Start, IteratorTy End,
+inline void array_pod_sort(IteratorTy Start, IteratorTy End,
int (*Compare)(const void*, const void*)) {
// Don't dereference start iterator of empty sequence.
if (Start == End) return;
qsort(&*Start, End-Start, sizeof(*Start), Compare);
}
-
+
//===----------------------------------------------------------------------===//
// Extra additions to <algorithm>
//===----------------------------------------------------------------------===//
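
array_pod_sort sorts with plain qsort using the operator<-based comparator above, which keeps code size down at some cost in flexibility; it is only safe for POD-like elements. A usage sketch, assuming the header above:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

void example() {
  llvm::SmallVector<int, 8> V;
  V.push_back(3);
  V.push_back(1);
  V.push_back(2);
  llvm::array_pod_sort(V.begin(), V.end()); // qsort underneath; V is 1, 2, 3
}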
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index a3469a1..7a645e0 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -15,6 +15,7 @@
#define LLVM_ADT_SMALLBITVECTOR_H
#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
@@ -152,6 +153,12 @@ public:
switchToLarge(new BitVector(*RHS.getPointer()));
}
+#if LLVM_USE_RVALUE_REFERENCES
+ SmallBitVector(SmallBitVector &&RHS) : X(RHS.X) {
+ RHS.X = 1;
+ }
+#endif
+
~SmallBitVector() {
if (!isSmall())
delete getPointer();
@@ -347,6 +354,19 @@ public:
return (*this)[Idx];
}
+ /// Test if any common bits are set.
+ bool anyCommon(const SmallBitVector &RHS) const {
+ if (isSmall() && RHS.isSmall())
+ return (getSmallBits() & RHS.getSmallBits()) != 0;
+ if (!isSmall() && !RHS.isSmall())
+ return getPointer()->anyCommon(*RHS.getPointer());
+
+ for (unsigned i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
+ if (test(i) && RHS.test(i))
+ return true;
+ return false;
+ }
+
// Comparison operators.
bool operator==(const SmallBitVector &RHS) const {
if (size() != RHS.size())
@@ -422,9 +442,72 @@ public:
return *this;
}
+#if LLVM_USE_RVALUE_REFERENCES
+ const SmallBitVector &operator=(SmallBitVector &&RHS) {
+ if (this != &RHS) {
+ clear();
+ swap(RHS);
+ }
+ return *this;
+ }
+#endif
+
void swap(SmallBitVector &RHS) {
std::swap(X, RHS.X);
}
+
+ /// setBitsInMask - Add '1' bits from Mask to this vector. Don't resize.
+ /// This computes "*this |= Mask".
+ void setBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
+ if (isSmall())
+ applyMask<true, false>(Mask, MaskWords);
+ else
+ getPointer()->setBitsInMask(Mask, MaskWords);
+ }
+
+ /// clearBitsInMask - Clear any bits in this vector that are set in Mask.
+ /// Don't resize. This computes "*this &= ~Mask".
+ void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
+ if (isSmall())
+ applyMask<false, false>(Mask, MaskWords);
+ else
+ getPointer()->clearBitsInMask(Mask, MaskWords);
+ }
+
+ /// setBitsNotInMask - Add a bit to this vector for every '0' bit in Mask.
+ /// Don't resize. This computes "*this |= ~Mask".
+ void setBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
+ if (isSmall())
+ applyMask<true, true>(Mask, MaskWords);
+ else
+ getPointer()->setBitsNotInMask(Mask, MaskWords);
+ }
+
+ /// clearBitsNotInMask - Clear a bit in this vector for every '0' bit in Mask.
+ /// Don't resize. This computes "*this &= Mask".
+ void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
+ if (isSmall())
+ applyMask<false, true>(Mask, MaskWords);
+ else
+ getPointer()->clearBitsNotInMask(Mask, MaskWords);
+ }
+
+private:
+ template<bool AddBits, bool InvertMask>
+ void applyMask(const uint32_t *Mask, unsigned MaskWords) {
+ assert((NumBaseBits == 64 || NumBaseBits == 32) && "Unsupported word size");
+ if (NumBaseBits == 64 && MaskWords >= 2) {
+ uint64_t M = Mask[0] | (uint64_t(Mask[1]) << 32);
+ if (InvertMask) M = ~M;
+ if (AddBits) setSmallBits(getSmallBits() | M);
+ else setSmallBits(getSmallBits() & ~M);
+ } else {
+ uint32_t M = Mask[0];
+ if (InvertMask) M = ~M;
+ if (AddBits) setSmallBits(getSmallBits() | M);
+ else setSmallBits(getSmallBits() & ~M);
+ }
+ }
};
inline SmallBitVector
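
The mask operations take a raw array of uint32_t words; in the small representation applyMask stitches the first one or two words into the single inline word, while the large representation forwards to BitVector. A usage sketch, assuming the header above:

#include "llvm/ADT/SmallBitVector.h"
#include <stdint.h>

void example() {
  llvm::SmallBitVector BV(8);       // 8 bits, held inline
  const uint32_t Mask[] = { 0xF };
  BV.setBitsInMask(Mask, 1);        // *this |= Mask: bits 0..3 now set
  BV.clearBitsInMask(Mask, 1);      // *this &= ~Mask: all clear again
}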
diff --git a/include/llvm/ADT/SmallString.h b/include/llvm/ADT/SmallString.h
index 199783b..c6f0a5b 100644
--- a/include/llvm/ADT/SmallString.h
+++ b/include/llvm/ADT/SmallString.h
@@ -45,7 +45,7 @@ public:
/// @{
/// Assign from a repeated element
- void assign(unsigned NumElts, char Elt) {
+ void assign(size_t NumElts, char Elt) {
this->SmallVectorImpl<char>::assign(NumElts, Elt);
}
@@ -77,6 +77,11 @@ public:
void append(in_iter S, in_iter E) {
SmallVectorImpl<char>::append(S, E);
}
+
+ void append(size_t NumInputs, char Elt) {
+ SmallVectorImpl<char>::append(NumInputs, Elt);
+ }
+
/// Append from a StringRef
void append(StringRef RHS) {
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 0d9d0d1..9fbbbe4 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -14,6 +14,7 @@
#ifndef LLVM_ADT_SMALLVECTOR_H
#define LLVM_ADT_SMALLVECTOR_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/type_traits.h"
#include <algorithm>
#include <cassert>
@@ -54,6 +55,11 @@ protected:
return BeginX == static_cast<const void*>(&FirstEl);
}
+ /// resetToSmall - Put this vector in a state of being small.
+ void resetToSmall() {
+ BeginX = EndX = CapacityX = &FirstEl;
+ }
+
/// grow_pod - This is an implementation of the grow() method which only works
/// on POD-like data types and is out of line to reduce code duplication.
void grow_pod(size_t MinSizeInBytes, size_t TSize);
@@ -160,28 +166,84 @@ protected:
}
}
- /// uninitialized_copy - Copy the range [I, E) onto the uninitialized memory
- /// starting with "Dest", constructing elements into it as needed.
+ /// move - Use move-assignment to move the range [I, E) onto the
+ /// objects starting with "Dest". This is just <algorithm>'s
+ /// std::move, but not all stdlibs actually provide that.
+ template<typename It1, typename It2>
+ static It2 move(It1 I, It1 E, It2 Dest) {
+#if LLVM_USE_RVALUE_REFERENCES
+ for (; I != E; ++I, ++Dest)
+ *Dest = ::std::move(*I);
+ return Dest;
+#else
+ return ::std::copy(I, E, Dest);
+#endif
+ }
+
+ /// move_backward - Use move-assignment to move the range
+ /// [I, E) onto the objects ending at "Dest", moving objects
+ /// in reverse order. This is just <algorithm>'s
+ /// std::move_backward, but not all stdlibs actually provide that.
+ template<typename It1, typename It2>
+ static It2 move_backward(It1 I, It1 E, It2 Dest) {
+#if LLVM_USE_RVALUE_REFERENCES
+ while (I != E)
+ *--Dest = ::std::move(*--E);
+ return Dest;
+#else
+ return ::std::copy_backward(I, E, Dest);
+#endif
+ }
+
+ /// uninitialized_move - Move the range [I, E) into the uninitialized
+ /// memory starting with "Dest", constructing elements as needed.
+ template<typename It1, typename It2>
+ static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+#if LLVM_USE_RVALUE_REFERENCES
+ for (; I != E; ++I, ++Dest)
+ ::new ((void*) &*Dest) T(::std::move(*I));
+#else
+ ::std::uninitialized_copy(I, E, Dest);
+#endif
+ }
+
+ /// uninitialized_copy - Copy the range [I, E) onto the uninitialized
+ /// memory starting with "Dest", constructing elements as needed.
template<typename It1, typename It2>
static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
std::uninitialized_copy(I, E, Dest);
}
- /// grow - double the size of the allocated memory, guaranteeing space for at
- /// least one more element or MinSize if specified.
+ /// grow - Grow the allocated memory (without initializing new
+ /// elements), doubling the size of the allocated memory.
+ /// Guarantees space for at least one more element, or MinSize more
+ /// elements if specified.
void grow(size_t MinSize = 0);
public:
void push_back(const T &Elt) {
if (this->EndX < this->CapacityX) {
Retry:
- new (this->end()) T(Elt);
+ ::new ((void*) this->end()) T(Elt);
this->setEnd(this->end()+1);
return;
}
this->grow();
goto Retry;
}
+
+#if LLVM_USE_RVALUE_REFERENCES
+ void push_back(T &&Elt) {
+ if (this->EndX < this->CapacityX) {
+ Retry:
+ ::new ((void*) this->end()) T(::std::move(Elt));
+ this->setEnd(this->end()+1);
+ return;
+ }
+ this->grow();
+ goto Retry;
+ }
+#endif
void pop_back() {
this->setEnd(this->end()-1);
@@ -199,8 +261,8 @@ void SmallVectorTemplateBase<T, isPodLike>::grow(size_t MinSize) {
NewCapacity = MinSize;
T *NewElts = static_cast<T*>(malloc(NewCapacity*sizeof(T)));
- // Copy the elements over.
- this->uninitialized_copy(this->begin(), this->end(), NewElts);
+ // Move the elements over.
+ this->uninitialized_move(this->begin(), this->end(), NewElts);
// Destroy the original elements.
destroy_range(this->begin(), this->end());
@@ -225,6 +287,29 @@ protected:
// No need to do a destroy loop for POD's.
static void destroy_range(T *, T *) {}
+ /// move - Use move-assignment to move the range [I, E) onto the
+ /// objects starting with "Dest". For PODs, this is just memcpy.
+ template<typename It1, typename It2>
+ static It2 move(It1 I, It1 E, It2 Dest) {
+ return ::std::copy(I, E, Dest);
+ }
+
+ /// move_backward - Use move-assignment to move the range
+ /// [I, E) onto the objects ending at "Dest", moving objects
+ /// in reverse order.
+ template<typename It1, typename It2>
+ static It2 move_backward(It1 I, It1 E, It2 Dest) {
+ return ::std::copy_backward(I, E, Dest);
+ }
+
+ /// uninitialized_move - Move the range [I, E) onto the uninitialized memory
+ /// starting with "Dest", constructing elements into it as needed.
+ template<typename It1, typename It2>
+ static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+ // Just do a copy.
+ uninitialized_copy(I, E, Dest);
+ }
+
/// uninitialized_copy - Copy the range [I, E) onto the uninitialized memory
/// starting with "Dest", constructing elements into it as needed.
template<typename It1, typename It2>
@@ -252,7 +337,7 @@ public:
void push_back(const T &Elt) {
if (this->EndX < this->CapacityX) {
Retry:
- *this->end() = Elt;
+ memcpy(this->end(), &Elt, sizeof(T));
this->setEnd(this->end()+1);
return;
}
@@ -330,7 +415,11 @@ public:
}
T pop_back_val() {
+#if LLVM_USE_RVALUE_REFERENCES
+ T Result = ::std::move(this->back());
+#else
T Result = this->back();
+#endif
this->pop_back();
return Result;
}
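A hedged usage sketch: pop_back_val now moves the last element out when rvalue references are available, so popping an expensive-to-copy element avoids a deep copy.

llvm::SmallVector<std::string, 4> Names;
Names.push_back("tail");
std::string Last = Names.pop_back_val(); // moved out under C++11, copied otherwise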
@@ -374,36 +463,79 @@ public:
}
iterator erase(iterator I) {
+ assert(I >= this->begin() && "Iterator to erase is out of bounds.");
+ assert(I < this->end() && "Erasing at past-the-end iterator.");
+
iterator N = I;
// Shift all elts down one.
- std::copy(I+1, this->end(), I);
+ this->move(I+1, this->end(), I);
// Drop the last elt.
this->pop_back();
return(N);
}
iterator erase(iterator S, iterator E) {
+ assert(S >= this->begin() && "Range to erase is out of bounds.");
+ assert(S <= E && "Trying to erase invalid range.");
+ assert(E <= this->end() && "Trying to erase past the end.");
+
iterator N = S;
// Shift all elts down.
- iterator I = std::copy(E, this->end(), S);
+ iterator I = this->move(E, this->end(), S);
// Drop the last elts.
this->destroy_range(I, this->end());
this->setEnd(I);
return(N);
}
+#if LLVM_USE_RVALUE_REFERENCES
+ iterator insert(iterator I, T &&Elt) {
+ if (I == this->end()) { // Important special case for empty vector.
+ this->push_back(::std::move(Elt));
+ return this->end()-1;
+ }
+
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+
+ if (this->EndX < this->CapacityX) {
+ Retry:
+ ::new ((void*) this->end()) T(::std::move(this->back()));
+ this->setEnd(this->end()+1);
+ // Push everything else over.
+ this->move_backward(I, this->end()-1, this->end());
+
+ // If we just moved the element we're inserting, be sure to update
+ // the reference.
+ T *EltPtr = &Elt;
+ if (I <= EltPtr && EltPtr < this->EndX)
+ ++EltPtr;
+
+ *I = ::std::move(*EltPtr);
+ return I;
+ }
+ size_t EltNo = I-this->begin();
+ this->grow();
+ I = this->begin()+EltNo;
+ goto Retry;
+ }
+#endif
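Sketch (with assumed element values) of the aliasing case the EltPtr adjustment above guards against: the inserted value may itself live inside the vector, and the move_backward shifts it one slot to the right before it is read.

llvm::SmallVector<int, 8> V;
for (int i = 0; i != 4; ++i) V.push_back(i); // V = {0, 1, 2, 3}
V.insert(V.begin(), std::move(V[2]));        // EltPtr is bumped past the shift
// V is now {2, 0, 1, 2, 3} (a moved-from int keeps its value)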
+
iterator insert(iterator I, const T &Elt) {
if (I == this->end()) { // Important special case for empty vector.
this->push_back(Elt);
return this->end()-1;
}
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+
if (this->EndX < this->CapacityX) {
Retry:
- new (this->end()) T(this->back());
+ ::new ((void*) this->end()) T(this->back());
this->setEnd(this->end()+1);
// Push everything else over.
- std::copy_backward(I, this->end()-1, this->end());
+ this->move_backward(I, this->end()-1, this->end());
// If we just moved the element we're inserting, be sure to update
// the reference.
@@ -421,13 +553,16 @@ public:
}
iterator insert(iterator I, size_type NumToInsert, const T &Elt) {
+ // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+ size_t InsertElt = I - this->begin();
+
if (I == this->end()) { // Important special case for empty vector.
append(NumToInsert, Elt);
- return this->end()-1;
+ return this->begin()+InsertElt;
}
- // Convert iterator to elt# to avoid invalidating iterator when we reserve()
- size_t InsertElt = I - this->begin();
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
// Ensure there is enough space.
reserve(static_cast<unsigned>(this->size() + NumToInsert));
@@ -444,7 +579,7 @@ public:
append(this->end()-NumToInsert, this->end());
// Copy the existing elements that get replaced.
- std::copy_backward(I, OldEnd-NumToInsert, OldEnd);
+ this->move_backward(I, OldEnd-NumToInsert, OldEnd);
std::fill_n(I, NumToInsert, Elt);
return I;
@@ -453,11 +588,11 @@ public:
// Otherwise, we're inserting more elements than exist already, and we're
// not inserting at the end.
- // Copy over the elements that we're about to overwrite.
+ // Move over the elements that we're about to overwrite.
T *OldEnd = this->end();
this->setEnd(this->end() + NumToInsert);
size_t NumOverwritten = OldEnd-I;
- this->uninitialized_copy(I, OldEnd, this->end()-NumOverwritten);
+ this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
// Replace the overwritten part.
std::fill_n(I, NumOverwritten, Elt);
@@ -469,14 +604,18 @@ public:
template<typename ItTy>
iterator insert(iterator I, ItTy From, ItTy To) {
+ // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+ size_t InsertElt = I - this->begin();
+
if (I == this->end()) { // Important special case for empty vector.
append(From, To);
- return this->end()-1;
+ return this->begin()+InsertElt;
}
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+
size_t NumToInsert = std::distance(From, To);
- // Convert iterator to elt# to avoid invalidating iterator when we reserve()
- size_t InsertElt = I - this->begin();
// Ensure there is enough space.
reserve(static_cast<unsigned>(this->size() + NumToInsert));
@@ -493,7 +632,7 @@ public:
append(this->end()-NumToInsert, this->end());
// Copy the existing elements that get replaced.
- std::copy_backward(I, OldEnd-NumToInsert, OldEnd);
+ this->move_backward(I, OldEnd-NumToInsert, OldEnd);
std::copy(From, To, I);
return I;
@@ -502,16 +641,16 @@ public:
// Otherwise, we're inserting more elements than exist already, and we're
// not inserting at the end.
- // Copy over the elements that we're about to overwrite.
+ // Move over the elements that we're about to overwrite.
T *OldEnd = this->end();
this->setEnd(this->end() + NumToInsert);
size_t NumOverwritten = OldEnd-I;
- this->uninitialized_copy(I, OldEnd, this->end()-NumOverwritten);
+ this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
// Replace the overwritten part.
- for (; NumOverwritten > 0; --NumOverwritten) {
- *I = *From;
- ++I; ++From;
+ for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
+ *J = *From;
+ ++J; ++From;
}
// Insert the non-overwritten middle part.
@@ -519,8 +658,11 @@ public:
return I;
}
- const SmallVectorImpl
- &operator=(const SmallVectorImpl &RHS);
+ SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
+
+#if LLVM_USE_RVALUE_REFERENCES
+ SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
+#endif
bool operator==(const SmallVectorImpl &RHS) const {
if (this->size() != RHS.size()) return false;
@@ -590,7 +732,7 @@ void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
}
template <typename T>
-const SmallVectorImpl<T> &SmallVectorImpl<T>::
+SmallVectorImpl<T> &SmallVectorImpl<T>::
operator=(const SmallVectorImpl<T> &RHS) {
// Avoid self-assignment.
if (this == &RHS) return *this;
@@ -617,6 +759,7 @@ const SmallVectorImpl<T> &SmallVectorImpl<T>::
// If we have to grow to have enough elements, destroy the current elements.
// This allows us to avoid copying them during the grow.
+ // FIXME: don't do this if they're efficiently moveable.
if (this->capacity() < RHSSize) {
// Destroy current elements.
this->destroy_range(this->begin(), this->end());
@@ -637,6 +780,69 @@ const SmallVectorImpl<T> &SmallVectorImpl<T>::
return *this;
}
+#if LLVM_USE_RVALUE_REFERENCES
+template <typename T>
+SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
+ // Avoid self-assignment.
+ if (this == &RHS) return *this;
+
+ // If the RHS isn't small, clear this vector and then steal its buffer.
+ if (!RHS.isSmall()) {
+ this->destroy_range(this->begin(), this->end());
+ if (!this->isSmall()) free(this->begin());
+ this->BeginX = RHS.BeginX;
+ this->EndX = RHS.EndX;
+ this->CapacityX = RHS.CapacityX;
+ RHS.resetToSmall();
+ return *this;
+ }
+
+ // If we already have sufficient space, assign the common elements, then
+ // destroy any excess.
+ size_t RHSSize = RHS.size();
+ size_t CurSize = this->size();
+ if (CurSize >= RHSSize) {
+ // Assign common elements.
+ iterator NewEnd = this->begin();
+ if (RHSSize)
+ NewEnd = this->move(RHS.begin(), RHS.end(), NewEnd);
+
+ // Destroy excess elements and trim the bounds.
+ this->destroy_range(NewEnd, this->end());
+ this->setEnd(NewEnd);
+
+ // Clear the RHS.
+ RHS.clear();
+
+ return *this;
+ }
+
+ // If we have to grow to have enough elements, destroy the current elements.
+ // This allows us to avoid copying them during the grow.
+ // FIXME: this may not actually make any sense if we can efficiently move
+ // elements.
+ if (this->capacity() < RHSSize) {
+ // Destroy current elements.
+ this->destroy_range(this->begin(), this->end());
+ this->setEnd(this->begin());
+ CurSize = 0;
+ this->grow(RHSSize);
+ } else if (CurSize) {
+ // Otherwise, use assignment for the already-constructed elements.
+ this->move(RHS.begin(), RHS.end(), this->begin());
+ }
+
+ // Move-construct the new elements in place.
+ this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
+ this->begin()+CurSize);
+
+ // Set end.
+ this->setEnd(this->begin()+RHSSize);
+
+ RHS.clear();
+ return *this;
+}
+#endif
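A minimal sketch of the two strategies in the move-assignment above; the element counts are arbitrary:

llvm::SmallVector<int, 4> A, B;
for (int i = 0; i != 100; ++i) B.push_back(i); // B has spilled to the heap
A = std::move(B);   // not small: A steals B's buffer, B resets to inline
llvm::SmallVector<int, 4> C, D;
D.push_back(1);     // D still uses its inline storage
C = std::move(D);   // small: elements are moved one by one, then D.clear()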
/// SmallVector - This is a 'vector' (really, a variable-sized array), optimized
/// for the case when the array is small. It contains some number of elements
@@ -692,6 +898,18 @@ public:
return *this;
}
+#if LLVM_USE_RVALUE_REFERENCES
+ SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(NumTsAvailable) {
+ if (!RHS.empty())
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ }
+
+ const SmallVector &operator=(SmallVector &&RHS) {
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ return *this;
+ }
+#endif
+
};
/// Specialize SmallVector at N=0. This specialization guarantees
@@ -700,7 +918,8 @@ public:
template <typename T>
class SmallVector<T,0> : public SmallVectorImpl<T> {
public:
- SmallVector() : SmallVectorImpl<T>(0) {}
+ SmallVector() : SmallVectorImpl<T>(0) {
+ }
explicit SmallVector(unsigned Size, const T &Value = T())
: SmallVectorImpl<T>(0) {
@@ -713,13 +932,26 @@ public:
}
SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(0) {
+ if (!RHS.empty())
+ SmallVectorImpl<T>::operator=(RHS);
+ }
+
+ const SmallVector &operator=(const SmallVector &RHS) {
SmallVectorImpl<T>::operator=(RHS);
+ return *this;
}
- SmallVector &operator=(const SmallVectorImpl<T> &RHS) {
- return SmallVectorImpl<T>::operator=(RHS);
+#if LLVM_USE_RVALUE_REFERENCES
+ SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(0) {
+ if (!RHS.empty())
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
}
+ const SmallVector &operator=(SmallVector &&RHS) {
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ return *this;
+ }
+#endif
};
template<typename T, unsigned N>
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 923c6a5..55696333 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -21,33 +21,62 @@
#define LLVM_ADT_SPARSESET_H
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/DataTypes.h"
#include <limits>
namespace llvm {
-/// SparseSetFunctor - Objects in a SparseSet are identified by small integer
-/// keys. A functor object is used to compute the key of an object. The
-/// functor's operator() must return an unsigned smaller than the universe.
+/// SparseSetValTraits - Objects in a SparseSet are identified by keys that can
+/// be uniquely converted to a small integer less than the set's universe. This
+/// class allows the set to hold values that differ from the set's key type as
+/// long as an index can still be derived from the value. SparseSet never
+/// directly compares ValueT, only their indices, so it can map keys to
+/// arbitrary values. SparseSetValTraits computes the index from the value
+/// object. To compute the index from a key, SparseSet uses a separate
+/// KeyFunctorT template argument.
///
-/// The default functor implementation forwards to a getSparseSetKey() method
-/// on the object. It is intended for sparse sets holding ad-hoc structs.
+/// A simple type declaration, SparseSet<Type>, handles these cases:
+/// - unsigned key, identity index, identity value
+/// - unsigned key, identity index, fat value providing getSparseSetIndex()
+///
+/// The type declaration SparseSet<Type, UnaryFunction> handles:
+/// - unsigned key, remapped index, identity value (virtual registers)
+/// - pointer key, pointer-derived index, identity value (node+ID)
+/// - pointer key, pointer-derived index, fat value with getSparseSetIndex()
+///
+/// Only other, unexpected cases require specializing SparseSetValTraits.
+///
+/// For best results, ValueT should not require a destructor.
///
template<typename ValueT>
-struct SparseSetFunctor {
- unsigned operator()(const ValueT &Val) {
- return Val.getSparseSetKey();
+struct SparseSetValTraits {
+ static unsigned getValIndex(const ValueT &Val) {
+ return Val.getSparseSetIndex();
}
};
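A hypothetical "fat value" that works with a plain SparseSet<Node> declaration via the generic trait above; Node and its fields are illustrative only:

struct Node {
  unsigned ID;       // must be smaller than the set's universe
  int Payload;
  unsigned getSparseSetIndex() const { return ID; }
};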
-/// SparseSetFunctor<unsigned> - Provide a trivial identity functor for
-/// SparseSet<unsigned>.
+/// SparseSetValFunctor - Helper class for selecting SparseSetValTraits. The
+/// generic implementation handles ValueT classes which either provide
+/// getSparseSetIndex() or specialize SparseSetValTraits<>.
///
-template<> struct SparseSetFunctor<unsigned> {
- unsigned operator()(unsigned Val) { return Val; }
+template<typename KeyT, typename ValueT, typename KeyFunctorT>
+struct SparseSetValFunctor {
+ unsigned operator()(const ValueT &Val) const {
+ return SparseSetValTraits<ValueT>::getValIndex(Val);
+ }
};
-/// SparseSet - Fast set implementation for objects that can be identified by
+/// SparseSetValFunctor<KeyT, KeyT> - Helper class for the common case of
+/// identity key/value sets.
+template<typename KeyT, typename KeyFunctorT>
+struct SparseSetValFunctor<KeyT, KeyT, KeyFunctorT> {
+ unsigned operator()(const KeyT &Key) const {
+ return KeyFunctorT()(Key);
+ }
+};
+
+/// SparseSet - Fast set implementation for objects that can be identified by
/// small unsigned keys.
///
/// SparseSet allocates memory proportional to the size of the key universe, so
@@ -82,18 +111,20 @@ template<> struct SparseSetFunctor<unsigned> {
/// uint16_t or uint32_t.
///
/// @param ValueT The type of objects in the set.
+/// @param KeyFunctorT A functor that computes an unsigned index from KeyT.
/// @param SparseT An unsigned integer type. See above.
-/// @param KeyFunctorT A functor that computes the unsigned key of a ValueT.
///
template<typename ValueT,
- typename SparseT = uint8_t,
- typename KeyFunctorT = SparseSetFunctor<ValueT> >
+ typename KeyFunctorT = llvm::identity<unsigned>,
+ typename SparseT = uint8_t>
class SparseSet {
+ typedef typename KeyFunctorT::argument_type KeyT;
typedef SmallVector<ValueT, 8> DenseT;
DenseT Dense;
SparseT *Sparse;
unsigned Universe;
- KeyFunctorT KeyOf;
+ KeyFunctorT KeyIndexOf;
+ SparseSetValFunctor<KeyT, ValueT, KeyFunctorT> ValIndexOf;
// Disable copy construction and assignment.
// This data structure is not meant to be used that way.
@@ -160,21 +191,21 @@ public:
Dense.clear();
}
- /// find - Find an element by its key.
+ /// findIndex - Find an element by its index.
///
- /// @param Key A valid key to find.
+ /// @param Idx A valid index to find.
/// @returns An iterator to the element identified by key, or end().
///
- iterator find(unsigned Key) {
- assert(Key < Universe && "Key out of range");
+ iterator findIndex(unsigned Idx) {
+ assert(Idx < Universe && "Key out of range");
assert(std::numeric_limits<SparseT>::is_integer &&
!std::numeric_limits<SparseT>::is_signed &&
"SparseT must be an unsigned integer type");
const unsigned Stride = std::numeric_limits<SparseT>::max() + 1u;
- for (unsigned i = Sparse[Key], e = size(); i < e; i += Stride) {
- const unsigned FoundKey = KeyOf(Dense[i]);
- assert(FoundKey < Universe && "Invalid key in set. Did object mutate?");
- if (Key == FoundKey)
+ for (unsigned i = Sparse[Idx], e = size(); i < e; i += Stride) {
+ const unsigned FoundIdx = ValIndexOf(Dense[i]);
+ assert(FoundIdx < Universe && "Invalid key in set. Did object mutate?");
+ if (Idx == FoundIdx)
return begin() + i;
// Stride is 0 when SparseT >= unsigned. We don't need to loop.
if (!Stride)
@@ -183,13 +214,22 @@ public:
return end();
}
- const_iterator find(unsigned Key) const {
- return const_cast<SparseSet*>(this)->find(Key);
+ /// find - Find an element by its key.
+ ///
+ /// @param Key A valid key to find.
+ /// @returns An iterator to the element identified by key, or end().
+ ///
+ iterator find(const KeyT &Key) {
+ return findIndex(KeyIndexOf(Key));
+ }
+
+ const_iterator find(const KeyT &Key) const {
+ return const_cast<SparseSet*>(this)->findIndex(KeyIndexOf(Key));
}
/// count - Returns true if this set contains an element identified by Key.
///
- bool count(unsigned Key) const {
+ bool count(const KeyT &Key) const {
return find(Key) != end();
}
@@ -204,11 +244,11 @@ public:
/// Insertion invalidates all iterators.
///
std::pair<iterator, bool> insert(const ValueT &Val) {
- unsigned Key = KeyOf(Val);
- iterator I = find(Key);
+ unsigned Idx = ValIndexOf(Val);
+ iterator I = findIndex(Idx);
if (I != end())
return std::make_pair(I, false);
- Sparse[Key] = size();
+ Sparse[Idx] = size();
Dense.push_back(Val);
return std::make_pair(end() - 1, true);
}
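Basic usage under the new interface, as a sketch (UniverseSize is an assumed bound on the key space):

llvm::SparseSet<unsigned> Set;
Set.setUniverse(UniverseSize); // allocates the sparse array
Set.insert(5);                 // indexed via SparseSetValFunctor
if (Set.count(5))              // find(Key) -> findIndex(KeyIndexOf(Key))
  Set.erase(5);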
@@ -216,7 +256,7 @@ public:
/// array subscript - If an element already exists with this key, return it.
/// Otherwise, automatically construct a new value from Key, insert it,
/// and return the newly inserted element.
- ValueT &operator[](unsigned Key) {
+ ValueT &operator[](const KeyT &Key) {
return *insert(ValueT(Key)).first;
}
@@ -238,9 +278,9 @@ public:
assert(unsigned(I - begin()) < size() && "Invalid iterator");
if (I != end() - 1) {
*I = Dense.back();
- unsigned BackKey = KeyOf(Dense.back());
- assert(BackKey < Universe && "Invalid key in set. Did object mutate?");
- Sparse[BackKey] = I - begin();
+ unsigned BackIdx = ValIndexOf(Dense.back());
+ assert(BackIdx < Universe && "Invalid key in set. Did object mutate?");
+ Sparse[BackIdx] = I - begin();
}
// This depends on SmallVector::pop_back() not invalidating iterators.
// std::vector::pop_back() doesn't give that guarantee.
@@ -253,7 +293,7 @@ public:
/// @param Key The key identifying the element to erase.
/// @returns True when an element was erased, false if no element was found.
///
- bool erase(unsigned Key) {
+ bool erase(const KeyT &Key) {
iterator I = find(Key);
if (I == end())
return false;
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index 76ba66e..cd84603 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -12,6 +12,7 @@
#include "llvm/Support/type_traits.h"
+#include <algorithm>
#include <cassert>
#include <cstring>
#include <limits>
@@ -292,6 +293,16 @@ namespace llvm {
/// Note: O(size() + Chars.size())
size_type find_last_of(StringRef Chars, size_t From = npos) const;
+ /// find_last_not_of - Find the last character in the string that is not
+ /// \arg C, or npos if not found.
+ size_type find_last_not_of(char C, size_t From = npos) const;
+
+ /// find_last_not_of - Find the last character in the string that is not in
+ /// \arg Chars, or npos if not found.
+ ///
+ /// Note: O(size() + Chars.size())
+ size_type find_last_not_of(StringRef Chars, size_t From = npos) const;
+
/// @}
/// @name Helpful Algorithms
/// @{
@@ -480,6 +491,24 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
}
+ /// ltrim - Return string with consecutive characters in \arg Chars starting
+ /// from the left removed.
+ StringRef ltrim(StringRef Chars = " \t\n\v\f\r") const {
+ return drop_front(std::min(Length, find_first_not_of(Chars)));
+ }
+
+ /// rtrim - Return string with consecutive characters in \arg Chars starting
+ /// from the right removed.
+ StringRef rtrim(StringRef Chars = " \t\n\v\f\r") const {
+ return drop_back(Length - std::min(Length, find_last_not_of(Chars) + 1));
+ }
+
+ /// trim - Return string with consecutive characters in \arg Chars starting
+ /// from the left and right removed.
+ StringRef trim(StringRef Chars = " \t\n\v\f\r") const {
+ return ltrim(Chars).rtrim(Chars);
+ }
+
/// @}
};
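A short sketch of the new trimming helpers (input chosen arbitrarily):

llvm::StringRef S("  \t key = value \n");
llvm::StringRef T = S.trim();         // "key = value"
llvm::StringRef L = S.ltrim();        // "key = value \n"
llvm::StringRef R = S.rtrim(" \t\n"); // "  \t key = value"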
diff --git a/include/llvm/ADT/StringSwitch.h b/include/llvm/ADT/StringSwitch.h
index 7480583..7fd6e279 100644
--- a/include/llvm/ADT/StringSwitch.h
+++ b/include/llvm/ADT/StringSwitch.h
@@ -48,8 +48,8 @@ class StringSwitch {
const T *Result;
public:
- explicit StringSwitch(StringRef Str)
- : Str(Str), Result(0) { }
+ explicit StringSwitch(StringRef S)
+ : Str(S), Result(0) { }
template<unsigned N>
StringSwitch& Case(const char (&S)[N], const T& Value) {
diff --git a/include/llvm/ADT/TinyPtrVector.h b/include/llvm/ADT/TinyPtrVector.h
index 5014517..d3d33b8 100644
--- a/include/llvm/ADT/TinyPtrVector.h
+++ b/include/llvm/ADT/TinyPtrVector.h
@@ -10,8 +10,11 @@
#ifndef LLVM_ADT_TINYPTRVECTOR_H
#define LLVM_ADT_TINYPTRVECTOR_H
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -25,18 +28,78 @@ template <typename EltTy>
class TinyPtrVector {
public:
typedef llvm::SmallVector<EltTy, 4> VecTy;
+ typedef typename VecTy::value_type value_type;
+
llvm::PointerUnion<EltTy, VecTy*> Val;
-
+
TinyPtrVector() {}
+ ~TinyPtrVector() {
+ if (VecTy *V = Val.template dyn_cast<VecTy*>())
+ delete V;
+ }
+
TinyPtrVector(const TinyPtrVector &RHS) : Val(RHS.Val) {
if (VecTy *V = Val.template dyn_cast<VecTy*>())
Val = new VecTy(*V);
}
- ~TinyPtrVector() {
- if (VecTy *V = Val.template dyn_cast<VecTy*>())
+ TinyPtrVector &operator=(const TinyPtrVector &RHS) {
+ if (this == &RHS)
+ return *this;
+ if (RHS.empty()) {
+ this->clear();
+ return *this;
+ }
+
+ // Try to squeeze into the single slot. If it won't fit, allocate a copied
+ // vector.
+ if (Val.template is<EltTy>()) {
+ if (RHS.size() == 1)
+ Val = RHS.front();
+ else
+ Val = new VecTy(*RHS.Val.template get<VecTy*>());
+ return *this;
+ }
+
+ // If we have a full vector allocated, try to re-use it.
+ if (RHS.Val.template is<EltTy>()) {
+ Val.template get<VecTy*>()->clear();
+ Val.template get<VecTy*>()->push_back(RHS.front());
+ } else {
+ *Val.template get<VecTy*>() = *RHS.Val.template get<VecTy*>();
+ }
+ return *this;
+ }
+
+#if LLVM_USE_RVALUE_REFERENCES
+ TinyPtrVector(TinyPtrVector &&RHS) : Val(RHS.Val) {
+ RHS.Val = (EltTy)0;
+ }
+ TinyPtrVector &operator=(TinyPtrVector &&RHS) {
+ if (this == &RHS)
+ return *this;
+ if (RHS.empty()) {
+ this->clear();
+ return *this;
+ }
+
+ // If this vector has been allocated on the heap, re-use it if cheap. If it
+ // would require more copying, just delete it and we'll steal the other
+ // side.
+ if (VecTy *V = Val.template dyn_cast<VecTy*>()) {
+ if (RHS.Val.template is<EltTy>()) {
+ V->clear();
+ V->push_back(RHS.front());
+ return *this;
+ }
delete V;
+ }
+
+ Val = RHS.Val;
+ RHS.Val = (EltTy)0;
+ return *this;
}
-
+#endif
+
// implicit conversion operator to ArrayRef.
operator ArrayRef<EltTy>() const {
if (Val.isNull())
@@ -45,7 +108,7 @@ public:
return *Val.getAddrOfPtr1();
return *Val.template get<VecTy*>();
}
-
+
bool empty() const {
// This vector can be empty if it contains no element, or if it
// contains a pointer to an empty vector.
@@ -54,7 +117,7 @@ public:
return Vec->empty();
return false;
}
-
+
unsigned size() const {
if (empty())
return 0;
@@ -62,27 +125,21 @@ public:
return 1;
return Val.template get<VecTy*>()->size();
}
-
+
typedef const EltTy *const_iterator;
typedef EltTy *iterator;
iterator begin() {
- if (empty())
- return 0;
-
if (Val.template is<EltTy>())
return Val.getAddrOfPtr1();
-
+
return Val.template get<VecTy *>()->begin();
}
iterator end() {
- if (empty())
- return 0;
-
if (Val.template is<EltTy>())
- return begin() + 1;
-
+ return begin() + (Val.isNull() ? 0 : 1);
+
return Val.template get<VecTy *>()->end();
}
@@ -100,38 +157,53 @@ public:
assert(i == 0 && "tinyvector index out of range");
return V;
}
-
- assert(i < Val.template get<VecTy*>()->size() &&
+
+ assert(i < Val.template get<VecTy*>()->size() &&
"tinyvector index out of range");
return (*Val.template get<VecTy*>())[i];
}
-
+
EltTy front() const {
assert(!empty() && "vector empty");
if (EltTy V = Val.template dyn_cast<EltTy>())
return V;
return Val.template get<VecTy*>()->front();
}
-
+
+ EltTy back() const {
+ assert(!empty() && "vector empty");
+ if (EltTy V = Val.template dyn_cast<EltTy>())
+ return V;
+ return Val.template get<VecTy*>()->back();
+ }
+
void push_back(EltTy NewVal) {
assert(NewVal != 0 && "Can't add a null value");
-
+
// If we have nothing, add something.
if (Val.isNull()) {
Val = NewVal;
return;
}
-
+
// If we have a single value, convert to a vector.
if (EltTy V = Val.template dyn_cast<EltTy>()) {
Val = new VecTy();
Val.template get<VecTy*>()->push_back(V);
}
-
+
// Add the new value, we know we have a vector.
Val.template get<VecTy*>()->push_back(NewVal);
}
-
+
+ void pop_back() {
+ // If we have a single value, convert to empty.
+ if (Val.template is<EltTy>())
+ Val = (EltTy)0;
+ else if (VecTy *Vec = Val.template get<VecTy*>())
+ Vec->pop_back();
+ }
+
void clear() {
// If we have a single value, convert to empty.
if (Val.template is<EltTy>()) {
@@ -144,6 +216,9 @@ public:
}
iterator erase(iterator I) {
+ assert(I >= begin() && "Iterator to erase is out of bounds.");
+ assert(I < end() && "Erasing at past-the-end iterator.");
+
// If we have a single value, convert to empty.
if (Val.template is<EltTy>()) {
if (I == begin())
@@ -153,12 +228,63 @@ public:
// benefit to collapsing back to a pointer
return Vec->erase(I);
}
+ return end();
+ }
+
+ iterator erase(iterator S, iterator E) {
+ assert(S >= begin() && "Range to erase is out of bounds.");
+ assert(S <= E && "Trying to erase invalid range.");
+ assert(E <= end() && "Trying to erase past the end.");
+
+ if (Val.template is<EltTy>()) {
+ if (S == begin() && S != E)
+ Val = (EltTy)0;
+ } else if (VecTy *Vec = Val.template dyn_cast<VecTy*>()) {
+ return Vec->erase(S, E);
+ }
+ return end();
+ }
- return 0;
+ iterator insert(iterator I, const EltTy &Elt) {
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+ if (I == end()) {
+ push_back(Elt);
+ return llvm::prior(end());
+ }
+ assert(!Val.isNull() && "Null value with non-end insert iterator.");
+ if (EltTy V = Val.template dyn_cast<EltTy>()) {
+ assert(I == begin());
+ Val = Elt;
+ push_back(V);
+ return begin();
+ }
+
+ return Val.template get<VecTy*>()->insert(I, Elt);
+ }
+
+ template<typename ItTy>
+ iterator insert(iterator I, ItTy From, ItTy To) {
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+ if (From == To)
+ return I;
+
+ // If we have a single value, convert to a vector.
+ ptrdiff_t Offset = I - begin();
+ if (Val.isNull()) {
+ if (llvm::next(From) == To) {
+ Val = *From;
+ return begin();
+ }
+
+ Val = new VecTy();
+ } else if (EltTy V = Val.template dyn_cast<EltTy>()) {
+ Val = new VecTy();
+ Val.template get<VecTy*>()->push_back(V);
+ }
+ return Val.template get<VecTy*>()->insert(begin() + Offset, From, To);
}
-
-private:
- void operator=(const TinyPtrVector&); // NOT IMPLEMENTED YET.
};
} // end namespace llvm
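Sketch of the representation this class manages (I1 and I2 are assumed Instruction pointers):

llvm::TinyPtrVector<llvm::Instruction*> Uses;
Uses.push_back(I1); // one element: stored directly in the pointer union
Uses.push_back(I2); // two elements: spills to a heap-allocated SmallVector
Uses.pop_back();    // shrinks the vector; deliberately never collapses back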
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index f5f99d0..7f7061a 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -62,8 +62,8 @@ public:
x86_64, // X86-64: amd64, x86_64
xcore, // XCore: xcore
mblaze, // MBlaze: mblaze
- ptx32, // PTX: ptx (32-bit)
- ptx64, // PTX: ptx (64-bit)
+ nvptx, // NVPTX: 32-bit
+ nvptx64, // NVPTX: 64-bit
le32, // le32: generic little-endian 32-bit CPU (PNaCl / Emscripten)
amdil // amdil: amd IL
};
@@ -98,7 +98,8 @@ public:
Minix,
RTEMS,
NativeClient,
- CNK // BG/P Compute-Node Kernel
+ CNK, // BG/P Compute-Node Kernel
+ Bitrig
};
enum EnvironmentType {
UnknownEnvironment,
@@ -194,6 +195,11 @@ public:
bool getMacOSXVersion(unsigned &Major, unsigned &Minor,
unsigned &Micro) const;
+ /// getiOSVersion - Parse the version number as with getOSVersion. This should
+ /// only be called with IOS triples.
+ void getiOSVersion(unsigned &Major, unsigned &Minor,
+ unsigned &Micro) const;
+
/// @}
/// @name Direct Component Access
/// @{
@@ -266,7 +272,7 @@ public:
/// compatibility, which handles supporting skewed version numbering schemes
/// used by the "darwin" triples.
unsigned isMacOSXVersionLT(unsigned Major, unsigned Minor = 0,
- unsigned Micro = 0) const {
+ unsigned Micro = 0) const {
assert(isMacOSX() && "Not an OS X triple!");
// If this is OS X, expect a sane version number.
diff --git a/include/llvm/ADT/ValueMap.h b/include/llvm/ADT/ValueMap.h
index 707d07d..f7e2551 100644
--- a/include/llvm/ADT/ValueMap.h
+++ b/include/llvm/ADT/ValueMap.h
@@ -111,20 +111,21 @@ public:
/// count - Return true if the specified key is in the map.
bool count(const KeyT &Val) const {
- return Map.count(Wrap(Val));
+ return Map.find_as(Val) != Map.end();
}
iterator find(const KeyT &Val) {
- return iterator(Map.find(Wrap(Val)));
+ return iterator(Map.find_as(Val));
}
const_iterator find(const KeyT &Val) const {
- return const_iterator(Map.find(Wrap(Val)));
+ return const_iterator(Map.find_as(Val));
}
/// lookup - Return the entry for the specified key, or a default
/// constructed value if no such entry exists.
ValueT lookup(const KeyT &Val) const {
- return Map.lookup(Wrap(Val));
+ typename MapT::const_iterator I = Map.find_as(Val);
+ return I != Map.end() ? I->second : ValueT();
}
// Inserts key,value pair into the map if the key isn't already in the map.
@@ -145,7 +146,12 @@ public:
bool erase(const KeyT &Val) {
- return Map.erase(Wrap(Val));
+ typename MapT::iterator I = Map.find_as(Val);
+ if (I == Map.end())
+ return false;
+
+ Map.erase(I);
+ return true;
}
void erase(iterator I) {
return Map.erase(I.base());
@@ -256,9 +262,15 @@ struct DenseMapInfo<ValueMapCallbackVH<KeyT, ValueT, Config> > {
static unsigned getHashValue(const VH &Val) {
return PointerInfo::getHashValue(Val.Unwrap());
}
+ static unsigned getHashValue(const KeyT &Val) {
+ return PointerInfo::getHashValue(Val);
+ }
static bool isEqual(const VH &LHS, const VH &RHS) {
return LHS == RHS;
}
+ static bool isEqual(const KeyT &LHS, const VH &RHS) {
+ return LHS == RHS.getValPtr();
+ }
};
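Hedged sketch of what find_as buys here: lookups hash the raw Value* key directly instead of first constructing a ValueMapCallbackVH wrapper (V is an assumed Value*):

llvm::ValueMap<llvm::Value*, unsigned> Slots;
Slots.insert(std::make_pair(V, 7u));
unsigned Slot = Slots.lookup(V); // Map.find_as(V): no temporary VH built
bool Erased = Slots.erase(V);    // likewise resolved through find_as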
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index b823f71..674868a 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -50,6 +50,7 @@ class Pass;
class AnalysisUsage;
class MemTransferInst;
class MemIntrinsic;
+class DominatorTree;
class AliasAnalysis {
protected:
@@ -462,6 +463,18 @@ public:
virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
ImmutableCallSite CS2);
+ /// callCapturesBefore - Return information about whether a particular call
+ /// site modifies or reads the specified memory location.
+ ModRefResult callCapturesBefore(const Instruction *I,
+ const AliasAnalysis::Location &MemLoc,
+ DominatorTree *DT);
+
+ /// callCapturesBefore - A convenience wrapper.
+ ModRefResult callCapturesBefore(const Instruction *I, const Value *P,
+ uint64_t Size, DominatorTree *DT) {
+ return callCapturesBefore(I, Location(P, Size), DT);
+ }
+
//===--------------------------------------------------------------------===//
/// Higher level methods for querying mod/ref information.
///
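A sketch of the convenience wrapper in use; AA, Call, Ptr, Size and DT are all assumed to be in scope:

llvm::AliasAnalysis::ModRefResult MR =
    AA.callCapturesBefore(Call, Ptr, Size, &DT);
if (MR == llvm::AliasAnalysis::NoModRef) {
  // The call neither reads nor writes the location.
}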
diff --git a/include/llvm/Analysis/BlockFrequencyImpl.h b/include/llvm/Analysis/BlockFrequencyImpl.h
index 6f2ccfb..5168ab7 100644
--- a/include/llvm/Analysis/BlockFrequencyImpl.h
+++ b/include/llvm/Analysis/BlockFrequencyImpl.h
@@ -217,7 +217,7 @@ class BlockFrequencyImpl {
divBlockFreq(BB, BranchProbability(Numerator, EntryFreq));
}
- /// doLoop - Propagate block frequency down throught the loop.
+ /// doLoop - Propagate block frequency down through the loop.
void doLoop(BlockT *Head, BlockT *Tail) {
DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", "
<< getBlockName(Tail) << ")\n");
diff --git a/include/llvm/Analysis/CodeMetrics.h b/include/llvm/Analysis/CodeMetrics.h
index 7116078..03c807c 100644
--- a/include/llvm/Analysis/CodeMetrics.h
+++ b/include/llvm/Analysis/CodeMetrics.h
@@ -16,6 +16,7 @@
#define LLVM_ANALYSIS_CODEMETRICS_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/CallSite.h"
namespace llvm {
class BasicBlock;
@@ -29,10 +30,11 @@ namespace llvm {
/// \brief Check whether a call will lower to something small.
///
- /// This tests checks whether calls to this function will lower to something
+  /// This test checks whether this callsite will lower to something
/// significantly cheaper than a traditional call, often a single
- /// instruction.
- bool callIsSmall(const Function *F);
+  /// instruction. Note that if isInstructionFree(CS.getInstruction()) returns
+  /// true, so will this function.
+ bool callIsSmall(ImmutableCallSite CS);
/// \brief Utility to calculate the size and a few similar metrics for a set
/// of basic blocks.
diff --git a/include/llvm/Analysis/Dominators.h b/include/llvm/Analysis/Dominators.h
index 6e8e4246..1289edd 100644
--- a/include/llvm/Analysis/Dominators.h
+++ b/include/llvm/Analysis/Dominators.h
@@ -152,7 +152,7 @@ EXTERN_TEMPLATE_INSTANTIATION(class DomTreeNodeBase<BasicBlock>);
EXTERN_TEMPLATE_INSTANTIATION(class DomTreeNodeBase<MachineBasicBlock>);
template<class NodeT>
-static raw_ostream &operator<<(raw_ostream &o,
+inline raw_ostream &operator<<(raw_ostream &o,
const DomTreeNodeBase<NodeT> *Node) {
if (Node->getBlock())
WriteAsOperand(o, Node->getBlock(), false);
@@ -165,7 +165,7 @@ static raw_ostream &operator<<(raw_ostream &o,
}
template<class NodeT>
-static void PrintDomTree(const DomTreeNodeBase<NodeT> *N, raw_ostream &o,
+inline void PrintDomTree(const DomTreeNodeBase<NodeT> *N, raw_ostream &o,
unsigned Lev) {
o.indent(2*Lev) << "[" << Lev << "] " << N;
for (typename DomTreeNodeBase<NodeT>::const_iterator I = N->begin(),
@@ -705,6 +705,8 @@ DominatorTreeBase<NodeT>::properlyDominates(const NodeT *A, const NodeT *B) {
EXTERN_TEMPLATE_INSTANTIATION(class DominatorTreeBase<BasicBlock>);
+class BasicBlockEdge;
+
//===-------------------------------------
/// DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to
/// compute a normal dominator tree.
@@ -778,6 +780,8 @@ public:
bool dominates(const Instruction *Def, const Use &U) const;
bool dominates(const Instruction *Def, const Instruction *User) const;
bool dominates(const Instruction *Def, const BasicBlock *BB) const;
+ bool dominates(const BasicBlockEdge &BBE, const Use &U) const;
+ bool dominates(const BasicBlockEdge &BBE, const BasicBlock *BB) const;
bool properlyDominates(const DomTreeNode *A, const DomTreeNode *B) const {
return DT->properlyDominates(A, B);
diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index 691c2d1..0cba135 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h
@@ -127,10 +127,6 @@ namespace llvm {
// adding a replacement API.
InlineCost getInlineCost(CallSite CS, Function *Callee, int Threshold);
};
-
- /// callIsSmall - If a call is likely to lower to a single target instruction,
- /// or is otherwise deemed small return true.
- bool callIsSmall(const Function *Callee);
}
#endif
diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h
index 91feaaa..eeb482d 100644
--- a/include/llvm/Analysis/LoopInfo.h
+++ b/include/llvm/Analysis/LoopInfo.h
@@ -46,7 +46,7 @@
namespace llvm {
template<typename T>
-static void RemoveFromVector(std::vector<T*> &V, T *N) {
+inline void RemoveFromVector(std::vector<T*> &V, T *N) {
typename std::vector<T*>::iterator I = std::find(V.begin(), V.end(), N);
assert(I != V.end() && "N is not in this list!");
V.erase(I);
@@ -97,6 +97,9 @@ public:
BlockT *getHeader() const { return Blocks.front(); }
LoopT *getParentLoop() const { return ParentLoop; }
+ /// setParentLoop is a raw interface for bypassing addChildLoop.
+ void setParentLoop(LoopT *L) { ParentLoop = L; }
+
   /// contains - Return true if the specified loop is contained within
   /// this loop.
///
@@ -122,14 +125,20 @@ public:
/// iterator/begin/end - Return the loops contained entirely within this loop.
///
const std::vector<LoopT *> &getSubLoops() const { return SubLoops; }
+ std::vector<LoopT *> &getSubLoopsVector() { return SubLoops; }
typedef typename std::vector<LoopT *>::const_iterator iterator;
+ typedef typename std::vector<LoopT *>::const_reverse_iterator
+ reverse_iterator;
iterator begin() const { return SubLoops.begin(); }
iterator end() const { return SubLoops.end(); }
+ reverse_iterator rbegin() const { return SubLoops.rbegin(); }
+ reverse_iterator rend() const { return SubLoops.rend(); }
bool empty() const { return SubLoops.empty(); }
/// getBlocks - Get a list of the basic blocks which make up this loop.
///
const std::vector<BlockT*> &getBlocks() const { return Blocks; }
+ std::vector<BlockT*> &getBlocksVector() { return Blocks; }
typedef typename std::vector<BlockT*>::const_iterator block_iterator;
block_iterator block_begin() const { return Blocks.begin(); }
block_iterator block_end() const { return Blocks.end(); }
@@ -181,83 +190,26 @@ public:
/// outside of the loop. These are the blocks _inside of the current loop_
/// which branch out. The returned list is always unique.
///
- void getExitingBlocks(SmallVectorImpl<BlockT *> &ExitingBlocks) const {
- // Sort the blocks vector so that we can use binary search to do quick
- // lookups.
- SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
- std::sort(LoopBBs.begin(), LoopBBs.end());
-
- typedef GraphTraits<BlockT*> BlockTraits;
- for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
- for (typename BlockTraits::ChildIteratorType I =
- BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
- I != E; ++I)
- if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I)) {
- // Not in current loop? It must be an exit block.
- ExitingBlocks.push_back(*BI);
- break;
- }
- }
+ void getExitingBlocks(SmallVectorImpl<BlockT *> &ExitingBlocks) const;
/// getExitingBlock - If getExitingBlocks would return exactly one block,
/// return that block. Otherwise return null.
- BlockT *getExitingBlock() const {
- SmallVector<BlockT*, 8> ExitingBlocks;
- getExitingBlocks(ExitingBlocks);
- if (ExitingBlocks.size() == 1)
- return ExitingBlocks[0];
- return 0;
- }
+ BlockT *getExitingBlock() const;
/// getExitBlocks - Return all of the successor blocks of this loop. These
/// are the blocks _outside of the current loop_ which are branched to.
///
- void getExitBlocks(SmallVectorImpl<BlockT*> &ExitBlocks) const {
- // Sort the blocks vector so that we can use binary search to do quick
- // lookups.
- SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
- std::sort(LoopBBs.begin(), LoopBBs.end());
-
- typedef GraphTraits<BlockT*> BlockTraits;
- for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
- for (typename BlockTraits::ChildIteratorType I =
- BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
- I != E; ++I)
- if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
- // Not in current loop? It must be an exit block.
- ExitBlocks.push_back(*I);
- }
+ void getExitBlocks(SmallVectorImpl<BlockT*> &ExitBlocks) const;
/// getExitBlock - If getExitBlocks would return exactly one block,
/// return that block. Otherwise return null.
- BlockT *getExitBlock() const {
- SmallVector<BlockT*, 8> ExitBlocks;
- getExitBlocks(ExitBlocks);
- if (ExitBlocks.size() == 1)
- return ExitBlocks[0];
- return 0;
- }
+ BlockT *getExitBlock() const;
/// Edge type.
- typedef std::pair<BlockT*, BlockT*> Edge;
+ typedef std::pair<const BlockT*, const BlockT*> Edge;
/// getExitEdges - Return all pairs of (_inside_block_,_outside_block_).
- template <typename EdgeT>
- void getExitEdges(SmallVectorImpl<EdgeT> &ExitEdges) const {
- // Sort the blocks vector so that we can use binary search to do quick
- // lookups.
- SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
- array_pod_sort(LoopBBs.begin(), LoopBBs.end());
-
- typedef GraphTraits<BlockT*> BlockTraits;
- for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
- for (typename BlockTraits::ChildIteratorType I =
- BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
- I != E; ++I)
- if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
- // Not in current loop? It must be an exit block.
- ExitEdges.push_back(EdgeT(*BI, *I));
- }
+ void getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const;
/// getLoopPreheader - If there is a preheader for this loop, return it. A
/// loop has a preheader if there is only one edge to the header of the loop
@@ -266,71 +218,18 @@ public:
///
/// This method returns null if there is no preheader for the loop.
///
- BlockT *getLoopPreheader() const {
- // Keep track of nodes outside the loop branching to the header...
- BlockT *Out = getLoopPredecessor();
- if (!Out) return 0;
-
- // Make sure there is only one exit out of the preheader.
- typedef GraphTraits<BlockT*> BlockTraits;
- typename BlockTraits::ChildIteratorType SI = BlockTraits::child_begin(Out);
- ++SI;
- if (SI != BlockTraits::child_end(Out))
- return 0; // Multiple exits from the block, must not be a preheader.
-
- // The predecessor has exactly one successor, so it is a preheader.
- return Out;
- }
+ BlockT *getLoopPreheader() const;
/// getLoopPredecessor - If the given loop's header has exactly one unique
/// predecessor outside the loop, return it. Otherwise return null.
   /// This is less strict than the loop "preheader" concept, which requires
/// the predecessor to have exactly one successor.
///
- BlockT *getLoopPredecessor() const {
- // Keep track of nodes outside the loop branching to the header...
- BlockT *Out = 0;
-
- // Loop over the predecessors of the header node...
- BlockT *Header = getHeader();
- typedef GraphTraits<BlockT*> BlockTraits;
- typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
- for (typename InvBlockTraits::ChildIteratorType PI =
- InvBlockTraits::child_begin(Header),
- PE = InvBlockTraits::child_end(Header); PI != PE; ++PI) {
- typename InvBlockTraits::NodeType *N = *PI;
- if (!contains(N)) { // If the block is not in the loop...
- if (Out && Out != N)
- return 0; // Multiple predecessors outside the loop
- Out = N;
- }
- }
-
- // Make sure there is only one exit out of the preheader.
- assert(Out && "Header of loop has no predecessors from outside loop?");
- return Out;
- }
+ BlockT *getLoopPredecessor() const;
/// getLoopLatch - If there is a single latch block for this loop, return it.
/// A latch block is a block that contains a branch back to the header.
- BlockT *getLoopLatch() const {
- BlockT *Header = getHeader();
- typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
- typename InvBlockTraits::ChildIteratorType PI =
- InvBlockTraits::child_begin(Header);
- typename InvBlockTraits::ChildIteratorType PE =
- InvBlockTraits::child_end(Header);
- BlockT *Latch = 0;
- for (; PI != PE; ++PI) {
- typename InvBlockTraits::NodeType *N = *PI;
- if (contains(N)) {
- if (Latch) return 0;
- Latch = N;
- }
- }
-
- return Latch;
- }
+ BlockT *getLoopLatch() const;
//===--------------------------------------------------------------------===//
// APIs for updating loop information after changing the CFG
@@ -348,17 +247,7 @@ public:
/// the OldChild entry in our children list with NewChild, and updates the
/// parent pointer of OldChild to be null and the NewChild to be this loop.
/// This updates the loop depth of the new child.
- void replaceChildLoopWith(LoopT *OldChild,
- LoopT *NewChild) {
- assert(OldChild->ParentLoop == this && "This loop is already broken!");
- assert(NewChild->ParentLoop == 0 && "NewChild already has a parent!");
- typename std::vector<LoopT *>::iterator I =
- std::find(SubLoops.begin(), SubLoops.end(), OldChild);
- assert(I != SubLoops.end() && "OldChild not in loop!");
- *I = NewChild;
- OldChild->ParentLoop = 0;
- NewChild->ParentLoop = static_cast<LoopT *>(this);
- }
+ void replaceChildLoopWith(LoopT *OldChild, LoopT *NewChild);
/// addChildLoop - Add the specified loop to be a child of this loop. This
/// updates the loop depth of the new child.
@@ -411,121 +300,12 @@ public:
}
/// verifyLoop - Verify loop structure
- void verifyLoop() const {
-#ifndef NDEBUG
- assert(!Blocks.empty() && "Loop header is missing");
-
- // Setup for using a depth-first iterator to visit every block in the loop.
- SmallVector<BlockT*, 8> ExitBBs;
- getExitBlocks(ExitBBs);
- llvm::SmallPtrSet<BlockT*, 8> VisitSet;
- VisitSet.insert(ExitBBs.begin(), ExitBBs.end());
- df_ext_iterator<BlockT*, llvm::SmallPtrSet<BlockT*, 8> >
- BI = df_ext_begin(getHeader(), VisitSet),
- BE = df_ext_end(getHeader(), VisitSet);
-
- // Keep track of the number of BBs visited.
- unsigned NumVisited = 0;
-
- // Sort the blocks vector so that we can use binary search to do quick
- // lookups.
- SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
- std::sort(LoopBBs.begin(), LoopBBs.end());
-
- // Check the individual blocks.
- for ( ; BI != BE; ++BI) {
- BlockT *BB = *BI;
- bool HasInsideLoopSuccs = false;
- bool HasInsideLoopPreds = false;
- SmallVector<BlockT *, 2> OutsideLoopPreds;
-
- typedef GraphTraits<BlockT*> BlockTraits;
- for (typename BlockTraits::ChildIteratorType SI =
- BlockTraits::child_begin(BB), SE = BlockTraits::child_end(BB);
- SI != SE; ++SI)
- if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *SI)) {
- HasInsideLoopSuccs = true;
- break;
- }
- typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
- for (typename InvBlockTraits::ChildIteratorType PI =
- InvBlockTraits::child_begin(BB), PE = InvBlockTraits::child_end(BB);
- PI != PE; ++PI) {
- BlockT *N = *PI;
- if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), N))
- HasInsideLoopPreds = true;
- else
- OutsideLoopPreds.push_back(N);
- }
-
- if (BB == getHeader()) {
- assert(!OutsideLoopPreds.empty() && "Loop is unreachable!");
- } else if (!OutsideLoopPreds.empty()) {
- // A non-header loop shouldn't be reachable from outside the loop,
- // though it is permitted if the predecessor is not itself actually
- // reachable.
- BlockT *EntryBB = BB->getParent()->begin();
- for (df_iterator<BlockT *> NI = df_begin(EntryBB),
- NE = df_end(EntryBB); NI != NE; ++NI)
- for (unsigned i = 0, e = OutsideLoopPreds.size(); i != e; ++i)
- assert(*NI != OutsideLoopPreds[i] &&
- "Loop has multiple entry points!");
- }
- assert(HasInsideLoopPreds && "Loop block has no in-loop predecessors!");
- assert(HasInsideLoopSuccs && "Loop block has no in-loop successors!");
- assert(BB != getHeader()->getParent()->begin() &&
- "Loop contains function entry block!");
-
- NumVisited++;
- }
-
- assert(NumVisited == getNumBlocks() && "Unreachable block in loop");
-
- // Check the subloops.
- for (iterator I = begin(), E = end(); I != E; ++I)
- // Each block in each subloop should be contained within this loop.
- for (block_iterator BI = (*I)->block_begin(), BE = (*I)->block_end();
- BI != BE; ++BI) {
- assert(std::binary_search(LoopBBs.begin(), LoopBBs.end(), *BI) &&
- "Loop does not contain all the blocks of a subloop!");
- }
-
- // Check the parent loop pointer.
- if (ParentLoop) {
- assert(std::find(ParentLoop->begin(), ParentLoop->end(), this) !=
- ParentLoop->end() &&
- "Loop is not a subloop of its parent!");
- }
-#endif
- }
+ void verifyLoop() const;
/// verifyLoop - Verify loop structure of this loop and all nested loops.
- void verifyLoopNest(DenseSet<const LoopT*> *Loops) const {
- Loops->insert(static_cast<const LoopT *>(this));
- // Verify this loop.
- verifyLoop();
- // Verify the subloops.
- for (iterator I = begin(), E = end(); I != E; ++I)
- (*I)->verifyLoopNest(Loops);
- }
+ void verifyLoopNest(DenseSet<const LoopT*> *Loops) const;
- void print(raw_ostream &OS, unsigned Depth = 0) const {
- OS.indent(Depth*2) << "Loop at depth " << getLoopDepth()
- << " containing: ";
-
- for (unsigned i = 0; i < getBlocks().size(); ++i) {
- if (i) OS << ",";
- BlockT *BB = getBlocks()[i];
- WriteAsOperand(OS, BB, false);
- if (BB == getHeader()) OS << "<header>";
- if (BB == getLoopLatch()) OS << "<latch>";
- if (isLoopExiting(BB)) OS << "<exiting>";
- }
- OS << "\n";
-
- for (iterator I = begin(), E = end(); I != E; ++I)
- (*I)->print(OS, Depth+2);
- }
+ void print(raw_ostream &OS, unsigned Depth = 0) const;
protected:
friend class LoopInfoBase<BlockT, LoopT>;
@@ -540,6 +320,11 @@ raw_ostream& operator<<(raw_ostream &OS, const LoopBase<BlockT, LoopT> &Loop) {
return OS;
}
+// Implementation in LoopInfoImpl.h
+#ifdef __GNUC__
+__extension__ extern template class LoopBase<BasicBlock, Loop>;
+#endif
+
class Loop : public LoopBase<BasicBlock, Loop> {
public:
Loop() {}
@@ -650,8 +435,12 @@ public:
/// function.
///
typedef typename std::vector<LoopT *>::const_iterator iterator;
+ typedef typename std::vector<LoopT *>::const_reverse_iterator
+ reverse_iterator;
iterator begin() const { return TopLevelLoops.begin(); }
iterator end() const { return TopLevelLoops.end(); }
+ reverse_iterator rbegin() const { return TopLevelLoops.rbegin(); }
+ reverse_iterator rend() const { return TopLevelLoops.rend(); }
bool empty() const { return TopLevelLoops.empty(); }
/// getLoopFor - Return the inner most loop that BB lives in. If a basic
@@ -744,189 +533,19 @@ public:
return isNotAlreadyContainedIn(SubLoop->getParentLoop(), ParentLoop);
}
- void Calculate(DominatorTreeBase<BlockT> &DT) {
- BlockT *RootNode = DT.getRootNode()->getBlock();
-
- for (df_iterator<BlockT*> NI = df_begin(RootNode),
- NE = df_end(RootNode); NI != NE; ++NI)
- if (LoopT *L = ConsiderForLoop(*NI, DT))
- TopLevelLoops.push_back(L);
- }
-
- LoopT *ConsiderForLoop(BlockT *BB, DominatorTreeBase<BlockT> &DT) {
- if (BBMap.count(BB)) return 0; // Haven't processed this node?
-
- std::vector<BlockT *> TodoStack;
-
- // Scan the predecessors of BB, checking to see if BB dominates any of
- // them. This identifies backedges which target this node...
- typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
- for (typename InvBlockTraits::ChildIteratorType I =
- InvBlockTraits::child_begin(BB), E = InvBlockTraits::child_end(BB);
- I != E; ++I) {
- typename InvBlockTraits::NodeType *N = *I;
- // If BB dominates its predecessor...
- if (DT.dominates(BB, N) && DT.isReachableFromEntry(N))
- TodoStack.push_back(N);
- }
-
- if (TodoStack.empty()) return 0; // No backedges to this block...
-
- // Create a new loop to represent this basic block...
- LoopT *L = new LoopT(BB);
- BBMap[BB] = L;
-
- while (!TodoStack.empty()) { // Process all the nodes in the loop
- BlockT *X = TodoStack.back();
- TodoStack.pop_back();
-
- if (!L->contains(X) && // As of yet unprocessed??
- DT.isReachableFromEntry(X)) {
- // Check to see if this block already belongs to a loop. If this occurs
- // then we have a case where a loop that is supposed to be a child of
- // the current loop was processed before the current loop. When this
- // occurs, this child loop gets added to a part of the current loop,
- // making it a sibling to the current loop. We have to reparent this
- // loop.
- if (LoopT *SubLoop =
- const_cast<LoopT *>(getLoopFor(X)))
- if (SubLoop->getHeader() == X && isNotAlreadyContainedIn(SubLoop, L)){
- // Remove the subloop from its current parent...
- assert(SubLoop->ParentLoop && SubLoop->ParentLoop != L);
- LoopT *SLP = SubLoop->ParentLoop; // SubLoopParent
- typename std::vector<LoopT *>::iterator I =
- std::find(SLP->SubLoops.begin(), SLP->SubLoops.end(), SubLoop);
- assert(I != SLP->SubLoops.end() &&"SubLoop not a child of parent?");
- SLP->SubLoops.erase(I); // Remove from parent...
-
- // Add the subloop to THIS loop...
- SubLoop->ParentLoop = L;
- L->SubLoops.push_back(SubLoop);
- }
-
- // Normal case, add the block to our loop...
- L->Blocks.push_back(X);
-
- typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
-
- // Add all of the predecessors of X to the end of the work stack...
- TodoStack.insert(TodoStack.end(), InvBlockTraits::child_begin(X),
- InvBlockTraits::child_end(X));
- }
- }
-
- // If there are any loops nested within this loop, create them now!
- for (typename std::vector<BlockT*>::iterator I = L->Blocks.begin(),
- E = L->Blocks.end(); I != E; ++I)
- if (LoopT *NewLoop = ConsiderForLoop(*I, DT)) {
- L->SubLoops.push_back(NewLoop);
- NewLoop->ParentLoop = L;
- }
-
- // Add the basic blocks that comprise this loop to the BBMap so that this
- // loop can be found for them.
- //
- for (typename std::vector<BlockT*>::iterator I = L->Blocks.begin(),
- E = L->Blocks.end(); I != E; ++I)
- BBMap.insert(std::make_pair(*I, L));
-
- // Now that we have a list of all of the child loops of this loop, check to
- // see if any of them should actually be nested inside of each other. We
- // can accidentally pull loops our of their parents, so we must make sure to
- // organize the loop nests correctly now.
- {
- std::map<BlockT *, LoopT *> ContainingLoops;
- for (unsigned i = 0; i != L->SubLoops.size(); ++i) {
- LoopT *Child = L->SubLoops[i];
- assert(Child->getParentLoop() == L && "Not proper child loop?");
-
- if (LoopT *ContainingLoop = ContainingLoops[Child->getHeader()]) {
- // If there is already a loop which contains this loop, move this loop
- // into the containing loop.
- MoveSiblingLoopInto(Child, ContainingLoop);
- --i; // The loop got removed from the SubLoops list.
- } else {
- // This is currently considered to be a top-level loop. Check to see
- // if any of the contained blocks are loop headers for subloops we
- // have already processed.
- for (unsigned b = 0, e = Child->Blocks.size(); b != e; ++b) {
- LoopT *&BlockLoop = ContainingLoops[Child->Blocks[b]];
- if (BlockLoop == 0) { // Child block not processed yet...
- BlockLoop = Child;
- } else if (BlockLoop != Child) {
- LoopT *SubLoop = BlockLoop;
- // Reparent all of the blocks which used to belong to BlockLoops
- for (unsigned j = 0, f = SubLoop->Blocks.size(); j != f; ++j)
- ContainingLoops[SubLoop->Blocks[j]] = Child;
-
- // There is already a loop which contains this block, that means
- // that we should reparent the loop which the block is currently
- // considered to belong to to be a child of this loop.
- MoveSiblingLoopInto(SubLoop, Child);
- --i; // We just shrunk the SubLoops list.
- }
- }
- }
- }
- }
-
- return L;
- }
-
- /// MoveSiblingLoopInto - This method moves the NewChild loop to live inside
- /// of the NewParent Loop, instead of being a sibling of it.
- void MoveSiblingLoopInto(LoopT *NewChild,
- LoopT *NewParent) {
- LoopT *OldParent = NewChild->getParentLoop();
- assert(OldParent && OldParent == NewParent->getParentLoop() &&
- NewChild != NewParent && "Not sibling loops!");
-
- // Remove NewChild from being a child of OldParent
- typename std::vector<LoopT *>::iterator I =
- std::find(OldParent->SubLoops.begin(), OldParent->SubLoops.end(),
- NewChild);
- assert(I != OldParent->SubLoops.end() && "Parent fields incorrect??");
- OldParent->SubLoops.erase(I); // Remove from parent's subloops list
- NewChild->ParentLoop = 0;
-
- InsertLoopInto(NewChild, NewParent);
- }
-
- /// InsertLoopInto - This inserts loop L into the specified parent loop. If
- /// the parent loop contains a loop which should contain L, the loop gets
- /// inserted into L instead.
- void InsertLoopInto(LoopT *L, LoopT *Parent) {
- BlockT *LHeader = L->getHeader();
- assert(Parent->contains(LHeader) &&
- "This loop should not be inserted here!");
-
- // Check to see if it belongs in a child loop...
- for (unsigned i = 0, e = static_cast<unsigned>(Parent->SubLoops.size());
- i != e; ++i)
- if (Parent->SubLoops[i]->contains(LHeader)) {
- InsertLoopInto(L, Parent->SubLoops[i]);
- return;
- }
-
- // If not, insert it here!
- Parent->SubLoops.push_back(L);
- L->ParentLoop = Parent;
- }
+ /// Create the loop forest using a stable algorithm.
+ void Analyze(DominatorTreeBase<BlockT> &DomTree);
// Debugging
- void print(raw_ostream &OS) const {
- for (unsigned i = 0; i < TopLevelLoops.size(); ++i)
- TopLevelLoops[i]->print(OS);
- #if 0
- for (DenseMap<BasicBlock*, LoopT*>::const_iterator I = BBMap.begin(),
- E = BBMap.end(); I != E; ++I)
- OS << "BB '" << I->first->getName() << "' level = "
- << I->second->getLoopDepth() << "\n";
- #endif
- }
+ void print(raw_ostream &OS) const;
};
+// Implementation in LoopInfoImpl.h
+#ifdef __GNUC__
+__extension__ extern template class LoopInfoBase<BasicBlock, Loop>;
+#endif
+
class LoopInfo : public FunctionPass {
LoopInfoBase<BasicBlock, Loop> LI;
friend class LoopBase<BasicBlock, Loop>;
@@ -946,8 +565,11 @@ public:
/// function.
///
typedef LoopInfoBase<BasicBlock, Loop>::iterator iterator;
+ typedef LoopInfoBase<BasicBlock, Loop>::reverse_iterator reverse_iterator;
inline iterator begin() const { return LI.begin(); }
inline iterator end() const { return LI.end(); }
+ inline reverse_iterator rbegin() const { return LI.rbegin(); }
+ inline reverse_iterator rend() const { return LI.rend(); }
bool empty() const { return LI.empty(); }
/// getLoopFor - Return the inner most loop that BB lives in. If a basic
@@ -1074,27 +696,6 @@ template <> struct GraphTraits<Loop*> {
}
};
-template<class BlockT, class LoopT>
-void
-LoopBase<BlockT, LoopT>::addBasicBlockToLoop(BlockT *NewBB,
- LoopInfoBase<BlockT, LoopT> &LIB) {
- assert((Blocks.empty() || LIB[getHeader()] == this) &&
- "Incorrect LI specified for this loop!");
- assert(NewBB && "Cannot add a null basic block to the loop!");
- assert(LIB[NewBB] == 0 && "BasicBlock already in the loop!");
-
- LoopT *L = static_cast<LoopT *>(this);
-
- // Add the loop mapping to the LoopInfo object...
- LIB.BBMap[NewBB] = L;
-
- // Add the basic block to this loop and all parent loops...
- while (L) {
- L->Blocks.push_back(NewBB);
- L = L->getParentLoop();
- }
-}
-
} // End llvm namespace
#endif
diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h
new file mode 100644
index 0000000..c07fbf7
--- /dev/null
+++ b/include/llvm/Analysis/LoopInfoImpl.h
@@ -0,0 +1,570 @@
+//===- llvm/Analysis/LoopInfoImpl.h - Natural Loop Calculator ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the generic implementation of LoopInfo used for both Loops and
+// MachineLoops.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_LOOP_INFO_IMPL_H
+#define LLVM_ANALYSIS_LOOP_INFO_IMPL_H
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// APIs for simple analysis of the loop. See header notes.
+
+/// getExitingBlocks - Return all blocks inside the loop that have successors
+/// outside of the loop. These are the blocks _inside of the current loop_
+/// which branch out. The returned list is always unique.
+///
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::
+getExitingBlocks(SmallVectorImpl<BlockT *> &ExitingBlocks) const {
+ // Sort the blocks vector so that we can use binary search to do quick
+ // lookups.
+ SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
+ std::sort(LoopBBs.begin(), LoopBBs.end());
+
+ typedef GraphTraits<BlockT*> BlockTraits;
+ for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
+ for (typename BlockTraits::ChildIteratorType I =
+ BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
+ I != E; ++I)
+ if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I)) {
+ // Not in current loop? It must be an exit block.
+ ExitingBlocks.push_back(*BI);
+ break;
+ }
+}
+
+/// getExitingBlock - If getExitingBlocks would return exactly one block,
+/// return that block. Otherwise return null.
+template<class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getExitingBlock() const {
+ SmallVector<BlockT*, 8> ExitingBlocks;
+ getExitingBlocks(ExitingBlocks);
+ if (ExitingBlocks.size() == 1)
+ return ExitingBlocks[0];
+ return 0;
+}
+
+/// getExitBlocks - Return all of the successor blocks of this loop. These
+/// are the blocks _outside of the current loop_ which are branched to.
+///
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::
+getExitBlocks(SmallVectorImpl<BlockT*> &ExitBlocks) const {
+ // Sort the blocks vector so that we can use binary search to do quick
+ // lookups.
+ SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
+ std::sort(LoopBBs.begin(), LoopBBs.end());
+
+ typedef GraphTraits<BlockT*> BlockTraits;
+ for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
+ for (typename BlockTraits::ChildIteratorType I =
+ BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
+ I != E; ++I)
+ if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+ // Not in current loop? It must be an exit block.
+ ExitBlocks.push_back(*I);
+}
+
+/// getExitBlock - If getExitBlocks would return exactly one block,
+/// return that block. Otherwise return null.
+template<class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getExitBlock() const {
+ SmallVector<BlockT*, 8> ExitBlocks;
+ getExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() == 1)
+ return ExitBlocks[0];
+ return 0;
+}
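
A minimal usage sketch of the exit queries above; this is illustrative only, not part of the patch, and assumes an in-scope Loop *L with the usual LoopInfo.h includes:

    static bool hasSingleExit(Loop *L) {
      SmallVector<BasicBlock*, 8> ExitingBlocks;
      L->getExitingBlocks(ExitingBlocks); // blocks inside L that branch out
      return L->getExitBlock() != 0;      // exactly one block outside L is reached
    }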
+
+/// getExitEdges - Return all pairs of (_inside_block_,_outside_block_).
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::
+getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const {
+ // Sort the blocks vector so that we can use binary search to do quick
+ // lookups.
+ SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
+ array_pod_sort(LoopBBs.begin(), LoopBBs.end());
+
+ typedef GraphTraits<BlockT*> BlockTraits;
+ for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
+ for (typename BlockTraits::ChildIteratorType I =
+ BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
+ I != E; ++I)
+ if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+ // Not in current loop? It must be an exit block.
+ ExitEdges.push_back(Edge(*BI, *I));
+}
+
+/// getLoopPreheader - If there is a preheader for this loop, return it. A
+/// loop has a preheader if there is only one edge to the header of the loop
+/// from outside of the loop. If this is the case, the block branching to the
+/// header of the loop is the preheader node.
+///
+/// This method returns null if there is no preheader for the loop.
+///
+template<class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getLoopPreheader() const {
+ // Keep track of nodes outside the loop branching to the header...
+ BlockT *Out = getLoopPredecessor();
+ if (!Out) return 0;
+
+ // Make sure there is only one exit out of the preheader.
+ typedef GraphTraits<BlockT*> BlockTraits;
+ typename BlockTraits::ChildIteratorType SI = BlockTraits::child_begin(Out);
+ ++SI;
+ if (SI != BlockTraits::child_end(Out))
+ return 0; // Multiple exits from the block, must not be a preheader.
+
+ // The predecessor has exactly one successor, so it is a preheader.
+ return Out;
+}
+
+/// getLoopPredecessor - If the given loop's header has exactly one unique
+/// predecessor outside the loop, return it. Otherwise return null.
+/// This is less strict than the loop "preheader" concept, which requires
+/// the predecessor to have exactly one successor.
+///
+template<class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getLoopPredecessor() const {
+ // Keep track of nodes outside the loop branching to the header...
+ BlockT *Out = 0;
+
+ // Loop over the predecessors of the header node...
+ BlockT *Header = getHeader();
+ typedef GraphTraits<BlockT*> BlockTraits;
+ typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
+ for (typename InvBlockTraits::ChildIteratorType PI =
+ InvBlockTraits::child_begin(Header),
+ PE = InvBlockTraits::child_end(Header); PI != PE; ++PI) {
+ typename InvBlockTraits::NodeType *N = *PI;
+ if (!contains(N)) { // If the block is not in the loop...
+ if (Out && Out != N)
+ return 0; // Multiple predecessors outside the loop
+ Out = N;
+ }
+ }
+
+ // The header should have at least one predecessor from outside the loop.
+ assert(Out && "Header of loop has no predecessors from outside loop?");
+ return Out;
+}
+
+/// getLoopLatch - If there is a single latch block for this loop, return it.
+/// A latch block is a block that contains a branch back to the header.
+template<class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getLoopLatch() const {
+ BlockT *Header = getHeader();
+ typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
+ typename InvBlockTraits::ChildIteratorType PI =
+ InvBlockTraits::child_begin(Header);
+ typename InvBlockTraits::ChildIteratorType PE =
+ InvBlockTraits::child_end(Header);
+ BlockT *Latch = 0;
+ for (; PI != PE; ++PI) {
+ typename InvBlockTraits::NodeType *N = *PI;
+ if (contains(N)) {
+ if (Latch) return 0;
+ Latch = N;
+ }
+ }
+
+ return Latch;
+}
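
A sketch of how a transform might consult these structural queries together; illustrative only, with a Loop *L assumed in scope:

    static bool isInSimplifiedForm(Loop *L) {
      // getLoopPreheader() is stricter than getLoopPredecessor(): the unique
      // outside predecessor must also have a single successor.
      if (!L->getLoopPreheader())
        return false;
      // A unique latch means exactly one in-loop edge back to the header.
      return L->getLoopLatch() != 0;
    }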
+
+//===----------------------------------------------------------------------===//
+// APIs for updating loop information after changing the CFG
+//
+
+/// addBasicBlockToLoop - This method is used by other analyses to update loop
+/// information. NewBB is set to be a new member of the current loop.
+/// Because of this, it is added as a member of all parent loops, and is
+/// recorded in the specified LoopInfo object as belonging to this loop. It
+/// is not valid to replace the loop header with this method.
+///
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::
+addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase<BlockT, LoopT> &LIB) {
+ assert((Blocks.empty() || LIB[getHeader()] == this) &&
+ "Incorrect LI specified for this loop!");
+ assert(NewBB && "Cannot add a null basic block to the loop!");
+ assert(LIB[NewBB] == 0 && "BasicBlock already in the loop!");
+
+ LoopT *L = static_cast<LoopT *>(this);
+
+ // Add the loop mapping to the LoopInfo object...
+ LIB.BBMap[NewBB] = L;
+
+ // Add the basic block to this loop and all parent loops...
+ while (L) {
+ L->Blocks.push_back(NewBB);
+ L = L->getParentLoop();
+ }
+}
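
An illustrative update sequence, a sketch rather than anything this patch prescribes: after creating a block inside a loop's CFG (say, while splitting an edge), the analysis is kept consistent like so:

    static void registerNewBlock(Loop *L, BasicBlock *NewBB,
                                 LoopInfoBase<BasicBlock, Loop> &LIB) {
      // Maps NewBB in LIB and appends it to L and every enclosing loop.
      L->addBasicBlockToLoop(NewBB, LIB);
    }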
+
+/// replaceChildLoopWith - This is used when splitting loops up. It replaces
+/// the OldChild entry in our children list with NewChild, and updates the
+/// parent pointer of OldChild to be null and the NewChild to be this loop.
+/// This updates the loop depth of the new child.
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::
+replaceChildLoopWith(LoopT *OldChild, LoopT *NewChild) {
+ assert(OldChild->ParentLoop == this && "This loop is already broken!");
+ assert(NewChild->ParentLoop == 0 && "NewChild already has a parent!");
+ typename std::vector<LoopT *>::iterator I =
+ std::find(SubLoops.begin(), SubLoops.end(), OldChild);
+ assert(I != SubLoops.end() && "OldChild not in loop!");
+ *I = NewChild;
+ OldChild->ParentLoop = 0;
+ NewChild->ParentLoop = static_cast<LoopT *>(this);
+}
+
+/// verifyLoop - Verify loop structure
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::verifyLoop() const {
+#ifndef NDEBUG
+ assert(!Blocks.empty() && "Loop header is missing");
+
+ // Setup for using a depth-first iterator to visit every block in the loop.
+ SmallVector<BlockT*, 8> ExitBBs;
+ getExitBlocks(ExitBBs);
+ llvm::SmallPtrSet<BlockT*, 8> VisitSet;
+ VisitSet.insert(ExitBBs.begin(), ExitBBs.end());
+ df_ext_iterator<BlockT*, llvm::SmallPtrSet<BlockT*, 8> >
+ BI = df_ext_begin(getHeader(), VisitSet),
+ BE = df_ext_end(getHeader(), VisitSet);
+
+ // Keep track of the number of BBs visited.
+ unsigned NumVisited = 0;
+
+ // Sort the blocks vector so that we can use binary search to do quick
+ // lookups.
+ SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
+ std::sort(LoopBBs.begin(), LoopBBs.end());
+
+ // Check the individual blocks.
+ for ( ; BI != BE; ++BI) {
+ BlockT *BB = *BI;
+ bool HasInsideLoopSuccs = false;
+ bool HasInsideLoopPreds = false;
+ SmallVector<BlockT *, 2> OutsideLoopPreds;
+
+ typedef GraphTraits<BlockT*> BlockTraits;
+ for (typename BlockTraits::ChildIteratorType SI =
+ BlockTraits::child_begin(BB), SE = BlockTraits::child_end(BB);
+ SI != SE; ++SI)
+ if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *SI)) {
+ HasInsideLoopSuccs = true;
+ break;
+ }
+ typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
+ for (typename InvBlockTraits::ChildIteratorType PI =
+ InvBlockTraits::child_begin(BB), PE = InvBlockTraits::child_end(BB);
+ PI != PE; ++PI) {
+ BlockT *N = *PI;
+ if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), N))
+ HasInsideLoopPreds = true;
+ else
+ OutsideLoopPreds.push_back(N);
+ }
+
+ if (BB == getHeader()) {
+ assert(!OutsideLoopPreds.empty() && "Loop is unreachable!");
+ } else if (!OutsideLoopPreds.empty()) {
+ // A non-header block of the loop shouldn't be reachable from outside the
+ // loop, though this is permitted if the predecessor is not itself actually
+ // reachable.
+ BlockT *EntryBB = BB->getParent()->begin();
+ for (df_iterator<BlockT *> NI = df_begin(EntryBB),
+ NE = df_end(EntryBB); NI != NE; ++NI)
+ for (unsigned i = 0, e = OutsideLoopPreds.size(); i != e; ++i)
+ assert(*NI != OutsideLoopPreds[i] &&
+ "Loop has multiple entry points!");
+ }
+ assert(HasInsideLoopPreds && "Loop block has no in-loop predecessors!");
+ assert(HasInsideLoopSuccs && "Loop block has no in-loop successors!");
+ assert(BB != getHeader()->getParent()->begin() &&
+ "Loop contains function entry block!");
+
+ NumVisited++;
+ }
+
+ assert(NumVisited == getNumBlocks() && "Unreachable block in loop");
+
+ // Check the subloops.
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ // Each block in each subloop should be contained within this loop.
+ for (block_iterator BI = (*I)->block_begin(), BE = (*I)->block_end();
+ BI != BE; ++BI) {
+ assert(std::binary_search(LoopBBs.begin(), LoopBBs.end(), *BI) &&
+ "Loop does not contain all the blocks of a subloop!");
+ }
+
+ // Check the parent loop pointer.
+ if (ParentLoop) {
+ assert(std::find(ParentLoop->begin(), ParentLoop->end(), this) !=
+ ParentLoop->end() &&
+ "Loop is not a subloop of its parent!");
+ }
+#endif
+}
+
+/// verifyLoopNest - Verify loop structure of this loop and all nested loops.
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::verifyLoopNest(
+ DenseSet<const LoopT*> *Loops) const {
+ Loops->insert(static_cast<const LoopT *>(this));
+ // Verify this loop.
+ verifyLoop();
+ // Verify the subloops.
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ (*I)->verifyLoopNest(Loops);
+}
+
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::print(raw_ostream &OS, unsigned Depth) const {
+ OS.indent(Depth*2) << "Loop at depth " << getLoopDepth()
+ << " containing: ";
+
+ for (unsigned i = 0; i < getBlocks().size(); ++i) {
+ if (i) OS << ",";
+ BlockT *BB = getBlocks()[i];
+ WriteAsOperand(OS, BB, false);
+ if (BB == getHeader()) OS << "<header>";
+ if (BB == getLoopLatch()) OS << "<latch>";
+ if (isLoopExiting(BB)) OS << "<exiting>";
+ }
+ OS << "\n";
+
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ (*I)->print(OS, Depth+2);
+}
+
+//===----------------------------------------------------------------------===//
+/// Stable LoopInfo Analysis - Build a loop tree using stable iterators so the
+/// result does not depend on use list (block predecessor) order.
+///
+
+/// Discover a subloop with the specified backedges such that: all blocks
+/// within this loop are mapped to this loop or a subloop, and all subloops
+/// within this loop have their parent loop set to this loop or a subloop.
+template<class BlockT, class LoopT>
+static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT*> Backedges,
+ LoopInfoBase<BlockT, LoopT> *LI,
+ DominatorTreeBase<BlockT> &DomTree) {
+ typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
+
+ unsigned NumBlocks = 0;
+ unsigned NumSubloops = 0;
+
+ // Perform a backward CFG traversal using a worklist.
+ std::vector<BlockT *> ReverseCFGWorklist(Backedges.begin(), Backedges.end());
+ while (!ReverseCFGWorklist.empty()) {
+ BlockT *PredBB = ReverseCFGWorklist.back();
+ ReverseCFGWorklist.pop_back();
+
+ LoopT *Subloop = LI->getLoopFor(PredBB);
+ if (!Subloop) {
+ if (!DomTree.isReachableFromEntry(PredBB))
+ continue;
+
+ // This is an undiscovered block. Map it to the current loop.
+ LI->changeLoopFor(PredBB, L);
+ ++NumBlocks;
+ if (PredBB == L->getHeader())
+ continue;
+ // Push all block predecessors on the worklist.
+ ReverseCFGWorklist.insert(ReverseCFGWorklist.end(),
+ InvBlockTraits::child_begin(PredBB),
+ InvBlockTraits::child_end(PredBB));
+ }
+ else {
+ // This is a discovered block. Find its outermost discovered loop.
+ while (LoopT *Parent = Subloop->getParentLoop())
+ Subloop = Parent;
+
+ // If it is already discovered to be a subloop of this loop, continue.
+ if (Subloop == L)
+ continue;
+
+ // Discover a subloop of this loop.
+ Subloop->setParentLoop(L);
+ ++NumSubloops;
+ NumBlocks += Subloop->getBlocks().capacity();
+ PredBB = Subloop->getHeader();
+ // Continue traversal along predecessors that are not loop-back edges from
+ // within this subloop tree itself. Note that a predecessor may directly
+ // reach another subloop that is not yet discovered to be a subloop of
+ // this loop, which we must traverse.
+ for (typename InvBlockTraits::ChildIteratorType PI =
+ InvBlockTraits::child_begin(PredBB),
+ PE = InvBlockTraits::child_end(PredBB); PI != PE; ++PI) {
+ if (LI->getLoopFor(*PI) != Subloop)
+ ReverseCFGWorklist.push_back(*PI);
+ }
+ }
+ }
+ L->getSubLoopsVector().reserve(NumSubloops);
+ L->getBlocksVector().reserve(NumBlocks);
+}
+
+namespace {
+/// Populate all loop data in a stable order during a single forward DFS.
+template<class BlockT, class LoopT>
+class PopulateLoopsDFS {
+ typedef GraphTraits<BlockT*> BlockTraits;
+ typedef typename BlockTraits::ChildIteratorType SuccIterTy;
+
+ LoopInfoBase<BlockT, LoopT> *LI;
+ DenseSet<const BlockT *> VisitedBlocks;
+ std::vector<std::pair<BlockT*, SuccIterTy> > DFSStack;
+
+public:
+ PopulateLoopsDFS(LoopInfoBase<BlockT, LoopT> *li):
+ LI(li) {}
+
+ void traverse(BlockT *EntryBlock);
+
+protected:
+ void insertIntoLoop(BlockT *Block);
+
+ BlockT *dfsSource() { return DFSStack.back().first; }
+ SuccIterTy &dfsSucc() { return DFSStack.back().second; }
+ SuccIterTy dfsSuccEnd() { return BlockTraits::child_end(dfsSource()); }
+
+ void pushBlock(BlockT *Block) {
+ DFSStack.push_back(std::make_pair(Block, BlockTraits::child_begin(Block)));
+ }
+};
+} // anonymous
+
+/// Top-level driver for the forward DFS within the loop.
+template<class BlockT, class LoopT>
+void PopulateLoopsDFS<BlockT, LoopT>::traverse(BlockT *EntryBlock) {
+ pushBlock(EntryBlock);
+ VisitedBlocks.insert(EntryBlock);
+ while (!DFSStack.empty()) {
+ // Traverse the leftmost path as far as possible.
+ while (dfsSucc() != dfsSuccEnd()) {
+ BlockT *BB = *dfsSucc();
+ ++dfsSucc();
+ if (!VisitedBlocks.insert(BB).second)
+ continue;
+
+ // Push the next DFS successor onto the stack.
+ pushBlock(BB);
+ }
+ // Visit the top of the stack in postorder and backtrack.
+ insertIntoLoop(dfsSource());
+ DFSStack.pop_back();
+ }
+}
+
+/// Add a single Block to its ancestor loops in PostOrder. If the block is a
+/// subloop header, add the subloop to its parent in PostOrder, then reverse the
+/// Block and Subloop vectors of the now complete subloop to achieve RPO.
+template<class BlockT, class LoopT>
+void PopulateLoopsDFS<BlockT, LoopT>::insertIntoLoop(BlockT *Block) {
+ LoopT *Subloop = LI->getLoopFor(Block);
+ if (Subloop && Block == Subloop->getHeader()) {
+ // We reach this point once per subloop after processing all the blocks in
+ // the subloop.
+ if (Subloop->getParentLoop())
+ Subloop->getParentLoop()->getSubLoopsVector().push_back(Subloop);
+ else
+ LI->addTopLevelLoop(Subloop);
+
+ // For convenience, Blocks and Subloops are inserted in postorder. Reverse
+ // the lists, except for the loop header, which is always at the beginning.
+ std::reverse(Subloop->getBlocksVector().begin()+1,
+ Subloop->getBlocksVector().end());
+ std::reverse(Subloop->getSubLoopsVector().begin(),
+ Subloop->getSubLoopsVector().end());
+
+ Subloop = Subloop->getParentLoop();
+ }
+ for (; Subloop; Subloop = Subloop->getParentLoop())
+ Subloop->getBlocksVector().push_back(Block);
+}
+
+/// Analyze LoopInfo discovers loops during a postorder DominatorTree traversal
+/// interleaved with backward CFG traversals within each subloop
+/// (discoverAndMapSubloop). The backward traversal skips inner subloops, so
+/// this part of the algorithm is linear in the number of CFG edges. Subloop and
+/// Block vectors are then populated during a single forward CFG traversal
+/// (PopulateLoopsDFS).
+///
+/// During the two CFG traversals each block is seen three times:
+/// 1) Discovered and mapped by a reverse CFG traversal.
+/// 2) Visited during a forward DFS CFG traversal.
+/// 3) Reverse-inserted in the loop in postorder following forward DFS.
+///
+/// The Block vectors are inclusive, so step 3 requires loop-depth number of
+/// insertions per block.
+template<class BlockT, class LoopT>
+void LoopInfoBase<BlockT, LoopT>::
+Analyze(DominatorTreeBase<BlockT> &DomTree) {
+
+ // Postorder traversal of the dominator tree.
+ DomTreeNodeBase<BlockT>* DomRoot = DomTree.getRootNode();
+ for (po_iterator<DomTreeNodeBase<BlockT>*> DomIter = po_begin(DomRoot),
+ DomEnd = po_end(DomRoot); DomIter != DomEnd; ++DomIter) {
+
+ BlockT *Header = DomIter->getBlock();
+ SmallVector<BlockT *, 4> Backedges;
+
+ // Check each predecessor of the potential loop header.
+ typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
+ for (typename InvBlockTraits::ChildIteratorType PI =
+ InvBlockTraits::child_begin(Header),
+ PE = InvBlockTraits::child_end(Header); PI != PE; ++PI) {
+
+ BlockT *Backedge = *PI;
+
+ // If Header dominates the predecessor, this is a backedge of a new loop; collect it.
+ if (DomTree.dominates(Header, Backedge)
+ && DomTree.isReachableFromEntry(Backedge)) {
+ Backedges.push_back(Backedge);
+ }
+ }
+ // Perform a backward CFG traversal to discover and map blocks in this loop.
+ if (!Backedges.empty()) {
+ LoopT *L = new LoopT(Header);
+ discoverAndMapSubloop(L, ArrayRef<BlockT*>(Backedges), this, DomTree);
+ }
+ }
+ // Perform a single forward CFG traversal to populate block and subloop
+ // vectors for all loops.
+ PopulateLoopsDFS<BlockT, LoopT> DFS(this);
+ DFS.traverse(DomRoot->getBlock());
+}
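
A sketch of driving Analyze() by hand, outside the LoopInfo pass; it assumes DominatorTree::getBase() exposing the underlying DominatorTreeBase<BasicBlock> and the releaseMemory() helper, both from the surrounding LLVM tree rather than this hunk:

    static void rebuildLoopForest(LoopInfoBase<BasicBlock, Loop> &LIB,
                                  DominatorTree &DT) {
      LIB.releaseMemory();        // drop any stale loop forest first
      LIB.Analyze(DT.getBase());  // rebuild it from the dominator tree
    }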
+
+// Debugging
+template<class BlockT, class LoopT>
+void LoopInfoBase<BlockT, LoopT>::print(raw_ostream &OS) const {
+ for (unsigned i = 0; i < TopLevelLoops.size(); ++i)
+ TopLevelLoops[i]->print(OS);
+#if 0
+ for (DenseMap<BasicBlock*, LoopT*>::const_iterator I = BBMap.begin(),
+ E = BBMap.end(); I != E; ++I)
+ OS << "BB '" << I->first->getName() << "' level = "
+ << I->second->getLoopDepth() << "\n";
+#endif
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/Analysis/LoopIterator.h b/include/llvm/Analysis/LoopIterator.h
index 269ac80..68f25f7 100644
--- a/include/llvm/Analysis/LoopIterator.h
+++ b/include/llvm/Analysis/LoopIterator.h
@@ -109,6 +109,16 @@ public:
}
};
+/// Specialize po_iterator_storage to record postorder numbers.
+template<> class po_iterator_storage<LoopBlocksTraversal, true> {
+ LoopBlocksTraversal &LBT;
+public:
+ po_iterator_storage(LoopBlocksTraversal &lbs) : LBT(lbs) {}
+ // These functions are defined below.
+ bool insertEdge(BasicBlock *From, BasicBlock *To);
+ void finishPostorder(BasicBlock *BB);
+};
+
/// Traverse the blocks in a loop using a depth-first search.
class LoopBlocksTraversal {
public:
@@ -155,31 +165,17 @@ public:
DFS.PostBlocks.push_back(BB);
DFS.PostNumbers[BB] = DFS.PostBlocks.size();
}
-
- //===----------------------------------------------------------------------
- // Implement part of the std::set interface for the purpose of driving the
- // generic po_iterator.
-
- /// Return true if the block is outside the loop or has already been visited.
- /// Sorry if this is counterintuitive.
- bool count(BasicBlock *BB) const {
- return !DFS.L->contains(LI->getLoopFor(BB)) || DFS.PostNumbers.count(BB);
- }
-
- /// If this block is contained in the loop and has not been visited, return
- /// true and assign a preorder number. This is a proxy for visitPreorder
- /// called by POIterator.
- bool insert(BasicBlock *BB) {
- return visitPreorder(BB);
- }
};
-/// Specialize DFSetTraits to record postorder numbers.
-template<> struct DFSetTraits<LoopBlocksTraversal> {
- static void finishPostorder(BasicBlock *BB, LoopBlocksTraversal& LBT) {
- LBT.finishPostorder(BB);
- }
-};
+inline bool po_iterator_storage<LoopBlocksTraversal, true>::
+insertEdge(BasicBlock *From, BasicBlock *To) {
+ return LBT.visitPreorder(To);
+}
+
+inline void po_iterator_storage<LoopBlocksTraversal, true>::
+finishPostorder(BasicBlock *BB) {
+ LBT.finishPostorder(BB);
+}
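
Typical client code for this traversal, sketched under the assumption that the LoopBlocksDFS wrapper declared earlier in this header provides perform() and the RPO iterator accessors:

    static unsigned countBlocksInRPO(Loop *L, LoopInfo *LI) {
      LoopBlocksDFS DFS(L);
      DFS.perform(LI); // drives LoopBlocksTraversal via the po_iterator above
      unsigned N = 0;
      for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO();
           I != E; ++I)
        ++N;
      return N;
    }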
} // End namespace llvm
diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h
index 865d236..e674e74 100644
--- a/include/llvm/Analysis/MemoryBuiltins.h
+++ b/include/llvm/Analysis/MemoryBuiltins.h
@@ -15,6 +15,15 @@
#ifndef LLVM_ANALYSIS_MEMORYBUILTINS_H
#define LLVM_ANALYSIS_MEMORYBUILTINS_H
+#include "llvm/IRBuilder.h"
+#include "llvm/Operator.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/TargetFolder.h"
+#include "llvm/Support/ValueHandle.h"
+
namespace llvm {
class CallInst;
class PointerType;
@@ -22,24 +31,44 @@ class TargetData;
class Type;
class Value;
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates or reallocates memory (malloc-, calloc-, realloc-, or
+/// strdup-like).
+bool isAllocationFn(const Value *V, bool LookThroughBitCast = false);
+
+/// \brief Tests if a value is a call or invoke to a function that returns a
+/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
+bool isNoAliasFn(const Value *V, bool LookThroughBitCast = false);
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates uninitialized memory (such as malloc).
+bool isMallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates zero-filled memory (such as calloc).
+bool isCallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory (malloc-, calloc-, or strdup-like).
+bool isAllocLikeFn(const Value *V, bool LookThroughBitCast = false);
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// reallocates memory (such as realloc).
+bool isReallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+
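
A sketch of querying these predicates; illustrative only, for any in-scope Value *V:

    static bool isHeapAllocation(const Value *V) {
      // Look through bitcasts so a casted malloc result still matches.
      return isAllocationFn(V, /*LookThroughBitCast=*/true);
    }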
+
//===----------------------------------------------------------------------===//
// malloc Call Utility Functions.
//
-/// isMalloc - Returns true if the value is either a malloc call or a bitcast of
-/// the result of a malloc call
-bool isMalloc(const Value *I);
-
/// extractMallocCall - Returns the corresponding CallInst if the instruction
/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
/// ignore InvokeInst here.
const CallInst *extractMallocCall(const Value *I);
-CallInst *extractMallocCall(Value *I);
-
-/// extractMallocCallFromBitCast - Returns the corresponding CallInst if the
-/// instruction is a bitcast of the result of a malloc call.
-const CallInst *extractMallocCallFromBitCast(const Value *I);
-CallInst *extractMallocCallFromBitCast(Value *I);
+static inline CallInst *extractMallocCall(Value *I) {
+ return const_cast<CallInst*>(extractMallocCall((const Value*)I));
+}
/// isArrayMalloc - Returns the corresponding CallInst if the instruction
/// is a call to malloc whose array size can be determined and the array size
@@ -67,7 +96,20 @@ Type *getMallocAllocatedType(const CallInst *CI);
/// determined.
Value *getMallocArraySize(CallInst *CI, const TargetData *TD,
bool LookThroughSExt = false);
-
+
+
+//===----------------------------------------------------------------------===//
+// calloc Call Utility Functions.
+//
+
+/// extractCallocCall - Returns the corresponding CallInst if the instruction
+/// is a calloc call.
+const CallInst *extractCallocCall(const Value *I);
+static inline CallInst *extractCallocCall(Value *I) {
+ return const_cast<CallInst*>(extractCallocCall((const Value*)I));
+}
+
+
//===----------------------------------------------------------------------===//
// free Call Utility Functions.
//
@@ -79,6 +121,131 @@ static inline CallInst *isFreeCall(Value *I) {
return const_cast<CallInst*>(isFreeCall((const Value*)I));
}
+
+//===----------------------------------------------------------------------===//
+// Utility functions to compute size of objects.
+//
+
+/// \brief Compute the size of the object pointed by Ptr. Returns true and the
+/// object size in Size if successful, and false otherwise.
+/// If RoundToAlign is true, then Size is rounded up to the alignment of allocas,
+/// byval arguments, and global variables.
+bool getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD,
+ bool RoundToAlign = false);
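
Illustrative use, assuming Ptr and TD are in scope:

    static bool fitsInObject(const Value *Ptr, uint64_t Needed,
                             const TargetData *TD) {
      uint64_t Size;
      if (!getObjectSize(Ptr, Size, TD))
        return false;       // object size not statically known
      return Needed <= Size;
    }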
+
+
+
+typedef std::pair<APInt, APInt> SizeOffsetType;
+
+/// \brief Statically evaluate the size and offset of an object pointed to by
+/// a Value*. Fails if the size or offset is not known at compile time.
+class ObjectSizeOffsetVisitor
+ : public InstVisitor<ObjectSizeOffsetVisitor, SizeOffsetType> {
+
+ const TargetData *TD;
+ bool RoundToAlign;
+ unsigned IntTyBits;
+ APInt Zero;
+
+ APInt align(APInt Size, uint64_t Align);
+
+ SizeOffsetType unknown() {
+ return std::make_pair(APInt(), APInt());
+ }
+
+public:
+ ObjectSizeOffsetVisitor(const TargetData *TD, LLVMContext &Context,
+ bool RoundToAlign = false);
+
+ SizeOffsetType compute(Value *V);
+
+ bool knownSize(SizeOffsetType &SizeOffset) {
+ return SizeOffset.first.getBitWidth() > 1;
+ }
+
+ bool knownOffset(SizeOffsetType &SizeOffset) {
+ return SizeOffset.second.getBitWidth() > 1;
+ }
+
+ bool bothKnown(SizeOffsetType &SizeOffset) {
+ return knownSize(SizeOffset) && knownOffset(SizeOffset);
+ }
+
+ SizeOffsetType visitAllocaInst(AllocaInst &I);
+ SizeOffsetType visitArgument(Argument &A);
+ SizeOffsetType visitCallSite(CallSite CS);
+ SizeOffsetType visitConstantPointerNull(ConstantPointerNull&);
+ SizeOffsetType visitExtractElementInst(ExtractElementInst &I);
+ SizeOffsetType visitExtractValueInst(ExtractValueInst &I);
+ SizeOffsetType visitGEPOperator(GEPOperator &GEP);
+ SizeOffsetType visitGlobalVariable(GlobalVariable &GV);
+ SizeOffsetType visitIntToPtrInst(IntToPtrInst&);
+ SizeOffsetType visitLoadInst(LoadInst &I);
+ SizeOffsetType visitPHINode(PHINode&);
+ SizeOffsetType visitSelectInst(SelectInst &I);
+ SizeOffsetType visitUndefValue(UndefValue&);
+ SizeOffsetType visitInstruction(Instruction &I);
+};
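
A static-evaluation sketch using the visitor, with TargetData *TD and LLVMContext &Ctx assumed in scope:

    static bool knownInBounds(Value *Ptr, const TargetData *TD,
                              LLVMContext &Ctx) {
      ObjectSizeOffsetVisitor Visitor(TD, Ctx);
      SizeOffsetType SO = Visitor.compute(Ptr);
      if (!Visitor.bothKnown(SO))
        return false;
      // SO.first is the object size, SO.second the offset of Ptr into it.
      return SO.second.isNonNegative() && SO.second.ule(SO.first);
    }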
+
+typedef std::pair<Value*, Value*> SizeOffsetEvalType;
+
+
+/// \brief Evaluate the size and offset of an object pointed to by a Value*.
+/// May create code to compute the result at run-time.
+class ObjectSizeOffsetEvaluator
+ : public InstVisitor<ObjectSizeOffsetEvaluator, SizeOffsetEvalType> {
+
+ typedef IRBuilder<true, TargetFolder> BuilderTy;
+ typedef std::pair<WeakVH, WeakVH> WeakEvalType;
+ typedef DenseMap<const Value*, WeakEvalType> CacheMapTy;
+ typedef SmallPtrSet<const Value*, 8> PtrSetTy;
+
+ const TargetData *TD;
+ LLVMContext &Context;
+ BuilderTy Builder;
+ ObjectSizeOffsetVisitor Visitor;
+ IntegerType *IntTy;
+ Value *Zero;
+ CacheMapTy CacheMap;
+ PtrSetTy SeenVals;
+
+ SizeOffsetEvalType unknown() {
+ return std::make_pair((Value*)0, (Value*)0);
+ }
+ SizeOffsetEvalType compute_(Value *V);
+
+public:
+ ObjectSizeOffsetEvaluator(const TargetData *TD, LLVMContext &Context);
+ SizeOffsetEvalType compute(Value *V);
+
+ bool knownSize(SizeOffsetEvalType SizeOffset) {
+ return SizeOffset.first;
+ }
+
+ bool knownOffset(SizeOffsetEvalType SizeOffset) {
+ return SizeOffset.second;
+ }
+
+ bool anyKnown(SizeOffsetEvalType SizeOffset) {
+ return knownSize(SizeOffset) || knownOffset(SizeOffset);
+ }
+
+ bool bothKnown(SizeOffsetEvalType SizeOffset) {
+ return knownSize(SizeOffset) && knownOffset(SizeOffset);
+ }
+
+ SizeOffsetEvalType visitAllocaInst(AllocaInst &I);
+ SizeOffsetEvalType visitCallSite(CallSite CS);
+ SizeOffsetEvalType visitExtractElementInst(ExtractElementInst &I);
+ SizeOffsetEvalType visitExtractValueInst(ExtractValueInst &I);
+ SizeOffsetEvalType visitGEPOperator(GEPOperator &GEP);
+ SizeOffsetEvalType visitIntToPtrInst(IntToPtrInst&);
+ SizeOffsetEvalType visitLoadInst(LoadInst &I);
+ SizeOffsetEvalType visitPHINode(PHINode &PHI);
+ SizeOffsetEvalType visitSelectInst(SelectInst &I);
+ SizeOffsetEvalType visitInstruction(Instruction &I);
+};
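
And its run-time counterpart, sketched under the same assumptions; unlike the visitor, the evaluator may emit instructions to compute the result:

    static void evalSizeOffset(Value *Ptr, const TargetData *TD,
                               LLVMContext &Ctx) {
      ObjectSizeOffsetEvaluator Eval(TD, Ctx);
      SizeOffsetEvalType SOE = Eval.compute(Ptr);
      if (Eval.bothKnown(SOE)) {
        Value *Size = SOE.first;   // a constant or freshly built IR
        Value *Offset = SOE.second;
        (void)Size; (void)Offset;
      }
    }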
+
} // End llvm namespace
#endif
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index 68ce364..7e049d6 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -124,11 +124,11 @@ namespace llvm {
}
/// isClobber - Return true if this MemDepResult represents a query that is
- /// a instruction clobber dependency.
+ /// an instruction clobber dependency.
bool isClobber() const { return Value.getInt() == Clobber; }
/// isDef - Return true if this MemDepResult represents a query that is
- /// a instruction definition dependency.
+ /// an instruction definition dependency.
bool isDef() const { return Value.getInt() == Def; }
/// isNonLocal - Return true if this MemDepResult represents a query that
@@ -431,9 +431,6 @@ namespace llvm {
void RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P);
- AliasAnalysis::ModRefResult
- getModRefInfo(const Instruction *Inst, const AliasAnalysis::Location &Loc);
-
/// verifyRemoved - Verify that the specified instruction does not occur
/// in our internal data structures.
void verifyRemoved(Instruction *Inst) const;
diff --git a/include/llvm/Analysis/ProfileInfoLoader.h b/include/llvm/Analysis/ProfileInfoLoader.h
index 9e0c393..dcf3b38 100644
--- a/include/llvm/Analysis/ProfileInfoLoader.h
+++ b/include/llvm/Analysis/ProfileInfoLoader.h
@@ -28,19 +28,16 @@ class BasicBlock;
class ProfileInfoLoader {
const std::string &Filename;
- Module &M;
std::vector<std::string> CommandLines;
std::vector<unsigned> FunctionCounts;
std::vector<unsigned> BlockCounts;
std::vector<unsigned> EdgeCounts;
std::vector<unsigned> OptimalEdgeCounts;
std::vector<unsigned> BBTrace;
- bool Warned;
public:
// ProfileInfoLoader ctor - Read the specified profiling data file, exiting
// the program if the file is invalid or broken.
- ProfileInfoLoader(const char *ToolName, const std::string &Filename,
- Module &M);
+ ProfileInfoLoader(const char *ToolName, const std::string &Filename);
static const unsigned Uncounted;
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index b098eea..188d11c 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -473,25 +473,85 @@ public:
const_iterator end() const { return children.end(); }
//@}
- /// @name BasicBlock Iterators
+ /// @name BasicBlock Node Iterators
///
/// These iterators iterate over all BasicBlock RegionNodes that are
- /// contained in this Region. The iterator also iterates over BasicBlocks
- /// that are elements of a subregion of this Region. It is therefore called a
- /// flat iterator.
+ /// contained in this Region. The iterator also iterates over BasicBlock
+ /// RegionNodes that are elements of a subregion of this Region. It is
+ /// therefore called a flat iterator.
//@{
typedef df_iterator<RegionNode*, SmallPtrSet<RegionNode*, 8>, false,
- GraphTraits<FlatIt<RegionNode*> > > block_iterator;
+ GraphTraits<FlatIt<RegionNode*> > > block_node_iterator;
typedef df_iterator<const RegionNode*, SmallPtrSet<const RegionNode*, 8>,
false, GraphTraits<FlatIt<const RegionNode*> > >
- const_block_iterator;
+ const_block_node_iterator;
+
+ block_node_iterator block_node_begin();
+ block_node_iterator block_node_end();
+
+ const_block_node_iterator block_node_begin() const;
+ const_block_node_iterator block_node_end() const;
+ //@}
+
+ /// @name BasicBlock Iterators
+ ///
+ /// These iterators iterate over all BasicBlocks that are contained in this
+ /// Region. The iterator also iterates over BasicBlocks that are elements of
+ /// a subregion of this Region. It is therefore called a flat iterator.
+ //@{
+ template <bool IsConst>
+ class block_iterator_wrapper
+ : public df_iterator<typename conditional<IsConst,
+ const BasicBlock,
+ BasicBlock>::type*> {
+ typedef df_iterator<typename conditional<IsConst,
+ const BasicBlock,
+ BasicBlock>::type*>
+ super;
+ public:
+ typedef block_iterator_wrapper<IsConst> Self;
+ typedef typename super::pointer pointer;
+
+ // Construct the begin iterator.
+ block_iterator_wrapper(pointer Entry, pointer Exit) : super(df_begin(Entry))
+ {
+ // Mark the exit of the region as visited, so that neither the exit
+ // itself nor its children, i.e. the blocks outside the region, will
+ // ever be visited.
+ super::Visited.insert(Exit);
+ }
+
+ // Construct the end iterator.
+ block_iterator_wrapper() : super(df_end<pointer>((BasicBlock *)0)) {}
+
+ /*implicit*/ block_iterator_wrapper(super I) : super(I) {}
+
+ // FIXME: Even a const_iterator returns a non-const BasicBlock pointer.
+ // This was introduced for backwards compatibility, but should
+ // be removed as soon as all users are fixed.
+ BasicBlock *operator*() const {
+ return const_cast<BasicBlock*>(super::operator*());
+ }
+ };
+
+ typedef block_iterator_wrapper<false> block_iterator;
+ typedef block_iterator_wrapper<true> const_block_iterator;
+
+ block_iterator block_begin() {
+ return block_iterator(getEntry(), getExit());
+ }
- block_iterator block_begin();
- block_iterator block_end();
+ block_iterator block_end() {
+ return block_iterator();
+ }
- const_block_iterator block_begin() const;
- const_block_iterator block_end() const;
+ const_block_iterator block_begin() const {
+ return const_block_iterator(getEntry(), getExit());
+ }
+ const_block_iterator block_end() const {
+ return const_block_iterator();
+ }
//@}
/// @name Element Iterators
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 72408f7..c213ade 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -30,7 +30,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include <map>
namespace llvm {
@@ -250,6 +250,9 @@ namespace llvm {
///
ValueExprMapType ValueExprMap;
+ /// Mark predicate values currently being processed by isImpliedCond.
+ DenseSet<Value*> PendingLoopPredicates;
+
/// ExitLimit - Information about the number of loop iterations for
/// which a loop exit's branch condition evaluates to the not-taken path.
/// This is a temporary pair of exact and max expressions that are
@@ -834,7 +837,8 @@ namespace llvm {
///
bool SimplifyICmpOperands(ICmpInst::Predicate &Pred,
const SCEV *&LHS,
- const SCEV *&RHS);
+ const SCEV *&RHS,
+ unsigned Depth = 0);
/// getLoopDisposition - Return the "disposition" of the given SCEV with
/// respect to the given loop.
diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index c22fc3a..3f8f149 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -14,9 +14,9 @@
#ifndef LLVM_ANALYSIS_SCALAREVOLUTION_EXPANDER_H
#define LLVM_ANALYSIS_SCALAREVOLUTION_EXPANDER_H
+#include "llvm/IRBuilder.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/TargetFolder.h"
#include "llvm/Support/ValueHandle.h"
#include <set>
@@ -24,6 +24,10 @@
namespace llvm {
class TargetLowering;
+ /// Return true if the given expression is safe to expand in the sense that
+ /// all materialized values are safe to speculate.
+ bool isSafeToExpand(const SCEV *S);
+
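
A sketch of the intended guard around expansion, assuming an in-scope SCEVExpander and its existing expandCodeFor() entry point:

    static Value *expandIfSafe(SCEVExpander &Expander, const SCEV *S,
                               Type *Ty, Instruction *InsertPt) {
      if (!isSafeToExpand(S))
        return 0; // expansion could introduce unsafe speculation
      return Expander.expandCodeFor(S, Ty, InsertPt);
    }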
/// SCEVExpander - This class uses information about analyzed scalars to
/// rewrite expressions in canonical form.
///
diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h
index 47b3710..ded1297 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -15,6 +15,7 @@
#define LLVM_ANALYSIS_SCALAREVOLUTION_EXPRESSIONS_H
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/ErrorHandling.h"
namespace llvm {
@@ -493,6 +494,74 @@ namespace llvm {
llvm_unreachable("Invalid use of SCEVCouldNotCompute!");
}
};
+
+ /// Visit all nodes in the expression tree using worklist traversal.
+ ///
+ /// Visitor implements:
+ /// // return true to follow this node.
+ /// bool follow(const SCEV *S);
+ /// // return true to terminate the search.
+ /// bool isDone();
+ template<typename SV>
+ class SCEVTraversal {
+ SV &Visitor;
+ SmallVector<const SCEV *, 8> Worklist;
+ SmallPtrSet<const SCEV *, 8> Visited;
+
+ void push(const SCEV *S) {
+ if (Visited.insert(S) && Visitor.follow(S))
+ Worklist.push_back(S);
+ }
+ public:
+ SCEVTraversal(SV& V): Visitor(V) {}
+
+ void visitAll(const SCEV *Root) {
+ push(Root);
+ while (!Worklist.empty() && !Visitor.isDone()) {
+ const SCEV *S = Worklist.pop_back_val();
+
+ switch (S->getSCEVType()) {
+ case scConstant:
+ case scUnknown:
+ break;
+ case scTruncate:
+ case scZeroExtend:
+ case scSignExtend:
+ push(cast<SCEVCastExpr>(S)->getOperand());
+ break;
+ case scAddExpr:
+ case scMulExpr:
+ case scSMaxExpr:
+ case scUMaxExpr:
+ case scAddRecExpr: {
+ const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S);
+ for (SCEVNAryExpr::op_iterator I = NAry->op_begin(),
+ E = NAry->op_end(); I != E; ++I) {
+ push(*I);
+ }
+ break;
+ }
+ case scUDivExpr: {
+ const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S);
+ push(UDiv->getLHS());
+ push(UDiv->getRHS());
+ break;
+ }
+ case scCouldNotCompute:
+ llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
+ default:
+ llvm_unreachable("Unknown SCEV kind!");
+ }
+ }
+ }
+ };
+
+ /// Use SCEVTraversal to visit all nodes in the given expression tree.
+ template<typename SV>
+ void visitAll(const SCEV *Root, SV& Visitor) {
+ SCEVTraversal<SV> T(Visitor);
+ T.visitAll(Root);
+ }
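
An example visitor satisfying the documented follow()/isDone() contract; a sketch that checks whether an expression contains an add-recurrence (SCEVAddRecExpr is declared earlier in this header):

    struct AddRecFinder {
      bool Found;
      AddRecFinder() : Found(false) {}
      bool follow(const SCEV *S) {
        if (isa<SCEVAddRecExpr>(S))
          Found = true;
        return !Found;  // stop descending once a match is seen
      }
      bool isDone() { return Found; }
    };
    // Usage: AddRecFinder F; visitAll(Expr, F); then inspect F.Found.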
}
#endif
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index f2f9db4..e8d45f6 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -151,6 +151,14 @@ namespace llvm {
return GetUnderlyingObject(const_cast<Value *>(V), TD, MaxLookup);
}
+ /// GetUnderlyingObjects - This method is similar to GetUnderlyingObject
+ /// except that it can look through phi and select instructions and return
+ /// multiple objects.
+ void GetUnderlyingObjects(Value *V,
+ SmallVectorImpl<Value *> &Objects,
+ const TargetData *TD = 0,
+ unsigned MaxLookup = 6);
+
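
Illustrative use, assuming Value *Ptr, TargetData *TD, and the usual isa<>/AllocaInst includes:

    static bool allObjectsAreAllocas(Value *Ptr, const TargetData *TD) {
      SmallVector<Value*, 4> Objects;
      GetUnderlyingObjects(Ptr, Objects, TD);
      for (unsigned i = 0, e = Objects.size(); i != e; ++i)
        if (!isa<AllocaInst>(Objects[i]))
          return false; // some underlying object is not a stack allocation
      return !Objects.empty();
    }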
/// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer
/// are lifetime markers.
bool onlyUsedByLifetimeMarkers(const Value *V);
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index 0099f17..223aa00 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -16,6 +16,7 @@
#define LLVM_ATTRIBUTES_H
#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/ArrayRef.h"
#include <cassert>
#include <string>
@@ -45,14 +46,9 @@ class Attributes {
Attributes() : Bits(0) { }
explicit Attributes(uint64_t Val) : Bits(Val) { }
/*implicit*/ Attributes(Attribute::AttrConst Val) : Bits(Val.v) { }
- Attributes(const Attributes &Attrs) : Bits(Attrs.Bits) { }
// This is a "safe bool() operator".
operator const void *() const { return Bits ? this : 0; }
bool isEmptyOrSingleton() const { return (Bits & (Bits - 1)) == 0; }
- Attributes &operator = (const Attributes &Attrs) {
- Bits = Attrs.Bits;
- return *this;
- }
bool operator == (const Attributes &Attrs) const {
return Bits == Attrs.Bits;
}
@@ -138,6 +134,9 @@ DECLARE_LLVM_ATTRIBUTE(NonLazyBind,1U<<31) ///< Function is called early and/or
/// often, so lazy binding isn't
/// worthwhile.
DECLARE_LLVM_ATTRIBUTE(AddressSafety,1ULL<<32) ///< Address safety checking is on.
+DECLARE_LLVM_ATTRIBUTE(IANSDialect,1ULL<<33) ///< Inline asm non-standard dialect.
+ /// When not set, the ATT dialect is assumed.
+ /// When set, the Intel dialect is implied.
#undef DECLARE_LLVM_ATTRIBUTE
@@ -163,14 +162,16 @@ const AttrConst FunctionOnly = {NoReturn_i | NoUnwind_i | ReadNone_i |
ReadOnly_i | NoInline_i | AlwaysInline_i | OptimizeForSize_i |
StackProtect_i | StackProtectReq_i | NoRedZone_i | NoImplicitFloat_i |
Naked_i | InlineHint_i | StackAlignment_i |
- UWTable_i | NonLazyBind_i | ReturnsTwice_i | AddressSafety_i};
+ UWTable_i | NonLazyBind_i | ReturnsTwice_i | AddressSafety_i |
+ IANSDialect_i};
/// @brief Parameter attributes that do not apply to vararg call arguments.
const AttrConst VarArgsIncompatible = {StructRet_i};
/// @brief Attributes that are mutually incompatible.
-const AttrConst MutuallyIncompatible[4] = {
- {ByVal_i | InReg_i | Nest_i | StructRet_i},
+const AttrConst MutuallyIncompatible[5] = {
+ {ByVal_i | Nest_i | StructRet_i},
+ {ByVal_i | Nest_i | InReg_i },
{ZExt_i | SExt_i},
{ReadNone_i | ReadOnly_i},
{NoInline_i | AlwaysInline_i}
@@ -222,6 +223,50 @@ inline unsigned getStackAlignmentFromAttrs(Attributes A) {
return 1U << ((StackAlign.Raw() >> 26) - 1);
}
+/// This returns an integer containing an encoding of all the
+/// LLVM attributes found in the given attribute bitset. Any
+/// change to this encoding is a breaking change to bitcode
+/// compatibility.
+inline uint64_t encodeLLVMAttributesForBitcode(Attributes Attrs) {
+ // FIXME: It doesn't make sense to store the alignment information as an
+ // expanded out value, we should store it as a log2 value. However, we can't
+ // just change that here without breaking bitcode compatibility. If this ever
+ // becomes a problem in practice, we should introduce new tag numbers in the
+ // bitcode file and have those tags use a more efficiently encoded alignment
+ // field.
+
+ // Store the alignment in the bitcode as a 16-bit raw value instead of a
+ // 5-bit log2 encoded value. Shift the bits above the alignment up by
+ // 11 bits.
+
+ uint64_t EncodedAttrs = Attrs.Raw() & 0xffff;
+ if (Attrs & Attribute::Alignment)
+ EncodedAttrs |= (1ull << 16) <<
+ (((Attrs & Attribute::Alignment).Raw()-1) >> 16);
+ EncodedAttrs |= (Attrs.Raw() & (0xfffull << 21)) << 11;
+
+ return EncodedAttrs;
+}
+
+/// This returns an attribute bitset containing the LLVM attributes
+/// that have been decoded from the given integer. This function
+/// must stay in sync with 'encodeLLVMAttributesForBitcode'.
+inline Attributes decodeLLVMAttributesForBitcode(uint64_t EncodedAttrs) {
+ // The alignment is stored as a 16-bit raw value from bits 31--16.
+ // We shift the bits above 31 down by 11 bits.
+
+ unsigned Alignment = (EncodedAttrs & (0xffffull << 16)) >> 16;
+ assert((!Alignment || isPowerOf2_32(Alignment)) &&
+ "Alignment must be a power of two.");
+
+ Attributes Attrs(EncodedAttrs & 0xffff);
+ if (Alignment)
+ Attrs |= Attribute::constructAlignmentFromInt(Alignment);
+ Attrs |= Attributes((EncodedAttrs & (0xfffull << 32)) >> 11);
+
+ return Attrs;
+}
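
A round-trip sketch of this encoding; it leans only on the Attributes operations visible in this header, so treat it as illustrative:

    static void checkAttrRoundTrip() {
      Attributes A(Attribute::NoUnwind);
      A |= Attribute::constructAlignmentFromInt(16);
      uint64_t Enc = encodeLLVMAttributesForBitcode(A);
      assert(decodeLLVMAttributesForBitcode(Enc) == A &&
             "bitcode attribute encoding must round-trip");
    }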
+
/// The set of Attributes set in Attributes is converted to a
/// string of equivalent mnemonics. This is, presumably, for writing out
@@ -268,16 +313,8 @@ public:
// Attribute List Construction and Mutation
//===--------------------------------------------------------------------===//
- /// get - Return a Attributes list with the specified parameter in it.
- static AttrListPtr get(const AttributeWithIndex *Attr, unsigned NumAttrs);
-
- /// get - Return a Attribute list with the parameters specified by the
- /// consecutive random access iterator range.
- template <typename Iter>
- static AttrListPtr get(const Iter &I, const Iter &E) {
- if (I == E) return AttrListPtr(); // Empty list.
- return get(&*I, static_cast<unsigned>(E-I));
- }
+ /// get - Return an Attributes list with the specified parameters in it.
+ static AttrListPtr get(ArrayRef<AttributeWithIndex> Attrs);
/// addAttr - Add the specified attribute at the specified index to this
/// attribute list. Since attribute lists are immutable, this
diff --git a/include/llvm/Bitcode/Archive.h b/include/llvm/Bitcode/Archive.h
index 86c44c7..3c75e58 100644
--- a/include/llvm/Bitcode/Archive.h
+++ b/include/llvm/Bitcode/Archive.h
@@ -47,14 +47,13 @@ class ArchiveMember : public ilist_node<ArchiveMember> {
/// characteristics of the member. The various "is" methods below provide
/// access to the flags. The flags are not user settable.
enum Flags {
- CompressedFlag = 1, ///< Member is a normal compressed file
- SVR4SymbolTableFlag = 2, ///< Member is a SVR4 symbol table
- BSD4SymbolTableFlag = 4, ///< Member is a BSD4 symbol table
- LLVMSymbolTableFlag = 8, ///< Member is an LLVM symbol table
- BitcodeFlag = 16, ///< Member is bitcode
- HasPathFlag = 64, ///< Member has a full or partial path
- HasLongFilenameFlag = 128, ///< Member uses the long filename syntax
- StringTableFlag = 256 ///< Member is an ar(1) format string table
+ SVR4SymbolTableFlag = 1, ///< Member is a SVR4 symbol table
+ BSD4SymbolTableFlag = 2, ///< Member is a BSD4 symbol table
+ LLVMSymbolTableFlag = 4, ///< Member is an LLVM symbol table
+ BitcodeFlag = 8, ///< Member is bitcode
+ HasPathFlag = 16, ///< Member has a full or partial path
+ HasLongFilenameFlag = 32, ///< Member uses the long filename syntax
+ StringTableFlag = 64 ///< Member is an ar(1) format string table
};
/// @}
@@ -109,11 +108,6 @@ class ArchiveMember : public ilist_node<ArchiveMember> {
/// @brief Get the data content of the archive member
const char* getData() const { return data; }
- /// This method determines if the member is a regular compressed file.
- /// @returns true iff the archive member is a compressed regular file.
- /// @brief Determine if the member is a compressed regular file.
- bool isCompressed() const { return flags&CompressedFlag; }
-
/// @returns true iff the member is a SVR4 (non-LLVM) symbol table
/// @brief Determine if this member is a SVR4 symbol table.
bool isSVR4SymbolTable() const { return flags&SVR4SymbolTableFlag; }
@@ -427,7 +421,6 @@ class Archive {
bool writeToDisk(
bool CreateSymbolTable=false, ///< Create Symbol table
bool TruncateNames=false, ///< Truncate the filename to 15 chars
- bool Compress=false, ///< Compress files
std::string* ErrMessage=0 ///< If non-null, where error msg is set
);
@@ -494,7 +487,6 @@ class Archive {
std::ofstream& ARFile, ///< The file to write member onto
bool CreateSymbolTable, ///< Should symbol table be created?
bool TruncateNames, ///< Should names be truncated to 11 chars?
- bool ShouldCompress, ///< Should the member be compressed?
std::string* ErrMessage ///< If non-null, place were error msg is set
);
diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h
index cc2b473..dd96b04 100644
--- a/include/llvm/Bitcode/ReaderWriter.h
+++ b/include/llvm/Bitcode/ReaderWriter.h
@@ -71,8 +71,8 @@ namespace llvm {
/// isBitcodeWrapper - Return true if the given bytes are the magic bytes
/// for an LLVM IR bitcode wrapper.
///
- static inline bool isBitcodeWrapper(const unsigned char *BufPtr,
- const unsigned char *BufEnd) {
+ inline bool isBitcodeWrapper(const unsigned char *BufPtr,
+ const unsigned char *BufEnd) {
// See if you can find the hidden message in the magic bytes :-).
// (Hint: it's a little-endian encoding.)
return BufPtr != BufEnd &&
@@ -85,8 +85,8 @@ namespace llvm {
/// isRawBitcode - Return true if the given bytes are the magic bytes for
/// raw LLVM IR bitcode (without a wrapper).
///
- static inline bool isRawBitcode(const unsigned char *BufPtr,
- const unsigned char *BufEnd) {
+ inline bool isRawBitcode(const unsigned char *BufPtr,
+ const unsigned char *BufEnd) {
// These bytes sort of have a hidden message, but it's not in
// little-endian this time, and it's a little redundant.
return BufPtr != BufEnd &&
@@ -99,8 +99,8 @@ namespace llvm {
/// isBitcode - Return true if the given bytes are the magic bytes for
/// LLVM IR bitcode, either with or without a wrapper.
///
- static bool inline isBitcode(const unsigned char *BufPtr,
- const unsigned char *BufEnd) {
+ inline bool isBitcode(const unsigned char *BufPtr,
+ const unsigned char *BufEnd) {
return isBitcodeWrapper(BufPtr, BufEnd) ||
isRawBitcode(BufPtr, BufEnd);
}
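
A typical buffer check, sketched with a loaded MemoryBuffer *MB assumed in scope:

    static bool bufferIsBitcode(const MemoryBuffer *MB) {
      const unsigned char *P =
        reinterpret_cast<const unsigned char*>(MB->getBufferStart());
      const unsigned char *E =
        reinterpret_cast<const unsigned char*>(MB->getBufferEnd());
      return isBitcode(P, E); // matches wrapped or raw bitcode magic
    }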
@@ -121,9 +121,9 @@ namespace llvm {
/// BC file.
/// If 'VerifyBufferSize' is true, check that the buffer is large enough to
/// contain the whole bitcode file.
- static inline bool SkipBitcodeWrapperHeader(const unsigned char *&BufPtr,
- const unsigned char *&BufEnd,
- bool VerifyBufferSize) {
+ inline bool SkipBitcodeWrapperHeader(const unsigned char *&BufPtr,
+ const unsigned char *&BufEnd,
+ bool VerifyBufferSize) {
enum {
KnownHeaderSize = 4*4, // Size of header we read.
OffsetField = 2*4, // Offset in bytes to Offset field.
diff --git a/include/llvm/CMakeLists.txt b/include/llvm/CMakeLists.txt
index de3ff86..f8cb425 100644
--- a/include/llvm/CMakeLists.txt
+++ b/include/llvm/CMakeLists.txt
@@ -6,8 +6,6 @@ add_custom_target(intrinsics_gen ALL
DEPENDS ${llvm_builded_incs_dir}/Intrinsics.gen)
set_target_properties(intrinsics_gen PROPERTIES FOLDER "Tablegenning")
-set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} intrinsics_gen PARENT_SCOPE)
-
if( MSVC_IDE OR XCODE )
# Creates a dummy target containing all headers for the benefit of
# Visual Studio users.
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 56a87f1..170a528 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -354,6 +354,13 @@ namespace llvm {
void EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
unsigned Size) const;
+ /// EmitLabelReference - Emit something like ".long Label"
+ /// where the size in bytes of the directive is specified by Size and Label
+ /// specifies the label.
+ void EmitLabelReference(const MCSymbol *Label, unsigned Size) const {
+ EmitLabelPlusOffset(Label, 0, Size);
+ }
+
//===------------------------------------------------------------------===//
// Dwarf Emission Helper Routines
//===------------------------------------------------------------------===//
diff --git a/include/llvm/CodeGen/DFAPacketizer.h b/include/llvm/CodeGen/DFAPacketizer.h
index ee1ed07..2d2db78 100644
--- a/include/llvm/CodeGen/DFAPacketizer.h
+++ b/include/llvm/CodeGen/DFAPacketizer.h
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/ADT/DenseMap.h"
+#include <map>
namespace llvm {
@@ -36,7 +37,7 @@ class MachineInstr;
class MachineLoopInfo;
class MachineDominatorTree;
class InstrItineraryData;
-class ScheduleDAGInstrs;
+class DefaultVLIWScheduler;
class SUnit;
class DFAPacketizer {
@@ -77,6 +78,8 @@ public:
// reserveResources - Reserve the resources occupied by a machine
// instruction and change the current state to reflect that change.
void reserveResources(llvm::MachineInstr *MI);
+
+ const InstrItineraryData *getInstrItins() const { return InstrItins; }
};
// VLIWPacketizerList - Implements a simple VLIW packetizer using DFA. The
@@ -87,20 +90,21 @@ public:
// and machine resource is marked as taken. If any dependency is found, a target
// API call is made to prune the dependence.
class VLIWPacketizerList {
+protected:
const TargetMachine &TM;
const MachineFunction &MF;
const TargetInstrInfo *TII;
- // Encapsulate data types not exposed to the target interface.
- ScheduleDAGInstrs *SchedulerImpl;
+ // The VLIW Scheduler.
+ DefaultVLIWScheduler *VLIWScheduler;
-protected:
// Vector of instructions assigned to the current packet.
std::vector<MachineInstr*> CurrentPacketMIs;
// DFA resource tracker.
DFAPacketizer *ResourceTracker;
- // Scheduling units.
- std::vector<SUnit> SUnits;
+
+ // Generate MI -> SU map.
+ std::map<MachineInstr*, SUnit*> MIToSUnit;
public:
VLIWPacketizerList(
@@ -118,17 +122,32 @@ public:
DFAPacketizer *getResourceTracker() {return ResourceTracker;}
// addToPacket - Add MI to the current packet.
- void addToPacket(MachineInstr *MI);
+ virtual MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+ MachineBasicBlock::iterator MII = MI;
+ CurrentPacketMIs.push_back(MI);
+ ResourceTracker->reserveResources(MI);
+ return MII;
+ }
// endPacket - End the current packet.
- void endPacket(MachineBasicBlock *MBB, MachineInstr *I);
+ void endPacket(MachineBasicBlock *MBB, MachineInstr *MI);
+
+ // initPacketizerState - perform initialization before packetizing
+ // an instruction. This function is supposed to be overridden by
+ // the target-dependent packetizer.
+ virtual void initPacketizerState(void) { return; }
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(MachineInstr *I, MachineBasicBlock *MBB);
+ virtual bool ignorePseudoInstruction(MachineInstr *I,
+ MachineBasicBlock *MBB) {
+ return false;
+ }
- // isSoloInstruction - return true if instruction I must end previous
- // packet.
- bool isSoloInstruction(MachineInstr *I);
+ // isSoloInstruction - return true if instruction MI cannot be packetized
+ // with any other instruction, which means that MI itself is a packet.
+ virtual bool isSoloInstruction(MachineInstr *MI) {
+ return true;
+ }
// isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
// together.
@@ -141,6 +160,7 @@ public:
virtual bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
return false;
}
+
};
}
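The newly virtual hooks make VLIWPacketizerList subclassable; a minimal sketch of a target packetizer, assuming the 3.1-era constructor signature (the class name and heuristics below are illustrative assumptions):

    class MyTargetPacketizer : public VLIWPacketizerList {
    public:
      MyTargetPacketizer(MachineFunction &MF, MachineLoopInfo &MLI,
                         MachineDominatorTree &MDT)
        : VLIWPacketizerList(MF, MLI, MDT, /*IsPostRA=*/true) {}

      // Reset per-instruction target state before each packetization step.
      virtual void initPacketizerState() {}

      // Pseudos that expand to no machine code need not be bundled.
      virtual bool ignorePseudoInstruction(MachineInstr *MI,
                                           MachineBasicBlock *MBB) {
        return MI->isImplicitDef();
      }

      // Inline asm is opaque to the DFA, so give it a packet of its own.
      virtual bool isSoloInstruction(MachineInstr *MI) {
        return MI->isInlineAsm();
      }
    };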
diff --git a/include/llvm/CodeGen/EdgeBundles.h b/include/llvm/CodeGen/EdgeBundles.h
index a1d29b1..e8a4a2d 100644
--- a/include/llvm/CodeGen/EdgeBundles.h
+++ b/include/llvm/CodeGen/EdgeBundles.h
@@ -46,7 +46,7 @@ public:
unsigned getNumBundles() const { return EC.getNumClasses(); }
/// getBlocks - Return an array of blocks that are connected to Bundle.
- ArrayRef<unsigned> getBlocks(unsigned Bundle) { return Blocks[Bundle]; }
+ ArrayRef<unsigned> getBlocks(unsigned Bundle) const { return Blocks[Bundle]; }
/// getMachineFunction - Return the last machine function computed.
const MachineFunction *getMachineFunction() const { return MF; }
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index e57c8b1..7cb9695 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -34,6 +34,7 @@ class MachineFrameInfo;
class MachineRegisterInfo;
class TargetData;
class TargetInstrInfo;
+class TargetLibraryInfo;
class TargetLowering;
class TargetMachine;
class TargetRegisterClass;
@@ -57,6 +58,7 @@ protected:
const TargetInstrInfo &TII;
const TargetLowering &TLI;
const TargetRegisterInfo &TRI;
+ const TargetLibraryInfo *LibInfo;
/// The position of the last instruction for materializing constants
/// for use in the current block. It resets to EmitStartPt when it
@@ -144,7 +146,8 @@ public:
virtual ~FastISel();
protected:
- explicit FastISel(FunctionLoweringInfo &funcInfo);
+ explicit FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
/// TargetSelectInstruction - This method is called by target-independent
/// code when the normal FastISel process fails to select an instruction.
@@ -299,6 +302,15 @@ protected:
unsigned Op1, bool Op1IsKill,
uint64_t Imm);
+ /// FastEmitInst_rrii - Emit a MachineInstr with two register operands,
+ /// two immediate operands, and a result register in the given register
+ /// class.
+ unsigned FastEmitInst_rrii(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill,
+ uint64_t Imm1, uint64_t Imm2);
+
/// FastEmitInst_i - Emit a MachineInstr with a single immediate
/// operand, and a result register in the given register class.
unsigned FastEmitInst_i(unsigned MachineInstrOpcode,
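A hedged sketch of a call to the new FastEmitInst_rrii from a target's fast-isel code; the opcode, register class, and operand values are hypothetical:

    unsigned ResultReg =
      FastEmitInst_rrii(MyTarget::INSERT_RRII,    // hypothetical opcode
                        &MyTarget::VecRegClass,   // hypothetical reg class
                        Op0Reg, /*Op0IsKill=*/false,
                        Op1Reg, /*Op1IsKill=*/true,
                        /*Imm1=*/Lane, /*Imm2=*/Mask);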
diff --git a/include/llvm/CodeGen/GCMetadata.h b/include/llvm/CodeGen/GCMetadata.h
index 45469ed..20e33f7 100644
--- a/include/llvm/CodeGen/GCMetadata.h
+++ b/include/llvm/CodeGen/GCMetadata.h
@@ -48,18 +48,18 @@ namespace llvm {
/// PointKind - The type of a collector-safe point.
///
enum PointKind {
- Loop, //< Instr is a loop (backwards branch).
- Return, //< Instr is a return instruction.
- PreCall, //< Instr is a call instruction.
- PostCall //< Instr is the return address of a call.
+ Loop, ///< Instr is a loop (backwards branch).
+ Return, ///< Instr is a return instruction.
+ PreCall, ///< Instr is a call instruction.
+ PostCall ///< Instr is the return address of a call.
};
}
/// GCPoint - Metadata for a collector-safe point in machine code.
///
struct GCPoint {
- GC::PointKind Kind; //< The kind of the safe point.
- MCSymbol *Label; //< A label.
+ GC::PointKind Kind; ///< The kind of the safe point.
+ MCSymbol *Label; ///< A label.
DebugLoc Loc;
GCPoint(GC::PointKind K, MCSymbol *L, DebugLoc DL)
@@ -69,9 +69,10 @@ namespace llvm {
/// GCRoot - Metadata for a pointer to an object managed by the garbage
/// collector.
struct GCRoot {
- int Num; //< Usually a frame index.
- int StackOffset; //< Offset from the stack pointer.
- const Constant *Metadata;//< Metadata straight from the call to llvm.gcroot.
+ int Num; ///< Usually a frame index.
+ int StackOffset; ///< Offset from the stack pointer.
+ const Constant *Metadata; ///< Metadata straight from the call
+ ///< to llvm.gcroot.
GCRoot(int N, const Constant *MD) : Num(N), StackOffset(-1), Metadata(MD) {}
};
diff --git a/include/llvm/CodeGen/GCStrategy.h b/include/llvm/CodeGen/GCStrategy.h
index 1cbd36a..dfc26d7 100644
--- a/include/llvm/CodeGen/GCStrategy.h
+++ b/include/llvm/CodeGen/GCStrategy.h
@@ -65,14 +65,14 @@ namespace llvm {
list_type Functions;
protected:
- unsigned NeededSafePoints; //< Bitmask of required safe points.
- bool CustomReadBarriers; //< Default is to insert loads.
- bool CustomWriteBarriers; //< Default is to insert stores.
- bool CustomRoots; //< Default is to pass through to backend.
- bool CustomSafePoints; //< Default is to use NeededSafePoints
- // to find safe points.
- bool InitRoots; //< If set, roots are nulled during lowering.
- bool UsesMetadata; //< If set, backend must emit metadata tables.
+ unsigned NeededSafePoints; ///< Bitmask of required safe points.
+ bool CustomReadBarriers; ///< Default is to insert loads.
+ bool CustomWriteBarriers; ///< Default is to insert stores.
+ bool CustomRoots; ///< Default is to pass through to backend.
+ bool CustomSafePoints; ///< Default is to use NeededSafePoints
+ ///< to find safe points.
+ bool InitRoots; ///< If set, roots are nulled during lowering.
+ bool UsesMetadata; ///< If set, backend must emit metadata tables.
public:
GCStrategy();
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index ab8ab5d..f387bd5 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -37,87 +37,87 @@ namespace ISD {
/// and getMachineOpcode() member functions of SDNode.
///
enum NodeType {
- // DELETED_NODE - This is an illegal value that is used to catch
- // errors. This opcode is not a legal opcode for any node.
+ /// DELETED_NODE - This is an illegal value that is used to catch
+ /// errors. This opcode is not a legal opcode for any node.
DELETED_NODE,
- // EntryToken - This is the marker used to indicate the start of the region.
+ /// EntryToken - This is the marker used to indicate the start of a region.
EntryToken,
- // TokenFactor - This node takes multiple tokens as input and produces a
- // single token result. This is used to represent the fact that the operand
- // operators are independent of each other.
+ /// TokenFactor - This node takes multiple tokens as input and produces a
+ /// single token result. This is used to represent the fact that the operand
+ /// operators are independent of each other.
TokenFactor,
- // AssertSext, AssertZext - These nodes record if a register contains a
- // value that has already been zero or sign extended from a narrower type.
- // These nodes take two operands. The first is the node that has already
- // been extended, and the second is a value type node indicating the width
- // of the extension
+ /// AssertSext, AssertZext - These nodes record if a register contains a
+ /// value that has already been zero or sign extended from a narrower type.
+ /// These nodes take two operands. The first is the node that has already
+ /// been extended, and the second is a value type node indicating the width
+ /// of the extension.
AssertSext, AssertZext,
- // Various leaf nodes.
+ /// Various leaf nodes.
BasicBlock, VALUETYPE, CONDCODE, Register, RegisterMask,
Constant, ConstantFP,
GlobalAddress, GlobalTLSAddress, FrameIndex,
JumpTable, ConstantPool, ExternalSymbol, BlockAddress,
- // The address of the GOT
+ /// The address of the GOT
GLOBAL_OFFSET_TABLE,
- // FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and
- // llvm.returnaddress on the DAG. These nodes take one operand, the index
- // of the frame or return address to return. An index of zero corresponds
- // to the current function's frame or return address, an index of one to the
- // parent's frame or return address, and so on.
+ /// FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and
+ /// llvm.returnaddress on the DAG. These nodes take one operand, the index
+ /// of the frame or return address to return. An index of zero corresponds
+ /// to the current function's frame or return address, an index of one to
+ /// the parent's frame or return address, and so on.
FRAMEADDR, RETURNADDR,
- // FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to
- // first (possible) on-stack argument. This is needed for correct stack
- // adjustment during unwind.
+ /// FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to
+ /// first (possible) on-stack argument. This is needed for correct stack
+ /// adjustment during unwind.
FRAME_TO_ARGS_OFFSET,
- // RESULT, OUTCHAIN = EXCEPTIONADDR(INCHAIN) - This node represents the
- // address of the exception block on entry to an landing pad block.
+ /// RESULT, OUTCHAIN = EXCEPTIONADDR(INCHAIN) - This node represents the
+ /// address of the exception block on entry to a landing pad block.
EXCEPTIONADDR,
- // RESULT, OUTCHAIN = LSDAADDR(INCHAIN) - This node represents the
- // address of the Language Specific Data Area for the enclosing function.
+ /// RESULT, OUTCHAIN = LSDAADDR(INCHAIN) - This node represents the
+ /// address of the Language Specific Data Area for the enclosing function.
LSDAADDR,
- // RESULT, OUTCHAIN = EHSELECTION(INCHAIN, EXCEPTION) - This node represents
- // the selection index of the exception thrown.
+ /// RESULT, OUTCHAIN = EHSELECTION(INCHAIN, EXCEPTION) - This node
+ /// represents the selection index of the exception thrown.
EHSELECTION,
- // OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents
- // 'eh_return' gcc dwarf builtin, which is used to return from
- // exception. The general meaning is: adjust stack by OFFSET and pass
- // execution to HANDLER. Many platform-related details also :)
+ /// OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents
+ /// 'eh_return' gcc dwarf builtin, which is used to return from
+ /// exception. The general meaning is: adjust stack by OFFSET and pass
+ /// execution to HANDLER. Many platform-related details also :)
EH_RETURN,
- // RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer)
- // This corresponds to the eh.sjlj.setjmp intrinsic.
- // It takes an input chain and a pointer to the jump buffer as inputs
- // and returns an outchain.
+ /// RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer)
+ /// This corresponds to the eh.sjlj.setjmp intrinsic.
+ /// It takes an input chain and a pointer to the jump buffer as inputs
+ /// and returns an outchain.
EH_SJLJ_SETJMP,
- // OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer)
- // This corresponds to the eh.sjlj.longjmp intrinsic.
- // It takes an input chain and a pointer to the jump buffer as inputs
- // and returns an outchain.
+ /// OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer)
+ /// This corresponds to the eh.sjlj.longjmp intrinsic.
+ /// It takes an input chain and a pointer to the jump buffer as inputs
+ /// and returns an outchain.
EH_SJLJ_LONGJMP,
- // TargetConstant* - Like Constant*, but the DAG does not do any folding,
- // simplification, or lowering of the constant. They are used for constants
- // which are known to fit in the immediate fields of their users, or for
- // carrying magic numbers which are not values which need to be materialized
- // in registers.
+ /// TargetConstant* - Like Constant*, but the DAG does not do any folding,
+ /// simplification, or lowering of the constant. They are used for constants
+ /// which are known to fit in the immediate fields of their users, or for
+ /// carrying magic numbers which are not values which need to be
+ /// materialized in registers.
TargetConstant,
TargetConstantFP,
- // TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or
- // anything else with this node, and this is valid in the target-specific
- // dag, turning into a GlobalAddress operand.
+ /// TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or
+ /// anything else with this node, and this is valid in the target-specific
+ /// dag, turning into a GlobalAddress operand.
TargetGlobalAddress,
TargetGlobalTLSAddress,
TargetFrameIndex,
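The FRAMEADDR/RETURNADDR index semantics match the clang/gcc builtins that lower to llvm.frameaddress and llvm.returnaddress; a standalone illustration:

    void *own_frame()       { return __builtin_frame_address(0); }  // index 0
    void *parent_ret_addr() { return __builtin_return_address(1); } // index 1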
@@ -126,6 +126,11 @@ namespace ISD {
TargetExternalSymbol,
TargetBlockAddress,
+ /// TargetIndex - Like a constant pool entry, but with completely
+ /// target-dependent semantics. Holds target flags, a 32-bit index, and a
+ /// 64-bit index. Targets can use this however they like.
+ TargetIndex,
+
/// RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...)
/// This node represents a target intrinsic function with no side effects.
/// The first operand is the ID number of the intrinsic from the
@@ -148,93 +153,94 @@ namespace ISD {
/// namespace. The operands to the intrinsic follow.
INTRINSIC_VOID,
- // CopyToReg - This node has three operands: a chain, a register number to
- // set to this value, and a value.
+ /// CopyToReg - This node has three operands: a chain, a register number to
+ /// set to this value, and a value.
CopyToReg,
- // CopyFromReg - This node indicates that the input value is a virtual or
- // physical register that is defined outside of the scope of this
- // SelectionDAG. The register is available from the RegisterSDNode object.
+ /// CopyFromReg - This node indicates that the input value is a virtual or
+ /// physical register that is defined outside of the scope of this
+ /// SelectionDAG. The register is available from the RegisterSDNode object.
CopyFromReg,
- // UNDEF - An undefined node
+ /// UNDEF - An undefined node.
UNDEF,
- // EXTRACT_ELEMENT - This is used to get the lower or upper (determined by
- // a Constant, which is required to be operand #1) half of the integer or
- // float value specified as operand #0. This is only for use before
- // legalization, for values that will be broken into multiple registers.
+ /// EXTRACT_ELEMENT - This is used to get the lower or upper (determined by
+ /// a Constant, which is required to be operand #1) half of the integer or
+ /// float value specified as operand #0. This is only for use before
+ /// legalization, for values that will be broken into multiple registers.
EXTRACT_ELEMENT,
- // BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways. Given
- // two values of the same integer value type, this produces a value twice as
- // big. Like EXTRACT_ELEMENT, this can only be used before legalization.
+ /// BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
+ /// Given two values of the same integer value type, this produces a value
+ /// twice as big. Like EXTRACT_ELEMENT, this can only be used before
+ /// legalization.
BUILD_PAIR,
- // MERGE_VALUES - This node takes multiple discrete operands and returns
- // them all as its individual results. This nodes has exactly the same
- // number of inputs and outputs. This node is useful for some pieces of the
- // code generator that want to think about a single node with multiple
- // results, not multiple nodes.
+ /// MERGE_VALUES - This node takes multiple discrete operands and returns
+ /// them all as its individual results. This node has exactly the same
+ /// number of inputs and outputs. This node is useful for some pieces of the
+ /// code generator that want to think about a single node with multiple
+ /// results, not multiple nodes.
MERGE_VALUES,
- // Simple integer binary arithmetic operators.
+ /// Simple integer binary arithmetic operators.
ADD, SUB, MUL, SDIV, UDIV, SREM, UREM,
- // SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing
- // a signed/unsigned value of type i[2*N], and return the full value as
- // two results, each of type iN.
+ /// SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing
+ /// a signed/unsigned value of type i[2*N], and return the full value as
+ /// two results, each of type iN.
SMUL_LOHI, UMUL_LOHI,
- // SDIVREM/UDIVREM - Divide two integers and produce both a quotient and
- // remainder result.
+ /// SDIVREM/UDIVREM - Divide two integers and produce both a quotient and
+ /// remainder result.
SDIVREM, UDIVREM,
- // CARRY_FALSE - This node is used when folding other nodes,
- // like ADDC/SUBC, which indicate the carry result is always false.
+ /// CARRY_FALSE - This node is used when folding other nodes,
+ /// like ADDC/SUBC, which indicate the carry result is always false.
CARRY_FALSE,
- // Carry-setting nodes for multiple precision addition and subtraction.
- // These nodes take two operands of the same value type, and produce two
- // results. The first result is the normal add or sub result, the second
- // result is the carry flag result.
+ /// Carry-setting nodes for multiple precision addition and subtraction.
+ /// These nodes take two operands of the same value type, and produce two
+ /// results. The first result is the normal add or sub result, the second
+ /// result is the carry flag result.
ADDC, SUBC,
- // Carry-using nodes for multiple precision addition and subtraction. These
- // nodes take three operands: The first two are the normal lhs and rhs to
- // the add or sub, and the third is the input carry flag. These nodes
- // produce two results; the normal result of the add or sub, and the output
- // carry flag. These nodes both read and write a carry flag to allow them
- // to them to be chained together for add and sub of arbitrarily large
- // values.
+ /// Carry-using nodes for multiple precision addition and subtraction. These
+ /// nodes take three operands: The first two are the normal lhs and rhs to
+ /// the add or sub, and the third is the input carry flag. These nodes
+ /// produce two results; the normal result of the add or sub, and the output
+ /// carry flag. These nodes both read and write a carry flag to allow them
+ /// to be chained together for add and sub of arbitrarily large
+ /// values.
ADDE, SUBE,
- // RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
- // These nodes take two operands: the normal LHS and RHS to the add. They
- // produce two results: the normal result of the add, and a boolean that
- // indicates if an overflow occurred (*not* a flag, because it may be stored
- // to memory, etc.). If the type of the boolean is not i1 then the high
- // bits conform to getBooleanContents.
- // These nodes are generated from the llvm.[su]add.with.overflow intrinsics.
+ /// RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
+ /// These nodes take two operands: the normal LHS and RHS to the add. They
+ /// produce two results: the normal result of the add, and a boolean that
+ /// indicates if an overflow occurred (*not* a flag, because it may be stored
+ /// to memory, etc.). If the type of the boolean is not i1 then the high
+ /// bits conform to getBooleanContents.
+ /// These nodes are generated from llvm.[su]add.with.overflow intrinsics.
SADDO, UADDO,
- // Same for subtraction
+ /// Same for subtraction.
SSUBO, USUBO,
- // Same for multiplication
+ /// Same for multiplication.
SMULO, UMULO,
- // Simple binary floating point operators.
+ /// Simple binary floating point operators.
FADD, FSUB, FMUL, FMA, FDIV, FREM,
- // FCOPYSIGN(X, Y) - Return the value of X with the sign of Y. NOTE: This
- // DAG node does not require that X and Y have the same type, just that they
- // are both floating point. X and the result must have the same type.
- // FCOPYSIGN(f32, f64) is allowed.
+ /// FCOPYSIGN(X, Y) - Return the value of X with the sign of Y. NOTE: This
+ /// DAG node does not require that X and Y have the same type, just that they
+ /// are both floating point. X and the result must have the same type.
+ /// FCOPYSIGN(f32, f64) is allowed.
FCOPYSIGN,
- // INT = FGETSIGN(FP) - Return the sign bit of the specified floating point
- // value as an integer 0/1 value.
+ /// INT = FGETSIGN(FP) - Return the sign bit of the specified floating point
+ /// value as an integer 0/1 value.
FGETSIGN,
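The ADDC/ADDE chaining described above is easiest to see in scalar code; a standalone sketch adding two 128-bit values held as 64-bit limbs, with the carry threaded between the adds:

    #include <cstdint>

    void add128(uint64_t aLo, uint64_t aHi, uint64_t bLo, uint64_t bHi,
                uint64_t &sumLo, uint64_t &sumHi) {
      sumLo = aLo + bLo;            // ADDC: the sum plus a carry-out result
      uint64_t carry = sumLo < aLo; // the carry flag
      sumHi = aHi + bHi + carry;    // ADDE: consumes the incoming carry
    }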
/// BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a vector with the
@@ -292,13 +298,14 @@ namespace ISD {
/// than the vector element type, and is implicitly truncated to it.
SCALAR_TO_VECTOR,
- // MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing
- // an unsigned/signed value of type i[2*N], then return the top part.
+ /// MULHU/MULHS - Multiply high - Multiply two integers of type iN,
+ /// producing an unsigned/signed value of type i[2*N], then return the top
+ /// part.
MULHU, MULHS,
/// Bitwise operators - logical and, logical or, logical xor.
AND, OR, XOR,
-
+
/// Shift and rotation operations. After legalization, the type of the
/// shift amount is known to be TLI.getShiftAmountTy(). Before legalization
/// the shift amount can be any type, but care must be taken to ensure it is
@@ -306,7 +313,6 @@ namespace ISD {
/// legalization, types like i1024 can occur and i8 doesn't have enough bits
/// to represent the shift amount. By convention, DAGCombine and
/// SelectionDAGBuilder force these shift amounts to i32 for simplicity.
- ///
SHL, SRA, SRL, ROTL, ROTR,
/// Byte Swap and Counting operators.
@@ -315,67 +321,67 @@ namespace ISD {
/// Bit counting operators with an undefined result for zero inputs.
CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
- // Select(COND, TRUEVAL, FALSEVAL). If the type of the boolean COND is not
- // i1 then the high bits must conform to getBooleanContents.
+ /// Select(COND, TRUEVAL, FALSEVAL). If the type of the boolean COND is not
+ /// i1 then the high bits must conform to getBooleanContents.
SELECT,
- // Select with a vector condition (op #0) and two vector operands (ops #1
- // and #2), returning a vector result. All vectors have the same length.
- // Much like the scalar select and setcc, each bit in the condition selects
- // whether the corresponding result element is taken from op #1 or op #2.
- // At first, the VSELECT condition is of vXi1 type. Later, targets may change
- // the condition type in order to match the VSELECT node using a a pattern.
- // The condition follows the BooleanContent format of the target.
+ /// Select with a vector condition (op #0) and two vector operands (ops #1
+ /// and #2), returning a vector result. All vectors have the same length.
+ /// Much like the scalar select and setcc, each bit in the condition selects
+ /// whether the corresponding result element is taken from op #1 or op #2.
+ /// At first, the VSELECT condition is of vXi1 type. Later, targets may
+ /// change the condition type in order to match the VSELECT node using a
+ /// pattern. The condition follows the BooleanContent format of the target.
VSELECT,
- // Select with condition operator - This selects between a true value and
- // a false value (ops #2 and #3) based on the boolean result of comparing
- // the lhs and rhs (ops #0 and #1) of a conditional expression with the
- // condition code in op #4, a CondCodeSDNode.
+ /// Select with condition operator - This selects between a true value and
+ /// a false value (ops #2 and #3) based on the boolean result of comparing
+ /// the lhs and rhs (ops #0 and #1) of a conditional expression with the
+ /// condition code in op #4, a CondCodeSDNode.
SELECT_CC,
- // SetCC operator - This evaluates to a true value iff the condition is
- // true. If the result value type is not i1 then the high bits conform
- // to getBooleanContents. The operands to this are the left and right
- // operands to compare (ops #0, and #1) and the condition code to compare
- // them with (op #2) as a CondCodeSDNode. If the operands are vector types
- // then the result type must also be a vector type.
+ /// SetCC operator - This evaluates to a true value iff the condition is
+ /// true. If the result value type is not i1 then the high bits conform
+ /// to getBooleanContents. The operands to this are the left and right
+ /// operands to compare (ops #0, and #1) and the condition code to compare
+ /// them with (op #2) as a CondCodeSDNode. If the operands are vector types
+ /// then the result type must also be a vector type.
SETCC,
- // SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded
- // integer shift operations, just like ADD/SUB_PARTS. The operation
- // ordering is:
- // [Lo,Hi] = op [LoLHS,HiLHS], Amt
+ /// SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded
+ /// integer shift operations, just like ADD/SUB_PARTS. The operation
+ /// ordering is:
+ /// [Lo,Hi] = op [LoLHS,HiLHS], Amt
SHL_PARTS, SRA_PARTS, SRL_PARTS,
- // Conversion operators. These are all single input single output
- // operations. For all of these, the result type must be strictly
- // wider or narrower (depending on the operation) than the source
- // type.
+ /// Conversion operators. These are all single input single output
+ /// operations. For all of these, the result type must be strictly
+ /// wider or narrower (depending on the operation) than the source
+ /// type.
- // SIGN_EXTEND - Used for integer types, replicating the sign bit
- // into new bits.
+ /// SIGN_EXTEND - Used for integer types, replicating the sign bit
+ /// into new bits.
SIGN_EXTEND,
- // ZERO_EXTEND - Used for integer types, zeroing the new bits.
+ /// ZERO_EXTEND - Used for integer types, zeroing the new bits.
ZERO_EXTEND,
- // ANY_EXTEND - Used for integer types. The high bits are undefined.
+ /// ANY_EXTEND - Used for integer types. The high bits are undefined.
ANY_EXTEND,
- // TRUNCATE - Completely drop the high bits.
+ /// TRUNCATE - Completely drop the high bits.
TRUNCATE,
- // [SU]INT_TO_FP - These operators convert integers (whose interpreted sign
- // depends on the first letter) to floating point.
+ /// [SU]INT_TO_FP - These operators convert integers (whose interpreted sign
+ /// depends on the first letter) to floating point.
SINT_TO_FP,
UINT_TO_FP,
- // SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to
- // sign extend a small value in a large integer register (e.g. sign
- // extending the low 8 bits of a 32-bit register to fill the top 24 bits
- // with the 7th bit). The size of the smaller type is indicated by the 1th
- // operand, a ValueType node.
+ /// SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to
+ /// sign extend a small value in a large integer register (e.g. sign
+ /// extending the low 8 bits of a 32-bit register to fill the top 24 bits
+ /// with the 7th bit). The size of the smaller type is indicated by
+ /// operand #1, a ValueType node.
SIGN_EXTEND_INREG,
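A standalone model of the SHL/SRA pair described above, sign extending the low 8 bits of a 32-bit value (arithmetic right shift on int32_t assumed, as on all common targets):

    #include <cstdint>

    int32_t sign_extend_in_reg_i8(uint32_t v) {
      // Move the i8 sign bit to bit 31, then shift back so it replicates
      // through the top 24 bits.
      return int32_t(v << 24) >> 24;
    }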
/// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned
@@ -396,12 +402,12 @@ namespace ISD {
/// FP_EXTEND(FP_ROUND(X,0)) because the extra bits aren't removed.
FP_ROUND,
- // FLT_ROUNDS_ - Returns current rounding mode:
- // -1 Undefined
- // 0 Round to 0
- // 1 Round to nearest
- // 2 Round to +inf
- // 3 Round to -inf
+ /// FLT_ROUNDS_ - Returns current rounding mode:
+ /// -1 Undefined
+ /// 0 Round to 0
+ /// 1 Round to nearest
+ /// 2 Round to +inf
+ /// 3 Round to -inf
FLT_ROUNDS_,
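FLT_ROUNDS_ uses the same encoding as the C FLT_ROUNDS macro, so the mapping can be demonstrated standalone:

    #include <cfloat>
    #include <cstdio>

    int main() {
      // -1 undefined, 0 toward zero, 1 to nearest, 2 toward +inf, 3 toward -inf
      printf("FLT_ROUNDS = %d\n", (int)FLT_ROUNDS);
      return 0;
    }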
/// X = FP_ROUND_INREG(Y, VT) - This operator takes an FP register, and
@@ -414,208 +420,211 @@ namespace ISD {
/// X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
FP_EXTEND,
- // BITCAST - This operator converts between integer, vector and FP
- // values, as if the value was stored to memory with one type and loaded
- // from the same address with the other type (or equivalently for vector
- // format conversions, etc). The source and result are required to have
- // the same bit size (e.g. f32 <-> i32). This can also be used for
- // int-to-int or fp-to-fp conversions, but that is a noop, deleted by
- // getNode().
+ /// BITCAST - This operator converts between integer, vector and FP
+ /// values, as if the value was stored to memory with one type and loaded
+ /// from the same address with the other type (or equivalently for vector
+ /// format conversions, etc). The source and result are required to have
+ /// the same bit size (e.g. f32 <-> i32). This can also be used for
+ /// int-to-int or fp-to-fp conversions, but that is a noop, deleted by
+ /// getNode().
BITCAST,
- // CONVERT_RNDSAT - This operator is used to support various conversions
- // between various types (float, signed, unsigned and vectors of those
- // types) with rounding and saturation. NOTE: Avoid using this operator as
- // most target don't support it and the operator might be removed in the
- // future. It takes the following arguments:
- // 0) value
- // 1) dest type (type to convert to)
- // 2) src type (type to convert from)
- // 3) rounding imm
- // 4) saturation imm
- // 5) ISD::CvtCode indicating the type of conversion to do
+ /// CONVERT_RNDSAT - This operator is used to support various conversions
+ /// between various types (float, signed, unsigned and vectors of those
+ /// types) with rounding and saturation. NOTE: Avoid using this operator as
+ /// most targets don't support it and the operator might be removed in the
+ /// future. It takes the following arguments:
+ /// 0) value
+ /// 1) dest type (type to convert to)
+ /// 2) src type (type to convert from)
+ /// 3) rounding imm
+ /// 4) saturation imm
+ /// 5) ISD::CvtCode indicating the type of conversion to do
CONVERT_RNDSAT,
- // FP16_TO_FP32, FP32_TO_FP16 - These operators are used to perform
- // promotions and truncation for half-precision (16 bit) floating
- // numbers. We need special nodes since FP16 is a storage-only type with
- // special semantics of operations.
+ /// FP16_TO_FP32, FP32_TO_FP16 - These operators are used to perform
+ /// promotions and truncation for half-precision (16 bit) floating point
+ /// numbers. We need special nodes since FP16 is a storage-only type with
+ /// special semantics of operations.
FP16_TO_FP32, FP32_TO_FP16,
- // FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW,
- // FLOG, FLOG2, FLOG10, FEXP, FEXP2,
- // FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR - Perform various unary floating
- // point operations. These are inspired by libm.
+ /// FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW,
+ /// FLOG, FLOG2, FLOG10, FEXP, FEXP2,
+ /// FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR - Perform various unary
+ /// floating point operations. These are inspired by libm.
FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW,
FLOG, FLOG2, FLOG10, FEXP, FEXP2,
FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR,
- // LOAD and STORE have token chains as their first operand, then the same
- // operands as an LLVM load/store instruction, then an offset node that
- // is added / subtracted from the base pointer to form the address (for
- // indexed memory ops).
+ /// LOAD and STORE have token chains as their first operand, then the same
+ /// operands as an LLVM load/store instruction, then an offset node that
+ /// is added / subtracted from the base pointer to form the address (for
+ /// indexed memory ops).
LOAD, STORE,
- // DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned
- // to a specified boundary. This node always has two return values: a new
- // stack pointer value and a chain. The first operand is the token chain,
- // the second is the number of bytes to allocate, and the third is the
- // alignment boundary. The size is guaranteed to be a multiple of the stack
- // alignment, and the alignment is guaranteed to be bigger than the stack
- // alignment (if required) or 0 to get standard stack alignment.
+ /// DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned
+ /// to a specified boundary. This node always has two return values: a new
+ /// stack pointer value and a chain. The first operand is the token chain,
+ /// the second is the number of bytes to allocate, and the third is the
+ /// alignment boundary. The size is guaranteed to be a multiple of the
+ /// stack alignment, and the alignment is guaranteed to be bigger than the
+ /// stack alignment (if required) or 0 to get standard stack alignment.
DYNAMIC_STACKALLOC,
- // Control flow instructions. These all have token chains.
+ /// Control flow instructions. These all have token chains.
- // BR - Unconditional branch. The first operand is the chain
- // operand, the second is the MBB to branch to.
+ /// BR - Unconditional branch. The first operand is the chain
+ /// operand, the second is the MBB to branch to.
BR,
- // BRIND - Indirect branch. The first operand is the chain, the second
- // is the value to branch to, which must be of the same type as the target's
- // pointer type.
+ /// BRIND - Indirect branch. The first operand is the chain, the second
+ /// is the value to branch to, which must be of the same type as the
+ /// target's pointer type.
BRIND,
- // BR_JT - Jumptable branch. The first operand is the chain, the second
- // is the jumptable index, the last one is the jumptable entry index.
+ /// BR_JT - Jumptable branch. The first operand is the chain, the second
+ /// is the jumptable index, the last one is the jumptable entry index.
BR_JT,
- // BRCOND - Conditional branch. The first operand is the chain, the
- // second is the condition, the third is the block to branch to if the
- // condition is true. If the type of the condition is not i1, then the
- // high bits must conform to getBooleanContents.
+ /// BRCOND - Conditional branch. The first operand is the chain, the
+ /// second is the condition, the third is the block to branch to if the
+ /// condition is true. If the type of the condition is not i1, then the
+ /// high bits must conform to getBooleanContents.
BRCOND,
- // BR_CC - Conditional branch. The behavior is like that of SELECT_CC, in
- // that the condition is represented as condition code, and two nodes to
- // compare, rather than as a combined SetCC node. The operands in order are
- // chain, cc, lhs, rhs, block to branch to if condition is true.
+ /// BR_CC - Conditional branch. The behavior is like that of SELECT_CC, in
+ /// that the condition is represented as condition code, and two nodes to
+ /// compare, rather than as a combined SetCC node. The operands in order
+ /// are chain, cc, lhs, rhs, block to branch to if condition is true.
BR_CC,
- // INLINEASM - Represents an inline asm block. This node always has two
- // return values: a chain and a flag result. The inputs are as follows:
- // Operand #0 : Input chain.
- // Operand #1 : a ExternalSymbolSDNode with a pointer to the asm string.
- // Operand #2 : a MDNodeSDNode with the !srcloc metadata.
- // Operand #3 : HasSideEffect, IsAlignStack bits.
- // After this, it is followed by a list of operands with this format:
- // ConstantSDNode: Flags that encode whether it is a mem or not, the
- // of operands that follow, etc. See InlineAsm.h.
- // ... however many operands ...
- // Operand #last: Optional, an incoming flag.
- //
- // The variable width operands are required to represent target addressing
- // modes as a single "operand", even though they may have multiple
- // SDOperands.
+ /// INLINEASM - Represents an inline asm block. This node always has two
+ /// return values: a chain and a flag result. The inputs are as follows:
+ /// Operand #0 : Input chain.
+ /// Operand #1 : a ExternalSymbolSDNode with a pointer to the asm string.
+ /// Operand #2 : a MDNodeSDNode with the !srcloc metadata.
+ /// Operand #3 : HasSideEffect, IsAlignStack bits.
+ /// After this, it is followed by a list of operands with this format:
+ /// ConstantSDNode: Flags that encode whether it is a mem or not, the number
+ /// of operands that follow, etc. See InlineAsm.h.
+ /// ... however many operands ...
+ /// Operand #last: Optional, an incoming flag.
+ ///
+ /// The variable width operands are required to represent target addressing
+ /// modes as a single "operand", even though they may have multiple
+ /// SDOperands.
INLINEASM,
- // EH_LABEL - Represents a label in mid basic block used to track
- // locations needed for debug and exception handling tables. These nodes
- // take a chain as input and return a chain.
+ /// EH_LABEL - Represents a label in mid basic block used to track
+ /// locations needed for debug and exception handling tables. These nodes
+ /// take a chain as input and return a chain.
EH_LABEL,
- // STACKSAVE - STACKSAVE has one operand, an input chain. It produces a
- // value, the same type as the pointer type for the system, and an output
- // chain.
+ /// STACKSAVE - STACKSAVE has one operand, an input chain. It produces a
+ /// value, the same type as the pointer type for the system, and an output
+ /// chain.
STACKSAVE,
- // STACKRESTORE has two operands, an input chain and a pointer to restore to
- // it returns an output chain.
+ /// STACKRESTORE has two operands, an input chain and a pointer to restore
+ /// to; it returns an output chain.
STACKRESTORE,
- // CALLSEQ_START/CALLSEQ_END - These operators mark the beginning and end of
- // a call sequence, and carry arbitrary information that target might want
- // to know. The first operand is a chain, the rest are specified by the
- // target and not touched by the DAG optimizers.
- // CALLSEQ_START..CALLSEQ_END pairs may not be nested.
+ /// CALLSEQ_START/CALLSEQ_END - These operators mark the beginning and end
+ /// of a call sequence, and carry arbitrary information that target might
+ /// want to know. The first operand is a chain, the rest are specified by
+ /// the target and not touched by the DAG optimizers.
+ /// CALLSEQ_START..CALLSEQ_END pairs may not be nested.
CALLSEQ_START, // Beginning of a call sequence
CALLSEQ_END, // End of a call sequence
- // VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE,
- // and the alignment. It returns a pair of values: the vaarg value and a
- // new chain.
+ /// VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE,
+ /// and the alignment. It returns a pair of values: the vaarg value and a
+ /// new chain.
VAARG,
- // VACOPY - VACOPY has five operands: an input chain, a destination pointer,
- // a source pointer, a SRCVALUE for the destination, and a SRCVALUE for the
- // source.
+ /// VACOPY - VACOPY has 5 operands: an input chain, a destination pointer,
+ /// a source pointer, a SRCVALUE for the destination, and a SRCVALUE for the
+ /// source.
VACOPY,
- // VAEND, VASTART - VAEND and VASTART have three operands: an input chain, a
- // pointer, and a SRCVALUE.
+ /// VAEND, VASTART - VAEND and VASTART have three operands: an input chain,
+ /// a pointer, and a SRCVALUE.
VAEND, VASTART,
- // SRCVALUE - This is a node type that holds a Value* that is used to
- // make reference to a value in the LLVM IR.
+ /// SRCVALUE - This is a node type that holds a Value* that is used to
+ /// make reference to a value in the LLVM IR.
SRCVALUE,
- // MDNODE_SDNODE - This is a node that holdes an MDNode*, which is used to
- // reference metadata in the IR.
+ /// MDNODE_SDNODE - This is a node that holds an MDNode*, which is used to
+ /// reference metadata in the IR.
MDNODE_SDNODE,
- // PCMARKER - This corresponds to the pcmarker intrinsic.
+ /// PCMARKER - This corresponds to the pcmarker intrinsic.
PCMARKER,
- // READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
- // The only operand is a chain and a value and a chain are produced. The
- // value is the contents of the architecture specific cycle counter like
- // register (or other high accuracy low latency clock source)
+ /// READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
+ /// The only operand is a chain; a value and a chain are produced. The
+ /// value is the contents of the architecture-specific cycle counter-like
+ /// register (or other high accuracy low latency clock source).
READCYCLECOUNTER,
- // HANDLENODE node - Used as a handle for various purposes.
+ /// HANDLENODE node - Used as a handle for various purposes.
HANDLENODE,
- // INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic. It
- // takes as input a token chain, the pointer to the trampoline, the pointer
- // to the nested function, the pointer to pass for the 'nest' parameter, a
- // SRCVALUE for the trampoline and another for the nested function (allowing
- // targets to access the original Function*). It produces a token chain as
- // output.
+ /// INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic. It
+ /// takes as input a token chain, the pointer to the trampoline, the pointer
+ /// to the nested function, the pointer to pass for the 'nest' parameter, a
+ /// SRCVALUE for the trampoline and another for the nested function
+ /// (allowing targets to access the original Function*).
+ /// It produces a token chain as output.
INIT_TRAMPOLINE,
- // ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
- // It takes a pointer to the trampoline and produces a (possibly) new
- // pointer to the same trampoline with platform-specific adjustments
- // applied. The pointer it returns points to an executable block of code.
+ /// ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
+ /// It takes a pointer to the trampoline and produces a (possibly) new
+ /// pointer to the same trampoline with platform-specific adjustments
+ /// applied. The pointer it returns points to an executable block of code.
ADJUST_TRAMPOLINE,
- // TRAP - Trapping instruction
+ /// TRAP - Trapping instruction
TRAP,
- // PREFETCH - This corresponds to a prefetch intrinsic. It takes chains are
- // their first operand. The other operands are the address to prefetch,
- // read / write specifier, locality specifier and instruction / data cache
- // specifier.
+ /// DEBUGTRAP - Trap intended to get the attention of a debugger.
+ DEBUGTRAP,
+
+ /// PREFETCH - This corresponds to a prefetch intrinsic. The first operand
+ /// is the chain. The other operands are the address to prefetch,
+ /// read / write specifier, locality specifier and instruction / data cache
+ /// specifier.
PREFETCH,
- // OUTCHAIN = MEMBARRIER(INCHAIN, load-load, load-store, store-load,
- // store-store, device)
- // This corresponds to the memory.barrier intrinsic.
- // it takes an input chain, 4 operands to specify the type of barrier, an
- // operand specifying if the barrier applies to device and uncached memory
- // and produces an output chain.
+ /// OUTCHAIN = MEMBARRIER(INCHAIN, load-load, load-store, store-load,
+ /// store-store, device)
+ /// This corresponds to the memory.barrier intrinsic.
+ /// It takes an input chain, 4 operands to specify the type of barrier, an
+ /// operand specifying if the barrier applies to device and uncached memory
+ /// and produces an output chain.
MEMBARRIER,
- // OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
- // This corresponds to the fence instruction. It takes an input chain, and
- // two integer constants: an AtomicOrdering and a SynchronizationScope.
+ /// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
+ /// This corresponds to the fence instruction. It takes an input chain, and
+ /// two integer constants: an AtomicOrdering and a SynchronizationScope.
ATOMIC_FENCE,
- // Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr)
- // This corresponds to "load atomic" instruction.
+ /// Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr)
+ /// This corresponds to the "load atomic" instruction.
ATOMIC_LOAD,
- // OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr, val)
- // This corresponds to "store atomic" instruction.
+ /// OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val)
+ /// This corresponds to the "store atomic" instruction.
ATOMIC_STORE,
- // Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
- // This corresponds to the cmpxchg instruction.
+ /// Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
+ /// This corresponds to the cmpxchg instruction.
ATOMIC_CMP_SWAP,
- // Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt)
- // Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt)
- // These correspond to the atomicrmw instruction.
+ /// Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt)
+ /// Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt)
+ /// These correspond to the atomicrmw instruction.
ATOMIC_SWAP,
ATOMIC_LOAD_ADD,
ATOMIC_LOAD_SUB,
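These nodes mirror the IR-level cmpxchg/atomicrmw instructions that C++11 atomics lower to; the same semantics standalone, for illustration:

    #include <atomic>

    int cmp_swap(std::atomic<int> &ptr, int cmp, int swap) {
      ptr.compare_exchange_strong(cmp, swap); // cmpxchg -> ATOMIC_CMP_SWAP
      return cmp;                             // old value, like the node's Val
    }

    int fetch_add(std::atomic<int> &ptr, int amt) {
      return ptr.fetch_add(amt);              // atomicrmw add -> ATOMIC_LOAD_ADD
    }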
@@ -790,16 +799,16 @@ namespace ISD {
/// CvtCode enum - This enum defines the various converts CONVERT_RNDSAT
/// supports.
enum CvtCode {
- CVT_FF, // Float from Float
- CVT_FS, // Float from Signed
- CVT_FU, // Float from Unsigned
- CVT_SF, // Signed from Float
- CVT_UF, // Unsigned from Float
- CVT_SS, // Signed from Signed
- CVT_SU, // Signed from Unsigned
- CVT_US, // Unsigned from Signed
- CVT_UU, // Unsigned from Unsigned
- CVT_INVALID // Marker - Invalid opcode
+ CVT_FF, ///< Float from Float
+ CVT_FS, ///< Float from Signed
+ CVT_FU, ///< Float from Unsigned
+ CVT_SF, ///< Signed from Float
+ CVT_UF, ///< Unsigned from Float
+ CVT_SS, ///< Signed from Signed
+ CVT_SU, ///< Signed from Unsigned
+ CVT_US, ///< Unsigned from Signed
+ CVT_UU, ///< Unsigned from Unsigned
+ CVT_INVALID ///< Marker - Invalid opcode
};
} // end llvm::ISD namespace
diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h
index eb01f66c..8414c64 100644
--- a/include/llvm/CodeGen/LexicalScopes.h
+++ b/include/llvm/CodeGen/LexicalScopes.h
@@ -158,7 +158,10 @@ class LexicalScope {
public:
LexicalScope(LexicalScope *P, const MDNode *D, const MDNode *I, bool A)
: Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(A),
- LastInsn(0), FirstInsn(0), DFSIn(0), DFSOut(0), IndentLevel(0) {
+ LastInsn(0), FirstInsn(0), DFSIn(0), DFSOut(0) {
+#ifndef NDEBUG
+ IndentLevel = 0;
+#endif
if (Parent)
Parent->addChild(this);
}
@@ -241,7 +244,9 @@ private:
const MachineInstr *FirstInsn; // First instruction of this scope.
unsigned DFSIn, DFSOut; // In & Out Depth use to determine
// scope nesting.
+#ifndef NDEBUG
mutable unsigned IndentLevel; // Private state for dump()
+#endif
};
} // end llvm namespace
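The same compile-out pattern in miniature: debug-only state is declared and initialized only when assertions are enabled, so release builds shed the storage entirely:

    class Scope {
    #ifndef NDEBUG
      mutable unsigned IndentLevel; // dump()-only state
    #endif
    public:
      Scope() {
    #ifndef NDEBUG
        IndentLevel = 0;
    #endif
      }
    };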
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index a6008ab..a3ce47c 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -40,15 +40,6 @@ namespace llvm {
/// definition and use points.
///
class VNInfo {
- private:
- enum {
- HAS_PHI_KILL = 1,
- IS_PHI_DEF = 1 << 1,
- IS_UNUSED = 1 << 2
- };
-
- unsigned char flags;
-
public:
typedef BumpPtrAllocator Allocator;
@@ -60,60 +51,30 @@ namespace llvm {
/// VNInfo constructor.
VNInfo(unsigned i, SlotIndex d)
- : flags(0), id(i), def(d)
+ : id(i), def(d)
{ }
/// VNInfo constructor, copies values from orig, except for the value number.
VNInfo(unsigned i, const VNInfo &orig)
- : flags(orig.flags), id(i), def(orig.def)
+ : id(i), def(orig.def)
{ }
/// Copy from the parameter into this VNInfo.
void copyFrom(VNInfo &src) {
- flags = src.flags;
def = src.def;
}
- /// Used for copying value number info.
- unsigned getFlags() const { return flags; }
- void setFlags(unsigned flags) { this->flags = flags; }
-
- /// Merge flags from another VNInfo
- void mergeFlags(const VNInfo *VNI) {
- flags = (flags | VNI->flags) & ~IS_UNUSED;
- }
-
- /// Returns true if one or more kills are PHI nodes.
- /// Obsolete, do not use!
- bool hasPHIKill() const { return flags & HAS_PHI_KILL; }
- /// Set the PHI kill flag on this value.
- void setHasPHIKill(bool hasKill) {
- if (hasKill)
- flags |= HAS_PHI_KILL;
- else
- flags &= ~HAS_PHI_KILL;
- }
-
/// Returns true if this value is defined by a PHI instruction (or was,
/// PHI instructions may have been eliminated).
- bool isPHIDef() const { return flags & IS_PHI_DEF; }
- /// Set the "phi def" flag on this value.
- void setIsPHIDef(bool phiDef) {
- if (phiDef)
- flags |= IS_PHI_DEF;
- else
- flags &= ~IS_PHI_DEF;
- }
+ /// PHI-defs begin at a block boundary, all other defs begin at register or
+ /// EC slots.
+ bool isPHIDef() const { return def.isBlock(); }
/// Returns true if this value is unused.
- bool isUnused() const { return flags & IS_UNUSED; }
- /// Set the "is unused" flag on this value.
- void setIsUnused(bool unused) {
- if (unused)
- flags |= IS_UNUSED;
- else
- flags &= ~IS_UNUSED;
- }
+ bool isUnused() const { return !def.isValid(); }
+
+ /// Mark this value as unused.
+ void markUnused() { def = SlotIndex(); }
};
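With the flag bits gone, these properties derive from the def slot; a hedged sketch of the replacement idioms (LI and Idx are assumed to be a LiveInterval and a SlotIndex already in scope):

    VNInfo *VNI = LI.getVNInfoAt(Idx);
    if (VNI && VNI->isPHIDef()) {
      // def.isBlock(): PHI-defs start at the block boundary slot.
    }
    if (VNI && VNI->isUnused()) {
      // !def.isValid(): the value no longer has a definition.
    }
    // Retiring a value is now VNI->markUnused() instead of setIsUnused(true).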
/// LiveRange structure - This represents a simple register range in the
@@ -274,6 +235,11 @@ namespace llvm {
return VNI;
}
+ /// createDeadDef - Make sure the interval has a value defined at Def.
+ /// If one already exists, return it. Otherwise allocate a new value and
+ /// add liveness for a dead def.
+ VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator &VNInfoAllocator);
+
/// Create a copy of the given value. The new value will be identical except
/// for the Value number.
VNInfo *createValueCopy(const VNInfo *orig,
@@ -288,17 +254,6 @@ namespace llvm {
/// unused values.
void RenumberValues(LiveIntervals &lis);
- /// isOnlyLROfValNo - Return true if the specified live range is the only
- /// one defined by the its val#.
- bool isOnlyLROfValNo(const LiveRange *LR) {
- for (const_iterator I = begin(), E = end(); I != E; ++I) {
- const LiveRange *Tmp = I;
- if (Tmp != LR && Tmp->valno == LR->valno)
- return false;
- }
- return true;
- }
-
/// MergeValueNumberInto - This method is called when two value numbers
/// are found to be equivalent. This eliminates V1, replacing all
/// LiveRanges with the V1 value number with the V2 value number. This can
@@ -377,14 +332,6 @@ namespace llvm {
return I == end() ? 0 : &*I;
}
- const LiveRange *getLiveRangeBefore(SlotIndex Idx) const {
- return getLiveRangeContaining(Idx.getPrevSlot());
- }
-
- LiveRange *getLiveRangeBefore(SlotIndex Idx) {
- return getLiveRangeContaining(Idx.getPrevSlot());
- }
-
/// getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
VNInfo *getVNInfoAt(SlotIndex Idx) const {
const_iterator I = FindLiveRangeContaining(Idx);
@@ -411,11 +358,6 @@ namespace llvm {
return I != end() && I->start <= Idx ? I : end();
}
- /// findDefinedVNInfo - Find the by the specified
- /// index (register interval) or defined
- VNInfo *findDefinedVNInfoForRegInt(SlotIndex Idx) const;
-
-
/// overlaps - Return true if the intersection of the two live intervals is
/// not empty.
bool overlaps(const LiveInterval& other) const {
@@ -498,10 +440,6 @@ namespace llvm {
weight = HUGE_VALF;
}
- /// ComputeJoinedWeight - Set the weight of a live interval after
- /// Other has been merged into it.
- void ComputeJoinedWeight(const LiveInterval &Other);
-
bool operator<(const LiveInterval& other) const {
const SlotIndex &thisIndex = beginIndex();
const SlotIndex &otherIndex = other.beginIndex();
@@ -509,15 +447,27 @@ namespace llvm {
(thisIndex == otherIndex && reg < other.reg));
}
- void print(raw_ostream &OS, const TargetRegisterInfo *TRI = 0) const;
+ void print(raw_ostream &OS) const;
void dump() const;
+ /// \brief Walk the interval and assert if any invariants fail to hold.
+ ///
+ /// Note that this is a no-op when asserts are disabled.
+#ifdef NDEBUG
+ void verify() const {}
+#else
+ void verify() const;
+#endif
+
private:
Ranges::iterator addRangeFrom(LiveRange LR, Ranges::iterator From);
void extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd);
Ranges::iterator extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStr);
void markValNoForDeletion(VNInfo *V);
+ void mergeIntervalRanges(const LiveInterval &RHS,
+ VNInfo *LHSValNo = 0,
+ const VNInfo *RHSValNo = 0);
LiveInterval& operator=(const LiveInterval& rhs); // DO NOT IMPLEMENT
@@ -528,6 +478,91 @@ namespace llvm {
return OS;
}
+ /// LiveRangeQuery - Query information about a live range around a given
+ /// instruction. This class hides the implementation details of live ranges,
+ /// and it should be used as the primary interface for examining live ranges
+ /// around instructions.
+ ///
+ class LiveRangeQuery {
+ VNInfo *EarlyVal;
+ VNInfo *LateVal;
+ SlotIndex EndPoint;
+ bool Kill;
+
+ public:
+ /// Create a LiveRangeQuery for the given live range and instruction index.
+ /// The sub-instruction slot of Idx doesn't matter, only the instruction it
+ /// refers to is considered.
+ LiveRangeQuery(const LiveInterval &LI, SlotIndex Idx)
+ : EarlyVal(0), LateVal(0), Kill(false) {
+ // Find the segment that enters the instruction.
+ LiveInterval::const_iterator I = LI.find(Idx.getBaseIndex());
+ LiveInterval::const_iterator E = LI.end();
+ if (I == E)
+ return;
+ // Is this an instruction live-in segment?
+ if (SlotIndex::isEarlierInstr(I->start, Idx)) {
+ EarlyVal = I->valno;
+ EndPoint = I->end;
+ // Move to the potentially live-out segment.
+ if (SlotIndex::isSameInstr(Idx, I->end)) {
+ Kill = true;
+ if (++I == E)
+ return;
+ }
+ }
+ // I now points to the segment that may be live-through, or defined by
+ // this instr. Ignore segments starting after the current instr.
+ if (SlotIndex::isEarlierInstr(Idx, I->start))
+ return;
+ LateVal = I->valno;
+ EndPoint = I->end;
+ }
+
+ /// Return the value that is live-in to the instruction. This is the value
+ /// that will be read by the instruction's use operands. Return NULL if no
+ /// value is live-in.
+ VNInfo *valueIn() const {
+ return EarlyVal;
+ }
+
+ /// Return true if the live-in value is killed by this instruction. This
+ /// means that either the live range ends at the instruction, or it changes
+ /// value.
+ bool isKill() const {
+ return Kill;
+ }
+
+ /// Return true if this instruction has a dead def.
+ bool isDeadDef() const {
+ return EndPoint.isDead();
+ }
+
+ /// Return the value leaving the instruction, if any. This can be a
+ /// live-through value, or a live def. A dead def returns NULL.
+ VNInfo *valueOut() const {
+ return isDeadDef() ? 0 : LateVal;
+ }
+
+ /// Return the value defined by this instruction, if any. This includes
+ /// dead defs, it is the value created by the instruction's def operands.
+ VNInfo *valueDefined() const {
+ return EarlyVal == LateVal ? 0 : LateVal;
+ }
+
+ /// Return the end point of the last live range segment to interact with
+ /// the instruction, if any.
+ ///
+ /// The end point is an invalid SlotIndex only if the live range doesn't
+ /// intersect the instruction at all.
+ ///
+ /// The end point may be at or past the end of the instruction's basic
+ /// block. That means the value was live out of the block.
+ SlotIndex endPoint() const {
+ return EndPoint;
+ }
+ };
+
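A hedged usage sketch for the new query class (LIS is the LiveIntervals analysis, LI a LiveInterval, and MI a MachineInstr*, all assumed in scope):

    SlotIndex Idx = LIS->getInstructionIndex(MI);
    LiveRangeQuery LRQ(LI, Idx);
    if (VNInfo *InVal = LRQ.valueIn()) {
      // MI reads InVal; LRQ.isKill() tells whether this use ends the range.
      (void)InVal;
    }
    if (LRQ.valueDefined() && LRQ.isDeadDef()) {
      // MI defines a value that is never read.
    }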
/// ConnectedVNInfoEqClasses - Helper class that can divide VNInfos in a
/// LiveInterval into equivalence classes of connected components. A
/// LiveInterval that has multiple connected components can be broken into
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index 76201c9..da521db 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -20,12 +20,13 @@
#ifndef LLVM_CODEGEN_LIVEINTERVAL_ANALYSIS_H
#define LLVM_CODEGEN_LIVEINTERVAL_ANALYSIS_H
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Allocator.h"
@@ -35,7 +36,9 @@
namespace llvm {
class AliasAnalysis;
+ class LiveRangeCalc;
class LiveVariables;
+ class MachineDominatorTree;
class MachineLoopInfo;
class TargetRegisterInfo;
class MachineRegisterInfo;
@@ -44,27 +47,29 @@ namespace llvm {
class VirtRegMap;
class LiveIntervals : public MachineFunctionPass {
- MachineFunction* mf_;
- MachineRegisterInfo* mri_;
- const TargetMachine* tm_;
- const TargetRegisterInfo* tri_;
- const TargetInstrInfo* tii_;
- AliasAnalysis *aa_;
- LiveVariables* lv_;
- SlotIndexes* indexes_;
+ MachineFunction* MF;
+ MachineRegisterInfo* MRI;
+ const TargetMachine* TM;
+ const TargetRegisterInfo* TRI;
+ const TargetInstrInfo* TII;
+ AliasAnalysis *AA;
+ LiveVariables* LV;
+ SlotIndexes* Indexes;
+ MachineDominatorTree *DomTree;
+ LiveRangeCalc *LRCalc;
/// Special pool allocator for VNInfo's (LiveInterval val#).
///
VNInfo::Allocator VNInfoAllocator;
- typedef DenseMap<unsigned, LiveInterval*> Reg2IntervalMap;
- Reg2IntervalMap r2iMap_;
+ /// Live interval pointers for all the virtual registers.
+ IndexedMap<LiveInterval*, VirtReg2IndexFunctor> VirtRegIntervals;
- /// allocatableRegs_ - A bit vector of allocatable registers.
- BitVector allocatableRegs_;
+ /// AllocatableRegs - A bit vector of allocatable registers.
+ BitVector AllocatableRegs;
- /// reservedRegs_ - A bit vector of reserved registers.
- BitVector reservedRegs_;
+ /// ReservedRegs - A bit vector of reserved registers.
+ BitVector ReservedRegs;
/// RegMaskSlots - Sorted list of instructions with register mask operands.
/// Always use the 'r' slot, RegMasks are normal clobbers, not early
@@ -92,83 +97,59 @@ namespace llvm {
/// block.
SmallVector<std::pair<unsigned, unsigned>, 8> RegMaskBlocks;
+ /// RegUnitIntervals - Keep a live interval for each register unit as a way
+ /// of tracking fixed physreg interference.
+ SmallVector<LiveInterval*, 0> RegUnitIntervals;
+
public:
static char ID; // Pass identification, replacement for typeid
- LiveIntervals() : MachineFunctionPass(ID) {
- initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
- }
+ LiveIntervals();
+ virtual ~LiveIntervals();
// Calculate the spill weight to assign to a single instruction.
static float getSpillWeight(bool isDef, bool isUse, unsigned loopDepth);
- typedef Reg2IntervalMap::iterator iterator;
- typedef Reg2IntervalMap::const_iterator const_iterator;
- const_iterator begin() const { return r2iMap_.begin(); }
- const_iterator end() const { return r2iMap_.end(); }
- iterator begin() { return r2iMap_.begin(); }
- iterator end() { return r2iMap_.end(); }
- unsigned getNumIntervals() const { return (unsigned)r2iMap_.size(); }
-
- LiveInterval &getInterval(unsigned reg) {
- Reg2IntervalMap::iterator I = r2iMap_.find(reg);
- assert(I != r2iMap_.end() && "Interval does not exist for register");
- return *I->second;
+ LiveInterval &getInterval(unsigned Reg) {
+ LiveInterval *LI = VirtRegIntervals[Reg];
+ assert(LI && "Interval does not exist for virtual register");
+ return *LI;
}
- const LiveInterval &getInterval(unsigned reg) const {
- Reg2IntervalMap::const_iterator I = r2iMap_.find(reg);
- assert(I != r2iMap_.end() && "Interval does not exist for register");
- return *I->second;
+ const LiveInterval &getInterval(unsigned Reg) const {
+ return const_cast<LiveIntervals*>(this)->getInterval(Reg);
}
- bool hasInterval(unsigned reg) const {
- return r2iMap_.count(reg);
+ bool hasInterval(unsigned Reg) const {
+ return VirtRegIntervals.inBounds(Reg) && VirtRegIntervals[Reg];
}
/// isAllocatable - is the physical register reg allocatable in the current
/// function?
bool isAllocatable(unsigned reg) const {
- return allocatableRegs_.test(reg);
+ return AllocatableRegs.test(reg);
}
/// isReserved - is the physical register reg reserved in the current
/// function?
bool isReserved(unsigned reg) const {
- return reservedRegs_.test(reg);
- }
-
- /// getScaledIntervalSize - get the size of an interval in "units,"
- /// where every function is composed of one thousand units. This
- /// measure scales properly with empty index slots in the function.
- double getScaledIntervalSize(LiveInterval& I) {
- return (1000.0 * I.getSize()) / indexes_->getIndexesLength();
- }
-
- /// getFuncInstructionCount - Return the number of instructions in the
- /// current function.
- unsigned getFuncInstructionCount() {
- return indexes_->getFunctionSize();
+ return ReservedRegs.test(reg);
}
- /// getApproximateInstructionCount - computes an estimate of the number
- /// of instructions in a given LiveInterval.
- unsigned getApproximateInstructionCount(LiveInterval& I) {
- double IntervalPercentage = getScaledIntervalSize(I) / 1000.0;
- return (unsigned)(IntervalPercentage * indexes_->getFunctionSize());
+ // Interval creation.
+ LiveInterval &getOrCreateInterval(unsigned Reg) {
+ if (!hasInterval(Reg)) {
+ VirtRegIntervals.grow(Reg);
+ VirtRegIntervals[Reg] = createInterval(Reg);
+ }
+ return getInterval(Reg);
}
- // Interval creation
- LiveInterval &getOrCreateInterval(unsigned reg) {
- Reg2IntervalMap::iterator I = r2iMap_.find(reg);
- if (I == r2iMap_.end())
- I = r2iMap_.insert(std::make_pair(reg, createInterval(reg))).first;
- return *I->second;
+ // Interval removal.
+ void removeInterval(unsigned Reg) {
+ delete VirtRegIntervals[Reg];
+ VirtRegIntervals[Reg] = 0;
}
- /// dupInterval - Duplicate a live interval. The caller is responsible for
- /// managing the allocated memory.
- LiveInterval *dupInterval(LiveInterval *li);
-
/// addLiveRangeToEndOfBlock - Given a register and an instruction,
/// adds a live range from that instruction to the end of its MBB.
LiveRange addLiveRangeToEndOfBlock(unsigned reg,
@@ -184,42 +165,38 @@ namespace llvm {
bool shrinkToUses(LiveInterval *li,
SmallVectorImpl<MachineInstr*> *dead = 0);
- // Interval removal
-
- void removeInterval(unsigned Reg) {
- DenseMap<unsigned, LiveInterval*>::iterator I = r2iMap_.find(Reg);
- delete I->second;
- r2iMap_.erase(I);
+ SlotIndexes *getSlotIndexes() const {
+ return Indexes;
}
- SlotIndexes *getSlotIndexes() const {
- return indexes_;
+ AliasAnalysis *getAliasAnalysis() const {
+ return AA;
}
/// isNotInMIMap - returns true if the specified machine instr has been
/// removed or was never entered in the map.
bool isNotInMIMap(const MachineInstr* Instr) const {
- return !indexes_->hasIndex(Instr);
+ return !Indexes->hasIndex(Instr);
}
/// Returns the base index of the given instruction.
SlotIndex getInstructionIndex(const MachineInstr *instr) const {
- return indexes_->getInstructionIndex(instr);
+ return Indexes->getInstructionIndex(instr);
}
/// Returns the instruction associated with the given index.
MachineInstr* getInstructionFromIndex(SlotIndex index) const {
- return indexes_->getInstructionFromIndex(index);
+ return Indexes->getInstructionFromIndex(index);
}
/// Return the first index in the given basic block.
SlotIndex getMBBStartIdx(const MachineBasicBlock *mbb) const {
- return indexes_->getMBBStartIdx(mbb);
+ return Indexes->getMBBStartIdx(mbb);
}
/// Return the last index in the given basic block.
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const {
- return indexes_->getMBBEndIdx(mbb);
+ return Indexes->getMBBEndIdx(mbb);
}
bool isLiveInToMBB(const LiveInterval &li,
@@ -233,24 +210,24 @@ namespace llvm {
}
MachineBasicBlock* getMBBFromIndex(SlotIndex index) const {
- return indexes_->getMBBFromIndex(index);
+ return Indexes->getMBBFromIndex(index);
}
SlotIndex InsertMachineInstrInMaps(MachineInstr *MI) {
- return indexes_->insertMachineInstrInMaps(MI);
+ return Indexes->insertMachineInstrInMaps(MI);
}
void RemoveMachineInstrFromMaps(MachineInstr *MI) {
- indexes_->removeMachineInstrFromMaps(MI);
+ Indexes->removeMachineInstrFromMaps(MI);
}
void ReplaceMachineInstrInMaps(MachineInstr *MI, MachineInstr *NewMI) {
- indexes_->replaceMachineInstrInMaps(MI, NewMI);
+ Indexes->replaceMachineInstrInMaps(MI, NewMI);
}
bool findLiveInMBBs(SlotIndex Start, SlotIndex End,
SmallVectorImpl<MachineBasicBlock*> &MBBs) const {
- return indexes_->findLiveInMBBs(Start, End, MBBs);
+ return Indexes->findLiveInMBBs(Start, End, MBBs);
}
VNInfo::Allocator& getVNInfoAllocator() { return VNInfoAllocator; }
@@ -264,18 +241,15 @@ namespace llvm {
/// print - Implement the dump method.
virtual void print(raw_ostream &O, const Module* = 0) const;
- /// isReMaterializable - Returns true if every definition of MI of every
- /// val# of the specified interval is re-materializable. Also returns true
- /// by reference if all of the defs are load instructions.
- bool isReMaterializable(const LiveInterval &li,
- const SmallVectorImpl<LiveInterval*> *SpillIs,
- bool &isLoad);
-
/// intervalIsInOneMBB - If LI is confined to a single basic block, return
/// a pointer to that block. If LI is live in to or out of any block,
/// return NULL.
MachineBasicBlock *intervalIsInOneMBB(const LiveInterval &LI) const;
+ /// Returns true if VNI is killed by any PHI-def values in LI.
+ /// This may conservatively return true to avoid expensive computations.
+ bool hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const;
+
/// addKillFlags - Add kill flags to any instruction that kills a virtual
/// register.
void addKillFlags();
@@ -337,13 +311,47 @@ namespace llvm {
bool checkRegMaskInterference(LiveInterval &LI,
BitVector &UsableRegs);
+ // Register unit functions.
+ //
+ // Fixed interference occurs when MachineInstrs use physregs directly
+ // instead of virtual registers. This typically happens when passing
+ // arguments to a function call, or when instructions require operands in
+ // fixed registers.
+ //
+ // Each physreg has one or more register units, see MCRegisterInfo. We
+ // track liveness per register unit to handle aliasing registers more
+ // efficiently.
+
+ /// getRegUnit - Return the live range for Unit.
+ /// It will be computed if it doesn't exist.
+ LiveInterval &getRegUnit(unsigned Unit) {
+ LiveInterval *LI = RegUnitIntervals[Unit];
+ if (!LI) {
+ // Compute missing ranges on demand.
+ RegUnitIntervals[Unit] = LI = new LiveInterval(Unit, HUGE_VALF);
+ computeRegUnitInterval(LI);
+ }
+ return *LI;
+ }
+
+ /// getCachedRegUnit - Return the live range for Unit if it has already
+ /// been computed, or NULL if it hasn't been computed yet.
+ LiveInterval *getCachedRegUnit(unsigned Unit) {
+ return RegUnitIntervals[Unit];
+ }
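A sketch of checking fixed interference against a virtual register's interval, one unit at a time (PhysReg, TRI, LIS and VirtLI assumed in scope):

    for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
      if (LIS.getRegUnit(*Units).overlaps(VirtLI))
        return true; // PhysReg interferes with VirtLI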
+
private:
/// computeIntervals - Compute live intervals.
void computeIntervals();
+ /// Compute live intervals for all virtual registers.
+ void computeVirtRegs();
+
+ /// Compute RegMaskSlots and RegMaskBits.
+ void computeRegMasks();
+
/// handleRegisterDef - update intervals for a register def
- /// (calls handlePhysicalRegisterDef and
- /// handleVirtualRegisterDef)
+ /// (calls handleVirtualRegisterDef)
void handleRegisterDef(MachineBasicBlock *MBB,
MachineBasicBlock::iterator MI,
SlotIndex MIIdx,
@@ -363,43 +371,15 @@ namespace llvm {
unsigned MOIdx,
LiveInterval& interval);
- /// handlePhysicalRegisterDef - update intervals for a physical register
- /// def.
- void handlePhysicalRegisterDef(MachineBasicBlock* mbb,
- MachineBasicBlock::iterator mi,
- SlotIndex MIIdx, MachineOperand& MO,
- LiveInterval &interval);
-
- /// handleLiveInRegister - Create interval for a livein register.
- void handleLiveInRegister(MachineBasicBlock* mbb,
- SlotIndex MIIdx,
- LiveInterval &interval);
-
- /// getReMatImplicitUse - If the remat definition MI has one (for now, we
- /// only allow one) virtual register operand, then its uses are implicitly
- /// using the register. Returns the virtual register.
- unsigned getReMatImplicitUse(const LiveInterval &li,
- MachineInstr *MI) const;
-
- /// isValNoAvailableAt - Return true if the val# of the specified interval
- /// which reaches the given instruction also reaches the specified use
- /// index.
- bool isValNoAvailableAt(const LiveInterval &li, MachineInstr *MI,
- SlotIndex UseIdx) const;
-
- /// isReMaterializable - Returns true if the definition MI of the specified
- /// val# of the specified interval is re-materializable. Also returns true
- /// by reference if the def is a load.
- bool isReMaterializable(const LiveInterval &li, const VNInfo *ValNo,
- MachineInstr *MI,
- const SmallVectorImpl<LiveInterval*> *SpillIs,
- bool &isLoad);
-
static LiveInterval* createInterval(unsigned Reg);
void printInstrs(raw_ostream &O) const;
void dumpInstrs() const;
+ void computeLiveInRegUnits();
+ void computeRegUnitInterval(LiveInterval*);
+ void computeVirtRegInterval(LiveInterval*);
+
class HMEditor;
};
} // End llvm namespace
diff --git a/include/llvm/CodeGen/LiveRangeEdit.h b/include/llvm/CodeGen/LiveRangeEdit.h
index 57a6193..def7b00 100644
--- a/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/include/llvm/CodeGen/LiveRangeEdit.h
@@ -55,29 +55,29 @@ public:
};
private:
- LiveInterval &parent_;
- SmallVectorImpl<LiveInterval*> &newRegs_;
+ LiveInterval *Parent;
+ SmallVectorImpl<LiveInterval*> &NewRegs;
MachineRegisterInfo &MRI;
LiveIntervals &LIS;
VirtRegMap *VRM;
const TargetInstrInfo &TII;
- Delegate *const delegate_;
+ Delegate *const TheDelegate;
- /// firstNew_ - Index of the first register added to newRegs_.
- const unsigned firstNew_;
+ /// FirstNew - Index of the first register added to NewRegs.
+ const unsigned FirstNew;
- /// scannedRemattable_ - true when remattable values have been identified.
- bool scannedRemattable_;
+ /// ScannedRemattable - true when remattable values have been identified.
+ bool ScannedRemattable;
- /// remattable_ - Values defined by remattable instructions as identified by
+ /// Remattable - Values defined by remattable instructions as identified by
/// tii.isTriviallyReMaterializable().
- SmallPtrSet<const VNInfo*,4> remattable_;
+ SmallPtrSet<const VNInfo*,4> Remattable;
- /// rematted_ - Values that were actually rematted, and so need to have their
+ /// Rematted - Values that were actually rematted, and so need to have their
/// live range trimmed or entirely removed.
- SmallPtrSet<const VNInfo*,4> rematted_;
+ SmallPtrSet<const VNInfo*,4> Rematted;
- /// scanRemattable - Identify the parent_ values that may rematerialize.
+ /// scanRemattable - Identify the Parent values that may rematerialize.
void scanRemattable(AliasAnalysis *aa);
/// allUsesAvailableAt - Return true if all registers used by OrigMI at
@@ -99,32 +99,35 @@ public:
/// @param vrm Map of virtual registers to physical registers for this
/// function. If NULL, no virtual register map updates will
/// be done. This could be the case if called before Regalloc.
- LiveRangeEdit(LiveInterval &parent,
+ LiveRangeEdit(LiveInterval *parent,
SmallVectorImpl<LiveInterval*> &newRegs,
MachineFunction &MF,
LiveIntervals &lis,
VirtRegMap *vrm,
Delegate *delegate = 0)
- : parent_(parent), newRegs_(newRegs),
+ : Parent(parent), NewRegs(newRegs),
MRI(MF.getRegInfo()), LIS(lis), VRM(vrm),
TII(*MF.getTarget().getInstrInfo()),
- delegate_(delegate),
- firstNew_(newRegs.size()),
- scannedRemattable_(false) {}
+ TheDelegate(delegate),
+ FirstNew(newRegs.size()),
+ ScannedRemattable(false) {}
- LiveInterval &getParent() const { return parent_; }
- unsigned getReg() const { return parent_.reg; }
+ LiveInterval &getParent() const {
+ assert(Parent && "No parent LiveInterval");
+ return *Parent;
+ }
+ unsigned getReg() const { return getParent().reg; }
/// Iterator for accessing the new registers added by this edit.
typedef SmallVectorImpl<LiveInterval*>::const_iterator iterator;
- iterator begin() const { return newRegs_.begin()+firstNew_; }
- iterator end() const { return newRegs_.end(); }
- unsigned size() const { return newRegs_.size()-firstNew_; }
+ iterator begin() const { return NewRegs.begin()+FirstNew; }
+ iterator end() const { return NewRegs.end(); }
+ unsigned size() const { return NewRegs.size()-FirstNew; }
bool empty() const { return size() == 0; }
- LiveInterval *get(unsigned idx) const { return newRegs_[idx+firstNew_]; }
+ LiveInterval *get(unsigned idx) const { return NewRegs[idx+FirstNew]; }
ArrayRef<LiveInterval*> regs() const {
- return makeArrayRef(newRegs_).slice(firstNew_);
+ return makeArrayRef(NewRegs).slice(FirstNew);
}
/// createFrom - Create a new virtual register based on OldReg.
@@ -174,12 +177,12 @@ public:
/// markRematerialized - explicitly mark a value as rematerialized after doing
/// it manually.
void markRematerialized(const VNInfo *ParentVNI) {
- rematted_.insert(ParentVNI);
+ Rematted.insert(ParentVNI);
}
/// didRematerialize - Return true if ParentVNI was rematerialized anywhere.
bool didRematerialize(const VNInfo *ParentVNI) const {
- return rematted_.count(ParentVNI);
+ return Rematted.count(ParentVNI);
}
/// eraseVirtReg - Notify the delegate that Reg is no longer in use, and try
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index ef9c0c2..c917bd8 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -143,10 +143,7 @@ public:
IterTy MII;
public:
- bundle_iterator(IterTy mii) : MII(mii) {
- assert(!MII->isInsideBundle() &&
- "It's not legal to initialize bundle_iterator with a bundled MI");
- }
+ bundle_iterator(IterTy mii) : MII(mii) {}
bundle_iterator(Ty &mi) : MII(mi) {
assert(!mi.isInsideBundle() &&
@@ -156,7 +153,10 @@ public:
assert((!mi || !mi->isInsideBundle()) &&
"It's not legal to initialize bundle_iterator with a bundled MI");
}
- bundle_iterator(const bundle_iterator &I) : MII(I.MII) {}
+ // Template allows conversion from const to nonconst.
+ template<class OtherTy, class OtherIterTy>
+ bundle_iterator(const bundle_iterator<OtherTy, OtherIterTy> &I)
+ : MII(I.getInstrIterator()) {}
bundle_iterator() : MII(0) {}
Ty &operator*() const { return *MII; }
@@ -173,29 +173,24 @@ public:
// Increment and decrement operators...
bundle_iterator &operator--() { // predecrement - Back up
- do {
- --MII;
- } while (MII->isInsideBundle());
+ do --MII;
+ while (MII->isInsideBundle());
return *this;
}
bundle_iterator &operator++() { // preincrement - Advance
- do {
- ++MII;
- } while (MII->isInsideBundle());
+ IterTy E = MII->getParent()->instr_end();
+ do ++MII;
+ while (MII != E && MII->isInsideBundle());
return *this;
}
bundle_iterator operator--(int) { // postdecrement operators...
bundle_iterator tmp = *this;
- do {
- --MII;
- } while (MII->isInsideBundle());
+ --*this;
return tmp;
}
bundle_iterator operator++(int) { // postincrement operators...
bundle_iterator tmp = *this;
- do {
- ++MII;
- } while (MII->isInsideBundle());
+ ++*this;
return tmp;
}
@@ -235,42 +230,14 @@ public:
reverse_instr_iterator instr_rend () { return Insts.rend(); }
const_reverse_instr_iterator instr_rend () const { return Insts.rend(); }
- iterator begin() { return Insts.begin(); }
- const_iterator begin() const { return Insts.begin(); }
- iterator end() {
- instr_iterator II = instr_end();
- if (II != instr_begin()) {
- while (II->isInsideBundle())
- --II;
- }
- return II;
- }
- const_iterator end() const {
- const_instr_iterator II = instr_end();
- if (II != instr_begin()) {
- while (II->isInsideBundle())
- --II;
- }
- return II;
- }
- reverse_iterator rbegin() {
- reverse_instr_iterator II = instr_rbegin();
- if (II != instr_rend()) {
- while (II->isInsideBundle())
- ++II;
- }
- return II;
- }
- const_reverse_iterator rbegin() const {
- const_reverse_instr_iterator II = instr_rbegin();
- if (II != instr_rend()) {
- while (II->isInsideBundle())
- ++II;
- }
- return II;
- }
- reverse_iterator rend () { return Insts.rend(); }
- const_reverse_iterator rend () const { return Insts.rend(); }
+ iterator begin() { return instr_begin(); }
+ const_iterator begin() const { return instr_begin(); }
+ iterator end () { return instr_end(); }
+ const_iterator end () const { return instr_end(); }
+ reverse_iterator rbegin() { return instr_rbegin(); }
+ const_reverse_iterator rbegin() const { return instr_rbegin(); }
+ reverse_iterator rend () { return instr_rend(); }
+ const_reverse_iterator rend () const { return instr_rend(); }
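With begin()/end() forwarding to the instr_ variants, a plain iterator loop still visits only bundle headers and unbundled instructions, because operator++ skips MIs inside a bundle. A sketch (MBB assumed in scope):

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I)
      dbgs() << *I; // each top-level instruction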
// Machine-CFG iterators
@@ -412,6 +379,10 @@ public:
/// which refer to fromMBB to refer to this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB);
+ /// isPredecessor - Return true if the specified MBB is a predecessor of this
+ /// block.
+ bool isPredecessor(const MachineBasicBlock *MBB) const;
+
/// isSuccessor - Return true if the specified MBB is a successor of this
/// block.
bool isSuccessor(const MachineBasicBlock *MBB) const;
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 44402a9..8b958e4 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -359,7 +359,7 @@ public:
assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
"Invalid Object Idx!");
Objects[ObjectIdx+NumFixedObjects].Alignment = Align;
- MaxAlignment = std::max(MaxAlignment, Align);
+ ensureMaxAlignment(Align);
}
/// NeedsStackProtector - Returns true if the object may need stack
@@ -416,9 +416,11 @@ public:
///
unsigned getMaxAlignment() const { return MaxAlignment; }
- /// setMaxAlignment - Set the preferred alignment.
- ///
- void setMaxAlignment(unsigned Align) { MaxAlignment = Align; }
+ /// ensureMaxAlignment - Make sure the function is at least Align bytes
+ /// aligned.
+ void ensureMaxAlignment(unsigned Align) {
+ if (MaxAlignment < Align) MaxAlignment = Align;
+ }
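A sketch of the caller-side effect (values illustrative; MFI is an in-scope MachineFrameInfo*):

    int FI = MFI->CreateStackObject(/*Size=*/32, /*Alignment=*/16, /*isSS=*/false);
    // Calls ensureMaxAlignment(16); a later 8-byte-aligned object leaves
    // getMaxAlignment() at 16.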
/// AdjustsStack - Return true if this function adjusts the stack -- e.g.,
/// when calling another function. This is only valid during and after
@@ -485,7 +487,7 @@ public:
Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP));
int Index = (int)Objects.size() - NumFixedObjects - 1;
assert(Index >= 0 && "Bad frame index!");
- MaxAlignment = std::max(MaxAlignment, Alignment);
+ ensureMaxAlignment(Alignment);
return Index;
}
@@ -496,7 +498,7 @@ public:
int CreateSpillStackObject(uint64_t Size, unsigned Alignment) {
CreateStackObject(Size, Alignment, true, false);
int Index = (int)Objects.size() - NumFixedObjects - 1;
- MaxAlignment = std::max(MaxAlignment, Alignment);
+ ensureMaxAlignment(Alignment);
return Index;
}
@@ -515,7 +517,7 @@ public:
int CreateVariableSizedObject(unsigned Alignment) {
HasVarSizedObjects = true;
Objects.push_back(StackObject(0, Alignment, 0, false, false, true));
- MaxAlignment = std::max(MaxAlignment, Alignment);
+ ensureMaxAlignment(Alignment);
return (int)Objects.size()-NumFixedObjects-1;
}
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index dda2dc7..062c750 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -189,8 +189,8 @@ public:
///
void setAlignment(unsigned A) { Alignment = A; }
- /// EnsureAlignment - Make sure the function is at least 1 << A bytes aligned.
- void EnsureAlignment(unsigned A) {
+ /// ensureAlignment - Make sure the function is at least 1 << A bytes aligned.
+ void ensureAlignment(unsigned A) {
if (Alignment < A) Alignment = A;
}
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 65093d7..4c5eb8b 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -635,6 +635,30 @@ public:
getOperand(0).getSubReg() == getOperand(1).getSubReg();
}
+ /// isTransient - Return true if this is a transient instruction, i.e. one
+ /// that is very likely to be eliminated during register allocation (such as
+ /// a copy-like instruction) or that has no execution-time cost.
+ bool isTransient() const {
+ switch(getOpcode()) {
+ default: return false;
+ // Copy-like instructions are usually eliminated during register allocation.
+ case TargetOpcode::PHI:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ // Pseudo-instructions that don't produce any real output.
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ }
+ }
+
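For example, a size heuristic might skip transient instructions; a sketch (MBB assumed in scope):

    unsigned Size = 0;
    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I)
      if (!I->isTransient())
        ++Size; // count only instructions with an execution-time cost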
/// getBundleSize - Return the number of instructions inside the MI bundle.
unsigned getBundleSize() const;
@@ -912,12 +936,12 @@ private:
/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
/// this instruction from their respective use lists. This requires that the
/// operands already be on their use lists.
- void RemoveRegOperandsFromUseLists();
+ void RemoveRegOperandsFromUseLists(MachineRegisterInfo&);
/// AddRegOperandsToUseLists - Add all of the register operands in
/// this instruction from their respective use lists. This requires that the
/// operands not be on their use lists yet.
- void AddRegOperandsToUseLists(MachineRegisterInfo &RegInfo);
+ void AddRegOperandsToUseLists(MachineRegisterInfo&);
/// hasPropertyInBundle - Slow path for hasProperty when we're dealing with a
/// bundle.
diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index 99849a6..654361f 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -34,6 +34,7 @@ namespace RegState {
Undef = 0x20,
EarlyClobber = 0x40,
Debug = 0x80,
+ InternalRead = 0x100,
DefineNoRead = Define | Undef,
ImplicitDefine = Implicit | Define,
ImplicitKill = Implicit | Kill
@@ -67,7 +68,8 @@ public:
flags & RegState::Undef,
flags & RegState::EarlyClobber,
SubReg,
- flags & RegState::Debug));
+ flags & RegState::Debug,
+ flags & RegState::InternalRead));
return *this;
}
@@ -106,6 +108,12 @@ public:
return *this;
}
+ const MachineInstrBuilder &addTargetIndex(unsigned Idx, int64_t Offset = 0,
+ unsigned char TargetFlags = 0) const {
+ MI->addOperand(MachineOperand::CreateTargetIndex(Idx, Offset, TargetFlags));
+ return *this;
+ }
+
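A sketch of use (index/offset semantics are target-defined; MIB is an in-scope MachineInstrBuilder):

    MIB.addTargetIndex(/*Idx=*/0, /*Offset=*/8); // emits a MO_TargetIndex operand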
const MachineInstrBuilder &addJumpTableIndex(unsigned Idx,
unsigned char TargetFlags = 0) const {
MI->addOperand(MachineOperand::CreateJTI(Idx, TargetFlags));
@@ -310,6 +318,9 @@ inline unsigned getDeadRegState(bool B) {
inline unsigned getUndefRegState(bool B) {
return B ? RegState::Undef : 0;
}
+inline unsigned getInternalReadRegState(bool B) {
+ return B ? RegState::InternalRead : 0;
+}
} // End llvm namespace
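Like the other RegState helpers, the new one composes into addReg flag masks; a sketch with placeholder names:

    MIB.addReg(SrcReg, getKillRegState(IsKill) | getInternalReadRegState(IsInternal));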
diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h
index 0fb4969..dc5f9a6 100644
--- a/include/llvm/CodeGen/MachineInstrBundle.h
+++ b/include/llvm/CodeGen/MachineInstrBundle.h
@@ -43,14 +43,14 @@ bool finalizeBundles(MachineFunction &MF);
/// getBundleStart - Returns the first instruction in the bundle containing MI.
///
-static inline MachineInstr *getBundleStart(MachineInstr *MI) {
+inline MachineInstr *getBundleStart(MachineInstr *MI) {
MachineBasicBlock::instr_iterator I = MI;
while (I->isInsideBundle())
--I;
return I;
}
-static inline const MachineInstr *getBundleStart(const MachineInstr *MI) {
+inline const MachineInstr *getBundleStart(const MachineInstr *MI) {
MachineBasicBlock::const_instr_iterator I = MI;
while (I->isInsideBundle())
--I;
diff --git a/include/llvm/CodeGen/MachineJumpTableInfo.h b/include/llvm/CodeGen/MachineJumpTableInfo.h
index 6bd6682..f7c4e86 100644
--- a/include/llvm/CodeGen/MachineJumpTableInfo.h
+++ b/include/llvm/CodeGen/MachineJumpTableInfo.h
@@ -10,9 +10,9 @@
// The MachineJumpTableInfo class keeps track of jump tables referenced by
// lowered switch instructions in the MachineFunction.
//
-// Instructions reference the address of these jump tables through the use of
-// MO_JumpTableIndex values. When emitting assembly or machine code, these
-// virtual address references are converted to refer to the address of the
+// Instructions reference the address of these jump tables through the use of
+// MO_JumpTableIndex values. When emitting assembly or machine code, these
+// virtual address references are converted to refer to the address of the
// function jump tables.
//
//===----------------------------------------------------------------------===//
@@ -34,11 +34,11 @@ class raw_ostream;
struct MachineJumpTableEntry {
/// MBBs - The vector of basic blocks from which to create the jump table.
std::vector<MachineBasicBlock*> MBBs;
-
+
explicit MachineJumpTableEntry(const std::vector<MachineBasicBlock*> &M)
: MBBs(M) {}
};
-
+
class MachineJumpTableInfo {
public:
/// JTEntryKind - This enum indicates how each entry of the jump table is
@@ -57,7 +57,7 @@ public:
/// with a relocation as gp-relative, e.g.:
/// .gprel32 LBB123
EK_GPRel32BlockAddress,
-
+
/// EK_LabelDifference32 - Each entry is the address of the block minus
/// the address of the jump table. This is used for PIC jump tables where
/// gprel32 is not supported. e.g.:
@@ -80,18 +80,18 @@ private:
std::vector<MachineJumpTableEntry> JumpTables;
public:
explicit MachineJumpTableInfo(JTEntryKind Kind): EntryKind(Kind) {}
-
+
JTEntryKind getEntryKind() const { return EntryKind; }
/// getEntrySize - Return the size of each entry in the jump table.
unsigned getEntrySize(const TargetData &TD) const;
/// getEntryAlignment - Return the alignment of each entry in the jump table.
unsigned getEntryAlignment(const TargetData &TD) const;
-
+
/// createJumpTableIndex - Create a new jump table.
///
unsigned createJumpTableIndex(const std::vector<MachineBasicBlock*> &DestBBs);
-
+
/// isEmpty - Return true if there are no jump tables.
///
bool isEmpty() const { return JumpTables.empty(); }
@@ -105,7 +105,7 @@ public:
void RemoveJumpTable(unsigned Idx) {
JumpTables[Idx].MBBs.clear();
}
-
+
/// ReplaceMBBInJumpTables - If Old is the target of any jump tables, update
/// the jump tables to branch to New instead.
bool ReplaceMBBInJumpTables(MachineBasicBlock *Old, MachineBasicBlock *New);
diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h
index 6dd9440..3e204be 100644
--- a/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/include/llvm/CodeGen/MachineLoopInfo.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the MachineLoopInfo class that is used to identify natural
+// This file defines the MachineLoopInfo class that is used to identify natural
// loops and determine the loop depth of various nodes of the CFG. Note that
// natural loops may actually be several loops that share the same header node.
//
@@ -35,6 +35,12 @@
namespace llvm {
+// Implementation in LoopInfoImpl.h
+#ifdef __GNUC__
+class MachineLoop;
+__extension__ extern template class LoopBase<MachineBasicBlock, MachineLoop>;
+#endif
+
class MachineLoop : public LoopBase<MachineBasicBlock, MachineLoop> {
public:
MachineLoop();
@@ -57,6 +63,12 @@ private:
: LoopBase<MachineBasicBlock, MachineLoop>(MBB) {}
};
+// Implementation in LoopInfoImpl.h
+#ifdef __GNUC__
+__extension__ extern template
+class LoopInfoBase<MachineBasicBlock, MachineLoop>;
+#endif
+
class MachineLoopInfo : public MachineFunctionPass {
LoopInfoBase<MachineBasicBlock, MachineLoop> LI;
friend class LoopBase<MachineBasicBlock, MachineLoop>;
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index d244dd9..37d42b3 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -14,6 +14,7 @@
#ifndef LLVM_CODEGEN_MACHINEOPERAND_H
#define LLVM_CODEGEN_MACHINEOPERAND_H
+#include "llvm/ADT/Hashing.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
@@ -44,6 +45,7 @@ public:
MO_MachineBasicBlock, ///< MachineBasicBlock reference
MO_FrameIndex, ///< Abstract Stack Frame Index
MO_ConstantPoolIndex, ///< Address of indexed Constant in Constant Pool
+ MO_TargetIndex, ///< Target-dependent index+offset operand.
MO_JumpTableIndex, ///< Address of indexed Jump Table for switch
MO_ExternalSymbol, ///< Name of external global symbol
MO_GlobalAddress, ///< Address of a global value
@@ -148,7 +150,7 @@ private:
struct { // For MO_Register.
// Register number is in SmallContents.RegNo.
- MachineOperand **Prev; // Access list for register.
+ MachineOperand *Prev; // Access list for register. See MRI.
MachineOperand *Next;
} Reg;
@@ -214,6 +216,8 @@ public:
bool isFI() const { return OpKind == MO_FrameIndex; }
/// isCPI - Tests if this is a MO_ConstantPoolIndex operand.
bool isCPI() const { return OpKind == MO_ConstantPoolIndex; }
+ /// isTargetIndex - Tests if this is a MO_TargetIndex operand.
+ bool isTargetIndex() const { return OpKind == MO_TargetIndex; }
/// isJTI - Tests if this is a MO_JumpTableIndex operand.
bool isJTI() const { return OpKind == MO_JumpTableIndex; }
/// isGlobal - Tests if this is a MO_GlobalAddress operand.
@@ -301,13 +305,6 @@ public:
return !isUndef() && !isInternalRead() && (isUse() || getSubReg());
}
- /// getNextOperandForReg - Return the next MachineOperand in the function that
- /// uses or defines this register.
- MachineOperand *getNextOperandForReg() const {
- assert(isReg() && "This is not a register operand!");
- return Contents.Reg.Next;
- }
-
//===--------------------------------------------------------------------===//
// Mutators for Register Operands
//===--------------------------------------------------------------------===//
@@ -334,17 +331,9 @@ public:
///
void substPhysReg(unsigned Reg, const TargetRegisterInfo&);
- void setIsUse(bool Val = true) {
- assert(isReg() && "Wrong MachineOperand accessor");
- assert((Val || !isDebug()) && "Marking a debug operation as def");
- IsDef = !Val;
- }
+ void setIsUse(bool Val = true) { setIsDef(!Val); }
- void setIsDef(bool Val = true) {
- assert(isReg() && "Wrong MachineOperand accessor");
- assert((!Val || !isDebug()) && "Marking a debug operation as def");
- IsDef = Val;
- }
+ void setIsDef(bool Val = true);
void setImplicit(bool Val = true) {
assert(isReg() && "Wrong MachineOperand accessor");
@@ -407,7 +396,7 @@ public:
}
int getIndex() const {
- assert((isFI() || isCPI() || isJTI()) &&
+ assert((isFI() || isCPI() || isTargetIndex() || isJTI()) &&
"Wrong MachineOperand accessor");
return Contents.OffsetedInfo.Val.Index;
}
@@ -430,8 +419,8 @@ public:
/// getOffset - Return the offset from the symbol in this operand. This always
/// returns 0 for ExternalSymbol operands.
int64_t getOffset() const {
- assert((isGlobal() || isSymbol() || isCPI() || isBlockAddress()) &&
- "Wrong MachineOperand accessor");
+ assert((isGlobal() || isSymbol() || isCPI() || isTargetIndex() ||
+ isBlockAddress()) && "Wrong MachineOperand accessor");
return (int64_t(Contents.OffsetedInfo.OffsetHi) << 32) |
SmallContents.OffsetLo;
}
@@ -478,14 +467,14 @@ public:
}
void setOffset(int64_t Offset) {
- assert((isGlobal() || isSymbol() || isCPI() || isBlockAddress()) &&
- "Wrong MachineOperand accessor");
+ assert((isGlobal() || isSymbol() || isCPI() || isTargetIndex() ||
+ isBlockAddress()) && "Wrong MachineOperand accessor");
SmallContents.OffsetLo = unsigned(Offset);
Contents.OffsetedInfo.OffsetHi = int(Offset >> 32);
}
void setIndex(int Idx) {
- assert((isFI() || isCPI() || isJTI()) &&
+ assert((isFI() || isCPI() || isTargetIndex() || isJTI()) &&
"Wrong MachineOperand accessor");
Contents.OffsetedInfo.Val.Index = Idx;
}
@@ -503,6 +492,13 @@ public:
/// operand. Note: This method ignores isKill and isDead properties.
bool isIdenticalTo(const MachineOperand &Other) const;
+ /// \brief MachineOperand hash_value overload.
+ ///
+ /// Note that this includes the same information in the hash that
+ /// isIdenticalTo uses for comparison. It is thus suited for use in hash
+ /// tables which use that function for equality comparisons only.
+ friend hash_code hash_value(const MachineOperand &MO);
+
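A sketch of the intended use, hashing all operands of an instruction for a table keyed on isIdenticalTo-style equality (MI assumed in scope):

    hash_code H = hash_combine_range(MI->operands_begin(), MI->operands_end());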
/// ChangeToImmediate - Replace this operand with a new immediate operand of
/// the specified value. If an operand is known to be an immediate already,
/// the setImm method should be used.
@@ -542,14 +538,15 @@ public:
bool isUndef = false,
bool isEarlyClobber = false,
unsigned SubReg = 0,
- bool isDebug = false) {
+ bool isDebug = false,
+ bool isInternalRead = false) {
MachineOperand Op(MachineOperand::MO_Register);
Op.IsDef = isDef;
Op.IsImp = isImp;
Op.IsKill = isKill;
Op.IsDead = isDead;
Op.IsUndef = isUndef;
- Op.IsInternalRead = false;
+ Op.IsInternalRead = isInternalRead;
Op.IsEarlyClobber = isEarlyClobber;
Op.IsDebug = isDebug;
Op.SmallContents.RegNo = Reg;
@@ -578,6 +575,14 @@ public:
Op.setTargetFlags(TargetFlags);
return Op;
}
+ static MachineOperand CreateTargetIndex(unsigned Idx, int64_t Offset,
+ unsigned char TargetFlags = 0) {
+ MachineOperand Op(MachineOperand::MO_TargetIndex);
+ Op.setIndex(Idx);
+ Op.setOffset(Offset);
+ Op.setTargetFlags(TargetFlags);
+ return Op;
+ }
static MachineOperand CreateJTI(unsigned Idx,
unsigned char TargetFlags = 0) {
MachineOperand Op(MachineOperand::MO_JumpTableIndex);
@@ -653,15 +658,6 @@ private:
assert(isReg() && "Can only add reg operand to use lists");
return Contents.Reg.Prev != 0;
}
-
- /// AddRegOperandToRegInfo - Add this register operand to the specified
- /// MachineRegisterInfo. If it is null, then the next/prev fields should be
- /// explicitly nulled out.
- void AddRegOperandToRegInfo(MachineRegisterInfo *RegInfo);
-
- /// RemoveRegOperandFromRegInfo - Remove this register operand from the
- /// MachineRegisterInfo it is linked with.
- void RemoveRegOperandFromRegInfo();
};
inline raw_ostream &operator<<(raw_ostream &OS, const MachineOperand& MO) {
diff --git a/include/llvm/CodeGen/MachinePassRegistry.h b/include/llvm/CodeGen/MachinePassRegistry.h
index c41e8e26..90ee7f4 100644
--- a/include/llvm/CodeGen/MachinePassRegistry.h
+++ b/include/llvm/CodeGen/MachinePassRegistry.h
@@ -26,7 +26,7 @@ namespace llvm {
typedef void *(*MachinePassCtor)();
-//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
///
/// MachinePassRegistryListener - Listener for additions to and removals from
/// the registration list.
@@ -42,7 +42,7 @@ public:
};
-//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
///
/// MachinePassRegistryNode - Machine pass node stored in registration list.
///
@@ -55,7 +55,7 @@ private:
const char *Name; // Name of function pass.
const char *Description; // Description string.
MachinePassCtor Ctor; // Function pass creator.
-
+
public:
MachinePassRegistryNode(const char *N, const char *D, MachinePassCtor C)
@@ -72,11 +72,11 @@ public:
const char *getDescription() const { return Description; }
MachinePassCtor getCtor() const { return Ctor; }
void setNext(MachinePassRegistryNode *N) { Next = N; }
-
+
};
-//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
///
/// MachinePassRegistry - Track the registration of machine passes.
///
@@ -88,7 +88,7 @@ private:
MachinePassRegistryNode *List; // List of registry nodes.
MachinePassCtor Default; // Default function pass creator.
MachinePassRegistryListener* Listener;// Listener for list adds and removes.
-
+
public:
// NO CONSTRUCTOR - we don't want static constructor ordering to mess
@@ -99,6 +99,7 @@ public:
MachinePassRegistryNode *getList() { return List; }
MachinePassCtor getDefault() { return Default; }
void setDefault(MachinePassCtor C) { Default = C; }
+ void setDefault(StringRef Name);
void setListener(MachinePassRegistryListener *L) { Listener = L; }
/// Add - Adds a function pass to the registration list.
@@ -126,7 +127,7 @@ public:
void initialize(cl::Option &O) {
cl::parser<typename RegistryClass::FunctionPassCtor>::initialize(O);
-
+
// Add existing passes to option.
for (RegistryClass *Node = RegistryClass::getList();
Node; Node = Node->getNext()) {
@@ -134,7 +135,7 @@ public:
(typename RegistryClass::FunctionPassCtor)Node->getCtor(),
Node->getDescription());
}
-
+
// Make sure we listen for list changes.
RegistryClass::setListener(this);
}
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index 3272fbd..42a8aa4 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -57,6 +57,26 @@ class MachineRegisterInfo {
/// physical registers.
MachineOperand **PhysRegUseDefLists;
+ /// getRegUseDefListHead - Return the head pointer for the register use/def
+ /// list for the specified virtual or physical register.
+ MachineOperand *&getRegUseDefListHead(unsigned RegNo) {
+ if (TargetRegisterInfo::isVirtualRegister(RegNo))
+ return VRegInfo[RegNo].second;
+ return PhysRegUseDefLists[RegNo];
+ }
+
+ MachineOperand *getRegUseDefListHead(unsigned RegNo) const {
+ if (TargetRegisterInfo::isVirtualRegister(RegNo))
+ return VRegInfo[RegNo].second;
+ return PhysRegUseDefLists[RegNo];
+ }
+
+ /// Get the next element in the use-def chain.
+ static MachineOperand *getNextOperandForReg(const MachineOperand *MO) {
+ assert(MO && MO->isReg() && "This is not a register operand!");
+ return MO->Contents.Reg.Next;
+ }
+
/// UsedPhysRegs - This is a bit vector that is computed and set by the
/// register allocator, and must be kept up to date by passes that run after
/// register allocation (though most don't modify this). This is used
@@ -129,12 +149,21 @@ public:
// Register Info
//===--------------------------------------------------------------------===//
+ // Strictly for use by MachineInstr.cpp.
+ void addRegOperandToUseList(MachineOperand *MO);
+
+ // Strictly for use by MachineInstr.cpp.
+ void removeRegOperandFromUseList(MachineOperand *MO);
+
/// reg_begin/reg_end - Provide iteration support to walk over all definitions
/// and uses of a register within the MachineFunction that corresponds to this
/// MachineRegisterInfo object.
template<bool Uses, bool Defs, bool SkipDebug>
class defusechain_iterator;
+ // Make it a friend so it can access getNextOperandForReg().
+ template<bool, bool, bool> friend class defusechain_iterator;
+
/// reg_iterator/reg_begin/reg_end - Walk all defs and uses of the specified
/// register.
typedef defusechain_iterator<true,true,false> reg_iterator;
@@ -172,6 +201,15 @@ public:
/// specified register (it may be live-in).
bool def_empty(unsigned RegNo) const { return def_begin(RegNo) == def_end(); }
+ /// hasOneDef - Return true if there is exactly one instruction defining the
+ /// specified register.
+ bool hasOneDef(unsigned RegNo) const {
+ def_iterator DI = def_begin(RegNo);
+ if (DI == def_end())
+ return false;
+ return ++DI == def_end();
+ }
+
/// use_iterator/use_begin/use_end - Walk all uses of the specified register.
typedef defusechain_iterator<true,false,false> use_iterator;
use_iterator use_begin(unsigned RegNo) const {
@@ -185,7 +223,12 @@ public:
/// hasOneUse - Return true if there is exactly one instruction using the
/// specified register.
- bool hasOneUse(unsigned RegNo) const;
+ bool hasOneUse(unsigned RegNo) const {
+ use_iterator UI = use_begin(RegNo);
+ if (UI == use_end())
+ return false;
+ return ++UI == use_end();
+ }
/// use_nodbg_iterator/use_nodbg_begin/use_nodbg_end - Walk all uses of the
/// specified register, skipping those marked as Debug.
@@ -218,25 +261,16 @@ public:
/// constraints.
void replaceRegWith(unsigned FromReg, unsigned ToReg);
- /// getRegUseDefListHead - Return the head pointer for the register use/def
- /// list for the specified virtual or physical register.
- MachineOperand *&getRegUseDefListHead(unsigned RegNo) {
- if (TargetRegisterInfo::isVirtualRegister(RegNo))
- return VRegInfo[RegNo].second;
- return PhysRegUseDefLists[RegNo];
- }
-
- MachineOperand *getRegUseDefListHead(unsigned RegNo) const {
- if (TargetRegisterInfo::isVirtualRegister(RegNo))
- return VRegInfo[RegNo].second;
- return PhysRegUseDefLists[RegNo];
- }
-
/// getVRegDef - Return the machine instr that defines the specified virtual
/// register or null if none is found. This assumes that the code is in SSA
/// form, so there should only be one definition.
MachineInstr *getVRegDef(unsigned Reg) const;
+ /// getUniqueVRegDef - Return the unique machine instr that defines the
+ /// specified virtual register or null if none is found. If there are
+ /// multiple definitions or no definition, return null.
+ MachineInstr *getUniqueVRegDef(unsigned Reg) const;
+
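A sketch of a typical fold enabled by this (names are placeholders):

    if (MachineInstr *DefMI = MRI.getUniqueVRegDef(SrcReg))
      if (DefMI->isFullCopy())
        SrcReg = DefMI->getOperand(1).getReg(); // look through a unique copy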
/// clearKillFlags - Iterate over all the uses of the given register and
/// clear the kill flag from the MachineOperand. This function is used by
/// optimization passes which extend register lifetimes and need only
@@ -336,7 +370,7 @@ public:
bool isPhysRegOrOverlapUsed(unsigned Reg) const {
if (UsedPhysRegMask.test(Reg))
return true;
- for (const uint16_t *AI = TRI->getOverlaps(Reg); *AI; ++AI)
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
if (UsedPhysRegs.test(*AI))
return true;
return false;
@@ -434,10 +468,6 @@ public:
const TargetRegisterInfo &TRI,
const TargetInstrInfo &TII);
-private:
- void HandleVRegListReallocation();
-
-public:
/// defusechain_iterator - This class provides iterator support for machine
/// operands in the function that use or define a specific register. If
/// ReturnUses is true it returns uses of registers, if ReturnDefs is true it
@@ -481,13 +511,22 @@ public:
// Iterator traversal: forward iteration only
defusechain_iterator &operator++() { // Preincrement
assert(Op && "Cannot increment end iterator!");
- Op = Op->getNextOperandForReg();
-
- // If this is an operand we don't care about, skip it.
- while (Op && ((!ReturnUses && Op->isUse()) ||
- (!ReturnDefs && Op->isDef()) ||
- (SkipDebug && Op->isDebug())))
- Op = Op->getNextOperandForReg();
+ Op = getNextOperandForReg(Op);
+
+ // All defs come before the uses, so stop def_iterator early.
+ if (!ReturnUses) {
+ if (Op) {
+ if (Op->isUse())
+ Op = 0;
+ else
+ assert(!Op->isDebug() && "Can't have debug defs");
+ }
+ } else {
+ // If this is an operand we don't care about, skip it.
+ while (Op && ((!ReturnDefs && Op->isDef()) ||
+ (SkipDebug && Op->isDebug())))
+ Op = getNextOperandForReg(Op);
+ }
return *this;
}
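Because defs now precede uses on the chain, a def_iterator walk terminates at the first use; a sketch (MRI and Reg assumed in scope):

    for (MachineRegisterInfo::def_iterator DI = MRI.def_begin(Reg),
         DE = MRI.def_end(); DI != DE; ++DI)
      DI->setIsDead(false); // clear stale dead flags on every def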
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index e852009..8da2045 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -19,7 +19,7 @@
// createCustomMachineSched);
//
// Inside <Target>PassConfig:
-// enablePass(MachineSchedulerID);
+// enablePass(&MachineSchedulerID);
// MachineSchedRegistry::setDefault(createCustomMachineSched);
//
//===----------------------------------------------------------------------===//
@@ -35,6 +35,7 @@ class AliasAnalysis;
class LiveIntervals;
class MachineDominatorTree;
class MachineLoopInfo;
+class RegisterClassInfo;
class ScheduleDAGInstrs;
/// MachineSchedContext provides enough context from the MachineScheduler pass
@@ -47,7 +48,10 @@ struct MachineSchedContext {
AliasAnalysis *AA;
LiveIntervals *LIS;
- MachineSchedContext(): MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) {}
+ RegisterClassInfo *RegClassInfo;
+
+ MachineSchedContext();
+ virtual ~MachineSchedContext();
};
/// MachineSchedRegistry provides a selection of available machine instruction
@@ -81,6 +85,9 @@ public:
static void setDefault(ScheduleDAGCtor C) {
Registry.setDefault((MachinePassCtor)C);
}
+ static void setDefault(StringRef Name) {
+ Registry.setDefault(Name);
+ }
static void setListener(MachinePassRegistryListener *L) {
Registry.setListener(L);
}
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index e76fe99..07b3b45 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -24,6 +24,7 @@ namespace llvm {
class FunctionPass;
class MachineFunctionPass;
class PassInfo;
+ class PassManagerBase;
class TargetLowering;
class TargetRegisterClass;
class raw_ostream;
@@ -31,8 +32,6 @@ namespace llvm {
namespace llvm {
-extern char &NoPassID; // Allow targets to choose not to run a pass.
-
class PassConfigImpl;
/// Target-Independent Code Generator Pass Configuration Options.
@@ -54,9 +53,15 @@ public:
/// optimization after regalloc.
static char PostRAMachineLICMID;
+private:
+ PassManagerBase *PM;
+ AnalysisID StartAfter;
+ AnalysisID StopAfter;
+ bool Started;
+ bool Stopped;
+
protected:
TargetMachine *TM;
- PassManagerBase *PM;
PassConfigImpl *Impl; // Internal data structures
bool Initialized; // Flagged after all passes are configured.
@@ -91,6 +96,18 @@ public:
CodeGenOpt::Level getOptLevel() const { return TM->getOptLevel(); }
+ /// setStartStopPasses - Set the StartAfter and StopAfter passes to allow
+ /// running only a portion of the normal code-gen pass sequence. If the
+ /// Start pass ID is zero, then compilation will begin at the normal point;
+ /// otherwise, clear the Started flag to indicate that passes should not be
+ /// added until the starting pass is seen. If the Stop pass ID is zero,
+ /// then compilation will continue to the end.
+ void setStartStopPasses(AnalysisID Start, AnalysisID Stop) {
+ StartAfter = Start;
+ StopAfter = Stop;
+ Started = (StartAfter == 0);
+ }
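A sketch of a tool restricting codegen to a sub-pipeline (the pass choice is illustrative; IDs are the usual extern char addresses):

    PassConfig->setStartStopPasses(&MachineSchedulerID, &BranchFolderPassID);
    // Nothing is added until the MachineScheduler slot has been seen, and
    // nothing after BranchFolder.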
+
void setDisableVerify(bool Disable) { setOpt(DisableVerify, Disable); }
bool getEnableTailMerge() const { return EnableTailMerge; }
@@ -98,16 +115,19 @@ public:
/// Allow the target to override a specific pass without overriding the pass
/// pipeline. When passes are added to the standard pipeline at the
- /// point where StadardID is expected, add TargetID in its place.
- void substitutePass(char &StandardID, char &TargetID);
+ /// point where StandardID is expected, add TargetID in its place.
+ void substitutePass(AnalysisID StandardID, AnalysisID TargetID);
+
+ /// Insert InsertedPassID pass after TargetPassID pass.
+ void insertPass(AnalysisID TargetPassID, AnalysisID InsertedPassID);
/// Allow the target to enable a specific standard pass by default.
- void enablePass(char &ID) { substitutePass(ID, ID); }
+ void enablePass(AnalysisID PassID) { substitutePass(PassID, PassID); }
/// Allow the target to disable a specific standard pass by default.
- void disablePass(char &ID) { substitutePass(ID, NoPassID); }
+ void disablePass(AnalysisID PassID) { substitutePass(PassID, 0); }
- /// Return the pass ssubtituted for StandardID by the target.
+ /// Return the pass substituted for StandardID by the target.
/// If no substitution exists, return StandardID.
AnalysisID getPassSubstitution(AnalysisID StandardID) const;
@@ -118,6 +138,9 @@ public:
/// transforms following machine independent optimization.
virtual void addIRPasses();
+ /// Add passes to lower exception handling for the code generator.
+ void addPassesToHandleExceptions();
+
/// Add common passes that perform LLVM IR to IR transforms in preparation for
/// instruction selection.
virtual void addISelPrepare();
@@ -172,6 +195,18 @@ protected:
/// LLVMTargetMachine provides standard regalloc passes for most targets.
virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
+ /// addPreRewrite - Add passes to the optimized register allocation pipeline
+ /// after register allocation is complete, but before virtual registers are
+ /// rewritten to physical registers.
+ ///
+ /// These passes must preserve VirtRegMap and LiveIntervals, and when running
+ /// after RABasic or RAGreedy, they should take advantage of LiveRegMatrix.
+ /// When these passes run, VirtRegMap contains legal physreg assignments for
+ /// all virtual registers.
+ virtual bool addPreRewrite() {
+ return false;
+ }
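A sketch of a target override (the pass ID is hypothetical; any pass added here must preserve VirtRegMap and LiveIntervals):

    virtual bool addPreRewrite() {
      addPass(&MyPreRewriteFixupID); // hypothetical target pass
      return true;
    }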
+
/// addFinalizeRegAlloc - This method may be implemented by targets that want
/// to run passes within the regalloc pipeline, immediately after the register
/// allocation pass itself. These passes run as soon as virtual registers
@@ -216,8 +251,12 @@ protected:
///
/// Add a CodeGen pass at this point in the pipeline after checking overrides.
- /// Return the pass that was added, or NoPassID.
- AnalysisID addPass(char &ID);
+ /// Return the pass that was added, or zero if no pass was added.
+ AnalysisID addPass(AnalysisID PassID);
+
+ /// Add a pass to the PassManager if that pass is supposed to be run, as
+ /// determined by the StartAfter and StopAfter options.
+ void addPass(Pass *P);
/// addMachinePasses helper to create the target-selected or overridden
/// regalloc pass.
@@ -226,7 +265,7 @@ protected:
/// printAndVerify - Add a pass to dump then verify the machine function, if
/// those steps are enabled.
///
- void printAndVerify(const char *Banner) const;
+ void printAndVerify(const char *Banner);
};
} // namespace llvm
@@ -276,6 +315,10 @@ namespace llvm {
/// This pass is still in development
extern char &StrongPHIEliminationID;
+ /// LiveIntervals - This analysis keeps track of the live ranges of virtual
+ /// and physical registers.
+ extern char &LiveIntervalsID;
+
/// LiveStacks pass. An analysis keeping track of the liveness of stack slots.
extern char &LiveStacksID;
@@ -297,6 +340,10 @@ namespace llvm {
/// basic blocks.
extern char &SpillPlacementID;
+ /// VirtRegRewriter pass. Rewrite virtual registers to physical registers as
+ /// assigned in VirtRegMap.
+ extern char &VirtRegRewriterID;
+
/// UnreachableMachineBlockElimination - This pass removes unreachable
/// machine basic blocks.
extern char &UnreachableMachineBlockElimID;
@@ -342,10 +389,21 @@ namespace llvm {
/// branches.
extern char &BranchFolderPassID;
+ /// MachineFunctionPrinterPass - This pass prints out MachineInstr's.
+ extern char &MachineFunctionPrinterPassID;
+
/// TailDuplicate - Duplicate blocks with unconditional branches
/// into tails of their predecessors.
extern char &TailDuplicateID;
+ /// MachineTraceMetrics - This pass computes critical path and CPU resource
+ /// usage in an ensemble of traces.
+ extern char &MachineTraceMetricsID;
+
+ /// EarlyIfConverter - This pass performs if-conversion on SSA form by
+ /// inserting cmov instructions.
+ extern char &EarlyIfConverterID;
+
/// IfConverter - This pass performs machine code if conversion.
extern char &IfConverterID;
diff --git a/include/llvm/CodeGen/ProcessImplicitDefs.h b/include/llvm/CodeGen/ProcessImplicitDefs.h
deleted file mode 100644
index 6ab57f0..0000000
--- a/include/llvm/CodeGen/ProcessImplicitDefs.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//===-------------- llvm/CodeGen/ProcessImplicitDefs.h ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef LLVM_CODEGEN_PROCESSIMPLICITDEFS_H
-#define LLVM_CODEGEN_PROCESSIMPLICITDEFS_H
-
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/ADT/SmallSet.h"
-
-namespace llvm {
-
- class MachineInstr;
- class TargetInstrInfo;
- class TargetRegisterInfo;
- class MachineRegisterInfo;
- class LiveVariables;
-
- /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def
- /// for each use. Add isUndef marker to implicit_def defs and their uses.
- class ProcessImplicitDefs : public MachineFunctionPass {
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- MachineRegisterInfo *MRI;
- LiveVariables *LV;
-
- bool CanTurnIntoImplicitDef(MachineInstr *MI, unsigned Reg,
- unsigned OpIdx,
- SmallSet<unsigned, 8> &ImpDefRegs);
-
- public:
- static char ID;
-
- ProcessImplicitDefs() : MachineFunctionPass(ID) {
- initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry());
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &au) const;
-
- virtual bool runOnMachineFunction(MachineFunction &fn);
- };
-
-}
-
-#endif // LLVM_CODEGEN_PROCESSIMPLICITDEFS_H
diff --git a/lib/CodeGen/RegisterClassInfo.h b/include/llvm/CodeGen/RegisterClassInfo.h
index 400e1f4..400e1f4 100644
--- a/lib/CodeGen/RegisterClassInfo.h
+++ b/include/llvm/CodeGen/RegisterClassInfo.h
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
new file mode 100644
index 0000000..2043155
--- /dev/null
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -0,0 +1,282 @@
+//===-- RegisterPressure.h - Dynamic Register Pressure -*- C++ -*-------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RegisterPressure class which can be used to track
+// MachineInstr level register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGISTERPRESSURE_H
+#define LLVM_CODEGEN_REGISTERPRESSURE_H
+
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SparseSet.h"
+
+namespace llvm {
+
+class LiveIntervals;
+class RegisterClassInfo;
+class MachineInstr;
+
+/// Base class for register pressure results.
+struct RegisterPressure {
+ /// Map of max reg pressure indexed by pressure set ID, not class ID.
+ std::vector<unsigned> MaxSetPressure;
+
+ /// List of live in registers.
+ SmallVector<unsigned,8> LiveInRegs;
+ SmallVector<unsigned,8> LiveOutRegs;
+
+ /// Increase register pressure for each pressure set impacted by this register
+ /// class. Normally called by RegPressureTracker, but may be called manually
+ /// to account for live-through registers (global liveness).
+ void increase(const TargetRegisterClass *RC, const TargetRegisterInfo *TRI);
+
+ /// Decrease register pressure for each pressure set impacted by this register
+ /// class. This is only useful to account for spilling or rematerialization.
+ void decrease(const TargetRegisterClass *RC, const TargetRegisterInfo *TRI);
+
+ void dump(const TargetRegisterInfo *TRI);
+};
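A sketch of a manual live-through adjustment when stitching per-block results together (RP, RC and TRI assumed in scope):

    RP.increase(RC, TRI); // bump MaxSetPressure for every set RC maps to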
+
+/// RegisterPressure computed within a region of instructions delimited by
+/// TopIdx and BottomIdx. During pressure computation, the maximum pressure per
+/// register pressure set is increased. Once pressure within a region is fully
+/// computed, the live-in and live-out sets are recorded.
+///
+/// This is preferable to RegionPressure when LiveIntervals are available,
+/// because delimiting regions by SlotIndex is more robust and convenient than
+/// holding block iterators. The block contents can change without invalidating
+/// the pressure result.
+struct IntervalPressure : RegisterPressure {
+ /// Record the boundary of the region being tracked.
+ SlotIndex TopIdx;
+ SlotIndex BottomIdx;
+
+ void reset();
+
+ void openTop(SlotIndex NextTop);
+
+ void openBottom(SlotIndex PrevBottom);
+};
+
+/// RegisterPressure computed within a region of instructions delimited by
+/// TopPos and BottomPos. This is a less precise version of IntervalPressure for
+/// use when LiveIntervals are unavailable.
+struct RegionPressure : RegisterPressure {
+ /// Record the boundary of the region being tracked.
+ MachineBasicBlock::const_iterator TopPos;
+ MachineBasicBlock::const_iterator BottomPos;
+
+ void reset();
+
+ void openTop(MachineBasicBlock::const_iterator PrevTop);
+
+ void openBottom(MachineBasicBlock::const_iterator PrevBottom);
+};
+
+/// An element of pressure difference that identifies the pressure set and
+/// amount of increase or decrease in units of pressure.
+struct PressureElement {
+ unsigned PSetID;
+ int UnitIncrease;
+
+ PressureElement(): PSetID(~0U), UnitIncrease(0) {}
+ PressureElement(unsigned id, int inc): PSetID(id), UnitIncrease(inc) {}
+
+ bool isValid() const { return PSetID != ~0U; }
+};
+
+/// Store the effects of a change in pressure on things that the MI scheduler
+/// cares about.
+///
+/// Excess records the value of the largest difference in register units beyond
+/// the target's pressure limits across the affected pressure sets, where
+/// largest is defined as the absolute value of the difference. Negative
+/// ExcessUnits indicates a reduction in pressure that had already exceeded the
+/// target's limits.
+///
+/// CriticalMax records the largest increase in the tracker's max pressure that
+/// exceeds the critical limit for some pressure set determined by the client.
+///
+/// CurrentMax records the largest increase in the tracker's max pressure that
+/// exceeds the current limit for some pressure set determined by the client.
+struct RegPressureDelta {
+ PressureElement Excess;
+ PressureElement CriticalMax;
+ PressureElement CurrentMax;
+
+ RegPressureDelta() {}
+};
+
+/// Track the current register pressure at some position in the instruction
+/// stream, and remember the high water mark within the region traversed. This
+/// does not automatically consider live-through ranges. The client may
+/// independently adjust for global liveness.
+///
+/// Each RegPressureTracker only works within a MachineBasicBlock. Pressure can
+/// be tracked across a larger region by storing a RegisterPressure result at
+/// each block boundary and explicitly adjusting pressure to account for block
+/// live-in and live-out register sets.
+///
+/// RegPressureTracker holds a reference to a RegisterPressure result that it
+/// computes incrementally. During downward tracking, P.BottomIdx or P.BottomPos
+/// is invalid until it reaches the end of the block or closeRegion() is
+/// explicitly called. Similarly, P.TopIdx is invalid during upward
+/// tracking. Changing direction has the side effect of closing the region,
+/// and traversing past TopIdx or BottomIdx reopens it.
+class RegPressureTracker {
+ const MachineFunction *MF;
+ const TargetRegisterInfo *TRI;
+ const RegisterClassInfo *RCI;
+ const MachineRegisterInfo *MRI;
+ const LiveIntervals *LIS;
+
+ /// We currently only allow pressure tracking within a block.
+ const MachineBasicBlock *MBB;
+
+ /// Track the max pressure within the region traversed so far.
+ RegisterPressure &P;
+
+  /// Run in two modes depending on whether constructed with IntervalPressure
+  /// or RegionPressure. If RequireIntervals is false, LIS is ignored.
+ bool RequireIntervals;
+
+ /// Register pressure corresponds to liveness before this instruction
+ /// iterator. It may point to the end of the block rather than an instruction.
+ MachineBasicBlock::const_iterator CurrPos;
+
+ /// Pressure map indexed by pressure set ID, not class ID.
+ std::vector<unsigned> CurrSetPressure;
+
+ /// List of live registers.
+ SparseSet<unsigned> LivePhysRegs;
+ SparseSet<unsigned, VirtReg2IndexFunctor> LiveVirtRegs;
+
+public:
+ RegPressureTracker(IntervalPressure &rp) :
+ MF(0), TRI(0), RCI(0), LIS(0), MBB(0), P(rp), RequireIntervals(true) {}
+
+ RegPressureTracker(RegionPressure &rp) :
+ MF(0), TRI(0), RCI(0), LIS(0), MBB(0), P(rp), RequireIntervals(false) {}
+
+ void init(const MachineFunction *mf, const RegisterClassInfo *rci,
+ const LiveIntervals *lis, const MachineBasicBlock *mbb,
+ MachineBasicBlock::const_iterator pos);
+
+  /// Force liveness of registers. Particularly useful to initialize the
+  /// live-in/out state of the tracker before the first call to advance/recede.
+ void addLiveRegs(ArrayRef<unsigned> Regs);
+
+ /// Get the MI position corresponding to this register pressure.
+ MachineBasicBlock::const_iterator getPos() const { return CurrPos; }
+
+  /// Reset the MI position corresponding to the register pressure. This
+  /// allows schedulers to move instructions above the RegPressureTracker's
+  /// CurrPos. Since the pressure is computed before CurrPos, the iterator
+  /// position changes while pressure does not.
+ void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; }
+
+ /// Recede across the previous instruction.
+ bool recede();
+
+ /// Advance across the current instruction.
+ bool advance();
+
+  /// Finalize the region boundaries and record live-ins and live-outs.
+ void closeRegion();
+
+ /// Get the resulting register pressure over the traversed region.
+ /// This result is complete if either advance() or recede() has returned true,
+ /// or if closeRegion() was explicitly invoked.
+ RegisterPressure &getPressure() { return P; }
+
+ /// Get the register set pressure at the current position, which may be less
+ /// than the pressure across the traversed region.
+ std::vector<unsigned> &getRegSetPressureAtPos() { return CurrSetPressure; }
+
+ void discoverPhysLiveIn(unsigned Reg);
+ void discoverPhysLiveOut(unsigned Reg);
+
+ void discoverVirtLiveIn(unsigned Reg);
+ void discoverVirtLiveOut(unsigned Reg);
+
+ bool isTopClosed() const;
+ bool isBottomClosed() const;
+
+ void closeTop();
+ void closeBottom();
+
+ /// Consider the pressure increase caused by traversing this instruction
+ /// bottom-up. Find the pressure set with the most change beyond its pressure
+ /// limit based on the tracker's current pressure, and record the number of
+ /// excess register units of that pressure set introduced by this instruction.
+ void getMaxUpwardPressureDelta(const MachineInstr *MI,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit);
+
+ /// Consider the pressure increase caused by traversing this instruction
+ /// top-down. Find the pressure set with the most change beyond its pressure
+ /// limit based on the tracker's current pressure, and record the number of
+ /// excess register units of that pressure set introduced by this instruction.
+ void getMaxDownwardPressureDelta(const MachineInstr *MI,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit);
+
+ /// Find the pressure set with the most change beyond its pressure limit after
+ /// traversing this instruction either upward or downward depending on the
+ /// closed end of the current region.
+ void getMaxPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+ ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) {
+ if (isTopClosed())
+ return getMaxDownwardPressureDelta(MI, Delta, CriticalPSets,
+ MaxPressureLimit);
+
+ assert(isBottomClosed() && "Uninitialized pressure tracker");
+ return getMaxUpwardPressureDelta(MI, Delta, CriticalPSets,
+ MaxPressureLimit);
+ }
+
+ /// Get the pressure of each PSet after traversing this instruction bottom-up.
+ void getUpwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult);
+
+ /// Get the pressure of each PSet after traversing this instruction top-down.
+ void getDownwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult);
+
+ void getPressureAfterInst(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult) {
+ if (isTopClosed())
+ return getUpwardPressure(MI, PressureResult, MaxPressureResult);
+
+ assert(isBottomClosed() && "Uninitialized pressure tracker");
+ return getDownwardPressure(MI, PressureResult, MaxPressureResult);
+ }
+
+protected:
+ void increasePhysRegPressure(ArrayRef<unsigned> Regs);
+ void decreasePhysRegPressure(ArrayRef<unsigned> Regs);
+
+ void increaseVirtRegPressure(ArrayRef<unsigned> Regs);
+ void decreaseVirtRegPressure(ArrayRef<unsigned> Regs);
+
+ void bumpUpwardPressure(const MachineInstr *MI);
+ void bumpDownwardPressure(const MachineInstr *MI);
+};
+} // end namespace llvm
+
+#endif
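
To see the new tracker in use, here is a minimal sketch, not code from this
commit: it assumes a client that already holds valid LiveIntervals and
RegisterClassInfo, walks a single block bottom-up, and reads back the
high-water pressure. The function name is illustrative.

#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;

// Compute max pressure per pressure set over one basic block.
void computeBlockPressure(const MachineFunction &MF,
                          const RegisterClassInfo &RCI,
                          const LiveIntervals &LIS,
                          const MachineBasicBlock &MBB) {
  IntervalPressure Pressure;            // SlotIndex-delimited result
  RegPressureTracker Tracker(Pressure); // RequireIntervals == true

  // Start at the bottom; each recede() moves CurrPos up one
  // instruction, updating CurrSetPressure and MaxSetPressure.
  Tracker.init(&MF, &RCI, &LIS, &MBB, MBB.end());
  while (Tracker.getPos() != MBB.begin())
    Tracker.recede();

  // Finalize TopIdx/BottomIdx and record live-ins and live-outs.
  Tracker.closeRegion();

  // Pressure.MaxSetPressure[PSetID] now holds the high-water mark.
}
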
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index f4de693..85ab47b 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -117,8 +117,9 @@ namespace llvm {
}
}
- bool operator==(const SDep &Other) const {
- if (Dep != Other.Dep || Latency != Other.Latency) return false;
+ /// Return true if the specified SDep is equivalent except for latency.
+ bool overlaps(const SDep &Other) const {
+ if (Dep != Other.Dep) return false;
switch (Dep.getInt()) {
case Data:
case Anti:
@@ -133,6 +134,10 @@ namespace llvm {
llvm_unreachable("Invalid dependency kind!");
}
+ bool operator==(const SDep &Other) const {
+ return overlaps(Other) && Latency == Other.Latency;
+ }
+
bool operator!=(const SDep &Other) const {
return !operator==(Other);
}
@@ -272,6 +277,9 @@ namespace llvm {
unsigned Depth; // Node depth.
unsigned Height; // Node height.
public:
+ unsigned TopReadyCycle; // Cycle relative to start when node is ready.
+ unsigned BotReadyCycle; // Cycle relative to end when node is ready.
+
const TargetRegisterClass *CopyDstRC; // Is a special copy node if not null.
const TargetRegisterClass *CopySrcRC;
@@ -287,7 +295,7 @@ namespace llvm {
isScheduleHigh(false), isScheduleLow(false), isCloned(false),
SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
- CopyDstRC(NULL), CopySrcRC(NULL) {}
+ TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// SUnit - Construct an SUnit for post-regalloc scheduling to represent
/// a MachineInstr.
@@ -301,7 +309,7 @@ namespace llvm {
isScheduleHigh(false), isScheduleLow(false), isCloned(false),
SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
- CopyDstRC(NULL), CopySrcRC(NULL) {}
+ TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// SUnit - Construct a placeholder SUnit.
SUnit()
@@ -314,7 +322,7 @@ namespace llvm {
isScheduleHigh(false), isScheduleLow(false), isCloned(false),
SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
- CopyDstRC(NULL), CopySrcRC(NULL) {}
+ TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// setNode - Assign the representative SDNode for this SUnit.
/// This may be used during pre-regalloc scheduling.
@@ -552,12 +560,6 @@ namespace llvm {
///
virtual void computeLatency(SUnit *SU) = 0;
- /// ComputeOperandLatency - Override dependence edge latency using
- /// operand use/def information
- ///
- virtual void computeOperandLatency(SUnit *, SUnit *,
- SDep&) const { }
-
/// ForceUnitLatencies - Return true if all scheduling edges should be given
/// a latency value of one. The default is to return false; schedulers may
/// override this as needed.
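
The overlaps()/operator== split above is easy to misread, so a small sketch
may help; it is illustrative only, and SU stands for any existing SUnit*.

// Two data edges to the same node, differing only in latency.
SDep A(SU, SDep::Data, /*latency=*/1);
SDep B(SU, SDep::Data, /*latency=*/3);

assert(A.overlaps(B)); // same dependence once latency is ignored
assert(A != B);        // operator== still compares Latency
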
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 4fee108..1bde942 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -28,6 +28,7 @@ namespace llvm {
class MachineLoopInfo;
class MachineDominatorTree;
class LiveIntervals;
+ class RegPressureTracker;
/// LoopDependencies - This class analyzes loop-oriented register
/// dependencies, which are used to guide scheduling decisions.
@@ -35,7 +36,6 @@ namespace llvm {
/// scheduled as soon as possible after the variable's last use.
///
class LoopDependencies {
- const MachineLoopInfo &MLI;
const MachineDominatorTree &MDT;
public:
@@ -43,9 +43,7 @@ namespace llvm {
LoopDeps;
LoopDeps Deps;
- LoopDependencies(const MachineLoopInfo &mli,
- const MachineDominatorTree &mdt) :
- MLI(mli), MDT(mdt) {}
+ LoopDependencies(const MachineDominatorTree &mdt) : MDT(mdt) {}
/// VisitLoop - Clear out any previous state and analyze the given loop.
///
@@ -105,7 +103,7 @@ namespace llvm {
VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {}
- unsigned getSparseSetKey() const {
+ unsigned getSparseSetIndex() const {
return TargetRegisterInfo::virtReg2Index(VirtReg);
}
};
@@ -160,7 +158,7 @@ namespace llvm {
/// compares ValueT's, only unsigned keys. This allows the set to be cleared
/// between scheduling regions in constant time as long as ValueT does not
/// require a destructor.
- typedef SparseSet<VReg2SUnit> VReg2SUnitMap;
+ typedef SparseSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2SUnitMap;
/// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
/// MachineInstrs.
@@ -229,7 +227,7 @@ namespace llvm {
///
LoopDependencies LoopRegs;
- /// DbgValues - Remember instruction that preceeds DBG_VALUE.
+ /// DbgValues - Remember instruction that precedes DBG_VALUE.
/// These are generated by buildSchedGraph but persist so they can be
/// referenced when emitting the final schedule.
typedef std::vector<std::pair<MachineInstr *, MachineInstr *> >
@@ -275,7 +273,7 @@ namespace llvm {
/// buildSchedGraph - Build SUnits from the MachineBasicBlock that we are
/// input.
- void buildSchedGraph(AliasAnalysis *AA);
+ void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);
/// addSchedBarrierDeps - Add dependencies from instructions in the current
/// list of instructions being scheduled to scheduling barrier. We want to
@@ -290,11 +288,15 @@ namespace llvm {
///
virtual void computeLatency(SUnit *SU);
- /// computeOperandLatency - Override dependence edge latency using
+ /// computeOperandLatency - Return dependence edge latency using
/// operand use/def information
///
- virtual void computeOperandLatency(SUnit *Def, SUnit *Use,
- SDep& dep) const;
+ /// FindMin may be set to get the minimum vs. expected latency. Minimum
+ /// latency is used for scheduling groups, while expected latency is for
+ /// instruction cost and critical path.
+ virtual unsigned computeOperandLatency(SUnit *Def, SUnit *Use,
+ const SDep& dep,
+ bool FindMin = false) const;
/// schedule - Order nodes according to selected style, filling
/// in the Sequence member.
@@ -321,10 +323,6 @@ namespace llvm {
void addPhysRegDeps(SUnit *SU, unsigned OperIdx);
void addVRegDefDeps(SUnit *SU, unsigned OperIdx);
void addVRegUseDeps(SUnit *SU, unsigned OperIdx);
-
- VReg2SUnitMap::iterator findVRegDef(unsigned VirtReg) {
- return VRegDefs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
- }
};
/// newSUnit - Creates a new SUnit and return a ptr to it.
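
A sketch of how a scheduler subclass might use the two changed entry points
follows; MySchedDAG and its method are hypothetical, not part of this commit.

// Build the DAG while tracking pressure, then query per-edge
// latency in both of its flavors.
void MySchedDAG::buildAndMeasure(AliasAnalysis *AA,
                                 RegPressureTracker &RPT) {
  // The tracker argument is optional; passing 0 keeps old behavior.
  buildSchedGraph(AA, &RPT);

  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    for (unsigned p = 0, pe = SU->Preds.size(); p != pe; ++p) {
      const SDep &Pred = SU->Preds[p];
      // Expected latency: for instruction cost and critical path.
      unsigned Lat = computeOperandLatency(Pred.getSUnit(), SU, Pred);
      // Minimum latency: for scheduling groups.
      unsigned Min = computeOperandLatency(Pred.getSUnit(), SU, Pred,
                                           /*FindMin=*/true);
      (void)Lat; (void)Min;
    }
  }
}
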
diff --git a/include/llvm/CodeGen/ScheduleHazardRecognizer.h b/include/llvm/CodeGen/ScheduleHazardRecognizer.h
index 2f53baa..9dfa344 100644
--- a/include/llvm/CodeGen/ScheduleHazardRecognizer.h
+++ b/include/llvm/CodeGen/ScheduleHazardRecognizer.h
@@ -46,6 +46,8 @@ public:
/// atIssueLimit - Return true if no more instructions may be issued in this
/// cycle.
+ ///
+ /// FIXME: remove this once MachineScheduler is the only client.
virtual bool atIssueLimit() const { return false; }
/// getHazardType - Return the hazard type of emitting this node. There are
@@ -55,7 +57,7 @@ public:
/// other instruction is available, issue it first.
/// * NoopHazard: issuing this instruction would break the program. If
/// some other instruction can be issued, do so, otherwise issue a noop.
- virtual HazardType getHazardType(SUnit *m, int Stalls) {
+ virtual HazardType getHazardType(SUnit *m, int Stalls = 0) {
return NoHazard;
}
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 6a7a87e..1ccfe54 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -177,6 +177,44 @@ class SelectionDAG {
/// DbgInfo - Tracks dbg_value information through SDISel.
SDDbgInfo *DbgInfo;
+public:
+ /// DAGUpdateListener - Clients of various APIs that cause global effects on
+ /// the DAG can optionally implement this interface. This allows the clients
+ /// to handle the various sorts of updates that happen.
+ ///
+  /// A DAGUpdateListener automatically registers itself with the DAG when it
+  /// is constructed and removes itself when destroyed, in RAII fashion.
+ struct DAGUpdateListener {
+ DAGUpdateListener *const Next;
+ SelectionDAG &DAG;
+
+ explicit DAGUpdateListener(SelectionDAG &D)
+ : Next(D.UpdateListeners), DAG(D) {
+ DAG.UpdateListeners = this;
+ }
+
+ virtual ~DAGUpdateListener() {
+ assert(DAG.UpdateListeners == this &&
+ "DAGUpdateListeners must be destroyed in LIFO order");
+ DAG.UpdateListeners = Next;
+ }
+
+ /// NodeDeleted - The node N that was deleted and, if E is not null, an
+ /// equivalent node E that replaced it.
+ virtual void NodeDeleted(SDNode *N, SDNode *E);
+
+ /// NodeUpdated - The node N that was updated.
+ virtual void NodeUpdated(SDNode *N);
+ };
+
+private:
+ /// DAGUpdateListener is a friend so it can manipulate the listener stack.
+ friend struct DAGUpdateListener;
+
+ /// UpdateListeners - Linked list of registered DAGUpdateListener instances.
+ /// This stack is maintained by DAGUpdateListener RAII.
+ DAGUpdateListener *UpdateListeners;
+
/// setGraphColorHelper - Implementation of setSubgraphColor.
/// Return whether we had to truncate the search.
///
@@ -384,6 +422,8 @@ public:
int Offset = 0, unsigned char TargetFlags=0) {
return getConstantPool(C, VT, Align, Offset, true, TargetFlags);
}
+ SDValue getTargetIndex(int Index, EVT VT, int64_t Offset = 0,
+ unsigned char TargetFlags = 0);
// When generating a branch to a BB, we don't in general know enough
// to provide debug info for the BB at that time, so keep this one around.
SDValue getBasicBlock(MachineBasicBlock *MBB);
@@ -817,30 +857,14 @@ public:
SDDbgValue *getDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off,
DebugLoc DL, unsigned O);
- /// DAGUpdateListener - Clients of various APIs that cause global effects on
- /// the DAG can optionally implement this interface. This allows the clients
- /// to handle the various sorts of updates that happen.
- class DAGUpdateListener {
- public:
- virtual ~DAGUpdateListener();
-
- /// NodeDeleted - The node N that was deleted and, if E is not null, an
- /// equivalent node E that replaced it.
- virtual void NodeDeleted(SDNode *N, SDNode *E) = 0;
-
- /// NodeUpdated - The node N that was updated.
- virtual void NodeUpdated(SDNode *N) = 0;
- };
-
/// RemoveDeadNode - Remove the specified node from the system. If any of its
/// operands then becomes dead, remove them as well. Inform UpdateListener
/// for each node deleted.
- void RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener = 0);
+ void RemoveDeadNode(SDNode *N);
/// RemoveDeadNodes - This method deletes the unreachable nodes in the
/// given list, and any nodes that become unreachable as a result.
- void RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes,
- DAGUpdateListener *UpdateListener = 0);
+ void RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes);
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG. Use the first
@@ -857,24 +881,19 @@ public:
/// to be given new uses. These new uses of From are left in place, and
/// not automatically transferred to To.
///
- void ReplaceAllUsesWith(SDValue From, SDValue Op,
- DAGUpdateListener *UpdateListener = 0);
- void ReplaceAllUsesWith(SDNode *From, SDNode *To,
- DAGUpdateListener *UpdateListener = 0);
- void ReplaceAllUsesWith(SDNode *From, const SDValue *To,
- DAGUpdateListener *UpdateListener = 0);
+ void ReplaceAllUsesWith(SDValue From, SDValue Op);
+ void ReplaceAllUsesWith(SDNode *From, SDNode *To);
+ void ReplaceAllUsesWith(SDNode *From, const SDValue *To);
/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.Val alone.
- void ReplaceAllUsesOfValueWith(SDValue From, SDValue To,
- DAGUpdateListener *UpdateListener = 0);
+ void ReplaceAllUsesOfValueWith(SDValue From, SDValue To);
/// ReplaceAllUsesOfValuesWith - Like ReplaceAllUsesOfValueWith, but
/// for multiple values at once. This correctly handles the case where
/// there is an overlap between the From values and the To values.
void ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To,
- unsigned Num,
- DAGUpdateListener *UpdateListener = 0);
+ unsigned Num);
/// AssignTopologicalOrder - Topological-sort the AllNodes list and a
/// assign a unique node id for each node in the DAG based on their
@@ -1031,7 +1050,7 @@ public:
private:
bool RemoveNodeFromCSEMaps(SDNode *N);
- void AddModifiedNodeToCSEMaps(SDNode *N, DAGUpdateListener *UpdateListener);
+ void AddModifiedNodeToCSEMaps(SDNode *N);
SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos);
SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2,
void *&InsertPos);
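
The RAII registration explains why every UpdateListener parameter above could
be dropped. A sketch of a client, with a hypothetical CountDeletes listener:

// Count node deletions during a replacement.
struct CountDeletes : SelectionDAG::DAGUpdateListener {
  unsigned NumDeleted;
  explicit CountDeletes(SelectionDAG &D)
      : SelectionDAG::DAGUpdateListener(D), NumDeleted(0) {}
  virtual void NodeDeleted(SDNode *N, SDNode *E) { ++NumDeleted; }
  virtual void NodeUpdated(SDNode *N) {}
};

void replaceAndCount(SelectionDAG &DAG, SDValue From, SDValue To) {
  CountDeletes Listener(DAG);       // constructor registers it
  DAG.ReplaceAllUsesWith(From, To); // no listener argument anymore
}                                   // destructor unregisters it (LIFO)
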
diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index ee3f231..c42f655 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h
@@ -172,53 +172,22 @@ protected:
///
unsigned DAGSize;
- /// ISelPosition - Node iterator marking the current position of
- /// instruction selection as it procedes through the topologically-sorted
- /// node list.
- SelectionDAG::allnodes_iterator ISelPosition;
-
-
- /// ISelUpdater - helper class to handle updates of the
- /// instruction selection graph.
- class ISelUpdater : public SelectionDAG::DAGUpdateListener {
- virtual void anchor();
- SelectionDAG::allnodes_iterator &ISelPosition;
- public:
- explicit ISelUpdater(SelectionDAG::allnodes_iterator &isp)
- : ISelPosition(isp) {}
-
- /// NodeDeleted - Handle nodes deleted from the graph. If the
- /// node being deleted is the current ISelPosition node, update
- /// ISelPosition.
- ///
- virtual void NodeDeleted(SDNode *N, SDNode *E) {
- if (ISelPosition == SelectionDAG::allnodes_iterator(N))
- ++ISelPosition;
- }
-
- /// NodeUpdated - Ignore updates for now.
- virtual void NodeUpdated(SDNode *N) {}
- };
-
/// ReplaceUses - replace all uses of the old node F with the use
/// of the new node T.
void ReplaceUses(SDValue F, SDValue T) {
- ISelUpdater ISU(ISelPosition);
- CurDAG->ReplaceAllUsesOfValueWith(F, T, &ISU);
+ CurDAG->ReplaceAllUsesOfValueWith(F, T);
}
/// ReplaceUses - replace all uses of the old nodes F with the use
/// of the new nodes T.
void ReplaceUses(const SDValue *F, const SDValue *T, unsigned Num) {
- ISelUpdater ISU(ISelPosition);
- CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num, &ISU);
+ CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num);
}
/// ReplaceUses - replace all uses of the old node F with the use
/// of the new node T.
void ReplaceUses(SDNode *F, SDNode *T) {
- ISelUpdater ISU(ISelPosition);
- CurDAG->ReplaceAllUsesWith(F, T, &ISU);
+ CurDAG->ReplaceAllUsesWith(F, T);
}
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index f8248b8..0dfb394 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -74,6 +74,10 @@ namespace ISD {
/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
/// element is not an undef.
bool isScalarToVector(const SDNode *N);
+
+ /// allOperandsUndef - Return true if the node has at least one operand
+ /// and all operands of the specified node are ISD::UNDEF.
+ bool allOperandsUndef(const SDNode *N);
} // end llvm:ISD namespace
//===----------------------------------------------------------------------===//
@@ -1339,6 +1343,29 @@ public:
}
};
+/// Completely target-dependent object reference.
+class TargetIndexSDNode : public SDNode {
+ unsigned char TargetFlags;
+ int Index;
+ int64_t Offset;
+ friend class SelectionDAG;
+public:
+
+ TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned char TF)
+ : SDNode(ISD::TargetIndex, DebugLoc(), getSDVTList(VT)),
+ TargetFlags(TF), Index(Idx), Offset(Ofs) {}
+public:
+
+ unsigned char getTargetFlags() const { return TargetFlags; }
+ int getIndex() const { return Index; }
+ int64_t getOffset() const { return Offset; }
+
+ static bool classof(const TargetIndexSDNode*) { return true; }
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::TargetIndex;
+ }
+};
+
class BasicBlockSDNode : public SDNode {
MachineBasicBlock *MBB;
friend class SelectionDAG;
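
A sketch of creating and inspecting the new node kind; the type and operand
values are placeholders, not from this commit.

// Wrap a target-dependent index in the DAG and read it back.
SDValue makeTargetIndex(SelectionDAG &DAG) {
  SDValue V = DAG.getTargetIndex(/*Index=*/0, MVT::i32,
                                 /*Offset=*/0, /*TargetFlags=*/0);
  if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(V.getNode())) {
    (void)TI->getIndex();       // completely target-defined meaning
    (void)TI->getOffset();
    (void)TI->getTargetFlags();
  }
  return V;
}
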
diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index 0457e43..c52599b 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -76,7 +76,6 @@ namespace llvm {
/// SlotIndex - An opaque wrapper around machine indexes.
class SlotIndex {
friend class SlotIndexes;
- friend struct DenseMapInfo<SlotIndex>;
enum Slot {
/// Basic block boundary. Used for live ranges entering and leaving a
@@ -121,11 +120,6 @@ namespace llvm {
return static_cast<Slot>(lie.getInt());
}
- static inline unsigned getHashValue(const SlotIndex &v) {
- void *ptrVal = v.lie.getOpaqueValue();
- return (unsigned((intptr_t)ptrVal)) ^ (unsigned((intptr_t)ptrVal) >> 9);
- }
-
public:
enum {
/// The default distance between instructions as returned by distance().
@@ -133,14 +127,6 @@ namespace llvm {
InstrDist = 4 * Slot_Count
};
- static inline SlotIndex getEmptyKey() {
- return SlotIndex(0, 1);
- }
-
- static inline SlotIndex getTombstoneKey() {
- return SlotIndex(0, 2);
- }
-
/// Construct an invalid index.
SlotIndex() : lie(0, 0) {}
@@ -293,23 +279,6 @@ namespace llvm {
};
- /// DenseMapInfo specialization for SlotIndex.
- template <>
- struct DenseMapInfo<SlotIndex> {
- static inline SlotIndex getEmptyKey() {
- return SlotIndex::getEmptyKey();
- }
- static inline SlotIndex getTombstoneKey() {
- return SlotIndex::getTombstoneKey();
- }
- static inline unsigned getHashValue(const SlotIndex &v) {
- return SlotIndex::getHashValue(v);
- }
- static inline bool isEqual(const SlotIndex &LHS, const SlotIndex &RHS) {
- return (LHS == RHS);
- }
- };
-
template <> struct isPodLike<SlotIndex> { static const bool value = true; };
@@ -344,7 +313,6 @@ namespace llvm {
IndexList indexList;
MachineFunction *mf;
- unsigned functionSize;
typedef DenseMap<const MachineInstr*, SlotIndex> Mi2IndexMap;
Mi2IndexMap mi2iMap;
@@ -402,19 +370,6 @@ namespace llvm {
return SlotIndex(&indexList.back(), 0);
}
- /// Returns the distance between the highest and lowest indexes allocated
- /// so far.
- unsigned getIndexesLength() const {
- assert(indexList.front().getIndex() == 0 &&
- "Initial index isn't zero?");
- return indexList.back().getIndex();
- }
-
- /// Returns the number of instructions in the function.
- unsigned getFunctionSize() const {
- return functionSize;
- }
-
/// Returns true if the given machine instr is mapped to an index,
/// otherwise returns false.
bool hasIndex(const MachineInstr *instr) const {
@@ -444,7 +399,7 @@ namespace llvm {
}
/// getIndexBefore - Returns the index of the last indexed instruction
- /// before MI, or the the start index of its basic block.
+ /// before MI, or the start index of its basic block.
/// MI is not required to have an index.
SlotIndex getIndexBefore(const MachineInstr *MI) const {
const MachineBasicBlock *MBB = MI->getParent();
@@ -590,7 +545,7 @@ namespace llvm {
nextItr = getIndexAfter(mi).listEntry();
prevItr = prior(nextItr);
} else {
- // Insert mi's index immediately after the preceeding instruction.
+ // Insert mi's index immediately after the preceding instruction.
prevItr = getIndexBefore(mi).listEntry();
nextItr = llvm::next(prevItr);
}
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 5a42136..9849e92 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -33,6 +33,8 @@ namespace llvm {
class TargetLoweringObjectFileELF : public TargetLoweringObjectFile {
+ bool UseInitArray;
+
public:
virtual ~TargetLoweringObjectFileELF() {}
@@ -66,6 +68,7 @@ public:
getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI) const;
+ void InitializeELF(bool UseInitArray_);
virtual const MCSection *
getStaticCtorSection(unsigned Priority = 65535) const;
virtual const MCSection *
diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h
index 76c2357..eb38cd3 100644
--- a/include/llvm/CodeGen/ValueTypes.h
+++ b/include/llvm/CodeGen/ValueTypes.h
@@ -68,34 +68,38 @@ namespace llvm {
v2i32 = 22, // 2 x i32
v4i32 = 23, // 4 x i32
v8i32 = 24, // 8 x i32
- v1i64 = 25, // 1 x i64
- v2i64 = 26, // 2 x i64
- v4i64 = 27, // 4 x i64
- v8i64 = 28, // 8 x i64
-
- v2f16 = 29, // 2 x f16
- v2f32 = 30, // 2 x f32
- v4f32 = 31, // 4 x f32
- v8f32 = 32, // 8 x f32
- v2f64 = 33, // 2 x f64
- v4f64 = 34, // 4 x f64
+ v16i32 = 25, // 16 x i32
+ v1i64 = 26, // 1 x i64
+ v2i64 = 27, // 2 x i64
+ v4i64 = 28, // 4 x i64
+ v8i64 = 29, // 8 x i64
+ v16i64 = 30, // 16 x i64
+
+ v2f16 = 31, // 2 x f16
+ v2f32 = 32, // 2 x f32
+ v4f32 = 33, // 4 x f32
+ v8f32 = 34, // 8 x f32
+ v2f64 = 35, // 2 x f64
+ v4f64 = 36, // 4 x f64
FIRST_VECTOR_VALUETYPE = v2i8,
LAST_VECTOR_VALUETYPE = v4f64,
+ FIRST_INTEGER_VECTOR_VALUETYPE = v2i8,
+ LAST_INTEGER_VECTOR_VALUETYPE = v16i64,
FIRST_FP_VECTOR_VALUETYPE = v2f16,
LAST_FP_VECTOR_VALUETYPE = v4f64,
- x86mmx = 35, // This is an X86 MMX value
+ x86mmx = 37, // This is an X86 MMX value
- Glue = 36, // This glues nodes together during pre-RA sched
+ Glue = 38, // This glues nodes together during pre-RA sched
- isVoid = 37, // This has no value
+ isVoid = 39, // This has no value
- Untyped = 38, // This value takes a register, but has
+ Untyped = 40, // This value takes a register, but has
// unspecified type. The register class
// will be determined by the opcode.
- LAST_VALUETYPE = 39, // This always remains at the end of the list.
+ LAST_VALUETYPE = 41, // This always remains at the end of the list.
// This is the current maximum for LAST_VALUETYPE.
// MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -153,15 +157,16 @@ namespace llvm {
bool isFloatingPoint() const {
return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
SimpleTy <= MVT::LAST_FP_VALUETYPE) ||
- (SimpleTy >= MVT::FIRST_FP_VECTOR_VALUETYPE &&
- SimpleTy <= MVT::LAST_FP_VECTOR_VALUETYPE));
+ (SimpleTy >= MVT::FIRST_FP_VECTOR_VALUETYPE &&
+ SimpleTy <= MVT::LAST_FP_VECTOR_VALUETYPE));
}
/// isInteger - Return true if this is an integer, or a vector integer type.
bool isInteger() const {
return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) ||
- (SimpleTy >= MVT::v2i8 && SimpleTy <= MVT::v8i64));
+ (SimpleTy >= MVT::FIRST_INTEGER_VECTOR_VALUETYPE &&
+ SimpleTy <= MVT::LAST_INTEGER_VECTOR_VALUETYPE));
}
/// isVector - Return true if this is a vector value type.
@@ -170,6 +175,37 @@ namespace llvm {
SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);
}
+ /// is64BitVector - Return true if this is a 64-bit vector type.
+ bool is64BitVector() const {
+ return (SimpleTy == MVT::v8i8 || SimpleTy == MVT::v4i16 ||
+ SimpleTy == MVT::v2i32 || SimpleTy == MVT::v1i64 ||
+ SimpleTy == MVT::v2f32);
+ }
+
+ /// is128BitVector - Return true if this is a 128-bit vector type.
+ bool is128BitVector() const {
+ return (SimpleTy == MVT::v16i8 || SimpleTy == MVT::v8i16 ||
+ SimpleTy == MVT::v4i32 || SimpleTy == MVT::v2i64 ||
+ SimpleTy == MVT::v4f32 || SimpleTy == MVT::v2f64);
+ }
+
+ /// is256BitVector - Return true if this is a 256-bit vector type.
+ bool is256BitVector() const {
+ return (SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64 ||
+ SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 ||
+ SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64);
+ }
+
+ /// is512BitVector - Return true if this is a 512-bit vector type.
+ bool is512BitVector() const {
+ return (SimpleTy == MVT::v8i64 || SimpleTy == MVT::v16i32);
+ }
+
+ /// is1024BitVector - Return true if this is a 1024-bit vector type.
+ bool is1024BitVector() const {
+ return (SimpleTy == MVT::v16i64);
+ }
+
/// isPow2VectorType - Returns true if the given vector is a power of 2.
bool isPow2VectorType() const {
unsigned NElts = getVectorNumElements();
@@ -196,7 +232,7 @@ namespace llvm {
MVT getVectorElementType() const {
switch (SimpleTy) {
default:
- return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
+ llvm_unreachable("Not a vector MVT!");
case v2i8 :
case v4i8 :
case v8i8 :
@@ -208,11 +244,13 @@ namespace llvm {
case v16i16: return i16;
case v2i32:
case v4i32:
- case v8i32: return i32;
+ case v8i32:
+ case v16i32: return i32;
case v1i64:
case v2i64:
case v4i64:
- case v8i64: return i64;
+ case v8i64:
+ case v16i64: return i64;
case v2f16: return f16;
case v2f32:
case v4f32:
@@ -225,10 +263,12 @@ namespace llvm {
unsigned getVectorNumElements() const {
switch (SimpleTy) {
default:
- return ~0U;
+ llvm_unreachable("Not a vector MVT!");
case v32i8: return 32;
case v16i8:
- case v16i16: return 16;
+ case v16i16:
+ case v16i32:
+ case v16i64:return 16;
case v8i8 :
case v8i16:
case v8i32:
@@ -295,7 +335,9 @@ namespace llvm {
case v4i64:
case v8f32:
case v4f64: return 256;
+ case v16i32:
case v8i64: return 512;
+ case v16i64:return 1024;
}
}
@@ -368,12 +410,14 @@ namespace llvm {
if (NumElements == 2) return MVT::v2i32;
if (NumElements == 4) return MVT::v4i32;
if (NumElements == 8) return MVT::v8i32;
+ if (NumElements == 16) return MVT::v16i32;
break;
case MVT::i64:
if (NumElements == 1) return MVT::v1i64;
if (NumElements == 2) return MVT::v2i64;
if (NumElements == 4) return MVT::v4i64;
if (NumElements == 8) return MVT::v8i64;
+ if (NumElements == 16) return MVT::v16i64;
break;
case MVT::f16:
if (NumElements == 2) return MVT::v2f16;
@@ -487,32 +531,27 @@ namespace llvm {
/// is64BitVector - Return true if this is a 64-bit vector type.
bool is64BitVector() const {
- if (!isSimple())
- return isExtended64BitVector();
-
- return (V == MVT::v8i8 || V==MVT::v4i16 || V==MVT::v2i32 ||
- V == MVT::v1i64 || V==MVT::v2f32);
+ return isSimple() ? V.is64BitVector() : isExtended64BitVector();
}
/// is128BitVector - Return true if this is a 128-bit vector type.
bool is128BitVector() const {
- if (!isSimple())
- return isExtended128BitVector();
- return (V==MVT::v16i8 || V==MVT::v8i16 || V==MVT::v4i32 ||
- V==MVT::v2i64 || V==MVT::v4f32 || V==MVT::v2f64);
+ return isSimple() ? V.is128BitVector() : isExtended128BitVector();
}
/// is256BitVector - Return true if this is a 256-bit vector type.
- inline bool is256BitVector() const {
- if (!isSimple())
- return isExtended256BitVector();
- return (V == MVT::v8f32 || V == MVT::v4f64 || V == MVT::v32i8 ||
- V == MVT::v16i16 || V == MVT::v8i32 || V == MVT::v4i64);
+ bool is256BitVector() const {
+ return isSimple() ? V.is256BitVector() : isExtended256BitVector();
}
/// is512BitVector - Return true if this is a 512-bit vector type.
- inline bool is512BitVector() const {
- return isSimple() ? (V == MVT::v8i64) : isExtended512BitVector();
+ bool is512BitVector() const {
+ return isSimple() ? V.is512BitVector() : isExtended512BitVector();
+ }
+
+ /// is1024BitVector - Return true if this is a 1024-bit vector type.
+ bool is1024BitVector() const {
+ return isSimple() ? V.is1024BitVector() : isExtended1024BitVector();
}
/// isOverloaded - Return true if this is an overloaded type for TableGen.
@@ -705,6 +744,7 @@ namespace llvm {
bool isExtended128BitVector() const;
bool isExtended256BitVector() const;
bool isExtended512BitVector() const;
+ bool isExtended1024BitVector() const;
EVT getExtendedVectorElementType() const;
unsigned getExtendedVectorNumElements() const;
unsigned getExtendedSizeInBits() const;
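
The renumbering above is mechanical; the functional additions are v16i32,
v16i64, and the width helpers. A sanity-check sketch, illustrative only:

#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
using namespace llvm;

void checkWideVectors() {
  MVT Wide = MVT::getVectorVT(MVT::i32, 16); // now yields MVT::v16i32
  assert(Wide == MVT::v16i32);
  assert(Wide.is512BitVector());             // 16 x 32 == 512 bits
  assert(Wide.getVectorElementType() == MVT::i32);
  assert(Wide.getVectorNumElements() == 16);
  assert(MVT(MVT::v16i64).is1024BitVector()); // 16 x 64 == 1024 bits
}
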
diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td
index 6c22690..f4b75bd 100644
--- a/include/llvm/CodeGen/ValueTypes.td
+++ b/include/llvm/CodeGen/ValueTypes.td
@@ -45,22 +45,24 @@ def v16i16 : ValueType<256, 21>; // 16 x i16 vector value
def v2i32 : ValueType<64 , 22>; // 2 x i32 vector value
def v4i32 : ValueType<128, 23>; // 4 x i32 vector value
def v8i32 : ValueType<256, 24>; // 8 x i32 vector value
-def v1i64 : ValueType<64 , 25>; // 1 x i64 vector value
-def v2i64 : ValueType<128, 26>; // 2 x i64 vector value
-def v4i64 : ValueType<256, 27>; // 4 x i64 vector value
-def v8i64 : ValueType<512, 28>; // 8 x i64 vector value
+def v16i32 : ValueType<512, 25>; // 16 x i32 vector value
+def v1i64 : ValueType<64 , 26>; // 1 x i64 vector value
+def v2i64 : ValueType<128, 27>; // 2 x i64 vector value
+def v4i64 : ValueType<256, 28>; // 4 x i64 vector value
+def v8i64 : ValueType<512, 29>; // 8 x i64 vector value
+def v16i64 : ValueType<1024,30>; // 16 x i64 vector value
-def v2f16 : ValueType<32 , 29>; // 2 x f16 vector value
-def v2f32 : ValueType<64 , 30>; // 2 x f32 vector value
-def v4f32 : ValueType<128, 31>; // 4 x f32 vector value
-def v8f32 : ValueType<256, 32>; // 8 x f32 vector value
-def v2f64 : ValueType<128, 33>; // 2 x f64 vector value
-def v4f64 : ValueType<256, 34>; // 4 x f64 vector value
+def v2f16 : ValueType<32 , 31>; // 2 x f16 vector value
+def v2f32 : ValueType<64 , 32>; // 2 x f32 vector value
+def v4f32 : ValueType<128, 33>; // 4 x f32 vector value
+def v8f32 : ValueType<256, 34>; // 8 x f32 vector value
+def v2f64 : ValueType<128, 35>; // 2 x f64 vector value
+def v4f64 : ValueType<256, 36>; // 4 x f64 vector value
-def x86mmx : ValueType<64 , 35>; // X86 MMX value
-def FlagVT : ValueType<0 , 36>; // Pre-RA sched glue
-def isVoid : ValueType<0 , 37>; // Produces no value
-def untyped: ValueType<8 , 38>; // Produces an untyped value
+def x86mmx : ValueType<64 , 37>; // X86 MMX value
+def FlagVT : ValueType<0 , 38>; // Pre-RA sched glue
+def isVoid : ValueType<0 , 39>; // Produces no value
+def untyped: ValueType<8 , 40>; // Produces an untyped value
def MetadataVT: ValueType<0, 250>; // Metadata
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index 95c4d6c..b912251 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -14,15 +14,21 @@
/* Directories clang will search for headers */
#define C_INCLUDE_DIRS "${C_INCLUDE_DIRS}"
-/* Define if CBE is enabled for printf %a output */
-#cmakedefine ENABLE_CBE_PRINTF_A ${ENABLE_CBE_PRINTF_A}
+/* Default <path> to all compiler invocations for --sysroot=<path>. */
+#undef DEFAULT_SYSROOT
/* Define if position independent code is enabled */
#cmakedefine ENABLE_PIC
-/* Define if timestamp information (e.g., __DATE___) is allowed */
+/* Define if timestamp information (e.g., __DATE__) is allowed */
#cmakedefine ENABLE_TIMESTAMPS ${ENABLE_TIMESTAMPS}
+/* Directory where gcc is installed. */
+#undef GCC_INSTALL_PREFIX
+
+/* Define to 1 if you have the `arc4random' function. */
+#cmakedefine HAVE_ARC4RANDOM
+
/* Define to 1 if you have the `argz_append' function. */
#cmakedefine HAVE_ARGZ_APPEND ${HAVE_ARGZ_APPEND}
@@ -155,7 +161,7 @@
#cmakedefine HAVE_GETTIMEOFDAY ${HAVE_GETTIMEOFDAY}
/* Define if the Graphviz program is available */
-#undef HAVE_GRAPHVIZ
+#cmakedefine HAVE_GRAPHVIZ ${HAVE_GRAPHVIZ}
/* Define if the gv program is available */
#cmakedefine HAVE_GV ${HAVE_GV}
@@ -548,6 +554,9 @@
/* Has gcc/MSVC atomic intrinsics */
#cmakedefine01 LLVM_HAS_ATOMICS
+/* Host triple LLVM will be executed on */
+#cmakedefine LLVM_HOSTTRIPLE "${LLVM_HOSTTRIPLE}"
+
/* Installation directory for include files */
#cmakedefine LLVM_INCLUDEDIR "${LLVM_INCLUDEDIR}"
@@ -617,6 +626,12 @@
/* Installation prefix directory */
#cmakedefine LLVM_PREFIX "${LLVM_PREFIX}"
+/* Define if we have the Intel JIT API runtime support library */
+#cmakedefine LLVM_USE_INTEL_JITEVENTS 1
+
+/* Define if we have the oprofile JIT-support library */
+#cmakedefine LLVM_USE_OPROFILE 1
+
/* Major version of the LLVM API */
#cmakedefine LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}
@@ -713,10 +728,4 @@
/* Added by Kevin -- Maximum path length */
#cmakedefine MAXPATHLEN ${MAXPATHLEN}
-/* Support for Intel JIT Events API is enabled */
-#cmakedefine LLVM_USE_INTEL_JITEVENTS 1
-
-/* Support for OProfile JIT API is enabled */
-#cmakedefine LLVM_USE_OPROFILE 1
-
#endif
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index 677bf2e..5a60ba5 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -6,6 +6,9 @@
/* Bug report URL. */
#undef BUG_REPORT_URL
+/* Define if we have libxml2 */
+#undef CLANG_HAVE_LIBXML
+
/* Relative directory for resource files */
#undef CLANG_RESOURCE_DIR
@@ -18,12 +21,15 @@
/* Define if position independent code is enabled */
#undef ENABLE_PIC
-/* Define if timestamp information (e.g., __DATE___) is allowed */
+/* Define if timestamp information (e.g., __DATE__) is allowed */
#undef ENABLE_TIMESTAMPS
/* Directory where gcc is installed. */
#undef GCC_INSTALL_PREFIX
+/* Define to 1 if you have the `arc4random' function. */
+#undef HAVE_ARC4RANDOM
+
/* Define to 1 if you have the `argz_append' function. */
#undef HAVE_ARGZ_APPEND
@@ -549,6 +555,9 @@
/* Has gcc/MSVC atomic intrinsics */
#undef LLVM_HAS_ATOMICS
+/* Host triple LLVM will be executed on */
+#undef LLVM_HOSTTRIPLE
+
/* Installation directory for include files */
#undef LLVM_INCLUDEDIR
diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake
index 85d28fe..3944292 100644
--- a/include/llvm/Config/llvm-config.h.cmake
+++ b/include/llvm/Config/llvm-config.h.cmake
@@ -40,6 +40,9 @@
/* Has gcc/MSVC atomic intrinsics */
#cmakedefine01 LLVM_HAS_ATOMICS
+/* Host triple LLVM will be executed on */
+#cmakedefine LLVM_HOSTTRIPLE "${LLVM_HOSTTRIPLE}"
+
/* Installation directory for include files */
#cmakedefine LLVM_INCLUDEDIR "${LLVM_INCLUDEDIR}"
diff --git a/include/llvm/Config/llvm-config.h.in b/include/llvm/Config/llvm-config.h.in
index 973652f..9489dfe 100644
--- a/include/llvm/Config/llvm-config.h.in
+++ b/include/llvm/Config/llvm-config.h.in
@@ -40,6 +40,9 @@
/* Has gcc/MSVC atomic intrinsics */
#undef LLVM_HAS_ATOMICS
+/* Host triple LLVM will be executed on */
+#undef LLVM_HOSTTRIPLE
+
/* Installation directory for include files */
#undef LLVM_INCLUDEDIR
diff --git a/include/llvm/Constant.h b/include/llvm/Constant.h
index 13acdc6..e0e516d 100644
--- a/include/llvm/Constant.h
+++ b/include/llvm/Constant.h
@@ -137,8 +137,8 @@ public:
static Constant *getNullValue(Type* Ty);
- /// @returns the value for an integer constant of the given type that has all
- /// its bits set to true.
+  /// @returns the value for an integer constant, or a vector of integer
+  /// constants, of the given type that has all its bits set to true.
/// @brief Get the all ones value
static Constant *getAllOnesValue(Type* Ty);
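
The vector case in the clarified comment can be exercised directly; a short
illustrative sketch:

#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/LLVMContext.h"
using namespace llvm;

void checkAllOnes(LLVMContext &Ctx) {
  Type *I8 = Type::getInt8Ty(Ctx);
  Constant *ScalarOnes = Constant::getAllOnesValue(I8);    // i8 -1
  Type *V4I8 = VectorType::get(I8, 4);
  Constant *VectorOnes = Constant::getAllOnesValue(V4I8);  // <4 x i8> all -1
  (void)ScalarOnes; (void)VectorOnes;
}
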
diff --git a/include/llvm/Constants.h b/include/llvm/Constants.h
index 0abe17d..fdd5382 100644
--- a/include/llvm/Constants.h
+++ b/include/llvm/Constants.h
@@ -917,6 +917,17 @@ public:
return getLShr(C1, C2, true);
}
+ /// getBinOpIdentity - Return the identity for the given binary operation,
+ /// i.e. a constant C such that X op C = X and C op X = X for every X. It
+ /// returns null if the operator doesn't have an identity.
+ static Constant *getBinOpIdentity(unsigned Opcode, Type *Ty);
+
+ /// getBinOpAbsorber - Return the absorbing element for the given binary
+ /// operation, i.e. a constant C such that X op C = C and C op X = C for
+ /// every X. For example, this returns zero for integer multiplication.
+ /// It returns null if the operator doesn't have an absorbing element.
+ static Constant *getBinOpAbsorber(unsigned Opcode, Type *Ty);
+
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
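
The two-sided definitions above pin down the behavior; a sketch, with the Sub
case inferred from that contract rather than taken from this commit:

#include "llvm/Constants.h"
#include "llvm/Instruction.h"
#include "llvm/LLVMContext.h"
#include <cassert>
using namespace llvm;

void checkBinOpConstants(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);

  // X + 0 == X and 0 + X == X, so Add's identity is 0.
  assert(ConstantExpr::getBinOpIdentity(Instruction::Add, I32)
             ->isNullValue());

  // X * 0 == 0 and 0 * X == 0, so Mul's absorber is 0.
  assert(ConstantExpr::getBinOpAbsorber(Instruction::Mul, I32)
             ->isNullValue());

  // Sub has no two-sided identity (0 - X != X), so null is expected.
  assert(ConstantExpr::getBinOpIdentity(Instruction::Sub, I32) == 0);
}
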
diff --git a/include/llvm/Analysis/DIBuilder.h b/include/llvm/DIBuilder.h
index 2d109cd..2ed48a9 100644
--- a/include/llvm/Analysis/DIBuilder.h
+++ b/include/llvm/DIBuilder.h
@@ -1,4 +1,4 @@
-//===--- llvm/Analysis/DIBuilder.h - Debug Information Builder --*- C++ -*-===//
+//===--- llvm/DIBuilder.h - Debug Information Builder -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -127,8 +127,8 @@ namespace llvm {
StringRef Name = StringRef());
/// createReferenceType - Create debugging information entry for a c++
- /// style reference.
- DIType createReferenceType(DIType RTy);
+ /// style reference or rvalue reference type.
+ DIType createReferenceType(unsigned Tag, DIType RTy);
/// createTypedef - Create debugging information entry for a typedef.
/// @param Ty Original type.
@@ -177,7 +177,7 @@ namespace llvm {
/// @param OffsetInBits Member offset.
/// @param Flags Flags to encode member attribute, e.g. private
/// @param Ty Parent type.
- /// @param PropertyName Name of the Objective C property assoicated with
+ /// @param PropertyName Name of the Objective C property associated with
/// this ivar.
/// @param GetterName Name of the Objective C property getter selector.
/// @param SetterName Name of the Objective C property setter selector.
@@ -218,11 +218,11 @@ namespace llvm {
/// @param PropertyAttributes Objective C property attributes.
/// @param Ty Type.
DIObjCProperty createObjCProperty(StringRef Name,
- DIFile File, unsigned LineNumber,
- StringRef GetterName,
- StringRef SetterName,
- unsigned PropertyAttributes,
- DIType Ty);
+ DIFile File, unsigned LineNumber,
+ StringRef GetterName,
+ StringRef SetterName,
+ unsigned PropertyAttributes,
+ DIType Ty);
/// createClassType - Create debugging information entry for a class.
/// @param Scope Scope in which this class is defined.
@@ -329,10 +329,12 @@ namespace llvm {
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param Elements Enumeration elements.
+ /// @param Flags Flags (e.g. forward decl)
DIType createEnumerationType(DIDescriptor Scope, StringRef Name,
DIFile File, unsigned LineNumber,
- uint64_t SizeInBits,
- uint64_t AlignInBits, DIArray Elements);
+ uint64_t SizeInBits, uint64_t AlignInBits,
+ DIArray Elements, DIType ClassType,
+ unsigned Flags);
/// createSubroutineType - Create subroutine type.
/// @param File File in which this subroutine is defined.
@@ -348,8 +350,8 @@ namespace llvm {
DIType createTemporaryType(DIFile F);
/// createForwardDecl - Create a temporary forward-declared type.
- DIType createForwardDecl(unsigned Tag, StringRef Name, DIFile F,
- unsigned Line, unsigned RuntimeLang = 0);
+ DIType createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
+ DIFile F, unsigned Line, unsigned RuntimeLang = 0);
/// retainType - Retain DIType in a module even if it is not referenced
/// through debug info anchors.
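
A sketch of the widened createReferenceType; DIB and ElemTy stand for a
DIBuilder and element type set up elsewhere.

#include "llvm/DIBuilder.h"
#include "llvm/Support/Dwarf.h"

llvm::DIType makeReferenceTypes(llvm::DIBuilder &DIB, llvm::DIType ElemTy) {
  // The new Tag parameter selects the reference flavor.
  llvm::DIType LRef =
      DIB.createReferenceType(llvm::dwarf::DW_TAG_reference_type, ElemTy);
  llvm::DIType RRef =
      DIB.createReferenceType(llvm::dwarf::DW_TAG_rvalue_reference_type,
                              ElemTy);
  (void)LRef;
  return RRef;
}
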
diff --git a/include/llvm/Analysis/DebugInfo.h b/include/llvm/DebugInfo.h
index 894c542..618220f 100644
--- a/include/llvm/Analysis/DebugInfo.h
+++ b/include/llvm/DebugInfo.h
@@ -46,8 +46,8 @@ namespace llvm {
class DIObjCProperty;
/// DIDescriptor - A thin wraper around MDNode to access encoded debug info.
- /// This should not be stored in a container, because underly MDNode may
- /// change in certain situations.
+ /// This should not be stored in a container, because the underlying MDNode
+ /// may change in certain situations.
class DIDescriptor {
public:
enum {
@@ -104,12 +104,6 @@ namespace llvm {
return getUnsignedField(0) & ~LLVMDebugVersionMask;
}
- /// print - print descriptor.
- void print(raw_ostream &OS) const;
-
- /// dump - print descriptor to dbgs() with a newline.
- void dump() const;
-
bool isDerivedType() const;
bool isCompositeType() const;
bool isBasicType() const;
@@ -130,10 +124,18 @@ namespace llvm {
bool isTemplateTypeParameter() const;
bool isTemplateValueParameter() const;
bool isObjCProperty() const;
+
+ /// print - print descriptor.
+ void print(raw_ostream &OS) const;
+
+ /// dump - print descriptor to dbgs() with a newline.
+ void dump() const;
};
/// DISubrange - This is used to represent ranges, for array bounds.
class DISubrange : public DIDescriptor {
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DISubrange(const MDNode *N = 0) : DIDescriptor(N) {}
@@ -155,10 +157,11 @@ namespace llvm {
/// DIScope - A base class for various scopes.
class DIScope : public DIDescriptor {
- virtual void anchor();
+ protected:
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DIScope(const MDNode *N = 0) : DIDescriptor (N) {}
- virtual ~DIScope() {}
StringRef getFilename() const;
StringRef getDirectory() const;
@@ -166,7 +169,8 @@ namespace llvm {
/// DICompileUnit - A wrapper for a compile unit.
class DICompileUnit : public DIScope {
- virtual void anchor();
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DICompileUnit(const MDNode *N = 0) : DIScope(N) {}
@@ -196,17 +200,12 @@ namespace llvm {
/// Verify - Verify that a compile unit is well formed.
bool Verify() const;
-
- /// print - print compile unit.
- void print(raw_ostream &OS) const;
-
- /// dump - print compile unit to dbgs() with a newline.
- void dump() const;
};
/// DIFile - This is a wrapper for a file.
class DIFile : public DIScope {
- virtual void anchor();
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const {} // FIXME: Output something?
public:
explicit DIFile(const MDNode *N = 0) : DIScope(N) {
if (DbgNode && !isFile())
@@ -224,6 +223,8 @@ namespace llvm {
/// FIXME: it seems strange that this doesn't have either a reference to the
/// type/precision or a file/line pair for location info.
class DIEnumerator : public DIDescriptor {
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DIEnumerator(const MDNode *N = 0) : DIDescriptor(N) {}
@@ -235,19 +236,17 @@ namespace llvm {
/// FIXME: Types should be factored much better so that CV qualifiers and
/// others do not require a huge and empty descriptor full of zeros.
class DIType : public DIScope {
- virtual void anchor();
protected:
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
// This ctor is used when the Tag has already been validated by a derived
// ctor.
DIType(const MDNode *N, bool, bool) : DIScope(N) {}
-
public:
-
/// Verify - Verify that a type descriptor is well formed.
bool Verify() const;
explicit DIType(const MDNode *N);
explicit DIType() {}
- virtual ~DIType() {}
DIScope getContext() const { return getFieldAs<DIScope>(1); }
StringRef getName() const { return getStringField(2); }
@@ -314,17 +313,10 @@ namespace llvm {
/// this descriptor.
void replaceAllUsesWith(DIDescriptor &D);
void replaceAllUsesWith(MDNode *D);
-
- /// print - print type.
- void print(raw_ostream &OS) const;
-
- /// dump - print type to dbgs() with a newline.
- void dump() const;
};
/// DIBasicType - A basic type, like 'int' or 'float'.
class DIBasicType : public DIType {
- virtual void anchor();
public:
explicit DIBasicType(const MDNode *N = 0) : DIType(N) {}
@@ -332,18 +324,13 @@ namespace llvm {
/// Verify - Verify that a basic type descriptor is well formed.
bool Verify() const;
-
- /// print - print basic type.
- void print(raw_ostream &OS) const;
-
- /// dump - print basic type to dbgs() with a newline.
- void dump() const;
};
/// DIDerivedType - A simple derived type, like a const qualified type,
/// a typedef, a pointer or reference, etc.
class DIDerivedType : public DIType {
- virtual void anchor();
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
protected:
explicit DIDerivedType(const MDNode *N, bool, bool)
: DIType(N, true, true) {}
@@ -401,19 +388,14 @@ namespace llvm {
/// Verify - Verify that a derived type descriptor is well formed.
bool Verify() const;
-
- /// print - print derived type.
- void print(raw_ostream &OS) const;
-
- /// dump - print derived type to dbgs() with a newline.
- void dump() const;
};
/// DICompositeType - This descriptor holds a type that can refer to multiple
/// other types, like a function or struct.
/// FIXME: Why is this a DIDerivedType??
class DICompositeType : public DIDerivedType {
- virtual void anchor();
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DICompositeType(const MDNode *N = 0)
: DIDerivedType(N, true, true) {
@@ -430,12 +412,6 @@ namespace llvm {
/// Verify - Verify that a composite type descriptor is well formed.
bool Verify() const;
-
- /// print - print composite type.
- void print(raw_ostream &OS) const;
-
- /// dump - print composite type to dbgs() with a newline.
- void dump() const;
};
/// DITemplateTypeParameter - This is a wrapper for template type parameter.
@@ -477,7 +453,8 @@ namespace llvm {
/// DISubprogram - This is a wrapper for a subprogram (e.g. a function).
class DISubprogram : public DIScope {
- virtual void anchor();
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DISubprogram(const MDNode *N = 0) : DIScope(N) {}
@@ -576,12 +553,6 @@ namespace llvm {
/// Verify - Verify that a subprogram descriptor is well formed.
bool Verify() const;
- /// print - print subprogram.
- void print(raw_ostream &OS) const;
-
- /// dump - print subprogram to dbgs() with a newline.
- void dump() const;
-
/// describes - Return true if this subprogram provides debugging
/// information for the function F.
bool describes(const Function *F);
@@ -597,6 +568,8 @@ namespace llvm {
/// DIGlobalVariable - This is a wrapper for a global variable.
class DIGlobalVariable : public DIDescriptor {
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DIGlobalVariable(const MDNode *N = 0) : DIDescriptor(N) {}
@@ -634,17 +607,13 @@ namespace llvm {
/// Verify - Verify that a global variable descriptor is well formed.
bool Verify() const;
-
- /// print - print global variable.
- void print(raw_ostream &OS) const;
-
- /// dump - print global variable to dbgs() with a newline.
- void dump() const;
};
/// DIVariable - This is a wrapper for a variable (e.g. parameter, local,
/// global etc).
class DIVariable : public DIDescriptor {
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DIVariable(const MDNode *N = 0)
: DIDescriptor(N) {}
@@ -706,18 +675,11 @@ namespace llvm {
/// information for an inlined function arguments.
bool isInlinedFnArgument(const Function *CurFn);
- /// print - print variable.
- void print(raw_ostream &OS) const;
-
void printExtendedName(raw_ostream &OS) const;
-
- /// dump - print variable to dbgs() with a newline.
- void dump() const;
};
/// DILexicalBlock - This is a wrapper for a lexical block.
class DILexicalBlock : public DIScope {
- virtual void anchor();
public:
explicit DILexicalBlock(const MDNode *N = 0) : DIScope(N) {}
DIScope getContext() const { return getFieldAs<DIScope>(1); }
@@ -736,7 +698,6 @@ namespace llvm {
/// DILexicalBlockFile - This is a wrapper for a lexical block with
/// a filename change.
class DILexicalBlockFile : public DIScope {
- virtual void anchor();
public:
explicit DILexicalBlockFile(const MDNode *N = 0) : DIScope(N) {}
DIScope getContext() const { return getScope().getContext(); }
@@ -756,7 +717,6 @@ namespace llvm {
/// DINameSpace - A wrapper for a C++ style name space.
class DINameSpace : public DIScope {
- virtual void anchor();
public:
explicit DINameSpace(const MDNode *N = 0) : DIScope(N) {}
DIScope getContext() const { return getFieldAs<DIScope>(1); }
@@ -794,6 +754,8 @@ namespace llvm {
};
class DIObjCProperty : public DIDescriptor {
+ friend class DIDescriptor;
+ void printInternal(raw_ostream &OS) const;
public:
explicit DIObjCProperty(const MDNode *N) : DIDescriptor(N) { }
@@ -830,12 +792,6 @@ namespace llvm {
/// Verify - Verify that a derived type descriptor is well formed.
bool Verify() const;
-
- /// print - print derived type.
- void print(raw_ostream &OS) const;
-
- /// dump - print derived type to dbgs() with a newline.
- void dump() const;
};
/// getDISubprogram - Find subprogram that is enclosing this scope.
diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
index 64f80c5..cfdeb46 100644
--- a/include/llvm/DebugInfo/DIContext.h
+++ b/include/llvm/DebugInfo/DIContext.h
@@ -15,9 +15,9 @@
#ifndef LLVM_DEBUGINFO_DICONTEXT_H
#define LLVM_DEBUGINFO_DICONTEXT_H
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
-#include <cstring>
namespace llvm {
@@ -25,27 +25,52 @@ class raw_ostream;
/// DILineInfo - a format-neutral container for source line information.
class DILineInfo {
- const char *FileName;
+ SmallString<16> FileName;
+ SmallString<16> FunctionName;
uint32_t Line;
uint32_t Column;
public:
- DILineInfo() : FileName("<invalid>"), Line(0), Column(0) {}
- DILineInfo(const char *fileName, uint32_t line, uint32_t column)
- : FileName(fileName), Line(line), Column(column) {}
+ DILineInfo()
+ : FileName("<invalid>"), FunctionName("<invalid>"),
+ Line(0), Column(0) {}
+ DILineInfo(const SmallString<16> &fileName,
+ const SmallString<16> &functionName,
+ uint32_t line, uint32_t column)
+ : FileName(fileName), FunctionName(functionName),
+ Line(line), Column(column) {}
- const char *getFileName() const { return FileName; }
+ const char *getFileName() { return FileName.c_str(); }
+ const char *getFunctionName() { return FunctionName.c_str(); }
uint32_t getLine() const { return Line; }
uint32_t getColumn() const { return Column; }
bool operator==(const DILineInfo &RHS) const {
return Line == RHS.Line && Column == RHS.Column &&
- std::strcmp(FileName, RHS.FileName) == 0;
+ FileName.equals(RHS.FileName) &&
+ FunctionName.equals(RHS.FunctionName);
}
bool operator!=(const DILineInfo &RHS) const {
return !(*this == RHS);
}
};
+/// DILineInfoSpecifier - controls which fields of the DILineInfo container
+/// should be filled with data.
+class DILineInfoSpecifier {
+ const uint32_t Flags; // Or'ed flags that set the info we want to fetch.
+public:
+ enum Specification {
+ FileLineInfo = 1 << 0,
+ AbsoluteFilePath = 1 << 1,
+ FunctionName = 1 << 2
+ };
+ // Use file/line info by default.
+ DILineInfoSpecifier(uint32_t flags = FileLineInfo) : Flags(flags) {}
+ bool needs(Specification spec) const {
+ return (Flags & spec) > 0;
+ }
+};
+
class DIContext {
public:
virtual ~DIContext();
@@ -60,7 +85,8 @@ public:
virtual void dump(raw_ostream &OS) = 0;
- virtual DILineInfo getLineInfoForAddress(uint64_t address) = 0;
+ virtual DILineInfo getLineInfoForAddress(uint64_t address,
+ DILineInfoSpecifier specifier = DILineInfoSpecifier()) = 0;
};
}
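With the specifier in place, a symbolizer-style client can opt into function names in addition to file/line data. A hedged sketch (Ctx and Address are assumptions, not part of this patch):

    DILineInfoSpecifier Spec(DILineInfoSpecifier::FileLineInfo |
                             DILineInfoSpecifier::FunctionName);
    DILineInfo Info = Ctx->getLineInfoForAddress(Address, Spec);
    // Fields that were not requested keep their "<invalid>"/0 defaults.
    const char *Fn   = Info.getFunctionName();
    const char *File = Info.getFileName();
    uint32_t    Line = Info.getLine();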
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index e920e98..ae8b68d 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -354,7 +354,7 @@ public:
/// variable, possibly emitting it to memory if needed. This is used by the
/// Emitter.
virtual void *getOrEmitGlobalVariable(const GlobalVariable *GV) {
- return getPointerToGlobal((GlobalValue*)GV);
+ return getPointerToGlobal((const GlobalValue *)GV);
}
/// Registers a listener to be called back on various events within
diff --git a/include/llvm/ExecutionEngine/Interpreter.h b/include/llvm/ExecutionEngine/Interpreter.h
index 7425cdb..72d97ef 100644
--- a/include/llvm/ExecutionEngine/Interpreter.h
+++ b/include/llvm/ExecutionEngine/Interpreter.h
@@ -23,7 +23,7 @@ extern "C" void LLVMLinkInInterpreter();
namespace {
struct ForceInterpreterLinking {
ForceInterpreterLinking() {
- // We must reference the passes in such a way that compilers will not
+ // We must reference the interpreter in such a way that compilers will not
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
diff --git a/include/llvm/ExecutionEngine/JIT.h b/include/llvm/ExecutionEngine/JIT.h
index 6013db4..b4cda1d 100644
--- a/include/llvm/ExecutionEngine/JIT.h
+++ b/include/llvm/ExecutionEngine/JIT.h
@@ -23,7 +23,7 @@ extern "C" void LLVMLinkInJIT();
namespace {
struct ForceJITLinking {
ForceJITLinking() {
- // We must reference the passes in such a way that compilers will not
+ // We must reference JIT in such a way that compilers will not
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
diff --git a/include/llvm/ExecutionEngine/MCJIT.h b/include/llvm/ExecutionEngine/MCJIT.h
index f956a50..ac16bdc 100644
--- a/include/llvm/ExecutionEngine/MCJIT.h
+++ b/include/llvm/ExecutionEngine/MCJIT.h
@@ -23,7 +23,7 @@ extern "C" void LLVMLinkInMCJIT();
namespace {
struct ForceMCJITLinking {
ForceMCJITLinking() {
- // We must reference the passes in such a way that compilers will not
+ // We must reference MCJIT in such a way that compilers will not
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 54c28f3..a5c9272 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -65,12 +65,15 @@ public:
RuntimeDyld(RTDyldMemoryManager*);
~RuntimeDyld();
+ /// Load an in-memory object file into the dynamic linker.
bool loadObject(MemoryBuffer *InputBuffer);
- // Get the address of our local copy of the symbol. This may or may not
- // be the address used for relocation (clients can copy the data around
- // and resolve relocatons based on where they put it).
+
+ /// Get the address of our local copy of the symbol. This may or may not
+ /// be the address used for relocation (clients can copy the data around
+  /// and resolve relocations based on where they put it.
void *getSymbolAddress(StringRef Name);
- // Resolve the relocations for all symbols we currently know about.
+
+ /// Resolve the relocations for all symbols we currently know about.
void resolveRelocations();
/// mapSectionAddress - map a section to its target address space value.
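Taken together, the newly documented methods describe the whole dynamic-linking round trip. A hedged sketch (MemMgr and ObjBuffer are assumptions):

    RuntimeDyld Dyld(MemMgr);                   // RTDyldMemoryManager*
    Dyld.loadObject(ObjBuffer);                 // in-memory MemoryBuffer*
    Dyld.resolveRelocations();                  // fix up all known symbols
    void *Addr = Dyld.getSymbolAddress("foo");  // address of our local copy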
diff --git a/include/llvm/Function.h b/include/llvm/Function.h
index e17cd87..fdd90d1 100644
--- a/include/llvm/Function.h
+++ b/include/llvm/Function.h
@@ -420,8 +420,8 @@ public:
void dropAllReferences();
/// hasAddressTaken - returns true if there are any uses of this function
- /// other than direct calls or invokes to it. Optionally passes back the
- /// offending user for diagnostic purposes.
+ /// other than direct calls or invokes to it, or blockaddress expressions.
+ /// Optionally passes back an offending user for diagnostic purposes.
///
bool hasAddressTaken(const User** = 0) const;
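The reworded contract makes blockaddress users count as address-taking, so functions referenced only as indirectbr targets now return true here. A hedged sketch (F is an assumption):

    const User *Offender = 0;
    if (F->hasAddressTaken(&Offender))
      // Offender may now be a blockaddress expression as well as an
      // indirect call or invoke user.
      errs() << "address taken by: " << *Offender << "\n";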
diff --git a/include/llvm/GlobalValue.h b/include/llvm/GlobalValue.h
index 81a11a4..8b969f3 100644
--- a/include/llvm/GlobalValue.h
+++ b/include/llvm/GlobalValue.h
@@ -164,6 +164,12 @@ public:
return Linkage == CommonLinkage;
}
+ /// isDiscardableIfUnused - Whether the definition of this global may be
+ /// discarded if it is not used in its compilation unit.
+ static bool isDiscardableIfUnused(LinkageTypes Linkage) {
+ return isLinkOnceLinkage(Linkage) || isLocalLinkage(Linkage);
+ }
+
/// mayBeOverridden - Whether the definition of this global may be replaced
/// by something non-equivalent at link time. For example, if a function has
/// weak linkage then the code defining it may be replaced by different code.
@@ -221,6 +227,10 @@ public:
void setLinkage(LinkageTypes LT) { Linkage = LT; }
LinkageTypes getLinkage() const { return Linkage; }
+ bool isDiscardableIfUnused() const {
+ return isDiscardableIfUnused(Linkage);
+ }
+
bool mayBeOverridden() const { return mayBeOverridden(Linkage); }
bool isWeakForLinker() const { return isWeakForLinker(Linkage); }
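A hedged sketch of how a cleanup pass might use the new predicate (GV is an assumed GlobalVariable*):

    // linkonce and local definitions may be dropped once unreferenced.
    if (GV->use_empty() && GV->isDiscardableIfUnused())
      GV->eraseFromParent();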
diff --git a/include/llvm/GlobalVariable.h b/include/llvm/GlobalVariable.h
index 034ade1..99b7a73 100644
--- a/include/llvm/GlobalVariable.h
+++ b/include/llvm/GlobalVariable.h
@@ -41,24 +41,35 @@ class GlobalVariable : public GlobalValue, public ilist_node<GlobalVariable> {
void setParent(Module *parent);
bool isConstantGlobal : 1; // Is this a global constant?
- bool isThreadLocalSymbol : 1; // Is this symbol "Thread Local"?
+ unsigned threadLocalMode : 3; // Is this symbol "Thread Local",
+                                // and, if so, what is the desired model?
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
return User::operator new(s, 1);
}
+
+ enum ThreadLocalMode {
+ NotThreadLocal = 0,
+ GeneralDynamicTLSModel,
+ LocalDynamicTLSModel,
+ InitialExecTLSModel,
+ LocalExecTLSModel
+ };
+
/// GlobalVariable ctor - If a parent module is specified, the global is
/// automatically inserted into the end of the specified modules global list.
GlobalVariable(Type *Ty, bool isConstant, LinkageTypes Linkage,
Constant *Initializer = 0, const Twine &Name = "",
- bool ThreadLocal = false, unsigned AddressSpace = 0);
+ ThreadLocalMode = NotThreadLocal, unsigned AddressSpace = 0);
/// GlobalVariable ctor - This creates a global and inserts it before the
/// specified other global.
GlobalVariable(Module &M, Type *Ty, bool isConstant,
LinkageTypes Linkage, Constant *Initializer,
- const Twine &Name,
- GlobalVariable *InsertBefore = 0, bool ThreadLocal = false,
+ const Twine &Name = "",
+ GlobalVariable *InsertBefore = 0,
+ ThreadLocalMode = NotThreadLocal,
unsigned AddressSpace = 0);
~GlobalVariable() {
@@ -135,8 +146,14 @@ public:
void setConstant(bool Val) { isConstantGlobal = Val; }
/// If the value is "Thread Local", its value isn't shared by the threads.
- bool isThreadLocal() const { return isThreadLocalSymbol; }
- void setThreadLocal(bool Val) { isThreadLocalSymbol = Val; }
+ bool isThreadLocal() const { return threadLocalMode != NotThreadLocal; }
+ void setThreadLocal(bool Val) {
+ threadLocalMode = Val ? GeneralDynamicTLSModel : NotThreadLocal;
+ }
+ void setThreadLocalMode(ThreadLocalMode Val) { threadLocalMode = Val; }
+ ThreadLocalMode getThreadLocalMode() const {
+ return static_cast<ThreadLocalMode>(threadLocalMode);
+ }
/// copyAttributesFrom - copy all additional attributes (those not needed to
/// create a GlobalVariable) from the GlobalVariable Src to this one.
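Under the widened constructor the TLS model is chosen up front instead of the old boolean flag. A hedged sketch (M, Ty, and Init are assumptions):

    GlobalVariable *TLV =
        new GlobalVariable(M, Ty, /*isConstant=*/false,
                           GlobalValue::ExternalLinkage, Init, "counter",
                           /*InsertBefore=*/0,
                           GlobalVariable::InitialExecTLSModel);
    // isThreadLocal() is now derived from the mode, and
    // getThreadLocalMode() returns InitialExecTLSModel here.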
diff --git a/include/llvm/Support/IRBuilder.h b/include/llvm/IRBuilder.h
index ef00e8e..d5b6f47 100644
--- a/include/llvm/Support/IRBuilder.h
+++ b/include/llvm/IRBuilder.h
@@ -1,4 +1,4 @@
-//===---- llvm/Support/IRBuilder.h - Builder for LLVM Instrs ----*- C++ -*-===//
+//===---- llvm/IRBuilder.h - Builder for LLVM Instructions ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_IRBUILDER_H
-#define LLVM_SUPPORT_IRBUILDER_H
+#ifndef LLVM_IRBUILDER_H
+#define LLVM_IRBUILDER_H
#include "llvm/Instructions.h"
#include "llvm/BasicBlock.h"
@@ -411,6 +411,17 @@ public:
// Instruction creation methods: Terminators
//===--------------------------------------------------------------------===//
+private:
+ /// \brief Helper to add branch weight metadata onto an instruction.
+ /// \returns The annotated instruction.
+ template <typename InstTy>
+ InstTy *addBranchWeights(InstTy *I, MDNode *Weights) {
+ if (Weights)
+ I->setMetadata(LLVMContext::MD_prof, Weights);
+ return I;
+ }
+
+public:
/// CreateRetVoid - Create a 'ret void' instruction.
ReturnInst *CreateRetVoid() {
return Insert(ReturnInst::Create(Context));
@@ -444,15 +455,19 @@ public:
/// CreateCondBr - Create a conditional 'br Cond, TrueDest, FalseDest'
/// instruction.
- BranchInst *CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False) {
- return Insert(BranchInst::Create(True, False, Cond));
+ BranchInst *CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False,
+ MDNode *BranchWeights = 0) {
+ return Insert(addBranchWeights(BranchInst::Create(True, False, Cond),
+ BranchWeights));
}
/// CreateSwitch - Create a switch instruction with the specified value,
/// default dest, and with a hint for the number of cases that will be added
/// (for efficient allocation).
- SwitchInst *CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases = 10) {
- return Insert(SwitchInst::Create(V, Dest, NumCases));
+ SwitchInst *CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases = 10,
+ MDNode *BranchWeights = 0) {
+ return Insert(addBranchWeights(SwitchInst::Create(V, Dest, NumCases),
+ BranchWeights));
}
/// CreateIndirectBr - Create an indirect branch instruction with the
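The new trailing parameter threads !prof metadata through the builder. A hedged sketch using MDBuilder, which is available at this revision (Builder, Cond, TBB, and FBB are assumptions):

    MDBuilder MDB(Builder.getContext());
    MDNode *Weights = MDB.createBranchWeights(95, 5); // taken : not taken
    Builder.CreateCondBr(Cond, TBB, FBB, Weights);    // attaches MD_prof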
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 33d2043..de97957 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -71,6 +71,7 @@ void initializeBasicCallGraphPass(PassRegistry&);
void initializeBlockExtractorPassPass(PassRegistry&);
void initializeBlockFrequencyInfoPass(PassRegistry&);
void initializeBlockPlacementPass(PassRegistry&);
+void initializeBoundsCheckingPass(PassRegistry&);
void initializeBranchFolderPassPass(PassRegistry&);
void initializeBranchProbabilityInfoPass(PassRegistry&);
void initializeBreakCriticalEdgesPass(PassRegistry&);
@@ -99,6 +100,7 @@ void initializeDomPrinterPass(PassRegistry&);
void initializeDomViewerPass(PassRegistry&);
void initializeDominanceFrontierPass(PassRegistry&);
void initializeDominatorTreePass(PassRegistry&);
+void initializeEarlyIfConverterPass(PassRegistry&);
void initializeEdgeBundlesPass(PassRegistry&);
void initializeEdgeProfilerPass(PassRegistry&);
void initializeExpandPostRAPass(PassRegistry&);
@@ -135,6 +137,7 @@ void initializeLibCallAliasAnalysisPass(PassRegistry&);
void initializeLintPass(PassRegistry&);
void initializeLiveDebugVariablesPass(PassRegistry&);
void initializeLiveIntervalsPass(PassRegistry&);
+void initializeLiveRegMatrixPass(PassRegistry&);
void initializeLiveStacksPass(PassRegistry&);
void initializeLiveVariablesPass(PassRegistry&);
void initializeLoaderPassPass(PassRegistry&);
@@ -169,6 +172,7 @@ void initializeMachineLoopRangesPass(PassRegistry&);
void initializeMachineModuleInfoPass(PassRegistry&);
void initializeMachineSchedulerPass(PassRegistry&);
void initializeMachineSinkingPass(PassRegistry&);
+void initializeMachineTraceMetricsPass(PassRegistry&);
void initializeMachineVerifierPassPass(PassRegistry&);
void initializeMemCpyOptPass(PassRegistry&);
void initializeMemDepPrinterPass(PassRegistry&);
@@ -214,7 +218,6 @@ void initializeRegionOnlyPrinterPass(PassRegistry&);
void initializeRegionOnlyViewerPass(PassRegistry&);
void initializeRegionPrinterPass(PassRegistry&);
void initializeRegionViewerPass(PassRegistry&);
-void initializeRenderMachineFunctionPass(PassRegistry&);
void initializeSCCPPass(PassRegistry&);
void initializeSROA_DTPass(PassRegistry&);
void initializeSROA_SSAUpPass(PassRegistry&);
@@ -247,10 +250,12 @@ void initializeUnreachableBlockElimPass(PassRegistry&);
void initializeUnreachableMachineBlockElimPass(PassRegistry&);
void initializeVerifierPass(PassRegistry&);
void initializeVirtRegMapPass(PassRegistry&);
+void initializeVirtRegRewriterPass(PassRegistry&);
void initializeInstSimplifierPass(PassRegistry&);
void initializeUnpackMachineBundlesPass(PassRegistry&);
void initializeFinalizeMachineBundlesPass(PassRegistry&);
void initializeBBVectorizePass(PassRegistry&);
+void initializeMachineFunctionPrinterPassPass(PassRegistry&);
}
#endif
diff --git a/include/llvm/Instruction.h b/include/llvm/Instruction.h
index 9c5ac44..5512dcc 100644
--- a/include/llvm/Instruction.h
+++ b/include/llvm/Instruction.h
@@ -215,6 +215,27 @@ public:
bool isCommutative() const { return isCommutative(getOpcode()); }
static bool isCommutative(unsigned op);
+ /// isIdempotent - Return true if the instruction is idempotent:
+ ///
+ /// Idempotent operators satisfy: x op x === x
+ ///
+ /// In LLVM, the And and Or operators are idempotent.
+ ///
+ bool isIdempotent() const { return isIdempotent(getOpcode()); }
+ static bool isIdempotent(unsigned op);
+
+ /// isNilpotent - Return true if the instruction is nilpotent:
+ ///
+ /// Nilpotent operators satisfy: x op x === Id,
+ ///
+ /// where Id is the identity for the operator, i.e. a constant such that
+ /// x op Id === x and Id op x === x for all x.
+ ///
+ /// In LLVM, the Xor operator is nilpotent.
+ ///
+ bool isNilpotent() const { return isNilpotent(getOpcode()); }
+ static bool isNilpotent(unsigned op);
+
/// mayWriteToMemory - Return true if this instruction may modify memory.
///
bool mayWriteToMemory() const;
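A hedged sketch of the algebra these predicates encode (BO is an assumed BinaryOperator* whose two operands are equal):

    if (BO->isIdempotent())       // And, Or:  x op x == x
      BO->replaceAllUsesWith(BO->getOperand(0));
    else if (BO->isNilpotent())   // Xor:      x op x == identity, i.e. 0
      BO->replaceAllUsesWith(Constant::getNullValue(BO->getType()));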
@@ -260,6 +281,16 @@ public:
/// ignores the SubclassOptionalData flags, which specify conditions
/// under which the instruction's result is undefined.
bool isIdenticalToWhenDefined(const Instruction *I) const;
+
+ /// When checking for operation equivalence (using isSameOperationAs) it is
+ /// sometimes useful to ignore certain attributes.
+ enum OperationEquivalenceFlags {
+ /// Check for equivalence ignoring load/store alignment.
+ CompareIgnoringAlignment = 1<<0,
+ /// Check for equivalence treating a type and a vector of that type
+ /// as equivalent.
+ CompareUsingScalarTypes = 1<<1
+ };
/// This function determines if the specified instruction executes the same
/// operation as the current one. This means that the opcodes, type, operand
@@ -269,7 +300,7 @@ public:
/// @returns true if the specified instruction is the same operation as
/// the current one.
/// @brief Determine if one instruction is the same operation as another.
- bool isSameOperationAs(const Instruction *I) const;
+ bool isSameOperationAs(const Instruction *I, unsigned flags = 0) const;
/// isUsedOutsideOfBlock - Return true if there are any uses of this
/// instruction in blocks other than the specified block. Note that PHI nodes
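A hedged sketch of the new flags parameter (LA and LB are assumed load instructions):

    // Treat two loads as the same operation even if their alignments differ.
    bool Same = LA->isSameOperationAs(LB,
                                      Instruction::CompareIgnoringAlignment);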
diff --git a/include/llvm/Instructions.h b/include/llvm/Instructions.h
index f6eaf04..f5187e6 100644
--- a/include/llvm/Instructions.h
+++ b/include/llvm/Instructions.h
@@ -20,6 +20,8 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Attributes.h"
#include "llvm/CallingConv.h"
+#include "llvm/Support/IntegersSubset.h"
+#include "llvm/Support/IntegersSubsetMapping.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ErrorHandling.h"
@@ -699,7 +701,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(AtomicRMWInst, Value)
// checkGEPType - Simple wrapper function to give a better assertion failure
// message on bad indexes for a gep instruction.
//
-static inline Type *checkGEPType(Type *Ty) {
+inline Type *checkGEPType(Type *Ty) {
assert(Ty && "Invalid GetElementPtrInst indices for type!");
return Ty;
}
@@ -1265,6 +1267,11 @@ public:
/// removeAttribute - removes the attribute from the list of attributes.
void removeAttribute(unsigned i, Attributes attr);
+ /// \brief Return true if this call has the given attribute.
+ bool hasFnAttr(Attributes N) const {
+ return paramHasAttr(~0, N);
+ }
+
/// @brief Determine whether the call or the callee has the given attribute.
bool paramHasAttr(unsigned i, Attributes attr) const;
@@ -1274,7 +1281,7 @@ public:
}
/// @brief Return true if the call should not be inlined.
- bool isNoInline() const { return paramHasAttr(~0, Attribute::NoInline); }
+ bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
void setIsNoInline(bool Value = true) {
if (Value) addAttribute(~0, Attribute::NoInline);
else removeAttribute(~0, Attribute::NoInline);
@@ -1282,7 +1289,7 @@ public:
/// @brief Return true if the call can return twice
bool canReturnTwice() const {
- return paramHasAttr(~0, Attribute::ReturnsTwice);
+ return hasFnAttr(Attribute::ReturnsTwice);
}
void setCanReturnTwice(bool Value = true) {
if (Value) addAttribute(~0, Attribute::ReturnsTwice);
@@ -1291,7 +1298,7 @@ public:
/// @brief Determine if the call does not access memory.
bool doesNotAccessMemory() const {
- return paramHasAttr(~0, Attribute::ReadNone);
+ return hasFnAttr(Attribute::ReadNone);
}
void setDoesNotAccessMemory(bool NotAccessMemory = true) {
if (NotAccessMemory) addAttribute(~0, Attribute::ReadNone);
@@ -1300,7 +1307,7 @@ public:
/// @brief Determine if the call does not access or only reads memory.
bool onlyReadsMemory() const {
- return doesNotAccessMemory() || paramHasAttr(~0, Attribute::ReadOnly);
+ return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
}
void setOnlyReadsMemory(bool OnlyReadsMemory = true) {
if (OnlyReadsMemory) addAttribute(~0, Attribute::ReadOnly);
@@ -1308,14 +1315,14 @@ public:
}
/// @brief Determine if the call cannot return.
- bool doesNotReturn() const { return paramHasAttr(~0, Attribute::NoReturn); }
+ bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
void setDoesNotReturn(bool DoesNotReturn = true) {
if (DoesNotReturn) addAttribute(~0, Attribute::NoReturn);
else removeAttribute(~0, Attribute::NoReturn);
}
/// @brief Determine if the call cannot unwind.
- bool doesNotThrow() const { return paramHasAttr(~0, Attribute::NoUnwind); }
+ bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
void setDoesNotThrow(bool DoesNotThrow = true) {
if (DoesNotThrow) addAttribute(~0, Attribute::NoUnwind);
else removeAttribute(~0, Attribute::NoUnwind);
@@ -2237,7 +2244,7 @@ public:
/// getNumClauses - Get the number of clauses for this landing pad.
unsigned getNumClauses() const { return getNumOperands() - 1; }
- /// reserveClauses - Grow the size of the operand list to accomodate the new
+ /// reserveClauses - Grow the size of the operand list to accommodate the new
/// number of clauses.
void reserveClauses(unsigned Size) { growOperands(Size); }
@@ -2440,10 +2447,31 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
class SwitchInst : public TerminatorInst {
void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
unsigned ReservedSpace;
+ // Operands format:
// Operand[0] = Value to switch on
// Operand[1] = Default basic block destination
// Operand[2n ] = Value to match
// Operand[2n+1] = BasicBlock to go to on match
+
+  // Store the case values separately from the operands list. We do not need
+  // the User-Use concept here, since a case value is always a constant and
+  // is never reused by other instructions/values.
+  // Additionally, this lets us use a custom type for case values that is not
+  // inherited from Value. Since a case value is a complex type implementing
+  // a subset of the integers, we do not need to extract sub-constants with
+  // the slow getAggregateElement method.
+  // We use std::list for the case values for two reasons:
+  // 1. It allows cases to be added/removed without reallocating the whole
+  //    collection.
+  // 2. We rarely need random access.
+  // Currently the case values are also stored in the operands list, but they
+  // will be moved out in future commits.
+ typedef std::list<IntegersSubset> Subsets;
+ typedef Subsets::iterator SubsetsIt;
+ typedef Subsets::const_iterator SubsetsConstIt;
+
+ Subsets TheSubsets;
+
SwitchInst(const SwitchInst &SI);
void init(Value *Value, BasicBlock *Default, unsigned NumReserved);
void growOperands();
@@ -2468,121 +2496,25 @@ protected:
virtual SwitchInst *clone_impl() const;
public:
+  // FIXME: There are currently a lot of unclean template parameters here;
+  // they need to be refactored in the future.
+ // All these parameters are used to implement both iterator and const_iterator
+ // without code duplication.
+ // SwitchInstTy may be "const SwitchInst" or "SwitchInst"
+ // ConstantIntTy may be "const ConstantInt" or "ConstantInt"
+ // SubsetsItTy may be SubsetsConstIt or SubsetsIt
+ // BasicBlockTy may be "const BasicBlock" or "BasicBlock"
+ template <class SwitchInstTy, class ConstantIntTy,
+ class SubsetsItTy, class BasicBlockTy>
+ class CaseIteratorT;
+
+ typedef CaseIteratorT<const SwitchInst, const ConstantInt,
+ SubsetsConstIt, const BasicBlock> ConstCaseIt;
+ class CaseIt;
+
// -2
static const unsigned DefaultPseudoIndex = static_cast<unsigned>(~0L-1);
- template <class SwitchInstTy, class ConstantIntTy, class BasicBlockTy>
- class CaseIteratorT {
- protected:
-
- SwitchInstTy *SI;
- unsigned Index;
-
- public:
-
- typedef CaseIteratorT<SwitchInstTy, ConstantIntTy, BasicBlockTy> Self;
-
- /// Initializes case iterator for given SwitchInst and for given
- /// case number.
- CaseIteratorT(SwitchInstTy *SI, unsigned CaseNum) {
- this->SI = SI;
- Index = CaseNum;
- }
-
- /// Initializes case iterator for given SwitchInst and for given
- /// TerminatorInst's successor index.
- static Self fromSuccessorIndex(SwitchInstTy *SI, unsigned SuccessorIndex) {
- assert(SuccessorIndex < SI->getNumSuccessors() &&
- "Successor index # out of range!");
- return SuccessorIndex != 0 ?
- Self(SI, SuccessorIndex - 1) :
- Self(SI, DefaultPseudoIndex);
- }
-
- /// Resolves case value for current case.
- ConstantIntTy *getCaseValue() {
- assert(Index < SI->getNumCases() && "Index out the number of cases.");
- return reinterpret_cast<ConstantIntTy*>(SI->getOperand(2 + Index*2));
- }
-
- /// Resolves successor for current case.
- BasicBlockTy *getCaseSuccessor() {
- assert((Index < SI->getNumCases() ||
- Index == DefaultPseudoIndex) &&
- "Index out the number of cases.");
- return SI->getSuccessor(getSuccessorIndex());
- }
-
- /// Returns number of current case.
- unsigned getCaseIndex() const { return Index; }
-
- /// Returns TerminatorInst's successor index for current case successor.
- unsigned getSuccessorIndex() const {
- assert((Index == DefaultPseudoIndex || Index < SI->getNumCases()) &&
- "Index out the number of cases.");
- return Index != DefaultPseudoIndex ? Index + 1 : 0;
- }
-
- Self operator++() {
- // Check index correctness after increment.
- // Note: Index == getNumCases() means end().
- assert(Index+1 <= SI->getNumCases() && "Index out the number of cases.");
- ++Index;
- return *this;
- }
- Self operator++(int) {
- Self tmp = *this;
- ++(*this);
- return tmp;
- }
- Self operator--() {
- // Check index correctness after decrement.
- // Note: Index == getNumCases() means end().
- // Also allow "-1" iterator here. That will became valid after ++.
- assert((Index == 0 || Index-1 <= SI->getNumCases()) &&
- "Index out the number of cases.");
- --Index;
- return *this;
- }
- Self operator--(int) {
- Self tmp = *this;
- --(*this);
- return tmp;
- }
- bool operator==(const Self& RHS) const {
- assert(RHS.SI == SI && "Incompatible operators.");
- return RHS.Index == Index;
- }
- bool operator!=(const Self& RHS) const {
- assert(RHS.SI == SI && "Incompatible operators.");
- return RHS.Index != Index;
- }
- };
-
- typedef CaseIteratorT<const SwitchInst, const ConstantInt, const BasicBlock>
- ConstCaseIt;
-
- class CaseIt : public CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> {
-
- typedef CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> ParentTy;
-
- public:
-
- CaseIt(const ParentTy& Src) : ParentTy(Src) {}
- CaseIt(SwitchInst *SI, unsigned CaseNum) : ParentTy(SI, CaseNum) {}
-
- /// Sets the new value for current case.
- void setValue(ConstantInt *V) {
- assert(Index < SI->getNumCases() && "Index out the number of cases.");
- SI->setOperand(2 + Index*2, reinterpret_cast<Value*>(V));
- }
-
- /// Sets the new successor for current case.
- void setSuccessor(BasicBlock *S) {
- SI->setSuccessor(getSuccessorIndex(), S);
- }
- };
-
static SwitchInst *Create(Value *Value, BasicBlock *Default,
unsigned NumCases, Instruction *InsertBefore = 0) {
return new SwitchInst(Value, Default, NumCases, InsertBefore);
@@ -2618,23 +2550,23 @@ public:
/// Returns a read/write iterator that points to the first
/// case in SwitchInst.
CaseIt case_begin() {
- return CaseIt(this, 0);
+ return CaseIt(this, 0, TheSubsets.begin());
}
/// Returns a read-only iterator that points to the first
/// case in the SwitchInst.
ConstCaseIt case_begin() const {
- return ConstCaseIt(this, 0);
+ return ConstCaseIt(this, 0, TheSubsets.begin());
}
/// Returns a read/write iterator that points one past the last
/// in the SwitchInst.
CaseIt case_end() {
- return CaseIt(this, getNumCases());
+ return CaseIt(this, getNumCases(), TheSubsets.end());
}
/// Returns a read-only iterator that points one past the last
/// in the SwitchInst.
ConstCaseIt case_end() const {
- return ConstCaseIt(this, getNumCases());
+ return ConstCaseIt(this, getNumCases(), TheSubsets.end());
}
/// Returns an iterator that points to the default case.
/// Note: this iterator allows to resolve successor only. Attempt
@@ -2642,10 +2574,10 @@ public:
/// Also note, that increment and decrement also causes an assertion and
/// makes iterator invalid.
CaseIt case_default() {
- return CaseIt(this, DefaultPseudoIndex);
+ return CaseIt(this, DefaultPseudoIndex, TheSubsets.end());
}
ConstCaseIt case_default() const {
- return ConstCaseIt(this, DefaultPseudoIndex);
+ return ConstCaseIt(this, DefaultPseudoIndex, TheSubsets.end());
}
/// findCaseValue - Search all of the case values for the specified constant.
@@ -2654,13 +2586,13 @@ public:
/// that it is handled by the default handler.
CaseIt findCaseValue(const ConstantInt *C) {
for (CaseIt i = case_begin(), e = case_end(); i != e; ++i)
- if (i.getCaseValue() == C)
+ if (i.getCaseValueEx().isSatisfies(IntItem::fromConstantInt(C)))
return i;
return case_default();
}
ConstCaseIt findCaseValue(const ConstantInt *C) const {
for (ConstCaseIt i = case_begin(), e = case_end(); i != e; ++i)
- if (i.getCaseValue() == C)
+ if (i.getCaseValueEx().isSatisfies(IntItem::fromConstantInt(C)))
return i;
return case_default();
}
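A hedged sketch of walking the cases with the reworked iterators (SI is an assumption):

    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
         i != e; ++i) {
      BasicBlock *Dest = i.getCaseSuccessor();
      IntegersSubset &Range = i.getCaseValueEx(); // ranges, not one constant
      (void)Dest; (void)Range;
    }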
@@ -2681,10 +2613,17 @@ public:
}
/// addCase - Add an entry to the switch instruction...
+ /// @Deprecated
/// Note:
/// This action invalidates case_end(). Old case_end() iterator will
/// point to the added case.
void addCase(ConstantInt *OnVal, BasicBlock *Dest);
+
+ /// addCase - Add an entry to the switch instruction.
+ /// Note:
+ /// This action invalidates case_end(). Old case_end() iterator will
+ /// point to the added case.
+ void addCase(IntegersSubset& OnVal, BasicBlock *Dest);
/// removeCase - This method removes the specified case and its successor
/// from the switch instruction. Note that this operation may reorder the
@@ -2692,7 +2631,7 @@ public:
/// Note:
/// This action invalidates iterators for all cases following the one removed,
/// including the case_end() iterator.
- void removeCase(CaseIt i);
+ void removeCase(CaseIt& i);
unsigned getNumSuccessors() const { return getNumOperands()/2; }
BasicBlock *getSuccessor(unsigned idx) const {
@@ -2703,8 +2642,193 @@ public:
assert(idx < getNumSuccessors() && "Successor # out of range for switch!");
setOperand(idx*2+1, (Value*)NewSucc);
}
+
+ uint16_t hash() const {
+ uint32_t NumberOfCases = (uint32_t)getNumCases();
+ uint16_t Hash = (0xFFFF & NumberOfCases) ^ (NumberOfCases >> 16);
+ for (ConstCaseIt i = case_begin(), e = case_end();
+ i != e; ++i) {
+ uint32_t NumItems = (uint32_t)i.getCaseValueEx().getNumItems();
+ Hash = (Hash << 1) ^ (0xFFFF & NumItems) ^ (NumItems >> 16);
+ }
+ return Hash;
+ }
+
+ // Case iterators definition.
+
+ template <class SwitchInstTy, class ConstantIntTy,
+ class SubsetsItTy, class BasicBlockTy>
+ class CaseIteratorT {
+ protected:
+
+ SwitchInstTy *SI;
+ unsigned long Index;
+ SubsetsItTy SubsetIt;
+
+ /// Initializes case iterator for given SwitchInst and for given
+ /// case number.
+ friend class SwitchInst;
+ CaseIteratorT(SwitchInstTy *SI, unsigned SuccessorIndex,
+ SubsetsItTy CaseValueIt) {
+ this->SI = SI;
+ Index = SuccessorIndex;
+ this->SubsetIt = CaseValueIt;
+ }
+
+ public:
+ typedef typename SubsetsItTy::reference IntegersSubsetRef;
+ typedef CaseIteratorT<SwitchInstTy, ConstantIntTy,
+ SubsetsItTy, BasicBlockTy> Self;
+
+ CaseIteratorT(SwitchInstTy *SI, unsigned CaseNum) {
+ this->SI = SI;
+ Index = CaseNum;
+ SubsetIt = SI->TheSubsets.begin();
+ std::advance(SubsetIt, CaseNum);
+ }
+
+
+ /// Initializes case iterator for given SwitchInst and for given
+ /// TerminatorInst's successor index.
+ static Self fromSuccessorIndex(SwitchInstTy *SI, unsigned SuccessorIndex) {
+ assert(SuccessorIndex < SI->getNumSuccessors() &&
+ "Successor index # out of range!");
+ return SuccessorIndex != 0 ?
+ Self(SI, SuccessorIndex - 1) :
+ Self(SI, DefaultPseudoIndex);
+ }
+
+ /// Resolves case value for current case.
+ /// @Deprecated
+ ConstantIntTy *getCaseValue() {
+      assert(Index < SI->getNumCases() && "Index out of the number of cases.");
+ IntegersSubsetRef CaseRanges = *SubsetIt;
+
+ // FIXME: Currently we work with ConstantInt based cases.
+ // So return CaseValue as ConstantInt.
+ return CaseRanges.getSingleNumber(0).toConstantInt();
+ }
+
+ /// Resolves case value for current case.
+ IntegersSubsetRef getCaseValueEx() {
+      assert(Index < SI->getNumCases() && "Index out of the number of cases.");
+ return *SubsetIt;
+ }
+
+ /// Resolves successor for current case.
+ BasicBlockTy *getCaseSuccessor() {
+ assert((Index < SI->getNumCases() ||
+ Index == DefaultPseudoIndex) &&
+ "Index out the number of cases.");
+ return SI->getSuccessor(getSuccessorIndex());
+ }
+
+ /// Returns number of current case.
+ unsigned getCaseIndex() const { return Index; }
+
+ /// Returns TerminatorInst's successor index for current case successor.
+ unsigned getSuccessorIndex() const {
+ assert((Index == DefaultPseudoIndex || Index < SI->getNumCases()) &&
+ "Index out the number of cases.");
+ return Index != DefaultPseudoIndex ? Index + 1 : 0;
+ }
+
+ Self operator++() {
+ // Check index correctness after increment.
+ // Note: Index == getNumCases() means end().
+      assert(Index+1 <= SI->getNumCases() && "Index out of the number of cases.");
+ ++Index;
+ if (Index == 0)
+ SubsetIt = SI->TheSubsets.begin();
+ else
+ ++SubsetIt;
+ return *this;
+ }
+ Self operator++(int) {
+ Self tmp = *this;
+ ++(*this);
+ return tmp;
+ }
+ Self operator--() {
+ // Check index correctness after decrement.
+ // Note: Index == getNumCases() means end().
+      // Also allow the "-1" iterator here. It will become valid after ++.
+ unsigned NumCases = SI->getNumCases();
+ assert((Index == 0 || Index-1 <= NumCases) &&
+ "Index out the number of cases.");
+ --Index;
+ if (Index == NumCases) {
+ SubsetIt = SI->TheSubsets.end();
+ return *this;
+ }
+
+ if (Index != -1UL)
+ --SubsetIt;
+
+ return *this;
+ }
+ Self operator--(int) {
+ Self tmp = *this;
+ --(*this);
+ return tmp;
+ }
+ bool operator==(const Self& RHS) const {
+ assert(RHS.SI == SI && "Incompatible operators.");
+ return RHS.Index == Index;
+ }
+ bool operator!=(const Self& RHS) const {
+ assert(RHS.SI == SI && "Incompatible operators.");
+ return RHS.Index != Index;
+ }
+ };
+
+ class CaseIt : public CaseIteratorT<SwitchInst, ConstantInt,
+ SubsetsIt, BasicBlock> {
+ typedef CaseIteratorT<SwitchInst, ConstantInt, SubsetsIt, BasicBlock>
+ ParentTy;
+
+ protected:
+ friend class SwitchInst;
+ CaseIt(SwitchInst *SI, unsigned CaseNum, SubsetsIt SubsetIt) :
+ ParentTy(SI, CaseNum, SubsetIt) {}
+
+ void updateCaseValueOperand(IntegersSubset& V) {
+ SI->setOperand(2 + Index*2, reinterpret_cast<Value*>((Constant*)V));
+ }
+
+ public:
+
+ CaseIt(SwitchInst *SI, unsigned CaseNum) : ParentTy(SI, CaseNum) {}
+
+ CaseIt(const ParentTy& Src) : ParentTy(Src) {}
+
+ /// Sets the new value for current case.
+ /// @Deprecated.
+ void setValue(ConstantInt *V) {
+      assert(Index < SI->getNumCases() && "Index out of the number of cases.");
+ IntegersSubsetToBB Mapping;
+ // FIXME: Currently we work with ConstantInt based cases.
+      // So initialize the IntItem container directly from ConstantInt.
+ Mapping.add(IntItem::fromConstantInt(V));
+ *SubsetIt = Mapping.getCase();
+ updateCaseValueOperand(*SubsetIt);
+ }
+
+ /// Sets the new value for current case.
+ void setValueEx(IntegersSubset& V) {
+      assert(Index < SI->getNumCases() && "Index out of the number of cases.");
+ *SubsetIt = V;
+ updateCaseValueOperand(*SubsetIt);
+ }
+
+ /// Sets the new successor for current case.
+ void setSuccessor(BasicBlock *S) {
+ SI->setSuccessor(getSuccessorIndex(), S);
+ }
+ };
// Methods for support type inquiry through isa, cast, and dyn_cast:
+
static inline bool classof(const SwitchInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Switch;
@@ -2905,6 +3029,11 @@ public:
/// removeAttribute - removes the attribute from the list of attributes.
void removeAttribute(unsigned i, Attributes attr);
+ /// \brief Return true if this call has the given attribute.
+ bool hasFnAttr(Attributes N) const {
+ return paramHasAttr(~0, N);
+ }
+
/// @brief Determine whether the call or the callee has the given attribute.
bool paramHasAttr(unsigned i, Attributes attr) const;
@@ -2914,7 +3043,7 @@ public:
}
/// @brief Return true if the call should not be inlined.
- bool isNoInline() const { return paramHasAttr(~0, Attribute::NoInline); }
+ bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
void setIsNoInline(bool Value = true) {
if (Value) addAttribute(~0, Attribute::NoInline);
else removeAttribute(~0, Attribute::NoInline);
@@ -2922,7 +3051,7 @@ public:
/// @brief Determine if the call does not access memory.
bool doesNotAccessMemory() const {
- return paramHasAttr(~0, Attribute::ReadNone);
+ return hasFnAttr(Attribute::ReadNone);
}
void setDoesNotAccessMemory(bool NotAccessMemory = true) {
if (NotAccessMemory) addAttribute(~0, Attribute::ReadNone);
@@ -2931,7 +3060,7 @@ public:
/// @brief Determine if the call does not access or only reads memory.
bool onlyReadsMemory() const {
- return doesNotAccessMemory() || paramHasAttr(~0, Attribute::ReadOnly);
+ return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
}
void setOnlyReadsMemory(bool OnlyReadsMemory = true) {
if (OnlyReadsMemory) addAttribute(~0, Attribute::ReadOnly);
@@ -2939,14 +3068,14 @@ public:
}
/// @brief Determine if the call cannot return.
- bool doesNotReturn() const { return paramHasAttr(~0, Attribute::NoReturn); }
+ bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
void setDoesNotReturn(bool DoesNotReturn = true) {
if (DoesNotReturn) addAttribute(~0, Attribute::NoReturn);
else removeAttribute(~0, Attribute::NoReturn);
}
/// @brief Determine if the call cannot unwind.
- bool doesNotThrow() const { return paramHasAttr(~0, Attribute::NoUnwind); }
+ bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
void setDoesNotThrow(bool DoesNotThrow = true) {
if (DoesNotThrow) addAttribute(~0, Attribute::NoUnwind);
else removeAttribute(~0, Attribute::NoUnwind);
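Both CallInst and InvokeInst gain the same wrapper, so the magic ~0 "function attributes" index now lives in one place. A hedged sketch (CI is an assumption):

    // Equivalent to the old paramHasAttr(~0, Attribute::NoUnwind):
    bool NoUnwind = CI->hasFnAttr(Attribute::NoUnwind);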
diff --git a/include/llvm/Intrinsics.h b/include/llvm/Intrinsics.h
index 3703825..c350388 100644
--- a/include/llvm/Intrinsics.h
+++ b/include/llvm/Intrinsics.h
@@ -74,6 +74,53 @@ namespace Intrinsic {
/// Map a GCC builtin name to an intrinsic ID.
ID getIntrinsicForGCCBuiltin(const char *Prefix, const char *BuiltinName);
+ /// IITDescriptor - This is a type descriptor which explains the type
+ /// requirements of an intrinsic. This is returned by
+ /// getIntrinsicInfoTableEntries.
+ struct IITDescriptor {
+ enum IITDescriptorKind {
+ Void, MMX, Metadata, Float, Double,
+ Integer, Vector, Pointer, Struct,
+ Argument, ExtendVecArgument, TruncVecArgument
+ } Kind;
+
+ union {
+ unsigned Integer_Width;
+ unsigned Float_Width;
+ unsigned Vector_Width;
+ unsigned Pointer_AddressSpace;
+ unsigned Struct_NumElements;
+ unsigned Argument_Info;
+ };
+
+ enum ArgKind {
+ AK_AnyInteger,
+ AK_AnyFloat,
+ AK_AnyVector,
+ AK_AnyPointer
+ };
+ unsigned getArgumentNumber() const {
+ assert(Kind == Argument || Kind == ExtendVecArgument ||
+ Kind == TruncVecArgument);
+ return Argument_Info >> 2;
+ }
+ ArgKind getArgumentKind() const {
+ assert(Kind == Argument || Kind == ExtendVecArgument ||
+ Kind == TruncVecArgument);
+ return (ArgKind)(Argument_Info&3);
+ }
+
+ static IITDescriptor get(IITDescriptorKind K, unsigned Field) {
+ IITDescriptor Result = { K, { Field } };
+ return Result;
+ }
+ };
+
+ /// getIntrinsicInfoTableEntries - Return the IIT table descriptor for the
+ /// specified intrinsic into an array of IITDescriptors.
+ ///
+ void getIntrinsicInfoTableEntries(ID id, SmallVectorImpl<IITDescriptor> &T);
+
} // End Intrinsic namespace
} // End llvm namespace
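A hedged sketch of decoding an intrinsic's type table with the new descriptor (ID is an assumption):

    SmallVector<Intrinsic::IITDescriptor, 8> Table;
    Intrinsic::getIntrinsicInfoTableEntries(ID, Table);
    for (unsigned i = 0, e = Table.size(); i != e; ++i) {
      const Intrinsic::IITDescriptor &D = Table[i];
      if (D.Kind == Intrinsic::IITDescriptor::Integer)
        (void)D.Integer_Width; // e.g. an i32 is Integer with width 32
    }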
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index 069f907..d1a0fee 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -55,6 +55,8 @@ class NoCapture<int argNo> : IntrinsicProperty {
int ArgNo = argNo;
}
+def IntrNoReturn : IntrinsicProperty;
+
//===----------------------------------------------------------------------===//
// Types used by intrinsics.
//===----------------------------------------------------------------------===//
@@ -63,11 +65,15 @@ class LLVMType<ValueType vt> {
ValueType VT = vt;
}
-class LLVMPointerType<LLVMType elty>
+class LLVMQualPointerType<LLVMType elty, int addrspace>
: LLVMType<iPTR>{
LLVMType ElTy = elty;
+ int AddrSpace = addrspace;
}
+class LLVMPointerType<LLVMType elty>
+ : LLVMQualPointerType<elty, 0>;
+
class LLVMAnyPointerType<LLVMType elty>
: LLVMType<iPTRAny>{
LLVMType ElTy = elty;
@@ -127,9 +133,12 @@ def llvm_v16i16_ty : LLVMType<v16i16>; // 16 x i16
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
+def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
def llvm_v4i64_ty : LLVMType<v4i64>; // 4 x i64
+def llvm_v8i64_ty : LLVMType<v8i64>; // 8 x i64
+def llvm_v16i64_ty : LLVMType<v16i64>; // 16 x i64
def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
@@ -155,10 +164,11 @@ def llvm_vararg_ty : LLVMType<isVoid>; // this means vararg here
// intrinsic.
// * Properties can be set to describe the behavior of the intrinsic.
//
+class SDPatternOperator;
class Intrinsic<list<LLVMType> ret_types,
list<LLVMType> param_types = [],
list<IntrinsicProperty> properties = [],
- string name = ""> {
+ string name = ""> : SDPatternOperator {
string LLVMName = name;
string TargetPrefix = ""; // Set to a prefix for target-specific intrinsics.
list<LLVMType> RetTypes = ret_types;
@@ -253,12 +263,18 @@ let Properties = [IntrReadMem] in {
def int_log2 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
def int_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
def int_exp2 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+ def int_fabs : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+ def int_floor : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
}
let Properties = [IntrNoMem] in {
def int_fma : Intrinsic<[llvm_anyfloat_ty],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>]>;
+
+ def int_fmuladd : Intrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>]>;
}
// NOTE: these are internal interfaces.
@@ -297,7 +313,7 @@ let Properties = [IntrNoMem] in {
let Properties = [IntrNoMem] in {
def int_dbg_declare : Intrinsic<[],
[llvm_metadata_ty, llvm_metadata_ty]>;
- def int_dbg_value : Intrinsic<[],
+ def int_dbg_value : Intrinsic<[],
[llvm_metadata_ty, llvm_i64_ty,
llvm_metadata_ty]>;
}
@@ -396,8 +412,13 @@ def int_invariant_end : Intrinsic<[],
//
def int_flt_rounds : Intrinsic<[llvm_i32_ty]>,
GCCBuiltin<"__builtin_flt_rounds">;
-def int_trap : Intrinsic<[]>,
+def int_trap : Intrinsic<[], [], [IntrNoReturn]>,
GCCBuiltin<"__builtin_trap">;
+def int_debugtrap : Intrinsic<[]>,
+ GCCBuiltin<"__builtin_debugtrap">;
+
+// NOP: calls/invokes to this intrinsic are removed by codegen
+def int_donothing : Intrinsic<[], [], [IntrNoMem]>;
// Intrinsics to support the half precision floating point format
let Properties = [IntrNoMem] in {
@@ -439,5 +460,6 @@ include "llvm/IntrinsicsX86.td"
include "llvm/IntrinsicsARM.td"
include "llvm/IntrinsicsCellSPU.td"
include "llvm/IntrinsicsXCore.td"
-include "llvm/IntrinsicsPTX.td"
include "llvm/IntrinsicsHexagon.td"
+include "llvm/IntrinsicsNVVM.td"
+include "llvm/IntrinsicsMips.td"
diff --git a/include/llvm/IntrinsicsHexagon.td b/include/llvm/IntrinsicsHexagon.td
index eb5dc8f..efd04f3 100644
--- a/include/llvm/IntrinsicsHexagon.td
+++ b/include/llvm/IntrinsicsHexagon.td
@@ -225,6 +225,22 @@ class Hexagon_qi_didi_Intrinsic<string GCCIntSuffix>
[llvm_i1_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
//
+// DEF_FUNCTION_TYPE_2(QI_ftype_SIDI,BT_BOOL,BT_INT,BT_LONGLONG) ->
+// Hexagon_qi_didi_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_qi_sidi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
+ [IntrNoMem]>;
+//
+// DEF_FUNCTION_TYPE_2(QI_ftype_DISI,BT_BOOL,BT_LONGLONG,BT_INT) ->
+// Hexagon_qi_disi_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_qi_disi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i1_ty], [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+//
// DEF_FUNCTION_TYPE_2(QI_ftype_QIQI,BT_BOOL,BT_BOOL,BT_BOOL) ->
// Hexagon_qi_qiqi_Intrinsic<string GCCIntSuffix>
//
@@ -406,3266 +422,4456 @@ class Hexagon_di_didisisi_Intrinsic<string GCCIntSuffix>
llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
+class Hexagon_mem_memmemsisi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
+
+//
+// Hexagon_sf_df_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_sf_si_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_float_ty], [llvm_i32_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_sf_df_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_sf_df_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_float_ty], [llvm_double_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_sf_di_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_sf_di_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_float_ty], [llvm_i64_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_df_sf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_df_sf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_double_ty], [llvm_float_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_di_sf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_di_sf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i64_ty], [llvm_float_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_sf_sf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_sf_sf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_float_ty], [llvm_float_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_si_sf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_si_sf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i32_ty], [llvm_float_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_si_df_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_si_df_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i32_ty], [llvm_double_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_sf_sfsf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_sf_sfsf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_qi_sfsf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_qi_sfsf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i1_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_qi_sfsi_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_qi_sfsi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i1_ty], [llvm_float_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_qi_sfqi_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_qi_sfqi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i1_ty], [llvm_float_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_sf_sfsfsf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_sf_sfsfsf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_float_ty], [llvm_float_ty, llvm_float_ty,
+ llvm_float_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_sf_sfsfsfqi_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_sf_sfsfsfqi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_float_ty], [llvm_float_ty, llvm_float_ty,
+ llvm_float_ty,
+ llvm_i32_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_di_dididi_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_di_dididisi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty,
+ llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_df_si_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_df_si_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_double_ty], [llvm_i32_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_df_di_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_df_di_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_double_ty], [llvm_i64_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_di_df_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_di_df_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i64_ty], [llvm_double_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_df_df_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_df_df_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_double_ty], [llvm_double_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_df_dfdf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_df_dfdf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_qi_dfdf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_qi_dfdf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i1_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_qi_dfsi_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_qi_dfsi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_i1_ty], [llvm_double_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+//
+//
+// Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_double_ty], [llvm_double_ty, llvm_double_ty,
+ llvm_double_ty],
+ [IntrNoMem]>;
+//
+// Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
+//
+class Hexagon_df_dfdfdfqi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+ [llvm_double_ty], [llvm_double_ty, llvm_double_ty,
+ llvm_double_ty,
+ llvm_i32_ty],
+ [IntrNoMem]>;
+
+
+// This one below will not be generated from iset.py.
+// So make sure you don't overwrite it.
+//
+// BUILTIN_INFO(SI_to_SXTHI_asrh,SI_ftype_SI,1)
+//
+def int_hexagon_SI_to_SXTHI_asrh :
+Hexagon_si_si_Intrinsic<"SI_to_SXTHI_asrh">;
+//
+// BUILTIN_INFO_NONCONST(circ_ldd,PTR_ftype_PTRPTRSISI,4)
+//
+def int_hexagon_circ_ldd :
+Hexagon_mem_memmemsisi_Intrinsic<"circ_ldd">;
+// This one above will not be generated from iset.py.
+// So make sure you don't overwrite it.
//
// BUILTIN_INFO(HEXAGON.C2_cmpeq,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpeq : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpeq">;
+def int_hexagon_C2_cmpeq :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpeq">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgt,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpgt : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpgt">;
+def int_hexagon_C2_cmpgt :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgt">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgtu,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpgtu : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpgtu">;
+def int_hexagon_C2_cmpgtu :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgtu">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpeqp,QI_ftype_DIDI,2)
//
-def int_hexagon_C2_cmpeqp : Hexagon_qi_didi_Intrinsic<"HEXAGON.C2.cmpeqp">;
+def int_hexagon_C2_cmpeqp :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_C2_cmpeqp">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgtp,QI_ftype_DIDI,2)
//
-def int_hexagon_C2_cmpgtp : Hexagon_qi_didi_Intrinsic<"HEXAGON.C2.cmpgtp">;
+def int_hexagon_C2_cmpgtp :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_C2_cmpgtp">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgtup,QI_ftype_DIDI,2)
//
-def int_hexagon_C2_cmpgtup : Hexagon_qi_didi_Intrinsic<"HEXAGON.C2.cmpgtup">;
+def int_hexagon_C2_cmpgtup :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_C2_cmpgtup">;
+//
+// BUILTIN_INFO(HEXAGON.A4_rcmpeqi,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_rcmpeqi :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpeqi">;
+//
+// BUILTIN_INFO(HEXAGON.A4_rcmpneqi,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_rcmpneqi :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpneqi">;
+//
+// BUILTIN_INFO(HEXAGON.A4_rcmpeq,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_rcmpeq :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpeq">;
+//
+// BUILTIN_INFO(HEXAGON.A4_rcmpneq,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_rcmpneq :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpneq">;
//
// BUILTIN_INFO(HEXAGON.C2_bitsset,QI_ftype_SISI,2)
//
-def int_hexagon_C2_bitsset : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.bitsset">;
+def int_hexagon_C2_bitsset :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_bitsset">;
//
// BUILTIN_INFO(HEXAGON.C2_bitsclr,QI_ftype_SISI,2)
//
-def int_hexagon_C2_bitsclr : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.bitsclr">;
+def int_hexagon_C2_bitsclr :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_bitsclr">;
+//
+// BUILTIN_INFO(HEXAGON.C4_nbitsset,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_nbitsset :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_nbitsset">;
+//
+// BUILTIN_INFO(HEXAGON.C4_nbitsclr,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_nbitsclr :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_nbitsclr">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpeqi,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpeqi : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpeqi">;
+def int_hexagon_C2_cmpeqi :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpeqi">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgti,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpgti : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpgti">;
+def int_hexagon_C2_cmpgti :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgti">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgtui,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpgtui : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpgtui">;
+def int_hexagon_C2_cmpgtui :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgtui">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgei,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpgei : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpgei">;
+def int_hexagon_C2_cmpgei :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgei">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpgeui,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpgeui : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpgeui">;
+def int_hexagon_C2_cmpgeui :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgeui">;
//
// BUILTIN_INFO(HEXAGON.C2_cmplt,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmplt : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmplt">;
+def int_hexagon_C2_cmplt :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmplt">;
//
// BUILTIN_INFO(HEXAGON.C2_cmpltu,QI_ftype_SISI,2)
//
-def int_hexagon_C2_cmpltu : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.cmpltu">;
+def int_hexagon_C2_cmpltu :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpltu">;
//
// BUILTIN_INFO(HEXAGON.C2_bitsclri,QI_ftype_SISI,2)
//
-def int_hexagon_C2_bitsclri : Hexagon_qi_sisi_Intrinsic<"HEXAGON.C2.bitsclri">;
+def int_hexagon_C2_bitsclri :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_bitsclri">;
+//
+// BUILTIN_INFO(HEXAGON.C4_nbitsclri,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_nbitsclri :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_nbitsclri">;
+//
+// BUILTIN_INFO(HEXAGON.C4_cmpneqi,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_cmpneqi :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmpneqi">;
+//
+// BUILTIN_INFO(HEXAGON.C4_cmpltei,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_cmpltei :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmpltei">;
+//
+// BUILTIN_INFO(HEXAGON.C4_cmplteui,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_cmplteui :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmplteui">;
+//
+// BUILTIN_INFO(HEXAGON.C4_cmpneq,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_cmpneq :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmpneq">;
+//
+// BUILTIN_INFO(HEXAGON.C4_cmplte,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_cmplte :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmplte">;
+//
+// BUILTIN_INFO(HEXAGON.C4_cmplteu,QI_ftype_SISI,2)
+//
+def int_hexagon_C4_cmplteu :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmplteu">;
//
// BUILTIN_INFO(HEXAGON.C2_and,QI_ftype_QIQI,2)
//
-def int_hexagon_C2_and : Hexagon_qi_qiqi_Intrinsic<"HEXAGON.C2.and">;
+def int_hexagon_C2_and :
+Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_and">;
//
// BUILTIN_INFO(HEXAGON.C2_or,QI_ftype_QIQI,2)
//
-def int_hexagon_C2_or : Hexagon_qi_qiqi_Intrinsic<"HEXAGON.C2.or">;
+def int_hexagon_C2_or :
+Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_or">;
//
// BUILTIN_INFO(HEXAGON.C2_xor,QI_ftype_QIQI,2)
//
-def int_hexagon_C2_xor : Hexagon_qi_qiqi_Intrinsic<"HEXAGON.C2.xor">;
+def int_hexagon_C2_xor :
+Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_xor">;
//
// BUILTIN_INFO(HEXAGON.C2_andn,QI_ftype_QIQI,2)
//
-def int_hexagon_C2_andn : Hexagon_qi_qiqi_Intrinsic<"HEXAGON.C2.andn">;
+def int_hexagon_C2_andn :
+Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_andn">;
//
// BUILTIN_INFO(HEXAGON.C2_not,QI_ftype_QI,1)
//
-def int_hexagon_C2_not : Hexagon_qi_qi_Intrinsic<"HEXAGON.C2.not">;
+def int_hexagon_C2_not :
+Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_not">;
//
// BUILTIN_INFO(HEXAGON.C2_orn,QI_ftype_QIQI,2)
//
-def int_hexagon_C2_orn : Hexagon_qi_qiqi_Intrinsic<"HEXAGON.C2.orn">;
+def int_hexagon_C2_orn :
+Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_orn">;
+//
+// BUILTIN_INFO(HEXAGON.C4_and_and,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_and_and :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_and">;
+//
+// BUILTIN_INFO(HEXAGON.C4_and_or,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_and_or :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_or">;
+//
+// BUILTIN_INFO(HEXAGON.C4_or_and,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_or_and :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_and">;
+//
+// BUILTIN_INFO(HEXAGON.C4_or_or,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_or_or :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_or">;
+//
+// BUILTIN_INFO(HEXAGON.C4_and_andn,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_and_andn :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_andn">;
+//
+// BUILTIN_INFO(HEXAGON.C4_and_orn,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_and_orn :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_orn">;
+//
+// BUILTIN_INFO(HEXAGON.C4_or_andn,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_or_andn :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_andn">;
+//
+// BUILTIN_INFO(HEXAGON.C4_or_orn,QI_ftype_QIQIQI,3)
+//
+def int_hexagon_C4_or_orn :
+Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_orn">;
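
The C4_and_and family added above is new in this patch: Hexagon V4 compound predicate operations that combine three predicate registers in one instruction (roughly Pd = and(Ps, and(Pt, Pu))). They reuse the same helper scheme with a three-predicate signature; a sketch under the same assumptions as before:

    // QI_ftype_QIQIQI: three i1 predicate inputs, one i1 result
    // (assumed definition, consistent with the two-operand classes).
    class Hexagon_qi_qiqiqi_Intrinsic<string GCCIntSuffix>
      : Hexagon_Intrinsic<GCCIntSuffix,
                          [llvm_i1_ty],
                          [llvm_i1_ty, llvm_i1_ty, llvm_i1_ty],
                          [IntrNoMem]>;
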
//
// BUILTIN_INFO(HEXAGON.C2_pxfer_map,QI_ftype_QI,1)
//
-def int_hexagon_C2_pxfer_map : Hexagon_qi_qi_Intrinsic<"HEXAGON.C2.pxfer.map">;
+def int_hexagon_C2_pxfer_map :
+Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_pxfer_map">;
//
// BUILTIN_INFO(HEXAGON.C2_any8,QI_ftype_QI,1)
//
-def int_hexagon_C2_any8 : Hexagon_qi_qi_Intrinsic<"HEXAGON.C2.any8">;
+def int_hexagon_C2_any8 :
+Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_any8">;
//
// BUILTIN_INFO(HEXAGON.C2_all8,QI_ftype_QI,1)
//
-def int_hexagon_C2_all8 : Hexagon_qi_qi_Intrinsic<"HEXAGON.C2.all8">;
+def int_hexagon_C2_all8 :
+Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_all8">;
//
// BUILTIN_INFO(HEXAGON.C2_vitpack,SI_ftype_QIQI,2)
//
-def int_hexagon_C2_vitpack : Hexagon_si_qiqi_Intrinsic<"HEXAGON.C2.vitpack">;
+def int_hexagon_C2_vitpack :
+Hexagon_si_qiqi_Intrinsic<"HEXAGON_C2_vitpack">;
//
// BUILTIN_INFO(HEXAGON.C2_mux,SI_ftype_QISISI,3)
//
-def int_hexagon_C2_mux : Hexagon_si_qisisi_Intrinsic<"HEXAGON.C2.mux">;
+def int_hexagon_C2_mux :
+Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_mux">;
//
// BUILTIN_INFO(HEXAGON.C2_muxii,SI_ftype_QISISI,3)
//
-def int_hexagon_C2_muxii : Hexagon_si_qisisi_Intrinsic<"HEXAGON.C2.muxii">;
+def int_hexagon_C2_muxii :
+Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxii">;
//
// BUILTIN_INFO(HEXAGON.C2_muxir,SI_ftype_QISISI,3)
//
-def int_hexagon_C2_muxir : Hexagon_si_qisisi_Intrinsic<"HEXAGON.C2.muxir">;
+def int_hexagon_C2_muxir :
+Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxir">;
//
// BUILTIN_INFO(HEXAGON.C2_muxri,SI_ftype_QISISI,3)
//
-def int_hexagon_C2_muxri : Hexagon_si_qisisi_Intrinsic<"HEXAGON.C2.muxri">;
+def int_hexagon_C2_muxri :
+Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxri">;
//
// BUILTIN_INFO(HEXAGON.C2_vmux,DI_ftype_QIDIDI,3)
//
-def int_hexagon_C2_vmux : Hexagon_di_qididi_Intrinsic<"HEXAGON.C2.vmux">;
+def int_hexagon_C2_vmux :
+Hexagon_di_qididi_Intrinsic<"HEXAGON_C2_vmux">;
//
// BUILTIN_INFO(HEXAGON.C2_mask,DI_ftype_QI,1)
//
-def int_hexagon_C2_mask : Hexagon_di_qi_Intrinsic<"HEXAGON.C2.mask">;
+def int_hexagon_C2_mask :
+Hexagon_di_qi_Intrinsic<"HEXAGON_C2_mask">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmpbeq,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmpbeq : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmpbeq">;
+def int_hexagon_A2_vcmpbeq :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpbeq">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpbeqi,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmpbeqi :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpbeqi">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpbeq_any,QI_ftype_DIDI,2)
+//
+def int_hexagon_A4_vcmpbeq_any :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A4_vcmpbeq_any">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmpbgtu,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmpbgtu : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmpbgtu">;
+def int_hexagon_A2_vcmpbgtu :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpbgtu">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpbgtui,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmpbgtui :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpbgtui">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpbgt,QI_ftype_DIDI,2)
+//
+def int_hexagon_A4_vcmpbgt :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A4_vcmpbgt">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpbgti,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmpbgti :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpbgti">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpbeq,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpbeq :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbeq">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpbeqi,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpbeqi :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbeqi">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpbgtu,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpbgtu :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgtu">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpbgtui,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpbgtui :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgtui">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpbgt,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpbgt :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgt">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpbgti,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpbgti :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgti">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmpheq,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmpheq : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmpheq">;
+def int_hexagon_A2_vcmpheq :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpheq">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmphgt,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmphgt : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmphgt">;
+def int_hexagon_A2_vcmphgt :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmphgt">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmphgtu,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmphgtu : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmphgtu">;
+def int_hexagon_A2_vcmphgtu :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmphgtu">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpheqi,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmpheqi :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpheqi">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmphgti,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmphgti :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmphgti">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmphgtui,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmphgtui :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmphgtui">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpheq,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpheq :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpheq">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmphgt,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmphgt :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgt">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmphgtu,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmphgtu :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgtu">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmpheqi,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmpheqi :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpheqi">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmphgti,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmphgti :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgti">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cmphgtui,QI_ftype_SISI,2)
+//
+def int_hexagon_A4_cmphgtui :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgtui">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmpweq,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmpweq : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmpweq">;
+def int_hexagon_A2_vcmpweq :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpweq">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmpwgt,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmpwgt : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmpwgt">;
+def int_hexagon_A2_vcmpwgt :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpwgt">;
//
// BUILTIN_INFO(HEXAGON.A2_vcmpwgtu,QI_ftype_DIDI,2)
//
-def int_hexagon_A2_vcmpwgtu : Hexagon_qi_didi_Intrinsic<"HEXAGON.A2.vcmpwgtu">;
+def int_hexagon_A2_vcmpwgtu :
+Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpwgtu">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpweqi,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmpweqi :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpweqi">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpwgti,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmpwgti :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpwgti">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vcmpwgtui,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_vcmpwgtui :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpwgtui">;
+//
+// BUILTIN_INFO(HEXAGON.A4_boundscheck,QI_ftype_SIDI,2)
+//
+def int_hexagon_A4_boundscheck :
+Hexagon_qi_sidi_Intrinsic<"HEXAGON_A4_boundscheck">;
+//
+// BUILTIN_INFO(HEXAGON.A4_tlbmatch,QI_ftype_DISI,2)
+//
+def int_hexagon_A4_tlbmatch :
+Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_tlbmatch">;
//
// BUILTIN_INFO(HEXAGON.C2_tfrpr,SI_ftype_QI,1)
//
-def int_hexagon_C2_tfrpr : Hexagon_si_qi_Intrinsic<"HEXAGON.C2.tfrpr">;
+def int_hexagon_C2_tfrpr :
+Hexagon_si_qi_Intrinsic<"HEXAGON_C2_tfrpr">;
//
// BUILTIN_INFO(HEXAGON.C2_tfrrp,QI_ftype_SI,1)
//
-def int_hexagon_C2_tfrrp : Hexagon_qi_si_Intrinsic<"HEXAGON.C2.tfrrp">;
+def int_hexagon_C2_tfrrp :
+Hexagon_qi_si_Intrinsic<"HEXAGON_C2_tfrrp">;
+//
+// BUILTIN_INFO(HEXAGON.C4_fastcorner9,QI_ftype_QIQI,2)
+//
+def int_hexagon_C4_fastcorner9 :
+Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9">;
+//
+// BUILTIN_INFO(HEXAGON.C4_fastcorner9_not,QI_ftype_QIQI,2)
+//
+def int_hexagon_C4_fastcorner9_not :
+Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9_not">;
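
The long M2 multiply block that follows applies the same renaming with wider types. For the accumulating forms (DI_ftype_DISISI: a 64-bit accumulator plus two 32-bit multiplicands), the assumed helper would look like:

    // DI_ftype_DISISI: i64 accumulator in, two i32 operands, i64 result
    // (assumed definition, following the pattern above).
    class Hexagon_di_disisi_Intrinsic<string GCCIntSuffix>
      : Hexagon_Intrinsic<GCCIntSuffix,
                          [llvm_i64_ty],
                          [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
                          [IntrNoMem]>;
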
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.hh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.hh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hl_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.hl.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hl_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.hl.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_lh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.lh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_lh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.lh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_ll_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.ll.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_ll_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.ll.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.hh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.hh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hl_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.hl.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hl_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.hl.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_lh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.lh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_lh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.lh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_ll_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.ll.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_ll_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.ll.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.hh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.hh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hl_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.hl.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hl_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.hl.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_lh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.lh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_lh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.lh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_ll_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.ll.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_ll_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_acc_sat_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.acc.sat.ll.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.hh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.hh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hl_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.hl.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hl_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.hl.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_lh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.lh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_lh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.lh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_ll_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.ll.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_ll_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpy_nac_sat_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpy.nac.sat.ll.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_hh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.hh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_hh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.hh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_hl_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.hl.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_hl_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.hl.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_lh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.lh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_lh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.lh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_ll_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.ll.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_ll_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.ll.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.hh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.hh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hl_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.hl.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hl_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.hl.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_lh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.lh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_lh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.lh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_ll_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.ll.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_ll_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.ll.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.hh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.hh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hl_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.hl.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hl_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.hl.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_lh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.lh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_lh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.lh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_ll_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.ll.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_ll_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_rnd_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.rnd.ll.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.hh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.hh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hl_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.hl.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hl_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.hl.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_lh_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.lh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_lh_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.lh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_ll_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.ll.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_ll_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_sat_rnd_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.sat.rnd.ll.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.hh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.hh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hl_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.hl.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hl_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.hl.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_lh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.lh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_lh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.lh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_ll_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.ll.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_ll_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_acc_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.acc.ll.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.hh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.hh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hl_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.hl.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hl_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.hl.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_lh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.lh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_lh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.lh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_ll_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.ll.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_ll_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyd_nac_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyd.nac.ll.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_hh_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_hh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.hh.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_hh_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_hh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.hh.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_hl_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_hl_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.hl.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_hl_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_hl_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.hl.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_lh_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_lh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.lh.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_lh_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_lh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.lh.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_ll_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_ll_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.ll.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_ll_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_ll_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.ll.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hh_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_hh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.hh.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hh_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_hh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.hh.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hl_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_hl_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.hl.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hl_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_hl_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.hl.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_lh_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_lh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.lh.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_lh_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_lh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.lh.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_ll_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_ll_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.ll.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_ll_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_mpyd_rnd_ll_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.mpyd.rnd.ll.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.hh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.hh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hl_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.hl.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hl_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.hl.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_lh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.lh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_lh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.lh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_ll_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.ll.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_ll_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_acc_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.acc.ll.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.hh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.hh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hl_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.hl.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hl_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.hl.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_lh_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.lh.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_lh_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.lh.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_ll_s0,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.ll.s0">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_ll_s1,SI_ftype_SISISI,3)
//
def int_hexagon_M2_mpyu_nac_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.mpyu.nac.ll.s1">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_hh_s0,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_hh_s0 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.hh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_hh_s1,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_hh_s1 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.hh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_hl_s0,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_hl_s0 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.hl.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_hl_s1,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_hl_s1 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.hl.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_lh_s0,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_lh_s0 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.lh.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_lh_s1,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_lh_s1 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.lh.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_ll_s0,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_ll_s0 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.ll.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_ll_s1,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_ll_s1 :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.ll.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_ll_s1">;
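
In the M2_mpyu_* block above, the unsigned-flavored helper classes (Hexagon_usi_sisi_Intrinsic here, Hexagon_udi_sisi_Intrinsic further down) are folded into their signed counterparts, while the USI_ftype/UDI_ftype BUILTIN_INFO comments are kept verbatim. That is consistent: LLVM IR integer types carry no sign, so i32/i64 cover both flavors, and signedness only matters in the C-side builtin prototype. A sketch of the surviving class, under the same assumptions as above:

    // Both USI_ftype_SISI and SI_ftype_SISI land here: an i32 result
    // from two i32 operands; the IR type is signless either way.
    class Hexagon_si_sisi_Intrinsic<string GCCIntSuffix>
      : Hexagon_Intrinsic<GCCIntSuffix,
                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
                          [IntrNoMem]>;
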
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.hh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.hh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hl_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.hl.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hl_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.hl.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_lh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.lh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_lh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.lh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_ll_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.ll.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_ll_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_acc_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.acc.ll.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.hh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.hh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hl_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.hl.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hl_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.hl.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_lh_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.lh.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_lh_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.lh.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_ll_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.ll.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_ll_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_mpyud_nac_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.mpyud.nac.ll.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_hh_s0,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_hh_s0 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.hh.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_hh_s1,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_hh_s1 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.hh.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_hl_s0,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_hl_s0 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.hl.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_hl_s1,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_hl_s1 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.hl.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_lh_s0,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_lh_s0 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.lh.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_lh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_lh_s1,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_lh_s1 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.lh.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_lh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_ll_s0,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_ll_s0 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.ll.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_ll_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyud_ll_s1,UDI_ftype_SISI,2)
//
def int_hexagon_M2_mpyud_ll_s1 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.mpyud.ll.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_ll_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mpysmi,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpysmi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpysmi">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpysmi">;
//
// BUILTIN_INFO(HEXAGON.M2_macsip,SI_ftype_SISISI,3)
//
def int_hexagon_M2_macsip :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.macsip">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_macsip">;
//
// BUILTIN_INFO(HEXAGON.M2_macsin,SI_ftype_SISISI,3)
//
def int_hexagon_M2_macsin :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.macsin">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_macsin">;
//
// BUILTIN_INFO(HEXAGON.M2_dpmpyss_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_dpmpyss_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.dpmpyss.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_dpmpyss_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_dpmpyss_acc_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_dpmpyss_acc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.dpmpyss.acc.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyss_acc_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_dpmpyss_nac_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_dpmpyss_nac_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.dpmpyss.nac.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyss_nac_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_s0,UDI_ftype_SISI,2)
//
def int_hexagon_M2_dpmpyuu_s0 :
-Hexagon_udi_sisi_Intrinsic<"HEXAGON.M2.dpmpyuu.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_dpmpyuu_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_acc_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_dpmpyuu_acc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.dpmpyuu.acc.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyuu_acc_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_nac_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_dpmpyuu_nac_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.dpmpyuu.nac.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyuu_nac_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mpy_up,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpy_up :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpy.up">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up">;
+//
+// BUILTIN_INFO(HEXAGON.M2_mpy_up_s1,SI_ftype_SISI,2)
+//
+def int_hexagon_M2_mpy_up_s1 :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M2_mpy_up_s1_sat,SI_ftype_SISI,2)
+//
+def int_hexagon_M2_mpy_up_s1_sat :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up_s1_sat">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyu_up,USI_ftype_SISI,2)
//
def int_hexagon_M2_mpyu_up :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.M2.mpyu.up">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_up">;
+//
+// BUILTIN_INFO(HEXAGON.M2_mpysu_up,SI_ftype_SISI,2)
+//
+def int_hexagon_M2_mpysu_up :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpysu_up">;
//
// BUILTIN_INFO(HEXAGON.M2_dpmpyss_rnd_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_dpmpyss_rnd_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.dpmpyss.rnd.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_dpmpyss_rnd_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M4_mac_up_s1_sat,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_mac_up_s1_sat :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mac_up_s1_sat">;
+//
+// BUILTIN_INFO(HEXAGON.M4_nac_up_s1_sat,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_nac_up_s1_sat :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_nac_up_s1_sat">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyi,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpyi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpyi">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyi">;
//
// BUILTIN_INFO(HEXAGON.M2_mpyui,SI_ftype_SISI,2)
//
def int_hexagon_M2_mpyui :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.mpyui">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyui">;
//
// BUILTIN_INFO(HEXAGON.M2_maci,SI_ftype_SISISI,3)
//
def int_hexagon_M2_maci :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.maci">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_maci">;
//
// BUILTIN_INFO(HEXAGON.M2_acci,SI_ftype_SISISI,3)
//
def int_hexagon_M2_acci :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.acci">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_acci">;
//
// BUILTIN_INFO(HEXAGON.M2_accii,SI_ftype_SISISI,3)
//
def int_hexagon_M2_accii :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.accii">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_accii">;
//
// BUILTIN_INFO(HEXAGON.M2_nacci,SI_ftype_SISISI,3)
//
def int_hexagon_M2_nacci :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.nacci">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_nacci">;
//
// BUILTIN_INFO(HEXAGON.M2_naccii,SI_ftype_SISISI,3)
//
def int_hexagon_M2_naccii :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.naccii">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_naccii">;
//
// BUILTIN_INFO(HEXAGON.M2_subacc,SI_ftype_SISISI,3)
//
def int_hexagon_M2_subacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.subacc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_subacc">;
+//
+// BUILTIN_INFO(HEXAGON.M4_mpyrr_addr,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_mpyrr_addr :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyrr_addr">;
+//
+// BUILTIN_INFO(HEXAGON.M4_mpyri_addr_u2,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_mpyri_addr_u2 :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addr_u2">;
+//
+// BUILTIN_INFO(HEXAGON.M4_mpyri_addr,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_mpyri_addr :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addr">;
+//
+// BUILTIN_INFO(HEXAGON.M4_mpyri_addi,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_mpyri_addi :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addi">;
+//
+// BUILTIN_INFO(HEXAGON.M4_mpyrr_addi,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_mpyrr_addi :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyrr_addi">;
//
// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_vmpy2s_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.vmpy2s.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_vmpy2s_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.vmpy2s.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vmac2s_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_vmac2s_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.vmac2s.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2s_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vmac2s_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_vmac2s_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.vmac2s.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2s_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M2_vmpy2su_s0,DI_ftype_SISI,2)
+//
+def int_hexagon_M2_vmpy2su_s0 :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2su_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M2_vmpy2su_s1,DI_ftype_SISI,2)
+//
+def int_hexagon_M2_vmpy2su_s1 :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2su_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M2_vmac2su_s0,DI_ftype_DISISI,3)
+//
+def int_hexagon_M2_vmac2su_s0 :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2su_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M2_vmac2su_s1,DI_ftype_DISISI,3)
+//
+def int_hexagon_M2_vmac2su_s1 :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2su_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s0pack,SI_ftype_SISI,2)
//
def int_hexagon_M2_vmpy2s_s0pack :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.vmpy2s.s0pack">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s0pack">;
//
// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s1pack,SI_ftype_SISI,2)
//
def int_hexagon_M2_vmpy2s_s1pack :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.vmpy2s.s1pack">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s1pack">;
//
// BUILTIN_INFO(HEXAGON.M2_vmac2,DI_ftype_DISISI,3)
//
def int_hexagon_M2_vmac2 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.vmac2">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2">;
//
// BUILTIN_INFO(HEXAGON.M2_vmpy2es_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vmpy2es_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vmpy2es.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vmpy2es_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vmpy2es_s1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vmpy2es_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vmpy2es.s1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vmpy2es_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vmac2es_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vmac2es_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vmac2es.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vmac2es_s1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vmac2es_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vmac2es.s1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vmac2es,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vmac2es :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vmac2es">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es">;
//
// BUILTIN_INFO(HEXAGON.M2_vrmac_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vrmac_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vrmac.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrmac_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vrmpy_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vrmpy_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vrmpy.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrmpy_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vdmpyrs_s0,SI_ftype_DIDI,2)
//
def int_hexagon_M2_vdmpyrs_s0 :
-Hexagon_si_didi_Intrinsic<"HEXAGON.M2.vdmpyrs.s0">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vdmpyrs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vdmpyrs_s1,SI_ftype_DIDI,2)
//
def int_hexagon_M2_vdmpyrs_s1 :
-Hexagon_si_didi_Intrinsic<"HEXAGON.M2.vdmpyrs.s1">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vdmpyrs_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vrmpybuu,DI_ftype_DIDI,2)
+//
+def int_hexagon_M5_vrmpybuu :
+Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vrmpybuu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vrmacbuu,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M5_vrmacbuu :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vrmacbuu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vrmpybsu,DI_ftype_DIDI,2)
+//
+def int_hexagon_M5_vrmpybsu :
+Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vrmpybsu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vrmacbsu,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M5_vrmacbsu :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vrmacbsu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vmpybuu,DI_ftype_SISI,2)
+//
+def int_hexagon_M5_vmpybuu :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M5_vmpybuu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vmpybsu,DI_ftype_SISI,2)
+//
+def int_hexagon_M5_vmpybsu :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M5_vmpybsu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vmacbuu,DI_ftype_DISISI,3)
+//
+def int_hexagon_M5_vmacbuu :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M5_vmacbuu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vmacbsu,DI_ftype_DISISI,3)
+//
+def int_hexagon_M5_vmacbsu :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M5_vmacbsu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vdmpybsu,DI_ftype_DIDI,2)
+//
+def int_hexagon_M5_vdmpybsu :
+Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vdmpybsu">;
+//
+// BUILTIN_INFO(HEXAGON.M5_vdmacbsu,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M5_vdmacbsu :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vdmacbsu">;
//
// BUILTIN_INFO(HEXAGON.M2_vdmacs_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vdmacs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vdmacs.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vdmacs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vdmacs_s1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vdmacs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vdmacs.s1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vdmacs_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vdmpys_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vdmpys_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vdmpys.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vdmpys_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vdmpys_s1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vdmpys_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vdmpys.s1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vdmpys_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpyrs_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_cmpyrs_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.cmpyrs.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpyrs_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_cmpyrs_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.cmpyrs.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrs_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpyrsc_s0,SI_ftype_SISI,2)
//
def int_hexagon_M2_cmpyrsc_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.cmpyrsc.s0">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrsc_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpyrsc_s1,SI_ftype_SISI,2)
//
def int_hexagon_M2_cmpyrsc_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.cmpyrsc.s1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrsc_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cmacs_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cmacs_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cmacs.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmacs_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cmacs_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cmacs.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacs_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cmacsc_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cmacsc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cmacsc.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacsc_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmacsc_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cmacsc_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cmacsc.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacsc_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpys_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_cmpys_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.cmpys.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpys_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpys_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_cmpys_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.cmpys.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpys_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpysc_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_cmpysc_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.cmpysc.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpysc_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpysc_s1,DI_ftype_SISI,2)
//
def int_hexagon_M2_cmpysc_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.cmpysc.s1">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpysc_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cnacs_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cnacs_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cnacs.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cnacs_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cnacs_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cnacs.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacs_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_cnacsc_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cnacsc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cnacsc.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacsc_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cnacsc_s1,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cnacsc_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cnacsc.s1">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacsc_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmpys_s1,DI_ftype_DISI,2)
//
def int_hexagon_M2_vrcmpys_s1 :
-Hexagon_di_disi_Intrinsic<"HEXAGON.M2.vrcmpys.s1">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_M2_vrcmpys_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmpys_acc_s1,DI_ftype_DIDISI,3)
//
def int_hexagon_M2_vrcmpys_acc_s1 :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.M2.vrcmpys.acc.s1">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_M2_vrcmpys_acc_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmpys_s1rp,SI_ftype_DISI,2)
//
def int_hexagon_M2_vrcmpys_s1rp :
-Hexagon_si_disi_Intrinsic<"HEXAGON.M2.vrcmpys.s1rp">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_M2_vrcmpys_s1rp">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacls_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacls_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacls.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacls_s1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacls_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacls.s1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmachs_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmachs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmachs.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmachs_s1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmachs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmachs.s1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyl_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyl_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyl.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyl_s1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyl_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyl.s1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyh_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyh_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyh.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyh_s1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyh_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyh.s1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacls_rs0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacls_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacls.rs0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacls_rs1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacls_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacls.rs1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmachs_rs0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmachs_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmachs.rs0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmachs_rs1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmachs_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmachs.rs1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyl_rs0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyl_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyl.rs0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyl_rs1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyl_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyl.rs1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyh_rs0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyh_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyh.rs0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyh_rs1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyh_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyh.rs1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_rs1">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_s0,DI_ftype_DIDI,2)
+//
+def int_hexagon_M4_vrmpyeh_s0 :
+Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyeh_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_s1,DI_ftype_DIDI,2)
+//
+def int_hexagon_M4_vrmpyeh_s1 :
+Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyeh_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_acc_s0,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M4_vrmpyeh_acc_s0 :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_acc_s1,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M4_vrmpyeh_acc_s1 :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_s0,DI_ftype_DIDI,2)
+//
+def int_hexagon_M4_vrmpyoh_s0 :
+Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyoh_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_s1,DI_ftype_DIDI,2)
+//
+def int_hexagon_M4_vrmpyoh_s1 :
+Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyoh_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_acc_s0,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M4_vrmpyoh_acc_s0 :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_acc_s1,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M4_vrmpyoh_acc_s1 :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_hmmpyl_rs1,SI_ftype_SISI,2)
//
def int_hexagon_M2_hmmpyl_rs1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.hmmpyl.rs1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyl_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_hmmpyh_rs1,SI_ftype_SISI,2)
//
def int_hexagon_M2_hmmpyh_rs1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.M2.hmmpyh.rs1">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyh_rs1">;
+//
+// BUILTIN_INFO(HEXAGON.M2_hmmpyl_s1,SI_ftype_SISI,2)
+//
+def int_hexagon_M2_hmmpyl_s1 :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyl_s1">;
+//
+// BUILTIN_INFO(HEXAGON.M2_hmmpyh_s1,SI_ftype_SISI,2)
+//
+def int_hexagon_M2_hmmpyh_s1 :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmaculs_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmaculs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmaculs.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmaculs_s1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmaculs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmaculs.s1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacuhs_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacuhs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacuhs.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacuhs_s1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacuhs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacuhs.s1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyul_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyul_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyul.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyul_s1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyul_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyul.s1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyuh_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyuh_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyuh.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyuh_s1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyuh_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyuh.s1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_s1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmaculs_rs0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmaculs_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmaculs.rs0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmaculs_rs1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmaculs_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmaculs.rs1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacuhs_rs0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacuhs_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacuhs.rs0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmacuhs_rs1,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_mmacuhs_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.mmacuhs.rs1">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyul_rs0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyul_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyul.rs0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyul_rs1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyul_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyul.rs1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyuh_rs0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyuh_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyuh.rs0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_rs0">;
//
// BUILTIN_INFO(HEXAGON.M2_mmpyuh_rs1,DI_ftype_DIDI,2)
//
def int_hexagon_M2_mmpyuh_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.mmpyuh.rs1">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_rs1">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmaci_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vrcmaci_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vrcmaci.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmaci_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmacr_s0,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vrcmacr_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vrcmacr.s0">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmacr_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmaci_s0c,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vrcmaci_s0c :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vrcmaci.s0c">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmaci_s0c">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmacr_s0c,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vrcmacr_s0c :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vrcmacr.s0c">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmacr_s0c">;
//
// BUILTIN_INFO(HEXAGON.M2_cmaci_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cmaci_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cmaci.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmaci_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmacr_s0,DI_ftype_DISISI,3)
//
def int_hexagon_M2_cmacr_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.M2.cmacr.s0">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacr_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmpyi_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vrcmpyi_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vrcmpyi.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyi_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmpyr_s0,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vrcmpyr_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vrcmpyr.s0">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyr_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmpyi_s0c,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vrcmpyi_s0c :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vrcmpyi.s0c">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyi_s0c">;
//
// BUILTIN_INFO(HEXAGON.M2_vrcmpyr_s0c,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vrcmpyr_s0c :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vrcmpyr.s0c">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyr_s0c">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpyi_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_cmpyi_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.cmpyi.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpyi_s0">;
//
// BUILTIN_INFO(HEXAGON.M2_cmpyr_s0,DI_ftype_SISI,2)
//
def int_hexagon_M2_cmpyr_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.M2.cmpyr.s0">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpyr_s0">;
+//
+// BUILTIN_INFO(HEXAGON.M4_cmpyi_wh,SI_ftype_DISI,2)
+//
+def int_hexagon_M4_cmpyi_wh :
+Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyi_wh">;
+//
+// BUILTIN_INFO(HEXAGON.M4_cmpyr_wh,SI_ftype_DISI,2)
+//
+def int_hexagon_M4_cmpyr_wh :
+Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyr_wh">;
+//
+// BUILTIN_INFO(HEXAGON.M4_cmpyi_whc,SI_ftype_DISI,2)
+//
+def int_hexagon_M4_cmpyi_whc :
+Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyi_whc">;
+//
+// BUILTIN_INFO(HEXAGON.M4_cmpyr_whc,SI_ftype_DISI,2)
+//
+def int_hexagon_M4_cmpyr_whc :
+Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyr_whc">;
//
// BUILTIN_INFO(HEXAGON.M2_vcmpy_s0_sat_i,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vcmpy_s0_sat_i :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vcmpy.s0.sat.i">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_i">;
//
// BUILTIN_INFO(HEXAGON.M2_vcmpy_s0_sat_r,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vcmpy_s0_sat_r :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vcmpy.s0.sat.r">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_r">;
//
// BUILTIN_INFO(HEXAGON.M2_vcmpy_s1_sat_i,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vcmpy_s1_sat_i :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vcmpy.s1.sat.i">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_i">;
//
// BUILTIN_INFO(HEXAGON.M2_vcmpy_s1_sat_r,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vcmpy_s1_sat_r :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vcmpy.s1.sat.r">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_r">;
//
// BUILTIN_INFO(HEXAGON.M2_vcmac_s0_sat_i,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vcmac_s0_sat_i :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vcmac.s0.sat.i">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_i">;
//
// BUILTIN_INFO(HEXAGON.M2_vcmac_s0_sat_r,DI_ftype_DIDIDI,3)
//
def int_hexagon_M2_vcmac_s0_sat_r :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M2.vcmac.s0.sat.r">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_r">;
//
// BUILTIN_INFO(HEXAGON.S2_vcrotate,DI_ftype_DISI,2)
//
def int_hexagon_S2_vcrotate :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.vcrotate">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_vcrotate">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vrcrotate_acc,DI_ftype_DIDISISI,4)
+//
+def int_hexagon_S4_vrcrotate_acc :
+Hexagon_di_didisisi_Intrinsic<"HEXAGON_S4_vrcrotate_acc">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vrcrotate,DI_ftype_DISISI,3)
+//
+def int_hexagon_S4_vrcrotate :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_S4_vrcrotate">;
+//
+// BUILTIN_INFO(HEXAGON.S2_vcnegh,DI_ftype_DISI,2)
+//
+def int_hexagon_S2_vcnegh :
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_vcnegh">;
+//
+// BUILTIN_INFO(HEXAGON.S2_vrcnegh,DI_ftype_DIDISI,3)
+//
+def int_hexagon_S2_vrcnegh :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_vrcnegh">;
+//
+// BUILTIN_INFO(HEXAGON.M4_pmpyw,DI_ftype_SISI,2)
+//
+def int_hexagon_M4_pmpyw :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M4_pmpyw">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vpmpyh,DI_ftype_SISI,2)
+//
+def int_hexagon_M4_vpmpyh :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_M4_vpmpyh">;
+//
+// BUILTIN_INFO(HEXAGON.M4_pmpyw_acc,DI_ftype_DISISI,3)
+//
+def int_hexagon_M4_pmpyw_acc :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M4_pmpyw_acc">;
+//
+// BUILTIN_INFO(HEXAGON.M4_vpmpyh_acc,DI_ftype_DISISI,3)
+//
+def int_hexagon_M4_vpmpyh_acc :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_M4_vpmpyh_acc">;
//
// BUILTIN_INFO(HEXAGON.A2_add,SI_ftype_SISI,2)
//
def int_hexagon_A2_add :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.add">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_add">;
//
// BUILTIN_INFO(HEXAGON.A2_sub,SI_ftype_SISI,2)
//
def int_hexagon_A2_sub :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.sub">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_sub">;
//
// BUILTIN_INFO(HEXAGON.A2_addsat,SI_ftype_SISI,2)
//
def int_hexagon_A2_addsat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addsat">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addsat">;
//
// BUILTIN_INFO(HEXAGON.A2_subsat,SI_ftype_SISI,2)
//
def int_hexagon_A2_subsat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subsat">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subsat">;
//
// BUILTIN_INFO(HEXAGON.A2_addi,SI_ftype_SISI,2)
//
def int_hexagon_A2_addi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addi">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addi">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_l16_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_l16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_l16_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_l16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.hl">;
-def int_hexagon_A2_addh_l16_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.lh">;
-def int_hexagon_A2_addh_l16_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.hh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_l16_sat_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_l16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.sat.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_sat_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_l16_sat_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_l16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.sat.hl">;
-def int_hexagon_A2_addh_l16_sat_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.sat.lh">;
-def int_hexagon_A2_addh_l16_sat_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.l16.sat.hh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_sat_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_l16_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_l16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.l16.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_l16_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_l16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.l16.hl">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_l16_sat_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_l16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.l16.sat.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_sat_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_l16_sat_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_l16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.l16.sat.hl">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_sat_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_lh,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.lh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_lh">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.hl">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_hh,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.hh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_hh">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.sat.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_lh,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_sat_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.sat.lh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_lh">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.sat.hl">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_hh,SI_ftype_SISI,2)
//
def int_hexagon_A2_addh_h16_sat_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.addh.h16.sat.hh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_hh">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_lh,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.lh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_lh">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.hl">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_hh,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.hh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_hh">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.sat.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_lh,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_sat_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.sat.lh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_lh">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.sat.hl">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_hh,SI_ftype_SISI,2)
//
def int_hexagon_A2_subh_h16_sat_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subh.h16.sat.hh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_hh">;
//
// BUILTIN_INFO(HEXAGON.A2_aslh,SI_ftype_SI,1)
//
def int_hexagon_A2_aslh :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.aslh">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_aslh">;
//
// BUILTIN_INFO(HEXAGON.A2_asrh,SI_ftype_SI,1)
//
def int_hexagon_A2_asrh :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.asrh">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_asrh">;
//
// BUILTIN_INFO(HEXAGON.A2_addp,DI_ftype_DIDI,2)
//
def int_hexagon_A2_addp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.addp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_addp">;
//
// BUILTIN_INFO(HEXAGON.A2_addpsat,DI_ftype_DIDI,2)
//
def int_hexagon_A2_addpsat :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.addpsat">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_addpsat">;
//
// BUILTIN_INFO(HEXAGON.A2_addsp,DI_ftype_SIDI,2)
//
def int_hexagon_A2_addsp :
-Hexagon_di_sidi_Intrinsic<"HEXAGON.A2.addsp">;
+Hexagon_di_sidi_Intrinsic<"HEXAGON_A2_addsp">;
//
// BUILTIN_INFO(HEXAGON.A2_subp,DI_ftype_DIDI,2)
//
def int_hexagon_A2_subp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.subp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_subp">;
//
// BUILTIN_INFO(HEXAGON.A2_neg,SI_ftype_SI,1)
//
def int_hexagon_A2_neg :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.neg">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_neg">;
//
// BUILTIN_INFO(HEXAGON.A2_negsat,SI_ftype_SI,1)
//
def int_hexagon_A2_negsat :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.negsat">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_negsat">;
//
// BUILTIN_INFO(HEXAGON.A2_abs,SI_ftype_SI,1)
//
def int_hexagon_A2_abs :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.abs">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_abs">;
//
// BUILTIN_INFO(HEXAGON.A2_abssat,SI_ftype_SI,1)
//
def int_hexagon_A2_abssat :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.abssat">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_abssat">;
//
// BUILTIN_INFO(HEXAGON.A2_vconj,DI_ftype_DI,1)
//
def int_hexagon_A2_vconj :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.vconj">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_vconj">;
//
// BUILTIN_INFO(HEXAGON.A2_negp,DI_ftype_DI,1)
//
def int_hexagon_A2_negp :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.negp">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_negp">;
//
// BUILTIN_INFO(HEXAGON.A2_absp,DI_ftype_DI,1)
//
def int_hexagon_A2_absp :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.absp">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_absp">;
//
// BUILTIN_INFO(HEXAGON.A2_max,SI_ftype_SISI,2)
//
def int_hexagon_A2_max :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.max">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_max">;
//
// BUILTIN_INFO(HEXAGON.A2_maxu,USI_ftype_SISI,2)
//
def int_hexagon_A2_maxu :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.A2.maxu">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_maxu">;
//
// BUILTIN_INFO(HEXAGON.A2_min,SI_ftype_SISI,2)
//
def int_hexagon_A2_min :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.min">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_min">;
//
// BUILTIN_INFO(HEXAGON.A2_minu,USI_ftype_SISI,2)
//
def int_hexagon_A2_minu :
-Hexagon_usi_sisi_Intrinsic<"HEXAGON.A2.minu">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_minu">;
//
// BUILTIN_INFO(HEXAGON.A2_maxp,DI_ftype_DIDI,2)
//
def int_hexagon_A2_maxp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.maxp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_maxp">;
//
// BUILTIN_INFO(HEXAGON.A2_maxup,UDI_ftype_DIDI,2)
//
def int_hexagon_A2_maxup :
-Hexagon_udi_didi_Intrinsic<"HEXAGON.A2.maxup">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_maxup">;
//
// BUILTIN_INFO(HEXAGON.A2_minp,DI_ftype_DIDI,2)
//
def int_hexagon_A2_minp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.minp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_minp">;
//
// BUILTIN_INFO(HEXAGON.A2_minup,UDI_ftype_DIDI,2)
//
def int_hexagon_A2_minup :
-Hexagon_udi_didi_Intrinsic<"HEXAGON.A2.minup">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_minup">;
//
// BUILTIN_INFO(HEXAGON.A2_tfr,SI_ftype_SI,1)
//
def int_hexagon_A2_tfr :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.tfr">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_tfr">;
//
// BUILTIN_INFO(HEXAGON.A2_tfrsi,SI_ftype_SI,1)
//
def int_hexagon_A2_tfrsi :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.tfrsi">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_tfrsi">;
//
// BUILTIN_INFO(HEXAGON.A2_tfrp,DI_ftype_DI,1)
//
def int_hexagon_A2_tfrp :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.tfrp">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_tfrp">;
//
// BUILTIN_INFO(HEXAGON.A2_tfrpi,DI_ftype_SI,1)
//
def int_hexagon_A2_tfrpi :
-Hexagon_di_si_Intrinsic<"HEXAGON.A2.tfrpi">;
+Hexagon_di_si_Intrinsic<"HEXAGON_A2_tfrpi">;
//
// BUILTIN_INFO(HEXAGON.A2_zxtb,SI_ftype_SI,1)
//
def int_hexagon_A2_zxtb :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.zxtb">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_zxtb">;
//
// BUILTIN_INFO(HEXAGON.A2_sxtb,SI_ftype_SI,1)
//
def int_hexagon_A2_sxtb :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.sxtb">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_sxtb">;
//
// BUILTIN_INFO(HEXAGON.A2_zxth,SI_ftype_SI,1)
//
def int_hexagon_A2_zxth :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.zxth">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_zxth">;
//
// BUILTIN_INFO(HEXAGON.A2_sxth,SI_ftype_SI,1)
//
def int_hexagon_A2_sxth :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.sxth">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_sxth">;
//
// BUILTIN_INFO(HEXAGON.A2_combinew,DI_ftype_SISI,2)
//
def int_hexagon_A2_combinew :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.A2.combinew">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_A2_combinew">;
+//
+// BUILTIN_INFO(HEXAGON.A4_combineri,DI_ftype_SISI,2)
+//
+def int_hexagon_A4_combineri :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_combineri">;
+//
+// BUILTIN_INFO(HEXAGON.A4_combineir,DI_ftype_SISI,2)
+//
+def int_hexagon_A4_combineir :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_combineir">;
//
// BUILTIN_INFO(HEXAGON.A2_combineii,DI_ftype_SISI,2)
//
def int_hexagon_A2_combineii :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.A2.combineii">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_A2_combineii">;
//
// BUILTIN_INFO(HEXAGON.A2_combine_hh,SI_ftype_SISI,2)
//
def int_hexagon_A2_combine_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.combine.hh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_hh">;
//
// BUILTIN_INFO(HEXAGON.A2_combine_hl,SI_ftype_SISI,2)
//
def int_hexagon_A2_combine_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.combine.hl">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_hl">;
//
// BUILTIN_INFO(HEXAGON.A2_combine_lh,SI_ftype_SISI,2)
//
def int_hexagon_A2_combine_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.combine.lh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_lh">;
//
// BUILTIN_INFO(HEXAGON.A2_combine_ll,SI_ftype_SISI,2)
//
def int_hexagon_A2_combine_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.combine.ll">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_ll">;
//
// BUILTIN_INFO(HEXAGON.A2_tfril,SI_ftype_SISI,2)
//
def int_hexagon_A2_tfril :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.tfril">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_tfril">;
//
// BUILTIN_INFO(HEXAGON.A2_tfrih,SI_ftype_SISI,2)
//
def int_hexagon_A2_tfrih :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.tfrih">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_tfrih">;
//
// BUILTIN_INFO(HEXAGON.A2_and,SI_ftype_SISI,2)
//
def int_hexagon_A2_and :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.and">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_and">;
//
// BUILTIN_INFO(HEXAGON.A2_or,SI_ftype_SISI,2)
//
def int_hexagon_A2_or :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.or">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_or">;
//
// BUILTIN_INFO(HEXAGON.A2_xor,SI_ftype_SISI,2)
//
def int_hexagon_A2_xor :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.xor">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_xor">;
//
// BUILTIN_INFO(HEXAGON.A2_not,SI_ftype_SI,1)
//
def int_hexagon_A2_not :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.not">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_not">;
//
// BUILTIN_INFO(HEXAGON.M2_xor_xacc,SI_ftype_SISISI,3)
//
def int_hexagon_M2_xor_xacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M2.xor.xacc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_xor_xacc">;
+//
+// BUILTIN_INFO(HEXAGON.M4_xor_xacc,DI_ftype_DIDIDI,3)
+//
+def int_hexagon_M4_xor_xacc :
+Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_xor_xacc">;
+//
+// BUILTIN_INFO(HEXAGON.A4_andn,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_andn :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_andn">;
+//
+// BUILTIN_INFO(HEXAGON.A4_orn,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_orn :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_orn">;
+//
+// BUILTIN_INFO(HEXAGON.A4_andnp,DI_ftype_DIDI,2)
+//
+def int_hexagon_A4_andnp :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A4_andnp">;
+//
+// BUILTIN_INFO(HEXAGON.A4_ornp,DI_ftype_DIDI,2)
+//
+def int_hexagon_A4_ornp :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A4_ornp">;
+//
+// BUILTIN_INFO(HEXAGON.S4_addaddi,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_addaddi :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addaddi">;
+//
+// BUILTIN_INFO(HEXAGON.S4_subaddi,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_subaddi :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subaddi">;
+//
+// BUILTIN_INFO(HEXAGON.M4_and_and,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_and_and :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_and">;
+//
+// BUILTIN_INFO(HEXAGON.M4_and_andn,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_and_andn :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_andn">;
+//
+// BUILTIN_INFO(HEXAGON.M4_and_or,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_and_or :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_or">;
+//
+// BUILTIN_INFO(HEXAGON.M4_and_xor,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_and_xor :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_xor">;
+//
+// BUILTIN_INFO(HEXAGON.M4_or_and,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_or_and :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_and">;
+//
+// BUILTIN_INFO(HEXAGON.M4_or_andn,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_or_andn :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_andn">;
+//
+// BUILTIN_INFO(HEXAGON.M4_or_or,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_or_or :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_or">;
+//
+// BUILTIN_INFO(HEXAGON.M4_or_xor,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_or_xor :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_xor">;
+//
+// BUILTIN_INFO(HEXAGON.S4_or_andix,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_or_andix :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_andix">;
+//
+// BUILTIN_INFO(HEXAGON.S4_or_andi,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_or_andi :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_andi">;
+//
+// BUILTIN_INFO(HEXAGON.S4_or_ori,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_or_ori :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_ori">;
+//
+// BUILTIN_INFO(HEXAGON.M4_xor_and,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_xor_and :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_and">;
+//
+// BUILTIN_INFO(HEXAGON.M4_xor_or,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_xor_or :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_or">;
+//
+// BUILTIN_INFO(HEXAGON.M4_xor_andn,SI_ftype_SISISI,3)
+//
+def int_hexagon_M4_xor_andn :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_andn">;
//
// BUILTIN_INFO(HEXAGON.A2_subri,SI_ftype_SISI,2)
//
def int_hexagon_A2_subri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.subri">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subri">;
//
// BUILTIN_INFO(HEXAGON.A2_andir,SI_ftype_SISI,2)
//
def int_hexagon_A2_andir :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.andir">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_andir">;
//
// BUILTIN_INFO(HEXAGON.A2_orir,SI_ftype_SISI,2)
//
def int_hexagon_A2_orir :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.orir">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_orir">;
//
// BUILTIN_INFO(HEXAGON.A2_andp,DI_ftype_DIDI,2)
//
def int_hexagon_A2_andp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.andp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_andp">;
//
// BUILTIN_INFO(HEXAGON.A2_orp,DI_ftype_DIDI,2)
//
def int_hexagon_A2_orp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.orp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_orp">;
//
// BUILTIN_INFO(HEXAGON.A2_xorp,DI_ftype_DIDI,2)
//
def int_hexagon_A2_xorp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.xorp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_xorp">;
//
// BUILTIN_INFO(HEXAGON.A2_notp,DI_ftype_DI,1)
//
def int_hexagon_A2_notp :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.notp">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_notp">;
//
// BUILTIN_INFO(HEXAGON.A2_sxtw,DI_ftype_SI,1)
//
def int_hexagon_A2_sxtw :
-Hexagon_di_si_Intrinsic<"HEXAGON.A2.sxtw">;
+Hexagon_di_si_Intrinsic<"HEXAGON_A2_sxtw">;
//
// BUILTIN_INFO(HEXAGON.A2_sat,SI_ftype_DI,1)
//
def int_hexagon_A2_sat :
-Hexagon_si_di_Intrinsic<"HEXAGON.A2.sat">;
+Hexagon_si_di_Intrinsic<"HEXAGON_A2_sat">;
+//
+// BUILTIN_INFO(HEXAGON.A2_roundsat,SI_ftype_DI,1)
+//
+def int_hexagon_A2_roundsat :
+Hexagon_si_di_Intrinsic<"HEXAGON_A2_roundsat">;
//
// BUILTIN_INFO(HEXAGON.A2_sath,SI_ftype_SI,1)
//
def int_hexagon_A2_sath :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.sath">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_sath">;
//
// BUILTIN_INFO(HEXAGON.A2_satuh,SI_ftype_SI,1)
//
def int_hexagon_A2_satuh :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.satuh">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_satuh">;
//
// BUILTIN_INFO(HEXAGON.A2_satub,SI_ftype_SI,1)
//
def int_hexagon_A2_satub :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.satub">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_satub">;
//
// BUILTIN_INFO(HEXAGON.A2_satb,SI_ftype_SI,1)
//
def int_hexagon_A2_satb :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.satb">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_satb">;
//
// BUILTIN_INFO(HEXAGON.A2_vaddub,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vaddub :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vaddub">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddub">;
+//
+// BUILTIN_INFO(HEXAGON.A2_vaddb_map,DI_ftype_DIDI,2)
+//
+def int_hexagon_A2_vaddb_map :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddb_map">;
//
// BUILTIN_INFO(HEXAGON.A2_vaddubs,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vaddubs :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vaddubs">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddubs">;
//
// BUILTIN_INFO(HEXAGON.A2_vaddh,DI_ftype_DIDI,2)
//
-def int_hexagon_A2_vaddh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vaddh">;
+def int_hexagon_A2_vaddh :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddh">;
//
// BUILTIN_INFO(HEXAGON.A2_vaddhs,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vaddhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vaddhs">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddhs">;
//
// BUILTIN_INFO(HEXAGON.A2_vadduhs,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vadduhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vadduhs">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vadduhs">;
+//
+// BUILTIN_INFO(HEXAGON.A5_vaddhubs,SI_ftype_DIDI,2)
+//
+def int_hexagon_A5_vaddhubs :
+Hexagon_si_didi_Intrinsic<"HEXAGON_A5_vaddhubs">;
//
// BUILTIN_INFO(HEXAGON.A2_vaddw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vaddw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vaddw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddw">;
//
// BUILTIN_INFO(HEXAGON.A2_vaddws,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vaddws :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vaddws">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddws">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vxaddsubw,DI_ftype_DIDI,2)
+//
+def int_hexagon_S4_vxaddsubw :
+Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubw">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vxsubaddw,DI_ftype_DIDI,2)
+//
+def int_hexagon_S4_vxsubaddw :
+Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddw">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vxaddsubh,DI_ftype_DIDI,2)
+//
+def int_hexagon_S4_vxaddsubh :
+Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubh">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vxsubaddh,DI_ftype_DIDI,2)
+//
+def int_hexagon_S4_vxsubaddh :
+Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddh">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vxaddsubhr,DI_ftype_DIDI,2)
+//
+def int_hexagon_S4_vxaddsubhr :
+Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubhr">;
+//
+// BUILTIN_INFO(HEXAGON.S4_vxsubaddhr,DI_ftype_DIDI,2)
+//
+def int_hexagon_S4_vxsubaddhr :
+Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddhr">;
//
// BUILTIN_INFO(HEXAGON.A2_svavgh,SI_ftype_SISI,2)
//
def int_hexagon_A2_svavgh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svavgh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svavgh">;
//
// BUILTIN_INFO(HEXAGON.A2_svavghs,SI_ftype_SISI,2)
//
def int_hexagon_A2_svavghs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svavghs">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svavghs">;
//
// BUILTIN_INFO(HEXAGON.A2_svnavgh,SI_ftype_SISI,2)
//
def int_hexagon_A2_svnavgh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svnavgh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svnavgh">;
//
// BUILTIN_INFO(HEXAGON.A2_svaddh,SI_ftype_SISI,2)
//
def int_hexagon_A2_svaddh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svaddh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svaddh">;
//
// BUILTIN_INFO(HEXAGON.A2_svaddhs,SI_ftype_SISI,2)
//
def int_hexagon_A2_svaddhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svaddhs">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svaddhs">;
//
// BUILTIN_INFO(HEXAGON.A2_svadduhs,SI_ftype_SISI,2)
//
def int_hexagon_A2_svadduhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svadduhs">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svadduhs">;
//
// BUILTIN_INFO(HEXAGON.A2_svsubh,SI_ftype_SISI,2)
//
def int_hexagon_A2_svsubh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svsubh">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubh">;
//
// BUILTIN_INFO(HEXAGON.A2_svsubhs,SI_ftype_SISI,2)
//
def int_hexagon_A2_svsubhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svsubhs">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubhs">;
//
// BUILTIN_INFO(HEXAGON.A2_svsubuhs,SI_ftype_SISI,2)
//
def int_hexagon_A2_svsubuhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A2.svsubuhs">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubuhs">;
//
// BUILTIN_INFO(HEXAGON.A2_vraddub,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vraddub :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vraddub">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vraddub">;
//
// BUILTIN_INFO(HEXAGON.A2_vraddub_acc,DI_ftype_DIDIDI,3)
//
def int_hexagon_A2_vraddub_acc :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.A2.vraddub.acc">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_A2_vraddub_acc">;
+//
+// BUILTIN_INFO(HEXAGON.M2_vraddh,SI_ftype_DIDI,2)
+//
+def int_hexagon_M2_vraddh :
+Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vraddh">;
//
// BUILTIN_INFO(HEXAGON.M2_vradduh,SI_ftype_DIDI,2)
//
def int_hexagon_M2_vradduh :
-Hexagon_si_didi_Intrinsic<"HEXAGON.M2.vradduh">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vradduh">;
//
// BUILTIN_INFO(HEXAGON.A2_vsubub,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vsubub :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vsubub">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubub">;
+//
+// BUILTIN_INFO(HEXAGON.A2_vsubb_map,DI_ftype_DIDI,2)
+//
+def int_hexagon_A2_vsubb_map :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubb_map">;
//
// BUILTIN_INFO(HEXAGON.A2_vsububs,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vsububs :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vsububs">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsububs">;
//
// BUILTIN_INFO(HEXAGON.A2_vsubh,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vsubh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vsubh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubh">;
//
// BUILTIN_INFO(HEXAGON.A2_vsubhs,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vsubhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vsubhs">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubhs">;
//
// BUILTIN_INFO(HEXAGON.A2_vsubuhs,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vsubuhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vsubuhs">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubuhs">;
//
// BUILTIN_INFO(HEXAGON.A2_vsubw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vsubw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vsubw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubw">;
//
// BUILTIN_INFO(HEXAGON.A2_vsubws,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vsubws :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vsubws">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubws">;
//
// BUILTIN_INFO(HEXAGON.A2_vabsh,DI_ftype_DI,1)
//
def int_hexagon_A2_vabsh :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.vabsh">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabsh">;
//
// BUILTIN_INFO(HEXAGON.A2_vabshsat,DI_ftype_DI,1)
//
def int_hexagon_A2_vabshsat :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.vabshsat">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabshsat">;
//
// BUILTIN_INFO(HEXAGON.A2_vabsw,DI_ftype_DI,1)
//
def int_hexagon_A2_vabsw :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.vabsw">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabsw">;
//
// BUILTIN_INFO(HEXAGON.A2_vabswsat,DI_ftype_DI,1)
//
def int_hexagon_A2_vabswsat :
-Hexagon_di_di_Intrinsic<"HEXAGON.A2.vabswsat">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabswsat">;
//
// BUILTIN_INFO(HEXAGON.M2_vabsdiffw,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vabsdiffw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vabsdiffw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vabsdiffw">;
//
// BUILTIN_INFO(HEXAGON.M2_vabsdiffh,DI_ftype_DIDI,2)
//
def int_hexagon_M2_vabsdiffh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.M2.vabsdiffh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vabsdiffh">;
//
// BUILTIN_INFO(HEXAGON.A2_vrsadub,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vrsadub :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vrsadub">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vrsadub">;
//
// BUILTIN_INFO(HEXAGON.A2_vrsadub_acc,DI_ftype_DIDIDI,3)
//
def int_hexagon_A2_vrsadub_acc :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.A2.vrsadub.acc">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_A2_vrsadub_acc">;
//
// BUILTIN_INFO(HEXAGON.A2_vavgub,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavgub :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavgub">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgub">;
//
// BUILTIN_INFO(HEXAGON.A2_vavguh,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavguh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavguh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguh">;
//
// BUILTIN_INFO(HEXAGON.A2_vavgh,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavgh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavgh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgh">;
//
// BUILTIN_INFO(HEXAGON.A2_vnavgh,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vnavgh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vnavgh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgh">;
//
// BUILTIN_INFO(HEXAGON.A2_vavgw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavgw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavgw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgw">;
//
// BUILTIN_INFO(HEXAGON.A2_vnavgw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vnavgw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vnavgw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgw">;
//
// BUILTIN_INFO(HEXAGON.A2_vavgwr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavgwr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavgwr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgwr">;
//
// BUILTIN_INFO(HEXAGON.A2_vnavgwr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vnavgwr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vnavgwr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgwr">;
//
// BUILTIN_INFO(HEXAGON.A2_vavgwcr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavgwcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavgwcr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgwcr">;
//
// BUILTIN_INFO(HEXAGON.A2_vnavgwcr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vnavgwcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vnavgwcr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgwcr">;
//
// BUILTIN_INFO(HEXAGON.A2_vavghcr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavghcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavghcr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavghcr">;
//
// BUILTIN_INFO(HEXAGON.A2_vnavghcr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vnavghcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vnavghcr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavghcr">;
//
// BUILTIN_INFO(HEXAGON.A2_vavguw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavguw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavguw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguw">;
//
// BUILTIN_INFO(HEXAGON.A2_vavguwr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavguwr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavguwr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguwr">;
//
// BUILTIN_INFO(HEXAGON.A2_vavgubr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavgubr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavgubr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgubr">;
//
// BUILTIN_INFO(HEXAGON.A2_vavguhr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavguhr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavguhr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguhr">;
//
// BUILTIN_INFO(HEXAGON.A2_vavghr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vavghr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vavghr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavghr">;
//
// BUILTIN_INFO(HEXAGON.A2_vnavghr,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vnavghr :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vnavghr">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavghr">;
//
-// BUILTIN_INFO(HEXAGON.A2_vminh,DI_ftype_DIDI,2)
+// BUILTIN_INFO(HEXAGON.A4_round_ri,SI_ftype_SISI,2)
//
-def int_hexagon_A2_vminh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vminh">;
+def int_hexagon_A4_round_ri :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_ri">;
//
-// BUILTIN_INFO(HEXAGON.A2_vmaxh,DI_ftype_DIDI,2)
+// BUILTIN_INFO(HEXAGON.A4_round_rr,SI_ftype_SISI,2)
//
-def int_hexagon_A2_vmaxh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vmaxh">;
+def int_hexagon_A4_round_rr :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_rr">;
+//
+// BUILTIN_INFO(HEXAGON.A4_round_ri_sat,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_round_ri_sat :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_ri_sat">;
+//
+// BUILTIN_INFO(HEXAGON.A4_round_rr_sat,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_round_rr_sat :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_rr_sat">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cround_ri,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_cround_ri :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cround_ri">;
+//
+// BUILTIN_INFO(HEXAGON.A4_cround_rr,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_cround_rr :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cround_rr">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrminh,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrminh :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminh">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrmaxh,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrmaxh :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxh">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrminuh,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrminuh :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminuh">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrmaxuh,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrmaxuh :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxuh">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrminw,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrminw :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminw">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrmaxw,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrmaxw :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxw">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrminuw,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrminuw :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminuw">;
+//
+// BUILTIN_INFO(HEXAGON.A4_vrmaxuw,DI_ftype_DIDISI,3)
+//
+def int_hexagon_A4_vrmaxuw :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxuw">;
+//
+// BUILTIN_INFO(HEXAGON.A2_vminb,DI_ftype_DIDI,2)
+//
+def int_hexagon_A2_vminb :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminb">;
+//
+// BUILTIN_INFO(HEXAGON.A2_vmaxb,DI_ftype_DIDI,2)
+//
+def int_hexagon_A2_vmaxb :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxb">;
//
// BUILTIN_INFO(HEXAGON.A2_vminub,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vminub :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vminub">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminub">;
//
// BUILTIN_INFO(HEXAGON.A2_vmaxub,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vmaxub :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vmaxub">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxub">;
+//
+// BUILTIN_INFO(HEXAGON.A2_vminh,DI_ftype_DIDI,2)
+//
+def int_hexagon_A2_vminh :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminh">;
+//
+// BUILTIN_INFO(HEXAGON.A2_vmaxh,DI_ftype_DIDI,2)
+//
+def int_hexagon_A2_vmaxh :
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxh">;
//
// BUILTIN_INFO(HEXAGON.A2_vminuh,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vminuh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vminuh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminuh">;
//
// BUILTIN_INFO(HEXAGON.A2_vmaxuh,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vmaxuh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vmaxuh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxuh">;
//
// BUILTIN_INFO(HEXAGON.A2_vminw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vminw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vminw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminw">;
//
// BUILTIN_INFO(HEXAGON.A2_vmaxw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vmaxw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vmaxw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxw">;
//
// BUILTIN_INFO(HEXAGON.A2_vminuw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vminuw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vminuw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminuw">;
//
// BUILTIN_INFO(HEXAGON.A2_vmaxuw,DI_ftype_DIDI,2)
//
def int_hexagon_A2_vmaxuw :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A2.vmaxuw">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxuw">;
+//
+// BUILTIN_INFO(HEXAGON.A4_modwrapu,SI_ftype_SISI,2)
+//
+def int_hexagon_A4_modwrapu :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_modwrapu">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfadd,SF_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfadd :
+Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfadd">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfsub,SF_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfsub :
+Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfsub">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfmpy,SF_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfmpy :
+Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmpy">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffma,SF_ftype_SFSFSF,3)
+//
+def int_hexagon_F2_sffma :
+Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffma">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffma_sc,SF_ftype_SFSFSFQI,4)
+//
+def int_hexagon_F2_sffma_sc :
+Hexagon_sf_sfsfsfqi_Intrinsic<"HEXAGON_F2_sffma_sc">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffms,SF_ftype_SFSFSF,3)
+//
+def int_hexagon_F2_sffms :
+Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffms">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffma_lib,SF_ftype_SFSFSF,3)
+//
+def int_hexagon_F2_sffma_lib :
+Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffma_lib">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffms_lib,SF_ftype_SFSFSF,3)
+//
+def int_hexagon_F2_sffms_lib :
+Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffms_lib">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfcmpeq,QI_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfcmpeq :
+Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpeq">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfcmpgt,QI_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfcmpgt :
+Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpgt">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfcmpge,QI_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfcmpge :
+Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpge">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfcmpuo,QI_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfcmpuo :
+Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpuo">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfmax,SF_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfmax :
+Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmax">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfmin,SF_ftype_SFSF,2)
+//
+def int_hexagon_F2_sfmin :
+Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmin">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfclass,QI_ftype_SFSI,2)
+//
+def int_hexagon_F2_sfclass :
+Hexagon_qi_sfsi_Intrinsic<"HEXAGON_F2_sfclass">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfimm_p,SF_ftype_SI,1)
+//
+def int_hexagon_F2_sfimm_p :
+Hexagon_sf_si_Intrinsic<"HEXAGON_F2_sfimm_p">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sfimm_n,SF_ftype_SI,1)
+//
+def int_hexagon_F2_sfimm_n :
+Hexagon_sf_si_Intrinsic<"HEXAGON_F2_sfimm_n">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffixupn,SF_ftype_SFSF,2)
+//
+def int_hexagon_F2_sffixupn :
+Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sffixupn">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffixupd,SF_ftype_SFSF,2)
+//
+def int_hexagon_F2_sffixupd :
+Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sffixupd">;
+//
+// BUILTIN_INFO(HEXAGON.F2_sffixupr,SF_ftype_SF,1)
+//
+def int_hexagon_F2_sffixupr :
+Hexagon_sf_sf_Intrinsic<"HEXAGON_F2_sffixupr">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfadd,DF_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfadd :
+Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfadd">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfsub,DF_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfsub :
+Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfsub">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfmpy,DF_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfmpy :
+Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfmpy">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffma,DF_ftype_DFDFDF,3)
+//
+def int_hexagon_F2_dffma :
+Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffma">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffms,DF_ftype_DFDFDF,3)
+//
+def int_hexagon_F2_dffms :
+Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffms">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffma_lib,DF_ftype_DFDFDF,3)
+//
+def int_hexagon_F2_dffma_lib :
+Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffma_lib">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffms_lib,DF_ftype_DFDFDF,3)
+//
+def int_hexagon_F2_dffms_lib :
+Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffms_lib">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffma_sc,DF_ftype_DFDFDFQI,4)
+//
+def int_hexagon_F2_dffma_sc :
+Hexagon_df_dfdfdfqi_Intrinsic<"HEXAGON_F2_dffma_sc">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfmax,DF_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfmax :
+Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfmax">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfmin,DF_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfmin :
+Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfmin">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfcmpeq,QI_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfcmpeq :
+Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpeq">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfcmpgt,QI_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfcmpgt :
+Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpgt">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfcmpge,QI_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfcmpge :
+Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpge">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfcmpuo,QI_ftype_DFDF,2)
+//
+def int_hexagon_F2_dfcmpuo :
+Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpuo">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfclass,QI_ftype_DFSI,2)
+//
+def int_hexagon_F2_dfclass :
+Hexagon_qi_dfsi_Intrinsic<"HEXAGON_F2_dfclass">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfimm_p,DF_ftype_SI,1)
+//
+def int_hexagon_F2_dfimm_p :
+Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_p">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dfimm_n,DF_ftype_SI,1)
+//
+def int_hexagon_F2_dfimm_n :
+Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_n">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffixupn,DF_ftype_DFDF,2)
+//
+def int_hexagon_F2_dffixupn :
+Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dffixupn">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffixupd,DF_ftype_DFDF,2)
+//
+def int_hexagon_F2_dffixupd :
+Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dffixupd">;
+//
+// BUILTIN_INFO(HEXAGON.F2_dffixupr,DF_ftype_DF,1)
+//
+def int_hexagon_F2_dffixupr :
+Hexagon_df_df_Intrinsic<"HEXAGON_F2_dffixupr">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2df,DF_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2df :
+Hexagon_df_sf_Intrinsic<"HEXAGON_F2_conv_sf2df">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2sf,SF_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2sf :
+Hexagon_sf_df_Intrinsic<"HEXAGON_F2_conv_df2sf">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_uw2sf,SF_ftype_SI,1)
+//
+def int_hexagon_F2_conv_uw2sf :
+Hexagon_sf_si_Intrinsic<"HEXAGON_F2_conv_uw2sf">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_uw2df,DF_ftype_SI,1)
+//
+def int_hexagon_F2_conv_uw2df :
+Hexagon_df_si_Intrinsic<"HEXAGON_F2_conv_uw2df">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_w2sf,SF_ftype_SI,1)
+//
+def int_hexagon_F2_conv_w2sf :
+Hexagon_sf_si_Intrinsic<"HEXAGON_F2_conv_w2sf">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_w2df,DF_ftype_SI,1)
+//
+def int_hexagon_F2_conv_w2df :
+Hexagon_df_si_Intrinsic<"HEXAGON_F2_conv_w2df">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_ud2sf,SF_ftype_DI,1)
+//
+def int_hexagon_F2_conv_ud2sf :
+Hexagon_sf_di_Intrinsic<"HEXAGON_F2_conv_ud2sf">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_ud2df,DF_ftype_DI,1)
+//
+def int_hexagon_F2_conv_ud2df :
+Hexagon_df_di_Intrinsic<"HEXAGON_F2_conv_ud2df">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_d2sf,SF_ftype_DI,1)
+//
+def int_hexagon_F2_conv_d2sf :
+Hexagon_sf_di_Intrinsic<"HEXAGON_F2_conv_d2sf">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_d2df,DF_ftype_DI,1)
+//
+def int_hexagon_F2_conv_d2df :
+Hexagon_df_di_Intrinsic<"HEXAGON_F2_conv_d2df">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2uw,SI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2uw :
+Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2uw">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2w,SI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2w :
+Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2w">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2ud,DI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2ud :
+Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2ud">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2d,DI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2d :
+Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2d">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2uw,SI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2uw :
+Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2uw">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2w,SI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2w :
+Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2w">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2ud,DI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2ud :
+Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2ud">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2d,DI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2d :
+Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2d">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2uw_chop,SI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2uw_chop :
+Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2uw_chop">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2w_chop,SI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2w_chop :
+Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2w_chop">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2ud_chop,DI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2ud_chop :
+Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2ud_chop">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_sf2d_chop,DI_ftype_SF,1)
+//
+def int_hexagon_F2_conv_sf2d_chop :
+Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2d_chop">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2uw_chop,SI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2uw_chop :
+Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2uw_chop">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2w_chop,SI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2w_chop :
+Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2w_chop">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2ud_chop,DI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2ud_chop :
+Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2ud_chop">;
+//
+// BUILTIN_INFO(HEXAGON.F2_conv_df2d_chop,DI_ftype_DF,1)
+//
+def int_hexagon_F2_conv_df2d_chop :
+Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2d_chop">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_asr_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asr.r.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_r_r">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_asl_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asl.r.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_r_r">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_lsr_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.lsr.r.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsr_r_r">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_lsl_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.lsl.r.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsl_r_r">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_p,DI_ftype_DISI,2)
//
def int_hexagon_S2_asr_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asr.r.p">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_p">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_p,DI_ftype_DISI,2)
//
def int_hexagon_S2_asl_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asl.r.p">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_p">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_p,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsr_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsr.r.p">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_p">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_p,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsl_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsl.r.p">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_p">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_r_acc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.r.r.acc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_r_acc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.r.r.acc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_acc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.r.r.acc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_acc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsl_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsl.r.r.acc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_p_acc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.r.p.acc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_p_acc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.r.p.acc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_acc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.r.p.acc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_acc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsl_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsl.r.p.acc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_r_nac,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.r.r.nac">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_r_nac,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.r.r.nac">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_nac,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.r.r.nac">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_nac,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsl_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsl.r.r.nac">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_p_nac,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.r.p.nac">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_p_nac,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.r.p.nac">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_nac,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.r.p.nac">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_nac,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsl_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsl.r.p.nac">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_r_and,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.r.r.and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_r_and,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.r.r.and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_and">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_and,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.r.r.and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_and">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_and,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsl_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsl.r.r.and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_r_or,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.r.r.or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_or">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_r_or,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.r.r.or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_or">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_or,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.r.r.or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_or">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_or,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsl_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsl.r.r.or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_or">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_p_and,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.r.p.and">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_p_and,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.r.p.and">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_and">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_and,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.r.p.and">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_and">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_and,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsl_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsl.r.p.and">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_p_or,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.r.p.or">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_or">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_p_or,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.r.p.or">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_or">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_or,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.r.p.or">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_or">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_or,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsl_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsl.r.p.or">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_or">;
+//
+// BUILTIN_INFO(HEXAGON.S2_asr_r_p_xor,DI_ftype_DIDISI,3)
+//
+def int_hexagon_S2_asr_r_p_xor :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_xor">;
+//
+// BUILTIN_INFO(HEXAGON.S2_asl_r_p_xor,DI_ftype_DIDISI,3)
+//
+def int_hexagon_S2_asl_r_p_xor :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_xor">;
+//
+// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_xor,DI_ftype_DIDISI,3)
+//
+def int_hexagon_S2_lsr_r_p_xor :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_xor">;
+//
+// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_xor,DI_ftype_DIDISI,3)
+//
+def int_hexagon_S2_lsl_r_p_xor :
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_xor">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_r_sat,SI_ftype_SISI,2)
//
def int_hexagon_S2_asr_r_r_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asr.r.r.sat">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_r_r_sat">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_r_sat,SI_ftype_SISI,2)
//
def int_hexagon_S2_asl_r_r_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asl.r.r.sat">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_r_r_sat">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_asr_i_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asr.i.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_lsr_i_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.lsr.i.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsr_i_r">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_asl_i_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asl.i.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_i_r">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_p,DI_ftype_DISI,2)
//
def int_hexagon_S2_asr_i_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asr.i.p">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_p,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsr_i_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsr.i.p">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_p">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_p,DI_ftype_DISI,2)
//
def int_hexagon_S2_asl_i_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asl.i.p">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_p">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_r_acc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_i_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.i.r.acc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_acc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_i_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.i.r.acc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_r_acc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_i_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.i.r.acc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_p_acc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_i_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.i.p.acc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_acc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_i_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.i.p.acc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_p_acc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_i_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.i.p.acc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_acc">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_r_nac,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_i_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.i.r.nac">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_nac,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_i_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.i.r.nac">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_r_nac,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_i_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.i.r.nac">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_p_nac,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_i_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.i.p.nac">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_nac,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_i_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.i.p.nac">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_p_nac,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_i_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.i.p.nac">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_nac">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_xacc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_i_r_xacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.i.r.xacc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_xacc">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_r_xacc,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_i_r_xacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.i.r.xacc">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_xacc">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_xacc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_i_p_xacc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.i.p.xacc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_xacc">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_p_xacc,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_i_p_xacc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.i.p.xacc">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_xacc">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_r_and,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_i_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.i.r.and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_and">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_and,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_i_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.i.r.and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_r_and,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_i_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.i.r.and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_r_or,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asr_i_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asr.i.r.or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_or">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_or,SI_ftype_SISISI,3)
//
def int_hexagon_S2_lsr_i_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.lsr.i.r.or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_or">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_r_or,SI_ftype_SISISI,3)
//
def int_hexagon_S2_asl_i_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.asl.i.r.or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_or">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_p_and,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_i_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.i.p.and">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_and">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_and,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_i_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.i.p.and">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_p_and,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_i_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.i.p.and">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_and">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_p_or,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asr_i_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asr.i.p.or">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_or">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_or,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_lsr_i_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.lsr.i.p.or">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_or">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_p_or,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_asl_i_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.asl.i.p.or">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_or">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_r_sat,SI_ftype_SISI,2)
//
def int_hexagon_S2_asl_i_r_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asl.i.r.sat">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_i_r_sat">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_r_rnd,SI_ftype_SISI,2)
//
def int_hexagon_S2_asr_i_r_rnd :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asr.i.r.rnd">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r_rnd">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_r_rnd_goodsyntax,SI_ftype_SISI,2)
//
def int_hexagon_S2_asr_i_r_rnd_goodsyntax :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.asr.i.r.rnd.goodsyntax">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r_rnd_goodsyntax">;
+//
+// BUILTIN_INFO(HEXAGON.S2_asr_i_p_rnd,DI_ftype_DISI,2)
+//
+def int_hexagon_S2_asr_i_p_rnd :
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p_rnd">;
+//
+// BUILTIN_INFO(HEXAGON.S2_asr_i_p_rnd_goodsyntax,DI_ftype_DISI,2)
+//
+def int_hexagon_S2_asr_i_p_rnd_goodsyntax :
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p_rnd_goodsyntax">;
+//
+// BUILTIN_INFO(HEXAGON.S4_lsli,SI_ftype_SISI,2)
+//
+def int_hexagon_S4_lsli :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_lsli">;
//
// BUILTIN_INFO(HEXAGON.S2_addasl_rrri,SI_ftype_SISISI,3)
//
def int_hexagon_S2_addasl_rrri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.addasl.rrri">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_addasl_rrri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_andi_asl_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_andi_asl_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_andi_asl_ri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_ori_asl_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_ori_asl_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_ori_asl_ri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_addi_asl_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_addi_asl_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addi_asl_ri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_subi_asl_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_subi_asl_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subi_asl_ri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_andi_lsr_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_andi_lsr_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_andi_lsr_ri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_ori_lsr_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_ori_lsr_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_ori_lsr_ri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_addi_lsr_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_addi_lsr_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addi_lsr_ri">;
+//
+// BUILTIN_INFO(HEXAGON.S4_subi_lsr_ri,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_subi_lsr_ri :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subi_lsr_ri">;
//
// BUILTIN_INFO(HEXAGON.S2_valignib,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_valignib :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.valignib">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_valignib">;
//
// BUILTIN_INFO(HEXAGON.S2_valignrb,DI_ftype_DIDIQI,3)
//
def int_hexagon_S2_valignrb :
-Hexagon_di_didiqi_Intrinsic<"HEXAGON.S2.valignrb">;
+Hexagon_di_didiqi_Intrinsic<"HEXAGON_S2_valignrb">;
//
// BUILTIN_INFO(HEXAGON.S2_vspliceib,DI_ftype_DIDISI,3)
//
def int_hexagon_S2_vspliceib :
-Hexagon_di_didisi_Intrinsic<"HEXAGON.S2.vspliceib">;
+Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_vspliceib">;
//
// BUILTIN_INFO(HEXAGON.S2_vsplicerb,DI_ftype_DIDIQI,3)
//
def int_hexagon_S2_vsplicerb :
-Hexagon_di_didiqi_Intrinsic<"HEXAGON.S2.vsplicerb">;
+Hexagon_di_didiqi_Intrinsic<"HEXAGON_S2_vsplicerb">;
//
// BUILTIN_INFO(HEXAGON.S2_vsplatrh,DI_ftype_SI,1)
//
def int_hexagon_S2_vsplatrh :
-Hexagon_di_si_Intrinsic<"HEXAGON.S2.vsplatrh">;
+Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsplatrh">;
//
// BUILTIN_INFO(HEXAGON.S2_vsplatrb,SI_ftype_SI,1)
//
def int_hexagon_S2_vsplatrb :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.vsplatrb">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_vsplatrb">;
//
// BUILTIN_INFO(HEXAGON.S2_insert,SI_ftype_SISISISI,4)
//
def int_hexagon_S2_insert :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON.S2.insert">;
+Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_insert">;
//
// BUILTIN_INFO(HEXAGON.S2_tableidxb_goodsyntax,SI_ftype_SISISISI,4)
//
def int_hexagon_S2_tableidxb_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON.S2.tableidxb.goodsyntax">;
+Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxb_goodsyntax">;
//
// BUILTIN_INFO(HEXAGON.S2_tableidxh_goodsyntax,SI_ftype_SISISISI,4)
//
def int_hexagon_S2_tableidxh_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON.S2.tableidxh.goodsyntax">;
+Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxh_goodsyntax">;
//
// BUILTIN_INFO(HEXAGON.S2_tableidxw_goodsyntax,SI_ftype_SISISISI,4)
//
def int_hexagon_S2_tableidxw_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON.S2.tableidxw.goodsyntax">;
+Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxw_goodsyntax">;
//
// BUILTIN_INFO(HEXAGON.S2_tableidxd_goodsyntax,SI_ftype_SISISISI,4)
//
def int_hexagon_S2_tableidxd_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON.S2.tableidxd.goodsyntax">;
+Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxd_goodsyntax">;
+//
+// BUILTIN_INFO(HEXAGON.A4_bitspliti,DI_ftype_SISI,2)
+//
+def int_hexagon_A4_bitspliti :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_bitspliti">;
+//
+// BUILTIN_INFO(HEXAGON.A4_bitsplit,DI_ftype_SISI,2)
+//
+def int_hexagon_A4_bitsplit :
+Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_bitsplit">;
+//
+// BUILTIN_INFO(HEXAGON.S4_extract,SI_ftype_SISISI,3)
+//
+def int_hexagon_S4_extract :
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_extract">;
//
// BUILTIN_INFO(HEXAGON.S2_extractu,SI_ftype_SISISI,3)
//
def int_hexagon_S2_extractu :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S2.extractu">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_extractu">;
//
// BUILTIN_INFO(HEXAGON.S2_insertp,DI_ftype_DIDISISI,4)
//
def int_hexagon_S2_insertp :
-Hexagon_di_didisisi_Intrinsic<"HEXAGON.S2.insertp">;
+Hexagon_di_didisisi_Intrinsic<"HEXAGON_S2_insertp">;
+//
+// BUILTIN_INFO(HEXAGON.S4_extractp,DI_ftype_DISISI,3)
+//
+def int_hexagon_S4_extractp :
+Hexagon_di_disisi_Intrinsic<"HEXAGON_S4_extractp">;
//
// BUILTIN_INFO(HEXAGON.S2_extractup,DI_ftype_DISISI,3)
//
def int_hexagon_S2_extractup :
-Hexagon_di_disisi_Intrinsic<"HEXAGON.S2.extractup">;
+Hexagon_di_disisi_Intrinsic<"HEXAGON_S2_extractup">;
//
// BUILTIN_INFO(HEXAGON.S2_insert_rp,SI_ftype_SISIDI,3)
//
def int_hexagon_S2_insert_rp :
-Hexagon_si_sisidi_Intrinsic<"HEXAGON.S2.insert.rp">;
+Hexagon_si_sisidi_Intrinsic<"HEXAGON_S2_insert_rp">;
+//
+// BUILTIN_INFO(HEXAGON.S4_extract_rp,SI_ftype_SIDI,2)
+//
+def int_hexagon_S4_extract_rp :
+Hexagon_si_sidi_Intrinsic<"HEXAGON_S4_extract_rp">;
//
// BUILTIN_INFO(HEXAGON.S2_extractu_rp,SI_ftype_SIDI,2)
//
def int_hexagon_S2_extractu_rp :
-Hexagon_si_sidi_Intrinsic<"HEXAGON.S2.extractu.rp">;
+Hexagon_si_sidi_Intrinsic<"HEXAGON_S2_extractu_rp">;
//
// BUILTIN_INFO(HEXAGON.S2_insertp_rp,DI_ftype_DIDIDI,3)
//
def int_hexagon_S2_insertp_rp :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.S2.insertp.rp">;
+Hexagon_di_dididi_Intrinsic<"HEXAGON_S2_insertp_rp">;
+//
+// BUILTIN_INFO(HEXAGON.S4_extractp_rp,DI_ftype_DIDI,2)
+//
+def int_hexagon_S4_extractp_rp :
+Hexagon_di_didi_Intrinsic<"HEXAGON_S4_extractp_rp">;
//
// BUILTIN_INFO(HEXAGON.S2_extractup_rp,DI_ftype_DIDI,2)
//
def int_hexagon_S2_extractup_rp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.extractup.rp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_extractup_rp">;
//
// BUILTIN_INFO(HEXAGON.S2_tstbit_i,QI_ftype_SISI,2)
//
def int_hexagon_S2_tstbit_i :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.S2.tstbit.i">;
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_S2_tstbit_i">;
+//
+// BUILTIN_INFO(HEXAGON.S4_ntstbit_i,QI_ftype_SISI,2)
+//
+def int_hexagon_S4_ntstbit_i :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_S4_ntstbit_i">;
//
// BUILTIN_INFO(HEXAGON.S2_setbit_i,SI_ftype_SISI,2)
//
def int_hexagon_S2_setbit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.setbit.i">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_setbit_i">;
//
// BUILTIN_INFO(HEXAGON.S2_togglebit_i,SI_ftype_SISI,2)
//
def int_hexagon_S2_togglebit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.togglebit.i">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_togglebit_i">;
//
// BUILTIN_INFO(HEXAGON.S2_clrbit_i,SI_ftype_SISI,2)
//
def int_hexagon_S2_clrbit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.clrbit.i">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_clrbit_i">;
//
// BUILTIN_INFO(HEXAGON.S2_tstbit_r,QI_ftype_SISI,2)
//
def int_hexagon_S2_tstbit_r :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.S2.tstbit.r">;
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_S2_tstbit_r">;
+//
+// BUILTIN_INFO(HEXAGON.S4_ntstbit_r,QI_ftype_SISI,2)
+//
+def int_hexagon_S4_ntstbit_r :
+Hexagon_qi_sisi_Intrinsic<"HEXAGON_S4_ntstbit_r">;
//
// BUILTIN_INFO(HEXAGON.S2_setbit_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_setbit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.setbit.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_setbit_r">;
//
// BUILTIN_INFO(HEXAGON.S2_togglebit_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_togglebit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.togglebit.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_togglebit_r">;
//
// BUILTIN_INFO(HEXAGON.S2_clrbit_r,SI_ftype_SISI,2)
//
def int_hexagon_S2_clrbit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.S2.clrbit.r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_clrbit_r">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_vh,DI_ftype_DISI,2)
//
def int_hexagon_S2_asr_i_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asr.i.vh">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_vh">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_vh,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsr_i_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsr.i.vh">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_vh">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_vh,DI_ftype_DISI,2)
//
def int_hexagon_S2_asl_i_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asl.i.vh">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_vh">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_vh,DI_ftype_DISI,2)
//
def int_hexagon_S2_asr_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asr.r.vh">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_vh">;
+//
+// BUILTIN_INFO(HEXAGON.S5_asrhub_rnd_sat_goodsyntax,SI_ftype_DISI,2)
+//
+def int_hexagon_S5_asrhub_rnd_sat_goodsyntax :
+Hexagon_si_disi_Intrinsic<"HEXAGON_S5_asrhub_rnd_sat_goodsyntax">;
+//
+// BUILTIN_INFO(HEXAGON.S5_asrhub_sat,SI_ftype_DISI,2)
+//
+def int_hexagon_S5_asrhub_sat :
+Hexagon_si_disi_Intrinsic<"HEXAGON_S5_asrhub_sat">;
+//
+// BUILTIN_INFO(HEXAGON.S5_vasrhrnd_goodsyntax,DI_ftype_DISI,2)
+//
+def int_hexagon_S5_vasrhrnd_goodsyntax :
+Hexagon_di_disi_Intrinsic<"HEXAGON_S5_vasrhrnd_goodsyntax">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_vh,DI_ftype_DISI,2)
//
def int_hexagon_S2_asl_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asl.r.vh">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_vh">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_vh,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsr_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsr.r.vh">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_vh">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_vh,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsl_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsl.r.vh">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_vh">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_vw,DI_ftype_DISI,2)
//
def int_hexagon_S2_asr_i_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asr.i.vw">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_vw">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_i_svw_trun,SI_ftype_DISI,2)
//
def int_hexagon_S2_asr_i_svw_trun :
-Hexagon_si_disi_Intrinsic<"HEXAGON.S2.asr.i.svw.trun">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_S2_asr_i_svw_trun">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_svw_trun,SI_ftype_DISI,2)
//
def int_hexagon_S2_asr_r_svw_trun :
-Hexagon_si_disi_Intrinsic<"HEXAGON.S2.asr.r.svw.trun">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_S2_asr_r_svw_trun">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_i_vw,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsr_i_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsr.i.vw">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_vw">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_i_vw,DI_ftype_DISI,2)
//
def int_hexagon_S2_asl_i_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asl.i.vw">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_vw">;
//
// BUILTIN_INFO(HEXAGON.S2_asr_r_vw,DI_ftype_DISI,2)
//
def int_hexagon_S2_asr_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asr.r.vw">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_vw">;
//
// BUILTIN_INFO(HEXAGON.S2_asl_r_vw,DI_ftype_DISI,2)
//
def int_hexagon_S2_asl_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.asl.r.vw">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_vw">;
//
// BUILTIN_INFO(HEXAGON.S2_lsr_r_vw,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsr_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsr.r.vw">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_vw">;
//
// BUILTIN_INFO(HEXAGON.S2_lsl_r_vw,DI_ftype_DISI,2)
//
def int_hexagon_S2_lsl_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON.S2.lsl.r.vw">;
+Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_vw">;
//
// BUILTIN_INFO(HEXAGON.S2_vrndpackwh,SI_ftype_DI,1)
//
def int_hexagon_S2_vrndpackwh :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vrndpackwh">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vrndpackwh">;
//
// BUILTIN_INFO(HEXAGON.S2_vrndpackwhs,SI_ftype_DI,1)
//
def int_hexagon_S2_vrndpackwhs :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vrndpackwhs">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vrndpackwhs">;
//
// BUILTIN_INFO(HEXAGON.S2_vsxtbh,DI_ftype_SI,1)
//
def int_hexagon_S2_vsxtbh :
-Hexagon_di_si_Intrinsic<"HEXAGON.S2.vsxtbh">;
+Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsxtbh">;
//
// BUILTIN_INFO(HEXAGON.S2_vzxtbh,DI_ftype_SI,1)
//
def int_hexagon_S2_vzxtbh :
-Hexagon_di_si_Intrinsic<"HEXAGON.S2.vzxtbh">;
+Hexagon_di_si_Intrinsic<"HEXAGON_S2_vzxtbh">;
//
// BUILTIN_INFO(HEXAGON.S2_vsathub,SI_ftype_DI,1)
//
def int_hexagon_S2_vsathub :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vsathub">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsathub">;
//
// BUILTIN_INFO(HEXAGON.S2_svsathub,SI_ftype_SI,1)
//
def int_hexagon_S2_svsathub :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.svsathub">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_svsathub">;
//
// BUILTIN_INFO(HEXAGON.S2_svsathb,SI_ftype_SI,1)
//
def int_hexagon_S2_svsathb :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.svsathb">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_svsathb">;
//
// BUILTIN_INFO(HEXAGON.S2_vsathb,SI_ftype_DI,1)
//
def int_hexagon_S2_vsathb :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vsathb">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsathb">;
//
// BUILTIN_INFO(HEXAGON.S2_vtrunohb,SI_ftype_DI,1)
//
def int_hexagon_S2_vtrunohb :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vtrunohb">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vtrunohb">;
//
// BUILTIN_INFO(HEXAGON.S2_vtrunewh,DI_ftype_DIDI,2)
//
def int_hexagon_S2_vtrunewh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.vtrunewh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_vtrunewh">;
//
// BUILTIN_INFO(HEXAGON.S2_vtrunowh,DI_ftype_DIDI,2)
//
def int_hexagon_S2_vtrunowh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.vtrunowh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_vtrunowh">;
//
// BUILTIN_INFO(HEXAGON.S2_vtrunehb,SI_ftype_DI,1)
//
def int_hexagon_S2_vtrunehb :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vtrunehb">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vtrunehb">;
//
// BUILTIN_INFO(HEXAGON.S2_vsxthw,DI_ftype_SI,1)
//
def int_hexagon_S2_vsxthw :
-Hexagon_di_si_Intrinsic<"HEXAGON.S2.vsxthw">;
+Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsxthw">;
//
// BUILTIN_INFO(HEXAGON.S2_vzxthw,DI_ftype_SI,1)
//
def int_hexagon_S2_vzxthw :
-Hexagon_di_si_Intrinsic<"HEXAGON.S2.vzxthw">;
+Hexagon_di_si_Intrinsic<"HEXAGON_S2_vzxthw">;
//
// BUILTIN_INFO(HEXAGON.S2_vsatwh,SI_ftype_DI,1)
//
def int_hexagon_S2_vsatwh :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vsatwh">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsatwh">;
//
// BUILTIN_INFO(HEXAGON.S2_vsatwuh,SI_ftype_DI,1)
//
def int_hexagon_S2_vsatwuh :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.vsatwuh">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsatwuh">;
//
// BUILTIN_INFO(HEXAGON.S2_packhl,DI_ftype_SISI,2)
//
def int_hexagon_S2_packhl :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.S2.packhl">;
+Hexagon_di_sisi_Intrinsic<"HEXAGON_S2_packhl">;
//
// BUILTIN_INFO(HEXAGON.A2_swiz,SI_ftype_SI,1)
//
def int_hexagon_A2_swiz :
-Hexagon_si_si_Intrinsic<"HEXAGON.A2.swiz">;
+Hexagon_si_si_Intrinsic<"HEXAGON_A2_swiz">;
//
// BUILTIN_INFO(HEXAGON.S2_vsathub_nopack,DI_ftype_DI,1)
//
def int_hexagon_S2_vsathub_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON.S2.vsathub.nopack">;
+Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsathub_nopack">;
//
// BUILTIN_INFO(HEXAGON.S2_vsathb_nopack,DI_ftype_DI,1)
//
def int_hexagon_S2_vsathb_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON.S2.vsathb.nopack">;
+Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsathb_nopack">;
//
// BUILTIN_INFO(HEXAGON.S2_vsatwh_nopack,DI_ftype_DI,1)
//
def int_hexagon_S2_vsatwh_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON.S2.vsatwh.nopack">;
+Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsatwh_nopack">;
//
// BUILTIN_INFO(HEXAGON.S2_vsatwuh_nopack,DI_ftype_DI,1)
//
def int_hexagon_S2_vsatwuh_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON.S2.vsatwuh.nopack">;
+Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsatwuh_nopack">;
//
// BUILTIN_INFO(HEXAGON.S2_shuffob,DI_ftype_DIDI,2)
//
def int_hexagon_S2_shuffob :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.shuffob">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffob">;
//
// BUILTIN_INFO(HEXAGON.S2_shuffeb,DI_ftype_DIDI,2)
//
def int_hexagon_S2_shuffeb :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.shuffeb">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffeb">;
//
// BUILTIN_INFO(HEXAGON.S2_shuffoh,DI_ftype_DIDI,2)
//
def int_hexagon_S2_shuffoh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.shuffoh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffoh">;
//
// BUILTIN_INFO(HEXAGON.S2_shuffeh,DI_ftype_DIDI,2)
//
def int_hexagon_S2_shuffeh :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.shuffeh">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffeh">;
+//
+// BUILTIN_INFO(HEXAGON.S5_popcountp,SI_ftype_DI,1)
+//
+def int_hexagon_S5_popcountp :
+Hexagon_si_di_Intrinsic<"HEXAGON_S5_popcountp">;
+//
+// BUILTIN_INFO(HEXAGON.S4_parity,SI_ftype_SISI,2)
+//
+def int_hexagon_S4_parity :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_parity">;
//
// BUILTIN_INFO(HEXAGON.S2_parityp,SI_ftype_DIDI,2)
//
def int_hexagon_S2_parityp :
-Hexagon_si_didi_Intrinsic<"HEXAGON.S2.parityp">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_S2_parityp">;
//
// BUILTIN_INFO(HEXAGON.S2_lfsp,DI_ftype_DIDI,2)
//
def int_hexagon_S2_lfsp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S2.lfsp">;
+Hexagon_di_didi_Intrinsic<"HEXAGON_S2_lfsp">;
//
// BUILTIN_INFO(HEXAGON.S2_clbnorm,SI_ftype_SI,1)
//
def int_hexagon_S2_clbnorm :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.clbnorm">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_clbnorm">;
+//
+// BUILTIN_INFO(HEXAGON.S4_clbaddi,SI_ftype_SISI,2)
+//
+def int_hexagon_S4_clbaddi :
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_clbaddi">;
+//
+// BUILTIN_INFO(HEXAGON.S4_clbpnorm,SI_ftype_DI,1)
+//
+def int_hexagon_S4_clbpnorm :
+Hexagon_si_di_Intrinsic<"HEXAGON_S4_clbpnorm">;
+//
+// BUILTIN_INFO(HEXAGON.S4_clbpaddi,SI_ftype_DISI,2)
+//
+def int_hexagon_S4_clbpaddi :
+Hexagon_si_disi_Intrinsic<"HEXAGON_S4_clbpaddi">;
//
// BUILTIN_INFO(HEXAGON.S2_clb,SI_ftype_SI,1)
//
def int_hexagon_S2_clb :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.clb">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_clb">;
//
// BUILTIN_INFO(HEXAGON.S2_cl0,SI_ftype_SI,1)
//
def int_hexagon_S2_cl0 :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.cl0">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_cl0">;
//
// BUILTIN_INFO(HEXAGON.S2_cl1,SI_ftype_SI,1)
//
def int_hexagon_S2_cl1 :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.cl1">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_cl1">;
//
// BUILTIN_INFO(HEXAGON.S2_clbp,SI_ftype_DI,1)
//
def int_hexagon_S2_clbp :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.clbp">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_clbp">;
//
// BUILTIN_INFO(HEXAGON.S2_cl0p,SI_ftype_DI,1)
//
def int_hexagon_S2_cl0p :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.cl0p">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_cl0p">;
//
// BUILTIN_INFO(HEXAGON.S2_cl1p,SI_ftype_DI,1)
//
def int_hexagon_S2_cl1p :
-Hexagon_si_di_Intrinsic<"HEXAGON.S2.cl1p">;
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_cl1p">;
//
// BUILTIN_INFO(HEXAGON.S2_brev,SI_ftype_SI,1)
//
def int_hexagon_S2_brev :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.brev">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_brev">;
+//
+// BUILTIN_INFO(HEXAGON.S2_brevp,DI_ftype_DI,1)
+//
+def int_hexagon_S2_brevp :
+Hexagon_di_di_Intrinsic<"HEXAGON_S2_brevp">;
//
// BUILTIN_INFO(HEXAGON.S2_ct0,SI_ftype_SI,1)
//
def int_hexagon_S2_ct0 :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.ct0">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_ct0">;
//
// BUILTIN_INFO(HEXAGON.S2_ct1,SI_ftype_SI,1)
//
def int_hexagon_S2_ct1 :
-Hexagon_si_si_Intrinsic<"HEXAGON.S2.ct1">;
-//
-// BUILTIN_INFO(HEXAGON.S2_interleave,DI_ftype_DI,1)
-//
-def int_hexagon_S2_interleave :
-Hexagon_di_di_Intrinsic<"HEXAGON.S2.interleave">;
-//
-// BUILTIN_INFO(HEXAGON.S2_deinterleave,DI_ftype_DI,1)
-//
-def int_hexagon_S2_deinterleave :
-Hexagon_di_di_Intrinsic<"HEXAGON.S2.deinterleave">;
-
-//
-// BUILTIN_INFO(SI_to_SXTHI_asrh,SI_ftype_SI,1)
-//
-def int_hexagon_SI_to_SXTHI_asrh :
-Hexagon_si_si_Intrinsic<"SI.to.SXTHI.asrh">;
-
-//
-// BUILTIN_INFO(HEXAGON.A4_orn,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_orn :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.orn">;
-//
-// BUILTIN_INFO(HEXAGON.A4_andn,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_andn :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.andn">;
-//
-// BUILTIN_INFO(HEXAGON.A4_orn,DI_ftype_DIDI,2)
-//
-def int_hexagon_A4_ornp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A4.ornp">;
-//
-// BUILTIN_INFO(HEXAGON.A4_andn,DI_ftype_DIDI,2)
-//
-def int_hexagon_A4_andnp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.A4.andnp">;
-//
-// BUILTIN_INFO(HEXAGON.A4_combineir,DI_ftype_sisi,2)
-//
-def int_hexagon_A4_combineir :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.A4.combineir">;
-//
-// BUILTIN_INFO(HEXAGON.A4_combineir,DI_ftype_sisi,2)
-//
-def int_hexagon_A4_combineri :
-Hexagon_di_sisi_Intrinsic<"HEXAGON.A4.combineri">;
-//
-// BUILTIN_INFO(HEXAGON.C4_cmpneq,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_cmpneq :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.C4.cmpneq">;
-//
-// BUILTIN_INFO(HEXAGON.C4_cmpneqi,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_cmpneqi :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.C4.cmpneqi">;
-//
-// BUILTIN_INFO(HEXAGON.C4_cmplte,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_cmplte :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.C4.cmplte">;
-//
-// BUILTIN_INFO(HEXAGON.C4_cmpltei,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_cmpltei :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.C4.cmpltei">;
-//
-// BUILTIN_INFO(HEXAGON.C4_cmplteu,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_cmplteu :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.C4.cmplteu">;
-//
-// BUILTIN_INFO(HEXAGON.C4_cmplteui,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_cmplteui :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON.C4.cmplteui">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpneq,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpneq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.rcmpneq">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpneqi,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpneqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.rcmpneqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpeq,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpeq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.rcmpeq">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpeqi,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpeqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.rcmpeqi">;
-//
-// BUILTIN_INFO(HEXAGON.C4_fastcorner9,QI_ftype_QIQI,2)
-//
-def int_hexagon_C4_fastcorner9 :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON.C4.fastcorner9">;
-//
-// BUILTIN_INFO(HEXAGON.C4_fastcorner9_not,QI_ftype_QIQI,2)
-//
-def int_hexagon_C4_fastcorner9_not :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON.C4.fastcorner9_not">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_andn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_andn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.and_andn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_and,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_and :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.and_and">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_orn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_orn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.and_orn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_or,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_or :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.and_or">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_andn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_andn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.or_andn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_and,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_and :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.or_and">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_orn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_orn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.or_orn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_or,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_or :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON.C4.or_or">;
-//
-// BUILTIN_INFO(HEXAGON.S4_addaddi,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_addaddi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S4.addaddi">;
-//
-// BUILTIN_INFO(HEXAGON.S4_subaddi,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_subaddi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S4.subaddi">;
-//
-// BUILTIN_INFO(HEXAGON.S4_andnp,DI_ftype_DIDI,2)
-//
-def int_hexagon_S4_andnp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S4.andnp">;
+Hexagon_si_si_Intrinsic<"HEXAGON_S2_ct1">;
//
-// BUILTIN_INFO(HEXAGON.S4_ornp,DI_ftype_DIDI,2)
+// BUILTIN_INFO(HEXAGON.S2_ct0p,SI_ftype_DI,1)
//
-def int_hexagon_S4_ornp :
-Hexagon_di_didi_Intrinsic<"HEXAGON.S4.ornp">;
+def int_hexagon_S2_ct0p :
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_ct0p">;
//
-// BUILTIN_INFO(HEXAGON.M4_xor_xacc,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M4_xor_xacc :
-Hexagon_di_dididi_Intrinsic<"HEXAGON.M4.xor_xacc">;
-//
-// BUILTIN_INFO(HEXAGON.M4_and_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_and_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.and_and">;
+// BUILTIN_INFO(HEXAGON.S2_ct1p,SI_ftype_DI,1)
//
-// BUILTIN_INFO(HEXAGON.M4_and_andn,SI_ftype_SISISI,3)
+def int_hexagon_S2_ct1p :
+Hexagon_si_di_Intrinsic<"HEXAGON_S2_ct1p">;
//
-def int_hexagon_M4_and_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.and_andn">;
-//
-// BUILTIN_INFO(HEXAGON.M4_and_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_and_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.and_or">;
-//
-// BUILTIN_INFO(HEXAGON.M4_and_xor,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_and_xor :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.and_xor">;
-//
-// BUILTIN_INFO(HEXAGON.M4_xor_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_xor_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.xor_or">;
-//
-// BUILTIN_INFO(HEXAGON.M4_xor_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_xor_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.xor_and">;
-//
-// BUILTIN_INFO(HEXAGON.M4_xor_andn,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_xor_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.xor_andn">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.or_and">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.or_or">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_xor,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_xor :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.or_xor">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_andn,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.M4.or_andn">;
-//
-// BUILTIN_INFO(HEXAGON.S4_or_andix,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_or_andix :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S4.or_andix">;
-//
-// BUILTIN_INFO(HEXAGON.S4_or_andi,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_or_andi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S4.or_andi">;
-//
-// BUILTIN_INFO(HEXAGON.S4_or_ori,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_or_ori :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON.S4.or_ori">;
-//
-// BUILTIN_INFO(HEXAGON.A4_modwrapu,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_modwrapu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.modwrapu">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cround_ri,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_cround_ri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.cround_ri">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cround_rr,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_cround_rr :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.cround_rr">;
-//
-// BUILTIN_INFO(HEXAGON.A4_round_ri,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_round_ri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.round_ri">;
-//
-// BUILTIN_INFO(HEXAGON.A4_round_rr,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_round_rr :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.round_rr">;
-//
-// BUILTIN_INFO(HEXAGON.A4_round_ri_sat,SI_ftype_SISI,2)
+// BUILTIN_INFO(HEXAGON.S2_interleave,DI_ftype_DI,1)
//
-def int_hexagon_A4_round_ri_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.round_ri_sat">;
+def int_hexagon_S2_interleave :
+Hexagon_di_di_Intrinsic<"HEXAGON_S2_interleave">;
//
-// BUILTIN_INFO(HEXAGON.A4_round_rr_sat,SI_ftype_SISI,2)
+// BUILTIN_INFO(HEXAGON.S2_deinterleave,DI_ftype_DI,1)
//
-def int_hexagon_A4_round_rr_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON.A4.round_rr_sat">;
+def int_hexagon_S2_deinterleave :
+Hexagon_di_di_Intrinsic<"HEXAGON_S2_deinterleave">;
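+//
+// The Hexagon_*_Intrinsic wrapper classes used throughout this file are
+// declared near its top, outside this hunk. A minimal sketch of their
+// presumed shape, for the one-output/two-input SI case (assuming the same
+// GCCBuiltin-based pattern as the other target .td files; the exact
+// definitions live in the vendor source):
+//
+//   class Hexagon_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
+//                           list<LLVMType> param_types,
+//                           list<IntrinsicProperty> properties>
+//     : GCCBuiltin<!strconcat("__builtin_", GCCIntSuffix)>,
+//       Intrinsic<ret_types, param_types, properties>;
+//
+//   class Hexagon_si_sisi_Intrinsic<string GCCIntSuffix>
+//     : Hexagon_Intrinsic<GCCIntSuffix, [llvm_i32_ty],
+//                         [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+//
+// Under that reading, the rename from "HEXAGON.S2.asr.r.r" to
+// "HEXAGON_S2_asr_r_r" changes the generated builtin name to
+// __builtin_HEXAGON_S2_asr_r_r, a valid C identifier.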
diff --git a/include/llvm/IntrinsicsMips.td b/include/llvm/IntrinsicsMips.td
new file mode 100644
index 0000000..4375ac2
--- /dev/null
+++ b/include/llvm/IntrinsicsMips.td
@@ -0,0 +1,264 @@
+//===- IntrinsicsMips.td - Defines Mips intrinsics ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the MIPS-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP data types
+def mips_v2q15_ty: LLVMType<v2i16>;
+def mips_q31_ty: LLVMType<i32>;
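+// v2q15 models two Q15 fixed-point fractional values packed into a v2i16,
+// and q31 models one Q31 fractional value carried in an i32; the Q-format
+// interpretation lives in the DSP instructions themselves, so at the IR
+// level these are ordinary integer types.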
+
+let TargetPrefix = "mips" in { // All intrinsics start with "llvm.mips.".
+
+//===----------------------------------------------------------------------===//
+// Addition/subtraction
+
+def int_mips_addu_qb : GCCBuiltin<"__builtin_mips_addu_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_addu_s_qb : GCCBuiltin<"__builtin_mips_addu_s_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_subu_qb : GCCBuiltin<"__builtin_mips_subu_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], []>;
+def int_mips_subu_s_qb : GCCBuiltin<"__builtin_mips_subu_s_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], []>;
+
+def int_mips_addq_ph : GCCBuiltin<"__builtin_mips_addq_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_addq_s_ph : GCCBuiltin<"__builtin_mips_addq_s_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_subq_ph : GCCBuiltin<"__builtin_mips_subq_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_subq_s_ph : GCCBuiltin<"__builtin_mips_subq_s_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], []>;
+
+def int_mips_madd: GCCBuiltin<"__builtin_mips_madd">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_maddu: GCCBuiltin<"__builtin_mips_maddu">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+def int_mips_msub: GCCBuiltin<"__builtin_mips_msub">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+def int_mips_msubu: GCCBuiltin<"__builtin_mips_msubu">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_mips_addq_s_w: GCCBuiltin<"__builtin_mips_addq_s_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>;
+def int_mips_subq_s_w: GCCBuiltin<"__builtin_mips_subq_s_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], []>;
+
+def int_mips_addsc: GCCBuiltin<"__builtin_mips_addsc">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [Commutative]>;
+def int_mips_addwc: GCCBuiltin<"__builtin_mips_addwc">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [Commutative]>;
+
+def int_mips_modsub: GCCBuiltin<"__builtin_mips_modsub">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_raddu_w_qb: GCCBuiltin<"__builtin_mips_raddu_w_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty], [IntrNoMem]>;
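+// Note the property lists above: intrinsics whose DSP ASE instructions can
+// set the DSPControl overflow flag are presumably left without IntrNoMem so
+// that the side effect is preserved, while the flag-free accumulator and
+// reduction ops (madd, maddu, msub, msubu, modsub, raddu.w.qb) are marked
+// IntrNoMem and may be freely reordered or eliminated.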
+
+//===----------------------------------------------------------------------===//
+// Absolute value
+
+def int_mips_absq_s_ph: GCCBuiltin<"__builtin_mips_absq_s_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty], []>;
+def int_mips_absq_s_w: GCCBuiltin<"__builtin_mips_absq_s_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty], []>;
+
+//===----------------------------------------------------------------------===//
+// Precision reduce/expand
+
+def int_mips_precrq_qb_ph: GCCBuiltin<"__builtin_mips_precrq_qb_ph">,
+ Intrinsic<[llvm_v4i8_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>;
+def int_mips_precrqu_s_qb_ph: GCCBuiltin<"__builtin_mips_precrqu_s_qb_ph">,
+ Intrinsic<[llvm_v4i8_ty], [mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_precrq_ph_w: GCCBuiltin<"__builtin_mips_precrq_ph_w">,
+ Intrinsic<[mips_v2q15_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>;
+def int_mips_precrq_rs_ph_w: GCCBuiltin<"__builtin_mips_precrq_rs_ph_w">,
+ Intrinsic<[mips_v2q15_ty], [mips_q31_ty, mips_q31_ty], []>;
+def int_mips_preceq_w_phl: GCCBuiltin<"__builtin_mips_preceq_w_phl">,
+ Intrinsic<[mips_q31_ty], [mips_v2q15_ty], [IntrNoMem]>;
+def int_mips_preceq_w_phr: GCCBuiltin<"__builtin_mips_preceq_w_phr">,
+ Intrinsic<[mips_q31_ty], [mips_v2q15_ty], [IntrNoMem]>;
+def int_mips_precequ_ph_qbl: GCCBuiltin<"__builtin_mips_precequ_ph_qbl">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_precequ_ph_qbr: GCCBuiltin<"__builtin_mips_precequ_ph_qbr">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_precequ_ph_qbla: GCCBuiltin<"__builtin_mips_precequ_ph_qbla">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_precequ_ph_qbra: GCCBuiltin<"__builtin_mips_precequ_ph_qbra">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_preceu_ph_qbl: GCCBuiltin<"__builtin_mips_preceu_ph_qbl">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_preceu_ph_qbr: GCCBuiltin<"__builtin_mips_preceu_ph_qbr">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_preceu_ph_qbla: GCCBuiltin<"__builtin_mips_preceu_ph_qbla">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_preceu_ph_qbra: GCCBuiltin<"__builtin_mips_preceu_ph_qbra">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty], [IntrNoMem]>;
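+// The qbl/qbr/qbla/qbra suffixes above presumably select which bytes of the
+// v4i8 input are widened: the left (most-significant) pair, the right pair,
+// or the left/right alternating pairs, following the DSP ASE PRECEQU/PRECEU
+// instruction naming.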
+
+//===----------------------------------------------------------------------===//
+// Shift
+
+def int_mips_shll_qb: GCCBuiltin<"__builtin_mips_shll_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], []>;
+def int_mips_shrl_qb: GCCBuiltin<"__builtin_mips_shrl_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shll_ph: GCCBuiltin<"__builtin_mips_shll_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], []>;
+def int_mips_shll_s_ph: GCCBuiltin<"__builtin_mips_shll_s_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], []>;
+def int_mips_shra_ph: GCCBuiltin<"__builtin_mips_shra_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shra_r_ph: GCCBuiltin<"__builtin_mips_shra_r_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shll_s_w: GCCBuiltin<"__builtin_mips_shll_s_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, llvm_i32_ty], []>;
+def int_mips_shra_r_w: GCCBuiltin<"__builtin_mips_shra_r_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shilo: GCCBuiltin<"__builtin_mips_shilo">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+
+def int_mips_muleu_s_ph_qbl: GCCBuiltin<"__builtin_mips_muleu_s_ph_qbl">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty, mips_v2q15_ty], []>;
+def int_mips_muleu_s_ph_qbr: GCCBuiltin<"__builtin_mips_muleu_s_ph_qbr">,
+ Intrinsic<[mips_v2q15_ty], [llvm_v4i8_ty, mips_v2q15_ty], []>;
+def int_mips_mulq_rs_ph: GCCBuiltin<"__builtin_mips_mulq_rs_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_muleq_s_w_phl: GCCBuiltin<"__builtin_mips_muleq_s_w_phl">,
+ Intrinsic<[mips_q31_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_muleq_s_w_phr: GCCBuiltin<"__builtin_mips_muleq_s_w_phr">,
+ Intrinsic<[mips_q31_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_mulsaq_s_w_ph: GCCBuiltin<"__builtin_mips_mulsaq_s_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_maq_s_w_phl: GCCBuiltin<"__builtin_mips_maq_s_w_phl">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_maq_s_w_phr: GCCBuiltin<"__builtin_mips_maq_s_w_phr">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_maq_sa_w_phl: GCCBuiltin<"__builtin_mips_maq_sa_w_phl">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_maq_sa_w_phr: GCCBuiltin<"__builtin_mips_maq_sa_w_phr">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_mult: GCCBuiltin<"__builtin_mips_mult">,
+ Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_multu: GCCBuiltin<"__builtin_mips_multu">,
+ Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+//===----------------------------------------------------------------------===//
+// Dot product with accumulate/subtract
+
+def int_mips_dpau_h_qbl: GCCBuiltin<"__builtin_mips_dpau_h_qbl">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem]>;
+def int_mips_dpau_h_qbr: GCCBuiltin<"__builtin_mips_dpau_h_qbr">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem]>;
+def int_mips_dpsu_h_qbl: GCCBuiltin<"__builtin_mips_dpsu_h_qbl">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem]>;
+def int_mips_dpsu_h_qbr: GCCBuiltin<"__builtin_mips_dpsu_h_qbr">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem]>;
+def int_mips_dpaq_s_w_ph: GCCBuiltin<"__builtin_mips_dpaq_s_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpsq_s_w_ph: GCCBuiltin<"__builtin_mips_dpsq_s_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpaq_sa_l_w: GCCBuiltin<"__builtin_mips_dpaq_sa_l_w">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_q31_ty, mips_q31_ty], []>;
+def int_mips_dpsq_sa_l_w: GCCBuiltin<"__builtin_mips_dpsq_sa_l_w">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_q31_ty, mips_q31_ty], []>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+
+def int_mips_cmpu_eq_qb: GCCBuiltin<"__builtin_mips_cmpu_eq_qb">,
+ Intrinsic<[], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpu_lt_qb: GCCBuiltin<"__builtin_mips_cmpu_lt_qb">,
+ Intrinsic<[], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpu_le_qb: GCCBuiltin<"__builtin_mips_cmpu_le_qb">,
+ Intrinsic<[], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpgu_eq_qb: GCCBuiltin<"__builtin_mips_cmpgu_eq_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpgu_lt_qb: GCCBuiltin<"__builtin_mips_cmpgu_lt_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpgu_le_qb: GCCBuiltin<"__builtin_mips_cmpgu_le_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmp_eq_ph: GCCBuiltin<"__builtin_mips_cmp_eq_ph">,
+ Intrinsic<[], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_cmp_lt_ph: GCCBuiltin<"__builtin_mips_cmp_lt_ph">,
+ Intrinsic<[], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_cmp_le_ph: GCCBuiltin<"__builtin_mips_cmp_le_ph">,
+ Intrinsic<[], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
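+// Illustrative only: the void cmp* builtins set condition bits in DSPControl,
+// while the cmpgu* builtins return a result mask (names per the GCCBuiltin
+// mappings above; operands are assumptions):
+//   __builtin_mips_cmp_eq_ph(p, q);            // updates DSPControl cc bits
+//   int m = __builtin_mips_cmpgu_eq_qb(a, b);  // per-byte comparison mask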
+
+//===----------------------------------------------------------------------===//
+// Extracting
+
+def int_mips_extr_s_h: GCCBuiltin<"__builtin_mips_extr_s_h">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+def int_mips_extr_w: GCCBuiltin<"__builtin_mips_extr_w">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+def int_mips_extr_rs_w: GCCBuiltin<"__builtin_mips_extr_rs_w">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+def int_mips_extr_r_w: GCCBuiltin<"__builtin_mips_extr_r_w">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+def int_mips_extp: GCCBuiltin<"__builtin_mips_extp">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+def int_mips_extpdp: GCCBuiltin<"__builtin_mips_extpdp">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty], []>;
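+// Illustrative only: extraction takes the 64-bit accumulator plus a shift
+// amount and yields an i32 (a sketch; acc and the shift of 4 are assumptions):
+//   int r = __builtin_mips_extr_w(acc, 4);     // extract word, shifted right
+//   int s = __builtin_mips_extr_rs_w(acc, 4);  // rounded, saturated variant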
+
+//===----------------------------------------------------------------------===//
+// Misc
+
+def int_mips_wrdsp: GCCBuiltin<"__builtin_mips_wrdsp">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>;
+def int_mips_rddsp: GCCBuiltin<"__builtin_mips_rddsp">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrReadMem]>;
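+// Illustrative only: wrdsp writes the DSPControl fields selected by a mask
+// and rddsp reads them back (the mask value here is an assumption):
+//   __builtin_mips_wrdsp(v, 0x3f);
+//   unsigned dc = __builtin_mips_rddsp(0x3f);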
+
+def int_mips_insv: GCCBuiltin<"__builtin_mips_insv">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
+def int_mips_bitrev: GCCBuiltin<"__builtin_mips_bitrev">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_packrl_ph: GCCBuiltin<"__builtin_mips_packrl_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>;
+
+def int_mips_repl_qb: GCCBuiltin<"__builtin_mips_repl_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_mips_repl_ph: GCCBuiltin<"__builtin_mips_repl_ph">,
+ Intrinsic<[mips_v2q15_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_pick_qb: GCCBuiltin<"__builtin_mips_pick_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrReadMem]>;
+def int_mips_pick_ph: GCCBuiltin<"__builtin_mips_pick_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrReadMem]>;
+
+def int_mips_mthlip: GCCBuiltin<"__builtin_mips_mthlip">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+
+def int_mips_bposge32: GCCBuiltin<"__builtin_mips_bposge32">,
+ Intrinsic<[llvm_i32_ty], [], [IntrReadMem]>;
+
+def int_mips_lbux: GCCBuiltin<"__builtin_mips_lbux">,
+ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadArgMem]>;
+def int_mips_lhx: GCCBuiltin<"__builtin_mips_lhx">,
+ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadArgMem]>;
+def int_mips_lwx: GCCBuiltin<"__builtin_mips_lwx">,
+ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadArgMem]>;
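+// Illustrative only: the indexed loads read from base plus offset, hence the
+// IntrReadArgMem attribute (a sketch; p and i are assumptions):
+//   int b = __builtin_mips_lbux(p, i);  // load unsigned byte at p + i
+//   int w = __builtin_mips_lwx(p, i);   // load word at byte offset i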
+}
diff --git a/include/llvm/IntrinsicsNVVM.td b/include/llvm/IntrinsicsNVVM.td
new file mode 100644
index 0000000..1853c99
--- /dev/null
+++ b/include/llvm/IntrinsicsNVVM.td
@@ -0,0 +1,952 @@
+//===- IntrinsicsNVVM.td - Defines NVVM intrinsics ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the NVVM-specific intrinsics for use with NVPTX.
+//
+//===----------------------------------------------------------------------===//
+
+def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
+
+//
+// MISC
+//
+
+ def int_nvvm_clz_i : GCCBuiltin<"__nvvm_clz_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_clz_ll : GCCBuiltin<"__nvvm_clz_ll">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
+ def int_nvvm_popc_i : GCCBuiltin<"__nvvm_popc_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_popc_ll : GCCBuiltin<"__nvvm_popc_ll">,
+ Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
+ def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+//
+// Min Max
+//
+
+ def int_nvvm_min_i : GCCBuiltin<"__nvvm_min_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_min_ui : GCCBuiltin<"__nvvm_min_ui">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_min_ll : GCCBuiltin<"__nvvm_min_ll">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_min_ull : GCCBuiltin<"__nvvm_min_ull">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_max_i : GCCBuiltin<"__nvvm_max_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_max_ui : GCCBuiltin<"__nvvm_max_ui">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_max_ll : GCCBuiltin<"__nvvm_max_ll">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_max_ull : GCCBuiltin<"__nvvm_max_ull">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_fmin_f : GCCBuiltin<"__nvvm_fmin_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fmin_ftz_f : GCCBuiltin<"__nvvm_fmin_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_fmax_f : GCCBuiltin<"__nvvm_fmax_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fmax_ftz_f : GCCBuiltin<"__nvvm_fmax_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_fmin_d : GCCBuiltin<"__nvvm_fmin_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fmax_d : GCCBuiltin<"__nvvm_fmax_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+
+//
+// Multiplication
+//
+
+ def int_nvvm_mulhi_i : GCCBuiltin<"__nvvm_mulhi_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mulhi_ui : GCCBuiltin<"__nvvm_mulhi_ui">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_mulhi_ll : GCCBuiltin<"__nvvm_mulhi_ll">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mulhi_ull : GCCBuiltin<"__nvvm_mulhi_ull">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_mul_rn_ftz_f : GCCBuiltin<"__nvvm_mul_rn_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rn_f : GCCBuiltin<"__nvvm_mul_rn_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rz_ftz_f : GCCBuiltin<"__nvvm_mul_rz_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rz_f : GCCBuiltin<"__nvvm_mul_rz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rm_ftz_f : GCCBuiltin<"__nvvm_mul_rm_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rm_f : GCCBuiltin<"__nvvm_mul_rm_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rp_ftz_f : GCCBuiltin<"__nvvm_mul_rp_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rp_f : GCCBuiltin<"__nvvm_mul_rp_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_mul_rn_d : GCCBuiltin<"__nvvm_mul_rn_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rz_d : GCCBuiltin<"__nvvm_mul_rz_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rm_d : GCCBuiltin<"__nvvm_mul_rm_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul_rp_d : GCCBuiltin<"__nvvm_mul_rp_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_mul24_i : GCCBuiltin<"__nvvm_mul24_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_mul24_ui : GCCBuiltin<"__nvvm_mul24_ui">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+//
+// Div
+//
+
+ def int_nvvm_div_approx_ftz_f : GCCBuiltin<"__nvvm_div_approx_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_approx_f : GCCBuiltin<"__nvvm_div_approx_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_div_rn_ftz_f : GCCBuiltin<"__nvvm_div_rn_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_rn_f : GCCBuiltin<"__nvvm_div_rn_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_div_rz_ftz_f : GCCBuiltin<"__nvvm_div_rz_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_rz_f : GCCBuiltin<"__nvvm_div_rz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_div_rm_ftz_f : GCCBuiltin<"__nvvm_div_rm_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_rm_f : GCCBuiltin<"__nvvm_div_rm_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_div_rp_ftz_f : GCCBuiltin<"__nvvm_div_rp_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_rp_f : GCCBuiltin<"__nvvm_div_rp_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_div_rn_d : GCCBuiltin<"__nvvm_div_rn_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_rz_d : GCCBuiltin<"__nvvm_div_rz_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_rm_d : GCCBuiltin<"__nvvm_div_rm_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_div_rp_d : GCCBuiltin<"__nvvm_div_rp_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+
+//
+// Brev
+//
+
+ def int_nvvm_brev32 : GCCBuiltin<"__nvvm_brev32">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_brev64 : GCCBuiltin<"__nvvm_brev64">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>;
+
+//
+// Sad
+//
+
+ def int_nvvm_sad_i : GCCBuiltin<"__nvvm_sad_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_sad_ui : GCCBuiltin<"__nvvm_sad_ui">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+//
+// Floor Ceil
+//
+
+ def int_nvvm_floor_ftz_f : GCCBuiltin<"__nvvm_floor_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_floor_f : GCCBuiltin<"__nvvm_floor_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_floor_d : GCCBuiltin<"__nvvm_floor_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_ceil_ftz_f : GCCBuiltin<"__nvvm_ceil_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_ceil_f : GCCBuiltin<"__nvvm_ceil_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_ceil_d : GCCBuiltin<"__nvvm_ceil_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Abs
+//
+
+ def int_nvvm_abs_i : GCCBuiltin<"__nvvm_abs_i">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_abs_ll : GCCBuiltin<"__nvvm_abs_ll">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>;
+
+ def int_nvvm_fabs_ftz_f : GCCBuiltin<"__nvvm_fabs_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_fabs_f : GCCBuiltin<"__nvvm_fabs_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Round
+//
+
+ def int_nvvm_round_ftz_f : GCCBuiltin<"__nvvm_round_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_round_f : GCCBuiltin<"__nvvm_round_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_round_d : GCCBuiltin<"__nvvm_round_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Trunc
+//
+
+ def int_nvvm_trunc_ftz_f : GCCBuiltin<"__nvvm_trunc_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_trunc_f : GCCBuiltin<"__nvvm_trunc_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_trunc_d : GCCBuiltin<"__nvvm_trunc_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Saturate
+//
+
+ def int_nvvm_saturate_ftz_f : GCCBuiltin<"__nvvm_saturate_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_saturate_f : GCCBuiltin<"__nvvm_saturate_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_saturate_d : GCCBuiltin<"__nvvm_saturate_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Exp2 Log2
+//
+
+ def int_nvvm_ex2_approx_ftz_f : GCCBuiltin<"__nvvm_ex2_approx_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_ex2_approx_f : GCCBuiltin<"__nvvm_ex2_approx_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_ex2_approx_d : GCCBuiltin<"__nvvm_ex2_approx_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_lg2_approx_ftz_f : GCCBuiltin<"__nvvm_lg2_approx_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_lg2_approx_f : GCCBuiltin<"__nvvm_lg2_approx_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_lg2_approx_d : GCCBuiltin<"__nvvm_lg2_approx_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Sin Cos
+//
+
+ def int_nvvm_sin_approx_ftz_f : GCCBuiltin<"__nvvm_sin_approx_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sin_approx_f : GCCBuiltin<"__nvvm_sin_approx_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_cos_approx_ftz_f : GCCBuiltin<"__nvvm_cos_approx_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+//
+// Fma
+//
+
+ def int_nvvm_fma_rn_ftz_f : GCCBuiltin<"__nvvm_fma_rn_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rn_f : GCCBuiltin<"__nvvm_fma_rn_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rz_ftz_f : GCCBuiltin<"__nvvm_fma_rz_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rz_f : GCCBuiltin<"__nvvm_fma_rz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rm_ftz_f : GCCBuiltin<"__nvvm_fma_rm_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rm_f : GCCBuiltin<"__nvvm_fma_rm_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rp_ftz_f : GCCBuiltin<"__nvvm_fma_rp_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rp_f : GCCBuiltin<"__nvvm_fma_rp_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_fma_rn_d : GCCBuiltin<"__nvvm_fma_rn_d">,
+ Intrinsic<[llvm_double_ty],
+ [llvm_double_ty, llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rz_d : GCCBuiltin<"__nvvm_fma_rz_d">,
+ Intrinsic<[llvm_double_ty],
+ [llvm_double_ty, llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rm_d : GCCBuiltin<"__nvvm_fma_rm_d">,
+ Intrinsic<[llvm_double_ty],
+ [llvm_double_ty, llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_fma_rp_d : GCCBuiltin<"__nvvm_fma_rp_d">,
+ Intrinsic<[llvm_double_ty],
+ [llvm_double_ty, llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+
+//
+// Rcp
+//
+
+ def int_nvvm_rcp_rn_ftz_f : GCCBuiltin<"__nvvm_rcp_rn_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rn_f : GCCBuiltin<"__nvvm_rcp_rn_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rz_ftz_f : GCCBuiltin<"__nvvm_rcp_rz_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rz_f : GCCBuiltin<"__nvvm_rcp_rz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rm_ftz_f : GCCBuiltin<"__nvvm_rcp_rm_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rm_f : GCCBuiltin<"__nvvm_rcp_rm_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rp_ftz_f : GCCBuiltin<"__nvvm_rcp_rp_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rp_f : GCCBuiltin<"__nvvm_rcp_rp_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_rcp_rn_d : GCCBuiltin<"__nvvm_rcp_rn_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rz_d : GCCBuiltin<"__nvvm_rcp_rz_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rm_d : GCCBuiltin<"__nvvm_rcp_rm_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_rcp_rp_d : GCCBuiltin<"__nvvm_rcp_rp_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_rcp_approx_ftz_d : GCCBuiltin<"__nvvm_rcp_approx_ftz_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Sqrt
+//
+
+ def int_nvvm_sqrt_rn_ftz_f : GCCBuiltin<"__nvvm_sqrt_rn_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rn_f : GCCBuiltin<"__nvvm_sqrt_rn_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rz_ftz_f : GCCBuiltin<"__nvvm_sqrt_rz_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rz_f : GCCBuiltin<"__nvvm_sqrt_rz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rm_ftz_f : GCCBuiltin<"__nvvm_sqrt_rm_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rm_f : GCCBuiltin<"__nvvm_sqrt_rm_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rp_ftz_f : GCCBuiltin<"__nvvm_sqrt_rp_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rp_f : GCCBuiltin<"__nvvm_sqrt_rp_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_approx_ftz_f : GCCBuiltin<"__nvvm_sqrt_approx_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_approx_f : GCCBuiltin<"__nvvm_sqrt_approx_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_sqrt_rn_d : GCCBuiltin<"__nvvm_sqrt_rn_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rz_d : GCCBuiltin<"__nvvm_sqrt_rz_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rm_d : GCCBuiltin<"__nvvm_sqrt_rm_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_sqrt_rp_d : GCCBuiltin<"__nvvm_sqrt_rp_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Rsqrt
+//
+
+ def int_nvvm_rsqrt_approx_ftz_f : GCCBuiltin<"__nvvm_rsqrt_approx_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rsqrt_approx_f : GCCBuiltin<"__nvvm_rsqrt_approx_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_rsqrt_approx_d : GCCBuiltin<"__nvvm_rsqrt_approx_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+
+//
+// Add
+//
+
+ def int_nvvm_add_rn_ftz_f : GCCBuiltin<"__nvvm_add_rn_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rn_f : GCCBuiltin<"__nvvm_add_rn_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rz_ftz_f : GCCBuiltin<"__nvvm_add_rz_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rz_f : GCCBuiltin<"__nvvm_add_rz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rm_ftz_f : GCCBuiltin<"__nvvm_add_rm_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rm_f : GCCBuiltin<"__nvvm_add_rm_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rp_ftz_f : GCCBuiltin<"__nvvm_add_rp_ftz_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rp_f : GCCBuiltin<"__nvvm_add_rp_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_add_rn_d : GCCBuiltin<"__nvvm_add_rn_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rz_d : GCCBuiltin<"__nvvm_add_rz_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rm_d : GCCBuiltin<"__nvvm_add_rm_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+ def int_nvvm_add_rp_d : GCCBuiltin<"__nvvm_add_rp_d">,
+ Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, Commutative]>;
+
+//
+// Convert
+//
+
+ def int_nvvm_d2f_rn_ftz : GCCBuiltin<"__nvvm_d2f_rn_ftz">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2f_rn : GCCBuiltin<"__nvvm_d2f_rn">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2f_rz_ftz : GCCBuiltin<"__nvvm_d2f_rz_ftz">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2f_rz : GCCBuiltin<"__nvvm_d2f_rz">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2f_rm_ftz : GCCBuiltin<"__nvvm_d2f_rm_ftz">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2f_rm : GCCBuiltin<"__nvvm_d2f_rm">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2f_rp_ftz : GCCBuiltin<"__nvvm_d2f_rp_ftz">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2f_rp : GCCBuiltin<"__nvvm_d2f_rp">,
+ Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_d2i_rn : GCCBuiltin<"__nvvm_d2i_rn">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2i_rz : GCCBuiltin<"__nvvm_d2i_rz">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2i_rm : GCCBuiltin<"__nvvm_d2i_rm">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2i_rp : GCCBuiltin<"__nvvm_d2i_rp">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_d2ui_rn : GCCBuiltin<"__nvvm_d2ui_rn">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ui_rz : GCCBuiltin<"__nvvm_d2ui_rz">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ui_rm : GCCBuiltin<"__nvvm_d2ui_rm">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ui_rp : GCCBuiltin<"__nvvm_d2ui_rp">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_i2d_rn : GCCBuiltin<"__nvvm_i2d_rn">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_i2d_rz : GCCBuiltin<"__nvvm_i2d_rz">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_i2d_rm : GCCBuiltin<"__nvvm_i2d_rm">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_i2d_rp : GCCBuiltin<"__nvvm_i2d_rp">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ def int_nvvm_ui2d_rn : GCCBuiltin<"__nvvm_ui2d_rn">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_ui2d_rz : GCCBuiltin<"__nvvm_ui2d_rz">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_ui2d_rm : GCCBuiltin<"__nvvm_ui2d_rm">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_ui2d_rp : GCCBuiltin<"__nvvm_ui2d_rp">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ def int_nvvm_f2i_rn_ftz : GCCBuiltin<"__nvvm_f2i_rn_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2i_rn : GCCBuiltin<"__nvvm_f2i_rn">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2i_rz_ftz : GCCBuiltin<"__nvvm_f2i_rz_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2i_rz : GCCBuiltin<"__nvvm_f2i_rz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2i_rm_ftz : GCCBuiltin<"__nvvm_f2i_rm_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2i_rm : GCCBuiltin<"__nvvm_f2i_rm">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2i_rp_ftz : GCCBuiltin<"__nvvm_f2i_rp_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2i_rp : GCCBuiltin<"__nvvm_f2i_rp">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_f2ui_rn_ftz : GCCBuiltin<"__nvvm_f2ui_rn_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ui_rn : GCCBuiltin<"__nvvm_f2ui_rn">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ui_rz_ftz : GCCBuiltin<"__nvvm_f2ui_rz_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ui_rz : GCCBuiltin<"__nvvm_f2ui_rz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ui_rm_ftz : GCCBuiltin<"__nvvm_f2ui_rm_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ui_rm : GCCBuiltin<"__nvvm_f2ui_rm">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ui_rp_ftz : GCCBuiltin<"__nvvm_f2ui_rp_ftz">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ui_rp : GCCBuiltin<"__nvvm_f2ui_rp">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_i2f_rn : GCCBuiltin<"__nvvm_i2f_rn">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_i2f_rz : GCCBuiltin<"__nvvm_i2f_rz">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_i2f_rm : GCCBuiltin<"__nvvm_i2f_rm">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_i2f_rp : GCCBuiltin<"__nvvm_i2f_rp">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ def int_nvvm_ui2f_rn : GCCBuiltin<"__nvvm_ui2f_rn">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_ui2f_rz : GCCBuiltin<"__nvvm_ui2f_rz">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_ui2f_rm : GCCBuiltin<"__nvvm_ui2f_rm">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_nvvm_ui2f_rp : GCCBuiltin<"__nvvm_ui2f_rp">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ def int_nvvm_lohi_i2d : GCCBuiltin<"__nvvm_lohi_i2d">,
+ Intrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+
+ def int_nvvm_d2i_lo : GCCBuiltin<"__nvvm_d2i_lo">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2i_hi : GCCBuiltin<"__nvvm_d2i_hi">,
+ Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_f2ll_rn_ftz : GCCBuiltin<"__nvvm_f2ll_rn_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ll_rn : GCCBuiltin<"__nvvm_f2ll_rn">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ll_rz_ftz : GCCBuiltin<"__nvvm_f2ll_rz_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ll_rz : GCCBuiltin<"__nvvm_f2ll_rz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ll_rm_ftz : GCCBuiltin<"__nvvm_f2ll_rm_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ll_rm : GCCBuiltin<"__nvvm_f2ll_rm">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ll_rp_ftz : GCCBuiltin<"__nvvm_f2ll_rp_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ll_rp : GCCBuiltin<"__nvvm_f2ll_rp">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_f2ull_rn_ftz : GCCBuiltin<"__nvvm_f2ull_rn_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ull_rn : GCCBuiltin<"__nvvm_f2ull_rn">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ull_rz_ftz : GCCBuiltin<"__nvvm_f2ull_rz_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ull_rz : GCCBuiltin<"__nvvm_f2ull_rz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ull_rm_ftz : GCCBuiltin<"__nvvm_f2ull_rm_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ull_rm : GCCBuiltin<"__nvvm_f2ull_rm">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ull_rp_ftz : GCCBuiltin<"__nvvm_f2ull_rp_ftz">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2ull_rp : GCCBuiltin<"__nvvm_f2ull_rp">,
+ Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_d2ll_rn : GCCBuiltin<"__nvvm_d2ll_rn">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ll_rz : GCCBuiltin<"__nvvm_d2ll_rz">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ll_rm : GCCBuiltin<"__nvvm_d2ll_rm">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ll_rp : GCCBuiltin<"__nvvm_d2ll_rp">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_d2ull_rn : GCCBuiltin<"__nvvm_d2ull_rn">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ull_rz : GCCBuiltin<"__nvvm_d2ull_rz">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ull_rm : GCCBuiltin<"__nvvm_d2ull_rm">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ def int_nvvm_d2ull_rp : GCCBuiltin<"__nvvm_d2ull_rp">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+
+ def int_nvvm_ll2f_rn : GCCBuiltin<"__nvvm_ll2f_rn">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ll2f_rz : GCCBuiltin<"__nvvm_ll2f_rz">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ll2f_rm : GCCBuiltin<"__nvvm_ll2f_rm">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ll2f_rp : GCCBuiltin<"__nvvm_ll2f_rp">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2f_rn : GCCBuiltin<"__nvvm_ull2f_rn">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2f_rz : GCCBuiltin<"__nvvm_ull2f_rz">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2f_rm : GCCBuiltin<"__nvvm_ull2f_rm">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2f_rp : GCCBuiltin<"__nvvm_ull2f_rp">,
+ Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+
+ def int_nvvm_ll2d_rn : GCCBuiltin<"__nvvm_ll2d_rn">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ll2d_rz : GCCBuiltin<"__nvvm_ll2d_rz">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ll2d_rm : GCCBuiltin<"__nvvm_ll2d_rm">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ll2d_rp : GCCBuiltin<"__nvvm_ll2d_rp">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2d_rn : GCCBuiltin<"__nvvm_ull2d_rn">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2d_rz : GCCBuiltin<"__nvvm_ull2d_rz">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2d_rm : GCCBuiltin<"__nvvm_ull2d_rm">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_ull2d_rp : GCCBuiltin<"__nvvm_ull2d_rp">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+
+ def int_nvvm_f2h_rn_ftz : GCCBuiltin<"__nvvm_f2h_rn_ftz">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_f2h_rn : GCCBuiltin<"__nvvm_f2h_rn">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>;
+
+ def int_nvvm_h2f : GCCBuiltin<"__nvvm_h2f">,
+ Intrinsic<[llvm_float_ty], [llvm_i16_ty], [IntrNoMem]>;
+
+//
+// Bitcast
+//
+
+ def int_nvvm_bitcast_f2i : GCCBuiltin<"__nvvm_bitcast_f2i">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_nvvm_bitcast_i2f : GCCBuiltin<"__nvvm_bitcast_i2f">,
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ def int_nvvm_bitcast_ll2d : GCCBuiltin<"__nvvm_bitcast_ll2d">,
+ Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ def int_nvvm_bitcast_d2ll : GCCBuiltin<"__nvvm_bitcast_d2ll">,
+ Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+
+
+// Atomic operations not available as LLVM intrinsics.
+ def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty],
+ [LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty],
+ [IntrReadWriteArgMem, NoCapture<0>]>;
+ def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
+ [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
+ [IntrReadWriteArgMem, NoCapture<0>]>;
+ def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty],
+ [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
+ [IntrReadWriteArgMem, NoCapture<0>]>;
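+// Illustrative only: these are overloaded on the pointer's address space, so
+// a call in IR carries mangled pointer types; an assumed shape would be:
+//   %old = call float @llvm.nvvm.atomic.load.add.f32.p1f32(
+//            float addrspace(1)* %p, float %v)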
+
+// Bar.Sync
+ def int_cuda_syncthreads : GCCBuiltin<"__syncthreads">,
+ Intrinsic<[], [], []>;
+ def int_nvvm_barrier0 : GCCBuiltin<"__nvvm_bar0">,
+ Intrinsic<[], [], []>;
+ def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
+ def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
+ def int_nvvm_barrier0_or : GCCBuiltin<"__nvvm_bar0_or">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
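+// Illustrative only: possible C-level use via the builtin names above
+// (bar0_popc synchronizes the block and counts the threads whose predicate
+// is nonzero; pred is an assumption):
+//   __syncthreads();
+//   int n = __nvvm_bar0_popc(pred);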
+
+ // Membar
+ def int_nvvm_membar_cta : GCCBuiltin<"__nvvm_membar_cta">,
+ Intrinsic<[], [], []>;
+ def int_nvvm_membar_gl : GCCBuiltin<"__nvvm_membar_gl">,
+ Intrinsic<[], [], []>;
+ def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
+ Intrinsic<[], [], []>;
+
+
+// Accessing special registers
+ def int_nvvm_read_ptx_sreg_tid_x :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_tid_x">;
+ def int_nvvm_read_ptx_sreg_tid_y :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_tid_y">;
+ def int_nvvm_read_ptx_sreg_tid_z :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_tid_z">;
+
+ def int_nvvm_read_ptx_sreg_ntid_x :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_ntid_x">;
+ def int_nvvm_read_ptx_sreg_ntid_y :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_ntid_y">;
+ def int_nvvm_read_ptx_sreg_ntid_z :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_ntid_z">;
+
+ def int_nvvm_read_ptx_sreg_ctaid_x :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_ctaid_x">;
+ def int_nvvm_read_ptx_sreg_ctaid_y :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_ctaid_y">;
+ def int_nvvm_read_ptx_sreg_ctaid_z :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_ctaid_z">;
+
+ def int_nvvm_read_ptx_sreg_nctaid_x :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_nctaid_x">;
+ def int_nvvm_read_ptx_sreg_nctaid_y :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_nctaid_y">;
+ def int_nvvm_read_ptx_sreg_nctaid_z :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_nctaid_z">;
+
+ def int_nvvm_read_ptx_sreg_warpsize :
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_warpsize">;
+
+
+// Generated within nvvm. Use for ldu on sm_20 or later.
+// @TODO: Revisit this; LLVMAnyPointerType was changed to LLVMPointerType.
+def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
+ [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ "llvm.nvvm.ldu.global.i">;
+def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
+ [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ "llvm.nvvm.ldu.global.f">;
+def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
+ [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ "llvm.nvvm.ldu.global.p">;
+
+
+// Use for generic pointers
+// - These intrinsics are used to convert address spaces.
+// - The input pointer and output pointer must have the same type, except for
+// the address space. (This restriction is not enforced here, as there is
+// currently no way to describe it.)
+// - This complements the LLVM bitcast, which can be used to cast one type of
+// pointer to another type of pointer while the address space remains the
+// same; an illustrative call follows the definitions below.
+def int_nvvm_ptr_local_to_gen: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.local.to.gen">;
+def int_nvvm_ptr_shared_to_gen: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.shared.to.gen">;
+def int_nvvm_ptr_global_to_gen: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.global.to.gen">;
+def int_nvvm_ptr_constant_to_gen: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.constant.to.gen">;
+
+def int_nvvm_ptr_gen_to_global: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.gen.to.global">;
+def int_nvvm_ptr_gen_to_shared: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.gen.to.shared">;
+def int_nvvm_ptr_gen_to_local: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.gen.to.local">;
+def int_nvvm_ptr_gen_to_constant: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.gen.to.constant">;
+
+// Used internally by nvvm to help address-space optimization and PTX code
+// generation. This is for parameters that are passed to kernel functions
+// by value via a pointer (byval).
+def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty],
+ [IntrNoMem, NoCapture<0>],
+ "llvm.nvvm.ptr.gen.to.param">;
+
+// Move intrinsics, used in nvvm internally
+
+def int_nvvm_move_i8 : Intrinsic<[llvm_i8_ty], [llvm_i8_ty], [IntrNoMem],
+ "llvm.nvvm.move.i8">;
+def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem],
+ "llvm.nvvm.move.i16">;
+def int_nvvm_move_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem],
+ "llvm.nvvm.move.i32">;
+def int_nvvm_move_i64 : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem],
+ "llvm.nvvm.move.i64">;
+def int_nvvm_move_float : Intrinsic<[llvm_float_ty], [llvm_float_ty],
+ [IntrNoMem], "llvm.nvvm.move.float">;
+def int_nvvm_move_double : Intrinsic<[llvm_double_ty], [llvm_double_ty],
+ [IntrNoMem], "llvm.nvvm.move.double">;
+def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
+ [IntrNoMem, NoCapture<0>], "llvm.nvvm.move.ptr">;
+
+
+/// Error / Warn
+def int_nvvm_compiler_error :
+ Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.error">;
+def int_nvvm_compiler_warn :
+ Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">;
+
+
+// Old PTX back-end intrinsics retained here for backwards-compatibility
+
+multiclass PTXReadSpecialRegisterIntrinsic_v4i32<string prefix> {
+// FIXME: Do we need the 128-bit integer type version?
+// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem]>;
+
+// FIXME: Enable this once v4i32 support is enabled in back-end.
+// def _v4i32 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem]>;
+
+ def _x : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<!strconcat(prefix, "_x")>;
+ def _y : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<!strconcat(prefix, "_y")>;
+ def _z : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<!strconcat(prefix, "_z")>;
+ def _w : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<!strconcat(prefix, "_w")>;
+}
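+// For reference: each defm below instantiates this multiclass once per
+// register component, e.g. "defm int_ptx_read_tid" produces
+// int_ptx_read_tid_x/_y/_z/_w bound to __builtin_ptx_read_tid_x/.../_w.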
+
+class PTXReadSpecialRegisterIntrinsic_r32<string name>
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ GCCBuiltin<name>;
+
+class PTXReadSpecialRegisterIntrinsic_r64<string name>
+ : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>,
+ GCCBuiltin<name>;
+
+defm int_ptx_read_tid : PTXReadSpecialRegisterIntrinsic_v4i32
+ <"__builtin_ptx_read_tid">;
+defm int_ptx_read_ntid : PTXReadSpecialRegisterIntrinsic_v4i32
+ <"__builtin_ptx_read_ntid">;
+
+def int_ptx_read_laneid : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_laneid">;
+def int_ptx_read_warpid : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_warpid">;
+def int_ptx_read_nwarpid : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_nwarpid">;
+
+defm int_ptx_read_ctaid : PTXReadSpecialRegisterIntrinsic_v4i32
+ <"__builtin_ptx_read_ctaid">;
+defm int_ptx_read_nctaid : PTXReadSpecialRegisterIntrinsic_v4i32
+ <"__builtin_ptx_read_nctaid">;
+
+def int_ptx_read_smid : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_smid">;
+def int_ptx_read_nsmid : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_nsmid">;
+def int_ptx_read_gridid : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_gridid">;
+
+def int_ptx_read_lanemask_eq : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_lanemask_eq">;
+def int_ptx_read_lanemask_le : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_lanemask_le">;
+def int_ptx_read_lanemask_lt : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_lanemask_lt">;
+def int_ptx_read_lanemask_ge : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_lanemask_ge">;
+def int_ptx_read_lanemask_gt : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_lanemask_gt">;
+
+def int_ptx_read_clock : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_clock">;
+def int_ptx_read_clock64 : PTXReadSpecialRegisterIntrinsic_r64
+ <"__builtin_ptx_read_clock64">;
+
+def int_ptx_read_pm0 : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_pm0">;
+def int_ptx_read_pm1 : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_pm1">;
+def int_ptx_read_pm2 : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_pm2">;
+def int_ptx_read_pm3 : PTXReadSpecialRegisterIntrinsic_r32
+ <"__builtin_ptx_read_pm3">;
+
+def int_ptx_bar_sync : Intrinsic<[], [llvm_i32_ty], []>,
+ GCCBuiltin<"__builtin_ptx_bar_sync">;
diff --git a/include/llvm/IntrinsicsPTX.td b/include/llvm/IntrinsicsPTX.td
deleted file mode 100644
index 28379c9..0000000
--- a/include/llvm/IntrinsicsPTX.td
+++ /dev/null
@@ -1,92 +0,0 @@
-//===- IntrinsicsPTX.td - Defines PTX intrinsics -----------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines all of the PTX-specific intrinsics.
-//
-//===----------------------------------------------------------------------===//
-
-let TargetPrefix = "ptx" in {
- multiclass PTXReadSpecialRegisterIntrinsic_v4i32<string prefix> {
-// FIXME: Do we need the 128-bit integer type version?
-// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem]>;
-
-// FIXME: Enable this once v4i32 support is enabled in back-end.
-// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem]>;
-
- def _x : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
- GCCBuiltin<!strconcat(prefix, "_x")>;
- def _y : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
- GCCBuiltin<!strconcat(prefix, "_y")>;
- def _z : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
- GCCBuiltin<!strconcat(prefix, "_z")>;
- def _w : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
- GCCBuiltin<!strconcat(prefix, "_w")>;
- }
-
- class PTXReadSpecialRegisterIntrinsic_r32<string name>
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
- GCCBuiltin<name>;
-
- class PTXReadSpecialRegisterIntrinsic_r64<string name>
- : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>,
- GCCBuiltin<name>;
-}
-
-defm int_ptx_read_tid : PTXReadSpecialRegisterIntrinsic_v4i32
- <"__builtin_ptx_read_tid">;
-defm int_ptx_read_ntid : PTXReadSpecialRegisterIntrinsic_v4i32
- <"__builtin_ptx_read_ntid">;
-
-def int_ptx_read_laneid : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_laneid">;
-def int_ptx_read_warpid : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_warpid">;
-def int_ptx_read_nwarpid : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_nwarpid">;
-
-defm int_ptx_read_ctaid : PTXReadSpecialRegisterIntrinsic_v4i32
- <"__builtin_ptx_read_ctaid">;
-defm int_ptx_read_nctaid : PTXReadSpecialRegisterIntrinsic_v4i32
- <"__builtin_ptx_read_nctaid">;
-
-def int_ptx_read_smid : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_smid">;
-def int_ptx_read_nsmid : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_nsmid">;
-def int_ptx_read_gridid : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_gridid">;
-
-def int_ptx_read_lanemask_eq : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_lanemask_eq">;
-def int_ptx_read_lanemask_le : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_lanemask_le">;
-def int_ptx_read_lanemask_lt : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_lanemask_lt">;
-def int_ptx_read_lanemask_ge : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_lanemask_ge">;
-def int_ptx_read_lanemask_gt : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_lanemask_gt">;
-
-def int_ptx_read_clock : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_clock">;
-def int_ptx_read_clock64 : PTXReadSpecialRegisterIntrinsic_r64
- <"__builtin_ptx_read_clock64">;
-
-def int_ptx_read_pm0 : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_pm0">;
-def int_ptx_read_pm1 : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_pm1">;
-def int_ptx_read_pm2 : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_pm2">;
-def int_ptx_read_pm3 : PTXReadSpecialRegisterIntrinsic_r32
- <"__builtin_ptx_read_pm3">;
-
-let TargetPrefix = "ptx" in
- def int_ptx_bar_sync : Intrinsic<[], [llvm_i32_ty], []>,
- GCCBuiltin<"__builtin_ptx_bar_sync">;
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index cb7b3ea..e8039f2 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -819,6 +819,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[IntrNoMem]>;
}
+// PCLMUL instruction
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_pclmulqdq : GCCBuiltin<"__builtin_ia32_pclmulqdq128">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+}
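+// Illustrative only: with PCLMUL enabled, this builtin backs the (assumed)
+// <wmmintrin.h> intrinsic, e.g.:
+//   __m128i r = _mm_clmulepi64_si128(a, b, 0x00); // carry-less multiply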
+
// Vector pack
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">,
@@ -1005,6 +1012,28 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
}
//===----------------------------------------------------------------------===//
+// SSE4A
+
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_sse4a_extrqi : GCCBuiltin<"__builtin_ia32_extrqi">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+ def int_x86_sse4a_extrq : GCCBuiltin<"__builtin_ia32_extrq">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;
+
+ def int_x86_sse4a_insertqi : GCCBuiltin<"__builtin_ia32_insertqi">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
+ llvm_i8_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_sse4a_insertq : GCCBuiltin<"__builtin_ia32_insertq">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+ def int_x86_sse4a_movnt_ss : GCCBuiltin<"__builtin_ia32_movntss">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty], []>;
+ def int_x86_sse4a_movnt_sd : GCCBuiltin<"__builtin_ia32_movntsd">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty], []>;
+}
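+// Illustrative only: the movnt builtins back the (assumed) <ammintrin.h>
+// non-temporal scalar stores, e.g.:
+//   _mm_stream_ss(dst, v);  // store low float of v, bypassing the cache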
+
+//===----------------------------------------------------------------------===//
// AVX
// Arithmetic ops
@@ -1272,16 +1301,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], []>;
}
-// Cacheability support ops
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx_movnt_dq_256 : GCCBuiltin<"__builtin_ia32_movntdq256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty], []>;
- def int_x86_avx_movnt_pd_256 : GCCBuiltin<"__builtin_ia32_movntpd256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], []>;
- def int_x86_avx_movnt_ps_256 : GCCBuiltin<"__builtin_ia32_movntps256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], []>;
-}
-
// Conditional load ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
@@ -1725,6 +1744,75 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[IntrNoMem]>;
}
+// Gather ops
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_avx2_gather_d_pd : GCCBuiltin<"__builtin_ia32_gatherd_pd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_d_pd_256 : GCCBuiltin<"__builtin_ia32_gatherd_pd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_pd : GCCBuiltin<"__builtin_ia32_gatherq_pd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_pd_256 : GCCBuiltin<"__builtin_ia32_gatherq_pd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_d_ps : GCCBuiltin<"__builtin_ia32_gatherd_ps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_d_ps_256 : GCCBuiltin<"__builtin_ia32_gatherd_ps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_ps : GCCBuiltin<"__builtin_ia32_gatherq_ps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_ps_256 : GCCBuiltin<"__builtin_ia32_gatherq_ps256">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+
+ def int_x86_avx2_gather_d_q : GCCBuiltin<"__builtin_ia32_gatherd_q">,
+ Intrinsic<[llvm_v2i64_ty],
+ [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_d_q_256 : GCCBuiltin<"__builtin_ia32_gatherd_q256">,
+ Intrinsic<[llvm_v4i64_ty],
+ [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_q : GCCBuiltin<"__builtin_ia32_gatherq_q">,
+ Intrinsic<[llvm_v2i64_ty],
+ [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_q_256 : GCCBuiltin<"__builtin_ia32_gatherq_q256">,
+ Intrinsic<[llvm_v4i64_ty],
+ [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_d_d : GCCBuiltin<"__builtin_ia32_gatherd_d">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_d : GCCBuiltin<"__builtin_ia32_gatherq_d">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+ def int_x86_avx2_gather_q_d_256 : GCCBuiltin<"__builtin_ia32_gatherq_d256">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
+ [IntrReadMem]>;
+}
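+// Illustrative only: these back the (assumed) AVX2 masked-gather intrinsics
+// in <immintrin.h>; src supplies the elements where the mask is unset, e.g.:
+//   __m128d r = _mm_mask_i32gather_pd(src, base, idx, mask, 8);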
+
// Misc.
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
@@ -1740,137 +1828,137 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
}
//===----------------------------------------------------------------------===//
-// FMA4
+// FMA3 and FMA4
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_fma4_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
+ def int_x86_fma_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">,
+ def int_x86_fma_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmadd_ps : GCCBuiltin<"__builtin_ia32_vfmaddps">,
+ def int_x86_fma_vfmadd_ps : GCCBuiltin<"__builtin_ia32_vfmaddps">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmadd_pd : GCCBuiltin<"__builtin_ia32_vfmaddpd">,
+ def int_x86_fma_vfmadd_pd : GCCBuiltin<"__builtin_ia32_vfmaddpd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256">,
+ def int_x86_fma_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256">,
+ def int_x86_fma_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">,
+ def int_x86_fma_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsub_sd : GCCBuiltin<"__builtin_ia32_vfmsubsd">,
+ def int_x86_fma_vfmsub_sd : GCCBuiltin<"__builtin_ia32_vfmsubsd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsub_ps : GCCBuiltin<"__builtin_ia32_vfmsubps">,
+ def int_x86_fma_vfmsub_ps : GCCBuiltin<"__builtin_ia32_vfmsubps">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsub_pd : GCCBuiltin<"__builtin_ia32_vfmsubpd">,
+ def int_x86_fma_vfmsub_pd : GCCBuiltin<"__builtin_ia32_vfmsubpd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubps256">,
+ def int_x86_fma_vfmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubps256">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubpd256">,
+ def int_x86_fma_vfmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubpd256">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">,
+ def int_x86_fma_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmadd_sd : GCCBuiltin<"__builtin_ia32_vfnmaddsd">,
+ def int_x86_fma_vfnmadd_sd : GCCBuiltin<"__builtin_ia32_vfnmaddsd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmadd_ps : GCCBuiltin<"__builtin_ia32_vfnmaddps">,
+ def int_x86_fma_vfnmadd_ps : GCCBuiltin<"__builtin_ia32_vfnmaddps">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmadd_pd : GCCBuiltin<"__builtin_ia32_vfnmaddpd">,
+ def int_x86_fma_vfnmadd_pd : GCCBuiltin<"__builtin_ia32_vfnmaddpd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmaddps256">,
+ def int_x86_fma_vfnmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmaddps256">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmaddpd256">,
+ def int_x86_fma_vfnmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmaddpd256">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">,
+ def int_x86_fma_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmsub_sd : GCCBuiltin<"__builtin_ia32_vfnmsubsd">,
+ def int_x86_fma_vfnmsub_sd : GCCBuiltin<"__builtin_ia32_vfnmsubsd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmsub_ps : GCCBuiltin<"__builtin_ia32_vfnmsubps">,
+ def int_x86_fma_vfnmsub_ps : GCCBuiltin<"__builtin_ia32_vfnmsubps">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmsub_pd : GCCBuiltin<"__builtin_ia32_vfnmsubpd">,
+ def int_x86_fma_vfnmsub_pd : GCCBuiltin<"__builtin_ia32_vfnmsubpd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmsubps256">,
+ def int_x86_fma_vfnmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmsubps256">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfnmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmsubpd256">,
+ def int_x86_fma_vfnmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmsubpd256">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
+ def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">,
+ def int_x86_fma_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmaddsub_ps_256 :
+ def int_x86_fma_vfmaddsub_ps_256 :
GCCBuiltin<"__builtin_ia32_vfmaddsubps256">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmaddsub_pd_256 :
+ def int_x86_fma_vfmaddsub_pd_256 :
GCCBuiltin<"__builtin_ia32_vfmaddsubpd256">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">,
+ def int_x86_fma_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsubadd_pd : GCCBuiltin<"__builtin_ia32_vfmsubaddpd">,
+ def int_x86_fma_vfmsubadd_pd : GCCBuiltin<"__builtin_ia32_vfmsubaddpd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsubadd_ps_256 :
+ def int_x86_fma_vfmsubadd_ps_256 :
GCCBuiltin<"__builtin_ia32_vfmsubaddps256">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
[IntrNoMem]>;
- def int_x86_fma4_vfmsubadd_pd_256 :
+ def int_x86_fma_vfmsubadd_pd_256 :
GCCBuiltin<"__builtin_ia32_vfmsubaddpd256">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
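The rename from int_x86_fma4_* to int_x86_fma_* reflects that a single set
of intrinsics now backs both the FMA3 and FMA4 instruction forms. A minimal
sketch, assuming the standard <immintrin.h> wrapper (compile with -mfma or
-mfma4), which clang of this era routes through __builtin_ia32_vfmaddps and
thus int_x86_fma_vfmadd_ps; fma_ps is our name:

    #include <immintrin.h>

    /* Fused multiply-add: a*b + c with a single rounding step. */
    __m128 fma_ps(__m128 a, __m128 b, __m128 c) {
      return _mm_fmadd_ps(a, b, c);
    }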
@@ -1901,26 +1989,19 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
- def int_x86_xop_vfrcz_pd :
- GCCBuiltin<"__builtin_ia32_vfrczpd">,
+ def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
- def int_x86_xop_vfrcz_ps :
- GCCBuiltin<"__builtin_ia32_vfrczps">,
+ def int_x86_xop_vfrcz_ps : GCCBuiltin<"__builtin_ia32_vfrczps">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
- def int_x86_xop_vfrcz_sd :
- GCCBuiltin<"__builtin_ia32_vfrczsd">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vfrcz_ss :
- GCCBuiltin<"__builtin_ia32_vfrczss">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vfrcz_pd_256 :
- GCCBuiltin<"__builtin_ia32_vfrczpd256">,
+ def int_x86_xop_vfrcz_sd : GCCBuiltin<"__builtin_ia32_vfrczsd">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+ def int_x86_xop_vfrcz_ss : GCCBuiltin<"__builtin_ia32_vfrczss">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+ def int_x86_xop_vfrcz_pd_256 : GCCBuiltin<"__builtin_ia32_vfrczpd256">,
Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
- def int_x86_xop_vfrcz_ps_256 :
- GCCBuiltin<"__builtin_ia32_vfrczps256">,
+ def int_x86_xop_vfrcz_ps_256 : GCCBuiltin<"__builtin_ia32_vfrczps256">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+
def int_x86_xop_vpcmov :
GCCBuiltin<"__builtin_ia32_vpcmov">,
Intrinsic<[llvm_v2i64_ty],
@@ -1931,262 +2012,32 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v4i64_ty],
[llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty],
[IntrNoMem]>;
- def int_x86_xop_vpcomeqb :
- GCCBuiltin<"__builtin_ia32_vpcomeqb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomeqw :
- GCCBuiltin<"__builtin_ia32_vpcomeqw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomeqd :
- GCCBuiltin<"__builtin_ia32_vpcomeqd">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomeqq :
- GCCBuiltin<"__builtin_ia32_vpcomeqq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomequb :
- GCCBuiltin<"__builtin_ia32_vpcomequb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomequd :
- GCCBuiltin<"__builtin_ia32_vpcomequd">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomequq :
- GCCBuiltin<"__builtin_ia32_vpcomequq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomequw :
- GCCBuiltin<"__builtin_ia32_vpcomequw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalseb :
- GCCBuiltin<"__builtin_ia32_vpcomfalseb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalsed :
- GCCBuiltin<"__builtin_ia32_vpcomfalsed">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalseq :
- GCCBuiltin<"__builtin_ia32_vpcomfalseq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalseub :
- GCCBuiltin<"__builtin_ia32_vpcomfalseub">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalseud :
- GCCBuiltin<"__builtin_ia32_vpcomfalseud">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalseuq :
- GCCBuiltin<"__builtin_ia32_vpcomfalseuq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalseuw :
- GCCBuiltin<"__builtin_ia32_vpcomfalseuw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomfalsew :
- GCCBuiltin<"__builtin_ia32_vpcomfalsew">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgeb :
- GCCBuiltin<"__builtin_ia32_vpcomgeb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomged :
- GCCBuiltin<"__builtin_ia32_vpcomged">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgeq :
- GCCBuiltin<"__builtin_ia32_vpcomgeq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgeub :
- GCCBuiltin<"__builtin_ia32_vpcomgeub">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgeud :
- GCCBuiltin<"__builtin_ia32_vpcomgeud">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgeuq :
- GCCBuiltin<"__builtin_ia32_vpcomgeuq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgeuw :
- GCCBuiltin<"__builtin_ia32_vpcomgeuw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgew :
- GCCBuiltin<"__builtin_ia32_vpcomgew">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtb :
- GCCBuiltin<"__builtin_ia32_vpcomgtb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtd :
- GCCBuiltin<"__builtin_ia32_vpcomgtd">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtq :
- GCCBuiltin<"__builtin_ia32_vpcomgtq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtub :
- GCCBuiltin<"__builtin_ia32_vpcomgtub">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtud :
- GCCBuiltin<"__builtin_ia32_vpcomgtud">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtuq :
- GCCBuiltin<"__builtin_ia32_vpcomgtuq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtuw :
- GCCBuiltin<"__builtin_ia32_vpcomgtuw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomgtw :
- GCCBuiltin<"__builtin_ia32_vpcomgtw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomleb :
- GCCBuiltin<"__builtin_ia32_vpcomleb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomled :
- GCCBuiltin<"__builtin_ia32_vpcomled">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomleq :
- GCCBuiltin<"__builtin_ia32_vpcomleq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomleub :
- GCCBuiltin<"__builtin_ia32_vpcomleub">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomleud :
- GCCBuiltin<"__builtin_ia32_vpcomleud">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomleuq :
- GCCBuiltin<"__builtin_ia32_vpcomleuq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomleuw :
- GCCBuiltin<"__builtin_ia32_vpcomleuw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomlew :
- GCCBuiltin<"__builtin_ia32_vpcomlew">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltb :
- GCCBuiltin<"__builtin_ia32_vpcomltb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltd :
- GCCBuiltin<"__builtin_ia32_vpcomltd">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltq :
- GCCBuiltin<"__builtin_ia32_vpcomltq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltub :
- GCCBuiltin<"__builtin_ia32_vpcomltub">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltud :
- GCCBuiltin<"__builtin_ia32_vpcomltud">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltuq :
- GCCBuiltin<"__builtin_ia32_vpcomltuq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltuw :
- GCCBuiltin<"__builtin_ia32_vpcomltuw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomltw :
- GCCBuiltin<"__builtin_ia32_vpcomltw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomneb :
- GCCBuiltin<"__builtin_ia32_vpcomneb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomned :
- GCCBuiltin<"__builtin_ia32_vpcomned">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomneq :
- GCCBuiltin<"__builtin_ia32_vpcomneq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomneub :
- GCCBuiltin<"__builtin_ia32_vpcomneub">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomneud :
- GCCBuiltin<"__builtin_ia32_vpcomneud">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomneuq :
- GCCBuiltin<"__builtin_ia32_vpcomneuq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomneuw :
- GCCBuiltin<"__builtin_ia32_vpcomneuw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomnew :
- GCCBuiltin<"__builtin_ia32_vpcomnew">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtrueb :
- GCCBuiltin<"__builtin_ia32_vpcomtrueb">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtrued :
- GCCBuiltin<"__builtin_ia32_vpcomtrued">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtrueq :
- GCCBuiltin<"__builtin_ia32_vpcomtrueq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtrueub :
- GCCBuiltin<"__builtin_ia32_vpcomtrueub">,
- Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtrueud :
- GCCBuiltin<"__builtin_ia32_vpcomtrueud">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtrueuq :
- GCCBuiltin<"__builtin_ia32_vpcomtrueuq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtrueuw :
- GCCBuiltin<"__builtin_ia32_vpcomtrueuw">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
- def int_x86_xop_vpcomtruew :
- GCCBuiltin<"__builtin_ia32_vpcomtruew">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
- [IntrNoMem]>;
+
+ def int_x86_xop_vpcomb : GCCBuiltin<"__builtin_ia32_vpcomb">,
+ Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_xop_vpcomw : GCCBuiltin<"__builtin_ia32_vpcomw">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_xop_vpcomd : GCCBuiltin<"__builtin_ia32_vpcomd">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_xop_vpcomq : GCCBuiltin<"__builtin_ia32_vpcomq">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_xop_vpcomub : GCCBuiltin<"__builtin_ia32_vpcomub">,
+ Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_xop_vpcomuw : GCCBuiltin<"__builtin_ia32_vpcomuw">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_xop_vpcomud : GCCBuiltin<"__builtin_ia32_vpcomud">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_xop_vpcomuq : GCCBuiltin<"__builtin_ia32_vpcomuq">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
+ llvm_i8_ty], [IntrNoMem]>;
+
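The per-predicate XOP compare intrinsics (vpcomeq*, vpcomlt*, ...) collapse
into one generic intrinsic per element type; the trailing i8 immediate now
selects the predicate. A hedged sketch using the builtin declared above,
assuming gcc/clang vector extensions, -mxop, and AMD's documented condition
encoding (0=lt, 1=le, 2=gt, 3=ge, 4=eq, 5=ne, 6=false, 7=true); cmpeq_bytes
is our name:

    typedef char v16qi __attribute__((vector_size(16)));

    /* Byte equality compare; immediate 4 selects the "equal" predicate. */
    v16qi cmpeq_bytes(v16qi a, v16qi b) {
      return __builtin_ia32_vpcomb(a, b, 4);
    }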
def int_x86_xop_vphaddbd :
GCCBuiltin<"__builtin_ia32_vphaddbd">,
Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
@@ -2297,22 +2148,32 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v16i8_ty],
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
- def int_x86_xop_vprotb :
- GCCBuiltin<"__builtin_ia32_vprotb">,
+
+ def int_x86_xop_vprotb : GCCBuiltin<"__builtin_ia32_vprotb">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
- def int_x86_xop_vprotd :
- GCCBuiltin<"__builtin_ia32_vprotd">,
+ def int_x86_xop_vprotd : GCCBuiltin<"__builtin_ia32_vprotd">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
- def int_x86_xop_vprotq :
- GCCBuiltin<"__builtin_ia32_vprotq">,
+ def int_x86_xop_vprotq : GCCBuiltin<"__builtin_ia32_vprotq">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
- def int_x86_xop_vprotw :
- GCCBuiltin<"__builtin_ia32_vprotw">,
+ def int_x86_xop_vprotw : GCCBuiltin<"__builtin_ia32_vprotw">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
+ def int_x86_xop_vprotbi : GCCBuiltin<"__builtin_ia32_vprotbi">,
+ Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+ def int_x86_xop_vprotdi : GCCBuiltin<"__builtin_ia32_vprotdi">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+ def int_x86_xop_vprotqi : GCCBuiltin<"__builtin_ia32_vprotqi">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+ def int_x86_xop_vprotwi : GCCBuiltin<"__builtin_ia32_vprotwi">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
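The new vprot*i variants take the rotate count as an i8 immediate rather
than a vector. A sketch with the builtin declared above, assuming gcc/clang
vector extensions and -mxop; positive counts rotate left, and rotl3 is our
name:

    typedef int v4si __attribute__((vector_size(16)));

    /* Rotate each 32-bit lane left by 3 bits. */
    v4si rotl3(v4si v) {
      return __builtin_ia32_vprotdi(v, 3);
    }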
def int_x86_xop_vpshab :
GCCBuiltin<"__builtin_ia32_vpshab">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
@@ -2675,3 +2536,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty],
[IntrNoMem]>;
}
+
+//===----------------------------------------------------------------------===//
+// RDRAND intrinsics. Return a random value and whether it is valid.
+
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ // These are declared side-effecting so they don't get eliminated by CSE or
+ // LICM.
+ def int_x86_rdrand_16 : Intrinsic<[llvm_i16_ty, llvm_i32_ty], [], []>;
+ def int_x86_rdrand_32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [], []>;
+ def int_x86_rdrand_64 : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>;
+}
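From C, the (value, validity-flag) result pair surfaces as the return code
of the <immintrin.h> step intrinsics; a short retry loop is the conventional
pattern, since the hardware may transiently report invalid data. A sketch
assuming -mrdrnd; random_u32 is our name:

    #include <immintrin.h>

    int random_u32(unsigned int *out) {
      for (int i = 0; i < 10; ++i)
        if (_rdrand32_step(out))   /* 1 = *out holds a valid random value */
          return 1;
      return 0;                    /* entropy transiently unavailable */
    }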
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 2258d45..697c94c 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -55,6 +55,7 @@ namespace {
(void) llvm::createScalarEvolutionAliasAnalysisPass();
(void) llvm::createTypeBasedAliasAnalysisPass();
(void) llvm::createBlockPlacementPass();
+ (void) llvm::createBoundsCheckingPass();
(void) llvm::createBreakCriticalEdgesPass();
(void) llvm::createCFGSimplificationPass();
(void) llvm::createConstantMergePass();
diff --git a/include/llvm/MC/EDInstInfo.h b/include/llvm/MC/EDInstInfo.h
index 0b9d3f6..5b02467 100644
--- a/include/llvm/MC/EDInstInfo.h
+++ b/include/llvm/MC/EDInstInfo.h
@@ -12,7 +12,7 @@
#include "llvm/Support/DataTypes.h"
namespace llvm {
-
+
#define EDIS_MAX_OPERANDS 13
#define EDIS_MAX_SYNTAXES 2
@@ -23,7 +23,7 @@ struct EDInstInfo {
uint8_t operandFlags[EDIS_MAX_OPERANDS];
const signed char operandOrders[EDIS_MAX_SYNTAXES][EDIS_MAX_OPERANDS];
};
-
+
} // namespace llvm
#endif
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 0f67c99..9f5230b 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -147,6 +147,11 @@ namespace llvm {
// FIXME: Make this a more general encoding setting?
bool AllowUTF8;
+ /// UseDataRegionDirectives - This is true if data region markers should
+ /// be printed as ".data_region/.end_data_region" directives. If false,
+ /// use "$d/$a" labels instead.
+ bool UseDataRegionDirectives;
+
//===--- Data Emission Directives -------------------------------------===//
/// ZeroDirective - this should be set to the directive used to get some
@@ -172,18 +177,6 @@ namespace llvm {
const char *Data32bitsDirective; // Defaults to "\t.long\t"
const char *Data64bitsDirective; // Defaults to "\t.quad\t"
- /// [Data|Code]Begin - These magic labels are used to marked a region as
- /// data or code, and are used to provide additional information for
- /// correct disassembly on targets that like to mix data and code within
- /// a segment. These labels will be implicitly suffixed by the streamer
- /// to give them unique names.
- const char *DataBegin; // Defaults to "$d."
- const char *CodeBegin; // Defaults to "$a."
- const char *JT8Begin; // Defaults to "$a."
- const char *JT16Begin; // Defaults to "$a."
- const char *JT32Begin; // Defaults to "$a."
- bool SupportsDataRegions;
-
/// GPRel64Directive - if non-null, a directive that is used to emit a word
/// which should be relocated as a 64-bit GP-relative offset, e.g. .gpdword
/// on Mips.
@@ -322,26 +315,14 @@ namespace llvm {
/// DwarfSectionOffsetDirective - Special section offset directive.
const char* DwarfSectionOffsetDirective; // Defaults to NULL
- /// DwarfRequiresRelocationForSectionOffset - True if we need to produce a
- /// relocation when we want a section offset in dwarf.
- bool DwarfRequiresRelocationForSectionOffset; // Defaults to true;
-
- /// DwarfUsesLabelOffsetDifference - True if Dwarf2 output can
- /// use EmitLabelOffsetDifference.
- bool DwarfUsesLabelOffsetForRanges;
-
- /// DwarfUsesRelocationsForStringPool - True if this Dwarf output must use
- /// relocations to refer to entries in the string pool.
- bool DwarfUsesRelocationsForStringPool;
+ /// DwarfUsesRelocationsAcrossSections - True if Dwarf2 output generally
+ /// uses relocations for references to other .debug_* sections.
+ bool DwarfUsesRelocationsAcrossSections;
/// DwarfRegNumForCFI - True if dwarf register numbers are printed
/// instead of symbolic register names in .cfi_* directives.
bool DwarfRegNumForCFI; // Defaults to false;
- //===--- CBE Asm Translation Table -----------------------------------===//
-
- const char *const *AsmTransCBE; // Defaults to empty
-
//===--- Prologue State ----------------------------------------------===//
std::vector<MachineMove> InitialFrameState;
@@ -388,14 +369,6 @@ namespace llvm {
const char *getGPRel64Directive() const { return GPRel64Directive; }
const char *getGPRel32Directive() const { return GPRel32Directive; }
- /// [Code|Data]Begin label name accessors.
- const char *getCodeBeginLabelName() const { return CodeBegin; }
- const char *getDataBeginLabelName() const { return DataBegin; }
- const char *getJumpTable8BeginLabelName() const { return JT8Begin; }
- const char *getJumpTable16BeginLabelName() const { return JT16Begin; }
- const char *getJumpTable32BeginLabelName() const { return JT32Begin; }
- bool getSupportsDataRegions() const { return SupportsDataRegions; }
-
/// getNonexecutableStackSection - Targets can implement this method to
/// specify a section to switch to if the translation unit doesn't have any
/// trampolines that require an executable stack.
@@ -492,6 +465,9 @@ namespace llvm {
bool doesAllowUTF8() const {
return AllowUTF8;
}
+ bool doesSupportDataRegionDirectives() const {
+ return UseDataRegionDirectives;
+ }
const char *getZeroDirective() const {
return ZeroDirective;
}
@@ -565,21 +541,12 @@ namespace llvm {
const char *getDwarfSectionOffsetDirective() const {
return DwarfSectionOffsetDirective;
}
- bool doesDwarfRequireRelocationForSectionOffset() const {
- return DwarfRequiresRelocationForSectionOffset;
- }
- bool doesDwarfUseLabelOffsetForRanges() const {
- return DwarfUsesLabelOffsetForRanges;
- }
- bool doesDwarfUseRelocationsForStringPool() const {
- return DwarfUsesRelocationsForStringPool;
+ bool doesDwarfUseRelocationsAcrossSections() const {
+ return DwarfUsesRelocationsAcrossSections;
}
bool useDwarfRegNumForCFI() const {
return DwarfRegNumForCFI;
}
- const char *const *getAsmCBE() const {
- return AsmTransCBE;
- }
void addInitialFrameState(MCSymbol *label, const MachineLocation &D,
const MachineLocation &S) {
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index d139173..b7b2d66 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -130,7 +130,7 @@ public:
void addFixup(MCFixup Fixup) {
// Enforce invariant that fixups are in offset order.
- assert((Fixups.empty() || Fixup.getOffset() > Fixups.back().getOffset()) &&
+ assert((Fixups.empty() || Fixup.getOffset() >= Fixups.back().getOffset()) &&
"Fixups must be added in order!");
Fixups.push_back(Fixup);
}
@@ -651,6 +651,16 @@ struct IndirectSymbolData {
MCSectionData *SectionData;
};
+// FIXME: Ditto this. Purely so the Streamer and the ObjectWriter can talk
+// to one another.
+struct DataRegionData {
+ // This enum should be kept in sync w/ the mach-o definition in
+ // llvm/Object/MachOFormat.h.
+ enum KindTy { Data = 1, JumpTable8, JumpTable16, JumpTable32 } Kind;
+ MCSymbol *Start;
+ MCSymbol *End;
+};
+
class MCAssembler {
friend class MCAsmLayout;
@@ -668,6 +678,10 @@ public:
const_indirect_symbol_iterator;
typedef std::vector<IndirectSymbolData>::iterator indirect_symbol_iterator;
+ typedef std::vector<DataRegionData>::const_iterator
+ const_data_region_iterator;
+ typedef std::vector<DataRegionData>::iterator data_region_iterator;
+
private:
MCAssembler(const MCAssembler&); // DO NOT IMPLEMENT
void operator=(const MCAssembler&); // DO NOT IMPLEMENT
@@ -698,6 +712,7 @@ private:
std::vector<IndirectSymbolData> IndirectSymbols;
+ std::vector<DataRegionData> DataRegions;
/// The set of function symbols for which a .thumb_func directive has
/// been seen.
//
@@ -884,6 +899,33 @@ public:
size_t indirect_symbol_size() const { return IndirectSymbols.size(); }
/// @}
+ /// @name Data Region List Access
+ /// @{
+
+ // FIXME: This is a total hack, this should not be here. Once things are
+ // factored so that the streamer has direct access to the .o writer, it can
+ // disappear.
+ std::vector<DataRegionData> &getDataRegions() {
+ return DataRegions;
+ }
+
+ data_region_iterator data_region_begin() {
+ return DataRegions.begin();
+ }
+ const_data_region_iterator data_region_begin() const {
+ return DataRegions.begin();
+ }
+
+ data_region_iterator data_region_end() {
+ return DataRegions.end();
+ }
+ const_data_region_iterator data_region_end() const {
+ return DataRegions.end();
+ }
+
+ size_t data_region_size() const { return DataRegions.size(); }
+
+ /// @}
/// @name Backend Data Access
/// @{
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index b586319..59545d3 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -161,6 +161,10 @@ namespace llvm {
/// with a unique but unspecified name.
MCSymbol *CreateTempSymbol();
+ /// getUniqueSymbolID() - Return a unique identifier for use in constructing
+ /// symbol names.
+ unsigned getUniqueSymbolID() { return NextUniqueID++; }
+
/// CreateDirectionalLocalSymbol - Create the definition of a directional
/// local symbol for numbered label (used for "1:" definitions).
MCSymbol *CreateDirectionalLocalSymbol(int64_t LocalLabelVal);
diff --git a/include/llvm/MC/MCDirectives.h b/include/llvm/MC/MCDirectives.h
index 9180d1b..0461766 100644
--- a/include/llvm/MC/MCDirectives.h
+++ b/include/llvm/MC/MCDirectives.h
@@ -52,6 +52,14 @@ enum MCAssemblerFlag {
MCAF_Code64 ///< .code64 (X86)
};
+enum MCDataRegionType {
+ MCDR_DataRegion, ///< .data_region
+ MCDR_DataRegionJT8, ///< .data_region jt8
+ MCDR_DataRegionJT16, ///< .data_region jt16
+ MCDR_DataRegionJT32, ///< .data_region jt32
+ MCDR_DataRegionEnd ///< .end_data_region
+};
+
} // end namespace llvm
#endif
diff --git a/include/llvm/MC/MCDisassembler.h b/include/llvm/MC/MCDisassembler.h
index 4b5fbec..53a9ce0 100644
--- a/include/llvm/MC/MCDisassembler.h
+++ b/include/llvm/MC/MCDisassembler.h
@@ -13,13 +13,13 @@
#include "llvm-c/Disassembler.h"
namespace llvm {
-
+
class MCInst;
class MCSubtargetInfo;
class MemoryObject;
class raw_ostream;
class MCContext;
-
+
struct EDInstInfo;
/// MCDisassembler - Superclass for all disassemblers. Consumes a memory region
@@ -58,12 +58,12 @@ public:
MCDisassembler(const MCSubtargetInfo &STI) : GetOpInfo(0), SymbolLookUp(0),
DisInfo(0), Ctx(0),
STI(STI), CommentStream(0) {}
-
+
virtual ~MCDisassembler();
-
+
/// getInstruction - Returns the disassembly of a single instruction.
///
- /// @param instr - An MCInst to populate with the contents of the
+ /// @param instr - An MCInst to populate with the contents of the
/// instruction.
/// @param size - A value to populate with the size of the instruction, or
/// the number of bytes consumed while attempting to decode
@@ -74,7 +74,7 @@ public:
/// @param vStream - The stream to print warnings and diagnostic messages on.
/// @param cStream - The stream to print comments and annotations on.
/// @return - MCDisassembler::Success if the instruction is valid,
- /// MCDisassembler::SoftFail if the instruction was
+ /// MCDisassembler::SoftFail if the instruction was
/// disassemblable but invalid,
/// MCDisassembler::Fail if the instruction was invalid.
virtual DecodeStatus getInstruction(MCInst& instr,
diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h
index f153cb0..abbe188 100644
--- a/include/llvm/MC/MCELFObjectWriter.h
+++ b/include/llvm/MC/MCELFObjectWriter.h
@@ -54,11 +54,13 @@ class MCELFObjectTargetWriter {
const uint16_t EMachine;
const unsigned HasRelocationAddend : 1;
const unsigned Is64Bit : 1;
+ const unsigned IsN64 : 1;
protected:
MCELFObjectTargetWriter(bool Is64Bit_, uint8_t OSABI_,
- uint16_t EMachine_, bool HasRelocationAddend_);
+ uint16_t EMachine_, bool HasRelocationAddend,
+ bool IsN64=false);
public:
static uint8_t getOSABI(Triple::OSType OSType) {
@@ -95,7 +97,47 @@ public:
uint16_t getEMachine() { return EMachine; }
bool hasRelocationAddend() { return HasRelocationAddend; }
bool is64Bit() const { return Is64Bit; }
+ bool isN64() const { return IsN64; }
/// @}
+
+ // Instead of changing everyone's API we pack the N64 Type fields
+ // into the existing 32 bit data unsigned.
+#define R_TYPE_SHIFT 0
+#define R_TYPE_MASK 0xffffff00
+#define R_TYPE2_SHIFT 8
+#define R_TYPE2_MASK 0xffff00ff
+#define R_TYPE3_SHIFT 16
+#define R_TYPE3_MASK 0xff00ffff
+#define R_SSYM_SHIFT 24
+#define R_SSYM_MASK 0x00ffffff
+
+ // N64 relocation type accessors
+ unsigned getRType(uint32_t Type) const {
+ return (unsigned)((Type >> R_TYPE_SHIFT) & 0xff);
+ }
+ unsigned getRType2(uint32_t Type) const {
+ return (unsigned)((Type >> R_TYPE2_SHIFT) & 0xff);
+ }
+ unsigned getRType3(uint32_t Type) const {
+ return (unsigned)((Type >> R_TYPE3_SHIFT) & 0xff);
+ }
+ unsigned getRSsym(uint32_t Type) const {
+ return (unsigned)((Type >> R_SSYM_SHIFT) & 0xff);
+ }
+
+ // N64 relocation type setting
+ unsigned setRType(unsigned Value, unsigned Type) const {
+ return ((Type & R_TYPE_MASK) | ((Value & 0xff) << R_TYPE_SHIFT));
+ }
+ unsigned setRType2(unsigned Value, unsigned Type) const {
+ return (Type & R_TYPE2_MASK) | ((Value & 0xff) << R_TYPE2_SHIFT);
+ }
+ unsigned setRType3(unsigned Value, unsigned Type) const {
+ return (Type & R_TYPE3_MASK) | ((Value & 0xff) << R_TYPE3_SHIFT);
+ }
+ unsigned setRSsym(unsigned Value, unsigned Type) const {
+ return (Type & R_SSYM_MASK) | ((Value & 0xff) << R_SSYM_SHIFT);
+ }
};
/// \brief Construct a new ELF writer instance.
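The four byte-wide N64 fields pack into a single 32-bit value and round-trip
through the accessors above. A hedged sketch with placeholder values (0x2a
and friends are not real MIPS relocation numbers; packN64 is a hypothetical
helper):

    #include <cassert>

    void packN64(const MCELFObjectTargetWriter &W) {
      unsigned Type = 0;
      Type = W.setRType (0x2a, Type);  // first relocation type in the chain
      Type = W.setRType2(0x2b, Type);  // second type in the chain
      Type = W.setRType3(0x2c, Type);  // third type in the chain
      Type = W.setRSsym (0x01, Type);  // special symbol field
      assert(W.getRType(Type)  == 0x2a && W.getRType2(Type) == 0x2b &&
             W.getRType3(Type) == 0x2c && W.getRSsym(Type)  == 0x01);
    }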
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index ff33641..aa62eb2 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -176,6 +176,8 @@ public:
VK_PPC_DARWIN_LO16, // lo16(symbol)
VK_PPC_GAS_HA16, // symbol@ha
VK_PPC_GAS_LO16, // symbol@l
+ VK_PPC_TPREL16_HA, // symbol@tprel@ha
+ VK_PPC_TPREL16_LO, // symbol@tprel@l
VK_Mips_GPREL,
VK_Mips_GOT_CALL,
@@ -194,7 +196,9 @@ public:
VK_Mips_GPOFF_LO,
VK_Mips_GOT_DISP,
VK_Mips_GOT_PAGE,
- VK_Mips_GOT_OFST
+ VK_Mips_GOT_OFST,
+ VK_Mips_HIGHER,
+ VK_Mips_HIGHEST
};
private:
diff --git a/include/llvm/MC/MCFixupKindInfo.h b/include/llvm/MC/MCFixupKindInfo.h
index 1961687..6979ad5 100644
--- a/include/llvm/MC/MCFixupKindInfo.h
+++ b/include/llvm/MC/MCFixupKindInfo.h
@@ -18,7 +18,7 @@ struct MCFixupKindInfo {
/// Is this fixup kind PCrelative? This is used by the assembler backend to
/// evaluate fixup values in a target independent manner when possible.
FKF_IsPCRel = (1 << 0),
-
+
/// Should this fixup kind force a 4-byte aligned effective PC value?
FKF_IsAlignedDownTo32Bits = (1 << 1)
};
diff --git a/include/llvm/MC/MCInstrItineraries.h b/include/llvm/MC/MCInstrItineraries.h
index e942892..65d1559 100644
--- a/include/llvm/MC/MCInstrItineraries.h
+++ b/include/llvm/MC/MCInstrItineraries.h
@@ -16,6 +16,7 @@
#ifndef LLVM_MC_MCINSTRITINERARIES_H
#define LLVM_MC_MCINSTRITINERARIES_H
+#include "llvm/MC/MCSchedule.h"
#include <algorithm>
namespace llvm {
@@ -95,7 +96,7 @@ struct InstrStage {
/// operands are read and written.
///
struct InstrItinerary {
- unsigned NumMicroOps; ///< # of micro-ops, 0 means it's variable
+ int NumMicroOps; ///< # of micro-ops, -1 means it's variable
unsigned FirstStage; ///< Index of first stage in itinerary
unsigned LastStage; ///< Index of last + 1 stage in itinerary
unsigned FirstOperandCycle; ///< Index of first operand rd/wr
@@ -109,21 +110,22 @@ struct InstrItinerary {
///
class InstrItineraryData {
public:
+ const MCSchedModel *SchedModel; ///< Basic machine properties.
const InstrStage *Stages; ///< Array of stages selected
const unsigned *OperandCycles; ///< Array of operand cycles selected
const unsigned *Forwardings; ///< Array of pipeline forwarding paths
const InstrItinerary *Itineraries; ///< Array of itineraries selected
- unsigned IssueWidth; ///< Max issue per cycle. 0=Unknown.
/// Ctors.
///
- InstrItineraryData() : Stages(0), OperandCycles(0), Forwardings(0),
- Itineraries(0), IssueWidth(0) {}
+ InstrItineraryData() : SchedModel(&MCSchedModel::DefaultSchedModel),
+ Stages(0), OperandCycles(0),
+ Forwardings(0), Itineraries(0) {}
- InstrItineraryData(const InstrStage *S, const unsigned *OS,
- const unsigned *F, const InstrItinerary *I)
- : Stages(S), OperandCycles(OS), Forwardings(F), Itineraries(I),
- IssueWidth(0) {}
+ InstrItineraryData(const MCSchedModel *SM, const InstrStage *S,
+ const unsigned *OS, const unsigned *F)
+ : SchedModel(SM), Stages(S), OperandCycles(OS), Forwardings(F),
+ Itineraries(SchedModel->InstrItineraries) {}
/// isEmpty - Returns true if there are no itineraries.
///
@@ -155,15 +157,17 @@ public:
/// class. The latency is the maximum completion time for any stage
/// in the itinerary.
///
+ /// InstrStages override the itinerary's MinLatency property. In fact, if the
+ /// stage latencies, which may be zero, are less than MinLatency,
+ /// getStageLatency returns a value less than MinLatency.
+ ///
+ /// If no stages exist, MinLatency is used. If MinLatency is invalid (<0),
+ /// then it defaults to one cycle.
unsigned getStageLatency(unsigned ItinClassIndx) const {
// If the target doesn't provide itinerary information, use a simple
- // non-zero default value for all instructions. Some target's provide a
- // dummy (Generic) itinerary which should be handled as if it's itinerary is
- // empty. We identify this by looking for a reference to stage zero (invalid
- // stage). This is different from beginStage == endState != 0, which could
- // be used for zero-latency pseudo ops.
- if (isEmpty() || Itineraries[ItinClassIndx].FirstStage == 0)
- return 1;
+ // non-zero default value for all instructions.
+ if (isEmpty())
+ return SchedModel->MinLatency < 0 ? 1 : SchedModel->MinLatency;
// Calculate the maximum completion time for any stage.
unsigned Latency = 0, StartCycle = 0;
@@ -238,16 +242,16 @@ public:
return UseCycle;
}
- /// isMicroCoded - Return true if the instructions in the given class decode
- /// to more than one micro-ops.
- bool isMicroCoded(unsigned ItinClassIndx) const {
+ /// getNumMicroOps - Return the number of micro-ops that the given class
+ /// decodes to. Return -1 for classes that require dynamic lookup via
+ /// TargetInstrInfo.
+ int getNumMicroOps(unsigned ItinClassIndx) const {
if (isEmpty())
- return false;
- return Itineraries[ItinClassIndx].NumMicroOps != 1;
+ return 1;
+ return Itineraries[ItinClassIndx].NumMicroOps;
}
};
-
} // End llvm namespace
#endif
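Under the new getNumMicroOps contract, a negative count means "variable;
resolve dynamically via TargetInstrInfo" instead of the old boolean
isMicroCoded test. A sketch of the expected caller pattern; resolveMicroOps
and queryTargetInstrInfo are hypothetical stand-ins:

    unsigned queryTargetInstrInfo();   // hypothetical dynamic lookup

    unsigned resolveMicroOps(const InstrItineraryData &Itins, unsigned Class) {
      int N = Itins.getNumMicroOps(Class);
      if (N >= 0)
        return N;                      // statically known micro-op count
      return queryTargetInstrInfo();   // -1: target must decide per instruction
    }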
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index 9bb598f..949d907 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -179,6 +179,9 @@ public:
void WriteNlist(MachSymbolData &MSD, const MCAsmLayout &Layout);
+ void WriteLinkeditLoadCommand(uint32_t Type, uint32_t DataOffset,
+ uint32_t DataSize);
+
// FIXME: We really need to improve the relocation validation. Basically, we
// want to implement a separate computation which evaluates the relocation
// entry as the linker would, and verifies that the resultant fixup value is
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index aea4b41..74e2263 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -22,17 +22,17 @@ namespace llvm {
class StringRef;
class Triple;
-class MCObjectFileInfo {
+class MCObjectFileInfo {
protected:
/// CommDirectiveSupportsAlignment - True if .comm supports alignment. This
/// is a hack for as long as we support 10.4 Tiger, whose assembler doesn't
/// support alignment on comm.
bool CommDirectiveSupportsAlignment;
-
+
/// SupportsWeakEmptyEHFrame - True if target object file supports a
/// weak_definition of constant 0 for an omitted EH frame.
bool SupportsWeakOmittedEHFrame;
-
+
/// IsFunctionEHFrameSymbolPrivate - This flag is set to true if the
/// "EH_frame" symbol for EH information should be an assembler temporary (aka
/// private linkage, aka an L or .L label) or false if it should be a normal
@@ -53,20 +53,20 @@ protected:
/// TextSection - Section directive for standard text.
///
const MCSection *TextSection;
-
+
/// DataSection - Section directive for standard data.
///
const MCSection *DataSection;
-
+
/// BSSSection - Section that is default initialized to zero.
const MCSection *BSSSection;
-
+
/// ReadOnlySection - Section that is readonly and can contain arbitrary
/// initialized data. Targets are not required to have a readonly section.
/// If they don't, various bits of code will fall back to using the data
/// section for constants.
const MCSection *ReadOnlySection;
-
+
/// StaticCtorSection - This section contains the static constructor pointer
/// list.
const MCSection *StaticCtorSection;
@@ -74,7 +74,7 @@ protected:
/// StaticDtorSection - This section contains the static destructor pointer
/// list.
const MCSection *StaticDtorSection;
-
+
/// LSDASection - If exception handling is supported by the target, this is
/// the section the Language Specific Data Area information is emitted to.
const MCSection *LSDASection;
@@ -109,7 +109,7 @@ protected:
// Extra TLS Variable Data section. If the target needs to put additional
// information for a TLS variable, it'll go here.
const MCSection *TLSExtraDataSection;
-
+
/// TLSDataSection - Section directive for Thread Local data.
/// ELF, MachO and COFF.
const MCSection *TLSDataSection; // Defaults to ".tdata".
@@ -141,11 +141,11 @@ protected:
/// Contains the source code name of the variable, visibility and a pointer
/// to the initial value (.tdata or .tbss).
const MCSection *TLSTLVSection; // Defaults to ".tlv".
-
+
/// TLSThreadInitSection - Section for thread local data initialization
/// functions.
const MCSection *TLSThreadInitSection; // Defaults to ".thread_init_func".
-
+
const MCSection *CStringSection;
const MCSection *UStringSection;
const MCSection *TextCoalSection;
@@ -169,7 +169,7 @@ protected:
public:
void InitMCObjectFileInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM,
MCContext &ctx);
-
+
bool isFunctionEHFrameSymbolPrivate() const {
return IsFunctionEHFrameSymbolPrivate;
}
diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h
index 6e44e6ce..9591a00 100644
--- a/include/llvm/MC/MCObjectWriter.h
+++ b/include/llvm/MC/MCObjectWriter.h
@@ -182,11 +182,6 @@ public:
/// @}
- /// Utility function to encode a SLEB128 value.
- static void EncodeSLEB128(int64_t Value, raw_ostream &OS);
- /// Utility function to encode a ULEB128 value.
- static void EncodeULEB128(uint64_t Value, raw_ostream &OS,
- unsigned Padding = 0);
};
} // End llvm namespace
diff --git a/include/llvm/MC/MCRegisterInfo.h b/include/llvm/MC/MCRegisterInfo.h
index 27acf2f..46a9d71 100644
--- a/include/llvm/MC/MCRegisterInfo.h
+++ b/include/llvm/MC/MCRegisterInfo.h
@@ -106,10 +106,18 @@ public:
/// of AX.
///
struct MCRegisterDesc {
- const char *Name; // Printable name for the reg (for debugging)
- uint32_t Overlaps; // Overlapping registers, described above
- uint32_t SubRegs; // Sub-register set, described above
- uint32_t SuperRegs; // Super-register set, described above
+ uint32_t Name; // Printable name for the reg (for debugging)
+ uint32_t Overlaps; // Overlapping registers, described above
+ uint32_t SubRegs; // Sub-register set, described above
+ uint32_t SuperRegs; // Super-register set, described above
+
+ // Offset into MCRI::SubRegIndices of a list of sub-register indices for each
+ // sub-register in SubRegs.
+ uint32_t SubRegIndices;
+
+ // RegUnits - Points to the list of register units. The low 4 bits hold the
+ // Scale, the high bits hold an offset into DiffLists. See MCRegUnitIterator.
+ uint32_t RegUnits;
};
/// MCRegisterInfo base class - We assume that the target defines a static
@@ -142,10 +150,15 @@ private:
unsigned RAReg; // Return address register
const MCRegisterClass *Classes; // Pointer to the regclass array
unsigned NumClasses; // Number of entries in the array
- const uint16_t *RegLists; // Pointer to the reglists array
+ unsigned NumRegUnits; // Number of regunits.
+ const uint16_t (*RegUnitRoots)[2]; // Pointer to regunit root table.
+ const uint16_t *DiffLists; // Pointer to the difflists array
+ const char *RegStrings; // Pointer to the string table.
const uint16_t *SubRegIndices; // Pointer to the subreg lookup
// array.
unsigned NumSubRegIndices; // Number of subreg indices.
+ const uint16_t *RegEncodingTable; // Pointer to array of register
+ // encodings.
unsigned L2DwarfRegsSize;
unsigned EHL2DwarfRegsSize;
@@ -158,21 +171,83 @@ private:
DenseMap<unsigned, int> L2SEHRegs; // LLVM to SEH regs mapping
public:
+ /// DiffListIterator - Base iterator class that can traverse the
+ /// differentially encoded register and regunit lists in DiffLists.
+ /// Don't use this class directly, use one of the specialized sub-classes
+ /// defined below.
+ class DiffListIterator {
+ uint16_t Val;
+ const uint16_t *List;
+
+ protected:
+ /// Create an invalid iterator. Call init() to point to something useful.
+ DiffListIterator() : Val(0), List(0) {}
+
+ /// init - Point the iterator to InitVal, decoding subsequent values from
+ /// DiffList. The iterator will initially point to InitVal, sub-classes are
+ /// responsible for skipping the seed value if it is not part of the list.
+ void init(uint16_t InitVal, const uint16_t *DiffList) {
+ Val = InitVal;
+ List = DiffList;
+ }
+
+ /// advance - Move to the next list position, return the applied
+ /// differential. This function does not detect the end of the list; that
+ /// is the caller's responsibility (by checking for a 0 return value).
+ unsigned advance() {
+ assert(isValid() && "Cannot move off the end of the list.");
+ uint16_t D = *List++;
+ Val += D;
+ return D;
+ }
+
+ public:
+
+ /// isValid - returns true if this iterator is not yet at the end.
+ bool isValid() const { return List; }
+
+ /// Dereference the iterator to get the value at the current position.
+ unsigned operator*() const { return Val; }
+
+ /// Pre-increment to move to the next position.
+ void operator++() {
+ // The end of the list is encoded as a 0 differential.
+ if (!advance())
+ List = 0;
+ }
+ };
+
+ // These iterators are allowed to sub-class DiffListIterator and access
+ // internal list pointers.
+ friend class MCSubRegIterator;
+ friend class MCSuperRegIterator;
+ friend class MCRegAliasIterator;
+ friend class MCRegUnitIterator;
+ friend class MCRegUnitRootIterator;
+
/// InitMCRegisterInfo - Initialize MCRegisterInfo, called by TableGen
/// auto-generated routines. *DO NOT USE*.
void InitMCRegisterInfo(const MCRegisterDesc *D, unsigned NR, unsigned RA,
const MCRegisterClass *C, unsigned NC,
- const uint16_t *RL,
+ const uint16_t (*RURoots)[2],
+ unsigned NRU,
+ const uint16_t *DL,
+ const char *Strings,
const uint16_t *SubIndices,
- unsigned NumIndices) {
+ unsigned NumIndices,
+ const uint16_t *RET) {
Desc = D;
NumRegs = NR;
RAReg = RA;
Classes = C;
- RegLists = RL;
+ DiffLists = DL;
+ RegStrings = Strings;
NumClasses = NC;
+ RegUnitRoots = RURoots;
+ NumRegUnits = NRU;
SubRegIndices = SubIndices;
NumSubRegIndices = NumIndices;
+ RegEncodingTable = RET;
}
/// mapLLVMRegsToDwarfRegs - Used to initialize LLVM register to Dwarf
@@ -231,73 +306,25 @@ public:
return operator[](RegNo);
}
- /// getAliasSet - Return the set of registers aliased by the specified
- /// register, or a null list of there are none. The list returned is zero
- /// terminated.
- ///
- const uint16_t *getAliasSet(unsigned RegNo) const {
- // The Overlaps set always begins with Reg itself.
- return RegLists + get(RegNo).Overlaps + 1;
- }
-
- /// getOverlaps - Return a list of registers that overlap Reg, including
- /// itself. This is the same as the alias set except Reg is included in the
- /// list.
- /// These are exactly the registers in { x | regsOverlap(x, Reg) }.
- ///
- const uint16_t *getOverlaps(unsigned RegNo) const {
- return RegLists + get(RegNo).Overlaps;
- }
-
- /// getSubRegisters - Return the list of registers that are sub-registers of
- /// the specified register, or a null list of there are none. The list
- /// returned is zero terminated and sorted according to super-sub register
- /// relations. e.g. X86::RAX's sub-register list is EAX, AX, AL, AH.
- ///
- const uint16_t *getSubRegisters(unsigned RegNo) const {
- return RegLists + get(RegNo).SubRegs;
- }
-
/// getSubReg - Returns the physical register number of sub-register "Index"
/// for physical register RegNo. Return zero if the sub-register does not
/// exist.
- unsigned getSubReg(unsigned Reg, unsigned Idx) const {
- return *(SubRegIndices + (Reg - 1) * NumSubRegIndices + Idx - 1);
- }
+ unsigned getSubReg(unsigned Reg, unsigned Idx) const;
/// getMatchingSuperReg - Return a super-register of the specified register
/// Reg so its sub-register of index SubIdx is Reg.
unsigned getMatchingSuperReg(unsigned Reg, unsigned SubIdx,
- const MCRegisterClass *RC) const {
- for (const uint16_t *SRs = getSuperRegisters(Reg); unsigned SR = *SRs;++SRs)
- if (Reg == getSubReg(SR, SubIdx) && RC->contains(SR))
- return SR;
- return 0;
- }
+ const MCRegisterClass *RC) const;
/// getSubRegIndex - For a given register pair, return the sub-register index
/// if the second register is a sub-register of the first. Return zero
/// otherwise.
- unsigned getSubRegIndex(unsigned RegNo, unsigned SubRegNo) const {
- for (unsigned I = 1; I <= NumSubRegIndices; ++I)
- if (getSubReg(RegNo, I) == SubRegNo)
- return I;
- return 0;
- }
-
- /// getSuperRegisters - Return the list of registers that are super-registers
- /// of the specified register, or a null list of there are none. The list
- /// returned is zero terminated and sorted according to super-sub register
- /// relations. e.g. X86::AL's super-register list is AX, EAX, RAX.
- ///
- const uint16_t *getSuperRegisters(unsigned RegNo) const {
- return RegLists + get(RegNo).SuperRegs;
- }
+ unsigned getSubRegIndex(unsigned RegNo, unsigned SubRegNo) const;
/// getName - Return the human-readable symbolic target-specific name for the
/// specified physical register.
const char *getName(unsigned RegNo) const {
- return get(RegNo).Name;
+ return RegStrings + get(RegNo).Name;
}
/// getNumRegs - Return the number of registers this target has (useful for
@@ -306,40 +333,26 @@ public:
return NumRegs;
}
+ /// getNumRegUnits - Return the number of (native) register units in the
+ /// target. Register units are numbered from 0 to getNumRegUnits() - 1. They
+ /// can be accessed through MCRegUnitIterator defined below.
+ unsigned getNumRegUnits() const {
+ return NumRegUnits;
+ }
+
/// getDwarfRegNum - Map a target register to an equivalent dwarf register
/// number. Returns -1 if there is no equivalent value. The second
/// parameter allows targets to use different numberings for EH info and
/// debugging info.
- int getDwarfRegNum(unsigned RegNum, bool isEH) const {
- const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs;
- unsigned Size = isEH ? EHL2DwarfRegsSize : L2DwarfRegsSize;
-
- DwarfLLVMRegPair Key = { RegNum, 0 };
- const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key);
- if (I == M+Size || I->FromReg != RegNum)
- return -1;
- return I->ToReg;
- }
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
/// getLLVMRegNum - Map a dwarf register back to a target register.
///
- int getLLVMRegNum(unsigned RegNum, bool isEH) const {
- const DwarfLLVMRegPair *M = isEH ? EHDwarf2LRegs : Dwarf2LRegs;
- unsigned Size = isEH ? EHDwarf2LRegsSize : Dwarf2LRegsSize;
-
- DwarfLLVMRegPair Key = { RegNum, 0 };
- const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key);
- assert(I != M+Size && I->FromReg == RegNum && "Invalid RegNum");
- return I->ToReg;
- }
+ int getLLVMRegNum(unsigned RegNum, bool isEH) const;
/// getSEHRegNum - Map a target register to an equivalent SEH register
/// number. Returns LLVM register number if there is no equivalent value.
- int getSEHRegNum(unsigned RegNum) const {
- const DenseMap<unsigned, int>::const_iterator I = L2SEHRegs.find(RegNum);
- if (I == L2SEHRegs.end()) return (int)RegNum;
- return I->second;
- }
+ int getSEHRegNum(unsigned RegNum) const;
regclass_iterator regclass_begin() const { return Classes; }
regclass_iterator regclass_end() const { return Classes+NumClasses; }
@@ -354,6 +367,128 @@ public:
assert(i < getNumRegClasses() && "Register Class ID out of range");
return Classes[i];
}
+
+ /// getEncodingValue - Returns the encoding for RegNo
+ uint16_t getEncodingValue(unsigned RegNo) const {
+ assert(RegNo < NumRegs &&
+ "Attempting to get encoding for invalid register number!");
+ return RegEncodingTable[RegNo];
+ }
+
+};
+
+//===----------------------------------------------------------------------===//
+// Register List Iterators
+//===----------------------------------------------------------------------===//
+
+// MCRegisterInfo provides lists of super-registers, sub-registers, and
+// aliasing registers. Use these iterator classes to traverse the lists.
+
+/// MCSubRegIterator enumerates all sub-registers of Reg.
+class MCSubRegIterator : public MCRegisterInfo::DiffListIterator {
+public:
+ MCSubRegIterator(unsigned Reg, const MCRegisterInfo *MCRI) {
+ init(Reg, MCRI->DiffLists + MCRI->get(Reg).SubRegs);
+ ++*this;
+ }
+};
+
+/// MCSuperRegIterator enumerates all super-registers of Reg.
+class MCSuperRegIterator : public MCRegisterInfo::DiffListIterator {
+public:
+ MCSuperRegIterator(unsigned Reg, const MCRegisterInfo *MCRI) {
+ init(Reg, MCRI->DiffLists + MCRI->get(Reg).SuperRegs);
+ ++*this;
+ }
+};
+
+/// MCRegAliasIterator enumerates all registers aliasing Reg.
+/// If IncludeSelf is set, Reg itself is included in the list.
+class MCRegAliasIterator : public MCRegisterInfo::DiffListIterator {
+public:
+ MCRegAliasIterator(unsigned Reg, const MCRegisterInfo *MCRI,
+ bool IncludeSelf) {
+ init(Reg, MCRI->DiffLists + MCRI->get(Reg).Overlaps);
+ // Initially, the iterator points to Reg itself.
+ if (!IncludeSelf)
+ ++*this;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Register Units
+//===----------------------------------------------------------------------===//
+
+// Register units are used to compute register aliasing. Every register has at
+// least one register unit, but it can have more. Two registers overlap if and
+// only if they have a common register unit.
+//
+// A target with a complicated sub-register structure will typically have many
+// fewer register units than actual registers. MCRI::getNumRegUnits() returns
+// the number of register units in the target.
+
+// MCRegUnitIterator enumerates a list of register units for Reg. The list is
+// in ascending numerical order.
+class MCRegUnitIterator : public MCRegisterInfo::DiffListIterator {
+public:
+ /// MCRegUnitIterator - Create an iterator that traverses the register units
+ /// in Reg.
+ MCRegUnitIterator(unsigned Reg, const MCRegisterInfo *MCRI) {
+ // Decode the RegUnits MCRegisterDesc field.
+ unsigned RU = MCRI->get(Reg).RegUnits;
+ unsigned Scale = RU & 15;
+ unsigned Offset = RU >> 4;
+
+ // Initialize the iterator to Reg * Scale, and the List pointer to
+ // DiffLists + Offset.
+ init(Reg * Scale, MCRI->DiffLists + Offset);
+
+ // That may not be a valid unit; we need to advance by one to get the real
+ // unit number. The first differential can be 0 which would normally
+ // terminate the list, but since we know every register has at least one
+ // unit, we can allow a 0 differential here.
+ advance();
+ }
+};
+
+// Each register unit has one or two root registers. The complete set of
+// registers containing a register unit is the union of the roots and their
+// super-registers. All registers aliasing Unit can be visited like this:
+//
+// for (MCRegUnitRootIterator RI(Unit, MCRI); RI.isValid(); ++RI) {
+// unsigned Root = *RI;
+// visit(Root);
+// for (MCSuperRegIterator SI(Root, MCRI); SI.isValid(); ++SI)
+// visit(*SI);
+// }
+
+/// MCRegUnitRootIterator enumerates the root registers of a register unit.
+class MCRegUnitRootIterator {
+ uint16_t Reg0;
+ uint16_t Reg1;
+public:
+ MCRegUnitRootIterator(unsigned RegUnit, const MCRegisterInfo *MCRI) {
+ assert(RegUnit < MCRI->getNumRegUnits() && "Invalid register unit");
+ Reg0 = MCRI->RegUnitRoots[RegUnit][0];
+ Reg1 = MCRI->RegUnitRoots[RegUnit][1];
+ }
+
+ /// Dereference to get the current root register.
+ unsigned operator*() const {
+ return Reg0;
+ }
+
+ /// isValid - Check if the iterator is at the end of the list.
+ bool isValid() const {
+ return Reg0;
+ }
+
+ /// Preincrement to move to the next root register.
+ void operator++() {
+ assert(isValid() && "Cannot move off the end of the list.");
+ Reg0 = Reg1;
+ Reg1 = 0;
+ }
};
} // End llvm namespace
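The aliasing rule stated above (two registers overlap iff they share a
register unit) can be checked directly with the new iterators. A minimal
sketch; since unit lists are in ascending order a linear merge would also
work, but the nested loop keeps it short (unitsOverlap is our name):

    static bool unitsOverlap(unsigned A, unsigned B,
                             const MCRegisterInfo *MCRI) {
      for (MCRegUnitIterator UA(A, MCRI); UA.isValid(); ++UA)
        for (MCRegUnitIterator UB(B, MCRI); UB.isValid(); ++UB)
          if (*UA == *UB)
            return true;   // common unit => the registers overlap
      return false;
    }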
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
new file mode 100644
index 0000000..3b1cdf1
--- /dev/null
+++ b/include/llvm/MC/MCSchedule.h
@@ -0,0 +1,114 @@
+//===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the classes used to describe a subtarget's machine model
+// for scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCSCHEDMODEL_H
+#define LLVM_MC_MCSCHEDMODEL_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+struct InstrItinerary;
+
+/// Machine model for scheduling, bundling, and heuristics.
+///
+/// The machine model directly provides basic information about the
+/// microarchitecture to the scheduler in the form of properties. It also
+/// optionally refers to scheduler resource tables and itinerary
+/// tables. Scheduler resource tables model the latency and cost for each
+/// instruction type. Itinerary tables are an independent mechanism that
+/// provides a detailed reservation table describing each cycle of instruction
+/// execution. Subtargets may define any or all of the above categories of data
+/// depending on the type of CPU and selected scheduler.
+class MCSchedModel {
+public:
+ static MCSchedModel DefaultSchedModel; // For unknown processors.
+
+ // IssueWidth is the maximum number of instructions that may be scheduled in
+ // the same per-cycle group.
+ unsigned IssueWidth;
+ static const unsigned DefaultIssueWidth = 1;
+
+ // MinLatency is the minimum latency between a register write
+ // followed by a data dependent read. This determines which
+ // instructions may be scheduled in the same per-cycle group. This
+ // is distinct from *expected* latency, which determines the likely
+ // critical path but does not guarantee a pipeline
+ // hazard. MinLatency can always be overridden by the number of
+ // InstrStage cycles.
+ //
+ // (-1) Standard in-order processor.
+ // Use InstrItinerary OperandCycles as MinLatency.
+ // If no OperandCycles exist, then use the cycle of the last InstrStage.
+ //
+ // (0) Out-of-order processor, or in-order with bundled dependencies.
+ // RAW dependencies may be dispatched in the same cycle.
+ // Optional InstrItinerary OperandCycles provides expected latency.
+ //
+ // (>0) In-order processor with variable latencies.
+ // Use the greater of this value or the cycle of the last InstrStage.
+ // Optional InstrItinerary OperandCycles provides expected latency.
+ // TODO: can't yet specify both min and expected latency per operand.
+ int MinLatency;
+ static const int DefaultMinLatency = -1;
+
+ // LoadLatency is the expected latency of load instructions.
+ //
+ // If MinLatency >= 0, this may be overridden for individual load opcodes by
+ // InstrItinerary OperandCycles.
+ unsigned LoadLatency;
+ static const unsigned DefaultLoadLatency = 4;
+
+ // HighLatency is the expected latency of "very high latency" operations.
+ // See TargetInstrInfo::isHighLatencyDef().
+ // By default, this is set to an arbitrarily high number of cycles
+ // likely to have some impact on scheduling heuristics.
+ // If MinLatency >= 0, this may be overridden by InstrItinData OperandCycles.
+ unsigned HighLatency;
+ static const unsigned DefaultHighLatency = 10;
+
+ // MispredictPenalty is the typical number of extra cycles the processor
+ // takes to recover from a branch misprediction.
+ unsigned MispredictPenalty;
+ static const unsigned DefaultMispredictPenalty = 10;
+
+private:
+ // TODO: Add a reference to proc resource types and sched resource tables.
+
+ // Instruction itinerary tables used by InstrItineraryData.
+ friend class InstrItineraryData;
+ const InstrItinerary *InstrItineraries;
+
+public:
+ // Defaults must be specified as static const literals so that TableGen'd
+ // target code can use them in static initializers. The defaults need to be
+ // initialized in this default ctor because some clients directly instantiate
+ // MCSchedModel instead of using a generated itinerary.
+ MCSchedModel(): IssueWidth(DefaultIssueWidth),
+ MinLatency(DefaultMinLatency),
+ LoadLatency(DefaultLoadLatency),
+ HighLatency(DefaultHighLatency),
+ MispredictPenalty(DefaultMispredictPenalty),
+ InstrItineraries(0) {}
+
+ // Table-gen driven ctor.
+ MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp,
+ const InstrItinerary *ii):
+ IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl),
+ MispredictPenalty(mp), InstrItineraries(ii) {}
+};
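+
+// An illustrative construction using the table-gen driven ctor; the cycle
+// counts below are invented for the example, not taken from any real
+// subtarget:
+//
+//   static const MCSchedModel ExampleModel(2,  // IssueWidth
+//                                          0,  // MinLatency (out-of-order)
+//                                          3,  // LoadLatency
+//                                          20, // HighLatency
+//                                          14, // MispredictPenalty
+//                                          0); // No InstrItineraries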
+
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 2595600..e8c3e59 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -69,22 +69,7 @@ namespace llvm {
SmallVector<std::pair<const MCSection *,
const MCSection *>, 4> SectionStack;
- unsigned UniqueCodeBeginSuffix;
- unsigned UniqueDataBeginSuffix;
-
protected:
- /// Indicator of whether the previous data-or-code indicator was for
- /// code or not. Used to determine when we need to emit a new indicator.
- enum DataType {
- Data,
- Code,
- JumpTable8,
- JumpTable16,
- JumpTable32
- };
- DataType RegionIndicator;
-
-
MCStreamer(MCContext &Ctx);
const MCExpr *BuildSymbolDiff(MCContext &Context, const MCSymbol *A,
@@ -241,47 +226,15 @@ namespace llvm {
/// used in an assignment.
virtual void EmitLabel(MCSymbol *Symbol);
- /// EmitDataRegion - Emit a label that marks the beginning of a data
- /// region.
- /// On ELF targets, this corresponds to an assembler statement such as:
- /// $d.1:
- virtual void EmitDataRegion();
-
- /// EmitJumpTable8Region - Emit a label that marks the beginning of a
- /// jump table composed of 8-bit offsets.
- /// On ELF targets, this corresponds to an assembler statement such as:
- /// $d.1:
- virtual void EmitJumpTable8Region();
-
- /// EmitJumpTable16Region - Emit a label that marks the beginning of a
- /// jump table composed of 16-bit offsets.
- /// On ELF targets, this corresponds to an assembler statement such as:
- /// $d.1:
- virtual void EmitJumpTable16Region();
-
- /// EmitJumpTable32Region - Emit a label that marks the beginning of a
- /// jump table composed of 32-bit offsets.
- /// On ELF targets, this corresponds to an assembler statement such as:
- /// $d.1:
- virtual void EmitJumpTable32Region();
-
- /// EmitCodeRegion - Emit a label that marks the beginning of a code
- /// region.
- /// On ELF targets, this corresponds to an assembler statement such as:
- /// $a.1:
- virtual void EmitCodeRegion();
-
- /// ForceCodeRegion - Forcibly sets the current region mode to code. Used
- /// at function entry points.
- void ForceCodeRegion() { RegionIndicator = Code; }
-
-
virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol);
- /// EmitAssemblerFlag - Note in the output the specified @p Flag
+ /// EmitAssemblerFlag - Note in the output the specified @p Flag.
virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) = 0;
+ /// EmitDataRegion - Note in the output the specified region @p Kind.
+ virtual void EmitDataRegion(MCDataRegionType Kind) {}
+
/// EmitThumbFunc - Note in the output that the specified @p Func is
/// a Thumb mode function (ARM target only).
virtual void EmitThumbFunc(MCSymbol *Func) = 0;
@@ -373,7 +326,7 @@ namespace llvm {
/// @param ByteAlignment - The alignment of the zerofill symbol if
/// non-zero. This must be a power of 2 on some targets.
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
- unsigned Size = 0,unsigned ByteAlignment = 0) = 0;
+ uint64_t Size = 0, unsigned ByteAlignment = 0) = 0;
/// EmitTBSSSymbol - Emit a thread local bss (.tbss) symbol.
///
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 3b53f20..31d632d 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -30,10 +30,10 @@ class MCSubtargetInfo {
std::string TargetTriple; // Target triple
const SubtargetFeatureKV *ProcFeatures; // Processor feature list
const SubtargetFeatureKV *ProcDesc; // Processor descriptions
- const SubtargetInfoKV *ProcItins; // Scheduling itineraries
- const InstrStage *Stages; // Instruction stages
- const unsigned *OperandCycles; // Operand cycles
- const unsigned *ForwardingPathes; // Forwarding pathes
+ const SubtargetInfoKV *ProcSchedModel; // Scheduler machine model
+ const InstrStage *Stages; // Instruction itinerary stages
+ const unsigned *OperandCycles; // Itinerary operand cycles
+ const unsigned *ForwardingPaths; // Forwarding paths
unsigned NumFeatures; // Number of processor features
unsigned NumProcs; // Number of processors
uint64_t FeatureBits; // Feature bits for current CPU + FS
@@ -42,7 +42,8 @@ public:
void InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
const SubtargetFeatureKV *PF,
const SubtargetFeatureKV *PD,
- const SubtargetInfoKV *PI, const InstrStage *IS,
+ const SubtargetInfoKV *ProcSched,
+ const InstrStage *IS,
const unsigned *OC, const unsigned *FP,
unsigned NF, unsigned NP);
@@ -69,6 +70,10 @@ public:
/// bits. This version will also change all implied bits.
uint64_t ToggleFeature(StringRef FS);
+ /// getSchedModelForCPU - Get the machine model of a CPU.
+ ///
+ MCSchedModel *getSchedModelForCPU(StringRef CPU) const;
+
/// getInstrItineraryForCPU - Get scheduling itinerary of a CPU.
///
InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
diff --git a/include/llvm/MC/MCTargetAsmLexer.h b/include/llvm/MC/MCTargetAsmLexer.h
index acb3d4d..f5c8c09 100644
--- a/include/llvm/MC/MCTargetAsmLexer.h
+++ b/include/llvm/MC/MCTargetAsmLexer.h
@@ -14,72 +14,72 @@
namespace llvm {
class Target;
-
+
/// MCTargetAsmLexer - Generic interface to target specific assembly lexers.
class MCTargetAsmLexer {
/// The current token
AsmToken CurTok;
-
+
/// The location and description of the current error
SMLoc ErrLoc;
std::string Err;
-
+
MCTargetAsmLexer(const MCTargetAsmLexer &); // DO NOT IMPLEMENT
void operator=(const MCTargetAsmLexer &); // DO NOT IMPLEMENT
protected: // Can only create subclasses.
MCTargetAsmLexer(const Target &);
-
+
virtual AsmToken LexToken() = 0;
-
+
void SetError(const SMLoc &errLoc, const std::string &err) {
ErrLoc = errLoc;
Err = err;
}
-
+
/// TheTarget - The Target that this machine was created for.
const Target &TheTarget;
MCAsmLexer *Lexer;
-
+
public:
virtual ~MCTargetAsmLexer();
-
+
const Target &getTarget() const { return TheTarget; }
-
+
/// InstallLexer - Set the lexer to get tokens from lower-level lexer \arg L.
void InstallLexer(MCAsmLexer &L) {
Lexer = &L;
}
-
+
MCAsmLexer *getLexer() {
return Lexer;
}
-
+
/// Lex - Consume the next token from the input stream and return it.
const AsmToken &Lex() {
return CurTok = LexToken();
}
-
+
/// getTok - Get the current (last) lexed token.
const AsmToken &getTok() {
return CurTok;
}
-
+
/// getErrLoc - Get the current error location
const SMLoc &getErrLoc() {
return ErrLoc;
}
-
+
/// getErr - Get the current error string
const std::string &getErr() {
return Err;
}
-
+
/// getKind - Get the kind of current token.
AsmToken::TokenKind getKind() const { return CurTok.getKind(); }
-
+
/// is - Check if the current token has kind \arg K.
bool is(AsmToken::TokenKind K) const { return CurTok.is(K); }
-
+
/// isNot - Check if the current token has kind \arg K.
bool isNot(AsmToken::TokenKind K) const { return CurTok.isNot(K); }
};
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index 4e3fd0d..929a204 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -79,6 +79,19 @@ public:
/// \param DirectiveID - the identifier token of the directive.
virtual bool ParseDirective(AsmToken DirectiveID) = 0;
+ /// MatchInstruction - Recognize a series of operands of a parsed instruction
+ /// as an actual MCInst. This returns false on success and returns true on
+ /// failure to match.
+ ///
+ /// On failure, the target parser is responsible for emitting a diagnostic
+ /// explaining the match failure.
+ virtual bool
+ MatchInstruction(SMLoc IDLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ SmallVectorImpl<MCInst> &MCInsts) {
+ return true;
+ }
+
/// MatchAndEmitInstruction - Recognize a series of operands of a parsed
/// instruction as an actual MCInst and emit it to the specified MCStreamer.
/// This returns false on success and returns true on failure to match.
diff --git a/include/llvm/MC/MachineLocation.h b/include/llvm/MC/MachineLocation.h
index 8ddfdbc..5caad33 100644
--- a/include/llvm/MC/MachineLocation.h
+++ b/include/llvm/MC/MachineLocation.h
@@ -11,7 +11,7 @@
// from a base address plus an offset. Register indirection can be specified by
// using an offset of zero.
//
-// The MachineMove class is used to represent abstract move operations in the
+// The MachineMove class is used to represent abstract move operations in the
// prolog/epilog of a compiled function. A collection of these objects can be
// used by a debug consumer to track the location of values when unwinding stack
// frames.
@@ -23,7 +23,7 @@
namespace llvm {
class MCSymbol;
-
+
class MachineLocation {
private:
bool IsRegister; // True if location is a register.
@@ -46,7 +46,7 @@ public:
return IsRegister == Other.IsRegister && Register == Other.Register &&
Offset == Other.Offset;
}
-
+
// Accessors
bool isReg() const { return IsRegister; }
unsigned getReg() const { return Register; }
@@ -77,7 +77,7 @@ private:
/// Label - Symbol for post-instruction address when result of move takes
/// effect.
MCSymbol *Label;
-
+
// Move to & from location.
MachineLocation Destination, Source;
public:
@@ -86,7 +86,7 @@ public:
MachineMove(MCSymbol *label, const MachineLocation &D,
const MachineLocation &S)
: Label(label), Destination(D), Source(S) {}
-
+
// Accessors
MCSymbol *getLabel() const { return Label; }
const MachineLocation &getDestination() const { return Destination; }
diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h
index 1a7dc92..507d882 100644
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h
@@ -25,7 +25,7 @@
namespace llvm {
class raw_ostream;
class StringRef;
-
+
//===----------------------------------------------------------------------===//
///
/// SubtargetFeatureKV - Used to provide key value pairs for feature and
@@ -36,13 +36,13 @@ struct SubtargetFeatureKV {
const char *Desc; // Help descriptor
uint64_t Value; // K-V integer value
uint64_t Implies; // K-V bit mask
-
+
// Compare routine for std binary search
bool operator<(const SubtargetFeatureKV &S) const {
return strcmp(Key, S.Key) < 0;
}
};
-
+
//===----------------------------------------------------------------------===//
///
/// SubtargetInfoKV - Used to provide key value pairs for CPU and arbitrary
@@ -51,16 +51,16 @@ struct SubtargetFeatureKV {
struct SubtargetInfoKV {
const char *Key; // K-V key string
void *Value; // K-V pointer value
-
+
// Compare routine for std binary search
bool operator<(const SubtargetInfoKV &S) const {
return strcmp(Key, S.Key) < 0;
}
};
-
+
//===----------------------------------------------------------------------===//
///
-/// SubtargetFeatures - Manages the enabling and disabling of subtarget
+/// SubtargetFeatures - Manages the enabling and disabling of subtarget
/// specific features. Features are encoded as a string of the form
/// "cpu,+attr1,+attr2,-attr3,...,+attrN"
/// A comma separates each feature from the next (all lowercase.)
@@ -81,27 +81,27 @@ public:
/// Adding Features.
void AddFeature(const StringRef String, bool IsEnabled = true);
-
+
/// ToggleFeature - Toggle a feature and returns the newly updated feature
/// bits.
uint64_t ToggleFeature(uint64_t Bits, const StringRef String,
const SubtargetFeatureKV *FeatureTable,
size_t FeatureTableSize);
-
+
/// Get feature bits of a CPU.
uint64_t getFeatureBits(const StringRef CPU,
const SubtargetFeatureKV *CPUTable,
size_t CPUTableSize,
const SubtargetFeatureKV *FeatureTable,
size_t FeatureTableSize);
-
+
/// Get scheduling itinerary of a CPU.
void *getItinerary(const StringRef CPU,
const SubtargetInfoKV *Table, size_t TableSize);
-
+
/// Print feature string.
void print(raw_ostream &OS) const;
-
+
// Dump feature info.
void dump() const;
diff --git a/include/llvm/Support/MDBuilder.h b/include/llvm/MDBuilder.h
index 40f028a..2aa48b0 100644
--- a/include/llvm/Support/MDBuilder.h
+++ b/include/llvm/MDBuilder.h
@@ -1,4 +1,4 @@
-//===---- llvm/Support/MDBuilder.h - Builder for LLVM metadata --*- C++ -*-===//
+//===---- llvm/MDBuilder.h - Builder for LLVM metadata ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_MDBUILDER_H
-#define LLVM_SUPPORT_MDBUILDER_H
+#ifndef LLVM_MDBUILDER_H
+#define LLVM_MDBUILDER_H
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
@@ -49,6 +49,29 @@ namespace llvm {
return MDNode::get(Context, Op);
}
+ //===------------------------------------------------------------------===//
+ // Prof metadata.
+ //===------------------------------------------------------------------===//
+
+ /// \brief Return metadata containing two branch weights.
+ MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight) {
+ uint32_t Weights[] = { TrueWeight, FalseWeight };
+ return createBranchWeights(Weights);
+ }
+
+ /// \brief Return metadata containing a number of branch weights.
+ MDNode *createBranchWeights(ArrayRef<uint32_t> Weights) {
+ assert(Weights.size() >= 2 && "Need at least two branch weights!");
+
+ SmallVector<Value *, 4> Vals(Weights.size()+1);
+ Vals[0] = createString("branch_weights");
+
+ Type *Int32Ty = Type::getInt32Ty(Context);
+ for (unsigned i = 0, e = Weights.size(); i != e; ++i)
+ Vals[i+1] = ConstantInt::get(Int32Ty, Weights[i]);
+
+ return MDNode::get(Context, Vals);
+ }
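+
+ // A usage sketch (Ctx is an LLVMContext and BI a conditional BranchInst;
+ // both are assumed to exist):
+ //   MDBuilder MDB(Ctx);
+ //   BI->setMetadata(LLVMContext::MD_prof,
+ //                   MDB.createBranchWeights(90, 10));
+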
//===------------------------------------------------------------------===//
// Range metadata.
diff --git a/include/llvm/Metadata.h b/include/llvm/Metadata.h
index 7357986..b40549b 100644
--- a/include/llvm/Metadata.h
+++ b/include/llvm/Metadata.h
@@ -165,6 +165,11 @@ public:
static bool classof(const Value *V) {
return V->getValueID() == MDNodeVal;
}
+
+ /// Methods for metadata merging.
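+ /// For example (a sketch of intended use), when two loads are combined
+ /// into one, the merged load's !tbaa node can be computed with
+ /// getMostGenericTBAA on the original operands.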
+ static MDNode *getMostGenericTBAA(MDNode *A, MDNode *B);
+ static MDNode *getMostGenericFPMath(MDNode *A, MDNode *B);
+ static MDNode *getMostGenericRange(MDNode *A, MDNode *B);
private:
// destroy - Delete this node. Only when there are no uses.
void destroy();
diff --git a/include/llvm/Module.h b/include/llvm/Module.h
index b9c9881..e6303ac 100644
--- a/include/llvm/Module.h
+++ b/include/llvm/Module.h
@@ -301,10 +301,6 @@ public:
typedef DenseMap<StructType*, unsigned, DenseMapInfo<StructType*> >
NumeredTypesMapTy;
- /// findUsedStructTypes - Walk the entire module and find all of the
- /// struct types that are in use, returning them in a vector.
- void findUsedStructTypes(std::vector<StructType*> &StructTypes) const;
-
/// getTypeByName - Return the type with the specified name, or null if there
/// is none by that name.
StructType *getTypeByName(StringRef Name) const;
@@ -497,6 +493,13 @@ public:
static iplist<GlobalAlias> Module::*getSublistAccess(GlobalAlias*) {
return &Module::AliasList;
}
+ /// Get the Module's list of named metadata (constant).
+ const NamedMDListType &getNamedMDList() const { return NamedMDList; }
+ /// Get the Module's list of named metadata.
+ NamedMDListType &getNamedMDList() { return NamedMDList; }
+ static ilist<NamedMDNode> Module::*getSublistAccess(NamedMDNode*) {
+ return &Module::NamedMDList;
+ }
/// Get the symbol table of global variable and function identifiers
const ValueSymbolTable &getValueSymbolTable() const { return *ValSymTab; }
/// Get the Module's symbol table of global variable and function identifiers.
diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h
index 77a08d5..befe812 100644
--- a/include/llvm/Object/Binary.h
+++ b/include/llvm/Object/Binary.h
@@ -90,7 +90,7 @@ public:
/// @brief Create a Binary from Source, autodetecting the file type.
///
-/// @param Source The data to create the Binary from. Ownership is transfered
+/// @param Source The data to create the Binary from. Ownership is transferred
/// to Result if successful. If an error is returned, Source is destroyed
/// by createBinary before returning.
/// @param Result A pointer to the resulting Binary if no error occurred.
diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index 68b5ca1..967420e 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h
@@ -168,6 +168,10 @@ public:
virtual section_iterator begin_sections() const;
virtual section_iterator end_sections() const;
+ const coff_section *getCOFFSection(section_iterator &It) const;
+ const coff_symbol *getCOFFSymbol(symbol_iterator &It) const;
+ const coff_relocation *getCOFFRelocation(relocation_iterator &It) const;
+
virtual uint8_t getBytesInAddress() const;
virtual StringRef getFileFormatName() const;
virtual unsigned getArch() const;
@@ -184,6 +188,8 @@ public:
return ec;
}
error_code getSymbolName(const coff_symbol *symbol, StringRef &Res) const;
+ ArrayRef<uint8_t> getSymbolAuxData(const coff_symbol *symbol) const;
+
error_code getSectionName(const coff_section *Sec, StringRef &Res) const;
error_code getSectionContents(const coff_section *Sec,
ArrayRef<uint8_t> &Res) const;
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index e493f5b..7698441 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -217,7 +217,7 @@ struct Elf_Verdef_Impl {
}
};
-/// Elf_Verdaux: This is the structure of auxilary data in the SHT_GNU_verdef
+/// Elf_Verdaux: This is the structure of auxiliary data in the SHT_GNU_verdef
/// section (.gnu.version_d). This structure is identical for ELF32 and ELF64.
template<support::endianness target_endianness, bool is64Bits>
struct Elf_Verdaux_Impl {
@@ -505,9 +505,6 @@ private:
const Elf_Rela *getRela(DataRefImpl Rela) const;
const char *getString(uint32_t section, uint32_t offset) const;
const char *getString(const Elf_Shdr *section, uint32_t offset) const;
- error_code getSymbolName(const Elf_Shdr *section,
- const Elf_Sym *Symb,
- StringRef &Res) const;
error_code getSymbolVersion(const Elf_Shdr *section,
const Elf_Sym *Symb,
StringRef &Version,
@@ -519,6 +516,11 @@ protected:
void validateSymbol(DataRefImpl Symb) const;
public:
+ error_code getSymbolName(const Elf_Shdr *section,
+ const Elf_Sym *Symb,
+ StringRef &Res) const;
+ error_code getSectionName(const Elf_Shdr *section,
+ StringRef &Res) const;
const Elf_Dyn *getDyn(DataRefImpl DynData) const;
error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
bool &IsDefault) const;
@@ -597,11 +599,15 @@ public:
virtual StringRef getObjectType() const { return "ELF"; }
virtual unsigned getArch() const;
virtual StringRef getLoadName() const;
+ virtual error_code getSectionContents(const Elf_Shdr *sec,
+ StringRef &Res) const;
uint64_t getNumSections() const;
uint64_t getStringTableIndex() const;
ELF::Elf64_Word getSymbolTableIndex(const Elf_Sym *symb) const;
const Elf_Shdr *getSection(const Elf_Sym *symb) const;
+ const Elf_Shdr *getElfSection(section_iterator &It) const;
+ const Elf_Sym *getElfSymbol(symbol_iterator &It) const;
// Methods for type inquiry through isa, cast, and dyn_cast
bool isDyldType() const { return isDyldELFObject; }
@@ -783,6 +789,21 @@ ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
+const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Shdr *
+ELFObjectFile<target_endianness, is64Bits>
+ ::getElfSection(section_iterator &It) const {
+ llvm::object::DataRefImpl ShdrRef = It->getRawDataRefImpl();
+ return reinterpret_cast<const Elf_Shdr *>(ShdrRef.p);
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Sym *
+ELFObjectFile<target_endianness, is64Bits>
+ ::getElfSymbol(symbol_iterator &It) const {
+ return getSymbol(It->getRawDataRefImpl());
+}
+
+template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
::getSymbolFileOffset(DataRefImpl Symb,
uint64_t &Result) const {
@@ -1060,6 +1081,15 @@ error_code ELFObjectFile<target_endianness, is64Bits>
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getSectionContents(const Elf_Shdr *Sec,
+ StringRef &Result) const {
+ const char *start = (const char*)base() + Sec->sh_offset;
+ Result = StringRef(start, Sec->sh_size);
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::getSectionAlignment(DataRefImpl Sec,
uint64_t &Result) const {
const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
@@ -1414,6 +1444,98 @@ error_code ELFObjectFile<target_endianness, is64Bits>
res = "Unknown";
}
break;
+ case ELF::EM_HEXAGON:
+ switch (type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_3);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B32_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_12_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_10_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_9_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_7_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_JMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_PLT_B22_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPMOD_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_PLT_B22_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_11_X);
+ default:
+ res = "Unknown";
+ }
+ break;
default:
res = "Unknown";
}
@@ -1489,6 +1611,9 @@ error_code ELFObjectFile<target_endianness, is64Bits>
res = "Unknown";
}
break;
+ case ELF::EM_HEXAGON:
+ res = symname;
+ break;
default:
res = "Unknown";
}
@@ -1888,6 +2013,8 @@ StringRef ELFObjectFile<target_endianness, is64Bits>
return "ELF32-x86-64";
case ELF::EM_ARM:
return "ELF32-arm";
+ case ELF::EM_HEXAGON:
+ return "ELF32-hexagon";
default:
return "ELF32-unknown";
}
@@ -1915,6 +2042,8 @@ unsigned ELFObjectFile<target_endianness, is64Bits>::getArch() const {
return Triple::x86_64;
case ELF::EM_ARM:
return Triple::arm;
+ case ELF::EM_HEXAGON:
+ return Triple::hexagon;
default:
return Triple::UnknownArch;
}
@@ -2054,6 +2183,14 @@ error_code ELFObjectFile<target_endianness, is64Bits>
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getSectionName(const Elf_Shdr *section,
+ StringRef &Result) const {
+ Result = StringRef(getString(dot_shstrtab_sec, section->sh_name));
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::getSymbolVersion(const Elf_Shdr *section,
const Elf_Sym *symb,
StringRef &Version,
diff --git a/include/llvm/Object/MachOFormat.h b/include/llvm/Object/MachOFormat.h
index 089cde9..f30d431 100644
--- a/include/llvm/Object/MachOFormat.h
+++ b/include/llvm/Object/MachOFormat.h
@@ -97,7 +97,8 @@ namespace macho {
DysymtabLoadCommandSize = 80,
Nlist32Size = 12,
Nlist64Size = 16,
- RelocationInfoSize = 8
+ RelocationInfoSize = 8,
+ LinkeditLoadCommandSize = 16
};
/// \brief Constants for header magic field.
@@ -140,7 +141,8 @@ namespace macho {
LCT_UUID = 0x1b,
LCT_CodeSignature = 0x1d,
LCT_SegmentSplitInfo = 0x1e,
- LCT_FunctionStarts = 0x26
+ LCT_FunctionStarts = 0x26,
+ LCT_DataInCode = 0x29
};
/// \brief Load command structure.
@@ -280,6 +282,18 @@ namespace macho {
};
/// @}
+ /// @name Data-in-code Table Entry
+ /// @{
+
+ // See <mach-o/loader.h>.
+ enum DataRegionType { Data = 1, JumpTable8, JumpTable16, JumpTable32 };
+ struct DataInCodeTableEntry {
+ uint32_t Offset; /* from mach_header to start of data region */
+ uint16_t Length; /* number of bytes in data region */
+ uint16_t Kind; /* a DataRegionType value */
+ };
+
+ /// @}
/// @name Indirect Symbol Table
/// @{
diff --git a/include/llvm/Object/MachOObject.h b/include/llvm/Object/MachOObject.h
index 0560402..86f150a 100644
--- a/include/llvm/Object/MachOObject.h
+++ b/include/llvm/Object/MachOObject.h
@@ -174,6 +174,9 @@ public:
void ReadSymbol64TableEntry(
uint64_t SymbolTableOffset, unsigned Index,
InMemoryStruct<macho::Symbol64TableEntry> &Res) const;
+ void ReadDataInCodeTableEntry(
+ uint64_t TableOffset, unsigned Index,
+ InMemoryStruct<macho::DataInCodeTableEntry> &Res) const;
void ReadULEB128s(uint64_t Index, SmallVectorImpl<uint64_t> &Out) const;
/// @}
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 4dd7fb5..2ec656b 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -76,13 +76,13 @@ public:
}
};
-static bool operator ==(const DataRefImpl &a, const DataRefImpl &b) {
+inline bool operator ==(const DataRefImpl &a, const DataRefImpl &b) {
// Check bitwise identical. This is the only legal way to compare a union w/o
// knowing which member is in use.
return std::memcmp(&a, &b, sizeof(DataRefImpl)) == 0;
}
-static bool operator <(const DataRefImpl &a, const DataRefImpl &b) {
+inline bool operator <(const DataRefImpl &a, const DataRefImpl &b) {
// Check bitwise identical. This is the only legal way to compare a union w/o
// knowing which member is in use.
return std::memcmp(&a, &b, sizeof(DataRefImpl)) < 0;
@@ -126,6 +126,8 @@ public:
///
/// This is for display purposes only.
error_code getValueString(SmallVectorImpl<char> &Result) const;
+
+ DataRefImpl getRawDataRefImpl() const;
};
typedef content_iterator<RelocationRef> relocation_iterator;
@@ -570,6 +572,11 @@ inline error_code RelocationRef::getValueString(SmallVectorImpl<char> &Result)
inline error_code RelocationRef::getHidden(bool &Result) const {
return OwningObject->getRelocationHidden(RelocationPimpl, Result);
}
+
+inline DataRefImpl RelocationRef::getRawDataRefImpl() const {
+ return RelocationPimpl;
+}
+
// Inline function definitions.
inline LibraryRef::LibraryRef(DataRefImpl LibraryP, const ObjectFile *Owner)
: LibraryPimpl(LibraryP)
diff --git a/include/llvm/PassManagers.h b/include/llvm/PassManagers.h
index fa29f50..0af5853 100644
--- a/include/llvm/PassManagers.h
+++ b/include/llvm/PassManagers.h
@@ -15,6 +15,7 @@
#define LLVM_PASSMANAGERS_H
#include "llvm/Pass.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/DenseMap.h"
@@ -184,7 +185,7 @@ public:
void schedulePass(Pass *P);
/// Set pass P as the last user of the given analysis passes.
- void setLastUser(const SmallVectorImpl<Pass *> &AnalysisPasses, Pass *P);
+ void setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P);
/// Collect passes whose last user is P
void collectLastUses(SmallVectorImpl<Pass *> &LastUses, Pass *P);
diff --git a/include/llvm/Support/AlignOf.h b/include/llvm/Support/AlignOf.h
index cebfa79..85607c8 100644
--- a/include/llvm/Support/AlignOf.h
+++ b/include/llvm/Support/AlignOf.h
@@ -15,6 +15,9 @@
#ifndef LLVM_SUPPORT_ALIGNOF_H
#define LLVM_SUPPORT_ALIGNOF_H
+#include "llvm/Support/Compiler.h"
+#include <cstddef>
+
namespace llvm {
template <typename T>
@@ -54,7 +57,94 @@ struct AlignOf {
/// class besides some cosmetic cleanliness. Example usage:
/// alignOf<int>() returns the alignment of an int.
template <typename T>
-static inline unsigned alignOf() { return AlignOf<T>::Alignment; }
+inline unsigned alignOf() { return AlignOf<T>::Alignment; }
+
+
+/// \brief Helper for building an aligned character array type.
+///
+/// This template is used to explicitly build up a collection of aligned
+/// character types. We have to build these up using a macro and explicit
+/// specialization to cope with old versions of MSVC and GCC where only an
+/// integer literal can be used to specify an alignment constraint. Once built
+/// up here, we can then begin to indirect between these using normal C++
+/// template parameters.
+template <size_t Alignment> struct AlignedCharArrayImpl {};
+template <> struct AlignedCharArrayImpl<0> {
+ typedef char type;
+};
+#if __has_feature(cxx_alignas)
+#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
+ template <> struct AlignedCharArrayImpl<x> { \
+ typedef char alignas(x) type; \
+ }
+#elif defined(__clang__) || defined(__GNUC__)
+#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
+ template <> struct AlignedCharArrayImpl<x> { \
+ typedef char type __attribute__((aligned(x))); \
+ }
+#elif defined(_MSC_VER)
+#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
+ template <> struct AlignedCharArrayImpl<x> { \
+ typedef __declspec(align(x)) char type; \
+ }
+#else
+# error No supported align as directive.
+#endif
+
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(1);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(2);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(4);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(256);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(512);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(1024);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(2048);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(4096);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8192);
+// Any larger and MSVC complains.
+#undef LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT
+
+/// \brief This class template exposes a typedef for type containing a suitable
+/// aligned character array to hold elements of any of up to four types.
+///
+/// These types may be arrays, structs, or any other types. The goal is to
+/// produce a union type containing a character array which, when used, forms
+/// storage suitable for constructing any of these types via placement new.
+/// Support for more than four types can be added at the cost of more
+/// boilerplate.
+template <typename T1,
+ typename T2 = char, typename T3 = char, typename T4 = char>
+class AlignedCharArray {
+ class AlignerImpl {
+ T1 t1; T2 t2; T3 t3; T4 t4;
+
+ AlignerImpl(); // Never defined or instantiated.
+ };
+ union SizerImpl {
+ char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)];
+ };
+
+public:
+ // Sadly, Clang and GCC both fail to align a character array properly even
+ // with an explicit alignment attribute. To work around this, we union
+ // the character array that will actually be used with a struct that contains
+ // a single aligned character member. Tests seem to indicate that both Clang
+ // and GCC will properly register the alignment of a struct containing an
+ // aligned member, and this alignment should carry over to the character
+ // array in the union.
+ union union_type {
+ // This is the only member of the union which should be used by clients:
+ char buffer[sizeof(SizerImpl)];
+
+ // This member of the union only exists to force the alignment.
+ struct {
+ typename llvm::AlignedCharArrayImpl<AlignOf<AlignerImpl>::Alignment>::type
+ nonce_inner_member;
+ } nonce_member;
+ };
+};
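+
+// A usage sketch: raw storage aligned and sized for either of two types,
+// suitable for use with placement new.
+//
+//   AlignedCharArray<double, int*>::union_type Buffer;
+//   double *D = new (Buffer.buffer) double(3.14);
+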
} // end namespace llvm
#endif
diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h
index 88c60ba..6c2ee08 100644
--- a/include/llvm/Support/COFF.h
+++ b/include/llvm/Support/COFF.h
@@ -50,6 +50,8 @@ namespace COFF {
};
enum MachineTypes {
+ MT_Invalid = -1,
+
IMAGE_FILE_MACHINE_UNKNOWN = 0x0,
IMAGE_FILE_MACHINE_AM33 = 0x13,
IMAGE_FILE_MACHINE_AMD64 = 0x8664,
@@ -74,6 +76,8 @@ namespace COFF {
};
enum Characteristics {
+ C_Invalid = 0,
+
/// The file does not contain base relocations and must be loaded at its
/// preferred base. If this cannot be done, the loader will error.
IMAGE_FILE_RELOCS_STRIPPED = 0x0001,
@@ -114,9 +118,9 @@ namespace COFF {
struct symbol {
char Name[NameSize];
uint32_t Value;
+ uint16_t SectionNumber;
uint16_t Type;
uint8_t StorageClass;
- uint16_t SectionNumber;
uint8_t NumberOfAuxSymbols;
};
@@ -138,6 +142,8 @@ namespace COFF {
/// Storage class tells where and what the symbol represents
enum SymbolStorageClass {
+ SSC_Invalid = -1,
+
IMAGE_SYM_CLASS_END_OF_FUNCTION = -1, ///< Physical end of function
IMAGE_SYM_CLASS_NULL = 0, ///< No symbol
IMAGE_SYM_CLASS_AUTOMATIC = 1, ///< Stack variable
@@ -214,6 +220,8 @@ namespace COFF {
};
enum SectionCharacteristics {
+ SC_Invalid = -1,
+
IMAGE_SCN_TYPE_NO_PAD = 0x00000008,
IMAGE_SCN_CNT_CODE = 0x00000020,
IMAGE_SCN_CNT_INITIALIZED_DATA = 0x00000040,
diff --git a/include/llvm/Support/CallSite.h b/include/llvm/Support/CallSite.h
index 20634ed..c23bb6a 100644
--- a/include/llvm/Support/CallSite.h
+++ b/include/llvm/Support/CallSite.h
@@ -184,6 +184,11 @@ public:
CALLSITE_DELEGATE_SETTER(setAttributes(PAL));
}
+ /// \brief Return true if this function has the given attribute.
+ bool hasFnAttr(Attributes N) const {
+ CALLSITE_DELEGATE_GETTER(hasFnAttr(N));
+ }
+
/// paramHasAttr - whether the call or the callee has the given attribute.
bool paramHasAttr(uint16_t i, Attributes attr) const {
CALLSITE_DELEGATE_GETTER(paramHasAttr(i, attr));
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index c212d2d..ae1570d 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -217,11 +217,11 @@ public:
void setMiscFlag(enum MiscFlags M) { Misc |= M; }
void setPosition(unsigned pos) { Position = pos; }
protected:
- explicit Option(enum NumOccurrencesFlag Occurrences,
+ explicit Option(enum NumOccurrencesFlag OccurrencesFlag,
enum OptionHidden Hidden)
- : NumOccurrences(0), Occurrences(Occurrences), HiddenFlag(Hidden),
- Formatting(NormalFormatting), Position(0),
- AdditionalVals(0), NextRegistered(0),
+ : NumOccurrences(0), Occurrences(OccurrencesFlag), Value(0),
+ HiddenFlag(Hidden), Formatting(NormalFormatting), Misc(0),
+ Position(0), AdditionalVals(0), NextRegistered(0),
ArgStr(""), HelpStr(""), ValueStr("") {
}
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index d0b186e..f654f32 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -19,6 +19,25 @@
# define __has_feature(x) 0
#endif
+/// LLVM_USE_RVALUE_REFERENCES - Does the compiler provide r-value references?
+/// This implies that <utility> provides the one-argument std::move; it
+/// does not imply the existence of any other C++ library features.
+#if (__has_feature(cxx_rvalue_references) \
+ || defined(__GXX_EXPERIMENTAL_CXX0X__) \
+ || _MSC_VER >= 1600)
+#define LLVM_USE_RVALUE_REFERENCES 1
+#else
+#define LLVM_USE_RVALUE_REFERENCES 0
+#endif
+
+/// llvm_move - Expands to ::std::move if the compiler supports
+/// r-value references; otherwise, expands to the argument.
+#if LLVM_USE_RVALUE_REFERENCES
+#define llvm_move(value) (::std::move(value))
+#else
+#define llvm_move(value) (value)
+#endif
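+
+// For example, 'Dest = llvm_move(Src);' moves when r-value references are
+// available and degrades gracefully to a copy otherwise.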
+
/// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked
/// into a shared library, then the class should be private to the library and
/// not accessible from outside it. Can also be used to mark variables and
@@ -102,7 +121,7 @@
// 3.4 supported this but is buggy in various cases and produces unimplemented
// errors, just use it in GCC 4.0 and later.
#if __GNUC__ > 3
-#define LLVM_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
+#define LLVM_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define LLVM_ATTRIBUTE_ALWAYS_INLINE __forceinline
#else
diff --git a/include/llvm/Support/ConstantRange.h b/include/llvm/Support/ConstantRange.h
index ced3a2c..90dd69f 100644
--- a/include/llvm/Support/ConstantRange.h
+++ b/include/llvm/Support/ConstantRange.h
@@ -155,6 +155,10 @@ public:
/// constant range.
ConstantRange subtract(const APInt &CI) const;
+ /// \brief Subtract the specified range from this range (aka relative
+ /// complement of the sets).
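+ /// For example (a sketch), [0, 10) difference [5, 10) yields [0, 5).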
+ ConstantRange difference(const ConstantRange &CR) const;
+
/// intersectWith - Return the range that results from the intersection of
/// this range with another range. The resultant range is guaranteed to
/// include all elements contained in both input ranges, and to have the
diff --git a/include/llvm/Support/DataTypes.h.cmake b/include/llvm/Support/DataTypes.h.cmake
index a3a6489..7484abd 100644
--- a/include/llvm/Support/DataTypes.h.cmake
+++ b/include/llvm/Support/DataTypes.h.cmake
@@ -79,18 +79,6 @@ typedef u_int64_t uint64_t;
#endif
#endif
-#ifdef _OpenBSD_
-#define INT8_MAX 127
-#define INT8_MIN -128
-#define UINT8_MAX 255
-#define INT16_MAX 32767
-#define INT16_MIN -32768
-#define UINT16_MAX 65535
-#define INT32_MAX 2147483647
-#define INT32_MIN -2147483648
-#define UINT32_MAX 4294967295U
-#endif
-
#else /* _MSC_VER */
/* Visual C++ doesn't provide standard integer headers, but it does provide
built-in data types. */
diff --git a/include/llvm/Support/DataTypes.h.in b/include/llvm/Support/DataTypes.h.in
index b492bb1..b9fb48a 100644
--- a/include/llvm/Support/DataTypes.h.in
+++ b/include/llvm/Support/DataTypes.h.in
@@ -79,18 +79,6 @@ typedef u_int64_t uint64_t;
#endif
#endif
-#ifdef _OpenBSD_
-#define INT8_MAX 127
-#define INT8_MIN -128
-#define UINT8_MAX 255
-#define INT16_MAX 32767
-#define INT16_MIN -32768
-#define UINT16_MAX 65535
-#define INT32_MAX 2147483647
-#define INT32_MIN -2147483648
-#define UINT32_MAX 4294967295U
-#endif
-
#else /* _MSC_VER */
/* Visual C++ doesn't provide standard integer headers, but it does provide
built-in data types. */
diff --git a/include/llvm/Support/Debug.h b/include/llvm/Support/Debug.h
index e723272..896fe84 100644
--- a/include/llvm/Support/Debug.h
+++ b/include/llvm/Support/Debug.h
@@ -19,7 +19,7 @@
// foo class.
//
// When compiling without assertions, the -debug-* options and all code in
-// DEBUG() statements disappears, so it does not effect the runtime of the code.
+// DEBUG() statements disappears, so it does not affect the runtime of the code.
//
//===----------------------------------------------------------------------===//
@@ -49,11 +49,11 @@ extern bool DebugFlag;
///
bool isCurrentDebugType(const char *Type);
-/// SetCurrentDebugType - Set the current debug type, as if the -debug-only=X
+/// setCurrentDebugType - Set the current debug type, as if the -debug-only=X
/// option were specified. Note that DebugFlag also needs to be set to true for
/// debug output to be produced.
///
-void SetCurrentDebugType(const char *Type);
+void setCurrentDebugType(const char *Type);
/// DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug
/// information. If the '-debug' option is specified on the commandline, and if
@@ -70,7 +70,7 @@ void SetCurrentDebugType(const char *Type);
#else
#define isCurrentDebugType(X) (false)
-#define SetCurrentDebugType(X)
+#define setCurrentDebugType(X)
#define DEBUG_WITH_TYPE(TYPE, X) do { } while (0)
#endif
diff --git a/include/llvm/Support/DebugLoc.h b/include/llvm/Support/DebugLoc.h
index 2ee9f87..0498075 100644
--- a/include/llvm/Support/DebugLoc.h
+++ b/include/llvm/Support/DebugLoc.h
@@ -15,9 +15,8 @@
#ifndef LLVM_SUPPORT_DEBUGLOC_H
#define LLVM_SUPPORT_DEBUGLOC_H
-#include "llvm/ADT/DenseMapInfo.h"
-
namespace llvm {
+ template <typename T> struct DenseMapInfo;
class MDNode;
class LLVMContext;
@@ -103,10 +102,10 @@ namespace llvm {
template <>
struct DenseMapInfo<DebugLoc> {
- static DebugLoc getEmptyKey();
- static DebugLoc getTombstoneKey();
+ static DebugLoc getEmptyKey() { return DebugLoc::getEmptyKey(); }
+ static DebugLoc getTombstoneKey() { return DebugLoc::getTombstoneKey(); }
static unsigned getHashValue(const DebugLoc &Key);
- static bool isEqual(const DebugLoc &LHS, const DebugLoc &RHS);
+ static bool isEqual(DebugLoc LHS, DebugLoc RHS) { return LHS == RHS; }
};
} // end namespace llvm
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 04953b6..f7ae60f 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -248,7 +248,7 @@ enum {
EM_CYPRESS_M8C = 161, // Cypress M8C microprocessor
EM_R32C = 162, // Renesas R32C series microprocessors
EM_TRIMEDIA = 163, // NXP Semiconductors TriMedia architecture family
- EM_QDSP6 = 164, // QUALCOMM DSP6 Processor
+ EM_HEXAGON = 164, // Qualcomm Hexagon processor
EM_8051 = 165, // Intel 8051 and variants
EM_STXP7X = 166, // STMicroelectronics STxP7x family of configurable
// and extensible RISC processors
@@ -674,6 +674,97 @@ enum {
R_MIPS_NUM = 218
};
+// ELF Relocation types for Hexagon
+// Release 5 ABI - Document: 80-V9418-3 Rev. J
+enum {
+ R_HEX_NONE = 0,
+ R_HEX_B22_PCREL = 1,
+ R_HEX_B15_PCREL = 2,
+ R_HEX_B7_PCREL = 3,
+ R_HEX_LO16 = 4,
+ R_HEX_HI16 = 5,
+ R_HEX_32 = 6,
+ R_HEX_16 = 7,
+ R_HEX_8 = 8,
+ R_HEX_GPREL16_0 = 9,
+ R_HEX_GPREL16_1 = 10,
+ R_HEX_GPREL16_2 = 11,
+ R_HEX_GPREL16_3 = 12,
+ R_HEX_HL16 = 13,
+ R_HEX_B13_PCREL = 14,
+ R_HEX_B9_PCREL = 15,
+ R_HEX_B32_PCREL_X = 16,
+ R_HEX_32_6_X = 17,
+ R_HEX_B22_PCREL_X = 18,
+ R_HEX_B15_PCREL_X = 19,
+ R_HEX_B13_PCREL_X = 20,
+ R_HEX_B9_PCREL_X = 21,
+ R_HEX_B7_PCREL_X = 22,
+ R_HEX_16_X = 23,
+ R_HEX_12_X = 24,
+ R_HEX_11_X = 25,
+ R_HEX_10_X = 26,
+ R_HEX_9_X = 27,
+ R_HEX_8_X = 28,
+ R_HEX_7_X = 29,
+ R_HEX_6_X = 30,
+ R_HEX_32_PCREL = 31,
+ R_HEX_COPY = 32,
+ R_HEX_GLOB_DAT = 33,
+ R_HEX_JMP_SLOT = 34,
+ R_HEX_RELATIVE = 35,
+ R_HEX_PLT_B22_PCREL = 36,
+ R_HEX_GOTREL_LO16 = 37,
+ R_HEX_GOTREL_HI16 = 38,
+ R_HEX_GOTREL_32 = 39,
+ R_HEX_GOT_LO16 = 40,
+ R_HEX_GOT_HI16 = 41,
+ R_HEX_GOT_32 = 42,
+ R_HEX_GOT_16 = 43,
+ R_HEX_DTPMOD_32 = 44,
+ R_HEX_DTPREL_LO16 = 45,
+ R_HEX_DTPREL_HI16 = 46,
+ R_HEX_DTPREL_32 = 47,
+ R_HEX_DTPREL_16 = 48,
+ R_HEX_GD_PLT_B22_PCREL = 49,
+ R_HEX_GD_GOT_LO16 = 50,
+ R_HEX_GD_GOT_HI16 = 51,
+ R_HEX_GD_GOT_32 = 52,
+ R_HEX_GD_GOT_16 = 53,
+ R_HEX_IE_LO16 = 54,
+ R_HEX_IE_HI16 = 55,
+ R_HEX_IE_32 = 56,
+ R_HEX_IE_GOT_LO16 = 57,
+ R_HEX_IE_GOT_HI16 = 58,
+ R_HEX_IE_GOT_32 = 59,
+ R_HEX_IE_GOT_16 = 60,
+ R_HEX_TPREL_LO16 = 61,
+ R_HEX_TPREL_HI16 = 62,
+ R_HEX_TPREL_32 = 63,
+ R_HEX_TPREL_16 = 64,
+ R_HEX_6_PCREL_X = 65,
+ R_HEX_GOTREL_32_6_X = 66,
+ R_HEX_GOTREL_16_X = 67,
+ R_HEX_GOTREL_11_X = 68,
+ R_HEX_GOT_32_6_X = 69,
+ R_HEX_GOT_16_X = 70,
+ R_HEX_GOT_11_X = 71,
+ R_HEX_DTPREL_32_6_X = 72,
+ R_HEX_DTPREL_16_X = 73,
+ R_HEX_DTPREL_11_X = 74,
+ R_HEX_GD_GOT_32_6_X = 75,
+ R_HEX_GD_GOT_16_X = 76,
+ R_HEX_GD_GOT_11_X = 77,
+ R_HEX_IE_32_6_X = 78,
+ R_HEX_IE_16_X = 79,
+ R_HEX_IE_GOT_32_6_X = 80,
+ R_HEX_IE_GOT_16_X = 81,
+ R_HEX_IE_GOT_11_X = 82,
+ R_HEX_TPREL_32_6_X = 83,
+ R_HEX_TPREL_16_X = 84,
+ R_HEX_TPREL_11_X = 85
+};
+
// Section header.
struct Elf32_Shdr {
Elf32_Word sh_name; // Section name (index into string table)
@@ -736,6 +827,8 @@ enum {
SHT_GROUP = 17, // Section group.
SHT_SYMTAB_SHNDX = 18, // Indices for SHN_XINDEX entries.
SHT_LOOS = 0x60000000, // Lowest operating system-specific type.
+ SHT_GNU_ATTRIBUTES= 0x6ffffff5, // Object attributes.
+ SHT_GNU_HASH = 0x6ffffff6, // GNU-style hash table.
SHT_GNU_verdef = 0x6ffffffd, // GNU version definitions.
SHT_GNU_verneed = 0x6ffffffe, // GNU version references.
SHT_GNU_versym = 0x6fffffff, // GNU symbol versions table.
@@ -1017,6 +1110,9 @@ enum {
PT_SUNW_EH_FRAME = 0x6474e550,
PT_SUNW_UNWIND = 0x6464e550,
+ PT_GNU_STACK = 0x6474e551, // Indicates stack executability.
+ PT_GNU_RELRO = 0x6474e552, // Read-only after relocation.
+
PT_HIOS = 0x6fffffff, // Highest operating system-specific pt entry type.
PT_LOPROC = 0x70000000, // Lowest processor-specific program hdr entry type.
PT_HIPROC = 0x7fffffff // Highest processor-specific program hdr entry type.
@@ -1095,7 +1191,16 @@ enum {
DT_LOOS = 0x60000000, // Start of environment specific tags.
DT_HIOS = 0x6FFFFFFF, // End of environment specific tags.
DT_LOPROC = 0x70000000, // Start of processor specific tags.
- DT_HIPROC = 0x7FFFFFFF // End of processor specific tags.
+ DT_HIPROC = 0x7FFFFFFF, // End of processor specific tags.
+
+ DT_RELACOUNT = 0x6FFFFFF9, // ELF32_Rela count.
+ DT_RELCOUNT = 0x6FFFFFFA, // ELF32_Rel count.
+
+ DT_FLAGS_1 = 0x6FFFFFFB, // Flags_1.
+ DT_VERDEF = 0x6FFFFFFC, // The address of the version definition table.
+ DT_VERDEFNUM = 0x6FFFFFFD, // The number of entries in DT_VERDEF.
+ DT_VERNEED = 0x6FFFFFFE, // The address of the version dependency table.
+ DT_VERNEEDNUM = 0x6FFFFFFF // The number of entries in DT_VERNEED.
};
// DT_FLAGS values.
@@ -1107,6 +1212,27 @@ enum {
DF_STATIC_TLS = 0x10 // Reject attempts to load dynamically.
};
+// State flags selectable in the `d_un.d_val' element of the DT_FLAGS_1 entry.
+enum {
+ DF_1_NOW = 0x00000001, // Set RTLD_NOW for this object.
+ DF_1_GLOBAL = 0x00000002, // Set RTLD_GLOBAL for this object.
+ DF_1_GROUP = 0x00000004, // Set RTLD_GROUP for this object.
+ DF_1_NODELETE = 0x00000008, // Set RTLD_NODELETE for this object.
+ DF_1_LOADFLTR = 0x00000010, // Trigger filtee loading at runtime.
+ DF_1_INITFIRST = 0x00000020, // Set RTLD_INITFIRST for this object.
+ DF_1_NOOPEN = 0x00000040, // Set RTLD_NOOPEN for this object.
+ DF_1_ORIGIN = 0x00000080, // $ORIGIN must be handled.
+ DF_1_DIRECT = 0x00000100, // Direct binding enabled.
+ DF_1_TRANS = 0x00000200,
+ DF_1_INTERPOSE = 0x00000400, // Object is used to interpose.
+ DF_1_NODEFLIB = 0x00000800, // Ignore default lib search path.
+ DF_1_NODUMP = 0x00001000, // Object can't be dldump'ed.
+ DF_1_CONFALT = 0x00002000, // Configuration alternative created.
+ DF_1_ENDFILTEE = 0x00004000, // Filtee terminates filters search.
+ DF_1_DISPRELDNE = 0x00008000, // Disp reloc applied at build time.
+ DF_1_DISPRELPND = 0x00010000 // Disp reloc applied at run-time.
+};
+
// ElfXX_VerDef structure version (GNU versioning)
enum {
VER_DEF_NONE = 0,
diff --git a/include/llvm/Support/Endian.h b/include/llvm/Support/Endian.h
index 733ab75..8d5649d 100644
--- a/include/llvm/Support/Endian.h
+++ b/include/llvm/Support/Endian.h
@@ -49,7 +49,7 @@ struct alignment_access_helper<value_type, unaligned>
namespace endian {
template<typename value_type, alignment align>
- static value_type read_le(const void *memory) {
+ inline value_type read_le(const void *memory) {
value_type t =
reinterpret_cast<const detail::alignment_access_helper
<value_type, align> *>(memory)->val;
@@ -59,7 +59,7 @@ namespace endian {
}
template<typename value_type, alignment align>
- static void write_le(void *memory, value_type value) {
+ inline void write_le(void *memory, value_type value) {
if (sys::isBigEndianHost())
value = sys::SwapByteOrder(value);
reinterpret_cast<detail::alignment_access_helper<value_type, align> *>
@@ -67,7 +67,7 @@ namespace endian {
}
template<typename value_type, alignment align>
- static value_type read_be(const void *memory) {
+ inline value_type read_be(const void *memory) {
value_type t =
reinterpret_cast<const detail::alignment_access_helper
<value_type, align> *>(memory)->val;
@@ -77,7 +77,7 @@ namespace endian {
}
template<typename value_type, alignment align>
- static void write_be(void *memory, value_type value) {
+ inline void write_be(void *memory, value_type value) {
if (sys::isLittleEndianHost())
value = sys::SwapByteOrder(value);
reinterpret_cast<detail::alignment_access_helper<value_type, align> *>
diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h
new file mode 100644
index 0000000..0f07164
--- /dev/null
+++ b/include/llvm/Support/FileOutputBuffer.h
@@ -0,0 +1,97 @@
+//=== FileOutputBuffer.h - File Output Buffer -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility for creating an in-memory buffer that will be written to a file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_FILEOUTPUTBUFFER_H
+#define LLVM_SUPPORT_FILEOUTPUTBUFFER_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class error_code;
+template<class T> class OwningPtr;
+
+/// FileOutputBuffer - This interface provides a simple way to create an
+/// in-memory buffer which will be written to a file. During the lifetime of
+/// these objects, the content or existence of the specified file is undefined.
+/// That is, creating an OutputBuffer for a file may immediately remove it.
+/// If the FileOutputBuffer is committed, the target file's content will become
+/// the buffer content at the time of the commit. If the FileOutputBuffer is
+/// not committed, the file will be deleted in the FileOutputBuffer destructor.
+class FileOutputBuffer {
+public:
+
+ enum {
+ F_executable = 1 ///< Set the 'x' bit on the resulting file.
+ };
+
+ /// Factory method to create an OutputBuffer object which manages a read/write
+ /// buffer of the specified size. When committed, the buffer will be written
+ /// to the file at the specified path.
+ static error_code create(StringRef FilePath, size_t Size,
+ OwningPtr<FileOutputBuffer> &Result,
+ unsigned Flags=0);
+
+
+ /// Returns a pointer to the start of the buffer.
+ uint8_t *getBufferStart() const {
+ return BufferStart;
+ }
+
+ /// Returns a pointer to the end of the buffer.
+ uint8_t *getBufferEnd() const {
+ return BufferEnd;
+ }
+
+ /// Returns the size of the buffer.
+ size_t getBufferSize() const {
+ return BufferEnd - BufferStart;
+ }
+
+ /// Returns the path where the file will show up if the buffer is committed.
+ StringRef getPath() const {
+ return FinalPath;
+ }
+
+ /// Flushes the content of the buffer to its file and deallocates the
+ /// buffer. If commit() is not called before this object's destructor
+ /// is called, the file is deleted in the destructor. The optional parameter
+ /// is used if it turns out you want the file size to be smaller than
+ /// initially requested.
+ error_code commit(int64_t NewSmallerSize = -1);
+
+ /// If this object was previously committed, the destructor just deletes
+ /// this object. If this object was not committed, the destructor
+ /// deallocates the buffer and the target file is never written.
+ ~FileOutputBuffer();
+
+
+protected:
+ FileOutputBuffer(const FileOutputBuffer &); // DO NOT IMPLEMENT
+ FileOutputBuffer &operator=(const FileOutputBuffer &); // DO NOT IMPLEMENT
+ FileOutputBuffer(uint8_t *Start, uint8_t *End,
+ StringRef Path, StringRef TempPath);
+
+ uint8_t *BufferStart;
+ uint8_t *BufferEnd;
+ SmallString<128> FinalPath;
+ SmallString<128> TempPath;
+};
+
+
+
+} // end namespace llvm
+
+#endif
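
A minimal sketch of the intended workflow for this new header, assuming llvm::error_code from Support/system_error.h; the output path is illustrative:

    #include "llvm/ADT/OwningPtr.h"
    #include "llvm/Support/FileOutputBuffer.h"
    #include "llvm/Support/system_error.h"
    #include <cstring>
    using namespace llvm;

    error_code writeSixBytes() {
      OwningPtr<FileOutputBuffer> Buf;
      // Until commit(), the content and existence of "out.bin" are undefined.
      if (error_code ec = FileOutputBuffer::create("out.bin", 6, Buf))
        return ec;
      std::memcpy(Buf->getBufferStart(), "hello\n", 6);
      return Buf->commit(); // flushes the buffer and finalizes the file
    }
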
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index e6f9926..e0353f9 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -94,13 +94,62 @@ struct space_info {
uint64_t available;
};
+
+enum perms {
+ no_perms = 0,
+ owner_read = 0400,
+ owner_write = 0200,
+ owner_exe = 0100,
+ owner_all = owner_read | owner_write | owner_exe,
+ group_read = 040,
+ group_write = 020,
+ group_exe = 010,
+ group_all = group_read | group_write | group_exe,
+ others_read = 04,
+ others_write = 02,
+ others_exe = 01,
+ others_all = others_read | others_write | others_exe,
+ all_all = owner_all | group_all | others_all,
+ set_uid_on_exe = 04000,
+ set_gid_on_exe = 02000,
+ sticky_bit = 01000,
+ perms_mask = all_all | set_uid_on_exe | set_gid_on_exe | sticky_bit,
+ perms_not_known = 0xFFFF,
+ add_perms = 0x1000,
+ remove_perms = 0x2000,
+ symlink_perms = 0x4000
+};
+
+// Helper functions so that you can use & and | to manipulate perms bits:
+inline perms operator|(perms l , perms r) {
+ return static_cast<perms>(
+ static_cast<unsigned short>(l) | static_cast<unsigned short>(r));
+}
+inline perms operator&(perms l , perms r) {
+ return static_cast<perms>(
+ static_cast<unsigned short>(l) & static_cast<unsigned short>(r));
+}
+inline perms &operator|=(perms &l, perms r) {
+ l = l | r;
+ return l;
+}
+inline perms &operator&=(perms &l, perms r) {
+ l = l & r;
+ return l;
+}
+inline perms operator~(perms x) {
+ return static_cast<perms>(~static_cast<unsigned short>(x));
+}
+
+
+
/// file_status - Represents the result of a call to stat and friends. It has
/// a platform specific member to store the result.
class file_status
{
#if defined(LLVM_ON_UNIX)
- dev_t st_dev;
- ino_t st_ino;
+ dev_t fs_st_dev;
+ ino_t fs_st_ino;
#elif defined (LLVM_ON_WIN32)
uint32_t LastWriteTimeHigh;
uint32_t LastWriteTimeLow;
@@ -113,12 +162,19 @@ class file_status
friend bool equivalent(file_status A, file_status B);
friend error_code status(const Twine &path, file_status &result);
file_type Type;
+ perms Perms;
public:
- explicit file_status(file_type v=file_type::status_error)
- : Type(v) {}
+ explicit file_status(file_type v=file_type::status_error,
+ perms prms=perms_not_known)
+ : Type(v), Perms(prms) {}
+ // getters
file_type type() const { return Type; }
+ perms permissions() const { return Perms; }
+
+ // setters
void type(file_type v) { Type = v; }
+ void permissions(perms p) { Perms = p; }
};
/// file_magic - An "enum class" enumeration of file types based on magic (the first
@@ -309,6 +365,13 @@ bool equivalent(file_status A, file_status B);
/// platform specific error_code.
error_code equivalent(const Twine &A, const Twine &B, bool &result);
+/// @brief Simpler version of equivalent for clients that don't need to
+/// differentiate between an error and false.
+inline bool equivalent(const Twine &A, const Twine &B) {
+ bool result;
+ return !equivalent(A, B, result) && result;
+}
+
/// @brief Get file size.
///
/// @param path Input path.
@@ -388,6 +451,13 @@ error_code is_symlink(const Twine &path, bool &result);
/// platform specific error_code.
error_code status(const Twine &path, file_status &result);
+/// @brief Modifies permission bits on a file.
+///
+/// @param path Input path.
+/// @results errc::success if permissions have been changed, otherwise a
+/// platform specific error_code.
+error_code permissions(const Twine &path, perms prms);
+
/// @brief Is status available?
///
/// @param path Input path.
@@ -422,8 +492,8 @@ error_code status_known(const Twine &path, bool &result);
/// @results errc::success if result_{fd,path} have been successfully set,
/// otherwise a platform specific error_code.
error_code unique_file(const Twine &model, int &result_fd,
- SmallVectorImpl<char> &result_path,
- bool makeAbsolute = true);
+ SmallVectorImpl<char> &result_path,
+ bool makeAbsolute = true, unsigned mode = 0600);
/// @brief Canonicalize path.
///
@@ -506,6 +576,33 @@ error_code FindLibrary(const Twine &short_name, SmallVectorImpl<char> &result);
error_code GetMainExecutable(const char *argv0, void *MainAddr,
SmallVectorImpl<char> &result);
+
+/// @brief Memory maps the contents of a file
+///
+/// @param path Path to file to map.
+/// @param file_offset Byte offset in file where mapping should begin.
+/// @param size Byte length of the range of the file to map.
+/// @param map_writable If true, the file will be mapped in r/w such
+/// that changes to the mapped buffer will be flushed back
+/// to the file. If false, the file will be mapped read-only
+/// and the buffer will be read-only.
+/// @param result Set to the start address of the mapped buffer.
+/// @results errc::success if result has been successfully set, otherwise a
+/// platform specific error_code.
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
+ bool map_writable, void *&result);
+
+
+/// @brief Memory unmaps the contents of a file
+///
+/// @param base Pointer to the start of the buffer.
+/// @param size Byte length of the range to unmap.
+/// @results errc::success if the range was successfully unmapped, otherwise a
+/// platform specific error_code.
+error_code unmap_file_pages(void *base, size_t size);
+
+
+
/// @}
/// @name Iterators
/// @{
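
A small sketch of the new perms API. The header does not show whether permissions() treats the bits as an absolute mode or honors the add_perms/remove_perms modifiers, so the absolute interpretation below is an assumption:

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/FileSystem.h"
    using namespace llvm;
    using namespace llvm::sys::fs;

    error_code makeOwnerOnly(const Twine &Path) {
      // The overloaded | keeps the enum type while combining bits.
      perms P = owner_read | owner_write;
      return permissions(Path, P);
    }
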
diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h
index 49cd87f..19e1ce8 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/Support/GCOV.h
@@ -63,8 +63,8 @@ public:
bool readFunctionTag() {
StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
if (Tag.empty() ||
- Tag[0] != '\0' || Tag[1] != '\0' ||
- Tag[2] != '\0' || Tag[3] != '\1') {
+ Tag[0] != '\0' || Tag[1] != '\0' ||
+ Tag[2] != '\0' || Tag[3] != '\1') {
return false;
}
Cursor += 4;
@@ -76,8 +76,8 @@ public:
bool readBlockTag() {
StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
if (Tag.empty() ||
- Tag[0] != '\0' || Tag[1] != '\0' ||
- Tag[2] != '\x41' || Tag[3] != '\x01') {
+ Tag[0] != '\0' || Tag[1] != '\0' ||
+ Tag[2] != '\x41' || Tag[3] != '\x01') {
return false;
}
Cursor += 4;
@@ -89,8 +89,8 @@ public:
bool readEdgeTag() {
StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
if (Tag.empty() ||
- Tag[0] != '\0' || Tag[1] != '\0' ||
- Tag[2] != '\x43' || Tag[3] != '\x01') {
+ Tag[0] != '\0' || Tag[1] != '\0' ||
+ Tag[2] != '\x43' || Tag[3] != '\x01') {
return false;
}
Cursor += 4;
@@ -102,8 +102,8 @@ public:
bool readLineTag() {
StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
if (Tag.empty() ||
- Tag[0] != '\0' || Tag[1] != '\0' ||
- Tag[2] != '\x45' || Tag[3] != '\x01') {
+ Tag[0] != '\0' || Tag[1] != '\0' ||
+ Tag[2] != '\x45' || Tag[3] != '\x01') {
return false;
}
Cursor += 4;
@@ -115,8 +115,8 @@ public:
bool readArcTag() {
StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
if (Tag.empty() ||
- Tag[0] != '\0' || Tag[1] != '\0' ||
- Tag[2] != '\xa1' || Tag[3] != '\1') {
+ Tag[0] != '\0' || Tag[1] != '\0' ||
+ Tag[2] != '\xa1' || Tag[3] != '\1') {
return false;
}
Cursor += 4;
diff --git a/include/llvm/Support/GraphWriter.h b/include/llvm/Support/GraphWriter.h
index ae32da5..f178b0c 100644
--- a/include/llvm/Support/GraphWriter.h
+++ b/include/llvm/Support/GraphWriter.h
@@ -172,7 +172,7 @@ public:
// If we should include the address of the node in the label, do so now.
if (DTraits.hasNodeAddressLabel(Node, G))
- O << "|" << (void*)Node;
+ O << "|" << static_cast<const void*>(Node);
}
std::string edgeSourceLabels;
@@ -192,7 +192,7 @@ public:
// If we should include the address of the node in the label, do so now.
if (DTraits.hasNodeAddressLabel(Node, G))
- O << "|" << (void*)Node;
+ O << "|" << static_cast<const void*>(Node);
}
if (DTraits.hasEdgeDestLabels()) {
diff --git a/include/llvm/Support/InstVisitor.h b/include/llvm/Support/InstVisitor.h
index 52de8f6..109b3cf 100644
--- a/include/llvm/Support/InstVisitor.h
+++ b/include/llvm/Support/InstVisitor.h
@@ -13,6 +13,8 @@
#include "llvm/Function.h"
#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
#include "llvm/Module.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/ErrorHandling.h"
@@ -145,14 +147,17 @@ public:
// visitMul to proxy to visitBinaryOperator for instance in case the user does
// not need this generality.
//
- // The one problem case we have to handle here though is that the PHINode
- // class and opcode name are the exact same. Because of this, we cannot
- // define visitPHINode (the inst version) to forward to visitPHINode (the
- // generic version) without multiply defined symbols and recursion. To handle
- // this, we do not autoexpand "Other" instructions, we do it manually.
- //
+ // These functions can also implement fan-out, when a single opcode and
+ // instruction have multiple more specific Instruction subclasses. The Call
+ // instruction currently supports this. We implement that by redirecting that
+ // instruction to a special delegation helper.
#define HANDLE_INST(NUM, OPCODE, CLASS) \
- RetTy visit##OPCODE(CLASS &I) { DELEGATE(CLASS); }
+ RetTy visit##OPCODE(CLASS &I) { \
+ if (NUM == Instruction::Call) \
+ return delegateCallInst(I); \
+ else \
+ DELEGATE(CLASS); \
+ }
#include "llvm/Instruction.def"
// Specific Instruction type classes... note that all of the casts are
@@ -195,6 +200,17 @@ public:
RetTy visitInsertValueInst(InsertValueInst &I) { DELEGATE(Instruction); }
RetTy visitLandingPadInst(LandingPadInst &I) { DELEGATE(Instruction); }
+ // Handle the special intrinsic instruction classes.
+ RetTy visitDbgDeclareInst(DbgDeclareInst &I) { DELEGATE(DbgInfoIntrinsic);}
+ RetTy visitDbgValueInst(DbgValueInst &I) { DELEGATE(DbgInfoIntrinsic);}
+ RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { DELEGATE(IntrinsicInst); }
+ RetTy visitMemSetInst(MemSetInst &I) { DELEGATE(MemIntrinsic); }
+ RetTy visitMemCpyInst(MemCpyInst &I) { DELEGATE(MemTransferInst); }
+ RetTy visitMemMoveInst(MemMoveInst &I) { DELEGATE(MemTransferInst); }
+ RetTy visitMemTransferInst(MemTransferInst &I) { DELEGATE(MemIntrinsic); }
+ RetTy visitMemIntrinsic(MemIntrinsic &I) { DELEGATE(IntrinsicInst); }
+ RetTy visitIntrinsicInst(IntrinsicInst &I) { DELEGATE(CallInst); }
+
// Call and Invoke are slightly different as they delegate first through
// a generic CallSite visitor.
RetTy visitCallInst(CallInst &I) {
@@ -234,6 +250,29 @@ public:
// Note that you MUST override this function if your return type is not void.
//
void visitInstruction(Instruction &I) {} // Ignore unhandled instructions
+
+private:
+ // Special helper function to delegate to CallInst subclass visitors.
+ RetTy delegateCallInst(CallInst &I) {
+ if (const Function *F = I.getCalledFunction()) {
+ switch ((Intrinsic::ID)F->getIntrinsicID()) {
+ default: DELEGATE(IntrinsicInst);
+ case Intrinsic::dbg_declare: DELEGATE(DbgDeclareInst);
+ case Intrinsic::dbg_value: DELEGATE(DbgValueInst);
+ case Intrinsic::memcpy: DELEGATE(MemCpyInst);
+ case Intrinsic::memmove: DELEGATE(MemMoveInst);
+ case Intrinsic::memset: DELEGATE(MemSetInst);
+ case Intrinsic::not_intrinsic: break;
+ }
+ }
+ DELEGATE(CallInst);
+ }
+
+ // An overload that will never actually be called; it is used only from dead
+ // code in the dispatching from opcodes to instruction subclasses.
+ RetTy delegateCallInst(Instruction &I) {
+ llvm_unreachable("delegateCallInst called for non-CallInst");
+ }
};
#undef DELEGATE
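
With the fan-out above, a visitor can override the new intrinsic hooks directly; anything unhandled still falls through to visitInstruction. A minimal sketch:

    #include "llvm/Support/InstVisitor.h"
    using namespace llvm;

    // delegateCallInst routes every CallInst to the most specific visitor
    // overridden here, so this counts both memcpy and memmove intrinsics.
    struct MemTransferCounter : InstVisitor<MemTransferCounter> {
      unsigned Count;
      MemTransferCounter() : Count(0) {}
      void visitMemTransferInst(MemTransferInst &I) { ++Count; }
    };
    // Usage, where F is an llvm::Function: MemTransferCounter C; C.visit(F);
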
diff --git a/include/llvm/Support/IntegersSubset.h b/include/llvm/Support/IntegersSubset.h
new file mode 100644
index 0000000..bb9e769
--- /dev/null
+++ b/include/llvm/Support/IntegersSubset.h
@@ -0,0 +1,541 @@
+//===-- llvm/IntegersSubset.h - The subset of integers ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// @file
+/// This file contains a class that implements a constant set of ranges:
+/// [<Low0,High0>,...,<LowN,HighN>]. Initially, this class was created for
+/// SwitchInst and was used for a case value representation that may contain
+/// multiple ranges for a single successor.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CONSTANTRANGESSET_H_
+#define CONSTANTRANGESSET_H_
+
+#include <list>
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
+
+namespace llvm {
+
+ // The IntItem is a wrapper for APInt.
+ // 1. It determines the signedness of the integer, which allows the use of
+ // the comparison operators >, <, >=, <=; as a result we get shorter and
+ // cleaner constructions.
+ // 2. It helps to implement PR1255 (case ranges) as a series of small patches.
+ // 3. Currently we can interpret IntItem both as a ConstantInt and as an
+ // APInt. This allows SwitchInst methods that work with ConstantInt to be
+ // provided for non-updated passes, while new methods can use the APInt
+ // interface.
+ // 4. IntItem can be easily replaced with APInt.
+
+ // The set of macros that propagate APInt operators to the IntItem.
+
+#define INT_ITEM_DEFINE_COMPARISON(op,func) \
+ bool operator op (const APInt& RHS) const { \
+ return getAPIntValue().func(RHS); \
+ }
+
+#define INT_ITEM_DEFINE_UNARY_OP(op) \
+ IntItem operator op () const { \
+ APInt res = op(getAPIntValue()); \
+ Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
+ return IntItem(cast<ConstantInt>(NewVal)); \
+ }
+
+#define INT_ITEM_DEFINE_BINARY_OP(op) \
+ IntItem operator op (const APInt& RHS) const { \
+ APInt res = getAPIntValue() op RHS; \
+ Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
+ return IntItem(cast<ConstantInt>(NewVal)); \
+ }
+
+#define INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(op) \
+ IntItem& operator op (const APInt& RHS) {\
+ APInt res = getAPIntValue();\
+ res op RHS; \
+ Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
+ ConstantIntVal = cast<ConstantInt>(NewVal); \
+ return *this; \
+ }
+
+#define INT_ITEM_DEFINE_PREINCDEC(op) \
+ IntItem& operator op () { \
+ APInt res = getAPIntValue(); \
+ op(res); \
+ Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
+ ConstantIntVal = cast<ConstantInt>(NewVal); \
+ return *this; \
+ }
+
+#define INT_ITEM_DEFINE_POSTINCDEC(op) \
+ IntItem& operator op (int) { \
+ APInt res = getAPIntValue();\
+ op(res); \
+ Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
+ OldConstantIntVal = ConstantIntVal; \
+ ConstantIntVal = cast<ConstantInt>(NewVal); \
+ return IntItem(OldConstantIntVal); \
+ }
+
+#define INT_ITEM_DEFINE_OP_STANDARD_INT(RetTy, op, IntTy) \
+ RetTy operator op (IntTy RHS) const { \
+ return (*this) op APInt(getAPIntValue().getBitWidth(), RHS); \
+ }
+
+class IntItem {
+ ConstantInt *ConstantIntVal;
+ const APInt* APIntVal;
+ IntItem(const ConstantInt *V) :
+ ConstantIntVal(const_cast<ConstantInt*>(V)),
+ APIntVal(&ConstantIntVal->getValue()){}
+ const APInt& getAPIntValue() const {
+ return *APIntVal;
+ }
+public:
+
+ IntItem() {}
+
+ operator const APInt&() const {
+ return getAPIntValue();
+ }
+
+ // Propagate APInt operators.
+ // Note that
+ // /, /=, >>, >>= are not implemented in APInt.
+ // <<= is implemented for an unsigned RHS, but not for an APInt RHS.
+
+ INT_ITEM_DEFINE_COMPARISON(<, ult)
+ INT_ITEM_DEFINE_COMPARISON(>, ugt)
+ INT_ITEM_DEFINE_COMPARISON(<=, ule)
+ INT_ITEM_DEFINE_COMPARISON(>=, uge)
+
+ INT_ITEM_DEFINE_COMPARISON(==, eq)
+ INT_ITEM_DEFINE_OP_STANDARD_INT(bool,==,uint64_t)
+
+ INT_ITEM_DEFINE_COMPARISON(!=, ne)
+ INT_ITEM_DEFINE_OP_STANDARD_INT(bool,!=,uint64_t)
+
+ INT_ITEM_DEFINE_BINARY_OP(*)
+ INT_ITEM_DEFINE_BINARY_OP(+)
+ INT_ITEM_DEFINE_OP_STANDARD_INT(IntItem,+,uint64_t)
+ INT_ITEM_DEFINE_BINARY_OP(-)
+ INT_ITEM_DEFINE_OP_STANDARD_INT(IntItem,-,uint64_t)
+ INT_ITEM_DEFINE_BINARY_OP(<<)
+ INT_ITEM_DEFINE_OP_STANDARD_INT(IntItem,<<,unsigned)
+ INT_ITEM_DEFINE_BINARY_OP(&)
+ INT_ITEM_DEFINE_BINARY_OP(^)
+ INT_ITEM_DEFINE_BINARY_OP(|)
+
+ INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(*=)
+ INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(+=)
+ INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(-=)
+ INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(&=)
+ INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(^=)
+ INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(|=)
+
+ // Special case for <<=
+ IntItem& operator <<= (unsigned RHS) {
+ APInt res = getAPIntValue();
+ res <<= RHS;
+ Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res);
+ ConstantIntVal = cast<ConstantInt>(NewVal);
+ return *this;
+ }
+
+ INT_ITEM_DEFINE_UNARY_OP(-)
+ INT_ITEM_DEFINE_UNARY_OP(~)
+
+ INT_ITEM_DEFINE_PREINCDEC(++)
+ INT_ITEM_DEFINE_PREINCDEC(--)
+
+ // A set of workarounds, needed since we currently use a ConstantInt-based
+ // integer implementation.
+
+ static IntItem fromConstantInt(const ConstantInt *V) {
+ return IntItem(V);
+ }
+ static IntItem fromType(Type* Ty, const APInt& V) {
+ ConstantInt *C = cast<ConstantInt>(ConstantInt::get(Ty, V));
+ return fromConstantInt(C);
+ }
+ static IntItem withImplLikeThis(const IntItem& LikeThis, const APInt& V) {
+ ConstantInt *C = cast<ConstantInt>(ConstantInt::get(
+ LikeThis.ConstantIntVal->getContext(), V));
+ return fromConstantInt(C);
+ }
+ ConstantInt *toConstantInt() const {
+ return ConstantIntVal;
+ }
+};
+
+template<class IntType>
+class IntRange {
+protected:
+ IntType Low;
+ IntType High;
+ bool IsEmpty : 1;
+ bool IsSingleNumber : 1;
+
+public:
+ typedef IntRange<IntType> self;
+ typedef std::pair<self, self> SubRes;
+
+ IntRange() : IsEmpty(true) {}
+ IntRange(const self &RHS) :
+ Low(RHS.Low), High(RHS.High),
+ IsEmpty(RHS.IsEmpty), IsSingleNumber(RHS.IsSingleNumber) {}
+ IntRange(const IntType &C) :
+ Low(C), High(C), IsEmpty(false), IsSingleNumber(true) {}
+
+ IntRange(const IntType &L, const IntType &H) : Low(L), High(H),
+ IsEmpty(false), IsSingleNumber(Low == High) {}
+
+ bool isEmpty() const { return IsEmpty; }
+ bool isSingleNumber() const { return IsSingleNumber; }
+
+ const IntType& getLow() const {
+ assert(!IsEmpty && "Range is empty.");
+ return Low;
+ }
+ const IntType& getHigh() const {
+ assert(!IsEmpty && "Range is empty.");
+ return High;
+ }
+
+ bool operator<(const self &RHS) const {
+ assert(!IsEmpty && "Left range is empty.");
+ assert(!RHS.IsEmpty && "Right range is empty.");
+ if (Low == RHS.Low) {
+ if (High > RHS.High)
+ return true;
+ return false;
+ }
+ if (Low < RHS.Low)
+ return true;
+ return false;
+ }
+
+ bool operator==(const self &RHS) const {
+ assert(!IsEmpty && "Left range is empty.");
+ assert(!RHS.IsEmpty && "Right range is empty.");
+ return Low == RHS.Low && High == RHS.High;
+ }
+
+ bool operator!=(const self &RHS) const {
+ return !operator ==(RHS);
+ }
+
+ static bool LessBySize(const self &LHS, const self &RHS) {
+ return (LHS.High - LHS.Low) < (RHS.High - RHS.Low);
+ }
+
+ bool isInRange(const IntType &IntVal) const {
+ assert(!IsEmpty && "Range is empty.");
+ return IntVal >= Low && IntVal <= High;
+ }
+
+ SubRes sub(const self &RHS) const {
+ SubRes Res;
+
+ // Either RHS is broader and includes this range, or it does not
+ // intersect this range at all.
+ if (!isInRange(RHS.Low) && !isInRange(RHS.High)) {
+
+ // If RHS is broader, it is enough to check
+ // only one bound in this case.
+ if (RHS.isInRange(Low))
+ return std::make_pair(self(Low, High), self());
+
+ return Res;
+ }
+
+ if (Low < RHS.Low) {
+ Res.first.Low = Low;
+ IntType NewHigh = RHS.Low;
+ --NewHigh;
+ Res.first.High = NewHigh;
+ }
+ if (High > RHS.High) {
+ IntType NewLow = RHS.High;
+ ++NewLow;
+ Res.second.Low = NewLow;
+ Res.second.High = High;
+ }
+ return Res;
+ }
+ };
+
+//===----------------------------------------------------------------------===//
+/// IntegersSubsetGeneric - a class that implements a subset of integers. It
+/// consists of ranges and single numbers.
+template <class IntTy>
+class IntegersSubsetGeneric {
+public:
+ // Use Chris Lattner's idea, initially described here:
+ // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120213/136954.html
+ // In short, for more compact memory consumption we can store a flat
+ // collection of numbers and define a range as a pair of indices.
+ // That way we can save some memory on 32-bit machines.
+ typedef std::vector<IntTy> FlatCollectionTy;
+ typedef std::pair<IntTy*, IntTy*> RangeLinkTy;
+ typedef std::vector<RangeLinkTy> RangeLinksTy;
+ typedef typename RangeLinksTy::const_iterator RangeLinksConstIt;
+
+ typedef IntegersSubsetGeneric<IntTy> self;
+
+protected:
+
+ FlatCollectionTy FlatCollection;
+ RangeLinksTy RangeLinks;
+
+ bool IsSingleNumber;
+ bool IsSingleNumbersOnly;
+
+public:
+
+ template<class RangesCollectionTy>
+ explicit IntegersSubsetGeneric(const RangesCollectionTy& Links) {
+ assert(Links.size() && "Empty ranges are not allowed.");
+
+ // For a big set of single numbers this consumes additional RAM space,
+ // but it avoids additional reallocations.
+ FlatCollection.reserve(Links.size() * 2);
+ RangeLinks.reserve(Links.size());
+ IsSingleNumbersOnly = true;
+ for (typename RangesCollectionTy::const_iterator i = Links.begin(),
+ e = Links.end(); i != e; ++i) {
+ RangeLinkTy RangeLink;
+ FlatCollection.push_back(i->getLow());
+ RangeLink.first = &FlatCollection.back();
+ if (i->getLow() != i->getHigh()) {
+ FlatCollection.push_back(i->getHigh());
+ IsSingleNumbersOnly = false;
+ }
+ RangeLink.second = &FlatCollection.back();
+ RangeLinks.push_back(RangeLink);
+ }
+ IsSingleNumber = IsSingleNumbersOnly && RangeLinks.size() == 1;
+ }
+
+ IntegersSubsetGeneric(const self& RHS) {
+ *this = RHS;
+ }
+
+ self& operator=(const self& RHS) {
+ FlatCollection.clear();
+ RangeLinks.clear();
+ FlatCollection.reserve(RHS.RangeLinks.size() * 2);
+ RangeLinks.reserve(RHS.RangeLinks.size());
+ for (RangeLinksConstIt i = RHS.RangeLinks.begin(), e = RHS.RangeLinks.end();
+ i != e; ++i) {
+ RangeLinkTy RangeLink;
+ FlatCollection.push_back(*(i->first));
+ RangeLink.first = &FlatCollection.back();
+ if (i->first != i->second)
+ FlatCollection.push_back(*(i->second));
+ RangeLink.second = &FlatCollection.back();
+ RangeLinks.push_back(RangeLink);
+ }
+ IsSingleNumber = RHS.IsSingleNumber;
+ IsSingleNumbersOnly = RHS.IsSingleNumbersOnly;
+ return *this;
+ }
+
+ typedef IntRange<IntTy> Range;
+
+ /// Checks whether the given constant satisfies this case. Returns
+ /// true if it equals one of the contained values or belongs to one of
+ /// the contained ranges.
+ bool isSatisfies(const IntTy &CheckingVal) const {
+ if (IsSingleNumber)
+ return FlatCollection.front() == CheckingVal;
+ if (IsSingleNumbersOnly)
+ return std::find(FlatCollection.begin(),
+ FlatCollection.end(),
+ CheckingVal) != FlatCollection.end();
+
+ for (unsigned i = 0, e = getNumItems(); i < e; ++i) {
+ if (RangeLinks[i].first == RangeLinks[i].second) {
+ if (*RangeLinks[i].first == CheckingVal)
+ return true;
+ } else if (*RangeLinks[i].first <= CheckingVal &&
+ *RangeLinks[i].second >= CheckingVal)
+ return true;
+ }
+ return false;
+ }
+
+ /// Returns the set's item with the given index.
+ Range getItem(unsigned idx) const {
+ const RangeLinkTy &Link = RangeLinks[idx];
+ if (Link.first != Link.second)
+ return Range(*Link.first, *Link.second);
+ else
+ return Range(*Link.first);
+ }
+
+ /// Returns the number of items (ranges) stored in the set.
+ unsigned getNumItems() const {
+ return RangeLinks.size();
+ }
+
+ /// Returns true if the whole subset contains a single element.
+ bool isSingleNumber() const {
+ return IsSingleNumber;
+ }
+
+ /// Returns true if the whole subset contains only single numbers, no ranges.
+ bool isSingleNumbersOnly() const {
+ return IsSingleNumbersOnly;
+ }
+
+ /// Does the same as getItem(idx).isSingleNumber(), but
+ /// is faster, since it avoids creating a temporary range object.
+ bool isSingleNumber(unsigned idx) const {
+ return RangeLinks[idx].first == RangeLinks[idx].second;
+ }
+
+ /// Returns the size of the set, which equals the number of all single
+ /// values plus the sizes of all ranges.
+ /// The set of ranges is treated as a flat collection of numbers.
+ /// E.g.: for the set [<0>, <1>, <4,8>] the size will be 7;
+ /// for the set [<0>, <1>, <5>] the size will be 3.
+ unsigned getSize() const {
+ APInt sz(((const APInt&)getItem(0).getLow()).getBitWidth(), 0);
+ for (unsigned i = 0, e = getNumItems(); i != e; ++i) {
+ const APInt &Low = getItem(i).getLow();
+ const APInt &High = getItem(i).getHigh();
+ APInt S = High - Low + 1;
+ sz += S;
+ }
+ return sz.getZExtValue();
+ }
+
+ /// Allows access to a single value even if it belongs to some range.
+ /// The set of ranges is treated as a flat collection of numbers:
+ /// [<1>, <4,8>] is considered as [1,4,5,6,7,8].
+ /// For the set [<1>, <4,8>], getSingleValue(3) returns 6.
+ APInt getSingleValue(unsigned idx) const {
+ APInt sz(((const APInt&)getItem(0).getLow()).getBitWidth(), 0);
+ for (unsigned i = 0, e = getNumItems(); i != e; ++i) {
+ const APInt &Low = getItem(i).getLow();
+ const APInt &High = getItem(i).getHigh();
+ APInt S = High - Low + 1;
+ APInt oldSz = sz;
+ sz += S;
+ if (sz.ugt(idx)) {
+ APInt Res = Low;
+ APInt Offset(oldSz.getBitWidth(), idx);
+ Offset -= oldSz;
+ Res += Offset;
+ return Res;
+ }
+ }
+ assert(0 && "Index exceeds high border.");
+ return sz;
+ }
+
+ /// Does the same as getSingleValue, but works only if the subset
+ /// contains single numbers only.
+ const IntTy& getSingleNumber(unsigned idx) const {
+ assert(IsSingleNumbersOnly && "This method works properly if subset "
+ "contains single numbers only.");
+ return FlatCollection[idx];
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// IntegersSubset - currently an extension of IntegersSubsetGeneric
+/// that also supports conversion to/from a Constant* object.
+class IntegersSubset : public IntegersSubsetGeneric<IntItem> {
+
+ typedef IntegersSubsetGeneric<IntItem> ParentTy;
+
+ Constant *Holder;
+
+ static unsigned getNumItemsFromConstant(Constant *C) {
+ return cast<ArrayType>(C->getType())->getNumElements();
+ }
+
+ static Range getItemFromConstant(Constant *C, unsigned idx) {
+ const Constant *CV = C->getAggregateElement(idx);
+
+ unsigned NumEls = cast<VectorType>(CV->getType())->getNumElements();
+ switch (NumEls) {
+ case 1:
+ return Range(IntItem::fromConstantInt(
+ cast<ConstantInt>(CV->getAggregateElement(0U))),
+ IntItem::fromConstantInt(cast<ConstantInt>(
+ cast<ConstantInt>(CV->getAggregateElement(0U)))));
+ case 2:
+ return Range(IntItem::fromConstantInt(
+ cast<ConstantInt>(CV->getAggregateElement(0U))),
+ IntItem::fromConstantInt(
+ cast<ConstantInt>(CV->getAggregateElement(1))));
+ default:
+ assert(0 && "Only pairs and single numbers are allowed here.");
+ return Range();
+ }
+ }
+
+ std::vector<Range> rangesFromConstant(Constant *C) {
+ unsigned NumItems = getNumItemsFromConstant(C);
+ std::vector<Range> r;
+ r.reserve(NumItems);
+ for (unsigned i = 0, e = NumItems; i != e; ++i)
+ r.push_back(getItemFromConstant(C, i));
+ return r;
+ }
+
+public:
+
+ explicit IntegersSubset(Constant *C) : ParentTy(rangesFromConstant(C)),
+ Holder(C) {}
+
+ IntegersSubset(const IntegersSubset& RHS) :
+ ParentTy(*(const ParentTy *)&RHS), // FIXME: tweak for msvc.
+ Holder(RHS.Holder) {}
+
+ template<class RangesCollectionTy>
+ explicit IntegersSubset(const RangesCollectionTy& Src) : ParentTy(Src) {
+ std::vector<Constant*> Elts;
+ Elts.reserve(Src.size());
+ for (typename RangesCollectionTy::const_iterator i = Src.begin(),
+ e = Src.end(); i != e; ++i) {
+ const Range &R = *i;
+ std::vector<Constant*> r;
+ if (R.isSingleNumber()) {
+ r.reserve(2);
+ // FIXME: Since we currently have ConstantInt-based numbers,
+ // use a hack-conversion of IntItem to ConstantInt.
+ r.push_back(R.getLow().toConstantInt());
+ r.push_back(R.getHigh().toConstantInt());
+ } else {
+ r.reserve(1);
+ r.push_back(R.getLow().toConstantInt());
+ }
+ Constant *CV = ConstantVector::get(r);
+ Elts.push_back(CV);
+ }
+ ArrayType *ArrTy =
+ ArrayType::get(Elts.front()->getType(), (uint64_t)Elts.size());
+ Holder = ConstantArray::get(ArrTy, Elts);
+ }
+
+ operator Constant*() { return Holder; }
+ operator const Constant*() const { return Holder; }
+ Constant *operator->() { return Holder; }
+ const Constant *operator->() const { return Holder; }
+};
+
+}
+
+#endif /* CONSTANTRANGESSET_H_ */
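
A sketch of building and querying a subset through the templated constructor above; Int32Ty is an assumed i32 Type*:

    #include "llvm/Support/IntegersSubset.h"
    #include <vector>
    using namespace llvm;

    bool demo(Type *Int32Ty) {
      typedef IntRange<IntItem> Range;
      std::vector<Range> Ranges;
      Ranges.push_back(Range(IntItem::fromType(Int32Ty, APInt(32, 0)),
                             IntItem::fromType(Int32Ty, APInt(32, 1)))); // <0,1>
      Ranges.push_back(Range(IntItem::fromType(Int32Ty, APInt(32, 4)),
                             IntItem::fromType(Int32Ty, APInt(32, 8)))); // <4,8>
      IntegersSubset S(Ranges);
      // Flat view is [0,1,4,5,6,7,8], so the size is 7 and 5 is contained.
      return S.getSize() == 7 &&
             S.isSatisfies(IntItem::fromType(Int32Ty, APInt(32, 5)));
    }
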
diff --git a/include/llvm/Support/IntegersSubsetMapping.h b/include/llvm/Support/IntegersSubsetMapping.h
new file mode 100644
index 0000000..cab18dc
--- /dev/null
+++ b/include/llvm/Support/IntegersSubsetMapping.h
@@ -0,0 +1,580 @@
+//===- IntegersSubsetMapping.h - Mapping subset ==> Successor ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// @file
+/// IntegersSubsetMapping is a mapping from A to B, where
+/// items in A are subsets of integers,
+/// and items in B are pointers (Successors).
+/// If the user adds another subset for a successor that already
+/// exists in the mapping, IntegersSubsetMapping merges the existing
+/// subset with the added one.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CRSBUILDER_H_
+#define CRSBUILDER_H_
+
+#include "llvm/Support/IntegersSubset.h"
+#include <list>
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+template <class SuccessorClass,
+ class IntegersSubsetTy = IntegersSubset,
+ class IntTy = IntItem>
+class IntegersSubsetMapping {
+ // FIXME: Too many similar iterator typedefs with similar names.
+ // - Rename RangeIterator to the cluster iterator.
+ // - Remove unused "add" methods.
+ // - Class contents need cleaning.
+public:
+
+ typedef IntRange<IntTy> RangeTy;
+
+ struct RangeEx : public RangeTy {
+ RangeEx() : Weight(1) {}
+ RangeEx(const RangeTy &R) : RangeTy(R), Weight(1) {}
+ RangeEx(const IntTy &C) : RangeTy(C), Weight(1) {}
+ RangeEx(const IntTy &L, const IntTy &H) : RangeTy(L, H), Weight(1) {}
+ RangeEx(const IntTy &L, const IntTy &H, unsigned W) :
+ RangeTy(L, H), Weight(W) {}
+ unsigned Weight;
+ };
+
+ typedef std::pair<RangeEx, SuccessorClass*> Cluster;
+
+ typedef std::list<RangeTy> RangesCollection;
+ typedef typename RangesCollection::iterator RangesCollectionIt;
+ typedef typename RangesCollection::const_iterator RangesCollectionConstIt;
+ typedef IntegersSubsetMapping<SuccessorClass, IntegersSubsetTy, IntTy> self;
+
+protected:
+
+ typedef std::list<Cluster> CaseItems;
+ typedef typename CaseItems::iterator CaseItemIt;
+ typedef typename CaseItems::const_iterator CaseItemConstIt;
+
+ // TODO: Change unclean CRS prefixes to SubsetMap for example.
+ typedef std::map<SuccessorClass*, RangesCollection > CRSMap;
+ typedef typename CRSMap::iterator CRSMapIt;
+
+ struct ClustersCmp {
+ bool operator()(const Cluster &C1, const Cluster &C2) {
+ return C1.first < C2.first;
+ }
+ };
+
+ CaseItems Items;
+ bool Sorted;
+
+ bool isIntersected(CaseItemIt& LItem, CaseItemIt& RItem) {
+ return LItem->first.getHigh() >= RItem->first.getLow();
+ }
+
+ bool isJoinable(CaseItemIt& LItem, CaseItemIt& RItem) {
+ if (LItem->second != RItem->second) {
+ assert(!isIntersected(LItem, RItem) &&
+ "Intersected items with different successors!");
+ return false;
+ }
+ APInt RLow = RItem->first.getLow();
+ if (RLow != APInt::getNullValue(RLow.getBitWidth()))
+ --RLow;
+ return LItem->first.getHigh() >= RLow;
+ }
+
+ void sort() {
+ if (!Sorted) {
+ std::vector<Cluster> clustersVector;
+ clustersVector.reserve(Items.size());
+ clustersVector.insert(clustersVector.begin(), Items.begin(), Items.end());
+ std::sort(clustersVector.begin(), clustersVector.end(), ClustersCmp());
+ Items.clear();
+ Items.insert(Items.begin(), clustersVector.begin(), clustersVector.end());
+ Sorted = true;
+ }
+ }
+
+ enum DiffProcessState {
+ L_OPENED,
+ INTERSECT_OPENED,
+ R_OPENED,
+ ALL_IS_CLOSED
+ };
+
+ class DiffStateMachine {
+
+ DiffProcessState State;
+ IntTy OpenPt;
+ SuccessorClass *CurrentLSuccessor;
+ SuccessorClass *CurrentRSuccessor;
+
+ self *LeftMapping;
+ self *IntersectionMapping;
+ self *RightMapping;
+
+ public:
+
+ typedef
+ IntegersSubsetMapping<SuccessorClass, IntegersSubsetTy, IntTy> MappingTy;
+
+ DiffStateMachine(MappingTy *L,
+ MappingTy *Intersection,
+ MappingTy *R) :
+ State(ALL_IS_CLOSED),
+ LeftMapping(L),
+ IntersectionMapping(Intersection),
+ RightMapping(R)
+ {}
+
+ void onLOpen(const IntTy &Pt, SuccessorClass *S) {
+ switch (State) {
+ case R_OPENED:
+ if (Pt > OpenPt/*Don't add empty ranges.*/ && RightMapping)
+ RightMapping->add(OpenPt, Pt-1, CurrentRSuccessor);
+ State = INTERSECT_OPENED;
+ break;
+ case ALL_IS_CLOSED:
+ State = L_OPENED;
+ break;
+ default:
+ assert(0 && "Got unexpected point.");
+ break;
+ }
+ CurrentLSuccessor = S;
+ OpenPt = Pt;
+ }
+
+ void onLClose(const IntTy &Pt) {
+ switch (State) {
+ case L_OPENED:
+ assert(Pt >= OpenPt &&
+ "Subset is not sorted or contains overlapped ranges");
+ if (LeftMapping)
+ LeftMapping->add(OpenPt, Pt, CurrentLSuccessor);
+ State = ALL_IS_CLOSED;
+ break;
+ case INTERSECT_OPENED:
+ if (IntersectionMapping)
+ IntersectionMapping->add(OpenPt, Pt, CurrentLSuccessor);
+ OpenPt = Pt + 1;
+ State = R_OPENED;
+ break;
+ default:
+ assert(0 && "Got unexpected point.");
+ break;
+ }
+ }
+
+ void onROpen(const IntTy &Pt, SuccessorClass *S) {
+ switch (State) {
+ case L_OPENED:
+ if (Pt > OpenPt && LeftMapping)
+ LeftMapping->add(OpenPt, Pt-1, CurrentLSuccessor);
+ State = INTERSECT_OPENED;
+ break;
+ case ALL_IS_CLOSED:
+ State = R_OPENED;
+ break;
+ default:
+ assert(0 && "Got unexpected point.");
+ break;
+ }
+ CurrentRSuccessor = S;
+ OpenPt = Pt;
+ }
+
+ void onRClose(const IntTy &Pt) {
+ switch (State) {
+ case R_OPENED:
+ assert(Pt >= OpenPt &&
+ "Subset is not sorted or contains overlapped ranges");
+ if (RightMapping)
+ RightMapping->add(OpenPt, Pt, CurrentRSuccessor);
+ State = ALL_IS_CLOSED;
+ break;
+ case INTERSECT_OPENED:
+ if (IntersectionMapping)
+ IntersectionMapping->add(OpenPt, Pt, CurrentLSuccessor);
+ OpenPt = Pt + 1;
+ State = L_OPENED;
+ break;
+ default:
+ assert(0 && "Got unexpected point.");
+ break;
+ }
+ }
+
+ void onLROpen(const IntTy &Pt,
+ SuccessorClass *LS,
+ SuccessorClass *RS) {
+ switch (State) {
+ case ALL_IS_CLOSED:
+ State = INTERSECT_OPENED;
+ break;
+ default:
+ assert(0 && "Got unexpected point.");
+ break;
+ }
+ CurrentLSuccessor = LS;
+ CurrentRSuccessor = RS;
+ OpenPt = Pt;
+ }
+
+ void onLRClose(const IntTy &Pt) {
+ switch (State) {
+ case INTERSECT_OPENED:
+ if (IntersectionMapping)
+ IntersectionMapping->add(OpenPt, Pt, CurrentLSuccessor);
+ State = ALL_IS_CLOSED;
+ break;
+ default:
+ assert(0 && "Got unexpected point.");
+ break;
+ }
+ }
+
+ bool isLOpened() { return State == L_OPENED; }
+ bool isROpened() { return State == R_OPENED; }
+ };
+
+public:
+
+ // Don't expose CaseItems itself, and don't allow the Items to be edited
+ // directly. Just give the user a way to iterate over the internal
+ // collection by sharing the iterator, begin() and end(). Editing should
+ // be controlled by the factory.
+ typedef CaseItemIt RangeIterator;
+
+ typedef std::pair<SuccessorClass*, IntegersSubsetTy> Case;
+ typedef std::list<Case> Cases;
+ typedef typename Cases::iterator CasesIt;
+
+ IntegersSubsetMapping() {
+ Sorted = false;
+ }
+
+ bool verify() {
+ RangeIterator DummyErrItem;
+ return verify(DummyErrItem);
+ }
+
+ bool verify(RangeIterator& errItem) {
+ if (Items.empty())
+ return true;
+ sort();
+ for (CaseItemIt j = Items.begin(), i = j++, e = Items.end();
+ j != e; i = j++) {
+ if (isIntersected(i, j) && i->second != j->second) {
+ errItem = j;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool isOverlapped(self &RHS) {
+ if (Items.empty() || RHS.empty())
+ return true;
+
+ for (CaseItemIt L = Items.begin(), R = RHS.Items.begin(),
+ el = Items.end(), er = RHS.Items.end(); L != el && R != er;) {
+
+ const RangeTy &LRange = L->first;
+ const RangeTy &RRange = R->first;
+
+ if (LRange.getLow() > RRange.getLow()) {
+ if (RRange.isSingleNumber() || LRange.getLow() > RRange.getHigh())
+ ++R;
+ else
+ return true;
+ } else if (LRange.getLow() < RRange.getLow()) {
+ if (LRange.isSingleNumber() || LRange.getHigh() < RRange.getLow())
+ ++L;
+ else
+ return true;
+ } else // iRange.getLow() == jRange.getLow()
+ return true;
+ }
+ return false;
+ }
+
+
+ void optimize() {
+ if (Items.size() < 2)
+ return;
+ sort();
+ CaseItems OldItems = Items;
+ Items.clear();
+ const IntTy *Low = &OldItems.begin()->first.getLow();
+ const IntTy *High = &OldItems.begin()->first.getHigh();
+ unsigned Weight = 1;
+ SuccessorClass *Successor = OldItems.begin()->second;
+ for (CaseItemIt j = OldItems.begin(), i = j++, e = OldItems.end();
+ j != e; i = j++) {
+ if (isJoinable(i, j)) {
+ const IntTy *CurHigh = &j->first.getHigh();
+ ++Weight;
+ if (*CurHigh > *High)
+ High = CurHigh;
+ } else {
+ RangeEx R(*Low, *High, Weight);
+ add(R, Successor);
+ Low = &j->first.getLow();
+ High = &j->first.getHigh();
+ Weight = 1;
+ Successor = j->second;
+ }
+ }
+ RangeEx R(*Low, *High, Weight);
+ add(R, Successor);
+ // We recollected the Items, but we kept them sorted.
+ Sorted = true;
+ }
+
+ /// Adds a constant value.
+ void add(const IntTy &C, SuccessorClass *S = 0) {
+ RangeTy R(C);
+ add(R, S);
+ }
+
+ /// Adds a range.
+ void add(const IntTy &Low, const IntTy &High, SuccessorClass *S = 0) {
+ RangeTy R(Low, High);
+ add(R, S);
+ }
+ void add(const RangeTy &R, SuccessorClass *S = 0) {
+ RangeEx REx = R;
+ add(REx, S);
+ }
+ void add(const RangeEx &R, SuccessorClass *S = 0) {
+ Items.push_back(std::make_pair(R, S));
+ Sorted = false;
+ }
+
+ /// Adds all ranges and values from the given ranges set to the current
+ /// mapping.
+ void add(const IntegersSubsetTy &CRS, SuccessorClass *S = 0) {
+ for (unsigned i = 0, e = CRS.getNumItems(); i < e; ++i) {
+ RangeTy R = CRS.getItem(i);
+ add(R, S);
+ }
+ }
+
+ void add(self& RHS) {
+ Items.insert(Items.end(), RHS.Items.begin(), RHS.Items.end());
+ }
+
+ void add(self& RHS, SuccessorClass *S) {
+ for (CaseItemIt i = RHS.Items.begin(), e = RHS.Items.end(); i != e; ++i)
+ add(i->first, S);
+ }
+
+ void add(const RangesCollection& RHS, SuccessorClass *S = 0) {
+ for (RangesCollectionConstIt i = RHS.begin(), e = RHS.end(); i != e; ++i)
+ add(*i, S);
+ }
+
+ /// Removes the item at the given position from the set.
+ void removeItem(RangeIterator i) { Items.erase(i); }
+
+ /// Moves a whole case from the current mapping to the NewMapping object.
+ void detachCase(self& NewMapping, SuccessorClass *Succ) {
+ for (CaseItemIt i = Items.begin(); i != Items.end();)
+ if (i->second == Succ) {
+ NewMapping.add(i->first, i->second);
+ Items.erase(i++);
+ } else
+ ++i;
+ }
+
+ /// Removes all clusters for given successor.
+ void removeCase(SuccessorClass *Succ) {
+ for (CaseItemIt i = Items.begin(); i != Items.end();)
+ if (i->second == Succ) {
+ Items.erase(i++);
+ } else
+ ++i;
+ }
+
+ /// Finds the successor that satisfies the given value.
+ SuccessorClass *findSuccessor(const IntTy& Val) {
+ for (CaseItemIt i = Items.begin(); i != Items.end(); ++i) {
+ if (i->first.isInRange(Val))
+ return i->second;
+ }
+ return 0;
+ }
+
+ /// Calculates the difference between this mapping and RHS.
+ /// THIS without RHS is placed into LExclude,
+ /// RHS without THIS is placed into RExclude,
+ /// THIS intersect RHS is placed into Intersection.
+ void diff(self *LExclude, self *Intersection, self *RExclude,
+ const self& RHS) {
+
+ DiffStateMachine Machine(LExclude, Intersection, RExclude);
+
+ CaseItemConstIt L = Items.begin(), R = RHS.Items.begin();
+ while (L != Items.end() && R != RHS.Items.end()) {
+ const Cluster &LCluster = *L;
+ const RangeEx &LRange = LCluster.first;
+ const Cluster &RCluster = *R;
+ const RangeEx &RRange = RCluster.first;
+
+ if (LRange.getHigh() < RRange.getLow()) {
+ Machine.onLOpen(LRange.getLow(), LCluster.second);
+ Machine.onLClose(LRange.getHigh());
+ ++L;
+ continue;
+ }
+
+ if (LRange.getLow() > RRange.getHigh()) {
+ Machine.onROpen(RRange.getLow(), RCluster.second);
+ Machine.onRClose(RRange.getHigh());
+ ++R;
+ continue;
+ }
+
+ if (LRange.getLow() < RRange.getLow()) {
+ // May be opened in previous iteration.
+ if (!Machine.isLOpened())
+ Machine.onLOpen(LRange.getLow(), LCluster.second);
+ Machine.onROpen(RRange.getLow(), RCluster.second);
+ }
+ else if (RRange.getLow() < LRange.getLow()) {
+ if (!Machine.isROpened())
+ Machine.onROpen(RRange.getLow(), RCluster.second);
+ Machine.onLOpen(LRange.getLow(), LCluster.second);
+ }
+ else
+ Machine.onLROpen(LRange.getLow(), LCluster.second, RCluster.second);
+
+ if (LRange.getHigh() < RRange.getHigh()) {
+ Machine.onLClose(LRange.getHigh());
+ ++L;
+ while(L != Items.end() && L->first.getHigh() < RRange.getHigh()) {
+ Machine.onLOpen(L->first.getLow(), L->second);
+ Machine.onLClose(L->first.getHigh());
+ ++L;
+ }
+ }
+ else if (RRange.getHigh() < LRange.getHigh()) {
+ Machine.onRClose(RRange.getHigh());
+ ++R;
+ while(R != RHS.Items.end() && R->first.getHigh() < LRange.getHigh()) {
+ Machine.onROpen(R->first.getLow(), R->second);
+ Machine.onRClose(R->first.getHigh());
+ ++R;
+ }
+ }
+ else {
+ Machine.onLRClose(LRange.getHigh());
+ ++L;
+ ++R;
+ }
+ }
+
+ if (L != Items.end()) {
+ if (Machine.isLOpened()) {
+ Machine.onLClose(L->first.getHigh());
+ ++L;
+ }
+ if (LExclude)
+ while (L != Items.end()) {
+ LExclude->add(L->first, L->second);
+ ++L;
+ }
+ } else if (R != RHS.Items.end()) {
+ if (Machine.isROpened()) {
+ Machine.onRClose(R->first.getHigh());
+ ++R;
+ }
+ if (RExclude)
+ while (R != RHS.Items.end()) {
+ RExclude->add(R->first, R->second);
+ ++R;
+ }
+ }
+ }
+
+ /// Builds the finalized case objects.
+ void getCases(Cases& TheCases, bool PreventMerging = false) {
+ // FIXME: PreventMerging is a temporary parameter.
+ // Currently a set of passes still knows nothing about
+ // switches with case ranges, and if these passes meet a switch
+ // with a complex case, the application crashes.
+ if (PreventMerging) {
+ for (RangeIterator i = this->begin(); i != this->end(); ++i) {
+ RangesCollection SingleRange;
+ SingleRange.push_back(i->first);
+ TheCases.push_back(std::make_pair(i->second,
+ IntegersSubsetTy(SingleRange)));
+ }
+ return;
+ }
+ CRSMap TheCRSMap;
+ for (RangeIterator i = this->begin(); i != this->end(); ++i)
+ TheCRSMap[i->second].push_back(i->first);
+ for (CRSMapIt i = TheCRSMap.begin(), e = TheCRSMap.end(); i != e; ++i)
+ TheCases.push_back(std::make_pair(i->first, IntegersSubsetTy(i->second)));
+ }
+
+ /// Builds the finalized case objects ignoring successor values, as though
+ /// all ranges belong to the same successor.
+ IntegersSubsetTy getCase() {
+ RangesCollection Ranges;
+ for (RangeIterator i = this->begin(); i != this->end(); ++i)
+ Ranges.push_back(i->first);
+ return IntegersSubsetTy(Ranges);
+ }
+
+ /// Returns a pointer to the case value if it is single-numbered, or 0
+ /// otherwise.
+ const IntTy* getCaseSingleNumber(SuccessorClass *Succ) {
+ const IntTy* Res = 0;
+ for (CaseItemIt i = Items.begin(); i != Items.end(); ++i)
+ if (i->second == Succ) {
+ if (!i->first.isSingleNumber())
+ return 0;
+ if (Res)
+ return 0;
+ else
+ Res = &(i->first.getLow());
+ }
+ return Res;
+ }
+
+ /// Returns true if there are no ranges or values inside.
+ bool empty() const { return Items.empty(); }
+
+ void clear() {
+ Items.clear();
+ // Don't reset the Sorted flag:
+ // 1. For an empty mapping it does not matter.
+ // 2. After the first item is added, the Sorted flag will be cleared.
+ }
+
+ // Returns the number of clusters.
+ unsigned size() const {
+ return Items.size();
+ }
+
+ RangeIterator begin() { return Items.begin(); }
+ RangeIterator end() { return Items.end(); }
+};
+
+class BasicBlock;
+typedef IntegersSubsetMapping<BasicBlock> IntegersSubsetToBB;
+
+}
+
+#endif /* CRSBUILDER_H_ */
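
A sketch of the typical clustering flow with the IntegersSubsetToBB typedef above; the IntItem values are assumed to be built elsewhere, e.g. via IntItem::fromType:

    #include "llvm/Support/IntegersSubsetMapping.h"
    using namespace llvm;

    void clusterCases(BasicBlock *Succ, IntItem Zero, IntItem One, IntItem Two) {
      IntegersSubsetToBB Mapping;
      Mapping.add(Zero, Succ);     // single value 0 -> Succ
      Mapping.add(One, Two, Succ); // range [1,2] -> Succ
      Mapping.optimize();          // joins adjacent clusters into [0,2]
      IntegersSubsetToBB::Cases TheCases;
      Mapping.getCases(TheCases);  // one case: (Succ, {[0,2]})
    }
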
diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h
new file mode 100644
index 0000000..00c7eea
--- /dev/null
+++ b/include/llvm/Support/LEB128.h
@@ -0,0 +1,58 @@
+//===- llvm/Support/LEB128.h - [SU]LEB128 utility functions -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares some utility functions for encoding SLEB128 and
+// ULEB128 values.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEM_LEB128_H
+#define LLVM_SYSTEM_LEB128_H
+
+#include <llvm/Support/raw_ostream.h>
+
+namespace llvm {
+
+/// Utility function to encode a SLEB128 value.
+static inline void encodeSLEB128(int64_t Value, raw_ostream &OS) {
+ bool More;
+ do {
+ uint8_t Byte = Value & 0x7f;
+ // NOTE: this assumes that the signed shift is an arithmetic right shift.
+ Value >>= 7;
+ More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
+ ((Value == -1) && ((Byte & 0x40) != 0))));
+ if (More)
+ Byte |= 0x80; // Mark this byte to indicate that more bytes will follow.
+ OS << char(Byte);
+ } while (More);
+}
+
+/// Utility function to encode a ULEB128 value.
+static inline void encodeULEB128(uint64_t Value, raw_ostream &OS,
+ unsigned Padding = 0) {
+ do {
+ uint8_t Byte = Value & 0x7f;
+ Value >>= 7;
+ if (Value != 0 || Padding != 0)
+ Byte |= 0x80; // Mark this byte to indicate that more bytes will follow.
+ OS << char(Byte);
+ } while (Value != 0);
+
+ // Pad with 0x80 and emit a null byte at the end.
+ if (Padding != 0) {
+ for (; Padding != 1; --Padding)
+ OS << '\x80';
+ OS << '\x00';
+ }
+}
+
+} // namespace llvm
+
+#endif // LLVM_SYSTEM_LEB128_H
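
For reference, the classic DWARF encoding examples pushed through these helpers; raw_svector_ostream comes from Support/raw_ostream.h:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/LEB128.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void emitLEB() {
      SmallString<8> Storage;
      raw_svector_ostream OS(Storage);
      encodeULEB128(624485, OS); // emits 0xE5 0x8E 0x26
      encodeSLEB128(-2, OS);     // emits 0x7E
    }
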
diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h
index 44a7a79..7f28c3f 100644
--- a/include/llvm/Support/MachO.h
+++ b/include/llvm/Support/MachO.h
@@ -174,50 +174,50 @@ namespace llvm {
RebaseTypePointer = 1u, // REBASE_TYPE_POINTER
RebaseTypeTextAbsolute32 = 2u, // REBASE_TYPE_TEXT_ABSOLUTE32
- RebaseTypeTextPCRelative32 = 3u, // REBASE_TYPE_TEXT_PCREL32
+ RebaseTypeTextPCRelative32 = 3u, // REBASE_TYPE_TEXT_PCREL32
RebaseOpcodeMask = 0xF0u, // REBASE_OPCODE_MASK
RebaseImmediateMask = 0x0Fu, // REBASE_IMMEDIATE_MASK
RebaseOpcodeDone = 0x00u, // REBASE_OPCODE_DONE
RebaseOpcodeSetTypeImmediate = 0x10u, // REBASE_OPCODE_SET_TYPE_IMM
- RebaseOpcodeSetSegmentAndOffsetULEB = 0x20u, // REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ RebaseOpcodeSetSegmentAndOffsetULEB = 0x20u, // REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
RebaseOpcodeAddAddressULEB = 0x30u, // REBASE_OPCODE_ADD_ADDR_ULEB
- RebaseOpcodeAddAddressImmediateScaled = 0x40u, // REBASE_OPCODE_ADD_ADDR_IMM_SCALED
- RebaseOpcodeDoRebaseImmediateTimes = 0x50u, // REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ RebaseOpcodeAddAddressImmediateScaled = 0x40u, // REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ RebaseOpcodeDoRebaseImmediateTimes = 0x50u, // REBASE_OPCODE_DO_REBASE_IMM_TIMES
RebaseOpcodeDoRebaseULEBTimes = 0x60u, // REBASE_OPCODE_DO_REBASE_ULEB_TIMES
RebaseOpcodeDoRebaseAddAddressULEB = 0x70u, // REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB
RebaseOpcodeDoRebaseULEBTimesSkippingULEB = 0x80u, // REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB
BindTypePointer = 1u, // BIND_TYPE_POINTER
- BindTypeTextAbsolute32 = 2u, // BIND_TYPE_TEXT_ABSOLUTE32
- BindTypeTextPCRelative32 = 3u, // BIND_TYPE_TEXT_PCREL32
+ BindTypeTextAbsolute32 = 2u, // BIND_TYPE_TEXT_ABSOLUTE32
+ BindTypeTextPCRelative32 = 3u, // BIND_TYPE_TEXT_PCREL32
BindSpecialDylibSelf = 0u, // BIND_SPECIAL_DYLIB_SELF
BindSpecialDylibMainExecutable = -1u, // BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE
BindSpecialDylibFlatLookup = -2u, // BIND_SPECIAL_DYLIB_FLAT_LOOKUP
BindSymbolFlagsWeakImport = 0x1u, // BIND_SYMBOL_FLAGS_WEAK_IMPORT
- BindSymbolFlagsNonWeakDefinition = 0x8u, // BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION
+ BindSymbolFlagsNonWeakDefinition = 0x8u, // BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION
BindOpcodeMask = 0xF0u, // BIND_OPCODE_MASK
BindImmediateMask = 0x0Fu, // BIND_IMMEDIATE_MASK
BindOpcodeDone = 0x00u, // BIND_OPCODE_DONE
BindOpcodeSetDylibOrdinalImmediate = 0x10u, // BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
BindOpcodeSetDylibOrdinalULEB = 0x20u, // BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB
- BindOpcodeSetDylibSpecialImmediate = 0x30u, // BIND_OPCODE_SET_DYLIB_SPECIAL_IMM
- BindOpcodeSetSymbolTrailingFlagsImmediate = 0x40u, // BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
- BindOpcodeSetTypeImmediate = 0x50u, // BIND_OPCODE_SET_TYPE_IMM
+ BindOpcodeSetDylibSpecialImmediate = 0x30u, // BIND_OPCODE_SET_DYLIB_SPECIAL_IMM
+ BindOpcodeSetSymbolTrailingFlagsImmediate = 0x40u, // BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ BindOpcodeSetTypeImmediate = 0x50u, // BIND_OPCODE_SET_TYPE_IMM
BindOpcodeSetAppendSLEB = 0x60u, // BIND_OPCODE_SET_ADDEND_SLEB
BindOpcodeSetSegmentAndOffsetULEB = 0x70u, // BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
BindOpcodeAddAddressULEB = 0x80u, // BIND_OPCODE_ADD_ADDR_ULEB
BindOpcodeDoBind = 0x90u, // BIND_OPCODE_DO_BIND
- BindOpcodeDoBindAddAddressULEB = 0xA0u, // BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB
- BindOpcodeDoBindAddAddressImmediateScaled = 0xB0u, // BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED
+ BindOpcodeDoBindAddAddressULEB = 0xA0u, // BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB
+ BindOpcodeDoBindAddAddressImmediateScaled = 0xB0u, // BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED
BindOpcodeDoBindULEBTimesSkippingULEB = 0xC0u, // BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB
ExportSymbolFlagsKindMask = 0x03u, // EXPORT_SYMBOL_FLAGS_KIND_MASK
- ExportSymbolFlagsKindRegular = 0x00u, // EXPORT_SYMBOL_FLAGS_KIND_REGULAR
+ ExportSymbolFlagsKindRegular = 0x00u, // EXPORT_SYMBOL_FLAGS_KIND_REGULAR
ExportSymbolFlagsKindThreadLocal = 0x01u, // EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL
ExportSymbolFlagsWeakDefinition = 0x04u, // EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION
ExportSymbolFlagsIndirectDefinition = 0x08u, // EXPORT_SYMBOL_FLAGS_INDIRECT_DEFINITION
@@ -227,7 +227,7 @@ namespace llvm {
// Constant masks for the "n_type" field in llvm::MachO::nlist and
// llvm::MachO::nlist_64
NlistMaskStab = 0xe0, // N_STAB
- NlistMaskPrivateExternal = 0x10, // N_PEXT
+ NlistMaskPrivateExternal = 0x10, // N_PEXT
NlistMaskType = 0x0e, // N_TYPE
NlistMaskExternal = 0x01, // N_EXT
@@ -249,35 +249,35 @@ namespace llvm {
// Constant values for the "n_type" field in llvm::MachO::nlist and
// llvm::MachO::nlist_64 when "(n_type & NlistMaskStab) != 0"
- StabGlobalSymbol = 0x20u, // N_GSYM
- StabFunctionName = 0x22u, // N_FNAME
- StabFunction = 0x24u, // N_FUN
- StabStaticSymbol = 0x26u, // N_STSYM
- StabLocalCommon = 0x28u, // N_LCSYM
+ StabGlobalSymbol = 0x20u, // N_GSYM
+ StabFunctionName = 0x22u, // N_FNAME
+ StabFunction = 0x24u, // N_FUN
+ StabStaticSymbol = 0x26u, // N_STSYM
+ StabLocalCommon = 0x28u, // N_LCSYM
StabBeginSymbol = 0x2Eu, // N_BNSYM
- StabSourceFileOptions = 0x3Cu, // N_OPT
- StabRegisterSymbol = 0x40u, // N_RSYM
- StabSourceLine = 0x44u, // N_SLINE
+ StabSourceFileOptions = 0x3Cu, // N_OPT
+ StabRegisterSymbol = 0x40u, // N_RSYM
+ StabSourceLine = 0x44u, // N_SLINE
StabEndSymbol = 0x4Eu, // N_ENSYM
- StabStructureType = 0x60u, // N_SSYM
- StabSourceFileName = 0x64u, // N_SO
- StabObjectFileName = 0x66u, // N_OSO
- StabLocalSymbol = 0x80u, // N_LSYM
- StabBeginIncludeFileName = 0x82u, // N_BINCL
- StabIncludeFileName = 0x84u, // N_SOL
+ StabStructureType = 0x60u, // N_SSYM
+ StabSourceFileName = 0x64u, // N_SO
+ StabObjectFileName = 0x66u, // N_OSO
+ StabLocalSymbol = 0x80u, // N_LSYM
+ StabBeginIncludeFileName = 0x82u, // N_BINCL
+ StabIncludeFileName = 0x84u, // N_SOL
StabCompilerParameters = 0x86u, // N_PARAMS
StabCompilerVersion = 0x88u, // N_VERSION
StabCompilerOptLevel = 0x8Au, // N_OLEVEL
- StabParameter = 0xA0u, // N_PSYM
- StabEndIncludeFile = 0xA2u, // N_EINCL
- StabAlternateEntry = 0xA4u, // N_ENTRY
- StabLeftBracket = 0xC0u, // N_LBRAC
- StabDeletedIncludeFile = 0xC2u, // N_EXCL
- StabRightBracket = 0xE0u, // N_RBRAC
- StabBeginCommon = 0xE2u, // N_BCOMM
- StabEndCommon = 0xE4u, // N_ECOMM
- StabEndCommonLocal = 0xE8u, // N_ECOML
- StabLength = 0xFEu // N_LENG
+ StabParameter = 0xA0u, // N_PSYM
+ StabEndIncludeFile = 0xA2u, // N_EINCL
+ StabAlternateEntry = 0xA4u, // N_ENTRY
+ StabLeftBracket = 0xC0u, // N_LBRAC
+ StabDeletedIncludeFile = 0xC2u, // N_EXCL
+ StabRightBracket = 0xE0u, // N_RBRAC
+ StabBeginCommon = 0xE2u, // N_BCOMM
+ StabEndCommon = 0xE4u, // N_ECOMM
+ StabEndCommonLocal = 0xE8u, // N_ECOML
+ StabLength = 0xFEu // N_LENG
};
@@ -490,12 +490,12 @@ namespace llvm {
uint32_t nextrel;
uint32_t locreloff;
uint32_t nlocrel;
- };
+ };
struct dylib_table_of_contents {
uint32_t symbol_index;
uint32_t module_index;
- };
+ };
struct dylib_module {
uint32_t module_name;
@@ -511,7 +511,7 @@ namespace llvm {
uint32_t ninit_nterm;
uint32_t objc_module_info_addr;
uint32_t objc_module_info_size;
- };
+ };
struct dylib_module_64 {
uint32_t module_name;
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index d085c94..4005161 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -414,14 +414,14 @@ int IsInf(double d);
/// MinAlign - A and B are either alignments or offsets. Return the minimum
/// alignment that may be assumed after adding the two together.
-static inline uint64_t MinAlign(uint64_t A, uint64_t B) {
+inline uint64_t MinAlign(uint64_t A, uint64_t B) {
// The largest power of 2 that divides both A and B.
return (A | B) & -(A | B);
}
/// NextPowerOf2 - Returns the next power of two (in 64-bits)
/// that is strictly greater than A. Returns zero on overflow.
-static inline uint64_t NextPowerOf2(uint64_t A) {
+inline uint64_t NextPowerOf2(uint64_t A) {
A |= (A >> 1);
A |= (A >> 2);
A |= (A >> 4);
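
The same static-to-inline cleanup as in Endian.h above. A quick check of the documented behavior:

    #include "llvm/Support/MathExtras.h"

    void check() {
      uint64_t M = llvm::MinAlign(8, 12); // == 4: both are multiples of 4
      uint64_t P = llvm::NextPowerOf2(5); // == 8: strictly greater than 5
      (void)M; (void)P;
    }
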
diff --git a/include/llvm/Support/PathV2.h b/include/llvm/Support/PathV2.h
index 6d38c95..8d79709 100644
--- a/include/llvm/Support/PathV2.h
+++ b/include/llvm/Support/PathV2.h
@@ -47,9 +47,9 @@ namespace path {
/// C:\foo\bar => C:,/,foo,bar
///
class const_iterator {
- StringRef Path; //< The entire path.
- StringRef Component; //< The current component. Not necessarily in Path.
- size_t Position; //< The iterators current position within Path.
+ StringRef Path; ///< The entire path.
+ StringRef Component; ///< The current component. Not necessarily in Path.
+ size_t Position; ///< The iterator's current position within Path.
// An end iterator has Position = Path.size() + 1.
friend const_iterator begin(StringRef path);
diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h
index d796b79..088897c 100644
--- a/include/llvm/Support/Process.h
+++ b/include/llvm/Support/Process.h
@@ -1,4 +1,4 @@
-//===- llvm/Support/Process.h ------------------------------------*- C++ -*-===//
+//===- llvm/Support/Process.h -----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -97,6 +97,10 @@ namespace sys {
/// the user rather than being put on a pipe or stored in a file.
static bool FileDescriptorIsDisplayed(int fd);
+ /// This function determines if the given file descriptor is displayed and
+ /// supports colors.
+ static bool FileDescriptorHasColors(int fd);
+
/// This function determines the number of columns in the window
/// if standard output is connected to a "tty" or "console"
/// window. If standard output is not connected to a tty or
@@ -142,6 +146,10 @@ namespace sys {
/// Resets the terminals colors, or returns an escape sequence to do so.
static const char *ResetColor();
+
+ /// Get the result of a process-wide random number generator. The
+ /// generator will be automatically seeded in a non-deterministic fashion.
+ static unsigned GetRandomNumber();
/// @}
};
}
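
Both additions to sys::Process come straight from this hunk. A minimal usage sketch (the message text is illustrative):

```cpp
// Sketch using the two new Process APIs added above.
#include "llvm/Support/Process.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // Only emit ANSI color codes when fd 2 (stderr) is a color-capable terminal.
  if (sys::Process::FileDescriptorHasColors(2))
    errs() << "\033[31merror:\033[0m something failed\n";
  else
    errs() << "error: something failed\n";

  // A process-wide random value; LLVM seeds the generator non-deterministically.
  unsigned Tag = sys::Process::GetRandomNumber();
  outs() << "temp file tag: " << Tag << "\n";
  return 0;
}
```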
diff --git a/include/llvm/Support/SMLoc.h b/include/llvm/Support/SMLoc.h
index d48bfcc..1bf810b 100644
--- a/include/llvm/Support/SMLoc.h
+++ b/include/llvm/Support/SMLoc.h
@@ -24,7 +24,6 @@ class SMLoc {
const char *Ptr;
public:
SMLoc() : Ptr(0) {}
- SMLoc(const SMLoc &RHS) : Ptr(RHS.Ptr) {}
bool isValid() const { return Ptr != 0; }
@@ -48,7 +47,7 @@ public:
SMLoc Start, End;
SMRange() {}
- SMRange(SMLoc Start, SMLoc End) : Start(Start), End(End) {
+ SMRange(SMLoc St, SMLoc En) : Start(St), End(En) {
assert(Start.isValid() == End.isValid() &&
"Start and end should either both be valid or both be invalid!");
}
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 76967db..8949a3a 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -123,7 +123,14 @@ public:
/// FindLineNumber - Find the line number for the specified location in the
/// specified file. This is not a fast method.
- unsigned FindLineNumber(SMLoc Loc, int BufferID = -1) const;
+ unsigned FindLineNumber(SMLoc Loc, int BufferID = -1) const {
+ return getLineAndColumn(Loc, BufferID).first;
+ }
+
+ /// getLineAndColumn - Find the line and column number for the specified
+ /// location in the specified file. This is not a fast method.
+ std::pair<unsigned, unsigned>
+ getLineAndColumn(SMLoc Loc, int BufferID = -1) const;
/// PrintMessage - Emit a message about the specified location with the
/// specified string.
@@ -169,9 +176,9 @@ public:
SMDiagnostic()
: SM(0), LineNo(0), ColumnNo(0), Kind(SourceMgr::DK_Error) {}
// Diagnostic with no location (e.g. file not found, command line arg error).
- SMDiagnostic(const std::string &filename, SourceMgr::DiagKind Kind,
+ SMDiagnostic(const std::string &filename, SourceMgr::DiagKind Knd,
const std::string &Msg)
- : SM(0), Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Kind),
+ : SM(0), Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Knd),
Message(Msg) {}
// Diagnostic with a location.
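
With this change FindLineNumber() is a thin wrapper over the new getLineAndColumn(). A short sketch of the new call (the report() helper is illustrative):

```cpp
// Sketch of the new SourceMgr::getLineAndColumn() API.
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void report(const SourceMgr &SM, SMLoc Loc) {
  // One lookup now yields both coordinates; FindLineNumber() is just .first.
  std::pair<unsigned, unsigned> LC = SM.getLineAndColumn(Loc);
  errs() << "at line " << LC.first << ", column " << LC.second << "\n";
}
```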
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index 8808130..c0be8f1 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -108,6 +108,7 @@ namespace llvm {
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI);
typedef MCCodeEmitter *(*MCCodeEmitterCtorTy)(const MCInstrInfo &II,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
typedef MCStreamer *(*MCObjectStreamerCtorTy)(const Target &T,
@@ -405,11 +406,12 @@ namespace llvm {
/// createMCCodeEmitter - Create a target specific code emitter.
MCCodeEmitter *createMCCodeEmitter(const MCInstrInfo &II,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) const {
if (!MCCodeEmitterCtorFn)
return 0;
- return MCCodeEmitterCtorFn(II, STI, Ctx);
+ return MCCodeEmitterCtorFn(II, MRI, STI, Ctx);
}
/// createMCObjectStreamer - Create a target specific MCStreamer.
@@ -510,6 +512,21 @@ namespace llvm {
static const Target *lookupTarget(const std::string &Triple,
std::string &Error);
+ /// lookupTarget - Lookup a target based on an architecture name
+ /// and a target triple. If the architecture name is non-empty,
+ /// then the lookup is done by architecture. Otherwise, the target
+ /// triple is used.
+ ///
+ /// \param ArchName - The architecture to use for finding a target.
+ /// \param TheTriple - The triple to use for finding a target. The
+ /// triple is updated with the canonical architecture name if a lookup
+ /// by architecture is done.
+ /// \param Error - On failure, an error string describing why no target was
+ /// found.
+ static const Target *lookupTarget(const std::string &ArchName,
+ Triple &TheTriple,
+ std::string &Error);
+
/// getClosestTargetForJIT - Pick the best target that is compatible with
/// the current host. If no close target can be found, this returns null
/// and sets the Error string to a reason.
@@ -1129,6 +1146,7 @@ namespace llvm {
private:
static MCCodeEmitter *Allocator(const MCInstrInfo &II,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new MCCodeEmitterImpl();
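
Two registry-facing signatures change here: lookupTarget gains an arch-name overload, and code-emitter construction now threads MCRegisterInfo through. A hedged sketch of callers adapting to both (helper names are illustrative):

```cpp
// Sketch of the extended TargetRegistry API from this hunk.
#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/Triple.h"
#include <string>

using namespace llvm;

const Target *pickTarget(const std::string &ArchName, Triple &TheTriple,
                         std::string &Error) {
  // A non-empty ArchName wins; otherwise the triple decides. On an
  // arch-based lookup the triple is canonicalized in place.
  return TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
}

MCCodeEmitter *makeEmitter(const Target &T, const MCInstrInfo &II,
                           const MCRegisterInfo &MRI,
                           const MCSubtargetInfo &STI, MCContext &Ctx) {
  // The hook now also receives the register info.
  return T.createMCCodeEmitter(II, MRI, STI, Ctx);
}
```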
diff --git a/include/llvm/Support/ThreadLocal.h b/include/llvm/Support/ThreadLocal.h
index 15350a7..62ec90a 100644
--- a/include/llvm/Support/ThreadLocal.h
+++ b/include/llvm/Support/ThreadLocal.h
@@ -15,6 +15,7 @@
#define LLVM_SYSTEM_THREAD_LOCAL_H
#include "llvm/Support/Threading.h"
+#include "llvm/Support/DataTypes.h"
#include <cassert>
namespace llvm {
@@ -22,7 +23,15 @@ namespace llvm {
// ThreadLocalImpl - Common base class of all ThreadLocal instantiations.
// YOU SHOULD NEVER USE THIS DIRECTLY.
class ThreadLocalImpl {
- void* data;
+ typedef uint64_t ThreadLocalDataTy;
+ /// \brief Platform-specific thread local data.
+ ///
+ /// This is embedded in the class and we avoid malloc'ing/free'ing it,
+ /// to make this class safer for use along with CrashRecoveryContext.
+ union {
+ char data[sizeof(ThreadLocalDataTy)];
+ ThreadLocalDataTy align_data;
+ };
public:
ThreadLocalImpl();
virtual ~ThreadLocalImpl();
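
The anonymous union replaces a heap-allocated slot with in-object, suitably aligned storage. A standalone illustration of the same trick, assuming (as the header does by construction) that the platform handle fits in a uint64_t:

```cpp
// Standalone sketch of the inline-storage union used above.
#include <cstdint>
#include <cstring>

class InlineSlot {
  typedef uint64_t SlotTy;
  union {
    char data[sizeof(SlotTy)]; // the raw bytes actually used
    SlotTy align_data;         // never read; only forces 8-byte alignment
  };
public:
  void set(void *P) { std::memcpy(data, &P, sizeof(P)); }
  void *get() const {
    void *P;
    std::memcpy(&P, data, sizeof(P));
    return P;
  }
};
```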
diff --git a/include/llvm/Support/ValueHandle.h b/include/llvm/Support/ValueHandle.h
index b7210b2..61e21b8 100644
--- a/include/llvm/Support/ValueHandle.h
+++ b/include/llvm/Support/ValueHandle.h
@@ -110,11 +110,12 @@ protected:
V != DenseMapInfo<Value *>::getTombstoneKey();
}
-private:
+public:
// Callbacks made from Value.
static void ValueIsDeleted(Value *V);
static void ValueIsRAUWd(Value *Old, Value *New);
+private:
// Internal implementation details.
ValueHandleBase **getPrevPtr() const { return PrevPair.getPointer(); }
HandleBaseKind getKind() const { return PrevPair.getInt(); }
@@ -367,7 +368,7 @@ protected:
CallbackVH(const CallbackVH &RHS)
: ValueHandleBase(Callback, RHS) {}
- virtual ~CallbackVH();
+ virtual ~CallbackVH() {}
void setValPtr(Value *P) {
ValueHandleBase::operator=(P);
@@ -389,15 +390,13 @@ public:
///
/// All implementations must remove the reference from this object to the
/// Value that's being destroyed.
- virtual void deleted() {
- setValPtr(NULL);
- }
+ virtual void deleted();
/// Called when this->getValPtr()->replaceAllUsesWith(new_value) is called,
/// _before_ any of the uses have actually been replaced. If WeakVH were
/// implemented as a CallbackVH, it would use this method to call
/// setValPtr(new_value). AssertingVH would do nothing in this method.
- virtual void allUsesReplacedWith(Value *) {}
+ virtual void allUsesReplacedWith(Value *);
};
// Specialize simplify_type to allow CallbackVH to participate in
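
Since deleted() and allUsesReplacedWith() become out-of-line virtuals, the expected client pattern is a small subclass. A minimal sketch in the pre-C++11 style of this tree (LoggingVH is illustrative):

```cpp
// Minimal CallbackVH subclass sketch.
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

class LoggingVH : public CallbackVH {
public:
  explicit LoggingVH(Value *V) : CallbackVH(V) {}
  virtual void deleted() {
    errs() << "tracked value deleted\n";
    setValPtr(0); // drop our reference, as the contract above requires
  }
  virtual void allUsesReplacedWith(Value *New) {
    (void)New;
    errs() << "tracked value RAUW'd\n";
  }
};
```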
diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h
index 47206b3..98910eb 100644
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h
@@ -130,7 +130,7 @@ public:
void setError(const Twine &Message, Token &Location) const;
bool failed() const;
- virtual void skip() {};
+ virtual void skip() {}
unsigned int getType() const { return TypeID; }
static inline bool classof(const Node *) { return true; }
@@ -336,7 +336,7 @@ public:
enum MappingType {
MT_Block,
MT_Flow,
- MT_Inline //< An inline mapping node is used for "[key: value]".
+ MT_Inline ///< An inline mapping node is used for "[key: value]".
};
MappingNode(OwningPtr<Document> &D, StringRef Anchor, MappingType MT)
@@ -513,37 +513,44 @@ private:
/// @brief Iterator abstraction for Documents over a Stream.
class document_iterator {
public:
- document_iterator() : Doc(NullDoc) {}
- document_iterator(OwningPtr<Document> &D) : Doc(D) {}
+ document_iterator() : Doc(0) {}
+ document_iterator(OwningPtr<Document> &D) : Doc(&D) {}
bool operator ==(const document_iterator &Other) {
- return Doc == Other.Doc;
+ if (isAtEnd() || Other.isAtEnd())
+ return isAtEnd() && Other.isAtEnd();
+
+ return *Doc == *Other.Doc;
}
bool operator !=(const document_iterator &Other) {
return !(*this == Other);
}
document_iterator operator ++() {
- if (!Doc->skip()) {
- Doc.reset(0);
+ assert(Doc != 0 && "incrementing iterator past the end.");
+ if (!(*Doc)->skip()) {
+ Doc->reset(0);
} else {
- Stream &S = Doc->stream;
- Doc.reset(new Document(S));
+ Stream &S = (*Doc)->stream;
+ Doc->reset(new Document(S));
}
return *this;
}
Document &operator *() {
- return *Doc;
+ return *Doc->get();
}
OwningPtr<Document> &operator ->() {
- return Doc;
+ return *Doc;
}
private:
- static OwningPtr<Document> NullDoc;
- OwningPtr<Document> &Doc;
+ bool isAtEnd() const {
+ return Doc == 0 || *Doc == 0;
+ }
+
+ OwningPtr<Document> *Doc;
};
}
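
The iterator now stores a pointer to the OwningPtr instead of a reference to a shared static NullDoc, so default-constructed end iterators compare correctly. A sketch of the unchanged client-side loop (countDocs is illustrative):

```cpp
// Sketch of iterating YAML documents with the reworked document_iterator.
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/StringRef.h"

using namespace llvm;

void countDocs(StringRef Input) {
  SourceMgr SM;
  yaml::Stream S(Input, SM);
  unsigned N = 0;
  // End detection now goes through isAtEnd(), not a shared null document.
  for (yaml::document_iterator I = S.begin(), E = S.end(); I != E; ++I)
    ++N;
  outs() << N << " document(s)\n";
}
```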
diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index 6c5d478..5de749a 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h
@@ -230,6 +230,9 @@ public:
/// rather than being put on a pipe or stored in a file.
virtual bool is_displayed() const { return false; }
+ /// This function determines if this stream is displayed and supports colors.
+ virtual bool has_colors() const { return is_displayed(); }
+
//===--------------------------------------------------------------------===//
// Subclass Interface
//===--------------------------------------------------------------------===//
@@ -386,10 +389,12 @@ public:
virtual bool is_displayed() const;
+ virtual bool has_colors() const;
+
/// has_error - Return the value of the flag in this raw_fd_ostream indicating
/// whether an output error has been encountered.
/// This doesn't implicitly flush any pending output. Also, it doesn't
- /// guarantee to detect all errors unless the the stream has been closed.
+ /// guarantee to detect all errors unless the stream has been closed.
bool has_error() const {
return Error;
}
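
has_colors() defaults to is_displayed(), so redirected streams stay colorless unless a subclass says otherwise. A usage sketch (the warn() helper is illustrative):

```cpp
// Sketch of gating color escapes on the new has_colors() query.
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/StringRef.h"

using namespace llvm;

void warn(StringRef Msg) {
  if (outs().has_colors())
    outs().changeColor(raw_ostream::YELLOW, /*Bold=*/true);
  outs() << "warning: " << Msg << "\n";
  if (outs().has_colors())
    outs().resetColor();
}
```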
diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h
index a3a551f..7b97547 100644
--- a/include/llvm/Support/type_traits.h
+++ b/include/llvm/Support/type_traits.h
@@ -21,6 +21,11 @@
#include <cstddef>
#include <utility>
+#ifndef __has_feature
+#define LLVM_DEFINED_HAS_FEATURE
+#define __has_feature(x) 0
+#endif
+
// This is actually the conforming implementation which works with abstract
// classes. However, enough compilers have trouble with it that most will use
// the one in boost/type_traits/object_traits.hpp. This implementation actually
@@ -58,9 +63,15 @@ struct is_class
/// type can be copied around with memcpy instead of running ctors etc.
template <typename T>
struct isPodLike {
+#if __has_feature(is_trivially_copyable)
+ // If the compiler supports the is_trivially_copyable trait use it, as it
+ // matches the definition of isPodLike closely.
+ static const bool value = __is_trivially_copyable(T);
+#else
// If we don't know anything else, we can (at least) assume that all non-class
// types are PODs.
static const bool value = !is_class<T>::value;
+#endif
};
// std::pair's are pod-like if their elements are.
@@ -202,4 +213,8 @@ struct conditional<false, T, F> { typedef F type; };
}
+#ifdef LLVM_DEFINED_HAS_FEATURE
+#undef __has_feature
+#endif
+
#endif
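
The guard pattern makes a Clang-only builtin macro safe under GCC/MSVC and then removes the shim so it cannot leak. A standalone sketch of the same idiom (macro names here are illustrative; note this sketch also undefines its guard, which the header itself does not bother to do):

```cpp
// Standalone sketch of the __has_feature shim pattern above.
#ifndef __has_feature
#define MY_DEFINED_HAS_FEATURE
#define __has_feature(x) 0 // non-Clang compilers: every feature reads "absent"
#endif

#if __has_feature(cxx_rvalue_references)
// Clang with rvalue references: move-enabled code could go here.
#endif

#ifdef MY_DEFINED_HAS_FEATURE
#undef __has_feature // don't leak the shim into later headers
#undef MY_DEFINED_HAS_FEATURE
#endif
```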
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index 3aea1ae..a8256b7 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -1558,12 +1558,14 @@ public:
return I == Defs.end() ? 0 : I->second;
}
void addClass(Record *R) {
- assert(getClass(R->getNameInitAsString()) == 0 && "Class already exists!");
- Classes.insert(std::make_pair(R->getNameInitAsString(), R));
+ bool Ins = Classes.insert(std::make_pair(R->getName(), R)).second;
+ (void)Ins;
+ assert(Ins && "Class already exists");
}
void addDef(Record *R) {
- assert(getDef(R->getNameInitAsString()) == 0 && "Def already exists!");
- Defs.insert(std::make_pair(R->getNameInitAsString(), R));
+ bool Ins = Defs.insert(std::make_pair(R->getName(), R)).second;
+ (void)Ins;
+ assert(Ins && "Record already exists");
}
/// removeClass - Remove, but do not delete, the specified record.
diff --git a/utils/TableGen/StringMatcher.h b/include/llvm/TableGen/StringMatcher.h
index 1dadc76..1dadc76 100644
--- a/utils/TableGen/StringMatcher.h
+++ b/include/llvm/TableGen/StringMatcher.h
diff --git a/include/llvm/TableGen/TableGenBackend.h b/include/llvm/TableGen/TableGenBackend.h
index 3ebcd92..bedf7fb 100644
--- a/include/llvm/TableGen/TableGenBackend.h
+++ b/include/llvm/TableGen/TableGenBackend.h
@@ -1,4 +1,4 @@
-//===- llvm/TableGen/TableGenBackend.h - Backend base class -----*- C++ -*-===//
+//===- llvm/TableGen/TableGenBackend.h - Backend utilities ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,36 +7,22 @@
//
//===----------------------------------------------------------------------===//
//
-// The TableGenBackend class is provided as a common interface for all TableGen
-// backends. It provides useful services and an standardized interface.
+// Useful utilities for TableGen backends.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TABLEGEN_TABLEGENBACKEND_H
#define LLVM_TABLEGEN_TABLEGENBACKEND_H
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringRef.h"
namespace llvm {
-class Record;
-class RecordKeeper;
+class raw_ostream;
-struct TableGenBackend {
- virtual void anchor();
- virtual ~TableGenBackend() {}
-
- // run - All TableGen backends should implement the run method, which should
- // be the main entry point.
- virtual void run(raw_ostream &OS) = 0;
-
-
-public: // Useful helper routines...
- /// EmitSourceFileHeader - Output a LLVM style file header to the specified
- /// ostream.
- void EmitSourceFileHeader(StringRef Desc, raw_ostream &OS) const;
-
-};
+/// emitSourceFileHeader - Output an LLVM-style file header to the specified
+/// raw_ostream.
+void emitSourceFileHeader(StringRef Desc, raw_ostream &OS);
} // End llvm namespace
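
Backends drop the TableGenBackend base class in favor of free functions. A sketch of an entry point in the new style (EmitFoo and its signature are illustrative, not part of this header):

```cpp
// Sketch of a free-function backend using emitSourceFileHeader().
#include "llvm/TableGen/TableGenBackend.h"
#include "llvm/Support/raw_ostream.h"

namespace llvm {
class RecordKeeper;

void EmitFoo(RecordKeeper &Records, raw_ostream &OS) {
  emitSourceFileHeader("Foo source fragment", OS);
  OS << "// ... generated content ...\n";
}
} // End llvm namespace
```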
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index fa1ec55..24be2b1 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -64,18 +64,6 @@ class Register<string n, list<string> altNames = []> {
// register.
list<RegAltNameIndex> RegAltNameIndices = [];
- // CompositeIndices - Specify subreg indices that don't correspond directly to
- // a register in SubRegs and are not inherited. The following formats are
- // supported:
- //
- // (a) Identity - Reg:a == Reg
- // (a b) Alias - Reg:a == Reg:b
- // (a b,c) Composite - Reg:a == (Reg:b):c
- //
- // This can be used to disambiguate a sub-sub-register that exists in more
- // than one subregister and other weird stuff.
- list<dag> CompositeIndices = [];
-
// DwarfNumbers - Numbers used internally by gcc/gdb to identify the register.
// These values can be determined by locating the <target>.h file in the
// directory llvmgcc/gcc/config/<target>/ and looking for REGISTER_NAMES. The
@@ -96,6 +84,9 @@ class Register<string n, list<string> altNames = []> {
// x86 register AX is covered by its sub-registers AL and AH, but EAX is not
// covered by its sub-register AX.
bit CoveredBySubRegs = 0;
+
+ // HWEncoding - The target specific hardware encoding for this register.
+ bits<16> HWEncoding = 0;
}
// RegisterWithSubRegs - This can be used to define instances of Register which
@@ -108,13 +99,20 @@ class RegisterWithSubRegs<string n, list<Register> subregs> : Register<n> {
let SubRegs = subregs;
}
+// DAGOperand - An empty base class that unifies RegisterClass's and other forms
+// of Operand's that are legal as type qualifiers in DAG patterns. This should
+// only ever be used for defining multiclasses that are polymorphic over both
+// RegisterClass's and other Operand's.
+class DAGOperand { }
+
// RegisterClass - Now that all of the registers are defined, and aliases
// between registers are defined, specify which registers belong to which
// register classes. This also defines the default allocation order of
// registers by register allocators.
//
class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
- dag regList, RegAltNameIndex idx = NoRegAltName> {
+ dag regList, RegAltNameIndex idx = NoRegAltName>
+ : DAGOperand {
string Namespace = namespace;
// RegType - Specify the list ValueType of the registers in this register
@@ -151,10 +149,6 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
// a valid alternate name for the given index.
RegAltNameIndex altNameIndex = idx;
- // SubRegClasses - Specify the register class of subregisters as a list of
- // dags: (RegClass SubRegIndex, SubRegindex, ...)
- list<dag> SubRegClasses = [];
-
// isAllocatable - Specify that the register class can be used for virtual
// registers and register allocation. Some register classes are only used to
// model instruction operand constraints, and should have isAllocatable = 0.
@@ -192,7 +186,8 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
// also in the second set.
//
// (sequence "R%u", 0, 15) -> [R0, R1, ..., R15]. Generate a sequence of
-// numbered registers.
+// numbered registers. Takes an optional 4th operand which is a stride to use
+// when generating the sequence.
//
// (shl GPR, 4) - Remove the first N elements.
//
@@ -245,9 +240,6 @@ class RegisterTuples<list<SubRegIndex> Indices, list<dag> Regs> {
// SubRegIndices - N SubRegIndex instances. This provides the names of the
// sub-registers in the synthesized super-registers.
list<SubRegIndex> SubRegIndices = Indices;
-
- // Compose sub-register indices like in a normal Register.
- list<dag> CompositeIndices = [];
}
@@ -402,6 +394,13 @@ class Instruction {
string AsmMatchConverter = "";
+ /// TwoOperandAliasConstraint - Enable TableGen to auto-generate a
+ /// two-operand matcher inst-alias for a three-operand instruction.
+ /// For example, the arm instruction "add r3, r3, r5" can be written
+ /// as "add r3, r5". The constraint is of the same form as a tied-operand
+ /// constraint. For example, "$Rn = $Rd".
+ string TwoOperandAliasConstraint = "";
+
///@}
}
@@ -431,6 +430,10 @@ class Predicate<string cond> {
/// e.g. "ModeThumb,FeatureThumb2" is translated to
/// "(Bits & ModeThumb) != 0 && (Bits & FeatureThumb2) != 0".
string AssemblerCondString = "";
+
+ /// PredicateName - User-level name to use for the predicate. Mainly for use
+ /// in diagnostics such as missing feature errors in the asm matcher.
+ string PredicateName = "";
}
/// NoHonorSignDependentRounding - This predicate is true if support for
@@ -512,6 +515,11 @@ class AsmOperandClass {
/// to immediates or registers and are very instruction specific (as flags to
/// set in a processor register, coprocessor number, ...).
string ParserMethod = ?;
+
+ // The diagnostic type to present when referencing this operand in a
+ // match failure error message. By default, use a generic "invalid operand"
+ // diagnostic. The target AsmParser maps these codes to text.
+ string DiagnosticType = "";
}
def ImmAsmOperand : AsmOperandClass {
@@ -521,7 +529,7 @@ def ImmAsmOperand : AsmOperandClass {
/// Operand Types - These provide the built-in operand types that may be used
/// by a target. Targets can optionally provide their own operand types as
/// needed, though this should not be needed for RISC targets.
-class Operand<ValueType ty> {
+class Operand<ValueType ty> : DAGOperand {
ValueType Type = ty;
string PrintMethod = "printOperand";
string EncoderMethod = "";
@@ -541,7 +549,8 @@ class Operand<ValueType ty> {
AsmOperandClass ParserMatchClass = ImmAsmOperand;
}
-class RegisterOperand<RegisterClass regclass, string pm = "printOperand"> {
+class RegisterOperand<RegisterClass regclass, string pm = "printOperand">
+ : DAGOperand {
// RegClass - The register class of the operand.
RegisterClass RegClass = regclass;
// PrintMethod - The target method to call to print register operands of
@@ -729,7 +738,7 @@ class AsmParser {
def DefaultAsmParser : AsmParser;
//===----------------------------------------------------------------------===//
-// AsmParserVariant - Subtargets can have multiple different assembly parsers
+// AsmParserVariant - Subtargets can have multiple different assembly parsers
// (e.g. AT&T vs Intel syntax on X86 for example). This class can be
// implemented by targets to describe such variants.
//
@@ -754,9 +763,10 @@ def DefaultAsmParserVariant : AsmParserVariant;
/// AssemblerPredicate - This is a Predicate that can be used when the assembler
/// matches instructions and aliases.
-class AssemblerPredicate<string cond> {
+class AssemblerPredicate<string cond, string name = ""> {
bit AssemblerMatcherPredicate = 1;
string AssemblerCondString = cond;
+ string PredicateName = name;
}
/// TokenAlias - This class allows targets to define assembler token
@@ -861,7 +871,7 @@ class Target {
// AssemblyParsers - The AsmParser instances available for this target.
list<AsmParser> AssemblyParsers = [DefaultAsmParser];
- /// AssemblyParserVariants - The AsmParserVariant instances available for
+ /// AssemblyParserVariants - The AsmParserVariant instances available for
/// this target.
list<AsmParserVariant> AssemblyParserVariants = [DefaultAsmParserVariant];
@@ -909,6 +919,10 @@ class Processor<string n, ProcessorItineraries pi, list<SubtargetFeature> f> {
//
string Name = n;
+ // SchedModel - The machine model for scheduling and instruction cost.
+ //
+ SchedMachineModel SchedModel = NoSchedModel;
+
// ProcItin - The scheduling information for the target processor.
//
ProcessorItineraries ProcItin = pi;
@@ -917,6 +931,14 @@ class Processor<string n, ProcessorItineraries pi, list<SubtargetFeature> f> {
list<SubtargetFeature> Features = f;
}
+// ProcessorModel allows subtargets to specify the more general
+// SchedMachineModel instead of a ProcessorItinerary. Subtargets will
+// gradually move to this newer form.
+class ProcessorModel<string n, SchedMachineModel m, list<SubtargetFeature> f>
+ : Processor<n, NoItineraries, f> {
+ let SchedModel = m;
+}
+
//===----------------------------------------------------------------------===//
// Pull in the common support for calling conventions.
//
diff --git a/include/llvm/Target/TargetCallingConv.h b/include/llvm/Target/TargetCallingConv.h
index a6251e7..f8cebef 100644
--- a/include/llvm/Target/TargetCallingConv.h
+++ b/include/llvm/Target/TargetCallingConv.h
@@ -36,16 +36,16 @@ namespace ISD {
static const uint64_t ByValOffs = 4;
static const uint64_t Nest = 1ULL<<5; ///< Nested fn static chain
static const uint64_t NestOffs = 5;
- static const uint64_t ByValAlign = 0xFULL << 6; //< Struct alignment
+ static const uint64_t ByValAlign = 0xFULL << 6; ///< Struct alignment
static const uint64_t ByValAlignOffs = 6;
static const uint64_t Split = 1ULL << 10;
static const uint64_t SplitOffs = 10;
static const uint64_t OrigAlign = 0x1FULL<<27;
static const uint64_t OrigAlignOffs = 27;
- static const uint64_t ByValSize = 0xffffffffULL << 32; //< Struct size
+ static const uint64_t ByValSize = 0xffffffffULL << 32; ///< Struct size
static const uint64_t ByValSizeOffs = 32;
- static const uint64_t One = 1ULL; //< 1 of this type, for shifts
+ static const uint64_t One = 1ULL; ///< 1 of this type, for shifts
uint64_t Flags;
public:
diff --git a/include/llvm/Target/TargetData.h b/include/llvm/Target/TargetData.h
index d116f39..4f94ab7 100644
--- a/include/llvm/Target/TargetData.h
+++ b/include/llvm/Target/TargetData.h
@@ -53,10 +53,10 @@ enum AlignTypeEnum {
/// @note The unusual order of elements in the structure attempts to reduce
/// padding and make the structure slightly more cache friendly.
struct TargetAlignElem {
- AlignTypeEnum AlignType : 8; //< Alignment type (AlignTypeEnum)
- unsigned ABIAlign; //< ABI alignment for this type/bitw
- unsigned PrefAlign; //< Pref. alignment for this type/bitw
- uint32_t TypeBitWidth; //< Type bit width
+ AlignTypeEnum AlignType : 8; ///< Alignment type (AlignTypeEnum)
+ unsigned ABIAlign; ///< ABI alignment for this type/bitw
+ unsigned PrefAlign; ///< Pref. alignment for this type/bitw
+ uint32_t TypeBitWidth; ///< Type bit width
/// Initializer
static TargetAlignElem get(AlignTypeEnum align_type, unsigned abi_align,
diff --git a/include/llvm/Target/TargetELFWriterInfo.h b/include/llvm/Target/TargetELFWriterInfo.h
index 114295e..5e48629 100644
--- a/include/llvm/Target/TargetELFWriterInfo.h
+++ b/include/llvm/Target/TargetELFWriterInfo.h
@@ -43,7 +43,8 @@ namespace llvm {
EM_ARM = 40, // ARM
EM_ALPHA = 41, // DEC Alpha
EM_SPARCV9 = 43, // SPARC V9
- EM_X86_64 = 62 // AMD64
+ EM_X86_64 = 62, // AMD64
+ EM_HEXAGON = 164 // Qualcomm Hexagon
};
// ELF File classes
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index d1e380c..a18b030 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_TARGET_TARGETINSTRINFO_H
#define LLVM_TARGET_TARGETINSTRINFO_H
+#include "llvm/ADT/SmallSet.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -27,6 +28,7 @@ class MachineMemOperand;
class MachineRegisterInfo;
class MDNode;
class MCInst;
+class MCSchedModel;
class SDNode;
class ScheduleHazardRecognizer;
class SelectionDAG;
@@ -57,7 +59,8 @@ public:
/// class constraint for OpNum, or NULL.
const TargetRegisterClass *getRegClass(const MCInstrDesc &TID,
unsigned OpNum,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) const;
/// isTriviallyReMaterializable - Return true if the instruction is trivially
/// rematerializable, meaning it has no side effects and requires no operands
@@ -185,14 +188,6 @@ public:
const MachineInstr *Orig,
const TargetRegisterInfo &TRI) const = 0;
- /// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the
- /// two-addrss instruction inserted by two-address pass.
- virtual void scheduleTwoAddrSource(MachineInstr *SrcMI,
- MachineInstr *UseMI,
- const TargetRegisterInfo &TRI) const {
- // Do nothing.
- }
-
/// duplicate - Create a duplicate of the Orig instruction in MF. This is like
/// MachineFunction::CloneMachineInstr(), but the target may update operands
/// that are required to be unique.
@@ -319,7 +314,7 @@ public:
/// being executed is given by Probability, and Confidence is a measure
/// of our confidence that it will be properly predicted.
virtual
- bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
const BranchProbability &Probability) const {
return false;
@@ -347,7 +342,7 @@ public:
/// Probability, and Confidence is a measure of our confidence that it
/// will be properly predicted.
virtual bool
- isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
const BranchProbability &Probability) const {
return false;
}
@@ -368,6 +363,56 @@ public:
return false;
}
+ /// canInsertSelect - Return true if it is possible to insert a select
+ /// instruction that chooses between TrueReg and FalseReg based on the
+ /// condition code in Cond.
+ ///
+ /// When successful, also return the latency in cycles from TrueReg,
+ /// FalseReg, and Cond to the destination register. The Cond latency should
+ /// compensate for a conditional branch being removed. For example, if a
+ /// conditional branch has a 3 cycle latency from the condition code read,
+ /// and a cmov instruction has a 2 cycle latency from the condition code
+ /// read, CondCycles should be returned as -1.
+ ///
+ /// @param MBB Block where select instruction would be inserted.
+ /// @param Cond Condition returned by AnalyzeBranch.
+ /// @param TrueReg Virtual register to select when Cond is true.
+ /// @param FalseReg Virtual register to select when Cond is false.
+ /// @param CondCycles Latency from Cond+Branch to select output.
+ /// @param TrueCycles Latency from TrueReg to select output.
+ /// @param FalseCycles Latency from FalseReg to select output.
+ virtual bool canInsertSelect(const MachineBasicBlock &MBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
+ return false;
+ }
+
+ /// insertSelect - Insert a select instruction into MBB before I that will
+ /// copy TrueReg to DstReg when Cond is true, and FalseReg to DstReg when
+ /// Cond is false.
+ ///
+ /// This function can only be called after canInsertSelect() returned true.
+ /// The condition in Cond comes from AnalyzeBranch, and it can be assumed
+ /// that the same flags or registers required by Cond are available at the
+ /// insertion point.
+ ///
+ /// @param MBB Block where select instruction should be inserted.
+ /// @param I Insertion point.
+ /// @param DL Source location for debugging.
+ /// @param DstReg Virtual register to be defined by select instruction.
+ /// @param Cond Condition as computed by AnalyzeBranch.
+ /// @param TrueReg Virtual register to copy when Cond is true.
+ /// @param FalseReg Virtual register to copy when Cond is false.
+ virtual void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ llvm_unreachable("Target didn't implement TargetInstrInfo::insertSelect!");
+ }
+
/// copyPhysReg - Emit instructions to copy a pair of physical registers.
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
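
A hedged skeleton of how a target might override the two new select hooks documented above; MyTargetInstrInfo and everything inside the bodies are illustrative, and the remaining pure virtuals of TargetInstrInfo are omitted:

```cpp
// Sketch of a target overriding the new select-lowering hooks.
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

class MyTargetInstrInfo : public TargetInstrInfo {
public:
  virtual bool canInsertSelect(const MachineBasicBlock &MBB,
                               const SmallVectorImpl<MachineOperand> &Cond,
                               unsigned TrueReg, unsigned FalseReg,
                               int &CondCycles, int &TrueCycles,
                               int &FalseCycles) const {
    // Report cmov-style latencies. Per the documentation above, CondCycles
    // may go negative to credit the removed conditional branch.
    CondCycles = 1;
    TrueCycles = 1;
    FalseCycles = 1;
    return true;
  }
  virtual void insertSelect(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, DebugLoc DL,
                            unsigned DstReg,
                            const SmallVectorImpl<MachineOperand> &Cond,
                            unsigned TrueReg, unsigned FalseReg) const {
    // Emit the target's conditional-move sequence here.
  }
};
```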
@@ -608,6 +653,13 @@ public:
CreateTargetHazardRecognizer(const TargetMachine *TM,
const ScheduleDAG *DAG) const = 0;
+ /// CreateTargetMIHazardRecognizer - Allocate and return a hazard recognizer
+ /// to use for this target when scheduling the machine instructions before
+ /// register allocation.
+ virtual ScheduleHazardRecognizer*
+ CreateTargetMIHazardRecognizer(const InstrItineraryData*,
+ const ScheduleDAG *DAG) const = 0;
+
/// CreateTargetPostRAHazardRecognizer - Allocate and return a hazard
/// recognizer to use for this target when scheduling the machine instructions
/// after register allocation.
@@ -615,23 +667,40 @@ public:
CreateTargetPostRAHazardRecognizer(const InstrItineraryData*,
const ScheduleDAG *DAG) const = 0;
- /// AnalyzeCompare - For a comparison instruction, return the source register
- /// in SrcReg and the value it compares against in CmpValue. Return true if
- /// the comparison instruction can be analyzed.
- virtual bool AnalyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, int &Mask, int &Value) const {
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if having two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ virtual bool analyzeCompare(const MachineInstr *MI,
+ unsigned &SrcReg, unsigned &SrcReg2,
+ int &Mask, int &Value) const {
return false;
}
- /// OptimizeCompareInstr - See if the comparison instruction can be converted
+ /// optimizeCompareInstr - See if the comparison instruction can be converted
/// into something more efficient. E.g., on ARM most instructions can set the
/// flags register, obviating the need for a separate CMP.
- virtual bool OptimizeCompareInstr(MachineInstr *CmpInstr,
- unsigned SrcReg, int Mask, int Value,
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr,
+ unsigned SrcReg, unsigned SrcReg2,
+ int Mask, int Value,
const MachineRegisterInfo *MRI) const {
return false;
}
+ /// optimizeLoadInstr - Try to remove the load by folding it to a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB. We only look at one load and see
+ /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+ /// defined by the load we are trying to fold. DefMI returns the machine
+ /// instruction that defines FoldAsLoadDefReg, and the function returns
+ /// the machine instruction generated due to folding.
+ virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ return 0;
+ }
+
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
/// instruction, try to fold the immediate into the use instruction.
virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
@@ -640,9 +709,11 @@ public:
}
/// getNumMicroOps - Return the number of u-operations the given machine
- /// instruction will be decoded to on the target cpu.
+ /// instruction will be decoded to on the target cpu. The itinerary's
+ /// IssueWidth is the number of microops that can be dispatched each
+ /// cycle. An instruction with zero microops takes no dispatch resources.
virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const;
+ const MachineInstr *MI) const = 0;
/// isZeroCost - Return true for pseudo instructions that don't consume any
/// machine resources in their current form. These are common cases that the
@@ -652,18 +723,45 @@ public:
return Opcode <= TargetOpcode::COPY;
}
+ virtual int getOperandLatency(const InstrItineraryData *ItinData,
+ SDNode *DefNode, unsigned DefIdx,
+ SDNode *UseNode, unsigned UseIdx) const = 0;
+
/// getOperandLatency - Compute and return the use operand latency of a given
/// pair of def and use.
/// In most cases, the static scheduling itinerary was enough to determine the
/// operand latency. But it may not be possible for instructions with variable
/// number of defs / uses.
+ ///
+ /// This is a raw interface to the itinerary that may be directly overridden by
+ /// a target. Use computeOperandLatency to get the best estimate of latency.
virtual int getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const;
-
- virtual int getOperandLatency(const InstrItineraryData *ItinData,
- SDNode *DefNode, unsigned DefIdx,
- SDNode *UseNode, unsigned UseIdx) const = 0;
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const = 0;
+
+ /// computeOperandLatency - Compute and return the latency of the given data
+ /// dependent def and use when the operand indices are already known.
+ ///
+ /// FindMin may be set to get the minimum vs. expected latency.
+ unsigned computeOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx,
+ bool FindMin = false) const;
+
+ /// computeOperandLatency - Compute and return the latency of the given data
+ /// dependent def and use. DefMI must be a valid def. UseMI may be NULL for
+ /// an unknown use. If the subtarget allows, this may or may not need to call
+ /// getOperandLatency().
+ ///
+ /// FindMin may be set to get the minimum vs. expected latency. Minimum
+ /// latency is used for scheduling groups, while expected latency is for
+ /// instruction cost and critical path.
+ unsigned computeOperandLatency(const InstrItineraryData *ItinData,
+ const TargetRegisterInfo *TRI,
+ const MachineInstr *DefMI,
+ const MachineInstr *UseMI,
+ unsigned Reg, bool FindMin) const;
/// getOutputLatency - Compute and return the output dependency latency of a
/// given pair of defs which both target the same register. This is usually
@@ -677,13 +775,17 @@ public:
/// getInstrLatency - Compute the instruction latency of a given instruction.
/// If the instruction has higher cost when predicated, it's returned via
/// PredCost.
- virtual int getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
- unsigned *PredCost = 0) const;
+ virtual unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost = 0) const = 0;
virtual int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const = 0;
+ /// Return the default expected latency for a def based on its opcode.
+ unsigned defaultDefLatency(const MCSchedModel *SchedModel,
+ const MachineInstr *DefMI) const;
+
/// isHighLatencyDef - Return true if this opcode has high latency to its
/// result.
virtual bool isHighLatencyDef(int opc) const { return false; }
@@ -705,7 +807,7 @@ public:
/// if the target considered it 'low'.
virtual
bool hasLowDefLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx) const;
+ const MachineInstr *DefMI, unsigned DefIdx) const = 0;
/// verifyInstruction - Perform target specific instruction verification.
virtual
@@ -862,20 +964,40 @@ public:
virtual bool isSchedulingBoundary(const MachineInstr *MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const;
- using TargetInstrInfo::getOperandLatency;
+
virtual int getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
SDNode *UseNode, unsigned UseIdx) const;
- using TargetInstrInfo::getInstrLatency;
+
virtual int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const;
+ virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr *MI) const;
+
+ virtual unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost = 0) const;
+
+ virtual
+ bool hasLowDefLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx) const;
+
+ virtual int getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const;
+
bool usePreRAHazardRecognizer() const;
virtual ScheduleHazardRecognizer *
CreateTargetHazardRecognizer(const TargetMachine*, const ScheduleDAG*) const;
virtual ScheduleHazardRecognizer *
+ CreateTargetMIHazardRecognizer(const InstrItineraryData*,
+ const ScheduleDAG*) const;
+
+ virtual ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const InstrItineraryData*,
const ScheduleDAG*) const;
};
diff --git a/include/llvm/Target/TargetItinerary.td b/include/llvm/Target/TargetItinerary.td
new file mode 100644
index 0000000..cc74006
--- /dev/null
+++ b/include/llvm/Target/TargetItinerary.td
@@ -0,0 +1,136 @@
+//===- TargetItinerary.td - Target Itinerary Description ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent scheduling interfaces
+// which should be implemented by each target that uses instruction
+// itineraries for scheduling. Itineraries are detailed reservation
+// tables for each instruction class. They are most appropriate for
+// in-order machines with complicated scheduling or bundling constraints.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Processor functional unit - These values represent the functional units
+// available across all chip sets for the target, e.g. IntUnit, FPUnit, ...
+// These may be independent values for each chip set or may be shared across
+// all chip sets of the target. Each functional unit is treated as a resource
+// during scheduling and affects instruction order based on availability
+// during a time interval.
+//
+class FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Pipeline bypass / forwarding - These values specify the symbolic names of
+// pipeline bypasses which can be used to forward the results of one
+// instruction to the uses of another.
+class Bypass;
+def NoBypass : Bypass;
+
+class ReservationKind<bits<1> val> {
+ int Value = val;
+}
+
+def Required : ReservationKind<0>;
+def Reserved : ReservationKind<1>;
+
+//===----------------------------------------------------------------------===//
+// Instruction stage - These values represent a non-pipelined step in
+// the execution of an instruction. Cycles represents the number of
+// discrete time slots needed to complete the stage. Units represent
+// the choice of functional units that can be used to complete the
+// stage, e.g. IntUnit1, IntUnit2. NextCycles indicates how many
+// cycles should elapse from the start of this stage to the start of
+// the next stage in the itinerary. For example:
+//
+// A stage is specified in one of two ways:
+//
+// InstrStage<1, [FU_x, FU_y]> - TimeInc defaults to Cycles
+// InstrStage<1, [FU_x, FU_y], 0> - TimeInc explicit
+//
+
+class InstrStage<int cycles, list<FuncUnit> units,
+ int timeinc = -1,
+ ReservationKind kind = Required> {
+ int Cycles = cycles; // length of stage in machine cycles
+ list<FuncUnit> Units = units; // choice of functional units
+ int TimeInc = timeinc; // cycles till start of next stage
+ int Kind = kind.Value; // kind of FU reservation
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction itinerary - An itinerary represents a sequential series of steps
+// required to complete an instruction. Itineraries are represented as lists of
+// instruction stages.
+//
+
+//===----------------------------------------------------------------------===//
+// Instruction itinerary classes - These values represent 'named' instruction
+// itinerary. Using named itineraries simplifies managing groups of
+// instructions across chip sets. An instruction uses the same itinerary class
+// across all chip sets. Thus a new chip set can be added without modifying
+// instruction information.
+//
+class InstrItinClass;
+def NoItinerary : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Instruction itinerary data - These values provide a runtime map of an
+// instruction itinerary class (name) to its itinerary data.
+//
+// NumMicroOps represents the number of micro-operations that each instruction
+// in the class are decoded to. If the number is zero, then it means the
+// instruction can decode into a variable number of micro-ops and it must be
+// determined dynamically. This directly relates to the itinerary's
+// global IssueWidth property, which constrains the number of microops
+// that can issue per cycle.
+//
+// OperandCycles are optional "cycle counts". They specify the cycle after
+// instruction issue at which values corresponding to specific operand indices
+// are defined or read. Bypasses are optional "pipeline forwarding paths": if
+// a def by an instruction is available on a specific bypass and the use can
+// read from the same bypass, then the operand use latency is reduced by one.
+//
+// InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Pipe1]>,
+// InstrStage<1, [A9_AGU]>],
+// [3, 1], [A9_LdBypass]>,
+// InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+// [1, 1], [NoBypass, A9_LdBypass]>,
+//
+// In this example, an instruction of class IIC_iLoad_i reads its input on cycle 1
+// (after issue) and the result of the load is available on cycle 3. The result
+// is available via forwarding path A9_LdBypass. If it's used by the first
+// source operand of instructions of IIC_iMVNr class, then the operand latency
+// is reduced by 1.
+class InstrItinData<InstrItinClass Class, list<InstrStage> stages,
+ list<int> operandcycles = [],
+ list<Bypass> bypasses = [], int uops = 1> {
+ InstrItinClass TheClass = Class;
+ int NumMicroOps = uops;
+ list<InstrStage> Stages = stages;
+ list<int> OperandCycles = operandcycles;
+ list<Bypass> Bypasses = bypasses;
+}
+
+//===----------------------------------------------------------------------===//
+// Processor itineraries - These values represent the set of all itinerary
+// classes for a given chip set.
+//
+// Set property values to -1 to use the default.
+// See InstrItineraryProps for comments and defaults.
+class ProcessorItineraries<list<FuncUnit> fu, list<Bypass> bp,
+ list<InstrItinData> iid> {
+ list<FuncUnit> FU = fu;
+ list<Bypass> BP = bp;
+ list<InstrItinData> IID = iid;
+}
+
+// NoItineraries - A marker that can be used by processors without schedule
+// info. Subtargets using NoItineraries can bypass the scheduler's
+// expensive HazardRecognizer because no reservation table is needed.
+def NoItineraries : ProcessorItineraries<[], [], []>;
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index c8cacf2..ea2874f 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -18,36 +18,47 @@ namespace llvm {
namespace LibFunc {
enum Func {
+ /// int __cxa_atexit(void (*f)(void *), void *p, void *d);
+ cxa_atexit,
+ /// void __cxa_guard_abort(guard_t *guard);
+ /// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
+ cxa_guard_abort,
+ /// int __cxa_guard_acquire(guard_t *guard);
+ cxa_guard_acquire,
+ /// void __cxa_guard_release(guard_t *guard);
+ cxa_guard_release,
+ /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size);
+ memcpy_chk,
/// double acos(double x);
acos,
- /// long double acosl(long double x);
- acosl,
/// float acosf(float x);
acosf,
+ /// long double acosl(long double x);
+ acosl,
/// double asin(double x);
asin,
- /// long double asinl(long double x);
- asinl,
/// float asinf(float x);
asinf,
+ /// long double asinl(long double x);
+ asinl,
/// double atan(double x);
atan,
- /// long double atanl(long double x);
- atanl,
- /// float atanf(float x);
- atanf,
/// double atan2(double y, double x);
atan2,
- /// long double atan2l(long double y, long double x);
- atan2l,
/// float atan2f(float y, float x);
atan2f,
+ /// long double atan2l(long double y, long double x);
+ atan2l,
+ /// float atanf(float x);
+ atanf,
+ /// long double atanl(long double x);
+ atanl,
/// double ceil(double x);
ceil,
- /// long double ceill(long double x);
- ceill,
/// float ceilf(float x);
ceilf,
+ /// long double ceill(long double x);
+ ceill,
/// double copysign(double x, double y);
copysign,
/// float copysignf(float x, float y);
@@ -56,54 +67,56 @@ namespace llvm {
copysignl,
/// double cos(double x);
cos,
- /// long double cosl(long double x);
- cosl,
/// float cosf(float x);
cosf,
/// double cosh(double x);
cosh,
- /// long double coshl(long double x);
- coshl,
/// float coshf(float x);
coshf,
+ /// long double coshl(long double x);
+ coshl,
+ /// long double cosl(long double x);
+ cosl,
/// double exp(double x);
exp,
- /// long double expl(long double x);
- expl,
- /// float expf(float x);
- expf,
/// double exp2(double x);
exp2,
- /// long double exp2l(long double x);
- exp2l,
/// float exp2f(float x);
exp2f,
+ /// long double exp2l(long double x);
+ exp2l,
+ /// float expf(float x);
+ expf,
+ /// long double expl(long double x);
+ expl,
/// double expm1(double x);
expm1,
- /// long double expm1l(long double x);
- expm1l,
/// float expm1f(float x);
expm1f,
+ /// long double expm1l(long double x);
+ expm1l,
/// double fabs(double x);
fabs,
- /// long double fabsl(long double x);
- fabsl,
/// float fabsf(float x);
fabsf,
+ /// long double fabsl(long double x);
+ fabsl,
+ /// int fiprintf(FILE *stream, const char *format, ...);
+ fiprintf,
/// double floor(double x);
floor,
- /// long double floorl(long double x);
- floorl,
/// float floorf(float x);
floorf,
- /// int fiprintf(FILE *stream, const char *format, ...);
- fiprintf,
+ /// long double floorl(long double x);
+ floorl,
/// double fmod(double x, double y);
fmod,
- /// long double fmodl(long double x, long double y);
- fmodl,
/// float fmodf(float x, float y);
fmodf,
+ /// long double fmodl(long double x, long double y);
+ fmodl,
+ /// int fputc(int c, FILE *stream);
+ fputc,
/// int fputs(const char *s, FILE *stream);
fputs,
/// size_t fwrite(const void *ptr, size_t size, size_t nitems,
@@ -113,28 +126,32 @@ namespace llvm {
iprintf,
/// double log(double x);
log,
- /// long double logl(long double x);
- logl,
- /// float logf(float x);
- logf,
- /// double log2(double x);
- log2,
- /// double long double log2l(long double x);
- log2l,
- /// float log2f(float x);
- log2f,
/// double log10(double x);
log10,
- /// long double log10l(long double x);
- log10l,
/// float log10f(float x);
log10f,
+ /// long double log10l(long double x);
+ log10l,
/// double log1p(double x);
log1p,
- /// long double log1pl(long double x);
- log1pl,
/// float log1pf(float x);
log1pf,
+ /// long double log1pl(long double x);
+ log1pl,
+ /// double log2(double x);
+ log2,
+ /// float log2f(float x);
+ log2f,
+ /// long double log2l(long double x);
+ log2l,
+ /// float logf(float x);
+ logf,
+ /// long double logl(long double x);
+ logl,
+ /// void *memchr(const void *s, int c, size_t n);
+ memchr,
+ /// int memcmp(const void *s1, const void *s2, size_t n);
+ memcmp,
/// void *memcpy(void *s1, const void *s2, size_t n);
memcpy,
/// void *memmove(void *s1, const void *s2, size_t n);
@@ -155,6 +172,10 @@ namespace llvm {
powf,
/// long double powl(long double x, long double y);
powl,
+ /// int putchar(int c);
+ putchar,
+ /// int puts(const char *s);
+ puts,
/// double rint(double x);
rint,
/// float rintf(float x);
@@ -169,51 +190,58 @@ namespace llvm {
roundl,
/// double sin(double x);
sin,
- /// long double sinl(long double x);
- sinl,
/// float sinf(float x);
sinf,
/// double sinh(double x);
sinh,
- /// long double sinhl(long double x);
- sinhl,
/// float sinhf(float x);
sinhf,
+ /// long double sinhl(long double x);
+ sinhl,
+ /// long double sinl(long double x);
+ sinl,
/// int siprintf(char *str, const char *format, ...);
siprintf,
/// double sqrt(double x);
sqrt,
- /// long double sqrtl(long double x);
- sqrtl,
/// float sqrtf(float x);
sqrtf,
+ /// long double sqrtl(long double x);
+ sqrtl,
+ /// char *strcat(char *s1, const char *s2);
+ strcat,
+ /// char *strchr(const char *s, int c);
+ strchr,
+ /// char *strcpy(char *s1, const char *s2);
+ strcpy,
+ /// size_t strlen(const char *s);
+ strlen,
+ /// char *strncat(char *s1, const char *s2, size_t n);
+ strncat,
+ /// int strncmp(const char *s1, const char *s2, size_t n);
+ strncmp,
+ /// char *strncpy(char *s1, const char *s2, size_t n);
+ strncpy,
+ /// size_t strnlen(const char *s, size_t maxlen);
+ strnlen,
/// double tan(double x);
tan,
- /// long double tanl(long double x);
- tanl,
/// float tanf(float x);
tanf,
/// double tanh(double x);
tanh,
- /// long double tanhl(long double x);
- tanhl,
/// float tanhf(float x);
tanhf,
+ /// long double tanhl(long double x);
+ tanhl,
+ /// long double tanl(long double x);
+ tanl,
/// double trunc(double x);
trunc,
/// float truncf(float x);
truncf,
/// long double truncl(long double x);
truncl,
- /// int __cxa_atexit(void (*f)(void *), void *p, void *d);
- cxa_atexit,
- /// void __cxa_guard_abort(guard_t *guard);
- /// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
- cxa_guard_abort,
- /// int __cxa_guard_acquire(guard_t *guard);
- cxa_guard_acquire,
- /// void __cxa_guard_release(guard_t *guard);
- cxa_guard_release,
NumLibFuncs
};
@@ -247,12 +275,41 @@ public:
TargetLibraryInfo(const Triple &T);
explicit TargetLibraryInfo(const TargetLibraryInfo &TLI);
+ /// getLibFunc - Search for a particular function name. If it is one of the
+ /// known library functions, return true and set F to the corresponding value.
+ bool getLibFunc(StringRef funcName, LibFunc::Func &F) const;
+
/// has - This function is used by optimizations that want to match on or form
/// a given library function.
bool has(LibFunc::Func F) const {
return getState(F) != Unavailable;
}
+ /// hasOptimizedCodeGen - Return true if the function is both available as
+ /// a builtin and a candidate for optimized code generation.
+ bool hasOptimizedCodeGen(LibFunc::Func F) const {
+ if (getState(F) == Unavailable)
+ return false;
+ switch (F) {
+ default: break;
+ case LibFunc::copysign: case LibFunc::copysignf: case LibFunc::copysignl:
+ case LibFunc::fabs: case LibFunc::fabsf: case LibFunc::fabsl:
+ case LibFunc::sin: case LibFunc::sinf: case LibFunc::sinl:
+ case LibFunc::cos: case LibFunc::cosf: case LibFunc::cosl:
+ case LibFunc::sqrt: case LibFunc::sqrtf: case LibFunc::sqrtl:
+ case LibFunc::floor: case LibFunc::floorf: case LibFunc::floorl:
+ case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl:
+ case LibFunc::ceil: case LibFunc::ceilf: case LibFunc::ceill:
+ case LibFunc::rint: case LibFunc::rintf: case LibFunc::rintl:
+ case LibFunc::trunc: case LibFunc::truncf: case LibFunc::truncl:
+ case LibFunc::log2: case LibFunc::log2f: case LibFunc::log2l:
+ case LibFunc::exp2: case LibFunc::exp2f: case LibFunc::exp2l:
+ case LibFunc::memcmp:
+ return true;
+ }
+ return false;
+ }
+
StringRef getName(LibFunc::Func F) const {
AvailabilityState State = getState(F);
if (State == Unavailable)
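
getLibFunc() and hasOptimizedCodeGen() are both introduced by this hunk. A sketch of the combined query an optimization would make (canEmitAsIntrinsic is illustrative):

```cpp
// Sketch of the new name-based lookup plus the codegen-availability check.
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/StringRef.h"

using namespace llvm;

bool canEmitAsIntrinsic(const TargetLibraryInfo &TLI, StringRef Name) {
  LibFunc::Func F;
  // getLibFunc maps e.g. "sqrtf" to LibFunc::sqrtf when the name is known.
  if (!TLI.getLibFunc(Name, F))
    return false;
  // Available *and* a candidate for optimized lowering rather than a libcall.
  return TLI.hasOptimizedCodeGen(F);
}
```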
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 720c9df..acf0419 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -25,6 +25,7 @@
#include "llvm/CallingConv.h"
#include "llvm/InlineAsm.h"
#include "llvm/Attributes.h"
+#include "llvm/Support/CallSite.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/Support/DebugLoc.h"
@@ -50,6 +51,7 @@ namespace llvm {
template<typename T> class SmallVectorImpl;
class TargetData;
class TargetRegisterClass;
+ class TargetLibraryInfo;
class TargetLoweringObjectFile;
class Value;
@@ -150,6 +152,12 @@ public:
/// that should be avoided.
bool isJumpExpensive() const { return JumpIsExpensive; }
+ /// isPredictableSelectExpensive - Return true if selects are only cheaper
+ /// than branches if the branch is unlikely to be predicted right.
+ bool isPredictableSelectExpensive() const {
+ return predictableSelectIsExpensive;
+ }
+
/// getSetCCResultType - Return the ValueType of the result of SETCC
/// operations. Also used to obtain the target's preferred type for
/// the condition operand of SELECT and BRCOND nodes. In the case of
@@ -358,7 +366,9 @@ public:
/// for it.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const {
if (VT.isExtended()) return Expand;
- assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!");
+ // If a target-specific SDNode requires legalization, require the target
+ // to provide custom legalization for it.
+ if (Op > array_lengthof(OpActions[0])) return Custom;
unsigned I = (unsigned) VT.getSimpleVT().SimpleTy;
return (LegalizeAction)OpActions[I][Op];
}
@@ -670,6 +680,12 @@ public:
return UseUnderscoreLongJmp;
}
+ /// supportJumpTables - return whether the target can generate code for
+ /// jump tables.
+ bool supportJumpTables() const {
+ return SupportJumpTables;
+ }
+
/// getStackPointerRegisterToSaveRestore - If a physical register, this
/// specifies the register that llvm.savestack/llvm.restorestack should save
/// and restore.
@@ -984,6 +1000,12 @@ protected:
UseUnderscoreLongJmp = Val;
}
+ /// setSupportJumpTables - Indicate whether the target can generate code for
+ /// jump tables.
+ void setSupportJumpTables(bool Val) {
+ SupportJumpTables = Val;
+ }
+
/// setStackPointerRegisterToSaveRestore - If set to a physical register, this
/// specifies the register that llvm.savestack/llvm.restorestack should save
/// and restore.
@@ -1169,7 +1191,7 @@ protected:
ShouldFoldAtomicFences = fold;
}
- /// setInsertFencesForAtomic - Set if the the DAG builder should
+ /// setInsertFencesForAtomic - Set if the DAG builder should
/// automatically insert fences and reduce the order of atomic memory
/// operations to Monotonic.
void setInsertFencesForAtomic(bool fence) {
@@ -1197,11 +1219,6 @@ public:
llvm_unreachable("Not Implemented");
}
- /// LowerCallTo - This function lowers an abstract call to a function into an
- /// actual call. This returns a pair of operands. The first element is the
- /// return value for the function (if RetTy is not VoidTy). The second
- /// element is the outgoing token chain. It calls LowerCall to do the actual
- /// lowering.
struct ArgListEntry {
SDValue Node;
Type* Ty;
@@ -1217,13 +1234,72 @@ public:
isSRet(false), isNest(false), isByVal(false), Alignment(0) { }
};
typedef std::vector<ArgListEntry> ArgListTy;
- std::pair<SDValue, SDValue>
- LowerCallTo(SDValue Chain, Type *RetTy, bool RetSExt, bool RetZExt,
- bool isVarArg, bool isInreg, unsigned NumFixedArgs,
- CallingConv::ID CallConv, bool isTailCall,
- bool doesNotRet, bool isReturnValueUsed,
- SDValue Callee, ArgListTy &Args,
- SelectionDAG &DAG, DebugLoc dl) const;
+
+ /// CallLoweringInfo - This structure contains all information that is
+ /// necessary for lowering calls. It is passed to TLI::LowerCallTo when the
+ /// SelectionDAG builder needs to lower a call, and targets will see this
+ /// struct in their LowerCall implementation.
+ struct CallLoweringInfo {
+ SDValue Chain;
+ Type *RetTy;
+ bool RetSExt : 1;
+ bool RetZExt : 1;
+ bool IsVarArg : 1;
+ bool IsInReg : 1;
+ bool DoesNotReturn : 1;
+ bool IsReturnValueUsed : 1;
+
+ // IsTailCall should be modified by implementations of
+ // TargetLowering::LowerCall that perform tail call conversions.
+ bool IsTailCall;
+
+ unsigned NumFixedArgs;
+ CallingConv::ID CallConv;
+ SDValue Callee;
+ ArgListTy &Args;
+ SelectionDAG &DAG;
+ DebugLoc DL;
+ ImmutableCallSite *CS;
+ SmallVector<ISD::OutputArg, 32> Outs;
+ SmallVector<SDValue, 32> OutVals;
+ SmallVector<ISD::InputArg, 32> Ins;
+
+
+ /// CallLoweringInfo - Constructs a call lowering context based on the
+ /// ImmutableCallSite \p cs.
+ CallLoweringInfo(SDValue chain, Type *retTy,
+ FunctionType *FTy, bool isTailCall, SDValue callee,
+ ArgListTy &args, SelectionDAG &dag, DebugLoc dl,
+ ImmutableCallSite &cs)
+ : Chain(chain), RetTy(retTy), RetSExt(cs.paramHasAttr(0, Attribute::SExt)),
+ RetZExt(cs.paramHasAttr(0, Attribute::ZExt)), IsVarArg(FTy->isVarArg()),
+ IsInReg(cs.paramHasAttr(0, Attribute::InReg)),
+ DoesNotReturn(cs.doesNotReturn()),
+ IsReturnValueUsed(!cs.getInstruction()->use_empty()),
+ IsTailCall(isTailCall), NumFixedArgs(FTy->getNumParams()),
+ CallConv(cs.getCallingConv()), Callee(callee), Args(args), DAG(dag),
+ DL(dl), CS(&cs) {}
+
+ /// CallLoweringInfo - Constructs a call lowering context based on the
+ /// provided call information.
+ CallLoweringInfo(SDValue chain, Type *retTy, bool retSExt, bool retZExt,
+ bool isVarArg, bool isInReg, unsigned numFixedArgs,
+ CallingConv::ID callConv, bool isTailCall,
+ bool doesNotReturn, bool isReturnValueUsed, SDValue callee,
+ ArgListTy &args, SelectionDAG &dag, DebugLoc dl)
+ : Chain(chain), RetTy(retTy), RetSExt(retSExt), RetZExt(retZExt),
+ IsVarArg(isVarArg), IsInReg(isInReg), DoesNotReturn(doesNotReturn),
+ IsReturnValueUsed(isReturnValueUsed), IsTailCall(isTailCall),
+ NumFixedArgs(numFixedArgs), CallConv(callConv), Callee(callee),
+ Args(args), DAG(dag), DL(dl), CS(NULL) {}
+ };
+
+ /// LowerCallTo - This function lowers an abstract call to a function into an
+ /// actual call. This returns a pair of operands. The first element is the
+ /// return value for the function (if RetTy is not VoidTy). The second
+ /// element is the outgoing token chain. It calls LowerCall to do the actual
+ /// lowering.
+ std::pair<SDValue, SDValue> LowerCallTo(CallLoweringInfo &CLI) const;
/// LowerCall - This hook must be implemented to lower calls into the
/// specified DAG. The outgoing arguments to the call are described
@@ -1232,13 +1308,7 @@ public:
/// InVals array with legal-type return values from the call, and return
/// the resulting token chain value.
virtual SDValue
- LowerCall(SDValue /*Chain*/, SDValue /*Callee*/,
- CallingConv::ID /*CallConv*/, bool /*isVarArg*/,
- bool /*doesNotRet*/, bool &/*isTailCall*/,
- const SmallVectorImpl<ISD::OutputArg> &/*Outs*/,
- const SmallVectorImpl<SDValue> &/*OutVals*/,
- const SmallVectorImpl<ISD::InputArg> &/*Ins*/,
- DebugLoc /*dl*/, SelectionDAG &/*DAG*/,
+ LowerCall(CallLoweringInfo &/*CLI*/,
SmallVectorImpl<SDValue> &/*InVals*/) const {
llvm_unreachable("Not Implemented");
}
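A hedged sketch of the new hook shape on the target side (XYZTargetLowering is hypothetical; the body only shows where the old loose parameters now live):

SDValue
XYZTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;  // the former parameters are now CLI members
  CLI.IsTailCall = false;       // targets may reject or convert tail calls
  // ... lower arguments per CLI.Outs/CLI.OutVals, emit the call sequence,
  // and fill InVals with legal-type values according to CLI.Ins ...
  return DAG.getEntryNode();    // placeholder chain for this sketch
}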
@@ -1251,7 +1321,7 @@ public:
/// registers. If false is returned, an sret-demotion is performed.
///
virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/,
- MachineFunction &/*MF*/, bool /*isVarArg*/,
+ MachineFunction &/*MF*/, bool /*isVarArg*/,
const SmallVectorImpl<ISD::OutputArg> &/*Outs*/,
LLVMContext &/*Context*/) const
{
@@ -1346,7 +1416,8 @@ public:
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
- virtual FastISel *createFastISel(FunctionLoweringInfo &) const {
+ virtual FastISel *createFastISel(FunctionLoweringInfo &,
+ const TargetLibraryInfo *) const {
return 0;
}
@@ -1602,6 +1673,14 @@ public:
return false;
}
+ /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+ /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+ /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+ /// is expanded to mul + add.
+ virtual bool isFMAFasterThanMulAndAdd(EVT) const {
+ return false;
+ }
+
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
/// from i32 to i8 but not from i32 to i16.
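For example (hypothetical target; a real implementation would likely key this off the subtarget), a backend whose FPU fuses single- and double-precision operations would override:

bool XYZTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
  // fmuladd expands to ISD::FMA for these types once FMA is legal.
  return VT == MVT::f32 || VT == MVT::f64;
}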
@@ -1665,13 +1744,6 @@ private:
const TargetData *TD;
const TargetLoweringObjectFile &TLOF;
- /// We are in the process of implementing a new TypeLegalization action
- /// which is the promotion of vector elements. This feature is under
- /// development. Until this feature is complete, it is only enabled using a
- /// flag. We pass this flag using a member because of circular dep issues.
- /// This member will be removed with the flag once we complete the transition.
- bool mayPromoteElements;
-
/// PointerTy - The type to use for pointers, usually i32 or i64.
///
MVT PointerTy;
@@ -1708,6 +1780,10 @@ private:
/// llvm.longjmp. Defaults to false.
bool UseUnderscoreLongJmp;
+ /// SupportJumpTables - Whether the target can generate code for jump tables.
+ /// If it cannot, each jump table must be lowered into an if-then-else chain.
+ bool SupportJumpTables;
+
/// BooleanContents - Information about the contents of the high-bits in
/// boolean values held in a type wider than i1. See getBooleanContents.
BooleanContent BooleanContents;
@@ -1875,9 +1951,8 @@ private:
if (NumElts == 1)
return LegalizeKind(TypeScalarizeVector, EltVT);
- // If we allow the promotion of vector elements using a flag,
- // then try to widen vector elements until a legal type is found.
- if (mayPromoteElements && EltVT.isInteger()) {
+ // Try to widen vector elements until a legal type is found.
+ if (EltVT.isInteger()) {
// Vectors with a number of elements that is not a power of two are always
// widened, for example <3 x float> -> <4 x float>.
if (!VT.isPow2VectorType()) {
@@ -2028,14 +2103,14 @@ protected:
/// optimization.
bool benefitFromCodePlacementOpt;
+ /// predictableSelectIsExpensive - Tells the code generator that select is
+ /// more expensive than a branch if the branch is usually predicted right.
+ bool predictableSelectIsExpensive;
+
private:
/// isLegalRC - Return true if the value types that can be represented by the
/// specified register class are all legal.
bool isLegalRC(const TargetRegisterClass *RC) const;
-
- /// hasLegalSuperRegRegClasses - Return true if the specified register class
- /// has one or more super-reg register classes that are legal.
- bool hasLegalSuperRegRegClasses(const TargetRegisterClass *RC) const;
};
/// GetReturnInfo - Given an LLVM IR type and return type attributes,
@@ -2043,8 +2118,7 @@ private:
/// the offsets, if the return value is being lowered to memory.
void GetReturnInfo(Type* ReturnType, Attributes attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
- const TargetLowering &TLI,
- SmallVectorImpl<uint64_t> *Offsets = 0);
+ const TargetLowering &TLI);
} // end llvm namespace
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index 1a05604..e4bf32b 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -14,6 +14,7 @@
#ifndef LLVM_TARGET_TARGETMACHINE_H
#define LLVM_TARGET_TARGETMACHINE_H
+#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/StringRef.h"
@@ -247,7 +248,9 @@ public:
virtual bool addPassesToEmitFile(PassManagerBase &,
formatted_raw_ostream &,
CodeGenFileType,
- bool /*DisableVerify*/ = true) {
+ bool /*DisableVerify*/ = true,
+ AnalysisID StartAfter = 0,
+ AnalysisID StopAfter = 0) {
return true;
}
@@ -297,7 +300,9 @@ public:
virtual bool addPassesToEmitFile(PassManagerBase &PM,
formatted_raw_ostream &Out,
CodeGenFileType FileType,
- bool DisableVerify = true);
+ bool DisableVerify = true,
+ AnalysisID StartAfter = 0,
+ AnalysisID StopAfter = 0);
/// addPassesToEmitMachineCode - Add passes to the specified pass manager to
/// get machine code emitted. This uses a JITCodeEmitter object to handle
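A sketch of what the new parameters enable for a driver such as llc (TM, PM, Out, and StopAfterID are assumed to be set up elsewhere): the codegen pipeline can now be bisected around a named machine pass.

// Stop right after one pass and emit whatever has been generated so far.
bool Failed = TM->addPassesToEmitFile(PM, Out,
                                      TargetMachine::CGFT_AssemblyFile,
                                      /*DisableVerify=*/true,
                                      /*StartAfter=*/0,
                                      /*StopAfter=*/StopAfterID);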
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 12a2757..d1a07d1 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -30,20 +30,28 @@ namespace llvm {
};
}
+ namespace FPOpFusion {
+ enum FPOpFusionMode {
+ Fast, // Enable fusion of FP ops wherever it's profitable.
+ Standard, // Only allow fusion of 'blessed' ops (currently just fmuladd).
+ Strict // Never fuse FP-ops.
+ };
+ }
+
class TargetOptions {
public:
TargetOptions()
: PrintMachineCode(false), NoFramePointerElim(false),
NoFramePointerElimNonLeaf(false), LessPreciseFPMADOption(false),
- NoExcessFPPrecision(false), UnsafeFPMath(false), NoInfsFPMath(false),
+ UnsafeFPMath(false), NoInfsFPMath(false),
NoNaNsFPMath(false), HonorSignDependentRoundingFPMathOption(false),
UseSoftFloat(false), NoZerosInBSS(false), JITExceptionHandling(false),
JITEmitDebugInfo(false), JITEmitDebugInfoToDisk(false),
GuaranteedTailCallOpt(false), DisableTailCalls(false),
- StackAlignmentOverride(0), RealignStack(true),
- DisableJumpTables(false), EnableFastISel(false),
+ StackAlignmentOverride(0), RealignStack(true), EnableFastISel(false),
PositionIndependentExecutable(false), EnableSegmentedStacks(false),
- TrapFuncName(""), FloatABIType(FloatABI::Default)
+ UseInitArray(false), TrapFuncName(""), FloatABIType(FloatABI::Default),
+ AllowFPOpFusion(FPOpFusion::Standard)
{}
/// PrintMachineCode - This flag is enabled when the -print-machineinstrs
@@ -74,14 +82,6 @@ namespace llvm {
unsigned LessPreciseFPMADOption : 1;
bool LessPreciseFPMAD() const;
- /// NoExcessFPPrecision - This flag is enabled when the
- /// -disable-excess-fp-precision flag is specified on the command line.
- /// When this flag is off (the default), the code generator is allowed to
- /// produce results that are "more precise" than IEEE allows. This includes
- /// use of FMA-like operations and use of the X86 FP registers without
- /// rounding all over the place.
- unsigned NoExcessFPPrecision : 1;
-
/// UnsafeFPMath - This flag is enabled when the
/// -enable-unsafe-fp-math flag is specified on the command line. When
/// this flag is off (the default), the code generator is not allowed to
@@ -155,10 +155,6 @@ namespace llvm {
/// automatically realigned, if needed.
unsigned RealignStack : 1;
- /// DisableJumpTables - This flag indicates jump tables should not be
- /// generated.
- unsigned DisableJumpTables : 1;
-
/// EnableFastISel - This flag enables fast-path instruction selection
/// which trades away generated code quality in favor of reducing
/// compile time.
@@ -172,6 +168,10 @@ namespace llvm {
unsigned EnableSegmentedStacks : 1;
+ /// UseInitArray - Use .init_array instead of .ctors for static
+ /// constructors.
+ unsigned UseInitArray : 1;
+
/// getTrapFunctionName - If this returns a non-empty string, this means
/// isel should lower Intrinsic::trap to a call to the specified function
/// name instead of an ISD::TRAP node.
@@ -185,6 +185,25 @@ namespace llvm {
/// Such a combination is unfortunately popular (e.g. arm-apple-darwin).
/// Hard presumes that the normal FP ABI is used.
FloatABI::ABIType FloatABIType;
+
+ /// AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option.
+ /// This controls the creation of fused FP ops that store intermediate
+ /// results in higher precision than IEEE allows (e.g. FMAs).
+ ///
+ /// Fast mode - allows formation of fused FP ops whenever they're
+ /// profitable.
+ /// Standard mode - allow fusion only for 'blessed' FP ops. At present the
+ /// only blessed op is the fmuladd intrinsic. In the future more blessed ops
+ /// may be added.
+ /// Strict mode - allow fusion only if/when it can be proven that the excess
+ /// precision won't affect the result.
+ ///
+ /// Note: This option only controls formation of fused ops by the optimizers.
+ /// Fused operations that are explicitly specified (e.g. FMA via the
+ /// llvm.fma.* intrinsic) will always be honored, regardless of the value of
+ /// this option.
+ FPOpFusion::FPOpFusionMode AllowFPOpFusion;
+
};
} // End llvm namespace
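Taken together, a front end would configure the new knobs roughly like this (a minimal sketch; surrounding setup elided):

TargetOptions Options;
Options.AllowFPOpFusion = FPOpFusion::Fast; // fuse FP ops wherever profitable
Options.UseInitArray = 1;                   // emit .init_array, not .ctors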
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index 6ddd364..df4d900 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -42,9 +42,9 @@ public:
// Instance variables filled by tablegen, do not use!
const MCRegisterClass *MC;
const vt_iterator VTs;
- const unsigned *SubClassMask;
+ const uint32_t *SubClassMask;
+ const uint16_t *SuperRegIndices;
const sc_iterator SuperClasses;
- const sc_iterator SuperRegClasses;
ArrayRef<uint16_t> (*OrderFunc)(const MachineFunction&);
/// getID() - Return the register class ID number.
@@ -119,18 +119,6 @@ public:
return I;
}
- /// superregclasses_begin / superregclasses_end - Loop over all of
- /// the superreg register classes of this register class.
- sc_iterator superregclasses_begin() const {
- return SuperRegClasses;
- }
-
- sc_iterator superregclasses_end() const {
- sc_iterator I = SuperRegClasses;
- while (*I != NULL) ++I;
- return I;
- }
-
/// hasSubClass - return true if the specified TargetRegisterClass
/// is a proper sub-class of this TargetRegisterClass.
bool hasSubClass(const TargetRegisterClass *RC) const {
@@ -163,6 +151,18 @@ public:
return SubClassMask;
}
+ /// getSuperRegIndices - Returns a 0-terminated list of sub-register indices
+ /// that project some super-register class into this register class. The list
+ /// has an entry for each Idx such that:
+ ///
+ /// There exists SuperRC where:
+ /// For all Reg in SuperRC:
+ /// this->contains(Reg:Idx)
+ ///
+ const uint16_t *getSuperRegIndices() const {
+ return SuperRegIndices;
+ }
+
/// getSuperClasses - Returns a NULL terminated list of super-classes. The
/// classes are ordered by ID which is also a topological ordering from large
/// to small classes. The list does NOT include the current class.
@@ -301,6 +301,11 @@ public:
const TargetRegisterClass *
getMinimalPhysRegClass(unsigned Reg, EVT VT = MVT::Other) const;
+ /// getAllocatableClass - Return the maximal subclass of the given register
+ /// class that is allocatable, or NULL.
+ const TargetRegisterClass *
+ getAllocatableClass(const TargetRegisterClass *RC) const;
+
/// getAllocatableSet - Returns a bitset indexed by register number
/// indicating if a register is allocatable or not. If a register class is
/// specified, returns the subset for the class.
@@ -332,9 +337,23 @@ public:
if (regA == regB) return true;
if (isVirtualRegister(regA) || isVirtualRegister(regB))
return false;
- for (const uint16_t *regList = getOverlaps(regA)+1; *regList; ++regList) {
- if (*regList == regB) return true;
- }
+
+ // Regunits are numerically ordered. Find a common unit.
+ MCRegUnitIterator RUA(regA, this);
+ MCRegUnitIterator RUB(regB, this);
+ do {
+ if (*RUA == *RUB) return true;
+ if (*RUA < *RUB) ++RUA;
+ else ++RUB;
+ } while (RUA.isValid() && RUB.isValid());
+ return false;
+ }
+
+ /// hasRegUnit - Returns true if Reg contains RegUnit.
+ bool hasRegUnit(unsigned Reg, unsigned RegUnit) const {
+ for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units)
+ if (*Units == RegUnit)
+ return true;
return false;
}
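The new regsOverlap is a textbook intersection of two sorted sequences; the same idea in isolation (a standalone sketch, not code from this patch):

// Return true if two ascending ranges share an element.
static bool sortedRangesIntersect(const unsigned *A, const unsigned *AEnd,
                                  const unsigned *B, const unsigned *BEnd) {
  while (A != AEnd && B != BEnd) {
    if (*A == *B) return true;
    if (*A < *B) ++A; else ++B;  // advance only the smaller side
  }
  return false;
}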
@@ -346,10 +365,10 @@ public:
/// isSuperRegister - Returns true if regB is a super-register of regA.
///
- bool isSuperRegister(unsigned regA, unsigned regB) const {
- for (const uint16_t *regList = getSuperRegisters(regA); *regList;++regList){
- if (*regList == regB) return true;
- }
+ bool isSuperRegister(unsigned RegA, unsigned RegB) const {
+ for (MCSuperRegIterator I(RegA, this); I.isValid(); ++I)
+ if (*I == RegB)
+ return true;
return false;
}
@@ -416,7 +435,7 @@ public:
/// TableGen will synthesize missing A sub-classes.
virtual const TargetRegisterClass *
getMatchingSuperRegClass(const TargetRegisterClass *A,
- const TargetRegisterClass *B, unsigned Idx) const =0;
+ const TargetRegisterClass *B, unsigned Idx) const;
/// getSubClassWithSubReg - Returns the largest legal sub-class of RC that
/// supports the sub-register index Idx.
@@ -431,7 +450,10 @@ public:
///
/// TableGen will synthesize missing RC sub-classes.
virtual const TargetRegisterClass *
- getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const =0;
+ getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const {
+ assert(Idx == 0 && "Target has no sub-registers");
+ return RC;
+ }
/// composeSubRegIndices - Return the subregister index you get from composing
/// two subregister indices.
@@ -450,6 +472,34 @@ public:
return b;
}
+ /// getCommonSuperRegClass - Find a common super-register class if it exists.
+ ///
+ /// Find a register class, SuperRC and two sub-register indices, PreA and
+ /// PreB, such that:
+ ///
+ /// 1. PreA + SubA == PreB + SubB (using composeSubRegIndices()), and
+ ///
+ /// 2. For all Reg in SuperRC: Reg:PreA in RCA and Reg:PreB in RCB, and
+ ///
+ /// 3. SuperRC->getSize() >= max(RCA->getSize(), RCB->getSize()).
+ ///
+ /// SuperRC will be chosen such that no super-class of SuperRC satisfies the
+ /// requirements, and there is no register class with a smaller spill size
+ /// that satisfies the requirements.
+ ///
+ /// SubA and SubB must not be 0. Use getMatchingSuperRegClass() instead.
+ ///
+ /// Either of the PreA and PreB sub-register indices may be returned as 0. In
+ /// that case, the returned register class will be a sub-class of the
+ /// corresponding argument register class.
+ ///
+ /// The function returns NULL if no register class can be found.
+ ///
+ const TargetRegisterClass*
+ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
+ const TargetRegisterClass *RCB, unsigned SubB,
+ unsigned &PreA, unsigned &PreB) const;
+
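A sketch of a typical call site, e.g. a coalescer asking whether two sub-register values can share one register (TRI, RCA, SubA, RCB, SubB assumed):

unsigned PreA, PreB;
if (const TargetRegisterClass *SuperRC =
        TRI->getCommonSuperRegClass(RCA, SubA, RCB, SubB, PreA, PreB)) {
  // SuperRC can hold both values; PreA/PreB re-address the original parts,
  // with 0 meaning "the whole register" per the contract above.
}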
//===--------------------------------------------------------------------===//
// Register Class Information
//
@@ -479,7 +529,8 @@ public:
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
/// values. If a target supports multiple different pointer register classes,
/// kind specifies which one is indicated.
- virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const {
+ virtual const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const {
llvm_unreachable("Target didn't implement getPointerRegClass!");
}
@@ -515,13 +566,16 @@ public:
return 0;
}
- /// Get the weight in units of pressure for this register class.
+ /// Get the weight in units of pressure for this register class.
virtual const RegClassWeight &getRegClassWeight(
const TargetRegisterClass *RC) const = 0;
/// Get the number of dimensions of register pressure.
virtual unsigned getNumRegPressureSets() const = 0;
+ /// Get the name of this register unit pressure set.
+ virtual const char *getRegPressureSetName(unsigned Idx) const = 0;
+
/// Get the register unit pressure limit for this dimension.
/// This limit must be adjusted dynamically for reserved registers.
virtual unsigned getRegPressureSetLimit(unsigned Idx) const = 0;
@@ -609,6 +663,12 @@ public:
return false;
}
+ /// trackLivenessAfterRegAlloc - returns true if the live-ins should be tracked
+ /// after register allocation.
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ return false;
+ }
+
/// needsStackRealignment - true if storage within the function requires the
/// stack pointer to be aligned more than the normal calling convention calls
/// for.
@@ -708,6 +768,62 @@ public:
};
+//===----------------------------------------------------------------------===//
+// SuperRegClassIterator
+//===----------------------------------------------------------------------===//
+//
+// Iterate over the possible super-registers for a given register class. The
+// iterator will visit a list of pairs (Idx, Mask) corresponding to the
+// possible classes of super-registers.
+//
+// Each bit mask will have at least one set bit, and each set bit in Mask
+// corresponds to a SuperRC such that:
+//
+// For all Reg in SuperRC: Reg:Idx is in RC.
+//
+// The iterator can include (0, RC->getSubClassMask()) as the first entry which
+// also satisfies the above requirement, assuming Reg:0 == Reg.
+//
+class SuperRegClassIterator {
+ const unsigned RCMaskWords;
+ unsigned SubReg;
+ const uint16_t *Idx;
+ const uint32_t *Mask;
+
+public:
+ /// Create a SuperRegClassIterator that visits all the super-register classes
+ /// of RC. When IncludeSelf is set, also include the (0, sub-classes) entry.
+ SuperRegClassIterator(const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ bool IncludeSelf = false)
+ : RCMaskWords((TRI->getNumRegClasses() + 31) / 32),
+ SubReg(0),
+ Idx(RC->getSuperRegIndices()),
+ Mask(RC->getSubClassMask()) {
+ if (!IncludeSelf)
+ ++*this;
+ }
+
+ /// Returns true if this iterator is still pointing at a valid entry.
+ bool isValid() const { return Idx; }
+
+ /// Returns the current sub-register index.
+ unsigned getSubReg() const { return SubReg; }
+
+ /// Returns the bit mask of register classes that getSubReg() projects into
+ /// RC.
+ const uint32_t *getMask() const { return Mask; }
+
+ /// Advance iterator to the next entry.
+ void operator++() {
+ assert(isValid() && "Cannot move iterator past end.");
+ Mask += RCMaskWords;
+ SubReg = *Idx++;
+ if (!SubReg)
+ Idx = 0;
+ }
+};
+
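Usage follows the usual LLVM iterator idiom (RC and TRI assumed); each step yields a sub-register index plus a bit mask of candidate classes:

for (SuperRegClassIterator SRI(RC, TRI); SRI.isValid(); ++SRI) {
  unsigned SubIdx = SRI.getSubReg();     // index projecting a super-class into RC
  const uint32_t *Mask = SRI.getMask();  // candidate super-register classes
  // ... test interesting register classes against Mask ...
}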
// This is useful when building IndexedMaps keyed on virtual registers
struct VirtReg2IndexFunctor : public std::unary_function<unsigned, unsigned> {
unsigned operator()(unsigned Reg) const {
@@ -742,6 +858,29 @@ static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) {
return OS;
}
+/// PrintRegUnit - Helper class for printing register units on a raw_ostream.
+///
+/// Register units are named after their root registers:
+///
+/// AL - Single root.
+/// FP0~ST7 - Dual roots.
+///
+/// Usage: OS << PrintRegUnit(Unit, TRI) << '\n';
+///
+class PrintRegUnit {
+ const TargetRegisterInfo *TRI;
+ unsigned Unit;
+public:
+ PrintRegUnit(unsigned unit, const TargetRegisterInfo *tri)
+ : TRI(tri), Unit(unit) {}
+ void print(raw_ostream&) const;
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS, const PrintRegUnit &PR) {
+ PR.print(OS);
+ return OS;
+}
+
} // End llvm namespace
#endif
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 97ea82a..4dc488d 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -1,10 +1,10 @@
//===- TargetSchedule.td - Target Independent Scheduling ---*- tablegen -*-===//
-//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file defines the target-independent scheduling interfaces which should
@@ -12,119 +12,30 @@
//
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Processor functional unit - These values represent the function units
-// available across all chip sets for the target. Eg., IntUnit, FPUnit, ...
-// These may be independent values for each chip set or may be shared across
-// all chip sets of the target. Each functional unit is treated as a resource
-// during scheduling and has an affect instruction order based on availability
-// during a time interval.
-//
-class FuncUnit;
-
-//===----------------------------------------------------------------------===//
-// Pipeline bypass / forwarding - These values specifies the symbolic names of
-// pipeline bypasses which can be used to forward results of instructions
-// that are forwarded to uses.
-class Bypass;
-def NoBypass : Bypass;
-
-class ReservationKind<bits<1> val> {
- int Value = val;
-}
-
-def Required : ReservationKind<0>;
-def Reserved : ReservationKind<1>;
+include "llvm/Target/TargetItinerary.td"
-//===----------------------------------------------------------------------===//
-// Instruction stage - These values represent a non-pipelined step in
-// the execution of an instruction. Cycles represents the number of
-// discrete time slots needed to complete the stage. Units represent
-// the choice of functional units that can be used to complete the
-// stage. Eg. IntUnit1, IntUnit2. NextCycles indicates how many
-// cycles should elapse from the start of this stage to the start of
-// the next stage in the itinerary. For example:
-//
-// A stage is specified in one of two ways:
-//
-// InstrStage<1, [FU_x, FU_y]> - TimeInc defaults to Cycles
-// InstrStage<1, [FU_x, FU_y], 0> - TimeInc explicit
+// The SchedMachineModel is defined by subtargets for three categories of data:
+// 1) Basic properties for coarse grained instruction cost model.
+// 2) Scheduler Read/Write resources for simple per-opcode cost model.
+// 3) Instruction itineraries for detailed reservation tables.
//
+// Default values for basic properties are defined in MCSchedModel. "-1"
+// indicates that the property is not overridden by the target description.
+class SchedMachineModel {
+ int IssueWidth = -1; // Max instructions that may be scheduled per cycle.
+ int MinLatency = -1; // Determines which instructions are allowed in a group.
+ // (-1) in-order, (0) out-of-order, (1) in-order with variable latencies.
+ int LoadLatency = -1; // Cycles for loads to access the cache.
+ int HighLatency = -1; // Approximation of cycles for "high latency" ops.
+ int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
-class InstrStage<int cycles, list<FuncUnit> units,
- int timeinc = -1,
- ReservationKind kind = Required> {
- int Cycles = cycles; // length of stage in machine cycles
- list<FuncUnit> Units = units; // choice of functional units
- int TimeInc = timeinc; // cycles till start of next stage
- int Kind = kind.Value; // kind of FU reservation
-}
+ ProcessorItineraries Itineraries = NoItineraries;
-//===----------------------------------------------------------------------===//
-// Instruction itinerary - An itinerary represents a sequential series of steps
-// required to complete an instruction. Itineraries are represented as lists of
-// instruction stages.
-//
-
-//===----------------------------------------------------------------------===//
-// Instruction itinerary classes - These values represent 'named' instruction
-// itinerary. Using named itineraries simplifies managing groups of
-// instructions across chip sets. An instruction uses the same itinerary class
-// across all chip sets. Thus a new chip set can be added without modifying
-// instruction information.
-//
-// NumMicroOps represents the number of micro-operations that each instruction
-// in the class are decoded to. If the number is zero, then it means the
-// instruction can decode into variable number of micro-ops and it must be
-// determined dynamically.
-//
-class InstrItinClass<int ops = 1> {
- int NumMicroOps = ops;
+ bit NoModel = 0; // Special tag to indicate missing machine model.
}
-def NoItinerary : InstrItinClass;
-//===----------------------------------------------------------------------===//
-// Instruction itinerary data - These values provide a runtime map of an
-// instruction itinerary class (name) to its itinerary data.
-//
-// OperandCycles are optional "cycle counts". They specify the cycle after
-// instruction issue the values which correspond to specific operand indices
-// are defined or read. Bypasses are optional "pipeline forwarding pathes", if
-// a def by an instruction is available on a specific bypass and the use can
-// read from the same bypass, then the operand use latency is reduced by one.
-//
-// InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Pipe1]>,
-// InstrStage<1, [A9_AGU]>],
-// [3, 1], [A9_LdBypass]>,
-// InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
-// [1, 1], [NoBypass, A9_LdBypass]>,
-//
-// In this example, the instruction of IIC_iLoadi reads its input on cycle 1
-// (after issue) and the result of the load is available on cycle 3. The result
-// is available via forwarding path A9_LdBypass. If it's used by the first
-// source operand of instructions of IIC_iMVNr class, then the operand latency
-// is reduced by 1.
-class InstrItinData<InstrItinClass Class, list<InstrStage> stages,
- list<int> operandcycles = [],
- list<Bypass> bypasses = []> {
- InstrItinClass TheClass = Class;
- list<InstrStage> Stages = stages;
- list<int> OperandCycles = operandcycles;
- list<Bypass> Bypasses = bypasses;
-}
-
-//===----------------------------------------------------------------------===//
-// Processor itineraries - These values represent the set of all itinerary
-// classes for a given chip set.
-//
-class ProcessorItineraries<list<FuncUnit> fu, list<Bypass> bp,
- list<InstrItinData> iid> {
- list<FuncUnit> FU = fu;
- list<Bypass> BP = bp;
- list<InstrItinData> IID = iid;
+def NoSchedModel : SchedMachineModel {
+ let NoModel = 1;
}
-// NoItineraries - A marker that can be used by processors without schedule
-// info.
-def NoItineraries : ProcessorItineraries<[], [], []>;
-
+// TODO: Define classes for processor and scheduler resources.
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index f55cf0e..3f81c06 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -404,11 +404,16 @@ def brind : SDNode<"ISD::BRIND" , SDTBrind, [SDNPHasChain]>;
def br : SDNode<"ISD::BR" , SDTBr, [SDNPHasChain]>;
def trap : SDNode<"ISD::TRAP" , SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
+def debugtrap : SDNode<"ISD::DEBUGTRAP" , SDTNone,
+ [SDNPHasChain, SDNPSideEffect]>;
def prefetch : SDNode<"ISD::PREFETCH" , SDTPrefetch,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
SDNPMemOperand]>;
+def readcyclecounter : SDNode<"ISD::READCYCLECOUNTER", SDTIntLeaf,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def membarrier : SDNode<"ISD::MEMBARRIER" , SDTMemBarrier,
[SDNPHasChain, SDNPSideEffect]>;
@@ -593,6 +598,13 @@ def not : PatFrag<(ops node:$in), (xor node:$in, -1)>;
def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>;
def ineg : PatFrag<(ops node:$in), (sub 0, node:$in)>;
+// null_frag - The null pattern operator is used in multiclass instantiations
+// which accept an SDPatternOperator for use in matching patterns for internal
+// definitions. When expanding a pattern, if the null fragment is referenced
+// in the expansion, the pattern is discarded and it is as-if '[]' had been
+// specified. This allows multiclasses to have the isel patterns be optional.
+def null_frag : SDPatternOperator;
+
// load fragments.
def unindexedload : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index bbf3a69..4b0c448 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -38,6 +38,13 @@ ModulePass *createAddressSanitizerPass();
// Insert ThreadSanitizer (race detection) instrumentation
FunctionPass *createThreadSanitizerPass();
+
+// BoundsChecking - This pass instruments the code to perform run-time bounds
+// checking on loads, stores, and other memory intrinsics.
+// Penalty is the maximum run-time penalty that is acceptable to the user.
+//
+FunctionPass *createBoundsCheckingPass(unsigned Penalty = 5);
+
} // End llvm namespace
#endif
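Clients schedule it like any other function pass; a minimal sketch:

#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/Instrumentation.h"
using namespace llvm;

void addBoundsChecks(Module &M, Function &F) {
  FunctionPassManager FPM(&M);
  FPM.add(createBoundsCheckingPass()); // default Penalty of 5
  FPM.doInitialization();
  FPM.run(F);
}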
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 7f055d4..3dce6fe 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -74,7 +74,10 @@ FunctionPass *createAggressiveDCEPass();
// if possible.
//
FunctionPass *createScalarReplAggregatesPass(signed Threshold = -1,
- bool UseDomTree = true);
+ bool UseDomTree = true,
+ signed StructMemberThreshold = -1,
+ signed ArrayElementThreshold = -1,
+ signed ScalarLoadThreshold = -1);
//===----------------------------------------------------------------------===//
//
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 2f9dc54..8a939cc 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -202,10 +202,6 @@ void SplitLandingPadPredecessors(BasicBlock *OrigBB,ArrayRef<BasicBlock*> Preds,
ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
BasicBlock *Pred);
-/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a
-/// given basic block.
-DebugLoc GetFirstDebugLocInBasicBlock(const BasicBlock *BB);
-
} // End llvm namespace
#endif
diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index 17cd58eb..a6e41f0 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -15,7 +15,7 @@
#ifndef TRANSFORMS_UTILS_BUILDLIBCALLS_H
#define TRANSFORMS_UTILS_BUILDLIBCALLS_H
-#include "llvm/Support/IRBuilder.h"
+#include "llvm/IRBuilder.h"
namespace llvm {
class Value;
@@ -28,41 +28,52 @@ namespace llvm {
/// EmitStrLen - Emit a call to the strlen function to the builder, for the
/// specified pointer. Ptr is required to be some pointer type, and the
/// return value has 'intptr_t' type.
- Value *EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD);
+ Value *EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI);
+
+ /// EmitStrNLen - Emit a call to the strnlen function to the builder, for the
+ /// specified pointer. Ptr is required to be some pointer type, MaxLen must
+ /// be of size_t type, and the return value has 'intptr_t' type.
+ Value *EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
+ const TargetData *TD, const TargetLibraryInfo *TLI);
/// EmitStrChr - Emit a call to the strchr function to the builder, for the
/// specified pointer and character. Ptr is required to be some pointer type,
/// and the return value has 'i8*' type.
- Value *EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, const TargetData *TD);
+ Value *EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI);
/// EmitStrNCmp - Emit a call to the strncmp function to the builder.
Value *EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
- const TargetData *TD);
+ const TargetData *TD, const TargetLibraryInfo *TLI);
/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
/// specified pointer arguments.
Value *EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
- const TargetData *TD, StringRef Name = "strcpy");
+ const TargetData *TD, const TargetLibraryInfo *TLI,
+ StringRef Name = "strcpy");
/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
/// specified pointer arguments and length.
Value *EmitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
- const TargetData *TD, StringRef Name = "strncpy");
+ const TargetData *TD, const TargetLibraryInfo *TLI,
+ StringRef Name = "strncpy");
/// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder.
/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
/// are pointers.
Value *EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilder<> &B, const TargetData *TD);
+ IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI);
/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
Value *EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
- const TargetData *TD);
+ const TargetData *TD, const TargetLibraryInfo *TLI);
/// EmitMemCmp - Emit a call to the memcmp function.
Value *EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
- const TargetData *TD);
+ const TargetData *TD, const TargetLibraryInfo *TLI);
/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name'
/// (e.g. 'floor'). This function is known to take a single argument of type
@@ -74,26 +85,28 @@ namespace llvm {
/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
/// is an integer.
- Value *EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD);
+ Value *EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI);
/// EmitPutS - Emit a call to the puts function. This assumes that Str is
/// some pointer.
- void EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD);
+ Value *EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI);
/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
/// an i32, and File is a pointer to FILE.
- void EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
- const TargetData *TD);
+ Value *EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
+ const TargetData *TD, const TargetLibraryInfo *TLI);
/// EmitFPutS - Emit a call to the puts function. Str is required to be a
/// pointer and File is a pointer to FILE.
- void EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, const TargetData *TD,
- const TargetLibraryInfo *TLI);
+ Value *EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI);
/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
- void EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI);
+ Value *EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
+ const TargetData *TD, const TargetLibraryInfo *TLI);
/// SimplifyFortifiedLibCalls - Helper class for folding checked library
/// calls (e.g. __strcpy_chk) into their unchecked counterparts.
@@ -105,7 +118,7 @@ namespace llvm {
bool isString) const = 0;
public:
virtual ~SimplifyFortifiedLibCalls();
- bool fold(CallInst *CI, const TargetData *TD);
+ bool fold(CallInst *CI, const TargetData *TD, const TargetLibraryInfo *TLI);
};
}
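Call sites now thread TargetLibraryInfo through and should tolerate failure; a sketch (Ptr, B, TD, TLI assumed), on the assumption that the emitters return null when TargetLibraryInfo reports the routine as unavailable:

Value *Len = EmitStrLen(Ptr, B, TD, TLI);
if (!Len)
  return 0;  // no usable strlen on this target; give up on the fold
// ... otherwise use Len ...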
diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h
new file mode 100644
index 0000000..1122678
--- /dev/null
+++ b/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -0,0 +1,127 @@
+//===-- Transform/Utils/CodeExtractor.h - Code extraction util --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A utility to support extracting code from one function into its own
+// stand-alone function.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_CODE_EXTRACTOR_H
+#define LLVM_TRANSFORMS_UTILS_CODE_EXTRACTOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SetVector.h"
+
+namespace llvm {
+ class BasicBlock;
+ class DominatorTree;
+ class Function;
+ class Loop;
+ class Module;
+ class RegionNode;
+ class Type;
+ class Value;
+
+ /// \brief Utility class for extracting code into a new function.
+ ///
+ /// This utility provides a simple interface for extracting some sequence of
+ /// code into its own function, replacing it with a call to that function. It
+ /// also provides various methods to query about the nature and result of
+ /// such a transformation.
+ ///
+ /// The rough algorithm used is:
+ /// 1) Find both the inputs and outputs for the extracted region.
+ /// 2) Pass the inputs as arguments, remapping them within the extracted
+ /// function to arguments.
+ /// 3) Add allocas for any scalar outputs, adding all of the outputs' allocas
+ /// as arguments, and inserting stores to the arguments for any scalars.
+ class CodeExtractor {
+ typedef SetVector<Value *> ValueSet;
+
+ // Various bits of state computed on construction.
+ DominatorTree *const DT;
+ const bool AggregateArgs;
+
+ // Bits of intermediate state computed at various phases of extraction.
+ SetVector<BasicBlock *> Blocks;
+ unsigned NumExitBlocks;
+ Type *RetTy;
+
+ public:
+ /// \brief Create a code extractor for a single basic block.
+ ///
+ /// In this formation, we don't require a dominator tree. The given basic
+ /// block is set up for extraction.
+ CodeExtractor(BasicBlock *BB, bool AggregateArgs = false);
+
+ /// \brief Create a code extractor for a sequence of blocks.
+ ///
+ /// Given a sequence of basic blocks where the first block in the sequence
+ /// dominates the rest, prepare a code extractor object for pulling this
+ /// sequence out into its new function. When a DominatorTree is also given,
+ /// extra checking and transformations are enabled.
+ CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT = 0,
+ bool AggregateArgs = false);
+
+ /// \brief Create a code extractor for a loop body.
+ ///
+ /// Behaves just like the generic code sequence constructor, but uses the
+ /// block sequence of the loop.
+ CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs = false);
+
+ /// \brief Create a code extractor for a region node.
+ ///
+ /// Behaves just like the generic code sequence constructor, but uses the
+ /// block sequence of the region node passed in.
+ CodeExtractor(DominatorTree &DT, const RegionNode &RN,
+ bool AggregateArgs = false);
+
+ /// \brief Perform the extraction, returning the new function.
+ ///
+ /// Returns zero when called on a CodeExtractor instance where isEligible
+ /// returns false.
+ Function *extractCodeRegion();
+
+ /// \brief Test whether this code extractor is eligible.
+ ///
+ /// Based on the blocks used when constructing the code extractor,
+ /// determine whether it is eligible for extraction.
+ bool isEligible() const { return !Blocks.empty(); }
+
+ /// \brief Compute the set of input values and output values for the code.
+ ///
+ /// These can be used either when performing the extraction or to evaluate
+ /// the expected size of a call to the extracted function. Note that this
+ /// work cannot be cached between the two as once we decide to extract
+ /// a code sequence, that sequence is modified, including changing these
+ /// sets, before extraction occurs. These modifications won't have any
+ /// significant impact on the cost however.
+ void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs) const;
+
+ private:
+ void severSplitPHINodes(BasicBlock *&Header);
+ void splitReturnBlocks();
+
+ Function *constructFunction(const ValueSet &inputs,
+ const ValueSet &outputs,
+ BasicBlock *header,
+ BasicBlock *newRootNode, BasicBlock *newHeader,
+ Function *oldFunction, Module *M);
+
+ void moveCodeToFunction(Function *newFunction);
+
+ void emitCallAndSwitchStatement(Function *newFunction,
+ BasicBlock *newHeader,
+ ValueSet &inputs,
+ ValueSet &outputs);
+
+ };
+}
+
+#endif
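A minimal sketch of the new interface standing in for the old ExtractLoop free function (DT and L assumed; FunctionUtils.h is removed just below):

#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;

static Function *outlineLoop(DominatorTree &DT, Loop &L) {
  CodeExtractor Extractor(DT, L, /*AggregateArgs=*/false);
  if (!Extractor.isEligible())
    return 0;
  return Extractor.extractCodeRegion(); // the outlined function, or null
}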
diff --git a/include/llvm/Transforms/Utils/FunctionUtils.h b/include/llvm/Transforms/Utils/FunctionUtils.h
deleted file mode 100644
index 8d71e43..0000000
--- a/include/llvm/Transforms/Utils/FunctionUtils.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===-- Transform/Utils/FunctionUtils.h - Function Utils --------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of transformations manipulate LLVM functions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_UTILS_FUNCTION_H
-#define LLVM_TRANSFORMS_UTILS_FUNCTION_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include <vector>
-
-namespace llvm {
- class BasicBlock;
- class DominatorTree;
- class Function;
- class Loop;
-
- /// ExtractCodeRegion - Rip out a sequence of basic blocks into a new
- /// function.
- ///
- Function* ExtractCodeRegion(DominatorTree& DT,
- ArrayRef<BasicBlock*> code,
- bool AggregateArgs = false);
-
- /// ExtractLoop - Rip out a natural loop into a new function.
- ///
- Function* ExtractLoop(DominatorTree& DT, Loop *L,
- bool AggregateArgs = false);
-
- /// ExtractBasicBlock - Rip out a basic block (and the associated landing pad)
- /// into a new function.
- ///
- Function* ExtractBasicBlock(ArrayRef<BasicBlock*> BBs,
- bool AggregateArgs = false);
-}
-
-#endif
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 7f99dbc..495eab7 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -15,6 +15,11 @@
#ifndef LLVM_TRANSFORMS_UTILS_LOCAL_H
#define LLVM_TRANSFORMS_UTILS_LOCAL_H
+#include "llvm/IRBuilder.h"
+#include "llvm/Operator.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetData.h"
+
namespace llvm {
class User;
@@ -160,6 +165,65 @@ static inline unsigned getKnownAlignment(Value *V, const TargetData *TD = 0) {
return getOrEnforceKnownAlignment(V, 0, TD);
}
+/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
+/// code necessary to compute the offset from the base pointer (without adding
+/// in the base pointer). Return the result as a signed integer of intptr size.
+/// When NoAssumptions is true, no assumptions about index computation not
+/// overflowing are made.
+template<typename IRBuilderTy>
+Value *EmitGEPOffset(IRBuilderTy *Builder, const TargetData &TD, User *GEP,
+ bool NoAssumptions = false) {
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ Type *IntPtrTy = TD.getIntPtrType(GEP->getContext());
+ Value *Result = Constant::getNullValue(IntPtrTy);
+
+ // If the GEP is inbounds, we know that none of the addressing operations will
+ // overflow in an unsigned sense.
+ bool isInBounds = cast<GEPOperator>(GEP)->isInBounds() && !NoAssumptions;
+
+ // Build a mask for high order bits.
+ unsigned IntPtrWidth = TD.getPointerSizeInBits();
+ uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
+ ++i, ++GTI) {
+ Value *Op = *i;
+ uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
+ if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
+ if (OpC->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+
+ if (Size)
+ Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size),
+ GEP->getName()+".offs");
+ continue;
+ }
+
+ Constant *Scale = ConstantInt::get(IntPtrTy, Size);
+ Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
+ Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/);
+ // Emit an add instruction.
+ Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs");
+ continue;
+ }
+ // Convert to correct type.
+ if (Op->getType() != IntPtrTy)
+ Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c");
+ if (Size != 1) {
+ // We'll let instcombine(mul) convert this to a shl if possible.
+ Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size),
+ GEP->getName()+".idx", isInBounds /*NUW*/);
+ }
+
+ // Emit an add instruction.
+ Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs");
+ }
+ return Result;
+}
+
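A sketch of a typical call site (Builder, TD, and GEP assumed), e.g. instcombine turning a pointer difference into integer arithmetic:

Value *Offset = EmitGEPOffset(&Builder, TD, GEP);
// Offset is an intptr-sized integer: the byte distance from the GEP's base
// pointer, with the base itself not added in.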
///===---------------------------------------------------------------------===//
/// Dbg Intrinsic utilities
///
diff --git a/include/llvm/Transforms/Utils/PromoteMemToReg.h b/include/llvm/Transforms/Utils/PromoteMemToReg.h
index 98d51a2..0bb6ec6 100644
--- a/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -21,7 +21,6 @@ namespace llvm {
class AllocaInst;
class DominatorTree;
-class DominanceFrontier;
class AliasSetTracker;
/// isAllocaPromotable - Return true if this alloca is legal for promotion.
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index 652916c..1e49a9c 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -28,6 +28,9 @@ struct VectorizeConfig {
/// @brief The size of the native vector registers.
unsigned VectorBits;
+ /// @brief Vectorize boolean values.
+ bool VectorizeBools;
+
/// @brief Vectorize integer values.
bool VectorizeInts;
@@ -49,6 +52,9 @@ struct VectorizeConfig {
/// @brief Vectorize select instructions.
bool VectorizeSelect;
+ /// @brief Vectorize comparison instructions.
+ bool VectorizeCmp;
+
/// @brief Vectorize getelementptr instructions.
bool VectorizeGEP;
@@ -80,6 +86,9 @@ struct VectorizeConfig {
/// @brief The maximum number of pairing iterations.
unsigned MaxIter;
+ /// @brief Don't try to form odd-length vectors.
+ bool Pow2LenOnly;
+
/// @brief Don't boost the chain-depth contribution of loads and stores.
bool NoMemOpBoost;
diff --git a/include/llvm/Support/TypeBuilder.h b/include/llvm/TypeBuilder.h
index c756069..0b56479 100644
--- a/include/llvm/Support/TypeBuilder.h
+++ b/include/llvm/TypeBuilder.h
@@ -1,4 +1,4 @@
-//===---- llvm/Support/TypeBuilder.h - Builder for LLVM types ---*- C++ -*-===//
+//===---- llvm/TypeBuilder.h - Builder for LLVM types -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_TYPEBUILDER_H
-#define LLVM_SUPPORT_TYPEBUILDER_H
+#ifndef LLVM_TYPEBUILDER_H
+#define LLVM_TYPEBUILDER_H
#include "llvm/DerivedTypes.h"
#include "llvm/LLVMContext.h"
diff --git a/include/llvm/TypeFinder.h b/include/llvm/TypeFinder.h
new file mode 100644
index 0000000..5d80705
--- /dev/null
+++ b/include/llvm/TypeFinder.h
@@ -0,0 +1,78 @@
+//===-- llvm/TypeFinder.h - Class for finding used struct types -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the TypeFinder class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TYPEFINDER_H
+#define LLVM_TYPEFINDER_H
+
+#include "llvm/ADT/DenseSet.h"
+#include <vector>
+
+namespace llvm {
+
+class MDNode;
+class Module;
+class StructType;
+class Type;
+class Value;
+
+/// TypeFinder - Walk over a module, identifying all of the types that are
+/// used by the module.
+class TypeFinder {
+ // To avoid walking constant expressions and other IR objects multiple
+ // times, we keep several helper maps.
+ DenseSet<const Value*> VisitedConstants;
+ DenseSet<Type*> VisitedTypes;
+
+ std::vector<StructType*> StructTypes;
+ bool OnlyNamed;
+
+public:
+ TypeFinder() : OnlyNamed(false) {}
+
+ void run(const Module &M, bool onlyNamed);
+ void clear();
+
+ typedef std::vector<StructType*>::iterator iterator;
+ typedef std::vector<StructType*>::const_iterator const_iterator;
+
+ iterator begin() { return StructTypes.begin(); }
+ iterator end() { return StructTypes.end(); }
+
+ const_iterator begin() const { return StructTypes.begin(); }
+ const_iterator end() const { return StructTypes.end(); }
+
+ bool empty() const { return StructTypes.empty(); }
+ size_t size() const { return StructTypes.size(); }
+ iterator erase(iterator I, iterator E) { return StructTypes.erase(I, E); }
+
+ StructType *&operator[](unsigned Idx) { return StructTypes[Idx]; }
+
+private:
+ /// incorporateType - This method adds the type to the list of used
+ /// structures if it's not in there already.
+ void incorporateType(Type *Ty);
+
+ /// incorporateValue - This method is used to walk operand lists finding types
+ /// hiding in constant expressions and other operands that won't be walked in
+ /// other ways. GlobalValues, basic blocks, instructions, and inst operands
+ /// are all explicitly enumerated.
+ void incorporateValue(const Value *V);
+
+ /// incorporateMDNode - This method is used to walk the operands of an MDNode
+ /// to find types hiding within.
+ void incorporateMDNode(const MDNode *V);
+};
+
+} // end llvm namespace
+
+#endif
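Usage sketch (M assumed; includes elided): collect every named struct type reachable from a module and print the names.

TypeFinder Finder;
Finder.run(M, /*onlyNamed=*/true);
for (TypeFinder::iterator I = Finder.begin(), E = Finder.end(); I != E; ++I)
  errs() << (*I)->getName() << '\n';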
diff --git a/include/llvm/User.h b/include/llvm/User.h
index c52f32f..5d5460c 100644
--- a/include/llvm/User.h
+++ b/include/llvm/User.h
@@ -7,11 +7,11 @@
//
//===----------------------------------------------------------------------===//
//
-// This class defines the interface that one who 'use's a Value must implement.
+// This class defines the interface that one who uses a Value must implement.
// Each instance of the Value class keeps track of what User's have handles
// to it.
//
-// * Instructions are the largest class of User's.
+// * Instructions are the largest class of Users.
// * Constants may be users of other constants (think arrays and stuff)
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 95c834b..3b6aab1 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -25,6 +25,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Pass.h"
#include "llvm/BasicBlock.h"
#include "llvm/Function.h"
@@ -356,6 +359,86 @@ AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, const Location &Loc) {
return ModRef;
}
+namespace {
+ /// Only find pointer captures which happen before the given instruction. Uses
+ /// the dominator tree to determine whether one instruction is before another.
+ struct CapturesBefore : public CaptureTracker {
+ CapturesBefore(const Instruction *I, DominatorTree *DT)
+ : BeforeHere(I), DT(DT), Captured(false) {}
+
+ void tooManyUses() { Captured = true; }
+
+ bool shouldExplore(Use *U) {
+ Instruction *I = cast<Instruction>(U->getUser());
+ BasicBlock *BB = I->getParent();
+ if (BeforeHere != I &&
+ (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I)))
+ return false;
+ return true;
+ }
+
+ bool captured(Use *U) {
+ Instruction *I = cast<Instruction>(U->getUser());
+ BasicBlock *BB = I->getParent();
+ if (BeforeHere != I &&
+ (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I)))
+ return false;
+ Captured = true;
+ return true;
+ }
+
+ const Instruction *BeforeHere;
+ DominatorTree *DT;
+
+ bool Captured;
+ };
+}
+
+// FIXME: this is really just shoring-up a deficiency in alias analysis.
+// BasicAA isn't willing to spend linear time determining whether an alloca
+// was captured before or after this particular call, while we are. However,
+// with a smarter AA in place, this test is just wasting compile time.
+AliasAnalysis::ModRefResult
+AliasAnalysis::callCapturesBefore(const Instruction *I,
+ const AliasAnalysis::Location &MemLoc,
+ DominatorTree *DT) {
+ if (!DT || !TD) return AliasAnalysis::ModRef;
+
+ const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD);
+ if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object) ||
+ isa<Constant>(Object))
+ return AliasAnalysis::ModRef;
+
+ ImmutableCallSite CS(I);
+ if (!CS.getInstruction() || CS.getInstruction() == Object)
+ return AliasAnalysis::ModRef;
+
+ CapturesBefore CB(I, DT);
+ llvm::PointerMayBeCaptured(Object, &CB);
+ if (CB.Captured)
+ return AliasAnalysis::ModRef;
+
+ unsigned ArgNo = 0;
+ for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+ CI != CE; ++CI, ++ArgNo) {
+ // Only look at the no-capture or byval pointer arguments. If this
+ // pointer were passed to arguments that were neither of these, then it
+ // couldn't be no-capture.
+ if (!(*CI)->getType()->isPointerTy() ||
+ (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
+ continue;
+
+ // If this is a no-capture pointer argument, see if we can tell that it
+ // is impossible to alias the pointer we're checking. If not, we have to
+ // assume that the call could touch the pointer, even though it doesn't
+ // escape.
+ if (!isNoAlias(AliasAnalysis::Location(*CI),
+ AliasAnalysis::Location(Object))) {
+ return AliasAnalysis::ModRef;
+ }
+ }
+ return AliasAnalysis::NoModRef;
+}
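A sketch of the intended client (AA, Inst, Loc, and DT assumed), e.g. dead-store elimination deciding whether a call can touch a stack object:

if (AA.callCapturesBefore(Inst, Loc, DT) == AliasAnalysis::NoModRef) {
  // The call can neither read nor write the location: the object was not
  // captured before the call and no pointer argument aliases it.
}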
// AliasAnalysis destructor: DO NOT move this to the header file for
// AliasAnalysis or else clients of the AliasAnalysis class may not depend on
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index f80e2fb..92e8906 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -501,7 +501,7 @@ void AliasSetTracker::deleteValue(Value *PtrVal) {
}
// First, look up the PointerRec for this pointer.
- PointerMapType::iterator I = PointerMap.find(PtrVal);
+ PointerMapType::iterator I = PointerMap.find_as(PtrVal);
if (I == PointerMap.end()) return; // Noop
// If we found one, remove the pointer from the alias set it is in.
@@ -527,7 +527,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) {
AA.copyValue(From, To);
// First, look up the PointerRec for this pointer.
- PointerMapType::iterator I = PointerMap.find(From);
+ PointerMapType::iterator I = PointerMap.find_as(From);
if (I == PointerMap.end())
return; // Noop
assert(I->second->hasAliasSet() && "Dead entry?");
@@ -536,7 +536,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) {
if (Entry.hasAliasSet()) return; // Already in the tracker!
// Add it to the alias set it aliases...
- I = PointerMap.find(From);
+ I = PointerMap.find_as(From);
AliasSet *AS = I->second->getAliasSet(*this);
AS->addPointer(*this, Entry, I->second->getSize(),
I->second->getTBAAInfo(),
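The switch from find() to find_as() matters because PointerMap is keyed by value handles: a plain find(PtrVal) would have to construct a temporary handle (registering and deregistering it on the Value) just to probe the table, whereas find_as() hashes and compares the raw pointer directly. A self-contained toy showing the same alternate-key pattern with C-string keys (the tracker's real key type is its callback value handle, not shown here):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/StringRef.h"
    #include <cstring>
    using namespace llvm;

    // Stored keys are NUL-terminated strings; lookups may use a StringRef
    // without first materializing a terminated copy.
    struct CStrInfo {
      static inline const char *getEmptyKey()     { return (const char *)-1; }
      static inline const char *getTombstoneKey() { return (const char *)-2; }
      static unsigned getHashValue(const char *K) {
        return getHashValue(StringRef(K));
      }
      static unsigned getHashValue(StringRef K) {     // alternate-key hash
        unsigned H = 5381;
        for (size_t i = 0, e = K.size(); i != e; ++i)
          H = H * 33 + (unsigned char)K[i];
        return H;
      }
      static bool isEqual(const char *A, const char *B) {
        if (A == B) return true;
        if (A == getEmptyKey() || A == getTombstoneKey() ||
            B == getEmptyKey() || B == getTombstoneKey()) return false;
        return std::strcmp(A, B) == 0;
      }
      static bool isEqual(StringRef A, const char *B) { // alternate compare
        if (B == getEmptyKey() || B == getTombstoneKey()) return false;
        return A.equals(B);
      }
    };

    static int lookup(DenseMap<const char *, int, CStrInfo> &M,
                      StringRef Name) {
      DenseMap<const char *, int, CStrInfo>::iterator I = M.find_as(Name);
      return I == M.end() ? 0 : I->second;
    }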
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 20ecfd2..1d028c2 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -86,47 +86,10 @@ static bool isEscapeSource(const Value *V) {
/// UnknownSize if unknown.
static uint64_t getObjectSize(const Value *V, const TargetData &TD,
bool RoundToAlign = false) {
- Type *AccessTy;
- unsigned Align;
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
- if (!GV->hasDefinitiveInitializer())
- return AliasAnalysis::UnknownSize;
- AccessTy = GV->getType()->getElementType();
- Align = GV->getAlignment();
- } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
- if (!AI->isArrayAllocation())
- AccessTy = AI->getType()->getElementType();
- else
- return AliasAnalysis::UnknownSize;
- Align = AI->getAlignment();
- } else if (const CallInst* CI = extractMallocCall(V)) {
- if (!RoundToAlign && !isArrayMalloc(V, &TD))
- // The size is the argument to the malloc call.
- if (const ConstantInt* C = dyn_cast<ConstantInt>(CI->getArgOperand(0)))
- return C->getZExtValue();
- return AliasAnalysis::UnknownSize;
- } else if (const Argument *A = dyn_cast<Argument>(V)) {
- if (A->hasByValAttr()) {
- AccessTy = cast<PointerType>(A->getType())->getElementType();
- Align = A->getParamAlignment();
- } else {
- return AliasAnalysis::UnknownSize;
- }
- } else {
- return AliasAnalysis::UnknownSize;
- }
-
- if (!AccessTy->isSized())
- return AliasAnalysis::UnknownSize;
-
- uint64_t Size = TD.getTypeAllocSize(AccessTy);
- // If there is an explicitly specified alignment, and we need to
- // take alignment into account, round up the size. (If the alignment
- // is implicit, getTypeAllocSize is sufficient.)
- if (RoundToAlign && Align)
- Size = RoundUpToAlignment(Size, Align);
-
- return Size;
+ uint64_t Size;
+ if (getObjectSize(V, Size, &TD, RoundToAlign))
+ return Size;
+ return AliasAnalysis::UnknownSize;
}
/// isObjectSmallerThan - Return true if we can prove that the object specified
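BasicAA's private helper is now a thin wrapper over the llvm::getObjectSize introduced in MemoryBuiltins further down in this patch. A hedged sketch of the shared entry point as any other client would call it:

    #include "llvm/Analysis/MemoryBuiltins.h"
    #include "llvm/Target/TargetData.h"
    using namespace llvm;

    // True if the object behind Ptr is provably smaller than Threshold
    // bytes; mirrors how the wrapper above consumes the shared helper.
    static bool objectSmallerThan(const Value *Ptr, uint64_t Threshold,
                                  const TargetData *TD) {
      uint64_t Size;
      if (!getObjectSize(Ptr, Size, TD, /*RoundToAlign=*/true))
        return false;                       // unknown size: claim nothing
      return Size < Threshold;
    }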
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 2e3ec8b..96e68b4 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -12,9 +12,7 @@ add_llvm_library(LLVMAnalysis
CaptureTracking.cpp
CodeMetrics.cpp
ConstantFolding.cpp
- DIBuilder.cpp
DbgInfoPrinter.cpp
- DebugInfo.cpp
DomPrinter.cpp
DominanceFrontier.cpp
IVUsers.cpp
@@ -59,4 +57,6 @@ add_llvm_library(LLVMAnalysis
ValueTracking.cpp
)
+add_dependencies(LLVMAnalysis intrinsics_gen)
+
add_subdirectory(IPA)
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index dd33eeb..974b906 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -34,7 +34,7 @@ namespace {
bool captured(Use *U) {
if (isa<ReturnInst>(U->getUser()) && !ReturnCaptures)
- return false;
+ return false;
Captured = true;
return true;
diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp
index 316e7bc9..acda34b 100644
--- a/lib/Analysis/CodeMetrics.cpp
+++ b/lib/Analysis/CodeMetrics.cpp
@@ -22,7 +22,11 @@ using namespace llvm;
/// callIsSmall - If a call is likely to lower to a single target instruction,
/// or is otherwise deemed small, return true.
/// TODO: Perhaps calls like memcpy, strcpy, etc?
-bool llvm::callIsSmall(const Function *F) {
+bool llvm::callIsSmall(ImmutableCallSite CS) {
+ if (isa<IntrinsicInst>(CS.getInstruction()))
+ return true;
+
+ const Function *F = CS.getCalledFunction();
if (!F) return false;
if (F->hasLocalLinkage()) return false;
@@ -79,8 +83,24 @@ bool llvm::isInstructionFree(const Instruction *I, const TargetData *TD) {
if (const CastInst *CI = dyn_cast<CastInst>(I)) {
// Noop casts, including ptr <-> int, don't count.
- if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) || isa<PtrToIntInst>(CI))
+ if (CI->isLosslessCast())
+ return true;
+
+ Value *Op = CI->getOperand(0);
+ // An inttoptr cast is free so long as the input is a legal integer type
+ // which doesn't contain values outside the range of a pointer.
+ if (isa<IntToPtrInst>(CI) && TD &&
+ TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) &&
+ Op->getType()->getScalarSizeInBits() <= TD->getPointerSizeInBits())
return true;
+
+ // A ptrtoint cast is free so long as the result is large enough to store
+ // the pointer, and a legal integer type.
+ if (isa<PtrToIntInst>(CI) && TD &&
+ TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) &&
+ Op->getType()->getScalarSizeInBits() >= TD->getPointerSizeInBits())
+ return true;
+
// trunc to a native type is free (assuming the target has compare and
// shift-right of the same width).
if (TD && isa<TruncInst>(CI) &&
@@ -126,7 +146,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
isRecursive = true;
}
- if (!isa<IntrinsicInst>(II) && !callIsSmall(CS.getCalledFunction())) {
+ if (!callIsSmall(CS)) {
// Each argument to a call takes on average one instruction to set up.
NumInsts += CS.arg_size();
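The two new cast rules hinge on comparing the integer width against the target pointer width. A worked example under a typical 64-bit TargetData (64-bit pointers, legal i32/i64): ptrtoint to i64 is free, ptrtoint to i32 is not (it truncates), inttoptr from i32 is free, inttoptr from i128 is not (illegal type). The condition, distilled:

    #include "llvm/Target/TargetData.h"
    using namespace llvm;

    // IntBits is the width of the cast's integer side; IsPtrToInt selects
    // which direction is being tested.
    static bool castIsFree(bool IsPtrToInt, unsigned IntBits,
                           const TargetData *TD) {
      if (!TD || !TD->isLegalInteger(IntBits))
        return false;                     // unknown target or illegal type
      unsigned PtrBits = TD->getPointerSizeInBits();
      // ptrtoint must not lose bits; inttoptr must not exceed them.
      return IsPtrToInt ? IntBits >= PtrBits : IntBits <= PtrBits;
    }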
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 783c32e..f5e619c 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -358,17 +358,20 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
NumElts = AT->getNumElements();
else
NumElts = cast<VectorType>(C->getType())->getNumElements();
-
+
for (; Index != NumElts; ++Index) {
if (!ReadDataFromGlobal(C->getAggregateElement(Index), Offset, CurPtr,
BytesLeft, TD))
return false;
- if (EltSize >= BytesLeft)
+
+ uint64_t BytesWritten = EltSize - Offset;
+ assert(BytesWritten <= EltSize && "Not indexing into this element?");
+ if (BytesWritten >= BytesLeft)
return true;
-
+
Offset = 0;
- BytesLeft -= EltSize;
- CurPtr += EltSize;
+ BytesLeft -= BytesWritten;
+ CurPtr += BytesWritten;
}
return true;
}
@@ -600,6 +603,22 @@ static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
return C;
}
+/// Strip the pointer casts, but preserve the address space information.
+static Constant* StripPtrCastKeepAS(Constant* Ptr) {
+ assert(Ptr->getType()->isPointerTy() && "Not a pointer type");
+ PointerType *OldPtrTy = cast<PointerType>(Ptr->getType());
+ Ptr = cast<Constant>(Ptr->stripPointerCasts());
+ PointerType *NewPtrTy = cast<PointerType>(Ptr->getType());
+
+ // Preserve the address space number of the pointer.
+ if (NewPtrTy->getAddressSpace() != OldPtrTy->getAddressSpace()) {
+ NewPtrTy = NewPtrTy->getElementType()->getPointerTo(
+ OldPtrTy->getAddressSpace());
+ Ptr = ConstantExpr::getBitCast(Ptr, NewPtrTy);
+ }
+ return Ptr;
+}
+
/// SymbolicallyEvaluateGEP - If we can symbolically evaluate the specified GEP
/// constant expression, do so.
static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
@@ -636,13 +655,13 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
}
return 0;
}
-
+
unsigned BitWidth = TD->getTypeSizeInBits(IntPtrTy);
APInt Offset =
APInt(BitWidth, TD->getIndexedOffset(Ptr->getType(),
makeArrayRef((Value **)Ops.data() + 1,
Ops.size() - 1)));
- Ptr = cast<Constant>(Ptr->stripPointerCasts());
+ Ptr = StripPtrCastKeepAS(Ptr);
// If this is a GEP of a GEP, fold it all into a single GEP.
while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
@@ -661,7 +680,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
Ptr = cast<Constant>(GEP->getOperand(0));
Offset += APInt(BitWidth,
TD->getIndexedOffset(Ptr->getType(), NestedOps));
- Ptr = cast<Constant>(Ptr->stripPointerCasts());
+ Ptr = StripPtrCastKeepAS(Ptr);
}
// If the base value for this address is a literal integer value, fold the
@@ -780,14 +799,21 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I,
// all operands are constants.
if (isa<UndefValue>(Incoming))
continue;
- // If the incoming value is not a constant, or is a different constant to
- // the one we saw previously, then give up.
+ // If the incoming value is not a constant, then give up.
Constant *C = dyn_cast<Constant>(Incoming);
- if (!C || (CommonValue && C != CommonValue))
+ if (!C)
+ return 0;
+ // Fold the PHI's operands.
+ if (ConstantExpr *NewC = dyn_cast<ConstantExpr>(C))
+ C = ConstantFoldConstantExpression(NewC, TD, TLI);
+ // If the incoming value is a different constant to
+ // the one we saw previously, then give up.
+ if (CommonValue && C != CommonValue)
return 0;
CommonValue = C;
}
+
// If we reach here, all incoming values are the same constant or undef.
return CommonValue ? CommonValue : UndefValue::get(PN->getType());
}
@@ -795,12 +821,18 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I,
// Scan the operand list, checking to see if they are all constants, if so,
// hand off to ConstantFoldInstOperands.
SmallVector<Constant*, 8> Ops;
- for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
- if (Constant *Op = dyn_cast<Constant>(*i))
- Ops.push_back(Op);
- else
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
+ Constant *Op = dyn_cast<Constant>(*i);
+ if (!Op)
return 0; // All operands not constant!
+ // Fold the Instruction's operands.
+ if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(Op))
+ Op = ConstantFoldConstantExpression(NewCE, TD, TLI);
+
+ Ops.push_back(Op);
+ }
+
if (const CmpInst *CI = dyn_cast<CmpInst>(I))
return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1],
TD, TLI);
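The ReadDataFromGlobal fix is about the first, partially-read element: when the read starts Offset bytes into it, only EltSize - Offset bytes land in the buffer, and that is what BytesLeft and CurPtr must advance by; the old code advanced by a full EltSize and skewed every later element. A self-contained sketch of the corrected walk over plain byte buffers:

    #include <stdint.h>
    #include <string.h>
    #include <algorithm>

    // Copy from an array of fixed-size elements, starting Offset bytes
    // into element 0, until BytesLeft bytes have been written.
    static void readElements(const uint8_t *Elts, uint64_t NumElts,
                             uint64_t EltSize, uint64_t Offset,
                             uint8_t *CurPtr, uint64_t BytesLeft) {
      for (uint64_t i = 0; i != NumElts && BytesLeft; ++i) {
        uint64_t BytesWritten = std::min(EltSize - Offset, BytesLeft);
        memcpy(CurPtr, Elts + i * EltSize + Offset, BytesWritten);
        BytesLeft -= BytesWritten;  // the buggy version subtracted EltSize
        CurPtr += BytesWritten;     // and over-advanced here as well
        Offset = 0;                 // only the first element is partial
      }
    }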
diff --git a/lib/Analysis/DbgInfoPrinter.cpp b/lib/Analysis/DbgInfoPrinter.cpp
index cd832ab..41cd34c 100644
--- a/lib/Analysis/DbgInfoPrinter.cpp
+++ b/lib/Analysis/DbgInfoPrinter.cpp
@@ -16,14 +16,14 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Pass.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Metadata.h"
#include "llvm/Module.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Analysis/Passes.h"
+#include "llvm/Assembly/Writer.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Analysis/IPA/CMakeLists.txt b/lib/Analysis/IPA/CMakeLists.txt
index 8ffef29..34d6d1b 100644
--- a/lib/Analysis/IPA/CMakeLists.txt
+++ b/lib/Analysis/IPA/CMakeLists.txt
@@ -5,3 +5,5 @@ add_llvm_library(LLVMipa
GlobalsModRef.cpp
IPA.cpp
)
+
+add_dependencies(LLVMipa intrinsics_gen)
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index 963da75..449b7ee 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -246,7 +246,9 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC,
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
CallSite CS(cast<Value>(I));
- if (!CS || isa<IntrinsicInst>(I)) continue;
+ if (!CS) continue;
+ Function *Callee = CS.getCalledFunction();
+ if (Callee && Callee->isIntrinsic()) continue;
// If this call site already existed in the callgraph, just verify it
// matches up to expectations and remove it from CallSites.
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index c1d8e3e..22f6e96 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -329,15 +329,8 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
// Check the value being stored.
Value *Ptr = GetUnderlyingObject(SI->getOperand(0));
- if (isMalloc(Ptr)) {
- // Okay, easy case.
- } else if (CallInst *CI = dyn_cast<CallInst>(Ptr)) {
- Function *F = CI->getCalledFunction();
- if (!F || !F->isDeclaration()) return false; // Too hard to analyze.
- if (F->getName() != "calloc") return false; // Not calloc.
- } else {
+ if (!isAllocLikeFn(Ptr))
return false; // Too hard to analyze.
- }
// Analyze all uses of the allocation. If any of them are used in a
// non-simple way (e.g. stored to another global) bail out.
@@ -454,19 +447,18 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
for (inst_iterator II = inst_begin(SCC[i]->getFunction()),
E = inst_end(SCC[i]->getFunction());
II != E && FunctionEffect != ModRef; ++II)
- if (isa<LoadInst>(*II)) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) {
FunctionEffect |= Ref;
- if (cast<LoadInst>(*II).isVolatile())
+ if (LI->isVolatile())
// Volatile loads may have side-effects, so mark them as writing
// memory (for example, a flag inside the processor).
FunctionEffect |= Mod;
- } else if (isa<StoreInst>(*II)) {
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&*II)) {
FunctionEffect |= Mod;
- if (cast<StoreInst>(*II).isVolatile())
+ if (SI->isVolatile())
// Treat volatile stores as reading memory somewhere.
FunctionEffect |= Ref;
- } else if (isMalloc(&cast<Instruction>(*II)) ||
- isFreeCall(&cast<Instruction>(*II))) {
+ } else if (isAllocationFn(&*II) || isFreeCall(&*II)) {
FunctionEffect |= ModRef;
} else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) {
// The callgraph doesn't include intrinsic calls.
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index b80966b..0a6682a 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/ADT/STLExtras.h"
@@ -120,6 +121,12 @@ bool IVUsers::AddUsersImpl(Instruction *I,
if (!SE->isSCEVable(I->getType()))
return false; // Void and FP expressions cannot be reduced.
+ // IVUsers is used by LSR which assumes that all SCEV expressions are safe to
+ // pass to SCEVExpander. Expressions are not safe to expand if they represent
+ // operations that are not safe to speculate, namely integer division.
+ if (!isa<PHINode>(I) && !isSafeToSpeculativelyExecute(I, TD))
+ return false;
+
// LSR is not APInt clean, do not touch integers bigger than 64-bits.
// Also avoid creating IVs of non-native types. For example, we don't want a
// 64-bit IV in 32-bit code just because the loop has one 64-bit cast.
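The new IVUsers guard exists because LSR hands every expression it collects to SCEVExpander, which may materialize it on paths where the original never executed; an integer division whose divisor might be zero on such a path must therefore be rejected up front. The gate itself, isolated:

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/Instructions.h"
    using namespace llvm;

    // PHIs are exempt: they are rewritten, not re-expanded as
    // straight-line code. Everything else must be safe to speculate.
    static bool safeForExpander(Instruction *I, const TargetData *TD) {
      return isa<PHINode>(I) || isSafeToSpeculativelyExecute(I, TD);
    }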
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 3e3d2ab..bc1ecd2 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -178,7 +178,7 @@ bool CallAnalyzer::lookupSROAArgAndCost(
/// \brief Disable SROA for the candidate marked by this cost iterator.
///
-/// This markes the candidate as no longer viable for SROA, and adds the cost
+/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
// If we're no longer able to perform SROA we need to undo its cost savings
@@ -398,10 +398,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
SROAArgValues[&I] = SROAArg;
- // A ptrtoint cast is free so long as the result is large enough to store the
- // pointer, and a legal integer type.
- return TD && TD->isLegalInteger(IntegerSize) &&
- IntegerSize >= TD->getPointerSizeInBits();
+ return isInstructionFree(&I, TD);
}
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
@@ -428,10 +425,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
SROAArgValues[&I] = SROAArg;
- // An inttoptr cast is free so long as the input is a legal integer type
- // which doesn't contain values outside the range of a pointer.
- return TD && TD->isLegalInteger(IntegerSize) &&
- IntegerSize <= TD->getPointerSizeInBits();
+ return isInstructionFree(&I, TD);
}
bool CallAnalyzer::visitCastInst(CastInst &I) {
@@ -445,24 +439,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
// Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
disableSROA(I.getOperand(0));
- // No-op casts don't have any cost.
- if (I.isLosslessCast())
- return true;
-
- // trunc to a native type is free (assuming the target has compare and
- // shift-right of the same width).
- if (TD && isa<TruncInst>(I) &&
- TD->isLegalInteger(TD->getTypeSizeInBits(I.getType())))
- return true;
-
- // Result of a cmp instruction is often extended (to be used by other
- // cmp instructions, logical or return instructions). These are usually
- // no-ops on most sane targets.
- if (isa<CmpInst>(I.getOperand(0)))
- return true;
-
- // Assume the rest of the casts require work.
- return false;
+ return isInstructionFree(&I, TD);
}
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
@@ -636,21 +613,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
default:
return Base::visitCallSite(CS);
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_value:
- case Intrinsic::invariant_start:
- case Intrinsic::invariant_end:
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
case Intrinsic::memset:
case Intrinsic::memcpy:
case Intrinsic::memmove:
- case Intrinsic::objectsize:
- case Intrinsic::ptr_annotation:
- case Intrinsic::var_annotation:
- // SROA can usually chew through these intrinsics and they have no cost
- // so don't pay the price of analyzing them in detail.
- return true;
+ // SROA can usually chew through these intrinsics, but they aren't free.
+ return false;
}
}
@@ -662,7 +629,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
return false;
}
- if (!callIsSmall(F)) {
+ if (!callIsSmall(CS)) {
// We account for the average 1 instruction per call argument setup
// here.
Cost += CS.arg_size() * InlineConstants::InstrCost;
@@ -706,6 +673,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
}
bool CallAnalyzer::visitInstruction(Instruction &I) {
+ // Some instructions are free. All of the free intrinsics can also be
+ // handled by SROA, etc.
+ if (isInstructionFree(&I, TD))
+ return true;
+
// We found something we don't understand or can't handle. Mark any SROA-able
// values in the operand list as no longer viable.
for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
@@ -825,9 +797,33 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
FiftyPercentVectorBonus = Threshold;
TenPercentVectorBonus = Threshold / 2;
- // Subtract off one instruction per call argument as those will be free after
- // inlining.
- Cost -= CS.arg_size() * InlineConstants::InstrCost;
+ // Give out bonuses per argument, as the instructions setting them up will
+ // be gone after inlining.
+ for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+ if (TD && CS.isByValArgument(I)) {
+ // We approximate the number of loads and stores needed by dividing the
+ // size of the byval type by the target's pointer size.
+ PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+ unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType());
+ unsigned PointerSize = TD->getPointerSizeInBits();
+ // Ceiling division.
+ unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+ // If it generates more than 8 stores it is likely to be expanded as an
+ // inline memcpy so we take that as an upper bound. Otherwise we assume
+ // one load and one store per word copied.
+ // FIXME: The maxStoresPerMemcpy setting from the target should be used
+ // here instead of a magic number of 8, but it's not available via
+ // TargetData.
+ NumStores = std::min(NumStores, 8U);
+
+ Cost -= 2 * NumStores * InlineConstants::InstrCost;
+ } else {
+ // For non-byval arguments subtract off one instruction per call
+ // argument.
+ Cost -= InlineConstants::InstrCost;
+ }
+ }
// If there is only one call of the function, and it has internal linkage,
// the cost of inlining it drops dramatically.
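To make the byval bonus concrete: with 64-bit pointers, a 200-byte (1600-bit) byval struct needs 1600/64 = 25 word copies, which the cap reduces to 8, so the credit is 2 * 8 * InstrCost. The arithmetic, isolated (InlineConstants::InstrCost was 5 at the time, but it is passed in here rather than assumed):

    #include <algorithm>

    // Bonus credited for one byval argument; sizes are in bits.
    static int byvalBonus(unsigned TypeSizeBits, unsigned PointerSizeBits,
                          int InstrCost) {
      unsigned NumStores =
          (TypeSizeBits + PointerSizeBits - 1) / PointerSizeBits; // ceiling
      NumStores = std::min(NumStores, 8U); // past 8, expect a real memcpy
      return 2 * NumStores * InstrCost;    // one load + one store per word
    }
    // byvalBonus(1600, 64, 5) == 80 cost units credited.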
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 16e7a72..379a35a 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -47,7 +47,7 @@ struct Query {
const DominatorTree *DT;
Query(const TargetData *td, const TargetLibraryInfo *tli,
- const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {};
+ const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {}
};
static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned);
@@ -1719,10 +1719,13 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
return ConstantInt::get(ITy, false);
// A local identified object (alloca or noalias call) can't equal any
- // incoming argument, unless they're both null.
- if (isa<Instruction>(LHSPtr) && isa<Argument>(RHSPtr) &&
- Pred == CmpInst::ICMP_EQ)
- return ConstantInt::get(ITy, false);
+ // incoming argument, unless they're both null or they belong to
+ // different functions. The latter happens during inlining.
+ if (Instruction *LHSInst = dyn_cast<Instruction>(LHSPtr))
+ if (Argument *RHSArg = dyn_cast<Argument>(RHSPtr))
+ if (LHSInst->getParent()->getParent() == RHSArg->getParent() &&
+ Pred == CmpInst::ICMP_EQ)
+ return ConstantInt::get(ITy, false);
}
// Assume that the constant null is on the right.
@@ -1732,14 +1735,17 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
else if (Pred == CmpInst::ICMP_NE)
return ConstantInt::get(ITy, true);
}
- } else if (isa<Argument>(LHSPtr)) {
+ } else if (Argument *LHSArg = dyn_cast<Argument>(LHSPtr)) {
RHSPtr = RHSPtr->stripInBoundsOffsets();
- // An alloca can't be equal to an argument.
- if (isa<AllocaInst>(RHSPtr)) {
- if (Pred == CmpInst::ICMP_EQ)
- return ConstantInt::get(ITy, false);
- else if (Pred == CmpInst::ICMP_NE)
- return ConstantInt::get(ITy, true);
+ // An alloca can't be equal to an argument unless they come from separate
+ // functions via inlining.
+ if (AllocaInst *RHSInst = dyn_cast<AllocaInst>(RHSPtr)) {
+ if (LHSArg->getParent() == RHSInst->getParent()->getParent()) {
+ if (Pred == CmpInst::ICMP_EQ)
+ return ConstantInt::get(ITy, false);
+ else if (Pred == CmpInst::ICMP_NE)
+ return ConstantInt::get(ITy, true);
+ }
}
}
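Both hunks reduce to the same precondition: the local-object-versus-argument folding is only sound when the two values already live in the same function, which can be false while the inliner is mid-splice. Condensed:

    #include "llvm/Argument.h"
    #include "llvm/BasicBlock.h"
    #include "llvm/Function.h"
    #include "llvm/Instruction.h"
    using namespace llvm;

    // Precondition for the "local object != argument" folding above.
    static bool sameFunction(const Instruction *I, const Argument *A) {
      return I->getParent()->getParent() == A->getParent();
    }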
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 5ca2746..9140786 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -172,7 +172,7 @@ public:
if (NewR.isEmptySet())
return markOverdefined();
- bool changed = Range == NewR;
+ bool changed = Range != NewR;
Range = NewR;
return changed;
}
@@ -457,8 +457,10 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
void LazyValueInfoCache::solve() {
while (!BlockValueStack.empty()) {
std::pair<BasicBlock*, Value*> &e = BlockValueStack.top();
- if (solveBlockValue(e.second, e.first))
+ if (solveBlockValue(e.second, e.first)) {
+ assert(BlockValueStack.top() == e);
BlockValueStack.pop();
+ }
}
}
@@ -766,15 +768,10 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
return true;
}
-/// getEdgeValue - This method attempts to infer more complex
-bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
- BasicBlock *BBTo, LVILatticeVal &Result) {
- // If already a constant, there is nothing to compute.
- if (Constant *VC = dyn_cast<Constant>(Val)) {
- Result = LVILatticeVal::get(VC);
- return true;
- }
-
+/// \brief Compute the value of Val on the edge BBFrom -> BBTo. Returns false if
+/// Val is not constrained on the edge.
+static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
+ BasicBlock *BBTo, LVILatticeVal &Result) {
// TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we
// know that v != 0.
if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) {
@@ -818,7 +815,7 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
ConstantInt *CI = dyn_cast<ConstantInt>(ICI->getOperand(1));
if (CI && (ICI->getOperand(0) == Val || NegOffset)) {
// Calculate the range of values that would satisfy the comparison.
- ConstantRange CmpRange(CI->getValue(), CI->getValue()+1);
+ ConstantRange CmpRange(CI->getValue());
ConstantRange TrueValues =
ConstantRange::makeICmpRegion(ICI->getPredicate(), CmpRange);
@@ -827,25 +824,8 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
// If we're interested in the false dest, invert the condition.
if (!isTrueDest) TrueValues = TrueValues.inverse();
-
- // Figure out the possible values of the query BEFORE this branch.
- if (!hasBlockValue(Val, BBFrom)) {
- BlockValueStack.push(std::make_pair(BBFrom, Val));
- return false;
- }
-
- LVILatticeVal InBlock = getBlockValue(Val, BBFrom);
- if (!InBlock.isConstantRange()) {
- Result = LVILatticeVal::getRange(TrueValues);
- return true;
- }
-
- // Find all potential values that satisfy both the input and output
- // conditions.
- ConstantRange PossibleValues =
- TrueValues.intersectWith(InBlock.getConstantRange());
-
- Result = LVILatticeVal::getRange(PossibleValues);
+
+ Result = LVILatticeVal::getRange(TrueValues);
return true;
}
}
@@ -855,40 +835,71 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
// If the edge was formed by a switch on the value, then we may know exactly
// what it is.
if (SwitchInst *SI = dyn_cast<SwitchInst>(BBFrom->getTerminator())) {
- if (SI->getCondition() == Val) {
- // We don't know anything in the default case.
- if (SI->getDefaultDest() == BBTo) {
- Result.markOverdefined();
- return true;
- }
-
- // We only know something if there is exactly one value that goes from
- // BBFrom to BBTo.
- unsigned NumEdges = 0;
- ConstantInt *EdgeVal = 0;
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- if (i.getCaseSuccessor() != BBTo) continue;
- if (NumEdges++) break;
- EdgeVal = i.getCaseValue();
- }
- assert(EdgeVal && "Missing successor?");
- if (NumEdges == 1) {
- Result = LVILatticeVal::get(EdgeVal);
- return true;
- }
+ if (SI->getCondition() != Val)
+ return false;
+
+ bool DefaultCase = SI->getDefaultDest() == BBTo;
+ unsigned BitWidth = Val->getType()->getIntegerBitWidth();
+ ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/);
+
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ ConstantRange EdgeVal(i.getCaseValue()->getValue());
+ if (DefaultCase)
+ EdgesVals = EdgesVals.difference(EdgeVal);
+ else if (i.getCaseSuccessor() == BBTo)
+ EdgesVals = EdgesVals.unionWith(EdgeVal);
}
- }
-
- // Otherwise see if the value is known in the block.
- if (hasBlockValue(Val, BBFrom)) {
- Result = getBlockValue(Val, BBFrom);
+ Result = LVILatticeVal::getRange(EdgesVals);
return true;
}
- BlockValueStack.push(std::make_pair(BBFrom, Val));
return false;
}
+/// \brief Compute the value of Val on the edge BBFrom -> BBTo, or the value at
+/// the basic block if the edge does not constrain Val.
+bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
+ BasicBlock *BBTo, LVILatticeVal &Result) {
+ // If already a constant, there is nothing to compute.
+ if (Constant *VC = dyn_cast<Constant>(Val)) {
+ Result = LVILatticeVal::get(VC);
+ return true;
+ }
+
+ if (getEdgeValueLocal(Val, BBFrom, BBTo, Result)) {
+ if (!Result.isConstantRange() ||
+ Result.getConstantRange().getSingleElement())
+ return true;
+
+ // FIXME: this check should be moved to the beginning of the function when
+ // LVI better supports recursive values. Even for the single value case, we
+ // can intersect to detect dead code (an empty range).
+ if (!hasBlockValue(Val, BBFrom)) {
+ BlockValueStack.push(std::make_pair(BBFrom, Val));
+ return false;
+ }
+
+ // Try to intersect ranges of the BB and the constraint on the edge.
+ LVILatticeVal InBlock = getBlockValue(Val, BBFrom);
+ if (!InBlock.isConstantRange())
+ return true;
+
+ ConstantRange Range =
+ Result.getConstantRange().intersectWith(InBlock.getConstantRange());
+ Result = LVILatticeVal::getRange(Range);
+ return true;
+ }
+
+ if (!hasBlockValue(Val, BBFrom)) {
+ BlockValueStack.push(std::make_pair(BBFrom, Val));
+ return false;
+ }
+
+  // If we couldn't compute the value on the edge, use the value from the BB.
+ Result = getBlockValue(Val, BBFrom);
+ return true;
+}
+
LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB) {
DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '"
<< BB->getName() << "'\n");
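The rewritten switch handling folds every edge into a single ConstantRange: union the case values on an edge to a case block, or start from the full set and subtract each case on the edge to the default. Since a ConstantRange is one (possibly wrapping) interval, both operations are conservative and may widen. A standalone illustration with two i8 cases:

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/ConstantRange.h"
    using namespace llvm;

    static void switchEdgeRanges() {
      ConstantRange Case3(APInt(8, 3)), Case7(APInt(8, 7));

      // Edge into a case block: union of the values that reach it.
      // {3} u {7} widens to the interval [3,8) -- still a sound bound.
      ConstantRange ToCase(8, /*isFullSet=*/false);
      ToCase = ToCase.unionWith(Case3).unionWith(Case7);

      // Edge into the default block: full set minus every case value,
      // again a conservative single-interval superset of the true set.
      ConstantRange ToDefault(8, /*isFullSet=*/true);
      ToDefault = ToDefault.difference(Case3).difference(Case7);
    }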
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index f7a60a1..20c33a3 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfoImpl.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Assembly/Writer.h"
@@ -29,6 +30,10 @@
#include <algorithm>
using namespace llvm;
+// Explicitly instantiate methods in LoopInfoImpl.h for IR-level Loops.
+template class llvm::LoopBase<BasicBlock, Loop>;
+template class llvm::LoopInfoBase<BasicBlock, Loop>;
+
// Always verify loopinfo if expensive checking is enabled.
#ifdef XDEBUG
static bool VerifyLoopInfo = true;
@@ -507,7 +512,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
//
bool LoopInfo::runOnFunction(Function &) {
releaseMemory();
- LI.Calculate(getAnalysis<DominatorTree>().getBase()); // Update
+ LI.Analyze(getAnalysis<DominatorTree>().getBase());
return false;
}
@@ -589,9 +594,6 @@ void LoopInfo::verifyAnalysis() const {
}
// Verify that blocks are mapped to valid loops.
- //
- // FIXME: With an up-to-date DFS (see LoopIterator.h) and DominatorTree, we
- // could also verify that the blocks are still in the correct loops.
for (DenseMap<BasicBlock*, Loop*>::const_iterator I = LI.BBMap.begin(),
E = LI.BBMap.end(); I != E; ++I) {
assert(Loops.count(I->second) && "orphaned loop");
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index aba700a..1540112 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -162,7 +162,7 @@ void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) {
// Recurse through all subloops and all loops into LQ.
static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) {
LQ.push_back(L);
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I)
addLoopIntoQueue(*I, LQ);
}
@@ -183,8 +183,12 @@ bool LPPassManager::runOnFunction(Function &F) {
// Collect inherited analysis from Module level pass manager.
populateInheritedAnalysis(TPM->activeStack);
- // Populate Loop Queue
- for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ // Populate the loop queue in reverse program order. There is no clear need to
+ // process sibling loops in either forward or reverse order. There may be some
+ // advantage in deleting uses in a later loop before optimizing the
+ // definitions in an earlier loop. If we find a clear reason to process in
+ // forward order, then a forward variant of LoopPassManager should be created.
+ for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I)
addLoopIntoQueue(*I, LQ);
if (LQ.empty()) // No loops, skip calling finalizers
diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp
index 22414b3..8578a63 100644
--- a/lib/Analysis/MemDepPrinter.cpp
+++ b/lib/Analysis/MemDepPrinter.cpp
@@ -32,7 +32,7 @@ namespace {
Unknown
};
- static const char* DepTypeStr[];
+ static const char *const DepTypeStr[];
typedef PointerIntPair<const Instruction *, 2, DepType> InstTypePair;
typedef std::pair<InstTypePair, const BasicBlock *> Dep;
@@ -88,7 +88,7 @@ FunctionPass *llvm::createMemDepPrinter() {
return new MemDepPrinter();
}
-const char* MemDepPrinter::DepTypeStr[]
+const char *const MemDepPrinter::DepTypeStr[]
= {"Clobber", "Def", "NonFuncLocal", "Unknown"};
bool MemDepPrinter::runOnFunction(Function &F) {
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index b145650..c0cc27b 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -12,80 +12,168 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "memory-builtins"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Constants.h"
+#include "llvm/GlobalVariable.h"
#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Metadata.h"
#include "llvm/Module.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
-//===----------------------------------------------------------------------===//
-// malloc Call Utility Functions.
-//
+enum AllocType {
+ MallocLike = 1<<0, // allocates
+ CallocLike = 1<<1, // allocates + bzero
+ ReallocLike = 1<<2, // reallocates
+ StrDupLike = 1<<3,
+ AllocLike = MallocLike | CallocLike | StrDupLike,
+ AnyAlloc = MallocLike | CallocLike | ReallocLike | StrDupLike
+};
+
+struct AllocFnsTy {
+ const char *Name;
+ AllocType AllocTy;
+ unsigned char NumParams;
+ // First and Second size parameters (or -1 if unused)
+ signed char FstParam, SndParam;
+};
+
+// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
+// know which functions are nounwind/noalias and which parameters are nocapture.
+static const AllocFnsTy AllocationFnData[] = {
+ {"malloc", MallocLike, 1, 0, -1},
+ {"valloc", MallocLike, 1, 0, -1},
+ {"_Znwj", MallocLike, 1, 0, -1}, // new(unsigned int)
+ {"_ZnwjRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
+ {"_Znwm", MallocLike, 1, 0, -1}, // new(unsigned long)
+ {"_ZnwmRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned long, nothrow)
+ {"_Znaj", MallocLike, 1, 0, -1}, // new[](unsigned int)
+ {"_ZnajRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
+ {"_Znam", MallocLike, 1, 0, -1}, // new[](unsigned long)
+ {"_ZnamRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow)
+ {"posix_memalign", MallocLike, 3, 2, -1},
+ {"calloc", CallocLike, 2, 0, 1},
+ {"realloc", ReallocLike, 2, 1, -1},
+ {"reallocf", ReallocLike, 2, 1, -1},
+ {"strdup", StrDupLike, 1, -1, -1},
+ {"strndup", StrDupLike, 2, 1, -1}
+};
+
+
+static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) {
+ if (LookThroughBitCast)
+ V = V->stripPointerCasts();
-/// isMalloc - Returns true if the value is either a malloc call or a
-/// bitcast of the result of a malloc call.
-bool llvm::isMalloc(const Value *I) {
- return extractMallocCall(I) || extractMallocCallFromBitCast(I);
+ CallSite CS(const_cast<Value*>(V));
+ if (!CS.getInstruction())
+ return 0;
+
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || !Callee->isDeclaration())
+ return 0;
+ return Callee;
}
-static bool isMallocCall(const CallInst *CI) {
- if (!CI)
- return false;
+/// \brief Returns the allocation data for the given value if it is a call to a
+/// known allocation function, and NULL otherwise.
+static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
+ bool LookThroughBitCast = false) {
+ Function *Callee = getCalledFunction(V, LookThroughBitCast);
+ if (!Callee)
+ return 0;
- Function *Callee = CI->getCalledFunction();
- if (Callee == 0 || !Callee->isDeclaration())
- return false;
- if (Callee->getName() != "malloc" &&
- Callee->getName() != "_Znwj" && // operator new(unsigned int)
- Callee->getName() != "_Znwm" && // operator new(unsigned long)
- Callee->getName() != "_Znaj" && // operator new[](unsigned int)
- Callee->getName() != "_Znam") // operator new[](unsigned long)
- return false;
+ unsigned i = 0;
+ bool found = false;
+ for ( ; i < array_lengthof(AllocationFnData); ++i) {
+ if (Callee->getName() == AllocationFnData[i].Name) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ return 0;
- // Check malloc prototype.
- // FIXME: workaround for PR5130, this will be obsolete when a nobuiltin
- // attribute will exist.
+ const AllocFnsTy *FnData = &AllocationFnData[i];
+ if ((FnData->AllocTy & AllocTy) == 0)
+ return 0;
+
+ // Check function prototype.
+ // FIXME: Check the nobuiltin metadata?? (PR5130)
+ int FstParam = FnData->FstParam;
+ int SndParam = FnData->SndParam;
FunctionType *FTy = Callee->getFunctionType();
- return FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) &&
- FTy->getNumParams() == 1 &&
- (FTy->getParamType(0)->isIntegerTy(32) ||
- FTy->getParamType(0)->isIntegerTy(64));
+
+ if (FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) &&
+ FTy->getNumParams() == FnData->NumParams &&
+ (FstParam < 0 ||
+ (FTy->getParamType(FstParam)->isIntegerTy(32) ||
+ FTy->getParamType(FstParam)->isIntegerTy(64))) &&
+ (SndParam < 0 ||
+ FTy->getParamType(SndParam)->isIntegerTy(32) ||
+ FTy->getParamType(SndParam)->isIntegerTy(64)))
+ return FnData;
+ return 0;
}
-/// extractMallocCall - Returns the corresponding CallInst if the instruction
-/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
-/// ignore InvokeInst here.
-const CallInst *llvm::extractMallocCall(const Value *I) {
- const CallInst *CI = dyn_cast<CallInst>(I);
- return (isMallocCall(CI)) ? CI : NULL;
+static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
+ ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
+ return CS && CS.hasFnAttr(Attribute::NoAlias);
}
-CallInst *llvm::extractMallocCall(Value *I) {
- CallInst *CI = dyn_cast<CallInst>(I);
- return (isMallocCall(CI)) ? CI : NULL;
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
+/// like).
+bool llvm::isAllocationFn(const Value *V, bool LookThroughBitCast) {
+ return getAllocationData(V, AnyAlloc, LookThroughBitCast);
}
-static bool isBitCastOfMallocCall(const BitCastInst *BCI) {
- if (!BCI)
- return false;
-
- return isMallocCall(dyn_cast<CallInst>(BCI->getOperand(0)));
+/// \brief Tests if a value is a call or invoke to a function that returns a
+/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
+bool llvm::isNoAliasFn(const Value *V, bool LookThroughBitCast) {
+ // it's safe to consider realloc as noalias since accessing the original
+ // pointer is undefined behavior
+ return isAllocationFn(V, LookThroughBitCast) ||
+ hasNoAliasAttr(V, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates uninitialized memory (such as malloc).
+bool llvm::isMallocLikeFn(const Value *V, bool LookThroughBitCast) {
+ return getAllocationData(V, MallocLike, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates zero-filled memory (such as calloc).
+bool llvm::isCallocLikeFn(const Value *V, bool LookThroughBitCast) {
+ return getAllocationData(V, CallocLike, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory (either malloc, calloc, or strdup like).
+bool llvm::isAllocLikeFn(const Value *V, bool LookThroughBitCast) {
+ return getAllocationData(V, AllocLike, LookThroughBitCast);
}
-/// extractMallocCallFromBitCast - Returns the corresponding CallInst if the
-/// instruction is a bitcast of the result of a malloc call.
-CallInst *llvm::extractMallocCallFromBitCast(Value *I) {
- BitCastInst *BCI = dyn_cast<BitCastInst>(I);
- return (isBitCastOfMallocCall(BCI)) ? cast<CallInst>(BCI->getOperand(0))
- : NULL;
+/// \brief Tests if a value is a call or invoke to a library function that
+/// reallocates memory (such as realloc).
+bool llvm::isReallocLikeFn(const Value *V, bool LookThroughBitCast) {
+ return getAllocationData(V, ReallocLike, LookThroughBitCast);
}
-const CallInst *llvm::extractMallocCallFromBitCast(const Value *I) {
- const BitCastInst *BCI = dyn_cast<BitCastInst>(I);
- return (isBitCastOfMallocCall(BCI)) ? cast<CallInst>(BCI->getOperand(0))
- : NULL;
+/// extractMallocCall - Returns the corresponding CallInst if the instruction
+/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
+/// ignore InvokeInst here.
+const CallInst *llvm::extractMallocCall(const Value *I) {
+ return isMallocLikeFn(I) ? dyn_cast<CallInst>(I) : 0;
}
static Value *computeArraySize(const CallInst *CI, const TargetData *TD,
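Everything above replaces scattered name-comparison chains with one table lookup, so client passes can now ask category questions instead of re-listing allocator names. A hedged usage sketch (assuming the header defaults LookThroughBitCast to false, as the definitions above suggest):

    #include "llvm/Analysis/MemoryBuiltins.h"
    using namespace llvm;

    // Order matters only for the message: the categories overlap by design
    // (e.g. a calloc call is also matched by isAllocationFn).
    static const char *classifyAlloc(const Value *V) {
      if (isCallocLikeFn(V))  return "zero-initializing allocation";
      if (isMallocLikeFn(V))  return "uninitialized allocation";
      if (isReallocLikeFn(V)) return "reallocation";
      if (isAllocationFn(V))  return "other allocation (e.g. strdup)";
      return "not a recognized allocator";
    }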
@@ -134,7 +222,7 @@ const CallInst *llvm::isArrayMalloc(const Value *I, const TargetData *TD) {
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
PointerType *llvm::getMallocType(const CallInst *CI) {
- assert(isMalloc(CI) && "getMallocType and not malloc call");
+ assert(isMallocLikeFn(CI) && "getMallocType and not malloc call");
PointerType *MallocType = NULL;
unsigned NumOfBitCastUses = 0;
@@ -176,13 +264,17 @@ Type *llvm::getMallocAllocatedType(const CallInst *CI) {
/// determined.
Value *llvm::getMallocArraySize(CallInst *CI, const TargetData *TD,
bool LookThroughSExt) {
- assert(isMalloc(CI) && "getMallocArraySize and not malloc call");
+ assert(isMallocLikeFn(CI) && "getMallocArraySize and not malloc call");
return computeArraySize(CI, TD, LookThroughSExt);
}
-//===----------------------------------------------------------------------===//
-// free Call Utility Functions.
-//
+
+/// extractCallocCall - Returns the corresponding CallInst if the instruction
+/// is a calloc call.
+const CallInst *llvm::extractCallocCall(const Value *I) {
+ return isCallocLikeFn(I) ? cast<CallInst>(I) : 0;
+}
+
/// isFreeCall - Returns non-null if the value is a call to the builtin free()
const CallInst *llvm::isFreeCall(const Value *I) {
@@ -211,3 +303,438 @@ const CallInst *llvm::isFreeCall(const Value *I) {
return CI;
}
+
+
+
+//===----------------------------------------------------------------------===//
+// Utility functions to compute size of objects.
+//
+
+
+/// \brief Compute the size of the object pointed to by Ptr. Returns true and
+/// the object size in Size if successful, and false otherwise.
+/// If RoundToAlign is true, then Size is rounded up to the alignment of
+/// allocas, byval arguments, and global variables.
+bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD,
+ bool RoundToAlign) {
+ if (!TD)
+ return false;
+
+ ObjectSizeOffsetVisitor Visitor(TD, Ptr->getContext(), RoundToAlign);
+ SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
+ if (!Visitor.bothKnown(Data))
+ return false;
+
+ APInt ObjSize = Data.first, Offset = Data.second;
+ // check for overflow
+ if (Offset.slt(0) || ObjSize.ult(Offset))
+ Size = 0;
+ else
+ Size = (ObjSize - Offset).getZExtValue();
+ return true;
+}
+
+
+STATISTIC(ObjectVisitorArgument,
+ "Number of arguments with unsolved size and offset");
+STATISTIC(ObjectVisitorLoad,
+ "Number of load instructions with unsolved size and offset");
+
+
+APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
+ if (RoundToAlign && Align)
+ return APInt(IntTyBits, RoundUpToAlignment(Size.getZExtValue(), Align));
+ return Size;
+}
+
+ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const TargetData *TD,
+ LLVMContext &Context,
+ bool RoundToAlign)
+: TD(TD), RoundToAlign(RoundToAlign) {
+ IntegerType *IntTy = TD->getIntPtrType(Context);
+ IntTyBits = IntTy->getBitWidth();
+ Zero = APInt::getNullValue(IntTyBits);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
+ V = V->stripPointerCasts();
+
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
+ return visitGEPOperator(*GEP);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return visit(*I);
+ if (Argument *A = dyn_cast<Argument>(V))
+ return visitArgument(*A);
+ if (ConstantPointerNull *P = dyn_cast<ConstantPointerNull>(V))
+ return visitConstantPointerNull(*P);
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ return visitGlobalVariable(*GV);
+ if (UndefValue *UV = dyn_cast<UndefValue>(V))
+ return visitUndefValue(*UV);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::IntToPtr)
+ return unknown(); // clueless
+
+ DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V
+ << '\n');
+ return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
+ if (!I.getAllocatedType()->isSized())
+ return unknown();
+
+ APInt Size(IntTyBits, TD->getTypeAllocSize(I.getAllocatedType()));
+ if (!I.isArrayAllocation())
+ return std::make_pair(align(Size, I.getAlignment()), Zero);
+
+ Value *ArraySize = I.getArraySize();
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(ArraySize)) {
+ Size *= C->getValue().zextOrSelf(IntTyBits);
+ return std::make_pair(align(Size, I.getAlignment()), Zero);
+ }
+ return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
+ // no interprocedural analysis is done at the moment
+ if (!A.hasByValAttr()) {
+ ++ObjectVisitorArgument;
+ return unknown();
+ }
+ PointerType *PT = cast<PointerType>(A.getType());
+ APInt Size(IntTyBits, TD->getTypeAllocSize(PT->getElementType()));
+ return std::make_pair(align(Size, A.getParamAlignment()), Zero);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
+ const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc);
+ if (!FnData)
+ return unknown();
+
+ // handle strdup-like functions separately
+ if (FnData->AllocTy == StrDupLike) {
+ APInt Size(IntTyBits, GetStringLength(CS.getArgument(0)));
+ if (!Size)
+ return unknown();
+
+ // strndup limits strlen
+ if (FnData->FstParam > 0) {
+      ConstantInt *Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
+ if (!Arg)
+ return unknown();
+
+ APInt MaxSize = Arg->getValue().zextOrSelf(IntTyBits);
+ if (Size.ugt(MaxSize))
+ Size = MaxSize + 1;
+ }
+ return std::make_pair(Size, Zero);
+ }
+
+ ConstantInt *Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
+ if (!Arg)
+ return unknown();
+
+ APInt Size = Arg->getValue().zextOrSelf(IntTyBits);
+ // size determined by just 1 parameter
+ if (FnData->SndParam < 0)
+ return std::make_pair(Size, Zero);
+
+ Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->SndParam));
+ if (!Arg)
+ return unknown();
+
+ Size *= Arg->getValue().zextOrSelf(IntTyBits);
+ return std::make_pair(Size, Zero);
+
+ // TODO: handle more standard functions (+ wchar cousins):
+ // - strdup / strndup
+ // - strcpy / strncpy
+ // - strcat / strncat
+ // - memcpy / memmove
+ // - memset
+}
+
+SizeOffsetType
+ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull&) {
+ return std::make_pair(Zero, Zero);
+}
+
+SizeOffsetType
+ObjectSizeOffsetVisitor::visitExtractElementInst(ExtractElementInst&) {
+ return unknown();
+}
+
+SizeOffsetType
+ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) {
+ // Easy cases were already folded by previous passes.
+ return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) {
+ SizeOffsetType PtrData = compute(GEP.getPointerOperand());
+ if (!bothKnown(PtrData) || !GEP.hasAllConstantIndices())
+ return unknown();
+
+ SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end());
+ APInt Offset(IntTyBits,TD->getIndexedOffset(GEP.getPointerOperandType(),Ops));
+ return std::make_pair(PtrData.first, PtrData.second + Offset);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){
+ if (!GV.hasDefinitiveInitializer())
+ return unknown();
+
+ APInt Size(IntTyBits, TD->getTypeAllocSize(GV.getType()->getElementType()));
+ return std::make_pair(align(Size, GV.getAlignment()), Zero);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitIntToPtrInst(IntToPtrInst&) {
+ // clueless
+ return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitLoadInst(LoadInst&) {
+ ++ObjectVisitorLoad;
+ return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode&) {
+ // too complex to analyze statically.
+ return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
+ // ignore malformed self-looping selects
+ if (I.getTrueValue() == &I || I.getFalseValue() == &I)
+ return unknown();
+
+ SizeOffsetType TrueSide = compute(I.getTrueValue());
+ SizeOffsetType FalseSide = compute(I.getFalseValue());
+ if (bothKnown(TrueSide) && bothKnown(FalseSide) && TrueSide == FalseSide)
+ return TrueSide;
+ return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitUndefValue(UndefValue&) {
+ return std::make_pair(Zero, Zero);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) {
+ DEBUG(dbgs() << "ObjectSizeOffsetVisitor unknown instruction:" << I << '\n');
+ return unknown();
+}
+
+
+ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const TargetData *TD,
+ LLVMContext &Context)
+: TD(TD), Context(Context), Builder(Context, TargetFolder(TD)),
+Visitor(TD, Context) {
+ IntTy = TD->getIntPtrType(Context);
+ Zero = ConstantInt::get(IntTy, 0);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
+ SizeOffsetEvalType Result = compute_(V);
+
+ if (!bothKnown(Result)) {
+ // erase everything that was computed in this iteration from the cache, so
+ // that no dangling references are left behind. We could be a bit smarter if
+ // we kept a dependency graph. It's probably not worth the complexity.
+ for (PtrSetTy::iterator I=SeenVals.begin(), E=SeenVals.end(); I != E; ++I) {
+ CacheMapTy::iterator CacheIt = CacheMap.find(*I);
+ // non-computable results can be safely cached
+ if (CacheIt != CacheMap.end() && anyKnown(CacheIt->second))
+ CacheMap.erase(CacheIt);
+ }
+ }
+
+ SeenVals.clear();
+ return Result;
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
+ SizeOffsetType Const = Visitor.compute(V);
+ if (Visitor.bothKnown(Const))
+ return std::make_pair(ConstantInt::get(Context, Const.first),
+ ConstantInt::get(Context, Const.second));
+
+ V = V->stripPointerCasts();
+
+ // check cache
+ CacheMapTy::iterator CacheIt = CacheMap.find(V);
+ if (CacheIt != CacheMap.end())
+ return CacheIt->second;
+
+ // always generate code immediately before the instruction being
+ // processed, so that the generated code dominates the same BBs
+ Instruction *PrevInsertPoint = Builder.GetInsertPoint();
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Builder.SetInsertPoint(I);
+
+ // record the pointers that were handled in this run, so that they can be
+ // cleaned later if something fails
+ SeenVals.insert(V);
+
+ // now compute the size and offset
+ SizeOffsetEvalType Result;
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ Result = visitGEPOperator(*GEP);
+ } else if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Result = visit(*I);
+ } else if (isa<Argument>(V) ||
+ (isa<ConstantExpr>(V) &&
+ cast<ConstantExpr>(V)->getOpcode() == Instruction::IntToPtr) ||
+ isa<GlobalVariable>(V)) {
+ // ignore values where we cannot do more than what ObjectSizeVisitor can
+ Result = unknown();
+ } else {
+ DEBUG(dbgs() << "ObjectSizeOffsetEvaluator::compute() unhandled value: "
+ << *V << '\n');
+ Result = unknown();
+ }
+
+ if (PrevInsertPoint)
+ Builder.SetInsertPoint(PrevInsertPoint);
+
+ // Don't reuse CacheIt since it may be invalid at this point.
+ CacheMap[V] = Result;
+ return Result;
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
+ if (!I.getAllocatedType()->isSized())
+ return unknown();
+
+ // must be a VLA
+ assert(I.isArrayAllocation());
+ Value *ArraySize = I.getArraySize();
+ Value *Size = ConstantInt::get(ArraySize->getType(),
+ TD->getTypeAllocSize(I.getAllocatedType()));
+ Size = Builder.CreateMul(Size, ArraySize);
+ return std::make_pair(Size, Zero);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) {
+ const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc);
+ if (!FnData)
+ return unknown();
+
+ // handle strdup-like functions separately
+ if (FnData->AllocTy == StrDupLike) {
+ // TODO
+ return unknown();
+ }
+
+ Value *FirstArg = CS.getArgument(FnData->FstParam);
+ FirstArg = Builder.CreateZExt(FirstArg, IntTy);
+ if (FnData->SndParam < 0)
+ return std::make_pair(FirstArg, Zero);
+
+ Value *SecondArg = CS.getArgument(FnData->SndParam);
+ SecondArg = Builder.CreateZExt(SecondArg, IntTy);
+ Value *Size = Builder.CreateMul(FirstArg, SecondArg);
+ return std::make_pair(Size, Zero);
+
+ // TODO: handle more standard functions (+ wchar cousins):
+ // - strdup / strndup
+ // - strcpy / strncpy
+ // - strcat / strncat
+ // - memcpy / memmove
+ // - memset
+}
+
+SizeOffsetEvalType
+ObjectSizeOffsetEvaluator::visitExtractElementInst(ExtractElementInst&) {
+ return unknown();
+}
+
+SizeOffsetEvalType
+ObjectSizeOffsetEvaluator::visitExtractValueInst(ExtractValueInst&) {
+ return unknown();
+}
+
+SizeOffsetEvalType
+ObjectSizeOffsetEvaluator::visitGEPOperator(GEPOperator &GEP) {
+ SizeOffsetEvalType PtrData = compute_(GEP.getPointerOperand());
+ if (!bothKnown(PtrData))
+ return unknown();
+
+ Value *Offset = EmitGEPOffset(&Builder, *TD, &GEP, /*NoAssumptions=*/true);
+ Offset = Builder.CreateAdd(PtrData.second, Offset);
+ return std::make_pair(PtrData.first, Offset);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitIntToPtrInst(IntToPtrInst&) {
+ // clueless
+ return unknown();
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitLoadInst(LoadInst&) {
+ return unknown();
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) {
+ // create 2 PHIs: one for size and another for offset
+ PHINode *SizePHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues());
+ PHINode *OffsetPHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues());
+
+ // insert right away in the cache to handle recursive PHIs
+ CacheMap[&PHI] = std::make_pair(SizePHI, OffsetPHI);
+
+ // compute offset/size for each PHI incoming pointer
+ for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) {
+ Builder.SetInsertPoint(PHI.getIncomingBlock(i)->getFirstInsertionPt());
+ SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i));
+
+ if (!bothKnown(EdgeData)) {
+ OffsetPHI->replaceAllUsesWith(UndefValue::get(IntTy));
+ OffsetPHI->eraseFromParent();
+ SizePHI->replaceAllUsesWith(UndefValue::get(IntTy));
+ SizePHI->eraseFromParent();
+ return unknown();
+ }
+ SizePHI->addIncoming(EdgeData.first, PHI.getIncomingBlock(i));
+ OffsetPHI->addIncoming(EdgeData.second, PHI.getIncomingBlock(i));
+ }
+
+ Value *Size = SizePHI, *Offset = OffsetPHI, *Tmp;
+ if ((Tmp = SizePHI->hasConstantValue())) {
+ Size = Tmp;
+ SizePHI->replaceAllUsesWith(Size);
+ SizePHI->eraseFromParent();
+ }
+ if ((Tmp = OffsetPHI->hasConstantValue())) {
+ Offset = Tmp;
+ OffsetPHI->replaceAllUsesWith(Offset);
+ OffsetPHI->eraseFromParent();
+ }
+ return std::make_pair(Size, Offset);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitSelectInst(SelectInst &I) {
+ // ignore malformed self-looping selects
+ if (I.getTrueValue() == &I || I.getFalseValue() == &I)
+ return unknown();
+
+ SizeOffsetEvalType TrueSide = compute_(I.getTrueValue());
+ SizeOffsetEvalType FalseSide = compute_(I.getFalseValue());
+
+ if (!bothKnown(TrueSide) || !bothKnown(FalseSide))
+ return unknown();
+ if (TrueSide == FalseSide)
+ return TrueSide;
+
+ Value *Size = Builder.CreateSelect(I.getCondition(), TrueSide.first,
+ FalseSide.first);
+ Value *Offset = Builder.CreateSelect(I.getCondition(), TrueSide.second,
+ FalseSide.second);
+ return std::make_pair(Size, Offset);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitInstruction(Instruction &I) {
+ DEBUG(dbgs() << "ObjectSizeOffsetEvaluator unknown instruction:" << I <<'\n');
+ return unknown();
+}
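
A minimal standalone sketch of the (size, offset) pair discipline the evaluator above follows, mirroring visitSelectInst; SizeOffset and selectSizeOffset are illustrative stand-ins, not LLVM API:

#include <cstdint>
#include <utility>

// (allocated size, byte offset into the allocation), both in bytes.
typedef std::pair<uint64_t, uint64_t> SizeOffset;

// A select's pair is the chosen side's pair; if either side is unknown
// (modeled here as a null pointer), the whole result is unknown.
static bool selectSizeOffset(bool Cond, const SizeOffset *TrueSide,
                             const SizeOffset *FalseSide, SizeOffset &Out) {
  if (!TrueSide || !FalseSide)
    return false;                       // unknown()
  Out = Cond ? *TrueSide : *FalseSide;  // CreateSelect, folded here
  return true;
}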
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 3a544f3..059e574 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -16,13 +16,11 @@
#define DEBUG_TYPE "memdep"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Function.h"
#include "llvm/LLVMContext.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
@@ -229,13 +227,18 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
// Otherwise if the two calls don't interact (e.g. InstCS is readnone)
// keep scanning.
- break;
+ continue;
default:
return MemDepResult::getClobber(Inst);
}
}
+
+ // If we could not obtain a pointer for the instruction and the instruction
+ // touches memory then assume that this is a dependency.
+ if (MR != AliasAnalysis::NoModRef)
+ return MemDepResult::getClobber(Inst);
}
-
+
// No dependence found. If this is the entry block of the function, it is
// unknown, otherwise it is non-local.
if (BB != &BB->getParent()->getEntryBlock())
@@ -339,86 +342,6 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
}
}
-namespace {
- /// Only find pointer captures which happen before the given instruction. Uses
- /// the dominator tree to determine whether one instruction is before another.
- struct CapturesBefore : public CaptureTracker {
- CapturesBefore(const Instruction *I, DominatorTree *DT)
- : BeforeHere(I), DT(DT), Captured(false) {}
-
- void tooManyUses() { Captured = true; }
-
- bool shouldExplore(Use *U) {
- Instruction *I = cast<Instruction>(U->getUser());
- BasicBlock *BB = I->getParent();
- if (BeforeHere != I &&
- (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I)))
- return false;
- return true;
- }
-
- bool captured(Use *U) {
- Instruction *I = cast<Instruction>(U->getUser());
- BasicBlock *BB = I->getParent();
- if (BeforeHere != I &&
- (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I)))
- return false;
- Captured = true;
- return true;
- }
-
- const Instruction *BeforeHere;
- DominatorTree *DT;
-
- bool Captured;
- };
-}
-
-AliasAnalysis::ModRefResult
-MemoryDependenceAnalysis::getModRefInfo(const Instruction *Inst,
- const AliasAnalysis::Location &MemLoc) {
- AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc);
- if (MR != AliasAnalysis::ModRef) return MR;
-
- // FIXME: this is really just shoring-up a deficiency in alias analysis.
- // BasicAA isn't willing to spend linear time determining whether an alloca
- // was captured before or after this particular call, while we are. However,
- // with a smarter AA in place, this test is just wasting compile time.
- if (!DT) return AliasAnalysis::ModRef;
- const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD);
- if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object))
- return AliasAnalysis::ModRef;
- ImmutableCallSite CS(Inst);
- if (!CS.getInstruction()) return AliasAnalysis::ModRef;
-
- CapturesBefore CB(Inst, DT);
- llvm::PointerMayBeCaptured(Object, &CB);
-
- if (isa<Constant>(Object) || CS.getInstruction() == Object || CB.Captured)
- return AliasAnalysis::ModRef;
-
- unsigned ArgNo = 0;
- for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
- CI != CE; ++CI, ++ArgNo) {
- // Only look at the no-capture or byval pointer arguments. If this
- // pointer were passed to arguments that were neither of these, then it
- // couldn't be no-capture.
- if (!(*CI)->getType()->isPointerTy() ||
- (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
- continue;
-
- // If this is a no-capture pointer argument, see if we can tell that it
- // is impossible to alias the pointer we're checking. If not, we have to
- // assume that the call could touch the pointer, even though it doesn't
- // escape.
- if (!AA->isNoAlias(AliasAnalysis::Location(*CI),
- AliasAnalysis::Location(Object))) {
- return AliasAnalysis::ModRef;
- }
- }
- return AliasAnalysis::NoModRef;
-}
-
/// getPointerDependencyFrom - Return the instruction on which a memory
/// location depends. If isLoad is true, this routine ignores may-aliases with
/// read-only operations. If isLoad is false, this routine ignores may-aliases
@@ -556,8 +479,7 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
// a subsequent bitcast of the malloc call result. There can be stores to
// the malloced memory between the malloc call and its bitcast uses, and we
// need to continue scanning until the malloc call.
- if (isa<AllocaInst>(Inst) ||
- (isa<CallInst>(Inst) && extractMallocCall(Inst))) {
+ if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst)) {
const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD);
if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr))
@@ -566,7 +488,11 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
}
// See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
- switch (getModRefInfo(Inst, MemLoc)) {
+ AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc);
+ // If necessary, perform additional analysis.
+ if (MR == AliasAnalysis::ModRef)
+ MR = AA->callCapturesBefore(Inst, MemLoc, DT);
+ switch (MR) {
case AliasAnalysis::NoModRef:
// If the call has no effect on the queried pointer, just ignore it.
continue;
@@ -984,7 +910,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
if (!Pair.second) {
if (CacheInfo->Size < Loc.Size) {
// The query's Size is greater than the cached one. Throw out the
- // cached data and procede with the query at the greater size.
+ // cached data and proceed with the query at the greater size.
CacheInfo->Pair = BBSkipFirstBlockPair();
CacheInfo->Size = Loc.Size;
for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(),
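
Two changes interlock in this file: the local CapturesBefore tracker and the getModRefInfo wrapper are deleted above, and their work moves behind AliasAnalysis::callCapturesBefore, which getPointerDependencyFrom now invokes only when the plain query is inconclusive. Condensed from the hunk above (no API beyond what the patch itself uses):

  AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc);
  if (MR == AliasAnalysis::ModRef)                 // cheap answer: may-clobber
    MR = AA->callCapturesBefore(Inst, MemLoc, DT); // try to refine it

The design point is cost gating: the dominance-filtered capture walk is linear in the pointer's uses, so it runs only after the constant-time query has already degraded to ModRef.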
diff --git a/lib/Analysis/ModuleDebugInfoPrinter.cpp b/lib/Analysis/ModuleDebugInfoPrinter.cpp
index e7e999c..f8c7514 100644
--- a/lib/Analysis/ModuleDebugInfoPrinter.cpp
+++ b/lib/Analysis/ModuleDebugInfoPrinter.cpp
@@ -16,10 +16,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/Pass.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
+#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/Statistic.h"
diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp
index 80c5222..d4ad726 100644
--- a/lib/Analysis/PathNumbering.cpp
+++ b/lib/Analysis/PathNumbering.cpp
@@ -31,11 +31,11 @@
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
+#include "llvm/TypeBuilder.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TypeBuilder.h"
#include "llvm/Support/raw_ostream.h"
#include <queue>
diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp
index eaa38da..5c7c97c 100644
--- a/lib/Analysis/ProfileInfoLoader.cpp
+++ b/lib/Analysis/ProfileInfoLoader.cpp
@@ -83,10 +83,8 @@ const unsigned ProfileInfoLoader::Uncounted = ~0U;
// program if the file is invalid or broken.
//
ProfileInfoLoader::ProfileInfoLoader(const char *ToolName,
- const std::string &Filename,
- Module &TheModule) :
- Filename(Filename),
- M(TheModule), Warned(false) {
+ const std::string &Filename)
+ : Filename(Filename) {
FILE *F = fopen(Filename.c_str(), "rb");
if (F == 0) {
errs() << ToolName << ": Error opening '" << Filename << "': ";
diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp
index c4da807..5ecf052 100644
--- a/lib/Analysis/ProfileInfoLoaderPass.cpp
+++ b/lib/Analysis/ProfileInfoLoaderPass.cpp
@@ -152,7 +152,7 @@ void LoaderPass::readEdge(ProfileInfo::Edge e,
}
bool LoaderPass::runOnModule(Module &M) {
- ProfileInfoLoader PIL("profile-loader", Filename, M);
+ ProfileInfoLoader PIL("profile-loader", Filename);
EdgeInformation.clear();
std::vector<unsigned> Counters = PIL.getRawEdgeCounts();
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index b507b1e..868f483 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -47,7 +47,7 @@ static cl::opt<enum Region::PrintStyle> printStyle("print-region-style",
cl::values(
clEnumValN(Region::PrintNone, "none", "print no details"),
clEnumValN(Region::PrintBB, "bb",
- "print regions in detail with block_iterator"),
+ "print regions in detail with block_node_iterator"),
clEnumValN(Region::PrintRN, "rn",
"print regions in detail with element_iterator"),
clEnumValEnd));
@@ -246,19 +246,19 @@ void Region::verifyRegionNest() const {
verifyRegion();
}
-Region::block_iterator Region::block_begin() {
+Region::block_node_iterator Region::block_node_begin() {
return GraphTraits<FlatIt<Region*> >::nodes_begin(this);
}
-Region::block_iterator Region::block_end() {
+Region::block_node_iterator Region::block_node_end() {
return GraphTraits<FlatIt<Region*> >::nodes_end(this);
}
-Region::const_block_iterator Region::block_begin() const {
+Region::const_block_node_iterator Region::block_node_begin() const {
return GraphTraits<FlatIt<const Region*> >::nodes_begin(this);
}
-Region::const_block_iterator Region::block_end() const {
+Region::const_block_node_iterator Region::block_node_end() const {
return GraphTraits<FlatIt<const Region*> >::nodes_end(this);
}
@@ -425,7 +425,9 @@ void Region::print(raw_ostream &OS, bool print_tree, unsigned level,
OS.indent(level*2 + 2);
if (Style == PrintBB) {
- for (const_block_iterator I = block_begin(), E = block_end(); I!=E; ++I)
+ for (const_block_node_iterator I = block_node_begin(),
+ E = block_node_end();
+ I != E; ++I)
OS << **I << ", "; // TODO: remove the last ","
} else if (Style == PrintRN) {
for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I)
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 3a3529b..c97b5eb 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -195,7 +195,8 @@ public:
virtual bool runOnRegion(Region *R, RGPassManager &RGM) {
Out << Banner;
- for (Region::block_iterator I = R->block_begin(), E = R->block_end();
+ for (Region::block_node_iterator I = R->block_node_begin(),
+ E = R->block_node_end();
I != E; ++I)
(*I)->getEntry()->print(Out);
diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp
index a1730b0..8b23cc7 100644
--- a/lib/Analysis/RegionPrinter.cpp
+++ b/lib/Analysis/RegionPrinter.cpp
@@ -122,13 +122,11 @@ struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> {
RegionInfo *RI = R->getRegionInfo();
for (Region::const_block_iterator BI = R->block_begin(),
- BE = R->block_end(); BI != BE; ++BI) {
- BasicBlock *BB = (*BI)->getNodeAs<BasicBlock>();
- if (RI->getRegionFor(BB) == R)
+ BE = R->block_end(); BI != BE; ++BI)
+ if (RI->getRegionFor(*BI) == R)
O.indent(2 * (depth + 1)) << "Node"
- << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(BB))
+ << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(*BI))
<< ";\n";
- }
O.indent(2 * depth) << "}\n";
}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 205227c..a654648 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -826,8 +826,7 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
// Fold if the operand is constant.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
return getConstant(
- cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(),
- getEffectiveSCEVType(Ty))));
+ cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), Ty)));
// trunc(trunc(x)) --> trunc(x)
if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op))
@@ -879,13 +878,6 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap);
}
- // As a special case, fold trunc(undef) to undef. We don't want to
- // know too much about SCEVUnknowns, but this special case is handy
- // and harmless.
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op))
- if (isa<UndefValue>(U->getValue()))
- return getSCEV(UndefValue::get(Ty));
-
// The cast wasn't folded; create an explicit cast node. We can reuse
// the existing insert position since if we get here, we won't have
// made any changes which would invalidate it.
@@ -906,8 +898,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
// Fold if the operand is constant.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
return getConstant(
- cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(),
- getEffectiveSCEVType(Ty))));
+ cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty)));
// zext(zext(x)) --> zext(x)
if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
@@ -976,12 +967,15 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no unsigned overflow.
const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step);
- const SCEV *Add = getAddExpr(Start, ZMul);
+ const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul), WideTy);
+ const SCEV *WideStart = getZeroExtendExpr(Start, WideTy);
+ const SCEV *WideMaxBECount =
+ getZeroExtendExpr(CastedMaxBECount, WideTy);
const SCEV *OperandExtendedAdd =
- getAddExpr(getZeroExtendExpr(Start, WideTy),
- getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
+ getAddExpr(WideStart,
+ getMulExpr(WideMaxBECount,
getZeroExtendExpr(Step, WideTy)));
- if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd) {
+ if (ZAdd == OperandExtendedAdd) {
// Cache knowledge of AR NUW, which is propagated to this AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
// Return the expression with the addrec on the outside.
@@ -991,13 +985,11 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
}
// Similar to above, only this time treat the step value as signed.
// This covers loops that count down.
- const SCEV *SMul = getMulExpr(CastedMaxBECount, Step);
- Add = getAddExpr(Start, SMul);
OperandExtendedAdd =
- getAddExpr(getZeroExtendExpr(Start, WideTy),
- getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
+ getAddExpr(WideStart,
+ getMulExpr(WideMaxBECount,
getSignExtendExpr(Step, WideTy)));
- if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd) {
+ if (ZAdd == OperandExtendedAdd) {
// Cache knowledge of AR NW, which is propagated to this AddRec.
// Negative step causes unsigned wrap, but it still can't self-wrap.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
@@ -1164,8 +1156,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
// Fold if the operand is constant.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
return getConstant(
- cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(),
- getEffectiveSCEVType(Ty))));
+ cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty)));
// sext(sext(x)) --> sext(x)
if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
@@ -1242,12 +1233,15 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no signed overflow.
const SCEV *SMul = getMulExpr(CastedMaxBECount, Step);
- const SCEV *Add = getAddExpr(Start, SMul);
+ const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul), WideTy);
+ const SCEV *WideStart = getSignExtendExpr(Start, WideTy);
+ const SCEV *WideMaxBECount =
+ getZeroExtendExpr(CastedMaxBECount, WideTy);
const SCEV *OperandExtendedAdd =
- getAddExpr(getSignExtendExpr(Start, WideTy),
- getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
+ getAddExpr(WideStart,
+ getMulExpr(WideMaxBECount,
getSignExtendExpr(Step, WideTy)));
- if (getSignExtendExpr(Add, WideTy) == OperandExtendedAdd) {
+ if (SAdd == OperandExtendedAdd) {
// Cache knowledge of AR NSW, which is propagated to this AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
// Return the expression with the addrec on the outside.
@@ -1257,13 +1251,11 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
}
// Similar to above, only this time treat the step value as unsigned.
// This covers loops that count up with an unsigned step.
- const SCEV *UMul = getMulExpr(CastedMaxBECount, Step);
- Add = getAddExpr(Start, UMul);
OperandExtendedAdd =
- getAddExpr(getSignExtendExpr(Start, WideTy),
- getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
+ getAddExpr(WideStart,
+ getMulExpr(WideMaxBECount,
getZeroExtendExpr(Step, WideTy)));
- if (getSignExtendExpr(Add, WideTy) == OperandExtendedAdd) {
+ if (SAdd == OperandExtendedAdd) {
// Cache knowledge of AR NSW, which is propagated to this AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
// Return the expression with the addrec on the outside.
@@ -1345,13 +1337,6 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW);
}
- // As a special case, fold anyext(undef) to undef. We don't want to
- // know too much about SCEVUnknowns, but this special case is handy
- // and harmless.
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op))
- if (isa<UndefValue>(U->getValue()))
- return getSCEV(UndefValue::get(Ty));
-
// If the expression is obviously signed, use the sext cast value.
if (isa<SCEVSMaxExpr>(Op))
return SExt;
@@ -1839,7 +1824,7 @@ static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) {
/// Compute the result of "n choose k", the binomial coefficient. If an
/// intermediate computation overflows, Overflow will be set and the return will
-/// be garbage. Overflow is not cleared on absense of overflow.
+/// be garbage. Overflow is not cleared on absence of overflow.
static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) {
// We use the multiplicative formula:
// n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 .
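
A hedged, self-contained rendering of that multiplicative scheme (assumes k <= n; umul_ov here detects unsigned wrap by division, which may differ in detail from the real helper):

#include <cstdint>

static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) {
  uint64_t R = i * j;
  if (j && R / j != i)
    Overflow = true;   // the multiply wrapped; set, never cleared
  return R;
}

static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) {
  // Divide by i at step i: any i consecutive integers contain a multiple
  // of i, so the running value stays integral throughout.
  uint64_t R = 1;
  for (uint64_t i = 1; i <= k; ++i)
    R = umul_ov(R, n - (i - 1), Overflow) / i;
  return R;
}

For example, Choose(4, 2, Ov) computes 1*4/1 = 4, then 4*3/2 = 6.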
@@ -2038,63 +2023,67 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
for (unsigned OtherIdx = Idx+1;
OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
++OtherIdx) {
- if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
- // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
- // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
- // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z
- // ]]],+,...up to x=2n}.
- // Note that the arguments to choose() are always integers with values
- // known at compile time, never SCEV objects.
- //
- // The implementation avoids pointless extra computations when the two
- // addrec's are of different length (mathematically, it's equivalent to
- // an infinite stream of zeros on the right).
- bool OpsModified = false;
- for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
- if (const SCEVAddRecExpr *OtherAddRec =
- dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]))
- if (OtherAddRec->getLoop() == AddRecLoop) {
- bool Overflow = false;
- Type *Ty = AddRec->getType();
- bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
- SmallVector<const SCEV*, 7> AddRecOps;
- for (int x = 0, xe = AddRec->getNumOperands() +
- OtherAddRec->getNumOperands() - 1;
- x != xe && !Overflow; ++x) {
- const SCEV *Term = getConstant(Ty, 0);
- for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
- uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
- for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
- ze = std::min(x+1, (int)OtherAddRec->getNumOperands());
- z < ze && !Overflow; ++z) {
- uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow);
- uint64_t Coeff;
- if (LargerThan64Bits)
- Coeff = umul_ov(Coeff1, Coeff2, Overflow);
- else
- Coeff = Coeff1*Coeff2;
- const SCEV *CoeffTerm = getConstant(Ty, Coeff);
- const SCEV *Term1 = AddRec->getOperand(y-z);
- const SCEV *Term2 = OtherAddRec->getOperand(z);
- Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2));
- }
- }
- AddRecOps.push_back(Term);
- }
- if (!Overflow) {
- const SCEV *NewAddRec = getAddRecExpr(AddRecOps,
- AddRec->getLoop(),
- SCEV::FlagAnyWrap);
- if (Ops.size() == 2) return NewAddRec;
- Ops[Idx] = AddRec = cast<SCEVAddRecExpr>(NewAddRec);
- Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
- OpsModified = true;
- }
+ if (AddRecLoop != cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop())
+ continue;
+
+ // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
+ // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
+ // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z
+ // ]]],+,...up to x=2n}.
+ // Note that the arguments to choose() are always integers with values
+ // known at compile time, never SCEV objects.
+ //
+ // The implementation avoids pointless extra computations when the two
+ // addrec's are of different length (mathematically, it's equivalent to
+ // an infinite stream of zeros on the right).
+ bool OpsModified = false;
+ for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
+ ++OtherIdx) {
+ const SCEVAddRecExpr *OtherAddRec =
+ dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]);
+ if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop)
+ continue;
+
+ bool Overflow = false;
+ Type *Ty = AddRec->getType();
+ bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
+ SmallVector<const SCEV*, 7> AddRecOps;
+ for (int x = 0, xe = AddRec->getNumOperands() +
+ OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) {
+ const SCEV *Term = getConstant(Ty, 0);
+ for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
+ uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
+ for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
+ ze = std::min(x+1, (int)OtherAddRec->getNumOperands());
+ z < ze && !Overflow; ++z) {
+ uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow);
+ uint64_t Coeff;
+ if (LargerThan64Bits)
+ Coeff = umul_ov(Coeff1, Coeff2, Overflow);
+ else
+ Coeff = Coeff1*Coeff2;
+ const SCEV *CoeffTerm = getConstant(Ty, Coeff);
+ const SCEV *Term1 = AddRec->getOperand(y-z);
+ const SCEV *Term2 = OtherAddRec->getOperand(z);
+ Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2));
}
- if (OpsModified)
- return getMulExpr(Ops);
+ }
+ AddRecOps.push_back(Term);
+ }
+ if (!Overflow) {
+ const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(),
+ SCEV::FlagAnyWrap);
+ if (Ops.size() == 2) return NewAddRec;
+ Ops[Idx] = NewAddRec;
+ Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+ OpsModified = true;
+ AddRec = dyn_cast<SCEVAddRecExpr>(NewAddRec);
+ if (!AddRec)
+ break;
+ }
}
+ if (OpsModified)
+ return getMulExpr(Ops);
}
// Otherwise couldn't fold anything into this recurrence. Move onto the
@@ -2723,7 +2712,7 @@ const SCEV *ScalarEvolution::getCouldNotCompute() {
const SCEV *ScalarEvolution::getSCEV(Value *V) {
assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
- ValueExprMapType::const_iterator I = ValueExprMap.find(V);
+ ValueExprMapType::const_iterator I = ValueExprMap.find_as(V);
if (I != ValueExprMap.end()) return I->second;
const SCEV *S = createSCEV(V);
@@ -2960,7 +2949,7 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) {
if (!Visited.insert(I)) continue;
ValueExprMapType::iterator It =
- ValueExprMap.find(static_cast<Value *>(I));
+ ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
const SCEV *Old = It->second;
@@ -3017,7 +3006,7 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
if (BEValueV && StartValueV) {
// While we are analyzing this PHI node, handle its value symbolically.
const SCEV *SymbolicName = getUnknown(PN);
- assert(ValueExprMap.find(PN) == ValueExprMap.end() &&
+ assert(ValueExprMap.find_as(PN) == ValueExprMap.end() &&
"PHI node already processed?");
ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName));
@@ -4081,7 +4070,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
if (!Visited.insert(I)) continue;
ValueExprMapType::iterator It =
- ValueExprMap.find(static_cast<Value *>(I));
+ ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
const SCEV *Old = It->second;
@@ -4132,7 +4121,8 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
Instruction *I = Worklist.pop_back_val();
if (!Visited.insert(I)) continue;
- ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I));
+ ValueExprMapType::iterator It =
+ ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
forgetMemoizedResults(It->second);
ValueExprMap.erase(It);
@@ -4165,7 +4155,8 @@ void ScalarEvolution::forgetValue(Value *V) {
I = Worklist.pop_back_val();
if (!Visited.insert(I)) continue;
- ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I));
+ ValueExprMapType::iterator It =
+ ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
forgetMemoizedResults(It->second);
ValueExprMap.erase(It);
@@ -5379,6 +5370,12 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
SqrtTerm *= B;
SqrtTerm -= Four * (A * C);
+ if (SqrtTerm.isNegative()) {
+ // The loop is provably infinite.
+ const SCEV *CNC = SE.getCouldNotCompute();
+ return std::make_pair(CNC, CNC);
+ }
+
// Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
// integer value or else APInt::sqrt() will assert.
APInt SqrtVal(SqrtTerm.sqrt());
@@ -5481,7 +5478,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
// to 0, it must be counting down to equal 0. Consequently, N = Start / -Step.
// We have not yet seen any such cases.
const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step);
- if (StepC == 0)
+ if (StepC == 0 || StepC->getValue()->equalsInt(0))
return getCouldNotCompute();
// For positive steps (counting up until unsigned overflow):
@@ -5602,9 +5599,14 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) {
/// predicate Pred. Return true iff any changes were made.
///
bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
- const SCEV *&LHS, const SCEV *&RHS) {
+ const SCEV *&LHS, const SCEV *&RHS,
+ unsigned Depth) {
bool Changed = false;
+  // If we hit the max recursion limit, bail out.
+ if (Depth >= 3)
+ return false;
+
// Canonicalize a constant to the right side.
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) {
// Check for both operands constant.
@@ -5642,6 +5644,16 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_NE:
+ // Fold ((-1) * %a) + %b == 0 (equivalent to %b-%a == 0) into %a == %b.
+ if (!RA)
+ if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(LHS))
+ if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(AE->getOperand(0)))
+ if (AE->getNumOperands() == 2 && ME->getNumOperands() == 2 &&
+ ME->getOperand(0)->isAllOnesValue()) {
+ RHS = AE->getOperand(1);
+ LHS = ME->getOperand(1);
+ Changed = true;
+ }
break;
case ICmpInst::ICMP_UGE:
if ((RA - 1).isMinValue()) {
@@ -5843,6 +5855,11 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
// TODO: More simplifications are possible here.
+ // Recursively simplify until we either hit a recursion limit or nothing
+ // changes.
+ if (Changed)
+ return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1);
+
return Changed;
trivially_true:
@@ -6040,12 +6057,34 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
return false;
}
+/// RAII wrapper to prevent recursive application of isImpliedCond.
+/// ScalarEvolution's PendingLoopPredicates set must be empty unless we are
+/// currently evaluating isImpliedCond.
+struct MarkPendingLoopPredicate {
+ Value *Cond;
+ DenseSet<Value*> &LoopPreds;
+ bool Pending;
+
+ MarkPendingLoopPredicate(Value *C, DenseSet<Value*> &LP)
+ : Cond(C), LoopPreds(LP) {
+ Pending = !LoopPreds.insert(Cond).second;
+ }
+ ~MarkPendingLoopPredicate() {
+ if (!Pending)
+ LoopPreds.erase(Cond);
+ }
+};
+
/// isImpliedCond - Test whether the condition described by Pred, LHS,
/// and RHS is true whenever the given Cond value evaluates to true.
bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
Value *FoundCondValue,
bool Inverse) {
+ MarkPendingLoopPredicate Mark(FoundCondValue, PendingLoopPredicates);
+ if (Mark.Pending)
+ return false;
+
// Recursively handle And and Or conditions.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FoundCondValue)) {
if (BO->getOpcode() == Instruction::And) {
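
The guard above is a reusable shape: RAII insertion into a pending set, where a failed insert means the same key is already being evaluated higher up the stack. A hedged generic restatement (ReentryGuard is illustrative, not LLVM code):

#include <set>

struct ReentryGuard {
  std::set<const void *> &Pending;
  const void *Key;
  bool AlreadyPending;   // true => a caller is already evaluating Key

  ReentryGuard(std::set<const void *> &P, const void *K)
      : Pending(P), Key(K), AlreadyPending(!P.insert(K).second) {}
  ~ReentryGuard() {
    if (!AlreadyPending)
      Pending.erase(Key); // only the frame that inserted the key erases it
  }
};

Callers bail out when AlreadyPending is set, exactly as isImpliedCond returns false on Mark.Pending.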
@@ -6572,6 +6611,8 @@ void ScalarEvolution::releaseMemory() {
I->second.clear();
}
+ assert(PendingLoopPredicates.empty() && "isImpliedCond garbage");
+
BackedgeTakenCounts.clear();
ConstantEvolutionLoopExitValue.clear();
ValuesAtScopes.clear();
@@ -6859,44 +6900,27 @@ bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) {
return getBlockDisposition(S, BB) == ProperlyDominatesBlock;
}
-bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const {
- switch (S->getSCEVType()) {
- case scConstant:
- return false;
- case scTruncate:
- case scZeroExtend:
- case scSignExtend: {
- const SCEVCastExpr *Cast = cast<SCEVCastExpr>(S);
- const SCEV *CastOp = Cast->getOperand();
- return Op == CastOp || hasOperand(CastOp, Op);
- }
- case scAddRecExpr:
- case scAddExpr:
- case scMulExpr:
- case scUMaxExpr:
- case scSMaxExpr: {
- const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S);
- for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
- I != E; ++I) {
- const SCEV *NAryOp = *I;
- if (NAryOp == Op || hasOperand(NAryOp, Op))
- return true;
- }
- return false;
- }
- case scUDivExpr: {
- const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S);
- const SCEV *LHS = UDiv->getLHS(), *RHS = UDiv->getRHS();
- return LHS == Op || hasOperand(LHS, Op) ||
- RHS == Op || hasOperand(RHS, Op);
- }
- case scUnknown:
- return false;
- case scCouldNotCompute:
- llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
- default:
- llvm_unreachable("Unknown SCEV kind!");
+namespace {
+// Search for a SCEV expression node within an expression tree.
+// Implements SCEVTraversal::Visitor.
+struct SCEVSearch {
+ const SCEV *Node;
+ bool IsFound;
+
+ SCEVSearch(const SCEV *N): Node(N), IsFound(false) {}
+
+ bool follow(const SCEV *S) {
+ IsFound |= (S == Node);
+ return !IsFound;
}
+ bool isDone() const { return IsFound; }
+};
+}
+
+bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const {
+ SCEVSearch Search(Op);
+ visitAll(S, Search);
+ return Search.IsFound;
}
void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
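
The replacement relies on the small visitor contract that visitAll drives: follow decides whether to descend into an expression's operands, and isDone allows an early cutoff. A hedged second instance of the same contract (CountNodes is illustrative and assumes the surrounding SCEV headers):

struct CountNodes {
  unsigned Count;
  CountNodes() : Count(0) {}
  bool follow(const SCEV *) { ++Count; return true; } // always descend
  bool isDone() const { return false; }               // never cut the walk short
};

// Usage sketch: CountNodes C; visitAll(S, C);  C.Count is the node count.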
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 69507be..62710c5 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -37,7 +37,7 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
// We use this precondition to produce a cast that will dominate all its
// uses. In particular, this is crucial for the case where the builder's
// insertion point *is* the point where we were asked to put the cast.
- // Since we don't know the the builder's insertion point is actually
+ // Since we don't know the builder's insertion point is actually
// where the uses will be added (only that it dominates it), we are
// not allowed to move it.
BasicBlock::iterator BIP = Builder.GetInsertPoint();
@@ -955,7 +955,8 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
// InsertPos must itself dominate IncV so that IncV's new position satisfies
// its existing users.
- if (!SE.DT->dominates(InsertPos->getParent(), IncV->getParent()))
+ if (isa<PHINode>(InsertPos)
+ || !SE.DT->dominates(InsertPos->getParent(), IncV->getParent()))
return false;
// Check that the chain of IV operands leading back to Phi can be hoisted.
@@ -1699,3 +1700,44 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
}
return NumElim;
}
+
+namespace {
+// Search for a SCEV subexpression that is not safe to expand. Any expression
+// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
+// UDiv expressions. We don't know if the UDiv is derived from an IR divide
+// instruction, but the important thing is that we prove the denominator is
+// nonzero before expansion.
+//
+// IVUsers already checks that IV-derived expressions are safe. So this check is
+// only needed when the expression includes some subexpression that is not IV
+// derived.
+//
+// Currently, we only allow division by a nonzero constant here. If this is
+// inadequate, we could easily allow division by SCEVUnknown by using
+// ValueTracking to check isKnownNonZero().
+struct SCEVFindUnsafe {
+ bool IsUnsafe;
+
+ SCEVFindUnsafe(): IsUnsafe(false) {}
+
+ bool follow(const SCEV *S) {
+ const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S);
+ if (!D)
+ return true;
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
+ if (SC && !SC->getValue()->isZero())
+ return true;
+ IsUnsafe = true;
+ return false;
+ }
+ bool isDone() const { return IsUnsafe; }
+};
+}
+
+namespace llvm {
+bool isSafeToExpand(const SCEV *S) {
+ SCEVFindUnsafe Search;
+ visitAll(S, Search);
+ return !Search.IsUnsafe;
+}
+}
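
A hedged usage sketch of the new predicate, guarding expansion before SCEVExpander materializes IR (Expander, BECount, Ty, and InsertPt are illustrative names):

  if (isSafeToExpand(BECount)) {
    Value *TripCount = Expander.expandCodeFor(BECount, Ty, InsertPt);
    (void)TripCount; // e.g. feed a runtime guard
  }

Skipping expansion on failure is the conservative choice: a UDiv whose denominator is not a provably nonzero constant must not be hoisted to a point where it might execute unconditionally.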
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 1418e01..cea34e1 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -694,7 +694,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
// taking conservative care to avoid excessive recursion.
if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) {
// Skip if every incoming value references to ourself.
- if (P->hasConstantValue() == P)
+ if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
break;
KnownZero = APInt::getAllOnesValue(BitWidth);
@@ -1796,6 +1796,37 @@ llvm::GetUnderlyingObject(Value *V, const TargetData *TD, unsigned MaxLookup) {
return V;
}
+void
+llvm::GetUnderlyingObjects(Value *V,
+ SmallVectorImpl<Value *> &Objects,
+ const TargetData *TD,
+ unsigned MaxLookup) {
+ SmallPtrSet<Value *, 4> Visited;
+ SmallVector<Value *, 4> Worklist;
+ Worklist.push_back(V);
+ do {
+ Value *P = Worklist.pop_back_val();
+ P = GetUnderlyingObject(P, TD, MaxLookup);
+
+ if (!Visited.insert(P))
+ continue;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
+ Worklist.push_back(SI->getTrueValue());
+ Worklist.push_back(SI->getFalseValue());
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(P)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ Worklist.push_back(PN->getIncomingValue(i));
+ continue;
+ }
+
+ Objects.push_back(P);
+ } while (!Worklist.empty());
+}
+
/// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer
/// are lifetime markers.
///
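
A hedged usage sketch of the new helper: gather every underlying object reachable through selects and PHIs, then require a property of all of them (Ptr and TD are illustrative):

  SmallVector<Value *, 8> Objects;
  GetUnderlyingObjects(Ptr, Objects, TD);
  bool AllIdentified = true;
  for (unsigned i = 0, e = Objects.size(); i != e; ++i)
    AllIdentified &= isIdentifiedObject(Objects[i]);

Unlike GetUnderlyingObject, which returns the select or PHI itself when it cannot look through it, this variant fans out through both, so a caller gets the full candidate set rather than the merge point.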
diff --git a/lib/Archive/ArchiveReader.cpp b/lib/Archive/ArchiveReader.cpp
index 68873e2..5cfc810 100644
--- a/lib/Archive/ArchiveReader.cpp
+++ b/lib/Archive/ArchiveReader.cpp
@@ -82,14 +82,9 @@ Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
ArchiveMemberHeader* Hdr = (ArchiveMemberHeader*)At;
At += sizeof(ArchiveMemberHeader);
- // Extract the size and determine if the file is
- // compressed or not (negative length).
int flags = 0;
int MemberSize = atoi(Hdr->size);
- if (MemberSize < 0) {
- flags |= ArchiveMember::CompressedFlag;
- MemberSize = -MemberSize;
- }
+ assert(MemberSize >= 0);
// Check the size of the member for sanity
if (At + MemberSize > End) {
diff --git a/lib/Archive/ArchiveWriter.cpp b/lib/Archive/ArchiveWriter.cpp
index 9ef2943..ec6b4b8 100644
--- a/lib/Archive/ArchiveWriter.cpp
+++ b/lib/Archive/ArchiveWriter.cpp
@@ -204,7 +204,6 @@ Archive::writeMember(
std::ofstream& ARFile,
bool CreateSymbolTable,
bool TruncateNames,
- bool ShouldCompress,
std::string* ErrMsg
) {
@@ -349,7 +348,7 @@ Archive::writeSymbolTable(std::ofstream& ARFile) {
// table, flattening the file names (no directories, 15 chars max) and
// compressing each archive member.
bool
-Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress,
+Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames,
std::string* ErrMsg)
{
// Make sure they haven't opened up the file, not loaded it,
@@ -394,7 +393,7 @@ Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress,
// builds the symbol table, symTab.
for (MembersList::iterator I = begin(), E = end(); I != E; ++I) {
if (writeMember(*I, ArchiveFile, CreateSymbolTable,
- TruncateNames, Compress, ErrMsg)) {
+ TruncateNames, ErrMsg)) {
TmpArchive.eraseFromDisk();
ArchiveFile.close();
return true;
@@ -446,7 +445,7 @@ Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress,
// compatibility with other ar(1) implementations as well as allowing the
// archive to store both native .o and LLVM .bc files, both indexed.
if (foreignST) {
- if (writeMember(*foreignST, FinalFile, false, false, false, ErrMsg)) {
+ if (writeMember(*foreignST, FinalFile, false, false, ErrMsg)) {
FinalFile.close();
TmpArchive.eraseFromDisk();
return true;
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 8818168..481733d 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -474,6 +474,9 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(extern_weak);
KEYWORD(external);
KEYWORD(thread_local);
+ KEYWORD(localdynamic);
+ KEYWORD(initialexec);
+ KEYWORD(localexec);
KEYWORD(zeroinitializer);
KEYWORD(undef);
KEYWORD(null);
@@ -550,6 +553,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(naked);
KEYWORD(nonlazybind);
KEYWORD(address_safety);
+ KEYWORD(ia_nsdialect);
KEYWORD(type);
KEYWORD(opaque);
@@ -673,11 +677,12 @@ lltok::Kind LLLexer::LexIdentifier() {
/// HexFP80Constant 0xK[0-9A-Fa-f]+
/// HexFP128Constant 0xL[0-9A-Fa-f]+
/// HexPPC128Constant 0xM[0-9A-Fa-f]+
+/// HexHalfConstant 0xH[0-9A-Fa-f]+
lltok::Kind LLLexer::Lex0x() {
CurPtr = TokStart + 2;
char Kind;
- if (CurPtr[0] >= 'K' && CurPtr[0] <= 'M') {
+ if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H') {
Kind = *CurPtr++;
} else {
Kind = 'J';
@@ -718,6 +723,9 @@ lltok::Kind LLLexer::Lex0x() {
HexToIntPair(TokStart+3, CurPtr, Pair);
APFloatVal = APFloat(APInt(128, Pair));
return lltok::APFloat;
+ case 'H':
+ APFloatVal = APFloat(APInt(16,HexIntToVal(TokStart+3, CurPtr)));
+ return lltok::APFloat;
}
}
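
The new 0xH form follows the existing 0xK/0xL/0xM convention: the digits are the raw 16-bit pattern of an IEEE-754 half, handed to APFloat via the APInt(16, ...) construction above. For example, half 0xH3C00 denotes 1.0 (sign 0, exponent 01111, zero mantissa).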
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 068be3d..0ff8edd 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -645,12 +645,13 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
unsigned Linkage, bool HasLinkage,
unsigned Visibility) {
unsigned AddrSpace;
- bool ThreadLocal, IsConstant, UnnamedAddr;
+ bool IsConstant, UnnamedAddr;
+ GlobalVariable::ThreadLocalMode TLM;
LocTy UnnamedAddrLoc;
LocTy TyLoc;
Type *Ty = 0;
- if (ParseOptionalToken(lltok::kw_thread_local, ThreadLocal) ||
+ if (ParseOptionalThreadLocal(TLM) ||
ParseOptionalAddrSpace(AddrSpace) ||
ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
&UnnamedAddrLoc) ||
@@ -691,7 +692,8 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
if (GV == 0) {
GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, 0,
- Name, 0, false, AddrSpace);
+ Name, 0, GlobalVariable::NotThreadLocal,
+ AddrSpace);
} else {
if (GV->getType()->getElementType() != Ty)
return Error(TyLoc,
@@ -710,7 +712,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
GV->setConstant(IsConstant);
GV->setLinkage((GlobalValue::LinkageTypes)Linkage);
GV->setVisibility((GlobalValue::VisibilityTypes)Visibility);
- GV->setThreadLocal(ThreadLocal);
+ GV->setThreadLocalMode(TLM);
GV->setUnnamedAddr(UnnamedAddr);
// Parse attributes on the global.
@@ -858,6 +860,46 @@ bool LLParser::ParseUInt32(unsigned &Val) {
return false;
}
+/// ParseTLSModel
+/// := 'localdynamic'
+/// := 'initialexec'
+/// := 'localexec'
+bool LLParser::ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM) {
+ switch (Lex.getKind()) {
+ default:
+ return TokError("expected localdynamic, initialexec or localexec");
+ case lltok::kw_localdynamic:
+ TLM = GlobalVariable::LocalDynamicTLSModel;
+ break;
+ case lltok::kw_initialexec:
+ TLM = GlobalVariable::InitialExecTLSModel;
+ break;
+ case lltok::kw_localexec:
+ TLM = GlobalVariable::LocalExecTLSModel;
+ break;
+ }
+
+ Lex.Lex();
+ return false;
+}
+
+/// ParseOptionalThreadLocal
+/// := /*empty*/
+/// := 'thread_local'
+/// := 'thread_local' '(' tlsmodel ')'
+bool LLParser::ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM) {
+ TLM = GlobalVariable::NotThreadLocal;
+ if (!EatIfPresent(lltok::kw_thread_local))
+ return false;
+
+ TLM = GlobalVariable::GeneralDynamicTLSModel;
+ if (Lex.getKind() == lltok::lparen) {
+ Lex.Lex();
+ return ParseTLSModel(TLM) ||
+ ParseToken(lltok::rparen, "expected ')' after thread local model");
+ }
+ return false;
+}
/// ParseOptionalAddrSpace
/// := /*empty*/
@@ -920,6 +962,7 @@ bool LLParser::ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind) {
case lltok::kw_naked: Attrs |= Attribute::Naked; break;
case lltok::kw_nonlazybind: Attrs |= Attribute::NonLazyBind; break;
case lltok::kw_address_safety: Attrs |= Attribute::AddressSafety; break;
+ case lltok::kw_ia_nsdialect: Attrs |= Attribute::IANSDialect; break;
case lltok::kw_alignstack: {
unsigned Alignment;
@@ -2692,7 +2735,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
if (FuncAttrs != Attribute::None)
Attrs.push_back(AttributeWithIndex::get(~0, FuncAttrs));
- AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+ AttrListPtr PAL = AttrListPtr::get(Attrs);
if (PAL.paramHasAttr(1, Attribute::StructRet) && !RetType->isVoidTy())
return Error(RetTypeLoc, "functions with 'sret' argument must return void");
@@ -3239,7 +3282,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
// Finish off the Attributes and check them
- AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+ AttrListPtr PAL = AttrListPtr::get(Attrs);
InvokeInst *II = InvokeInst::Create(Callee, NormalBB, UnwindBB, Args);
II->setCallingConv(CC);
@@ -3635,7 +3678,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
// Finish off the Attributes and check them
- AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+ AttrListPtr PAL = AttrListPtr::get(Attrs);
CallInst *CI = CallInst::Create(Callee, Args);
CI->setTailCall(isTail);
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index dda8808..257c726 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -171,6 +171,9 @@ namespace llvm {
Loc = Lex.getLoc();
return ParseUInt32(Val);
}
+
+ bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
+ bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
bool ParseOptionalAddrSpace(unsigned &AddrSpace);
bool ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind);
bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage);
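
With these two methods (and the lexer keywords added earlier), textual IR can pin a TLS model per global, e.g. @x = thread_local(initialexec) global i32 0, where @x is an illustrative name. A bare thread_local keeps the general-dynamic default that ParseOptionalThreadLocal assigns, and omitting it yields NotThreadLocal.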
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index adf5d4f..0b0b980 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -44,13 +44,14 @@ namespace lltok {
kw_unnamed_addr,
kw_extern_weak,
kw_external, kw_thread_local,
+ kw_localdynamic, kw_initialexec, kw_localexec,
kw_zeroinitializer,
kw_undef, kw_null,
kw_to,
kw_tail,
kw_target,
kw_triple,
- kw_unwind,
+ kw_unwind,
kw_deplibs,
kw_datalayout,
kw_volatile,
@@ -104,6 +105,7 @@ namespace lltok {
kw_naked,
kw_nonlazybind,
kw_address_safety,
+ kw_ia_nsdialect,
kw_type,
kw_opaque,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index e399040..4ffee38 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -28,6 +28,10 @@
#include "llvm/OperandTraits.h"
using namespace llvm;
+enum {
+  SWITCH_INST_MAGIC = 0x4B5 // "May 2012" => 1205 (decimal) => 0x4B5 (hex)
+};
+
void BitcodeReader::materializeForwardReferencedFunctions() {
while (!BlockAddrFwdRefs.empty()) {
Function *F = BlockAddrFwdRefs.begin()->first;
@@ -57,7 +61,7 @@ void BitcodeReader::FreeState() {
/// ConvertToString - Convert a string from a record into an std::string, return
/// true on failure.
template<typename StrTy>
-static bool ConvertToString(SmallVector<uint64_t, 64> &Record, unsigned Idx,
+static bool ConvertToString(ArrayRef<uint64_t> Record, unsigned Idx,
StrTy &Result) {
if (Idx > Record.size())
return true;
@@ -98,6 +102,17 @@ static GlobalValue::VisibilityTypes GetDecodedVisibility(unsigned Val) {
}
}
+static GlobalVariable::ThreadLocalMode GetDecodedThreadLocalMode(unsigned Val) {
+ switch (Val) {
+ case 0: return GlobalVariable::NotThreadLocal;
+ default: // Map unknown non-zero value to general dynamic.
+ case 1: return GlobalVariable::GeneralDynamicTLSModel;
+ case 2: return GlobalVariable::LocalDynamicTLSModel;
+ case 3: return GlobalVariable::InitialExecTLSModel;
+ case 4: return GlobalVariable::LocalExecTLSModel;
+ }
+}
+
static int GetDecodedCastOpcode(unsigned Val) {
switch (Val) {
default: return -1;
@@ -458,61 +473,19 @@ bool BitcodeReader::ParseAttributeBlock() {
if (Record.size() & 1)
return Error("Invalid ENTRY record");
- // FIXME : Remove this autoupgrade code in LLVM 3.0.
- // If Function attributes are using index 0 then transfer them
- // to index ~0. Index 0 is used for return value attributes but used to be
- // used for function attributes.
- Attributes RetAttribute;
- Attributes FnAttribute;
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
- // FIXME: remove in LLVM 3.0
- // The alignment is stored as a 16-bit raw value from bits 31--16.
- // We shift the bits above 31 down by 11 bits.
-
- unsigned Alignment = (Record[i+1] & (0xffffull << 16)) >> 16;
- if (Alignment && !isPowerOf2_32(Alignment))
- return Error("Alignment is not a power of two.");
-
- Attributes ReconstitutedAttr(Record[i+1] & 0xffff);
- if (Alignment)
- ReconstitutedAttr |= Attribute::constructAlignmentFromInt(Alignment);
- ReconstitutedAttr |=
- Attributes((Record[i+1] & (0xffffull << 32)) >> 11);
-
+ Attributes ReconstitutedAttr =
+ Attribute::decodeLLVMAttributesForBitcode(Record[i+1]);
Record[i+1] = ReconstitutedAttr.Raw();
- if (Record[i] == 0)
- RetAttribute = ReconstitutedAttr;
- else if (Record[i] == ~0U)
- FnAttribute = ReconstitutedAttr;
- }
-
- Attributes OldRetAttrs = (Attribute::NoUnwind|Attribute::NoReturn|
- Attribute::ReadOnly|Attribute::ReadNone);
-
- if (FnAttribute == Attribute::None && RetAttribute != Attribute::None &&
- (RetAttribute & OldRetAttrs)) {
- if (FnAttribute == Attribute::None) { // add a slot so they get added.
- Record.push_back(~0U);
- Record.push_back(0);
- }
-
- FnAttribute |= RetAttribute & OldRetAttrs;
- RetAttribute &= ~OldRetAttrs;
}
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
- if (Record[i] == 0) {
- if (RetAttribute != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(0, RetAttribute));
- } else if (Record[i] == ~0U) {
- if (FnAttribute != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(~0U, FnAttribute));
- } else if (Attributes(Record[i+1]) != Attribute::None)
+ if (Attributes(Record[i+1]) != Attribute::None)
Attrs.push_back(AttributeWithIndex::get(Record[i],
Attributes(Record[i+1])));
}
- MAttributes.push_back(AttrListPtr::get(Attrs.begin(), Attrs.end()));
+ MAttributes.push_back(AttrListPtr::get(Attrs));
Attrs.clear();
break;
}
@@ -621,7 +594,7 @@ bool BitcodeReader::ParseTypeTableBody() {
break;
}
case bitc::TYPE_CODE_FUNCTION_OLD: {
- // FIXME: attrid is dead, remove it in LLVM 3.0
+ // FIXME: attrid is dead, remove it in LLVM 4.0
// FUNCTION: [vararg, attrid, retty, paramty x N]
if (Record.size() < 3)
return Error("Invalid FUNCTION type record");
@@ -851,11 +824,7 @@ bool BitcodeReader::ParseMetadata() {
break;
case bitc::METADATA_NAME: {
      // Read the name of the named metadata.
- unsigned NameLength = Record.size();
- SmallString<8> Name;
- Name.resize(NameLength);
- for (unsigned i = 0; i != NameLength; ++i)
- Name[i] = Record[i];
+ SmallString<8> Name(Record.begin(), Record.end());
Record.clear();
Code = Stream.ReadCode();
@@ -899,26 +868,18 @@ bool BitcodeReader::ParseMetadata() {
break;
}
case bitc::METADATA_STRING: {
- unsigned MDStringLength = Record.size();
- SmallString<8> String;
- String.resize(MDStringLength);
- for (unsigned i = 0; i != MDStringLength; ++i)
- String[i] = Record[i];
- Value *V = MDString::get(Context,
- StringRef(String.data(), String.size()));
+ SmallString<8> String(Record.begin(), Record.end());
+ Value *V = MDString::get(Context, String);
MDValueList.AssignValue(V, NextMDValueNo++);
break;
}
case bitc::METADATA_KIND: {
- unsigned RecordLength = Record.size();
- if (Record.empty() || RecordLength < 2)
+ if (Record.size() < 2)
return Error("Invalid METADATA_KIND record");
- SmallString<8> Name;
- Name.resize(RecordLength-1);
+
unsigned Kind = Record[0];
- for (unsigned i = 1; i != RecordLength; ++i)
- Name[i-1] = Record[i];
-
+ SmallString<8> Name(Record.begin()+1, Record.end());
+
unsigned NewKind = TheModule->getMDKindID(Name.str());
if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
return Error("Conflicting METADATA_KIND records");
@@ -977,6 +938,14 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
return false;
}
+static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
+ SmallVector<uint64_t, 8> Words(Vals.size());
+ std::transform(Vals.begin(), Vals.end(), Words.begin(),
+ DecodeSignRotatedValue);
+
+ return APInt(TypeBits, Words);
+}
+
bool BitcodeReader::ParseConstants() {
if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
return Error("Malformed block record");
@@ -1032,14 +1001,10 @@ bool BitcodeReader::ParseConstants() {
if (!CurTy->isIntegerTy() || Record.empty())
return Error("Invalid WIDE_INTEGER record");
- unsigned NumWords = Record.size();
- SmallVector<uint64_t, 8> Words;
- Words.resize(NumWords);
- for (unsigned i = 0; i != NumWords; ++i)
- Words[i] = DecodeSignRotatedValue(Record[i]);
- V = ConstantInt::get(Context,
- APInt(cast<IntegerType>(CurTy)->getBitWidth(),
- Words));
+ APInt VInt = ReadWideAPInt(Record,
+ cast<IntegerType>(CurTy)->getBitWidth());
+ V = ConstantInt::get(Context, VInt);
+
break;
}
case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval]
@@ -1098,10 +1063,7 @@ bool BitcodeReader::ParseConstants() {
if (Record.empty())
return Error("Invalid CST_STRING record");
- unsigned Size = Record.size();
- SmallString<16> Elts;
- for (unsigned i = 0; i != Size; ++i)
- Elts.push_back(Record[i]);
+ SmallString<16> Elts(Record.begin(), Record.end());
V = ConstantDataArray::getString(Context, Elts,
BitCode == bitc::CST_CODE_CSTRING);
break;
@@ -1138,23 +1100,16 @@ bool BitcodeReader::ParseConstants() {
else
V = ConstantDataArray::get(Context, Elts);
} else if (EltTy->isFloatTy()) {
- SmallVector<float, 16> Elts;
- for (unsigned i = 0; i != Size; ++i) {
- union { uint32_t I; float F; };
- I = Record[i];
- Elts.push_back(F);
- }
+ SmallVector<float, 16> Elts(Size);
+ std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToFloat);
if (isa<VectorType>(CurTy))
V = ConstantDataVector::get(Context, Elts);
else
V = ConstantDataArray::get(Context, Elts);
} else if (EltTy->isDoubleTy()) {
- SmallVector<double, 16> Elts;
- for (unsigned i = 0; i != Size; ++i) {
- union { uint64_t I; double F; };
- I = Record[i];
- Elts.push_back(F);
- }
+ SmallVector<double, 16> Elts(Size);
+ std::transform(Record.begin(), Record.end(), Elts.begin(),
+ BitsToDouble);
if (isa<VectorType>(CurTy))
V = ConstantDataVector::get(Context, Elts);
else
@@ -1600,9 +1555,10 @@ bool BitcodeReader::ParseModule(bool Resume) {
GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
if (Record.size() > 6)
Visibility = GetDecodedVisibility(Record[6]);
- bool isThreadLocal = false;
+
+ GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal;
if (Record.size() > 7)
- isThreadLocal = Record[7];
+ TLM = GetDecodedThreadLocalMode(Record[7]);
bool UnnamedAddr = false;
if (Record.size() > 8)
@@ -1610,12 +1566,11 @@ bool BitcodeReader::ParseModule(bool Resume) {
GlobalVariable *NewGV =
new GlobalVariable(*TheModule, Ty, isConstant, Linkage, 0, "", 0,
- isThreadLocal, AddressSpace);
+ TLM, AddressSpace);
NewGV->setAlignment(Alignment);
if (!Section.empty())
NewGV->setSection(Section);
NewGV->setVisibility(Visibility);
- NewGV->setThreadLocal(isThreadLocal);
NewGV->setUnnamedAddr(UnnamedAddr);
ValueList.push_back(NewGV);
@@ -1732,7 +1687,7 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
// have to read and ignore these final 4 bytes :-(
if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 &&
Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
- Stream.AtEndOfStream())
+ Stream.AtEndOfStream())
return false;
return Error("Invalid record at top-level");
@@ -2271,6 +2226,65 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
break;
}
case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...]
+ // Check magic
+ if ((Record[0] >> 16) == SWITCH_INST_MAGIC) {
+ // New SwitchInst format with case ranges.
+
+ Type *OpTy = getTypeByID(Record[1]);
+ unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth();
+
+ Value *Cond = getFnValueByID(Record[2], OpTy);
+ BasicBlock *Default = getBasicBlock(Record[3]);
+ if (OpTy == 0 || Cond == 0 || Default == 0)
+ return Error("Invalid SWITCH record");
+
+ unsigned NumCases = Record[4];
+
+ SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
+ InstructionList.push_back(SI);
+
+ unsigned CurIdx = 5;
+ for (unsigned i = 0; i != NumCases; ++i) {
+ IntegersSubsetToBB CaseBuilder;
+ unsigned NumItems = Record[CurIdx++];
+ for (unsigned ci = 0; ci != NumItems; ++ci) {
+ bool isSingleNumber = Record[CurIdx++];
+
+ APInt Low;
+ unsigned ActiveWords = 1;
+ if (ValueBitWidth > 64)
+ ActiveWords = Record[CurIdx++];
+ Low = ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords),
+ ValueBitWidth);
+ CurIdx += ActiveWords;
+
+ if (!isSingleNumber) {
+ ActiveWords = 1;
+ if (ValueBitWidth > 64)
+ ActiveWords = Record[CurIdx++];
+ APInt High =
+ ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords),
+ ValueBitWidth);
+
+ CaseBuilder.add(IntItem::fromType(OpTy, Low),
+ IntItem::fromType(OpTy, High));
+ CurIdx += ActiveWords;
+ } else
+ CaseBuilder.add(IntItem::fromType(OpTy, Low));
+ }
+ BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]);
+ IntegersSubset Case = CaseBuilder.getCase();
+ SI->addCase(Case, DestBB);
+ }
+ uint16_t Hash = SI->hash();
+ if (Hash != (Record[0] & 0xFFFF))
+ return Error("Invalid SWITCH record");
+ I = SI;
+ break;
+ }
+
+ // Old SwitchInst format without case ranges.
+
if (Record.size() < 3 || (Record.size() & 1) == 0)
return Error("Invalid SWITCH record");
Type *OpTy = getTypeByID(Record[0]);
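
Condensed from the decoding loop above, the new record shape is:

  Record[0] = (SWITCH_INST_MAGIC << 16) | hash(SI)
  Record[1] = opty, Record[2] = cond, Record[3] = default, Record[4] = numcases
  per case:  numitems, then per item: isSingleNumber,
             [activewords when the value type is wider than 64 bits,]
             the low value's words
             [, [activewords,] the high value's words when the item is a range],
             and finally the case's destination block id.

After reconstruction, the 16-bit hash in Record[0] is checked against SI->hash(), so a record emitted for a different case set is rejected as invalid.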
diff --git a/lib/Bitcode/Reader/CMakeLists.txt b/lib/Bitcode/Reader/CMakeLists.txt
index 693d431..dfe7e10 100644
--- a/lib/Bitcode/Reader/CMakeLists.txt
+++ b/lib/Bitcode/Reader/CMakeLists.txt
@@ -2,3 +2,5 @@ add_llvm_library(LLVMBitReader
BitReader.cpp
BitcodeReader.cpp
)
+
+add_dependencies(LLVMBitReader intrinsics_gen)
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index b25d2e9..5b1725f 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -62,7 +62,10 @@ enum {
FUNCTION_INST_CAST_ABBREV,
FUNCTION_INST_RET_VOID_ABBREV,
FUNCTION_INST_RET_VAL_ABBREV,
- FUNCTION_INST_UNREACHABLE_ABBREV
+ FUNCTION_INST_UNREACHABLE_ABBREV,
+
+ // SwitchInst Magic
+  SWITCH_INST_MAGIC = 0x4B5 // "May 2012" => 1205 (decimal) => 0x4B5 (hex)
};
static unsigned GetEncodedCastOpcode(unsigned Opcode) {
@@ -174,18 +177,7 @@ static void WriteAttributeTable(const ValueEnumerator &VE,
for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) {
const AttributeWithIndex &PAWI = A.getSlot(i);
Record.push_back(PAWI.Index);
-
- // FIXME: remove in LLVM 3.0
- // Store the alignment in the bitcode as a 16-bit raw value instead of a
- // 5-bit log2 encoded value. Shift the bits above the alignment up by
- // 11 bits.
- uint64_t FauxAttr = PAWI.Attrs.Raw() & 0xffff;
- if (PAWI.Attrs & Attribute::Alignment)
- FauxAttr |= (1ull<<16)<<
- (((PAWI.Attrs & Attribute::Alignment).Raw()-1) >> 16);
- FauxAttr |= (PAWI.Attrs.Raw() & (0x3FFull << 21)) << 11;
-
- Record.push_back(FauxAttr);
+ Record.push_back(Attribute::encodeLLVMAttributesForBitcode(PAWI.Attrs));
}
Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record);
@@ -387,6 +379,17 @@ static unsigned getEncodedVisibility(const GlobalValue *GV) {
llvm_unreachable("Invalid visibility");
}
+static unsigned getEncodedThreadLocalMode(const GlobalVariable *GV) {
+ switch (GV->getThreadLocalMode()) {
+ case GlobalVariable::NotThreadLocal: return 0;
+ case GlobalVariable::GeneralDynamicTLSModel: return 1;
+ case GlobalVariable::LocalDynamicTLSModel: return 2;
+ case GlobalVariable::InitialExecTLSModel: return 3;
+ case GlobalVariable::LocalExecTLSModel: return 4;
+ }
+ llvm_unreachable("Invalid TLS model");
+}
+
// Emit top-level description of module, including target triple, inline asm,
// descriptors for global variables, and function prototype info.
static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
@@ -495,7 +498,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
GV->getVisibility() != GlobalValue::DefaultVisibility ||
GV->hasUnnamedAddr()) {
Vals.push_back(getEncodedVisibility(GV));
- Vals.push_back(GV->isThreadLocal());
+ Vals.push_back(getEncodedThreadLocalMode(GV));
Vals.push_back(GV->hasUnnamedAddr());
} else {
AbbrevToUse = SimpleGVarAbbrev;
@@ -719,6 +722,41 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
Stream.ExitBlock();
}
+static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals,
+ unsigned &Code, unsigned &AbbrevToUse, const APInt &Val,
+ bool EmitSizeForWideNumbers = false
+ ) {
+ if (Val.getBitWidth() <= 64) {
+ uint64_t V = Val.getSExtValue();
+ if ((int64_t)V >= 0)
+ Vals.push_back(V << 1);
+ else
+ Vals.push_back((-V << 1) | 1);
+ Code = bitc::CST_CODE_INTEGER;
+ AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
+ } else {
+ // Wide integers, > 64 bits in size.
+ // We have an arbitrary precision integer value to write whose
+ // bit width is > 64. However, in canonical unsigned integer
+ // format it is likely that the high bits are going to be zero.
+    // So, we only write the active words.
+ unsigned NWords = Val.getActiveWords();
+
+ if (EmitSizeForWideNumbers)
+ Vals.push_back(NWords);
+
+ const uint64_t *RawWords = Val.getRawData();
+ for (unsigned i = 0; i != NWords; ++i) {
+ int64_t V = RawWords[i];
+ if (V >= 0)
+ Vals.push_back(V << 1);
+ else
+ Vals.push_back((-V << 1) | 1);
+ }
+ Code = bitc::CST_CODE_WIDE_INTEGER;
+ }
+}
+
static void WriteConstants(unsigned FirstVal, unsigned LastVal,
const ValueEnumerator &VE,
BitstreamWriter &Stream, bool isGlobal) {
@@ -801,30 +839,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
} else if (isa<UndefValue>(C)) {
Code = bitc::CST_CODE_UNDEF;
} else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) {
- if (IV->getBitWidth() <= 64) {
- uint64_t V = IV->getSExtValue();
- if ((int64_t)V >= 0)
- Record.push_back(V << 1);
- else
- Record.push_back((-V << 1) | 1);
- Code = bitc::CST_CODE_INTEGER;
- AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
- } else { // Wide integers, > 64 bits in size.
- // We have an arbitrary precision integer value to write whose
- // bit width is > 64. However, in canonical unsigned integer
- // format it is likely that the high bits are going to be zero.
- // So, we only write the number of active words.
- unsigned NWords = IV->getValue().getActiveWords();
- const uint64_t *RawWords = IV->getValue().getRawData();
- for (unsigned i = 0; i != NWords; ++i) {
- int64_t V = RawWords[i];
- if (V >= 0)
- Record.push_back(V << 1);
- else
- Record.push_back((-V << 1) | 1);
- }
- Code = bitc::CST_CODE_WIDE_INTEGER;
- }
+ EmitAPInt(Record, Code, AbbrevToUse, IV->getValue());
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
Code = bitc::CST_CODE_FLOAT;
Type *Ty = CFP->getType();
@@ -1137,16 +1152,63 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
break;
case Instruction::Switch:
{
+      // Use a local 64-bit vector (Vals64) here, since large APInt
+      // numbers must be stored explicitly as 64-bit values.
+ SmallVector<uint64_t, 128> Vals64;
+
Code = bitc::FUNC_CODE_INST_SWITCH;
SwitchInst &SI = cast<SwitchInst>(I);
- Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
- Vals.push_back(VE.getValueID(SI.getCondition()));
- Vals.push_back(VE.getValueID(SI.getDefaultDest()));
+
+ uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16);
+ Vals64.push_back(SwitchRecordHeader);
+
+ Vals64.push_back(VE.getTypeID(SI.getCondition()->getType()));
+ Vals64.push_back(VE.getValueID(SI.getCondition()));
+ Vals64.push_back(VE.getValueID(SI.getDefaultDest()));
+ Vals64.push_back(SI.getNumCases());
for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end();
i != e; ++i) {
- Vals.push_back(VE.getValueID(i.getCaseValue()));
- Vals.push_back(VE.getValueID(i.getCaseSuccessor()));
+ IntegersSubset& CaseRanges = i.getCaseValueEx();
+        unsigned Code, Abbrev; // will be unused here.
+
+ if (CaseRanges.isSingleNumber()) {
+ Vals64.push_back(1/*NumItems = 1*/);
+ Vals64.push_back(true/*IsSingleNumber = true*/);
+ EmitAPInt(Vals64, Code, Abbrev, CaseRanges.getSingleNumber(0), true);
+ } else {
+
+ Vals64.push_back(CaseRanges.getNumItems());
+
+ if (CaseRanges.isSingleNumbersOnly()) {
+ for (unsigned ri = 0, rn = CaseRanges.getNumItems();
+ ri != rn; ++ri) {
+
+ Vals64.push_back(true/*IsSingleNumber = true*/);
+
+ EmitAPInt(Vals64, Code, Abbrev,
+ CaseRanges.getSingleNumber(ri), true);
+ }
+ } else
+ for (unsigned ri = 0, rn = CaseRanges.getNumItems();
+ ri != rn; ++ri) {
+ IntegersSubset::Range r = CaseRanges.getItem(ri);
+ bool IsSingleNumber = CaseRanges.isSingleNumber(ri);
+
+ Vals64.push_back(IsSingleNumber);
+
+ EmitAPInt(Vals64, Code, Abbrev, r.getLow(), true);
+ if (!IsSingleNumber)
+ EmitAPInt(Vals64, Code, Abbrev, r.getHigh(), true);
+ }
+ }
+ Vals64.push_back(VE.getValueID(i.getCaseSuccessor()));
}
+
+ Stream.EmitRecord(Code, Vals64, AbbrevToUse);
+
+      // Also do the expected action: clear the external Vals collection.
+ Vals.clear();
+ return;
}
break;
case Instruction::IndirectBr:
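EmitAPInt above folds each word's sign into its low bit before emission; the reader reverses this when rebuilding wide APInts. A worked sketch of the pair, using only the arithmetic shown in the hunk (the helper names are hypothetical):

    uint64_t encodeSignedWord(int64_t V) {
      // 7 -> 14, -5 -> 11: magnitude shifted left one bit, sign stored in bit 0.
      return V >= 0 ? (uint64_t(V) << 1) : ((uint64_t(-V) << 1) | 1);
    }
    int64_t decodeSignedWord(uint64_t E) {
      return (E & 1) ? -int64_t(E >> 1) : int64_t(E >> 1);
    }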
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 822a564..205480a 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -16,10 +16,10 @@
#define DEBUG_TYPE "post-RA-sched"
#include "AggressiveAntiDepBreaker.h"
-#include "RegisterClassInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -157,8 +157,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
// In a return block, examine the function live-out regs.
for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(),
E = MRI.liveout_end(); I != E; ++I) {
- for (const uint16_t *Alias = TRI->getOverlaps(*I);
- unsigned Reg = *Alias; ++Alias) {
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
State->UnionGroups(Reg, 0);
KillIndices[Reg] = BB->size();
DefIndices[Reg] = ~0u;
@@ -173,8 +173,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
SE = BB->succ_end(); SI != SE; ++SI)
for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
E = (*SI)->livein_end(); I != E; ++I) {
- for (const uint16_t *Alias = TRI->getOverlaps(*I);
- unsigned Reg = *Alias; ++Alias) {
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
State->UnionGroups(Reg, 0);
KillIndices[Reg] = BB->size();
DefIndices[Reg] = ~0u;
@@ -189,8 +189,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
unsigned Reg = *I;
if (!IsReturnBlock && !Pristine.test(Reg)) continue;
- for (const uint16_t *Alias = TRI->getOverlaps(Reg);
- unsigned AliasReg = *Alias; ++Alias) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
State->UnionGroups(AliasReg, 0);
KillIndices[AliasReg] = BB->size();
DefIndices[AliasReg] = ~0u;
@@ -265,10 +265,8 @@ void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI,
IsImplicitDefUse(MI, MO)) {
const unsigned Reg = MO.getReg();
PassthruRegs.insert(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg) {
- PassthruRegs.insert(*Subreg);
- }
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ PassthruRegs.insert(*SubRegs);
}
}
}
@@ -333,9 +331,8 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
}
// Repeat for subregisters.
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg) {
- unsigned SubregReg = *Subreg;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubregReg = *SubRegs;
if (!State->IsLive(SubregReg)) {
KillIndices[SubregReg] = KillIdx;
DefIndices[SubregReg] = ~0u;
@@ -392,8 +389,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
// Any aliases that are live at this point are completely or
// partially defined here, so group those aliases with Reg.
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
- unsigned AliasReg = *Alias;
+ for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
if (State->IsLive(AliasReg)) {
State->UnionGroups(Reg, AliasReg);
DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via " <<
@@ -404,7 +401,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
// Note register reference...
const TargetRegisterClass *RC = NULL;
if (i < MI->getDesc().getNumOperands())
- RC = TII->getRegClass(MI->getDesc(), i, TRI);
+ RC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
RegRefs.insert(std::make_pair(Reg, RR));
}
@@ -423,9 +420,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
continue;
// Update def for Reg and aliases.
- for (const uint16_t *Alias = TRI->getOverlaps(Reg);
- unsigned AliasReg = *Alias; ++Alias)
- DefIndices[AliasReg] = Count;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ DefIndices[*AI] = Count;
}
}
@@ -479,7 +475,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
// Note register reference...
const TargetRegisterClass *RC = NULL;
if (i < MI->getDesc().getNumOperands())
- RC = TII->getRegClass(MI->getDesc(), i, TRI);
+ RC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
RegRefs.insert(std::make_pair(Reg, RR));
}
@@ -678,9 +674,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
goto next_super_reg;
} else {
bool found = false;
- for (const uint16_t *Alias = TRI->getAliasSet(NewReg);
- *Alias; ++Alias) {
- unsigned AliasReg = *Alias;
+ for (MCRegAliasIterator AI(NewReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
if (State->IsLive(AliasReg) ||
(KillIndices[Reg] > DefIndices[AliasReg])) {
DEBUG(dbgs() << "(alias " << TRI->getName(AliasReg) << " live)");
diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp
index 87f6431..32ad34a 100644
--- a/lib/CodeGen/AllocationOrder.cpp
+++ b/lib/CodeGen/AllocationOrder.cpp
@@ -15,9 +15,9 @@
//===----------------------------------------------------------------------===//
#include "AllocationOrder.h"
-#include "RegisterClassInfo.h"
#include "VirtRegMap.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 00874d4..447f398 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -203,6 +203,63 @@ ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
}
}
+
+/// getNoopInput - If V is a noop (i.e., lowers to no machine code), look
+/// through it (and any transitive noop operands to it) and return its input
+/// value. This is used to determine if a tail call can be formed.
+///
+static const Value *getNoopInput(const Value *V, const TargetLowering &TLI) {
+ // If V is not an instruction, it can't be looked through.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (I == 0 || !I->hasOneUse() || I->getNumOperands() == 0) return V;
+
+ Value *Op = I->getOperand(0);
+
+ // Look through truly no-op truncates.
+ if (isa<TruncInst>(I) &&
+ TLI.isTruncateFree(I->getOperand(0)->getType(), I->getType()))
+ return getNoopInput(I->getOperand(0), TLI);
+
+ // Look through truly no-op bitcasts.
+ if (isa<BitCastInst>(I)) {
+ // No type change at all.
+ if (Op->getType() == I->getType())
+ return getNoopInput(Op, TLI);
+
+ // Pointer to pointer cast.
+ if (Op->getType()->isPointerTy() && I->getType()->isPointerTy())
+ return getNoopInput(Op, TLI);
+
+ if (isa<VectorType>(Op->getType()) && isa<VectorType>(I->getType()) &&
+ TLI.isTypeLegal(EVT::getEVT(Op->getType())) &&
+ TLI.isTypeLegal(EVT::getEVT(I->getType())))
+ return getNoopInput(Op, TLI);
+ }
+
+ // Look through inttoptr.
+ if (isa<IntToPtrInst>(I) && !isa<VectorType>(I->getType())) {
+ // Make sure this isn't a truncating or extending cast. We could support
+ // this eventually, but don't bother for now.
+ if (TLI.getPointerTy().getSizeInBits() ==
+ cast<IntegerType>(Op->getType())->getBitWidth())
+ return getNoopInput(Op, TLI);
+ }
+
+ // Look through ptrtoint.
+ if (isa<PtrToIntInst>(I) && !isa<VectorType>(I->getType())) {
+ // Make sure this isn't a truncating or extending cast. We could support
+ // this eventually, but don't bother for now.
+ if (TLI.getPointerTy().getSizeInBits() ==
+ cast<IntegerType>(I->getType())->getBitWidth())
+ return getNoopInput(Op, TLI);
+ }
+
+
+ // Otherwise it's not something we can look through.
+ return V;
+}
+
+
/// Test if the given instruction is in a position to be optimized
/// with a tail-call. This roughly means that it's in a block with
/// a return and there's nothing that needs to be scheduled
@@ -226,7 +283,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr,
// been fully understood.
if (!Ret &&
(!TLI.getTargetMachine().Options.GuaranteedTailCallOpt ||
- !isa<UnreachableInst>(Term))) return false;
+ !isa<UnreachableInst>(Term)))
+ return false;
// If I will have a chain, make sure no other instruction that will have a
// chain interposes between I and the return.
@@ -264,28 +322,28 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr,
return false;
// Otherwise, make sure the unmodified return value of I is the return value.
- for (const Instruction *U = dyn_cast<Instruction>(Ret->getOperand(0)); ;
- U = dyn_cast<Instruction>(U->getOperand(0))) {
- if (!U)
- return false;
- if (!U->hasOneUse())
+  // We handle two cases: scalar return values and aggregate (struct) return values.
+ Value *RetVal = Ret->getOperand(0);
+ if (!isa<InsertValueInst>(RetVal) || !isa<StructType>(RetVal->getType()))
+ // Handle scalars first.
+ return getNoopInput(Ret->getOperand(0), TLI) == I;
+
+ // If this is an aggregate return, look through the insert/extract values and
+ // see if each is transparent.
+  for (unsigned i = 0, e = cast<StructType>(RetVal->getType())->getNumElements();
+ i != e; ++i) {
+ const Value *InScalar = FindInsertedValue(RetVal, i);
+ if (InScalar == 0) return false;
+ InScalar = getNoopInput(InScalar, TLI);
+
+ // If the scalar value being inserted is an extractvalue of the right index
+ // from the call, then everything is good.
+ const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(InScalar);
+ if (EVI == 0 || EVI->getOperand(0) != I || EVI->getNumIndices() != 1 ||
+ EVI->getIndices()[0] != i)
return false;
- if (U == I)
- break;
- // Check for a truly no-op truncate.
- if (isa<TruncInst>(U) &&
- TLI.isTruncateFree(U->getOperand(0)->getType(), U->getType()))
- continue;
- // Check for a truly no-op bitcast.
- if (isa<BitCastInst>(U) &&
- (U->getOperand(0)->getType() == U->getType() ||
- (U->getOperand(0)->getType()->isPointerTy() &&
- U->getType()->isPointerTy())))
- continue;
- // Otherwise it's not a true no-op.
- return false;
}
-
+
return true;
}
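The rewritten scalar check above reduces the tail-call test to a single comparison: walk no-op casts back from the returned value and see whether the walk lands on the call. A sketch, assuming the names in the hunk (I is the call instruction being tested):

    const Value *Root = getNoopInput(Ret->getOperand(0), TLI);
    bool OkForTailCall = (Root == I);  // true iff only free truncs, bitcasts,
                                       // inttoptr or ptrtoint separate call and ret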
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index b60fda8..bf5d8c4 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -44,9 +44,7 @@ EnableARMEHABIDescriptors("arm-enable-ehabi-descriptors", cl::Hidden,
ARMException::ARMException(AsmPrinter *A)
- : DwarfException(A),
- shouldEmitTable(false), shouldEmitMoves(false), shouldEmitTableModule(false)
- {}
+ : DwarfException(A) {}
ARMException::~ARMException() {}
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index b0b2ff4..d9be7a1 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -15,6 +15,7 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "DwarfDebug.h"
#include "DwarfException.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Module.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -24,7 +25,6 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -475,10 +475,8 @@ void AsmPrinter::EmitFunctionHeader() {
void AsmPrinter::EmitFunctionEntryLabel() {
// The function label could have already been emitted if two symbols end up
// conflicting due to asm renaming. Detect this and emit an error.
- if (CurrentFnSym->isUndefined()) {
- OutStreamer.ForceCodeRegion();
+ if (CurrentFnSym->isUndefined())
return OutStreamer.EmitLabel(CurrentFnSym);
- }
report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
"' label emitted multiple times to assembly file");
@@ -615,7 +613,7 @@ bool AsmPrinter::needsSEHMoves() {
}
bool AsmPrinter::needsRelocationsForDwarfStringPool() const {
- return MAI->doesDwarfUseRelocationsForStringPool();
+ return MAI->doesDwarfUseRelocationsAcrossSections();
}
void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
@@ -798,8 +796,8 @@ void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
const TargetRegisterInfo *TRI = TM.getRegisterInfo();
int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
- for (const uint16_t *SR = TRI->getSuperRegisters(MLoc.getReg());
- *SR && Reg < 0; ++SR) {
+ for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid() && Reg < 0;
+ ++SR) {
Reg = TRI->getDwarfRegNum(*SR, false);
// FIXME: Get the bit range this register uses of the superregister
// so that we can produce a DW_OP_bit_piece
@@ -1085,15 +1083,6 @@ void AsmPrinter::EmitJumpTableInfo() {
EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getTargetData())));
- // If we know the form of the jump table, go ahead and tag it as such.
- if (!JTInDiffSection) {
- if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32) {
- OutStreamer.EmitJumpTable32Region();
- } else {
- OutStreamer.EmitDataRegion();
- }
- }
-
for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
@@ -1399,13 +1388,14 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
unsigned Size)
const {
- // Emit Label+Offset
- const MCExpr *Plus =
- MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Label, OutContext),
- MCConstantExpr::Create(Offset, OutContext),
- OutContext);
+ // Emit Label+Offset (or just Label if Offset is zero)
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Label, OutContext);
+ if (Offset)
+ Expr = MCBinaryExpr::CreateAdd(Expr,
+ MCConstantExpr::Create(Offset, OutContext),
+ OutContext);
- OutStreamer.EmitValue(Plus, 4, 0/*AddrSpace*/);
+ OutStreamer.EmitValue(Expr, Size, 0/*AddrSpace*/);
}
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index d605854..db43b06 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -326,11 +326,11 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
}
- // We may have a location metadata attached to the end of the
- // instruction, and at no point should see metadata at any
- // other point while processing. It's an error if so.
+  // We may have location metadata attached to the end of the
+  // instruction, and at no point should we see metadata at any
+  // other point while processing. It's an error if so.
if (OpNo >= MI->getNumOperands() ||
- MI->getOperand(OpNo).isMetadata()) {
+ MI->getOperand(OpNo).isMetadata()) {
Error = true;
} else {
unsigned OpFlags = MI->getOperand(OpNo).getImm();
@@ -409,9 +409,28 @@ void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
/// instruction, using the specified assembler variant. Targets should
/// override this to format as appropriate.
bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) {
- // Target doesn't support this yet!
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (ExtraCode[0]) {
+ default:
+ return true; // Unknown modifier.
+ case 'c': // Substitute immediate value without immediate syntax
+ if (MO.getType() != MachineOperand::MO_Immediate)
+ return true;
+ O << MO.getImm();
+ return false;
+ case 'n': // Negate the immediate constant.
+ if (MO.getType() != MachineOperand::MO_Immediate)
+ return true;
+ O << -MO.getImm();
+ return false;
+ }
+ }
return true;
}
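The default PrintAsmOperand above now handles two generic single-letter modifiers for immediate operands. A hypothetical source-level use (GCC-style inline asm; illustrative only, not taken from this commit):

    // "%c0" prints the immediate bare, without target immediate syntax such as
    // '#' or '$'; "%n0" prints its negation.
    void skip16() {
      asm volatile(".skip %c0" :: "i"(16));  // emits ".skip 16"
    }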
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index cc5b642..d30e5bb 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains support for writing dwarf compile unit.
+// This file contains support for constructing a dwarf compile unit.
//
//===----------------------------------------------------------------------===//
@@ -17,9 +17,9 @@
#include "DwarfCompileUnit.h"
#include "DwarfDebug.h"
#include "llvm/Constants.h"
+#include "llvm/DIBuilder.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Instructions.h"
-#include "llvm/Analysis/DIBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetData.h"
@@ -33,7 +33,7 @@ using namespace llvm;
/// CompileUnit - Compile unit constructor.
CompileUnit::CompileUnit(unsigned I, unsigned L, DIE *D, AsmPrinter *A,
- DwarfDebug *DW)
+ DwarfDebug *DW)
: ID(I), Language(L), CUDie(D), Asm(A), DD(DW), IndexTyDie(0) {
DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
}
@@ -198,7 +198,7 @@ void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
return;
DIFile File = Ty.getFile();
unsigned FileID = DD->GetOrCreateSourceID(File.getFilename(),
- File.getDirectory());
+ File.getDirectory());
assert(FileID && "Invalid file id");
addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -308,7 +308,8 @@ void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die,
addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i));
} else if (Element == DIBuilder::OpDeref) {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ if (!Location.isReg())
+ addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
} else llvm_unreachable("unknown DIBuilder Opcode");
}
@@ -418,27 +419,12 @@ void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
// Decode the original location, and use that as the start of the byref
// variable's location.
- const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
- unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false);
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
- if (Location.isReg()) {
- if (Reg < 32)
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg);
- else {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
- addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
- }
- } else {
- if (Reg < 32)
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg);
- else {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
- addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
- }
-
- addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset());
- }
+ if (Location.isReg())
+ addRegisterOp(Block, Location.getReg());
+ else
+ addRegisterOffset(Block, Location.getReg(), Location.getOffset());
// If we started with a pointer to the __Block_byref... struct, then
// the first thing we need to do is dereference the pointer (DW_OP_deref).
@@ -646,8 +632,7 @@ DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
}
/// addType - Add a new type attribute to the specified entity.
-void CompileUnit::addType(DIE *Entity, DIType Ty,
- unsigned Attribute) {
+void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
if (!Ty.Verify())
return;
@@ -776,6 +761,11 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
Buffer.addChild(ElemDie);
}
}
+ DIType DTy = CTy.getTypeDerivedFrom();
+ if (DTy.Verify()) {
+ addType(&Buffer, DTy);
+ addUInt(&Buffer, dwarf::DW_AT_enum_class, dwarf::DW_FORM_flag, 1);
+ }
}
break;
case dwarf::DW_TAG_subroutine_type: {
@@ -801,9 +791,9 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
// Add prototype flag if we're dealing with a C language and the
// function has been prototyped.
if (isPrototyped &&
- (Language == dwarf::DW_LANG_C89 ||
- Language == dwarf::DW_LANG_C99 ||
- Language == dwarf::DW_LANG_ObjC))
+ (Language == dwarf::DW_LANG_C89 ||
+ Language == dwarf::DW_LANG_C99 ||
+ Language == dwarf::DW_LANG_ObjC))
addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
}
break;
@@ -846,19 +836,19 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
addSourceLine(ElemDie, DV);
} else if (Element.isDerivedType()) {
- DIDerivedType DDTy(Element);
- if (DDTy.getTag() == dwarf::DW_TAG_friend) {
- ElemDie = new DIE(dwarf::DW_TAG_friend);
- addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend);
- } else
- ElemDie = createMemberDIE(DIDerivedType(Element));
+ DIDerivedType DDTy(Element);
+ if (DDTy.getTag() == dwarf::DW_TAG_friend) {
+ ElemDie = new DIE(dwarf::DW_TAG_friend);
+ addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend);
+ } else
+ ElemDie = createMemberDIE(DIDerivedType(Element));
} else if (Element.isObjCProperty()) {
DIObjCProperty Property(Element);
ElemDie = new DIE(Property.getTag());
StringRef PropertyName = Property.getObjCPropertyName();
addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
- addType(ElemDie, Property.getType());
- addSourceLine(ElemDie, Property);
+ addType(ElemDie, Property.getType());
+ addSourceLine(ElemDie, Property);
StringRef GetterName = Property.getObjCPropertyGetterName();
if (!GetterName.empty())
addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName);
@@ -925,19 +915,21 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
if (!Name.empty())
addString(&Buffer, dwarf::DW_AT_name, Name);
- if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type
- || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
- {
+ if (Tag == dwarf::DW_TAG_enumeration_type ||
+ Tag == dwarf::DW_TAG_class_type ||
+ Tag == dwarf::DW_TAG_structure_type ||
+ Tag == dwarf::DW_TAG_union_type) {
// Add size if non-zero (derived types might be zero-sized.)
+ // TODO: Do we care about size for enum forward declarations?
if (Size)
addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
- else {
+ else if (!CTy.isForwardDecl())
// Add zero size if it is not a forward declaration.
- if (CTy.isForwardDecl())
- addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
- else
- addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
- }
+ addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+
+ // If we're a forward decl, say so.
+ if (CTy.isForwardDecl())
+ addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
// Add source line info if available.
if (!CTy.isForwardDecl())
@@ -968,7 +960,7 @@ CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
/// for the given DITemplateValueParameter.
DIE *
-CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV) {
+CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV){
DIE *ParamDIE = getDIE(TPV);
if (ParamDIE)
return ParamDIE;
@@ -1015,17 +1007,17 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
if (SPDie)
return SPDie;
+ SPDie = new DIE(dwarf::DW_TAG_subprogram);
+
+ // DW_TAG_inlined_subroutine may refer to this DIE.
+ insertDIE(SP, SPDie);
+
DISubprogram SPDecl = SP.getFunctionDeclaration();
DIE *DeclDie = NULL;
if (SPDecl.isSubprogram()) {
DeclDie = getOrCreateSubprogramDIE(SPDecl);
}
- SPDie = new DIE(dwarf::DW_TAG_subprogram);
-
- // DW_TAG_inlined_subroutine may refer to this DIE.
- insertDIE(SP, SPDie);
-
// Add to context owner.
addToContextOwner(SPDie, SP.getContext());
@@ -1240,7 +1232,8 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
}
/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
-void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
+void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
+ DIE *IndexTy) {
DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
uint64_t L = SR.getLo();
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 45e407e..b4ff9e8 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,7 +15,7 @@
#define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
#include "DIE.h"
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/DebugInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/OwningPtr.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index cb78878..649684a 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -17,9 +17,10 @@
#include "DwarfAccelTable.h"
#include "DwarfCompileUnit.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DIBuilder.h"
#include "llvm/Module.h"
#include "llvm/Instructions.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -32,11 +33,10 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Analysis/DebugInfo.h"
-#include "llvm/Analysis/DIBuilder.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -117,7 +117,6 @@ DIType DbgVariable::getType() const {
if (getName() == DT.getName())
return (DT.getTypeDerivedFrom());
}
- return Ty;
}
return Ty;
}
@@ -127,6 +126,7 @@ DIType DbgVariable::getType() const {
DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
: Asm(A), MMI(Asm->MMI), FirstCU(0),
AbbreviationsSet(InitAbbreviationsSetSize),
+ SourceIdMap(DIEValueAllocator), StringPool(DIEValueAllocator),
PrevLabel(NULL) {
NextStringPoolNumber = 0;
@@ -566,7 +566,7 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
NewCU->addUInt(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
// DW_AT_stmt_list is a offset of line number information for this
// compile unit in debug_line section.
- if (Asm->MAI->doesDwarfRequireRelocationForSectionOffset())
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
Asm->GetTempSymbol("section_line"));
else
@@ -1310,8 +1310,9 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
if (!MOI->isReg() || !MOI->isDef() || !MOI->getReg())
continue;
- for (const uint16_t *AI = TRI->getOverlaps(MOI->getReg());
- unsigned Reg = *AI; ++AI) {
+ for (MCRegAliasIterator AI(MOI->getReg(), TRI, true);
+ AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
const MDNode *Var = LiveUserVar[Reg];
if (!Var)
continue;
@@ -1381,7 +1382,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
MF->getFunction()->getContext());
recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
FnStartDL.getScope(MF->getFunction()->getContext()),
- 0);
+ DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0);
}
}
@@ -1421,6 +1422,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
DIVariable DV(Variables.getElement(i));
if (!DV || !DV.Verify() || !ProcessedVars.insert(DV))
continue;
+ // Check that DbgVariable for DV wasn't created earlier, when
+      // findAbstractVariable() was called for an inlined instance of DV.
+ LLVMContext &Ctx = DV->getContext();
+ DIVariable CleanDV = cleanseInlinedVariable(DV, Ctx);
+ if (AbstractVariables.lookup(CleanDV))
+ continue;
if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext()))
addScopeVariable(Scope, new DbgVariable(DV, NULL));
}
@@ -1623,7 +1630,7 @@ void DwarfDebug::emitDIE(DIE *Die) {
// DW_AT_range Value encodes offset in debug_range section.
DIEInteger *V = cast<DIEInteger>(Values[i]);
- if (Asm->MAI->doesDwarfUseLabelOffsetForRanges()) {
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) {
Asm->EmitLabelPlusOffset(DwarfDebugRangeSectionSym,
V->getValue(),
4);
@@ -1636,10 +1643,14 @@ void DwarfDebug::emitDIE(DIE *Die) {
break;
}
case dwarf::DW_AT_location: {
- if (DIELabel *L = dyn_cast<DIELabel>(Values[i]))
- Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4);
- else
+ if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) {
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ Asm->EmitLabelReference(L->getValue(), 4);
+ else
+ Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4);
+ } else {
Values[i]->EmitValue(Asm, Form);
+ }
break;
}
case dwarf::DW_AT_accessibility: {
@@ -2049,9 +2060,11 @@ void DwarfDebug::emitDebugLoc() {
if (Element == DIBuilder::OpPlus) {
Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
Asm->EmitULEB128(DV.getAddrElement(++i));
- } else if (Element == DIBuilder::OpDeref)
- Asm->EmitInt8(dwarf::DW_OP_deref);
- else llvm_unreachable("unknown Opcode found in complex address");
+ } else if (Element == DIBuilder::OpDeref) {
+ if (!Entry.Loc.isReg())
+ Asm->EmitInt8(dwarf::DW_OP_deref);
+ } else
+ llvm_unreachable("unknown Opcode found in complex address");
}
}
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 83f30f5..d1d6512 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -14,11 +14,11 @@
#ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__
#define CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+#include "DIE.h"
+#include "llvm/DebugInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/LexicalScopes.h"
#include "llvm/MC/MachineLocation.h"
-#include "llvm/Analysis/DebugInfo.h"
-#include "DIE.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -188,6 +188,9 @@ class DwarfDebug {
/// MMI - Collected machine module information.
MachineModuleInfo *MMI;
+ /// DIEValueAllocator - All DIEValues are allocated through this allocator.
+ BumpPtrAllocator DIEValueAllocator;
+
//===--------------------------------------------------------------------===//
// Attributes used to construct specific Dwarf sections.
//
@@ -210,11 +213,11 @@ class DwarfDebug {
/// SourceIdMap - Source id map, i.e. pair of source filename and directory,
/// separated by a zero byte, mapped to a unique id.
- StringMap<unsigned> SourceIdMap;
+ StringMap<unsigned, BumpPtrAllocator&> SourceIdMap;
/// StringPool - A String->Symbol mapping of strings used by indirect
/// references.
- StringMap<std::pair<MCSymbol*, unsigned> > StringPool;
+ StringMap<std::pair<MCSymbol*, unsigned>, BumpPtrAllocator&> StringPool;
unsigned NextStringPoolNumber;
/// SectionMap - Provides a unique id per text section.
@@ -232,7 +235,7 @@ class DwarfDebug {
/// ScopeVariables - Collection of dbg variables of a scope.
DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> > ScopeVariables;
- /// AbstractVariables - Collection on abstract variables.
+ /// AbstractVariables - Collection of abstract variables.
DenseMap<const MDNode *, DbgVariable *> AbstractVariables;
/// DotDebugLocEntries - Collection of DotDebugLocEntry.
@@ -292,9 +295,6 @@ class DwarfDebug {
std::vector<FunctionDebugFrameInfo> DebugFrames;
- // DIEValueAllocator - All DIEValues are allocated through this allocator.
- BumpPtrAllocator DIEValueAllocator;
-
// Section Symbols: these are assembler temporary labels that are emitted at
// the beginning of each supported dwarf section. These are used to form
// section offsets and are created by EmitSectionLabels.
@@ -333,9 +333,6 @@ private:
/// of the function.
DIE *constructInlinedScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
- /// constructVariableDIE - Construct a DIE for the given DbgVariable.
- DIE *constructVariableDIE(DbgVariable *DV, LexicalScope *S);
-
/// constructScopeDIE - Construct a DIE for this scope.
DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
@@ -517,9 +514,6 @@ public:
/// in the SourceIds map.
unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName);
- /// createSubprogramDIE - Create new DIE using SP.
- DIE *createSubprogramDIE(DISubprogram SP);
-
/// getStringPool - returns the entry into the start of the pool.
MCSymbol *getStringPool();
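The DwarfDebug.h hunks above are one refactor: SourceIdMap and StringPool now draw their storage from DIEValueAllocator, so the allocator member moves ahead of the maps to guarantee it is constructed first. A minimal sketch of the pattern, with hypothetical names:

    class Example {
      BumpPtrAllocator Alloc;                      // declared (and constructed) first
      StringMap<unsigned, BumpPtrAllocator&> Map;  // borrows Alloc for its buckets
    public:
      Example() : Map(Alloc) {}                    // Alloc is already live here
    };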
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index b5f86ab..75f6056 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -175,17 +175,6 @@ public:
};
class ARMException : public DwarfException {
- /// shouldEmitTable - Per-function flag to indicate if EH tables should
- /// be emitted.
- bool shouldEmitTable;
-
- /// shouldEmitMoves - Per-function flag to indicate if frame moves info
- /// should be emitted.
- bool shouldEmitMoves;
-
- /// shouldEmitTableModule - Per-module flag to indicate if EH tables
- /// should be emitted.
- bool shouldEmitTableModule;
public:
//===--------------------------------------------------------------------===//
// Main entry points.
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index ef1d2ba..fb65bb7 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -137,9 +137,8 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
break;
unsigned Reg = I->getOperand(0).getReg();
ImpDefRegs.insert(Reg);
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs)
- ImpDefRegs.insert(SubReg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ ImpDefRegs.insert(*SubRegs);
++I;
}
if (ImpDefRegs.empty())
@@ -188,7 +187,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
// Use a RegScavenger to help update liveness when required.
MachineRegisterInfo &MRI = MF.getRegInfo();
- if (MRI.tracksLiveness() && TRI->requiresRegisterScavenging(MF))
+ if (MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF))
RS = new RegScavenger();
else
MRI.invalidateLiveness();
@@ -819,10 +818,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
}
bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
-
- if (!EnableTailMerge) return false;
-
bool MadeChange = false;
+ if (!EnableTailMerge) return MadeChange;
// First find blocks with no successors.
MergePotentials.clear();
@@ -839,6 +836,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
if (MergePotentials.size() == TailMergeThreshold)
for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
TriedMerging.insert(MergePotentials[i].getBlock());
+
// See if we can do any tail merging on those.
if (MergePotentials.size() >= 2)
MadeChange |= TryTailMergeBlocks(NULL, NULL);
@@ -864,88 +862,97 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
I != E; ++I) {
- if (I->pred_size() >= 2) {
- SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
- MachineBasicBlock *IBB = I;
- MachineBasicBlock *PredBB = prior(I);
- MergePotentials.clear();
- for (MachineBasicBlock::pred_iterator P = I->pred_begin(),
- E2 = I->pred_end();
- P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) {
- MachineBasicBlock *PBB = *P;
- if (TriedMerging.count(PBB))
- continue;
- // Skip blocks that loop to themselves, can't tail merge these.
- if (PBB == IBB)
- continue;
- // Visit each predecessor only once.
- if (!UniquePreds.insert(PBB))
- continue;
- // Skip blocks which may jump to a landing pad. Can't tail merge these.
- if (PBB->getLandingPadSuccessor())
- continue;
- MachineBasicBlock *TBB = 0, *FBB = 0;
- SmallVector<MachineOperand, 4> Cond;
- if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) {
- // Failing case: IBB is the target of a cbr, and
- // we cannot reverse the branch.
- SmallVector<MachineOperand, 4> NewCond(Cond);
- if (!Cond.empty() && TBB == IBB) {
- if (TII->ReverseBranchCondition(NewCond))
+ if (I->pred_size() < 2) continue;
+ SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
+ MachineBasicBlock *IBB = I;
+ MachineBasicBlock *PredBB = prior(I);
+ MergePotentials.clear();
+ for (MachineBasicBlock::pred_iterator P = I->pred_begin(),
+ E2 = I->pred_end();
+ P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) {
+ MachineBasicBlock *PBB = *P;
+ if (TriedMerging.count(PBB))
+ continue;
+
+ // Skip blocks that loop to themselves, can't tail merge these.
+ if (PBB == IBB)
+ continue;
+
+ // Visit each predecessor only once.
+ if (!UniquePreds.insert(PBB))
+ continue;
+
+ // Skip blocks which may jump to a landing pad. Can't tail merge these.
+ if (PBB->getLandingPadSuccessor())
+ continue;
+
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) {
+ // Failing case: IBB is the target of a cbr, and we cannot reverse the
+ // branch.
+ SmallVector<MachineOperand, 4> NewCond(Cond);
+ if (!Cond.empty() && TBB == IBB) {
+ if (TII->ReverseBranchCondition(NewCond))
+ continue;
+ // This is the QBB case described above
+ if (!FBB)
+ FBB = llvm::next(MachineFunction::iterator(PBB));
+ }
+
+ // Failing case: the only way IBB can be reached from PBB is via
+ // exception handling. Happens for landing pads. Would be nice to have
+ // a bit in the edge so we didn't have to do all this.
+ if (IBB->isLandingPad()) {
+ MachineFunction::iterator IP = PBB; IP++;
+ MachineBasicBlock *PredNextBB = NULL;
+ if (IP != MF.end())
+ PredNextBB = IP;
+ if (TBB == NULL) {
+ if (IBB != PredNextBB) // fallthrough
+ continue;
+ } else if (FBB) {
+ if (TBB != IBB && FBB != IBB) // cbr then ubr
+ continue;
+ } else if (Cond.empty()) {
+ if (TBB != IBB) // ubr
+ continue;
+ } else {
+ if (TBB != IBB && IBB != PredNextBB) // cbr
continue;
- // This is the QBB case described above
- if (!FBB)
- FBB = llvm::next(MachineFunction::iterator(PBB));
- }
- // Failing case: the only way IBB can be reached from PBB is via
- // exception handling. Happens for landing pads. Would be nice
- // to have a bit in the edge so we didn't have to do all this.
- if (IBB->isLandingPad()) {
- MachineFunction::iterator IP = PBB; IP++;
- MachineBasicBlock *PredNextBB = NULL;
- if (IP != MF.end())
- PredNextBB = IP;
- if (TBB == NULL) {
- if (IBB != PredNextBB) // fallthrough
- continue;
- } else if (FBB) {
- if (TBB != IBB && FBB != IBB) // cbr then ubr
- continue;
- } else if (Cond.empty()) {
- if (TBB != IBB) // ubr
- continue;
- } else {
- if (TBB != IBB && IBB != PredNextBB) // cbr
- continue;
- }
- }
- // Remove the unconditional branch at the end, if any.
- if (TBB && (Cond.empty() || FBB)) {
- DebugLoc dl; // FIXME: this is nowhere
- TII->RemoveBranch(*PBB);
- if (!Cond.empty())
- // reinsert conditional branch only, for now
- TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl);
}
- MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P));
}
+
+ // Remove the unconditional branch at the end, if any.
+ if (TBB && (Cond.empty() || FBB)) {
+ DebugLoc dl; // FIXME: this is nowhere
+ TII->RemoveBranch(*PBB);
+ if (!Cond.empty())
+ // reinsert conditional branch only, for now
+ TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl);
+ }
+
+ MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P));
}
- // If this is a large problem, avoid visiting the same basic blocks
- // multiple times.
- if (MergePotentials.size() == TailMergeThreshold)
- for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
- TriedMerging.insert(MergePotentials[i].getBlock());
- if (MergePotentials.size() >= 2)
- MadeChange |= TryTailMergeBlocks(IBB, PredBB);
- // Reinsert an unconditional branch if needed.
- // The 1 below can occur as a result of removing blocks in
- // TryTailMergeBlocks.
- PredBB = prior(I); // this may have been changed in TryTailMergeBlocks
- if (MergePotentials.size() == 1 &&
- MergePotentials.begin()->getBlock() != PredBB)
- FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
}
+
+ // If this is a large problem, avoid visiting the same basic blocks multiple
+ // times.
+ if (MergePotentials.size() == TailMergeThreshold)
+ for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+ TriedMerging.insert(MergePotentials[i].getBlock());
+
+ if (MergePotentials.size() >= 2)
+ MadeChange |= TryTailMergeBlocks(IBB, PredBB);
+
+ // Reinsert an unconditional branch if needed. The 1 below can occur as a
+ // result of removing blocks in TryTailMergeBlocks.
+ PredBB = prior(I); // this may have been changed in TryTailMergeBlocks
+ if (MergePotentials.size() == 1 &&
+ MergePotentials.begin()->getBlock() != PredBB)
+ FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
}
+
return MadeChange;
}
@@ -1459,7 +1466,7 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
}
/// findHoistingInsertPosAndDeps - Find the location to move common instructions
-/// in successors to. The location is ususally just before the terminator,
+/// in successors to. The location is usually just before the terminator,
/// however if the terminator is a conditional branch and its previous
/// instruction is the flag setting instruction, the previous instruction is
/// the preferred location. This function also gathers uses and defs of the
@@ -1483,9 +1490,8 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
if (!Reg)
continue;
if (MO.isUse()) {
- Uses.insert(Reg);
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS)
- Uses.insert(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
} else if (!MO.isDead())
// Don't try to hoist code in the rare case the terminator defines a
// register that is later used.
@@ -1545,18 +1551,16 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
if (!Reg)
continue;
if (MO.isUse()) {
- Uses.insert(Reg);
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS)
- Uses.insert(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
} else {
if (Uses.count(Reg)) {
Uses.erase(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- Uses.erase(*SR); // Use getSubRegisters to be conservative
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Uses.erase(*SubRegs); // Use sub-registers to be conservative
}
- Defs.insert(Reg);
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS)
- Defs.insert(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Defs.insert(*AI);
}
}
@@ -1683,8 +1687,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
unsigned Reg = MO.getReg();
if (!Reg || !LocalDefsSet.count(Reg))
continue;
- for (const uint16_t *OR = TRI->getOverlaps(Reg); *OR; ++OR)
- LocalDefsSet.erase(*OR);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ LocalDefsSet.erase(*AI);
}
// Track local defs so we can update liveins.
@@ -1696,8 +1700,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!Reg)
continue;
LocalDefs.push_back(Reg);
- for (const uint16_t *OR = TRI->getOverlaps(Reg); *OR; ++OR)
- LocalDefsSet.insert(*OR);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ LocalDefsSet.insert(*AI);
}
HasDups = true;
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 21729cd..2e189ad 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMCodeGen
DeadMachineInstructionElim.cpp
DFAPacketizer.cpp
DwarfEHPrepare.cpp
+ EarlyIfConversion.cpp
EdgeBundles.cpp
ExecutionDepsFix.cpp
ExpandISelPseudos.cpp
@@ -30,6 +31,7 @@ add_llvm_library(LLVMCodeGen
LiveInterval.cpp
LiveIntervalAnalysis.cpp
LiveIntervalUnion.cpp
+ LiveRegMatrix.cpp
LiveStackAnalysis.cpp
LiveVariables.cpp
LiveRangeCalc.cpp
@@ -59,6 +61,7 @@ add_llvm_library(LLVMCodeGen
MachineSSAUpdater.cpp
MachineScheduler.cpp
MachineSink.cpp
+ MachineTraceMetrics.cpp
MachineVerifier.cpp
OcamlGC.cpp
OptimizePHIs.cpp
@@ -77,8 +80,8 @@ add_llvm_library(LLVMCodeGen
RegAllocPBQP.cpp
RegisterClassInfo.cpp
RegisterCoalescer.cpp
+ RegisterPressure.cpp
RegisterScavenging.cpp
- RenderMachineFunction.cpp
ScheduleDAG.cpp
ScheduleDAGInstrs.cpp
ScheduleDAGPrinter.cpp
@@ -103,5 +106,7 @@ add_llvm_library(LLVMCodeGen
VirtRegMap.cpp
)
+add_dependencies(LLVMCodeGen intrinsics_gen)
+
add_subdirectory(SelectionDAG)
add_subdirectory(AsmPrinter)
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index ea16a25..939af3f 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -39,18 +39,20 @@ void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
MachineFunctionPass::getAnalysisUsage(au);
}
-bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) {
+bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
<< "********** Function: "
- << fn.getFunction()->getName() << '\n');
-
- LiveIntervals &lis = getAnalysis<LiveIntervals>();
- VirtRegAuxInfo vrai(fn, lis, getAnalysis<MachineLoopInfo>());
- for (LiveIntervals::iterator I = lis.begin(), E = lis.end(); I != E; ++I) {
- LiveInterval &li = *I->second;
- if (TargetRegisterInfo::isVirtualRegister(li.reg))
- vrai.CalculateWeightAndHint(li);
+ << MF.getFunction()->getName() << '\n');
+
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ VirtRegAuxInfo VRAI(MF, LIS, getAnalysis<MachineLoopInfo>());
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ VRAI.CalculateWeightAndHint(LIS.getInterval(Reg));
}
return false;
}
@@ -86,6 +88,27 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg,
return tri.getMatchingSuperReg(hreg, sub, rc);
}
+// Check if all values in LI are rematerializable
+static bool isRematerializable(const LiveInterval &LI,
+ const LiveIntervals &LIS,
+ const TargetInstrInfo &TII) {
+ for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+ I != E; ++I) {
+ const VNInfo *VNI = *I;
+ if (VNI->isUnused())
+ continue;
+ if (VNI->isPHIDef())
+ return false;
+
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ assert(MI && "Dead valno in interval");
+
+ if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis()))
+ return false;
+ }
+ return true;
+}
+
void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
MachineRegisterInfo &mri = MF.getRegInfo();
const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo();
@@ -171,17 +194,11 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
}
// If all of the definitions of the interval are re-materializable,
- // it is a preferred candidate for spilling. If none of the defs are
- // loads, then it's potentially very cheap to re-materialize.
+ // it is a preferred candidate for spilling.
// FIXME: this gets much more complicated once we support non-trivial
// re-materialization.
- bool isLoad = false;
- if (LIS.isReMaterializable(li, 0, isLoad)) {
- if (isLoad)
- totalWeight *= 0.9F;
- else
- totalWeight *= 0.5F;
- }
+ if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo()))
+ totalWeight *= 0.5F;
li.weight = normalizeSpillWeight(totalWeight, li.getSize());
}
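The new weighting above drops the old load/non-load distinction: any interval whose defs are all trivially rematerializable now gets a flat 50% discount. A worked comparison, assuming a hypothetical accumulated weight of 8.0:

    //   old policy: remat via load  -> 8.0 * 0.9F = 7.2
    //   old policy: remat, no load  -> 8.0 * 0.5F = 4.0
    //   new policy: any full remat  -> 8.0 * 0.5F = 4.0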
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 2b7dfdb..0b747fd 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -49,8 +49,7 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
Size = MinSize;
if (MinAlign > (int)Align)
Align = MinAlign;
- if (MF.getFrameInfo()->getMaxAlignment() < Align)
- MF.getFrameInfo()->setMaxAlignment(Align);
+ MF.getFrameInfo()->ensureMaxAlignment(Align);
TM.getTargetLowering()->HandleByVal(this, Size);
unsigned Offset = AllocateStack(Size, Align);
addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
@@ -58,9 +57,8 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
/// MarkAllocated - Mark a register and all of its aliases as allocated.
void CCState::MarkAllocated(unsigned Reg) {
- for (const uint16_t *Alias = TRI.getOverlaps(Reg);
- unsigned Reg = *Alias; ++Alias)
- UsedRegs[Reg/32] |= 1 << (Reg&31);
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ UsedRegs[*AI/32] |= 1 << (*AI&31);
}
/// AnalyzeFormalArguments - Analyze an array of argument values,
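MarkAllocated above packs one bit per physical register into the 32-bit words of UsedRegs. A worked example of the indexing, for a hypothetical Reg = 37:

    UsedRegs[37 / 32] |= 1 << (37 & 31);  // word 1, bit 5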
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index a81bb5c..fb2c2e8 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -23,6 +23,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeCalculateSpillWeightsPass(Registry);
initializeCodePlacementOptPass(Registry);
initializeDeadMachineInstructionElimPass(Registry);
+ initializeEarlyIfConverterPass(Registry);
initializeExpandPostRAPass(Registry);
initializeExpandISelPseudosPass(Registry);
initializeFinalizeMachineBundlesPass(Registry);
@@ -53,7 +54,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeProcessImplicitDefsPass(Registry);
initializePEIPass(Registry);
initializeRegisterCoalescerPass(Registry);
- initializeRenderMachineFunctionPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackProtectorPass(Registry);
initializeStackSlotColoringPass(Registry);
@@ -65,7 +65,9 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeUnreachableBlockElimPass(Registry);
initializeUnreachableMachineBlockElimPass(Registry);
initializeVirtRegMapPass(Registry);
+ initializeVirtRegRewriterPass(Registry);
initializeLowerIntrinsicsPass(Registry);
+ initializeMachineFunctionPrinterPassPass(Registry);
}
void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index c13c05e..99233df 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -201,7 +201,7 @@ bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF,
// fallthrough edge.
if (!Prior->isSuccessor(End))
goto next_pred;
- // Otherwise we can stop scanning and procede to move the blocks.
+ // Otherwise we can stop scanning and proceed to move the blocks.
break;
}
// If we hit a switch or something complicated, don't move anything
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index bad5010..a9de1c749 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -62,17 +62,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
// In a return block, examine the function live-out regs.
for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(),
E = MRI.liveout_end(); I != E; ++I) {
- unsigned Reg = *I;
- Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
- KillIndices[Reg] = BBSize;
- DefIndices[Reg] = ~0u;
-
- // Repeat, for all aliases.
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
- unsigned AliasReg = *Alias;
- Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
- KillIndices[AliasReg] = BBSize;
- DefIndices[AliasReg] = ~0u;
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BBSize;
+ DefIndices[Reg] = ~0u;
}
}
}
@@ -84,17 +78,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
SE = BB->succ_end(); SI != SE; ++SI)
for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
E = (*SI)->livein_end(); I != E; ++I) {
- unsigned Reg = *I;
- Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
- KillIndices[Reg] = BBSize;
- DefIndices[Reg] = ~0u;
-
- // Repeat, for all aliases.
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
- unsigned AliasReg = *Alias;
- Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
- KillIndices[AliasReg] = BBSize;
- DefIndices[AliasReg] = ~0u;
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BBSize;
+ DefIndices[Reg] = ~0u;
}
}
@@ -104,18 +92,12 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
const MachineFrameInfo *MFI = MF.getFrameInfo();
BitVector Pristine = MFI->getPristineRegs(BB);
for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
- unsigned Reg = *I;
- if (!IsReturnBlock && !Pristine.test(Reg)) continue;
- Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
- KillIndices[Reg] = BBSize;
- DefIndices[Reg] = ~0u;
-
- // Repeat, for all aliases.
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
- unsigned AliasReg = *Alias;
- Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
- KillIndices[AliasReg] = BBSize;
- DefIndices[AliasReg] = ~0u;
+ if (!IsReturnBlock && !Pristine.test(*I)) continue;
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BBSize;
+ DefIndices[Reg] = ~0u;
}
}
}
@@ -208,7 +190,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
const TargetRegisterClass *NewRC = 0;
if (i < MI->getDesc().getNumOperands())
- NewRC = TII->getRegClass(MI->getDesc(), i, TRI);
+ NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
// For now, only allow the register to be changed if its register
// class is consistent across all uses.
@@ -218,11 +200,11 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
// Now check for aliases.
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
// If an alias of the reg is used during the live range, give up.
// Note that this allows us to skip checking if AntiDepReg
// overlaps with any of the aliases, among other things.
- unsigned AliasReg = *Alias;
+ unsigned AliasReg = *AI;
if (Classes[AliasReg]) {
Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
@@ -236,9 +218,8 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
if (MO.isUse() && Special) {
if (!KeepRegs.test(Reg)) {
KeepRegs.set(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- KeepRegs.set(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ KeepRegs.set(*SubRegs);
}
}
}
@@ -247,7 +228,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
unsigned Count) {
// Update liveness.
- // Proceding upwards, registers that are defed but not used in this
+ // Proceeding upwards, registers that are defed but not used in this
// instruction are now dead.
if (!TII->isPredicated(MI)) {
@@ -282,9 +263,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
Classes[Reg] = 0;
RegRefs.erase(Reg);
// Repeat, for all subregs.
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg) {
- unsigned SubregReg = *Subreg;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubregReg = *SubRegs;
DefIndices[SubregReg] = Count;
KillIndices[SubregReg] = ~0u;
KeepRegs.reset(SubregReg);
@@ -292,11 +272,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
RegRefs.erase(SubregReg);
}
// Conservatively mark super-registers as unusable.
- for (const uint16_t *Super = TRI->getSuperRegisters(Reg);
- *Super; ++Super) {
- unsigned SuperReg = *Super;
- Classes[SuperReg] = reinterpret_cast<TargetRegisterClass *>(-1);
- }
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+ Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1);
}
}
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -308,7 +285,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
const TargetRegisterClass *NewRC = 0;
if (i < MI->getDesc().getNumOperands())
- NewRC = TII->getRegClass(MI->getDesc(), i, TRI);
+ NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
// For now, only allow the register to be changed if its register
// class is consistent across all uses.
@@ -328,8 +305,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
"Kill and Def maps aren't consistent for Reg!");
}
// Repeat, for all aliases.
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
- unsigned AliasReg = *Alias;
+ for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
if (KillIndices[AliasReg] == ~0u) {
KillIndices[AliasReg] = Count;
DefIndices[AliasReg] = ~0u;
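The same mechanical rewrite covers sub- and super-register walks; all three MC iterators share the isValid()/operator++ protocol. Isolated from the hunks above, with the sentinel noted:

    // Sketch of the sub/super-register idioms. The -1 cast is the pass's
    // sentinel for 'register class may no longer be changed'.
    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
      KeepRegs.set(*SubRegs);
    for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
      Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1);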
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h
index 7746259..ad95c48 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -17,11 +17,11 @@
#define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
#include "AntiDepBreaker.h"
-#include "RegisterClassInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/ADT/BitVector.h"
#include <map>
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index 5ff641c..ff2f113 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -23,10 +23,10 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
using namespace llvm;
@@ -100,22 +100,23 @@ void DFAPacketizer::reserveResources(llvm::MachineInstr *MI) {
reserveResources(&MID);
}
-namespace {
+namespace llvm {
// DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides
// Schedule method to build the dependence graph.
class DefaultVLIWScheduler : public ScheduleDAGInstrs {
public:
DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
- MachineDominatorTree &MDT, bool IsPostRA);
+ MachineDominatorTree &MDT, bool IsPostRA);
// Schedule - Actual scheduling work.
void schedule();
};
-} // end anonymous namespace
+}
DefaultVLIWScheduler::DefaultVLIWScheduler(
MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
bool IsPostRA) :
ScheduleDAGInstrs(MF, MLI, MDT, IsPostRA) {
+ CanHandleTerminators = true;
}
void DefaultVLIWScheduler::schedule() {
@@ -129,49 +130,25 @@ VLIWPacketizerList::VLIWPacketizerList(
bool IsPostRA) : TM(MF.getTarget()), MF(MF) {
TII = TM.getInstrInfo();
ResourceTracker = TII->CreateTargetScheduleState(&TM, 0);
- SchedulerImpl = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA);
+ VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA);
}
// VLIWPacketizerList Dtor
VLIWPacketizerList::~VLIWPacketizerList() {
- delete SchedulerImpl;
- delete ResourceTracker;
-}
-
-// ignorePseudoInstruction - ignore pseudo instructions.
-bool VLIWPacketizerList::ignorePseudoInstruction(MachineInstr *MI,
- MachineBasicBlock *MBB) {
- if (MI->isDebugValue())
- return true;
-
- if (TII->isSchedulingBoundary(MI, MBB, MF))
- return true;
-
- return false;
-}
-
-// isSoloInstruction - return true if instruction I must end previous
-// packet.
-bool VLIWPacketizerList::isSoloInstruction(MachineInstr *I) {
- if (I->isInlineAsm())
- return true;
-
- return false;
-}
+ if (VLIWScheduler)
+ delete VLIWScheduler;
-// addToPacket - Add I to the current packet and reserve resource.
-void VLIWPacketizerList::addToPacket(MachineInstr *MI) {
- CurrentPacketMIs.push_back(MI);
- ResourceTracker->reserveResources(MI);
+ if (ResourceTracker)
+ delete ResourceTracker;
}
// endPacket - End the current packet, bundle packet instructions and reset
// DFA state.
void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
- MachineInstr *I) {
+ MachineInstr *MI) {
if (CurrentPacketMIs.size() > 1) {
MachineInstr *MIFirst = CurrentPacketMIs.front();
- finalizeBundle(*MBB, MIFirst, I);
+ finalizeBundle(*MBB, MIFirst, MI);
}
CurrentPacketMIs.clear();
ResourceTracker->clearResources();
@@ -181,31 +158,35 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
MachineBasicBlock::iterator BeginItr,
MachineBasicBlock::iterator EndItr) {
- assert(MBB->end() == EndItr && "Bad EndIndex");
-
- SchedulerImpl->enterRegion(MBB, BeginItr, EndItr, MBB->size());
-
- // Build the DAG without reordering instructions.
- SchedulerImpl->schedule();
-
- // Remember scheduling units.
- SUnits = SchedulerImpl->SUnits;
+ assert(VLIWScheduler && "VLIW Scheduler is not initialized!");
+ VLIWScheduler->startBlock(MBB);
+ VLIWScheduler->enterRegion(MBB, BeginItr, EndItr, MBB->size());
+ VLIWScheduler->schedule();
+
+ // Generate MI -> SU map.
+ MIToSUnit.clear();
+ for (unsigned i = 0, e = VLIWScheduler->SUnits.size(); i != e; ++i) {
+ SUnit *SU = &VLIWScheduler->SUnits[i];
+ MIToSUnit[SU->getInstr()] = SU;
+ }
// The main packetizer loop.
for (; BeginItr != EndItr; ++BeginItr) {
MachineInstr *MI = BeginItr;
- // Ignore pseudo instructions.
- if (ignorePseudoInstruction(MI, MBB))
- continue;
+ this->initPacketizerState();
// End the current packet if needed.
- if (isSoloInstruction(MI)) {
+ if (this->isSoloInstruction(MI)) {
endPacket(MBB, MI);
continue;
}
- SUnit *SUI = SchedulerImpl->getSUnit(MI);
+ // Ignore pseudo instructions.
+ if (this->ignorePseudoInstruction(MI, MBB))
+ continue;
+
+ SUnit *SUI = MIToSUnit[MI];
assert(SUI && "Missing SUnit Info!");
// Ask DFA if machine resource is available for MI.
@@ -215,13 +196,13 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
VE = CurrentPacketMIs.end(); VI != VE; ++VI) {
MachineInstr *MJ = *VI;
- SUnit *SUJ = SchedulerImpl->getSUnit(MJ);
+ SUnit *SUJ = MIToSUnit[MJ];
assert(SUJ && "Missing SUnit Info!");
// Is it legal to packetize SUI and SUJ together.
- if (!isLegalToPacketizeTogether(SUI, SUJ)) {
+ if (!this->isLegalToPacketizeTogether(SUI, SUJ)) {
// Allow packetization if dependency can be pruned.
- if (!isLegalToPruneDependencies(SUI, SUJ)) {
+ if (!this->isLegalToPruneDependencies(SUI, SUJ)) {
// End the packet if dependency cannot be pruned.
endPacket(MBB, MI);
break;
@@ -234,11 +215,11 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
}
// Add MI to the current packet.
- addToPacket(MI);
+ BeginItr = this->addToPacket(MI);
} // For all instructions in BB.
// End any packet left behind.
endPacket(MBB, EndItr);
-
- SchedulerImpl->exitRegion();
+ VLIWScheduler->exitRegion();
+ VLIWScheduler->finishBlock();
}
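With the scheduler now exposed as VLIWScheduler, the packetizer stops asking the scheduler for SUnits and instead builds its own lookup table once schedule() has run. The map construction, isolated (the container type here is an assumption):

    // Sketch: map each MachineInstr back to its scheduling unit.
    std::map<MachineInstr*, SUnit*> MIToSUnit; // assumed container type
    for (unsigned i = 0, e = VLIWScheduler->SUnits.size(); i != e; ++i) {
      SUnit *SU = &VLIWScheduler->SUnits[i];
      MIToSUnit[SU->getInstr()] = SU;
    }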
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index aa10d1d..b4394e8 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -171,9 +171,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
// Check the subreg set, not the alias set, because a def
// of a super-register may still be partially live after
// this def.
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- *SubRegs; ++SubRegs)
- LivePhysRegs.reset(*SubRegs);
+ for (MCSubRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+ LivePhysRegs.reset(*SR);
}
} else if (MO.isRegMask()) {
// Register mask of preserved registers. All clobbers are dead.
@@ -187,10 +186,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
if (MO.isReg() && MO.isUse()) {
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
- LivePhysRegs.set(Reg);
- for (const uint16_t *AliasSet = TRI->getAliasSet(Reg);
- *AliasSet; ++AliasSet)
- LivePhysRegs.set(*AliasSet);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ LivePhysRegs.set(*AI);
}
}
}
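The comment in the first hunk deserves unpacking: when an instruction defines a register, only its sub-registers are certainly dead; a super-register may keep unrelated bits live. An x86-flavored illustration (register names are for illustration only):

    // Defining AX clobbers AL and AH (sub-registers), but EAX may still
    // hold live upper bits, so only the subreg set is cleared - never the
    // alias set, which would include EAX.
    for (MCSubRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
      LivePhysRegs.reset(*SR);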
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 944dd4f..7095624 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -39,7 +39,7 @@ namespace {
Constant *RewindFunction;
bool InsertUnwindResumeCalls(Function &Fn);
- Instruction *GetExceptionObject(ResumeInst *RI);
+ Value *GetExceptionObject(ResumeInst *RI);
public:
static char ID; // Pass identification, replacement for typeid.
@@ -68,9 +68,9 @@ FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm) {
/// GetExceptionObject - Return the exception object from the value passed into
/// the 'resume' instruction (typically an aggregate). Clean up any dead
/// instructions, including the 'resume' instruction.
-Instruction *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
+Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
Value *V = RI->getOperand(0);
- Instruction *ExnObj = 0;
+ Value *ExnObj = 0;
InsertValueInst *SelIVI = dyn_cast<InsertValueInst>(V);
LoadInst *SelLoad = 0;
InsertValueInst *ExcIVI = 0;
@@ -81,7 +81,7 @@ Instruction *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
ExcIVI = dyn_cast<InsertValueInst>(SelIVI->getOperand(0));
if (ExcIVI && isa<UndefValue>(ExcIVI->getOperand(0)) &&
ExcIVI->getNumIndices() == 1 && *ExcIVI->idx_begin() == 0) {
- ExnObj = cast<Instruction>(ExcIVI->getOperand(1));
+ ExnObj = ExcIVI->getOperand(1);
SelLoad = dyn_cast<LoadInst>(SelIVI->getOperand(1));
EraseIVIs = true;
}
@@ -139,7 +139,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
// _Unwind_Resume to the end of the single resume block.
ResumeInst *RI = Resumes.front();
BasicBlock *UnwindBB = RI->getParent();
- Instruction *ExnObj = GetExceptionObject(RI);
+ Value *ExnObj = GetExceptionObject(RI);
// Call the _Unwind_Resume function.
CallInst *CI = CallInst::Create(RewindFunction, ExnObj, "", UnwindBB);
@@ -162,7 +162,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
BasicBlock *Parent = RI->getParent();
BranchInst::Create(UnwindBB, Parent);
- Instruction *ExnObj = GetExceptionObject(RI);
+ Value *ExnObj = GetExceptionObject(RI);
PN->addIncoming(ExnObj, Parent);
++NumResumesLowered;
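The Instruction*-to-Value* loosening matters because the payload stored into the landing-pad aggregate by insertvalue need not be an instruction at all; it can be a function argument or a constant. A minimal sketch of the extraction under the shape checks above:

    // Sketch: peel the exception pointer out of the aggregate operand.
    Value *V = RI->getOperand(0);
    Value *ExnObj = 0;
    if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(V))
      ExnObj = IVI->getOperand(1); // may be an Argument, not an Instruction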
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
new file mode 100644
index 0000000..f9347ef
--- /dev/null
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -0,0 +1,803 @@
+//===-- EarlyIfConversion.cpp - If-conversion on SSA form machine code ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Early if-conversion is for out-of-order CPUs that don't have a lot of
+// predicable instructions. The goal is to eliminate conditional branches that
+// may mispredict.
+//
+// Instructions from both sides of the branch are executed speculatively, and a
+// cmov instruction selects the result.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "early-ifcvt"
+#include "MachineTraceMetrics.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned>
+BlockInstrLimit("early-ifcvt-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("stress-early-ifcvt", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumDiamondsSeen, "Number of diamonds");
+STATISTIC(NumDiamondsConv, "Number of diamonds converted");
+STATISTIC(NumTrianglesSeen, "Number of triangles");
+STATISTIC(NumTrianglesConv, "Number of triangles converted");
+
+//===----------------------------------------------------------------------===//
+// SSAIfConv
+//===----------------------------------------------------------------------===//
+//
+// The SSAIfConv class performs if-conversion on SSA form machine code after
+// determining if it is possible. The class contains no heuristics; external
+// code should be used to determine when if-conversion is a good idea.
+//
+// SSAIfConv can convert both triangles and diamonds:
+//
+//   Triangle: Head              Diamond: Head
+//              | \                       /  \_
+//              |  \                     /    |
+//              |  [TF]BB              FBB    TBB
+//              |  /                     \    /
+//              | /                       \  /
+//             Tail                        Tail
+//
+// Instructions in the conditional blocks TBB and/or FBB are spliced into the
+// Head block, and phis in the Tail block are converted to select instructions.
+//
+namespace {
+class SSAIfConv {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The block containing the conditional branch.
+ MachineBasicBlock *Head;
+
+ /// The block containing phis after the if-then-else.
+ MachineBasicBlock *Tail;
+
+ /// The 'true' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *TBB;
+
+ /// The 'false' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *FBB;
+
+ /// isTriangle - When there is no 'else' block, either TBB or FBB will be
+ /// equal to Tail.
+ bool isTriangle() const { return TBB == Tail || FBB == Tail; }
+
+ /// Returns the Tail predecessor for the True side.
+ MachineBasicBlock *getTPred() const { return TBB == Tail ? Head : TBB; }
+
+ /// Returns the Tail predecessor for the False side.
+ MachineBasicBlock *getFPred() const { return FBB == Tail ? Head : FBB; }
+
+ /// Information about each phi in the Tail block.
+ struct PHIInfo {
+ MachineInstr *PHI;
+ unsigned TReg, FReg;
+ // Latencies from Cond+Branch, TReg, and FReg to DstReg.
+ int CondCycles, TCycles, FCycles;
+
+ PHIInfo(MachineInstr *phi)
+ : PHI(phi), TReg(0), FReg(0), CondCycles(0), TCycles(0), FCycles(0) {}
+ };
+
+ SmallVector<PHIInfo, 8> PHIs;
+
+private:
+ /// The branch condition determined by AnalyzeBranch.
+ SmallVector<MachineOperand, 4> Cond;
+
+ /// Instructions in Head that define values used by the conditional blocks.
+ /// The hoisted instructions must be inserted after these instructions.
+ SmallPtrSet<MachineInstr*, 8> InsertAfter;
+
+ /// Register units clobbered by the conditional blocks.
+ BitVector ClobberedRegUnits;
+
+ // Scratch pad for findInsertionPoint.
+ SparseSet<unsigned> LiveRegUnits;
+
+ /// Insertion point in Head for speculatively executed instructions from TBB
+ /// and FBB.
+ MachineBasicBlock::iterator InsertionPoint;
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB);
+
+ /// Find a valid insertion point in Head.
+ bool findInsertionPoint();
+
+ /// Replace PHI instructions in Tail with selects.
+ void replacePHIInstrs();
+
+ /// Insert selects and rewrite PHI operands to use them.
+ void rewritePHIOperands();
+
+public:
+ /// runOnMachineFunction - Initialize per-function data structures.
+ void runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ LiveRegUnits.clear();
+ LiveRegUnits.setUniverse(TRI->getNumRegUnits());
+ ClobberedRegUnits.clear();
+ ClobberedRegUnits.resize(TRI->getNumRegUnits());
+ }
+
+ /// canConvertIf - If the sub-CFG headed by MBB can be if-converted,
+ /// initialize the internal state, and return true.
+ bool canConvertIf(MachineBasicBlock *MBB);
+
+ /// convertIf - If-convert the last block passed to canConvertIf(), assuming
+ /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks);
+};
+} // end anonymous namespace
+
+
+/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator(); I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I->isPHI()) {
+ DEBUG(dbgs() << "Can't hoist: " << *I);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (I->mayLoad()) {
+ DEBUG(dbgs() << "Won't speculate load: " << *I);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
+ DEBUG(dbgs() << "Can't speculate: " << *I);
+ return false;
+ }
+
+ // Check for any dependencies on Head instructions.
+ for (MIOperands MO(I); MO.isValid(); ++MO) {
+ if (MO->isRegMask()) {
+ DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+ return false;
+ }
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+
+ // Remember clobbered regunits.
+ if (MO->isDef() && TargetRegisterInfo::isPhysicalRegister(Reg))
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ ClobberedRegUnits.set(*Units);
+
+ if (!MO->readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI || DefMI->getParent() != Head)
+ continue;
+ if (InsertAfter.insert(DefMI))
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI);
+ if (DefMI->isTerminator()) {
+ DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+
+/// Find an insertion point in Head for the speculated instructions. The
+/// insertion point must be:
+///
+/// 1. Before any terminators.
+/// 2. After any instructions in InsertAfter.
+/// 3. Not have any clobbered regunits live.
+///
+/// This function sets InsertionPoint and returns true when successful; it
+/// returns false if no valid insertion point could be found.
+///
+bool SSAIfConv::findInsertionPoint() {
+ // Keep track of live regunits before the current position.
+ // Only track RegUnits that are also in ClobberedRegUnits.
+ LiveRegUnits.clear();
+ SmallVector<unsigned, 8> Reads;
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ MachineBasicBlock::iterator I = Head->end();
+ MachineBasicBlock::iterator B = Head->begin();
+ while (I != B) {
+ --I;
+ // Some of the conditional code depends on I.
+ if (InsertAfter.count(I)) {
+ DEBUG(dbgs() << "Can't insert code after " << *I);
+ return false;
+ }
+
+ // Update live regunits.
+ for (MIOperands MO(I); MO.isValid(); ++MO) {
+ // We're ignoring regmask operands. That is conservatively correct.
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ // I clobbers Reg, so it isn't live before I.
+ if (MO->isDef())
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ LiveRegUnits.erase(*Units);
+ // Unless I reads Reg.
+ if (MO->readsReg())
+ Reads.push_back(Reg);
+ }
+ // Anything read by I is live before I.
+ while (!Reads.empty())
+ for (MCRegUnitIterator Units(Reads.pop_back_val(), TRI); Units.isValid();
+ ++Units)
+ if (ClobberedRegUnits.test(*Units))
+ LiveRegUnits.insert(*Units);
+
+ // We can't insert before a terminator.
+ if (I != FirstTerm && I->isTerminator())
+ continue;
+
+ // Some of the clobbered registers are live before I, so this is not a
+ // valid insertion point.
+ if (!LiveRegUnits.empty()) {
+ DEBUG({
+ dbgs() << "Would clobber";
+ for (SparseSet<unsigned>::const_iterator
+ i = LiveRegUnits.begin(), e = LiveRegUnits.end(); i != e; ++i)
+ dbgs() << ' ' << PrintRegUnit(*i, TRI);
+ dbgs() << " live before " << *I;
+ });
+ continue;
+ }
+
+ // This is a valid insertion point.
+ InsertionPoint = I;
+ DEBUG(dbgs() << "Can insert before " << *I);
+ return true;
+ }
+ DEBUG(dbgs() << "No legal insertion point found.\n");
+ return false;
+}
+
+
+
+/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is
+/// a potential candidate for if-conversion. Fill out the internal state.
+///
+bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
+ Head = MBB;
+ TBB = FBB = Tail = 0;
+
+ if (Head->succ_size() != 2)
+ return false;
+ MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+ MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+ // Canonicalize so Succ0 has MBB as its single predecessor.
+ if (Succ0->pred_size() != 1)
+ std::swap(Succ0, Succ1);
+
+ if (Succ0->pred_size() != 1 || Succ0->succ_size() != 1)
+ return false;
+
+ Tail = Succ0->succ_begin()[0];
+
+ // This is not a triangle.
+ if (Tail != Succ1) {
+ // Check for a diamond. We won't deal with any critical edges.
+ if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 ||
+ Succ1->succ_begin()[0] != Tail)
+ return false;
+ DEBUG(dbgs() << "\nDiamond: BB#" << Head->getNumber()
+ << " -> BB#" << Succ0->getNumber()
+ << "/BB#" << Succ1->getNumber()
+ << " -> BB#" << Tail->getNumber() << '\n');
+
+ // Live-in physregs are tricky to get right when speculating code.
+ if (!Tail->livein_empty()) {
+ DEBUG(dbgs() << "Tail has live-ins.\n");
+ return false;
+ }
+ } else {
+ DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber()
+ << " -> BB#" << Succ0->getNumber()
+ << " -> BB#" << Tail->getNumber() << '\n');
+ }
+
+ // This is a triangle or a diamond.
+ // If Tail doesn't have any phis, there must be side effects.
+ if (Tail->empty() || !Tail->front().isPHI()) {
+ DEBUG(dbgs() << "No phis in tail.\n");
+ return false;
+ }
+
+ // The branch we're looking to eliminate must be analyzable.
+ Cond.clear();
+ if (TII->AnalyzeBranch(*Head, TBB, FBB, Cond)) {
+ DEBUG(dbgs() << "Branch not analyzable.\n");
+ return false;
+ }
+
+ // This is weird, probably some sort of degenerate CFG.
+ if (!TBB) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n");
+ return false;
+ }
+
+ // AnalyzeBranch doesn't set FBB on a fall-through branch.
+ // Make sure it is always set.
+ FBB = TBB == Succ0 ? Succ1 : Succ0;
+
+ // Any phis in the tail block must be convertible to selects.
+ PHIs.clear();
+ MachineBasicBlock *TPred = getTPred();
+ MachineBasicBlock *FPred = getFPred();
+ for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
+ I != E && I->isPHI(); ++I) {
+ PHIs.push_back(&*I);
+ PHIInfo &PI = PHIs.back();
+ // Find PHI operands corresponding to TPred and FPred.
+ for (unsigned i = 1; i != PI.PHI->getNumOperands(); i += 2) {
+ if (PI.PHI->getOperand(i+1).getMBB() == TPred)
+ PI.TReg = PI.PHI->getOperand(i).getReg();
+ if (PI.PHI->getOperand(i+1).getMBB() == FPred)
+ PI.FReg = PI.PHI->getOperand(i).getReg();
+ }
+ assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI");
+ assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI");
+
+ // Get target information.
+ if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg,
+ PI.CondCycles, PI.TCycles, PI.FCycles)) {
+ DEBUG(dbgs() << "Can't convert: " << *PI.PHI);
+ return false;
+ }
+ }
+
+ // Check that the conditional instructions can be speculated.
+ InsertAfter.clear();
+ ClobberedRegUnits.reset();
+ if (TBB != Tail && !canSpeculateInstrs(TBB))
+ return false;
+ if (FBB != Tail && !canSpeculateInstrs(FBB))
+ return false;
+
+ // Try to find a valid insertion point for the speculated instructions in the
+ // head basic block.
+ if (!findInsertionPoint())
+ return false;
+
+ if (isTriangle())
+ ++NumTrianglesSeen;
+ else
+ ++NumDiamondsSeen;
+ return true;
+}
+
+/// replacePHIInstrs - Completely replace PHI instructions with selects.
+/// This is possible when the only Tail predecessors are the if-converted
+/// blocks.
+void SSAIfConv::replacePHIInstrs() {
+ assert(Tail->pred_size() == 2 && "Cannot replace PHIs");
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ assert(FirstTerm != Head->end() && "No terminators");
+ DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+ // Convert all PHIs to select instructions inserted before FirstTerm.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ PHIInfo &PI = PHIs[i];
+ DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ assert(PI.PHI->getNumOperands() == 5 && "Unexpected PHI operands.");
+ unsigned DstReg = PI.PHI->getOperand(0).getReg();
+ TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg);
+ DEBUG(dbgs() << " --> " << *llvm::prior(FirstTerm));
+ PI.PHI->eraseFromParent();
+ PI.PHI = 0;
+ }
+}
+
+/// rewritePHIOperands - When there are additional Tail predecessors, insert
+/// select instructions in Head and rewrite PHI operands to use the selects.
+/// Keep the PHI instructions in Tail to handle the other predecessors.
+void SSAIfConv::rewritePHIOperands() {
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ assert(FirstTerm != Head->end() && "No terminators");
+ DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+ // Convert all PHIs to select instructions inserted before FirstTerm.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ PHIInfo &PI = PHIs[i];
+ DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ unsigned PHIDst = PI.PHI->getOperand(0).getReg();
+ unsigned DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst));
+ TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg);
+ DEBUG(dbgs() << " --> " << *llvm::prior(FirstTerm));
+
+ // Rewrite PHI operands TPred -> (DstReg, Head), remove FPred.
+ for (unsigned i = PI.PHI->getNumOperands(); i != 1; i -= 2) {
+ MachineBasicBlock *MBB = PI.PHI->getOperand(i-1).getMBB();
+ if (MBB == getTPred()) {
+ PI.PHI->getOperand(i-1).setMBB(Head);
+ PI.PHI->getOperand(i-2).setReg(DstReg);
+ } else if (MBB == getFPred()) {
+ PI.PHI->RemoveOperand(i-1);
+ PI.PHI->RemoveOperand(i-2);
+ }
+ }
+ DEBUG(dbgs() << " --> " << *PI.PHI);
+ }
+}
+
+/// convertIf - Execute the if conversion after canConvertIf has determined the
+/// feasibility.
+///
+/// Any basic blocks erased will be added to RemovedBlocks.
+///
+void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
+ assert(Head && Tail && TBB && FBB && "Call canConvertIf first.");
+
+ // Update statistics.
+ if (isTriangle())
+ ++NumTrianglesConv;
+ else
+ ++NumDiamondsConv;
+
+ // Move all instructions into Head, except for the terminators.
+ if (TBB != Tail)
+ Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator());
+ if (FBB != Tail)
+ Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator());
+
+ // Are there extra Tail predecessors?
+ bool ExtraPreds = Tail->pred_size() != 2;
+ if (ExtraPreds)
+ rewritePHIOperands();
+ else
+ replacePHIInstrs();
+
+ // Fix up the CFG, temporarily leave Head without any successors.
+ Head->removeSuccessor(TBB);
+ Head->removeSuccessor(FBB);
+ if (TBB != Tail)
+ TBB->removeSuccessor(Tail);
+ if (FBB != Tail)
+ FBB->removeSuccessor(Tail);
+
+ // Fix up Head's terminators.
+ // It should become a single branch or a fallthrough.
+ DebugLoc HeadDL = Head->getFirstTerminator()->getDebugLoc();
+ TII->RemoveBranch(*Head);
+
+ // Erase the now empty conditional blocks. It is likely that Head can fall
+ // through to Tail, and we can join the two blocks.
+ if (TBB != Tail) {
+ RemovedBlocks.push_back(TBB);
+ TBB->eraseFromParent();
+ }
+ if (FBB != Tail) {
+ RemovedBlocks.push_back(FBB);
+ FBB->eraseFromParent();
+ }
+
+ assert(Head->succ_empty() && "Additional head successors?");
+ if (!ExtraPreds && Head->isLayoutSuccessor(Tail)) {
+ // Splice Tail onto the end of Head.
+ DEBUG(dbgs() << "Joining tail BB#" << Tail->getNumber()
+ << " into head BB#" << Head->getNumber() << '\n');
+ Head->splice(Head->end(), Tail,
+ Tail->begin(), Tail->end());
+ Head->transferSuccessorsAndUpdatePHIs(Tail);
+ RemovedBlocks.push_back(Tail);
+ Tail->eraseFromParent();
+ } else {
+ // We need a branch to Tail, let code placement work it out later.
+ DEBUG(dbgs() << "Converting to unconditional branch.\n");
+ SmallVector<MachineOperand, 0> EmptyCond;
+ TII->InsertBranch(*Head, Tail, 0, EmptyCond, HeadDL);
+ Head->addSuccessor(Tail);
+ }
+ DEBUG(dbgs() << *Head);
+}
+
+
+//===----------------------------------------------------------------------===//
+// EarlyIfConverter Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class EarlyIfConverter : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MCSchedModel *SchedModel;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+ SSAIfConv IfConv;
+
+public:
+ static char ID;
+ EarlyIfConverter() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+ bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+ bool tryConvertIf(MachineBasicBlock*);
+ void updateDomTree(ArrayRef<MachineBasicBlock*> Removed);
+ void updateLoops(ArrayRef<MachineBasicBlock*> Removed);
+ void invalidateTraces();
+ bool shouldConvertIf();
+};
+} // end anonymous namespace
+
+char EarlyIfConverter::ID = 0;
+char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
+
+INITIALIZE_PASS_BEGIN(EarlyIfConverter,
+ "early-ifcvt", "Early If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(EarlyIfConverter,
+ "early-ifcvt", "Early If Converter", false, false)
+
+void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after if-conversion erased some blocks.
+void EarlyIfConverter::updateDomTree(ArrayRef<MachineBasicBlock*> Removed) {
+ // convertIf can remove TBB and FBB, and Tail can be merged into Head.
+ // TBB and FBB should not dominate any blocks.
+ // Tail children should be transferred to Head.
+ MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head);
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
+ MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+ assert(Node != HeadNode && "Cannot erase the head node");
+ while (Node->getNumChildren()) {
+ assert(Node->getBlock() == IfConv.Tail && "Unexpected children");
+ DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ }
+ DomTree->eraseNode(Removed[i]);
+ }
+}
+
+/// Update LoopInfo after if-conversion.
+void EarlyIfConverter::updateLoops(ArrayRef<MachineBasicBlock*> Removed) {
+ if (!Loops)
+ return;
+ // If-conversion doesn't change loop structure, and it doesn't mess with back
+ // edges, so updating LoopInfo is simply removing the dead blocks.
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i)
+ Loops->removeBlock(Removed[i]);
+}
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void EarlyIfConverter::invalidateTraces() {
+ Traces->verifyAnalysis();
+ Traces->invalidate(IfConv.Head);
+ Traces->invalidate(IfConv.Tail);
+ Traces->invalidate(IfConv.TBB);
+ Traces->invalidate(IfConv.FBB);
+ Traces->verifyAnalysis();
+}
+
+// Adjust cycles with downward saturation.
+static unsigned adjCycles(unsigned Cyc, int Delta) {
+ if (Delta < 0 && Cyc + Delta > Cyc)
+ return 0;
+ return Cyc + Delta;
+}
+
+/// Apply cost model and heuristics to the if-conversion in IfConv.
+/// Return true if the conversion is a good idea.
+///
+bool EarlyIfConverter::shouldConvertIf() {
+ // Stress testing mode disables all cost considerations.
+ if (Stress)
+ return true;
+
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+ MachineTraceMetrics::Trace TBBTrace = MinInstr->getTrace(IfConv.getTPred());
+ MachineTraceMetrics::Trace FBBTrace = MinInstr->getTrace(IfConv.getFPred());
+ DEBUG(dbgs() << "TBB: " << TBBTrace << "FBB: " << FBBTrace);
+ unsigned MinCrit = std::min(TBBTrace.getCriticalPath(),
+ FBBTrace.getCriticalPath());
+
+ // Set a somewhat arbitrary limit on the critical path extension we accept.
+ unsigned CritLimit = SchedModel->MispredictPenalty/2;
+
+ // If-conversion only makes sense when there is unexploited ILP. Compute the
+ // maximum-ILP resource length of the trace after if-conversion. Compare it
+ // to the shortest critical path.
+ SmallVector<const MachineBasicBlock*, 1> ExtraBlocks;
+ if (IfConv.TBB != IfConv.Tail)
+ ExtraBlocks.push_back(IfConv.TBB);
+ unsigned ResLength = FBBTrace.getResourceLength(ExtraBlocks);
+ DEBUG(dbgs() << "Resource length " << ResLength
+ << ", minimal critical path " << MinCrit << '\n');
+ if (ResLength > MinCrit + CritLimit) {
+ DEBUG(dbgs() << "Not enough available ILP.\n");
+ return false;
+ }
+
+ // Assume that the depth of the first head terminator will also be the depth
+ // of the select instruction inserted, as determined by the flag dependency.
+ // TBB / FBB data dependencies may delay the select even more.
+ MachineTraceMetrics::Trace HeadTrace = MinInstr->getTrace(IfConv.Head);
+ unsigned BranchDepth =
+ HeadTrace.getInstrCycles(IfConv.Head->getFirstTerminator()).Depth;
+ DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n');
+
+ // Look at all the tail phis, and compute the critical path extension caused
+ // by inserting select instructions.
+ MachineTraceMetrics::Trace TailTrace = MinInstr->getTrace(IfConv.Tail);
+ for (unsigned i = 0, e = IfConv.PHIs.size(); i != e; ++i) {
+ SSAIfConv::PHIInfo &PI = IfConv.PHIs[i];
+ unsigned Slack = TailTrace.getInstrSlack(PI.PHI);
+ unsigned MaxDepth = Slack + TailTrace.getInstrCycles(PI.PHI).Depth;
+ DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI);
+
+ // The condition is pulled into the critical path.
+ unsigned CondDepth = adjCycles(BranchDepth, PI.CondCycles);
+ if (CondDepth > MaxDepth) {
+ unsigned Extra = CondDepth - MaxDepth;
+ DEBUG(dbgs() << "Condition adds " << Extra << " cycles.\n");
+ if (Extra > CritLimit) {
+ DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ return false;
+ }
+ }
+
+ // The TBB value is pulled into the critical path.
+ unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(PI.PHI), PI.TCycles);
+ if (TDepth > MaxDepth) {
+ unsigned Extra = TDepth - MaxDepth;
+ DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n");
+ if (Extra > CritLimit) {
+ DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ return false;
+ }
+ }
+
+ // The FBB value is pulled into the critical path.
+ unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(PI.PHI), PI.FCycles);
+ if (FDepth > MaxDepth) {
+ unsigned Extra = FDepth - MaxDepth;
+ DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n");
+ if (Extra > CritLimit) {
+ DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/// Attempt repeated if-conversion on MBB, return true if successful.
+///
+bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (IfConv.canConvertIf(MBB) && shouldConvertIf()) {
+ // If-convert MBB and update analyses.
+ invalidateTraces();
+ SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
+ IfConv.convertIf(RemovedBlocks);
+ Changed = true;
+ updateDomTree(RemovedBlocks);
+ updateLoops(RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
+ << "********** Function: "
+ << ((Value*)MF.getFunction())->getName() << '\n');
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ SchedModel = MF.getTarget().getInstrItineraryData()->SchedModel;
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = 0;
+
+ bool Changed = false;
+ IfConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree post-order. The post-order enables nested
+ // if-conversion in a single pass. The tryConvertIf() function may erase
+ // blocks, but only blocks dominated by the head block. This makes it safe to
+ // update the dominator tree while the post-order iterator is still active.
+ for (po_iterator<MachineDominatorTree*>
+ I = po_begin(DomTree), E = po_end(DomTree); I != E; ++I)
+ if (tryConvertIf(I->getBlock()))
+ Changed = true;
+
+ MF.verify(this, "After early if-conversion");
+ return Changed;
+}
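One subtlety in the new file: adjCycles implements downward saturation through deliberate unsigned wraparound. A worked example:

    // With Cyc = 2 and Delta = -5, the unsigned sum wraps past zero, which
    // the 'Cyc + Delta > Cyc' test detects, clamping the result to 0.
    unsigned Cyc = 2;
    int Delta = -5;
    unsigned Sum = Cyc + Delta;                        // 0xFFFFFFFD
    unsigned Adj = (Delta < 0 && Sum > Cyc) ? 0 : Sum; // == 0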
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index a48c540..fee8e47 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -59,7 +59,7 @@ struct DomainValue {
// Pointer to the next DomainValue in a chain. When two DomainValues are
// merged, Victim.Next is set to point to Victor, so old DomainValue
- // references can be updated by folowing the chain.
+ // references can be updated by following the chain.
DomainValue *Next;
// Twiddleable instructions using or defining these registers.
@@ -666,7 +666,8 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
// or -1.
AliasMap.resize(TRI->getNumRegs(), -1);
for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
- for (const uint16_t *AI = TRI->getOverlaps(RC->getRegister(i)); *AI; ++AI)
+ for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
+ AI.isValid(); ++AI)
AliasMap[*AI] = i;
}
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index b14afc2..7a17331 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -131,13 +131,16 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
} else {
TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg,
MI->getOperand(2).isKill());
+
+ // Implicitly define DstReg for subsequent uses.
+ MachineBasicBlock::iterator CopyMI = MI;
+ --CopyMI;
+ CopyMI->addRegisterDefined(DstReg);
+
// Transfer the kill/dead flags, if needed.
if (MI->getOperand(0).isDead())
TransferDeadFlag(MI, DstSubReg, TRI);
- DEBUG({
- MachineBasicBlock::iterator dMI = MI;
- dbgs() << "subreg: " << *(--dMI);
- });
+ DEBUG(dbgs() << "subreg: " << *CopyMI);
}
DEBUG(dbgs() << '\n');
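The ExpandPostRAPseudos fix targets SUBREG_TO_REG lowering: when a real copy is emitted, only the sub-register was defined, so downstream passes could consider the super-register dead. Backing up to the freshly inserted copy and adding an implicit def, as the hunk does:

    // Sketch: mark the whole destination register as defined by the copy.
    MachineBasicBlock::iterator CopyMI = MI;
    --CopyMI;                           // the copy inserted just before MI
    CopyMI->addRegisterDefined(DstReg); // implicit-def of the super-register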
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 75ae5b9..4214ba1 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -155,7 +156,9 @@ namespace {
const TargetRegisterInfo *TRI;
const InstrItineraryData *InstrItins;
const MachineBranchProbabilityInfo *MBPI;
+ MachineRegisterInfo *MRI;
+ bool PreRegAlloc;
bool MadeChange;
int FnNum;
public:
@@ -263,14 +266,20 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getTarget().getInstrInfo();
TRI = MF.getTarget().getRegisterInfo();
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MRI = &MF.getRegInfo();
InstrItins = MF.getTarget().getInstrItineraryData();
if (!TII) return false;
- // Tail merge tend to expose more if-conversion opportunities.
- BranchFolder BF(true, false);
- bool BFChange = BF.OptimizeFunction(MF, TII,
+ PreRegAlloc = MRI->isSSA();
+
+ bool BFChange = false;
+ if (!PreRegAlloc) {
+ // Tail merge tends to expose more if-conversion opportunities.
+ BranchFolder BF(true, false);
+ BFChange = BF.OptimizeFunction(MF, TII,
MF.getTarget().getRegisterInfo(),
getAnalysisIfAvailable<MachineModuleInfo>());
+ }
DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
<< MF.getFunction()->getName() << "\'");
@@ -621,7 +630,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
if (BBI.IsDone)
return;
- bool AlreadyPredicated = BBI.Predicate.size() > 0;
+ bool AlreadyPredicated = !BBI.Predicate.empty();
// First analyze the end of BB branches.
BBI.TrueBB = BBI.FalseBB = NULL;
BBI.BrCond.clear();
@@ -786,8 +795,8 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
unsigned Dups = 0;
unsigned Dups2 = 0;
- bool TNeedSub = TrueBBI.Predicate.size() > 0;
- bool FNeedSub = FalseBBI.Predicate.size() > 0;
+ bool TNeedSub = !TrueBBI.Predicate.empty();
+ bool FNeedSub = !FalseBBI.Predicate.empty();
bool Enqueued = false;
BranchProbability Prediction = MBPI->getEdgeProbability(BB, TrueBBI.BB);
@@ -962,9 +971,8 @@ static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
E = BB->livein_end(); I != E; ++I) {
unsigned Reg = *I;
Redefs.insert(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- Redefs.insert(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.insert(*SubRegs);
}
}
@@ -983,8 +991,8 @@ static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
Defs.push_back(Reg);
else if (MO.isKill()) {
Redefs.erase(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- Redefs.erase(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.erase(*SubRegs);
}
}
for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
@@ -993,11 +1001,12 @@ static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
if (AddImpUse)
// Treat predicated update as read + write.
MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
- true/*IsImp*/,false/*IsKill*/));
+ true/*IsImp*/,false/*IsKill*/,
+ false/*IsDead*/,true/*IsUndef*/));
} else {
Redefs.insert(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- Redefs.insert(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.insert(*SubRegs);
}
}
}
@@ -1335,8 +1344,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
// These are defined before ctrl flow reach the 'false' instructions.
// They cannot be modified by the 'true' instructions.
ExtUses.insert(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- ExtUses.insert(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ ExtUses.insert(*SubRegs);
}
}
@@ -1344,8 +1353,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
unsigned Reg = Defs[i];
if (!ExtUses.count(Reg)) {
RedefsByFalse.insert(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- RedefsByFalse.insert(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ RedefsByFalse.insert(*SubRegs);
}
}
}
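The key change in IfConversion.cpp is the PreRegAlloc gate: MRI->isSSA() distinguishes a pre-regalloc run from the traditional post-regalloc run, and tail merging is only attempted in the latter. In outline (the BranchFolder argument names are assumptions):

    // Sketch: skip tail merging while still in SSA form.
    bool PreRegAlloc = MRI->isSSA();
    if (!PreRegAlloc) {
      BranchFolder BF(true, false); // (EnableTailMerge, CommonHoist) assumed
      BFChange = BF.OptimizeFunction(MF, TII, TRI, MMI);
    }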
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index d5ea666..07e37af 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -52,7 +52,6 @@ static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
namespace {
class InlineSpiller : public Spiller {
- MachineFunctionPass &Pass;
MachineFunction &MF;
LiveIntervals &LIS;
LiveStacks &LSS;
@@ -137,8 +136,7 @@ public:
InlineSpiller(MachineFunctionPass &pass,
MachineFunction &mf,
VirtRegMap &vrm)
- : Pass(pass),
- MF(mf),
+ : MF(mf),
LIS(pass.getAnalysis<LiveIntervals>()),
LSS(pass.getAnalysis<LiveStacks>()),
AA(&pass.getAnalysis<AliasAnalysis>()),
@@ -578,11 +576,11 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
if (isSibling(SrcReg)) {
LiveInterval &SrcLI = LIS.getInterval(SrcReg);
- LiveRange *SrcLR = SrcLI.getLiveRangeContaining(VNI->def.getRegSlot(true));
- assert(SrcLR && "Copy from non-existing value");
+ LiveRangeQuery SrcQ(SrcLI, VNI->def);
+ assert(SrcQ.valueIn() && "Copy from non-existing value");
// Check if this COPY kills its source.
- SVI->second.KillsSource = (SrcLR->end == VNI->def);
- VNInfo *SrcVNI = SrcLR->valno;
+ SVI->second.KillsSource = SrcQ.isKill();
+ VNInfo *SrcVNI = SrcQ.valueIn();
DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':'
<< SrcVNI->id << '@' << SrcVNI->def
<< " kill=" << unsigned(SVI->second.KillsSource) << '\n');
@@ -1083,6 +1081,10 @@ void InlineSpiller::insertReload(LiveInterval &NewLI,
MRI.getRegClass(NewLI.reg), &TRI);
--MI; // Point to load instruction.
SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
+ // Some (out-of-tree) targets have EC reload instructions.
+ if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg))
+ if (MO->isEarlyClobber())
+ LoadIdx = LoadIdx.getRegSlot(true);
DEBUG(dbgs() << "\treload: " << LoadIdx << '\t' << *MI);
VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator());
NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
@@ -1275,8 +1277,8 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
DEBUG(dbgs() << "Inline spilling "
<< MRI.getRegClass(edit.getReg())->getName()
- << ':' << edit.getParent() << "\nFrom original "
- << LIS.getInterval(Original) << '\n');
+ << ':' << PrintReg(edit.getReg()) << ' ' << edit.getParent()
+ << "\nFrom original " << LIS.getInterval(Original) << '\n');
assert(edit.getParent().isSpillable() &&
"Attempting to spill already spilled value.");
assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
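InlineSpiller's sibling-value tracing now goes through LiveRangeQuery, which bundles the 'what value is live here, and is it killed here' questions that previously needed a manual getLiveRangeContaining lookup:

    // Sketch of the query idiom from the hunk above.
    LiveRangeQuery SrcQ(SrcLI, VNI->def); // query at the copy's def slot
    VNInfo *SrcVNI = SrcQ.valueIn();      // value live into the copy
    bool KillsSource = SrcQ.isKill();     // true if the copy ends the range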
diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp
index 8368b58..1541bf0 100644
--- a/lib/CodeGen/InterferenceCache.cpp
+++ b/lib/CodeGen/InterferenceCache.cpp
@@ -39,7 +39,7 @@ InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) {
unsigned E = PhysRegEntries[PhysReg];
if (E < CacheEntries && Entries[E].getPhysReg() == PhysReg) {
if (!Entries[E].valid(LIUArray, TRI))
- Entries[E].revalidate();
+ Entries[E].revalidate(LIUArray, TRI);
return &Entries[E];
}
// No valid entry exists, pick the next round-robin entry.
@@ -61,13 +61,15 @@ InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) {
}
/// revalidate - LIU contents have changed, update tags.
-void InterferenceCache::Entry::revalidate() {
+void InterferenceCache::Entry::revalidate(LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI) {
// Invalidate all block entries.
++Tag;
// Invalidate all iterators.
PrevPos = SlotIndex();
- for (unsigned i = 0, e = Aliases.size(); i != e; ++i)
- Aliases[i].second = Aliases[i].first->getTag();
+ unsigned i = 0;
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i)
+ RegUnits[i].VirtTag = LIUArray[*Units].getTag();
}
void InterferenceCache::Entry::reset(unsigned physReg,
@@ -79,28 +81,23 @@ void InterferenceCache::Entry::reset(unsigned physReg,
++Tag;
PhysReg = physReg;
Blocks.resize(MF->getNumBlockIDs());
- Aliases.clear();
- for (const uint16_t *AS = TRI->getOverlaps(PhysReg); *AS; ++AS) {
- LiveIntervalUnion *LIU = LIUArray + *AS;
- Aliases.push_back(std::make_pair(LIU, LIU->getTag()));
- }
// Reset iterators.
PrevPos = SlotIndex();
- unsigned e = Aliases.size();
- Iters.resize(e);
- for (unsigned i = 0; i != e; ++i)
- Iters[i].setMap(Aliases[i].first->getMap());
+ RegUnits.clear();
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ RegUnits.push_back(LIUArray[*Units]);
+ RegUnits.back().Fixed = &LIS->getRegUnit(*Units);
+ }
}
bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray,
const TargetRegisterInfo *TRI) {
- unsigned i = 0, e = Aliases.size();
- for (const uint16_t *AS = TRI->getOverlaps(PhysReg); *AS; ++AS, ++i) {
- LiveIntervalUnion *LIU = LIUArray + *AS;
- if (i == e || Aliases[i].first != LIU)
+ unsigned i = 0, e = RegUnits.size();
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) {
+ if (i == e)
return false;
- if (LIU->changedSince(Aliases[i].second))
+ if (LIUArray[*Units].changedSince(RegUnits[i].VirtTag))
return false;
}
return i == e;
@@ -112,12 +109,20 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
// Use advanceTo only when possible.
if (PrevPos != Start) {
- if (!PrevPos.isValid() || Start < PrevPos)
- for (unsigned i = 0, e = Iters.size(); i != e; ++i)
- Iters[i].find(Start);
- else
- for (unsigned i = 0, e = Iters.size(); i != e; ++i)
- Iters[i].advanceTo(Start);
+ if (!PrevPos.isValid() || Start < PrevPos) {
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ RegUnitInfo &RUI = RegUnits[i];
+ RUI.VirtI.find(Start);
+ RUI.FixedI = RUI.Fixed->find(Start);
+ }
+ } else {
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ RegUnitInfo &RUI = RegUnits[i];
+ RUI.VirtI.advanceTo(Start);
+ if (RUI.FixedI != RUI.Fixed->end())
+ RUI.FixedI = RUI.Fixed->advanceTo(RUI.FixedI, Start);
+ }
+ }
PrevPos = Start;
}
@@ -129,9 +134,9 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
BI->Tag = Tag;
BI->First = BI->Last = SlotIndex();
- // Check for first interference.
- for (unsigned i = 0, e = Iters.size(); i != e; ++i) {
- Iter &I = Iters[i];
+ // Check for first interference from virtregs.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
if (!I.valid())
continue;
SlotIndex StartI = I.start();
@@ -141,6 +146,19 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
BI->First = StartI;
}
+ // Same thing for fixed interference.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveInterval::const_iterator I = RegUnits[i].FixedI;
+ LiveInterval::const_iterator E = RegUnits[i].Fixed->end();
+ if (I == E)
+ continue;
+ SlotIndex StartI = I->start;
+ if (StartI >= Stop)
+ continue;
+ if (!BI->First.isValid() || StartI < BI->First)
+ BI->First = StartI;
+ }
+
// Also check for register mask interference.
RegMaskSlots = LIS->getRegMaskSlotsInBlock(MBBNum);
RegMaskBits = LIS->getRegMaskBitsInBlock(MBBNum);
@@ -168,8 +186,8 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
}
// Check for last interference in block.
- for (unsigned i = 0, e = Iters.size(); i != e; ++i) {
- Iter &I = Iters[i];
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
if (!I.valid() || I.start() >= Stop)
continue;
I.advanceTo(Stop);
@@ -183,6 +201,23 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
++I;
}
+ // Fixed interference.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveInterval::iterator &I = RegUnits[i].FixedI;
+ LiveInterval *LI = RegUnits[i].Fixed;
+ if (I == LI->end() || I->start >= Stop)
+ continue;
+ I = LI->advanceTo(I, Stop);
+ bool Backup = I == LI->end() || I->start >= Stop;
+ if (Backup)
+ --I;
+ SlotIndex StopI = I->end;
+ if (!BI->Last.isValid() || StopI > BI->Last)
+ BI->Last = StopI;
+ if (Backup)
+ ++I;
+ }
+
// Also check for register mask interference.
SlotIndex Limit = BI->Last.isValid() ? BI->Last : Start;
for (unsigned i = RegMaskSlots.size();
diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h
index 485a325..3c928a5 100644
--- a/lib/CodeGen/InterferenceCache.h
+++ b/lib/CodeGen/InterferenceCache.h
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// InterferenceCache remembers per-block interference in LiveIntervalUnions.
+// InterferenceCache remembers per-block interference from LiveIntervalUnions,
+// fixed RegUnit interference, and register masks.
//
//===----------------------------------------------------------------------===//
@@ -59,14 +60,31 @@ class InterferenceCache {
/// PrevPos - The previous position the iterators were moved to.
SlotIndex PrevPos;
- /// AliasTags - A LiveIntervalUnion pointer and tag for each alias of
- /// PhysReg.
- SmallVector<std::pair<LiveIntervalUnion*, unsigned>, 8> Aliases;
+ /// RegUnitInfo - Information tracked about each RegUnit in PhysReg.
+ /// When PrevPos is set, the iterators are valid as if advanceTo(PrevPos)
+ /// had just been called.
+ struct RegUnitInfo {
+ /// Iterator pointing into the LiveIntervalUnion containing virtual
+ /// register interference.
+ LiveIntervalUnion::SegmentIter VirtI;
- typedef LiveIntervalUnion::SegmentIter Iter;
+ /// Tag of the LIU last time we looked.
+ unsigned VirtTag;
- /// Iters - an iterator for each alias
- SmallVector<Iter, 8> Iters;
+ /// Fixed interference in RegUnit.
+ LiveInterval *Fixed;
+
+ /// Iterator pointing into the fixed RegUnit interference.
+ LiveInterval::iterator FixedI;
+
+ RegUnitInfo(LiveIntervalUnion &LIU) : VirtTag(LIU.getTag()), Fixed(0) {
+ VirtI.setMap(LIU.getMap());
+ }
+ };
+
+  /// Info for each RegUnit in PhysReg. It is very rare for a PhysReg to
+  /// have more than 4 RegUnits.
+ SmallVector<RegUnitInfo, 4> RegUnits;
/// Blocks - Interference for each block in the function.
SmallVector<BlockInterference, 8> Blocks;
@@ -91,7 +109,7 @@ class InterferenceCache {
bool hasRefs() const { return RefCount > 0; }
- void revalidate();
+ void revalidate(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
/// valid - Return true if this is a valid entry for physReg.
bool valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index a9ca42f..8d2282a 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -11,17 +11,17 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
-#include "llvm/ADT/SmallVector.h"
using namespace llvm;
template <class ArgIt>
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index a1f479a..cac0c83 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -13,6 +13,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/PassManager.h"
+#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -78,40 +79,15 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
"and that InitializeAllTargetMCs() is being invoked!");
}
-/// Turn exception handling constructs into something the code generators can
-/// handle.
-static void addPassesToHandleExceptions(TargetMachine *TM,
- PassManagerBase &PM) {
- switch (TM->getMCAsmInfo()->getExceptionHandlingType()) {
- case ExceptionHandling::SjLj:
- // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both
- // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
- // catch info can get misplaced when a selector ends up more than one block
- // removed from the parent invoke(s). This could happen when a landing
- // pad is shared by multiple invokes and is also a target of a normal
- // edge from elsewhere.
- PM.add(createSjLjEHPreparePass(TM->getTargetLowering()));
- // FALLTHROUGH
- case ExceptionHandling::DwarfCFI:
- case ExceptionHandling::ARM:
- case ExceptionHandling::Win64:
- PM.add(createDwarfEHPass(TM));
- break;
- case ExceptionHandling::None:
- PM.add(createLowerInvokePass(TM->getTargetLowering()));
-
- // The lower invoke pass may create unreachable code. Remove it.
- PM.add(createUnreachableBlockEliminationPass());
- break;
- }
-}
-
/// addPassesToX helper drives creation and initialization of TargetPassConfig.
static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
PassManagerBase &PM,
- bool DisableVerify) {
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter) {
  // Targets may override createPassConfig to provide a target-specific subclass.
TargetPassConfig *PassConfig = TM->createPassConfig(PM);
+ PassConfig->setStartStopPasses(StartAfter, StopAfter);
// Set PassConfig options provided by TargetMachine.
PassConfig->setDisableVerify(DisableVerify);
@@ -120,7 +96,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
PassConfig->addIRPasses();
- addPassesToHandleExceptions(TM, PM);
+ PassConfig->addPassesToHandleExceptions();
PassConfig->addISelPrepare();
@@ -155,16 +131,30 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
formatted_raw_ostream &Out,
CodeGenFileType FileType,
- bool DisableVerify) {
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter) {
// Add common CodeGen passes.
- MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify);
+ MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify,
+ StartAfter, StopAfter);
if (!Context)
return true;
+ if (StopAfter) {
+    // FIXME: The intent is that this should eventually write out a YAML file
+ // containing the LLVM IR, the machine-level IR (when stopping after a
+ // machine-level pass), and whatever other information is needed to
+ // deserialize the code and resume compilation. For now, just write the
+ // LLVM IR.
+ PM.add(createPrintModulePass(&Out));
+ return false;
+ }
+
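A hedged sketch of how a caller might use the new StartAfter/StopAfter parameters; TM (an LLVMTargetMachine *), the Module M, and the pass ID StopID are assumed for illustration, not taken from this patch:

    // Halt codegen after the pass named by StopID; per the block above, the
    // module is then printed as LLVM IR rather than lowered to assembly.
    PassManager PM;
    formatted_raw_ostream FOS(outs());
    if (TM->addPassesToEmitFile(PM, FOS, TargetMachine::CGFT_AssemblyFile,
                                /*DisableVerify=*/true,
                                /*StartAfter=*/0, /*StopAfter=*/StopID))
      report_fatal_error("target does not support file emission");
    PM.run(*M);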
if (hasMCSaveTempLabels())
Context->setAllowTemporaryLabels(false);
const MCAsmInfo &MAI = *getMCAsmInfo();
+ const MCRegisterInfo &MRI = *getRegisterInfo();
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
OwningPtr<MCStreamer> AsmStreamer;
@@ -180,7 +170,8 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
MCAsmBackend *MAB = 0;
if (ShowMCEncoding) {
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
- MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI, *Context);
+ MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, STI,
+ *Context);
MAB = getTarget().createMCAsmBackend(getTargetTriple());
}
@@ -198,8 +189,8 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
case CGFT_ObjectFile: {
// Create the code emitter for the target if it exists. If not, .o file
// emission fails.
- MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI,
- *Context);
+ MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
+ STI, *Context);
MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
if (MCE == 0 || MAB == 0)
return true;
@@ -242,7 +233,7 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
JITCodeEmitter &JCE,
bool DisableVerify) {
// Add common CodeGen passes.
- MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify);
+ MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0);
if (!Context)
return true;
@@ -262,7 +253,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
raw_ostream &Out,
bool DisableVerify) {
// Add common CodeGen passes.
- Ctx = addPassesToGenerateCode(this, PM, DisableVerify);
+ Ctx = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0);
if (!Ctx)
return true;
@@ -271,9 +262,10 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
// Create the code emitter for the target if it exists. If not, .o file
// emission fails.
+ const MCRegisterInfo &MRI = *getRegisterInfo();
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
- MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(),STI,
- *Ctx);
+ MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
+ STI, *Ctx);
MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
if (MCE == 0 || MAB == 0)
return true;
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index f1abcbb..6b6b9d0 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -16,8 +16,8 @@
#define DEBUG_TYPE "lexicalscopes"
#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 2187833..d631726 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -23,9 +23,9 @@
#include "LiveDebugVariables.h"
#include "VirtRegMap.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Metadata.h"
#include "llvm/Value.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LexicalScopes.h"
@@ -243,7 +243,7 @@ public:
/// computeIntervals - Compute the live intervals of all locations after
/// collecting all their def points.
- void computeIntervals(MachineRegisterInfo &MRI,
+ void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
LiveIntervals &LIS, MachineDominatorTree &MDT,
UserValueScopes &UVS);
@@ -618,6 +618,7 @@ UserValue::addDefsFromCopies(LiveInterval *LI, unsigned LocNo,
void
UserValue::computeIntervals(MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
LiveIntervals &LIS,
MachineDominatorTree &MDT,
UserValueScopes &UVS) {
@@ -634,15 +635,32 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI,
unsigned LocNo = Defs[i].second;
const MachineOperand &Loc = locations[LocNo];
+ if (!Loc.isReg()) {
+ extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS);
+ continue;
+ }
+
// Register locations are constrained to where the register value is live.
- if (Loc.isReg() && LIS.hasInterval(Loc.getReg())) {
- LiveInterval *LI = &LIS.getInterval(Loc.getReg());
- const VNInfo *VNI = LI->getVNInfoAt(Idx);
+ if (TargetRegisterInfo::isVirtualRegister(Loc.getReg())) {
+ LiveInterval *LI = 0;
+ const VNInfo *VNI = 0;
+ if (LIS.hasInterval(Loc.getReg())) {
+ LI = &LIS.getInterval(Loc.getReg());
+ VNI = LI->getVNInfoAt(Idx);
+ }
SmallVector<SlotIndex, 16> Kills;
extendDef(Idx, LocNo, LI, VNI, &Kills, LIS, MDT, UVS);
- addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS);
- } else
- extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS);
+ if (LI)
+ addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS);
+ continue;
+ }
+
+ // For physregs, use the live range of the first regunit as a guide.
+ unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI);
+ LiveInterval *LI = &LIS.getRegUnit(Unit);
+ const VNInfo *VNI = LI->getVNInfoAt(Idx);
+    // Don't track copies from physregs; it is too expensive.
+ extendDef(Idx, LocNo, LI, VNI, 0, LIS, MDT, UVS);
}
// Finally, erase all the undefs.
@@ -656,7 +674,7 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI,
void LDVImpl::computeIntervals() {
for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
UserValueScopes UVS(userValues[i]->getDebugLoc(), LS);
- userValues[i]->computeIntervals(MF->getRegInfo(), *LIS, *MDT, UVS);
+ userValues[i]->computeIntervals(MF->getRegInfo(), *TRI, *LIS, *MDT, UVS);
userValues[i]->mapVirtRegs(this);
}
}
@@ -721,7 +739,8 @@ renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) {
if (TargetRegisterInfo::isVirtualRegister(NewReg))
mapVirtReg(NewReg, UV);
- virtRegToEqClass.erase(OldReg);
+ if (OldReg != NewReg)
+ virtRegToEqClass.erase(OldReg);
do {
UV->renameRegister(OldReg, NewReg, SubIdx, TRI);
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index ac18843..0a795e6 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -48,6 +48,26 @@ LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
return I;
}
+VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
+ VNInfo::Allocator &VNInfoAllocator) {
+ assert(!Def.isDead() && "Cannot define a value at the dead slot");
+ iterator I = find(Def);
+ if (I == end()) {
+ VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
+ ranges.push_back(LiveRange(Def, Def.getDeadSlot(), VNI));
+ return VNI;
+ }
+ if (SlotIndex::isSameInstr(Def, I->start)) {
+ assert(I->start == Def && "Cannot insert def, already live");
+ assert(I->valno->def == Def && "Inconsistent existing value def");
+ return I->valno;
+ }
+ assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
+ VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
+ ranges.insert(I, LiveRange(Def, Def.getDeadSlot(), VNI));
+ return VNI;
+}
+
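The asserts above pin down createDeadDef's contract; a small sketch (LI, a non-dead SlotIndex Def, and the VNInfo allocator Alloc are assumed):

    // The first call inserts a dead range [Def, Def.getDeadSlot()); calling
    // again at the same instruction returns the existing value number.
    VNInfo *V1 = LI.createDeadDef(Def, Alloc);
    VNInfo *V2 = LI.createDeadDef(Def, Alloc);
    assert(V1 == V2 && "createDeadDef is idempotent for a given def slot");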
/// killedInRange - Return true if the interval has kills in [Start,End).
bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const {
Ranges::const_iterator r =
@@ -140,7 +160,7 @@ void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
valnos.pop_back();
} while (!valnos.empty() && valnos.back()->isUnused());
} else {
- ValNo->setIsUnused(true);
+ ValNo->markUnused();
}
}
@@ -176,16 +196,16 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) {
// If NewEnd was in the middle of an interval, make sure to get its endpoint.
I->end = std::max(NewEnd, prior(MergeTo)->end);
- // Erase any dead ranges.
- ranges.erase(llvm::next(I), MergeTo);
-
// If the newly formed range now touches the range after it and if they have
// the same value number, merge the two ranges into one range.
- Ranges::iterator Next = llvm::next(I);
- if (Next != ranges.end() && Next->start <= I->end && Next->valno == ValNo) {
- I->end = Next->end;
- ranges.erase(Next);
+ if (MergeTo != ranges.end() && MergeTo->start <= I->end &&
+ MergeTo->valno == ValNo) {
+ I->end = MergeTo->end;
+ ++MergeTo;
}
+
+ // Erase any dead ranges.
+ ranges.erase(llvm::next(I), MergeTo);
}
@@ -353,18 +373,6 @@ void LiveInterval::removeValNo(VNInfo *ValNo) {
markValNoForDeletion(ValNo);
}
-/// findDefinedVNInfo - Find the VNInfo defined by the specified
-/// index (register interval).
-VNInfo *LiveInterval::findDefinedVNInfoForRegInt(SlotIndex Idx) const {
- for (LiveInterval::const_vni_iterator i = vni_begin(), e = vni_end();
- i != e; ++i) {
- if ((*i)->def == Idx)
- return *i;
- }
-
- return 0;
-}
-
/// join - Join two live intervals (this, and other) together. This applies
/// mappings to the value numbers in the LHS/RHS intervals as specified. If
/// the intervals are not joinable, this aborts.
@@ -373,6 +381,8 @@ void LiveInterval::join(LiveInterval &Other,
const int *RHSValNoAssignments,
SmallVector<VNInfo*, 16> &NewVNInfo,
MachineRegisterInfo *MRI) {
+ verify();
+
// Determine if any of our live range values are mapped. This is uncommon, so
// we want to avoid the interval scan if not.
bool MustMapCurValNos = false;
@@ -440,16 +450,148 @@ void LiveInterval::join(LiveInterval &Other,
valnos.resize(NumNewVals); // shrinkify
// Okay, now insert the RHS live ranges into the LHS.
- iterator InsertPos = begin();
unsigned RangeNo = 0;
for (iterator I = Other.begin(), E = Other.end(); I != E; ++I, ++RangeNo) {
// Map the valno in the other live range to the current live range.
I->valno = NewVNInfo[OtherAssignments[RangeNo]];
assert(I->valno && "Adding a dead range?");
- InsertPos = addRangeFrom(*I, InsertPos);
+ }
+ mergeIntervalRanges(Other);
+
+ verify();
+}
+
+/// \brief Helper function for merging in another LiveInterval's ranges.
+///
+/// This is a helper routine implementing an efficient merge of another
+/// LiveIntervals ranges into the current interval.
+///
+/// \param LHSValNo If non-NULL, set as the new value number for every range
+/// from RHS which is merged into the LHS.
+/// \param RHSValNo If non-NULL, then only ranges in RHS whose original value
+/// number matches this value number will be merged into LHS.
+void LiveInterval::mergeIntervalRanges(const LiveInterval &RHS,
+ VNInfo *LHSValNo,
+ const VNInfo *RHSValNo) {
+ if (RHS.empty())
+ return;
+
+ // Ensure we're starting with a valid range. Note that we don't verify RHS
+ // because it may have had its value numbers adjusted in preparation for
+ // merging.
+ verify();
+
+ // The strategy for merging these efficiently is as follows:
+ //
+ // 1) Find the beginning of the impacted ranges in the LHS.
+  // 2) Create a new, merged sub-sequence of ranges merging from the position
+  //    in #1 until either LHS or RHS is exhausted. Any part of LHS between
+  //    RHS entries being merged will be copied into this new range.
+  // 3) Replace the relevant section in LHS with these newly merged ranges.
+  // 4) Append any remaining ranges from RHS if LHS is exhausted in #2.
+ //
+ // We don't follow the typical in-place merge strategy for sorted ranges of
+ // appending the new ranges to the back and then using std::inplace_merge
+ // because one step of the merge can both mutate the original elements and
+ // remove elements from the original. Essentially, because the merge includes
+ // collapsing overlapping ranges, a more complex approach is required.
+
+ // We do an initial binary search to optimize for a common pattern: a large
+ // LHS, and a very small RHS.
+ const_iterator RI = RHS.begin(), RE = RHS.end();
+ iterator LE = end(), LI = std::upper_bound(begin(), LE, *RI);
+
+ // Merge into NewRanges until one of the ranges is exhausted.
+ SmallVector<LiveRange, 4> NewRanges;
+
+ // Keep track of where to begin the replacement.
+ iterator ReplaceI = LI;
+
+ // If there are preceding ranges in the LHS, put the last one into NewRanges
+ // so we can optionally extend it. Adjust the replacement point accordingly.
+ if (LI != begin()) {
+ ReplaceI = llvm::prior(LI);
+ NewRanges.push_back(*ReplaceI);
+ }
+
+ // Now loop over the mergable portions of both LHS and RHS, merging into
+ // NewRanges.
+ while (LI != LE && RI != RE) {
+ // Skip incoming ranges with the wrong value.
+ if (RHSValNo && RI->valno != RHSValNo) {
+ ++RI;
+ continue;
+ }
+
+ // Select the first range. We pick the earliest start point, and then the
+ // largest range.
+ LiveRange R = *LI;
+ if (*RI < R) {
+ R = *RI;
+ ++RI;
+ if (LHSValNo)
+ R.valno = LHSValNo;
+ } else {
+ ++LI;
+ }
+
+ if (NewRanges.empty()) {
+ NewRanges.push_back(R);
+ continue;
+ }
+
+ LiveRange &LastR = NewRanges.back();
+ if (R.valno == LastR.valno) {
+ // Try to merge this range into the last one.
+ if (R.start <= LastR.end) {
+ LastR.end = std::max(LastR.end, R.end);
+ continue;
+ }
+ } else {
+ // We can't merge ranges across a value number.
+ assert(R.start >= LastR.end &&
+ "Cannot overlap two LiveRanges with differing ValID's");
+ }
+
+ // If all else fails, just append the range.
+ NewRanges.push_back(R);
+ }
+ assert(RI == RE || LI == LE);
+
+  // Check for being able to merge into the trailing sequence of ranges on
+  // the LHS.
+ if (!NewRanges.empty())
+ for (; LI != LE && (LI->valno == NewRanges.back().valno &&
+ LI->start <= NewRanges.back().end);
+ ++LI)
+ NewRanges.back().end = std::max(NewRanges.back().end, LI->end);
+
+ // Replace the ranges in the LHS with the newly merged ones. It would be
+ // really nice if there were a move-supporting 'replace' directly in
+ // SmallVector, but as there is not, we pay the price of copies to avoid
+ // wasted memory allocations.
+ SmallVectorImpl<LiveRange>::iterator NRI = NewRanges.begin(),
+ NRE = NewRanges.end();
+ for (; ReplaceI != LI && NRI != NRE; ++ReplaceI, ++NRI)
+ *ReplaceI = *NRI;
+ if (NRI == NRE)
+ ranges.erase(ReplaceI, LI);
+ else
+ ranges.insert(LI, NRI, NRE);
+
+ // And finally insert any trailing end of RHS (if we have one).
+ for (; RI != RE; ++RI) {
+ LiveRange R = *RI;
+ if (LHSValNo)
+ R.valno = LHSValNo;
+ if (!ranges.empty() &&
+ ranges.back().valno == R.valno && R.start <= ranges.back().end)
+ ranges.back().end = std::max(ranges.back().end, R.end);
+ else
+ ranges.push_back(R);
}
- ComputeJoinedWeight(Other);
+ // Ensure we finished with a valid new sequence of ranges.
+ verify();
}
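A worked illustration of the merge, using hypothetical slot indices and a single shared value number:

    // LHS: [0,16) [32,48)        RHS: [12,20) [40,44) [64,72)
    //
    // The upper_bound lands LI on [32,48), so NewRanges is seeded with the
    // preceding [0,16). The main loop coalesces [12,20) into it ([0,20)),
    // then takes [32,48) from the LHS. The trailing-RHS loop folds [40,44)
    // into [32,48) and appends [64,72), giving:
    //
    // LHS: [0,20) [32,48) [64,72)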
/// MergeRangesInAsValue - Merge all of the intervals in RHS into this live
@@ -458,38 +600,20 @@ void LiveInterval::join(LiveInterval &Other,
/// the overlapping LiveRanges have the specified value number.
void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS,
VNInfo *LHSValNo) {
- // TODO: Make this more efficient.
- iterator InsertPos = begin();
- for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
- // Map the valno in the other live range to the current live range.
- LiveRange Tmp = *I;
- Tmp.valno = LHSValNo;
- InsertPos = addRangeFrom(Tmp, InsertPos);
- }
+ mergeIntervalRanges(RHS, LHSValNo);
}
-
/// MergeValueInAsValue - Merge all of the live ranges of a specific val#
/// in RHS into this live interval as the specified value number.
/// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
/// current interval; it will replace the value numbers of the overlapped
/// live ranges with the specified value number.
-void LiveInterval::MergeValueInAsValue(
- const LiveInterval &RHS,
- const VNInfo *RHSValNo, VNInfo *LHSValNo) {
- // TODO: Make this more efficient.
- iterator InsertPos = begin();
- for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
- if (I->valno != RHSValNo)
- continue;
- // Map the valno in the other live range to the current live range.
- LiveRange Tmp = *I;
- Tmp.valno = LHSValNo;
- InsertPos = addRangeFrom(Tmp, InsertPos);
- }
+void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
+ const VNInfo *RHSValNo,
+ VNInfo *LHSValNo) {
+ mergeIntervalRanges(RHS, LHSValNo, RHSValNo);
}
-
/// MergeValueNumberInto - This method is called when two value numbers
/// are found to be equivalent. This eliminates V1, replacing all
/// LiveRanges with the V1 value number with the V2 value number. This can
@@ -543,9 +667,6 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
}
}
- // Merge the relevant flags.
- V2->mergeFlags(V1);
-
// Now that V1 is dead, remove it.
markValNoForDeletion(V1);
@@ -569,6 +690,8 @@ void LiveInterval::Copy(const LiveInterval &RHS,
const LiveRange &LR = RHS.ranges[i];
addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id)));
}
+
+ verify();
}
unsigned LiveInterval::getSize() const {
@@ -578,29 +701,6 @@ unsigned LiveInterval::getSize() const {
return Sum;
}
-/// ComputeJoinedWeight - Set the weight of a live interval Joined
-/// after Other has been merged into it.
-void LiveInterval::ComputeJoinedWeight(const LiveInterval &Other) {
- // If either of these intervals was spilled, the weight is the
- // weight of the non-spilled interval. This can only happen with
- // iterative coalescers.
-
- if (Other.weight != HUGE_VALF) {
- weight += Other.weight;
- }
- else if (weight == HUGE_VALF &&
- !TargetRegisterInfo::isPhysicalRegister(reg)) {
- // Remove this assert if you have an iterative coalescer
- assert(0 && "Joining to spilled interval");
- weight = Other.weight;
- }
- else {
- // Otherwise the weight stays the same
- // Remove this assert if you have an iterative coalescer
- assert(0 && "Joining from spilled interval");
- }
-}
-
raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
}
@@ -609,15 +709,10 @@ void LiveRange::dump() const {
dbgs() << *this << "\n";
}
-void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
- OS << PrintReg(reg, TRI);
- if (weight != 0)
- OS << ',' << weight;
-
+void LiveInterval::print(raw_ostream &OS) const {
if (empty())
- OS << " EMPTY";
+ OS << "EMPTY";
else {
- OS << " = ";
for (LiveInterval::Ranges::const_iterator I = ranges.begin(),
E = ranges.end(); I != E; ++I) {
OS << *I;
@@ -639,9 +734,7 @@ void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
} else {
OS << vni->def;
if (vni->isPHIDef())
- OS << "-phidef";
- if (vni->hasPHIKill())
- OS << "-phikill";
+ OS << "-phi";
}
}
}
@@ -651,6 +744,23 @@ void LiveInterval::dump() const {
dbgs() << *this << "\n";
}
+#ifndef NDEBUG
+void LiveInterval::verify() const {
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ assert(I->start.isValid());
+ assert(I->end.isValid());
+ assert(I->start < I->end);
+ assert(I->valno != 0);
+ assert(I->valno == valnos[I->valno->id]);
+ if (llvm::next(I) != E) {
+ assert(I->end <= llvm::next(I)->start);
+ if (I->end == llvm::next(I)->start)
+ assert(I->valno != llvm::next(I)->valno);
+ }
+ }
+}
+#endif
+
void LiveRange::print(raw_ostream &os) const {
os << *this;
@@ -712,13 +822,13 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
MachineOperand &MO = RI.getOperand();
MachineInstr *MI = MO.getParent();
++RI;
- if (MO.isUse() && MO.isUndef())
- continue;
// DBG_VALUE instructions should have been eliminated earlier.
- SlotIndex Idx = LIS.getInstructionIndex(MI);
- Idx = Idx.getRegSlot(MO.isUse());
- const VNInfo *VNI = LI.getVNInfoAt(Idx);
- assert(VNI && "Interval not live at use.");
+ LiveRangeQuery LRQ(LI, LIS.getInstructionIndex(MI));
+ const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
+ // In the case of an <undef> use that isn't tied to any def, VNI will be
+ // NULL. If the use is tied to a def, VNI will be the defined value.
+ if (!VNI)
+ continue;
MO.setReg(LIV[getEqClass(VNI)]->reg);
}
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 934cc12..d0f8ae1 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -20,6 +20,7 @@
#include "llvm/Value.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
@@ -31,20 +32,20 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
+#include "LiveRangeCalc.h"
#include <algorithm>
#include <limits>
#include <cmath>
using namespace llvm;
-// Hidden options for help debugging.
-static cl::opt<bool> DisableReMat("disable-rematerialization",
- cl::init(false), cl::Hidden);
-
-STATISTIC(numIntervals , "Number of original intervals");
+// Switch to the new experimental algorithm for computing live intervals.
+static cl::opt<bool>
+NewLiveIntervals("new-live-intervals", cl::Hidden,
+                 cl::desc("Use new algorithm for computing live intervals"));
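Being cl::Hidden, the option is absent from -help; with an llc built from this revision the experimental path would be selected with something like llc -new-live-intervals foo.ll, while the default remains the LiveVariables-based computation below.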
char LiveIntervals::ID = 0;
+char &llvm::LiveIntervalsID = LiveIntervals::ID;
INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals",
"Live Interval Analysis", false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
@@ -61,23 +62,35 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreservedID(MachineLoopInfoID);
+ AU.addRequiredTransitiveID(MachineDominatorsID);
AU.addPreservedID(MachineDominatorsID);
AU.addPreserved<SlotIndexes>();
AU.addRequiredTransitive<SlotIndexes>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+LiveIntervals::LiveIntervals() : MachineFunctionPass(ID),
+ DomTree(0), LRCalc(0) {
+ initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+}
+
+LiveIntervals::~LiveIntervals() {
+ delete LRCalc;
+}
+
void LiveIntervals::releaseMemory() {
// Free the live intervals themselves.
- for (DenseMap<unsigned, LiveInterval*>::iterator I = r2iMap_.begin(),
- E = r2iMap_.end(); I != E; ++I)
- delete I->second;
-
- r2iMap_.clear();
+ for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i)
+ delete VirtRegIntervals[TargetRegisterInfo::index2VirtReg(i)];
+ VirtRegIntervals.clear();
RegMaskSlots.clear();
RegMaskBits.clear();
RegMaskBlocks.clear();
+ for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
+ delete RegUnitIntervals[i];
+ RegUnitIntervals.clear();
+
// Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
VNInfoAllocator.Reset();
}
@@ -85,20 +98,34 @@ void LiveIntervals::releaseMemory() {
/// runOnMachineFunction - Register allocate the whole function
///
bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
- mf_ = &fn;
- mri_ = &mf_->getRegInfo();
- tm_ = &fn.getTarget();
- tri_ = tm_->getRegisterInfo();
- tii_ = tm_->getInstrInfo();
- aa_ = &getAnalysis<AliasAnalysis>();
- lv_ = &getAnalysis<LiveVariables>();
- indexes_ = &getAnalysis<SlotIndexes>();
- allocatableRegs_ = tri_->getAllocatableSet(fn);
- reservedRegs_ = tri_->getReservedRegs(fn);
-
- computeIntervals();
-
- numIntervals += getNumIntervals();
+ MF = &fn;
+ MRI = &MF->getRegInfo();
+ TM = &fn.getTarget();
+ TRI = TM->getRegisterInfo();
+ TII = TM->getInstrInfo();
+ AA = &getAnalysis<AliasAnalysis>();
+ LV = &getAnalysis<LiveVariables>();
+ Indexes = &getAnalysis<SlotIndexes>();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ if (!LRCalc)
+ LRCalc = new LiveRangeCalc();
+ AllocatableRegs = TRI->getAllocatableSet(fn);
+ ReservedRegs = TRI->getReservedRegs(fn);
+
+ // Allocate space for all virtual registers.
+ VirtRegIntervals.resize(MRI->getNumVirtRegs());
+
+ if (NewLiveIntervals) {
+ // This is the new way of computing live intervals.
+ // It is independent of LiveVariables, and it can run at any time.
+ computeVirtRegs();
+ computeRegMasks();
+ } else {
+ // This is the old way of computing live intervals.
+ // It depends on LiveVariables.
+ computeIntervals();
+ }
+ computeLiveInRegUnits();
DEBUG(dump());
return true;
@@ -108,27 +135,24 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
OS << "********** INTERVALS **********\n";
- // Dump the physregs.
- for (unsigned Reg = 1, RegE = tri_->getNumRegs(); Reg != RegE; ++Reg)
- if (const LiveInterval *LI = r2iMap_.lookup(Reg)) {
- LI->print(OS, tri_);
- OS << '\n';
- }
+ // Dump the regunits.
+ for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
+ if (LiveInterval *LI = RegUnitIntervals[i])
+ OS << PrintRegUnit(i, TRI) << " = " << *LI << '\n';
// Dump the virtregs.
- for (unsigned Reg = 0, RegE = mri_->getNumVirtRegs(); Reg != RegE; ++Reg)
- if (const LiveInterval *LI =
- r2iMap_.lookup(TargetRegisterInfo::index2VirtReg(Reg))) {
- LI->print(OS, tri_);
- OS << '\n';
- }
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (hasInterval(Reg))
+ OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
+ }
printInstrs(OS);
}
void LiveIntervals::printInstrs(raw_ostream &OS) const {
OS << "********** MACHINEINSTRS **********\n";
- mf_->print(OS, indexes_);
+ MF->print(OS, Indexes);
}
void LiveIntervals::dumpInstrs() const {
@@ -176,13 +200,13 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
MachineOperand& MO,
unsigned MOIdx,
LiveInterval &interval) {
- DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, tri_));
+ DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, TRI));
// Virtual registers may be defined multiple times (due to phi
// elimination and 2-addr elimination). Much of what we do only has to be
// done once for the vreg. We use an empty interval to detect the first
// time we see a vreg.
- LiveVariables::VarInfo& vi = lv_->getVarInfo(interval.reg);
+ LiveVariables::VarInfo& vi = LV->getVarInfo(interval.reg);
if (interval.empty()) {
// Get the Idx of the defining instructions.
SlotIndex defIndex = MIIdx.getRegSlot(MO.isEarlyClobber());
@@ -226,22 +250,22 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
DEBUG(dbgs() << " +" << NewLR);
interval.addRange(NewLR);
- bool PHIJoin = lv_->isPHIJoin(interval.reg);
+ bool PHIJoin = LV->isPHIJoin(interval.reg);
if (PHIJoin) {
- // A phi join register is killed at the end of the MBB and revived as a new
- // valno in the killing blocks.
+ // A phi join register is killed at the end of the MBB and revived as a
+ // new valno in the killing blocks.
assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks");
DEBUG(dbgs() << " phi-join");
- ValNo->setHasPHIKill(true);
} else {
// Iterate over all of the blocks that the variable is completely
// live in, adding [insrtIndex(begin), instrIndex(end)+4) to the
// live interval.
for (SparseBitVector<>::iterator I = vi.AliveBlocks.begin(),
E = vi.AliveBlocks.end(); I != E; ++I) {
- MachineBasicBlock *aliveBlock = mf_->getBlockNumbered(*I);
- LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock), ValNo);
+ MachineBasicBlock *aliveBlock = MF->getBlockNumbered(*I);
+ LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock),
+ ValNo);
interval.addRange(LR);
DEBUG(dbgs() << " +" << LR);
}
@@ -260,7 +284,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
assert(getInstructionFromIndex(Start) == 0 &&
"PHI def index points at actual instruction.");
ValNo = interval.getNextValue(Start, VNInfoAllocator);
- ValNo->setIsPHIDef(true);
}
LiveRange LR(Start, killIdx, ValNo);
interval.addRange(LR);
@@ -319,11 +342,8 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
interval.addRange(LiveRange(RedefIndex, RedefIndex.getDeadSlot(),
OldValNo));
- DEBUG({
- dbgs() << " RESULT: ";
- interval.print(dbgs(), tri_);
- });
- } else if (lv_->isPHIJoin(interval.reg)) {
+ DEBUG(dbgs() << " RESULT: " << interval);
+ } else if (LV->isPHIJoin(interval.reg)) {
// In the case of PHI elimination, each variable definition is only
// live until the end of the block. We've already taken care of the
// rest of the live range.
@@ -337,7 +357,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
SlotIndex killIndex = getMBBEndIdx(mbb);
LiveRange LR(defIndex, killIndex, ValNo);
interval.addRange(LR);
- ValNo->setHasPHIKill(true);
DEBUG(dbgs() << " phi-join +" << LR);
} else {
llvm_unreachable("Multiply defined register");
@@ -347,101 +366,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
DEBUG(dbgs() << '\n');
}
-static bool isRegLiveIntoSuccessor(const MachineBasicBlock *MBB, unsigned Reg) {
- for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end();
- SI != SE; ++SI) {
- const MachineBasicBlock* succ = *SI;
- if (succ->isLiveIn(Reg))
- return true;
- }
- return false;
-}
-
-void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator mi,
- SlotIndex MIIdx,
- MachineOperand& MO,
- LiveInterval &interval) {
- DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, tri_));
-
- SlotIndex baseIndex = MIIdx;
- SlotIndex start = baseIndex.getRegSlot(MO.isEarlyClobber());
- SlotIndex end = start;
-
- // If it is not used after definition, it is considered dead at
- // the instruction defining it. Hence its interval is:
- // [defSlot(def), defSlot(def)+1)
- // For earlyclobbers, the defSlot was pushed back one; the extra
- // advance below compensates.
- if (MO.isDead()) {
- DEBUG(dbgs() << " dead");
- end = start.getDeadSlot();
- goto exit;
- }
-
- // If it is not dead on definition, it must be killed by a
- // subsequent instruction. Hence its interval is:
- // [defSlot(def), useSlot(kill)+1)
- baseIndex = baseIndex.getNextIndex();
- while (++mi != MBB->end()) {
-
- if (mi->isDebugValue())
- continue;
- if (getInstructionFromIndex(baseIndex) == 0)
- baseIndex = indexes_->getNextNonNullIndex(baseIndex);
-
- if (mi->killsRegister(interval.reg, tri_)) {
- DEBUG(dbgs() << " killed");
- end = baseIndex.getRegSlot();
- goto exit;
- } else {
- int DefIdx = mi->findRegisterDefOperandIdx(interval.reg,false,false,tri_);
- if (DefIdx != -1) {
- if (mi->isRegTiedToUseOperand(DefIdx)) {
- // Two-address instruction.
- end = baseIndex.getRegSlot(mi->getOperand(DefIdx).isEarlyClobber());
- } else {
- // Another instruction redefines the register before it is ever read.
- // Then the register is essentially dead at the instruction that
- // defines it. Hence its interval is:
- // [defSlot(def), defSlot(def)+1)
- DEBUG(dbgs() << " dead");
- end = start.getDeadSlot();
- }
- goto exit;
- }
- }
-
- baseIndex = baseIndex.getNextIndex();
- }
-
- // If we get here the register *should* be live out.
- assert(!isAllocatable(interval.reg) && "Physregs shouldn't be live out!");
-
- // FIXME: We need saner rules for reserved regs.
- if (isReserved(interval.reg)) {
- end = start.getDeadSlot();
- } else {
- // Unreserved, unallocable registers like EFLAGS can be live across basic
- // block boundaries.
- assert(isRegLiveIntoSuccessor(MBB, interval.reg) &&
- "Unreserved reg not live-out?");
- end = getMBBEndIdx(MBB);
- }
-exit:
- assert(start < end && "did not find end of interval?");
-
- // Already exists? Extend old live interval.
- VNInfo *ValNo = interval.getVNInfoAt(start);
- bool Extend = ValNo != 0;
- if (!Extend)
- ValNo = interval.getNextValue(start, VNInfoAllocator);
- LiveRange LR(start, end, ValNo);
- interval.addRange(LR);
- DEBUG(dbgs() << " +" << LR << '\n');
-}
-
void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB,
MachineBasicBlock::iterator MI,
SlotIndex MIIdx,
@@ -450,93 +374,6 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB,
if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
handleVirtualRegisterDef(MBB, MI, MIIdx, MO, MOIdx,
getOrCreateInterval(MO.getReg()));
- else
- handlePhysicalRegisterDef(MBB, MI, MIIdx, MO,
- getOrCreateInterval(MO.getReg()));
-}
-
-void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB,
- SlotIndex MIIdx,
- LiveInterval &interval) {
- assert(TargetRegisterInfo::isPhysicalRegister(interval.reg) &&
- "Only physical registers can be live in.");
- assert((!isAllocatable(interval.reg) || MBB->getParent()->begin() ||
- MBB->isLandingPad()) &&
- "Allocatable live-ins only valid for entry blocks and landing pads.");
-
- DEBUG(dbgs() << "\t\tlivein register: " << PrintReg(interval.reg, tri_));
-
- // Look for kills, if it reaches a def before it's killed, then it shouldn't
- // be considered a livein.
- MachineBasicBlock::iterator mi = MBB->begin();
- MachineBasicBlock::iterator E = MBB->end();
- // Skip over DBG_VALUE at the start of the MBB.
- if (mi != E && mi->isDebugValue()) {
- while (++mi != E && mi->isDebugValue())
- ;
- if (mi == E)
- // MBB is empty except for DBG_VALUE's.
- return;
- }
-
- SlotIndex baseIndex = MIIdx;
- SlotIndex start = baseIndex;
- if (getInstructionFromIndex(baseIndex) == 0)
- baseIndex = indexes_->getNextNonNullIndex(baseIndex);
-
- SlotIndex end = baseIndex;
- bool SeenDefUse = false;
-
- while (mi != E) {
- if (mi->killsRegister(interval.reg, tri_)) {
- DEBUG(dbgs() << " killed");
- end = baseIndex.getRegSlot();
- SeenDefUse = true;
- break;
- } else if (mi->modifiesRegister(interval.reg, tri_)) {
- // Another instruction redefines the register before it is ever read.
- // Then the register is essentially dead at the instruction that defines
- // it. Hence its interval is:
- // [defSlot(def), defSlot(def)+1)
- DEBUG(dbgs() << " dead");
- end = start.getDeadSlot();
- SeenDefUse = true;
- break;
- }
-
- while (++mi != E && mi->isDebugValue())
- // Skip over DBG_VALUE.
- ;
- if (mi != E)
- baseIndex = indexes_->getNextNonNullIndex(baseIndex);
- }
-
- // Live-in register might not be used at all.
- if (!SeenDefUse) {
- if (isAllocatable(interval.reg) ||
- !isRegLiveIntoSuccessor(MBB, interval.reg)) {
- // Allocatable registers are never live through.
- // Non-allocatable registers that aren't live into any successors also
- // aren't live through.
- DEBUG(dbgs() << " dead");
- return;
- } else {
- // If we get here the register is non-allocatable and live into some
- // successor. We'll conservatively assume it's live-through.
- DEBUG(dbgs() << " live through");
- end = getMBBEndIdx(MBB);
- }
- }
-
- SlotIndex defIdx = getMBBStartIdx(MBB);
- assert(getInstructionFromIndex(defIdx) == 0 &&
- "PHI def index points at actual instruction.");
- VNInfo *vni = interval.getNextValue(defIdx, VNInfoAllocator);
- vni->setIsPHIDef(true);
- LiveRange LR(start, end, vni);
-
- interval.addRange(LR);
- DEBUG(dbgs() << " +" << LR << '\n');
}
/// computeIntervals - computes the live intervals for virtual
@@ -546,12 +383,12 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB,
void LiveIntervals::computeIntervals() {
DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n"
<< "********** Function: "
- << ((Value*)mf_->getFunction())->getName() << '\n');
+ << ((Value*)MF->getFunction())->getName() << '\n');
- RegMaskBlocks.resize(mf_->getNumBlockIDs());
+ RegMaskBlocks.resize(MF->getNumBlockIDs());
SmallVector<unsigned, 8> UndefUses;
- for (MachineFunction::iterator MBBI = mf_->begin(), E = mf_->end();
+ for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
MBBI != E; ++MBBI) {
MachineBasicBlock *MBB = MBBI;
RegMaskBlocks[MBB->getNumber()].first = RegMaskSlots.size();
@@ -564,22 +401,16 @@ void LiveIntervals::computeIntervals() {
DEBUG(dbgs() << "BB#" << MBB->getNumber()
<< ":\t\t# derived from " << MBB->getName() << "\n");
- // Create intervals for live-ins to this BB first.
- for (MachineBasicBlock::livein_iterator LI = MBB->livein_begin(),
- LE = MBB->livein_end(); LI != LE; ++LI) {
- handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*LI));
- }
-
// Skip over empty initial indices.
if (getInstructionFromIndex(MIIndex) == 0)
- MIIndex = indexes_->getNextNonNullIndex(MIIndex);
+ MIIndex = Indexes->getNextNonNullIndex(MIIndex);
for (MachineBasicBlock::iterator MI = MBB->begin(), miEnd = MBB->end();
MI != miEnd; ++MI) {
DEBUG(dbgs() << MIIndex << "\t" << *MI);
if (MI->isDebugValue())
continue;
- assert(indexes_->getInstructionFromIndex(MIIndex) == MI &&
+ assert(Indexes->getInstructionFromIndex(MIIndex) == MI &&
"Lost SlotIndex synchronization");
// Handle defs.
@@ -593,7 +424,7 @@ void LiveIntervals::computeIntervals() {
continue;
}
- if (!MO.isReg() || !MO.getReg())
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
continue;
// handle register defs - build intervals
@@ -604,7 +435,7 @@ void LiveIntervals::computeIntervals() {
}
// Move to the next instr slot.
- MIIndex = indexes_->getNextNonNullIndex(MIIndex);
+ MIIndex = Indexes->getNextNonNullIndex(MIIndex);
}
// Compute the number of register mask instructions in this block.
@@ -626,14 +457,147 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) {
return new LiveInterval(reg, Weight);
}
-/// dupInterval - Duplicate a live interval. The caller is responsible for
-/// managing the allocated memory.
-LiveInterval* LiveIntervals::dupInterval(LiveInterval *li) {
- LiveInterval *NewLI = createInterval(li->reg);
- NewLI->Copy(*li, mri_, getVNInfoAllocator());
- return NewLI;
+
+/// computeVirtRegInterval - Compute the live interval of a virtual register,
+/// based on defs and uses.
+void LiveIntervals::computeVirtRegInterval(LiveInterval *LI) {
+ assert(LRCalc && "LRCalc not initialized.");
+ assert(LI->empty() && "Should only compute empty intervals.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LRCalc->createDeadDefs(LI);
+ LRCalc->extendToUses(LI);
+}
+
+void LiveIntervals::computeVirtRegs() {
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ LiveInterval *LI = createInterval(Reg);
+ VirtRegIntervals[Reg] = LI;
+ computeVirtRegInterval(LI);
+ }
+}
+
+void LiveIntervals::computeRegMasks() {
+ RegMaskBlocks.resize(MF->getNumBlockIDs());
+
+ // Find all instructions with regmask operands.
+ for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+ std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()];
+ RMB.first = RegMaskSlots.size();
+ for (MachineBasicBlock::iterator MI = MBB->begin(), ME = MBB->end();
+ MI != ME; ++MI)
+ for (MIOperands MO(MI); MO.isValid(); ++MO) {
+ if (!MO->isRegMask())
+ continue;
+ RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot());
+ RegMaskBits.push_back(MO->getRegMask());
+ }
+ // Compute the number of register mask instructions in this block.
+    RMB.second = RegMaskSlots.size() - RMB.first;
+ }
}
+//===----------------------------------------------------------------------===//
+// Register Unit Liveness
+//===----------------------------------------------------------------------===//
+//
+// Fixed interference typically comes from ABI boundaries: Function arguments
+// and return values are passed in fixed registers, and so are exception
+// pointers entering landing pads. Certain instructions require values to be
+// present in specific registers. That is also represented through fixed
+// interference.
+//
+
+/// computeRegUnitInterval - Compute the live interval of a register unit, based
+/// on the uses and defs of aliasing registers. The interval should be empty,
+/// or contain only dead phi-defs from ABI blocks.
+void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
+ unsigned Unit = LI->reg;
+
+ assert(LRCalc && "LRCalc not initialized.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+
+ // The physregs aliasing Unit are the roots and their super-registers.
+ // Create all values as dead defs before extending to uses. Note that roots
+ // may share super-registers. That's OK because createDeadDefs() is
+ // idempotent. It is very rare for a register unit to have multiple roots, so
+ // uniquing super-registers is probably not worthwhile.
+ for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
+ unsigned Root = *Roots;
+ if (!MRI->reg_empty(Root))
+ LRCalc->createDeadDefs(LI, Root);
+ for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) {
+ if (!MRI->reg_empty(*Supers))
+ LRCalc->createDeadDefs(LI, *Supers);
+ }
+ }
+
+ // Now extend LI to reach all uses.
+ // Ignore uses of reserved registers. We only track defs of those.
+ for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
+ unsigned Root = *Roots;
+ if (!isReserved(Root) && !MRI->reg_empty(Root))
+ LRCalc->extendToUses(LI, Root);
+ for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) {
+ unsigned Reg = *Supers;
+ if (!isReserved(Reg) && !MRI->reg_empty(Reg))
+ LRCalc->extendToUses(LI, Reg);
+ }
+ }
+}
+
+
+/// computeLiveInRegUnits - Precompute the live ranges of any register units
+/// that are live-in to an ABI block somewhere. Register values can appear
+/// without a corresponding def when entering the entry block or a landing pad.
+///
+void LiveIntervals::computeLiveInRegUnits() {
+ RegUnitIntervals.resize(TRI->getNumRegUnits());
+ DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
+
+ // Keep track of the intervals allocated.
+ SmallVector<LiveInterval*, 8> NewIntvs;
+
+ // Check all basic blocks for live-ins.
+ for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+ MFI != MFE; ++MFI) {
+ const MachineBasicBlock *MBB = MFI;
+
+ // We only care about ABI blocks: Entry + landing pads.
+ if ((MFI != MF->begin() && !MBB->isLandingPad()) || MBB->livein_empty())
+ continue;
+
+ // Create phi-defs at Begin for all live-in registers.
+ SlotIndex Begin = Indexes->getMBBStartIdx(MBB);
+ DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber());
+ for (MachineBasicBlock::livein_iterator LII = MBB->livein_begin(),
+ LIE = MBB->livein_end(); LII != LIE; ++LII) {
+ for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) {
+ unsigned Unit = *Units;
+ LiveInterval *Intv = RegUnitIntervals[Unit];
+ if (!Intv) {
+ Intv = RegUnitIntervals[Unit] = new LiveInterval(Unit, HUGE_VALF);
+ NewIntvs.push_back(Intv);
+ }
+ VNInfo *VNI = Intv->createDeadDef(Begin, getVNInfoAllocator());
+ (void)VNI;
+ DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id);
+ }
+ }
+ DEBUG(dbgs() << '\n');
+ }
+ DEBUG(dbgs() << "Created " << NewIntvs.size() << " new intervals.\n");
+
+ // Compute the 'normal' part of the intervals.
+ for (unsigned i = 0, e = NewIntvs.size(); i != e; ++i)
+ computeRegUnitInterval(NewIntvs[i]);
+}
+
+
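With regunit intervals available, fixed interference reduces to per-unit overlap tests. A minimal sketch (LIS, TRI, a virtual register's interval VirtLI, and a candidate PhysReg are assumed):

    // PhysReg is unusable for VirtLI if any of its regunits' fixed live
    // ranges overlap it; getRegUnit() computes the interval on demand.
    bool HasFixedConflict = false;
    for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
      if (VirtLI.overlaps(LIS.getRegUnit(*Units))) {
        HasFixedConflict = true;
        break;
      }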
/// shrinkToUses - After removing some uses of a register, shrink its live
/// range to just the remaining uses. This method does not compute reaching
/// defs for new uses, and it doesn't remove dead defs.
@@ -649,14 +613,13 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
// Visit all instructions reading li->reg.
- for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li->reg);
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(li->reg);
MachineInstr *UseMI = I.skipInstruction();) {
if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
continue;
SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
- // Note: This intentionally picks up the wrong VNI in case of an EC redef.
- // See below.
- VNInfo *VNI = li->getVNInfoBefore(Idx);
+ LiveRangeQuery LRQ(*li, Idx);
+ VNInfo *VNI = LRQ.valueIn();
if (!VNI) {
// This shouldn't happen: readsVirtualRegister returns true, but there is
// no live value. It is likely caused by a target getting <undef> flags
@@ -667,13 +630,10 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
continue;
}
// Special case: An early-clobber tied operand reads and writes the
- // register one slot early. The getVNInfoBefore call above would have
- // picked up the value defined by UseMI. Adjust the kill slot and value.
- if (SlotIndex::isSameInstr(VNI->def, Idx)) {
- Idx = VNI->def;
- VNI = li->getVNInfoBefore(Idx);
- assert(VNI && "Early-clobber tied value not available");
- }
+ // register one slot early.
+ if (VNInfo *DefVNI = LRQ.valueDefined())
+ Idx = DefVNI->def;
+
WorkList.push_back(std::make_pair(Idx, VNI));
}
@@ -747,7 +707,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
continue;
if (VNI->isPHIDef()) {
// This is a dead PHI. Remove it.
- VNI->setIsUnused(true);
+ VNI->markUnused();
NewLI.removeRange(*LII);
DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
CanSeparate = true;
@@ -755,7 +715,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// This is a dead def. Make sure the instruction knows.
MachineInstr *MI = getInstructionFromIndex(VNI->def);
assert(MI && "No instruction defining live value");
- MI->addRegisterDead(li->reg, tri_);
+ MI->addRegisterDead(li->reg, TRI);
if (dead && MI->allDefsAreDead()) {
DEBUG(dbgs() << "All defs dead: " << VNI->def << '\t' << *MI);
dead->push_back(MI);
@@ -775,13 +735,11 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
//
void LiveIntervals::addKillFlags() {
- for (iterator I = begin(), E = end(); I != E; ++I) {
- unsigned Reg = I->first;
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
continue;
- if (mri_->reg_nodbg_empty(Reg))
- continue;
- LiveInterval *LI = I->second;
+ LiveInterval *LI = &getInterval(Reg);
// Every instruction that kills Reg corresponds to a live range end point.
for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
@@ -797,101 +755,6 @@ void LiveIntervals::addKillFlags() {
}
}
-/// getReMatImplicitUse - If the remat definition MI has one (for now, we only
-/// allow one) virtual register operand, then its uses are implicitly using
-/// the register. Returns the virtual register.
-unsigned LiveIntervals::getReMatImplicitUse(const LiveInterval &li,
- MachineInstr *MI) const {
- unsigned RegOp = 0;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isUse())
- continue;
- unsigned Reg = MO.getReg();
- if (Reg == 0 || Reg == li.reg)
- continue;
-
- if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isAllocatable(Reg))
- continue;
- RegOp = MO.getReg();
- break; // Found vreg operand - leave the loop.
- }
- return RegOp;
-}
-
-/// isValNoAvailableAt - Return true if the val# of the specified interval
-/// which reaches the given instruction also reaches the specified use index.
-bool LiveIntervals::isValNoAvailableAt(const LiveInterval &li, MachineInstr *MI,
- SlotIndex UseIdx) const {
- VNInfo *UValNo = li.getVNInfoAt(UseIdx);
- return UValNo && UValNo == li.getVNInfoAt(getInstructionIndex(MI));
-}
-
-/// isReMaterializable - Returns true if the definition MI of the specified
-/// val# of the specified interval is re-materializable.
-bool
-LiveIntervals::isReMaterializable(const LiveInterval &li,
- const VNInfo *ValNo, MachineInstr *MI,
- const SmallVectorImpl<LiveInterval*> *SpillIs,
- bool &isLoad) {
- if (DisableReMat)
- return false;
-
- if (!tii_->isTriviallyReMaterializable(MI, aa_))
- return false;
-
- // Target-specific code can mark an instruction as being rematerializable
- // if it has one virtual reg use, though it had better be something like
- // a PIC base register which is likely to be live everywhere.
- unsigned ImpUse = getReMatImplicitUse(li, MI);
- if (ImpUse) {
- const LiveInterval &ImpLi = getInterval(ImpUse);
- for (MachineRegisterInfo::use_nodbg_iterator
- ri = mri_->use_nodbg_begin(li.reg), re = mri_->use_nodbg_end();
- ri != re; ++ri) {
- MachineInstr *UseMI = &*ri;
- SlotIndex UseIdx = getInstructionIndex(UseMI);
- if (li.getVNInfoAt(UseIdx) != ValNo)
- continue;
- if (!isValNoAvailableAt(ImpLi, MI, UseIdx))
- return false;
- }
-
- // If a register operand of the re-materialized instruction is going to
- // be spilled next, then it's not legal to re-materialize this instruction.
- if (SpillIs)
- for (unsigned i = 0, e = SpillIs->size(); i != e; ++i)
- if (ImpUse == (*SpillIs)[i]->reg)
- return false;
- }
- return true;
-}
-
-/// isReMaterializable - Returns true if every definition of MI of every
-/// val# of the specified interval is re-materializable.
-bool
-LiveIntervals::isReMaterializable(const LiveInterval &li,
- const SmallVectorImpl<LiveInterval*> *SpillIs,
- bool &isLoad) {
- isLoad = false;
- for (LiveInterval::const_vni_iterator i = li.vni_begin(), e = li.vni_end();
- i != e; ++i) {
- const VNInfo *VNI = *i;
- if (VNI->isUnused())
- continue; // Dead val#.
- // Is the def for the val# rematerializable?
- MachineInstr *ReMatDefMI = getInstructionFromIndex(VNI->def);
- if (!ReMatDefMI)
- return false;
- bool DefIsLoad = false;
- if (!ReMatDefMI ||
- !isReMaterializable(li, VNI, ReMatDefMI, SpillIs, DefIsLoad))
- return false;
- isLoad |= DefIsLoad;
- }
- return true;
-}
-
MachineBasicBlock*
LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
// A local live range must be fully contained inside the block, meaning it is
@@ -911,11 +774,30 @@ LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
// getMBBFromIndex doesn't need to search the MBB table when both indexes
// belong to proper instructions.
- MachineBasicBlock *MBB1 = indexes_->getMBBFromIndex(Start);
- MachineBasicBlock *MBB2 = indexes_->getMBBFromIndex(Stop);
+ MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start);
+ MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop);
return MBB1 == MBB2 ? MBB1 : NULL;
}
+bool
+LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
+ for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+ I != E; ++I) {
+ const VNInfo *PHI = *I;
+ if (PHI->isUnused() || !PHI->isPHIDef())
+ continue;
+ const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def);
+ // Conservatively return true instead of scanning huge predecessor lists.
+ if (PHIMBB->pred_size() > 100)
+ return true;
+ for (MachineBasicBlock::const_pred_iterator
+ PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI)
+ if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI)))
+ return true;
+ }
+ return false;
+}
+
float
LiveIntervals::getSpillWeight(bool isDef, bool isUse, unsigned loopDepth) {
// Limit the loop depth ridiculousness.
@@ -940,7 +822,6 @@ LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
VNInfo* VN = Interval.getNextValue(
SlotIndex(getInstructionIndex(startInst).getRegSlot()),
getVNInfoAllocator());
- VN->setHasPHIKill(true);
LiveRange LR(
SlotIndex(getInstructionIndex(startInst).getRegSlot()),
getMBBEndIdx(startInst->getParent()), VN);
@@ -990,7 +871,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
if (!Found) {
// This is the first overlap. Initialize UsableRegs to all ones.
UsableRegs.clear();
- UsableRegs.resize(tri_->getNumRegs(), true);
+ UsableRegs.resize(TRI->getNumRegs(), true);
Found = true;
}
// Remove usable registers clobbered by this mask.
@@ -1101,6 +982,9 @@ public:
BundleRanges BR = createBundleRanges(Entering, Internal, Exiting);
+ Entering.clear();
+ Internal.clear();
+ Exiting.clear();
collectRanges(MI, Entering, Internal, Exiting, hasRegMaskOp, OldIdx);
assert(!hasRegMaskOp && "Can't have RegMask operand in bundle.");
@@ -1176,78 +1060,44 @@ private:
// TODO: Currently we're skipping uses that are reserved or have no
// interval, but we're not updating their kills. This should be
// fixed.
- if (!LIS.hasInterval(Reg) ||
- (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg)))
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg))
continue;
- LiveInterval* LI = &LIS.getInterval(Reg);
-
- if (MO.readsReg()) {
- LiveRange* LR = LI->getLiveRangeContaining(OldIdx);
- if (LR != 0)
- Entering.insert(std::make_pair(LI, LR));
- }
- if (MO.isDef()) {
- if (MO.isEarlyClobber()) {
- LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot(true));
- assert(LR != 0 && "No EC range?");
- if (LR->end > OldIdx.getDeadSlot())
- Exiting.insert(std::make_pair(LI, LR));
- else
- Internal.insert(std::make_pair(LI, LR));
- } else if (MO.isDead()) {
- LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot());
- assert(LR != 0 && "No dead-def range?");
- Internal.insert(std::make_pair(LI, LR));
- } else {
- LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getDeadSlot());
- assert(LR && LR->end > OldIdx.getDeadSlot() &&
- "Non-dead-def should have live range exiting.");
- Exiting.insert(std::make_pair(LI, LR));
- }
+ // Collect ranges for register units. These live ranges are computed on
+ // demand, so just skip any that haven't been computed yet.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+ if (LiveInterval *LI = LIS.getCachedRegUnit(*Units))
+ collectRanges(MO, LI, Entering, Internal, Exiting, OldIdx);
+ } else {
+ // Collect ranges for individual virtual registers.
+ collectRanges(MO, &LIS.getInterval(Reg),
+ Entering, Internal, Exiting, OldIdx);
}
}
}
- // Collect IntRangePairs for all operands of MI that may need fixing.
- void collectRangesInBundle(MachineInstr* MI, RangeSet& Entering,
- RangeSet& Exiting, SlotIndex MIStartIdx,
- SlotIndex MIEndIdx) {
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end();
- MOI != MOE; ++MOI) {
- const MachineOperand& MO = *MOI;
- assert(!MO.isRegMask() && "Can't have RegMasks in bundles.");
- if (!MO.isReg() || MO.getReg() == 0)
- continue;
-
- unsigned Reg = MO.getReg();
-
- // TODO: Currently we're skipping uses that are reserved or have no
- // interval, but we're not updating their kills. This should be
- // fixed.
- if (!LIS.hasInterval(Reg) ||
- (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg)))
- continue;
-
- LiveInterval* LI = &LIS.getInterval(Reg);
-
- if (MO.readsReg()) {
- LiveRange* LR = LI->getLiveRangeContaining(MIStartIdx);
- if (LR != 0)
- Entering.insert(std::make_pair(LI, LR));
- }
- if (MO.isDef()) {
- assert(!MO.isEarlyClobber() && "Early clobbers not allowed in bundles.");
- assert(!MO.isDead() && "Dead-defs not allowed in bundles.");
- LiveRange* LR = LI->getLiveRangeContaining(MIEndIdx.getDeadSlot());
- assert(LR != 0 && "Internal ranges not allowed in bundles.");
+ void collectRanges(const MachineOperand &MO, LiveInterval *LI,
+ RangeSet &Entering, RangeSet &Internal, RangeSet &Exiting,
+ SlotIndex OldIdx) {
+ if (MO.readsReg()) {
+ LiveRange* LR = LI->getLiveRangeContaining(OldIdx);
+ if (LR != 0)
+ Entering.insert(std::make_pair(LI, LR));
+ }
+ if (MO.isDef()) {
+ LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot());
+ assert(LR != 0 && "No live range for def?");
+ if (LR->end > OldIdx.getDeadSlot())
Exiting.insert(std::make_pair(LI, LR));
- }
+ else
+ Internal.insert(std::make_pair(LI, LR));
}
}
- BundleRanges createBundleRanges(RangeSet& Entering, RangeSet& Internal, RangeSet& Exiting) {
+ BundleRanges createBundleRanges(RangeSet& Entering,
+ RangeSet& Internal,
+ RangeSet& Exiting) {
BundleRanges BR;
for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end();
@@ -1284,7 +1134,8 @@ private:
return; // Bail out if we don't have kill flags on the old register.
MachineInstr* NewKillMI = LIS.getInstructionFromIndex(newKillIdx);
assert(OldKillMI->killsRegister(reg) && "Old 'kill' instr isn't a kill.");
- assert(!NewKillMI->killsRegister(reg) && "New kill instr is already a kill.");
+ assert(!NewKillMI->killsRegister(reg) &&
+ "New kill instr is already a kill.");
OldKillMI->clearRegisterKills(reg, &TRI);
NewKillMI->addRegisterKilled(reg, &TRI);
}
@@ -1523,22 +1374,23 @@ private:
};
void LiveIntervals::handleMove(MachineInstr* MI) {
- SlotIndex OldIndex = indexes_->getInstructionIndex(MI);
- indexes_->removeMachineInstrFromMaps(MI);
+ SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
+ Indexes->removeMachineInstrFromMaps(MI);
SlotIndex NewIndex = MI->isInsideBundle() ?
- indexes_->getInstructionIndex(MI) :
- indexes_->insertMachineInstrInMaps(MI);
+ Indexes->getInstructionIndex(MI) :
+ Indexes->insertMachineInstrInMaps(MI);
assert(getMBBStartIdx(MI->getParent()) <= OldIndex &&
OldIndex < getMBBEndIdx(MI->getParent()) &&
"Cannot handle moves across basic block boundaries.");
assert(!MI->isBundled() && "Can't handle bundled instructions yet.");
- HMEditor HME(*this, *mri_, *tri_, NewIndex);
+ HMEditor HME(*this, *MRI, *TRI, NewIndex);
HME.moveAllRangesFrom(MI, OldIndex);
}
-void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI, MachineInstr* BundleStart) {
- SlotIndex NewIndex = indexes_->getInstructionIndex(BundleStart);
- HMEditor HME(*this, *mri_, *tri_, NewIndex);
+void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI,
+ MachineInstr* BundleStart) {
+ SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart);
+ HMEditor HME(*this, *MRI, *TRI, NewIndex);
HME.moveAllRangesInto(MI, BundleStart);
}
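
handleMove is the entry point other passes call after physically moving an instruction within its block: it re-indexes MI and lets HMEditor repair every affected live range. A minimal sketch of the expected calling convention (MBB, InsertPt, MI, and LIS are assumed to be in the caller's scope):

    MBB->splice(InsertPt, MBB, MI); // move the instruction first
    LIS->handleMove(MI);            // then repair SlotIndexes and live ranges
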
diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp
index 60a6880..dadd02b 100644
--- a/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/lib/CodeGen/LiveIntervalUnion.cpp
@@ -81,7 +81,6 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg) {
void
LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
- OS << "LIU " << PrintReg(RepReg, TRI);
if (empty()) {
OS << " empty\n";
return;
@@ -209,3 +208,26 @@ bool LiveIntervalUnion::Query::checkLoopInterference(MachineLoopRange *Loop) {
VRI = VirtReg->advanceTo(VRI, Overlaps.start());
}
}
+
+void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
+ unsigned NSize) {
+ // Reuse existing allocation.
+ if (NSize == Size)
+ return;
+ clear();
+ Size = NSize;
+ LIUs = static_cast<LiveIntervalUnion*>(
+ malloc(sizeof(LiveIntervalUnion)*NSize));
+ for (unsigned i = 0; i != Size; ++i)
+ new(LIUs + i) LiveIntervalUnion(Alloc);
+}
+
+void LiveIntervalUnion::Array::clear() {
+ if (!LIUs)
+ return;
+ for (unsigned i = 0; i != Size; ++i)
+ LIUs[i].~LiveIntervalUnion();
+ free(LIUs);
+ Size = 0;
+ LIUs = 0;
+}
diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h
index dbf5ac1..cd4e690 100644
--- a/lib/CodeGen/LiveIntervalUnion.h
+++ b/lib/CodeGen/LiveIntervalUnion.h
@@ -60,13 +60,11 @@ public:
class Query;
private:
- const unsigned RepReg; // representative register number
unsigned Tag; // unique tag for current contents.
LiveSegments Segments; // union of virtual reg segments
public:
- LiveIntervalUnion(unsigned r, Allocator &a) : RepReg(r), Tag(0), Segments(a)
- {}
+ explicit LiveIntervalUnion(Allocator &a) : Tag(0), Segments(a) {}
// Iterate over all segments in the union of live virtual registers ordered
// by their starting position.
@@ -183,6 +181,28 @@ public:
Query(const Query&); // DO NOT IMPLEMENT
void operator=(const Query&); // DO NOT IMPLEMENT
};
+
+ // Array of LiveIntervalUnions.
+ class Array {
+ unsigned Size;
+ LiveIntervalUnion *LIUs;
+ public:
+ Array() : Size(0), LIUs(0) {}
+ ~Array() { clear(); }
+
+ // Initialize the array to have Size entries.
+ // Reuse an existing allocation if the size matches.
+ void init(LiveIntervalUnion::Allocator&, unsigned Size);
+
+ unsigned size() const { return Size; }
+
+ void clear();
+
+ LiveIntervalUnion& operator[](unsigned idx) {
+ assert(idx < Size && "idx out of bounds");
+ return LIUs[idx];
+ }
+ };
};
} // end namespace llvm
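
The new Array container exists because LiveIntervalUnion has no default constructor (it needs an Allocator), so init() pairs a single malloc with placement-new and clear() runs the destructors by hand. A minimal usage sketch, assuming NumRegUnits comes from TargetRegisterInfo and OS/TRI are in scope:

    LiveIntervalUnion::Allocator Alloc;
    LiveIntervalUnion::Array Matrix;
    Matrix.init(Alloc, NumRegUnits); // no-op if the size already matches
    for (unsigned u = 0; u != Matrix.size(); ++u)
      Matrix[u].print(OS, TRI);      // operator[] asserts u < size()
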
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index d8ab791..d828f25 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -14,10 +14,19 @@
#define DEBUG_TYPE "regalloc"
#include "LiveRangeCalc.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
-void LiveRangeCalc::reset(const MachineFunction *MF) {
+void LiveRangeCalc::reset(const MachineFunction *MF,
+ SlotIndexes *SI,
+ MachineDominatorTree *MDT,
+ VNInfo::Allocator *VNIA) {
+ MRI = &MF->getRegInfo();
+ Indexes = SI;
+ DomTree = MDT;
+ Alloc = VNIA;
+
unsigned N = MF->getNumBlockIDs();
Seen.clear();
Seen.resize(N);
@@ -26,8 +35,72 @@ void LiveRangeCalc::reset(const MachineFunction *MF) {
}
+void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
+ assert(MRI && Indexes && "call reset() first");
+
+ // Visit all def operands. If the same instruction has multiple defs of Reg,
+ // LI->createDeadDef() will deduplicate.
+ for (MachineRegisterInfo::def_iterator
+ I = MRI->def_begin(Reg), E = MRI->def_end(); I != E; ++I) {
+ const MachineInstr *MI = &*I;
+ // Find the corresponding slot index.
+ SlotIndex Idx;
+ if (MI->isPHI())
+ // PHI defs begin at the basic block start index.
+ Idx = Indexes->getMBBStartIdx(MI->getParent());
+ else
+ // Instructions are either normal 'r', or early clobber 'e'.
+ Idx = Indexes->getInstructionIndex(MI)
+ .getRegSlot(I.getOperand().isEarlyClobber());
+
+ // Create the def in LI. This may find an existing def.
+ LI->createDeadDef(Idx, *Alloc);
+ }
+}
+
+
+void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
+ assert(MRI && Indexes && "call reset() first");
+
+ // Visit all operands that read Reg. This may include partial defs.
+ for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
+ E = MRI->reg_nodbg_end(); I != E; ++I) {
+ const MachineOperand &MO = I.getOperand();
+ if (!MO.readsReg())
+ continue;
+ // MI is reading Reg. We may have visited MI before if it happens to be
+ // reading Reg multiple times. That is OK; extend() is idempotent.
+ const MachineInstr *MI = &*I;
+
+ // Find the SlotIndex being read.
+ SlotIndex Idx;
+ if (MI->isPHI()) {
+ assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
+ // PHI operands are paired: (Reg, PredMBB).
+ // Extend the live range to be live-out from PredMBB.
+ Idx = Indexes->getMBBEndIdx(MI->getOperand(I.getOperandNo()+1).getMBB());
+ } else {
+ // This is a normal instruction.
+ Idx = Indexes->getInstructionIndex(MI).getRegSlot();
+ // Check for early-clobber redefs.
+ unsigned DefIdx;
+ if (MO.isDef()) {
+ if (MO.isEarlyClobber())
+ Idx = Idx.getRegSlot(true);
+ } else if (MI->isRegTiedToDefOperand(I.getOperandNo(), &DefIdx)) {
+ // FIXME: This would be a lot easier if tied early-clobber uses also
+ // had an early-clobber flag.
+ if (MI->getOperand(DefIdx).isEarlyClobber())
+ Idx = Idx.getRegSlot(true);
+ }
+ }
+ extend(LI, Idx, Reg);
+ }
+}
+
+
// Transfer information from the LiveIn vector to the live ranges.
-void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI, SlotIndexes *Indexes) {
+void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI) {
for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
E = LiveIn.end(); I != E; ++I) {
if (!I->DomNode)
@@ -56,9 +129,7 @@ void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI, SlotIndexes *Indexes) {
void LiveRangeCalc::extend(LiveInterval *LI,
SlotIndex Kill,
- SlotIndexes *Indexes,
- MachineDominatorTree *DomTree,
- VNInfo::Allocator *Alloc) {
+ unsigned PhysReg) {
assert(LI && "Missing live range");
assert(Kill.isValid() && "Invalid SlotIndex");
assert(Indexes && "Missing SlotIndexes");
@@ -75,34 +146,31 @@ void LiveRangeCalc::extend(LiveInterval *LI,
// multiple values, and we may need to create even more phi-defs to preserve
// VNInfo SSA form. Perform a search for all predecessor blocks where we
// know the dominating VNInfo.
- VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, Indexes, DomTree);
+ VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, PhysReg);
// When there were multiple different values, we may need new PHIs.
if (!VNI)
- updateSSA(Indexes, DomTree, Alloc);
+ updateSSA();
- updateLiveIns(VNI, Indexes);
+ updateLiveIns(VNI);
}
// This function is called by a client after using the low-level API to add
// live-out and live-in blocks. The unique value optimization is not
 // available; SplitEditor::transferValues handles that case directly anyway.
-void LiveRangeCalc::calculateValues(SlotIndexes *Indexes,
- MachineDominatorTree *DomTree,
- VNInfo::Allocator *Alloc) {
+void LiveRangeCalc::calculateValues() {
assert(Indexes && "Missing SlotIndexes");
assert(DomTree && "Missing dominator tree");
- updateSSA(Indexes, DomTree, Alloc);
- updateLiveIns(0, Indexes);
+ updateSSA();
+ updateLiveIns(0);
}
VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI,
MachineBasicBlock *KillMBB,
SlotIndex Kill,
- SlotIndexes *Indexes,
- MachineDominatorTree *DomTree) {
+ unsigned PhysReg) {
// Blocks where LI should be live-in.
SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB);
@@ -113,7 +181,22 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI,
// Using Seen as a visited set, perform a BFS for all reaching defs.
for (unsigned i = 0; i != WorkList.size(); ++i) {
MachineBasicBlock *MBB = WorkList[i];
- assert(!MBB->pred_empty() && "Value live-in to entry block?");
+
+#ifndef NDEBUG
+ if (MBB->pred_empty()) {
+ MBB->getParent()->verify();
+ llvm_unreachable("Use not jointly dominated by defs.");
+ }
+
+ if (TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
+ !MBB->isLiveIn(PhysReg)) {
+ MBB->getParent()->verify();
+ errs() << "The register needs to be live in to BB#" << MBB->getNumber()
+ << ", but is missing from the live-in list.\n";
+ llvm_unreachable("Invalid global physical register");
+ }
+#endif
+
for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
PE = MBB->pred_end(); PI != PE; ++PI) {
MachineBasicBlock *Pred = *PI;
@@ -168,9 +251,7 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI,
// This is essentially the same iterative algorithm that SSAUpdater uses,
// except we already have a dominator tree, so we don't have to recompute it.
-void LiveRangeCalc::updateSSA(SlotIndexes *Indexes,
- MachineDominatorTree *DomTree,
- VNInfo::Allocator *Alloc) {
+void LiveRangeCalc::updateSSA() {
assert(Indexes && "Missing SlotIndexes");
assert(DomTree && "Missing dominator tree");
@@ -238,7 +319,6 @@ void LiveRangeCalc::updateSSA(SlotIndexes *Indexes,
SlotIndex Start, End;
tie(Start, End) = Indexes->getMBBRange(MBB);
VNInfo *VNI = I->LI->getNextValue(Start, *Alloc);
- VNI->setIsPHIDef(true);
I->Value = VNI;
// This block is done, we know the final value.
I->DomNode = 0;
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index b8c8585..909829b 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -34,6 +34,11 @@ template <class NodeT> class DomTreeNodeBase;
typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode;
class LiveRangeCalc {
+ const MachineRegisterInfo *MRI;
+ SlotIndexes *Indexes;
+ MachineDominatorTree *DomTree;
+ VNInfo::Allocator *Alloc;
+
/// Seen - Bit vector of active entries in LiveOut, also used as a visited
/// set by findReachingDefs. One entry per basic block, indexed by block
/// number. This is kept as a separate bit vector because it can be cleared
@@ -100,26 +105,27 @@ class LiveRangeCalc {
/// to be live-in are added to LiveIn. If a unique reaching def is found,
/// its value is returned, if Kill is jointly dominated by multiple values,
/// NULL is returned.
+ ///
+ /// PhysReg, when set, is used to verify live-in lists on basic blocks.
VNInfo *findReachingDefs(LiveInterval *LI,
MachineBasicBlock *KillMBB,
SlotIndex Kill,
- SlotIndexes *Indexes,
- MachineDominatorTree *DomTree);
+ unsigned PhysReg);
/// updateSSA - Compute the values that will be live in to all requested
/// blocks in LiveIn. Create PHI-def values as required to preserve SSA form.
///
/// Every live-in block must be jointly dominated by the added live-out
/// blocks. No values are read from the live ranges.
- void updateSSA(SlotIndexes *Indexes,
- MachineDominatorTree *DomTree,
- VNInfo::Allocator *Alloc);
+ void updateSSA();
/// updateLiveIns - Add liveness as specified in the LiveIn vector, using VNI
/// as a wildcard value for LiveIn entries without a value.
- void updateLiveIns(VNInfo *VNI, SlotIndexes*);
+ void updateLiveIns(VNInfo *VNI);
public:
+ LiveRangeCalc() : MRI(0), Indexes(0), DomTree(0), Alloc(0) {}
+
//===--------------------------------------------------------------------===//
// High-level interface.
//===--------------------------------------------------------------------===//
@@ -132,14 +138,14 @@ public:
/// that may overlap a previously computed live range, and before the first
/// live range in a function. If live ranges are not known to be
/// non-overlapping, call reset before each.
- void reset(const MachineFunction *MF);
+ void reset(const MachineFunction *MF,
+ SlotIndexes*,
+ MachineDominatorTree*,
+ VNInfo::Allocator*);
/// calculate - Calculate the live range of a virtual register from its defs
/// and uses. LI must be empty with no values.
- void calculate(LiveInterval *LI,
- MachineRegisterInfo *MRI,
- SlotIndexes *Indexes,
- VNInfo::Allocator *Alloc);
+ void calculate(LiveInterval *LI);
//===--------------------------------------------------------------------===//
// Mid-level interface.
@@ -154,21 +160,30 @@ public:
/// Kill is not dominated by a single existing value, PHI-defs are inserted
/// as required to preserve SSA form. If Kill is known to be dominated by a
/// single existing value, Alloc may be null.
- void extend(LiveInterval *LI,
- SlotIndex Kill,
- SlotIndexes *Indexes,
- MachineDominatorTree *DomTree,
- VNInfo::Allocator *Alloc);
+ ///
+ /// PhysReg, when set, is used to verify live-in lists on basic blocks.
+ void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0);
+
+ /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
+ /// Each instruction defining Reg gets a new VNInfo with a corresponding
+ /// minimal live range.
+ void createDeadDefs(LiveInterval *LI, unsigned Reg);
- /// extendToUses - Extend the live range of LI to reach all uses.
+ /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
+ void createDeadDefs(LiveInterval *LI) {
+ createDeadDefs(LI, LI->reg);
+ }
+
+ /// extendToUses - Extend the live range of LI to reach all uses of Reg.
///
/// All uses must be jointly dominated by existing liveness. PHI-defs are
/// inserted as needed to preserve SSA form.
- void extendToUses(LiveInterval *LI,
- MachineRegisterInfo *MRI,
- SlotIndexes *Indexes,
- MachineDominatorTree *DomTree,
- VNInfo::Allocator *Alloc);
+ void extendToUses(LiveInterval *LI, unsigned Reg);
+
+ /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
+ void extendToUses(LiveInterval *LI) {
+ extendToUses(LI, LI->reg);
+ }
//===--------------------------------------------------------------------===//
// Low-level interface.
@@ -216,9 +231,7 @@ public:
///
/// Every predecessor of a live-in block must have been given a value with
 /// setLiveOutValue; the value may be null for live-through blocks.
- void calculateValues(SlotIndexes *Indexes,
- MachineDominatorTree *DomTree,
- VNInfo::Allocator *Alloc);
+ void calculateValues();
};
} // end namespace llvm
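
With this refactor the analyses are cached by reset() instead of threaded through every call, so the high-level interface collapses to a few lines. A hedged sketch of a client computing a virtual register's liveness (MF, Indexes, DomTree, and VNIAlloc are assumed to be the pass's own members):

    LiveRangeCalc LRCalc;
    LRCalc.reset(MF, Indexes, DomTree, &VNIAlloc); // cache analyses once
    LRCalc.createDeadDefs(LI);  // one VNInfo per def of LI->reg
    LRCalc.extendToUses(LI);    // reach every use, inserting PHI-defs as needed
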
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 695f536..b4ce9aa 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -38,7 +38,7 @@ LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
LiveInterval &LI = LIS.getOrCreateInterval(VReg);
- newRegs_.push_back(&LI);
+ NewRegs.push_back(&LI);
return LI;
}
@@ -46,16 +46,16 @@ bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
const MachineInstr *DefMI,
AliasAnalysis *aa) {
assert(DefMI && "Missing instruction");
- scannedRemattable_ = true;
+ ScannedRemattable = true;
if (!TII.isTriviallyReMaterializable(DefMI, aa))
return false;
- remattable_.insert(VNI);
+ Remattable.insert(VNI);
return true;
}
void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
- for (LiveInterval::vni_iterator I = parent_.vni_begin(),
- E = parent_.vni_end(); I != E; ++I) {
+ for (LiveInterval::vni_iterator I = getParent().vni_begin(),
+ E = getParent().vni_end(); I != E; ++I) {
VNInfo *VNI = *I;
if (VNI->isUnused())
continue;
@@ -64,13 +64,13 @@ void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
continue;
checkRematerializable(VNI, DefMI, aa);
}
- scannedRemattable_ = true;
+ ScannedRemattable = true;
}
bool LiveRangeEdit::anyRematerializable(AliasAnalysis *aa) {
- if (!scannedRemattable_)
+ if (!ScannedRemattable)
scanRemattable(aa);
- return !remattable_.empty();
+ return !Remattable.empty();
}
/// allUsesAvailableAt - Return true if all registers used by OrigMI at
@@ -82,12 +82,16 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
UseIdx = UseIdx.getRegSlot(true);
for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = OrigMI->getOperand(i);
- if (!MO.isReg() || !MO.getReg() || MO.isDef())
- continue;
- // Reserved registers are OK.
- if (MO.isUndef() || !LIS.hasInterval(MO.getReg()))
+ if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
continue;
+ // We can't remat physreg uses unless the physreg is constant.
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (MRI.isConstantPhysReg(MO.getReg(), VRM->getMachineFunction()))
+ continue;
+ return false;
+ }
+
LiveInterval &li = LIS.getInterval(MO.getReg());
const VNInfo *OVNI = li.getVNInfoAt(OrigIdx);
if (!OVNI)
@@ -101,10 +105,10 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
bool LiveRangeEdit::canRematerializeAt(Remat &RM,
SlotIndex UseIdx,
bool cheapAsAMove) {
- assert(scannedRemattable_ && "Call anyRematerializable first");
+ assert(ScannedRemattable && "Call anyRematerializable first");
// Use scanRemattable info.
- if (!remattable_.count(RM.ParentVNI))
+ if (!Remattable.count(RM.ParentVNI))
return false;
// No defining instruction provided.
@@ -136,13 +140,13 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
bool Late) {
assert(RM.OrigMI && "Invalid remat");
TII.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri);
- rematted_.insert(RM.ParentVNI);
+ Rematted.insert(RM.ParentVNI);
return LIS.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late)
.getRegSlot();
}
void LiveRangeEdit::eraseVirtReg(unsigned Reg) {
- if (delegate_ && delegate_->LRE_CanEraseVirtReg(Reg))
+ if (TheDelegate && TheDelegate->LRE_CanEraseVirtReg(Reg))
LIS.removeInterval(Reg);
}
@@ -173,6 +177,19 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
if (!DefMI || !UseMI)
return false;
+ // Since we're moving the DefMI load, make sure we're not extending any live
+ // ranges.
+ if (!allUsesAvailableAt(DefMI,
+ LIS.getInstructionIndex(DefMI),
+ LIS.getInstructionIndex(UseMI)))
+ return false;
+
+ // We also need to make sure it is safe to move the load.
+ // Assume there are stores between DefMI and UseMI.
+ bool SawStore = true;
+ if (!DefMI->isSafeToMove(&TII, 0, SawStore))
+ return false;
+
DEBUG(dbgs() << "Try to fold single def: " << *DefMI
<< " into single use: " << *UseMI);
@@ -220,14 +237,22 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
+ // Collect virtual registers to be erased after MI is gone.
+ SmallVector<unsigned, 8> RegsToErase;
+ bool ReadsPhysRegs = false;
+
// Check for live intervals that may shrink
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
if (!MOI->isReg())
continue;
unsigned Reg = MOI->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ // Check if MI reads any unreserved physregs.
+ if (Reg && MOI->readsReg() && !LIS.isReserved(Reg))
+ ReadsPhysRegs = true;
continue;
+ }
LiveInterval &LI = LIS.getInterval(Reg);
// Shrink read registers, unless it is likely to be expensive and
@@ -242,22 +267,49 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
// Remove defined value.
if (MOI->isDef()) {
if (VNInfo *VNI = LI.getVNInfoAt(Idx)) {
- if (delegate_)
- delegate_->LRE_WillShrinkVirtReg(LI.reg);
+ if (TheDelegate)
+ TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
LI.removeValNo(VNI);
- if (LI.empty()) {
- ToShrink.remove(&LI);
- eraseVirtReg(Reg);
- }
+ if (LI.empty())
+ RegsToErase.push_back(Reg);
}
}
}
- if (delegate_)
- delegate_->LRE_WillEraseInstruction(MI);
- LIS.RemoveMachineInstrFromMaps(MI);
- MI->eraseFromParent();
- ++NumDCEDeleted;
+ // Currently, we don't support DCE of physreg live ranges. If MI reads
+ // any unreserved physregs, don't erase the instruction, but turn it into
+ // a KILL instead. This way, the physreg live ranges don't end up
+ // dangling.
+ // FIXME: It would be better to have something like shrinkToUses() for
+ // physregs. That could potentially enable more DCE and it would free up
+ // the physreg. It would not happen often, though.
+ if (ReadsPhysRegs) {
+ MI->setDesc(TII.get(TargetOpcode::KILL));
+ // Remove all operands that aren't physregs.
+ for (unsigned i = MI->getNumOperands(); i; --i) {
+ const MachineOperand &MO = MI->getOperand(i-1);
+ if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+ MI->RemoveOperand(i-1);
+ }
+ DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
+ } else {
+ if (TheDelegate)
+ TheDelegate->LRE_WillEraseInstruction(MI);
+ LIS.RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+ ++NumDCEDeleted;
+ }
+
+ // Erase any virtregs that are now empty and unused. There may be <undef>
+ // uses around. Keep the empty live range in that case.
+ for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
+ unsigned Reg = RegsToErase[i];
+ if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
+ ToShrink.remove(&LIS.getInterval(Reg));
+ eraseVirtReg(Reg);
+ }
+ }
}
if (ToShrink.empty())
@@ -268,8 +320,8 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
ToShrink.pop_back();
if (foldAsLoad(LI, Dead))
continue;
- if (delegate_)
- delegate_->LRE_WillShrinkVirtReg(LI->reg);
+ if (TheDelegate)
+ TheDelegate->LRE_WillShrinkVirtReg(LI->reg);
if (!LIS.shrinkToUses(LI, &Dead))
continue;
@@ -304,10 +356,14 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
// interval must contain all the split products, and LI doesn't.
if (IsOriginal)
VRM->setIsSplitFromReg(Dups.back()->reg, 0);
- if (delegate_)
- delegate_->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg);
+ if (TheDelegate)
+ TheDelegate->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg);
}
ConEQ.Distribute(&Dups[0], MRI);
+ DEBUG({
+ for (unsigned i = 0; i != NumComp; ++i)
+ dbgs() << '\t' << *Dups[i] << '\n';
+ });
}
}
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
new file mode 100644
index 0000000..cdb1776
--- /dev/null
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -0,0 +1,152 @@
+//===-- LiveRegMatrix.cpp - Track register interference -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LiveRegMatrix analysis pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "LiveRegMatrix.h"
+#include "VirtRegMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(NumAssigned , "Number of registers assigned");
+STATISTIC(NumUnassigned , "Number of registers unassigned");
+
+char LiveRegMatrix::ID = 0;
+INITIALIZE_PASS_BEGIN(LiveRegMatrix, "liveregmatrix",
+ "Live Register Matrix", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
+ "Live Register Matrix", false, false)
+
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID),
+ UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {}
+
+void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<LiveIntervals>();
+ AU.addRequiredTransitive<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ VRM = &getAnalysis<VirtRegMap>();
+
+ unsigned NumRegUnits = TRI->getNumRegUnits();
+ if (NumRegUnits != Matrix.size())
+ Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]);
+ Matrix.init(LIUAlloc, NumRegUnits);
+
+ // Make sure no stale queries get reused.
+ invalidateVirtRegs();
+ return false;
+}
+
+void LiveRegMatrix::releaseMemory() {
+ for (unsigned i = 0, e = Matrix.size(); i != e; ++i) {
+ Matrix[i].clear();
+ Queries[i].clear();
+ }
+}
+
+void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
+ DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI)
+ << " to " << PrintReg(PhysReg, TRI) << ':');
+ assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
+ VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
+ MRI->setPhysRegUsed(PhysReg);
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
+ Matrix[*Units].unify(VirtReg);
+ }
+ ++NumAssigned;
+ DEBUG(dbgs() << '\n');
+}
+
+void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
+ unsigned PhysReg = VRM->getPhys(VirtReg.reg);
+ DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI)
+ << " from " << PrintReg(PhysReg, TRI) << ':');
+ VRM->clearVirt(VirtReg.reg);
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
+ Matrix[*Units].extract(VirtReg);
+ }
+ ++NumUnassigned;
+ DEBUG(dbgs() << '\n');
+}
+
+bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg,
+ unsigned PhysReg) {
+ // Check if the cached information is valid.
+ // The same BitVector can be reused for all PhysRegs.
+ // We could cache multiple VirtRegs if it becomes necessary.
+ if (RegMaskVirtReg != VirtReg.reg || RegMaskTag != UserTag) {
+ RegMaskVirtReg = VirtReg.reg;
+ RegMaskTag = UserTag;
+ RegMaskUsable.clear();
+ LIS->checkRegMaskInterference(VirtReg, RegMaskUsable);
+ }
+
+ // The BitVector is indexed by PhysReg, not register unit.
+ // Regmask interference is more fine grained than regunits.
+ // For example, a Win64 call can clobber %ymm8 yet preserve %xmm8.
+ return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg));
+}
+
+bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
+ unsigned PhysReg) {
+ if (VirtReg.empty())
+ return false;
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+ if (VirtReg.overlaps(LIS->getRegUnit(*Units)))
+ return true;
+ return false;
+}
+
+LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
+ unsigned RegUnit) {
+ LiveIntervalUnion::Query &Q = Queries[RegUnit];
+ Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+ return Q;
+}
+
+LiveRegMatrix::InterferenceKind
+LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
+ if (VirtReg.empty())
+ return IK_Free;
+
+ // Regmask interference is the fastest check.
+ if (checkRegMaskInterference(VirtReg, PhysReg))
+ return IK_RegMask;
+
+ // Check for fixed interference.
+ if (checkRegUnitInterference(VirtReg, PhysReg))
+ return IK_RegUnit;
+
+ // Check the matrix for virtual register interference.
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+ if (query(VirtReg, *Units).checkInterference())
+ return IK_VirtReg;
+
+ return IK_Free;
+}
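
checkInterference runs its tests from cheapest to most expensive: the cached regmask bits, then fixed regunit overlap, then the full LiveIntervalUnion queries. A caller that only wants to pre-filter candidates can invoke the cheap checks directly; a sketch, assuming Matrix points at the pass instance inside an allocator's candidate loop:

    // Skip PhysReg without touching the unions when a cheap test fails.
    if (Matrix->checkRegMaskInterference(VirtReg, PhysReg) ||
        Matrix->checkRegUnitInterference(VirtReg, PhysReg))
      continue;
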
diff --git a/lib/CodeGen/LiveRegMatrix.h b/lib/CodeGen/LiveRegMatrix.h
new file mode 100644
index 0000000..b3e2d7f
--- /dev/null
+++ b/lib/CodeGen/LiveRegMatrix.h
@@ -0,0 +1,148 @@
+//===-- LiveRegMatrix.h - Track register interference ---------*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRegMatrix analysis pass keeps track of virtual register interference
+// along two dimensions: Slot indexes and register units. The matrix is used by
+// register allocators to ensure that no interfering virtual registers get
+// assigned to overlapping physical registers.
+//
+// Register units are defined in MCRegisterInfo.h, they represent the smallest
+// unit of interference when dealing with overlapping physical registers. The
+// LiveRegMatrix is represented as a LiveIntervalUnion per register unit. When
+// a virtual register is assigned to a physical register, the live range for
+// the virtual register is inserted into the LiveIntervalUnion for each regunit
+// in the physreg.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIVEREGMATRIX_H
+#define LLVM_CODEGEN_LIVEREGMATRIX_H
+
+#include "LiveIntervalUnion.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class LiveInterval;
+class LiveIntervalAnalysis;
+class MachineRegisterInfo;
+class TargetRegisterInfo;
+class VirtRegMap;
+
+class LiveRegMatrix : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+ VirtRegMap *VRM;
+
+ // UserTag changes whenever virtual registers have been modified.
+ unsigned UserTag;
+
+ // The matrix is represented as a LiveIntervalUnion per register unit.
+ LiveIntervalUnion::Allocator LIUAlloc;
+ LiveIntervalUnion::Array Matrix;
+
+ // Cached queries per register unit.
+ OwningArrayPtr<LiveIntervalUnion::Query> Queries;
+
+ // Cached register mask interference info.
+ unsigned RegMaskTag;
+ unsigned RegMaskVirtReg;
+ BitVector RegMaskUsable;
+
+ // MachineFunctionPass boilerplate.
+ virtual void getAnalysisUsage(AnalysisUsage&) const;
+ virtual bool runOnMachineFunction(MachineFunction&);
+ virtual void releaseMemory();
+public:
+ static char ID;
+ LiveRegMatrix();
+
+ //===--------------------------------------------------------------------===//
+ // High-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Check for interference before assigning virtual registers to physical
+ // registers.
+ //
+
+ /// Invalidate cached interference queries after modifying virtual register
+ /// live ranges. Interference checks may return stale information unless
+ /// caches are invalidated.
+ void invalidateVirtRegs() { ++UserTag; }
+
+ enum InterferenceKind {
+ /// No interference, go ahead and assign.
+ IK_Free = 0,
+
+ /// Virtual register interference. There are interfering virtual registers
+ /// assigned to PhysReg or its aliases. This interference could be resolved
+ /// by unassigning those other virtual registers.
+ IK_VirtReg,
+
+ /// Register unit interference. A fixed live range is in the way, typically
+ /// argument registers for a call. This can't be resolved by unassigning
+ /// other virtual registers.
+ IK_RegUnit,
+
+ /// RegMask interference. The live range is crossing an instruction with a
+ /// regmask operand that doesn't preserve PhysReg. This typically means
+ /// VirtReg is live across a call, and PhysReg isn't call-preserved.
+ IK_RegMask
+ };
+
+ /// Check for interference before assigning VirtReg to PhysReg.
+ /// If this function returns IK_Free, it is legal to assign(VirtReg, PhysReg).
+ /// When there is more than one kind of interference, the InterferenceKind
+ /// with the highest enum value is returned.
+ InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Assign VirtReg to PhysReg.
+ /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
+ /// update VirtRegMap. The live range is expected to be available in PhysReg.
+ void assign(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Unassign VirtReg from its PhysReg.
+ /// Assuming that VirtReg was previously assigned to a PhysReg, this undoes
+ /// the assignment and updates VirtRegMap accordingly.
+ void unassign(LiveInterval &VirtReg);
+
+ //===--------------------------------------------------------------------===//
+ // Low-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Provide access to the underlying LiveIntervalUnions.
+ //
+
+ /// Check for regmask interference only.
+ /// Return true if VirtReg crosses a regmask operand that clobbers PhysReg.
+ /// If PhysReg is null, check if VirtReg crosses any regmask operands.
+ bool checkRegMaskInterference(LiveInterval &VirtReg, unsigned PhysReg = 0);
+
+ /// Check for regunit interference only.
+ /// Return true if VirtReg overlaps a fixed assignment of one of PhysRegs's
+ /// register units.
+ bool checkRegUnitInterference(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Query a line of the assigned virtual register matrix directly.
+ /// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg.
+ /// This returns a reference to an internal Query data structure that is only
+ /// valid until the next query() call.
+ LiveIntervalUnion::Query &query(LiveInterval &VirtReg, unsigned RegUnit);
+
+ /// Directly access the live interval unions per regunit.
+ /// This returns an array indexed by the regunit number.
+ LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_LIVEREGMATRIX_H
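
Taken together, the header gives allocators a simple protocol: query, assign, and invalidate after mutating live ranges. An illustrative fragment (VirtReg and PhysReg come from the allocator's own selection loop, which is assumed here):

    if (Matrix->checkInterference(VirtReg, PhysReg) == LiveRegMatrix::IK_Free) {
      Matrix->assign(VirtReg, PhysReg); // updates VirtRegMap and the unions
    } else {
      // Evict, try another PhysReg, or spill. After any live range changes:
      Matrix->invalidateVirtRegs();
    }
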
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 5a0d97d..348ed3a 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -192,8 +192,8 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
unsigned LastDefReg = 0;
unsigned LastDefDist = 0;
MachineInstr *LastDef = NULL;
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
MachineInstr *Def = PhysRegDef[SubReg];
if (!Def)
continue;
@@ -216,9 +216,8 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
unsigned DefReg = MO.getReg();
if (TRI->isSubRegister(Reg, DefReg)) {
PartDefRegs.insert(DefReg);
- for (const uint16_t *SubRegs = TRI->getSubRegisters(DefReg);
- unsigned SubReg = *SubRegs; ++SubRegs)
- PartDefRegs.insert(SubReg);
+ for (MCSubRegIterator SubRegs(DefReg, TRI); SubRegs.isValid(); ++SubRegs)
+ PartDefRegs.insert(*SubRegs);
}
}
return LastDef;
@@ -247,8 +246,8 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
true/*IsImp*/));
PhysRegDef[Reg] = LastPartialDef;
SmallSet<unsigned, 8> Processed;
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
if (Processed.count(SubReg))
continue;
if (PartDefRegs.count(SubReg))
@@ -259,7 +258,7 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
false/*IsDef*/,
true/*IsImp*/));
PhysRegDef[SubReg] = LastPartialDef;
- for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
Processed.insert(*SS);
}
}
@@ -271,9 +270,8 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
// Remember this use.
PhysRegUse[Reg] = MI;
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs)
- PhysRegUse[SubReg] = MI;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ PhysRegUse[*SubRegs] = MI;
}
/// FindLastRefOrPartRef - Return the last reference or partial reference of
@@ -287,8 +285,8 @@ MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) {
MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
unsigned LastPartDefDist = 0;
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
MachineInstr *Def = PhysRegDef[SubReg];
if (Def && Def != LastDef) {
// There was a def of this sub-register in between. This is a partial
@@ -336,8 +334,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
MachineInstr *LastPartDef = 0;
unsigned LastPartDefDist = 0;
SmallSet<unsigned, 8> PartUses;
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
MachineInstr *Def = PhysRegDef[SubReg];
if (Def && Def != LastDef) {
// There was a def of this sub-register in between. This is a partial
@@ -351,7 +349,7 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
}
if (MachineInstr *Use = PhysRegUse[SubReg]) {
PartUses.insert(SubReg);
- for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
PartUses.insert(*SS);
unsigned Dist = DistanceMap[Use];
if (Dist > LastRefOrPartRefDist) {
@@ -367,8 +365,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
// EAX<dead> = op AL<imp-def>
 // That is, EAX def is dead but AL def extends past it.
PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true);
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
if (!PartUses.count(SubReg))
continue;
bool NeedDef = true;
@@ -388,11 +386,10 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
else {
LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true);
PhysRegUse[SubReg] = LastRefOrPartRef;
- for (const uint16_t *SSRegs = TRI->getSubRegisters(SubReg);
- unsigned SSReg = *SSRegs; ++SSRegs)
- PhysRegUse[SSReg] = LastRefOrPartRef;
+ for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+ PhysRegUse[*SS] = LastRefOrPartRef;
}
- for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
PartUses.erase(*SS);
}
} else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) {
@@ -434,7 +431,7 @@ void LiveVariables::HandleRegMask(const MachineOperand &MO) {
// Kill the largest clobbered super-register.
// This avoids needless implicit operands.
unsigned Super = Reg;
- for (const uint16_t *SR = TRI->getSuperRegisters(Reg); *SR; ++SR)
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
if ((PhysRegDef[*SR] || PhysRegUse[*SR]) && MO.clobbersPhysReg(*SR))
Super = *SR;
HandlePhysRegKill(Super, 0);
@@ -447,11 +444,11 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
SmallSet<unsigned, 32> Live;
if (PhysRegDef[Reg] || PhysRegUse[Reg]) {
Live.insert(Reg);
- for (const uint16_t *SS = TRI->getSubRegisters(Reg); *SS; ++SS)
- Live.insert(*SS);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Live.insert(*SubRegs);
} else {
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
// If a register isn't itself defined, but all parts that make up of it
// are defined, then consider it also defined.
// e.g.
@@ -462,7 +459,7 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
continue;
if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) {
Live.insert(SubReg);
- for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
Live.insert(*SS);
}
}
@@ -472,8 +469,8 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
// is referenced.
HandlePhysRegKill(Reg, MI);
// Only some of the sub-registers are used.
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
if (!Live.count(SubReg))
// Skip if this sub-register isn't defined.
continue;
@@ -491,8 +488,8 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI,
Defs.pop_back();
PhysRegDef[Reg] = MI;
PhysRegUse[Reg] = NULL;
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
PhysRegDef[SubReg] = MI;
PhysRegUse[SubReg] = NULL;
}
@@ -576,7 +573,8 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
unsigned MOReg = MO.getReg();
if (MO.isUse()) {
MO.setIsKill(false);
- UseRegs.push_back(MOReg);
+ if (MO.readsReg())
+ UseRegs.push_back(MOReg);
} else /*MO.isDef()*/ {
MO.setIsDead(false);
DefRegs.push_back(MOReg);
@@ -732,8 +730,9 @@ void LiveVariables::analyzePHINodes(const MachineFunction& Fn) {
for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end();
BBI != BBE && BBI->isPHI(); ++BBI)
for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
- PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()]
- .push_back(BBI->getOperand(i).getReg());
+ if (BBI->getOperand(i).readsReg())
+ PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()]
+ .push_back(BBI->getOperand(i).getReg());
}
bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB,
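
Every hunk in this file is the same mechanical migration away from the old null-terminated uint16_t sub-register arrays. The before/after idiom, sketched with a placeholder visit() body:

    // Old: walk a 0-terminated array returned by TRI.
    for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
         unsigned SubReg = *SubRegs; ++SubRegs)
      visit(SubReg);

    // New: a self-terminating iterator.
    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
      visit(*SubRegs);
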
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index 238bf52..fbc9e20 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -314,7 +314,8 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// No previously defined register was in range, so create a
// new one.
int64_t InstrOffset = TRI->getFrameIndexInstrOffset(MI, idx);
- const TargetRegisterClass *RC = TRI->getPointerRegClass();
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF);
BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
DEBUG(dbgs() << " Materializing base register " << BaseReg <<
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 1abb8f2..fa6b450 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -109,7 +109,8 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) {
assert(N->getParent() != 0 && "machine instruction not in a basic block");
// Remove from the use/def lists.
- N->RemoveRegOperandsFromUseLists();
+ if (MachineFunction *MF = N->getParent()->getParent())
+ N->RemoveRegOperandsFromUseLists(MF->getRegInfo());
N->setParent(0);
@@ -271,11 +272,9 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const {
}
if (isLandingPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; }
if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; }
- if (Alignment) {
+ if (Alignment)
OS << Comma << "Align " << Alignment << " (" << (1u << Alignment)
<< " bytes)";
- Comma = ", ";
- }
OS << '\n';
@@ -312,8 +311,11 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const {
if (!succ_empty()) {
if (Indexes) OS << '\t';
OS << " Successors according to CFG:";
- for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI)
+ for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) {
OS << " BB#" << (*SI)->getNumber();
+ if (!Weights.empty())
+ OS << '(' << *getWeightIterator(SI) << ')';
+ }
OS << '\n';
}
}
@@ -479,18 +481,42 @@ MachineBasicBlock::removeSuccessor(succ_iterator I) {
void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
MachineBasicBlock *New) {
- uint32_t weight = 0;
- succ_iterator SI = std::find(Successors.begin(), Successors.end(), Old);
+ if (Old == New)
+ return;
- // If Weight list is empty it means we don't use it (disabled optimization).
- if (!Weights.empty()) {
- weight_iterator WI = getWeightIterator(SI);
- weight = *WI;
+ succ_iterator E = succ_end();
+ succ_iterator NewI = E;
+ succ_iterator OldI = E;
+ for (succ_iterator I = succ_begin(); I != E; ++I) {
+ if (*I == Old) {
+ OldI = I;
+ if (NewI != E)
+ break;
+ }
+ if (*I == New) {
+ NewI = I;
+ if (OldI != E)
+ break;
+ }
}
+ assert(OldI != E && "Old is not a successor of this block");
+ Old->removePredecessor(this);
- // Update the successor information.
- removeSuccessor(SI);
- addSuccessor(New, weight);
+ // If New isn't already a successor, let it take Old's place.
+ if (NewI == E) {
+ New->addPredecessor(this);
+ *OldI = New;
+ return;
+ }
+
+ // New is already a successor.
+ // Update its weight instead of adding a duplicate edge.
+ if (!Weights.empty()) {
+ weight_iterator OldWI = getWeightIterator(OldI);
+ *getWeightIterator(NewI) += *OldWI;
+ Weights.erase(OldWI);
+ }
+ Successors.erase(OldI);
}
void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) {
@@ -509,14 +535,13 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) {
while (!fromMBB->succ_empty()) {
MachineBasicBlock *Succ = *fromMBB->succ_begin();
- uint32_t weight = 0;
-
+ uint32_t Weight = 0;
// If Weight list is empty it means we don't use it (disabled optimization).
if (!fromMBB->Weights.empty())
- weight = *fromMBB->Weights.begin();
+ Weight = *fromMBB->Weights.begin();
- addSuccessor(Succ, weight);
+ addSuccessor(Succ, Weight);
fromMBB->removeSuccessor(Succ);
}
}
@@ -528,7 +553,10 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) {
while (!fromMBB->succ_empty()) {
MachineBasicBlock *Succ = *fromMBB->succ_begin();
- addSuccessor(Succ);
+ uint32_t Weight = 0;
+ if (!fromMBB->Weights.empty())
+ Weight = *fromMBB->Weights.begin();
+ addSuccessor(Succ, Weight);
fromMBB->removeSuccessor(Succ);
// Fix up any PHI nodes in the successor.
@@ -542,9 +570,12 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) {
}
}
+bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const {
+ return std::find(pred_begin(), pred_end(), MBB) != pred_end();
+}
+
bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
- const_succ_iterator I = std::find(Successors.begin(), Successors.end(), MBB);
- return I != Successors.end();
+ return std::find(succ_begin(), succ_end(), MBB) != succ_end();
}
bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
@@ -596,6 +627,11 @@ bool MachineBasicBlock::canFallThrough() {
MachineBasicBlock *
MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
+ // Splitting the critical edge to a landing pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (Succ->isLandingPad())
+ return NULL;
+
MachineFunction *MF = getParent();
DebugLoc dl; // FIXME: this is nowhere
@@ -670,7 +706,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
// Inherit live-ins from the successor
for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(),
- E = Succ->livein_end(); I != E; ++I)
+ E = Succ->livein_end(); I != E; ++I)
NMBB->addLiveIn(*I);
// Update LiveVariables.
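
The rewritten replaceSuccessor also covers the case where New is already a successor: rather than creating a duplicate CFG edge, it folds Old's weight into New's and erases Old. A worked example with made-up weights:

    // Before: BB -> {Old (weight 8), New (weight 2)}
    BB->replaceSuccessor(Old, New);
    // After:  BB -> {New (weight 10)}; Old no longer lists BB as a predecessor.
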
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 5ba6851..c4dca2c 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -11,7 +11,7 @@
// structure and branch probability estimates.
//
// The pass strives to preserve the structure of the CFG (that is, retain
-// a topological ordering of basic blocks) in the absense of a *strong* signal
+// a topological ordering of basic blocks) in the absence of a *strong* signal
// to the contrary from probabilities. However, within the CFG structure, it
// attempts to choose an ordering which favors placing more likely sequences of
// blocks adjacent to each other.
@@ -63,17 +63,13 @@ namespace {
///
 /// This is the data structure representing a chain of consecutive blocks that
/// are profitable to layout together in order to maximize fallthrough
-/// probabilities. We also can use a block chain to represent a sequence of
-/// basic blocks which have some external (correctness) requirement for
-/// sequential layout.
+/// probabilities and code locality. We also can use a block chain to represent
+/// a sequence of basic blocks which have some external (correctness)
+/// requirement for sequential layout.
///
-/// Eventually, the block chains will form a directed graph over the function.
-/// We provide an SCC-supporting-iterator in order to quicky build and walk the
-/// SCCs of block chains within a function.
-///
-/// The block chains also have support for calculating and caching probability
-/// information related to the chain itself versus other chains. This is used
-/// for ranking during the final layout of block chains.
+/// Chains can be built around a single basic block and can be merged to grow
+/// them. They participate in a block-to-chain mapping, which is updated
+/// automatically as chains are merged together.
class BlockChain {
/// \brief The sequence of blocks belonging to this chain.
///
@@ -179,10 +175,11 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// \brief Allocator and owner of BlockChain structures.
///
- /// We build BlockChains lazily by merging together high probability BB
- /// sequences acording to the "Algo2" in the paper mentioned at the top of
- /// the file. To reduce malloc traffic, we allocate them using this slab-like
- /// allocator, and destroy them after the pass completes.
+ /// We build BlockChains lazily while processing the loop structure of
+ /// a function. To reduce malloc traffic, we allocate them using this
+ /// slab-like allocator, and destroy them after the pass completes. An
+ /// important guarantee is that this allocator produces stable pointers to
+ /// the chains.
SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
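
A simplified stand-in for the stable-pointer guarantee the comment above calls out (assumed model, not the SpecificBumpPtrAllocator implementation): like a bump-pointer slab, a deque never relocates existing elements when it grows, so pointers handed out remain valid as more chains are created.

    #include <deque>

    struct ChainModel { int FirstBlockId; };

    class ChainArena {
      std::deque<ChainModel> Slabs;
    public:
      ChainModel *create(int Id) {
        ChainModel C;
        C.FirstBlockId = Id;
        Slabs.push_back(C);
        return &Slabs.back();   // stable across later create() calls
      }
    };
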
/// \brief Function wide BasicBlock to BlockChain mapping.
@@ -329,7 +326,7 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor(
// the MBPI analysis, we manually compute probabilities using the edge
// weights. This is suboptimal as it means that the somewhat subtle
// definition of edge weight semantics is encoded here as well. We should
- // improve the MBPI interface to effeciently support query patterns such as
+ // improve the MBPI interface to efficiently support query patterns such as
// this.
uint32_t BestWeight = 0;
uint32_t WeightScale = 0;
@@ -988,8 +985,22 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
// boiler plate.
Cond.clear();
MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
- if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond))
+ if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+ // If PrevBB has a two-way branch, try to re-order the branches
+ // such that we branch to the successor with higher weight first.
+ if (TBB && !Cond.empty() && FBB &&
+ MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) &&
+ !TII->ReverseBranchCondition(Cond)) {
+ DEBUG(dbgs() << "Reverse order of the two branches: "
+ << getBlockName(PrevBB) << "\n");
+ DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB)
+ << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n");
+ DebugLoc dl; // FIXME: this is nowhere
+ TII->RemoveBranch(*PrevBB);
+ TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl);
+ }
PrevBB->updateTerminator();
+ }
}
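
A toy model of the reordering added above (plain structs standing in for the AnalyzeBranch outputs): for a conditional branch to TBB with a fallthrough-plus-jump to FBB, branch to the heavier target first whenever the condition can be reversed.

    #include <cassert>

    struct TwoWayBranch {
      unsigned WeightTBB, WeightFBB;
      bool CanReverseCond;
    };

    bool shouldReverse(const TwoWayBranch &B) {
      return B.CanReverseCond && B.WeightFBB > B.WeightTBB;
    }

    int main() {
      TwoWayBranch B = { 10, 90, true };
      assert(shouldReverse(B));   // FBB is hotter: swap the branch targets
      return 0;
    }
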
// Fixup the last block.
@@ -1000,29 +1011,63 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
// Walk through the backedges of the function now that we have fully laid out
// the basic blocks and align the destination of each backedge. We don't rely
- // on the loop info here so that we can align backedges in unnatural CFGs and
- // backedges that were introduced purely because of the loop rotations done
- // during this layout pass.
- // FIXME: This isn't quite right, we shouldn't align backedges that result
- // from blocks being sunken below the exit block for the function.
+ // exclusively on the loop info here so that we can align backedges in
+ // unnatural CFGs and backedges that were introduced purely because of the
+ // loop rotations done during this layout pass.
if (F.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
return;
unsigned Align = TLI->getPrefLoopAlignment();
if (!Align)
return; // Don't care about loop alignment.
+ if (FunctionChain.begin() == FunctionChain.end())
+ return; // Empty chain.
- SmallPtrSet<MachineBasicBlock *, 16> PreviousBlocks;
- for (BlockChain::iterator BI = FunctionChain.begin(),
+ const BranchProbability ColdProb(1, 5); // 20%
+ BlockFrequency EntryFreq = MBFI->getBlockFreq(F.begin());
+ BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
+ for (BlockChain::iterator BI = llvm::next(FunctionChain.begin()),
BE = FunctionChain.end();
BI != BE; ++BI) {
- PreviousBlocks.insert(*BI);
- // Set alignment on the destination of all the back edges in the new
- // ordering.
- for (MachineBasicBlock::succ_iterator SI = (*BI)->succ_begin(),
- SE = (*BI)->succ_end();
- SI != SE; ++SI)
- if (PreviousBlocks.count(*SI))
- (*SI)->setAlignment(Align);
+ // Don't align non-looping basic blocks. These are unlikely to execute
+ // enough times to matter in practice. Note that we'll still handle
+ // unnatural CFGs inside of a natural outer loop (the common case) and
+ // rotated loops.
+ MachineLoop *L = MLI->getLoopFor(*BI);
+ if (!L)
+ continue;
+
+ // If the block is cold relative to the function entry don't waste space
+ // aligning it.
+ BlockFrequency Freq = MBFI->getBlockFreq(*BI);
+ if (Freq < WeightedEntryFreq)
+ continue;
+
+ // If the block is cold relative to its loop header, don't align it
+ // regardless of what edges into the block exist.
+ MachineBasicBlock *LoopHeader = L->getHeader();
+ BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
+ if (Freq < (LoopHeaderFreq * ColdProb))
+ continue;
+
+ // Check for the existence of a non-layout predecessor which would benefit
+ // from aligning this block.
+ MachineBasicBlock *LayoutPred = *llvm::prior(BI);
+
+ // Force alignment if all the predecessors are jumps. We already checked
+ // that the block isn't cold above.
+ if (!LayoutPred->isSuccessor(*BI)) {
+ (*BI)->setAlignment(Align);
+ continue;
+ }
+
+ // Align this block if the layout predecessor's edge into this block is
+ // cold relative to the block. When this is true, othe predecessors make up
+ // cold relative to the block. When this is true, other predecessors make up
+ // all of the hot entries into the block and thus alignment is likely to be
+ // important.
+ BranchProbability LayoutProb = MBPI->getEdgeProbability(LayoutPred, *BI);
+ BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
+ if (LayoutEdgeFreq <= (Freq * ColdProb))
+ (*BI)->setAlignment(Align);
}
}
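
A numeric sketch of the three cold-block filters above, using plain doubles in place of BlockFrequency and BranchProbability (an assumed simplification; the forced-alignment case for blocks whose layout predecessor is not a CFG predecessor is omitted):

    #include <cassert>

    bool worthAligning(double BlockFreq, double EntryFreq,
                       double LoopHeaderFreq, double LayoutEdgeFreq) {
      const double ColdProb = 0.2;                  // 20%, as in the patch
      if (BlockFreq < EntryFreq * ColdProb)
        return false;                               // cold vs. function entry
      if (BlockFreq < LoopHeaderFreq * ColdProb)
        return false;                               // cold vs. loop header
      // Align only when the fallthrough edge is cold relative to the
      // block, i.e. the hot entries arrive by jumps.
      return LayoutEdgeFreq <= BlockFreq * ColdProb;
    }

    int main() {
      assert(worthAligning(100.0, 10.0, 120.0, 15.0));   // hot backedge target
      assert(!worthAligning(100.0, 10.0, 120.0, 95.0));  // mostly fallthrough
      return 0;
    }
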
@@ -1053,7 +1098,7 @@ namespace {
///
/// A separate pass to compute interesting statistics for evaluating block
/// placement. This is separate from the actual placement pass so that they can
-/// be computed in the absense of any placement transformations or when using
+/// be computed in the absence of any placement transformations or when using
/// alternative placement strategies.
class MachineBlockPlacementStats : public MachineFunctionPass {
/// \brief A handle to the branch probability pass.
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index a63688e..896461f 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -84,7 +84,7 @@ namespace {
bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB);
bool isPhysDefTriviallyDead(unsigned Reg,
MachineBasicBlock::const_iterator I,
- MachineBasicBlock::const_iterator E) const ;
+ MachineBasicBlock::const_iterator E) const;
bool hasLivePhysRegDefUses(const MachineInstr *MI,
const MachineBasicBlock *MBB,
SmallSet<unsigned,8> &PhysRefs,
@@ -100,8 +100,7 @@ namespace {
void ExitScope(MachineBasicBlock *MBB);
bool ProcessBlock(MachineBasicBlock *MBB);
void ExitScopeIfDone(MachineDomTreeNode *Node,
- DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
- DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap);
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
bool PerformCSE(MachineDomTreeNode *Node);
};
} // end anonymous namespace
@@ -216,11 +215,12 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
if (MO.isDef() &&
(MO.isDead() || isPhysDefTriviallyDead(Reg, I, MBB->end())))
continue;
- PhysRefs.insert(Reg);
+ // Reading constant physregs is ok.
+ if (!MRI->isConstantPhysReg(Reg, *MBB->getParent()))
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ PhysRefs.insert(*AI);
if (MO.isDef())
PhysDefs.push_back(Reg);
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias)
- PhysRefs.insert(*Alias);
}
return !PhysRefs.empty();
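
A rough standalone picture of the alias expansion above (types are illustrative): a reference to a non-constant physical register taints every overlapping register, so the whole alias set, which like MCRegAliasIterator with IncludeSelf contains the register itself, lands in PhysRefs.

    #include <set>
    #include <vector>

    typedef unsigned Reg;

    void collectRefs(bool IsConstantPhysReg,
                     const std::vector<Reg> &AliasSetOfReg,  // includes Reg
                     std::set<Reg> &PhysRefs) {
      if (IsConstantPhysReg)
        return;                     // reading constant physregs is ok
      PhysRefs.insert(AliasSetOfReg.begin(), AliasSetOfReg.end());
    }
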
@@ -326,6 +326,29 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
MachineInstr *CSMI, MachineInstr *MI) {
// FIXME: Heuristics that work around the lack of live range splitting.
+ // If CSReg is used at all uses of Reg, CSE should not increase register
+ // pressure of CSReg.
+ bool MayIncreasePressure = true;
+ if (TargetRegisterInfo::isVirtualRegister(CSReg) &&
+ TargetRegisterInfo::isVirtualRegister(Reg)) {
+ MayIncreasePressure = false;
+ SmallPtrSet<MachineInstr*, 8> CSUses;
+ for (MachineRegisterInfo::use_nodbg_iterator I =MRI->use_nodbg_begin(CSReg),
+ E = MRI->use_nodbg_end(); I != E; ++I) {
+ MachineInstr *Use = &*I;
+ CSUses.insert(Use);
+ }
+ for (MachineRegisterInfo::use_nodbg_iterator I = MRI->use_nodbg_begin(Reg),
+ E = MRI->use_nodbg_end(); I != E; ++I) {
+ MachineInstr *Use = &*I;
+ if (!CSUses.count(Use)) {
+ MayIncreasePressure = true;
+ break;
+ }
+ }
+ }
+ if (!MayIncreasePressure) return true;
+
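
A standalone restatement of the check above, with instruction identity modeled by an opaque handle: replacing Reg by CSReg cannot grow CSReg's live range if every user of Reg already uses CSReg, that is, uses(Reg) is a subset of uses(CSReg).

    #include <set>

    typedef const void *InstrHandle;

    bool mayIncreasePressure(const std::set<InstrHandle> &CSUses,
                             const std::set<InstrHandle> &RegUses) {
      for (std::set<InstrHandle>::const_iterator I = RegUses.begin(),
             E = RegUses.end(); I != E; ++I)
        if (!CSUses.count(*I))
          return true;   // a user of Reg outside CSUses: range may extend
      return false;
    }
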
// Heuristics #1: Don't CSE "cheap" computation if the def is not local or in
// an immediate predecessor. We don't want to increase register pressure and
// end up causing other computation to be spilled.
@@ -396,6 +419,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
bool Changed = false;
SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
+ SmallVector<unsigned, 2> ImplicitDefsToUpdate;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
MachineInstr *MI = &*I;
++I;
@@ -437,7 +461,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// used, then it's not safe to replace it with a common subexpression.
// It's also not safe if the instruction uses physical registers.
bool CrossMBBPhysDef = false;
- SmallSet<unsigned,8> PhysRefs;
+ SmallSet<unsigned, 8> PhysRefs;
SmallVector<unsigned, 2> PhysDefs;
if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs, PhysDefs)) {
FoundCSE = false;
@@ -465,21 +489,31 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// Check if it's profitable to perform this CSE.
bool DoCSE = true;
- unsigned NumDefs = MI->getDesc().getNumDefs();
+ unsigned NumDefs = MI->getDesc().getNumDefs() +
+ MI->getDesc().getNumImplicitDefs();
+
for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
unsigned OldReg = MO.getReg();
unsigned NewReg = CSMI->getOperand(i).getReg();
- if (OldReg == NewReg)
+
+ // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+ // we should make sure it is not dead at CSMI.
+ if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
+ ImplicitDefsToUpdate.push_back(i);
+ if (OldReg == NewReg) {
+ --NumDefs;
continue;
+ }
assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
TargetRegisterInfo::isVirtualRegister(NewReg) &&
"Do not CSE physical register defs!");
if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
+ DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
DoCSE = false;
break;
}
@@ -488,6 +522,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// within the register class of the new instruction.
const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg);
if (!MRI->constrainRegClass(NewReg, OldRC)) {
+ DEBUG(dbgs() << "*** Not the same register class, avoid CSE!\n");
DoCSE = false;
break;
}
@@ -503,6 +538,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
MRI->clearKillFlags(CSEPairs[i].second);
}
+ // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+ // we should make sure it is not dead at CSMI.
+ for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i)
+ CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false);
+
if (CrossMBBPhysDef) {
// Add physical register defs now coming in from a predecessor to MBB
// livein list.
@@ -522,11 +562,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
++NumCommutes;
Changed = true;
} else {
- DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
VNT.insert(MI, CurrVN++);
Exps.push_back(MI);
}
CSEPairs.clear();
+ ImplicitDefsToUpdate.clear();
}
return Changed;
@@ -537,8 +577,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
/// up the dominator tree to destroy ancestors which are now done.
void
MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
- DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
- DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) {
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) {
if (OpenChildren[Node])
return;
@@ -546,7 +585,7 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
ExitScope(Node->getBlock());
// Now traverse upwards to pop ancestors whose offsprings are all done.
- while (MachineDomTreeNode *Parent = ParentMap[Node]) {
+ while (MachineDomTreeNode *Parent = Node->getIDom()) {
unsigned Left = --OpenChildren[Parent];
if (Left != 0)
break;
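
A toy model of the simplified traversal above: OpenChildren counts, per dominator-tree node, the children not yet finished; when a node completes, walk the immediate-dominator chain (getIDom() in the patch, a plain pointer here) and close every ancestor whose last child just finished.

    #include <map>

    struct Node { Node *IDom; };

    void exitScopeIfDone(Node *N, std::map<Node*, unsigned> &OpenChildren) {
      if (OpenChildren[N])
        return;                    // N itself still has open children
      // ... exit N's scope here ...
      while (Node *Parent = N->IDom) {
        if (--OpenChildren[Parent] != 0)
          break;                   // a sibling subtree is still open
        // ... exit Parent's scope here ...
        N = Parent;
      }
    }
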
@@ -558,7 +597,6 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
SmallVector<MachineDomTreeNode*, 32> Scopes;
SmallVector<MachineDomTreeNode*, 8> WorkList;
- DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap;
DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
CurrVN = 0;
@@ -573,7 +611,6 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
OpenChildren[Node] = NumChildren;
for (unsigned i = 0; i != NumChildren; ++i) {
MachineDomTreeNode *Child = Children[i];
- ParentMap[Child] = Node;
WorkList.push_back(Child);
}
} while (!WorkList.empty());
@@ -586,7 +623,7 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
EnterScope(MBB);
Changed |= ProcessBlock(MBB);
// If it's a leaf node, it's done. Traverse upwards to pop ancestors.
- ExitScopeIfDone(Node, OpenChildren, ParentMap);
+ ExitScopeIfDone(Node, OpenChildren);
}
return Changed;
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index 9730eaa..bac3aa2 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -62,28 +62,16 @@ void
MachineCopyPropagation::SourceNoLongerAvailable(unsigned Reg,
SourceMap &SrcMap,
DenseMap<unsigned, MachineInstr*> &AvailCopyMap) {
- SourceMap::iterator SI = SrcMap.find(Reg);
- if (SI != SrcMap.end()) {
- const DestList& Defs = SI->second;
- for (DestList::const_iterator I = Defs.begin(), E = Defs.end();
- I != E; ++I) {
- unsigned MappedDef = *I;
- // Source of copy is no longer available for propagation.
- if (AvailCopyMap.erase(MappedDef)) {
- for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR)
- AvailCopyMap.erase(*SR);
- }
- }
- }
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
- SI = SrcMap.find(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ SourceMap::iterator SI = SrcMap.find(*AI);
if (SI != SrcMap.end()) {
const DestList& Defs = SI->second;
for (DestList::const_iterator I = Defs.begin(), E = Defs.end();
I != E; ++I) {
unsigned MappedDef = *I;
+ // Source of copy is no longer available for propagation.
if (AvailCopyMap.erase(MappedDef)) {
- for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR)
+ for (MCSubRegIterator SR(MappedDef, TRI); SR.isValid(); ++SR)
AvailCopyMap.erase(*SR);
}
}
@@ -188,11 +176,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
}
// If Src is defined by a previous copy, it cannot be eliminated.
- CI = CopyMap.find(Src);
- if (CI != CopyMap.end())
- MaybeDeadCopies.remove(CI->second);
- for (const uint16_t *AS = TRI->getAliasSet(Src); *AS; ++AS) {
- CI = CopyMap.find(*AS);
+ for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) {
+ CI = CopyMap.find(*AI);
if (CI != CopyMap.end())
MaybeDeadCopies.remove(CI->second);
}
@@ -211,13 +196,13 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// Remember Def is defined by the copy.
// ... Make sure to clear the def maps of aliases first.
- for (const uint16_t *AS = TRI->getAliasSet(Def); *AS; ++AS) {
- CopyMap.erase(*AS);
- AvailCopyMap.erase(*AS);
+ for (MCRegAliasIterator AI(Def, TRI, false); AI.isValid(); ++AI) {
+ CopyMap.erase(*AI);
+ AvailCopyMap.erase(*AI);
}
CopyMap[Def] = MI;
AvailCopyMap[Def] = MI;
- for (const uint16_t *SR = TRI->getSubRegisters(Def); *SR; ++SR) {
+ for (MCSubRegIterator SR(Def, TRI); SR.isValid(); ++SR) {
CopyMap[*SR] = MI;
AvailCopyMap[*SR] = MI;
}
@@ -256,11 +241,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// If 'Reg' is defined by a copy, the copy is no longer a candidate
// for elimination.
- DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(Reg);
- if (CI != CopyMap.end())
- MaybeDeadCopies.remove(CI->second);
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
- CI = CopyMap.find(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(*AI);
if (CI != CopyMap.end())
MaybeDeadCopies.remove(CI->second);
}
@@ -296,11 +278,9 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
unsigned Reg = Defs[i];
// No longer defined by a copy.
- CopyMap.erase(Reg);
- AvailCopyMap.erase(Reg);
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
- CopyMap.erase(*AS);
- AvailCopyMap.erase(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ CopyMap.erase(*AI);
+ AvailCopyMap.erase(*AI);
}
// If 'Reg' is previously source of a copy, it is no longer available for
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index d8c2f6a..d4aede8a 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -26,7 +27,6 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLowering.h"
@@ -60,7 +60,7 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
MFInfo = 0;
FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameLowering());
if (Fn->hasFnAttr(Attribute::StackAlignment))
- FrameInfo->setMaxAlignment(Attribute::getStackAlignmentFromAttrs(
+ FrameInfo->ensureMaxAlignment(Attribute::getStackAlignmentFromAttrs(
Fn->getAttributes().getFnAttributes()));
ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData());
Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
@@ -84,9 +84,13 @@ MachineFunction::~MachineFunction() {
MFInfo->~MachineFunctionInfo();
Allocator.Deallocate(MFInfo);
}
- FrameInfo->~MachineFrameInfo(); Allocator.Deallocate(FrameInfo);
- ConstantPool->~MachineConstantPool(); Allocator.Deallocate(ConstantPool);
-
+
+ FrameInfo->~MachineFrameInfo();
+ Allocator.Deallocate(FrameInfo);
+
+ ConstantPool->~MachineConstantPool();
+ Allocator.Deallocate(ConstantPool);
+
if (JumpTableInfo) {
JumpTableInfo->~MachineJumpTableInfo();
Allocator.Deallocate(JumpTableInfo);
@@ -98,7 +102,7 @@ MachineFunction::~MachineFunction() {
MachineJumpTableInfo *MachineFunction::
getOrCreateJumpTableInfo(unsigned EntryKind) {
if (JumpTableInfo) return JumpTableInfo;
-
+
JumpTableInfo = new (Allocator)
MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind);
return JumpTableInfo;
@@ -116,12 +120,12 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
MBBI = begin();
else
MBBI = MBB;
-
+
// Figure out the block number this should have.
unsigned BlockNo = 0;
if (MBBI != begin())
BlockNo = prior(MBBI)->getNumber()+1;
-
+
for (; MBBI != E; ++MBBI, ++BlockNo) {
if (MBBI->getNumber() != (int)BlockNo) {
// Remove use of the old number.
@@ -130,7 +134,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
"MBB number mismatch!");
MBBNumbering[MBBI->getNumber()] = 0;
}
-
+
// If BlockNo is already taken, set that block's number to -1.
if (MBBNumbering[BlockNo])
MBBNumbering[BlockNo]->setNumber(-1);
@@ -138,7 +142,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
MBBNumbering[BlockNo] = MBBI;
MBBI->setNumber(BlockNo);
}
- }
+ }
// Okay, all the blocks are renumbered. If we have compactified the block
// numbering, shrink MBBNumbering now.
@@ -295,16 +299,16 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
// Print Frame Information
FrameInfo->print(*this, OS);
-
+
// Print JumpTable Information
if (JumpTableInfo)
JumpTableInfo->print(OS);
// Print Constant Pool
ConstantPool->print(OS);
-
+
const TargetRegisterInfo *TRI = getTarget().getRegisterInfo();
-
+
if (RegInfo && !RegInfo->livein_empty()) {
OS << "Function Live Ins: ";
for (MachineRegisterInfo::livein_iterator
@@ -324,7 +328,7 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
OS << ' ' << PrintReg(*I, TRI);
OS << '\n';
}
-
+
for (const_iterator BB = begin(), E = end(); BB != E; ++BB) {
OS << '\n';
BB->print(OS, Indexes);
@@ -411,10 +415,9 @@ unsigned MachineFunction::addLiveIn(unsigned PReg,
MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
bool isLinkerPrivate) const {
assert(JumpTableInfo && "No jump tables");
-
assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
const MCAsmInfo &MAI = *getTarget().getMCAsmInfo();
-
+
const char *Prefix = isLinkerPrivate ? MAI.getLinkerPrivateGlobalPrefix() :
MAI.getPrivateGlobalPrefix();
SmallString<60> Name;
@@ -691,7 +694,7 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
else if (B->getType() != IntTy)
B = ConstantFoldInstOperands(Instruction::BitCast, IntTy,
const_cast<Constant*>(B), TD);
-
+
return A == B;
}
@@ -714,7 +717,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
Constants[i].Alignment = Alignment;
return i;
}
-
+
Constants.push_back(MachineConstantPoolEntry(C, Alignment));
return Constants.size()-1;
}
@@ -723,7 +726,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
unsigned Alignment) {
assert(Alignment && "Alignment must be specified!");
if (Alignment > PoolAlignment) PoolAlignment = Alignment;
-
+
// Check to see if we already have this constant.
//
// FIXME, this could be made much more efficient for large constant pools.
diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 2aaa798..0102ac7 100644
--- a/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -14,7 +14,9 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -28,6 +30,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass {
raw_ostream &OS;
const std::string Banner;
+ MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { }
MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner)
: MachineFunctionPass(ID), OS(os), Banner(banner) {}
@@ -40,7 +43,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) {
OS << "# " << Banner << ":\n";
- MF.print(OS);
+ MF.print(OS, getAnalysisIfAvailable<SlotIndexes>());
return false;
}
};
@@ -48,6 +51,10 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass {
char MachineFunctionPrinterPass::ID = 0;
}
+char &MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
+INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs",
+ "Machine Function Printer", false, false)
+
namespace llvm {
/// Returns a newly-created MachineFunction Printer pass. The
/// default banner is empty.
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index e553a04..b166849 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -13,6 +13,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/InlineAsm.h"
#include "llvm/LLVMContext.h"
@@ -33,7 +34,6 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LeakDetector.h"
@@ -47,55 +47,6 @@ using namespace llvm;
// MachineOperand Implementation
//===----------------------------------------------------------------------===//
-/// AddRegOperandToRegInfo - Add this register operand to the specified
-/// MachineRegisterInfo. If it is null, then the next/prev fields should be
-/// explicitly nulled out.
-void MachineOperand::AddRegOperandToRegInfo(MachineRegisterInfo *RegInfo) {
- assert(isReg() && "Can only add reg operand to use lists");
-
- // If the reginfo pointer is null, just explicitly null out or next/prev
- // pointers, to ensure they are not garbage.
- if (RegInfo == 0) {
- Contents.Reg.Prev = 0;
- Contents.Reg.Next = 0;
- return;
- }
-
- // Otherwise, add this operand to the head of the registers use/def list.
- MachineOperand **Head = &RegInfo->getRegUseDefListHead(getReg());
-
- // For SSA values, we prefer to keep the definition at the start of the list.
- // we do this by skipping over the definition if it is at the head of the
- // list.
- if (*Head && (*Head)->isDef())
- Head = &(*Head)->Contents.Reg.Next;
-
- Contents.Reg.Next = *Head;
- if (Contents.Reg.Next) {
- assert(getReg() == Contents.Reg.Next->getReg() &&
- "Different regs on the same list!");
- Contents.Reg.Next->Contents.Reg.Prev = &Contents.Reg.Next;
- }
-
- Contents.Reg.Prev = Head;
- *Head = this;
-}
-
-/// RemoveRegOperandFromRegInfo - Remove this register operand from the
-/// MachineRegisterInfo it is linked with.
-void MachineOperand::RemoveRegOperandFromRegInfo() {
- assert(isOnRegUseList() && "Reg operand is not on a use list");
- // Unlink this from the doubly linked list of operands.
- MachineOperand *NextOp = Contents.Reg.Next;
- *Contents.Reg.Prev = NextOp;
- if (NextOp) {
- assert(NextOp->getReg() == getReg() && "Corrupt reg use/def chain!");
- NextOp->Contents.Reg.Prev = Contents.Reg.Prev;
- }
- Contents.Reg.Prev = 0;
- Contents.Reg.Next = 0;
-}
-
void MachineOperand::setReg(unsigned Reg) {
if (getReg() == Reg) return; // No change.
@@ -105,9 +56,10 @@ void MachineOperand::setReg(unsigned Reg) {
if (MachineInstr *MI = getParent())
if (MachineBasicBlock *MBB = MI->getParent())
if (MachineFunction *MF = MBB->getParent()) {
- RemoveRegOperandFromRegInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.removeRegOperandFromUseList(this);
SmallContents.RegNo = Reg;
- AddRegOperandToRegInfo(&MF->getRegInfo());
+ MRI.addRegOperandToUseList(this);
return;
}
@@ -136,15 +88,36 @@ void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
setReg(Reg);
}
+/// Change a def to a use, or a use to a def.
+void MachineOperand::setIsDef(bool Val) {
+ assert(isReg() && "Wrong MachineOperand accessor");
+ assert((!Val || !isDebug()) && "Marking a debug operation as def");
+ if (IsDef == Val)
+ return;
+ // MRI may keep uses and defs in different list positions.
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent()) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.removeRegOperandFromUseList(this);
+ IsDef = Val;
+ MRI.addRegOperandToUseList(this);
+ return;
+ }
+ IsDef = Val;
+}
+
/// ChangeToImmediate - Replace this operand with a new immediate operand of
/// the specified value. If an operand is known to be an immediate already,
/// the setImm method should be used.
void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
// If this operand is currently a register operand, and if this is in a
// function, deregister the operand from the register's use/def list.
- if (isReg() && getParent() && getParent()->getParent() &&
- getParent()->getParent()->getParent())
- RemoveRegOperandFromRegInfo();
+ if (isReg() && isOnRegUseList())
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent())
+ MF->getRegInfo().removeRegOperandFromUseList(this);
OpKind = MO_Immediate;
Contents.ImmVal = ImmVal;
@@ -156,24 +129,20 @@ void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
bool isKill, bool isDead, bool isUndef,
bool isDebug) {
- // If this operand is already a register operand, use setReg to update the
+ MachineRegisterInfo *RegInfo = 0;
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent())
+ RegInfo = &MF->getRegInfo();
+ // If this operand is already a register operand, remove it from the
// register's use/def lists.
- if (isReg()) {
- assert(!isEarlyClobber());
- setReg(Reg);
- } else {
- // Otherwise, change this to a register and set the reg#.
- OpKind = MO_Register;
- SmallContents.RegNo = Reg;
-
- // If this operand is embedded in a function, add the operand to the
- // register's use/def list.
- if (MachineInstr *MI = getParent())
- if (MachineBasicBlock *MBB = MI->getParent())
- if (MachineFunction *MF = MBB->getParent())
- AddRegOperandToRegInfo(&MF->getRegInfo());
- }
+ if (RegInfo && isReg())
+ RegInfo->removeRegOperandFromUseList(this);
+ // Change this to a register and set the reg#.
+ OpKind = MO_Register;
+ SmallContents.RegNo = Reg;
+ SubReg = 0;
IsDef = isDef;
IsImp = isImp;
IsKill = isKill;
@@ -182,11 +151,18 @@ void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
IsInternalRead = false;
IsEarlyClobber = false;
IsDebug = isDebug;
- SubReg = 0;
+ // Ensure isOnRegUseList() returns false.
+ Contents.Reg.Prev = 0;
+
+ // If this operand is embedded in a function, add the operand to the
+ // register's use/def list.
+ if (RegInfo)
+ RegInfo->addRegOperandToUseList(this);
}
/// isIdenticalTo - Return true if this operand is identical to the specified
-/// operand.
+/// operand. Note that this should stay in sync with the hash_value overload
+/// below.
bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
if (getType() != Other.getType() ||
getTargetFlags() != Other.getTargetFlags())
@@ -207,6 +183,7 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
case MachineOperand::MO_FrameIndex:
return getIndex() == Other.getIndex();
case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
return getIndex() == Other.getIndex() && getOffset() == Other.getOffset();
case MachineOperand::MO_JumpTableIndex:
return getIndex() == Other.getIndex();
@@ -227,6 +204,47 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
llvm_unreachable("Invalid machine operand type");
}
+// Note: this must stay exactly in sync with isIdenticalTo above.
+hash_code llvm::hash_value(const MachineOperand &MO) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getReg(),
+ MO.getSubReg(), MO.isDef());
+ case MachineOperand::MO_Immediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
+ case MachineOperand::MO_CImmediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm());
+ case MachineOperand::MO_FPImmediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm());
+ case MachineOperand::MO_MachineBasicBlock:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB());
+ case MachineOperand::MO_FrameIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(),
+ MO.getOffset());
+ case MachineOperand::MO_JumpTableIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+ case MachineOperand::MO_ExternalSymbol:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
+ MO.getSymbolName());
+ case MachineOperand::MO_GlobalAddress:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(),
+ MO.getOffset());
+ case MachineOperand::MO_BlockAddress:
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getBlockAddress());
+ case MachineOperand::MO_RegisterMask:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask());
+ case MachineOperand::MO_Metadata:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata());
+ case MachineOperand::MO_MCSymbol:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol());
+ }
+ llvm_unreachable("Invalid machine operand type");
+}
+
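
A minimal illustration of the invariant stated above, on a made-up three-field operand type: hash exactly the fields that equality compares, in the same order, so equal operands always hash identically.

    #include <cstddef>
    #include <functional>

    struct OperandModel {
      int Kind;
      unsigned Flags;
      long Value;
      bool operator==(const OperandModel &O) const {
        return Kind == O.Kind && Flags == O.Flags && Value == O.Value;
      }
    };

    std::size_t hashValue(const OperandModel &O) {
      std::size_t H = std::hash<int>()(O.Kind);
      H = H * 31 + std::hash<unsigned>()(O.Flags);  // same fields, same order
      H = H * 31 + std::hash<long>()(O.Value);
      return H;
    }
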
/// print - Print the specified machine operand.
///
void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
@@ -255,12 +273,16 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
OS << "imp-";
OS << "def";
NeedComma = true;
+ // <def,read-undef> only makes sense when getSubReg() is set.
+ // Don't clutter the output otherwise.
+ if (isUndef() && getSubReg())
+ OS << ",read-undef";
} else if (isImplicit()) {
OS << "imp-use";
NeedComma = true;
}
- if (isKill() || isDead() || isUndef() || isInternalRead()) {
+ if (isKill() || isDead() || (isUndef() && isUse()) || isInternalRead()) {
if (NeedComma) OS << ',';
NeedComma = false;
if (isKill()) {
@@ -271,7 +293,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
OS << "dead";
NeedComma = true;
}
- if (isUndef()) {
+ if (isUndef() && isUse()) {
if (NeedComma) OS << ',';
OS << "undef";
NeedComma = true;
@@ -308,6 +330,11 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
if (getOffset()) OS << "+" << getOffset();
OS << '>';
break;
+ case MachineOperand::MO_TargetIndex:
+ OS << "<ti#" << getIndex();
+ if (getOffset()) OS << "+" << getOffset();
+ OS << '>';
+ break;
case MachineOperand::MO_JumpTableIndex:
OS << "<jt#" << getIndex() << '>';
break;
@@ -605,24 +632,21 @@ MachineRegisterInfo *MachineInstr::getRegInfo() {
/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
/// this instruction from their respective use lists. This requires that the
/// operands already be on their use lists.
-void MachineInstr::RemoveRegOperandsFromUseLists() {
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) {
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i)
if (Operands[i].isReg())
- Operands[i].RemoveRegOperandFromRegInfo();
- }
+ MRI.removeRegOperandFromUseList(&Operands[i]);
}
/// AddRegOperandsToUseLists - Add all of the register operands in
/// this instruction from their respective use lists. This requires that the
/// operands not be on their use lists yet.
-void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &RegInfo) {
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) {
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i)
if (Operands[i].isReg())
- Operands[i].AddRegOperandToRegInfo(&RegInfo);
- }
+ MRI.addRegOperandToUseList(&Operands[i]);
}
-
/// addOperand - Add the specified operand to the instruction. If it is an
/// implicit operand, it is added to the end of the operand list. If it is
/// an explicit operand it is added at the end of the explicit operand list
@@ -650,13 +674,15 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) {
--OpNo;
if (RegInfo)
- Operands[OpNo].RemoveRegOperandFromRegInfo();
+ RegInfo->removeRegOperandFromUseList(&Operands[OpNo]);
}
}
// OpNo now points as the desired insertion point. Unless this is a variadic
// instruction, only implicit regs are allowed beyond MCID->getNumOperands().
- assert((isImpReg || MCID->isVariadic() || OpNo < MCID->getNumOperands()) &&
+ // RegMask operands go between the explicit and implicit operands.
+ assert((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
+ OpNo < MCID->getNumOperands()) &&
"Trying to add an operand to a machine instr that is already done!");
// All operands from OpNo have been removed from RegInfo. If the Operands
@@ -665,7 +691,7 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
if (Reallocate)
for (unsigned i = 0; i != OpNo; ++i)
if (Operands[i].isReg())
- Operands[i].RemoveRegOperandFromRegInfo();
+ RegInfo->removeRegOperandFromUseList(&Operands[i]);
// Insert the new operand at OpNo.
Operands.insert(Operands.begin() + OpNo, Op);
@@ -676,13 +702,15 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
if (Reallocate)
for (unsigned i = 0; i != OpNo; ++i)
if (Operands[i].isReg())
- Operands[i].AddRegOperandToRegInfo(RegInfo);
+ RegInfo->addRegOperandToUseList(&Operands[i]);
// When adding a register operand, tell RegInfo about it.
if (Operands[OpNo].isReg()) {
- // Add the new operand to RegInfo, even when RegInfo is NULL.
- // This will initialize the linked list pointers.
- Operands[OpNo].AddRegOperandToRegInfo(RegInfo);
+ // Ensure isOnRegUseList() returns false, regardless of Op's status.
+ Operands[OpNo].Contents.Reg.Prev = 0;
+ // Add the new operand to RegInfo.
+ if (RegInfo)
+ RegInfo->addRegOperandToUseList(&Operands[OpNo]);
// If the register operand is flagged as early, mark the operand as such.
if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
Operands[OpNo].setIsEarlyClobber(true);
@@ -692,7 +720,7 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
if (RegInfo) {
for (unsigned i = OpNo + 1, e = Operands.size(); i != e; ++i) {
assert(Operands[i].isReg() && "Should only be an implicit reg!");
- Operands[i].AddRegOperandToRegInfo(RegInfo);
+ RegInfo->addRegOperandToUseList(&Operands[i]);
}
}
}
@@ -702,12 +730,13 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
///
void MachineInstr::RemoveOperand(unsigned OpNo) {
assert(OpNo < Operands.size() && "Invalid operand number");
+ MachineRegisterInfo *RegInfo = getRegInfo();
// Special case removing the last one.
if (OpNo == Operands.size()-1) {
// If needed, remove from the reg def/use list.
- if (Operands.back().isReg() && Operands.back().isOnRegUseList())
- Operands.back().RemoveRegOperandFromRegInfo();
+ if (RegInfo && Operands.back().isReg() && Operands.back().isOnRegUseList())
+ RegInfo->removeRegOperandFromUseList(&Operands.back());
Operands.pop_back();
return;
@@ -716,11 +745,10 @@ void MachineInstr::RemoveOperand(unsigned OpNo) {
// Otherwise, we are removing an interior operand. If we have reginfo to
// update, remove all operands that will be shifted down from their reg lists,
// move everything down, then re-add them.
- MachineRegisterInfo *RegInfo = getRegInfo();
if (RegInfo) {
for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
if (Operands[i].isReg())
- Operands[i].RemoveRegOperandFromRegInfo();
+ RegInfo->removeRegOperandFromUseList(&Operands[i]);
}
}
@@ -729,7 +757,7 @@ void MachineInstr::RemoveOperand(unsigned OpNo) {
if (RegInfo) {
for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
if (Operands[i].isReg())
- Operands[i].AddRegOperandToRegInfo(RegInfo);
+ RegInfo->addRegOperandToUseList(&Operands[i]);
}
}
}
@@ -868,7 +896,8 @@ void MachineInstr::eraseFromParent() {
MBB->erase(MI);
}
}
- getParent()->erase(this);
+ // Erase the individual instruction, which may itself be inside a bundle.
+ getParent()->erase_instr(this);
}
@@ -938,9 +967,13 @@ const TargetRegisterClass*
MachineInstr::getRegClassConstraint(unsigned OpIdx,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) const {
+ assert(getParent() && "Can't have an MBB reference here!");
+ assert(getParent()->getParent() && "Can't have an MF reference here!");
+ const MachineFunction &MF = *getParent()->getParent();
+
// Most opcodes have fixed constraints in their MCInstrDesc.
if (!isInlineAsm())
- return TII->getRegClass(getDesc(), OpIdx, TRI);
+ return TII->getRegClass(getDesc(), OpIdx, TRI, MF);
if (!getOperand(OpIdx).isReg())
return NULL;
@@ -962,7 +995,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
// Assume that all registers in a memory operand are pointers.
if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem)
- return TRI->getPointerRegClass();
+ return TRI->getPointerRegClass(MF);
return NULL;
}
@@ -1530,12 +1563,14 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
const MachineRegisterInfo &MRI = MF->getRegInfo();
if (MRI.use_empty(Reg) && !MRI.isLiveOut(Reg)) {
bool HasAliasLive = false;
- for (const uint16_t *Alias = TM->getRegisterInfo()->getAliasSet(Reg);
- unsigned AliasReg = *Alias; ++Alias)
+ for (MCRegAliasIterator AI(Reg, TM->getRegisterInfo(), true);
+ AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
if (!MRI.use_empty(AliasReg) || MRI.isLiveOut(AliasReg)) {
HasAliasLive = true;
break;
}
+ }
if (!HasAliasLive) {
OmittedAnyCallClobbers = true;
continue;
@@ -1667,7 +1702,8 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
const TargetRegisterInfo *RegInfo,
bool AddIfNotFound) {
bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
- bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg);
+ bool hasAliases = isPhysReg &&
+ MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
bool Found = false;
SmallVector<unsigned,4> DeadOps;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
@@ -1739,7 +1775,8 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
const TargetRegisterInfo *RegInfo,
bool AddIfNotFound) {
bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
- bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg);
+ bool hasAliases = isPhysReg &&
+ MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
bool Found = false;
SmallVector<unsigned,4> DeadOps;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
@@ -1758,9 +1795,7 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
// There exists a super-register that's marked dead.
if (RegInfo->isSuperRegister(IncomingReg, Reg))
return true;
- if (RegInfo->getSubRegisters(IncomingReg) &&
- RegInfo->getSuperRegisters(Reg) &&
- RegInfo->isSubRegister(IncomingReg, Reg))
+ if (RegInfo->isSubRegister(IncomingReg, Reg))
DeadOps.push_back(i);
}
}
@@ -1841,52 +1876,16 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
unsigned
MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
// Build up a buffer of hash code components.
- //
- // FIXME: This is a total hack. We should have a hash_value overload for
- // MachineOperand, but currently that doesn't work because there are many
- // different ideas of "equality" and thus different sets of information that
- // contribute to the hash code. This one happens to want to take a specific
- // subset. And it's still not clear that this routine uses the *correct*
- // subset of information when computing the hash code. The goal is to use the
- // same inputs for the hash code here that MachineInstr::isIdenticalTo uses to
- // test for equality when passed the 'IgnoreVRegDefs' filter flag. It would
- // be very useful to factor the selection of relevant inputs out of the two
- // functions and into a common routine, but it's not clear how that can be
- // done.
SmallVector<size_t, 8> HashComponents;
HashComponents.reserve(MI->getNumOperands() + 1);
HashComponents.push_back(MI->getOpcode());
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
- switch (MO.getType()) {
- default: break;
- case MachineOperand::MO_Register:
- if (MO.isDef() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- continue; // Skip virtual register defs.
- HashComponents.push_back(hash_combine(MO.getType(), MO.getReg()));
- break;
- case MachineOperand::MO_Immediate:
- HashComponents.push_back(hash_combine(MO.getType(), MO.getImm()));
- break;
- case MachineOperand::MO_FrameIndex:
- case MachineOperand::MO_ConstantPoolIndex:
- case MachineOperand::MO_JumpTableIndex:
- HashComponents.push_back(hash_combine(MO.getType(), MO.getIndex()));
- break;
- case MachineOperand::MO_MachineBasicBlock:
- HashComponents.push_back(hash_combine(MO.getType(), MO.getMBB()));
- break;
- case MachineOperand::MO_GlobalAddress:
- HashComponents.push_back(hash_combine(MO.getType(), MO.getGlobal()));
- break;
- case MachineOperand::MO_BlockAddress:
- HashComponents.push_back(hash_combine(MO.getType(),
- MO.getBlockAddress()));
- break;
- case MachineOperand::MO_MCSymbol:
- HashComponents.push_back(hash_combine(MO.getType(), MO.getMCSymbol()));
- break;
- }
+ if (MO.isReg() && MO.isDef() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue; // Skip virtual register defs.
+
+ HashComponents.push_back(hash_value(MO));
}
return hash_combine_range(HashComponents.begin(), HashComponents.end());
}
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index 73489a7..b7de7bf 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -169,8 +169,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
}
if (!MO.isDead()) {
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
if (LocalDefSet.insert(SubReg))
LocalDefs.push_back(SubReg);
}
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 8c562cc..efec481 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -445,8 +445,8 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
}
if (MO.isImplicit()) {
- for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS)
- PhysRegClobbers.set(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ PhysRegClobbers.set(*AI);
if (!MO.isDead())
// Non-dead implicit def? This cannot be hoisted.
RuledOut = true;
@@ -465,7 +465,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
// If we have already seen another instruction that defines the same
// register, then this is not safe. Two defs is indicated by setting a
// PhysRegClobbers bit.
- for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) {
+ for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
if (PhysRegDefs.test(*AS))
PhysRegClobbers.set(*AS);
if (PhysRegClobbers.test(*AS))
@@ -517,8 +517,8 @@ void MachineLICM::HoistRegionPostRA() {
for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
E = BB->livein_end(); I != E; ++I) {
unsigned Reg = *I;
- for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS)
- PhysRegDefs.set(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ PhysRegDefs.set(*AI);
}
SpeculationState = SpeculateUnknown;
@@ -540,8 +540,8 @@ void MachineLICM::HoistRegionPostRA() {
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS)
- TermRegs.set(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ TermRegs.set(*AI);
}
}
@@ -1260,11 +1260,11 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
if (NewOpc == 0) return 0;
const MCInstrDesc &MID = TII->get(NewOpc);
if (MID.getNumDefs() != 1) return 0;
- const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI);
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
// Ok, we're unfolding. Create a temporary register and do the unfold.
unsigned Reg = MRI->createVirtualRegister(RC);
- MachineFunction &MF = *MI->getParent()->getParent();
SmallVector<MachineInstr *, 2> NewMIs;
bool Success =
TII->unfoldMemoryOperand(MF, MI, Reg,
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index 189cb2b..9f3829e 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -9,7 +9,7 @@
//
// This file defines the MachineLoopInfo class that is used to identify natural
// loops and determine the loop depth of various nodes of the CFG. Note that
-// the loops identified may actually be several natural loops that share the
+// the loops identified may actually be several natural loops that share the
// same header node... not just a single natural loop.
//
//===----------------------------------------------------------------------===//
@@ -17,17 +17,13 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/LoopInfoImpl.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
-namespace llvm {
-#define MLB class LoopBase<MachineBasicBlock, MachineLoop>
-TEMPLATE_INSTANTIATION(MLB);
-#undef MLB
-#define MLIB class LoopInfoBase<MachineBasicBlock, MachineLoop>
-TEMPLATE_INSTANTIATION(MLIB);
-#undef MLIB
-}
+// Explicitly instantiate methods in LoopInfoImpl.h for MI-level Loops.
+template class llvm::LoopBase<MachineBasicBlock, MachineLoop>;
+template class llvm::LoopInfoBase<MachineBasicBlock, MachineLoop>;
char MachineLoopInfo::ID = 0;
INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops",
@@ -40,7 +36,7 @@ char &llvm::MachineLoopInfoID = MachineLoopInfo::ID;
bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
releaseMemory();
- LI.Calculate(getAnalysis<MachineDominatorTree>().getBase()); // Update
+ LI.Analyze(getAnalysis<MachineDominatorTree>().getBase());
return false;
}
diff --git a/lib/CodeGen/MachinePassRegistry.cpp b/lib/CodeGen/MachinePassRegistry.cpp
index 58e067b..cb204fd 100644
--- a/lib/CodeGen/MachinePassRegistry.cpp
+++ b/lib/CodeGen/MachinePassRegistry.cpp
@@ -18,6 +18,19 @@ using namespace llvm;
void MachinePassRegistryListener::anchor() { }
+/// setDefault - Set the default constructor by name.
+void MachinePassRegistry::setDefault(StringRef Name) {
+ MachinePassCtor Ctor = 0;
+ for(MachinePassRegistryNode *R = getList(); R; R = R->getNext()) {
+ if (R->getName() == Name) {
+ Ctor = R->getCtor();
+ break;
+ }
+ }
+ assert(Ctor && "Unregistered pass name");
+ setDefault(Ctor);
+}
+
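
A simplified model of the name lookup added above (plain C structs instead of the registry classes): walk the singly-linked registration list and select the constructor registered under Name, asserting if none matches.

    #include <cassert>
    #include <cstring>

    typedef void *(*PassCtor)();

    struct RegistryNode {
      const char *Name;
      PassCtor Ctor;
      RegistryNode *Next;
    };

    PassCtor findCtor(RegistryNode *List, const char *Name) {
      for (RegistryNode *R = List; R; R = R->Next)
        if (std::strcmp(R->Name, Name) == 0)
          return R->Ctor;
      assert(false && "Unregistered pass name");
      return 0;
    }
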
/// Add - Adds a function pass to the registration list.
///
void MachinePassRegistry::Add(MachinePassRegistryNode *Node) {
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 7ea1517..5fb938f 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -102,17 +102,9 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
// New virtual register number.
unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
-
- // Add a reg, but keep track of whether the vector reallocated or not.
- const unsigned FirstVirtReg = TargetRegisterInfo::index2VirtReg(0);
- void *ArrayBase = getNumVirtRegs() == 0 ? 0 : &VRegInfo[FirstVirtReg];
VRegInfo.grow(Reg);
VRegInfo[Reg].first = RegClass;
RegAllocHints.grow(Reg);
-
- if (ArrayBase && &VRegInfo[FirstVirtReg] != ArrayBase)
- // The vector reallocated, handle this now.
- HandleVRegListReallocation();
return Reg;
}
@@ -126,21 +118,68 @@ void MachineRegisterInfo::clearVirtRegs() {
VRegInfo.clear();
}
-/// HandleVRegListReallocation - We just added a virtual register to the
-/// VRegInfo info list and it reallocated. Update the use/def lists info
-/// pointers.
-void MachineRegisterInfo::HandleVRegListReallocation() {
- // The back pointers for the vreg lists point into the previous vector.
- // Update them to point to their correct slots.
- for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
- MachineOperand *List = VRegInfo[Reg].second;
- if (!List) continue;
- // Update the back-pointer to be accurate once more.
- List->Contents.Reg.Prev = &VRegInfo[Reg].second;
+/// Add MO to the linked list of operands for its register.
+void MachineRegisterInfo::addRegOperandToUseList(MachineOperand *MO) {
+ assert(!MO->isOnRegUseList() && "Already on list");
+ MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+ MachineOperand *const Head = HeadRef;
+
+ // Head points to the first list element.
+ // Next is NULL on the last list element.
+ // Prev pointers are circular, so Head->Prev == Last.
+
+ // Head is NULL for an empty list.
+ if (!Head) {
+ MO->Contents.Reg.Prev = MO;
+ MO->Contents.Reg.Next = 0;
+ HeadRef = MO;
+ return;
+ }
+ assert(MO->getReg() == Head->getReg() && "Different regs on the same list!");
+
+ // Insert MO between Last and Head in the circular Prev chain.
+ MachineOperand *Last = Head->Contents.Reg.Prev;
+ assert(Last && "Inconsistent use list");
+ assert(MO->getReg() == Last->getReg() && "Different regs on the same list!");
+ Head->Contents.Reg.Prev = MO;
+ MO->Contents.Reg.Prev = Last;
+
+ // Def operands always precede uses. This allows def_iterator to stop early.
+ // Insert def operands at the front, and use operands at the back.
+ if (MO->isDef()) {
+ // Insert def at the front.
+ MO->Contents.Reg.Next = Head;
+ HeadRef = MO;
+ } else {
+ // Insert use at the end.
+ MO->Contents.Reg.Next = 0;
+ Last->Contents.Reg.Next = MO;
}
}
+/// Remove MO from its use-def list.
+void MachineRegisterInfo::removeRegOperandFromUseList(MachineOperand *MO) {
+ assert(MO->isOnRegUseList() && "Operand not on use list");
+ MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+ MachineOperand *const Head = HeadRef;
+ assert(Head && "List already empty");
+
+ // Unlink this from the doubly linked list of operands.
+ MachineOperand *Next = MO->Contents.Reg.Next;
+ MachineOperand *Prev = MO->Contents.Reg.Prev;
+
+ // Prev links are circular, next link is NULL instead of looping back to Head.
+ if (MO == Head)
+ HeadRef = Next;
+ else
+ Prev->Contents.Reg.Next = Next;
+
+ (Next ? Next : Head)->Contents.Reg.Prev = Prev;
+
+ MO->Contents.Reg.Prev = 0;
+ MO->Contents.Reg.Next = 0;
+}
+
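
A self-contained model of the list discipline implemented above: Next is a null-terminated forward chain, Prev is circular (Head->Prev is the last element), which gives O(1) append without a tail pointer; defs are inserted at the front and uses at the back so a def iterator can stop early.

    struct Op {
      Op *Prev, *Next;
      bool IsDef;
    };

    void addToList(Op *&Head, Op *MO) {
      if (!Head) {                  // empty list
        MO->Prev = MO;
        MO->Next = 0;
        Head = MO;
        return;
      }
      Op *Last = Head->Prev;        // circular Prev finds the tail in O(1)
      Head->Prev = MO;
      MO->Prev = Last;
      if (MO->IsDef) {              // defs go to the front
        MO->Next = Head;
        Head = MO;
      } else {                      // uses go to the back
        MO->Next = 0;
        Last->Next = MO;
      }
    }
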
/// replaceRegWith - Replace all instances of FromReg with ToReg in the
/// machine function. This is like llvm-level X->replaceAllUsesWith(Y),
/// except that it also changes any definitions of the register as well.
@@ -162,14 +201,20 @@ void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
// Since we are in SSA form, we can use the first definition.
def_iterator I = def_begin(Reg);
+ assert((I.atEnd() || llvm::next(I) == def_end()) &&
+ "getVRegDef assumes a single definition or no definition");
return !I.atEnd() ? &*I : 0;
}
-bool MachineRegisterInfo::hasOneUse(unsigned RegNo) const {
- use_iterator UI = use_begin(RegNo);
- if (UI == use_end())
- return false;
- return ++UI == use_end();
+/// getUniqueVRegDef - Return the unique machine instr that defines the
+/// specified virtual register or null if none is found. If there are
+/// multiple definitions or no definition, return null.
+MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const {
+ if (def_empty(Reg)) return 0;
+ def_iterator I = def_begin(Reg);
+ if (llvm::next(I) != def_end())
+ return 0;
+ return &*I;
}
bool MachineRegisterInfo::hasOneNonDBGUse(unsigned RegNo) const {
@@ -268,15 +313,15 @@ bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg,
assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
// Check if any overlapping register is modified.
- for (const uint16_t *R = TRI->getOverlaps(PhysReg); *R; ++R)
- if (!def_empty(*R))
+ for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI)
+ if (!def_empty(*AI))
return false;
// Check if any overlapping register is allocatable so it may be used later.
if (AllocatableRegs.empty())
AllocatableRegs = TRI->getAllocatableSet(MF);
- for (const uint16_t *R = TRI->getOverlaps(PhysReg); *R; ++R)
- if (AllocatableRegs.test(*R))
+ for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI)
+ if (AllocatableRegs.test(*AI))
return false;
return true;
}
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index 070a557..076547a 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -42,7 +42,7 @@ MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
}
MachineSSAUpdater::~MachineSSAUpdater() {
- delete &getAvailableVals(AV);
+ delete static_cast<AvailableValsTy*>(AV);
}
/// Initialize - Reset this object to get ready for a new set of SSA
@@ -241,30 +241,6 @@ void MachineSSAUpdater::ReplaceRegWith(unsigned OldReg, unsigned NewReg) {
I->second = NewReg;
}
-/// MachinePHIiter - Iterator for PHI operands. This is used for the
-/// PHI_iterator in the SSAUpdaterImpl template.
-namespace {
- class MachinePHIiter {
- private:
- MachineInstr *PHI;
- unsigned idx;
-
- public:
- explicit MachinePHIiter(MachineInstr *P) // begin iterator
- : PHI(P), idx(1) {}
- MachinePHIiter(MachineInstr *P, bool) // end iterator
- : PHI(P), idx(PHI->getNumOperands()) {}
-
- MachinePHIiter &operator++() { idx += 2; return *this; }
- bool operator==(const MachinePHIiter& x) const { return idx == x.idx; }
- bool operator!=(const MachinePHIiter& x) const { return !operator==(x); }
- unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); }
- MachineBasicBlock *getIncomingBlock() {
- return PHI->getOperand(idx+1).getMBB();
- }
- };
-}
-
/// SSAUpdaterTraits<MachineSSAUpdater> - Traits for the SSAUpdaterImpl
/// template, specialized for MachineSSAUpdater.
namespace llvm {
@@ -279,7 +255,26 @@ public:
static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); }
static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); }
- typedef MachinePHIiter PHI_iterator;
+ /// Iterator for PHI operands.
+ class PHI_iterator {
+ private:
+ MachineInstr *PHI;
+ unsigned idx;
+
+ public:
+ explicit PHI_iterator(MachineInstr *P) // begin iterator
+ : PHI(P), idx(1) {}
+ PHI_iterator(MachineInstr *P, bool) // end iterator
+ : PHI(P), idx(PHI->getNumOperands()) {}
+
+ PHI_iterator &operator++() { idx += 2; return *this; }
+ bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+ bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+ unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); }
+ MachineBasicBlock *getIncomingBlock() {
+ return PHI->getOperand(idx+1).getMBB();
+ }
+ };
static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
static inline PHI_iterator PHI_end(PhiT *PHI) {
return PHI_iterator(PHI, true);
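The iterator relies on the machine-level PHI operand layout: operand 0 is the def, and incoming (value, block) pairs start at operand 1. A short sketch, assuming PHI points at an instruction for which isPHI() holds:

    //   %dst = PHI %v1, <bb1>, %v2, <bb2>, ...
    for (PHI_iterator I(PHI), E(PHI, true); I != E; ++I) {
      unsigned Reg = I.getIncomingValue();            // operand idx
      MachineBasicBlock *Pred = I.getIncomingBlock(); // operand idx+1
      (void)Reg; (void)Pred;
    }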
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 1d3241b..a1dc948 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -17,9 +17,13 @@
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -50,6 +54,15 @@ static bool ViewMISchedDAGs = false;
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
+MachineSchedContext::MachineSchedContext():
+ MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) {
+ RegClassInfo = new RegisterClassInfo();
+}
+
+MachineSchedContext::~MachineSchedContext() {
+ delete RegClassInfo;
+}
+
namespace {
/// MachineScheduler runs after coalescing and before register allocation.
class MachineScheduler : public MachineSchedContext,
@@ -122,6 +135,29 @@ DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
/// default scheduler if the target does not set a default.
static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
+
+/// Decrement this iterator until reaching the top or a non-debug instr.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+ assert(I != Beg && "reached the top of the region, cannot decrement");
+ while (--I != Beg) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+ for(; I != End; ++I) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
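These helpers let the scheduler treat DBG_VALUE instructions as transparent at region boundaries. A usage sketch with assumed bounds Begin and End of a non-empty region (priorNonDebug asserts it is not handed an empty range):

    // First real instruction at or after Begin, and last one before End.
    MachineBasicBlock::iterator FirstMI = nextIfDebug(Begin, End);
    MachineBasicBlock::iterator LastMI  = priorNonDebug(End, Begin);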
+
/// Top-level MachineScheduler pass driver.
///
/// Visit blocks in function order. Divide each block into scheduling regions
@@ -139,6 +175,8 @@ static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
/// design would be to split blocks at scheduling boundaries, but LLVM has a
/// general bias against block splitting purely for implementation simplicity.
bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+ DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs()));
+
// Initialize the context of the pass.
MF = &mf;
MLI = &getAnalysis<MachineLoopInfo>();
@@ -149,6 +187,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
LIS = &getAnalysis<LiveIntervals>();
const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+ RegClassInfo->runOnMachineFunction(*MF);
+
// Select the scheduler, or set the default.
MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
if (Ctor == useDefaultMachineSched) {
@@ -163,13 +203,16 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
// Visit all machine basic blocks.
+ //
+ // TODO: Visit blocks in global postorder or postorder within the bottom-up
+ // loop tree. Then we can optionally compute global RegPressure.
for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
MBB != MBBEnd; ++MBB) {
Scheduler->startBlock(MBB);
// Break the block into scheduling regions [I, RegionEnd), and schedule each
- // region as soon as it is discovered. RegionEnd points the the scheduling
+ // region as soon as it is discovered. RegionEnd points to the scheduling
// boundary at the bottom of the region. The DAG does not include RegionEnd,
// but the region does (i.e. the next RegionEnd is above the previous
// RegionBegin). If the current block has no terminator then RegionEnd ==
@@ -181,6 +224,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
unsigned RemainingCount = MBB->size();
for(MachineBasicBlock::iterator RegionEnd = MBB->end();
RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
+
// Avoid decrementing RegionEnd for blocks with no terminator.
if (RegionEnd != MBB->end()
|| TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
@@ -207,7 +251,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
Scheduler->exitRegion();
continue;
}
- DEBUG(dbgs() << "MachineScheduling " << MF->getFunction()->getName()
+ DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ DEBUG(dbgs() << MF->getFunction()->getName()
<< ":BB#" << MBB->getNumber() << "\n From: " << *I << " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
@@ -260,6 +305,9 @@ public:
/// be scheduled at the bottom.
virtual SUnit *pickNode(bool &IsTopNode) = 0;
+ /// Notify MachineSchedStrategy that ScheduleDAGMI has scheduled a node.
+ virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
+
/// When all predecessor dependencies have been resolved, free this node for
/// top-down scheduling.
virtual void releaseTopNode(SUnit *SU) = 0;
@@ -279,22 +327,45 @@ namespace {
/// machine instructions while updating LiveIntervals.
class ScheduleDAGMI : public ScheduleDAGInstrs {
AliasAnalysis *AA;
+ RegisterClassInfo *RegClassInfo;
MachineSchedStrategy *SchedImpl;
+ MachineBasicBlock::iterator LiveRegionEnd;
+
+ /// Register pressure in this region computed by buildSchedGraph.
+ IntervalPressure RegPressure;
+ RegPressureTracker RPTracker;
+
+ /// List of pressure sets that exceed the target's pressure limit before
+ /// scheduling, listed in increasing set ID order. Each pressure set is paired
+ /// with its max pressure in the currently scheduled regions.
+ std::vector<PressureElement> RegionCriticalPSets;
+
/// The top of the unscheduled zone.
MachineBasicBlock::iterator CurrentTop;
+ IntervalPressure TopPressure;
+ RegPressureTracker TopRPTracker;
/// The bottom of the unscheduled zone.
MachineBasicBlock::iterator CurrentBottom;
+ IntervalPressure BotPressure;
+ RegPressureTracker BotRPTracker;
+#ifndef NDEBUG
/// The number of instructions scheduled so far. Used to cut off the
/// scheduler at the point determined by misched-cutoff.
unsigned NumInstrsScheduled;
+#endif
public:
ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
- AA(C->AA), SchedImpl(S), CurrentTop(), CurrentBottom(),
- NumInstrsScheduled(0) {}
+ AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
+ RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
+ CurrentBottom(), BotRPTracker(BotPressure) {
+#ifndef NDEBUG
+ NumInstrsScheduled = 0;
+#endif
+ }
~ScheduleDAGMI() {
delete SchedImpl;
@@ -303,22 +374,68 @@ public:
MachineBasicBlock::iterator top() const { return CurrentTop; }
MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
- /// Implement ScheduleDAGInstrs interface.
+ /// Implement the ScheduleDAGInstrs interface for handling the next scheduling
+ /// region. This covers all instructions in a block, while schedule() may only
+ /// cover a subset.
+ void enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned endcount);
+
+ /// Implement ScheduleDAGInstrs interface for scheduling a sequence of
+ /// reorderable instructions.
void schedule();
+ /// Get current register pressure for the top scheduled instructions.
+ const IntervalPressure &getTopPressure() const { return TopPressure; }
+ const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; }
+
+ /// Get current register pressure for the bottom scheduled instructions.
+ const IntervalPressure &getBotPressure() const { return BotPressure; }
+ const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; }
+
+ /// Get register pressure for the entire scheduling region before scheduling.
+ const IntervalPressure &getRegPressure() const { return RegPressure; }
+
+ const std::vector<PressureElement> &getRegionCriticalPSets() const {
+ return RegionCriticalPSets;
+ }
+
+ /// getIssueWidth - Return the max instructions per scheduling group.
+ unsigned getIssueWidth() const {
+ return (InstrItins && InstrItins->SchedModel)
+ ? InstrItins->SchedModel->IssueWidth : 1;
+ }
+
+ /// getNumMicroOps - Return the number of issue slots required for this MI.
+ unsigned getNumMicroOps(MachineInstr *MI) const {
+ if (!InstrItins) return 1;
+ int UOps = InstrItins->getNumMicroOps(MI->getDesc().getSchedClass());
+ return (UOps >= 0) ? UOps : TII->getNumMicroOps(InstrItins, MI);
+ }
+
protected:
+ void initRegPressure();
+ void updateScheduledPressure(std::vector<unsigned> NewMaxPressure);
+
void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);
bool checkSchedLimit();
+ void releaseRoots();
+
void releaseSucc(SUnit *SU, SDep *SuccEdge);
void releaseSuccessors(SUnit *SU);
void releasePred(SUnit *SU, SDep *PredEdge);
void releasePredecessors(SUnit *SU);
+
+ void placeDebugValues();
};
} // namespace
/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
/// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
SUnit *SuccSU = SuccEdge->getSUnit();
@@ -345,6 +462,8 @@ void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
/// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
SUnit *PredSU = PredEdge->getSUnit();
@@ -371,12 +490,17 @@ void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
MachineBasicBlock::iterator InsertPos) {
- // Fix RegionBegin if the first instruction moves down.
+ // Advance RegionBegin if the first instruction moves down.
if (&*RegionBegin == MI)
- RegionBegin = llvm::next(RegionBegin);
+ ++RegionBegin;
+
+ // Update the instruction stream.
BB->splice(InsertPos, BB, MI);
+
+ // Update LiveIntervals
LIS->handleMove(MI);
- // Fix RegionBegin if another instruction moves above the first instruction.
+
+ // Recede RegionBegin if an instruction moves above the first.
if (RegionBegin == InsertPos)
RegionBegin = MI;
}
@@ -392,12 +516,114 @@ bool ScheduleDAGMI::checkSchedLimit() {
return true;
}
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned endcount)
+{
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+
+ // For convenience remember the end of the liveness region.
+ LiveRegionEnd =
+ (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+}
+
+// Set up the register pressure trackers for the top and bottom scheduled
+// regions.
+void ScheduleDAGMI::initRegPressure() {
+ TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+ BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+ // Close the RPTracker to finalize live ins.
+ RPTracker.closeRegion();
+
+ DEBUG(RPTracker.getPressure().dump(TRI));
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Close one end of the tracker so we can call
+ // getMaxUpward/DownwardPressureDelta before advancing across any
+ // instructions. This converts currently live regs into live ins/outs.
+ TopRPTracker.closeTop();
+ BotRPTracker.closeBottom();
+
+ // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ BotRPTracker.recede();
+
+ assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
+
+ // Cache the list of excess pressure sets in this region. This will also track
+ // the max pressure in the scheduled code for these sets.
+ RegionCriticalPSets.clear();
+ std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure;
+ for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+ unsigned Limit = TRI->getRegPressureSetLimit(i);
+ if (RegionPressure[i] > Limit)
+ RegionCriticalPSets.push_back(PressureElement(i, 0));
+ }
+ DEBUG(dbgs() << "Excess PSets: ";
+ for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+ dbgs() << TRI->getRegPressureSetName(
+ RegionCriticalPSets[i].PSetID) << " ";
+ dbgs() << "\n");
+}
+
+// FIXME: When the pressure tracker deals in pressure differences then we won't
+// iterate over all RegionCriticalPSets[i].
+void ScheduleDAGMI::
+updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
+ for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
+ unsigned ID = RegionCriticalPSets[i].PSetID;
+ int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
+ if ((int)NewMaxPressure[ID] > MaxUnits)
+ MaxUnits = NewMaxPressure[ID];
+ }
+}
+
+// Release all DAG roots for scheduling.
+void ScheduleDAGMI::releaseRoots() {
+ SmallVector<SUnit*, 16> BotRoots;
+
+ for (std::vector<SUnit>::iterator
+ I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+ // A SUnit is ready to top schedule if it has no predecessors.
+ if (I->Preds.empty())
+ SchedImpl->releaseTopNode(&(*I));
+ // A SUnit is ready to bottom schedule if it has no successors.
+ if (I->Succs.empty())
+ BotRoots.push_back(&(*I));
+ }
+ // Release bottom roots in reverse order so the higher priority nodes appear
+ // first. This is more natural and slightly more efficient.
+ for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+ I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I)
+ SchedImpl->releaseBottomNode(*I);
+}
+
/// schedule - Called back from MachineScheduler::runOnMachineFunction
-/// after setting up the current scheduling region.
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
void ScheduleDAGMI::schedule() {
- buildSchedGraph(AA);
+ // Initialize the register pressure tracker used by buildSchedGraph.
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+ // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ RPTracker.recede();
+
+ // Build the DAG, and compute current register pressure.
+ buildSchedGraph(AA, &RPTracker);
+
+ // Initialize top/bottom trackers after computing region pressure.
+ initRegPressure();
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
SUnits[su].dumpAll(this));
@@ -410,22 +636,12 @@ void ScheduleDAGMI::schedule() {
releasePredecessors(&ExitSU);
// Release all DAG roots for scheduling.
- for (std::vector<SUnit>::iterator I = SUnits.begin(), E = SUnits.end();
- I != E; ++I) {
- // A SUnit is ready to top schedule if it has no predecessors.
- if (I->Preds.empty())
- SchedImpl->releaseTopNode(&(*I));
- // A SUnit is ready to bottom schedule if it has no successors.
- if (I->Succs.empty())
- SchedImpl->releaseBottomNode(&(*I));
- }
+ releaseRoots();
- CurrentTop = RegionBegin;
+ CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
CurrentBottom = RegionEnd;
bool IsTopNode = false;
while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
- DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
- << " Scheduling Instruction:\n"; SU->dump(this));
if (!checkSchedLimit())
break;
@@ -435,28 +651,69 @@ void ScheduleDAGMI::schedule() {
if (IsTopNode) {
assert(SU->isTopReady() && "node still has unscheduled dependencies");
if (&*CurrentTop == MI)
- ++CurrentTop;
- else
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else {
moveInstruction(MI, CurrentTop);
+ TopRPTracker.setPos(MI);
+ }
+
+ // Update top scheduled pressure.
+ TopRPTracker.advance();
+ assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+ updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+
// Release dependent instructions for scheduling.
releaseSuccessors(SU);
}
else {
assert(SU->isBottomReady() && "node still has unscheduled dependencies");
- if (&*llvm::prior(CurrentBottom) == MI)
- --CurrentBottom;
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
else {
- if (&*CurrentTop == MI)
- CurrentTop = llvm::next(CurrentTop);
+ if (&*CurrentTop == MI) {
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ TopRPTracker.setPos(CurrentTop);
+ }
moveInstruction(MI, CurrentBottom);
CurrentBottom = MI;
}
+ // Update bottom scheduled pressure.
+ BotRPTracker.recede();
+ assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+ updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+
// Release dependent instructions for scheduling.
releasePredecessors(SU);
}
SU->isScheduled = true;
+ SchedImpl->schedNode(SU, IsTopNode);
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void ScheduleDAGMI::placeDebugValues() {
+ // If first instruction was a DBG_VALUE then put it back.
+ if (FirstDbgValue) {
+ BB->splice(RegionBegin, BB, FirstDbgValue);
+ RegionBegin = FirstDbgValue;
+ }
+
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+ DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+ std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+ MachineInstr *DbgValue = P.first;
+ MachineBasicBlock::iterator OrigPrevMI = P.second;
+ BB->splice(++OrigPrevMI, BB, DbgValue);
+ if (OrigPrevMI == llvm::prior(RegionEnd))
+ RegionEnd = DbgValue;
+ }
+ DbgValues.clear();
+ FirstDbgValue = NULL;
}
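A sketch of the bookkeeping this replay depends on (DbgValues and FirstDbgValue are filled in during DAG construction, outside this hunk):

    // Each DbgValues entry pairs a DBG_VALUE with the instruction that
    // originally preceded it, so reinsertion is position-independent:
    //   before scheduling:  A; DBG_VALUE v; B  -> record (DBG_VALUE v, A)
    //   after scheduling:   wherever A landed  -> splice DBG_VALUE v after A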
//===----------------------------------------------------------------------===//
@@ -464,56 +721,603 @@ void ScheduleDAGMI::schedule() {
//===----------------------------------------------------------------------===//
namespace {
+/// ReadyQueue encapsulates a vector of "ready" SUnits with basic convenience
+/// methods for pushing and removing nodes. ReadyQueues are uniquely identified
+/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
+class ReadyQueue {
+ unsigned ID;
+ std::string Name;
+ std::vector<SUnit*> Queue;
+
+public:
+ ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
+
+ unsigned getID() const { return ID; }
+
+ StringRef getName() const { return Name; }
+
+ // SU is in this queue if its NodeQueueId is a superset of this ID.
+ bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
+
+ bool empty() const { return Queue.empty(); }
+
+ unsigned size() const { return Queue.size(); }
+
+ typedef std::vector<SUnit*>::iterator iterator;
+
+ iterator begin() { return Queue.begin(); }
+
+ iterator end() { return Queue.end(); }
+
+ iterator find(SUnit *SU) {
+ return std::find(Queue.begin(), Queue.end(), SU);
+ }
+
+ void push(SUnit *SU) {
+ Queue.push_back(SU);
+ SU->NodeQueueId |= ID;
+ }
+
+ void remove(iterator I) {
+ (*I)->NodeQueueId &= ~ID;
+ *I = Queue.back();
+ Queue.pop_back();
+ }
+
+ void dump() {
+ dbgs() << Name << ": ";
+ for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+ dbgs() << Queue[i]->NodeNum << " ";
+ dbgs() << "\n";
+ }
+};
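Because queue IDs are disjoint bit masks, one SUnit can sit in several queues at once. A minimal sketch of the invariant, with SU an assumed SUnit* and IDs matching the TopQID/BotQID values defined further down:

    ReadyQueue TopQ(/*id=*/1, "TopQ"), BotQ(/*id=*/2, "BotQ");
    TopQ.push(SU);               // SU->NodeQueueId == 1
    BotQ.push(SU);               // SU->NodeQueueId == 3 (1 | 2)
    assert(TopQ.isInQueue(SU) && BotQ.isInQueue(SU));
    TopQ.remove(TopQ.find(SU));  // clears only bit 1; SU remains in BotQ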
+
/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
class ConvergingScheduler : public MachineSchedStrategy {
+
+ /// Store the state used by ConvergingScheduler heuristics, required for the
+ /// lifetime of one invocation of pickNode().
+ struct SchedCandidate {
+ // The best SUnit candidate.
+ SUnit *SU;
+
+ // Register pressure values for the best candidate.
+ RegPressureDelta RPDelta;
+
+ SchedCandidate(): SU(NULL) {}
+ };
+ /// Represent the type of SchedCandidate found within a single queue.
+ enum CandResult {
+ NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure };
+
+ /// Each scheduling boundary is associated with ready queues. It tracks the
+ /// current cycle in whichever direction it has moved, and maintains the state
+ /// of "hazards" and other interlocks at the current cycle.
+ struct SchedBoundary {
+ ScheduleDAGMI *DAG;
+
+ ReadyQueue Available;
+ ReadyQueue Pending;
+ bool CheckPending;
+
+ ScheduleHazardRecognizer *HazardRec;
+
+ unsigned CurrCycle;
+ unsigned IssueCount;
+
+ /// MinReadyCycle - Cycle of the soonest available instruction.
+ unsigned MinReadyCycle;
+
+ // Remember the greatest min operand latency.
+ unsigned MaxMinLatency;
+
+ /// Pending queues extend the ready queues with the same ID and the
+ /// PendingFlag set.
+ SchedBoundary(unsigned ID, const Twine &Name):
+ DAG(0), Available(ID, Name+".A"),
+ Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+ CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0),
+ MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+ ~SchedBoundary() { delete HazardRec; }
+
+ bool isTop() const {
+ return Available.getID() == ConvergingScheduler::TopQID;
+ }
+
+ bool checkHazard(SUnit *SU);
+
+ void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+ void bumpCycle();
+
+ void bumpNode(SUnit *SU);
+
+ void releasePending();
+
+ void removeReady(SUnit *SU);
+
+ SUnit *pickOnlyChoice();
+ };
+
ScheduleDAGMI *DAG;
+ const TargetRegisterInfo *TRI;
- unsigned NumTopReady;
- unsigned NumBottomReady;
+ // State of the top and bottom scheduled instruction boundaries.
+ SchedBoundary Top;
+ SchedBoundary Bot;
public:
- virtual void initialize(ScheduleDAGMI *dag) {
- DAG = dag;
+ /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+ enum {
+ TopQID = 1,
+ BotQID = 2,
+ LogMaxQID = 2
+ };
+
+ ConvergingScheduler():
+ DAG(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+ virtual void initialize(ScheduleDAGMI *dag);
+
+ virtual SUnit *pickNode(bool &IsTopNode);
+
+ virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+ virtual void releaseTopNode(SUnit *SU);
+
+ virtual void releaseBottomNode(SUnit *SU);
+
+protected:
+ SUnit *pickNodeBidrectional(bool &IsTopNode);
- assert((!ForceTopDown || !ForceBottomUp) &&
- "-misched-topdown incompatible with -misched-bottomup");
+ CandResult pickNodeFromQueue(ReadyQueue &Q,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
+#ifndef NDEBUG
+ void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+ PressureElement P = PressureElement());
+#endif
+};
+} // namespace
+
+void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
+ DAG = dag;
+ TRI = DAG->TRI;
+ Top.DAG = dag;
+ Bot.DAG = dag;
+
+ // Initialize the HazardRecognizers.
+ const TargetMachine &TM = DAG->MF.getTarget();
+ const InstrItineraryData *Itin = TM.getInstrItineraryData();
+ Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+ assert((!ForceTopDown || !ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingScheduler::releaseTopNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+ unsigned Latency =
+ DAG->computeOperandLatency(I->getSUnit(), SU, *I, /*FindMin=*/true);
+#ifndef NDEBUG
+ Top.MaxMinLatency = std::max(Latency, Top.MaxMinLatency);
+#endif
+ if (SU->TopReadyCycle < PredReadyCycle + Latency)
+ SU->TopReadyCycle = PredReadyCycle + Latency;
}
+ Top.releaseNode(SU, SU->TopReadyCycle);
+}
- virtual SUnit *pickNode(bool &IsTopNode) {
- if (DAG->top() == DAG->bottom())
- return NULL;
+void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
- // As an initial placeholder heuristic, schedule in the direction that has
- // the fewest choices.
- SUnit *SU;
- if (ForceTopDown || (!ForceBottomUp && NumTopReady <= NumBottomReady)) {
- SU = DAG->getSUnit(DAG->top());
- IsTopNode = true;
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+ unsigned Latency =
+ DAG->computeOperandLatency(SU, I->getSUnit(), *I, /*FindMin=*/true);
+#ifndef NDEBUG
+ Bot.MaxMinLatency = std::max(Latency, Bot.MaxMinLatency);
+#endif
+ if (SU->BotReadyCycle < SuccReadyCycle + Latency)
+ SU->BotReadyCycle = SuccReadyCycle + Latency;
+ }
+ Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled())
+ return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+ if (IssueCount + DAG->getNumMicroOps(SU->getInstr()) > DAG->getIssueWidth())
+ return true;
+
+ return false;
+}
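A worked example of the counter-based mode, with assumed numbers:

    // IssueWidth = 2 (DAG->getIssueWidth()), IssueCount = 1 already issued,
    // and SU's instruction needs 2 micro-ops:
    //   checkHazard: 1 + 2 > 2 -> true, so SU waits in Pending.
    //   bumpCycle:   IssueCount = (1 <= 2) ? 0 : 1 - 2 -> 0, and CurrCycle
    //                advances by one (assuming MinReadyCycle <= CurrCycle+1).
    // On the next cycle, checkHazard sees 0 + 2 > 2 == false and SU issues.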
+
+void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
+ unsigned ReadyCycle) {
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+ if (ReadyCycle > CurrCycle || checkHazard(SU))
+ Pending.push(SU);
+ else
+ Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingScheduler::SchedBoundary::bumpCycle() {
+ unsigned Width = DAG->getIssueWidth();
+ IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+ assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ }
+ else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) {
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
}
- else {
- SU = DAG->getSUnit(llvm::prior(DAG->bottom()));
- IsTopNode = false;
+ }
+ CheckPending = true;
+
+ DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
}
- if (SU->isTopReady()) {
- assert(NumTopReady > 0 && "bad ready count");
- --NumTopReady;
+ HazardRec->EmitInstruction(SU);
+ }
+ // Check the instruction group dispatch limit.
+ // TODO: Check if this SU must end a dispatch group.
+ IssueCount += DAG->getNumMicroOps(SU->getInstr());
+ if (IssueCount >= DAG->getIssueWidth()) {
+ DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ bumpCycle();
+ }
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void ConvergingScheduler::SchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = UINT_MAX;
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+ SUnit *SU = *(Pending.begin()+i);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ if (ReadyCycle > CurrCycle)
+ continue;
+
+ if (checkHazard(SU))
+ continue;
+
+ Available.push(SU);
+ Pending.remove(Pending.begin()+i);
+ --i; --e;
+ }
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
+ if (Available.isInQueue(SU))
+ Available.remove(Available.find(SU));
+ else {
+ assert(Pending.isInQueue(SU) && "bad ready count");
+ Pending.remove(Pending.find(SU));
+ }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
+ if (CheckPending)
+ releasePending();
+
+ for (unsigned i = 0; Available.empty(); ++i) {
+ assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+ "permanent hazard"); (void)i;
+ bumpCycle();
+ releasePending();
+ }
+ if (Available.size() == 1)
+ return *Available.begin();
+ return NULL;
+}
+
+#ifndef NDEBUG
+void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q,
+ SUnit *SU, PressureElement P) {
+ dbgs() << Label << " " << Q.getName() << " ";
+ if (P.isValid())
+ dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+ << " ";
+ else
+ dbgs() << " ";
+ SU->dump(DAG);
+}
+#endif
+
+/// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
+/// more desirable than RHS from scheduling standpoint.
+static bool compareRPDelta(const RegPressureDelta &LHS,
+ const RegPressureDelta &RHS) {
+ // Compare each component of pressure in decreasing order of importance
+ // without checking if any are valid. Invalid PressureElements are assumed to
+ // have UnitIncrease==0, so are neutral.
+
+ // Avoid exceeding the target's limit.
+ if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease)
+ return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease)
+ return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
+
+ // Avoid increasing the max pressure of the entire region.
+ if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease)
+ return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
+
+ return false;
+}
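A worked example of the lexicographic order, with assumed unit increases:

    //              Excess   CriticalMax   CurrentMax
    //   LHS           0         +1           +2
    //   RHS           0         +2            0
    // Excess ties (0 == 0) and falls through; CriticalMax decides:
    // +1 != +2, so compareRPDelta returns (+1 < +2) == true and LHS is
    // preferred. CurrentMax is never consulted once an earlier tier differs.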
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingScheduler::CandResult ConvergingScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate) {
+ DEBUG(Q.dump());
+
+ // getMaxPressureDelta temporarily modifies the tracker.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+ // Candidate.SU remains NULL if no candidate beats the existing best.
+ CandResult FoundCandidate = NoCand;
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ RegPressureDelta RPDelta;
+ TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+
+ // Initialize the candidate if needed.
+ if (!Candidate.SU) {
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ FoundCandidate = NodeOrder;
+ continue;
+ }
+ // Avoid exceeding the target's limit.
+ if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) {
+ DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ FoundCandidate = SingleExcess;
+ continue;
+ }
+ if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease)
+ continue;
+ if (FoundCandidate == SingleExcess)
+ FoundCandidate = MultiPressure;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (RPDelta.CriticalMax.UnitIncrease
+ < Candidate.RPDelta.CriticalMax.UnitIncrease) {
+ DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ FoundCandidate = SingleCritical;
+ continue;
+ }
+ if (RPDelta.CriticalMax.UnitIncrease
+ > Candidate.RPDelta.CriticalMax.UnitIncrease)
+ continue;
+ if (FoundCandidate == SingleCritical)
+ FoundCandidate = MultiPressure;
+
+ // Avoid increasing the max pressure of the entire region.
+ if (RPDelta.CurrentMax.UnitIncrease
+ < Candidate.RPDelta.CurrentMax.UnitIncrease) {
+ DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ FoundCandidate = SingleMax;
+ continue;
}
- if (SU->isBottomReady()) {
- assert(NumBottomReady > 0 && "bad ready count");
- --NumBottomReady;
+ if (RPDelta.CurrentMax.UnitIncrease
+ > Candidate.RPDelta.CurrentMax.UnitIncrease)
+ continue;
+ if (FoundCandidate == SingleMax)
+ FoundCandidate = MultiPressure;
+
+ // Fall through to original instruction order.
+ // Only consider node order if Candidate was chosen from this Q.
+ if (FoundCandidate == NoCand)
+ continue;
+
+ if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
+ || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
+ DEBUG(traceCandidate("NCAND", Q, *I));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ FoundCandidate = NodeOrder;
}
+ }
+ return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ IsTopNode = false;
return SU;
}
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ IsTopNode = true;
+ return SU;
+ }
+ SchedCandidate BotCand;
+ // Prefer bottom scheduling when heuristics are silent.
+ CandResult BotResult = pickNodeFromQueue(Bot.Available,
+ DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+
+ // If either Q has a single candidate that provides the least increase in
+ // Excess pressure, we can immediately schedule from that Q.
+ //
+ // RegionCriticalPSets summarizes the pressure within the scheduled region and
+ // affects picking from either Q. If scheduling in one direction must
+ // increase pressure for one of the excess PSets, then schedule in that
+ // direction first to provide more freedom in the other direction.
+ if (BotResult == SingleExcess || BotResult == SingleCritical) {
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ // Check if the top Q has a better candidate.
+ SchedCandidate TopCand;
+ CandResult TopResult = pickNodeFromQueue(Top.Available,
+ DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+
+ if (TopResult == SingleExcess || TopResult == SingleCritical) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // If either Q has a single candidate that minimizes pressure above the
+ // original region's pressure pick it.
+ if (BotResult == SingleMax) {
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ if (TopResult == SingleMax) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // Check for a salient pressure difference and pick the best from either side.
+ if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // Otherwise prefer the bottom candidate in node order.
+ IsTopNode = false;
+ return BotCand.SU;
+}
- virtual void releaseTopNode(SUnit *SU) {
- ++NumTopReady;
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return NULL;
}
- virtual void releaseBottomNode(SUnit *SU) {
- ++NumBottomReady;
+ SUnit *SU;
+ if (ForceTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate TopCand;
+ CandResult TopResult =
+ pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+ (void)TopResult;
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
}
-};
-} // namespace
+ else if (ForceBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate BotCand;
+ CandResult BotResult =
+ pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+ (void)BotResult;
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ }
+ else {
+ SU = pickNodeBidrectional(IsTopNode);
+ }
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+ << " Scheduling Instruction in cycle "
+ << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+ SU->dump(DAG));
+ return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, ScheduleDAGMI needs to update
+/// its state based on the current cycle before MachineSchedStrategy does.
+void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ if (IsTopNode) {
+ SU->TopReadyCycle = Top.CurrCycle;
+ Top.bumpNode(SU);
+ }
+ else {
+ SU->BotReadyCycle = Bot.CurrCycle;
+ Bot.bumpNode(SU);
+ }
+}
/// Create the standard converging machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
@@ -592,6 +1396,8 @@ public:
return SU;
}
+ virtual void schedNode(SUnit *SU, bool IsTopNode) {}
+
virtual void releaseTopNode(SUnit *SU) {
TopQ.push(SU);
}
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 1ce546b..bc383cb 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -99,6 +99,16 @@ namespace {
bool PerformTrivialForwardCoalescing(MachineInstr *MI,
MachineBasicBlock *MBB);
};
+
+ // SuccessorSorter - Sort Successors according to their loop depth.
+ struct SuccessorSorter {
+ SuccessorSorter(MachineLoopInfo *LoopInfo) : LI(LoopInfo) {}
+ bool operator()(const MachineBasicBlock *LHS,
+ const MachineBasicBlock *RHS) const {
+ return LI->getLoopDepth(LHS) < LI->getLoopDepth(RHS);
+ }
+ MachineLoopInfo *LI;
+ };
} // end anonymous namespace
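The comparator orders by loop depth only, so the std::stable_sort call in the hunk below keeps the original successor order within each depth class. An illustration with assumed depths:

    // Successor depths {2, 0, 1, 0} sort to {0, 0, 1, 2}; the two depth-0
    // blocks keep their relative order because the sort is stable, making
    // the shallowest (cheapest) blocks the preferred sink targets.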
char MachineSinking::ID = 0;
@@ -526,8 +536,11 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
// Otherwise, we should look at all the successors and decide which one
// we should sink to.
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- E = MBB->succ_end(); SI != E; ++SI) {
+ // We give successors with smaller loop depth higher priority.
+ SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(), MBB->succ_end());
+ std::stable_sort(Succs.begin(), Succs.end(), SuccessorSorter(LI));
+ for (SmallVector<MachineBasicBlock*, 4>::iterator SI = Succs.begin(),
+ E = Succs.end(); SI != E; ++SI) {
MachineBasicBlock *SuccBlock = *SI;
bool LocalUse = false;
if (AllUsesDominatedByBlock(Reg, SuccBlock, MBB,
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
new file mode 100644
index 0000000..1a3aa60
--- /dev/null
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -0,0 +1,1153 @@
+//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-trace-metrics"
+#include "MachineTraceMetrics.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SparseSet.h"
+
+using namespace llvm;
+
+char MachineTraceMetrics::ID = 0;
+char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
+
+INITIALIZE_PASS_BEGIN(MachineTraceMetrics,
+ "machine-trace-metrics", "Machine Trace Metrics", false, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineTraceMetrics,
+ "machine-trace-metrics", "Machine Trace Metrics", false, true)
+
+MachineTraceMetrics::MachineTraceMetrics()
+ : MachineFunctionPass(ID), MF(0), TII(0), TRI(0), MRI(0), Loops(0) {
+ std::fill(Ensembles, array_endof(Ensembles), (Ensemble*)0);
+}
+
+void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ TII = MF->getTarget().getInstrInfo();
+ TRI = MF->getTarget().getRegisterInfo();
+ ItinData = MF->getTarget().getInstrItineraryData();
+ MRI = &MF->getRegInfo();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ BlockInfo.resize(MF->getNumBlockIDs());
+ return false;
+}
+
+void MachineTraceMetrics::releaseMemory() {
+ MF = 0;
+ BlockInfo.clear();
+ for (unsigned i = 0; i != TS_NumStrategies; ++i) {
+ delete Ensembles[i];
+ Ensembles[i] = 0;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Fixed block information
+//===----------------------------------------------------------------------===//
+//
+// The number of instructions in a basic block and the CPU resources used by
+// those instructions don't depend on any given trace strategy.
+
+/// Compute the resource usage in basic block MBB.
+const MachineTraceMetrics::FixedBlockInfo*
+MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) {
+ assert(MBB && "No basic block");
+ FixedBlockInfo *FBI = &BlockInfo[MBB->getNumber()];
+ if (FBI->hasResources())
+ return FBI;
+
+ // Compute resource usage in the block.
+ // FIXME: Compute per-functional unit counts.
+ FBI->HasCalls = false;
+ unsigned InstrCount = 0;
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ const MachineInstr *MI = I;
+ if (MI->isTransient())
+ continue;
+ ++InstrCount;
+ if (MI->isCall())
+ FBI->HasCalls = true;
+ }
+ FBI->InstrCount = InstrCount;
+ return FBI;
+}
+
+//===----------------------------------------------------------------------===//
+// Ensemble utility functions
+//===----------------------------------------------------------------------===//
+
+MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
+ : MTM(*ct) {
+ BlockInfo.resize(MTM.BlockInfo.size());
+}
+
+// Virtual destructor serves as an anchor.
+MachineTraceMetrics::Ensemble::~Ensemble() {}
+
+const MachineLoop*
+MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const {
+ return MTM.Loops->getLoopFor(MBB);
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace above MBB.
+void MachineTraceMetrics::Ensemble::
+computeDepthResources(const MachineBasicBlock *MBB) {
+ TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+
+ // Compute resources from trace above. The top block is simple.
+ if (!TBI->Pred) {
+ TBI->InstrDepth = 0;
+ TBI->Head = MBB->getNumber();
+ return;
+ }
+
+ // Compute from the block above. A post-order traversal ensures the
+ // predecessor is always computed first.
+ TraceBlockInfo *PredTBI = &BlockInfo[TBI->Pred->getNumber()];
+ assert(PredTBI->hasValidDepth() && "Trace above has not been computed yet");
+ const FixedBlockInfo *PredFBI = MTM.getResources(TBI->Pred);
+ TBI->InstrDepth = PredTBI->InstrDepth + PredFBI->InstrCount;
+ TBI->Head = PredTBI->Head;
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace below MBB.
+void MachineTraceMetrics::Ensemble::
+computeHeightResources(const MachineBasicBlock *MBB) {
+ TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+
+ // Compute resources for the current block.
+ TBI->InstrHeight = MTM.getResources(MBB)->InstrCount;
+
+ // The trace tail is done.
+ if (!TBI->Succ) {
+ TBI->Tail = MBB->getNumber();
+ return;
+ }
+
+ // Compute from the block below. A post-order traversal ensures the
+ // successor is always computed first.
+ TraceBlockInfo *SuccTBI = &BlockInfo[TBI->Succ->getNumber()];
+ assert(SuccTBI->hasValidHeight() && "Trace below has not been computed yet");
+ TBI->InstrHeight += SuccTBI->InstrHeight;
+ TBI->Tail = SuccTBI->Tail;
+}
+
+// Check if depth resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getDepthResources(const MachineBasicBlock *MBB) const {
+ const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ return TBI->hasValidDepth() ? TBI : 0;
+}
+
+// Check if height resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getHeightResources(const MachineBasicBlock *MBB) const {
+ const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ return TBI->hasValidHeight() ? TBI : 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Trace Selection Strategies
+//===----------------------------------------------------------------------===//
+//
+// A trace selection strategy is implemented as a sub-class of Ensemble. The
+// trace through a block B is computed by two DFS traversals of the CFG
+// starting from B. One upwards, and one downwards. During the upwards DFS,
+// pickTracePred() is called on the post-ordered blocks. During the downwards
+// DFS, pickTraceSucc() is called in post-order.
+//
+
+// We never allow traces that leave loops, but we do allow traces to enter
+// nested loops. We also never allow traces to contain back-edges.
+//
+// This means that a loop header can never appear above the center block of a
+// trace, except as the trace head. Below the center block, loop exiting edges
+// are banned.
+//
+// Return true if an edge from the From loop to the To loop is leaving a loop.
+// Either of To and From can be null.
+static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {
+ return From && !From->contains(To);
+}
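For illustration, its behavior on the interesting cases, with an assumed loop L1 containing a nested loop L2:

    //   isExitingLoop(0,  L1) == false  // entering a loop from outside is OK
    //   isExitingLoop(L1, L2) == false  // descending into a nested loop is OK
    //   isExitingLoop(L2, L1) == true   // L2 does not contain L1: leaving L2
    //   isExitingLoop(L1, 0)  == true   // leaving all loops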
+
+// MinInstrCountEnsemble - Pick the trace that executes the least number of
+// instructions.
+namespace {
+class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
+ const char *getName() const { return "MinInstr"; }
+ const MachineBasicBlock *pickTracePred(const MachineBasicBlock*);
+ const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*);
+
+public:
+ MinInstrCountEnsemble(MachineTraceMetrics *mtm)
+ : MachineTraceMetrics::Ensemble(mtm) {}
+};
+}
+
+// Select the preferred predecessor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) {
+ if (MBB->pred_empty())
+ return 0;
+ const MachineLoop *CurLoop = getLoopFor(MBB);
+ // Don't leave loops, and never follow back-edges.
+ if (CurLoop && MBB == CurLoop->getHeader())
+ return 0;
+ unsigned CurCount = MTM.getResources(MBB)->InstrCount;
+ const MachineBasicBlock *Best = 0;
+ unsigned BestDepth = 0;
+ for (MachineBasicBlock::const_pred_iterator
+ I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
+ const MachineBasicBlock *Pred = *I;
+ const MachineTraceMetrics::TraceBlockInfo *PredTBI =
+ getDepthResources(Pred);
+ // Ignore cycles that aren't natural loops.
+ if (!PredTBI)
+ continue;
+ // Pick the predecessor that would give this block the smallest InstrDepth.
+ unsigned Depth = PredTBI->InstrDepth + CurCount;
+ if (!Best || Depth < BestDepth)
+ Best = Pred, BestDepth = Depth;
+ }
+ return Best;
+}
+
+// Select the preferred successor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) {
+ if (MBB->succ_empty())
+ return 0;
+ const MachineLoop *CurLoop = getLoopFor(MBB);
+ const MachineBasicBlock *Best = 0;
+ unsigned BestHeight = 0;
+ for (MachineBasicBlock::const_succ_iterator
+ I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
+ const MachineBasicBlock *Succ = *I;
+ // Don't consider back-edges.
+ if (CurLoop && Succ == CurLoop->getHeader())
+ continue;
+ // Don't consider successors exiting CurLoop.
+ if (isExitingLoop(CurLoop, getLoopFor(Succ)))
+ continue;
+ const MachineTraceMetrics::TraceBlockInfo *SuccTBI =
+ getHeightResources(Succ);
+ // Ignore cycles that aren't natural loops.
+ if (!SuccTBI)
+ continue;
+ // Pick the successor that would give this block the smallest InstrHeight.
+ unsigned Height = SuccTBI->InstrHeight;
+ if (!Best || Height < BestHeight)
+ Best = Succ, BestHeight = Height;
+ }
+ return Best;
+}
+
+// Get an Ensemble sub-class for the requested trace strategy.
+MachineTraceMetrics::Ensemble *
+MachineTraceMetrics::getEnsemble(MachineTraceMetrics::Strategy strategy) {
+ assert(strategy < TS_NumStrategies && "Invalid trace strategy enum");
+ Ensemble *&E = Ensembles[strategy];
+ if (E)
+ return E;
+
+ // Allocate new Ensemble on demand.
+ switch (strategy) {
+ case TS_MinInstrCount: return (E = new MinInstrCountEnsemble(this));
+ default: llvm_unreachable("Invalid trace strategy enum");
+ }
+}
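A hedged sketch of a client pass requesting an ensemble (the getAnalysisUsage boilerplate is assumed):

    MachineTraceMetrics &MTM = getAnalysis<MachineTraceMetrics>();
    MachineTraceMetrics::Ensemble *MinInstr =
        MTM.getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
    // Traces are computed lazily per block; if a pass changes the CFG or the
    // contents of a block, it must call MTM.invalidate(MBB) afterwards.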
+
+void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Invalidate traces through BB#" << MBB->getNumber() << '\n');
+ BlockInfo[MBB->getNumber()].invalidate();
+ for (unsigned i = 0; i != TS_NumStrategies; ++i)
+ if (Ensembles[i])
+ Ensembles[i]->invalidate(MBB);
+}
+
+void MachineTraceMetrics::verifyAnalysis() const {
+ if (!MF)
+ return;
+#ifndef NDEBUG
+ assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size");
+ for (unsigned i = 0; i != TS_NumStrategies; ++i)
+ if (Ensembles[i])
+ Ensembles[i]->verify();
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Trace building
+//===----------------------------------------------------------------------===//
+//
+// Traces are built by two CFG traversals. To avoid recomputing too much, use a
+// set abstraction that confines the search to the current loop, and doesn't
+// revisit blocks.
+
+namespace {
+struct LoopBounds {
+ MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;
+ SmallPtrSet<const MachineBasicBlock*, 8> Visited;
+ const MachineLoopInfo *Loops;
+ bool Downward;
+ LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
+ const MachineLoopInfo *loops)
+ : Blocks(blocks), Loops(loops), Downward(false) {}
+};
+}
+
+// Specialize po_iterator_storage in order to prune the post-order traversal so
+// it is limited to the current loop and doesn't traverse the loop back edges.
+namespace llvm {
+template<>
+class po_iterator_storage<LoopBounds, true> {
+ LoopBounds &LB;
+public:
+ po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+ void finishPostorder(const MachineBasicBlock*) {}
+
+ bool insertEdge(const MachineBasicBlock *From, const MachineBasicBlock *To) {
+ // Skip already visited To blocks.
+ MachineTraceMetrics::TraceBlockInfo &TBI = LB.Blocks[To->getNumber()];
+ if (LB.Downward ? TBI.hasValidHeight() : TBI.hasValidDepth())
+ return false;
+ // From is null once when To is the trace center block.
+ if (From) {
+ if (const MachineLoop *FromLoop = LB.Loops->getLoopFor(From)) {
+ // Don't follow backedges, don't leave FromLoop when going upwards.
+ if ((LB.Downward ? To : From) == FromLoop->getHeader())
+ return false;
+ // Don't leave FromLoop.
+ if (isExitingLoop(FromLoop, LB.Loops->getLoopFor(To)))
+ return false;
+ }
+ }
+ // To is a new block. Mark the block as visited in case the CFG has cycles
+ // that MachineLoopInfo didn't recognize as a natural loop.
+ return LB.Visited.insert(To);
+ }
+};
+}
+
+/// Compute the trace through MBB.
+void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Computing " << getName() << " trace through BB#"
+ << MBB->getNumber() << '\n');
+ // Set up loop bounds for the backwards post-order traversal.
+ LoopBounds Bounds(BlockInfo, MTM.Loops);
+
+ // Run an upwards post-order search for the trace start.
+ Bounds.Downward = false;
+ Bounds.Visited.clear();
+ typedef ipo_ext_iterator<const MachineBasicBlock*, LoopBounds> UpwardPO;
+ for (UpwardPO I = ipo_ext_begin(MBB, Bounds), E = ipo_ext_end(MBB, Bounds);
+ I != E; ++I) {
+ DEBUG(dbgs() << " pred for BB#" << I->getNumber() << ": ");
+ TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+ // All the predecessors have been visited, pick the preferred one.
+ TBI.Pred = pickTracePred(*I);
+ DEBUG({
+ if (TBI.Pred)
+ dbgs() << "BB#" << TBI.Pred->getNumber() << '\n';
+ else
+ dbgs() << "null\n";
+ });
+ // The trace leading to I is now known, compute the depth resources.
+ computeDepthResources(*I);
+ }
+
+ // Run a downwards post-order search for the trace end.
+ Bounds.Downward = true;
+ Bounds.Visited.clear();
+ typedef po_ext_iterator<const MachineBasicBlock*, LoopBounds> DownwardPO;
+ for (DownwardPO I = po_ext_begin(MBB, Bounds), E = po_ext_end(MBB, Bounds);
+ I != E; ++I) {
+ DEBUG(dbgs() << " succ for BB#" << I->getNumber() << ": ");
+ TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+ // All the successors have been visited, pick the preferred one.
+ TBI.Succ = pickTraceSucc(*I);
+ DEBUG({
+ if (TBI.Succ)
+ dbgs() << "BB#" << TBI.Succ->getNumber() << '\n';
+ else
+ dbgs() << "null\n";
+ });
+ // The trace leaving I is now known, compute the height resources.
+ computeHeightResources(*I);
+ }
+}
+
+/// Invalidate traces through BadMBB.
+void
+MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
+ SmallVector<const MachineBasicBlock*, 16> WorkList;
+ TraceBlockInfo &BadTBI = BlockInfo[BadMBB->getNumber()];
+
+ // Invalidate height resources of blocks above MBB.
+ if (BadTBI.hasValidHeight()) {
+ BadTBI.invalidateHeight();
+ WorkList.push_back(BadMBB);
+ do {
+ const MachineBasicBlock *MBB = WorkList.pop_back_val();
+ DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName()
+ << " height.\n");
+ // Find any MBB predecessors that have MBB as their preferred successor.
+ // They are the only ones that need to be invalidated.
+ for (MachineBasicBlock::const_pred_iterator
+ I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
+ TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()];
+ if (!TBI.hasValidHeight())
+ continue;
+ if (TBI.Succ == MBB) {
+ TBI.invalidateHeight();
+ WorkList.push_back(*I);
+ continue;
+ }
+ // Verify that TBI.Succ is actually a successor of *I.
+ assert((!TBI.Succ || (*I)->isSuccessor(TBI.Succ)) && "CFG changed");
+ }
+ } while (!WorkList.empty());
+ }
+
+ // Invalidate depth resources of blocks below BadMBB.
+ if (BadTBI.hasValidDepth()) {
+ BadTBI.invalidateDepth();
+ WorkList.push_back(BadMBB);
+ do {
+ const MachineBasicBlock *MBB = WorkList.pop_back_val();
+ DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName()
+ << " depth.\n");
+ // Find any MBB successors that have MBB as their preferred predecessor.
+ // They are the only ones that need to be invalidated.
+ for (MachineBasicBlock::const_succ_iterator
+ I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
+ TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()];
+ if (!TBI.hasValidDepth())
+ continue;
+ if (TBI.Pred == MBB) {
+ TBI.invalidateDepth();
+ WorkList.push_back(*I);
+ continue;
+ }
+ // Verify that TBI.Pred is actually a predecessor of *I.
+ assert((!TBI.Pred || (*I)->isPredecessor(TBI.Pred)) && "CFG changed");
+ }
+ } while (!WorkList.empty());
+ }
+
+ // Clear any per-instruction data. We only have to do this for BadMBB itself
+ // because the instructions in that block may change. Other blocks may be
+ // invalidated, but their instructions will stay the same, so there is no
+ // need to erase the Cycle entries. They will be overwritten when we
+ // recompute.
+ for (MachineBasicBlock::const_iterator I = BadMBB->begin(), E = BadMBB->end();
+ I != E; ++I)
+ Cycles.erase(I);
+}
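+
+// Usage sketch (hypothetical client code; MTM, E, and Other are illustrative
+// names, not defined in this file). Invalidation must happen before the CFG
+// changes; traces are recomputed lazily by the next getTrace() call:
+//
+//   MTM->invalidate(MBB);          // drop cached info for traces through MBB
+//   MBB->eraseFromParent();        // now it is safe to change the CFG
+//   Trace T = E->getTrace(Other);  // recomputes invalidated data on demand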
+
+void MachineTraceMetrics::Ensemble::verify() const {
+#ifndef NDEBUG
+ assert(BlockInfo.size() == MTM.MF->getNumBlockIDs() &&
+ "Outdated BlockInfo size");
+ for (unsigned Num = 0, e = BlockInfo.size(); Num != e; ++Num) {
+ const TraceBlockInfo &TBI = BlockInfo[Num];
+ if (TBI.hasValidDepth() && TBI.Pred) {
+ const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+ assert(MBB->isPredecessor(TBI.Pred) && "CFG doesn't match trace");
+ assert(BlockInfo[TBI.Pred->getNumber()].hasValidDepth() &&
+ "Trace is broken, depth should have been invalidated.");
+ const MachineLoop *Loop = getLoopFor(MBB);
+ assert(!(Loop && MBB == Loop->getHeader()) && "Trace contains backedge");
+ }
+ if (TBI.hasValidHeight() && TBI.Succ) {
+ const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+ assert(MBB->isSuccessor(TBI.Succ) && "CFG doesn't match trace");
+ assert(BlockInfo[TBI.Succ->getNumber()].hasValidHeight() &&
+ "Trace is broken, height should have been invalidated.");
+ const MachineLoop *Loop = getLoopFor(MBB);
+ const MachineLoop *SuccLoop = getLoopFor(TBI.Succ);
+ assert(!(Loop && Loop == SuccLoop && TBI.Succ == Loop->getHeader()) &&
+ "Trace contains backedge");
+ }
+ }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Data Dependencies
+//===----------------------------------------------------------------------===//
+//
+// Compute the depth and height of each instruction based on data dependencies
+// and instruction latencies. These cycle numbers assume that the CPU can issue
+// an infinite number of instructions per cycle as long as their dependencies
+// are ready.
+
+// A data dependency is represented as a defining MI and operand numbers on the
+// defining and using MI.
+namespace {
+struct DataDep {
+ const MachineInstr *DefMI;
+ unsigned DefOp;
+ unsigned UseOp;
+
+ DataDep(const MachineInstr *DefMI, unsigned DefOp, unsigned UseOp)
+ : DefMI(DefMI), DefOp(DefOp), UseOp(UseOp) {}
+
+ /// Create a DataDep from an SSA form virtual register.
+ DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp)
+ : UseOp(UseOp) {
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg));
+ MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg);
+ assert(!DefI.atEnd() && "Register has no defs");
+ DefMI = &*DefI;
+ DefOp = DefI.getOperandNo();
+ assert((++DefI).atEnd() && "Register has multiple defs");
+ }
+};
+}
+
+// Get the input data dependencies that must be ready before UseMI can issue.
+// Return true if UseMI has any physreg operands.
+static bool getDataDeps(const MachineInstr *UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ const MachineRegisterInfo *MRI) {
+ bool HasPhysRegs = false;
+ for (ConstMIOperands MO(UseMI); MO.isValid(); ++MO) {
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+ if (!Reg)
+ continue;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ HasPhysRegs = true;
+ continue;
+ }
+ // Collect virtual register reads.
+ if (MO->readsReg())
+ Deps.push_back(DataDep(MRI, Reg, MO.getOperandNo()));
+ }
+ return HasPhysRegs;
+}
+
+// Get the input data dependencies of a PHI instruction, using Pred as the
+// preferred predecessor.
+// This will add at most one dependency to Deps.
+static void getPHIDeps(const MachineInstr *UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ const MachineBasicBlock *Pred,
+ const MachineRegisterInfo *MRI) {
+ // No predecessor at the beginning of a trace. Ignore dependencies.
+ if (!Pred)
+ return;
+ assert(UseMI->isPHI() && UseMI->getNumOperands() % 2 && "Bad PHI");
+ for (unsigned i = 1; i != UseMI->getNumOperands(); i += 2) {
+ if (UseMI->getOperand(i + 1).getMBB() == Pred) {
+ unsigned Reg = UseMI->getOperand(i).getReg();
+ Deps.push_back(DataDep(MRI, Reg, i));
+ return;
+ }
+ }
+}
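+
+// For example, given the PHI below (operand 0 is the def, followed by
+// (value, predecessor) operand pairs):
+//   %vreg2 = PHI %vreg0, <BB#1>, %vreg1, <BB#2>
+// calling getPHIDeps with Pred == BB#2 adds the single dependency on the def
+// of %vreg1, with UseOp == 3.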
+
+// Keep track of physreg data dependencies by recording each live register unit.
+// Associate each regunit with an instruction operand. Depending on the
+// direction instructions are scanned, it could be the operand that defined the
+// regunit, or the highest operand to read the regunit.
+namespace {
+struct LiveRegUnit {
+ unsigned RegUnit;
+ unsigned Cycle;
+ const MachineInstr *MI;
+ unsigned Op;
+
+ unsigned getSparseSetIndex() const { return RegUnit; }
+
+ LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(0), Op(0) {}
+};
+}
+
+// Identify physreg dependencies for UseMI, and update the live regunit
+// tracking set when scanning instructions downwards.
+static void updatePhysDepsDownwards(const MachineInstr *UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ SparseSet<LiveRegUnit> &RegUnits,
+ const TargetRegisterInfo *TRI) {
+ SmallVector<unsigned, 8> Kills;
+ SmallVector<unsigned, 8> LiveDefOps;
+
+ for (ConstMIOperands MO(UseMI); MO.isValid(); ++MO) {
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ // Track live defs and kills for updating RegUnits.
+ if (MO->isDef()) {
+ if (MO->isDead())
+ Kills.push_back(Reg);
+ else
+ LiveDefOps.push_back(MO.getOperandNo());
+ } else if (MO->isKill())
+ Kills.push_back(Reg);
+ // Identify dependencies.
+ if (!MO->readsReg())
+ continue;
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units);
+ if (I == RegUnits.end())
+ continue;
+ Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo()));
+ break;
+ }
+ }
+
+ // Update RegUnits to reflect live registers after UseMI.
+ // First kills.
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+ for (MCRegUnitIterator Units(Kills[i], TRI); Units.isValid(); ++Units)
+ RegUnits.erase(*Units);
+
+ // Second, live defs.
+ for (unsigned i = 0, e = LiveDefOps.size(); i != e; ++i) {
+ unsigned DefOp = LiveDefOps[i];
+ for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg(), TRI);
+ Units.isValid(); ++Units) {
+ LiveRegUnit &LRU = RegUnits[*Units];
+ LRU.MI = UseMI;
+ LRU.Op = DefOp;
+ }
+ }
+}
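+
+// For example, when scanning downwards past "%EAX = ADD32rr %EAX<kill>, %EBX",
+// the reads of %EAX and %EBX add dependencies on whichever instructions
+// currently own those regunits, the kill then erases the %EAX units, and the
+// live def finally re-inserts them pointing at this instruction's def operand.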
+
+/// The length of the critical path through a trace is the maximum of two path
+/// lengths:
+///
+/// 1. The maximum height+depth over all instructions in the trace center block.
+///
+/// 2. The longest cross-block dependency chain. For small blocks, it is
+/// possible that the critical path through the trace doesn't include any
+/// instructions in the block.
+///
+/// This function computes the second number from the live-in list of the
+/// center block.
+unsigned MachineTraceMetrics::Ensemble::
+computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) {
+ assert(TBI.HasValidInstrDepths && "Missing depth info");
+ assert(TBI.HasValidInstrHeights && "Missing height info");
+ unsigned MaxLen = 0;
+ for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) {
+ const LiveInReg &LIR = TBI.LiveIns[i];
+ if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg))
+ continue;
+ const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+ // Ignore dependencies outside the current trace.
+ const TraceBlockInfo &DefTBI = BlockInfo[DefMI->getParent()->getNumber()];
+ if (!DefTBI.hasValidDepth() || DefTBI.Head != TBI.Head)
+ continue;
+ unsigned Len = LIR.Height + Cycles[DefMI].Depth;
+ MaxLen = std::max(MaxLen, Len);
+ }
+ return MaxLen;
+}
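+
+// For example, a live-in register whose def has depth 3 (cycles from the top
+// of the trace) and height 5 (cycles required below it) contributes a
+// cross-block path of 3 + 5 = 8 cycles to MaxLen.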
+
+/// Compute instruction depths for all instructions above or in MBB in its
+/// trace. This assumes that the trace through MBB has already been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrDepths(const MachineBasicBlock *MBB) {
+ // The top of the trace may already be computed, and HasValidInstrDepths
+ // implies Head->HasValidInstrDepths, so we only need to start from the first
+ // block in the trace that needs to be recomputed.
+ SmallVector<const MachineBasicBlock*, 8> Stack;
+ do {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ assert(TBI.hasValidDepth() && "Incomplete trace");
+ if (TBI.HasValidInstrDepths)
+ break;
+ Stack.push_back(MBB);
+ MBB = TBI.Pred;
+ } while (MBB);
+
+ // FIXME: If MBB is non-null at this point, it is the last pre-computed block
+ // in the trace. We should track any live-out physregs that were defined in
+ // the trace. This is quite rare in SSA form, typically created by CSE
+ // hoisting a compare.
+ SparseSet<LiveRegUnit> RegUnits;
+ RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+ // Go through trace blocks in top-down order, stopping after the center block.
+ SmallVector<DataDep, 8> Deps;
+ while (!Stack.empty()) {
+ MBB = Stack.pop_back_val();
+ DEBUG(dbgs() << "Depths for BB#" << MBB->getNumber() << ":\n");
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ TBI.HasValidInstrDepths = true;
+ TBI.CriticalPath = 0;
+
+ // Also compute the critical path length through MBB when possible.
+ if (TBI.HasValidInstrHeights)
+ TBI.CriticalPath = computeCrossBlockCriticalPath(TBI);
+
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ const MachineInstr *UseMI = I;
+
+ // Collect all data dependencies.
+ Deps.clear();
+ if (UseMI->isPHI())
+ getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI);
+ else if (getDataDeps(UseMI, Deps, MTM.MRI))
+ updatePhysDepsDownwards(UseMI, Deps, RegUnits, MTM.TRI);
+
+ // Filter and process dependencies, computing the earliest issue cycle.
+ unsigned Cycle = 0;
+ for (unsigned i = 0, e = Deps.size(); i != e; ++i) {
+ const DataDep &Dep = Deps[i];
+ const TraceBlockInfo &DepTBI =
+ BlockInfo[Dep.DefMI->getParent()->getNumber()];
+ // Ignore dependencies from outside the current trace.
+ if (!DepTBI.hasValidDepth() || DepTBI.Head != TBI.Head)
+ continue;
+ assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency");
+ unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth;
+ // Add latency if DefMI is a real instruction. Transients get latency 0.
+ if (!Dep.DefMI->isTransient())
+ DepCycle += MTM.TII->computeOperandLatency(MTM.ItinData,
+ Dep.DefMI, Dep.DefOp,
+ UseMI, Dep.UseOp,
+ /* FindMin = */ false);
+ Cycle = std::max(Cycle, DepCycle);
+ }
+ // Remember the instruction depth.
+ InstrCycles &MICycles = Cycles[UseMI];
+ MICycles.Depth = Cycle;
+
+ if (!TBI.HasValidInstrHeights) {
+ DEBUG(dbgs() << Cycle << '\t' << *UseMI);
+ continue;
+ }
+ // Update critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height);
+ DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *UseMI);
+ }
+ }
+}
+
+// Identify physreg dependencies for MI when scanning instructions upwards.
+// The given Height is the issue height computed from virtual register
+// dependencies alone; return the issue height of MI after also accounting
+// for any live regunit dependencies.
+static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
+ SparseSet<LiveRegUnit> &RegUnits,
+ const InstrItineraryData *ItinData,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ SmallVector<unsigned, 8> ReadOps;
+ for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (MO->readsReg())
+ ReadOps.push_back(MO.getOperandNo());
+ if (!MO->isDef())
+ continue;
+ // This is a def of Reg. Remove corresponding entries from RegUnits, and
+ // update MI Height to consider the physreg dependencies.
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units);
+ if (I == RegUnits.end())
+ continue;
+ unsigned DepHeight = I->Cycle;
+ if (!MI->isTransient()) {
+ // We may not know the UseMI of this dependency, if it came from the
+ // live-in list.
+ if (I->MI)
+ DepHeight += TII->computeOperandLatency(ItinData,
+ MI, MO.getOperandNo(),
+ I->MI, I->Op);
+ else
+ // No UseMI. Just use the MI latency instead.
+ DepHeight += TII->getInstrLatency(ItinData, MI);
+ }
+ Height = std::max(Height, DepHeight);
+ // This regunit is dead above MI.
+ RegUnits.erase(I);
+ }
+ }
+
+ // Now we know the height of MI. Update any regunits read.
+ for (unsigned i = 0, e = ReadOps.size(); i != e; ++i) {
+ unsigned Reg = MI->getOperand(ReadOps[i]).getReg();
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ LiveRegUnit &LRU = RegUnits[*Units];
+ // Set the height to the highest reader of the unit.
+ if (LRU.Cycle <= Height && LRU.MI != MI) {
+ LRU.Cycle = Height;
+ LRU.MI = MI;
+ LRU.Op = ReadOps[i];
+ }
+ }
+ }
+
+ return Height;
+}
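+
+// For example, if the upwards scan reaches a def of %EAX whose regunits were
+// last read at height 4, and the def-to-use latency is 2, the height of MI is
+// raised to at least 4 + 2 = 6 and the regunits of %EAX become dead above MI.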
+
+typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
+
+// Push the height of DefMI upwards if required to match UseMI.
+// Return true if this is the first time DefMI was seen.
+static bool pushDepHeight(const DataDep &Dep,
+ const MachineInstr *UseMI, unsigned UseHeight,
+ MIHeightMap &Heights,
+ const InstrItineraryData *ItinData,
+ const TargetInstrInfo *TII) {
+ // Adjust height by Dep.DefMI latency.
+ if (!Dep.DefMI->isTransient())
+ UseHeight += TII->computeOperandLatency(ItinData, Dep.DefMI, Dep.DefOp,
+ UseMI, Dep.UseOp);
+
+ // Update Heights[DefMI] to be the maximum height seen.
+ MIHeightMap::iterator I;
+ bool New;
+ tie(I, New) = Heights.insert(std::make_pair(Dep.DefMI, UseHeight));
+ if (New)
+ return true;
+
+ // DefMI has been pushed before. Give it the max height.
+ if (I->second < UseHeight)
+ I->second = UseHeight;
+ return false;
+}
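+
+// For example, if UseMI must issue at height 2 and the DefMI-to-UseMI latency
+// is 3, then DefMI must issue at height 2 + 3 = 5; Heights[DefMI] is raised
+// to 5 unless a previously seen use already required more.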
+
+/// Assuming that DefMI was used by Trace.back(), add it to the live-in lists
+/// of all the blocks in Trace. Stop when reaching the block that contains
+/// DefMI.
+void MachineTraceMetrics::Ensemble::
+addLiveIns(const MachineInstr *DefMI,
+ ArrayRef<const MachineBasicBlock*> Trace) {
+ assert(!Trace.empty() && "Trace should contain at least one block");
+ unsigned Reg = DefMI->getOperand(0).getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ const MachineBasicBlock *DefMBB = DefMI->getParent();
+
+ // Reg is live-in to all blocks in Trace that follow DefMBB.
+ for (unsigned i = Trace.size(); i; --i) {
+ const MachineBasicBlock *MBB = Trace[i-1];
+ if (MBB == DefMBB)
+ return;
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ // Just add the register. The height will be updated later.
+ TBI.LiveIns.push_back(Reg);
+ }
+}
+
+/// Compute instruction heights in the trace through MBB. This updates MBB and
+/// the blocks below it in the trace. It is assumed that the trace has already
+/// been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrHeights(const MachineBasicBlock *MBB) {
+ // The bottom of the trace may already be computed.
+ // Find the blocks that need updating.
+ SmallVector<const MachineBasicBlock*, 8> Stack;
+ do {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ assert(TBI.hasValidHeight() && "Incomplete trace");
+ if (TBI.HasValidInstrHeights)
+ break;
+ Stack.push_back(MBB);
+ TBI.LiveIns.clear();
+ MBB = TBI.Succ;
+ } while (MBB);
+
+ // As we move upwards in the trace, keep track of instructions that are
+ // required by deeper trace instructions. Map MI -> height required so far.
+ MIHeightMap Heights;
+
+ // For physregs, the def isn't known when we see the use.
+ // Instead, keep track of the highest use of each regunit.
+ SparseSet<LiveRegUnit> RegUnits;
+ RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+ // If the bottom of the trace was already precomputed, initialize heights
+ // from its live-in list.
+ // MBB is the highest precomputed block in the trace.
+ if (MBB) {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) {
+ LiveInReg LI = TBI.LiveIns[i];
+ if (TargetRegisterInfo::isVirtualRegister(LI.Reg)) {
+ // For virtual registers, the def latency is included.
+ unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)];
+ if (Height < LI.Height)
+ Height = LI.Height;
+ } else {
+ // For register units, the def latency is not included because we don't
+ // know the def yet.
+ RegUnits[LI.Reg].Cycle = LI.Height;
+ }
+ }
+ }
+
+ // Go through the trace blocks in bottom-up order.
+ SmallVector<DataDep, 8> Deps;
+ for (; !Stack.empty(); Stack.pop_back()) {
+ MBB = Stack.back();
+ DEBUG(dbgs() << "Heights for BB#" << MBB->getNumber() << ":\n");
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ TBI.HasValidInstrHeights = true;
+ TBI.CriticalPath = 0;
+
+ // Get dependencies from PHIs in the trace successor.
+ const MachineBasicBlock *Succ = TBI.Succ;
+ // If MBB is the last block in the trace, and it has a back-edge to the
+ // loop header, get loop-carried dependencies from PHIs in the header. For
+ // that purpose, pretend that all the loop header PHIs have height 0.
+ if (!Succ)
+ if (const MachineLoop *Loop = getLoopFor(MBB))
+ if (MBB->isSuccessor(Loop->getHeader()))
+ Succ = Loop->getHeader();
+
+ if (Succ) {
+ for (MachineBasicBlock::const_iterator I = Succ->begin(), E = Succ->end();
+ I != E && I->isPHI(); ++I) {
+ const MachineInstr *PHI = I;
+ Deps.clear();
+ getPHIDeps(PHI, Deps, MBB, MTM.MRI);
+ if (!Deps.empty()) {
+ // Loop header PHI heights are all 0.
+ unsigned Height = TBI.Succ ? Cycles.lookup(PHI).Height : 0;
+ DEBUG(dbgs() << "pred\t" << Height << '\t' << *PHI);
+ if (pushDepHeight(Deps.front(), PHI, Height,
+ Heights, MTM.ItinData, MTM.TII))
+ addLiveIns(Deps.front().DefMI, Stack);
+ }
+ }
+ }
+
+ // Go through the block backwards.
+ for (MachineBasicBlock::const_iterator BI = MBB->end(), BB = MBB->begin();
+ BI != BB;) {
+ const MachineInstr *MI = --BI;
+
+ // Find the MI height as determined by virtual register uses in the
+ // trace below.
+ unsigned Cycle = 0;
+ MIHeightMap::iterator HeightI = Heights.find(MI);
+ if (HeightI != Heights.end()) {
+ Cycle = HeightI->second;
+ // We won't be seeing any more MI uses.
+ Heights.erase(HeightI);
+ }
+
+ // Don't process PHI deps. They depend on the specific predecessor, and
+ // we'll get them when visiting the predecessor.
+ Deps.clear();
+ bool HasPhysRegs = !MI->isPHI() && getDataDeps(MI, Deps, MTM.MRI);
+
+ // There may also be regunit dependencies to include in the height.
+ if (HasPhysRegs)
+ Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits,
+ MTM.ItinData, MTM.TII, MTM.TRI);
+
+ // Update the required height of any virtual registers read by MI.
+ for (unsigned i = 0, e = Deps.size(); i != e; ++i)
+ if (pushDepHeight(Deps[i], MI, Cycle, Heights, MTM.ItinData, MTM.TII))
+ addLiveIns(Deps[i].DefMI, Stack);
+
+ InstrCycles &MICycles = Cycles[MI];
+ MICycles.Height = Cycle;
+ if (!TBI.HasValidInstrDepths) {
+ DEBUG(dbgs() << Cycle << '\t' << *MI);
+ continue;
+ }
+ // Update critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Depth);
+ DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *MI);
+ }
+
+ // Update virtual live-in heights. They were added by addLiveIns() with a 0
+ // height because the final height isn't known until now.
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " Live-ins:");
+ for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) {
+ LiveInReg &LIR = TBI.LiveIns[i];
+ const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+ LIR.Height = Heights.lookup(DefMI);
+ DEBUG(dbgs() << ' ' << PrintReg(LIR.Reg) << '@' << LIR.Height);
+ }
+
+ // Transfer the live regunits to the live-in list.
+ for (SparseSet<LiveRegUnit>::const_iterator
+ RI = RegUnits.begin(), RE = RegUnits.end(); RI != RE; ++RI) {
+ TBI.LiveIns.push_back(LiveInReg(RI->RegUnit, RI->Cycle));
+ DEBUG(dbgs() << ' ' << PrintRegUnit(RI->RegUnit, MTM.TRI)
+ << '@' << RI->Cycle);
+ }
+ DEBUG(dbgs() << '\n');
+
+ if (!TBI.HasValidInstrDepths)
+ continue;
+ // Add live-ins to the critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath,
+ computeCrossBlockCriticalPath(TBI));
+ DEBUG(dbgs() << "Critical path: " << TBI.CriticalPath << '\n');
+ }
+}
+
+MachineTraceMetrics::Trace
+MachineTraceMetrics::Ensemble::getTrace(const MachineBasicBlock *MBB) {
+ // FIXME: Check cache tags, recompute as needed.
+ computeTrace(MBB);
+ computeInstrDepths(MBB);
+ computeInstrHeights(MBB);
+ return Trace(*this, BlockInfo[MBB->getNumber()]);
+}
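+
+// Usage sketch (hypothetical client pass, assuming this analysis was
+// requested via getAnalysisUsage()):
+//
+//   MachineTraceMetrics &MTM = getAnalysis<MachineTraceMetrics>();
+//   MachineTraceMetrics::Ensemble *E =
+//       MTM.getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+//   MachineTraceMetrics::Trace T = E->getTrace(MBB);
+//   unsigned CritPath = T.getCriticalPath();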
+
+unsigned
+MachineTraceMetrics::Trace::getInstrSlack(const MachineInstr *MI) const {
+ assert(MI && "Not an instruction.");
+ assert(getBlockNum() == unsigned(MI->getParent()->getNumber()) &&
+ "MI must be in the trace center block");
+ InstrCycles Cyc = getInstrCycles(MI);
+ return getCriticalPath() - (Cyc.Depth + Cyc.Height);
+}
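+
+// For example, with a critical path of 10 cycles, an instruction with depth 3
+// and height 4 has a slack of 10 - (3 + 4) = 3 cycles.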
+
+unsigned
+MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const {
+ const MachineBasicBlock *MBB = TE.MTM.MF->getBlockNumbered(getBlockNum());
+ SmallVector<DataDep, 1> Deps;
+ getPHIDeps(PHI, Deps, MBB, TE.MTM.MRI);
+ assert(Deps.size() == 1 && "PHI doesn't have MBB as a predecessor");
+ DataDep &Dep = Deps.front();
+ unsigned DepCycle = getInstrCycles(Dep.DefMI).Depth;
+ // Add latency if DefMI is a real instruction. Transients get latency 0.
+ if (!Dep.DefMI->isTransient())
+ DepCycle += TE.MTM.TII->computeOperandLatency(TE.MTM.ItinData,
+ Dep.DefMI, Dep.DefOp,
+ PHI, Dep.UseOp,
+ /* FindMin = */ false);
+ return DepCycle;
+}
+
+unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
+ // For now, we compute the resource depth from instruction count / issue
+ // width. Eventually, we should compute resource depth per functional unit
+ // and return the max.
+ unsigned Instrs = TBI.InstrDepth;
+ if (Bottom)
+ Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount;
+ if (const MCSchedModel *Model = TE.MTM.ItinData->SchedModel)
+ if (Model->IssueWidth != 0)
+ return Instrs / Model->IssueWidth;
+ // Assume issue width 1 without a schedule model.
+ return Instrs;
+}
+
+unsigned MachineTraceMetrics::Trace::
+getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks) const {
+ unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;
+ for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i)
+ Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount;
+ if (const MCSchedModel *Model = TE.MTM.ItinData->SchedModel)
+ if (Model->IssueWidth != 0)
+ return Instrs / Model->IssueWidth;
+ // Assume issue width 1 without a schedule model.
+ return Instrs;
+}
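+
+// For example, a trace of 12 instructions on a model with an issue width of 4
+// has a resource length of 12 / 4 = 3 cycles; without a schedule model the
+// same trace is assumed to take 12 cycles.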
+
+void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const {
+ OS << getName() << " ensemble:\n";
+ for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {
+ OS << " BB#" << i << '\t';
+ BlockInfo[i].print(OS);
+ OS << '\n';
+ }
+}
+
+void MachineTraceMetrics::TraceBlockInfo::print(raw_ostream &OS) const {
+ if (hasValidDepth()) {
+ OS << "depth=" << InstrDepth;
+ if (Pred)
+ OS << " pred=BB#" << Pred->getNumber();
+ else
+ OS << " pred=null";
+ OS << " head=BB#" << Head;
+ if (HasValidInstrDepths)
+ OS << " +instrs";
+ } else
+ OS << "depth invalid";
+ OS << ", ";
+ if (hasValidHeight()) {
+ OS << "height=" << InstrHeight;
+ if (Succ)
+ OS << " succ=BB#" << Succ->getNumber();
+ else
+ OS << " succ=null";
+ OS << " tail=BB#" << Tail;
+ if (HasValidInstrHeights)
+ OS << " +instrs";
+ } else
+ OS << "height invalid";
+ if (HasValidInstrDepths && HasValidInstrHeights)
+ OS << ", crit=" << CriticalPath;
+}
+
+void MachineTraceMetrics::Trace::print(raw_ostream &OS) const {
+ unsigned MBBNum = &TBI - &TE.BlockInfo[0];
+
+ OS << TE.getName() << " trace BB#" << TBI.Head << " --> BB#" << MBBNum
+ << " --> BB#" << TBI.Tail << ':';
+ if (TBI.hasValidHeight() && TBI.hasValidDepth())
+ OS << ' ' << getInstrCount() << " instrs.";
+ if (TBI.HasValidInstrDepths && TBI.HasValidInstrHeights)
+ OS << ' ' << TBI.CriticalPath << " cycles.";
+
+ const MachineTraceMetrics::TraceBlockInfo *Block = &TBI;
+ OS << "\nBB#" << MBBNum;
+ while (Block->hasValidDepth() && Block->Pred) {
+ unsigned Num = Block->Pred->getNumber();
+ OS << " <- BB#" << Num;
+ Block = &TE.BlockInfo[Num];
+ }
+
+ Block = &TBI;
+ OS << "\n ";
+ while (Block->hasValidHeight() && Block->Succ) {
+ unsigned Num = Block->Succ->getNumber();
+ OS << " -> BB#" << Num;
+ Block = &TE.BlockInfo[Num];
+ }
+ OS << '\n';
+}
diff --git a/lib/CodeGen/MachineTraceMetrics.h b/lib/CodeGen/MachineTraceMetrics.h
new file mode 100644
index 0000000..c5b86f3
--- /dev/null
+++ b/lib/CodeGen/MachineTraceMetrics.h
@@ -0,0 +1,341 @@
+//===- lib/CodeGen/MachineTraceMetrics.h - Super-scalar metrics -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface for the MachineTraceMetrics analysis pass
+// that estimates CPU resource usage and critical data dependency paths through
+// preferred traces. This is useful for super-scalar CPUs where execution speed
+// can be limited both by data dependencies and by limited execution resources.
+//
+// Out-of-order CPUs will often be executing instructions from multiple basic
+// blocks at the same time. This makes it difficult to estimate the resource
+// usage accurately in a single basic block. Resources can be estimated better
+// by looking at a trace through the current basic block.
+//
+// For every block, the MachineTraceMetrics pass will pick a preferred trace
+// that passes through the block. The trace is chosen based on loop structure,
+// branch probabilities, and resource usage. The intention is to pick likely
+// traces that would be the most affected by code transformations.
+//
+// It is expensive to compute a full arbitrary trace for every block, so to
+// save some computations, traces are chosen to be convergent. This means that
+// if the traces through basic blocks A and B ever cross when moving away from
+// A and B, they never diverge again. This applies in both directions: if the
+// traces meet above A and B, they won't diverge when going further back.
+//
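+// For example, in a diamond CFG (Entry -> Left/Right -> Join), the traces
+// through Left and Right may differ between those two blocks, but once both
+// pick Entry as their trace predecessor and Join as their trace successor,
+// the segments above Entry and below Join are shared and computed only once.
+//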
+// Traces tend to align with loops. The trace through a block in an inner loop
+// will begin at the loop entry block and end at a back edge. If there are
+// nested loops, the trace may begin and end at those instead.
+//
+// For each trace, we compute the critical path length, which is the number of
+// cycles required to execute the trace when execution is limited by data
+// dependencies only. We also compute the resource height, which is the number
+// of cycles required to execute all instructions in the trace when ignoring
+// data dependencies.
+//
+// Every instruction in the current block has a slack - the number of cycles
+// execution of the instruction can be delayed without extending the critical
+// path.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINE_TRACE_METRICS_H
+#define LLVM_CODEGEN_MACHINE_TRACE_METRICS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class InstrItineraryData;
+class MachineBasicBlock;
+class MachineInstr;
+class MachineLoop;
+class MachineLoopInfo;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+class raw_ostream;
+
+class MachineTraceMetrics : public MachineFunctionPass {
+ const MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const InstrItineraryData *ItinData;
+ const MachineRegisterInfo *MRI;
+ const MachineLoopInfo *Loops;
+
+public:
+ class Ensemble;
+ class Trace;
+ static char ID;
+ MachineTraceMetrics();
+ void getAnalysisUsage(AnalysisUsage&) const;
+ bool runOnMachineFunction(MachineFunction&);
+ void releaseMemory();
+ void verifyAnalysis() const;
+
+ friend class Ensemble;
+ friend class Trace;
+
+ /// Per-basic block information that doesn't depend on the trace through the
+ /// block.
+ struct FixedBlockInfo {
+ /// The number of non-trivial instructions in the block.
+ /// Doesn't count PHI and COPY instructions that are likely to be removed.
+ unsigned InstrCount;
+
+ /// True when the block contains calls.
+ bool HasCalls;
+
+ FixedBlockInfo() : InstrCount(~0u), HasCalls(false) {}
+
+ /// Returns true when resource information for this block has been computed.
+ bool hasResources() const { return InstrCount != ~0u; }
+
+ /// Invalidate resource information.
+ void invalidate() { InstrCount = ~0u; }
+ };
+
+ /// Get the fixed resource information about MBB. Compute it on demand.
+ const FixedBlockInfo *getResources(const MachineBasicBlock*);
+
+ /// A virtual register or regunit required by a basic block or its trace
+ /// successors.
+ struct LiveInReg {
+ /// The virtual register required, or a register unit.
+ unsigned Reg;
+
+ /// For virtual registers: Minimum height of the defining instruction.
+ /// For regunits: Height of the highest user in the trace.
+ unsigned Height;
+
+ LiveInReg(unsigned Reg, unsigned Height = 0) : Reg(Reg), Height(Height) {}
+ };
+
+ /// Per-basic block information that relates to a specific trace through the
+ /// block. Trace convergence means that only one of these is required per
+ /// block in a trace ensemble.
+ struct TraceBlockInfo {
+ /// Trace predecessor, or NULL for the first block in the trace.
+ /// Valid when hasValidDepth().
+ const MachineBasicBlock *Pred;
+
+ /// Trace successor, or NULL for the last block in the trace.
+ /// Valid when hasValidHeight().
+ const MachineBasicBlock *Succ;
+
+ /// The block number of the head of the trace. (When hasValidDepth()).
+ unsigned Head;
+
+ /// The block number of the tail of the trace. (When hasValidHeight()).
+ unsigned Tail;
+
+ /// Accumulated number of instructions in the trace above this block.
+ /// Does not include instructions in this block.
+ unsigned InstrDepth;
+
+ /// Accumulated number of instructions in the trace below this block.
+ /// Includes instructions in this block.
+ unsigned InstrHeight;
+
+ TraceBlockInfo() :
+ Pred(0), Succ(0),
+ InstrDepth(~0u), InstrHeight(~0u),
+ HasValidInstrDepths(false), HasValidInstrHeights(false) {}
+
+ /// Returns true if the depth resources have been computed from the trace
+ /// above this block.
+ bool hasValidDepth() const { return InstrDepth != ~0u; }
+
+ /// Returns true if the height resources have been computed from the trace
+ /// below this block.
+ bool hasValidHeight() const { return InstrHeight != ~0u; }
+
+ /// Invalidate depth resources when some block above this one has changed.
+ void invalidateDepth() { InstrDepth = ~0u; HasValidInstrDepths = false; }
+
+ /// Invalidate height resources when a block below this one has changed.
+ void invalidateHeight() { InstrHeight = ~0u; HasValidInstrHeights = false; }
+
+ // Data-dependency-related information. Per-instruction depth and height
+ // are computed from data dependencies in the current trace, using
+ // itinerary data.
+
+ /// Instruction depths have been computed. This implies hasValidDepth().
+ bool HasValidInstrDepths;
+
+ /// Instruction heights have been computed. This implies hasValidHeight().
+ bool HasValidInstrHeights;
+
+ /// Critical path length. This is the number of cycles in the longest data
+ /// dependency chain through the trace. This is only valid when both
+ /// HasValidInstrDepths and HasValidInstrHeights are set.
+ unsigned CriticalPath;
+
+ /// Live-in registers. These registers are defined above the current block
+ /// and used by this block or a block below it.
+ /// This does not include PHI uses in the current block, but it does
+ /// include PHI uses in deeper blocks.
+ SmallVector<LiveInReg, 4> LiveIns;
+
+ void print(raw_ostream&) const;
+ };
+
+ /// InstrCycles represents the cycle height and depth of an instruction in a
+ /// trace.
+ struct InstrCycles {
+ /// Earliest issue cycle as determined by data dependencies and instruction
+ /// latencies from the beginning of the trace. Data dependencies from
+ /// before the trace are not included.
+ unsigned Depth;
+
+ /// Minimum number of cycles from when this instruction is issued to the
+ /// end of the trace, as determined by data dependencies and instruction
+ /// latencies.
+ unsigned Height;
+ };
+
+ /// A trace represents a plausible sequence of executed basic blocks that
+ /// passes through the current basic block once. The Trace class serves as a
+ /// handle to internal cached data structures.
+ class Trace {
+ Ensemble &TE;
+ TraceBlockInfo &TBI;
+
+ unsigned getBlockNum() const { return &TBI - &TE.BlockInfo[0]; }
+
+ public:
+ explicit Trace(Ensemble &te, TraceBlockInfo &tbi) : TE(te), TBI(tbi) {}
+ void print(raw_ostream&) const;
+
+ /// Compute the total number of instructions in the trace.
+ unsigned getInstrCount() const {
+ return TBI.InstrDepth + TBI.InstrHeight;
+ }
+
+ /// Return the resource depth of the top/bottom of the trace center block.
+ /// This is the number of cycles required to execute all instructions from
+ /// the trace head to the trace center block. The resource depth only
+ /// considers execution resources; it ignores data dependencies.
+ /// When Bottom is set, instructions in the trace center block are included.
+ unsigned getResourceDepth(bool Bottom) const;
+
+ /// Return the resource length of the trace. This is the number of cycles
+ /// required to execute the instructions in the trace if they were all
+ /// independent, exposing the maximum instruction-level parallelism.
+ ///
+ /// Any blocks in Extrablocks are included as if they were part of the
+ /// trace.
+ unsigned getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks =
+ ArrayRef<const MachineBasicBlock*>()) const;
+
+ /// Return the length of the (data dependency) critical path through the
+ /// trace.
+ unsigned getCriticalPath() const { return TBI.CriticalPath; }
+
+ /// Return the depth and height of MI. The depth is only valid for
+ /// instructions in or above the trace center block. The height is only
+ /// valid for instructions in or below the trace center block.
+ InstrCycles getInstrCycles(const MachineInstr *MI) const {
+ return TE.Cycles.lookup(MI);
+ }
+
+ /// Return the slack of MI. This is the number of cycles MI can be delayed
+ /// before the critical path becomes longer.
+ /// MI must be an instruction in the trace center block.
+ unsigned getInstrSlack(const MachineInstr *MI) const;
+
+ /// Return the Depth of a PHI instruction in a trace center block successor.
+ /// The PHI does not have to be part of the trace.
+ unsigned getPHIDepth(const MachineInstr *PHI) const;
+ };
+
+ /// A trace ensemble is a collection of traces selected using the same
+ /// strategy, for example 'minimum resource height'. There is one trace for
+ /// every block in the function.
+ class Ensemble {
+ SmallVector<TraceBlockInfo, 4> BlockInfo;
+ DenseMap<const MachineInstr*, InstrCycles> Cycles;
+ friend class Trace;
+
+ void computeTrace(const MachineBasicBlock*);
+ void computeDepthResources(const MachineBasicBlock*);
+ void computeHeightResources(const MachineBasicBlock*);
+ unsigned computeCrossBlockCriticalPath(const TraceBlockInfo&);
+ void computeInstrDepths(const MachineBasicBlock*);
+ void computeInstrHeights(const MachineBasicBlock*);
+ void addLiveIns(const MachineInstr *DefMI,
+ ArrayRef<const MachineBasicBlock*> Trace);
+
+ protected:
+ MachineTraceMetrics &MTM;
+ virtual const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) =0;
+ virtual const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) =0;
+ explicit Ensemble(MachineTraceMetrics*);
+ const MachineLoop *getLoopFor(const MachineBasicBlock*) const;
+ const TraceBlockInfo *getDepthResources(const MachineBasicBlock*) const;
+ const TraceBlockInfo *getHeightResources(const MachineBasicBlock*) const;
+
+ public:
+ virtual ~Ensemble();
+ virtual const char *getName() const =0;
+ void print(raw_ostream&) const;
+ void invalidate(const MachineBasicBlock *MBB);
+ void verify() const;
+
+ /// Get the trace that passes through MBB.
+ /// The trace is computed on demand.
+ Trace getTrace(const MachineBasicBlock *MBB);
+ };
+
+ /// Strategies for selecting traces.
+ enum Strategy {
+ /// Select the trace through a block that has the fewest instructions.
+ TS_MinInstrCount,
+
+ TS_NumStrategies
+ };
+
+ /// Get the trace ensemble representing the given trace selection strategy.
+ /// The returned Ensemble object is owned by the MachineTraceMetrics analysis,
+ /// and valid for the lifetime of the analysis pass.
+ Ensemble *getEnsemble(Strategy);
+
+ /// Invalidate cached information about MBB. This must be called *before* MBB
+ /// is erased, or the CFG is otherwise changed.
+ ///
+ /// This invalidates per-block information about resource usage for MBB only,
+ /// and it invalidates per-trace information for any trace that passes
+ /// through MBB.
+ ///
+ /// Call Ensemble::getTrace() again to update any trace handles.
+ void invalidate(const MachineBasicBlock *MBB);
+
+private:
+ // One entry per basic block, indexed by block number.
+ SmallVector<FixedBlockInfo, 4> BlockInfo;
+
+ // One ensemble per strategy.
+ Ensemble* Ensembles[TS_NumStrategies];
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+ const MachineTraceMetrics::Trace &Tr) {
+ Tr.print(OS);
+ return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+ const MachineTraceMetrics::Ensemble &En) {
+ En.print(OS);
+ return OS;
+}
+} // end namespace llvm
+
+#endif
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 74ba94d..172402e 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -89,8 +89,8 @@ namespace {
void addRegWithSubRegs(RegVector &RV, unsigned Reg) {
RV.push_back(Reg);
if (TargetRegisterInfo::isPhysicalRegister(Reg))
- for (const uint16_t *R = TRI->getSubRegisters(Reg); *R; R++)
- RV.push_back(*R);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ RV.push_back(*SubRegs);
}
struct BBInfo {
@@ -191,9 +191,11 @@ namespace {
void visitMachineFunctionBefore();
void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB);
+ void visitMachineBundleBefore(const MachineInstr *MI);
void visitMachineInstrBefore(const MachineInstr *MI);
void visitMachineOperand(const MachineOperand *MO, unsigned MONum);
void visitMachineInstrAfter(const MachineInstr *MI);
+ void visitMachineBundleAfter(const MachineInstr *MI);
void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB);
void visitMachineFunctionAfter();
@@ -201,6 +203,10 @@ namespace {
void report(const char *msg, const MachineBasicBlock *MBB);
void report(const char *msg, const MachineInstr *MI);
void report(const char *msg, const MachineOperand *MO, unsigned MONum);
+ void report(const char *msg, const MachineFunction *MF,
+ const LiveInterval &LI);
+ void report(const char *msg, const MachineBasicBlock *MBB,
+ const LiveInterval &LI);
void checkLiveness(const MachineOperand *MO, unsigned MONum);
void markReachable(const MachineBasicBlock *MBB);
@@ -210,6 +216,10 @@ namespace {
void calcRegsRequired();
void verifyLiveVariables();
void verifyLiveIntervals();
+ void verifyLiveInterval(const LiveInterval&);
+ void verifyLiveIntervalValue(const LiveInterval&, VNInfo*);
+ void verifyLiveIntervalSegment(const LiveInterval&,
+ LiveInterval::const_iterator);
};
struct MachineVerifierPass : public MachineFunctionPass {
@@ -288,6 +298,8 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end();
MFI!=MFE; ++MFI) {
visitMachineBasicBlockBefore(MFI);
+ // Keep track of the current bundle header.
+ const MachineInstr *CurBundle = 0;
for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(),
MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) {
if (MBBI->getParent() != MFI) {
@@ -295,15 +307,21 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
*OS << "Instruction: " << *MBBI;
continue;
}
- // Skip BUNDLE instruction for now. FIXME: We should add code to verify
- // the BUNDLE's specifically.
- if (MBBI->isBundle())
- continue;
+ // Is this a bundle header?
+ if (!MBBI->isInsideBundle()) {
+ if (CurBundle)
+ visitMachineBundleAfter(CurBundle);
+ CurBundle = MBBI;
+ visitMachineBundleBefore(CurBundle);
+ } else if (!CurBundle)
+ report("No bundle header", MBBI);
visitMachineInstrBefore(MBBI);
for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I)
visitMachineOperand(&MBBI->getOperand(I), I);
visitMachineInstrAfter(MBBI);
}
+ if (CurBundle)
+ visitMachineBundleAfter(CurBundle);
visitMachineBasicBlockAfter(MFI);
}
visitMachineFunctionAfter();
@@ -340,9 +358,9 @@ void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
assert(MBB);
report(msg, MBB->getParent());
- *OS << "- basic block: " << MBB->getName()
- << " " << (void*)MBB
- << " (BB#" << MBB->getNumber() << ")";
+ *OS << "- basic block: BB#" << MBB->getNumber()
+ << ' ' << MBB->getName()
+ << " (" << (void*)MBB << ')';
if (Indexes)
*OS << " [" << Indexes->getMBBStartIdx(MBB)
<< ';' << Indexes->getMBBEndIdx(MBB) << ')';
@@ -367,6 +385,28 @@ void MachineVerifier::report(const char *msg,
*OS << "\n";
}
+void MachineVerifier::report(const char *msg, const MachineFunction *MF,
+ const LiveInterval &LI) {
+ report(msg, MF);
+ *OS << "- interval: ";
+ if (TargetRegisterInfo::isVirtualRegister(LI.reg))
+ *OS << PrintReg(LI.reg, TRI);
+ else
+ *OS << PrintRegUnit(LI.reg, TRI);
+ *OS << ' ' << LI << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
+ const LiveInterval &LI) {
+ report(msg, MBB);
+ *OS << "- interval: ";
+ if (TargetRegisterInfo::isVirtualRegister(LI.reg))
+ *OS << PrintReg(LI.reg, TRI);
+ else
+ *OS << PrintRegUnit(LI.reg, TRI);
+ *OS << ' ' << LI << '\n';
+}
+
void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
BBInfo &MInfo = MBBInfoMap[MBB];
if (!MInfo.reachable) {
@@ -384,10 +424,10 @@ void MachineVerifier::visitMachineFunctionBefore() {
// A sub-register of a reserved register is also reserved
for (int Reg = regsReserved.find_first(); Reg>=0;
Reg = regsReserved.find_next(Reg)) {
- for (const uint16_t *Sub = TRI->getSubRegisters(Reg); *Sub; ++Sub) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
// FIXME: This should probably be:
- // assert(regsReserved.test(*Sub) && "Non-reserved sub-register");
- regsReserved.set(*Sub);
+ // assert(regsReserved.test(*SubRegs) && "Non-reserved sub-register");
+ regsReserved.set(*SubRegs);
}
}
@@ -466,8 +506,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
report("MBB exits via unconditional fall-through but its successor "
"differs from its CFG successor!", MBB);
}
- if (!MBB->empty() && MBB->back().isBarrier() &&
- !TII->isPredicated(&MBB->back())) {
+ if (!MBB->empty() && getBundleStart(&MBB->back())->isBarrier() &&
+ !TII->isPredicated(getBundleStart(&MBB->back()))) {
report("MBB exits via unconditional fall-through but ends with a "
"barrier instruction!", MBB);
}
@@ -487,10 +527,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
if (MBB->empty()) {
report("MBB exits via unconditional branch but doesn't contain "
"any instructions!", MBB);
- } else if (!MBB->back().isBarrier()) {
+ } else if (!getBundleStart(&MBB->back())->isBarrier()) {
report("MBB exits via unconditional branch but doesn't end with a "
"barrier instruction!", MBB);
- } else if (!MBB->back().isTerminator()) {
+ } else if (!getBundleStart(&MBB->back())->isTerminator()) {
report("MBB exits via unconditional branch but the branch isn't a "
"terminator instruction!", MBB);
}
@@ -510,10 +550,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
if (MBB->empty()) {
report("MBB exits via conditional branch/fall-through but doesn't "
"contain any instructions!", MBB);
- } else if (MBB->back().isBarrier()) {
+ } else if (getBundleStart(&MBB->back())->isBarrier()) {
report("MBB exits via conditional branch/fall-through but ends with a "
"barrier instruction!", MBB);
- } else if (!MBB->back().isTerminator()) {
+ } else if (!getBundleStart(&MBB->back())->isTerminator()) {
report("MBB exits via conditional branch/fall-through but the branch "
"isn't a terminator instruction!", MBB);
}
@@ -530,10 +570,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
if (MBB->empty()) {
report("MBB exits via conditional branch/branch but doesn't "
"contain any instructions!", MBB);
- } else if (!MBB->back().isBarrier()) {
+ } else if (!getBundleStart(&MBB->back())->isBarrier()) {
report("MBB exits via conditional branch/branch but doesn't end with a "
"barrier instruction!", MBB);
- } else if (!MBB->back().isTerminator()) {
+ } else if (!getBundleStart(&MBB->back())->isTerminator()) {
report("MBB exits via conditional branch/branch but the branch "
"isn't a terminator instruction!", MBB);
}
@@ -554,8 +594,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
continue;
}
regsLive.insert(*I);
- for (const uint16_t *R = TRI->getSubRegisters(*I); *R; R++)
- regsLive.insert(*R);
+ for (MCSubRegIterator SubRegs(*I, TRI); SubRegs.isValid(); ++SubRegs)
+ regsLive.insert(*SubRegs);
}
regsLiveInButUnused = regsLive;
@@ -564,8 +604,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
BitVector PR = MFI->getPristineRegs(MBB);
for (int I = PR.find_first(); I>0; I = PR.find_next(I)) {
regsLive.insert(I);
- for (const uint16_t *R = TRI->getSubRegisters(I); *R; R++)
- regsLive.insert(*R);
+ for (MCSubRegIterator SubRegs(I, TRI); SubRegs.isValid(); ++SubRegs)
+ regsLive.insert(*SubRegs);
}
regsKilled.clear();
@@ -575,6 +615,30 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
lastIndex = Indexes->getMBBStartIdx(MBB);
}
+// This function gets called for all bundle headers, including normal
+// stand-alone unbundled instructions.
+void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
+ if (Indexes && Indexes->hasIndex(MI)) {
+ SlotIndex idx = Indexes->getInstructionIndex(MI);
+ if (!(idx > lastIndex)) {
+ report("Instruction index out of order", MI);
+ *OS << "Last instruction was at " << lastIndex << '\n';
+ }
+ lastIndex = idx;
+ }
+
+ // Ensure non-terminators don't follow terminators.
+ // Ignore predicated terminators formed by if conversion.
+ // FIXME: If conversion shouldn't need to violate this rule.
+ if (MI->isTerminator() && !TII->isPredicated(MI)) {
+ if (!FirstTerminator)
+ FirstTerminator = MI;
+ } else if (FirstTerminator) {
+ report("Non-terminator instruction after the first terminator", MI);
+ *OS << "First terminator was:\t" << *FirstTerminator;
+ }
+}
+
void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
const MCInstrDesc &MCID = MI->getDesc();
if (MI->getNumOperands() < MCID.getNumOperands()) {
@@ -608,17 +672,6 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
}
- // Ensure non-terminators don't follow terminators.
- // Ignore predicated terminators formed by if conversion.
- // FIXME: If conversion shouldn't need to violate this rule.
- if (MI->isTerminator() && !TII->isPredicated(MI)) {
- if (!FirstTerminator)
- FirstTerminator = MI;
- } else if (FirstTerminator) {
- report("Non-terminator instruction after the first terminator", MI);
- *OS << "First terminator was:\t" << *FirstTerminator;
- }
-
StringRef ErrorInfo;
if (!TII->verifyInstruction(MI, ErrorInfo))
report(ErrorInfo.data(), MI);
@@ -634,7 +687,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
if (MONum < MCID.getNumDefs()) {
if (!MO->isReg())
report("Explicit definition must be a register", MO, MONum);
- else if (!MO->isDef())
+ else if (!MO->isDef() && !MCOI.isOptionalDef())
report("Explicit definition marked as use", MO, MONum);
else if (MO->isImplicit())
report("Explicit definition marked as implicit", MO, MONum);
@@ -662,6 +715,12 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
if (MRI->tracksLiveness() && !MI->isDebugValue())
checkLiveness(MO, MONum);
+ // Verify two-address constraints after leaving SSA form.
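+ // For example, after two-address conversion "%vreg1 = ADD32rr %vreg1,
+ // %vreg2" ties the use of %vreg1 to the def, so both operands must name
+ // the same register.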
+ unsigned DefIdx;
+ if (!MRI->isSSA() && MO->isUse() &&
+ MI->isRegTiedToDefOperand(MONum, &DefIdx) &&
+ Reg != MI->getOperand(DefIdx).getReg())
+ report("Two-address instruction operands must be identical", MO, MONum);
// Check register classes.
if (MONum < MCID.getNumOperands() && !MO->isImplicit()) {
@@ -672,7 +731,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
report("Illegal subregister index for physical register", MO, MONum);
return;
}
- if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) {
+ if (const TargetRegisterClass *DRC =
+ TII->getRegClass(MCID, MONum, TRI, *MF)) {
if (!DRC->contains(Reg)) {
report("Illegal physical register for instruction", MO, MONum);
*OS << TRI->getName(Reg) << " is not a "
@@ -698,7 +758,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
return;
}
}
- if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) {
+ if (const TargetRegisterClass *DRC =
+ TII->getRegClass(MCID, MONum, TRI, *MF)) {
if (SubIdx) {
const TargetRegisterClass *SuperRC =
TRI->getLargestLegalSuperClass(RC);
@@ -761,20 +822,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
if (MO->readsReg()) {
regsLiveInButUnused.erase(Reg);
- bool isKill = false;
- unsigned defIdx;
- if (MI->isRegTiedToDefOperand(MONum, &defIdx)) {
- // A two-addr use counts as a kill if use and def are the same.
- unsigned DefReg = MI->getOperand(defIdx).getReg();
- if (Reg == DefReg)
- isKill = true;
- else if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
- report("Two-address instruction operands must be identical", MO, MONum);
- }
- } else
- isKill = MO->isKill();
-
- if (isKill)
+ if (MO->isKill())
addRegWithSubRegs(regsKilled, Reg);
// Check that LiveVars knows this kill.
@@ -786,23 +834,44 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
}
// Check LiveInts liveness and kill.
- if (TargetRegisterInfo::isVirtualRegister(Reg) &&
- LiveInts && !LiveInts->isNotInMIMap(MI)) {
- SlotIndex UseIdx = LiveInts->getInstructionIndex(MI).getRegSlot(true);
- if (LiveInts->hasInterval(Reg)) {
- const LiveInterval &LI = LiveInts->getInterval(Reg);
- if (!LI.liveAt(UseIdx)) {
- report("No live range at use", MO, MONum);
- *OS << UseIdx << " is not live in " << LI << '\n';
+ if (LiveInts && !LiveInts->isNotInMIMap(MI)) {
+ SlotIndex UseIdx = LiveInts->getInstructionIndex(MI);
+ // Check the cached regunit intervals.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ if (const LiveInterval *LI = LiveInts->getCachedRegUnit(*Units)) {
+ LiveRangeQuery LRQ(*LI, UseIdx);
+ if (!LRQ.valueIn()) {
+ report("No live range at use", MO, MONum);
+ *OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
+ << ' ' << *LI << '\n';
+ }
+ if (MO->isKill() && !LRQ.isKill()) {
+ report("Live range continues after kill flag", MO, MONum);
+ *OS << PrintRegUnit(*Units, TRI) << ' ' << *LI << '\n';
+ }
+ }
}
- // Check for extra kill flags.
- // Note that we allow missing kill flags for now.
- if (MO->isKill() && !LI.killedAt(UseIdx.getRegSlot())) {
- report("Live range continues after kill flag", MO, MONum);
- *OS << "Live range: " << LI << '\n';
+ }
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (LiveInts->hasInterval(Reg)) {
+ // This is a virtual register interval.
+ const LiveInterval &LI = LiveInts->getInterval(Reg);
+ LiveRangeQuery LRQ(LI, UseIdx);
+ if (!LRQ.valueIn()) {
+ report("No live range at use", MO, MONum);
+ *OS << UseIdx << " is not live in " << LI << '\n';
+ }
+ // Check for extra kill flags.
+ // Note that we allow missing kill flags for now.
+ if (MO->isKill() && !LRQ.isKill()) {
+ report("Live range continues after kill flag", MO, MONum);
+ *OS << "Live range: " << LI << '\n';
+ }
+ } else {
+ report("Virtual register has no live interval", MO, MONum);
}
- } else {
- report("Virtual register has no Live interval", MO, MONum);
}
}
@@ -812,6 +881,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
// Reserved registers may be used even when 'dead'.
if (!isReserved(Reg))
report("Using an undefined physical register", MO, MONum);
+ } else if (MRI->def_empty(Reg)) {
+ report("Reading virtual register without a def", MO, MONum);
} else {
BBInfo &MInfo = MBBInfoMap[MI->getParent()];
// We don't know which virtual registers are live in, so only complain
@@ -841,12 +912,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
// Check LiveInts for a live range, but only for virtual registers.
if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) &&
!LiveInts->isNotInMIMap(MI)) {
- SlotIndex DefIdx = LiveInts->getInstructionIndex(MI).getRegSlot();
+ SlotIndex DefIdx = LiveInts->getInstructionIndex(MI);
+ DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber());
if (LiveInts->hasInterval(Reg)) {
const LiveInterval &LI = LiveInts->getInterval(Reg);
if (const VNInfo *VNI = LI.getVNInfoAt(DefIdx)) {
assert(VNI && "NULL valno is not allowed");
- if (VNI->def != DefIdx && !MO->isEarlyClobber()) {
+ if (VNI->def != DefIdx) {
report("Inconsistent valno->def", MO, MONum);
*OS << "Valno " << VNI->id << " is not defined at "
<< DefIdx << " in " << LI << '\n';
@@ -863,6 +935,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
}
void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) {
+}
+
+// This function gets called after visiting all instructions in a bundle. The
+// argument points to the bundle header.
+// Normal stand-alone instructions are also considered 'bundles', and this
+// function is called for all of them.
+void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) {
BBInfo &MInfo = MBBInfoMap[MI->getParent()];
set_union(MInfo.regsKilled, regsKilled);
set_subtract(regsLive, regsKilled); regsKilled.clear();
@@ -876,15 +955,6 @@ void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) {
}
set_subtract(regsLive, regsDead); regsDead.clear();
set_union(regsLive, regsDefined); regsDefined.clear();
-
- if (Indexes && Indexes->hasIndex(MI)) {
- SlotIndex idx = Indexes->getInstructionIndex(MI);
- if (!(idx > lastIndex)) {
- report("Instruction index out of order", MI);
- *OS << "Last instruction was at " << lastIndex << '\n';
- }
- lastIndex = idx;
- }
}
void
@@ -1025,7 +1095,21 @@ void MachineVerifier::visitMachineFunctionAfter() {
// Now check liveness info if available
calcRegsRequired();
- if (MRI->isSSA() && !MF->empty()) {
+ // Check for killed virtual registers that should be live out.
+ for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+ MFI != MFE; ++MFI) {
+ BBInfo &MInfo = MBBInfoMap[MFI];
+ for (RegSet::iterator
+ I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
+ ++I)
+ if (MInfo.regsKilled.count(*I)) {
+ report("Virtual register killed in block, but needed live out.", MFI);
+ *OS << "Virtual register " << PrintReg(*I)
+ << " is used after the block.\n";
+ }
+ }
+
+ if (!MF->empty()) {
BBInfo &MInfo = MBBInfoMap[&MF->front()];
for (RegSet::iterator
I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
@@ -1069,292 +1153,298 @@ void MachineVerifier::verifyLiveVariables() {
void MachineVerifier::verifyLiveIntervals() {
assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts");
- for (LiveIntervals::const_iterator LVI = LiveInts->begin(),
- LVE = LiveInts->end(); LVI != LVE; ++LVI) {
- const LiveInterval &LI = *LVI->second;
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
// Spilling and splitting may leave unused registers around. Skip them.
- if (MRI->use_empty(LI.reg))
+ if (MRI->reg_nodbg_empty(Reg))
continue;
- // Physical registers have much weirdness going on, mostly from coalescing.
- // We should probably fix it, but for now just ignore them.
- if (TargetRegisterInfo::isPhysicalRegister(LI.reg))
+ if (!LiveInts->hasInterval(Reg)) {
+ report("Missing live interval for virtual register", MF);
+ *OS << PrintReg(Reg, TRI) << " still has defs or uses\n";
continue;
+ }
- assert(LVI->first == LI.reg && "Invalid reg to interval mapping");
+ const LiveInterval &LI = LiveInts->getInterval(Reg);
+ assert(Reg == LI.reg && "Invalid reg to interval mapping");
+ verifyLiveInterval(LI);
+ }
- for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
- I!=E; ++I) {
- VNInfo *VNI = *I;
- const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def);
+ // Verify all the cached regunit intervals.
+ for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
+ if (const LiveInterval *LI = LiveInts->getCachedRegUnit(i))
+ verifyLiveInterval(*LI);
+}
- if (!DefVNI) {
- if (!VNI->isUnused()) {
- report("Valno not live at def and not marked unused", MF);
- *OS << "Valno #" << VNI->id << " in " << LI << '\n';
- }
- continue;
- }
+void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
+ VNInfo *VNI) {
+ if (VNI->isUnused())
+ return;
- if (VNI->isUnused())
- continue;
+ const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def);
- if (DefVNI != VNI) {
- report("Live range at def has different valno", MF);
- *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
- << " where valno #" << DefVNI->id << " is live in " << LI << '\n';
- continue;
- }
+ if (!DefVNI) {
+ report("Valno not live at def and not marked unused", MF, LI);
+ *OS << "Valno #" << VNI->id << '\n';
+ return;
+ }
- const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
- if (!MBB) {
- report("Invalid definition index", MF);
- *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
- << " in " << LI << '\n';
- continue;
- }
+ if (DefVNI != VNI) {
+ report("Live range at def has different valno", MF, LI);
+ *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+ << " where valno #" << DefVNI->id << " is live\n";
+ return;
+ }
- if (VNI->isPHIDef()) {
- if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
- report("PHIDef value is not defined at MBB start", MF);
- *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
- << ", not at the beginning of BB#" << MBB->getNumber()
- << " in " << LI << '\n';
- }
- } else {
- // Non-PHI def.
- const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
- if (!MI) {
- report("No instruction at def index", MF);
- *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
- << " in " << LI << '\n';
- continue;
- }
+ const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
+ if (!MBB) {
+ report("Invalid definition index", MF, LI);
+ *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+ << " in " << LI << '\n';
+ return;
+ }
- bool hasDef = false;
- bool isEarlyClobber = false;
- for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
- if (!MOI->isReg() || !MOI->isDef())
- continue;
- if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
- if (MOI->getReg() != LI.reg)
- continue;
- } else {
- if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
- !TRI->regsOverlap(LI.reg, MOI->getReg()))
- continue;
- }
- hasDef = true;
- if (MOI->isEarlyClobber())
- isEarlyClobber = true;
- }
+ if (VNI->isPHIDef()) {
+ if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
+ report("PHIDef value is not defined at MBB start", MBB, LI);
+ *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+ << ", not at the beginning of BB#" << MBB->getNumber() << '\n';
+ }
+ return;
+ }
- if (!hasDef) {
- report("Defining instruction does not modify register", MI);
- *OS << "Valno #" << VNI->id << " in " << LI << '\n';
- }
+ // Non-PHI def.
+ const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
+ if (!MI) {
+ report("No instruction at def index", MBB, LI);
+ *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+ return;
+ }
- // Early clobber defs begin at USE slots, but other defs must begin at
- // DEF slots.
- if (isEarlyClobber) {
- if (!VNI->def.isEarlyClobber()) {
- report("Early clobber def must be at an early-clobber slot", MF);
- *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
- << " in " << LI << '\n';
- }
- } else if (!VNI->def.isRegister()) {
- report("Non-PHI, non-early clobber def must be at a register slot",
- MF);
- *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
- << " in " << LI << '\n';
- }
- }
+ bool hasDef = false;
+ bool isEarlyClobber = false;
+ for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
+ if (MOI->getReg() != LI.reg)
+ continue;
+ } else {
+ if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
+ !TRI->hasRegUnit(MOI->getReg(), LI.reg))
+ continue;
}
+ hasDef = true;
+ if (MOI->isEarlyClobber())
+ isEarlyClobber = true;
+ }
- for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I) {
- const VNInfo *VNI = I->valno;
- assert(VNI && "Live range has no valno");
+ if (!hasDef) {
+ report("Defining instruction does not modify register", MI);
+ *OS << "Valno #" << VNI->id << " in " << LI << '\n';
+ }
- if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) {
- report("Foreign valno in live range", MF);
- I->print(*OS);
- *OS << " has a valno not in " << LI << '\n';
- }
+ // Early clobber defs begin at USE slots, but other defs must begin at
+ // DEF slots.
+ if (isEarlyClobber) {
+ if (!VNI->def.isEarlyClobber()) {
+ report("Early clobber def must be at an early-clobber slot", MBB, LI);
+ *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+ }
+ } else if (!VNI->def.isRegister()) {
+ report("Non-PHI, non-early clobber def must be at a register slot",
+ MBB, LI);
+ *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+ }
+}
- if (VNI->isUnused()) {
- report("Live range valno is marked unused", MF);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- }
+void
+MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
+ LiveInterval::const_iterator I) {
+ const VNInfo *VNI = I->valno;
+ assert(VNI && "Live range has no valno");
+
+ if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) {
+ report("Foreign valno in live range", MF, LI);
+ *OS << *I << " has a bad valno\n";
+ }
- const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start);
- if (!MBB) {
- report("Bad start of live segment, no basic block", MF);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- continue;
- }
- SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
- if (I->start != MBBStartIdx && I->start != VNI->def) {
- report("Live segment must begin at MBB entry or valno def", MBB);
- I->print(*OS);
- *OS << " in " << LI << '\n' << "Basic block starts at "
- << MBBStartIdx << '\n';
- }
+ if (VNI->isUnused()) {
+ report("Live range valno is marked unused", MF, LI);
+ *OS << *I << '\n';
+ }
- const MachineBasicBlock *EndMBB =
- LiveInts->getMBBFromIndex(I->end.getPrevSlot());
- if (!EndMBB) {
- report("Bad end of live segment, no basic block", MF);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- continue;
- }
+ const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start);
+ if (!MBB) {
+ report("Bad start of live segment, no basic block", MF, LI);
+ *OS << *I << '\n';
+ return;
+ }
+ SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
+ if (I->start != MBBStartIdx && I->start != VNI->def) {
+ report("Live segment must begin at MBB entry or valno def", MBB, LI);
+ *OS << *I << '\n';
+ }
- // No more checks for live-out segments.
- if (I->end == LiveInts->getMBBEndIdx(EndMBB))
- continue;
+ const MachineBasicBlock *EndMBB =
+ LiveInts->getMBBFromIndex(I->end.getPrevSlot());
+ if (!EndMBB) {
+ report("Bad end of live segment, no basic block", MF, LI);
+ *OS << *I << '\n';
+ return;
+ }
- // The live segment is ending inside EndMBB
- const MachineInstr *MI =
- LiveInts->getInstructionFromIndex(I->end.getPrevSlot());
- if (!MI) {
- report("Live segment doesn't end at a valid instruction", EndMBB);
- I->print(*OS);
- *OS << " in " << LI << '\n' << "Basic block starts at "
- << MBBStartIdx << '\n';
+ // No more checks for live-out segments.
+ if (I->end == LiveInts->getMBBEndIdx(EndMBB))
+ return;
+
+  // RegUnit intervals are allowed to have dead PHI-defs.

+ if (!TargetRegisterInfo::isVirtualRegister(LI.reg) && VNI->isPHIDef() &&
+ I->start == VNI->def && I->end == VNI->def.getDeadSlot())
+ return;
+
+ // The live segment is ending inside EndMBB
+ const MachineInstr *MI =
+ LiveInts->getInstructionFromIndex(I->end.getPrevSlot());
+ if (!MI) {
+ report("Live segment doesn't end at a valid instruction", EndMBB, LI);
+ *OS << *I << '\n';
+ return;
+ }
+
+ // The block slot must refer to a basic block boundary.
+ if (I->end.isBlock()) {
+ report("Live segment ends at B slot of an instruction", EndMBB, LI);
+ *OS << *I << '\n';
+ }
+
+ if (I->end.isDead()) {
+ // Segment ends on the dead slot.
+ // That means there must be a dead def.
+ if (!SlotIndex::isSameInstr(I->start, I->end)) {
+ report("Live segment ending at dead slot spans instructions", EndMBB, LI);
+ *OS << *I << '\n';
+ }
+ }
+
+ // A live segment can only end at an early-clobber slot if it is being
+ // redefined by an early-clobber def.
+ if (I->end.isEarlyClobber()) {
+ if (I+1 == LI.end() || (I+1)->start != I->end) {
+ report("Live segment ending at early clobber slot must be "
+ "redefined by an EC def in the same instruction", EndMBB, LI);
+ *OS << *I << '\n';
+ }
+ }
+
+ // The following checks only apply to virtual registers. Physreg liveness
+ // is too weird to check.
+ if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
+ // A live range can end with either a redefinition, a kill flag on a
+ // use, or a dead flag on a def.
+ bool hasRead = false;
+ bool hasDeadDef = false;
+ for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || MOI->getReg() != LI.reg)
continue;
- }
+ if (MOI->readsReg())
+ hasRead = true;
+ if (MOI->isDef() && MOI->isDead())
+ hasDeadDef = true;
+ }
- // The block slot must refer to a basic block boundary.
- if (I->end.isBlock()) {
- report("Live segment ends at B slot of an instruction", MI);
+ if (I->end.isDead()) {
+ if (!hasDeadDef) {
+ report("Instruction doesn't have a dead def operand", MI);
I->print(*OS);
*OS << " in " << LI << '\n';
}
-
- if (I->end.isDead()) {
- // Segment ends on the dead slot.
- // That means there must be a dead def.
- if (!SlotIndex::isSameInstr(I->start, I->end)) {
- report("Live segment ending at dead slot spans instructions", MI);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- }
- }
-
- // A live segment can only end at an early-clobber slot if it is being
- // redefined by an early-clobber def.
- if (I->end.isEarlyClobber()) {
- if (I+1 == E || (I+1)->start != I->end) {
- report("Live segment ending at early clobber slot must be "
- "redefined by an EC def in the same instruction", MI);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- }
+ } else {
+ if (!hasRead) {
+ report("Instruction ending live range doesn't read the register", MI);
+ *OS << *I << " in " << LI << '\n';
}
+ }
+ }
- // The following checks only apply to virtual registers. Physreg liveness
- // is too weird to check.
- if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
- // A live range can end with either a redefinition, a kill flag on a
- // use, or a dead flag on a def.
- bool hasRead = false;
- bool hasDeadDef = false;
- for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
- if (!MOI->isReg() || MOI->getReg() != LI.reg)
- continue;
- if (MOI->readsReg())
- hasRead = true;
- if (MOI->isDef() && MOI->isDead())
- hasDeadDef = true;
- }
-
- if (I->end.isDead()) {
- if (!hasDeadDef) {
- report("Instruction doesn't have a dead def operand", MI);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- }
- } else {
- if (!hasRead) {
- report("Instruction ending live range doesn't read the register",
- MI);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- }
- }
- }
+ // Now check all the basic blocks in this live segment.
+ MachineFunction::const_iterator MFI = MBB;
+ // Is this live range the beginning of a non-PHIDef VN?
+ if (I->start == VNI->def && !VNI->isPHIDef()) {
+ // Not live-in to any blocks.
+ if (MBB == EndMBB)
+ return;
+ // Skip this block.
+ ++MFI;
+ }
+ for (;;) {
+ assert(LiveInts->isLiveInToMBB(LI, MFI));
+ // We don't know how to track physregs into a landing pad.
+ if (!TargetRegisterInfo::isVirtualRegister(LI.reg) &&
+ MFI->isLandingPad()) {
+ if (&*MFI == EndMBB)
+ break;
+ ++MFI;
+ continue;
+ }
- // Now check all the basic blocks in this live segment.
- MachineFunction::const_iterator MFI = MBB;
- // Is this live range the beginning of a non-PHIDef VN?
- if (I->start == VNI->def && !VNI->isPHIDef()) {
- // Not live-in to any blocks.
- if (MBB == EndMBB)
- continue;
- // Skip this block.
- ++MFI;
+ // Is VNI a PHI-def in the current block?
+ bool IsPHI = VNI->isPHIDef() &&
+ VNI->def == LiveInts->getMBBStartIdx(MFI);
+
+ // Check that VNI is live-out of all predecessors.
+ for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
+ PE = MFI->pred_end(); PI != PE; ++PI) {
+ SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
+ const VNInfo *PVNI = LI.getVNInfoBefore(PEnd);
+
+ // All predecessors must have a live-out value.
+ if (!PVNI) {
+ report("Register not marked live out of predecessor", *PI, LI);
+ *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
+ << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
+ << PEnd << '\n';
+ continue;
}
- for (;;) {
- assert(LiveInts->isLiveInToMBB(LI, MFI));
- // We don't know how to track physregs into a landing pad.
- if (TargetRegisterInfo::isPhysicalRegister(LI.reg) &&
- MFI->isLandingPad()) {
- if (&*MFI == EndMBB)
- break;
- ++MFI;
- continue;
- }
- // Check that VNI is live-out of all predecessors.
- for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
- PE = MFI->pred_end(); PI != PE; ++PI) {
- SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
- const VNInfo *PVNI = LI.getVNInfoBefore(PEnd);
-
- if (VNI->isPHIDef() && VNI->def == LiveInts->getMBBStartIdx(MFI))
- continue;
-
- if (!PVNI) {
- report("Register not marked live out of predecessor", *PI);
- *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
- << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
- << PEnd << " in " << LI << '\n';
- continue;
- }
- if (PVNI != VNI) {
- report("Different value live out of predecessor", *PI);
- *OS << "Valno #" << PVNI->id << " live out of BB#"
- << (*PI)->getNumber() << '@' << PEnd
- << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
- << '@' << LiveInts->getMBBStartIdx(MFI) << " in " << LI << '\n';
- }
- }
- if (&*MFI == EndMBB)
- break;
- ++MFI;
+ // Only PHI-defs can take different predecessor values.
+ if (!IsPHI && PVNI != VNI) {
+ report("Different value live out of predecessor", *PI, LI);
+ *OS << "Valno #" << PVNI->id << " live out of BB#"
+ << (*PI)->getNumber() << '@' << PEnd
+ << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
+ << '@' << LiveInts->getMBBStartIdx(MFI) << '\n';
}
}
+ if (&*MFI == EndMBB)
+ break;
+ ++MFI;
+ }
+}
- // Check the LI only has one connected component.
- if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
- ConnectedVNInfoEqClasses ConEQ(*LiveInts);
- unsigned NumComp = ConEQ.Classify(&LI);
- if (NumComp > 1) {
- report("Multiple connected components in live interval", MF);
- *OS << NumComp << " components in " << LI << '\n';
- for (unsigned comp = 0; comp != NumComp; ++comp) {
- *OS << comp << ": valnos";
- for (LiveInterval::const_vni_iterator I = LI.vni_begin(),
- E = LI.vni_end(); I!=E; ++I)
- if (comp == ConEQ.getEqClass(*I))
- *OS << ' ' << (*I)->id;
- *OS << '\n';
- }
+void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
+ for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+ I!=E; ++I)
+ verifyLiveIntervalValue(LI, *I);
+
+ for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I)
+ verifyLiveIntervalSegment(LI, I);
+
+ // Check the LI only has one connected component.
+ if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
+ ConnectedVNInfoEqClasses ConEQ(*LiveInts);
+ unsigned NumComp = ConEQ.Classify(&LI);
+ if (NumComp > 1) {
+ report("Multiple connected components in live interval", MF, LI);
+ for (unsigned comp = 0; comp != NumComp; ++comp) {
+ *OS << comp << ": valnos";
+ for (LiveInterval::const_vni_iterator I = LI.vni_begin(),
+ E = LI.vni_end(); I!=E; ++I)
+ if (comp == ConEQ.getEqClass(*I))
+ *OS << ' ' << (*I)->id;
+ *OS << '\n';
}
}
}
}
-
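Before moving on, a hedged sketch of how these verifier checks are normally exercised; createMachineVerifierPass() is the factory used by printAndVerify() later in this patch, and MyTargetPass below is hypothetical.

    // Hypothetical driver code:
    PM.add(createMyTargetPass());                            // made-up pass
    PM.add(createMachineVerifierPass("After MyTargetPass"));
    // Equivalently, from the command line on any llc invocation:
    //   llc -verify-machineinstrs foo.ll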
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 0ed4c34..e6e23da 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -171,23 +171,30 @@ bool PHIElimination::EliminatePHINodes(MachineFunction &MF,
return true;
}
+/// isImplicitlyDefined - Return true if all defs of VirtReg are implicit-defs.
+/// This includes registers with no defs.
+static bool isImplicitlyDefined(unsigned VirtReg,
+ const MachineRegisterInfo *MRI) {
+ for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(VirtReg),
+ DE = MRI->def_end(); DI != DE; ++DI)
+ if (!DI->isImplicitDef())
+ return false;
+ return true;
+}
+
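A hypothetical shape the new helper recognizes (register numbers are made up):

    //   %vreg7<def> = IMPLICIT_DEF
    //   %vreg9<def> = PHI %vreg7, <BB#0>, %vreg8, <BB#1>
    // isImplicitlyDefined(%vreg7, MRI) is true; a register such as %vreg8
    // with no defs at all is treated the same way.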
/// isSourceDefinedByImplicitDef - Return true if all sources of the phi node
/// are implicit_def's.
static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi,
const MachineRegisterInfo *MRI) {
- for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
- unsigned SrcReg = MPhi->getOperand(i).getReg();
- const MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
- if (!DefMI || !DefMI->isImplicitDef())
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
+ if (!isImplicitlyDefined(MPhi->getOperand(i).getReg(), MRI))
return false;
- }
return true;
}
-
/// LowerAtomicPHINode - Lower the PHI node at the top of the specified block,
-/// under the assuption that it needs to be lowered in a way that supports
+/// under the assumption that it needs to be lowered in a way that supports
/// atomic execution of PHIs. This lowering method is always correct.
///
@@ -287,7 +294,8 @@ void PHIElimination::LowerAtomicPHINode(
for (int i = NumSrcs - 1; i >= 0; --i) {
unsigned SrcReg = MPhi->getOperand(i*2+1).getReg();
unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg();
-
+ bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() ||
+ isImplicitlyDefined(SrcReg, MRI);
assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
"Machine PHI Operands must all be virtual registers!");
@@ -295,14 +303,6 @@ void PHIElimination::LowerAtomicPHINode(
// path the PHI.
MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB();
- // If source is defined by an implicit def, there is no need to insert a
- // copy.
- MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
- if (DefMI->isImplicitDef()) {
- ImpDefs.insert(DefMI);
- continue;
- }
-
// Check to make sure we haven't already emitted the copy for this block.
// This can happen because PHI nodes may have multiple entries for the same
// basic block.
@@ -315,12 +315,27 @@ void PHIElimination::LowerAtomicPHINode(
findPHICopyInsertPoint(&opBlock, &MBB, SrcReg);
// Insert the copy.
- if (!reusedIncoming && IncomingReg)
- BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
- TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg, 0, SrcSubReg);
+ if (!reusedIncoming && IncomingReg) {
+ if (SrcUndef) {
+ // The source register is undefined, so there is no need for a real
+ // COPY, but we still need to ensure joint dominance by defs.
+ // Insert an IMPLICIT_DEF instruction.
+ BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), IncomingReg);
+
+ // Clean up the old implicit-def, if there even was one.
+ if (MachineInstr *DefMI = MRI->getVRegDef(SrcReg))
+ if (DefMI->isImplicitDef())
+ ImpDefs.insert(DefMI);
+ } else {
+ BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), IncomingReg)
+ .addReg(SrcReg, 0, SrcSubReg);
+ }
+ }
// Now update live variable information if we have it. Otherwise we're done
- if (!LV) continue;
+ if (SrcUndef || !LV) continue;
// We want to be able to insert a kill of the register if this PHI (aka, the
// copy we just inserted) is the last use of the source value. Live
@@ -340,39 +355,35 @@ void PHIElimination::LowerAtomicPHINode(
// add a kill marker in this block saying that it kills the incoming value!
if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) {
// In our final twist, we have to decide which instruction kills the
- // register. In most cases this is the copy, however, the first
- // terminator instruction at the end of the block may also use the value.
- // In this case, we should mark *it* as being the killing block, not the
- // copy.
- MachineBasicBlock::iterator KillInst;
- MachineBasicBlock::iterator Term = opBlock.getFirstTerminator();
- if (Term != opBlock.end() && Term->readsRegister(SrcReg)) {
- KillInst = Term;
-
- // Check that no other terminators use values.
-#ifndef NDEBUG
- for (MachineBasicBlock::iterator TI = llvm::next(Term);
- TI != opBlock.end(); ++TI) {
- if (TI->isDebugValue())
- continue;
- assert(!TI->readsRegister(SrcReg) &&
- "Terminator instructions cannot use virtual registers unless"
- "they are the first terminator in a block!");
- }
-#endif
- } else if (reusedIncoming || !IncomingReg) {
- // We may have to rewind a bit if we didn't insert a copy this time.
- KillInst = Term;
- while (KillInst != opBlock.begin()) {
- --KillInst;
- if (KillInst->isDebugValue())
- continue;
- if (KillInst->readsRegister(SrcReg))
- break;
+ // register. In most cases this is the copy, however, terminator
+ // instructions at the end of the block may also use the value. In this
+      // case, we should mark the last such terminator as the killing
+      // instruction, not the copy.
+ MachineBasicBlock::iterator KillInst = opBlock.end();
+ MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
+ for (MachineBasicBlock::iterator Term = FirstTerm;
+ Term != opBlock.end(); ++Term) {
+ if (Term->readsRegister(SrcReg))
+ KillInst = Term;
+ }
+
+ if (KillInst == opBlock.end()) {
+ // No terminator uses the register.
+
+ if (reusedIncoming || !IncomingReg) {
+ // We may have to rewind a bit if we didn't insert a copy this time.
+ KillInst = FirstTerm;
+ while (KillInst != opBlock.begin()) {
+ --KillInst;
+ if (KillInst->isDebugValue())
+ continue;
+ if (KillInst->readsRegister(SrcReg))
+ break;
+ }
+ } else {
+ // We just inserted this copy.
+ KillInst = prior(InsertPos);
}
- } else {
- // We just inserted this copy.
- KillInst = prior(InsertPos);
}
assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction");
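A hypothetical illustration of the kill placement chosen above (pseudo machine code; the opcodes are made up):

    //   %IncomingReg = COPY %SrcReg    ; the PHI copy inserted earlier
    //   CBR %SrcReg, <BB#3>            ; first terminator reads SrcReg
    //   CBR %SrcReg<kill>, <BB#4>      ; the last reader gets the kill flag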
@@ -412,28 +423,71 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad())
return false; // Quick exit for basic blocks without PHIs.
+ const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : 0;
+ bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader();
+
bool Changed = false;
for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end();
BBI != BBE && BBI->isPHI(); ++BBI) {
for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
unsigned Reg = BBI->getOperand(i).getReg();
MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB();
- // We break edges when registers are live out from the predecessor block
- // (not considering PHI nodes). If the register is live in to this block
- // anyway, we would gain nothing from splitting.
+ // Is there a critical edge from PreMBB to MBB?
+ if (PreMBB->succ_size() == 1)
+ continue;
+
// Avoid splitting backedges of loops. It would introduce small
// out-of-line blocks into the loop which is very bad for code placement.
- if (PreMBB != &MBB &&
- !LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB)) {
- if (!MLI ||
- !(MLI->getLoopFor(PreMBB) == MLI->getLoopFor(&MBB) &&
- MLI->isLoopHeader(&MBB))) {
- if (PreMBB->SplitCriticalEdge(&MBB, this)) {
- Changed = true;
- ++NumCriticalEdgesSplit;
- }
- }
+ if (PreMBB == &MBB)
+ continue;
+ const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : 0;
+ if (IsLoopHeader && PreLoop == CurLoop)
+ continue;
+
+ // LV doesn't consider a phi use live-out, so isLiveOut only returns true
+      // when the source register is live-out for a reason other than a phi
+ // use. That means the copy we will insert in PreMBB won't be a kill, and
+ // there is a risk it may not be coalesced away.
+ //
+ // If the copy would be a kill, there is no need to split the edge.
+ if (!LV.isLiveOut(Reg, *PreMBB))
+ continue;
+
+ DEBUG(dbgs() << PrintReg(Reg) << " live-out before critical edge BB#"
+ << PreMBB->getNumber() << " -> BB#" << MBB.getNumber()
+ << ": " << *BBI);
+
+ // If Reg is not live-in to MBB, it means it must be live-in to some
+ // other PreMBB successor, and we can avoid the interference by splitting
+ // the edge.
+ //
+ // If Reg *is* live-in to MBB, the interference is inevitable and a copy
+ // is likely to be left after coalescing. If we are looking at a loop
+ // exiting edge, split it so we won't insert code in the loop, otherwise
+ // don't bother.
+ bool ShouldSplit = !LV.isLiveIn(Reg, MBB);
+
+ // Check for a loop exiting edge.
+ if (!ShouldSplit && CurLoop != PreLoop) {
+ DEBUG({
+ dbgs() << "Split wouldn't help, maybe avoid loop copies?\n";
+ if (PreLoop) dbgs() << "PreLoop: " << *PreLoop;
+ if (CurLoop) dbgs() << "CurLoop: " << *CurLoop;
+ });
+ // This edge could be entering a loop, exiting a loop, or it could be
+        // both: Jumping directly from one loop to the header of a sibling
+ // loop.
+ // Split unless this edge is entering CurLoop from an outer loop.
+ ShouldSplit = PreLoop && !PreLoop->contains(CurLoop);
+ }
+ if (!ShouldSplit)
+ continue;
+ if (!PreMBB->SplitCriticalEdge(&MBB, this)) {
+ DEBUG(dbgs() << "Failed to split ciritcal edge.\n");
+ continue;
}
+ Changed = true;
+ ++NumCriticalEdgesSplit;
}
}
return Changed;
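A worked example of the heuristic above (block numbers and registers are hypothetical):

    //   BB#0: %vreg0 = ...   ; succs: BB#1, BB#2 (edge to BB#2 is critical)
    //   BB#1: ... = %vreg0   ; non-PHI use keeps %vreg0 live-out of BB#0
    //   BB#2: %vreg1 = PHI %vreg0, <BB#0>, %vreg9, <BB#5>
    //
    // %vreg0 is live-out of BB#0 for a reason other than the PHI, so a copy
    // placed in BB#0 would not be a kill. If %vreg0 is not also live-in to
    // BB#2, splitting BB#0 -> BB#2 puts the copy in the new block, where it
    // kills %vreg0 and is likely to coalesce away.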
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 490547b..cfa3eec 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -48,6 +49,8 @@ static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
cl::desc("Disable Stack Slot Coloring"));
static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
cl::desc("Disable Machine Dead Code Elimination"));
+static cl::opt<bool> EnableEarlyIfConversion("enable-early-ifcvt", cl::Hidden,
+ cl::desc("Enable Early If-conversion"));
static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
cl::desc("Disable Machine LICM"));
static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
@@ -80,15 +83,23 @@ static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
cl::desc("Verify generated machine code"),
cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=NULL));
+static cl::opt<std::string>
+PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
+ cl::desc("Print machine instrs"),
+ cl::value_desc("pass-name"), cl::init("option-unspecified"));
+
+// Experimental option to run live interval analysis early.
+static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
+ cl::desc("Run live interval analysis earlier in the pipeline"));
/// Allow standard passes to be disabled by command line options. This supports
/// simple binary flags that either suppress the pass or do nothing.
/// i.e. -disable-mypass=false has no effect.
/// These should be converted to boolOrDefault in order to use applyOverride.
-static AnalysisID applyDisable(AnalysisID ID, bool Override) {
+static AnalysisID applyDisable(AnalysisID PassID, bool Override) {
if (Override)
- return &NoPassID;
- return ID;
+ return 0;
+ return PassID;
}
/// Allow Pass selection to be overridden by command line options. This supports
@@ -101,13 +112,13 @@ static AnalysisID applyOverride(AnalysisID TargetID, cl::boolOrDefault Override,
case cl::BOU_UNSET:
return TargetID;
case cl::BOU_TRUE:
- if (TargetID != &NoPassID)
+ if (TargetID)
return TargetID;
- if (StandardID == &NoPassID)
+ if (StandardID == 0)
report_fatal_error("Target cannot enable pass");
return StandardID;
case cl::BOU_FALSE:
- return &NoPassID;
+ return 0;
}
llvm_unreachable("Invalid command line option state");
}
@@ -149,6 +160,9 @@ static AnalysisID overridePass(AnalysisID StandardID, AnalysisID TargetID) {
if (StandardID == &DeadMachineInstructionElimID)
return applyDisable(TargetID, DisableMachineDCE);
+ if (StandardID == &EarlyIfConverterID)
+ return applyDisable(TargetID, !EnableEarlyIfConversion);
+
if (StandardID == &MachineLICMID)
return applyDisable(TargetID, DisableMachineLICM);
@@ -178,9 +192,6 @@ INITIALIZE_PASS(TargetPassConfig, "targetpassconfig",
"Target Pass Configuration", false, false)
char TargetPassConfig::ID = 0;
-static char NoPassIDAnchor = 0;
-char &llvm::NoPassID = NoPassIDAnchor;
-
// Pseudo Pass IDs.
char TargetPassConfig::EarlyTailDuplicateID = 0;
char TargetPassConfig::PostRAMachineLICMID = 0;
@@ -193,9 +204,13 @@ public:
// that are part of a standard pass pipeline without overriding the entire
// pipeline. This mechanism allows target options to inherit a standard pass's
// user interface. For example, a target may disable a standard pass by
- // default by substituting NoPass, and the user may still enable that standard
- // pass with an explicit command line option.
+ // default by substituting a pass ID of zero, and the user may still enable
+ // that standard pass with an explicit command line option.
DenseMap<AnalysisID,AnalysisID> TargetPasses;
+
+  /// Pairs of <TargetPassID, InsertedPassID>: the second pass is inserted
+  /// after each instance of the first.
+ SmallVector<std::pair<AnalysisID, AnalysisID>, 4> InsertedPasses;
};
} // namespace llvm
@@ -207,7 +222,8 @@ TargetPassConfig::~TargetPassConfig() {
// Out of line constructor provides default values for pass options and
// registers all common codegen passes.
TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
- : ImmutablePass(ID), TM(tm), PM(&pm), Impl(0), Initialized(false),
+ : ImmutablePass(ID), PM(&pm), StartAfter(0), StopAfter(0),
+ Started(true), Stopped(false), TM(tm), Impl(0), Initialized(false),
DisableVerify(false),
EnableTailMerge(true) {
@@ -218,11 +234,22 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
initializeCodeGen(*PassRegistry::getPassRegistry());
// Substitute Pseudo Pass IDs for real ones.
- substitutePass(EarlyTailDuplicateID, TailDuplicateID);
- substitutePass(PostRAMachineLICMID, MachineLICMID);
+ substitutePass(&EarlyTailDuplicateID, &TailDuplicateID);
+ substitutePass(&PostRAMachineLICMID, &MachineLICMID);
+
+ // Disable early if-conversion. Targets that are ready can enable it.
+ disablePass(&EarlyIfConverterID);
// Temporarily disable experimental passes.
- substitutePass(MachineSchedulerID, NoPassID);
+ substitutePass(&MachineSchedulerID, 0);
+}
+
+/// Insert InsertedPassID pass after TargetPassID.
+void TargetPassConfig::insertPass(AnalysisID TargetPassID,
+ AnalysisID InsertedPassID) {
+  assert(TargetPassID != InsertedPassID && "Cannot insert a pass after itself!");
+ std::pair<AnalysisID, AnalysisID> P(TargetPassID, InsertedPassID);
+ Impl->InsertedPasses.push_back(P);
}
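A sketch of how a target might use insertPass(); MyTargetPassConfig and MyTargetVerifierID are hypothetical, while TwoAddressInstructionPassID is the real pass ID used later in this file:

    MyTargetPassConfig::MyTargetPassConfig(TargetMachine *TM,
                                           PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {
      // Run a target-specific checker after each two-address conversion.
      insertPass(&TwoAddressInstructionPassID, &MyTargetVerifierID);
    }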
/// createPassConfig - Create a pass configuration object to be used by
@@ -244,8 +271,9 @@ void TargetPassConfig::setOpt(bool &Opt, bool Val) {
Opt = Val;
}
-void TargetPassConfig::substitutePass(char &StandardID, char &TargetID) {
- Impl->TargetPasses[&StandardID] = &TargetID;
+void TargetPassConfig::substitutePass(AnalysisID StandardID,
+ AnalysisID TargetID) {
+ Impl->TargetPasses[StandardID] = TargetID;
}
AnalysisID TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
@@ -256,29 +284,62 @@ AnalysisID TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
return I->second;
}
-/// Add a CodeGen pass at this point in the pipeline after checking for target
-/// and command line overrides.
-AnalysisID TargetPassConfig::addPass(char &ID) {
+/// Add a pass to the PassManager if that pass is supposed to be run. If the
+/// Started/Stopped flags indicate either that the compilation should start at
+/// a later pass or that it should stop after an earlier pass, then do not add
+/// the pass. Finally, compare the current pass against the StartAfter
+/// and StopAfter options and change the Started/Stopped flags accordingly.
+void TargetPassConfig::addPass(Pass *P) {
assert(!Initialized && "PassConfig is immutable");
- AnalysisID TargetID = getPassSubstitution(&ID);
- AnalysisID FinalID = overridePass(&ID, TargetID);
- if (FinalID == &NoPassID)
+ // Cache the Pass ID here in case the pass manager finds this pass is
+ // redundant with ones already scheduled / available, and deletes it.
+ // Fundamentally, once we add the pass to the manager, we no longer own it
+ // and shouldn't reference it.
+ AnalysisID PassID = P->getPassID();
+
+ if (Started && !Stopped)
+ PM->add(P);
+ if (StopAfter == PassID)
+ Stopped = true;
+ if (StartAfter == PassID)
+ Started = true;
+ if (Stopped && !Started)
+ report_fatal_error("Cannot stop compilation after pass that is not run");
+}
+
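The Started/Stopped gating above is what backs pipeline-bounding options; a hedged sketch of the intended use (the flag names are assumed to match the llc options introduced with this mechanism):

    //   llc -stop-after=<pass-name> foo.ll   ; run up to and including the pass
    //   llc -start-after=<pass-name> foo.ll  ; skip passes up to and including it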
+/// Add a CodeGen pass at this point in the pipeline after checking for target
+/// and command line overrides.
+AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
+ AnalysisID TargetID = getPassSubstitution(PassID);
+ AnalysisID FinalID = overridePass(PassID, TargetID);
+ if (FinalID == 0)
return FinalID;
Pass *P = Pass::createPass(FinalID);
if (!P)
llvm_unreachable("Pass ID not registered");
- PM->add(P);
+ addPass(P);
+  // Add any passes that were registered to be inserted after pass P.
+ for (SmallVector<std::pair<AnalysisID, AnalysisID>, 4>::iterator
+ I = Impl->InsertedPasses.begin(), E = Impl->InsertedPasses.end();
+ I != E; ++I) {
+ if ((*I).first == PassID) {
+ assert((*I).second && "Illegal Pass ID!");
+ Pass *NP = Pass::createPass((*I).second);
+ assert(NP && "Pass ID not registered");
+ addPass(NP);
+ }
+ }
return FinalID;
}
-void TargetPassConfig::printAndVerify(const char *Banner) const {
+void TargetPassConfig::printAndVerify(const char *Banner) {
if (TM->shouldPrintMachineCode())
- PM->add(createMachineFunctionPrinterPass(dbgs(), Banner));
+ addPass(createMachineFunctionPrinterPass(dbgs(), Banner));
if (VerifyMachineCode)
- PM->add(createMachineVerifierPass(Banner));
+ addPass(createMachineVerifierPass(Banner));
}
/// Add common target configurable passes that perform LLVM IR to IR transforms
@@ -288,46 +349,73 @@ void TargetPassConfig::addIRPasses() {
// Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
// BasicAliasAnalysis wins if they disagree. This is intended to help
// support "obvious" type-punning idioms.
- PM->add(createTypeBasedAliasAnalysisPass());
- PM->add(createBasicAliasAnalysisPass());
+ addPass(createTypeBasedAliasAnalysisPass());
+ addPass(createBasicAliasAnalysisPass());
// Before running any passes, run the verifier to determine if the input
// coming from the front-end and/or optimizer is valid.
if (!DisableVerify)
- PM->add(createVerifierPass());
+ addPass(createVerifierPass());
// Run loop strength reduction before anything else.
if (getOptLevel() != CodeGenOpt::None && !DisableLSR) {
- PM->add(createLoopStrengthReducePass(getTargetLowering()));
+ addPass(createLoopStrengthReducePass(getTargetLowering()));
if (PrintLSR)
- PM->add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
+ addPass(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
}
- PM->add(createGCLoweringPass());
+ addPass(createGCLoweringPass());
// Make sure that no unreachable blocks are instruction selected.
- PM->add(createUnreachableBlockEliminationPass());
+ addPass(createUnreachableBlockEliminationPass());
+}
+
+/// Turn exception handling constructs into something the code generators can
+/// handle.
+void TargetPassConfig::addPassesToHandleExceptions() {
+ switch (TM->getMCAsmInfo()->getExceptionHandlingType()) {
+ case ExceptionHandling::SjLj:
+    // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both.
+    // Dwarf EH prepare needs to be run after SjLj prepare; otherwise,
+ // catch info can get misplaced when a selector ends up more than one block
+ // removed from the parent invoke(s). This could happen when a landing
+ // pad is shared by multiple invokes and is also a target of a normal
+ // edge from elsewhere.
+ addPass(createSjLjEHPreparePass(TM->getTargetLowering()));
+ // FALLTHROUGH
+ case ExceptionHandling::DwarfCFI:
+ case ExceptionHandling::ARM:
+ case ExceptionHandling::Win64:
+ addPass(createDwarfEHPass(TM));
+ break;
+ case ExceptionHandling::None:
+ addPass(createLowerInvokePass(TM->getTargetLowering()));
+
+ // The lower invoke pass may create unreachable code. Remove it.
+ addPass(createUnreachableBlockEliminationPass());
+ break;
+ }
}
/// Add common passes that perform LLVM IR to IR transforms in preparation for
/// instruction selection.
void TargetPassConfig::addISelPrepare() {
if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
- PM->add(createCodeGenPreparePass(getTargetLowering()));
+ addPass(createCodeGenPreparePass(getTargetLowering()));
- PM->add(createStackProtectorPass(getTargetLowering()));
+ addPass(createStackProtectorPass(getTargetLowering()));
addPreISel();
if (PrintISelInput)
- PM->add(createPrintFunctionPass("\n\n"
+ addPass(createPrintFunctionPass("\n\n"
"*** Final LLVM Code input to ISel ***\n",
&dbgs()));
// All passes which modify the LLVM IR are now complete; run the verifier
// to ensure that the IR is valid.
if (!DisableVerify)
- PM->add(createVerifierPass());
+ addPass(createVerifierPass());
}
/// Add the complete set of target-independent postISel code generator passes.
@@ -349,11 +437,26 @@ void TargetPassConfig::addISelPrepare() {
/// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
/// before/after any target-independent pass. But it's currently overkill.
void TargetPassConfig::addMachinePasses() {
+ // Insert a machine instr printer pass after the specified pass.
+  // If -print-machineinstrs is given with no pass name, print after all passes.
+ if (StringRef(PrintMachineInstrs.getValue()).equals(""))
+ TM->Options.PrintMachineCode = true;
+ else if (!StringRef(PrintMachineInstrs.getValue())
+ .equals("option-unspecified")) {
+ const PassRegistry *PR = PassRegistry::getPassRegistry();
+ const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
+ const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs"));
+ assert (TPI && IPI && "Pass ID not registered!");
+ const char *TID = (char *)(TPI->getTypeInfo());
+ const char *IID = (char *)(IPI->getTypeInfo());
+ insertPass(TID, IID);
+ }
+
// Print the instruction selected machine code...
printAndVerify("After Instruction Selection");
// Expand pseudo-instructions emitted by ISel.
- addPass(ExpandISelPseudosID);
+ addPass(&ExpandISelPseudosID);
// Add passes that optimize machine instructions in SSA form.
if (getOptLevel() != CodeGenOpt::None) {
@@ -362,7 +465,7 @@ void TargetPassConfig::addMachinePasses() {
else {
// If the target requests it, assign local variables to stack slots relative
// to one another and simplify frame index references where possible.
- addPass(LocalStackSlotAllocationID);
+ addPass(&LocalStackSlotAllocationID);
}
// Run pre-ra passes.
@@ -381,7 +484,7 @@ void TargetPassConfig::addMachinePasses() {
printAndVerify("After PostRegAlloc passes");
// Insert prolog/epilog code. Eliminate abstract frame index references...
- addPass(PrologEpilogCodeInserterID);
+ addPass(&PrologEpilogCodeInserterID);
printAndVerify("After PrologEpilogCodeInserter");
/// Add passes that optimize machine instructions after register allocation.
@@ -389,7 +492,7 @@ void TargetPassConfig::addMachinePasses() {
addMachineLateOptimization();
// Expand pseudo instructions before second scheduling pass.
- addPass(ExpandPostRAPseudosID);
+ addPass(&ExpandPostRAPseudosID);
printAndVerify("After ExpandPostRAPseudos");
// Run pre-sched2 passes.
@@ -398,14 +501,14 @@ void TargetPassConfig::addMachinePasses() {
// Second pass scheduler.
if (getOptLevel() != CodeGenOpt::None) {
- addPass(PostRASchedulerID);
+ addPass(&PostRASchedulerID);
printAndVerify("After PostRAScheduler");
}
// GC
- addPass(GCMachineCodeAnalysisID);
+ addPass(&GCMachineCodeAnalysisID);
if (PrintGCInfo)
- PM->add(createGCInfoPrinter(dbgs()));
+ addPass(createGCInfoPrinter(dbgs()));
// Basic block placement.
if (getOptLevel() != CodeGenOpt::None)
@@ -418,30 +521,31 @@ void TargetPassConfig::addMachinePasses() {
/// Add passes that optimize machine instructions in SSA form.
void TargetPassConfig::addMachineSSAOptimization() {
// Pre-ra tail duplication.
- if (addPass(EarlyTailDuplicateID) != &NoPassID)
+ if (addPass(&EarlyTailDuplicateID))
printAndVerify("After Pre-RegAlloc TailDuplicate");
// Optimize PHIs before DCE: removing dead PHI cycles may make more
// instructions dead.
- addPass(OptimizePHIsID);
+ addPass(&OptimizePHIsID);
// If the target requests it, assign local variables to stack slots relative
// to one another and simplify frame index references where possible.
- addPass(LocalStackSlotAllocationID);
+ addPass(&LocalStackSlotAllocationID);
// With optimization, dead code should already be eliminated. However
// there is one known exception: lowered code for arguments that are only
// used by tail calls, where the tail calls reuse the incoming stack
// arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
- addPass(DeadMachineInstructionElimID);
+ addPass(&DeadMachineInstructionElimID);
printAndVerify("After codegen DCE pass");
- addPass(MachineLICMID);
- addPass(MachineCSEID);
- addPass(MachineSinkingID);
+ addPass(&EarlyIfConverterID);
+ addPass(&MachineLICMID);
+ addPass(&MachineCSEID);
+ addPass(&MachineSinkingID);
printAndVerify("After Machine LICM, CSE and Sinking passes");
- addPass(PeepholeOptimizerID);
+ addPass(&PeepholeOptimizerID);
printAndVerify("After codegen peephole optimization pass");
}
@@ -519,10 +623,10 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
/// Add the minimum set of target-independent passes that are required for
/// register allocation. No coalescing or scheduling.
void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
- addPass(PHIEliminationID);
- addPass(TwoAddressInstructionPassID);
+ addPass(&PHIEliminationID);
+ addPass(&TwoAddressInstructionPassID);
- PM->add(RegAllocPass);
+ addPass(RegAllocPass);
printAndVerify("After Register Allocation");
}
@@ -530,42 +634,51 @@ void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
/// optimized register allocation, including coalescing, machine instruction
/// scheduling, and register allocation itself.
void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ addPass(&ProcessImplicitDefsID);
+
// LiveVariables currently requires pure SSA form.
//
// FIXME: Once TwoAddressInstruction pass no longer uses kill flags,
// LiveVariables can be removed completely, and LiveIntervals can be directly
// computed. (We still either need to regenerate kill flags after regalloc, or
// preferably fix the scavenger to not depend on them).
- addPass(LiveVariablesID);
+ addPass(&LiveVariablesID);
// Add passes that move from transformed SSA into conventional SSA. This is a
// "copy coalescing" problem.
//
if (!EnableStrongPHIElim) {
// Edge splitting is smarter with machine loop info.
- addPass(MachineLoopInfoID);
- addPass(PHIEliminationID);
+ addPass(&MachineLoopInfoID);
+ addPass(&PHIEliminationID);
}
- addPass(TwoAddressInstructionPassID);
- // FIXME: Either remove this pass completely, or fix it so that it works on
- // SSA form. We could modify LiveIntervals to be independent of this pass, But
- // it would be even better to simply eliminate *all* IMPLICIT_DEFs before
- // leaving SSA.
- addPass(ProcessImplicitDefsID);
+ // Eventually, we want to run LiveIntervals before PHI elimination.
+ if (EarlyLiveIntervals)
+ addPass(&LiveIntervalsID);
+
+ addPass(&TwoAddressInstructionPassID);
if (EnableStrongPHIElim)
- addPass(StrongPHIEliminationID);
+ addPass(&StrongPHIEliminationID);
- addPass(RegisterCoalescerID);
+ addPass(&RegisterCoalescerID);
// PreRA instruction scheduling.
- if (addPass(MachineSchedulerID) != &NoPassID)
+ if (addPass(&MachineSchedulerID))
printAndVerify("After Machine Scheduling");
// Add the selected register allocation pass.
- PM->add(RegAllocPass);
- printAndVerify("After Register Allocation");
+ addPass(RegAllocPass);
+ printAndVerify("After Register Allocation, before rewriter");
+
+ // Allow targets to change the register assignments before rewriting.
+ if (addPreRewrite())
+ printAndVerify("After pre-rewrite passes");
+
+ // Finally rewrite virtual registers.
+ addPass(&VirtRegRewriterID);
+ printAndVerify("After Virtual Register Rewriter");
// FinalizeRegAlloc is convenient until MachineInstrBundles is more mature,
// but eventually, all users of it should probably be moved to addPostRA and
@@ -579,12 +692,12 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
//
// FIXME: Re-enable coloring with register when it's capable of adding
// kill markers.
- addPass(StackSlotColoringID);
+ addPass(&StackSlotColoringID);
// Run post-ra machine LICM to hoist reloads / remats.
//
// FIXME: can this move into MachineLateOptimization?
- addPass(PostRAMachineLICMID);
+ addPass(&PostRAMachineLICMID);
printAndVerify("After StackSlotColoring and postra Machine LICM");
}
@@ -596,33 +709,33 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
/// Add passes that optimize machine instructions after register allocation.
void TargetPassConfig::addMachineLateOptimization() {
// Branch folding must be run after regalloc and prolog/epilog insertion.
- if (addPass(BranchFolderPassID) != &NoPassID)
+ if (addPass(&BranchFolderPassID))
printAndVerify("After BranchFolding");
// Tail duplication.
- if (addPass(TailDuplicateID) != &NoPassID)
+ if (addPass(&TailDuplicateID))
printAndVerify("After TailDuplicate");
// Copy propagation.
- if (addPass(MachineCopyPropagationID) != &NoPassID)
+ if (addPass(&MachineCopyPropagationID))
printAndVerify("After copy propagation pass");
}
/// Add standard basic block placement passes.
void TargetPassConfig::addBlockPlacement() {
- AnalysisID ID = &NoPassID;
+ AnalysisID PassID = 0;
if (!DisableBlockPlacement) {
// MachineBlockPlacement is a new pass which subsumes the functionality of
// CodePlacementOpt. The old code placement pass can be restored by
// disabling block placement, but eventually it will be removed.
- ID = addPass(MachineBlockPlacementID);
+ PassID = addPass(&MachineBlockPlacementID);
} else {
- ID = addPass(CodePlacementOptID);
+ PassID = addPass(&CodePlacementOptID);
}
- if (ID != &NoPassID) {
+ if (PassID) {
// Run a separate pass to collect block placement statistics.
if (EnableBlockPlacementStats)
- addPass(MachineBlockPlacementStatsID);
+ addPass(&MachineBlockPlacementStatsID);
printAndVerify("After machine block placement.");
}
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 9c5c029..6bc7e37 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -31,6 +31,15 @@
// same flag that the "cmp" instruction sets and that "bz" uses, then we can
// eliminate the "cmp" instruction.
//
+// Another instance, in this code:
+//
+// sub r1, r3 | sub r1, imm
+// cmp r3, r1 or cmp r1, r3 | cmp r1, imm
+// bge L1
+//
+// If the branch instruction can use the flags set by "sub", then we can replace
+// "sub" with "subs" and eliminate the "cmp" instruction.
+//
// - Optimize Bitcast pairs:
//
// v1 = bitcast v0
@@ -69,6 +78,7 @@ STATISTIC(NumReuse, "Number of extension results reused");
STATISTIC(NumBitcasts, "Number of bitcasts eliminated");
STATISTIC(NumCmps, "Number of compares eliminated");
STATISTIC(NumImmFold, "Number of move immediate folded");
+STATISTIC(NumLoadFold, "Number of loads folded");
namespace {
class PeepholeOptimizer : public MachineFunctionPass {
@@ -95,16 +105,17 @@ namespace {
}
private:
- bool OptimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB);
- bool OptimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
- bool OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+ bool optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB);
+ bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
+ bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
SmallPtrSet<MachineInstr*, 8> &LocalMIs);
bool isMoveImmediate(MachineInstr *MI,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
- bool FoldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
+ bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+ bool isLoadFoldable(MachineInstr *MI, unsigned &FoldAsLoadDefReg);
};
}
@@ -116,7 +127,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts",
"Peephole Optimizations", false, false)
-/// OptimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads
+/// optimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads
/// a single register and writes a single register and it does not modify the
/// source, and if the source value is preserved as a sub-register of the
/// result, then replace all reachable uses of the source with the subreg of the
@@ -126,7 +137,7 @@ INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts",
/// the code. Since this code does not currently share EXTRACTs, just ignore all
/// debug uses.
bool PeepholeOptimizer::
-OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
SmallPtrSet<MachineInstr*, 8> &LocalMIs) {
unsigned SrcReg, DstReg, SubIdx;
if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx))
@@ -136,16 +147,30 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
TargetRegisterInfo::isPhysicalRegister(SrcReg))
return false;
- MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg);
- if (++UI == MRI->use_nodbg_end())
+ if (MRI->hasOneNonDBGUse(SrcReg))
// No other uses.
return false;
+ // Ensure DstReg can get a register class that actually supports
+ // sub-registers. Don't change the class until we commit.
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ DstRC = TM->getRegisterInfo()->getSubClassWithSubReg(DstRC, SubIdx);
+ if (!DstRC)
+ return false;
+
+ // The ext instr may be operating on a sub-register of SrcReg as well.
+ // PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit
+ // register.
+  // If UseSrcSubIdx is set, SubIdx also applies to SrcReg, and only uses of
+ // SrcReg:SubIdx should be replaced.
+ bool UseSrcSubIdx = TM->getRegisterInfo()->
+ getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != 0;
+
// The source has other uses. See if we can replace the other uses with use of
// the result of the extension.
SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs;
- UI = MRI->use_nodbg_begin(DstReg);
- for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end();
+ for (MachineRegisterInfo::use_nodbg_iterator
+ UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end();
UI != UE; ++UI)
ReachedBBs.insert(UI->getParent());
@@ -156,8 +181,8 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
SmallVector<MachineOperand*, 8> ExtendedUses;
bool ExtendLife = true;
- UI = MRI->use_nodbg_begin(SrcReg);
- for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end();
+ for (MachineRegisterInfo::use_nodbg_iterator
+ UI = MRI->use_nodbg_begin(SrcReg), UE = MRI->use_nodbg_end();
UI != UE; ++UI) {
MachineOperand &UseMO = UI.getOperand();
MachineInstr *UseMI = &*UI;
@@ -169,6 +194,10 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
continue;
}
+ // Only accept uses of SrcReg:SubIdx.
+ if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx)
+ continue;
+
// It's an error to translate this:
//
// %reg1025 = <sext> %reg1024
@@ -223,9 +252,9 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
// Look for PHI uses of the extended result, we don't want to extend the
// liveness of a PHI input. It breaks all kinds of assumptions down
// stream. A PHI use is expected to be the kill of its source values.
- UI = MRI->use_nodbg_begin(DstReg);
for (MachineRegisterInfo::use_nodbg_iterator
- UE = MRI->use_nodbg_end(); UI != UE; ++UI)
+ UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end();
+ UI != UE; ++UI)
if (UI->isPHI())
PHIBBs.insert(UI->getParent());
@@ -238,14 +267,20 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
continue;
// About to add uses of DstReg, clear DstReg's kill flags.
- if (!Changed)
+ if (!Changed) {
MRI->clearKillFlags(DstReg);
+ MRI->constrainRegClass(DstReg, DstRC);
+ }
unsigned NewVR = MRI->createVirtualRegister(RC);
- BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
- TII->get(TargetOpcode::COPY), NewVR)
+ MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVR)
.addReg(DstReg, 0, SubIdx);
-
+ // SubIdx applies to both SrcReg and DstReg when UseSrcSubIdx is set.
+ if (UseSrcSubIdx) {
+ Copy->getOperand(0).setSubReg(SubIdx);
+ Copy->getOperand(0).setIsUndef();
+ }
UseMO->setReg(NewVR);
++NumReuse;
Changed = true;
@@ -255,7 +290,7 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
return Changed;
}
-/// OptimizeBitcastInstr - If the instruction is a bitcast instruction A that
+/// optimizeBitcastInstr - If the instruction is a bitcast instruction A that
/// cannot be optimized away during isel (e.g. ARM::VMOVSR, which bitcast
/// a value cross register classes), and the source is defined by another
/// bitcast instruction B. And if the register class of source of B matches
@@ -265,7 +300,7 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
/// %vreg3<def> = VMOVRS %vreg0
/// Replace all uses of vreg3 with vreg1.
-bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI,
+bool PeepholeOptimizer::optimizeBitcastInstr(MachineInstr *MI,
MachineBasicBlock *MBB) {
unsigned NumDefs = MI->getDesc().getNumDefs();
unsigned NumSrcs = MI->getDesc().getNumOperands() - NumDefs;
@@ -327,22 +362,23 @@ bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI,
return true;
}
-/// OptimizeCmpInstr - If the instruction is a compare and the previous
+/// optimizeCmpInstr - If the instruction is a compare and the previous
/// instruction it's comparing against already sets (or could be modified to
/// set) the same flag as the compare, then we can remove the comparison and use
/// the flag from the previous instruction.
-bool PeepholeOptimizer::OptimizeCmpInstr(MachineInstr *MI,
+bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
MachineBasicBlock *MBB) {
// If this instruction is a comparison against zero and isn't comparing a
// physical register, we can try to optimize it.
- unsigned SrcReg;
+ unsigned SrcReg, SrcReg2;
int CmpMask, CmpValue;
- if (!TII->AnalyzeCompare(MI, SrcReg, CmpMask, CmpValue) ||
- TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
+ TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
+ (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2)))
return false;
// Attempt to optimize the comparison instruction.
- if (TII->OptimizeCompareInstr(MI, SrcReg, CmpMask, CmpValue, MRI)) {
+ if (TII->optimizeCompareInstr(MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
++NumCmps;
return true;
}
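For orientation, a hedged sketch of what the widened analyzeCompare() interface reports for the header comment's examples (register numbers illustrative):

    //   cmp r1, r3   ->  SrcReg = r1, SrcReg2 = r3, CmpValue = 0
    //   cmp r1, imm  ->  SrcReg = r1, SrcReg2 = 0,  CmpValue = imm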
@@ -350,6 +386,30 @@ bool PeepholeOptimizer::OptimizeCmpInstr(MachineInstr *MI,
return false;
}
+/// isLoadFoldable - Check whether MI is a candidate for folding into a later
+/// instruction. We only fold loads that define a virtual register, and only
+/// when that register has a single use.
+bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI,
+ unsigned &FoldAsLoadDefReg) {
+ if (!MI->canFoldAsLoad() || !MI->mayLoad())
+ return false;
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MCID.getNumDefs() != 1)
+ return false;
+
+ unsigned Reg = MI->getOperand(0).getReg();
+  // To reduce compilation time, we check MRI->hasOneUse when inserting
+  // loads. It should be re-checked when processing uses of the load, since
+  // uses can be removed during the peephole pass.
+ if (!MI->getOperand(0).getSubReg() &&
+ TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MRI->hasOneUse(Reg)) {
+ FoldAsLoadDefReg = Reg;
+ return true;
+ }
+ return false;
+}
+
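
The candidate test above is deliberately cheap: a load is remembered for
later folding only if the target marked it foldable, it defines exactly one
register, and that register is virtual, whole (no sub-register), and has a
single use. A minimal sketch of the same predicate with the checks annotated,
using only the LLVM interfaces already visible in this hunk:

  // Sketch: mirrors PeepholeOptimizer::isLoadFoldable above.
  static bool isFoldCandidate(const MachineInstr *MI,
                              const MachineRegisterInfo *MRI) {
    // Only loads the target declared foldable qualify at all.
    if (!MI->canFoldAsLoad() || !MI->mayLoad())
      return false;
    // A single def keeps the rewrite local to one register.
    if (MI->getDesc().getNumDefs() != 1)
      return false;
    const MachineOperand &Def = MI->getOperand(0);
    unsigned Reg = Def.getReg();
    // Virtual, whole-register, single use: safe to sink into the one user.
    return !Def.getSubReg() &&
           TargetRegisterInfo::isVirtualRegister(Reg) &&
           MRI->hasOneUse(Reg);
  }
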
bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs) {
@@ -368,10 +428,10 @@ bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI,
return false;
}
-/// FoldImmediate - Try folding register operands that are defined by move
+/// foldImmediate - Try folding register operands that are defined by move
/// immediate instructions, i.e. a trivial constant folding optimization, if
/// and only if the def and use are in the same BB.
-bool PeepholeOptimizer::FoldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
+bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs) {
for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
@@ -407,6 +467,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallPtrSet<MachineInstr*, 8> LocalMIs;
SmallSet<unsigned, 4> ImmDefRegs;
DenseMap<unsigned, MachineInstr*> ImmDefMIs;
+ unsigned FoldAsLoadDefReg;
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
MachineBasicBlock *MBB = &*I;
@@ -414,6 +475,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
LocalMIs.clear();
ImmDefRegs.clear();
ImmDefMIs.clear();
+ FoldAsLoadDefReg = 0;
bool First = true;
MachineBasicBlock::iterator PMII;
@@ -422,15 +484,20 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *MI = &*MII;
LocalMIs.insert(MI);
+      // If the instruction falls into one of the following categories, we
+      // discard the load-folding candidate.
if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() ||
MI->isKill() || MI->isInlineAsm() || MI->isDebugValue() ||
MI->hasUnmodeledSideEffects()) {
+ FoldAsLoadDefReg = 0;
++MII;
continue;
}
+ if (MI->mayStore() || MI->isCall())
+ FoldAsLoadDefReg = 0;
if (MI->isBitcast()) {
- if (OptimizeBitcastInstr(MI, MBB)) {
+ if (optimizeBitcastInstr(MI, MBB)) {
// MI is deleted.
LocalMIs.erase(MI);
Changed = true;
@@ -438,7 +505,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
continue;
}
} else if (MI->isCompare()) {
- if (OptimizeCmpInstr(MI, MBB)) {
+ if (optimizeCmpInstr(MI, MBB)) {
// MI is deleted.
LocalMIs.erase(MI);
Changed = true;
@@ -450,11 +517,36 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) {
SeenMoveImm = true;
} else {
- Changed |= OptimizeExtInstr(MI, MBB, LocalMIs);
+ Changed |= optimizeExtInstr(MI, MBB, LocalMIs);
if (SeenMoveImm)
- Changed |= FoldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs);
+ Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs);
}
+ // Check whether MI is a load candidate for folding into a later
+ // instruction. If MI is not a candidate, check whether we can fold an
+ // earlier load into MI.
+ if (!isLoadFoldable(MI, FoldAsLoadDefReg) && FoldAsLoadDefReg) {
+        // We need to fold the load after optimizeCmpInstr, since
+        // optimizeCmpInstr can enable folding by converting a SUB into a CMP.
+ MachineInstr *DefMI = 0;
+ MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI,
+ FoldAsLoadDefReg, DefMI);
+ if (FoldMI) {
+ // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI.
+ LocalMIs.erase(MI);
+ LocalMIs.erase(DefMI);
+ LocalMIs.insert(FoldMI);
+ MI->eraseFromParent();
+ DefMI->eraseFromParent();
+ ++NumLoadFold;
+
+ // MI is replaced with FoldMI.
+ Changed = true;
+ PMII = FoldMI;
+ MII = llvm::next(PMII);
+ continue;
+ }
+ }
First = false;
PMII = MII;
++MII;
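
Taken together, the new code threads a single fold candidate through the
linear scan of each block, clearing it at any instruction that could
invalidate the fold (labels, stores, calls, side effects) and cashing it in
when optimizeLoadInstr succeeds. The control shape, reduced to self-contained
C++ with a toy instruction record (all names here are illustrative, not LLVM
API):

  #include <cstdint>
  #include <vector>

  struct Insn {
    bool IsBarrier = false;      // label, store, call, ...: kills the candidate
    bool IsFoldableLoad = false;
    uint32_t DefReg = 0;         // register a foldable load defines
    uint32_t UseReg = 0;         // register this instruction reads (0 if none)
  };

  // Sketch: at most one candidate in flight at a time, like FoldAsLoadDefReg.
  static int countFolds(const std::vector<Insn> &Block) {
    uint32_t Candidate = 0;      // 0 = no live candidate
    int Folds = 0;
    for (const Insn &I : Block) {
      if (I.IsBarrier) { Candidate = 0; continue; }
      if (Candidate != 0 && I.UseReg == Candidate) { ++Folds; Candidate = 0; }
      if (I.IsFoldableLoad) Candidate = I.DefReg;
    }
    return Folds;
  }
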
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 24d3e5a..7449ff5 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -22,7 +22,6 @@
#include "AntiDepBreaker.h"
#include "AggressiveAntiDepBreaker.h"
#include "CriticalAntiDepBreaker.h"
-#include "RegisterClassInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/LatencyPriorityQueue.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
@@ -31,6 +30,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -78,7 +78,6 @@ AntiDepBreaker::~AntiDepBreaker() { }
namespace {
class PostRAScheduler : public MachineFunctionPass {
- AliasAnalysis *AA;
const TargetInstrInfo *TII;
RegisterClassInfo RegClassInfo;
@@ -206,6 +205,10 @@ SchedulePostRATDList::SchedulePostRATDList(
const InstrItineraryData *InstrItins = TM.getInstrItineraryData();
HazardRec =
TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this);
+
+ assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE ||
+ MRI.tracksLiveness()) &&
+ "Live-ins must be accurate for anti-dependency breaking");
AntiDepBreak =
((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL) ?
(AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) :
@@ -423,9 +426,8 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
unsigned Reg = *I;
LiveRegs.set(Reg);
// Repeat, for all subregs.
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- LiveRegs.set(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ LiveRegs.set(*SubRegs);
}
}
else {
@@ -437,9 +439,8 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
unsigned Reg = *I;
LiveRegs.set(Reg);
// Repeat, for all subregs.
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- LiveRegs.set(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ LiveRegs.set(*SubRegs);
}
}
}
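
Every hunk in this file makes the same mechanical substitution: the old
zero-terminated uint16_t sub-register arrays give way to the MCSubRegIterator
interface adopted here, which hides the termination convention. The two
idioms side by side, as a self-contained sketch (SubRegIter is a toy stand-in
for MCSubRegIterator, not the real class):

  #include <bitset>
  #include <cstdint>

  static const uint16_t SubRegs0[] = {1, 2, 0};   // 0-terminated, old style

  // Toy stand-in: walks a 0-terminated list behind an iterator interface.
  class SubRegIter {
    const uint16_t *P;
  public:
    explicit SubRegIter(const uint16_t *List) : P(List) {}
    bool isValid() const { return *P != 0; }
    void operator++() { ++P; }
    uint16_t operator*() const { return *P; }
  };

  static void setAll(std::bitset<64> &Live) {
    // Old: hand-rolled sentinel loop over the raw array.
    for (const uint16_t *S = SubRegs0; *S; ++S)
      Live.set(*S);
    // New: the iterator owns the termination test.
    for (SubRegIter S(SubRegs0); S.isValid(); ++S)
      Live.set(*S);
  }

The MCRegAliasIterator changes in RegAllocFast.cpp further down follow the
same pattern for alias sets.
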
@@ -464,10 +465,9 @@ bool SchedulePostRATDList::ToggleKillFlag(MachineInstr *MI,
MO.setIsKill(false);
bool AllDead = true;
const unsigned SuperReg = MO.getReg();
- for (const uint16_t *Subreg = TRI->getSubRegisters(SuperReg);
- *Subreg; ++Subreg) {
- if (LiveRegs.test(*Subreg)) {
- MI->addOperand(MachineOperand::CreateReg(*Subreg,
+ for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) {
+ if (LiveRegs.test(*SubRegs)) {
+ MI->addOperand(MachineOperand::CreateReg(*SubRegs,
true /*IsDef*/,
true /*IsImp*/,
false /*IsKill*/,
@@ -517,9 +517,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
LiveRegs.reset(Reg);
// Repeat for all subregs.
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- LiveRegs.reset(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ LiveRegs.reset(*SubRegs);
}
// Examine all used registers and set/clear kill flag. When a
@@ -536,9 +535,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
if (!killedRegs.test(Reg)) {
kill = true;
// A register is not killed if any subregs are live...
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg) {
- if (LiveRegs.test(*Subreg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ if (LiveRegs.test(*SubRegs)) {
kill = false;
break;
}
@@ -570,9 +568,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
LiveRegs.set(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- LiveRegs.set(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ LiveRegs.set(*SubRegs);
}
}
}
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index 1ad3479..34d075c 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -9,297 +9,163 @@
#define DEBUG_TYPE "processimplicitdefs"
-#include "llvm/CodeGen/ProcessImplicitDefs.h"
-
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-
using namespace llvm;
+namespace {
+/// Process IMPLICIT_DEF instructions and make sure there is one implicit_def
+/// for each use. Add an isUndef marker to implicit_def defs and their uses.
+class ProcessImplicitDefs : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ SmallSetVector<MachineInstr*, 16> WorkList;
+
+ void processImplicitDef(MachineInstr *MI);
+ bool canTurnIntoImplicitDef(MachineInstr *MI);
+
+public:
+ static char ID;
+
+ ProcessImplicitDefs() : MachineFunctionPass(ID) {
+ initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &au) const;
+
+ virtual bool runOnMachineFunction(MachineFunction &fn);
+};
+} // end anonymous namespace
+
char ProcessImplicitDefs::ID = 0;
char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID;
INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs",
"Process Implicit Definitions", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs",
"Process Implicit Definitions", false, false)
void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addPreserved<AliasAnalysis>();
- AU.addPreserved<LiveVariables>();
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- AU.addPreservedID(TwoAddressInstructionPassID);
- AU.addPreservedID(PHIEliminationID);
MachineFunctionPass::getAnalysisUsage(AU);
}
-bool
-ProcessImplicitDefs::CanTurnIntoImplicitDef(MachineInstr *MI,
- unsigned Reg, unsigned OpIdx,
- SmallSet<unsigned, 8> &ImpDefRegs) {
- switch(OpIdx) {
- case 1:
- return MI->isCopy() && (!MI->getOperand(0).readsReg() ||
- ImpDefRegs.count(MI->getOperand(0).getReg()));
- case 2:
- return MI->isSubregToReg() && (!MI->getOperand(0).readsReg() ||
- ImpDefRegs.count(MI->getOperand(0).getReg()));
- default: return false;
- }
-}
-
-static bool isUndefCopy(MachineInstr *MI, unsigned Reg,
- SmallSet<unsigned, 8> &ImpDefRegs) {
- if (MI->isCopy()) {
- MachineOperand &MO0 = MI->getOperand(0);
- MachineOperand &MO1 = MI->getOperand(1);
- if (MO1.getReg() != Reg)
- return false;
- if (!MO0.readsReg() || ImpDefRegs.count(MO0.getReg()))
- return true;
+bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) {
+ if (!MI->isCopyLike() &&
+ !MI->isInsertSubreg() &&
+ !MI->isRegSequence() &&
+ !MI->isPHI())
return false;
- }
- return false;
+ for (MIOperands MO(MI); MO.isValid(); ++MO)
+ if (MO->isReg() && MO->isUse() && MO->readsReg())
+ return false;
+ return true;
}
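
The predicate accepts exactly the instructions that only shuffle values
(copies, INSERT_SUBREG, REG_SEQUENCE, PHIs) and whose every use operand has
stopped reading a real value; at that point the result is garbage anyway and
the instruction can itself become an IMPLICIT_DEF. The same decision over a
toy operand model, self-contained (illustrative types only):

  #include <vector>

  enum class Kind { Copy, InsertSubreg, RegSequence, Phi, Other };

  struct Operand {
    bool IsUse;
    bool ReadsReg;   // false once the use has been marked <undef>
  };

  // Sketch: the shape of canTurnIntoImplicitDef above.
  static bool canBecomeImplicitDef(Kind K, const std::vector<Operand> &Ops) {
    if (K == Kind::Other)
      return false;               // only value-moving instructions qualify
    for (const Operand &Op : Ops)
      if (Op.IsUse && Op.ReadsReg)
        return false;             // still consumes a live value
    return true;
  }
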
-/// processImplicitDefs - Process IMPLICIT_DEF instructions and make sure
-/// there is one implicit_def for each use. Add isUndef marker to
-/// implicit_def defs and their uses.
-bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) {
-
- DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
- << "********** Function: "
- << ((Value*)fn.getFunction())->getName() << '\n');
-
- bool Changed = false;
-
- TII = fn.getTarget().getInstrInfo();
- TRI = fn.getTarget().getRegisterInfo();
- MRI = &fn.getRegInfo();
- LV = getAnalysisIfAvailable<LiveVariables>();
-
- SmallSet<unsigned, 8> ImpDefRegs;
- SmallVector<MachineInstr*, 8> ImpDefMIs;
- SmallVector<MachineInstr*, 4> RUses;
- SmallPtrSet<MachineBasicBlock*,16> Visited;
- SmallPtrSet<MachineInstr*, 8> ModInsts;
-
- MachineBasicBlock *Entry = fn.begin();
- for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
- DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
- DFI != E; ++DFI) {
- MachineBasicBlock *MBB = *DFI;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
- I != E; ) {
- MachineInstr *MI = &*I;
- ++I;
- if (MI->isImplicitDef()) {
- ImpDefMIs.push_back(MI);
- // Is this a sub-register read-modify-write?
- if (MI->getOperand(0).readsReg())
- continue;
- unsigned Reg = MI->getOperand(0).getReg();
- ImpDefRegs.insert(Reg);
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
- for (const uint16_t *SS = TRI->getSubRegisters(Reg); *SS; ++SS)
- ImpDefRegs.insert(*SS);
- }
+void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
+ DEBUG(dbgs() << "Processing " << *MI);
+ unsigned Reg = MI->getOperand(0).getReg();
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // For virtual registers, mark all uses as <undef>, and convert users to
+    // implicit-def when possible.
+ for (MachineRegisterInfo::use_nodbg_iterator UI =
+ MRI->use_nodbg_begin(Reg),
+ UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+ MachineOperand &MO = UI.getOperand();
+ MO.setIsUndef();
+ MachineInstr *UserMI = MO.getParent();
+ if (!canTurnIntoImplicitDef(UserMI))
continue;
- }
-
- // Eliminate %reg1032:sub<def> = COPY undef.
- if (MI->isCopy() && MI->getOperand(0).readsReg()) {
- MachineOperand &MO = MI->getOperand(1);
- if (MO.isUndef() || ImpDefRegs.count(MO.getReg())) {
- if (LV && MO.isKill()) {
- LiveVariables::VarInfo& vi = LV->getVarInfo(MO.getReg());
- vi.removeKill(MI);
- }
- unsigned Reg = MI->getOperand(0).getReg();
- MI->eraseFromParent();
- Changed = true;
-
- // A REG_SEQUENCE may have been expanded into partial definitions.
- // If this was the last one, mark Reg as implicitly defined.
- if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI->def_empty(Reg))
- ImpDefRegs.insert(Reg);
- continue;
- }
- }
-
- bool ChangedToImpDef = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand& MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.readsReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- if (!ImpDefRegs.count(Reg))
- continue;
- // Use is a copy, just turn it into an implicit_def.
- if (CanTurnIntoImplicitDef(MI, Reg, i, ImpDefRegs)) {
- bool isKill = MO.isKill();
- MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
- for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j)
- MI->RemoveOperand(j);
- if (isKill) {
- ImpDefRegs.erase(Reg);
- if (LV) {
- LiveVariables::VarInfo& vi = LV->getVarInfo(Reg);
- vi.removeKill(MI);
- }
- }
- ChangedToImpDef = true;
- Changed = true;
- break;
- }
-
- Changed = true;
- MO.setIsUndef();
- // This is a partial register redef of an implicit def.
- // Make sure the whole register is defined by the instruction.
- if (MO.isDef()) {
- MI->addRegisterDefined(Reg);
- continue;
- }
- if (MO.isKill() || MI->isRegTiedToDefOperand(i)) {
- // Make sure other reads of Reg are also marked <undef>.
- for (unsigned j = i+1; j != e; ++j) {
- MachineOperand &MOJ = MI->getOperand(j);
- if (MOJ.isReg() && MOJ.getReg() == Reg && MOJ.readsReg())
- MOJ.setIsUndef();
- }
- ImpDefRegs.erase(Reg);
- }
- }
-
- if (ChangedToImpDef) {
- // Backtrack to process this new implicit_def.
- --I;
- } else {
- for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
- MachineOperand& MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isDef())
- continue;
- ImpDefRegs.erase(MO.getReg());
- }
- }
+ DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI);
+ UserMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ WorkList.insert(UserMI);
}
+ MI->eraseFromParent();
+ return;
+ }
- // Any outstanding liveout implicit_def's?
- for (unsigned i = 0, e = ImpDefMIs.size(); i != e; ++i) {
- MachineInstr *MI = ImpDefMIs[i];
- unsigned Reg = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !ImpDefRegs.count(Reg)) {
- // Delete all "local" implicit_def's. That include those which define
- // physical registers since they cannot be liveout.
- MI->eraseFromParent();
- Changed = true;
+ // This is a physreg implicit-def.
+ // Look for the first instruction to use or define an alias.
+ MachineBasicBlock::instr_iterator UserMI = MI;
+ MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end();
+ bool Found = false;
+ for (++UserMI; UserMI != UserE; ++UserMI) {
+ for (MIOperands MO(UserMI); MO.isValid(); ++MO) {
+ if (!MO->isReg())
continue;
- }
-
- // If there are multiple defs of the same register and at least one
- // is not an implicit_def, do not insert implicit_def's before the
- // uses.
- bool Skip = false;
- SmallVector<MachineInstr*, 4> DeadImpDefs;
- for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(Reg),
- DE = MRI->def_end(); DI != DE; ++DI) {
- MachineInstr *DeadImpDef = &*DI;
- if (!DeadImpDef->isImplicitDef()) {
- Skip = true;
- break;
- }
- DeadImpDefs.push_back(DeadImpDef);
- }
- if (Skip)
+ unsigned UserReg = MO->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(UserReg) ||
+ !TRI->regsOverlap(Reg, UserReg))
continue;
+ // UserMI uses or redefines Reg. Set <undef> flags on all uses.
+ Found = true;
+ if (MO->isUse())
+ MO->setIsUndef();
+ }
+ if (Found)
+ break;
+ }
- // The only implicit_def which we want to keep are those that are live
- // out of its block.
- for (unsigned j = 0, ee = DeadImpDefs.size(); j != ee; ++j)
- DeadImpDefs[j]->eraseFromParent();
- Changed = true;
-
- // Process each use instruction once.
- for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
- UE = MRI->use_end(); UI != UE; ++UI) {
- if (UI.getOperand().isUndef())
- continue;
- MachineInstr *RMI = &*UI;
- if (ModInsts.insert(RMI))
- RUses.push_back(RMI);
- }
+ // If we found the using MI, we can erase the IMPLICIT_DEF.
+ if (Found) {
+ DEBUG(dbgs() << "Physreg user: " << *UserMI);
+ MI->eraseFromParent();
+ return;
+ }
- for (unsigned i = 0, e = RUses.size(); i != e; ++i) {
- MachineInstr *RMI = RUses[i];
+  // The using instruction wasn't found; it could be in another block.
+  // Leave the physreg IMPLICIT_DEF, but trim any extra operands.
+ for (unsigned i = MI->getNumOperands() - 1; i; --i)
+ MI->RemoveOperand(i);
+ DEBUG(dbgs() << "Keeping physreg: " << *MI);
+}
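
A worked example of the virtual-register path (pseudo machine IR; a sketch,
not verifier-exact syntax):

  %a = IMPLICIT_DEF           ; erased by processImplicitDef
  %b = COPY %a<undef>         ; use marked <undef>; the COPY now reads
                              ; nothing, so it becomes IMPLICIT_DEF and is
                              ; pushed on WorkList for recursive processing
  %c = ADD %b<undef>, %x      ; use marked <undef> but the ADD survives:
                              ; it still reads a real value, %x
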
- // Turn a copy use into an implicit_def.
- if (isUndefCopy(RMI, Reg, ImpDefRegs)) {
- RMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+/// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into
+/// <undef> operands.
+bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
- bool isKill = false;
- SmallVector<unsigned, 4> Ops;
- for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) {
- MachineOperand &RRMO = RMI->getOperand(j);
- if (RRMO.isReg() && RRMO.getReg() == Reg) {
- Ops.push_back(j);
- if (RRMO.isKill())
- isKill = true;
- }
- }
- // Leave the other operands along.
- for (unsigned j = 0, ee = Ops.size(); j != ee; ++j) {
- unsigned OpIdx = Ops[j];
- RMI->RemoveOperand(OpIdx-j);
- }
+ DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
+ << "********** Function: "
+ << ((Value*)MF.getFunction())->getName() << '\n');
- // Update LiveVariables varinfo if the instruction is a kill.
- if (LV && isKill) {
- LiveVariables::VarInfo& vi = LV->getVarInfo(Reg);
- vi.removeKill(RMI);
- }
- continue;
- }
+ bool Changed = false;
- // Replace Reg with a new vreg that's marked implicit.
- const TargetRegisterClass* RC = MRI->getRegClass(Reg);
- unsigned NewVReg = MRI->createVirtualRegister(RC);
- bool isKill = true;
- for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) {
- MachineOperand &RRMO = RMI->getOperand(j);
- if (RRMO.isReg() && RRMO.getReg() == Reg) {
- RRMO.setReg(NewVReg);
- RRMO.setIsUndef();
- if (isKill) {
- // Only the first operand of NewVReg is marked kill.
- RRMO.setIsKill();
- isKill = false;
- }
- }
- }
- }
- RUses.clear();
- ModInsts.clear();
- }
- ImpDefRegs.clear();
- ImpDefMIs.clear();
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form.");
+ assert(WorkList.empty() && "Inconsistent worklist state");
+
+ for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end();
+ MFI != MFE; ++MFI) {
+ // Scan the basic block for implicit defs.
+ for (MachineBasicBlock::instr_iterator MBBI = MFI->instr_begin(),
+ MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI)
+ if (MBBI->isImplicitDef())
+ WorkList.insert(MBBI);
+
+ if (WorkList.empty())
+ continue;
+
+ DEBUG(dbgs() << "BB#" << MFI->getNumber() << " has " << WorkList.size()
+ << " implicit defs.\n");
+ Changed = true;
+
+ // Drain the WorkList to recursively process any new implicit defs.
+ do processImplicitDef(WorkList.pop_back_val());
+ while (!WorkList.empty());
}
-
return Changed;
}
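
The pass is now a classic worklist fixpoint: handling one IMPLICIT_DEF can
convert its users into new IMPLICIT_DEFs, which land back on the same list
and are drained before moving to the next block. The control shape in
self-contained C++ (a vector plus a set standing in for SmallSetVector):

  #include <set>
  #include <vector>

  struct Item { int Id; };

  // Stub: a real pass would convert users and hand them back as new work.
  static std::vector<Item> process(const Item &) { return {}; }

  static void drain(std::vector<Item> Work) {
    std::set<int> Seen;                  // the "set" half of a SetVector
    while (!Work.empty()) {
      Item I = Work.back();
      Work.pop_back();
      for (const Item &N : process(I))
        if (Seen.insert(N.Id).second)    // de-duplicate like SmallSetVector
          Work.push_back(N);
    }
  }
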
-
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 458915e..c791ffb 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -302,7 +302,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
MachineBasicBlock::iterator I;
- if (! ShrinkWrapThisFunction) {
+ if (!ShrinkWrapThisFunction) {
// Spill using target interface.
I = EntryBlock->begin();
if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index b00eceb..993dbc7 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -14,6 +14,7 @@
#define DEBUG_TYPE "regalloc"
#include "RegAllocBase.h"
+#include "LiveRegMatrix.h"
#include "Spiller.h"
#include "VirtRegMap.h"
#include "llvm/ADT/Statistic.h"
@@ -34,8 +35,6 @@
using namespace llvm;
-STATISTIC(NumAssigned , "Number of registers assigned");
-STATISTIC(NumUnassigned , "Number of registers unassigned");
STATISTIC(NumNewQueued , "Number of new live ranges queued");
// Temporary verification option until we can put verification inside
@@ -47,85 +46,20 @@ VerifyRegAlloc("verify-regalloc", cl::location(RegAllocBase::VerifyEnabled),
const char *RegAllocBase::TimerGroupName = "Register Allocation";
bool RegAllocBase::VerifyEnabled = false;
-#ifndef NDEBUG
-// Verify each LiveIntervalUnion.
-void RegAllocBase::verify() {
- LiveVirtRegBitSet VisitedVRegs;
- OwningArrayPtr<LiveVirtRegBitSet>
- unionVRegs(new LiveVirtRegBitSet[PhysReg2LiveUnion.numRegs()]);
-
- // Verify disjoint unions.
- for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) {
- DEBUG(PhysReg2LiveUnion[PhysReg].print(dbgs(), TRI));
- LiveVirtRegBitSet &VRegs = unionVRegs[PhysReg];
- PhysReg2LiveUnion[PhysReg].verify(VRegs);
- // Union + intersection test could be done efficiently in one pass, but
- // don't add a method to SparseBitVector unless we really need it.
- assert(!VisitedVRegs.intersects(VRegs) && "vreg in multiple unions");
- VisitedVRegs |= VRegs;
- }
-
- // Verify vreg coverage.
- for (LiveIntervals::iterator liItr = LIS->begin(), liEnd = LIS->end();
- liItr != liEnd; ++liItr) {
- unsigned reg = liItr->first;
- if (TargetRegisterInfo::isPhysicalRegister(reg)) continue;
- if (!VRM->hasPhys(reg)) continue; // spilled?
- unsigned PhysReg = VRM->getPhys(reg);
- if (!unionVRegs[PhysReg].test(reg)) {
- dbgs() << "LiveVirtReg " << reg << " not in union " <<
- TRI->getName(PhysReg) << "\n";
- llvm_unreachable("unallocated live vreg");
- }
- }
- // FIXME: I'm not sure how to verify spilled intervals.
-}
-#endif //!NDEBUG
-
//===----------------------------------------------------------------------===//
// RegAllocBase Implementation
//===----------------------------------------------------------------------===//
-// Instantiate a LiveIntervalUnion for each physical register.
-void RegAllocBase::LiveUnionArray::init(LiveIntervalUnion::Allocator &allocator,
- unsigned NRegs) {
- NumRegs = NRegs;
- Array =
- static_cast<LiveIntervalUnion*>(malloc(sizeof(LiveIntervalUnion)*NRegs));
- for (unsigned r = 0; r != NRegs; ++r)
- new(Array + r) LiveIntervalUnion(r, allocator);
-}
-
-void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis) {
- NamedRegionTimer T("Initialize", TimerGroupName, TimePassesIsEnabled);
+void RegAllocBase::init(VirtRegMap &vrm,
+ LiveIntervals &lis,
+ LiveRegMatrix &mat) {
TRI = &vrm.getTargetRegInfo();
MRI = &vrm.getRegInfo();
VRM = &vrm;
LIS = &lis;
+ Matrix = &mat;
MRI->freezeReservedRegs(vrm.getMachineFunction());
RegClassInfo.runOnMachineFunction(vrm.getMachineFunction());
-
- const unsigned NumRegs = TRI->getNumRegs();
- if (NumRegs != PhysReg2LiveUnion.numRegs()) {
- PhysReg2LiveUnion.init(UnionAllocator, NumRegs);
- // Cache an interferece query for each physical reg
- Queries.reset(new LiveIntervalUnion::Query[PhysReg2LiveUnion.numRegs()]);
- }
-}
-
-void RegAllocBase::LiveUnionArray::clear() {
- if (!Array)
- return;
- for (unsigned r = 0; r != NumRegs; ++r)
- Array[r].~LiveIntervalUnion();
- free(Array);
- NumRegs = 0;
- Array = 0;
-}
-
-void RegAllocBase::releaseMemory() {
- for (unsigned r = 0, e = PhysReg2LiveUnion.numRegs(); r != e; ++r)
- PhysReg2LiveUnion[r].clear();
}
// Visit all the live registers. If they are already assigned to a physical
@@ -133,35 +67,14 @@ void RegAllocBase::releaseMemory() {
// them on the priority queue for later assignment.
void RegAllocBase::seedLiveRegs() {
NamedRegionTimer T("Seed Live Regs", TimerGroupName, TimePassesIsEnabled);
- for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end(); I != E; ++I) {
- unsigned RegNum = I->first;
- LiveInterval &VirtReg = *I->second;
- if (TargetRegisterInfo::isPhysicalRegister(RegNum))
- PhysReg2LiveUnion[RegNum].unify(VirtReg);
- else
- enqueue(&VirtReg);
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ enqueue(&LIS->getInterval(Reg));
}
}
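
seedLiveRegs no longer touches the physreg side of LiveIntervals at all; it
enumerates virtual registers densely by index and skips any with no remaining
non-debug uses. The index scheme this relies on is simple: a virtual register
number is the dense index with the top bit set. A self-contained sketch of
that encoding (mirroring this era's TargetRegisterInfo helpers; the exact
layout is stated here as an assumption):

  #include <cassert>
  #include <cstdint>

  static const uint32_t VirtRegFlag = 1u << 31;

  static uint32_t index2VirtReg(uint32_t Index) { return Index | VirtRegFlag; }
  static bool isVirtualRegister(uint32_t Reg) { return (Reg & VirtRegFlag) != 0; }
  static uint32_t virtReg2Index(uint32_t Reg) {
    assert(isVirtualRegister(Reg) && "not a virtual register");
    return Reg & ~VirtRegFlag;
  }
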
-void RegAllocBase::assign(LiveInterval &VirtReg, unsigned PhysReg) {
- DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI)
- << " to " << PrintReg(PhysReg, TRI) << '\n');
- assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
- VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
- MRI->setPhysRegUsed(PhysReg);
- PhysReg2LiveUnion[PhysReg].unify(VirtReg);
- ++NumAssigned;
-}
-
-void RegAllocBase::unassign(LiveInterval &VirtReg, unsigned PhysReg) {
- DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI)
- << " from " << PrintReg(PhysReg, TRI) << '\n');
- assert(VRM->getPhys(VirtReg.reg) == PhysReg && "Inconsistent unassign");
- PhysReg2LiveUnion[PhysReg].extract(VirtReg);
- VRM->clearVirt(VirtReg.reg);
- ++NumUnassigned;
-}
-
// Top-level driver to manage the queue of unassigned VirtRegs and call the
// selectOrSplit implementation.
void RegAllocBase::allocatePhysRegs() {
@@ -179,14 +92,14 @@ void RegAllocBase::allocatePhysRegs() {
}
// Invalidate all interference queries, live ranges could have changed.
- invalidateVirtRegs();
+ Matrix->invalidateVirtRegs();
// selectOrSplit requests the allocator to return an available physical
// register if possible and populate a list of new live intervals that
// result from splitting.
DEBUG(dbgs() << "\nselectOrSplit "
<< MRI->getRegClass(VirtReg->reg)->getName()
- << ':' << *VirtReg << '\n');
+ << ':' << PrintReg(VirtReg->reg) << ' ' << *VirtReg << '\n');
typedef SmallVector<LiveInterval*, 4> VirtRegVec;
VirtRegVec SplitVRegs;
unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
@@ -211,7 +124,7 @@ void RegAllocBase::allocatePhysRegs() {
}
if (AvailablePhysReg)
- assign(*VirtReg, AvailablePhysReg);
+ Matrix->assign(*VirtReg, AvailablePhysReg);
for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end();
I != E; ++I) {
@@ -230,51 +143,3 @@ void RegAllocBase::allocatePhysRegs() {
}
}
}
-
-// Check if this live virtual register interferes with a physical register. If
-// not, then check for interference on each register that aliases with the
-// physical register. Return the interfering register.
-unsigned RegAllocBase::checkPhysRegInterference(LiveInterval &VirtReg,
- unsigned PhysReg) {
- for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI)
- if (query(VirtReg, *AliasI).checkInterference())
- return *AliasI;
- return 0;
-}
-
-// Add newly allocated physical registers to the MBB live in sets.
-void RegAllocBase::addMBBLiveIns(MachineFunction *MF) {
- NamedRegionTimer T("MBB Live Ins", TimerGroupName, TimePassesIsEnabled);
- SlotIndexes *Indexes = LIS->getSlotIndexes();
- if (MF->size() <= 1)
- return;
-
- LiveIntervalUnion::SegmentIter SI;
- for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) {
- LiveIntervalUnion &LiveUnion = PhysReg2LiveUnion[PhysReg];
- if (LiveUnion.empty())
- continue;
- DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " live-in:");
- MachineFunction::iterator MBB = llvm::next(MF->begin());
- MachineFunction::iterator MFE = MF->end();
- SlotIndex Start, Stop;
- tie(Start, Stop) = Indexes->getMBBRange(MBB);
- SI.setMap(LiveUnion.getMap());
- SI.find(Start);
- while (SI.valid()) {
- if (SI.start() <= Start) {
- if (!MBB->isLiveIn(PhysReg))
- MBB->addLiveIn(PhysReg);
- DEBUG(dbgs() << "\tBB#" << MBB->getNumber() << ':'
- << PrintReg(SI.value()->reg, TRI));
- } else if (SI.start() > Stop)
- MBB = Indexes->getMBBFromIndex(SI.start().getPrevIndex());
- if (++MBB == MFE)
- break;
- tie(Start, Stop) = Indexes->getMBBRange(MBB);
- SI.advanceTo(Start);
- }
- DEBUG(dbgs() << '\n');
- }
-}
-
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index 072fe2b..db0c8e1 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -37,9 +37,9 @@
#ifndef LLVM_CODEGEN_REGALLOCBASE
#define LLVM_CODEGEN_REGALLOCBASE
-#include "llvm/ADT/OwningPtr.h"
#include "LiveIntervalUnion.h"
-#include "RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/ADT/OwningPtr.h"
namespace llvm {
@@ -47,6 +47,7 @@ template<typename T> class SmallVectorImpl;
class TargetRegisterInfo;
class VirtRegMap;
class LiveIntervals;
+class LiveRegMatrix;
class Spiller;
/// RegAllocBase provides the register allocation driver and interface that can
@@ -56,69 +57,20 @@ class Spiller;
/// live range splitting. They must also override enqueue/dequeue to provide an
/// assignment order.
class RegAllocBase {
- LiveIntervalUnion::Allocator UnionAllocator;
-
- // Cache tag for PhysReg2LiveUnion entries. Increment whenever virtual
- // registers may have changed.
- unsigned UserTag;
-
- // Array of LiveIntervalUnions indexed by physical register.
- class LiveUnionArray {
- unsigned NumRegs;
- LiveIntervalUnion *Array;
- public:
- LiveUnionArray(): NumRegs(0), Array(0) {}
- ~LiveUnionArray() { clear(); }
-
- unsigned numRegs() const { return NumRegs; }
-
- void init(LiveIntervalUnion::Allocator &, unsigned NRegs);
-
- void clear();
-
- LiveIntervalUnion& operator[](unsigned PhysReg) {
- assert(PhysReg < NumRegs && "physReg out of bounds");
- return Array[PhysReg];
- }
- };
-
- LiveUnionArray PhysReg2LiveUnion;
-
- // Current queries, one per physreg. They must be reinitialized each time we
- // query on a new live virtual register.
- OwningArrayPtr<LiveIntervalUnion::Query> Queries;
-
protected:
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
VirtRegMap *VRM;
LiveIntervals *LIS;
+ LiveRegMatrix *Matrix;
RegisterClassInfo RegClassInfo;
- RegAllocBase(): UserTag(0), TRI(0), MRI(0), VRM(0), LIS(0) {}
+ RegAllocBase(): TRI(0), MRI(0), VRM(0), LIS(0), Matrix(0) {}
virtual ~RegAllocBase() {}
// A RegAlloc pass should call this before allocatePhysRegs.
- void init(VirtRegMap &vrm, LiveIntervals &lis);
-
- // Get an initialized query to check interferences between lvr and preg. Note
- // that Query::init must be called at least once for each physical register
- // before querying a new live virtual register. This ties Queries and
- // PhysReg2LiveUnion together.
- LiveIntervalUnion::Query &query(LiveInterval &VirtReg, unsigned PhysReg) {
- Queries[PhysReg].init(UserTag, &VirtReg, &PhysReg2LiveUnion[PhysReg]);
- return Queries[PhysReg];
- }
-
- // Get direct access to the underlying LiveIntervalUnion for PhysReg.
- LiveIntervalUnion &getLiveUnion(unsigned PhysReg) {
- return PhysReg2LiveUnion[PhysReg];
- }
-
- // Invalidate all cached information about virtual registers - live ranges may
- // have changed.
- void invalidateVirtRegs() { ++UserTag; }
+ void init(VirtRegMap &vrm, LiveIntervals &lis, LiveRegMatrix &mat);
// The top-level driver. The output is a VirtRegMap that is updated with
// physical register assignments.
@@ -140,31 +92,6 @@ protected:
virtual unsigned selectOrSplit(LiveInterval &VirtReg,
SmallVectorImpl<LiveInterval*> &splitLVRs) = 0;
- // A RegAlloc pass should call this when PassManager releases its memory.
- virtual void releaseMemory();
-
- // Helper for checking interference between a live virtual register and a
- // physical register, including all its register aliases. If an interference
- // exists, return the interfering register, which may be preg or an alias.
- unsigned checkPhysRegInterference(LiveInterval& VirtReg, unsigned PhysReg);
-
- /// assign - Assign VirtReg to PhysReg.
- /// This should not be called from selectOrSplit for the current register.
- void assign(LiveInterval &VirtReg, unsigned PhysReg);
-
- /// unassign - Undo a previous assignment of VirtReg to PhysReg.
- /// This can be invoked from selectOrSplit, but be careful to guarantee that
- /// allocation is making progress.
- void unassign(LiveInterval &VirtReg, unsigned PhysReg);
-
- /// addMBBLiveIns - Add physreg liveins to basic blocks.
- void addMBBLiveIns(MachineFunction *);
-
-#ifndef NDEBUG
- // Verify each LiveIntervalUnion.
- void verify();
-#endif
-
// Use this group name for NamedRegionTimer.
static const char *TimerGroupName;
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index 77ee314..3a03807 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -13,11 +13,12 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "regalloc"
+#include "AllocationOrder.h"
#include "RegAllocBase.h"
#include "LiveDebugVariables.h"
-#include "RenderMachineFunction.h"
#include "Spiller.h"
#include "VirtRegMap.h"
+#include "LiveRegMatrix.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Function.h"
#include "llvm/PassAnalysisSupport.h"
@@ -64,10 +65,6 @@ class RABasic : public MachineFunctionPass, public RegAllocBase
// context
MachineFunction *MF;
- // analyses
- LiveStacks *LS;
- RenderMachineFunction *RMF;
-
// state
std::auto_ptr<Spiller> SpillerInstance;
std::priority_queue<LiveInterval*, std::vector<LiveInterval*>,
@@ -118,9 +115,6 @@ public:
bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
SmallVectorImpl<LiveInterval*> &SplitVRegs);
- void spillReg(LiveInterval &VirtReg, unsigned PhysReg,
- SmallVectorImpl<LiveInterval*> &SplitVRegs);
-
static char ID;
};
@@ -139,7 +133,7 @@ RABasic::RABasic(): MachineFunctionPass(ID) {
initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
- initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry());
+ initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
}
void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -147,6 +141,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AliasAnalysis>();
AU.addPreserved<AliasAnalysis>();
AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
AU.addPreserved<LiveDebugVariables>();
@@ -159,41 +154,15 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MachineLoopInfo>();
AU.addRequired<VirtRegMap>();
AU.addPreserved<VirtRegMap>();
- DEBUG(AU.addRequired<RenderMachineFunction>());
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<LiveRegMatrix>();
MachineFunctionPass::getAnalysisUsage(AU);
}
void RABasic::releaseMemory() {
SpillerInstance.reset(0);
- RegAllocBase::releaseMemory();
}
-// Helper for spillInterferences() that spills all interfering vregs currently
-// assigned to this physical register.
-void RABasic::spillReg(LiveInterval& VirtReg, unsigned PhysReg,
- SmallVectorImpl<LiveInterval*> &SplitVRegs) {
- LiveIntervalUnion::Query &Q = query(VirtReg, PhysReg);
- assert(Q.seenAllInterferences() && "need collectInterferences()");
- const SmallVectorImpl<LiveInterval*> &PendingSpills = Q.interferingVRegs();
-
- for (SmallVectorImpl<LiveInterval*>::const_iterator I = PendingSpills.begin(),
- E = PendingSpills.end(); I != E; ++I) {
- LiveInterval &SpilledVReg = **I;
- DEBUG(dbgs() << "extracting from " <<
- TRI->getName(PhysReg) << " " << SpilledVReg << '\n');
-
- // Deallocate the interfering vreg by removing it from the union.
- // A LiveInterval instance may not be in a union during modification!
- unassign(SpilledVReg, PhysReg);
-
- // Spill the extracted interval.
- LiveRangeEdit LRE(SpilledVReg, SplitVRegs, *MF, *LIS, VRM);
- spiller().spill(LRE);
- }
- // After extracting segments, the query's results are invalid. But keep the
- // contents valid until we're done accessing pendingSpills.
- Q.clear();
-}
// Spill or split all live virtual registers currently unified under PhysReg
// that interfere with VirtReg. The newly spilled or split live intervals are
@@ -202,22 +171,41 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
SmallVectorImpl<LiveInterval*> &SplitVRegs) {
// Record each interference and determine if all are spillable before mutating
// either the union or live intervals.
- unsigned NumInterferences = 0;
+ SmallVector<LiveInterval*, 8> Intfs;
+
// Collect interferences assigned to any alias of the physical register.
- for (const uint16_t *asI = TRI->getOverlaps(PhysReg); *asI; ++asI) {
- LiveIntervalUnion::Query &QAlias = query(VirtReg, *asI);
- NumInterferences += QAlias.collectInterferingVRegs();
- if (QAlias.seenUnspillableVReg()) {
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+ Q.collectInterferingVRegs();
+ if (Q.seenUnspillableVReg())
return false;
+ for (unsigned i = Q.interferingVRegs().size(); i; --i) {
+ LiveInterval *Intf = Q.interferingVRegs()[i - 1];
+ if (!Intf->isSpillable() || Intf->weight > VirtReg.weight)
+ return false;
+ Intfs.push_back(Intf);
}
}
DEBUG(dbgs() << "spilling " << TRI->getName(PhysReg) <<
" interferences with " << VirtReg << "\n");
- assert(NumInterferences > 0 && "expect interference");
+ assert(!Intfs.empty() && "expected interference");
// Spill each interfering vreg allocated to PhysReg or an alias.
- for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI)
- spillReg(VirtReg, *AliasI, SplitVRegs);
+ for (unsigned i = 0, e = Intfs.size(); i != e; ++i) {
+ LiveInterval &Spill = *Intfs[i];
+
+ // Skip duplicates.
+ if (!VRM->hasPhys(Spill.reg))
+ continue;
+
+ // Deallocate the interfering vreg by removing it from the union.
+ // A LiveInterval instance may not be in a union during modification!
+ Matrix->unassign(Spill);
+
+ // Spill the extracted interval.
+ LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM);
+ spiller().spill(LRE);
+ }
return true;
}
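
Note the two-phase structure: every interfering LiveInterval is collected and
vetted first, and only then unassigned and spilled, because unassigning
mutates the very unions the queries walk. The same collect-then-mutate shape,
self-contained (toy Range type, illustrative only):

  #include <vector>

  struct Range { bool Spillable; float Weight; bool Assigned; };

  // Sketch: validate everything before touching anything.
  static bool spillAll(const std::vector<Range*> &Interfering,
                       float OurWeight) {
    for (const Range *R : Interfering)
      if (!R->Spillable || R->Weight > OurWeight)
        return false;               // phase 1: any blocker aborts untouched
    for (Range *R : Interfering) {
      if (!R->Assigned)
        continue;                   // duplicates: one range, several units
      R->Assigned = false;          // phase 2: unassign, then spill for real
    }
    return true;
  }
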
@@ -235,49 +223,36 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
// selectOrSplit().
unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
SmallVectorImpl<LiveInterval*> &SplitVRegs) {
- // Check for register mask interference. When live ranges cross calls, the
- // set of usable registers is reduced to the callee-saved ones.
- bool CrossRegMasks = LIS->checkRegMaskInterference(VirtReg, UsableRegs);
-
// Populate a list of physical register spill candidates.
SmallVector<unsigned, 8> PhysRegSpillCands;
// Check for an available register in this class.
- ArrayRef<unsigned> Order =
- RegClassInfo.getOrder(MRI->getRegClass(VirtReg.reg));
- for (ArrayRef<unsigned>::iterator I = Order.begin(), E = Order.end(); I != E;
- ++I) {
- unsigned PhysReg = *I;
-
- // If PhysReg is clobbered by a register mask, it isn't useful for
- // allocation or spilling.
- if (CrossRegMasks && !UsableRegs.test(PhysReg))
- continue;
-
- // Check interference and as a side effect, intialize queries for this
- // VirtReg and its aliases.
- unsigned interfReg = checkPhysRegInterference(VirtReg, PhysReg);
- if (interfReg == 0) {
- // Found an available register.
+ AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
+ while (unsigned PhysReg = Order.next()) {
+    // Check for interference in PhysReg.
+ switch (Matrix->checkInterference(VirtReg, PhysReg)) {
+ case LiveRegMatrix::IK_Free:
+ // PhysReg is available, allocate it.
return PhysReg;
- }
- LiveIntervalUnion::Query &IntfQ = query(VirtReg, interfReg);
- IntfQ.collectInterferingVRegs(1);
- LiveInterval *interferingVirtReg = IntfQ.interferingVRegs().front();
- // The current VirtReg must either be spillable, or one of its interferences
- // must have less spill weight.
- if (interferingVirtReg->weight < VirtReg.weight ) {
+ case LiveRegMatrix::IK_VirtReg:
+ // Only virtual registers in the way, we may be able to spill them.
PhysRegSpillCands.push_back(PhysReg);
+ continue;
+
+ default:
+ // RegMask or RegUnit interference.
+ continue;
}
}
+
// Try to spill another interfering reg with less spill weight.
for (SmallVectorImpl<unsigned>::iterator PhysRegI = PhysRegSpillCands.begin(),
- PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) {
-
- if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs)) continue;
+ PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) {
+ if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs))
+ continue;
- assert(checkPhysRegInterference(VirtReg, *PhysRegI) == 0 &&
+ assert(!Matrix->checkInterference(VirtReg, *PhysRegI) &&
"Interference after spill.");
// Tell the caller to allocate to this newly freed physical register.
return *PhysRegI;
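
Allocation now keys off a three-way interference verdict per candidate
register: free, blocked only by virtual registers (maybe evictable), or
blocked by register masks/units (hopeless). The dispatch skeleton in
self-contained C++ (InterferenceKind and the callback are stand-ins for
LiveRegMatrix, not its real API):

  #include <cstdint>
  #include <vector>

  enum class InterferenceKind { Free, VirtReg, RegMaskOrUnit };

  template <typename CheckFn>
  static uint32_t pickRegister(const std::vector<uint32_t> &Order,
                               std::vector<uint32_t> &SpillCands,
                               CheckFn checkInterference) {
    for (uint32_t PhysReg : Order) {
      switch (checkInterference(PhysReg)) {
      case InterferenceKind::Free:
        return PhysReg;                 // allocate immediately
      case InterferenceKind::VirtReg:
        SpillCands.push_back(PhysReg);  // revisit: eviction may free it
        break;
      case InterferenceKind::RegMaskOrUnit:
        break;                          // fixed interference: skip
      }
    }
    return 0;                           // caller falls back to spilling
  }
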
@@ -287,7 +262,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
if (!VirtReg.isSpillable())
return ~0u;
- LiveRangeEdit LRE(VirtReg, SplitVRegs, *MF, *LIS, VRM);
+ LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM);
spiller().spill(LRE);
// The live virtual register requesting allocation was spilled, so tell
@@ -301,53 +276,17 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
<< ((Value*)mf.getFunction())->getName() << '\n');
MF = &mf;
- DEBUG(RMF = &getAnalysis<RenderMachineFunction>());
-
- RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>());
+ RegAllocBase::init(getAnalysis<VirtRegMap>(),
+ getAnalysis<LiveIntervals>(),
+ getAnalysis<LiveRegMatrix>());
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
allocatePhysRegs();
- addMBBLiveIns(MF);
-
// Diagnostic output before rewriting
DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
- // optional HTML output
- DEBUG(RMF->renderMachineFunction("After basic register allocation.", VRM));
-
- // FIXME: Verification currently must run before VirtRegRewriter. We should
- // make the rewriter a separate pass and override verifyAnalysis instead. When
- // that happens, verification naturally falls under VerifyMachineCode.
-#ifndef NDEBUG
- if (VerifyEnabled) {
- // Verify accuracy of LiveIntervals. The standard machine code verifier
- // ensures that each LiveIntervals covers all uses of the virtual reg.
-
- // FIXME: MachineVerifier is badly broken when using the standard
- // spiller. Always use -spiller=inline with -verify-regalloc. Even with the
- // inline spiller, some tests fail to verify because the coalescer does not
- // always generate verifiable code.
- MF->verify(this, "In RABasic::verify");
-
- // Verify that LiveIntervals are partitioned into unions and disjoint within
- // the unions.
- verify();
- }
-#endif // !NDEBUG
-
- // Run rewriter
- VRM->rewrite(LIS->getSlotIndexes());
-
- // Write out new DBG_VALUE instructions.
- getAnalysis<LiveDebugVariables>().emitDebugValues(VRM);
-
- // All machine operands and other references to virtual registers have been
- // replaced. Remove the virtual registers and release all the transient data.
- VRM->clearAllVirt();
- MRI->clearVirtRegs();
releaseMemory();
-
return true;
}
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index e09b7f8..6b3a48e 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "regalloc"
-#include "RegisterClassInfo.h"
#include "llvm/BasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -22,6 +21,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/CommandLine.h"
@@ -77,7 +77,7 @@ namespace {
explicit LiveReg(unsigned v)
: LastUse(0), VirtReg(v), PhysReg(0), LastOpNum(0), Dirty(false) {}
- unsigned getSparseSetKey() const {
+ unsigned getSparseSetIndex() const {
return TargetRegisterInfo::virtReg2Index(VirtReg);
}
};
@@ -201,20 +201,16 @@ int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
/// its virtual register, and it is guaranteed to be a block-local register.
///
bool RAFast::isLastUseOfLocalReg(MachineOperand &MO) {
- // Check for non-debug uses or defs following MO.
- // This is the most likely way to fail - fast path it.
- MachineOperand *Next = &MO;
- while ((Next = Next->getNextOperandForReg()))
- if (!Next->isDebug())
- return false;
-
// If the register has ever been spilled or reloaded, we conservatively assume
// it is a global register used in multiple blocks.
if (StackSlotForVirtReg[MO.getReg()] != -1)
return false;
// Check that the use/def chain has exactly one operand - MO.
- return &MRI->reg_nodbg_begin(MO.getReg()).getOperand() == &MO;
+ MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(MO.getReg());
+ if (&I.getOperand() != &MO)
+ return false;
+ return ++I == MRI->reg_nodbg_end();
}
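
The rewritten test asks the register's use/def chain directly: MO must be the
first non-debug operand on the chain, and incrementing the iterator must hit
the end, i.e. MO is the chain's sole element. The same shape over a standard
list, self-contained:

  #include <list>

  // Sketch: "is It the sole element of its chain?" — the form of the new
  // reg_nodbg_begin()/reg_nodbg_end() test above.
  template <typename T>
  static bool isSoleElement(const std::list<T> &Chain,
                            typename std::list<T>::const_iterator It) {
    if (It != Chain.begin())
      return false;                // something precedes it
    return ++It == Chain.end();    // and nothing may follow it
  }
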
/// addKillFlag - Set kill flags on last use of a virtual register.
@@ -354,8 +350,8 @@ void RAFast::usePhysReg(MachineOperand &MO) {
}
// Maybe a superregister is reserved?
- for (const uint16_t *AS = TRI->getAliasSet(PhysReg);
- unsigned Alias = *AS; ++AS) {
+ for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned Alias = *AI;
switch (PhysRegState[Alias]) {
case regDisabled:
break;
@@ -408,8 +404,8 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
// This is a disabled register, disable all aliases.
PhysRegState[PhysReg] = NewState;
- for (const uint16_t *AS = TRI->getAliasSet(PhysReg);
- unsigned Alias = *AS; ++AS) {
+ for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned Alias = *AI;
switch (unsigned VirtReg = PhysRegState[Alias]) {
case regDisabled:
break;
@@ -456,8 +452,8 @@ unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
// This is a disabled register, add up cost of aliases.
DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is disabled.\n");
unsigned Cost = 0;
- for (const uint16_t *AS = TRI->getAliasSet(PhysReg);
- unsigned Alias = *AS; ++AS) {
+ for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned Alias = *AI;
if (UsedInInstr.test(Alias))
return spillImpossible;
switch (unsigned VirtReg = PhysRegState[Alias]) {
@@ -659,9 +655,10 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
// Return true if the operand kills its register.
bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) {
MachineOperand &MO = MI->getOperand(OpNum);
+ bool Dead = MO.isDead();
if (!MO.getSubReg()) {
MO.setReg(PhysReg);
- return MO.isKill() || MO.isDead();
+ return MO.isKill() || Dead;
}
// Handle subregister index.
@@ -674,7 +671,13 @@ bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) {
MI->addRegisterKilled(PhysReg, TRI, true);
return true;
}
- return MO.isDead();
+
+ // A <def,read-undef> of a sub-register requires an implicit def of the full
+ // register.
+ if (MO.isDef() && MO.isUndef())
+ MI->addRegisterDefined(PhysReg, TRI);
+
+ return Dead;
}
// Handle special instruction operand like early clobbers and tied ops when
@@ -704,13 +707,10 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
if (!MO.isReg() || !MO.isDef()) continue;
unsigned Reg = MO.getReg();
if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
- UsedInInstr.set(Reg);
- if (ThroughRegs.count(PhysRegState[Reg]))
- definePhysReg(MI, Reg, regFree);
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
- UsedInInstr.set(*AS);
- if (ThroughRegs.count(PhysRegState[*AS]))
- definePhysReg(MI, *AS, regFree);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ UsedInInstr.set(*AI);
+ if (ThroughRegs.count(PhysRegState[*AI]))
+ definePhysReg(MI, *AI, regFree);
}
}
@@ -1029,9 +1029,8 @@ void RAFast::AllocateBasicBlock() {
if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
// Look for physreg defs and tied uses.
if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue;
- UsedInInstr.set(Reg);
- for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS)
- UsedInInstr.set(*AS);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ UsedInInstr.set(*AI);
}
}
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 3f2a617..6ac5428 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -16,6 +16,7 @@
#include "AllocationOrder.h"
#include "InterferenceCache.h"
#include "LiveDebugVariables.h"
+#include "LiveRegMatrix.h"
#include "RegAllocBase.h"
#include "Spiller.h"
#include "SpillPlacement.h"
@@ -73,7 +74,6 @@ class RAGreedy : public MachineFunctionPass,
// analyses
SlotIndexes *Indexes;
- LiveStacks *LS;
MachineDominatorTree *DomTree;
MachineLoopInfo *Loops;
EdgeBundles *Bundles;
@@ -168,19 +168,6 @@ class RAGreedy : public MachineFunctionPass,
}
};
- // Register mask interference. The current VirtReg is checked for register
- // mask interference on entry to selectOrSplit(). If there is no
- // interference, UsableRegs is left empty. If there is interference,
- // UsableRegs has a bit mask of registers that can be used without register
- // mask interference.
- BitVector UsableRegs;
-
- /// clobberedByRegMask - Returns true if PhysReg is not directly usable
- /// because of register mask clobbers.
- bool clobberedByRegMask(unsigned PhysReg) const {
- return !UsableRegs.empty() && !UsableRegs.test(PhysReg);
- }
-
// splitting state.
std::auto_ptr<SplitAnalysis> SA;
std::auto_ptr<SplitEditor> SE;
@@ -286,6 +273,8 @@ private:
SmallVectorImpl<LiveInterval*>&);
unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
SmallVectorImpl<LiveInterval*>&);
+ unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<LiveInterval*>&);
unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
SmallVectorImpl<LiveInterval*>&);
unsigned trySplit(LiveInterval&, AllocationOrder&,
@@ -327,6 +316,7 @@ RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+ initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
initializeSpillPlacementPass(*PassRegistry::getPassRegistry());
}
@@ -336,6 +326,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AliasAnalysis>();
AU.addPreserved<AliasAnalysis>();
AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
AU.addRequired<SlotIndexes>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
@@ -349,6 +340,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MachineLoopInfo>();
AU.addRequired<VirtRegMap>();
AU.addPreserved<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<LiveRegMatrix>();
AU.addRequired<EdgeBundles>();
AU.addRequired<SpillPlacement>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -360,8 +353,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
//===----------------------------------------------------------------------===//
bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) {
- if (unsigned PhysReg = VRM->getPhys(VirtReg)) {
- unassign(LIS->getInterval(VirtReg), PhysReg);
+ if (VRM->hasPhys(VirtReg)) {
+ Matrix->unassign(LIS->getInterval(VirtReg));
return true;
}
// Unassigned virtreg is probably in the priority queue.
@@ -370,13 +363,12 @@ bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) {
}
void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) {
- unsigned PhysReg = VRM->getPhys(VirtReg);
- if (!PhysReg)
+ if (!VRM->hasPhys(VirtReg))
return;
// Register is assigned, put it back on the queue for reassignment.
LiveInterval &LI = LIS->getInterval(VirtReg);
- unassign(LI, PhysReg);
+ Matrix->unassign(LI);
enqueue(&LI);
}
@@ -398,7 +390,6 @@ void RAGreedy::releaseMemory() {
SpillerInstance.reset(0);
ExtraRegInfo.clear();
GlobalCand.clear();
- RegAllocBase::releaseMemory();
}
void RAGreedy::enqueue(LiveInterval *LI) {
@@ -450,12 +441,9 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
SmallVectorImpl<LiveInterval*> &NewVRegs) {
Order.rewind();
unsigned PhysReg;
- while ((PhysReg = Order.next())) {
- if (clobberedByRegMask(PhysReg))
- continue;
- if (!checkPhysRegInterference(VirtReg, PhysReg))
+ while ((PhysReg = Order.next()))
+ if (!Matrix->checkInterference(VirtReg, PhysReg))
break;
- }
if (!PhysReg || Order.isHint(PhysReg))
return PhysReg;
@@ -464,7 +452,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
// If we missed a simple hint, try to cheaply evict interference from the
// preferred register.
if (unsigned Hint = MRI->getSimpleHint(VirtReg.reg))
- if (Order.isHint(Hint) && !clobberedByRegMask(Hint)) {
+ if (Order.isHint(Hint)) {
DEBUG(dbgs() << "missed hint " << PrintReg(Hint, TRI) << '\n');
EvictionCost MaxCost(1);
if (canEvictInterference(VirtReg, Hint, true, MaxCost)) {
@@ -527,6 +515,10 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
/// @returns True when interference can be evicted cheaper than MaxCost.
bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
bool IsHint, EvictionCost &MaxCost) {
+ // It is only possible to evict virtual register interference.
+ if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg)
+ return false;
+
// Find VirtReg's cascade number. This will be unassigned if VirtReg was never
// involved in an eviction before. If a cascade number was assigned, deny
// evicting anything with the same or a newer cascade number. This prevents
@@ -539,8 +531,8 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
Cascade = NextCascade;
EvictionCost Cost;
- for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) {
- LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI);
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
    // If there are 10 or more interferences, chances are one is heavier.
if (Q.collectInterferingVRegs(10) >= 10)
return false;
@@ -548,15 +540,21 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
// Check if any interfering live range is heavier than MaxWeight.
for (unsigned i = Q.interferingVRegs().size(); i; --i) {
LiveInterval *Intf = Q.interferingVRegs()[i - 1];
- if (TargetRegisterInfo::isPhysicalRegister(Intf->reg))
- return false;
+ assert(TargetRegisterInfo::isVirtualRegister(Intf->reg) &&
+ "Only expecting virtual register interference from query");
// Never evict spill products. They cannot split or spill.
if (getStage(*Intf) == RS_Done)
return false;
// Once a live range becomes small enough, it is urgent that we find a
// register for it. This is indicated by an infinite spill weight. These
// urgent live ranges get to evict almost anything.
- bool Urgent = !VirtReg.isSpillable() && Intf->isSpillable();
+ //
+ // Also allow urgent evictions of unspillable ranges from a strictly
+ // larger allocation order.
+ bool Urgent = !VirtReg.isSpillable() &&
+ (Intf->isSpillable() ||
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg)) <
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(Intf->reg)));
// Only evict older cascades or live ranges without a cascade.
unsigned IntfCascade = ExtraRegInfo[Intf->reg].Cascade;
if (Cascade <= IntfCascade) {
@@ -597,19 +595,29 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
DEBUG(dbgs() << "evicting " << PrintReg(PhysReg, TRI)
<< " interference: Cascade " << Cascade << '\n');
- for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) {
- LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI);
+
+ // Collect all interfering virtregs first.
+ SmallVector<LiveInterval*, 8> Intfs;
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
assert(Q.seenAllInterferences() && "Didn't check all interferences.");
- for (unsigned i = 0, e = Q.interferingVRegs().size(); i != e; ++i) {
- LiveInterval *Intf = Q.interferingVRegs()[i];
- unassign(*Intf, VRM->getPhys(Intf->reg));
- assert((ExtraRegInfo[Intf->reg].Cascade < Cascade ||
- VirtReg.isSpillable() < Intf->isSpillable()) &&
- "Cannot decrease cascade number, illegal eviction");
- ExtraRegInfo[Intf->reg].Cascade = Cascade;
- ++NumEvicted;
- NewVRegs.push_back(Intf);
- }
+ ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
+ Intfs.append(IVR.begin(), IVR.end());
+ }
+
+ // Evict them second. This will invalidate the queries.
+ for (unsigned i = 0, e = Intfs.size(); i != e; ++i) {
+ LiveInterval *Intf = Intfs[i];
+ // The same VirtReg may be present in multiple RegUnits. Skip duplicates.
+ if (!VRM->hasPhys(Intf->reg))
+ continue;
+ Matrix->unassign(*Intf);
+ assert((ExtraRegInfo[Intf->reg].Cascade < Cascade ||
+ VirtReg.isSpillable() < Intf->isSpillable()) &&
+ "Cannot decrease cascade number, illegal eviction");
+ ExtraRegInfo[Intf->reg].Cascade = Cascade;
+ ++NumEvicted;
+ NewVRegs.push_back(Intf);
}
}
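
The rewritten evictInterference() is a collect-then-mutate pattern: unassigning an interfering range would invalidate the very LiveIntervalUnion queries being walked, and the same vreg can appear under several register units of PhysReg. The pattern, distilled:

// Phase 1: snapshot all interfering vregs across PhysReg's units.
SmallVector<LiveInterval*, 8> Intfs;
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
  ArrayRef<LiveInterval*> IVR = Matrix->query(VirtReg, *Units).interferingVRegs();
  Intfs.append(IVR.begin(), IVR.end());
}
// Phase 2: mutate. hasPhys() filters the duplicates a vreg picked up by
// interfering on more than one register unit.
for (unsigned i = 0, e = Intfs.size(); i != e; ++i)
  if (VRM->hasPhys(Intfs[i]->reg))
    Matrix->unassign(*Intfs[i]);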
@@ -636,8 +644,6 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
Order.rewind();
while (unsigned PhysReg = Order.next()) {
- if (clobberedByRegMask(PhysReg))
- continue;
if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit)
continue;
// The first use of a callee-saved register in a function has cost 1.
@@ -1183,7 +1189,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
return 0;
// Prepare split editor.
- LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
SE->reset(LREdit, SplitSpillMode);
// Assign all edge bundles to the preferred candidate, or NoCand.
@@ -1231,7 +1237,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
unsigned Reg = VirtReg.reg;
bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
- LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
SE->reset(LREdit, SplitSpillMode);
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@@ -1265,6 +1271,65 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
return 0;
}
+
+//===----------------------------------------------------------------------===//
+// Per-Instruction Splitting
+//===----------------------------------------------------------------------===//
+
+/// tryInstructionSplit - Split a live range around individual instructions.
+/// This is normally not worthwhile since the spiller is doing essentially the
+/// same thing. However, when the live range is in a constrained register
+/// class, it may help to insert copies such that parts of the live range can
+/// be moved to a larger register class.
+///
+/// This is similar to spilling to a larger register class.
+unsigned
+RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ // There is no point to this if there are no larger sub-classes.
+ if (!RegClassInfo.isProperSubClass(MRI->getRegClass(VirtReg.reg)))
+ return 0;
+
+ // Always enable split spill mode, since we're effectively spilling to a
+ // register.
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ SE->reset(LREdit, SplitEditor::SM_Size);
+
+ ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+ if (Uses.size() <= 1)
+ return 0;
+
+ DEBUG(dbgs() << "Split around " << Uses.size() << " individual instrs.\n");
+
+ // Split around every non-copy instruction.
+ for (unsigned i = 0; i != Uses.size(); ++i) {
+ if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i]))
+ if (MI->isFullCopy()) {
+ DEBUG(dbgs() << " skip:\t" << Uses[i] << '\t' << *MI);
+ continue;
+ }
+ SE->openIntv();
+ SlotIndex SegStart = SE->enterIntvBefore(Uses[i]);
+ SlotIndex SegStop = SE->leaveIntvAfter(Uses[i]);
+ SE->useIntv(SegStart, SegStop);
+ }
+
+ if (LREdit.empty()) {
+ DEBUG(dbgs() << "All uses were copies.\n");
+ return 0;
+ }
+
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+ DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+
+ // Assign all new registers to RS_Spill. This was the last chance.
+ setStage(LREdit.begin(), LREdit.end(), RS_Spill);
+ return 0;
+}
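
The openIntv/enterIntvBefore/leaveIntvAfter triple brackets exactly one instruction per new interval. Restating the loop body above for a single use slot U:

SE->openIntv();                            // begin a fresh local interval
SlotIndex Start = SE->enterIntvBefore(U);  // COPY into the new vreg before U
SlotIndex Stop  = SE->leaveIntvAfter(U);   // COPY back out right after U
SE->useIntv(Start, Stop);                  // the interval covers only U

Each tiny interval can then be colored from a larger super-class, while the remainder stays in the constrained class and is marked RS_Spill so it is never split again.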
+
+
//===----------------------------------------------------------------------===//
// Local Splitting
//===----------------------------------------------------------------------===//
@@ -1291,9 +1356,9 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
GapWeight.assign(NumGaps, 0.0f);
// Add interference from each overlapping register.
- for (const uint16_t *AI = TRI->getOverlaps(PhysReg); *AI; ++AI) {
- if (!query(const_cast<LiveInterval&>(SA->getParent()), *AI)
- .checkInterference())
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ if (!Matrix->query(const_cast<LiveInterval&>(SA->getParent()), *Units)
+ .checkInterference())
continue;
// We know that VirtReg is a continuous interval from FirstInstr to
@@ -1303,7 +1368,8 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
// surrounding the instruction. The exception is interference before
// StartIdx and after StopIdx.
//
- LiveIntervalUnion::SegmentIter IntI = getLiveUnion(*AI).find(StartIdx);
+ LiveIntervalUnion::SegmentIter IntI =
+ Matrix->getLiveUnions()[*Units].find(StartIdx);
for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) {
// Skip the gaps before IntI.
while (Uses[Gap+1].getBoundaryIndex() < IntI.start())
@@ -1323,6 +1389,30 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
break;
}
}
+
+ // Add fixed interference.
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ const LiveInterval &LI = LIS->getRegUnit(*Units);
+ LiveInterval::const_iterator I = LI.find(StartIdx);
+ LiveInterval::const_iterator E = LI.end();
+
+ // Same loop as above. Mark any overlapped gaps as HUGE_VALF.
+ for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
+ while (Uses[Gap+1].getBoundaryIndex() < I->start)
+ if (++Gap == NumGaps)
+ break;
+ if (Gap == NumGaps)
+ break;
+
+ for (; Gap != NumGaps; ++Gap) {
+ GapWeight[Gap] = HUGE_VALF;
+ if (Uses[Gap+1].getBaseIndex() >= I->end)
+ break;
+ }
+ if (Gap == NumGaps)
+ break;
+ }
+ }
}
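
Fixed (physical) liveness is no longer stored per register but per register unit, as a plain LiveInterval fetched with LIS->getRegUnit(). A minimal overlap test in the style of the loop above (the helper name is illustrative):

// Does VirtReg overlap any fixed liveness on PhysReg's register units?
bool overlapsFixed(const LiveInterval &VirtReg, unsigned PhysReg) {
  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
    if (VirtReg.overlaps(LIS->getRegUnit(*Units)))
      return true;
  return false;
}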
/// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only
@@ -1355,7 +1445,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// If VirtReg is live across any register mask operands, compute a list of
// gaps with register masks.
SmallVector<unsigned, 8> RegMaskGaps;
- if (!UsableRegs.empty()) {
+ if (Matrix->checkRegMaskInterference(VirtReg)) {
// Get regmask slots for the whole block.
ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber());
DEBUG(dbgs() << RMS.size() << " regmasks in block:");
@@ -1417,7 +1507,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
calcGapWeights(PhysReg, GapWeight);
// Remove any gaps with regmask clobbers.
- if (clobberedByRegMask(PhysReg))
+ if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i)
GapWeight[RegMaskGaps[i]] = HUGE_VALF;
@@ -1512,7 +1602,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
<< '-' << Uses[BestAfter] << ", " << BestDiff
<< ", " << (BestAfter - BestBefore + 1) << " instrs\n");
- LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
SE->reset(LREdit);
SE->openIntv();
@@ -1561,7 +1651,10 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
if (LIS->intervalIsInOneMBB(VirtReg)) {
NamedRegionTimer T("Local Splitting", TimerGroupName, TimePassesIsEnabled);
SA->analyze(&VirtReg);
- return tryLocalSplit(VirtReg, Order, NewVRegs);
+ unsigned PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs);
+ if (PhysReg || !NewVRegs.empty())
+ return PhysReg;
+ return tryInstructionSplit(VirtReg, Order, NewVRegs);
}
NamedRegionTimer T("Global Splitting", TimerGroupName, TimePassesIsEnabled);
@@ -1574,7 +1667,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
// an assertion when the coalescer is fixed.
if (SA->didRepairRange()) {
// VirtReg has changed, so all cached queries are invalid.
- invalidateVirtRegs();
+ Matrix->invalidateVirtRegs();
if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
return PhysReg;
}
@@ -1599,11 +1692,6 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
SmallVectorImpl<LiveInterval*> &NewVRegs) {
- // Check if VirtReg is live across any calls.
- UsableRegs.clear();
- if (LIS->checkRegMaskInterference(VirtReg, UsableRegs))
- DEBUG(dbgs() << "Live across regmasks.\n");
-
// First try assigning a free register.
AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
@@ -1644,7 +1732,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
// Finally spill VirtReg itself.
NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
- LiveRangeEdit LRE(VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
spiller().spill(LRE);
setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
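
All LiveRangeEdit construction sites in this patch switch from a reference to a pointer for the parent interval. The point of the change is that a null parent is now legal, which RegisterCoalescer (further down in this diff) exploits for plain dead code elimination:

// With a parent: the edit derives new ranges from VirtReg.
LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
// Without one: only eliminateDeadDefs() is used, so no parent is needed.
SmallVector<LiveInterval*, 8> NewRegs;
LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs);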
@@ -1665,7 +1753,9 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
if (VerifyEnabled)
MF->verify(this, "Before greedy register allocator");
- RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>());
+ RegAllocBase::init(getAnalysis<VirtRegMap>(),
+ getAnalysis<LiveIntervals>(),
+ getAnalysis<LiveRegMatrix>());
Indexes = &getAnalysis<SlotIndexes>();
DomTree = &getAnalysis<MachineDominatorTree>();
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
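
RegAllocBase::init() now takes the LiveRegMatrix analysis as a third argument, so the pass must request it. The matching getAnalysisUsage() lines are outside this hunk; presumably they follow the usual pattern:

// Assumed addition to RAGreedy::getAnalysisUsage (not shown in this diff):
AU.addRequired<LiveRegMatrix>();
AU.addPreserved<LiveRegMatrix>();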
@@ -1679,30 +1769,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
ExtraRegInfo.clear();
ExtraRegInfo.resize(MRI->getNumVirtRegs());
NextCascade = 1;
- IntfCache.init(MF, &getLiveUnion(0), Indexes, LIS, TRI);
+ IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
GlobalCand.resize(32); // This will grow as needed.
allocatePhysRegs();
- addMBBLiveIns(MF);
- LIS->addKillFlags();
-
- // Run rewriter
- {
- NamedRegionTimer T("Rewriter", TimerGroupName, TimePassesIsEnabled);
- VRM->rewrite(Indexes);
- }
-
- // Write out new DBG_VALUE instructions.
- {
- NamedRegionTimer T("Emit Debug Info", TimerGroupName, TimePassesIsEnabled);
- DebugVars->emitDebugValues(VRM);
- }
-
- // All machine operands and other references to virtual registers have been
- // replaced. Remove the virtual registers and release all the transient data.
- VRM->clearAllVirt();
- MRI->clearVirtRegs();
releaseMemory();
-
return true;
}
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index a284614..d0db26b 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -31,7 +31,6 @@
#define DEBUG_TYPE "regalloc"
-#include "RenderMachineFunction.h"
#include "Spiller.h"
#include "VirtRegMap.h"
#include "RegisterCoalescer.h"
@@ -98,7 +97,6 @@ public:
initializeLiveStacksPass(*PassRegistry::getPassRegistry());
initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
- initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry());
}
/// Return the pass name.
@@ -134,7 +132,6 @@ private:
const TargetInstrInfo *tii;
const MachineLoopInfo *loopInfo;
MachineRegisterInfo *mri;
- RenderMachineFunction *rmf;
std::auto_ptr<Spiller> spiller;
LiveIntervals *lis;
@@ -196,7 +193,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
const RegSet &vregs) {
typedef std::vector<const LiveInterval*> LIVector;
- ArrayRef<SlotIndex> regMaskSlots = lis->getRegMaskSlots();
+ LiveIntervals *LIS = const_cast<LiveIntervals*>(lis);
MachineRegisterInfo *mri = &mf->getRegInfo();
const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo();
@@ -205,12 +202,11 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
RegSet pregs;
// Collect the set of preg intervals, record that they're used in the MF.
- for (LiveIntervals::const_iterator itr = lis->begin(), end = lis->end();
- itr != end; ++itr) {
- if (TargetRegisterInfo::isPhysicalRegister(itr->first)) {
- pregs.insert(itr->first);
- mri->setPhysRegUsed(itr->first);
- }
+ for (unsigned Reg = 1, e = tri->getNumRegs(); Reg != e; ++Reg) {
+ if (mri->def_empty(Reg))
+ continue;
+ pregs.insert(Reg);
+ mri->setPhysRegUsed(Reg);
}
BitVector reservedRegs = tri->getReservedRegs(*mf);
@@ -220,7 +216,11 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
vregItr != vregEnd; ++vregItr) {
unsigned vreg = *vregItr;
const TargetRegisterClass *trc = mri->getRegClass(vreg);
- const LiveInterval *vregLI = &lis->getInterval(vreg);
+ LiveInterval *vregLI = &LIS->getInterval(vreg);
+
+ // Record any overlaps with regmask operands.
+ BitVector regMaskOverlaps(tri->getNumRegs());
+ LIS->checkRegMaskInterference(*vregLI, regMaskOverlaps);
// Compute an initial allowed set for the current vreg.
typedef std::vector<unsigned> VRAllowed;
@@ -228,80 +228,26 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
ArrayRef<uint16_t> rawOrder = trc->getRawAllocationOrder(*mf);
for (unsigned i = 0; i != rawOrder.size(); ++i) {
unsigned preg = rawOrder[i];
- if (!reservedRegs.test(preg)) {
- vrAllowed.push_back(preg);
- }
- }
-
- RegSet overlappingPRegs;
-
- // Record physical registers whose ranges overlap.
- for (RegSet::const_iterator pregItr = pregs.begin(),
- pregEnd = pregs.end();
- pregItr != pregEnd; ++pregItr) {
- unsigned preg = *pregItr;
- const LiveInterval *pregLI = &lis->getInterval(preg);
-
- if (pregLI->empty()) {
+ if (reservedRegs.test(preg))
continue;
- }
- if (vregLI->overlaps(*pregLI))
- overlappingPRegs.insert(preg);
- }
+ // vregLI crosses a regmask operand that clobbers preg.
+ if (!regMaskOverlaps.empty() && !regMaskOverlaps.test(preg))
+ continue;
- // Record any overlaps with regmask operands.
- BitVector regMaskOverlaps(tri->getNumRegs());
- for (ArrayRef<SlotIndex>::iterator rmItr = regMaskSlots.begin(),
- rmEnd = regMaskSlots.end();
- rmItr != rmEnd; ++rmItr) {
- SlotIndex rmIdx = *rmItr;
- if (vregLI->liveAt(rmIdx)) {
- MachineInstr *rmMI = lis->getInstructionFromIndex(rmIdx);
- const uint32_t* regMask = 0;
- for (MachineInstr::mop_iterator mopItr = rmMI->operands_begin(),
- mopEnd = rmMI->operands_end();
- mopItr != mopEnd; ++mopItr) {
- if (mopItr->isRegMask()) {
- regMask = mopItr->getRegMask();
- break;
- }
+ // vregLI overlaps fixed regunit interference.
+ bool Interference = false;
+ for (MCRegUnitIterator Units(preg, tri); Units.isValid(); ++Units) {
+ if (vregLI->overlaps(LIS->getRegUnit(*Units))) {
+ Interference = true;
+ break;
}
- assert(regMask != 0 && "Couldn't find register mask.");
- regMaskOverlaps.setBitsNotInMask(regMask);
}
- }
+ if (Interference)
+ continue;
- for (unsigned preg = 0; preg < tri->getNumRegs(); ++preg) {
- if (regMaskOverlaps.test(preg))
- overlappingPRegs.insert(preg);
- }
-
- for (RegSet::const_iterator pregItr = overlappingPRegs.begin(),
- pregEnd = overlappingPRegs.end();
- pregItr != pregEnd; ++pregItr) {
- unsigned preg = *pregItr;
-
- // Remove the register from the allowed set.
- VRAllowed::iterator eraseItr =
- std::find(vrAllowed.begin(), vrAllowed.end(), preg);
-
- if (eraseItr != vrAllowed.end()) {
- vrAllowed.erase(eraseItr);
- }
-
- // Also remove any aliases.
- const uint16_t *aliasItr = tri->getAliasSet(preg);
- if (aliasItr != 0) {
- for (; *aliasItr != 0; ++aliasItr) {
- VRAllowed::iterator eraseItr =
- std::find(vrAllowed.begin(), vrAllowed.end(), *aliasItr);
-
- if (eraseItr != vrAllowed.end()) {
- vrAllowed.erase(eraseItr);
- }
- }
- }
+ // preg is usable for this virtual register.
+ vrAllowed.push_back(preg);
}
// Construct the node.
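
The rewritten loop replaces the old three-pass overlap bookkeeping with three independent filters per candidate preg, applied in increasing order of cost:

// 1. Reserved registers are never candidates.
if (reservedRegs.test(preg)) continue;
// 2. A crossed regmask that does not preserve preg rules it out.
//    (An empty BitVector means vregLI crosses no regmask at all.)
if (!regMaskOverlaps.empty() && !regMaskOverlaps.test(preg)) continue;
// 3. Fixed liveness on any of preg's register units rules it out.
bool Fixed = false;
for (MCRegUnitIterator U(preg, tri); U.isValid(); ++U)
  if (vregLI->overlaps(LIS->getRegUnit(*U))) { Fixed = true; break; }
if (Fixed) continue;
vrAllowed.push_back(preg);   // survivor: preg is usable for this vreg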
@@ -379,7 +325,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilderWithCoalescing::build(
PBQP::Graph &g = p->getGraph();
const TargetMachine &tm = mf->getTarget();
- CoalescerPair cp(*tm.getInstrInfo(), *tm.getRegisterInfo());
+ CoalescerPair cp(*tm.getRegisterInfo());
// Scan the machine function and add a coalescing cost whenever CoalescerPair
// gives the OK.
@@ -498,21 +444,17 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
au.addRequired<MachineLoopInfo>();
au.addPreserved<MachineLoopInfo>();
au.addRequired<VirtRegMap>();
- au.addRequired<RenderMachineFunction>();
MachineFunctionPass::getAnalysisUsage(au);
}
void RegAllocPBQP::findVRegIntervalsToAlloc() {
// Iterate over all live ranges.
- for (LiveIntervals::iterator itr = lis->begin(), end = lis->end();
- itr != end; ++itr) {
-
- // Ignore physical ones.
- if (TargetRegisterInfo::isPhysicalRegister(itr->first))
+ for (unsigned i = 0, e = mri->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (mri->reg_nodbg_empty(Reg))
continue;
-
- LiveInterval *li = itr->second;
+ LiveInterval *li = &lis->getInterval(Reg);
// If this live interval is non-empty we will use pbqp to allocate it.
// Empty intervals we allocate in a simple post-processing stage in
@@ -544,16 +486,17 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
if (problem.isPRegOption(vreg, alloc)) {
unsigned preg = problem.getPRegForOption(vreg, alloc);
- DEBUG(dbgs() << "VREG " << vreg << " -> " << tri->getName(preg) << "\n");
+ DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> "
+ << tri->getName(preg) << "\n");
assert(preg != 0 && "Invalid preg selected.");
vrm->assignVirt2Phys(vreg, preg);
} else if (problem.isSpillOption(vreg, alloc)) {
vregsToAlloc.erase(vreg);
SmallVector<LiveInterval*, 8> newSpills;
- LiveRangeEdit LRE(lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
+ LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
spiller->spill(LRE);
- DEBUG(dbgs() << "VREG " << vreg << " -> SPILLED (Cost: "
+ DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> SPILLED (Cost: "
<< LRE.getParent().weight << ", New vregs: ");
// Copy any newly inserted live intervals into the list of regs to
@@ -561,7 +504,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end();
itr != end; ++itr) {
assert(!(*itr)->empty() && "Empty spill range.");
- DEBUG(dbgs() << (*itr)->reg << " ");
+ DEBUG(dbgs() << PrintReg((*itr)->reg, tri) << " ");
vregsToAlloc.insert((*itr)->reg);
}
@@ -579,9 +522,6 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
void RegAllocPBQP::finalizeAlloc() const {
- typedef LiveIntervals::iterator LIIterator;
- typedef LiveInterval::Ranges::const_iterator LRIterator;
-
// First allocate registers for the empty intervals.
for (RegSet::const_iterator
itr = emptyIntervalVRegs.begin(), end = emptyIntervalVRegs.end();
@@ -597,51 +537,6 @@ void RegAllocPBQP::finalizeAlloc() const {
vrm->assignVirt2Phys(li->reg, physReg);
}
-
- // Finally iterate over the basic blocks to compute and set the live-in sets.
- SmallVector<MachineBasicBlock*, 8> liveInMBBs;
- MachineBasicBlock *entryMBB = &*mf->begin();
-
- for (LIIterator liItr = lis->begin(), liEnd = lis->end();
- liItr != liEnd; ++liItr) {
-
- const LiveInterval *li = liItr->second;
- unsigned reg = 0;
-
- // Get the physical register for this interval
- if (TargetRegisterInfo::isPhysicalRegister(li->reg)) {
- reg = li->reg;
- } else if (vrm->isAssignedReg(li->reg)) {
- reg = vrm->getPhys(li->reg);
- } else {
- // Ranges which are assigned a stack slot only are ignored.
- continue;
- }
-
- if (reg == 0) {
- // Filter out zero regs - they're for intervals that were spilled.
- continue;
- }
-
- // Iterate over the ranges of the current interval...
- for (LRIterator lrItr = li->begin(), lrEnd = li->end();
- lrItr != lrEnd; ++lrItr) {
-
- // Find the set of basic blocks which this range is live into...
- if (lis->findLiveInMBBs(lrItr->start, lrItr->end, liveInMBBs)) {
- // And add the physreg for this interval to their live-in sets.
- for (unsigned i = 0; i != liveInMBBs.size(); ++i) {
- if (liveInMBBs[i] != entryMBB) {
- if (!liveInMBBs[i]->isLiveIn(reg)) {
- liveInMBBs[i]->addLiveIn(reg);
- }
- }
- }
- liveInMBBs.clear();
- }
- }
- }
-
}
bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
@@ -655,7 +550,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
lis = &getAnalysis<LiveIntervals>();
lss = &getAnalysis<LiveStacks>();
loopInfo = &getAnalysis<MachineLoopInfo>();
- rmf = &getAnalysis<RenderMachineFunction>();
vrm = &getAnalysis<VirtRegMap>();
spiller.reset(createInlineSpiller(*this, MF, *vrm));
@@ -719,22 +613,11 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
// Finalise allocation, allocate empty ranges.
finalizeAlloc();
-
- rmf->renderMachineFunction("After PBQP register allocation.", vrm);
-
vregsToAlloc.clear();
emptyIntervalVRegs.clear();
DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *vrm << "\n");
- // Run rewriter
- vrm->rewrite(lis->getSlotIndexes());
-
- // All machine operands and other references to virtual registers have been
- // replaced. Remove the virtual registers.
- vrm->clearAllVirt();
- mri->clearVirtRegs();
-
return true;
}
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 17165fa..652bc30 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -15,8 +15,8 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "regalloc"
-#include "RegisterClassInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -50,9 +50,8 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
CSRNum.clear();
CSRNum.resize(TRI->getNumRegs(), 0);
for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
- for (const uint16_t *AS = TRI->getOverlaps(Reg);
- unsigned Alias = *AS; ++AS)
- CSRNum[Alias] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
Update = true;
}
CalleeSaved = CSR;
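
CSRNum records, for every callee-saved register and now (via MCRegAliasIterator with IncludeSelf set) every alias of it, 1 plus its position in the callee-saved list, with 0 meaning "not callee saved". A sketch of the decode side (helper name illustrative):

bool isCalleeSavedSlot(unsigned Reg, unsigned &Index) const {
  if (unsigned N = CSRNum[Reg]) {
    Index = N - 1;      // index into the CalleeSaved array
    return true;
  }
  return false;         // 0 encodes "not callee saved"
}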
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 75f88ca..9906334 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -16,34 +16,35 @@
#define DEBUG_TYPE "regalloc"
#include "RegisterCoalescer.h"
#include "LiveDebugVariables.h"
-#include "RegisterClassInfo.h"
#include "VirtRegMap.h"
#include "llvm/Pass.h"
#include "llvm/Value.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cmath>
using namespace llvm;
@@ -53,8 +54,6 @@ STATISTIC(numCrossRCs , "Number of cross class joins performed");
STATISTIC(numCommutes , "Number of instruction commuting performed");
STATISTIC(numExtends , "Number of copies extended");
STATISTIC(NumReMats , "Number of instructions re-materialized");
-STATISTIC(numPeep , "Number of identity moves eliminated after coalescing");
-STATISTIC(numAborts , "Number of times interval joining aborted");
STATISTIC(NumInflated , "Number of register classes inflated");
static cl::opt<bool>
@@ -63,22 +62,13 @@ EnableJoining("join-liveintervals",
cl::init(true));
static cl::opt<bool>
-DisableCrossClassJoin("disable-cross-class-join",
- cl::desc("Avoid coalescing cross register class copies"),
- cl::init(false), cl::Hidden);
-
-static cl::opt<bool>
-EnablePhysicalJoin("join-physregs",
- cl::desc("Join physical register copies"),
- cl::init(false), cl::Hidden);
-
-static cl::opt<bool>
VerifyCoalescing("verify-coalescing",
cl::desc("Verify machine instrs before and after register coalescing"),
cl::Hidden);
namespace {
- class RegisterCoalescer : public MachineFunctionPass {
+ class RegisterCoalescer : public MachineFunctionPass,
+ private LiveRangeEdit::Delegate {
MachineFunction* MF;
MachineRegisterInfo* MRI;
const TargetMachine* TM;
@@ -90,87 +80,83 @@ namespace {
AliasAnalysis *AA;
RegisterClassInfo RegClassInfo;
- /// JoinedCopies - Keep track of copies eliminated due to coalescing.
- ///
- SmallPtrSet<MachineInstr*, 32> JoinedCopies;
+ /// WorkList - Copy instructions yet to be coalesced.
+ SmallVector<MachineInstr*, 8> WorkList;
+
+ /// ErasedInstrs - Set of instruction pointers that have been erased, and
+ /// that may be present in WorkList.
+ SmallPtrSet<MachineInstr*, 8> ErasedInstrs;
+
+ /// Dead instructions that are about to be deleted.
+ SmallVector<MachineInstr*, 8> DeadDefs;
+
+ /// Virtual registers to be considered for register class inflation.
+ SmallVector<unsigned, 8> InflateRegs;
- /// ReMatCopies - Keep track of copies eliminated due to remat.
- ///
- SmallPtrSet<MachineInstr*, 32> ReMatCopies;
+ /// Recursively eliminate dead defs in DeadDefs.
+ void eliminateDeadDefs();
- /// ReMatDefs - Keep track of definition instructions which have
- /// been remat'ed.
- SmallPtrSet<MachineInstr*, 8> ReMatDefs;
+ /// LiveRangeEdit callback.
+ void LRE_WillEraseInstruction(MachineInstr *MI);
- /// joinIntervals - join compatible live intervals
- void joinIntervals();
+ /// joinAllIntervals - join compatible live intervals
+ void joinAllIntervals();
- /// CopyCoalesceInMBB - Coalesce copies in the specified MBB, putting
- /// copies that cannot yet be coalesced into the "TryAgain" list.
- void CopyCoalesceInMBB(MachineBasicBlock *MBB,
- std::vector<MachineInstr*> &TryAgain);
+ /// copyCoalesceInMBB - Coalesce copies in the specified MBB, putting
+ /// copies that cannot yet be coalesced into WorkList.
+ void copyCoalesceInMBB(MachineBasicBlock *MBB);
- /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+ /// copyCoalesceWorkList - Try to coalesce all copies in WorkList after
+ /// position From. Return true if any progress was made.
+ bool copyCoalesceWorkList(unsigned From = 0);
+
+ /// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
/// which are the src/dst of the copy instruction CopyMI. This returns
/// true if the copy was successfully coalesced away. If it is not
/// currently possible to coalesce this interval, but it may be possible if
/// other things get coalesced, then it returns true by reference in
/// 'Again'.
- bool JoinCopy(MachineInstr *TheCopy, bool &Again);
+ bool joinCopy(MachineInstr *TheCopy, bool &Again);
- /// JoinIntervals - Attempt to join these two intervals. On failure, this
+ /// joinIntervals - Attempt to join these two intervals. On failure, this
/// returns false. The output "SrcInt" will not have been modified, so we
/// can use this information below to update aliases.
- bool JoinIntervals(CoalescerPair &CP);
+ bool joinIntervals(CoalescerPair &CP);
+
+ /// Attempt joining with a reserved physreg.
+ bool joinReservedPhysReg(CoalescerPair &CP);
- /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
+ /// adjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
/// the source value number is defined by a copy from the destination reg
/// see if we can merge these two destination reg valno# into a single
/// value number, eliminating a copy.
- bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
+ bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
- /// HasOtherReachingDefs - Return true if there are definitions of IntB
+ /// hasOtherReachingDefs - Return true if there are definitions of IntB
/// other than BValNo val# that can reach uses of AValno val# of IntA.
- bool HasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
+ bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
VNInfo *AValNo, VNInfo *BValNo);
- /// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy.
+ /// removeCopyByCommutingDef - We found a non-trivially-coalescable copy.
/// If the source value number is defined by a commutable instruction and
/// its other operand is coalesced to the copy dest register, see if we
/// can transform the copy into a noop by commuting the definition.
- bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
+ bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
- /// ReMaterializeTrivialDef - If the source of a copy is defined by a
+ /// reMaterializeTrivialDef - If the source of a copy is defined by a
/// trivial computation, replace the copy by rematerializing the definition.
- /// If PreserveSrcInt is true, make sure SrcInt is valid after the call.
- bool ReMaterializeTrivialDef(LiveInterval &SrcInt, bool PreserveSrcInt,
- unsigned DstReg, MachineInstr *CopyMI);
-
- /// shouldJoinPhys - Return true if a physreg copy should be joined.
- bool shouldJoinPhys(CoalescerPair &CP);
-
- /// isWinToJoinCrossClass - Return true if it's profitable to coalesce
- /// two virtual registers from different register classes.
- bool isWinToJoinCrossClass(unsigned SrcReg,
- unsigned DstReg,
- const TargetRegisterClass *SrcRC,
- const TargetRegisterClass *DstRC,
- const TargetRegisterClass *NewRC);
-
- /// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+ bool reMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg,
+ MachineInstr *CopyMI);
+
+ /// canJoinPhys - Return true if a physreg copy should be joined.
+ bool canJoinPhys(CoalescerPair &CP);
+
+ /// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
/// update the subregister number if it is not zero. If DstReg is a
/// physical register and the existing subregister number of the def / use
/// being updated is not zero, make sure to set it to the correct physical
/// subregister.
- void UpdateRegDefsUses(const CoalescerPair &CP);
-
- /// RemoveDeadDef - If a def of a live interval is now determined dead,
- /// remove the val# it defines. If the live interval becomes empty, remove
- /// it as well.
- bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI);
-
- /// markAsJoined - Remember that CopyMI has already been joined.
- void markAsJoined(MachineInstr *CopyMI);
+ void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
/// eliminateUndefCopy - Handle copies of undef values.
bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP);
@@ -233,7 +219,8 @@ static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
}
bool CoalescerPair::setRegisters(const MachineInstr *MI) {
- SrcReg = DstReg = SubIdx = 0;
+ SrcReg = DstReg = 0;
+ SrcIdx = DstIdx = 0;
NewRC = 0;
Flipped = CrossClass = false;
@@ -271,39 +258,44 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
}
} else {
// Both registers are virtual.
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
// Both registers have subreg indices.
if (SrcSub && DstSub) {
- // For now we only handle the case of identical indices in commensurate
- // registers: Dreg:ssub_1 + Dreg:ssub_1 -> Dreg
- // FIXME: Handle Qreg:ssub_3 + Dreg:ssub_1 as QReg:dsub_1 + Dreg.
- if (SrcSub != DstSub)
+ // Copies between different sub-registers are never coalescable.
+ if (Src == Dst && SrcSub != DstSub)
return false;
- const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
- const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
- if (!TRI.getCommonSubClass(DstRC, SrcRC))
+
+ NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub,
+ SrcIdx, DstIdx);
+ if (!NewRC)
return false;
- SrcSub = DstSub = 0;
+ } else if (DstSub) {
+ // SrcReg will be merged with a sub-register of DstReg.
+ SrcIdx = DstSub;
+ NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ } else if (SrcSub) {
+ // DstReg will be merged with a sub-register of SrcReg.
+ DstIdx = SrcSub;
+ NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
+ } else {
+ // This is a straight copy without sub-registers.
+ NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
}
- // There can be no SrcSub.
- if (SrcSub) {
+ // The combined constraint may be impossible to satisfy.
+ if (!NewRC)
+ return false;
+
+ // Prefer SrcReg to be a sub-register of DstReg.
+ // FIXME: Coalescer should support subregs symmetrically.
+ if (DstIdx && !SrcIdx) {
std::swap(Src, Dst);
- DstSub = SrcSub;
- SrcSub = 0;
- assert(!Flipped && "Unexpected flip");
- Flipped = true;
+ std::swap(SrcIdx, DstIdx);
+ Flipped = !Flipped;
}
- // Find the new register class.
- const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
- const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
- if (DstSub)
- NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
- else
- NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
- if (!NewRC)
- return false;
CrossClass = NewRC != DstRC || NewRC != SrcRC;
}
// Check our invariants
@@ -312,14 +304,14 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
"Cannot have a physical SubIdx");
SrcReg = Src;
DstReg = Dst;
- SubIdx = DstSub;
return true;
}
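
With the single SubIdx replaced by the (SrcIdx, DstIdx) pair, setRegisters() can describe joins where both sides keep a sub-register position inside a common super-class. One way to read the pair, with illustrative indices (a sketch, not quoted from the header):

// After a successful join:
//   every operand of SrcReg is rewritten to DstReg:SrcIdx
//   every operand of DstReg is rewritten to DstReg:DstIdx (0 = whole reg)
// Example: "%vreg1:ssub_0 = COPY %vreg2" yields SrcIdx = ssub_0,
// DstIdx = 0: %vreg2 becomes the ssub_0 part of %vreg1, and NewRC is a
// class of %vreg1 that actually supports ssub_0 sub-registers.
// The swap at the end of setRegisters() keeps SrcIdx-only pairs, since
// the coalescer does not yet handle sub-registers symmetrically.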
bool CoalescerPair::flip() {
- if (SubIdx || TargetRegisterInfo::isPhysicalRegister(DstReg))
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
return false;
std::swap(SrcReg, DstReg);
+ std::swap(SrcIdx, DstIdx);
Flipped = !Flipped;
return true;
}
@@ -343,7 +335,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
if (!TargetRegisterInfo::isPhysicalRegister(Dst))
return false;
- assert(!SubIdx && "Inconsistent CoalescerPair state.");
+ assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state.");
// DstSub could be set for a physreg from INSERT_SUBREG.
if (DstSub)
Dst = TRI.getSubReg(Dst, DstSub);
@@ -357,7 +349,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
if (DstReg != Dst)
return false;
// Registers match, do the subregisters line up?
- return compose(TRI, SubIdx, SrcSub) == DstSub;
+ return compose(TRI, SrcIdx, SrcSub) == compose(TRI, DstIdx, DstSub);
}
}
@@ -375,19 +367,18 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-void RegisterCoalescer::markAsJoined(MachineInstr *CopyMI) {
- /// Joined copies are not deleted immediately, but kept in JoinedCopies.
- JoinedCopies.insert(CopyMI);
+void RegisterCoalescer::eliminateDeadDefs() {
+ SmallVector<LiveInterval*, 8> NewRegs;
+ LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs);
+}
- /// Mark all register operands of CopyMI as <undef> so they won't affect dead
- /// code elimination.
- for (MachineInstr::mop_iterator I = CopyMI->operands_begin(),
- E = CopyMI->operands_end(); I != E; ++I)
- if (I->isReg())
- I->setIsUndef(true);
+// Callback from eliminateDeadDefs().
+void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
+ // MI may be in WorkList. Make sure we don't visit it.
+ ErasedInstrs.insert(MI);
}
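
The coalescer now inherits privately from LiveRangeEdit::Delegate so that dead-def elimination can warn it before erasing an instruction it may still have queued. The protocol, distilled:

// Sketch of the delegate protocol (pre-C++11 code base, so no 'override'):
class SomePass : public MachineFunctionPass, private LiveRangeEdit::Delegate {
  SmallPtrSet<MachineInstr*, 8> ErasedInstrs;  // checked before visiting
  // LiveRangeEdit calls this just before erasing MI, so cached
  // MachineInstr pointers (e.g. a WorkList) can be invalidated first.
  virtual void LRE_WillEraseInstruction(MachineInstr *MI) {
    ErasedInstrs.insert(MI);
  }
};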
-/// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA
+/// adjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA
/// being the source and IntB being the dest, thus this defines a value number
/// in IntB. If the source value number (in IntA) is defined by a copy from B,
/// see if we can merge these two pieces of B into a single value number,
@@ -402,12 +393,10 @@ void RegisterCoalescer::markAsJoined(MachineInstr *CopyMI) {
///
/// This returns true if an interval was modified.
///
-bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
- MachineInstr *CopyMI) {
- // Bail if there is no dst interval - can happen when merging physical subreg
- // operations.
- if (!LIS->hasInterval(CP.getDstReg()))
- return false;
+bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
+ MachineInstr *CopyMI) {
+ assert(!CP.isPartial() && "This doesn't work for partial copies.");
+ assert(!CP.isPhys() && "This doesn't work for physreg copies.");
LiveInterval &IntA =
LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
@@ -457,24 +446,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
// IntB, we can merge them.
if (ValLR+1 != BLR) return false;
- // If a live interval is a physical register, conservatively check if any
- // of its aliases is overlapping the live interval of the virtual register.
- // If so, do not coalesce.
- if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) {
- for (const uint16_t *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS)
- if (LIS->hasInterval(*AS) && IntA.overlaps(LIS->getInterval(*AS))) {
- DEBUG({
- dbgs() << "\t\tInterfere with alias ";
- LIS->getInterval(*AS).print(dbgs(), TRI);
- });
- return false;
- }
- }
-
- DEBUG({
- dbgs() << "Extending: ";
- IntB.print(dbgs(), TRI);
- });
+ DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));
SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start;
// We are about to delete CopyMI, so need to remove it as the 'instruction
@@ -487,33 +459,10 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
// two value numbers.
IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo));
- // If the IntB live range is assigned to a physical register, and if that
- // physreg has sub-registers, update their live intervals as well.
- if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) {
- for (const uint16_t *SR = TRI->getSubRegisters(IntB.reg); *SR; ++SR) {
- if (!LIS->hasInterval(*SR))
- continue;
- LiveInterval &SRLI = LIS->getInterval(*SR);
- SRLI.addRange(LiveRange(FillerStart, FillerEnd,
- SRLI.getNextValue(FillerStart,
- LIS->getVNInfoAllocator())));
- }
- }
-
// Okay, merge "B1" into the same value number as "B0".
- if (BValNo != ValLR->valno) {
- // If B1 is killed by a PHI, then the merged live range must also be killed
- // by the same PHI, as B0 and B1 can not overlap.
- bool HasPHIKill = BValNo->hasPHIKill();
+ if (BValNo != ValLR->valno)
IntB.MergeValueNumberInto(BValNo, ValLR->valno);
- if (HasPHIKill)
- ValLR->valno->setHasPHIKill(true);
- }
- DEBUG({
- dbgs() << " result = ";
- IntB.print(dbgs(), TRI);
- dbgs() << "\n";
- });
+ DEBUG(dbgs() << " result = " << IntB << '\n');
// If the source instruction was killing the source register before the
// merge, unset the isKill marker given the live range has been extended.
@@ -525,8 +474,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
// Rewrite the copy. If the copy instruction was killing the destination
// register before the merge, find the last use and trim the live range. That
// will also add the isKill marker.
- CopyMI->substituteRegister(IntA.reg, IntB.reg, CP.getSubIdx(),
- *TRI);
+ CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
if (ALR->end == CopyIdx)
LIS->shrinkToUses(&IntA);
@@ -534,12 +482,17 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
return true;
}
-/// HasOtherReachingDefs - Return true if there are definitions of IntB
+/// hasOtherReachingDefs - Return true if there are definitions of IntB
/// other than BValNo val# that can reach uses of AValno val# of IntA.
-bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA,
- LiveInterval &IntB,
- VNInfo *AValNo,
- VNInfo *BValNo) {
+bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
+ LiveInterval &IntB,
+ VNInfo *AValNo,
+ VNInfo *BValNo) {
+ // If AValNo has PHI kills, conservatively assume that IntB defs can reach
+ // the PHI values.
+ if (LIS->hasPHIKill(IntA, AValNo))
+ return true;
+
for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
AI != AE; ++AI) {
if (AI->valno != AValNo) continue;
@@ -559,7 +512,7 @@ bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA,
return false;
}
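
The old alias-interval scan is replaced by a single conservative query: if AValNo is live out through a PHI, any IntB def is assumed to be able to reach the merged value, and the transformation is refused up front.

// New guard at the top of hasOtherReachingDefs():
if (LIS->hasPHIKill(IntA, AValNo))
  return true;   // conservatively treat PHI-reachable values as blocked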
-/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with
+/// removeCopyByCommutingDef - We found a non-trivially-coalescable copy with
/// IntA being the source and IntB being the dest, thus this defines a value
/// number in IntB. If the source value number (in IntA) is defined by a
/// commutable instruction and its other operand is coalesced to the copy dest
@@ -582,18 +535,9 @@ bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA,
///
/// This returns true if an interval was modified.
///
-bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
- MachineInstr *CopyMI) {
- // FIXME: For now, only eliminate the copy by commuting its def when the
- // source register is a virtual register. We want to guard against cases
- // where the copy is a back edge copy and commuting the def lengthen the
- // live interval of the source register to the entire loop.
- if (CP.isPhys() && CP.isFlipped())
- return false;
-
- // Bail if there is no dst interval.
- if (!LIS->hasInterval(CP.getDstReg()))
- return false;
+bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
+ MachineInstr *CopyMI) {
+ assert (!CP.isPhys());
SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
@@ -613,10 +557,7 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
// AValNo is the value number in A that defines the copy, A3 in the example.
VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
assert(AValNo && "COPY source not live");
-
- // If other defs can reach uses of this def, then it's not safe to perform
- // the optimization.
- if (AValNo->isPHIDef() || AValNo->isUnused() || AValNo->hasPHIKill())
+ if (AValNo->isPHIDef() || AValNo->isUnused())
return false;
MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
if (!DefMI)
@@ -647,17 +588,9 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
// Make sure there are no other definitions of IntB that would reach the
// uses which the new definition can reach.
- if (HasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
+ if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
return false;
- // Abort if the aliases of IntB.reg have values that are not simply the
- // clobbers from the superreg.
- if (TargetRegisterInfo::isPhysicalRegister(IntB.reg))
- for (const uint16_t *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS)
- if (LIS->hasInterval(*AS) &&
- HasOtherReachingDefs(IntA, LIS->getInterval(*AS), AValNo, 0))
- return false;
-
// If some of the uses of IntA.reg are already coalesced away, return false.
// It's not possible to determine whether it's safe to perform the coalescing.
for (MachineRegisterInfo::use_nodbg_iterator UI =
@@ -666,13 +599,14 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
MachineInstr *UseMI = &*UI;
SlotIndex UseIdx = LIS->getInstructionIndex(UseMI);
LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
- if (ULR == IntA.end())
+ if (ULR == IntA.end() || ULR->valno != AValNo)
continue;
- if (ULR->valno == AValNo && JoinedCopies.count(UseMI))
+ // If this use is tied to a def, we can't rewrite the register.
+ if (UseMI->isRegTiedToDefOperand(UI.getOperandNo()))
return false;
}
- DEBUG(dbgs() << "\tRemoveCopyByCommutingDef: " << AValNo->def << '\t'
+ DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
<< *DefMI);
// At this point we have decided that it is legal to do this
@@ -709,8 +643,6 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
MachineOperand &UseMO = UI.getOperand();
MachineInstr *UseMI = &*UI;
++UI;
- if (JoinedCopies.count(UseMI))
- continue;
if (UseMI->isDebugValue()) {
// FIXME These don't have an instruction index. Not clear we have enough
// info to decide whether to do this replacement or not. For now do it.
@@ -721,6 +653,8 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
if (ULR == IntA.end() || ULR->valno != AValNo)
continue;
+ // Kill flags are no longer accurate. They are recomputed after RA.
+ UseMO.setIsKill(false);
if (TargetRegisterInfo::isPhysicalRegister(NewReg))
UseMO.substPhysReg(NewReg, *TRI);
else
@@ -742,7 +676,9 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
assert(DVNI->def == DefIdx);
BValNo = IntB.MergeValueNumberInto(BValNo, DVNI);
- markAsJoined(UseMI);
+ ErasedInstrs.insert(UseMI);
+ LIS->RemoveMachineInstrFromMaps(UseMI);
+ UseMI->eraseFromParent();
}
// Extend BValNo by merging in IntA live ranges of AValNo. Val# definition
@@ -762,12 +698,11 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
return true;
}
-/// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
+/// reMaterializeTrivialDef - If the source of a copy is defined by a trivial
/// computation, replace the copy by rematerializing the definition.
-bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt,
- bool preserveSrcInt,
- unsigned DstReg,
- MachineInstr *CopyMI) {
+bool RegisterCoalescer::reMaterializeTrivialDef(LiveInterval &SrcInt,
+ unsigned DstReg,
+ MachineInstr *CopyMI) {
SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx);
assert(SrcLR != SrcInt.end() && "Live range not found!");
@@ -792,7 +727,7 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt,
// Make sure the copy destination register class fits the instruction
// definition register class. The mismatch can happen as a result of earlier
// extract_subreg, insert_subreg, subreg_to_reg coalescing.
- const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI);
+ const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI, *MF);
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
if (MRI->getRegClass(DstReg) != RC)
return false;
@@ -838,23 +773,21 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt,
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
- unsigned reg = NewMIImplDefs[i];
- LiveInterval &li = LIS->getInterval(reg);
- VNInfo *DeadDefVN = li.getNextValue(NewMIIdx.getRegSlot(),
- LIS->getVNInfoAllocator());
- LiveRange lr(NewMIIdx.getRegSlot(), NewMIIdx.getDeadSlot(), DeadDefVN);
- li.addRange(lr);
+ unsigned Reg = NewMIImplDefs[i];
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ if (LiveInterval *LI = LIS->getCachedRegUnit(*Units))
+ LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
}
CopyMI->eraseFromParent();
- ReMatCopies.insert(CopyMI);
- ReMatDefs.insert(DefMI);
+ ErasedInstrs.insert(CopyMI);
DEBUG(dbgs() << "Remat: " << *NewMI);
++NumReMats;
// The source interval can become smaller because we removed a use.
- if (preserveSrcInt)
- LIS->shrinkToUses(&SrcInt);
+ LIS->shrinkToUses(&SrcInt, &DeadDefs);
+ if (!DeadDefs.empty())
+ eliminateDeadDefs();
return true;
}
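
Note the getCachedRegUnit() call in the implicit-def fixup above: regunit liveness is computed lazily, so getRegUnit() would materialize an interval on demand, while getCachedRegUnit() returns null for units nobody has queried yet. Implicit defs therefore only patch intervals that already exist:

for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
  if (LiveInterval *LI = LIS->getCachedRegUnit(*Units))  // may be null
    LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());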
@@ -902,51 +835,40 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI,
return true;
}
-/// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+/// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
/// update the subregister number if it is not zero. If DstReg is a
/// physical register and the existing subregister number of the def / use
/// being updated is not zero, make sure to set it to the correct physical
/// subregister.
-void
-RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) {
- bool DstIsPhys = CP.isPhys();
- unsigned SrcReg = CP.getSrcReg();
- unsigned DstReg = CP.getDstReg();
- unsigned SubIdx = CP.getSubIdx();
+void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
+ unsigned DstReg,
+ unsigned SubIdx) {
+ bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ LiveInterval *DstInt = DstIsPhys ? 0 : &LIS->getInterval(DstReg);
// Update LiveDebugVariables.
LDV->renameRegister(SrcReg, DstReg, SubIdx);
for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg);
MachineInstr *UseMI = I.skipInstruction();) {
- // A PhysReg copy that won't be coalesced can perhaps be rematerialized
- // instead.
- if (DstIsPhys) {
- if (UseMI->isFullCopy() &&
- UseMI->getOperand(1).getReg() == SrcReg &&
- UseMI->getOperand(0).getReg() != SrcReg &&
- UseMI->getOperand(0).getReg() != DstReg &&
- !JoinedCopies.count(UseMI) &&
- ReMaterializeTrivialDef(LIS->getInterval(SrcReg), false,
- UseMI->getOperand(0).getReg(), UseMI))
- continue;
- }
-
SmallVector<unsigned,8> Ops;
bool Reads, Writes;
tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);
+ // If SrcReg wasn't read, it may still be the case that DstReg is live-in
+ // because SrcReg is a sub-register.
+ if (DstInt && !Reads && SubIdx)
+ Reads = DstInt->liveAt(LIS->getInstructionIndex(UseMI));
+
// Replace SrcReg with DstReg in all UseMI operands.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
MachineOperand &MO = UseMI->getOperand(Ops[i]);
- // Make sure we don't create read-modify-write defs accidentally. We
- // assume here that a SrcReg def cannot be joined into a live DstReg. If
- // RegisterCoalescer starts tracking partially live registers, we will
- // need to check the actual LiveInterval to determine if DstReg is live
- // here.
- if (SubIdx && !Reads)
- MO.setIsUndef();
+ // Adjust <undef> flags in case of sub-register joins. We don't want to
+ // turn a full def into a read-modify-write sub-register def and vice
+ // versa.
+ if (SubIdx && MO.isDef())
+ MO.setIsUndef(!Reads);
if (DstIsPhys)
MO.substPhysReg(DstReg, *TRI);
@@ -954,10 +876,6 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) {
MO.substVirtReg(DstReg, SubIdx, *TRI);
}
- // This instruction is a copy that will be removed.
- if (JoinedCopies.count(UseMI))
- continue;
-
DEBUG({
dbgs() << "\t\tupdated: ";
if (!UseMI->isDebugValue())
@@ -967,210 +885,107 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) {
}
}
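
updateRegDefsUses() funnels every operand through the two MachineOperand substitution helpers; the vreg variant also composes SubIdx with whatever sub-register index the operand already carried. A sketch of that final step (the wrapper name is illustrative):

void rewriteOperand(MachineOperand &MO, unsigned DstReg, unsigned SubIdx,
                    const TargetRegisterInfo &TRI) {
  if (TargetRegisterInfo::isPhysicalRegister(DstReg))
    MO.substPhysReg(DstReg, TRI);          // folds any subreg index into
                                           // the concrete physical subreg
  else
    MO.substVirtReg(DstReg, SubIdx, TRI);  // composes SubIdx with the
                                           // operand's own subreg index
}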
-/// removeIntervalIfEmpty - Check if the live interval of a physical register
-/// is empty, if so remove it and also remove the empty intervals of its
-/// sub-registers. Return true if live interval is removed.
-static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *LIS,
- const TargetRegisterInfo *TRI) {
- if (li.empty()) {
- if (TargetRegisterInfo::isPhysicalRegister(li.reg))
- for (const uint16_t* SR = TRI->getSubRegisters(li.reg); *SR; ++SR) {
- if (!LIS->hasInterval(*SR))
- continue;
- LiveInterval &sli = LIS->getInterval(*SR);
- if (sli.empty())
- LIS->removeInterval(*SR);
- }
- LIS->removeInterval(li.reg);
- return true;
- }
- return false;
-}
-
-/// RemoveDeadDef - If a def of a live interval is now determined dead, remove
-/// the val# it defines. If the live interval becomes empty, remove it as well.
-bool RegisterCoalescer::RemoveDeadDef(LiveInterval &li,
- MachineInstr *DefMI) {
- SlotIndex DefIdx = LIS->getInstructionIndex(DefMI).getRegSlot();
- LiveInterval::iterator MLR = li.FindLiveRangeContaining(DefIdx);
- if (DefIdx != MLR->valno->def)
- return false;
- li.removeValNo(MLR->valno);
- return removeIntervalIfEmpty(li, LIS, TRI);
-}
-
-/// shouldJoinPhys - Return true if a copy involving a physreg should be joined.
-/// We need to be careful about coalescing a source physical register with a
-/// virtual register. Once the coalescing is done, it cannot be broken and these
-/// are not spillable! If the destination interval uses are far away, think
-/// twice about coalescing them!
-bool RegisterCoalescer::shouldJoinPhys(CoalescerPair &CP) {
- bool Allocatable = LIS->isAllocatable(CP.getDstReg());
- LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
-
+/// canJoinPhys - Return true if a copy involving a physreg should be joined.
+bool RegisterCoalescer::canJoinPhys(CoalescerPair &CP) {
/// Always join simple intervals that are defined by a single copy from a
/// reserved register. This doesn't increase register pressure, so it is
/// always beneficial.
- if (!Allocatable && CP.isFlipped() && JoinVInt.containsOneValue())
- return true;
-
- if (!EnablePhysicalJoin) {
- DEBUG(dbgs() << "\tPhysreg joins disabled.\n");
- return false;
- }
-
- // Only coalesce to allocatable physreg, we don't want to risk modifying
- // reserved registers.
- if (!Allocatable) {
- DEBUG(dbgs() << "\tRegister is an unallocatable physreg.\n");
- return false; // Not coalescable.
- }
-
- // Don't join with physregs that have a ridiculous number of live
- // ranges. The data structure performance is really bad when that
- // happens.
- if (LIS->hasInterval(CP.getDstReg()) &&
- LIS->getInterval(CP.getDstReg()).ranges.size() > 1000) {
- ++numAborts;
- DEBUG(dbgs()
- << "\tPhysical register live interval too complicated, abort!\n");
+ if (!RegClassInfo.isReserved(CP.getDstReg())) {
+ DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
return false;
}
- // FIXME: Why are we skipping this test for partial copies?
- // CodeGen/X86/phys_subreg_coalesce-3.ll needs it.
- if (!CP.isPartial()) {
- const TargetRegisterClass *RC = MRI->getRegClass(CP.getSrcReg());
- unsigned Threshold = RegClassInfo.getNumAllocatableRegs(RC) * 2;
- unsigned Length = LIS->getApproximateInstructionCount(JoinVInt);
- if (Length > Threshold) {
- ++numAborts;
- DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n");
- return false;
- }
- }
- return true;
-}
-
-/// isWinToJoinCrossClass - Return true if it's profitable to coalesce
-/// two virtual registers from different register classes.
-bool
-RegisterCoalescer::isWinToJoinCrossClass(unsigned SrcReg,
- unsigned DstReg,
- const TargetRegisterClass *SrcRC,
- const TargetRegisterClass *DstRC,
- const TargetRegisterClass *NewRC) {
- unsigned NewRCCount = RegClassInfo.getNumAllocatableRegs(NewRC);
- // This heuristic is good enough in practice, but it's obviously not *right*.
- // 4 is a magic number that works well enough for x86, ARM, etc. It filters
- // out all but the most restrictive register classes.
- if (NewRCCount > 4 ||
- // Early exit if the function is fairly small, coalesce aggressively if
- // that's the case. For really special register classes with 3 or
- // fewer registers, be a bit more careful.
- (LIS->getFuncInstructionCount() / NewRCCount) < 8)
- return true;
- LiveInterval &SrcInt = LIS->getInterval(SrcReg);
- LiveInterval &DstInt = LIS->getInterval(DstReg);
- unsigned SrcSize = LIS->getApproximateInstructionCount(SrcInt);
- unsigned DstSize = LIS->getApproximateInstructionCount(DstInt);
-
- // Coalesce aggressively if the intervals are small compared to the number of
- // registers in the new class. The number 4 is fairly arbitrary, chosen to be
- // less aggressive than the 8 used for the whole function size.
- const unsigned ThresSize = 4 * NewRCCount;
- if (SrcSize <= ThresSize && DstSize <= ThresSize)
+ LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
+ if (CP.isFlipped() && JoinVInt.containsOneValue())
return true;
- // Estimate *register use density*. If it doubles or more, abort.
- unsigned SrcUses = std::distance(MRI->use_nodbg_begin(SrcReg),
- MRI->use_nodbg_end());
- unsigned DstUses = std::distance(MRI->use_nodbg_begin(DstReg),
- MRI->use_nodbg_end());
- unsigned NewUses = SrcUses + DstUses;
- unsigned NewSize = SrcSize + DstSize;
- if (SrcRC != NewRC && SrcSize > ThresSize) {
- unsigned SrcRCCount = RegClassInfo.getNumAllocatableRegs(SrcRC);
- if (NewUses*SrcSize*SrcRCCount > 2*SrcUses*NewSize*NewRCCount)
- return false;
- }
- if (DstRC != NewRC && DstSize > ThresSize) {
- unsigned DstRCCount = RegClassInfo.getNumAllocatableRegs(DstRC);
- if (NewUses*DstSize*DstRCCount > 2*DstUses*NewSize*NewRCCount)
- return false;
- }
- return true;
+ DEBUG(dbgs() << "\tCannot join defs into reserved register.\n");
+ return false;
}
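
The rewritten policy reduces to two checks: the destination must be a reserved register, and the virtual side must be a single-value interval defined by a copy from it. A rough, self-contained sketch in plain C++ (all names invented; this is not LLVM's CoalescerPair/LiveInterval API):

    // Simplified analogue of canJoinPhys(); the types are made up for
    // illustration and do not match LLVM's.
    #include <cstdio>

    struct PairInfo {
      bool DstReserved;   // stands in for RegClassInfo.isReserved(DstReg)
      bool Flipped;       // source/destination were swapped by setRegisters()
      unsigned NumValues; // value numbers in the virtual side's interval
    };

    static bool canJoinPhys(const PairInfo &CP) {
      // Only reserved registers may be merged into: their live ranges are a
      // set of dead defs, so no value computation is needed.
      if (!CP.DstReserved)
        return false;
      // Join only a simple interval defined by one copy from the reserved reg.
      return CP.Flipped && CP.NumValues == 1;
    }

    int main() {
      PairInfo A = {true, true, 1}, B = {true, false, 3};
      std::printf("%d %d\n", canJoinPhys(A), canJoinPhys(B)); // 1 0
    }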
-
-/// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+/// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
/// which are the src/dst of the copy instruction CopyMI. This returns true
/// if the copy was successfully coalesced away. If it is not currently
/// possible to coalesce this interval, but it may be possible if other
/// things get coalesced, then it returns true by reference in 'Again'.
-bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
+bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
Again = false;
- if (JoinedCopies.count(CopyMI) || ReMatCopies.count(CopyMI))
- return false; // Already done.
-
DEBUG(dbgs() << LIS->getInstructionIndex(CopyMI) << '\t' << *CopyMI);
- CoalescerPair CP(*TII, *TRI);
+ CoalescerPair CP(*TRI);
if (!CP.setRegisters(CopyMI)) {
DEBUG(dbgs() << "\tNot coalescable.\n");
return false;
}
- // If they are already joined we continue.
- if (CP.getSrcReg() == CP.getDstReg()) {
- markAsJoined(CopyMI);
- DEBUG(dbgs() << "\tCopy already coalesced.\n");
- return false; // Not coalescable.
+ // Dead code elimination. This really should be handled by MachineDCE, but
+ // sometimes dead copies slip through, and we can't generate invalid live
+ // ranges.
+ if (!CP.isPhys() && CopyMI->allDefsAreDead()) {
+ DEBUG(dbgs() << "\tCopy is dead.\n");
+ DeadDefs.push_back(CopyMI);
+ eliminateDeadDefs();
+ return true;
}
// Eliminate undefs.
if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) {
- markAsJoined(CopyMI);
DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n");
+ LIS->RemoveMachineInstrFromMaps(CopyMI);
+ CopyMI->eraseFromParent();
return false; // Not coalescable.
}
- DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI)
- << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSubIdx())
- << "\n");
+ // Coalesced copies are normally removed immediately, but transformations
+ // like removeCopyByCommutingDef() can inadvertently create identity copies.
+ // When that happens, just join the values and remove the copy.
+ if (CP.getSrcReg() == CP.getDstReg()) {
+ LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
+ DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
+ LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(CopyMI));
+ if (VNInfo *DefVNI = LRQ.valueDefined()) {
+ VNInfo *ReadVNI = LRQ.valueIn();
+ assert(ReadVNI && "No value before copy and no <undef> flag.");
+ assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
+ LI.MergeValueNumberInto(DefVNI, ReadVNI);
+ DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
+ }
+ LIS->RemoveMachineInstrFromMaps(CopyMI);
+ CopyMI->eraseFromParent();
+ return true;
+ }
// Enforce policies.
if (CP.isPhys()) {
- if (!shouldJoinPhys(CP)) {
+ DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI)
+ << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx())
+ << '\n');
+ if (!canJoinPhys(CP)) {
// Before giving up coalescing, if definition of source is defined by
// trivial computation, try rematerializing it.
if (!CP.isFlipped() &&
- ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true,
+ reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()),
CP.getDstReg(), CopyMI))
return true;
return false;
}
} else {
- // Avoid constraining virtual register regclass too much.
- if (CP.isCrossClass()) {
- DEBUG(dbgs() << "\tCross-class to " << CP.getNewRC()->getName() << ".\n");
- if (DisableCrossClassJoin) {
- DEBUG(dbgs() << "\tCross-class joins disabled.\n");
- return false;
- }
- if (!isWinToJoinCrossClass(CP.getSrcReg(), CP.getDstReg(),
- MRI->getRegClass(CP.getSrcReg()),
- MRI->getRegClass(CP.getDstReg()),
- CP.getNewRC())) {
- DEBUG(dbgs() << "\tAvoid coalescing to constrained register class.\n");
- Again = true; // May be possible to coalesce later.
- return false;
- }
- }
+ DEBUG({
+ dbgs() << "\tConsidering merging to " << CP.getNewRC()->getName()
+ << " with ";
+ if (CP.getDstIdx() && CP.getSrcIdx())
+ dbgs() << PrintReg(CP.getDstReg()) << " in "
+ << TRI->getSubRegIndexName(CP.getDstIdx()) << " and "
+ << PrintReg(CP.getSrcReg()) << " in "
+ << TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n';
+ else
+ dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in "
+ << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n';
+ });
// When possible, let DstReg be the larger interval.
- if (!CP.getSubIdx() && LIS->getInterval(CP.getSrcReg()).ranges.size() >
+ if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).ranges.size() >
LIS->getInterval(CP.getDstReg()).ranges.size())
CP.flip();
}
@@ -1179,21 +994,22 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
// Otherwise, if one of the intervals being joined is a physreg, this method
// always canonicalizes DstInt to be it. The output "SrcInt" will not have
// been modified, so we can use this information below to update aliases.
- if (!JoinIntervals(CP)) {
+ if (!joinIntervals(CP)) {
// Coalescing failed.
// If definition of source is defined by trivial computation, try
// rematerializing it.
if (!CP.isFlipped() &&
- ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true,
+ reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()),
CP.getDstReg(), CopyMI))
return true;
// If we can eliminate the copy without merging the live ranges, do so now.
- if (!CP.isPartial()) {
- if (AdjustCopiesBackFrom(CP, CopyMI) ||
- RemoveCopyByCommutingDef(CP, CopyMI)) {
- markAsJoined(CopyMI);
+ if (!CP.isPartial() && !CP.isPhys()) {
+ if (adjustCopiesBackFrom(CP, CopyMI) ||
+ removeCopyByCommutingDef(CP, CopyMI)) {
+ LIS->RemoveMachineInstrFromMaps(CopyMI);
+ CopyMI->eraseFromParent();
DEBUG(dbgs() << "\tTrivial!\n");
return true;
}
@@ -1212,29 +1028,21 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
}
- // Remember to delete the copy instruction.
- markAsJoined(CopyMI);
+ // Removing sub-register copies can ease the register class constraints.
+ // Make sure we attempt to inflate the register class of DstReg.
+ if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC()))
+ InflateRegs.push_back(CP.getDstReg());
- UpdateRegDefsUses(CP);
+ // CopyMI has been erased by joinIntervals at this point. Remove it from
+ // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
+ // to the work list. This keeps ErasedInstrs from growing needlessly.
+ ErasedInstrs.erase(CopyMI);
- // If we have extended the live range of a physical register, make sure we
- // update live-in lists as well.
- if (CP.isPhys()) {
- SmallVector<MachineBasicBlock*, 16> BlockSeq;
- // JoinIntervals invalidates the VNInfos in SrcInt, but we only need the
- // ranges for this, and they are preserved.
- LiveInterval &SrcInt = LIS->getInterval(CP.getSrcReg());
- for (LiveInterval::const_iterator I = SrcInt.begin(), E = SrcInt.end();
- I != E; ++I ) {
- LIS->findLiveInMBBs(I->start, I->end, BlockSeq);
- for (unsigned idx = 0, size = BlockSeq.size(); idx != size; ++idx) {
- MachineBasicBlock &block = *BlockSeq[idx];
- if (!block.isLiveIn(CP.getDstReg()))
- block.addLiveIn(CP.getDstReg());
- }
- BlockSeq.clear();
- }
- }
+ // Rewrite all SrcReg operands to DstReg.
+ // Also update DstReg operands to include DstIdx if it is set.
+ if (CP.getDstIdx())
+ updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
+ updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
// SrcReg is guaranteed to be the register whose live interval is being
// merged.
@@ -1244,16 +1052,56 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
DEBUG({
- LiveInterval &DstInt = LIS->getInterval(CP.getDstReg());
- dbgs() << "\tJoined. Result = ";
- DstInt.print(dbgs(), TRI);
- dbgs() << "\n";
+ dbgs() << "\tJoined. Result = " << PrintReg(CP.getDstReg(), TRI);
+ if (!CP.isPhys())
+ dbgs() << LIS->getInterval(CP.getDstReg());
+ dbgs() << '\n';
});
++numJoins;
return true;
}
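
The ErasedInstrs bookkeeping above can be hard to follow in diff form: instructions deleted mid-pass are remembered so stale work-list pointers get skipped, and successful joins are removed from the set again to keep it small. A minimal stand-alone analogue, assuming a generic pointer work list (nothing here is LLVM API):

    #include <set>
    #include <vector>
    #include <cstdio>

    struct Instr { int id; };

    int main() {
      std::vector<Instr*> WorkList;
      std::set<Instr*> Erased;        // stands in for ErasedInstrs

      Instr A{1}, B{2};
      WorkList.push_back(&A);
      WorkList.push_back(&B);

      // Some transformation deletes B while it is still on the work list.
      Erased.insert(&B);

      for (Instr *&I : WorkList) {
        if (Erased.erase(I)) { I = nullptr; continue; } // skip stale pointer
        std::printf("processing %d\n", I->id);
        I = nullptr;                  // successful join: null out the slot
      }
    }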
+/// Attempt joining with a reserved physreg.
+bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
+ assert(CP.isPhys() && "Must be a physreg copy");
+ assert(RegClassInfo.isReserved(CP.getDstReg()) && "Not a reserved register");
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
+ << '\n');
+
+ assert(CP.isFlipped() && RHS.containsOneValue() &&
+ "Invalid join with reserved register");
+
+ // Optimization for reserved registers like ESP. We can only merge with a
+ // reserved physreg if RHS has a single value that is a copy of CP.DstReg().
+ // The live range of the reserved register will look like a set of dead defs
+ // - we don't properly track the live range of reserved registers.
+
+ // Deny any overlapping intervals. This depends on all the reserved
+ // register live ranges looking like dead defs.
+ for (MCRegUnitIterator UI(CP.getDstReg(), TRI); UI.isValid(); ++UI)
+ if (RHS.overlaps(LIS->getRegUnit(*UI))) {
+ DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n');
+ return false;
+ }
+
+ // Skip any value computations, we are not adding new values to the
+ // reserved register. Also skip merging the live ranges, the reserved
+ // register live range doesn't need to be accurate as long as all the
+ // defs are there.
+
+ // Delete the identity copy.
+ MachineInstr *CopyMI = MRI->getVRegDef(RHS.reg);
+ LIS->RemoveMachineInstrFromMaps(CopyMI);
+ CopyMI->eraseFromParent();
+
+ // We don't track kills for reserved registers.
+ MRI->clearKillFlags(CP.getSrcReg());
+
+ return true;
+}
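
The interference test boils down to a pairwise overlap check between the virtual register's live ranges and each register unit's ranges. A hedged sketch using plain half-open intervals (invented types, O(n*m) for clarity where the real code walks sorted ranges in parallel):

    #include <vector>
    #include <cstdio>

    struct Range { unsigned Start, End; }; // half-open [Start, End)

    static bool overlaps(const std::vector<Range> &A,
                         const std::vector<Range> &B) {
      for (const Range &RA : A)
        for (const Range &RB : B)
          if (RA.Start < RB.End && RB.Start < RA.End)
            return true;
      return false;
    }

    int main() {
      std::vector<Range> RHS  = {{4, 10}};
      std::vector<Range> Unit = {{0, 4}, {12, 16}}; // dead defs only
      std::printf("%s\n", overlaps(RHS, Unit) ? "interference" : "ok"); // ok
    }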
+
/// ComputeUltimateVN - Assuming we are going to join two live intervals,
/// compute what the resultant value numbers for each value in the input two
/// ranges will be. This is complicated by copies between the two which can
@@ -1320,144 +1168,70 @@ static bool RegistersDefinedFromSameValue(LiveIntervals &li,
const TargetRegisterInfo &tri,
CoalescerPair &CP,
VNInfo *VNI,
- LiveRange *LR,
+ VNInfo *OtherVNI,
SmallVector<MachineInstr*, 8> &DupCopies) {
// FIXME: This is very conservative. For example, we don't handle
// physical registers.
MachineInstr *MI = li.getInstructionFromIndex(VNI->def);
- if (!MI || !MI->isFullCopy() || CP.isPartial() || CP.isPhys())
+ if (!MI || CP.isPartial() || CP.isPhys())
return false;
- unsigned Dst = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
-
- if (!TargetRegisterInfo::isVirtualRegister(Src) ||
- !TargetRegisterInfo::isVirtualRegister(Dst))
+ unsigned A = CP.getDstReg();
+ if (!TargetRegisterInfo::isVirtualRegister(A))
return false;
- unsigned A = CP.getDstReg();
unsigned B = CP.getSrcReg();
-
- if (B == Dst)
- std::swap(A, B);
- assert(Dst == A);
-
- VNInfo *Other = LR->valno;
- const MachineInstr *OtherMI = li.getInstructionFromIndex(Other->def);
-
- if (!OtherMI || !OtherMI->isFullCopy())
+ if (!TargetRegisterInfo::isVirtualRegister(B))
return false;
- unsigned OtherDst = OtherMI->getOperand(0).getReg();
- unsigned OtherSrc = OtherMI->getOperand(1).getReg();
-
- if (!TargetRegisterInfo::isVirtualRegister(OtherSrc) ||
- !TargetRegisterInfo::isVirtualRegister(OtherDst))
+ MachineInstr *OtherMI = li.getInstructionFromIndex(OtherVNI->def);
+ if (!OtherMI)
return false;
- assert(OtherDst == B);
-
- if (Src != OtherSrc)
- return false;
+ if (MI->isImplicitDef()) {
+ DupCopies.push_back(MI);
+ return true;
+ } else {
+ if (!MI->isFullCopy())
+ return false;
+ unsigned Src = MI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Src))
+ return false;
+ if (!OtherMI->isFullCopy())
+ return false;
+ unsigned OtherSrc = OtherMI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(OtherSrc))
+ return false;
- // If the copies use two different value numbers of X, we cannot merge
- // A and B.
- LiveInterval &SrcInt = li.getInterval(Src);
- // getVNInfoBefore returns NULL for undef copies. In this case, the
- // optimization is still safe.
- if (SrcInt.getVNInfoBefore(Other->def) != SrcInt.getVNInfoBefore(VNI->def))
- return false;
+ if (Src != OtherSrc)
+ return false;
- DupCopies.push_back(MI);
+ // If the copies use two different value numbers of X, we cannot merge
+ // A and B.
+ LiveInterval &SrcInt = li.getInterval(Src);
+ // getVNInfoBefore returns NULL for undef copies. In this case, the
+ // optimization is still safe.
+ if (SrcInt.getVNInfoBefore(OtherVNI->def) !=
+ SrcInt.getVNInfoBefore(VNI->def))
+ return false;
- return true;
+ DupCopies.push_back(MI);
+ return true;
+ }
}
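
The duplicate-copy test amounts to: two full copies A = X and B = X may be treated as the same value only if both read the same value number of the same source register. A toy model of just that comparison (invented types, not VNInfo):

    #include <cstdio>

    struct FullCopy {
      unsigned Src;      // register X
      unsigned SrcValNo; // value number of X read by this copy
    };

    static bool definedFromSameValue(const FullCopy &A, const FullCopy &B) {
      // Mirrors the getVNInfoBefore() comparison: same register, same value.
      return A.Src == B.Src && A.SrcValNo == B.SrcValNo;
    }

    int main() {
      FullCopy C1 = {7, 0}, C2 = {7, 0}, C3 = {7, 1};
      std::printf("%d %d\n", definedFromSameValue(C1, C2),  // 1
                             definedFromSameValue(C1, C3)); // 0
    }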
-/// JoinIntervals - Attempt to join these two intervals. On failure, this
+/// joinIntervals - Attempt to join these two intervals. On failure, this
/// returns false.
-bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
- LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
- DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), TRI); dbgs() << "\n"; });
-
- // If a live interval is a physical register, check for interference with any
- // aliases. The interference check implemented here is a bit more conservative
- // than the full interference check below. We allow overlapping live ranges
- // only when one is a copy of the other.
- if (CP.isPhys()) {
- // Optimization for reserved registers like ESP.
- // We can only merge with a reserved physreg if RHS has a single value that
- // is a copy of CP.DstReg(). The live range of the reserved register will
- // look like a set of dead defs - we don't properly track the live range of
- // reserved registers.
- if (RegClassInfo.isReserved(CP.getDstReg())) {
- assert(CP.isFlipped() && RHS.containsOneValue() &&
- "Invalid join with reserved register");
- // Deny any overlapping intervals. This depends on all the reserved
- // register live ranges looking like dead defs.
- for (const uint16_t *AS = TRI->getOverlaps(CP.getDstReg()); *AS; ++AS) {
- if (!LIS->hasInterval(*AS)) {
- // Make sure at least DstReg itself exists before attempting a join.
- if (*AS == CP.getDstReg())
- LIS->getOrCreateInterval(CP.getDstReg());
- continue;
- }
- if (RHS.overlaps(LIS->getInterval(*AS))) {
- DEBUG(dbgs() << "\t\tInterference: " << PrintReg(*AS, TRI) << '\n');
- return false;
- }
- }
- // Skip any value computations, we are not adding new values to the
- // reserved register. Also skip merging the live ranges, the reserved
- // register live range doesn't need to be accurate as long as all the
- // defs are there.
- return true;
- }
-
- // Check if a register mask clobbers DstReg.
- BitVector UsableRegs;
- if (LIS->checkRegMaskInterference(RHS, UsableRegs) &&
- !UsableRegs.test(CP.getDstReg())) {
- DEBUG(dbgs() << "\t\tRegister mask interference.\n");
- return false;
- }
+bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
+ // Handle physreg joins separately.
+ if (CP.isPhys())
+ return joinReservedPhysReg(CP);
- for (const uint16_t *AS = TRI->getAliasSet(CP.getDstReg()); *AS; ++AS){
- if (!LIS->hasInterval(*AS))
- continue;
- const LiveInterval &LHS = LIS->getInterval(*AS);
- LiveInterval::const_iterator LI = LHS.begin();
- for (LiveInterval::const_iterator RI = RHS.begin(), RE = RHS.end();
- RI != RE; ++RI) {
- LI = std::lower_bound(LI, LHS.end(), RI->start);
- // Does LHS have an overlapping live range starting before RI?
- if ((LI != LHS.begin() && LI[-1].end > RI->start) &&
- (RI->start != RI->valno->def ||
- !CP.isCoalescable(LIS->getInstructionFromIndex(RI->start)))) {
- DEBUG({
- dbgs() << "\t\tInterference from alias: ";
- LHS.print(dbgs(), TRI);
- dbgs() << "\n\t\tOverlap at " << RI->start << " and no copy.\n";
- });
- return false;
- }
-
- // Check that LHS ranges beginning in this range are copies.
- for (; LI != LHS.end() && LI->start < RI->end; ++LI) {
- if (LI->start != LI->valno->def ||
- !CP.isCoalescable(LIS->getInstructionFromIndex(LI->start))) {
- DEBUG({
- dbgs() << "\t\tInterference from alias: ";
- LHS.print(dbgs(), TRI);
- dbgs() << "\n\t\tDef at " << LI->start << " is not a copy.\n";
- });
- return false;
- }
- }
- }
- }
- }
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
+ << '\n');
// Compute the final value assignment, assuming that the live ranges can be
// coalesced.
@@ -1468,9 +1242,11 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
SmallVector<VNInfo*, 16> NewVNInfo;
SmallVector<MachineInstr*, 8> DupCopies;
+ SmallVector<MachineInstr*, 8> DeadCopies;
LiveInterval &LHS = LIS->getOrCreateInterval(CP.getDstReg());
- DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), TRI); dbgs() << "\n"; });
+ DEBUG(dbgs() << "\t\tLHS = " << PrintReg(CP.getDstReg(), TRI) << ' ' << LHS
+ << '\n');
// Loop over the value numbers of the LHS, seeing if any are defined from
// the RHS.
@@ -1481,21 +1257,24 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
continue;
MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def);
assert(MI && "Missing def");
- if (!MI->isCopyLike()) // Src not defined by a copy?
+ if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy?
continue;
// Figure out the value # from the RHS.
- LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot());
+ VNInfo *OtherVNI = RHS.getVNInfoBefore(VNI->def);
// The copy could be to an aliased physreg.
- if (!lr) continue;
+ if (!OtherVNI)
+ continue;
// DstReg is known to be a register in the LHS interval. If the src is
// from the RHS interval, we can use its value #.
- if (!CP.isCoalescable(MI) &&
- !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies))
+ if (CP.isCoalescable(MI))
+ DeadCopies.push_back(MI);
+ else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI,
+ DupCopies))
continue;
- LHSValsDefinedFromRHS[VNI] = lr->valno;
+ LHSValsDefinedFromRHS[VNI] = OtherVNI;
}
// Loop over the value numbers of the RHS, seeing if any are defined from
@@ -1507,21 +1286,24 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
continue;
MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def);
assert(MI && "Missing def");
- if (!MI->isCopyLike()) // Src not defined by a copy?
+ if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy?
continue;
// Figure out the value # from the LHS.
- LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot());
+ VNInfo *OtherVNI = LHS.getVNInfoBefore(VNI->def);
// The copy could be to an aliased physreg.
- if (!lr) continue;
+ if (!OtherVNI)
+ continue;
// DstReg is known to be a register in the RHS interval. If the src is
// from the LHS interval, we can use its value #.
- if (!CP.isCoalescable(MI) &&
- !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies))
+ if (CP.isCoalescable(MI))
+ DeadCopies.push_back(MI);
+ else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI,
+ DupCopies))
continue;
- RHSValsDefinedFromLHS[VNI] = lr->valno;
+ RHSValsDefinedFromLHS[VNI] = OtherVNI;
}
LHSValNoAssignments.resize(LHS.getNumValNums(), -1);
@@ -1563,6 +1345,10 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
LiveInterval::const_iterator J = RHS.begin();
LiveInterval::const_iterator JE = RHS.end();
+ // Collect interval end points that will no longer be kills.
+ SmallVector<MachineInstr*, 8> LHSOldKills;
+ SmallVector<MachineInstr*, 8> RHSOldKills;
+
// Skip ahead until the first place of potential sharing.
if (I != IE && J != JE) {
if (I->start < J->start) {
@@ -1576,20 +1362,21 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
while (I != IE && J != JE) {
// Determine if these two live ranges overlap.
- bool Overlaps;
- if (I->start < J->start) {
- Overlaps = I->end > J->start;
- } else {
- Overlaps = J->end > I->start;
- }
-
// If so, check value # info to determine if they are really different.
- if (Overlaps) {
+ if (I->end > J->start && J->end > I->start) {
// If the live range overlap will map to the same value number in the
// result liverange, we can still coalesce them. If not, we can't.
if (LHSValNoAssignments[I->valno->id] !=
RHSValNoAssignments[J->valno->id])
return false;
+
+ // Extended live ranges should no longer be killed.
+ if (!I->end.isBlock() && I->end < J->end)
+ if (MachineInstr *MI = LIS->getInstructionFromIndex(I->end))
+ LHSOldKills.push_back(MI);
+ if (!J->end.isBlock() && J->end < I->end)
+ if (MachineInstr *MI = LIS->getInstructionFromIndex(J->end))
+ RHSOldKills.push_back(MI);
}
if (I->end < J->end)
@@ -1598,47 +1385,48 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
++J;
}
- // Update kill info. Some live ranges are extended due to copy coalescing.
- for (DenseMap<VNInfo*, VNInfo*>::iterator I = LHSValsDefinedFromRHS.begin(),
- E = LHSValsDefinedFromRHS.end(); I != E; ++I) {
- VNInfo *VNI = I->first;
- unsigned LHSValID = LHSValNoAssignments[VNI->id];
- if (VNI->hasPHIKill())
- NewVNInfo[LHSValID]->setHasPHIKill(true);
- }
-
- // Update kill info. Some live ranges are extended due to copy coalescing.
- for (DenseMap<VNInfo*, VNInfo*>::iterator I = RHSValsDefinedFromLHS.begin(),
- E = RHSValsDefinedFromLHS.end(); I != E; ++I) {
- VNInfo *VNI = I->first;
- unsigned RHSValID = RHSValNoAssignments[VNI->id];
- if (VNI->hasPHIKill())
- NewVNInfo[RHSValID]->setHasPHIKill(true);
- }
+ // Clear kill flags where live ranges are extended.
+ while (!LHSOldKills.empty())
+ LHSOldKills.pop_back_val()->clearRegisterKills(LHS.reg, TRI);
+ while (!RHSOldKills.empty())
+ RHSOldKills.pop_back_val()->clearRegisterKills(RHS.reg, TRI);
if (LHSValNoAssignments.empty())
LHSValNoAssignments.push_back(-1);
if (RHSValNoAssignments.empty())
RHSValNoAssignments.push_back(-1);
+ // Now erase all the redundant copies.
+ for (unsigned i = 0, e = DeadCopies.size(); i != e; ++i) {
+ MachineInstr *MI = DeadCopies[i];
+ if (!ErasedInstrs.insert(MI))
+ continue;
+ DEBUG(dbgs() << "\t\terased:\t" << LIS->getInstructionIndex(MI)
+ << '\t' << *MI);
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+ }
+
SmallVector<unsigned, 8> SourceRegisters;
for (SmallVector<MachineInstr*, 8>::iterator I = DupCopies.begin(),
E = DupCopies.end(); I != E; ++I) {
MachineInstr *MI = *I;
+ if (!ErasedInstrs.insert(MI))
+ continue;
- // We have pretended that the assignment to B in
+ // If MI is a copy, then we have pretended that the assignment to B in
// A = X
// B = X
// was actually a copy from A. Now that we decided to coalesce A and B,
// transform the code into
// A = X
- // X = X
- // and mark the X as coalesced to keep the illusion.
- unsigned Src = MI->getOperand(1).getReg();
- SourceRegisters.push_back(Src);
- MI->getOperand(0).substVirtReg(Src, 0, *TRI);
-
- markAsJoined(MI);
+ // In the case of the implicit_def, we just have to remove it.
+ if (!MI->isImplicitDef()) {
+ unsigned Src = MI->getOperand(1).getReg();
+ SourceRegisters.push_back(Src);
+ }
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
}
// If B = X was the last use of X in a liverange, we have to shrink it now
@@ -1678,73 +1466,58 @@ namespace {
};
}
-void RegisterCoalescer::CopyCoalesceInMBB(MachineBasicBlock *MBB,
- std::vector<MachineInstr*> &TryAgain) {
- DEBUG(dbgs() << MBB->getName() << ":\n");
-
- SmallVector<MachineInstr*, 8> VirtCopies;
- SmallVector<MachineInstr*, 8> PhysCopies;
- SmallVector<MachineInstr*, 8> ImpDefCopies;
- for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
- MII != E;) {
- MachineInstr *Inst = MII++;
-
- // If this isn't a copy nor a extract_subreg, we can't join intervals.
- unsigned SrcReg, DstReg;
- if (Inst->isCopy()) {
- DstReg = Inst->getOperand(0).getReg();
- SrcReg = Inst->getOperand(1).getReg();
- } else if (Inst->isSubregToReg()) {
- DstReg = Inst->getOperand(0).getReg();
- SrcReg = Inst->getOperand(2).getReg();
- } else
+// Try joining WorkList copies starting from index From.
+// Null out any successful joins.
+bool RegisterCoalescer::copyCoalesceWorkList(unsigned From) {
+ assert(From <= WorkList.size() && "Out of range");
+ bool Progress = false;
+ for (unsigned i = From, e = WorkList.size(); i != e; ++i) {
+ if (!WorkList[i])
continue;
-
- bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
- bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
- if (LIS->hasInterval(SrcReg) && LIS->getInterval(SrcReg).empty())
- ImpDefCopies.push_back(Inst);
- else if (SrcIsPhys || DstIsPhys)
- PhysCopies.push_back(Inst);
- else
- VirtCopies.push_back(Inst);
- }
-
- // Try coalescing implicit copies and insert_subreg <undef> first,
- // followed by copies to / from physical registers, then finally copies
- // from virtual registers to virtual registers.
- for (unsigned i = 0, e = ImpDefCopies.size(); i != e; ++i) {
- MachineInstr *TheCopy = ImpDefCopies[i];
- bool Again = false;
- if (!JoinCopy(TheCopy, Again))
- if (Again)
- TryAgain.push_back(TheCopy);
- }
- for (unsigned i = 0, e = PhysCopies.size(); i != e; ++i) {
- MachineInstr *TheCopy = PhysCopies[i];
- bool Again = false;
- if (!JoinCopy(TheCopy, Again))
- if (Again)
- TryAgain.push_back(TheCopy);
- }
- for (unsigned i = 0, e = VirtCopies.size(); i != e; ++i) {
- MachineInstr *TheCopy = VirtCopies[i];
+ // Skip instruction pointers that have already been erased, for example by
+ // dead code elimination.
+ if (ErasedInstrs.erase(WorkList[i])) {
+ WorkList[i] = 0;
+ continue;
+ }
bool Again = false;
- if (!JoinCopy(TheCopy, Again))
- if (Again)
- TryAgain.push_back(TheCopy);
+ bool Success = joinCopy(WorkList[i], Again);
+ Progress |= Success;
+ if (Success || !Again)
+ WorkList[i] = 0;
}
+ return Progress;
}
-void RegisterCoalescer::joinIntervals() {
+void
+RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << MBB->getName() << ":\n");
+
+ // Collect all copy-like instructions in MBB. Don't start coalescing anything
+ // yet, it might invalidate the iterator.
+ const unsigned PrevSize = WorkList.size();
+ for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
+ MII != E; ++MII)
+ if (MII->isCopyLike())
+ WorkList.push_back(MII);
+
+ // Try coalescing the collected copies immediately, and remove the nulls.
+ // This prevents the WorkList from getting too large since most copies are
+ // joinable on the first attempt.
+ if (copyCoalesceWorkList(PrevSize))
+ WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
+ (MachineInstr*)0), WorkList.end());
+}
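
The null-out-then-compact discipline shared by copyCoalesceWorkList() and copyCoalesceInMBB() is the standard erase/remove idiom; a self-contained sketch with std containers (names invented):

    #include <algorithm>
    #include <vector>
    #include <cstdio>

    int main() {
      std::vector<int*> WorkList;
      int a = 1, b = 2, c = 3;
      WorkList = {&a, &b, &c};

      // Pretend entries a and c were joined on the first attempt.
      for (int *&P : WorkList)
        if (P && *P != 2)
          P = nullptr;

      // Compact so the list stays small across blocks.
      WorkList.erase(std::remove(WorkList.begin(), WorkList.end(),
                                 static_cast<int*>(nullptr)),
                     WorkList.end());
      std::printf("%zu entry left\n", WorkList.size()); // 1
    }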
+
+void RegisterCoalescer::joinAllIntervals() {
DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
+ assert(WorkList.empty() && "Old data still around.");
- std::vector<MachineInstr*> TryAgainList;
if (Loops->empty()) {
// If there are no loops in the function, join intervals in function order.
for (MachineFunction::iterator I = MF->begin(), E = MF->end();
I != E; ++I)
- CopyCoalesceInMBB(I, TryAgainList);
+ copyCoalesceInMBB(I);
} else {
// Otherwise, join intervals in inner loops before other intervals.
// Unfortunately we can't just iterate over loop hierarchy here because
@@ -1763,34 +1536,20 @@ void RegisterCoalescer::joinIntervals() {
// Finally, join intervals in loop nest order.
for (unsigned i = 0, e = MBBs.size(); i != e; ++i)
- CopyCoalesceInMBB(MBBs[i].second, TryAgainList);
+ copyCoalesceInMBB(MBBs[i].second);
}
// Joining intervals can allow other intervals to be joined. Iteratively join
// until we make no progress.
- bool ProgressMade = true;
- while (ProgressMade) {
- ProgressMade = false;
-
- for (unsigned i = 0, e = TryAgainList.size(); i != e; ++i) {
- MachineInstr *&TheCopy = TryAgainList[i];
- if (!TheCopy)
- continue;
-
- bool Again = false;
- bool Success = JoinCopy(TheCopy, Again);
- if (Success || !Again) {
- TheCopy= 0; // Mark this one as done.
- ProgressMade = true;
- }
- }
- }
+ while (copyCoalesceWorkList())
+ /* empty */ ;
}
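
The retry loop is a plain fixed-point iteration: keep re-running the work list until a whole pass makes no progress. A minimal equivalent (the progress function is a stand-in for "another copy was joined"):

    #include <cstdio>

    static int Budget = 3;
    static bool tryJoinRemaining() { return --Budget > 0; } // fake progress

    int main() {
      unsigned Rounds = 0;
      while (tryJoinRemaining()) // iterate until nothing more can be joined
        ++Rounds;
      std::printf("%u extra rounds\n", Rounds); // 2
    }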
void RegisterCoalescer::releaseMemory() {
- JoinedCopies.clear();
- ReMatCopies.clear();
- ReMatDefs.clear();
+ ErasedInstrs.clear();
+ WorkList.clear();
+ DeadDefs.clear();
+ InflateRegs.clear();
}
bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
@@ -1814,138 +1573,11 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
RegClassInfo.runOnMachineFunction(fn);
// Join (coalesce) intervals if requested.
- if (EnableJoining) {
- joinIntervals();
- DEBUG({
- dbgs() << "********** INTERVALS POST JOINING **********\n";
- for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end();
- I != E; ++I){
- I->second->print(dbgs(), TRI);
- dbgs() << "\n";
- }
- });
- }
-
- // Perform a final pass over the instructions and compute spill weights
- // and remove identity moves.
- SmallVector<unsigned, 4> DeadDefs, InflateRegs;
- for (MachineFunction::iterator mbbi = MF->begin(), mbbe = MF->end();
- mbbi != mbbe; ++mbbi) {
- MachineBasicBlock* mbb = mbbi;
- for (MachineBasicBlock::iterator mii = mbb->begin(), mie = mbb->end();
- mii != mie; ) {
- MachineInstr *MI = mii;
- if (JoinedCopies.count(MI)) {
- // Delete all coalesced copies.
- bool DoDelete = true;
- assert(MI->isCopyLike() && "Unrecognized copy instruction");
- unsigned SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg();
- unsigned DstReg = MI->getOperand(0).getReg();
-
- // Collect candidates for register class inflation.
- if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
- RegClassInfo.isProperSubClass(MRI->getRegClass(SrcReg)))
- InflateRegs.push_back(SrcReg);
- if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
- RegClassInfo.isProperSubClass(MRI->getRegClass(DstReg)))
- InflateRegs.push_back(DstReg);
-
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
- MI->getNumOperands() > 2)
- // Do not delete extract_subreg, insert_subreg of physical
- // registers unless the definition is dead. e.g.
- // %DO<def> = INSERT_SUBREG %D0<undef>, %S0<kill>, 1
- // or else the scavenger may complain. LowerSubregs will
- // delete them later.
- DoDelete = false;
-
- if (MI->allDefsAreDead()) {
- if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
- LIS->hasInterval(SrcReg))
- LIS->shrinkToUses(&LIS->getInterval(SrcReg));
- DoDelete = true;
- }
- if (!DoDelete) {
- // We need the instruction to adjust liveness, so make it a KILL.
- if (MI->isSubregToReg()) {
- MI->RemoveOperand(3);
- MI->RemoveOperand(1);
- }
- MI->setDesc(TII->get(TargetOpcode::KILL));
- mii = llvm::next(mii);
- } else {
- LIS->RemoveMachineInstrFromMaps(MI);
- mii = mbbi->erase(mii);
- ++numPeep;
- }
- continue;
- }
-
- // Now check if this is a remat'ed def instruction which is now dead.
- if (ReMatDefs.count(MI)) {
- bool isDead = true;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- DeadDefs.push_back(Reg);
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- // Remat may also enable register class inflation.
- if (RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)))
- InflateRegs.push_back(Reg);
- }
- if (MO.isDead())
- continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !MRI->use_nodbg_empty(Reg)) {
- isDead = false;
- break;
- }
- }
- if (isDead) {
- while (!DeadDefs.empty()) {
- unsigned DeadDef = DeadDefs.back();
- DeadDefs.pop_back();
- RemoveDeadDef(LIS->getInterval(DeadDef), MI);
- }
- LIS->RemoveMachineInstrFromMaps(mii);
- mii = mbbi->erase(mii);
- continue;
- } else
- DeadDefs.clear();
- }
-
- ++mii;
-
- // Check for now unnecessary kill flags.
- if (LIS->isNotInMIMap(MI)) continue;
- SlotIndex DefIdx = LIS->getInstructionIndex(MI).getRegSlot();
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isKill()) continue;
- unsigned reg = MO.getReg();
- if (!reg || !LIS->hasInterval(reg)) continue;
- if (!LIS->getInterval(reg).killedAt(DefIdx)) {
- MO.setIsKill(false);
- continue;
- }
- // When leaving a kill flag on a physreg, check if any subregs should
- // remain alive.
- if (!TargetRegisterInfo::isPhysicalRegister(reg))
- continue;
- for (const uint16_t *SR = TRI->getSubRegisters(reg);
- unsigned S = *SR; ++SR)
- if (LIS->hasInterval(S) && LIS->getInterval(S).liveAt(DefIdx))
- MI->addRegisterDefined(S, TRI);
- }
- }
- }
+ if (EnableJoining)
+ joinAllIntervals();
// After deleting a lot of copies, register classes may be less constrained.
- // Removing sub-register opreands may alow GR32_ABCD -> GR32 and DPR_VFP2 ->
+ // Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 ->
// DPR inflation.
array_pod_sort(InflateRegs.begin(), InflateRegs.end());
InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h
index 310b933..8a6df98 100644
--- a/lib/CodeGen/RegisterCoalescer.h
+++ b/lib/CodeGen/RegisterCoalescer.h
@@ -26,7 +26,6 @@ namespace llvm {
/// two registers can be coalesced, CoalescerPair can determine if a copy
/// instruction would become an identity copy after coalescing.
class CoalescerPair {
- const TargetInstrInfo &TII;
const TargetRegisterInfo &TRI;
/// DstReg - The register that will be left after coalescing. It can be a
@@ -36,10 +35,13 @@ namespace llvm {
/// SrcReg - the virtual register that will be coalesced into dstReg.
unsigned SrcReg;
- /// subReg_ - The subregister index of srcReg in DstReg. It is possible to
- /// coalesce SrcReg into a subreg of the larger DstReg when DstReg is a
- /// virtual register.
- unsigned SubIdx;
+ /// DstIdx - The sub-register index of the old DstReg in the new coalesced
+ /// register.
+ unsigned DstIdx;
+
+ /// SrcIdx - The sub-register index of the old SrcReg in the new coalesced
+ /// register.
+ unsigned SrcIdx;
/// Partial - True when the original copy was a partial subregister copy.
bool Partial;
@@ -52,12 +54,13 @@ namespace llvm {
bool Flipped;
/// NewRC - The register class of the coalesced register, or NULL if DstReg
- /// is a physreg.
+ /// is a physreg. This register class may be a super-register of both
+ /// SrcReg and DstReg.
const TargetRegisterClass *NewRC;
public:
- CoalescerPair(const TargetInstrInfo &tii, const TargetRegisterInfo &tri)
- : TII(tii), TRI(tri), DstReg(0), SrcReg(0), SubIdx(0),
+ CoalescerPair(const TargetRegisterInfo &tri)
+ : TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0),
Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
/// setRegisters - set registers to match the copy instruction MI. Return
@@ -94,9 +97,13 @@ namespace llvm {
/// getSrcReg - Return the virtual register that will be coalesced away.
unsigned getSrcReg() const { return SrcReg; }
- /// getSubIdx - Return the subregister index in DstReg that SrcReg will be
- /// coalesced into, or 0.
- unsigned getSubIdx() const { return SubIdx; }
+ /// getDstIdx - Return the subregister index that DstReg will be coalesced
+ /// into, or 0.
+ unsigned getDstIdx() const { return DstIdx; }
+
+ /// getSrcIdx - Return the subregister index that SrcReg will be coalesced
+ /// into, or 0.
+ unsigned getSrcIdx() const { return SrcIdx; }
/// getNewRC - Return the register class of the coalesced register.
const TargetRegisterClass *getNewRC() const { return NewRC; }
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
new file mode 100644
index 0000000..43448c8
--- /dev/null
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -0,0 +1,841 @@
+//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterPressure class which can be used to track
+// MachineInstr level register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// Increase register pressure for each set impacted by this register class.
+static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+ std::vector<unsigned> &MaxSetPressure,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ unsigned Weight = TRI->getRegClassWeight(RC).RegWeight;
+ for (const int *PSet = TRI->getRegClassPressureSets(RC);
+ *PSet != -1; ++PSet) {
+ CurrSetPressure[*PSet] += Weight;
+ if (&CurrSetPressure != &MaxSetPressure
+ && CurrSetPressure[*PSet] > MaxSetPressure[*PSet]) {
+ MaxSetPressure[*PSet] = CurrSetPressure[*PSet];
+ }
+ }
+}
+
+/// Decrease register pressure for each set impacted by this register class.
+static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ unsigned Weight = TRI->getRegClassWeight(RC).RegWeight;
+ for (const int *PSet = TRI->getRegClassPressureSets(RC);
+ *PSet != -1; ++PSet) {
+ assert(CurrSetPressure[*PSet] >= Weight && "register pressure underflow");
+ CurrSetPressure[*PSet] -= Weight;
+ }
+}
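
These two helpers implement the core accounting: each register class feeds its weight into one or more pressure sets, and every increase bumps a high-water mark. A simplified model with the same shape (weights and set IDs invented):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    static void increase(std::vector<unsigned> &Curr, std::vector<unsigned> &Max,
                         const std::vector<int> &Sets, unsigned Weight) {
      for (int S : Sets) {
        Curr[S] += Weight;
        if (Curr[S] > Max[S]) Max[S] = Curr[S]; // bump the high-water mark
      }
    }

    static void decrease(std::vector<unsigned> &Curr,
                         const std::vector<int> &Sets, unsigned Weight) {
      for (int S : Sets) {
        assert(Curr[S] >= Weight && "register pressure underflow");
        Curr[S] -= Weight;
      }
    }

    int main() {
      std::vector<unsigned> Curr(2, 0), Max(2, 0);
      increase(Curr, Max, {0, 1}, 2);
      decrease(Curr, {0, 1}, 2);
      std::printf("curr=%u max=%u\n", Curr[0], Max[0]); // curr=0 max=2
    }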
+
+/// Directly increase pressure only within this RegisterPressure result.
+void RegisterPressure::increase(const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ increaseSetPressure(MaxSetPressure, MaxSetPressure, RC, TRI);
+}
+
+/// Directly decrease pressure only within this RegisterPressure result.
+void RegisterPressure::decrease(const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ decreaseSetPressure(MaxSetPressure, RC, TRI);
+}
+
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
+ dbgs() << "Live In: ";
+ for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
+ dbgs() << PrintReg(LiveInRegs[i], TRI) << " ";
+ dbgs() << '\n';
+ dbgs() << "Live Out: ";
+ for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i)
+ dbgs() << PrintReg(LiveOutRegs[i], TRI) << " ";
+ dbgs() << '\n';
+ for (unsigned i = 0, e = MaxSetPressure.size(); i < e; ++i) {
+ if (MaxSetPressure[i] != 0)
+ dbgs() << TRI->getRegPressureSetName(i) << "=" << MaxSetPressure[i]
+ << '\n';
+ }
+}
+
+/// Increase the current pressure as impacted by these physical registers and
+/// bump the high water mark if needed.
+void RegPressureTracker::increasePhysRegPressure(ArrayRef<unsigned> Regs) {
+ for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+ increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+ TRI->getMinimalPhysRegClass(Regs[I]), TRI);
+}
+
+/// Simply decrease the current pressure as impacted by these physical
+/// registers.
+void RegPressureTracker::decreasePhysRegPressure(ArrayRef<unsigned> Regs) {
+ for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+ decreaseSetPressure(CurrSetPressure, TRI->getMinimalPhysRegClass(Regs[I]),
+ TRI);
+}
+
+/// Increase the current pressure as impacted by these virtual registers and
+/// bump the high water mark if needed.
+void RegPressureTracker::increaseVirtRegPressure(ArrayRef<unsigned> Regs) {
+ for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+ increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+ MRI->getRegClass(Regs[I]), TRI);
+}
+
+/// Simply decrease the current pressure as impacted by these virtual registers.
+void RegPressureTracker::decreaseVirtRegPressure(ArrayRef<unsigned> Regs) {
+ for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+ decreaseSetPressure(CurrSetPressure, MRI->getRegClass(Regs[I]), TRI);
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void IntervalPressure::reset() {
+ TopIdx = BottomIdx = SlotIndex();
+ MaxSetPressure.clear();
+ LiveInRegs.clear();
+ LiveOutRegs.clear();
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void RegionPressure::reset() {
+ TopPos = BottomPos = MachineBasicBlock::const_iterator();
+ MaxSetPressure.clear();
+ LiveInRegs.clear();
+ LiveOutRegs.clear();
+}
+
+/// If the current top is not less than or equal to the next index, open it.
+/// We happen to need the SlotIndex for the next top for pressure update.
+void IntervalPressure::openTop(SlotIndex NextTop) {
+ if (TopIdx <= NextTop)
+ return;
+ TopIdx = SlotIndex();
+ LiveInRegs.clear();
+}
+
+/// If the current top is the previous instruction (before receding), open it.
+void RegionPressure::openTop(MachineBasicBlock::const_iterator PrevTop) {
+ if (TopPos != PrevTop)
+ return;
+ TopPos = MachineBasicBlock::const_iterator();
+ LiveInRegs.clear();
+}
+
+/// If the current bottom is not greater than the previous index, open it.
+void IntervalPressure::openBottom(SlotIndex PrevBottom) {
+ if (BottomIdx > PrevBottom)
+ return;
+ BottomIdx = SlotIndex();
+ LiveInRegs.clear();
+}
+
+/// If the current bottom is the previous instr (before advancing), open it.
+void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) {
+ if (BottomPos != PrevBottom)
+ return;
+ BottomPos = MachineBasicBlock::const_iterator();
+ LiveInRegs.clear();
+}
+
+/// Set up the RegPressureTracker.
+///
+/// TODO: Add support for pressure without LiveIntervals.
+void RegPressureTracker::init(const MachineFunction *mf,
+ const RegisterClassInfo *rci,
+ const LiveIntervals *lis,
+ const MachineBasicBlock *mbb,
+ MachineBasicBlock::const_iterator pos)
+{
+ MF = mf;
+ TRI = MF->getTarget().getRegisterInfo();
+ RCI = rci;
+ MRI = &MF->getRegInfo();
+ MBB = mbb;
+
+ if (RequireIntervals) {
+ assert(lis && "IntervalPressure requires LiveIntervals");
+ LIS = lis;
+ }
+
+ CurrPos = pos;
+ while (CurrPos != MBB->end() && CurrPos->isDebugValue())
+ ++CurrPos;
+
+ CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0);
+
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).reset();
+ else
+ static_cast<RegionPressure&>(P).reset();
+ P.MaxSetPressure = CurrSetPressure;
+
+ LivePhysRegs.clear();
+ LivePhysRegs.setUniverse(TRI->getNumRegs());
+ LiveVirtRegs.clear();
+ LiveVirtRegs.setUniverse(MRI->getNumVirtRegs());
+}
+
+/// Does this pressure result have a valid top position and live ins.
+bool RegPressureTracker::isTopClosed() const {
+ if (RequireIntervals)
+ return static_cast<IntervalPressure&>(P).TopIdx.isValid();
+ return (static_cast<RegionPressure&>(P).TopPos ==
+ MachineBasicBlock::const_iterator());
+}
+
+/// Does this pressure result have a valid bottom position and live outs.
+bool RegPressureTracker::isBottomClosed() const {
+ if (RequireIntervals)
+ return static_cast<IntervalPressure&>(P).BottomIdx.isValid();
+ return (static_cast<RegionPressure&>(P).BottomPos ==
+ MachineBasicBlock::const_iterator());
+}
+
+/// Set the boundary for the top of the region and summarize live ins.
+void RegPressureTracker::closeTop() {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).TopIdx =
+ LIS->getInstructionIndex(CurrPos).getRegSlot();
+ else
+ static_cast<RegionPressure&>(P).TopPos = CurrPos;
+
+ assert(P.LiveInRegs.empty() && "inconsistent max pressure result");
+ P.LiveInRegs.reserve(LivePhysRegs.size() + LiveVirtRegs.size());
+ P.LiveInRegs.append(LivePhysRegs.begin(), LivePhysRegs.end());
+ for (SparseSet<unsigned>::const_iterator I =
+ LiveVirtRegs.begin(), E = LiveVirtRegs.end(); I != E; ++I)
+ P.LiveInRegs.push_back(*I);
+ std::sort(P.LiveInRegs.begin(), P.LiveInRegs.end());
+ P.LiveInRegs.erase(std::unique(P.LiveInRegs.begin(), P.LiveInRegs.end()),
+ P.LiveInRegs.end());
+}
+
+/// Set the boundary for the bottom of the region and summarize live outs.
+void RegPressureTracker::closeBottom() {
+ if (RequireIntervals)
+ if (CurrPos == MBB->end())
+ static_cast<IntervalPressure&>(P).BottomIdx = LIS->getMBBEndIdx(MBB);
+ else
+ static_cast<IntervalPressure&>(P).BottomIdx =
+ LIS->getInstructionIndex(CurrPos).getRegSlot();
+ else
+ static_cast<RegionPressure&>(P).BottomPos = CurrPos;
+
+ assert(P.LiveOutRegs.empty() && "inconsistent max pressure result");
+ P.LiveOutRegs.reserve(LivePhysRegs.size() + LiveVirtRegs.size());
+ P.LiveOutRegs.append(LivePhysRegs.begin(), LivePhysRegs.end());
+ for (SparseSet<unsigned>::const_iterator I =
+ LiveVirtRegs.begin(), E = LiveVirtRegs.end(); I != E; ++I)
+ P.LiveOutRegs.push_back(*I);
+ std::sort(P.LiveOutRegs.begin(), P.LiveOutRegs.end());
+ P.LiveOutRegs.erase(std::unique(P.LiveOutRegs.begin(), P.LiveOutRegs.end()),
+ P.LiveOutRegs.end());
+}
+
+/// Finalize the region boundaries and record live ins and live outs.
+void RegPressureTracker::closeRegion() {
+ if (!isTopClosed() && !isBottomClosed()) {
+ assert(LivePhysRegs.empty() && LiveVirtRegs.empty() &&
+ "no region boundary");
+ return;
+ }
+ if (!isBottomClosed())
+ closeBottom();
+ else if (!isTopClosed())
+ closeTop();
+ // If both top and bottom are closed, do nothing.
+}
+
+/// Return true if Reg aliases a register in Regs SparseSet.
+static bool hasRegAlias(unsigned Reg, SparseSet<unsigned> &Regs,
+ const TargetRegisterInfo *TRI) {
+ assert(!TargetRegisterInfo::isVirtualRegister(Reg) && "only for physregs");
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ if (Regs.count(*AI))
+ return true;
+ return false;
+}
+
+/// Return an iterator to the register in the unsorted Regs SmallVector that
+/// aliases Reg, or Regs.end() if there is none. This is only valid for
+/// physical registers.
+static SmallVectorImpl<unsigned>::iterator
+findRegAlias(unsigned Reg, SmallVectorImpl<unsigned> &Regs,
+ const TargetRegisterInfo *TRI) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ SmallVectorImpl<unsigned>::iterator I =
+ std::find(Regs.begin(), Regs.end(), *AI);
+ if (I != Regs.end())
+ return I;
+ }
+ return Regs.end();
+}
+
+/// Find Reg in the Regs SmallVector, returning an iterator to the match or
+/// Regs.end(). For virtual registers, do a linear search; for physical
+/// registers, check for aliases.
+static SmallVectorImpl<unsigned>::iterator
+findReg(unsigned Reg, bool isVReg, SmallVectorImpl<unsigned> &Regs,
+ const TargetRegisterInfo *TRI) {
+ if (isVReg)
+ return std::find(Regs.begin(), Regs.end(), Reg);
+ return findRegAlias(Reg, Regs, TRI);
+}
+
+/// Collect this instruction's unique uses and defs into SmallVectors for
+/// processing defs and uses in order.
+template<bool isVReg>
+struct RegisterOperands {
+ SmallVector<unsigned, 8> Uses;
+ SmallVector<unsigned, 8> Defs;
+ SmallVector<unsigned, 8> DeadDefs;
+
+ /// Push this operand's register onto the correct vector.
+ void collect(const MachineOperand &MO, const TargetRegisterInfo *TRI) {
+ if (MO.readsReg()) {
+ if (findReg(MO.getReg(), isVReg, Uses, TRI) == Uses.end())
+ Uses.push_back(MO.getReg());
+ }
+ if (MO.isDef()) {
+ if (MO.isDead()) {
+ if (findReg(MO.getReg(), isVReg, DeadDefs, TRI) == DeadDefs.end())
+ DeadDefs.push_back(MO.getReg());
+ }
+ else {
+ if (findReg(MO.getReg(), isVReg, Defs, TRI) == Defs.end())
+ Defs.push_back(MO.getReg());
+ }
+ }
+ }
+};
+typedef RegisterOperands<false> PhysRegOperands;
+typedef RegisterOperands<true> VirtRegOperands;
+
+/// Collect physical and virtual register operands.
+static void collectOperands(const MachineInstr *MI,
+ PhysRegOperands &PhysRegOpers,
+ VirtRegOperands &VirtRegOpers,
+ const TargetRegisterInfo *TRI,
+ const RegisterClassInfo *RCI) {
+ for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) {
+ const MachineOperand &MO = *OperI;
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ VirtRegOpers.collect(MO, TRI);
+ else if (RCI->isAllocatable(MO.getReg()))
+ PhysRegOpers.collect(MO, TRI);
+ }
+ // Remove redundant physreg dead defs.
+ for (unsigned i = PhysRegOpers.DeadDefs.size(); i > 0; --i) {
+ unsigned Reg = PhysRegOpers.DeadDefs[i-1];
+ if (findRegAlias(Reg, PhysRegOpers.Defs, TRI) != PhysRegOpers.Defs.end())
+ PhysRegOpers.DeadDefs.erase(&PhysRegOpers.DeadDefs[i-1]);
+ }
+}
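
collectOperands() buckets each register into at most one of Uses/Defs/DeadDefs, deduplicating within each vector. A stand-alone sketch with a plain struct in place of MachineOperand (all names invented):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Operand { int Reg; bool Reads, IsDef, IsDead; };

    static void push_unique(std::vector<int> &V, int Reg) {
      if (std::find(V.begin(), V.end(), Reg) == V.end())
        V.push_back(Reg);
    }

    int main() {
      std::vector<Operand> Ops = {{1, true, false, false},
                                  {1, true, false, false},  // duplicate use
                                  {2, false, true, true}};
      std::vector<int> Uses, Defs, DeadDefs;
      for (const Operand &MO : Ops) {
        if (MO.Reads) push_unique(Uses, MO.Reg);
        if (MO.IsDef) push_unique(MO.IsDead ? DeadDefs : Defs, MO.Reg);
      }
      std::printf("%zu use, %zu dead def\n", Uses.size(), DeadDefs.size());
    }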
+
+/// Force liveness of registers.
+void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) {
+ for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+ if (TargetRegisterInfo::isVirtualRegister(Regs[i])) {
+ if (LiveVirtRegs.insert(Regs[i]).second)
+ increaseVirtRegPressure(Regs[i]);
+ }
+ else {
+ if (!hasRegAlias(Regs[i], LivePhysRegs, TRI)) {
+ LivePhysRegs.insert(Regs[i]);
+ increasePhysRegPressure(Regs[i]);
+ }
+ }
+ }
+}
+
+/// Add PhysReg to the live in set and increase max pressure.
+void RegPressureTracker::discoverPhysLiveIn(unsigned Reg) {
+ assert(!LivePhysRegs.count(Reg) && "avoid bumping max pressure twice");
+ if (findRegAlias(Reg, P.LiveInRegs, TRI) != P.LiveInRegs.end())
+ return;
+
+ // At live in discovery, unconditionally increase the high water mark.
+ P.LiveInRegs.push_back(Reg);
+ P.increase(TRI->getMinimalPhysRegClass(Reg), TRI);
+}
+
+/// Add PhysReg to the live out set and increase max pressure.
+void RegPressureTracker::discoverPhysLiveOut(unsigned Reg) {
+ assert(!LivePhysRegs.count(Reg) && "avoid bumping max pressure twice");
+ if (findRegAlias(Reg, P.LiveOutRegs, TRI) != P.LiveOutRegs.end())
+ return;
+
+ // At live out discovery, unconditionally increase the high water mark.
+ P.LiveOutRegs.push_back(Reg);
+ P.increase(TRI->getMinimalPhysRegClass(Reg), TRI);
+}
+
+/// Add VirtReg to the live in set and increase max pressure.
+void RegPressureTracker::discoverVirtLiveIn(unsigned Reg) {
+ assert(!LiveVirtRegs.count(Reg) && "avoid bumping max pressure twice");
+ if (std::find(P.LiveInRegs.begin(), P.LiveInRegs.end(), Reg) !=
+ P.LiveInRegs.end())
+ return;
+
+ // At live in discovery, unconditionally increase the high water mark.
+ P.LiveInRegs.push_back(Reg);
+ P.increase(MRI->getRegClass(Reg), TRI);
+}
+
+/// Add VirtReg to the live out set and increase max pressure.
+void RegPressureTracker::discoverVirtLiveOut(unsigned Reg) {
+ assert(!LiveVirtRegs.count(Reg) && "avoid bumping max pressure twice");
+ if (std::find(P.LiveOutRegs.begin(), P.LiveOutRegs.end(), Reg) !=
+ P.LiveOutRegs.end())
+ return;
+
+ // At live out discovery, unconditionally increase the high water mark.
+ P.LiveOutRegs.push_back(Reg);
+ P.increase(MRI->getRegClass(Reg), TRI);
+}
+
+/// Recede across the previous instruction.
+bool RegPressureTracker::recede() {
+ // Check for the top of the analyzable region.
+ if (CurrPos == MBB->begin()) {
+ closeRegion();
+ return false;
+ }
+ if (!isBottomClosed())
+ closeBottom();
+
+ // Open the top of the region using block iterators.
+ if (!RequireIntervals && isTopClosed())
+ static_cast<RegionPressure&>(P).openTop(CurrPos);
+
+ // Find the previous instruction.
+ do
+ --CurrPos;
+ while (CurrPos != MBB->begin() && CurrPos->isDebugValue());
+
+ if (CurrPos->isDebugValue()) {
+ closeRegion();
+ return false;
+ }
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot();
+
+ // Open the top of the region using slot indexes.
+ if (RequireIntervals && isTopClosed())
+ static_cast<IntervalPressure&>(P).openTop(SlotIdx);
+
+ PhysRegOperands PhysRegOpers;
+ VirtRegOperands VirtRegOpers;
+ collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI);
+
+ // Boost pressure for all dead defs together.
+ increasePhysRegPressure(PhysRegOpers.DeadDefs);
+ increaseVirtRegPressure(VirtRegOpers.DeadDefs);
+ decreasePhysRegPressure(PhysRegOpers.DeadDefs);
+ decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
+
+ // Kill liveness at live defs.
+ // TODO: consider earlyclobbers?
+ for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) {
+ unsigned Reg = PhysRegOpers.Defs[i];
+ if (LivePhysRegs.erase(Reg))
+ decreasePhysRegPressure(Reg);
+ else
+ discoverPhysLiveOut(Reg);
+ }
+ for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) {
+ unsigned Reg = VirtRegOpers.Defs[i];
+ if (LiveVirtRegs.erase(Reg))
+ decreaseVirtRegPressure(Reg);
+ else
+ discoverVirtLiveOut(Reg);
+ }
+
+ // Generate liveness for uses.
+ for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
+ unsigned Reg = PhysRegOpers.Uses[i];
+ if (!hasRegAlias(Reg, LivePhysRegs, TRI)) {
+ increasePhysRegPressure(Reg);
+ LivePhysRegs.insert(Reg);
+ }
+ }
+ for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) {
+ unsigned Reg = VirtRegOpers.Uses[i];
+ if (!LiveVirtRegs.count(Reg)) {
+ // Adjust liveouts if LiveIntervals are available.
+ if (RequireIntervals) {
+ const LiveInterval *LI = &LIS->getInterval(Reg);
+ if (!LI->killedAt(SlotIdx))
+ discoverVirtLiveOut(Reg);
+ }
+ increaseVirtRegPressure(Reg);
+ LiveVirtRegs.insert(Reg);
+ }
+ }
+ return true;
+}
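
recede() is a backward liveness walk: defs kill liveness, uses generate it, and whatever is still live at the top of the region is live-in. A textbook version over a toy instruction stream (invented IR, no pressure accounting):

    #include <cstdio>
    #include <set>
    #include <vector>

    struct Inst { std::vector<int> Defs, Uses; };

    int main() {
      // r2 = r0 + r1 ; r3 = r2
      std::vector<Inst> Block = {{{2}, {0, 1}}, {{3}, {2}}};
      std::set<int> Live;                      // live-out is empty here

      for (auto I = Block.rbegin(); I != Block.rend(); ++I) {
        for (int D : I->Defs) Live.erase(D);   // kill at defs
        for (int U : I->Uses) Live.insert(U);  // gen at uses
      }
      std::printf("live-in:");
      for (int R : Live) std::printf(" r%d", R); // live-in: r0 r1
      std::printf("\n");
    }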
+
+/// Advance across the current instruction.
+bool RegPressureTracker::advance() {
+ // Check for the bottom of the analyzable region.
+ if (CurrPos == MBB->end()) {
+ closeRegion();
+ return false;
+ }
+ if (!isTopClosed())
+ closeTop();
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot();
+
+ // Open the bottom of the region using slot indexes.
+ if (isBottomClosed()) {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).openBottom(SlotIdx);
+ else
+ static_cast<RegionPressure&>(P).openBottom(CurrPos);
+ }
+
+ PhysRegOperands PhysRegOpers;
+ VirtRegOperands VirtRegOpers;
+ collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI);
+
+ // Kill liveness at last uses.
+ for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
+ unsigned Reg = PhysRegOpers.Uses[i];
+ if (!hasRegAlias(Reg, LivePhysRegs, TRI))
+ discoverPhysLiveIn(Reg);
+ else {
+ // Allocatable physregs are always single-use before regalloc.
+ decreasePhysRegPressure(Reg);
+ LivePhysRegs.erase(Reg);
+ }
+ }
+ for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) {
+ unsigned Reg = VirtRegOpers.Uses[i];
+ if (RequireIntervals) {
+ const LiveInterval *LI = &LIS->getInterval(Reg);
+ if (LI->killedAt(SlotIdx)) {
+ if (LiveVirtRegs.erase(Reg))
+ decreaseVirtRegPressure(Reg);
+ else
+ discoverVirtLiveIn(Reg);
+ }
+ }
+ else if (!LiveVirtRegs.count(Reg)) {
+ discoverVirtLiveIn(Reg);
+ increaseVirtRegPressure(Reg);
+ }
+ }
+
+ // Generate liveness for defs.
+ for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) {
+ unsigned Reg = PhysRegOpers.Defs[i];
+ if (!hasRegAlias(Reg, LivePhysRegs, TRI)) {
+ increasePhysRegPressure(Reg);
+ LivePhysRegs.insert(Reg);
+ }
+ }
+ for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) {
+ unsigned Reg = VirtRegOpers.Defs[i];
+ if (LiveVirtRegs.insert(Reg).second)
+ increaseVirtRegPressure(Reg);
+ }
+
+ // Boost pressure for all dead defs together.
+ increasePhysRegPressure(PhysRegOpers.DeadDefs);
+ increaseVirtRegPressure(VirtRegOpers.DeadDefs);
+ decreasePhysRegPressure(PhysRegOpers.DeadDefs);
+ decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
+
+ // Find the next instruction.
+ do
+ ++CurrPos;
+ while (CurrPos != MBB->end() && CurrPos->isDebugValue());
+ return true;
+}
+
+/// Find the max change in excess pressure across all sets.
+static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
+ ArrayRef<unsigned> NewPressureVec,
+ RegPressureDelta &Delta,
+ const TargetRegisterInfo *TRI) {
+ int ExcessUnits = 0;
+ unsigned PSetID = ~0U;
+ for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) {
+ unsigned POld = OldPressureVec[i];
+ unsigned PNew = NewPressureVec[i];
+ int PDiff = (int)PNew - (int)POld;
+ if (!PDiff) // No change in this set in the common case.
+ continue;
+ // Only consider change beyond the limit.
+ unsigned Limit = TRI->getRegPressureSetLimit(i);
+ if (Limit > POld) {
+ if (Limit > PNew)
+ PDiff = 0; // Under the limit
+ else
+ PDiff = PNew - Limit; // Just exceeded limit.
+ }
+ else if (Limit > PNew)
+ PDiff = Limit - POld; // Just obeyed limit.
+
+ if (std::abs(PDiff) > std::abs(ExcessUnits)) {
+ ExcessUnits = PDiff;
+ PSetID = i;
+ }
+ }
+ Delta.Excess.PSetID = PSetID;
+ Delta.Excess.UnitIncrease = ExcessUnits;
+}
+
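As a worked example, take a pressure set with Limit = 10. Going from POld = 8
to PNew = 12 crosses the limit, so PDiff is clamped to PNew - Limit = 2;
going from POld = 12 to PNew = 8 drops back under it, so PDiff = Limit - POld
= -2; going from POld = 12 to PNew = 14 stays above it, so the raw difference
of 2 is kept. The set with the largest |PDiff| after clamping is recorded in
Delta.Excess.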
+/// Find the max change in max pressure that either surpasses a critical PSet
+/// limit or exceeds the current MaxPressureLimit.
+///
+/// FIXME: comparing each element of the old and new MaxPressure vectors here is
+/// silly. It's done now to demonstrate the concept but will go away with a
+/// RegPressureTracker API change to work with pressure differences.
+static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
+ ArrayRef<unsigned> NewMaxPressureVec,
+ ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit,
+ RegPressureDelta &Delta) {
+ Delta.CriticalMax = PressureElement();
+ Delta.CurrentMax = PressureElement();
+
+ unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+ for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) {
+ unsigned POld = OldMaxPressureVec[i];
+ unsigned PNew = NewMaxPressureVec[i];
+ if (PNew == POld) // No change in this set in the common case.
+ continue;
+
+ while (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID < i)
+ ++CritIdx;
+
+ if (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID == i) {
+ int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].UnitIncrease;
+ if (PDiff > Delta.CriticalMax.UnitIncrease) {
+ Delta.CriticalMax.PSetID = i;
+ Delta.CriticalMax.UnitIncrease = PDiff;
+ }
+ }
+
+ // Find the greatest increase above MaxPressureLimit.
+ // (Ignores negative MDiff).
+ int MDiff = (int)PNew - (int)MaxPressureLimit[i];
+ if (MDiff > Delta.CurrentMax.UnitIncrease) {
+ Delta.CurrentMax.PSetID = i;
+ Delta.CurrentMax.UnitIncrease = MDiff;
+ }
+ }
+}
+
+/// Record the upward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
+ // Account for register pressure similar to RegPressureTracker::recede().
+ PhysRegOperands PhysRegOpers;
+ VirtRegOperands VirtRegOpers;
+ collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI);
+
+ // Boost max pressure for all dead defs together.
+ // Since CurrSetPressure and MaxSetPressure are updated together, the
+ // increase/decrease pair below raises MaxSetPressure for the dead defs
+ // while leaving CurrSetPressure unchanged.
+ increasePhysRegPressure(PhysRegOpers.DeadDefs);
+ increaseVirtRegPressure(VirtRegOpers.DeadDefs);
+ decreasePhysRegPressure(PhysRegOpers.DeadDefs);
+ decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
+
+ // Kill liveness at live defs.
+ decreasePhysRegPressure(PhysRegOpers.Defs);
+ decreaseVirtRegPressure(VirtRegOpers.Defs);
+
+ // Generate liveness for uses.
+ for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
+ unsigned Reg = PhysRegOpers.Uses[i];
+ if (!hasRegAlias(Reg, LivePhysRegs, TRI))
+ increasePhysRegPressure(Reg);
+ }
+ for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) {
+ unsigned Reg = VirtRegOpers.Uses[i];
+ if (!LiveVirtRegs.count(Reg))
+ increaseVirtRegPressure(Reg);
+ }
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// bottom-up. Find the pressure set with the most change beyond its pressure
+/// limit based on the tracker's current pressure, and return the change in
+/// number of register units of that pressure set introduced by this
+/// instruction.
+///
+/// This assumes that the current LiveOut set is sufficient.
+///
+/// FIXME: This is expensive for an on-the-fly query. We need to cache the
+/// result per-SUnit with enough information to adjust for the current
+/// scheduling position. But this works as a proof of concept.
+void RegPressureTracker::
+getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+ ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) {
+ // Snapshot Pressure.
+ // FIXME: The snapshot heap space should persist. But I'm planning to
+ // summarize the pressure effect so we don't need to snapshot at all.
+ std::vector<unsigned> SavedPressure = CurrSetPressure;
+ std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+ bumpUpwardPressure(MI);
+
+ computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+ computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+ MaxPressureLimit, Delta);
+ assert(Delta.CriticalMax.UnitIncrease >= 0 &&
+ Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+
+ // Restore the tracker's state.
+ P.MaxSetPressure.swap(SavedMaxPressure);
+ CurrSetPressure.swap(SavedPressure);
+}
+
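In a scheduler this enables comparing candidate instructions without
committing tracker state. A hedged sketch (the Cand object and the policy
applied to the result are illustrative, not part of this API):

    RegPressureDelta Delta;
    Tracker.getMaxUpwardPressureDelta(Cand->getInstr(), Delta,
                                      CriticalPSets, MaxPressureLimit);
    if (Delta.Excess.UnitIncrease > 0) {
      // Scheduling this candidate would push some pressure set past its
      // limit; a client might deprioritize it in favor of another.
    }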
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+static bool findUseBetween(unsigned Reg,
+ SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+ const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
+ for (MachineRegisterInfo::use_nodbg_iterator
+ UI = MRI->use_nodbg_begin(Reg), UE = MRI->use_nodbg_end();
+ UI != UE; UI.skipInstruction()) {
+ const MachineInstr* MI = &*UI;
+ SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+ if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx)
+ return true;
+ }
+ return false;
+}
+
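For example, if Reg has uses at slots 4 and 12, findUseBetween(Reg, 8, 16,
MRI, LIS) returns true because the use at slot 12 falls in the half-open
range [8, 16), while findUseBetween(Reg, 5, 11, MRI, LIS) returns false
because neither use does.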
+/// Record the downward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
+ // Account for register pressure similar to RegPressureTracker::recede().
+ PhysRegOperands PhysRegOpers;
+ VirtRegOperands VirtRegOpers;
+ collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI);
+
+ // Kill liveness at last uses. Assume allocatable physregs are single-use
+ // rather than checking LiveIntervals.
+ decreasePhysRegPressure(PhysRegOpers.Uses);
+ if (RequireIntervals) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(MI).getRegSlot();
+ SlotIndex CurrIdx = LIS->getInstructionIndex(CurrPos).getRegSlot();
+ // FIXME: allow the caller to pass in the list of vreg uses that remain
+ // to be bottom-scheduled to avoid searching uses at each query.
+ for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) {
+ unsigned Reg = VirtRegOpers.Uses[i];
+ const LiveInterval *LI = &LIS->getInterval(Reg);
+ if (LI->killedAt(SlotIdx)
+ && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
+ decreaseVirtRegPressure(Reg);
+ }
+ }
+ }
+
+ // Generate liveness for defs.
+ increasePhysRegPressure(PhysRegOpers.Defs);
+ increaseVirtRegPressure(VirtRegOpers.Defs);
+
+ // Boost pressure for all dead defs together.
+ increasePhysRegPressure(PhysRegOpers.DeadDefs);
+ increaseVirtRegPressure(VirtRegOpers.DeadDefs);
+ decreasePhysRegPressure(PhysRegOpers.DeadDefs);
+ decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// top-down. Find the pressure set with the most change beyond its pressure
+/// limit based on the tracker's current pressure, and return the number of
+/// excess register units of that set introduced by this instruction.
+///
+/// This assumes that the current LiveIn set is sufficient.
+void RegPressureTracker::
+getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+ ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) {
+ // Snapshot Pressure.
+ std::vector<unsigned> SavedPressure = CurrSetPressure;
+ std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+ bumpDownwardPressure(MI);
+
+ computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+ computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+ MaxPressureLimit, Delta);
+ assert(Delta.CriticalMax.UnitIncrease >= 0 &&
+ Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+
+ // Restore the tracker's state.
+ P.MaxSetPressure.swap(SavedMaxPressure);
+ CurrSetPressure.swap(SavedPressure);
+}
+
+/// Get the pressure of each PSet after traversing this instruction bottom-up.
+void RegPressureTracker::
+getUpwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult) {
+ // Snapshot pressure.
+ PressureResult = CurrSetPressure;
+ MaxPressureResult = P.MaxSetPressure;
+
+ bumpUpwardPressure(MI);
+
+ // Current pressure becomes the result. Restore current pressure.
+ P.MaxSetPressure.swap(MaxPressureResult);
+ CurrSetPressure.swap(PressureResult);
+}
+
+/// Get the pressure of each PSet after traversing this instruction top-down.
+void RegPressureTracker::
+getDownwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult) {
+ // Snapshot pressure.
+ PressureResult = CurrSetPressure;
+ MaxPressureResult = P.MaxSetPressure;
+
+ bumpDownwardPressure(MI);
+
+ // Current pressure becomes the result. Restore current pressure.
+ P.MaxSetPressure.swap(MaxPressureResult);
+ CurrSetPressure.swap(PressureResult);
+}
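Both getters follow the same snapshot-and-swap pattern as the delta queries:
copy the pressure vectors, bump, then swap back, so the out-parameters hold
the post-instruction values while the tracker itself is left untouched.
Usage is direct:

    std::vector<unsigned> P, MaxP;
    Tracker.getUpwardPressure(MI, P, MaxP);
    // P[i] / MaxP[i] now reflect pressure set i as if MI had been receded
    // over; the tracker's own CurrSetPressure is unchanged.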
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 03bd82e..d673794 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -37,16 +37,13 @@ using namespace llvm;
void RegScavenger::setUsed(unsigned Reg) {
RegsAvailable.reset(Reg);
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs)
- RegsAvailable.reset(SubReg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ RegsAvailable.reset(*SubRegs);
}
bool RegScavenger::isAliasUsed(unsigned Reg) const {
- if (isUsed(Reg))
- return true;
- for (const uint16_t *R = TRI->getAliasSet(Reg); *R; ++R)
- if (isUsed(*R))
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ if (isUsed(*AI))
return true;
return false;
}
@@ -114,8 +111,8 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) {
BV.set(Reg);
- for (const uint16_t *R = TRI->getSubRegisters(Reg); *R; R++)
- BV.set(*R);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ BV.set(*SubRegs);
}
void RegScavenger::forward() {
@@ -195,9 +192,8 @@ void RegScavenger::forward() {
// Ideally we would like a way to model this, but leaving the
// insert_subreg around causes both correctness and performance issues.
bool SubUsed = false;
- for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg);
- unsigned SubReg = *SubRegs; ++SubRegs)
- if (isUsed(SubReg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ if (isUsed(*SubRegs)) {
SubUsed = true;
break;
}
@@ -296,9 +292,8 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
isVirtKillInsn = true;
continue;
}
- Candidates.reset(MO.getReg());
- for (const uint16_t *R = TRI->getAliasSet(MO.getReg()); *R; R++)
- Candidates.reset(*R);
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
+ Candidates.reset(*AI);
}
// If we're not in a virtual reg's live range, this is a valid
// restore point.
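Every hunk in this file applies the same migration: the null-terminated
uint16_t sub-register and alias arrays give way to the MC iterators. The
idiom in isolation, assuming a valid Reg and TRI (the counting loop itself
is illustrative):

    unsigned Overlaps = 0;
    for (MCRegAliasIterator AI(Reg, TRI, /*IncludeSelf=*/true);
         AI.isValid(); ++AI)
      ++Overlaps; // counts Reg itself plus every register aliasing it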
diff --git a/lib/CodeGen/RenderMachineFunction.cpp b/lib/CodeGen/RenderMachineFunction.cpp
deleted file mode 100644
index 6020908..0000000
--- a/lib/CodeGen/RenderMachineFunction.cpp
+++ /dev/null
@@ -1,1013 +0,0 @@
-//===-- llvm/CodeGen/RenderMachineFunction.cpp - MF->HTML -----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "rendermf"
-
-#include "RenderMachineFunction.h"
-
-#include "VirtRegMap.h"
-
-#include "llvm/Function.h"
-#include "llvm/Module.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-#include <sstream>
-
-using namespace llvm;
-
-char RenderMachineFunction::ID = 0;
-INITIALIZE_PASS_BEGIN(RenderMachineFunction, "rendermf",
- "Render machine functions (and related info) to HTML pages",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(RenderMachineFunction, "rendermf",
- "Render machine functions (and related info) to HTML pages",
- false, false)
-
-static cl::opt<std::string>
-outputFileSuffix("rmf-file-suffix",
- cl::desc("Appended to function name to get output file name "
- "(default: \".html\")"),
- cl::init(".html"), cl::Hidden);
-
-static cl::opt<std::string>
-machineFuncsToRender("rmf-funcs",
- cl::desc("Comma separated list of functions to render"
- ", or \"*\"."),
- cl::init(""), cl::Hidden);
-
-static cl::opt<std::string>
-pressureClasses("rmf-classes",
- cl::desc("Register classes to render pressure for."),
- cl::init(""), cl::Hidden);
-
-static cl::opt<std::string>
-showIntervals("rmf-intervals",
- cl::desc("Live intervals to show alongside code."),
- cl::init(""), cl::Hidden);
-
-static cl::opt<bool>
-filterEmpty("rmf-filter-empty-intervals",
- cl::desc("Don't display empty intervals."),
- cl::init(true), cl::Hidden);
-
-static cl::opt<bool>
-showEmptyIndexes("rmf-empty-indexes",
- cl::desc("Render indexes not associated with instructions or "
- "MBB starts."),
- cl::init(false), cl::Hidden);
-
-static cl::opt<bool>
-useFancyVerticals("rmf-fancy-verts",
- cl::desc("Use SVG for vertical text."),
- cl::init(true), cl::Hidden);
-
-static cl::opt<bool>
-prettyHTML("rmf-pretty-html",
- cl::desc("Pretty print HTML. For debugging the renderer only.."),
- cl::init(false), cl::Hidden);
-
-
-namespace llvm {
-
- bool MFRenderingOptions::renderingOptionsProcessed;
- std::set<std::string> MFRenderingOptions::mfNamesToRender;
- bool MFRenderingOptions::renderAllMFs = false;
-
- std::set<std::string> MFRenderingOptions::classNamesToRender;
- bool MFRenderingOptions::renderAllClasses = false;
-
- std::set<std::pair<unsigned, unsigned> >
- MFRenderingOptions::intervalNumsToRender;
- unsigned MFRenderingOptions::intervalTypesToRender = ExplicitOnly;
-
- template <typename OutputItr>
- void MFRenderingOptions::splitComaSeperatedList(const std::string &s,
- OutputItr outItr) {
- std::string::const_iterator curPos = s.begin();
- std::string::const_iterator nextComa = std::find(curPos, s.end(), ',');
- while (nextComa != s.end()) {
- std::string elem;
- std::copy(curPos, nextComa, std::back_inserter(elem));
- *outItr = elem;
- ++outItr;
- curPos = llvm::next(nextComa);
- nextComa = std::find(curPos, s.end(), ',');
- }
-
- if (curPos != s.end()) {
- std::string elem;
- std::copy(curPos, s.end(), std::back_inserter(elem));
- *outItr = elem;
- ++outItr;
- }
- }
-
- void MFRenderingOptions::processOptions() {
- if (!renderingOptionsProcessed) {
- processFuncNames();
- processRegClassNames();
- processIntervalNumbers();
- renderingOptionsProcessed = true;
- }
- }
-
- void MFRenderingOptions::processFuncNames() {
- if (machineFuncsToRender == "*") {
- renderAllMFs = true;
- } else {
- splitComaSeperatedList(machineFuncsToRender,
- std::inserter(mfNamesToRender,
- mfNamesToRender.begin()));
- }
- }
-
- void MFRenderingOptions::processRegClassNames() {
- if (pressureClasses == "*") {
- renderAllClasses = true;
- } else {
- splitComaSeperatedList(pressureClasses,
- std::inserter(classNamesToRender,
- classNamesToRender.begin()));
- }
- }
-
- void MFRenderingOptions::processIntervalNumbers() {
- std::set<std::string> intervalRanges;
- splitComaSeperatedList(showIntervals,
- std::inserter(intervalRanges,
- intervalRanges.begin()));
- std::for_each(intervalRanges.begin(), intervalRanges.end(),
- processIntervalRange);
- }
-
- void MFRenderingOptions::processIntervalRange(
- const std::string &intervalRangeStr) {
- if (intervalRangeStr == "*") {
- intervalTypesToRender |= All;
- } else if (intervalRangeStr == "virt-nospills*") {
- intervalTypesToRender |= VirtNoSpills;
- } else if (intervalRangeStr == "spills*") {
- intervalTypesToRender |= VirtSpills;
- } else if (intervalRangeStr == "virt*") {
- intervalTypesToRender |= AllVirt;
- } else if (intervalRangeStr == "phys*") {
- intervalTypesToRender |= AllPhys;
- } else {
- std::istringstream iss(intervalRangeStr);
- unsigned reg1, reg2;
- if ((iss >> reg1 >> std::ws)) {
- if (iss.eof()) {
- intervalNumsToRender.insert(std::make_pair(reg1, reg1 + 1));
- } else {
- char c;
- iss >> c;
- if (c == '-' && (iss >> reg2)) {
- intervalNumsToRender.insert(std::make_pair(reg1, reg2 + 1));
- } else {
- dbgs() << "Warning: Invalid interval range \""
- << intervalRangeStr << "\" in -rmf-intervals. Skipping.\n";
- }
- }
- } else {
- dbgs() << "Warning: Invalid interval number \""
- << intervalRangeStr << "\" in -rmf-intervals. Skipping.\n";
- }
- }
- }
-
- void MFRenderingOptions::setup(MachineFunction *mf,
- const TargetRegisterInfo *tri,
- LiveIntervals *lis,
- const RenderMachineFunction *rmf) {
- this->mf = mf;
- this->tri = tri;
- this->lis = lis;
- this->rmf = rmf;
-
- clear();
- }
-
- void MFRenderingOptions::clear() {
- regClassesTranslatedToCurrentFunction = false;
- regClassSet.clear();
-
- intervalsTranslatedToCurrentFunction = false;
- intervalSet.clear();
- }
-
- void MFRenderingOptions::resetRenderSpecificOptions() {
- intervalSet.clear();
- intervalsTranslatedToCurrentFunction = false;
- }
-
- bool MFRenderingOptions::shouldRenderCurrentMachineFunction() const {
- processOptions();
-
- return (renderAllMFs ||
- mfNamesToRender.find(mf->getFunction()->getName()) !=
- mfNamesToRender.end());
- }
-
- const MFRenderingOptions::RegClassSet& MFRenderingOptions::regClasses() const{
- translateRegClassNamesToCurrentFunction();
- return regClassSet;
- }
-
- const MFRenderingOptions::IntervalSet& MFRenderingOptions::intervals() const {
- translateIntervalNumbersToCurrentFunction();
- return intervalSet;
- }
-
- bool MFRenderingOptions::renderEmptyIndexes() const {
- return showEmptyIndexes;
- }
-
- bool MFRenderingOptions::fancyVerticals() const {
- return useFancyVerticals;
- }
-
- void MFRenderingOptions::translateRegClassNamesToCurrentFunction() const {
- if (!regClassesTranslatedToCurrentFunction) {
- processOptions();
- for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(),
- rcEnd = tri->regclass_end();
- rcItr != rcEnd; ++rcItr) {
- const TargetRegisterClass *trc = *rcItr;
- if (renderAllClasses ||
- classNamesToRender.find(trc->getName()) !=
- classNamesToRender.end()) {
- regClassSet.insert(trc);
- }
- }
- regClassesTranslatedToCurrentFunction = true;
- }
- }
-
- void MFRenderingOptions::translateIntervalNumbersToCurrentFunction() const {
- if (!intervalsTranslatedToCurrentFunction) {
- processOptions();
-
- // If we're not just doing explicit then do a copy over all matching
- // types.
- if (intervalTypesToRender != ExplicitOnly) {
- for (LiveIntervals::iterator liItr = lis->begin(), liEnd = lis->end();
- liItr != liEnd; ++liItr) {
- LiveInterval *li = liItr->second;
-
- if (filterEmpty && li->empty())
- continue;
-
- if ((TargetRegisterInfo::isPhysicalRegister(li->reg) &&
- (intervalTypesToRender & AllPhys))) {
- intervalSet.insert(li);
- } else if (TargetRegisterInfo::isVirtualRegister(li->reg)) {
- if (((intervalTypesToRender & VirtNoSpills) && !rmf->isSpill(li)) ||
- ((intervalTypesToRender & VirtSpills) && rmf->isSpill(li))) {
- intervalSet.insert(li);
- }
- }
- }
- }
-
- // If we need to process the explicit list...
- if (intervalTypesToRender != All) {
- for (std::set<std::pair<unsigned, unsigned> >::const_iterator
- regRangeItr = intervalNumsToRender.begin(),
- regRangeEnd = intervalNumsToRender.end();
- regRangeItr != regRangeEnd; ++regRangeItr) {
- const std::pair<unsigned, unsigned> &range = *regRangeItr;
- for (unsigned reg = range.first; reg != range.second; ++reg) {
- if (lis->hasInterval(reg)) {
- intervalSet.insert(&lis->getInterval(reg));
- }
- }
- }
- }
-
- intervalsTranslatedToCurrentFunction = true;
- }
- }
-
- // ---------- TargetRegisterExtraInformation implementation ----------
-
- TargetRegisterExtraInfo::TargetRegisterExtraInfo()
- : mapsPopulated(false) {
- }
-
- void TargetRegisterExtraInfo::setup(MachineFunction *mf,
- MachineRegisterInfo *mri,
- const TargetRegisterInfo *tri,
- LiveIntervals *lis) {
- this->mf = mf;
- this->mri = mri;
- this->tri = tri;
- this->lis = lis;
- }
-
- void TargetRegisterExtraInfo::reset() {
- if (!mapsPopulated) {
- initWorst();
- //initBounds();
- initCapacity();
- mapsPopulated = true;
- }
-
- resetPressureAndLiveStates();
- }
-
- void TargetRegisterExtraInfo::clear() {
- prWorst.clear();
- vrWorst.clear();
- capacityMap.clear();
- pressureMap.clear();
- //liveStatesMap.clear();
- mapsPopulated = false;
- }
-
- void TargetRegisterExtraInfo::initWorst() {
- assert(!mapsPopulated && prWorst.empty() && vrWorst.empty() &&
- "Worst map already initialised?");
-
- // Start with the physical registers.
- for (unsigned preg = 1; preg < tri->getNumRegs(); ++preg) {
- WorstMapLine &pregLine = prWorst[preg];
-
- for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(),
- rcEnd = tri->regclass_end();
- rcItr != rcEnd; ++rcItr) {
- const TargetRegisterClass *trc = *rcItr;
-
- unsigned numOverlaps = 0;
- for (TargetRegisterClass::iterator rItr = trc->begin(),
- rEnd = trc->end();
- rItr != rEnd; ++rItr) {
- unsigned trcPReg = *rItr;
- if (tri->regsOverlap(preg, trcPReg))
- ++numOverlaps;
- }
-
- pregLine[trc] = numOverlaps;
- }
- }
-
- // Now the register classes.
- for (TargetRegisterInfo::regclass_iterator rc1Itr = tri->regclass_begin(),
- rcEnd = tri->regclass_end();
- rc1Itr != rcEnd; ++rc1Itr) {
- const TargetRegisterClass *trc1 = *rc1Itr;
- WorstMapLine &classLine = vrWorst[trc1];
-
- for (TargetRegisterInfo::regclass_iterator rc2Itr = tri->regclass_begin();
- rc2Itr != rcEnd; ++rc2Itr) {
- const TargetRegisterClass *trc2 = *rc2Itr;
-
- unsigned worst = 0;
-
- for (TargetRegisterClass::iterator trc1Itr = trc1->begin(),
- trc1End = trc1->end();
- trc1Itr != trc1End; ++trc1Itr) {
- unsigned trc1Reg = *trc1Itr;
- unsigned trc1RegWorst = 0;
-
- for (TargetRegisterClass::iterator trc2Itr = trc2->begin(),
- trc2End = trc2->end();
- trc2Itr != trc2End; ++trc2Itr) {
- unsigned trc2Reg = *trc2Itr;
- if (tri->regsOverlap(trc1Reg, trc2Reg))
- ++trc1RegWorst;
- }
- if (trc1RegWorst > worst) {
- worst = trc1RegWorst;
- }
- }
-
- if (worst != 0) {
- classLine[trc2] = worst;
- }
- }
- }
- }
-
- unsigned TargetRegisterExtraInfo::getWorst(
- unsigned reg,
- const TargetRegisterClass *trc) const {
- const WorstMapLine *wml = 0;
- if (TargetRegisterInfo::isPhysicalRegister(reg)) {
- PRWorstMap::const_iterator prwItr = prWorst.find(reg);
- assert(prwItr != prWorst.end() && "Missing prWorst entry.");
- wml = &prwItr->second;
- } else {
- const TargetRegisterClass *regTRC = mri->getRegClass(reg);
- VRWorstMap::const_iterator vrwItr = vrWorst.find(regTRC);
- assert(vrwItr != vrWorst.end() && "Missing vrWorst entry.");
- wml = &vrwItr->second;
- }
-
- WorstMapLine::const_iterator wmlItr = wml->find(trc);
- if (wmlItr == wml->end())
- return 0;
-
- return wmlItr->second;
- }
-
- void TargetRegisterExtraInfo::initCapacity() {
- assert(!mapsPopulated && capacityMap.empty() &&
- "Capacity map already initialised?");
-
- for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(),
- rcEnd = tri->regclass_end();
- rcItr != rcEnd; ++rcItr) {
- const TargetRegisterClass *trc = *rcItr;
- unsigned capacity = trc->getRawAllocationOrder(*mf).size();
-
- if (capacity != 0)
- capacityMap[trc] = capacity;
- }
- }
-
- unsigned TargetRegisterExtraInfo::getCapacity(
- const TargetRegisterClass *trc) const {
- CapacityMap::const_iterator cmItr = capacityMap.find(trc);
- assert(cmItr != capacityMap.end() &&
- "vreg with unallocable register class");
- return cmItr->second;
- }
-
- void TargetRegisterExtraInfo::resetPressureAndLiveStates() {
- pressureMap.clear();
- //liveStatesMap.clear();
-
- // Iterate over all slots.
-
-
- // Iterate over all live intervals.
- for (LiveIntervals::iterator liItr = lis->begin(),
- liEnd = lis->end();
- liItr != liEnd; ++liItr) {
- LiveInterval *li = liItr->second;
-
- if (TargetRegisterInfo::isPhysicalRegister(li->reg))
- continue;
-
- // For all ranges in the current interval.
- for (LiveInterval::iterator lrItr = li->begin(),
- lrEnd = li->end();
- lrItr != lrEnd; ++lrItr) {
- LiveRange *lr = &*lrItr;
-
- // For all slots in the current range.
- for (SlotIndex i = lr->start; i != lr->end; i = i.getNextSlot()) {
-
- // Record increased pressure at index for all overlapping classes.
- for (TargetRegisterInfo::regclass_iterator
- rcItr = tri->regclass_begin(),
- rcEnd = tri->regclass_end();
- rcItr != rcEnd; ++rcItr) {
- const TargetRegisterClass *trc = *rcItr;
-
- if (trc->getRawAllocationOrder(*mf).empty())
- continue;
-
- unsigned worstAtI = getWorst(li->reg, trc);
-
- if (worstAtI != 0) {
- pressureMap[i][trc] += worstAtI;
- }
- }
- }
- }
- }
- }
-
- unsigned TargetRegisterExtraInfo::getPressureAtSlot(
- const TargetRegisterClass *trc,
- SlotIndex i) const {
- PressureMap::const_iterator pmItr = pressureMap.find(i);
- if (pmItr == pressureMap.end())
- return 0;
- const PressureMapLine &pmLine = pmItr->second;
- PressureMapLine::const_iterator pmlItr = pmLine.find(trc);
- if (pmlItr == pmLine.end())
- return 0;
- return pmlItr->second;
- }
-
- bool TargetRegisterExtraInfo::classOverCapacityAtSlot(
- const TargetRegisterClass *trc,
- SlotIndex i) const {
- return (getPressureAtSlot(trc, i) > getCapacity(trc));
- }
-
- // ---------- MachineFunctionRenderer implementation ----------
-
- void RenderMachineFunction::Spacer::print(raw_ostream &os) const {
- if (!prettyHTML)
- return;
- for (unsigned i = 0; i < ns; ++i) {
- os << " ";
- }
- }
-
- RenderMachineFunction::Spacer RenderMachineFunction::s(unsigned ns) const {
- return Spacer(ns);
- }
-
- raw_ostream& operator<<(raw_ostream &os, const RenderMachineFunction::Spacer &s) {
- s.print(os);
- return os;
- }
-
- template <typename Iterator>
- std::string RenderMachineFunction::escapeChars(Iterator sBegin, Iterator sEnd) const {
- std::string r;
-
- for (Iterator sItr = sBegin; sItr != sEnd; ++sItr) {
- char c = *sItr;
-
- switch (c) {
- case '<': r.append("&lt;"); break;
- case '>': r.append("&gt;"); break;
- case '&': r.append("&amp;"); break;
- case ' ': r.append("&nbsp;"); break;
- case '\"': r.append("&quot;"); break;
- default: r.push_back(c); break;
- }
- }
-
- return r;
- }
-
- RenderMachineFunction::LiveState
- RenderMachineFunction::getLiveStateAt(const LiveInterval *li,
- SlotIndex i) const {
- const MachineInstr *mi = sis->getInstructionFromIndex(i);
-
- // For uses/defs recorded use/def indexes override current liveness and
- // instruction operands (Only for the interval which records the indexes).
- // FIXME: This is all wrong, uses and defs share the same slots.
- if (i.isEarlyClobber() || i.isRegister()) {
- UseDefs::const_iterator udItr = useDefs.find(li);
- if (udItr != useDefs.end()) {
- const SlotSet &slotSet = udItr->second;
- if (slotSet.count(i)) {
- if (i.isEarlyClobber()) {
- return Used;
- }
- // else
- return Defined;
- }
- }
- }
-
- // If the slot is a load/store, or there's no info in the use/def set then
- // use liveness and instruction operand info.
- if (li->liveAt(i)) {
-
- if (mi == 0) {
- if (vrm == 0 ||
- (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) {
- return AliveReg;
- } else {
- return AliveStack;
- }
- } else {
- if (i.isRegister() && mi->definesRegister(li->reg, tri)) {
- return Defined;
- } else if (i.isEarlyClobber() && mi->readsRegister(li->reg)) {
- return Used;
- } else {
- if (vrm == 0 ||
- (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) {
- return AliveReg;
- } else {
- return AliveStack;
- }
- }
- }
- }
- return Dead;
- }
-
- RenderMachineFunction::PressureState
- RenderMachineFunction::getPressureStateAt(const TargetRegisterClass *trc,
- SlotIndex i) const {
- if (trei.getPressureAtSlot(trc, i) == 0) {
- return Zero;
- } else if (trei.classOverCapacityAtSlot(trc, i)){
- return High;
- }
- return Low;
- }
-
- /// \brief Render a machine instruction.
- void RenderMachineFunction::renderMachineInstr(raw_ostream &os,
- const MachineInstr *mi) const {
- std::string s;
- raw_string_ostream oss(s);
- oss << *mi;
-
- os << escapeChars(oss.str());
- }
-
- template <typename T>
- void RenderMachineFunction::renderVertical(const Spacer &indent,
- raw_ostream &os,
- const T &t) const {
- if (ro.fancyVerticals()) {
- os << indent << "<object\n"
- << indent + s(2) << "class=\"obj\"\n"
- << indent + s(2) << "type=\"image/svg+xml\"\n"
- << indent + s(2) << "width=\"14px\"\n"
- << indent + s(2) << "height=\"55px\"\n"
- << indent + s(2) << "data=\"data:image/svg+xml,\n"
- << indent + s(4) << "<svg xmlns='http://www.w3.org/2000/svg'>\n"
- << indent + s(6) << "<text x='-55' y='10' "
- "font-family='Courier' font-size='12' "
- "transform='rotate(-90)' "
- "text-rendering='optimizeSpeed' "
- "fill='#000'>" << t << "</text>\n"
- << indent + s(4) << "</svg>\">\n"
- << indent << "</object>\n";
- } else {
- std::ostringstream oss;
- oss << t;
- std::string tStr(oss.str());
-
- os << indent;
- for (std::string::iterator tStrItr = tStr.begin(), tStrEnd = tStr.end();
- tStrItr != tStrEnd; ++tStrItr) {
- os << *tStrItr << "<br/>";
- }
- os << "\n";
- }
- }
-
- void RenderMachineFunction::insertCSS(const Spacer &indent,
- raw_ostream &os) const {
- os << indent << "<style type=\"text/css\">\n"
- << indent + s(2) << "body { font-color: black; }\n"
- << indent + s(2) << "table.code td { font-family: monospace; "
- "border-width: 0px; border-style: solid; "
- "border-bottom: 1px solid #dddddd; white-space: nowrap; }\n"
- << indent + s(2) << "table.code td.p-z { background-color: #000000; }\n"
- << indent + s(2) << "table.code td.p-l { background-color: #00ff00; }\n"
- << indent + s(2) << "table.code td.p-h { background-color: #ff0000; }\n"
- << indent + s(2) << "table.code td.l-n { background-color: #ffffff; }\n"
- << indent + s(2) << "table.code td.l-d { background-color: #ff0000; }\n"
- << indent + s(2) << "table.code td.l-u { background-color: #ffff00; }\n"
- << indent + s(2) << "table.code td.l-r { background-color: #000000; }\n"
- << indent + s(2) << "table.code td.l-s { background-color: #770000; }\n"
- << indent + s(2) << "table.code th { border-width: 0px; "
- "border-style: solid; }\n"
- << indent << "</style>\n";
- }
-
- void RenderMachineFunction::renderFunctionSummary(
- const Spacer &indent, raw_ostream &os,
- const char * const renderContextStr) const {
- os << indent << "<h1>Function: " << mf->getFunction()->getName()
- << "</h1>\n"
- << indent << "<h2>Rendering context: " << renderContextStr << "</h2>\n";
- }
-
-
- void RenderMachineFunction::renderPressureTableLegend(
- const Spacer &indent,
- raw_ostream &os) const {
- os << indent << "<h2>Rendering Pressure Legend:</h2>\n"
- << indent << "<table class=\"code\">\n"
- << indent + s(2) << "<tr>\n"
- << indent + s(4) << "<th>Pressure</th><th>Description</th>"
- "<th>Appearance</th>\n"
- << indent + s(2) << "</tr>\n"
- << indent + s(2) << "<tr>\n"
- << indent + s(4) << "<td>No Pressure</td>"
- "<td>No physical registers of this class requested.</td>"
- "<td class=\"p-z\">&nbsp;&nbsp;</td>\n"
- << indent + s(2) << "</tr>\n"
- << indent + s(2) << "<tr>\n"
- << indent + s(4) << "<td>Low Pressure</td>"
- "<td>Sufficient physical registers to meet demand.</td>"
- "<td class=\"p-l\">&nbsp;&nbsp;</td>\n"
- << indent + s(2) << "</tr>\n"
- << indent + s(2) << "<tr>\n"
- << indent + s(4) << "<td>High Pressure</td>"
- "<td>Potentially insufficient physical registers to meet demand.</td>"
- "<td class=\"p-h\">&nbsp;&nbsp;</td>\n"
- << indent + s(2) << "</tr>\n"
- << indent << "</table>\n";
- }
-
- template <typename CellType>
- void RenderMachineFunction::renderCellsWithRLE(
- const Spacer &indent, raw_ostream &os,
- const std::pair<CellType, unsigned> &rleAccumulator,
- const std::map<CellType, std::string> &cellTypeStrs) const {
-
- if (rleAccumulator.second == 0)
- return;
-
- typename std::map<CellType, std::string>::const_iterator ctsItr =
- cellTypeStrs.find(rleAccumulator.first);
-
- assert(ctsItr != cellTypeStrs.end() && "No string for given cell type.");
-
- os << indent + s(4) << "<td class=\"" << ctsItr->second << "\"";
- if (rleAccumulator.second > 1)
- os << " colspan=" << rleAccumulator.second;
- os << "></td>\n";
- }
-
-
- void RenderMachineFunction::renderCodeTablePlusPI(const Spacer &indent,
- raw_ostream &os) const {
-
- std::map<LiveState, std::string> lsStrs;
- lsStrs[Dead] = "l-n";
- lsStrs[Defined] = "l-d";
- lsStrs[Used] = "l-u";
- lsStrs[AliveReg] = "l-r";
- lsStrs[AliveStack] = "l-s";
-
- std::map<PressureState, std::string> psStrs;
- psStrs[Zero] = "p-z";
- psStrs[Low] = "p-l";
- psStrs[High] = "p-h";
-
- // Open the table...
-
- os << indent << "<table cellpadding=0 cellspacing=0 class=\"code\">\n"
- << indent + s(2) << "<tr>\n";
-
- // Render the header row...
-
- os << indent + s(4) << "<th>index</th>\n"
- << indent + s(4) << "<th>instr</th>\n";
-
- // Render class names if necessary...
- if (!ro.regClasses().empty()) {
- for (MFRenderingOptions::RegClassSet::const_iterator
- rcItr = ro.regClasses().begin(),
- rcEnd = ro.regClasses().end();
- rcItr != rcEnd; ++rcItr) {
- const TargetRegisterClass *trc = *rcItr;
- os << indent + s(4) << "<th>\n";
- renderVertical(indent + s(6), os, trc->getName());
- os << indent + s(4) << "</th>\n";
- }
- }
-
- // FIXME: Is there a nicer way to insert space between columns in HTML?
- if (!ro.regClasses().empty() && !ro.intervals().empty())
- os << indent + s(4) << "<th>&nbsp;&nbsp;</th>\n";
-
- // Render interval numbers if necessary...
- if (!ro.intervals().empty()) {
- for (MFRenderingOptions::IntervalSet::const_iterator
- liItr = ro.intervals().begin(),
- liEnd = ro.intervals().end();
- liItr != liEnd; ++liItr) {
-
- const LiveInterval *li = *liItr;
- os << indent + s(4) << "<th>\n";
- renderVertical(indent + s(6), os, li->reg);
- os << indent + s(4) << "</th>\n";
- }
- }
-
- os << indent + s(2) << "</tr>\n";
-
- // End header row, start with the data rows...
-
- MachineInstr *mi = 0;
-
- // Data rows:
- for (SlotIndex i = sis->getZeroIndex(); i != sis->getLastIndex();
- i = i.getNextSlot()) {
-
- // Render the slot column.
- os << indent + s(2) << "<tr height=6ex>\n";
-
- // Render the code column.
- if (i.isBlock()) {
- MachineBasicBlock *mbb = sis->getMBBFromIndex(i);
- mi = sis->getInstructionFromIndex(i);
-
- if (i == sis->getMBBStartIdx(mbb) || mi != 0 ||
- ro.renderEmptyIndexes()) {
- os << indent + s(4) << "<td rowspan=4>" << i << "&nbsp;</td>\n"
- << indent + s(4) << "<td rowspan=4>\n";
-
- if (i == sis->getMBBStartIdx(mbb)) {
- os << indent + s(6) << "BB#" << mbb->getNumber() << ":&nbsp;\n";
- } else if (mi != 0) {
- os << indent + s(6) << "&nbsp;&nbsp;";
- renderMachineInstr(os, mi);
- } else {
- // Empty interval - leave blank.
- }
- os << indent + s(4) << "</td>\n";
- } else {
- i = i.getDeadSlot(); // <- Will be incremented to the next index.
- continue;
- }
- }
-
- // Render the class columns.
- if (!ro.regClasses().empty()) {
- std::pair<PressureState, unsigned> psRLEAccumulator(Zero, 0);
- for (MFRenderingOptions::RegClassSet::const_iterator
- rcItr = ro.regClasses().begin(),
- rcEnd = ro.regClasses().end();
- rcItr != rcEnd; ++rcItr) {
- const TargetRegisterClass *trc = *rcItr;
- PressureState newPressure = getPressureStateAt(trc, i);
-
- if (newPressure == psRLEAccumulator.first) {
- ++psRLEAccumulator.second;
- } else {
- renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
- psRLEAccumulator.first = newPressure;
- psRLEAccumulator.second = 1;
- }
- }
- renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
- }
-
- // FIXME: Is there a nicer way to insert space between columns in HTML?
- if (!ro.regClasses().empty() && !ro.intervals().empty())
- os << indent + s(4) << "<td width=2em></td>\n";
-
- if (!ro.intervals().empty()) {
- std::pair<LiveState, unsigned> lsRLEAccumulator(Dead, 0);
- for (MFRenderingOptions::IntervalSet::const_iterator
- liItr = ro.intervals().begin(),
- liEnd = ro.intervals().end();
- liItr != liEnd; ++liItr) {
- const LiveInterval *li = *liItr;
- LiveState newLiveness = getLiveStateAt(li, i);
-
- if (newLiveness == lsRLEAccumulator.first) {
- ++lsRLEAccumulator.second;
- } else {
- renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
- lsRLEAccumulator.first = newLiveness;
- lsRLEAccumulator.second = 1;
- }
- }
- renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
- }
- os << indent + s(2) << "</tr>\n";
- }
-
- os << indent << "</table>\n";
-
- if (!ro.regClasses().empty())
- renderPressureTableLegend(indent, os);
- }
-
- void RenderMachineFunction::renderFunctionPage(
- raw_ostream &os,
- const char * const renderContextStr) const {
- os << "<html>\n"
- << s(2) << "<head>\n"
- << s(4) << "<title>" << fqn << "</title>\n";
-
- insertCSS(s(4), os);
-
- os << s(2) << "<head>\n"
- << s(2) << "<body >\n";
-
- renderFunctionSummary(s(4), os, renderContextStr);
-
- os << s(4) << "<br/><br/><br/>\n";
-
- //renderLiveIntervalInfoTable(" ", os);
-
- os << s(4) << "<br/><br/><br/>\n";
-
- renderCodeTablePlusPI(s(4), os);
-
- os << s(2) << "</body>\n"
- << "</html>\n";
- }
-
- void RenderMachineFunction::getAnalysisUsage(AnalysisUsage &au) const {
- au.addRequired<SlotIndexes>();
- au.addRequired<LiveIntervals>();
- au.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(au);
- }
-
- bool RenderMachineFunction::runOnMachineFunction(MachineFunction &fn) {
-
- mf = &fn;
- mri = &mf->getRegInfo();
- tri = mf->getTarget().getRegisterInfo();
- lis = &getAnalysis<LiveIntervals>();
- sis = &getAnalysis<SlotIndexes>();
-
- trei.setup(mf, mri, tri, lis);
- ro.setup(mf, tri, lis, this);
- spillIntervals.clear();
- spillFor.clear();
- useDefs.clear();
-
- fqn = mf->getFunction()->getParent()->getModuleIdentifier() + "." +
- mf->getFunction()->getName().str();
-
- return false;
- }
-
- void RenderMachineFunction::releaseMemory() {
- trei.clear();
- ro.clear();
- spillIntervals.clear();
- spillFor.clear();
- useDefs.clear();
- }
-
- void RenderMachineFunction::rememberUseDefs(const LiveInterval *li) {
-
- if (!ro.shouldRenderCurrentMachineFunction())
- return;
-
- for (MachineRegisterInfo::reg_iterator rItr = mri->reg_begin(li->reg),
- rEnd = mri->reg_end();
- rItr != rEnd; ++rItr) {
- const MachineInstr *mi = &*rItr;
- if (mi->readsRegister(li->reg)) {
- useDefs[li].insert(lis->getInstructionIndex(mi).getRegSlot(true));
- }
- if (mi->definesRegister(li->reg)) {
- useDefs[li].insert(lis->getInstructionIndex(mi).getRegSlot());
- }
- }
- }
-
- void RenderMachineFunction::rememberSpills(
- const LiveInterval *li,
- const std::vector<LiveInterval*> &spills) {
-
- if (!ro.shouldRenderCurrentMachineFunction())
- return;
-
- for (std::vector<LiveInterval*>::const_iterator siItr = spills.begin(),
- siEnd = spills.end();
- siItr != siEnd; ++siItr) {
- const LiveInterval *spill = *siItr;
- spillIntervals[li].insert(spill);
- spillFor[spill] = li;
- }
- }
-
- bool RenderMachineFunction::isSpill(const LiveInterval *li) const {
- SpillForMap::const_iterator sfItr = spillFor.find(li);
- if (sfItr == spillFor.end())
- return false;
- return true;
- }
-
- void RenderMachineFunction::renderMachineFunction(
- const char *renderContextStr,
- const VirtRegMap *vrm,
- const char *renderSuffix) {
- if (!ro.shouldRenderCurrentMachineFunction())
- return;
-
- this->vrm = vrm;
- trei.reset();
-
- std::string rpFileName(mf->getFunction()->getName().str() +
- (renderSuffix ? renderSuffix : "") +
- outputFileSuffix);
-
- std::string errMsg;
- raw_fd_ostream outFile(rpFileName.c_str(), errMsg, raw_fd_ostream::F_Binary);
-
- renderFunctionPage(outFile, renderContextStr);
-
- ro.resetRenderSpecificOptions();
- }
-
- std::string RenderMachineFunction::escapeChars(const std::string &s) const {
- return escapeChars(s.begin(), s.end());
- }
-
-}
diff --git a/lib/CodeGen/RenderMachineFunction.h b/lib/CodeGen/RenderMachineFunction.h
deleted file mode 100644
index 8571992..0000000
--- a/lib/CodeGen/RenderMachineFunction.h
+++ /dev/null
@@ -1,338 +0,0 @@
-//===-- llvm/CodeGen/RenderMachineFunction.h - MF->HTML -*- C++ -*---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_RENDERMACHINEFUNCTION_H
-#define LLVM_CODEGEN_RENDERMACHINEFUNCTION_H
-
-#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-
-#include <algorithm>
-#include <map>
-#include <set>
-#include <string>
-
-namespace llvm {
-
- class LiveInterval;
- class LiveIntervals;
- class MachineInstr;
- class MachineRegisterInfo;
- class RenderMachineFunction;
- class TargetRegisterClass;
- class TargetRegisterInfo;
- class VirtRegMap;
- class raw_ostream;
-
- /// \brief Helper class to process rendering options. Tries to be as lazy as
- /// possible.
- class MFRenderingOptions {
- public:
-
- struct RegClassComp {
- bool operator()(const TargetRegisterClass *trc1,
- const TargetRegisterClass *trc2) const {
- std::string trc1Name(trc1->getName()), trc2Name(trc2->getName());
- return std::lexicographical_compare(trc1Name.begin(), trc1Name.end(),
- trc2Name.begin(), trc2Name.end());
- }
- };
-
- typedef std::set<const TargetRegisterClass*, RegClassComp> RegClassSet;
-
- struct IntervalComp {
- bool operator()(const LiveInterval *li1, const LiveInterval *li2) const {
- return li1->reg < li2->reg;
- }
- };
-
- typedef std::set<const LiveInterval*, IntervalComp> IntervalSet;
-
- /// Initialise the rendering options.
- void setup(MachineFunction *mf, const TargetRegisterInfo *tri,
- LiveIntervals *lis, const RenderMachineFunction *rmf);
-
- /// Clear translations of options to the current function.
- void clear();
-
- /// Reset any options computed for this specific rendering.
- void resetRenderSpecificOptions();
-
- /// Should we render the current function.
- bool shouldRenderCurrentMachineFunction() const;
-
- /// Return the set of register classes to render pressure for.
- const RegClassSet& regClasses() const;
-
- /// Return the set of live intervals to render liveness for.
- const IntervalSet& intervals() const;
-
- /// Render indexes which are not associated with instructions / MBB starts.
- bool renderEmptyIndexes() const;
-
- /// Return whether or not to render using SVG for fancy vertical text.
- bool fancyVerticals() const;
-
- private:
-
- static bool renderingOptionsProcessed;
- static std::set<std::string> mfNamesToRender;
- static bool renderAllMFs;
-
- static std::set<std::string> classNamesToRender;
- static bool renderAllClasses;
-
-
- static std::set<std::pair<unsigned, unsigned> > intervalNumsToRender;
- typedef enum { ExplicitOnly = 0,
- AllPhys = 1,
- VirtNoSpills = 2,
- VirtSpills = 4,
- AllVirt = 6,
- All = 7 }
- IntervalTypesToRender;
- static unsigned intervalTypesToRender;
-
- template <typename OutputItr>
- static void splitComaSeperatedList(const std::string &s, OutputItr outItr);
-
- static void processOptions();
-
- static void processFuncNames();
- static void processRegClassNames();
- static void processIntervalNumbers();
-
- static void processIntervalRange(const std::string &intervalRangeStr);
-
- MachineFunction *mf;
- const TargetRegisterInfo *tri;
- LiveIntervals *lis;
- const RenderMachineFunction *rmf;
-
- mutable bool regClassesTranslatedToCurrentFunction;
- mutable RegClassSet regClassSet;
-
- mutable bool intervalsTranslatedToCurrentFunction;
- mutable IntervalSet intervalSet;
-
- void translateRegClassNamesToCurrentFunction() const;
-
- void translateIntervalNumbersToCurrentFunction() const;
- };
-
- /// \brief Provide extra information about the physical and virtual registers
- /// in the function being compiled.
- class TargetRegisterExtraInfo {
- public:
- TargetRegisterExtraInfo();
-
- /// \brief Set up TargetRegisterExtraInfo with pointers to necessary
- /// sources of information.
- void setup(MachineFunction *mf, MachineRegisterInfo *mri,
- const TargetRegisterInfo *tri, LiveIntervals *lis);
-
- /// \brief Recompute tables for changed function.
- void reset();
-
- /// \brief Free all tables in TargetRegisterExtraInfo.
- void clear();
-
- /// \brief Maximum number of registers from trc which alias reg.
- unsigned getWorst(unsigned reg, const TargetRegisterClass *trc) const;
-
- /// \brief Returns the number of allocable registers in trc.
- unsigned getCapacity(const TargetRegisterClass *trc) const;
-
- /// \brief Return the number of registers of class trc that may be
- /// needed at slot i.
- unsigned getPressureAtSlot(const TargetRegisterClass *trc,
- SlotIndex i) const;
-
- /// \brief Return true if the number of registers of type trc that may be
- /// needed at slot i is greater than the capacity of trc.
- bool classOverCapacityAtSlot(const TargetRegisterClass *trc,
- SlotIndex i) const;
-
- private:
-
- MachineFunction *mf;
- MachineRegisterInfo *mri;
- const TargetRegisterInfo *tri;
- LiveIntervals *lis;
-
- typedef std::map<const TargetRegisterClass*, unsigned> WorstMapLine;
- typedef std::map<const TargetRegisterClass*, WorstMapLine> VRWorstMap;
- VRWorstMap vrWorst;
-
- typedef std::map<unsigned, WorstMapLine> PRWorstMap;
- PRWorstMap prWorst;
-
- typedef std::map<const TargetRegisterClass*, unsigned> CapacityMap;
- CapacityMap capacityMap;
-
- typedef std::map<const TargetRegisterClass*, unsigned> PressureMapLine;
- typedef std::map<SlotIndex, PressureMapLine> PressureMap;
- PressureMap pressureMap;
-
- bool mapsPopulated;
-
- /// \brief Initialise the 'worst' table.
- void initWorst();
-
- /// \brief Initialise the 'capacity' table.
- void initCapacity();
-
- /// \brief Initialise/Reset the 'pressure' and live states tables.
- void resetPressureAndLiveStates();
- };
-
- /// \brief Render MachineFunction objects and related information to a HTML
- /// page.
- class RenderMachineFunction : public MachineFunctionPass {
- public:
- static char ID;
-
- RenderMachineFunction() : MachineFunctionPass(ID) {
- initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry());
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &au) const;
-
- virtual bool runOnMachineFunction(MachineFunction &fn);
-
- virtual void releaseMemory();
-
- void rememberUseDefs(const LiveInterval *li);
-
- void rememberSpills(const LiveInterval *li,
- const std::vector<LiveInterval*> &spills);
-
- bool isSpill(const LiveInterval *li) const;
-
- /// \brief Render this machine function to HTML.
- ///
- /// @param renderContextStr This parameter will be included in the top of
- /// the html file to explain where (in the
- /// codegen pipeline) this function was rendered
- /// from. Set it to something like
- /// "Pre-register-allocation".
- /// @param vrm If non-null the VRM will be queried to determine
- /// whether a virtual register was allocated to a
- /// physical register or spilled.
- /// @param renderFilePrefix This string will be appended to the function
- /// name (before the output file suffix) to enable
- /// multiple renderings from the same function.
- void renderMachineFunction(const char *renderContextStr,
- const VirtRegMap *vrm = 0,
- const char *renderSuffix = 0);
-
- private:
- class Spacer;
- friend raw_ostream& operator<<(raw_ostream &os, const Spacer &s);
-
- std::string fqn;
-
- MachineFunction *mf;
- MachineRegisterInfo *mri;
- const TargetRegisterInfo *tri;
- LiveIntervals *lis;
- SlotIndexes *sis;
- const VirtRegMap *vrm;
-
- TargetRegisterExtraInfo trei;
- MFRenderingOptions ro;
-
-
-
- // Utilities.
- typedef enum { Dead, Defined, Used, AliveReg, AliveStack } LiveState;
- LiveState getLiveStateAt(const LiveInterval *li, SlotIndex i) const;
-
- typedef enum { Zero, Low, High } PressureState;
- PressureState getPressureStateAt(const TargetRegisterClass *trc,
- SlotIndex i) const;
-
- typedef std::map<const LiveInterval*, std::set<const LiveInterval*> >
- SpillIntervals;
- SpillIntervals spillIntervals;
-
- typedef std::map<const LiveInterval*, const LiveInterval*> SpillForMap;
- SpillForMap spillFor;
-
- typedef std::set<SlotIndex> SlotSet;
- typedef std::map<const LiveInterval*, SlotSet> UseDefs;
- UseDefs useDefs;
-
- // ---------- Rendering methods ----------
-
- /// For inserting spaces when pretty printing.
- class Spacer {
- public:
- explicit Spacer(unsigned numSpaces) : ns(numSpaces) {}
- Spacer operator+(const Spacer &o) const { return Spacer(ns + o.ns); }
- void print(raw_ostream &os) const;
- private:
- unsigned ns;
- };
-
- Spacer s(unsigned ns) const;
-
- template <typename Iterator>
- std::string escapeChars(Iterator sBegin, Iterator sEnd) const;
-
- /// \brief Render a machine instruction.
- void renderMachineInstr(raw_ostream &os,
- const MachineInstr *mi) const;
-
- /// \brief Render vertical text.
- template <typename T>
- void renderVertical(const Spacer &indent,
- raw_ostream &os,
- const T &t) const;
-
- /// \brief Insert CSS layout info.
- void insertCSS(const Spacer &indent,
- raw_ostream &os) const;
-
- /// \brief Render a brief summary of the function (including rendering
- /// context).
- void renderFunctionSummary(const Spacer &indent,
- raw_ostream &os,
- const char * const renderContextStr) const;
-
- /// \brief Render a legend for the pressure table.
- void renderPressureTableLegend(const Spacer &indent,
- raw_ostream &os) const;
-
- /// \brief Render a consecutive set of HTML cells of the same class using
- /// the colspan attribute for run-length encoding.
- template <typename CellType>
- void renderCellsWithRLE(
- const Spacer &indent, raw_ostream &os,
- const std::pair<CellType, unsigned> &rleAccumulator,
- const std::map<CellType, std::string> &cellTypeStrs) const;
-
- /// \brief Render code listing, potentially with register pressure
- /// and live intervals shown alongside.
- void renderCodeTablePlusPI(const Spacer &indent,
- raw_ostream &os) const;
-
- /// \brief Render the HTML page representing the MachineFunction.
- void renderFunctionPage(raw_ostream &os,
- const char * const renderContextStr) const;
-
- std::string escapeChars(const std::string &s) const;
- };
-}
-
-#endif /* LLVM_CODEGEN_RENDERMACHINEFUNCTION_H */
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 8fd6426..752f8e4 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -64,10 +64,27 @@ const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
/// specified node.
bool SUnit::addPred(const SDep &D) {
// If this node already has this dependence, don't add a redundant one.
- for (SmallVector<SDep, 4>::const_iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I)
- if (*I == D)
+ for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
+ I != E; ++I) {
+ if (I->overlaps(D)) {
+ // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
+ if (I->getLatency() < D.getLatency()) {
+ SUnit *PredSU = I->getSUnit();
+ // Find the corresponding successor in N.
+ SDep ForwardD = *I;
+ ForwardD.setSUnit(this);
+ for (SmallVector<SDep, 4>::iterator II = PredSU->Succs.begin(),
+ EE = PredSU->Succs.end(); II != EE; ++II) {
+ if (*II == ForwardD) {
+ II->setLatency(D.getLatency());
+ break;
+ }
+ }
+ I->setLatency(D.getLatency());
+ }
return false;
+ }
+ }
// Now add a corresponding succ to N.
SDep P = D;
P.setSUnit(this);
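With this change, re-adding an overlapping dependence widens the latency of
the existing edge on both the predecessor and successor sides instead of
creating a duplicate. A small illustration, using the SDep constructor form
seen elsewhere in this diff (the SUnit wiring is elided):

    SDep D(PredSU, SDep::Data, /*latency=*/2, Reg);
    SU->addPred(D);  // inserts the edge
    D.setLatency(5);
    SU->addPred(D);  // returns false: no new edge, but its latency becomes 5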
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index d46eb89..9c1dba3 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -21,17 +21,24 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
+static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
+ cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable use of AA during MI GAD construction"));
+
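Being a standard cl::opt, the flag can be toggled on any tool that runs the
MI scheduler, e.g. llc -enable-aa-sched-mi; it defaults to off and, being
cl::Hidden, only shows up under -help-hidden.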
ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
const MachineLoopInfo &mli,
const MachineDominatorTree &mdt,
@@ -40,7 +47,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
: ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
InstrItins(mf.getTarget().getInstrItineraryData()), LIS(lis),
IsPostRA(IsPostRAFlag), UnitLatencies(false), CanHandleTerminators(false),
- LoopRegs(MLI, MDT), FirstDbgValue(0) {
+ LoopRegs(MDT), FirstDbgValue(0) {
assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
DbgValues.clear();
assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
@@ -126,7 +133,8 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
return 0;
}
-void ScheduleDAGInstrs::startBlock(MachineBasicBlock *BB) {
+void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
+ BB = bb;
LoopRegs.Deps.clear();
if (MachineLoop *ML = MLI.getLoopFor(BB))
if (BB == ML->getLoopLatch())
@@ -134,7 +142,8 @@ void ScheduleDAGInstrs::startBlock(MachineBasicBlock *BB) {
}
void ScheduleDAGInstrs::finishBlock() {
- // Nothing to do.
+ // Subclasses should no longer refer to the old block.
+ BB = 0;
}
/// Initialize the map with the number of registers.
@@ -159,7 +168,7 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
unsigned endcount) {
- BB = bb;
+ assert(bb == BB && "startBlock should set BB");
RegionBegin = begin;
RegionEnd = end;
EndIndex = endcount;
@@ -232,7 +241,8 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
unsigned SpecialAddressLatency = ST.getSpecialAddressLatency();
unsigned DataLatency = SU->Latency;
- for (const uint16_t *Alias = TRI->getOverlaps(MO.getReg()); *Alias; ++Alias) {
+ for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+ Alias.isValid(); ++Alias) {
if (!Uses.contains(*Alias))
continue;
std::vector<SUnit*> &UseList = Uses[*Alias];
@@ -261,10 +271,12 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
// Adjust the dependence latency using operand def/use
// information (if any), and then allow the target to
// perform its own adjustments.
- const SDep& dep = SDep(SU, SDep::Data, LDataLatency, *Alias);
+ SDep dep(SU, SDep::Data, LDataLatency, *Alias);
if (!UnitLatencies) {
- computeOperandLatency(SU, UseSU, const_cast<SDep &>(dep));
- ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep));
+ unsigned Latency = computeOperandLatency(SU, UseSU, dep);
+ dep.setLatency(Latency);
+
+ ST.adjustSchedDependency(SU, UseSU, dep);
}
UseSU->addPred(dep);
}
@@ -285,7 +297,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
// TODO: Using a latency of 1 here for output dependencies assumes
// there's no cost for reusing registers.
SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
- for (const uint16_t *Alias = TRI->getOverlaps(MO.getReg()); *Alias; ++Alias) {
+ for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+ Alias.isValid(); ++Alias) {
if (!Defs.contains(*Alias))
continue;
std::vector<SUnit *> &DefList = Defs[*Alias];
@@ -398,9 +411,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
const MachineInstr *MI = SU->getInstr();
unsigned Reg = MI->getOperand(OperIdx).getReg();
- // SSA defs do not have output/anti dependencies.
+ // Singly defined vregs do not have output/anti dependencies.
// The current operand is a def, so we have at least one.
- if (llvm::next(MRI.def_begin(Reg)) == MRI.def_end())
+ // Check here if there are any others...
+ if (MRI.hasOneDef(Reg))
return;
// Add output dependence to the next nearest def of this vreg.
@@ -410,7 +424,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
// uses. We're conservative for now until we have a way to guarantee the uses
// are not eliminated sometime during scheduling. The output dependence edge
// is also useful if output latency exceeds def-use latency.
- VReg2SUnitMap::iterator DefI = findVRegDef(Reg);
+ VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
if (DefI == VRegDefs.end())
VRegDefs.insert(VReg2SUnit(Reg, SU));
else {
@@ -436,10 +450,11 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
// Lookup this operand's reaching definition.
assert(LIS && "vreg dependencies requires LiveIntervals");
- SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot();
- LiveInterval *LI = &LIS->getInterval(Reg);
- VNInfo *VNI = LI->getVNInfoBefore(UseIdx);
+ LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI));
+ VNInfo *VNI = LRQ.valueIn();
+
// VNI will be valid because MachineOperand::readsReg() is checked by caller.
+ assert(VNI && "No value to read by operand");
MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def);
// Phis and other noninstructions (after coalescing) have a NULL Def.
if (Def) {
@@ -449,11 +464,13 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
// Create a data dependence.
//
// TODO: Handle "special" address latencies cleanly.
- const SDep &dep = SDep(DefSU, SDep::Data, DefSU->Latency, Reg);
+ SDep dep(DefSU, SDep::Data, DefSU->Latency, Reg);
if (!UnitLatencies) {
// Adjust the dependence latency using operand def/use information, then
// allow the target to perform its own adjustments.
- computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep));
+ unsigned Latency = computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep));
+ dep.setLatency(Latency);
+
const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
}
@@ -462,11 +479,217 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
}
// Add antidependence to the following def of the vreg it uses.
- VReg2SUnitMap::iterator DefI = findVRegDef(Reg);
+ VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
if (DefI != VRegDefs.end() && DefI->SU != SU)
DefI->SU->addPred(SDep(SU, SDep::Anti, 0, Reg));
}
+/// Return true if MI is an instruction we are unable to reason about
+/// (like a call or something with unmodeled side effects).
+static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
+ if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
+ (MI->hasVolatileMemoryRef() &&
+ (!MI->mayLoad() || !MI->isInvariantLoad(AA))))
+ return true;
+ return false;
+}
+
+// This MI might have either incomplete info, or be known to be unsafe
+// to deal with (e.g. a volatile object).
+static inline bool isUnsafeMemoryObject(MachineInstr *MI,
+ const MachineFrameInfo *MFI) {
+ if (!MI || MI->memoperands_empty())
+ return true;
+ // We purposefully do not check for hasOneMemOperand() here,
+ // in the hope of triggering an assert downstream in order to
+ // finish the implementation.
+ if ((*MI->memoperands_begin())->isVolatile() ||
+ MI->hasUnmodeledSideEffects())
+ return true;
+
+ const Value *V = (*MI->memoperands_begin())->getValue();
+ if (!V)
+ return true;
+
+ V = getUnderlyingObject(V);
+ if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+ // Similarly to getUnderlyingObjectForInstr:
+ // For now, ignore PseudoSourceValues which may alias LLVM IR values
+ // because the code that uses this function has no way to cope with
+ // such aliases.
+ if (PSV->isAliased(MFI))
+ return true;
+ }
+ // Does this pointer refer to a distinct and identifiable object?
+ if (!isIdentifiedObject(V))
+ return true;
+
+ return false;
+}
+
+/// This returns true if the two MIs need a chain edge between them.
+/// Even if these are not memory operations, we may still need
+/// chain deps between them. The question really is: could
+/// these two MIs be reordered during scheduling from a memory dependency
+/// point of view?
+static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+ MachineInstr *MIa,
+ MachineInstr *MIb) {
+ // Cover a trivial case: no edge is needed from a node to itself.
+ if (MIa == MIb)
+ return false;
+
+ if (isUnsafeMemoryObject(MIa, MFI) || isUnsafeMemoryObject(MIb, MFI))
+ return true;
+
+ // If we are dealing with two "normal" loads, we do not need an edge
+ // between them - they could be reordered.
+ if (!MIa->mayStore() && !MIb->mayStore())
+ return false;
+
+ // Up to this point the analysis is generic. From here on we do need AA.
+ if (!AA)
+ return true;
+
+ MachineMemOperand *MMOa = *MIa->memoperands_begin();
+ MachineMemOperand *MMOb = *MIb->memoperands_begin();
+
+ // FIXME: Need to handle multiple memory operands to support all targets.
+ if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
+ llvm_unreachable("Multiple memory operands.");
+
+ // The following interface to AA is fashioned after DAGCombiner::isAlias
+ // and operates with MachineMemOperand offset with some important
+ // assumptions:
+ // - LLVM fundamentally assumes flat address spaces.
+ // - MachineOperand offset can *only* result from legalization and
+ // cannot affect queries other than the trivial case of overlap
+ // checking.
+ // - These offsets never wrap and never step outside
+ // of allocated objects.
+ // - There should never be any negative offsets here.
+ //
+ // FIXME: Modify API to hide this math from "user"
+ // FIXME: Even before we go to AA we can reason locally about some
+ // memory objects. It can save compile time, and possibly catch some
+ // corner cases not currently covered.
+
+ assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
+ assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
+
+ int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
+ int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
+ int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
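+ // For example, with hypothetical operands MMOa = {Offset: 8, Size: 4}
+ // and MMOb = {Offset: 16, Size: 4}: MinOffset = 8, Overlapa = 4 and
+ // Overlapb = 12, so both locations are measured from the common
+ // MinOffset base and the query below reduces to an overlap check.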
+
+ AliasAnalysis::AliasResult AAResult = AA->alias(
+ AliasAnalysis::Location(MMOa->getValue(), Overlapa,
+ MMOa->getTBAAInfo()),
+ AliasAnalysis::Location(MMOb->getValue(), Overlapb,
+ MMOb->getTBAAInfo()));
+
+ return (AAResult != AliasAnalysis::NoAlias);
+}
+
+/// This recursive function iterates over chain deps of SUb looking for
+/// "latest" node that needs a chain edge to SUa.
+static unsigned
+iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+ SUnit *SUa, SUnit *SUb, SUnit *ExitSU, unsigned *Depth,
+ SmallPtrSet<const SUnit*, 16> &Visited) {
+ if (!SUa || !SUb || SUb == ExitSU)
+ return *Depth;
+
+ // Remember visited nodes.
+ if (!Visited.insert(SUb))
+ return *Depth;
+ // If there is _some_ dependency already in place, do not
+ // descend any further.
+ // TODO: Need to make sure that if that dependency got eliminated or ignored
+ // for any reason in the future, we would not violate DAG topology.
+ // Currently that does not happen, but this code makes an implicit
+ // assumption about the future implementation.
+ //
+ // Independently, if we encounter a node that is some sort of global
+ // object (like a call) we already have a full set of dependencies to it
+ // and we can stop descending.
+ if (SUa->isSucc(SUb) ||
+ isGlobalMemoryObject(AA, SUb->getInstr()))
+ return *Depth;
+
+ // If we do need an edge, or we have exceeded the depth budget,
+ // add that edge to the predecessors chain of SUb,
+ // and stop descending.
+ if (*Depth > 200 ||
+ MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+ SUb->addPred(SDep(SUa, SDep::Order, /*Latency=*/0, /*Reg=*/0,
+ /*isNormalMemory=*/true));
+ return *Depth;
+ }
+ // Track current depth.
+ (*Depth)++;
+ // Iterate over chain dependencies only.
+ for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
+ I != E; ++I)
+ if (I->isCtrl())
+ iterateChainSucc (AA, MFI, SUa, I->getSUnit(), ExitSU, Depth, Visited);
+ return *Depth;
+}
+
+/// This function assumes that "downward" from SU there exists a
+/// tail/leaf of the already constructed DAG. It iterates downward and
+/// checks whether SU can alias any node dominated
+/// by it.
+static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+ SUnit *SU, SUnit *ExitSU, std::set<SUnit *> &CheckList,
+ unsigned LatencyToLoad) {
+ if (!SU)
+ return;
+
+ SmallPtrSet<const SUnit*, 16> Visited;
+ unsigned Depth = 0;
+
+ for (std::set<SUnit *>::iterator I = CheckList.begin(), IE = CheckList.end();
+ I != IE; ++I) {
+ if (SU == *I)
+ continue;
+ if (MIsNeedChainEdge(AA, MFI, SU->getInstr(), (*I)->getInstr())) {
+ unsigned Latency = ((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0;
+ (*I)->addPred(SDep(SU, SDep::Order, Latency, /*Reg=*/0,
+ /*isNormalMemory=*/true));
+ }
+ // Now go through all the chain successors and iterate from them.
+ // Keep track of visited nodes.
+ for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
+ JE = (*I)->Succs.end(); J != JE; ++J)
+ if (J->isCtrl())
+ iterateChainSucc (AA, MFI, SU, J->getSUnit(),
+ ExitSU, &Depth, Visited);
+ }
+}
+
+/// Check whether two objects need a chain edge; if so, add it,
+/// otherwise remember the rejected SU.
+static inline
+void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
+ SUnit *SUa, SUnit *SUb,
+ std::set<SUnit *> &RejectList,
+ unsigned TrueMemOrderLatency = 0,
+ bool isNormalMemory = false) {
+ // If this is a false dependency,
+ // do not add the edge, but remember the rejected node.
+ if (!EnableAASchedMI ||
+ MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr()))
+ SUb->addPred(SDep(SUa, SDep::Order, TrueMemOrderLatency, /*Reg=*/0,
+ isNormalMemory));
+ else {
+ // Duplicate entries should be ignored.
+ RejectList.insert(SUb);
+ DEBUG(dbgs() << "\tReject chain dep between SU("
+ << SUa->NodeNum << ") and SU("
+ << SUb->NodeNum << ")\n");
+ }
+}
+
/// Create an SUnit for each real instruction, numbered in top-down topological
/// order. The instruction order A < B implies that no edge exists from B to A.
///
@@ -502,7 +725,11 @@ void ScheduleDAGInstrs::initSUnits() {
}
}
-void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
+/// If RPTracker is non-null, compute register pressure as a side effect. The
+/// DAG builder is an efficient place to do it because it already visits
+/// operands.
+void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
+ RegPressureTracker *RPTracker) {
// Create an SUnit for each real instruction.
initSUnits();
@@ -518,6 +745,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
// that are known not to alias
std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
+ std::set<SUnit*> RejectMemNodes;
// Remove any stale debug info; sometimes BuildSchedGraph is called again
// without emitting the info from the previous call.
@@ -553,6 +781,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
PrevMI = MI;
continue;
}
+ if (RPTracker) {
+ RPTracker->recede();
+ assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
+ }
assert((!MI->isTerminator() || CanHandleTerminators) && !MI->isLabel() &&
"Cannot schedule terminators or labels!");
@@ -587,11 +819,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
// after stack slots are lowered to actual addresses.
// TODO: Use an AliasAnalysis and do real alias-analysis queries, and
// produce more precise dependence information.
-#define STORE_LOAD_LATENCY 1
- unsigned TrueMemOrderLatency = 0;
- if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
- (MI->hasVolatileMemoryRef() &&
- (!MI->mayLoad() || !MI->isInvariantLoad(AA)))) {
+ unsigned TrueMemOrderLatency = MI->mayStore() ? 1 : 0;
+ if (isGlobalMemoryObject(AA, MI)) {
// Be conservative with these and add dependencies on all memory
// references, even those that are known to not alias.
for (std::map<const Value *, SUnit *>::iterator I =
@@ -603,36 +832,48 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
for (unsigned i = 0, e = I->second.size(); i != e; ++i)
I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
}
- NonAliasMemDefs.clear();
- NonAliasMemUses.clear();
// Add SU to the barrier chain.
if (BarrierChain)
BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
BarrierChain = SU;
+ // This is a barrier event that acts as a pivotal node in the DAG,
+ // so it is safe to clear the list of exposed nodes.
+ adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+ TrueMemOrderLatency);
+ RejectMemNodes.clear();
+ NonAliasMemDefs.clear();
+ NonAliasMemUses.clear();
// fall-through
new_alias_chain:
// Chain all possibly aliasing memory references through SU.
- if (AliasChain)
- AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ if (AliasChain) {
+ unsigned ChainLatency = 0;
+ if (AliasChain->getInstr()->mayLoad())
+ ChainLatency = TrueMemOrderLatency;
+ addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+ ChainLatency);
+ }
AliasChain = SU;
for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
- PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+ addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+ TrueMemOrderLatency);
for (std::map<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
- E = AliasMemDefs.end(); I != E; ++I) {
- I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
- }
+ E = AliasMemDefs.end(); I != E; ++I)
+ addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+ addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+ TrueMemOrderLatency);
}
+ adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+ TrueMemOrderLatency);
PendingLoads.clear();
AliasMemDefs.clear();
AliasMemUses.clear();
} else if (MI->mayStore()) {
bool MayAlias = true;
- TrueMemOrderLatency = STORE_LOAD_LATENCY;
if (const Value *V = getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
// A store to a specific PseudoSourceValue. Add precise dependencies.
// Record the def in MemDefs, first adding a dep if there is
@@ -642,8 +883,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
std::map<const Value *, SUnit *>::iterator IE =
((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
if (I != IE) {
- I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
- /*isNormalMemory=*/true));
+ addChainDependency(AA, MFI, SU, I->second, RejectMemNodes,
+ 0, true);
I->second = SU;
} else {
if (MayAlias)
@@ -658,20 +899,28 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
((MayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
if (J != JE) {
for (unsigned i = 0, e = J->second.size(); i != e; ++i)
- J->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency,
- /*Reg=*/0, /*isNormalMemory=*/true));
+ addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+ TrueMemOrderLatency, true);
J->second.clear();
}
if (MayAlias) {
// Add dependencies from all the PendingLoads, i.e. loads
// with no underlying object.
for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
- PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+ addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+ TrueMemOrderLatency);
// Add dependence on alias chain, if needed.
if (AliasChain)
- AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+ // But we should also check dependent instructions for the
+ // SU in question.
+ adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+ TrueMemOrderLatency);
}
// Add dependence on barrier chain, if needed.
+ // There is no point in checking aliasing on a barrier event. Even if
+ // SU and the barrier _could_ be reordered, they should not be. In
+ // addition, we have lost all RejectMemNodes below the barrier.
if (BarrierChain)
BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
} else {
@@ -688,7 +937,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
/*isArtificial=*/true));
} else if (MI->mayLoad()) {
bool MayAlias = true;
- TrueMemOrderLatency = 0;
if (MI->isInvariantLoad(AA)) {
// Invariant load, no chain dependencies needed!
} else {
@@ -700,8 +948,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
std::map<const Value *, SUnit *>::iterator IE =
((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
if (I != IE)
- I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
- /*isNormalMemory=*/true));
+ addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
if (MayAlias)
AliasMemUses[V].push_back(SU);
else
@@ -711,15 +958,16 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
// potentially aliasing stores.
for (std::map<const Value *, SUnit *>::iterator I =
AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
- I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
PendingLoads.push_back(SU);
MayAlias = true;
}
-
+ if (MayAlias)
+ adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
// Add dependencies on alias and barrier chains, if needed.
if (MayAlias && AliasChain)
- AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
if (BarrierChain)
BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
}
@@ -735,8 +983,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
}
void ScheduleDAGInstrs::computeLatency(SUnit *SU) {
- // Compute the latency for the node.
- if (!InstrItins || InstrItins->isEmpty()) {
+ // Compute the latency for the node. We only provide a default for missing
+ // itineraries. Empty itineraries still have latency properties.
+ if (!InstrItins) {
SU->Latency = 1;
// Simplistic target-independent heuristic: assume that loads take
@@ -748,63 +997,15 @@ void ScheduleDAGInstrs::computeLatency(SUnit *SU) {
}
}
-void ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
- SDep& dep) const {
- if (!InstrItins || InstrItins->isEmpty())
- return;
-
+unsigned ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
+ const SDep& dep,
+ bool FindMin) const {
// For a data dependency with a known register...
if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
- return;
-
- const unsigned Reg = dep.getReg();
-
- // ... find the definition of the register in the defining
- // instruction
- MachineInstr *DefMI = Def->getInstr();
- int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
- if (DefIdx != -1) {
- const MachineOperand &MO = DefMI->getOperand(DefIdx);
- if (MO.isReg() && MO.isImplicit() &&
- DefIdx >= (int)DefMI->getDesc().getNumOperands()) {
- // This is an implicit def, getOperandLatency() won't return the correct
- // latency. e.g.
- // %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def>
- // %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ...
- // What we want is to compute latency between def of %D6/%D7 and use of
- // %Q3 instead.
- unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI);
- if (DefMI->getOperand(Op2).isReg())
- DefIdx = Op2;
- }
- MachineInstr *UseMI = Use->getInstr();
- // For all uses of the register, calculate the maxmimum latency
- int Latency = -1;
- if (UseMI) {
- for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = UseMI->getOperand(i);
- if (!MO.isReg() || !MO.isUse())
- continue;
- unsigned MOReg = MO.getReg();
- if (MOReg != Reg)
- continue;
-
- int UseCycle = TII->getOperandLatency(InstrItins, DefMI, DefIdx,
- UseMI, i);
- Latency = std::max(Latency, UseCycle);
- }
- } else {
- // UseMI is null, then it must be a scheduling barrier.
- if (!InstrItins || InstrItins->isEmpty())
- return;
- unsigned DefClass = DefMI->getDesc().getSchedClass();
- Latency = InstrItins->getOperandCycle(DefClass, DefIdx);
- }
+ return 1;
- // If we found a latency, then replace the existing dependence latency.
- if (Latency >= 0)
- dep.setLatency(Latency);
- }
+ return TII->computeOperandLatency(InstrItins, TRI, Def->getInstr(),
+ Use->getInstr(), dep.getReg(), FindMin);
}
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 3d22035..e675366 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -39,13 +39,11 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II,
DebugType = ParentDebugType;
#endif
- // Determine the maximum depth of any itinerary. This determines the
- // depth of the scoreboard. We always make the scoreboard at least 1
- // cycle deep to avoid dealing with the boundary condition.
+ // Determine the maximum depth of any itinerary. This determines the depth of
+ // the scoreboard. We always make the scoreboard at least 1 cycle deep to
+ // avoid dealing with the boundary condition.
unsigned ScoreboardDepth = 1;
if (ItinData && !ItinData->isEmpty()) {
- IssueWidth = ItinData->IssueWidth;
-
for (unsigned idx = 0; ; ++idx) {
if (ItinData->isEndMarker(idx))
break;
@@ -63,16 +61,26 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II,
// Find the next power-of-2 >= ItinDepth
while (ItinDepth > ScoreboardDepth) {
ScoreboardDepth *= 2;
+ // Don't set MaxLookAhead until we find at least one nonzero stage.
+ // This way, an itinerary with no stages has MaxLookAhead==0, which
+ // completely bypasses the scoreboard hazard logic.
+ MaxLookAhead = ScoreboardDepth;
}
}
- MaxLookAhead = ScoreboardDepth;
}
ReservedScoreboard.reset(ScoreboardDepth);
RequiredScoreboard.reset(ScoreboardDepth);
- DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
- << ScoreboardDepth << '\n');
+ // If MaxLookAhead is not set above, then we are not enabled.
+ if (!isEnabled())
+ DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
+ else {
+ // A nonempty itinerary must have a SchedModel.
+ IssueWidth = ItinData->SchedModel->IssueWidth;
+ DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
+ << ScoreboardDepth << '\n');
+ }
}
void ScoreboardHazardRecognizer::Reset() {
@@ -151,7 +159,7 @@ ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
}
if (!freeUnits) {
- DEBUG(dbgs() << "*** Hazard in cycle " << (cycle + i) << ", ");
+ DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", ");
DEBUG(dbgs() << "SU(" << SU->NodeNum << "): ");
DEBUG(DAG->dumpNode(SU));
return Hazard;
diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt
index a6bdc3b..75e8167 100644
--- a/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -23,3 +23,5 @@ add_llvm_library(LLVMSelectionDAG
TargetLowering.cpp
TargetSelectionDAGInfo.cpp
)
+
+add_dependencies(LLVMSelectionDAG intrinsics_gen)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0914c66..4e29879 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -215,6 +215,7 @@ namespace {
SDValue visitFADD(SDNode *N);
SDValue visitFSUB(SDNode *N);
SDValue visitFMUL(SDNode *N);
+ SDValue visitFMA(SDNode *N);
SDValue visitFDIV(SDNode *N);
SDValue visitFREM(SDNode *N);
SDValue visitFCOPYSIGN(SDNode *N);
@@ -227,6 +228,9 @@ namespace {
SDValue visitFP_EXTEND(SDNode *N);
SDValue visitFNEG(SDNode *N);
SDValue visitFABS(SDNode *N);
+ SDValue visitFCEIL(SDNode *N);
+ SDValue visitFTRUNC(SDNode *N);
+ SDValue visitFFLOOR(SDNode *N);
SDValue visitBRCOND(SDNode *N);
SDValue visitBR_CC(SDNode *N);
SDValue visitLOAD(SDNode *N);
@@ -328,15 +332,12 @@ namespace {
class WorkListRemover : public SelectionDAG::DAGUpdateListener {
DAGCombiner &DC;
public:
- explicit WorkListRemover(DAGCombiner &dc) : DC(dc) {}
+ explicit WorkListRemover(DAGCombiner &dc)
+ : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
virtual void NodeDeleted(SDNode *N, SDNode *E) {
DC.removeFromWorkList(N);
}
-
- virtual void NodeUpdated(SDNode *N) {
- // Ignore updates.
- }
};
}
@@ -619,8 +620,7 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
N->getValueType(i) == To[i].getValueType()) &&
"Cannot combine value to value of different type!"));
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesWith(N, To, &DeadNodes);
-
+ DAG.ReplaceAllUsesWith(N, To);
if (AddTo) {
// Push the new nodes and any users onto the worklist
for (unsigned i = 0, e = NumTo; i != e; ++i) {
@@ -650,7 +650,7 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
// Replace all uses. If any nodes become isomorphic to other nodes and
// are deleted, make sure to remove them from our worklist.
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
// Push the new node and any (possibly new) users onto the worklist.
AddToWorkList(TLO.New.getNode());
@@ -707,9 +707,8 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
Trunc.getNode()->dump(&DAG);
dbgs() << '\n');
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc, &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
removeFromWorkList(Load);
DAG.DeleteNode(Load);
AddToWorkList(Trunc.getNode());
@@ -961,8 +960,8 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
Result.getNode()->dump(&DAG);
dbgs() << '\n');
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result, &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1), &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
removeFromWorkList(N);
DAG.DeleteNode(N);
AddToWorkList(Result.getNode());
@@ -1047,12 +1046,12 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
DAG.TransferDbgValues(SDValue(N, 0), RV);
WorkListRemover DeadNodes(*this);
if (N->getNumValues() == RV.getNode()->getNumValues())
- DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes);
+ DAG.ReplaceAllUsesWith(N, RV.getNode());
else {
assert(N->getValueType(0) == RV.getValueType() &&
N->getNumValues() == 1 && "Type mismatch");
SDValue OpV = RV;
- DAG.ReplaceAllUsesWith(N, &OpV, &DeadNodes);
+ DAG.ReplaceAllUsesWith(N, &OpV);
}
// Push the new node and any users onto the worklist
@@ -1131,6 +1130,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FADD: return visitFADD(N);
case ISD::FSUB: return visitFSUB(N);
case ISD::FMUL: return visitFMUL(N);
+ case ISD::FMA: return visitFMA(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
@@ -1143,6 +1143,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FP_EXTEND: return visitFP_EXTEND(N);
case ISD::FNEG: return visitFNEG(N);
case ISD::FABS: return visitFABS(N);
+ case ISD::FFLOOR: return visitFFLOOR(N);
+ case ISD::FCEIL: return visitFCEIL(N);
+ case ISD::FTRUNC: return visitFTRUNC(N);
case ISD::BRCOND: return visitBRCOND(N);
case ISD::BR_CC: return visitBR_CC(N);
case ISD::LOAD: return visitLOAD(N);
@@ -1325,10 +1328,12 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
// Replacing results may cause a different MERGE_VALUES to suddenly
// be CSE'd with N, and carry its uses with it. Iterate until no
// uses remain, to ensure that the node can be safely deleted.
+ // First add the users of this node to the work list so that they
+ // can be tried again once they have new operands.
+ AddUsersToWorkList(N);
do {
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i));
} while (!N->use_empty());
removeFromWorkList(N);
DAG.DeleteNode(N);
@@ -1640,7 +1645,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
SDValue NewC = DAG.getConstant((N0C->getAPIntValue() - N1C1->getAPIntValue()), VT);
return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC,
- N1.getOperand(0));
+ N1.getOperand(0));
}
// fold ((A+(B+or-C))-B) -> A+or-C
if (N0.getOpcode() == ISD::ADD &&
@@ -2341,7 +2346,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
// We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
// on scalars.
if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR)
- && Level == AfterLegalizeVectorOps) {
+ && Level == AfterLegalizeTypes) {
SDValue In0 = N0.getOperand(0);
SDValue In1 = N1.getOperand(0);
EVT In0Ty = In0.getValueType();
@@ -2528,7 +2533,14 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
Load->getOffset(), Load->getMemoryVT(),
Load->getMemOperand());
// Replace uses of the EXTLOAD with the new ZEXTLOAD.
- CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
+ if (Load->getNumValues() == 3) {
+ // PRE/POST_INC loads have 3 values.
+ SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
+ NewLoad.getValue(2) };
+ CombineTo(Load, To, 3, true);
+ } else {
+ CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
+ }
}
// Fold the AND away, taking care not to fold to the old load node if we
@@ -2710,6 +2722,34 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
}
+ if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
+ VT.getSizeInBits() <= 64) {
+ if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ APInt ADDC = ADDI->getAPIntValue();
+ if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+ // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
+ // immediate for an add, but it is legal if its top c2 bits are set,
+ // transform the ADD so the immediate doesn't need to be materialized
+ // in a register.
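+ // For example (hypothetical, i64): with c2 == 48 the AND clears the
+ // top 48 bits of the result anyway, so c1 == 0x1234 may be rewritten
+ // as 0xFFFFFFFFFFFF1234 when that value happens to be encodable as an
+ // add immediate on the target.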
+ if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+ SRLI->getZExtValue());
+ if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
+ ADDC |= Mask;
+ if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+ SDValue NewAdd =
+ DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT,
+ N0.getOperand(0), DAG.getConstant(ADDC, VT));
+ CombineTo(N0.getNode(), NewAdd);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+ }
+ }
+ }
+
return SDValue();
}
@@ -4526,8 +4566,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue Op = N0.getOperand(0);
if (Op.getValueType().bitsLT(VT)) {
Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
+ AddToWorkList(Op.getNode());
} else if (Op.getValueType().bitsGT(VT)) {
Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+ AddToWorkList(Op.getNode());
}
return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
N0.getValueType().getScalarType());
@@ -5012,6 +5054,10 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
EVT PtrType = N0.getOperand(1).getValueType();
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ // It's not possible to generate a constant of extended or untyped type.
+ return SDValue();
+
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
if (TLI.isBigEndian()) {
@@ -5041,8 +5087,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
// Replace the old load's chain with the new load's chain.
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
// Shift the result left, if we've swallowed a left shift.
SDValue Result = Load;
@@ -5225,7 +5270,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue EltNo = N0->getOperand(1);
if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
-
+ EVT IndexTy = N0->getOperand(1).getValueType();
int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
SDValue V = DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
@@ -5233,7 +5278,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
N->getDebugLoc(), TrTy, V,
- DAG.getConstant(Index, MVT::i32));
+ DAG.getConstant(Index, IndexTy));
}
}
@@ -5607,7 +5652,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
if (FoldedVOp.getNode()) return FoldedVOp;
}
- // fold (fadd c1, c2) -> (fadd c1, c2)
+ // fold (fadd c1, c2) -> c1 + c2
if (N0CFP && N1CFP && VT != MVT::ppcf128)
return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1);
// canonicalize constant to RHS
@@ -5636,6 +5681,26 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
N0.getOperand(1), N1));
+ // FADD -> FMA combines:
+ if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ DAG.getTarget().Options.UnsafeFPMath) &&
+ DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+
+ // fold (fadd (fmul x, y), z) -> (fma x, y, z)
+ if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
+ return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+
+ // fold (fadd x, (fmul y, z)) -> (fma x, y, z)
+ // Note: Commutes FADD operands.
+ if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
+ return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+ N1.getOperand(0), N1.getOperand(1), N0);
+ }
+ }
+
return SDValue();
}
@@ -5645,6 +5710,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
// fold vector ops
if (VT.isVector()) {
@@ -5665,17 +5731,21 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
return GetNegatedExpression(N1, DAG, LegalOperations);
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
- return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N1);
+ return DAG.getNode(ISD::FNEG, dl, VT, N1);
}
// fold (fsub A, (fneg B)) -> (fadd A, B)
if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
- return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0,
+ return DAG.getNode(ISD::FADD, dl, VT, N0,
GetNegatedExpression(N1, DAG, LegalOperations));
// If 'unsafe math' is enabled, fold
+ // (fsub x, x) -> 0.0 &
// (fsub x, (fadd x, y)) -> (fneg y) &
// (fsub x, (fadd y, x)) -> (fneg y)
if (DAG.getTarget().Options.UnsafeFPMath) {
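+ // Note that x - x is NaN rather than 0.0 when x is NaN or an infinity,
+ // which is why the (fsub x, x) fold needs the unsafe-math gate.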
+ if (N0 == N1)
+ return DAG.getConstantFP(0.0f, VT);
+
if (N1.getOpcode() == ISD::FADD) {
SDValue N10 = N1->getOperand(0);
SDValue N11 = N1->getOperand(1);
@@ -5689,6 +5759,40 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
}
}
+ // FSUB -> FMA combines:
+ if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ DAG.getTarget().Options.UnsafeFPMath) &&
+ DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+
+ // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+ if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
+ return DAG.getNode(ISD::FMA, dl, VT,
+ N0.getOperand(0), N0.getOperand(1),
+ DAG.getNode(ISD::FNEG, dl, VT, N1));
+ }
+
+ // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+ // Note: Commutes FSUB operands.
+ if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
+ return DAG.getNode(ISD::FMA, dl, VT,
+ DAG.getNode(ISD::FNEG, dl, VT,
+ N1.getOperand(0)),
+ N1.getOperand(1), N0);
+ }
+
+ // fold (fsub (-(fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+ if (N0.getOpcode() == ISD::FNEG &&
+ N0.getOperand(0).getOpcode() == ISD::FMUL &&
+ N0->hasOneUse() && N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0).getOperand(0);
+ SDValue N01 = N0.getOperand(0).getOperand(1);
+ return DAG.getNode(ISD::FMA, dl, VT,
+ DAG.getNode(ISD::FNEG, dl, VT, N00), N01,
+ DAG.getNode(ISD::FNEG, dl, VT, N1));
+ }
+ }
+
return SDValue();
}
@@ -5720,6 +5824,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
if (DAG.getTarget().Options.UnsafeFPMath &&
ISD::isBuildVectorAllZeros(N1.getNode()))
return N1;
+ // fold (fmul A, 1.0) -> A
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return N0;
// fold (fmul X, 2.0) -> (fadd X, X)
if (N1CFP && N1CFP->isExactlyValue(+2.0))
return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0);
@@ -5753,6 +5860,26 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitFMA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+
+ // fold (fma 1.0, x, y) -> (fadd x, y)
+ if (N0CFP && N0CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N2);
+ // fold (fma x, 1.0, y) -> (fadd x, y)
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N2);
+
+ // Canonicalize (fma c, x, y) -> (fma x, c, y)
+ if (N0CFP && !N1CFP)
+ return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, N1, N0, N2);
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitFDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -5893,6 +6020,38 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
}
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+ // having to say they don't support SELECT_CC on every type the DAG knows
+ // about, since there is no way to mark an opcode illegal at all value types
+ // (See also visitSELECT)
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0, cc)
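+ // (A true i1 setcc result is all ones, i.e. -1 when interpreted as a
+ // signed value, hence sint_to_fp maps it to -1.0.)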
+ if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
+ !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(-1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+
+ // fold (sint_to_fp (zext (setcc x, y, cc))) ->
+ // (select_cc x, y, 1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
+ DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(0).getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+ }
+
return SDValue();
}
@@ -5918,6 +6077,25 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
}
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+ // having to say they don't support SELECT_CC on every type the DAG knows
+ // about, since there is no way to mark an opcode illegal at all value types
+ // (See also visitSELECT)
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, 1.0, 0.0, cc)
+
+ if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+ }
+
return SDValue();
}
@@ -6071,6 +6249,42 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitFCEIL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fceil c1) -> fceil(c1)
+ if (N0CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FCEIL, N->getDebugLoc(), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ftrunc c1) -> ftrunc(c1)
+ if (N0CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FTRUNC, N->getDebugLoc(), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ffloor c1) -> ffloor(c1)
+ if (N0CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FFLOOR, N->getDebugLoc(), VT, N0);
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitFABS(SDNode *N) {
SDValue N0 = N->getOperand(0);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
@@ -6185,7 +6399,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
}
// Replace the uses of SRL with SETCC
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
removeFromWorkList(N1.getNode());
DAG.DeleteNode(N1.getNode());
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -6214,7 +6428,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
Tmp.getNode()->dump(&DAG);
dbgs() << '\n');
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, Tmp, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
removeFromWorkList(TheXor);
DAG.DeleteNode(TheXor);
return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
@@ -6240,7 +6454,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
Equal ? ISD::SETEQ : ISD::SETNE);
// Replace the uses of XOR with SETCC
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
removeFromWorkList(N1.getNode());
DAG.DeleteNode(N1.getNode());
return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
@@ -6431,21 +6645,17 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
dbgs() << '\n');
WorkListRemover DeadNodes(*this);
if (isLoad) {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
}
// Finally, since the node is now dead, remove it from the graph.
DAG.DeleteNode(N);
// Replace the uses of Ptr with uses of the updated base value.
- DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
removeFromWorkList(Ptr.getNode());
DAG.DeleteNode(Ptr.getNode());
@@ -6559,13 +6769,10 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
dbgs() << '\n');
WorkListRemover DeadNodes(*this);
if (isLoad) {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
}
// Finally, since the node is now dead, remove it from the graph.
@@ -6573,8 +6780,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
// Replace the uses of Use with uses of the updated base value.
DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
- Result.getValue(isLoad ? 1 : 0),
- &DeadNodes);
+ Result.getValue(isLoad ? 1 : 0));
removeFromWorkList(Op);
DAG.DeleteNode(Op);
return true;
@@ -6609,7 +6815,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
Chain.getNode()->dump(&DAG);
dbgs() << "\n");
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
if (N->use_empty()) {
removeFromWorkList(N);
@@ -6629,11 +6835,10 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
Undef.getNode()->dump(&DAG);
dbgs() << " and 2 other values\n");
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
- DAG.getUNDEF(N->getValueType(1)),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain, &DeadNodes);
+ DAG.getUNDEF(N->getValueType(1)));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
removeFromWorkList(N);
DAG.DeleteNode(N);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -6955,8 +7160,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
AddToWorkList(NewLD.getNode());
AddToWorkList(NewVal.getNode());
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
++OpsNarrowed;
return NewST;
}
@@ -7013,8 +7217,7 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
AddToWorkList(NewLD.getNode());
AddToWorkList(NewST.getNode());
WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
++LdStFP2Int;
return NewST;
}
@@ -7058,7 +7261,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
SDValue Tmp;
switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unknown FP type");
- case MVT::f80: // We don't do this for these yet.
+ case MVT::f16: // We don't do this for these yet.
+ case MVT::f80:
case MVT::f128:
case MVT::ppcf128:
break;
@@ -7323,8 +7527,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
OrigElt -= NumElem;
}
+ EVT IndexTy = N->getOperand(1).getValueType();
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(), NVT,
- InVec, DAG.getConstant(OrigElt, MVT::i32));
+ InVec, DAG.getConstant(OrigElt, IndexTy));
}
// Perform only after legalization to ensure build_vector / vector_shuffle
@@ -7472,7 +7677,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
WorkListRemover DeadNodes(*this);
SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) };
SDValue To[] = { Load, Chain };
- DAG.ReplaceAllUsesOfValuesWith(From, To, 2, &DeadNodes);
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
// Since we're explicitly calling ReplaceAllUses, add the new node to the
// worklist explicitly as well.
AddToWorkList(Load.getNode());
@@ -7489,6 +7694,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
unsigned NumInScalars = N->getNumOperands();
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
+
+ // A vector built entirely of undefs is undef.
+ if (ISD::allOperandsUndef(N))
+ return DAG.getUNDEF(VT);
+
// Check to see if this is a BUILD_VECTOR of a bunch of values
// which come from any_extend or zero_extend nodes. If so, we can create
// a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
@@ -7496,12 +7706,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
// using shuffles.
EVT SourceType = MVT::Other;
bool AllAnyExt = true;
- bool AllUndef = true;
+
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue In = N->getOperand(i);
// Ignore undef inputs.
if (In.getOpcode() == ISD::UNDEF) continue;
- AllUndef = false;
bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
@@ -7529,9 +7738,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
AllAnyExt &= AnyExt;
}
- if (AllUndef)
- return DAG.getUNDEF(VT);
-
// In order to have valid types, all of the inputs must be extended from the
// same source type and all of the inputs must be any or zero extend.
// Scalar sizes must be a power of two.
@@ -7707,6 +7913,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
if (N->getNumOperands() == 1)
return N->getOperand(0);
+ // Check if all of the operands are undefs.
+ if (ISD::allOperandsUndef(N))
+ return DAG.getUNDEF(N->getValueType(0));
+
return SDValue();
}
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 0c1ac69..683fac6 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -40,6 +40,7 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "isel"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Instructions.h"
@@ -51,10 +52,10 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -484,7 +485,7 @@ bool FastISel::SelectGetElementPtr(const User *I) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
if (CI->isZero()) continue;
// N = N + Offset
- TotalOffs +=
+ TotalOffs +=
TD.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
if (TotalOffs >= MaxOffs) {
N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
@@ -573,7 +574,10 @@ bool FastISel::SelectCall(const User *I) {
// At -O0 we don't care about the lifetime intrinsics.
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
+ // The donothing intrinsic does, well, nothing.
+ case Intrinsic::donothing:
return true;
+
case Intrinsic::dbg_declare: {
const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call);
if (!DIVariable(DI->getVariable()).Verify() ||
@@ -642,7 +646,7 @@ bool FastISel::SelectCall(const User *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
.addCImm(CI).addImm(DI->getOffset())
.addMetadata(DI->getVariable());
- else
+ else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
.addImm(CI->getZExtValue()).addImm(DI->getOffset())
.addMetadata(DI->getVariable());
@@ -786,13 +790,24 @@ FastISel::SelectInstruction(const Instruction *I) {
MachineBasicBlock::iterator SavedInsertPt = FuncInfo.InsertPt;
+ // As a special case, don't handle calls to builtin library functions that
+ // may be translated directly to target instructions.
+ if (const CallInst *Call = dyn_cast<CallInst>(I)) {
+ const Function *F = Call->getCalledFunction();
+ LibFunc::Func Func;
+ if (F && !F->hasLocalLinkage() && F->hasName() &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func))
+ return false;
+ }
+
// First, try doing target-independent selection.
if (SelectOperator(I, I->getOpcode())) {
++NumFastIselSuccessIndependent;
DL = DebugLoc();
return true;
}
- // Remove dead code. However, ignore call instructions since we've flushed
+ // Remove dead code. However, ignore call instructions since we've flushed
// the local value map and recomputed the insert point.
if (!isa<CallInst>(I)) {
recomputeInsertPt();
@@ -1037,7 +1052,8 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) {
}
}
-FastISel::FastISel(FunctionLoweringInfo &funcInfo)
+FastISel::FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
: FuncInfo(funcInfo),
MRI(FuncInfo.MF->getRegInfo()),
MFI(*FuncInfo.MF->getFrameInfo()),
@@ -1046,7 +1062,8 @@ FastISel::FastISel(FunctionLoweringInfo &funcInfo)
TD(*TM.getTargetData()),
TII(*TM.getInstrInfo()),
TLI(*TM.getTargetLowering()),
- TRI(*TM.getRegisterInfo()) {
+ TRI(*TM.getRegisterInfo()),
+ LibInfo(libInfo) {
}
FastISel::~FastISel() {}
@@ -1306,6 +1323,30 @@ unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
return ResultReg;
}
+unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill,
+ uint64_t Imm1, uint64_t Imm2) {
+ unsigned ResultReg = createResultReg(RC);
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+ .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addReg(Op1, Op1IsKill * RegState::Kill)
+ .addImm(Imm1).addImm(Imm2);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addReg(Op1, Op1IsKill * RegState::Kill)
+ .addImm(Imm1).addImm(Imm2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
uint64_t Imm) {
@@ -1345,6 +1386,8 @@ unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT,
unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
"Cannot yet extract from physregs");
+ const TargetRegisterClass *RC = MRI.getRegClass(Op0);
+ MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(TargetOpcode::COPY), ResultReg)
.addReg(Op0, getKillRegState(Op0IsKill), Idx);
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 8dde919..3e18ea7 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -15,13 +15,13 @@
#define DEBUG_TYPE "function-lowering-info"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 1467d88..4488d27 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -48,16 +48,31 @@ unsigned InstrEmitter::CountResults(SDNode *Node) {
return N;
}
-/// CountOperands - The inputs to target nodes have any actual inputs first,
+/// countOperands - The inputs to target nodes have any actual inputs first,
/// followed by an optional chain operand, then an optional glue operand.
/// Compute the number of actual operands that will go into the resulting
/// MachineInstr.
-unsigned InstrEmitter::CountOperands(SDNode *Node) {
+///
+/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding
+/// the chain and glue. These operands may be implicit on the machine instr.
+static unsigned countOperands(SDNode *Node, unsigned &NumImpUses) {
unsigned N = Node->getNumOperands();
while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
--N;
if (N && Node->getOperand(N - 1).getValueType() == MVT::Other)
--N; // Ignore chain if it exists.
+
+ // Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses.
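+ // For example (hypothetical call node): operands
+ // (tglobaladdr, Reg:R0, Reg:R1, RegMask, Chain, Glue) give N == 4 and
+ // NumImpUses == 3; R0, R1 and the regmask may become implicit
+ // operands on the resulting MachineInstr.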
+ for (unsigned I = N; I; --I) {
+ if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1)))
+ continue;
+ if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1)))
+ if (TargetRegisterInfo::isPhysicalRegister(RN->getReg()))
+ continue;
+ NumImpUses = N - I;
+ break;
+ }
+
return N;
}
@@ -114,8 +129,10 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
if (User->isMachineOpcode()) {
const MCInstrDesc &II = TII->get(User->getMachineOpcode());
const TargetRegisterClass *RC = 0;
- if (i+II.getNumDefs() < II.getNumOperands())
- RC = TII->getRegClass(II, i+II.getNumDefs(), TRI);
+ if (i+II.getNumDefs() < II.getNumOperands()) {
+ RC = TRI->getAllocatableClass(
+ TII->getRegClass(II, i+II.getNumDefs(), TRI, *MF));
+ }
if (!UseRC)
UseRC = RC;
else if (RC) {
@@ -196,7 +213,8 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, MachineInstr *MI,
// is a vreg in the same register class, use the CopyToReg'd destination
// register instead of creating a new vreg.
unsigned VRBase = 0;
- const TargetRegisterClass *RC = TII->getRegClass(II, i, TRI);
+ const TargetRegisterClass *RC =
+ TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
if (II.OpInfo[i].isOptionalDef()) {
// Optional def must be a physical register.
unsigned NumResults = CountResults(Node);
@@ -293,7 +311,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
if (II) {
const TargetRegisterClass *DstRC = 0;
if (IIOpNum < II->getNumOperands())
- DstRC = TII->getRegClass(*II, IIOpNum, TRI);
+ DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF));
assert((DstRC || (MI->isVariadic() && IIOpNum >= MCID.getNumOperands())) &&
"Don't have operand info for this instruction!");
if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) {
@@ -334,8 +352,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
/// AddOperand - Add the specified operand to the specified machine instr. II
/// specifies the instruction information for the node, and IIOpNum is the
-/// operand number (in the II) that we are adding. IIOpNum and II are used for
-/// assertions only.
+/// operand number (in the II) that we are adding.
void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
unsigned IIOpNum,
const MCInstrDesc *II,
@@ -350,7 +367,11 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
const ConstantFP *CFP = F->getConstantFPValue();
MI->addOperand(MachineOperand::CreateFPImm(CFP));
} else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
- MI->addOperand(MachineOperand::CreateReg(R->getReg(), false));
+ // Turn additional physreg operands into implicit uses on non-variadic
+ // instructions. This is used by call and return instructions passing
+ // arguments in registers.
+ bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic());
+ MI->addOperand(MachineOperand::CreateReg(R->getReg(), false, Imp));
} else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) {
MI->addOperand(MachineOperand::CreateRegMask(RM->getRegMask()));
} else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
@@ -390,6 +411,10 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
} else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) {
MI->addOperand(MachineOperand::CreateBA(BA->getBlockAddress(),
BA->getTargetFlags()));
+ } else if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateTargetIndex(TI->getIndex(),
+ TI->getOffset(),
+ TI->getTargetFlags()));
} else {
assert(Op.getValueType() != MVT::Other &&
Op.getValueType() != MVT::Glue &&
@@ -458,7 +483,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
unsigned SrcReg, DstReg, DefSubIdx;
if (DefMI &&
TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) &&
- SubIdx == DefSubIdx) {
+ SubIdx == DefSubIdx &&
+ TRC == MRI->getRegClass(SrcReg)) {
// Optimize these:
// r1025 = s/zext r1024, 4
// r1026 = extract_subreg r1025, 4
@@ -467,6 +493,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
VRBase = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
+ MRI->clearKillFlags(SrcReg);
} else {
// VReg may not support a SubIdx sub-register, and we may need to
// constrain its register class or issue a COPY to a compatible register
@@ -548,7 +575,8 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
// Create the new VReg in the destination class and emit a copy.
unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- const TargetRegisterClass *DstRC = TRI->getRegClass(DstRCIdx);
+ const TargetRegisterClass *DstRC =
+ TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx));
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
NewVReg).addReg(VReg);
@@ -566,7 +594,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node,
bool IsClone, bool IsCloned) {
unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
- unsigned NewVReg = MRI->createVirtualRegister(RC);
+ unsigned NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC));
MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(),
TII->get(TargetOpcode::REG_SEQUENCE), NewVReg);
unsigned NumOps = Node->getNumOperands();
@@ -691,7 +719,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
const MCInstrDesc &II = TII->get(Opc);
unsigned NumResults = CountResults(Node);
- unsigned NodeOperands = CountOperands(Node);
+ unsigned NumImpUses = 0;
+ unsigned NodeOperands = countOperands(Node, NumImpUses);
bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
#ifndef NDEBUG
unsigned NumMIOperands = NodeOperands + NumResults;
@@ -700,7 +729,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
"Too few operands for a variadic node!");
else
assert(NumMIOperands >= II.getNumOperands() &&
- NumMIOperands <= II.getNumOperands()+II.getNumImplicitDefs() &&
+ NumMIOperands <= II.getNumOperands() + II.getNumImplicitDefs() +
+ NumImpUses &&
"#operands for dag node doesn't match .td file!");
#endif
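countOperands now reports, via NumImpUses, how many trailing physical-register (or regmask) operands precede the chain/glue, so EmitMachineNode can account for them as implicit uses in its operand-count assertion. A standalone model of the same backward scan, with illustrative enums in place of SDNode operands:

    #include <cstdio>
    #include <vector>

    // Standalone model of the countOperands logic above (illustrative types,
    // not LLVM's). Operands end with optional chain/glue; any run of physreg
    // or regmask operands just before them becomes implicit uses on the MI.
    enum Kind { Value, PhysReg, RegMask, Chain, Glue };

    static unsigned countOperands(const std::vector<Kind> &Ops,
                                  unsigned &NumImpUses) {
      unsigned N = static_cast<unsigned>(Ops.size());
      while (N && Ops[N - 1] == Glue)  --N;   // drop trailing glue
      if (N && Ops[N - 1] == Chain)    --N;   // drop the chain, if any

      NumImpUses = 0;
      for (unsigned I = N; I; --I) {          // scan the tail for physregs
        if (Ops[I - 1] == PhysReg || Ops[I - 1] == RegMask)
          continue;
        NumImpUses = N - I;                   // trailing physreg run length
        break;
      }
      return N;
    }

    int main() {
      std::vector<Kind> Ops = {Value, Value, PhysReg, PhysReg, Chain, Glue};
      unsigned NumImpUses = 0;
      unsigned N = countOperands(Ops, NumImpUses);
      std::printf("operands=%u implicit uses=%u\n", N, NumImpUses); // 4 and 2
    }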
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h
index c081f38..9eddee9 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -105,12 +105,6 @@ public:
/// (which do not go into the machine instrs.)
static unsigned CountResults(SDNode *Node);
- /// CountOperands - The inputs to target nodes have any actual inputs first,
- /// followed by an optional chain operand, then flag operands. Compute
- /// the number of actual operands that will go into the resulting
- /// MachineInstr.
- static unsigned CountOperands(SDNode *Node);
-
/// EmitDbgValue - Generate machine instruction for a dbg_value node.
///
MachineInstr *EmitDbgValue(SDDbgValue *SD,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index a96a997..908ebb9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -11,7 +11,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -20,10 +24,6 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -70,6 +70,9 @@ private:
SDValue OptimizeFloatStore(StoreSDNode *ST);
+ void LegalizeLoadOps(SDNode *Node);
+ void LegalizeStoreOps(SDNode *Node);
+
/// PerformInsertVectorEltInMemory - Some target cannot handle a variable
/// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
/// is necessary to spill the vector being inserted into to memory, perform
@@ -150,21 +153,21 @@ public:
// Node replacement helpers
void ReplacedNode(SDNode *N) {
if (N->use_empty()) {
- DAG.RemoveDeadNode(N, this);
+ DAG.RemoveDeadNode(N);
} else {
ForgetNode(N);
}
}
void ReplaceNode(SDNode *Old, SDNode *New) {
- DAG.ReplaceAllUsesWith(Old, New, this);
+ DAG.ReplaceAllUsesWith(Old, New);
ReplacedNode(Old);
}
void ReplaceNode(SDValue Old, SDValue New) {
- DAG.ReplaceAllUsesWith(Old, New, this);
+ DAG.ReplaceAllUsesWith(Old, New);
ReplacedNode(Old.getNode());
}
void ReplaceNode(SDNode *Old, const SDValue *New) {
- DAG.ReplaceAllUsesWith(Old, New, this);
+ DAG.ReplaceAllUsesWith(Old, New);
ReplacedNode(Old);
}
};
@@ -203,7 +206,8 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl,
}
SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag)
- : TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
+ : SelectionDAG::DAGUpdateListener(dag),
+ TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
DAG(dag) {
}
@@ -424,7 +428,7 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
DebugLoc dl = LD->getDebugLoc();
if (VT.isFloatingPoint() || VT.isVector()) {
EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
- if (TLI.isTypeLegal(intVT)) {
+ if (TLI.isTypeLegal(intVT) && TLI.isTypeLegal(LoadedVT)) {
// Expand to a (misaligned) integer load of the same size,
// then bitconvert to floating point or vector.
SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getPointerInfo(),
@@ -432,8 +436,9 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
LD->isNonTemporal(),
LD->isInvariant(), LD->getAlignment());
SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
- if (VT.isFloatingPoint() && LoadedVT != VT)
- Result = DAG.getNode(ISD::FP_EXTEND, dl, VT, Result);
+ if (LoadedVT != VT)
+ Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
+ ISD::ANY_EXTEND, dl, VT, Result);
ValResult = Result;
ChainResult = Chain;
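With this hunk, ExpandUnalignedLoad handles a misaligned float or vector load by issuing an integer load of the same width, bitcasting, and widening with ANY_EXTEND as well as FP_EXTEND. A self-contained sketch of the load-as-integer-then-bitcast idea, using memcpy to stand in for the misaligned integer load:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Standalone illustration of the expansion above: a misaligned f32 load
    // becomes a (byte-wise) i32 load of the same size, then a bitcast back
    // to float. memcpy stands in for the misaligned integer load.
    static float loadFloatUnaligned(const unsigned char *P) {
      uint32_t Bits;
      std::memcpy(&Bits, P, sizeof(Bits)); // integer load, any alignment
      float F;
      std::memcpy(&F, &Bits, sizeof(F));   // ISD::BITCAST equivalent
      return F;
    }

    int main() {
      unsigned char Buf[8] = {0};
      float V = 3.5f;
      std::memcpy(Buf + 1, &V, sizeof(V)); // deliberately misaligned
      std::printf("%g\n", loadFloatUnaligned(Buf + 1));
    }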
@@ -638,9 +643,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
// probably means that we need to integrate dag combiner and legalizer
// together.
// We generally can't do this one for long doubles.
- SDValue Tmp1 = ST->getChain();
- SDValue Tmp2 = ST->getBasePtr();
- SDValue Tmp3;
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
unsigned Alignment = ST->getAlignment();
bool isVolatile = ST->isVolatile();
bool isNonTemporal = ST->isNonTemporal();
@@ -648,19 +652,19 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
if (CFP->getValueType(0) == MVT::f32 &&
TLI.isTypeLegal(MVT::i32)) {
- Tmp3 = DAG.getConstant(CFP->getValueAPF().
+ SDValue Con = DAG.getConstant(CFP->getValueAPF().
bitcastToAPInt().zextOrTrunc(32),
MVT::i32);
- return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(),
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
isVolatile, isNonTemporal, Alignment);
}
if (CFP->getValueType(0) == MVT::f64) {
// If this target supports 64-bit registers, do a single 64-bit store.
if (TLI.isTypeLegal(MVT::i64)) {
- Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+ SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
zextOrTrunc(64), MVT::i64);
- return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(),
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
isVolatile, isNonTemporal, Alignment);
}
@@ -673,11 +677,11 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32);
if (TLI.isBigEndian()) std::swap(Lo, Hi);
- Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getPointerInfo(), isVolatile,
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
isNonTemporal, Alignment);
- Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getIntPtrConstant(4));
- Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2,
+ Hi = DAG.getStore(Chain, dl, Hi, Ptr,
ST->getPointerInfo().getWithOffset(4),
isVolatile, isNonTemporal, MinAlign(Alignment, 4U));
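When i64 is not legal, OptimizeFloatStore splits an f64 constant store into two i32 word stores, swapping the halves on big-endian targets. The arithmetic, as a standalone sketch:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Standalone sketch of the f64 split-store above: bitcast the double to
    // i64, then store it as two i32 words, swapping Lo/Hi on big-endian.
    static void storeDoubleAsWords(double D, uint32_t Words[2], bool BigEndian) {
      uint64_t Bits;
      std::memcpy(&Bits, &D, sizeof(Bits));      // bitcastToAPInt equivalent
      uint32_t Lo = static_cast<uint32_t>(Bits); // low 32 bits
      uint32_t Hi = static_cast<uint32_t>(Bits >> 32);
      if (BigEndian) { uint32_t T = Lo; Lo = Hi; Hi = T; }
      Words[0] = Lo;   // store at Ptr
      Words[1] = Hi;   // store at Ptr + 4
    }

    int main() {
      uint32_t W[2];
      storeDoubleAsWords(1.0, W, /*BigEndian=*/false);
      std::printf("%08x %08x\n", (unsigned)W[0], (unsigned)W[1]);
      // prints: 00000000 3ff00000
    }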
@@ -688,14 +692,448 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
return SDValue(0, 0);
}
+void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ DebugLoc dl = Node->getDebugLoc();
+
+ unsigned Alignment = ST->getAlignment();
+ bool isVolatile = ST->isVolatile();
+ bool isNonTemporal = ST->isNonTemporal();
+
+ if (!ST->isTruncatingStore()) {
+ if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
+ ReplaceNode(ST, OptStore);
+ return;
+ }
+
+ {
+ SDValue Value = ST->getValue();
+ EVT VT = Value.getValueType();
+ switch (TLI.getOperationAction(ISD::STORE, VT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal:
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+ Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
+ if (ST->getAlignment() < ABIAlignment)
+ ExpandUnalignedStore(cast<StoreSDNode>(Node),
+ DAG, TLI, this);
+ }
+ break;
+ case TargetLowering::Custom: {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res.getNode())
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+ case TargetLowering::Promote: {
+ assert(VT.isVector() && "Unknown legal promote case!");
+ Value = DAG.getNode(ISD::BITCAST, dl,
+ TLI.getTypeToPromoteTo(ISD::STORE, VT), Value);
+ SDValue Result =
+ DAG.getStore(Chain, dl, Value, Ptr,
+ ST->getPointerInfo(), isVolatile,
+ isNonTemporal, Alignment);
+ ReplaceNode(SDValue(Node, 0), Result);
+ break;
+ }
+ }
+ return;
+ }
+ } else {
+ SDValue Value = ST->getValue();
+
+ EVT StVT = ST->getMemoryVT();
+ unsigned StWidth = StVT.getSizeInBits();
+
+ if (StWidth != StVT.getStoreSizeInBits()) {
+ // Promote to a byte-sized store with upper bits zero if not
+ // storing an integral number of bytes. For example, promote
+ // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+ StVT.getStoreSizeInBits());
+ Value = DAG.getZeroExtendInReg(Value, dl, StVT);
+ SDValue Result =
+ DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ NVT, isVolatile, isNonTemporal, Alignment);
+ ReplaceNode(SDValue(Node, 0), Result);
+ } else if (StWidth & (StWidth - 1)) {
+ // If not storing a power-of-2 number of bits, expand as two stores.
+ assert(!StVT.isVector() && "Unsupported truncstore!");
+ unsigned RoundWidth = 1 << Log2_32(StWidth);
+ assert(RoundWidth < StWidth);
+ unsigned ExtraWidth = StWidth - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Store size not an integral number of bytes!");
+ EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+ EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+ SDValue Lo, Hi;
+ unsigned IncrementSize;
+
+ if (TLI.isLittleEndian()) {
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
+ // Store the bottom RoundWidth bits.
+ Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ RoundVT,
+ isVolatile, isNonTemporal, Alignment);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(RoundWidth,
+ TLI.getShiftAmountTy(Value.getValueType())));
+ Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, isVolatile, isNonTemporal,
+ MinAlign(Alignment, IncrementSize));
+ } else {
+ // Big endian - avoid unaligned stores.
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
+ // Store the top RoundWidth bits.
+ Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(ExtraWidth,
+ TLI.getShiftAmountTy(Value.getValueType())));
+ Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
+ RoundVT, isVolatile, isNonTemporal, Alignment);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, isVolatile, isNonTemporal,
+ MinAlign(Alignment, IncrementSize));
+ }
+
+ // The order of the stores doesn't matter.
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ ReplaceNode(SDValue(Node, 0), Result);
+ } else {
+ switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal:
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+ Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
+ if (ST->getAlignment() < ABIAlignment)
+ ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
+ }
+ break;
+ case TargetLowering::Custom: {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res.getNode())
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+ case TargetLowering::Expand:
+ assert(!StVT.isVector() &&
+ "Vector Stores are handled in LegalizeVectorOps");
+
+ // TRUNCSTORE:i16 i32 -> STORE i16
+ assert(TLI.isTypeLegal(StVT) &&
+ "Do not know how to expand this store!");
+ Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
+ SDValue Result =
+ DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ isVolatile, isNonTemporal, Alignment);
+ ReplaceNode(SDValue(Node, 0), Result);
+ break;
+ }
+ }
+ }
+}
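A sub-byte truncating store such as TRUNCSTORE:i1 is promoted to a byte-sized store with the upper bits zeroed, as the comment in LegalizeStoreOps notes. The masking step (DAG.getZeroExtendInReg) reduces to a simple AND; a minimal standalone sketch:

    #include <cstdio>
    #include <cstdint>

    // Standalone sketch of the sub-byte truncstore promotion above:
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1). The value is masked to
    // StWidth bits (getZeroExtendInReg) and stored in the full byte.
    static uint8_t promoteTruncStore(uint32_t Value, unsigned StWidth) {
      uint32_t Mask = (1u << StWidth) - 1;   // keep only the stored bits
      return static_cast<uint8_t>(Value & Mask);
    }

    int main() {
      // An i1 store of 0xFF keeps only bit 0 and stores 1 in the byte.
      std::printf("%u\n", (unsigned)promoteTruncStore(0xFFu, 1));
    }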
+
+void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ SDValue Chain = LD->getChain(); // The chain.
+ SDValue Ptr = LD->getBasePtr(); // The base pointer.
+ SDValue Value; // The value returned by the load op.
+ DebugLoc dl = Node->getDebugLoc();
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD) {
+ EVT VT = Node->getValueType(0);
+ SDValue RVal = SDValue(Node, 0);
+ SDValue RChain = SDValue(Node, 1);
+
+ switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal:
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+ Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment =
+ TLI.getTargetData()->getABITypeAlignment(Ty);
+ if (LD->getAlignment() < ABIAlignment){
+ ExpandUnalignedLoad(cast<LoadSDNode>(Node),
+ DAG, TLI, RVal, RChain);
+ }
+ }
+ break;
+ case TargetLowering::Custom: {
+ SDValue Res = TLI.LowerOperation(RVal, DAG);
+ if (Res.getNode()) {
+ RVal = Res;
+ RChain = Res.getValue(1);
+ }
+ break;
+ }
+ case TargetLowering::Promote: {
+ // Only promote a load of vector type to another.
+ assert(VT.isVector() && "Cannot promote this load!");
+ // Change base type to a different vector type.
+ EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+
+ SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
+ LD->isVolatile(), LD->isNonTemporal(),
+ LD->isInvariant(), LD->getAlignment());
+ RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
+ RChain = Res.getValue(1);
+ break;
+ }
+ }
+ if (RChain.getNode() != Node) {
+ assert(RVal.getNode() != Node && "Load must be completely replaced");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain);
+ ReplacedNode(Node);
+ }
+ return;
+ }
+
+ EVT SrcVT = LD->getMemoryVT();
+ unsigned SrcWidth = SrcVT.getSizeInBits();
+ unsigned Alignment = LD->getAlignment();
+ bool isVolatile = LD->isVolatile();
+ bool isNonTemporal = LD->isNonTemporal();
+
+ if (SrcWidth != SrcVT.getStoreSizeInBits() &&
+ // Some targets pretend to have an i1 loading operation, and actually
+ // load an i8. This trick is correct for ZEXTLOAD because the top 7
+ // bits are guaranteed to be zero; it helps the optimizers understand
+ // that these bits are zero. It is also useful for EXTLOAD, since it
+ // tells the optimizers that those bits are undefined. It would be
+ // nice to have an effective generic way of getting these benefits...
+ // Until such a way is found, don't insist on promoting i1 here.
+ (SrcVT != MVT::i1 ||
+ TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) {
+ // Promote to a byte-sized load if not loading an integral number of
+ // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
+ unsigned NewWidth = SrcVT.getStoreSizeInBits();
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
+ SDValue Ch;
+
+ // The extra bits are guaranteed to be zero, since we stored them that
+ // way. A zext load from NVT thus automatically gives zext from SrcVT.
+
+ ISD::LoadExtType NewExtType =
+ ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
+
+ SDValue Result =
+ DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
+ Chain, Ptr, LD->getPointerInfo(),
+ NVT, isVolatile, isNonTemporal, Alignment);
+
+ Ch = Result.getValue(1); // The chain.
+
+ if (ExtType == ISD::SEXTLOAD)
+ // Having the top bits zero doesn't help when sign extending.
+ Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType())
+ // All the top bits are guaranteed to be zero - inform the optimizers.
+ Result = DAG.getNode(ISD::AssertZext, dl,
+ Result.getValueType(), Result,
+ DAG.getValueType(SrcVT));
+
+ Value = Result;
+ Chain = Ch;
+ } else if (SrcWidth & (SrcWidth - 1)) {
+ // If not loading a power-of-2 number of bits, expand as two loads.
+ assert(!SrcVT.isVector() && "Unsupported extload!");
+ unsigned RoundWidth = 1 << Log2_32(SrcWidth);
+ assert(RoundWidth < SrcWidth);
+ unsigned ExtraWidth = SrcWidth - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Load size not an integral number of bytes!");
+ EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+ EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+ SDValue Lo, Hi, Ch;
+ unsigned IncrementSize;
+
+ if (TLI.isLittleEndian()) {
+ // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
+ // Load the bottom RoundWidth bits.
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
+ Chain, Ptr,
+ LD->getPointerInfo(), RoundVT, isVolatile,
+ isNonTemporal, Alignment);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, isVolatile, isNonTemporal,
+ MinAlign(Alignment, IncrementSize));
+
+ // Build a factor node to remember that this load is independent of
+ // the other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(RoundWidth,
+ TLI.getShiftAmountTy(Hi.getValueType())));
+
+ // Join the hi and lo parts.
+ Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ } else {
+ // Big endian - avoid unaligned loads.
+ // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
+ // Load the top RoundWidth bits.
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), RoundVT, isVolatile,
+ isNonTemporal, Alignment);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
+ dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, isVolatile, isNonTemporal,
+ MinAlign(Alignment, IncrementSize));
+
+ // Build a factor node to remember that this load is independent of
+ // the other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(ExtraWidth,
+ TLI.getShiftAmountTy(Hi.getValueType())));
+
+ // Join the hi and lo parts.
+ Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ }
+
+ Chain = Ch;
+ } else {
+ bool isCustom = false;
+ switch (TLI.getLoadExtAction(ExtType, SrcVT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Custom:
+ isCustom = true;
+ // FALLTHROUGH
+ case TargetLowering::Legal: {
+ Value = SDValue(Node, 0);
+ Chain = SDValue(Node, 1);
+
+ if (isCustom) {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res.getNode()) {
+ Value = Res;
+ Chain = Res.getValue(1);
+ }
+ } else {
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+ Type *Ty =
+ LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment =
+ TLI.getTargetData()->getABITypeAlignment(Ty);
+ if (LD->getAlignment() < ABIAlignment){
+ ExpandUnalignedLoad(cast<LoadSDNode>(Node),
+ DAG, TLI, Value, Chain);
+ }
+ }
+ }
+ break;
+ }
+ case TargetLowering::Expand:
+ if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) {
+ SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr,
+ LD->getPointerInfo(),
+ LD->isVolatile(), LD->isNonTemporal(),
+ LD->isInvariant(), LD->getAlignment());
+ unsigned ExtendOp;
+ switch (ExtType) {
+ case ISD::EXTLOAD:
+ ExtendOp = (SrcVT.isFloatingPoint() ?
+ ISD::FP_EXTEND : ISD::ANY_EXTEND);
+ break;
+ case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break;
+ case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break;
+ default: llvm_unreachable("Unexpected extend load type!");
+ }
+ Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
+ Chain = Load.getValue(1);
+ break;
+ }
+
+ assert(!SrcVT.isVector() &&
+ "Vector Loads are handled in LegalizeVectorOps");
+
+ // FIXME: This does not work for vectors on most targets. Sign- and
+ // zero-extend operations are currently folded into extending loads,
+ // whether they are legal or not, and then we end up here without any
+ // support for legalizing them.
+ assert(ExtType != ISD::EXTLOAD &&
+ "EXTLOAD should always be supported!");
+ // Turn the unsupported load into an EXTLOAD followed by an explicit
+ // zero/sign extend inreg.
+ SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0),
+ Chain, Ptr, LD->getPointerInfo(), SrcVT,
+ LD->isVolatile(), LD->isNonTemporal(),
+ LD->getAlignment());
+ SDValue ValRes;
+ if (ExtType == ISD::SEXTLOAD)
+ ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else
+ ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
+ Value = ValRes;
+ Chain = Result.getValue(1);
+ break;
+ }
+ }
+
+ // Since loads produce two values, make sure to remember that we legalized
+ // both of them.
+ if (Chain.getNode() != Node) {
+ assert(Value.getNode() != Node && "Load must be completely replaced");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+ ReplacedNode(Node);
+ }
+}
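For a width that is not a power of two, such as i24, LegalizeLoadOps splits the load into a RoundWidth part and an ExtraWidth part and ORs them back together. A standalone little-endian model of that arithmetic (plain byte reads stand in for the two extloads):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static unsigned log2u(unsigned V) {          // Log2_32 equivalent
      unsigned L = 0;
      while (V >>= 1) ++L;
      return L;
    }

    // Standalone model of the little-endian i24 extload expansion above:
    // load the bottom RoundWidth bits zero-extended, load the remaining
    // ExtraWidth bits at Ptr + RoundWidth/8, shift them up, and OR.
    static uint32_t loadI24LE(const unsigned char *P) {
      const unsigned SrcWidth   = 24;
      const unsigned RoundWidth = 1u << log2u(SrcWidth);   // 16
      const unsigned ExtraWidth = SrcWidth - RoundWidth;   // 8
      assert(ExtraWidth < RoundWidth);

      uint32_t Lo = P[0] | (static_cast<uint32_t>(P[1]) << 8); // ZEXTLOAD:i16
      uint32_t Hi = P[RoundWidth / 8];                         // EXTLOAD@+2:i8
      return Lo | (Hi << RoundWidth);                          // join parts
    }

    int main() {
      unsigned char Buf[3] = {0x78, 0x56, 0x34};
      std::printf("0x%06x\n", (unsigned)loadI24LE(Buf)); // 0x345678
    }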
+
/// LegalizeOp - Return a legal replacement for the given operation, with
/// all legal operands.
void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes.
return;
- DebugLoc dl = Node->getDebugLoc();
-
for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
TargetLowering::TypeLegal &&
@@ -708,9 +1146,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
Node->getOperand(i).getOpcode() == ISD::TargetConstant) &&
"Unexpected illegal type!");
- SDValue Tmp1, Tmp2, Tmp3, Tmp4;
- bool isCustom = false;
-
// Figure out the correct action; the way to query this varies by opcode
TargetLowering::LegalizeAction Action = TargetLowering::Legal;
bool SimpleFinishLegalizing = true;
@@ -816,9 +1251,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
}
if (SimpleFinishLegalizing) {
- SmallVector<SDValue, 8> Ops;
- for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
- Ops.push_back(Node->getOperand(i));
+ SDNode *NewNode = Node;
switch (Node->getOpcode()) {
default: break;
case ISD::SHL:
@@ -828,11 +1261,14 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::ROTR:
// Legalizing shifts/rotates requires adjusting the shift amount
// to the appropriate width.
- if (!Ops[1].getValueType().isVector()) {
- SDValue SAO = DAG.getShiftAmountOperand(Ops[0].getValueType(), Ops[1]);
+ if (!Node->getOperand(1).getValueType().isVector()) {
+ SDValue SAO =
+ DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(),
+ Node->getOperand(1));
HandleSDNode Handle(SAO);
LegalizeOp(SAO.getNode());
- Ops[1] = Handle.getValue();
+ NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0),
+ Handle.getValue());
}
break;
case ISD::SRL_PARTS:
@@ -840,18 +1276,21 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::SHL_PARTS:
// Legalizing shifts/rotates requires adjusting the shift amount
// to the appropriate width.
- if (!Ops[2].getValueType().isVector()) {
- SDValue SAO = DAG.getShiftAmountOperand(Ops[0].getValueType(), Ops[2]);
+ if (!Node->getOperand(2).getValueType().isVector()) {
+ SDValue SAO =
+ DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(),
+ Node->getOperand(2));
HandleSDNode Handle(SAO);
LegalizeOp(SAO.getNode());
- Ops[2] = Handle.getValue();
+ NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0),
+ Node->getOperand(1),
+ Handle.getValue());
}
break;
}
- SDNode *NewNode = DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
if (NewNode != Node) {
- DAG.ReplaceAllUsesWith(Node, NewNode, this);
+ DAG.ReplaceAllUsesWith(Node, NewNode);
for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
DAG.TransferDbgValues(SDValue(Node, i), SDValue(NewNode, i));
ReplacedNode(Node);
@@ -860,27 +1299,27 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
switch (Action) {
case TargetLowering::Legal:
return;
- case TargetLowering::Custom:
+ case TargetLowering::Custom: {
// FIXME: The handling for custom lowering with multiple results is
// a complete mess.
- Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG);
- if (Tmp1.getNode()) {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res.getNode()) {
SmallVector<SDValue, 8> ResultVals;
for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) {
if (e == 1)
- ResultVals.push_back(Tmp1);
+ ResultVals.push_back(Res);
else
- ResultVals.push_back(Tmp1.getValue(i));
+ ResultVals.push_back(Res.getValue(i));
}
- if (Tmp1.getNode() != Node || Tmp1.getResNo() != 0) {
- DAG.ReplaceAllUsesWith(Node, ResultVals.data(), this);
+ if (Res.getNode() != Node || Res.getResNo() != 0) {
+ DAG.ReplaceAllUsesWith(Node, ResultVals.data());
for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
DAG.TransferDbgValues(SDValue(Node, i), ResultVals[i]);
ReplacedNode(Node);
}
return;
}
-
+ }
// FALL THROUGH
case TargetLowering::Expand:
ExpandNode(Node);
@@ -904,428 +1343,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::CALLSEQ_END:
break;
case ISD::LOAD: {
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- Tmp1 = LD->getChain(); // Legalize the chain.
- Tmp2 = LD->getBasePtr(); // Legalize the base pointer.
-
- ISD::LoadExtType ExtType = LD->getExtensionType();
- if (ExtType == ISD::NON_EXTLOAD) {
- EVT VT = Node->getValueType(0);
- Tmp3 = SDValue(Node, 0);
- Tmp4 = SDValue(Node, 1);
-
- switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
- default: llvm_unreachable("This action is not supported yet!");
- case TargetLowering::Legal:
- // If this is an unaligned load and the target doesn't support it,
- // expand it.
- if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
- Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty);
- if (LD->getAlignment() < ABIAlignment){
- ExpandUnalignedLoad(cast<LoadSDNode>(Node),
- DAG, TLI, Tmp3, Tmp4);
- }
- }
- break;
- case TargetLowering::Custom:
- Tmp1 = TLI.LowerOperation(Tmp3, DAG);
- if (Tmp1.getNode()) {
- Tmp3 = Tmp1;
- Tmp4 = Tmp1.getValue(1);
- }
- break;
- case TargetLowering::Promote: {
- // Only promote a load of vector type to another.
- assert(VT.isVector() && "Cannot promote this load!");
- // Change base type to a different vector type.
- EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
-
- Tmp1 = DAG.getLoad(NVT, dl, Tmp1, Tmp2, LD->getPointerInfo(),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
- Tmp3 = DAG.getNode(ISD::BITCAST, dl, VT, Tmp1);
- Tmp4 = Tmp1.getValue(1);
- break;
- }
- }
- if (Tmp4.getNode() != Node) {
- assert(Tmp3.getNode() != Node && "Load must be completely replaced");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp3);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp4);
- ReplacedNode(Node);
- }
- return;
- }
-
- EVT SrcVT = LD->getMemoryVT();
- unsigned SrcWidth = SrcVT.getSizeInBits();
- unsigned Alignment = LD->getAlignment();
- bool isVolatile = LD->isVolatile();
- bool isNonTemporal = LD->isNonTemporal();
-
- if (SrcWidth != SrcVT.getStoreSizeInBits() &&
- // Some targets pretend to have an i1 loading operation, and actually
- // load an i8. This trick is correct for ZEXTLOAD because the top 7
- // bits are guaranteed to be zero; it helps the optimizers understand
- // that these bits are zero. It is also useful for EXTLOAD, since it
- // tells the optimizers that those bits are undefined. It would be
- // nice to have an effective generic way of getting these benefits...
- // Until such a way is found, don't insist on promoting i1 here.
- (SrcVT != MVT::i1 ||
- TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) {
- // Promote to a byte-sized load if not loading an integral number of
- // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
- unsigned NewWidth = SrcVT.getStoreSizeInBits();
- EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
- SDValue Ch;
-
- // The extra bits are guaranteed to be zero, since we stored them that
- // way. A zext load from NVT thus automatically gives zext from SrcVT.
-
- ISD::LoadExtType NewExtType =
- ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
-
- SDValue Result =
- DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
- Tmp1, Tmp2, LD->getPointerInfo(),
- NVT, isVolatile, isNonTemporal, Alignment);
-
- Ch = Result.getValue(1); // The chain.
-
- if (ExtType == ISD::SEXTLOAD)
- // Having the top bits zero doesn't help when sign extending.
- Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
- Result.getValueType(),
- Result, DAG.getValueType(SrcVT));
- else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType())
- // All the top bits are guaranteed to be zero - inform the optimizers.
- Result = DAG.getNode(ISD::AssertZext, dl,
- Result.getValueType(), Result,
- DAG.getValueType(SrcVT));
-
- Tmp1 = Result;
- Tmp2 = Ch;
- } else if (SrcWidth & (SrcWidth - 1)) {
- // If not loading a power-of-2 number of bits, expand as two loads.
- assert(!SrcVT.isVector() && "Unsupported extload!");
- unsigned RoundWidth = 1 << Log2_32(SrcWidth);
- assert(RoundWidth < SrcWidth);
- unsigned ExtraWidth = SrcWidth - RoundWidth;
- assert(ExtraWidth < RoundWidth);
- assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
- "Load size not an integral number of bytes!");
- EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
- EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
- SDValue Lo, Hi, Ch;
- unsigned IncrementSize;
-
- if (TLI.isLittleEndian()) {
- // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
- // Load the bottom RoundWidth bits.
- Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
- Tmp1, Tmp2,
- LD->getPointerInfo(), RoundVT, isVolatile,
- isNonTemporal, Alignment);
-
- // Load the remaining ExtraWidth bits.
- IncrementSize = RoundWidth / 8;
- Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
- DAG.getIntPtrConstant(IncrementSize));
- Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2,
- LD->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
-
- // Build a factor node to remember that this load is independent of
- // the other one.
- Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
-
- // Move the top bits to the right place.
- Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
- DAG.getConstant(RoundWidth,
- TLI.getShiftAmountTy(Hi.getValueType())));
-
- // Join the hi and lo parts.
- Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
- } else {
- // Big endian - avoid unaligned loads.
- // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
- // Load the top RoundWidth bits.
- Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2,
- LD->getPointerInfo(), RoundVT, isVolatile,
- isNonTemporal, Alignment);
-
- // Load the remaining ExtraWidth bits.
- IncrementSize = RoundWidth / 8;
- Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
- DAG.getIntPtrConstant(IncrementSize));
- Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
- dl, Node->getValueType(0), Tmp1, Tmp2,
- LD->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
-
- // Build a factor node to remember that this load is independent of
- // the other one.
- Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
-
- // Move the top bits to the right place.
- Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
- DAG.getConstant(ExtraWidth,
- TLI.getShiftAmountTy(Hi.getValueType())));
-
- // Join the hi and lo parts.
- Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
- }
-
- Tmp2 = Ch;
- } else {
- switch (TLI.getLoadExtAction(ExtType, SrcVT)) {
- default: llvm_unreachable("This action is not supported yet!");
- case TargetLowering::Custom:
- isCustom = true;
- // FALLTHROUGH
- case TargetLowering::Legal:
- Tmp1 = SDValue(Node, 0);
- Tmp2 = SDValue(Node, 1);
-
- if (isCustom) {
- Tmp3 = TLI.LowerOperation(SDValue(Node, 0), DAG);
- if (Tmp3.getNode()) {
- Tmp1 = Tmp3;
- Tmp2 = Tmp3.getValue(1);
- }
- } else {
- // If this is an unaligned load and the target doesn't support it,
- // expand it.
- if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
- Type *Ty =
- LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment =
- TLI.getTargetData()->getABITypeAlignment(Ty);
- if (LD->getAlignment() < ABIAlignment){
- ExpandUnalignedLoad(cast<LoadSDNode>(Node),
- DAG, TLI, Tmp1, Tmp2);
- }
- }
- }
- break;
- case TargetLowering::Expand:
- if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) {
- SDValue Load = DAG.getLoad(SrcVT, dl, Tmp1, Tmp2,
- LD->getPointerInfo(),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
- unsigned ExtendOp;
- switch (ExtType) {
- case ISD::EXTLOAD:
- ExtendOp = (SrcVT.isFloatingPoint() ?
- ISD::FP_EXTEND : ISD::ANY_EXTEND);
- break;
- case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break;
- case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break;
- default: llvm_unreachable("Unexpected extend load type!");
- }
- Tmp1 = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
- Tmp2 = Load.getValue(1);
- break;
- }
-
- assert(!SrcVT.isVector() &&
- "Vector Loads are handled in LegalizeVectorOps");
-
- // FIXME: This does not work for vectors on most targets. Sign- and
- // zero-extend operations are currently folded into extending loads,
- // whether they are legal or not, and then we end up here without any
- // support for legalizing them.
- assert(ExtType != ISD::EXTLOAD &&
- "EXTLOAD should always be supported!");
- // Turn the unsupported load into an EXTLOAD followed by an explicit
- // zero/sign extend inreg.
- SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0),
- Tmp1, Tmp2, LD->getPointerInfo(), SrcVT,
- LD->isVolatile(), LD->isNonTemporal(),
- LD->getAlignment());
- SDValue ValRes;
- if (ExtType == ISD::SEXTLOAD)
- ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
- Result.getValueType(),
- Result, DAG.getValueType(SrcVT));
- else
- ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
- Tmp1 = ValRes;
- Tmp2 = Result.getValue(1);
- break;
- }
- }
-
- // Since loads produce two values, make sure to remember that we legalized
- // both of them.
- if (Tmp2.getNode() != Node) {
- assert(Tmp1.getNode() != Node && "Load must be completely replaced");
- DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp2);
- ReplacedNode(Node);
- }
- break;
+ return LegalizeLoadOps(Node);
}
case ISD::STORE: {
- StoreSDNode *ST = cast<StoreSDNode>(Node);
- Tmp1 = ST->getChain();
- Tmp2 = ST->getBasePtr();
- unsigned Alignment = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
-
- if (!ST->isTruncatingStore()) {
- if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
- ReplaceNode(ST, OptStore);
- break;
- }
-
- {
- Tmp3 = ST->getValue();
- EVT VT = Tmp3.getValueType();
- switch (TLI.getOperationAction(ISD::STORE, VT)) {
- default: llvm_unreachable("This action is not supported yet!");
- case TargetLowering::Legal:
- // If this is an unaligned store and the target doesn't support it,
- // expand it.
- if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
- Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
- if (ST->getAlignment() < ABIAlignment)
- ExpandUnalignedStore(cast<StoreSDNode>(Node),
- DAG, TLI, this);
- }
- break;
- case TargetLowering::Custom:
- Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG);
- if (Tmp1.getNode())
- ReplaceNode(SDValue(Node, 0), Tmp1);
- break;
- case TargetLowering::Promote: {
- assert(VT.isVector() && "Unknown legal promote case!");
- Tmp3 = DAG.getNode(ISD::BITCAST, dl,
- TLI.getTypeToPromoteTo(ISD::STORE, VT), Tmp3);
- SDValue Result =
- DAG.getStore(Tmp1, dl, Tmp3, Tmp2,
- ST->getPointerInfo(), isVolatile,
- isNonTemporal, Alignment);
- ReplaceNode(SDValue(Node, 0), Result);
- break;
- }
- }
- break;
- }
- } else {
- Tmp3 = ST->getValue();
-
- EVT StVT = ST->getMemoryVT();
- unsigned StWidth = StVT.getSizeInBits();
-
- if (StWidth != StVT.getStoreSizeInBits()) {
- // Promote to a byte-sized store with upper bits zero if not
- // storing an integral number of bytes. For example, promote
- // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
- EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
- StVT.getStoreSizeInBits());
- Tmp3 = DAG.getZeroExtendInReg(Tmp3, dl, StVT);
- SDValue Result =
- DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(),
- NVT, isVolatile, isNonTemporal, Alignment);
- ReplaceNode(SDValue(Node, 0), Result);
- } else if (StWidth & (StWidth - 1)) {
- // If not storing a power-of-2 number of bits, expand as two stores.
- assert(!StVT.isVector() && "Unsupported truncstore!");
- unsigned RoundWidth = 1 << Log2_32(StWidth);
- assert(RoundWidth < StWidth);
- unsigned ExtraWidth = StWidth - RoundWidth;
- assert(ExtraWidth < RoundWidth);
- assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
- "Store size not an integral number of bytes!");
- EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
- EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
- SDValue Lo, Hi;
- unsigned IncrementSize;
-
- if (TLI.isLittleEndian()) {
- // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
- // Store the bottom RoundWidth bits.
- Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(),
- RoundVT,
- isVolatile, isNonTemporal, Alignment);
-
- // Store the remaining ExtraWidth bits.
- IncrementSize = RoundWidth / 8;
- Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
- DAG.getIntPtrConstant(IncrementSize));
- Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3,
- DAG.getConstant(RoundWidth,
- TLI.getShiftAmountTy(Tmp3.getValueType())));
- Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2,
- ST->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
- } else {
- // Big endian - avoid unaligned stores.
- // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
- // Store the top RoundWidth bits.
- Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3,
- DAG.getConstant(ExtraWidth,
- TLI.getShiftAmountTy(Tmp3.getValueType())));
- Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getPointerInfo(),
- RoundVT, isVolatile, isNonTemporal, Alignment);
-
- // Store the remaining ExtraWidth bits.
- IncrementSize = RoundWidth / 8;
- Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
- DAG.getIntPtrConstant(IncrementSize));
- Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
- ST->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
- }
-
- // The order of the stores doesn't matter.
- SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
- ReplaceNode(SDValue(Node, 0), Result);
- } else {
- switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
- default: llvm_unreachable("This action is not supported yet!");
- case TargetLowering::Legal:
- // If this is an unaligned store and the target doesn't support it,
- // expand it.
- if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
- Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
- if (ST->getAlignment() < ABIAlignment)
- ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
- }
- break;
- case TargetLowering::Custom:
- ReplaceNode(SDValue(Node, 0),
- TLI.LowerOperation(SDValue(Node, 0), DAG));
- break;
- case TargetLowering::Expand:
- assert(!StVT.isVector() &&
- "Vector Stores are handled in LegalizeVectorOps");
-
- // TRUNCSTORE:i16 i32 -> STORE i16
- assert(TLI.isTypeLegal(StVT) && "Do not know how to expand this store!");
- Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3);
- SDValue Result =
- DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment);
- ReplaceNode(SDValue(Node, 0), Result);
- break;
- }
- }
- }
- break;
+ return LegalizeStoreOps(Node);
}
}
}
@@ -1795,11 +1816,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
if (isTailCall)
InChain = TCChain;
- std::pair<SDValue, SDValue> CallInfo =
- TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+ TargetLowering::
+ CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
0, TLI.getLibcallCallingConv(LC), isTailCall,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
Callee, Args, DAG, Node->getDebugLoc());
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
if (!CallInfo.second.getNode())
// It's a tailcall, return the chain (which is the DAG root).
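This hunk, like the matching ones below, migrates from LowerCallTo's long positional argument list to a TargetLowering::CallLoweringInfo object. A standalone sketch of the parameter-object pattern being applied (hypothetical field names; the real fields live on TargetLowering::CallLoweringInfo):

    #include <cstdio>
    #include <string>

    // Standalone sketch, not LLVM code: the dozen positional flags formerly
    // passed to LowerCallTo are grouped in a struct, so call sites stay
    // readable and new fields don't ripple through every caller.
    struct CallLoweringInfo {
      std::string Callee;
      bool IsSigned = false;
      bool IsTailCall = false;
      bool DoesNotReturn = false;
      bool IsReturnValueUsed = true;
    };

    static int lowerCallTo(const CallLoweringInfo &CLI) {
      std::printf("call %s tail=%d\n", CLI.Callee.c_str(), (int)CLI.IsTailCall);
      return 0;
    }

    int main() {
      CallLoweringInfo CLI;                 // named fields replace a long
      CLI.Callee = "__sync_synchronize";    // positional argument list
      CLI.IsTailCall = false;
      return lowerCallTo(CLI);
    }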
@@ -1828,11 +1851,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
TLI.getPointerTy());
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
- std::pair<SDValue,SDValue> CallInfo =
- TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
- false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
+ TargetLowering::
+ CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+ false, 0, TLI.getLibcallCallingConv(LC),
+ /*isTailCall=*/false,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
Callee, Args, DAG, dl);
+ std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
return CallInfo.first;
}
@@ -1860,11 +1885,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
TLI.getPointerTy());
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
- std::pair<SDValue, SDValue> CallInfo =
- TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+ TargetLowering::
+ CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
Callee, Args, DAG, Node->getDebugLoc());
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
return CallInfo;
}
@@ -1919,9 +1945,11 @@ static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
return TLI.getLibcallName(LC) != 0;
}
-/// UseDivRem - Only issue divrem libcall if both quotient and remainder are
+/// useDivRem - Only issue divrem libcall if both quotient and remainder are
/// needed.
-static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) {
+static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) {
+ // The other use might have been replaced with a divrem already.
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
unsigned OtherOpcode = 0;
if (isSigned)
OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV;
@@ -1935,7 +1963,7 @@ static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) {
SDNode *User = *UI;
if (User == Node)
continue;
- if (User->getOpcode() == OtherOpcode &&
+ if ((User->getOpcode() == OtherOpcode || User->getOpcode() == DivRemOpc) &&
User->getOperand(0) == Op0 &&
User->getOperand(1) == Op1)
return true;
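useDivRem now also matches a sibling that has already been combined into ISD::SDIVREM/UDIVREM, not only the companion div or rem. A standalone model of the user scan (illustrative enums and structs, not SelectionDAG types):

    #include <cstdio>
    #include <vector>

    // Standalone model of the useDivRem scan above. A divrem is worth
    // forming when some other user computes the companion operation -- or
    // an already-formed divrem -- on the same operands.
    enum Opc { SDiv, SRem, SDivRem };

    struct Use { Opc Op; int LHS, RHS; };

    static bool useDivRem(Opc Other, Opc DivRem, int L, int R,
                          const std::vector<Use> &Users) {
      for (const Use &U : Users)
        if ((U.Op == Other || U.Op == DivRem) && U.LHS == L && U.RHS == R)
          return true;
      return false;
    }

    int main() {
      std::vector<Use> Users = {{SDivRem, 1, 2}};  // sibling already combined
      // Legalizing "sdiv 1, 2": the companion would be SRem, but the
      // existing SDivRem on the same operands now also counts.
      std::printf("%d\n", (int)useDivRem(SRem, SDivRem, 1, 2, Users)); // 1
    }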
@@ -1992,11 +2020,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
TLI.getPointerTy());
DebugLoc dl = Node->getDebugLoc();
- std::pair<SDValue, SDValue> CallInfo =
- TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+ TargetLowering::
+ CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
Callee, Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
// Remainder is loaded back from the stack frame.
SDValue Rem = DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr,
@@ -2570,14 +2599,17 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// If the target didn't lower this, lower it to '__sync_synchronize()' call
// FIXME: handle "fence singlethread" more efficiently.
TargetLowering::ArgListTy Args;
- std::pair<SDValue, SDValue> CallResult =
- TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()),
+ TargetLowering::
+ CallLoweringInfo CLI(Node->getOperand(0),
+ Type::getVoidTy(*DAG.getContext()),
false, false, false, false, 0, CallingConv::C,
/*isTailCall=*/false,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__sync_synchronize",
TLI.getPointerTy()),
Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
Results.push_back(CallResult.second);
break;
}
@@ -2647,13 +2679,16 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::TRAP: {
// If this operation is not supported, lower it to 'abort()' call
TargetLowering::ArgListTy Args;
- std::pair<SDValue, SDValue> CallResult =
- TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()),
+ TargetLowering::
+ CallLoweringInfo CLI(Node->getOperand(0),
+ Type::getVoidTy(*DAG.getContext()),
false, false, false, false, 0, CallingConv::C,
/*isTailCall=*/false,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("abort", TLI.getPointerTy()),
Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
Results.push_back(CallResult.second);
break;
}
@@ -3059,7 +3094,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
"Don't know how to expand this subtraction!");
Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT));
- Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp2, DAG.getConstant(1, VT));
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, VT));
Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
break;
}
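The one-character fix above (Tmp2 to Tmp1) matters: the negated operand, not a stale temporary, must feed the final add, because A - B equals A + (~B + 1) in two's complement. A quick standalone check of that identity:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Standalone check of the identity the fixed expansion relies on:
    // A - B == A + (~B + 1) in two's complement arithmetic.
    int main() {
      uint32_t A = 1000, B = 37;
      uint32_t T = ~B;             // XOR with all-ones, as in the hunk
      T = T + 1;                   // the fixed line: add 1 to T, not to B
      assert(A + T == A - B);      // A + (~B + 1) == A - B
      std::printf("%u\n", (unsigned)(A + T)); // 963
    }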
@@ -3074,7 +3109,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp3 = Node->getOperand(1);
if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) ||
(isDivRemLibcallAvailable(Node, isSigned, TLI) &&
- UseDivRem(Node, isSigned, false))) {
+ useDivRem(Node, isSigned, false))) {
Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
} else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
// X % Y -> X-X/Y*Y
@@ -3102,7 +3137,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SDVTList VTs = DAG.getVTList(VT, VT);
if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) ||
(isDivRemLibcallAvailable(Node, isSigned, TLI) &&
- UseDivRem(Node, isSigned, true)))
+ useDivRem(Node, isSigned, true)))
Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
Node->getOperand(1));
else if (isSigned)
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 95ddb1e..e8e968a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -588,18 +588,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
unsigned NumElts = InVT.getVectorNumElements();
assert(NumElts == NVT.getVectorNumElements() &&
"Dst and Src must have the same number of elements");
- EVT EltVT = InVT.getScalarType();
assert(isPowerOf2_32(NumElts) &&
"Promoted vector type must be a power of two");
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts/2);
+ SDValue EOp1, EOp2;
+ GetSplitVector(InOp, EOp1, EOp2);
+
EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(),
NumElts/2);
-
- SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, InOp,
- DAG.getIntPtrConstant(0));
- SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, InOp,
- DAG.getIntPtrConstant(NumElts/2));
EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1);
EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2);
@@ -2273,9 +2269,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
// A divide for UMULO will be faster than a function call. Select to
// make sure we aren't using 0.
SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
- RHS, DAG.getConstant(0, VT), ISD::SETNE);
+ RHS, DAG.getConstant(0, VT), ISD::SETNE);
SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero,
- DAG.getConstant(1, VT), RHS);
+ DAG.getConstant(1, VT), RHS);
SDValue DIV = DAG.getNode(ISD::UDIV, DL, LHS.getValueType(), MUL, NotZero);
SDValue Overflow;
Overflow = DAG.getSetCC(DL, N->getValueType(1), DIV, LHS, ISD::SETNE);
@@ -2296,8 +2292,8 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
SDValue Temp = DAG.CreateStackTemporary(PtrVT);
// Temporary for the overflow value, default it to zero.
SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl,
- DAG.getConstant(0, PtrVT), Temp,
- MachinePointerInfo(), false, false, 0);
+ DAG.getConstant(0, PtrVT), Temp,
+ MachinePointerInfo(), false, false, 0);
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -2319,16 +2315,17 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
Args.push_back(Entry);
SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);
- std::pair<SDValue, SDValue> CallInfo =
- TLI.LowerCallTo(Chain, RetTy, true, false, false, false,
- 0, TLI.getLibcallCallingConv(LC),
- /*isTailCall=*/false,
- /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
- Func, Args, DAG, dl);
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, RetTy, true, false, false, false,
+ 0, TLI.getLibcallCallingConv(LC),
+ /*isTailCall=*/false,
+ /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+ Func, Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
SplitInteger(CallInfo.first, Lo, Hi);
SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp,
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo(), false, false, false, 0);
SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2,
DAG.getConstant(0, PtrVT),
ISD::SETNE);
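The XMULO context above checks unsigned multiply overflow with a divide: guard the divisor against zero, then overflow holds exactly when the truncated product divided by one operand fails to give back the other. A standalone model of that check:

    #include <cstdint>
    #include <cstdio>

    // Standalone model of the UMULO overflow check above: substitute a
    // non-zero divisor, then overflow iff dividing the (wrapped) product
    // by one operand does not give back the other.
    static bool umulo(uint32_t LHS, uint32_t RHS, uint32_t &Mul) {
      Mul = LHS * RHS;                    // wraps silently on overflow
      uint32_t NotZero = RHS ? RHS : 1;   // SELECT (RHS != 0), RHS, 1
      return RHS != 0 && (Mul / NotZero) != LHS;
    }

    int main() {
      uint32_t M;
      bool Ovf = umulo(0x10000u, 0x10000u, M);     // 2^32 wraps to 0
      std::printf("mul=%u overflow=%d\n", (unsigned)M, (int)Ovf);
      // prints: mul=0 overflow=1
    }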
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 439aa4d..39337ff 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -628,7 +628,8 @@ namespace {
public:
explicit NodeUpdateListener(DAGTypeLegalizer &dtl,
SmallSetVector<SDNode*, 16> &nta)
- : DTL(dtl), NodesToAnalyze(nta) {}
+ : SelectionDAG::DAGUpdateListener(dtl.getDAG()),
+ DTL(dtl), NodesToAnalyze(nta) {}
virtual void NodeDeleted(SDNode *N, SDNode *E) {
assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
@@ -680,7 +681,7 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
SmallSetVector<SDNode*, 16> NodesToAnalyze;
NodeUpdateListener NUL(*this, NodesToAnalyze);
do {
- DAG.ReplaceAllUsesOfValueWith(From, To, &NUL);
+ DAG.ReplaceAllUsesOfValueWith(From, To);
// The old node may still be present in a map like ExpandedIntegers or
// PromotedIntegers. Inform maps about the replacement.
@@ -709,7 +710,7 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
SDValue NewVal(M, i);
if (M->getNodeId() == Processed)
RemapValue(NewVal);
- DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL);
+ DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal);
// OldVal may be a target of the ReplacedValues map which was marked
// NewNode to force reanalysis because it was updated. Ensure that
// anything that ReplacedValues mapped to OldVal will now be mapped
@@ -950,7 +951,7 @@ SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
if (i != ResNo)
ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
- return SDValue(N, ResNo);
+ return SDValue(N->getOperand(ResNo));
}
/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
@@ -1054,12 +1055,14 @@ SDValue DAGTypeLegalizer::MakeLibCall(RTLIB::Libcall LC, EVT RetVT,
TLI.getPointerTy());
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
- std::pair<SDValue,SDValue> CallInfo =
- TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+ TargetLowering::
+ CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
false, 0, TLI.getLibcallCallingConv(LC),
/*isTailCall=*/false,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
Callee, Args, DAG, dl);
+ std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
return CallInfo.first;
}
@@ -1086,11 +1089,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
TLI.getPointerTy());
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
- std::pair<SDValue, SDValue> CallInfo =
- TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+ TargetLowering::
+ CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
Callee, Args, DAG, Node->getDebugLoc());
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
return CallInfo;
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index e866445..94fc976 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -135,6 +135,8 @@ public:
ReplacedValues[SDValue(Old, i)] = SDValue(New, i);
}
+ SelectionDAG &getDAG() const { return DAG; }
+
private:
SDNode *AnalyzeNewNode(SDNode *N);
void AnalyzeNewValue(SDValue &Val);
@@ -151,7 +153,7 @@ private:
/// DisintegrateMERGE_VALUES - Replace each result of the given MERGE_VALUES
/// node with the corresponding input operand, except for the result 'ResNo',
- /// which is returned.
+ /// for which the corresponding input operand is returned.
SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index);
@@ -509,10 +511,12 @@ private:
void ScalarizeVectorResult(SDNode *N, unsigned OpNo);
SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue ScalarizeVecRes_BinOp(SDNode *N);
+ SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
SDValue ScalarizeVecRes_InregOp(SDNode *N);
SDValue ScalarizeVecRes_BITCAST(SDNode *N);
+ SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N);
SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
SDValue ScalarizeVecRes_FP_ROUND(SDNode *N);
@@ -553,6 +557,7 @@ private:
// Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
void SplitVectorResult(SDNode *N, unsigned OpNo);
void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index a8ff7c6..06f6bd6 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -168,6 +168,7 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue OldVec = N->getOperand(0);
unsigned OldElts = OldVec.getValueType().getVectorNumElements();
+ EVT OldEltVT = OldVec.getValueType().getVectorElementType();
DebugLoc dl = N->getDebugLoc();
// Convert to a vector of the expanded element type, for example
@@ -175,6 +176,15 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
EVT OldVT = N->getValueType(0);
EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+ if (OldVT != OldEltVT) {
+ // The result of EXTRACT_VECTOR_ELT may be larger than the element type of
+ // the input vector. If so, extend the elements of the input vector to the
+ // same bitwidth as the result before expanding.
+ assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!");
+ EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldElts);
+ OldVec = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0));
+ }
+
SDValue NewVec = DAG.getNode(ISD::BITCAST, dl,
EVT::getVectorVT(*DAG.getContext(),
NewVT, 2*OldElts),
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 9fe4480..704f99b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -71,6 +71,9 @@ class VectorLegalizer {
// operands to a different type and bitcasting the result back to the
// original type.
SDValue PromoteVectorOp(SDValue Op);
+ // Implements [SU]INT_TO_FP vector promotion; this is a [zs]ext of the input
+ // operand to the next size up.
+ SDValue PromoteVectorOpINT_TO_FP(SDValue Op);
public:
bool Run();
@@ -231,9 +234,19 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
case TargetLowering::Promote:
- // "Promote" the operation by bitcasting
- Result = PromoteVectorOp(Op);
- Changed = true;
+ switch (Op.getOpcode()) {
+ default:
+ // "Promote" the operation by bitcasting
+ Result = PromoteVectorOp(Op);
+ Changed = true;
+ break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ // "Promote" the operation by extending the operand.
+ Result = PromoteVectorOpINT_TO_FP(Op);
+ Changed = true;
+ break;
+ }
break;
case TargetLowering::Legal: break;
case TargetLowering::Custom: {
@@ -293,6 +306,44 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
return DAG.getNode(ISD::BITCAST, dl, VT, Op);
}
+SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) {
+ // INT_TO_FP operations may require the input operand be promoted even
+ // when the type is otherwise legal.
+ EVT VT = Op.getOperand(0).getValueType();
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "Can't promote a vector with multiple results!");
+
+ // Normal getTypeToPromoteTo() doesn't work here, as that will promote
+ // by widening the vector w/ the same element width and twice the number
+ // of elements. We want it the other way around: the same number of elements,
+ // each twice the width.
+ //
+ // Increase the bitwidth of the element to the next pow-of-two
+ // (which is greater than 8 bits).
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ EltVT = EVT::getIntegerVT(*DAG.getContext(), 2 * EltVT.getSizeInBits());
+ assert(EltVT.isSimple() && "Promoting to a non-simple vector type!");
+
+ // Build a new vector type and check if it is legal.
+ MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+
+ DebugLoc dl = Op.getDebugLoc();
+ SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+
+ unsigned Opc = Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND :
+ ISD::SIGN_EXTEND;
+ for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+ if (Op.getOperand(j).getValueType().isVector())
+ Operands[j] = DAG.getNode(Opc, dl, NVT, Op.getOperand(j));
+ else
+ Operands[j] = Op.getOperand(j);
+ }
+
+ return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), &Operands[0],
+ Operands.size());
+}
+
SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
DebugLoc dl = Op.getDebugLoc();
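The promotion rule implemented above has a simple shape: keep the lane count, double the lane width (for example 4 x i16 -> 4 x i32), sign- or zero-extending for SINT_TO_FP or UINT_TO_FP respectively, and only then convert to floating point. A minimal plain-C++ model of that shape, not the VectorLegalizer API:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main() {
      std::array<int16_t, 4> v16 = {-1, 2, -3, 4};
      std::array<int32_t, 4> v32; // same element count, twice the width
      std::array<float, 4> vf;
      for (int i = 0; i < 4; ++i)
        v32[i] = v16[i];          // SIGN_EXTEND (ZERO_EXTEND if unsigned)
      for (int i = 0; i < 4; ++i)
        vf[i] = static_cast<float>(v32[i]); // SINT_TO_FP on promoted lanes
      for (float f : vf)
        std::printf("%g ", f);    // prints: -1 2 -3 4
      std::printf("\n");
    }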
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5f23f01..4709202 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -48,7 +48,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
- case ISD::BUILD_VECTOR: R = N->getOperand(0); break;
+ case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break;
case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break;
case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break;
@@ -115,6 +115,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SRL:
R = ScalarizeVecRes_BinOp(N);
break;
+ case ISD::FMA:
+ R = ScalarizeVecRes_TernaryOp(N);
+ break;
}
// If R is null, the sub-method took care of registering the result.
@@ -129,6 +132,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
LHS.getValueType(), LHS, RHS);
}
+SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
+ SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+ SDValue Op1 = GetScalarizedVector(N->getOperand(1));
+ SDValue Op2 = GetScalarizedVector(N->getOperand(2));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+ Op0.getValueType(), Op0, Op1, Op2);
+}
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
unsigned ResNo) {
SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -141,6 +152,16 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
NewVT, N->getOperand(0));
}
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ SDValue InOp = N->getOperand(0);
+ // The BUILD_VECTOR operands may be of wider element types and
+ // we may need to truncate them back to the requested return type.
+ if (EltVT.isInteger())
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp);
+ return InOp;
+}
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) {
EVT NewVT = N->getValueType(0).getVectorElementType();
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
@@ -436,7 +457,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
N->dump(&DAG);
dbgs() << "\n");
SDValue Lo, Hi;
-
+
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(ResNo), true))
return;
@@ -448,7 +469,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
N->dump(&DAG);
dbgs() << "\n";
#endif
- llvm_unreachable("Do not know how to split the result of this operator!");
+ report_fatal_error("Do not know how to split the result of this "
+ "operator!\n");
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::VSELECT:
@@ -529,6 +551,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FREM:
SplitVecRes_BinOp(N, Lo, Hi);
break;
+ case ISD::FMA:
+ SplitVecRes_TernaryOp(N, Lo, Hi);
+ break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -548,6 +573,22 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi);
}
+void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Op0Lo, Op0Hi;
+ GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
+ SDValue Op1Lo, Op1Hi;
+ GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
+ SDValue Op2Lo, Op2Hi;
+ GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
+ DebugLoc dl = N->getDebugLoc();
+
+ Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(),
+ Op0Lo, Op1Lo, Op2Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(),
+ Op0Hi, Op1Hi, Op2Hi);
+}
+
void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// We know the result is a vector. The input may be either a vector or a
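Splitting a ternary operation is the same recipe as the binary case, once per operand: split all three inputs into low/high halves and apply the operation to each half independently. A small plain-C++ sketch with FMA as the ternary op, modeling the data flow rather than the LLVM API:

    #include <array>
    #include <cmath>
    #include <cstdio>

    int main() {
      std::array<double, 4> a = {1, 2, 3, 4}, b = {5, 6, 7, 8},
                            c = {9, 10, 11, 12};
      std::array<double, 2> lo, hi; // results for each half
      for (int i = 0; i < 2; ++i) {
        lo[i] = std::fma(a[i], b[i], c[i]);             // lanes 0..1
        hi[i] = std::fma(a[i + 2], b[i + 2], c[i + 2]); // lanes 2..3
      }
      std::printf("lo: %g %g  hi: %g %g\n", lo[0], lo[1], hi[0], hi[1]);
    }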
@@ -977,7 +1018,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
N->dump(&DAG);
dbgs() << "\n";
#endif
- llvm_unreachable("Do not know how to split this operator's operand!");
+ report_fatal_error("Do not know how to split this operator's "
+ "operand!\n");
+
case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
@@ -1203,15 +1246,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
DebugLoc DL = N->getDebugLoc();
GetSplitVector(N->getOperand(0), Lo, Hi);
EVT InVT = Lo.getValueType();
-
+
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
InVT.getVectorNumElements());
-
+
Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1));
Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1));
-
+
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
-}
+}
@@ -1755,8 +1798,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
if (InputWidened)
InOp = GetWidenedVector(InOp);
for (unsigned j=0; j < NumInElts; ++j)
- Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getIntPtrConstant(j));
+ Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getIntPtrConstant(j));
}
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
@@ -1816,7 +1859,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
DAG.getIntPtrConstant(0));
return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
- SatOp, CvtCode);
+ SatOp, CvtCode);
}
}
@@ -1832,7 +1875,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
DAG.getIntPtrConstant(i));
Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp,
- SatOp, CvtCode);
+ SatOp, CvtCode);
}
SDValue UndefVal = DAG.getUNDEF(EltVT);
@@ -1936,7 +1979,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
Cond1 = GetWidenedVector(Cond1);
if (Cond1.getValueType() != CondWidenVT)
- Cond1 = ModifyToType(Cond1, CondWidenVT);
+ Cond1 = ModifyToType(Cond1, CondWidenVT);
}
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
@@ -2202,7 +2245,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
ResVT, WideSETCC, DAG.getIntPtrConstant(0));
- return PromoteTargetBoolean(CC, N->getValueType(0));
+ return PromoteTargetBoolean(CC, N->getValueType(0));
}
@@ -2371,10 +2414,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
NewVTWidth = NewVT.getSizeInBits();
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
- LD->getPointerInfo().getWithOffset(Offset),
- isVolatile,
- isNonTemporal, isInvariant,
- MinAlign(Align, Increment));
+ LD->getPointerInfo().getWithOffset(Offset), isVolatile,
+ isNonTemporal, isInvariant, MinAlign(Align, Increment));
LdChain.push_back(L.getValue(1));
if (L->getValueType(0).isVector()) {
SmallVector<SDValue, 16> Loads;
@@ -2563,7 +2604,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
Offset += Increment;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getIntPtrConstant(Increment));
- } while (StWidth != 0 && StWidth >= NewVTWidth);
+ } while (StWidth != 0 && StWidth >= NewVTWidth);
// Restore index back to be relative to the original widen element type
Idx = Idx * NewVTWidth / ValEltWidth;
}
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index ff0136e..c3794d5 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -50,7 +50,7 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) :
const TargetMachine &tm = (*IS->MF).getTarget();
ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,NULL);
- // This hard requirment could be relaxed, but for now
+ // This hard requirement could be relaxed, but for now
// do not let it proceed.
assert (ResourcesModel && "Unimplemented CreateTargetScheduleState.");
@@ -318,7 +318,7 @@ void ResourcePriorityQueue::reserveResources(SUnit *SU) {
// If packet is now full, reset the state so in the next cycle
// we start fresh.
- if (Packet.size() >= InstrItins->IssueWidth) {
+ if (Packet.size() >= InstrItins->SchedModel->IssueWidth) {
ResourcesModel->clearResources();
Packet.clear();
}
@@ -353,7 +353,7 @@ signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) {
}
/// Estimates change in reg pressure from this SU.
-/// It is acheived by trivial tracking of defined
+/// It is achieved by trivial tracking of defined
/// and used vregs in dependent instructions.
/// The RawPressure flag makes this function ignore
/// existing reg file sizes, and report raw def/use
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 24da432..b7ce48a 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -441,19 +441,14 @@ static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
SmallVector<unsigned, 4> &LRegs,
const TargetRegisterInfo *TRI) {
bool Added = false;
- if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != SU) {
- if (RegAdded.insert(Reg)) {
- LRegs.push_back(Reg);
- Added = true;
- }
- }
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias)
- if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) {
- if (RegAdded.insert(*Alias)) {
- LRegs.push_back(*Alias);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) {
+ if (RegAdded.insert(*AI)) {
+ LRegs.push_back(*AI);
Added = true;
}
}
+ }
return Added;
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 2cb5d37..bf0a437 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -266,7 +266,8 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
const TargetLowering *TLI,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI,
- unsigned &RegClass, unsigned &Cost) {
+ unsigned &RegClass, unsigned &Cost,
+ const MachineFunction &MF) {
EVT VT = RegDefPos.GetValue();
// Special handling for untyped values. These values can only come from
@@ -285,7 +286,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
unsigned Idx = RegDefPos.GetIdx();
const MCInstrDesc Desc = TII->get(Opcode);
- const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI);
+ const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF);
RegClass = RC->getID();
// FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
// better way to determine it.
@@ -852,7 +853,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
}
/// After backtracking, the hazard checker needs to be restored to a state
-/// corresponding the the current cycle.
+/// corresponding to the current cycle.
void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() {
HazardRec->Reset();
@@ -1181,7 +1182,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
SmallSet<unsigned, 4> &RegAdded,
SmallVector<unsigned, 4> &LRegs,
const TargetRegisterInfo *TRI) {
- for (const uint16_t *AliasI = TRI->getOverlaps(Reg); *AliasI; ++AliasI) {
+ for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) {
// Check if Ref is live.
if (!LiveRegDefs[*AliasI]) continue;
@@ -1920,7 +1921,7 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
RegDefPos.IsValid(); RegDefPos.Advance()) {
unsigned RCId, Cost;
- GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
return true;
@@ -2034,7 +2035,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) {
continue;
unsigned RCId, Cost;
- GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
RegPressure[RCId] += Cost;
break;
}
@@ -2049,7 +2050,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) {
if (SkipRegDefs > 0)
continue;
unsigned RCId, Cost;
- GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
if (RegPressure[RCId] < Cost) {
// Register pressure tracking is imprecise. This can happen. But we try
// hard not to let it happen because it likely results in poor scheduling.
@@ -2330,22 +2331,21 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
// and latency.
if (!checkPref || (left->SchedulingPref == Sched::ILP ||
right->SchedulingPref == Sched::ILP)) {
- if (DisableSchedCycles) {
+ // If the HazardRecognizer is enabled, grouping instructions by cycle, and
+ // neither instruction stalls (!LStall && !RStall), then height is already
+ // covered, so only depth matters. We also reach this point if both stall
+ // but have the same height.
+ if (!SPQ->getHazardRec()->isEnabled()) {
if (LHeight != RHeight)
return LHeight > RHeight ? 1 : -1;
}
- else {
- // If neither instruction stalls (!LStall && !RStall) then
- // its height is already covered so only its depth matters. We also reach
- // this if both stall but have the same height.
- int LDepth = left->getDepth() - LPenalty;
- int RDepth = right->getDepth() - RPenalty;
- if (LDepth != RDepth) {
- DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
- << ") depth " << LDepth << " vs SU (" << right->NodeNum
- << ") depth " << RDepth << "\n");
- return LDepth < RDepth ? 1 : -1;
- }
+ int LDepth = left->getDepth() - LPenalty;
+ int RDepth = right->getDepth() - RPenalty;
+ if (LDepth != RDepth) {
+ DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
+ return LDepth < RDepth ? 1 : -1;
}
if (left->Latency != right->Latency)
return left->Latency > right->Latency ? 1 : -1;
@@ -2363,7 +2363,7 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
bool RHasPhysReg = right->hasPhysRegDefs;
if (LHasPhysReg != RHasPhysReg) {
#ifndef NDEBUG
- const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"};
+ const char *const PhysRegMsg[] = {" has no physreg"," defines a physreg"};
#endif
DEBUG(dbgs() << " SU (" << left->NodeNum << ") "
<< PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
index 75940ec..84e41fc 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -61,6 +61,7 @@ namespace llvm {
if (isa<BasicBlockSDNode>(Node)) return true;
if (isa<FrameIndexSDNode>(Node)) return true;
if (isa<ConstantPoolSDNode>(Node)) return true;
+ if (isa<TargetIndexSDNode>(Node)) return true;
if (isa<JumpTableSDNode>(Node)) return true;
if (isa<ExternalSymbolSDNode>(Node)) return true;
if (isa<BlockAddressSDNode>(Node)) return true;
@@ -98,12 +99,6 @@ namespace llvm {
///
virtual void computeLatency(SUnit *SU);
- /// computeOperandLatency - Override dependence edge latency using
- /// operand use/def information
- ///
- virtual void computeOperandLatency(SUnit *Def, SUnit *Use,
- SDep& dep) const { }
-
virtual void computeOperandLatency(SDNode *Def, SDNode *Use,
unsigned OpIdx, SDep& dep) const;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 92671d1..f4fe892 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -14,16 +14,16 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "SDNodeOrdering.h"
#include "SDNodeDbgValue.h"
+#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
-#include "llvm/Analysis/DebugInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Intrinsics.h"
-#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/CallingConv.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -71,7 +71,9 @@ static const fltSemantics *EVTToAPFloatSemantics(EVT VT) {
}
}
-SelectionDAG::DAGUpdateListener::~DAGUpdateListener() {}
+// Default null implementations of the callbacks.
+void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {}
+void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
//===----------------------------------------------------------------------===//
// ConstantFPSDNode Class
@@ -217,6 +219,22 @@ bool ISD::isScalarToVector(const SDNode *N) {
return true;
}
+/// allOperandsUndef - Return true if the node has at least one operand
+/// and all operands of the specified node are ISD::UNDEF.
+bool ISD::allOperandsUndef(const SDNode *N) {
+ // Return false if the node has no operands.
+ // This is "logically inconsistent" with the definition of "all" but
+ // is probably the desired behavior.
+ if (N->getNumOperands() == 0)
+ return false;
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e ; ++i)
+ if (N->getOperand(i).getOpcode() != ISD::UNDEF)
+ return false;
+
+ return true;
+}
+
/// getSetCCSwappedOperands - Return the operation corresponding to (Y op X)
/// when given the operation for (X op Y).
ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
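The empty-operand special case called out in the comment above is the one subtlety: a vacuous "all" (as std::all_of computes on an empty range) is deliberately rejected. A standalone plain-C++ model of the helper, not the SDNode API:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    bool allUndef(const std::vector<bool> &isUndef) {
      // Unlike std::all_of alone, an empty operand list yields false.
      return !isUndef.empty() &&
             std::all_of(isUndef.begin(), isUndef.end(),
                         [](bool u) { return u; });
    }

    int main() {
      std::printf("%d\n", allUndef({}));            // 0: no operands
      std::printf("%d\n", allUndef({true, true}));  // 1: all undef
      std::printf("%d\n", allUndef({true, false})); // 0: one defined
    }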
@@ -385,6 +403,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddPointer(GA->getGlobal());
ID.AddInteger(GA->getOffset());
ID.AddInteger(GA->getTargetFlags());
+ ID.AddInteger(GA->getAddressSpace());
break;
}
case ISD::BasicBlock:
@@ -420,16 +439,25 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(CP->getTargetFlags());
break;
}
+ case ISD::TargetIndex: {
+ const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
+ ID.AddInteger(TI->getIndex());
+ ID.AddInteger(TI->getOffset());
+ ID.AddInteger(TI->getTargetFlags());
+ break;
+ }
case ISD::LOAD: {
const LoadSDNode *LD = cast<LoadSDNode>(N);
ID.AddInteger(LD->getMemoryVT().getRawBits());
ID.AddInteger(LD->getRawSubclassData());
+ ID.AddInteger(LD->getPointerInfo().getAddrSpace());
break;
}
case ISD::STORE: {
const StoreSDNode *ST = cast<StoreSDNode>(N);
ID.AddInteger(ST->getMemoryVT().getRawBits());
ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
break;
}
case ISD::ATOMIC_CMP_SWAP:
@@ -449,6 +477,12 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
const AtomicSDNode *AT = cast<AtomicSDNode>(N);
ID.AddInteger(AT->getMemoryVT().getRawBits());
ID.AddInteger(AT->getRawSubclassData());
+ ID.AddInteger(AT->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::PREFETCH: {
+ const MemSDNode *PF = cast<MemSDNode>(N);
+ ID.AddInteger(PF->getPointerInfo().getAddrSpace());
break;
}
case ISD::VECTOR_SHUFFLE: {
@@ -465,6 +499,10 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
break;
}
} // end switch (N->getOpcode())
+
+ // Target specific memory nodes could also have address spaces to check.
+ if (N->isTargetMemoryOpcode())
+ ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
}
/// AddNodeIDNode - Generic routine for adding a nodes info to the NodeID
@@ -544,16 +582,15 @@ void SelectionDAG::RemoveDeadNodes() {
/// RemoveDeadNodes - This method deletes the unreachable nodes in the
/// given list, and any nodes that become unreachable as a result.
-void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes,
- DAGUpdateListener *UpdateListener) {
+void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
// Process the worklist, deleting the nodes and adding their uses to the
// worklist.
while (!DeadNodes.empty()) {
SDNode *N = DeadNodes.pop_back_val();
- if (UpdateListener)
- UpdateListener->NodeDeleted(N, 0);
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeDeleted(N, 0);
// Take the node out of the appropriate CSE map.
RemoveNodeFromCSEMaps(N);
@@ -574,7 +611,7 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes,
}
}
-void SelectionDAG::RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener){
+void SelectionDAG::RemoveDeadNode(SDNode *N){
SmallVector<SDNode*, 16> DeadNodes(1, N);
// Create a dummy node that adds a reference to the root node, preventing
@@ -582,7 +619,7 @@ void SelectionDAG::RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener){
// dead node.)
HandleSDNode Dummy(getRoot());
- RemoveDeadNodes(DeadNodes, UpdateListener);
+ RemoveDeadNodes(DeadNodes);
}
void SelectionDAG::DeleteNode(SDNode *N) {
@@ -684,8 +721,7 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
/// node. This transfer can potentially trigger recursive merging.
///
void
-SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N,
- DAGUpdateListener *UpdateListener) {
+SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
// For node types that aren't CSE'd, just act as if no identical node
// already exists.
if (!doNotCSE(N)) {
@@ -694,20 +730,19 @@ SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N,
// If there was already an existing matching node, use ReplaceAllUsesWith
// to replace the dead one with the existing one. This can cause
// recursive merging of other unrelated nodes down the line.
- ReplaceAllUsesWith(N, Existing, UpdateListener);
+ ReplaceAllUsesWith(N, Existing);
- // N is now dead. Inform the listener if it exists and delete it.
- if (UpdateListener)
- UpdateListener->NodeDeleted(N, Existing);
+ // N is now dead. Inform the listeners and delete it.
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeDeleted(N, Existing);
DeleteNodeNotInCSEMaps(N);
return;
}
}
- // If the node doesn't already exist, we updated it. Inform a listener if
- // it exists.
- if (UpdateListener)
- UpdateListener->NodeUpdated(N);
+ // If the node doesn't already exist, we updated it. Inform listeners.
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeUpdated(N);
}
/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
@@ -855,7 +890,7 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
: TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()),
OptLevel(OL), EntryNode(ISD::EntryToken, DebugLoc(), getVTList(MVT::Other)),
- Root(getEntryNode()), Ordering(0) {
+ Root(getEntryNode()), Ordering(0), UpdateListeners(0) {
AllNodes.push_back(&EntryNode);
Ordering = new SDNodeOrdering();
DbgInfo = new SDDbgInfo();
@@ -867,6 +902,7 @@ void SelectionDAG::init(MachineFunction &mf) {
}
SelectionDAG::~SelectionDAG() {
+ assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
allnodes_clear();
delete Ordering;
delete DbgInfo;
@@ -1084,6 +1120,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL,
ID.AddPointer(GV);
ID.AddInteger(Offset);
ID.AddInteger(TargetFlags);
+ ID.AddInteger(GV->getType()->getAddressSpace());
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
@@ -1183,6 +1220,24 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
return SDValue(N, 0);
}
+SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
+ unsigned char TargetFlags) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), 0, 0);
+ ID.AddInteger(Index);
+ ID.AddInteger(Offset);
+ ID.AddInteger(TargetFlags);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset,
+ TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0);
@@ -1949,6 +2004,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
KnownZero |= (~InMask);
+ KnownOne &= (~KnownZero);
return;
}
case ISD::FGETSIGN:
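The one-line fix above maintains the basic known-bits invariant: KnownZero and KnownOne are masks of bits proven zero and proven one, so they must never overlap. After AssertZext forces all bits outside the mask into KnownZero, any stale KnownOne bits in that region have to be cleared. A plain-C++ model of the invariant on 64-bit masks, not the APInt API:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t knownOne = 0xFF00; // recursion claimed ones in the high byte
      uint64_t knownZero = 0;
      uint64_t inMask = 0x00FF;   // AssertZext: only the low byte survives
      knownZero |= ~inMask;       // bits outside the mask are known zero
      knownOne &= ~knownZero;     // re-establish disjointness
      std::printf("zero=%#llx one=%#llx disjoint=%d\n",
                  (unsigned long long)knownZero,
                  (unsigned long long)knownOne,
                  (knownZero & knownOne) == 0 ? 1 : 0);
    }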
@@ -2246,8 +2302,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
}
// Handle LOADX separately here. EXTLOAD case will fallthrough.
- if (Op.getOpcode() == ISD::LOAD) {
- LoadSDNode *LD = cast<LoadSDNode>(Op);
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
unsigned ExtType = LD->getExtensionType();
switch (ExtType) {
default: break;
@@ -2428,6 +2483,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
case ISD::FABS:
V.clearSign();
return getConstantFP(V, VT);
+ case ISD::FCEIL: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, VT);
+ break;
+ }
+ case ISD::FTRUNC: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, VT);
+ break;
+ }
+ case ISD::FFLOOR: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, VT);
+ break;
+ }
case ISD::FP_EXTEND: {
bool ignored;
// This can return overflow, underflow, or inexact; we don't care.
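The three constant folds added above pair each opcode with a rounding direction: FCEIL rounds toward +infinity, FTRUNC toward zero, FFLOOR toward -infinity, and the fold is kept only when APFloat reports the rounding as exact or merely inexact. The correspondence on plain doubles, as a sketch rather than the APFloat API:

    #include <cmath>
    #include <cstdio>

    int main() {
      double v = -2.5;
      std::printf("ceil  %g -> %g\n", v, std::ceil(v));  // toward +inf: -2
      std::printf("trunc %g -> %g\n", v, std::trunc(v)); // toward zero: -2
      std::printf("floor %g -> %g\n", v, std::floor(v)); // toward -inf: -3
    }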
@@ -2675,6 +2748,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
if (N1 == N2) return N1;
break;
case ISD::CONCAT_VECTORS:
+ // Concat of UNDEFs is UNDEF.
+ if (N1.getOpcode() == ISD::UNDEF &&
+ N2.getOpcode() == ISD::UNDEF)
+ return getUNDEF(VT);
+
// A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
// one big BUILD_VECTOR.
if (N1.getOpcode() == ISD::BUILD_VECTOR &&
@@ -3708,8 +3786,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
// FIXME: pass in DebugLoc
- std::pair<SDValue,SDValue> CallResult =
- TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()),
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
false, false, false, false, 0,
TLI.getLibcallCallingConv(RTLIB::MEMCPY),
/*isTailCall=*/false,
@@ -3717,6 +3795,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
getExternalSymbol(TLI.getLibcallName(RTLIB::MEMCPY),
TLI.getPointerTy()),
Args, *this, dl);
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+
return CallResult.second;
}
@@ -3761,8 +3841,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
// FIXME: pass in DebugLoc
- std::pair<SDValue,SDValue> CallResult =
- TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()),
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
false, false, false, false, 0,
TLI.getLibcallCallingConv(RTLIB::MEMMOVE),
/*isTailCall=*/false,
@@ -3770,6 +3850,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
getExternalSymbol(TLI.getLibcallName(RTLIB::MEMMOVE),
TLI.getPointerTy()),
Args, *this, dl);
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+
return CallResult.second;
}
@@ -3822,8 +3904,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
Entry.isSExt = false;
Args.push_back(Entry);
// FIXME: pass in DebugLoc
- std::pair<SDValue,SDValue> CallResult =
- TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()),
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
false, false, false, false, 0,
TLI.getLibcallCallingConv(RTLIB::MEMSET),
/*isTailCall=*/false,
@@ -3831,6 +3913,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
TLI.getPointerTy()),
Args, *this, dl);
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+
return CallResult.second;
}
@@ -3874,6 +3958,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
ID.AddInteger(MemVT.getRawBits());
SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
AddNodeIDNode(ID, Opcode, VTs, Ops, 4);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void* IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
cast<AtomicSDNode>(E)->refineAlignment(MMO);
@@ -3946,6 +4031,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
ID.AddInteger(MemVT.getRawBits());
SDValue Ops[] = {Chain, Ptr, Val};
AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void* IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
cast<AtomicSDNode>(E)->refineAlignment(MMO);
@@ -4002,6 +4088,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
ID.AddInteger(MemVT.getRawBits());
SDValue Ops[] = {Chain, Ptr};
AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void* IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
cast<AtomicSDNode>(E)->refineAlignment(MMO);
@@ -4079,6 +4166,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
@@ -4198,6 +4286,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(),
MMO->isNonTemporal(),
MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
cast<LoadSDNode>(E)->refineAlignment(MMO);
@@ -4287,6 +4376,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
ID.AddInteger(VT.getRawBits());
ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
MMO->isNonTemporal(), MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
cast<StoreSDNode>(E)->refineAlignment(MMO);
@@ -4354,6 +4444,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
ID.AddInteger(SVT.getRawBits());
ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(),
MMO->isNonTemporal(), MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
cast<StoreSDNode>(E)->refineAlignment(MMO);
@@ -4378,6 +4469,7 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
ID.AddInteger(ST->getMemoryVT().getRawBits());
ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
@@ -4654,13 +4746,7 @@ SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) {
if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1])
continue;
- bool NoMatch = false;
- for (unsigned i = 2; i != NumVTs; ++i)
- if (VTs[i] != I->VTs[i]) {
- NoMatch = true;
- break;
- }
- if (!NoMatch)
+ if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2]))
return *I;
}
@@ -5237,11 +5323,7 @@ namespace {
/// pointed to by a use iterator is deleted, increment the use iterator
/// so that it doesn't dangle.
///
-/// This class also manages a "downlink" DAGUpdateListener, to forward
-/// messages to ReplaceAllUsesWith's callers.
-///
class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
- SelectionDAG::DAGUpdateListener *DownLink;
SDNode::use_iterator &UI;
SDNode::use_iterator &UE;
@@ -5249,21 +5331,13 @@ class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
// Increment the iterator as needed.
while (UI != UE && N == *UI)
++UI;
-
- // Then forward the message.
- if (DownLink) DownLink->NodeDeleted(N, E);
- }
-
- virtual void NodeUpdated(SDNode *N) {
- // Just forward the message.
- if (DownLink) DownLink->NodeUpdated(N);
}
public:
- RAUWUpdateListener(SelectionDAG::DAGUpdateListener *dl,
+ RAUWUpdateListener(SelectionDAG &d,
SDNode::use_iterator &ui,
SDNode::use_iterator &ue)
- : DownLink(dl), UI(ui), UE(ue) {}
+ : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
};
}
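The shape these hunks converge on is an intrusive, scope-bound listener list: a listener registers itself with the DAG in its constructor and unregisters in its destructor, and the DAG walks the chain on every event instead of threading a single listener pointer through each call. A self-contained plain-C++ sketch of that pattern (hypothetical names, not the SelectionDAG API):

    #include <cstdio>

    struct DAG;

    struct Listener {
      DAG &G;
      Listener *Next; // intrusive chain through the DAG's head pointer
      explicit Listener(DAG &g);
      virtual ~Listener();
      virtual void nodeDeleted(int id) {} // default no-op callback
    };

    struct DAG {
      Listener *Listeners = nullptr;
      void notifyDeleted(int id) {
        for (Listener *l = Listeners; l; l = l->Next)
          l->nodeDeleted(id);
      }
    };

    Listener::Listener(DAG &g) : G(g), Next(g.Listeners) {
      g.Listeners = this; // register for the lifetime of this object
    }
    // Assumes strictly nested (LIFO) listener lifetimes, which the
    // dangling-listener assert in the destructor hunk above also relies on.
    Listener::~Listener() { G.Listeners = Next; }

    struct Printer : Listener {
      using Listener::Listener;
      void nodeDeleted(int id) override { std::printf("deleted %d\n", id); }
    };

    int main() {
      DAG dag;
      {
        Printer p(dag);       // registered only for this scope
        dag.notifyDeleted(7); // prints: deleted 7
      }
      dag.notifyDeleted(8);   // chain is empty again; prints nothing
    }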
@@ -5273,8 +5347,7 @@ public:
///
/// This version assumes From has a single result value.
///
-void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To,
- DAGUpdateListener *UpdateListener) {
+void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
SDNode *From = FromN.getNode();
assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
"Cannot replace with this method!");
@@ -5288,7 +5361,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To,
// is replaced by To, we don't want to replace of all its users with To
// too. See PR3018 for more info.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
- RAUWUpdateListener Listener(UpdateListener, UI, UE);
+ RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
@@ -5307,7 +5380,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To,
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
- AddModifiedNodeToCSEMaps(User, &Listener);
+ AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
@@ -5321,8 +5394,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To,
/// This version assumes that for each value of From, there is a
/// corresponding value in To in the same position with the same type.
///
-void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To,
- DAGUpdateListener *UpdateListener) {
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
#ifndef NDEBUG
for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
assert((!From->hasAnyUseOfValue(i) ||
@@ -5337,7 +5409,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To,
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
- RAUWUpdateListener Listener(UpdateListener, UI, UE);
+ RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
@@ -5356,7 +5428,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To,
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
- AddModifiedNodeToCSEMaps(User, &Listener);
+ AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
@@ -5369,16 +5441,14 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To,
///
/// This version can replace From with any result values. To must match the
/// number and types of values returned by From.
-void SelectionDAG::ReplaceAllUsesWith(SDNode *From,
- const SDValue *To,
- DAGUpdateListener *UpdateListener) {
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
if (From->getNumValues() == 1) // Handle the simple case efficiently.
- return ReplaceAllUsesWith(SDValue(From, 0), To[0], UpdateListener);
+ return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
- RAUWUpdateListener Listener(UpdateListener, UI, UE);
+ RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
@@ -5398,7 +5468,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From,
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
- AddModifiedNodeToCSEMaps(User, &Listener);
+ AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
@@ -5409,14 +5479,13 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From,
/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone. The Deleted
/// vector is handled the same way as for ReplaceAllUsesWith.
-void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To,
- DAGUpdateListener *UpdateListener){
+void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
// Handle the really simple, really trivial case efficiently.
if (From == To) return;
// Handle the simple, trivial, case efficiently.
if (From.getNode()->getNumValues() == 1) {
- ReplaceAllUsesWith(From, To, UpdateListener);
+ ReplaceAllUsesWith(From, To);
return;
}
@@ -5424,7 +5493,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To,
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From.getNode()->use_begin(),
UE = From.getNode()->use_end();
- RAUWUpdateListener Listener(UpdateListener, UI, UE);
+ RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
bool UserRemovedFromCSEMaps = false;
@@ -5460,7 +5529,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To,
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
- AddModifiedNodeToCSEMaps(User, &Listener);
+ AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
@@ -5489,11 +5558,10 @@ namespace {
/// handled the same way as for ReplaceAllUsesWith.
void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
const SDValue *To,
- unsigned Num,
- DAGUpdateListener *UpdateListener){
+ unsigned Num){
// Handle the simple, trivial case efficiently.
if (Num == 1)
- return ReplaceAllUsesOfValueWith(*From, *To, UpdateListener);
+ return ReplaceAllUsesOfValueWith(*From, *To);
// Read up all the uses and make records of them. This helps with
// processing new uses that are introduced during the
@@ -5538,7 +5606,7 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
- AddModifiedNodeToCSEMaps(User, UpdateListener);
+ AddModifiedNodeToCSEMaps(User);
}
}
@@ -5579,7 +5647,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
}
}
- // Visit all the nodes. As we iterate, moves nodes into sorted order,
+ // Visit all the nodes. As we iterate, move nodes into sorted order,
// such that by the time the end is reached all nodes will be sorted.
for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) {
SDNode *N = I;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f1e879b..ba5bd79 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Constants.h"
#include "llvm/CallingConv.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/GlobalVariable.h"
@@ -42,7 +43,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -51,6 +51,7 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/IntegersSubsetMapping.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -843,7 +844,7 @@ void SelectionDAGBuilder::clear() {
}
/// clearDanglingDebugInfo - Clear the dangling debug information
-/// map. This function is seperated from the clear so that debug
+/// map. This function is separated from the clear so that debug
/// information that is dangling in a basic block can be properly
/// resolved in a different basic block. This allows the
/// SelectionDAG to resolve dangling debug information attached
@@ -941,7 +942,7 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
default: llvm_unreachable("Unknown instruction type encountered!");
// Build the switch statement using the Instruction.def file.
#define HANDLE_INST(NUM, OPCODE, CLASS) \
- case Instruction::OPCODE: visit##OPCODE((CLASS&)I); break;
+ case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
#include "llvm/Instruction.def"
}
@@ -1578,17 +1579,18 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
} else
Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
} else {
- assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
+ assert(CB.CC == ISD::SETCC_INVALID &&
+ "Condition is undefined for to-the-range belonging check.");
const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();
SDValue CmpOp = getValue(CB.CmpMHS);
EVT VT = CmpOp.getValueType();
-
- if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
+
+ if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(false)) {
Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT),
- ISD::SETLE);
+ ISD::SETULE);
} else {
SDValue SUB = DAG.getNode(ISD::SUB, dl,
VT, CmpOp, DAG.getConstant(Low, VT));
@@ -1826,9 +1828,13 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)];
const Value *Callee(I.getCalledValue());
+ const Function *Fn = dyn_cast<Function>(Callee);
if (isa<InlineAsm>(Callee))
visitInlineAsm(&I);
- else
+ else if (Fn && Fn->isIntrinsic()) {
+ assert(Fn->getIntrinsicID() == Intrinsic::donothing);
+ // Ignore invokes to @llvm.donothing: jump directly to the next BB.
+ } else
LowerCallTo(&I, getValue(Callee), false, LandingPad);
// If the value of the invoke is used outside of its defining block, make it
@@ -1901,8 +1907,6 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
const Value* SV,
MachineBasicBlock *Default,
MachineBasicBlock *SwitchBB) {
- Case& BackCase = *(CR.Range.second-1);
-
// Size is the number of Cases represented by this range.
size_t Size = CR.Range.second - CR.Range.first;
if (Size > 3)
@@ -1970,11 +1974,28 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
}
}
+ // Order cases by weight so the most likely case will be checked first.
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ if (BPI) {
+ for (CaseItr I = CR.Range.first, IE = CR.Range.second; I != IE; ++I) {
+ uint32_t IWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(),
+ I->BB->getBasicBlock());
+ for (CaseItr J = CR.Range.first; J < I; ++J) {
+ uint32_t JWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(),
+ J->BB->getBasicBlock());
+ if (IWeight > JWeight)
+ std::swap(*I, *J);
+ }
+ }
+ }
// Rearrange the case blocks so that the last one falls through if possible.
- if (NextBlock && Default != NextBlock && BackCase.BB != NextBlock) {
+ Case &BackCase = *(CR.Range.second-1);
+ if (Size > 1 &&
+ NextBlock && Default != NextBlock && BackCase.BB != NextBlock) {
// The last case block won't fall through into 'NextBlock' if we emit the
// branches in this order. See if rearranging a case value would help.
- for (CaseItr I = CR.Range.first, E = CR.Range.second-1; I != E; ++I) {
+ // We start at the bottom as it's the case with the least weight.
+ for (Case *I = &*(CR.Range.second-2), *E = &*CR.Range.first-1; I != E; --I){
if (I->BB == NextBlock) {
std::swap(*I, BackCase);
break;
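The pairwise-swap loop inserted above orders the cases so the heaviest (most likely) one is tested first; its effect is a descending sort by edge weight, modeled here with a stable sort. A compact plain-C++ model of that ordering, with hypothetical fields standing in for the Case record:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Case { int value; unsigned weight; };

    int main() {
      std::vector<Case> cases = {{10, 4}, {20, 90}, {30, 6}};
      std::stable_sort(cases.begin(), cases.end(),
                       [](const Case &a, const Case &b) {
                         return a.weight > b.weight; // heavier first
                       });
      for (const Case &c : cases) // emits 20, then 30, then 10
        std::printf("case %d (weight %u)\n", c.value, c.weight);
    }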
@@ -2006,7 +2027,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
CC = ISD::SETEQ;
LHS = SV; RHS = I->High; MHS = NULL;
} else {
- CC = ISD::SETLE;
+ CC = ISD::SETCC_INVALID;
LHS = I->Low; MHS = SV; RHS = I->High;
}
@@ -2031,14 +2052,14 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
}
static inline bool areJTsAllowed(const TargetLowering &TLI) {
- return !TLI.getTargetMachine().Options.DisableJumpTables &&
+ return TLI.supportJumpTables() &&
(TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
}
static APInt ComputeRange(const APInt &First, const APInt &Last) {
uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1;
- APInt LastExt = Last.sext(BitWidth), FirstExt = First.sext(BitWidth);
+ APInt LastExt = Last.zext(BitWidth), FirstExt = First.zext(BitWidth);
return (LastExt - FirstExt + 1ULL);
}
@@ -2104,7 +2125,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
const APInt &Low = cast<ConstantInt>(I->Low)->getValue();
const APInt &High = cast<ConstantInt>(I->High)->getValue();
- if (Low.sle(TEI) && TEI.sle(High)) {
+ if (Low.ule(TEI) && TEI.ule(High)) {
DestBBs.push_back(I->BB);
if (TEI==High)
++I;
@@ -2261,7 +2282,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
// Create a CaseBlock record representing a conditional branch to
// the LHS node if the value being switched on SV is less than C.
// Otherwise, branch to LHS.
- CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+ CaseBlock CB(ISD::SETULT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
if (CR.CaseBB == SwitchBB)
visitSwitchCase(CB, SwitchBB);
@@ -2333,7 +2354,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
// Optimize the case where all the case values fit in a
// word without having to subtract minValue. In this case,
// we can optimize away the subtraction.
- if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) {
+ if (maxValue.ult(IntPtrBits)) {
cmpRange = maxValue;
} else {
lowBound = minValue;
@@ -2407,57 +2428,46 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
/// Clusterify - Transform simple list of Cases into list of CaseRange's
size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
const SwitchInst& SI) {
- size_t numCmps = 0;
+
+ /// Use a shorter form of declaration, and also
+ /// show that we want to use CRSBuilder as Clusterifier.
+ typedef IntegersSubsetMapping<MachineBasicBlock> Clusterifier;
+
+ Clusterifier TheClusterifier;
- BranchProbabilityInfo *BPI = FuncInfo.BPI;
// Start with "simple" cases
for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
i != e; ++i) {
const BasicBlock *SuccBB = i.getCaseSuccessor();
MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
- uint32_t ExtraWeight = BPI ? BPI->getEdgeWeight(SI.getParent(), SuccBB) : 0;
-
- Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(),
- SMBB, ExtraWeight));
- }
- std::sort(Cases.begin(), Cases.end(), CaseCmp());
-
- // Merge case into clusters
- if (Cases.size() >= 2)
- // Must recompute end() each iteration because it may be
- // invalidated by erase if we hold on to it
- for (CaseItr I = Cases.begin(), J = llvm::next(Cases.begin());
- J != Cases.end(); ) {
- const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
- const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
- MachineBasicBlock* nextBB = J->BB;
- MachineBasicBlock* currentBB = I->BB;
-
- // If the two neighboring cases go to the same destination, merge them
- // into a single case.
- if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
- I->High = J->High;
- J = Cases.erase(J);
-
- if (BranchProbabilityInfo *BPI = FuncInfo.BPI) {
- uint32_t CurWeight = currentBB->getBasicBlock() ?
- BPI->getEdgeWeight(SI.getParent(), currentBB->getBasicBlock()) : 16;
- uint32_t NextWeight = nextBB->getBasicBlock() ?
- BPI->getEdgeWeight(SI.getParent(), nextBB->getBasicBlock()) : 16;
-
- BPI->setEdgeWeight(SI.getParent(), currentBB->getBasicBlock(),
- CurWeight + NextWeight);
- }
- } else {
- I = J++;
- }
+ TheClusterifier.add(i.getCaseValueEx(), SMBB);
+ }
+
+ TheClusterifier.optimize();
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ size_t numCmps = 0;
+ for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
+ e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
+ Clusterifier::Cluster &C = *i;
+ unsigned W = 0;
+ if (BPI) {
+ W = BPI->getEdgeWeight(SI.getParent(), C.second->getBasicBlock());
+ if (!W)
+ W = 16;
+ W *= C.first.Weight;
+ BPI->setEdgeWeight(SI.getParent(), C.second->getBasicBlock(), W);
}
- for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
- if (I->Low != I->High)
- // A range counts double, since it requires two compares.
- ++numCmps;
+ // FIXME: Currently works with ConstantInt-based numbers.
+ // Changing it to APInt-based would be too heavy a change for this commit.
+ Cases.push_back(Case(C.first.getLow().toConstantInt(),
+ C.first.getHigh().toConstantInt(), C.second, W));
+
+ if (C.first.getLow() != C.first.getHigh())
+ // A range counts double, since it requires two compares.
+ ++numCmps;
}
return numCmps;
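// [Editor's sketch -- not part of the patch] The merge rule the removed loop
// implemented, and which TheClusterifier.optimize() is now expected to apply
// internally: contiguous cases with a common destination collapse into one
// range, e.g. case 1/2/3 -> BB becomes [1,3] -> BB.
for (size_t i = 0; i + 1 < Cases.size(); ) {
  const APInt &Cur  = cast<ConstantInt>(Cases[i].High)->getValue();
  const APInt &Next = cast<ConstantInt>(Cases[i+1].Low)->getValue();
  if (Next - Cur == 1 && Cases[i].BB == Cases[i+1].BB) {
    Cases[i].High = Cases[i+1].High;     // widen the surviving range
    Cases.erase(Cases.begin() + i + 1);  // drop the absorbed case
  } else {
    ++i;
  }
}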
@@ -2804,7 +2814,7 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) {
}
// Utility for visitShuffleVector - Return true if every element in Mask,
-// begining from position Pos and ending in Pos+Size, falls within the
+// beginning from position Pos and ending in Pos+Size, falls within the
// specified sequential range [L, L+Size), or is undef.
static bool isSequentialInRange(const SmallVectorImpl<int> &Mask,
unsigned Pos, unsigned Size, int Low) {
@@ -4914,6 +4924,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::pow:
visitPow(I);
return 0;
+ case Intrinsic::fabs:
+ setValue(&I, DAG.getNode(ISD::FABS, dl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0))));
+ return 0;
+ case Intrinsic::floor:
+ setValue(&I, DAG.getNode(ISD::FFLOOR, dl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0))));
+ return 0;
case Intrinsic::fma:
setValue(&I, DAG.getNode(ISD::FMA, dl,
getValue(I.getArgOperand(0)).getValueType(),
@@ -4921,6 +4941,29 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2))));
return 0;
+ case Intrinsic::fmuladd: {
+ EVT VT = TLI.getValueType(I.getType());
+ if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
+ TLI.isOperationLegal(ISD::FMA, VT) &&
+ TLI.isFMAFasterThanMulAndAdd(VT)){
+ setValue(&I, DAG.getNode(ISD::FMA, dl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2))));
+ } else {
+ SDValue Mul = DAG.getNode(ISD::FMUL, dl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)));
+ SDValue Add = DAG.getNode(ISD::FADD, dl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ Mul,
+ getValue(I.getArgOperand(2)));
+ setValue(&I, Add);
+ }
+ return 0;
+ }
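// [Editor's sketch -- not part of the patch] The fusion test above, isolated
// as a predicate; every call it makes appears in the hunk itself:
static bool shouldFuseToFMA(const TargetMachine &TM, const TargetLowering &TLI,
                            EVT VT) {
  return TM.Options.AllowFPOpFusion != FPOpFusion::Strict && // fusion allowed
         TLI.isOperationLegal(ISD::FMA, VT) &&               // FMA is legal
         TLI.isFMAFasterThanMulAndAdd(VT);                   // and profitable
}
// When it returns false, llvm.fmuladd lowers to separate FMUL + FADD nodes,
// keeping the unfused rounding of (a * b) + c.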
case Intrinsic::convert_to_fp16:
setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl,
MVT::i16, getValue(I.getArgOperand(0))));
@@ -5077,16 +5120,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return 0;
}
TargetLowering::ArgListTy Args;
- std::pair<SDValue, SDValue> Result =
- TLI.LowerCallTo(getRoot(), I.getType(),
+ TargetLowering::
+ CallLoweringInfo CLI(getRoot(), I.getType(),
false, false, false, false, 0, CallingConv::C,
/*isTailCall=*/false,
/*doesNotRet=*/false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()),
Args, DAG, getCurDebugLoc());
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
DAG.setRoot(Result.second);
return 0;
}
+ case Intrinsic::debugtrap: {
+ DAG.setRoot(DAG.getNode(ISD::DEBUGTRAP, dl, MVT::Other, getRoot()));
+ return 0;
+ }
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
case Intrinsic::usub_with_overflow:
@@ -5139,6 +5187,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::lifetime_end:
// Discard region information.
return 0;
+ case Intrinsic::donothing:
+ // ignore
+ return 0;
}
}
@@ -5157,14 +5208,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// Check whether the function can return without sret-demotion.
SmallVector<ISD::OutputArg, 4> Outs;
- SmallVector<uint64_t, 4> Offsets;
GetReturnInfo(RetTy, CS.getAttributes().getRetAttributes(),
- Outs, TLI, &Offsets);
+ Outs, TLI);
bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
- DAG.getMachineFunction(),
- FTy->isVarArg(), Outs,
- FTy->getContext());
+ DAG.getMachineFunction(),
+ FTy->isVarArg(), Outs,
+ FTy->getContext());
SDValue DemoteStackSlot;
int DemoteStackIdx = -100;
@@ -5247,16 +5297,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
if (isTailCall && TM.Options.EnableFastISel)
isTailCall = false;
- std::pair<SDValue,SDValue> Result =
- TLI.LowerCallTo(getRoot(), RetTy,
- CS.paramHasAttr(0, Attribute::SExt),
- CS.paramHasAttr(0, Attribute::ZExt), FTy->isVarArg(),
- CS.paramHasAttr(0, Attribute::InReg), FTy->getNumParams(),
- CS.getCallingConv(),
- isTailCall,
- CS.doesNotReturn(),
- !CS.getInstruction()->use_empty(),
- Callee, Args, DAG, getCurDebugLoc());
+ TargetLowering::
+ CallLoweringInfo CLI(getRoot(), RetTy, FTy, isTailCall, Callee, Args, DAG,
+ getCurDebugLoc(), CS);
+ std::pair<SDValue,SDValue> Result = TLI.LowerCallTo(CLI);
assert((isTailCall || Result.second.getNode()) &&
"Non-null chain expected with non-tail call!");
assert((Result.second.getNode() || !Result.first.getNode()) &&
@@ -5272,7 +5316,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
ComputeValueVTs(TLI, PtrRetTy, PVTs);
assert(PVTs.size() == 1 && "Pointers should fit in one register");
EVT PtrVT = PVTs[0];
- unsigned NumValues = Outs.size();
+
+ SmallVector<EVT, 4> RetTys;
+ SmallVector<uint64_t, 4> Offsets;
+ RetTy = FTy->getReturnType();
+ ComputeValueVTs(TLI, RetTy, RetTys, &Offsets);
+
+ unsigned NumValues = RetTys.size();
SmallVector<SDValue, 4> Values(NumValues);
SmallVector<SDValue, 4> Chains(NumValues);
@@ -5280,8 +5330,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT,
DemoteStackSlot,
DAG.getConstant(Offsets[i], PtrVT));
- SDValue L = DAG.getLoad(Outs[i].VT, getCurDebugLoc(), Result.second,
- Add,
+ SDValue L = DAG.getLoad(RetTys[i], getCurDebugLoc(), Result.second, Add,
MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]),
false, false, false, 1);
Values[i] = L;
@@ -5292,30 +5341,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
MVT::Other, &Chains[0], NumValues);
PendingLoads.push_back(Chain);
- // Collect the legal value parts into potentially illegal values
- // that correspond to the original function's return values.
- SmallVector<EVT, 4> RetTys;
- RetTy = FTy->getReturnType();
- ComputeValueVTs(TLI, RetTy, RetTys);
- ISD::NodeType AssertOp = ISD::DELETED_NODE;
- SmallVector<SDValue, 4> ReturnValues;
- unsigned CurReg = 0;
- for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
- EVT VT = RetTys[I];
- EVT RegisterVT = TLI.getRegisterType(RetTy->getContext(), VT);
- unsigned NumRegs = TLI.getNumRegisters(RetTy->getContext(), VT);
-
- SDValue ReturnValue =
- getCopyFromParts(DAG, getCurDebugLoc(), &Values[CurReg], NumRegs,
- RegisterVT, VT, AssertOp);
- ReturnValues.push_back(ReturnValue);
- CurReg += NumRegs;
- }
-
setValue(CS.getInstruction(),
DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
DAG.getVTList(&RetTys[0], RetTys.size()),
- &ReturnValues[0], ReturnValues.size()));
+ &Values[0], Values.size()));
}
// Assign order to nodes here. If the call does not produce a result, it won't
@@ -5482,6 +5511,22 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
return false;
}
+/// visitUnaryFloatCall - If a call instruction is a unary floating-point
+/// operation (as expected), translate it to an SDNode with the specified opcode
+/// and return true.
+bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
+ unsigned Opcode) {
+ // Sanity check that it really is a unary floating-point call.
+ if (I.getNumArgOperands() != 1 ||
+ !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+ I.getType() != I.getArgOperand(0)->getType() ||
+ !I.onlyReadsMemory())
+ return false;
+
+ SDValue Tmp = getValue(I.getArgOperand(0));
+ setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(), Tmp.getValueType(), Tmp));
+ return true;
+}
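// [Editor's sketch -- not part of the patch] Hypothetical mis-declarations
// of the kind the guard filters out before any ISD node is built:
double sin(double x, double y);  // wrong arity: getNumArgOperands() != 1
float  cos(double x);            // result type != operand type
// Only a genuine unary FP call that also satisfies onlyReadsMemory() is
// translated to the requested opcode.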
void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Handle inline assembly differently.
@@ -5512,150 +5557,97 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Check for well-known libc/libm calls. If the function is internal, it
// can't be a library call.
- if (!F->hasLocalLinkage() && F->hasName()) {
- StringRef Name = F->getName();
- if ((LibInfo->has(LibFunc::copysign) && Name == "copysign") ||
- (LibInfo->has(LibFunc::copysignf) && Name == "copysignf") ||
- (LibInfo->has(LibFunc::copysignl) && Name == "copysignl")) {
+ LibFunc::Func Func;
+ if (!F->hasLocalLinkage() && F->hasName() &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func)) {
+ switch (Func) {
+ default: break;
+ case LibFunc::copysign:
+ case LibFunc::copysignf:
+ case LibFunc::copysignl:
if (I.getNumArgOperands() == 2 && // Basic sanity checks.
I.getArgOperand(0)->getType()->isFloatingPointTy() &&
I.getType() == I.getArgOperand(0)->getType() &&
- I.getType() == I.getArgOperand(1)->getType()) {
+ I.getType() == I.getArgOperand(1)->getType() &&
+ I.onlyReadsMemory()) {
SDValue LHS = getValue(I.getArgOperand(0));
SDValue RHS = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(),
LHS.getValueType(), LHS, RHS));
return;
}
- } else if ((LibInfo->has(LibFunc::fabs) && Name == "fabs") ||
- (LibInfo->has(LibFunc::fabsf) && Name == "fabsf") ||
- (LibInfo->has(LibFunc::fabsl) && Name == "fabsl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FABS, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ if (visitUnaryFloatCall(I, ISD::FABS))
return;
- }
- } else if ((LibInfo->has(LibFunc::sin) && Name == "sin") ||
- (LibInfo->has(LibFunc::sinf) && Name == "sinf") ||
- (LibInfo->has(LibFunc::sinl) && Name == "sinl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType() &&
- I.onlyReadsMemory()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FSIN, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ if (visitUnaryFloatCall(I, ISD::FSIN))
return;
- }
- } else if ((LibInfo->has(LibFunc::cos) && Name == "cos") ||
- (LibInfo->has(LibFunc::cosf) && Name == "cosf") ||
- (LibInfo->has(LibFunc::cosl) && Name == "cosl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType() &&
- I.onlyReadsMemory()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FCOS, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ if (visitUnaryFloatCall(I, ISD::FCOS))
return;
- }
- } else if ((LibInfo->has(LibFunc::sqrt) && Name == "sqrt") ||
- (LibInfo->has(LibFunc::sqrtf) && Name == "sqrtf") ||
- (LibInfo->has(LibFunc::sqrtl) && Name == "sqrtl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType() &&
- I.onlyReadsMemory()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FSQRT, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::sqrt:
+ case LibFunc::sqrtf:
+ case LibFunc::sqrtl:
+ if (visitUnaryFloatCall(I, ISD::FSQRT))
return;
- }
- } else if ((LibInfo->has(LibFunc::floor) && Name == "floor") ||
- (LibInfo->has(LibFunc::floorf) && Name == "floorf") ||
- (LibInfo->has(LibFunc::floorl) && Name == "floorl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FFLOOR, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ if (visitUnaryFloatCall(I, ISD::FFLOOR))
return;
- }
- } else if ((LibInfo->has(LibFunc::nearbyint) && Name == "nearbyint") ||
- (LibInfo->has(LibFunc::nearbyintf) && Name == "nearbyintf") ||
- (LibInfo->has(LibFunc::nearbyintl) && Name == "nearbyintl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FNEARBYINT, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
return;
- }
- } else if ((LibInfo->has(LibFunc::ceil) && Name == "ceil") ||
- (LibInfo->has(LibFunc::ceilf) && Name == "ceilf") ||
- (LibInfo->has(LibFunc::ceill) && Name == "ceill")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FCEIL, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ if (visitUnaryFloatCall(I, ISD::FCEIL))
return;
- }
- } else if ((LibInfo->has(LibFunc::rint) && Name == "rint") ||
- (LibInfo->has(LibFunc::rintf) && Name == "rintf") ||
- (LibInfo->has(LibFunc::rintl) && Name == "rintl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FRINT, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ if (visitUnaryFloatCall(I, ISD::FRINT))
return;
- }
- } else if ((LibInfo->has(LibFunc::trunc) && Name == "trunc") ||
- (LibInfo->has(LibFunc::truncf) && Name == "truncf") ||
- (LibInfo->has(LibFunc::truncl) && Name == "truncl")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FTRUNC, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ if (visitUnaryFloatCall(I, ISD::FTRUNC))
return;
- }
- } else if ((LibInfo->has(LibFunc::log2) && Name == "log2") ||
- (LibInfo->has(LibFunc::log2f) && Name == "log2f") ||
- (LibInfo->has(LibFunc::log2l) && Name == "log2l")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType() &&
- I.onlyReadsMemory()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FLOG2, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ if (visitUnaryFloatCall(I, ISD::FLOG2))
return;
- }
- } else if ((LibInfo->has(LibFunc::exp2) && Name == "exp2") ||
- (LibInfo->has(LibFunc::exp2f) && Name == "exp2f") ||
- (LibInfo->has(LibFunc::exp2l) && Name == "exp2l")) {
- if (I.getNumArgOperands() == 1 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType() &&
- I.onlyReadsMemory()) {
- SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(ISD::FEXP2, getCurDebugLoc(),
- Tmp.getValueType(), Tmp));
+ break;
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ if (visitUnaryFloatCall(I, ISD::FEXP2))
return;
- }
- } else if (Name == "memcmp") {
+ break;
+ case LibFunc::memcmp:
if (visitMemCmpCall(I))
return;
+ break;
}
}
}
@@ -5952,11 +5944,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
if (OpInfo.ConstraintVT != Input.ConstraintVT) {
- std::pair<unsigned, const TargetRegisterClass*> MatchRC =
- TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+ std::pair<unsigned, const TargetRegisterClass*> MatchRC =
+ TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
OpInfo.ConstraintVT);
- std::pair<unsigned, const TargetRegisterClass*> InputRC =
- TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
+ std::pair<unsigned, const TargetRegisterClass*> InputRC =
+ TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
Input.ConstraintVT);
if ((OpInfo.ConstraintVT.isInteger() !=
Input.ConstraintVT.isInteger()) ||
@@ -6225,8 +6217,15 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
OpInfo.ConstraintType == TargetLowering::C_Register) &&
"Unknown constraint type!");
- assert(!OpInfo.isIndirect &&
- "Don't know how to handle indirect register inputs yet!");
+
+ // TODO: Support this.
+ if (OpInfo.isIndirect) {
+ LLVMContext &Ctx = *DAG.getContext();
+ Ctx.emitError(CS.getInstruction(),
+ "Don't know how to handle indirect register inputs yet "
+ "for constraint '" + Twine(OpInfo.ConstraintCode) + "'");
+ break;
+ }
// Copy the input into the appropriate registers.
if (OpInfo.AssignedRegs.Regs.empty()) {
@@ -6369,24 +6368,18 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
/// FIXME: When all targets are
/// migrated to using LowerCall, this hook should be integrated into SDISel.
std::pair<SDValue, SDValue>
-TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy,
- bool RetSExt, bool RetZExt, bool isVarArg,
- bool isInreg, unsigned NumFixedArgs,
- CallingConv::ID CallConv, bool isTailCall,
- bool doesNotRet, bool isReturnValueUsed,
- SDValue Callee,
- ArgListTy &Args, SelectionDAG &DAG,
- DebugLoc dl) const {
+TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// Handle all of the outgoing arguments.
- SmallVector<ISD::OutputArg, 32> Outs;
- SmallVector<SDValue, 32> OutVals;
+ CLI.Outs.clear();
+ CLI.OutVals.clear();
+ ArgListTy &Args = CLI.Args;
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*this, Args[i].Ty, ValueVTs);
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
EVT VT = ValueVTs[Value];
- Type *ArgTy = VT.getTypeForEVT(RetTy->getContext());
+ Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
SDValue Op = SDValue(Args[i].Node.getNode(),
Args[i].Node.getResNo() + Value);
ISD::ArgFlagsTy Flags;
@@ -6419,8 +6412,8 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy,
Flags.setNest();
Flags.setOrigAlign(OriginalAlignment);
- EVT PartVT = getRegisterType(RetTy->getContext(), VT);
- unsigned NumParts = getNumRegisters(RetTy->getContext(), VT);
+ EVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT);
SmallVector<SDValue, 4> Parts(NumParts);
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
@@ -6429,89 +6422,88 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy,
else if (Args[i].isZExt)
ExtendKind = ISD::ZERO_EXTEND;
- getCopyToParts(DAG, dl, Op, &Parts[0], NumParts,
+ getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts,
PartVT, ExtendKind);
for (unsigned j = 0; j != NumParts; ++j) {
// If it isn't the first piece, the alignment must be 1.
ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(),
- i < NumFixedArgs);
+ i < CLI.NumFixedArgs);
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
else if (j != 0)
MyFlags.Flags.setOrigAlign(1);
- Outs.push_back(MyFlags);
- OutVals.push_back(Parts[j]);
+ CLI.Outs.push_back(MyFlags);
+ CLI.OutVals.push_back(Parts[j]);
}
}
}
// Handle the incoming return values from the call.
- SmallVector<ISD::InputArg, 32> Ins;
+ CLI.Ins.clear();
SmallVector<EVT, 4> RetTys;
- ComputeValueVTs(*this, RetTy, RetTys);
+ ComputeValueVTs(*this, CLI.RetTy, RetTys);
for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
EVT VT = RetTys[I];
- EVT RegisterVT = getRegisterType(RetTy->getContext(), VT);
- unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT);
+ EVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
for (unsigned i = 0; i != NumRegs; ++i) {
ISD::InputArg MyFlags;
MyFlags.VT = RegisterVT.getSimpleVT();
- MyFlags.Used = isReturnValueUsed;
- if (RetSExt)
+ MyFlags.Used = CLI.IsReturnValueUsed;
+ if (CLI.RetSExt)
MyFlags.Flags.setSExt();
- if (RetZExt)
+ if (CLI.RetZExt)
MyFlags.Flags.setZExt();
- if (isInreg)
+ if (CLI.IsInReg)
MyFlags.Flags.setInReg();
- Ins.push_back(MyFlags);
+ CLI.Ins.push_back(MyFlags);
}
}
SmallVector<SDValue, 4> InVals;
- Chain = LowerCall(Chain, Callee, CallConv, isVarArg, doesNotRet, isTailCall,
- Outs, OutVals, Ins, dl, DAG, InVals);
+ CLI.Chain = LowerCall(CLI, InVals);
// Verify that the target's LowerCall behaved as expected.
- assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
+ assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
"LowerCall didn't return a valid chain!");
- assert((!isTailCall || InVals.empty()) &&
+ assert((!CLI.IsTailCall || InVals.empty()) &&
"LowerCall emitted a return value for a tail call!");
- assert((isTailCall || InVals.size() == Ins.size()) &&
+ assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) &&
"LowerCall didn't emit the correct number of values!");
// For a tail call, the return value is merely live-out and there aren't
// any nodes in the DAG representing it. Return a special value to
// indicate that a tail call has been emitted and no more Instructions
// should be processed in the current block.
- if (isTailCall) {
- DAG.setRoot(Chain);
+ if (CLI.IsTailCall) {
+ CLI.DAG.setRoot(CLI.Chain);
return std::make_pair(SDValue(), SDValue());
}
- DEBUG(for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ DEBUG(for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
assert(InVals[i].getNode() &&
"LowerCall emitted a null value!");
- assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
+ assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
"LowerCall emitted a value with the wrong type!");
});
// Collect the legal value parts into potentially illegal values
// that correspond to the original function's return values.
ISD::NodeType AssertOp = ISD::DELETED_NODE;
- if (RetSExt)
+ if (CLI.RetSExt)
AssertOp = ISD::AssertSext;
- else if (RetZExt)
+ else if (CLI.RetZExt)
AssertOp = ISD::AssertZext;
SmallVector<SDValue, 4> ReturnValues;
unsigned CurReg = 0;
for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
EVT VT = RetTys[I];
- EVT RegisterVT = getRegisterType(RetTy->getContext(), VT);
- unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT);
+ EVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
- ReturnValues.push_back(getCopyFromParts(DAG, dl, &InVals[CurReg],
+ ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
NumRegs, RegisterVT, VT,
AssertOp));
CurReg += NumRegs;
@@ -6521,12 +6513,12 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy,
// such a node, so we just return a null return value in that case. In
// that case, nothing will actually look at the value.
if (ReturnValues.empty())
- return std::make_pair(SDValue(), Chain);
+ return std::make_pair(SDValue(), CLI.Chain);
- SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl,
- DAG.getVTList(&RetTys[0], RetTys.size()),
+ SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
+ CLI.DAG.getVTList(&RetTys[0], RetTys.size()),
&ReturnValues[0], ReturnValues.size());
- return std::make_pair(Res, Chain);
+ return std::make_pair(Res, CLI.Chain);
}
void TargetLowering::LowerOperationWrapper(SDNode *N,
@@ -6746,7 +6738,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
// Note down frame index.
if (FrameIndexSDNode *FI =
- dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
+ dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
FuncInfo->setArgumentFrameIndex(I, FI->getIndex());
SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues,
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 8393b41..4090002 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -180,17 +180,6 @@ private:
typedef std::vector<CaseRec> CaseRecVector;
- /// The comparison function for sorting the switch case values in the vector.
- /// WARNING: Case ranges should be disjoint!
- struct CaseCmp {
- bool operator()(const Case &C1, const Case &C2) {
- assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High));
- const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
- const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
- return CI1->getValue().slt(CI2->getValue());
- }
- };
-
struct CaseBitsCmp {
bool operator()(const CaseBits &C1, const CaseBits &C2) {
return C1.Bits > C2.Bits;
@@ -351,7 +340,7 @@ public:
void clear();
/// clearDanglingDebugInfo - Clear the dangling debug information
- /// map. This function is seperated from the clear so that debug
+ /// map. This function is separated from the clear so that debug
/// information that is dangling in a basic block can be properly
/// resolved in a different basic block. This allows the
/// SelectionDAG to resolve dangling debug information attached
@@ -531,6 +520,7 @@ private:
void visitPHI(const PHINode &I);
void visitCall(const CallInst &I);
bool visitMemCmpCall(const CallInst &I);
+ bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
void visitAtomicLoad(const LoadInst &I);
void visitAtomicStore(const StoreInst &I);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index f981afb..13cd011 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "ScheduleDAGSDNodes.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/Assembly/Writer.h"
@@ -19,7 +20,6 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -100,6 +100,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP";
case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP";
case ISD::ConstantPool: return "ConstantPool";
+ case ISD::TargetIndex: return "TargetIndex";
case ISD::ExternalSymbol: return "ExternalSymbol";
case ISD::BlockAddress: return "BlockAddress";
case ISD::INTRINSIC_WO_CHAIN:
@@ -265,6 +266,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::STACKSAVE: return "stacksave";
case ISD::STACKRESTORE: return "stackrestore";
case ISD::TRAP: return "trap";
+ case ISD::DEBUGTRAP: return "debugtrap";
// Bit manipulation
case ISD::BSWAP: return "bswap";
@@ -408,6 +410,10 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << " " << offset;
if (unsigned int TF = CP->getTargetFlags())
OS << " [TF=" << TF << ']';
+ } else if (const TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(this)) {
+ OS << "<" << TI->getIndex() << '+' << TI->getOffset() << ">";
+ if (unsigned TF = TI->getTargetFlags())
+ OS << " [TF=" << TF << ']';
} else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) {
OS << "<";
const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 605509b..4e5e3ba 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -14,12 +14,8 @@
#define DEBUG_TYPE "isel"
#include "ScheduleDAGSDNodes.h"
#include "SelectionDAGBuilder.h"
-#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/InlineAsm.h"
#include "llvm/Instructions.h"
@@ -27,7 +23,10 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCStrategy.h"
#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -38,6 +37,7 @@
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -263,8 +263,6 @@ void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
// SelectionDAGISel code
//===----------------------------------------------------------------------===//
-void SelectionDAGISel::ISelUpdater::anchor() { }
-
SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm,
CodeGenOpt::Level OL) :
MachineFunctionPass(ID), TM(tm), TLI(*tm.getTargetLowering()),
@@ -451,9 +449,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
}
}
}
- done:;
}
+ done:
// Determine if there is a call to setjmp in the machine function.
MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice());
@@ -468,8 +466,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
// If To is also scheduled to be replaced, find what its ultimate
// replacement is.
for (;;) {
- DenseMap<unsigned, unsigned>::iterator J =
- FuncInfo->RegFixups.find(To);
+ DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
if (J == E) break;
To = J->second;
}
@@ -703,6 +700,25 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
CurDAG->clear();
}
+namespace {
+/// ISelUpdater - helper class to handle updates of the instruction selection
+/// graph.
+class ISelUpdater : public SelectionDAG::DAGUpdateListener {
+ SelectionDAG::allnodes_iterator &ISelPosition;
+public:
+ ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
+ : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
+
+ /// NodeDeleted - Handle nodes deleted from the graph. If the node being
+ /// deleted is the current ISelPosition node, update ISelPosition.
+ ///
+ virtual void NodeDeleted(SDNode *N, SDNode *E) {
+ if (ISelPosition == SelectionDAG::allnodes_iterator(N))
+ ++ISelPosition;
+ }
+};
+} // end anonymous namespace
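// [Editor's note -- a sketch assuming that SelectionDAG::DAGUpdateListener
// now registers itself with the DAG in its constructor and detaches in its
// destructor] A stack object is then all a caller needs, which is why
// RemoveDeadNode and friends below no longer take a listener argument:
{
  ISelUpdater ISU(*CurDAG, ISelPosition); // deletions now advance ISelPosition
  // ... instruction selection loop ...
}                                         // ~ISU() detaches from the DAG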
+
void SelectionDAGISel::DoInstructionSelection() {
DEBUG(errs() << "===== Instruction selection begins: BB#"
<< FuncInfo->MBB->getNumber()
@@ -719,9 +735,13 @@ void SelectionDAGISel::DoInstructionSelection() {
// a reference to the root node, preventing it from being deleted,
// and tracking any changes of the root.
HandleSDNode Dummy(CurDAG->getRoot());
- ISelPosition = SelectionDAG::allnodes_iterator(CurDAG->getRoot().getNode());
+ SelectionDAG::allnodes_iterator ISelPosition(CurDAG->getRoot().getNode());
++ISelPosition;
+ // Make sure that ISelPosition gets properly updated when nodes are deleted
+ // in calls made from this function.
+ ISelUpdater ISU(*CurDAG, ISelPosition);
+
// The AllNodes list is now topological-sorted. Visit the
// nodes by starting at the end of the list (the root of the
// graph) and proceeding back toward the beginning (the entry
@@ -748,10 +768,8 @@ void SelectionDAGISel::DoInstructionSelection() {
// If after the replacement this node is not used any more,
// remove this dead node.
- if (Node->use_empty()) { // Don't delete EntryToken, etc.
- ISelUpdater ISU(ISelPosition);
- CurDAG->RemoveDeadNode(Node, &ISU);
- }
+ if (Node->use_empty()) // Don't delete EntryToken, etc.
+ CurDAG->RemoveDeadNode(Node);
}
CurDAG->setRoot(Dummy.getValue());
@@ -961,7 +979,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// Initialize the Fast-ISel state, if needed.
FastISel *FastIS = 0;
if (TM.Options.EnableFastISel)
- FastIS = TLI.createFastISel(*FuncInfo);
+ FastIS = TLI.createFastISel(*FuncInfo, LibInfo);
// Iterate over all basic blocks in the function.
ReversePostOrderTraversal<const Function*> RPOT(&Fn);
@@ -1680,8 +1698,6 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
bool isMorphNodeTo) {
SmallVector<SDNode*, 4> NowDeadNodes;
- ISelUpdater ISU(ISelPosition);
-
// Now that all the normal results are replaced, we replace the chain and
// glue results if present.
if (!ChainNodesMatched.empty()) {
@@ -1705,7 +1721,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
if (ChainVal.getValueType() == MVT::Glue)
ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2);
assert(ChainVal.getValueType() == MVT::Other && "Not a chain?");
- CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain, &ISU);
+ CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
// If the node became dead and we haven't already seen it, delete it.
if (ChainNode->use_empty() &&
@@ -1728,7 +1744,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
assert(FRN->getValueType(FRN->getNumValues()-1) == MVT::Glue &&
"Doesn't have a glue result");
CurDAG->ReplaceAllUsesOfValueWith(SDValue(FRN, FRN->getNumValues()-1),
- InputGlue, &ISU);
+ InputGlue);
// If the node became dead and we haven't already seen it, delete it.
if (FRN->use_empty() &&
@@ -1738,7 +1754,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
}
if (!NowDeadNodes.empty())
- CurDAG->RemoveDeadNodes(NowDeadNodes, &ISU);
+ CurDAG->RemoveDeadNodes(NowDeadNodes);
DEBUG(errs() << "ISEL: Match complete!\n");
}
@@ -1759,7 +1775,7 @@ enum ChainResult {
/// The walk we do here is guaranteed to be small because we quickly get down to
/// already selected nodes "below" us.
static ChainResult
-WalkChainUsers(SDNode *ChainedNode,
+WalkChainUsers(const SDNode *ChainedNode,
SmallVectorImpl<SDNode*> &ChainedNodesInPattern,
SmallVectorImpl<SDNode*> &InteriorChainedNodes) {
ChainResult Result = CR_Simple;
@@ -1992,14 +2008,14 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SelectionDAGISel &SDISel) {
+ const SelectionDAGISel &SDISel) {
return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]);
}
/// CheckNodePredicate - Implements OP_CheckNodePredicate.
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SelectionDAGISel &SDISel, SDNode *N) {
+ const SelectionDAGISel &SDISel, SDNode *N) {
return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]);
}
@@ -2062,7 +2078,7 @@ CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SDValue N, SelectionDAGISel &SDISel) {
+ SDValue N, const SelectionDAGISel &SDISel) {
int64_t Val = MatcherTable[MatcherIndex++];
if (Val & 128)
Val = GetVBR(Val, MatcherTable, MatcherIndex);
@@ -2075,7 +2091,7 @@ CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SDValue N, SelectionDAGISel &SDISel) {
+ SDValue N, const SelectionDAGISel &SDISel) {
int64_t Val = MatcherTable[MatcherIndex++];
if (Val & 128)
Val = GetVBR(Val, MatcherTable, MatcherIndex);
@@ -2094,7 +2110,8 @@ CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
/// MatcherIndex to continue with.
static unsigned IsPredicateKnownToFail(const unsigned char *Table,
unsigned Index, SDValue N,
- bool &Result, SelectionDAGISel &SDISel,
+ bool &Result,
+ const SelectionDAGISel &SDISel,
SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
switch (Table[Index++]) {
default:
@@ -2759,9 +2776,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
(SDNode*) 0));
}
- } else {
+ } else if (NodeToMatch->getOpcode() != ISD::DELETED_NODE) {
Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops.data(), Ops.size(),
EmitNodeInfo);
+ } else {
+ // NodeToMatch was eliminated by CSE when the target changed the DAG.
+ // We will visit the equivalent node later.
+ DEBUG(dbgs() << "Node was eliminated by CSE\n");
+ return 0;
}
// If the node had chain/glue results, update our notion of the current
@@ -2959,6 +2981,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) {
N->getOpcode() != ISD::INTRINSIC_WO_CHAIN &&
N->getOpcode() != ISD::INTRINSIC_VOID) {
N->printrFull(Msg, CurDAG);
+ Msg << "\nIn function: " << MF->getFunction()->getName();
} else {
bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
unsigned iid =
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index 6cde05a..173ffac 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -13,13 +13,13 @@
#include "ScheduleDAGSDNodes.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e341e15..f0c50c1 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -32,13 +33,6 @@
#include <cctype>
using namespace llvm;
-/// We are in the process of implementing a new TypeLegalization action
-/// - the promotion of vector elements. This feature is disabled by default
-/// and only enabled using this flag.
-static cl::opt<bool>
-AllowPromoteIntElem("promote-elements", cl::Hidden, cl::init(true),
- cl::desc("Allow promotion of integer vector element types"));
-
/// InitLibcallNames - Set default libcall names.
///
static void InitLibcallNames(const char **Names) {
@@ -521,8 +515,7 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
/// NOTE: The constructor takes ownership of TLOF.
TargetLowering::TargetLowering(const TargetMachine &tm,
const TargetLoweringObjectFile *tlof)
- : TM(tm), TD(TM.getTargetData()), TLOF(*tlof),
- mayPromoteElements(AllowPromoteIntElem) {
+ : TM(tm), TD(TM.getTargetData()), TLOF(*tlof) {
// All operations default to being supported.
memset(OpActions, 0, sizeof(OpActions));
memset(LoadExtActions, 0, sizeof(LoadExtActions));
@@ -604,6 +597,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
IntDivIsCheap = false;
Pow2DivIsCheap = false;
JumpIsExpensive = false;
+ predictableSelectIsExpensive = false;
StackPointerRegisterToSaveRestore = 0;
ExceptionPointerRegister = 0;
ExceptionSelectorRegister = 0;
@@ -618,6 +612,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
MinStackArgumentAlignment = 1;
ShouldFoldAtomicFences = false;
InsertFencesForAtomic = false;
+ SupportJumpTables = true;
InitLibcallNames(LibcallRoutineNames);
InitCmpLibcallCCs(CmpLibcallCCs);
@@ -708,42 +703,34 @@ bool TargetLowering::isLegalRC(const TargetRegisterClass *RC) const {
return false;
}
-/// hasLegalSuperRegRegClasses - Return true if the specified register class
-/// has one or more super-reg register classes that are legal.
-bool
-TargetLowering::hasLegalSuperRegRegClasses(const TargetRegisterClass *RC) const{
- if (*RC->superregclasses_begin() == 0)
- return false;
- for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(),
- E = RC->superregclasses_end(); I != E; ++I) {
- const TargetRegisterClass *RRC = *I;
- if (isLegalRC(RRC))
- return true;
- }
- return false;
-}
-
/// findRepresentativeClass - Return the largest legal super-reg register class
/// of the register class for the specified type and its associated "cost".
std::pair<const TargetRegisterClass*, uint8_t>
TargetLowering::findRepresentativeClass(EVT VT) const {
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const TargetRegisterClass *RC = RegClassForVT[VT.getSimpleVT().SimpleTy];
if (!RC)
return std::make_pair(RC, 0);
+
+ // Compute the set of all super-register classes.
+ BitVector SuperRegRC(TRI->getNumRegClasses());
+ for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI)
+ SuperRegRC.setBitsInMask(RCI.getMask());
+
+ // Find the first legal register class with the largest spill size.
const TargetRegisterClass *BestRC = RC;
- for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(),
- E = RC->superregclasses_end(); I != E; ++I) {
- const TargetRegisterClass *RRC = *I;
- if (RRC->isASubClass() || !isLegalRC(RRC))
+ for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) {
+ const TargetRegisterClass *SuperRC = TRI->getRegClass(i);
+ // We want the largest possible spill size.
+ if (SuperRC->getSize() <= BestRC->getSize())
+ continue;
+ if (!isLegalRC(SuperRC))
continue;
- if (!hasLegalSuperRegRegClasses(RRC))
- return std::make_pair(RRC, 1);
- BestRC = RRC;
+ BestRC = SuperRC;
}
return std::make_pair(BestRC, 1);
}
-
/// computeRegisterProperties - Once all of the register classes are added,
/// this allows us to compute derived properties we expose.
void TargetLowering::computeRegisterProperties() {
@@ -835,11 +822,8 @@ void TargetLowering::computeRegisterProperties() {
unsigned NElts = VT.getVectorNumElements();
if (NElts != 1) {
bool IsLegalWiderType = false;
- // If we allow the promotion of vector elements using a flag,
- // then return TypePromoteInteger on vector elements.
// First try to promote the elements of integer vectors. If no legal
// promotion was found, fall back to the widen-vector method.
- if (mayPromoteElements)
for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
EVT SVT = (MVT::SimpleValueType)nVT;
// Promote vectors of integers to vectors with the same number
@@ -940,9 +924,12 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
unsigned NumElts = VT.getVectorNumElements();
// If there is a wider vector type with the same element type as this one,
- // we should widen to that legal vector type. This handles things like
- // <2 x float> -> <4 x float>.
- if (NumElts != 1 && getTypeAction(Context, VT) == TypeWidenVector) {
+ // or a promoted vector type with the same number of (wider) elements,
+ // then we should convert to that legal vector type.
+ // This handles things like <2 x float> -> <4 x float> and
+ // <4 x i1> -> <4 x i32>.
+ LegalizeTypeAction TA = getTypeAction(Context, VT);
+ if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) {
RegisterVT = getTypeToTransformTo(Context, VT);
if (isTypeLegal(RegisterVT)) {
IntermediateVT = RegisterVT;
@@ -1000,13 +987,11 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
/// TODO: Move this out of TargetLowering.cpp.
void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
- const TargetLowering &TLI,
- SmallVectorImpl<uint64_t> *Offsets) {
+ const TargetLowering &TLI) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, ReturnType, ValueVTs);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0) return;
- unsigned Offset = 0;
for (unsigned j = 0, f = NumValues; j != f; ++j) {
EVT VT = ValueVTs[j];
@@ -1029,8 +1014,6 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT);
EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT);
- unsigned PartSize = TLI.getTargetData()->getTypeAllocSize(
- PartVT.getTypeForEVT(ReturnType->getContext()));
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
@@ -1045,10 +1028,6 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
for (unsigned i = 0; i < NumParts; ++i) {
Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true));
- if (Offsets) {
- Offsets->push_back(Offset);
- Offset += PartSize;
- }
}
}
}
@@ -2019,7 +1998,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
- // Make sure we're not loosing bits from the constant.
+ // Make sure we're not losing bits from the constant.
if (MinBits < C1.getBitWidth() && MinBits > C1.getActiveBits()) {
EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
@@ -2343,6 +2322,55 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
}
+
+ if (C1.getMinSignedBits() <= 64 &&
+ !isLegalICmpImmediate(C1.getSExtValue())) {
+ // (X & -256) == 256 -> (X >> 8) == 1
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+ if (ConstantSDNode *AndRHS =
+ dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ const APInt &AndRHSC = AndRHS->getAPIntValue();
+ if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
+ unsigned ShiftBits = AndRHSC.countTrailingZeros();
+ EVT ShiftTy = DCI.isBeforeLegalize() ?
+ getPointerTy() : getShiftAmountTy(N0.getValueType());
+ EVT CmpTy = N0.getValueType();
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0),
+ DAG.getConstant(ShiftBits, ShiftTy));
+ SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), CmpTy);
+ return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
+ }
+ }
+ } else if (Cond == ISD::SETULT || Cond == ISD::SETUGE ||
+ Cond == ISD::SETULE || Cond == ISD::SETUGT) {
+ bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT);
+ // X < 0x100000000 -> (X >> 32) < 1
+ // X >= 0x100000000 -> (X >> 32) >= 1
+ // X <= 0x0ffffffff -> (X >> 32) < 1
+ // X > 0x0ffffffff -> (X >> 32) >= 1
+ unsigned ShiftBits;
+ APInt NewC = C1;
+ ISD::CondCode NewCond = Cond;
+ if (AdjOne) {
+ ShiftBits = C1.countTrailingOnes();
+ NewC = NewC + 1;
+ NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+ } else {
+ ShiftBits = C1.countTrailingZeros();
+ }
+ NewC = NewC.lshr(ShiftBits);
+ if (ShiftBits && isLegalICmpImmediate(NewC.getSExtValue())) {
+ EVT ShiftTy = DCI.isBeforeLegalize() ?
+ getPointerTy() : getShiftAmountTy(N0.getValueType());
+ EVT CmpTy = N0.getValueType();
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0,
+ DAG.getConstant(ShiftBits, ShiftTy));
+ SDValue CmpRHS = DAG.getConstant(NewC, CmpTy);
+ return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond);
+ }
+ }
+ }
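// [Editor's sketch -- not part of the patch] Verifying the first rewrite for
// AndRHSC = -256, C1 = 256 on i32; assumes llvm/ADT/APInt.h and <cassert>:
APInt AndRHSC(32, -256, /*isSigned=*/true), C1(32, 256);
assert((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1);
unsigned ShiftBits = AndRHSC.countTrailingZeros();  // 8
assert(C1.lshr(ShiftBits) == 1);                    // so: (X >> 8) == 1
// Both forms accept exactly X in [256, 511], but the immediate 1 is legal
// on more targets than 256.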
}
if (isa<ConstantFPSDNode>(N0.getNode())) {
@@ -2411,25 +2439,33 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
if (N0 == N1) {
+ // The sext(setcc()) => setcc() optimization relies on the appropriate
+ // constant being emitted.
+ uint64_t EqVal;
+ switch (getBooleanContents(N0.getValueType().isVector())) {
+ case UndefinedBooleanContent:
+ case ZeroOrOneBooleanContent:
+ EqVal = ISD::isTrueWhenEqual(Cond);
+ break;
+ case ZeroOrNegativeOneBooleanContent:
+ EqVal = ISD::isTrueWhenEqual(Cond) ? -1 : 0;
+ break;
+ }
+
// We can always fold X == X for integer setcc's.
if (N0.getValueType().isInteger()) {
- switch (getBooleanContents(N0.getValueType().isVector())) {
- case UndefinedBooleanContent:
- case ZeroOrOneBooleanContent:
- return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT);
- case ZeroOrNegativeOneBooleanContent:
- return DAG.getConstant(ISD::isTrueWhenEqual(Cond) ? -1 : 0, VT);
- }
+ return DAG.getConstant(EqVal, VT);
}
unsigned UOF = ISD::getUnorderedFlavor(Cond);
if (UOF == 2) // FP operators that are undefined on NaNs.
- return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT);
+ return DAG.getConstant(EqVal, VT);
if (UOF == unsigned(ISD::isTrueWhenEqual(Cond)))
- return DAG.getConstant(UOF, VT);
+ return DAG.getConstant(EqVal, VT);
// Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO
// if it is not already.
ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
- if (NewCond != Cond)
+ if (NewCond != Cond && (DCI.isBeforeLegalizeOps() ||
+ getCondCodeAction(NewCond, N0.getValueType()) == Legal))
return DAG.getSetCC(dl, VT, N0, N1, NewCond);
}
@@ -2998,10 +3034,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
if (OpInfo.ConstraintVT != Input.ConstraintVT) {
- std::pair<unsigned, const TargetRegisterClass*> MatchRC =
- getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT);
- std::pair<unsigned, const TargetRegisterClass*> InputRC =
- getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT);
+ std::pair<unsigned, const TargetRegisterClass*> MatchRC =
+ getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
+ std::pair<unsigned, const TargetRegisterClass*> InputRC =
+ getRegForInlineAsmConstraint(Input.ConstraintCode,
+ Input.ConstraintVT);
if ((OpInfo.ConstraintVT.isInteger() !=
Input.ConstraintVT.isInteger()) ||
(MatchRC.second != InputRC.second)) {
diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp
index 0016047..8a6b120 100644
--- a/lib/CodeGen/ShadowStackGC.cpp
+++ b/lib/CodeGen/ShadowStackGC.cpp
@@ -26,13 +26,13 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "shadowstackgc"
-#include "llvm/CodeGen/GCs.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IRBuilder.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Module.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/GCs.h"
#include "llvm/Support/CallSite.h"
-#include "llvm/Support/IRBuilder.h"
using namespace llvm;
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 9a86f32..980bd74 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -13,28 +13,28 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "sjljehprepare"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <set>
using namespace llvm;
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index 26cf259..c8c3fb3 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -62,7 +62,6 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
assert(mi2iMap.empty() &&
"MachineInstr -> Index mapping non-empty at initial numbering?");
- functionSize = 0;
unsigned index = 0;
MBBRanges.resize(mf->getNumBlockIDs());
idx2MBBMap.reserve(mf->size());
@@ -89,8 +88,6 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
// Save this base index in the maps.
mi2iMap.insert(std::make_pair(mi, SlotIndex(&indexList.back(),
SlotIndex::Slot_Block)));
-
- ++functionSize;
}
// We insert one blank instructions between basic blocks.
diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp
index 6f33f54..320128a 100644
--- a/lib/CodeGen/SpillPlacement.cpp
+++ b/lib/CodeGen/SpillPlacement.cpp
@@ -207,6 +207,17 @@ void SpillPlacement::activate(unsigned n) {
return;
ActiveNodes->set(n);
nodes[n].clear();
+
+ // Very large bundles usually come from big switches, indirect branches,
+ // landing pads, or loops with many 'continue' statements. It is difficult to
+ // allocate registers when so many different blocks are involved.
+ //
+ // Give a small negative bias to large bundles such that 1/32 of the
+ // connected blocks need to be interested before we consider expanding the
+ // region through the bundle. This helps compile time by limiting the number
+ // of blocks visited and the number of links in the Hopfield network.
+ if (bundles->getBlocks(n).size() > 100)
+ nodes[n].Bias = -0.0625f;
}
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 9959f74..4a2b7ec 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -345,9 +345,11 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
Values.clear();
// Reset the LiveRangeCalc instances needed for this spill mode.
- LRCalc[0].reset(&VRM.getMachineFunction());
+ LRCalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
if (SpillMode)
- LRCalc[1].reset(&VRM.getMachineFunction());
+ LRCalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
// We don't need an AliasAnalysis since we will only be performing
// cheap-as-a-copy remats anyway.
@@ -650,7 +652,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
// Adjust RegAssign if a register assignment is killed at VNI->def. We
// want to avoid calculating the live range of the source register if
// possible.
- AssignI.find(VNI->def.getPrevSlot());
+ AssignI.find(Def.getPrevSlot());
if (!AssignI.valid() || AssignI.start() >= Def)
continue;
// If MI doesn't kill the assigned register, just leave it.
@@ -737,6 +739,8 @@ void SplitEditor::hoistCopiesForSize() {
for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
VI != VE; ++VI) {
VNInfo *VNI = *VI;
+ if (VNI->isUnused())
+ continue;
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
assert(ParentVNI && "Parent not live at complement def");
@@ -810,6 +814,8 @@ void SplitEditor::hoistCopiesForSize() {
for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
VI != VE; ++VI) {
VNInfo *VNI = *VI;
+ if (VNI->isUnused())
+ continue;
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
const DomPair &Dom = NearestDom[ParentVNI->id];
if (!Dom.first || Dom.second == VNI->def)
@@ -924,11 +930,9 @@ bool SplitEditor::transferValues() {
DEBUG(dbgs() << '\n');
}
- LRCalc[0].calculateValues(LIS.getSlotIndexes(), &MDT,
- &LIS.getVNInfoAllocator());
+ LRCalc[0].calculateValues();
if (SpillMode)
- LRCalc[1].calculateValues(LIS.getSlotIndexes(), &MDT,
- &LIS.getVNInfoAllocator());
+ LRCalc[1].calculateValues();
return Skipped;
}
@@ -953,8 +957,7 @@ void SplitEditor::extendPHIKillRanges() {
if (Edit->getParent().liveAt(LastUse)) {
assert(RegAssign.lookup(LastUse) == RegIdx &&
"Different register assignment in phi predecessor");
- LRC.extend(LI, End,
- LIS.getSlotIndexes(), &MDT, &LIS.getVNInfoAllocator());
+ LRC.extend(LI, End);
}
}
}
@@ -1004,8 +1007,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
} else
Idx = Idx.getRegSlot(true);
- getLRCalc(RegIdx).extend(LI, Idx.getNextSlot(), LIS.getSlotIndexes(),
- &MDT, &LIS.getVNInfoAllocator());
+ getLRCalc(RegIdx).extend(LI, Idx.getNextSlot());
}
}
@@ -1049,8 +1051,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
if (ParentVNI->isUnused())
continue;
unsigned RegIdx = RegAssign.lookup(ParentVNI->def);
- VNInfo *VNI = defValue(RegIdx, ParentVNI, ParentVNI->def);
- VNI->setIsPHIDef(ParentVNI->isPHIDef());
+ defValue(RegIdx, ParentVNI, ParentVNI->def);
// Force rematted values to be recomputed everywhere.
// The new live ranges may be truncated.
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 43a6ad8..f1eab1f 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -28,6 +28,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/Triple.h"
using namespace llvm;
// SSPBufferSize - The lower bound for a buffer to be considered for stack
@@ -46,7 +47,7 @@ namespace {
Function *F;
Module *M;
- DominatorTree* DT;
+ DominatorTree *DT;
/// InsertStackProtectors - Insert code into the prologue and epilogue of
/// the function.
@@ -70,8 +71,8 @@ namespace {
}
StackProtector(const TargetLowering *tli)
: FunctionPass(ID), TLI(tli) {
- initializeStackProtectorPass(*PassRegistry::getPassRegistry());
- }
+ initializeStackProtectorPass(*PassRegistry::getPassRegistry());
+ }
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<DominatorTree>();
@@ -95,7 +96,7 @@ bool StackProtector::runOnFunction(Function &Fn) {
DT = getAnalysisIfAvailable<DominatorTree>();
if (!RequiresStackProtector()) return false;
-
+
return InsertStackProtectors();
}
@@ -111,6 +112,8 @@ bool StackProtector::RequiresStackProtector() const {
return false;
const TargetData *TD = TLI->getTargetData();
+ const TargetMachine &TM = TLI->getTargetMachine();
+ Triple Trip(TM.getTargetTriple());
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
BasicBlock *BB = I;
@@ -123,11 +126,17 @@ bool StackProtector::RequiresStackProtector() const {
// protectors.
return true;
- if (ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType()))
+ if (ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) {
+ // If we're on a non-Darwin platform, don't add stack protectors
+ // unless the array is a character array.
+ if (!Trip.isOSDarwin() && !AT->getElementType()->isIntegerTy(8))
+ continue;
+
// If an array has at least SSPBufferSize bytes of allocated space,
// then we emit stack protectors.
if (SSPBufferSize <= TD->getTypeAllocSize(AT))
return true;
+ }
}
}
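
The loop above now applies two filters before requesting a protector for an alloca'd array. A standalone sketch of the combined predicate (plain C++, simplified from the pass; the array sizes and the SSPBufferSize of 8 are assumed values):

#include <cstdio>

struct ArrayInfo {
  const char *Desc;
  bool ElementIsInt8; // a character array?
  unsigned AllocSize; // bytes
};

static bool needsProtector(const ArrayInfo &A, bool IsDarwin,
                           unsigned SSPBufferSize) {
  // Non-Darwin targets only consider character arrays.
  if (!IsDarwin && !A.ElementIsInt8)
    return false;
  // Arrays of at least SSPBufferSize bytes get a protector.
  return SSPBufferSize <= A.AllocSize;
}

int main() {
  const ArrayInfo Arrays[] = {{"char buf[64]", true, 64},
                              {"int  vec[64]", false, 256}};
  for (const ArrayInfo &A : Arrays)
    std::printf("%s: elf=%d darwin=%d\n", A.Desc,
                needsProtector(A, false, 8), needsProtector(A, true, 8));
}
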
@@ -159,17 +168,17 @@ bool StackProtector::InsertStackProtectors() {
// StackGuardSlot = alloca i8*
// StackGuard = load __stack_chk_guard
// call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
- //
+ //
PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
unsigned AddressSpace, Offset;
if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
Constant *OffsetVal =
ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
-
+
StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal,
PointerType::get(PtrTy, AddressSpace));
} else {
- StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
+ StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
}
BasicBlock &Entry = F->getEntryBlock();
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 1e940b1..20da36e 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -46,7 +46,6 @@ STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated");
namespace {
class StackSlotColoring : public MachineFunctionPass {
- bool ColorWithRegs;
LiveStacks* LS;
MachineFrameInfo *MFI;
const TargetInstrInfo *TII;
@@ -82,7 +81,7 @@ namespace {
public:
static char ID; // Pass identification
StackSlotColoring() :
- MachineFunctionPass(ID), ColorWithRegs(false), NextColor(-1) {
+ MachineFunctionPass(ID), NextColor(-1) {
initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
}
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
index c6fdc73..5b06195 100644
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ b/lib/CodeGen/StrongPHIElimination.cpp
@@ -672,8 +672,8 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
LiveInterval &SrcInterval = LI->getInterval(SrcReg);
SlotIndex PredIndex = LI->getMBBEndIdx(PredBB);
VNInfo *SrcVNI = SrcInterval.getVNInfoBefore(PredIndex);
+ (void)SrcVNI;
assert(SrcVNI);
- SrcVNI->setHasPHIKill(true);
continue;
}
@@ -744,7 +744,6 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
VNInfo *DestVNI = DestLI.getVNInfoAt(PHIIndex.getRegSlot());
assert(DestVNI);
- DestVNI->setIsPHIDef(true);
// Prior to PHI elimination, the live ranges of PHIs begin at their defining
// instruction. After PHI elimination, PHI instructions are replaced by VNs
@@ -777,7 +776,6 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
SlotIndex DestCopyIndex = LI->getInstructionIndex(CopyInstr);
VNInfo *CopyVNI = CopyLI.getNextValue(MBBStartIndex,
LI->getVNInfoAllocator());
- CopyVNI->setIsPHIDef(true);
CopyLI.addRange(LiveRange(MBBStartIndex,
DestCopyIndex.getRegSlot(),
CopyVNI));
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 8ebfbca..a813fa6 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -20,12 +20,15 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
@@ -57,8 +60,10 @@ namespace {
/// TailDuplicatePass - Perform tail duplication.
class TailDuplicatePass : public MachineFunctionPass {
const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
MachineModuleInfo *MMI;
MachineRegisterInfo *MRI;
+ OwningPtr<RegScavenger> RS;
bool PreRegAlloc;
// SSAUpdateVRs - A list of virtual registers for which to update SSA form.
@@ -124,9 +129,13 @@ INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication",
bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
MRI = &MF.getRegInfo();
MMI = getAnalysisIfAvailable<MachineModuleInfo>();
PreRegAlloc = MRI->isSSA();
+ RS.reset();
+ if (MRI->tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF))
+ RS.reset(new RegScavenger());
bool MadeChange = false;
while (TailDuplicateBlocks(MF))
@@ -272,8 +281,8 @@ TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB,
continue;
unsigned Dst = Copy->getOperand(0).getReg();
unsigned Src = Copy->getOperand(1).getReg();
- MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src);
- if (++UI == MRI->use_end()) {
+ if (MRI->hasOneNonDBGUse(Src) &&
+ MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) {
// Copy is the only use. Do trivial copy propagation here.
MRI->replaceRegWith(Dst, Src);
Copy->eraseFromParent();
@@ -429,8 +438,10 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
AddSSAUpdateEntry(Reg, NewReg, PredBB);
} else {
DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
- if (VI != LocalVRMap.end())
+ if (VI != LocalVRMap.end()) {
MO.setReg(VI->second);
+ MRI->constrainRegClass(VI->second, MRI->getRegClass(Reg));
+ }
}
}
PredBB->insert(PredBB->instr_end(), NewMI);
@@ -775,6 +786,23 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
// Remove PredBB's unconditional branch.
TII->RemoveBranch(*PredBB);
+ if (RS && !TailBB->livein_empty()) {
+ // Update PredBB livein.
+ RS->enterBasicBlock(PredBB);
+ if (!PredBB->empty())
+ RS->forward(prior(PredBB->end()));
+ BitVector RegsLiveAtExit(TRI->getNumRegs());
+ RS->getRegsUsed(RegsLiveAtExit, false);
+ for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(),
+ E = TailBB->livein_end(); I != E; ++I) {
+ if (!RegsLiveAtExit[*I])
+ // If a register is live into the tail block but not live at the end
+ // of the predecessor, it must be added to the predecessor's live-in
+ // list.
+ PredBB->addLiveIn(*I);
+ }
+ }
+
// Clone the contents of TailBB into PredBB.
DenseMap<unsigned, unsigned> LocalVRMap;
SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
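
The scavenger-based fixup above amounts to a set difference: registers live into TailBB that are not live at PredBB's exit become new PredBB live-ins. A standalone sketch with assumed register numbers:

#include <cstdio>
#include <set>

int main() {
  std::set<unsigned> TailLiveIns = {1, 2, 3}; // hypothetical phys regs
  std::set<unsigned> LiveAtPredExit = {2};    // what RS->getRegsUsed found
  for (unsigned Reg : TailLiveIns)
    if (!LiveAtPredExit.count(Reg))
      std::printf("PredBB->addLiveIn(%u)\n", Reg); // adds 1 and 3
}
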
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index 2beb928..ddee6b2 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -501,6 +501,14 @@ CreateTargetHazardRecognizer(const TargetMachine *TM,
return new ScheduleHazardRecognizer();
}
+// Default implementation of CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfoImpl::
+CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ return (ScheduleHazardRecognizer *)
+ new ScoreboardHazardRecognizer(II, DAG, "misched");
+}
+
// Default implementation of CreateTargetPostRAHazardRecognizer.
ScheduleHazardRecognizer *TargetInstrInfoImpl::
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
@@ -509,6 +517,10 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
new ScoreboardHazardRecognizer(II, DAG, "post-RA-sched");
}
+//===----------------------------------------------------------------------===//
+// SelectionDAG latency interface.
+//===----------------------------------------------------------------------===//
+
int
TargetInstrInfoImpl::getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
@@ -537,3 +549,201 @@ int TargetInstrInfoImpl::getInstrLatency(const InstrItineraryData *ItinData,
return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass());
}
+//===----------------------------------------------------------------------===//
+// MachineInstr latency interface.
+//===----------------------------------------------------------------------===//
+
+unsigned
+TargetInstrInfoImpl::getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr *MI) const {
+ if (!ItinData || ItinData->isEmpty())
+ return 1;
+
+ unsigned Class = MI->getDesc().getSchedClass();
+ int UOps = ItinData->Itineraries[Class].NumMicroOps;
+ if (UOps >= 0)
+ return UOps;
+
+ // The # of u-ops is dynamically determined. The specific target should
+ // override this function to return the right number.
+ return 1;
+}
+
+/// Return the default expected latency for a def based on its opcode.
+unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel,
+ const MachineInstr *DefMI) const {
+ if (DefMI->mayLoad())
+ return SchedModel->LoadLatency;
+ if (isHighLatencyDef(DefMI->getOpcode()))
+ return SchedModel->HighLatency;
+ return 1;
+}
+
+unsigned TargetInstrInfoImpl::
+getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost) const {
+ // Default to one cycle for no itinerary. However, an "empty" itinerary may
+ // still have a MinLatency property, which getStageLatency checks.
+ if (!ItinData)
+ return MI->mayLoad() ? 2 : 1;
+
+ return ItinData->getStageLatency(MI->getDesc().getSchedClass());
+}
+
+bool TargetInstrInfoImpl::hasLowDefLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI,
+ unsigned DefIdx) const {
+ if (!ItinData || ItinData->isEmpty())
+ return false;
+
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+ return (DefCycle != -1 && DefCycle <= 1);
+}
+
+/// Both DefMI and UseMI must be valid. By default, call directly to the
+/// itinerary. This may be overridden by the target.
+int TargetInstrInfoImpl::
+getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx) const {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ unsigned UseClass = UseMI->getDesc().getSchedClass();
+ return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+/// If we can determine the operand latency from the def only, without itinerary
+/// lookup, do so. Otherwise return -1.
+static int computeDefOperandLatency(
+ const TargetInstrInfo *TII, const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, bool FindMin) {
+
+ // Let the target hook getInstrLatency handle missing itineraries.
+ if (!ItinData)
+ return TII->getInstrLatency(ItinData, DefMI);
+
+ // Return a latency based on the itinerary properties and defining instruction
+ // if possible. Some common subtargets don't require per-operand latency,
+ // especially for minimum latencies.
+ if (FindMin) {
+ // If MinLatency is valid, call getInstrLatency. This uses Stage latency if
+ // it exists before defaulting to MinLatency.
+ if (ItinData->SchedModel->MinLatency >= 0)
+ return TII->getInstrLatency(ItinData, DefMI);
+
+ // If MinLatency is invalid, OperandLatency is interpreted as MinLatency.
+ // For empty itineraries, short-circuit the check and default to one cycle.
+ if (ItinData->isEmpty())
+ return 1;
+ }
+ else if (ItinData->isEmpty())
+ return TII->defaultDefLatency(ItinData->SchedModel, DefMI);
+
+ // ...operand lookup required
+ return -1;
+}
+
+/// computeOperandLatency - Compute and return the latency of the given data
+/// dependent def and use when the operand indices are already known.
+///
+/// FindMin may be set to get the minimum vs. expected latency.
+unsigned TargetInstrInfo::
+computeOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx,
+ bool FindMin) const {
+
+ int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin);
+ if (DefLatency >= 0)
+ return DefLatency;
+
+ assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
+
+ int OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+ if (OperLatency >= 0)
+ return OperLatency;
+
+ // No operand latency was found.
+ unsigned InstrLatency = getInstrLatency(ItinData, DefMI);
+
+ // Expected latency is the max of the stage latency and itinerary props.
+ if (!FindMin)
+ InstrLatency = std::max(InstrLatency,
+ defaultDefLatency(ItinData->SchedModel, DefMI));
+ return InstrLatency;
+}
+
+/// computeOperandLatency - Compute and return the latency of the given data
+/// dependent def and use. DefMI must be a valid def. UseMI may be NULL for an
+/// unknown use. Depending on the subtarget's itinerary properties, this may or
+/// may not need to call getOperandLatency().
+///
+/// FindMin may be set to get the minimum vs. expected latency. Minimum
+/// latency is used for scheduling groups, while expected latency is for
+/// instruction cost and critical path.
+///
+/// For most subtargets, we don't need DefIdx or UseIdx to compute min latency.
+unsigned TargetInstrInfo::
+computeOperandLatency(const InstrItineraryData *ItinData,
+ const TargetRegisterInfo *TRI,
+ const MachineInstr *DefMI, const MachineInstr *UseMI,
+ unsigned Reg, bool FindMin) const {
+
+ int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin);
+ if (DefLatency >= 0)
+ return DefLatency;
+
+ assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
+
+ // Find the definition of the register in the defining instruction.
+ int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
+ if (DefIdx != -1) {
+ const MachineOperand &MO = DefMI->getOperand(DefIdx);
+ if (MO.isReg() && MO.isImplicit() &&
+ DefIdx >= (int)DefMI->getDesc().getNumOperands()) {
+ // This is an implicit def, getOperandLatency() won't return the correct
+ // latency. e.g.
+ // %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def>
+ // %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ...
+ // What we want is to compute latency between def of %D6/%D7 and use of
+ // %Q3 instead.
+ unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI);
+ if (DefMI->getOperand(Op2).isReg())
+ DefIdx = Op2;
+ }
+ // For all uses of the register, calculate the maximum latency.
+ int OperLatency = -1;
+
+ // If UseMI is null, then it must be a scheduling barrier.
+ if (!UseMI) {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
+ }
+ else {
+ for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = UseMI->getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (MOReg != Reg)
+ continue;
+
+ int UseCycle = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, i);
+ OperLatency = std::max(OperLatency, UseCycle);
+ }
+ }
+ // If we found an operand latency, we're done.
+ if (OperLatency >= 0)
+ return OperLatency;
+ }
+ // No operand latency was found.
+ unsigned InstrLatency = getInstrLatency(ItinData, DefMI);
+
+ // Expected latency is the max of the stage latency and itinerary props.
+ if (!FindMin)
+ InstrLatency = std::max(InstrLatency,
+ defaultDefLatency(ItinData->SchedModel, DefMI));
+ return InstrLatency;
+}
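
Both computeOperandLatency overloads above share the same shape: try the def-only fast path, then the per-operand itinerary, and finally fall back to the max of the stage latency and the default def latency when computing expected (non-minimum) latency. A standalone sketch of that fallback arithmetic with assumed cycle counts:

#include <algorithm>
#include <cstdio>

int main() {
  int OperLatency = -1;      // itinerary had no entry for this operand
  unsigned StageLatency = 2; // assumed getInstrLatency() result
  unsigned LoadLatency = 4;  // assumed SchedModel LoadLatency for a load def
  bool FindMin = false;

  unsigned Latency;
  if (OperLatency >= 0) {
    Latency = OperLatency;
  } else {
    Latency = StageLatency;
    if (!FindMin) // expected latency: fold in the default def latency too
      Latency = std::max(Latency, LoadLatency);
  }
  std::printf("latency = %u\n", Latency); // prints 4
}
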
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 9925185..2a2fa9e 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -93,8 +93,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
// N.B.: The defaults used here are not the same ones used in MC.
// We follow gcc, MC follows gas. For example, given ".section .eh_frame",
// both gas and MC will produce a section with no flags. Given
- // section(".eh_frame") gcc will produce
- // .section .eh_frame,"a",@progbits
+ // section(".eh_frame") gcc will produce:
+ //
+ // .section .eh_frame,"a",@progbits
if (Name.empty() || Name[0] != '.') return K;
// Some lame default implementation based on some magic section names.
@@ -349,10 +350,17 @@ TargetLoweringObjectFileELF::getStaticCtorSection(unsigned Priority) const {
if (Priority == 65535)
return StaticCtorSection;
- std::string Name = std::string(".ctors.") + utostr(65535 - Priority);
- return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getDataRel());
+ if (UseInitArray) {
+ std::string Name = std::string(".init_array.") + utostr(Priority);
+ return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE,
+ SectionKind::getDataRel());
+ } else {
+ std::string Name = std::string(".ctors.") + utostr(65535 - Priority);
+ return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE,
+ SectionKind::getDataRel());
+ }
}
const MCSection *
@@ -362,10 +370,35 @@ TargetLoweringObjectFileELF::getStaticDtorSection(unsigned Priority) const {
if (Priority == 65535)
return StaticDtorSection;
- std::string Name = std::string(".dtors.") + utostr(65535 - Priority);
- return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getDataRel());
+ if (UseInitArray) {
+ std::string Name = std::string(".fini_array.") + utostr(Priority);
+ return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE,
+ SectionKind::getDataRel());
+ } else {
+ std::string Name = std::string(".dtors.") + utostr(65535 - Priority);
+ return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE,
+ SectionKind::getDataRel());
+ }
+}
+
+void
+TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
+ UseInitArray = UseInitArray_;
+ if (!UseInitArray)
+ return;
+
+ StaticCtorSection =
+ getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY,
+ ELF::SHF_WRITE |
+ ELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+ StaticDtorSection =
+ getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY,
+ ELF::SHF_WRITE |
+ ELF::SHF_ALLOC,
+ SectionKind::getDataRel());
}
//===----------------------------------------------------------------------===//
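
The naming scheme above encodes priority in opposite directions for the two layouts: .init_array.N sections embed the priority directly, while the legacy .ctors path uses 65535 - priority so that the linker's lexical ordering of section names still runs constructors in ascending priority. A standalone sketch of the name computation (priority 65535 maps to the default section, as in the code above):

#include <cstdio>
#include <string>

static std::string ctorSection(unsigned Priority, bool UseInitArray) {
  if (Priority == 65535)
    return UseInitArray ? ".init_array" : ".ctors";
  return UseInitArray ? ".init_array." + std::to_string(Priority)
                      : ".ctors." + std::to_string(65535 - Priority);
}

int main() {
  for (unsigned P : {101u, 65535u})
    std::printf("priority %5u -> %s  or  %s\n", P,
                ctorSection(P, true).c_str(), ctorSection(P, false).c_str());
}
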
@@ -379,7 +412,7 @@ emitModuleFlags(MCStreamer &Streamer,
ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
Mangler *Mang, const TargetMachine &TM) const {
unsigned VersionVal = 0;
- unsigned GCFlags = 0;
+ unsigned ImageInfoFlags = 0;
StringRef SectionVal;
for (ArrayRef<Module::ModuleFlagEntry>::iterator
@@ -396,8 +429,9 @@ emitModuleFlags(MCStreamer &Streamer,
if (Key == "Objective-C Image Info Version")
VersionVal = cast<ConstantInt>(Val)->getZExtValue();
else if (Key == "Objective-C Garbage Collection" ||
- Key == "Objective-C GC Only")
- GCFlags |= cast<ConstantInt>(Val)->getZExtValue();
+ Key == "Objective-C GC Only" ||
+ Key == "Objective-C Is Simulated")
+ ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue();
else if (Key == "Objective-C Image Info Section")
SectionVal = cast<MDString>(Val)->getString();
}
@@ -424,7 +458,7 @@ emitModuleFlags(MCStreamer &Streamer,
Streamer.EmitLabel(getContext().
GetOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO")));
Streamer.EmitIntValue(VersionVal, 4);
- Streamer.EmitIntValue(GCFlags, 4);
+ Streamer.EmitIntValue(ImageInfoFlags, 4);
Streamer.AddBlankLine();
}
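
The rename above reflects that the word now carries more than GC bits: every recognized module flag is OR'ed into one value that is emitted as the second 32-bit field of L_OBJC_IMAGE_INFO. A standalone sketch; the bit positions below are assumed for illustration only, not taken from the sources:

#include <cstdio>

int main() {
  // Assumed, illustrative bit assignments.
  const unsigned GC = 1u << 1, IsSimulated = 1u << 5;
  unsigned VersionVal = 0;
  unsigned ImageInfoFlags = 0;
  ImageInfoFlags |= GC;          // "Objective-C Garbage Collection"
  ImageInfoFlags |= IsSimulated; // "Objective-C Is Simulated"
  // Corresponds to the two EmitIntValue(..., 4) calls above.
  std::printf("version=%u flags=0x%x\n", VersionVal, ImageInfoFlags);
}
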
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index c30b133..aa601af 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -30,6 +30,7 @@
#define DEBUG_TYPE "twoaddrinstr"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -55,18 +56,19 @@ STATISTIC(NumCommuted , "Number of instructions commuted to coalesce");
STATISTIC(NumAggrCommuted , "Number of instructions aggressively commuted");
STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address");
STATISTIC(Num3AddrSunk, "Number of 3-address instructions sunk");
-STATISTIC(NumReMats, "Number of instructions re-materialized");
-STATISTIC(NumDeletes, "Number of dead instructions deleted");
STATISTIC(NumReSchedUps, "Number of instructions re-scheduled up");
STATISTIC(NumReSchedDowns, "Number of instructions re-scheduled down");
namespace {
class TwoAddressInstructionPass : public MachineFunctionPass {
+ MachineFunction *MF;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
const InstrItineraryData *InstrItins;
MachineRegisterInfo *MRI;
LiveVariables *LV;
+ SlotIndexes *Indexes;
+ LiveIntervals *LIS;
AliasAnalysis *AA;
CodeGenOpt::Level OptLevel;
@@ -92,17 +94,10 @@ namespace {
unsigned Reg,
MachineBasicBlock::iterator OldPos);
- bool isProfitableToReMat(unsigned Reg, const TargetRegisterClass *RC,
- MachineInstr *MI, MachineInstr *DefMI,
- MachineBasicBlock *MBB, unsigned Loc);
-
bool NoUseAfterLastDef(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist,
unsigned &LastDef);
- MachineInstr *FindLastUseInMBB(unsigned Reg, MachineBasicBlock *MBB,
- unsigned Dist);
-
- bool isProfitableToCommute(unsigned regB, unsigned regC,
+ bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
MachineInstr *MI, MachineBasicBlock *MBB,
unsigned Dist);
@@ -117,14 +112,6 @@ namespace {
MachineFunction::iterator &mbbi,
unsigned RegA, unsigned RegB, unsigned Dist);
- typedef std::pair<std::pair<unsigned, bool>, MachineInstr*> NewKill;
- bool canUpdateDeletedKills(SmallVector<unsigned, 4> &Kills,
- SmallVector<NewKill, 4> &NewKills,
- MachineBasicBlock *MBB, unsigned Dist);
- bool DeleteUnusedInstr(MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- MachineFunction::iterator &mbbi, unsigned Dist);
-
bool isDefTooClose(unsigned Reg, unsigned Dist,
MachineInstr *MI, MachineBasicBlock *MBB);
@@ -150,6 +137,11 @@ namespace {
void ProcessCopy(MachineInstr *MI, MachineBasicBlock *MBB,
SmallPtrSet<MachineInstr*, 8> &Processed);
+ typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList;
+ typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap;
+ bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);
+ void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);
+
void CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, unsigned DstReg);
/// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
@@ -167,6 +159,8 @@ namespace {
AU.setPreservesCFG();
AU.addRequired<AliasAnalysis>();
AU.addPreserved<LiveVariables>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
AU.addPreservedID(MachineLoopInfoID);
AU.addPreservedID(MachineDominatorsID);
MachineFunctionPass::getAnalysisUsage(AU);
@@ -241,7 +235,7 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
// appropriate location, we can try to sink the current instruction
// past it.
if (!KillMI || KillMI->getParent() != MBB || KillMI == MI ||
- KillMI->isTerminator())
+ KillMI == OldPos || KillMI->isTerminator())
return false;
// If any of the definitions are used by another instruction between the
@@ -284,6 +278,7 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
}
}
}
+ assert(KillMO && "Didn't find kill");
// Update kill and LV information.
KillMO->setIsKill(false);
@@ -297,59 +292,13 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
MBB->remove(MI);
MBB->insert(KillPos, MI);
+ if (LIS)
+ LIS->handleMove(MI);
+
++Num3AddrSunk;
return true;
}
-/// isTwoAddrUse - Return true if the specified MI is using the specified
-/// register as a two-address operand.
-static bool isTwoAddrUse(MachineInstr *UseMI, unsigned Reg) {
- const MCInstrDesc &MCID = UseMI->getDesc();
- for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
- MachineOperand &MO = UseMI->getOperand(i);
- if (MO.isReg() && MO.getReg() == Reg &&
- (MO.isDef() || UseMI->isRegTiedToDefOperand(i)))
- // Earlier use is a two-address one.
- return true;
- }
- return false;
-}
-
-/// isProfitableToReMat - Return true if the heuristics determines it is likely
-/// to be profitable to re-materialize the definition of Reg rather than copy
-/// the register.
-bool
-TwoAddressInstructionPass::isProfitableToReMat(unsigned Reg,
- const TargetRegisterClass *RC,
- MachineInstr *MI, MachineInstr *DefMI,
- MachineBasicBlock *MBB, unsigned Loc) {
- bool OtherUse = false;
- for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(Reg),
- UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
- MachineOperand &UseMO = UI.getOperand();
- MachineInstr *UseMI = UseMO.getParent();
- MachineBasicBlock *UseMBB = UseMI->getParent();
- if (UseMBB == MBB) {
- DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
- if (DI != DistanceMap.end() && DI->second == Loc)
- continue; // Current use.
- OtherUse = true;
- // There is at least one other use in the MBB that will clobber the
- // register.
- if (isTwoAddrUse(UseMI, Reg))
- return true;
- }
- }
-
- // If other uses in MBB are not two-address uses, then don't remat.
- if (OtherUse)
- return false;
-
- // No other uses in the same block, remat if it's defined in the same
- // block so it does not unnecessarily extend the live range.
- return MBB == DefMI->getParent();
-}
-
/// NoUseAfterLastDef - Return true if there are no intervening uses between the
/// last instruction in the MBB that defines the specified register and the
/// two-address instruction which is being processed. It also returns the last
@@ -377,31 +326,6 @@ bool TwoAddressInstructionPass::NoUseAfterLastDef(unsigned Reg,
return !(LastUse > LastDef && LastUse < Dist);
}
-MachineInstr *TwoAddressInstructionPass::FindLastUseInMBB(unsigned Reg,
- MachineBasicBlock *MBB,
- unsigned Dist) {
- unsigned LastUseDist = 0;
- MachineInstr *LastUse = 0;
- for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
- E = MRI->reg_end(); I != E; ++I) {
- MachineOperand &MO = I.getOperand();
- MachineInstr *MI = MO.getParent();
- if (MI->getParent() != MBB || MI->isDebugValue())
- continue;
- DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
- if (DI == DistanceMap.end())
- continue;
- if (DI->second >= Dist)
- continue;
-
- if (MO.isUse() && DI->second > LastUseDist) {
- LastUse = DI->first;
- LastUseDist = DI->second;
- }
- }
- return LastUse;
-}
-
/// isCopyToReg - Return true if the specified MI is a copy instruction or
/// an extract_subreg instruction. It also returns the source and destination
/// registers and whether they are physical registers by reference.
@@ -483,32 +407,6 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
return false;
}
-/// findLocalKill - Look for an instruction below MI in the MBB that kills the
-/// specified register. Returns null if there are any other Reg use between the
-/// instructions.
-static
-MachineInstr *findLocalKill(unsigned Reg, MachineBasicBlock *MBB,
- MachineInstr *MI, MachineRegisterInfo *MRI,
- DenseMap<MachineInstr*, unsigned> &DistanceMap) {
- MachineInstr *KillMI = 0;
- for (MachineRegisterInfo::use_nodbg_iterator
- UI = MRI->use_nodbg_begin(Reg),
- UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
- MachineInstr *UseMI = &*UI;
- if (UseMI == MI || UseMI->getParent() != MBB)
- continue;
- if (DistanceMap.count(UseMI))
- continue;
- if (!UI.getOperand().isKill())
- return 0;
- if (KillMI)
- return 0; // -O0 kill markers cannot be trusted?
- KillMI = UseMI;
- }
-
- return KillMI;
-}
-
/// findOnlyInterestingUse - Given a register, if it has a single in-basic-block
/// use, return the use instruction if it's a copy or a two-address use.
static
@@ -564,10 +462,11 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
}
-/// isProfitableToReMat - Return true if it's potentially profitable to commute
+/// isProfitableToCommute - Return true if it's potentially profitable to commute
/// the two-address instruction that's being processed.
bool
-TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC,
+TwoAddressInstructionPass::isProfitableToCommute(unsigned regA, unsigned regB,
+ unsigned regC,
MachineInstr *MI, MachineBasicBlock *MBB,
unsigned Dist) {
if (OptLevel == CodeGenOpt::None)
@@ -604,15 +503,15 @@ TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC,
// %reg1026<def> = ADD %reg1024, %reg1025
// r0 = MOV %reg1026
// Commute the ADD to hopefully eliminate an otherwise unavoidable copy.
- unsigned FromRegB = getMappedReg(regB, SrcRegMap);
- unsigned FromRegC = getMappedReg(regC, SrcRegMap);
- unsigned ToRegB = getMappedReg(regB, DstRegMap);
- unsigned ToRegC = getMappedReg(regC, DstRegMap);
- if ((FromRegB && ToRegB && !regsAreCompatible(FromRegB, ToRegB, TRI)) &&
- ((!FromRegC && !ToRegC) ||
- regsAreCompatible(FromRegB, ToRegC, TRI) ||
- regsAreCompatible(FromRegC, ToRegB, TRI)))
- return true;
+ unsigned ToRegA = getMappedReg(regA, DstRegMap);
+ if (ToRegA) {
+ unsigned FromRegB = getMappedReg(regB, SrcRegMap);
+ unsigned FromRegC = getMappedReg(regC, SrcRegMap);
+ bool BComp = !FromRegB || regsAreCompatible(FromRegB, ToRegA, TRI);
+ bool CComp = !FromRegC || regsAreCompatible(FromRegC, ToRegA, TRI);
+ if (BComp != CComp)
+ return !BComp && CComp;
+ }
// If there is a use of regC between its last def (could be livein) and this
// instruction, then bail.
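
The new test keys everything off regA's eventual destination: commuting is declared profitable exactly when regC's source is compatible with that destination and regB's is not. A standalone sketch, using plain register equality as a stand-in for regsAreCompatible and assumed register numbers:

#include <cstdio>

static bool profitableToCommute(unsigned ToRegA, unsigned FromRegB,
                                unsigned FromRegC) {
  if (!ToRegA)
    return false; // no destination constraint known
  auto Compat = [&](unsigned R) { return !R || R == ToRegA; };
  bool BComp = Compat(FromRegB);
  bool CComp = Compat(FromRegC);
  if (BComp != CComp)
    return !BComp && CComp; // commute only if C fits where B does not
  return false; // tie: defer to the remaining heuristics in the pass
}

int main() {
  // regA will end up in r10; regB comes from r7, regC from r10.
  std::printf("%d\n", profitableToCommute(10, 7, 10)); // prints 1
}
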
@@ -653,6 +552,8 @@ TwoAddressInstructionPass::CommuteInstruction(MachineBasicBlock::iterator &mi,
if (LV)
// Update live variables
LV->replaceKillInstruction(RegC, MI, NewMI);
+ if (Indexes)
+ Indexes->replaceMachineInstrInMaps(MI, NewMI);
mbbi->insert(mi, NewMI); // Insert the new inst
mbbi->erase(mi); // Nuke the old inst.
@@ -701,6 +602,9 @@ TwoAddressInstructionPass::ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
bool Sunk = false;
+ if (Indexes)
+ Indexes->replaceMachineInstrInMaps(mi, NewMI);
+
if (NewMI->findRegisterUseOperand(RegB, false, TRI))
// FIXME: Temporary workaround. If the new instruction doesn't
// uses RegB, convertToThreeAddress must have created more
@@ -810,92 +714,6 @@ void TwoAddressInstructionPass::ProcessCopy(MachineInstr *MI,
return;
}
-/// isSafeToDelete - If the specified instruction does not produce any side
-/// effects and all of its defs are dead, then it's safe to delete.
-static bool isSafeToDelete(MachineInstr *MI,
- const TargetInstrInfo *TII,
- SmallVector<unsigned, 4> &Kills) {
- if (MI->mayStore() || MI->isCall())
- return false;
- if (MI->isTerminator() || MI->hasUnmodeledSideEffects())
- return false;
-
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg())
- continue;
- if (MO.isDef() && !MO.isDead())
- return false;
- if (MO.isUse() && MO.isKill())
- Kills.push_back(MO.getReg());
- }
- return true;
-}
-
-/// canUpdateDeletedKills - Check if all the registers listed in Kills are
-/// killed by instructions in MBB preceding the current instruction at
-/// position Dist. If so, return true and record information about the
-/// preceding kills in NewKills.
-bool TwoAddressInstructionPass::
-canUpdateDeletedKills(SmallVector<unsigned, 4> &Kills,
- SmallVector<NewKill, 4> &NewKills,
- MachineBasicBlock *MBB, unsigned Dist) {
- while (!Kills.empty()) {
- unsigned Kill = Kills.back();
- Kills.pop_back();
- if (TargetRegisterInfo::isPhysicalRegister(Kill))
- return false;
-
- MachineInstr *LastKill = FindLastUseInMBB(Kill, MBB, Dist);
- if (!LastKill)
- return false;
-
- bool isModRef = LastKill->definesRegister(Kill);
- NewKills.push_back(std::make_pair(std::make_pair(Kill, isModRef),
- LastKill));
- }
- return true;
-}
-
-/// DeleteUnusedInstr - If an instruction with a tied register operand can
-/// be safely deleted, just delete it.
-bool
-TwoAddressInstructionPass::DeleteUnusedInstr(MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- MachineFunction::iterator &mbbi,
- unsigned Dist) {
- // Check if the instruction has no side effects and if all its defs are dead.
- SmallVector<unsigned, 4> Kills;
- if (!isSafeToDelete(mi, TII, Kills))
- return false;
-
- // If this instruction kills some virtual registers, we need to
- // update the kill information. If it's not possible to do so,
- // then bail out.
- SmallVector<NewKill, 4> NewKills;
- if (!canUpdateDeletedKills(Kills, NewKills, &*mbbi, Dist))
- return false;
-
- if (LV) {
- while (!NewKills.empty()) {
- MachineInstr *NewKill = NewKills.back().second;
- unsigned Kill = NewKills.back().first.first;
- bool isDead = NewKills.back().first.second;
- NewKills.pop_back();
- if (LV->removeVirtualRegisterKilled(Kill, mi)) {
- if (isDead)
- LV->addVirtualRegisterDead(Kill, NewKill);
- else
- LV->addVirtualRegisterKilled(Kill, NewKill);
- }
- }
- }
-
- mbbi->erase(mi); // Nuke the old inst.
- mi = nmi;
- return true;
-}
-
/// RescheduleMIBelowKill - If there is one more local instruction that reads
/// 'Reg' and it kills 'Reg', consider moving the instruction below the kill
/// instruction in order to eliminate the need for the copy.
@@ -904,14 +722,19 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB,
MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,
unsigned Reg) {
+ // Bail immediately if we don't have LV available. We use it to find kills
+ // efficiently.
+ if (!LV)
+ return false;
+
MachineInstr *MI = &*mi;
DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
if (DI == DistanceMap.end())
// Must be created from unfolded load. Don't waste time trying this.
return false;
- MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap);
- if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+ MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB);
+ if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
// Don't mess with copies, they may be coalesced later.
return false;
@@ -998,6 +821,12 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB,
((MO.isKill() && Uses.count(MOReg)) || Kills.count(MOReg)))
// Don't want to extend other live ranges and update kills.
return false;
+ if (MOReg == Reg && !MO.isKill())
+ // We can't schedule across a use of the register in question.
+ return false;
+ // Ensure that if this is the register in question, it's the kill we expect.
+ assert((MOReg != Reg || OtherMI == KillMI) &&
+ "Found multiple kills of a register in a basic block");
}
}
}
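
The added check above closes a hole: any plain (non-kill) use of the register between MI and the kill would be reordered against, so rescheduling must bail. A standalone sketch of the scan over the intervening instructions, with an assumed minimal operand model:

#include <cstdio>
#include <vector>

struct UseOp { unsigned Reg; bool IsKill; };

int main() {
  const unsigned Reg = 5; // the register being rescheduled around
  std::vector<UseOp> Between = {{3, false}, {5, false}}; // plain use of %5
  bool CanMove = true;
  for (const UseOp &U : Between)
    if (U.Reg == Reg && !U.IsKill)
      CanMove = false; // can't schedule across a use of the register
  std::printf("can reschedule below kill: %d\n", CanMove); // prints 0
}
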
@@ -1011,20 +840,13 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB,
MBB->splice(KillPos, MBB, From, To);
DistanceMap.erase(DI);
- if (LV) {
- // Update live variables
- LV->removeVirtualRegisterKilled(Reg, KillMI);
- LV->addVirtualRegisterKilled(Reg, MI);
- } else {
- for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = KillMI->getOperand(i);
- if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
- continue;
- MO.setIsKill(false);
- }
- MI->addRegisterKilled(Reg, 0);
- }
+ // Update live variables
+ LV->removeVirtualRegisterKilled(Reg, KillMI);
+ LV->addVirtualRegisterKilled(Reg, MI);
+ if (LIS)
+ LIS->handleMove(MI);
+ DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI);
return true;
}
@@ -1045,7 +867,7 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
return true; // Below MI
unsigned DefDist = DDI->second;
assert(Dist > DefDist && "Visited def already?");
- if (TII->getInstrLatency(InstrItins, DefMI) > (int)(Dist - DefDist))
+ if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist))
return true;
}
return false;
@@ -1060,14 +882,19 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB,
MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,
unsigned Reg) {
+ // Bail immediately if we don't have LV available. We use it to find kills
+ // efficiently.
+ if (!LV)
+ return false;
+
MachineInstr *MI = &*mi;
DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
if (DI == DistanceMap.end())
// Must be created from unfolded load. Don't waste time trying this.
return false;
- MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap);
- if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+ MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB);
+ if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
// Don't mess with copies, they may be coalesced later.
return false;
@@ -1093,6 +920,8 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB,
continue;
if (isDefTooClose(MOReg, DI->second, MI, MBB))
return false;
+ if (MOReg == Reg && !MO.isKill())
+ return false;
Uses.insert(MOReg);
if (MO.isKill() && MOReg != Reg)
Kills.insert(MOReg);
@@ -1134,6 +963,9 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB,
if (Kills.count(MOReg))
// Don't want to extend other live ranges and update kills.
return false;
+ if (OtherMI != MI && MOReg == Reg && !MO.isKill())
+ // We can't schedule across a use of the register in question.
+ return false;
} else {
OtherDefs.push_back(MOReg);
}
@@ -1164,19 +996,13 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB,
nmi = llvm::prior(InsertPos); // Backtrack so we process the moved instr.
DistanceMap.erase(DI);
- if (LV) {
- // Update live variables
- LV->removeVirtualRegisterKilled(Reg, KillMI);
- LV->addVirtualRegisterKilled(Reg, MI);
- } else {
- for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = KillMI->getOperand(i);
- if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
- continue;
- MO.setIsKill(false);
- }
- MI->addRegisterKilled(Reg, 0);
- }
+ // Update live variables
+ LV->removeVirtualRegisterKilled(Reg, KillMI);
+ LV->addVirtualRegisterKilled(Reg, MI);
+ if (LIS)
+ LIS->handleMove(KillMI);
+
+ DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
return true;
}
@@ -1201,15 +1027,10 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
assert(TargetRegisterInfo::isVirtualRegister(regB) &&
"cannot make instruction into two-address form");
-
- // If regA is dead and the instruction can be deleted, just delete
- // it so it doesn't clobber regB.
bool regBKilled = isKilled(MI, regB, MRI, TII);
- if (!regBKilled && MI.getOperand(DstIdx).isDead() &&
- DeleteUnusedInstr(mi, nmi, mbbi, Dist)) {
- ++NumDeletes;
- return true; // Done with this instruction.
- }
+
+ if (TargetRegisterInfo::isVirtualRegister(regA))
+ ScanUses(regA, &*mbbi, Processed);
// Check if it is profitable to commute the operands.
unsigned SrcOp1, SrcOp2;
@@ -1230,7 +1051,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
// If C dies but B does not, swap the B and C operands.
// This makes the live ranges of A and C joinable.
TryCommute = true;
- else if (isProfitableToCommute(regB, regC, &MI, mbbi, Dist)) {
+ else if (isProfitableToCommute(regA, regB, regC, &MI, mbbi, Dist)) {
TryCommute = true;
AggressiveCommute = true;
}
@@ -1252,9 +1073,6 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
return true;
}
- if (TargetRegisterInfo::isVirtualRegister(regA))
- ScanUses(regA, &*mbbi, Processed);
-
if (MI.isConvertibleTo3Addr()) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
@@ -1293,15 +1111,14 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
if (NewOpc != 0) {
const MCInstrDesc &UnfoldMCID = TII->get(NewOpc);
if (UnfoldMCID.getNumDefs() == 1) {
- MachineFunction &MF = *mbbi->getParent();
-
// Unfold the load.
DEBUG(dbgs() << "2addr: UNFOLDING: " << MI);
const TargetRegisterClass *RC =
- TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI);
+ TRI->getAllocatableClass(
+ TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF));
unsigned Reg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 2> NewMIs;
- if (!TII->unfoldMemoryOperand(MF, &MI, Reg,
+ if (!TII->unfoldMemoryOperand(*MF, &MI, Reg,
/*UnfoldLoad=*/true,/*UnfoldStore=*/false,
NewMIs)) {
DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
@@ -1378,15 +1195,177 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
return false;
}
+// Collect tied operands of MI that need to be handled.
+// Rewrite trivial cases immediately.
+// Return true if any tied operands were found, including the trivial ones.
+bool TwoAddressInstructionPass::
+collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
+ const MCInstrDesc &MCID = MI->getDesc();
+ bool AnyOps = false;
+ unsigned NumOps = MI->isInlineAsm() ?
+ MI->getNumOperands() : MCID.getNumOperands();
+
+ for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
+ unsigned DstIdx = 0;
+ if (!MI->isRegTiedToDefOperand(SrcIdx, &DstIdx))
+ continue;
+ AnyOps = true;
+ MachineOperand &SrcMO = MI->getOperand(SrcIdx);
+ MachineOperand &DstMO = MI->getOperand(DstIdx);
+ unsigned SrcReg = SrcMO.getReg();
+ unsigned DstReg = DstMO.getReg();
+ // Tied constraint already satisfied?
+ if (SrcReg == DstReg)
+ continue;
+
+ assert(SrcReg && SrcMO.isUse() && "two address instruction invalid");
+
+ // Deal with <undef> uses immediately - simply rewrite the src operand.
+ if (SrcMO.isUndef()) {
+ // Constrain the DstReg register class if required.
+ if (TargetRegisterInfo::isVirtualRegister(DstReg))
+ if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx,
+ TRI, *MF))
+ MRI->constrainRegClass(DstReg, RC);
+ SrcMO.setReg(DstReg);
+ DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI);
+ continue;
+ }
+ TiedOperands[SrcReg].push_back(std::make_pair(SrcIdx, DstIdx));
+ }
+ return AnyOps;
+}
+
+// Process a list of tied MI operands that all use the same source register.
+// The tied pairs are of the form (SrcIdx, DstIdx).
+void
+TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
+ TiedPairList &TiedPairs,
+ unsigned &Dist) {
+ bool IsEarlyClobber = false;
+ bool RemovedKillFlag = false;
+ bool AllUsesCopied = true;
+ unsigned LastCopiedReg = 0;
+ unsigned RegB = 0;
+ for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
+ unsigned SrcIdx = TiedPairs[tpi].first;
+ unsigned DstIdx = TiedPairs[tpi].second;
+
+ const MachineOperand &DstMO = MI->getOperand(DstIdx);
+ unsigned RegA = DstMO.getReg();
+ IsEarlyClobber |= DstMO.isEarlyClobber();
+
+ // Grab RegB from the instruction because it may have changed if the
+ // instruction was commuted.
+ RegB = MI->getOperand(SrcIdx).getReg();
+
+ if (RegA == RegB) {
+ // The register is tied to multiple destinations (or else we would
+ // not have continued this far), but this use of the register
+ // already matches the tied destination. Leave it.
+ AllUsesCopied = false;
+ continue;
+ }
+ LastCopiedReg = RegA;
+
+ assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
+ "cannot make instruction into two-address form");
+
+#ifndef NDEBUG
+ // First, verify that we don't have a use of "a" in the instruction
+ // (a = b + a for example) because our transformation will not
+ // work. This should never occur because we are in SSA form.
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i)
+ assert(i == DstIdx ||
+ !MI->getOperand(i).isReg() ||
+ MI->getOperand(i).getReg() != RegA);
+#endif
+
+ // Emit a copy.
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), RegA).addReg(RegB);
+
+ // Update DistanceMap.
+ MachineBasicBlock::iterator PrevMI = MI;
+ --PrevMI;
+ DistanceMap.insert(std::make_pair(PrevMI, Dist));
+ DistanceMap[MI] = ++Dist;
+
+ SlotIndex CopyIdx;
+ if (Indexes)
+ CopyIdx = Indexes->insertMachineInstrInMaps(PrevMI).getRegSlot();
+
+ DEBUG(dbgs() << "\t\tprepend:\t" << *PrevMI);
+
+ MachineOperand &MO = MI->getOperand(SrcIdx);
+ assert(MO.isReg() && MO.getReg() == RegB && MO.isUse() &&
+ "inconsistent operand info for 2-reg pass");
+ if (MO.isKill()) {
+ MO.setIsKill(false);
+ RemovedKillFlag = true;
+ }
+
+ // Make sure regA is a legal regclass for the SrcIdx operand.
+ if (TargetRegisterInfo::isVirtualRegister(RegA) &&
+ TargetRegisterInfo::isVirtualRegister(RegB))
+ MRI->constrainRegClass(RegA, MRI->getRegClass(RegB));
+
+ MO.setReg(RegA);
+
+ // Propagate SrcRegMap.
+ SrcRegMap[RegA] = RegB;
+ }
+
+
+ if (AllUsesCopied) {
+ if (!IsEarlyClobber) {
+ // Replace other (un-tied) uses of regB with LastCopiedReg.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+ if (MO.isKill()) {
+ MO.setIsKill(false);
+ RemovedKillFlag = true;
+ }
+ MO.setReg(LastCopiedReg);
+ }
+ }
+ }
+
+ // Update live variables for regB.
+ if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(MI)) {
+ MachineBasicBlock::iterator PrevMI = MI;
+ --PrevMI;
+ LV->addVirtualRegisterKilled(RegB, PrevMI);
+ }
+
+ } else if (RemovedKillFlag) {
+ // Some tied uses of regB matched their destination registers, so
+ // regB is still used in this instruction, but a kill flag was
+ // removed from a different tied use of regB, so now we need to add
+ // a kill flag to one of the remaining uses of regB.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+ MO.setIsKill(true);
+ break;
+ }
+ }
+ }
+}
+
/// runOnMachineFunction - Reduce two-address instructions to two operands.
///
-bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
- const TargetMachine &TM = MF.getTarget();
- MRI = &MF.getRegInfo();
+bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ const TargetMachine &TM = MF->getTarget();
+ MRI = &MF->getRegInfo();
TII = TM.getInstrInfo();
TRI = TM.getRegisterInfo();
InstrItins = TM.getInstrItineraryData();
+ Indexes = getAnalysisIfAvailable<SlotIndexes>();
LV = getAnalysisIfAvailable<LiveVariables>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
AA = &getAnalysis<AliasAnalysis>();
OptLevel = TM.getOptLevel();
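
collectTiedOperands and processTiedPairs above split the old monolithic loop: tied (SrcIdx, DstIdx) pairs are gathered per source register, then each pair is rewritten by inserting a COPY and pointing the tied use at the destination. A standalone sketch of the rewrite on one toy instruction (assumed virtual register numbers):

#include <cstdio>

int main() {
  // %1 = ADD %2, %3  with operand %2 tied to the def %1.
  unsigned RegA = 1, RegB = 2, RegC = 3;
  if (RegA != RegB) {
    std::printf("%%%u = COPY %%%u\n", RegA, RegB); // inserted before MI
    RegB = RegA;                                   // MO.setReg(RegA)
  }
  std::printf("%%%u = ADD %%%u, %%%u\n", RegA, RegB, RegC);
  // Result: %1 = COPY %2 ; %1 = ADD %1, %3 - the tied constraint holds.
}
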
@@ -1394,20 +1373,15 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
DEBUG(dbgs() << "********** Function: "
- << MF.getFunction()->getName() << '\n');
+ << MF->getFunction()->getName() << '\n');
// This pass takes the function out of SSA form.
MRI->leaveSSA();
- // ReMatRegs - Keep track of the registers whose def's are remat'ed.
- BitVector ReMatRegs(MRI->getNumVirtRegs());
-
- typedef DenseMap<unsigned, SmallVector<std::pair<unsigned, unsigned>, 4> >
- TiedOperandMap;
- TiedOperandMap TiedOperands(4);
+ TiedOperandMap TiedOperands;
SmallPtrSet<MachineInstr*, 8> Processed;
- for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
+ for (MachineFunction::iterator mbbi = MF->begin(), mbbe = MF->end();
mbbi != mbbe; ++mbbi) {
unsigned Dist = 0;
DistanceMap.clear();
@@ -1426,188 +1400,63 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
if (mi->isRegSequence())
RegSequences.push_back(&*mi);
- const MCInstrDesc &MCID = mi->getDesc();
- bool FirstTied = true;
-
DistanceMap.insert(std::make_pair(mi, ++Dist));
ProcessCopy(&*mi, &*mbbi, Processed);
// First scan through all the tied register uses in this instruction
// and record a list of pairs of tied operands for each register.
- unsigned NumOps = mi->isInlineAsm()
- ? mi->getNumOperands() : MCID.getNumOperands();
- for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
- unsigned DstIdx = 0;
- if (!mi->isRegTiedToDefOperand(SrcIdx, &DstIdx))
- continue;
-
- if (FirstTied) {
- FirstTied = false;
- ++NumTwoAddressInstrs;
- DEBUG(dbgs() << '\t' << *mi);
- }
-
- assert(mi->getOperand(SrcIdx).isReg() &&
- mi->getOperand(SrcIdx).getReg() &&
- mi->getOperand(SrcIdx).isUse() &&
- "two address instruction invalid");
-
- unsigned regB = mi->getOperand(SrcIdx).getReg();
- TiedOperands[regB].push_back(std::make_pair(SrcIdx, DstIdx));
+ if (!collectTiedOperands(mi, TiedOperands)) {
+ mi = nmi;
+ continue;
}
- // Now iterate over the information collected above.
- for (TiedOperandMap::iterator OI = TiedOperands.begin(),
- OE = TiedOperands.end(); OI != OE; ++OI) {
- SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs = OI->second;
-
- // If the instruction has a single pair of tied operands, try some
- // transformations that may either eliminate the tied operands or
- // improve the opportunities for coalescing away the register copy.
- if (TiedOperands.size() == 1 && TiedPairs.size() == 1) {
+ ++NumTwoAddressInstrs;
+ MadeChange = true;
+ DEBUG(dbgs() << '\t' << *mi);
+
+ // If the instruction has a single pair of tied operands, try some
+ // transformations that may either eliminate the tied operands or
+ // improve the opportunities for coalescing away the register copy.
+ if (TiedOperands.size() == 1) {
+ SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs
+ = TiedOperands.begin()->second;
+ if (TiedPairs.size() == 1) {
unsigned SrcIdx = TiedPairs[0].first;
unsigned DstIdx = TiedPairs[0].second;
-
- // If the registers are already equal, nothing needs to be done.
- if (mi->getOperand(SrcIdx).getReg() ==
- mi->getOperand(DstIdx).getReg())
- break; // Done with this instruction.
-
- if (TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist,
- Processed))
- break; // The tied operands have been eliminated.
- }
-
- bool IsEarlyClobber = false;
- bool RemovedKillFlag = false;
- bool AllUsesCopied = true;
- unsigned LastCopiedReg = 0;
- unsigned regB = OI->first;
- for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
- unsigned SrcIdx = TiedPairs[tpi].first;
- unsigned DstIdx = TiedPairs[tpi].second;
-
- const MachineOperand &DstMO = mi->getOperand(DstIdx);
- unsigned regA = DstMO.getReg();
- IsEarlyClobber |= DstMO.isEarlyClobber();
-
- // Grab regB from the instruction because it may have changed if the
- // instruction was commuted.
- regB = mi->getOperand(SrcIdx).getReg();
-
- if (regA == regB) {
- // The register is tied to multiple destinations (or else we would
- // not have continued this far), but this use of the register
- // already matches the tied destination. Leave it.
- AllUsesCopied = false;
+ unsigned SrcReg = mi->getOperand(SrcIdx).getReg();
+ unsigned DstReg = mi->getOperand(DstIdx).getReg();
+ if (SrcReg != DstReg &&
+ TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist,
+ Processed)) {
+ // The tied operands have been eliminated or shifted further down the
+ // block to ease elimination. Continue processing with 'nmi'.
+ TiedOperands.clear();
+ mi = nmi;
continue;
}
- LastCopiedReg = regA;
-
- assert(TargetRegisterInfo::isVirtualRegister(regB) &&
- "cannot make instruction into two-address form");
-
-#ifndef NDEBUG
- // First, verify that we don't have a use of "a" in the instruction
- // (a = b + a for example) because our transformation will not
- // work. This should never occur because we are in SSA form.
- for (unsigned i = 0; i != mi->getNumOperands(); ++i)
- assert(i == DstIdx ||
- !mi->getOperand(i).isReg() ||
- mi->getOperand(i).getReg() != regA);
-#endif
-
- // Emit a copy or rematerialize the definition.
- const TargetRegisterClass *rc = MRI->getRegClass(regB);
- MachineInstr *DefMI = MRI->getVRegDef(regB);
- // If it's safe and profitable, remat the definition instead of
- // copying it.
- if (DefMI &&
- DefMI->isAsCheapAsAMove() &&
- DefMI->isSafeToReMat(TII, AA, regB) &&
- isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){
- DEBUG(dbgs() << "2addr: REMATTING : " << *DefMI << "\n");
- unsigned regASubIdx = mi->getOperand(DstIdx).getSubReg();
- TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, *TRI);
- ReMatRegs.set(TargetRegisterInfo::virtReg2Index(regB));
- ++NumReMats;
- } else {
- BuildMI(*mbbi, mi, mi->getDebugLoc(), TII->get(TargetOpcode::COPY),
- regA).addReg(regB);
- }
-
- MachineBasicBlock::iterator prevMI = prior(mi);
- // Update DistanceMap.
- DistanceMap.insert(std::make_pair(prevMI, Dist));
- DistanceMap[mi] = ++Dist;
-
- DEBUG(dbgs() << "\t\tprepend:\t" << *prevMI);
-
- MachineOperand &MO = mi->getOperand(SrcIdx);
- assert(MO.isReg() && MO.getReg() == regB && MO.isUse() &&
- "inconsistent operand info for 2-reg pass");
- if (MO.isKill()) {
- MO.setIsKill(false);
- RemovedKillFlag = true;
- }
- MO.setReg(regA);
}
+ }
- if (AllUsesCopied) {
- if (!IsEarlyClobber) {
- // Replace other (un-tied) uses of regB with LastCopiedReg.
- for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = mi->getOperand(i);
- if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
- if (MO.isKill()) {
- MO.setIsKill(false);
- RemovedKillFlag = true;
- }
- MO.setReg(LastCopiedReg);
- }
- }
- }
-
- // Update live variables for regB.
- if (RemovedKillFlag && LV && LV->getVarInfo(regB).removeKill(mi))
- LV->addVirtualRegisterKilled(regB, prior(mi));
-
- } else if (RemovedKillFlag) {
- // Some tied uses of regB matched their destination registers, so
- // regB is still used in this instruction, but a kill flag was
- // removed from a different tied use of regB, so now we need to add
- // a kill flag to one of the remaining uses of regB.
- for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = mi->getOperand(i);
- if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
- MO.setIsKill(true);
- break;
- }
- }
- }
-
- // Schedule the source copy / remat inserted to form two-address
- // instruction. FIXME: Does it matter the distance map may not be
- // accurate after it's scheduled?
- TII->scheduleTwoAddrSource(prior(mi), mi, *TRI);
-
- MadeChange = true;
-
+ // Now iterate over the information collected above.
+ for (TiedOperandMap::iterator OI = TiedOperands.begin(),
+ OE = TiedOperands.end(); OI != OE; ++OI) {
+ processTiedPairs(mi, OI->second, Dist);
DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
+ }
- // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
- if (mi->isInsertSubreg()) {
- // From %reg = INSERT_SUBREG %reg, %subreg, subidx
- // To %reg:subidx = COPY %subreg
- unsigned SubIdx = mi->getOperand(3).getImm();
- mi->RemoveOperand(3);
- assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
- mi->getOperand(0).setSubReg(SubIdx);
- mi->RemoveOperand(1);
- mi->setDesc(TII->get(TargetOpcode::COPY));
- DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
- }
+ // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
+ if (mi->isInsertSubreg()) {
+ // From %reg = INSERT_SUBREG %reg, %subreg, subidx
+ // To %reg:subidx = COPY %subreg
+ unsigned SubIdx = mi->getOperand(3).getImm();
+ mi->RemoveOperand(3);
+ assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
+ mi->getOperand(0).setSubReg(SubIdx);
+ mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
+ mi->RemoveOperand(1);
+ mi->setDesc(TII->get(TargetOpcode::COPY));
+ DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
}
// Clear TiedOperands here instead of at the top of the loop
@@ -1617,15 +1466,6 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
}
}
- // Some remat'ed instructions are dead.
- for (int i = ReMatRegs.find_first(); i != -1; i = ReMatRegs.find_next(i)) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
- if (MRI->use_nodbg_empty(VReg)) {
- MachineInstr *DefMI = MRI->getVRegDef(VReg);
- DefMI->eraseFromParent();
- }
- }
-
  // Eliminate REG_SEQUENCE instructions. Their whole purpose was to preserve
// SSA form. It's now safe to de-SSA.
MadeChange |= EliminateRegSequences();
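
For reference, the block deleted above was the heart of two-address conversion: for a tied def/use pair, materialize "DstReg = COPY SrcReg" immediately before the instruction and retarget the tied use, work that now lives in processTiedPairs(). A minimal standalone sketch of that transform (illustrative C++ only; Inst and the register ids are invented for this note, not LLVM API):

    #include <cstdio>

    // Three-address pseudo-instruction Dst = Src0 op Src1, on a target that
    // requires Dst and Src0 to be the same register (a "tied" pair).
    struct Inst { unsigned Dst, Src0, Src1; };

    // Two-address conversion in miniature: emit "Dst = COPY Src0" ahead of
    // the instruction, then rewrite the tied use to read Dst instead.
    void convertToTwoAddress(Inst &I) {
      if (I.Dst != I.Src0) {
        std::printf("  %%r%u = COPY %%r%u\n", I.Dst, I.Src0);
        I.Src0 = I.Dst;
      }
    }

    int main() {
      Inst I = {1, 2, 3};      // %r1 = %r2 + %r3, with %r1 tied to %r2
      convertToTwoAddress(I);  // prints:  %r1 = COPY %r2
      std::printf("  %%r%u = %%r%u + %%r%u\n", I.Dst, I.Src0, I.Src1);
      return 0;
    }
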
@@ -1694,9 +1534,10 @@ TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs,
continue;
// Check that the instructions are all in the same basic block.
- MachineInstr *SrcDefMI = MRI->getVRegDef(SrcReg);
- MachineInstr *DstDefMI = MRI->getVRegDef(DstReg);
- if (SrcDefMI->getParent() != DstDefMI->getParent())
+ MachineInstr *SrcDefMI = MRI->getUniqueVRegDef(SrcReg);
+ MachineInstr *DstDefMI = MRI->getUniqueVRegDef(DstReg);
+ if (!SrcDefMI || !DstDefMI ||
+ SrcDefMI->getParent() != DstDefMI->getParent())
continue;
// If there are no other uses than copies which feed into
@@ -1832,6 +1673,11 @@ bool TwoAddressInstructionPass::EliminateRegSequences() {
SmallVector<unsigned, 4> RealSrcs;
SmallSet<unsigned, 4> Seen;
for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
+ // Nothing needs to be inserted for <undef> operands.
+ if (MI->getOperand(i).isUndef()) {
+ MI->getOperand(i).setReg(0);
+ continue;
+ }
unsigned SrcReg = MI->getOperand(i).getReg();
unsigned SrcSubIdx = MI->getOperand(i).getSubReg();
unsigned SubIdx = MI->getOperand(i+1).getImm();
@@ -1841,7 +1687,7 @@ bool TwoAddressInstructionPass::EliminateRegSequences() {
MachineInstr *DefMI = NULL;
if (!MI->getOperand(i).getSubReg() &&
!TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
- DefMI = MRI->getVRegDef(SrcReg);
+ DefMI = MRI->getUniqueVRegDef(SrcReg);
}
if (DefMI && DefMI->isImplicitDef()) {
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 3bab93b..93840f0 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -18,12 +18,14 @@
#define DEBUG_TYPE "regalloc"
#include "VirtRegMap.h"
+#include "LiveDebugVariables.h"
#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -104,11 +106,149 @@ void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) {
Virt2StackSlotMap[virtReg] = SS;
}
-void VirtRegMap::rewrite(SlotIndexes *Indexes) {
+void VirtRegMap::print(raw_ostream &OS, const Module*) const {
+ OS << "********** REGISTER MAP **********\n";
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
+ OS << '[' << PrintReg(Reg, TRI) << " -> "
+ << PrintReg(Virt2PhysMap[Reg], TRI) << "] "
+ << MRI->getRegClass(Reg)->getName() << "\n";
+ }
+ }
+
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
+ OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
+ << "] " << MRI->getRegClass(Reg)->getName() << "\n";
+ }
+ }
+ OS << '\n';
+}
+
+void VirtRegMap::dump() const {
+ print(dbgs());
+}
+
+//===----------------------------------------------------------------------===//
+// VirtRegRewriter
+//===----------------------------------------------------------------------===//
+//
+// The VirtRegRewriter is the last of the register allocator passes.
+// It rewrites virtual registers to physical registers as specified in the
+// VirtRegMap analysis. It also updates live-in information on basic blocks
+// according to LiveIntervals.
+//
+namespace {
+class VirtRegRewriter : public MachineFunctionPass {
+ MachineFunction *MF;
+ const TargetMachine *TM;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ SlotIndexes *Indexes;
+ LiveIntervals *LIS;
+ VirtRegMap *VRM;
+
+ void rewrite();
+ void addMBBLiveIns();
+public:
+ static char ID;
+ VirtRegRewriter() : MachineFunctionPass(ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ virtual bool runOnMachineFunction(MachineFunction&);
+};
+} // end anonymous namespace
+
+char &llvm::VirtRegRewriterID = VirtRegRewriter::ID;
+
+INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
+ "Virtual Register Rewriter", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
+ "Virtual Register Rewriter", false, false)
+
+char VirtRegRewriter::ID = 0;
+
+void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveDebugVariables>();
+ AU.addRequired<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
+ MF = &fn;
+ TM = &MF->getTarget();
+ TRI = TM->getRegisterInfo();
+ TII = TM->getInstrInfo();
+ MRI = &MF->getRegInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ LIS = &getAnalysis<LiveIntervals>();
+ VRM = &getAnalysis<VirtRegMap>();
DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
<< "********** Function: "
<< MF->getFunction()->getName() << '\n');
- DEBUG(dump());
+ DEBUG(VRM->dump());
+
+ // Add kill flags while we still have virtual registers.
+ LIS->addKillFlags();
+
+ // Live-in lists on basic blocks are required for physregs.
+ addMBBLiveIns();
+
+ // Rewrite virtual registers.
+ rewrite();
+
+ // Write out new DBG_VALUE instructions.
+ getAnalysis<LiveDebugVariables>().emitDebugValues(VRM);
+
+ // All machine operands and other references to virtual registers have been
+ // replaced. Remove the virtual registers and release all the transient data.
+ VRM->clearAllVirt();
+ MRI->clearVirtRegs();
+ return true;
+}
+
+// Compute MBB live-in lists from virtual register live ranges and their
+// assignments.
+void VirtRegRewriter::addMBBLiveIns() {
+ SmallVector<MachineBasicBlock*, 16> LiveIn;
+ for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+ unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ if (LI.empty() || LIS->intervalIsInOneMBB(LI))
+ continue;
+ // This is a virtual register that is live across basic blocks. Its
+ // assigned PhysReg must be marked as live-in to those blocks.
+ unsigned PhysReg = VRM->getPhys(VirtReg);
+ assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register.");
+
+ // Scan the segments of LI.
+ for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I != E;
+ ++I) {
+ if (!Indexes->findLiveInMBBs(I->start, I->end, LiveIn))
+ continue;
+ for (unsigned i = 0, e = LiveIn.size(); i != e; ++i)
+ if (!LiveIn[i]->isLiveIn(PhysReg))
+ LiveIn[i]->addLiveIn(PhysReg);
+ LiveIn.clear();
+ }
+ }
+}
+
+void VirtRegRewriter::rewrite() {
SmallVector<unsigned, 8> SuperDeads;
SmallVector<unsigned, 8> SuperDefs;
SmallVector<unsigned, 8> SuperKills;
@@ -135,8 +275,9 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) {
if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
continue;
unsigned VirtReg = MO.getReg();
- unsigned PhysReg = getPhys(VirtReg);
- assert(PhysReg != NO_PHYS_REG && "Instruction uses unmapped VirtReg");
+ unsigned PhysReg = VRM->getPhys(VirtReg);
+ assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+ "Instruction uses unmapped VirtReg");
assert(!Reserved.test(PhysReg) && "Reserved register assignment");
// Preserve semantics of sub-register operands.
@@ -207,31 +348,3 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) {
if (!MRI->reg_nodbg_empty(Reg))
MRI->setPhysRegUsed(Reg);
}
-
-void VirtRegMap::print(raw_ostream &OS, const Module* M) const {
- const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
-
- OS << "********** REGISTER MAP **********\n";
- for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
- if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
- OS << '[' << PrintReg(Reg, TRI) << " -> "
- << PrintReg(Virt2PhysMap[Reg], TRI) << "] "
- << MRI.getRegClass(Reg)->getName() << "\n";
- }
- }
-
- for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
- if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
- OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
- << "] " << MRI.getRegClass(Reg)->getName() << "\n";
- }
- }
- OS << '\n';
-}
-
-void VirtRegMap::dump() const {
- print(dbgs());
-}
diff --git a/lib/CodeGen/VirtRegMap.h b/lib/CodeGen/VirtRegMap.h
index 8cac311..c320985 100644
--- a/lib/CodeGen/VirtRegMap.h
+++ b/lib/CodeGen/VirtRegMap.h
@@ -177,13 +177,6 @@ namespace llvm {
/// the specified stack slot
void assignVirt2StackSlot(unsigned virtReg, int frameIndex);
- /// rewrite - Rewrite all instructions in MF to use only physical registers
- /// by mapping all virtual register operands to their assigned physical
- /// registers.
- ///
- /// @param Indexes Optionally remove deleted instructions from indexes.
- void rewrite(SlotIndexes *Indexes);
-
void print(raw_ostream &OS, const Module* M = 0) const;
void dump() const;
};
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
index 24bf97f..b27d57b 100644
--- a/lib/DebugInfo/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -82,7 +82,7 @@ void DWARFCompileUnit::clear() {
Abbrevs = 0;
AddrSize = 0;
BaseAddr = 0;
- DieArray.clear();
+ clearDIEs(false);
}
void DWARFCompileUnit::dump(raw_ostream &OS) {
@@ -97,6 +97,13 @@ void DWARFCompileUnit::dump(raw_ostream &OS) {
getCompileUnitDIE(false)->dump(OS, this, -1U);
}
+const char *DWARFCompileUnit::getCompilationDir() {
+ extractDIEsIfNeeded(true);
+ if (DieArray.empty())
+ return 0;
+ return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+}
+
void DWARFCompileUnit::setDIERelations() {
if (DieArray.empty())
return;
@@ -201,7 +208,7 @@ size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) {
}
void DWARFCompileUnit::clearDIEs(bool keep_compile_unit_die) {
- if (DieArray.size() > 1) {
+ if (DieArray.size() > (unsigned)keep_compile_unit_die) {
// std::vectors never get any smaller when resized to a smaller size,
// or when clear() or erase() are called, the size will report that it
// is smaller, but the memory allocated remains intact (call capacity()
@@ -227,8 +234,8 @@ DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
// all compile units to stay loaded when they weren't needed. So we can end
// up parsing the DWARF and then throwing them all away to keep memory usage
// down.
- const bool clear_dies = extractDIEsIfNeeded(false) > 1;
-
+ const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
+ clear_dies_if_already_not_parsed;
DieArray[0].buildAddressRangeTable(this, debug_aranges);
// Keep memory down by clearing DIEs if this generate function
@@ -236,3 +243,13 @@ DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
if (clear_dies)
clearDIEs(true);
}
+
+const DWARFDebugInfoEntryMinimal*
+DWARFCompileUnit::getFunctionDIEForAddress(int64_t address) {
+ extractDIEsIfNeeded(false);
+ for (size_t i = 0, n = DieArray.size(); i != n; i++) {
+ if (DieArray[i].addressRangeContainsAddress(this, address))
+ return &DieArray[i];
+ }
+ return 0;
+}
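
getFunctionDIEForAddress() is a plain linear scan over the flattened DIE array, using the half-open [low_pc, high_pc) test from addressRangeContainsAddress(). The same idea in a self-contained form (the DIE struct here is a stand-in, not the real DWARFDebugInfoEntryMinimal):

    #include <cstdint>
    #include <vector>

    struct DIE { uint64_t LowPC, HighPC; };  // stand-in for a subprogram DIE

    // Return the first DIE whose [LowPC, HighPC) range contains Address,
    // or nullptr if none does.
    const DIE *findFunctionDIE(const std::vector<DIE> &Dies, uint64_t Address) {
      for (const DIE &D : Dies)
        if (D.LowPC <= Address && Address < D.HighPC)
          return &D;
      return nullptr;
    }
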
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index d916729..b34a596 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -43,7 +43,7 @@ public:
const DWARFAbbreviationDeclarationSet *abbrevs);
/// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
- /// hasn't already been done.
+ /// hasn't already been done. Returns the number of DIEs parsed at this call.
size_t extractDIEsIfNeeded(bool cu_die_only);
void clear();
void dump(raw_ostream &OS);
@@ -78,6 +78,8 @@ public:
return &DieArray[0];
}
+ const char *getCompilationDir();
+
/// setDIERelations - We read in all of the DIE entries into our flat list
/// of DIE entries and now we need to go back through all of them and set the
/// parent, sibling and child pointers for quick DIE navigation.
@@ -104,6 +106,11 @@ public:
void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
bool clear_dies_if_already_not_parsed);
+  /// getFunctionDIEForAddress - Returns a pointer to the parsed subprogram
+  /// DIE whose address ranges contain the provided address, or NULL if
+  /// there is no such subprogram. The pointer is valid until
+  /// DWARFCompileUnit::clear() or clearDIEs() is called.
+ const DWARFDebugInfoEntryMinimal *getFunctionDIEForAddress(int64_t address);
};
}
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index dccadc4..797662b 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -8,8 +8,10 @@
//===----------------------------------------------------------------------===//
#include "DWARFContext.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
@@ -140,30 +142,64 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t offset) {
return 0;
}
-DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address) {
+DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address,
+ DILineInfoSpecifier specifier) {
// First, get the offset of the compile unit.
uint32_t cuOffset = getDebugAranges()->findAddress(address);
// Retrieve the compile unit.
DWARFCompileUnit *cu = getCompileUnitForOffset(cuOffset);
if (!cu)
- return DILineInfo("<invalid>", 0, 0);
- // Get the line table for this compile unit.
- const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu);
- if (!lineTable)
- return DILineInfo("<invalid>", 0, 0);
- // Get the index of the row we're looking for in the line table.
- uint64_t hiPC =
- cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_high_pc,
- -1ULL);
- uint32_t rowIndex = lineTable->lookupAddress(address, hiPC);
- if (rowIndex == -1U)
- return DILineInfo("<invalid>", 0, 0);
-
-  // From here, construct the DILineInfo.
- const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex];
- const std::string &fileName = lineTable->Prologue.FileNames[row.File-1].Name;
-
- return DILineInfo(fileName.c_str(), row.Line, row.Column);
+ return DILineInfo();
+ SmallString<16> fileName("<invalid>");
+ SmallString<16> functionName("<invalid>");
+ uint32_t line = 0;
+ uint32_t column = 0;
+ if (specifier.needs(DILineInfoSpecifier::FunctionName)) {
+ const DWARFDebugInfoEntryMinimal *function_die =
+ cu->getFunctionDIEForAddress(address);
+ if (function_die) {
+ if (const char *name = function_die->getSubprogramName(cu))
+ functionName = name;
+ }
+ }
+ if (specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+ // Get the line table for this compile unit.
+ const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu);
+ if (lineTable) {
+ // Get the index of the row we're looking for in the line table.
+ uint32_t rowIndex = lineTable->lookupAddress(address);
+ if (rowIndex != -1U) {
+ const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex];
+ // Take file/line info from the line table.
+ const DWARFDebugLine::FileNameEntry &fileNameEntry =
+ lineTable->Prologue.FileNames[row.File - 1];
+ fileName = fileNameEntry.Name;
+ if (specifier.needs(DILineInfoSpecifier::AbsoluteFilePath) &&
+ sys::path::is_relative(fileName.str())) {
+ // Append include directory of file (if it is present in line table)
+ // and compilation directory of compile unit to make path absolute.
+ const char *includeDir = 0;
+ if (uint64_t includeDirIndex = fileNameEntry.DirIdx) {
+ includeDir = lineTable->Prologue
+ .IncludeDirectories[includeDirIndex - 1];
+ }
+ SmallString<16> absFileName;
+ if (includeDir == 0 || sys::path::is_relative(includeDir)) {
+ if (const char *compilationDir = cu->getCompilationDir())
+ sys::path::append(absFileName, compilationDir);
+ }
+ if (includeDir) {
+ sys::path::append(absFileName, includeDir);
+ }
+ sys::path::append(absFileName, fileName.str());
+ fileName = absFileName;
+ }
+ line = row.Line;
+ column = row.Column;
+ }
+ }
+ }
+ return DILineInfo(fileName, functionName, line, column);
}
void DWARFContextInMemory::anchor() { }
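
The path-absolutization above reduces to: keep the file name if it is already absolute; otherwise prepend the include directory, and prepend DW_AT_comp_dir first whenever the include directory is itself missing or relative. A sketch of that join with plain std::string (POSIX-style paths assumed; the real code uses llvm::sys::path):

    #include <string>

    // Assumption: POSIX paths, where an absolute path starts with '/'.
    static bool isRelative(const std::string &P) {
      return P.empty() || P[0] != '/';
    }

    std::string makeAbsolute(const std::string &CompDir,
                             const std::string &IncludeDir,
                             const std::string &FileName) {
      if (!isRelative(FileName))
        return FileName;                  // already absolute
      std::string Abs;
      if (IncludeDir.empty() || isRelative(IncludeDir))
        Abs = CompDir;                    // fall back to DW_AT_comp_dir
      if (!IncludeDir.empty())
        Abs += (Abs.empty() ? "" : "/") + IncludeDir;
      Abs += (Abs.empty() ? "" : "/") + FileName;
      return Abs;
    }
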
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index d2e763a..e55a27e 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -66,7 +66,8 @@ public:
const DWARFDebugLine::LineTable *
getLineTableForCompileUnit(DWARFCompileUnit *cu);
- virtual DILineInfo getLineInfoForAddress(uint64_t address);
+ virtual DILineInfo getLineInfoForAddress(uint64_t address,
+ DILineInfoSpecifier specifier = DILineInfoSpecifier());
bool isLittleEndian() const { return IsLittleEndian; }
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
index 1788145..ef470e5 100644
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -93,6 +93,7 @@ bool DWARFDebugAranges::generate(DWARFContext *ctx) {
cu->buildAddressRangeTable(this, true);
}
}
+ sort(true, /* overlap size */ 0);
return !isEmpty();
}
@@ -221,4 +222,3 @@ bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const {
HiPC = Aranges.back().HiPC();
return true;
}
-
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index 236db97..429a36c 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -440,3 +440,54 @@ DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *cu,
}
}
}
+
+bool
+DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
+ const DWARFCompileUnit *cu, const uint64_t address) const {
+ if (!isNULL() && getTag() == DW_TAG_subprogram) {
+ uint64_t hi_pc = -1ULL;
+ uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL);
+ if (lo_pc != -1ULL)
+ hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL);
+ if (hi_pc != -1ULL) {
+ return (lo_pc <= address && address < hi_pc);
+ }
+ }
+ return false;
+}
+
+const char*
+DWARFDebugInfoEntryMinimal::getSubprogramName(
+ const DWARFCompileUnit *cu) const {
+ if (isNULL() || getTag() != DW_TAG_subprogram)
+ return 0;
+ // Try to get mangled name if possible.
+ if (const char *name =
+ getAttributeValueAsString(cu, DW_AT_MIPS_linkage_name, 0))
+ return name;
+ if (const char *name = getAttributeValueAsString(cu, DW_AT_linkage_name, 0))
+ return name;
+ if (const char *name = getAttributeValueAsString(cu, DW_AT_name, 0))
+ return name;
+ // Try to get name from specification DIE.
+ uint32_t spec_ref =
+ getAttributeValueAsReference(cu, DW_AT_specification, -1U);
+ if (spec_ref != -1U) {
+ DWARFDebugInfoEntryMinimal spec_die;
+ if (spec_die.extract(cu, &spec_ref)) {
+ if (const char *name = spec_die.getSubprogramName(cu))
+ return name;
+ }
+ }
+ // Try to get name from abstract origin DIE.
+ uint32_t abs_origin_ref =
+ getAttributeValueAsReference(cu, DW_AT_abstract_origin, -1U);
+ if (abs_origin_ref != -1U) {
+ DWARFDebugInfoEntryMinimal abs_origin_die;
+ if (abs_origin_die.extract(cu, &abs_origin_ref)) {
+ if (const char *name = abs_origin_die.getSubprogramName(cu))
+ return name;
+ }
+ }
+ return 0;
+}
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index 37b3bcd..d5d86b9 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -128,6 +128,15 @@ public:
void buildAddressRangeTable(const DWARFCompileUnit *cu,
DWARFDebugAranges *debug_aranges) const;
+
+ bool addressRangeContainsAddress(const DWARFCompileUnit *cu,
+ const uint64_t address) const;
+
+  // If a DIE represents a subprogram, returns its mangled name (or short
+  // name, if the mangled name is missing). This name may be fetched from
+  // the specification or abstract-origin DIE of this subprogram. Returns
+  // null if no name is found.
+ const char* getSubprogramName(const DWARFCompileUnit *cu) const;
};
}
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index 117fa31..d99575d 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -95,14 +95,46 @@ void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
DWARFDebugLine::State::~State() {}
void DWARFDebugLine::State::appendRowToMatrix(uint32_t offset) {
+ if (Sequence::Empty) {
+ // Record the beginning of instruction sequence.
+ Sequence::Empty = false;
+ Sequence::LowPC = Address;
+ Sequence::FirstRowIndex = row;
+ }
++row; // Increase the row number.
LineTable::appendRow(*this);
+ if (EndSequence) {
+ // Record the end of instruction sequence.
+ Sequence::HighPC = Address;
+ Sequence::LastRowIndex = row;
+ if (Sequence::isValid())
+ LineTable::appendSequence(*this);
+ Sequence::reset();
+ }
Row::postAppend();
}
+void DWARFDebugLine::State::finalize() {
+ row = DoneParsingLineTable;
+ if (!Sequence::Empty) {
+    fprintf(stderr, "warning: last sequence in debug line table is not "
+                    "terminated!\n");
+ }
+ // Sort all sequences so that address lookup will work faster.
+ if (!Sequences.empty()) {
+ std::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC);
+    // Note: the instruction address ranges of sequences should not overlap
+    // (in shared objects and executables). If they do, the address lookup
+    // still works, but the result may be ambiguous. We don't report a
+    // warning in this case. For example, a .so compiled from multiple
+    // object files sometimes contains a few rudimentary sequences for
+    // address ranges [0x0, 0xsomething).
+ }
+}
+
DWARFDebugLine::DumpingState::~DumpingState() {}
-void DWARFDebugLine::DumpingState::finalize(uint32_t offset) {
+void DWARFDebugLine::DumpingState::finalize() {
LineTable::dump(OS);
}
@@ -180,8 +212,9 @@ DWARFDebugLine::parsePrologue(DataExtractor debug_line_data,
fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
" have ended at 0x%8.8x but it ended ad 0x%8.8x\n",
prologue_offset, end_prologue_offset, *offset_ptr);
+ return false;
}
- return end_prologue_offset;
+ return true;
}
bool
@@ -430,47 +463,53 @@ DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
}
}
- state.finalize(*offset_ptr);
+ state.finalize();
return end_offset;
}
-static bool findMatchingAddress(const DWARFDebugLine::Row& row1,
- const DWARFDebugLine::Row& row2) {
- return row1.Address < row2.Address;
-}
-
uint32_t
-DWARFDebugLine::LineTable::lookupAddress(uint64_t address,
- uint64_t cu_high_pc) const {
- uint32_t index = UINT32_MAX;
- if (!Rows.empty()) {
- // Use the lower_bound algorithm to perform a binary search since we know
- // that our line table data is ordered by address.
- DWARFDebugLine::Row row;
- row.Address = address;
- typedef std::vector<Row>::const_iterator iterator;
- iterator begin_pos = Rows.begin();
- iterator end_pos = Rows.end();
- iterator pos = std::lower_bound(begin_pos, end_pos, row,
- findMatchingAddress);
- if (pos == end_pos) {
- if (address < cu_high_pc)
- return Rows.size()-1;
- } else {
-      // Rely on the fact that we are using a std::vector and we can do
-      // pointer arithmetic to find the row index (which will be one less
-      // than what we found since it will find the first position after
- // the current address) since std::vector iterators are just
- // pointers to the container type.
- index = pos - begin_pos;
- if (pos->Address > address) {
- if (index > 0)
- --index;
- else
- index = UINT32_MAX;
- }
- }
+DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
+ uint32_t unknown_index = UINT32_MAX;
+ if (Sequences.empty())
+ return unknown_index;
+ // First, find an instruction sequence containing the given address.
+ DWARFDebugLine::Sequence sequence;
+ sequence.LowPC = address;
+ SequenceIter first_seq = Sequences.begin();
+ SequenceIter last_seq = Sequences.end();
+ SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
+ DWARFDebugLine::Sequence::orderByLowPC);
+ DWARFDebugLine::Sequence found_seq;
+ if (seq_pos == last_seq) {
+ found_seq = Sequences.back();
+ } else if (seq_pos->LowPC == address) {
+ found_seq = *seq_pos;
+ } else {
+ if (seq_pos == first_seq)
+ return unknown_index;
+ found_seq = *(seq_pos - 1);
+ }
+ if (!found_seq.containsPC(address))
+ return unknown_index;
+ // Search for instruction address in the rows describing the sequence.
+  // Rows are stored in a vector, so we can use arithmetic operations on
+  // iterators.
+ DWARFDebugLine::Row row;
+ row.Address = address;
+ RowIter first_row = Rows.begin() + found_seq.FirstRowIndex;
+ RowIter last_row = Rows.begin() + found_seq.LastRowIndex;
+ RowIter row_pos = std::lower_bound(first_row, last_row, row,
+ DWARFDebugLine::Row::orderByAddress);
+ if (row_pos == last_row) {
+ return found_seq.LastRowIndex - 1;
+ }
+ uint32_t index = found_seq.FirstRowIndex + (row_pos - first_row);
+ if (row_pos->Address > address) {
+ if (row_pos == first_row)
+ return unknown_index;
+ else
+ index--;
}
- return index; // Failed to find address.
+ return index;
}
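
The rewritten lookup is a two-level binary search: first find the sequence whose [LowPC, HighPC) range covers the address, then search only that sequence's slice of the row vector. A compact standalone model of the same idea (row addresses flattened to a vector of uint64_t for brevity):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Seq { uint64_t Lo, Hi; size_t FirstRow, LastRow; };

    // Seqs is sorted by Lo; RowAddrs is sorted within each [FirstRow, LastRow).
    // Returns the index of the last row with address <= PC inside the
    // covering sequence, or SIZE_MAX if no sequence covers PC.
    size_t lookup(const std::vector<Seq> &Seqs,
                  const std::vector<uint64_t> &RowAddrs, uint64_t PC) {
      auto S = std::upper_bound(
          Seqs.begin(), Seqs.end(), PC,
          [](uint64_t P, const Seq &Q) { return P < Q.Lo; });
      if (S == Seqs.begin())
        return SIZE_MAX;
      --S;                            // last sequence with Lo <= PC
      if (PC >= S->Hi)
        return SIZE_MAX;              // PC falls in a gap between sequences
      auto First = RowAddrs.begin() + S->FirstRow;
      auto Last = RowAddrs.begin() + S->LastRow;
      auto R = std::upper_bound(First, Last, PC);
      if (R == First)
        return SIZE_MAX;
      return size_t(R - RowAddrs.begin()) - 1;
    }
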
diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h
index bc6a70b..6382b45 100644
--- a/lib/DebugInfo/DWARFDebugLine.h
+++ b/lib/DebugInfo/DWARFDebugLine.h
@@ -12,7 +12,6 @@
#include "llvm/Support/DataExtractor.h"
#include <map>
-#include <string>
#include <vector>
namespace llvm {
@@ -22,9 +21,9 @@ class raw_ostream;
class DWARFDebugLine {
public:
struct FileNameEntry {
- FileNameEntry() : DirIdx(0), ModTime(0), Length(0) {}
+ FileNameEntry() : Name(0), DirIdx(0), ModTime(0), Length(0) {}
- std::string Name;
+ const char *Name;
uint64_t DirIdx;
uint64_t ModTime;
uint64_t Length;
@@ -56,7 +55,7 @@ public:
// The number assigned to the first special opcode.
uint8_t OpcodeBase;
std::vector<uint8_t> StandardOpcodeLengths;
- std::vector<std::string> IncludeDirectories;
+ std::vector<const char*> IncludeDirectories;
std::vector<FileNameEntry> FileNames;
// Length of the prologue in bytes.
@@ -89,6 +88,10 @@ public:
void reset(bool default_is_stmt);
void dump(raw_ostream &OS) const;
+ static bool orderByAddress(const Row& LHS, const Row& RHS) {
+ return LHS.Address < RHS.Address;
+ }
+
// The program-counter value corresponding to a machine instruction
// generated by the compiler.
uint64_t Address;
@@ -126,21 +129,63 @@ public:
EpilogueBegin:1;
};
+  // Represents a series of contiguous machine instructions. The line table
+  // for each compilation unit may consist of multiple sequences, which are
+  // not guaranteed to be in ascending instruction-address order.
+ struct Sequence {
+ // Sequence describes instructions at address range [LowPC, HighPC)
+ // and is described by line table rows [FirstRowIndex, LastRowIndex).
+ uint64_t LowPC;
+ uint64_t HighPC;
+ unsigned FirstRowIndex;
+ unsigned LastRowIndex;
+ bool Empty;
+
+ Sequence() { reset(); }
+ void reset() {
+ LowPC = 0;
+ HighPC = 0;
+ FirstRowIndex = 0;
+ LastRowIndex = 0;
+ Empty = true;
+ }
+ static bool orderByLowPC(const Sequence& LHS, const Sequence& RHS) {
+ return LHS.LowPC < RHS.LowPC;
+ }
+ bool isValid() const {
+ return !Empty && (LowPC < HighPC) && (FirstRowIndex < LastRowIndex);
+ }
+ bool containsPC(uint64_t pc) const {
+ return (LowPC <= pc && pc < HighPC);
+ }
+ };
+
struct LineTable {
void appendRow(const DWARFDebugLine::Row &state) { Rows.push_back(state); }
+ void appendSequence(const DWARFDebugLine::Sequence &sequence) {
+ Sequences.push_back(sequence);
+ }
void clear() {
Prologue.clear();
Rows.clear();
+ Sequences.clear();
}
- uint32_t lookupAddress(uint64_t address, uint64_t cu_high_pc) const;
+ // Returns the index of the row with file/line info for a given address,
+ // or -1 if there is no such row.
+ uint32_t lookupAddress(uint64_t address) const;
void dump(raw_ostream &OS) const;
struct Prologue Prologue;
- std::vector<Row> Rows;
+ typedef std::vector<Row> RowVector;
+ typedef RowVector::const_iterator RowIter;
+ typedef std::vector<Sequence> SequenceVector;
+ typedef SequenceVector::const_iterator SequenceIter;
+ RowVector Rows;
+ SequenceVector Sequences;
};
- struct State : public Row, public LineTable {
+ struct State : public Row, public Sequence, public LineTable {
// Special row codes.
enum {
StartParsingLineTable = 0,
@@ -151,8 +196,11 @@ public:
virtual ~State();
virtual void appendRowToMatrix(uint32_t offset);
- virtual void finalize(uint32_t offset) { row = DoneParsingLineTable; }
- virtual void reset() { Row::reset(Prologue.DefaultIsStmt); }
+ virtual void finalize();
+ virtual void reset() {
+ Row::reset(Prologue.DefaultIsStmt);
+ Sequence::reset();
+ }
// The row number that starts at zero for the prologue, and increases for
// each row added to the matrix.
@@ -162,7 +210,7 @@ public:
struct DumpingState : public State {
DumpingState(raw_ostream &OS) : OS(OS) {}
virtual ~DumpingState();
- virtual void finalize(uint32_t offset);
+ virtual void finalize();
private:
raw_ostream &OS;
};
diff --git a/lib/ExecutionEngine/EventListenerCommon.h b/lib/ExecutionEngine/EventListenerCommon.h
index 1c07c94..911d1d6 100644
--- a/lib/ExecutionEngine/EventListenerCommon.h
+++ b/lib/ExecutionEngine/EventListenerCommon.h
@@ -14,8 +14,8 @@
#ifndef EVENT_LISTENER_COMMON_H
#define EVENT_LISTENER_COMMON_H
+#include "llvm/DebugInfo.h"
#include "llvm/Metadata.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/Path.h"
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 5dfa78f..c11c17e 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -16,11 +16,11 @@
#include "llvm/ExecutionEngine/JITEventListener.h"
#define DEBUG_TYPE "amplifier-jit-event-listener"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/Metadata.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/OwningPtr.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/ExecutionEngine/IntelJITEventsWrapper.h"
#include "llvm/Support/Debug.h"
@@ -138,7 +138,7 @@ void IntelJITEventListener::NotifyFunctionEmitted(
// the first instruction that has one
if (FunctionMessage.source_file_name == 0) {
MDNode *scope = I->Loc.getScope(
- Details.MF->getFunction()->getContext());
+ Details.MF->getFunction()->getContext());
FunctionMessage.source_file_name = const_cast<char*>(
Filenames.getFullPath(scope));
}
@@ -152,7 +152,7 @@ void IntelJITEventListener::NotifyFunctionEmitted(
}
Wrapper.iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
- &FunctionMessage);
+ &FunctionMessage);
MethodIDs[FnStart] = FunctionMessage.method_id;
}
diff --git a/lib/ExecutionEngine/Interpreter/CMakeLists.txt b/lib/ExecutionEngine/Interpreter/CMakeLists.txt
index d331f83..74df8f0 100644
--- a/lib/ExecutionEngine/Interpreter/CMakeLists.txt
+++ b/lib/ExecutionEngine/Interpreter/CMakeLists.txt
@@ -15,3 +15,5 @@ add_llvm_library(LLVMInterpreter
if( LLVM_ENABLE_FFI )
target_link_libraries( LLVMInterpreter ${FFI_LIBRARY_PATH} )
endif()
+
+add_dependencies(LLVMInterpreter intrinsics_gen)
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
index af47be9..5202b09 100644
--- a/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -651,11 +651,40 @@ void Interpreter::visitSwitchInst(SwitchInst &I) {
// Check to see if any of the cases match...
BasicBlock *Dest = 0;
for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
- GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF);
- if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
- Dest = cast<BasicBlock>(i.getCaseSuccessor());
- break;
+ IntegersSubset& Case = i.getCaseValueEx();
+ if (Case.isSingleNumber()) {
+      // FIXME: Currently we only work with ConstantInt-based numbers.
+ const ConstantInt *CI = Case.getSingleNumber(0).toConstantInt();
+ GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
+ if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
+ Dest = cast<BasicBlock>(i.getCaseSuccessor());
+ break;
+ }
}
+ if (Case.isSingleNumbersOnly()) {
+ for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
+        // FIXME: Currently we only work with ConstantInt-based numbers.
+ const ConstantInt *CI = Case.getSingleNumber(n).toConstantInt();
+ GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
+ if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
+ Dest = cast<BasicBlock>(i.getCaseSuccessor());
+ break;
+ }
+ }
+ } else
+ for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
+ IntegersSubset::Range r = Case.getItem(n);
+        // FIXME: Currently we only work with ConstantInt-based numbers.
+ const ConstantInt *LowCI = r.getLow().toConstantInt();
+ const ConstantInt *HighCI = r.getHigh().toConstantInt();
+ GenericValue Low = getOperandValue(const_cast<ConstantInt*>(LowCI), SF);
+ GenericValue High = getOperandValue(const_cast<ConstantInt*>(HighCI), SF);
+ if (executeICMP_ULE(Low, CondVal, ElTy).IntVal != 0 &&
+ executeICMP_ULE(CondVal, High, ElTy).IntVal != 0) {
+ Dest = cast<BasicBlock>(i.getCaseSuccessor());
+ break;
+ }
+ }
}
if (!Dest) Dest = I.getDefaultDest(); // No cases matched: use default
SwitchToNewBasicBlock(Dest, SF);
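
With IntegersSubset a case is no longer a single constant but a set of inclusive ranges, so "does this case match" becomes an unsigned interval test. The dispatch logic in miniature (plain integers standing in for GenericValue/IntegersSubset):

    #include <cstdint>
    #include <vector>

    struct CaseRange { uint64_t Low, High; int Dest; };

    // Return the successor index for Cond, or DefaultDest when no case
    // range contains it. Bounds are inclusive and compared as unsigned,
    // mirroring the two executeICMP_ULE checks above.
    int dispatchSwitch(uint64_t Cond, const std::vector<CaseRange> &Cases,
                       int DefaultDest) {
      for (const CaseRange &C : Cases)
        if (C.Low <= Cond && Cond <= C.High)
          return C.Dest;
      return DefaultDest;
    }
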
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index a942299..97995ad 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -361,7 +361,7 @@ bool JIT::removeModule(Module *M) {
MutexGuard locked(lock);
- if (jitstate->getModule() == M) {
+ if (jitstate && jitstate->getModule() == M) {
delete jitstate;
jitstate = 0;
}
@@ -433,13 +433,18 @@ GenericValue JIT::runFunction(Function *F,
}
break;
case 1:
- if (FTy->getNumParams() == 1 &&
- FTy->getParamType(0)->isIntegerTy(32)) {
+ if (FTy->getParamType(0)->isIntegerTy(32)) {
GenericValue rv;
int (*PF)(int) = (int(*)(int))(intptr_t)FPtr;
rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue()));
return rv;
}
+ if (FTy->getParamType(0)->isPointerTy()) {
+ GenericValue rv;
+ int (*PF)(char *) = (int(*)(char *))(intptr_t)FPtr;
+ rv.IntVal = APInt(32, PF((char*)GVTOP(ArgValues[0])));
+ return rv;
+ }
break;
}
}
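
The new case follows the existing int(int) pattern: the raw JITed code address is cast to a concrete function-pointer type and called natively. The casting idiom, with an ordinary function standing in for JIT output (illustrative only):

    #include <cstdint>
    #include <cstdio>

    static int greet(char *S) { return std::printf("hello %s\n", S); }

    int main() {
      // In the JIT, FPtr is the address of freshly emitted machine code;
      // here an ordinary function stands in for it.
      void *FPtr = (void *)(intptr_t)&greet;
      int (*PF)(char *) = (int (*)(char *))(intptr_t)FPtr;
      char Arg[] = "world";
      return PF(Arg) > 0 ? 0 : 1;
    }
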
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index 504c8bd..ff3a9dc 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -17,9 +17,9 @@
#include "JITDwarfEmitter.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/Constants.h"
-#include "llvm/Module.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Module.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineCodeInfo.h"
@@ -108,13 +108,18 @@ namespace {
/// particular GlobalVariable so that we can reuse them if necessary.
GlobalToIndirectSymMapTy GlobalToIndirectSymMap;
+#ifndef NDEBUG
/// Instance of the JIT this ResolverState serves.
JIT *TheJIT;
+#endif
public:
JITResolverState(JIT *jit) : FunctionToLazyStubMap(this),
- FunctionToCallSitesMap(this),
- TheJIT(jit) {}
+ FunctionToCallSitesMap(this) {
+#ifndef NDEBUG
+ TheJIT = jit;
+#endif
+ }
FunctionToLazyStubMapTy& getFunctionToLazyStubMap(
const MutexGuard& locked) {
diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
index 2d1775c..7be6ef8 100644
--- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
+++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
@@ -852,7 +852,7 @@ static int jit_noop() {
/// for resolving library symbols, not code generated symbols.
///
void *DefaultJITMemoryManager::getPointerToNamedFunction(const std::string &Name,
- bool AbortOnFailure) {
+ bool AbortOnFailure) {
// Check to see if this is one of the functions we want to intercept. Note,
// we cast to intptr_t here to silence a -pedantic warning that complains
// about casting a function pointer to a normal pointer.
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 44f89cf..739ffd7d8 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -18,6 +18,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MutexGuard.h"
#include "llvm/Target/TargetData.h"
using namespace llvm;
@@ -45,7 +46,7 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
// If the target supports JIT code generation, create the JIT.
if (TargetJITInfo *TJ = TM->getJITInfo())
- return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM, M), GVsWithCode);
+ return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM), GVsWithCode);
if (ErrorStr)
*ErrorStr = "target does not support JIT code generation";
@@ -54,9 +55,35 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji,
RTDyldMemoryManager *MM, bool AllocateGVsWithCode)
- : ExecutionEngine(m), TM(tm), MemMgr(MM), M(m), OS(Buffer), Dyld(MM) {
+ : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(MM), Dyld(MM),
+ isCompiled(false), M(m), OS(Buffer) {
setTargetData(TM->getTargetData());
+}
+
+MCJIT::~MCJIT() {
+ delete MemMgr;
+ delete TM;
+}
+
+void MCJIT::emitObject(Module *m) {
+ /// Currently, MCJIT only supports a single module and the module passed to
+ /// this function call is expected to be the contained module. The module
+ /// is passed as a parameter here to prepare for multiple module support in
+ /// the future.
+ assert(M == m);
+
+ // Get a thread lock to make sure we aren't trying to compile multiple times
+ MutexGuard locked(lock);
+
+ // FIXME: Track compilation state on a per-module basis when multiple modules
+ // are supported.
+ // Re-compilation is not supported
+ if (isCompiled)
+ return;
+
+ PassManager PM;
+
PM.add(new TargetData(*TM->getTargetData()));
// Turn the machine code intermediate representation into bytes in memory
@@ -69,23 +96,22 @@ MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji,
// FIXME: When we support multiple modules, we'll want to move the code
// gen and finalization out of the constructor here and do it more
// on-demand as part of getPointerToFunction().
- PM.run(*M);
+ PM.run(*m);
// Flush the output buffer so the SmallVector gets its data.
OS.flush();
// Load the object into the dynamic linker.
- MemoryBuffer *MB = MemoryBuffer::getMemBuffer(StringRef(Buffer.data(),
+ MemoryBuffer* MB = MemoryBuffer::getMemBuffer(StringRef(Buffer.data(),
Buffer.size()),
"", false);
if (Dyld.loadObject(MB))
report_fatal_error(Dyld.getErrorString());
+
// Resolve any relocations.
Dyld.resolveRelocations();
-}
-MCJIT::~MCJIT() {
- delete MemMgr;
- delete TM;
+ // FIXME: Add support for per-module compilation state
+ isCompiled = true;
}
void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
@@ -93,6 +119,10 @@ void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
}
void *MCJIT::getPointerToFunction(Function *F) {
+ // FIXME: Add support for per-module compilation state
+ if (!isCompiled)
+ emitObject(M);
+
if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
bool AbortOnFailure = !F->hasExternalWeakLinkage();
void *Addr = getPointerToNamedFunction(F->getName(), AbortOnFailure);
@@ -100,6 +130,7 @@ void *MCJIT::getPointerToFunction(Function *F) {
return Addr;
}
+ // FIXME: Should the Dyld be retaining module information? Probably not.
// FIXME: Should we be using the mangler for this? Probably.
StringRef BaseName = F->getName();
if (BaseName[0] == '\1')
@@ -217,7 +248,11 @@ GenericValue MCJIT::runFunction(Function *F,
}
void *MCJIT::getPointerToNamedFunction(const std::string &Name,
- bool AbortOnFailure){
+ bool AbortOnFailure) {
+ // FIXME: Add support for per-module compilation state
+ if (!isCompiled)
+ emitObject(M);
+
if (!isSymbolSearchingDisabled() && MemMgr) {
void *ptr = MemMgr->getPointerToNamedFunction(Name, false);
if (ptr)
@@ -231,7 +266,7 @@ void *MCJIT::getPointerToNamedFunction(const std::string &Name,
if (AbortOnFailure) {
report_fatal_error("Program used external function '"+Name+
- "' which could not be resolved!");
+ "' which could not be resolved!");
}
return 0;
}
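
The control-flow change is a lazily-compile-once pattern: each entry point checks a per-engine flag and calls emitObject() on first use, which re-checks under a lock. A generic sketch with std::mutex in place of LLVM's MutexGuard:

    #include <mutex>

    class LazyJIT {
      std::mutex Lock;
      bool IsCompiled = false;

      void emitObject() {
        std::lock_guard<std::mutex> Guard(Lock);
        if (IsCompiled)      // re-compilation is not supported
          return;
        // ... run codegen passes, load the object, resolve relocations ...
        IsCompiled = true;
      }

    public:
      void *getPointerToFunction(/*Function *F*/) {
        if (!IsCompiled)     // cheap check, as in the patch; emitObject()
          emitObject();      // re-checks under the lock
        return nullptr;      // symbol lookup elided in this sketch
      }
    };
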
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 2b3df98..1d272e9 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -29,17 +29,16 @@ class MCJIT : public ExecutionEngine {
TargetMachine *TM;
MCContext *Ctx;
RTDyldMemoryManager *MemMgr;
+ RuntimeDyld Dyld;
-  // FIXME: These may need to be moved to a separate 'jitstate' member like the
- // non-MC JIT does for multithreading and such. Just keep them here for now.
- PassManager PM;
+ // FIXME: Add support for multiple modules
+ bool isCompiled;
Module *M;
- // FIXME: This really doesn't belong here.
+
+ // FIXME: Move these to a single container which manages JITed objects
SmallVector<char, 4096> Buffer; // Working buffer into which we JIT.
raw_svector_ostream OS;
- RuntimeDyld Dyld;
-
public:
~MCJIT();
@@ -91,6 +90,14 @@ public:
TargetMachine *TM);
// @}
+
+protected:
+ /// emitObject -- Generate a JITed object in memory from the specified module
+ /// Currently, MCJIT only supports a single module and the module passed to
+ /// this function call is expected to be the contained module. The module
+ /// is passed as a parameter here to prepare for multiple module support in
+ /// the future.
+ void emitObject(Module *M);
};
} // End llvm namespace
diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
index a68949a..441aaeb 100644
--- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
+++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
@@ -22,24 +22,20 @@ namespace llvm {
// matching LLVM IR counterparts in the module(s) being compiled.
class MCJITMemoryManager : public RTDyldMemoryManager {
virtual void anchor();
- JITMemoryManager *JMM;
+ OwningPtr<JITMemoryManager> JMM;
- // FIXME: Multiple modules.
- Module *M;
public:
- MCJITMemoryManager(JITMemoryManager *jmm, Module *m) :
- JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()), M(m) {}
- // We own the JMM, so make sure to delete it.
- ~MCJITMemoryManager() { delete JMM; }
+ MCJITMemoryManager(JITMemoryManager *jmm) :
+ JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()) {}
uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID) {
- return JMM->allocateSpace(Size, Alignment);
+ return JMM->allocateDataSection(Size, Alignment, SectionID);
}
uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID) {
- return JMM->allocateSpace(Size, Alignment);
+ return JMM->allocateCodeSection(Size, Alignment, SectionID);
}
virtual void *getPointerToNamedFunction(const std::string &Name,
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index e6142e3..6b8e9d1 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -16,9 +16,9 @@
#include "llvm/ExecutionEngine/JITEventListener.h"
#define DEBUG_TYPE "oprofile-jit-event-listener"
+#include "llvm/DebugInfo.h"
#include "llvm/Function.h"
#include "llvm/ADT/OwningPtr.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/ExecutionEngine/OProfileWrapper.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
index 8206ead..c3e3572 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
@@ -48,7 +48,7 @@ public:
virtual void updateSymbolAddress(const object::SymbolRef &Sym, uint64_t Addr)
{}
- // Subclasses can override this method to provide JIT debugging support
+ // Subclasses can override these methods to provide JIT debugging support
virtual void registerWithDebugger() {}
virtual void deregisterWithDebugger() {}
};
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 1b1840a..b464040 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -39,7 +39,7 @@ namespace {
// Resolve the relocations for all symbols we currently know about.
void RuntimeDyldImpl::resolveRelocations() {
// First, resolve relocations associated with external symbols.
- resolveSymbols();
+ resolveExternalSymbols();
// Just iterate over the sections we have and resolve all the relocations
// in them. Gross overkill, but it gets the job done.
@@ -59,8 +59,8 @@ void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress,
llvm_unreachable("Attempting to remap address of unknown section!");
}
-// Subclasses can implement this method to create specialized image instances
-// The caller owns the the pointer that is returned.
+// Subclasses can implement this method to create specialized image instances.
+// The caller owns the pointer that is returned.
ObjectImage *RuntimeDyldImpl::createObjectImage(const MemoryBuffer *InputBuffer) {
ObjectFile *ObjFile = ObjectFile::createObjectFile(const_cast<MemoryBuffer*>
(InputBuffer));
@@ -75,11 +75,15 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
Arch = (Triple::ArchType)obj->getArch();
- LocalSymbolMap LocalSymbols; // Functions and data symbols from the
- // object file.
- ObjSectionToIDMap LocalSections; // Used sections from the object file
- CommonSymbolMap CommonSymbols; // Common symbols requiring allocation
- uint64_t CommonSize = 0;
+ // Symbols found in this object
+ StringMap<SymbolLoc> LocalSymbols;
+ // Used sections from the object file
+ ObjSectionToIDMap LocalSections;
+
+ // Common symbols requiring allocation, and the total size required to
+ // allocate all common symbols.
+ CommonSymbolMap CommonSymbols;
+ uint64_t CommonSize = 0;
error_code err;
// Parse symbols
@@ -106,28 +110,29 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
if (SymType == object::SymbolRef::ST_Function ||
SymType == object::SymbolRef::ST_Data) {
uint64_t FileOffset;
- StringRef sData;
+ StringRef SectionData;
section_iterator si = obj->end_sections();
Check(i->getFileOffset(FileOffset));
Check(i->getSection(si));
if (si == obj->end_sections()) continue;
- Check(si->getContents(sData));
+ Check(si->getContents(SectionData));
const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() +
(uintptr_t)FileOffset;
- uintptr_t SectOffset = (uintptr_t)(SymPtr - (const uint8_t*)sData.begin());
+ uintptr_t SectOffset = (uintptr_t)(SymPtr -
+ (const uint8_t*)SectionData.begin());
unsigned SectionID =
findOrEmitSection(*obj,
*si,
SymType == object::SymbolRef::ST_Function,
LocalSections);
- bool isGlobal = flags & SymbolRef::SF_Global;
LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset);
DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset)
<< " flags: " << flags
<< " SID: " << SectionID
<< " Offset: " << format("%p", SectOffset));
+ bool isGlobal = flags & SymbolRef::SF_Global;
if (isGlobal)
- SymbolTable[Name] = SymbolLoc(SectionID, SectOffset);
+ GlobalSymbolTable[Name] = SymbolLoc(SectionID, SectOffset);
}
}
DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << "\n");
@@ -137,7 +142,7 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
if (CommonSize != 0)
emitCommonSymbols(*obj, CommonSymbols, CommonSize, LocalSymbols);
- // Parse and proccess relocations
+ // Parse and process relocations
DEBUG(dbgs() << "Parse relocations:\n");
for (section_iterator si = obj->begin_sections(),
se = obj->end_sections(); si != se; si.increment(err)) {
@@ -150,7 +155,7 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
e = si->end_relocations(); i != e; i.increment(err)) {
Check(err);
- // If it's first relocation in this section, find its SectionID
+ // If it's the first relocation in this section, find its SectionID
if (isFirstRelocation) {
SectionID = findOrEmitSection(*obj, *si, true, LocalSections);
DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
@@ -177,10 +182,10 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
return false;
}
-unsigned RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
- const CommonSymbolMap &Map,
- uint64_t TotalSize,
- LocalSymbolMap &LocalSymbols) {
+void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
+ const CommonSymbolMap &CommonSymbols,
+ uint64_t TotalSize,
+ SymbolTableMap &SymbolTable) {
// Allocate memory for the section
unsigned SectionID = Sections.size();
uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*),
@@ -197,18 +202,16 @@ unsigned RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
<< "\n");
// Assign the address of each symbol
- for (CommonSymbolMap::const_iterator it = Map.begin(), itEnd = Map.end();
- it != itEnd; it++) {
- uint64_t Size = it->second;
+ for (CommonSymbolMap::const_iterator it = CommonSymbols.begin(),
+ itEnd = CommonSymbols.end(); it != itEnd; it++) {
StringRef Name;
it->first.getName(Name);
Obj.updateSymbolAddress(it->first, (uint64_t)Addr);
- LocalSymbols[Name.data()] = SymbolLoc(SectionID, Offset);
+ SymbolTable[Name.data()] = SymbolLoc(SectionID, Offset);
+ uint64_t Size = it->second;
Offset += Size;
Addr += Size;
}
-
- return SectionID;
}
unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
@@ -274,8 +277,8 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
}
else {
// Even if we didn't load the section, we need to record an entry for it
- // to handle later processing (and by 'handle' I mean don't do anything
- // with these sections).
+ // to handle later processing (and by 'handle' I mean don't do anything
+ // with these sections).
Allocate = 0;
Addr = 0;
DEBUG(dbgs() << "emitSection SectionID: " << SectionID
@@ -307,28 +310,26 @@ unsigned RuntimeDyldImpl::findOrEmitSection(ObjectImage &Obj,
return SectionID;
}
-void RuntimeDyldImpl::AddRelocation(const RelocationValueRef &Value,
- unsigned SectionID, uintptr_t Offset,
- uint32_t RelType) {
- DEBUG(dbgs() << "AddRelocation SymNamePtr: " << format("%p", Value.SymbolName)
- << " SID: " << Value.SectionID
- << " Addend: " << format("%p", Value.Addend)
- << " Offset: " << format("%p", Offset)
- << " RelType: " << format("%x", RelType)
- << "\n");
+void RuntimeDyldImpl::addRelocationForSection(const RelocationEntry &RE,
+ unsigned SectionID) {
+ Relocations[SectionID].push_back(RE);
+}
- if (Value.SymbolName == 0) {
- Relocations[Value.SectionID].push_back(RelocationEntry(
- SectionID,
- Offset,
- RelType,
- Value.Addend));
- } else
- SymbolRelocations[Value.SymbolName].push_back(RelocationEntry(
- SectionID,
- Offset,
- RelType,
- Value.Addend));
+void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE,
+ StringRef SymbolName) {
+ // Relocation by symbol. If the symbol is found in the global symbol table,
+ // create an appropriate section relocation. Otherwise, add it to
+ // ExternalSymbolRelocations.
+ SymbolTableMap::const_iterator Loc =
+ GlobalSymbolTable.find(SymbolName);
+ if (Loc == GlobalSymbolTable.end()) {
+ ExternalSymbolRelocations[SymbolName].push_back(RE);
+ } else {
+ // Copy the RE since we want to modify its addend.
+ RelocationEntry RECopy = RE;
+ RECopy.Addend += Loc->second.second;
+ Relocations[Loc->second.first].push_back(RECopy);
+ }
}
uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
@@ -369,12 +370,12 @@ void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE,
uint8_t *Target = Sections[RE.SectionID].Address + RE.Offset;
DEBUG(dbgs() << "\tSectionID: " << RE.SectionID
<< " + " << RE.Offset << " (" << format("%p", Target) << ")"
- << " Data: " << RE.Data
+ << " RelType: " << RE.RelType
<< " Addend: " << RE.Addend
<< "\n");
resolveRelocation(Target, Sections[RE.SectionID].LoadAddress + RE.Offset,
- Value, RE.Data, RE.Addend);
+ Value, RE.RelType, RE.Addend);
}
}
@@ -385,16 +386,14 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs,
}
}
-// resolveSymbols - Resolve any relocations to the specified symbols if
-// we know where it lives.
-void RuntimeDyldImpl::resolveSymbols() {
- StringMap<RelocationList>::iterator i = SymbolRelocations.begin(),
- e = SymbolRelocations.end();
+void RuntimeDyldImpl::resolveExternalSymbols() {
+ StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin(),
+ e = ExternalSymbolRelocations.end();
for (; i != e; i++) {
StringRef Name = i->first();
RelocationList &Relocs = i->second;
- StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(Name);
- if (Loc == SymbolTable.end()) {
+ SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
+ if (Loc == GlobalSymbolTable.end()) {
        // This is an external symbol; try to get its address from the
        // MemoryManager.
uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(),
@@ -404,15 +403,7 @@ void RuntimeDyldImpl::resolveSymbols() {
<< "\n");
resolveRelocationList(Relocs, (uintptr_t)Addr);
} else {
- // Change the relocation to be section relative rather than symbol
- // relative and move it to the resolved relocation list.
- DEBUG(dbgs() << "Resolving symbol '" << Name << "'\n");
- for (int i = 0, e = Relocs.size(); i != e; ++i) {
- RelocationEntry Entry = Relocs[i];
- Entry.Addend += Loc->second.second;
- Relocations[Loc->second.first].push_back(Entry);
- }
- Relocs.clear();
+ report_fatal_error("Expected external symbol");
}
}
}
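
The relocation bookkeeping now splits on whether the symbol is already known: if it is in the global symbol table, its section offset is folded into the addend and the entry becomes an ordinary section-relative relocation; only genuinely external symbols are queued for resolveExternalSymbols(). A small model of that split (std::map stands in for the StringMap containers):

    #include <cstdint>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    struct Reloc { uint64_t Offset; uint32_t Type; int64_t Addend; };
    using SymbolLoc = std::pair<unsigned, uint64_t>;  // (SectionID, Offset)

    std::map<std::string, SymbolLoc> GlobalSymbolTable;
    std::map<unsigned, std::vector<Reloc>> Relocations;            // by section
    std::map<std::string, std::vector<Reloc>> ExternalRelocations; // by name

    void addRelocationForSymbol(const Reloc &RE, const std::string &Name) {
      auto Loc = GlobalSymbolTable.find(Name);
      if (Loc == GlobalSymbolTable.end()) {
        // Defer: the memory manager will supply the address later.
        ExternalRelocations[Name].push_back(RE);
      } else {
        Reloc Copy = RE;                      // copy so we can adjust it
        Copy.Addend += Loc->second.second;    // fold in the symbol's offset
        Relocations[Loc->second.first].push_back(Copy);
      }
    }
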
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index db6da8c..75bb586 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -55,7 +55,7 @@ public:
const MemoryBuffer& getBuffer() const { return *InputData; }
- // Methods for type inquiry through isa, cast, and dyn_cast
+ // Methods for type inquiry through isa, cast and dyn_cast
static inline bool classof(const Binary *v) {
return (isa<ELFObjectFile<target_endianness, is64Bits> >(v)
&& classof(cast<ELFObjectFile<target_endianness, is64Bits> >(v)));
@@ -208,10 +208,9 @@ void RuntimeDyldELF::resolveX86_64Relocation(uint8_t *LocalAddress,
case ELF::R_X86_64_32:
case ELF::R_X86_64_32S: {
Value += Addend;
- // FIXME: Handle the possibility of this assertion failing
- assert((Type == ELF::R_X86_64_32 && !(Value & 0xFFFFFFFF00000000ULL)) ||
- (Type == ELF::R_X86_64_32S &&
- (Value & 0xFFFFFFFF00000000ULL) == 0xFFFFFFFF00000000ULL));
+ assert((Type == ELF::R_X86_64_32 && (Value <= UINT32_MAX)) ||
+ (Type == ELF::R_X86_64_32S &&
+ ((int64_t)Value <= INT32_MAX && (int64_t)Value >= INT32_MIN)));
uint32_t TruncatedAddr = (Value & 0xFFFFFFFF);
uint32_t *Target = reinterpret_cast<uint32_t*>(LocalAddress);
*Target = TruncatedAddr;
@@ -220,7 +219,7 @@ void RuntimeDyldELF::resolveX86_64Relocation(uint8_t *LocalAddress,
case ELF::R_X86_64_PC32: {
uint32_t *Placeholder = reinterpret_cast<uint32_t*>(LocalAddress);
int64_t RealOffset = *Placeholder + Value + Addend - FinalAddress;
- assert(RealOffset <= 214783647 && RealOffset >= -214783648);
+ assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
*Placeholder = TruncOffset;
break;
@@ -248,7 +247,7 @@ void RuntimeDyldELF::resolveX86Relocation(uint8_t *LocalAddress,
}
default:
// There are other relocation types, but it appears these are the
- // only ones currently used by the LLVM ELF object writer
+ // only ones currently used by the LLVM ELF object writer
llvm_unreachable("Relocation type not implemented yet!");
break;
}
@@ -334,28 +333,31 @@ void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress,
void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
ObjectImage &Obj,
ObjSectionToIDMap &ObjSectionToID,
- LocalSymbolMap &Symbols,
+ const SymbolTableMap &Symbols,
StubMap &Stubs) {
uint32_t RelType = (uint32_t)(Rel.Type & 0xffffffffL);
intptr_t Addend = (intptr_t)Rel.AdditionalInfo;
- RelocationValueRef Value;
- StringRef TargetName;
const SymbolRef &Symbol = Rel.Symbol;
+
+ // Obtain the symbol name which is referenced in the relocation
+ StringRef TargetName;
Symbol.getName(TargetName);
DEBUG(dbgs() << "\t\tRelType: " << RelType
<< " Addend: " << Addend
<< " TargetName: " << TargetName
<< "\n");
- // First look the symbol in object file symbols.
- LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data());
+ RelocationValueRef Value;
+ // First search for the symbol in the local symbol table
+ SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
if (lsi != Symbols.end()) {
Value.SectionID = lsi->second.first;
Value.Addend = lsi->second.second;
} else {
- // Second look the symbol in global symbol table.
- StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data());
- if (gsi != SymbolTable.end()) {
+ // Search for the symbol in the global symbol table
+ SymbolTableMap::const_iterator gsi =
+ GlobalSymbolTable.find(TargetName.data());
+ if (gsi != GlobalSymbolTable.end()) {
Value.SectionID = gsi->second.first;
Value.Addend = gsi->second.second;
} else {
@@ -366,7 +368,7 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
// TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION; this is not obvious
// and can be changed by other developers. Maybe the best way is to add
// a new symbol type ST_Section to SymbolRef and use it.
- section_iterator si = Obj.end_sections();
+ section_iterator si(Obj.end_sections());
Symbol.getSection(si);
if (si == Obj.end_sections())
llvm_unreachable("Symbol section not found, bad object file format!");
@@ -410,14 +412,24 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
Stubs[Value] = Section.StubOffset;
uint8_t *StubTargetAddr = createStubFunction(Section.Address +
Section.StubOffset);
- AddRelocation(Value, Rel.SectionID,
- StubTargetAddr - Section.Address, ELF::R_ARM_ABS32);
+ RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address,
+ ELF::R_ARM_ABS32, Value.Addend);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+
resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address +
Section.StubOffset, RelType, 0);
Section.StubOffset += getMaxStubSize();
}
- } else
- AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType);
+ } else {
+ RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+ }
}
bool RuntimeDyldELF::isCompatibleFormat(const MemoryBuffer *InputBuffer) const {
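The rewritten asserts in resolveX86_64Relocation encode the ELF ABI rule directly: a R_X86_64_32 value must zero-extend back to the original 64 bits, while R_X86_64_32S must sign-extend (the deleted R_X86_64_PC32 assert also had a typo, 214783647 for 2147483647, which the INT32_MAX form fixes). A small helper expressing the same check, shown only for illustration:

#include <cstdint>

// Returns true when a 64-bit value survives truncation to 32 bits under
// the extension rule of the relocation type (zero- vs sign-extension).
bool fitsX86_64_32(uint64_t Value, bool Signed) {
  if (Signed) // R_X86_64_32S
    return (int64_t)Value <= INT32_MAX && (int64_t)Value >= INT32_MIN;
  return Value <= UINT32_MAX; // R_X86_64_32
}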
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index e7f6fab..e413f78 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -51,7 +51,8 @@ protected:
virtual void processRelocationRef(const ObjRelocationInfo &Rel,
ObjectImage &Obj,
ObjSectionToIDMap &ObjSectionToID,
- LocalSymbolMap &Symbols, StubMap &Stubs);
+ const SymbolTableMap &Symbols,
+ StubMap &Stubs);
virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer);
virtual void handleObjectLoaded(ObjectImage *Obj);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 2dea13f..c38ca69 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -14,60 +14,83 @@
#ifndef LLVM_RUNTIME_DYLD_IMPL_H
#define LLVM_RUNTIME_DYLD_IMPL_H
+#include "ObjectImage.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/Object/ObjectFile.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/Twine.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/system_error.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/ADT/Triple.h"
-#include <map>
#include "llvm/Support/Format.h"
-#include "ObjectImage.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <map>
using namespace llvm;
using namespace llvm::object;
namespace llvm {
+class MemoryBuffer;
+class Twine;
+
+
+/// SectionEntry - represents a section emitted into memory by the dynamic
+/// linker.
class SectionEntry {
public:
- uint8_t* Address;
+ /// Address - address in the linker's memory where the section resides.
+ uint8_t *Address;
+
+ /// Size - section size.
size_t Size;
- uint64_t LoadAddress; // For each section, the address it will be
- // considered to live at for relocations. The same
- // as the pointer to the above memory block for
- // hosted JITs.
- uintptr_t StubOffset; // It's used for architecturies with stub
- // functions for far relocations like ARM.
- uintptr_t ObjAddress; // Section address in object file. It's use for
- // calculate MachO relocation addend
- SectionEntry(uint8_t* address, size_t size, uintptr_t stubOffset,
+
+ /// LoadAddress - the address of the section in the target process's memory.
+ /// Used for situations in which JIT-ed code is being executed in the address
+ /// space of a separate process. If the code executes in the same address
+ /// space where it was JIT-ed, this just equals Address.
+ uint64_t LoadAddress;
+
+ /// StubOffset - used for architectures with stub functions for far
+ /// relocations (like ARM).
+ uintptr_t StubOffset;
+
+ /// ObjAddress - address of the section in the in-memory object file. Used
+ /// for calculating relocations in some object formats (like MachO).
+ uintptr_t ObjAddress;
+
+ SectionEntry(uint8_t *address, size_t size, uintptr_t stubOffset,
uintptr_t objAddress)
: Address(address), Size(size), LoadAddress((uintptr_t)address),
StubOffset(stubOffset), ObjAddress(objAddress) {}
};
+/// RelocationEntry - used to represent relocations internally in the dynamic
+/// linker.
class RelocationEntry {
public:
- unsigned SectionID; // Section the relocation is contained in.
- uintptr_t Offset; // Offset into the section for the relocation.
- uint32_t Data; // Relocatino data. Including type of relocation
- // and another flags and parameners from
- intptr_t Addend; // Addend encoded in the instruction itself, if any,
- // plus the offset into the source section for
- // the symbol once the relocation is resolvable.
- RelocationEntry(unsigned id, uint64_t offset, uint32_t data, int64_t addend)
- : SectionID(id), Offset(offset), Data(data), Addend(addend) {}
+ /// SectionID - the section this relocation points to.
+ unsigned SectionID;
+
+ /// Offset - offset into the section.
+ uintptr_t Offset;
+
+ /// RelType - relocation type.
+ uint32_t RelType;
+
+ /// Addend - the relocation addend encoded in the instruction itself. Also
+ /// used to make a relocation section relative instead of symbol relative.
+ intptr_t Addend;
+
+ RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend)
+ : SectionID(id), Offset(offset), RelType(type), Addend(addend) {}
};
-// Raw relocation data from object file
+/// ObjRelocationInfo - relocation information as read from the object file.
+/// Used to pass around data taken from object::RelocationRef, together with
+/// the section to which the relocation points (represented by a SectionID).
class ObjRelocationInfo {
public:
unsigned SectionID;
@@ -97,7 +120,8 @@ protected:
// The MemoryManager to load objects into.
RTDyldMemoryManager *MemMgr;
- // A list of emmitted sections.
+ // A list of all sections emitted by the dynamic linker. These sections are
+ // referenced in the code by means of their index in this list - SectionID.
typedef SmallVector<SectionEntry, 64> SectionList;
SectionList Sections;
@@ -105,11 +129,11 @@ protected:
// references it.
typedef std::map<SectionRef, unsigned> ObjSectionToIDMap;
- // Master symbol table. As modules are loaded and external symbols are
- // resolved, their addresses are stored here as a SectionID/Offset pair.
+ // A global symbol table for symbols from all loaded modules. Maps the
+ // symbol name to a (SectionID, offset in section) pair.
typedef std::pair<unsigned, uintptr_t> SymbolLoc;
- StringMap<SymbolLoc> SymbolTable;
- typedef DenseMap<const char*, SymbolLoc> LocalSymbolMap;
+ typedef StringMap<SymbolLoc> SymbolTableMap;
+ SymbolTableMap GlobalSymbolTable;
// Keep a map of common symbols to their sizes
typedef std::map<SymbolRef, unsigned> CommonSymbolMap;
@@ -121,12 +145,14 @@ protected:
// in the relocation list where it's stored.
typedef SmallVector<RelocationEntry, 64> RelocationList;
// Relocations to sections already loaded. Indexed by SectionID which is the
- // source of the address. The target where the address will be writen is
+ // source of the address. The target where the address will be written is
// SectionID/Offset in the relocation itself.
DenseMap<unsigned, RelocationList> Relocations;
- // Relocations to external symbols that are not yet resolved.
- // Indexed by symbol name.
- StringMap<RelocationList> SymbolRelocations;
+
+ // Relocations to external symbols that are not yet resolved. Symbols are
+ // external when they aren't found in the global symbol table of all loaded
+ // modules. This map is indexed by symbol name.
+ StringMap<RelocationList> ExternalSymbolRelocations;
typedef std::map<RelocationValueRef, uintptr_t> StubMap;
@@ -153,16 +179,17 @@ protected:
return (uint8_t*)Sections[SectionID].Address;
}
- /// \brief Emits a section containing common symbols.
- /// \return SectionID.
- unsigned emitCommonSymbols(ObjectImage &Obj,
- const CommonSymbolMap &Map,
- uint64_t TotalSize,
- LocalSymbolMap &Symbols);
+ /// \brief Given the common symbols discovered in the object file, emit a
+ /// new section for them and update the symbol mappings in the object and
+ /// symbol table.
+ void emitCommonSymbols(ObjectImage &Obj,
+ const CommonSymbolMap &CommonSymbols,
+ uint64_t TotalSize,
+ SymbolTableMap &SymbolTable);
/// \brief Emits section data from the object file to the MemoryManager.
/// \param IsCode if it's true then allocateCodeSection() will be
- /// used for emmits, else allocateDataSection() will be used.
+ /// used for emitting, else allocateDataSection() will be used.
/// \return SectionID.
unsigned emitSection(ObjectImage &Obj,
const SectionRef &Section,
@@ -178,10 +205,12 @@ protected:
bool IsCode,
ObjSectionToIDMap &LocalSections);
- /// \brief If Value.SymbolName is NULL then store relocation to the
- /// Relocations, else store it in the SymbolRelocations.
- void AddRelocation(const RelocationValueRef &Value, unsigned SectionID,
- uintptr_t Offset, uint32_t RelType);
+ /// \brief Add a relocation entry that uses the given section.
+ void addRelocationForSection(const RelocationEntry &RE, unsigned SectionID);
+
+ /// \brief Add a relocation entry that uses the given symbol. This symbol may
+ /// be found in the global symbol table, or it may be external.
+ void addRelocationForSymbol(const RelocationEntry &RE, StringRef SymbolName);
/// \brief Emits long jump instruction to Addr.
/// \return Pointer to the memory area for emitting target address.
@@ -203,14 +232,16 @@ protected:
uint32_t Type,
int64_t Addend) = 0;
- /// \brief Parses the object file relocation and store it to Relocations
- /// or SymbolRelocations. Its depend from object file type.
+ /// \brief Parses the object file relocation and stores it to Relocations
+ /// or ExternalSymbolRelocations (this depends on the object file type).
virtual void processRelocationRef(const ObjRelocationInfo &Rel,
ObjectImage &Obj,
ObjSectionToIDMap &ObjSectionToID,
- LocalSymbolMap &Symbols, StubMap &Stubs) = 0;
+ const SymbolTableMap &Symbols,
+ StubMap &Stubs) = 0;
- void resolveSymbols();
+ /// \brief Resolve relocations to external symbols.
+ void resolveExternalSymbols();
virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer);
virtual void handleObjectLoaded(ObjectImage *Obj)
{
@@ -228,9 +259,9 @@ public:
void *getSymbolAddress(StringRef Name) {
// FIXME: Just look up as a function for now. Overly simple of course.
// Work in progress.
- if (SymbolTable.find(Name) == SymbolTable.end())
+ if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
return 0;
- SymbolLoc Loc = SymbolTable.lookup(Name);
+ SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
return getSectionAddress(Loc.first) + Loc.second;
}
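Under the renamed GlobalSymbolTable, a symbol's address is always section base plus offset, exactly as getSymbolAddress() above computes it. A compressed sketch of that lookup path with stand-in types (not the tree's classes):

#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct Section { uint8_t *Address; };
using SymLoc = std::pair<unsigned, uintptr_t>; // (SectionID, offset)

void *lookupSymbolAddress(const std::map<std::string, SymLoc> &Table,
                          const std::vector<Section> &Sections,
                          const std::string &Name) {
  auto It = Table.find(Name);
  if (It == Table.end())
    return nullptr; // not loaded yet
  // Section base plus the symbol's offset within that section.
  return Sections[It->second.first].Address + It->second.second;
}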
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index b7f515d..0e3a9d4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -30,7 +30,8 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress,
unsigned MachoType = (Type >> 28) & 0xf;
unsigned Size = 1 << ((Type >> 25) & 3);
- DEBUG(dbgs() << "resolveRelocation LocalAddress: " << format("%p", LocalAddress)
+ DEBUG(dbgs() << "resolveRelocation LocalAddress: "
+ << format("%p", LocalAddress)
<< " FinalAddress: " << format("%p", FinalAddress)
<< " Value: " << format("%p", Value)
<< " Addend: " << Addend
@@ -53,12 +54,12 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress,
break;
case Triple::x86:
resolveI386Relocation(LocalAddress,
- FinalAddress,
- (uintptr_t)Value,
- isPCRel,
- Type,
- Size,
- Addend);
+ FinalAddress,
+ (uintptr_t)Value,
+ isPCRel,
+ Type,
+ Size,
+ Addend);
break;
case Triple::arm: // Fall through.
case Triple::thumb:
@@ -73,14 +74,13 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress,
}
}
-bool RuntimeDyldMachO::
-resolveI386Relocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
- uint64_t Value,
- bool isPCRel,
- unsigned Type,
- unsigned Size,
- int64_t Addend) {
+bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
+ uint64_t FinalAddress,
+ uint64_t Value,
+ bool isPCRel,
+ unsigned Type,
+ unsigned Size,
+ int64_t Addend) {
if (isPCRel)
Value -= FinalAddress + 4; // see resolveX86_64Relocation
@@ -102,14 +102,13 @@ resolveI386Relocation(uint8_t *LocalAddress,
}
}
-bool RuntimeDyldMachO::
-resolveX86_64Relocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
- uint64_t Value,
- bool isPCRel,
- unsigned Type,
- unsigned Size,
- int64_t Addend) {
+bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
+ uint64_t FinalAddress,
+ uint64_t Value,
+ bool isPCRel,
+ unsigned Type,
+ unsigned Size,
+ int64_t Addend) {
// If the relocation is PC-relative, the value to be encoded is the
// pointer difference.
if (isPCRel)
@@ -144,14 +143,13 @@ resolveX86_64Relocation(uint8_t *LocalAddress,
}
}
-bool RuntimeDyldMachO::
-resolveARMRelocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
- uint64_t Value,
- bool isPCRel,
- unsigned Type,
- unsigned Size,
- int64_t Addend) {
+bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
+ uint64_t FinalAddress,
+ uint64_t Value,
+ bool isPCRel,
+ unsigned Type,
+ unsigned Size,
+ int64_t Addend) {
// If the relocation is PC-relative, the value to be encoded is the
// pointer difference.
if (isPCRel) {
@@ -207,7 +205,7 @@ resolveARMRelocation(uint8_t *LocalAddress,
void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
ObjectImage &Obj,
ObjSectionToIDMap &ObjSectionToID,
- LocalSymbolMap &Symbols,
+ const SymbolTableMap &Symbols,
StubMap &Stubs) {
uint32_t RelType = (uint32_t) (Rel.Type & 0xffffffffL);
@@ -217,18 +215,19 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
bool isExtern = (RelType >> 27) & 1;
if (isExtern) {
+ // Obtain the symbol name which is referenced in the relocation
StringRef TargetName;
const SymbolRef &Symbol = Rel.Symbol;
Symbol.getName(TargetName);
- // First look the symbol in object file symbols.
- LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data());
+ // First search for the symbol in the local symbol table
+ SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
if (lsi != Symbols.end()) {
Value.SectionID = lsi->second.first;
Value.Addend = lsi->second.second;
} else {
- // Second look the symbol in global symbol table.
- StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data());
- if (gsi != SymbolTable.end()) {
+ // Search for the symbol in the global symbol table
+ SymbolTableMap::const_iterator gsi =
+ GlobalSymbolTable.find(TargetName.data());
+ if (gsi != GlobalSymbolTable.end()) {
Value.SectionID = gsi->second.first;
Value.Addend = gsi->second.second;
} else
@@ -249,8 +248,8 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
Value.SectionID = findOrEmitSection(Obj, *si, true, ObjSectionToID);
Value.Addend = *(const intptr_t *)Target;
if (Value.Addend) {
- // The MachO addend is offset from the current section, we need set it
- // as offset from destination section
+ // The MachO addend is an offset from the current section. We need it
+ // to be an offset from the destination section
Value.Addend += Section.ObjAddress - Sections[Value.SectionID].ObjAddress;
}
}
@@ -269,19 +268,29 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
Stubs[Value] = Section.StubOffset;
uint8_t *StubTargetAddr = createStubFunction(Section.Address +
Section.StubOffset);
- AddRelocation(Value, Rel.SectionID, StubTargetAddr - Section.Address,
- macho::RIT_Vanilla);
+ RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address,
+ macho::RIT_Vanilla, Value.Addend);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
resolveRelocation(Target, (uint64_t)Target,
(uint64_t)Section.Address + Section.StubOffset,
RelType, 0);
Section.StubOffset += getMaxStubSize();
}
- } else
- AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType);
+ } else {
+ RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+ }
}
-bool RuntimeDyldMachO::isCompatibleFormat(const MemoryBuffer *InputBuffer) const {
+bool RuntimeDyldMachO::isCompatibleFormat(
+ const MemoryBuffer *InputBuffer) const {
StringRef Magic = InputBuffer->getBuffer().slice(0, 4);
if (Magic == "\xFE\xED\xFA\xCE") return true;
if (Magic == "\xCE\xFA\xED\xFE") return true;
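resolveRelocation() above unpacks several fields from a single packed 32-bit Mach-O relocation word: the type sits in bits 28-31, the extern flag in bit 27, and a log2 access length in bits 25-26. A standalone decoder for just the fields the hunks show:

#include <cstdint>
#include <cstdio>

// Unpack the fields RuntimeDyldMachO reads out of its packed 32-bit
// relocation word (bit layout as used in the hunks above).
void decodeMachORelWord(uint32_t Word) {
  unsigned Type     = (Word >> 28) & 0xf;       // relocation type
  bool     isExtern = (Word >> 27) & 1;         // external symbol?
  unsigned Size     = 1u << ((Word >> 25) & 3); // access size in bytes
  std::printf("type=%u extern=%d size=%u\n", Type, (int)isExtern, Size);
}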
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index 418d130..707664c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -51,7 +51,8 @@ protected:
virtual void processRelocationRef(const ObjRelocationInfo &Rel,
ObjectImage &Obj,
ObjSectionToIDMap &ObjSectionToID,
- LocalSymbolMap &Symbols, StubMap &Stubs);
+ const SymbolTableMap &Symbols,
+ StubMap &Stubs);
public:
virtual void resolveRelocation(uint8_t *LocalAddress,
diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index 42364f9..7cdd669 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -26,11 +26,7 @@
using namespace llvm;
TargetMachine *EngineBuilder::selectTarget() {
- StringRef MArch = "";
- StringRef MCPU = "";
- SmallVector<std::string, 1> MAttrs;
- Triple TT(M->getTargetTriple());
-
+ Triple TT(LLVM_HOSTTRIPLE);
return selectTarget(TT, MArch, MCPU, MAttrs);
}
@@ -56,8 +52,9 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple,
}
if (!TheTarget) {
- *ErrorStr = "No available targets are compatible with this -march, "
- "see -version for the available targets.\n";
+ if (ErrorStr)
+ *ErrorStr = "No available targets are compatible with this -march, "
+ "see -version for the available targets.\n";
return 0;
}
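The selectTarget() fix also guards the optional ErrorStr out-parameter before writing through it, so callers that pass a null pointer no longer crash on the error path. The pattern in isolation (illustrative function, not the tree's API):

#include <string>

// Write the error message only when the caller asked for one, mirroring
// the guarded *ErrorStr assignment above.
bool reportIfRequested(bool Failed, std::string *ErrorStr) {
  if (Failed && ErrorStr)
    *ErrorStr = "No available targets are compatible with this -march";
  return !Failed;
}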
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 765fcc8..a6599bf 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -16,6 +16,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
+#include "llvm/TypeFinder.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
@@ -25,6 +26,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm-c/Linker.h"
#include <cctype>
using namespace llvm;
@@ -594,13 +596,13 @@ void ModuleLinker::computeTypeMapping() {
// At this point, the destination module may have a type "%foo = { i32 }" for
// example. When the source module got loaded into the same LLVMContext, if
// it had the same type, it would have been renamed to "%foo.42 = { i32 }".
- std::vector<StructType*> SrcStructTypes;
- SrcM->findUsedStructTypes(SrcStructTypes);
+ TypeFinder SrcStructTypes;
+ SrcStructTypes.run(*SrcM, true);
SmallPtrSet<StructType*, 32> SrcStructTypesSet(SrcStructTypes.begin(),
SrcStructTypes.end());
- std::vector<StructType*> DstStructTypes;
- DstM->findUsedStructTypes(DstStructTypes);
+ TypeFinder DstStructTypes;
+ DstStructTypes.run(*DstM, true);
SmallPtrSet<StructType*, 32> DstStructTypesSet(DstStructTypes.begin(),
DstStructTypes.end());
@@ -683,7 +685,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
GlobalVariable *NG =
new GlobalVariable(*DstGV->getParent(), NewType, SrcGV->isConstant(),
DstGV->getLinkage(), /*init*/0, /*name*/"", DstGV,
- DstGV->isThreadLocal(),
+ DstGV->getThreadLocalMode(),
DstGV->getType()->getAddressSpace());
// Propagate alignment, visibility and section info.
@@ -758,7 +760,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
new GlobalVariable(*DstM, TypeMap.get(SGV->getType()->getElementType()),
SGV->isConstant(), SGV->getLinkage(), /*init*/0,
SGV->getName(), /*insertbefore*/0,
- SGV->isThreadLocal(),
+ SGV->getThreadLocalMode(),
SGV->getType()->getAddressSpace());
// Propagate alignment, visibility and section info.
copyGVAttributes(NewDGV, SGV);
@@ -1335,3 +1337,17 @@ bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode,
return false;
}
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
+ LLVMLinkerMode Mode, char **OutMessages) {
+ std::string Messages;
+ LLVMBool Result = Linker::LinkModules(unwrap(Dest), unwrap(Src),
+ Mode, OutMessages? &Messages : 0);
+ if (OutMessages)
+ *OutMessages = strdup(Messages.c_str());
+ return Result;
+}
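The new C binding can be driven as below. This is a hedged usage sketch: the LLVMLinkerDestroySource mode constant comes from llvm-c/Linker.h (included above) and is assumed here. Note that the binding strdup()s the message whenever OutMessages is non-null, so the caller owns and frees it:

#include <stdio.h>
#include <stdlib.h>
#include "llvm-c/Core.h"
#include "llvm-c/Linker.h"

int linkInto(LLVMModuleRef Dest, LLVMModuleRef Src) {
  char *Msg = NULL;
  LLVMBool Failed = LLVMLinkModules(Dest, Src, LLVMLinkerDestroySource, &Msg);
  if (Failed)
    fprintf(stderr, "link error: %s\n", Msg ? Msg : "(unknown)");
  free(Msg); // strdup()'d by the binding, freed by the caller
  return Failed ? 1 : 0;
}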
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index f11e686..99bff96 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -28,6 +28,7 @@ add_llvm_library(LLVMMC
MCObjectStreamer.cpp
MCObjectWriter.cpp
MCPureStreamer.cpp
+ MCRegisterInfo.cpp
MCSection.cpp
MCSectionCOFF.cpp
MCSectionELF.cpp
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 9fc33b6..7203b9a 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -627,7 +627,7 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
const MCValue &Target,
- const MCFragment &F,
+ const MCFragment &F,
const MCFixup &Fixup,
bool IsPCRel) const {
const MCSymbol &Symbol = Target.getSymA()->getSymbol();
@@ -1061,11 +1061,19 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
entry.Index += LocalSymbolData.size();
if (is64Bit()) {
String64(*F, entry.r_offset);
+ if (TargetObjectWriter->isN64()) {
+ String32(*F, entry.Index);
- struct ELF::Elf64_Rela ERE64;
- ERE64.setSymbolAndType(entry.Index, entry.Type);
- String64(*F, ERE64.r_info);
-
+ String8(*F, TargetObjectWriter->getRSsym(entry.Type));
+ String8(*F, TargetObjectWriter->getRType3(entry.Type));
+ String8(*F, TargetObjectWriter->getRType2(entry.Type));
+ String8(*F, TargetObjectWriter->getRType(entry.Type));
+ }
+ else {
+ struct ELF::Elf64_Rela ERE64;
+ ERE64.setSymbolAndType(entry.Index, entry.Type);
+ String64(*F, ERE64.r_info);
+ }
if (hasRelocationAddend())
String64(*F, entry.r_addend);
} else {
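The isN64() branch above writes the 64-bit r_info field as a 32-bit symbol index followed by four one-byte type fields, per the MIPS N64 ABI, instead of the classic packed Elf64_Rela encoding. Side by side (the struct layout is for illustration only; the writer emits the bytes individually):

#include <cstdint>

// Classic ELF64: r_info packs symbol index and type into one 64-bit word.
uint64_t elf64RInfo(uint32_t Sym, uint32_t Type) {
  return ((uint64_t)Sym << 32) | Type;
}

// MIPS N64 (as emitted above): the same 8 bytes are written as a 32-bit
// symbol index followed by four single-byte fields.
struct N64RInfo {
  uint32_t r_sym;   // symbol index
  uint8_t  r_ssym;  // special symbol
  uint8_t  r_type3; // third relocation type
  uint8_t  r_type2; // second relocation type
  uint8_t  r_type;  // first relocation type
};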
diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp
index 0b2e4ae..2e447b0 100644
--- a/lib/MC/MCAsmBackend.cpp
+++ b/lib/MC/MCAsmBackend.cpp
@@ -39,7 +39,7 @@ MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{ "FK_SecRel_4", 0, 32, 0 },
{ "FK_SecRel_8", 0, 64, 0 }
};
-
+
assert((size_t)Kind <= sizeof(Builtins) / sizeof(Builtins[0]) &&
"Unknown fixup kind");
return Builtins[Kind];
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 8286c1d..8da2e0e 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -50,6 +50,7 @@ MCAsmInfo::MCAsmInfo() {
AllowNameToStartWithDigit = false;
AllowPeriodsInName = true;
AllowUTF8 = true;
+ UseDataRegionDirectives = false;
ZeroDirective = "\t.zero\t";
AsciiDirective = "\t.ascii\t";
AscizDirective = "\t.asciz\t";
@@ -57,12 +58,6 @@ MCAsmInfo::MCAsmInfo() {
Data16bitsDirective = "\t.short\t";
Data32bitsDirective = "\t.long\t";
Data64bitsDirective = "\t.quad\t";
- DataBegin = "$d.";
- CodeBegin = "$a.";
- JT8Begin = "$d.";
- JT16Begin = "$d.";
- JT32Begin = "$d.";
- SupportsDataRegions = false;
SunStyleELFSectionSwitchSyntax = false;
UsesELFSectionDirectiveForBSS = false;
AlignDirective = "\t.align\t";
@@ -89,14 +84,10 @@ MCAsmInfo::MCAsmInfo() {
SupportsDebugInformation = false;
ExceptionsType = ExceptionHandling::None;
DwarfUsesInlineInfoSection = false;
- DwarfRequiresRelocationForSectionOffset = true;
DwarfSectionOffsetDirective = 0;
- DwarfUsesLabelOffsetForRanges = true;
- DwarfUsesRelocationsForStringPool = true;
+ DwarfUsesRelocationsAcrossSections = true;
DwarfRegNumForCFI = false;
HasMicrosoftFastStdCallMangling = false;
-
- AsmTransCBE = 0;
}
MCAsmInfo::~MCAsmInfo() {
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 881d992..678e75a 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -26,7 +26,7 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
PrivateGlobalPrefix = "L"; // Prefix for private global symbols
WeakRefDirective = "\t.weak\t";
LinkOnceDirective = "\t.linkonce discard\n";
-
+
// Doesn't support visibility:
HiddenVisibilityAttr = HiddenDeclarationVisibilityAttr = MCSA_Invalid;
ProtectedVisibilityAttr = MCSA_Invalid;
@@ -36,8 +36,6 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
SupportsDebugInformation = true;
DwarfSectionOffsetDirective = "\t.secrel32\t";
HasMicrosoftFastStdCallMangling = true;
-
- SupportsDataRegions = false;
}
void MCAsmInfoMicrosoft::anchor() { }
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index c1e2635..8e0ac23 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -18,7 +18,7 @@
#include "llvm/MC/MCStreamer.h"
using namespace llvm;
-void MCAsmInfoDarwin::anchor() { }
+void MCAsmInfoDarwin::anchor() { }
MCAsmInfoDarwin::MCAsmInfoDarwin() {
// Common settings for all Darwin targets.
@@ -43,13 +43,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
HasMachoTBSSDirective = true; // Uses .tbss
HasStaticCtorDtorReferenceInStaticMode = true;
- CodeBegin = "L$start$code$";
- DataBegin = "L$start$data$";
- JT8Begin = "L$start$jt8$";
- JT16Begin = "L$start$jt16$";
- JT32Begin = "L$start$jt32$";
- SupportsDataRegions = true;
-
// FIXME: Darwin 10 and newer don't need this.
LinkerRequiresNonEmptyDwarfLines = true;
@@ -61,12 +54,10 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
// Doesn't support protected visibility.
ProtectedVisibilityAttr = MCSA_Invalid;
-
+
HasDotTypeDotSizeDirective = false;
HasNoDeadStrip = true;
HasSymbolResolver = true;
- DwarfRequiresRelocationForSectionOffset = false;
- DwarfUsesLabelOffsetForRanges = false;
- DwarfUsesRelocationsForStringPool = false;
+ DwarfUsesRelocationsAcrossSections = false;
}
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 11f0f72..373df4b 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -138,6 +138,7 @@ public:
virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol);
virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
+ virtual void EmitDataRegion(MCDataRegionType Kind);
virtual void EmitThumbFunc(MCSymbol *Func);
virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
@@ -170,7 +171,7 @@ public:
unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
- unsigned Size = 0, unsigned ByteAlignment = 0);
+ uint64_t Size = 0, unsigned ByteAlignment = 0);
virtual void EmitTBSSSymbol (const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment = 0);
@@ -352,6 +353,21 @@ void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
EmitEOL();
}
+void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) {
+ MCContext &Ctx = getContext();
+ const MCAsmInfo &MAI = Ctx.getAsmInfo();
+ if (!MAI.doesSupportDataRegionDirectives())
+ return;
+ switch (Kind) {
+ case MCDR_DataRegion: OS << "\t.data_region"; break;
+ case MCDR_DataRegionJT8: OS << "\t.data_region jt8"; break;
+ case MCDR_DataRegionJT16: OS << "\t.data_region jt16"; break;
+ case MCDR_DataRegionJT32: OS << "\t.data_region jt32"; break;
+ case MCDR_DataRegionEnd: OS << "\t.end_data_region"; break;
+ }
+ EmitEOL();
+}
+
void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
// This needs to emit to a temporary string to get properly quoted
// MCSymbols when they have spaces in them.
@@ -513,7 +529,7 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
}
void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- unsigned Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment) {
// Note: a .zerofill directive does not switch sections.
OS << ".zerofill ";
@@ -826,7 +842,7 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
if (IsVerboseAsm) {
OS.PadToColumn(MAI.getCommentColumn());
- OS << MAI.getCommentString() << ' ' << FileName << ':'
+ OS << MAI.getCommentString() << ' ' << FileName << ':'
<< Line << ':' << Column;
}
EmitEOL();
@@ -1009,7 +1025,7 @@ void MCAsmStreamer::EmitCFISignalFrame() {
if (!UseCFI)
return;
- OS << "\t.cif_signal_frame";
+ OS << "\t.cfi_signal_frame";
EmitEOL();
}
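MCAsmStreamer::EmitDataRegion() above maps each MCDataRegionType to one textual directive and emits nothing unless the target's MCAsmInfo opts in via doesSupportDataRegionDirectives(). The mapping restated as a tiny standalone function, with a local mirror enum rather than the tree's enumerators:

#include <string>

// Local stand-in for MCDataRegionType, for illustration only.
enum DataRegionKind { Plain, JT8, JT16, JT32, End };

std::string dataRegionDirective(DataRegionKind K) {
  switch (K) {
  case Plain: return "\t.data_region";
  case JT8:   return "\t.data_region jt8";
  case JT16:  return "\t.data_region jt16";
  case JT32:  return "\t.data_region jt32";
  case End:   return "\t.end_data_region";
  }
  return std::string(); // unreachable
}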
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 66ba9b8..05519b5 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/LEB128.h"
using namespace llvm;
@@ -403,7 +404,7 @@ static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout,
// See if we are aligning with nops, and if so do that first to try to fill
// the Count bytes. Then if that did not fill any bytes or there are any
- // bytes left to fill use the the Value and ValueSize to fill the rest.
+ // bytes left to fill use the Value and ValueSize to fill the rest.
// If we are aligning with nops, ask that target to emit the right data.
if (AF.hasEmitNops()) {
if (!Asm.getBackend().writeNopData(Count, OW))
@@ -713,9 +714,9 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
Data.clear();
raw_svector_ostream OSE(Data);
if (LF.isSigned())
- MCObjectWriter::EncodeSLEB128(Value, OSE);
+ encodeSLEB128(Value, OSE);
else
- MCObjectWriter::EncodeULEB128(Value, OSE);
+ encodeULEB128(Value, OSE);
OSE.flush();
return OldSize != LF.getContents().size();
}
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index d3c4fb1..b5b14b9 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -274,11 +274,11 @@ unsigned MCContext::GetDwarfFile(StringRef Directory, StringRef FileName,
if (Directory.empty()) {
// Separate the directory part from the basename of the FileName.
- std::pair<StringRef, StringRef> Slash = FileName.rsplit('/');
- Directory = Slash.second;
- if (!Directory.empty()) {
- Directory = Slash.first;
- FileName = Slash.second;
+ StringRef tFileName = sys::path::filename(FileName);
+ if (!tFileName.empty()) {
+ Directory = sys::path::parent_path(FileName);
+ if (!Directory.empty())
+ FileName = tFileName;
}
}
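GetDwarfFile() now splits with sys::path::filename()/parent_path() rather than a hand-rolled rsplit('/'), keeping separator handling in one place. A standalone equivalent of the split, assuming '/' separators only:

#include <string>
#include <utility>

// Returns (directory, basename); both parts are left as-is when the
// name contains no separator, matching the guarded assignments above.
std::pair<std::string, std::string> splitDwarfFileName(const std::string &FN) {
  std::string::size_type Slash = FN.rfind('/');
  if (Slash == std::string::npos)
    return {std::string(), FN}; // no directory part
  return {FN.substr(0, Slash), FN.substr(Slash + 1)};
}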
diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index 880a31a..322abd5 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h
@@ -99,6 +99,14 @@ public:
DisAsm.reset(disAsm);
IP.reset(iP);
}
+ const std::string &getTripleName() const { return TripleName; }
+ void *getDisInfo() const { return DisInfo; }
+ int getTagType() const { return TagType; }
+ LLVMOpInfoCallback getGetOpInfo() const { return GetOpInfo; }
+ LLVMSymbolLookupCallback getSymbolLookupCallback() const {
+ return SymbolLookUp;
+ }
+ const Target *getTarget() const { return TheTarget; }
const MCDisassembler *getDisAsm() const { return DisAsm.get(); }
const MCAsmInfo *getAsmInfo() const { return MAI.get(); }
MCInstPrinter *getIP() { return IP.get(); }
diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp
index b2672ca..1226f1a 100644
--- a/lib/MC/MCDisassembler/EDDisassembler.cpp
+++ b/lib/MC/MCDisassembler/EDDisassembler.cpp
@@ -44,7 +44,7 @@ struct TripleMap {
const char *String;
};
-static struct TripleMap triplemap[] = {
+static const struct TripleMap triplemap[] = {
{ Triple::x86, "i386-unknown-unknown" },
{ Triple::x86_64, "x86_64-unknown-unknown" },
{ Triple::arm, "arm-unknown-unknown" },
@@ -256,7 +256,7 @@ void EDDisassembler::initMaps(const MCRegisterInfo &registerInfo) {
unsigned registerIndex;
for (registerIndex = 0; registerIndex < numRegisters; ++registerIndex) {
- const char* registerName = registerInfo.get(registerIndex).Name;
+ const char* registerName = registerInfo.getName(registerIndex);
RegVec.push_back(registerName);
RegRMap[registerName] = registerIndex;
diff --git a/lib/MC/MCDisassembler/EDMain.cpp b/lib/MC/MCDisassembler/EDMain.cpp
index c658717..5c065db 100644
--- a/lib/MC/MCDisassembler/EDMain.cpp
+++ b/lib/MC/MCDisassembler/EDMain.cpp
@@ -4,7 +4,7 @@
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file implements the enhanced disassembler's public C API.
@@ -34,9 +34,9 @@ int EDGetDisassembler(EDDisassemblerRef *disassembler,
Syntax = EDDisassembler::kEDAssemblySyntaxARMUAL;
break;
}
-
+
EDDisassemblerRef ret = EDDisassembler::getDisassembler(triple, Syntax);
-
+
if (!ret)
return -1;
*disassembler = ret;
@@ -70,18 +70,18 @@ unsigned int EDCreateInsts(EDInstRef *insts,
uint64_t address,
void *arg) {
unsigned int index;
-
+
for (index = 0; index < count; ++index) {
EDInst *inst = ((EDDisassembler*)disassembler)->createInst(byteReader,
address, arg);
-
+
if (!inst)
return index;
-
+
insts[index] = inst;
address += inst->byteSize();
}
-
+
return count;
}
@@ -165,14 +165,14 @@ int EDTokenIsRegister(EDTokenRef token) {
int EDTokenIsNegativeLiteral(EDTokenRef token) {
if (((EDToken*)token)->type() != EDToken::kTokenLiteral)
return -1;
-
+
return ((EDToken*)token)->literalSign();
}
int EDLiteralTokenAbsoluteValue(uint64_t *value, EDTokenRef token) {
if (((EDToken*)token)->type() != EDToken::kTokenLiteral)
return -1;
-
+
return ((EDToken*)token)->literalAbsoluteValue(*value);
}
@@ -180,7 +180,7 @@ int EDRegisterTokenValue(unsigned *registerID,
EDTokenRef token) {
if (((EDToken*)token)->type() != EDToken::kTokenRegister)
return -1;
-
+
return ((EDToken*)token)->registerID(*registerID);
}
@@ -231,7 +231,7 @@ struct ByteReaderWrapper {
EDByteBlock_t byteBlock;
};
-static int readerWrapperCallback(uint8_t *byte,
+static int readerWrapperCallback(uint8_t *byte,
uint64_t address,
void *arg) {
struct ByteReaderWrapper *wrapper = (struct ByteReaderWrapper *)arg;
@@ -245,13 +245,9 @@ unsigned int EDBlockCreateInsts(EDInstRef *insts,
uint64_t address) {
struct ByteReaderWrapper wrapper;
wrapper.byteBlock = byteBlock;
-
- return EDCreateInsts(insts,
- count,
- disassembler,
- readerWrapperCallback,
- address,
- (void*)&wrapper);
+
+ return EDCreateInsts(insts, count, disassembler, readerWrapperCallback,
+ address, (void*)&wrapper);
}
int EDBlockEvaluateOperand(uint64_t *result, EDOperandRef operand,
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 84a34f1..4c63e43 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -19,6 +19,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/ADT/Hashing.h"
@@ -36,7 +37,7 @@ using namespace llvm;
// First special line opcode - leave room for the standard opcodes.
// Note: If you want to change this, you'll have to update the
-// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit().
+// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit().
#define DWARF2_LINE_OPCODE_BASE 13
// Minimum line offset in a special line info. opcode. This value
@@ -105,7 +106,7 @@ void MCLineEntry::Make(MCStreamer *MCOS, const MCSection *Section) {
//
// This helper routine returns an expression of End - Start + IntVal .
-//
+//
static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS,
const MCSymbol &Start,
const MCSymbol &End,
@@ -198,7 +199,7 @@ static inline void EmitDwarfLineTable(MCStreamer *MCOS,
// Set the value of the symbol, as we are at the end of the section.
MCOS->EmitLabel(SectionEnd);
- // Switch back the the dwarf line section.
+ // Switch back the dwarf line section.
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo();
@@ -310,7 +311,7 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) {
if (MCOS->getContext().getAsmInfo().getLinkerRequiresNonEmptyDwarfLines()
&& MCLineSectionOrder.begin() == MCLineSectionOrder.end()) {
// The darwin9 linker has a bug (see PR8715). For 32-bit architectures
- // it requires:
+ // it requires:
// total_length >= prologue_length + 10
// We are 4 bytes short, since we have total_length = 51 and
// prologue_length = 45
@@ -354,14 +355,14 @@ void MCDwarfLineAddr::Encode(int64_t LineDelta, uint64_t AddrDelta,
AddrDelta = ScaleAddrDelta(AddrDelta);
// A LineDelta of INT64_MAX is a signal that this is actually a
- // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the
+ // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the
// end_sequence to emit the matrix entry.
if (LineDelta == INT64_MAX) {
if (AddrDelta == MAX_SPECIAL_ADDR_DELTA)
OS << char(dwarf::DW_LNS_const_add_pc);
else {
OS << char(dwarf::DW_LNS_advance_pc);
- MCObjectWriter::EncodeULEB128(AddrDelta, OS);
+ encodeULEB128(AddrDelta, OS);
}
OS << char(dwarf::DW_LNS_extended_op);
OS << char(1);
@@ -376,7 +377,7 @@ void MCDwarfLineAddr::Encode(int64_t LineDelta, uint64_t AddrDelta,
// it with DW_LNS_advance_line.
if (Temp >= DWARF2_LINE_RANGE) {
OS << char(dwarf::DW_LNS_advance_line);
- MCObjectWriter::EncodeSLEB128(LineDelta, OS);
+ encodeSLEB128(LineDelta, OS);
LineDelta = 0;
Temp = 0 - DWARF2_LINE_BASE;
@@ -412,7 +413,7 @@ void MCDwarfLineAddr::Encode(int64_t LineDelta, uint64_t AddrDelta,
// Otherwise use DW_LNS_advance_pc.
OS << char(dwarf::DW_LNS_advance_pc);
- MCObjectWriter::EncodeULEB128(AddrDelta, OS);
+ encodeULEB128(AddrDelta, OS);
if (NeedCopy)
OS << char(dwarf::DW_LNS_copy);
@@ -552,7 +553,7 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
const MCSymbol *LineSectionSymbol) {
MCContext &context = MCOS->getContext();
- MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
// Create a symbol at the start and end of this section used in here for the
// expression to calculate the length in the header.
@@ -705,7 +706,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol) {
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection());
MCSymbol *AbbrevSectionSymbol;
- if (AsmInfo.doesDwarfRequireRelocationForSectionOffset()) {
+ if (AsmInfo.doesDwarfUseRelocationsAcrossSections()) {
AbbrevSectionSymbol = context.CreateTempSymbol();
MCOS->EmitLabel(AbbrevSectionSymbol);
} else {
@@ -766,7 +767,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS,
MCOS->EmitLabel(Label);
// Create and entry for the info and add it to the other entries.
- MCGenDwarfLabelEntry *Entry =
+ MCGenDwarfLabelEntry *Entry =
new MCGenDwarfLabelEntry(Name, FileNumber, LineNumber, Label);
MCOS->getContext().addMCGenDwarfLabelEntry(Entry);
}
@@ -1285,7 +1286,7 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
0);
if (verboseAsm) streamer.AddComment("FDE CIE Offset");
streamer.EmitAbsValue(offset, 4);
- } else if (!asmInfo.doesDwarfRequireRelocationForSectionOffset()) {
+ } else if (!asmInfo.doesDwarfUseRelocationsAcrossSections()) {
const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart,
cieStart, 0);
streamer.EmitAbsValue(offset, 4);
@@ -1293,20 +1294,17 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
streamer.EmitSymbolValue(&cieStart, 4);
}
- unsigned fdeEncoding = MOFI->getFDEEncoding(UsingCFI);
- unsigned size = getSizeForEncoding(streamer, fdeEncoding);
-
// PC Begin
- unsigned PCBeginEncoding = IsEH ? fdeEncoding :
- (unsigned)dwarf::DW_EH_PE_absptr;
- unsigned PCBeginSize = getSizeForEncoding(streamer, PCBeginEncoding);
- EmitSymbol(streamer, *frame.Begin, PCBeginEncoding, "FDE initial location");
+ unsigned PCEncoding = IsEH ? MOFI->getFDEEncoding(UsingCFI)
+ : (unsigned)dwarf::DW_EH_PE_absptr;
+ unsigned PCSize = getSizeForEncoding(streamer, PCEncoding);
+ EmitSymbol(streamer, *frame.Begin, PCEncoding, "FDE initial location");
// PC Range
const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin,
*frame.End, 0);
if (verboseAsm) streamer.AddComment("FDE address range");
- streamer.EmitAbsValue(Range, size);
+ streamer.EmitAbsValue(Range, PCSize);
if (IsEH) {
// Augmentation Data Length
@@ -1329,7 +1327,7 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
EmitCFIInstructions(streamer, frame.Instructions, frame.Begin);
// Padding
- streamer.EmitValueToAlignment(PCBeginSize);
+ streamer.EmitValueToAlignment(PCSize);
return fdeEnd;
}
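For context on the line-table hunks: when a (LineDelta, AddrDelta) pair is small enough, DWARF packs both into one special opcode; otherwise the code above falls back to DW_LNS_advance_line/DW_LNS_advance_pc with LEB128 operands. A sketch of the special-opcode computation, where LINE_BASE = -5 and LINE_RANGE = 14 are the usual values in this file and are assumptions here:

#include <cstdint>

const int OPCODE_BASE = 13;  // DWARF2_LINE_OPCODE_BASE above
const int LINE_BASE   = -5;  // assumed DWARF2_LINE_BASE
const int LINE_RANGE  = 14;  // assumed DWARF2_LINE_RANGE

// Returns the special opcode, or -1 when the pair needs the standard
// advance opcodes instead.
int specialOpcode(int64_t LineDelta, uint64_t AddrDelta) {
  if (LineDelta < LINE_BASE || LineDelta >= LINE_BASE + LINE_RANGE)
    return -1; // emit DW_LNS_advance_line first
  uint64_t Op = (uint64_t)(LineDelta - LINE_BASE) +
                (uint64_t)LINE_RANGE * AddrDelta + OPCODE_BASE;
  return Op <= 255 ? (int)Op : -1; // else DW_LNS_advance_pc first
}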
diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index 171ab4d..6eb6914 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp
@@ -15,9 +15,11 @@ using namespace llvm;
MCELFObjectTargetWriter::MCELFObjectTargetWriter(bool Is64Bit_,
uint8_t OSABI_,
uint16_t EMachine_,
- bool HasRelocationAddend_)
+ bool HasRelocationAddend_,
+ bool IsN64_)
: OSABI(OSABI_), EMachine(EMachine_),
- HasRelocationAddend(HasRelocationAddend_), Is64Bit(Is64Bit_) {
+ HasRelocationAddend(HasRelocationAddend_), Is64Bit(Is64Bit_),
+ IsN64(IsN64_) {
}
/// Default e_flags = 0
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 6c4d0e3..2d342dc 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -13,6 +13,8 @@
#include "MCELF.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -89,7 +91,7 @@ public:
unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
- unsigned Size = 0, unsigned ByteAlignment = 0) {
+ uint64_t Size = 0, unsigned ByteAlignment = 0) {
llvm_unreachable("ELF doesn't support this directive");
}
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 7880155..0eb7fcc 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -202,6 +202,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_DARWIN_LO16: return "lo16";
case VK_PPC_GAS_HA16: return "ha";
case VK_PPC_GAS_LO16: return "l";
+ case VK_PPC_TPREL16_HA: return "tprel@ha";
+ case VK_PPC_TPREL16_LO: return "tprel@l";
case VK_Mips_GPREL: return "GPREL";
case VK_Mips_GOT_CALL: return "GOT_CALL";
case VK_Mips_GOT16: return "GOT16";
@@ -220,6 +222,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_Mips_GOT_DISP: return "GOT_DISP";
case VK_Mips_GOT_PAGE: return "GOT_PAGE";
case VK_Mips_GOT_OFST: return "GOT_OFST";
+ case VK_Mips_HIGHER: return "HIGHER";
+ case VK_Mips_HIGHEST: return "HIGHEST";
}
llvm_unreachable("Invalid variant kind");
}
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index bc6cf77..b75fe2c 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -1,4 +1,3 @@
-//===- lib/MC/MCMachOStreamer.cpp - Mach-O Object Output ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -33,6 +32,8 @@ class MCMachOStreamer : public MCObjectStreamer {
private:
virtual void EmitInstToData(const MCInst &Inst);
+ void EmitDataRegion(DataRegionData::KindTy Kind);
+ void EmitDataRegionEnd();
public:
MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_ostream &OS, MCCodeEmitter *Emitter)
@@ -46,6 +47,7 @@ public:
virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol);
virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
+ virtual void EmitDataRegion(MCDataRegionType Kind);
virtual void EmitThumbFunc(MCSymbol *Func);
virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
@@ -72,7 +74,7 @@ public:
llvm_unreachable("macho doesn't support this directive");
}
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
- unsigned Size = 0, unsigned ByteAlignment = 0);
+ uint64_t Size = 0, unsigned ByteAlignment = 0);
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment = 0);
virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
@@ -138,6 +140,26 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask);
}
+void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
+ // Create a temporary label to mark the start of the data region.
+ MCSymbol *Start = getContext().CreateTempSymbol();
+ EmitLabel(Start);
+ // Record the region for the object writer to use.
+ DataRegionData Data = { Kind, Start, NULL };
+ std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
+ Regions.push_back(Data);
+}
+
+void MCMachOStreamer::EmitDataRegionEnd() {
+ std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
+ assert(Regions.size() && "Mismatched .end_data_region!");
+ DataRegionData &Data = Regions.back();
+ assert(Data.End == NULL && "Mismatched .end_data_region!");
+ // Create a temporary label to mark the end of the data region.
+ Data.End = getContext().CreateTempSymbol();
+ EmitLabel(Data.End);
+}
+
void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
@@ -153,6 +175,26 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
}
}
+void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) {
+ switch (Kind) {
+ case MCDR_DataRegion:
+ EmitDataRegion(DataRegionData::Data);
+ return;
+ case MCDR_DataRegionJT8:
+ EmitDataRegion(DataRegionData::JumpTable8);
+ return;
+ case MCDR_DataRegionJT16:
+ EmitDataRegion(DataRegionData::JumpTable16);
+ return;
+ case MCDR_DataRegionJT32:
+ EmitDataRegion(DataRegionData::JumpTable32);
+ return;
+ case MCDR_DataRegionEnd:
+ EmitDataRegionEnd();
+ return;
+ }
+}
+
void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) {
// Remember that the function is a thumb function. Fixup and relocation
// values will need adjusted.
@@ -284,7 +326,7 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
}
void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- unsigned Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment) {
MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section);
// The symbol may not be present, which only creates the section.
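EmitDataRegion()/EmitDataRegionEnd() above keep a stack-like discipline: a begin records an open region with a start label, and an end must close the most recent still-open one exactly once, with the asserts catching a mismatched .end_data_region. The invariant in miniature (labels reduced to ints for illustration):

#include <cassert>
#include <vector>

struct Region { int Kind; int Start; int End; }; // End == -1 while open

void beginRegion(std::vector<Region> &Rs, int Kind, int StartLabel) {
  Rs.push_back({Kind, StartLabel, -1});
}

void endRegion(std::vector<Region> &Rs, int EndLabel) {
  assert(!Rs.empty() && "Mismatched .end_data_region!");
  assert(Rs.back().End == -1 && "Mismatched .end_data_region!");
  Rs.back().End = EndLabel;
}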
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index 7ff2d1b..4c17d91 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -63,7 +63,7 @@ namespace {
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {}
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
- unsigned Size = 0, unsigned ByteAlignment = 0) {}
+ uint64_t Size = 0, unsigned ByteAlignment = 0) {}
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {}
virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {}
@@ -82,7 +82,7 @@ namespace {
virtual bool EmitValueToOffset(const MCExpr *Offset,
unsigned char Value = 0) { return false; }
-
+
virtual void EmitFileDirective(StringRef Filename) {}
virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
StringRef Filename) {
@@ -99,12 +99,12 @@ namespace {
virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
RecordProcEnd(Frame);
}
-
+
/// @}
};
}
-
+
MCStreamer *llvm::createNullStreamer(MCContext &Context) {
return new MCNullStreamer(Context);
}
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index b22ae33..29b4a94 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -169,7 +169,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
Ctx->getMachOSection("__DWARF", "__apple_types",
MCSectionMachO::S_ATTR_DEBUG,
SectionKind::getMetadata());
-
+
DwarfAbbrevSection =
Ctx->getMachOSection("__DWARF", "__debug_abbrev",
MCSectionMachO::S_ATTR_DEBUG,
@@ -507,15 +507,13 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
PDataSection =
Ctx->getCOFFSection(".pdata",
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getDataRel());
XDataSection =
Ctx->getCOFFSection(".xdata",
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getDataRel());
TLSDataSection =
Ctx->getCOFFSection(".tls$",
diff --git a/lib/MC/MCObjectWriter.cpp b/lib/MC/MCObjectWriter.cpp
index 030f247..94d7cd6 100644
--- a/lib/MC/MCObjectWriter.cpp
+++ b/lib/MC/MCObjectWriter.cpp
@@ -17,40 +17,6 @@ using namespace llvm;
MCObjectWriter::~MCObjectWriter() {
}
-/// Utility function to encode a SLEB128 value.
-void MCObjectWriter::EncodeSLEB128(int64_t Value, raw_ostream &OS) {
- bool More;
- do {
- uint8_t Byte = Value & 0x7f;
- // NOTE: this assumes that this signed shift is an arithmetic right shift.
- Value >>= 7;
- More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
- ((Value == -1) && ((Byte & 0x40) != 0))));
- if (More)
- Byte |= 0x80; // Mark this byte that that more bytes will follow.
- OS << char(Byte);
- } while (More);
-}
-
-/// Utility function to encode a ULEB128 value.
-void MCObjectWriter::EncodeULEB128(uint64_t Value, raw_ostream &OS,
- unsigned Padding) {
- do {
- uint8_t Byte = Value & 0x7f;
- Value >>= 7;
- if (Value != 0 || Padding != 0)
- Byte |= 0x80; // Mark this byte that that more bytes will follow.
- OS << char(Byte);
- } while (Value != 0);
-
- // Pad with 0x80 and emit a null byte at the end.
- if (Padding != 0) {
- for (; Padding != 1; --Padding)
- OS << '\x80';
- OS << '\x00';
- }
-}
-
bool
MCObjectWriter::IsSymbolRefDifferenceFullyResolved(const MCAssembler &Asm,
const MCSymbolRefExpr *A,
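The deleted EncodeSLEB128/EncodeULEB128 bodies live on as encodeSLEB128/encodeULEB128 in llvm/Support/LEB128.h, which the MCAssembler.cpp and MCDwarf.cpp hunks above now include. Standalone versions of the same algorithms, kept faithful to the deleted code:

#include <cstdint>
#include <string>

std::string encodeULEB(uint64_t Value) {
  std::string Out;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // mark that more bytes follow
    Out.push_back((char)Byte);
  } while (Value != 0);
  return Out;
}

std::string encodeSLEB(int64_t Value) {
  std::string Out;
  bool More;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7; // arithmetic right shift, as the deleted code assumed
    More = !((Value == 0 && (Byte & 0x40) == 0) ||
             (Value == -1 && (Byte & 0x40) != 0));
    if (More)
      Byte |= 0x80;
    Out.push_back((char)Byte);
  } while (More);
  return Out;
}

For example, encodeULEB(624485) yields the bytes 0xE5 0x8E 0x26.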
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 8aef43c..b67c769 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -45,13 +45,18 @@ FatalAssemblerWarnings("fatal-assembler-warnings",
namespace {
/// \brief Helper class for tracking macro definitions.
+typedef std::vector<AsmToken> MacroArgument;
+typedef std::vector<MacroArgument> MacroArguments;
+typedef StringRef MacroParameter;
+typedef std::vector<MacroParameter> MacroParameters;
+
struct Macro {
StringRef Name;
StringRef Body;
- std::vector<StringRef> Parameters;
+ MacroParameters Parameters;
public:
- Macro(StringRef N, StringRef B, const std::vector<StringRef> &P) :
+ Macro(StringRef N, StringRef B, const MacroParameters &P) :
Name(N), Body(B), Parameters(P) {}
};
@@ -178,9 +183,9 @@ private:
bool ParseCppHashLineFilenameComment(const SMLoc &L);
bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M);
- bool expandMacro(SmallString<256> &Buf, StringRef Body,
- const std::vector<StringRef> &Parameters,
- const std::vector<std::vector<AsmToken> > &A,
+ bool expandMacro(raw_svector_ostream &OS, StringRef Body,
+ const MacroParameters &Parameters,
+ const MacroArguments &A,
const SMLoc &L);
void HandleMacroExit();
@@ -204,11 +209,18 @@ private:
void EatToEndOfStatement();
+ bool ParseMacroArgument(MacroArgument &MA);
+ bool ParseMacroArguments(const Macro *M, MacroArguments &A);
+
/// \brief Parse up to the end of statement and return the contents from the
/// current token until the end of the statement; the current token on exit
/// will be either the EndOfStatement or EOF.
StringRef ParseStringToEndOfStatement();
+ /// \brief Parse until the end of a statement or a comma is encountered,
+ /// return the contents from the current token up to the end or comma.
+ StringRef ParseStringToComma();
+
bool ParseAssignment(StringRef Name, bool allow_redef);
bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
@@ -245,6 +257,10 @@ private:
bool ParseDirectiveIncbin(); // ".incbin"
bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if"
+ // ".ifb" or ".ifnb", depending on ExpectBlank.
+ bool ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+ // ".ifc" or ".ifnc", depending on ExpectEqual.
+ bool ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
// ".ifdef" or ".ifndef", depending on expect_defined
bool ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
@@ -257,6 +273,15 @@ private:
const MCExpr *ApplyModifierToExpr(const MCExpr *E,
MCSymbolRefExpr::VariantKind Variant);
+
+ // Macro-like directives
+ Macro *ParseMacroLikeBody(SMLoc DirectiveLoc);
+ void InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc,
+ raw_svector_ostream &OS);
+ bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
+ bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
+ bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
+ bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
};
/// \brief Generic implementations of directive handling, etc. which is shared
@@ -328,6 +353,7 @@ public:
AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacro>(".macro");
AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endm");
AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endmacro");
+ AddDirectiveHandler<&GenericAsmParser::ParseDirectivePurgeMacro>(".purgem");
AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".sleb128");
AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".uleb128");
@@ -359,6 +385,7 @@ public:
bool ParseDirectiveMacrosOnOff(StringRef, SMLoc DirectiveLoc);
bool ParseDirectiveMacro(StringRef, SMLoc DirectiveLoc);
bool ParseDirectiveEndMacro(StringRef, SMLoc DirectiveLoc);
+ bool ParseDirectivePurgeMacro(StringRef, SMLoc DirectiveLoc);
bool ParseDirectiveLEB128(StringRef, SMLoc);
};
@@ -456,7 +483,7 @@ bool AsmParser::EnterIncludeFile(const std::string &Filename) {
}
/// Process the specified .incbin file by searching for it in the include paths
-/// then just emiting the byte contents of the file to the streamer. This
+/// then just emitting the byte contents of the file to the streamer. This
/// returns true on failure.
bool AsmParser::ProcessIncbinFile(const std::string &Filename) {
std::string IncludedFile;
@@ -602,6 +629,18 @@ StringRef AsmParser::ParseStringToEndOfStatement() {
return StringRef(Start, End - Start);
}
+StringRef AsmParser::ParseStringToComma() {
+ const char *Start = getTok().getLoc().getPointer();
+
+ while (Lexer.isNot(AsmToken::EndOfStatement) &&
+ Lexer.isNot(AsmToken::Comma) &&
+ Lexer.isNot(AsmToken::Eof))
+ Lex();
+
+ const char *End = getTok().getLoc().getPointer();
+ return StringRef(Start, End - Start);
+}
+
/// ParseParenExpr - Parse a paren expression and return it.
/// NOTE: This assumes the leading '(' has already been consumed.
///
@@ -700,7 +739,7 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
IDVal == "f" ? 1 : 0);
Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
getContext());
- if(IDVal == "b" && Sym->isUndefined())
+ if (IDVal == "b" && Sym->isUndefined())
return Error(Loc, "invalid reference to undefined symbol");
EndLoc = Lexer.getLoc();
Lex(); // Eat identifier.
@@ -1042,6 +1081,14 @@ bool AsmParser::ParseStatement() {
// example.
if (IDVal == ".if")
return ParseDirectiveIf(IDLoc);
+ if (IDVal == ".ifb")
+ return ParseDirectiveIfb(IDLoc, true);
+ if (IDVal == ".ifnb")
+ return ParseDirectiveIfb(IDLoc, false);
+ if (IDVal == ".ifc")
+ return ParseDirectiveIfc(IDLoc, true);
+ if (IDVal == ".ifnc")
+ return ParseDirectiveIfc(IDLoc, false);
if (IDVal == ".ifdef")
return ParseDirectiveIfdef(IDLoc, true);
if (IDVal == ".ifndef" || IDVal == ".ifnotdef")
@@ -1123,6 +1170,11 @@ bool AsmParser::ParseStatement() {
// Otherwise, we have a normal instruction or directive.
if (IDVal[0] == '.' && IDVal != ".") {
+
+ // Target hook for parsing target specific directives.
+ if (!getTargetParser().ParseDirective(ID))
+ return false;
+
// Assembler features
if (IDVal == ".set" || IDVal == ".equ")
return ParseDirectiveSet(IDVal, true);
@@ -1192,6 +1244,10 @@ bool AsmParser::ParseStatement() {
// Symbol attribute directives
+ if (IDVal == ".extern") {
+ EatToEndOfStatement(); // .extern is the default, ignore it.
+ return false;
+ }
if (IDVal == ".globl" || IDVal == ".global")
return ParseDirectiveSymbolAttribute(MCSA_Global);
if (IDVal == ".indirect_symbol")
@@ -1225,22 +1281,27 @@ bool AsmParser::ParseStatement() {
if (IDVal == ".incbin")
return ParseDirectiveIncbin();
- if (IDVal == ".code16")
+ if (IDVal == ".code16" || IDVal == ".code16gcc")
return TokError(Twine(IDVal) + " not supported yet");
+ // Macro-like directives
+ if (IDVal == ".rept")
+ return ParseDirectiveRept(IDLoc);
+ if (IDVal == ".irp")
+ return ParseDirectiveIrp(IDLoc);
+ if (IDVal == ".irpc")
+ return ParseDirectiveIrpc(IDLoc);
+ if (IDVal == ".endr")
+ return ParseDirectiveEndr(IDLoc);
+
// Look up the handler in the handler table.
std::pair<MCAsmParserExtension*, DirectiveHandler> Handler =
DirectiveMap.lookup(IDVal);
if (Handler.first)
return (*Handler.second)(Handler.first, IDVal, IDLoc);
- // Target hook for parsing target specific directives.
- if (!getTargetParser().ParseDirective(ID))
- return false;
- bool retval = Warning(IDLoc, "ignoring directive for now");
- EatToEndOfStatement();
- return retval;
+ return Error(IDLoc, "unknown directive");
}
CheckForValidSection();
@@ -1339,7 +1400,7 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
return false;
}
-/// DiagHandler - will use the the last parsed cpp hash line filename comment
+/// DiagHandler - will use the last parsed cpp hash line filename comment
/// for the Filename and LineNo if any in the diagnostic.
void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
const AsmParser *Parser = static_cast<const AsmParser*>(Context);
@@ -1393,11 +1454,10 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
NewDiag.print(0, OS);
}
-bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body,
- const std::vector<StringRef> &Parameters,
- const std::vector<std::vector<AsmToken> > &A,
+bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
+ const MacroParameters &Parameters,
+ const MacroArguments &A,
const SMLoc &L) {
- raw_svector_ostream OS(Buf);
unsigned NParameters = Parameters.size();
if (NParameters != 0 && NParameters != A.size())
return Error(L, "Wrong number of arguments");
@@ -1449,7 +1509,7 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body,
break;
// Otherwise substitute with the token values, with spaces eliminated.
- for (std::vector<AsmToken>::const_iterator it = A[Index].begin(),
+ for (MacroArgument::const_iterator it = A[Index].begin(),
ie = A[Index].end(); it != ie; ++it)
OS << it->getString();
break;
@@ -1472,7 +1532,7 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body,
if (Index == NParameters)
return Error(L, "Parameter not found");
- for (std::vector<AsmToken>::const_iterator it = A[Index].begin(),
+ for (MacroArgument::const_iterator it = A[Index].begin(),
ie = A[Index].end(); it != ie; ++it)
OS << it->getString();
@@ -1482,9 +1542,6 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body,
Body = Body.substr(Pos);
}
- // We include the .endmacro in the buffer as our queue to exit the macro
- // instantiation.
- OS << ".endmacro\n";
return false;
}
@@ -1494,55 +1551,97 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
{
}
-bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc,
- const Macro *M) {
- // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
- // this, although we should protect against infinite loops.
- if (ActiveMacros.size() == 20)
- return TokError("macros cannot be nested more than 20 levels deep");
-
- // Parse the macro instantiation arguments.
- std::vector<std::vector<AsmToken> > MacroArguments;
- MacroArguments.push_back(std::vector<AsmToken>());
+/// ParseMacroArgument - Extract AsmTokens for a macro argument.
+/// This is used for both default macro parameter values and the
+/// arguments in macro invocations.
+bool AsmParser::ParseMacroArgument(MacroArgument &MA) {
unsigned ParenLevel = 0;
+
for (;;) {
- if (Lexer.is(AsmToken::Eof))
+ SMLoc LastTokenLoc;
+
+ if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal))
return TokError("unexpected token in macro instantiation");
+
+ // HandleMacroEntry relies on not advancing the lexer here
+ // to be able to fill in the remaining default parameter values
if (Lexer.is(AsmToken::EndOfStatement))
break;
+ if (ParenLevel == 0 && Lexer.is(AsmToken::Comma))
+ break;
- // If we aren't inside parentheses and this is a comma, start a new token
- // list.
- if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) {
- MacroArguments.push_back(std::vector<AsmToken>());
- } else {
- // Adjust the current parentheses level.
- if (Lexer.is(AsmToken::LParen))
- ++ParenLevel;
- else if (Lexer.is(AsmToken::RParen) && ParenLevel)
- --ParenLevel;
-
- // Append the token to the current argument list.
- MacroArguments.back().push_back(getTok());
- }
+ // Adjust the current parentheses level.
+ if (Lexer.is(AsmToken::LParen))
+ ++ParenLevel;
+ else if (Lexer.is(AsmToken::RParen) && ParenLevel)
+ --ParenLevel;
+
+ // Append the token to the current argument list.
+ MA.push_back(getTok());
Lex();
}
- // If the last argument didn't end up with any tokens, it's not a real
- // argument and we should remove it from the list. This happens with either
- // a tailing comma or an empty argument list.
- if (MacroArguments.back().empty())
- MacroArguments.pop_back();
+ if (ParenLevel != 0)
+    return TokError("unbalanced parentheses in macro argument");
+ return false;
+}
+
+// Parse the macro instantiation arguments.
+bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) {
+ const unsigned NParameters = M ? M->Parameters.size() : 0;
+
+ // Parse two kinds of macro invocations:
+ // - macros defined without any parameters accept an arbitrary number of them
+ // - macros defined with parameters accept at most that many of them
+ for (unsigned Parameter = 0; !NParameters || Parameter < NParameters;
+ ++Parameter) {
+ MacroArgument MA;
+
+ if (ParseMacroArgument(MA))
+ return true;
+
+ A.push_back(MA);
+
+ if (Lexer.is(AsmToken::EndOfStatement))
+ return false;
+
+ if (Lexer.is(AsmToken::Comma))
+ Lex();
+ }
+ return TokError("Too many arguments");
+}
+
+bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc,
+ const Macro *M) {
+ // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
+ // this, although we should protect against infinite loops.
+ if (ActiveMacros.size() == 20)
+ return TokError("macros cannot be nested more than 20 levels deep");
+
+ MacroArguments A;
+ if (ParseMacroArguments(M, A))
+ return true;
+
+  // Remove any trailing empty arguments. Do this after the fact, as empty
+  // arguments in the middle of the list must be kept so the remaining
+  // arguments keep their positions; e.g., "foo 1, , 2" vs. "foo 1, 2,".
+ while (!A.empty() && A.back().empty())
+ A.pop_back();
// Macro instantiation is lexical, unfortunately. We construct a new buffer
// to hold the macro body with substitutions.
SmallString<256> Buf;
StringRef Body = M->Body;
+ raw_svector_ostream OS(Buf);
- if (expandMacro(Buf, Body, M->Parameters, MacroArguments, getTok().getLoc()))
+ if (expandMacro(OS, Body, M->Parameters, A, getTok().getLoc()))
return true;
+  // We include the .endmacro in the buffer as our cue to exit the macro
+  // instantiation.
+ OS << ".endmacro\n";
+
MemoryBuffer *Instantiation =
- MemoryBuffer::getMemBufferCopy(Buf.str(), "<instantiation>");
+ MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
// Create the macro instantiation object and add to the current macro
// instantiation stack.
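
To make the positional rule behind HandleMacroEntry's trim loop concrete, a small illustration with hypothetical invocations:

    // "foo 1, , 2" parses to {{"1"}, {}, {"2"}}: the empty middle argument
    // must survive so "2" stays bound to the third parameter.
    // "foo 1, 2,"  parses to {{"1"}, {"2"}, {}}: the trailing empty argument
    // carries no information, so the loop drops it:
    while (!A.empty() && A.back().empty())
      A.pop_back();
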
@@ -2295,10 +2394,9 @@ bool AsmParser::ParseDirectiveIncbin() {
bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
TheCondStack.push_back(TheCondState);
TheCondState.TheCond = AsmCond::IfCond;
- if(TheCondState.Ignore) {
+ if (TheCondState.Ignore) {
EatToEndOfStatement();
- }
- else {
+ } else {
int64_t ExprValue;
if (ParseAbsoluteExpression(ExprValue))
return true;
@@ -2315,6 +2413,61 @@ bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
return false;
}
+/// ParseDirectiveIfb
+/// ::= .ifb string
+bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+ TheCondStack.push_back(TheCondState);
+ TheCondState.TheCond = AsmCond::IfCond;
+
+ if (TheCondState.Ignore) {
+ EatToEndOfStatement();
+ } else {
+ StringRef Str = ParseStringToEndOfStatement();
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.ifb' directive");
+
+ Lex();
+
+ TheCondState.CondMet = ExpectBlank == Str.empty();
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// ParseDirectiveIfc
+/// ::= .ifc string1, string2
+bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
+ TheCondStack.push_back(TheCondState);
+ TheCondState.TheCond = AsmCond::IfCond;
+
+ if (TheCondState.Ignore) {
+ EatToEndOfStatement();
+ } else {
+ StringRef Str1 = ParseStringToComma();
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in '.ifc' directive");
+
+ Lex();
+
+ StringRef Str2 = ParseStringToEndOfStatement();
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.ifc' directive");
+
+ Lex();
+
+ TheCondState.CondMet = ExpectEqual == (Str1 == Str2);
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
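
A hypothetical fragment exercising the new conditionals, written as the kind of string one would feed the parser; the directive names come from the handlers above, the values are illustrative:

    const char *CondAsm =
        ".ifb\n"            // nothing follows, so the operand is blank
        "  .byte 0\n"
        ".endif\n"
        ".ifc a, a\n"       // string compare; first operand ends at the comma
        "  .byte 1\n"
        ".endif\n"
        ".ifnc a, b\n"      // .ifnb/.ifnc are the negated forms
        "  .byte 2\n"
        ".endif\n";
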
+/// ParseDirectiveIfdef
+/// ::= .ifdef symbol
bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
StringRef Name;
TheCondStack.push_back(TheCondState);
@@ -2853,7 +3006,7 @@ bool GenericAsmParser::ParseDirectiveCFISameValue(StringRef IDVal,
/// ParseDirectiveCFIRestore
/// ::= .cfi_restore register
bool GenericAsmParser::ParseDirectiveCFIRestore(StringRef IDVal,
- SMLoc DirectiveLoc) {
+ SMLoc DirectiveLoc) {
int64_t Register = 0;
if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
@@ -2866,7 +3019,7 @@ bool GenericAsmParser::ParseDirectiveCFIRestore(StringRef IDVal,
/// ParseDirectiveCFIEscape
/// ::= .cfi_escape expression[,...]
bool GenericAsmParser::ParseDirectiveCFIEscape(StringRef IDVal,
- SMLoc DirectiveLoc) {
+ SMLoc DirectiveLoc) {
std::string Values;
int64_t CurrValue;
if (getParser().ParseAbsoluteExpression(CurrValue))
@@ -2922,7 +3075,7 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
if (getParser().ParseIdentifier(Name))
return TokError("expected identifier in directive");
- std::vector<StringRef> Parameters;
+ MacroParameters Parameters;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for(;;) {
StringRef Parameter;
@@ -2981,7 +3134,7 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
/// ::= .endm
/// ::= .endmacro
bool GenericAsmParser::ParseDirectiveEndMacro(StringRef Directive,
- SMLoc DirectiveLoc) {
+ SMLoc DirectiveLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '" + Directive + "' directive");
@@ -2998,6 +3151,27 @@ bool GenericAsmParser::ParseDirectiveEndMacro(StringRef Directive,
"no current macro definition");
}
+/// ParseDirectivePurgeMacro
+/// ::= .purgem identifier
+bool GenericAsmParser::ParseDirectivePurgeMacro(StringRef Directive,
+ SMLoc DirectiveLoc) {
+ StringRef Name;
+ if (getParser().ParseIdentifier(Name))
+ return TokError("expected identifier in '.purgem' directive");
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.purgem' directive");
+
+ StringMap<Macro*>::iterator I = getParser().MacroMap.find(Name);
+ if (I == getParser().MacroMap.end())
+ return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
+
+ // Undefine the macro.
+ delete I->getValue();
+ getParser().MacroMap.erase(I);
+ return false;
+}
+
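
Effect of the new directive, as a hypothetical fragment:

    const char *PurgeAsm =
        ".macro pad\n"
        "  .byte 0\n"
        ".endm\n"
        "pad\n"             // expands normally
        ".purgem pad\n"     // 'pad' is deleted from MacroMap
        ".macro pad\n"      // so redefining it is legal again
        "  .byte 0xff\n"
        ".endm\n";
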
bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) {
getParser().CheckForValidSection();
@@ -3017,6 +3191,217 @@ bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) {
return false;
}
+Macro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
+ AsmToken EndToken, StartToken = getTok();
+
+ unsigned NestLevel = 0;
+ for (;;) {
+ // Check whether we have reached the end of the file.
+ if (getLexer().is(AsmToken::Eof)) {
+ Error(DirectiveLoc, "no matching '.endr' in definition");
+ return 0;
+ }
+
+ if (Lexer.is(AsmToken::Identifier) &&
+ (getTok().getIdentifier() == ".rept")) {
+ ++NestLevel;
+ }
+
+ // Otherwise, check whether we have reached the .endr.
+ if (Lexer.is(AsmToken::Identifier) &&
+ getTok().getIdentifier() == ".endr") {
+ if (NestLevel == 0) {
+ EndToken = getTok();
+ Lex();
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ TokError("unexpected token in '.endr' directive");
+ return 0;
+ }
+ break;
+ }
+ --NestLevel;
+ }
+
+ // Otherwise, scan till the end of the statement.
+ EatToEndOfStatement();
+ }
+
+ const char *BodyStart = StartToken.getLoc().getPointer();
+ const char *BodyEnd = EndToken.getLoc().getPointer();
+ StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
+
+ // We Are Anonymous.
+ StringRef Name;
+ MacroParameters Parameters;
+ return new Macro(Name, Body, Parameters);
+}
+
+void AsmParser::InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc,
+ raw_svector_ostream &OS) {
+ OS << ".endr\n";
+
+ MemoryBuffer *Instantiation =
+ MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+
+ // Create the macro instantiation object and add to the current macro
+ // instantiation stack.
+ MacroInstantiation *MI = new MacroInstantiation(M, DirectiveLoc,
+ getTok().getLoc(),
+ Instantiation);
+ ActiveMacros.push_back(MI);
+
+ // Jump to the macro instantiation and prime the lexer.
+ CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+ Lex();
+}
+
+bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
+ int64_t Count;
+ if (ParseAbsoluteExpression(Count))
+ return TokError("unexpected token in '.rept' directive");
+
+ if (Count < 0)
+ return TokError("Count is negative");
+
+ if (Lexer.isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.rept' directive");
+
+ // Eat the end of statement.
+ Lex();
+
+ // Lex the rept definition.
+ Macro *M = ParseMacroLikeBody(DirectiveLoc);
+ if (!M)
+ return true;
+
+ // Macro instantiation is lexical, unfortunately. We construct a new buffer
+ // to hold the macro body with substitutions.
+ SmallString<256> Buf;
+ MacroParameters Parameters;
+ MacroArguments A;
+ raw_svector_ostream OS(Buf);
+ while (Count--) {
+ if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc()))
+ return true;
+ }
+ InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+ return false;
+}
+
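
The expansion is purely lexical: the body is concatenated Count times into a fresh buffer, InstantiateMacroLikeBody appends ".endr\n" as the exit marker, and the lexer is pointed at the result. A minimal model of that step, a sketch rather than the parser's real interface:

    #include <string>

    static std::string expandRept(const std::string &Body, int64_t Count) {
      std::string Buf;
      while (Count--)
        Buf += Body;       // one verbatim copy per iteration
      Buf += ".endr\n";    // terminator later consumed by ParseDirectiveEndr
      return Buf;
    }
    // expandRept(".byte 1\n", 3) == ".byte 1\n.byte 1\n.byte 1\n.endr\n"
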
+/// ParseDirectiveIrp
+/// ::= .irp symbol,values
+bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
+ MacroParameters Parameters;
+ MacroParameter Parameter;
+
+ if (ParseIdentifier(Parameter))
+ return TokError("expected identifier in '.irp' directive");
+
+ Parameters.push_back(Parameter);
+
+ if (Lexer.isNot(AsmToken::Comma))
+ return TokError("expected comma in '.irp' directive");
+
+ Lex();
+
+ MacroArguments A;
+ if (ParseMacroArguments(0, A))
+ return true;
+
+ // Eat the end of statement.
+ Lex();
+
+ // Lex the irp definition.
+ Macro *M = ParseMacroLikeBody(DirectiveLoc);
+ if (!M)
+ return true;
+
+ // Macro instantiation is lexical, unfortunately. We construct a new buffer
+ // to hold the macro body with substitutions.
+ SmallString<256> Buf;
+ raw_svector_ostream OS(Buf);
+
+ for (std::vector<MacroArgument>::iterator i = A.begin(), e = A.end(); i != e;
+ ++i) {
+ std::vector<MacroArgument> Args;
+ Args.push_back(*i);
+
+ if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc()))
+ return true;
+ }
+
+ InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+ return false;
+}
+
+/// ParseDirectiveIrpc
+/// ::= .irpc symbol,values
+bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
+ MacroParameters Parameters;
+ MacroParameter Parameter;
+
+ if (ParseIdentifier(Parameter))
+ return TokError("expected identifier in '.irpc' directive");
+
+ Parameters.push_back(Parameter);
+
+ if (Lexer.isNot(AsmToken::Comma))
+ return TokError("expected comma in '.irpc' directive");
+
+ Lex();
+
+ MacroArguments A;
+ if (ParseMacroArguments(0, A))
+ return true;
+
+ if (A.size() != 1 || A.front().size() != 1)
+ return TokError("unexpected token in '.irpc' directive");
+
+ // Eat the end of statement.
+ Lex();
+
+ // Lex the irpc definition.
+ Macro *M = ParseMacroLikeBody(DirectiveLoc);
+ if (!M)
+ return true;
+
+ // Macro instantiation is lexical, unfortunately. We construct a new buffer
+ // to hold the macro body with substitutions.
+ SmallString<256> Buf;
+ raw_svector_ostream OS(Buf);
+
+ StringRef Values = A.front().front().getString();
+ std::size_t I, End = Values.size();
+ for (I = 0; I < End; ++I) {
+ MacroArgument Arg;
+ Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1)));
+
+ MacroArguments Args;
+ Args.push_back(Arg);
+
+ if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc()))
+ return true;
+ }
+
+ InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+ return false;
+}
+
+bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
+ if (ActiveMacros.empty())
+ return TokError("unexpected '.endr' directive, no current .rept");
+
+  // The only .endr directives that should get here are the ones created by
+  // InstantiateMacroLikeBody.
+ assert(getLexer().is(AsmToken::EndOfStatement));
+
+ HandleMacroExit();
+ return false;
+}
/// \brief Create an MCAsmParser instance.
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM,
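
Taken together, the macro-like directives expand as follows; a hypothetical fragment with the expansions noted in comments:

    const char *ReptAsm =
        ".rept 2\n"
        "  .byte 0\n"
        ".endr\n"            // => .byte 0; .byte 0
        ".irp val, 1, 2\n"
        "  .long \\val\n"
        ".endr\n"            // => .long 1; .long 2 (one copy per argument)
        ".irpc i, 123\n"
        "  .long \\i\n"
        ".endr\n";           // => .long 1; .long 2; .long 3 (one per character)
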
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 6f45068..18033d0 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -14,6 +14,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -49,6 +50,9 @@ public:
AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".dump");
AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".load");
AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSection>(".section");
+ AddDirectiveHandler<&DarwinAsmParser::ParseDirectivePushSection>(".pushsection");
+ AddDirectiveHandler<&DarwinAsmParser::ParseDirectivePopSection>(".popsection");
+ AddDirectiveHandler<&DarwinAsmParser::ParseDirectivePrevious>(".previous");
AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogUnique>(
".secure_log_unique");
AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogReset>(
@@ -56,6 +60,9 @@ public:
AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveTBSS>(".tbss");
AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveZerofill>(".zerofill");
+ AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegion>(".data_region");
+ AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegionEnd>(".end_data_region");
+
// Special section directives.
AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const");
AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(".const_data");
@@ -108,11 +115,16 @@ public:
bool ParseDirectiveDumpOrLoad(StringRef, SMLoc);
bool ParseDirectiveLsym(StringRef, SMLoc);
bool ParseDirectiveSection(StringRef, SMLoc);
+ bool ParseDirectivePushSection(StringRef, SMLoc);
+ bool ParseDirectivePopSection(StringRef, SMLoc);
+ bool ParseDirectivePrevious(StringRef, SMLoc);
bool ParseDirectiveSecureLogReset(StringRef, SMLoc);
bool ParseDirectiveSecureLogUnique(StringRef, SMLoc);
bool ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc);
bool ParseDirectiveTBSS(StringRef, SMLoc);
bool ParseDirectiveZerofill(StringRef, SMLoc);
+ bool ParseDirectiveDataRegion(StringRef, SMLoc);
+ bool ParseDirectiveDataRegionEnd(StringRef, SMLoc);
// Named Section Directive
bool ParseSectionDirectiveConst(StringRef, SMLoc) {
@@ -291,7 +303,7 @@ public:
};
-}
+} // end anonymous namespace
bool DarwinAsmParser::ParseSectionSwitch(const char *Segment,
const char *Section,
@@ -451,6 +463,37 @@ bool DarwinAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
return false;
}
+/// ParseDirectivePushSection:
+/// ::= .pushsection identifier (',' identifier)*
+bool DarwinAsmParser::ParseDirectivePushSection(StringRef S, SMLoc Loc) {
+ getStreamer().PushSection();
+
+ if (ParseDirectiveSection(S, Loc)) {
+ getStreamer().PopSection();
+ return true;
+ }
+
+ return false;
+}
+
+/// ParseDirectivePopSection:
+/// ::= .popsection
+bool DarwinAsmParser::ParseDirectivePopSection(StringRef, SMLoc) {
+ if (!getStreamer().PopSection())
+ return TokError(".popsection without corresponding .pushsection");
+ return false;
+}
+
+/// ParseDirectivePrevious:
+/// ::= .previous
+bool DarwinAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
+ const MCSection *PreviousSection = getStreamer().getPreviousSection();
+ if (PreviousSection == NULL)
+ return TokError(".previous without corresponding .section");
+ getStreamer().SwitchSection(PreviousSection);
+ return false;
+}
+
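
The three handlers drive the streamer's section stack; a hypothetical Darwin-flavored fragment:

    const char *SectAsm =
        ".section __TEXT,__text\n"
        ".pushsection __DATA,__data\n"  // remember __TEXT,__text, switch away
        "  .long 1\n"
        ".popsection\n"                 // back to __TEXT,__text
        ".section __DATA,__const\n"
        ".previous\n";                  // back to __TEXT,__text again
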
/// ParseDirectiveSecureLogUnique
/// ::= .secure_log_unique ... message ...
bool DarwinAsmParser::ParseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) {
@@ -659,10 +702,46 @@ bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) {
return false;
}
+/// ParseDirectiveDataRegion
+/// ::= .data_region [ ( jt8 | jt16 | jt32 ) ]
+bool DarwinAsmParser::ParseDirectiveDataRegion(StringRef, SMLoc) {
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ Lex();
+ getStreamer().EmitDataRegion(MCDR_DataRegion);
+ return false;
+ }
+ StringRef RegionType;
+ SMLoc Loc = getParser().getTok().getLoc();
+ if (getParser().ParseIdentifier(RegionType))
+ return TokError("expected region type after '.data_region' directive");
+ int Kind = StringSwitch<int>(RegionType)
+ .Case("jt8", MCDR_DataRegionJT8)
+ .Case("jt16", MCDR_DataRegionJT16)
+ .Case("jt32", MCDR_DataRegionJT32)
+ .Default(-1);
+ if (Kind == -1)
+ return Error(Loc, "unknown region type in '.data_region' directive");
+ Lex();
+
+ getStreamer().EmitDataRegion((MCDataRegionType)Kind);
+ return false;
+}
+
+/// ParseDirectiveDataRegionEnd
+/// ::= .end_data_region
+bool DarwinAsmParser::ParseDirectiveDataRegionEnd(StringRef, SMLoc) {
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.end_data_region' directive");
+
+ Lex();
+ getStreamer().EmitDataRegion(MCDR_DataRegionEnd);
+ return false;
+}
+
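
Usage sketch, labels hypothetical; the jt* kinds let the Mach-O writer (see the MachObjectWriter.cpp hunks below) record the range as data-in-code:

    const char *DataRegionAsm =
        ".data_region jt32\n"        // begin a 32-bit jump-table region
        "  .long Ltarget0 - Lbase\n"
        "  .long Ltarget1 - Lbase\n"
        ".end_data_region\n";        // bare .data_region would mark plain data
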
namespace llvm {
MCAsmParserExtension *createDarwinAsmParser() {
return new DarwinAsmParser;
}
-}
+} // end llvm namespace
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index ffc400b..9316bb1 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -64,6 +64,7 @@ public:
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver");
+ AddDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local");
@@ -141,6 +142,7 @@ public:
bool ParseDirectiveType(StringRef, SMLoc);
bool ParseDirectiveIdent(StringRef, SMLoc);
bool ParseDirectiveSymver(StringRef, SMLoc);
+ bool ParseDirectiveVersion(StringRef, SMLoc);
bool ParseDirectiveWeakref(StringRef, SMLoc);
bool ParseDirectiveSymbolAttribute(StringRef, SMLoc);
@@ -548,6 +550,32 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) {
return false;
}
+/// ParseDirectiveVersion
+/// ::= .version string
+bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) {
+ if (getLexer().isNot(AsmToken::String))
+ return TokError("unexpected token in '.version' directive");
+
+ StringRef Data = getTok().getIdentifier();
+
+ Lex();
+
+ const MCSection *Note =
+ getContext().getELFSection(".note", ELF::SHT_NOTE, 0,
+ SectionKind::getReadOnly());
+
+ getStreamer().PushSection();
+ getStreamer().SwitchSection(Note);
+ getStreamer().EmitIntValue(Data.size()+1, 4); // namesz.
+ getStreamer().EmitIntValue(0, 4); // descsz = 0 (no description).
+ getStreamer().EmitIntValue(1, 4); // type = NT_VERSION.
+ getStreamer().EmitBytes(Data, 0); // name.
+ getStreamer().EmitIntValue(0, 1); // terminate the string.
+ getStreamer().EmitValueToAlignment(4); // ensure 4 byte alignment.
+ getStreamer().PopSection();
+ return false;
+}
+
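
For `.version "1.0"` this emits a standard ELF note record into .note; a sketch of the layout the EmitIntValue/EmitBytes sequence produces (the struct name is the conventional one, shown here as an assumption):

    struct Nhdr {            // mirrors the five emissions above
      uint32_t n_namesz;     // Data.size() + 1, counting the NUL: 4 for "1.0"
      uint32_t n_descsz;     // 0, no descriptor
      uint32_t n_type;       // 1 == NT_VERSION
      // then the name bytes "1.0\0", padded to 4-byte alignment
    };
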
/// ParseDirectiveWeakref
/// ::= .weakref foo, bar
bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) {
diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp
index a770c97..9ccab93 100644
--- a/lib/MC/MCPureStreamer.cpp
+++ b/lib/MC/MCPureStreamer.cpp
@@ -39,7 +39,7 @@ public:
virtual void EmitLabel(MCSymbol *Symbol);
virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
- unsigned Size = 0, unsigned ByteAlignment = 0);
+ uint64_t Size = 0, unsigned ByteAlignment = 0);
virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
unsigned ValueSize = 1,
@@ -144,7 +144,7 @@ void MCPureStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
}
void MCPureStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- unsigned Size, unsigned ByteAlignment) {
+ uint64_t Size, unsigned ByteAlignment) {
report_fatal_error("not yet implemented in pure streamer");
}
diff --git a/lib/MC/MCRegisterInfo.cpp b/lib/MC/MCRegisterInfo.cpp
new file mode 100644
index 0000000..4d1aff3
--- /dev/null
+++ b/lib/MC/MCRegisterInfo.cpp
@@ -0,0 +1,71 @@
+//=== MC/MCRegisterInfo.cpp - Target Register Description -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements MCRegisterInfo functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCRegisterInfo.h"
+
+using namespace llvm;
+
+unsigned MCRegisterInfo::getMatchingSuperReg(unsigned Reg, unsigned SubIdx,
+ const MCRegisterClass *RC) const {
+ for (MCSuperRegIterator Supers(Reg, this); Supers.isValid(); ++Supers)
+ if (RC->contains(*Supers) && Reg == getSubReg(*Supers, SubIdx))
+ return *Supers;
+ return 0;
+}
+
+unsigned MCRegisterInfo::getSubReg(unsigned Reg, unsigned Idx) const {
+ // Get a pointer to the corresponding SubRegIndices list. This list has the
+ // name of each sub-register in the same order as MCSubRegIterator.
+ const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
+ for (MCSubRegIterator Subs(Reg, this); Subs.isValid(); ++Subs, ++SRI)
+ if (*SRI == Idx)
+ return *Subs;
+ return 0;
+}
+
+unsigned MCRegisterInfo::getSubRegIndex(unsigned Reg, unsigned SubReg) const {
+ // Get a pointer to the corresponding SubRegIndices list. This list has the
+ // name of each sub-register in the same order as MCSubRegIterator.
+ const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
+ for (MCSubRegIterator Subs(Reg, this); Subs.isValid(); ++Subs, ++SRI)
+ if (*Subs == SubReg)
+ return *SRI;
+ return 0;
+}
+
+int MCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs;
+ unsigned Size = isEH ? EHL2DwarfRegsSize : L2DwarfRegsSize;
+
+ DwarfLLVMRegPair Key = { RegNum, 0 };
+ const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key);
+ if (I == M+Size || I->FromReg != RegNum)
+ return -1;
+ return I->ToReg;
+}
+
+int MCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+ const DwarfLLVMRegPair *M = isEH ? EHDwarf2LRegs : Dwarf2LRegs;
+ unsigned Size = isEH ? EHDwarf2LRegsSize : Dwarf2LRegsSize;
+
+ DwarfLLVMRegPair Key = { RegNum, 0 };
+ const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key);
+ assert(I != M+Size && I->FromReg == RegNum && "Invalid RegNum");
+ return I->ToReg;
+}
+
+int MCRegisterInfo::getSEHRegNum(unsigned RegNum) const {
+ const DenseMap<unsigned, int>::const_iterator I = L2SEHRegs.find(RegNum);
+ if (I == L2SEHRegs.end()) return (int)RegNum;
+ return I->second;
+}
diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp
index 90091f0..aac9377 100644
--- a/lib/MC/MCSectionCOFF.cpp
+++ b/lib/MC/MCSectionCOFF.cpp
@@ -20,7 +20,7 @@ MCSectionCOFF::~MCSectionCOFF() {} // anchor.
// should be printed before the section name
bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name,
const MCAsmInfo &MAI) const {
-
+
// FIXME: Does .section .bss/.data/.text work everywhere??
if (Name == ".text" || Name == ".data" || Name == ".bss")
return true;
@@ -30,7 +30,7 @@ bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name,
void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
raw_ostream &OS) const {
-
+
// standard sections don't require the '.section'
if (ShouldOmitSectionDirective(SectionName, MAI)) {
OS << '\t' << getSectionName() << '\n';
@@ -47,7 +47,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
if (getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE)
OS << 'n';
OS << "\"\n";
-
+
if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
switch (Selection) {
case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES:
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index dfd77c3..0775cfa 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -22,7 +22,7 @@ MCSectionELF::~MCSectionELF() {} // anchor.
// should be printed before the section name
bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
const MCAsmInfo &MAI) const {
-
+
// FIXME: Does .section .bss/.data/.text work everywhere??
if (Name == ".text" || Name == ".data" ||
(Name == ".bss" && !MAI.usesELFSectionDirectiveForBSS()))
@@ -33,7 +33,7 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
raw_ostream &OS) const {
-
+
if (ShouldOmitSectionDirective(SectionName, MAI)) {
OS << '\t' << getSectionName() << '\n';
return;
@@ -62,7 +62,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
}
// Handle the weird solaris syntax if desired.
- if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
+ if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
!(Flags & ELF::SHF_MERGE)) {
if (Flags & ELF::SHF_ALLOC)
OS << ",#alloc";
@@ -75,7 +75,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << '\n';
return;
}
-
+
OS << ",\"";
if (Flags & ELF::SHF_ALLOC)
OS << 'a';
@@ -91,13 +91,13 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << 'S';
if (Flags & ELF::SHF_TLS)
OS << 'T';
-
+
// If there are target-specific flags, print them.
if (Flags & ELF::XCORE_SHF_CP_SECTION)
OS << 'c';
if (Flags & ELF::XCORE_SHF_DP_SECTION)
OS << 'd';
-
+
OS << '"';
OS << ',';
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 43e62ff..0bac24d 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -15,17 +15,15 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include <cstdlib>
using namespace llvm;
-MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), EmitEHFrame(true),
- EmitDebugFrame(false),
- CurrentW64UnwindInfo(0),
- LastSymbol(0),
- UniqueCodeBeginSuffix(0),
- UniqueDataBeginSuffix(0) {
+MCStreamer::MCStreamer(MCContext &Ctx)
+ : Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false),
+ CurrentW64UnwindInfo(0), LastSymbol(0) {
const MCSection *section = NULL;
SectionStack.push_back(std::make_pair(section, section));
}
@@ -97,7 +95,7 @@ void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned AddrSpace,
unsigned Padding) {
SmallString<128> Tmp;
raw_svector_ostream OSE(Tmp);
- MCObjectWriter::EncodeULEB128(Value, OSE, Padding);
+ encodeULEB128(Value, OSE, Padding);
EmitBytes(OSE.str(), AddrSpace);
}
@@ -106,7 +104,7 @@ void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned AddrSpace,
void MCStreamer::EmitSLEB128IntValue(int64_t Value, unsigned AddrSpace) {
SmallString<128> Tmp;
raw_svector_ostream OSE(Tmp);
- MCObjectWriter::EncodeSLEB128(Value, OSE);
+ encodeSLEB128(Value, OSE);
EmitBytes(OSE.str(), AddrSpace);
}
@@ -183,85 +181,6 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) {
LastSymbol = Symbol;
}
-void MCStreamer::EmitDataRegion() {
- if (RegionIndicator == Data) return;
-
- MCContext &Context = getContext();
- const MCAsmInfo &MAI = Context.getAsmInfo();
- if (!MAI.getSupportsDataRegions()) return;
-
- // Generate a unique symbol name.
- MCSymbol *NewSym = Context.GetOrCreateSymbol(MAI.getDataBeginLabelName() +
- Twine(UniqueDataBeginSuffix++));
- EmitLabel(NewSym);
-
- RegionIndicator = Data;
-}
-
-void MCStreamer::EmitCodeRegion() {
- if (RegionIndicator == Code) return;
-
- MCContext &Context = getContext();
- const MCAsmInfo &MAI = Context.getAsmInfo();
- if (!MAI.getSupportsDataRegions()) return;
-
- // Generate a unique symbol name.
- MCSymbol *NewSym = Context.GetOrCreateSymbol(MAI.getCodeBeginLabelName() +
- Twine(UniqueCodeBeginSuffix++));
- EmitLabel(NewSym);
-
- RegionIndicator = Code;
-}
-
-void MCStreamer::EmitJumpTable8Region() {
- if (RegionIndicator == JumpTable8) return;
-
- MCContext &Context = getContext();
- const MCAsmInfo &MAI = Context.getAsmInfo();
- if (!MAI.getSupportsDataRegions()) return;
-
- // Generate a unique symbol name.
- MCSymbol *NewSym =
- Context.GetOrCreateSymbol(MAI.getJumpTable8BeginLabelName() +
- Twine(UniqueDataBeginSuffix++));
- EmitLabel(NewSym);
-
- RegionIndicator = JumpTable8;
-}
-
-void MCStreamer::EmitJumpTable16Region() {
- if (RegionIndicator == JumpTable16) return;
-
- MCContext &Context = getContext();
- const MCAsmInfo &MAI = Context.getAsmInfo();
- if (!MAI.getSupportsDataRegions()) return;
-
- // Generate a unique symbol name.
- MCSymbol *NewSym =
- Context.GetOrCreateSymbol(MAI.getJumpTable16BeginLabelName() +
- Twine(UniqueDataBeginSuffix++));
- EmitLabel(NewSym);
-
- RegionIndicator = JumpTable16;
-}
-
-
-void MCStreamer::EmitJumpTable32Region() {
- if (RegionIndicator == JumpTable32) return;
-
- MCContext &Context = getContext();
- const MCAsmInfo &MAI = Context.getAsmInfo();
- if (!MAI.getSupportsDataRegions()) return;
-
- // Generate a unique symbol name.
- MCSymbol *NewSym =
- Context.GetOrCreateSymbol(MAI.getJumpTable32BeginLabelName() +
- Twine(UniqueDataBeginSuffix++));
- EmitLabel(NewSym);
-
- RegionIndicator = JumpTable32;
-}
-
void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) {
EnsureValidFrame();
MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
@@ -283,7 +202,6 @@ void MCStreamer::EmitCFIStartProc() {
EmitCFIStartProcImpl(Frame);
FrameInfos.push_back(Frame);
- RegionIndicator = Code;
}
void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index 86dc108..05c83f7 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -17,11 +17,13 @@
using namespace llvm;
+MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors.
+
void
MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
const SubtargetFeatureKV *PF,
const SubtargetFeatureKV *PD,
- const SubtargetInfoKV *PI,
+ const SubtargetInfoKV *ProcSched,
const InstrStage *IS,
const unsigned *OC,
const unsigned *FP,
@@ -29,10 +31,10 @@ MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
TargetTriple = TT;
ProcFeatures = PF;
ProcDesc = PD;
- ProcItins = PI;
+ ProcSchedModel = ProcSched;
Stages = IS;
OperandCycles = OC;
- ForwardingPathes = FP;
+ ForwardingPaths = FP;
NumFeatures = NF;
NumProcs = NP;
@@ -68,14 +70,14 @@ uint64_t MCSubtargetInfo::ToggleFeature(StringRef FS) {
}
-InstrItineraryData
-MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
- assert(ProcItins && "Instruction itineraries information not available!");
+MCSchedModel *
+MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
+ assert(ProcSchedModel && "Processor machine model not available!");
#ifndef NDEBUG
for (size_t i = 1; i < NumProcs; i++) {
- assert(strcmp(ProcItins[i - 1].Key, ProcItins[i].Key) < 0 &&
- "Itineraries table is not sorted");
+ assert(strcmp(ProcSchedModel[i - 1].Key, ProcSchedModel[i].Key) < 0 &&
+ "Processor machine model table is not sorted");
}
#endif
@@ -83,14 +85,19 @@ MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
SubtargetInfoKV KV;
KV.Key = CPU.data();
const SubtargetInfoKV *Found =
- std::lower_bound(ProcItins, ProcItins+NumProcs, KV);
- if (Found == ProcItins+NumProcs || StringRef(Found->Key) != CPU) {
+ std::lower_bound(ProcSchedModel, ProcSchedModel+NumProcs, KV);
+ if (Found == ProcSchedModel+NumProcs || StringRef(Found->Key) != CPU) {
errs() << "'" << CPU
<< "' is not a recognized processor for this target"
<< " (ignoring processor)\n";
- return InstrItineraryData();
+ return &MCSchedModel::DefaultSchedModel;
}
+ assert(Found->Value && "Missing processor SchedModel value");
+ return (MCSchedModel *)Found->Value;
+}
- return InstrItineraryData(Stages, OperandCycles, ForwardingPathes,
- (InstrItinerary *)Found->Value);
+InstrItineraryData
+MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
+ MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
+ return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths);
}
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index e013e77..f7f9184 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -30,7 +30,7 @@ static bool isAcceptableChar(char C) {
/// syntactically correct.
static bool NameNeedsQuoting(StringRef Str) {
assert(!Str.empty() && "Cannot create an empty MCSymbol");
-
+
// If any of the characters in the string is an unacceptable character, force
// quotes.
for (unsigned i = 0, e = Str.size(); i != e; ++i)
@@ -72,7 +72,7 @@ void MCSymbol::print(raw_ostream &OS) const {
OS << getName();
return;
}
-
+
OS << '"' << getName() << '"';
}
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index 79e66fc..c05b4b1 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -228,8 +228,7 @@ static const MCSection *getWin64EHTableSection(StringRef suffix,
return context.getCOFFSection((".xdata"+suffix).str(),
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getDataRel());
}
@@ -239,8 +238,7 @@ static const MCSection *getWin64EHFuncTableSection(StringRef suffix,
return context.getObjectFileInfo()->getPDataSection();
return context.getCOFFSection((".pdata"+suffix).str(),
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getDataRel());
}
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 8e4066c..5820a22 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -21,6 +21,7 @@
#include "llvm/MC/MCMachOSymbolFlags.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include <vector>
@@ -351,6 +352,21 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
Write32(Address);
}
+void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type,
+ uint32_t DataOffset,
+ uint32_t DataSize) {
+ uint64_t Start = OS.tell();
+ (void) Start;
+
+ Write32(Type);
+ Write32(macho::LinkeditLoadCommandSize);
+ Write32(DataOffset);
+ Write32(DataSize);
+
+ assert(OS.tell() - Start == macho::LinkeditLoadCommandSize);
+}
+
+
void MachObjectWriter::RecordRelocation(const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
@@ -654,6 +670,13 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
macho::DysymtabLoadCommandSize);
}
+ // Add the data-in-code load command size, if used.
+ unsigned NumDataRegions = Asm.getDataRegions().size();
+ if (NumDataRegions) {
+ ++NumLoadCommands;
+ LoadCommandsSize += macho::LinkeditLoadCommandSize;
+ }
+
// Compute the total size of the section data, as well as its file size and vm
// size.
uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size :
@@ -701,6 +724,15 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
RelocTableEnd += NumRelocs * macho::RelocationInfoSize;
}
+ // Write the data-in-code load command, if used.
+ uint64_t DataInCodeTableEnd = RelocTableEnd + NumDataRegions * 8;
+ if (NumDataRegions) {
+ uint64_t DataRegionsOffset = RelocTableEnd;
+ uint64_t DataRegionsSize = NumDataRegions * 8;
+ WriteLinkeditLoadCommand(macho::LCT_DataInCode, DataRegionsOffset,
+ DataRegionsSize);
+ }
+
// Write the symbol table load command, if used.
if (NumSymbols) {
unsigned FirstLocalSymbol = 0;
@@ -717,10 +749,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
// If used, the indirect symbols are written after the section data.
if (NumIndirectSymbols)
- IndirectSymbolOffset = RelocTableEnd;
+ IndirectSymbolOffset = DataInCodeTableEnd;
// The symbol table is written after the indirect symbol data.
- uint64_t SymbolTableOffset = RelocTableEnd + IndirectSymbolSize;
+ uint64_t SymbolTableOffset = DataInCodeTableEnd + IndirectSymbolSize;
// The string table is written after symbol table.
uint64_t StringTableOffset =
@@ -760,6 +792,23 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
}
}
+ // Write out the data-in-code region payload, if there is one.
+ for (MCAssembler::const_data_region_iterator
+ it = Asm.data_region_begin(), ie = Asm.data_region_end();
+ it != ie; ++it) {
+ const DataRegionData *Data = &(*it);
+ uint64_t Start = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start), Layout);
+ uint64_t End = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End), Layout);
+ DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind
+ << " start: " << Start << "(" << Data->Start->getName() << ")"
+ << " end: " << End << "(" << Data->End->getName() << ")"
+ << " size: " << End - Start
+ << "\n");
+ Write32(Start);
+ Write16(End - Start);
+ Write16(Data->Kind);
+ }
+
// Write the symbol table data, if used.
if (NumSymbols) {
// Write the indirect symbol entries.
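
Each region becomes one 8-byte record (hence NumDataRegions * 8 above); the Write32/Write16/Write16 sequence in the payload loop matches Mach-O's data_in_code_entry layout:

    #include <cstdint>

    struct DataInCodeEntry { // cf. struct data_in_code_entry, <mach-o/loader.h>
      uint32_t Offset;       // from the mach_header to the start of the range
      uint16_t Length;       // size of the range in bytes
      uint16_t Kind;         // region kind, e.g. the jt8/jt16/jt32 forms
    };
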
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index be41579..0a44e77 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -92,7 +92,7 @@ static void Split(std::vector<std::string> &V, const StringRef S) {
static std::string Join(const std::vector<std::string> &V) {
// Start with empty string.
std::string Result;
- // If the vector is not empty
+ // If the vector is not empty
if (!V.empty()) {
// Start with the first feature
Result = V[0];
@@ -104,7 +104,7 @@ static std::string Join(const std::vector<std::string> &V) {
Result += V[i];
}
}
- // Return the features string
+ // Return the features string
return Result;
}
@@ -205,7 +205,7 @@ void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
/// ClearImpliedBits - For each feature that (transitively) implies this
/// feature, clear it.
-///
+///
static
void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
const SubtargetFeatureKV *FeatureTable,
@@ -252,7 +252,7 @@ SubtargetFeatures::ToggleFeature(uint64_t Bits, const StringRef Feature,
return Bits;
}
-
+
/// getFeatureBits - Get feature bits a CPU.
///
@@ -279,7 +279,7 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU,
// Check if help is needed
if (CPU == "help")
Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
-
+
// Find CPU entry if CPU name is specified.
if (!CPU.empty()) {
const SubtargetFeatureKV *CPUEntry = Find(CPU, CPUTable, CPUTableSize);
@@ -304,11 +304,11 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU,
// Iterate through each feature
for (size_t i = 0, E = Features.size(); i < E; i++) {
const StringRef Feature = Features[i];
-
+
// Check for help
if (Feature == "+help")
Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
-
+
// Find feature in table.
const SubtargetFeatureKV *FeatureEntry =
Find(StripFlag(Feature), FeatureTable, FeatureTableSize);
@@ -349,7 +349,7 @@ void *SubtargetFeatures::getItinerary(const StringRef CPU,
// Find entry
const SubtargetInfoKV *Entry = Find(CPU, Table, TableSize);
-
+
if (Entry) {
return Entry->Value;
} else {
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 67dc649..b026277 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -67,7 +67,7 @@ public:
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- unsigned Size,unsigned ByteAlignment);
+ uint64_t Size,unsigned ByteAlignment);
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment);
virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
@@ -324,7 +324,7 @@ void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
}
void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- unsigned Size,unsigned ByteAlignment) {
+ uint64_t Size,unsigned ByteAlignment) {
llvm_unreachable("not implemented");
}
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index c5f15ba..2a5951a 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -28,7 +28,7 @@ struct ArchiveMemberHeader {
char UID[6];
char GID[6];
char AccessMode[8];
- char Size[10]; //< Size of data, not including header or padding.
+ char Size[10]; ///< Size of data, not including header or padding.
char Terminator[2];
///! Get the name without looking up long names.
@@ -60,11 +60,11 @@ static const ArchiveMemberHeader *ToHeader(const char *base) {
static bool isInternalMember(const ArchiveMemberHeader &amh) {
- const char *internals[] = {
+ static const char *const internals[] = {
"/",
"//",
"#_LLVM_SYM_TAB_#"
- };
+ };
StringRef name = amh.getName();
for (std::size_t i = 0; i < sizeof(internals) / sizeof(*internals); ++i) {
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index bd27a56..8ab54c6 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -622,6 +622,28 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol,
return object_error::success;
}
+ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
+ const coff_symbol *symbol) const {
+ const uint8_t *aux = NULL;
+
+  if (symbol->NumberOfAuxSymbols > 0) {
+ // AUX data comes immediately after the symbol in COFF
+ aux = reinterpret_cast<const uint8_t *>(symbol + 1);
+# ifndef NDEBUG
+ // Verify that the aux symbol points to a valid entry in the symbol table.
+ uintptr_t offset = uintptr_t(aux) - uintptr_t(base());
+ if (offset < Header->PointerToSymbolTable
+ || offset >= Header->PointerToSymbolTable
+ + (Header->NumberOfSymbols * sizeof(coff_symbol)))
+ report_fatal_error("Aux Symbol data was outside of symbol table.");
+
+ assert((offset - Header->PointerToSymbolTable) % sizeof(coff_symbol)
+ == 0 && "Aux Symbol data did not point to the beginning of a symbol");
+# endif
+ }
+ return ArrayRef<uint8_t>(aux, symbol->NumberOfAuxSymbols * sizeof(coff_symbol));
+}
+
error_code COFFObjectFile::getSectionName(const coff_section *Sec,
StringRef &Res) const {
StringRef Name;
@@ -694,6 +716,20 @@ error_code COFFObjectFile::getRelocationType(DataRefImpl Rel,
return object_error::success;
}
+const coff_section *COFFObjectFile::getCOFFSection(section_iterator &It) const {
+ return toSec(It->getRawDataRefImpl());
+}
+
+const coff_symbol *COFFObjectFile::getCOFFSymbol(symbol_iterator &It) const {
+ return toSymb(It->getRawDataRefImpl());
+}
+
+const coff_relocation *COFFObjectFile::getCOFFRelocation(
+ relocation_iterator &It) const {
+ return toRel(It->getRawDataRefImpl());
+}
+
+
#define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \
case COFF::enum: res = #enum; break;
diff --git a/lib/Object/MachOObject.cpp b/lib/Object/MachOObject.cpp
index b7e5cdc..00dea3f 100644
--- a/lib/Object/MachOObject.cpp
+++ b/lib/Object/MachOObject.cpp
@@ -357,6 +357,19 @@ void MachOObject::ReadSymbol64TableEntry(uint64_t SymbolTableOffset,
ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
}
+template<>
+void SwapStruct(macho::DataInCodeTableEntry &Value) {
+ SwapValue(Value.Offset);
+ SwapValue(Value.Length);
+ SwapValue(Value.Kind);
+}
+void MachOObject::ReadDataInCodeTableEntry(uint64_t TableOffset,
+ unsigned Index,
+ InMemoryStruct<macho::DataInCodeTableEntry> &Res) const {
+ uint64_t Offset = (TableOffset +
+ Index * sizeof(macho::DataInCodeTableEntry));
+ ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
void MachOObject::ReadULEB128s(uint64_t Index,
SmallVectorImpl<uint64_t> &Out) const {
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 3bcda17..d229671 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -598,13 +598,15 @@ error_code MachOObjectFile::isSectionZeroInit(DataRefImpl DRI,
if (MachOObj->is64Bit()) {
InMemoryStruct<macho::Section64> Sect;
getSection64(DRI, Sect);
- Result = (Sect->Flags & MachO::SectionTypeZeroFill ||
- Sect->Flags & MachO::SectionTypeZeroFillLarge);
+ unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType;
+ Result = (SectionType == MachO::SectionTypeZeroFill ||
+ SectionType == MachO::SectionTypeZeroFillLarge);
} else {
InMemoryStruct<macho::Section> Sect;
getSection(DRI, Sect);
- Result = (Sect->Flags & MachO::SectionTypeZeroFill ||
- Sect->Flags & MachO::SectionTypeZeroFillLarge);
+ unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType;
+ Result = (SectionType == MachO::SectionTypeZeroFill ||
+ SectionType == MachO::SectionTypeZeroFillLarge);
}
return object_error::success;
@@ -786,7 +788,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
switch (Arch) {
case Triple::x86: {
- const char* Table[] = {
+ static const char *const Table[] = {
"GENERIC_RELOC_VANILLA",
"GENERIC_RELOC_PAIR",
"GENERIC_RELOC_SECTDIFF",
@@ -801,7 +803,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
break;
}
case Triple::x86_64: {
- const char* Table[] = {
+ static const char *const Table[] = {
"X86_64_RELOC_UNSIGNED",
"X86_64_RELOC_SIGNED",
"X86_64_RELOC_BRANCH",
@@ -820,7 +822,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
break;
}
case Triple::arm: {
- const char* Table[] = {
+ static const char *const Table[] = {
"ARM_RELOC_VANILLA",
"ARM_RELOC_PAIR",
"ARM_RELOC_SECTDIFF",
@@ -839,7 +841,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
break;
}
case Triple::ppc: {
- const char* Table[] = {
+ static const char *const Table[] = {
"PPC_RELOC_VANILLA",
"PPC_RELOC_PAIR",
"PPC_RELOC_BR14",
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 409d4fb..2139df5 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -1765,6 +1765,32 @@ APFloat::fusedMultiplyAdd(const APFloat &multiplicand,
return fs;
}
+/* Rounding-mode correct round to integral value. */
+APFloat::opStatus APFloat::roundToIntegral(roundingMode rounding_mode) {
+ opStatus fs;
+ assertArithmeticOK(*semantics);
+
+ // The algorithm here is quite simple: we add 2^(p-1), where p is the
+ // precision of our format, and then subtract it back off again. The choice
+ // of rounding modes for the addition/subtraction determines the rounding mode
+ // for our integral rounding as well.
+ APInt IntegerConstant(NextPowerOf2(semanticsPrecision(*semantics)),
+                        (uint64_t)1 << (semanticsPrecision(*semantics)-1));
+ APFloat MagicConstant(*semantics);
+ fs = MagicConstant.convertFromAPInt(IntegerConstant, false,
+ rmNearestTiesToEven);
+ if (fs != opOK)
+ return fs;
+
+ fs = add(MagicConstant, rounding_mode);
+ if (fs != opOK && fs != opInexact)
+ return fs;
+
+ fs = subtract(MagicConstant, rounding_mode);
+ return fs;
+}
+
+
/* Comparison requires normalized numbers. */
APFloat::cmpResult
APFloat::compare(const APFloat &rhs) const
@@ -3278,16 +3304,8 @@ APFloat::APFloat(double d) : exponent2(0), sign2(0) {
}
namespace {
- static void append(SmallVectorImpl<char> &Buffer,
- unsigned N, const char *Str) {
- unsigned Start = Buffer.size();
- Buffer.set_size(Start + N);
- memcpy(&Buffer[Start], Str, N);
- }
-
- template <unsigned N>
- void append(SmallVectorImpl<char> &Buffer, const char (&Str)[N]) {
- append(Buffer, N, Str);
+ void append(SmallVectorImpl<char> &Buffer, StringRef Str) {
+ Buffer.append(Str.begin(), Str.end());
}
/// Removes data from the given significand until it is no more
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 9b81fe7..38cfaed 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -1135,7 +1135,7 @@ APInt APInt::lshr(unsigned shiftAmt) const {
// If all the bits were shifted out, the result is 0. This avoids issues
// with shifting by the size of the integer type, which produces undefined
// results. We define these "undefined results" to always be 0.
- if (shiftAmt == BitWidth)
+ if (shiftAmt >= BitWidth)
return APInt(BitWidth, 0);
// If none of the bits are shifted out, the result is *this. This avoids
@@ -1446,7 +1446,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const {
APInt signedMin = APInt::getSignedMinValue(d.getBitWidth());
APInt signedMax = APInt::getSignedMaxValue(d.getBitWidth());
- nc = allOnes - (-d).urem(d);
+ nc = allOnes - (allOnes - d).urem(d);
p = d.getBitWidth() - 1; // initialize p
q1 = signedMin.udiv(nc); // initialize q1 = 2p/nc
r1 = signedMin - q1*nc; // initialize r1 = rem(2p,nc)
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 9103327..83baf60 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMSupport
Dwarf.cpp
ErrorHandling.cpp
FileUtilities.cpp
+ FileOutputBuffer.cpp
FoldingSet.cpp
FormattedStream.cpp
GraphWriter.cpp
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index e6fdf16..593315d1 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -219,10 +219,10 @@ static Option *LookupNearestOption(StringRef Arg,
if (!Best || Distance < BestDistance) {
Best = O;
BestDistance = Distance;
- if (RHS.empty() || !PermitValue)
- NearestString = OptionNames[i];
- else
- NearestString = std::string(OptionNames[i]) + "=" + RHS.str();
+ if (RHS.empty() || !PermitValue)
+ NearestString = OptionNames[i];
+ else
+ NearestString = std::string(OptionNames[i]) + "=" + RHS.str();
}
}
}
diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp
index 5206cf1..720ef36 100644
--- a/lib/Support/ConstantRange.cpp
+++ b/lib/Support/ConstantRange.cpp
@@ -143,16 +143,17 @@ bool ConstantRange::isSignWrappedSet() const {
/// getSetSize - Return the number of elements in this set.
///
APInt ConstantRange::getSetSize() const {
- if (isEmptySet())
- return APInt(getBitWidth(), 0);
- if (getBitWidth() == 1) {
- if (Lower != Upper) // One of T or F in the set...
- return APInt(2, 1);
- return APInt(2, 2); // Must be full set...
+ if (isEmptySet())
+ return APInt(getBitWidth()+1, 0);
+
+ if (isFullSet()) {
+ APInt Size(getBitWidth()+1, 0);
+ Size.setBit(getBitWidth());
+ return Size;
}
- // Simply subtract the bounds...
- return Upper - Lower;
+ // This is also correct for wrapped sets.
+ return (Upper - Lower).zext(getBitWidth()+1);
}
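
The widened result exists because a full N-bit range contains 2^N elements, one more than an N-bit APInt can hold. A sketch of the corner cases, assuming the header path of this import (llvm/Support/ConstantRange.h):

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/ConstantRange.h"
    using namespace llvm;

    void setSizeExamples() {
      ConstantRange Empty(8, /*isFullSet=*/false);
      ConstantRange Full(8, /*isFullSet=*/true);
      ConstantRange Wrapped(APInt(8, 250), APInt(8, 5)); // {250..255, 0..4}

      Empty.getSetSize();   // 9-bit 0
      Full.getSetSize();    // 9-bit 256 == 2^8, unrepresentable in 8 bits
      Wrapped.getSetSize(); // 9-bit 11, since (5 - 250) mod 2^8 == 11
    }
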
/// getUnsignedMax - Return the largest unsigned value contained in the
@@ -248,6 +249,12 @@ ConstantRange ConstantRange::subtract(const APInt &Val) const {
return ConstantRange(Lower - Val, Upper - Val);
}
+/// \brief Subtract the specified range from this range (aka relative complement
+/// of the sets).
+ConstantRange ConstantRange::difference(const ConstantRange &CR) const {
+ return intersectWith(CR.inverse());
+}
+
/// intersectWith - Return the range that results from the intersection of this
/// range with another range. The resultant range is guaranteed to include all
/// elements contained in both input ranges, and to have the smallest possible
@@ -288,7 +295,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
if (CR.Upper.ult(Upper))
return CR;
- if (CR.Upper.ult(Lower))
+ if (CR.Upper.ule(Lower))
return ConstantRange(CR.Lower, Upper);
if (getSetSize().ult(CR.getSetSize()))
@@ -316,7 +323,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
return CR;
}
- if (CR.Upper.ult(Lower)) {
+ if (CR.Upper.ule(Lower)) {
if (CR.Lower.ult(Lower))
return *this;
@@ -420,9 +427,13 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const {
unsigned SrcTySize = getBitWidth();
assert(SrcTySize < DstTySize && "Not a value extension");
- if (isFullSet() || isWrappedSet())
+ if (isFullSet() || isWrappedSet()) {
// Change into [0, 1 << src bit width)
- return ConstantRange(APInt(DstTySize,0), APInt(DstTySize,1).shl(SrcTySize));
+ APInt LowerExt(DstTySize, 0);
+ if (!Upper) // special case: [X, 0) -- not really wrapping around
+ LowerExt = Lower.zext(DstTySize);
+ return ConstantRange(LowerExt, APInt(DstTySize, 1).shl(SrcTySize));
+ }
return ConstantRange(Lower.zext(DstTySize), Upper.zext(DstTySize));
}
@@ -450,10 +461,53 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
/// truncated to the specified type.
ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
assert(getBitWidth() > DstTySize && "Not a value truncation");
- if (isFullSet() || getSetSize().getActiveBits() > DstTySize)
+ if (isEmptySet())
+ return ConstantRange(DstTySize, /*isFullSet=*/false);
+ if (isFullSet())
return ConstantRange(DstTySize, /*isFullSet=*/true);
- return ConstantRange(Lower.trunc(DstTySize), Upper.trunc(DstTySize));
+ APInt MaxValue = APInt::getMaxValue(DstTySize).zext(getBitWidth());
+ APInt MaxBitValue(getBitWidth(), 0);
+ MaxBitValue.setBit(DstTySize);
+
+ APInt LowerDiv(Lower), UpperDiv(Upper);
+ ConstantRange Union(DstTySize, /*isFullSet=*/false);
+
+ // Analyze wrapped sets in their two parts: [0, Upper) \/ [Lower, MaxValue]
+ // We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and
+ // then we do the union with [MaxValue, Upper)
+ if (isWrappedSet()) {
+ // If Upper is at least MaxValue, it covers the whole truncated range.
+ if (Upper.uge(MaxValue))
+ return ConstantRange(DstTySize, /*isFullSet=*/true);
+
+ Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize));
+ UpperDiv = APInt::getMaxValue(getBitWidth());
+
+ // Union covers the MaxValue case, so return if the remaining range is just
+ // MaxValue.
+ if (LowerDiv == UpperDiv)
+ return Union;
+ }
+
+ // Chop off the most significant bits that are past the destination bitwidth.
+ if (LowerDiv.uge(MaxValue)) {
+ APInt Div(getBitWidth(), 0);
+ APInt::udivrem(LowerDiv, MaxBitValue, Div, LowerDiv);
+ UpperDiv = UpperDiv - MaxBitValue * Div;
+ }
+
+ if (UpperDiv.ule(MaxValue))
+ return ConstantRange(LowerDiv.trunc(DstTySize),
+ UpperDiv.trunc(DstTySize)).unionWith(Union);
+
+ // The truncated value wraps around. Check if we can do better than the full set.
+ APInt UpperModulo = UpperDiv - MaxBitValue;
+ if (UpperModulo.ult(LowerDiv))
+ return ConstantRange(LowerDiv.trunc(DstTySize),
+ UpperModulo.trunc(DstTySize)).unionWith(Union);
+
+ return ConstantRange(DstTySize, /*isFullSet=*/true);
}
/// zextOrTrunc - make this range have the bit width given by \p DstTySize. The
@@ -529,8 +583,6 @@ ConstantRange::multiply(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
- if (isFullSet() || Other.isFullSet())
- return ConstantRange(getBitWidth(), /*isFullSet=*/true);
APInt this_min = getUnsignedMin().zext(getBitWidth() * 2);
APInt this_max = getUnsignedMax().zext(getBitWidth() * 2);
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index e2af0bc..e175056 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -223,7 +223,7 @@ void CrashRecoveryContext::Disable() {
#include <signal.h>
-static int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP };
+static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP };
static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]);
static struct sigaction PrevActions[NumSignals];
diff --git a/lib/Support/Debug.cpp b/lib/Support/Debug.cpp
index 9fdb12e..c8e8900 100644
--- a/lib/Support/Debug.cpp
+++ b/lib/Support/Debug.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements a handle way of adding debugging information to your
+// This file implements a handy way of adding debugging information to your
// code, without it being enabled all of the time, and without having to add
// command line options to enable it.
//
@@ -18,8 +18,8 @@
// can specify '-debug-only=foo' to enable JUST the debug information for the
// foo class.
//
-// When compiling in release mode, the -debug-* options and all code in DEBUG()
-// statements disappears, so it does not effect the runtime of the code.
+// When compiling without assertions, the -debug-* options and all code in
+// DEBUG() statements disappears, so it does not affect the runtime of the code.
//
//===----------------------------------------------------------------------===//
@@ -89,11 +89,11 @@ bool llvm::isCurrentDebugType(const char *DebugType) {
return CurrentDebugType.empty() || DebugType == CurrentDebugType;
}
-/// SetCurrentDebugType - Set the current debug type, as if the -debug-only=X
+/// setCurrentDebugType - Set the current debug type, as if the -debug-only=X
/// option were specified. Note that DebugFlag also needs to be set to true for
/// debug output to be produced.
///
-void llvm::SetCurrentDebugType(const char *Type) {
+void llvm::setCurrentDebugType(const char *Type) {
CurrentDebugType = Type;
}
diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp
index 18c6581..dd218f6 100644
--- a/lib/Support/Errno.cpp
+++ b/lib/Support/Errno.cpp
@@ -52,7 +52,7 @@ std::string StrError(int errnum) {
# endif
#elif HAVE_DECL_STRERROR_S // "Windows Secure API"
if (errnum)
- strerror_s(buffer, errnum);
+ strerror_s(buffer, MaxErrStrLen - 1, errnum);
#elif defined(HAVE_STRERROR)
// Copy the thread un-safe result of strerror into
// the buffer as fast as possible to minimize impact
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
new file mode 100644
index 0000000..7dc9587
--- /dev/null
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -0,0 +1,148 @@
+//===- FileOutputBuffer.cpp - File Output Buffer ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility for creating an in-memory buffer that will be written to a file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/FileOutputBuffer.h"
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+
+
+namespace llvm {
+
+
+FileOutputBuffer::FileOutputBuffer(uint8_t *Start, uint8_t *End,
+ StringRef Path, StringRef TmpPath)
+ : BufferStart(Start), BufferEnd(End) {
+ FinalPath.assign(Path);
+ TempPath.assign(TmpPath);
+}
+
+
+FileOutputBuffer::~FileOutputBuffer() {
+ // If not already committed, delete the buffer and remove the temp file.
+ if ( BufferStart != NULL ) {
+ sys::fs::unmap_file_pages((void*)BufferStart, getBufferSize());
+ bool Existed;
+ sys::fs::remove(Twine(TempPath), Existed);
+ }
+}
+
+
+error_code FileOutputBuffer::create(StringRef FilePath,
+ size_t Size,
+ OwningPtr<FileOutputBuffer> &Result,
+ unsigned Flags) {
+ // If file already exists, it must be a regular file (to be mappable).
+ sys::fs::file_status Stat;
+ error_code EC = sys::fs::status(FilePath, Stat);
+ switch (Stat.type()) {
+ case sys::fs::file_type::file_not_found:
+ // If file does not exist, we'll create one.
+ break;
+ case sys::fs::file_type::regular_file: {
+ // If file is not currently writable, error out.
+ // FIXME: There is no sys::fs:: api for checking this.
+ // FIXME: In posix, you use the access() call to check this.
+ }
+ break;
+ default:
+ if (EC)
+ return EC;
+ else
+ return make_error_code(errc::operation_not_permitted);
+ }
+
+ // Delete target file.
+ bool Existed;
+ EC = sys::fs::remove(FilePath, Existed);
+ if (EC)
+ return EC;
+
+ // Create new file in same directory but with random name.
+ SmallString<128> TempFilePath;
+ int FD;
+ EC = sys::fs::unique_file(Twine(FilePath) + ".tmp%%%%%%%",
+ FD, TempFilePath, false, 0644);
+ if (EC)
+ return EC;
+
+ // The unique_file() interface leaks lower layers and returns a file
+ // descriptor. There is no way to directly close it, so use this hack
+ // to hand it off to raw_fd_ostream to close for us.
+ {
+ raw_fd_ostream Dummy(FD, /*shouldClose=*/true);
+ }
+
+ // Resize file to requested initial size
+ EC = sys::fs::resize_file(Twine(TempFilePath), Size);
+ if (EC)
+ return EC;
+
+ // If requested, make the output file executable.
+ if ( Flags & F_executable ) {
+ sys::fs::file_status Stat2;
+ EC = sys::fs::status(Twine(TempFilePath), Stat2);
+ if (EC)
+ return EC;
+
+ sys::fs::perms new_perms = Stat2.permissions();
+ if ( new_perms & sys::fs::owner_read )
+ new_perms |= sys::fs::owner_exe;
+ if ( new_perms & sys::fs::group_read )
+ new_perms |= sys::fs::group_exe;
+ if ( new_perms & sys::fs::others_read )
+ new_perms |= sys::fs::others_exe;
+ new_perms |= sys::fs::add_perms;
+ EC = sys::fs::permissions(Twine(TempFilePath), new_perms);
+ if (EC)
+ return EC;
+ }
+
+ // Memory map new file.
+ void *Base;
+ EC = sys::fs::map_file_pages(Twine(TempFilePath), 0, Size, true, Base);
+ if (EC)
+ return EC;
+
+ // Create FileOutputBuffer object to own mapped range.
+ uint8_t *Start = reinterpret_cast<uint8_t*>(Base);
+ Result.reset(new FileOutputBuffer(Start, Start+Size, FilePath, TempFilePath));
+
+ return error_code::success();
+}
+
+
+error_code FileOutputBuffer::commit(int64_t NewSmallerSize) {
+ // Unmap buffer, letting OS flush dirty pages to file on disk.
+ void *Start = reinterpret_cast<void*>(BufferStart);
+ error_code EC = sys::fs::unmap_file_pages(Start, getBufferSize());
+ if (EC)
+ return EC;
+
+ // If requested, resize file as part of commit.
+ if ( NewSmallerSize != -1 ) {
+ EC = sys::fs::resize_file(Twine(TempPath), NewSmallerSize);
+ if (EC)
+ return EC;
+ }
+
+ // Rename file to final name.
+ return sys::fs::rename(Twine(TempPath), Twine(FinalPath));
+}
+
+
+} // namespace
+
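
For context, the intended usage pattern of the new class (a sketch; the accessor names follow the header as of this import, so check include/llvm/Support/FileOutputBuffer.h for the exact interface):

    #include <cstring>
    #include "llvm/ADT/OwningPtr.h"
    #include "llvm/Support/FileOutputBuffer.h"
    #include "llvm/Support/system_error.h"
    using namespace llvm;

    error_code writeWholeFile(StringRef Path, StringRef Data) {
      OwningPtr<FileOutputBuffer> Buf;
      // Creates <Path>.tmpXXXXXXX and mmaps it at the requested size.
      if (error_code EC = FileOutputBuffer::create(Path, Data.size(), Buf, 0))
        return EC;
      memcpy(Buf->getBufferStart(), Data.data(), Data.size());
      // Flushes dirty pages and renames the temp file over Path.
      return Buf->commit();
    }

If commit() is never reached, the destructor above unmaps the buffer and deletes the temp file, so a failed producer never leaves a truncated output behind.
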
diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp
index 32126ec..f6aaf83 100644
--- a/lib/Support/GraphWriter.cpp
+++ b/lib/Support/GraphWriter.cpp
@@ -99,7 +99,6 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
case GraphProgram::NEATO: args.push_back("-f"); args.push_back("neato");break;
case GraphProgram::TWOPI: args.push_back("-f"); args.push_back("twopi");break;
case GraphProgram::CIRCO: args.push_back("-f"); args.push_back("circo");break;
- default: errs() << "Unknown graph layout name; using default.\n";
}
args.push_back(0);
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 0f06964..9a2c39d 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -11,7 +11,13 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/DataStream.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Config/config.h"
#include <string.h>
@@ -25,6 +31,12 @@
#ifdef _MSC_VER
#include <intrin.h>
#endif
+#if defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__))
+#include <mach/mach.h>
+#include <mach/mach_host.h>
+#include <mach/host_info.h>
+#include <mach/machine.h>
+#endif
//===----------------------------------------------------------------------===//
//
@@ -230,11 +242,18 @@ std::string sys::getHostCPUName() {
case 45:
return "corei7-avx";
- case 28: // Intel Atom processor. All processors are manufactured using
- // the 45 nm process
+ // Ivy Bridge:
+ case 58:
+ return "core-avx-i";
+
+ case 28: // Most 45 nm Intel Atom processors
+ case 38: // 45 nm Atom Lincroft
+ case 39: // 32 nm Atom Medfield
+ case 53: // 32 nm Atom Midview
+ case 54: // 32 nm Atom Midview
return "atom";
- default: return "i686";
+ default: return (Em64T) ? "x86-64" : "i686";
}
case 15: {
switch (Model) {
@@ -315,6 +334,179 @@ std::string sys::getHostCPUName() {
}
return "generic";
}
+#elif defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__))
+std::string sys::getHostCPUName() {
+ host_basic_info_data_t hostInfo;
+ mach_msg_type_number_t infoCount;
+
+ infoCount = HOST_BASIC_INFO_COUNT;
+ host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo,
+ &infoCount);
+
+ if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic";
+
+ switch(hostInfo.cpu_subtype) {
+ case CPU_SUBTYPE_POWERPC_601: return "601";
+ case CPU_SUBTYPE_POWERPC_602: return "602";
+ case CPU_SUBTYPE_POWERPC_603: return "603";
+ case CPU_SUBTYPE_POWERPC_603e: return "603e";
+ case CPU_SUBTYPE_POWERPC_603ev: return "603ev";
+ case CPU_SUBTYPE_POWERPC_604: return "604";
+ case CPU_SUBTYPE_POWERPC_604e: return "604e";
+ case CPU_SUBTYPE_POWERPC_620: return "620";
+ case CPU_SUBTYPE_POWERPC_750: return "750";
+ case CPU_SUBTYPE_POWERPC_7400: return "7400";
+ case CPU_SUBTYPE_POWERPC_7450: return "7450";
+ case CPU_SUBTYPE_POWERPC_970: return "970";
+ default: ;
+ }
+
+ return "generic";
+}
+#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__))
+std::string sys::getHostCPUName() {
+ // Access to the Processor Version Register (PVR) on PowerPC is privileged,
+ // and so we must use an operating-system interface to determine the current
+ // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
+ const char *generic = "generic";
+
+ // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+ // memory buffer because the 'file' has 0 size (it can only be read as
+ // a stream).
+
+ std::string Err;
+ DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+ if (!DS) {
+ DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+ return generic;
+ }
+
+ // The cpu line is second (after the 'processor: 0' line), so if this
+ // buffer is too small then something has changed (or is wrong).
+ char buffer[1024];
+ size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+ delete DS;
+
+ const char *CPUInfoStart = buffer;
+ const char *CPUInfoEnd = buffer + CPUInfoSize;
+
+ const char *CIP = CPUInfoStart;
+
+ const char *CPUStart = 0;
+ size_t CPULen = 0;
+
+ // We need to find the first line which starts with cpu, spaces, and a colon.
+ // After the colon, there may be some additional spaces and then the cpu type.
+ while (CIP < CPUInfoEnd && CPUStart == 0) {
+ if (CIP < CPUInfoEnd && *CIP == '\n')
+ ++CIP;
+
+ if (CIP < CPUInfoEnd && *CIP == 'c') {
+ ++CIP;
+ if (CIP < CPUInfoEnd && *CIP == 'p') {
+ ++CIP;
+ if (CIP < CPUInfoEnd && *CIP == 'u') {
+ ++CIP;
+ while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+ ++CIP;
+
+ if (CIP < CPUInfoEnd && *CIP == ':') {
+ ++CIP;
+ while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+ ++CIP;
+
+ if (CIP < CPUInfoEnd) {
+ CPUStart = CIP;
+ while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
+ *CIP != ',' && *CIP != '\n'))
+ ++CIP;
+ CPULen = CIP - CPUStart;
+ }
+ }
+ }
+ }
+ }
+
+ if (CPUStart == 0)
+ while (CIP < CPUInfoEnd && *CIP != '\n')
+ ++CIP;
+ }
+
+ if (CPUStart == 0)
+ return generic;
+
+ return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
+ .Case("604e", "604e")
+ .Case("604", "604")
+ .Case("7400", "7400")
+ .Case("7410", "7400")
+ .Case("7447", "7400")
+ .Case("7455", "7450")
+ .Case("G4", "g4")
+ .Case("POWER4", "970")
+ .Case("PPC970FX", "970")
+ .Case("PPC970MP", "970")
+ .Case("G5", "g5")
+ .Case("POWER5", "g5")
+ .Case("A2", "a2")
+ .Case("POWER6", "pwr6")
+ .Case("POWER7", "pwr7")
+ .Default(generic);
+}
+#elif defined(__linux__) && defined(__arm__)
+std::string sys::getHostCPUName() {
+ // The cpuid register on arm is not accessible from user space. On Linux,
+ // it is exposed through the /proc/cpuinfo file.
+ // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+ // memory buffer because the 'file' has 0 size (it can only be read as
+ // a stream).
+
+ std::string Err;
+ DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+ if (!DS) {
+ DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+ return "generic";
+ }
+
+ // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line
+ // in all cases.
+ char buffer[1024];
+ size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+ delete DS;
+
+ StringRef Str(buffer, CPUInfoSize);
+
+ SmallVector<StringRef, 32> Lines;
+ Str.split(Lines, "\n");
+
+ // Look for the CPU implementer line.
+ StringRef Implementer;
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("CPU implementer"))
+ Implementer = Lines[I].substr(15).ltrim("\t :");
+
+ if (Implementer == "0x41") // ARM Ltd.
+ // Look for the CPU part line.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("CPU part"))
+ // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+ // values correspond to the "Part number" in the CP15/c0 register. The
+ // contents are specified in the various processor manuals.
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+ .Case("0x926", "arm926ej-s")
+ .Case("0xb02", "mpcore")
+ .Case("0xb36", "arm1136j-s")
+ .Case("0xb56", "arm1156t2-s")
+ .Case("0xb76", "arm1176jz-s")
+ .Case("0xc08", "cortex-a8")
+ .Case("0xc09", "cortex-a9")
+ .Case("0xc20", "cortex-m0")
+ .Case("0xc23", "cortex-m3")
+ .Case("0xc24", "cortex-m4")
+ .Default("generic");
+
+ return "generic";
+}
#else
std::string sys::getHostCPUName() {
return "generic";
diff --git a/lib/Support/Memory.cpp b/lib/Support/Memory.cpp
index 2a1642a..22f7494 100644
--- a/lib/Support/Memory.cpp
+++ b/lib/Support/Memory.cpp
@@ -45,7 +45,7 @@ void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr,
# if (defined(__POWERPC__) || defined (__ppc__) || \
defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
- sys_icache_invalidate(Addr, Len);
+ sys_icache_invalidate(const_cast<void *>(Addr), Len);
# endif
#else
@@ -67,11 +67,12 @@ void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr,
asm volatile("isync");
# elif defined(__arm__) && defined(__GNUC__)
// FIXME: Can we safely always call this for __GNUC__ everywhere?
- char *Start = (char*) Addr;
- char *End = Start + Len;
- __clear_cache(Start, End);
+ const char *Start = static_cast<const char *>(Addr);
+ const char *End = Start + Len;
+ __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
# elif defined(__mips__)
- cacheflush((char*)Addr, Len, BCACHE);
+ const char *Start = static_cast<const char *>(Addr);
+ cacheflush(const_cast<char *>(Start), Len, BCACHE);
# endif
#endif // end apple
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 16e5c7a..992f03c 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -17,6 +17,7 @@
#include "llvm/Config/config.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Errno.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
@@ -214,6 +215,14 @@ error_code MemoryBuffer::getFile(const char *Filename,
OwningPtr<MemoryBuffer> &result,
int64_t FileSize,
bool RequiresNullTerminator) {
+ // First check that the "file" is not a directory
+ bool is_dir = false;
+ error_code err = sys::fs::is_directory(Filename, is_dir);
+ if (err)
+ return err;
+ if (is_dir)
+ return make_error_code(errc::is_a_directory);
+
int OpenFlags = O_RDONLY;
#ifdef O_BINARY
OpenFlags |= O_BINARY; // Open input file in binary mode on win32.
@@ -304,16 +313,6 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
RealMapOffset)) {
result.reset(GetNamedBuffer<MemoryBufferMMapFile>(
StringRef(Pages + Delta, MapSize), Filename, RequiresNullTerminator));
-
- if (RequiresNullTerminator && result->getBufferEnd()[0] != '\0') {
- // There could be a racing issue that resulted in the file being larger
- // than the FileSize passed by the caller. We already have an assertion
- // for this in MemoryBuffer::init() but have a runtime guarantee that
- // the buffer will be null-terminated here, so do a copy that adds a
- // null-terminator.
- result.reset(MemoryBuffer::getMemBufferCopy(result->getBuffer(),
- Filename));
- }
return error_code::success();
}
}
diff --git a/lib/Support/Mutex.cpp b/lib/Support/Mutex.cpp
index da5baab..4e4a026 100644
--- a/lib/Support/Mutex.cpp
+++ b/lib/Support/Mutex.cpp
@@ -59,7 +59,8 @@ MutexImpl::MutexImpl( bool recursive)
errorcode = pthread_mutexattr_settype(&attr, kind);
assert(errorcode == 0);
-#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && !defined(__DragonFly__)
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && \
+ !defined(__DragonFly__) && !defined(__Bitrig__)
// Make it a process local mutex
errorcode = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_PRIVATE);
assert(errorcode == 0);
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index dcddeda..db4a56b 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -60,8 +60,11 @@ sys::IdentifyFileType(const char *magic, unsigned length) {
case '\177':
if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') {
- if (length >= 18 && magic[17] == 0)
- switch (magic[16]) {
+ bool Data2MSB = magic[5] == 2;
+ unsigned high = Data2MSB ? 16 : 17;
+ unsigned low = Data2MSB ? 17 : 16;
+ if (length >= 18 && magic[high] == 0)
+ switch (magic[low]) {
default: break;
case 1: return ELF_Relocatable_FileType;
case 2: return ELF_Executable_FileType;
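
Before this fix, big-endian ELF objects were misidentified because the two-byte e_type field was always read as if little-endian. A hedged standalone equivalent of the corrected read:

    // e_ident[EI_DATA] is magic[5] (1 = ELFDATA2LSB, 2 = ELFDATA2MSB) and
    // e_type is the 16-bit field at offsets 16-17 of the ELF header.
    static unsigned elfTypeSketch(const unsigned char *magic) {
      bool Data2MSB = magic[5] == 2;
      return Data2MSB ? unsigned(magic[16]) << 8 | magic[17]
                      : unsigned(magic[17]) << 8 | magic[16];
    }
    // 1 -> relocatable, 2 -> executable, 3 -> shared object, 4 -> core.
    // The patched code reaches the same answer by checking that the high
    // byte is zero and switching on the low byte, picking the offsets by
    // endianness, since valid e_type values fit in one byte.
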
diff --git a/lib/Support/PathV2.cpp b/lib/Support/PathV2.cpp
index e2a69a6..46571c0 100644
--- a/lib/Support/PathV2.cpp
+++ b/lib/Support/PathV2.cpp
@@ -744,6 +744,8 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
/// @brief Identify the magic in magic.
file_magic identify_magic(StringRef magic) {
+ if (magic.size() < 4)
+ return file_magic::unknown;
switch ((unsigned char)magic[0]) {
case 0xDE: // 0x0B17C0DE = BC wrapper
if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 &&
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 15278c5..e4e01be 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -79,9 +79,10 @@ int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
return -1;
}
-/// FindLineNumber - Find the line number for the specified location in the
-/// specified file. This is not a fast method.
-unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const {
+/// getLineAndColumn - Find the line and column number for the specified
+/// location in the specified file. This is not a fast method.
+std::pair<unsigned, unsigned>
+SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const {
if (BufferID == -1) BufferID = FindBufferContainingLoc(Loc);
assert(BufferID != -1 && "Invalid Location!");
@@ -91,7 +92,8 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const {
// location.
unsigned LineNo = 1;
- const char *Ptr = Buff->getBufferStart();
+ const char *BufStart = Buff->getBufferStart();
+ const char *Ptr = BufStart;
// If we have a line number cache, and if the query is to a later point in the
// same file, start searching from the last query location. This optimizes
@@ -108,7 +110,6 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const {
for (; SMLoc::getFromPointer(Ptr) != Loc; ++Ptr)
if (*Ptr == '\n') ++LineNo;
-
// Allocate the line number cache if it doesn't exist.
if (LineNoCache == 0)
LineNoCache = new LineNoCacheTy();
@@ -118,7 +119,10 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const {
Cache.LastQueryBufferID = BufferID;
Cache.LastQuery = Ptr;
Cache.LineNoOfQuery = LineNo;
- return LineNo;
+
+ size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r");
+ if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0;
+ return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs);
}
void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
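
The column computation at the end of getLineAndColumn is terse; a short trace (illustrative):

    // Buffer "ab\ncd", Loc at 'd' (offset 4 from BufStart):
    //   LineNo      = 2      (one '\n' seen before Loc)
    //   NewlineOffs = 2      (last '\n' or '\r' within "ab\nc")
    //   Column      = Ptr - BufStart - NewlineOffs = 4 - 2 = 2   (1-based)
    // When no newline precedes Loc, NewlineOffs is forced to ~(size_t)0 and
    // the same unsigned subtraction wraps to offset + 1: the correct 1-based
    // column on line 1.
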
@@ -145,50 +149,59 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
ArrayRef<SMRange> Ranges) const {
// First thing to do: find the current buffer containing the specified
- // location.
- int CurBuf = FindBufferContainingLoc(Loc);
- assert(CurBuf != -1 && "Invalid or unspecified location!");
-
- MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer;
-
- // Scan backward to find the start of the line.
- const char *LineStart = Loc.getPointer();
- while (LineStart != CurMB->getBufferStart() &&
- LineStart[-1] != '\n' && LineStart[-1] != '\r')
- --LineStart;
-
- // Get the end of the line.
- const char *LineEnd = Loc.getPointer();
- while (LineEnd != CurMB->getBufferEnd() &&
- LineEnd[0] != '\n' && LineEnd[0] != '\r')
- ++LineEnd;
- std::string LineStr(LineStart, LineEnd);
-
- // Convert any ranges to column ranges that only intersect the line of the
- // location.
+ // location to pull out the source line.
SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges;
- for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
- SMRange R = Ranges[i];
- if (!R.isValid()) continue;
-
- // If the line doesn't contain any part of the range, then ignore it.
- if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart)
- continue;
-
- // Ignore pieces of the range that go onto other lines.
- if (R.Start.getPointer() < LineStart)
- R.Start = SMLoc::getFromPointer(LineStart);
- if (R.End.getPointer() > LineEnd)
- R.End = SMLoc::getFromPointer(LineEnd);
+ std::pair<unsigned, unsigned> LineAndCol;
+ const char *BufferID = "<unknown>";
+ std::string LineStr;
+
+ if (Loc.isValid()) {
+ int CurBuf = FindBufferContainingLoc(Loc);
+ assert(CurBuf != -1 && "Invalid or unspecified location!");
+
+ MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer;
+ BufferID = CurMB->getBufferIdentifier();
- // Translate from SMLoc ranges to column ranges.
- ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart,
- R.End.getPointer()-LineStart));
+ // Scan backward to find the start of the line.
+ const char *LineStart = Loc.getPointer();
+ const char *BufStart = CurMB->getBufferStart();
+ while (LineStart != BufStart && LineStart[-1] != '\n' &&
+ LineStart[-1] != '\r')
+ --LineStart;
+
+ // Get the end of the line.
+ const char *LineEnd = Loc.getPointer();
+ const char *BufEnd = CurMB->getBufferEnd();
+ while (LineEnd != BufEnd && LineEnd[0] != '\n' && LineEnd[0] != '\r')
+ ++LineEnd;
+ LineStr = std::string(LineStart, LineEnd);
+
+ // Convert any ranges to column ranges that only intersect the line of the
+ // location.
+ for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
+ SMRange R = Ranges[i];
+ if (!R.isValid()) continue;
+
+ // If the line doesn't contain any part of the range, then ignore it.
+ if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart)
+ continue;
+
+ // Ignore pieces of the range that go onto other lines.
+ if (R.Start.getPointer() < LineStart)
+ R.Start = SMLoc::getFromPointer(LineStart);
+ if (R.End.getPointer() > LineEnd)
+ R.End = SMLoc::getFromPointer(LineEnd);
+
+ // Translate from SMLoc ranges to column ranges.
+ ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart,
+ R.End.getPointer()-LineStart));
+ }
+
+ LineAndCol = getLineAndColumn(Loc, CurBuf);
}
-
- return SMDiagnostic(*this, Loc,
- CurMB->getBufferIdentifier(), FindLineNumber(Loc, CurBuf),
- Loc.getPointer()-LineStart, Kind, Msg.str(),
+
+ return SMDiagnostic(*this, Loc, BufferID, LineAndCol.first,
+ LineAndCol.second-1, Kind, Msg.str(),
LineStr, ColRanges);
}
@@ -205,9 +218,11 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
raw_ostream &OS = errs();
- int CurBuf = FindBufferContainingLoc(Loc);
- assert(CurBuf != -1 && "Invalid or unspecified location!");
- PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
+ if (Loc != SMLoc()) {
+ int CurBuf = FindBufferContainingLoc(Loc);
+ assert(CurBuf != -1 && "Invalid or unspecified location!");
+ PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
+ }
Diagnostic.print(0, OS, ShowColors);
}
@@ -228,8 +243,8 @@ SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, const std::string &FN,
void SMDiagnostic::print(const char *ProgName, raw_ostream &S,
bool ShowColors) const {
- // Display colors only if OS goes to a tty.
- ShowColors &= S.is_displayed();
+ // Display colors only if OS supports colors.
+ ShowColors &= S.has_colors();
if (ShowColors)
S.changeColor(raw_ostream::SAVEDCOLOR, true);
@@ -343,5 +358,3 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S,
S << '\n';
}
-
-
diff --git a/lib/Support/StreamableMemoryObject.cpp b/lib/Support/StreamableMemoryObject.cpp
index c23f07b..fe3752a 100644
--- a/lib/Support/StreamableMemoryObject.cpp
+++ b/lib/Support/StreamableMemoryObject.cpp
@@ -20,7 +20,7 @@ class RawMemoryObject : public StreamableMemoryObject {
public:
RawMemoryObject(const unsigned char *Start, const unsigned char *End) :
FirstChar(Start), LastChar(End) {
- assert(LastChar > FirstChar && "Invalid start/end range");
+ assert(LastChar >= FirstChar && "Invalid start/end range");
}
virtual uint64_t getBase() const { return 0; }
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
index c131fe0..c2fc261 100644
--- a/lib/Support/StringMap.cpp
+++ b/lib/Support/StringMap.cpp
@@ -189,7 +189,7 @@ void StringMapImpl::RehashTable() {
// grow/rehash the table.
if (NumItems*4 > NumBuckets*3) {
NewSize = NumBuckets*2;
- } else if (NumBuckets-(NumItems+NumTombstones) < NumBuckets/8) {
+ } else if (NumBuckets-(NumItems+NumTombstones) <= NumBuckets/8) {
NewSize = NumBuckets;
} else {
return;
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index abe570f..8aab4b2 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/edit_distance.h"
+
#include <bitset>
using namespace llvm;
@@ -230,6 +231,31 @@ StringRef::size_type StringRef::find_last_of(StringRef Chars,
return npos;
}
+/// find_last_not_of - Find the last character in the string that is not
+/// \arg C, or npos if not found.
+StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
+ for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+ if (Data[i] != C)
+ return i;
+ return npos;
+}
+
+/// find_last_not_of - Find the last character in the string that is not in
+/// \arg Chars, or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
+ size_t From) const {
+ std::bitset<1 << CHAR_BIT> CharBits;
+ for (size_type i = 0, e = Chars.size(); i != e; ++i)
+ CharBits.set((unsigned char)Chars[i]);
+
+ for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+ if (!CharBits.test((unsigned char)Data[i]))
+ return i;
+ return npos;
+}
+
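
The usual consumer of the new methods is right-trimming; a small sketch:

    #include "llvm/ADT/StringRef.h"
    using namespace llvm;

    void trimExample() {
      StringRef S("value\t \r\n");
      S.find_last_not_of('\n');                   // 7, the '\r'
      size_t End = S.find_last_not_of(" \t\r\n"); // 4, the final 'e'
      StringRef Trimmed = S.substr(0, End + 1);   // "value"
      (void)Trimmed;
    }
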
void StringRef::split(SmallVectorImpl<StringRef> &A,
StringRef Separators, int MaxSplit,
bool KeepEmpty) const {
@@ -272,14 +298,22 @@ static unsigned GetAutoSenseRadix(StringRef &Str) {
if (Str.startswith("0x")) {
Str = Str.substr(2);
return 16;
- } else if (Str.startswith("0b")) {
+ }
+
+ if (Str.startswith("0b")) {
Str = Str.substr(2);
return 2;
- } else if (Str.startswith("0")) {
+ }
+
+ if (Str.startswith("0o")) {
+ Str = Str.substr(2);
return 8;
- } else {
- return 10;
}
+
+ if (Str.startswith("0"))
+ return 8;
+
+ return 10;
}
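
GetAutoSenseRadix is file-static, but its effect is visible through getAsInteger with radix 0. With the new 0o branch, the prefixes map as follows (a sketch; getAsInteger returns true on failure and writes the value through its out-parameter):

    #include "llvm/ADT/StringRef.h"
    using namespace llvm;

    void radixExamples() {
      long long V;
      StringRef("0x1f").getAsInteger(0, V);  // hex     -> 31
      StringRef("0b101").getAsInteger(0, V); // binary  -> 5
      StringRef("0o17").getAsInteger(0, V);  // octal   -> 15 (new prefix)
      StringRef("017").getAsInteger(0, V);   // octal   -> 15 (bare leading 0)
      StringRef("42").getAsInteger(0, V);    // decimal -> 42
    }
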
@@ -383,7 +417,7 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
unsigned BitWidth = Log2Radix * Str.size();
if (BitWidth < Result.getBitWidth())
BitWidth = Result.getBitWidth(); // don't shrink the result
- else
+ else if (BitWidth > Result.getBitWidth())
Result = Result.zext(BitWidth);
APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp
index 53c8d84..9c81327 100644
--- a/lib/Support/TargetRegistry.cpp
+++ b/lib/Support/TargetRegistry.cpp
@@ -23,6 +23,47 @@ TargetRegistry::iterator TargetRegistry::begin() {
return iterator(FirstTarget);
}
+const Target *TargetRegistry::lookupTarget(const std::string &ArchName,
+ Triple &TheTriple,
+ std::string &Error) {
+ // Allocate target machine. First, check whether the user has explicitly
+ // specified an architecture to compile for. If so we have to look it up by
+ // name, because it might be a backend that has no mapping to a target triple.
+ const Target *TheTarget = 0;
+ if (!ArchName.empty()) {
+ for (TargetRegistry::iterator it = TargetRegistry::begin(),
+ ie = TargetRegistry::end(); it != ie; ++it) {
+ if (ArchName == it->getName()) {
+ TheTarget = &*it;
+ break;
+ }
+ }
+
+ if (!TheTarget) {
+ Error = "error: invalid target '" + ArchName + "'.\n";
+ return 0;
+ }
+
+ // Adjust the triple to match (if known), otherwise stick with the
+ // given triple.
+ Triple::ArchType Type = Triple::getArchTypeForLLVMName(ArchName);
+ if (Type != Triple::UnknownArch)
+ TheTriple.setArch(Type);
+ } else {
+ // Get the target specific parser.
+ std::string TempError;
+ TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), TempError);
+ if (TheTarget == 0) {
+ Error = ": error: unable to get target for '"
+ + TheTriple.getTriple()
+ + "', see --version and --triple.\n";
+ return 0;
+ }
+ }
+
+ return TheTarget;
+}
+
const Target *TargetRegistry::lookupTarget(const std::string &TT,
std::string &Error) {
// Provide special warning when no targets are initialized.
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp
index 08b12b6..0587aae 100644
--- a/lib/Support/ThreadLocal.cpp
+++ b/lib/Support/ThreadLocal.cpp
@@ -25,9 +25,18 @@ namespace llvm {
using namespace sys;
ThreadLocalImpl::ThreadLocalImpl() { }
ThreadLocalImpl::~ThreadLocalImpl() { }
-void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);}
-const void* ThreadLocalImpl::getInstance() { return data; }
-void ThreadLocalImpl::removeInstance() { data = 0; }
+void ThreadLocalImpl::setInstance(const void* d) {
+ typedef int SIZE_TOO_BIG[sizeof(d) <= sizeof(data) ? 1 : -1];
+ void **pd = reinterpret_cast<void**>(&data);
+ *pd = const_cast<void*>(d);
+}
+const void* ThreadLocalImpl::getInstance() {
+ void **pd = reinterpret_cast<void**>(&data);
+ return *pd;
+}
+void ThreadLocalImpl::removeInstance() {
+ setInstance(0);
+}
}
#else
@@ -40,31 +49,30 @@ void ThreadLocalImpl::removeInstance() { data = 0; }
namespace llvm {
using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() : data(0) {
- pthread_key_t* key = new pthread_key_t;
+ThreadLocalImpl::ThreadLocalImpl() : data() {
+ typedef int SIZE_TOO_BIG[sizeof(pthread_key_t) <= sizeof(data) ? 1 : -1];
+ pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
int errorcode = pthread_key_create(key, NULL);
assert(errorcode == 0);
(void) errorcode;
- data = (void*)key;
}
ThreadLocalImpl::~ThreadLocalImpl() {
- pthread_key_t* key = static_cast<pthread_key_t*>(data);
+ pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
int errorcode = pthread_key_delete(*key);
assert(errorcode == 0);
(void) errorcode;
- delete key;
}
void ThreadLocalImpl::setInstance(const void* d) {
- pthread_key_t* key = static_cast<pthread_key_t*>(data);
+ pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
int errorcode = pthread_setspecific(*key, d);
assert(errorcode == 0);
(void) errorcode;
}
const void* ThreadLocalImpl::getInstance() {
- pthread_key_t* key = static_cast<pthread_key_t*>(data);
+ pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
return pthread_getspecific(*key);
}
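
The SIZE_TOO_BIG typedef is a pre-C++11 static assertion: a negative array bound makes the program ill-formed, so the build breaks if pthread_key_t ever stops fitting in the inline data buffer. Distilled (the macro and type names are hypothetical):

    #include <pthread.h>

    // Legal only when Cond is true; otherwise the bound is -1 and the
    // compiler rejects the translation unit at the point of use.
    #define STATIC_ASSERT_PRE_CXX11(Cond, Name) typedef int Name[(Cond) ? 1 : -1]

    struct TLSData { char data[sizeof(void *)]; };
    STATIC_ASSERT_PRE_CXX11(sizeof(pthread_key_t) <= sizeof(TLSData),
                            KeyFitsInInlineBuffer);

The payoff of storing the key in place, as the hunks above do, is dropping the new/delete pair per ThreadLocal object.
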
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 44a1b38..cca549d 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -38,8 +38,8 @@ const char *Triple::getArchTypeName(ArchType Kind) {
case x86_64: return "x86_64";
case xcore: return "xcore";
case mblaze: return "mblaze";
- case ptx32: return "ptx32";
- case ptx64: return "ptx64";
+ case nvptx: return "nvptx";
+ case nvptx64: return "nvptx64";
case le32: return "le32";
case amdil: return "amdil";
}
@@ -62,7 +62,12 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case mblaze: return "mblaze";
- case hexagon: return "hexagon";
+ case mips:
+ case mipsel:
+ case mips64:
+ case mips64el:return "mips";
+
+ case hexagon: return "hexagon";
case r600: return "r600";
@@ -74,8 +79,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case xcore: return "xcore";
- case ptx32: return "ptx";
- case ptx64: return "ptx";
+ case nvptx: return "nvptx";
+ case nvptx64: return "nvptx";
case le32: return "le32";
case amdil: return "amdil";
}
@@ -119,6 +124,7 @@ const char *Triple::getOSTypeName(OSType Kind) {
case RTEMS: return "rtems";
case NativeClient: return "nacl";
case CNK: return "cnk";
+ case Bitrig: return "bitrig";
}
llvm_unreachable("Invalid OSType");
@@ -160,8 +166,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
.Case("x86", x86)
.Case("x86-64", x86_64)
.Case("xcore", xcore)
- .Case("ptx32", ptx32)
- .Case("ptx64", ptx64)
+ .Case("nvptx", nvptx)
+ .Case("nvptx64", nvptx64)
.Case("le32", le32)
.Case("amdil", amdil)
.Default(UnknownArch);
@@ -192,8 +198,8 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) {
.Cases("arm", "armv4t", "armv5", "armv6", Triple::arm)
.Cases("armv7", "armv7f", "armv7k", "armv7s", "xscale", Triple::arm)
.Case("r600", Triple::r600)
- .Case("ptx32", Triple::ptx32)
- .Case("ptx64", Triple::ptx64)
+ .Case("nvptx", Triple::nvptx)
+ .Case("nvptx64", Triple::nvptx64)
.Case("amdil", Triple::amdil)
.Default(Triple::UnknownArch);
}
@@ -215,8 +221,8 @@ const char *Triple::getArchNameForAssembler() {
.Cases("armv6", "thumbv6", "armv6")
.Cases("armv7", "thumbv7", "armv7")
.Case("r600", "r600")
- .Case("ptx32", "ptx32")
- .Case("ptx64", "ptx64")
+ .Case("nvptx", "nvptx")
+ .Case("nvptx64", "nvptx64")
.Case("le32", "le32")
.Case("amdil", "amdil")
.Default(NULL);
@@ -249,8 +255,8 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Case("sparcv9", Triple::sparcv9)
.Case("tce", Triple::tce)
.Case("xcore", Triple::xcore)
- .Case("ptx32", Triple::ptx32)
- .Case("ptx64", Triple::ptx64)
+ .Case("nvptx", Triple::nvptx)
+ .Case("nvptx64", Triple::nvptx64)
.Case("le32", Triple::le32)
.Case("amdil", Triple::amdil)
.Default(Triple::UnknownArch);
@@ -288,6 +294,7 @@ static Triple::OSType parseOS(StringRef OSName) {
.StartsWith("rtems", Triple::RTEMS)
.StartsWith("nacl", Triple::NativeClient)
.StartsWith("cnk", Triple::CNK)
+ .StartsWith("bitrig", Triple::Bitrig)
.Default(Triple::UnknownOS);
}
@@ -584,6 +591,29 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor,
return true;
}
+void Triple::getiOSVersion(unsigned &Major, unsigned &Minor,
+ unsigned &Micro) const {
+ switch (getOS()) {
+ default: llvm_unreachable("unexpected OS for Darwin triple");
+ case Darwin:
+ case MacOSX:
+ // Ignore the version from the triple. This is only handled because the
+ // clang driver combines OS X and iOS support into a common Darwin
+ // toolchain that wants to know the iOS version number even when targeting
+ // OS X.
+ Major = 3;
+ Minor = 0;
+ Micro = 0;
+ break;
+ case IOS:
+ getOSVersion(Major, Minor, Micro);
+ // Default to 3.0.
+ if (Major == 0)
+ Major = 3;
+ break;
+ }
+}
+
void Triple::setTriple(const Twine &Str) {
*this = Triple(Str);
}
@@ -652,8 +682,8 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::mblaze:
case llvm::Triple::mips:
case llvm::Triple::mipsel:
+ case llvm::Triple::nvptx:
case llvm::Triple::ppc:
- case llvm::Triple::ptx32:
case llvm::Triple::r600:
case llvm::Triple::sparc:
case llvm::Triple::tce:
@@ -664,8 +694,8 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::mips64:
case llvm::Triple::mips64el:
+ case llvm::Triple::nvptx64:
case llvm::Triple::ppc64:
- case llvm::Triple::ptx64:
case llvm::Triple::sparcv9:
case llvm::Triple::x86_64:
return 64;
@@ -701,8 +731,8 @@ Triple Triple::get32BitArchVariant() const {
case Triple::mblaze:
case Triple::mips:
case Triple::mipsel:
+ case Triple::nvptx:
case Triple::ppc:
- case Triple::ptx32:
case Triple::r600:
case Triple::sparc:
case Triple::tce:
@@ -714,8 +744,8 @@ Triple Triple::get32BitArchVariant() const {
case Triple::mips64: T.setArch(Triple::mips); break;
case Triple::mips64el: T.setArch(Triple::mipsel); break;
+ case Triple::nvptx64: T.setArch(Triple::nvptx); break;
case Triple::ppc64: T.setArch(Triple::ppc); break;
- case Triple::ptx64: T.setArch(Triple::ptx32); break;
case Triple::sparcv9: T.setArch(Triple::sparc); break;
case Triple::x86_64: T.setArch(Triple::x86); break;
}
@@ -742,8 +772,8 @@ Triple Triple::get64BitArchVariant() const {
case Triple::mips64:
case Triple::mips64el:
+ case Triple::nvptx64:
case Triple::ppc64:
- case Triple::ptx64:
case Triple::sparcv9:
case Triple::x86_64:
// Already 64-bit.
@@ -751,8 +781,8 @@ Triple Triple::get64BitArchVariant() const {
case Triple::mips: T.setArch(Triple::mips64); break;
case Triple::mipsel: T.setArch(Triple::mips64el); break;
+ case Triple::nvptx: T.setArch(Triple::nvptx64); break;
case Triple::ppc: T.setArch(Triple::ppc64); break;
- case Triple::ptx32: T.setArch(Triple::ptx64); break;
case Triple::sparc: T.setArch(Triple::sparcv9); break;
case Triple::x86: T.setArch(Triple::x86_64); break;
}
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index ddc1e0f..6bddbdf 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -260,7 +260,7 @@ Path::GetCurrentDirectory() {
return Path(pathname);
}
-#if defined(__FreeBSD__) || defined (__NetBSD__) || \
+#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__)
static int
test_dir(char buf[PATH_MAX], char ret[PATH_MAX],
@@ -329,7 +329,7 @@ Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
if (realpath(exe_path, link_path))
return Path(link_path);
}
-#elif defined(__FreeBSD__) || defined (__NetBSD__) || \
+#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__)
char exe_path[PATH_MAX];
@@ -884,7 +884,8 @@ const char *Path::MapInFilePages(int FD, size_t FileSize, off_t Offset) {
}
void Path::UnMapFilePages(const char *BasePtr, size_t FileSize) {
- ::munmap((void*)BasePtr, FileSize);
+ const void *Addr = static_cast<const void *>(BasePtr);
+ ::munmap(const_cast<void *>(Addr), FileSize);
}
} // end llvm namespace
diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc
index edb101e..99f8cd4 100644
--- a/lib/Support/Unix/PathV2.inc
+++ b/lib/Support/Unix/PathV2.inc
@@ -17,12 +17,16 @@
//===----------------------------------------------------------------------===//
#include "Unix.h"
+#include "llvm/Support/Process.h"
#if HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#if HAVE_FCNTL_H
#include <fcntl.h>
#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
#if HAVE_DIRENT_H
# include <dirent.h>
# define NAMLEN(dirent) strlen((dirent)->d_name)
@@ -46,6 +50,12 @@
#include <limits.h>
#endif
+// Both stdio.h and cstdio are included via different paths and
+// stdcxx's cstdio doesn't include stdio.h, so it doesn't #undef the macros
+// either.
+#undef ferror
+#undef feof
+
// For GNU Hurd
#if defined(__GNU__) && !defined(PATH_MAX)
# define PATH_MAX 4096
@@ -272,8 +282,7 @@ error_code exists(const Twine &path, bool &result) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
- struct stat status;
- if (::stat(p.begin(), &status) == -1) {
+ if (::access(p.begin(), F_OK) == -1) {
if (errno != errc::no_such_file_or_directory)
return error_code(errno, system_category());
result = false;
@@ -285,8 +294,8 @@ error_code exists(const Twine &path, bool &result) {
bool equivalent(file_status A, file_status B) {
assert(status_known(A) && status_known(B));
- return A.st_dev == B.st_dev &&
- A.st_ino == B.st_ino;
+ return A.fs_st_dev == B.fs_st_dev &&
+ A.fs_st_ino == B.fs_st_ino;
}
error_code equivalent(const Twine &A, const Twine &B, bool &result) {
@@ -325,30 +334,62 @@ error_code status(const Twine &path, file_status &result) {
return ec;
}
+ perms prms = static_cast<perms>(status.st_mode & perms_mask);
+
if (S_ISDIR(status.st_mode))
- result = file_status(file_type::directory_file);
+ result = file_status(file_type::directory_file, prms);
else if (S_ISREG(status.st_mode))
- result = file_status(file_type::regular_file);
+ result = file_status(file_type::regular_file, prms);
else if (S_ISBLK(status.st_mode))
- result = file_status(file_type::block_file);
+ result = file_status(file_type::block_file, prms);
else if (S_ISCHR(status.st_mode))
- result = file_status(file_type::character_file);
+ result = file_status(file_type::character_file, prms);
else if (S_ISFIFO(status.st_mode))
- result = file_status(file_type::fifo_file);
+ result = file_status(file_type::fifo_file, prms);
else if (S_ISSOCK(status.st_mode))
- result = file_status(file_type::socket_file);
+ result = file_status(file_type::socket_file, prms);
else
- result = file_status(file_type::type_unknown);
+ result = file_status(file_type::type_unknown, prms);
- result.st_dev = status.st_dev;
- result.st_ino = status.st_ino;
+ result.fs_st_dev = status.st_dev;
+ result.fs_st_ino = status.st_ino;
return error_code::success();
}
+// Modifies permissions on a file.
+error_code permissions(const Twine &path, perms prms) {
+ if ((prms & add_perms) && (prms & remove_perms))
+ llvm_unreachable("add_perms and remove_perms are mutually exclusive");
+
+ // Get current permissions
+ file_status info;
+ if (error_code ec = status(path, info)) {
+ return ec;
+ }
+
+ // Set updated permissions.
+ SmallString<128> path_storage;
+ StringRef p = path.toNullTerminatedStringRef(path_storage);
+ perms permsToSet;
+ if (prms & add_perms) {
+ permsToSet = (info.permissions() | prms) & perms_mask;
+ } else if (prms & remove_perms) {
+ permsToSet = (info.permissions() & ~prms) & perms_mask;
+ } else {
+ permsToSet = prms & perms_mask;
+ }
+ if (::chmod(p.begin(), static_cast<mode_t>(permsToSet))) {
+ return error_code(errno, system_category());
+ }
+
+ return error_code::success();
+}
+
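
Callers of the new permissions() pick one of three behaviors. Only enum names that appear in this patch are used below, and the explicit casts assume no overloaded bitwise operators on perms (a sketch; each call returns an error_code, ignored here for brevity):

    #include "llvm/Support/FileSystem.h"
    using namespace llvm::sys::fs;

    void permissionsExamples() {
      // Or owner-execute into the current mode (what the F_executable path
      // in FileOutputBuffer::create does, bit by bit):
      permissions("/tmp/out.bin", static_cast<perms>(owner_exe | add_perms));

      // Mask owner-execute back out:
      permissions("/tmp/out.bin", static_cast<perms>(owner_exe | remove_perms));

      // With neither flag set, the argument is the absolute new mode:
      permissions("/tmp/out.bin",
                  static_cast<perms>(owner_read | group_read | others_read));
    }
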
+// Since this is most often used for temporary files, mode defaults to 0600.
error_code unique_file(const Twine &model, int &result_fd,
- SmallVectorImpl<char> &result_path,
- bool makeAbsolute) {
+ SmallVectorImpl<char> &result_path,
+ bool makeAbsolute, unsigned mode) {
SmallString<128> Model;
model.toVector(Model);
// Null terminate.
@@ -365,37 +406,20 @@ error_code unique_file(const Twine &model, int &result_fd,
}
}
- // Replace '%' with random chars. From here on, DO NOT modify model. It may be
- // needed if the randomly chosen path already exists.
- SmallString<128> RandomPath;
- RandomPath.reserve(Model.size() + 1);
- ::srand(::time(NULL));
+ // From here on, DO NOT modify model. It may be needed if the randomly chosen
+ // path already exists.
+ SmallString<128> RandomPath = Model;
retry_random_path:
- // This is opened here instead of above to make it easier to track when to
- // close it. Collisions should be rare enough for the possible extra syscalls
- // not to matter.
- FILE *RandomSource = ::fopen("/dev/urandom", "r");
- RandomPath.set_size(0);
- for (SmallVectorImpl<char>::const_iterator i = Model.begin(),
- e = Model.end(); i != e; ++i) {
- if (*i == '%') {
- char val = 0;
- if (RandomSource)
- val = fgetc(RandomSource);
- else
- val = ::rand();
- RandomPath.push_back("0123456789abcdef"[val & 15]);
- } else
- RandomPath.push_back(*i);
+ // Replace '%' with random chars.
+ for (unsigned i = 0, e = Model.size(); i != e; ++i) {
+ if (Model[i] == '%')
+ RandomPath[i] = "0123456789abcdef"[sys::Process::GetRandomNumber() & 15];
}
- if (RandomSource)
- ::fclose(RandomSource);
-
// Try to open + create the file.
rety_open_create:
- int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, 0600);
+ int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, mode);
if (RandomFD == -1) {
// If the file existed, try again, otherwise, error.
if (errno == errc::file_exists)
@@ -511,6 +535,36 @@ error_code get_magic(const Twine &path, uint32_t len,
return error_code::success();
}
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
+ bool map_writable, void *&result) {
+ SmallString<128> path_storage;
+ StringRef name = path.toNullTerminatedStringRef(path_storage);
+ int oflags = map_writable ? O_RDWR : O_RDONLY;
+ int ofd = ::open(name.begin(), oflags);
+ if ( ofd == -1 )
+ return error_code(errno, system_category());
+ AutoFD fd(ofd);
+ int flags = map_writable ? MAP_SHARED : MAP_PRIVATE;
+ int prot = map_writable ? (PROT_READ|PROT_WRITE) : PROT_READ;
+#ifdef MAP_FILE
+ flags |= MAP_FILE;
+#endif
+ result = ::mmap(0, size, prot, flags, fd, file_offset);
+ if (result == MAP_FAILED) {
+ return error_code(errno, system_category());
+ }
+
+ return error_code::success();
+}
+
+error_code unmap_file_pages(void *base, size_t size) {
+ if ( ::munmap(base, size) == -1 )
+ return error_code(errno, system_category());
+
+ return error_code::success();
+}
+
+
} // end namespace fs
} // end namespace sys
} // end namespace llvm
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index f640462..5204147 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -12,15 +12,18 @@
//===----------------------------------------------------------------------===//
#include "Unix.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/Support/TimeValue.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h>
#endif
-// DragonFly BSD has deprecated <malloc.h> for <stdlib.h> instead,
-// Unix.h includes this for us already.
-#if defined(HAVE_MALLOC_H) && !defined(__DragonFly__)
+// DragonFlyBSD, OpenBSD, and Bitrig have deprecated <malloc.h> for
+// <stdlib.h> instead. Unix.h includes this for us already.
+#if defined(HAVE_MALLOC_H) && !defined(__DragonFly__) && \
+ !defined(__OpenBSD__) && !defined(__Bitrig__)
#include <malloc.h>
#endif
#ifdef HAVE_MALLOC_MALLOC_H
@@ -247,16 +250,18 @@ static bool terminalHasColors() {
return false;
}
+bool Process::FileDescriptorHasColors(int fd) {
+ // A file descriptor has colors if it is displayed and the terminal has
+ // colors.
+ return FileDescriptorIsDisplayed(fd) && terminalHasColors();
+}
+
bool Process::StandardOutHasColors() {
- if (!StandardOutIsDisplayed())
- return false;
- return terminalHasColors();
+ return FileDescriptorHasColors(STDOUT_FILENO);
}
bool Process::StandardErrHasColors() {
- if (!StandardErrIsDisplayed())
- return false;
- return terminalHasColors();
+ return FileDescriptorHasColors(STDERR_FILENO);
}
bool Process::ColorNeedsFlush() {
@@ -297,3 +302,33 @@ const char *Process::OutputReverse() {
const char *Process::ResetColor() {
return "\033[0m";
}
+
+#if !defined(HAVE_ARC4RANDOM)
+static unsigned GetRandomNumberSeed() {
+ // Attempt to get the initial seed from /dev/urandom, if possible.
+ if (FILE *RandomSource = ::fopen("/dev/urandom", "r")) {
+ unsigned seed;
+ int count = ::fread((void *)&seed, sizeof(seed), 1, RandomSource);
+ ::fclose(RandomSource);
+
+ // Return the seed if the read was successful.
+ if (count == 1)
+ return seed;
+ }
+
+ // Otherwise, swizzle the current time and the process ID to form a reasonable
+ // seed.
+ TimeValue Now = llvm::TimeValue::now();
+ return hash_combine(Now.seconds(), Now.nanoseconds(), ::getpid());
+}
+#endif
+
+unsigned llvm::sys::Process::GetRandomNumber() {
+#if defined(HAVE_ARC4RANDOM)
+ return arc4random();
+#else
+ static int x = (::srand(GetRandomNumberSeed()), 0);
+ (void)x;
+ return ::rand();
+#endif
+}
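
unique_file() is the first consumer of the new entropy source; the hex-digit fill it performs reduces to the following (a sketch mirroring the rewritten loop above):

    #include "llvm/Support/Process.h"

    // Replace each '%' in a NUL-terminated template with a random hex digit,
    // the way the rewritten unique_file() loop does.
    static void fillTemplate(char *Path) {
      for (char *P = Path; *P; ++P)
        if (*P == '%')
          *P = "0123456789abcdef"[llvm::sys::Process::GetRandomNumber() & 15];
    }
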
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index c9ec9fc..5195116 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -15,6 +15,7 @@
#include "Unix.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Mutex.h"
+#include <string>
#include <vector>
#include <algorithm>
#if HAVE_EXECINFO_H
@@ -43,7 +44,7 @@ static SmartMutex<true> SignalsMutex;
/// InterruptFunction - The function to call if ctrl-c is pressed.
static void (*InterruptFunction)() = 0;
-static std::vector<sys::Path> FilesToRemove;
+static std::vector<std::string> FilesToRemove;
static std::vector<std::pair<void(*)(void*), void*> > CallBacksToRun;
// IntSigs - Signals that may interrupt the program at any time.
@@ -117,10 +118,20 @@ static void UnregisterHandlers() {
/// RemoveFilesToRemove - Process the FilesToRemove list. This function
/// should be called with the SignalsMutex lock held.
+/// NB: This must be an async-signal-safe function. It cannot allocate or free
+/// memory, even in debug builds.
static void RemoveFilesToRemove() {
- while (!FilesToRemove.empty()) {
- FilesToRemove.back().eraseFromDisk(true);
- FilesToRemove.pop_back();
+ // Note: avoid iterators here; debug-mode iterators may allocate or release
+ // memory.
+ for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) {
+ // Note that we don't want to use any external code here, and we don't care
+ // about errors. We're going to try as hard as we can, as often as we need
+ // to, to make these files go away. If these aren't files, too bad.
+ //
+ // We do, however, rely on a std::string implementation for which repeated
+ // calls to 'c_str()' don't allocate memory. We pre-call 'c_str()' on all
+ // of these strings to try to ensure this is safe.
+ unlink(FilesToRemove[i].c_str());
}
}
@@ -178,7 +189,19 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) {
bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename,
std::string* ErrMsg) {
SignalsMutex.acquire();
- FilesToRemove.push_back(Filename);
+ std::string *OldPtr = FilesToRemove.empty() ? 0 : &FilesToRemove[0];
+ FilesToRemove.push_back(Filename.str());
+
+ // We want to call 'c_str()' on every std::string in this vector so that if
+ // the underlying implementation requires a re-allocation, it happens here
+ // rather than inside of the signal handler. If we see the vector grow, we
+ // have to call it on every entry. If it remains in place, we only need to
+ // call it on the latest one.
+ if (OldPtr == &FilesToRemove[0])
+ FilesToRemove.back().c_str();
+ else
+ for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i)
+ FilesToRemove[i].c_str();
SignalsMutex.release();
@@ -189,10 +212,19 @@ bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename,
// DontRemoveFileOnSignal - The public API
void llvm::sys::DontRemoveFileOnSignal(const sys::Path &Filename) {
SignalsMutex.acquire();
- std::vector<sys::Path>::reverse_iterator I =
- std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename);
- if (I != FilesToRemove.rend())
- FilesToRemove.erase(I.base()-1);
+ std::vector<std::string>::reverse_iterator RI =
+ std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename.str());
+ std::vector<std::string>::iterator I = FilesToRemove.end();
+ if (RI != FilesToRemove.rend())
+ I = FilesToRemove.erase(RI.base()-1);
+
+ // We need to call c_str() on every element that the erase shifted down.
+ // In a C++98 implementation where the first call to c_str() may require a
+ // reallocation, the c_str() call made on insertion can be invalidated when
+ // an element is copied down a slot, so we redo the calls here.
+ for (std::vector<std::string>::iterator E = FilesToRemove.end(); I != E; ++I)
+ I->c_str();
+
SignalsMutex.release();
}
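
The two Signals.inc hunks above follow one rule: a signal handler may only touch memory that was fully materialized before the signal could arrive. A minimal sketch of that pattern, with illustrative names (registerFile, onSignal) and a hypothetical path; only unlink() and _exit(), both async-signal-safe, run inside the handler.

    #include <csignal>
    #include <string>
    #include <vector>
    #include <unistd.h>

    static std::vector<std::string> FilesToRemove;

    void registerFile(const std::string &Path) {
      FilesToRemove.push_back(Path);
      // Force any lazy allocation now, outside the handler.
      for (size_t i = 0, e = FilesToRemove.size(); i != e; ++i)
        FilesToRemove[i].c_str();
    }

    extern "C" void onSignal(int) {
      // Index-based loop: debug-mode iterators may allocate, unsafe here.
      for (size_t i = 0, e = FilesToRemove.size(); i != e; ++i)
        unlink(FilesToRemove[i].c_str());
      _exit(1);
    }

    int main() {
      registerFile("/tmp/example.tmp"); // hypothetical path
      std::signal(SIGINT, onSignal);
      pause();
    }
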
diff --git a/lib/Support/Unix/Unix.h b/lib/Support/Unix/Unix.h
index b7be311..361f297 100644
--- a/lib/Support/Unix/Unix.h
+++ b/lib/Support/Unix/Unix.h
@@ -44,16 +44,10 @@
#include <assert.h>
#endif
-#ifdef TIME_WITH_SYS_TIME
+#ifdef HAVE_SYS_TIME_H
# include <sys/time.h>
-# include <time.h>
-#else
-# ifdef HAVE_SYS_TIME_H
-# include <sys/time.h>
-# else
-# include <time.h>
-# endif
#endif
+#include <time.h>
#ifdef HAVE_SYS_WAIT_H
# include <sys/wait.h>
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index d8dc522..2280b34 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -188,8 +188,20 @@ static Path *TempDirectory;
Path
Path::GetTemporaryDirectory(std::string* ErrMsg) {
- if (TempDirectory)
+ if (TempDirectory) {
+#if defined(_MSC_VER)
+ // Visual Studio gets confused and emits a deprecation diagnostic about
+ // calling exists(), even though this is the PathV1 implementation itself.
+ // Temporarily disable the deprecation warning (C4996).
+ #pragma warning(push)
+ #pragma warning(disable:4996)
+#endif
+ assert(TempDirectory->exists() && "Who has removed TempDirectory?");
+#if defined(_MSC_VER)
+ #pragma warning(pop)
+#endif
return *TempDirectory;
+ }
char pathname[MAX_PATH];
if (!GetTempPath(MAX_PATH, pathname)) {
@@ -201,7 +213,7 @@ Path::GetTemporaryDirectory(std::string* ErrMsg) {
Path result;
result.set(pathname);
- // Append a subdirectory passed on our process id so multiple LLVMs don't
+ // Append a subdirectory based on our process id so multiple LLVMs don't
// step on each other's toes.
#ifdef __MINGW32__
// Mingw's Win32 header files are broken.
diff --git a/lib/Support/Windows/PathV2.inc b/lib/Support/Windows/PathV2.inc
index e9ce5d9..66eeab0 100644
--- a/lib/Support/Windows/PathV2.inc
+++ b/lib/Support/Windows/PathV2.inc
@@ -301,11 +301,21 @@ error_code rename(const Twine &from, const Twine &to) {
if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
- if (!::MoveFileExW(wide_from.begin(), wide_to.begin(),
- MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
- return windows_error(::GetLastError());
+ error_code ec = error_code::success();
+ for (int i = 0; i < 2000; i++) {
+ if (::MoveFileExW(wide_from.begin(), wide_to.begin(),
+ MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
+ return error_code::success();
+ ec = windows_error(::GetLastError());
+ if (ec != windows_error::access_denied)
+ break;
+ // Retry MoveFileExW() on ERROR_ACCESS_DENIED: a system scanner (e.g. an
+ // indexer) may briefly hold the source file open right after it is
+ // written and closed.
+ ::Sleep(1);
+ }
- return error_code::success();
+ return ec;
}
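
The retry loop above is a bounded retry on exactly one transient error. A condensed sketch of the same pattern using the raw Win32 calls; renameWithRetry is an illustrative name, and the 2000-iteration limit and 1 ms sleep mirror the patch.

    #include <windows.h>

    static bool renameWithRetry(const wchar_t *From, const wchar_t *To) {
      const int RetryLimit = 2000;
      for (int i = 0; i < RetryLimit; ++i) {
        if (MoveFileExW(From, To,
                        MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
          return true;
        if (GetLastError() != ERROR_ACCESS_DENIED)
          return false; // A real failure; retrying won't help.
        Sleep(1); // A scanner/indexer may hold the file briefly.
      }
      return false;
    }
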
error_code resize_file(const Twine &path, uint64_t size) {
@@ -487,9 +497,46 @@ handle_status_error:
return error_code::success();
}
+
+// Modifies permissions on a file.
+error_code permissions(const Twine &path, perms prms) {
+#if 0 // verify code below before enabling:
+ // If the permissions bits are not trying to modify
+ // "write" permissions, there is nothing to do.
+ if (!(prms & (owner_write|group_write|others_write)))
+ return error_code::success();
+
+ SmallString<128> path_storage;
+ SmallVector<wchar_t, 128> path_utf16;
+
+ if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+ path_utf16))
+ return ec;
+
+ DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
+
+ if (prms & add_perms) {
+ attributes &= ~FILE_ATTRIBUTE_READONLY;
+ }
+ else if (prms & remove_perms) {
+ attributes |= FILE_ATTRIBUTE_READONLY;
+ }
+ else {
+ assert(0 && "neither add_perms nor remove_perms is set");
+ }
+
+ if ( ! ::SetFileAttributesW(path_utf16.begin(), attributes))
+ return windows_error(::GetLastError());
+#endif
+ return error_code::success();
+}
+
+
+// FIXME: mode should be used here and default to user r/w only;
+// it currently comes in as a UNIX mode.
error_code unique_file(const Twine &model, int &result_fd,
- SmallVectorImpl<char> &result_path,
- bool makeAbsolute) {
+ SmallVectorImpl<char> &result_path,
+ bool makeAbsolute, unsigned mode) {
// Use result_path as temp storage.
result_path.set_size(0);
StringRef m = model.toStringRef(result_path);
@@ -743,6 +790,19 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) {
return error_code::success();
}
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
+ bool map_writable, void *&result) {
+ assert(0 && "NOT IMPLEMENTED");
+ return windows_error::invalid_function;
+}
+
+error_code unmap_file_pages(void *base, size_t size) {
+ assert(0 && "NOT IMPLEMENTED");
+ return windows_error::invalid_function;
+}
+
+
+
} // end namespace fs
} // end namespace sys
} // end namespace llvm
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 9a388b4..e29eb6d 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -133,7 +133,7 @@ bool Process::StandardErrIsDisplayed() {
}
bool Process::FileDescriptorIsDisplayed(int fd) {
- DWORD Mode; // Unused
+ DWORD Mode; // Unused
return (GetConsoleMode((HANDLE)_get_osfhandle(fd), &Mode) != 0);
}
@@ -153,13 +153,17 @@ unsigned Process::StandardErrColumns() {
return Columns;
}
-// It always has colors.
-bool Process::StandardErrHasColors() {
- return StandardErrIsDisplayed();
+// The terminal always has colors.
+bool Process::FileDescriptorHasColors(int fd) {
+ return FileDescriptorIsDisplayed(fd);
}
bool Process::StandardOutHasColors() {
- return StandardOutIsDisplayed();
+ return FileDescriptorHasColors(1);
+}
+
+bool Process::StandardErrHasColors() {
+ return FileDescriptorHasColors(2);
}
namespace {
diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 26b9bba..9593923 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc
@@ -67,9 +67,9 @@ static bool loadSRW() {
"ReleaseSRWLockShared");
::FreeLibrary(hLib);
- if (fpInitializeSRWLock != NULL) {
- sHasSRW = true;
- }
+ if (fpInitializeSRWLock != NULL) {
+ sHasSRW = true;
+ }
}
}
return sHasSRW;
diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc
index 512462d..057deb3 100644
--- a/lib/Support/Windows/ThreadLocal.inc
+++ b/lib/Support/Windows/ThreadLocal.inc
@@ -22,26 +22,25 @@
namespace llvm {
using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() {
- DWORD* tls = new DWORD;
+ThreadLocalImpl::ThreadLocalImpl() : data() {
+ typedef int SIZE_TOO_BIG[sizeof(DWORD) <= sizeof(data) ? 1 : -1];
+ DWORD* tls = reinterpret_cast<DWORD*>(&data);
*tls = TlsAlloc();
assert(*tls != TLS_OUT_OF_INDEXES);
- data = tls;
}
ThreadLocalImpl::~ThreadLocalImpl() {
- DWORD* tls = static_cast<DWORD*>(data);
+ DWORD* tls = reinterpret_cast<DWORD*>(&data);
TlsFree(*tls);
- delete tls;
}
const void* ThreadLocalImpl::getInstance() {
- DWORD* tls = static_cast<DWORD*>(data);
+ DWORD* tls = reinterpret_cast<DWORD*>(&data);
return TlsGetValue(*tls);
}
void ThreadLocalImpl::setInstance(const void* d){
- DWORD* tls = static_cast<DWORD*>(data);
+ DWORD* tls = reinterpret_cast<DWORD*>(&data);
int errorcode = TlsSetValue(*tls, const_cast<void*>(d));
assert(errorcode != 0);
(void)errorcode;
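
The SIZE_TOO_BIG typedef above is the pre-C++11 idiom for a compile-time assertion: the array type gets length -1, and therefore fails to compile, whenever the condition is false. A self-contained sketch of the same trick, assuming a void* 'data' slot as in ThreadLocalImpl; the struct name is illustrative.

    #include <windows.h>

    struct ThreadLocalStorage {
      void *data; // generic in-object storage, no heap allocation

      ThreadLocalStorage() : data() {
        // Refuses to compile if DWORD no longer fits in 'data'.
        typedef int SizeTooBig[sizeof(DWORD) <= sizeof(data) ? 1 : -1];
        (void)sizeof(SizeTooBig);
        *reinterpret_cast<DWORD *>(&data) = TlsAlloc();
      }
      ~ThreadLocalStorage() { TlsFree(*reinterpret_cast<DWORD *>(&data)); }
    };
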
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index d38b51b..7c353c8 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -27,12 +27,12 @@ using namespace llvm;
using namespace yaml;
enum UnicodeEncodingForm {
- UEF_UTF32_LE, //< UTF-32 Little Endian
- UEF_UTF32_BE, //< UTF-32 Big Endian
- UEF_UTF16_LE, //< UTF-16 Little Endian
- UEF_UTF16_BE, //< UTF-16 Big Endian
- UEF_UTF8, //< UTF-8 or ascii.
- UEF_Unknown //< Not a valid Unicode encoding.
+ UEF_UTF32_LE, ///< UTF-32 Little Endian
+ UEF_UTF32_BE, ///< UTF-32 Big Endian
+ UEF_UTF16_LE, ///< UTF-16 Little Endian
+ UEF_UTF16_BE, ///< UTF-16 Big Endian
+ UEF_UTF8, ///< UTF-8 or ascii.
+ UEF_Unknown ///< Not a valid Unicode encoding.
};
/// EncodingInfo - Holds the encoding type and length of the byte order mark if
@@ -489,9 +489,6 @@ private:
/// @brief Can the next token be the start of a simple key?
bool IsSimpleKeyAllowed;
- /// @brief Is the next token required to start a simple key?
- bool IsSimpleKeyRequired;
-
/// @brief True if an error has occurred.
bool Failed;
@@ -658,7 +655,7 @@ std::string yaml::escape(StringRef Input) {
EscapedInput += "\\r";
else if (*i == 0x1B)
EscapedInput += "\\e";
- else if (*i >= 0 && *i < 0x20) { // Control characters not handled above.
+ else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
std::string HexStr = utohexstr(*i);
EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
} else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
@@ -704,7 +701,6 @@ Scanner::Scanner(StringRef Input, SourceMgr &sm)
, FlowLevel(0)
, IsStartOfStream(true)
, IsSimpleKeyAllowed(true)
- , IsSimpleKeyRequired(false)
, Failed(false) {
InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
SM.AddNewSourceBuffer(InputBuffer, SMLoc());
@@ -755,6 +751,8 @@ Token Scanner::getNext() {
}
StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
+ if (Position == End)
+ return Position;
// Check 7 bit c-printable - b-char.
if ( *Position == 0x09
|| (*Position >= 0x20 && *Position <= 0x7E))
@@ -778,6 +776,8 @@ StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
}
StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
+ if (Position == End)
+ return Position;
if (*Position == 0x0D) {
if (Position + 1 != End && *(Position + 1) == 0x0A)
return Position + 2;
@@ -1211,7 +1211,9 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
++Current;
// Repeat until the previous character was not a '\' or was an escaped
// backslash.
- } while (*(Current - 1) == '\\' && wasEscaped(Start + 1, Current));
+ } while ( Current != End
+ && *(Current - 1) == '\\'
+ && wasEscaped(Start + 1, Current));
} else {
skip(1);
while (true) {
@@ -1624,9 +1626,7 @@ StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
return UnquotedValue;
}
// Plain or block.
- size_t trimtrail = Value.rfind(' ');
- return Value.drop_back(
- trimtrail == StringRef::npos ? 0 : Value.size() - trimtrail);
+ return Value.rtrim(" ");
}
StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
@@ -1732,8 +1732,10 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
if (UnquotedValue.size() < 3)
// TODO: Report error.
break;
- unsigned int UnicodeScalarValue = 0;
- UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue);
+ unsigned int UnicodeScalarValue;
+ if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
+ // TODO: Report error.
+ UnicodeScalarValue = 0xFFFD;
encodeUTF8(UnicodeScalarValue, Storage);
UnquotedValue = UnquotedValue.substr(2);
break;
@@ -1742,8 +1744,10 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
if (UnquotedValue.size() < 5)
// TODO: Report error.
break;
- unsigned int UnicodeScalarValue = 0;
- UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue);
+ unsigned int UnicodeScalarValue;
+ if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
+ // TODO: Report error.
+ UnicodeScalarValue = 0xFFFD;
encodeUTF8(UnicodeScalarValue, Storage);
UnquotedValue = UnquotedValue.substr(4);
break;
@@ -1752,8 +1756,10 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
if (UnquotedValue.size() < 9)
// TODO: Report error.
break;
- unsigned int UnicodeScalarValue = 0;
- UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue);
+ unsigned int UnicodeScalarValue;
+ if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
+ // TODO: Report error.
+ UnicodeScalarValue = 0xFFFD;
encodeUTF8(UnicodeScalarValue, Storage);
UnquotedValue = UnquotedValue.substr(8);
break;
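
All three \x/\u/\U hunks apply the same hardening: StringRef::getAsInteger returns true on failure, so the output variable must not be consumed unchecked. A small sketch of the check, substituting U+FFFD (the Unicode replacement character) as the patch does; parseHexEscape is an illustrative name.

    #include "llvm/ADT/StringRef.h"

    unsigned parseHexEscape(llvm::StringRef Digits) {
      unsigned UnicodeScalarValue;
      if (Digits.getAsInteger(16, UnicodeScalarValue))
        UnicodeScalarValue = 0xFFFD; // malformed escape -> replacement char
      return UnicodeScalarValue;
    }
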
@@ -2113,5 +2119,3 @@ bool Document::expectToken(int TK) {
}
return true;
}
-
-OwningPtr<Document> document_iterator::NullDoc;
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 86cdca1..fa69c2d 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -528,7 +528,8 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
} else {
// Use ::writev() where available.
#if defined(HAVE_WRITEV)
- struct iovec IOV = { (void*) Ptr, Size };
+ const void *Addr = static_cast<const void *>(Ptr);
+ struct iovec IOV = {const_cast<void *>(Addr), Size };
ret = ::writev(FD, &IOV, 1);
#else
ret = ::write(FD, Ptr, Size);
@@ -650,6 +651,10 @@ bool raw_fd_ostream::is_displayed() const {
return sys::Process::FileDescriptorIsDisplayed(FD);
}
+bool raw_fd_ostream::has_colors() const {
+ return sys::Process::FileDescriptorHasColors(FD);
+}
+
//===----------------------------------------------------------------------===//
// outs(), errs(), nulls()
//===----------------------------------------------------------------------===//
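
A hedged usage sketch for the new has_colors() hook: gate color escape sequences on the stream's own descriptor rather than assuming stdout. The warn() helper and its formatting are illustrative.

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/raw_ostream.h"

    void warn(llvm::StringRef Msg) {
      llvm::raw_ostream &OS = llvm::errs();
      if (OS.has_colors())
        OS.changeColor(llvm::raw_ostream::YELLOW, /*Bold=*/true);
      OS << "warning: " << Msg << '\n';
      if (OS.has_colors())
        OS.resetColor();
    }
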
diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt
index 82f72b0..ba7bf14 100644
--- a/lib/TableGen/CMakeLists.txt
+++ b/lib/TableGen/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMTableGen
Error.cpp
Main.cpp
Record.cpp
+ StringMatcher.cpp
TableGenAction.cpp
TableGenBackend.cpp
TGLexer.cpp
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index 01bc55e..7aeef56 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp
@@ -34,7 +34,9 @@ namespace {
cl::init("-"));
cl::opt<std::string>
- DependFilename("d", cl::desc("Dependency filename"), cl::value_desc("filename"),
+ DependFilename("d",
+ cl::desc("Dependency filename"),
+ cl::value_desc("filename"),
cl::init(""));
cl::opt<std::string>
@@ -53,7 +55,8 @@ int TableGenMain(char *argv0, TableGenAction &Action) {
try {
// Parse the input file.
OwningPtr<MemoryBuffer> File;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
+ if (error_code ec =
+ MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
errs() << "Could not open input file '" << InputFilename << "': "
<< ec.message() <<"\n";
return 1;
@@ -93,7 +96,7 @@ int TableGenMain(char *argv0, TableGenAction &Action) {
DepOut.os() << OutputFilename << ":";
const std::vector<std::string> &Dependencies = Parser.getDependencies();
for (std::vector<std::string>::const_iterator I = Dependencies.begin(),
- E = Dependencies.end();
+ E = Dependencies.end();
I != E; ++I) {
DepOut.os() << " " << (*I);
}
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 93eed24..99fdc1f 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -1699,7 +1699,7 @@ void Record::checkName() {
assert(TypedName && "Record name is not typed!");
RecTy *Type = TypedName->getType();
if (dynamic_cast<StringRecTy *>(Type) == 0) {
- throw "Record name is not a string!";
+ throw TGError(getLoc(), "Record name is not a string!");
}
}
diff --git a/utils/TableGen/StringMatcher.cpp b/lib/TableGen/StringMatcher.cpp
index 6aedcbf..1668170 100644
--- a/utils/TableGen/StringMatcher.cpp
+++ b/lib/TableGen/StringMatcher.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "StringMatcher.h"
+#include "llvm/TableGen/StringMatcher.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
using namespace llvm;
@@ -87,11 +87,11 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
<< Matches[0]->first[CharNo] << "')\n";
OS << Indent << " break;\n";
} else {
- // Do the comparison with if (Str.substr(1, 3) != "foo").
+ // Do the comparison with 'if (memcmp(Str.data()+1, "foo", 3))'.
// FIXME: Need to escape general strings.
- OS << Indent << "if (" << StrVariableName << ".substr(" << CharNo << ", "
- << NumChars << ") != \"";
- OS << Matches[0]->first.substr(CharNo, NumChars) << "\")\n";
+ OS << Indent << "if (memcmp(" << StrVariableName << ".data()+" << CharNo
+ << ", \"" << Matches[0]->first.substr(CharNo, NumChars) << "\", "
+ << NumChars << "))\n";
OS << Indent << " break;\n";
}
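
Both emitted forms compare the same bytes; the memcmp version just avoids materializing a temporary StringRef per character group. A sketch of the two shapes side by side, assuming a hypothetical keyword "foo" and that the surrounding matcher has already switched on Str.size(), so the three bytes are in bounds.

    #include <cstring>
    #include "llvm/ADT/StringRef.h"

    // Old emitted shape: substr() builds a temporary StringRef to compare.
    bool matchesOld(llvm::StringRef Str) { return Str.substr(1, 3) == "foo"; }

    // New emitted shape: memcmp straight against the backing data.
    bool matchesNew(llvm::StringRef Str) {
      return std::memcmp(Str.data() + 1, "foo", 3) == 0;
    }
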
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 04c4fc1..b9c7ff6 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -292,107 +292,78 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
/// ProcessForeachDefs - Given a record, apply all of the variable
/// values in all surrounding foreach loops, creating new records for
/// each combination of values.
-bool TGParser::ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
- SMLoc Loc) {
+bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc) {
+ if (Loops.empty())
+ return false;
+
// We want to instantiate a new copy of CurRec for each combination
// of nested loop iterator values. We don't want to instantiate
// any copies until we have values for each loop iterator.
IterSet IterVals;
- for (LoopVector::iterator Loop = Loops.begin(), LoopEnd = Loops.end();
- Loop != LoopEnd;
- ++Loop) {
- // Process this loop.
- if (ProcessForeachDefs(CurRec, CurMultiClass, Loc,
- IterVals, *Loop, Loop+1)) {
- Error(Loc,
- "Could not process loops for def " + CurRec->getNameInitAsString());
- return true;
- }
- }
-
- return false;
+ return ProcessForeachDefs(CurRec, Loc, IterVals);
}
/// ProcessForeachDefs - Given a record, a loop and a loop iterator,
/// apply each of the variable values in this loop and then process
/// subloops.
-bool TGParser::ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
- SMLoc Loc, IterSet &IterVals,
- ForeachLoop &CurLoop,
- LoopVector::iterator NextLoop) {
- Init *IterVar = CurLoop.IterVar;
- ListInit *List = dynamic_cast<ListInit *>(CurLoop.ListValue);
-
- if (List == 0) {
- Error(Loc, "Loop list is not a list");
- return true;
- }
-
- // Process each value.
- for (int64_t i = 0; i < List->getSize(); ++i) {
- Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i);
- IterVals.push_back(IterRecord(IterVar, ItemVal));
-
- if (IterVals.size() == Loops.size()) {
- // Ok, we have all of the iterator values for this point in the
- // iteration space. Instantiate a new record to reflect this
- // combination of values.
- Record *IterRec = new Record(*CurRec);
-
- // Set the iterator values now.
- for (IterSet::iterator i = IterVals.begin(), iend = IterVals.end();
- i != iend;
- ++i) {
- VarInit *IterVar = dynamic_cast<VarInit *>(i->IterVar);
- if (IterVar == 0) {
- Error(Loc, "foreach iterator is unresolved");
- return true;
- }
-
- TypedInit *IVal = dynamic_cast<TypedInit *>(i->IterValue);
- if (IVal == 0) {
- Error(Loc, "foreach iterator value is untyped");
- return true;
- }
-
- IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));
+bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
+ // Recursively build a tuple of iterator values.
+ if (IterVals.size() != Loops.size()) {
+ assert(IterVals.size() < Loops.size());
+ ForeachLoop &CurLoop = Loops[IterVals.size()];
+ ListInit *List = dynamic_cast<ListInit *>(CurLoop.ListValue);
+ if (List == 0) {
+ Error(Loc, "Loop list is not a list");
+ return true;
+ }
- if (SetValue(IterRec, Loc, IterVar->getName(),
- std::vector<unsigned>(), IVal)) {
- Error(Loc, "when instantiating this def");
- return true;
- }
+ // Process each value.
+ for (int64_t i = 0; i < List->getSize(); ++i) {
+ Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i);
+ IterVals.push_back(IterRecord(CurLoop.IterVar, ItemVal));
+ if (ProcessForeachDefs(CurRec, Loc, IterVals))
+ return true;
+ IterVals.pop_back();
+ }
+ return false;
+ }
- // Resolve it next.
- IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName()));
+ // This is the bottom of the recursion. We have all of the iterator values
+ // for this point in the iteration space. Instantiate a new record to
+ // reflect this combination of values.
+ Record *IterRec = new Record(*CurRec);
- // Remove it.
- IterRec->removeValue(IterVar->getName());
- }
+ // Set the iterator values now.
+ for (unsigned i = 0, e = IterVals.size(); i != e; ++i) {
+ VarInit *IterVar = IterVals[i].IterVar;
+ TypedInit *IVal = dynamic_cast<TypedInit *>(IterVals[i].IterValue);
+ if (IVal == 0) {
+ Error(Loc, "foreach iterator value is untyped");
+ return true;
+ }
- if (Records.getDef(IterRec->getNameInitAsString())) {
- Error(Loc, "def already exists: " + IterRec->getNameInitAsString());
- return true;
- }
+ IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));
- Records.addDef(IterRec);
- IterRec->resolveReferences();
+ if (SetValue(IterRec, Loc, IterVar->getName(),
+ std::vector<unsigned>(), IVal)) {
+ Error(Loc, "when instantiating this def");
+ return true;
}
- if (NextLoop != Loops.end()) {
- // Process nested loops.
- if (ProcessForeachDefs(CurRec, CurMultiClass, Loc, IterVals, *NextLoop,
- NextLoop+1)) {
- Error(Loc,
- "Could not process loops for def " +
- CurRec->getNameInitAsString());
- return true;
- }
- }
+ // Resolve it next.
+ IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName()));
- // We're done with this iterator.
- IterVals.pop_back();
+ // Remove it.
+ IterRec->removeValue(IterVar->getName());
}
+
+ if (Records.getDef(IterRec->getNameInitAsString())) {
+ Error(Loc, "def already exists: " + IterRec->getNameInitAsString());
+ return true;
+ }
+
+ Records.addDef(IterRec);
+ IterRec->resolveReferences();
return false;
}
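
Stripped of the TableGen types, the rewritten ProcessForeachDefs is a classic recursive cartesian product: push one iterator value per loop level, recurse, and instantiate only when the tuple is complete. A standalone sketch of that control flow; all names are illustrative.

    #include <cstdio>
    #include <vector>

    typedef std::vector<std::vector<int> > LoopVector; // one value list per loop

    static void instantiate(const std::vector<int> &IterVals) {
      for (size_t i = 0; i < IterVals.size(); ++i)
        std::printf("%d ", IterVals[i]);
      std::printf("\n");
    }

    static void walk(const LoopVector &Loops, std::vector<int> &IterVals) {
      if (IterVals.size() != Loops.size()) { // still descending
        const std::vector<int> &List = Loops[IterVals.size()];
        for (size_t i = 0; i < List.size(); ++i) {
          IterVals.push_back(List[i]);
          walk(Loops, IterVals);
          IterVals.pop_back();
        }
        return;
      }
      instantiate(IterVals); // bottom: full tuple of iterator values
    }
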
@@ -1726,9 +1697,11 @@ Init *TGParser::ParseDeclaration(Record *CurRec,
/// the name of the declared object or a NULL Init on error. Return
/// the name of the parsed initializer list through ForeachListName.
///
-/// ForeachDeclaration ::= ID '=' Value
+/// ForeachDeclaration ::= ID '=' '[' ValueList ']'
+/// ForeachDeclaration ::= ID '=' '{' RangeList '}'
+/// ForeachDeclaration ::= ID '=' RangePiece
///
-Init *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) {
+VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) {
if (Lex.getCode() != tgtok::Id) {
TokError("Expected identifier in foreach declaration");
return 0;
@@ -1744,26 +1717,59 @@ Init *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) {
}
Lex.Lex(); // Eat the '='
- // Expect a list initializer.
- ForeachListValue = ParseValue(0, 0, ParseForeachMode);
+ RecTy *IterType = 0;
+ std::vector<unsigned> Ranges;
- TypedInit *TypedList = dynamic_cast<TypedInit *>(ForeachListValue);
- if (TypedList == 0) {
- TokError("Value list is untyped");
- return 0;
+ switch (Lex.getCode()) {
+ default: TokError("Unknown token when expecting a range list"); return 0;
+ case tgtok::l_square: { // '[' ValueList ']'
+ Init *List = ParseSimpleValue(0, 0, ParseForeachMode);
+ ForeachListValue = dynamic_cast<ListInit*>(List);
+ if (ForeachListValue == 0) {
+ TokError("Expected a Value list");
+ return 0;
+ }
+ RecTy *ValueType = ForeachListValue->getType();
+ ListRecTy *ListType = dynamic_cast<ListRecTy *>(ValueType);
+ if (ListType == 0) {
+ TokError("Value list is not of list type");
+ return 0;
+ }
+ IterType = ListType->getElementType();
+ break;
}
- RecTy *ValueType = TypedList->getType();
- ListRecTy *ListType = dynamic_cast<ListRecTy *>(ValueType);
- if (ListType == 0) {
- TokError("Value list is not of list type");
- return 0;
+ case tgtok::IntVal: { // RangePiece.
+ if (ParseRangePiece(Ranges))
+ return 0;
+ break;
}
- RecTy *IterType = ListType->getElementType();
- VarInit *IterVar = VarInit::get(DeclName, IterType);
+ case tgtok::l_brace: { // '{' RangeList '}'
+ Lex.Lex(); // eat the '{'
+ Ranges = ParseRangeList();
+ if (Lex.getCode() != tgtok::r_brace) {
+ TokError("expected '}' at end of bit range list");
+ return 0;
+ }
+ Lex.Lex();
+ break;
+ }
+ }
- return IterVar;
+ if (!Ranges.empty()) {
+ assert(!IterType && "Type already initialized?");
+ IterType = IntRecTy::get();
+ std::vector<Init*> Values;
+ for (unsigned i = 0, e = Ranges.size(); i != e; ++i)
+ Values.push_back(IntInit::get(Ranges[i]));
+ ForeachListValue = ListInit::get(Values, IterType);
+ }
+
+ if (!IterType)
+ return 0;
+
+ return VarInit::get(DeclName, IterType);
}
/// ParseTemplateArgList - Read a template argument list, which is a non-empty
@@ -1932,7 +1938,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
// Parse ObjectName and make a record for it.
Record *CurRec = new Record(ParseObjectName(CurMultiClass), DefLoc, Records);
- if (!CurMultiClass) {
+ if (!CurMultiClass && Loops.empty()) {
// Top-level def definition.
// Ensure redefinition doesn't happen.
@@ -1942,7 +1948,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
return true;
}
Records.addDef(CurRec);
- } else {
+ } else if (CurMultiClass) {
// Otherwise, a def inside a multiclass, add it to the multiclass.
for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); i != e; ++i)
if (CurMultiClass->DefPrototypes[i]->getNameInit()
@@ -1978,7 +1984,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
}
}
- if (ProcessForeachDefs(CurRec, CurMultiClass, DefLoc)) {
+ if (ProcessForeachDefs(CurRec, DefLoc)) {
Error(DefLoc,
"Could not process loops for def" + CurRec->getNameInitAsString());
return true;
@@ -1999,8 +2005,8 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) {
// Make a temporary object to record items associated with the for
// loop.
- Init *ListValue = 0;
- Init *IterName = ParseForeachDeclaration(ListValue);
+ ListInit *ListValue = 0;
+ VarInit *IterName = ParseForeachDeclaration(ListValue);
if (IterName == 0)
return TokError("expected declaration in for");
@@ -2278,23 +2284,33 @@ InstantiateMulticlassDef(MultiClass &MC,
Ref.Rec = DefProto;
AddSubClass(CurRec, Ref);
- if (DefNameString == 0) {
- // We must resolve references to NAME.
- if (SetValue(CurRec, Ref.RefLoc, "NAME", std::vector<unsigned>(),
- DefmPrefix)) {
- Error(DefmPrefixLoc, "Could not resolve "
- + CurRec->getNameInitAsString() + ":NAME to '"
- + DefmPrefix->getAsUnquotedString() + "'");
- return 0;
- }
+ // Set the value for NAME. We don't resolve references to it 'til later,
+ // though, so that uses in nested multiclass names don't get
+ // confused.
+ if (SetValue(CurRec, Ref.RefLoc, "NAME", std::vector<unsigned>(),
+ DefmPrefix)) {
+ Error(DefmPrefixLoc, "Could not resolve "
+ + CurRec->getNameInitAsString() + ":NAME to '"
+ + DefmPrefix->getAsUnquotedString() + "'");
+ return 0;
+ }
+ // If the DefNameString didn't resolve, we probably have a reference to
+ // NAME and need to replace it. We need to do at least this much greedily,
+ // otherwise nested multiclasses will end up with incorrect NAME expansions.
+ if (DefNameString == 0) {
RecordVal *DefNameRV = CurRec->getValue("NAME");
CurRec->resolveReferencesTo(DefNameRV);
}
if (!CurMultiClass) {
- // We do this after resolving NAME because before resolution, many
- // multiclass defs will have the same name expression. If we are
+ // Now that we're at the top level, resolve all NAME references
+ // in the resultant defs that weren't in the def names themselves.
+ RecordVal *DefNameRV = CurRec->getValue("NAME");
+ CurRec->resolveReferencesTo(DefNameRV);
+
+ // Now that NAME references are resolved and we're at the top level of
+ // any multiclass expansions, add the record to the RecordKeeper. If we are
// currently in a multiclass, it means this defm appears inside a
// multiclass and its name won't be fully resolvable until we see
// the top-level defm. Therefore, we don't add this to the
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index b8e7cb1..3d2c72c 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@@ -45,10 +45,11 @@ namespace llvm {
/// ForeachLoop - Record the iteration state associated with a for loop.
/// This is used to instantiate items in the loop body.
struct ForeachLoop {
- Init *IterVar;
- Init *ListValue;
+ VarInit *IterVar;
+ ListInit *ListValue;
- ForeachLoop(Init *IVar, Init *LValue) : IterVar(IVar), ListValue(LValue) {}
+ ForeachLoop(VarInit *IVar, ListInit *LValue)
+ : IterVar(IVar), ListValue(LValue) {}
};
class TGParser {
@@ -113,20 +114,17 @@ private: // Semantic analysis methods.
// IterRecord: Map an iterator name to a value.
struct IterRecord {
- Init *IterVar;
+ VarInit *IterVar;
Init *IterValue;
- IterRecord(Init *Var, Init *Val) : IterVar(Var), IterValue(Val) {}
+ IterRecord(VarInit *Var, Init *Val) : IterVar(Var), IterValue(Val) {}
};
// IterSet: The set of all iterator values at some point in the
// iteration space.
typedef std::vector<IterRecord> IterSet;
- bool ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
- SMLoc Loc);
- bool ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
- SMLoc Loc, IterSet &IterVals, ForeachLoop &CurLoop,
- LoopVector::iterator NextLoop);
+ bool ProcessForeachDefs(Record *CurRec, SMLoc Loc);
+ bool ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals);
private: // Parser methods.
bool ParseObjectList(MultiClass *MC = 0);
@@ -160,7 +158,7 @@ private: // Parser methods.
bool ParseTemplateArgList(Record *CurRec);
Init *ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs);
- Init *ParseForeachDeclaration(Init *&ForeachListValue);
+ VarInit *ParseForeachDeclaration(ListInit *&ForeachListValue);
SubClassReference ParseSubClassReference(Record *CurRec, bool isDefm);
SubMultiClassReference ParseSubMultiClassReference(MultiClass *CurMC);
diff --git a/lib/TableGen/TableGenBackend.cpp b/lib/TableGen/TableGenBackend.cpp
index 09bcc7a..7c8367a 100644
--- a/lib/TableGen/TableGenBackend.cpp
+++ b/lib/TableGen/TableGenBackend.cpp
@@ -1,4 +1,4 @@
-//===- TableGenBackend.cpp - Base class for TableGen Backends ---*- C++ -*-===//
+//===- TableGenBackend.cpp - Utilities for TableGen Backends ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,17 +11,27 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/TableGenBackend.h"
-#include "llvm/TableGen/Record.h"
using namespace llvm;
-void TableGenBackend::anchor() { }
-
-void TableGenBackend::EmitSourceFileHeader(StringRef Desc,
- raw_ostream &OS) const {
- OS << "//===- TableGen'erated file -------------------------------------*-"
- " C++ -*-===//\n//\n// " << Desc << "\n//\n// Automatically generate"
- "d file, do not edit!\n//\n//===------------------------------------"
- "----------------------------------===//\n\n";
+static void printLine(raw_ostream &OS, const Twine &Prefix, char Fill,
+ StringRef Suffix) {
+ uint64_t Pos = OS.tell();
+ OS << Prefix;
+ for (unsigned i = OS.tell() - Pos, e = 80 - Suffix.size(); i != e; ++i)
+ OS << Fill;
+ OS << Suffix << '\n';
}
+void llvm::emitSourceFileHeader(StringRef Desc, raw_ostream &OS) {
+ printLine(OS, "/*===- TableGen'erated file ", '-', "*- C++ -*-===*\\");
+ printLine(OS, "|*", ' ', "*|");
+ printLine(OS, "|* " + Desc, ' ', "*|");
+ printLine(OS, "|*", ' ', "*|");
+ printLine(OS, "|* Automatically generated file, do not edit!", ' ', "*|");
+ printLine(OS, "|*", ' ', "*|");
+ printLine(OS, "\\*===", '-', "===*/");
+ OS << '\n';
+}
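
The new printLine pads each banner line with the fill character out to column 80 before appending the suffix, so the generated header is valid as both C and C++ comment syntax. A libc-only sketch of the padding logic, assuming prefix and suffix together stay under 80 columns.

    #include <cstdio>
    #include <cstring>

    // Emit Prefix, pad with Fill up to column 80 minus the suffix, then Suffix.
    static void printLine(const char *Prefix, char Fill, const char *Suffix) {
      std::fputs(Prefix, stdout);
      for (size_t i = std::strlen(Prefix), e = 80 - std::strlen(Suffix);
           i < e; ++i)
        std::fputc(Fill, stdout);
      std::printf("%s\n", Suffix);
    }

    int main() {
      printLine("/*===- TableGen'erated file ", '-', "*- C++ -*-===*\\");
      printLine("|*", ' ', "*|");
      printLine("\\*===", '-', "===*/");
    }
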
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 9b0cb0c..69e2346 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -141,7 +141,7 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
FeatureAvoidPartialCPSR]>;
class ProcNoItin<string Name, list<SubtargetFeature> Features>
- : Processor<Name, GenericItineraries, Features>;
+ : Processor<Name, NoItineraries, Features>;
// V4 Processors.
def : ProcNoItin<"generic", []>;
@@ -204,13 +204,13 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
FeatureDSPThumb2]>;
// V7a Processors.
-def : Processor<"cortex-a8", CortexA8Itineraries,
+def : ProcessorModel<"cortex-a8", CortexA8Model,
[ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureHasRAS]>;
-def : Processor<"cortex-a9", CortexA9Itineraries,
+def : ProcessorModel<"cortex-a9", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureHasRAS]>;
-def : Processor<"cortex-a9-mp", CortexA9Itineraries,
+def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureMP,
FeatureHasRAS]>;
@@ -224,7 +224,7 @@ def : ProcNoItin<"cortex-m3", [HasV7Ops,
def : ProcNoItin<"cortex-m4", [HasV7Ops,
FeatureThumb2, FeatureNoARM, FeatureDB,
FeatureHWDiv, FeatureDSPThumb2,
- FeatureT2XtPk, FeatureVFP2,
+ FeatureT2XtPk, FeatureVFP4,
FeatureVFPOnlySP, FeatureMClass]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 410790a..8536b94 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -23,8 +23,8 @@
#include "InstPrinter/ARMInstPrinter.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMMCExpr.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/Assembly/Writer.h"
@@ -283,9 +283,16 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
}
}
-void ARMAsmPrinter::EmitFunctionEntryLabel() {
- OutStreamer.ForceCodeRegion();
+void ARMAsmPrinter::EmitFunctionBodyEnd() {
+ // Make sure to terminate any constant pools that were at the end
+ // of the function.
+ if (!InConstantPool)
+ return;
+ InConstantPool = false;
+ OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+void ARMAsmPrinter::EmitFunctionEntryLabel() {
if (AFI->isThumbFunction()) {
OutStreamer.EmitAssemblerFlag(MCAF_Code16);
OutStreamer.EmitThumbFunc(CurrentFnSym);
@@ -415,7 +422,9 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (ExtraCode[1] != 0) return true; // Unknown modifier.
switch (ExtraCode[0]) {
- default: return true; // Unknown modifier.
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
case 'a': // Print as a memory address.
if (MI->getOperand(OpNum).isReg()) {
O << "["
@@ -434,15 +443,18 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
printOperand(MI, OpNum, O);
return false;
case 'y': // Print a VFP single precision register as indexed double.
- // This uses the ordering of the alias table to get the first 'd' register
- // that overlaps the 's' register. Also, s0 is an odd register, hence the
- // odd modulus check below.
if (MI->getOperand(OpNum).isReg()) {
unsigned Reg = MI->getOperand(OpNum).getReg();
const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
- O << ARMInstPrinter::getRegisterName(TRI->getAliasSet(Reg)[0]) <<
- (((Reg % 2) == 1) ? "[0]" : "[1]");
- return false;
+ // Find the 'd' register that has this 's' register as a sub-register,
+ // and determine the lane number.
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
+ if (!ARM::DPRRegClass.contains(*SR))
+ continue;
+ bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg;
+ O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]");
+ return false;
+ }
}
return true;
case 'B': // Bitwise inverse of integer or symbol without a preceding #.
@@ -517,10 +529,23 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return false;
}
- // These modifiers are not yet supported.
+ // This modifier is not yet supported.
case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
- case 'H': // The highest-numbered register of a pair.
return true;
+ case 'H': // The highest-numbered register of a pair.
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (!MO.isReg())
+ return true;
+ const TargetRegisterClass &RC = ARM::GPRRegClass;
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+
+ unsigned RegIdx = TRI->getEncodingValue(MO.getReg());
+ RegIdx |= 1; // The odd register is also the higher-numbered one of a pair.
+
+ unsigned Reg = RC.getRegister(RegIdx);
+ O << ARMInstPrinter::getRegisterName(Reg);
+ return false;
}
}
@@ -934,13 +959,13 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
unsigned JTI = MO1.getIndex();
- // Tag the jump table appropriately for precise disassembly.
- OutStreamer.EmitJumpTable32Region();
-
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
OutStreamer.EmitLabel(JTISymbol);
+ // Mark the jump table as data-in-code.
+ OutStreamer.EmitDataRegion(MCDR_DataRegionJT32);
+
// Emit each entry of the table.
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
@@ -969,6 +994,8 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
OutContext);
OutStreamer.EmitValue(Expr, 4);
}
+ // Mark the end of the jump-table data-in-code region.
+ OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
}
void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
@@ -978,15 +1005,6 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
unsigned JTI = MO1.getIndex();
- // Emit a label for the jump table.
- if (MI->getOpcode() == ARM::t2TBB_JT) {
- OutStreamer.EmitJumpTable8Region();
- } else if (MI->getOpcode() == ARM::t2TBH_JT) {
- OutStreamer.EmitJumpTable16Region();
- } else {
- OutStreamer.EmitJumpTable32Region();
- }
-
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
OutStreamer.EmitLabel(JTISymbol);
@@ -995,10 +1013,15 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
unsigned OffsetWidth = 4;
- if (MI->getOpcode() == ARM::t2TBB_JT)
+ if (MI->getOpcode() == ARM::t2TBB_JT) {
OffsetWidth = 1;
- else if (MI->getOpcode() == ARM::t2TBH_JT)
+ // Mark the jump table as data-in-code.
+ OutStreamer.EmitDataRegion(MCDR_DataRegionJT8);
+ } else if (MI->getOpcode() == ARM::t2TBH_JT) {
OffsetWidth = 2;
+ // Mark the jump table as data-in-code.
+ OutStreamer.EmitDataRegion(MCDR_DataRegionJT16);
+ }
for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
MachineBasicBlock *MBB = JTBBs[i];
@@ -1031,6 +1054,11 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
OutContext);
OutStreamer.EmitValue(Expr, OffsetWidth);
}
+ // Mark the end of the jump-table data-in-code region. 32-bit offsets use
+ // actual branch instructions here, so we don't mark those as a data
+ // region at all.
+ if (OffsetWidth != 4)
+ OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
}
void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
@@ -1121,8 +1149,14 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
assert(SrcReg == ARM::SP &&
"Only stack pointer as a source reg is supported");
for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset;
- i != NumOps; ++i)
- RegList.push_back(MI->getOperand(i).getReg());
+ i != NumOps; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ // Actually, there should never be any impdef stuff here. Skip it
+ // temporarily to work around PR11902.
+ if (MO.isImplicit())
+ continue;
+ RegList.push_back(MO.getReg());
+ }
break;
case ARM::STR_PRE_IMM:
case ARM::STR_PRE_REG:
@@ -1208,8 +1242,11 @@ extern cl::opt<bool> EnableARMEHABI;
#include "ARMGenMCPseudoLowering.inc"
void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- if (MI->getOpcode() != ARM::CONSTPOOL_ENTRY)
- OutStreamer.EmitCodeRegion();
+ // If we just ended a constant pool, mark it as such.
+ if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
+ OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+ InConstantPool = false;
+ }
// Emit unwinding stuff for frame-related instructions
if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup))
@@ -1565,9 +1602,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
- // Mark the constant pool entry as data if we're not already in a data
- // region.
- OutStreamer.EmitDataRegion();
+ // If this is the first entry of the pool, mark it.
+ if (!InConstantPool) {
+ OutStreamer.EmitDataRegion(MCDR_DataRegion);
+ InConstantPool = true;
+ }
+
OutStreamer.EmitLabel(GetCPISymbol(LabelId));
const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index af3f75a..3555e8f5 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -44,9 +44,12 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
/// MachineFunction.
const MachineConstantPool *MCP;
+ /// InConstantPool - Maintain state when emitting a sequence of constant
+ /// pool entries so we can properly mark them as data regions.
+ bool InConstantPool;
public:
explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) {
+ : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL), InConstantPool(false) {
Subtarget = &TM.getSubtarget<ARMSubtarget>();
}
@@ -70,6 +73,7 @@ public:
bool runOnMachineFunction(MachineFunction &F);
virtual void EmitConstantPool() {} // we emit constant pools customly!
+ virtual void EmitFunctionBodyEnd();
virtual void EmitFunctionEntryLabel();
void EmitStartOfAsmFile(Module &M);
void EmitEndOfAsmFile(Module &M);
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index c6280f8..057fd71 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -51,9 +51,9 @@ WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
/// ARM_MLxEntry - Record information about MLA / MLS instructions.
struct ARM_MLxEntry {
- unsigned MLxOpc; // MLA / MLS opcode
- unsigned MulOpc; // Expanded multiplication opcode
- unsigned AddSubOpc; // Expanded add / sub opcode
+ uint16_t MLxOpc; // MLA / MLS opcode
+ uint16_t MulOpc; // Expanded multiplication opcode
+ uint16_t AddSubOpc; // Expanded add / sub opcode
bool NegAcc; // True if the acc is negated before the add / sub.
bool HasLane; // True if instruction has an extra "lane" operand.
};
@@ -795,8 +795,28 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
} else
llvm_unreachable("Unknown reg class!");
break;
+ case 24:
+ if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+ // Use aligned spills if the stack can be realigned.
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+ .addFrameIndex(FI).addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO));
+ } else {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+ AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
case 32:
- if (ARM::QQPRRegClass.hasSubClassEq(RC)) {
+ if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
@@ -868,6 +888,8 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
}
break;
case ARM::VST1q64:
+ case ARM::VST1d64TPseudo:
+ case ARM::VST1d64QPseudo:
if (MI->getOperand(0).isFI() &&
MI->getOperand(2).getSubReg() == 0) {
FrameIndex = MI->getOperand(0).getIndex();
@@ -942,8 +964,28 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
} else
llvm_unreachable("Unknown reg class!");
break;
- case 32:
- if (ARM::QQPRRegClass.hasSubClassEq(RC)) {
+ case 24:
+ if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
+ .addFrameIndex(FI).addImm(16)
+ .addMemOperand(MMO));
+ } else {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO));
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+ if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ MIB.addReg(DestReg, RegState::ImplicitDefine);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 32:
+ if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
.addFrameIndex(FI).addImm(16)
@@ -1016,6 +1058,8 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
}
break;
case ARM::VLD1q64:
+ case ARM::VLD1d64TPseudo:
+ case ARM::VLD1d64QPseudo:
if (MI->getOperand(1).isFI() &&
MI->getOperand(0).getSubReg() == 0) {
FrameIndex = MI->getOperand(1).getIndex();
@@ -1531,11 +1575,11 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
/// This will go away once we can teach tblgen how to set the optional CPSR def
/// operand itself.
struct AddSubFlagsOpcodePair {
- unsigned PseudoOpc;
- unsigned MachineOpc;
+ uint16_t PseudoOpc;
+ uint16_t MachineOpc;
};
-static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
{ARM::ADDSri, ARM::ADDri},
{ARM::ADDSrr, ARM::ADDrr},
{ARM::ADDSrsi, ARM::ADDrsi},
@@ -1563,14 +1607,9 @@ static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
};
unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
- static const int NPairs =
- sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair);
- for (AddSubFlagsOpcodePair *OpcPair = &AddSubFlagsOpcodeMap[0],
- *End = &AddSubFlagsOpcodeMap[NPairs]; OpcPair != End; ++OpcPair) {
- if (OldOpc == OpcPair->PseudoOpc) {
- return OpcPair->MachineOpc;
- }
- }
+ for (unsigned i = 0, e = array_lengthof(AddSubFlagsOpcodeMap); i != e; ++i)
+ if (OldOpc == AddSubFlagsOpcodeMap[i].PseudoOpc)
+ return AddSubFlagsOpcodeMap[i].MachineOpc;
return 0;
}
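
llvm::array_lengthof (from llvm/ADT/STLExtras.h) deduces a static array's element count at compile time, replacing the hand-written sizeof division removed above. A small sketch of the idiom with an illustrative lookup table:

    #include "llvm/ADT/STLExtras.h"

    struct Pair { unsigned short A, B; };
    static const Pair Map[] = { {1, 2}, {3, 4} };

    unsigned lookup(unsigned Key) {
      // array_lengthof deduces the bound from the array type itself.
      for (unsigned i = 0, e = llvm::array_lengthof(Map); i != e; ++i)
        if (Map[i].A == Key)
          return Map[i].B;
      return 0;
    }
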
@@ -1742,20 +1781,33 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
return Offset == 0;
}
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2 if having two register operands, and the value it
+/// compares against in CmpValue. Return true if the comparison instruction
+/// can be analyzed.
bool ARMBaseInstrInfo::
-AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpMask,
- int &CmpValue) const {
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const {
switch (MI->getOpcode()) {
default: break;
case ARM::CMPri:
case ARM::t2CMPri:
SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
CmpMask = ~0;
CmpValue = MI->getOperand(1).getImm();
return true;
+ case ARM::CMPrr:
+ case ARM::t2CMPrr:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = MI->getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
case ARM::TSTri:
case ARM::t2TSTri:
SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
CmpMask = MI->getOperand(1).getImm();
CmpValue = 0;
return true;
@@ -1793,20 +1845,67 @@ static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
return false;
}
-/// OptimizeCompareInstr - Convert the instruction supplying the argument to the
-/// comparison into one that sets the zero bit in the flags register.
-bool ARMBaseInstrInfo::
-OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
- int CmpValue, const MachineRegisterInfo *MRI) const {
- if (CmpValue != 0)
- return false;
+/// getSwappedCondition - Assuming the flags are set by MI(a,b), return
+/// the condition code to use if the instructions are changed so that the
+/// flags are instead set by MI(b,a).
+inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
+ switch (CC) {
+ default: return ARMCC::AL;
+ case ARMCC::EQ: return ARMCC::EQ;
+ case ARMCC::NE: return ARMCC::NE;
+ case ARMCC::HS: return ARMCC::LS;
+ case ARMCC::LO: return ARMCC::HI;
+ case ARMCC::HI: return ARMCC::LO;
+ case ARMCC::LS: return ARMCC::HS;
+ case ARMCC::GE: return ARMCC::LE;
+ case ARMCC::LT: return ARMCC::GT;
+ case ARMCC::GT: return ARMCC::LT;
+ case ARMCC::LE: return ARMCC::GE;
+ }
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// CMPri can be made redundant by SUBri if the operands are the same.
+/// This function can be extended later on.
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ if ((CmpI->getOpcode() == ARM::CMPrr ||
+ CmpI->getOpcode() == ARM::t2CMPrr) &&
+ (OI->getOpcode() == ARM::SUBrr ||
+ OI->getOpcode() == ARM::t2SUBrr) &&
+ ((OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) ||
+ (OI->getOperand(1).getReg() == SrcReg2 &&
+ OI->getOperand(2).getReg() == SrcReg)))
+ return true;
- MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg);
- if (llvm::next(DI) != MRI->def_end())
- // Only support one definition.
- return false;
+ if ((CmpI->getOpcode() == ARM::CMPri ||
+ CmpI->getOpcode() == ARM::t2CMPri) &&
+ (OI->getOpcode() == ARM::SUBri ||
+ OI->getOpcode() == ARM::t2SUBri) &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
- MachineInstr *MI = &*DI;
+/// optimizeCompareInstr - Convert the instruction supplying the argument to
+/// the comparison into one that sets the zero bit in the flags register, and
+/// remove a redundant compare instruction if an earlier instruction can set
+/// the flags in the same way as the compare.
+/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two
+/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
+/// condition code of instructions which use the flags.
+bool ARMBaseInstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+ int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
// Masked compares sometimes use the same register as the corresponding 'and'.
if (CmpMask != ~0) {
@@ -1825,32 +1924,49 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
}
}
- // Conservatively refuse to convert an instruction which isn't in the same BB
- // as the comparison.
- if (MI->getParent() != CmpInstr->getParent())
- return false;
-
- // Check that CPSR isn't set between the comparison instruction and the one we
- // want to change.
- MachineBasicBlock::iterator I = CmpInstr,E = MI, B = MI->getParent()->begin();
+ // Get ready to iterate backward from CmpInstr.
+ MachineBasicBlock::iterator I = CmpInstr, E = MI,
+ B = CmpInstr->getParent()->begin();
// Early exit if CmpInstr is at the beginning of the BB.
if (I == B) return false;
+ // There are two possible candidates which can be changed to set CPSR:
+ // One is MI, the other is a SUB instruction.
+ // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+ // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
+ MachineInstr *Sub = NULL;
+ if (SrcReg2 != 0)
+ // MI is not a candidate for CMPrr.
+ MI = NULL;
+ else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) {
+ // Conservatively refuse to convert an instruction which isn't in the same
+ // BB as the comparison.
+ // For CMPri, we need to check Sub, thus we can't return here.
+ if (CmpInstr->getOpcode() == ARM::CMPri ||
+ CmpInstr->getOpcode() == ARM::t2CMPri)
+ MI = NULL;
+ else
+ return false;
+ }
+
+ // Check that CPSR isn't set between the comparison instruction and the one we
+ // want to change. At the same time, search for Sub.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
--I;
for (; I != E; --I) {
const MachineInstr &Instr = *I;
- for (unsigned IO = 0, EO = Instr.getNumOperands(); IO != EO; ++IO) {
- const MachineOperand &MO = Instr.getOperand(IO);
- if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR))
- return false;
- if (!MO.isReg()) continue;
-
+ if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
+ Instr.readsRegister(ARM::CPSR, TRI))
// This instruction modifies or uses CPSR after the one we want to
// change. We can't do this transformation.
- if (MO.getReg() == ARM::CPSR)
- return false;
+ return false;
+
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+ Sub = &*I;
+ break;
}
if (I == B)
@@ -1858,7 +1974,13 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
return false;
}
- // Set the "zero" bit in CPSR.
+ // Return false if no candidates exist.
+ if (!MI && !Sub)
+ return false;
+
+ // The single candidate is called MI.
+ if (!MI) MI = Sub;
+
switch (MI->getOpcode()) {
default: break;
case ARM::RSBrr:
@@ -1894,13 +2016,17 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
case ARM::EORri:
case ARM::t2EORrr:
case ARM::t2EORri: {
- // Scan forward for the use of CPSR, if it's a conditional code requires
- // checking of V bit, then this is not safe to do. If we can't find the
- // CPSR use (i.e. used in another block), then it's not safe to perform
- // the optimization.
+ // Scan forward for uses of CPSR.
+ // When checking against MI: if a user's condition code requires
+ // checking the V bit, this transformation is not safe to do.
+ // It is safe to remove CmpInstr if CPSR is redefined or killed.
+ // If we reach the end of the basic block, we also need to check
+ // whether CPSR is live-out.
+ SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
+ OperandsToUpdate;
bool isSafe = false;
I = CmpInstr;
- E = MI->getParent()->end();
+ E = CmpInstr->getParent()->end();
while (!isSafe && ++I != E) {
const MachineInstr &Instr = *I;
for (unsigned IO = 0, EO = Instr.getNumOperands();
@@ -1918,28 +2044,56 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
}
// Condition code is after the operand before CPSR.
ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm();
- switch (CC) {
- default:
- isSafe = true;
- break;
- case ARMCC::VS:
- case ARMCC::VC:
- case ARMCC::GE:
- case ARMCC::LT:
- case ARMCC::GT:
- case ARMCC::LE:
- return false;
+ if (Sub) {
+ ARMCC::CondCodes NewCC = getSwappedCondition(CC);
+ if (NewCC == ARMCC::AL)
+ return false;
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
+ // on CMP needs to be updated to be based on SUB.
+ // Push the condition code operands to OperandsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // operands will be modified.
+ if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg)
+ OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)),
+ NewCC));
}
+ else
+ switch (CC) {
+ default:
+ // CPSR can be used multiple times; we should continue.
+ break;
+ case ARMCC::VS:
+ case ARMCC::VC:
+ case ARMCC::GE:
+ case ARMCC::LT:
+ case ARMCC::GT:
+ case ARMCC::LE:
+ return false;
+ }
}
}
- if (!isSafe)
- return false;
+ // If CPSR is neither killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!isSafe) {
+ MachineBasicBlock *MBB = CmpInstr->getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(ARM::CPSR))
+ return false;
+ }
// Toggle the optional operand to CPSR.
MI->getOperand(5).setReg(ARM::CPSR);
MI->getOperand(5).setIsDef(true);
CmpInstr->eraseFromParent();
+
+ // Modify the condition code of operands in OperandsToUpdate.
+ // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+ // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
+ OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
return true;
}
}
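For reference, a sketch of the swapped-condition mapping used above when folding CMP(r2, r1) into an earlier SUB(r1, r2); this hunk only calls getSwappedCondition, so the exact table is an assumption, with ARMCC::AL doubling as the "cannot swap" sentinel that triggers the early return:

    // Hypothetical sketch of getSwappedCondition; the real table lives
    // outside this hunk.
    static ARMCC::CondCodes swappedCondSketch(ARMCC::CondCodes CC) {
      switch (CC) {
      case ARMCC::EQ: return ARMCC::EQ;   // equality is symmetric
      case ARMCC::NE: return ARMCC::NE;
      case ARMCC::GT: return ARMCC::LT;   // signed: r2 > r1  <=>  r1 < r2
      case ARMCC::LT: return ARMCC::GT;
      case ARMCC::GE: return ARMCC::LE;
      case ARMCC::LE: return ARMCC::GE;
      case ARMCC::HI: return ARMCC::LO;   // unsigned counterparts
      case ARMCC::LO: return ARMCC::HI;
      case ARMCC::HS: return ARMCC::LS;
      case ARMCC::LS: return ARMCC::HS;
      default:        return ARMCC::AL;   // V/N-based codes cannot be swapped
      }
    }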
@@ -2071,9 +2225,9 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
const MCInstrDesc &Desc = MI->getDesc();
unsigned Class = Desc.getSchedClass();
- unsigned UOps = ItinData->Itineraries[Class].NumMicroOps;
- if (UOps)
- return UOps;
+ int ItinUOps = ItinData->getNumMicroOps(Class);
+ if (ItinUOps >= 0)
+ return ItinUOps;
unsigned Opc = MI->getOpcode();
switch (Opc) {
@@ -2088,7 +2242,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
//
// On Cortex-A8, each pair of register loads / stores can be scheduled on the
// same cycle. The scheduling for the first load / store must be done
- // separately by assuming the the address is not 64-bit aligned.
+ // separately by assuming the address is not 64-bit aligned.
//
// On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
// is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON
@@ -2147,19 +2301,19 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
return 2;
// 4 registers would be issued: 2, 2.
// 5 registers would be issued: 2, 2, 1.
- UOps = (NumRegs / 2);
+ int A8UOps = (NumRegs / 2);
if (NumRegs % 2)
- ++UOps;
- return UOps;
+ ++A8UOps;
+ return A8UOps;
} else if (Subtarget.isCortexA9()) {
- UOps = (NumRegs / 2);
+ int A9UOps = (NumRegs / 2);
// If there is an odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
if ((NumRegs % 2) ||
!MI->hasOneMemOperand() ||
(*MI->memoperands_begin())->getAlignment() < 8)
- ++UOps;
- return UOps;
+ ++A9UOps;
+ return A9UOps;
} else {
// Assume the worst.
return NumRegs;
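A quick numeric check of the Cortex-A9 rule above, as a minimal sketch with assumed inputs:

    // (#reg / 2) + (#reg % 2), with one extra AGU cycle when the register
    // count is odd or the address is not 64-bit aligned.
    static unsigned a9LoadStoreUOpsSketch(unsigned NumRegs, bool Aligned64) {
      unsigned UOps = NumRegs / 2;
      if ((NumRegs % 2) || !Aligned64)
        ++UOps;
      return UOps;
    }
    // e.g. 4 aligned registers -> 2 micro-ops; 5 aligned registers -> 3.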
@@ -2478,82 +2632,14 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
return II;
}
-int
-ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const {
- if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
- DefMI->isRegSequence() || DefMI->isImplicitDef())
- return 1;
-
- if (!ItinData || ItinData->isEmpty())
- return DefMI->mayLoad() ? 3 : 1;
-
- const MCInstrDesc *DefMCID = &DefMI->getDesc();
- const MCInstrDesc *UseMCID = &UseMI->getDesc();
- const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
- unsigned Reg = DefMO.getReg();
- if (Reg == ARM::CPSR) {
- if (DefMI->getOpcode() == ARM::FMSTAT) {
- // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
- return Subtarget.isCortexA9() ? 1 : 20;
- }
-
- // CPSR set and branch can be paired in the same cycle.
- if (UseMI->isBranch())
- return 0;
-
- // Otherwise it takes the instruction latency (generally one).
- int Latency = getInstrLatency(ItinData, DefMI);
-
- // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to
- // its uses. Instructions which are otherwise scheduled between them may
- // incur a code size penalty (not able to use the CPSR setting 16-bit
- // instructions).
- if (Latency > 0 && Subtarget.isThumb2()) {
- const MachineFunction *MF = DefMI->getParent()->getParent();
- if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize))
- --Latency;
- }
- return Latency;
- }
-
- unsigned DefAlign = DefMI->hasOneMemOperand()
- ? (*DefMI->memoperands_begin())->getAlignment() : 0;
- unsigned UseAlign = UseMI->hasOneMemOperand()
- ? (*UseMI->memoperands_begin())->getAlignment() : 0;
-
- unsigned DefAdj = 0;
- if (DefMI->isBundle()) {
- DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj);
- if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
- DefMI->isRegSequence() || DefMI->isImplicitDef())
- return 1;
- DefMCID = &DefMI->getDesc();
- }
- unsigned UseAdj = 0;
- if (UseMI->isBundle()) {
- unsigned NewUseIdx;
- const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI,
- Reg, NewUseIdx, UseAdj);
- if (NewUseMI) {
- UseMI = NewUseMI;
- UseIdx = NewUseIdx;
- UseMCID = &UseMI->getDesc();
- }
- }
-
- int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign,
- *UseMCID, UseIdx, UseAlign);
- int Adj = DefAdj + UseAdj;
- if (Adj) {
- Latency -= (int)(DefAdj + UseAdj);
- if (Latency < 1)
- return 1;
- }
-
- if (Latency > 1 &&
- (Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
+/// Return the number of cycles to add to (or subtract from) the static
+/// itinerary based on the def opcode and alignment. The caller will ensure
+/// that the adjusted latency is at least one cycle.
+static int adjustDefLatency(const ARMSubtarget &Subtarget,
+ const MachineInstr *DefMI,
+ const MCInstrDesc *DefMCID, unsigned DefAlign) {
+ int Adjust = 0;
+ if (Subtarget.isCortexA8() || Subtarget.isCortexA9()) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID->getOpcode()) {
@@ -2564,7 +2650,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (ShImm == 0 ||
(ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
- --Latency;
+ --Adjust;
break;
}
case ARM::t2LDRs:
@@ -2574,13 +2660,13 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
// Thumb2 mode: lsl only.
unsigned ShAmt = DefMI->getOperand(3).getImm();
if (ShAmt == 0 || ShAmt == 2)
- --Latency;
+ --Adjust;
break;
}
}
}
- if (DefAlign < 8 && Subtarget.isCortexA9())
+ if (DefAlign < 8 && Subtarget.isCortexA9()) {
switch (DefMCID->getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -2689,10 +2775,101 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD4LNq32_UPD:
// If the address is not 64-bit aligned, the latencies of these
// instructions increases by one.
- ++Latency;
+ ++Adjust;
break;
}
+ }
+ return Adjust;
+}
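A minimal sketch of how callers combine the helper's result with the itinerary latency, mirroring the guard used twice later in this patch:

    static int applyDefAdjustSketch(int ItinLatency, int Adj) {
      // Apply the adjustment only when it cannot drive the latency below
      // zero; otherwise keep the itinerary's non-negative value.
      if (Adj >= 0 || ItinLatency > -Adj)
        return ItinLatency + Adj;
      return ItinLatency;
    }
    // e.g. applyDefAdjustSketch(3, -1) == 2; applyDefAdjustSketch(0, -1) == 0.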
+
+
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const {
+ // No operand latency. The caller may fall back to getInstrLatency.
+ if (!ItinData || ItinData->isEmpty())
+ return -1;
+
+ const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+ unsigned Reg = DefMO.getReg();
+ const MCInstrDesc *DefMCID = &DefMI->getDesc();
+ const MCInstrDesc *UseMCID = &UseMI->getDesc();
+
+ unsigned DefAdj = 0;
+ if (DefMI->isBundle()) {
+ DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj);
+ DefMCID = &DefMI->getDesc();
+ }
+ if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
+ DefMI->isRegSequence() || DefMI->isImplicitDef()) {
+ return 1;
+ }
+
+ unsigned UseAdj = 0;
+ if (UseMI->isBundle()) {
+ unsigned NewUseIdx;
+ const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI,
+ Reg, NewUseIdx, UseAdj);
+ if (!NewUseMI)
+ return -1;
+
+ UseMI = NewUseMI;
+ UseIdx = NewUseIdx;
+ UseMCID = &UseMI->getDesc();
+ }
+
+ if (Reg == ARM::CPSR) {
+ if (DefMI->getOpcode() == ARM::FMSTAT) {
+ // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
+ return Subtarget.isCortexA9() ? 1 : 20;
+ }
+
+ // CPSR set and branch can be paired in the same cycle.
+ if (UseMI->isBranch())
+ return 0;
+
+ // Otherwise it takes the instruction latency (generally one).
+ unsigned Latency = getInstrLatency(ItinData, DefMI);
+
+ // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to
+ // its uses. Instructions which are otherwise scheduled between them may
+ // incur a code size penalty (not able to use the CPSR setting 16-bit
+ // instructions).
+ if (Latency > 0 && Subtarget.isThumb2()) {
+ const MachineFunction *MF = DefMI->getParent()->getParent();
+ if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ --Latency;
+ }
+ return Latency;
+ }
+
+ if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit())
+ return -1;
+
+ unsigned DefAlign = DefMI->hasOneMemOperand()
+ ? (*DefMI->memoperands_begin())->getAlignment() : 0;
+ unsigned UseAlign = UseMI->hasOneMemOperand()
+ ? (*UseMI->memoperands_begin())->getAlignment() : 0;
+ // Get the itinerary's latency if possible, and handle variable_ops.
+ int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign,
+ *UseMCID, UseIdx, UseAlign);
+ // Unable to find operand latency. The caller may resort to getInstrLatency.
+ if (Latency < 0)
+ return Latency;
+
+ // Adjust for IT block position.
+ int Adj = DefAdj + UseAdj;
+
+ // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+ Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign);
+ if (Adj >= 0 || (int)Latency > -Adj) {
+ return Latency + Adj;
+ }
+ // Return the itinerary latency, which may be zero but not less than zero.
return Latency;
}
@@ -2892,22 +3069,20 @@ ARMBaseInstrInfo::getOutputLatency(const InstrItineraryData *ItinData,
return 1;
// If the second MI is predicated, then there is an implicit use dependency.
- return getOperandLatency(ItinData, DefMI, DefIdx, DepMI,
- DepMI->getNumOperands());
+ return getInstrLatency(ItinData, DefMI);
}
-int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
- unsigned *PredCost) const {
+unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost) const {
if (MI->isCopyLike() || MI->isInsertSubreg() ||
MI->isRegSequence() || MI->isImplicitDef())
return 1;
- if (!ItinData || ItinData->isEmpty())
- return 1;
-
+ // An instruction scheduler typically runs on unbundled instructions;
+ // however, other passes may query the latency of a bundled instruction.
if (MI->isBundle()) {
- int Latency = 0;
+ unsigned Latency = 0;
MachineBasicBlock::const_instr_iterator I = MI;
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
@@ -2918,15 +3093,33 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
}
const MCInstrDesc &MCID = MI->getDesc();
- unsigned Class = MCID.getSchedClass();
- unsigned UOps = ItinData->Itineraries[Class].NumMicroOps;
- if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)))
+ if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
*PredCost = 1;
- if (UOps)
- return ItinData->getStageLatency(Class);
- return getNumMicroOps(ItinData, MI);
+ }
+ // Be sure to call getStageLatency for an empty itinerary in case it has a
+ // valid MinLatency property.
+ if (!ItinData)
+ return MI->mayLoad() ? 3 : 1;
+
+ unsigned Class = MCID.getSchedClass();
+
+ // For instructions with variable uops, use uops as latency.
+ if (!ItinData->isEmpty() && ItinData->getNumMicroOps(Class) < 0)
+ return getNumMicroOps(ItinData, MI);
+
+ // For the common case, fall back on the itinerary's latency.
+ unsigned Latency = ItinData->getStageLatency(Class);
+
+ // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+ unsigned DefAlign = MI->hasOneMemOperand()
+ ? (*MI->memoperands_begin())->getAlignment() : 0;
+ int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign);
+ if (Adj >= 0 || (int)Latency > -Adj) {
+ return Latency + Adj;
+ }
+ return Latency;
}
int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
@@ -2960,7 +3153,10 @@ hasHighOperandLatency(const InstrItineraryData *ItinData,
return true;
// Hoist VFP / NEON instructions with 4 or higher latency.
- int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+ int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx,
+ /*FindMin=*/false);
+ if (Latency < 0)
+ Latency = getInstrLatency(ItinData, DefMI);
if (Latency <= 3)
return false;
return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON ||
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 2fe8507..1a10a4a 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -186,16 +186,20 @@ public:
return NumCycles == 1;
}
- /// AnalyzeCompare - For a comparison instruction, return the source register
- /// in SrcReg and the value it compares against in CmpValue. Return true if
- /// the comparison instruction can be analyzed.
- virtual bool AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
- int &CmpMask, int &CmpValue) const;
-
- /// OptimizeCompareInstr - Convert the instruction to set the zero flag so
- /// that we can remove a "comparison with zero".
- virtual bool OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
- int CmpMask, int CmpValue,
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if it has two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const;
+
+ /// optimizeCompareInstr - Convert the instruction to set the zero flag so
+ /// that we can remove a "comparison with zero"; Remove a redundant CMP
+ /// instruction if the flags can be updated in the same way by an earlier
+ /// instruction such as SUB.
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const;
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
@@ -249,8 +253,9 @@ private:
const MCInstrDesc &UseMCID,
unsigned UseIdx, unsigned UseAlign) const;
- int getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI, unsigned *PredCost = 0) const;
+ unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost = 0) const;
int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 3907f75..9deb96e 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -62,12 +62,26 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
const uint16_t*
ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- return (STI.isTargetIOS()) ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+ bool ghcCall = false;
+
+ if (MF) {
+ const Function *F = MF->getFunction();
+ ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
+ }
+
+ if (ghcCall) {
+ return CSR_GHC_SaveList;
+ }
+ else {
+ return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+ ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+ }
}
const uint32_t*
ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
- return (STI.isTargetIOS()) ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
+ return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+ ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
}
BitVector ARMBaseRegisterInfo::
@@ -257,8 +271,9 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
}
const TargetRegisterClass *
-ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const {
- return ARM::GPRRegisterClass;
+ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
+ return &ARM::GPRRegClass;
}
const TargetRegisterClass *
@@ -369,7 +384,7 @@ ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC,
};
// We only support even/odd hints for GPR and rGPR.
- if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass)
+ if (RC != &ARM::GPRRegClass && RC != &ARM::rGPRRegClass)
return RC->getRawAllocationOrder(MF);
if (HintType == ARMRI::RegPairEven) {
@@ -712,6 +727,11 @@ requiresRegisterScavenging(const MachineFunction &MF) const {
}
bool ARMBaseRegisterInfo::
+trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ return true;
+}
+
+bool ARMBaseRegisterInfo::
requiresFrameIndexScavenging(const MachineFunction &MF) const {
return true;
}
@@ -932,7 +952,8 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
const MCInstrDesc &MCID = TII.get(ADDriOpc);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this));
+ const MachineFunction &MF = *MBB->getParent();
+ MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
.addFrameIndex(FrameIdx).addImm(Offset));
@@ -1110,7 +1131,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Must be addrmode4/6.
MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false);
else {
- ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass);
+ ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
if (!AFI->isThumbFunction())
emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
Offset, Pred, PredReg, TII);
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index af79351..da29f7e 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -109,7 +109,8 @@ public:
SmallVectorImpl<unsigned> &SubIndices,
unsigned &NewSubIdx) const;
- const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+ const TargetRegisterClass*
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
const TargetRegisterClass*
getCrossCopyRegClass(const TargetRegisterClass *RC) const;
@@ -173,6 +174,8 @@ public:
virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const;
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index b9a2512..bda1517 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -79,6 +79,25 @@ def RetFastCC_ARM_APCS : CallingConv<[
CCDelegateTo<RetCC_ARM_APCS>
]>;
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_APCS_GHC : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+ CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
+ CCIfType<[f32], CCAssignToReg<[S16, S17, S18, S19, S20, S21, S22, S23]>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, SpLim
+ CCIfType<[i32], CCAssignToReg<[R4, R5, R6, R7, R8, R9, R10, R11]>>
+]>;
//===----------------------------------------------------------------------===//
// ARM AAPCS (EABI) Calling Convention, common parts
@@ -113,6 +132,9 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -138,6 +160,9 @@ def RetCC_ARM_AAPCS : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS_VFP : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -171,3 +196,9 @@ def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
// iOS ABI deviates from ARM standard ABI. R9 is not a callee-saved register.
// Also save R7-R4 first to match the stack frame fixed spill areas.
def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
+
+// The GHC set of callee-saved regs is empty, as all those regs are
+// used for passing STG regs around.
+// 'add' is a workaround for not being able to compile an empty list:
+// def CSR_GHC : CalleeSavedRegs<()>;
+def CSR_GHC : CalleeSavedRegs<(add)>;
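The effect on the generated tables, as a sketch (the emitted array name and zero-terminated layout are assumptions, not shown in this patch):

    // TableGen emits each CalleeSavedRegs list as a zero-terminated array;
    // the empty GHC list reduces to just the terminator, so the dispatch in
    // ARMBaseRegisterInfo::getCalleeSavedRegs above yields no CSR spills.
    static const uint16_t CSR_GHC_SaveList_Sketch[] = { 0 };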
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 32ef345..e81b4cc 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -254,7 +254,7 @@ namespace {
emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
return 0;
}
- unsigned Reg = getARMRegisterNumbering(MO.getReg());
+ unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
int32_t Imm12 = MO1.getImm();
uint32_t Binary;
Binary = Imm12 & 0xfff;
@@ -296,7 +296,7 @@ namespace {
emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
return 0;
}
- unsigned Reg = getARMRegisterNumbering(MO.getReg());
+ unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
int32_t Imm12 = MO1.getImm();
// Special value for #-0
@@ -352,6 +352,12 @@ namespace {
void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc,
intptr_t JTBase = 0) const;
+ unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) const;
+ unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) const;
+ unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) const;
+ unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) const;
+ unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) const;
+ unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) const;
};
}
@@ -440,7 +446,7 @@ unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI,
unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
const MachineOperand &MO) const {
if (MO.isReg())
- return getARMRegisterNumbering(MO.getReg());
+ return II->getRegisterInfo().getEncodingValue(MO.getReg());
else if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
else if (MO.isGlobal())
@@ -776,7 +782,7 @@ void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
// Encode Rn which is PC.
- Binary |= getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
+ Binary |= II->getRegisterInfo().getEncodingValue(ARM::PC) << ARMII::RegRnShift;
// Encode the displacement.
Binary |= 1 << ARMII::I_BitShift;
@@ -963,7 +969,7 @@ unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI,
if (Rs) {
// Encode Rs bit[11:8].
assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
- return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
+ return Binary | (II->getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
}
// Encode shift_imm bit[11:7].
@@ -1014,7 +1020,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
else if (ImplicitRd)
// Special handling for implicit use (e.g. PC).
- Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift);
+ Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
if (MCID.Opcode == ARM::MOVi16) {
// Get immediate from MI.
@@ -1064,7 +1070,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
if (!isUnary) {
if (ImplicitRn)
// Special handling for implicit use (e.g. PC).
- Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift);
+ Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
else {
Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift;
++OpIdx;
@@ -1081,7 +1087,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
if (MO.isReg()) {
// Encode register Rm.
- emitWordLE(Binary | getARMRegisterNumbering(MO.getReg()));
+ emitWordLE(Binary | II->getRegisterInfo().getEncodingValue(MO.getReg()));
return;
}
@@ -1124,14 +1130,14 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
// Set first operand
if (ImplicitRd)
// Special handling for implicit use (e.g. PC).
- Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift);
+ Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
else
Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
// Set second operand
if (ImplicitRn)
// Special handling for implicit use (e.g. PC).
- Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift);
+ Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
else
Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
@@ -1158,7 +1164,7 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
Binary |= 1 << ARMII::I_BitShift;
assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
// Set bit[3:0] to the corresponding Rm register
- Binary |= getARMRegisterNumbering(MO2.getReg());
+ Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
// If this instr is in scaled register offset/index instruction, set
// shift_immed(bit[11:7]) and shift(bit[6:5]) fields.
@@ -1202,7 +1208,7 @@ void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
// Set second operand
if (ImplicitRn)
// Special handling for implicit use (e.g. PC).
- Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift);
+ Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
else
Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
@@ -1221,7 +1227,7 @@ void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
// If this instr is in register offset/index encoding, set bit[3:0]
// to the corresponding Rm register.
if (MO2.getReg()) {
- Binary |= getARMRegisterNumbering(MO2.getReg());
+ Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
emitWordLE(Binary);
return;
}
@@ -1287,7 +1293,7 @@ void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || MO.isImplicit())
break;
- unsigned RegNum = getARMRegisterNumbering(MO.getReg());
+ unsigned RegNum = II->getRegisterInfo().getEncodingValue(MO.getReg());
assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
RegNum < 16);
Binary |= 0x1 << RegNum;
@@ -1530,7 +1536,7 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
if (MCID.Opcode == ARM::BX_RET || MCID.Opcode == ARM::MOVPCLR)
// The return register is LR.
- Binary |= getARMRegisterNumbering(ARM::LR);
+ Binary |= II->getRegisterInfo().getEncodingValue(ARM::LR);
else
// otherwise, set the return register
Binary |= getMachineOpValue(MI, 0);
@@ -1538,11 +1544,12 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
emitWordLE(Binary);
}
-static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
+unsigned ARMCodeEmitter::encodeVFPRd(const MachineInstr &MI,
+ unsigned OpIdx) const {
unsigned RegD = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
- bool isSPVFP = ARM::SPRRegisterClass->contains(RegD);
- RegD = getARMRegisterNumbering(RegD);
+ bool isSPVFP = ARM::SPRRegClass.contains(RegD);
+ RegD = II->getRegisterInfo().getEncodingValue(RegD);
if (!isSPVFP)
Binary |= RegD << ARMII::RegRdShift;
else {
@@ -1552,11 +1559,12 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
return Binary;
}
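The SPR branch elided by this hunk splits the register number across two fields; a sketch with the shift parameters assumed:

    // S registers are numbered 0-31: bits [4:1] go into the 4-bit Rd field
    // and bit 0 into the D bit, e.g. S5 (0b00101) -> Rd = 0b0010, D = 1.
    static unsigned encodeSPRSketch(unsigned RegNum, unsigned RdShift,
                                    unsigned DShift) {
      unsigned Binary = ((RegNum & 0x1E) >> 1) << RdShift;
      Binary |= (RegNum & 0x01) << DShift;
      return Binary;
    }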
-static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
+unsigned ARMCodeEmitter::encodeVFPRn(const MachineInstr &MI,
+ unsigned OpIdx) const {
unsigned RegN = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
- bool isSPVFP = ARM::SPRRegisterClass->contains(RegN);
- RegN = getARMRegisterNumbering(RegN);
+ bool isSPVFP = ARM::SPRRegClass.contains(RegN);
+ RegN = II->getRegisterInfo().getEncodingValue(RegN);
if (!isSPVFP)
Binary |= RegN << ARMII::RegRnShift;
else {
@@ -1566,11 +1574,12 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
return Binary;
}
-static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) {
+unsigned ARMCodeEmitter::encodeVFPRm(const MachineInstr &MI,
+ unsigned OpIdx) const {
unsigned RegM = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
- bool isSPVFP = ARM::SPRRegisterClass->contains(RegM);
- RegM = getARMRegisterNumbering(RegM);
+ bool isSPVFP = ARM::SPRRegClass.contains(RegM);
+ RegM = II->getRegisterInfo().getEncodingValue(RegM);
if (!isSPVFP)
Binary |= RegM;
else {
@@ -1757,28 +1766,31 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) {
emitWordLE(Binary);
}
-static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) {
+unsigned ARMCodeEmitter::encodeNEONRd(const MachineInstr &MI,
+ unsigned OpIdx) const {
unsigned RegD = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
- RegD = getARMRegisterNumbering(RegD);
+ RegD = II->getRegisterInfo().getEncodingValue(RegD);
Binary |= (RegD & 0xf) << ARMII::RegRdShift;
Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift;
return Binary;
}
-static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) {
+unsigned ARMCodeEmitter::encodeNEONRn(const MachineInstr &MI,
+ unsigned OpIdx) const {
unsigned RegN = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
- RegN = getARMRegisterNumbering(RegN);
+ RegN = II->getRegisterInfo().getEncodingValue(RegN);
Binary |= (RegN & 0xf) << ARMII::RegRnShift;
Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift;
return Binary;
}
-static unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) {
+unsigned ARMCodeEmitter::encodeNEONRm(const MachineInstr &MI,
+ unsigned OpIdx) const {
unsigned RegM = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
- RegM = getARMRegisterNumbering(RegM);
+ RegM = II->getRegisterInfo().getEncodingValue(RegM);
Binary |= (RegM & 0xf);
Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift;
return Binary;
@@ -1812,7 +1824,7 @@ void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) {
Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
unsigned RegT = MI.getOperand(RegTOpIdx).getReg();
- RegT = getARMRegisterNumbering(RegT);
+ RegT = II->getRegisterInfo().getEncodingValue(RegT);
Binary |= (RegT << ARMII::RegRdShift);
Binary |= encodeNEONRn(MI, RegNOpIdx);
@@ -1841,7 +1853,7 @@ void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) {
Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
unsigned RegT = MI.getOperand(1).getReg();
- RegT = getARMRegisterNumbering(RegT);
+ RegT = II->getRegisterInfo().getEncodingValue(RegT);
Binary |= (RegT << ARMII::RegRdShift);
Binary |= encodeNEONRn(MI, 0);
emitWordLE(Binary);
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index fc35c7c..a953985 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -69,27 +69,6 @@ static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
return 0;
}
-/// WorstCaseAlign - Assuming only the low KnownBits bits in Offset are exact,
-/// add padding such that:
-///
-/// 1. The result is aligned to 1 << LogAlign.
-///
-/// 2. No other value of the unknown bits would require more padding.
-///
-/// This may add more padding than is required to satisfy just one of the
-/// constraints. It is necessary to compute alignment this way to guarantee
-/// that we don't underestimate the padding before an aligned block. If the
-/// real padding before a block is larger than we think, constant pool entries
-/// may go out of range.
-static inline unsigned WorstCaseAlign(unsigned Offset, unsigned LogAlign,
- unsigned KnownBits) {
- // Add the worst possible padding that the unknown bits could cause.
- Offset += UnknownPadding(LogAlign, KnownBits);
-
- // Then align the result.
- return RoundUpToAlignment(Offset, 1u << LogAlign);
-}
-
namespace {
/// ARMConstantIslands - Due to limited PC-relative displacements, ARM
/// requires constant pool entries to be scattered among the instructions
@@ -109,7 +88,12 @@ namespace {
/// Offset - Distance from the beginning of the function to the beginning
/// of this basic block.
///
- /// The offset is always aligned as required by the basic block.
+ /// Offsets are computed assuming worst case padding before an aligned
+ /// block. This means that subtracting basic block offsets always gives a
+ /// conservative estimate of the real distance which may be smaller.
+ ///
+ /// Because worst case padding is used, the computed offset of an aligned
+ /// block may not actually be aligned.
unsigned Offset;
/// Size - Size of the basic block in bytes. If the block contains
@@ -140,7 +124,12 @@ namespace {
/// This number should be used to predict worst case padding when
/// splitting the block.
unsigned internalKnownBits() const {
- return Unalign ? Unalign : KnownBits;
+ unsigned Bits = Unalign ? Unalign : KnownBits;
+ // If the block size isn't a multiple of the known alignment, assume the
+ // worst case padding.
+ if (Size & ((1u << Bits) - 1))
+ Bits = CountTrailingZeros_32(Size);
+ return Bits;
}
/// Compute the offset immediately following this block. If LogAlign is
@@ -152,7 +141,7 @@ namespace {
if (!LA)
return PO;
// Add alignment padding from the terminator.
- return WorstCaseAlign(PO, LA, internalKnownBits());
+ return PO + UnknownPadding(LA, internalKnownBits());
}
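A worked example of the padding term (values assumed; uses the UnknownPadding helper from earlier in this file):

    // With LA = 2 (4-byte alignment) and only one known low offset bit, up
    // to (1 << 2) - (1 << 1) = 2 bytes of padding may still be inserted.
    unsigned Pad = UnknownPadding(/*LogAlign=*/2, /*KnownBits=*/1); // == 2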
/// Compute the number of known low bits of postOffset. If this block
@@ -342,9 +331,7 @@ void ARMConstantIslands::verify() {
for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
MBBI != E; ++MBBI) {
MachineBasicBlock *MBB = MBBI;
- unsigned Align = MBB->getAlignment();
unsigned MBBId = MBB->getNumber();
- assert(BBInfo[MBBId].Offset % (1u << Align) == 0);
assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
}
DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
@@ -428,7 +415,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// ARM and Thumb2 functions need to be 4-byte aligned.
if (!isThumb1)
- MF->EnsureAlignment(2); // 2 = log2(4)
+ MF->ensureAlignment(2); // 2 = log2(4)
// Perform the initial placement of the constant pool entries. To start with,
// we put them all at the end of the function.
@@ -529,7 +516,7 @@ ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
// The function needs to be as aligned as the basic blocks. The linker may
// move functions around based on their alignment.
- MF->EnsureAlignment(BB->getAlignment());
+ MF->ensureAlignment(BB->getAlignment());
// Order the entries in BB by descending alignment. That ensures correct
// alignment of all entries as long as BB is sufficiently aligned. Keep
@@ -828,7 +815,7 @@ void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
// tBR_JTr contains a .align 2 directive.
if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
BBI.PostAlign = 2;
- MBB->getParent()->EnsureAlignment(2);
+ MBB->getParent()->ensureAlignment(2);
}
}
@@ -1045,7 +1032,6 @@ bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
MachineInstr *CPEMI, unsigned MaxDisp,
bool NegOk, bool DoDump) {
unsigned CPEOffset = getOffsetOf(CPEMI);
- assert(CPEOffset % 4 == 0 && "Misaligned CPE");
if (DoDump) {
DEBUG({
@@ -1256,11 +1242,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
if (BBHasFallthrough(UserMBB)) {
// Size of branch to insert.
unsigned Delta = isThumb1 ? 2 : 4;
- // End of UserBlock after adding a branch.
- unsigned UserBlockEnd = UserBBI.postOffset() + Delta;
// Compute the offset where the CPE will begin.
- unsigned CPEOffset = WorstCaseAlign(UserBlockEnd, CPELogAlign,
- UserBBI.postKnownBits());
+ unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
@@ -1299,20 +1282,16 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// up the insertion point.
// Try to split the block so it's fully aligned. Compute the latest split
- // point where we can add a 4-byte branch instruction, and then
- // WorstCaseAlign to LogAlign.
+ // point where we can add a 4-byte branch instruction, and then align to
+ // LogAlign, which is the largest possible alignment in the function.
unsigned LogAlign = MF->getAlignment();
assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
unsigned KnownBits = UserBBI.internalKnownBits();
unsigned UPad = UnknownPadding(LogAlign, KnownBits);
- unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+ unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
DEBUG(dbgs() << format("Split in middle of big block before %#x",
BaseInsertOffset));
- // Account for alignment and unknown padding.
- BaseInsertOffset &= ~((1u << LogAlign) - 1);
- BaseInsertOffset -= UPad;
-
// The 4 in the following is for the unconditional branch we'll be inserting
// (allows for long branch on Thumb1). Alignment of the island is handled
// inside isOffsetInRange.
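A numeric sketch of the new split-point computation (all values assumed):

    // A user at offset 0x100 with a 0x1000-byte range and 2 bytes of
    // possible unknown padding must start the island no later than:
    unsigned BaseInsertOffsetSketch = 0x100 + 0x1000 - 2; // == 0x10FE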
@@ -1327,11 +1306,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// pool entries following this block; only the last one is in the water list.
// Back past any possible branches (allow for a conditional and a maximally
// long unconditional).
- if (BaseInsertOffset >= BBInfo[UserMBB->getNumber()+1].Offset)
- BaseInsertOffset = BBInfo[UserMBB->getNumber()+1].Offset -
- (isThumb1 ? 6 : 8);
- unsigned EndInsertOffset =
- WorstCaseAlign(BaseInsertOffset + 4, LogAlign, KnownBits) +
+ if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+ BaseInsertOffset = UserBBI.postOffset() - UPad - 8;
+ DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ }
+ unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
CPEMI->getOperand(2).getImm();
MachineBasicBlock::iterator MI = UserMI;
++MI;
@@ -1342,6 +1321,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
Offset < BaseInsertOffset;
Offset += TII->GetInstSizeInBytes(MI),
MI = llvm::next(MI)) {
+ assert(MI != UserMBB->end() && "Fell off end of block");
if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
@@ -1353,9 +1333,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// reused within the block, but it doesn't matter much. Also assume CPEs
// are added in order with alignment padding. We may eventually be able
// to pack the aligned CPEs better.
- EndInsertOffset = RoundUpToAlignment(EndInsertOffset,
- 1u << getCPELogAlign(U.CPEMI)) +
- U.CPEMI->getOperand(2).getImm();
+ EndInsertOffset += U.CPEMI->getOperand(2).getImm();
CPUIndex++;
}
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 5fc0360..15bb32e 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -459,22 +459,23 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
MIB.addOperand(MI.getOperand(OpIdx++));
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+ bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
- MIB.addReg(D0);
+ MIB.addReg(D0, getUndefRegState(SrcIsUndef));
if (NumRegs > 1 && TableEntry->copyAllListRegs)
- MIB.addReg(D1);
+ MIB.addReg(D1, getUndefRegState(SrcIsUndef));
if (NumRegs > 2 && TableEntry->copyAllListRegs)
- MIB.addReg(D2);
+ MIB.addReg(D2, getUndefRegState(SrcIsUndef));
if (NumRegs > 3 && TableEntry->copyAllListRegs)
- MIB.addReg(D3);
+ MIB.addReg(D3, getUndefRegState(SrcIsUndef));
// Copy the predicate operands.
MIB.addOperand(MI.getOperand(OpIdx++));
MIB.addOperand(MI.getOperand(OpIdx++));
- if (SrcIsKill) // Add an implicit kill for the super-reg.
+ if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
MIB->addRegisterKilled(SrcReg, TRI, true);
TransferImpOps(MI, MIB, MIB);
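A sketch of the flag propagation added above:

    // Each D sub-register of an undef Q/QQ source is added with an undef
    // flag, and the implicit kill of the super-register is suppressed,
    // since killing an undef value would be meaningless.
    static unsigned subRegFlagsSketch(bool SrcIsUndef) {
      return getUndefRegState(SrcIsUndef);
    }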
@@ -925,7 +926,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
if (isARM) {
AddDefaultPred(MIB3);
if (Opcode == ARM::MOV_ga_pcrel_ldr)
- MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
TransferImpOps(MI, MIB1, MIB3);
MI.eraseFromParent();
@@ -1008,7 +1009,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
unsigned OpIdx = 0;
unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned Lane = getARMRegisterNumbering(SrcReg) & 1;
+ unsigned Lane = TRI->getEncodingValue(SrcReg) & 1;
unsigned DReg = TRI->getMatchingSuperReg(SrcReg,
Lane & 1 ? ARM::ssub_1 : ARM::ssub_0,
&ARM::DPR_VFP2RegClass);
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 2e1eaca..57f8116 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -47,11 +47,6 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
-static cl::opt<bool>
-DisableARMFastISel("disable-arm-fast-isel",
- cl::desc("Turn off experimental ARM fast-isel support"),
- cl::init(false), cl::Hidden);
-
extern cl::opt<bool> EnableARMLongCalls;
namespace {
@@ -92,8 +87,9 @@ class ARMFastISel : public FastISel {
LLVMContext *Context;
public:
- explicit ARMFastISel(FunctionLoweringInfo &funcInfo)
- : FastISel(funcInfo),
+ explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo),
TM(funcInfo.MF->getTarget()),
TII(*TM.getInstrInfo()),
TLI(*TM.getTargetLowering()) {
@@ -172,6 +168,7 @@ class ARMFastISel : public FastISel {
bool SelectRet(const Instruction *I);
bool SelectTrunc(const Instruction *I);
bool SelectIntExt(const Instruction *I);
+ bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy);
// Utility routines.
private:
@@ -182,7 +179,6 @@ class ARMFastISel : public FastISel {
bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
unsigned Alignment = 0, bool isZExt = true,
bool allocReg = true);
-
bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
unsigned Alignment = 0);
bool ARMComputeAddress(const Value *Obj, Address &Addr);
@@ -195,21 +191,25 @@ class ARMFastISel : public FastISel {
unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT);
unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg);
unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg);
- unsigned ARMSelectCallOp(const GlobalValue *GV);
+ unsigned ARMSelectCallOp(bool UseReg);
// Call handling routines.
private:
- CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return);
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
+ bool Return,
+ bool isVarArg);
bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
SmallVectorImpl<unsigned> &ArgRegs,
SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
SmallVectorImpl<unsigned> &RegArgs,
CallingConv::ID CC,
- unsigned &NumBytes);
+ unsigned &NumBytes,
+ bool isVarArg);
+ unsigned getLibcallReg(const Twine &Name);
bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
const Instruction *I, CallingConv::ID CC,
- unsigned &NumBytes);
+ unsigned &NumBytes, bool isVarArg);
bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
// OptionalDef handling routines.
@@ -719,7 +719,7 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
MVT VT;
- if (!isLoadTypeLegal(AI->getType(), VT)) return false;
+ if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
DenseMap<const AllocaInst*, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
@@ -910,8 +910,9 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3) {
// put the alloca address into a register, set the base type back to
// register and continue. This should almost never happen.
if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
- const TargetRegisterClass *RC = isThumb2 ? ARM::tGPRRegisterClass
- : ARM::GPRRegisterClass;
+ const TargetRegisterClass *RC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned ResultReg = createResultReg(RC);
unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
@@ -1005,7 +1006,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
useAM3 = true;
}
}
- RC = ARM::GPRRegisterClass;
+ RC = &ARM::GPRRegClass;
break;
case MVT::i16:
if (isThumb2) {
@@ -1017,7 +1018,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
useAM3 = true;
}
- RC = ARM::GPRRegisterClass;
+ RC = &ARM::GPRRegClass;
break;
case MVT::i32:
if (isThumb2) {
@@ -1028,7 +1029,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
} else {
Opc = ARM::LDRi12;
}
- RC = ARM::GPRRegisterClass;
+ RC = &ARM::GPRRegClass;
break;
case MVT::f32:
if (!Subtarget->hasVFP2()) return false;
@@ -1037,7 +1038,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
needVMOV = true;
VT = MVT::i32;
Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
- RC = ARM::GPRRegisterClass;
+ RC = &ARM::GPRRegClass;
} else {
Opc = ARM::VLDRS;
RC = TLI.getRegClassFor(VT);
@@ -1106,8 +1107,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
// This is mostly going to be Neon/vector support.
default: return false;
case MVT::i1: {
- unsigned Res = createResultReg(isThumb2 ? ARM::tGPRRegisterClass :
- ARM::GPRRegisterClass);
+ unsigned Res = createResultReg(isThumb2 ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(Opc), Res)
@@ -1358,7 +1360,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
.addReg(AddrReg));
- return true;
+ return true;
}
bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
@@ -1423,12 +1425,12 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
if (!UseImm)
CmpOpc = ARM::t2CMPrr;
else
- CmpOpc = isNegativeImm ? ARM::t2CMNzri : ARM::t2CMPri;
+ CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri;
} else {
if (!UseImm)
CmpOpc = ARM::CMPrr;
else
- CmpOpc = isNegativeImm ? ARM::CMNzri : ARM::CMPri;
+ CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri;
}
break;
}
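The opcode selection above in one place, as a sketch; the CMN forms exist because a negative value is not a valid modified immediate, and cmn r0, #5 sets the same flags as comparing r0 against -5:

    static unsigned pickCmpOpcSketch(bool isThumb2, bool UseImm,
                                     bool isNegativeImm) {
      if (isThumb2)
        return !UseImm ? ARM::t2CMPrr
                       : (isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri);
      return !UseImm ? ARM::CMPrr
                     : (isNegativeImm ? ARM::CMNri : ARM::CMPri);
    }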
@@ -1491,8 +1493,9 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
// Now set a register based on the comparison. Explicitly set the predicates
// here.
unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
- const TargetRegisterClass *RC = isThumb2 ? ARM::rGPRRegisterClass
- : ARM::GPRRegisterClass;
+ const TargetRegisterClass *RC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned DestReg = createResultReg(RC);
Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
unsigned ZeroReg = TargetMaterializeConstant(Zero);
@@ -1516,7 +1519,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
unsigned Op = getRegForValue(V);
if (Op == 0) return false;
- unsigned Result = createResultReg(ARM::DPRRegisterClass);
+ unsigned Result = createResultReg(&ARM::DPRRegClass);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(ARM::VCVTDS), Result)
.addReg(Op));
@@ -1535,7 +1538,7 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
unsigned Op = getRegForValue(V);
if (Op == 0) return false;
- unsigned Result = createResultReg(ARM::SPRRegisterClass);
+ unsigned Result = createResultReg(&ARM::SPRRegClass);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(ARM::VCVTSD), Result)
.addReg(Op));
@@ -1736,7 +1739,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
// type and the target independent selector doesn't know how to handle it.
if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
return false;
-
+
unsigned Opc;
switch (ISDOpcode) {
default: return false;
@@ -1809,10 +1812,11 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
// Call Handling Code
-// This is largely taken directly from CCAssignFnForNode - we don't support
-// varargs in FastISel so that part has been removed.
+// This is largely taken directly from CCAssignFnForNode.
// TODO: We may not support all of this.
-CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) {
+CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
+ bool Return,
+ bool isVarArg) {
switch (CC) {
default:
llvm_unreachable("Unsupported calling convention");
@@ -1825,18 +1829,26 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) {
// Use target triple & subtarget features to do actual dispatch.
if (Subtarget->isAAPCS_ABI()) {
if (Subtarget->hasVFP2() &&
- TM.Options.FloatABIType == FloatABI::Hard)
+ TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
else
return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
} else
return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
case CallingConv::ARM_AAPCS_VFP:
- return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ if (!isVarArg)
+ return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ // Fall through to the soft-float variant; variadic functions don't
+ // use the hard floating-point ABI.
case CallingConv::ARM_AAPCS:
return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
case CallingConv::ARM_APCS:
return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ case CallingConv::GHC:
+ if (Return)
+ llvm_unreachable("Can't return in GHC call convention");
+ else
+ return CC_ARM_APCS_GHC;
}
}
@@ -1846,10 +1858,12 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
SmallVectorImpl<unsigned> &RegArgs,
CallingConv::ID CC,
- unsigned &NumBytes) {
+ unsigned &NumBytes,
+ bool isVarArg) {
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
- CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false));
+ CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
+ CCAssignFnForCall(CC, false, isVarArg));
// Check that we can handle all of the arguments. If we can't, then bail out
// now before we add code to the MBB.
@@ -1981,7 +1995,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
const Instruction *I, CallingConv::ID CC,
- unsigned &NumBytes) {
+ unsigned &NumBytes, bool isVarArg) {
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
@@ -1991,8 +2005,8 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
// Now the return value.
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true));
+ CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
// Copy all of the result registers out of their specified physreg.
if (RVLocs.size() == 2 && RetVT == MVT::f64) {
@@ -2041,9 +2055,6 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
- if (F.isVarArg())
- return false;
-
CallingConv::ID CC = F.getCallingConv();
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
@@ -2053,7 +2064,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext());
- CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */));
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */,
+ F.isVarArg()));
const Value *RV = Ret->getOperand(0);
unsigned Reg = getRegForValue(RV);
@@ -2110,12 +2122,17 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
return true;
}
-unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) {
- if (isThumb2) {
- return ARM::tBL;
- } else {
- return ARM::BL;
- }
+unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
+ if (UseReg)
+ return isThumb2 ? ARM::tBLXr : ARM::BLX;
+ else
+ return isThumb2 ? ARM::tBL : ARM::BL;
+}
+
+unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
+ GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
+ GlobalValue::ExternalLinkage, 0, Name);
+ return ARMMaterializeGV(GV, TLI.getValueType(GV->getType()));
}
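A usage sketch for the new helper (the libcall name is a real AEABI routine; the surrounding control flow is condensed and assumed):

    // Under long-calls the callee address is materialized into a register
    // and the call is emitted as BLX/tBLXr instead of BL to a symbol.
    unsigned CalleeReg = getLibcallReg("__aeabi_idiv");
    if (CalleeReg == 0)
      return false; // could not materialize the callee address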
// A quick function that will emit a call for a named libcall in F with the
@@ -2136,8 +2153,14 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
else if (!isTypeLegal(RetTy, RetVT))
return false;
- // TODO: For now if we have long calls specified we don't handle the call.
- if (EnableARMLongCalls) return false;
+ // Can't handle non-double multi-reg retvals.
+ if (RetVT != MVT::isVoid && RetVT != MVT::i32) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false));
+ if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+ return false;
+ }
// Set up the argument vectors.
SmallVector<Value*, 8> Args;
@@ -2170,23 +2193,36 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
// Handle the arguments now that we've gotten them.
SmallVector<unsigned, 4> RegArgs;
unsigned NumBytes;
- if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+ if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+ RegArgs, CC, NumBytes, false))
return false;
+ unsigned CalleeReg = 0;
+ if (EnableARMLongCalls) {
+ CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
+ if (CalleeReg == 0) return false;
+ }
+
// Issue the call.
- MachineInstrBuilder MIB;
- unsigned CallOpc = ARMSelectCallOp(NULL);
- if (isThumb2)
- // Explicitly adding the predicate here.
- MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(CallOpc)))
- .addExternalSymbol(TLI.getLibcallName(Call));
- else
+ unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(CallOpc));
+ if (isThumb2) {
// Explicitly adding the predicate here.
- MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(CallOpc))
- .addExternalSymbol(TLI.getLibcallName(Call)));
+ AddDefaultPred(MIB);
+ if (EnableARMLongCalls)
+ MIB.addReg(CalleeReg);
+ else
+ MIB.addExternalSymbol(TLI.getLibcallName(Call));
+ } else {
+ if (EnableARMLongCalls)
+ MIB.addReg(CalleeReg);
+ else
+ MIB.addExternalSymbol(TLI.getLibcallName(Call));
+ // Explicitly adding the predicate here.
+ AddDefaultPred(MIB);
+ }
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
MIB.addReg(RegArgs[i]);
@@ -2197,7 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
// Finish off the call including any return values.
SmallVector<unsigned, 4> UsedRegs;
- if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false;
+ if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false;
// Set all unused physreg defs as dead.
static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
@@ -2213,22 +2249,15 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// Can't handle inline asm.
if (isa<InlineAsm>(Callee)) return false;
- // Only handle global variable Callees.
- const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
- if (!GV)
- return false;
-
// Check the calling convention.
ImmutableCallSite CS(CI);
CallingConv::ID CC = CS.getCallingConv();
// TODO: Avoid some calling conventions?
- // Let SDISel handle vararg functions.
PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
FunctionType *FTy = cast<FunctionType>(PT->getElementType());
- if (FTy->isVarArg())
- return false;
+ bool isVarArg = FTy->isVarArg();
// Handle *simple* calls for now.
Type *RetTy = I->getType();
@@ -2239,8 +2268,15 @@ bool ARMFastISel::SelectCall(const Instruction *I,
RetVT != MVT::i8 && RetVT != MVT::i1)
return false;
- // TODO: For now if we have long calls specified we don't handle the call.
- if (EnableARMLongCalls) return false;
+ // Can't handle non-double multi-reg retvals.
+ if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 &&
+ RetVT != MVT::i16 && RetVT != MVT::i32) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+ if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+ return false;
+ }
// Set up the argument vectors.
SmallVector<Value*, 8> Args;
@@ -2295,33 +2331,49 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// Handle the arguments now that we've gotten them.
SmallVector<unsigned, 4> RegArgs;
unsigned NumBytes;
- if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+ if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+ RegArgs, CC, NumBytes, isVarArg))
return false;
+ bool UseReg = false;
+ const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+ if (!GV || EnableARMLongCalls) UseReg = true;
+
+ unsigned CalleeReg = 0;
+ if (UseReg) {
+ if (IntrMemName)
+ CalleeReg = getLibcallReg(IntrMemName);
+ else
+ CalleeReg = getRegForValue(Callee);
+
+ if (CalleeReg == 0) return false;
+ }
+
// Issue the call.
- MachineInstrBuilder MIB;
- unsigned CallOpc = ARMSelectCallOp(GV);
- // Explicitly adding the predicate here.
+ unsigned CallOpc = ARMSelectCallOp(UseReg);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(CallOpc));
if(isThumb2) {
// Explicitly adding the predicate here.
- MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(CallOpc)));
- if (!IntrMemName)
+ AddDefaultPred(MIB);
+ if (UseReg)
+ MIB.addReg(CalleeReg);
+ else if (!IntrMemName)
MIB.addGlobalAddress(GV, 0, 0);
- else
+ else
MIB.addExternalSymbol(IntrMemName, 0);
} else {
- if (!IntrMemName)
- // Explicitly adding the predicate here.
- MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(CallOpc))
- .addGlobalAddress(GV, 0, 0));
+ if (UseReg)
+ MIB.addReg(CalleeReg);
+ else if (!IntrMemName)
+ MIB.addGlobalAddress(GV, 0, 0);
else
- MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(CallOpc))
- .addExternalSymbol(IntrMemName, 0));
+ MIB.addExternalSymbol(IntrMemName, 0);
+
+ // Explicitly adding the predicate here.
+ AddDefaultPred(MIB);
}
-
+
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
MIB.addReg(RegArgs[i]);
@@ -2332,7 +2384,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// Finish off the call including any return values.
SmallVector<unsigned, 4> UsedRegs;
- if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false;
+ if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg))
+ return false;
// Set all unused physreg defs as dead.
static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
@@ -2383,6 +2436,42 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
// FIXME: Handle more intrinsics.
switch (I.getIntrinsicID()) {
default: return false;
+ case Intrinsic::frameaddress: {
+ MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ unsigned LdrOpc;
+ const TargetRegisterClass *RC;
+ if (isThumb2) {
+ LdrOpc = ARM::t2LDRi12;
+ RC = (const TargetRegisterClass*)&ARM::tGPRRegClass;
+ } else {
+ LdrOpc = ARM::LDRi12;
+ RC = (const TargetRegisterClass*)&ARM::GPRRegClass;
+ }
+
+ const ARMBaseRegisterInfo *RegInfo =
+ static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo());
+ unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ unsigned SrcReg = FramePtr;
+
+ // Recursively load the frame address:
+ //   ldr r0, [fp]
+ //   ldr r0, [r0]
+ //   ldr r0, [r0]
+ //   ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
+ while (Depth--) {
+ DestReg = createResultReg(RC);
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(LdrOpc), DestReg)
+ .addReg(SrcReg).addImm(0));
+ SrcReg = DestReg;
+ }
+ UpdateValueMap(&I, SrcReg);
+ return true;
+ }
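At the source level this case serves __builtin_frame_address, which Clang lowers to llvm.frameaddress; each level of depth becomes one of the chained loads shown in the comment, and nonzero depths are only meaningful when every intervening frame keeps a frame pointer. Illustrative use (not part of the patch):

    // Depth 0 is the current frame; depth 2 walks two saved frame
    // pointers, i.e. two chained "ldr rN, [fp]"-style loads.
    void *grandparentFrame() {
      return __builtin_frame_address(2);
    }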
case Intrinsic::memcpy:
case Intrinsic::memmove: {
const MemTransferInst &MTI = cast<MemTransferInst>(I);
@@ -2406,10 +2495,10 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
return true;
}
}
-
+
if (!MTI.getLength()->getType()->isIntegerTy(32))
return false;
-
+
if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
return false;
@@ -2421,20 +2510,24 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
// Don't handle volatile.
if (MSI.isVolatile())
return false;
-
+
if (!MSI.getLength()->getType()->isIntegerTy(32))
return false;
-
+
if (MSI.getDestAddressSpace() > 255)
return false;
-
+
return SelectCall(&I, "memset");
}
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::TRAP));
+ return true;
+ }
}
}
bool ARMFastISel::SelectTrunc(const Instruction *I) {
- // The high bits for a type smaller than the register size are assumed to be
+ // The high bits for a type smaller than the register size are assumed to be
// undefined.
Value *Op = I->getOperand(0);
@@ -2522,6 +2615,61 @@ bool ARMFastISel::SelectIntExt(const Instruction *I) {
return true;
}
+bool ARMFastISel::SelectShift(const Instruction *I,
+ ARM_AM::ShiftOpc ShiftTy) {
+ // Thumb2 mode is handled by the target-independent selector
+ // or by SelectionDAG isel.
+ if (isThumb2)
+ return false;
+
+ // Only handle i32 for now.
+ EVT DestVT = TLI.getValueType(I->getType(), true);
+ if (DestVT != MVT::i32)
+ return false;
+
+ unsigned Opc = ARM::MOVsr;
+ unsigned ShiftImm;
+ Value *Src2Value = I->getOperand(1);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Src2Value)) {
+ ShiftImm = CI->getZExtValue();
+
+ // Fall back to SelectionDAG isel if the shift amount
+ // is zero or not smaller than the width of the value type.
+ if (ShiftImm == 0 || ShiftImm >= 32)
+ return false;
+
+ Opc = ARM::MOVsi;
+ }
+
+ Value *Src1Value = I->getOperand(0);
+ unsigned Reg1 = getRegForValue(Src1Value);
+ if (Reg1 == 0) return false;
+
+ unsigned Reg2;
+ if (Opc == ARM::MOVsr) {
+ Reg2 = getRegForValue(Src2Value);
+ if (Reg2 == 0) return false;
+ }
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+ if (ResultReg == 0) return false;
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Opc), ResultReg)
+ .addReg(Reg1);
+
+ if (Opc == ARM::MOVsi)
+ MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, ShiftImm));
+ else if (Opc == ARM::MOVsr) {
+ MIB.addReg(Reg2);
+ MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, 0));
+ }
+
+ AddOptionalDefs(MIB);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
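SelectShift covers the two ARM-mode encodings of a 32-bit shift: a constant amount folds into the shifter operand of MOVsi, while a variable amount goes through MOVsr. Roughly, with the expected ARM output in comments (illustrative, not part of the patch):

    unsigned shiftByConst(unsigned a) {
      return a << 3;   // MOVsi: mov r0, r0, lsl #3
    }
    unsigned shiftByReg(unsigned a, unsigned b) {
      return a << b;   // MOVsr: mov r0, r0, lsl r1
    }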
// TODO: SoftFP support.
bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
@@ -2582,6 +2730,12 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
case Instruction::ZExt:
case Instruction::SExt:
return SelectIntExt(I);
+ case Instruction::Shl:
+ return SelectShift(I, ARM_AM::lsl);
+ case Instruction::LShr:
+ return SelectShift(I, ARM_AM::lsr);
+ case Instruction::AShr:
+ return SelectShift(I, ARM_AM::asr);
default: break;
}
return false;
@@ -2625,7 +2779,7 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
// See if we can handle this address.
Address Addr;
if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
-
+
unsigned ResultReg = MI->getOperand(0).getReg();
if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
return false;
@@ -2634,15 +2788,15 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
}
namespace llvm {
- FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo) {
+ FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
// Completely untested on non-iOS.
const TargetMachine &TM = funcInfo.MF->getTarget();
// Darwin and thumb1 only for now.
const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
- if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only() &&
- !DisableARMFastISel)
- return new ARMFastISel(funcInfo);
+ if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only())
+ return new ARMFastISel(funcInfo, libInfo);
return 0;
}
}
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 402ecb0..aee72d2 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -15,6 +15,8 @@
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMMachineFunctionInfo.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Function.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -151,6 +153,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
int FramePtrSpillFI = 0;
int D8SpillFI = 0;
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
// Allocate the vararg register save area. This is not counted in NumBytes.
if (VARegSaveSize)
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize,
@@ -354,6 +360,10 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
int NumBytes = (int)MFI->getStackSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
if (!AFI->hasStackFrame()) {
if (NumBytes != 0)
emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
@@ -790,7 +800,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// The writeback is only needed when emitting two vst1.64 instructions.
if (NumAlignedDPRCS2Regs >= 6) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
- ARM::QQPRRegisterClass);
+ &ARM::QQPRRegClass);
MBB.addLiveIn(SupReg);
AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed),
ARM::R4)
@@ -808,7 +818,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// 16-byte aligned vst1.64 with 4 d-regs, no writeback.
if (NumAlignedDPRCS2Regs >= 4) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
- ARM::QQPRRegisterClass);
+ &ARM::QQPRRegClass);
MBB.addLiveIn(SupReg);
AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
.addReg(ARM::R4).addImm(16).addReg(NextReg)
@@ -820,7 +830,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// 16-byte aligned vst1.64 with 2 d-regs.
if (NumAlignedDPRCS2Regs >= 2) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
- ARM::QPRRegisterClass);
+ &ARM::QPRRegClass);
MBB.addLiveIn(SupReg);
AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
.addReg(ARM::R4).addImm(16).addReg(SupReg));
@@ -908,7 +918,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
// 16-byte aligned vld1.64 with 4 d-regs and writeback.
if (NumAlignedDPRCS2Regs >= 6) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
- ARM::QQPRRegisterClass);
+ &ARM::QQPRRegClass);
AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
.addReg(ARM::R4, RegState::Define)
.addReg(ARM::R4, RegState::Kill).addImm(16)
@@ -924,7 +934,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
// 16-byte aligned vld1.64 with 4 d-regs, no writeback.
if (NumAlignedDPRCS2Regs >= 4) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
- ARM::QQPRRegisterClass);
+ &ARM::QQPRRegClass);
AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
.addReg(ARM::R4).addImm(16)
.addReg(SupReg, RegState::ImplicitDefine));
@@ -935,7 +945,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
// 16-byte aligned vld1.64 with 2 d-regs.
if (NumAlignedDPRCS2Regs >= 2) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
- ARM::QPRRegisterClass);
+ &ARM::QPRRegClass);
AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
.addReg(ARM::R4).addImm(16));
NextReg += 2;
@@ -1244,7 +1254,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
CanEliminateFrame = false;
}
- if (!ARM::GPRRegisterClass->contains(Reg))
+ if (!ARM::GPRRegClass.contains(Reg))
continue;
if (Spilled) {
@@ -1404,7 +1414,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
} else if (!AFI->isThumb1OnlyFunction()) {
// note: Thumb1 functions spill to R12, not the stack. Reserve a slot
// closest to SP or frame pointer.
- const TargetRegisterClass *RC = ARM::GPRRegisterClass;
+ const TargetRegisterClass *RC = &ARM::GPRRegClass;
RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
RC->getAlignment(),
false));
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 1eafbbc..ee349a7 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -47,11 +47,6 @@ CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
cl::desc("Check fp vmla / vmls hazard at isel time"),
cl::init(true));
-static cl::opt<bool>
-DisableARMIntABS("disable-arm-int-abs", cl::Hidden,
- cl::desc("Enable / disable ARM integer abs transform"),
- cl::init(false));
-
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@@ -210,29 +205,29 @@ private:
/// loads of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
- unsigned *DOpcodes,
- unsigned *QOpcodes0, unsigned *QOpcodes1);
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
/// SelectVST - Select NEON store intrinsics. NumVecs should
/// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for
/// stores of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
- unsigned *DOpcodes,
- unsigned *QOpcodes0, unsigned *QOpcodes1);
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
/// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
/// be 2, 3 or 4. The opcode arrays specify the instructions used for
/// load/store of D registers and Q registers.
SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
bool isUpdating, unsigned NumVecs,
- unsigned *DOpcodes, unsigned *QOpcodes);
+ const uint16_t *DOpcodes, const uint16_t *QOpcodes);
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 2, 3 or 4. The opcode array specifies the instructions used
/// for loading D registers. (Q registers are not supported.)
SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
- unsigned *Opcodes);
+ const uint16_t *Opcodes);
/// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
/// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
@@ -583,8 +578,6 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
}
-
-
//-----
AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
@@ -1597,8 +1590,9 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
}
SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
- unsigned *DOpcodes, unsigned *QOpcodes0,
- unsigned *QOpcodes1) {
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
@@ -1729,8 +1723,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
}
SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
- unsigned *DOpcodes, unsigned *QOpcodes0,
- unsigned *QOpcodes1) {
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
@@ -1875,8 +1870,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
bool isUpdating, unsigned NumVecs,
- unsigned *DOpcodes,
- unsigned *QOpcodes) {
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
@@ -1994,7 +1989,8 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
}
SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
- unsigned NumVecs, unsigned *Opcodes) {
+ unsigned NumVecs,
+ const uint16_t *Opcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
@@ -2491,14 +2487,10 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
SDValue XORSrc1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (DisableARMIntABS)
- return NULL;
-
if (Subtarget->isThumb1Only())
return NULL;
- if (XORSrc0.getOpcode() != ISD::ADD ||
- XORSrc1.getOpcode() != ISD::SRA)
+ if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
return NULL;
SDValue ADDSrc0 = XORSrc0.getOperand(0);
@@ -2509,16 +2501,10 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
EVT XType = SRASrc0.getValueType();
unsigned Size = XType.getSizeInBits() - 1;
- if (ADDSrc1 == XORSrc1 &&
- ADDSrc0 == SRASrc0 &&
- XType.isInteger() &&
- SRAConstant != NULL &&
+ if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 &&
+ XType.isInteger() && SRAConstant != NULL &&
Size == SRAConstant->getZExtValue()) {
-
- unsigned Opcode = ARM::ABS;
- if (Subtarget->isThumb2())
- Opcode = ARM::t2ABS;
-
+ unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
}
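The pattern SelectABSOp folds to a single ABS/t2ABS pseudo is the classic branch-free absolute-value idiom, XOR(ADD(x, SRA(x, 31)), SRA(x, 31)) for i32. Its source-level shape, for reference (illustrative, not part of the patch):

    int absIdiom(int x) {
      int s = x >> 31;     // SRA by width-1: 0 for non-negative x, -1 otherwise
      return (x + s) ^ s;  // the ADD and XOR both feed off the same SRA node
    }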
@@ -2893,176 +2879,199 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VLD2DUP: {
- unsigned Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
- ARM::VLD2DUPd32 };
+ static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+ ARM::VLD2DUPd32 };
return SelectVLDDup(N, false, 2, Opcodes);
}
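Every opcode table in this hunk, and in the intrinsic cases further down, gets the same treatment: a function-local unsigned array, rebuilt on every visit, becomes a static const uint16_t table emitted once in read-only storage at half the width, on the assumption that target opcode enumerators fit in 16 bits. A generic before/after sketch (values hypothetical, not part of the patch):

    #include <cstdint>

    uint16_t pick(bool Wide) {
      // Before: unsigned Opcodes[] = {...};  // re-initialized on every call
      // After: one shared read-only table.
      static const uint16_t Opcodes[] = { 0x0123, 0x0456 }; // hypothetical opcodes
      return Opcodes[Wide];
    }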
case ARMISD::VLD3DUP: {
- unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo,
- ARM::VLD3DUPd32Pseudo };
+ static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
+ ARM::VLD3DUPd16Pseudo,
+ ARM::VLD3DUPd32Pseudo };
return SelectVLDDup(N, false, 3, Opcodes);
}
case ARMISD::VLD4DUP: {
- unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo,
- ARM::VLD4DUPd32Pseudo };
+ static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
+ ARM::VLD4DUPd16Pseudo,
+ ARM::VLD4DUPd32Pseudo };
return SelectVLDDup(N, false, 4, Opcodes);
}
case ARMISD::VLD2DUP_UPD: {
- unsigned Opcodes[] = { ARM::VLD2DUPd8wb_fixed, ARM::VLD2DUPd16wb_fixed,
- ARM::VLD2DUPd32wb_fixed };
+ static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
+ ARM::VLD2DUPd16wb_fixed,
+ ARM::VLD2DUPd32wb_fixed };
return SelectVLDDup(N, true, 2, Opcodes);
}
case ARMISD::VLD3DUP_UPD: {
- unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD,
- ARM::VLD3DUPd32Pseudo_UPD };
+ static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
+ ARM::VLD3DUPd16Pseudo_UPD,
+ ARM::VLD3DUPd32Pseudo_UPD };
return SelectVLDDup(N, true, 3, Opcodes);
}
case ARMISD::VLD4DUP_UPD: {
- unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD,
- ARM::VLD4DUPd32Pseudo_UPD };
+ static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
+ ARM::VLD4DUPd16Pseudo_UPD,
+ ARM::VLD4DUPd32Pseudo_UPD };
return SelectVLDDup(N, true, 4, Opcodes);
}
case ARMISD::VLD1_UPD: {
- unsigned DOpcodes[] = { ARM::VLD1d8wb_fixed, ARM::VLD1d16wb_fixed,
- ARM::VLD1d32wb_fixed, ARM::VLD1d64wb_fixed };
- unsigned QOpcodes[] = { ARM::VLD1q8wb_fixed,
- ARM::VLD1q16wb_fixed,
- ARM::VLD1q32wb_fixed,
- ARM::VLD1q64wb_fixed };
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed,
+ ARM::VLD1d16wb_fixed,
+ ARM::VLD1d32wb_fixed,
+ ARM::VLD1d64wb_fixed };
+ static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed,
+ ARM::VLD1q16wb_fixed,
+ ARM::VLD1q32wb_fixed,
+ ARM::VLD1q64wb_fixed };
return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0);
}
case ARMISD::VLD2_UPD: {
- unsigned DOpcodes[] = { ARM::VLD2d8wb_fixed,
- ARM::VLD2d16wb_fixed,
- ARM::VLD2d32wb_fixed,
- ARM::VLD1q64wb_fixed};
- unsigned QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
- ARM::VLD2q16PseudoWB_fixed,
- ARM::VLD2q32PseudoWB_fixed };
+ static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
+ ARM::VLD2d16wb_fixed,
+ ARM::VLD2d32wb_fixed,
+ ARM::VLD1q64wb_fixed };
+ static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
+ ARM::VLD2q16PseudoWB_fixed,
+ ARM::VLD2q32PseudoWB_fixed };
return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0);
}
case ARMISD::VLD3_UPD: {
- unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD,
- ARM::VLD3d32Pseudo_UPD, ARM::VLD1q64wb_fixed};
- unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
- ARM::VLD3q16Pseudo_UPD,
- ARM::VLD3q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
- ARM::VLD3q16oddPseudo_UPD,
- ARM::VLD3q32oddPseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD,
+ ARM::VLD3d16Pseudo_UPD,
+ ARM::VLD3d32Pseudo_UPD,
+ ARM::VLD1q64wb_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+ ARM::VLD3q16Pseudo_UPD,
+ ARM::VLD3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
+ ARM::VLD3q16oddPseudo_UPD,
+ ARM::VLD3q32oddPseudo_UPD };
return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
}
case ARMISD::VLD4_UPD: {
- unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD,
- ARM::VLD4d32Pseudo_UPD, ARM::VLD1q64wb_fixed};
- unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
- ARM::VLD4q16Pseudo_UPD,
- ARM::VLD4q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
- ARM::VLD4q16oddPseudo_UPD,
- ARM::VLD4q32oddPseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
+ ARM::VLD4d16Pseudo_UPD,
+ ARM::VLD4d32Pseudo_UPD,
+ ARM::VLD1q64wb_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+ ARM::VLD4q16Pseudo_UPD,
+ ARM::VLD4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
+ ARM::VLD4q16oddPseudo_UPD,
+ ARM::VLD4q32oddPseudo_UPD };
return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
}
case ARMISD::VLD2LN_UPD: {
- unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD,
- ARM::VLD2LNd32Pseudo_UPD };
- unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
- ARM::VLD2LNq32Pseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD,
+ ARM::VLD2LNd16Pseudo_UPD,
+ ARM::VLD2LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
+ ARM::VLD2LNq32Pseudo_UPD };
return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
}
case ARMISD::VLD3LN_UPD: {
- unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD,
- ARM::VLD3LNd32Pseudo_UPD };
- unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
- ARM::VLD3LNq32Pseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD,
+ ARM::VLD3LNd16Pseudo_UPD,
+ ARM::VLD3LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
+ ARM::VLD3LNq32Pseudo_UPD };
return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
}
case ARMISD::VLD4LN_UPD: {
- unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD,
- ARM::VLD4LNd32Pseudo_UPD };
- unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
- ARM::VLD4LNq32Pseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD,
+ ARM::VLD4LNd16Pseudo_UPD,
+ ARM::VLD4LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
+ ARM::VLD4LNq32Pseudo_UPD };
return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
}
case ARMISD::VST1_UPD: {
- unsigned DOpcodes[] = { ARM::VST1d8wb_fixed, ARM::VST1d16wb_fixed,
- ARM::VST1d32wb_fixed, ARM::VST1d64wb_fixed };
- unsigned QOpcodes[] = { ARM::VST1q8wb_fixed,
- ARM::VST1q16wb_fixed,
- ARM::VST1q32wb_fixed,
- ARM::VST1q64wb_fixed };
+ static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed,
+ ARM::VST1d16wb_fixed,
+ ARM::VST1d32wb_fixed,
+ ARM::VST1d64wb_fixed };
+ static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed,
+ ARM::VST1q16wb_fixed,
+ ARM::VST1q32wb_fixed,
+ ARM::VST1q64wb_fixed };
return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0);
}
case ARMISD::VST2_UPD: {
- unsigned DOpcodes[] = { ARM::VST2d8wb_fixed,
- ARM::VST2d16wb_fixed,
- ARM::VST2d32wb_fixed,
- ARM::VST1q64wb_fixed};
- unsigned QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
- ARM::VST2q16PseudoWB_fixed,
- ARM::VST2q32PseudoWB_fixed };
+ static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
+ ARM::VST2d16wb_fixed,
+ ARM::VST2d32wb_fixed,
+ ARM::VST1q64wb_fixed };
+ static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
+ ARM::VST2q16PseudoWB_fixed,
+ ARM::VST2q32PseudoWB_fixed };
return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0);
}
case ARMISD::VST3_UPD: {
- unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD,
- ARM::VST3d32Pseudo_UPD,ARM::VST1d64TPseudoWB_fixed};
- unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
- ARM::VST3q16Pseudo_UPD,
- ARM::VST3q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
- ARM::VST3q16oddPseudo_UPD,
- ARM::VST3q32oddPseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD,
+ ARM::VST3d16Pseudo_UPD,
+ ARM::VST3d32Pseudo_UPD,
+ ARM::VST1d64TPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+ ARM::VST3q16Pseudo_UPD,
+ ARM::VST3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
+ ARM::VST3q16oddPseudo_UPD,
+ ARM::VST3q32oddPseudo_UPD };
return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
}
case ARMISD::VST4_UPD: {
- unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD,
- ARM::VST4d32Pseudo_UPD,ARM::VST1d64QPseudoWB_fixed};
- unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
- ARM::VST4q16Pseudo_UPD,
- ARM::VST4q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
- ARM::VST4q16oddPseudo_UPD,
- ARM::VST4q32oddPseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
+ ARM::VST4d16Pseudo_UPD,
+ ARM::VST4d32Pseudo_UPD,
+ ARM::VST1d64QPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+ ARM::VST4q16oddPseudo_UPD,
+ ARM::VST4q32oddPseudo_UPD };
return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
}
case ARMISD::VST2LN_UPD: {
- unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD,
- ARM::VST2LNd32Pseudo_UPD };
- unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
- ARM::VST2LNq32Pseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD,
+ ARM::VST2LNd16Pseudo_UPD,
+ ARM::VST2LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+ ARM::VST2LNq32Pseudo_UPD };
return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
}
case ARMISD::VST3LN_UPD: {
- unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD,
- ARM::VST3LNd32Pseudo_UPD };
- unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
- ARM::VST3LNq32Pseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD,
+ ARM::VST3LNd16Pseudo_UPD,
+ ARM::VST3LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+ ARM::VST3LNq32Pseudo_UPD };
return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
}
case ARMISD::VST4LN_UPD: {
- unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD,
- ARM::VST4LNd32Pseudo_UPD };
- unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
- ARM::VST4LNq32Pseudo_UPD };
+ static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD,
+ ARM::VST4LNd16Pseudo_UPD,
+ ARM::VST4LNd32Pseudo_UPD };
+ static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+ ARM::VST4LNq32Pseudo_UPD };
return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
}
@@ -3179,124 +3188,144 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case Intrinsic::arm_neon_vld1: {
- unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
- ARM::VLD1d32, ARM::VLD1d64 };
- unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
- ARM::VLD1q32, ARM::VLD1q64};
+ static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
+ ARM::VLD1d32, ARM::VLD1d64 };
+ static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+ ARM::VLD1q32, ARM::VLD1q64};
return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vld2: {
- unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
- ARM::VLD2d32, ARM::VLD1q64 };
- unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
- ARM::VLD2q32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+ ARM::VLD2d32, ARM::VLD1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
+ ARM::VLD2q32Pseudo };
return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vld3: {
- unsigned DOpcodes[] = { ARM::VLD3d8Pseudo, ARM::VLD3d16Pseudo,
- ARM::VLD3d32Pseudo, ARM::VLD1d64TPseudo };
- unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
- ARM::VLD3q16Pseudo_UPD,
- ARM::VLD3q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo,
- ARM::VLD3q16oddPseudo,
- ARM::VLD3q32oddPseudo };
+ static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo,
+ ARM::VLD3d16Pseudo,
+ ARM::VLD3d32Pseudo,
+ ARM::VLD1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+ ARM::VLD3q16Pseudo_UPD,
+ ARM::VLD3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
+ ARM::VLD3q16oddPseudo,
+ ARM::VLD3q32oddPseudo };
return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vld4: {
- unsigned DOpcodes[] = { ARM::VLD4d8Pseudo, ARM::VLD4d16Pseudo,
- ARM::VLD4d32Pseudo, ARM::VLD1d64QPseudo };
- unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
- ARM::VLD4q16Pseudo_UPD,
- ARM::VLD4q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo,
- ARM::VLD4q16oddPseudo,
- ARM::VLD4q32oddPseudo };
+ static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo,
+ ARM::VLD4d16Pseudo,
+ ARM::VLD4d32Pseudo,
+ ARM::VLD1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+ ARM::VLD4q16Pseudo_UPD,
+ ARM::VLD4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
+ ARM::VLD4q16oddPseudo,
+ ARM::VLD4q32oddPseudo };
return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vld2lane: {
- unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo,
- ARM::VLD2LNd32Pseudo };
- unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
+ ARM::VLD2LNd16Pseudo,
+ ARM::VLD2LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
+ ARM::VLD2LNq32Pseudo };
return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vld3lane: {
- unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo,
- ARM::VLD3LNd32Pseudo };
- unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo,
+ ARM::VLD3LNd16Pseudo,
+ ARM::VLD3LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
+ ARM::VLD3LNq32Pseudo };
return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vld4lane: {
- unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo,
- ARM::VLD4LNd32Pseudo };
- unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo,
+ ARM::VLD4LNd16Pseudo,
+ ARM::VLD4LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
+ ARM::VLD4LNq32Pseudo };
return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vst1: {
- unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16,
- ARM::VST1d32, ARM::VST1d64 };
- unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
- ARM::VST1q32, ARM::VST1q64 };
+ static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16,
+ ARM::VST1d32, ARM::VST1d64 };
+ static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+ ARM::VST1q32, ARM::VST1q64 };
return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vst2: {
- unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
- ARM::VST2d32, ARM::VST1q64 };
- unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
- ARM::VST2q32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+ ARM::VST2d32, ARM::VST1q64 };
+ static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+ ARM::VST2q32Pseudo };
return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vst3: {
- unsigned DOpcodes[] = { ARM::VST3d8Pseudo, ARM::VST3d16Pseudo,
- ARM::VST3d32Pseudo, ARM::VST1d64TPseudo };
- unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
- ARM::VST3q16Pseudo_UPD,
- ARM::VST3q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo,
- ARM::VST3q16oddPseudo,
- ARM::VST3q32oddPseudo };
+ static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo,
+ ARM::VST3d16Pseudo,
+ ARM::VST3d32Pseudo,
+ ARM::VST1d64TPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+ ARM::VST3q16Pseudo_UPD,
+ ARM::VST3q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
+ ARM::VST3q16oddPseudo,
+ ARM::VST3q32oddPseudo };
return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vst4: {
- unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo,
- ARM::VST4d32Pseudo, ARM::VST1d64QPseudo };
- unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
- ARM::VST4q16Pseudo_UPD,
- ARM::VST4q32Pseudo_UPD };
- unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo,
- ARM::VST4q16oddPseudo,
- ARM::VST4q32oddPseudo };
+ static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo,
+ ARM::VST4d16Pseudo,
+ ARM::VST4d32Pseudo,
+ ARM::VST1d64QPseudo };
+ static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD };
+ static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
+ ARM::VST4q16oddPseudo,
+ ARM::VST4q32oddPseudo };
return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vst2lane: {
- unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo,
- ARM::VST2LNd32Pseudo };
- unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo,
+ ARM::VST2LNd16Pseudo,
+ ARM::VST2LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
+ ARM::VST2LNq32Pseudo };
return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vst3lane: {
- unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo,
- ARM::VST3LNd32Pseudo };
- unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo,
+ ARM::VST3LNd16Pseudo,
+ ARM::VST3LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
+ ARM::VST3LNq32Pseudo };
return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vst4lane: {
- unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo,
- ARM::VST4LNd32Pseudo };
- unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo };
+ static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo,
+ ARM::VST4LNd16Pseudo,
+ ARM::VST4LNd32Pseudo };
+ static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
+ ARM::VST4LNq32Pseudo };
return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
}
}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index a103c94..c66618a 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -52,6 +52,7 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
+STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
// This option should go away when tail calls fully work.
static cl::opt<bool>
@@ -89,76 +90,71 @@ static const uint16_t GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
-void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
- EVT PromotedBitwiseVT) {
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+ MVT PromotedBitwiseVT) {
if (VT != PromotedLdStVT) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType (ISD::LOAD, VT.getSimpleVT(),
- PromotedLdStVT.getSimpleVT());
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType (ISD::STORE, VT.getSimpleVT(),
- PromotedLdStVT.getSimpleVT());
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
}
- EVT ElemTy = VT.getVectorElementType();
+ MVT ElemTy = VT.getVectorElementType();
if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
- setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
if (ElemTy == MVT::i32) {
- setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
} else {
- setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand);
- }
- setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Legal);
- setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ }
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT.isInteger()) {
- setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
}
// Promote all bit-wise operations.
if (VT.isInteger() && VT != PromotedBitwiseVT) {
- setOperationAction(ISD::AND, VT.getSimpleVT(), Promote);
- AddPromotedToType (ISD::AND, VT.getSimpleVT(),
- PromotedBitwiseVT.getSimpleVT());
- setOperationAction(ISD::OR, VT.getSimpleVT(), Promote);
- AddPromotedToType (ISD::OR, VT.getSimpleVT(),
- PromotedBitwiseVT.getSimpleVT());
- setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote);
- AddPromotedToType (ISD::XOR, VT.getSimpleVT(),
- PromotedBitwiseVT.getSimpleVT());
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
}
// Neon does not support vector divide/remainder operations.
- setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
}
-void ARMTargetLowering::addDRTypeForNEON(EVT VT) {
- addRegisterClass(VT, ARM::DPRRegisterClass);
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, &ARM::DPRRegClass);
addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}
-void ARMTargetLowering::addQRTypeForNEON(EVT VT) {
- addRegisterClass(VT, ARM::QPRRegisterClass);
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
@@ -431,14 +427,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
if (Subtarget->isThumb1Only())
- addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
+ addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
else
- addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
+ addRegisterClass(MVT::i32, &ARM::GPRRegClass);
if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
!Subtarget->isThumb1Only()) {
- addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
+ addRegisterClass(MVT::f32, &ARM::SPRRegClass);
if (!Subtarget->isFPOnlySP())
- addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
+ addRegisterClass(MVT::f64, &ARM::DPRRegClass);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
}
@@ -824,6 +820,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
benefitFromCodePlacementOpt = true;
+ // Prefer likely predicted branches to selects on out-of-order cores.
+ predictableSelectIsExpensive = Subtarget->isCortexA9();
+
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
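The predictableSelectIsExpensive hint tells the generic DAG combiner that on Cortex-A9's out-of-order pipeline a well-predicted branch usually beats a conditional move, so it should be less eager to turn branches into selects. The kind of code affected (illustrative, not part of the patch):

    int clampToZero(int x) {
      // With the hint set, this may stay a compare-and-branch rather
      // than being converted into a MOVcc-style select.
      return x < 0 ? 0 : x;
    }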
@@ -849,7 +848,7 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{
// the cost is 1 for both f32 and f64.
case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
- RRC = ARM::DPRRegisterClass;
+ RRC = &ARM::DPRRegClass;
// When NEON is used for SP, only half of the register file is available
// because operations that define both SP and DP results will be constrained
// to the VFP2 class (D0-D15). We currently model this constraint prior to
@@ -859,15 +858,15 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{
break;
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
- RRC = ARM::DPRRegisterClass;
+ RRC = &ARM::DPRRegClass;
Cost = 2;
break;
case MVT::v4i64:
- RRC = ARM::DPRRegisterClass;
+ RRC = &ARM::DPRRegClass;
Cost = 4;
break;
case MVT::v8i64:
- RRC = ARM::DPRRegisterClass;
+ RRC = &ARM::DPRRegClass;
Cost = 8;
break;
}
@@ -891,6 +890,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
case ARMISD::CMP: return "ARMISD::CMP";
+ case ARMISD::CMN: return "ARMISD::CMN";
case ARMISD::CMPZ: return "ARMISD::CMPZ";
case ARMISD::CMPFP: return "ARMISD::CMPFP";
case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
@@ -1027,17 +1027,18 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
// load / store 4 to 8 consecutive D registers.
if (Subtarget->hasNEON()) {
if (VT == MVT::v4i64)
- return ARM::QQPRRegisterClass;
- else if (VT == MVT::v8i64)
- return ARM::QQQQPRRegisterClass;
+ return &ARM::QQPRRegClass;
+ if (VT == MVT::v8i64)
+ return &ARM::QQQQPRRegClass;
}
return TargetLowering::getRegClassFor(VT);
}
// Create a fast isel object.
FastISel *
-ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
- return ARM::createFastISel(funcInfo);
+ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return ARM::createFastISel(funcInfo, libInfo);
}
/// getMaximalGlobalOffset - Returns the maximal possible offset which can
@@ -1166,6 +1167,8 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
case CallingConv::ARM_APCS:
return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+ case CallingConv::GHC:
+ return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
}
}
@@ -1286,14 +1289,20 @@ void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
SDValue
-ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool doesNotRet = CLI.DoesNotReturn;
+ bool isVarArg = CLI.IsVarArg;
+
MachineFunction &MF = DAG.getMachineFunction();
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsSibCall = false;
@@ -1415,21 +1424,22 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
CCInfo.clearFirstByValReg();
}
- unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
- SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
- StkPtrOff);
- SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
- SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
- SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
- MVT::i32);
- MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
- Flags.getByValAlign(),
- /*isVolatile=*/false,
- /*AlwaysInline=*/false,
- MachinePointerInfo(0),
- MachinePointerInfo(0)));
-
+ if (Flags.getByValSize() - 4*offset > 0) {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+ StkPtrOff);
+ SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
+ SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
+ MVT::i32);
+ SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
+
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode };
+ MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
+ Ops, array_lengthof(Ops)));
+ }
} else if (!IsSibCall) {
assert(VA.isMemLoc());
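The rewritten byval path above emits a copy only when something remains after the register portion (Flags.getByValSize() - 4*offset), and routes it through the new ARMISD::COPY_STRUCT_BYVAL node, which can expand to a compact copy loop; that is what the NumLoopByVals statistic added earlier counts. The situation it handles, roughly (illustrative, not part of the patch):

    struct Big { int w[16]; };   // 64 bytes, passed byval under AAPCS

    int callee(Big b);           // hypothetical prototype
    int caller(Big &x) {
      // Up to four words of 'b' may travel in r0-r3; the remainder is
      // copied into the outgoing-argument area via COPY_STRUCT_BYVAL.
      return callee(x);
    }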
@@ -2095,12 +2105,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
Args.push_back(Entry);
// FIXME: is there useful debug info available here?
- std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, (Type *) Type::getInt32Ty(*DAG.getContext()),
+ TargetLowering::CallLoweringInfo CLI(Chain,
+ (Type *) Type::getInt32Ty(*DAG.getContext()),
false, false, false, false,
0, CallingConv::C, /*isTailCall=*/false,
/*doesNotRet=*/false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
}
@@ -2108,7 +2119,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG,
+ TLSModel::Model model) const {
const GlobalValue *GV = GA->getGlobal();
DebugLoc dl = GA->getDebugLoc();
SDValue Offset;
@@ -2117,7 +2129,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
// Get the Thread Pointer
SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
- if (GV->isDeclaration()) {
+ if (model == TLSModel::InitialExec) {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
@@ -2142,6 +2154,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
false, false, false, 0);
} else {
// local exec model
+ assert(model == TLSModel::LocalExec);
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
@@ -2162,12 +2175,18 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() &&
"TLS not implemented for non-ELF targets");
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- // If the relocation model is PIC, use the "General Dynamic" TLS Model,
- // otherwise use the "Local Exec" TLS Model
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
- return LowerToTLSGeneralDynamicModel(GA, DAG);
- else
- return LowerToTLSExecModels(GA, DAG);
+
+ TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ case TLSModel::LocalDynamic:
+ return LowerToTLSGeneralDynamicModel(GA, DAG);
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModels(GA, DAG, model);
+ }
+ llvm_unreachable("bogus TLS model");
}
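Instead of inferring the model from the relocation model and GV->isDeclaration(), the lowering now asks getTLSModel() and dispatches over all four models, pairing the two dynamic models and the two exec models. At the source level the distinction looks like this (GNU __thread shown; illustrative, not part of the patch):

    __thread int local_counter;      // definition visible: an exec-model candidate
    extern __thread int lib_counter; // may need the general-dynamic model

    int sum() { return local_counter + lib_counter; }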
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
@@ -2457,9 +2476,9 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
const TargetRegisterClass *RC;
if (AFI->isThumb1OnlyFunction())
- RC = ARM::tGPRRegisterClass;
+ RC = &ARM::tGPRRegClass;
else
- RC = ARM::GPRRegisterClass;
+ RC = &ARM::GPRRegClass;
// Transform the arguments stored in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
@@ -2543,9 +2562,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) {
const TargetRegisterClass *RC;
if (AFI->isThumb1OnlyFunction())
- RC = ARM::tGPRRegisterClass;
+ RC = &ARM::tGPRRegClass;
else
- RC = ARM::GPRRegisterClass;
+ RC = &ARM::GPRRegClass;
unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
@@ -2627,14 +2646,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
const TargetRegisterClass *RC;
if (RegVT == MVT::f32)
- RC = ARM::SPRRegisterClass;
+ RC = &ARM::SPRRegClass;
else if (RegVT == MVT::f64)
- RC = ARM::DPRRegisterClass;
+ RC = &ARM::DPRRegClass;
else if (RegVT == MVT::v2f64)
- RC = ARM::QPRRegisterClass;
+ RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
- RC = (AFI->isThumb1OnlyFunction() ?
- ARM::tGPRRegisterClass : ARM::GPRRegisterClass);
+ RC = AFI->isThumb1OnlyFunction() ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
@@ -4249,6 +4269,10 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// Record this extraction against the appropriate vector if possible...
SDValue SourceVec = V.getOperand(0);
+ // If the element number isn't a constant, we can't effectively
+ // analyze what's going on.
+ if (!isa<ConstantSDNode>(V.getOperand(1)))
+ return SDValue();
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
bool FoundSource = false;
for (unsigned j = 0; j < SourceVecs.size(); ++j) {
@@ -4791,7 +4815,9 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
for (unsigned i = 0; i != NumElts; ++i) {
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
const APInt &CInt = C->getAPIntValue();
- Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT));
+ // Element types smaller than 32 bits are not legal, so use i32 elements.
+ // The values are implicitly truncated so sext vs. zext doesn't matter.
+ Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
}
return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
@@ -5252,14 +5278,14 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
bool isThumb2 = Subtarget->isThumb2();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned scratch =
- MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass
- : ARM::GPRRegisterClass);
+ unsigned scratch = MRI.createVirtualRegister(isThumb2 ?
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass);
if (isThumb2) {
- MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
- MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass);
- MRI.constrainRegClass(newval, ARM::rGPRRegisterClass);
+ MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(oldval, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(newval, &ARM::rGPRRegClass);
}
unsigned ldrOpc, strOpc;
@@ -5362,8 +5388,8 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
if (isThumb2) {
- MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
- MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+ MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
}
unsigned ldrOpc, strOpc;
@@ -5394,8 +5420,9 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- const TargetRegisterClass *TRC =
- isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+ const TargetRegisterClass *TRC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
@@ -5469,8 +5496,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
if (isThumb2) {
- MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
- MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+ MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
}
unsigned ldrOpc, strOpc, extendOpc;
@@ -5504,8 +5531,9 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- const TargetRegisterClass *TRC =
- isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+ const TargetRegisterClass *TRC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned scratch2 = MRI.createVirtualRegister(TRC);
@@ -5531,7 +5559,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
// Sign extend the value, if necessary.
if (signExtend && extendOpc) {
- oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass);
+ oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
.addReg(dest)
.addImm(0));
@@ -5586,9 +5614,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
if (isThumb2) {
- MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass);
- MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass);
- MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+ MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
}
unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
@@ -5614,8 +5642,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- const TargetRegisterClass *TRC =
- isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+ const TargetRegisterClass *TRC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned storesuccess = MRI.createVirtualRegister(TRC);
// thisMBB:
@@ -5722,8 +5751,9 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
- const TargetRegisterClass *TRC =
- isThumb ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+ const TargetRegisterClass *TRC = isThumb ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
// Grab constant pool and fixed stack memory operands.
MachineMemOperand *CPMMO =
@@ -5827,8 +5857,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
MachineFrameInfo *MFI = MF->getFrameInfo();
int FI = MFI->getFunctionContextIndex();
- const TargetRegisterClass *TRC =
- Subtarget->isThumb() ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+ const TargetRegisterClass *TRC = Subtarget->isThumb() ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
@@ -6176,14 +6207,12 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
unsigned Reg = SavedRegs[i];
if (Subtarget->isThumb2() &&
- !ARM::tGPRRegisterClass->contains(Reg) &&
- !ARM::hGPRRegisterClass->contains(Reg))
+ !ARM::tGPRRegClass.contains(Reg) &&
+ !ARM::hGPRRegClass.contains(Reg))
continue;
- else if (Subtarget->isThumb1Only() &&
- !ARM::tGPRRegisterClass->contains(Reg))
+ if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
continue;
- else if (!Subtarget->isThumb() &&
- !ARM::GPRRegisterClass->contains(Reg))
+ if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
continue;
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
@@ -6214,6 +6243,304 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
llvm_unreachable("Expecting a BB with two successors!");
}
+MachineBasicBlock *ARMTargetLowering::
+EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+ // This pseudo instruction has 4 operands: dst, src, size, alignment.
+ // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
+ // Otherwise, we will generate unrolled scalar copies.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ unsigned dest = MI->getOperand(0).getReg();
+ unsigned src = MI->getOperand(1).getReg();
+ unsigned SizeVal = MI->getOperand(2).getImm();
+ unsigned Align = MI->getOperand(3).getImm();
+ DebugLoc dl = MI->getDebugLoc();
+
+ bool isThumb2 = Subtarget->isThumb2();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned ldrOpc, strOpc, UnitSize = 0;
+
+ const TargetRegisterClass *TRC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
+ const TargetRegisterClass *TRC_Vec = 0;
+
+ if (Align & 1) {
+ ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+ strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+ UnitSize = 1;
+ } else if (Align & 2) {
+ ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
+ strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
+ UnitSize = 2;
+ } else {
+ // Check whether we can use NEON instructions.
+ if (!MF->getFunction()->hasFnAttr(Attribute::NoImplicitFloat) &&
+ Subtarget->hasNEON()) {
+ if ((Align % 16 == 0) && SizeVal >= 16) {
+ ldrOpc = ARM::VLD1q32wb_fixed;
+ strOpc = ARM::VST1q32wb_fixed;
+ UnitSize = 16;
+ TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
+ }
+ else if ((Align % 8 == 0) && SizeVal >= 8) {
+ ldrOpc = ARM::VLD1d32wb_fixed;
+ strOpc = ARM::VST1d32wb_fixed;
+ UnitSize = 8;
+ TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
+ }
+ }
+ // Can't use NEON instructions.
+ if (UnitSize == 0) {
+ ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
+ strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+ UnitSize = 4;
+ }
+ }
+
+ unsigned BytesLeft = SizeVal % UnitSize;
+ unsigned LoopSize = SizeVal - BytesLeft;
+
+ if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
+ // Use LDR and STR to copy.
+ // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
+ // [destOut] = STR_POST(scratch, destIn, UnitSize)
+ unsigned srcIn = src;
+ unsigned destIn = dest;
+ for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
+ unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
+ unsigned srcOut = MRI.createVirtualRegister(TRC);
+ unsigned destOut = MRI.createVirtualRegister(TRC);
+ if (UnitSize >= 8) {
+ AddDefaultPred(BuildMI(*BB, MI, dl,
+ TII->get(ldrOpc), scratch)
+ .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
+
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+ .addReg(destIn).addImm(0).addReg(scratch));
+ } else if (isThumb2) {
+ AddDefaultPred(BuildMI(*BB, MI, dl,
+ TII->get(ldrOpc), scratch)
+ .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
+
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+ .addReg(scratch).addReg(destIn)
+ .addImm(UnitSize));
+ } else {
+ AddDefaultPred(BuildMI(*BB, MI, dl,
+ TII->get(ldrOpc), scratch)
+ .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
+ .addImm(UnitSize));
+
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+ .addReg(scratch).addReg(destIn)
+ .addReg(0).addImm(UnitSize));
+ }
+ srcIn = srcOut;
+ destIn = destOut;
+ }
+
+ // Handle the leftover bytes with LDRB and STRB.
+ // [scratch, srcOut] = LDRB_POST(srcIn, 1)
+ // [destOut] = STRB_POST(scratch, destIn, 1)
+ ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+ strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+ for (unsigned i = 0; i < BytesLeft; i++) {
+ unsigned scratch = MRI.createVirtualRegister(TRC);
+ unsigned srcOut = MRI.createVirtualRegister(TRC);
+ unsigned destOut = MRI.createVirtualRegister(TRC);
+ if (isThumb2) {
+ AddDefaultPred(BuildMI(*BB, MI, dl,
+ TII->get(ldrOpc),scratch)
+ .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+ .addReg(scratch).addReg(destIn)
+ .addImm(1));
+ } else {
+ AddDefaultPred(BuildMI(*BB, MI, dl,
+ TII->get(ldrOpc),scratch)
+ .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
+
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+ .addReg(scratch).addReg(destIn)
+ .addReg(0).addImm(1));
+ }
+ srcIn = srcOut;
+ destIn = destOut;
+ }
+ MI->eraseFromParent(); // The instruction is gone now.
+ return BB;
+ }
+
+ // Expand the pseudo op to a loop.
+ // thisMBB:
+ // ...
+ // movw varEnd, # --> with thumb2
+ // movt varEnd, #
+ // ldrcp varEnd, idx --> without thumb2
+ // fallthrough --> loopMBB
+ // loopMBB:
+ // PHI varPhi, varEnd, varLoop
+ // PHI srcPhi, src, srcLoop
+ // PHI destPhi, dst, destLoop
+ // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+ // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+ // subs varLoop, varPhi, #UnitSize
+ // bne loopMBB
+ // fallthrough --> exitMBB
+ // exitMBB:
+ // epilogue to handle left-over bytes
+ // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+ // [destOut] = STRB_POST(scratch, destLoop, 1)
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, loopMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Load an immediate to varEnd.
+ unsigned varEnd = MRI.createVirtualRegister(TRC);
+ if (isThumb2) {
+ unsigned VReg1 = varEnd;
+ if ((LoopSize & 0xFFFF0000) != 0)
+ VReg1 = MRI.createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
+ .addImm(LoopSize & 0xFFFF));
+
+ if ((LoopSize & 0xFFFF0000) != 0)
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
+ .addReg(VReg1)
+ .addImm(LoopSize >> 16));
+ } else {
+ MachineConstantPool *ConstantPool = MF->getConstantPool();
+ Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+ const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+ if (Align == 0)
+ Align = getTargetData()->getTypeAllocSize(C->getType());
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
+ .addReg(varEnd, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .addImm(0));
+ }
+ BB->addSuccessor(loopMBB);
+
+ // Generate the loop body:
+ // varPhi = PHI(varLoop, varEnd)
+ // srcPhi = PHI(srcLoop, src)
+ // destPhi = PHI(destLoop, dst)
+ MachineBasicBlock *entryBB = BB;
+ BB = loopMBB;
+ unsigned varLoop = MRI.createVirtualRegister(TRC);
+ unsigned varPhi = MRI.createVirtualRegister(TRC);
+ unsigned srcLoop = MRI.createVirtualRegister(TRC);
+ unsigned srcPhi = MRI.createVirtualRegister(TRC);
+ unsigned destLoop = MRI.createVirtualRegister(TRC);
+ unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+ BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+ .addReg(varLoop).addMBB(loopMBB)
+ .addReg(varEnd).addMBB(entryBB);
+ BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+ .addReg(srcLoop).addMBB(loopMBB)
+ .addReg(src).addMBB(entryBB);
+ BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+ .addReg(destLoop).addMBB(loopMBB)
+ .addReg(dest).addMBB(entryBB);
+
+ // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+ // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+ unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
+ if (UnitSize >= 8) {
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+ .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
+
+ AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+ .addReg(destPhi).addImm(0).addReg(scratch));
+ } else if (isThumb2) {
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+ .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
+
+ AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+ .addReg(scratch).addReg(destPhi)
+ .addImm(UnitSize));
+ } else {
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+ .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
+ .addImm(UnitSize));
+
+ AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+ .addReg(scratch).addReg(destPhi)
+ .addReg(0).addImm(UnitSize));
+ }
+
+ // Decrement loop variable by UnitSize.
+ MachineInstrBuilder MIB = BuildMI(BB, dl,
+ TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+ AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+ MIB->getOperand(5).setReg(ARM::CPSR);
+ MIB->getOperand(5).setIsDef(true);
+
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+ // loopMBB can loop back to loopMBB or fall through to exitMBB.
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // Add epilogue to handle BytesLeft.
+ BB = exitMBB;
+ MachineInstr *StartOfExit = exitMBB->begin();
+ ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+ strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+
+ // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+ // [destOut] = STRB_POST(scratch, destLoop, 1)
+ unsigned srcIn = srcLoop;
+ unsigned destIn = destLoop;
+ for (unsigned i = 0; i < BytesLeft; i++) {
+ unsigned scratch = MRI.createVirtualRegister(TRC);
+ unsigned srcOut = MRI.createVirtualRegister(TRC);
+ unsigned destOut = MRI.createVirtualRegister(TRC);
+ if (isThumb2) {
+ AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
+ TII->get(ldrOpc),scratch)
+ .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+
+ AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
+ .addReg(scratch).addReg(destIn)
+ .addImm(1));
+ } else {
+ AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
+ TII->get(ldrOpc),scratch)
+ .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
+
+ AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
+ .addReg(scratch).addReg(destIn)
+ .addReg(0).addImm(1));
+ }
+ srcIn = srcOut;
+ destIn = destOut;
+ }
+
+ MI->eraseFromParent(); // The instruction is gone now.
+ return BB;
+}
+
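To make the shape of EmitStructByval easier to follow, here is a minimal standalone sketch (plain C++; pickUnitSize is a hypothetical helper, not part of the patch) of the size/alignment arithmetic it performs before choosing between unrolled copies and the loop above: pick the widest copy unit the alignment and features allow, then split the total size into a main portion and a byte epilogue.

#include <cstdio>

// Mirrors the UnitSize selection above: byte/halfword for low alignment,
// NEON 16- or 8-byte units when available and large enough, else a word.
unsigned pickUnitSize(unsigned Align, unsigned SizeVal, bool HasNEON) {
  if (Align & 1) return 1;
  if (Align & 2) return 2;
  if (HasNEON && Align % 16 == 0 && SizeVal >= 16) return 16;
  if (HasNEON && Align % 8 == 0 && SizeVal >= 8) return 8;
  return 4;
}

int main() {
  unsigned SizeVal = 37, Align = 8;
  unsigned UnitSize = pickUnitSize(Align, SizeVal, /*HasNEON=*/true);
  unsigned BytesLeft = SizeVal % UnitSize; // copied byte-by-byte at the end
  unsigned LoopSize = SizeVal - BytesLeft; // copied UnitSize bytes at a time
  printf("unit=%u loop=%u tail=%u\n", UnitSize, LoopSize, BytesLeft);
  return 0;
}

Whether the LoopSize portion becomes straight-line code or the loopMBB above depends on Subtarget->getMaxInlineSizeThreshold(). Note also how the loop counter decrement becomes a flag-setting subs: AddDefaultCC adds the optional cc_out operand, and rewriting that operand to CPSR lets the following bne test the flags directly.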
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
@@ -6517,10 +6844,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineRegisterInfo &MRI = Fn->getRegInfo();
// In Thumb mode S must not be specified if source register is the SP or
// PC and if destination register is the SP, so restrict register class
- unsigned NewMovDstReg = MRI.createVirtualRegister(
- isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass);
- unsigned NewRsbDstReg = MRI.createVirtualRegister(
- isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass);
+ unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass);
// Transfer the remainder of BB and its successor edges to sinkMBB.
SinkBB->splice(SinkBB->begin(), BB,
@@ -6534,12 +6860,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// fall through to SinkMBB
RSBBB->addSuccessor(SinkBB);
- // insert a movs at the end of BB
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVr : ARM::MOVr),
- NewMovDstReg)
- .addReg(ABSSrcReg, RegState::Kill)
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addReg(ARM::CPSR, RegState::Define);
+ // insert a cmp at the end of BB
+ AddDefaultPred(BuildMI(BB, dl,
+ TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(ABSSrcReg).addImm(0));
// insert a bcc with opposite CC to ARMCC::MI at the end of BB
BuildMI(BB, dl,
@@ -6551,7 +6875,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// by if-conversion pass
BuildMI(*RSBBB, RSBBB->begin(), dl,
TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
- .addReg(NewMovDstReg, RegState::Kill)
+ .addReg(ABSSrcReg, RegState::Kill)
.addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
// insert PHI in SinkBB,
@@ -6559,7 +6883,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BuildMI(*SinkBB, SinkBB->begin(), dl,
TII->get(ARM::PHI), ABSDstReg)
.addReg(NewRsbDstReg).addMBB(RSBBB)
- .addReg(NewMovDstReg).addMBB(BB);
+ .addReg(ABSSrcReg).addMBB(BB);
// remove ABS instruction
MI->eraseFromParent();
@@ -6567,6 +6891,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// return last added BB
return SinkBB;
}
+ case ARM::COPY_STRUCT_BYVAL_I32:
+ ++NumLoopByVals;
+ return EmitStructByval(MI, BB);
}
}
@@ -7095,8 +7422,12 @@ static SDValue PerformORCombine(SDNode *N,
return COR;
}
+
+ // The code below optimizes (or (and X, Y), Z).
+ // The AND operand needs to have a single user to make these optimizations
+ // profitable.
SDValue N0 = N->getOperand(0);
- if (N0.getOpcode() != ISD::AND)
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();
SDValue N1 = N->getOperand(1);
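As a reminder of what this combine is chasing (a sketch under the usual bit-select reading of the pattern, not the combine itself): (or (and X, M), (and Y, ~M)) picks bits from X where M is set and from Y elsewhere, a dataflow ARM can often cover with a single bitfield-insert. Rewriting the pattern consumes the AND, hence the hasOneUse() requirement.

#include <cassert>
#include <cstdint>

// Bitwise select: the value the OR-of-ANDs pattern computes.
uint32_t bitselect(uint32_t X, uint32_t Y, uint32_t M) {
  return (X & M) | (Y & ~M);
}

int main() {
  assert(bitselect(0xAAAAAAAA, 0x55555555, 0xFFFF0000) == 0xAAAA5555);
  return 0;
}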
@@ -7353,7 +7684,7 @@ static SDValue PerformSTORECombine(SDNode *N,
if (St->isVolatile())
return SDValue();
- // Optimize trunc store (of multiple scalars) to shuffle and store. First,
+ // Optimize trunc store (of multiple scalars) to shuffle and store. First,
// pack all of the elements in one place. Next, store to memory in fewer
// chunks.
SDValue StVal = St->getValue();
@@ -8721,12 +9052,19 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return Imm >= 0 && Imm <= 255;
}
-/// isLegalAddImmediate - Return true if the specified immediate is legal
-/// add immediate, that is the target has add instructions which can add
-/// a register with the immediate without having to materialize the
+/// isLegalAddImmediate - Return true if the specified immediate is a legal add
+/// *or sub* immediate, that is the target has add or sub instructions which can
+/// add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
- return ARM_AM::getSOImmVal(Imm) != -1;
+ // Same encoding for add/sub, just flip the sign.
+ int64_t AbsImm = llvm::abs64(Imm);
+ if (!Subtarget->isThumb())
+ return ARM_AM::getSOImmVal(AbsImm) != -1;
+ if (Subtarget->isThumb2())
+ return ARM_AM::getT2SOImmVal(AbsImm) != -1;
+ // Thumb1 only has 8-bit unsigned immediate.
+ return AbsImm >= 0 && AbsImm <= 255;
}
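For readers who have not met getSOImmVal: an ARM-mode data-processing immediate is an 8-bit payload rotated right by an even amount, so once add and sub are treated symmetrically, only |Imm| matters. A hypothetical standalone re-implementation of that encoding test (not the real ARM_AM helper):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// True if Imm is an 8-bit value rotated right by an even amount. Rotating
// left by the same even amount must recover the 8-bit payload.
bool isARMSOImm(uint32_t Imm) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    uint32_t R = Rot ? ((Imm << Rot) | (Imm >> (32 - Rot))) : Imm;
    if (R <= 0xFF)
      return true;
  }
  return false;
}

int main() {
  // add r0, r0, #-256 can be emitted as sub r0, r0, #256: same test on 256.
  assert(isARMSOImm((uint32_t)std::llabs((int64_t)-256)));
  assert(!isARMSOImm(0x12345678)); // bits span more than one 8-bit window
  return 0;
}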
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
@@ -9030,39 +9368,38 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
switch (Constraint[0]) {
case 'l': // Low regs or general regs.
if (Subtarget->isThumb())
- return RCPair(0U, ARM::tGPRRegisterClass);
- else
- return RCPair(0U, ARM::GPRRegisterClass);
+ return RCPair(0U, &ARM::tGPRRegClass);
+ return RCPair(0U, &ARM::GPRRegClass);
case 'h': // High regs or no regs.
if (Subtarget->isThumb())
- return RCPair(0U, ARM::hGPRRegisterClass);
+ return RCPair(0U, &ARM::hGPRRegClass);
break;
case 'r':
- return RCPair(0U, ARM::GPRRegisterClass);
+ return RCPair(0U, &ARM::GPRRegClass);
case 'w':
if (VT == MVT::f32)
- return RCPair(0U, ARM::SPRRegisterClass);
+ return RCPair(0U, &ARM::SPRRegClass);
if (VT.getSizeInBits() == 64)
- return RCPair(0U, ARM::DPRRegisterClass);
+ return RCPair(0U, &ARM::DPRRegClass);
if (VT.getSizeInBits() == 128)
- return RCPair(0U, ARM::QPRRegisterClass);
+ return RCPair(0U, &ARM::QPRRegClass);
break;
case 'x':
if (VT == MVT::f32)
- return RCPair(0U, ARM::SPR_8RegisterClass);
+ return RCPair(0U, &ARM::SPR_8RegClass);
if (VT.getSizeInBits() == 64)
- return RCPair(0U, ARM::DPR_8RegisterClass);
+ return RCPair(0U, &ARM::DPR_8RegClass);
if (VT.getSizeInBits() == 128)
- return RCPair(0U, ARM::QPR_8RegisterClass);
+ return RCPair(0U, &ARM::QPR_8RegClass);
break;
case 't':
if (VT == MVT::f32)
- return RCPair(0U, ARM::SPRRegisterClass);
+ return RCPair(0U, &ARM::SPRRegClass);
break;
}
}
if (StringRef("{cc}").equals_lower(Constraint))
- return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass);
+ return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}
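For context, these constraint letters are the ones reachable from C/C++ inline assembly. A minimal usage sketch (hypothetical user code, ARM/Thumb targets only, not part of this patch): on Thumb, "l" constrains the operands to tGPR (r0-r7), matching the RCPair(0U, &ARM::tGPRRegClass) case above.

// Compile for an ARM/Thumb target; "l" requests a low register.
int add_low(int a, int b) {
  int r;
  asm("adds %0, %1, %2" : "=l"(r) : "l"(a), "l"(b) : "cc");
  return r;
}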
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 352d980..51d1205 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -41,6 +41,9 @@ namespace llvm {
// PIC mode.
WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
+ // Add pseudo op to model memcpy for struct byval.
+ COPY_STRUCT_BYVAL,
+
CALL, // Function call.
CALL_PRED, // Function call that's predicable.
CALL_NOLINK, // Function call with branch not branch-and-link.
@@ -53,6 +56,7 @@ namespace llvm {
PIC_ADD, // Add with a PC operand and a PIC label.
CMP, // ARM compare instructions.
+ CMN, // ARM CMN instructions.
CMPZ, // ARM compare that sets only Z flag.
CMPFP, // ARM VFP compare instruction, sets FPSCR.
CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR.
@@ -357,7 +361,8 @@ namespace llvm {
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
- virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const;
+ virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const;
Sched::Preference getSchedulingPreference(SDNode *N) const;
@@ -389,9 +394,9 @@ namespace llvm {
///
unsigned ARMPCLabelIndex;
- void addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT);
- void addDRTypeForNEON(EVT VT);
- void addQRTypeForNEON(EVT VT);
+ void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+ void addDRTypeForNEON(MVT VT);
+ void addQRTypeForNEON(MVT VT);
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
@@ -422,7 +427,8 @@ namespace llvm {
SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG) const;
SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
- SelectionDAG &DAG) const;
+ SelectionDAG &DAG,
+ TLSModel::Model model) const;
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -462,13 +468,7 @@ namespace llvm {
unsigned &VARegSize, unsigned &VARegSaveSize) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
/// HandleByVal - Target-specific cleanup for ByVal support.
@@ -532,6 +532,9 @@ namespace llvm {
MachineBasicBlock *MBB) const;
bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitStructByval(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
};
enum NEONModImmType {
@@ -542,7 +545,8 @@ namespace llvm {
namespace ARM {
- FastISel *createFastISel(FunctionLoweringInfo &funcInfo);
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
}
}
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index f04926a..c8966fb 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -827,6 +827,8 @@ class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
let Inst{7-4} = 0b0111;
let Inst{9-8} = 0b00;
let Inst{27-20} = opcod;
+
+ let Unpredictable{9-8} = 0b11;
}
// Misc Arithmetic instructions.
@@ -1862,7 +1864,6 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
string opc, string dt, string asm, string cstr, list<dag> pattern>
: N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
-
// Instruction operands.
bits<5> Vd;
bits<5> Vn;
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index b8f607e..31b0c41 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -31,7 +31,8 @@ ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
if (hasNOP()) {
- NopInst.setOpcode(ARM::NOP);
+ NopInst.setOpcode(ARM::HINT);
+ NopInst.addOperand(MCOperand::CreateImm(0));
NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
NopInst.addOperand(MCOperand::CreateReg(0));
} else {
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 1eb561d..6340a58 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -18,6 +18,9 @@
// Type profiles.
def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
+def SDT_ARMStructByVal : SDTypeProfile<0, 4,
+ [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
@@ -90,6 +93,10 @@ def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
+ SDT_ARMStructByVal,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad]>;
def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -121,6 +128,9 @@ def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp,
[SDNPOutGlue]>;
+def ARMcmn : SDNode<"ARMISD::CMN", SDT_ARMCmp,
+ [SDNPOutGlue]>;
+
def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
[SDNPOutGlue, SDNPCommutative]>;
@@ -161,53 +171,59 @@ def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
// ARM Instruction Predicate Definitions.
//
def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
- AssemblerPredicate<"HasV4TOps">;
+ AssemblerPredicate<"HasV4TOps", "armv4t">;
def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
- AssemblerPredicate<"HasV5TEOps">;
+ AssemblerPredicate<"HasV5TEOps", "armv5te">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
- AssemblerPredicate<"HasV6Ops">;
+ AssemblerPredicate<"HasV6Ops", "armv6">;
def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
- AssemblerPredicate<"HasV6T2Ops">;
+ AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
- AssemblerPredicate<"HasV7Ops">;
+ AssemblerPredicate<"HasV7Ops", "armv7">;
def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
- AssemblerPredicate<"FeatureVFP2">;
+ AssemblerPredicate<"FeatureVFP2", "VFP2">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3()">,
- AssemblerPredicate<"FeatureVFP3">;
+ AssemblerPredicate<"FeatureVFP3", "VFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
- AssemblerPredicate<"FeatureVFP4">;
+ AssemblerPredicate<"FeatureVFP4", "VFP4">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
- AssemblerPredicate<"FeatureNEON">;
+ AssemblerPredicate<"FeatureNEON", "NEON">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
- AssemblerPredicate<"FeatureFP16">;
+ AssemblerPredicate<"FeatureFP16","half-float">;
def HasDivide : Predicate<"Subtarget->hasDivide()">,
- AssemblerPredicate<"FeatureHWDiv">;
+ AssemblerPredicate<"FeatureHWDiv", "divide">;
def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
- AssemblerPredicate<"FeatureT2XtPk">;
+ AssemblerPredicate<"FeatureT2XtPk",
+ "pack/extract">;
def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">,
- AssemblerPredicate<"FeatureDSPThumb2">;
+ AssemblerPredicate<"FeatureDSPThumb2",
+ "thumb2-dsp">;
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
- AssemblerPredicate<"FeatureDB">;
+ AssemblerPredicate<"FeatureDB",
+ "data-barriers">;
def HasMP : Predicate<"Subtarget->hasMPExtension()">,
- AssemblerPredicate<"FeatureMP">;
+ AssemblerPredicate<"FeatureMP",
+ "mp-extensions">;
def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
def IsThumb : Predicate<"Subtarget->isThumb()">,
- AssemblerPredicate<"ModeThumb">;
+ AssemblerPredicate<"ModeThumb", "thumb">;
def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
- AssemblerPredicate<"ModeThumb,FeatureThumb2">;
+ AssemblerPredicate<"ModeThumb,FeatureThumb2",
+ "thumb2">;
def IsMClass : Predicate<"Subtarget->isMClass()">,
- AssemblerPredicate<"FeatureMClass">;
+ AssemblerPredicate<"FeatureMClass", "armv7m">;
def IsARClass : Predicate<"!Subtarget->isMClass()">,
- AssemblerPredicate<"!FeatureMClass">;
+ AssemblerPredicate<"!FeatureMClass",
+ "armv7a/r">;
def IsARM : Predicate<"!Subtarget->isThumb()">,
- AssemblerPredicate<"!ModeThumb">;
+ AssemblerPredicate<"!ModeThumb", "arm-mode">;
def IsIOS : Predicate<"Subtarget->isTargetIOS()">;
def IsNotIOS : Predicate<"!Subtarget->isTargetIOS()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@@ -220,7 +236,8 @@ def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
// But only select them if more precision in FP computation is allowed.
// Do not use them for Darwin platforms.
-def UseFusedMAC : Predicate<"!TM.Options.NoExcessFPPrecision && "
+def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast) && "
"!Subtarget->isTargetDarwin()">;
def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || "
"Subtarget->isTargetDarwin()">;
@@ -236,9 +253,9 @@ class RegConstraint<string C> {
// ARM specific transformation functions and pattern fragments.
//
-// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
-// so_imm_neg def below.
-def so_imm_neg_XFORM : SDNodeXForm<imm, [{
+// imm_neg_XFORM - Return an imm value packed into the format described for
+// imm_neg defs below.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
}]>;
@@ -257,7 +274,7 @@ def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
int64_t Value = -(int)N->getZExtValue();
return Value && ARM_AM::getSOImmVal(Value) != -1;
- }], so_imm_neg_XFORM> {
+ }], imm_neg_XFORM> {
let ParserMatchClass = so_imm_neg_asmoperand;
}
@@ -399,8 +416,11 @@ def pclabel : Operand<i32> {
}
// ADR instruction labels.
+def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
def adrlabel : Operand<i32> {
let EncoderMethod = "getAdrLabelOpValue";
+ let ParserMatchClass = AdrLabelAsmOperand;
+ let PrintMethod = "printAdrLabelOperand";
}
def neon_vcvt_imm32 : Operand<i32> {
@@ -570,7 +590,10 @@ def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
}
/// imm0_15 predicate - Immediate in the range [0,15].
-def Imm0_15AsmOperand: ImmAsmOperand { let Name = "Imm0_15"; }
+def Imm0_15AsmOperand: ImmAsmOperand {
+ let Name = "Imm0_15";
+ let DiagnosticType = "ImmRange0_15";
+}
def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 16;
}]> {
@@ -615,6 +638,11 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm0_65535AsmOperand;
}
+// imm0_65535_neg - An immediate whose negative value is in the range [0,65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+ return -Imm >= 0 && -Imm < 65536;
+}]>;
+
// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
// a relocatable expression.
//
@@ -940,9 +968,10 @@ include "ARMInstrFormats.td"
/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
/// binop that produces a value.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_bin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, string baseOpc, bit Commutable = 0> {
+ PatFrag opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
@@ -1003,38 +1032,15 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
}
-
- // Assembly aliases for optional destination operand when it's the same
- // as the source operand.
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
- so_imm:$imm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
- GPR:$Rm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
- so_reg_imm:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
- so_reg_reg:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
-
}
/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands is
/// reversed. The 'rr' form is only defined for the disassembler; for codegen
/// it is equivalent to the AsI1_bin_irs counterpart.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, string baseOpc, bit Commutable = 0> {
+ PatFrag opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
@@ -1094,30 +1100,6 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
}
-
- // Assembly aliases for optional destination operand when it's the same
- // as the source operand.
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
- so_imm:$imm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
- GPR:$Rm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
- so_reg_imm:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
- so_reg_reg:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
-
}
/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
@@ -1304,8 +1286,9 @@ class AI_exta_rrot_np<bits<8> opcod, string opc>
}
/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
- string baseOpc, bit Commutable = 0> {
+ bit Commutable = 0> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
@@ -1351,7 +1334,8 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
def rsr : AsI1<opcod, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, so_reg_reg:$shift),
DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
- [(set GPRnopc:$Rd, CPSR, (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>,
+ [(set GPRnopc:$Rd, CPSR,
+ (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>,
Requires<[IsARM]> {
bits<4> Rd;
bits<4> Rn;
@@ -1366,34 +1350,11 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
let Inst{3-0} = shift{3-0};
}
}
-
- // Assembly aliases for optional destination operand when it's the same
- // as the source operand.
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
- so_imm:$imm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
- GPR:$Rm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
- so_reg_imm:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPRnopc:$Rdn, GPRnopc:$Rdn,
- so_reg_reg:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
}
/// AI1_rsc_irs - Define instructions and patterns for rsc
-multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode,
- string baseOpc> {
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
@@ -1450,29 +1411,6 @@ multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode,
let Inst{3-0} = shift{3-0};
}
}
-
- // Assembly aliases for optional destination operand when it's the same
- // as the source operand.
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
- so_imm:$imm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
- GPR:$Rm, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
- so_reg_imm:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
- so_reg_reg:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsARM]>;
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
@@ -1511,9 +1449,10 @@ multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
// Note: We use the complex addrmode_imm12 rather than just an input
// GPR and a constrained immediate so that we can use this to match
// frame index references and avoid matching constant pool references.
- def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), (ins addrmode_imm12:$addr),
+ def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt),
+ (ins addrmode_imm12:$addr),
AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
- [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
+ [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
bits<4> Rt;
bits<17> addr;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
@@ -1521,9 +1460,10 @@ multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm12
}
- def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), (ins ldst_so_reg:$shift),
- AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
- [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
+ def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt),
+ (ins ldst_so_reg:$shift),
+ AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+ [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
bits<4> Rt;
bits<17> shift;
let shift{4} = 0; // Inst{4} = 0
@@ -1581,9 +1521,10 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm12
}
- def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPRnopc:$Rt, ldst_so_reg:$shift),
- AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
- [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
+ def rs : AI2ldst<0b011, 0, isByte, (outs),
+ (ins GPRnopc:$Rt, ldst_so_reg:$shift),
+ AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+ [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
bits<4> Rt;
bits<17> shift;
let shift{4} = 0; // Inst{4} = 0
@@ -1655,33 +1596,18 @@ def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
NoItinerary, []>;
}
-def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", []>,
- Requires<[IsARM, HasV6T2]> {
- let Inst{27-16} = 0b001100100000;
- let Inst{15-8} = 0b11110000;
- let Inst{7-0} = 0b00000000;
+def HINT : AI<(outs), (ins imm0_255:$imm), MiscFrm, NoItinerary,
+ "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> {
+ bits<8> imm;
+ let Inst{27-8} = 0b00110010000011110000;
+ let Inst{7-0} = imm;
}
-def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", []>,
- Requires<[IsARM, HasV6T2]> {
- let Inst{27-16} = 0b001100100000;
- let Inst{15-8} = 0b11110000;
- let Inst{7-0} = 0b00000001;
-}
-
-def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", []>,
- Requires<[IsARM, HasV6T2]> {
- let Inst{27-16} = 0b001100100000;
- let Inst{15-8} = 0b11110000;
- let Inst{7-0} = 0b00000010;
-}
-
-def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", []>,
- Requires<[IsARM, HasV6T2]> {
- let Inst{27-16} = 0b001100100000;
- let Inst{15-8} = 0b11110000;
- let Inst{7-0} = 0b00000011;
-}
+def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
"\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
@@ -1694,16 +1620,10 @@ def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
let Inst{27-20} = 0b01101000;
let Inst{7-4} = 0b1011;
let Inst{11-8} = 0b1111;
+ let Unpredictable{11-8} = 0b1111;
}
-def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "",
- []>, Requires<[IsARM, HasV6T2]> {
- let Inst{27-16} = 0b001100100000;
- let Inst{15-8} = 0b11110000;
- let Inst{7-0} = 0b00000100;
-}
-
-// The i32imm operand $val can be used by a debugger to store more information
+// The 16-bit operand $val can be used by a debugger to store more information
// about the breakpoint.
def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
"bkpt", "\t$val", []>, Requires<[IsARM]> {
@@ -1922,7 +1842,7 @@ let isCall = 1,
// at least be a pseudo instruction expanding to the predicated version
// at MC lowering time.
Defs = [LR], Uses = [SP] in {
- def BL : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops),
+ def BL : ABXI<0b1011, (outs), (ins bl_target:$func),
IIC_Br, "bl\t$func",
[(ARMcall tglobaladdr:$func)]>,
Requires<[IsARM]> {
@@ -1932,7 +1852,7 @@ let isCall = 1,
let DecoderMethod = "DecodeBranchImmInstruction";
}
- def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops),
+ def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func),
IIC_Br, "bl", "\t$func",
[(ARMcall_pred tglobaladdr:$func)]>,
Requires<[IsARM]> {
@@ -1942,7 +1862,7 @@ let isCall = 1,
}
// ARMv5T and above
- def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
+ def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
IIC_Br, "blx\t$func",
[(ARMcall GPR:$func)]>,
Requires<[IsARM, HasV5T]> {
@@ -1951,7 +1871,7 @@ let isCall = 1,
let Inst{3-0} = func;
}
- def BLX_pred : AI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
+ def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
IIC_Br, "blx", "\t$func",
[(ARMcall_pred GPR:$func)]>,
Requires<[IsARM, HasV5T]> {
@@ -1962,19 +1882,18 @@ let isCall = 1,
// ARMv4T
// Note: Restrict $func to the tGPR regclass to prevent it being in LR.
- def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+ def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
Requires<[IsARM, HasV4T]>;
// ARMv4
- def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+ def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
Requires<[IsARM, NoV4T]>;
// mov lr, pc; b if callee is marked noreturn to avoid confusing the
// return stack predictor.
- def BMOVPCB_CALL : ARMPseudoInst<(outs),
- (ins bl_target:$func, variable_ops),
+ def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func),
8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
Requires<[IsARM]>;
}
@@ -2044,18 +1963,16 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
// Tail calls.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
- IIC_Br, []>;
+ def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>;
- def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops),
- IIC_Br, []>;
+ def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>;
- def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst, variable_ops),
+ def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst),
4, IIC_Br, [],
(Bcc br_target:$dst, (ops 14, zero_reg))>,
Requires<[IsARM]>;
- def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops),
+ def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
4, IIC_Br, [],
(BX GPR:$dst)>,
Requires<[IsARM]>;
@@ -2509,6 +2426,7 @@ multiclass AI2_stridx<bit isByte, string opc,
let Inst{23} = offset{12};
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
+ let Inst{4} = 0;
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
@@ -2768,7 +2686,7 @@ defm STRHT : AI3strT<0b1011, "strht">;
multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
InstrItinClass itin, InstrItinClass itin_upd> {
// IA is the default, so no need for an explicit suffix on the
- // mnemonic here. Without it is the cannonical spelling.
+ // mnemonic here. Without it is the canonical spelling.
def IA :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeNone, f, itin,
@@ -2900,9 +2818,6 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
let Inst{15-12} = Rd;
}
-def : ARMInstAlias<"movs${p} $Rd, $Rm",
- (MOVr GPR:$Rd, GPR:$Rm, pred:$p, CPSR)>;
-
// A version for the smaller set of tail call registers.
let neverHasSideEffects = 1 in
def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
@@ -3113,10 +3028,10 @@ def UBFX : I<(outs GPR:$Rd),
defm ADD : AsI1_bin_irs<0b0100, "add",
IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(add node:$LHS, node:$RHS)>, "ADD", 1>;
+ BinOpFrag<(add node:$LHS, node:$RHS)>, 1>;
defm SUB : AsI1_bin_irs<0b0010, "sub",
IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(sub node:$LHS, node:$RHS)>, "SUB">;
+ BinOpFrag<(sub node:$LHS, node:$RHS)>>;
// ADD and SUB with 's' bit set.
//
@@ -3134,15 +3049,13 @@ defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
defm ADC : AI1_adde_sube_irs<0b0101, "adc",
- BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>,
- "ADC", 1>;
+ BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>,
- "SBC">;
+ BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
-defm RSB : AsI1_rbin_irs <0b0011, "rsb",
- IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(sub node:$LHS, node:$RHS)>, "RSB">;
+defm RSB : AsI1_rbin_irs<0b0011, "rsb",
+ IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+ BinOpFrag<(sub node:$LHS, node:$RHS)>>;
// FIXME: Eliminate them if we can write def : Pat patterns which defines
// CPSR and the implicit def of CPSR is not needed.
@@ -3150,8 +3063,7 @@ defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
defm RSC : AI1_rsc_irs<0b0111, "rsc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>,
- "RSC">;
+ BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
@@ -3163,6 +3075,11 @@ def : ARMPat<(add GPR:$src, so_imm_neg:$imm),
def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
(SUBSri GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm),
+ (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
+ (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
// for part of the negation.
@@ -3190,7 +3107,7 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{3-0} = Rm;
-
+
let Unpredictable{11-8} = 0b1111;
}
@@ -3355,16 +3272,16 @@ def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos),
defm AND : AsI1_bin_irs<0b0000, "and",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(and node:$LHS, node:$RHS)>, "AND", 1>;
+ BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
defm ORR : AsI1_bin_irs<0b1100, "orr",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(or node:$LHS, node:$RHS)>, "ORR", 1>;
+ BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
defm EOR : AsI1_bin_irs<0b0001, "eor",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(xor node:$LHS, node:$RHS)>, "EOR", 1>;
+ BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
defm BIC : AsI1_bin_irs<0b1110, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(and node:$LHS, (not node:$RHS))>, "BIC">;
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
// like in the actual instruction encoding. The complexity of mapping the mask
@@ -3482,27 +3399,28 @@ class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
// FIXME: The v5 pseudos are only necessary for the additional Constraint
// property. Remove them when it's possible to add those properties
-// on an individual MachineInstr, not just an instuction description.
-let isCommutable = 1 in {
-def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
- [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
- Requires<[IsARM, HasV6]> {
+// on an individual MachineInstr, not just an instruction description.
+let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
+ [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
+ Requires<[IsARM, HasV6]> {
let Inst{15-12} = 0b0000;
let Unpredictable{15-12} = 0b1111;
}
let Constraints = "@earlyclobber $Rd" in
def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
- pred:$p, cc_out:$s),
- 4, IIC_iMUL32,
- [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
- (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ pred:$p, cc_out:$s),
+ 4, IIC_iMUL32,
+ [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
+ (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
}
def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
+ IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
Requires<[IsARM, HasV6]> {
bits<4> Ra;
@@ -3511,8 +3429,8 @@ def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
let Constraints = "@earlyclobber $Rd" in
def MLAv5: ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s),
- 4, IIC_iMAC32,
+ (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s),
+ 4, IIC_iMAC32,
[(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))],
(MLA GPR:$Rd, GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>;
@@ -3630,8 +3548,7 @@ def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (sub GPR:$Ra, (mulhs GPR:$Rn, GPR:$Rm)))]>,
+ IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6]>;
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
@@ -3912,49 +3829,85 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
(CMPrsr GPR:$src, so_reg_reg:$rhs)>;
-// FIXME: We have to be careful when using the CMN instruction and comparison
-// with 0. One would expect these two pieces of code should give identical
-// results:
-//
-// rsbs r1, r1, 0
-// cmp r0, r1
-// mov r0, #0
-// it ls
-// mov r0, #1
-//
-// and:
-//
-// cmn r0, r1
-// mov r0, #0
-// it ls
-// mov r0, #1
-//
-// However, the CMN gives the *opposite* result when r1 is 0. This is because
-// the carry flag is set in the CMP case but not in the CMN case. In short, the
-// CMP instruction doesn't perform a truncate of the (logical) NOT of 0 plus the
-// value of r0 and the carry bit (because the "carry bit" parameter to
-// AddWithCarry is defined as 1 in this case, the carry flag will always be set
-// when r0 >= 0). The CMN instruction doesn't perform a NOT of 0 so there is
-// never a "carry" when this AddWithCarry is performed (because the "carry bit"
-// parameter to AddWithCarry is defined as 0).
-//
-// When x is 0 and unsigned:
-//
-// x = 0
-// ~x = 0xFFFF FFFF
-// ~x + 1 = 0x1 0000 0000
-// (-x = 0) != (0x1 0000 0000 = ~x + 1)
-//
-// Therefore, we should disable CMN when comparing against zero, until we can
-// limit when the CMN instruction is used (when we know that the RHS is not 0 or
-// when it's a comparison which doesn't look at the 'carry' flag).
-//
-// (See the ARM docs for the "AddWithCarry" pseudo-code.)
-//
-// This is related to <rdar://problem/7569620>.
-//
-//defm CMN : AI1_cmp_irs<0b1011, "cmn",
-// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
+// CMN register-integer
+let isCompare = 1, Defs = [CPSR] in {
+def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
+ "cmn", "\t$Rn, $imm",
+ [(ARMcmn GPR:$Rn, so_imm:$imm)]> {
+ bits<4> Rn;
+ bits<12> imm;
+ let Inst{25} = 1;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-0} = imm;
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+// CMN register-register/shift
+def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
+ "cmn", "\t$Rn, $Rm",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPR:$Rn, GPR:$Rm)]> {
+ bits<4> Rn;
+ bits<4> Rm;
+ let isCommutable = 1;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-4} = 0b00000000;
+ let Inst{3-0} = Rm;
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsi : AI1<0b1011, (outs),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
+ "cmn", "\t$Rn, $shift",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPR:$Rn, so_reg_imm:$shift)]> {
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsr : AI1<0b1011, (outs),
+ (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
+ "cmn", "\t$Rn, $shift",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPRnopc:$Rn, so_reg_reg:$shift)]> {
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
+
+ let Unpredictable{15-12} = 0b1111;
+}
+
+}
+
+def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm),
+ (CMNri GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
+ (CMNri GPR:$src, so_imm_neg:$imm)>;
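// Illustrative effect (assuming the usual so_imm_neg semantics: it matches an
// immediate whose negation is an encodable so_imm): a compare against a
// negative immediate can now be selected as CMN, e.g.
//   cmp r0, #-42   ->   cmn r0, #42
// since CMN sets flags on Rn + imm, which equals Rn - (-imm).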
// Note that TST/TEQ don't set all the same flags that CMP does!
defm TST : AI1_cmp_irs<0b1000, "tst",
@@ -3964,16 +3917,6 @@ defm TEQ : AI1_cmp_irs<0b1001, "teq",
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>;
-defm CMNz : AI1_cmp_irs<0b1011, "cmn",
- IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr,
- BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
-
-//def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm),
-// (CMNri GPR:$src, so_imm_neg:$imm)>;
-
-def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
- (CMNzri GPR:$src, so_imm_neg:$imm)>;
-
// Pseudo i64 compares for some floating point compares.
let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
Defs = [CPSR] in {
@@ -4121,11 +4064,8 @@ def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR] in {
-def ABS : ARMPseudoInst<
- (outs GPR:$dst), (ins GPR:$src),
- 8, NoItinerary, []>;
-}
+let usesCustomInserter = 1, Defs = [CPSR] in
+def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
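// Rough expansion sketch (the real sequence is produced by the custom
// inserter, so this is illustrative only):
//   movs  rDst, rSrc       @ copy the operand and set N/Z
//   rsbmi rDst, rDst, #0   @ negate when the copy came out negative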
let usesCustomInserter = 1 in {
let Defs = [CPSR] in {
@@ -4242,6 +4182,13 @@ let usesCustomInserter = 1 in {
}
}
+let usesCustomInserter = 1 in {
+ def COPY_STRUCT_BYVAL_I32 : PseudoInst<
+ (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
+ NoItinerary,
+ [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
+}
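// Note: usesCustomInserter = 1 means this pseudo is rewritten by the target's
// EmitInstrWithCustomInserter hook after instruction selection (presumably
// into an inline load/store copy loop for the byval argument).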
+
let mayLoad = 1 in {
def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary,
@@ -4280,10 +4227,10 @@ def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>,
// SWP/SWPB are deprecated in V6/V7.
let mayLoad = 1, mayStore = 1 in {
-def SWP : AIswp<0, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr),
- "swp", []>;
-def SWPB: AIswp<1, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr),
- "swpb", []>;
+def SWP : AIswp<0, (outs GPRnopc:$Rt),
+ (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>;
+def SWPB: AIswp<1, (outs GPRnopc:$Rt),
+ (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>;
}
//===----------------------------------------------------------------------===//
@@ -4609,8 +4556,8 @@ class MovRRCopro<string opc, bit direction, list<dag> pattern = []>
}
def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
- [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2,
- imm:$CRm)]>;
+ [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, imm:$CRm)]>;
def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
@@ -4637,8 +4584,8 @@ class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
}
def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
- [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2,
- imm:$CRm)]>;
+ [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, imm:$CRm)]>;
def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
//===----------------------------------------------------------------------===//
@@ -4658,7 +4605,8 @@ def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
let Unpredictable{11-0} = 0b110100001111;
}
-def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>, Requires<[IsARM]>;
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>,
+ Requires<[IsARM]>;
// The MRSsys instruction is the MRS instruction from the ARM ARM,
// section B9.3.9, with the R bit set to 1.
@@ -5114,7 +5062,7 @@ def : ARMInstAlias<"add${s}${p} $Rd, $imm",
(SUBri GPR:$Rd, GPR:$Rd, so_imm_neg:$imm, pred:$p, cc_out:$s)>;
// Same for CMP <--> CMN via so_imm_neg
def : ARMInstAlias<"cmp${p} $Rd, $imm",
- (CMNzri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
+ (CMNri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
def : ARMInstAlias<"cmn${p} $Rd, $imm",
(CMPri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
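// Illustrative expansion of the two aliases above (so_imm_neg matches an
// immediate whose negation is encodable):
//   cmp r0, #-8   assembles as   cmn r0, #8
//   cmn r0, #-8   assembles as   cmp r0, #8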
@@ -5123,6 +5071,7 @@ def : ARMInstAlias<"cmn${p} $Rd, $imm",
// FIXME: We need C++ parser hooks to map the alias to the MOV
// encoding. It seems we should be able to do that sort of thing
// in tblgen, but it could get ugly.
+let TwoOperandAliasConstraint = "$Rm = $Rd" in {
def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm",
(ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
cc_out:$s)>;
@@ -5135,8 +5084,10 @@ def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm",
def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
(ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
cc_out:$s)>;
+}
def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+let TwoOperandAliasConstraint = "$Rn = $Rd" in {
def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
cc_out:$s)>;
@@ -5149,32 +5100,7 @@ def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm",
def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
cc_out:$s)>;
-// shifter instructions also support a two-operand form.
-def : ARMInstAlias<"asr${s}${p} $Rm, $imm",
- (ASRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"lsr${s}${p} $Rm, $imm",
- (LSRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"lsl${s}${p} $Rm, $imm",
- (LSLi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"ror${s}${p} $Rm, $imm",
- (RORi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"asr${s}${p} $Rn, $Rm",
- (ASRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
- cc_out:$s)>;
-def : ARMInstAlias<"lsr${s}${p} $Rn, $Rm",
- (LSRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
- cc_out:$s)>;
-def : ARMInstAlias<"lsl${s}${p} $Rn, $Rm",
- (LSLr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
- cc_out:$s)>;
-def : ARMInstAlias<"ror${s}${p} $Rn, $Rm",
- (RORr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
- cc_out:$s)>;
-
-
-// 'mul' instruction can be specified with only two operands.
-def : ARMInstAlias<"mul${s}${p} $Rn, $Rm",
- (MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p, cc_out:$s)>;
+}
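// With TwoOperandAliasConstraint set on the blocks above, the assembly
// matcher now derives the two-operand spellings that the deleted InstAlias
// entries used to spell out by hand, e.g. (illustrative):
//   asr r1, #3    is parsed as   asr r1, r1, #3
//   lsl r2, r3    is parsed as   lsl r2, r2, r3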
// "neg" is and alias for "rsb rd, rn, #0"
def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index fd8ac0b..3134088 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1962,7 +1962,7 @@ def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
let Inst{4} = Rn{5};
}
-def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
+def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
addrmode6oneL32> {
let Inst{7} = lane{0};
let Inst{5-4} = Rn{5-4};
@@ -2300,14 +2300,14 @@ class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
(ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
(ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
@@ -2325,7 +2325,7 @@ class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyD, ValueType TyQ, Intrinsic IntOp>
+ ValueType TyD, ValueType TyQ, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
(ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
[(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>;
@@ -2343,7 +2343,7 @@ class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
(ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
[(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>;
@@ -2368,6 +2368,8 @@ class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
(outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
[(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
}
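// Illustrative consequence of the constraint above: the matcher folds the
// destination into the first source operand, so for a D-register add
//   vadd.i16 d0, d1   is accepted as   vadd.i16 d0, d0, d1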
// Same as N3VD but no data type.
@@ -2379,6 +2381,8 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
(outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
OpcodeStr, "$Vd, $Vn, $Vm", "",
[(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
}
@@ -2391,6 +2395,8 @@ class N3VDSL<bits<2> op21_20, bits<4> op11_8,
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$Vn),
(Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
}
class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
@@ -2401,6 +2407,8 @@ class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$Vn),
(Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
}
@@ -2411,6 +2419,8 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
(outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
[(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
}
class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -2420,6 +2430,8 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
(outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
OpcodeStr, "$Vd, $Vn, $Vm", "",
[(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
}
class N3VQSL<bits<2> op21_20, bits<4> op11_8,
@@ -2432,6 +2444,8 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8,
(ResTy (ShOp (ResTy QPR:$Vn),
(ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
}
class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
@@ -2443,21 +2457,25 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
(ResTy (ShOp (ResTy QPR:$Vn),
(ResTy (NEONvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
}
// Basic 3-register intrinsics, both double- and quad-register.
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
Format f, InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
}
class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
- string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
+ string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
: N3VLane32<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2468,7 +2486,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
let isCommutable = 0;
}
class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
- string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
+ string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
: N3VLane16<0, 1, op21_20, op11_8, 1, 0,
(outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2479,26 +2497,29 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
}
class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
Format f, InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin,
OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> {
+ let TwoOperandAliasConstraint = "$Vm = $Vd";
let isCommutable = 0;
}
class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
Format f, InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
}
class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3VLane32<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2510,7 +2531,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
}
class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3VLane16<1, 1, op21_20, op11_8, 1, 0,
(outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2522,11 +2543,12 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
}
class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
Format f, InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin,
OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> {
+ let TwoOperandAliasConstraint = "$Vm = $Vd";
let isCommutable = 0;
}
@@ -2606,7 +2628,7 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+ ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2614,7 +2636,7 @@ class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
(Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>;
class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+ ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2625,7 +2647,7 @@ class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// The destination register is also used as the first source operand register.
class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2633,7 +2655,7 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
(OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2678,7 +2700,7 @@ class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
// Long Intrinsic-Op vector operations with explicit extend (VABAL).
class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
@@ -2691,7 +2713,7 @@ class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// a quad-register and is also used as the first source operand register.
class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
@@ -2699,7 +2721,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
(TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>;
class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$Vd),
(ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
@@ -2712,7 +2734,7 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
imm:$lane)))))]>;
class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$Vd),
(ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
@@ -2727,7 +2749,7 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
// Narrowing 3-register intrinsics.
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
- Intrinsic IntOp, bit Commutable>
+ SDPatternOperator IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
@@ -2780,7 +2802,7 @@ class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// Long 3-register intrinsics with explicit extend (VABDL).
class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
@@ -2793,7 +2815,7 @@ class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// Long 3-register intrinsics.
class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable>
+ ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
@@ -2802,7 +2824,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
}
class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2812,7 +2834,7 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
imm:$lane)))))]>;
class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
(outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
@@ -2830,6 +2852,8 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
[(set QPR:$Vd, (OpNode (TyQ QPR:$Vn),
(TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
}
@@ -2837,14 +2861,14 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
(ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
(ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
@@ -2855,7 +2879,7 @@ class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD,
OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
@@ -2863,7 +2887,7 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ,
OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
@@ -2871,6 +2895,7 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
// Shift by immediate,
// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
Format f, InstrItinClass itin, Operand ImmTy,
string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
@@ -2885,6 +2910,7 @@ class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
(outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin,
OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
[(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>;
+}
// Long shift by immediate.
class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
@@ -2908,6 +2934,7 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
// Shift right by immediate and accumulate,
// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
Operand ImmTy, string OpcodeStr, string Dt,
ValueType Ty, SDNode ShOp>
@@ -2924,9 +2951,11 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
[(set QPR:$Vd, (Ty (add QPR:$src1,
(Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>;
+}
// Shift by immediate and insert,
// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
Operand ImmTy, Format f, string OpcodeStr, string Dt,
ValueType Ty,SDNode ShOp>
@@ -2941,19 +2970,20 @@ class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
(ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ,
OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
[(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>;
+}
// Convert, with fractional bits immediate,
// both double- and quad-register.
class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
- Intrinsic IntOp>
+ SDPatternOperator IntOp>
: N2VImm<op24, op23, op11_8, op7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>;
class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
- Intrinsic IntOp>
+ SDPatternOperator IntOp>
: N2VImm<op24, op23, op11_8, op7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
@@ -3023,7 +3053,7 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
bits<5> op11_7, bit op4,
InstrItinClass itinD, InstrItinClass itinQ,
- string OpcodeStr, string Dt, Intrinsic IntOp> {
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
// 64-bit vector types.
def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
@@ -3064,7 +3094,7 @@ multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
bits<5> op11_7, bit op6, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- Intrinsic IntOp> {
+ SDPatternOperator IntOp> {
def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
itin, OpcodeStr, !strconcat(Dt, "16"),
v8i8, v8i16, IntOp>;
@@ -3152,7 +3182,7 @@ multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, string Dt,
- Intrinsic IntOp, bit Commutable = 0> {
+ SDPatternOperator IntOp, bit Commutable = 0> {
// 64-bit vector types.
def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16,
OpcodeStr, !strconcat(Dt, "16"),
@@ -3173,7 +3203,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, string Dt,
- Intrinsic IntOp> {
+ SDPatternOperator IntOp> {
// 64-bit vector types.
def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16,
OpcodeStr, !strconcat(Dt, "16"),
@@ -3194,7 +3224,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
multiclass N3VIntSL_HS<bits<4> op11_8,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
- string OpcodeStr, string Dt, Intrinsic IntOp> {
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>;
def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
@@ -3210,7 +3240,7 @@ multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, string Dt,
- Intrinsic IntOp, bit Commutable = 0>
+ SDPatternOperator IntOp, bit Commutable = 0>
: N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
OpcodeStr, Dt, IntOp, Commutable> {
def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16,
@@ -3224,7 +3254,7 @@ multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, string Dt,
- Intrinsic IntOp>
+ SDPatternOperator IntOp>
: N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
OpcodeStr, Dt, IntOp> {
def v8i8 : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16,
@@ -3241,7 +3271,7 @@ multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, string Dt,
- Intrinsic IntOp, bit Commutable = 0>
+ SDPatternOperator IntOp, bit Commutable = 0>
: N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
OpcodeStr, Dt, IntOp, Commutable> {
def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32,
@@ -3255,7 +3285,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, string Dt,
- Intrinsic IntOp>
+ SDPatternOperator IntOp>
: N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
OpcodeStr, Dt, IntOp> {
def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32,
@@ -3270,7 +3300,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
// source operand element sizes of 16, 32 and 64 bits:
multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, string Dt,
- Intrinsic IntOp, bit Commutable = 0> {
+ SDPatternOperator IntOp, bit Commutable = 0> {
def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4,
OpcodeStr, !strconcat(Dt, "16"),
v8i8, v8i16, IntOp, Commutable>;
@@ -3330,7 +3360,7 @@ multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt,
- Intrinsic IntOp, bit Commutable = 0> {
+ SDPatternOperator IntOp, bit Commutable = 0> {
def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "16"),
v4i32, v4i16, IntOp, Commutable>;
@@ -3341,7 +3371,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
- Intrinsic IntOp> {
+ SDPatternOperator IntOp> {
def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
@@ -3352,7 +3382,7 @@ multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
string OpcodeStr, string Dt,
- Intrinsic IntOp, bit Commutable = 0>
+ SDPatternOperator IntOp, bit Commutable = 0>
: N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt,
IntOp, Commutable> {
def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16,
@@ -3363,7 +3393,7 @@ multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// ....with explicit extend (VABDL).
multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> {
+ SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> {
def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
OpcodeStr, !strconcat(Dt, "8"),
v8i16, v8i8, IntOp, ExtOp, Commutable>;
@@ -3436,7 +3466,7 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8,
// element sizes of 8, 16 and 32 bits:
multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itinD, InstrItinClass itinQ,
- string OpcodeStr, string Dt, Intrinsic IntOp,
+ string OpcodeStr, string Dt, SDPatternOperator IntOp,
SDNode OpNode> {
// 64-bit vector types.
def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
@@ -3459,7 +3489,7 @@ multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// element sizes of 8, 16 and 32 bits:
multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itinD, InstrItinClass itinQ,
- string OpcodeStr, string Dt, Intrinsic IntOp> {
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
// 64-bit vector types.
def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD,
OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
@@ -3506,7 +3536,7 @@ multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
// First with only element sizes of 16 and 32 bits:
multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
- string OpcodeStr, string Dt, Intrinsic IntOp> {
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32,
@@ -3514,7 +3544,7 @@ multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
}
multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
- string OpcodeStr, string Dt, Intrinsic IntOp> {
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>;
def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
@@ -3524,7 +3554,7 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
// ....then also with element size of 8 bits:
multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin16, InstrItinClass itin32,
- string OpcodeStr, string Dt, Intrinsic IntOp>
+ string OpcodeStr, string Dt, SDPatternOperator IntOp>
: N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> {
def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16,
OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
@@ -3533,7 +3563,7 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// ....with explicit extend (VABAL).
multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> {
+ SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> {
def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
IntOp, ExtOp, OpNode>;
@@ -3550,7 +3580,7 @@ multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// element sizes of 8, 16 and 32 bits:
multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
bits<5> op11_7, bit op4,
- string OpcodeStr, string Dt, Intrinsic IntOp> {
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
// 64-bit vector types.
def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
@@ -3573,7 +3603,7 @@ multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
// element sizes of 8, 16 and 32 bits:
multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
bits<5> op11_7, bit op4,
- string OpcodeStr, string Dt, Intrinsic IntOp> {
+ string OpcodeStr, string Dt, SDPatternOperator IntOp> {
// 64-bit vector types.
def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
@@ -3668,33 +3698,6 @@ multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
// imm6 = xxxxxx
-
- // Aliases for two-operand forms (source and dest regs the same).
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i8"))
- DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>;
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i16"))
- DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>;
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v2i32"))
- DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>;
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v1i64"))
- DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>;
-
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v16i8"))
- QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>;
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i16"))
- QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>;
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i32"))
- QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>;
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "v2i64"))
- QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>;
}
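// The aliases deleted above are not lost: the TwoOperandAliasConstraint on
// the N2VDSh/N2VQSh shift classes generates the same two-operand forms,
// e.g. (illustrative) vshr.s16 d0, #5 is still accepted as
// vshr.s16 d0, d0, #5.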
// Neon Shift-Accumulate vector operations,
@@ -4133,16 +4136,16 @@ def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
Requires<[HasVFP4,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
-def : Pat<(v2f32 (fma DPR:$src1, DPR:$Vn, DPR:$Vm)),
+def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
(VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
Requires<[HasVFP4]>;
-def : Pat<(v4f32 (fma QPR:$src1, QPR:$Vn, QPR:$Vm)),
+def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
(VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
Requires<[HasVFP4]>;
-def : Pat<(v2f32 (fma (fneg DPR:$src1), DPR:$Vn, DPR:$Vm)),
+def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)),
(VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
Requires<[HasVFP4]>;
-def : Pat<(v4f32 (fma (fneg QPR:$src1), QPR:$Vn, QPR:$Vm)),
+def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
(VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
Requires<[HasVFP4]>;
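// Operand-order reminder (a property of llvm.fma, not of this patch):
// fma(a, b, c) computes a*b + c, so the accumulator is the last intrinsic
// operand, while VFMA/VFMS take it first as $src1. Hence the rewritten
// patterns, illustratively:
//   (fma Vn, Vm, src1)  ->  VFMAfd src1, Vn, Vm   @ src1 + Vn*Vm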
@@ -4305,6 +4308,7 @@ def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
// VBIC : Vector Bitwise Bit Clear (AND NOT)
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
(ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
"vbic", "$Vd, $Vn, $Vm", "",
@@ -4315,6 +4319,7 @@ def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
"vbic", "$Vd, $Vn, $Vm", "",
[(set QPR:$Vd, (v4i32 (and QPR:$Vn,
(vnotq QPR:$Vm))))]>;
+}
def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
(outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
@@ -4820,14 +4825,14 @@ defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
// VCLZ : Vector Count Leading Zeros
defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
- int_arm_neon_vclz>;
+ ctlz>;
// VCNT : Vector Count One Bits
def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
IIC_VCNTiD, "vcnt", "8",
- v8i8, v8i8, int_arm_neon_vcnt>;
+ v8i8, v8i8, ctpop>;
def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
IIC_VCNTiQ, "vcnt", "8",
- v16i8, v16i8, int_arm_neon_vcnt>;
+ v16i8, v16i8, ctpop>;
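// Selecting on the generic ctlz/ctpop DAG nodes (rather than the ARM-specific
// intrinsics) lets target-independent IR reach these patterns directly; a
// minimal sketch of the mapping:
//   ctlz  (v4i32)  -> vclz.i32 q0, q1
//   ctpop (v16i8)  -> vcnt.8   q0, q1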
// Vector Swap
def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0,
@@ -5308,6 +5313,9 @@ def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>;
// VEXT : Vector Extract
+
+// All of these have a two-operand InstAlias.
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
: N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd),
(ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm,
@@ -5327,6 +5335,7 @@ class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
bits<4> index;
let Inst{11-8} = index{3-0};
}
+}
def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> {
let Inst{11-8} = index{3-0};
@@ -5588,82 +5597,87 @@ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
// Vector lengthening move with load, matching extending loads.
// extload, zextload and sextload for a standard lengthening load. Example:
-// Lengthen_Single<"8", "i16", "i8"> = Pat<(v8i16 (extloadvi8 addrmode5:$addr))
-// (VMOVLuv8i16 (VLDRD addrmode5:$addr))>;
+// Lengthen_Single<"8", "i16", "8"> =
+// Pat<(v8i16 (extloadvi8 addrmode6:$addr))
+// (VMOVLuv8i16 (VLD1d8 addrmode6:$addr,
+// (f64 (IMPLICIT_DEF)), (i32 0)))>;
multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> {
+ let AddedComplexity = 10 in {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)),
(!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
- (VLDRD addrmode5:$addr))>;
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)),
(!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
- (VLDRD addrmode5:$addr))>;
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)),
(!cast<Instruction>("VMOVLsv" # DestLanes # DestTy)
- (VLDRD addrmode5:$addr))>;
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+ }
}
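// Worked instance (illustrative, following the example comment above):
// Lengthen_Single<"8", "i16", "8"> turns a zextload of <8 x i8> into
// <8 x i16> roughly as:
//   vld1.8   {d0}, [r0]   @ VLD1d8
//   vmovl.u8 q0, d0       @ VMOVLuv8i16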
// extload, zextload and sextload for a lengthening load which only uses
// half the lanes available. Example:
// Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> =
-// Pat<(v4i16 (extloadvi8 addrmode5:$addr))
-// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-// (VLDRS addrmode5:$addr),
-// ssub_0)),
+// Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)),
+// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+// (f64 (IMPLICIT_DEF)), (i32 0))),
// dsub_0)>;
multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
string InsnLanes, string InsnTy> {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)),
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
dsub_0)>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)),
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
dsub_0)>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)),
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
dsub_0)>;
}
// extload, zextload and sextload for a lengthening load followed by another
// lengthening load, to quadruple the initial length.
//
-// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32", qsub_0> =
-// Pat<(v4i32 (extloadvi8 addrmode5:$addr))
-// (EXTRACT_SUBREG (VMOVLuv4i32
-// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-// (VLDRS addrmode5:$addr),
-// ssub_0)),
+// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v4i32 (extloadvi8 addrmode6oneL32:$addr))
+// (EXTRACT_SUBREG (VMOVLuv4i32
+// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+// (f64 (IMPLICIT_DEF)),
+// (i32 0))),
// dsub_0)),
-// qsub_0)>;
+// dsub_0)>;
multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
string Insn2Ty> {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
(!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
- ssub_0)), dsub_0))>;
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0))>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
(!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
- ssub_0)), dsub_0))>;
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0))>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
(!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
- ssub_0)), dsub_0))>;
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0))>;
}
// extload, zextload and sextload for a lengthening load followed by another
@@ -5671,45 +5685,43 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
// requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
//
// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> =
-// Pat<(v4i32 (extloadvi8 addrmode5:$addr))
-// (EXTRACT_SUBREG (VMOVLuv4i32
-// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-// (VLDRS addrmode5:$addr),
-// ssub_0)),
-// dsub_0)),
-// dsub_0)>;
+// Pat<(v2i32 (extloadvi8 addrmode6:$addr))
+// (EXTRACT_SUBREG (VMOVLuv4i32
+// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr,
+// (f64 (IMPLICIT_DEF)), (i32 0))),
+// dsub_0)),
+// dsub_0)>;
multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
string Insn2Ty> {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
- ssub_0)), dsub_0)),
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)),
dsub_0)>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
- ssub_0)), dsub_0)),
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)),
dsub_0)>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
- (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
- ssub_0)), dsub_0)),
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+ dsub_0)),
dsub_0)>;
}
-defm : Lengthen_Single<"8", "i16", "i8">; // v8i8 -> v8i16
-defm : Lengthen_Single<"4", "i32", "i16">; // v4i16 -> v4i32
-defm : Lengthen_Single<"2", "i64", "i32">; // v2i32 -> v2i64
+defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
+defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
+defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
-defm : Lengthen_HalfSingle<"2", "i16", "i8", "8", "i16">; // v2i8 -> v2i16
defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
// Double lengthening - v4i8 -> v4i16 -> v4i32
@@ -5720,18 +5732,18 @@ defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
-def : Pat<(v2i64 (extloadvi8 addrmode5:$addr)),
+def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
(VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)),
- dsub_0)), dsub_0))>;
-def : Pat<(v2i64 (zextloadvi8 addrmode5:$addr)),
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
(VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)),
- dsub_0)), dsub_0))>;
-def : Pat<(v2i64 (sextloadvi8 addrmode5:$addr)),
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
(VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)),
- dsub_0)), dsub_0))>;
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
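// Illustrative lowering of the triple-lengthening zextload v2i8 -> v2i64:
//   vld1.16   {d0[0]}, [r0]   @ VLD1LNd16: load the two i8 lanes as one i16
//   vmovl.u8  q0, d0          @ v8i8  -> v8i16
//   vmovl.u16 q0, d0          @ v4i16 -> v4i32
//   vmovl.u32 q0, d0          @ v2i32 -> v2i64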
//===----------------------------------------------------------------------===//
// Assembler aliases
@@ -5742,69 +5754,6 @@ def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn",
def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn",
(VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>;
-
-// VADD two-operand aliases.
-def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm",
- (VADDv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm",
- (VADDv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm",
- (VADDv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm",
- (VADDv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm",
- (VADDv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm",
- (VADDv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm",
- (VADDv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm",
- (VADDv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm",
- (VADDfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm",
- (VADDfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// VSUB two-operand aliases.
-def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm",
- (VSUBv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm",
- (VSUBv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm",
- (VSUBv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm",
- (VSUBv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm",
- (VSUBv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm",
- (VSUBv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm",
- (VSUBv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm",
- (VSUBv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm",
- (VSUBfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm",
- (VSUBfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// VADDW two-operand aliases.
-def : NEONInstAlias<"vaddw${p}.s8 $Vdn, $Vm",
- (VADDWsv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vaddw${p}.s16 $Vdn, $Vm",
- (VADDWsv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vaddw${p}.s32 $Vdn, $Vm",
- (VADDWsv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vaddw${p}.u8 $Vdn, $Vm",
- (VADDWuv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vaddw${p}.u16 $Vdn, $Vm",
- (VADDWuv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vaddw${p}.u32 $Vdn, $Vm",
- (VADDWuv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>;
-
// VAND/VBIC/VEOR/VORR accept but do not require a type suffix.
defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
(VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
@@ -5823,23 +5772,6 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
(VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
// ... two-operand aliases
-def : NEONInstAlias<"vand${p} $Vdn, $Vm",
- (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vand${p} $Vdn, $Vm",
- (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vbic${p} $Vdn, $Vm",
- (VBICd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vbic${p} $Vdn, $Vm",
- (VBICq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"veor${p} $Vdn, $Vm",
- (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"veor${p} $Vdn, $Vm",
- (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vorr${p} $Vdn, $Vm",
- (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vorr${p} $Vdn, $Vm",
- (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
(VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
@@ -5853,212 +5785,6 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
(VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-// VMUL two-operand aliases.
-def : NEONInstAlias<"vmul${p}.p8 $Qdn, $Qm",
- (VMULpq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i8 $Qdn, $Qm",
- (VMULv16i8 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Qm",
- (VMULv8i16 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Qm",
- (VMULv4i32 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>;
-
-def : NEONInstAlias<"vmul${p}.p8 $Ddn, $Dm",
- (VMULpd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i8 $Ddn, $Dm",
- (VMULv8i8 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm",
- (VMULv4i16 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm",
- (VMULv2i32 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>;
-
-def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Qm",
- (VMULfq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm",
- (VMULfd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>;
-
-def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm$lane",
- (VMULslv4i16 DPR:$Ddn, DPR:$Ddn, DPR_8:$Dm,
- VectorIndex16:$lane, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Dm$lane",
- (VMULslv8i16 QPR:$Qdn, QPR:$Qdn, DPR_8:$Dm,
- VectorIndex16:$lane, pred:$p)>;
-
-def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm$lane",
- (VMULslv2i32 DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm,
- VectorIndex32:$lane, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Dm$lane",
- (VMULslv4i32 QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm,
- VectorIndex32:$lane, pred:$p)>;
-
-def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm$lane",
- (VMULslfd DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm,
- VectorIndex32:$lane, pred:$p)>;
-def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Dm$lane",
- (VMULslfq QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm,
- VectorIndex32:$lane, pred:$p)>;
-
-// VQADD (register) two-operand aliases.
-def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm",
- (VQADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm",
- (VQADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm",
- (VQADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm",
- (VQADDsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm",
- (VQADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm",
- (VQADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm",
- (VQADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm",
- (VQADDuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm",
- (VQADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm",
- (VQADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm",
- (VQADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm",
- (VQADDsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm",
- (VQADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm",
- (VQADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm",
- (VQADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm",
- (VQADDuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// VSHL (immediate) two-operand aliases.
-def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm",
- (VSHLiv8i8 DPR:$Vdn, DPR:$Vdn, imm0_7:$imm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm",
- (VSHLiv4i16 DPR:$Vdn, DPR:$Vdn, imm0_15:$imm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm",
- (VSHLiv2i32 DPR:$Vdn, DPR:$Vdn, imm0_31:$imm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm",
- (VSHLiv1i64 DPR:$Vdn, DPR:$Vdn, imm0_63:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm",
- (VSHLiv16i8 QPR:$Vdn, QPR:$Vdn, imm0_7:$imm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm",
- (VSHLiv8i16 QPR:$Vdn, QPR:$Vdn, imm0_15:$imm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm",
- (VSHLiv4i32 QPR:$Vdn, QPR:$Vdn, imm0_31:$imm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm",
- (VSHLiv2i64 QPR:$Vdn, QPR:$Vdn, imm0_63:$imm, pred:$p)>;
-
-// VSHL (register) two-operand aliases.
-def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm",
- (VSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm",
- (VSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm",
- (VSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm",
- (VSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm",
- (VSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm",
- (VSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm",
- (VSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm",
- (VSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm",
- (VSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm",
- (VSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm",
- (VSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm",
- (VSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm",
- (VSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm",
- (VSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm",
- (VSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm",
- (VSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// VSHR (immediate) two-operand aliases.
-def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm",
- (VSHRsv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm",
- (VSHRsv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm",
- (VSHRsv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm",
- (VSHRsv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm",
- (VSHRsv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm",
- (VSHRsv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm",
- (VSHRsv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm",
- (VSHRsv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm",
- (VSHRuv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm",
- (VSHRuv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm",
- (VSHRuv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm",
- (VSHRuv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm",
- (VSHRuv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm",
- (VSHRuv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm",
- (VSHRuv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm",
- (VSHRuv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>;
-
-// VRSHL two-operand aliases.
-def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm",
- (VRSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm",
- (VRSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm",
- (VRSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm",
- (VRSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm",
- (VRSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm",
- (VRSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm",
- (VRSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm",
- (VRSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm",
- (VRSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm",
- (VRSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm",
- (VRSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm",
- (VRSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm",
- (VRSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm",
- (VRSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm",
- (VRSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm",
- (VRSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
// VLD1 single-lane pseudo-instructions. These need special handling for
// the lane index that an InstAlias can't handle, so we use these instead.
def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr",
@@ -6223,17 +5949,17 @@ def VST2LNqWB_register_Asm_32 :
// VLD3 all-lanes pseudo-instructions. These need special handling for
// the lane index that an InstAlias can't handle, so we use these instead.
-def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
(ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD3DUPdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
(ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD3DUPdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
(ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
(ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD3DUPqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
(ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD3DUPqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
(ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
def VLD3DUPdWB_fixed_Asm_8 :
@@ -6499,17 +6225,17 @@ def VST3qWB_register_Asm_32 :
// VLD4 all-lanes pseudo-instructions. These need special handling for
// the lane index that an InstAlias can't handle, so we use these instead.
-def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
(ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD4DUPdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
(ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD4DUPdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
(ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
(ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD4DUPqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
(ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
-def VLD4DUPqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
(ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
def VLD4DUPdWB_fixed_Asm_8 :
@@ -6845,277 +6571,6 @@ def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm",
def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm",
(VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
-// Two-operand variants for VEXT
-def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm",
- (VEXTd8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_7:$imm, pred:$p)>;
-def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm",
- (VEXTd16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_3:$imm, pred:$p)>;
-def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm",
- (VEXTd32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_1:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm",
- (VEXTq8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_15:$imm, pred:$p)>;
-def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm",
- (VEXTq16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_7:$imm, pred:$p)>;
-def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm",
- (VEXTq32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_3:$imm, pred:$p)>;
-def : NEONInstAlias<"vext${p}.64 $Vdn, $Vm, $imm",
- (VEXTq64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_1:$imm, pred:$p)>;
-
-// Two-operand variants for VQDMULH
-def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm",
- (VQDMULHv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm",
- (VQDMULHv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm",
- (VQDMULHv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm",
- (VQDMULHv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// Two-operand variants for VMAX.
-def : NEONInstAlias<"vmax${p}.s8 $Vdn, $Vm",
- (VMAXsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.s16 $Vdn, $Vm",
- (VMAXsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.s32 $Vdn, $Vm",
- (VMAXsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.u8 $Vdn, $Vm",
- (VMAXuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.u16 $Vdn, $Vm",
- (VMAXuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.u32 $Vdn, $Vm",
- (VMAXuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.f32 $Vdn, $Vm",
- (VMAXfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vmax${p}.s8 $Vdn, $Vm",
- (VMAXsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.s16 $Vdn, $Vm",
- (VMAXsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.s32 $Vdn, $Vm",
- (VMAXsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.u8 $Vdn, $Vm",
- (VMAXuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.u16 $Vdn, $Vm",
- (VMAXuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.u32 $Vdn, $Vm",
- (VMAXuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmax${p}.f32 $Vdn, $Vm",
- (VMAXfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// Two-operand variants for VMIN.
-def : NEONInstAlias<"vmin${p}.s8 $Vdn, $Vm",
- (VMINsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.s16 $Vdn, $Vm",
- (VMINsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.s32 $Vdn, $Vm",
- (VMINsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.u8 $Vdn, $Vm",
- (VMINuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.u16 $Vdn, $Vm",
- (VMINuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.u32 $Vdn, $Vm",
- (VMINuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.f32 $Vdn, $Vm",
- (VMINfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vmin${p}.s8 $Vdn, $Vm",
- (VMINsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.s16 $Vdn, $Vm",
- (VMINsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.s32 $Vdn, $Vm",
- (VMINsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.u8 $Vdn, $Vm",
- (VMINuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.u16 $Vdn, $Vm",
- (VMINuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.u32 $Vdn, $Vm",
- (VMINuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vmin${p}.f32 $Vdn, $Vm",
- (VMINfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// Two-operand variants for VPADD.
-def : NEONInstAlias<"vpadd${p}.i8 $Vdn, $Vm",
- (VPADDi8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vpadd${p}.i16 $Vdn, $Vm",
- (VPADDi16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vpadd${p}.i32 $Vdn, $Vm",
- (VPADDi32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vpadd${p}.f32 $Vdn, $Vm",
- (VPADDf DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-// Two-operand variants for VSRA.
- // Signed.
-def : NEONInstAlias<"vsra${p}.s8 $Vdm, $imm",
- (VSRAsv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.s16 $Vdm, $imm",
- (VSRAsv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.s32 $Vdm, $imm",
- (VSRAsv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.s64 $Vdm, $imm",
- (VSRAsv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vsra${p}.s8 $Vdm, $imm",
- (VSRAsv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.s16 $Vdm, $imm",
- (VSRAsv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.s32 $Vdm, $imm",
- (VSRAsv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.s64 $Vdm, $imm",
- (VSRAsv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
- // Unsigned.
-def : NEONInstAlias<"vsra${p}.u8 $Vdm, $imm",
- (VSRAuv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.u16 $Vdm, $imm",
- (VSRAuv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.u32 $Vdm, $imm",
- (VSRAuv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.u64 $Vdm, $imm",
- (VSRAuv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vsra${p}.u8 $Vdm, $imm",
- (VSRAuv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.u16 $Vdm, $imm",
- (VSRAuv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.u32 $Vdm, $imm",
- (VSRAuv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsra${p}.u64 $Vdm, $imm",
- (VSRAuv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
-// Two-operand variants for VSRI.
-def : NEONInstAlias<"vsri${p}.8 $Vdm, $imm",
- (VSRIv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsri${p}.16 $Vdm, $imm",
- (VSRIv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsri${p}.32 $Vdm, $imm",
- (VSRIv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsri${p}.64 $Vdm, $imm",
- (VSRIv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vsri${p}.8 $Vdm, $imm",
- (VSRIv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsri${p}.16 $Vdm, $imm",
- (VSRIv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsri${p}.32 $Vdm, $imm",
- (VSRIv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsri${p}.64 $Vdm, $imm",
- (VSRIv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
-// Two-operand variants for VSLI.
-def : NEONInstAlias<"vsli${p}.8 $Vdm, $imm",
- (VSLIv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsli${p}.16 $Vdm, $imm",
- (VSLIv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsli${p}.32 $Vdm, $imm",
- (VSLIv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsli${p}.64 $Vdm, $imm",
- (VSLIv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
-def : NEONInstAlias<"vsli${p}.8 $Vdm, $imm",
- (VSLIv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>;
-def : NEONInstAlias<"vsli${p}.16 $Vdm, $imm",
- (VSLIv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>;
-def : NEONInstAlias<"vsli${p}.32 $Vdm, $imm",
- (VSLIv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>;
-def : NEONInstAlias<"vsli${p}.64 $Vdm, $imm",
- (VSLIv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>;
-
-// Two-operand variants for VHSUB.
- // Signed.
-def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm",
- (VHSUBsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm",
- (VHSUBsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm",
- (VHSUBsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm",
- (VHSUBsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm",
- (VHSUBsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm",
- (VHSUBsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
- // Unsigned.
-def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm",
- (VHSUBuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm",
- (VHSUBuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm",
- (VHSUBuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm",
- (VHSUBuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm",
- (VHSUBuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm",
- (VHSUBuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-
-// Two-operand variants for VHADD.
- // Signed.
-def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm",
- (VHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm",
- (VHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm",
- (VHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm",
- (VHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm",
- (VHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm",
- (VHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
- // Unsigned.
-def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm",
- (VHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm",
- (VHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm",
- (VHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
-
-def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm",
- (VHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm",
- (VHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm",
- (VHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
-
-// Two-operand variants for VRHADD.
- // Signed.
-def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm",
- (VRHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm",
- (VRHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm",
- (VRHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>;
-
-def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm",
- (VRHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm",
- (VRHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm",
- (VRHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>;
-
- // Unsigned.
-def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm",
- (VRHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm",
- (VRHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm",
- (VRHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>;
-
-def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm",
- (VRHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm",
- (VRHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>;
-def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm",
- (VRHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>;
-
// VSWP allows, but does not require, a type suffix.
defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
(VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>;
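[Editor's note: the mass deletions above drop the hand-written two-operand NEON aliases (vadd, vsub, vmul, vqadd, vshl, vshr, vrshl, vext, vmax, vmin, vpadd, vsra, vsri, vsli, vhadd, vhsub, vrhadd, and friends). Rather than spelling out "op $Vdn, $Vm" -> "op $Vdn, $Vdn, $Vm" once per type and register class, the assembler can derive the short form, as the TwoOperandAliasConstraint lines added to ARMInstrVFP.td later in this patch illustrate. A minimal standalone C++ sketch of the expansion the assembler performs; names are illustrative, not LLVM API:]

```cpp
#include <string>
#include <vector>

// Duplicate the destination as the first source, turning the parsed
// two-operand form "vadd.i8 q0, q1" into the canonical three-operand
// "vadd.i8 q0, q0, q1". This mirrors what the removed aliases (and
// the TwoOperandAliasConstraint replacing them) spell out per type.
std::vector<std::string> expandTwoOperand(std::vector<std::string> ops) {
  if (ops.size() == 2)                        // {dst, src2}
    ops.insert(ops.begin() + 1, ops.front()); // -> {dst, dst, src2}
  return ops;
}
```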
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 6335229..554f6d9 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -32,9 +32,6 @@ def imm_sr : Operand<i32>, PatLeaf<(imm), [{
let ParserMatchClass = ThumbSRImmAsmOperand;
}
-def imm_neg_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
-}]>;
def imm_comp_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
}]>;
@@ -258,16 +255,20 @@ def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>,
Requires<[IsThumb2]>;
def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>,
- T1SystemEncoding<0x10>; // A8.6.410
+ T1SystemEncoding<0x10>, // A8.6.410
+ Requires<[IsThumb2]>;
def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>,
- T1SystemEncoding<0x20>; // A8.6.408
+ T1SystemEncoding<0x20>, // A8.6.408
+ Requires<[IsThumb2]>;
def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>,
- T1SystemEncoding<0x30>; // A8.6.409
+ T1SystemEncoding<0x30>, // A8.6.409
+ Requires<[IsThumb2]>;
def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>,
- T1SystemEncoding<0x40>; // A8.6.157
+ T1SystemEncoding<0x40>, // A8.6.157
+ Requires<[IsThumb2]>;
// The imm operand $val can be used by a debugger to store more information
// about the breakpoint.
@@ -363,8 +364,8 @@ def : tInstAlias<"sub${p} sp, sp, $imm",
(tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
// ADD <Rm>, sp
-def tADDrSP : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPRsp:$sp), IIC_iALUr,
- "add", "\t$Rdn, $sp, $Rn", []>,
+def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
+ "add", "\t$Rdn, $sp, $Rn", []>,
T1Special<{0,0,?,?}> {
// A8.6.9 Encoding T1
bits<4> Rdn;
@@ -419,34 +420,35 @@ let isCall = 1,
Defs = [LR], Uses = [SP] in {
// Also used for Thumb2
def tBL : TIx2<0b11110, 0b11, 1,
- (outs), (ins pred:$p, t_bltarget:$func, variable_ops), IIC_Br,
+ (outs), (ins pred:$p, t_bltarget:$func), IIC_Br,
"bl${p}\t$func",
[(ARMtcall tglobaladdr:$func)]>,
Requires<[IsThumb]> {
- bits<22> func;
- let Inst{26} = func{21};
+ bits<24> func;
+ let Inst{26} = func{23};
let Inst{25-16} = func{20-11};
- let Inst{13} = 1;
- let Inst{11} = 1;
+ let Inst{13} = func{22};
+ let Inst{11} = func{21};
let Inst{10-0} = func{10-0};
}
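[Editor's note: the tBL change above widens the encoded target from 22 to 24 bits and derives the high branch bits from the operand instead of hard-wiring them to 1. A standalone C++ sketch of the bit scatter, following the `let Inst{...}` lines exactly; fixed opcode bits omitted, illustrative only:]

```cpp
#include <cstdint>

// Scatter a 24-bit branch target into the instruction word per the
// patched tBL definition above.
uint32_t packBLTarget(uint32_t func) {      // func holds 24 bits
  uint32_t inst = 0;
  inst |= ((func >> 23) & 0x1)   << 26;     // Inst{26}    = func{23}
  inst |= ((func >> 11) & 0x3FF) << 16;     // Inst{25-16} = func{20-11}
  inst |= ((func >> 22) & 0x1)   << 13;     // Inst{13}    = func{22}
  inst |= ((func >> 21) & 0x1)   << 11;     // Inst{11}    = func{21}
  inst |= func & 0x7FF;                     // Inst{10-0}  = func{10-0}
  return inst;
}
```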
// ARMv5T and above, also used for Thumb2
def tBLXi : TIx2<0b11110, 0b11, 0,
- (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), IIC_Br,
+ (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
"blx${p}\t$func",
[(ARMcall tglobaladdr:$func)]>,
Requires<[IsThumb, HasV5T]> {
- bits<21> func;
+ bits<24> func;
+ let Inst{26} = func{23};
let Inst{25-16} = func{20-11};
- let Inst{13} = 1;
- let Inst{11} = 1;
+ let Inst{13} = func{22};
+ let Inst{11} = func{21};
let Inst{10-1} = func{10-1};
let Inst{0} = 0; // func{0} is assumed zero
}
// Also used for Thumb2
- def tBLXr : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br,
+ def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
"blx${p}\t$func",
[(ARMtcall GPR:$func)]>,
Requires<[IsThumb, HasV5T]>,
@@ -457,7 +459,7 @@ let isCall = 1,
}
// ARMv4T
- def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+ def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
4, IIC_Br,
[(ARMcall_nolink tGPR:$func)]>,
Requires<[IsThumb, IsThumb1Only]>;
@@ -504,7 +506,7 @@ let isBranch = 1, isTerminator = 1 in
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// IOS versions.
let Uses = [SP] in {
- def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops),
+ def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
4, IIC_Br, [],
(tBX GPR:$dst, (ops 14, zero_reg))>,
Requires<[IsThumb]>;
@@ -514,7 +516,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// Non-IOS version:
let Uses = [SP] in {
def tTAILJMPdND : tPseudoExpand<(outs),
- (ins t_brtarget:$dst, pred:$p, variable_ops),
+ (ins t_brtarget:$dst, pred:$p),
4, IIC_Br, [],
(tB t_brtarget:$dst, pred:$p)>,
Requires<[IsThumb, IsNotIOS]>;
@@ -1398,7 +1400,7 @@ def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>;
// For round-trip assembly/disassembly, we have to handle a CPS instruction
// without any iflags. That's not, strictly speaking, valid syntax, but it's
-// a useful extention and assembles to defined behaviour (the insn does
+// a useful extension and assembles to defined behaviour (the insn does
// nothing).
def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index e6fb9d5..307006f 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -62,6 +62,15 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32);
}]>;
+// t2_so_imm_notSext16_XFORM - Return a so_imm value packed into the format
+// described for the t2_so_imm_notSext def below, with sign extension from 16
+// bits.
+def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
+ APInt apIntN = N->getAPIntValue();
+ unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+ return CurDAG->getTargetConstant(~N16bitSignExt, MVT::i32);
+}]>;
+
// t2_so_imm - Match a 32-bit immediate operand, which is an
// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
// immediate splatted into multiple bytes of the word.
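[Editor's note: the new XForm truncates the immediate to 16 bits, sign-extends back to 32 (apIntN.trunc(16).sext(32)), then complements. A host-integer sketch of the same arithmetic; standalone, not LLVM code:]

```cpp
#include <cstdint>
#include <cstdio>

// Host-integer model of t2_so_imm_notSext16_XFORM above.
uint32_t notSext16(uint32_t imm) {
  int32_t sext = (int32_t)(int16_t)(imm & 0xFFFF); // trunc(16).sext(32)
  return ~(uint32_t)sext;                          // complement
}

int main() {
  printf("%08x\n", (unsigned)notSext16(0xFFFF)); // -1 sign-extends; ~ -> 00000000
  printf("%08x\n", (unsigned)notSext16(0x7FFF)); // stays positive; ~ -> ffff8000
  return 0;
}
```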
@@ -86,6 +95,17 @@ def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{
let ParserMatchClass = t2_so_imm_not_asmoperand;
}
+// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm
+// if the upper 16 bits are zero.
+def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
+ APInt apIntN = N->getAPIntValue();
+ if (!apIntN.isIntN(16)) return false;
+ unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+ return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1;
+ }], t2_so_imm_notSext16_XFORM> {
+ let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
@@ -152,6 +172,7 @@ def t2ldr_pcrel_imm12 : Operand<i32> {
// ADR instruction labels.
def t2adrlabel : Operand<i32> {
let EncoderMethod = "getT2AdrLabelOpValue";
+ let PrintMethod = "printAdrLabelOperand";
}
@@ -509,7 +530,7 @@ class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
/// changed to modify CPSR.
multiclass T2I_bin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, string baseOpc, bit Commutable = 0,
+ PatFrag opnode, bit Commutable = 0,
string wide = ""> {
// shifted imm
def ri : T2sTwoRegImm<
@@ -545,15 +566,15 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
// Assembly aliases for optional destination operand when it's the same
// as the source operand.
def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn,
t2_so_imm:$imm, pred:$p,
cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn,
rGPR:$Rm, pred:$p,
cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn,
+ (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn,
t2_so_reg:$shift, pred:$p,
cc_out:$s)>;
}
@@ -562,36 +583,30 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
// the ".w" suffix to indicate that they are wide.
multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, string baseOpc, bit Commutable = 0> :
- T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, baseOpc, Commutable, ".w"> {
+ PatFrag opnode, bit Commutable = 0> :
+ T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> {
// Assembler aliases w/ the ".w" suffix.
def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rd, rGPR:$Rn,
- t2_so_imm:$imm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p,
+ cc_out:$s)>;
// Assembler aliases w/o the ".w" suffix.
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn,
- rGPR:$Rm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rd, rGPR:$Rn,
- t2_so_reg:$shift, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"rs") rGPR:$Rd, rGPR:$Rn, t2_so_reg:$shift,
+ pred:$p, cc_out:$s)>;
// and with the optional destination operand, too.
def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
- t2_so_imm:$imm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm,
+ pred:$p, cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
- rGPR:$Rm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn,
- t2_so_reg:$shift, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$shift,
+ pred:$p, cc_out:$s)>;
}
/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
@@ -668,16 +683,16 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass T2I_rbin_s_is<PatFrag opnode> {
// shifted imm
def ri : t2PseudoInst<(outs rGPR:$Rd),
- (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
+ (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
4, IIC_iALUi,
[(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm,
- GPRnopc:$Rn))]>;
+ rGPR:$Rn))]>;
// shifted register
def rs : t2PseudoInst<(outs rGPR:$Rd),
- (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+ (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
4, IIC_iALUsi,
[(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm,
- GPRnopc:$Rn))]>;
+ rGPR:$Rn))]>;
}
}
@@ -788,8 +803,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
// rotate operation that produces a value.
-multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode,
- string baseOpc> {
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
// 5-bit imm
def ri : T2sTwoRegShiftImm<
(outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
@@ -814,33 +828,27 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode,
// Optional destination register
def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
- ty:$imm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+ cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
- rGPR:$Rm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
// Assembler aliases w/o the ".w" suffix.
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rd, rGPR:$Rn,
- ty:$imm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, ty:$imm, pred:$p,
+ cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn,
- rGPR:$Rm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
// and with the optional destination operand, too.
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
- ty:$imm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+ cc_out:$s)>;
def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
- (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
- rGPR:$Rm, pred:$p,
- cc_out:$s)>;
+ (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
}
/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
@@ -848,7 +856,7 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode,
/// an explicit result, only implicitly set CPSR.
multiclass T2I_cmp_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, string baseOpc> {
+ PatFrag opnode> {
let isCompare = 1, Defs = [CPSR] in {
// shifted imm
def ri : T2OneRegCmpImm<
@@ -893,12 +901,9 @@ let isCompare = 1, Defs = [CPSR] in {
// No alias here for 'rr' version as not all instantiations of this
// multiclass want one (CMP in particular, does not).
def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"),
- (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPRnopc:$Rn,
- t2_so_imm:$imm, pred:$p)>;
+ (!cast<Instruction>(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPRnopc:$Rn,
- t2_so_reg:$shift,
- pred:$p)>;
+ (!cast<Instruction>(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
}
/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
@@ -1911,11 +1916,16 @@ def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm),
(t2SUBri GPR:$src, t2_so_imm_neg:$imm)>;
def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
(t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>;
+def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm),
+ (t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+
let AddedComplexity = 1 in
def : T2Pat<(ARMaddc rGPR:$src, imm0_255_neg:$imm),
(t2SUBSri rGPR:$src, imm0_255_neg:$imm)>;
def : T2Pat<(ARMaddc rGPR:$src, t2_so_imm_neg:$imm),
(t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMaddc rGPR:$src, imm0_65535_neg:$imm),
+ (t2SUBSrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
// for part of the negation.
@@ -1924,6 +1934,8 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR),
(t2SBCri rGPR:$src, imm0_255_not:$imm)>;
def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR),
(t2SBCri rGPR:$src, t2_so_imm_not:$imm)>;
+def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR),
+ (t2SBCrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
// Select Bytes -- for disassembly only
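[Editor's note: the new imm0_65535_neg patterns above handle adding a constant in [-65535, -1] that is not a Thumb2 modified immediate: its negation fits t2MOVi16, so the add becomes a movw of the negated value (via imm_neg_XFORM) plus a register-register subtract. A hedged host-integer sketch of the rewrite; the names are illustrative, not LLVM APIs:]

```cpp
#include <cassert>
#include <cstdint>

// 'add dst, src, #imm' with imm in [-65535, -1], rewritten as movw+sub.
uint32_t addNegImmViaMovwSub(uint32_t src, int32_t imm) {
  assert(imm >= -65535 && imm <= -1 && "imm0_65535_neg range");
  uint32_t tmp = (uint32_t)(-imm); // movw tmp, #-imm (fits in 16 bits)
  return src - tmp;                // sub  dst, src, tmp
}
```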
@@ -2125,17 +2137,17 @@ def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>;
//
defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31,
- BinOpFrag<(shl node:$LHS, node:$RHS)>, "t2LSL">;
+ BinOpFrag<(shl node:$LHS, node:$RHS)>>;
defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr,
- BinOpFrag<(srl node:$LHS, node:$RHS)>, "t2LSR">;
+ BinOpFrag<(srl node:$LHS, node:$RHS)>>;
defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr,
- BinOpFrag<(sra node:$LHS, node:$RHS)>, "t2ASR">;
+ BinOpFrag<(sra node:$LHS, node:$RHS)>>;
defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31,
- BinOpFrag<(rotr node:$LHS, node:$RHS)>, "t2ROR">;
+ BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
-def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
- (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
+def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
+ (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
let Uses = [CPSR] in {
def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
@@ -2187,18 +2199,17 @@ def t2MOVsra_flag : T2TwoRegShiftImm<
defm t2AND : T2I_bin_w_irs<0b0000, "and",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(and node:$LHS, node:$RHS)>, "t2AND", 1>;
+ BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
defm t2ORR : T2I_bin_w_irs<0b0010, "orr",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(or node:$LHS, node:$RHS)>, "t2ORR", 1>;
+ BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
defm t2EOR : T2I_bin_w_irs<0b0100, "eor",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(xor node:$LHS, node:$RHS)>, "t2EOR", 1>;
+ BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
defm t2BIC : T2I_bin_w_irs<0b0001, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(and node:$LHS, (not node:$RHS))>,
- "t2BIC">;
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
class T2BitFI<dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
@@ -2278,8 +2289,7 @@ let Constraints = "$src = $Rd" in {
defm t2ORN : T2I_bin_irs<0b0011, "orn",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(or node:$LHS, (not node:$RHS))>,
- "t2ORN", 0, "">;
+ BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">;
/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
/// unary operation that produces a value. These are predicable and can be
@@ -2332,6 +2342,17 @@ let AddedComplexity = 1 in
def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
(t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
+// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
+def top16Zero: PatLeaf<(i32 rGPR:$src), [{
+ return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+ }]>;
+
+// so_imm_notSext is needed instead of so_imm_not, as the value of imm
+// will match the extended, not the original bitWidth for $src.
+def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm),
+ (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>;
+
+
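[Editor's note: the pattern pairs top16Zero with t2_so_imm_notSext because BIC computes src & ~operand. When the upper 16 bits of src are known zero, AND with a 16-bit immediate equals BIC with the complement of the sign-extended immediate, which may be encodable as a modified immediate even when the immediate itself is not. A standalone C++ check of that identity, illustrative only:]

```cpp
#include <cassert>
#include <cstdint>

// AND via BIC for a value whose top 16 bits are zero.
uint32_t andViaBIC(uint32_t src, uint16_t imm) {
  assert((src >> 16) == 0 && "pattern requires top16Zero");
  uint32_t notSext = ~(uint32_t)(int32_t)(int16_t)imm; // the XForm output
  uint32_t bic = src & ~notSext; // what 'bic src, #notSext' computes
  assert(bic == (src & imm));    // same result as the original AND
  return bic;
}
```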
// FIXME: Disable this pattern on Darwin to workaround an assembler bug.
def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm),
(t2ORNri rGPR:$src, t2_so_imm_not:$imm)>,
@@ -2840,7 +2861,7 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
//
defm t2CMP : T2I_cmp_irs<0b1101, "cmp",
IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>, "t2CMP">;
+ BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm),
(t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>;
@@ -2849,29 +2870,68 @@ def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs),
def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs),
(t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>;
-//FIXME: Disable CMN, as CCodes are backwards from compare expectations
-// Compare-to-zero still works out, just not the relationals
-//defm t2CMN : T2I_cmp_irs<0b1000, "cmn",
-// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
-defm t2CMNz : T2I_cmp_irs<0b1000, "cmn",
- IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
- BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>,
- "t2CMNz">;
+let isCompare = 1, Defs = [CPSR] in {
+ // shifted imm
+ def t2CMNri : T2OneRegCmpImm<
+ (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi,
+ "cmn", ".w\t$Rn, $imm",
+ [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = 0b1000;
+ let Inst{20} = 1; // The S bit.
+ let Inst{15} = 0;
+ let Inst{11-8} = 0b1111; // Rd
+ }
+ // register
+ def t2CMNzrr : T2TwoRegCmp<
+ (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr,
+ "cmn", ".w\t$Rn, $Rm",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPRnopc:$Rn, rGPR:$Rm)]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b1000;
+ let Inst{20} = 1; // The S bit.
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{11-8} = 0b1111; // Rd
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def t2CMNzrs : T2OneRegCmpShiftedReg<
+ (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi,
+ "cmn", ".w\t$Rn, $ShiftedRm",
+ [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+ GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = 0b1000;
+ let Inst{20} = 1; // The S bit.
+ let Inst{11-8} = 0b1111; // Rd
+ }
+}
-//def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm),
-// (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
+// Assembler aliases w/o the ".w" suffix.
+// No alias here for 'rr' version as not all instantiations of this multiclass
+// want one (CMP in particular, does not).
+def : t2InstAlias<"cmn${p} $Rn, $imm",
+ (t2CMNri GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rn, $shift",
+ (t2CMNzrs GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
-def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
- (t2CMNzri GPRnopc:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm),
+ (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
+
+def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
+ (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>;
defm t2TST : T2I_cmp_irs<0b0000, "tst",
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
- BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>,
- "t2TST">;
+ BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
defm t2TEQ : T2I_cmp_irs<0b0100, "teq",
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
- BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>,
- "t2TEQ">;
+ BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
// Conditional moves
// FIXME: should be able to write a pattern for ARMcmov, but can't use
@@ -3017,7 +3077,7 @@ def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
"isb", "\t$opt",
- []>, Requires<[IsThumb2, HasDB]> {
+ []>, Requires<[IsThumb, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf3bf8f6;
let Inst{3-0} = opt;
@@ -3271,7 +3331,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// IOS version.
let Uses = [SP] in
def tTAILJMPd: tPseudoExpand<(outs),
- (ins uncondbrtarget:$dst, pred:$p, variable_ops),
+ (ins uncondbrtarget:$dst, pred:$p),
4, IIC_Br, [],
(t2B uncondbrtarget:$dst, pred:$p)>,
Requires<[IsThumb2, IsIOS]>;
@@ -3281,7 +3341,7 @@ let isCall = 1, Defs = [LR], Uses = [SP] in {
// mov lr, pc; b if callee is marked noreturn to avoid confusing the
// return stack predictor.
def t2BMOVPCB_CALL : tPseudoInst<(outs),
- (ins t_bltarget:$func, variable_ops),
+ (ins t_bltarget:$func),
6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
Requires<[IsThumb]>;
}
@@ -3382,21 +3442,18 @@ let imod = 0, iflags = 0, M = 1 in
// A6.3.4 Branches and miscellaneous control
// Table A6-14 Change Processor State, and hint instructions
-class T2I_hint<bits<8> op7_0, string opc, string asm>
- : T2I<(outs), (ins), NoItinerary, opc, asm, []> {
- let Inst{31-20} = 0xf3a;
- let Inst{19-16} = 0b1111;
- let Inst{15-14} = 0b10;
- let Inst{12} = 0;
- let Inst{10-8} = 0b000;
- let Inst{7-0} = op7_0;
+def t2HINT : T2I<(outs), (ins imm0_255:$imm), NoItinerary, "hint", "\t$imm",[]>{
+ bits<8> imm;
+ let Inst{31-8} = 0b111100111010111110000000;
+ let Inst{7-0} = imm;
}
-def t2NOP : T2I_hint<0b00000000, "nop", ".w">;
-def t2YIELD : T2I_hint<0b00000001, "yield", ".w">;
-def t2WFE : T2I_hint<0b00000010, "wfe", ".w">;
-def t2WFI : T2I_hint<0b00000011, "wfi", ".w">;
-def t2SEV : T2I_hint<0b00000100, "sev", ".w">;
+def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_255:$imm, pred:$p)>;
+def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
+def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
+def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
+def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
+def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
bits<4> opt;
@@ -3622,8 +3679,8 @@ defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
// A/R class MRS.
//
// A/R class can only move from CPSR or SPSR.
-def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []>,
- Requires<[IsThumb2,IsARClass]> {
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
+ []>, Requires<[IsThumb2,IsARClass]> {
bits<4> Rd;
let Inst{31-12} = 0b11110011111011111000;
let Inst{11-8} = Rd;
@@ -3632,8 +3689,8 @@ def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []>
def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
-def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", []>,
- Requires<[IsThumb2,IsARClass]> {
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
+ []>, Requires<[IsThumb2,IsARClass]> {
bits<4> Rd;
let Inst{31-12} = 0b11110011111111111000;
let Inst{11-8} = Rd;
@@ -3646,7 +3703,7 @@ def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", [
// the A/R class (a full msr_mask).
def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
"mrs", "\t$Rd, $mask", []>,
- Requires<[IsThumb2,IsMClass]> {
+ Requires<[IsThumb,IsMClass]> {
bits<4> Rd;
bits<8> mask;
let Inst{31-12} = 0b11110011111011111000;
@@ -3682,14 +3739,14 @@ def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
// Move from ARM core register to Special Register
def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
NoItinerary, "msr", "\t$SYSm, $Rn", []>,
- Requires<[IsThumb2,IsMClass]> {
- bits<8> SYSm;
+ Requires<[IsThumb,IsMClass]> {
+ bits<12> SYSm;
bits<4> Rn;
let Inst{31-21} = 0b11110011100;
let Inst{20} = 0b0;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b1000;
- let Inst{7-0} = SYSm;
+ let Inst{11-0} = SYSm;
}
@@ -3969,6 +4026,17 @@ def : t2InstAlias<"add${s}${p} $Rdn, $imm",
def : t2InstAlias<"add${p} $Rdn, $imm",
(t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+ (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+ (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+ (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rdn, $imm",
+ (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
// Aliases for SUB without the ".w" optional width specifier.
def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
@@ -4002,9 +4070,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
(t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
// Memory barriers
-def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb2, HasDB]>;
-def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb2, HasDB]>;
-def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb, HasDB]>;
+def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb, HasDB]>;
+def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb, HasDB]>;
// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
// width specifier.
@@ -4213,7 +4281,7 @@ def : t2InstAlias<"add${s}${p} $Rd, $imm",
pred:$p, cc_out:$s)>;
// Same for CMP <--> CMN via t2_so_imm_neg
def : t2InstAlias<"cmp${p} $Rd, $imm",
- (t2CMNzri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+ (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
def : t2InstAlias<"cmn${p} $Rd, $imm",
(t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
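[Editor's note: the cmp/cmn alias pair immediately above leans on CMN deriving its flags from Rn + Op2, so a compare against a negative immediate can be encoded either way. A minimal host-integer sketch of the identity, illustrative only:]

```cpp
#include <cstdint>

// "cmp Rn, #imm" and "cmn Rn, #-imm" derive their flags from the same sum.
bool cmpCmnSameValue(uint32_t rn, int32_t imm) {
  uint32_t viaCmp = rn - (uint32_t)imm;        // value behind CMP's flags
  uint32_t viaCmn = rn + (0u - (uint32_t)imm); // value behind CMN's flags
  return viaCmp == viaCmn;                     // always true mod 2^32
}
```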
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 3600b88..23c132e 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -221,11 +221,13 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
// FP Binary Operations.
//
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VADDD : ADbI<0b11100, 0b11, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDS : ASbIn<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
@@ -235,11 +237,13 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VSUBD : ADbI<0b11100, 0b11, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
@@ -249,21 +253,25 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VDIVD : ADbI<0b11101, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVS : ASbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VMULD : ADbI<0b11100, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
[(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULS : ASbIn<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
@@ -559,8 +567,8 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010,
bits<4> Rt2;
// Encode instruction operands.
- let Inst{3-0} = src1{3-0};
- let Inst{5} = src1{4};
+ let Inst{3-0} = src1{4-1};
+ let Inst{5} = src1{0};
let Inst{15-12} = Rt;
let Inst{19-16} = Rt2;
@@ -609,8 +617,8 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
bits<4> src2;
// Encode instruction operands.
- let Inst{3-0} = dst1{3-0};
- let Inst{5} = dst1{4};
+ let Inst{3-0} = dst1{4-1};
+ let Inst{5} = dst1{0};
let Inst{15-12} = src1;
let Inst{19-16} = src2;
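[Editor's note: the two encoding fixes above correct how a 5-bit single-precision register number is split across the Vm field and the M bit: for S registers the low bit selects the half, so Inst{3-0} takes src1{4-1} and Inst{5} takes src1{0}, the opposite split from D registers. A standalone sketch of the corrected split, illustrative only:]

```cpp
#include <cstdint>

// Split an S-register number s (0..31) per the corrected encoding.
uint32_t encodeSReg(unsigned s) {
  uint32_t inst = 0;
  inst |= (s >> 1) & 0xF;  // Inst{3-0} = src1{4-1}: register pair index
  inst |= (s & 0x1) << 5;  // Inst{5}   = src1{0}:   which half of the pair
  return inst;
}
```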
@@ -819,9 +827,9 @@ let Constraints = "$a = $dst" in {
// FP to Fixed-Point:
// Single Precision register
-class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5,
- dag oops, dag iops, InstrItinClass itin, string opc, string asm,
- list<dag> pattern>
+class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+ bit op5, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
: AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
@@ -830,9 +838,9 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bi
}
// Double Precision register
-class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5,
- dag oops, dag iops, InstrItinClass itin, string opc, string asm,
- list<dag> pattern>
+class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+ bit op5, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
: AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
@@ -1081,10 +1089,11 @@ def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
-def : Pat<(f64 (fma DPR:$Ddin, DPR:$Dn, DPR:$Dm)),
+// (fma x, y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
(VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4]>;
-def : Pat<(f32 (fma SPR:$Sdin, SPR:$Sn, SPR:$Sm)),
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
(VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
@@ -1115,18 +1124,18 @@ def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
-// (fma (fneg x), y, z) -> (vfms x, y, z)
-def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm)),
+// (fma (fneg x), y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
(VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4]>;
-def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm)),
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-// (fneg (fma x, (fneg y), z) -> (vfms x, y, z)
-def : Pat<(fneg (f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm))),
+// (fma x, (fneg y), z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
(VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm))),
+def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
@@ -1157,18 +1166,18 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
-// (fneg (fma x, y, z)) -> (vfnma x, y, z)
-def : Pat<(fneg (fma (f64 DPR:$Ddin), (f64 DPR:$Dn), (f64 DPR:$Dm))),
+// (fneg (fma x, y, z)) -> (vfnma z, x, y)
+def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
(VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (fma (f32 SPR:$Sdin), (f32 SPR:$Sn), (f32 SPR:$Sm))),
+def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-// (fma (fneg x), y, (fneg z)) -> (vfnma x, y, z)
-def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, (fneg DPR:$Dm))),
+// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
(VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4]>;
-def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, (fneg SPR:$Sm))),
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
@@ -1198,18 +1207,26 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
-// (fneg (fma (fneg x), y, z)) -> (vnfms x, y, z)
-def : Pat<(fneg (f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm))),
+
+// (fma x, y, (fneg z)) -> (vfnms z, x, y))
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
+ (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+ Requires<[HasVFP4]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
+ (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+ Requires<[HasVFP4]>;
+// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm))),
+def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-// (fma x, (fneg y), z) -> (vnfms x, y, z)
-def : Pat<(f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm)),
+// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
Requires<[HasVFP4]>;
-def : Pat<(f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm)),
+def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
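The rewritten patterns all follow one convention: the IR node (fma x, y, z) computes x*y + z, while the VFP4 instructions take the accumulator first. Spelled out in plain C++ (an editorial sketch of the semantics, not code from this patch):

    #include <cmath>

    // Semantics of the fused ops selected above; 'd' is the accumulator
    // operand (Ddin/Sdin), which the machine instructions take first.
    double vfma (double d, double n, double m) { return  d + n * m; } // (fma n, m, d)
    double vfms (double d, double n, double m) { return  d - n * m; } // (fma (fneg n), m, d)
    double vfnma(double d, double n, double m) { return -d - n * m; } // (fneg (fma n, m, d))
    double vfnms(double d, double n, double m) { return -d + n * m; } // (fma n, m, (fneg d))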
@@ -1426,22 +1443,6 @@ def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr",
def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr",
(VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
-// VMUL has a two-operand form (implied destination operand)
-def : VFP2InstAlias<"vmul${p}.f64 $Dn, $Dm",
- (VMULD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>;
-def : VFP2InstAlias<"vmul${p}.f32 $Sn, $Sm",
- (VMULS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>;
-// VADD has a two-operand form (implied destination operand)
-def : VFP2InstAlias<"vadd${p}.f64 $Dn, $Dm",
- (VADDD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>;
-def : VFP2InstAlias<"vadd${p}.f32 $Sn, $Sm",
- (VADDS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>;
-// VSUB has a two-operand form (implied destination operand)
-def : VFP2InstAlias<"vsub${p}.f64 $Dn, $Dm",
- (VSUBD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>;
-def : VFP2InstAlias<"vsub${p}.f32 $Sn, $Sm",
- (VSUBS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>;
-
// VMOV can accept an optional 32-bit or smaller data type suffix.
def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn",
(VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
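The hand-written two-operand aliases deleted above are now generated from the TwoOperandAliasConstraint settings added earlier in this file's diff. The effect on the assembler, sketched as a hypothetical C++ helper (names illustrative):

    // Expand the two-operand spelling "vadd.f32 Sd, Sm" into the full
    // three-operand form by reusing the destination as the first source.
    struct VFPOperands { unsigned Rd, Rn, Rm; };
    VFPOperands expandTwoOperandForm(unsigned Rd, unsigned Rm) {
      return {Rd, /*Rn=*/Rd, Rm}; // "$Sn = $Sd" ties Rn to Rd
    }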
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index 98930cc..3f99cce 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -289,9 +289,9 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry)
ResultPtr = ResultPtr >> 2;
*((intptr_t*)RelocPos) |= ResultPtr;
- // Set register Rn to PC.
- *((intptr_t*)RelocPos) |=
- getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
+ // Set register Rn to PC (which is register 15 on all architectures).
+ // FIXME: This avoids the need for register info in the JIT class.
+ *((intptr_t*)RelocPos) |= 15 << ARMII::RegRnShift;
break;
}
case ARM::reloc_arm_pic_jt:
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 9ef2ace..897ceb6 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -456,8 +456,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
DebugLoc dl = Loc->getDebugLoc();
const MachineOperand &PMO = Loc->getOperand(0);
unsigned PReg = PMO.getReg();
- unsigned PRegNum = PMO.isUndef() ? UINT_MAX
- : getARMRegisterNumbering(PReg);
+ unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
unsigned Count = 1;
unsigned Limit = ~0U;
@@ -483,8 +482,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
int NewOffset = MemOps[i].Offset;
const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
unsigned Reg = MO.getReg();
- unsigned RegNum = MO.isUndef() ? UINT_MAX
- : getARMRegisterNumbering(Reg);
+ unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
// Register numbers must be in ascending order. For VFP / NEON load and
// store multiples, the registers must also be consecutive and within the
// limit on the number of registers per instruction.
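getARMRegisterNumbering() is replaced here by the generic TargetRegisterInfo::getEncodingValue(), which reads the HWEncoding field introduced in ARMRegisterInfo.td below. The ordering rule the surrounding code enforces, as a standalone sketch (hypothetical helper):

    #include <vector>

    // Registers in a load/store-multiple must have strictly ascending
    // hardware encodings.
    bool encodingsAscend(const std::vector<unsigned> &Encs) {
      for (size_t i = 1; i < Encs.size(); ++i)
        if (Encs[i] <= Encs[i - 1])
          return false;
      return true;
    }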
@@ -1177,8 +1175,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
BaseReg, false, BaseUndef, false, OffUndef,
Pred, PredReg, TII, isT2);
NewBBI = llvm::prior(MBBI);
- if (isT2 && NewOpc == ARM::t2LDRi8 && OffImm+4 >= 0)
- NewOpc = ARM::t2LDRi12;
InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
EvenReg, EvenDeadKill, false,
BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
@@ -1326,7 +1322,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// First advance to the instruction just before the start of the chain.
AdvanceRS(MBB, MemOps);
// Find a scratch register.
- unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
+ unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass);
// Process the load / store instructions.
RS->forward(prior(MBBI));
@@ -1739,7 +1735,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
Ops.pop_back();
const MCInstrDesc &MCID = TII->get(NewOpc);
- const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI);
+ const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
MRI->constrainRegClass(EvenReg, TRC);
MRI->constrainRegClass(OddReg, TRC);
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 1466e98..6f974fd 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -12,16 +12,16 @@
//===----------------------------------------------------------------------===//
// Registers are identified with 4-bit ID numbers.
-class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> {
- field bits<4> Num;
+class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> {
+ let HWEncoding = Enc;
let Namespace = "ARM";
let SubRegs = subregs;
// All bits of ARM registers with sub-registers are covered by sub-registers.
let CoveredBySubRegs = 1;
}
-class ARMFReg<bits<6> num, string n> : Register<n> {
- field bits<6> Num;
+class ARMFReg<bits<16> Enc, string n> : Register<n> {
+ let HWEncoding = Enc;
let Namespace = "ARM";
}
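With the encodings stored in the standard HWEncoding field, TableGen emits them into the generated register descriptions and clients query them uniformly. A minimal usage sketch (assuming an initialized MCRegisterInfo):

    #include "llvm/MC/MCRegisterInfo.h"

    // Replacement for the removed getARMRegisterNumbering(): the encoding
    // now comes from the TableGen-generated register descriptions.
    unsigned hwEncoding(const llvm::MCRegisterInfo &MRI, unsigned Reg) {
      return MRI.getEncodingValue(Reg);
    }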
@@ -267,21 +267,16 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
// Subset of DPR that are accessible with VFP2 (and so that also have
// 32-bit SPR subregs).
def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
- (trunc DPR, 16)> {
- let SubRegClasses = [(SPR ssub_0, ssub_1)];
-}
+ (trunc DPR, 16)>;
// Subset of DPR which can be used as a source of NEON scalars for 16-bit
// operations
def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
- (trunc DPR, 8)> {
- let SubRegClasses = [(SPR_8 ssub_0, ssub_1)];
-}
+ (trunc DPR, 8)>;
// Generic 128-bit vector register class.
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
(sequence "Q%u", 0, 15)> {
- let SubRegClasses = [(DPR dsub_0, dsub_1)];
// Allocate non-VFP2 aliases Q8-Q15 first.
let AltOrders = [(rotl QPR, 8)];
let AltOrderSelect = [{ return 1; }];
@@ -289,17 +284,11 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
// Subset of QPR that have 32-bit SPR subregs.
def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- 128, (trunc QPR, 8)> {
- let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3),
- (DPR_VFP2 dsub_0, dsub_1)];
-}
+ 128, (trunc QPR, 8)>;
// Subset of QPR that have DPR_8 and SPR_8 subregs.
def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- 128, (trunc QPR, 4)> {
- let SubRegClasses = [(SPR_8 ssub_0, ssub_1, ssub_2, ssub_3),
- (DPR_8 dsub_0, dsub_1)];
-}
+ 128, (trunc QPR, 4)>;
// Pseudo-registers representing odd-even pairs of D registers. The even-odd
// pairs are already represented by the Q registers.
@@ -338,8 +327,6 @@ def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>;
// Pseudo 256-bit vector register class to model pairs of Q registers
// (4 consecutive D registers).
def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> {
- let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3),
- (QPR qsub_0, qsub_1)];
// Allocate non-VFP2 aliases first.
let AltOrders = [(rotl QQPR, 8)];
let AltOrderSelect = [{ return 1; }];
@@ -363,9 +350,6 @@ def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1],
// Pseudo 512-bit vector register class to model 4 consecutive Q registers
// (8 consecutive D registers).
def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> {
- let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3,
- dsub_4, dsub_5, dsub_6, dsub_7),
- (QPR qsub_0, qsub_1, qsub_2, qsub_3)];
// Allocate non-VFP2 aliases first.
let AltOrders = [(rotl QQQQPR, 8)];
let AltOrderSelect = [{ return 1; }];
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 45486fd..81d2fa3 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -70,11 +70,11 @@ def IIC_iLoad_bh_siu : InstrItinClass;
def IIC_iLoad_d_i : InstrItinClass;
def IIC_iLoad_d_r : InstrItinClass;
def IIC_iLoad_d_ru : InstrItinClass;
-def IIC_iLoad_m : InstrItinClass<0>; // micro-coded
-def IIC_iLoad_mu : InstrItinClass<0>; // micro-coded
-def IIC_iLoad_mBr : InstrItinClass<0>; // micro-coded
-def IIC_iPop : InstrItinClass<0>; // micro-coded
-def IIC_iPop_Br : InstrItinClass<0>; // micro-coded
+def IIC_iLoad_m : InstrItinClass;
+def IIC_iLoad_mu : InstrItinClass;
+def IIC_iLoad_mBr : InstrItinClass;
+def IIC_iPop : InstrItinClass;
+def IIC_iPop_Br : InstrItinClass;
def IIC_iLoadiALU : InstrItinClass;
def IIC_iStore_i : InstrItinClass;
def IIC_iStore_r : InstrItinClass;
@@ -91,8 +91,8 @@ def IIC_iStore_bh_siu : InstrItinClass;
def IIC_iStore_d_i : InstrItinClass;
def IIC_iStore_d_r : InstrItinClass;
def IIC_iStore_d_ru : InstrItinClass;
-def IIC_iStore_m : InstrItinClass<0>; // micro-coded
-def IIC_iStore_mu : InstrItinClass<0>; // micro-coded
+def IIC_iStore_m : InstrItinClass;
+def IIC_iStore_mu : InstrItinClass;
def IIC_Preload : InstrItinClass;
def IIC_Br : InstrItinClass;
def IIC_fpSTAT : InstrItinClass;
@@ -126,12 +126,12 @@ def IIC_fpSQRT32 : InstrItinClass;
def IIC_fpSQRT64 : InstrItinClass;
def IIC_fpLoad32 : InstrItinClass;
def IIC_fpLoad64 : InstrItinClass;
-def IIC_fpLoad_m : InstrItinClass<0>; // micro-coded
-def IIC_fpLoad_mu : InstrItinClass<0>; // micro-coded
+def IIC_fpLoad_m : InstrItinClass;
+def IIC_fpLoad_mu : InstrItinClass;
def IIC_fpStore32 : InstrItinClass;
def IIC_fpStore64 : InstrItinClass;
-def IIC_fpStore_m : InstrItinClass<0>; // micro-coded
-def IIC_fpStore_mu : InstrItinClass<0>; // micro-coded
+def IIC_fpStore_m : InstrItinClass;
+def IIC_fpStore_mu : InstrItinClass;
def IIC_VLD1 : InstrItinClass;
def IIC_VLD1x2 : InstrItinClass;
def IIC_VLD1x3 : InstrItinClass;
@@ -258,8 +258,6 @@ def IIC_VTBX4 : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
-def GenericItineraries : ProcessorItineraries<[], [], []>;
-
include "ARMScheduleV6.td"
include "ARMScheduleA8.td"
include "ARMScheduleA9.td"
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 8b1fb93..2c63825 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -151,28 +151,30 @@ def CortexA8Itineraries : ProcessorItineraries<
// Load multiple, def is the 5th operand. Pipeline 0 only.
// FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
InstrItinData<IIC_iLoad_m , [InstrStage<2, [A8_Pipe0], 0>,
- InstrStage<2, [A8_LSPipe]>], [1, 1, 1, 1, 3]>,
+ InstrStage<2, [A8_LSPipe]>],
+ [1, 1, 1, 1, 3], [], -1>, // dynamic uops
//
// Load multiple + update, defs are the 1st and 5th operands.
InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
- InstrStage<3, [A8_LSPipe]>], [2, 1, 1, 1, 3]>,
+ InstrStage<3, [A8_LSPipe]>],
+ [2, 1, 1, 1, 3], [], -1>, // dynamic uops
//
// Load multiple plus branch
InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
InstrStage<3, [A8_LSPipe]>,
InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
- [1, 2, 1, 1, 3]>,
+ [1, 2, 1, 1, 3], [], -1>, // dynamic uops
//
// Pop, def is the 3rd operand.
InstrItinData<IIC_iPop , [InstrStage<3, [A8_Pipe0], 0>,
- InstrStage<3, [A8_LSPipe]>], [1, 1, 3]>,
+ InstrStage<3, [A8_LSPipe]>],
+ [1, 1, 3], [], -1>, // dynamic uops
//
// Pop + branch, def is the 3rd operand.
InstrItinData<IIC_iPop_Br, [InstrStage<3, [A8_Pipe0], 0>,
InstrStage<3, [A8_LSPipe]>,
InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
- [1, 1, 3]>,
-
+ [1, 1, 3], [], -1>, // dynamic uops
//
// iLoadi + iALUr for t2LDRpci_pic.
InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
@@ -227,12 +229,13 @@ def CortexA8Itineraries : ProcessorItineraries<
// Store multiple. Pipeline 0 only.
// FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>,
- InstrStage<2, [A8_LSPipe]>]>,
+ InstrStage<2, [A8_LSPipe]>],
+ [], [], -1>, // dynamic uops
//
// Store multiple + update
InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>,
- InstrStage<2, [A8_LSPipe]>], [2]>,
-
+ InstrStage<2, [A8_LSPipe]>],
+ [2], [], -1>, // dynamic uops
//
// Preload
InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
@@ -393,14 +396,16 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrStage<1, [A8_NLSPipe], 0>,
InstrStage<1, [A8_LSPipe]>,
InstrStage<1, [A8_NLSPipe], 0>,
- InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 2]>,
+ InstrStage<1, [A8_LSPipe]>],
+ [1, 1, 1, 2], [], -1>, // dynamic uops
//
// FP Load Multiple + update
InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<1, [A8_NLSPipe], 0>,
InstrStage<1, [A8_LSPipe]>,
InstrStage<1, [A8_NLSPipe], 0>,
- InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 2]>,
+ InstrStage<1, [A8_LSPipe]>],
+ [2, 1, 1, 1, 2], [], -1>, // dynamic uops
//
// Single-precision FP Store
InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
@@ -419,15 +424,16 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrStage<1, [A8_NLSPipe], 0>,
InstrStage<1, [A8_LSPipe]>,
InstrStage<1, [A8_NLSPipe], 0>,
- InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 1]>,
+ InstrStage<1, [A8_LSPipe]>],
+ [1, 1, 1, 1], [], -1>, // dynamic uops
//
// FP Store Multiple + update
InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<1, [A8_NLSPipe], 0>,
InstrStage<1, [A8_LSPipe]>,
InstrStage<1, [A8_NLSPipe], 0>,
- InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 1]>,
-
+ InstrStage<1, [A8_LSPipe]>],
+ [2, 1, 1, 1, 1], [], -1>, // dynamic uops
// NEON
// Issue through integer pipeline, and execute in NEON unit.
//
@@ -1051,3 +1057,19 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrStage<1, [A8_NPipe], 0>,
InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model that will
+// replace itineraries.
+
+// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
+def CortexA8Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 2; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+
+ let Itineraries = CortexA8Itineraries;
+}
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 0d710cc..7bc590f 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -11,6 +11,10 @@
//
//===----------------------------------------------------------------------===//
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
//
// Ad-hoc scheduling information derived from the pretty vague "Cortex-A9 Technical
// Reference Manual".
@@ -280,7 +284,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1, 3],
- [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+ [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
//
// Load multiple + update, defs are the 1st and 5th operands.
InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -288,7 +293,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1, 3],
- [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+ [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
//
// Load multiple plus branch
InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -297,7 +303,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_LSUnit]>,
InstrStage<1, [A9_Branch]>],
[1, 2, 1, 1, 3],
- [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+ [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
//
// Pop, def is the 3rd operand.
InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -305,7 +312,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 3],
- [NoBypass, NoBypass, A9_LdBypass]>,
+ [NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
//
// Pop + branch, def is the 3rd operand.
InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -314,8 +322,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_LSUnit]>,
InstrStage<1, [A9_Branch]>],
[1, 1, 3],
- [NoBypass, NoBypass, A9_LdBypass]>,
-
+ [NoBypass, NoBypass, A9_LdBypass],
+ -1>, // dynamic uops
//
// iLoadi + iALUr for t2LDRpci_pic.
InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -409,14 +417,15 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
- InstrStage<2, [A9_LSUnit]>]>,
+ InstrStage<2, [A9_LSUnit]>],
+ [], [], -1>, // dynamic uops
//
// Store multiple + update
InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
- InstrStage<2, [A9_LSUnit]>], [2]>,
-
+ InstrStage<2, [A9_LSUnit]>],
+ [2], [], -1>, // dynamic uops
//
// Preload
InstrItinData<IIC_Preload, [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
@@ -713,7 +722,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
- InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1], [], -1>, // dynamic uops
//
// FP Load Multiple + update
// FIXME: assumes 2 doubles which requires 2 LS cycles.
@@ -722,7 +732,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
- InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1], [], -1>, // dynamic uops
//
// Single-precision FP Store
InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -749,7 +760,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
- InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>,
+ InstrStage<2, [A9_LSUnit]>],
+ [1, 1, 1, 1], [], -1>, // dynamic uops
//
// FP Store Multiple + update
// FIXME: assumes 2 doubles which requires 2 LS cycles.
@@ -758,7 +770,8 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
- InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>,
+ InstrStage<2, [A9_LSUnit]>],
+ [2, 1, 1, 1], [], -1>, // dynamic uops
// NEON
// VLD1
InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -1861,3 +1874,22 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_NPipe]>],
[4, 1, 2, 2, 3, 3, 1]>
]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model that will
+// replace itineraries.
+
+// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
+def CortexA9Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+ let LoadLatency = 2; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 8; // Based on estimate of pipeline depth.
+
+ let Itineraries = CortexA9Itineraries;
+}
+
+// TODO: Add Cortex-A9 processor and scheduler resources.
+
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index e2530d0..31d5d38 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -179,8 +179,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
Args.push_back(Entry);
// Emit __eabi_memset call
- std::pair<SDValue,SDValue> CallResult =
- TLI.LowerCallTo(Chain,
+ TargetLowering::CallLoweringInfo CLI(Chain,
Type::getVoidTy(*DAG.getContext()), // return type
false, // return sign ext
false, // return zero ext
@@ -193,7 +192,9 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
false, // is return val used
DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
TLI.getPointerTy()), // callee
- Args, DAG, dl); // arg list, DAG and debug
+ Args, DAG, dl);
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(CLI);
return CallResult.second;
}
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index e247b76..4762854 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -67,6 +67,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
, HasDataBarrier(false)
, Pref32BitThumb(false)
, AvoidCPSRPartialUpdate(false)
+ , HasRAS(false)
, HasMPExtension(false)
, FPOnlySP(false)
, AllowsUnalignedMem(false)
@@ -82,7 +83,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
// Insert the architecture feature derived from the target triple into the
// feature string. This is important for setting features that are implied
// based on the architecture version.
- std::string ArchFS = ARM_MC::ParseARMTriple(TT);
+ std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPUString);
if (!FS.empty()) {
if (!ArchFS.empty())
ArchFS = ArchFS + "," + FS;
@@ -96,13 +97,13 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
if (!HasV6T2Ops && hasThumb2())
HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6T2Ops = true;
+ // Keep a pointer to static instruction cost data for the specified CPU.
+ SchedModel = getSchedModelForCPU(CPUString);
+
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUString);
- // After parsing Itineraries, set ItinData.IssueWidth.
- computeIssueWidth();
-
- if (TT.find("eabi") != std::string::npos)
+ if ((TT.find("eabi") != std::string::npos) || (isTargetIOS() && isMClass()))
// FIXME: We might want to separate AAPCS and EABI. Some systems, e.g.
// Darwin-EABI conforms to AAPCS but not the rest of EABI.
TargetABI = ARM_ABI_AAPCS;
@@ -181,31 +182,7 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
}
unsigned ARMSubtarget::getMispredictionPenalty() const {
- // If we have a reasonable estimate of the pipeline depth, then we can
- // estimate the penalty of a misprediction based on that.
- if (isCortexA8())
- return 13;
- else if (isCortexA9())
- return 8;
-
- // Otherwise, just return a sensible default.
- return 10;
-}
-
-void ARMSubtarget::computeIssueWidth() {
- unsigned allStage1Units = 0;
- for (const InstrItinerary *itin = InstrItins.Itineraries;
- itin->FirstStage != ~0U; ++itin) {
- const InstrStage *IS = InstrItins.Stages + itin->FirstStage;
- allStage1Units |= IS->getUnits();
- }
- InstrItins.IssueWidth = 0;
- while (allStage1Units) {
- ++InstrItins.IssueWidth;
- // clear the lowest bit
- allStage1Units ^= allStage1Units & ~(allStage1Units - 1);
- }
- assert(InstrItins.IssueWidth <= 2 && "itinerary bug, too many stage 1 units");
+ return SchedModel->MispredictPenalty;
}
bool ARMSubtarget::enablePostRAScheduler(
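getMispredictionPenalty() and the hand-rolled computeIssueWidth() give way to a lookup into the statically generated MCSchedModel (see the CortexA8Model and CortexA9Model definitions above). A sketch of the new access pattern (illustrative only):

    #include "llvm/MC/MCSchedule.h"

    // Per-CPU costs now come from TableGen-generated data: 13 cycles on
    // Cortex-A8, 8 on Cortex-A9 (see the SchedMachineModel definitions).
    unsigned mispredictPenalty(const llvm::MCSchedModel *SM) {
      return SM->MispredictPenalty;
    }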
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index e72b06f..b394061 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -74,7 +74,7 @@ protected:
/// HasThumb2 - True if Thumb2 instructions are supported.
bool HasThumb2;
- /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs -
+ /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs -
/// v6m, v7m for example.
bool IsMClass;
@@ -155,6 +155,9 @@ protected:
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
+ /// SchedModel - Processor specific instruction costs.
+ const MCSchedModel *SchedModel;
+
/// Selected instruction itineraries (one entry per itinerary class.)
InstrItineraryData InstrItins;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 9aa8308..171c9ad 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -136,22 +136,22 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
bool ARMPassConfig::addPreISel() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge)
- PM->add(createGlobalMergePass(TM->getTargetLowering()));
+ addPass(createGlobalMergePass(TM->getTargetLowering()));
return false;
}
bool ARMPassConfig::addInstSelector() {
- PM->add(createARMISelDag(getARMTargetMachine(), getOptLevel()));
+ addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
return false;
}
bool ARMPassConfig::addPreRegAlloc() {
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only())
- PM->add(createARMLoadStoreOptimizationPass(true));
+ addPass(createARMLoadStoreOptimizationPass(true));
if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
- PM->add(createMLxExpansionPass());
+ addPass(createMLxExpansionPass());
return true;
}
@@ -159,23 +159,23 @@ bool ARMPassConfig::addPreSched2() {
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
if (getOptLevel() != CodeGenOpt::None) {
if (!getARMSubtarget().isThumb1Only()) {
- PM->add(createARMLoadStoreOptimizationPass());
+ addPass(createARMLoadStoreOptimizationPass());
printAndVerify("After ARM load / store optimizer");
}
if (getARMSubtarget().hasNEON())
- PM->add(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+ addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
}
// Expand some pseudo instructions into multiple instructions to allow
// proper scheduling.
- PM->add(createARMExpandPseudoPass());
+ addPass(createARMExpandPseudoPass());
if (getOptLevel() != CodeGenOpt::None) {
if (!getARMSubtarget().isThumb1Only())
- addPass(IfConverterID);
+ addPass(&IfConverterID);
}
if (getARMSubtarget().isThumb2())
- PM->add(createThumb2ITBlockPass());
+ addPass(createThumb2ITBlockPass());
return true;
}
@@ -183,13 +183,13 @@ bool ARMPassConfig::addPreSched2() {
bool ARMPassConfig::addPreEmitPass() {
if (getARMSubtarget().isThumb2()) {
if (!getARMSubtarget().prefers32BitThumb())
- PM->add(createThumb2SizeReductionPass());
+ addPass(createThumb2SizeReductionPass());
// Constant island pass work on unbundled instructions.
- addPass(UnpackMachineBundlesID);
+ addPass(&UnpackMachineBundlesID);
}
- PM->add(createARMConstantIslandPass());
+ addPass(createARMConstantIslandPass());
return true;
}
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index a5ea1c2..3d85ca7 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -24,20 +24,11 @@ using namespace dwarf;
void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
+ bool isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI();
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI();
+ InitializeELF(isAAPCS_ABI);
if (isAAPCS_ABI) {
- StaticCtorSection =
- getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY,
- ELF::SHF_WRITE |
- ELF::SHF_ALLOC,
- SectionKind::getDataRel());
- StaticDtorSection =
- getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY,
- ELF::SHF_WRITE |
- ELF::SHF_ALLOC,
- SectionKind::getDataRel());
LSDASection = NULL;
}
@@ -47,33 +38,3 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
0,
SectionKind::getMetadata());
}
-
-const MCSection *
-ARMElfTargetObjectFile::getStaticCtorSection(unsigned Priority) const {
- if (!isAAPCS_ABI)
- return TargetLoweringObjectFileELF::getStaticCtorSection(Priority);
-
- if (Priority == 65535)
- return StaticCtorSection;
-
- // Emit ctors in priority order.
- std::string Name = std::string(".init_array.") + utostr(Priority);
- return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY,
- ELF::SHF_ALLOC | ELF::SHF_WRITE,
- SectionKind::getDataRel());
-}
-
-const MCSection *
-ARMElfTargetObjectFile::getStaticDtorSection(unsigned Priority) const {
- if (!isAAPCS_ABI)
- return TargetLoweringObjectFileELF::getStaticDtorSection(Priority);
-
- if (Priority == 65535)
- return StaticDtorSection;
-
- // Emit dtors in priority order.
- std::string Name = std::string(".fini_array.") + utostr(Priority);
- return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY,
- ELF::SHF_ALLOC | ELF::SHF_WRITE,
- SectionKind::getDataRel());
-}
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index ff21060..c6a7261 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -20,7 +20,6 @@ class TargetMachine;
class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
protected:
const MCSection *AttributesSection;
- bool isAAPCS_ABI;
public:
ARMElfTargetObjectFile() :
TargetLoweringObjectFileELF(),
@@ -32,9 +31,6 @@ public:
virtual const MCSection *getAttributesSection() const {
return AttributesSection;
}
-
- const MCSection * getStaticCtorSection(unsigned Priority) const;
- const MCSection * getStaticDtorSection(unsigned Priority) const;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 2c53e3f..3a5957b 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -236,7 +236,10 @@ public:
Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
Match_RequiresNotITBlock,
Match_RequiresV6,
- Match_RequiresThumb2
+ Match_RequiresThumb2,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "ARMGenAsmMatcher.inc"
+
};
ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
@@ -793,6 +796,13 @@ public:
int64_t Value = CE->getValue();
return Value > 0 && Value <= 32;
}
+ bool isAdrLabel() const {
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant that does not fit
+ // the shifter-immediate encoding, we reject it.
+ if (isImm() && !isa<MCConstantExpr>(getImm())) return true;
+ else return (isARMSOImm() || isARMSOImmNeg());
+ }
bool isARMSOImm() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -914,7 +924,9 @@ public:
// Immediate offset in range [-255, 255].
if (!Memory.OffsetImm) return true;
int64_t Val = Memory.OffsetImm->getValue();
- return Val > -256 && Val < 256;
+ // The #-0 offset is encoded as INT32_MIN, and we have to check
+ // for this too.
+ return (Val > -256 && Val < 256) || Val == INT32_MIN;
}
bool isAM3Offset() const {
if (Kind != k_Immediate && Kind != k_PostIndexRegister)
@@ -1028,7 +1040,8 @@ public:
// Immediate offset a multiple of 4 in range [-1020, 1020].
if (!Memory.OffsetImm) return true;
int64_t Val = Memory.OffsetImm->getValue();
- return Val >= -1020 && Val <= 1020 && (Val & 3) == 0;
+ // Special case, #-0 is INT32_MIN.
+ return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == INT32_MIN;
}
bool isMemImm0_1020s4Offset() const {
if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
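Both offset predicates now accept INT32_MIN, the sentinel the parser uses to represent a literal "#-0" so it stays distinguishable from "#0". The AM3 check in isolation (hypothetical helper):

    #include <cstdint>

    // AM3 offsets: in range (-256, 256), or the "#-0" sentinel.
    bool isValidAM3Offset(int64_t Val) {
      return (Val > -256 && Val < 256) || Val == INT32_MIN;
    }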
@@ -1446,8 +1459,10 @@ public:
assert(isRegShiftedImm() &&
"addRegShiftedImmOperands() on non RegShiftedImm!");
Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg));
+ // Shift of #32 is encoded as 0 where permitted
+ unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 0 : RegShiftedImm.ShiftImm);
Inst.addOperand(MCOperand::CreateImm(
- ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, RegShiftedImm.ShiftImm)));
+ ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, Imm)));
}
void addShifterImmOperands(MCInst &Inst, unsigned N) const {
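For LSR/ASR the architecture encodes a shift amount of 32 as 0, so the operand emitter above normalizes the amount before building the shifted-register immediate. The rule on its own (illustrative sketch):

    // LSR/ASR #32 is encoded as a zero shift amount where permitted.
    unsigned normalizeShiftAmount(unsigned Amt) {
      return Amt == 32 ? 0 : Amt;
    }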
@@ -1637,6 +1652,22 @@ public:
Inst.addOperand(MCOperand::CreateImm(Imm));
}
+ void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ assert(isImm() && "Not an immediate!");
+
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup.
+ if (!isa<MCConstantExpr>(getImm())) {
+ Inst.addOperand(MCOperand::CreateExpr(getImm()));
+ return;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ int Val = CE->getValue();
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
+
void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
@@ -2301,7 +2332,7 @@ void ARMOperand::print(raw_ostream &OS) const {
OS << "<ccout " << getReg() << ">";
break;
case k_ITCondMask: {
- static const char *MaskStr[] = {
+ static const char *const MaskStr[] = {
"()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)",
"(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)"
};
@@ -2672,7 +2703,7 @@ parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
const AsmToken &Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
return MatchOperand_NoMatch;
- unsigned CC = StringSwitch<unsigned>(Tok.getString())
+ unsigned CC = StringSwitch<unsigned>(Tok.getString().lower())
.Case("eq", ARMCC::EQ)
.Case("ne", ARMCC::NE)
.Case("hs", ARMCC::HS)
@@ -2877,7 +2908,7 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
if (!RC->contains(EndReg))
return Error(EndLoc, "invalid register in register list");
// Ranges must go from low to high.
- if (getARMRegisterNumbering(Reg) > getARMRegisterNumbering(EndReg))
+ if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg))
return Error(EndLoc, "bad range in register list");
// Add all the registers in the range to the register list.
@@ -2904,13 +2935,13 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
if (!RC->contains(Reg))
return Error(RegLoc, "invalid register in register list");
// List must be monotonically increasing.
- if (getARMRegisterNumbering(Reg) < getARMRegisterNumbering(OldReg)) {
+ if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
Warning(RegLoc, "register list not in ascending order");
else
return Error(RegLoc, "register list not in ascending order");
}
- if (getARMRegisterNumbering(Reg) == getARMRegisterNumbering(OldReg)) {
+ if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
Warning(RegLoc, "duplicated register (" + RegTok.getString() +
") in register list");
continue;
@@ -3249,28 +3280,59 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser::
parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
- assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
- StringRef OptStr = Tok.getString();
-
- unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()))
- .Case("sy", ARM_MB::SY)
- .Case("st", ARM_MB::ST)
- .Case("sh", ARM_MB::ISH)
- .Case("ish", ARM_MB::ISH)
- .Case("shst", ARM_MB::ISHST)
- .Case("ishst", ARM_MB::ISHST)
- .Case("nsh", ARM_MB::NSH)
- .Case("un", ARM_MB::NSH)
- .Case("nshst", ARM_MB::NSHST)
- .Case("unst", ARM_MB::NSHST)
- .Case("osh", ARM_MB::OSH)
- .Case("oshst", ARM_MB::OSHST)
- .Default(~0U);
+ unsigned Opt;
+
+ if (Tok.is(AsmToken::Identifier)) {
+ StringRef OptStr = Tok.getString();
+
+ Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower())
+ .Case("sy", ARM_MB::SY)
+ .Case("st", ARM_MB::ST)
+ .Case("sh", ARM_MB::ISH)
+ .Case("ish", ARM_MB::ISH)
+ .Case("shst", ARM_MB::ISHST)
+ .Case("ishst", ARM_MB::ISHST)
+ .Case("nsh", ARM_MB::NSH)
+ .Case("un", ARM_MB::NSH)
+ .Case("nshst", ARM_MB::NSHST)
+ .Case("unst", ARM_MB::NSHST)
+ .Case("osh", ARM_MB::OSH)
+ .Case("oshst", ARM_MB::OSHST)
+ .Default(~0U);
- if (Opt == ~0U)
- return MatchOperand_NoMatch;
+ if (Opt == ~0U)
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat identifier token.
+ } else if (Tok.is(AsmToken::Hash) ||
+ Tok.is(AsmToken::Dollar) ||
+ Tok.is(AsmToken::Integer)) {
+ if (Parser.getTok().isNot(AsmToken::Integer))
+ Parser.Lex(); // Eat the '#'.
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ const MCExpr *MemBarrierID;
+ if (getParser().ParseExpression(MemBarrierID)) {
+ Error(Loc, "illegal expression");
+ return MatchOperand_ParseFail;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(MemBarrierID);
+ if (!CE) {
+ Error(Loc, "constant expression expected");
+ return MatchOperand_ParseFail;
+ }
+
+ int Val = CE->getValue();
+ if (Val & ~0xf) {
+ Error(Loc, "immediate value out of range");
+ return MatchOperand_ParseFail;
+ }
+
+ Opt = ARM_MB::RESERVED_0 + Val;
+ } else
+ return MatchOperand_ParseFail;
- Parser.Lex(); // Eat identifier token.
Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S));
return MatchOperand_Success;
}
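Besides the named options, the barrier operand now accepts a raw immediate in [0,15], mapped onto ARM_MB::RESERVED_0 + value; the '& ~0xf' test rejects anything wider. The validity check in isolation (hypothetical helper):

    #include <cstdint>

    // "dmb #n" style operands: only the low four bits may be set.
    bool isValidBarrierImm(int64_t Val) {
      return (Val & ~0xfLL) == 0;
    }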
@@ -3280,7 +3342,8 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser::
parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
- assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+ if (!Tok.is(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
StringRef IFlagsStr = Tok.getString();
// An iflags string of "none" is interpreted to mean that none of the AIF
@@ -3320,26 +3383,51 @@ parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// See ARMv6-M 10.1.1
std::string Name = Mask.lower();
unsigned FlagsVal = StringSwitch<unsigned>(Name)
- .Case("apsr", 0)
- .Case("iapsr", 1)
- .Case("eapsr", 2)
- .Case("xpsr", 3)
- .Case("ipsr", 5)
- .Case("epsr", 6)
- .Case("iepsr", 7)
- .Case("msp", 8)
- .Case("psp", 9)
- .Case("primask", 16)
- .Case("basepri", 17)
- .Case("basepri_max", 18)
- .Case("faultmask", 19)
- .Case("control", 20)
+ // Note: the ARM documentation deprecates using MSR APSR without a
+ // _<bits> qualifier as an alias for MSR APSR_nzcvq, but we accept the
+ // alias here anyway so that the "mask encoding" bits come out correct
+ // on MSR APSR writes.
+ //
+ // FIXME: Note the 0xc00 "mask encoding" bits version of the registers
+ // should really only be allowed when writing a special register. Note
+ // they get dropped in the MRS instruction reading a special register as
+ // the SYSm field is only 8 bits.
+ //
+ // FIXME: the _g and _nzcvqg versions are only allowed if the processor
+ // includes the DSP extension but that is not checked.
+ .Case("apsr", 0x800)
+ .Case("apsr_nzcvq", 0x800)
+ .Case("apsr_g", 0x400)
+ .Case("apsr_nzcvqg", 0xc00)
+ .Case("iapsr", 0x801)
+ .Case("iapsr_nzcvq", 0x801)
+ .Case("iapsr_g", 0x401)
+ .Case("iapsr_nzcvqg", 0xc01)
+ .Case("eapsr", 0x802)
+ .Case("eapsr_nzcvq", 0x802)
+ .Case("eapsr_g", 0x402)
+ .Case("eapsr_nzcvqg", 0xc02)
+ .Case("xpsr", 0x803)
+ .Case("xpsr_nzcvq", 0x803)
+ .Case("xpsr_g", 0x403)
+ .Case("xpsr_nzcvqg", 0xc03)
+ .Case("ipsr", 0x805)
+ .Case("epsr", 0x806)
+ .Case("iepsr", 0x807)
+ .Case("msp", 0x808)
+ .Case("psp", 0x809)
+ .Case("primask", 0x810)
+ .Case("basepri", 0x811)
+ .Case("basepri_max", 0x812)
+ .Case("faultmask", 0x813)
+ .Case("control", 0x814)
.Default(~0U);
if (FlagsVal == ~0U)
return MatchOperand_NoMatch;
- if (!hasV7Ops() && FlagsVal >= 17 && FlagsVal <= 19)
+ if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813)
// basepri, basepri_max and faultmask only valid for V7m.
return MatchOperand_NoMatch;
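The new values fold the MSR "mask encoding" into the flag word: bits [7:0] carry the SYSm number and bits [11:10] the mask field (0b10 for _nzcvq, 0b01 for _g, 0b11 for _nzcvqg). A sketch of the convention (hypothetical helper, name illustrative):

    // Compose an M-class MSR mask value as used in the table above.
    unsigned makeMSRMask(unsigned SYSm, bool NZCVQ, bool G) {
      return (NZCVQ ? 0x800u : 0) | (G ? 0x400u : 0) | (SYSm & 0xff);
    }
    // makeMSRMask(2, true, true) == 0xc02, i.e. "eapsr_nzcvqg".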
@@ -5216,8 +5304,8 @@ validateInstruction(MCInst &Inst,
case ARM::LDRD_POST:
case ARM::LDREXD: {
// Rt2 must be Rt + 1.
- unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg());
- unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
if (Rt2 != Rt + 1)
return Error(Operands[3]->getStartLoc(),
"destination operands must be sequential");
@@ -5225,8 +5313,8 @@ validateInstruction(MCInst &Inst,
}
case ARM::STRD: {
// Rt2 must be Rt + 1.
- unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg());
- unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
if (Rt2 != Rt + 1)
return Error(Operands[3]->getStartLoc(),
"source operands must be sequential");
@@ -5236,8 +5324,8 @@ validateInstruction(MCInst &Inst,
case ARM::STRD_POST:
case ARM::STREXD: {
// Rt2 must be Rt + 1.
- unsigned Rt = getARMRegisterNumbering(Inst.getOperand(1).getReg());
- unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(2).getReg());
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg());
if (Rt2 != Rt + 1)
return Error(Operands[3]->getStartLoc(),
"source operands must be sequential");
@@ -5315,6 +5403,16 @@ validateInstruction(MCInst &Inst,
"registers must be in range r0-r7");
break;
}
+ case ARM::tADDrSP: {
+ // If the non-SP source operand and the destination operand are not the
+ // same, we need thumb2 (for the wide encoding), or we have an error.
+ if (!isThumbTwo() &&
+ Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+ return Error(Operands[4]->getStartLoc(),
+ "source register must be the same as destination");
+ }
+ break;
+ }
}
return false;
@@ -6750,8 +6848,8 @@ processInstruction(MCInst &Inst,
case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
}
- unsigned Ammount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
- if (Ammount == 32) Ammount = 0;
+ unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+ if (Amount == 32) Amount = 0;
TmpInst.setOpcode(newOpc);
TmpInst.addOperand(Inst.getOperand(0)); // Rd
if (isNarrow)
@@ -6759,7 +6857,7 @@ processInstruction(MCInst &Inst,
Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
TmpInst.addOperand(Inst.getOperand(1)); // Rn
if (newOpc != ARM::t2RRX)
- TmpInst.addOperand(MCOperand::CreateImm(Ammount));
+ TmpInst.addOperand(MCOperand::CreateImm(Amount));
TmpInst.addOperand(Inst.getOperand(3)); // CondCode
TmpInst.addOperand(Inst.getOperand(4));
if (!isNarrow)
@@ -6809,6 +6907,9 @@ processInstruction(MCInst &Inst,
// A shift by zero is a plain MOVr, not a MOVsi.
unsigned Amt = Inst.getOperand(2).getImm();
unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi;
+ // A shift by 32 should be encoded as 0 when permitted
+ if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr))
+ Amt = 0;
unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt);
MCInst TmpInst;
TmpInst.setOpcode(Opc);
@@ -6985,6 +7086,16 @@ processInstruction(MCInst &Inst,
Inst = TmpInst;
return true;
}
+ case ARM::tADDrSP: {
+ // If the non-SP source operand and the destination operand are not the
+ // same, we need to use the 32-bit encoding if it's available.
+ if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+ Inst.setOpcode(ARM::t2ADDrr);
+ Inst.addOperand(MCOperand::CreateReg(0)); // cc_out
+ return true;
+ }
+ break;
+ }
case ARM::tB:
// A Thumb conditional branch outside of an IT block is a tBcc.
if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) {
@@ -7154,7 +7265,9 @@ processInstruction(MCInst &Inst,
}
case ARM::MOVsi: {
ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
- if (SOpc == ARM_AM::rrx) return false;
+ // rrx shifts and asr/lsr of #32 are encoded as 0
+ if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr)
+ return false;
if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) {
// Shifting by zero is accepted as a vanilla 'MOVr'
MCInst TmpInst;
@@ -7188,7 +7301,9 @@ processInstruction(MCInst &Inst,
case ARM::ADDrsi: newOpc = ARM::ADDrr; break;
}
// If the shift is by zero, use the non-shifted instruction definition.
- if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0) {
+ // The exception is for right shifts, where 0 == 32
+ if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 &&
+ !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) {
MCInst TmpInst;
TmpInst.setOpcode(newOpc);
TmpInst.addOperand(Inst.getOperand(0));
@@ -7207,9 +7322,7 @@ processInstruction(MCInst &Inst,
// The mask bits for all but the first condition are represented as
// the low bit of the condition code value implies 't'. We currently
// always have 1 implies 't', so XOR toggle the bits if the low bit
- // of the condition code is zero. The encoding also expects the low
- // bit of the condition to be encoded as bit 4 of the mask operand,
- // so mask that in if needed
+ // of the condition code is zero.
MCOperand &MO = Inst.getOperand(1);
unsigned Mask = MO.getImm();
unsigned OrigMask = Mask;
@@ -7218,8 +7331,7 @@ processInstruction(MCInst &Inst,
assert(Mask && TZ <= 3 && "illegal IT mask value!");
for (unsigned i = 3; i != TZ; --i)
Mask ^= 1 << i;
- } else
- Mask |= 0x10;
+ }
MO.setImm(Mask);
// Set up the IT block state according to the IT instruction we just
@@ -7231,6 +7343,86 @@ processInstruction(MCInst &Inst,
ITState.FirstCond = true;
break;
}
+ case ARM::t2LSLrr:
+ case ARM::t2LSRrr:
+ case ARM::t2ASRrr:
+ case ARM::t2SBCrr:
+ case ARM::t2RORrr:
+ case ARM::t2BICrr:
+ {
+ // Assemblers should use the narrow encodings of these instructions when permissible.
+ if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ isARMLowRegister(Inst.getOperand(2).getReg())) &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+ ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+ (!static_cast<ARMOperand*>(Operands[3])->isToken() ||
+ !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) {
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break;
+ case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break;
+ case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break;
+ case ARM::t2SBCrr: NewOpc = ARM::tSBC; break;
+ case ARM::t2RORrr: NewOpc = ARM::tROR; break;
+ case ARM::t2BICrr: NewOpc = ARM::tBIC; break;
+ }
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(5));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ return false;
+ }
+ case ARM::t2ANDrr:
+ case ARM::t2EORrr:
+ case ARM::t2ADCrr:
+ case ARM::t2ORRrr:
+ {
+ // Assemblers should use the narrow encodings of these instructions when permissible.
+ // These instructions are special in that they are commutable, so shorter encodings
+ // are available more often.
+ if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ isARMLowRegister(Inst.getOperand(2).getReg())) &&
+ (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
+ Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
+ ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+ (!static_cast<ARMOperand*>(Operands[3])->isToken() ||
+ !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) {
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("unexpected opcode");
+ case ARM::t2ADCrr: NewOpc = ARM::tADC; break;
+ case ARM::t2ANDrr: NewOpc = ARM::tAND; break;
+ case ARM::t2EORrr: NewOpc = ARM::tEOR; break;
+ case ARM::t2ORRrr: NewOpc = ARM::tORR; break;
+ }
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(5));
+ if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) {
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ } else {
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(1));
+ }
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+ return false;
+ }
}
return false;
}
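Both new cases apply the same narrowing test; the commutable group additionally tries the swapped source order. The conditions, pulled out as a standalone sketch (hypothetical helper; low registers are r0-r7):

    // A 32-bit data-processing instruction can shrink to 16 bits when both
    // sources are low registers, the destination is tied to a source, and
    // the flag-setting behaviour matches the IT-block context.
    bool canNarrow(unsigned Rd, unsigned Rn, unsigned Rm,
                   bool InITBlock, bool SetsFlags) {
      bool LowRegs = Rn < 8 && Rm < 8;
      bool Tied = (Rd == Rn) || (Rd == Rm);
      bool FlagsOK = InITBlock ? !SetsFlags : SetsFlags;
      return LowRegs && Tied && FlagsOK;
    }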
@@ -7277,6 +7469,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
+static const char *getSubtargetFeatureName(unsigned Val);
bool ARMAsmParser::
MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
@@ -7317,9 +7510,21 @@ MatchAndEmitInstruction(SMLoc IDLoc,
Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst);
return false;
- case Match_MissingFeature:
- Error(IDLoc, "instruction requires a CPU feature not currently enabled");
- return true;
+ case Match_MissingFeature: {
+ assert(ErrorInfo && "Unknown missing feature!");
+ // Special case the error message for the very common case where only
+ // a single subtarget feature is missing (Thumb vs. ARM, e.g.).
+ std::string Msg = "instruction requires:";
+ unsigned Mask = 1;
+ for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+ if (ErrorInfo & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg);
+ }
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0U) {
@@ -7336,7 +7541,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
return Error(IDLoc, "invalid instruction",
((ARMOperand*)Operands[0])->getLocRange());
case Match_ConversionFail:
- // The converter function will have already emited a diagnostic.
+ // The converter function will have already emitted a diagnostic.
return true;
case Match_RequiresNotITBlock:
return Error(IDLoc, "flag setting instruction only valid outside IT block");
@@ -7346,6 +7551,11 @@ MatchAndEmitInstruction(SMLoc IDLoc,
return Error(IDLoc, "instruction variant requires ARMv6 or later");
case Match_RequiresThumb2:
return Error(IDLoc, "instruction variant requires Thumb2");
+ case Match_ImmRange0_15: {
+ SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+ }
}
llvm_unreachable("Implement any new match types added!");
@@ -7582,5 +7792,6 @@ extern "C" void LLVMInitializeARMAsmParser() {
}
#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
#define GET_MATCHER_IMPLEMENTATION
#include "ARMGenAsmMatcher.inc"
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 9a2aab5..ac916cc 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -49,6 +49,8 @@ add_llvm_target(ARMCodeGen
Thumb2SizeReduction.cpp
)
+add_dependencies(LLVMARMCodeGen intrinsics_gen)
+
# workaround for hanging compilation on MSVC9, 10
if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 )
set_property(
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 912935d..e47bf66 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -24,12 +24,66 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <vector>
using namespace llvm;
typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
+ // Handles the condition code status of instructions in IT blocks
+ class ITStatus
+ {
+ public:
+ // Returns the condition code for the current instruction in an IT block
+ unsigned getITCC() {
+ unsigned CC = ARMCC::AL;
+ if (instrInITBlock())
+ CC = ITStates.back();
+ return CC;
+ }
+
+ // Advances the IT block state to the next T or E
+ void advanceITState() {
+ ITStates.pop_back();
+ }
+
+ // Returns true if the current instruction is in an IT block
+ bool instrInITBlock() {
+ return !ITStates.empty();
+ }
+
+ // Returns true if the current instruction is the last one in an IT block
+ bool instrLastInITBlock() {
+ return ITStates.size() == 1;
+ }
+
+ // Called when decoding an IT instruction. Sets the IT state for the following
+ // instructions that form the IT block. Firstcond and Mask correspond to the
+ // fields in the IT instruction encoding.
+ void setITState(char Firstcond, char Mask) {
+ // (3 - the number of trailing zeros) is the number of then / else.
+ unsigned CondBit0 = Firstcond & 1;
+ unsigned NumTZ = CountTrailingZeros_32(Mask);
+ unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
+ assert(NumTZ <= 3 && "Invalid IT mask!");
+ // Push condition codes onto the stack in the correct order for the pops
+ for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) {
+ bool T = ((Mask >> Pos) & 1) == CondBit0;
+ if (T)
+ ITStates.push_back(CCBits);
+ else
+ ITStates.push_back(CCBits ^ 1);
+ }
+ ITStates.push_back(CCBits);
+ }
+
+ private:
+ std::vector<unsigned char> ITStates;
+ };
+}
+
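
A standalone sketch of the ITStatus stack discipline above; the Firstcond/Mask values are illustrative, and the trailing-zero loop stands in for CountTrailingZeros_32():

#include <cstdio>
#include <vector>

int main() {
  unsigned char Firstcond = 0x0;       // EQ
  unsigned char Mask = 0x5;            // 0b0101 -> three extra slots: T, E, T
  std::vector<unsigned char> ITStates;
  unsigned CondBit0 = Firstcond & 1;
  unsigned NumTZ = 0;
  while (!((Mask >> NumTZ) & 1)) ++NumTZ;   // CountTrailingZeros_32(Mask)
  unsigned char CCBits = Firstcond & 0xf;
  for (unsigned Pos = NumTZ + 1; Pos <= 3; ++Pos) {
    bool T = ((Mask >> Pos) & 1) == CondBit0;
    ITStates.push_back(T ? CCBits : (unsigned char)(CCBits ^ 1));
  }
  ITStates.push_back(CCBits);          // first instruction's condition on top
  while (!ITStates.empty()) {          // getITCC() + advanceITState()
    printf("cc = %u\n", ITStates.back());
    ITStates.pop_back();
  }
  // prints 0 (EQ), 0 (EQ), 1 (NE), 0 (EQ): pop order equals program order
}
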
+namespace {
/// ARMDisassembler - ARM disassembler for all ARM platforms.
class ARMDisassembler : public MCDisassembler {
public:
@@ -78,7 +132,7 @@ public:
/// getEDInfo - See MCDisassembler.
const EDInstInfo *getEDInfo() const;
private:
- mutable std::vector<unsigned> ITBlock;
+ mutable ITStatus ITBlock;
DecodeStatus AddThumbPredicate(MCInst&) const;
void UpdateThumbVFPPredicate(MCInst&) const;
};
@@ -549,7 +603,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
/// These can often be values in a literal pool near the Address of the
/// instruction. The Address of the instruction and its immediate Value are
/// used as a possible literal pool entry. The SymbolLookUp call back will
-/// return the name of a symbol referenced by the the literal pool's entry if
+/// return the name of a symbol referenced by the literal pool's entry if
/// the referenced address is that of a symbol. Or it will return a pointer to
/// a literal 'C' string if the referenced address of the literal pool's entry
/// is an address into a section with 'C' string literals.
@@ -612,7 +666,7 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
case ARM::tSETEND:
// Some instructions (mostly conditional branches) are not
// allowed in IT blocks.
- if (!ITBlock.empty())
+ if (ITBlock.instrInITBlock())
S = SoftFail;
else
return Success;
@@ -623,7 +677,7 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
case ARM::t2TBH:
// Some instructions (mostly unconditional branches) can
// only appear at the end of, or outside of, an IT block.
- if (ITBlock.size() > 1)
+ if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
S = SoftFail;
break;
default:
@@ -633,13 +687,11 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
// If we're in an IT block, base the predicate on that. Otherwise,
// assume a predicate of AL.
unsigned CC;
- if (!ITBlock.empty()) {
- CC = ITBlock.back();
- if (CC == 0xF)
- CC = ARMCC::AL;
- ITBlock.pop_back();
- } else
+ CC = ITBlock.getITCC();
+ if (CC == 0xF)
CC = ARMCC::AL;
+ if (ITBlock.instrInITBlock())
+ ITBlock.advanceITState();
const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
@@ -674,11 +726,9 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
// context as a post-pass.
void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
unsigned CC;
- if (!ITBlock.empty()) {
- CC = ITBlock.back();
- ITBlock.pop_back();
- } else
- CC = ARMCC::AL;
+ CC = ITBlock.getITCC();
+ if (ITBlock.instrInITBlock())
+ ITBlock.advanceITState();
const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
MCInst::iterator I = MI.begin();
@@ -726,7 +776,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
result = decodeThumbSBitInstruction16(MI, insn16, Address, this, STI);
if (result) {
Size = 2;
- bool InITBlock = !ITBlock.empty();
+ bool InITBlock = ITBlock.instrInITBlock();
Check(result, AddThumbPredicate(MI));
AddThumb1SBit(MI, InITBlock);
return result;
@@ -739,7 +789,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Nested IT blocks are UNPREDICTABLE. Must be checked before we add
// the Thumb predicate.
- if (MI.getOpcode() == ARM::t2IT && !ITBlock.empty())
+ if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
result = MCDisassembler::SoftFail;
Check(result, AddThumbPredicate(MI));
@@ -749,21 +799,9 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// to the subsequent instructions.
if (MI.getOpcode() == ARM::t2IT) {
- // (3 - the number of trailing zeros) is the number of then / else.
- unsigned firstcond = MI.getOperand(0).getImm();
+ unsigned Firstcond = MI.getOperand(0).getImm();
unsigned Mask = MI.getOperand(1).getImm();
- unsigned CondBit0 = Mask >> 4 & 1;
- unsigned NumTZ = CountTrailingZeros_32(Mask);
- assert(NumTZ <= 3 && "Invalid IT mask!");
- for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
- bool T = ((Mask >> Pos) & 1) == CondBit0;
- if (T)
- ITBlock.insert(ITBlock.begin(), firstcond);
- else
- ITBlock.insert(ITBlock.begin(), firstcond ^ 1);
- }
-
- ITBlock.push_back(firstcond);
+ ITBlock.setITState(Firstcond, Mask);
}
return result;
@@ -783,7 +821,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
result = decodeThumbInstruction32(MI, insn32, Address, this, STI);
if (result != MCDisassembler::Fail) {
Size = 4;
- bool InITBlock = ITBlock.size();
+ bool InITBlock = ITBlock.instrInITBlock();
Check(result, AddThumbPredicate(MI));
AddThumb1SBit(MI, InITBlock);
return result;
@@ -1186,8 +1224,8 @@ static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
- unsigned Vd = fieldFromInstruction32(Val, 8, 4);
- unsigned regs = Val & 0xFF;
+ unsigned Vd = fieldFromInstruction32(Val, 8, 5);
+ unsigned regs = fieldFromInstruction32(Val, 0, 8);
if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
return MCDisassembler::Fail;
@@ -1203,8 +1241,10 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
- unsigned Vd = fieldFromInstruction32(Val, 8, 4);
- unsigned regs = (Val & 0xFF) / 2;
+ unsigned Vd = fieldFromInstruction32(Val, 8, 5);
+ unsigned regs = fieldFromInstruction32(Val, 0, 8);
+
+ regs = regs >> 1;
if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
return MCDisassembler::Fail;
@@ -2976,7 +3016,7 @@ static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
- if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4,
+ if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4,
true, 4, Inst, Decoder))
Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val)));
return MCDisassembler::Success;
@@ -3111,9 +3151,14 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
- int imm = Val & 0xFF;
- if (!(Val & 0x100)) imm *= -1;
- Inst.addOperand(MCOperand::CreateImm(imm << 2));
+ if (Val == 0)
+ Inst.addOperand(MCOperand::CreateImm(INT32_MIN));
+ else {
+ int imm = Val & 0xFF;
+
+ if (!(Val & 0x100)) imm *= -1;
+ Inst.addOperand(MCOperand::CreateImm(imm << 2));
+ }
return MCDisassembler::Success;
}
@@ -3258,9 +3303,9 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::CreateReg(ARM::SP));
} else if (Inst.getOpcode() == ARM::tADDspr) {
unsigned Rm = fieldFromInstruction16(Insn, 3, 4);
@@ -3299,10 +3344,25 @@ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
+ // Val is passed in as S:J1:J2:imm10H:imm10L:'0'
+ // Note only one trailing zero, not two. Also the J1 and J2 values are from
+ // the encoded instruction. So here change to I1 and I2 values via:
+ // I1 = NOT(J1 EOR S);
+ // I2 = NOT(J2 EOR S);
+ // and build the imm32 with two trailing zeros as documented:
+ // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32);
+ unsigned S = (Val >> 23) & 1;
+ unsigned J1 = (Val >> 22) & 1;
+ unsigned J2 = (Val >> 21) & 1;
+ unsigned I1 = !(J1 ^ S);
+ unsigned I2 = !(J2 ^ S);
+ unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+ int imm32 = SignExtend32<25>(tmp << 1);
+
if (!tryAddingSymbolicOperand(Address,
- (Address & ~2u) + SignExtend32<22>(Val << 1) + 4,
+ (Address & ~2u) + imm32 + 4,
true, 4, Inst, Decoder))
- Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1)));
+ Inst.addOperand(MCOperand::CreateImm(imm32));
return MCDisassembler::Success;
}
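
A worked example of the J-to-I bit transform used by DecodeThumbBLXOffset above (DecodeThumbBLTargetOperand below applies the same idea, with one trailing zero instead of two); signExtend32 is a local stand-in for llvm::SignExtend32:

#include <cstdint>
#include <cstdio>

// Sign-extend the low B bits of X, as llvm::SignExtend32<B> does.
template <unsigned B> static int32_t signExtend32(uint32_t X) {
  return (int32_t)(X << (32 - B)) >> (32 - B);
}

int main() {
  uint32_t Val = 0xFFFFFE;                  // S=1, J1=J2=1, all-ones field
  unsigned S  = (Val >> 23) & 1;
  unsigned J1 = (Val >> 22) & 1;
  unsigned J2 = (Val >> 21) & 1;
  unsigned I1 = !(J1 ^ S);
  unsigned I2 = !(J2 ^ S);
  uint32_t tmp = (Val & ~0x600000u) | (I1 << 22) | (I2 << 21);
  int32_t imm32 = signExtend32<25>(tmp << 1);
  printf("imm32 = %d\n", imm32);            // imm32 = -4
}
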
@@ -3408,35 +3468,39 @@ static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
static DecodeStatus
DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder){
- if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<8>(Val<<1) + 4,
+ if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4,
true, 2, Inst, Decoder))
- Inst.addOperand(MCOperand::CreateImm(SignExtend32<8>(Val << 1)));
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<9>(Val << 1)));
return MCDisassembler::Success;
}
static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder){
- if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4,
+ // Val is passed in as S:J1:J2:imm10:imm11
+ // Note no trailing zero after imm11. Also the J1 and J2 values are from
+ // the encoded instruction. So here change to I1 and I2 values via:
+ // I1 = NOT(J1 EOR S);
+ // I2 = NOT(J2 EOR S);
+ // and build the imm32 with one trailing zero as documented:
+ // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+ unsigned S = (Val >> 23) & 1;
+ unsigned J1 = (Val >> 22) & 1;
+ unsigned J2 = (Val >> 21) & 1;
+ unsigned I1 = !(J1 ^ S);
+ unsigned I2 = !(J2 ^ S);
+ unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+ int imm32 = SignExtend32<25>(tmp << 1);
+
+ if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
true, 4, Inst, Decoder))
- Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1)));
+ Inst.addOperand(MCOperand::CreateImm(imm32));
return MCDisassembler::Success;
}
static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
- switch (Val) {
- default:
+ if (Val & ~0xf)
return MCDisassembler::Fail;
- case 0xF: // SY
- case 0xE: // ST
- case 0xB: // ISH
- case 0xA: // ISHST
- case 0x7: // NSH
- case 0x6: // NSHST
- case 0x3: // OSH
- case 0x2: // OSHST
- break;
- }
Inst.addOperand(MCOperand::CreateImm(Val));
return MCDisassembler::Success;
@@ -4128,9 +4192,9 @@ static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
DecodeStatus S = MCDisassembler::Success;
unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4);
- unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 5, 1);
unsigned pred = fieldFromInstruction32(Insn, 28, 4);
- Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+ Rm |= fieldFromInstruction32(Insn, 0, 4) << 1;
if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
S = MCDisassembler::SoftFail;
@@ -4154,9 +4218,9 @@ static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
DecodeStatus S = MCDisassembler::Success;
unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4);
- unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 5, 1);
unsigned pred = fieldFromInstruction32(Insn, 28, 4);
- Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+ Rm |= fieldFromInstruction32(Insn, 0, 4) << 1;
if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
S = MCDisassembler::SoftFail;
@@ -4179,19 +4243,14 @@ static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
unsigned pred = fieldFromInstruction16(Insn, 4, 4);
- // The InstPrinter needs to have the low bit of the predicate in
- // the mask operand to be able to print it properly.
- unsigned mask = fieldFromInstruction16(Insn, 0, 5);
+ unsigned mask = fieldFromInstruction16(Insn, 0, 4);
if (pred == 0xF) {
pred = 0xE;
S = MCDisassembler::SoftFail;
}
- if ((mask & 0xF) == 0) {
- // Preserve the high bit of the mask, which is the low bit of
- // the predicate.
- mask &= 0x10;
+ if (mask == 0x0) {
mask |= 0x8;
S = MCDisassembler::SoftFail;
}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index cbd81c1..8b9109e 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -52,6 +52,27 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot) {
unsigned Opcode = MI->getOpcode();
+ // Check for HINT instructions w/ canonical names.
+ if (Opcode == ARM::HINT || Opcode == ARM::t2HINT) {
+ switch (MI->getOperand(0).getImm()) {
+ case 0: O << "\tnop"; break;
+ case 1: O << "\tyield"; break;
+ case 2: O << "\twfe"; break;
+ case 3: O << "\twfi"; break;
+ case 4: O << "\tsev"; break;
+ default:
+ // Anything else should just print normally.
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+ return;
+ }
+ printPredicateOperand(MI, 1, O);
+ if (Opcode == ARM::t2HINT)
+ O << ".w";
+ printAnnotation(O, Annot);
+ return;
+ }
+
// Check for MOVs and print canonical forms, instead.
if (Opcode == ARM::MOVsr) {
// FIXME: Thumb variants?
@@ -426,9 +447,13 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
return;
}
- if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
+ // If the op is sub, we have to print the immediate even if it is 0
+ unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+ ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm());
+
+ if (ImmOffs || (op == ARM_AM::sub))
O << ", #"
- << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
+ << ARM_AM::getAddrOpcStr(op)
<< ImmOffs;
O << ']';
}
@@ -643,22 +668,50 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
unsigned Mask = Op.getImm() & 0xf;
if (getAvailableFeatures() & ARM::FeatureMClass) {
- switch (Op.getImm()) {
+ unsigned SYSm = Op.getImm();
+ unsigned Opcode = MI->getOpcode();
+ // For reads of the special registers ignore the "mask encoding" bits
+ // which are only for writes.
+ if (Opcode == ARM::t2MRS_M)
+ SYSm &= 0xff;
+ switch (SYSm) {
default: llvm_unreachable("Unexpected mask value!");
- case 0: O << "apsr"; return;
- case 1: O << "iapsr"; return;
- case 2: O << "eapsr"; return;
- case 3: O << "xpsr"; return;
- case 5: O << "ipsr"; return;
- case 6: O << "epsr"; return;
- case 7: O << "iepsr"; return;
- case 8: O << "msp"; return;
- case 9: O << "psp"; return;
- case 16: O << "primask"; return;
- case 17: O << "basepri"; return;
- case 18: O << "basepri_max"; return;
- case 19: O << "faultmask"; return;
- case 20: O << "control"; return;
+ case 0:
+ case 0x800: O << "apsr"; return; // with _nzcvq bits is an alias for apsr
+ case 0x400: O << "apsr_g"; return;
+ case 0xc00: O << "apsr_nzcvqg"; return;
+ case 1:
+ case 0x801: O << "iapsr"; return; // with _nzcvq bits is an alias for iapsr
+ case 0x401: O << "iapsr_g"; return;
+ case 0xc01: O << "iapsr_nzcvqg"; return;
+ case 2:
+ case 0x802: O << "eapsr"; return; // with _nzcvq bits is an alias for eapsr
+ case 0x402: O << "eapsr_g"; return;
+ case 0xc02: O << "eapsr_nzcvqg"; return;
+ case 3:
+ case 0x803: O << "xpsr"; return; // with _nzcvq bits is an alias for xpsr
+ case 0x403: O << "xpsr_g"; return;
+ case 0xc03: O << "xpsr_nzcvqg"; return;
+ case 5:
+ case 0x805: O << "ipsr"; return;
+ case 6:
+ case 0x806: O << "epsr"; return;
+ case 7:
+ case 0x807: O << "iepsr"; return;
+ case 8:
+ case 0x808: O << "msp"; return;
+ case 9:
+ case 0x809: O << "psp"; return;
+ case 0x10:
+ case 0x810: O << "primask"; return;
+ case 0x11:
+ case 0x811: O << "basepri"; return;
+ case 0x12:
+ case 0x812: O << "basepri_max"; return;
+ case 0x13:
+ case 0x813: O << "faultmask"; return;
+ case 0x14:
+ case 0x814: O << "control"; return;
}
}
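
A sketch of the SYSm masking above, with a hypothetical IsRead flag standing in for the Opcode == ARM::t2MRS_M check:

#include <cassert>

// For reads (MRS) the write-only "mask encoding" bits are cleared, so
// 0x800 collapses to 0 and prints as plain "apsr".
int main() {
  unsigned SYSm = 0x800;
  bool IsRead = true;     // stand-in for Opcode == ARM::t2MRS_M
  if (IsRead)
    SYSm &= 0xff;
  assert(SYSm == 0);      // case 0 / 0x800 -> "apsr"
}
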
@@ -739,6 +792,25 @@ void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
}
+void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+
+ if (MO.isExpr()) {
+ O << *MO.getExpr();
+ return;
+ }
+
+ int32_t OffImm = (int32_t)MO.getImm();
+
+ if (OffImm == INT32_MIN)
+ O << "#-0";
+ else if (OffImm < 0)
+ O << "#-" << -OffImm;
+ else
+ O << "#" << OffImm;
+}
+
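
Several printers in this patch adopt INT32_MIN as a sentinel for a negative-zero offset, since #-0 has no distinct int representation; a minimal sketch of the convention:

#include <cstdint>
#include <cstdio>

static void printOffset(int32_t OffImm) {
  if (OffImm == INT32_MIN) printf("#-0");  // sentinel: subtract-zero form
  else if (OffImm < 0)     printf("#-%d", -OffImm);
  else                     printf("#%d", OffImm);
}

int main() {
  printOffset(INT32_MIN); printf("\n"); // #-0
  printOffset(-8);        printf("\n"); // #-8
  printOffset(0);         printf("\n"); // #0
}
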
void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
O << "#" << MI->getOperand(OpNum).getImm() * 4;
@@ -754,7 +826,8 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
// (3 - the number of trailing zeros) is the number of then / else.
unsigned Mask = MI->getOperand(OpNum).getImm();
- unsigned CondBit0 = Mask >> 4 & 1;
+ unsigned Firstcond = MI->getOperand(OpNum-1).getImm();
+ unsigned CondBit0 = Firstcond & 1;
unsigned NumTZ = CountTrailingZeros_32(Mask);
assert(NumTZ <= 3 && "Invalid IT mask!");
for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
@@ -899,12 +972,17 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
O << "[" << getRegisterName(MO1.getReg());
- int32_t OffImm = (int32_t)MO2.getImm() / 4;
+ int32_t OffImm = (int32_t)MO2.getImm();
+
+ assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
// Don't print +0.
- if (OffImm < 0)
- O << ", #-" << -OffImm * 4;
+ if (OffImm == INT32_MIN)
+ O << ", #-0";
+ else if (OffImm < 0)
+ O << ", #-" << -OffImm;
else if (OffImm > 0)
- O << ", #" << OffImm * 4;
+ O << ", #" << OffImm;
O << "]";
}
@@ -936,15 +1014,17 @@ void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
unsigned OpNum,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(OpNum);
- int32_t OffImm = (int32_t)MO1.getImm() / 4;
+ int32_t OffImm = (int32_t)MO1.getImm();
+
+ assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
// Don't print +0.
- if (OffImm != 0) {
- O << ", ";
- if (OffImm < 0)
- O << "#-" << -OffImm * 4;
- else if (OffImm > 0)
- O << "#" << OffImm * 4;
- }
+ if (OffImm == INT32_MIN)
+ O << ", #-0";
+ else if (OffImm < 0)
+ O << ", #-" << -OffImm;
+ else if (OffImm > 0)
+ O << ", #" << OffImm;
}
void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 8acb7ee..73d7bfd 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -73,6 +73,7 @@ public:
void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index d10bfc1..ac6ce64 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -12,6 +12,7 @@
#include "MCTargetDesc/ARMFixupKinds.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
@@ -84,7 +85,8 @@ public:
{ "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{ "fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel },
// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
{ "fixup_arm_movt_hi16", 0, 20, 0 },
@@ -110,32 +112,7 @@ public:
void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFixup &Fixup, const MCFragment *DF,
MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
- const MCSymbolRefExpr *A = Target.getSymA();
- // Some fixups to thumb function symbols need the low bit (thumb bit)
- // twiddled.
- if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 &&
- (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 &&
- (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 &&
- (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 &&
- (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 &&
- (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) {
- if (A) {
- const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
- if (Asm.isThumbFunc(&Sym))
- Value |= 1;
- }
- }
- // We must always generate a relocation for BL/BLX instructions if we have
- // a symbol to reference, as the linker relies on knowing the destination
- // symbol's thumb-ness to get interworking right.
- if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_blx ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
- IsResolved = false;
- }
+ bool &IsResolved);
bool mayNeedRelaxation(const MCInst &Inst) const;
@@ -269,7 +246,9 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ MCContext *Ctx = NULL) {
+ unsigned Kind = Fixup.getKind();
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
@@ -322,7 +301,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
Value = -Value;
isAdd = false;
}
- assert ((Value < 4096) && "Out of range pc-relative fixup value!");
+ if (Ctx && Value >= 4096)
+ Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
Value |= isAdd << 23;
// Same addressing mode as fixup_arm_pcrel_10,
@@ -345,8 +325,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
Value = -Value;
opc = 2; // 0b0010
}
- assert(ARM_AM::getSOImmVal(Value) != -1 &&
- "Out of range pc-relative fixup value!");
+ if (Ctx && ARM_AM::getSOImmVal(Value) == -1)
+ Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
// Encode the immediate and shift the opcode into place.
return ARM_AM::getSOImmVal(Value) | (opc << 21);
}
@@ -414,39 +394,65 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
return swapped;
}
case ARM::fixup_arm_thumb_bl: {
- // The value doesn't encode the low bit (always zero) and is offset by
- // four. The value is encoded into disjoint bit positions in the destination
- // opcode. x = unchanged, I = immediate value bit, S = sign extension bit
- //
- // BL: xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII
- //
- // Note that the halfwords are stored high first, low second; so we need
- // to transpose the fixup value here to map properly.
- unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0;
- uint32_t Binary = 0;
- Value = 0x3fffff & ((Value - 4) >> 1);
- Binary = (Value & 0x7ff) << 16; // Low imm11 value.
- Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value.
- Binary |= isNeg << 10; // Sign bit.
- return Binary;
+ // The value doesn't encode the low bit (always zero) and is offset by
+ // four. The 32-bit immediate value is encoded as
+ // imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
+ // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+ // The value is encoded into disjoint bit positions in the destination
+ // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+ // J = either J1 or J2 bit
+ //
+ // BL: xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII
+ //
+ // Note that the halfwords are stored high first, low second; so we need
+ // to transpose the fixup value here to map properly.
+ uint32_t offset = (Value - 4) >> 1;
+ uint32_t signBit = (offset & 0x800000) >> 23;
+ uint32_t I1Bit = (offset & 0x400000) >> 22;
+ uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+ uint32_t I2Bit = (offset & 0x200000) >> 21;
+ uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+ uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
+ uint32_t imm11Bits = (offset & 0x000007FF);
+
+ uint32_t Binary = 0;
+ uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
+ uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+ (uint16_t)imm11Bits);
+ Binary |= secondHalf << 16;
+ Binary |= firstHalf;
+ return Binary;
+
}
case ARM::fixup_arm_thumb_blx: {
- // The value doesn't encode the low two bits (always zero) and is offset by
- // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit
- // positions in the destination opcode. x = unchanged, I = immediate value
- // bit, S = sign extension bit, 0 = zero.
- //
- // BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0
- //
- // Note that the halfwords are stored high first, low second; so we need
- // to transpose the fixup value here to map properly.
- unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0;
- uint32_t Binary = 0;
- Value = 0xfffff & ((Value - 2) >> 2);
- Binary = (Value & 0x3ff) << 17; // Low imm10L value.
- Binary |= (Value & 0xffc00) >> 10; // High imm10H value.
- Binary |= isNeg << 10; // Sign bit.
- return Binary;
+ // The value doesn't encode the low two bits (always zero) and is offset by
+ // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
+ // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
+ // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+ // The value is encoded into disjoint bit positions in the destination
+ // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+ // J = either J1 or J2 bit, 0 = zero.
+ //
+ // BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
+ //
+ // Note that the halfwords are stored high first, low second; so we need
+ // to transpose the fixup value here to map properly.
+ uint32_t offset = (Value - 2) >> 2;
+ uint32_t signBit = (offset & 0x400000) >> 22;
+ uint32_t I1Bit = (offset & 0x200000) >> 21;
+ uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+ uint32_t I2Bit = (offset & 0x100000) >> 20;
+ uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+ uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
+ uint32_t imm10LBits = (offset & 0x3FF);
+
+ uint32_t Binary = 0;
+ uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
+ uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+ ((uint16_t)imm10LBits) << 1);
+ Binary |= secondHalf << 16;
+ Binary |= firstHalf;
+ return Binary;
}
case ARM::fixup_arm_thumb_cp:
// Offset by 4, and don't encode the low two bits. Two bytes of that
@@ -473,7 +479,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
isAdd = false;
}
// The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
- assert ((Value < 256) && "Out of range pc-relative fixup value!");
+ if (Ctx && Value >= 256)
+ Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
Value = (Value & 0xf) | ((Value & 0xf0) << 4);
return Value | (isAdd << 23);
}
@@ -491,7 +498,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
}
// These values don't encode the low two bits since they're always zero.
Value >>= 2;
- assert ((Value < 256) && "Out of range pc-relative fixup value!");
+ if (Ctx && Value >= 256)
+ Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
Value |= isAdd << 23;
// Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
@@ -507,6 +515,43 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
}
}
+void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFixup &Fixup,
+ const MCFragment *DF,
+ MCValue &Target, uint64_t &Value,
+ bool &IsResolved) {
+ const MCSymbolRefExpr *A = Target.getSymA();
+ // Some fixups to thumb function symbols need the low bit (thumb bit)
+ // twiddled.
+ if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 &&
+ (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 &&
+ (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 &&
+ (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 &&
+ (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 &&
+ (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) {
+ if (A) {
+ const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
+ if (Asm.isThumbFunc(&Sym))
+ Value |= 1;
+ }
+ }
+ // We must always generate a relocation for BL/BLX instructions if we have
+ // a symbol to reference, as the linker relies on knowing the destination
+ // symbol's thumb-ness to get interworking right.
+ if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx ||
+ (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl ||
+ (unsigned)Fixup.getKind() == ARM::fixup_arm_blx ||
+ (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
+ (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
+ IsResolved = false;
+
+ // Try to get the encoded value for the fixup as if we're mapping it into
+ // the instruction. This allows adjustFixupValue() to issue a diagnostic
+ // if the value is invalid.
+ (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
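
A standalone sketch of the fixup_arm_thumb_bl packing above (the blx case differs only in the low bits and alignment): offset bits split into S/I1/I2/imm10/imm11, the I bits become J bits, and the halfwords are stored high first. encodeThumbBL is a hypothetical helper and the input is an illustrative pre-adjusted branch distance:

#include <cstdint>
#include <cstdio>

static uint32_t encodeThumbBL(uint64_t Value) {
  uint32_t offset = (uint32_t)((Value - 4) >> 1);
  uint32_t S  = (offset >> 23) & 1;
  uint32_t I1 = (offset >> 22) & 1, J1 = (I1 ^ 1) ^ S;
  uint32_t I2 = (offset >> 21) & 1, J2 = (I2 ^ 1) ^ S;
  uint32_t imm10 = (offset >> 11) & 0x3FF;
  uint32_t imm11 = offset & 0x7FF;
  uint32_t firstHalf  = (S << 10) | imm10;              // S:imm10
  uint32_t secondHalf = (J1 << 13) | (J2 << 11) | imm11; // J1:J2:imm11
  return (secondHalf << 16) | firstHalf;                // high half stored first
}

int main() {
  printf("0x%08x\n", encodeThumbBL(0x100)); // short forward branch
}
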
namespace {
// FIXME: This should be in a separate file.
@@ -530,7 +575,7 @@ public:
void ELFARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value) const {
unsigned NumBytes = 4; // FIXME: 2 for Thumb
- Value = adjustFixupValue(Fixup.getKind(), Value);
+ Value = adjustFixupValue(Fixup, Value);
if (!Value) return; // Doesn't change encoding.
unsigned Offset = Fixup.getOffset();
@@ -615,7 +660,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
void DarwinARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- Value = adjustFixupValue(Fixup.getKind(), Value);
+ Value = adjustFixupValue(Fixup, Value);
if (!Value) return; // Doesn't change encoding.
unsigned Offset = Fixup.getOffset();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index ae11be8..de48a0e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -120,14 +120,22 @@ namespace ARM_MB {
// The Memory Barrier Option constants map directly to the 4-bit encoding of
// the option field for memory barrier operations.
enum MemBOpt {
- SY = 15,
- ST = 14,
- ISH = 11,
- ISHST = 10,
- NSH = 7,
- NSHST = 6,
+ RESERVED_0 = 0,
+ RESERVED_1 = 1,
+ OSHST = 2,
OSH = 3,
- OSHST = 2
+ RESERVED_4 = 4,
+ RESERVED_5 = 5,
+ NSHST = 6,
+ NSH = 7,
+ RESERVED_8 = 8,
+ RESERVED_9 = 9,
+ ISHST = 10,
+ ISH = 11,
+ RESERVED_12 = 12,
+ RESERVED_13 = 13,
+ ST = 14,
+ SY = 15
};
inline static const char *MemBOptToString(unsigned val) {
@@ -135,92 +143,24 @@ namespace ARM_MB {
default: llvm_unreachable("Unknown memory operation");
case SY: return "sy";
case ST: return "st";
+ case RESERVED_13: return "#0xd";
+ case RESERVED_12: return "#0xc";
case ISH: return "ish";
case ISHST: return "ishst";
+ case RESERVED_9: return "#0x9";
+ case RESERVED_8: return "#0x8";
case NSH: return "nsh";
case NSHST: return "nshst";
+ case RESERVED_5: return "#0x5";
+ case RESERVED_4: return "#0x4";
case OSH: return "osh";
case OSHST: return "oshst";
+ case RESERVED_1: return "#0x1";
+ case RESERVED_0: return "#0x0";
}
}
} // namespace ARM_MB
-/// getARMRegisterNumbering - Given the enum value for some register, e.g.
-/// ARM::LR, return the number that it corresponds to (e.g. 14).
-inline static unsigned getARMRegisterNumbering(unsigned Reg) {
- using namespace ARM;
- switch (Reg) {
- default:
- llvm_unreachable("Unknown ARM register!");
- case R0: case S0: case D0: case Q0: return 0;
- case R1: case S1: case D1: case Q1: return 1;
- case R2: case S2: case D2: case Q2: return 2;
- case R3: case S3: case D3: case Q3: return 3;
- case R4: case S4: case D4: case Q4: return 4;
- case R5: case S5: case D5: case Q5: return 5;
- case R6: case S6: case D6: case Q6: return 6;
- case R7: case S7: case D7: case Q7: return 7;
- case R8: case S8: case D8: case Q8: return 8;
- case R9: case S9: case D9: case Q9: return 9;
- case R10: case S10: case D10: case Q10: return 10;
- case R11: case S11: case D11: case Q11: return 11;
- case R12: case S12: case D12: case Q12: return 12;
- case SP: case S13: case D13: case Q13: return 13;
- case LR: case S14: case D14: case Q14: return 14;
- case PC: case S15: case D15: case Q15: return 15;
-
- case S16: case D16: return 16;
- case S17: case D17: return 17;
- case S18: case D18: return 18;
- case S19: case D19: return 19;
- case S20: case D20: return 20;
- case S21: case D21: return 21;
- case S22: case D22: return 22;
- case S23: case D23: return 23;
- case S24: case D24: return 24;
- case S25: case D25: return 25;
- case S26: case D26: return 26;
- case S27: case D27: return 27;
- case S28: case D28: return 28;
- case S29: case D29: return 29;
- case S30: case D30: return 30;
- case S31: case D31: return 31;
-
- // Composite registers use the regnum of the first register in the list.
- /* Q0 */ case D0_D2: return 0;
- case D1_D2: case D1_D3: return 1;
- /* Q1 */ case D2_D4: return 2;
- case D3_D4: case D3_D5: return 3;
- /* Q2 */ case D4_D6: return 4;
- case D5_D6: case D5_D7: return 5;
- /* Q3 */ case D6_D8: return 6;
- case D7_D8: case D7_D9: return 7;
- /* Q4 */ case D8_D10: return 8;
- case D9_D10: case D9_D11: return 9;
- /* Q5 */ case D10_D12: return 10;
- case D11_D12: case D11_D13: return 11;
- /* Q6 */ case D12_D14: return 12;
- case D13_D14: case D13_D15: return 13;
- /* Q7 */ case D14_D16: return 14;
- case D15_D16: case D15_D17: return 15;
- /* Q8 */ case D16_D18: return 16;
- case D17_D18: case D17_D19: return 17;
- /* Q9 */ case D18_D20: return 18;
- case D19_D20: case D19_D21: return 19;
- /* Q10 */ case D20_D22: return 20;
- case D21_D22: case D21_D23: return 21;
- /* Q11 */ case D22_D24: return 22;
- case D23_D24: case D23_D25: return 23;
- /* Q12 */ case D24_D26: return 24;
- case D25_D26: case D25_D27: return 25;
- /* Q13 */ case D26_D28: return 26;
- case D27_D28: case D27_D29: return 27;
- /* Q14 */ case D28_D30: return 28;
- case D29_D30: case D29_D31: return 29;
- /* Q15 */
- }
-}
-
/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
///
static inline bool isARMLowRegister(unsigned Reg) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index aa649ba..7d6acbc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -178,9 +178,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
break;
}
break;
- case ARM::fixup_arm_uncondbl:
case ARM::fixup_arm_blx:
- case ARM::fixup_arm_uncondbranch:
+ case ARM::fixup_arm_uncondbl:
switch (Modifier) {
case MCSymbolRefExpr::VK_ARM_PLT:
Type = ELF::R_ARM_PLT32;
@@ -192,6 +191,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
break;
case ARM::fixup_arm_condbl:
case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
Type = ELF::R_ARM_JUMP24;
break;
case ARM::fixup_arm_movt_hi16:
@@ -252,10 +252,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_thumb_cp:
case ARM::fixup_arm_thumb_br:
llvm_unreachable("Unimplemented");
- case ARM::fixup_arm_uncondbranch:
- Type = ELF::R_ARM_CALL;
- break;
case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
Type = ELF::R_ARM_JUMP24;
break;
case ARM::fixup_arm_movt_hi16:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 03e8d5f..d32805e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -22,40 +22,14 @@ EnableARMEHABI("arm-enable-ehabi", cl::Hidden,
cl::init(false));
-static const char *const arm_asm_table[] = {
- "{r0}", "r0",
- "{r1}", "r1",
- "{r2}", "r2",
- "{r3}", "r3",
- "{r4}", "r4",
- "{r5}", "r5",
- "{r6}", "r6",
- "{r7}", "r7",
- "{r8}", "r8",
- "{r9}", "r9",
- "{r10}", "r10",
- "{r11}", "r11",
- "{r12}", "r12",
- "{r13}", "r13",
- "{r14}", "r14",
- "{lr}", "lr",
- "{sp}", "sp",
- "{ip}", "ip",
- "{fp}", "fp",
- "{sl}", "sl",
- "{memory}", "memory",
- "{cc}", "cc",
- 0,0
-};
-
void ARMMCAsmInfoDarwin::anchor() { }
ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
- AsmTransCBE = arm_asm_table;
Data64bitsDirective = 0;
CommentString = "@";
Code16Directive = ".code\t16";
Code32Directive = ".code\t32";
+ UseDataRegionDirectives = true;
SupportsDebugInformation = true;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 10d1c48..94f1082 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -18,6 +18,7 @@
#include "MCTargetDesc/ARMMCExpr.h"
#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -38,11 +39,12 @@ class ARMMCCodeEmitter : public MCCodeEmitter {
void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
const MCInstrInfo &MCII;
const MCSubtargetInfo &STI;
+ const MCContext &CTX;
public:
ARMMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
MCContext &ctx)
- : MCII(mcii), STI(sti) {
+ : MCII(mcii), STI(sti), CTX(ctx) {
}
~ARMMCCodeEmitter() {}
@@ -336,6 +338,7 @@ public:
} // end anonymous namespace
MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new ARMMCCodeEmitter(MCII, STI, Ctx);
@@ -404,7 +407,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups) const {
if (MO.isReg()) {
unsigned Reg = MO.getReg();
- unsigned RegNo = getARMRegisterNumbering(Reg);
+ unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
// Q registers are encoded as 2x their register number.
switch (Reg) {
@@ -433,7 +436,7 @@ EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
- Reg = getARMRegisterNumbering(MO.getReg());
+ Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
int32_t SImm = MO1.getImm();
bool isAdd = true;
@@ -640,8 +643,8 @@ getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
return Val;
}
-/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
-/// target.
+/// getAdrLabelOpValue - Return encoding info for 12-bit shifted-immediate
+/// ADR label target.
uint32_t ARMMCCodeEmitter::
getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
@@ -651,15 +654,23 @@ getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
Fixups);
int32_t offset = MO.getImm();
uint32_t Val = 0x2000;
- if (offset < 0) {
+
+ if (offset == INT32_MIN) {
+ Val = 0x1000;
+ offset = 0;
+ } else if (offset < 0) {
Val = 0x1000;
offset *= -1;
}
- Val |= offset;
+
+ int SoImmVal = ARM_AM::getSOImmVal(offset);
+ assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+ Val |= SoImmVal;
return Val;
}
-/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// getT2AdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
/// target.
uint32_t ARMMCCodeEmitter::
getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
@@ -669,14 +680,16 @@ getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
Fixups);
int32_t Val = MO.getImm();
- if (Val < 0) {
+ if (Val == INT32_MIN)
+ Val = 0x1000;
+ else if (Val < 0) {
Val *= -1;
Val |= 0x1000;
}
return Val;
}
-/// getAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
+/// getThumbAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
/// target.
uint32_t ARMMCCodeEmitter::
getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
@@ -698,8 +711,8 @@ getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
// {2-0} = Rn
const MCOperand &MO1 = MI.getOperand(OpIdx);
const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
- unsigned Rn = getARMRegisterNumbering(MO1.getReg());
- unsigned Rm = getARMRegisterNumbering(MO2.getReg());
+ unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+ unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
return (Rm << 3) | Rn;
}
@@ -715,7 +728,7 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
// If The first operand isn't a register, we have a label reference.
const MCOperand &MO = MI.getOperand(OpIdx);
if (!MO.isReg()) {
- Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC.
+ Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC); // Rn is PC.
Imm12 = 0;
isAdd = false ; // 'U' bit is set as part of the fixup.
@@ -795,7 +808,7 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
// If The first operand isn't a register, we have a label reference.
const MCOperand &MO = MI.getOperand(OpIdx);
if (!MO.isReg()) {
- Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC.
+ Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC); // Rn is PC.
Imm8 = 0;
isAdd = false ; // 'U' bit is set as part of the fixup.
@@ -831,7 +844,7 @@ getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
// {7-0} = imm8
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
- unsigned Reg = getARMRegisterNumbering(MO.getReg());
+ unsigned Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
unsigned Imm8 = MO1.getImm();
return (Reg << 8) | Imm8;
}
@@ -861,11 +874,11 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
// Handle :upper16: and :lower16: assembly prefixes.
const MCExpr *E = MO.getExpr();
+ MCFixupKind Kind;
if (E->getKind() == MCExpr::Target) {
const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
E = ARM16Expr->getSubExpr();
- MCFixupKind Kind;
switch (ARM16Expr->getKind()) {
default: llvm_unreachable("Unsupported ARMFixup");
case ARMMCExpr::VK_ARM_HI16:
@@ -891,9 +904,21 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
}
Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
return 0;
- };
-
- llvm_unreachable("Unsupported MCExpr type in MCOperand!");
+ }
+ // If the expression doesn't have :upper16: or :lower16: on it,
+ // it's just a plain immediate expression, and those evaluate to
+ // the lower 16 bits of the expression regardless of whether
+ // we have a movt or a movw.
+ if (!isTargetDarwin() && EvaluateAsPCRel(E))
+ Kind = MCFixupKind(isThumb2()
+ ? ARM::fixup_t2_movw_lo16_pcrel
+ : ARM::fixup_arm_movw_lo16_pcrel);
+ else
+ Kind = MCFixupKind(isThumb2()
+ ? ARM::fixup_t2_movw_lo16
+ : ARM::fixup_arm_movw_lo16);
+ Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
+ return 0;
}
uint32_t ARMMCCodeEmitter::
@@ -902,8 +927,8 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx+1);
const MCOperand &MO2 = MI.getOperand(OpIdx+2);
- unsigned Rn = getARMRegisterNumbering(MO.getReg());
- unsigned Rm = getARMRegisterNumbering(MO1.getReg());
+ unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+ unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
@@ -933,7 +958,7 @@ getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
// {12} isAdd
// {11-0} imm12/Rm
const MCOperand &MO = MI.getOperand(OpIdx);
- unsigned Rn = getARMRegisterNumbering(MO.getReg());
+ unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
Binary |= Rn << 14;
return Binary;
@@ -956,7 +981,7 @@ getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
Binary <<= 7; // Shift amount is bits [11:7]
Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
- Binary |= getARMRegisterNumbering(MO.getReg()); // Rm is bits [3:0]
+ Binary |= CTX.getRegisterInfo().getEncodingValue(MO.getReg()); // Rm is bits [3:0]
}
return Binary | (isAdd << 12) | (isReg << 13);
}
@@ -969,7 +994,7 @@ getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx+1);
bool isAdd = MO1.getImm() != 0;
- return getARMRegisterNumbering(MO.getReg()) | (isAdd << 4);
+ return CTX.getRegisterInfo().getEncodingValue(MO.getReg()) | (isAdd << 4);
}
uint32_t ARMMCCodeEmitter::
@@ -987,7 +1012,7 @@ getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
// if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
if (!isImm)
- Imm8 = getARMRegisterNumbering(MO.getReg());
+ Imm8 = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
return Imm8 | (isAdd << 8) | (isImm << 9);
}
@@ -1005,7 +1030,7 @@ getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
// If The first operand isn't a register, we have a label reference.
if (!MO.isReg()) {
- unsigned Rn = getARMRegisterNumbering(ARM::PC); // Rn is PC.
+ unsigned Rn = CTX.getRegisterInfo().getEncodingValue(ARM::PC); // Rn is PC.
assert(MO.isExpr() && "Unexpected machine operand type!");
const MCExpr *Expr = MO.getExpr();
@@ -1015,14 +1040,14 @@ getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
++MCNumCPRelocations;
return (Rn << 9) | (1 << 13);
}
- unsigned Rn = getARMRegisterNumbering(MO.getReg());
+ unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
unsigned Imm = MO2.getImm();
bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
bool isImm = MO1.getReg() == 0;
uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
// if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
if (!isImm)
- Imm8 = getARMRegisterNumbering(MO1.getReg());
+ Imm8 = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
}
@@ -1050,7 +1075,7 @@ getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
// {2-0} = Rn
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
- unsigned Rn = getARMRegisterNumbering(MO.getReg());
+ unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
unsigned Imm5 = MO1.getImm();
return ((Imm5 & 0x1f) << 3) | Rn;
}
@@ -1077,7 +1102,7 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
// If The first operand isn't a register, we have a label reference.
const MCOperand &MO = MI.getOperand(OpIdx);
if (!MO.isReg()) {
- Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC.
+ Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC); // Rn is PC.
Imm8 = 0;
isAdd = false; // 'U' bit is handled as part of the fixup.
@@ -1123,7 +1148,7 @@ getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
// Encode Rm.
- unsigned Binary = getARMRegisterNumbering(MO.getReg());
+ unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
// Encode the shift opcode.
unsigned SBits = 0;
@@ -1148,7 +1173,7 @@ getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
// Encode the shift operation Rs.
// Encode Rs bit[11:8].
assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
- return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
+ return Binary | (CTX.getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
}
unsigned ARMMCCodeEmitter::
@@ -1167,7 +1192,7 @@ getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
// Encode Rm.
- unsigned Binary = getARMRegisterNumbering(MO.getReg());
+ unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
// Encode the shift opcode.
unsigned SBits = 0;
@@ -1192,8 +1217,7 @@ getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
// Encode shift_imm bit[11:7].
Binary |= SBits << 4;
unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm());
- assert(Offset && "Offset must be in range 1-32!");
- if (Offset == 32) Offset = 0;
+ assert(Offset < 32 && "Offset must be in range 0-31!");
return Binary | (Offset << 7);
}
@@ -1207,9 +1231,9 @@ getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
// Encoded as [Rn, Rm, imm].
// FIXME: Needs fixup support.
- unsigned Value = getARMRegisterNumbering(MO1.getReg());
+ unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
Value <<= 4;
- Value |= getARMRegisterNumbering(MO2.getReg());
+ Value |= CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
Value <<= 2;
Value |= MO3.getImm();
@@ -1223,7 +1247,7 @@ getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
const MCOperand &MO2 = MI.getOperand(OpNum+1);
// FIXME: Needs fixup support.
- unsigned Value = getARMRegisterNumbering(MO1.getReg());
+ unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
// Even though the immediate is 8 bits long, we need 9 bits in order
// to represent the (inverse of the) sign bit.
@@ -1285,7 +1309,7 @@ getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
// Encode Rm.
- unsigned Binary = getARMRegisterNumbering(MO.getReg());
+ unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
// Encode the shift opcode.
unsigned SBits = 0;
@@ -1341,7 +1365,7 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
if (SPRRegs || DPRRegs) {
// VLDM/VSTM
- unsigned RegNo = getARMRegisterNumbering(Reg);
+ unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
Binary |= (RegNo & 0x1f) << 8;
if (SPRRegs)
@@ -1350,7 +1374,7 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
Binary |= NumRegs * 2;
} else {
for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
- unsigned RegNo = getARMRegisterNumbering(MI.getOperand(I).getReg());
+ unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(MI.getOperand(I).getReg());
Binary |= 1 << RegNo;
}
}
@@ -1366,7 +1390,7 @@ getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
const MCOperand &Reg = MI.getOperand(Op);
const MCOperand &Imm = MI.getOperand(Op + 1);
- unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+ unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
unsigned Align = 0;
switch (Imm.getImm()) {
@@ -1389,7 +1413,7 @@ getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
const MCOperand &Reg = MI.getOperand(Op);
const MCOperand &Imm = MI.getOperand(Op + 1);
- unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+ unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
unsigned Align = 0;
switch (Imm.getImm()) {
@@ -1415,7 +1439,7 @@ getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
const MCOperand &Reg = MI.getOperand(Op);
const MCOperand &Imm = MI.getOperand(Op + 1);
- unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+ unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
unsigned Align = 0;
switch (Imm.getImm()) {
@@ -1434,7 +1458,7 @@ getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const {
const MCOperand &MO = MI.getOperand(Op);
if (MO.getReg() == 0) return 0x0D;
- return getARMRegisterNumbering(MO.getReg());
+ return CTX.getRegisterInfo().getEncodingValue(MO.getReg());
}
unsigned ARMMCCodeEmitter::
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index e3512cd..5df84c8 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -35,7 +35,7 @@
using namespace llvm;
-std::string ARM_MC::ParseARMTriple(StringRef TT) {
+std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
// Set the boolean corresponding to the current target triple, or the default
// if one cannot be determined, to true.
unsigned Len = TT.size();
@@ -51,27 +51,48 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
Idx = 6;
}
+ bool NoCPU = CPU == "generic" || CPU.empty();
std::string ARMArchFeature;
if (Idx) {
unsigned SubVer = TT[Idx];
if (SubVer >= '7' && SubVer <= '9') {
if (Len >= Idx+2 && TT[Idx+1] == 'm') {
- // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
- ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
+ if (NoCPU)
+ // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
+ ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
+ else
+ // Use CPU to figure out the exact features.
+ ARMArchFeature = "+v7";
} else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') {
- // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
- // FeatureT2XtPk, FeatureMClass
- ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass";
- } else
- // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
- ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
+ if (NoCPU)
+ // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
+ // FeatureT2XtPk, FeatureMClass
+ ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass";
+ else
+ // Use CPU to figure out the exact features.
+ ARMArchFeature = "+v7";
+ } else {
+ // v7 CPUs have lots of different feature sets. If no CPU is specified,
+ // then assume the v7a (e.g. cortex-a8) feature set. Otherwise, return
+ // the "minimum" feature set and use the CPU string to figure out the exact
+ // features.
+ if (NoCPU)
+ // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+ ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
+ else
+ // Use CPU to figure out the exact features.
+ ARMArchFeature = "+v7";
+ }
} else if (SubVer == '6') {
if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
ARMArchFeature = "+v6t2";
- else if (Len >= Idx+2 && TT[Idx+1] == 'm')
- // v6m: FeatureNoARM, FeatureMClass
- ARMArchFeature = "+v6t2,+noarm,+mclass";
- else
+ else if (Len >= Idx+2 && TT[Idx+1] == 'm') {
+ if (NoCPU)
+ // v6m: FeatureNoARM, FeatureMClass
+ ARMArchFeature = "+v6,+noarm,+mclass";
+ else
+ ARMArchFeature = "+v6";
+ } else
ARMArchFeature = "+v6";
} else if (SubVer == '5') {
if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
@@ -94,7 +115,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS) {
- std::string ArchFS = ARM_MC::ParseARMTriple(TT);
+ std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
if (!FS.empty()) {
if (!ArchFS.empty())
ArchFS = ArchFS + "," + FS.str();
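
A minimal sketch of the triple-vs-CPU policy above, not the real ParseARMTriple: with no concrete CPU the triple supplies a full default feature set, otherwise only the base architecture, leaving the CPU to fill in the rest:

#include <cassert>
#include <string>

static std::string archFeature(const std::string &Arch, const std::string &CPU) {
  bool NoCPU = CPU.empty() || CPU == "generic";
  if (Arch == "v7m")
    return NoCPU ? "+v7,+noarm,+db,+hwdiv,+mclass" : "+v7";
  return "+v7"; // other architectures elided in this sketch
}

int main() {
  assert(archFeature("v7m", "") == "+v7,+noarm,+db,+hwdiv,+mclass");
  assert(archFeature("v7m", "cortex-m3") == "+v7");
}
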
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 88472d7..510302d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -23,6 +23,7 @@ class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCObjectWriter;
+class MCRegisterInfo;
class MCSubtargetInfo;
class StringRef;
class Target;
@@ -31,7 +32,7 @@ class raw_ostream;
extern Target TheARMTarget, TheThumbTarget;
namespace ARM_MC {
- std::string ParseARMTriple(StringRef TT);
+ std::string ParseARMTriple(StringRef TT, StringRef CPU);
/// createARMMCSubtargetInfo - Create a ARM MCSubtargetInfo instance.
/// This is exposed so Asm parser, etc. do not need to go through
@@ -41,6 +42,7 @@ namespace ARM_MC {
}
MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 8057cb6..a51e0fa 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -190,7 +190,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
// 0 - arm instructions
// 1 - thumb instructions
// the other half of the relocated expression is in the following pair
- // relocation entry in the the low 16 bits of r_address field.
+ // relocation entry in the low 16 bits of r_address field.
unsigned ThumbBit = 0;
unsigned MovtBit = 0;
switch ((unsigned)Fixup.getKind()) {
@@ -408,15 +408,22 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
// Even when it's not a scattered relocation, movw/movt always uses
// a PAIR relocation.
if (Type == macho::RIT_ARM_Half) {
- // The other-half value only gets populated for the movt relocation.
+ // The other-half value only gets populated for the movt and movw
+ // relocation entries.
      uint32_t Value = 0;
switch ((unsigned)Fixup.getKind()) {
default: break;
+ case ARM::fixup_arm_movw_lo16:
+ case ARM::fixup_arm_movw_lo16_pcrel:
+ case ARM::fixup_t2_movw_lo16:
+ case ARM::fixup_t2_movw_lo16_pcrel:
+ Value = (FixedValue >> 16) & 0xffff;
+ break;
case ARM::fixup_arm_movt_hi16:
case ARM::fixup_arm_movt_hi16_pcrel:
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movt_hi16_pcrel:
- Value = FixedValue;
+ Value = FixedValue & 0xffff;
break;
}
macho::RelocationEntry MREPair;
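The movw half of a movw/movt pair previously left the other-half value at zero; the hunk above populates it for both halves and masks the movt case to 16 bits. The extraction in isolation (the input value is illustrative):

    uint32_t FixedValue = 0x12345678;
    // movw (lo16) fixups stash the *high* half in the PAIR entry:
    uint32_t PairForMovw = (FixedValue >> 16) & 0xffff;  // 0x1234
    // movt (hi16) fixups stash the *low* half, now masked to 16 bits:
    uint32_t PairForMovt = FixedValue & 0xffff;          // 0x5678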
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 2899836..ad60e32 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -220,7 +220,9 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
const MCInstrDesc &MCID1 = TII->get(MulOpc);
const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
- unsigned TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI));
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ unsigned TmpReg = MRI->createVirtualRegister(
+ TII->getRegClass(MCID1, 0, TRI, MF));
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
.addReg(Src1Reg, getKillRegState(Src1Kill))
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 3eddda8..57dc6cb 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -710,3 +710,24 @@ targets, e.g., PPC, that share this behavior, it would be best to implement
this in a target-independent way: we should probably fold that (when using
"undefined at zero" semantics) to set the "defined at zero" bit and have
the code generator expand out the right code.
+
+
+//===---------------------------------------------------------------------===//
+
+Clean up the test/MC/ARM files to have more robust register choices.
+
+R0 should not be used as a register operand in the assembler tests because it
+is then impossible to distinguish a correct encoding from a missing-operand
+encoding, since zero is the default value for the binary encoder.
+e.g.,
+ add r0, r0 // bad
+ add r3, r5 // good
+
+Register operands should be distinct. That is, when the encoding does not
+require two syntactical operands to refer to the same register, two different
+registers should be used in the test so as to catch errors where the
+operands are swapped in the encoding.
+e.g.,
+ subs.w r1, r1, r1 // bad
+ subs.w r1, r2, r3 // good
+
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index e03e758..735b255 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -53,11 +53,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert((RC == ARM::tGPRRegisterClass ||
+ assert((RC == &ARM::tGPRRegClass ||
(TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
isARMLowRegister(SrcReg))) && "Unknown regclass!");
- if (RC == ARM::tGPRRegisterClass ||
+ if (RC == &ARM::tGPRRegClass ||
(TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
isARMLowRegister(SrcReg))) {
DebugLoc DL;
@@ -81,11 +81,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert((RC == ARM::tGPRRegisterClass ||
+ assert((RC == &ARM::tGPRRegClass ||
(TargetRegisterInfo::isPhysicalRegister(DestReg) &&
isARMLowRegister(DestReg))) && "Unknown regclass!");
- if (RC == ARM::tGPRRegisterClass ||
+ if (RC == &ARM::tGPRRegClass ||
(TargetRegisterInfo::isPhysicalRegister(DestReg) &&
isARMLowRegister(DestReg))) {
DebugLoc DL;
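The hunks above are one instance of a tree-wide rename: the TableGen-emitted *RegisterClass pointer aliases are dropped, and code takes the address of the register class object directly. Before/after sketch:

    // old: if (RC == ARM::tGPRRegisterClass) ...
    // new: compare against the address of the TableGen'erated object
    if (RC == &ARM::tGPRRegClass) {
      // low-register spill/reload path
    }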
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index ef77bbd..a39b722 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -49,13 +49,14 @@ const TargetRegisterClass*
Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
const {
if (ARM::tGPRRegClass.hasSubClassEq(RC))
- return ARM::tGPRRegisterClass;
+ return &ARM::tGPRRegClass;
return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC);
}
const TargetRegisterClass *
-Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const {
- return ARM::tGPRRegisterClass;
+Thumb1RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
+ return &ARM::tGPRRegClass;
}
/// emitLoadConstPool - Emits a load from constpool to materialize the
@@ -109,7 +110,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
unsigned LdReg = DestReg;
if (DestReg == ARM::SP) {
assert(BaseReg == ARM::SP && "Unexpected!");
- LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
+ LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
}
if (NumBytes <= 255 && NumBytes >= 0)
@@ -693,7 +694,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// register. The offset is already handled in the vreg value.
MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false);
} else if (MI.mayStore()) {
- VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
+ VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
bool UseRR = false;
if (Opcode == ARM::tSTRspi) {
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 6971842..f2e4b08 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -30,7 +30,8 @@ public:
const TargetRegisterClass*
getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
- const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+ const TargetRegisterClass*
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index ecb4c2f..d54aa93 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -24,8 +24,6 @@ STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
namespace {
class Thumb2ITBlockPass : public MachineFunctionPass {
- bool PreRegAlloc;
-
public:
static char ID;
Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
@@ -76,16 +74,14 @@ static void TrackDefUses(MachineInstr *MI,
for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
unsigned Reg = LocalUses[i];
Uses.insert(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
+ for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
Uses.insert(*Subreg);
}
for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
unsigned Reg = LocalDefs[i];
Defs.insert(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
+ for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
Defs.insert(*Subreg);
if (Reg == ARM::CPSR)
continue;
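The null-terminated uint16_t sub-register lists are replaced by the MC register iterators. A sketch of the idiom (MarkUsed is a placeholder, not from this patch; the alias form appears later in HexagonCallingConvLower.cpp):

    for (MCSubRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
      Uses.insert(*SR);                      // every sub-register of Reg
    for (MCRegAliasIterator AI(Reg, TRI, /*IncludeSelf=*/true); AI.isValid(); ++AI)
      MarkUsed(*AI);                         // Reg plus everything aliasing it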
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 8ab486b..e9e20dd 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -126,9 +126,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass ||
- RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass ||
- RC == ARM::GPRnopcRegisterClass) {
+ if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
+ RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+ RC == &ARM::GPRnopcRegClass) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -153,9 +153,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass ||
- RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass ||
- RC == ARM::GPRnopcRegisterClass) {
+ if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
+ RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+ RC == &ARM::GPRnopcRegClass) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -563,48 +563,6 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
return Offset == 0;
}
-/// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the
-/// two-address instruction inserted by two-address pass.
-void
-Thumb2InstrInfo::scheduleTwoAddrSource(MachineInstr *SrcMI,
- MachineInstr *UseMI,
- const TargetRegisterInfo &TRI) const {
- if (SrcMI->getOpcode() != ARM::tMOVr || SrcMI->getOperand(1).isKill())
- return;
-
- unsigned PredReg = 0;
- ARMCC::CondCodes CC = getInstrPredicate(UseMI, PredReg);
- if (CC == ARMCC::AL || PredReg != ARM::CPSR)
- return;
-
- // Schedule the copy so it doesn't come between previous instructions
- // and UseMI which can form an IT block.
- unsigned SrcReg = SrcMI->getOperand(1).getReg();
- ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
- MachineBasicBlock *MBB = UseMI->getParent();
- MachineBasicBlock::iterator MBBI = SrcMI;
- unsigned NumInsts = 0;
- while (--MBBI != MBB->begin()) {
- if (MBBI->isDebugValue())
- continue;
-
- MachineInstr *NMI = &*MBBI;
- ARMCC::CondCodes NCC = getInstrPredicate(NMI, PredReg);
- if (!(NCC == CC || NCC == OCC) ||
- NMI->modifiesRegister(SrcReg, &TRI) ||
- NMI->modifiesRegister(ARM::CPSR, &TRI))
- break;
- if (++NumInsts == 4)
- // Too many in a row!
- return;
- }
-
- if (NumInsts) {
- MBB->remove(SrcMI);
- MBB->insert(++MBBI, SrcMI);
- }
-}
-
ARMCC::CondCodes
llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
unsigned Opc = MI->getOpcode();
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 0911f8a..2cdcd06 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -57,11 +57,6 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const;
- /// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the
-  /// two-address instruction inserted by two-address pass.
- void scheduleTwoAddrSource(MachineInstr *SrcMI, MachineInstr *UseMI,
- const TargetRegisterInfo &TRI) const;
-
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index b5a397e..f18f491 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -67,6 +67,7 @@ namespace {
{ ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 },
//FIXME: Disable CMN, as CCodes are backwards from compare expectations
//{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 },
+ { ARM::t2CMNzrr, ARM::tCMNz, 0, 0, 0, 1, 0, 2,0, 0,0 },
{ ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 },
{ ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 },
{ ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 },
diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt
index cf4f796..1f8ca86 100644
--- a/lib/Target/CellSPU/CMakeLists.txt
+++ b/lib/Target/CellSPU/CMakeLists.txt
@@ -24,5 +24,7 @@ add_llvm_target(CellSPUCodeGen
SPUNopFiller.cpp
)
+add_dependencies(LLVMCellSPUCodeGen intrinsics_gen)
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt
index 3e7e0b6..3bce960 100644
--- a/lib/Target/CellSPU/README.txt
+++ b/lib/Target/CellSPU/README.txt
@@ -37,6 +37,20 @@ to add 'spu' to configure's --enable-targets option, e.g.:
---------------------------------------------------------------------------
TODO:
+* In commit r142152 vector legalization was set to element promotion by
+  default. This breaks half-width vectors (e.g. v2i32) badly, as they get
+  element-promoted to much slower types (v2i64).
+
+* Many CellSPU-specific codegen tests only grep & count the number of
+  instructions, without checking their placement with FileCheck. There have
+  also been some commits that change the CellSPU checks, some of which may
+  not have been thoroughly scrutinized w.r.t. the changes they cause in SPU
+  assembly (especially since about the time of r142152).
+
+* Some of the i64 math operations have huge tablegen rules, which sometimes
+  cause tablegen to run out of memory. See e.g. bug 8850. i64 arithmetic
+  should probably be done with libraries.
+
* Create a machine pass for performing dual-pipeline scheduling specifically
for CellSPU, and insert branch prediction instructions as needed.
diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp
index 14021fe..03d5a9a 100644
--- a/lib/Target/CellSPU/SPUAsmPrinter.cpp
+++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp
@@ -301,7 +301,9 @@ bool SPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
if (ExtraCode[1] != 0) return true; // Unknown modifier.
switch (ExtraCode[0]) {
- default: return true; // Unknown modifier.
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
case 'L': // Write second word of DImode reference.
// Verify that this operand has two consecutive registers.
if (!MI->getOperand(OpNo).isReg() ||
diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
index 403d7ef..67a83f1 100644
--- a/lib/Target/CellSPU/SPUHazardRecognizers.cpp
+++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
@@ -30,12 +30,6 @@ using namespace llvm;
// very little right now.
//===----------------------------------------------------------------------===//
-SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) :
- TII(tii),
- EvenOdd(0)
-{
-}
-
/// Return the pipeline hazard type encountered or generated by this
/// instruction. Currently returns NoHazard.
///
diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.h b/lib/Target/CellSPU/SPUHazardRecognizers.h
index 675632c..30acaea 100644
--- a/lib/Target/CellSPU/SPUHazardRecognizers.h
+++ b/lib/Target/CellSPU/SPUHazardRecognizers.h
@@ -24,12 +24,8 @@ class TargetInstrInfo;
/// SPUHazardRecognizer
class SPUHazardRecognizer : public ScheduleHazardRecognizer
{
-private:
- const TargetInstrInfo &TII;
- int EvenOdd;
-
public:
- SPUHazardRecognizer(const TargetInstrInfo &TII);
+ SPUHazardRecognizer(const TargetInstrInfo &/*TII*/) {}
virtual HazardType getHazardType(SUnit *SU, int Stalls);
virtual void EmitInstruction(SUnit *SU);
virtual void AdvanceCycle();
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index 0623741..4e9fcd1 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -77,12 +77,14 @@ namespace {
// Splice the libcall in wherever FindInputOutputChains tells us to.
Type *RetTy =
Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
- std::pair<SDValue, SDValue> CallInfo =
- TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+ TargetLowering::CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned,
+ false, false,
0, TLI.getLibcallCallingConv(LC),
/*isTailCall=*/false,
- /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+ /*doesNotRet=*/false,
+ /*isReturnValueUsed=*/true,
Callee, Args, DAG, Op.getDebugLoc());
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
return CallInfo.first;
}
@@ -100,13 +102,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
// Set up the SPU's register classes:
- addRegisterClass(MVT::i8, SPU::R8CRegisterClass);
- addRegisterClass(MVT::i16, SPU::R16CRegisterClass);
- addRegisterClass(MVT::i32, SPU::R32CRegisterClass);
- addRegisterClass(MVT::i64, SPU::R64CRegisterClass);
- addRegisterClass(MVT::f32, SPU::R32FPRegisterClass);
- addRegisterClass(MVT::f64, SPU::R64FPRegisterClass);
- addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
+ addRegisterClass(MVT::i8, &SPU::R8CRegClass);
+ addRegisterClass(MVT::i16, &SPU::R16CRegClass);
+ addRegisterClass(MVT::i32, &SPU::R32CRegClass);
+ addRegisterClass(MVT::i64, &SPU::R64CRegClass);
+ addRegisterClass(MVT::f32, &SPU::R32FPRegClass);
+ addRegisterClass(MVT::f64, &SPU::R64FPRegClass);
+ addRegisterClass(MVT::i128, &SPU::GPRCRegClass);
// SPU has no sign or zero extended loads for i1, i8, i16:
setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
@@ -397,12 +399,12 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
- addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
- addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
- addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
- addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
- addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
- addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
+ addRegisterClass(MVT::v16i8, &SPU::VECREGRegClass);
+ addRegisterClass(MVT::v8i16, &SPU::VECREGRegClass);
+ addRegisterClass(MVT::v4i32, &SPU::VECREGRegClass);
+ addRegisterClass(MVT::v2i64, &SPU::VECREGRegClass);
+ addRegisterClass(MVT::v4f32, &SPU::VECREGRegClass);
+ addRegisterClass(MVT::v2f64, &SPU::VECREGRegClass);
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -1133,7 +1135,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// FIXME: allow for other calling conventions
CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
@@ -1263,14 +1265,19 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
}
SDValue
-SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+SPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
// CellSPU target does not yet support tail call optimization.
isTailCall = false;
@@ -1280,7 +1287,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// FIXME: allow for other calling conventions
CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
@@ -1441,7 +1448,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// Now handle the return value(s)
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
@@ -1468,7 +1475,7 @@ SPUTargetLowering::LowerReturn(SDValue Chain,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
// If this is the first return lowered for this function, add the regs to the
@@ -3139,16 +3146,16 @@ SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'b': // R1-R31
case 'r': // R0-R31
if (VT == MVT::i64)
- return std::make_pair(0U, SPU::R64CRegisterClass);
- return std::make_pair(0U, SPU::R32CRegisterClass);
+ return std::make_pair(0U, &SPU::R64CRegClass);
+ return std::make_pair(0U, &SPU::R32CRegClass);
case 'f':
if (VT == MVT::f32)
- return std::make_pair(0U, SPU::R32FPRegisterClass);
- else if (VT == MVT::f64)
- return std::make_pair(0U, SPU::R64FPRegisterClass);
+ return std::make_pair(0U, &SPU::R32FPRegClass);
+ if (VT == MVT::f64)
+ return std::make_pair(0U, &SPU::R64FPRegClass);
break;
case 'v':
- return std::make_pair(0U, SPU::GPRCRegisterClass);
+ return std::make_pair(0U, &SPU::GPRCRegClass);
}
}
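SPU's LowerCall moves to the new single-struct interface: the long parameter list is bundled into a TargetLowering::CallLoweringInfo. The unpack pattern used above, in isolation (field names as in the hunk):

    // Inside LowerCall(CLI, InVals):
    SelectionDAG &DAG  = CLI.DAG;        // DAG being built
    SDValue Chain      = CLI.Chain;      // incoming chain
    SDValue Callee     = CLI.Callee;     // call target
    bool &isTailCall   = CLI.IsTailCall; // writable: the target can veto it
    CallingConv::ID CC = CLI.CallConv;
    bool isVarArg      = CLI.IsVarArg;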
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index e3db7b2..9f1599f 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -86,7 +86,6 @@ namespace llvm {
class SPUTargetLowering :
public TargetLowering
{
- int VarArgsFrameIndex; // FrameIndex for start of varargs area.
SPUTargetMachine &SPUTM;
public:
@@ -159,13 +158,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index 759923d..b25a639 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -140,29 +140,27 @@ SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const
-{
+ const TargetRegisterInfo *TRI) const {
unsigned opc;
bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset());
- if (RC == SPU::GPRCRegisterClass) {
- opc = (isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128);
- } else if (RC == SPU::R64CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64);
- } else if (RC == SPU::R64FPRegisterClass) {
- opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64);
- } else if (RC == SPU::R32CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32);
- } else if (RC == SPU::R32FPRegisterClass) {
- opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32);
- } else if (RC == SPU::R16CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16);
- } else if (RC == SPU::R8CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8);
- } else if (RC == SPU::VECREGRegisterClass) {
- opc = (isValidFrameIdx) ? SPU::STQDv16i8 : SPU::STQXv16i8;
- } else {
+ if (RC == &SPU::GPRCRegClass)
+ opc = isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128;
+ else if (RC == &SPU::R64CRegClass)
+ opc = isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64;
+ else if (RC == &SPU::R64FPRegClass)
+ opc = isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64;
+ else if (RC == &SPU::R32CRegClass)
+ opc = isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32;
+ else if (RC == &SPU::R32FPRegClass)
+ opc = isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32;
+ else if (RC == &SPU::R16CRegClass)
+ opc = isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16;
+ else if (RC == &SPU::R8CRegClass)
+ opc = isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8;
+ else if (RC == &SPU::VECREGRegClass)
+ opc = isValidFrameIdx ? SPU::STQDv16i8 : SPU::STQXv16i8;
+ else
llvm_unreachable("Unknown regclass!");
- }
DebugLoc DL;
if (MI != MBB.end()) DL = MI->getDebugLoc();
@@ -175,29 +173,27 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const
-{
+ const TargetRegisterInfo *TRI) const {
unsigned opc;
bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset());
- if (RC == SPU::GPRCRegisterClass) {
- opc = (isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128);
- } else if (RC == SPU::R64CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64);
- } else if (RC == SPU::R64FPRegisterClass) {
- opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64);
- } else if (RC == SPU::R32CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32);
- } else if (RC == SPU::R32FPRegisterClass) {
- opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32);
- } else if (RC == SPU::R16CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16);
- } else if (RC == SPU::R8CRegisterClass) {
- opc = (isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8);
- } else if (RC == SPU::VECREGRegisterClass) {
- opc = (isValidFrameIdx) ? SPU::LQDv16i8 : SPU::LQXv16i8;
- } else {
+ if (RC == &SPU::GPRCRegClass)
+ opc = isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128;
+ else if (RC == &SPU::R64CRegClass)
+ opc = isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64;
+ else if (RC == &SPU::R64FPRegClass)
+ opc = isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64;
+ else if (RC == &SPU::R32CRegClass)
+ opc = isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32;
+ else if (RC == &SPU::R32FPRegClass)
+ opc = isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32;
+ else if (RC == &SPU::R16CRegClass)
+ opc = isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16;
+ else if (RC == &SPU::R8CRegClass)
+ opc = isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8;
+ else if (RC == &SPU::VECREGRegClass)
+ opc = isValidFrameIdx ? SPU::LQDv16i8 : SPU::LQXv16i8;
+ else
llvm_unreachable("Unknown regclass in loadRegFromStackSlot!");
- }
DebugLoc DL;
if (MI != MBB.end()) DL = MI->getDebugLoc();
@@ -340,11 +336,11 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
static MachineBasicBlock::iterator findHBRPosition(MachineBasicBlock &MBB)
{
MachineBasicBlock::iterator J = MBB.end();
- for( int i=0; i<8; i++) {
- if( J == MBB.begin() ) return J;
- J--;
- }
- return J;
+ for( int i=0; i<8; i++) {
+ if( J == MBB.begin() ) return J;
+ J--;
+ }
+ return J;
}
unsigned
@@ -360,7 +356,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineInstrBuilder MIB;
//TODO: make a more accurate algorithm.
bool haveHBR = MBB.size()>8;
-
+
removeHBR(MBB);
MCSymbol *branchLabel = MBB.getParent()->getContext().CreateTempSymbol();
// Add a label just before the branch
@@ -382,7 +378,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
MIB.addSym(branchLabel);
MIB.addMBB(TBB);
- }
+ }
} else {
// Conditional branch
MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
@@ -392,7 +388,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
MIB.addSym(branchLabel);
MIB.addMBB(TBB);
- }
+ }
DEBUG(errs() << "Inserted one-way cond branch: ");
DEBUG((*MIB).dump());
@@ -410,7 +406,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
MIB.addSym(branchLabel);
MIB.addMBB(FBB);
- }
+ }
DEBUG(errs() << "Inserted conditional branch: ");
DEBUG((*MIB).dump());
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index f76ebd7..117acd7 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -3421,14 +3421,14 @@ let isCall = 1,
// Branch relative and set link: Used if we actually know that the target
// is within [-32768, 32767] bytes of the target
def BRSL:
- BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func, variable_ops),
+ BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func),
"brsl\t$$lr, $func",
[(SPUcall (SPUpcrel tglobaladdr:$func, 0))]>;
// Branch absolute and set link: Used if we actually know that the target
// is an absolute address
def BRASL:
- BranchSetLink<0b011001100, (outs), (ins calltarget:$func, variable_ops),
+ BranchSetLink<0b011001100, (outs), (ins calltarget:$func),
"brasl\t$$lr, $func",
[(SPUcall (SPUaform tglobaladdr:$func, 0))]>;
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
index 1b2da5f..e6c872d 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.cpp
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -193,7 +193,8 @@ SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget,
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
const TargetRegisterClass *
-SPURegisterInfo::getPointerRegClass(unsigned Kind) const {
+SPURegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
return &SPU::R32CRegClass;
}
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
index e5ab224..e9f9aba 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -46,7 +46,7 @@ namespace llvm {
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
virtual const TargetRegisterClass *
- getPointerRegClass(unsigned Kind = 0) const;
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
/// After allocating this many registers, the allocator should feel
/// register pressure. The value is a somewhat random guess, based on the
@@ -63,6 +63,11 @@ namespace llvm {
virtual bool requiresRegisterScavenging(const MachineFunction &MF) const
{ return true; }
+ //! Enable tracking of liveness after register allocation, since register
+ // scavenging is enabled.
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const
+ { return true; }
+
//! Return the reserved registers
BitVector getReservedRegs(const MachineFunction &MF) const;
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
index 3b90261..54764f1 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.cpp
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -72,7 +72,7 @@ TargetPassConfig *SPUTargetMachine::createPassConfig(PassManagerBase &PM) {
bool SPUPassConfig::addInstSelector() {
// Install an instruction selector.
- PM->add(createSPUISelDag(getSPUTargetMachine()));
+ addPass(createSPUISelDag(getSPUTargetMachine()));
return false;
}
@@ -85,9 +85,9 @@ bool SPUPassConfig::addPreEmitPass() {
(BuilderFunc)(intptr_t)sys::DynamicLibrary::SearchForAddressOfSymbol(
"createTCESchedulerPass");
if (schedulerCreator != NULL)
- PM->add(schedulerCreator("cellspu"));
+ addPass(schedulerCreator("cellspu"));
//align instructions with nops/lnops for dual issue
- PM->add(createSPUNopFillerPass(getSPUTargetMachine()));
+ addPass(createSPUNopFillerPass(getSPUTargetMachine()));
return true;
}
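TargetPassConfig now exposes addPass, so pass-config hooks stop reaching into the raw PassManagerBase. Before/after sketch:

    // old: PM->add(createSPUISelDag(getSPUTargetMachine()));
    // new: the config's own helper registers the pass
    addPass(createSPUISelDag(getSPUTargetMachine()));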
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 69f0ff8..c8e757b 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -130,6 +130,7 @@ namespace {
private:
void printLinkageType(GlobalValue::LinkageTypes LT);
void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
+ void printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM);
void printCallingConv(CallingConv::ID cc);
void printEscapedString(const std::string& str);
void printCFP(const ConstantFP* CFP);
@@ -325,6 +326,26 @@ void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
}
}
+void CppWriter::printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM) {
+ switch (TLM) {
+ case GlobalVariable::NotThreadLocal:
+ Out << "GlobalVariable::NotThreadLocal";
+ break;
+ case GlobalVariable::GeneralDynamicTLSModel:
+ Out << "GlobalVariable::GeneralDynamicTLSModel";
+ break;
+ case GlobalVariable::LocalDynamicTLSModel:
+ Out << "GlobalVariable::LocalDynamicTLSModel";
+ break;
+ case GlobalVariable::InitialExecTLSModel:
+ Out << "GlobalVariable::InitialExecTLSModel";
+ break;
+ case GlobalVariable::LocalExecTLSModel:
+ Out << "GlobalVariable::LocalExecTLSModel";
+ break;
+ }
+}
+
// printEscapedString - Print each character of the specified string, escaping
// it if it is not printable or if it is an escape char.
void CppWriter::printEscapedString(const std::string &Str) {
@@ -496,7 +517,7 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
Out << "Attrs.push_back(PAWI);";
nl(Out);
}
- Out << name << "_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());";
+ Out << name << "_PAL = AttrListPtr::get(Attrs);";
nl(Out);
out(); nl(Out);
Out << '}'; nl(Out);
@@ -996,7 +1017,9 @@ void CppWriter::printVariableHead(const GlobalVariable *GV) {
}
if (GV->isThreadLocal()) {
printCppName(GV);
- Out << "->setThreadLocal(true);";
+ Out << "->setThreadLocalMode(";
+ printThreadLocalMode(GV->getThreadLocalMode());
+ Out << ");";
nl(Out);
}
if (is_inline) {
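With per-variable TLS models, CppBackend emits a mode instead of a boolean. Illustrative generated output for a general-dynamic TLS global (an assumed example, not from the patch):

    // old emission: gvar->setThreadLocal(true);
    // new emission:
    gvar->setThreadLocalMode(GlobalVariable::GeneralDynamicTLSModel);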
@@ -1105,7 +1128,7 @@ void CppWriter::printInstruction(const Instruction *I,
nl(Out);
for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
- const ConstantInt* CaseVal = i.getCaseValue();
+ const IntegersSubset CaseVal = i.getCaseValueEx();
const BasicBlock *BB = i.getCaseSuccessor();
Out << iName << "->addCase("
<< getOpName(CaseVal) << ", "
@@ -2078,7 +2101,9 @@ char CppWriter::ID = 0;
bool CPPTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
formatted_raw_ostream &o,
CodeGenFileType FileType,
- bool DisableVerify) {
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter) {
if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
PM.add(new CppWriter(o));
return false;
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index 92bca6c..9cbe798 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -31,7 +31,9 @@ struct CPPTargetMachine : public TargetMachine {
virtual bool addPassesToEmitFile(PassManagerBase &PM,
formatted_raw_ostream &Out,
CodeGenFileType FileType,
- bool DisableVerify);
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter);
virtual const TargetData *getTargetData() const { return 0; }
};
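addPassesToEmitFile grows StartAfter/StopAfter analysis IDs so callers can run just a slice of the codegen pipeline; CppBackend accepts and ignores them. A caller keeping the old run-everything behaviour would pass null for both (sketch; TM stands in for some TargetMachine pointer):

    // returns true on error, per the existing TargetMachine convention
    bool Failed = TM->addPassesToEmitFile(PM, Out,
                                          TargetMachine::CGFT_AssemblyFile,
                                          /*DisableVerify=*/true,
                                          /*StartAfter=*/0, /*StopAfter=*/0);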
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index af9e813..1f2d8ac 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -28,8 +28,12 @@ add_llvm_target(HexagonCodeGen
HexagonSubtarget.cpp
HexagonTargetMachine.cpp
HexagonTargetObjectFile.cpp
+ HexagonVLIWPacketizer.cpp
+ HexagonNewValueJump.cpp
)
+add_dependencies(LLVMHexagonCodeGen intrinsics_gen)
+
add_subdirectory(TargetInfo)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index 0808323..45f857b 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -40,6 +40,9 @@ namespace llvm {
FunctionPass *createHexagonHardwareLoops();
FunctionPass *createHexagonPeephole();
FunctionPass *createHexagonFixupHwLoops();
+ FunctionPass *createHexagonPacketizer();
+ FunctionPass *createHexagonNewValueJump();
+
/* TODO: object output.
MCCodeEmitter *createHexagonMCCodeEmitter(const Target &,
@@ -47,7 +50,8 @@ namespace llvm {
MCContext &Ctx);
*/
/* TODO: assembler input.
- TargetAsmBackend *createHexagonAsmBackend(const Target &, const std::string &);
+ TargetAsmBackend *createHexagonAsmBackend(const Target &,
+ const std::string &);
*/
void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI,
HexagonAsmPrinter &AP);
@@ -67,7 +71,7 @@ namespace llvm {
// Normal instruction size (in bytes).
#define HEXAGON_INSTR_SIZE 4
-// Maximum number of words in a packet (in instructions).
+// Maximum number of words and instructions in a packet.
#define HEXAGON_PACKET_SIZE 4
#endif
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 4a50d16..451e562 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -28,6 +28,8 @@ def ArchV3 : SubtargetFeature<"v3", "HexagonArchVersion", "V3",
"Hexagon v3">;
def ArchV4 : SubtargetFeature<"v4", "HexagonArchVersion", "V4",
"Hexagon v4">;
+def ArchV5 : SubtargetFeature<"v5", "HexagonArchVersion", "V5",
+ "Hexagon v5">;
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
@@ -45,13 +47,15 @@ def HexagonInstrInfo : InstrInfo;
// Hexagon processors supported.
//===----------------------------------------------------------------------===//
-class Proc<string Name, ProcessorItineraries Itin,
+class Proc<string Name, SchedMachineModel Model,
list<SubtargetFeature> Features>
- : Processor<Name, Itin, Features>;
+ : ProcessorModel<Name, Model, Features>;
+
+def : Proc<"hexagonv2", HexagonModel, [ArchV2]>;
+def : Proc<"hexagonv3", HexagonModel, [ArchV2, ArchV3]>;
+def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>;
+def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>;
-def : Proc<"hexagonv2", HexagonItineraries, [ArchV2]>;
-def : Proc<"hexagonv3", HexagonItineraries, [ArchV2, ArchV3]>;
-def : Proc<"hexagonv4", HexagonItinerariesV4, [ArchV2, ArchV3, ArchV4]>;
// Hexagon Uses the MC printer for assembler output, so make sure the TableGen
// AsmWriter bits get associated with the correct class.
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 39bf45d..5fa4740 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -13,11 +13,11 @@
//
//===----------------------------------------------------------------------===//
-
#define DEBUG_TYPE "asm-printer"
#include "Hexagon.h"
#include "HexagonAsmPrinter.h"
#include "HexagonMachineFunctionInfo.h"
+#include "HexagonMCInst.h"
#include "HexagonTargetMachine.h"
#include "HexagonSubtarget.h"
#include "InstPrinter/HexagonInstPrinter.h"
@@ -77,8 +77,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
- default:
- assert(0 && "<unknown operand type>");
+ default: llvm_unreachable ("<unknown operand type>");
case MachineOperand::MO_Register:
O << HexagonInstPrinter::getRegisterName(MO.getReg());
return;
@@ -134,7 +133,9 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
if (ExtraCode[1] != 0) return true; // Unknown modifier.
switch (ExtraCode[0]) {
- default: return true; // Unknown modifier.
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
case 'c': // Don't print "$" before a global var name or constant.
// Hexagon never has a prefix.
printOperand(MI, OpNo, OS);
@@ -196,10 +197,45 @@ void HexagonAsmPrinter::printPredicateOperand(const MachineInstr *MI,
/// the current output stream.
///
void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- MCInst MCI;
-
- HexagonLowerToMC(MI, MCI, *this);
- OutStreamer.EmitInstruction(MCI);
+ if (MI->isBundle()) {
+ std::vector<const MachineInstr*> BundleMIs;
+
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_instr_iterator MII = MI;
+ ++MII;
+ unsigned int IgnoreCount = 0;
+ while (MII != MBB->end() && MII->isInsideBundle()) {
+ const MachineInstr *MInst = MII;
+ if (MInst->getOpcode() == TargetOpcode::DBG_VALUE ||
+ MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
+ IgnoreCount++;
+ ++MII;
+ continue;
+ }
+ //BundleMIs.push_back(&*MII);
+ BundleMIs.push_back(MInst);
+ ++MII;
+ }
+ unsigned Size = BundleMIs.size();
+ assert((Size+IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!");
+ for (unsigned Index = 0; Index < Size; Index++) {
+ HexagonMCInst MCI;
+ MCI.setStartPacket(Index == 0);
+ MCI.setEndPacket(Index == (Size-1));
+
+ HexagonLowerToMC(BundleMIs[Index], MCI, *this);
+ OutStreamer.EmitInstruction(MCI);
+ }
+ }
+ else {
+ HexagonMCInst MCI;
+ if (MI->getOpcode() == Hexagon::ENDLOOP0) {
+ MCI.setStartPacket(true);
+ MCI.setEndPacket(true);
+ }
+ HexagonLowerToMC(MI, MCI, *this);
+ OutStreamer.EmitInstruction(MCI);
+ }
return;
}
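Bundled MIs are now walked one by one and bracketed so the instruction printer can emit Hexagon's packet braces. The marking logic in isolation:

    HexagonMCInst MCI;
    MCI.setStartPacket(Index == 0);        // first real instruction opens '{'
    MCI.setEndPacket(Index == Size - 1);   // last real instruction closes '}'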
@@ -241,15 +277,15 @@ void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo,
void HexagonAsmPrinter::printJumpTable(const MachineInstr *MI, int OpNo,
raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
- assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) &&
- "Expecting jump table index");
+ assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) &&
+ "Expecting jump table index");
// Hexagon_TODO: Do we need name mangling?
O << *GetJTISymbol(MO.getIndex());
}
void HexagonAsmPrinter::printConstantPool(const MachineInstr *MI, int OpNo,
- raw_ostream &O) {
+ raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
assert( (MO.getType() == MachineOperand::MO_ConstantPoolIndex) &&
"Expecting constant pool index");
diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td
index bd9608b..e61b2a7 100644
--- a/lib/Target/Hexagon/HexagonCallingConv.td
+++ b/lib/Target/Hexagon/HexagonCallingConv.td
@@ -17,8 +17,8 @@
// Hexagon 32-bit C return-value convention.
def RetCC_Hexagon32 : CallingConv<[
- CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
- CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>,
+ CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+ CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
// Alternatively, they are assigned to the stack in 4-byte aligned units.
CCAssignToStack<4, 4>
@@ -27,8 +27,8 @@ def RetCC_Hexagon32 : CallingConv<[
// Hexagon 32-bit C Calling convention.
def CC_Hexagon32 : CallingConv<[
// All arguments get passed in integer registers if there is space.
- CCIfType<[i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
- CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>,
+ CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+ CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
// Alternatively, they are assigned to the stack in 4-byte aligned units.
CCAssignToStack<4, 4>
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
index 46c20e9..ba8e679 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
@@ -56,11 +56,8 @@ void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT,
/// MarkAllocated - Mark a register and all of its aliases as allocated.
void Hexagon_CCState::MarkAllocated(unsigned Reg) {
- UsedRegs[Reg/32] |= 1 << (Reg&31);
-
- if (const uint16_t *RegAliases = TRI.getAliasSet(Reg))
- for (; (Reg = *RegAliases); ++RegAliases)
- UsedRegs[Reg/32] |= 1 << (Reg&31);
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ UsedRegs[*AI/32] |= 1 << (*AI&31);
}
/// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
index 2100474..ae2ca37 100644
--- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
+++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
// The Hexagon processor has no instructions that load or store predicate
-// registers directly. So, when these registers must be spilled a general
-// purpose register must be found and the value copied to/from it from/to
-// the predicate register. This code currently does not use the register
+// registers directly. So, when these registers must be spilled a general
+// purpose register must be found and the value copied to/from it from/to
+// the predicate register. This code currently does not use the register
// scavenger mechanism available in the allocator. There are two registers
// reserved to allow spilling/restoring predicate registers. One is used to
// hold the predicate value. The other is used when stack frame offsets are
@@ -84,7 +84,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
int SrcReg = MI->getOperand(2).getReg();
assert(Hexagon::PredRegsRegClass.contains(SrcReg) &&
"Not a predicate register");
- if (!TII->isValidOffset(Hexagon::STriw, Offset)) {
+ if (!TII->isValidOffset(Hexagon::STriw_indexed, Offset)) {
if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) {
BuildMI(*MBB, MII, MI->getDebugLoc(),
TII->get(Hexagon::CONST32_Int_Real),
@@ -95,7 +95,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
HEXAGON_RESERVED_REG_2).addReg(SrcReg);
BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::STriw))
+ TII->get(Hexagon::STriw_indexed))
.addReg(HEXAGON_RESERVED_REG_1)
.addImm(0).addReg(HEXAGON_RESERVED_REG_2);
} else {
@@ -103,7 +103,8 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
HEXAGON_RESERVED_REG_2).addReg(SrcReg);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw))
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::STriw_indexed))
.addReg(HEXAGON_RESERVED_REG_1)
.addImm(0)
.addReg(HEXAGON_RESERVED_REG_2);
@@ -111,7 +112,8 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
} else {
BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
HEXAGON_RESERVED_REG_2).addReg(SrcReg);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw)).
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::STriw_indexed)).
addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2);
}
MII = MBB->erase(MI);
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index e8a6924..cd682df 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -209,6 +209,16 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
FuncInfo->hasClobberLR() );
}
+static inline
+unsigned uniqueSuperReg(unsigned Reg, const TargetRegisterInfo *TRI) {
+ MCSuperRegIterator SRI(Reg, TRI);
+ assert(SRI.isValid() && "Expected a superreg");
+ unsigned SuperReg = *SRI;
+ ++SRI;
+ assert(!SRI.isValid() && "Expected exactly one superreg");
+ return SuperReg;
+}
+
bool
HexagonFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB,
@@ -235,26 +245,21 @@ HexagonFrameLowering::spillCalleeSavedRegisters(
//
// Check if we can use a double-word store.
//
- const uint16_t* SuperReg = TRI->getSuperRegisters(Reg);
-
- // Assume that there is exactly one superreg.
- assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg");
+ unsigned SuperReg = uniqueSuperReg(Reg, TRI);
bool CanUseDblStore = false;
const TargetRegisterClass* SuperRegClass = 0;
if (ContiguousRegs && (i < CSI.size()-1)) {
- const uint16_t* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg());
- assert(SuperRegNext[0] && !SuperRegNext[1] &&
- "Expected exactly one superreg");
- SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]);
- CanUseDblStore = (SuperRegNext[0] == SuperReg[0]);
+ unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
+ SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg);
+ CanUseDblStore = (SuperRegNext == SuperReg);
}
if (CanUseDblStore) {
- TII.storeRegToStackSlot(MBB, MI, SuperReg[0], true,
+ TII.storeRegToStackSlot(MBB, MI, SuperReg, true,
CSI[i+1].getFrameIdx(), SuperRegClass, TRI);
- MBB.addLiveIn(SuperReg[0]);
+ MBB.addLiveIn(SuperReg);
++i;
} else {
// Cannot use a double-word store.
@@ -295,25 +300,20 @@ bool HexagonFrameLowering::restoreCalleeSavedRegisters(
//
// Check if we can use a double-word load.
//
- const uint16_t* SuperReg = TRI->getSuperRegisters(Reg);
+ unsigned SuperReg = uniqueSuperReg(Reg, TRI);
const TargetRegisterClass* SuperRegClass = 0;
-
- // Assume that there is exactly one superreg.
- assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg");
bool CanUseDblLoad = false;
if (ContiguousRegs && (i < CSI.size()-1)) {
- const uint16_t* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg());
- assert(SuperRegNext[0] && !SuperRegNext[1] &&
- "Expected exactly one superreg");
- SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]);
- CanUseDblLoad = (SuperRegNext[0] == SuperReg[0]);
+ unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
+ SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg);
+ CanUseDblLoad = (SuperRegNext == SuperReg);
}
if (CanUseDblLoad) {
- TII.loadRegFromStackSlot(MBB, MI, SuperReg[0], CSI[i+1].getFrameIdx(),
+ TII.loadRegFromStackSlot(MBB, MI, SuperReg, CSI[i+1].getFrameIdx(),
SuperRegClass, TRI);
- MBB.addLiveIn(SuperReg[0]);
+ MBB.addLiveIn(SuperReg);
++i;
} else {
// Cannot use a double-word load.
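uniqueSuperReg replaces the open-coded superreg-list walks in both the spill and restore paths above. Usage sketch (assumes, as its asserts enforce, exactly one superreg per callee-saved register):

    unsigned SuperReg     = uniqueSuperReg(Reg, TRI);
    unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
    // a double-word spill/load is possible only when both halves share it:
    bool CanUseDblStore   = (SuperRegNext == SuperReg);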
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 57772a5..d756aec 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -328,7 +328,10 @@ CountValue *HexagonHardwareLoops::getTripCount(MachineLoop *L) const {
// can get a useful trip count. The trip count can
// be either a register or an immediate. The location
// of the value depends upon the type (reg or imm).
- while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) {
+ for (MachineRegisterInfo::reg_iterator
+ RI = MRI->reg_begin(IV_Opnd->getReg()), RE = MRI->reg_end();
+ RI != RE; ++RI) {
+ IV_Opnd = &RI.getOperand();
const MachineInstr *MI = IV_Opnd->getParent();
if (L->contains(MI) && isCompareEqualsImm(MI)) {
const MachineOperand &MO = MI->getOperand(2);
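getNextOperandForReg() is gone; the trip-count search now visits every operand referencing the induction variable's register through MachineRegisterInfo. The bare idiom (sketch; IVReg stands in for IV_Opnd->getReg(), and the loop body above adds the compare-with-immediate checks):

    for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(IVReg),
                                           RE = MRI->reg_end();
         RI != RE; ++RI) {
      MachineOperand &IVUse = RI.getOperand();
      const MachineInstr *UserMI = IVUse.getParent();
      // filter for compares against an immediate inside the loop ...
    }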
@@ -491,7 +494,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
TII->get(Hexagon::NEG), CountReg).addReg(CountReg1);
}
- // Add the Loop instruction to the begining of the loop.
+ // Add the Loop instruction to the beginning of the loop.
BuildMI(*Preheader, InsertPos, InsertPos->getDebugLoc(),
TII->get(Hexagon::LOOP0_r)).addMBB(LoopStart).addReg(CountReg);
} else {
@@ -623,7 +626,7 @@ void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF,
const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
MachineBasicBlock *MBB = MII->getParent();
DebugLoc DL = MII->getDebugLoc();
- unsigned Scratch = RS.scavengeRegister(Hexagon::IntRegsRegisterClass, MII, 0);
+ unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0);
// First, set the LC0 with the trip count.
if (MII->getOperand(1).isReg()) {
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 9df965e..5499134 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -90,7 +90,9 @@ public:
SDNode *SelectMul(SDNode *N);
SDNode *SelectZeroExtend(SDNode *N);
SDNode *SelectIntrinsicWOChain(SDNode *N);
+ SDNode *SelectIntrinsicWChain(SDNode *N);
SDNode *SelectConstant(SDNode *N);
+ SDNode *SelectConstantFP(SDNode *N);
SDNode *SelectAdd(SDNode *N);
// Include the pieces autogenerated from the target description.
@@ -318,7 +320,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
else if (LoadedVT == MVT::i32) Opcode = Hexagon::LDriw_indexed;
else if (LoadedVT == MVT::i16) Opcode = Hexagon::LDrih_indexed;
else if (LoadedVT == MVT::i8) Opcode = Hexagon::LDrib_indexed;
- else assert (0 && "unknown memory type");
+ else llvm_unreachable("unknown memory type");
// Build indexed load.
SDValue TargetConstOff = CurDAG->getTargetConstant(Offset, PointerTy);
@@ -375,7 +377,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
};
ReplaceUses(Froms, Tos, 3);
return Result_2;
- }
+ }
SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
@@ -516,7 +518,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
else
Opcode = zextval ? Hexagon::LDriub : Hexagon::LDrib;
} else
- assert (0 && "unknown memory type");
+ llvm_unreachable("unknown memory type");
// For zero ext i64 loads, we need to add combine instructions.
if (LD->getValueType(0) == MVT::i64 &&
@@ -613,7 +615,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
else if (StoredVT == MVT::i32) Opcode = Hexagon::POST_STwri;
else if (StoredVT == MVT::i16) Opcode = Hexagon::POST_SThri;
else if (StoredVT == MVT::i8) Opcode = Hexagon::POST_STbri;
- else assert (0 && "unknown memory type");
+ else llvm_unreachable("unknown memory type");
// Build post increment store.
SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
@@ -636,10 +638,10 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
// Figure out the opcode.
if (StoredVT == MVT::i64) Opcode = Hexagon::STrid;
- else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw;
+ else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih;
else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib;
- else assert (0 && "unknown memory type");
+ else llvm_unreachable("unknown memory type");
// Build regular store.
SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
@@ -693,7 +695,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih_indexed;
else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib_indexed;
- else assert (0 && "unknown memory type");
+ else llvm_unreachable("unknown memory type");
SDValue Ops[] = {SDValue(NewBase,0),
CurDAG->getTargetConstant(Offset,PointerTy),
@@ -723,7 +725,7 @@ SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
if (AM != ISD::UNINDEXED) {
return SelectIndexedStore(ST, dl);
}
-
+
return SelectBaseOffsetStore(ST, dl);
}
@@ -752,7 +754,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
SDValue Sext0 = MulOp0.getOperand(0);
if (Sext0.getNode()->getValueType(0) != MVT::i32) {
- SelectCode(N);
+ return SelectCode(N);
}
OP0 = Sext0;
@@ -761,7 +763,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
if (LD->getMemoryVT() != MVT::i32 ||
LD->getExtensionType() != ISD::SEXTLOAD ||
LD->getAddressingMode() != ISD::UNINDEXED) {
- SelectCode(N);
+ return SelectCode(N);
}
SDValue Chain = LD->getChain();
@@ -1128,12 +1130,12 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
// For immediates, lower it.
for (unsigned i = 1; i < N->getNumOperands(); ++i) {
SDNode *Arg = N->getOperand(i).getNode();
- const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI);
+ const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI, *MF);
- if (RC == Hexagon::IntRegsRegisterClass ||
- RC == Hexagon::DoubleRegsRegisterClass) {
+ if (RC == &Hexagon::IntRegsRegClass ||
+ RC == &Hexagon::DoubleRegsRegClass) {
Ops.push_back(SDValue(Arg, 0));
- } else if (RC == Hexagon::PredRegsRegisterClass) {
+ } else if (RC == &Hexagon::PredRegsRegClass) {
// Do the transfer.
SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
SDValue(Arg, 0));
@@ -1158,6 +1160,25 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
return SelectCode(N);
}
+//
+// Map floating point constant values.
+//
+SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+ APFloat APF = CN->getValueAPF();
+ if (N->getValueType(0) == MVT::f32) {
+ return CurDAG->getMachineNode(Hexagon::TFRI_f, dl, MVT::f32,
+ CurDAG->getTargetConstantFP(APF.convertToFloat(), MVT::f32));
+ }
+ else if (N->getValueType(0) == MVT::f64) {
+ return CurDAG->getMachineNode(Hexagon::CONST64_Float_Real, dl, MVT::f64,
+ CurDAG->getTargetConstantFP(APF.convertToDouble(), MVT::f64));
+ }
+
+ return SelectCode(N);
+}
+
//
// Map predicate true (encoded as -1 in LLVM) to an XOR.
@@ -1215,7 +1236,7 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
// Build Rd = Rd' + asr(Rs, Rt). The machine constraints will ensure that
// Rd and Rd' are assigned to the same register
- SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_rr_acc, dl, MVT::i32,
+ SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_ADD_rr, dl, MVT::i32,
N->getOperand(1),
Src1->getOperand(0),
Src1->getOperand(1));
@@ -1234,6 +1255,9 @@ SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant:
return SelectConstant(N);
+ case ISD::ConstantFP:
+ return SelectConstantFP(N);
+
case ISD::ADD:
return SelectAdd(N);
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 8c4350d..703a128 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -103,12 +103,12 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
return false;
}
- if (LocVT == MVT::i32) {
+ if (LocVT == MVT::i32 || LocVT == MVT::f32) {
ofst = State.AllocateStack(4, 4);
State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
return false;
}
- if (LocVT == MVT::i64) {
+ if (LocVT == MVT::i64 || LocVT == MVT::f64) {
ofst = State.AllocateStack(8, 8);
State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
return false;
@@ -142,12 +142,12 @@ CC_Hexagon (unsigned ValNo, MVT ValVT,
LocInfo = CCValAssign::AExt;
}
- if (LocVT == MVT::i32) {
+ if (LocVT == MVT::i32 || LocVT == MVT::f32) {
if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
return false;
}
- if (LocVT == MVT::i64) {
+ if (LocVT == MVT::i64 || LocVT == MVT::f64) {
if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
return false;
}
@@ -217,12 +217,12 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
LocInfo = CCValAssign::AExt;
}
- if (LocVT == MVT::i32) {
+ if (LocVT == MVT::i32 || LocVT == MVT::f32) {
if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
return false;
}
- if (LocVT == MVT::i64) {
+ if (LocVT == MVT::i64 || LocVT == MVT::f64) {
if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
return false;
}
@@ -234,7 +234,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i32) {
+ if (LocVT == MVT::i32 || LocVT == MVT::f32) {
if (unsigned Reg = State.AllocateReg(Hexagon::R0)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
@@ -249,7 +249,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
- if (LocVT == MVT::i64) {
+ if (LocVT == MVT::i64 || LocVT == MVT::f64) {
if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
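With these changes a float travels through the calling convention exactly like an i32, and a double like an i64. Per the RetCC_Hexagon32/RetCC_Hexagon64 hunks above, FP return values land in the same registers as their integer counterparts; the declarations below are purely illustrative:

// Illustrative only: where FP return values now come back, per the code above.
extern "C" float  ret_f32(void);   // result in R0, the i32 return register
extern "C" double ret_f64(void);   // result in D0, the i64 return pair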
@@ -299,7 +299,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
// Analyze return values of ISD::RET
CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
@@ -351,7 +351,7 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
@@ -370,21 +370,25 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
/// LowerCall - Function arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
SDValue
-HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// Check for varargs.
NumNamedVarArgParams = -1;
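The signature change above tracks a target-independent refactor: the long positional parameter list of LowerCall is folded into a single TargetLowering::CallLoweringInfo argument, unpacked at the top of the function. A hedged standalone miniature of the shape (the real struct has the DAG/DL/Outs/OutVals/Ins/Chain/Callee fields unpacked above; this keeps two for illustration):

// Miniature of the bundled-argument style; names mirror the unpacking above.
struct CallInfoLike {
  bool IsTailCall;
  bool IsVarArg;
};
static bool isVarArgTailCall(const CallInfoLike &CLI) {
  return CLI.IsTailCall && CLI.IsVarArg;   // illustrative field access only
}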
@@ -504,7 +508,7 @@ HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// Build a sequence of copy-to-reg nodes chained together with token
// chain and flag operands which copy the outgoing args into registers.
- // The InFlag in necessary since all emited instructions must be
+ // The InFlag is necessary since all emitted instructions must be
// stuck together.
SDValue InFlag;
if (!isTailCall) {
@@ -524,7 +528,7 @@ HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
//
- // Do not flag preceeding copytoreg stuff together with the following stuff.
+ // Do not flag preceding copytoreg stuff together with the following stuff.
InFlag = SDValue();
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -813,7 +817,7 @@ const {
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
@@ -839,14 +843,15 @@ const {
// 1. int, long long, ptr args that get allocated in register.
// 2. Large struct that gets a register to put its address in.
EVT RegVT = VA.getLocVT();
- if (RegVT == MVT::i8 || RegVT == MVT::i16 || RegVT == MVT::i32) {
+ if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
+ RegVT == MVT::i32 || RegVT == MVT::f32) {
unsigned VReg =
- RegInfo.createVirtualRegister(Hexagon::IntRegsRegisterClass);
+ RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
} else if (RegVT == MVT::i64) {
unsigned VReg =
- RegInfo.createVirtualRegister(Hexagon::DoubleRegsRegisterClass);
+ RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
} else {
@@ -918,14 +923,33 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue CC = Op.getOperand(4);
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+ DebugLoc dl = Op.getDebugLoc();
SDNode* OpNode = Op.getNode();
+ EVT SVT = OpNode->getValueType(0);
- SDValue Cond = DAG.getNode(ISD::SETCC, Op.getDebugLoc(), MVT::i1,
- Op.getOperand(2), Op.getOperand(3),
- Op.getOperand(4));
- return DAG.getNode(ISD::SELECT, Op.getDebugLoc(), OpNode->getValueType(0),
- Cond, Op.getOperand(0),
- Op.getOperand(1));
+ SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i1, LHS, RHS, CC);
+ return DAG.getNode(ISD::SELECT, dl, SVT, Cond, TrueVal, FalseVal);
+}
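The rewrite above is the identity select_cc(l, r, t, f, cc) == select(setcc(l, r, cc), t, f), and it also fixes the operand order: in SELECT_CC the first two operands are the values being compared and the next two are the results, which the old code had backwards. A scalar analogue, with cc fixed to setlt for illustration:

// Scalar analogue of the SELECT_CC -> SETCC + SELECT split performed above.
static int select_cc_lt(int l, int r, int t, int f) {
  bool cond = (l < r);   // the SETCC node (MVT::i1 in the DAG)
  return cond ? t : f;   // the SELECT node consuming it
}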
+
+SDValue
+HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+ EVT ValTy = Op.getValueType();
+
+ DebugLoc dl = Op.getDebugLoc();
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ SDValue Res;
+ if (CP->isMachineConstantPoolEntry())
+ Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy,
+ CP->getAlignment());
+ else
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy,
+ CP->getAlignment());
+ return DAG.getNode(HexagonISD::CONST32, dl, ValTy, Res);
}
SDValue
@@ -1010,11 +1034,18 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
: TargetLowering(targetmachine, new HexagonTargetObjectFile()),
TM(targetmachine) {
+ const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
+
// Set up the register classes.
- addRegisterClass(MVT::i32, Hexagon::IntRegsRegisterClass);
- addRegisterClass(MVT::i64, Hexagon::DoubleRegsRegisterClass);
+ addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
- addRegisterClass(MVT::i1, Hexagon::PredRegsRegisterClass);
+ if (QRI->Subtarget.hasV5TOps()) {
+ addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
+ }
+
+ addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);
computeRegisterProperties();
@@ -1028,32 +1059,16 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
//
// Library calls for unsupported operations
//
- setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
- setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf");
setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf");
- setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
- setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
- setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
- setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
- setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
- setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti");
-
- setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
- setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti");
- setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
- setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
- setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
- setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
-
setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3");
setOperationAction(ISD::SDIV, MVT::i32, Expand);
setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3");
@@ -1082,92 +1097,184 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
setOperationAction(ISD::FDIV, MVT::f64, Expand);
- setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+
+ if (QRI->Subtarget.hasV5TOps()) {
+ // Hexagon V5 Support.
+ setOperationAction(ISD::FADD, MVT::f32, Legal);
+ setOperationAction(ISD::FADD, MVT::f64, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal);
+
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETUGE, MVT::f64, Legal);
+
+ setCondCodeAction(ISD::SETOGT, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOGT, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETUGT, MVT::f64, Legal);
+
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOLE, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETOLT, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOLT, MVT::f64, Legal);
+
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
+
+ setOperationAction(ISD::FABS, MVT::f32, Legal);
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
+
+ setOperationAction(ISD::FNEG, MVT::f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::f64, Expand);
+ } else {
+
+ // Expand fp<->uint.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
- setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
- setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
- setOperationAction(ISD::FADD, MVT::f64, Expand);
+ setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
- setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
- setOperationAction(ISD::FADD, MVT::f32, Expand);
+ setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
+ setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
- setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
- setOperationAction(ISD::FADD, MVT::f32, Expand);
+ setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
- setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
- setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
+ setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
- setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
- setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
- setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
- setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
+ setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
+ setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
- setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
- setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
- setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+ setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
+ setOperationAction(ISD::FADD, MVT::f64, Expand);
- setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
- setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+ setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
+ setOperationAction(ISD::FADD, MVT::f32, Expand);
- setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
- setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+ setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand);
- setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
- setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+ setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
- setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
- setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+ setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
- setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
- setCondCodeAction(ISD::SETOLT, MVT::f64, Expand);
+ setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
- setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
- setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+ setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
- setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
- setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
+ setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+
+ setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
+ setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+
+ setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
+ setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
- setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
- setOperationAction(ISD::FMUL, MVT::f64, Expand);
+ setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
+ setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
- setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
- setOperationAction(ISD::MUL, MVT::f32, Expand);
+ setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
+ setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
- setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
- setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
- setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
+ setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
+ setCondCodeAction(ISD::SETOLT, MVT::f64, Expand);
+ setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
+ setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
- setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
- setOperationAction(ISD::SUB, MVT::f64, Expand);
+ setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
+ setOperationAction(ISD::FMUL, MVT::f64, Expand);
- setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
- setOperationAction(ISD::SUB, MVT::f32, Expand);
+ setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
+ setOperationAction(ISD::MUL, MVT::f32, Expand);
- setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2");
- setOperationAction(ISD::FP_ROUND, MVT::f64, Expand);
+ setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
- setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2");
- setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
+ setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
- setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2");
- setCondCodeAction(ISD::SETO, MVT::f64, Expand);
+ setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
+ setOperationAction(ISD::SUB, MVT::f64, Expand);
- setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
- setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
+ setOperationAction(ISD::SUB, MVT::f32, Expand);
- setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2");
- setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+ setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2");
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Expand);
- setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2");
- setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+ setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2");
+ setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
+
+ setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2");
+ setCondCodeAction(ISD::SETO, MVT::f64, Expand);
+
+ setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2");
+ setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+
+ setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2");
+ setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+
+ setOperationAction(ISD::FABS, MVT::f32, Expand);
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::f32, Expand);
+ setOperationAction(ISD::FNEG, MVT::f64, Expand);
+ }
+
+ setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
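Throughout this constructor the pattern is the same: setOperationAction(..., Expand) makes the legalizer fall back to a runtime call, and setLibcallName chooses the symbol. On a pre-V5 core, for instance, a single-precision add becomes a call to __hexagon_addsf3, per the ADD_F32 lines above; a hedged illustration (the wrapper is ours):

// Illustrative: the soft-float routine an expanded f32 FADD ends up
// calling on pre-V5 cores. The symbol comes from the setLibcallName
// line above; the wrapper below is not part of the patch.
extern "C" float __hexagon_addsf3(float a, float b);

static float add_f32(float a, float b) {
  return __hexagon_addsf3(a, b);   // what "a + b" on floats lowers to
}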
@@ -1208,20 +1315,33 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
- // Expand fp<->uint.
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
-
- // Hexagon has no select or setcc: expand to SELECT_CC.
- setOperationAction(ISD::SELECT, MVT::f32, Expand);
- setOperationAction(ISD::SELECT, MVT::f64, Expand);
-
// Lower SELECT_CC to SETCC and SELECT.
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
- // This is a workaround documented in DAGCombiner.cpp:2892 We don't
- // support SELECT_CC on every type.
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+ if (QRI->Subtarget.hasV5TOps()) {
+
+ // We need to make the operation type of the SELECT node Custom so
+ // that we don't go into an infinite
+ // select -> setcc -> select_cc -> select loop.
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+ } else {
+
+ // Hexagon has no select or setcc: expand to SELECT_CC.
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+
+ // This is a workaround documented in DAGCombiner.cpp:2892. We don't
+ // support SELECT_CC on every type.
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+ }
setOperationAction(ISD::BR_CC, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
@@ -1307,22 +1427,22 @@ const char*
HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
default: return 0;
- case HexagonISD::CONST32: return "HexagonISD::CONST32";
+ case HexagonISD::CONST32: return "HexagonISD::CONST32";
case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC";
- case HexagonISD::CMPICC: return "HexagonISD::CMPICC";
- case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC";
- case HexagonISD::BRICC: return "HexagonISD::BRICC";
- case HexagonISD::BRFCC: return "HexagonISD::BRFCC";
- case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC";
- case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC";
- case HexagonISD::Hi: return "HexagonISD::Hi";
- case HexagonISD::Lo: return "HexagonISD::Lo";
- case HexagonISD::FTOI: return "HexagonISD::FTOI";
- case HexagonISD::ITOF: return "HexagonISD::ITOF";
- case HexagonISD::CALL: return "HexagonISD::CALL";
- case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
- case HexagonISD::BR_JT: return "HexagonISD::BR_JT";
- case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN";
+ case HexagonISD::CMPICC: return "HexagonISD::CMPICC";
+ case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC";
+ case HexagonISD::BRICC: return "HexagonISD::BRICC";
+ case HexagonISD::BRFCC: return "HexagonISD::BRFCC";
+ case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC";
+ case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC";
+ case HexagonISD::Hi: return "HexagonISD::Hi";
+ case HexagonISD::Lo: return "HexagonISD::Lo";
+ case HexagonISD::FTOI: return "HexagonISD::FTOI";
+ case HexagonISD::ITOF: return "HexagonISD::ITOF";
+ case HexagonISD::CALL: return "HexagonISD::CALL";
+ case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
+ case HexagonISD::BR_JT: return "HexagonISD::BR_JT";
+ case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN";
}
}
@@ -1347,9 +1467,10 @@ SDValue
HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
// Frame & Return address. Currently unimplemented.
- case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
- case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GlobalTLSAddress:
llvm_unreachable("TLS not implemented for Hexagon.");
case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG);
@@ -1359,9 +1480,10 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
- case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::SELECT: return Op;
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
- case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
+ case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
}
}
@@ -1404,9 +1526,11 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const
case MVT::i32:
case MVT::i16:
case MVT::i8:
- return std::make_pair(0U, Hexagon::IntRegsRegisterClass);
+ case MVT::f32:
+ return std::make_pair(0U, &Hexagon::IntRegsRegClass);
case MVT::i64:
- return std::make_pair(0U, Hexagon::DoubleRegsRegisterClass);
+ case MVT::f64:
+ return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
}
default:
llvm_unreachable("Unknown asm register class");
@@ -1416,6 +1540,14 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
+ return QRI->Subtarget.hasV5TOps();
+}
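The hook is deliberately value-blind: on V5 every FP immediate is selectable (into TFRI_f or CONST64_Float_Real, per the isel changes earlier in this patch), while pre-V5 every one is materialized from the constant pool. A runnable caricature of the decision it feeds:

#include <cstdio>

// Hedged caricature; the strings stand in for the machine nodes named
// earlier in this patch.
static const char *materializeFP(bool hasV5TOps, bool is64Bit) {
  if (!hasV5TOps) return "load from constant pool";
  return is64Bit ? "CONST64_Float_Real #imm" : "TFRI_f #imm";
}

int main() { std::puts(materializeFP(true, false)); return 0; }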
+
/// isLegalAddressingMode - Return true if the addressing mode represented by
/// AM is legal for this target, for a load/store of the specified type.
bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM,
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 4208bcb..fe6c905 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -27,6 +27,7 @@ namespace llvm {
CONST32,
CONST32_GP, // For marking data present in GP.
+ FCONST32,
SETCC,
ADJDYNALLOC,
ARGEXTEND,
@@ -48,6 +49,7 @@ namespace llvm {
BR_JT, // Jump table.
BARRIER, // Memory barrier.
WrapperJT,
+ WrapperCP,
TC_RETURN
};
}
@@ -94,13 +96,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
@@ -128,6 +124,7 @@ namespace llvm {
MachineBasicBlock *BB) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
virtual EVT getSetCCResultType(EVT VT) const {
return MVT::i1;
}
@@ -150,6 +147,7 @@ namespace llvm {
/// mode is legal for a load/store of any legal type.
/// TODO: Handle pre/postinc as well.
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+ virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
diff --git a/lib/Target/Hexagon/HexagonImmediates.td b/lib/Target/Hexagon/HexagonImmediates.td
index e78bb79..18692c4 100644
--- a/lib/Target/Hexagon/HexagonImmediates.td
+++ b/lib/Target/Hexagon/HexagonImmediates.td
@@ -371,7 +371,7 @@ def s4_3ImmPred : PatLeaf<(i32 imm), [{
def u64ImmPred : PatLeaf<(i64 imm), [{
// immS16 predicate - True if the immediate fits in a 16-bit sign extended
// field.
- // Adding "N ||" to supress gcc unused warning.
+ // Adding "N ||" to suppress gcc unused warning.
return (N || true);
}]>;
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index c9f16fb..e472d49 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -13,29 +13,48 @@
// *** Must match HexagonBaseInfo.h ***
//===----------------------------------------------------------------------===//
+class Type<bits<5> t> {
+ bits<5> Value = t;
+}
+def TypePSEUDO : Type<0>;
+def TypeALU32 : Type<1>;
+def TypeCR : Type<2>;
+def TypeJR : Type<3>;
+def TypeJ : Type<4>;
+def TypeLD : Type<5>;
+def TypeST : Type<6>;
+def TypeSYSTEM : Type<7>;
+def TypeXTYPE : Type<8>;
+def TypeMARKER : Type<31>;
//===----------------------------------------------------------------------===//
// Instruction Class Declaration +
//===----------------------------------------------------------------------===//
class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
- string cstr, InstrItinClass itin> : Instruction {
+ string cstr, InstrItinClass itin, Type type> : Instruction {
field bits<32> Inst;
let Namespace = "Hexagon";
dag OutOperandList = outs;
dag InOperandList = ins;
- let AsmString = asmstr;
+ let AsmString = asmstr;
let Pattern = pattern;
let Constraints = cstr;
- let Itinerary = itin;
-
- // *** The code below must match HexagonBaseInfo.h ***
-
+ let Itinerary = itin;
+ let Size = 4;
+
+ // *** Must match HexagonBaseInfo.h ***
+ // Instruction type according to the ISA.
+ Type HexagonType = type;
+ let TSFlags{4-0} = HexagonType.Value;
+ // Solo instructions, i.e., those that cannot be in a packet with others.
+ bits<1> isHexagonSolo = 0;
+ let TSFlags{5} = isHexagonSolo;
// Predicated instructions.
bits<1> isPredicated = 0;
- let TSFlags{1} = isPredicated;
+ let TSFlags{6} = isPredicated;
// *** The code above must match HexagonBaseInfo.h ***
}
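Since the .td file insists twice that this layout must match HexagonBaseInfo.h, here is a hedged C++ mirror of the encoding as declared above (the accessor names are ours; the real ones live in HexagonBaseInfo.h):

#include <cstdint>

// Bits 4-0: instruction type; bit 5: solo; bit 6: predicated -- per the
// TSFlags assignments in InstHexagon above.
static inline unsigned getHexagonTypeBits(uint64_t TSFlags) {
  return TSFlags & 0x1f;            // Type.Value, TSFlags{4-0}
}
static inline bool isHexagonSoloBit(uint64_t TSFlags) {
  return (TSFlags >> 5) & 1;        // isHexagonSolo, TSFlags{5}
}
static inline bool isPredicatedBit(uint64_t TSFlags) {
  return (TSFlags >> 6) & 1;        // isPredicated, TSFlags{6}
}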
@@ -47,17 +66,25 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
// LD Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", LD> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> {
bits<5> rd;
bits<5> rs;
bits<13> imm13;
}
+class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> {
+ bits<5> rd;
+ bits<5> rs;
+ bits<13> imm13;
+ let mayLoad = 1;
+}
+
// LD Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern,
string cstr>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, LD> {
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, LD, TypeLD> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -68,7 +95,24 @@ class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern,
// ST Instruction Class in V4 can take SLOT0 & SLOT1.
// Definition of the instruction class CHANGED from V2/V3 to V4.
class STInst<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", ST> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> {
+ bits<5> rd;
+ bits<5> rs;
+ bits<13> imm13;
+}
+
+class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> {
+ bits<5> rd;
+ bits<5> rs;
+ bits<13> imm13;
+ let mayStore = 1;
+}
+
+// SYSTEM Instruction Class in V4 can take SLOT0 only.
+// In V2/V3 we used ST for this, but in V4 ST can take SLOT0 or SLOT1.
+class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstHexagon<outs, ins, asmstr, pattern, "", SYS, TypeSYSTEM> {
bits<5> rd;
bits<5> rs;
bits<13> imm13;
@@ -79,7 +123,7 @@ class STInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// Definition of the instruction class CHANGED from V2/V3 to V4.
class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern,
string cstr>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, ST> {
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, ST, TypeST> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -89,7 +133,7 @@ class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern,
// ALU32 Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", ALU32> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", ALU32, TypeALU32> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -102,7 +146,17 @@ class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern>
// Definition of the instruction class NOT CHANGED.
// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
class ALU64Type<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", ALU64> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", ALU64, TypeXTYPE> {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> imm16;
+ bits<16> imm16_2;
+}
+
+class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern,
+ string cstr>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU64, TypeXTYPE> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -115,7 +169,7 @@ class ALU64Type<dag outs, dag ins, string asmstr, list<dag> pattern>
// Definition of the instruction class NOT CHANGED.
// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
class MInst<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", M> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", M, TypeXTYPE> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -126,8 +180,8 @@ class MInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// Definition of the instruction class NOT CHANGED.
// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern,
- string cstr>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, M> {
+ string cstr>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, M, TypeXTYPE> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -138,9 +192,7 @@ class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern,
// Definition of the instruction class NOT CHANGED.
// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
class SInst<dag outs, dag ins, string asmstr, list<dag> pattern>
-//: InstHexagon<outs, ins, asmstr, pattern, cstr, !if(V4T, XTYPE_V4, M)> {
- : InstHexagon<outs, ins, asmstr, pattern, "", S> {
-// : InstHexagon<outs, ins, asmstr, pattern, "", S> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", S, TypeXTYPE> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -151,8 +203,8 @@ class SInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// Definition of the instruction class NOT CHANGED.
// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern,
- string cstr>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, S> {
+ string cstr>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, S, TypeXTYPE> {
// : InstHexagon<outs, ins, asmstr, pattern, cstr, S> {
// : InstHexagon<outs, ins, asmstr, pattern, cstr, !if(V4T, XTYPE_V4, S)> {
bits<5> rd;
@@ -163,14 +215,14 @@ class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern,
// J Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class JType<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", J> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", J, TypeJ> {
bits<16> imm16;
}
// JR Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class JRType<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", JR> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", JR, TypeJR> {
bits<5> rs;
bits<5> pu; // Predicate register
}
@@ -178,15 +230,22 @@ class JRType<dag outs, dag ins, string asmstr, list<dag> pattern>
// CR Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", CR> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", CR, TypeCR> {
bits<5> rs;
bits<10> imm10;
}
+class Marker<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstHexagon<outs, ins, asmstr, pattern, "", MARKER, TypeMARKER> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO>;
-
+ : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO, TypePSEUDO> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
//===----------------------------------------------------------------------===//
// Instruction Classes Definitions -
@@ -222,6 +281,11 @@ class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern>
: ALU64Type<outs, ins, asmstr, pattern> {
}
+class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : ALU64Type<outs, ins, asmstr, pattern> {
+ let rt{0-4} = 0;
+}
+
// J Type Instructions.
class JInst<dag outs, dag ins, string asmstr, list<dag> pattern>
: JType<outs, ins, asmstr, pattern> {
@@ -234,15 +298,31 @@ class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// Post increment ST Instruction.
-class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr>
+class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern,
+ string cstr>
+ : STInstPost<outs, ins, asmstr, pattern, cstr> {
+ let rt{0-4} = 0;
+}
+
+class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern,
+ string cstr>
: STInstPost<outs, ins, asmstr, pattern, cstr> {
let rt{0-4} = 0;
+ let mayStore = 1;
}
// Post increment LD Instruction.
-class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr>
+class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern,
+ string cstr>
+ : LDInstPost<outs, ins, asmstr, pattern, cstr> {
+ let rt{0-4} = 0;
+}
+
+class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern,
+ string cstr>
: LDInstPost<outs, ins, asmstr, pattern, cstr> {
let rt{0-4} = 0;
+ let mayLoad = 1;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index bd5e449..49741a3 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -11,11 +11,25 @@
//
//===----------------------------------------------------------------------===//
+//----------------------------------------------------------------------------//
+// Hexagon Instruction Flags +
+//
+// *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeMEMOP : Type<9>;
+def TypeNV : Type<10>;
+def TypePREFIX : Type<30>;
+
+//----------------------------------------------------------------------------//
+// Instruction Classes Definitions +
+//----------------------------------------------------------------------------//
+
//
// NV type instructions.
//
class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4, TypeNV> {
bits<5> rd;
bits<5> rs;
bits<13> imm13;
@@ -24,7 +38,7 @@ class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern>
// Definition of Post increment new value store.
class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern,
string cstr>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4> {
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4, TypeNV> {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -39,8 +53,15 @@ class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern,
}
class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4> {
+ : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4, TypeMEMOP> {
bits<5> rd;
bits<5> rs;
bits<6> imm6;
}
+
+class Immext<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypePREFIX> {
+ let isCodeGenOnly = 1;
+
+ bits<26> imm26;
+}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 77b3663..c8f933d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
+#include "Hexagon.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/DFAPacketizer.h"
@@ -34,24 +34,23 @@ using namespace llvm;
/// Constants for Hexagon instructions.
///
const int Hexagon_MEMW_OFFSET_MAX = 4095;
-const int Hexagon_MEMW_OFFSET_MIN = 4096;
+const int Hexagon_MEMW_OFFSET_MIN = -4096;
const int Hexagon_MEMD_OFFSET_MAX = 8191;
-const int Hexagon_MEMD_OFFSET_MIN = 8192;
+const int Hexagon_MEMD_OFFSET_MIN = -8192;
const int Hexagon_MEMH_OFFSET_MAX = 2047;
-const int Hexagon_MEMH_OFFSET_MIN = 2048;
+const int Hexagon_MEMH_OFFSET_MIN = -2048;
const int Hexagon_MEMB_OFFSET_MAX = 1023;
-const int Hexagon_MEMB_OFFSET_MIN = 1024;
+const int Hexagon_MEMB_OFFSET_MIN = -1024;
const int Hexagon_ADDI_OFFSET_MAX = 32767;
-const int Hexagon_ADDI_OFFSET_MIN = 32768;
+const int Hexagon_ADDI_OFFSET_MIN = -32768;
const int Hexagon_MEMD_AUTOINC_MAX = 56;
-const int Hexagon_MEMD_AUTOINC_MIN = 64;
+const int Hexagon_MEMD_AUTOINC_MIN = -64;
const int Hexagon_MEMW_AUTOINC_MAX = 28;
-const int Hexagon_MEMW_AUTOINC_MIN = 32;
+const int Hexagon_MEMW_AUTOINC_MIN = -32;
const int Hexagon_MEMH_AUTOINC_MAX = 14;
-const int Hexagon_MEMH_AUTOINC_MIN = 16;
+const int Hexagon_MEMH_AUTOINC_MIN = -16;
const int Hexagon_MEMB_AUTOINC_MAX = 7;
-const int Hexagon_MEMB_AUTOINC_MIN = 8;
-
+const int Hexagon_MEMB_AUTOINC_MIN = -8;
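The hunk above is purely a sign fix: every *_OFFSET_MIN / *_AUTOINC_MIN was a positive bound, so a range test of the form MIN <= Offset <= MAX could never accept the negative offsets these addressing modes allow. A hedged sketch of the kind of check such bounds feed (the real validity test lives elsewhere in this file):

// Hedged sketch: a signed-range check over the memw bounds above.
static bool fitsMemWImmediate(int Offset) {
  return Offset >= -4096 && Offset <= 4095;  // Hexagon_MEMW_OFFSET_{MIN,MAX}
}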
HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
@@ -70,6 +69,7 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
switch (MI->getOpcode()) {
+ default: break;
case Hexagon::LDriw:
case Hexagon::LDrid:
case Hexagon::LDrih:
@@ -81,11 +81,7 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
return MI->getOperand(0).getReg();
}
break;
-
- default:
- break;
}
-
return 0;
}
@@ -98,21 +94,18 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const {
switch (MI->getOpcode()) {
+ default: break;
case Hexagon::STriw:
case Hexagon::STrid:
case Hexagon::STrih:
case Hexagon::STrib:
if (MI->getOperand(2).isFI() &&
MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
- FrameIndex = MI->getOperand(2).getIndex();
- return MI->getOperand(0).getReg();
+ FrameIndex = MI->getOperand(0).getIndex();
+ return MI->getOperand(2).getReg();
}
break;
-
- default:
- break;
}
-
return 0;
}
@@ -176,6 +169,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
+ TBB = NULL;
FBB = NULL;
// If the block has no terminators, it just falls into the block after it.
@@ -328,7 +322,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
DestReg).addReg(SrcReg).addReg(SrcReg);
return;
}
- if (Hexagon::DoubleRegsRegClass.contains(DestReg, SrcReg)) {
+ if (Hexagon::DoubleRegsRegClass.contains(DestReg) &&
+ Hexagon::IntRegsRegClass.contains(SrcReg)) {
// We can have an overlap between single and double reg: r1:0 = r0.
if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) {
// r1:0 = r0
@@ -343,7 +338,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
return;
}
- if (Hexagon::CRRegsRegClass.contains(DestReg, SrcReg)) {
+ if (Hexagon::CRRegsRegClass.contains(DestReg) &&
+ Hexagon::IntRegsRegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg);
return;
}
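The predicate change above matters because TargetRegisterClass::contains(Reg1, Reg2) is true only when both registers belong to that one class; a cross-class copy such as r1:0 = r0 (DoubleReg destination, IntReg source) could never match the old test. In miniature:

#include <set>

// Miniature of the predicate fix, with sets standing in for register classes.
static bool oldTest(const std::set<int> &RC, int Dst, int Src) {
  return RC.count(Dst) && RC.count(Src);        // contains(Dst, Src): both in RC
}
static bool newTest(const std::set<int> &DstRC, const std::set<int> &SrcRC,
                    int Dst, int Src) {
  return DstRC.count(Dst) && SrcRC.count(Src);  // each reg in its own class
}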
@@ -370,15 +366,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectSize(FI),
Align);
- if (Hexagon::IntRegsRegisterClass->hasSubClassEq(RC)) {
+ if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::STriw))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
- } else if (Hexagon::DoubleRegsRegisterClass->hasSubClassEq(RC)) {
+ } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::STrid))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
- } else if (Hexagon::PredRegsRegisterClass->hasSubClassEq(RC)) {
+ } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::STriw_pred))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
@@ -415,14 +411,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineMemOperand::MOLoad,
MFI.getObjectSize(FI),
Align);
-
- if (RC == Hexagon::IntRegsRegisterClass) {
+ if (RC == &Hexagon::IntRegsRegClass) {
BuildMI(MBB, I, DL, get(Hexagon::LDriw), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- } else if (RC == Hexagon::DoubleRegsRegisterClass) {
+ } else if (RC == &Hexagon::DoubleRegsRegClass) {
BuildMI(MBB, I, DL, get(Hexagon::LDrid), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- } else if (RC == Hexagon::PredRegsRegisterClass) {
+ } else if (RC == &Hexagon::PredRegsRegClass) {
BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else {
@@ -453,11 +448,11 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *TRC;
if (VT == MVT::i1) {
- TRC = Hexagon::PredRegsRegisterClass;
- } else if (VT == MVT::i32) {
- TRC = Hexagon::IntRegsRegisterClass;
- } else if (VT == MVT::i64) {
- TRC = Hexagon::DoubleRegsRegisterClass;
+ TRC = &Hexagon::PredRegsRegClass;
+ } else if (VT == MVT::i32 || VT == MVT::f32) {
+ TRC = &Hexagon::IntRegsRegClass;
+ } else if (VT == MVT::i64 || VT == MVT::f64) {
+ TRC = &Hexagon::DoubleRegsRegClass;
} else {
llvm_unreachable("Cannot handle this register class");
}
@@ -466,7 +461,852 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
return NewReg;
}
+bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
+ switch(MI->getOpcode()) {
+ default: return false;
+ // JMP_EQri
+ case Hexagon::JMP_EQriPt_nv_V4:
+ case Hexagon::JMP_EQriPnt_nv_V4:
+ case Hexagon::JMP_EQriNotPt_nv_V4:
+ case Hexagon::JMP_EQriNotPnt_nv_V4:
+
+ // JMP_EQri - with -1
+ case Hexagon::JMP_EQriPtneg_nv_V4:
+ case Hexagon::JMP_EQriPntneg_nv_V4:
+ case Hexagon::JMP_EQriNotPtneg_nv_V4:
+ case Hexagon::JMP_EQriNotPntneg_nv_V4:
+
+ // JMP_EQrr
+ case Hexagon::JMP_EQrrPt_nv_V4:
+ case Hexagon::JMP_EQrrPnt_nv_V4:
+ case Hexagon::JMP_EQrrNotPt_nv_V4:
+ case Hexagon::JMP_EQrrNotPnt_nv_V4:
+
+ // JMP_GTri
+ case Hexagon::JMP_GTriPt_nv_V4:
+ case Hexagon::JMP_GTriPnt_nv_V4:
+ case Hexagon::JMP_GTriNotPt_nv_V4:
+ case Hexagon::JMP_GTriNotPnt_nv_V4:
+
+ // JMP_GTri - with -1
+ case Hexagon::JMP_GTriPtneg_nv_V4:
+ case Hexagon::JMP_GTriPntneg_nv_V4:
+ case Hexagon::JMP_GTriNotPtneg_nv_V4:
+ case Hexagon::JMP_GTriNotPntneg_nv_V4:
+ // JMP_GTrr
+ case Hexagon::JMP_GTrrPt_nv_V4:
+ case Hexagon::JMP_GTrrPnt_nv_V4:
+ case Hexagon::JMP_GTrrNotPt_nv_V4:
+ case Hexagon::JMP_GTrrNotPnt_nv_V4:
+
+ // JMP_GTrrdn
+ case Hexagon::JMP_GTrrdnPt_nv_V4:
+ case Hexagon::JMP_GTrrdnPnt_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPt_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
+
+ // JMP_GTUri
+ case Hexagon::JMP_GTUriPt_nv_V4:
+ case Hexagon::JMP_GTUriPnt_nv_V4:
+ case Hexagon::JMP_GTUriNotPt_nv_V4:
+ case Hexagon::JMP_GTUriNotPnt_nv_V4:
+
+ // JMP_GTUrr
+ case Hexagon::JMP_GTUrrPt_nv_V4:
+ case Hexagon::JMP_GTUrrPnt_nv_V4:
+ case Hexagon::JMP_GTUrrNotPt_nv_V4:
+ case Hexagon::JMP_GTUrrNotPnt_nv_V4:
+
+ // JMP_GTUrrdn
+ case Hexagon::JMP_GTUrrdnPt_nv_V4:
+ case Hexagon::JMP_GTUrrdnPnt_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
+
+ // TFR_FI
+ case Hexagon::TFR_FI:
+ return true;
+ }
+}
+
+bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
+ switch(MI->getOpcode()) {
+ default: return false;
+ // JMP_EQri
+ case Hexagon::JMP_EQriPt_ie_nv_V4:
+ case Hexagon::JMP_EQriPnt_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPt_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPnt_ie_nv_V4:
+
+ // JMP_EQri - with -1
+ case Hexagon::JMP_EQriPtneg_ie_nv_V4:
+ case Hexagon::JMP_EQriPntneg_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPtneg_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPntneg_ie_nv_V4:
+
+ // JMP_EQrr
+ case Hexagon::JMP_EQrrPt_ie_nv_V4:
+ case Hexagon::JMP_EQrrPnt_ie_nv_V4:
+ case Hexagon::JMP_EQrrNotPt_ie_nv_V4:
+ case Hexagon::JMP_EQrrNotPnt_ie_nv_V4:
+
+ // JMP_GTri
+ case Hexagon::JMP_GTriPt_ie_nv_V4:
+ case Hexagon::JMP_GTriPnt_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPnt_ie_nv_V4:
+
+ // JMP_GTri - with -1
+ case Hexagon::JMP_GTriPtneg_ie_nv_V4:
+ case Hexagon::JMP_GTriPntneg_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPtneg_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPntneg_ie_nv_V4:
+
+ // JMP_GTrr
+ case Hexagon::JMP_GTrrPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrPnt_ie_nv_V4:
+ case Hexagon::JMP_GTrrNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrNotPnt_ie_nv_V4:
+
+ // JMP_GTrrdn
+ case Hexagon::JMP_GTrrdnPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrdnPnt_ie_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4:
+
+ // JMP_GTUri
+ case Hexagon::JMP_GTUriPt_ie_nv_V4:
+ case Hexagon::JMP_GTUriPnt_ie_nv_V4:
+ case Hexagon::JMP_GTUriNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTUriNotPnt_ie_nv_V4:
+
+ // JMP_GTUrr
+ case Hexagon::JMP_GTUrrPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrPnt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4:
+
+ // JMP_GTUrrdn
+ case Hexagon::JMP_GTUrrdnPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4:
+
+ // V4 absolute set addressing.
+ case Hexagon::LDrid_abs_setimm_V4:
+ case Hexagon::LDriw_abs_setimm_V4:
+ case Hexagon::LDrih_abs_setimm_V4:
+ case Hexagon::LDrib_abs_setimm_V4:
+ case Hexagon::LDriuh_abs_setimm_V4:
+ case Hexagon::LDriub_abs_setimm_V4:
+
+ case Hexagon::STrid_abs_setimm_V4:
+ case Hexagon::STrib_abs_setimm_V4:
+ case Hexagon::STrih_abs_setimm_V4:
+ case Hexagon::STriw_abs_setimm_V4:
+
+ // V4 global address load.
+ case Hexagon::LDrid_GP_cPt_V4 :
+ case Hexagon::LDrid_GP_cNotPt_V4 :
+ case Hexagon::LDrid_GP_cdnPt_V4 :
+ case Hexagon::LDrid_GP_cdnNotPt_V4 :
+ case Hexagon::LDrib_GP_cPt_V4 :
+ case Hexagon::LDrib_GP_cNotPt_V4 :
+ case Hexagon::LDrib_GP_cdnPt_V4 :
+ case Hexagon::LDrib_GP_cdnNotPt_V4 :
+ case Hexagon::LDriub_GP_cPt_V4 :
+ case Hexagon::LDriub_GP_cNotPt_V4 :
+ case Hexagon::LDriub_GP_cdnPt_V4 :
+ case Hexagon::LDriub_GP_cdnNotPt_V4 :
+ case Hexagon::LDrih_GP_cPt_V4 :
+ case Hexagon::LDrih_GP_cNotPt_V4 :
+ case Hexagon::LDrih_GP_cdnPt_V4 :
+ case Hexagon::LDrih_GP_cdnNotPt_V4 :
+ case Hexagon::LDriuh_GP_cPt_V4 :
+ case Hexagon::LDriuh_GP_cNotPt_V4 :
+ case Hexagon::LDriuh_GP_cdnPt_V4 :
+ case Hexagon::LDriuh_GP_cdnNotPt_V4 :
+ case Hexagon::LDriw_GP_cPt_V4 :
+ case Hexagon::LDriw_GP_cNotPt_V4 :
+ case Hexagon::LDriw_GP_cdnPt_V4 :
+ case Hexagon::LDriw_GP_cdnNotPt_V4 :
+ case Hexagon::LDd_GP_cPt_V4 :
+ case Hexagon::LDd_GP_cNotPt_V4 :
+ case Hexagon::LDd_GP_cdnPt_V4 :
+ case Hexagon::LDd_GP_cdnNotPt_V4 :
+ case Hexagon::LDb_GP_cPt_V4 :
+ case Hexagon::LDb_GP_cNotPt_V4 :
+ case Hexagon::LDb_GP_cdnPt_V4 :
+ case Hexagon::LDb_GP_cdnNotPt_V4 :
+ case Hexagon::LDub_GP_cPt_V4 :
+ case Hexagon::LDub_GP_cNotPt_V4 :
+ case Hexagon::LDub_GP_cdnPt_V4 :
+ case Hexagon::LDub_GP_cdnNotPt_V4 :
+ case Hexagon::LDh_GP_cPt_V4 :
+ case Hexagon::LDh_GP_cNotPt_V4 :
+ case Hexagon::LDh_GP_cdnPt_V4 :
+ case Hexagon::LDh_GP_cdnNotPt_V4 :
+ case Hexagon::LDuh_GP_cPt_V4 :
+ case Hexagon::LDuh_GP_cNotPt_V4 :
+ case Hexagon::LDuh_GP_cdnPt_V4 :
+ case Hexagon::LDuh_GP_cdnNotPt_V4 :
+ case Hexagon::LDw_GP_cPt_V4 :
+ case Hexagon::LDw_GP_cNotPt_V4 :
+ case Hexagon::LDw_GP_cdnPt_V4 :
+ case Hexagon::LDw_GP_cdnNotPt_V4 :
+
+ // V4 global address store.
+ case Hexagon::STrid_GP_cPt_V4 :
+ case Hexagon::STrid_GP_cNotPt_V4 :
+ case Hexagon::STrid_GP_cdnPt_V4 :
+ case Hexagon::STrid_GP_cdnNotPt_V4 :
+ case Hexagon::STrib_GP_cPt_V4 :
+ case Hexagon::STrib_GP_cNotPt_V4 :
+ case Hexagon::STrib_GP_cdnPt_V4 :
+ case Hexagon::STrib_GP_cdnNotPt_V4 :
+ case Hexagon::STrih_GP_cPt_V4 :
+ case Hexagon::STrih_GP_cNotPt_V4 :
+ case Hexagon::STrih_GP_cdnPt_V4 :
+ case Hexagon::STrih_GP_cdnNotPt_V4 :
+ case Hexagon::STriw_GP_cPt_V4 :
+ case Hexagon::STriw_GP_cNotPt_V4 :
+ case Hexagon::STriw_GP_cdnPt_V4 :
+ case Hexagon::STriw_GP_cdnNotPt_V4 :
+ case Hexagon::STd_GP_cPt_V4 :
+ case Hexagon::STd_GP_cNotPt_V4 :
+ case Hexagon::STd_GP_cdnPt_V4 :
+ case Hexagon::STd_GP_cdnNotPt_V4 :
+ case Hexagon::STb_GP_cPt_V4 :
+ case Hexagon::STb_GP_cNotPt_V4 :
+ case Hexagon::STb_GP_cdnPt_V4 :
+ case Hexagon::STb_GP_cdnNotPt_V4 :
+ case Hexagon::STh_GP_cPt_V4 :
+ case Hexagon::STh_GP_cNotPt_V4 :
+ case Hexagon::STh_GP_cdnPt_V4 :
+ case Hexagon::STh_GP_cdnNotPt_V4 :
+ case Hexagon::STw_GP_cPt_V4 :
+ case Hexagon::STw_GP_cNotPt_V4 :
+ case Hexagon::STw_GP_cdnPt_V4 :
+ case Hexagon::STw_GP_cdnNotPt_V4 :
+
+ // V4 predicated global address new value store.
+ case Hexagon::STrib_GP_cPt_nv_V4 :
+ case Hexagon::STrib_GP_cNotPt_nv_V4 :
+ case Hexagon::STrib_GP_cdnPt_nv_V4 :
+ case Hexagon::STrib_GP_cdnNotPt_nv_V4 :
+ case Hexagon::STrih_GP_cPt_nv_V4 :
+ case Hexagon::STrih_GP_cNotPt_nv_V4 :
+ case Hexagon::STrih_GP_cdnPt_nv_V4 :
+ case Hexagon::STrih_GP_cdnNotPt_nv_V4 :
+ case Hexagon::STriw_GP_cPt_nv_V4 :
+ case Hexagon::STriw_GP_cNotPt_nv_V4 :
+ case Hexagon::STriw_GP_cdnPt_nv_V4 :
+ case Hexagon::STriw_GP_cdnNotPt_nv_V4 :
+ case Hexagon::STb_GP_cPt_nv_V4 :
+ case Hexagon::STb_GP_cNotPt_nv_V4 :
+ case Hexagon::STb_GP_cdnPt_nv_V4 :
+ case Hexagon::STb_GP_cdnNotPt_nv_V4 :
+ case Hexagon::STh_GP_cPt_nv_V4 :
+ case Hexagon::STh_GP_cNotPt_nv_V4 :
+ case Hexagon::STh_GP_cdnPt_nv_V4 :
+ case Hexagon::STh_GP_cdnNotPt_nv_V4 :
+ case Hexagon::STw_GP_cPt_nv_V4 :
+ case Hexagon::STw_GP_cNotPt_nv_V4 :
+ case Hexagon::STw_GP_cdnPt_nv_V4 :
+ case Hexagon::STw_GP_cdnNotPt_nv_V4 :
+
+ // TFR_FI
+ case Hexagon::TFR_FI_immext_V4:
+
+ // TFRI_F
+ case Hexagon::TFRI_f:
+ case Hexagon::TFRI_cPt_f:
+ case Hexagon::TFRI_cNotPt_f:
+ case Hexagon::CONST64_Float_Real:
+ return true;
+ }
+}
+
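+// Returns true for the V4 new-value compare-and-jump opcodes. As an
+// illustrative example (assembly syntax per the Hexagon V4 manual, not taken
+// from this patch):
+//   { r0 = add(r1, r2)
+//     if (cmp.eq(r0.new, #0)) jump:nt target }
+// the jump consumes a register produced earlier in the same packet.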
+bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: return false;
+ // JMP_EQri
+ case Hexagon::JMP_EQriPt_nv_V4:
+ case Hexagon::JMP_EQriPnt_nv_V4:
+ case Hexagon::JMP_EQriNotPt_nv_V4:
+ case Hexagon::JMP_EQriNotPnt_nv_V4:
+ case Hexagon::JMP_EQriPt_ie_nv_V4:
+ case Hexagon::JMP_EQriPnt_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPt_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPnt_ie_nv_V4:
+
+  // JMP_EQri -- with -1
+ case Hexagon::JMP_EQriPtneg_nv_V4:
+ case Hexagon::JMP_EQriPntneg_nv_V4:
+ case Hexagon::JMP_EQriNotPtneg_nv_V4:
+ case Hexagon::JMP_EQriNotPntneg_nv_V4:
+ case Hexagon::JMP_EQriPtneg_ie_nv_V4:
+ case Hexagon::JMP_EQriPntneg_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPtneg_ie_nv_V4:
+ case Hexagon::JMP_EQriNotPntneg_ie_nv_V4:
+
+ // JMP_EQrr
+ case Hexagon::JMP_EQrrPt_nv_V4:
+ case Hexagon::JMP_EQrrPnt_nv_V4:
+ case Hexagon::JMP_EQrrNotPt_nv_V4:
+ case Hexagon::JMP_EQrrNotPnt_nv_V4:
+ case Hexagon::JMP_EQrrPt_ie_nv_V4:
+ case Hexagon::JMP_EQrrPnt_ie_nv_V4:
+ case Hexagon::JMP_EQrrNotPt_ie_nv_V4:
+ case Hexagon::JMP_EQrrNotPnt_ie_nv_V4:
+
+ // JMP_GTri
+ case Hexagon::JMP_GTriPt_nv_V4:
+ case Hexagon::JMP_GTriPnt_nv_V4:
+ case Hexagon::JMP_GTriNotPt_nv_V4:
+ case Hexagon::JMP_GTriNotPnt_nv_V4:
+ case Hexagon::JMP_GTriPt_ie_nv_V4:
+ case Hexagon::JMP_GTriPnt_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPnt_ie_nv_V4:
+
+  // JMP_GTri -- with -1
+ case Hexagon::JMP_GTriPtneg_nv_V4:
+ case Hexagon::JMP_GTriPntneg_nv_V4:
+ case Hexagon::JMP_GTriNotPtneg_nv_V4:
+ case Hexagon::JMP_GTriNotPntneg_nv_V4:
+ case Hexagon::JMP_GTriPtneg_ie_nv_V4:
+ case Hexagon::JMP_GTriPntneg_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPtneg_ie_nv_V4:
+ case Hexagon::JMP_GTriNotPntneg_ie_nv_V4:
+
+ // JMP_GTrr
+ case Hexagon::JMP_GTrrPt_nv_V4:
+ case Hexagon::JMP_GTrrPnt_nv_V4:
+ case Hexagon::JMP_GTrrNotPt_nv_V4:
+ case Hexagon::JMP_GTrrNotPnt_nv_V4:
+ case Hexagon::JMP_GTrrPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrPnt_ie_nv_V4:
+ case Hexagon::JMP_GTrrNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrNotPnt_ie_nv_V4:
+
+ // JMP_GTrrdn
+ case Hexagon::JMP_GTrrdnPt_nv_V4:
+ case Hexagon::JMP_GTrrdnPnt_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPt_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
+ case Hexagon::JMP_GTrrdnPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrdnPnt_ie_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4:
+
+ // JMP_GTUri
+ case Hexagon::JMP_GTUriPt_nv_V4:
+ case Hexagon::JMP_GTUriPnt_nv_V4:
+ case Hexagon::JMP_GTUriNotPt_nv_V4:
+ case Hexagon::JMP_GTUriNotPnt_nv_V4:
+ case Hexagon::JMP_GTUriPt_ie_nv_V4:
+ case Hexagon::JMP_GTUriPnt_ie_nv_V4:
+ case Hexagon::JMP_GTUriNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTUriNotPnt_ie_nv_V4:
+
+ // JMP_GTUrr
+ case Hexagon::JMP_GTUrrPt_nv_V4:
+ case Hexagon::JMP_GTUrrPnt_nv_V4:
+ case Hexagon::JMP_GTUrrNotPt_nv_V4:
+ case Hexagon::JMP_GTUrrNotPnt_nv_V4:
+ case Hexagon::JMP_GTUrrPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrPnt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4:
+
+ // JMP_GTUrrdn
+ case Hexagon::JMP_GTUrrdnPt_nv_V4:
+ case Hexagon::JMP_GTUrrdnPnt_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
+ case Hexagon::JMP_GTUrrdnPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4:
+ case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4:
+ return true;
+ }
+}
+
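+// Maps an instruction to its immediate-extended form, i.e. the "_ie"
+// variants that carry a constant-extender word. For example,
+// JMP_EQriPt_nv_V4 maps to JMP_EQriPt_ie_nv_V4 (see the table below).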
+unsigned HexagonInstrInfo::getImmExtForm(const MachineInstr* MI) const {
+ switch(MI->getOpcode()) {
+ default: llvm_unreachable("Unknown type of instruction.");
+ // JMP_EQri
+ case Hexagon::JMP_EQriPt_nv_V4:
+ return Hexagon::JMP_EQriPt_ie_nv_V4;
+ case Hexagon::JMP_EQriNotPt_nv_V4:
+ return Hexagon::JMP_EQriNotPt_ie_nv_V4;
+ case Hexagon::JMP_EQriPnt_nv_V4:
+ return Hexagon::JMP_EQriPnt_ie_nv_V4;
+ case Hexagon::JMP_EQriNotPnt_nv_V4:
+ return Hexagon::JMP_EQriNotPnt_ie_nv_V4;
+
+ // JMP_EQri -- with -1
+ case Hexagon::JMP_EQriPtneg_nv_V4:
+ return Hexagon::JMP_EQriPtneg_ie_nv_V4;
+ case Hexagon::JMP_EQriNotPtneg_nv_V4:
+ return Hexagon::JMP_EQriNotPtneg_ie_nv_V4;
+ case Hexagon::JMP_EQriPntneg_nv_V4:
+ return Hexagon::JMP_EQriPntneg_ie_nv_V4;
+ case Hexagon::JMP_EQriNotPntneg_nv_V4:
+ return Hexagon::JMP_EQriNotPntneg_ie_nv_V4;
+
+ // JMP_EQrr
+ case Hexagon::JMP_EQrrPt_nv_V4:
+ return Hexagon::JMP_EQrrPt_ie_nv_V4;
+ case Hexagon::JMP_EQrrNotPt_nv_V4:
+ return Hexagon::JMP_EQrrNotPt_ie_nv_V4;
+ case Hexagon::JMP_EQrrPnt_nv_V4:
+ return Hexagon::JMP_EQrrPnt_ie_nv_V4;
+ case Hexagon::JMP_EQrrNotPnt_nv_V4:
+ return Hexagon::JMP_EQrrNotPnt_ie_nv_V4;
+
+ // JMP_GTri
+ case Hexagon::JMP_GTriPt_nv_V4:
+ return Hexagon::JMP_GTriPt_ie_nv_V4;
+ case Hexagon::JMP_GTriNotPt_nv_V4:
+ return Hexagon::JMP_GTriNotPt_ie_nv_V4;
+ case Hexagon::JMP_GTriPnt_nv_V4:
+ return Hexagon::JMP_GTriPnt_ie_nv_V4;
+ case Hexagon::JMP_GTriNotPnt_nv_V4:
+ return Hexagon::JMP_GTriNotPnt_ie_nv_V4;
+
+ // JMP_GTri -- with -1
+ case Hexagon::JMP_GTriPtneg_nv_V4:
+ return Hexagon::JMP_GTriPtneg_ie_nv_V4;
+ case Hexagon::JMP_GTriNotPtneg_nv_V4:
+ return Hexagon::JMP_GTriNotPtneg_ie_nv_V4;
+ case Hexagon::JMP_GTriPntneg_nv_V4:
+ return Hexagon::JMP_GTriPntneg_ie_nv_V4;
+ case Hexagon::JMP_GTriNotPntneg_nv_V4:
+ return Hexagon::JMP_GTriNotPntneg_ie_nv_V4;
+
+ // JMP_GTrr
+ case Hexagon::JMP_GTrrPt_nv_V4:
+ return Hexagon::JMP_GTrrPt_ie_nv_V4;
+ case Hexagon::JMP_GTrrNotPt_nv_V4:
+ return Hexagon::JMP_GTrrNotPt_ie_nv_V4;
+ case Hexagon::JMP_GTrrPnt_nv_V4:
+ return Hexagon::JMP_GTrrPnt_ie_nv_V4;
+ case Hexagon::JMP_GTrrNotPnt_nv_V4:
+ return Hexagon::JMP_GTrrNotPnt_ie_nv_V4;
+
+ // JMP_GTrrdn
+ case Hexagon::JMP_GTrrdnPt_nv_V4:
+ return Hexagon::JMP_GTrrdnPt_ie_nv_V4;
+ case Hexagon::JMP_GTrrdnNotPt_nv_V4:
+ return Hexagon::JMP_GTrrdnNotPt_ie_nv_V4;
+ case Hexagon::JMP_GTrrdnPnt_nv_V4:
+ return Hexagon::JMP_GTrrdnPnt_ie_nv_V4;
+ case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
+ return Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4;
+
+ // JMP_GTUri
+ case Hexagon::JMP_GTUriPt_nv_V4:
+ return Hexagon::JMP_GTUriPt_ie_nv_V4;
+ case Hexagon::JMP_GTUriNotPt_nv_V4:
+ return Hexagon::JMP_GTUriNotPt_ie_nv_V4;
+ case Hexagon::JMP_GTUriPnt_nv_V4:
+ return Hexagon::JMP_GTUriPnt_ie_nv_V4;
+ case Hexagon::JMP_GTUriNotPnt_nv_V4:
+ return Hexagon::JMP_GTUriNotPnt_ie_nv_V4;
+
+ // JMP_GTUrr
+ case Hexagon::JMP_GTUrrPt_nv_V4:
+ return Hexagon::JMP_GTUrrPt_ie_nv_V4;
+ case Hexagon::JMP_GTUrrNotPt_nv_V4:
+ return Hexagon::JMP_GTUrrNotPt_ie_nv_V4;
+ case Hexagon::JMP_GTUrrPnt_nv_V4:
+ return Hexagon::JMP_GTUrrPnt_ie_nv_V4;
+ case Hexagon::JMP_GTUrrNotPnt_nv_V4:
+ return Hexagon::JMP_GTUrrNotPnt_ie_nv_V4;
+
+ // JMP_GTUrrdn
+ case Hexagon::JMP_GTUrrdnPt_nv_V4:
+ return Hexagon::JMP_GTUrrdnPt_ie_nv_V4;
+ case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
+ return Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4;
+ case Hexagon::JMP_GTUrrdnPnt_nv_V4:
+ return Hexagon::JMP_GTUrrdnPnt_ie_nv_V4;
+ case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
+ return Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4;
+
+ case Hexagon::TFR_FI:
+ return Hexagon::TFR_FI_immext_V4;
+
+ case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 :
+ case Hexagon::MEMw_ADDi_indexed_MEM_V4 :
+ case Hexagon::MEMw_SUBi_indexed_MEM_V4 :
+ case Hexagon::MEMw_ADDr_indexed_MEM_V4 :
+ case Hexagon::MEMw_SUBr_indexed_MEM_V4 :
+ case Hexagon::MEMw_ANDr_indexed_MEM_V4 :
+ case Hexagon::MEMw_ORr_indexed_MEM_V4 :
+ case Hexagon::MEMw_ADDSUBi_MEM_V4 :
+ case Hexagon::MEMw_ADDi_MEM_V4 :
+ case Hexagon::MEMw_SUBi_MEM_V4 :
+ case Hexagon::MEMw_ADDr_MEM_V4 :
+ case Hexagon::MEMw_SUBr_MEM_V4 :
+ case Hexagon::MEMw_ANDr_MEM_V4 :
+ case Hexagon::MEMw_ORr_MEM_V4 :
+ case Hexagon::MEMh_ADDSUBi_indexed_MEM_V4 :
+ case Hexagon::MEMh_ADDi_indexed_MEM_V4 :
+ case Hexagon::MEMh_SUBi_indexed_MEM_V4 :
+ case Hexagon::MEMh_ADDr_indexed_MEM_V4 :
+ case Hexagon::MEMh_SUBr_indexed_MEM_V4 :
+ case Hexagon::MEMh_ANDr_indexed_MEM_V4 :
+ case Hexagon::MEMh_ORr_indexed_MEM_V4 :
+ case Hexagon::MEMh_ADDSUBi_MEM_V4 :
+ case Hexagon::MEMh_ADDi_MEM_V4 :
+ case Hexagon::MEMh_SUBi_MEM_V4 :
+ case Hexagon::MEMh_ADDr_MEM_V4 :
+ case Hexagon::MEMh_SUBr_MEM_V4 :
+ case Hexagon::MEMh_ANDr_MEM_V4 :
+ case Hexagon::MEMh_ORr_MEM_V4 :
+ case Hexagon::MEMb_ADDSUBi_indexed_MEM_V4 :
+ case Hexagon::MEMb_ADDi_indexed_MEM_V4 :
+ case Hexagon::MEMb_SUBi_indexed_MEM_V4 :
+ case Hexagon::MEMb_ADDr_indexed_MEM_V4 :
+ case Hexagon::MEMb_SUBr_indexed_MEM_V4 :
+ case Hexagon::MEMb_ANDr_indexed_MEM_V4 :
+ case Hexagon::MEMb_ORr_indexed_MEM_V4 :
+ case Hexagon::MEMb_ADDSUBi_MEM_V4 :
+ case Hexagon::MEMb_ADDi_MEM_V4 :
+ case Hexagon::MEMb_SUBi_MEM_V4 :
+ case Hexagon::MEMb_ADDr_MEM_V4 :
+ case Hexagon::MEMb_SUBr_MEM_V4 :
+ case Hexagon::MEMb_ANDr_MEM_V4 :
+ case Hexagon::MEMb_ORr_MEM_V4 :
+ llvm_unreachable("Needs implementing.");
+ }
+}
+
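+// Inverse of getImmExtForm for jumps: maps an immediate-extended ("_ie")
+// new-value jump back to its non-extended form, e.g. JMP_EQriPt_ie_nv_V4
+// back to JMP_EQriPt_nv_V4.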
+unsigned HexagonInstrInfo::getNormalBranchForm(const MachineInstr* MI) const {
+ switch(MI->getOpcode()) {
+ default: llvm_unreachable("Unknown type of jump instruction.");
+ // JMP_EQri
+ case Hexagon::JMP_EQriPt_ie_nv_V4:
+ return Hexagon::JMP_EQriPt_nv_V4;
+ case Hexagon::JMP_EQriNotPt_ie_nv_V4:
+ return Hexagon::JMP_EQriNotPt_nv_V4;
+ case Hexagon::JMP_EQriPnt_ie_nv_V4:
+ return Hexagon::JMP_EQriPnt_nv_V4;
+ case Hexagon::JMP_EQriNotPnt_ie_nv_V4:
+ return Hexagon::JMP_EQriNotPnt_nv_V4;
+
+ // JMP_EQri -- with -1
+ case Hexagon::JMP_EQriPtneg_ie_nv_V4:
+ return Hexagon::JMP_EQriPtneg_nv_V4;
+ case Hexagon::JMP_EQriNotPtneg_ie_nv_V4:
+ return Hexagon::JMP_EQriNotPtneg_nv_V4;
+ case Hexagon::JMP_EQriPntneg_ie_nv_V4:
+ return Hexagon::JMP_EQriPntneg_nv_V4;
+ case Hexagon::JMP_EQriNotPntneg_ie_nv_V4:
+ return Hexagon::JMP_EQriNotPntneg_nv_V4;
+
+ // JMP_EQrr
+ case Hexagon::JMP_EQrrPt_ie_nv_V4:
+ return Hexagon::JMP_EQrrPt_nv_V4;
+ case Hexagon::JMP_EQrrNotPt_ie_nv_V4:
+ return Hexagon::JMP_EQrrNotPt_nv_V4;
+ case Hexagon::JMP_EQrrPnt_ie_nv_V4:
+ return Hexagon::JMP_EQrrPnt_nv_V4;
+ case Hexagon::JMP_EQrrNotPnt_ie_nv_V4:
+ return Hexagon::JMP_EQrrNotPnt_nv_V4;
+
+ // JMP_GTri
+ case Hexagon::JMP_GTriPt_ie_nv_V4:
+ return Hexagon::JMP_GTriPt_nv_V4;
+ case Hexagon::JMP_GTriNotPt_ie_nv_V4:
+ return Hexagon::JMP_GTriNotPt_nv_V4;
+ case Hexagon::JMP_GTriPnt_ie_nv_V4:
+ return Hexagon::JMP_GTriPnt_nv_V4;
+ case Hexagon::JMP_GTriNotPnt_ie_nv_V4:
+ return Hexagon::JMP_GTriNotPnt_nv_V4;
+
+ // JMP_GTri -- with -1
+ case Hexagon::JMP_GTriPtneg_ie_nv_V4:
+ return Hexagon::JMP_GTriPtneg_nv_V4;
+ case Hexagon::JMP_GTriNotPtneg_ie_nv_V4:
+ return Hexagon::JMP_GTriNotPtneg_nv_V4;
+ case Hexagon::JMP_GTriPntneg_ie_nv_V4:
+ return Hexagon::JMP_GTriPntneg_nv_V4;
+ case Hexagon::JMP_GTriNotPntneg_ie_nv_V4:
+ return Hexagon::JMP_GTriNotPntneg_nv_V4;
+
+ // JMP_GTrr
+ case Hexagon::JMP_GTrrPt_ie_nv_V4:
+ return Hexagon::JMP_GTrrPt_nv_V4;
+ case Hexagon::JMP_GTrrNotPt_ie_nv_V4:
+ return Hexagon::JMP_GTrrNotPt_nv_V4;
+ case Hexagon::JMP_GTrrPnt_ie_nv_V4:
+ return Hexagon::JMP_GTrrPnt_nv_V4;
+ case Hexagon::JMP_GTrrNotPnt_ie_nv_V4:
+ return Hexagon::JMP_GTrrNotPnt_nv_V4;
+
+ // JMP_GTrrdn
+ case Hexagon::JMP_GTrrdnPt_ie_nv_V4:
+ return Hexagon::JMP_GTrrdnPt_nv_V4;
+ case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4:
+ return Hexagon::JMP_GTrrdnNotPt_nv_V4;
+ case Hexagon::JMP_GTrrdnPnt_ie_nv_V4:
+ return Hexagon::JMP_GTrrdnPnt_nv_V4;
+ case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4:
+ return Hexagon::JMP_GTrrdnNotPnt_nv_V4;
+
+ // JMP_GTUri
+ case Hexagon::JMP_GTUriPt_ie_nv_V4:
+ return Hexagon::JMP_GTUriPt_nv_V4;
+ case Hexagon::JMP_GTUriNotPt_ie_nv_V4:
+ return Hexagon::JMP_GTUriNotPt_nv_V4;
+ case Hexagon::JMP_GTUriPnt_ie_nv_V4:
+ return Hexagon::JMP_GTUriPnt_nv_V4;
+ case Hexagon::JMP_GTUriNotPnt_ie_nv_V4:
+ return Hexagon::JMP_GTUriNotPnt_nv_V4;
+
+ // JMP_GTUrr
+ case Hexagon::JMP_GTUrrPt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrPt_nv_V4;
+ case Hexagon::JMP_GTUrrNotPt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrNotPt_nv_V4;
+ case Hexagon::JMP_GTUrrPnt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrPnt_nv_V4;
+ case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrNotPnt_nv_V4;
+
+ // JMP_GTUrrdn
+ case Hexagon::JMP_GTUrrdnPt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrdnPt_nv_V4;
+ case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrdnNotPt_nv_V4;
+ case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrdnPnt_nv_V4;
+ case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4:
+ return Hexagon::JMP_GTUrrdnNotPnt_nv_V4;
+ }
+}
+
+
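+// A new-value store forwards a value produced by another instruction in the
+// same packet. Illustrative example (syntax per the Hexagon V4 manual):
+//   { r2 = add(r0, r1)
+//     memw(r3+#0) = r2.new }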
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: return false;
+ // Store Byte
+ case Hexagon::STrib_nv_V4:
+ case Hexagon::STrib_indexed_nv_V4:
+ case Hexagon::STrib_indexed_shl_nv_V4:
+ case Hexagon::STrib_shl_nv_V4:
+ case Hexagon::STrib_GP_nv_V4:
+ case Hexagon::STb_GP_nv_V4:
+ case Hexagon::POST_STbri_nv_V4:
+ case Hexagon::STrib_cPt_nv_V4:
+ case Hexagon::STrib_cdnPt_nv_V4:
+ case Hexagon::STrib_cNotPt_nv_V4:
+ case Hexagon::STrib_cdnNotPt_nv_V4:
+ case Hexagon::STrib_indexed_cPt_nv_V4:
+ case Hexagon::STrib_indexed_cdnPt_nv_V4:
+ case Hexagon::STrib_indexed_cNotPt_nv_V4:
+ case Hexagon::STrib_indexed_cdnNotPt_nv_V4:
+ case Hexagon::STrib_indexed_shl_cPt_nv_V4:
+ case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
+ case Hexagon::STrib_indexed_shl_cNotPt_nv_V4:
+ case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
+ case Hexagon::POST_STbri_cPt_nv_V4:
+ case Hexagon::POST_STbri_cdnPt_nv_V4:
+ case Hexagon::POST_STbri_cNotPt_nv_V4:
+ case Hexagon::POST_STbri_cdnNotPt_nv_V4:
+ case Hexagon::STb_GP_cPt_nv_V4:
+ case Hexagon::STb_GP_cNotPt_nv_V4:
+ case Hexagon::STb_GP_cdnPt_nv_V4:
+ case Hexagon::STb_GP_cdnNotPt_nv_V4:
+ case Hexagon::STrib_GP_cPt_nv_V4:
+ case Hexagon::STrib_GP_cNotPt_nv_V4:
+ case Hexagon::STrib_GP_cdnPt_nv_V4:
+ case Hexagon::STrib_GP_cdnNotPt_nv_V4:
+ case Hexagon::STrib_abs_nv_V4:
+ case Hexagon::STrib_abs_cPt_nv_V4:
+ case Hexagon::STrib_abs_cdnPt_nv_V4:
+ case Hexagon::STrib_abs_cNotPt_nv_V4:
+ case Hexagon::STrib_abs_cdnNotPt_nv_V4:
+ case Hexagon::STrib_imm_abs_nv_V4:
+ case Hexagon::STrib_imm_abs_cPt_nv_V4:
+ case Hexagon::STrib_imm_abs_cdnPt_nv_V4:
+ case Hexagon::STrib_imm_abs_cNotPt_nv_V4:
+ case Hexagon::STrib_imm_abs_cdnNotPt_nv_V4:
+
+ // Store Halfword
+ case Hexagon::STrih_nv_V4:
+ case Hexagon::STrih_indexed_nv_V4:
+ case Hexagon::STrih_indexed_shl_nv_V4:
+ case Hexagon::STrih_shl_nv_V4:
+ case Hexagon::STrih_GP_nv_V4:
+ case Hexagon::STh_GP_nv_V4:
+ case Hexagon::POST_SThri_nv_V4:
+ case Hexagon::STrih_cPt_nv_V4:
+ case Hexagon::STrih_cdnPt_nv_V4:
+ case Hexagon::STrih_cNotPt_nv_V4:
+ case Hexagon::STrih_cdnNotPt_nv_V4:
+ case Hexagon::STrih_indexed_cPt_nv_V4:
+ case Hexagon::STrih_indexed_cdnPt_nv_V4:
+ case Hexagon::STrih_indexed_cNotPt_nv_V4:
+ case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
+ case Hexagon::STrih_indexed_shl_cPt_nv_V4:
+ case Hexagon::STrih_indexed_shl_cdnPt_nv_V4:
+ case Hexagon::STrih_indexed_shl_cNotPt_nv_V4:
+ case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4:
+ case Hexagon::POST_SThri_cPt_nv_V4:
+ case Hexagon::POST_SThri_cdnPt_nv_V4:
+ case Hexagon::POST_SThri_cNotPt_nv_V4:
+ case Hexagon::POST_SThri_cdnNotPt_nv_V4:
+ case Hexagon::STh_GP_cPt_nv_V4:
+ case Hexagon::STh_GP_cNotPt_nv_V4:
+ case Hexagon::STh_GP_cdnPt_nv_V4:
+ case Hexagon::STh_GP_cdnNotPt_nv_V4:
+ case Hexagon::STrih_GP_cPt_nv_V4:
+ case Hexagon::STrih_GP_cNotPt_nv_V4:
+ case Hexagon::STrih_GP_cdnPt_nv_V4:
+ case Hexagon::STrih_GP_cdnNotPt_nv_V4:
+ case Hexagon::STrih_abs_nv_V4:
+ case Hexagon::STrih_abs_cPt_nv_V4:
+ case Hexagon::STrih_abs_cdnPt_nv_V4:
+ case Hexagon::STrih_abs_cNotPt_nv_V4:
+ case Hexagon::STrih_abs_cdnNotPt_nv_V4:
+ case Hexagon::STrih_imm_abs_nv_V4:
+ case Hexagon::STrih_imm_abs_cPt_nv_V4:
+ case Hexagon::STrih_imm_abs_cdnPt_nv_V4:
+ case Hexagon::STrih_imm_abs_cNotPt_nv_V4:
+ case Hexagon::STrih_imm_abs_cdnNotPt_nv_V4:
+
+ // Store Word
+ case Hexagon::STriw_nv_V4:
+ case Hexagon::STriw_indexed_nv_V4:
+ case Hexagon::STriw_indexed_shl_nv_V4:
+ case Hexagon::STriw_shl_nv_V4:
+ case Hexagon::STriw_GP_nv_V4:
+ case Hexagon::STw_GP_nv_V4:
+ case Hexagon::POST_STwri_nv_V4:
+ case Hexagon::STriw_cPt_nv_V4:
+ case Hexagon::STriw_cdnPt_nv_V4:
+ case Hexagon::STriw_cNotPt_nv_V4:
+ case Hexagon::STriw_cdnNotPt_nv_V4:
+ case Hexagon::STriw_indexed_cPt_nv_V4:
+ case Hexagon::STriw_indexed_cdnPt_nv_V4:
+ case Hexagon::STriw_indexed_cNotPt_nv_V4:
+ case Hexagon::STriw_indexed_cdnNotPt_nv_V4:
+ case Hexagon::STriw_indexed_shl_cPt_nv_V4:
+ case Hexagon::STriw_indexed_shl_cdnPt_nv_V4:
+ case Hexagon::STriw_indexed_shl_cNotPt_nv_V4:
+ case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4:
+ case Hexagon::POST_STwri_cPt_nv_V4:
+ case Hexagon::POST_STwri_cdnPt_nv_V4:
+ case Hexagon::POST_STwri_cNotPt_nv_V4:
+ case Hexagon::POST_STwri_cdnNotPt_nv_V4:
+ case Hexagon::STw_GP_cPt_nv_V4:
+ case Hexagon::STw_GP_cNotPt_nv_V4:
+ case Hexagon::STw_GP_cdnPt_nv_V4:
+ case Hexagon::STw_GP_cdnNotPt_nv_V4:
+ case Hexagon::STriw_GP_cPt_nv_V4:
+ case Hexagon::STriw_GP_cNotPt_nv_V4:
+ case Hexagon::STriw_GP_cdnPt_nv_V4:
+ case Hexagon::STriw_GP_cdnNotPt_nv_V4:
+ case Hexagon::STriw_abs_nv_V4:
+ case Hexagon::STriw_abs_cPt_nv_V4:
+ case Hexagon::STriw_abs_cdnPt_nv_V4:
+ case Hexagon::STriw_abs_cNotPt_nv_V4:
+ case Hexagon::STriw_abs_cdnNotPt_nv_V4:
+ case Hexagon::STriw_imm_abs_nv_V4:
+ case Hexagon::STriw_imm_abs_cPt_nv_V4:
+ case Hexagon::STriw_imm_abs_cdnPt_nv_V4:
+ case Hexagon::STriw_imm_abs_cNotPt_nv_V4:
+ case Hexagon::STriw_imm_abs_cdnNotPt_nv_V4:
+ return true;
+ }
+}
+
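+// Post-increment memory instructions update their base register as a side
+// effect; e.g. (illustrative syntax) "r0 = memb(r1++#1)" loads a byte and
+// then advances r1 by one.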
+bool HexagonInstrInfo::isPostIncrement(const MachineInstr* MI) const {
+ switch (MI->getOpcode())
+ {
+ default: return false;
+ // Load Byte
+ case Hexagon::POST_LDrib:
+ case Hexagon::POST_LDrib_cPt:
+ case Hexagon::POST_LDrib_cNotPt:
+ case Hexagon::POST_LDrib_cdnPt_V4:
+ case Hexagon::POST_LDrib_cdnNotPt_V4:
+
+ // Load unsigned byte
+ case Hexagon::POST_LDriub:
+ case Hexagon::POST_LDriub_cPt:
+ case Hexagon::POST_LDriub_cNotPt:
+ case Hexagon::POST_LDriub_cdnPt_V4:
+ case Hexagon::POST_LDriub_cdnNotPt_V4:
+
+ // Load halfword
+ case Hexagon::POST_LDrih:
+ case Hexagon::POST_LDrih_cPt:
+ case Hexagon::POST_LDrih_cNotPt:
+ case Hexagon::POST_LDrih_cdnPt_V4:
+ case Hexagon::POST_LDrih_cdnNotPt_V4:
+
+ // Load unsigned halfword
+ case Hexagon::POST_LDriuh:
+ case Hexagon::POST_LDriuh_cPt:
+ case Hexagon::POST_LDriuh_cNotPt:
+ case Hexagon::POST_LDriuh_cdnPt_V4:
+ case Hexagon::POST_LDriuh_cdnNotPt_V4:
+
+ // Load word
+ case Hexagon::POST_LDriw:
+ case Hexagon::POST_LDriw_cPt:
+ case Hexagon::POST_LDriw_cNotPt:
+ case Hexagon::POST_LDriw_cdnPt_V4:
+ case Hexagon::POST_LDriw_cdnNotPt_V4:
+
+ // Load double word
+ case Hexagon::POST_LDrid:
+ case Hexagon::POST_LDrid_cPt:
+ case Hexagon::POST_LDrid_cNotPt:
+ case Hexagon::POST_LDrid_cdnPt_V4:
+ case Hexagon::POST_LDrid_cdnNotPt_V4:
+
+ // Store byte
+ case Hexagon::POST_STbri:
+ case Hexagon::POST_STbri_cPt:
+ case Hexagon::POST_STbri_cNotPt:
+ case Hexagon::POST_STbri_cdnPt_V4:
+ case Hexagon::POST_STbri_cdnNotPt_V4:
+
+ // Store halfword
+ case Hexagon::POST_SThri:
+ case Hexagon::POST_SThri_cPt:
+ case Hexagon::POST_SThri_cNotPt:
+ case Hexagon::POST_SThri_cdnPt_V4:
+ case Hexagon::POST_SThri_cdnNotPt_V4:
+
+ // Store word
+ case Hexagon::POST_STwri:
+ case Hexagon::POST_STwri_cPt:
+ case Hexagon::POST_STwri_cNotPt:
+ case Hexagon::POST_STwri_cdnPt_V4:
+ case Hexagon::POST_STwri_cdnNotPt_V4:
+
+ // Store double word
+ case Hexagon::POST_STdri:
+ case Hexagon::POST_STdri_cPt:
+ case Hexagon::POST_STdri_cNotPt:
+ case Hexagon::POST_STdri_cdnPt_V4:
+ case Hexagon::POST_STdri_cdnNotPt_V4:
+ return true;
+ }
+}
+
+bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
+ return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4;
+}
bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
bool isPred = MI->getDesc().isPredicable();
@@ -548,7 +1388,7 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
case Hexagon::SXTH:
case Hexagon::ZXTB:
case Hexagon::ZXTH:
- return Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4;
+ return Subtarget.hasV4TOps();
case Hexagon::JMPR:
return false;
@@ -557,8 +1397,27 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
return true;
}
+// This function performs the following inversions:
+//
+// cPt ---> cNotPt
+// cNotPt ---> cPt
+//
+// However, these inversions are NOT included:
+//
+// cdnPt -X-> cdnNotPt
+// cdnNotPt -X-> cdnPt
+// cPt_nv -X-> cNotPt_nv (new value stores)
+// cNotPt_nv -X-> cPt_nv (new value stores)
+//
+// because only the following transformations are allowed:
+//
+// cNotPt ---> cdnNotPt
+// cPt ---> cdnPt
+// cNotPt ---> cNotPt_nv
+// cPt ---> cPt_nv
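+//
+// For example, getInvertedPredicatedOpcode(Hexagon::TFR_cPt) returns
+// Hexagon::TFR_cNotPt, and vice versa.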
unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
switch(Opc) {
+ default: llvm_unreachable("Unexpected predicated instruction");
case Hexagon::TFR_cPt:
return Hexagon::TFR_cNotPt;
case Hexagon::TFR_cNotPt:
@@ -805,6 +1664,47 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
case Hexagon::STrid_indexed_shl_cNotPt_V4:
return Hexagon::STrid_indexed_shl_cPt_V4;
+ // V4 Store to global address.
+ case Hexagon::STd_GP_cPt_V4:
+ return Hexagon::STd_GP_cNotPt_V4;
+ case Hexagon::STd_GP_cNotPt_V4:
+ return Hexagon::STd_GP_cPt_V4;
+
+ case Hexagon::STb_GP_cPt_V4:
+ return Hexagon::STb_GP_cNotPt_V4;
+ case Hexagon::STb_GP_cNotPt_V4:
+ return Hexagon::STb_GP_cPt_V4;
+
+ case Hexagon::STh_GP_cPt_V4:
+ return Hexagon::STh_GP_cNotPt_V4;
+ case Hexagon::STh_GP_cNotPt_V4:
+ return Hexagon::STh_GP_cPt_V4;
+
+ case Hexagon::STw_GP_cPt_V4:
+ return Hexagon::STw_GP_cNotPt_V4;
+ case Hexagon::STw_GP_cNotPt_V4:
+ return Hexagon::STw_GP_cPt_V4;
+
+ case Hexagon::STrid_GP_cPt_V4:
+ return Hexagon::STrid_GP_cNotPt_V4;
+ case Hexagon::STrid_GP_cNotPt_V4:
+ return Hexagon::STrid_GP_cPt_V4;
+
+ case Hexagon::STrib_GP_cPt_V4:
+ return Hexagon::STrib_GP_cNotPt_V4;
+ case Hexagon::STrib_GP_cNotPt_V4:
+ return Hexagon::STrib_GP_cPt_V4;
+
+ case Hexagon::STrih_GP_cPt_V4:
+ return Hexagon::STrih_GP_cNotPt_V4;
+ case Hexagon::STrih_GP_cNotPt_V4:
+ return Hexagon::STrih_GP_cPt_V4;
+
+ case Hexagon::STriw_GP_cPt_V4:
+ return Hexagon::STriw_GP_cNotPt_V4;
+ case Hexagon::STriw_GP_cNotPt_V4:
+ return Hexagon::STriw_GP_cPt_V4;
+
// Load.
case Hexagon::LDrid_cPt:
return Hexagon::LDrid_cNotPt;
@@ -1009,9 +1909,6 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
return Hexagon::JMP_GTUrrdnNotPnt_nv_V4;
case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
return Hexagon::JMP_GTUrrdnPnt_nv_V4;
-
- default:
- llvm_unreachable("Unexpected predicated instruction");
}
}
@@ -1022,12 +1919,21 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
case Hexagon::TFR:
return !invertPredicate ? Hexagon::TFR_cPt :
Hexagon::TFR_cNotPt;
+ case Hexagon::TFRI_f:
+ return !invertPredicate ? Hexagon::TFRI_cPt_f :
+ Hexagon::TFRI_cNotPt_f;
case Hexagon::TFRI:
return !invertPredicate ? Hexagon::TFRI_cPt :
Hexagon::TFRI_cNotPt;
case Hexagon::JMP:
return !invertPredicate ? Hexagon::JMP_c :
Hexagon::JMP_cNot;
+ case Hexagon::JMP_EQrrPt_nv_V4:
+ return !invertPredicate ? Hexagon::JMP_EQrrPt_nv_V4 :
+ Hexagon::JMP_EQrrNotPt_nv_V4;
+ case Hexagon::JMP_EQriPt_nv_V4:
+ return !invertPredicate ? Hexagon::JMP_EQriPt_nv_V4 :
+ Hexagon::JMP_EQriNotPt_nv_V4;
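+  // Note: the new-value jump opcodes above are already predicated forms;
+  // when no inversion is requested they map to themselves.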
case Hexagon::ADD_ri:
return !invertPredicate ? Hexagon::ADD_ri_cPt :
Hexagon::ADD_ri_cNotPt;
@@ -1121,6 +2027,46 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
case Hexagon::LDriw_indexed_shl_V4:
return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 :
Hexagon::LDriw_indexed_shl_cNotPt_V4;
+
+ // V4 Load from global address
+ case Hexagon::LDrid_GP_V4:
+ return !invertPredicate ? Hexagon::LDrid_GP_cPt_V4 :
+ Hexagon::LDrid_GP_cNotPt_V4;
+ case Hexagon::LDrib_GP_V4:
+ return !invertPredicate ? Hexagon::LDrib_GP_cPt_V4 :
+ Hexagon::LDrib_GP_cNotPt_V4;
+ case Hexagon::LDriub_GP_V4:
+ return !invertPredicate ? Hexagon::LDriub_GP_cPt_V4 :
+ Hexagon::LDriub_GP_cNotPt_V4;
+ case Hexagon::LDrih_GP_V4:
+ return !invertPredicate ? Hexagon::LDrih_GP_cPt_V4 :
+ Hexagon::LDrih_GP_cNotPt_V4;
+ case Hexagon::LDriuh_GP_V4:
+ return !invertPredicate ? Hexagon::LDriuh_GP_cPt_V4 :
+ Hexagon::LDriuh_GP_cNotPt_V4;
+ case Hexagon::LDriw_GP_V4:
+ return !invertPredicate ? Hexagon::LDriw_GP_cPt_V4 :
+ Hexagon::LDriw_GP_cNotPt_V4;
+
+ case Hexagon::LDd_GP_V4:
+ return !invertPredicate ? Hexagon::LDd_GP_cPt_V4 :
+ Hexagon::LDd_GP_cNotPt_V4;
+ case Hexagon::LDb_GP_V4:
+ return !invertPredicate ? Hexagon::LDb_GP_cPt_V4 :
+ Hexagon::LDb_GP_cNotPt_V4;
+ case Hexagon::LDub_GP_V4:
+ return !invertPredicate ? Hexagon::LDub_GP_cPt_V4 :
+ Hexagon::LDub_GP_cNotPt_V4;
+ case Hexagon::LDh_GP_V4:
+ return !invertPredicate ? Hexagon::LDh_GP_cPt_V4 :
+ Hexagon::LDh_GP_cNotPt_V4;
+ case Hexagon::LDuh_GP_V4:
+ return !invertPredicate ? Hexagon::LDuh_GP_cPt_V4 :
+ Hexagon::LDuh_GP_cNotPt_V4;
+ case Hexagon::LDw_GP_V4:
+ return !invertPredicate ? Hexagon::LDw_GP_cPt_V4 :
+ Hexagon::LDw_GP_cNotPt_V4;
+
// Byte.
case Hexagon::POST_STbri:
return !invertPredicate ? Hexagon::POST_STbri_cPt :
@@ -1182,6 +2128,34 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
case Hexagon::STrid_indexed_shl_V4:
return !invertPredicate ? Hexagon::STrid_indexed_shl_cPt_V4 :
Hexagon::STrid_indexed_shl_cNotPt_V4;
+
+ // V4 Store to global address
+ case Hexagon::STrid_GP_V4:
+ return !invertPredicate ? Hexagon::STrid_GP_cPt_V4 :
+ Hexagon::STrid_GP_cNotPt_V4;
+ case Hexagon::STrib_GP_V4:
+ return !invertPredicate ? Hexagon::STrib_GP_cPt_V4 :
+ Hexagon::STrib_GP_cNotPt_V4;
+ case Hexagon::STrih_GP_V4:
+ return !invertPredicate ? Hexagon::STrih_GP_cPt_V4 :
+ Hexagon::STrih_GP_cNotPt_V4;
+ case Hexagon::STriw_GP_V4:
+ return !invertPredicate ? Hexagon::STriw_GP_cPt_V4 :
+ Hexagon::STriw_GP_cNotPt_V4;
+
+ case Hexagon::STd_GP_V4:
+ return !invertPredicate ? Hexagon::STd_GP_cPt_V4 :
+ Hexagon::STd_GP_cNotPt_V4;
+ case Hexagon::STb_GP_V4:
+ return !invertPredicate ? Hexagon::STb_GP_cPt_V4 :
+ Hexagon::STb_GP_cNotPt_V4;
+ case Hexagon::STh_GP_V4:
+ return !invertPredicate ? Hexagon::STh_GP_cPt_V4 :
+ Hexagon::STh_GP_cNotPt_V4;
+ case Hexagon::STw_GP_V4:
+ return !invertPredicate ? Hexagon::STw_GP_cPt_V4 :
+ Hexagon::STw_GP_cNotPt_V4;
+
// Load.
case Hexagon::LDrid:
return !invertPredicate ? Hexagon::LDrid_cPt :
@@ -1201,9 +2175,6 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
case Hexagon::LDriub:
return !invertPredicate ? Hexagon::LDriub_cPt :
Hexagon::LDriub_cNotPt;
- case Hexagon::LDriubit:
- return !invertPredicate ? Hexagon::LDriub_cPt :
- Hexagon::LDriub_cNotPt;
// Load Indexed.
case Hexagon::LDrid_indexed:
return !invertPredicate ? Hexagon::LDrid_indexed_cPt :
@@ -1297,7 +2268,7 @@ PredicateInstruction(MachineInstr *MI,
bool
HexagonInstrInfo::
isProfitableToIfCvt(MachineBasicBlock &MBB,
- unsigned NumCyles,
+ unsigned NumCycles,
unsigned ExtraPredCycles,
const BranchProbability &Probability) const {
return true;
@@ -1323,7 +2294,6 @@ bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const {
return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
}
-
bool
HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
std::vector<MachineOperand> &Pred) const {
@@ -1331,7 +2301,7 @@ HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
MachineOperand MO = MI->getOperand(oper);
if (MO.isReg() && MO.isDef()) {
const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg());
- if (RC == Hexagon::PredRegsRegisterClass) {
+ if (RC == &Hexagon::PredRegsRegClass) {
Pred.push_back(MO);
return true;
}
@@ -1373,6 +2343,7 @@ isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs,
bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
switch (MI->getOpcode()) {
+ default: return false;
case Hexagon::DEALLOC_RET_V4 :
case Hexagon::DEALLOC_RET_cPt_V4 :
case Hexagon::DEALLOC_RET_cNotPt_V4 :
@@ -1382,7 +2353,6 @@ bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
case Hexagon::DEALLOC_RET_cNotdnPt_V4 :
return true;
}
- return false;
}
@@ -1396,13 +2366,17 @@ isValidOffset(const int Opcode, const int Offset) const {
switch(Opcode) {
case Hexagon::LDriw:
+ case Hexagon::LDriw_f:
case Hexagon::STriw:
+ case Hexagon::STriw_f:
assert((Offset % 4 == 0) && "Offset has incorrect alignment");
return (Offset >= Hexagon_MEMW_OFFSET_MIN) &&
(Offset <= Hexagon_MEMW_OFFSET_MAX);
case Hexagon::LDrid:
+ case Hexagon::LDrid_f:
case Hexagon::STrid:
+ case Hexagon::STrid_f:
assert((Offset % 8 == 0) && "Offset has incorrect alignment");
return (Offset >= Hexagon_MEMD_OFFSET_MIN) &&
(Offset <= Hexagon_MEMD_OFFSET_MAX);
@@ -1410,7 +2384,6 @@ isValidOffset(const int Opcode, const int Offset) const {
case Hexagon::LDrih:
case Hexagon::LDriuh:
case Hexagon::STrih:
- case Hexagon::LDrih_ae:
assert((Offset % 2 == 0) && "Offset has incorrect alignment");
return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
(Offset <= Hexagon_MEMH_OFFSET_MAX);
@@ -1418,9 +2391,6 @@ isValidOffset(const int Opcode, const int Offset) const {
case Hexagon::LDrib:
case Hexagon::STrib:
case Hexagon::LDriub:
- case Hexagon::LDriubit:
- case Hexagon::LDrib_ae:
- case Hexagon::LDriub_ae:
return (Offset >= Hexagon_MEMB_OFFSET_MIN) &&
(Offset <= Hexagon_MEMB_OFFSET_MAX);
@@ -1528,6 +2498,7 @@ bool HexagonInstrInfo::
isMemOp(const MachineInstr *MI) const {
switch (MI->getOpcode())
{
+ default: return false;
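+  // MEMOPs are the V4 read-modify-write memory instructions, e.g.
+  // (illustrative syntax) "memw(r0+#4) += r1".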
case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 :
case Hexagon::MEMw_ADDi_indexed_MEM_V4 :
case Hexagon::MEMw_SUBi_indexed_MEM_V4 :
@@ -1570,28 +2541,59 @@ isMemOp(const MachineInstr *MI) const {
case Hexagon::MEMb_SUBr_MEM_V4 :
case Hexagon::MEMb_ANDr_MEM_V4 :
case Hexagon::MEMb_ORr_MEM_V4 :
- return true;
+ return true;
}
- return false;
}
bool HexagonInstrInfo::
isSpillPredRegOp(const MachineInstr *MI) const {
- switch (MI->getOpcode())
- {
+ switch (MI->getOpcode()) {
+ default: return false;
case Hexagon::STriw_pred :
case Hexagon::LDriw_pred :
- return true;
+ return true;
+ }
+}
+
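+// Compares whose result a V4 new-value jump can consume directly (see
+// isNewValueJump above).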
+bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case Hexagon::CMPEQrr:
+ case Hexagon::CMPEQri:
+ case Hexagon::CMPLTrr:
+ case Hexagon::CMPGTrr:
+ case Hexagon::CMPGTri:
+ case Hexagon::CMPLTUrr:
+ case Hexagon::CMPGTUrr:
+ case Hexagon::CMPGTUri:
+ case Hexagon::CMPGEri:
+ case Hexagon::CMPGEUri:
+ return true;
}
- return false;
}
+bool HexagonInstrInfo::
+isConditionalTransfer(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case Hexagon::TFR_cPt:
+ case Hexagon::TFR_cNotPt:
+ case Hexagon::TFRI_cPt:
+ case Hexagon::TFRI_cNotPt:
+ case Hexagon::TFR_cdnPt:
+ case Hexagon::TFR_cdnNotPt:
+ case Hexagon::TFRI_cdnPt:
+ case Hexagon::TFRI_cdnNotPt:
+ return true;
+ }
+}
bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const {
const HexagonRegisterInfo& QRI = getRegisterInfo();
switch (MI->getOpcode())
{
+ default: return false;
case Hexagon::ADD_ri_cPt:
case Hexagon::ADD_ri_cNotPt:
case Hexagon::ADD_rr_cPt:
@@ -1619,19 +2621,16 @@ bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const {
case Hexagon::ZXTB_cNotPt_V4:
case Hexagon::ZXTH_cPt_V4:
case Hexagon::ZXTH_cNotPt_V4:
- return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4;
-
- default:
- return false;
+ return QRI.Subtarget.hasV4TOps();
}
}
-
bool HexagonInstrInfo::
isConditionalLoad (const MachineInstr* MI) const {
const HexagonRegisterInfo& QRI = getRegisterInfo();
switch (MI->getOpcode())
{
+ default: return false;
case Hexagon::LDrid_cPt :
case Hexagon::LDrid_cNotPt :
case Hexagon::LDrid_indexed_cPt :
@@ -1669,7 +2668,7 @@ isConditionalLoad (const MachineInstr* MI) const {
case Hexagon::POST_LDriuh_cNotPt :
case Hexagon::POST_LDriub_cPt :
case Hexagon::POST_LDriub_cNotPt :
- return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4;
+ return QRI.Subtarget.hasV4TOps();
case Hexagon::LDrid_indexed_cPt_V4 :
case Hexagon::LDrid_indexed_cNotPt_V4 :
case Hexagon::LDrid_indexed_shl_cPt_V4 :
@@ -1694,12 +2693,136 @@ isConditionalLoad (const MachineInstr* MI) const {
case Hexagon::LDriw_indexed_cNotPt_V4 :
case Hexagon::LDriw_indexed_shl_cPt_V4 :
case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
- return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4;
- default:
- return false;
+ return QRI.Subtarget.hasV4TOps();
}
}
+// Returns true if an instruction is a conditional store.
+//
+// Note: it does not include conditional new-value stores, because their
+// predicate cannot be promoted to the .new form (see the diagram below).
+//
+// p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
+// ^ ^
+// / \ (not OK. it will cause new-value store to be
+// / X conditional on p0.new while R2 producer is
+// / \ on p0)
+// / \.
+// p.new store p.old NV store
+// [if(p0.new)memw(R0+#0)=R2] [if(p0)memw(R0+#0)=R2.new]
+// ^ ^
+// \ /
+// \ /
+// \ /
+// p.old store
+// [if (p0)memw(R0+#0)=R2]
+//
+// The above diagram shows the steps involved in the conversion of a predicated
+// store instruction to its .new predicated new-value form.
+//
+// The following set of instructions further explains the scenario where
+// conditional new-value store becomes invalid when promoted to .new predicate
+// form.
+//
+// { 1) if (p0) r0 = add(r1, r2)
+// 2) p0 = cmp.eq(r3, #0) }
+//
+// 3) if (p0) memb(r1+#0) = r0  --> this instruction can't be grouped with
+// the first two instructions, because in instr 1 r0 is conditional on the old
+// value of p0, while its use in instr 3 would be conditional on the p0
+// modified by instr 2, which is not valid for new-value stores.
+bool HexagonInstrInfo::
+isConditionalStore(const MachineInstr* MI) const {
+ const HexagonRegisterInfo& QRI = getRegisterInfo();
+ switch (MI->getOpcode())
+ {
+ default: return false;
+ case Hexagon::STrib_imm_cPt_V4 :
+ case Hexagon::STrib_imm_cNotPt_V4 :
+ case Hexagon::STrib_indexed_shl_cPt_V4 :
+ case Hexagon::STrib_indexed_shl_cNotPt_V4 :
+ case Hexagon::STrib_cPt :
+ case Hexagon::STrib_cNotPt :
+ case Hexagon::POST_STbri_cPt :
+ case Hexagon::POST_STbri_cNotPt :
+ case Hexagon::STrid_indexed_cPt :
+ case Hexagon::STrid_indexed_cNotPt :
+ case Hexagon::STrid_indexed_shl_cPt_V4 :
+ case Hexagon::POST_STdri_cPt :
+ case Hexagon::POST_STdri_cNotPt :
+ case Hexagon::STrih_cPt :
+ case Hexagon::STrih_cNotPt :
+ case Hexagon::STrih_indexed_cPt :
+ case Hexagon::STrih_indexed_cNotPt :
+ case Hexagon::STrih_imm_cPt_V4 :
+ case Hexagon::STrih_imm_cNotPt_V4 :
+ case Hexagon::STrih_indexed_shl_cPt_V4 :
+ case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+ case Hexagon::POST_SThri_cPt :
+ case Hexagon::POST_SThri_cNotPt :
+ case Hexagon::STriw_cPt :
+ case Hexagon::STriw_cNotPt :
+ case Hexagon::STriw_indexed_cPt :
+ case Hexagon::STriw_indexed_cNotPt :
+ case Hexagon::STriw_imm_cPt_V4 :
+ case Hexagon::STriw_imm_cNotPt_V4 :
+ case Hexagon::STriw_indexed_shl_cPt_V4 :
+ case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+ case Hexagon::POST_STwri_cPt :
+ case Hexagon::POST_STwri_cNotPt :
+ return QRI.Subtarget.hasV4TOps();
+
+ // V4 global address store before promoting to dot new.
+ case Hexagon::STrid_GP_cPt_V4 :
+ case Hexagon::STrid_GP_cNotPt_V4 :
+ case Hexagon::STrib_GP_cPt_V4 :
+ case Hexagon::STrib_GP_cNotPt_V4 :
+ case Hexagon::STrih_GP_cPt_V4 :
+ case Hexagon::STrih_GP_cNotPt_V4 :
+ case Hexagon::STriw_GP_cPt_V4 :
+ case Hexagon::STriw_GP_cNotPt_V4 :
+ case Hexagon::STd_GP_cPt_V4 :
+ case Hexagon::STd_GP_cNotPt_V4 :
+ case Hexagon::STb_GP_cPt_V4 :
+ case Hexagon::STb_GP_cNotPt_V4 :
+ case Hexagon::STh_GP_cPt_V4 :
+ case Hexagon::STh_GP_cNotPt_V4 :
+ case Hexagon::STw_GP_cPt_V4 :
+ case Hexagon::STw_GP_cNotPt_V4 :
+ return QRI.Subtarget.hasV4TOps();
+
+  // Predicated new-value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+  // from the "Conditional Store" list, because a predicated new-value store
+  // would NOT be promoted to a double dot-new store; see the diagram below.
+  // This function returns true for stores that are predicated but not yet
+  // promoted to predicate dot-new instructions.
+ //
+ // +---------------------+
+ // /-----| if (p0) memw(..)=r0 |---------\~
+ // || +---------------------+ ||
+ // promote || /\ /\ || promote
+ // || /||\ /||\ ||
+ // \||/ demote || \||/
+ // \/ || || \/
+ // +-------------------------+ || +-------------------------+
+ // | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new |
+ // +-------------------------+ || +-------------------------+
+ // || || ||
+ // || demote \||/
+ // promote || \/ NOT possible
+ // || || /\~
+ // \||/ || /||\~
+ // \/ || ||
+ // +-----------------------------+
+ // | if (p0.new) memw(..)=r0.new |
+ // +-----------------------------+
+ // Double Dot New Store
+ //
+ }
+}
+
DFAPacketizer *HexagonInstrInfo::
CreateTargetScheduleState(const TargetMachine *TM,
const ScheduleDAG *DAG) const {
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 7306870..2bb53f8 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -112,7 +112,7 @@ public:
PredicateInstruction(MachineInstr *MI,
const SmallVectorImpl<MachineOperand> &Cond) const;
- virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
const BranchProbability &Probability) const;
@@ -160,10 +160,21 @@ public:
bool isS8_Immediate(const int value) const;
bool isS6_Immediate(const int value) const;
+ bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const;
+ bool isConditionalTransfer(const MachineInstr* MI) const;
bool isConditionalALU32 (const MachineInstr* MI) const;
bool isConditionalLoad (const MachineInstr* MI) const;
+ bool isConditionalStore(const MachineInstr* MI) const;
bool isDeallocRet(const MachineInstr *MI) const;
unsigned getInvertedPredicatedOpcode(const int Opc) const;
+ bool isExtendable(const MachineInstr* MI) const;
+ bool isExtended(const MachineInstr* MI) const;
+ bool isPostIncrement(const MachineInstr* MI) const;
+ bool isNewValueStore(const MachineInstr* MI) const;
+ bool isNewValueJump(const MachineInstr* MI) const;
+ bool isNewValueJumpCandidate(const MachineInstr *MI) const;
+ unsigned getImmExtForm(const MachineInstr* MI) const;
+ unsigned getNormalBranchForm(const MachineInstr* MI) const;
private:
int getMatchingCondBranchOpcode(int Opc, bool sense) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index b563ac3..c0c0df6 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -25,7 +25,10 @@ def HasV3TOnly : Predicate<"Subtarget.hasV3TOpsOnly()">;
def NoV3T : Predicate<"!Subtarget.hasV3TOps()">;
def HasV4T : Predicate<"Subtarget.hasV4TOps()">;
def NoV4T : Predicate<"!Subtarget.hasV4TOps()">;
+def HasV5T : Predicate<"Subtarget.hasV5TOps()">;
+def NoV5T : Predicate<"!Subtarget.hasV5TOps()">;
def UseMEMOP : Predicate<"Subtarget.useMemOps()">;
+def IEEERndNearV5T : Predicate<"Subtarget.modeIEEERndNear()">;
// Addressing modes.
def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
@@ -84,10 +87,12 @@ def symbolLo32 : Operand<i32> {
multiclass ALU32_rr_ri<string OpcStr, SDNode OpNode> {
def rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
- [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>;
+ [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$b),
+ (i32 IntRegs:$c)))]>;
def ri : ALU32_ri<(outs IntRegs:$dst), (ins s10Imm:$b, IntRegs:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "(#$b, $c)")),
- [(set IntRegs:$dst, (OpNode s10Imm:$b, IntRegs:$c))]>;
+ [(set (i32 IntRegs:$dst), (OpNode s10Imm:$b,
+ (i32 IntRegs:$c)))]>;
}
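+// For illustration only (hypothetical use, not from this file):
+//   defm FOO : ALU32_rr_ri<"foo", somenode>;
+// would instantiate FOOrr and FOOri with the asm strings
+// "$dst = foo($b, $c)" and "$dst = foo(#$b, $c)".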
// Multi-class for compare ops.
@@ -95,111 +100,114 @@ let isCompare = 1 in {
multiclass CMP64_rr<string OpcStr, PatFrag OpNode> {
def rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
- [(set PredRegs:$dst, (OpNode DoubleRegs:$b, DoubleRegs:$c))]>;
+ [(set (i1 PredRegs:$dst),
+ (OpNode (i64 DoubleRegs:$b), (i64 DoubleRegs:$c)))]>;
}
multiclass CMP32_rr<string OpcStr, PatFrag OpNode> {
def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
- [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>;
+ [(set (i1 PredRegs:$dst),
+ (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
}
multiclass CMP32_rr_ri_s10<string OpcStr, PatFrag OpNode> {
def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
- [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>;
+ [(set (i1 PredRegs:$dst),
+ (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s10Imm:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
- [(set PredRegs:$dst, (OpNode IntRegs:$b, s10ImmPred:$c))]>;
+ [(set (i1 PredRegs:$dst),
+ (OpNode (i32 IntRegs:$b), s10ImmPred:$c))]>;
}
multiclass CMP32_rr_ri_u9<string OpcStr, PatFrag OpNode> {
def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
- [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>;
+ [(set (i1 PredRegs:$dst),
+ (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
- [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>;
+ [(set (i1 PredRegs:$dst),
+ (OpNode (i32 IntRegs:$b), u9ImmPred:$c))]>;
}
-multiclass CMP32_ri_u9<string OpcStr, PatFrag OpNode> {
- def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c),
+multiclass CMP32_ri_u8<string OpcStr, PatFrag OpNode> {
+ def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u8Imm:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
- [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>;
+ [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b),
+ u8ImmPred:$c))]>;
}
multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> {
def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s8Imm:$c),
!strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
- [(set PredRegs:$dst, (OpNode IntRegs:$b, s8ImmPred:$c))]>;
+ [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b),
+ s8ImmPred:$c))]>;
}
}
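+// For example, the "defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", seteq>;" further
+// below instantiates CMPEQrr and CMPEQri with the asm strings
+// "$dst = cmp.eq($b, $c)" and "$dst = cmp.eq($b, #$c)".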
//===----------------------------------------------------------------------===//
-// Instructions
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// http://qualnet.qualcomm.com/~erich/v1/htmldocs/index.html
-// http://qualnet.qualcomm.com/~erich/v2/htmldocs/index.html
-// http://qualnet.qualcomm.com/~erich/v3/htmldocs/index.html
-// http://qualnet.qualcomm.com/~erich/v4/htmldocs/index.html
-// http://qualnet.qualcomm.com/~erich/v5/htmldocs/index.html
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
// ALU32/ALU +
//===----------------------------------------------------------------------===//
// Add.
-let isPredicable = 1 in
+let isCommutable = 1, isPredicable = 1 in
def ADD_rr : ALU32_rr<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = add($src1, $src2)",
- [(set IntRegs:$dst, (add IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
let isPredicable = 1 in
def ADD_ri : ALU32_ri<(outs IntRegs:$dst),
(ins IntRegs:$src1, s16Imm:$src2),
"$dst = add($src1, #$src2)",
- [(set IntRegs:$dst, (add IntRegs:$src1, s16ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
+ s16ImmPred:$src2))]>;
// Logical operations.
let isPredicable = 1 in
def XOR_rr : ALU32_rr<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = xor($src1, $src2)",
- [(set IntRegs:$dst, (xor IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
-let isPredicable = 1 in
+let isCommutable = 1, isPredicable = 1 in
def AND_rr : ALU32_rr<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = and($src1, $src2)",
- [(set IntRegs:$dst, (and IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
def OR_ri : ALU32_ri<(outs IntRegs:$dst),
- (ins IntRegs:$src1, s8Imm:$src2),
+ (ins IntRegs:$src1, s10Imm:$src2),
"$dst = or($src1, #$src2)",
- [(set IntRegs:$dst, (or IntRegs:$src1, s8ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
+ s10ImmPred:$src2))]>;
def NOT_rr : ALU32_rr<(outs IntRegs:$dst),
(ins IntRegs:$src1),
"$dst = not($src1)",
- [(set IntRegs:$dst, (not IntRegs:$src1))]>;
+ [(set (i32 IntRegs:$dst), (not (i32 IntRegs:$src1)))]>;
def AND_ri : ALU32_ri<(outs IntRegs:$dst),
(ins IntRegs:$src1, s10Imm:$src2),
"$dst = and($src1, #$src2)",
- [(set IntRegs:$dst, (and IntRegs:$src1, s10ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
+ s10ImmPred:$src2))]>;
-let isPredicable = 1 in
+let isCommutable = 1, isPredicable = 1 in
def OR_rr : ALU32_rr<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = or($src1, $src2)",
- [(set IntRegs:$dst, (or IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
// Negate.
def NEG : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = neg($src1)",
- [(set IntRegs:$dst, (ineg IntRegs:$src1))]>;
+ [(set (i32 IntRegs:$dst), (ineg (i32 IntRegs:$src1)))]>;
// Nop.
let neverHasSideEffects = 1 in
def NOP : ALU32_rr<(outs), (ins),
@@ -211,13 +219,20 @@ let isPredicable = 1 in
def SUB_rr : ALU32_rr<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = sub($src1, $src2)",
- [(set IntRegs:$dst, (sub IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
+
+// Rd32=sub(#s10,Rs32)
+def SUB_ri : ALU32_ri<(outs IntRegs:$dst),
+ (ins s10Imm:$src1, IntRegs:$src2),
+ "$dst = sub(#$src1, $src2)",
+                 [(set (i32 IntRegs:$dst), (sub s10ImmPred:$src1,
+                                                (i32 IntRegs:$src2)))]>;
// Transfer immediate.
-let isReMaterializable = 1, isPredicable = 1 in
+let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in
def TFRI : ALU32_ri<(outs IntRegs:$dst), (ins s16Imm:$src1),
"$dst = #$src1",
- [(set IntRegs:$dst, s16ImmPred:$src1)]>;
+ [(set (i32 IntRegs:$dst), s16ImmPred:$src1)]>;
// Transfer register.
let neverHasSideEffects = 1, isPredicable = 1 in
@@ -225,6 +240,11 @@ def TFR : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = $src1",
[]>;
+let neverHasSideEffects = 1, isPredicable = 1 in
+def TFR64 : ALU32_ri<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+ "$dst = $src1",
+ []>;
+
// Transfer control register.
let neverHasSideEffects = 1 in
def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1),
@@ -246,6 +266,12 @@ def COMBINE_rr : ALU32_rr<(outs DoubleRegs:$dst),
"$dst = combine($src1, $src2)",
[]>;
+let neverHasSideEffects = 1 in
+def COMBINE_ii : ALU32_ii<(outs DoubleRegs:$dst),
+ (ins s8Imm:$src1, s8Imm:$src2),
+ "$dst = combine(#$src1, #$src2)",
+ []>;
+
// Mux.
def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
DoubleRegs:$src2,
@@ -256,48 +282,52 @@ def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
def MUX_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
IntRegs:$src2, IntRegs:$src3),
"$dst = mux($src1, $src2, $src3)",
- [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2,
- IntRegs:$src3))]>;
+ [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1),
+ (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))]>;
def MUX_ir : ALU32_ir<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2,
IntRegs:$src3),
"$dst = mux($src1, #$src2, $src3)",
- [(set IntRegs:$dst, (select PredRegs:$src1,
- s8ImmPred:$src2, IntRegs:$src3))]>;
+ [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1),
+ s8ImmPred:$src2,
+ (i32 IntRegs:$src3))))]>;
def MUX_ri : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2,
s8Imm:$src3),
"$dst = mux($src1, $src2, #$src3)",
- [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2,
- s8ImmPred:$src3))]>;
+ [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1),
+ (i32 IntRegs:$src2),
+ s8ImmPred:$src3)))]>;
def MUX_ii : ALU32_ii<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2,
s8Imm:$src3),
"$dst = mux($src1, #$src2, #$src3)",
- [(set IntRegs:$dst, (select PredRegs:$src1, s8ImmPred:$src2,
- s8ImmPred:$src3))]>;
+ [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1),
+ s8ImmPred:$src2,
+ s8ImmPred:$src3)))]>;
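+// Together the four MUX variants cover every register/immediate shape of
+// (select i1, i32, i32); e.g. MUX_ri matches a select whose false operand
+// is an s8 immediate.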
// Shift halfword.
let isPredicable = 1 in
def ASLH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = aslh($src1)",
- [(set IntRegs:$dst, (shl 16, IntRegs:$src1))]>;
+            [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1), (i32 16)))]>;
let isPredicable = 1 in
def ASRH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = asrh($src1)",
- [(set IntRegs:$dst, (sra 16, IntRegs:$src1))]>;
+            [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1), (i32 16)))]>;
// Sign extend.
let isPredicable = 1 in
def SXTB : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = sxtb($src1)",
- [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i8))]>;
+ [(set (i32 IntRegs:$dst), (sext_inreg (i32 IntRegs:$src1), i8))]>;
let isPredicable = 1 in
def SXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = sxth($src1)",
- [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i16))]>;
+ [(set (i32 IntRegs:$dst), (sext_inreg (i32 IntRegs:$src1), i16))]>;
// Zero extend.
let isPredicable = 1, neverHasSideEffects = 1 in
@@ -321,25 +351,25 @@ def ZXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
// Conditional add.
let neverHasSideEffects = 1, isPredicated = 1 in
def ADD_ri_cPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3),
+ (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
"if ($src1) $dst = add($src2, #$src3)",
[]>;
let neverHasSideEffects = 1, isPredicated = 1 in
def ADD_ri_cNotPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3),
+ (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
"if (!$src1) $dst = add($src2, #$src3)",
[]>;
let neverHasSideEffects = 1, isPredicated = 1 in
def ADD_ri_cdnPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3),
+ (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
"if ($src1.new) $dst = add($src2, #$src3)",
[]>;
let neverHasSideEffects = 1, isPredicated = 1 in
def ADD_ri_cdnNotPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3),
+ (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
"if (!$src1.new) $dst = add($src2, #$src3)",
[]>;
@@ -497,7 +527,6 @@ def SUB_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst),
// Conditional transfer.
-
let neverHasSideEffects = 1, isPredicated = 1 in
def TFR_cPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2),
"if ($src1) $dst = $src2",
@@ -510,6 +539,18 @@ def TFR_cNotPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
[]>;
let neverHasSideEffects = 1, isPredicated = 1 in
+def TFR64_cPt : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
+ DoubleRegs:$src2),
+ "if ($src1) $dst = $src2",
+ []>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def TFR64_cNotPt : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
+ DoubleRegs:$src2),
+ "if (!$src1) $dst = $src2",
+ []>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
def TFRI_cPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2),
"if ($src1) $dst = #$src2",
[]>;
@@ -548,25 +589,14 @@ def TFRI_cdnNotPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1,
defm CMPGTU : CMP32_rr_ri_u9<"cmp.gtu", setugt>;
defm CMPGT : CMP32_rr_ri_s10<"cmp.gt", setgt>;
defm CMPLT : CMP32_rr<"cmp.lt", setlt>;
+defm CMPLTU : CMP32_rr<"cmp.ltu", setult>;
defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", seteq>;
defm CMPGE : CMP32_ri_s8<"cmp.ge", setge>;
-defm CMPGEU : CMP32_ri_u9<"cmp.geu", setuge>;
+defm CMPGEU : CMP32_ri_u8<"cmp.geu", setuge>;
//===----------------------------------------------------------------------===//
// ALU32/PRED -
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// ALU32/VH +
-//===----------------------------------------------------------------------===//
-// Vector add halfwords
-
-// Vector averagehalfwords
-
-// Vector subtract halfwords
-//===----------------------------------------------------------------------===//
-// ALU32/VH -
-//===----------------------------------------------------------------------===//
-
//===----------------------------------------------------------------------===//
// ALU64/ALU +
@@ -575,8 +605,8 @@ defm CMPGEU : CMP32_ri_u9<"cmp.geu", setuge>;
def ADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
DoubleRegs:$src2),
"$dst = add($src1, $src2)",
- [(set DoubleRegs:$dst, (add DoubleRegs:$src1,
- DoubleRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (add (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2)))]>;
// Add halfword.
@@ -589,40 +619,93 @@ defm CMPGTU64 : CMP64_rr<"cmp.gtu", setugt>;
def AND_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
DoubleRegs:$src2),
"$dst = and($src1, $src2)",
- [(set DoubleRegs:$dst, (and DoubleRegs:$src1,
- DoubleRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2)))]>;
def OR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
DoubleRegs:$src2),
"$dst = or($src1, $src2)",
- [(set DoubleRegs:$dst, (or DoubleRegs:$src1, DoubleRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (or (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2)))]>;
def XOR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
DoubleRegs:$src2),
"$dst = xor($src1, $src2)",
- [(set DoubleRegs:$dst, (xor DoubleRegs:$src1,
- DoubleRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2)))]>;
// Maximum.
def MAXw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = max($src2, $src1)",
- [(set IntRegs:$dst, (select (i1 (setlt IntRegs:$src2,
- IntRegs:$src1)),
- IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 (setlt (i32 IntRegs:$src2),
+ (i32 IntRegs:$src1))),
+ (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MAXUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = maxu($src2, $src1)",
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 (setult (i32 IntRegs:$src2),
+ (i32 IntRegs:$src1))),
+ (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MAXd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ "$dst = max($src2, $src1)",
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src1))),
+ (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2))))]>;
+
+def MAXUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ "$dst = maxu($src2, $src1)",
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (select (i1 (setult (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src1))),
+ (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2))))]>;
// Minimum.
def MINw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = min($src2, $src1)",
- [(set IntRegs:$dst, (select (i1 (setgt IntRegs:$src2,
- IntRegs:$src1)),
- IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 (setgt (i32 IntRegs:$src2),
+ (i32 IntRegs:$src1))),
+ (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MINUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = minu($src2, $src1)",
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 (setugt (i32 IntRegs:$src2),
+ (i32 IntRegs:$src1))),
+ (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MINd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ "$dst = min($src2, $src1)",
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src1))),
+ (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2))))]>;
+
+def MINUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ "$dst = minu($src2, $src1)",
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (select (i1 (setugt (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src1))),
+ (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2))))]>;
// Subtract.
def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
DoubleRegs:$src2),
"$dst = sub($src1, $src2)",
- [(set DoubleRegs:$dst, (sub DoubleRegs:$src1,
- DoubleRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (sub (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2)))]>;
// Subtract halfword.
@@ -652,30 +735,6 @@ def TFR_64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// ALU64/VB +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/VB -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/VH +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/VH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/VW +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/VW -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
// CR +
//===----------------------------------------------------------------------===//
// Logical reductions on predicates.
@@ -687,7 +746,8 @@ def TFR_64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
// Logical operations on predicates.
def AND_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
"$dst = and($src1, $src2)",
- [(set PredRegs:$dst, (and PredRegs:$src1, PredRegs:$src2))]>;
+ [(set (i1 PredRegs:$dst), (and (i1 PredRegs:$src1),
+ (i1 PredRegs:$src2)))]>;
let neverHasSideEffects = 1 in
def AND_pnotp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1,
@@ -726,15 +786,17 @@ def MASK_p : SInst<(outs DoubleRegs:$dst), (ins PredRegs:$src1),
def NOT_p : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
"$dst = not($src1)",
- [(set PredRegs:$dst, (not PredRegs:$src1))]>;
+ [(set (i1 PredRegs:$dst), (not (i1 PredRegs:$src1)))]>;
def OR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
"$dst = or($src1, $src2)",
- [(set PredRegs:$dst, (or PredRegs:$src1, PredRegs:$src2))]>;
+ [(set (i1 PredRegs:$dst), (or (i1 PredRegs:$src1),
+ (i1 PredRegs:$src2)))]>;
def XOR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
"$dst = xor($src1, $src2)",
- [(set PredRegs:$dst, (xor PredRegs:$src1, PredRegs:$src2))]>;
+ [(set (i1 PredRegs:$dst), (xor (i1 PredRegs:$src1),
+ (i1 PredRegs:$src2)))]>;
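
Predicate registers carry i1, so the logical ops above get the same explicit typing as the i32/i64 patterns. AND_pnotp still carries an empty pattern in this revision; purely as an illustration (this rule is not in the commit, and it assumes the second operand is the complemented one), a matching rule would look like:

    // Hypothetical selection rule for the and-with-complement form:
    def : Pat<(and (i1 PredRegs:$src1), (not (i1 PredRegs:$src2))),
              (AND_pnotp PredRegs:$src1, PredRegs:$src2)>;
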
// User control register transfer.
@@ -760,7 +822,7 @@ let isBranch = 1, isTerminator=1, Defs = [PC],
def JMP_c : JInst< (outs),
(ins PredRegs:$src, brtarget:$offset),
"if ($src) jump $offset",
- [(brcond PredRegs:$src, bb:$offset)]>;
+ [(brcond (i1 PredRegs:$src), bb:$offset)]>;
}
// if (!p0) jump
@@ -826,7 +888,7 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
// Jump to address from register.
-let isReturn = 1, isTerminator = 1, isBarrier = 1,
+let isPredicable = 1, isReturn = 1, isTerminator = 1, isBarrier = 1,
Defs = [PC], Uses = [R31] in {
def JMPR: JRInst<(outs), (ins),
"jumpr r31",
@@ -834,7 +896,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1,
}
// Jump to address from register.
-let isReturn = 1, isTerminator = 1, isBarrier = 1,
+let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1,
Defs = [PC], Uses = [R31] in {
def JMPR_cPt: JRInst<(outs), (ins PredRegs:$src1),
"if ($src1) jumpr r31",
@@ -842,7 +904,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1,
}
// Jump to address from register.
-let isReturn = 1, isTerminator = 1, isBarrier = 1,
+let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1,
Defs = [PC], Uses = [R31] in {
def JMPR_cNotPt: JRInst<(outs), (ins PredRegs:$src1),
"if (!$src1) jumpr r31",
@@ -865,96 +927,99 @@ let isPredicable = 1 in
def LDrid : LDInst<(outs DoubleRegs:$dst),
(ins MEMri:$addr),
"$dst = memd($addr)",
- [(set DoubleRegs:$dst, (load ADDRriS11_3:$addr))]>;
+ [(set (i64 DoubleRegs:$dst), (i64 (load ADDRriS11_3:$addr)))]>;
let isPredicable = 1, AddedComplexity = 20 in
def LDrid_indexed : LDInst<(outs DoubleRegs:$dst),
(ins IntRegs:$src1, s11_3Imm:$offset),
- "$dst=memd($src1+#$offset)",
- [(set DoubleRegs:$dst, (load (add IntRegs:$src1,
- s11_3ImmPred:$offset)))]>;
+ "$dst = memd($src1+#$offset)",
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (load (add (i32 IntRegs:$src1),
+ s11_3ImmPred:$offset))))]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_GP : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDrid_GP : LDInst2<(outs DoubleRegs:$dst),
(ins globaladdress:$global, u16Imm:$offset),
- "$dst=memd(#$global+$offset)",
- []>;
+ "$dst = memd(#$global+$offset)",
+ []>,
+ Requires<[NoV4T]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDd_GP : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDd_GP : LDInst2<(outs DoubleRegs:$dst),
(ins globaladdress:$global),
- "$dst=memd(#$global)",
- []>;
+ "$dst = memd(#$global)",
+ []>,
+ Requires<[NoV4T]>;
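
All the GP-relative (#global) forms are now fenced with Requires<[NoV4T]>, limiting them to pre-V4 subtargets; V4 presumably supplies its own absolute-addressing variants elsewhere. NoV4T itself is defined outside this hunk; the usual idiom would be roughly (an assumption, not shown in this diff):

    // Assumed subtarget predicate, following the standard Predicate idiom:
    def NoV4T : Predicate<"!Subtarget.hasV4TOps()">;
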
-let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrid : LDInstPI<(outs DoubleRegs:$dst, IntRegs:$dst2),
+let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
+def POST_LDrid : LDInst2PI<(outs DoubleRegs:$dst, IntRegs:$dst2),
(ins IntRegs:$src1, s4Imm:$offset),
"$dst = memd($src1++#$offset)",
[],
"$src1 = $dst2">;
// Load doubleword conditionally.
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_cPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_cPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1) $dst = memd($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_cNotPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_cNotPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1) $dst = memd($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_indexed_cPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_indexed_cPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3),
- "if ($src1) $dst=memd($src2+#$src3)",
+ "if ($src1) $dst = memd($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_indexed_cNotPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_indexed_cNotPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3),
- "if (!$src1) $dst=memd($src2+#$src3)",
+ "if (!$src1) $dst = memd($src2+#$src3)",
[]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrid_cPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrid_cPt : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3),
"if ($src1) $dst1 = memd($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrid_cNotPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrid_cNotPt : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3),
"if (!$src1) $dst1 = memd($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_cdnPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_cdnPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1.new) $dst = memd($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_cdnNotPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_cdnNotPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1.new) $dst = memd($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_indexed_cdnPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_indexed_cdnPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3),
- "if ($src1.new) $dst=memd($src2+#$src3)",
+ "if ($src1.new) $dst = memd($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrid_indexed_cdnNotPt : LDInst<(outs DoubleRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_indexed_cdnNotPt : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3),
- "if (!$src1.new) $dst=memd($src2+#$src3)",
+ "if (!$src1.new) $dst = memd($src2+#$src3)",
[]>;
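
Note the change repeated through every conditional load in this hunk: the per-definition mayLoad = 1 disappears as the defs move from LDInst to LDInst2. The natural reading (an assumption; the format classes live in HexagonInstrFormats.td, outside this diff) is that the new class bakes the flag in, roughly:

    // Assumed shape of the new format class (defined outside this diff):
    class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern>
      : LDInst<outs, ins, asmstr, pattern> {
      let mayLoad = 1;
    }
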
@@ -963,114 +1028,113 @@ let isPredicable = 1 in
def LDrib : LDInst<(outs IntRegs:$dst),
(ins MEMri:$addr),
"$dst = memb($addr)",
- [(set IntRegs:$dst, (sextloadi8 ADDRriS11_0:$addr))]>;
+ [(set (i32 IntRegs:$dst), (i32 (sextloadi8 ADDRriS11_0:$addr)))]>;
-def LDrib_ae : LDInst<(outs IntRegs:$dst),
- (ins MEMri:$addr),
- "$dst = memb($addr)",
- [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>;
+// Load byte any-extend.
+def : Pat < (i32 (extloadi8 ADDRriS11_0:$addr)),
+ (i32 (LDrib ADDRriS11_0:$addr)) >;
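
LDrib_ae is gone as a separate opcode: an any-extending byte load places no constraint on the upper bits, so the sign-extending LDrib is a correct implementation and a bare Pat reroutes the selector onto it. One instruction now serves two selection rules:

    // sextloadi8 matches via LDrib's own pattern; extloadi8 is rerouted
    // onto the same opcode, since anyext leaves the high bits unspecified.
    def : Pat<(i32 (extloadi8 ADDRriS11_0:$addr)),
              (i32 (LDrib ADDRriS11_0:$addr))>;
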
// Indexed load byte.
let isPredicable = 1, AddedComplexity = 20 in
def LDrib_indexed : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, s11_0Imm:$offset),
- "$dst=memb($src1+#$offset)",
- [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1,
- s11_0ImmPred:$offset)))]>;
-
+ "$dst = memb($src1+#$offset)",
+ [(set (i32 IntRegs:$dst),
+ (i32 (sextloadi8 (add (i32 IntRegs:$src1),
+ s11_0ImmPred:$offset))))]>;
// Indexed load byte any-extend.
let AddedComplexity = 20 in
-def LDrib_ae_indexed : LDInst<(outs IntRegs:$dst),
- (ins IntRegs:$src1, s11_0Imm:$offset),
- "$dst=memb($src1+#$offset)",
- [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1,
- s11_0ImmPred:$offset)))]>;
+def : Pat < (i32 (extloadi8 (add IntRegs:$src1, s11_0ImmPred:$offset))),
+ (i32 (LDrib_indexed IntRegs:$src1, s11_0ImmPred:$offset)) >;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDrib_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global, u16Imm:$offset),
- "$dst=memb(#$global+$offset)",
- []>;
+ "$dst = memb(#$global+$offset)",
+ []>,
+ Requires<[NoV4T]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDb_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDb_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global),
- "$dst=memb(#$global)",
- []>;
+ "$dst = memb(#$global)",
+ []>,
+ Requires<[NoV4T]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDub_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDub_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global),
- "$dst=memub(#$global)",
- []>;
+ "$dst = memub(#$global)",
+ []>,
+ Requires<[NoV4T]>;
-let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrib : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2),
+let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
+def POST_LDrib : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2),
(ins IntRegs:$src1, s4Imm:$offset),
"$dst = memb($src1++#$offset)",
[],
"$src1 = $dst2">;
// Load byte conditionally.
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1) $dst = memb($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1) $dst = memb($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_indexed_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_indexed_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if ($src1) $dst = memb($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_indexed_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_indexed_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if (!$src1) $dst = memb($src2+#$src3)",
[]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrib_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrib_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if ($src1) $dst1 = memb($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrib_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrib_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if (!$src1) $dst1 = memb($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1.new) $dst = memb($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1.new) $dst = memb($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_indexed_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_indexed_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if ($src1.new) $dst = memb($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrib_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if (!$src1.new) $dst = memb($src2+#$src3)",
[]>;
@@ -1081,112 +1145,110 @@ let isPredicable = 1 in
def LDrih : LDInst<(outs IntRegs:$dst),
(ins MEMri:$addr),
"$dst = memh($addr)",
- [(set IntRegs:$dst, (sextloadi16 ADDRriS11_1:$addr))]>;
+ [(set (i32 IntRegs:$dst), (i32 (sextloadi16 ADDRriS11_1:$addr)))]>;
let isPredicable = 1, AddedComplexity = 20 in
def LDrih_indexed : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, s11_1Imm:$offset),
- "$dst=memh($src1+#$offset)",
- [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1,
- s11_1ImmPred:$offset)))] >;
+ "$dst = memh($src1+#$offset)",
+ [(set (i32 IntRegs:$dst),
+ (i32 (sextloadi16 (add (i32 IntRegs:$src1),
+ s11_1ImmPred:$offset))))]>;
-def LDrih_ae : LDInst<(outs IntRegs:$dst),
- (ins MEMri:$addr),
- "$dst = memh($addr)",
- [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>;
+def : Pat < (i32 (extloadi16 ADDRriS11_1:$addr)),
+ (i32 (LDrih ADDRriS11_1:$addr))>;
let AddedComplexity = 20 in
-def LDrih_ae_indexed : LDInst<(outs IntRegs:$dst),
- (ins IntRegs:$src1, s11_1Imm:$offset),
- "$dst=memh($src1+#$offset)",
- [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1,
- s11_1ImmPred:$offset)))] >;
+def : Pat < (i32 (extloadi16 (add IntRegs:$src1, s11_1ImmPred:$offset))),
+ (i32 (LDrih_indexed IntRegs:$src1, s11_1ImmPred:$offset)) >;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDrih_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global, u16Imm:$offset),
- "$dst=memh(#$global+$offset)",
- []>;
+ "$dst = memh(#$global+$offset)",
+ []>,
+ Requires<[NoV4T]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDh_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDh_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global),
- "$dst=memh(#$global)",
- []>;
+ "$dst = memh(#$global)",
+ []>,
+ Requires<[NoV4T]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDuh_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDuh_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global),
- "$dst=memuh(#$global)",
- []>;
-
+ "$dst = memuh(#$global)",
+ []>,
+ Requires<[NoV4T]>;
-let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrih : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2),
+let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
+def POST_LDrih : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2),
(ins IntRegs:$src1, s4Imm:$offset),
"$dst = memh($src1++#$offset)",
[],
"$src1 = $dst2">;
// Load halfword conditionally.
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1) $dst = memh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1) $dst = memh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_indexed_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_indexed_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if ($src1) $dst = memh($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_indexed_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_indexed_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if (!$src1) $dst = memh($src2+#$src3)",
[]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrih_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrih_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if ($src1) $dst1 = memh($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDrih_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrih_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if (!$src1) $dst1 = memh($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1.new) $dst = memh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1.new) $dst = memh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_indexed_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_indexed_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if ($src1.new) $dst = memh($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDrih_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if (!$src1.new) $dst = memh($src2+#$src3)",
[]>;
@@ -1196,113 +1258,96 @@ let isPredicable = 1 in
def LDriub : LDInst<(outs IntRegs:$dst),
(ins MEMri:$addr),
"$dst = memub($addr)",
- [(set IntRegs:$dst, (zextloadi8 ADDRriS11_0:$addr))]>;
+ [(set (i32 IntRegs:$dst), (i32 (zextloadi8 ADDRriS11_0:$addr)))]>;
-let isPredicable = 1 in
-def LDriubit : LDInst<(outs IntRegs:$dst),
- (ins MEMri:$addr),
- "$dst = memub($addr)",
- [(set IntRegs:$dst, (zextloadi1 ADDRriS11_0:$addr))]>;
+def : Pat < (i32 (zextloadi1 ADDRriS11_0:$addr)),
+ (i32 (LDriub ADDRriS11_0:$addr))>;
let isPredicable = 1, AddedComplexity = 20 in
def LDriub_indexed : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, s11_0Imm:$offset),
- "$dst=memub($src1+#$offset)",
- [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1,
- s11_0ImmPred:$offset)))]>;
-
-let AddedComplexity = 20 in
-def LDriubit_indexed : LDInst<(outs IntRegs:$dst),
- (ins IntRegs:$src1, s11_0Imm:$offset),
- "$dst=memub($src1+#$offset)",
- [(set IntRegs:$dst, (zextloadi1 (add IntRegs:$src1,
- s11_0ImmPred:$offset)))]>;
-
-def LDriub_ae : LDInst<(outs IntRegs:$dst),
- (ins MEMri:$addr),
- "$dst = memub($addr)",
- [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>;
-
+ "$dst = memub($src1+#$offset)",
+ [(set (i32 IntRegs:$dst),
+ (i32 (zextloadi8 (add (i32 IntRegs:$src1),
+ s11_0ImmPred:$offset))))]>;
let AddedComplexity = 20 in
-def LDriub_ae_indexed : LDInst<(outs IntRegs:$dst),
- (ins IntRegs:$src1, s11_0Imm:$offset),
- "$dst=memub($src1+#$offset)",
- [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1,
- s11_0ImmPred:$offset)))]>;
+def : Pat < (i32 (zextloadi1 (add IntRegs:$src1, s11_0ImmPred:$offset))),
+ (i32 (LDriub_indexed IntRegs:$src1, s11_0ImmPred:$offset))>;
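
LDriubit and LDriub_ae collapse the same way: a zero-extending one-bit load is just an unsigned byte load, so both become Pats over LDriub/LDriub_indexed. The `let AddedComplexity = 20 in` above covers the indexed Pat, which is how the selector is told to prefer the folded reg+imm address when both rules match:

    // The replacement pair: plain and folded-address forms of the i1 load.
    def : Pat<(i32 (zextloadi1 ADDRriS11_0:$addr)),
              (i32 (LDriub ADDRriS11_0:$addr))>;
    let AddedComplexity = 20 in  // prefer reg+imm when the add folds
    def : Pat<(i32 (zextloadi1 (add IntRegs:$src1, s11_0ImmPred:$offset))),
              (i32 (LDriub_indexed IntRegs:$src1, s11_0ImmPred:$offset))>;
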
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDriub_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global, u16Imm:$offset),
- "$dst=memub(#$global+$offset)",
- []>;
+ "$dst = memub(#$global+$offset)",
+ []>,
+ Requires<[NoV4T]>;
-let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriub : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2),
+let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
+def POST_LDriub : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2),
(ins IntRegs:$src1, s4Imm:$offset),
"$dst = memub($src1++#$offset)",
[],
"$src1 = $dst2">;
// Load unsigned byte conditionally.
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1) $dst = memub($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1) $dst = memub($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_indexed_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_indexed_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if ($src1) $dst = memub($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_indexed_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_indexed_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if (!$src1) $dst = memub($src2+#$src3)",
[]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriub_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriub_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if ($src1) $dst1 = memub($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriub_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriub_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if (!$src1) $dst1 = memub($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1.new) $dst = memub($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1.new) $dst = memub($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_indexed_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_indexed_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if ($src1.new) $dst = memub($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriub_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3),
"if (!$src1.new) $dst = memub($src2+#$src3)",
[]>;
@@ -1312,102 +1357,90 @@ let isPredicable = 1 in
def LDriuh : LDInst<(outs IntRegs:$dst),
(ins MEMri:$addr),
"$dst = memuh($addr)",
- [(set IntRegs:$dst, (zextloadi16 ADDRriS11_1:$addr))]>;
+ [(set (i32 IntRegs:$dst), (i32 (zextloadi16 ADDRriS11_1:$addr)))]>;
// Indexed load unsigned halfword.
let isPredicable = 1, AddedComplexity = 20 in
def LDriuh_indexed : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, s11_1Imm:$offset),
- "$dst=memuh($src1+#$offset)",
- [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1,
- s11_1ImmPred:$offset)))]>;
-
-def LDriuh_ae : LDInst<(outs IntRegs:$dst),
- (ins MEMri:$addr),
- "$dst = memuh($addr)",
- [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>;
+ "$dst = memuh($src1+#$offset)",
+ [(set (i32 IntRegs:$dst),
+ (i32 (zextloadi16 (add (i32 IntRegs:$src1),
+ s11_1ImmPred:$offset))))]>;
-
-// Indexed load unsigned halfword any-extend.
-let AddedComplexity = 20 in
-def LDriuh_ae_indexed : LDInst<(outs IntRegs:$dst),
- (ins IntRegs:$src1, s11_1Imm:$offset),
- "$dst=memuh($src1+#$offset)",
- [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1,
- s11_1ImmPred:$offset)))] >;
-
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDriuh_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global, u16Imm:$offset),
- "$dst=memuh(#$global+$offset)",
- []>;
+ "$dst = memuh(#$global+$offset)",
+ []>,
+ Requires<[NoV4T]>;
-let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriuh : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2),
+let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
+def POST_LDriuh : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2),
(ins IntRegs:$src1, s4Imm:$offset),
"$dst = memuh($src1++#$offset)",
[],
"$src1 = $dst2">;
// Load unsigned halfword conditionally.
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1) $dst = memuh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1) $dst = memuh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_indexed_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_indexed_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if ($src1) $dst = memuh($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_indexed_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_indexed_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if (!$src1) $dst = memuh($src2+#$src3)",
[]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriuh_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriuh_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if ($src1) $dst1 = memuh($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriuh_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriuh_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if (!$src1) $dst1 = memuh($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1.new) $dst = memuh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1.new) $dst = memuh($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_indexed_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_indexed_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if ($src1.new) $dst = memuh($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriuh_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3),
"if (!$src1.new) $dst = memuh($src2+#$src3)",
[]>;
@@ -1417,10 +1450,10 @@ def LDriuh_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst),
let isPredicable = 1 in
def LDriw : LDInst<(outs IntRegs:$dst),
(ins MEMri:$addr), "$dst = memw($addr)",
- [(set IntRegs:$dst, (load ADDRriS11_2:$addr))]>;
+ [(set IntRegs:$dst, (i32 (load ADDRriS11_2:$addr)))]>;
// Load predicate.
-let mayLoad = 1, Defs = [R10,R11] in
+let Defs = [R10,R11,D5], neverHasSideEffects = 1 in
def LDriw_pred : LDInst<(outs PredRegs:$dst),
(ins MEMri:$addr),
"Error; should not emit",
@@ -1430,24 +1463,26 @@ def LDriw_pred : LDInst<(outs PredRegs:$dst),
let isPredicable = 1, AddedComplexity = 20 in
def LDriw_indexed : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, s11_2Imm:$offset),
- "$dst=memw($src1+#$offset)",
- [(set IntRegs:$dst, (load (add IntRegs:$src1,
- s11_2ImmPred:$offset)))]>;
+ "$dst = memw($src1+#$offset)",
+ [(set IntRegs:$dst, (i32 (load (add IntRegs:$src1,
+ s11_2ImmPred:$offset))))]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDriw_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global, u16Imm:$offset),
- "$dst=memw(#$global+$offset)",
- []>;
+ "$dst = memw(#$global+$offset)",
+ []>,
+ Requires<[NoV4T]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDw_GP : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1 in
+def LDw_GP : LDInst2<(outs IntRegs:$dst),
(ins globaladdress:$global),
- "$dst=memw(#$global)",
- []>;
+ "$dst = memw(#$global)",
+ []>,
+ Requires<[NoV4T]>;
-let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriw : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2),
+let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
+def POST_LDriw : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2),
(ins IntRegs:$src1, s4Imm:$offset),
"$dst = memw($src1++#$offset)",
[],
@@ -1455,71 +1490,71 @@ def POST_LDriw : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2),
// Load word conditionally.
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1) $dst = memw($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1) $dst = memw($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_indexed_cPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_indexed_cPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3),
- "if ($src1) $dst=memw($src2+#$src3)",
+ "if ($src1) $dst = memw($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_indexed_cNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_indexed_cNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3),
- "if (!$src1) $dst=memw($src2+#$src3)",
+ "if (!$src1) $dst = memw($src2+#$src3)",
[]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriw_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriw_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3),
"if ($src1) $dst1 = memw($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in
-def POST_LDriw_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriw_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3),
"if (!$src1) $dst1 = memw($src2++#$src3)",
[],
"$src2 = $dst2">;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if ($src1.new) $dst = memw($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, MEMri:$addr),
"if (!$src1.new) $dst = memw($addr)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_indexed_cdnPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_indexed_cdnPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3),
- "if ($src1.new) $dst=memw($src2+#$src3)",
+ "if ($src1.new) $dst = memw($src2+#$src3)",
[]>;
-let mayLoad = 1, neverHasSideEffects = 1 in
-def LDriw_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3),
- "if (!$src1.new) $dst=memw($src2+#$src3)",
+ "if (!$src1.new) $dst = memw($src2+#$src3)",
[]>;
// Deallocate stack frame.
let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in {
- def DEALLOCFRAME : LDInst<(outs), (ins i32imm:$amt1),
+ def DEALLOCFRAME : LDInst2<(outs), (ins i32imm:$amt1),
"deallocframe",
[]>;
}
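
DEALLOCFRAME has no selection pattern; its machine-level effects are modeled entirely by the register lists. On Hexagon r29 is the stack pointer, r30 the frame pointer, and r31 the link register, so frame teardown clobbers all three while addressing through the incoming SP:

    // Effects carried by Defs/Uses alone: r29 = SP, r30 = FP, r31 = LR
    // are all rewritten by frame teardown; no dag pattern is needed.
    let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in
    def DEALLOCFRAME : LDInst2<(outs), (ins i32imm:$amt1),
                         "deallocframe", []>;
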
@@ -1550,13 +1585,14 @@ let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in {
// Rd=+mpyi(Rs,#u8)
def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2),
"$dst =+ mpyi($src1, #$src2)",
- [(set IntRegs:$dst, (mul IntRegs:$src1, u8ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+ u8ImmPred:$src2))]>;
// Rd=-mpyi(Rs,#u8)
def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, n8Imm:$src2),
"$dst =- mpyi($src1, #$src2)",
- [(set IntRegs:$dst,
- (mul IntRegs:$src1, n8ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+ n8ImmPred:$src2))]>;
// Rd=mpyi(Rs,#m9)
// s9 is NOT the same as m9 - but it works.. so far.
@@ -1564,35 +1600,40 @@ def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, n8Imm:$src2),
// depending on the value of m9. See Arch Spec.
def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2),
"$dst = mpyi($src1, #$src2)",
- [(set IntRegs:$dst, (mul IntRegs:$src1, s9ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+ s9ImmPred:$src2))]>;
// Rd=mpyi(Rs,Rt)
def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = mpyi($src1, $src2)",
- [(set IntRegs:$dst, (mul IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
// Rx+=mpyi(Rs,#u8)
def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3),
"$dst += mpyi($src2, #$src3)",
- [(set IntRegs:$dst,
- (add (mul IntRegs:$src2, u8ImmPred:$src3), IntRegs:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (add (mul (i32 IntRegs:$src2), u8ImmPred:$src3),
+ (i32 IntRegs:$src1)))],
"$src1 = $dst">;
// Rx+=mpyi(Rs,Rt)
def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"$dst += mpyi($src2, $src3)",
- [(set IntRegs:$dst,
- (add (mul IntRegs:$src2, IntRegs:$src3), IntRegs:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+ (i32 IntRegs:$src1)))],
"$src1 = $dst">;
// Rx-=mpyi(Rs,#u8)
def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3),
"$dst -= mpyi($src2, #$src3)",
- [(set IntRegs:$dst,
- (sub IntRegs:$src1, (mul IntRegs:$src2, u8ImmPred:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (sub (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+ u8ImmPred:$src3)))],
"$src1 = $dst">;
// Multiply and use upper result.
@@ -1601,27 +1642,30 @@ def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst),
// Rd=mpy(Rs,Rt)
def MPY : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = mpy($src1, $src2)",
- [(set IntRegs:$dst, (mulhs IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (mulhs (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
// Rd=mpy(Rs,Rt):rnd
// Rd=mpyu(Rs,Rt)
def MPYU : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = mpyu($src1, $src2)",
- [(set IntRegs:$dst, (mulhu IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (mulhu (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
// Multiply and use full result.
// Rdd=mpyu(Rs,Rt)
def MPYU64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = mpyu($src1, $src2)",
- [(set DoubleRegs:$dst, (mul (i64 (anyext IntRegs:$src1)),
- (i64 (anyext IntRegs:$src2))))]>;
+ [(set (i64 DoubleRegs:$dst),
+ (mul (i64 (anyext (i32 IntRegs:$src1))),
+ (i64 (anyext (i32 IntRegs:$src2)))))]>;
// Rdd=mpy(Rs,Rt)
def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = mpy($src1, $src2)",
- [(set DoubleRegs:$dst, (mul (i64 (sext IntRegs:$src1)),
- (i64 (sext IntRegs:$src2))))]>;
-
+ [(set (i64 DoubleRegs:$dst),
+ (mul (i64 (sext (i32 IntRegs:$src1))),
+ (i64 (sext (i32 IntRegs:$src2)))))]>;
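
For the full 32x32->64 multiplies, signedness lives in the operand extensions rather than in the mul node itself: MPY64 demands sign-extended inputs, MPYU64 any-extended ones, and the mulhs/mulhu pair above covers the upper-half-only forms. As an illustration (these helper fragments are invented, not part of the commit), the two shapes could be named once:

    // Hypothetical helpers naming the two extension shapes:
    def se64 : PatFrag<(ops node:$v), (i64 (sext node:$v))>;
    def ae64 : PatFrag<(ops node:$v), (i64 (anyext node:$v))>;
    // MPY64's pattern would then read:
    //   [(set (i64 DoubleRegs:$dst),
    //         (mul (se64 (i32 IntRegs:$src1)), (se64 (i32 IntRegs:$src2))))]
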
// Multiply and accumulate, use full result.
// Rxx[+-]=mpy(Rs,Rt)
@@ -1629,18 +1673,20 @@ def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
def MPY64_acc : MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"$dst += mpy($src2, $src3)",
- [(set DoubleRegs:$dst,
- (add (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3))),
- DoubleRegs:$src1))],
+ [(set (i64 DoubleRegs:$dst),
+ (add (mul (i64 (sext (i32 IntRegs:$src2))),
+ (i64 (sext (i32 IntRegs:$src3)))),
+ (i64 DoubleRegs:$src1)))],
"$src1 = $dst">;
// Rxx-=mpy(Rs,Rt)
def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"$dst -= mpy($src2, $src3)",
- [(set DoubleRegs:$dst,
- (sub DoubleRegs:$src1,
- (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3)))))],
+ [(set (i64 DoubleRegs:$dst),
+ (sub (i64 DoubleRegs:$src1),
+ (mul (i64 (sext (i32 IntRegs:$src2))),
+ (i64 (sext (i32 IntRegs:$src3))))))],
"$src1 = $dst">;
// Rxx[+-]=mpyu(Rs,Rt)
@@ -1648,47 +1694,52 @@ def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst),
def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
IntRegs:$src2, IntRegs:$src3),
"$dst += mpyu($src2, $src3)",
- [(set DoubleRegs:$dst, (add (mul (i64 (anyext IntRegs:$src2)),
- (i64 (anyext IntRegs:$src3))),
- DoubleRegs:$src1))],"$src1 = $dst">;
+ [(set (i64 DoubleRegs:$dst),
+ (add (mul (i64 (anyext (i32 IntRegs:$src2))),
+ (i64 (anyext (i32 IntRegs:$src3)))),
+ (i64 DoubleRegs:$src1)))], "$src1 = $dst">;
// Rxx-=mpyu(Rs,Rt)
def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"$dst += mpyu($src2, $src3)",
- [(set DoubleRegs:$dst,
- (sub DoubleRegs:$src1,
- (mul (i64 (anyext IntRegs:$src2)),
- (i64 (anyext IntRegs:$src3)))))],
+ [(set (i64 DoubleRegs:$dst),
+ (sub (i64 DoubleRegs:$src1),
+ (mul (i64 (anyext (i32 IntRegs:$src2))),
+ (i64 (anyext (i32 IntRegs:$src3))))))],
"$src1 = $dst">;
def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
IntRegs:$src2, IntRegs:$src3),
"$dst += add($src2, $src3)",
- [(set IntRegs:$dst, (add (add IntRegs:$src2, IntRegs:$src3),
- IntRegs:$src1))],
+ [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3)),
+ (i32 IntRegs:$src1)))],
"$src1 = $dst">;
def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
IntRegs:$src2, s8Imm:$src3),
"$dst += add($src2, #$src3)",
- [(set IntRegs:$dst, (add (add IntRegs:$src2, s8ImmPred:$src3),
- IntRegs:$src1))],
+ [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
+ s8ImmPred:$src3),
+ (i32 IntRegs:$src1)))],
"$src1 = $dst">;
def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
IntRegs:$src2, IntRegs:$src3),
"$dst -= add($src2, $src3)",
- [(set IntRegs:$dst, (sub IntRegs:$src1, (add IntRegs:$src2,
- IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">;
def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
IntRegs:$src2, s8Imm:$src3),
"$dst -= add($src2, #$src3)",
- [(set IntRegs:$dst, (sub IntRegs:$src1,
- (add IntRegs:$src2, s8ImmPred:$src3)))],
+ [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1),
+ (add (i32 IntRegs:$src2),
+ s8ImmPred:$src3)))],
"$src1 = $dst">;
//===----------------------------------------------------------------------===//
@@ -1731,57 +1782,70 @@ let isPredicable = 1 in
def STrid : STInst<(outs),
(ins MEMri:$addr, DoubleRegs:$src1),
"memd($addr) = $src1",
- [(store DoubleRegs:$src1, ADDRriS11_3:$addr)]>;
+ [(store (i64 DoubleRegs:$src1), ADDRriS11_3:$addr)]>;
// Indexed store double word.
let AddedComplexity = 10, isPredicable = 1 in
def STrid_indexed : STInst<(outs),
(ins IntRegs:$src1, s11_3Imm:$src2, DoubleRegs:$src3),
"memd($src1+#$src2) = $src3",
- [(store DoubleRegs:$src3,
- (add IntRegs:$src1, s11_3ImmPred:$src2))]>;
+ [(store (i64 DoubleRegs:$src3),
+ (add (i32 IntRegs:$src1), s11_3ImmPred:$src2))]>;
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrid_GP : STInst<(outs),
+let neverHasSideEffects = 1 in
+def STrid_GP : STInst2<(outs),
(ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src),
"memd(#$global+$offset) = $src",
- []>;
+ []>,
+ Requires<[NoV4T]>;
+
+let neverHasSideEffects = 1 in
+def STd_GP : STInst2<(outs),
+ (ins globaladdress:$global, DoubleRegs:$src),
+ "memd(#$global) = $src",
+ []>,
+ Requires<[NoV4T]>;
let hasCtrlDep = 1, isPredicable = 1 in
def POST_STdri : STInstPI<(outs IntRegs:$dst),
(ins DoubleRegs:$src1, IntRegs:$src2, s4Imm:$offset),
"memd($src2++#$offset) = $src1",
[(set IntRegs:$dst,
- (post_store DoubleRegs:$src1, IntRegs:$src2, s4_3ImmPred:$offset))],
+ (post_store (i64 DoubleRegs:$src1), (i32 IntRegs:$src2),
+ s4_3ImmPred:$offset))],
"$src2 = $dst">;
// Store doubleword conditionally.
// if ([!]Pv) memd(Rs+#u6:3)=Rtt
// if (Pv) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_cPt : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_cPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2),
"if ($src1) memd($addr) = $src2",
[]>;
// if (!Pv) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_cNotPt : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2),
"if (!$src1) memd($addr) = $src2",
[]>;
// if (Pv) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_cPt : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_cPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3,
DoubleRegs:$src4),
"if ($src1) memd($src2+#$src3) = $src4",
[]>;
// if (!Pv) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_cNotPt : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3,
DoubleRegs:$src4),
"if (!$src1) memd($src2+#$src3) = $src4",
@@ -1789,8 +1853,9 @@ def STrid_indexed_cNotPt : STInst<(outs),
// if ([!]Pv) memd(Rx++#s4:3)=Rtt
// if (Pv) memd(Rx++#s4:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def POST_STdri_cPt : STInstPI<(outs IntRegs:$dst),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def POST_STdri_cPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3,
s4_3Imm:$offset),
"if ($src1) memd($src3++#$offset) = $src2",
@@ -1798,9 +1863,9 @@ def POST_STdri_cPt : STInstPI<(outs IntRegs:$dst),
"$src3 = $dst">;
// if (!Pv) memd(Rx++#s4:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1,
+let AddedComplexity = 10, neverHasSideEffects = 1,
isPredicated = 1 in
-def POST_STdri_cNotPt : STInstPI<(outs IntRegs:$dst),
+def POST_STdri_cNotPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3,
s4_3Imm:$offset),
"if (!$src1) memd($src3++#$offset) = $src2",
@@ -1814,27 +1879,30 @@ let isPredicable = 1 in
def STrib : STInst<(outs),
(ins MEMri:$addr, IntRegs:$src1),
"memb($addr) = $src1",
- [(truncstorei8 IntRegs:$src1, ADDRriS11_0:$addr)]>;
+ [(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr)]>;
let AddedComplexity = 10, isPredicable = 1 in
def STrib_indexed : STInst<(outs),
(ins IntRegs:$src1, s11_0Imm:$src2, IntRegs:$src3),
"memb($src1+#$src2) = $src3",
- [(truncstorei8 IntRegs:$src3, (add IntRegs:$src1,
- s11_0ImmPred:$src2))]>;
+ [(truncstorei8 (i32 IntRegs:$src3), (add (i32 IntRegs:$src1),
+ s11_0ImmPred:$src2))]>;
// memb(gp+#u16:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_GP : STInst<(outs),
+let neverHasSideEffects = 1 in
+def STrib_GP : STInst2<(outs),
(ins globaladdress:$global, u16Imm:$offset, IntRegs:$src),
"memb(#$global+$offset) = $src",
- []>;
+ []>,
+ Requires<[NoV4T]>;
-let mayStore = 1, neverHasSideEffects = 1 in
-def STb_GP : STInst<(outs),
+// memb(#global)=Rt
+let neverHasSideEffects = 1 in
+def STb_GP : STInst2<(outs),
(ins globaladdress:$global, IntRegs:$src),
"memb(#$global) = $src",
- []>;
+ []>,
+ Requires<[NoV4T]>;
// memb(Rx++#s4:0)=Rt
let hasCtrlDep = 1, isPredicable = 1 in
@@ -1843,51 +1911,51 @@ def POST_STbri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1,
s4Imm:$offset),
"memb($src2++#$offset) = $src1",
[(set IntRegs:$dst,
- (post_truncsti8 IntRegs:$src1, IntRegs:$src2,
+ (post_truncsti8 (i32 IntRegs:$src1), (i32 IntRegs:$src2),
s4_0ImmPred:$offset))],
"$src2 = $dst">;
// Store byte conditionally.
// if ([!]Pv) memb(Rs+#u6:0)=Rt
// if (Pv) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_cPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_cPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1) memb($addr) = $src2",
[]>;
// if (!Pv) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_cNotPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1) memb($addr) = $src2",
[]>;
// if (Pv) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_indexed_cPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_indexed_cPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if ($src1) memb($src2+#$src3) = $src4",
[]>;
// if (!Pv) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_indexed_cNotPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_indexed_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if (!$src1) memb($src2+#$src3) = $src4",
[]>;
// if ([!]Pv) memb(Rx++#s4:0)=Rt
// if (Pv) memb(Rx++#s4:0)=Rt
-let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in
-def POST_STbri_cPt : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1, isPredicated = 1 in
+def POST_STbri_cPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if ($src1) memb($src3++#$offset) = $src2",
[],"$src3 = $dst">;
// if (!Pv) memb(Rx++#s4:0)=Rt
-let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in
-def POST_STbri_cNotPt : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1, isPredicated = 1 in
+def POST_STbri_cNotPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if (!$src1) memb($src3++#$offset) = $src2",
[],"$src3 = $dst">;
@@ -1899,27 +1967,29 @@ let isPredicable = 1 in
def STrih : STInst<(outs),
(ins MEMri:$addr, IntRegs:$src1),
"memh($addr) = $src1",
- [(truncstorei16 IntRegs:$src1, ADDRriS11_1:$addr)]>;
+ [(truncstorei16 (i32 IntRegs:$src1), ADDRriS11_1:$addr)]>;
let AddedComplexity = 10, isPredicable = 1 in
def STrih_indexed : STInst<(outs),
(ins IntRegs:$src1, s11_1Imm:$src2, IntRegs:$src3),
"memh($src1+#$src2) = $src3",
- [(truncstorei16 IntRegs:$src3, (add IntRegs:$src1,
- s11_1ImmPred:$src2))]>;
+ [(truncstorei16 (i32 IntRegs:$src3), (add (i32 IntRegs:$src1),
+ s11_1ImmPred:$src2))]>;
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_GP : STInst<(outs),
+let neverHasSideEffects = 1 in
+def STrih_GP : STInst2<(outs),
(ins globaladdress:$global, u16Imm:$offset, IntRegs:$src),
"memh(#$global+$offset) = $src",
- []>;
+ []>,
+ Requires<[NoV4T]>;
-let mayStore = 1, neverHasSideEffects = 1 in
-def STh_GP : STInst<(outs),
+let neverHasSideEffects = 1 in
+def STh_GP : STInst2<(outs),
(ins globaladdress:$global, IntRegs:$src),
"memh(#$global) = $src",
- []>;
+ []>,
+ Requires<[NoV4T]>;
// memh(Rx++#s4:1)=Rt.H
// memh(Rx++#s4:1)=Rt
@@ -1928,51 +1998,51 @@ def POST_SThri : STInstPI<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset),
"memh($src2++#$offset) = $src1",
[(set IntRegs:$dst,
- (post_truncsti16 IntRegs:$src1, IntRegs:$src2,
+ (post_truncsti16 (i32 IntRegs:$src1), (i32 IntRegs:$src2),
s4_1ImmPred:$offset))],
"$src2 = $dst">;
// Store halfword conditionally.
// if ([!]Pv) memh(Rs+#u6:1)=Rt
// if (Pv) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_cPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_cPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1) memh($addr) = $src2",
[]>;
// if (!Pv) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_cNotPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1) memh($addr) = $src2",
[]>;
// if (Pv) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_indexed_cPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_indexed_cPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if ($src1) memh($src2+#$src3) = $src4",
[]>;
// if (!Pv) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_indexed_cNotPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_indexed_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if (!$src1) memh($src2+#$src3) = $src4",
[]>;
// if ([!]Pv) memh(Rx++#s4:1)=Rt
// if (Pv) memh(Rx++#s4:1)=Rt
-let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in
-def POST_SThri_cPt : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1, isPredicated = 1 in
+def POST_SThri_cPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if ($src1) memh($src3++#$offset) = $src2",
[],"$src3 = $dst">;
// if (!Pv) memh(Rx++#s4:1)=Rt
-let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in
-def POST_SThri_cNotPt : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1, isPredicated = 1 in
+def POST_SThri_cNotPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if (!$src1) memh($src3++#$offset) = $src2",
[],"$src3 = $dst">;
@@ -1980,8 +2050,8 @@ def POST_SThri_cNotPt : STInstPI<(outs IntRegs:$dst),
// Store word.
// Store predicate.
-let Defs = [R10,R11] in
-def STriw_pred : STInst<(outs),
+let Defs = [R10,R11,D5], neverHasSideEffects = 1 in
+def STriw_pred : STInst2<(outs),
(ins MEMri:$addr, PredRegs:$src1),
"Error; should not emit",
[]>;
@@ -1991,69 +2061,79 @@ let isPredicable = 1 in
def STriw : STInst<(outs),
(ins MEMri:$addr, IntRegs:$src1),
"memw($addr) = $src1",
- [(store IntRegs:$src1, ADDRriS11_2:$addr)]>;
+ [(store (i32 IntRegs:$src1), ADDRriS11_2:$addr)]>;
let AddedComplexity = 10, isPredicable = 1 in
def STriw_indexed : STInst<(outs),
(ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3),
"memw($src1+#$src2) = $src3",
- [(store IntRegs:$src3, (add IntRegs:$src1, s11_2ImmPred:$src2))]>;
+ [(store (i32 IntRegs:$src3),
+ (add (i32 IntRegs:$src1), s11_2ImmPred:$src2))]>;
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_GP : STInst<(outs),
+let neverHasSideEffects = 1 in
+def STriw_GP : STInst2<(outs),
(ins globaladdress:$global, u16Imm:$offset, IntRegs:$src),
"memw(#$global+$offset) = $src",
- []>;
+ []>,
+ Requires<[NoV4T]>;
+
+let neverHasSideEffects = 1 in
+def STw_GP : STInst2<(outs),
+ (ins globaladdress:$global, IntRegs:$src),
+ "memw(#$global) = $src",
+ []>,
+ Requires<[NoV4T]>;
let hasCtrlDep = 1, isPredicable = 1 in
def POST_STwri : STInstPI<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset),
"memw($src2++#$offset) = $src1",
[(set IntRegs:$dst,
- (post_store IntRegs:$src1, IntRegs:$src2, s4_2ImmPred:$offset))],
+ (post_store (i32 IntRegs:$src1), (i32 IntRegs:$src2),
+ s4_2ImmPred:$offset))],
"$src2 = $dst">;
// Store word conditionally.
// if ([!]Pv) memw(Rs+#u6:2)=Rt
// if (Pv) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_cPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_cPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1) memw($addr) = $src2",
[]>;
// if (!Pv) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_cNotPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1) memw($addr) = $src2",
[]>;
// if (Pv) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_indexed_cPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_indexed_cPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if ($src1) memw($src2+#$src3) = $src4",
[]>;
// if (!Pv) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_indexed_cNotPt : STInst<(outs),
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_indexed_cNotPt : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if (!$src1) memw($src2+#$src3) = $src4",
[]>;
// if ([!]Pv) memw(Rx++#s4:2)=Rt
// if (Pv) memw(Rx++#s4:2)=Rt
-let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in
-def POST_STwri_cPt : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1, isPredicated = 1 in
+def POST_STwri_cPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if ($src1) memw($src3++#$offset) = $src2",
[],"$src3 = $dst">;
// if (!Pv) memw(Rx++#s4:2)=Rt
-let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in
-def POST_STwri_cNotPt : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1, isPredicated = 1 in
+def POST_STwri_cNotPt : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if (!$src1) memw($src3++#$offset) = $src2",
[],"$src3 = $dst">;
@@ -2062,7 +2142,7 @@ def POST_STwri_cNotPt : STInstPI<(outs IntRegs:$dst),
// Allocate stack frame.
let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in {
- def ALLOCFRAME : STInst<(outs),
+ def ALLOCFRAME : STInst2<(outs),
(ins i32imm:$amt),
"allocframe(#$amt)",
[]>;
@@ -2077,13 +2157,13 @@ let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in {
// Logical NOT.
def NOT_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
"$dst = not($src1)",
- [(set DoubleRegs:$dst, (not DoubleRegs:$src1))]>;
+ [(set (i64 DoubleRegs:$dst), (not (i64 DoubleRegs:$src1)))]>;
// Sign extend word to doubleword.
def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
"$dst = sxtw($src1)",
- [(set DoubleRegs:$dst, (sext IntRegs:$src1))]>;
+ [(set (i64 DoubleRegs:$dst), (sext (i32 IntRegs:$src1)))]>;
//===----------------------------------------------------------------------===//
// STYPE/ALU -
//===----------------------------------------------------------------------===//
@@ -2091,37 +2171,58 @@ def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
//===----------------------------------------------------------------------===//
// STYPE/BIT +
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/BIT -
-//===----------------------------------------------------------------------===//
+// clrbit.
+def CLRBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+ "$dst = clrbit($src1, #$src2)",
+ [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
+ (not
+ (shl 1, u5ImmPred:$src2))))]>;
+def CLRBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+ "$dst = clrbit($src1, #$src2)",
+ []>;
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX -
-//===----------------------------------------------------------------------===//
+// Map from r0 = and(r1, 2147483647) to r0 = clrbit(r1, #31).
+def : Pat <(and (i32 IntRegs:$src1), 2147483647),
+ (CLRBIT_31 (i32 IntRegs:$src1), 31)>;
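+// 2147483647 == 0x7fffffff == ~(1 << 31), so the and above clears bit 31,
+// which is exactly clrbit(r1, #31).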
-//===----------------------------------------------------------------------===//
-// STYPE/PERM +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/PERM -
-//===----------------------------------------------------------------------===//
+// setbit.
+def SETBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+ "$dst = setbit($src1, #$src2)",
+ [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
+ (shl 1, u5ImmPred:$src2)))]>;
+
+// Map from r0 = or(r1, -2147483648) to r0 = setbit(r1, #31).
+def SETBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+ "$dst = setbit($src1, #$src2)",
+ []>;
+
+def : Pat <(or (i32 IntRegs:$src1), -2147483648),
+ (SETBIT_31 (i32 IntRegs:$src1), 31)>;
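+// -2147483648 == 0x80000000 == (1 << 31) as a 32-bit value, so the or above
+// sets bit 31, which is exactly setbit(r1, #31).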
+
+// togglebit.
+def TOGBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+ "$dst = setbit($src1, #$src2)",
+ [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1),
+ (shl 1, u5ImmPred:$src2)))]>;
+
+// Map from r0 = xor(r1, -2147483648) to r0 = togglebit(r1, #31).
+def TOGBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+ "$dst = togglebit($src1, #$src2)",
+ []>;
+
+def : Pat <(xor (i32 IntRegs:$src1), -2147483648),
+ (TOGBIT_31 (i32 IntRegs:$src1), 31)>;
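+// -2147483648 == 0x80000000 == (1 << 31) as a 32-bit value, so the xor
+// above flips bit 31 (the sign bit), which is exactly togglebit(r1, #31).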
-//===----------------------------------------------------------------------===//
-// STYPE/PRED +
-//===----------------------------------------------------------------------===//
// Predicate transfer.
let neverHasSideEffects = 1 in
def TFR_RsPd : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1),
- "$dst = $src1 // Should almost never emit this",
+ "$dst = $src1 /* Should almost never emit this. */",
[]>;
def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1),
- "$dst = $src1 // Should almost never emit!",
- [(set PredRegs:$dst, (trunc IntRegs:$src1))]>;
+ "$dst = $src1 /* Should almost never emit this. */",
+ [(set (i1 PredRegs:$dst), (trunc (i32 IntRegs:$src1)))]>;
//===----------------------------------------------------------------------===//
// STYPE/PRED -
//===----------------------------------------------------------------------===//
@@ -2132,75 +2233,85 @@ def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1),
// Shift by immediate.
def ASR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
"$dst = asr($src1, #$src2)",
- [(set IntRegs:$dst, (sra IntRegs:$src1, u5ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
+ u5ImmPred:$src2))]>;
def ASRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
"$dst = asr($src1, #$src2)",
- [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, u6ImmPred:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
+ u6ImmPred:$src2))]>;
def ASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
"$dst = asl($src1, #$src2)",
- [(set IntRegs:$dst, (shl IntRegs:$src1, u5ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+ u5ImmPred:$src2))]>;
+
+def ASLd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+ "$dst = asl($src1, #$src2)",
+ [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+ u6ImmPred:$src2))]>;
def LSR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
"$dst = lsr($src1, #$src2)",
- [(set IntRegs:$dst, (srl IntRegs:$src1, u5ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
+ u5ImmPred:$src2))]>;
def LSRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
"$dst = lsr($src1, #$src2)",
- [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, u6ImmPred:$src2))]>;
-
-def LSRd_ri_acc : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
- DoubleRegs:$src2,
- u6Imm:$src3),
- "$dst += lsr($src2, #$src3)",
- [(set DoubleRegs:$dst, (add DoubleRegs:$src1,
- (srl DoubleRegs:$src2,
- u6ImmPred:$src3)))],
- "$src1 = $dst">;
-
-// Shift by immediate and accumulate.
-def ASR_rr_acc : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1,
- IntRegs:$src2,
- IntRegs:$src3),
- "$dst += asr($src2, $src3)",
- [], "$src1 = $dst">;
+ [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
+ u6ImmPred:$src2))]>;
// Shift by immediate and add.
+let AddedComplexity = 100 in
def ADDASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
u3Imm:$src3),
"$dst = addasl($src1, $src2, #$src3)",
- [(set IntRegs:$dst, (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u3ImmPred:$src3)))]>;
+ [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u3ImmPred:$src3)))]>;
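+// AddedComplexity = 100 raises this pattern's priority during instruction
+// selection, so a shl feeding an add is folded into a single addasl rather
+// than being selected as separate asl and add instructions.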
// Shift by register.
def ASL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = asl($src1, $src2)",
- [(set IntRegs:$dst, (shl IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
def ASR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = asr($src1, $src2)",
- [(set IntRegs:$dst, (sra IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
+def LSL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = lsl($src1, $src2)",
+ [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
def LSR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
"$dst = lsr($src1, $src2)",
- [(set IntRegs:$dst, (srl IntRegs:$src1, IntRegs:$src2))]>;
+ [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
+
+def ASLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+ "$dst = asl($src1, $src2)",
+ [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
def LSLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
"$dst = lsl($src1, $src2)",
- [(set DoubleRegs:$dst, (shl DoubleRegs:$src1, IntRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
def ASRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
IntRegs:$src2),
"$dst = asr($src1, $src2)",
- [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, IntRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
IntRegs:$src2),
"$dst = lsr($src1, $src2)",
- [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, IntRegs:$src2))]>;
+ [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
//===----------------------------------------------------------------------===//
// STYPE/SHIFT -
@@ -2231,8 +2342,8 @@ def SDHexagonBARRIER: SDTypeProfile<0, 0, []>;
def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER,
[SDNPHasChain]>;
-let hasSideEffects = 1 in
-def BARRIER : STInst<(outs), (ins),
+let hasSideEffects = 1, isHexagonSolo = 1 in
+def BARRIER : SYSInst<(outs), (ins),
"barrier",
[(HexagonBARRIER)]>;
@@ -2244,47 +2355,50 @@ def BARRIER : STInst<(outs), (ins),
let isReMaterializable = 1 in
def TFRI64 : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1),
"$dst = #$src1",
- [(set DoubleRegs:$dst, s8Imm64Pred:$src1)]>;
+ [(set (i64 DoubleRegs:$dst), s8Imm64Pred:$src1)]>;
// Pseudo instruction to encode a set of conditional transfers.
// This instruction is used instead of a mux and trades off code size
// for performance. We conduct this transformation optimistically in
// the hope that these instructions get promoted to dot-new transfers.
-let AddedComplexity = 100 in
+let AddedComplexity = 100, isPredicated = 1 in
def TFR_condset_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
IntRegs:$src2,
IntRegs:$src3),
"Error; should not emit",
- [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2,
- IntRegs:$src3))]>;
-
-let AddedComplexity = 100 in
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 PredRegs:$src1),
+ (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))]>;
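+// E.g. r0 = select(p0, r1, r2) is selected as this pseudo and later split
+// into a predicated transfer pair along the lines of "if (p0) r0 = r1" and
+// "if (!p0) r0 = r2".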
+let AddedComplexity = 100, isPredicated = 1 in
def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, s12Imm:$src3),
"Error; should not emit",
- [(set IntRegs:$dst,
- (select PredRegs:$src1, IntRegs:$src2, s12ImmPred:$src3))]>;
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
+ s12ImmPred:$src3)))]>;
-let AddedComplexity = 100 in
+let AddedComplexity = 100, isPredicated = 1 in
def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst),
(ins PredRegs:$src1, s12Imm:$src2, IntRegs:$src3),
"Error; should not emit",
- [(set IntRegs:$dst,
- (select PredRegs:$src1, s12ImmPred:$src2, IntRegs:$src3))]>;
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2,
+ (i32 IntRegs:$src3))))]>;
-let AddedComplexity = 100 in
+let AddedComplexity = 100, isPredicated = 1 in
def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst),
(ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3),
"Error; should not emit",
- [(set IntRegs:$dst, (select PredRegs:$src1,
- s12ImmPred:$src2,
- s12ImmPred:$src3))]>;
+ [(set (i32 IntRegs:$dst),
+ (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2,
+ s12ImmPred:$src3)))]>;
// Generate frameindex addresses.
let isReMaterializable = 1 in
def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1),
"$dst = add($src1)",
- [(set IntRegs:$dst, ADDRri:$src1)]>;
+ [(set (i32 IntRegs:$dst), ADDRri:$src1)]>;
//
// CR - Type.
@@ -2303,69 +2417,116 @@ def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2),
let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1,
Defs = [PC, LC0], Uses = [SA0, LC0] in {
-def ENDLOOP0 : CRInst<(outs), (ins brtarget:$offset),
+def ENDLOOP0 : Marker<(outs), (ins brtarget:$offset),
":endloop0",
[]>;
}
// Support for generating global address.
// Taken from X86InstrInfo.td.
-def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
- SDTCisPtrTy<0>]>;
+def SDTHexagonCONST32 : SDTypeProfile<1, 1, [
+ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>,
+ SDTCisPtrTy<0>]>;
def HexagonCONST32 : SDNode<"HexagonISD::CONST32", SDTHexagonCONST32>;
def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>;
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
+ "$dst.l = #LO($global)",
+ []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HI : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
+ "$dst.h = #HI($global)",
+ []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LOi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
+ "$dst.l = #LO($imm_value)",
+ []>;
+
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HIi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
+ "$dst.h = #HI($imm_value)",
+ []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+ "$dst.l = #LO($jt)",
+ []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HI_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+ "$dst.h = #HI($jt)",
+ []>;
+
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+ "$dst.l = #LO($label)",
+ []>;
+
+let isReMaterializable = 1, isMoveImm = 1 , neverHasSideEffects = 1 in
+def HI_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+ "$dst.h = #HI($label)",
+ []>;
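+// A LO/HI pair materializes a full 32-bit value in two instructions: the
+// ".l" transfer writes the low halfword and the ".h" transfer writes the
+// high halfword, each leaving the other half of $dst unchanged.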
+
// This pattern is incorrect. When we add small data, we should change
// this pattern to use memw(#foo).
+// This is for sdata.
let isMoveImm = 1 in
def CONST32 : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
"$dst = CONST32(#$global)",
- [(set IntRegs:$dst,
- (load (HexagonCONST32 tglobaltlsaddr:$global)))]>;
+ [(set (i32 IntRegs:$dst),
+ (load (HexagonCONST32 tglobaltlsaddr:$global)))]>;
+// This is for non-sdata.
let isReMaterializable = 1, isMoveImm = 1 in
-def CONST32_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+def CONST32_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
"$dst = CONST32(#$global)",
- [(set IntRegs:$dst,
- (HexagonCONST32 tglobaladdr:$global))]>;
+ [(set (i32 IntRegs:$dst),
+ (HexagonCONST32 tglobaladdr:$global))]>;
let isReMaterializable = 1, isMoveImm = 1 in
-def CONST32_set_jt : LDInst<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+def CONST32_set_jt : LDInst2<(outs IntRegs:$dst), (ins jumptablebase:$jt),
"$dst = CONST32(#$jt)",
- [(set IntRegs:$dst,
- (HexagonCONST32 tjumptable:$jt))]>;
+ [(set (i32 IntRegs:$dst),
+ (HexagonCONST32 tjumptable:$jt))]>;
let isReMaterializable = 1, isMoveImm = 1 in
-def CONST32GP_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+def CONST32GP_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
"$dst = CONST32(#$global)",
- [(set IntRegs:$dst,
- (HexagonCONST32_GP tglobaladdr:$global))]>;
+ [(set (i32 IntRegs:$dst),
+ (HexagonCONST32_GP tglobaladdr:$global))]>;
let isReMaterializable = 1, isMoveImm = 1 in
-def CONST32_Int_Real : LDInst<(outs IntRegs:$dst), (ins i32imm:$global),
+def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global),
"$dst = CONST32(#$global)",
- [(set IntRegs:$dst, imm:$global) ]>;
+ [(set (i32 IntRegs:$dst), imm:$global) ]>;
let isReMaterializable = 1, isMoveImm = 1 in
-def CONST32_Label : LDInst<(outs IntRegs:$dst), (ins bblabel:$label),
+def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label),
"$dst = CONST32($label)",
- [(set IntRegs:$dst, (HexagonCONST32 bbl:$label))]>;
+ [(set (i32 IntRegs:$dst), (HexagonCONST32 bbl:$label))]>;
let isReMaterializable = 1, isMoveImm = 1 in
-def CONST64_Int_Real : LDInst<(outs DoubleRegs:$dst), (ins i64imm:$global),
+def CONST64_Int_Real : LDInst2<(outs DoubleRegs:$dst), (ins i64imm:$global),
"$dst = CONST64(#$global)",
- [(set DoubleRegs:$dst, imm:$global) ]>;
+ [(set (i64 DoubleRegs:$dst), imm:$global) ]>;
def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins),
"$dst = xor($dst, $dst)",
- [(set PredRegs:$dst, 0)]>;
+ [(set (i1 PredRegs:$dst), 0)]>;
def MPY_trsext : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
- "$dst = mpy($src1, $src2)",
- [(set IntRegs:$dst,
- (trunc (i64 (srl (i64 (mul (i64 (sext IntRegs:$src1)),
- (i64 (sext IntRegs:$src2)))),
- (i32 32)))))]>;
+ "$dst = mpy($src1, $src2)",
+ [(set (i32 IntRegs:$dst),
+ (trunc (i64 (srl (i64 (mul (i64 (sext (i32 IntRegs:$src1))),
+ (i64 (sext (i32 IntRegs:$src2))))),
+ (i32 32)))))]>;
// Pseudo instructions.
def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
@@ -2405,7 +2566,7 @@ let Defs = [R29, R30, R31], Uses = [R29] in {
let isCall = 1, neverHasSideEffects = 1,
Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
- def CALL : JInst<(outs), (ins calltarget:$dst, variable_ops),
+ def CALL : JInst<(outs), (ins calltarget:$dst),
"call $dst", []>;
}
@@ -2413,34 +2574,28 @@ let isCall = 1, neverHasSideEffects = 1,
let isCall = 1, neverHasSideEffects = 1,
Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
- def CALLR : JRInst<(outs), (ins IntRegs:$dst, variable_ops),
+ def CALLR : JRInst<(outs), (ins IntRegs:$dst),
"callr $dst",
[]>;
}
// Tail Calls.
-let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
- Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
- R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
- def TCRETURNtg : JInst<(outs), (ins calltarget:$dst, variable_ops),
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1 in {
+ def TCRETURNtg : JInst<(outs), (ins calltarget:$dst),
"jump $dst // TAILCALL", []>;
}
-let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
- Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
- R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
- def TCRETURNtext : JInst<(outs), (ins calltarget:$dst, variable_ops),
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1 in {
+ def TCRETURNtext : JInst<(outs), (ins calltarget:$dst),
"jump $dst // TAILCALL", []>;
}
-let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
- Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
- R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
- def TCRETURNR : JInst<(outs), (ins IntRegs:$dst, variable_ops),
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1 in {
+ def TCRETURNR : JInst<(outs), (ins IntRegs:$dst),
"jumpr $dst // TAILCALL", []>;
}
// Map call instruction.
-def : Pat<(call IntRegs:$dst),
- (CALLR IntRegs:$dst)>, Requires<[HasV2TOnly]>;
+def : Pat<(call (i32 IntRegs:$dst)),
+ (CALLR (i32 IntRegs:$dst))>, Requires<[HasV2TOnly]>;
def : Pat<(call tglobaladdr:$dst),
(CALL tglobaladdr:$dst)>, Requires<[HasV2TOnly]>;
def : Pat<(call texternalsym:$dst),
@@ -2450,309 +2605,516 @@ def : Pat<(HexagonTCRet tglobaladdr:$dst),
(TCRETURNtg tglobaladdr:$dst)>;
def : Pat<(HexagonTCRet texternalsym:$dst),
(TCRETURNtext texternalsym:$dst)>;
-def : Pat<(HexagonTCRet IntRegs:$dst),
- (TCRETURNR IntRegs:$dst)>;
+def : Pat<(HexagonTCRet (i32 IntRegs:$dst)),
+ (TCRETURNR (i32 IntRegs:$dst))>;
+
+// Atomic load and store support
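+// These patterns map atomic loads and stores onto the ordinary memory
+// instructions; they rely on naturally aligned Hexagon accesses of up to
+// 64 bits being single-copy atomic.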
+// 8 bit atomic load
+def : Pat<(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i32 (LDub_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i32 (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_8 ADDRriS11_0:$src1),
+ (i32 (LDriub ADDRriS11_0:$src1))>;
+
+def : Pat<(atomic_load_8 (add (i32 IntRegs:$src1), s11_0ImmPred:$offset)),
+ (i32 (LDriub_indexed (i32 IntRegs:$src1), s11_0ImmPred:$offset))>;
+
+// 16 bit atomic load
+def : Pat<(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i32 (LDuh_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i32 (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_16 ADDRriS11_1:$src1),
+ (i32 (LDriuh ADDRriS11_1:$src1))>;
+
+def : Pat<(atomic_load_16 (add (i32 IntRegs:$src1), s11_1ImmPred:$offset)),
+ (i32 (LDriuh_indexed (i32 IntRegs:$src1), s11_1ImmPred:$offset))>;
+
+// 32 bit atomic load
+def : Pat<(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i32 (LDw_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_32 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i32 (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_32 ADDRriS11_2:$src1),
+ (i32 (LDriw ADDRriS11_2:$src1))>;
+
+def : Pat<(atomic_load_32 (add (i32 IntRegs:$src1), s11_2ImmPred:$offset)),
+ (i32 (LDriw_indexed (i32 IntRegs:$src1), s11_2ImmPred:$offset))>;
+
+
+// 64 bit atomic load
+def : Pat<(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i64 (LDd_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_64 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i64 (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_load_64 ADDRriS11_3:$src1),
+ (i64 (LDrid ADDRriS11_3:$src1))>;
-// Map from r0 = and(r1, 65535) to r0 = zxth(r1).
-def : Pat <(and IntRegs:$src1, 65535),
- (ZXTH IntRegs:$src1)>;
+def : Pat<(atomic_load_64 (add (i32 IntRegs:$src1), s11_3ImmPred:$offset)),
+ (i64 (LDrid_indexed (i32 IntRegs:$src1), s11_3ImmPred:$offset))>;
+
+
+// 64 bit atomic store
+def : Pat<(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global),
+ (i64 DoubleRegs:$src1)),
+ (STd_GP tglobaladdr:$global, (i64 DoubleRegs:$src1))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_store_64 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i64 DoubleRegs:$src1)),
+ (STrid_GP tglobaladdr:$global, u16ImmPred:$offset,
+ (i64 DoubleRegs:$src1))>, Requires<[NoV4T]>;
+
+// 8 bit atomic store
+def : Pat<(atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global),
+ (i32 IntRegs:$src1)),
+ (STb_GP tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_store_8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STrib_GP tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>, Requires<[NoV4T]>;
+
+def : Pat<(atomic_store_8 ADDRriS11_0:$src2, (i32 IntRegs:$src1)),
+ (STrib ADDRriS11_0:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_8 (add (i32 IntRegs:$src2), s11_0ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STrib_indexed (i32 IntRegs:$src2), s11_0ImmPred:$offset,
+ (i32 IntRegs:$src1))>;
+
+
+// 16 bit atomic store
+def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global),
+ (i32 IntRegs:$src1)),
+ (STh_GP tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_store_16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STrih_GP tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>, Requires<[NoV4T]>;
+
+def : Pat<(atomic_store_16 ADDRriS11_1:$src2, (i32 IntRegs:$src1)),
+ (STrih ADDRriS11_1:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_16 (add (i32 IntRegs:$src2), s11_1ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STrih_indexed (i32 IntRegs:$src2), s11_1ImmPred:$offset,
+ (i32 IntRegs:$src1))>;
+
+
+// 32 bit atomic store
+def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global),
+ (i32 IntRegs:$src1)),
+ (STw_GP tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_store_32 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STriw_GP tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
+
+def : Pat<(atomic_store_32 ADDRriS11_2:$src2, (i32 IntRegs:$src1)),
+ (STriw ADDRriS11_2:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_32 (add (i32 IntRegs:$src2), s11_2ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STriw_indexed (i32 IntRegs:$src2), s11_2ImmPred:$offset,
+ (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_64 ADDRriS11_3:$src2, (i64 DoubleRegs:$src1)),
+ (STrid ADDRriS11_3:$src2, (i64 DoubleRegs:$src1))>;
+
+def : Pat<(atomic_store_64 (add (i32 IntRegs:$src2), s11_3ImmPred:$offset),
+ (i64 DoubleRegs:$src1)),
+ (STrid_indexed (i32 IntRegs:$src2), s11_3ImmPred:$offset,
+ (i64 DoubleRegs:$src1))>;
+
+// Map from r0 = and(r1, 65535) to r0 = zxth(r1)
+def : Pat <(and (i32 IntRegs:$src1), 65535),
+ (ZXTH (i32 IntRegs:$src1))>;
// Map from r0 = and(r1, 255) to r0 = zxtb(r1).
-def : Pat <(and IntRegs:$src1, 255),
- (ZXTB IntRegs:$src1)>;
+def : Pat <(and (i32 IntRegs:$src1), 255),
+ (ZXTB (i32 IntRegs:$src1))>;
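+// 65535 == 0xffff and 255 == 0xff, so the two ands above extract the low
+// halfword and the low byte respectively, which is what zxth/zxtb compute.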
// Map Add(p1, true) to p1 = not(p1).
// Add(p1, false) should never be produced;
// if it is, it has to be mapped to a NOOP.
-def : Pat <(add PredRegs:$src1, -1),
- (NOT_p PredRegs:$src1)>;
+def : Pat <(add (i1 PredRegs:$src1), -1),
+ (NOT_p (i1 PredRegs:$src1))>;
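+// In i1 arithmetic, addition is modulo 2, so p + (-1) == p + 1 == !p; hence
+// Add(p1, true) is exactly a predicate negation.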
// Map from p0 = setlt(r0, r1) r2 = mux(p0, r3, r4) =>
// p0 = cmp.lt(r0, r1), r0 = mux(p0, r2, r1).
-def : Pat <(select (i1 (setlt IntRegs:$src1, IntRegs:$src2)), IntRegs:$src3,
- IntRegs:$src4),
- (TFR_condset_rr (CMPLTrr IntRegs:$src1, IntRegs:$src2), IntRegs:$src4,
- IntRegs:$src3)>, Requires<[HasV2TOnly]>;
+def : Pat <(select (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i32 IntRegs:$src3),
+ (i32 IntRegs:$src4)),
+ (i32 (TFR_condset_rr (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+ (i32 IntRegs:$src4), (i32 IntRegs:$src3)))>,
+ Requires<[HasV2TOnly]>;
// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
-def : Pat <(select (not PredRegs:$src1), s8ImmPred:$src2, s8ImmPred:$src3),
- (TFR_condset_ii PredRegs:$src1, s8ImmPred:$src3, s8ImmPred:$src2)>;
+def : Pat <(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ImmPred:$src3),
+ (i32 (TFR_condset_ii (i1 PredRegs:$src1), s8ImmPred:$src3,
+ s8ImmPred:$src2))>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = TFR_condset_ri(p0, r1, #i)
+def : Pat <(select (not (i1 PredRegs:$src1)), s12ImmPred:$src2,
+ (i32 IntRegs:$src3)),
+ (i32 (TFR_condset_ri (i1 PredRegs:$src1), (i32 IntRegs:$src3),
+ s12ImmPred:$src2))>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = TFR_condset_ir(p0, #i, r1)
+def : Pat <(select (not (i1 PredRegs:$src1)), (i32 IntRegs:$src2),
+ s12ImmPred:$src3),
+ (i32 (TFR_condset_ir (i1 PredRegs:$src1), s12ImmPred:$src3,
+ (i32 IntRegs:$src2)))>;
// Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump.
def : Pat <(brcond (not PredRegs:$src1), bb:$offset),
- (JMP_cNot PredRegs:$src1, bb:$offset)>;
+ (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>;
// Map from p2 = pnot(p2); p1 = and(p0, p2) => p1 = and(p0, !p2).
def : Pat <(and PredRegs:$src1, (not PredRegs:$src2)),
- (AND_pnotp PredRegs:$src1, PredRegs:$src2)>;
+ (i1 (AND_pnotp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
// Map from store(globaladdress + x) -> memd(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(store DoubleRegs:$src1,
+def : Pat <(store (i64 DoubleRegs:$src1),
(add (HexagonCONST32_GP tglobaladdr:$global),
u16ImmPred:$offset)),
- (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, DoubleRegs:$src1)>;
+ (STrid_GP tglobaladdr:$global, u16ImmPred:$offset,
+ (i64 DoubleRegs:$src1))>, Requires<[NoV4T]>;
-// Map from store(globaladdress) -> memd(#foo + 0).
+// Map from store(globaladdress) -> memd(#foo).
let AddedComplexity = 100 in
-def : Pat <(store DoubleRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)),
- (STrid_GP tglobaladdr:$global, 0, DoubleRegs:$src1)>;
+def : Pat <(store (i64 DoubleRegs:$src1),
+ (HexagonCONST32_GP tglobaladdr:$global)),
+ (STd_GP tglobaladdr:$global, (i64 DoubleRegs:$src1))>,
+ Requires<[NoV4T]>;
// Map from store(globaladdress + x) -> memw(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(store IntRegs:$src1, (add (HexagonCONST32_GP tglobaladdr:$global),
+def : Pat <(store (i32 IntRegs:$src1),
+ (add (HexagonCONST32_GP tglobaladdr:$global),
u16ImmPred:$offset)),
- (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>;
+ (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
// Map from store(globaladdress) -> memw(#foo + 0).
let AddedComplexity = 100 in
-def : Pat <(store IntRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)),
- (STriw_GP tglobaladdr:$global, 0, IntRegs:$src1)>;
+def : Pat <(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)),
+ (STriw_GP tglobaladdr:$global, 0, (i32 IntRegs:$src1))>;
-// Map from store(globaladdress) -> memw(#foo + 0).
+// Map from store(globaladdress) -> memw(#foo).
let AddedComplexity = 100 in
-def : Pat <(store IntRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)),
- (STriw_GP tglobaladdr:$global, 0, IntRegs:$src1)>;
+def : Pat <(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)),
+ (STriw_GP tglobaladdr:$global, 0, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
// Map from store(globaladdress + x) -> memh(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(truncstorei16 IntRegs:$src1,
+def : Pat <(truncstorei16 (i32 IntRegs:$src1),
(add (HexagonCONST32_GP tglobaladdr:$global),
u16ImmPred:$offset)),
- (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>;
+ (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
// Map from store(globaladdress) -> memh(#foo).
let AddedComplexity = 100 in
-def : Pat <(truncstorei16 IntRegs:$src1,
+def : Pat <(truncstorei16 (i32 IntRegs:$src1),
(HexagonCONST32_GP tglobaladdr:$global)),
- (STh_GP tglobaladdr:$global, IntRegs:$src1)>;
+ (STh_GP tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
// Map from store(globaladdress + x) -> memb(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(truncstorei8 IntRegs:$src1,
+def : Pat <(truncstorei8 (i32 IntRegs:$src1),
(add (HexagonCONST32_GP tglobaladdr:$global),
u16ImmPred:$offset)),
- (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>;
+ (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
// Map from store(globaladdress) -> memb(#foo).
let AddedComplexity = 100 in
-def : Pat <(truncstorei8 IntRegs:$src1,
+def : Pat <(truncstorei8 (i32 IntRegs:$src1),
(HexagonCONST32_GP tglobaladdr:$global)),
- (STb_GP tglobaladdr:$global, IntRegs:$src1)>;
+ (STb_GP tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress + x) -> memw(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(load (add (HexagonCONST32_GP tglobaladdr:$global),
- u16ImmPred:$offset)),
- (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
-// Map from load(globaladdress) -> memw(#foo + 0).
+// Map from load(globaladdress) -> memw(#foo).
let AddedComplexity = 100 in
-def : Pat <(load (HexagonCONST32_GP tglobaladdr:$global)),
- (LDw_GP tglobaladdr:$global)>;
+def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDw_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress + x) -> memd(#foo + x).
let AddedComplexity = 100 in
def : Pat <(i64 (load (add (HexagonCONST32_GP tglobaladdr:$global),
u16ImmPred:$offset))),
- (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+ (i64 (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress) -> memd(#foo).
let AddedComplexity = 100 in
def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))),
- (LDd_GP tglobaladdr:$global)>;
-
+ (i64 (LDd_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
-// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress + 0), Pd = Rd.
+// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd.
let AddedComplexity = 100 in
def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
- (TFR_PdRs (LDrib_GP tglobaladdr:$global, 0))>;
+ (i1 (TFR_PdRs (i32 (LDb_GP tglobaladdr:$global))))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress + x) -> memh(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
- u16ImmPred:$offset)),
- (LDrih_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+def : Pat <(i32 (extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDrih_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
-// Map from load(globaladdress) -> memh(#foo + 0).
+// Map from load(globaladdress) -> memh(#foo + 0).
let AddedComplexity = 100 in
-def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDrih_GP tglobaladdr:$global, 0)>;
+def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDrih_GP tglobaladdr:$global, 0))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress + x) -> memuh(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
- u16ImmPred:$offset)),
- (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+def : Pat <(i32 (zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
-// Map from load(globaladdress) -> memuh(#foo + 0).
+// Map from load(globaladdress) -> memuh(#foo).
let AddedComplexity = 100 in
-def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDriuh_GP tglobaladdr:$global, 0)>;
+def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDriuh_GP tglobaladdr:$global, 0))>,
+ Requires<[NoV4T]>;
-// Map from load(globaladdress + x) -> memuh(#foo + x).
+// Map from load(globaladdress) -> memh(#foo).
let AddedComplexity = 100 in
-def : Pat <(extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
- u16ImmPred:$offset)),
- (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDh_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
-// Map from load(globaladdress) -> memuh(#foo + 0).
-let AddedComplexity = 100 in
-def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDriuh_GP tglobaladdr:$global, 0)>;
-// Map from load(globaladdress + x) -> memub(#foo + x).
+// Map from load(globaladdress) -> memuh(#foo).
let AddedComplexity = 100 in
-def : Pat <(zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
- u16ImmPred:$offset)),
- (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDuh_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
-// Map from load(globaladdress) -> memuh(#foo + 0).
+// Map from load(globaladdress + x) -> memb(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDriub_GP tglobaladdr:$global, 0)>;
+def : Pat <(i32 (extloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress + x) -> memb(#foo + x).
let AddedComplexity = 100 in
-def : Pat <(sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
- u16ImmPred:$offset)),
- (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset)>;
+def : Pat <(i32 (sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
+
+// Map from load(globaladdress + x) -> memub(#foo + x).
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress) -> memb(#foo).
let AddedComplexity = 100 in
-def : Pat <(extloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDb_GP tglobaladdr:$global)>;
+def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress) -> memb(#foo).
let AddedComplexity = 100 in
-def : Pat <(sextloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDb_GP tglobaladdr:$global)>;
+def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
// Map from load(globaladdress) -> memub(#foo).
let AddedComplexity = 100 in
-def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDub_GP tglobaladdr:$global)>;
+def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDub_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
// When the Interprocedural Global Variable optimizer realizes that a
// certain global variable takes only two constant values, it shrinks the
// global to a boolean. Catch those loads here in the following 3 patterns.
let AddedComplexity = 100 in
-def : Pat <(extloadi1 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDb_GP tglobaladdr:$global)>;
+def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
let AddedComplexity = 100 in
-def : Pat <(sextloadi1 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDb_GP tglobaladdr:$global)>;
-
-let AddedComplexity = 100 in
-def : Pat <(zextloadi1 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDub_GP tglobaladdr:$global)>;
-
-// Map from load(globaladdress) -> memh(#foo).
-let AddedComplexity = 100 in
-def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDh_GP tglobaladdr:$global)>;
-
-// Map from load(globaladdress) -> memh(#foo).
-let AddedComplexity = 100 in
-def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDh_GP tglobaladdr:$global)>;
+def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
-// Map from load(globaladdress) -> memuh(#foo).
let AddedComplexity = 100 in
-def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)),
- (LDuh_GP tglobaladdr:$global)>;
+def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDub_GP tglobaladdr:$global))>,
+ Requires<[NoV4T]>;
// Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned.
def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)),
- (AND_rr (LDrib ADDRriS11_0:$addr), (TFRI 0x1))>;
+ (i32 (AND_rr (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>;
// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo).
-def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i32)),
- (i64 (SXTW (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg)))>;
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)),
+ (i64 (SXTW (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg))))>;
// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = SXTW(SXTH(Rss.lo)).
-def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i16)),
- (i64 (SXTW (SXTH (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>;
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i16)),
+ (i64 (SXTW (i32 (SXTH (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+ subreg_loreg))))))>;
// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = SXTW(SXTB(Rss.lo)).
-def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i8)),
- (i64 (SXTW (SXTB (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>;
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i8)),
+ (i64 (SXTW (i32 (SXTB (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+ subreg_loreg))))))>;
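+// I.e. for each sub-word width: take the low word of Rss, sign-extend the
+// sub-word within it (SXTB/SXTH, or nothing for i32), then widen the word
+// back to 64 bits with SXTW.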
-// We want to prevent emiting pnot's as much as possible.
+// We want to prevent emitting pnot's as much as possible.
// Map brcond with an unsupported setcc to a JMP_cNot.
-def : Pat <(brcond (i1 (setne IntRegs:$src1, IntRegs:$src2)), bb:$offset),
- (JMP_cNot (CMPEQrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>;
+def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ bb:$offset),
+ (JMP_cNot (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+ bb:$offset)>;
-def : Pat <(brcond (i1 (setne IntRegs:$src1, s10ImmPred:$src2)), bb:$offset),
- (JMP_cNot (CMPEQri IntRegs:$src1, s10ImmPred:$src2), bb:$offset)>;
+def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)),
+ bb:$offset),
+ (JMP_cNot (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2), bb:$offset)>;
-def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 -1))), bb:$offset),
- (JMP_cNot PredRegs:$src1, bb:$offset)>;
+def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 -1))), bb:$offset),
+ (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>;
-def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 0))), bb:$offset),
- (JMP_c PredRegs:$src1, bb:$offset)>;
+def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 0))), bb:$offset),
+ (JMP_c (i1 PredRegs:$src1), bb:$offset)>;
-def : Pat <(brcond (i1 (setlt IntRegs:$src1, s8ImmPred:$src2)), bb:$offset),
- (JMP_cNot (CMPGEri IntRegs:$src1, s8ImmPred:$src2), bb:$offset)>;
+def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)),
+ bb:$offset),
+ (JMP_cNot (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2), bb:$offset)>;
-def : Pat <(brcond (i1 (setlt IntRegs:$src1, IntRegs:$src2)), bb:$offset),
- (JMP_c (CMPLTrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>;
+def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ bb:$offset),
+ (JMP_c (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), bb:$offset)>;
-def : Pat <(brcond (i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)),
+def : Pat <(brcond (i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
bb:$offset),
- (JMP_cNot (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1),
+ (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)),
bb:$offset)>;
-def : Pat <(brcond (i1 (setule IntRegs:$src1, IntRegs:$src2)), bb:$offset),
- (JMP_cNot (CMPGTUrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>;
+def : Pat <(brcond (i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ bb:$offset),
+ (JMP_cNot (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+ bb:$offset)>;
-def : Pat <(brcond (i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)),
+def : Pat <(brcond (i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
bb:$offset),
- (JMP_cNot (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2),
- bb:$offset)>;
+ (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+ bb:$offset)>;
// Map from a 64-bit select to an emulated 64-bit mux.
// Hexagon does not support 64-bit MUXes; so emulate with combines.
-def : Pat <(select PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (COMBINE_rr
- (MUX_rr PredRegs:$src1,
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg),
- (EXTRACT_SUBREG DoubleRegs:$src3, subreg_hireg)),
- (MUX_rr PredRegs:$src1,
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src3, subreg_loreg)))>;
+def : Pat <(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src3)),
+ (i64 (COMBINE_rr (i32 (MUX_rr (i1 PredRegs:$src1),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ subreg_hireg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
+ subreg_hireg)))),
+ (i32 (MUX_rr (i1 PredRegs:$src1),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
+ subreg_loreg))))))>;
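+// The two MUX_rr operations select the high and low 32-bit halves
+// independently; COMBINE_rr then reassembles them into one 64-bit register
+// pair.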
// Map from a 1-bit select to logical ops.
// From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3).
-def : Pat <(select PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (OR_pp (AND_pp PredRegs:$src1, PredRegs:$src2),
- (AND_pp (NOT_p PredRegs:$src1), PredRegs:$src3))>;
+def : Pat <(select (i1 PredRegs:$src1), (i1 PredRegs:$src2),
+ (i1 PredRegs:$src3)),
+ (OR_pp (AND_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)),
+ (AND_pp (NOT_p (i1 PredRegs:$src1)), (i1 PredRegs:$src3)))>;
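+// Check: p1 == 1 gives (p2 | 0) == p2, and p1 == 0 gives (0 | p3) == p3.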
// Map Pd = load(addr) -> Rs = load(addr); Pd = Rs.
def : Pat<(i1 (load ADDRriS11_2:$addr)),
(i1 (TFR_PdRs (i32 (LDrib ADDRriS11_2:$addr))))>;
// Map for truncating from 64 immediates to 32 bit immediates.
-def : Pat<(i32 (trunc DoubleRegs:$src)),
- (i32 (EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))>;
+def : Pat<(i32 (trunc (i64 DoubleRegs:$src))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg))>;
// Map for truncating from i64 immediates to i1 bit immediates.
-def : Pat<(i1 (trunc DoubleRegs:$src)),
- (i1 (TFR_PdRs (i32(EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))))>;
+def : Pat<(i1 (trunc (i64 DoubleRegs:$src))),
+ (i1 (TFR_PdRs (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+ subreg_loreg))))>;
// Map memb(Rs) = Rdd -> memb(Rs) = Rt.
-def : Pat<(truncstorei8 DoubleRegs:$src, ADDRriS11_0:$addr),
- (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src,
+def : Pat<(truncstorei8 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+ (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
subreg_loreg)))>;
// Map memh(Rs) = Rdd -> memh(Rs) = Rt.
-def : Pat<(truncstorei16 DoubleRegs:$src, ADDRriS11_0:$addr),
- (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src,
+def : Pat<(truncstorei16 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+ (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+ subreg_loreg)))>;
+// Map memw(Rs) = Rdd -> memw(Rs) = Rt.
+def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+ (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
subreg_loreg)))>;
// Map memw(Rs) = Rdd -> memw(Rs) = Rt.
-def : Pat<(truncstorei32 DoubleRegs:$src, ADDRriS11_0:$addr),
- (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src,
+def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+ (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
subreg_loreg)))>;
// Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0.
@@ -2763,118 +3125,134 @@ let AddedComplexity = 100 in
// Map from i1 = constant<-1>; memw(CONST32(#foo)) = i1 -> r0 = 1;
// memw(#foo) = r0
def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
- (STb_GP tglobaladdr:$global, (TFRI 1))>;
-
+ (STb_GP tglobaladdr:$global, (TFRI 1))>,
+ Requires<[NoV4T]>;
// Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0.
def : Pat<(store (i1 -1), ADDRriS11_2:$addr),
(STrib ADDRriS11_2:$addr, (TFRI 1))>;
// Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt.
-def : Pat<(store PredRegs:$src1, ADDRriS11_2:$addr),
- (STrib ADDRriS11_2:$addr, (i32 (MUX_ii PredRegs:$src1, 1, 0)) )>;
+def : Pat<(store (i1 PredRegs:$src1), ADDRriS11_2:$addr),
+ (STrib ADDRriS11_2:$addr, (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0)) )>;
// Map Rdd = anyext(Rs) -> Rdd = sxtw(Rs).
// Hexagon_TODO: We can probably use combine but that will cost 2 instructions.
// Better way to do this?
-def : Pat<(i64 (anyext IntRegs:$src1)),
- (i64 (SXTW IntRegs:$src1))>;
+def : Pat<(i64 (anyext (i32 IntRegs:$src1))),
+ (i64 (SXTW (i32 IntRegs:$src1)))>;
// Map cmple -> cmpgt.
// rs <= rt -> !(rs > rt).
-def : Pat<(i1 (setle IntRegs:$src1, s10ImmPred:$src2)),
- (i1 (NOT_p (CMPGTri IntRegs:$src1, s10ImmPred:$src2)))>;
+def : Pat<(i1 (setle (i32 IntRegs:$src1), s10ImmPred:$src2)),
+ (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), s10ImmPred:$src2)))>;
// rs <= rt -> !(rs > rt).
-def : Pat<(i1 (setle IntRegs:$src1, IntRegs:$src2)),
- (i1 (NOT_p (CMPGTrr IntRegs:$src1, IntRegs:$src2)))>;
+def : Pat<(i1 (setle (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i1 (NOT_p (CMPGTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
// Rss <= Rtt -> !(Rss > Rtt).
-def : Pat<(i1 (setle DoubleRegs:$src1, DoubleRegs:$src2)),
- (i1 (NOT_p (CMPGT64rr DoubleRegs:$src1, DoubleRegs:$src2)))>;
+def : Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+ (i1 (NOT_p (CMPGT64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
// Map cmpne -> cmpeq.
// Hexagon_TODO: We should improve on this.
// rs != rt -> !(rs == rt).
-def : Pat <(i1 (setne IntRegs:$src1, s10ImmPred:$src2)),
- (i1 (NOT_p(i1 (CMPEQri IntRegs:$src1, s10ImmPred:$src2))))>;
+def : Pat <(i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)),
+ (i1 (NOT_p(i1 (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2))))>;
// Map cmpne(Rs) -> !cmpeqe(Rs).
// rs != rt -> !(rs == rt).
-def : Pat <(i1 (setne IntRegs:$src1, IntRegs:$src2)),
- (i1 (NOT_p(i1 (CMPEQrr IntRegs:$src1, IntRegs:$src2))))>;
+def : Pat <(i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i1 (NOT_p (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>;
// Convert setne back to xor for hexagon since we compute w/ pred registers.
-def : Pat <(i1 (setne PredRegs:$src1, PredRegs:$src2)),
- (i1 (XOR_pp PredRegs:$src1, PredRegs:$src2))>;
+def : Pat <(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))),
+ (i1 (XOR_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
// Map cmpne(Rss) -> !cmpew(Rss).
// rs != rt -> !(rs == rt).
-def : Pat <(i1 (setne DoubleRegs:$src1, DoubleRegs:$src2)),
- (i1 (NOT_p(i1 (CMPEHexagon4rr DoubleRegs:$src1, DoubleRegs:$src2))))>;
+def : Pat <(i1 (setne (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+ (i1 (NOT_p (i1 (CMPEHexagon4rr (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2)))))>;
// Map cmpge(Rs, Rt) -> !(cmpgt(Rs, Rt).
// rs >= rt -> !(rt > rs).
-def : Pat <(i1 (setge IntRegs:$src1, IntRegs:$src2)),
- (i1 (NOT_p(i1 (CMPGTrr IntRegs:$src2, IntRegs:$src1))))>;
+def : Pat <(i1 (setge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i1 (NOT_p (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))))>;
-def : Pat <(i1 (setge IntRegs:$src1, s8ImmPred:$src2)),
- (i1 (CMPGEri IntRegs:$src1, s8ImmPred:$src2))>;
+def : Pat <(i1 (setge (i32 IntRegs:$src1), s8ImmPred:$src2)),
+ (i1 (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2))>;
// Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss).
// rss >= rtt -> !(rtt > rss).
-def : Pat <(i1 (setge DoubleRegs:$src1, DoubleRegs:$src2)),
- (i1 (NOT_p(i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))))>;
+def : Pat <(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+ (i1 (NOT_p (i1 (CMPGT64rr (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src1)))))>;
// Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm).
// rs < rt -> !(rs >= rt).
-def : Pat <(i1 (setlt IntRegs:$src1, s8ImmPred:$src2)),
- (i1 (NOT_p (CMPGEri IntRegs:$src1, s8ImmPred:$src2)))>;
+def : Pat <(i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)),
+ (i1 (NOT_p (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2)))>;
-// Map cmplt(Rs, Rt) -> cmplt(Rs, Rt).
-// rs < rt -> rs < rt. Let assembler map it.
-def : Pat <(i1 (setlt IntRegs:$src1, IntRegs:$src2)),
- (i1 (CMPLTrr IntRegs:$src2, IntRegs:$src1))>;
+// Map cmplt(Rs, Rt) -> cmpgt(Rt, Rs).
+// rs < rt -> rt > rs.
+// We can let the assembler map it, or we can do it in the compiler itself.
+def : Pat <(i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
// Map cmplt(Rss, Rtt) -> cmpgt(Rtt, Rss).
// rss < rtt -> (rtt > rss).
-def : Pat <(i1 (setlt DoubleRegs:$src1, DoubleRegs:$src2)),
- (i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))>;
+def : Pat <(i1 (setlt (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+ (i1 (CMPGT64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
-// Map from cmpltu(Rs, Rd) -> !cmpgtu(Rs, Rd - 1).
+// Map from cmpltu(Rs, Rd) -> cmpgtu(Rd, Rs).
// rs < rt -> rt > rs.
-def : Pat <(i1 (setult IntRegs:$src1, IntRegs:$src2)),
- (i1 (CMPGTUrr IntRegs:$src2, IntRegs:$src1))>;
+// We can let the assembler map it, or we can do it in the compiler itself.
+def : Pat <(i1 (setult (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i1 (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
-// Map from cmpltu(Rss, Rdd) -> !cmpgtu(Rss, Rdd - 1).
+// Map from cmpltu(Rss, Rdd) -> cmpgtu(Rdd, Rss).
// rs < rt -> rt > rs.
-def : Pat <(i1 (setult DoubleRegs:$src1, DoubleRegs:$src2)),
- (i1 (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1))>;
+def : Pat <(i1 (setult (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+ (i1 (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
+
+// Generate cmpgeu(Rs, #u8)
+def : Pat <(i1 (setuge (i32 IntRegs:$src1), u8ImmPred:$src2)),
+ (i1 (CMPGEUri (i32 IntRegs:$src1), u8ImmPred:$src2))>;
+
+// Generate cmpgtu(Rs, #u9)
+def : Pat <(i1 (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)),
+ (i1 (CMPGTUri (i32 IntRegs:$src1), u9ImmPred:$src2))>;
// Map from Rs >= Rt -> !(Rt > Rs).
// rs >= rt -> !(rt > rs).
-def : Pat <(i1 (setuge IntRegs:$src1, IntRegs:$src2)),
- (i1 (NOT_p (CMPGTUrr IntRegs:$src2, IntRegs:$src1)))>;
+def : Pat <(i1 (setuge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1))))>;
// Map from Rs >= Rt -> !(Rt > Rs).
// rs >= rt -> !(rt > rs).
-def : Pat <(i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)),
- (i1 (NOT_p (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1)))>;
+def : Pat <(i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+ (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1))))>;
// Map from cmpleu(Rs, Rs) -> !cmpgtu(Rs, Rs).
// Map from (Rs <= Rt) -> !(Rs > Rt).
-def : Pat <(i1 (setule IntRegs:$src1, IntRegs:$src2)),
- (i1 (NOT_p (CMPGTUrr IntRegs:$src1, IntRegs:$src2)))>;
+def : Pat <(i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+ (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
// Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1).
// Map from (Rs <= Rt) -> !(Rs > Rt).
-def : Pat <(i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)),
- (i1 (NOT_p (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2)))>;
+def : Pat <(i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+ (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
// Sign extends.
// i1 -> i32
-def : Pat <(i32 (sext PredRegs:$src1)),
- (i32 (MUX_ii PredRegs:$src1, -1, 0))>;
+def : Pat <(i32 (sext (i1 PredRegs:$src1))),
+ (i32 (MUX_ii (i1 PredRegs:$src1), -1, 0))>;
+
+// i1 -> i64
+def : Pat <(i64 (sext (i1 PredRegs:$src1))),
+ (i64 (SXTW (i32 (MUX_ii (i1 PredRegs:$src1), -1, 0))))>;
// Convert sign-extended load back to load and sign extend.
// i8 -> i64
@@ -2899,16 +3277,16 @@ def: Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)),
// Zero extends.
// i1 -> i32
-def : Pat <(i32 (zext PredRegs:$src1)),
- (i32 (MUX_ii PredRegs:$src1, 1, 0))>;
+def : Pat <(i32 (zext (i1 PredRegs:$src1))),
+ (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
// i1 -> i64
-def : Pat <(i64 (zext PredRegs:$src1)),
- (i64 (COMBINE_rr (TFRI 0), (MUX_ii PredRegs:$src1, 1, 0)))>;
+def : Pat <(i64 (zext (i1 PredRegs:$src1))),
+ (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>;
// i32 -> i64
-def : Pat <(i64 (zext IntRegs:$src1)),
- (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>;
+def : Pat <(i64 (zext (i32 IntRegs:$src1))),
+ (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>;
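+// E.g. zext of the i32 value 0xffffffff yields the register pair (0,
+// 0xffffffff), i.e. the i64 value 0x00000000ffffffff.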
// i8 -> i64
def: Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)),
@@ -2926,16 +3304,16 @@ def: Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)),
(i32 (LDriw ADDRriS11_0:$src1))>;
// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
-def : Pat <(i32 (zext PredRegs:$src1)),
- (i32 (MUX_ii PredRegs:$src1, 1, 0))>;
+def : Pat <(i32 (zext (i1 PredRegs:$src1))),
+ (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
-def : Pat <(i32 (anyext PredRegs:$src1)),
- (i32 (MUX_ii PredRegs:$src1, 1, 0))>;
+def : Pat <(i32 (anyext (i1 PredRegs:$src1))),
+ (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
-def : Pat <(i64 (anyext PredRegs:$src1)),
- (i64 (SXTW (i32 (MUX_ii PredRegs:$src1, 1, 0))))>;
+def : Pat <(i64 (anyext (i1 PredRegs:$src1))),
+ (i64 (SXTW (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))))>;
// Any extended 64-bit load.
@@ -2948,75 +3326,103 @@ def: Pat <(i64 (extloadi16 ADDRriS11_2:$src1)),
(i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>;
// Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs).
-def : Pat<(i64 (zext IntRegs:$src1)),
- (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>;
+def : Pat<(i64 (zext (i32 IntRegs:$src1))),
+ (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>;
// Multiply 64-bit unsigned and use upper result.
-def : Pat <(mulhu DoubleRegs:$src1, DoubleRegs:$src2),
- (MPYU64_acc(COMBINE_rr (TFRI 0),
- (EXTRACT_SUBREG
- (LSRd_ri(MPYU64_acc(MPYU64_acc(COMBINE_rr (TFRI 0),
- (EXTRACT_SUBREG (LSRd_ri(MPYU64
- (EXTRACT_SUBREG DoubleRegs:$src1,
- subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src2,
- subreg_loreg)),
- 32) ,subreg_loreg)),
- (EXTRACT_SUBREG DoubleRegs:$src1,
- subreg_hireg),
- (EXTRACT_SUBREG DoubleRegs:$src2,
- subreg_loreg)),
- (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)),
- 32),subreg_loreg)),
- (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)
- )>;
+def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+ (i64
+ (MPYU64_acc
+ (i64
+ (COMBINE_rr
+ (TFRI 0),
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (LSRd_ri
+ (i64
+ (MPYU64_acc
+ (i64
+ (MPYU64_acc
+ (i64
+ (COMBINE_rr (TFRI 0),
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (LSRd_ri
+ (i64
+ (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+ subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ subreg_loreg)))), 32)),
+ subreg_loreg)))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
+ 32)), subreg_loreg)))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
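// An illustrative derivation of the expansion above (a sketch, not part of
// the patch): write a = aH*2^32 + aL and b = bH*2^32 + bL. Then
//   a*b = aH*bH*2^64 + (aH*bL + aL*bH)*2^32 + aL*bL
// and the upper 64 bits are
//   mulhu(a, b) = aH*bH + ((aH*bL + aL*bH + ((aL*bL) >> 32)) >> 32)
// The inner MPYU64/MPYU64_acc chain forms the parenthesized sum over the
// 32-bit halves, each LSRd_ri/EXTRACT_SUBREG pair performs a ">> 32", and
// the outermost MPYU64_acc adds the final aH*bH term. (The sketch ignores
// carries that overflow the 64-bit partial sums.)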
// Multiply 64-bit signed and use upper result.
-def : Pat <(mulhs DoubleRegs:$src1, DoubleRegs:$src2),
- (MPY64_acc(COMBINE_rr (TFRI 0),
- (EXTRACT_SUBREG
- (LSRd_ri(MPY64_acc(MPY64_acc(COMBINE_rr (TFRI 0),
- (EXTRACT_SUBREG (LSRd_ri(MPYU64
- (EXTRACT_SUBREG DoubleRegs:$src1,
- subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src2,
- subreg_loreg)),
- 32) ,subreg_loreg)),
- (EXTRACT_SUBREG DoubleRegs:$src1,
- subreg_hireg),
- (EXTRACT_SUBREG DoubleRegs:$src2,
- subreg_loreg)),
- (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)),
- 32),subreg_loreg)),
- (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)
- )>;
+def : Pat <(mulhs (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+ (i64
+ (MPY64_acc
+ (i64
+ (COMBINE_rr (TFRI 0),
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (LSRd_ri
+ (i64
+ (MPY64_acc
+ (i64
+ (MPY64_acc
+ (i64
+ (COMBINE_rr (TFRI 0),
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (LSRd_ri
+ (i64
+ (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+ subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ subreg_loreg)))), 32)),
+ subreg_loreg)))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
+ 32)), subreg_loreg)))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
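// The signed variant (a reading of the pattern above, not from the patch)
// follows the same half-word decomposition, using MPY64_acc so the
// accumulations involving aH and bH are sign-aware, while the initial
// aL*bL low-half product remains unsigned (MPYU64).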
// Hexagon specific ISD nodes.
-def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>;
+//def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>;
+def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC",
- SDTHexagonADJDYNALLOC>;
+ SDTHexagonADJDYNALLOC>;
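// A reading of the change above (not from the patch): the type profile now
// pins the result and the first operand to i32 explicitly rather than merely
// requiring them to match, in line with this commit's move to spell out
// value types throughout the Hexagon patterns.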
// Needed to tag these instructions for stack layout.
let usesCustomInserter = 1 in
def ADJDYNALLOC : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1,
s16Imm:$src2),
"$dst = add($src1, #$src2)",
- [(set IntRegs:$dst, (Hexagon_ADJDYNALLOC IntRegs:$src1,
- s16ImmPred:$src2))]>;
+ [(set (i32 IntRegs:$dst),
+ (Hexagon_ADJDYNALLOC (i32 IntRegs:$src1),
+ s16ImmPred:$src2))]>;
-def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, []>;
+def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
def Hexagon_ARGEXTEND : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>;
def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = $src1",
- [(set IntRegs:$dst, (Hexagon_ARGEXTEND IntRegs:$src1))]>;
+ [(set (i32 IntRegs:$dst),
+ (Hexagon_ARGEXTEND (i32 IntRegs:$src1)))]>;
let AddedComplexity = 100 in
-def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND IntRegs:$src1), i16)),
- (TFR IntRegs:$src1)>;
-
+def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)),
+ (COPY (i32 IntRegs:$src1))>;
def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>;
@@ -3024,12 +3430,91 @@ def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>;
let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
def BR_JT : JRInst<(outs), (ins IntRegs:$src),
"jumpr $src",
- [(HexagonBR_JT IntRegs:$src)]>;
+ [(HexagonBR_JT (i32 IntRegs:$src))]>;
+
def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>;
def : Pat<(HexagonWrapperJT tjumptable:$dst),
- (CONST32_set_jt tjumptable:$dst)>;
+ (i32 (CONST32_set_jt tjumptable:$dst))>;
+
+// XTYPE/SHIFT
+
+// Multi-class for logical operators:
+// Shift by immediate/register and accumulate/logical
+multiclass xtype_imm<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
+ def _ri : SInst_acc<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2, u5Imm:$src3),
+ !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
+ [(set (i32 IntRegs:$dst),
+ (OpNode2 (i32 IntRegs:$src1),
+ (OpNode1 (i32 IntRegs:$src2),
+ u5ImmPred:$src3)))],
+ "$src1 = $dst">;
+
+ def d_ri : SInst_acc<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2, u6Imm:$src3),
+ !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
+ [(set (i64 DoubleRegs:$dst), (OpNode2 (i64 DoubleRegs:$src1),
+ (OpNode1 (i64 DoubleRegs:$src2), u6ImmPred:$src3)))],
+ "$src1 = $dst">;
+}
+
+// Multi-class for logical operators:
+// Shift by register and accumulate/logical (32/64 bits)
+multiclass xtype_reg<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
+ def _rr : SInst_acc<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
+ [(set (i32 IntRegs:$dst),
+ (OpNode2 (i32 IntRegs:$src1),
+ (OpNode1 (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
+ "$src1 = $dst">;
+
+ def d_rr : SInst_acc<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
+ [(set (i64 DoubleRegs:$dst),
+ (OpNode2 (i64 DoubleRegs:$src1),
+ (OpNode1 (i64 DoubleRegs:$src2),
+ (i32 IntRegs:$src3))))],
+ "$src1 = $dst">;
+
+}
+
+multiclass basic_xtype_imm<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+ defm _ADD : xtype_imm< !strconcat("+= ", OpcStr), OpNode, add>;
+ defm _SUB : xtype_imm< !strconcat("-= ", OpcStr), OpNode, sub>;
+ defm _AND : xtype_imm< !strconcat("&= ", OpcStr), OpNode, and>;
+ defm _OR : xtype_imm< !strconcat("|= ", OpcStr), OpNode, or>;
+}
+
+multiclass basic_xtype_reg<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+ defm _ADD : xtype_reg< !strconcat("+= ", OpcStr), OpNode, add>;
+ defm _SUB : xtype_reg< !strconcat("-= ", OpcStr), OpNode, sub>;
+ defm _AND : xtype_reg< !strconcat("&= ", OpcStr), OpNode, and>;
+ defm _OR : xtype_reg< !strconcat("|= ", OpcStr), OpNode, or>;
+}
+
+multiclass xtype_xor_imm<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+ defm _XOR : xtype_imm< !strconcat("^= ", OpcStr), OpNode, xor>;
+}
+
+defm ASL : basic_xtype_imm<"asl", shl>, basic_xtype_reg<"asl", shl>,
+ xtype_xor_imm<"asl", shl>;
+defm LSR : basic_xtype_imm<"lsr", srl>, basic_xtype_reg<"lsr", srl>,
+ xtype_xor_imm<"lsr", srl>;
+
+defm ASR : basic_xtype_imm<"asr", sra>, basic_xtype_reg<"asr", sra>;
+defm LSL : basic_xtype_reg<"lsl", shl>;
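//
// As a rough illustration (an assumed expansion, not part of the patch),
// `defm ASL : basic_xtype_imm<"asl", shl>` above should produce records
// along the lines of:
//
//   def ASL_ADD_ri : SInst_acc<(outs IntRegs:$dst),
//     (ins IntRegs:$src1, IntRegs:$src2, u5Imm:$src3),
//     "$dst += asl($src2, #$src3)",
//     [(set (i32 IntRegs:$dst),
//           (add (i32 IntRegs:$src1),
//                (shl (i32 IntRegs:$src2), u5ImmPred:$src3)))],
//     "$src1 = $dst">;
//
// i.e. one shift-and-accumulate instruction per opcode/operation pair, with
// the accumulator input tied to the destination register.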
+
+// Change the sign of the immediate for Rd=-mpyi(Rs,#u8)
+def : Pat <(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)),
+ (i32 (MPYI_rin (i32 IntRegs:$src1), u8ImmPred:$src2))>;
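// An illustration (not from the patch): a multiply by a negated small
// constant, conceptually Rs * (-5), is folded onto the negated-immediate
// multiply "Rd = -mpyi(Rs, #5)", with the positive magnitude carried in the
// u8 field.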
//===----------------------------------------------------------------------===//
// V3 Instructions +
@@ -3046,3 +3531,19 @@ include "HexagonInstrInfoV3.td"
//===----------------------------------------------------------------------===//
include "HexagonInstrInfoV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV5.td"
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions -
+//===----------------------------------------------------------------------===//
+
+
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td
index a73897e..157ab3d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV3.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV3.td
@@ -19,7 +19,7 @@
let isCall = 1, neverHasSideEffects = 1,
Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
- def CALLv3 : JInst<(outs), (ins calltarget:$dst, variable_ops),
+ def CALLv3 : JInst<(outs), (ins calltarget:$dst),
"call $dst", []>, Requires<[HasV3T]>;
}
@@ -35,16 +35,17 @@ let isCall = 1, neverHasSideEffects = 1,
let isCall = 1, neverHasSideEffects = 1,
Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
- def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst, variable_ops),
+ def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst),
"callr $dst",
[]>, Requires<[HasV3TOnly]>;
}
+// Jump to address from register
// if(p?.new) jumpr:t r?
let isReturn = 1, isTerminator = 1, isBarrier = 1,
Defs = [PC], Uses = [R31] in {
- def JMPR_cPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+ def JMPR_cdnPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
"if ($src1.new) jumpr:t $src2",
[]>, Requires<[HasV3T]>;
}
@@ -52,7 +53,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1,
// if (!p?.new) jumpr:t r?
let isReturn = 1, isTerminator = 1, isBarrier = 1,
Defs = [PC], Uses = [R31] in {
- def JMPR_cNotPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+ def JMPR_cdnNotPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
"if (!$src1.new) jumpr:t $src2",
[]>, Requires<[HasV3T]>;
}
@@ -61,7 +62,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1,
// if(p?.new) jumpr:nt r?
let isReturn = 1, isTerminator = 1, isBarrier = 1,
Defs = [PC], Uses = [R31] in {
- def JMPR_cPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+ def JMPR_cdnPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
"if ($src1.new) jumpr:nt $src2",
[]>, Requires<[HasV3T]>;
}
@@ -69,7 +70,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1,
// if (!p?.new) jumpr:nt r?
let isReturn = 1, isTerminator = 1, isBarrier = 1,
Defs = [PC], Uses = [R31] in {
- def JMPR_cNotPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+ def JMPR_cdnNotPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
"if (!$src1.new) jumpr:nt $src2",
[]>, Requires<[HasV3T]>;
}
@@ -86,20 +87,22 @@ let AddedComplexity = 200 in
def MAXw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
DoubleRegs:$src2),
"$dst = max($src2, $src1)",
- [(set DoubleRegs:$dst, (select (i1 (setlt DoubleRegs:$src2,
- DoubleRegs:$src1)),
- DoubleRegs:$src1,
- DoubleRegs:$src2))]>,
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src1))),
+ (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2))))]>,
Requires<[HasV3T]>;
let AddedComplexity = 200 in
def MINw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
DoubleRegs:$src2),
"$dst = min($src2, $src1)",
- [(set DoubleRegs:$dst, (select (i1 (setgt DoubleRegs:$src2,
- DoubleRegs:$src1)),
- DoubleRegs:$src1,
- DoubleRegs:$src2))]>,
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src1))),
+ (i64 DoubleRegs:$src1),
+ (i64 DoubleRegs:$src2))))]>,
Requires<[HasV3T]>;
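// A reading of the two records above (not from the patch): both are the
// standard select-based lowerings, max(a, b) = select(b < a, a, b) and
// min(a, b) = select(b > a, a, b), now with explicit i64/i1 types on every
// node.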
//===----------------------------------------------------------------------===//
@@ -109,25 +112,25 @@ Requires<[HasV3T]>;
-//def : Pat <(brcond (i1 (seteq IntRegs:$src1, 0)), bb:$offset),
-// (JMP_RegEzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>;
+//def : Pat <(brcond (i1 (seteq (i32 IntRegs:$src1), 0)), bb:$offset),
+// (JMP_RegEzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
-//def : Pat <(brcond (i1 (setne IntRegs:$src1, 0)), bb:$offset),
-// (JMP_RegNzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>;
+//def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), 0)), bb:$offset),
+// (JMP_RegNzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
-//def : Pat <(brcond (i1 (setle IntRegs:$src1, 0)), bb:$offset),
-// (JMP_RegLezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>;
+//def : Pat <(brcond (i1 (setle (i32 IntRegs:$src1), 0)), bb:$offset),
+// (JMP_RegLezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
-//def : Pat <(brcond (i1 (setge IntRegs:$src1, 0)), bb:$offset),
-// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>;
+//def : Pat <(brcond (i1 (setge (i32 IntRegs:$src1), 0)), bb:$offset),
+// (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
-//def : Pat <(brcond (i1 (setgt IntRegs:$src1, -1)), bb:$offset),
-// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>;
+//def : Pat <(brcond (i1 (setgt (i32 IntRegs:$src1), -1)), bb:$offset),
+// (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
// Map call instruction
-def : Pat<(call IntRegs:$dst),
- (CALLRv3 IntRegs:$dst)>, Requires<[HasV3T]>;
+def : Pat<(call (i32 IntRegs:$dst)),
+ (CALLRv3 (i32 IntRegs:$dst))>, Requires<[HasV3T]>;
def : Pat<(call tglobaladdr:$dst),
(CALLv3 tglobaladdr:$dst)>, Requires<[HasV3T]>;
def : Pat<(call texternalsym:$dst),
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 9e60cf2..70448fc 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -11,6 +11,12 @@
//
//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in
+def IMMEXT : Immext<(outs), (ins),
+ "/* immext #... */",
+ []>,
+ Requires<[HasV4T]>;
+
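// A reading (not from the patch): IMMEXT stands in for V4's constant-extender
// word, the extra packet word that supplies the upper bits of an extended
// ("##...") immediate; it is modeled here with no operands and no pattern.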
// Hexagon V4 Architecture spec defines 8 instruction classes:
// LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the
// compiler)
@@ -250,23 +256,151 @@ def ZXTH_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst),
[]>,
Requires<[HasV4T]>;
+// Generate frame index addresses.
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def TFR_FI_immext_V4 : ALU32_ri<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, s32Imm:$offset),
+ "$dst = add($src1, ##$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
//===----------------------------------------------------------------------===//
// ALU32 -
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+// Combine
+// Rdd=combine(Rs, #s8)
+let neverHasSideEffects = 1 in
+def COMBINE_ri_V4 : ALU32_ri<(outs DoubleRegs:$dst),
+ (ins IntRegs:$src1, s8Imm:$src2),
+ "$dst = combine($src1, #$src2)",
+ []>,
+ Requires<[HasV4T]>;
+// Rdd=combine(#s8, Rs)
+let neverHasSideEffects = 1 in
+def COMBINE_ir_V4 : ALU32_ir<(outs DoubleRegs:$dst),
+ (ins s8Imm:$src1, IntRegs:$src2),
+ "$dst = combine(#$src1, $src2)",
+ []>,
+ Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// LD +
//===----------------------------------------------------------------------===//
-///
-/// Make sure that in post increment load, the first operand is always the post
-/// increment operand.
-///
-//// Load doubleword.
-// Rdd=memd(Re=#U6)
+//
+// These absolute-set addressing mode instructions take an immediate as
+// an operand. The patterns are duplicated below to accept a global address.
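// An illustrative reading (assuming the Hexagon absolute-set semantics,
// which the patch does not spell out): "$dst1 = memd($dst2=#$addr)" loads
// from the absolute address #$addr and also writes that address into $dst2,
// which is why each of these defs declares two outputs.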
+
+let neverHasSideEffects = 1 in
+def LDrid_abs_setimm_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+ (ins u6Imm:$addr),
+ "$dst1 = memd($dst2=#$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memb(Re=#U6)
+let neverHasSideEffects = 1 in
+def LDrib_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins u6Imm:$addr),
+ "$dst1 = memb($dst2=#$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memh(Re=#U6)
+let neverHasSideEffects = 1 in
+def LDrih_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins u6Imm:$addr),
+ "$dst1 = memh($dst2=#$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memub(Re=#U6)
+let neverHasSideEffects = 1 in
+def LDriub_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins u6Imm:$addr),
+ "$dst1 = memub($dst2=#$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memuh(Re=#U6)
+let neverHasSideEffects = 1 in
+def LDriuh_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins u6Imm:$addr),
+ "$dst1 = memuh($dst2=#$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memw(Re=#U6)
+let neverHasSideEffects = 1 in
+def LDriw_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins u6Imm:$addr),
+ "$dst1 = memw($dst2=#$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// The following patterns are defined for the absolute-set addressing mode
+// instructions which take a global address as an operand.
+let neverHasSideEffects = 1 in
+def LDrid_abs_set_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+ (ins globaladdress:$addr),
+ "$dst1 = memd($dst2=##$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memb(Re=##global)
+let neverHasSideEffects = 1 in
+def LDrib_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins globaladdress:$addr),
+ "$dst1 = memb($dst2=##$addr)",
+ []>,
+ Requires<[HasV4T]>;
+// Rd=memh(Re=##global)
+let neverHasSideEffects = 1 in
+def LDrih_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins globaladdress:$addr),
+ "$dst1 = memh($dst2=##$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memub(Re=##global)
+let neverHasSideEffects = 1 in
+def LDriub_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins globaladdress:$addr),
+ "$dst1 = memub($dst2=##$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memuh(Re=##global)
+let neverHasSideEffects = 1 in
+def LDriuh_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins globaladdress:$addr),
+ "$dst1 = memuh($dst2=##$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Rd=memw(Re=##global)
+let neverHasSideEffects = 1 in
+def LDriw_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+ (ins globaladdress:$addr),
+ "$dst1 = memw($dst2=##$addr)",
+ []>,
+ Requires<[HasV4T]>;
+
+// Load doubleword.
+//
+// Make sure that in a post-increment load, the first operand is always the
+// post-increment operand.
+//
// Rdd=memd(Rs+Rt<<#u2)
// Special case pattern for indexed load without offset which is easier to
// match. AddedComplexity of this pattern should be lower than base+offset load
@@ -276,56 +410,58 @@ let AddedComplexity = 10, isPredicable = 1 in
def LDrid_indexed_V4 : LDInst<(outs DoubleRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memd($src1+$src2<<#0)",
- [(set DoubleRegs:$dst, (load (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (load (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 40, isPredicable = 1 in
def LDrid_indexed_shl_V4 : LDInst<(outs DoubleRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memd($src1+$src2<<#$offset)",
- [(set DoubleRegs:$dst, (load (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i64 DoubleRegs:$dst),
+ (i64 (load (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
//// Load doubleword conditionally.
// if ([!]Pv[.new]) Rd=memd(Rs+Rt<<#u2)
// if (Pv) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrid_indexed_cPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrid_indexed_cPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1) $dst=memd($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrid_indexed_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrid_indexed_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1.new) $dst=memd($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrid_indexed_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrid_indexed_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1) $dst=memd($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrid_indexed_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrid_indexed_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1.new) $dst=memd($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrid_indexed_shl_cPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrid_indexed_shl_cPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1) $dst=memd($src2+$src3<<#$offset)",
@@ -333,8 +469,8 @@ def LDrid_indexed_shl_cPt_V4 : LDInst<(outs DoubleRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrid_indexed_shl_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrid_indexed_shl_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1.new) $dst=memd($src2+$src3<<#$offset)",
@@ -342,8 +478,8 @@ def LDrid_indexed_shl_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrid_indexed_shl_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrid_indexed_shl_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1) $dst=memd($src2+$src3<<#$offset)",
@@ -351,8 +487,8 @@ def LDrid_indexed_shl_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memd(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrid_indexed_shl_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrid_indexed_shl_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1.new) $dst=memd($src2+$src3<<#$offset)",
@@ -362,99 +498,101 @@ def LDrid_indexed_shl_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst),
// Rdd=memd(Rt<<#u2+#U6)
//// Load byte.
-// Rd=memb(Re=#U6)
-
// Rd=memb(Rs+Rt<<#u2)
let AddedComplexity = 10, isPredicable = 1 in
def LDrib_indexed_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memb($src1+$src2<<#0)",
- [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (sextloadi8 (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 10, isPredicable = 1 in
def LDriub_indexed_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memub($src1+$src2<<#0)",
- [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (zextloadi8 (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 10, isPredicable = 1 in
def LDriub_ae_indexed_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memub($src1+$src2<<#0)",
- [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (extloadi8 (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 40, isPredicable = 1 in
def LDrib_indexed_shl_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memb($src1+$src2<<#$offset)",
- [(set IntRegs:$dst,
- (sextloadi8 (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (sextloadi8 (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 40, isPredicable = 1 in
def LDriub_indexed_shl_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memub($src1+$src2<<#$offset)",
- [(set IntRegs:$dst,
- (zextloadi8 (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (zextloadi8 (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 40, isPredicable = 1 in
def LDriub_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memub($src1+$src2<<#$offset)",
- [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (extloadi8 (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
//// Load byte conditionally.
// if ([!]Pv[.new]) Rd=memb(Rs+Rt<<#u2)
// if (Pv) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrib_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrib_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1) $dst=memb($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrib_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrib_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1.new) $dst=memb($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrib_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrib_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1) $dst=memb($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrib_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrib_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1.new) $dst=memb($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrib_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrib_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1) $dst=memb($src2+$src3<<#$offset)",
@@ -462,8 +600,8 @@ def LDrib_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrib_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrib_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1.new) $dst=memb($src2+$src3<<#$offset)",
@@ -471,8 +609,8 @@ def LDrib_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrib_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrib_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1) $dst=memb($src2+$src3<<#$offset)",
@@ -480,8 +618,8 @@ def LDrib_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memb(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrib_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrib_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1.new) $dst=memb($src2+$src3<<#$offset)",
@@ -491,40 +629,40 @@ def LDrib_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
//// Load unsigned byte conditionally.
// if ([!]Pv[.new]) Rd=memub(Rs+Rt<<#u2)
// if (Pv) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriub_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriub_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1) $dst=memub($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriub_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriub_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1.new) $dst=memub($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriub_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriub_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1) $dst=memub($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriub_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriub_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1.new) $dst=memub($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriub_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriub_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1) $dst=memub($src2+$src3<<#$offset)",
@@ -532,8 +670,8 @@ def LDriub_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriub_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriub_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1.new) $dst=memub($src2+$src3<<#$offset)",
@@ -541,8 +679,8 @@ def LDriub_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriub_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriub_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1) $dst=memub($src2+$src3<<#$offset)",
@@ -550,8 +688,8 @@ def LDriub_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memub(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriub_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriub_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1.new) $dst=memub($src2+$src3<<#$offset)",
@@ -561,31 +699,32 @@ def LDriub_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
// Rd=memb(Rt<<#u2+#U6)
//// Load halfword
-// Rd=memh(Re=#U6)
-
// Rd=memh(Rs+Rt<<#u2)
let AddedComplexity = 10, isPredicable = 1 in
def LDrih_indexed_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memh($src1+$src2<<#0)",
- [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (sextloadi16 (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 10, isPredicable = 1 in
def LDriuh_indexed_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memuh($src1+$src2<<#0)",
- [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (zextloadi16 (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 10, isPredicable = 1 in
def LDriuh_ae_indexed_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memuh($src1+$src2<<#0)",
- [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (extloadi16 (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
// Rd=memh(Rs+Rt<<#u2)
@@ -593,69 +732,69 @@ let AddedComplexity = 40, isPredicable = 1 in
def LDrih_indexed_shl_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memh($src1+$src2<<#$offset)",
- [(set IntRegs:$dst,
- (sextloadi16 (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (sextloadi16 (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 40, isPredicable = 1 in
def LDriuh_indexed_shl_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memuh($src1+$src2<<#$offset)",
- [(set IntRegs:$dst,
- (zextloadi16 (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (zextloadi16 (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
let AddedComplexity = 40, isPredicable = 1 in
def LDriuh_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memuh($src1+$src2<<#$offset)",
- [(set IntRegs:$dst,
- (extloadi16 (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (extloadi16 (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
//// Load halfword conditionally.
// if ([!]Pv[.new]) Rd=memh(Rs+Rt<<#u2)
// if (Pv) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrih_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrih_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1) $dst=memh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrih_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrih_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1.new) $dst=memh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrih_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrih_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1) $dst=memh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDrih_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDrih_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1.new) $dst=memh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrih_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrih_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1) $dst=memh($src2+$src3<<#$offset)",
@@ -663,8 +802,8 @@ def LDrih_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrih_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrih_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1.new) $dst=memh($src2+$src3<<#$offset)",
@@ -672,8 +811,8 @@ def LDrih_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrih_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrih_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1) $dst=memh($src2+$src3<<#$offset)",
@@ -681,8 +820,8 @@ def LDrih_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDrih_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDrih_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1.new) $dst=memh($src2+$src3<<#$offset)",
@@ -692,40 +831,40 @@ def LDrih_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
//// Load unsigned halfword conditionally.
// if ([!]Pv[.new]) Rd=memuh(Rs+Rt<<#u2)
// if (Pv) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriuh_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriuh_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1) $dst=memuh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriuh_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriuh_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1.new) $dst=memuh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriuh_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriuh_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1) $dst=memuh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriuh_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriuh_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1.new) $dst=memuh($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriuh_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriuh_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1) $dst=memuh($src2+$src3<<#$offset)",
@@ -733,8 +872,8 @@ def LDriuh_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriuh_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriuh_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1.new) $dst=memuh($src2+$src3<<#$offset)",
@@ -742,8 +881,8 @@ def LDriuh_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriuh_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriuh_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1) $dst=memuh($src2+$src3<<#$offset)",
@@ -751,8 +890,8 @@ def LDriuh_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memuh(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1.new) $dst=memuh($src2+$src3<<#$offset)",
@@ -762,6 +901,14 @@ def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
// Rd=memh(Rt<<#u2+#U6)
//// Load word.
+// Load predicate: Fix for bug 5279.
+let neverHasSideEffects = 1 in
+def LDriw_pred_V4 : LDInst2<(outs PredRegs:$dst),
+ (ins MEMri:$addr),
+ "Error; should not emit",
+ []>,
+ Requires<[HasV4T]>;
+
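// A reading (not from the patch): LDriw_pred_V4 exists so predicate-register
// reloads have an opcode during spill code generation; the "Error; should
// not emit" asm string signals that the backend is expected to rewrite it
// before emission.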
// Rd=memw(Re=#U6)
// Rd=memw(Rs+Rt<<#u2)
@@ -769,8 +916,9 @@ let AddedComplexity = 10, isPredicable = 1 in
def LDriw_indexed_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst=memw($src1+$src2<<#0)",
- [(set IntRegs:$dst, (load (add IntRegs:$src1,
- IntRegs:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (load (add (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))))]>,
Requires<[HasV4T]>;
// Rd=memw(Rs+Rt<<#u2)
@@ -778,48 +926,49 @@ let AddedComplexity = 40, isPredicable = 1 in
def LDriw_indexed_shl_V4 : LDInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
"$dst=memw($src1+$src2<<#$offset)",
- [(set IntRegs:$dst, (load (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$offset))))]>,
+ [(set (i32 IntRegs:$dst),
+ (i32 (load (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$offset)))))]>,
Requires<[HasV4T]>;
//// Load word conditionally.
// if ([!]Pv[.new]) Rd=memw(Rs+Rt<<#u2)
// if (Pv) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriw_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriw_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1) $dst=memw($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriw_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriw_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if ($src1.new) $dst=memw($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriw_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriw_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1) $dst=memw($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in
-def LDriw_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 15, isPredicated = 1 in
+def LDriw_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"if (!$src1.new) $dst=memw($src2+$src3<<#0)",
[]>,
Requires<[HasV4T]>;
// if (Pv) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriw_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriw_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1) $dst=memw($src2+$src3<<#$offset)",
@@ -827,8 +976,8 @@ def LDriw_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriw_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriw_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if ($src1.new) $dst=memw($src2+$src3<<#$offset)",
@@ -836,8 +985,8 @@ def LDriw_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriw_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriw_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1) $dst=memw($src2+$src3<<#$offset)",
@@ -845,8 +994,8 @@ def LDriw_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) Rd=memw(Rs+Rt<<#u2)
-let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in
-def LDriw_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
+let AddedComplexity = 45, isPredicated = 1 in
+def LDriw_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3,
u2Imm:$offset),
"if (!$src1.new) $dst=memw($src2+$src3<<#$offset)",
@@ -859,102 +1008,729 @@ def LDriw_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst),
// Post-inc Load, Predicated, Dot new
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDrid_cdnPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrid_cdnPt_V4 : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3),
"if ($src1.new) $dst1 = memd($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDrid_cdnNotPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrid_cdnNotPt_V4 : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3),
"if (!$src1.new) $dst1 = memd($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDrib_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrib_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if ($src1.new) $dst1 = memb($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDrib_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrib_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if (!$src1.new) $dst1 = memb($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDrih_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrih_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if ($src1.new) $dst1 = memh($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDrih_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDrih_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if (!$src1.new) $dst1 = memh($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDriub_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriub_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if ($src1.new) $dst1 = memub($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDriub_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriub_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3),
"if (!$src1.new) $dst1 = memub($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDriuh_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriuh_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if ($src1.new) $dst1 = memuh($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDriuh_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriuh_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3),
"if (!$src1.new) $dst1 = memuh($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDriw_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriw_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3),
"if ($src1.new) $dst1 = memw($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
-let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
-def POST_LDriw_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
+let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in
+def POST_LDriw_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2),
(ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3),
"if (!$src1.new) $dst1 = memw($src2++#$src3)",
[],
"$src2 = $dst2">,
Requires<[HasV4T]>;
+/// Load from global offset
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDrid_GP_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins globaladdress:$global, u16Imm:$offset),
+ "$dst=memd(#$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_GP_cPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1) $dst=memd(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_GP_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1) $dst=memd(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_GP_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1.new) $dst=memd(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrid_GP_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1.new) $dst=memd(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDrib_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global, u16Imm:$offset),
+ "$dst=memb(#$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1) $dst=memb(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1) $dst=memb(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1.new) $dst=memb(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrib_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1.new) $dst=memb(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDriub_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global, u16Imm:$offset),
+ "$dst=memub(#$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1) $dst=memub(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1) $dst=memub(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1.new) $dst=memub(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriub_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1.new) $dst=memub(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDrih_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global, u16Imm:$offset),
+ "$dst=memh(#$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1) $dst=memh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1) $dst=memh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1.new) $dst=memh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDrih_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1.new) $dst=memh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDriuh_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global, u16Imm:$offset),
+ "$dst=memuh(#$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1) $dst=memuh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1) $dst=memuh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1.new) $dst=memuh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriuh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1.new) $dst=memuh(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDriw_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global, u16Imm:$offset),
+ "$dst=memw(#$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1) $dst=memw(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1) $dst=memw(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if ($src1.new) $dst=memw(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDriw_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset),
+ "if (!$src1.new) $dst=memw(##$global+$offset)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDd_GP_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins globaladdress:$global),
+ "$dst=memd(#$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) Rtt=memd(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDd_GP_cPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1) $dst=memd(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+// if (!Pv) Rtt=memd(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDd_GP_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1) $dst=memd(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) Rtt=memd(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDd_GP_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1.new) $dst=memd(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+// if (!Pv.new) Rtt=memd(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDd_GP_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1.new) $dst=memd(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDb_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global),
+ "$dst=memb(#$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) Rt=memb(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDb_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1) $dst=memb(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) Rt=memb(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDb_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1) $dst=memb(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) Rt=memb(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDb_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1.new) $dst=memb(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) Rt=memb(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDb_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1.new) $dst=memb(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDub_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global),
+ "$dst=memub(#$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) Rt=memub(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDub_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1) $dst=memub(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+// if (!Pv) Rt=memub(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDub_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1) $dst=memub(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) Rt=memub(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDub_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1.new) $dst=memub(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+// if (!Pv.new) Rt=memub(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDub_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1.new) $dst=memub(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDh_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global),
+ "$dst=memh(#$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) Rt=memh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1) $dst=memh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) Rt=memh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1) $dst=memh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) Rt=memh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1.new) $dst=memh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) Rt=memh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1.new) $dst=memh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDuh_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global),
+ "$dst=memuh(#$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) Rt=memuh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDuh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1) $dst=memuh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) Rt=memuh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDuh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1) $dst=memuh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) Rt=memuh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDuh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1.new) $dst=memuh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) Rt=memuh(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDuh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1.new) $dst=memuh(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def LDw_GP_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$global),
+ "$dst=memw(#$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) Rt=memw(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDw_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1) $dst=memw(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+// if (!Pv) Rt=memw(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDw_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1) $dst=memw(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) Rt=memw(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDw_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if ($src1.new) $dst=memw(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+// if (!Pv.new) Rt=memw(##global)
+let neverHasSideEffects = 1, isPredicated = 1 in
+def LDw_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$global),
+ "if (!$src1.new) $dst=memw(##$global)",
+ []>,
+ Requires<[HasV4T]>;
+
+
+
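+// Atomic loads from globals are selected to the plain GP-relative loads
+// above; naturally aligned loads are presumably single-copy atomic on this
+// target, with any required ordering handled by separately emitted fences.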
+def : Pat <(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i64 (LDd_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+def : Pat <(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i32 (LDw_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+def : Pat <(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i32 (LDuh_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+def : Pat <(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)),
+ (i32 (LDub_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memd(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))),
+ (i64 (LDd_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd
+let AddedComplexity = 100 in
+def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
+ (i1 (TFR_PdRs (i32 (LDb_GP_V4 tglobaladdr:$global))))>,
+ Requires<[HasV4T]>;
+
+// When the Interprocedural Global Variable optimizer realizes that a certain
+// global variable takes only two constant values, it shrinks the global to
+// a boolean. Catch those loads here with the extloadi1, sextloadi1, and
+// zextloadi1 patterns below.
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDb_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDub_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memub(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDub_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDh_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDh_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memuh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDuh_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress) -> memw(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))),
+ (i32 (LDw_GP_V4 tglobaladdr:$global))>,
+ Requires<[HasV4T]>;
+
+def : Pat <(atomic_load_64 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+def : Pat <(atomic_load_32 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+def : Pat <(atomic_load_16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+def : Pat <(atomic_load_8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memd(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i64 (load (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memub(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+
+// Map from load(globaladdress + x) -> memuh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memw(#foo + x)
+let AddedComplexity = 100 in
+def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset))),
+ (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>,
+ Requires<[HasV4T]>;
+
//===----------------------------------------------------------------------===//
// LD -
@@ -971,18 +1747,70 @@ def POST_LDriw_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2),
/// last operand.
///
-// Store doubleword.
// memd(Re=#U6)=Rtt
-// TODO: needs to be implemented
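+// In the "abs_set" forms below, the absolute address is also written back
+// into the base register (here $dst1), which is presumably why these stores
+// produce an IntRegs result.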
+def STrid_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins DoubleRegs:$src1, u6Imm:$src2),
+ "memd($dst1=#$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
+
+// memb(Re=#U6)=Rs
+def STrib_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins IntRegs:$src1, u6Imm:$src2),
+ "memb($dst1=#$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
+
+// memh(Re=#U6)=Rs
+def STrih_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins IntRegs:$src1, u6Imm:$src2),
+ "memh($dst1=#$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
+
+// memw(Re=#U6)=Rs
+def STriw_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins IntRegs:$src1, u6Imm:$src2),
+ "memw($dst1=#$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
+
+// memd(Re=##global)=Rtt
+def STrid_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins DoubleRegs:$src1, globaladdress:$src2),
+ "memd($dst1=##$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
+
+// memb(Re=##global)=Rs
+def STrib_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins IntRegs:$src1, globaladdress:$src2),
+ "memb($dst1=##$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
+
+// memh(Re=##global)=Rs
+def STrih_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins IntRegs:$src1, globaladdress:$src2),
+ "memh($dst1=##$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
+
+// memw(Re=##global)=Rs
+def STriw_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+ (ins IntRegs:$src1, globaladdress:$src2),
+ "memw($dst1=##$src2) = $src1",
+ []>,
+ Requires<[HasV4T]>;
-// memd(Rs+#s11:3)=Rtt
// memd(Rs+Ru<<#u2)=Rtt
let AddedComplexity = 10, isPredicable = 1 in
def STrid_indexed_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, DoubleRegs:$src4),
"memd($src1+$src2<<#$src3) = $src4",
- [(store DoubleRegs:$src4, (add IntRegs:$src1,
- (shl IntRegs:$src2, u2ImmPred:$src3)))]>,
+ [(store (i64 DoubleRegs:$src4),
+ (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2), u2ImmPred:$src3)))]>,
Requires<[HasV4T]>;
// memd(Ru<<#u2+#U6)=Rtt
@@ -990,9 +1818,9 @@ let AddedComplexity = 10 in
def STrid_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, DoubleRegs:$src4),
"memd($src1<<#$src2+#$src3) = $src4",
- [(store DoubleRegs:$src4, (shl IntRegs:$src1,
- (add u2ImmPred:$src2,
- u6ImmPred:$src3)))]>,
+ [(store (i64 DoubleRegs:$src4),
+ (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+ u6ImmPred:$src3))]>,
Requires<[HasV4T]>;
// memd(Rx++#s4:3)=Rtt
@@ -1009,8 +1837,9 @@ def STrid_shl_V4 : STInst<(outs),
// if ([!]Pv[.new]) memd(Rs+#u6:3)=Rtt
// if (Pv) memd(Rs+#u6:3)=Rtt
// if (Pv.new) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_cdnPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2),
"if ($src1.new) memd($addr) = $src2",
[]>,
@@ -1018,8 +1847,9 @@ def STrid_cdnPt_V4 : STInst<(outs),
// if (!Pv) memd(Rs+#u6:3)=Rtt
// if (!Pv.new) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_cdnNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2),
"if (!$src1.new) memd($addr) = $src2",
[]>,
@@ -1027,8 +1857,9 @@ def STrid_cdnNotPt_V4 : STInst<(outs),
// if (Pv) memd(Rs+#u6:3)=Rtt
// if (Pv.new) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_cdnPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3,
DoubleRegs:$src4),
"if ($src1.new) memd($src2+#$src3) = $src4",
@@ -1037,8 +1868,9 @@ def STrid_indexed_cdnPt_V4 : STInst<(outs),
// if (!Pv) memd(Rs+#u6:3)=Rtt
// if (!Pv.new) memd(Rs+#u6:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_cdnNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3,
DoubleRegs:$src4),
"if (!$src1.new) memd($src2+#$src3) = $src4",
@@ -1047,8 +1879,9 @@ def STrid_indexed_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memd(Rs+Ru<<#u2)=Rtt
// if (Pv) memd(Rs+Ru<<#u2)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_shl_cPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_shl_cPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
DoubleRegs:$src5),
"if ($src1) memd($src2+$src3<<#$src4) = $src5",
@@ -1056,24 +1889,27 @@ def STrid_indexed_shl_cPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memd(Rs+Ru<<#u2)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_shl_cdnPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_shl_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
DoubleRegs:$src5),
- "if ($src1) memd($src2+$src3<<#$src4) = $src5",
+ "if ($src1.new) memd($src2+$src3<<#$src4) = $src5",
[]>,
Requires<[HasV4T]>;
// if (!Pv) memd(Rs+Ru<<#u2)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_shl_cNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_shl_cNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
DoubleRegs:$src5),
"if (!$src1) memd($src2+$src3<<#$src4) = $src5",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) memd(Rs+Ru<<#u2)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def STrid_indexed_shl_cdnNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrid_indexed_shl_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
DoubleRegs:$src5),
"if (!$src1.new) memd($src2+$src3<<#$src4) = $src5",
@@ -1083,8 +1919,9 @@ def STrid_indexed_shl_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memd(Rx++#s4:3)=Rtt
// if (Pv) memd(Rx++#s4:3)=Rtt
// if (Pv.new) memd(Rx++#s4:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def POST_STdri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def POST_STdri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3,
s4_3Imm:$offset),
"if ($src1.new) memd($src3++#$offset) = $src2",
@@ -1094,8 +1931,9 @@ def POST_STdri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
// if (!Pv) memd(Rx++#s4:3)=Rtt
// if (!Pv.new) memd(Rx++#s4:3)=Rtt
-let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in
-def POST_STdri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst),
+let AddedComplexity = 10, neverHasSideEffects = 1,
+ isPredicated = 1 in
+def POST_STdri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3,
s4_3Imm:$offset),
"if (!$src1.new) memd($src3++#$offset) = $src2",
@@ -1105,15 +1943,12 @@ def POST_STdri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst),
// Store byte.
-// memb(Re=#U6)=Rt
-// TODO: needs to be implemented.
-// memb(Rs+#s11:0)=Rt
// memb(Rs+#u6:0)=#S8
let AddedComplexity = 10, isPredicable = 1 in
def STrib_imm_V4 : STInst<(outs),
(ins IntRegs:$src1, u6_0Imm:$src2, s8Imm:$src3),
"memb($src1+#$src2) = #$src3",
- [(truncstorei8 s8ImmPred:$src3, (add IntRegs:$src1,
+ [(truncstorei8 s8ImmPred:$src3, (add (i32 IntRegs:$src1),
u6_0ImmPred:$src2))]>,
Requires<[HasV4T]>;
@@ -1122,9 +1957,10 @@ let AddedComplexity = 10, isPredicable = 1 in
def STrib_indexed_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4),
"memb($src1+$src2<<#$src3) = $src4",
- [(truncstorei8 IntRegs:$src4, (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$src3)))]>,
+ [(truncstorei8 (i32 IntRegs:$src4),
+ (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$src3)))]>,
Requires<[HasV4T]>;
// memb(Ru<<#u2+#U6)=Rt
@@ -1132,9 +1968,9 @@ let AddedComplexity = 10 in
def STrib_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4),
"memb($src1<<#$src2+#$src3) = $src4",
- [(truncstorei8 IntRegs:$src4, (shl IntRegs:$src1,
- (add u2ImmPred:$src2,
- u6ImmPred:$src3)))]>,
+ [(truncstorei8 (i32 IntRegs:$src4),
+ (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+ u6ImmPred:$src3))]>,
Requires<[HasV4T]>;
// memb(Rx++#s4:0:circ(Mu))=Rt
@@ -1148,32 +1984,36 @@ def STrib_shl_V4 : STInst<(outs),
// if ([!]Pv[.new]) memb(#u6)=Rt
// if ([!]Pv[.new]) memb(Rs+#u6:0)=#S6
// if (Pv) memb(Rs+#u6:0)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_imm_cPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_imm_cPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4),
"if ($src1) memb($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) memb(Rs+#u6:0)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_imm_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_imm_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4),
"if ($src1.new) memb($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv) memb(Rs+#u6:0)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_imm_cNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_imm_cNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4),
"if (!$src1) memb($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) memb(Rs+#u6:0)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_imm_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_imm_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4),
"if (!$src1.new) memb($src2+#$src3) = #$src4",
[]>,
@@ -1182,8 +2022,9 @@ def STrib_imm_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memb(Rs+#u6:0)=Rt
// if (Pv) memb(Rs+#u6:0)=Rt
// if (Pv.new) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1.new) memb($addr) = $src2",
[]>,
@@ -1191,8 +2032,9 @@ def STrib_cdnPt_V4 : STInst<(outs),
// if (!Pv) memb(Rs+#u6:0)=Rt
// if (!Pv.new) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1.new) memb($addr) = $src2",
[]>,
@@ -1201,16 +2043,18 @@ def STrib_cdnNotPt_V4 : STInst<(outs),
// if (Pv) memb(Rs+#u6:0)=Rt
// if (!Pv) memb(Rs+#u6:0)=Rt
// if (Pv.new) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_indexed_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_indexed_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if ($src1.new) memb($src2+#$src3) = $src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) memb(Rs+#u6:0)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrib_indexed_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrib_indexed_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if (!$src1.new) memb($src2+#$src3) = $src4",
[]>,
@@ -1218,8 +2062,9 @@ def STrib_indexed_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Rt
// if (Pv) memb(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STrib_indexed_shl_cPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrib_indexed_shl_cPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if ($src1) memb($src2+$src3<<#$src4) = $src5",
@@ -1227,8 +2072,9 @@ def STrib_indexed_shl_cPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memb(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STrib_indexed_shl_cdnPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrib_indexed_shl_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if ($src1.new) memb($src2+$src3<<#$src4) = $src5",
@@ -1236,8 +2082,9 @@ def STrib_indexed_shl_cdnPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (!Pv) memb(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STrib_indexed_shl_cNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrib_indexed_shl_cNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if (!$src1) memb($src2+$src3<<#$src4) = $src5",
@@ -1245,8 +2092,9 @@ def STrib_indexed_shl_cNotPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memb(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STrib_indexed_shl_cdnNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrib_indexed_shl_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if (!$src1.new) memb($src2+$src3<<#$src4) = $src5",
@@ -1256,8 +2104,9 @@ def STrib_indexed_shl_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memb(Rx++#s4:0)=Rt
// if (Pv) memb(Rx++#s4:0)=Rt
// if (Pv.new) memb(Rx++#s4:0)=Rt
-let mayStore = 1, hasCtrlDep = 1 in
-def POST_STbri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1,
+ isPredicated = 1 in
+def POST_STbri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if ($src1.new) memb($src3++#$offset) = $src2",
[],"$src3 = $dst">,
@@ -1265,8 +2114,9 @@ def POST_STbri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
// if (!Pv) memb(Rx++#s4:0)=Rt
// if (!Pv.new) memb(Rx++#s4:0)=Rt
-let mayStore = 1, hasCtrlDep = 1 in
-def POST_STbri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1,
+ isPredicated = 1 in
+def POST_STbri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if (!$src1.new) memb($src3++#$offset) = $src2",
[],"$src3 = $dst">,
@@ -1274,20 +2124,15 @@ def POST_STbri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst),
// Store halfword.
-// memh(Re=#U6)=Rt.H
-// TODO: needs to be implemented
-
-// memh(Re=#U6)=Rt
// TODO: needs to be implemented
-
+// memh(Re=#U6)=Rt.H
// memh(Rs+#s11:1)=Rt.H
-// memh(Rs+#s11:1)=Rt
// memh(Rs+#u6:1)=#S8
let AddedComplexity = 10, isPredicable = 1 in
def STrih_imm_V4 : STInst<(outs),
(ins IntRegs:$src1, u6_1Imm:$src2, s8Imm:$src3),
"memh($src1+#$src2) = #$src3",
- [(truncstorei16 s8ImmPred:$src3, (add IntRegs:$src1,
+ [(truncstorei16 s8ImmPred:$src3, (add (i32 IntRegs:$src1),
u6_1ImmPred:$src2))]>,
Requires<[HasV4T]>;
@@ -1299,9 +2144,10 @@ let AddedComplexity = 10, isPredicable = 1 in
def STrih_indexed_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4),
"memh($src1+$src2<<#$src3) = $src4",
- [(truncstorei16 IntRegs:$src4, (add IntRegs:$src1,
- (shl IntRegs:$src2,
- u2ImmPred:$src3)))]>,
+ [(truncstorei16 (i32 IntRegs:$src4),
+ (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$src3)))]>,
Requires<[HasV4T]>;
// memh(Ru<<#u2+#U6)=Rt.H
@@ -1310,9 +2156,9 @@ let AddedComplexity = 10 in
def STrih_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4),
"memh($src1<<#$src2+#$src3) = $src4",
- [(truncstorei16 IntRegs:$src4, (shl IntRegs:$src1,
- (add u2ImmPred:$src2,
- u6ImmPred:$src3)))]>,
+ [(truncstorei16 (i32 IntRegs:$src4),
+ (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+ u6ImmPred:$src3))]>,
Requires<[HasV4T]>;
// memh(Rx++#s4:1:circ(Mu))=Rt.H
@@ -1323,42 +2169,42 @@ def STrih_shl_V4 : STInst<(outs),
// memh(Rx++Mu)=Rt
// memh(Rx++Mu:brev)=Rt.H
// memh(Rx++Mu:brev)=Rt
-// memh(gp+#u16:1)=Rt.H
// memh(gp+#u16:1)=Rt
-
-
-// Store halfword conditionally.
// if ([!]Pv[.new]) memh(#u6)=Rt.H
// if ([!]Pv[.new]) memh(#u6)=Rt
// if ([!]Pv[.new]) memh(Rs+#u6:1)=#S6
// if (Pv) memh(Rs+#u6:1)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_imm_cPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_imm_cPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4),
"if ($src1) memh($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) memh(Rs+#u6:1)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_imm_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_imm_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4),
"if ($src1.new) memh($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv) memh(Rs+#u6:1)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_imm_cNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_imm_cNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4),
"if (!$src1) memh($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) memh(Rs+#u6:1)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_imm_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_imm_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4),
"if (!$src1.new) memh($src2+#$src3) = #$src4",
[]>,
@@ -1370,8 +2216,9 @@ def STrih_imm_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt
// if (Pv) memh(Rs+#u6:1)=Rt
// if (Pv.new) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1.new) memh($addr) = $src2",
[]>,
@@ -1379,24 +2226,27 @@ def STrih_cdnPt_V4 : STInst<(outs),
// if (!Pv) memh(Rs+#u6:1)=Rt
// if (!Pv.new) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1.new) memh($addr) = $src2",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_indexed_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_indexed_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if ($src1.new) memh($src2+#$src3) = $src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) memh(Rs+#u6:1)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STrih_indexed_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STrih_indexed_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if (!$src1.new) memh($src2+#$src3) = $src4",
[]>,
@@ -1405,8 +2255,9 @@ def STrih_indexed_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt.H
// if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt
// if (Pv) memh(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STrih_indexed_shl_cPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrih_indexed_shl_cPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if ($src1) memh($src2+$src3<<#$src4) = $src5",
@@ -1414,7 +2265,9 @@ def STrih_indexed_shl_cPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memh(Rs+Ru<<#u2)=Rt
-def STrih_indexed_shl_cdnPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrih_indexed_shl_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if ($src1.new) memh($src2+$src3<<#$src4) = $src5",
@@ -1422,8 +2275,9 @@ def STrih_indexed_shl_cdnPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (!Pv) memh(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STrih_indexed_shl_cNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrih_indexed_shl_cNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if (!$src1) memh($src2+$src3<<#$src4) = $src5",
@@ -1431,8 +2285,9 @@ def STrih_indexed_shl_cNotPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memh(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STrih_indexed_shl_cdnNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STrih_indexed_shl_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if (!$src1.new) memh($src2+$src3<<#$src4) = $src5",
@@ -1445,8 +2300,9 @@ def STrih_indexed_shl_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt
// if (Pv) memh(Rx++#s4:1)=Rt
// if (Pv.new) memh(Rx++#s4:1)=Rt
-let mayStore = 1, hasCtrlDep = 1 in
-def POST_SThri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1,
+ isPredicated = 1 in
+def POST_SThri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if ($src1.new) memh($src3++#$offset) = $src2",
[],"$src3 = $dst">,
@@ -1454,8 +2310,9 @@ def POST_SThri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
// if (!Pv) memh(Rx++#s4:1)=Rt
// if (!Pv.new) memh(Rx++#s4:1)=Rt
-let mayStore = 1, hasCtrlDep = 1 in
-def POST_SThri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1,
+ isPredicated = 1 in
+def POST_SThri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if (!$src1.new) memh($src3++#$offset) = $src2",
[],"$src3 = $dst">,
@@ -1466,13 +2323,22 @@ def POST_SThri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst),
// memw(Re=#U6)=Rt
// TODO: Needs to be implemented.
-// memw(Rs+#s11:2)=Rt
+// Store predicate:
+let neverHasSideEffects = 1 in
+def STriw_pred_V4 : STInst2<(outs),
+ (ins MEMri:$addr, PredRegs:$src1),
+ "Error; should not emit",
+ []>,
+ Requires<[HasV4T]>;
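+
+// STriw_pred_V4 is presumably a pseudo for spilling a predicate register; the
+// "Error; should not emit" mnemonic indicates it must be expanded (e.g. via a
+// transfer through a general register) before code emission.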
+
+
// memw(Rs+#u6:2)=#S8
let AddedComplexity = 10, isPredicable = 1 in
def STriw_imm_V4 : STInst<(outs),
(ins IntRegs:$src1, u6_2Imm:$src2, s8Imm:$src3),
"memw($src1+#$src2) = #$src3",
- [(store s8ImmPred:$src3, (add IntRegs:$src1, u6_2ImmPred:$src2))]>,
+ [(store s8ImmPred:$src3, (add (i32 IntRegs:$src1),
+ u6_2ImmPred:$src2))]>,
Requires<[HasV4T]>;
// memw(Rs+Ru<<#u2)=Rt
@@ -1480,8 +2346,9 @@ let AddedComplexity = 10, isPredicable = 1 in
def STriw_indexed_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4),
"memw($src1+$src2<<#$src3) = $src4",
- [(store IntRegs:$src4, (add IntRegs:$src1,
- (shl IntRegs:$src2, u2ImmPred:$src3)))]>,
+ [(store (i32 IntRegs:$src4), (add (i32 IntRegs:$src1),
+ (shl (i32 IntRegs:$src2),
+ u2ImmPred:$src3)))]>,
Requires<[HasV4T]>;
// memw(Ru<<#u2+#U6)=Rt
@@ -1489,8 +2356,9 @@ let AddedComplexity = 10 in
def STriw_shl_V4 : STInst<(outs),
(ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4),
"memw($src1<<#$src2+#$src3) = $src4",
- [(store IntRegs:$src4, (shl IntRegs:$src1,
- (add u2ImmPred:$src2, u6ImmPred:$src3)))]>,
+ [(store (i32 IntRegs:$src4),
+ (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+ u6ImmPred:$src3))]>,
Requires<[HasV4T]>;
// memw(Rx++#s4:2)=Rt
@@ -1502,37 +2370,39 @@ def STriw_shl_V4 : STInst<(outs),
// Store word conditionally.
-// if ([!]Pv[.new]) memw(#u6)=Rt
-// TODO: Needs to be implemented.
// if ([!]Pv[.new]) memw(Rs+#u6:2)=#S6
// if (Pv) memw(Rs+#u6:2)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_imm_cPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_imm_cPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4),
"if ($src1) memw($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (Pv.new) memw(Rs+#u6:2)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_imm_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_imm_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4),
"if ($src1.new) memw($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv) memw(Rs+#u6:2)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_imm_cNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_imm_cNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4),
"if (!$src1) memw($src2+#$src3) = #$src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) memw(Rs+#u6:2)=#S6
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_imm_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_imm_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4),
"if (!$src1.new) memw($src2+#$src3) = #$src4",
[]>,
@@ -1541,8 +2411,9 @@ def STriw_imm_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memw(Rs+#u6:2)=Rt
// if (Pv) memw(Rs+#u6:2)=Rt
// if (Pv.new) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1.new) memw($addr) = $src2",
[]>,
@@ -1550,8 +2421,9 @@ def STriw_cdnPt_V4 : STInst<(outs),
// if (!Pv) memw(Rs+#u6:2)=Rt
// if (!Pv.new) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1.new) memw($addr) = $src2",
[]>,
@@ -1560,16 +2432,18 @@ def STriw_cdnNotPt_V4 : STInst<(outs),
// if (Pv) memw(Rs+#u6:2)=Rt
// if (!Pv) memw(Rs+#u6:2)=Rt
// if (Pv.new) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_indexed_cdnPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_indexed_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if ($src1.new) memw($src2+#$src3) = $src4",
[]>,
Requires<[HasV4T]>;
// if (!Pv.new) memw(Rs+#u6:2)=Rt
-let mayStore = 1, neverHasSideEffects = 1 in
-def STriw_indexed_cdnNotPt_V4 : STInst<(outs),
+let neverHasSideEffects = 1,
+ isPredicated = 1 in
+def STriw_indexed_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if (!$src1.new) memw($src2+#$src3) = $src4",
[]>,
@@ -1577,8 +2451,9 @@ def STriw_indexed_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Rt
// if (Pv) memw(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STriw_indexed_shl_cPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STriw_indexed_shl_cPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if ($src1) memw($src2+$src3<<#$src4) = $src5",
@@ -1586,8 +2461,9 @@ def STriw_indexed_shl_cPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memw(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STriw_indexed_shl_cdnPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STriw_indexed_shl_cdnPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if ($src1.new) memw($src2+$src3<<#$src4) = $src5",
@@ -1595,8 +2471,9 @@ def STriw_indexed_shl_cdnPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (!Pv) memw(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STriw_indexed_shl_cNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STriw_indexed_shl_cNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if (!$src1) memw($src2+$src3<<#$src4) = $src5",
@@ -1604,8 +2481,9 @@ def STriw_indexed_shl_cNotPt_V4 : STInst<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memw(Rs+Ru<<#u2)=Rt
-let mayStore = 1, AddedComplexity = 10 in
-def STriw_indexed_shl_cdnNotPt_V4 : STInst<(outs),
+let AddedComplexity = 10,
+ isPredicated = 1 in
+def STriw_indexed_shl_cdnNotPt_V4 : STInst2<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
"if (!$src1.new) memw($src2+$src3<<#$src4) = $src5",
@@ -1615,8 +2493,9 @@ def STriw_indexed_shl_cdnNotPt_V4 : STInst<(outs),
// if ([!]Pv[.new]) memw(Rx++#s4:2)=Rt
// if (Pv) memw(Rx++#s4:2)=Rt
// if (Pv.new) memw(Rx++#s4:2)=Rt
-let mayStore = 1, hasCtrlDep = 1 in
-def POST_STwri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1,
+ isPredicated = 1 in
+def POST_STwri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if ($src1.new) memw($src3++#$offset) = $src2",
[],"$src3 = $dst">,
@@ -1624,14 +2503,456 @@ def POST_STwri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst),
// if (!Pv) memw(Rx++#s4:2)=Rt
// if (!Pv.new) memw(Rx++#s4:2)=Rt
-let mayStore = 1, hasCtrlDep = 1 in
-def POST_STwri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst),
+let hasCtrlDep = 1,
+ isPredicated = 1 in
+def POST_STwri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if (!$src1.new) memw($src3++#$offset) = $src2",
[],"$src3 = $dst">,
Requires<[HasV4T]>;
+/// Store to a global address
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STrid_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src),
+ "memd(#$global+$offset) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrid_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ DoubleRegs:$src2),
+ "if ($src1) memd(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrid_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ DoubleRegs:$src2),
+ "if (!$src1) memd(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrid_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ DoubleRegs:$src2),
+ "if ($src1.new) memd(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrid_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ DoubleRegs:$src2),
+ "if (!$src1.new) memd(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STrib_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src),
+ "memb(#$global+$offset) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1) memb(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1) memb(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1.new) memb(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrib_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1.new) memb(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STrih_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src),
+ "memh(#$global+$offset) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1) memh(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1) memh(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1.new) memh(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STrih_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1.new) memh(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STriw_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src),
+ "memw(#$global+$offset) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1) memw(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1) memw(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1.new) memw(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STriw_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1.new) memw(##$global+$offset) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// memd(#global)=Rtt
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STd_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, DoubleRegs:$src),
+ "memd(#$global) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) memd(##global) = Rtt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STd_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2),
+ "if ($src1) memd(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) memd(##global) = Rtt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STd_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2),
+ "if (!$src1) memd(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) memd(##global) = Rtt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STd_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2),
+ "if ($src1.new) memd(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) memd(##global) = Rtt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STd_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2),
+ "if (!$src1.new) memd(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// memb(#global)=Rt
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STb_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, IntRegs:$src),
+ "memb(#$global) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) memb(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STb_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1) memb(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) memb(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STb_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1) memb(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) memb(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STb_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1.new) memb(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) memb(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STb_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1.new) memb(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// memh(#global)=Rt
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STh_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, IntRegs:$src),
+ "memh(#$global) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) memh(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STh_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1) memh(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) memh(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STh_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1) memh(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) memh(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STh_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1.new) memh(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) memh(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STh_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1.new) memh(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// memw(#global)=Rt
+let isPredicable = 1, neverHasSideEffects = 1 in
+def STw_GP_V4 : STInst2<(outs),
+ (ins globaladdress:$global, IntRegs:$src),
+ "memw(#$global) = $src",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) memw(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STw_GP_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1) memw(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) memw(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STw_GP_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1) memw(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) memw(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STw_GP_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1.new) memw(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) memw(##global) = Rt
+let neverHasSideEffects = 1, isPredicated = 1 in
+def STw_GP_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1.new) memw(##$global) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+// 64 bit atomic store
+def : Pat <(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global),
+ (i64 DoubleRegs:$src1)),
+ (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress) -> memd(#foo)
+let AddedComplexity = 100 in
+def : Pat <(store (i64 DoubleRegs:$src1),
+ (HexagonCONST32_GP tglobaladdr:$global)),
+ (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// 8 bit atomic store
+def : Pat < (atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global),
+ (i32 IntRegs:$src1)),
+ (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1),
+ (HexagonCONST32_GP tglobaladdr:$global)),
+ (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1"
+// to "r0 = 1; memw(#foo) = r0"
+let AddedComplexity = 100 in
+def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
+ (STb_GP_V4 tglobaladdr:$global, (TFRI 1))>,
+ Requires<[HasV4T]>;
+
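+// 16 bit atomic store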
+def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global),
+ (i32 IntRegs:$src1)),
+ (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1),
+ (HexagonCONST32_GP tglobaladdr:$global)),
+ (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// 32 bit atomic store
+def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global),
+ (i32 IntRegs:$src1)),
+ (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress) -> memw(#foo)
+let AddedComplexity = 100 in
+def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)),
+ (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
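+// For illustration, a plain global store in IR, e.g.
+//   store i32 %v, i32* @foo
+// (assuming @foo lowers to a HexagonCONST32_GP node), should select
+// STw_GP_V4 via the mapping above and emit "memw(#foo) = r0" directly,
+// instead of first materializing the address in a register.
+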
+def : Pat<(atomic_store_64 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i64 DoubleRegs:$src1)),
+ (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i64 DoubleRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_32 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_16 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_8 (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset),
+ (i32 IntRegs:$src1)),
+ (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memd(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(store (i64 DoubleRegs:$src1),
+ (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i64 DoubleRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1),
+ (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1),
+ (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memw(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(store (i32 IntRegs:$src1),
+ (add (HexagonCONST32_GP tglobaladdr:$global),
+ u16ImmPred:$offset)),
+ (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset,
+ (i32 IntRegs:$src1))>,
+ Requires<[HasV4T]>;
+
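+// Similarly for the offset forms above: an address of the shape
+//   (add (HexagonCONST32_GP @foo), #8)
+// (a global plus a constant offset fitting u16ImmPred) should select
+// STriw_GP_V4 and emit something like "memw(##foo+8) = r0".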
+
+
//===----------------------------------------------------------------------===
// ST -
//===----------------------------------------------------------------------===
@@ -1696,11 +3017,19 @@ def STrib_GP_nv_V4 : NVInst_V4<(outs),
[]>,
Requires<[HasV4T]>;
+// memb(#global)=Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STb_GP_nv_V4 : NVInst_V4<(outs),
+ (ins globaladdress:$global, IntRegs:$src),
+ "memb(#$global) = $src.new",
+ []>,
+ Requires<[HasV4T]>;
// Store new-value byte conditionally.
// if ([!]Pv[.new]) memb(#u6)=Nt.new
// if (Pv) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1) memb($addr) = $src2.new",
@@ -1708,7 +3037,8 @@ def STrib_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1.new) memb($addr) = $src2.new",
@@ -1716,7 +3046,8 @@ def STrib_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1) memb($addr) = $src2.new",
@@ -1724,7 +3055,8 @@ def STrib_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1.new) memb($addr) = $src2.new",
@@ -1732,7 +3064,8 @@ def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_indexed_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if ($src1) memb($src2+#$src3) = $src4.new",
@@ -1740,7 +3073,8 @@ def STrib_indexed_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_indexed_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if ($src1.new) memb($src2+#$src3) = $src4.new",
@@ -1748,7 +3082,8 @@ def STrib_indexed_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_indexed_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if (!$src1) memb($src2+#$src3) = $src4.new",
@@ -1756,7 +3091,8 @@ def STrib_indexed_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memb(Rs+#u6:0)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrib_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4),
"if (!$src1.new) memb($src2+#$src3) = $src4.new",
@@ -1766,7 +3102,8 @@ def STrib_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Nt.new
// if (Pv) memb(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrib_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1775,7 +3112,8 @@ def STrib_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memb(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrib_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1784,7 +3122,8 @@ def STrib_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memb(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrib_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1793,7 +3132,8 @@ def STrib_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memb(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrib_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1803,7 +3143,8 @@ def STrib_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memb(Rx++#s4:0)=Nt.new
// if (Pv) memb(Rx++#s4:0)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if ($src1) memb($src3++#$offset) = $src2.new",
@@ -1811,7 +3152,8 @@ def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) memb(Rx++#s4:0)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if ($src1.new) memb($src3++#$offset) = $src2.new",
@@ -1819,7 +3161,8 @@ def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) memb(Rx++#s4:0)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if (!$src1) memb($src3++#$offset) = $src2.new",
@@ -1827,7 +3170,8 @@ def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) memb(Rx++#s4:0)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STbri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset),
"if (!$src1.new) memb($src3++#$offset) = $src2.new",
@@ -1889,6 +3233,14 @@ def STrih_GP_nv_V4 : NVInst_V4<(outs),
[]>,
Requires<[HasV4T]>;
+// memh(#global)=Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STh_GP_nv_V4 : NVInst_V4<(outs),
+ (ins globaladdress:$global, IntRegs:$src),
+ "memh(#$global) = $src.new",
+ []>,
+ Requires<[HasV4T]>;
+
// Store new-value halfword conditionally.
@@ -1896,7 +3248,8 @@ def STrih_GP_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memh(Rs+#u6:1)=Nt.new
// if (Pv) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1) memh($addr) = $src2.new",
@@ -1904,7 +3257,8 @@ def STrih_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1.new) memh($addr) = $src2.new",
@@ -1912,7 +3266,8 @@ def STrih_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1) memh($addr) = $src2.new",
@@ -1920,7 +3275,8 @@ def STrih_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1.new) memh($addr) = $src2.new",
@@ -1928,7 +3284,8 @@ def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_indexed_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if ($src1) memh($src2+#$src3) = $src4.new",
@@ -1936,7 +3293,8 @@ def STrih_indexed_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_indexed_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if ($src1.new) memh($src2+#$src3) = $src4.new",
@@ -1944,7 +3302,8 @@ def STrih_indexed_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_indexed_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if (!$src1) memh($src2+#$src3) = $src4.new",
@@ -1952,7 +3311,8 @@ def STrih_indexed_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memh(Rs+#u6:1)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STrih_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4),
"if (!$src1.new) memh($src2+#$src3) = $src4.new",
@@ -1961,7 +3321,8 @@ def STrih_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Nt.new
// if (Pv) memh(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrih_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1970,7 +3331,8 @@ def STrih_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memh(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrih_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1979,7 +3341,8 @@ def STrih_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memh(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrih_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1988,7 +3351,8 @@ def STrih_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memh(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STrih_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -1998,7 +3362,8 @@ def STrih_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memh(Rx++#s4:1)=Nt.new
// if (Pv) memh(Rx++#s4:1)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if ($src1) memh($src3++#$offset) = $src2.new",
@@ -2006,7 +3371,8 @@ def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) memh(Rx++#s4:1)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if ($src1.new) memh($src3++#$offset) = $src2.new",
@@ -2014,7 +3380,8 @@ def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) memh(Rx++#s4:1)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if (!$src1) memh($src3++#$offset) = $src2.new",
@@ -2022,7 +3389,8 @@ def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) memh(Rx++#s4:1)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_SThri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset),
"if (!$src1.new) memh($src3++#$offset) = $src2.new",
@@ -2085,6 +3453,12 @@ def STriw_GP_nv_V4 : NVInst_V4<(outs),
[]>,
Requires<[HasV4T]>;
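+// memw(#global)=Nt.new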
+let mayStore = 1, neverHasSideEffects = 1 in
+def STw_GP_nv_V4 : NVInst_V4<(outs),
+ (ins globaladdress:$global, IntRegs:$src),
+ "memw(#$global) = $src.new",
+ []>,
+ Requires<[HasV4T]>;
// Store new-value word conditionally.
@@ -2092,7 +3466,8 @@ def STriw_GP_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memw(Rs+#u6:2)=Nt.new
// if (Pv) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1) memw($addr) = $src2.new",
@@ -2100,7 +3475,8 @@ def STriw_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if ($src1.new) memw($addr) = $src2.new",
@@ -2108,7 +3484,8 @@ def STriw_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1) memw($addr) = $src2.new",
@@ -2116,7 +3493,8 @@ def STriw_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
"if (!$src1.new) memw($addr) = $src2.new",
@@ -2124,7 +3502,8 @@ def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_indexed_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if ($src1) memw($src2+#$src3) = $src4.new",
@@ -2132,7 +3511,8 @@ def STriw_indexed_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_indexed_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if ($src1.new) memw($src2+#$src3) = $src4.new",
@@ -2140,7 +3520,8 @@ def STriw_indexed_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_indexed_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if (!$src1) memw($src2+#$src3) = $src4.new",
@@ -2148,7 +3529,8 @@ def STriw_indexed_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memw(Rs+#u6:2)=Nt.new
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, neverHasSideEffects = 1,
+ isPredicated = 1 in
def STriw_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4),
"if (!$src1.new) memw($src2+#$src3) = $src4.new",
@@ -2158,7 +3540,8 @@ def STriw_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Nt.new
// if (Pv) memw(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STriw_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -2167,7 +3550,8 @@ def STriw_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (Pv.new) memw(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STriw_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -2176,7 +3560,8 @@ def STriw_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv) memw(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STriw_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -2185,7 +3570,8 @@ def STriw_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs),
Requires<[HasV4T]>;
// if (!Pv.new) memw(Rs+Ru<<#u2)=Nt.new
-let mayStore = 1, AddedComplexity = 10 in
+let mayStore = 1, AddedComplexity = 10,
+ isPredicated = 1 in
def STriw_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
IntRegs:$src5),
@@ -2195,7 +3581,8 @@ def STriw_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs),
// if ([!]Pv[.new]) memw(Rx++#s4:2)=Nt.new
// if (Pv) memw(Rx++#s4:2)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if ($src1) memw($src3++#$offset) = $src2.new",
@@ -2203,7 +3590,8 @@ def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (Pv.new) memw(Rx++#s4:2)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if ($src1.new) memw($src3++#$offset) = $src2.new",
@@ -2211,7 +3599,8 @@ def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv) memw(Rx++#s4:2)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if (!$src1) memw($src3++#$offset) = $src2.new",
@@ -2219,7 +3608,8 @@ def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
// if (!Pv.new) memw(Rx++#s4:2)=Nt.new
-let mayStore = 1, hasCtrlDep = 1 in
+let mayStore = 1, hasCtrlDep = 1,
+ isPredicated = 1 in
def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
(ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset),
"if (!$src1.new) memw($src3++#$offset) = $src2.new",
@@ -2227,6 +3617,199 @@ def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
Requires<[HasV4T]>;
+
+// if (Pv) memb(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STb_GP_cPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1) memb(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) memb(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STb_GP_cNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1) memb(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) memb(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STb_GP_cdnPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1.new) memb(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) memb(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STb_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1.new) memb(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) memh(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STh_GP_cPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1) memh(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) memh(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STh_GP_cNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1) memh(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) memh(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STh_GP_cdnPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1.new) memh(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) memh(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STh_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1.new) memh(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv) memw(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STw_GP_cPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1) memw(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv) memw(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STw_GP_cNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1) memw(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (Pv.new) memw(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STw_GP_cdnPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if ($src1.new) memw(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+// if (!Pv.new) memw(##global) = Nt.new
+let mayStore = 1, neverHasSideEffects = 1 in
+def STw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2),
+ "if (!$src1.new) memw(##$global) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
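+// if ([!]Pv[.new]) mem[bhw](##global+#offset) = Nt.new
+// These fill out the predicated new-value matrix for global+offset
+// addresses; e.g. "if (p0.new) memw(##foo+4) = r2.new" lets the compare,
+// the producer of r2, and the store travel in a single packet (assuming
+// the packetizer can co-locate them).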
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrib_GP_cPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1) memb(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrib_GP_cNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1) memb(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrib_GP_cdnPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1.new) memb(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrib_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1.new) memb(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrih_GP_cPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1) memh(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrih_GP_cNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1) memh(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrih_GP_cdnPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1.new) memh(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STrih_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1.new) memh(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STriw_GP_cPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1) memw(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STriw_GP_cNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1) memw(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STriw_GP_cdnPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if ($src1.new) memw(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
+let mayStore = 1, neverHasSideEffects = 1 in
+def STriw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset,
+ IntRegs:$src2),
+ "if (!$src1.new) memw(##$global+$offset) = $src2.new",
+ []>,
+ Requires<[HasV4T]>;
+
//===----------------------------------------------------------------------===//
// NV/ST -
//===----------------------------------------------------------------------===//
@@ -2253,7 +3836,8 @@ multiclass NVJ_type_basic_reg<string NotStr, string OpcStr, string TakenStr> {
Requires<[HasV4T]>;
}
-multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr, string TakenStr> {
+multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr,
+ string TakenStr> {
def _ie_nv_V4 : NVInst_V4<(outs),
(ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
!strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
@@ -2307,7 +3891,8 @@ multiclass NVJ_type_basic_neg<string NotStr, string OpcStr, string TakenStr> {
Requires<[HasV4T]>;
}
-multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr, string TakenStr> {
+multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr,
+ string TakenStr> {
def _ie_nv_V4 : NVInst_V4<(outs),
(ins IntRegs:$src1, u1Imm:$src2, brtarget:$offset),
!strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
@@ -2416,16 +4001,18 @@ let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in {
def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3),
"$dst = add($src1, add($src2, #$src3))",
- [(set IntRegs:$dst,
- (add IntRegs:$src1, (add IntRegs:$src2, s6ImmPred:$src3)))]>,
+ [(set (i32 IntRegs:$dst),
+ (add (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
+ s6ImmPred:$src3)))]>,
Requires<[HasV4T]>;
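
// For illustration: a DAG such as (add r1, (add r2, #5)), with #5 in
// s6ImmPred range, still folds into the single instruction
// "r0 = add(r1, add(r2, #5))"; the explicit (i32 ...) wrappers only pin
// down the value type so selection stays unambiguous now that i64
// DoubleRegs patterns coexist in this file.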
// Rd=add(Rs,sub(#s6,Ru))
def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3),
"$dst = add($src1, sub(#$src2, $src3))",
- [(set IntRegs:$dst,
- (add IntRegs:$src1, (sub s6ImmPred:$src2, IntRegs:$src3)))]>,
+ [(set (i32 IntRegs:$dst),
+ (add (i32 IntRegs:$src1), (sub s6ImmPred:$src2,
+ (i32 IntRegs:$src3))))]>,
Requires<[HasV4T]>;
// Generates the same instruction as ADDr_SUBri_V4 but matches different
@@ -2434,8 +4021,9 @@ def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst),
def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3),
"$dst = add($src1, sub(#$src2, $src3))",
- [(set IntRegs:$dst,
- (sub (add IntRegs:$src1, s6ImmPred:$src2), IntRegs:$src3))]>,
+ [(set (i32 IntRegs:$dst),
+ (sub (add (i32 IntRegs:$src1), s6ImmPred:$src2),
+ (i32 IntRegs:$src3)))]>,
Requires<[HasV4T]>;
@@ -2451,16 +4039,16 @@ def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst),
def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, DoubleRegs:$src2),
"$dst = and($src1, ~$src2)",
- [(set DoubleRegs:$dst, (and DoubleRegs:$src1,
- (not DoubleRegs:$src2)))]>,
+ [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
+ (not (i64 DoubleRegs:$src2))))]>,
Requires<[HasV4T]>;
// Rdd=or(Rtt,~Rss)
def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, DoubleRegs:$src2),
"$dst = or($src1, ~$src2)",
- [(set DoubleRegs:$dst,
- (or DoubleRegs:$src1, (not DoubleRegs:$src2)))]>,
+ [(set (i64 DoubleRegs:$dst),
+ (or (i64 DoubleRegs:$src1), (not (i64 DoubleRegs:$src2))))]>,
Requires<[HasV4T]>;
@@ -2469,8 +4057,9 @@ def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
"$dst ^= xor($src2, $src3)",
- [(set DoubleRegs:$dst,
- (xor DoubleRegs:$src1, (xor DoubleRegs:$src2, DoubleRegs:$src3)))],
+ [(set (i64 DoubleRegs:$dst),
+ (xor (i64 DoubleRegs:$src1), (xor (i64 DoubleRegs:$src2),
+ (i64 DoubleRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2480,8 +4069,9 @@ def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst),
def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3),
"$dst = or($src1, and($src2, #$src3))",
- [(set IntRegs:$dst,
- (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ s10ImmPred:$src3)))],
"$src2 = $dst">,
Requires<[HasV4T]>;
@@ -2490,8 +4080,9 @@ def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst),
def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst &= and($src2, $src3)",
- [(set IntRegs:$dst,
- (and IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2499,8 +4090,9 @@ def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst |= and($src2, $src3)",
- [(set IntRegs:$dst,
- (or IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2508,8 +4100,9 @@ def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst ^= and($src2, $src3)",
- [(set IntRegs:$dst,
- (xor IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2518,8 +4111,9 @@ def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst &= and($src2, ~$src3)",
- [(set IntRegs:$dst,
- (and IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))],
+ [(set (i32 IntRegs:$dst),
+ (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ (not (i32 IntRegs:$src3)))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2527,8 +4121,9 @@ def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst |= and($src2, ~$src3)",
- [(set IntRegs:$dst,
- (or IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))],
+ [(set (i32 IntRegs:$dst),
+ (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ (not (i32 IntRegs:$src3)))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2536,8 +4131,9 @@ def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst ^= and($src2, ~$src3)",
- [(set IntRegs:$dst,
- (xor IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))],
+ [(set (i32 IntRegs:$dst),
+ (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ (not (i32 IntRegs:$src3)))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2546,8 +4142,9 @@ def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst &= or($src2, $src3)",
- [(set IntRegs:$dst,
- (and IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (and (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2555,8 +4152,9 @@ def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst |= or($src2, $src3)",
- [(set IntRegs:$dst,
- (or IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2564,8 +4162,9 @@ def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst ^= or($src2, $src3)",
- [(set IntRegs:$dst,
- (xor IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (xor (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2574,8 +4173,9 @@ def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst &= xor($src2, $src3)",
- [(set IntRegs:$dst,
- (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2583,8 +4183,9 @@ def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst |= xor($src2, $src3)",
- [(set IntRegs:$dst,
- (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (or (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2592,8 +4193,9 @@ def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
"$dst ^= xor($src2, $src3)",
- [(set IntRegs:$dst,
- (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (xor (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2601,8 +4203,9 @@ def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3),
"$dst |= and($src2, #$src3)",
- [(set IntRegs:$dst,
- (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+ s10ImmPred:$src3)))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2610,8 +4213,9 @@ def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst),
def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3),
"$dst |= or($src2, #$src3)",
- [(set IntRegs:$dst,
- (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+ s10ImmPred:$src3)))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2663,8 +4267,9 @@ def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst),
def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst),
(ins u6Imm:$src1, IntRegs:$src2, u6Imm:$src3),
"$dst = add(#$src1, mpyi($src2, #$src3))",
- [(set IntRegs:$dst,
- (add (mul IntRegs:$src2, u6ImmPred:$src3), u6ImmPred:$src1))]>,
+ [(set (i32 IntRegs:$dst),
+ (add (mul (i32 IntRegs:$src2), u6ImmPred:$src3),
+ u6ImmPred:$src1))]>,
Requires<[HasV4T]>;
// Rd=add(#u6,mpyi(Rs,Rt))
@@ -2672,32 +4277,36 @@ def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst),
def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst),
(ins u6Imm:$src1, IntRegs:$src2, IntRegs:$src3),
"$dst = add(#$src1, mpyi($src2, $src3))",
- [(set IntRegs:$dst,
- (add (mul IntRegs:$src2, IntRegs:$src3), u6ImmPred:$src1))]>,
+ [(set (i32 IntRegs:$dst),
+ (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+ u6ImmPred:$src1))]>,
Requires<[HasV4T]>;
// Rd=add(Ru,mpyi(#u6:2,Rs))
def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3),
"$dst = add($src1, mpyi(#$src2, $src3))",
- [(set IntRegs:$dst,
- (add IntRegs:$src1, (mul IntRegs:$src3, u6_2ImmPred:$src2)))]>,
+ [(set (i32 IntRegs:$dst),
+ (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src3),
+ u6_2ImmPred:$src2)))]>,
Requires<[HasV4T]>;
// Rd=add(Ru,mpyi(Rs,#u6))
def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, u6Imm:$src3),
"$dst = add($src1, mpyi($src2, #$src3))",
- [(set IntRegs:$dst,
- (add IntRegs:$src1, (mul IntRegs:$src2, u6ImmPred:$src3)))]>,
+ [(set (i32 IntRegs:$dst),
+ (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+ u6ImmPred:$src3)))]>,
Requires<[HasV4T]>;
// Rx=add(Ru,mpyi(Rx,Rs))
def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
"$dst = add($src1, mpyi($src2, $src3))",
- [(set IntRegs:$dst,
- (add IntRegs:$src1, (mul IntRegs:$src2, IntRegs:$src3)))],
+ [(set (i32 IntRegs:$dst),
+ (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src2 = $dst">,
Requires<[HasV4T]>;
@@ -2745,8 +4354,9 @@ def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst),
def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = add(#$src1, asl($src2, #$src3))",
- [(set IntRegs:$dst,
- (add (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (add (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
@@ -2754,8 +4364,9 @@ def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = add(#$src1, lsr($src2, #$src3))",
- [(set IntRegs:$dst,
- (add (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (add (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
@@ -2763,8 +4374,9 @@ def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = sub(#$src1, asl($src2, #$src3))",
- [(set IntRegs:$dst,
- (sub (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (sub (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
@@ -2772,8 +4384,9 @@ def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = sub(#$src1, lsr($src2, #$src3))",
- [(set IntRegs:$dst,
- (sub (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (sub (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
@@ -2783,8 +4396,9 @@ def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = and(#$src1, asl($src2, #$src3))",
- [(set IntRegs:$dst,
- (and (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (and (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
@@ -2792,26 +4406,31 @@ def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = and(#$src1, lsr($src2, #$src3))",
- [(set IntRegs:$dst,
- (and (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (and (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
//Rx=or(#u8,asl(Rx,#U5))
+let AddedComplexity = 30 in
def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = or(#$src1, asl($src2, #$src3))",
- [(set IntRegs:$dst,
- (or (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (or (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
//Rx=or(#u8,lsr(Rx,#U5))
+let AddedComplexity = 30 in
def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
(ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3),
"$dst = or(#$src1, lsr($src2, #$src3))",
- [(set IntRegs:$dst,
- (or (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))],
+ [(set (i32 IntRegs:$dst),
+ (or (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+ u8ImmPred:$src1))],
"$src2 = $dst">,
Requires<[HasV4T]>;
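
// Note on AddedComplexity = 30 above: when one of these or-with-shift
// patterns and the generic shl/srl-plus-or patterns both match the same
// DAG, TableGen prefers the higher-complexity pattern, so the fused form
// wins during selection.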
@@ -2820,7 +4439,8 @@ def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
//Rd=lsl(#s6,Rt)
def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2),
"$dst = lsl(#$src1, $src2)",
- [(set IntRegs:$dst, (shl s6ImmPred:$src1, IntRegs:$src2))]>,
+ [(set (i32 IntRegs:$dst), (shl s6ImmPred:$src1,
+ (i32 IntRegs:$src2)))]>,
Requires<[HasV4T]>;
@@ -2829,8 +4449,9 @@ def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2),
def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
"$dst ^= asl($src2, $src3)",
- [(set DoubleRegs:$dst,
- (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))],
+ [(set (i64 DoubleRegs:$dst),
+ (xor (i64 DoubleRegs:$src1), (shl (i64 DoubleRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2838,8 +4459,9 @@ def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
"$dst ^= asr($src2, $src3)",
- [(set DoubleRegs:$dst,
- (xor DoubleRegs:$src1, (sra DoubleRegs:$src2, IntRegs:$src3)))],
+ [(set (i64 DoubleRegs:$dst),
+ (xor (i64 DoubleRegs:$src1), (sra (i64 DoubleRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2847,8 +4469,9 @@ def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
"$dst ^= lsl($src2, $src3)",
- [(set DoubleRegs:$dst,
- (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))],
+ [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
+ (shl (i64 DoubleRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2856,8 +4479,9 @@ def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
(ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
"$dst ^= lsr($src2, $src3)",
- [(set DoubleRegs:$dst,
- (xor DoubleRegs:$src1, (srl DoubleRegs:$src2, IntRegs:$src3)))],
+ [(set (i64 DoubleRegs:$dst),
+ (xor (i64 DoubleRegs:$src1), (srl (i64 DoubleRegs:$src2),
+ (i32 IntRegs:$src3))))],
"$src1 = $dst">,
Requires<[HasV4T]>;
@@ -2903,16 +4527,16 @@ let AddedComplexity = 30 in
def MEMw_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_2Imm:$offset, m6Imm:$addend),
"Error; should not emit",
- [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)),
-m6ImmPred:$addend),
- (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+ [(store (add (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)),
+ m6ImmPred:$addend),
+ (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memw(Rs+#u6:2) += #U5
let AddedComplexity = 30 in
def MEMw_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$addend),
- "memw($base+#$offset) += $addend",
+ "memw($base+#$offset) += #$addend",
[]>,
Requires<[HasV4T, UseMEMOP]>;
@@ -2920,7 +4544,7 @@ def MEMw_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs),
let AddedComplexity = 30 in
def MEMw_SUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$subend),
- "memw($base+#$offset) -= $subend",
+ "memw($base+#$offset) -= #$subend",
[]>,
Requires<[HasV4T, UseMEMOP]>;
@@ -2929,9 +4553,9 @@ let AddedComplexity = 30 in
def MEMw_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$addend),
"memw($base+#$offset) += $addend",
- [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)),
-IntRegs:$addend),
- (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+ [(store (add (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)),
+ (i32 IntRegs:$addend)),
+ (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memw(Rs+#u6:2) -= Rt
@@ -2939,19 +4563,19 @@ let AddedComplexity = 30 in
def MEMw_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$subend),
"memw($base+#$offset) -= $subend",
- [(store (sub (load (add IntRegs:$base, u6_2ImmPred:$offset)),
-IntRegs:$subend),
- (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+ [(store (sub (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)),
+ (i32 IntRegs:$subend)),
+ (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memw(Rs+#u6:2) &= Rt
let AddedComplexity = 30 in
def MEMw_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$andend),
- "memw($base+#$offset) += $andend",
- [(store (and (load (add IntRegs:$base, u6_2ImmPred:$offset)),
-IntRegs:$andend),
- (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+ "memw($base+#$offset) &= $andend",
+ [(store (and (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)),
+ (i32 IntRegs:$andend)),
+ (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memw(Rs+#u6:2) |= Rt
@@ -2959,9 +4583,9 @@ let AddedComplexity = 30 in
def MEMw_ORr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$orend),
"memw($base+#$offset) |= $orend",
- [(store (or (load (add IntRegs:$base, u6_2ImmPred:$offset)),
- IntRegs:$orend),
- (add IntRegs:$base, u6_2ImmPred:$offset))]>,
+ [(store (or (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)),
+ (i32 IntRegs:$orend)),
+ (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
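
// For illustration, a read-modify-write sequence such as
//   %old = load i32, i32* %p
//   %new = or i32 %old, %v
//   store i32 %new, i32* %p
// with %p of the form add(Rs, #u6:2) should now collapse into the single
// memop "memw(Rs+#u6:2) |= Rt", provided UseMEMOP is enabled.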
// MEMw_ADDSUBi_V4:
@@ -2996,7 +4620,7 @@ let AddedComplexity = 30 in
def MEMw_ADDr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$addend),
"memw($addr) += $addend",
- [(store (add (load ADDRriU6_2:$addr), IntRegs:$addend),
+ [(store (add (load ADDRriU6_2:$addr), (i32 IntRegs:$addend)),
ADDRriU6_2:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
@@ -3005,7 +4629,7 @@ let AddedComplexity = 30 in
def MEMw_SUBr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$subend),
"memw($addr) -= $subend",
- [(store (sub (load ADDRriU6_2:$addr), IntRegs:$subend),
+ [(store (sub (load ADDRriU6_2:$addr), (i32 IntRegs:$subend)),
ADDRriU6_2:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
@@ -3014,7 +4638,7 @@ let AddedComplexity = 30 in
def MEMw_ANDr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$andend),
"memw($addr) &= $andend",
- [(store (and (load ADDRriU6_2:$addr), IntRegs:$andend),
+ [(store (and (load ADDRriU6_2:$addr), (i32 IntRegs:$andend)),
ADDRriU6_2:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
@@ -3023,8 +4647,8 @@ let AddedComplexity = 30 in
def MEMw_ORr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$orend),
"memw($addr) |= $orend",
- [(store (or (load ADDRriU6_2:$addr), IntRegs:$orend),
-ADDRriU6_2:$addr)]>,
+ [(store (or (load ADDRriU6_2:$addr), (i32 IntRegs:$orend)),
+ ADDRriU6_2:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
//===----------------------------------------------------------------------===//
@@ -3060,10 +4684,10 @@ let AddedComplexity = 30 in
def MEMh_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_1Imm:$offset, m6Imm:$addend),
"Error; should not emit",
- [(truncstorei16 (add (sextloadi16 (add IntRegs:$base,
+ [(truncstorei16 (add (sextloadi16 (add (i32 IntRegs:$base),
u6_1ImmPred:$offset)),
m6ImmPred:$addend),
- (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+ (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memh(Rs+#u6:1) += #U5
@@ -3087,10 +4711,10 @@ let AddedComplexity = 30 in
def MEMh_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$addend),
"memh($base+#$offset) += $addend",
- [(truncstorei16 (add (sextloadi16 (add IntRegs:$base,
+ [(truncstorei16 (add (sextloadi16 (add (i32 IntRegs:$base),
u6_1ImmPred:$offset)),
- IntRegs:$addend),
- (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+ (i32 IntRegs:$addend)),
+ (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memh(Rs+#u6:1) -= Rt
@@ -3098,10 +4722,10 @@ let AddedComplexity = 30 in
def MEMh_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$subend),
"memh($base+#$offset) -= $subend",
- [(truncstorei16 (sub (sextloadi16 (add IntRegs:$base,
+ [(truncstorei16 (sub (sextloadi16 (add (i32 IntRegs:$base),
u6_1ImmPred:$offset)),
- IntRegs:$subend),
- (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+ (i32 IntRegs:$subend)),
+ (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memh(Rs+#u6:1) &= Rt
@@ -3109,10 +4733,10 @@ let AddedComplexity = 30 in
def MEMh_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$andend),
"memh($base+#$offset) += $andend",
- [(truncstorei16 (and (sextloadi16 (add IntRegs:$base,
+ [(truncstorei16 (and (sextloadi16 (add (i32 IntRegs:$base),
u6_1ImmPred:$offset)),
- IntRegs:$andend),
- (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+ (i32 IntRegs:$andend)),
+ (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memh(Rs+#u6:1) |= Rt
@@ -3120,10 +4744,10 @@ let AddedComplexity = 30 in
def MEMh_ORr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$orend),
"memh($base+#$offset) |= $orend",
- [(truncstorei16 (or (sextloadi16 (add IntRegs:$base,
+ [(truncstorei16 (or (sextloadi16 (add (i32 IntRegs:$base),
u6_1ImmPred:$offset)),
- IntRegs:$orend),
- (add IntRegs:$base, u6_1ImmPred:$offset))]>,
+ (i32 IntRegs:$orend)),
+ (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// MEMh_ADDSUBi_V4:
@@ -3159,7 +4783,7 @@ def MEMh_ADDr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$addend),
"memh($addr) += $addend",
[(truncstorei16 (add (sextloadi16 ADDRriU6_1:$addr),
- IntRegs:$addend), ADDRriU6_1:$addr)]>,
+ (i32 IntRegs:$addend)), ADDRriU6_1:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
// memh(Rs+#u6:1) -= Rt
@@ -3168,7 +4792,7 @@ def MEMh_SUBr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$subend),
"memh($addr) -= $subend",
[(truncstorei16 (sub (sextloadi16 ADDRriU6_1:$addr),
- IntRegs:$subend), ADDRriU6_1:$addr)]>,
+ (i32 IntRegs:$subend)), ADDRriU6_1:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
// memh(Rs+#u6:1) &= Rt
@@ -3177,7 +4801,7 @@ def MEMh_ANDr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$andend),
"memh($addr) &= $andend",
[(truncstorei16 (and (sextloadi16 ADDRriU6_1:$addr),
- IntRegs:$andend), ADDRriU6_1:$addr)]>,
+ (i32 IntRegs:$andend)), ADDRriU6_1:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
// memh(Rs+#u6:1) |= Rt
@@ -3186,7 +4810,7 @@ def MEMh_ORr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$orend),
"memh($addr) |= $orend",
[(truncstorei16 (or (sextloadi16 ADDRriU6_1:$addr),
- IntRegs:$orend), ADDRriU6_1:$addr)]>,
+ (i32 IntRegs:$orend)), ADDRriU6_1:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
@@ -3223,10 +4847,10 @@ let AddedComplexity = 30 in
def MEMb_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_0Imm:$offset, m6Imm:$addend),
"Error; should not emit",
- [(truncstorei8 (add (sextloadi8 (add IntRegs:$base,
+ [(truncstorei8 (add (sextloadi8 (add (i32 IntRegs:$base),
u6_0ImmPred:$offset)),
m6ImmPred:$addend),
- (add IntRegs:$base, u6_0ImmPred:$offset))]>,
+ (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memb(Rs+#u6:0) += #U5
@@ -3250,10 +4874,10 @@ let AddedComplexity = 30 in
def MEMb_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$addend),
"memb($base+#$offset) += $addend",
- [(truncstorei8 (add (sextloadi8 (add IntRegs:$base,
+ [(truncstorei8 (add (sextloadi8 (add (i32 IntRegs:$base),
u6_0ImmPred:$offset)),
- IntRegs:$addend),
- (add IntRegs:$base, u6_0ImmPred:$offset))]>,
+ (i32 IntRegs:$addend)),
+ (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memb(Rs+#u6:0) -= Rt
@@ -3261,10 +4885,10 @@ let AddedComplexity = 30 in
def MEMb_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$subend),
"memb($base+#$offset) -= $subend",
- [(truncstorei8 (sub (sextloadi8 (add IntRegs:$base,
+ [(truncstorei8 (sub (sextloadi8 (add (i32 IntRegs:$base),
u6_0ImmPred:$offset)),
- IntRegs:$subend),
- (add IntRegs:$base, u6_0ImmPred:$offset))]>,
+ (i32 IntRegs:$subend)),
+ (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memb(Rs+#u6:0) &= Rt
@@ -3272,10 +4896,10 @@ let AddedComplexity = 30 in
def MEMb_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$andend),
"memb($base+#$offset) += $andend",
- [(truncstorei8 (and (sextloadi8 (add IntRegs:$base,
+ [(truncstorei8 (and (sextloadi8 (add (i32 IntRegs:$base),
u6_0ImmPred:$offset)),
- IntRegs:$andend),
- (add IntRegs:$base, u6_0ImmPred:$offset))]>,
+ (i32 IntRegs:$andend)),
+ (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// memb(Rs+#u6:0) |= Rt
@@ -3283,10 +4907,10 @@ let AddedComplexity = 30 in
def MEMb_ORr_indexed_MEM_V4 : MEMInst_V4<(outs),
(ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$orend),
"memb($base+#$offset) |= $orend",
- [(truncstorei8 (or (sextloadi8 (add IntRegs:$base,
+ [(truncstorei8 (or (sextloadi8 (add (i32 IntRegs:$base),
u6_0ImmPred:$offset)),
- IntRegs:$orend),
- (add IntRegs:$base, u6_0ImmPred:$offset))]>,
+ (i32 IntRegs:$orend)),
+ (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>,
Requires<[HasV4T, UseMEMOP]>;
// MEMb_ADDSUBi_V4:
@@ -3322,7 +4946,7 @@ def MEMb_ADDr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$addend),
"memb($addr) += $addend",
[(truncstorei8 (add (sextloadi8 ADDRriU6_0:$addr),
- IntRegs:$addend), ADDRriU6_0:$addr)]>,
+ (i32 IntRegs:$addend)), ADDRriU6_0:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
// memb(Rs+#u6:0) -= Rt
@@ -3331,7 +4955,7 @@ def MEMb_SUBr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$subend),
"memb($addr) -= $subend",
[(truncstorei8 (sub (sextloadi8 ADDRriU6_0:$addr),
- IntRegs:$subend), ADDRriU6_0:$addr)]>,
+ (i32 IntRegs:$subend)), ADDRriU6_0:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
// memb(Rs+#u6:0) &= Rt
@@ -3340,7 +4964,7 @@ def MEMb_ANDr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$andend),
"memb($addr) &= $andend",
[(truncstorei8 (and (sextloadi8 ADDRriU6_0:$addr),
- IntRegs:$andend), ADDRriU6_0:$addr)]>,
+ (i32 IntRegs:$andend)), ADDRriU6_0:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
// memb(Rs+#u6:0) |= Rt
@@ -3349,7 +4973,7 @@ def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs),
(ins MEMri:$addr, IntRegs:$orend),
"memb($addr) |= $orend",
[(truncstorei8 (or (sextloadi8 ADDRriU6_0:$addr),
- IntRegs:$orend), ADDRriU6_0:$addr)]>,
+ (i32 IntRegs:$orend)), ADDRriU6_0:$addr)]>,
Requires<[HasV4T, UseMEMOP]>;
@@ -3364,13 +4988,16 @@ def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs),
// The implemented patterns are: EQ/GT/GTU.
// Missing patterns are: GE/GEU/LT/LTU/LE/LEU.
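+//
+// A hedged sketch (not part of this commit) of how one missing pattern
+// could be added by reusing an implemented instruction with swapped
+// operands, since x < y is equivalent to y > x (CMPbGTrr_V4 is defined
+// below):
+// def : Pat<(i1 (setlt (shl (i32 IntRegs:$src1), (i32 24)),
+//                      (shl (i32 IntRegs:$src2), (i32 24)))),
+//           (CMPbGTrr_V4 IntRegs:$src2, IntRegs:$src1)>;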
+// The following instruction is not constant-extended, since extension
+// would produce incorrect code for negative numbers.
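+// One way to see the problem: the pattern below compares
+// (and $src1, 255), which is always in [0, 255], so a sign-extended
+// negative immediate such as -1 could never match, even though the byte
+// 0xff is equal to -1 under a signed-byte interpretation.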
// Pd=cmpb.eq(Rs,#u8)
+
let isCompare = 1 in
def CMPbEQri_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, u8Imm:$src2),
"$dst = cmpb.eq($src1, #$src2)",
- [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 255),
- u8ImmPred:$src2))]>,
+ [(set (i1 PredRegs:$dst),
+ (seteq (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2))]>,
Requires<[HasV4T]>;
// Pd=cmpb.eq(Rs,Rt)
@@ -3378,10 +5005,9 @@ let isCompare = 1 in
def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmpb.eq($src1, $src2)",
- [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1,
- IntRegs:$src2),
- 255),
- 0))]>,
+ [(set (i1 PredRegs:$dst),
+ (seteq (and (xor (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)), 255), 0))]>,
Requires<[HasV4T]>;
// Pd=cmpb.eq(Rs,Rt)
@@ -3389,17 +5015,9 @@ let isCompare = 1 in
def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmpb.eq($src1, $src2)",
- [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 24)),
- (shl IntRegs:$src2, (i32 24))))]>,
- Requires<[HasV4T]>;
-
-// Pd=cmpb.gt(Rs,#s8)
-let isCompare = 1 in
-def CMPbGTri_V4 : MInst<(outs PredRegs:$dst),
- (ins IntRegs:$src1, s32Imm:$src2),
- "$dst = cmpb.gt($src1, #$src2)",
- [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)),
- s32_24ImmPred:$src2))]>,
+ [(set (i1 PredRegs:$dst),
+ (seteq (shl (i32 IntRegs:$src1), (i32 24)),
+ (shl (i32 IntRegs:$src2), (i32 24))))]>,
Requires<[HasV4T]>;
// Pd=cmpb.gt(Rs,Rt)
@@ -3407,8 +5025,9 @@ let isCompare = 1 in
def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmpb.gt($src1, $src2)",
- [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)),
- (shl IntRegs:$src2, (i32 24))))]>,
+ [(set (i1 PredRegs:$dst),
+ (setgt (shl (i32 IntRegs:$src1), (i32 24)),
+ (shl (i32 IntRegs:$src2), (i32 24))))]>,
Requires<[HasV4T]>;
// Pd=cmpb.gtu(Rs,#u7)
@@ -3416,8 +5035,8 @@ let isCompare = 1 in
def CMPbGTUri_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, u7Imm:$src2),
"$dst = cmpb.gtu($src1, #$src2)",
- [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255),
- u7ImmPred:$src2))]>,
+ [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
+ u7ImmPred:$src2))]>,
Requires<[HasV4T]>;
// Pd=cmpb.gtu(Rs,Rt)
@@ -3425,18 +5044,21 @@ let isCompare = 1 in
def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmpb.gtu($src1, $src2)",
- [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255),
- (and IntRegs:$src2, 255)))]>,
+ [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
+ (and (i32 IntRegs:$src2), 255)))]>,
Requires<[HasV4T]>;
+// The following instruction is not constant-extended, since extension
+// would produce incorrect code for negative numbers.
+
// Signed half compare(.eq) ri.
// Pd=cmph.eq(Rs,#s8)
let isCompare = 1 in
def CMPhEQri_V4 : MInst<(outs PredRegs:$dst),
- (ins IntRegs:$src1, u16Imm:$src2),
+ (ins IntRegs:$src1, s8Imm:$src2),
"$dst = cmph.eq($src1, #$src2)",
- [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 65535),
- u16_s8ImmPred:$src2))]>,
+ [(set (i1 PredRegs:$dst), (seteq (and (i32 IntRegs:$src1), 65535),
+ s8ImmPred:$src2))]>,
Requires<[HasV4T]>;
// Signed half compare(.eq) rr.
@@ -3449,10 +5071,9 @@ let isCompare = 1 in
def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmph.eq($src1, $src2)",
- [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1,
- IntRegs:$src2),
- 65535),
- 0))]>,
+ [(set (i1 PredRegs:$dst), (seteq (and (xor (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)),
+ 65535), 0))]>,
Requires<[HasV4T]>;
// Signed half compare(.eq) rr.
@@ -3465,19 +5086,25 @@ let isCompare = 1 in
def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmph.eq($src1, $src2)",
- [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 16)),
- (shl IntRegs:$src2, (i32 16))))]>,
+ [(set (i1 PredRegs:$dst),
+ (seteq (shl (i32 IntRegs:$src1), (i32 16)),
+ (shl (i32 IntRegs:$src2), (i32 16))))]>,
Requires<[HasV4T]>;
+/* Incorrect pattern -- the immediate should be right-shifted before being
+used in the cmph.gt instruction.
// Signed half compare(.gt) ri.
// Pd=cmph.gt(Rs,#s8)
+
let isCompare = 1 in
def CMPhGTri_V4 : MInst<(outs PredRegs:$dst),
- (ins IntRegs:$src1, s32Imm:$src2),
+ (ins IntRegs:$src1, s8Imm:$src2),
"$dst = cmph.gt($src1, #$src2)",
- [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)),
- s32_16s8ImmPred:$src2))]>,
+ [(set (i1 PredRegs:$dst),
+ (setgt (shl (i32 IntRegs:$src1), (i32 16)),
+ s8ImmPred:$src2))]>,
Requires<[HasV4T]>;
+*/
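+// A hedged sketch (hypothetical, not part of this commit) of what a
+// corrected form could look like, had the instruction been kept: keep the
+// old s32_16s8ImmPred operand predicate and encode the immediate
+// arithmetically shifted right by 16 through an SDNodeXForm:
+// def s32_16s8ImmXform : SDNodeXForm<imm, [{
+//   return CurDAG->getTargetConstant(N->getSExtValue() >> 16, MVT::i32);
+// }]>;
+// def : Pat<(i1 (setgt (shl (i32 IntRegs:$src1), (i32 16)),
+//                      s32_16s8ImmPred:$src2)),
+//           (CMPhGTri_V4 IntRegs:$src1, (s32_16s8ImmXform imm:$src2))>;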
// Signed half compare(.gt) rr.
// Pd=cmph.gt(Rs,Rt)
@@ -3485,8 +5112,9 @@ let isCompare = 1 in
def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmph.gt($src1, $src2)",
- [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)),
- (shl IntRegs:$src2, (i32 16))))]>,
+ [(set (i1 PredRegs:$dst),
+ (setgt (shl (i32 IntRegs:$src1), (i32 16)),
+ (shl (i32 IntRegs:$src2), (i32 16))))]>,
Requires<[HasV4T]>;
// Unsigned half compare rr (.gtu).
@@ -3495,8 +5123,9 @@ let isCompare = 1 in
def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
"$dst = cmph.gtu($src1, $src2)",
- [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535),
- (and IntRegs:$src2, 65535)))]>,
+ [(set (i1 PredRegs:$dst),
+ (setugt (and (i32 IntRegs:$src1), 65535),
+ (and (i32 IntRegs:$src2), 65535)))]>,
Requires<[HasV4T]>;
// Unsigned half compare ri (.gtu).
@@ -3505,8 +5134,8 @@ let isCompare = 1 in
def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst),
(ins IntRegs:$src1, u7Imm:$src2),
"$dst = cmph.gtu($src1, #$src2)",
- [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535),
- u7ImmPred:$src2))]>,
+ [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535),
+ u7ImmPred:$src2))]>,
Requires<[HasV4T]>;
//===----------------------------------------------------------------------===//
@@ -3523,10 +5152,42 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
Requires<[HasV4T]>;
}
+// Restore registers and dealloc_return function call.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+ Defs = [R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs),
+ (ins calltarget:$dst),
+ "jump $dst // Restore_and_dealloc_return",
+ []>,
+ Requires<[HasV4T]>;
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, isBarrier = 1,
+ Defs = [R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs),
+ (ins calltarget:$dst),
+ "call $dst // Restore_and_dealloc_before_tailcall",
+ []>,
+ Requires<[HasV4T]>;
+}
+
+// Save registers function call.
+let isCall = 1, isBarrier = 1,
+ Uses = [R29, R31] in {
+ def SAVE_REGISTERS_CALL_V4 : JInst<(outs),
+ (ins calltarget:$dst),
+            "call $dst // Save_callee_saved_registers",
+ []>,
+ Requires<[HasV4T]>;
+}
+
// if (Ps) dealloc_return
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
- def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1),
+ Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ isPredicated = 1 in {
+ def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs),
+ (ins PredRegs:$src1, i32imm:$amt1),
"if ($src1) dealloc_return",
[]>,
Requires<[HasV4T]>;
@@ -3534,7 +5195,8 @@ let isReturn = 1, isTerminator = 1,
// if (!Ps) dealloc_return
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
+ Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ isPredicated = 1 in {
def DEALLOC_RET_cNotPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
i32imm:$amt1),
"if (!$src1) dealloc_return",
@@ -3544,7 +5206,8 @@ let isReturn = 1, isTerminator = 1,
// if (Ps.new) dealloc_return:nt
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
+ Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ isPredicated = 1 in {
def DEALLOC_RET_cdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
i32imm:$amt1),
"if ($src1.new) dealloc_return:nt",
@@ -3554,7 +5217,8 @@ let isReturn = 1, isTerminator = 1,
// if (!Ps.new) dealloc_return:nt
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
+ Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ isPredicated = 1 in {
def DEALLOC_RET_cNotdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
i32imm:$amt1),
"if (!$src1.new) dealloc_return:nt",
@@ -3564,7 +5228,8 @@ let isReturn = 1, isTerminator = 1,
// if (Ps.new) dealloc_return:t
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
+ Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ isPredicated = 1 in {
def DEALLOC_RET_cdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
i32imm:$amt1),
"if ($src1.new) dealloc_return:t",
@@ -3574,10 +5239,539 @@ let isReturn = 1, isTerminator = 1,
// if (!Ps.new) dealloc_return:t
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
+ Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ isPredicated = 1 in {
def DEALLOC_RET_cNotdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
i32imm:$amt1),
"if (!$src1.new) dealloc_return:t",
[]>,
Requires<[HasV4T]>;
}
+
+
+// Load/Store with absolute addressing mode
+// memw(#u6)=Rt
+
+multiclass ST_abs<string OpcStr> {
+ let isPredicable = 1 in
+ def _abs_V4 : STInst2<(outs),
+ (ins globaladdress:$absaddr, IntRegs:$src),
+ !strconcat(OpcStr, "(##$absaddr) = $src"),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if ($src1) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if (!$src1) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if ($src1.new) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if (!$src1.new) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2")),
+ []>,
+ Requires<[HasV4T]>;
+
+ def _abs_nv_V4 : STInst2<(outs),
+ (ins globaladdress:$absaddr, IntRegs:$src),
+ !strconcat(OpcStr, "(##$absaddr) = $src.new"),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if ($src1) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2.new")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cNotPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if (!$src1) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2.new")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if ($src1.new) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2.new")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnNotPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2),
+           !strconcat("if (!$src1.new) ",
+ !strconcat(OpcStr, "(##$absaddr) = $src2.new")),
+ []>,
+ Requires<[HasV4T]>;
+}
+
+let AddedComplexity = 30, isPredicable = 1 in
+def STrid_abs_V4 : STInst<(outs),
+ (ins globaladdress:$absaddr, DoubleRegs:$src),
+ "memd(##$absaddr) = $src",
+ [(store (i64 DoubleRegs:$src),
+ (HexagonCONST32 tglobaladdr:$absaddr))]>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def STrid_abs_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2),
+ "if ($src1) memd(##$absaddr) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def STrid_abs_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2),
+ "if (!$src1) memd(##$absaddr) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def STrid_abs_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2),
+ "if ($src1.new) memd(##$absaddr) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def STrid_abs_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2),
+ "if (!$src1.new) memd(##$absaddr) = $src2",
+ []>,
+ Requires<[HasV4T]>;
+
+defm STrib : ST_abs<"memb">;
+defm STrih : ST_abs<"memh">;
+defm STriw : ST_abs<"memw">;
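+// Each defm above expands ST_abs with the given prefix, producing the
+// base form (e.g. STrib_abs_V4), the predicated forms (_abs_cPt_V4,
+// _abs_cNotPt_V4, _abs_cdnPt_V4, _abs_cdnNotPt_V4) and their new-value
+// (_nv) counterparts; the base forms are selected by the patterns below.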
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1),
+ (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrib_abs_V4 tglobaladdr:$absaddr, IntRegs:$src1)>;
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1),
+ (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrih_abs_V4 tglobaladdr:$absaddr, IntRegs:$src1)>;
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STriw_abs_V4 tglobaladdr:$absaddr, IntRegs:$src1)>;
+
+
+multiclass LD_abs<string OpcStr> {
+ let isPredicable = 1 in
+ def _abs_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins globaladdress:$absaddr),
+ !strconcat("$dst = ", !strconcat(OpcStr, "(##$absaddr)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ !strconcat("if ($src1) $dst = ",
+ !strconcat(OpcStr, "(##$absaddr)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ !strconcat("if (!$src1) $dst = ",
+ !strconcat(OpcStr, "(##$absaddr)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ !strconcat("if ($src1.new) $dst = ",
+ !strconcat(OpcStr, "(##$absaddr)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ !strconcat("if (!$src1.new) $dst = ",
+ !strconcat(OpcStr, "(##$absaddr)")),
+ []>,
+ Requires<[HasV4T]>;
+}
+
+let AddedComplexity = 30 in
+def LDrid_abs_V4 : LDInst<(outs DoubleRegs:$dst),
+ (ins globaladdress:$absaddr),
+ "$dst = memd(##$absaddr)",
+ [(set (i64 DoubleRegs:$dst),
+ (load (HexagonCONST32 tglobaladdr:$absaddr)))]>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def LDrid_abs_cPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ "if ($src1) $dst = memd(##$absaddr)",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def LDrid_abs_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ "if (!$src1) $dst = memd(##$absaddr)",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def LDrid_abs_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ "if ($src1.new) $dst = memd(##$absaddr)",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 30, isPredicated = 1 in
+def LDrid_abs_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$absaddr),
+ "if (!$src1.new) $dst = memd(##$absaddr)",
+ []>,
+ Requires<[HasV4T]>;
+
+defm LDrib : LD_abs<"memb">;
+defm LDriub : LD_abs<"memub">;
+defm LDrih : LD_abs<"memh">;
+defm LDriuh : LD_abs<"memuh">;
+defm LDriw : LD_abs<"memw">;
+
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(i32 (load (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDriw_abs_V4 tglobaladdr:$absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (sextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
+ (LDrib_abs_V4 tglobaladdr:$absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (zextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
+ (LDriub_abs_V4 tglobaladdr:$absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (sextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
+ (LDrih_abs_V4 tglobaladdr:$absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (zextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
+ (LDriuh_abs_V4 tglobaladdr:$absaddr)>;
+
+// Transfer global address into a register
+let AddedComplexity=50, isMoveImm = 1, isReMaterializable = 1 in
+def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$src1),
+ "$dst = ##$src1",
+ [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$src2),
+            "if ($src1) $dst = ##$src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$src2),
+            "if (!$src1) $dst = ##$src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$src2),
+            "if ($src1.new) $dst = ##$src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, globaladdress:$src2),
+            "if (!$src1.new) $dst = ##$src2",
+ []>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 50, Predicates = [HasV4T] in
+def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
+ (TFRI_V4 tglobaladdr:$src1)>;
+
+
+// Load - indirect with long offset: these instructions take a global
+// address as an operand.
+let AddedComplexity = 10 in
+def LDrid_ind_lo_V4 : LDInst<(outs DoubleRegs:$dst),
+ (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset),
+            "$dst = memd($src1<<#$src2+##$offset)",
+ [(set (i64 DoubleRegs:$dst),
+ (load (add (shl IntRegs:$src1, u2ImmPred:$src2),
+ (HexagonCONST32 tglobaladdr:$offset))))]>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 10 in
+multiclass LD_indirect_lo<string OpcStr, PatFrag OpNode> {
+ def _lo_V4 : LDInst<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset),
+ !strconcat("$dst = ",
+ !strconcat(OpcStr, "($src1<<#$src2+##$offset)")),
+ [(set IntRegs:$dst,
+ (i32 (OpNode (add (shl IntRegs:$src1, u2ImmPred:$src2),
+ (HexagonCONST32 tglobaladdr:$offset)))))]>,
+ Requires<[HasV4T]>;
+}
+
+defm LDrib_ind : LD_indirect_lo<"memb", sextloadi8>;
+defm LDriub_ind : LD_indirect_lo<"memub", zextloadi8>;
+defm LDrih_ind : LD_indirect_lo<"memh", sextloadi16>;
+defm LDriuh_ind : LD_indirect_lo<"memuh", zextloadi16>;
+defm LDriw_ind : LD_indirect_lo<"memw", load>;
+
+// Store - indirect with long offset: these instructions take a global
+// address as an operand.
+let AddedComplexity = 10 in
+def STrid_ind_lo_V4 : STInst<(outs),
+ (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3,
+ DoubleRegs:$src4),
+            "memd($src1<<#$src2+##$src3) = $src4",
+ [(store (i64 DoubleRegs:$src4),
+ (add (shl IntRegs:$src1, u2ImmPred:$src2),
+ (HexagonCONST32 tglobaladdr:$src3)))]>,
+ Requires<[HasV4T]>;
+
+let AddedComplexity = 10 in
+multiclass ST_indirect_lo<string OpcStr, PatFrag OpNode> {
+ def _lo_V4 : STInst<(outs),
+ (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3,
+ IntRegs:$src4),
+ !strconcat(OpcStr, "($src1<<#$src2+##$src3) = $src4"),
+ [(OpNode (i32 IntRegs:$src4),
+ (add (shl IntRegs:$src1, u2ImmPred:$src2),
+ (HexagonCONST32 tglobaladdr:$src3)))]>,
+ Requires<[HasV4T]>;
+}
+
+defm STrib_ind : ST_indirect_lo<"memb", truncstorei8>;
+defm STrih_ind : ST_indirect_lo<"memh", truncstorei16>;
+defm STriw_ind : ST_indirect_lo<"memw", store>;
+
+// Store - absolute addressing mode: these instructions take a constant
+// value as the extended operand.
+multiclass ST_absimm<string OpcStr> {
+ let isPredicable = 1 in
+ def _abs_V4 : STInst2<(outs),
+ (ins u6Imm:$src1, IntRegs:$src2),
+ !strconcat(OpcStr, "(#$src1) = $src2"),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if ($src1) ", !strconcat(OpcStr, "(#$src2) = $src3")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if (!$src1) ", !strconcat(OpcStr, "(#$src2) = $src3")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if ($src1.new) ",
+ !strconcat(OpcStr, "(#$src2) = $src3")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnNotPt_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if (!$src1.new) ",
+ !strconcat(OpcStr, "(#$src2) = $src3")),
+ []>,
+ Requires<[HasV4T]>;
+
+ def _abs_nv_V4 : STInst2<(outs),
+ (ins u6Imm:$src1, IntRegs:$src2),
+ !strconcat(OpcStr, "(#$src1) = $src2.new"),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if ($src1) ",
+ !strconcat(OpcStr, "(#$src2) = $src3.new")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cNotPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if (!$src1) ",
+ !strconcat(OpcStr, "(#$src2) = $src3.new")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if ($src1.new) ",
+ !strconcat(OpcStr, "(#$src2) = $src3.new")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnNotPt_nv_V4 : STInst2<(outs),
+ (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+           !strconcat("if (!$src1.new) ",
+ !strconcat(OpcStr, "(#$src2) = $src3.new")),
+ []>,
+ Requires<[HasV4T]>;
+}
+
+defm STrib_imm : ST_absimm<"memb">;
+defm STrih_imm : ST_absimm<"memh">;
+defm STriw_imm : ST_absimm<"memw">;
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), u6ImmPred:$src2),
+          (STrib_imm_abs_V4 u6ImmPred:$src2, IntRegs:$src1)>;
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), u6ImmPred:$src2),
+          (STrih_imm_abs_V4 u6ImmPred:$src2, IntRegs:$src1)>;
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(store (i32 IntRegs:$src1), u6ImmPred:$src2),
+          (STriw_imm_abs_V4 u6ImmPred:$src2, IntRegs:$src1)>;
+
+
+// Load - absolute addressing mode: these instructions take a constant
+// value as the extended operand.
+
+multiclass LD_absimm<string OpcStr> {
+ let isPredicable = 1 in
+ def _abs_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins u6Imm:$src),
+ !strconcat("$dst = ",
+ !strconcat(OpcStr, "(#$src)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, u6Imm:$src2),
+ !strconcat("if ($src1) $dst = ",
+ !strconcat(OpcStr, "(#$src2)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, u6Imm:$src2),
+ !strconcat("if (!$src1) $dst = ",
+ !strconcat(OpcStr, "(#$src2)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, u6Imm:$src2),
+ !strconcat("if ($src1.new) $dst = ",
+ !strconcat(OpcStr, "(#$src2)")),
+ []>,
+ Requires<[HasV4T]>;
+
+ let isPredicated = 1 in
+ def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, u6Imm:$src2),
+ !strconcat("if (!$src1.new) $dst = ",
+ !strconcat(OpcStr, "(#$src2)")),
+ []>,
+ Requires<[HasV4T]>;
+}
+
+defm LDrib_imm : LD_absimm<"memb">;
+defm LDriub_imm : LD_absimm<"memub">;
+defm LDrih_imm : LD_absimm<"memh">;
+defm LDriuh_imm : LD_absimm<"memuh">;
+defm LDriw_imm : LD_absimm<"memw">;
+
+let Predicates = [HasV4T], AddedComplexity = 30 in
+def : Pat<(i32 (load u6ImmPred:$src)),
+ (LDriw_imm_abs_V4 u6ImmPred:$src)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (sextloadi8 u6ImmPred:$src)),
+ (LDrib_imm_abs_V4 u6ImmPred:$src)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (zextloadi8 u6ImmPred:$src)),
+ (LDriub_imm_abs_V4 u6ImmPred:$src)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (sextloadi16 u6ImmPred:$src)),
+ (LDrih_imm_abs_V4 u6ImmPred:$src)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (zextloadi16 u6ImmPred:$src)),
+ (LDriuh_imm_abs_V4 u6ImmPred:$src)>;
+
+
+// Indexed store word - global address.
+// memw(Rs+#u6:2)=##address
+let AddedComplexity = 10 in
+def STriw_offset_ext_V4 : STInst<(outs),
+ (ins IntRegs:$src1, u6_2Imm:$src2, globaladdress:$src3),
+ "memw($src1+#$src2) = ##$src3",
+ [(store (HexagonCONST32 tglobaladdr:$src3),
+ (add IntRegs:$src1, u6_2ImmPred:$src2))]>,
+ Requires<[HasV4T]>;
+
+
+// Indexed store halfword - global address.
+// memh(Rs+#u6:1)=##address
+let AddedComplexity = 10 in
+def STrih_offset_ext_V4 : STInst<(outs),
+ (ins IntRegs:$src1, u6_1Imm:$src2, globaladdress:$src3),
+ "memh($src1+#$src2) = ##$src3",
+ [(truncstorei16 (HexagonCONST32 tglobaladdr:$src3),
+ (add IntRegs:$src1, u6_1ImmPred:$src2))]>,
+ Requires<[HasV4T]>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td
new file mode 100644
index 0000000..92d098c
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -0,0 +1,626 @@
+def SDTHexagonFCONST32 : SDTypeProfile<1, 1, [
+ SDTCisVT<0, f32>,
+ SDTCisPtrTy<1>]>;
+def HexagonFCONST32 : SDNode<"HexagonISD::FCONST32", SDTHexagonFCONST32>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def FCONST32_nsdata : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+ "$dst = CONST32(#$global)",
+ [(set (f32 IntRegs:$dst),
+ (HexagonFCONST32 tglobaladdr:$global))]>,
+ Requires<[HasV5T]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST64_Float_Real : LDInst<(outs DoubleRegs:$dst), (ins f64imm:$src1),
+ "$dst = CONST64(#$src1)",
+ [(set DoubleRegs:$dst, fpimm:$src1)]>,
+ Requires<[HasV5T]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
+ "$dst = CONST32(#$src1)",
+ [(set IntRegs:$dst, fpimm:$src1)]>,
+ Requires<[HasV5T]>;
+
+// Transfer immediate float.
+// Only works with a single-precision fp value.
+// For double precision, use CONST64_Float_Real, as a 64-bit transfer can
+// only hold 40 bits: a 32-bit constant extender plus an 8-bit immediate.
+let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in
+def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32imm:$src1),
+ "$dst = ##$src1",
+ [(set IntRegs:$dst, fpimm:$src1)]>,
+ Requires<[HasV5T]>;
+
+let isPredicated = 1 in
+def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, f32imm:$src2),
+ "if ($src1) $dst = ##$src2",
+ []>,
+ Requires<[HasV5T]>;
+
+let isPredicated = 1 in
+def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, f32imm:$src2),
+ "if (!$src1) $dst = ##$src2",
+ []>,
+ Requires<[HasV5T]>;
+
+// Convert single precision to double precision and vice-versa.
+def CONVERT_sf2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2df($src)",
+ [(set DoubleRegs:$dst, (fextend IntRegs:$src))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_df2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2sf($src)",
+ [(set IntRegs:$dst, (fround DoubleRegs:$src))]>,
+ Requires<[HasV5T]>;
+
+
+// Load.
+def LDrid_f : LDInst<(outs DoubleRegs:$dst),
+ (ins MEMri:$addr),
+ "$dst = memd($addr)",
+ [(set DoubleRegs:$dst, (f64 (load ADDRriS11_3:$addr)))]>,
+ Requires<[HasV5T]>;
+
+
+let AddedComplexity = 20 in
+def LDrid_indexed_f : LDInst<(outs DoubleRegs:$dst),
+ (ins IntRegs:$src1, s11_3Imm:$offset),
+ "$dst = memd($src1+#$offset)",
+ [(set DoubleRegs:$dst, (f64 (load (add IntRegs:$src1,
+ s11_3ImmPred:$offset))))]>,
+ Requires<[HasV5T]>;
+
+def LDriw_f : LDInst<(outs IntRegs:$dst),
+ (ins MEMri:$addr), "$dst = memw($addr)",
+ [(set IntRegs:$dst, (f32 (load ADDRriS11_2:$addr)))]>,
+ Requires<[HasV5T]>;
+
+
+let AddedComplexity = 20 in
+def LDriw_indexed_f : LDInst<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, s11_2Imm:$offset),
+ "$dst = memw($src1+#$offset)",
+ [(set IntRegs:$dst, (f32 (load (add IntRegs:$src1,
+ s11_2ImmPred:$offset))))]>,
+ Requires<[HasV5T]>;
+
+// Store.
+def STriw_f : STInst<(outs),
+ (ins MEMri:$addr, IntRegs:$src1),
+ "memw($addr) = $src1",
+ [(store (f32 IntRegs:$src1), ADDRriS11_2:$addr)]>,
+ Requires<[HasV5T]>;
+
+let AddedComplexity = 10 in
+def STriw_indexed_f : STInst<(outs),
+ (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3),
+ "memw($src1+#$src2) = $src3",
+ [(store (f32 IntRegs:$src3),
+ (add IntRegs:$src1, s11_2ImmPred:$src2))]>,
+ Requires<[HasV5T]>;
+
+def STrid_f : STInst<(outs),
+ (ins MEMri:$addr, DoubleRegs:$src1),
+ "memd($addr) = $src1",
+            [(store (f64 DoubleRegs:$src1), ADDRriS11_3:$addr)]>,
+ Requires<[HasV5T]>;
+
+// Indexed store double word.
+let AddedComplexity = 10 in
+def STrid_indexed_f : STInst<(outs),
+ (ins IntRegs:$src1, s11_3Imm:$src2, DoubleRegs:$src3),
+ "memd($src1+#$src2) = $src3",
+ [(store (f64 DoubleRegs:$src3),
+ (add IntRegs:$src1, s11_3ImmPred:$src2))]>,
+ Requires<[HasV5T]>;
+
+
+// Add
+let isCommutable = 1 in
+def fADD_rr : ALU64_rr<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = sfadd($src1, $src2)",
+ [(set IntRegs:$dst, (fadd IntRegs:$src1, IntRegs:$src2))]>,
+ Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ "$dst = dfadd($src1, $src2)",
+ [(set DoubleRegs:$dst, (fadd DoubleRegs:$src1,
+ DoubleRegs:$src2))]>,
+ Requires<[HasV5T]>;
+
+def fSUB_rr : ALU64_rr<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = sfsub($src1, $src2)",
+ [(set IntRegs:$dst, (fsub IntRegs:$src1, IntRegs:$src2))]>,
+ Requires<[HasV5T]>;
+
+def fSUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ "$dst = dfsub($src1, $src2)",
+ [(set DoubleRegs:$dst, (fsub DoubleRegs:$src1,
+ DoubleRegs:$src2))]>,
+ Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fMUL_rr : ALU64_rr<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = sfmpy($src1, $src2)",
+ [(set IntRegs:$dst, (fmul IntRegs:$src1, IntRegs:$src2))]>,
+ Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fMUL64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ "$dst = dfmpy($src1, $src2)",
+ [(set DoubleRegs:$dst, (fmul DoubleRegs:$src1,
+ DoubleRegs:$src2))]>,
+ Requires<[HasV5T]>;
+
+// Compare.
+let isCompare = 1 in {
+multiclass FCMP64_rr<string OpcStr, PatFrag OpNode> {
+ def _rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
+ !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+ [(set PredRegs:$dst,
+ (OpNode (f64 DoubleRegs:$b), (f64 DoubleRegs:$c)))]>,
+ Requires<[HasV5T]>;
+}
+
+multiclass FCMP32_rr<string OpcStr, PatFrag OpNode> {
+ def _rr : ALU64_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+ [(set PredRegs:$dst,
+ (OpNode (f32 IntRegs:$b), (f32 IntRegs:$c)))]>,
+ Requires<[HasV5T]>;
+}
+}
+
+defm FCMPOEQ64 : FCMP64_rr<"dfcmp.eq", setoeq>;
+defm FCMPUEQ64 : FCMP64_rr<"dfcmp.eq", setueq>;
+defm FCMPOGT64 : FCMP64_rr<"dfcmp.gt", setogt>;
+defm FCMPUGT64 : FCMP64_rr<"dfcmp.gt", setugt>;
+defm FCMPOGE64 : FCMP64_rr<"dfcmp.ge", setoge>;
+defm FCMPUGE64 : FCMP64_rr<"dfcmp.ge", setuge>;
+
+defm FCMPOEQ32 : FCMP32_rr<"sfcmp.eq", setoeq>;
+defm FCMPUEQ32 : FCMP32_rr<"sfcmp.eq", setueq>;
+defm FCMPOGT32 : FCMP32_rr<"sfcmp.gt", setogt>;
+defm FCMPUGT32 : FCMP32_rr<"sfcmp.gt", setugt>;
+defm FCMPOGE32 : FCMP32_rr<"sfcmp.ge", setoge>;
+defm FCMPUGE32 : FCMP32_rr<"sfcmp.ge", setuge>;
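+// Note that each ordered/unordered pair above selects the same compare
+// mnemonic (e.g. both setoeq and setueq map to sfcmp.eq); only the
+// SelectionDAG condition code being matched differs.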
+
+// olt.
+def : Pat <(i1 (setolt (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+ (i1 (FCMPOGT32_rr IntRegs:$src2, IntRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f32 IntRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPOGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+ (i1 (FCMPOGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPOGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+ (f64 DoubleRegs:$src1)))>,
+ Requires<[HasV5T]>;
+
+// gt.
+def : Pat <(i1 (setugt (f64 DoubleRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPUGT64_rr (f64 DoubleRegs:$src1),
+ (f64 (CONST64_Float_Real fpimm:$src2))))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i1 (setugt (f32 IntRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPUGT32_rr (f32 IntRegs:$src1), (f32 (TFRI_f fpimm:$src2))))>,
+ Requires<[HasV5T]>;
+
+// ult.
+def : Pat <(i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+ (i1 (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f32 IntRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPUGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+ (i1 (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPUGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+ (f64 DoubleRegs:$src1)))>,
+ Requires<[HasV5T]>;
+
+// le.
+// rs <= rt -> rt >= rs.
+def : Pat<(i1 (setole (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+ (i1 (FCMPOGE32_rr IntRegs:$src2, IntRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setole (f32 IntRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPOGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+
+// Rss <= Rtt -> Rtt >= Rss.
+def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+ (i1 (FCMPOGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPOGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+ DoubleRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+// rs <= rt -> rt >= rs.
+def : Pat<(i1 (setule (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+ (i1 (FCMPUGE32_rr IntRegs:$src2, IntRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setule (f32 IntRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPUGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+// Rss <= Rtt -> Rtt >= Rss.
+def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+ (i1 (FCMPUGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (fpimm:$src2))),
+ (i1 (FCMPUGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+ DoubleRegs:$src1))>,
+ Requires<[HasV5T]>;
+
+// ne.
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+ (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+ (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+ (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+ (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (fpimm:$src2))),
+ (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (fpimm:$src2))),
+ (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1,
+ (f64 (CONST64_Float_Real fpimm:$src2)))))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f32 IntRegs:$src1), (fpimm:$src2))),
+ (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>,
+ Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (fpimm:$src2))),
+ (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1,
+ (f64 (CONST64_Float_Real fpimm:$src2)))))>,
+ Requires<[HasV5T]>;
+
+// Convert Integer to Floating Point.
+def CONVERT_d2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_d2sf($src)",
+ [(set (f32 IntRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_ud2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_ud2sf($src)",
+ [(set (f32 IntRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_uw2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_uw2sf($src)",
+ [(set (f32 IntRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_w2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_w2sf($src)",
+ [(set (f32 IntRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_d2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_d2df($src)",
+ [(set (f64 DoubleRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_ud2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_ud2df($src)",
+ [(set (f64 DoubleRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_uw2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_uw2df($src)",
+ [(set (f64 DoubleRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_w2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_w2df($src)",
+ [(set (f64 DoubleRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+// Convert Floating Point to Integer - default.
+def CONVERT_df2uw : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2uw($src):chop",
+ [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_df2w : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2w($src):chop",
+ [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_sf2uw : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2uw($src):chop",
+ [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_sf2w : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2w($src):chop",
+ [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_df2d : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2d($src):chop",
+ [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_df2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2ud($src):chop",
+ [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_sf2d : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2d($src):chop",
+ [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+def CONVERT_sf2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2ud($src):chop",
+ [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T]>;
+
+// Convert Floating Point to Integer: non-chopped.
+let AddedComplexity = 20 in
+def CONVERT_df2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2uw($src)",
+ [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2w($src)",
+ [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2uw($src)",
+ [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2w($src)",
+ [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2d($src)",
+ [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+ "$dst = convert_df2ud($src)",
+ [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2d($src)",
+ [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+ "$dst = convert_sf2ud($src)",
+ [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+ Requires<[HasV5T, IEEERndNearV5T]>;
+
+
+
+// Bitcast is different from [fp|sint|uint]_to_[sint|uint|fp].
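+// e.g. bitconvert of the i32 value 1 reinterprets the bits as an f32
+// denormal, whereas (sint_to_fp 1) computes 1.0f (0x3f800000); only the
+// former is a plain register copy, hence the TFR mappings below.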
+def : Pat <(i32 (bitconvert (f32 IntRegs:$src))),
+ (i32 (TFR IntRegs:$src))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(f32 (bitconvert (i32 IntRegs:$src))),
+ (f32 (TFR IntRegs:$src))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(i64 (bitconvert (f64 DoubleRegs:$src))),
+ (i64 (TFR64 DoubleRegs:$src))>,
+ Requires<[HasV5T]>;
+
+def : Pat <(f64 (bitconvert (i64 DoubleRegs:$src))),
+ (f64 (TFR64 DoubleRegs:$src))>,
+ Requires<[HasV5T]>;
+
+// Floating point fused multiply-add.
+def FMADD_dp : ALU64_acc<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ "$dst += dfmpy($src2, $src3)",
+ [(set (f64 DoubleRegs:$dst),
+ (fma DoubleRegs:$src2, DoubleRegs:$src3, DoubleRegs:$src1))],
+ "$src1 = $dst">,
+ Requires<[HasV5T]>;
+
+def FMADD_sp : ALU64_acc<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ "$dst += sfmpy($src2, $src3)",
+ [(set (f32 IntRegs:$dst),
+ (fma IntRegs:$src2, IntRegs:$src3, IntRegs:$src1))],
+ "$src1 = $dst">,
+ Requires<[HasV5T]>;
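+// In both FMADD forms the "$src1 = $dst" constraint ties the accumulator
+// input to the output register, matching the accumulating "+=" form of
+// the assembly string.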
+
+
+// Floating point max/min.
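+// These are matched from select/setcc rather than a dedicated max/min
+// node: dfmax below is select(src2 <o src1, src1, src2), and because
+// setolt is an ordered compare, a NaN operand makes the select fall
+// through to $src2.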
+let AddedComplexity = 100 in
+def FMAX_dp : ALU64_rr<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2),
+ "$dst = dfmax($src1, $src2)",
+ [(set DoubleRegs:$dst, (f64 (select (i1 (setolt DoubleRegs:$src2,
+ DoubleRegs:$src1)),
+ DoubleRegs:$src1,
+ DoubleRegs:$src2)))]>,
+ Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMAX_sp : ALU64_rr<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = sfmax($src1, $src2)",
+ [(set IntRegs:$dst, (f32 (select (i1 (setolt IntRegs:$src2,
+ IntRegs:$src1)),
+ IntRegs:$src1,
+ IntRegs:$src2)))]>,
+ Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMIN_dp : ALU64_rr<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2),
+ "$dst = dfmin($src1, $src2)",
+ [(set DoubleRegs:$dst, (f64 (select (i1 (setogt DoubleRegs:$src2,
+ DoubleRegs:$src1)),
+ DoubleRegs:$src1,
+ DoubleRegs:$src2)))]>,
+ Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMIN_sp : ALU64_rr<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, IntRegs:$src2),
+ "$dst = sfmin($src1, $src2)",
+ [(set IntRegs:$dst, (f32 (select (i1 (setogt IntRegs:$src2,
+ IntRegs:$src1)),
+ IntRegs:$src1,
+ IntRegs:$src2)))]>,
+ Requires<[HasV5T]>;
+
+// Pseudo instruction to encode a set of conditional transfers.
+// This instruction is used instead of a mux and trades off code size
+// for performance. We conduct this transformation optimistically in
+// the hope that these instructions get promoted to dot-new transfers.
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr_f : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
+ IntRegs:$src2,
+ IntRegs:$src3),
+ "Error; should not emit",
+ [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
+ IntRegs:$src2,
+ IntRegs:$src3)))]>,
+ Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr64_f : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
+ DoubleRegs:$src2,
+ DoubleRegs:$src3),
+ "Error; should not emit",
+ [(set DoubleRegs:$dst, (f64 (select PredRegs:$src1,
+ DoubleRegs:$src2,
+ DoubleRegs:$src3)))]>,
+ Requires<[HasV5T]>;
+
+
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ri_f : ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, IntRegs:$src2, f32imm:$src3),
+ "Error; should not emit",
+ [(set IntRegs:$dst,
+ (f32 (select PredRegs:$src1, IntRegs:$src2, fpimm:$src3)))]>,
+ Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ir_f : ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, f32imm:$src2, IntRegs:$src3),
+ "Error; should not emit",
+ [(set IntRegs:$dst,
+ (f32 (select PredRegs:$src1, fpimm:$src2, IntRegs:$src3)))]>,
+ Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ii_f : ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, f32imm:$src2, f32imm:$src3),
+ "Error; should not emit",
+ [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
+ fpimm:$src2,
+ fpimm:$src3)))]>,
+ Requires<[HasV5T]>;
+
+
+def : Pat <(select (i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+ (f32 IntRegs:$src3),
+ (f32 IntRegs:$src4)),
+ (TFR_condset_rr_f (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1), IntRegs:$src4,
+ IntRegs:$src3)>, Requires<[HasV5T]>;
+
+def : Pat <(select (i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+ (f64 DoubleRegs:$src3),
+ (f64 DoubleRegs:$src4)),
+ (TFR_condset_rr64_f (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1),
+ DoubleRegs:$src4, DoubleRegs:$src3)>, Requires<[HasV5T]>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, fpimm:$src3),
+ (TFR_condset_ii_f PredRegs:$src1, fpimm:$src3, fpimm:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = TFR_condset_ri(p0, r1, #i)
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, IntRegs:$src3),
+ (TFR_condset_ri_f PredRegs:$src1, IntRegs:$src3, fpimm:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = TFR_condset_ir(p0, #i, r1)
+def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, fpimm:$src3),
+ (TFR_condset_ir_f PredRegs:$src1, fpimm:$src3, IntRegs:$src2)>;
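+// All three maps above rely on the identity
+// select(not p, a, b) == select(p, b, a), folding the predicate negation
+// into swapped mux operands.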
+
+def : Pat <(i32 (fp_to_sint (f64 DoubleRegs:$src1))),
+ (i32 (EXTRACT_SUBREG (i64 (CONVERT_df2d (f64 DoubleRegs:$src1))), subreg_loreg))>,
+ Requires<[HasV5T]>;
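+// Narrowing fp_to_sint from f64 to i32 goes through the 64-bit convert
+// and extracts the low word; this is sound because fp_to_sint is
+// undefined when the result does not fit, so an in-range result lives
+// entirely in subreg_loreg.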
+
+def : Pat <(fabs (f32 IntRegs:$src1)),
+ (CLRBIT_31 (f32 IntRegs:$src1), 31)>,
+ Requires<[HasV5T]>;
+
+def : Pat <(fneg (f32 IntRegs:$src1)),
+ (TOGBIT_31 (f32 IntRegs:$src1), 31)>,
+ Requires<[HasV5T]>;
+
+/*
+def : Pat <(fabs (f64 DoubleRegs:$src1)),
+ (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>,
+ Requires<[HasV5T]>;
+ */
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index b15e293..99f59d5 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -551,13 +551,6 @@ class di_SInst_diu6u6<string opc, Intrinsic IntID>
[(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2,
imm:$src3))]>;
-class di_SInst_didisi<string opc, Intrinsic IntID>
- : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
- IntRegs:$src3),
- !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
- [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2,
- IntRegs:$src3))]>;
-
class di_SInst_didiqi<string opc, Intrinsic IntID>
: SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
IntRegs:$src3),
@@ -818,6 +811,11 @@ class di_MInst_s8s8<string opc, Intrinsic IntID>
!strconcat("$dst = ", !strconcat(opc , "(#$src1, #$src2)")),
[(set DoubleRegs:$dst, (IntID imm:$src1, imm:$src2))]>;
+class si_MInst_sis9<string opc, Intrinsic IntID>
+ : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
class si_MInst_sisi<string opc, Intrinsic IntID>
: MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
!strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
@@ -952,6 +950,17 @@ class si_SInst_sisi_sat<string opc, Intrinsic IntID>
!strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
[(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+class si_SInst_didi_sat<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+ [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_SInst_disi_s1_rnd_sat<string opc, Intrinsic IntID>
+ : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+ !strconcat("$dst = ", !strconcat(opc ,
+ "($src1, $src2):<<1:rnd:sat")),
+ [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
class si_MInst_sisi_s1_rnd_sat<string opc, Intrinsic IntID>
: MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
!strconcat("$dst = ", !strconcat(opc ,
@@ -1612,6 +1621,18 @@ class di_MInst_dididi_acc_rnd_sat<string opc, Intrinsic IntID>
DoubleRegs:$src2))],
"$dst2 = $dst">;
+class di_MInst_dididi_acc_s1<string opc, Intrinsic IntID>
+ : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
+ DoubleRegs:$src1,
+ DoubleRegs:$src2),
+ !strconcat("$dst += ",
+ !strconcat(opc , "($src1, $src2):<<1")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+ DoubleRegs:$src1,
+ DoubleRegs:$src2))],
+ "$dst2 = $dst">;
+
+
class di_MInst_dididi_acc_s1_sat<string opc, Intrinsic IntID>
: MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
DoubleRegs:$src1,
@@ -1822,53 +1843,63 @@ class si_MInst_didi<string opc, Intrinsic IntID>
!strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
[(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+//
+// LDInst classes.
+//
+let mayLoad = 1, neverHasSideEffects = 1 in
+class di_LDInstPI_diu4<string opc, Intrinsic IntID>
+ : LDInstPI<(outs IntRegs:$dst, DoubleRegs:$dst2),
+ (ins IntRegs:$src1, IntRegs:$src2, CRRegs:$src3, s4Imm:$offset),
+ "$dst2 = memd($src1++#$offset:circ($src3))",
+ [],
+ "$src1 = $dst">;
/********************************************************************
* ALU32/ALU *
*********************************************************************/
// ALU32 / ALU / Add.
-def Hexagon_A2_add:
+def HEXAGON_A2_add:
si_ALU32_sisi <"add", int_hexagon_A2_add>;
-def Hexagon_A2_addi:
+def HEXAGON_A2_addi:
si_ALU32_sis16 <"add", int_hexagon_A2_addi>;
// ALU32 / ALU / Logical operations.
-def Hexagon_A2_and:
+def HEXAGON_A2_and:
si_ALU32_sisi <"and", int_hexagon_A2_and>;
-def Hexagon_A2_andir:
+def HEXAGON_A2_andir:
si_ALU32_sis10 <"and", int_hexagon_A2_andir>;
-def Hexagon_A2_not:
+def HEXAGON_A2_not:
si_ALU32_si <"not", int_hexagon_A2_not>;
-def Hexagon_A2_or:
+def HEXAGON_A2_or:
si_ALU32_sisi <"or", int_hexagon_A2_or>;
-def Hexagon_A2_orir:
+def HEXAGON_A2_orir:
si_ALU32_sis10 <"or", int_hexagon_A2_orir>;
-def Hexagon_A2_xor:
+def HEXAGON_A2_xor:
si_ALU32_sisi <"xor", int_hexagon_A2_xor>;
// ALU32 / ALU / Negate.
-def Hexagon_A2_neg:
+def HEXAGON_A2_neg:
si_ALU32_si <"neg", int_hexagon_A2_neg>;
// ALU32 / ALU / Subtract.
-def Hexagon_A2_sub:
+def HEXAGON_A2_sub:
si_ALU32_sisi <"sub", int_hexagon_A2_sub>;
-def Hexagon_A2_subri:
+def HEXAGON_A2_subri:
si_ALU32_s10si <"sub", int_hexagon_A2_subri>;
// ALU32 / ALU / Transfer Immediate.
-def Hexagon_A2_tfril:
+def HEXAGON_A2_tfril:
si_lo_ALU32_siu16 <"", int_hexagon_A2_tfril>;
-def Hexagon_A2_tfrih:
+def HEXAGON_A2_tfrih:
si_hi_ALU32_siu16 <"", int_hexagon_A2_tfrih>;
-def Hexagon_A2_tfrsi:
+def HEXAGON_A2_tfrsi:
si_ALU32_s16 <"", int_hexagon_A2_tfrsi>;
-def Hexagon_A2_tfrpi:
+def HEXAGON_A2_tfrpi:
di_ALU32_s8 <"", int_hexagon_A2_tfrpi>;
// ALU32 / ALU / Transfer Register.
-def Hexagon_A2_tfr:
+def HEXAGON_A2_tfr:
si_ALU32_si_tfr <"", int_hexagon_A2_tfr>;
/********************************************************************
@@ -1876,45 +1907,45 @@ def Hexagon_A2_tfr:
*********************************************************************/
// ALU32 / PERM / Combine.
-def Hexagon_A2_combinew:
+def HEXAGON_A2_combinew:
di_ALU32_sisi <"combine", int_hexagon_A2_combinew>;
-def Hexagon_A2_combine_hh:
+def HEXAGON_A2_combine_hh:
si_MInst_sisi_hh <"combine", int_hexagon_A2_combine_hh>;
-def Hexagon_A2_combine_lh:
+def HEXAGON_A2_combine_lh:
si_MInst_sisi_lh <"combine", int_hexagon_A2_combine_lh>;
-def Hexagon_A2_combine_hl:
+def HEXAGON_A2_combine_hl:
si_MInst_sisi_hl <"combine", int_hexagon_A2_combine_hl>;
-def Hexagon_A2_combine_ll:
+def HEXAGON_A2_combine_ll:
si_MInst_sisi_ll <"combine", int_hexagon_A2_combine_ll>;
-def Hexagon_A2_combineii:
+def HEXAGON_A2_combineii:
di_MInst_s8s8 <"combine", int_hexagon_A2_combineii>;
// ALU32 / PERM / Mux.
-def Hexagon_C2_mux:
+def HEXAGON_C2_mux:
si_ALU32_qisisi <"mux", int_hexagon_C2_mux>;
-def Hexagon_C2_muxri:
+def HEXAGON_C2_muxri:
si_ALU32_qis8si <"mux", int_hexagon_C2_muxri>;
-def Hexagon_C2_muxir:
+def HEXAGON_C2_muxir:
si_ALU32_qisis8 <"mux", int_hexagon_C2_muxir>;
-def Hexagon_C2_muxii:
+def HEXAGON_C2_muxii:
si_ALU32_qis8s8 <"mux", int_hexagon_C2_muxii>;
// ALU32 / PERM / Shift halfword.
-def Hexagon_A2_aslh:
+def HEXAGON_A2_aslh:
si_ALU32_si <"aslh", int_hexagon_A2_aslh>;
-def Hexagon_A2_asrh:
+def HEXAGON_A2_asrh:
si_ALU32_si <"asrh", int_hexagon_A2_asrh>;
def SI_to_SXTHI_asrh:
si_ALU32_si <"asrh", int_hexagon_SI_to_SXTHI_asrh>;
// ALU32 / PERM / Sign/zero extend.
-def Hexagon_A2_sxth:
+def HEXAGON_A2_sxth:
si_ALU32_si <"sxth", int_hexagon_A2_sxth>;
-def Hexagon_A2_sxtb:
+def HEXAGON_A2_sxtb:
si_ALU32_si <"sxtb", int_hexagon_A2_sxtb>;
-def Hexagon_A2_zxth:
+def HEXAGON_A2_zxth:
si_ALU32_si <"zxth", int_hexagon_A2_zxth>;
-def Hexagon_A2_zxtb:
+def HEXAGON_A2_zxtb:
si_ALU32_si <"zxtb", int_hexagon_A2_zxtb>;
/********************************************************************
@@ -1922,25 +1953,25 @@ def Hexagon_A2_zxtb:
*********************************************************************/
// ALU32 / PRED / Compare.
-def Hexagon_C2_cmpeq:
+def HEXAGON_C2_cmpeq:
qi_ALU32_sisi <"cmp.eq", int_hexagon_C2_cmpeq>;
-def Hexagon_C2_cmpeqi:
+def HEXAGON_C2_cmpeqi:
qi_ALU32_sis10 <"cmp.eq", int_hexagon_C2_cmpeqi>;
-def Hexagon_C2_cmpgei:
+def HEXAGON_C2_cmpgei:
qi_ALU32_sis8 <"cmp.ge", int_hexagon_C2_cmpgei>;
-def Hexagon_C2_cmpgeui:
+def HEXAGON_C2_cmpgeui:
qi_ALU32_siu8 <"cmp.geu", int_hexagon_C2_cmpgeui>;
-def Hexagon_C2_cmpgt:
+def HEXAGON_C2_cmpgt:
qi_ALU32_sisi <"cmp.gt", int_hexagon_C2_cmpgt>;
-def Hexagon_C2_cmpgti:
+def HEXAGON_C2_cmpgti:
qi_ALU32_sis10 <"cmp.gt", int_hexagon_C2_cmpgti>;
-def Hexagon_C2_cmpgtu:
+def HEXAGON_C2_cmpgtu:
qi_ALU32_sisi <"cmp.gtu", int_hexagon_C2_cmpgtu>;
-def Hexagon_C2_cmpgtui:
+def HEXAGON_C2_cmpgtui:
qi_ALU32_siu9 <"cmp.gtu", int_hexagon_C2_cmpgtui>;
-def Hexagon_C2_cmplt:
+def HEXAGON_C2_cmplt:
qi_ALU32_sisi <"cmp.lt", int_hexagon_C2_cmplt>;
-def Hexagon_C2_cmpltu:
+def HEXAGON_C2_cmpltu:
qi_ALU32_sisi <"cmp.ltu", int_hexagon_C2_cmpltu>;
/********************************************************************
@@ -1949,27 +1980,27 @@ def Hexagon_C2_cmpltu:
// ALU32 / VH / Vector add halfwords.
// Rd32=vadd[u]h(Rs32,Rt32)[:sat]
-def Hexagon_A2_svaddh:
+def HEXAGON_A2_svaddh:
si_ALU32_sisi <"vaddh", int_hexagon_A2_svaddh>;
-def Hexagon_A2_svaddhs:
+def HEXAGON_A2_svaddhs:
si_ALU32_sisi_sat <"vaddh", int_hexagon_A2_svaddhs>;
-def Hexagon_A2_svadduhs:
+def HEXAGON_A2_svadduhs:
si_ALU32_sisi_sat <"vadduh", int_hexagon_A2_svadduhs>;
// ALU32 / VH / Vector average halfwords.
-def Hexagon_A2_svavgh:
+def HEXAGON_A2_svavgh:
si_ALU32_sisi <"vavgh", int_hexagon_A2_svavgh>;
-def Hexagon_A2_svavghs:
+def HEXAGON_A2_svavghs:
si_ALU32_sisi_rnd <"vavgh", int_hexagon_A2_svavghs>;
-def Hexagon_A2_svnavgh:
+def HEXAGON_A2_svnavgh:
si_ALU32_sisi <"vnavgh", int_hexagon_A2_svnavgh>;
// ALU32 / VH / Vector subtract halfwords.
-def Hexagon_A2_svsubh:
+def HEXAGON_A2_svsubh:
si_ALU32_sisi <"vsubh", int_hexagon_A2_svsubh>;
-def Hexagon_A2_svsubhs:
+def HEXAGON_A2_svsubhs:
si_ALU32_sisi_sat <"vsubh", int_hexagon_A2_svsubhs>;
-def Hexagon_A2_svsubuhs:
+def HEXAGON_A2_svsubuhs:
si_ALU32_sisi_sat <"vsubuh", int_hexagon_A2_svsubuhs>;
/********************************************************************
@@ -1977,109 +2008,109 @@ def Hexagon_A2_svsubuhs:
*********************************************************************/
// ALU64 / ALU / Add.
-def Hexagon_A2_addp:
+def HEXAGON_A2_addp:
di_ALU64_didi <"add", int_hexagon_A2_addp>;
-def Hexagon_A2_addsat:
+def HEXAGON_A2_addsat:
si_ALU64_sisi_sat <"add", int_hexagon_A2_addsat>;
// ALU64 / ALU / Add halfword.
// Even though the definition name says hl, the operation is actually lh -
// so DON'T change the "si_ALU64_sisi_l16_lh" class it inherits.
-def Hexagon_A2_addh_l16_hl:
+def HEXAGON_A2_addh_l16_hl:
si_ALU64_sisi_l16_lh <"add", int_hexagon_A2_addh_l16_hl>;
-def Hexagon_A2_addh_l16_ll:
+def HEXAGON_A2_addh_l16_ll:
si_ALU64_sisi_l16_ll <"add", int_hexagon_A2_addh_l16_ll>;
-def Hexagon_A2_addh_l16_sat_hl:
+def HEXAGON_A2_addh_l16_sat_hl:
si_ALU64_sisi_l16_sat_lh <"add", int_hexagon_A2_addh_l16_sat_hl>;
-def Hexagon_A2_addh_l16_sat_ll:
+def HEXAGON_A2_addh_l16_sat_ll:
si_ALU64_sisi_l16_sat_ll <"add", int_hexagon_A2_addh_l16_sat_ll>;
-def Hexagon_A2_addh_h16_hh:
+def HEXAGON_A2_addh_h16_hh:
si_ALU64_sisi_h16_hh <"add", int_hexagon_A2_addh_h16_hh>;
-def Hexagon_A2_addh_h16_hl:
+def HEXAGON_A2_addh_h16_hl:
si_ALU64_sisi_h16_hl <"add", int_hexagon_A2_addh_h16_hl>;
-def Hexagon_A2_addh_h16_lh:
+def HEXAGON_A2_addh_h16_lh:
si_ALU64_sisi_h16_lh <"add", int_hexagon_A2_addh_h16_lh>;
-def Hexagon_A2_addh_h16_ll:
+def HEXAGON_A2_addh_h16_ll:
si_ALU64_sisi_h16_ll <"add", int_hexagon_A2_addh_h16_ll>;
-def Hexagon_A2_addh_h16_sat_hh:
+def HEXAGON_A2_addh_h16_sat_hh:
si_ALU64_sisi_h16_sat_hh <"add", int_hexagon_A2_addh_h16_sat_hh>;
-def Hexagon_A2_addh_h16_sat_hl:
+def HEXAGON_A2_addh_h16_sat_hl:
si_ALU64_sisi_h16_sat_hl <"add", int_hexagon_A2_addh_h16_sat_hl>;
-def Hexagon_A2_addh_h16_sat_lh:
+def HEXAGON_A2_addh_h16_sat_lh:
si_ALU64_sisi_h16_sat_lh <"add", int_hexagon_A2_addh_h16_sat_lh>;
-def Hexagon_A2_addh_h16_sat_ll:
+def HEXAGON_A2_addh_h16_sat_ll:
si_ALU64_sisi_h16_sat_ll <"add", int_hexagon_A2_addh_h16_sat_ll>;
// ALU64 / ALU / Compare.
-def Hexagon_C2_cmpeqp:
+def HEXAGON_C2_cmpeqp:
qi_ALU64_didi <"cmp.eq", int_hexagon_C2_cmpeqp>;
-def Hexagon_C2_cmpgtp:
+def HEXAGON_C2_cmpgtp:
qi_ALU64_didi <"cmp.gt", int_hexagon_C2_cmpgtp>;
-def Hexagon_C2_cmpgtup:
+def HEXAGON_C2_cmpgtup:
qi_ALU64_didi <"cmp.gtu", int_hexagon_C2_cmpgtup>;
// ALU64 / ALU / Logical operations.
-def Hexagon_A2_andp:
+def HEXAGON_A2_andp:
di_ALU64_didi <"and", int_hexagon_A2_andp>;
-def Hexagon_A2_orp:
+def HEXAGON_A2_orp:
di_ALU64_didi <"or", int_hexagon_A2_orp>;
-def Hexagon_A2_xorp:
+def HEXAGON_A2_xorp:
di_ALU64_didi <"xor", int_hexagon_A2_xorp>;
// ALU64 / ALU / Maximum.
-def Hexagon_A2_max:
+def HEXAGON_A2_max:
si_ALU64_sisi <"max", int_hexagon_A2_max>;
-def Hexagon_A2_maxu:
+def HEXAGON_A2_maxu:
si_ALU64_sisi <"maxu", int_hexagon_A2_maxu>;
// ALU64 / ALU / Minimum.
-def Hexagon_A2_min:
+def HEXAGON_A2_min:
si_ALU64_sisi <"min", int_hexagon_A2_min>;
-def Hexagon_A2_minu:
+def HEXAGON_A2_minu:
si_ALU64_sisi <"minu", int_hexagon_A2_minu>;
// ALU64 / ALU / Subtract.
-def Hexagon_A2_subp:
+def HEXAGON_A2_subp:
di_ALU64_didi <"sub", int_hexagon_A2_subp>;
-def Hexagon_A2_subsat:
+def HEXAGON_A2_subsat:
si_ALU64_sisi_sat <"sub", int_hexagon_A2_subsat>;
// ALU64 / ALU / Subtract halfword.
// Even though the definition name says hl, the operation is actually lh -
// so DON'T change the "si_ALU64_sisi_l16_lh" class it inherits.
-def Hexagon_A2_subh_l16_hl:
+def HEXAGON_A2_subh_l16_hl:
si_ALU64_sisi_l16_lh <"sub", int_hexagon_A2_subh_l16_hl>;
-def Hexagon_A2_subh_l16_ll:
+def HEXAGON_A2_subh_l16_ll:
si_ALU64_sisi_l16_ll <"sub", int_hexagon_A2_subh_l16_ll>;
-def Hexagon_A2_subh_l16_sat_hl:
+def HEXAGON_A2_subh_l16_sat_hl:
si_ALU64_sisi_l16_sat_lh <"sub", int_hexagon_A2_subh_l16_sat_hl>;
-def Hexagon_A2_subh_l16_sat_ll:
+def HEXAGON_A2_subh_l16_sat_ll:
si_ALU64_sisi_l16_sat_ll <"sub", int_hexagon_A2_subh_l16_sat_ll>;
-def Hexagon_A2_subh_h16_hh:
+def HEXAGON_A2_subh_h16_hh:
si_ALU64_sisi_h16_hh <"sub", int_hexagon_A2_subh_h16_hh>;
-def Hexagon_A2_subh_h16_hl:
+def HEXAGON_A2_subh_h16_hl:
si_ALU64_sisi_h16_hl <"sub", int_hexagon_A2_subh_h16_hl>;
-def Hexagon_A2_subh_h16_lh:
+def HEXAGON_A2_subh_h16_lh:
si_ALU64_sisi_h16_lh <"sub", int_hexagon_A2_subh_h16_lh>;
-def Hexagon_A2_subh_h16_ll:
+def HEXAGON_A2_subh_h16_ll:
si_ALU64_sisi_h16_ll <"sub", int_hexagon_A2_subh_h16_ll>;
-def Hexagon_A2_subh_h16_sat_hh:
+def HEXAGON_A2_subh_h16_sat_hh:
si_ALU64_sisi_h16_sat_hh <"sub", int_hexagon_A2_subh_h16_sat_hh>;
-def Hexagon_A2_subh_h16_sat_hl:
+def HEXAGON_A2_subh_h16_sat_hl:
si_ALU64_sisi_h16_sat_hl <"sub", int_hexagon_A2_subh_h16_sat_hl>;
-def Hexagon_A2_subh_h16_sat_lh:
+def HEXAGON_A2_subh_h16_sat_lh:
si_ALU64_sisi_h16_sat_lh <"sub", int_hexagon_A2_subh_h16_sat_lh>;
-def Hexagon_A2_subh_h16_sat_ll:
+def HEXAGON_A2_subh_h16_sat_ll:
si_ALU64_sisi_h16_sat_ll <"sub", int_hexagon_A2_subh_h16_sat_ll>;
// ALU64 / ALU / Transfer register.
-def Hexagon_A2_tfrp:
+def HEXAGON_A2_tfrp:
di_ALU64_di <"", int_hexagon_A2_tfrp>;
/********************************************************************
@@ -2087,7 +2118,7 @@ def Hexagon_A2_tfrp:
*********************************************************************/
// ALU64 / BIT / Masked parity.
-def Hexagon_S2_parityp:
+def HEXAGON_S2_parityp:
si_ALU64_didi <"parity", int_hexagon_S2_parityp>;
/********************************************************************
@@ -2095,7 +2126,7 @@ def Hexagon_S2_parityp:
*********************************************************************/
// ALU64 / PERM / Vector pack high and low halfwords.
-def Hexagon_S2_packhl:
+def HEXAGON_S2_packhl:
di_ALU64_sisi <"packhl", int_hexagon_S2_packhl>;
/********************************************************************
@@ -2103,37 +2134,37 @@ def Hexagon_S2_packhl:
*********************************************************************/
// ALU64 / VB / Vector add unsigned bytes.
-def Hexagon_A2_vaddub:
+def HEXAGON_A2_vaddub:
di_ALU64_didi <"vaddub", int_hexagon_A2_vaddub>;
-def Hexagon_A2_vaddubs:
+def HEXAGON_A2_vaddubs:
di_ALU64_didi_sat <"vaddub", int_hexagon_A2_vaddubs>;
// ALU64 / VB / Vector average unsigned bytes.
-def Hexagon_A2_vavgub:
+def HEXAGON_A2_vavgub:
di_ALU64_didi <"vavgub", int_hexagon_A2_vavgub>;
-def Hexagon_A2_vavgubr:
+def HEXAGON_A2_vavgubr:
di_ALU64_didi_rnd <"vavgub", int_hexagon_A2_vavgubr>;
// ALU64 / VB / Vector compare unsigned bytes.
-def Hexagon_A2_vcmpbeq:
+def HEXAGON_A2_vcmpbeq:
qi_ALU64_didi <"vcmpb.eq", int_hexagon_A2_vcmpbeq>;
-def Hexagon_A2_vcmpbgtu:
+def HEXAGON_A2_vcmpbgtu:
qi_ALU64_didi <"vcmpb.gtu",int_hexagon_A2_vcmpbgtu>;
// ALU64 / VB / Vector maximum/minimum unsigned bytes.
-def Hexagon_A2_vmaxub:
+def HEXAGON_A2_vmaxub:
di_ALU64_didi <"vmaxub", int_hexagon_A2_vmaxub>;
-def Hexagon_A2_vminub:
+def HEXAGON_A2_vminub:
di_ALU64_didi <"vminub", int_hexagon_A2_vminub>;
// ALU64 / VB / Vector subtract unsigned bytes.
-def Hexagon_A2_vsubub:
+def HEXAGON_A2_vsubub:
di_ALU64_didi <"vsubub", int_hexagon_A2_vsubub>;
-def Hexagon_A2_vsububs:
+def HEXAGON_A2_vsububs:
di_ALU64_didi_sat <"vsubub", int_hexagon_A2_vsububs>;
// ALU64 / VB / Vector mux.
-def Hexagon_C2_vmux:
+def HEXAGON_C2_vmux:
di_ALU64_qididi <"vmux", int_hexagon_C2_vmux>;
@@ -2143,58 +2174,58 @@ def Hexagon_C2_vmux:
// ALU64 / VH / Vector add halfwords.
// Rdd64=vadd[u]h(Rss64,Rtt64)[:sat]
-def Hexagon_A2_vaddh:
+def HEXAGON_A2_vaddh:
di_ALU64_didi <"vaddh", int_hexagon_A2_vaddh>;
-def Hexagon_A2_vaddhs:
+def HEXAGON_A2_vaddhs:
di_ALU64_didi_sat <"vaddh", int_hexagon_A2_vaddhs>;
-def Hexagon_A2_vadduhs:
+def HEXAGON_A2_vadduhs:
di_ALU64_didi_sat <"vadduh", int_hexagon_A2_vadduhs>;
// ALU64 / VH / Vector average halfwords.
// Rdd64=v[n]avg[u]h(Rss64,Rtt64)[:rnd|:crnd][:sat]
-def Hexagon_A2_vavgh:
+def HEXAGON_A2_vavgh:
di_ALU64_didi <"vavgh", int_hexagon_A2_vavgh>;
-def Hexagon_A2_vavghcr:
+def HEXAGON_A2_vavghcr:
di_ALU64_didi_crnd <"vavgh", int_hexagon_A2_vavghcr>;
-def Hexagon_A2_vavghr:
+def HEXAGON_A2_vavghr:
di_ALU64_didi_rnd <"vavgh", int_hexagon_A2_vavghr>;
-def Hexagon_A2_vavguh:
+def HEXAGON_A2_vavguh:
di_ALU64_didi <"vavguh", int_hexagon_A2_vavguh>;
-def Hexagon_A2_vavguhr:
+def HEXAGON_A2_vavguhr:
di_ALU64_didi_rnd <"vavguh", int_hexagon_A2_vavguhr>;
-def Hexagon_A2_vnavgh:
+def HEXAGON_A2_vnavgh:
di_ALU64_didi <"vnavgh", int_hexagon_A2_vnavgh>;
-def Hexagon_A2_vnavghcr:
+def HEXAGON_A2_vnavghcr:
di_ALU64_didi_crnd_sat <"vnavgh", int_hexagon_A2_vnavghcr>;
-def Hexagon_A2_vnavghr:
+def HEXAGON_A2_vnavghr:
di_ALU64_didi_rnd_sat <"vnavgh", int_hexagon_A2_vnavghr>;
// ALU64 / VH / Vector compare halfwords.
-def Hexagon_A2_vcmpheq:
+def HEXAGON_A2_vcmpheq:
qi_ALU64_didi <"vcmph.eq", int_hexagon_A2_vcmpheq>;
-def Hexagon_A2_vcmphgt:
+def HEXAGON_A2_vcmphgt:
qi_ALU64_didi <"vcmph.gt", int_hexagon_A2_vcmphgt>;
-def Hexagon_A2_vcmphgtu:
+def HEXAGON_A2_vcmphgtu:
qi_ALU64_didi <"vcmph.gtu",int_hexagon_A2_vcmphgtu>;
// ALU64 / VH / Vector maximum halfwords.
-def Hexagon_A2_vmaxh:
+def HEXAGON_A2_vmaxh:
di_ALU64_didi <"vmaxh", int_hexagon_A2_vmaxh>;
-def Hexagon_A2_vmaxuh:
+def HEXAGON_A2_vmaxuh:
di_ALU64_didi <"vmaxuh", int_hexagon_A2_vmaxuh>;
// ALU64 / VH / Vector minimum halfwords.
-def Hexagon_A2_vminh:
+def HEXAGON_A2_vminh:
di_ALU64_didi <"vminh", int_hexagon_A2_vminh>;
-def Hexagon_A2_vminuh:
+def HEXAGON_A2_vminuh:
di_ALU64_didi <"vminuh", int_hexagon_A2_vminuh>;
// ALU64 / VH / Vector subtract halfwords.
-def Hexagon_A2_vsubh:
+def HEXAGON_A2_vsubh:
di_ALU64_didi <"vsubh", int_hexagon_A2_vsubh>;
-def Hexagon_A2_vsubhs:
+def HEXAGON_A2_vsubhs:
di_ALU64_didi_sat <"vsubh", int_hexagon_A2_vsubhs>;
-def Hexagon_A2_vsubuhs:
+def HEXAGON_A2_vsubuhs:
di_ALU64_didi_sat <"vsubuh", int_hexagon_A2_vsubuhs>;
@@ -2204,53 +2235,53 @@ def Hexagon_A2_vsubuhs:
// ALU64 / VW / Vector add words.
// Rdd32=vaddw(Rss32,Rtt32)[:sat]
-def Hexagon_A2_vaddw:
+def HEXAGON_A2_vaddw:
di_ALU64_didi <"vaddw", int_hexagon_A2_vaddw>;
-def Hexagon_A2_vaddws:
+def HEXAGON_A2_vaddws:
di_ALU64_didi_sat <"vaddw", int_hexagon_A2_vaddws>;
// ALU64 / VW / Vector average words.
-def Hexagon_A2_vavguw:
+def HEXAGON_A2_vavguw:
di_ALU64_didi <"vavguw", int_hexagon_A2_vavguw>;
-def Hexagon_A2_vavguwr:
+def HEXAGON_A2_vavguwr:
di_ALU64_didi_rnd <"vavguw", int_hexagon_A2_vavguwr>;
-def Hexagon_A2_vavgw:
+def HEXAGON_A2_vavgw:
di_ALU64_didi <"vavgw", int_hexagon_A2_vavgw>;
-def Hexagon_A2_vavgwcr:
+def HEXAGON_A2_vavgwcr:
di_ALU64_didi_crnd <"vavgw", int_hexagon_A2_vavgwcr>;
-def Hexagon_A2_vavgwr:
+def HEXAGON_A2_vavgwr:
di_ALU64_didi_rnd <"vavgw", int_hexagon_A2_vavgwr>;
-def Hexagon_A2_vnavgw:
+def HEXAGON_A2_vnavgw:
di_ALU64_didi <"vnavgw", int_hexagon_A2_vnavgw>;
-def Hexagon_A2_vnavgwcr:
+def HEXAGON_A2_vnavgwcr:
di_ALU64_didi_crnd_sat <"vnavgw", int_hexagon_A2_vnavgwcr>;
-def Hexagon_A2_vnavgwr:
+def HEXAGON_A2_vnavgwr:
di_ALU64_didi_rnd_sat <"vnavgw", int_hexagon_A2_vnavgwr>;
// ALU64 / VW / Vector compare words.
-def Hexagon_A2_vcmpweq:
+def HEXAGON_A2_vcmpweq:
qi_ALU64_didi <"vcmpw.eq", int_hexagon_A2_vcmpweq>;
-def Hexagon_A2_vcmpwgt:
+def HEXAGON_A2_vcmpwgt:
qi_ALU64_didi <"vcmpw.gt", int_hexagon_A2_vcmpwgt>;
-def Hexagon_A2_vcmpwgtu:
+def HEXAGON_A2_vcmpwgtu:
qi_ALU64_didi <"vcmpw.gtu",int_hexagon_A2_vcmpwgtu>;
// ALU64 / VW / Vector maximum words.
-def Hexagon_A2_vmaxw:
+def HEXAGON_A2_vmaxw:
di_ALU64_didi <"vmaxw", int_hexagon_A2_vmaxw>;
-def Hexagon_A2_vmaxuw:
+def HEXAGON_A2_vmaxuw:
di_ALU64_didi <"vmaxuw", int_hexagon_A2_vmaxuw>;
// ALU64 / VW / Vector minimum words.
-def Hexagon_A2_vminw:
+def HEXAGON_A2_vminw:
di_ALU64_didi <"vminw", int_hexagon_A2_vminw>;
-def Hexagon_A2_vminuw:
+def HEXAGON_A2_vminuw:
di_ALU64_didi <"vminuw", int_hexagon_A2_vminuw>;
// ALU64 / VW / Vector subtract words.
-def Hexagon_A2_vsubw:
+def HEXAGON_A2_vsubw:
di_ALU64_didi <"vsubw", int_hexagon_A2_vsubw>;
-def Hexagon_A2_vsubws:
+def HEXAGON_A2_vsubws:
di_ALU64_didi_sat <"vsubw", int_hexagon_A2_vsubws>;
@@ -2259,25 +2290,25 @@ def Hexagon_A2_vsubws:
*********************************************************************/
// CR / Logical reductions on predicates.
-def Hexagon_C2_all8:
+def HEXAGON_C2_all8:
qi_SInst_qi <"all8", int_hexagon_C2_all8>;
-def Hexagon_C2_any8:
+def HEXAGON_C2_any8:
qi_SInst_qi <"any8", int_hexagon_C2_any8>;
// CR / Logical operations on predicates.
-def Hexagon_C2_pxfer_map:
+def HEXAGON_C2_pxfer_map:
qi_SInst_qi_pxfer <"", int_hexagon_C2_pxfer_map>;
-def Hexagon_C2_and:
+def HEXAGON_C2_and:
qi_SInst_qiqi <"and", int_hexagon_C2_and>;
-def Hexagon_C2_andn:
+def HEXAGON_C2_andn:
qi_SInst_qiqi_neg <"and", int_hexagon_C2_andn>;
-def Hexagon_C2_not:
+def HEXAGON_C2_not:
qi_SInst_qi <"not", int_hexagon_C2_not>;
-def Hexagon_C2_or:
+def HEXAGON_C2_or:
qi_SInst_qiqi <"or", int_hexagon_C2_or>;
-def Hexagon_C2_orn:
+def HEXAGON_C2_orn:
qi_SInst_qiqi_neg <"or", int_hexagon_C2_orn>;
-def Hexagon_C2_xor:
+def HEXAGON_C2_xor:
qi_SInst_qiqi <"xor", int_hexagon_C2_xor>;
@@ -2286,27 +2317,27 @@ def Hexagon_C2_xor:
*********************************************************************/
// MTYPE / ALU / Add and accumulate.
-def Hexagon_M2_acci:
+def HEXAGON_M2_acci:
si_MInst_sisisi_acc <"add", int_hexagon_M2_acci>;
-def Hexagon_M2_accii:
+def HEXAGON_M2_accii:
si_MInst_sisis8_acc <"add", int_hexagon_M2_accii>;
-def Hexagon_M2_nacci:
+def HEXAGON_M2_nacci:
si_MInst_sisisi_nac <"add", int_hexagon_M2_nacci>;
-def Hexagon_M2_naccii:
+def HEXAGON_M2_naccii:
si_MInst_sisis8_nac <"add", int_hexagon_M2_naccii>;
// MTYPE / ALU / Subtract and accumulate.
-def Hexagon_M2_subacc:
+def HEXAGON_M2_subacc:
si_MInst_sisisi_acc <"sub", int_hexagon_M2_subacc>;
// MTYPE / ALU / Vector absolute difference.
-def Hexagon_M2_vabsdiffh:
+def HEXAGON_M2_vabsdiffh:
di_MInst_didi <"vabsdiffh",int_hexagon_M2_vabsdiffh>;
-def Hexagon_M2_vabsdiffw:
+def HEXAGON_M2_vabsdiffw:
di_MInst_didi <"vabsdiffw",int_hexagon_M2_vabsdiffw>;
// MTYPE / ALU / XOR and xor with destination.
-def Hexagon_M2_xor_xacc:
+def HEXAGON_M2_xor_xacc:
si_MInst_sisisi_xacc <"xor", int_hexagon_M2_xor_xacc>;
@@ -2316,91 +2347,91 @@ def Hexagon_M2_xor_xacc:
// MTYPE / COMPLEX / Complex multiply.
// Rdd[-+]=cmpy(Rs, Rt)[:<<1]:sat
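// For example, the di_MInst_disisi_acc_s1_sat class used by the cmacs
// defs below emits "$dst += cmpy($src1, $src2):<<1:sat" with a 64-bit
// accumulator, while the nac variants emit "$dst -= ..." instead.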
-def Hexagon_M2_cmpys_s1:
+def HEXAGON_M2_cmpys_s1:
di_MInst_sisi_s1_sat <"cmpy", int_hexagon_M2_cmpys_s1>;
-def Hexagon_M2_cmpys_s0:
+def HEXAGON_M2_cmpys_s0:
di_MInst_sisi_sat <"cmpy", int_hexagon_M2_cmpys_s0>;
-def Hexagon_M2_cmpysc_s1:
+def HEXAGON_M2_cmpysc_s1:
di_MInst_sisi_s1_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s1>;
-def Hexagon_M2_cmpysc_s0:
+def HEXAGON_M2_cmpysc_s0:
di_MInst_sisi_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s0>;
-def Hexagon_M2_cmacs_s1:
+def HEXAGON_M2_cmacs_s1:
di_MInst_disisi_acc_s1_sat <"cmpy", int_hexagon_M2_cmacs_s1>;
-def Hexagon_M2_cmacs_s0:
+def HEXAGON_M2_cmacs_s0:
di_MInst_disisi_acc_sat <"cmpy", int_hexagon_M2_cmacs_s0>;
-def Hexagon_M2_cmacsc_s1:
+def HEXAGON_M2_cmacsc_s1:
di_MInst_disisi_acc_s1_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s1>;
-def Hexagon_M2_cmacsc_s0:
+def HEXAGON_M2_cmacsc_s0:
di_MInst_disisi_acc_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s0>;
-def Hexagon_M2_cnacs_s1:
+def HEXAGON_M2_cnacs_s1:
di_MInst_disisi_nac_s1_sat <"cmpy", int_hexagon_M2_cnacs_s1>;
-def Hexagon_M2_cnacs_s0:
+def HEXAGON_M2_cnacs_s0:
di_MInst_disisi_nac_sat <"cmpy", int_hexagon_M2_cnacs_s0>;
-def Hexagon_M2_cnacsc_s1:
+def HEXAGON_M2_cnacsc_s1:
di_MInst_disisi_nac_s1_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s1>;
-def Hexagon_M2_cnacsc_s0:
+def HEXAGON_M2_cnacsc_s0:
di_MInst_disisi_nac_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s0>;
// MTYPE / COMPLEX / Complex multiply real or imaginary.
-def Hexagon_M2_cmpyr_s0:
+def HEXAGON_M2_cmpyr_s0:
di_MInst_sisi <"cmpyr", int_hexagon_M2_cmpyr_s0>;
-def Hexagon_M2_cmacr_s0:
+def HEXAGON_M2_cmacr_s0:
di_MInst_disisi_acc <"cmpyr", int_hexagon_M2_cmacr_s0>;
-def Hexagon_M2_cmpyi_s0:
+def HEXAGON_M2_cmpyi_s0:
di_MInst_sisi <"cmpyi", int_hexagon_M2_cmpyi_s0>;
-def Hexagon_M2_cmaci_s0:
+def HEXAGON_M2_cmaci_s0:
di_MInst_disisi_acc <"cmpyi", int_hexagon_M2_cmaci_s0>;
// MTYPE / COMPLEX / Complex multiply with round and pack.
// Rd32=cmpy(Rs32,[*]Rt32)[:<<1]:rnd:sat
-def Hexagon_M2_cmpyrs_s0:
+def HEXAGON_M2_cmpyrs_s0:
si_MInst_sisi_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s0>;
-def Hexagon_M2_cmpyrs_s1:
+def HEXAGON_M2_cmpyrs_s1:
si_MInst_sisi_s1_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s1>;
-def Hexagon_M2_cmpyrsc_s0:
+def HEXAGON_M2_cmpyrsc_s0:
si_MInst_sisi_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s0>;
-def Hexagon_M2_cmpyrsc_s1:
+def HEXAGON_M2_cmpyrsc_s1:
si_MInst_sisi_s1_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s1>;
// MTYPE / COMPLEX / Vector complex multiply real or imaginary.
-def Hexagon_M2_vcmpy_s0_sat_i:
+def HEXAGON_M2_vcmpy_s0_sat_i:
di_MInst_didi_sat <"vcmpyi", int_hexagon_M2_vcmpy_s0_sat_i>;
-def Hexagon_M2_vcmpy_s1_sat_i:
+def HEXAGON_M2_vcmpy_s1_sat_i:
di_MInst_didi_s1_sat <"vcmpyi", int_hexagon_M2_vcmpy_s1_sat_i>;
-def Hexagon_M2_vcmpy_s0_sat_r:
+def HEXAGON_M2_vcmpy_s0_sat_r:
di_MInst_didi_sat <"vcmpyr", int_hexagon_M2_vcmpy_s0_sat_r>;
-def Hexagon_M2_vcmpy_s1_sat_r:
+def HEXAGON_M2_vcmpy_s1_sat_r:
di_MInst_didi_s1_sat <"vcmpyr", int_hexagon_M2_vcmpy_s1_sat_r>;
-def Hexagon_M2_vcmac_s0_sat_i:
+def HEXAGON_M2_vcmac_s0_sat_i:
di_MInst_dididi_acc_sat <"vcmpyi", int_hexagon_M2_vcmac_s0_sat_i>;
-def Hexagon_M2_vcmac_s0_sat_r:
+def HEXAGON_M2_vcmac_s0_sat_r:
di_MInst_dididi_acc_sat <"vcmpyr", int_hexagon_M2_vcmac_s0_sat_r>;
// MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary.
-def Hexagon_M2_vrcmpyi_s0:
+def HEXAGON_M2_vrcmpyi_s0:
di_MInst_didi <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0>;
-def Hexagon_M2_vrcmpyr_s0:
+def HEXAGON_M2_vrcmpyr_s0:
di_MInst_didi <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0>;
-def Hexagon_M2_vrcmpyi_s0c:
+def HEXAGON_M2_vrcmpyi_s0c:
di_MInst_didi_conj <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0c>;
-def Hexagon_M2_vrcmpyr_s0c:
+def HEXAGON_M2_vrcmpyr_s0c:
di_MInst_didi_conj <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0c>;
-def Hexagon_M2_vrcmaci_s0:
+def HEXAGON_M2_vrcmaci_s0:
di_MInst_dididi_acc <"vrcmpyi", int_hexagon_M2_vrcmaci_s0>;
-def Hexagon_M2_vrcmacr_s0:
+def HEXAGON_M2_vrcmacr_s0:
di_MInst_dididi_acc <"vrcmpyr", int_hexagon_M2_vrcmacr_s0>;
-def Hexagon_M2_vrcmaci_s0c:
+def HEXAGON_M2_vrcmaci_s0c:
di_MInst_dididi_acc_conj <"vrcmpyi", int_hexagon_M2_vrcmaci_s0c>;
-def Hexagon_M2_vrcmacr_s0c:
+def HEXAGON_M2_vrcmacr_s0c:
di_MInst_dididi_acc_conj <"vrcmpyr", int_hexagon_M2_vrcmacr_s0c>;
@@ -2409,115 +2440,120 @@ def Hexagon_M2_vrcmacr_s0c:
*********************************************************************/
// MTYPE / MPYH / Multiply and use lower result.
-//def Hexagon_M2_mpysmi:
+//def HEXAGON_M2_mpysmi:
+//FIXME: Hexagon_M2_mpysmi should really be of type si_MInst_sim9,
+// not si_MInst_sis9 - but for now, we will use s9.
+// def Hexagon_M2_mpysmi:
// si_MInst_sim9 <"mpyi", int_hexagon_M2_mpysmi>;
-def Hexagon_M2_mpyi:
+def Hexagon_M2_mpysmi:
+ si_MInst_sis9 <"mpyi", int_hexagon_M2_mpysmi>;
+def HEXAGON_M2_mpyi:
si_MInst_sisi <"mpyi", int_hexagon_M2_mpyi>;
-def Hexagon_M2_mpyui:
+def HEXAGON_M2_mpyui:
si_MInst_sisi <"mpyui", int_hexagon_M2_mpyui>;
-def Hexagon_M2_macsip:
+def HEXAGON_M2_macsip:
si_MInst_sisiu8_acc <"mpyi", int_hexagon_M2_macsip>;
-def Hexagon_M2_maci:
+def HEXAGON_M2_maci:
si_MInst_sisisi_acc <"mpyi", int_hexagon_M2_maci>;
-def Hexagon_M2_macsin:
+def HEXAGON_M2_macsin:
si_MInst_sisiu8_nac <"mpyi", int_hexagon_M2_macsin>;
// MTYPE / MPYH / Multiply word by half (32x16).
//Rdd[+]=vmpywoh(Rss,Rtt)[:<<1][:rnd][:sat]
//Rdd[+]=vmpyweh(Rss,Rtt)[:<<1][:rnd][:sat]
-def Hexagon_M2_mmpyl_rs1:
+def HEXAGON_M2_mmpyl_rs1:
di_MInst_didi_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs1>;
-def Hexagon_M2_mmpyl_s1:
+def HEXAGON_M2_mmpyl_s1:
di_MInst_didi_s1_sat <"vmpyweh", int_hexagon_M2_mmpyl_s1>;
-def Hexagon_M2_mmpyl_rs0:
+def HEXAGON_M2_mmpyl_rs0:
di_MInst_didi_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs0>;
-def Hexagon_M2_mmpyl_s0:
+def HEXAGON_M2_mmpyl_s0:
di_MInst_didi_sat <"vmpyweh", int_hexagon_M2_mmpyl_s0>;
-def Hexagon_M2_mmpyh_rs1:
+def HEXAGON_M2_mmpyh_rs1:
di_MInst_didi_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs1>;
-def Hexagon_M2_mmpyh_s1:
+def HEXAGON_M2_mmpyh_s1:
di_MInst_didi_s1_sat <"vmpywoh", int_hexagon_M2_mmpyh_s1>;
-def Hexagon_M2_mmpyh_rs0:
+def HEXAGON_M2_mmpyh_rs0:
di_MInst_didi_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs0>;
-def Hexagon_M2_mmpyh_s0:
+def HEXAGON_M2_mmpyh_s0:
di_MInst_didi_sat <"vmpywoh", int_hexagon_M2_mmpyh_s0>;
-def Hexagon_M2_mmacls_rs1:
+def HEXAGON_M2_mmacls_rs1:
di_MInst_dididi_acc_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs1>;
-def Hexagon_M2_mmacls_s1:
+def HEXAGON_M2_mmacls_s1:
di_MInst_dididi_acc_s1_sat <"vmpyweh", int_hexagon_M2_mmacls_s1>;
-def Hexagon_M2_mmacls_rs0:
+def HEXAGON_M2_mmacls_rs0:
di_MInst_dididi_acc_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs0>;
-def Hexagon_M2_mmacls_s0:
+def HEXAGON_M2_mmacls_s0:
di_MInst_dididi_acc_sat <"vmpyweh", int_hexagon_M2_mmacls_s0>;
-def Hexagon_M2_mmachs_rs1:
+def HEXAGON_M2_mmachs_rs1:
di_MInst_dididi_acc_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs1>;
-def Hexagon_M2_mmachs_s1:
+def HEXAGON_M2_mmachs_s1:
di_MInst_dididi_acc_s1_sat <"vmpywoh", int_hexagon_M2_mmachs_s1>;
-def Hexagon_M2_mmachs_rs0:
+def HEXAGON_M2_mmachs_rs0:
di_MInst_dididi_acc_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs0>;
-def Hexagon_M2_mmachs_s0:
+def HEXAGON_M2_mmachs_s0:
di_MInst_dididi_acc_sat <"vmpywoh", int_hexagon_M2_mmachs_s0>;
// MTYPE / MPYH / Multiply word by unsigned half (32x16).
//Rdd[+]=vmpywouh(Rss,Rtt)[:<<1][:rnd][:sat]
//Rdd[+]=vmpyweuh(Rss,Rtt)[:<<1][:rnd][:sat]
-def Hexagon_M2_mmpyul_rs1:
+def HEXAGON_M2_mmpyul_rs1:
di_MInst_didi_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs1>;
-def Hexagon_M2_mmpyul_s1:
+def HEXAGON_M2_mmpyul_s1:
di_MInst_didi_s1_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s1>;
-def Hexagon_M2_mmpyul_rs0:
+def HEXAGON_M2_mmpyul_rs0:
di_MInst_didi_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs0>;
-def Hexagon_M2_mmpyul_s0:
+def HEXAGON_M2_mmpyul_s0:
di_MInst_didi_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s0>;
-def Hexagon_M2_mmpyuh_rs1:
+def HEXAGON_M2_mmpyuh_rs1:
di_MInst_didi_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs1>;
-def Hexagon_M2_mmpyuh_s1:
+def HEXAGON_M2_mmpyuh_s1:
di_MInst_didi_s1_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s1>;
-def Hexagon_M2_mmpyuh_rs0:
+def HEXAGON_M2_mmpyuh_rs0:
di_MInst_didi_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs0>;
-def Hexagon_M2_mmpyuh_s0:
+def HEXAGON_M2_mmpyuh_s0:
di_MInst_didi_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s0>;
-def Hexagon_M2_mmaculs_rs1:
+def HEXAGON_M2_mmaculs_rs1:
di_MInst_dididi_acc_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs1>;
-def Hexagon_M2_mmaculs_s1:
+def HEXAGON_M2_mmaculs_s1:
di_MInst_dididi_acc_s1_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s1>;
-def Hexagon_M2_mmaculs_rs0:
+def HEXAGON_M2_mmaculs_rs0:
di_MInst_dididi_acc_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs0>;
-def Hexagon_M2_mmaculs_s0:
+def HEXAGON_M2_mmaculs_s0:
di_MInst_dididi_acc_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s0>;
-def Hexagon_M2_mmacuhs_rs1:
+def HEXAGON_M2_mmacuhs_rs1:
di_MInst_dididi_acc_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs1>;
-def Hexagon_M2_mmacuhs_s1:
+def HEXAGON_M2_mmacuhs_s1:
di_MInst_dididi_acc_s1_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s1>;
-def Hexagon_M2_mmacuhs_rs0:
+def HEXAGON_M2_mmacuhs_rs0:
di_MInst_dididi_acc_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs0>;
-def Hexagon_M2_mmacuhs_s0:
+def HEXAGON_M2_mmacuhs_s0:
di_MInst_dididi_acc_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s0>;
// MTYPE / MPYH / Multiply and use upper result.
-def Hexagon_M2_hmmpyh_rs1:
+def HEXAGON_M2_hmmpyh_rs1:
si_MInst_sisi_h_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyh_rs1>;
-def Hexagon_M2_hmmpyl_rs1:
+def HEXAGON_M2_hmmpyl_rs1:
si_MInst_sisi_l_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyl_rs1>;
-def Hexagon_M2_mpy_up:
+def HEXAGON_M2_mpy_up:
si_MInst_sisi <"mpy", int_hexagon_M2_mpy_up>;
-def Hexagon_M2_dpmpyss_rnd_s0:
+def HEXAGON_M2_dpmpyss_rnd_s0:
si_MInst_sisi_rnd <"mpy", int_hexagon_M2_dpmpyss_rnd_s0>;
-def Hexagon_M2_mpyu_up:
+def HEXAGON_M2_mpyu_up:
si_MInst_sisi <"mpyu", int_hexagon_M2_mpyu_up>;
// MTYPE / MPYH / Multiply and use full result.
-def Hexagon_M2_dpmpyuu_s0:
+def HEXAGON_M2_dpmpyuu_s0:
di_MInst_sisi <"mpyu", int_hexagon_M2_dpmpyuu_s0>;
-def Hexagon_M2_dpmpyuu_acc_s0:
+def HEXAGON_M2_dpmpyuu_acc_s0:
di_MInst_disisi_acc <"mpyu", int_hexagon_M2_dpmpyuu_acc_s0>;
-def Hexagon_M2_dpmpyuu_nac_s0:
+def HEXAGON_M2_dpmpyuu_nac_s0:
di_MInst_disisi_nac <"mpyu", int_hexagon_M2_dpmpyuu_nac_s0>;
-def Hexagon_M2_dpmpyss_s0:
+def HEXAGON_M2_dpmpyss_s0:
di_MInst_sisi <"mpy", int_hexagon_M2_dpmpyss_s0>;
-def Hexagon_M2_dpmpyss_acc_s0:
+def HEXAGON_M2_dpmpyss_acc_s0:
di_MInst_disisi_acc <"mpy", int_hexagon_M2_dpmpyss_acc_s0>;
-def Hexagon_M2_dpmpyss_nac_s0:
+def HEXAGON_M2_dpmpyss_nac_s0:
di_MInst_disisi_nac <"mpy", int_hexagon_M2_dpmpyss_nac_s0>;
@@ -2528,334 +2564,334 @@ def Hexagon_M2_dpmpyss_nac_s0:
// MTYPE / MPYS / Scalar 16x16 multiply signed.
//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<0|:<<1][:rnd|:sat|:rnd:sat]
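// For example, the sat_rnd_hh_s1 variant below combines the high
// halfwords of both sources with the ":<<1:rnd:sat" modifiers.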
-def Hexagon_M2_mpy_hh_s0:
+def HEXAGON_M2_mpy_hh_s0:
si_MInst_sisi_hh <"mpy", int_hexagon_M2_mpy_hh_s0>;
-def Hexagon_M2_mpy_hh_s1:
+def HEXAGON_M2_mpy_hh_s1:
si_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpy_hh_s1>;
-def Hexagon_M2_mpy_rnd_hh_s1:
+def HEXAGON_M2_mpy_rnd_hh_s1:
si_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_rnd_hh_s1>;
-def Hexagon_M2_mpy_sat_rnd_hh_s1:
+def HEXAGON_M2_mpy_sat_rnd_hh_s1:
si_MInst_sisi_sat_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s1>;
-def Hexagon_M2_mpy_sat_hh_s1:
+def HEXAGON_M2_mpy_sat_hh_s1:
si_MInst_sisi_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_hh_s1>;
-def Hexagon_M2_mpy_rnd_hh_s0:
+def HEXAGON_M2_mpy_rnd_hh_s0:
si_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpy_rnd_hh_s0>;
-def Hexagon_M2_mpy_sat_rnd_hh_s0:
+def HEXAGON_M2_mpy_sat_rnd_hh_s0:
si_MInst_sisi_sat_rnd_hh <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s0>;
-def Hexagon_M2_mpy_sat_hh_s0:
+def HEXAGON_M2_mpy_sat_hh_s0:
si_MInst_sisi_sat_hh <"mpy", int_hexagon_M2_mpy_sat_hh_s0>;
-def Hexagon_M2_mpy_hl_s0:
+def HEXAGON_M2_mpy_hl_s0:
si_MInst_sisi_hl <"mpy", int_hexagon_M2_mpy_hl_s0>;
-def Hexagon_M2_mpy_hl_s1:
+def HEXAGON_M2_mpy_hl_s1:
si_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpy_hl_s1>;
-def Hexagon_M2_mpy_rnd_hl_s1:
+def HEXAGON_M2_mpy_rnd_hl_s1:
si_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_rnd_hl_s1>;
-def Hexagon_M2_mpy_sat_rnd_hl_s1:
+def HEXAGON_M2_mpy_sat_rnd_hl_s1:
si_MInst_sisi_sat_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s1>;
-def Hexagon_M2_mpy_sat_hl_s1:
+def HEXAGON_M2_mpy_sat_hl_s1:
si_MInst_sisi_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_hl_s1>;
-def Hexagon_M2_mpy_rnd_hl_s0:
+def HEXAGON_M2_mpy_rnd_hl_s0:
si_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpy_rnd_hl_s0>;
-def Hexagon_M2_mpy_sat_rnd_hl_s0:
+def HEXAGON_M2_mpy_sat_rnd_hl_s0:
si_MInst_sisi_sat_rnd_hl <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s0>;
-def Hexagon_M2_mpy_sat_hl_s0:
+def HEXAGON_M2_mpy_sat_hl_s0:
si_MInst_sisi_sat_hl <"mpy", int_hexagon_M2_mpy_sat_hl_s0>;
-def Hexagon_M2_mpy_lh_s0:
+def HEXAGON_M2_mpy_lh_s0:
si_MInst_sisi_lh <"mpy", int_hexagon_M2_mpy_lh_s0>;
-def Hexagon_M2_mpy_lh_s1:
+def HEXAGON_M2_mpy_lh_s1:
si_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpy_lh_s1>;
-def Hexagon_M2_mpy_rnd_lh_s1:
+def HEXAGON_M2_mpy_rnd_lh_s1:
si_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_rnd_lh_s1>;
-def Hexagon_M2_mpy_sat_rnd_lh_s1:
+def HEXAGON_M2_mpy_sat_rnd_lh_s1:
si_MInst_sisi_sat_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s1>;
-def Hexagon_M2_mpy_sat_lh_s1:
+def HEXAGON_M2_mpy_sat_lh_s1:
si_MInst_sisi_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_lh_s1>;
-def Hexagon_M2_mpy_rnd_lh_s0:
+def HEXAGON_M2_mpy_rnd_lh_s0:
si_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpy_rnd_lh_s0>;
-def Hexagon_M2_mpy_sat_rnd_lh_s0:
+def HEXAGON_M2_mpy_sat_rnd_lh_s0:
si_MInst_sisi_sat_rnd_lh <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s0>;
-def Hexagon_M2_mpy_sat_lh_s0:
+def HEXAGON_M2_mpy_sat_lh_s0:
si_MInst_sisi_sat_lh <"mpy", int_hexagon_M2_mpy_sat_lh_s0>;
-def Hexagon_M2_mpy_ll_s0:
+def HEXAGON_M2_mpy_ll_s0:
si_MInst_sisi_ll <"mpy", int_hexagon_M2_mpy_ll_s0>;
-def Hexagon_M2_mpy_ll_s1:
+def HEXAGON_M2_mpy_ll_s1:
si_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpy_ll_s1>;
-def Hexagon_M2_mpy_rnd_ll_s1:
+def HEXAGON_M2_mpy_rnd_ll_s1:
si_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_rnd_ll_s1>;
-def Hexagon_M2_mpy_sat_rnd_ll_s1:
+def HEXAGON_M2_mpy_sat_rnd_ll_s1:
si_MInst_sisi_sat_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s1>;
-def Hexagon_M2_mpy_sat_ll_s1:
+def HEXAGON_M2_mpy_sat_ll_s1:
si_MInst_sisi_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_ll_s1>;
-def Hexagon_M2_mpy_rnd_ll_s0:
+def HEXAGON_M2_mpy_rnd_ll_s0:
si_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpy_rnd_ll_s0>;
-def Hexagon_M2_mpy_sat_rnd_ll_s0:
+def HEXAGON_M2_mpy_sat_rnd_ll_s0:
si_MInst_sisi_sat_rnd_ll <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s0>;
-def Hexagon_M2_mpy_sat_ll_s0:
+def HEXAGON_M2_mpy_sat_ll_s0:
si_MInst_sisi_sat_ll <"mpy", int_hexagon_M2_mpy_sat_ll_s0>;
//Rdd=mpy(Rs.[H|L],Rt.[H|L])[[:<<0|:<<1]|[:<<0:rnd|:<<1:rnd]]
-def Hexagon_M2_mpyd_hh_s0:
+def HEXAGON_M2_mpyd_hh_s0:
di_MInst_sisi_hh <"mpy", int_hexagon_M2_mpyd_hh_s0>;
-def Hexagon_M2_mpyd_hh_s1:
+def HEXAGON_M2_mpyd_hh_s1:
di_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpyd_hh_s1>;
-def Hexagon_M2_mpyd_rnd_hh_s1:
+def HEXAGON_M2_mpyd_rnd_hh_s1:
di_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hh_s1>;
-def Hexagon_M2_mpyd_rnd_hh_s0:
+def HEXAGON_M2_mpyd_rnd_hh_s0:
di_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpyd_rnd_hh_s0>;
-def Hexagon_M2_mpyd_hl_s0:
+def HEXAGON_M2_mpyd_hl_s0:
di_MInst_sisi_hl <"mpy", int_hexagon_M2_mpyd_hl_s0>;
-def Hexagon_M2_mpyd_hl_s1:
+def HEXAGON_M2_mpyd_hl_s1:
di_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpyd_hl_s1>;
-def Hexagon_M2_mpyd_rnd_hl_s1:
+def HEXAGON_M2_mpyd_rnd_hl_s1:
di_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hl_s1>;
-def Hexagon_M2_mpyd_rnd_hl_s0:
+def HEXAGON_M2_mpyd_rnd_hl_s0:
di_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpyd_rnd_hl_s0>;
-def Hexagon_M2_mpyd_lh_s0:
+def HEXAGON_M2_mpyd_lh_s0:
di_MInst_sisi_lh <"mpy", int_hexagon_M2_mpyd_lh_s0>;
-def Hexagon_M2_mpyd_lh_s1:
+def HEXAGON_M2_mpyd_lh_s1:
di_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpyd_lh_s1>;
-def Hexagon_M2_mpyd_rnd_lh_s1:
+def HEXAGON_M2_mpyd_rnd_lh_s1:
di_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_lh_s1>;
-def Hexagon_M2_mpyd_rnd_lh_s0:
+def HEXAGON_M2_mpyd_rnd_lh_s0:
di_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpyd_rnd_lh_s0>;
-def Hexagon_M2_mpyd_ll_s0:
+def HEXAGON_M2_mpyd_ll_s0:
di_MInst_sisi_ll <"mpy", int_hexagon_M2_mpyd_ll_s0>;
-def Hexagon_M2_mpyd_ll_s1:
+def HEXAGON_M2_mpyd_ll_s1:
di_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpyd_ll_s1>;
-def Hexagon_M2_mpyd_rnd_ll_s1:
+def HEXAGON_M2_mpyd_rnd_ll_s1:
di_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpyd_rnd_ll_s1>;
-def Hexagon_M2_mpyd_rnd_ll_s0:
+def HEXAGON_M2_mpyd_rnd_ll_s0:
di_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpyd_rnd_ll_s0>;
//Rx+=mpy(Rs.[H|L],Rt.[H|L])[:<<0|:<<1][:sat]
-def Hexagon_M2_mpy_acc_hh_s0:
+def HEXAGON_M2_mpy_acc_hh_s0:
si_MInst_sisisi_acc_hh <"mpy", int_hexagon_M2_mpy_acc_hh_s0>;
-def Hexagon_M2_mpy_acc_hh_s1:
+def HEXAGON_M2_mpy_acc_hh_s1:
si_MInst_sisisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_hh_s1>;
-def Hexagon_M2_mpy_acc_sat_hh_s1:
+def HEXAGON_M2_mpy_acc_sat_hh_s1:
si_MInst_sisisi_acc_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s1>;
-def Hexagon_M2_mpy_acc_sat_hh_s0:
+def HEXAGON_M2_mpy_acc_sat_hh_s0:
si_MInst_sisisi_acc_sat_hh <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s0>;
-def Hexagon_M2_mpy_acc_hl_s0:
+def HEXAGON_M2_mpy_acc_hl_s0:
si_MInst_sisisi_acc_hl <"mpy", int_hexagon_M2_mpy_acc_hl_s0>;
-def Hexagon_M2_mpy_acc_hl_s1:
+def HEXAGON_M2_mpy_acc_hl_s1:
si_MInst_sisisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_hl_s1>;
-def Hexagon_M2_mpy_acc_sat_hl_s1:
+def HEXAGON_M2_mpy_acc_sat_hl_s1:
si_MInst_sisisi_acc_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hl_s1>;
-def Hexagon_M2_mpy_acc_sat_hl_s0:
+def HEXAGON_M2_mpy_acc_sat_hl_s0:
si_MInst_sisisi_acc_sat_hl <"mpy", int_hexagon_M2_mpy_acc_sat_hl_s0>;
-def Hexagon_M2_mpy_acc_lh_s0:
+def HEXAGON_M2_mpy_acc_lh_s0:
si_MInst_sisisi_acc_lh <"mpy", int_hexagon_M2_mpy_acc_lh_s0>;
-def Hexagon_M2_mpy_acc_lh_s1:
+def HEXAGON_M2_mpy_acc_lh_s1:
si_MInst_sisisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_lh_s1>;
-def Hexagon_M2_mpy_acc_sat_lh_s1:
+def HEXAGON_M2_mpy_acc_sat_lh_s1:
si_MInst_sisisi_acc_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s1>;
-def Hexagon_M2_mpy_acc_sat_lh_s0:
+def HEXAGON_M2_mpy_acc_sat_lh_s0:
si_MInst_sisisi_acc_sat_lh <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s0>;
-def Hexagon_M2_mpy_acc_ll_s0:
+def HEXAGON_M2_mpy_acc_ll_s0:
si_MInst_sisisi_acc_ll <"mpy", int_hexagon_M2_mpy_acc_ll_s0>;
-def Hexagon_M2_mpy_acc_ll_s1:
+def HEXAGON_M2_mpy_acc_ll_s1:
si_MInst_sisisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_ll_s1>;
-def Hexagon_M2_mpy_acc_sat_ll_s1:
+def HEXAGON_M2_mpy_acc_sat_ll_s1:
si_MInst_sisisi_acc_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s1>;
-def Hexagon_M2_mpy_acc_sat_ll_s0:
+def HEXAGON_M2_mpy_acc_sat_ll_s0:
si_MInst_sisisi_acc_sat_ll <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s0>;
//Rx-=mpy(Rs.[H|L],Rt.[H|L])[:<<0|:<<1][:sat]
-def Hexagon_M2_mpy_nac_hh_s0:
+def HEXAGON_M2_mpy_nac_hh_s0:
si_MInst_sisisi_nac_hh <"mpy", int_hexagon_M2_mpy_nac_hh_s0>;
-def Hexagon_M2_mpy_nac_hh_s1:
+def HEXAGON_M2_mpy_nac_hh_s1:
si_MInst_sisisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_hh_s1>;
-def Hexagon_M2_mpy_nac_sat_hh_s1:
+def HEXAGON_M2_mpy_nac_sat_hh_s1:
si_MInst_sisisi_nac_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s1>;
-def Hexagon_M2_mpy_nac_sat_hh_s0:
+def HEXAGON_M2_mpy_nac_sat_hh_s0:
si_MInst_sisisi_nac_sat_hh <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s0>;
-def Hexagon_M2_mpy_nac_hl_s0:
+def HEXAGON_M2_mpy_nac_hl_s0:
si_MInst_sisisi_nac_hl <"mpy", int_hexagon_M2_mpy_nac_hl_s0>;
-def Hexagon_M2_mpy_nac_hl_s1:
+def HEXAGON_M2_mpy_nac_hl_s1:
si_MInst_sisisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpy_nac_hl_s1>;
-def Hexagon_M2_mpy_nac_sat_hl_s1:
+def HEXAGON_M2_mpy_nac_sat_hl_s1:
si_MInst_sisisi_nac_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_hl_s1>;
-def Hexagon_M2_mpy_nac_sat_hl_s0:
+def HEXAGON_M2_mpy_nac_sat_hl_s0:
si_MInst_sisisi_nac_sat_hl <"mpy", int_hexagon_M2_mpy_nac_sat_hl_s0>;
-def Hexagon_M2_mpy_nac_lh_s0:
+def HEXAGON_M2_mpy_nac_lh_s0:
si_MInst_sisisi_nac_lh <"mpy", int_hexagon_M2_mpy_nac_lh_s0>;
-def Hexagon_M2_mpy_nac_lh_s1:
+def HEXAGON_M2_mpy_nac_lh_s1:
si_MInst_sisisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_lh_s1>;
-def Hexagon_M2_mpy_nac_sat_lh_s1:
+def HEXAGON_M2_mpy_nac_sat_lh_s1:
si_MInst_sisisi_nac_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s1>;
-def Hexagon_M2_mpy_nac_sat_lh_s0:
+def HEXAGON_M2_mpy_nac_sat_lh_s0:
si_MInst_sisisi_nac_sat_lh <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s0>;
-def Hexagon_M2_mpy_nac_ll_s0:
+def HEXAGON_M2_mpy_nac_ll_s0:
si_MInst_sisisi_nac_ll <"mpy", int_hexagon_M2_mpy_nac_ll_s0>;
-def Hexagon_M2_mpy_nac_ll_s1:
+def HEXAGON_M2_mpy_nac_ll_s1:
si_MInst_sisisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_ll_s1>;
-def Hexagon_M2_mpy_nac_sat_ll_s1:
+def HEXAGON_M2_mpy_nac_sat_ll_s1:
si_MInst_sisisi_nac_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s1>;
-def Hexagon_M2_mpy_nac_sat_ll_s0:
+def HEXAGON_M2_mpy_nac_sat_ll_s0:
si_MInst_sisisi_nac_sat_ll <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s0>;
//Rxx+=mpy(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyd_acc_hh_s0:
+def HEXAGON_M2_mpyd_acc_hh_s0:
di_MInst_disisi_acc_hh <"mpy", int_hexagon_M2_mpyd_acc_hh_s0>;
-def Hexagon_M2_mpyd_acc_hh_s1:
+def HEXAGON_M2_mpyd_acc_hh_s1:
di_MInst_disisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpyd_acc_hh_s1>;
-def Hexagon_M2_mpyd_acc_hl_s0:
+def HEXAGON_M2_mpyd_acc_hl_s0:
di_MInst_disisi_acc_hl <"mpy", int_hexagon_M2_mpyd_acc_hl_s0>;
-def Hexagon_M2_mpyd_acc_hl_s1:
+def HEXAGON_M2_mpyd_acc_hl_s1:
di_MInst_disisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpyd_acc_hl_s1>;
-def Hexagon_M2_mpyd_acc_lh_s0:
+def HEXAGON_M2_mpyd_acc_lh_s0:
di_MInst_disisi_acc_lh <"mpy", int_hexagon_M2_mpyd_acc_lh_s0>;
-def Hexagon_M2_mpyd_acc_lh_s1:
+def HEXAGON_M2_mpyd_acc_lh_s1:
di_MInst_disisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpyd_acc_lh_s1>;
-def Hexagon_M2_mpyd_acc_ll_s0:
+def HEXAGON_M2_mpyd_acc_ll_s0:
di_MInst_disisi_acc_ll <"mpy", int_hexagon_M2_mpyd_acc_ll_s0>;
-def Hexagon_M2_mpyd_acc_ll_s1:
+def HEXAGON_M2_mpyd_acc_ll_s1:
di_MInst_disisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpyd_acc_ll_s1>;
//Rxx-=mpy(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyd_nac_hh_s0:
+def HEXAGON_M2_mpyd_nac_hh_s0:
di_MInst_disisi_nac_hh <"mpy", int_hexagon_M2_mpyd_nac_hh_s0>;
-def Hexagon_M2_mpyd_nac_hh_s1:
+def HEXAGON_M2_mpyd_nac_hh_s1:
di_MInst_disisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpyd_nac_hh_s1>;
-def Hexagon_M2_mpyd_nac_hl_s0:
+def HEXAGON_M2_mpyd_nac_hl_s0:
di_MInst_disisi_nac_hl <"mpy", int_hexagon_M2_mpyd_nac_hl_s0>;
-def Hexagon_M2_mpyd_nac_hl_s1:
+def HEXAGON_M2_mpyd_nac_hl_s1:
di_MInst_disisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpyd_nac_hl_s1>;
-def Hexagon_M2_mpyd_nac_lh_s0:
+def HEXAGON_M2_mpyd_nac_lh_s0:
di_MInst_disisi_nac_lh <"mpy", int_hexagon_M2_mpyd_nac_lh_s0>;
-def Hexagon_M2_mpyd_nac_lh_s1:
+def HEXAGON_M2_mpyd_nac_lh_s1:
di_MInst_disisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpyd_nac_lh_s1>;
-def Hexagon_M2_mpyd_nac_ll_s0:
+def HEXAGON_M2_mpyd_nac_ll_s0:
di_MInst_disisi_nac_ll <"mpy", int_hexagon_M2_mpyd_nac_ll_s0>;
-def Hexagon_M2_mpyd_nac_ll_s1:
+def HEXAGON_M2_mpyd_nac_ll_s1:
di_MInst_disisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpyd_nac_ll_s1>;
// MTYPE / MPYS / Scalar 16x16 multiply unsigned.
//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyu_hh_s0:
+def HEXAGON_M2_mpyu_hh_s0:
si_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyu_hh_s0>;
-def Hexagon_M2_mpyu_hh_s1:
+def HEXAGON_M2_mpyu_hh_s1:
si_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyu_hh_s1>;
-def Hexagon_M2_mpyu_hl_s0:
+def HEXAGON_M2_mpyu_hl_s0:
si_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyu_hl_s0>;
-def Hexagon_M2_mpyu_hl_s1:
+def HEXAGON_M2_mpyu_hl_s1:
si_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyu_hl_s1>;
-def Hexagon_M2_mpyu_lh_s0:
+def HEXAGON_M2_mpyu_lh_s0:
si_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyu_lh_s0>;
-def Hexagon_M2_mpyu_lh_s1:
+def HEXAGON_M2_mpyu_lh_s1:
si_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyu_lh_s1>;
-def Hexagon_M2_mpyu_ll_s0:
+def HEXAGON_M2_mpyu_ll_s0:
si_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyu_ll_s0>;
-def Hexagon_M2_mpyu_ll_s1:
+def HEXAGON_M2_mpyu_ll_s1:
si_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyu_ll_s1>;
//Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyud_hh_s0:
+def HEXAGON_M2_mpyud_hh_s0:
di_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyud_hh_s0>;
-def Hexagon_M2_mpyud_hh_s1:
+def HEXAGON_M2_mpyud_hh_s1:
di_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyud_hh_s1>;
-def Hexagon_M2_mpyud_hl_s0:
+def HEXAGON_M2_mpyud_hl_s0:
di_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyud_hl_s0>;
-def Hexagon_M2_mpyud_hl_s1:
+def HEXAGON_M2_mpyud_hl_s1:
di_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyud_hl_s1>;
-def Hexagon_M2_mpyud_lh_s0:
+def HEXAGON_M2_mpyud_lh_s0:
di_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyud_lh_s0>;
-def Hexagon_M2_mpyud_lh_s1:
+def HEXAGON_M2_mpyud_lh_s1:
di_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyud_lh_s1>;
-def Hexagon_M2_mpyud_ll_s0:
+def HEXAGON_M2_mpyud_ll_s0:
di_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyud_ll_s0>;
-def Hexagon_M2_mpyud_ll_s1:
+def HEXAGON_M2_mpyud_ll_s1:
di_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyud_ll_s1>;
//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyu_acc_hh_s0:
+def HEXAGON_M2_mpyu_acc_hh_s0:
si_MInst_sisisi_acc_hh <"mpyu", int_hexagon_M2_mpyu_acc_hh_s0>;
-def Hexagon_M2_mpyu_acc_hh_s1:
+def HEXAGON_M2_mpyu_acc_hh_s1:
si_MInst_sisisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hh_s1>;
-def Hexagon_M2_mpyu_acc_hl_s0:
+def HEXAGON_M2_mpyu_acc_hl_s0:
si_MInst_sisisi_acc_hl <"mpyu", int_hexagon_M2_mpyu_acc_hl_s0>;
-def Hexagon_M2_mpyu_acc_hl_s1:
+def HEXAGON_M2_mpyu_acc_hl_s1:
si_MInst_sisisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hl_s1>;
-def Hexagon_M2_mpyu_acc_lh_s0:
+def HEXAGON_M2_mpyu_acc_lh_s0:
si_MInst_sisisi_acc_lh <"mpyu", int_hexagon_M2_mpyu_acc_lh_s0>;
-def Hexagon_M2_mpyu_acc_lh_s1:
+def HEXAGON_M2_mpyu_acc_lh_s1:
si_MInst_sisisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_lh_s1>;
-def Hexagon_M2_mpyu_acc_ll_s0:
+def HEXAGON_M2_mpyu_acc_ll_s0:
si_MInst_sisisi_acc_ll <"mpyu", int_hexagon_M2_mpyu_acc_ll_s0>;
-def Hexagon_M2_mpyu_acc_ll_s1:
+def HEXAGON_M2_mpyu_acc_ll_s1:
si_MInst_sisisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyu_acc_ll_s1>;
//Rd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyu_nac_hh_s0:
+def HEXAGON_M2_mpyu_nac_hh_s0:
si_MInst_sisisi_nac_hh <"mpyu", int_hexagon_M2_mpyu_nac_hh_s0>;
-def Hexagon_M2_mpyu_nac_hh_s1:
+def HEXAGON_M2_mpyu_nac_hh_s1:
si_MInst_sisisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hh_s1>;
-def Hexagon_M2_mpyu_nac_hl_s0:
+def HEXAGON_M2_mpyu_nac_hl_s0:
si_MInst_sisisi_nac_hl <"mpyu", int_hexagon_M2_mpyu_nac_hl_s0>;
-def Hexagon_M2_mpyu_nac_hl_s1:
+def HEXAGON_M2_mpyu_nac_hl_s1:
si_MInst_sisisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hl_s1>;
-def Hexagon_M2_mpyu_nac_lh_s0:
+def HEXAGON_M2_mpyu_nac_lh_s0:
si_MInst_sisisi_nac_lh <"mpyu", int_hexagon_M2_mpyu_nac_lh_s0>;
-def Hexagon_M2_mpyu_nac_lh_s1:
+def HEXAGON_M2_mpyu_nac_lh_s1:
si_MInst_sisisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_lh_s1>;
-def Hexagon_M2_mpyu_nac_ll_s0:
+def HEXAGON_M2_mpyu_nac_ll_s0:
si_MInst_sisisi_nac_ll <"mpyu", int_hexagon_M2_mpyu_nac_ll_s0>;
-def Hexagon_M2_mpyu_nac_ll_s1:
+def HEXAGON_M2_mpyu_nac_ll_s1:
si_MInst_sisisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyu_nac_ll_s1>;
//Rdd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyud_acc_hh_s0:
+def HEXAGON_M2_mpyud_acc_hh_s0:
di_MInst_disisi_acc_hh <"mpyu", int_hexagon_M2_mpyud_acc_hh_s0>;
-def Hexagon_M2_mpyud_acc_hh_s1:
+def HEXAGON_M2_mpyud_acc_hh_s1:
di_MInst_disisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hh_s1>;
-def Hexagon_M2_mpyud_acc_hl_s0:
+def HEXAGON_M2_mpyud_acc_hl_s0:
di_MInst_disisi_acc_hl <"mpyu", int_hexagon_M2_mpyud_acc_hl_s0>;
-def Hexagon_M2_mpyud_acc_hl_s1:
+def HEXAGON_M2_mpyud_acc_hl_s1:
di_MInst_disisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hl_s1>;
-def Hexagon_M2_mpyud_acc_lh_s0:
+def HEXAGON_M2_mpyud_acc_lh_s0:
di_MInst_disisi_acc_lh <"mpyu", int_hexagon_M2_mpyud_acc_lh_s0>;
-def Hexagon_M2_mpyud_acc_lh_s1:
+def HEXAGON_M2_mpyud_acc_lh_s1:
di_MInst_disisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_lh_s1>;
-def Hexagon_M2_mpyud_acc_ll_s0:
+def HEXAGON_M2_mpyud_acc_ll_s0:
di_MInst_disisi_acc_ll <"mpyu", int_hexagon_M2_mpyud_acc_ll_s0>;
-def Hexagon_M2_mpyud_acc_ll_s1:
+def HEXAGON_M2_mpyud_acc_ll_s1:
di_MInst_disisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyud_acc_ll_s1>;
//Rdd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def Hexagon_M2_mpyud_nac_hh_s0:
+def HEXAGON_M2_mpyud_nac_hh_s0:
di_MInst_disisi_nac_hh <"mpyu", int_hexagon_M2_mpyud_nac_hh_s0>;
-def Hexagon_M2_mpyud_nac_hh_s1:
+def HEXAGON_M2_mpyud_nac_hh_s1:
di_MInst_disisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hh_s1>;
-def Hexagon_M2_mpyud_nac_hl_s0:
+def HEXAGON_M2_mpyud_nac_hl_s0:
di_MInst_disisi_nac_hl <"mpyu", int_hexagon_M2_mpyud_nac_hl_s0>;
-def Hexagon_M2_mpyud_nac_hl_s1:
+def HEXAGON_M2_mpyud_nac_hl_s1:
di_MInst_disisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hl_s1>;
-def Hexagon_M2_mpyud_nac_lh_s0:
+def HEXAGON_M2_mpyud_nac_lh_s0:
di_MInst_disisi_nac_lh <"mpyu", int_hexagon_M2_mpyud_nac_lh_s0>;
-def Hexagon_M2_mpyud_nac_lh_s1:
+def HEXAGON_M2_mpyud_nac_lh_s1:
di_MInst_disisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_lh_s1>;
-def Hexagon_M2_mpyud_nac_ll_s0:
+def HEXAGON_M2_mpyud_nac_ll_s0:
di_MInst_disisi_nac_ll <"mpyu", int_hexagon_M2_mpyud_nac_ll_s0>;
-def Hexagon_M2_mpyud_nac_ll_s1:
+def HEXAGON_M2_mpyud_nac_ll_s1:
di_MInst_disisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyud_nac_ll_s1>;
@@ -2864,15 +2900,15 @@ def Hexagon_M2_mpyud_nac_ll_s1:
*********************************************************************/
// MTYPE / VB / Vector reduce add unsigned bytes.
-def Hexagon_A2_vraddub:
+def HEXAGON_A2_vraddub:
di_MInst_didi <"vraddub", int_hexagon_A2_vraddub>;
-def Hexagon_A2_vraddub_acc:
+def HEXAGON_A2_vraddub_acc:
di_MInst_dididi_acc <"vraddub", int_hexagon_A2_vraddub_acc>;
// MTYPE / VB / Vector sum of absolute differences unsigned bytes.
-def Hexagon_A2_vrsadub:
+def HEXAGON_A2_vrsadub:
di_MInst_didi <"vrsadub", int_hexagon_A2_vrsadub>;
-def Hexagon_A2_vrsadub_acc:
+def HEXAGON_A2_vrsadub_acc:
di_MInst_dididi_acc <"vrsadub", int_hexagon_A2_vrsadub_acc>;
/********************************************************************
@@ -2880,56 +2916,56 @@ def Hexagon_A2_vrsadub_acc:
*********************************************************************/
// MTYPE / VH / Vector dual multiply.
-def Hexagon_M2_vdmpys_s1:
+def HEXAGON_M2_vdmpys_s1:
di_MInst_didi_s1_sat <"vdmpy", int_hexagon_M2_vdmpys_s1>;
-def Hexagon_M2_vdmpys_s0:
+def HEXAGON_M2_vdmpys_s0:
di_MInst_didi_sat <"vdmpy", int_hexagon_M2_vdmpys_s0>;
-def Hexagon_M2_vdmacs_s1:
+def HEXAGON_M2_vdmacs_s1:
di_MInst_dididi_acc_s1_sat <"vdmpy", int_hexagon_M2_vdmacs_s1>;
-def Hexagon_M2_vdmacs_s0:
+def HEXAGON_M2_vdmacs_s0:
di_MInst_dididi_acc_sat <"vdmpy", int_hexagon_M2_vdmacs_s0>;
// MTYPE / VH / Vector dual multiply with round and pack.
-def Hexagon_M2_vdmpyrs_s0:
+def HEXAGON_M2_vdmpyrs_s0:
si_MInst_didi_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s0>;
-def Hexagon_M2_vdmpyrs_s1:
+def HEXAGON_M2_vdmpyrs_s1:
si_MInst_didi_s1_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s1>;
// MTYPE / VH / Vector multiply even halfwords.
-def Hexagon_M2_vmpy2es_s1:
+def HEXAGON_M2_vmpy2es_s1:
di_MInst_didi_s1_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s1>;
-def Hexagon_M2_vmpy2es_s0:
+def HEXAGON_M2_vmpy2es_s0:
di_MInst_didi_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s0>;
-def Hexagon_M2_vmac2es:
+def HEXAGON_M2_vmac2es:
di_MInst_dididi_acc <"vmpyeh", int_hexagon_M2_vmac2es>;
-def Hexagon_M2_vmac2es_s1:
+def HEXAGON_M2_vmac2es_s1:
di_MInst_dididi_acc_s1_sat <"vmpyeh", int_hexagon_M2_vmac2es_s1>;
-def Hexagon_M2_vmac2es_s0:
+def HEXAGON_M2_vmac2es_s0:
di_MInst_dididi_acc_sat <"vmpyeh", int_hexagon_M2_vmac2es_s0>;
// MTYPE / VH / Vector multiply halfwords.
-def Hexagon_M2_vmpy2s_s0:
+def HEXAGON_M2_vmpy2s_s0:
di_MInst_sisi_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0>;
-def Hexagon_M2_vmpy2s_s1:
+def HEXAGON_M2_vmpy2s_s1:
di_MInst_sisi_s1_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1>;
-def Hexagon_M2_vmac2:
+def HEXAGON_M2_vmac2:
di_MInst_disisi_acc <"vmpyh", int_hexagon_M2_vmac2>;
-def Hexagon_M2_vmac2s_s0:
+def HEXAGON_M2_vmac2s_s0:
di_MInst_disisi_acc_sat <"vmpyh", int_hexagon_M2_vmac2s_s0>;
-def Hexagon_M2_vmac2s_s1:
+def HEXAGON_M2_vmac2s_s1:
di_MInst_disisi_acc_s1_sat <"vmpyh", int_hexagon_M2_vmac2s_s1>;
// MTYPE / VH / Vector multiply halfwords with round and pack.
-def Hexagon_M2_vmpy2s_s0pack:
+def HEXAGON_M2_vmpy2s_s0pack:
si_MInst_sisi_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0pack>;
-def Hexagon_M2_vmpy2s_s1pack:
+def HEXAGON_M2_vmpy2s_s1pack:
si_MInst_sisi_s1_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1pack>;
// MTYPE / VH / Vector reduce multiply halfwords.
// Rxx32+=vrmpyh(Rss32,Rtt32)
-def Hexagon_M2_vrmpy_s0:
+def HEXAGON_M2_vrmpy_s0:
di_MInst_didi <"vrmpyh", int_hexagon_M2_vrmpy_s0>;
-def Hexagon_M2_vrmac_s0:
+def HEXAGON_M2_vrmac_s0:
di_MInst_dididi_acc <"vrmpyh", int_hexagon_M2_vrmac_s0>;
@@ -2938,25 +2974,25 @@ def Hexagon_M2_vrmac_s0:
*********************************************************************/
// STYPE / ALU / Absolute value.
-def Hexagon_A2_abs:
+def HEXAGON_A2_abs:
si_SInst_si <"abs", int_hexagon_A2_abs>;
-def Hexagon_A2_absp:
+def HEXAGON_A2_absp:
di_SInst_di <"abs", int_hexagon_A2_absp>;
-def Hexagon_A2_abssat:
+def HEXAGON_A2_abssat:
si_SInst_si_sat <"abs", int_hexagon_A2_abssat>;
// STYPE / ALU / Negate.
-def Hexagon_A2_negp:
+def HEXAGON_A2_negp:
di_SInst_di <"neg", int_hexagon_A2_negp>;
-def Hexagon_A2_negsat:
+def HEXAGON_A2_negsat:
si_SInst_si_sat <"neg", int_hexagon_A2_negsat>;
// STYPE / ALU / Logical Not.
-def Hexagon_A2_notp:
+def HEXAGON_A2_notp:
di_SInst_di <"not", int_hexagon_A2_notp>;
// STYPE / ALU / Sign extend word to doubleword.
-def Hexagon_A2_sxtw:
+def HEXAGON_A2_sxtw:
di_SInst_si <"sxtw", int_hexagon_A2_sxtw>;
@@ -2965,88 +3001,88 @@ def Hexagon_A2_sxtw:
*********************************************************************/
// STYPE / BIT / Count leading.
-def Hexagon_S2_cl0:
+def HEXAGON_S2_cl0:
si_SInst_si <"cl0", int_hexagon_S2_cl0>;
-def Hexagon_S2_cl0p:
+def HEXAGON_S2_cl0p:
si_SInst_di <"cl0", int_hexagon_S2_cl0p>;
-def Hexagon_S2_cl1:
+def HEXAGON_S2_cl1:
si_SInst_si <"cl1", int_hexagon_S2_cl1>;
-def Hexagon_S2_cl1p:
+def HEXAGON_S2_cl1p:
si_SInst_di <"cl1", int_hexagon_S2_cl1p>;
-def Hexagon_S2_clb:
+def HEXAGON_S2_clb:
si_SInst_si <"clb", int_hexagon_S2_clb>;
-def Hexagon_S2_clbp:
+def HEXAGON_S2_clbp:
si_SInst_di <"clb", int_hexagon_S2_clbp>;
-def Hexagon_S2_clbnorm:
+def HEXAGON_S2_clbnorm:
si_SInst_si <"normamt", int_hexagon_S2_clbnorm>;
// STYPE / BIT / Count trailing.
-def Hexagon_S2_ct0:
+def HEXAGON_S2_ct0:
si_SInst_si <"ct0", int_hexagon_S2_ct0>;
-def Hexagon_S2_ct1:
+def HEXAGON_S2_ct1:
si_SInst_si <"ct1", int_hexagon_S2_ct1>;
// STYPE / BIT / Compare bit mask.
-def HEXAGON_C2_bitsclr:
+def Hexagon_C2_bitsclr:
qi_SInst_sisi <"bitsclr", int_hexagon_C2_bitsclr>;
-def HEXAGON_C2_bitsclri:
+def Hexagon_C2_bitsclri:
qi_SInst_siu6 <"bitsclr", int_hexagon_C2_bitsclri>;
-def HEXAGON_C2_bitsset:
+def Hexagon_C2_bitsset:
qi_SInst_sisi <"bitsset", int_hexagon_C2_bitsset>;
// STYPE / BIT / Extract unsigned.
// Rd[d][32/64]=extractu(Rs[s],Rt[t],[imm])
-def Hexagon_S2_extractu:
+def HEXAGON_S2_extractu:
si_SInst_siu5u5 <"extractu",int_hexagon_S2_extractu>;
-def Hexagon_S2_extractu_rp:
+def HEXAGON_S2_extractu_rp:
si_SInst_sidi <"extractu",int_hexagon_S2_extractu_rp>;
-def Hexagon_S2_extractup:
+def HEXAGON_S2_extractup:
di_SInst_diu6u6 <"extractu",int_hexagon_S2_extractup>;
-def Hexagon_S2_extractup_rp:
+def HEXAGON_S2_extractup_rp:
di_SInst_didi <"extractu",int_hexagon_S2_extractup_rp>;
// STYPE / BIT / Insert bitfield.
-def HEXAGON_S2_insert:
+def Hexagon_S2_insert:
si_SInst_sisiu5u5 <"insert", int_hexagon_S2_insert>;
-def HEXAGON_S2_insert_rp:
+def Hexagon_S2_insert_rp:
si_SInst_sisidi <"insert", int_hexagon_S2_insert_rp>;
-def HEXAGON_S2_insertp:
+def Hexagon_S2_insertp:
di_SInst_didiu6u6 <"insert", int_hexagon_S2_insertp>;
-def HEXAGON_S2_insertp_rp:
+def Hexagon_S2_insertp_rp:
di_SInst_dididi <"insert", int_hexagon_S2_insertp_rp>;
// STYPE / BIT / Interleave/deinterleave.
-def HEXAGON_S2_interleave:
+def Hexagon_S2_interleave:
di_SInst_di <"interleave", int_hexagon_S2_interleave>;
-def HEXAGON_S2_deinterleave:
+def Hexagon_S2_deinterleave:
di_SInst_di <"deinterleave", int_hexagon_S2_deinterleave>;
// STYPE / BIT / Linear feedback-shift iteration.
-def HEXAGON_S2_lfsp:
+def Hexagon_S2_lfsp:
di_SInst_didi <"lfs", int_hexagon_S2_lfsp>;
// STYPE / BIT / Bit reverse.
-def HEXAGON_S2_brev:
+def Hexagon_S2_brev:
si_SInst_si <"brev", int_hexagon_S2_brev>;
// STYPE / BIT / Set/Clear/Toggle Bit.
-def Hexagon_S2_setbit_i:
+def HEXAGON_S2_setbit_i:
si_SInst_siu5 <"setbit", int_hexagon_S2_setbit_i>;
-def Hexagon_S2_togglebit_i:
+def HEXAGON_S2_togglebit_i:
si_SInst_siu5 <"togglebit", int_hexagon_S2_togglebit_i>;
-def Hexagon_S2_clrbit_i:
+def HEXAGON_S2_clrbit_i:
si_SInst_siu5 <"clrbit", int_hexagon_S2_clrbit_i>;
-def Hexagon_S2_setbit_r:
+def HEXAGON_S2_setbit_r:
si_SInst_sisi <"setbit", int_hexagon_S2_setbit_r>;
-def Hexagon_S2_togglebit_r:
+def HEXAGON_S2_togglebit_r:
si_SInst_sisi <"togglebit", int_hexagon_S2_togglebit_r>;
-def Hexagon_S2_clrbit_r:
+def HEXAGON_S2_clrbit_r:
si_SInst_sisi <"clrbit", int_hexagon_S2_clrbit_r>;
// STYPE / BIT / Test Bit.
-def Hexagon_S2_tstbit_i:
+def HEXAGON_S2_tstbit_i:
qi_SInst_siu5 <"tstbit", int_hexagon_S2_tstbit_i>;
-def Hexagon_S2_tstbit_r:
+def HEXAGON_S2_tstbit_r:
qi_SInst_sisi <"tstbit", int_hexagon_S2_tstbit_r>;
@@ -3055,11 +3091,11 @@ def Hexagon_S2_tstbit_r:
*********************************************************************/
// STYPE / COMPLEX / Vector Complex conjugate.
-def Hexagon_A2_vconj:
+def HEXAGON_A2_vconj:
di_SInst_di_sat <"vconj", int_hexagon_A2_vconj>;
// STYPE / COMPLEX / Vector Complex rotate.
-def Hexagon_S2_vcrotate:
+def HEXAGON_S2_vcrotate:
di_SInst_disi <"vcrotate",int_hexagon_S2_vcrotate>;
@@ -3068,102 +3104,102 @@ def Hexagon_S2_vcrotate:
*********************************************************************/
// STYPE / PERM / Saturate.
-def Hexagon_A2_sat:
+def HEXAGON_A2_sat:
si_SInst_di <"sat", int_hexagon_A2_sat>;
-def Hexagon_A2_satb:
+def HEXAGON_A2_satb:
si_SInst_si <"satb", int_hexagon_A2_satb>;
-def Hexagon_A2_sath:
+def HEXAGON_A2_sath:
si_SInst_si <"sath", int_hexagon_A2_sath>;
-def Hexagon_A2_satub:
+def HEXAGON_A2_satub:
si_SInst_si <"satub", int_hexagon_A2_satub>;
-def Hexagon_A2_satuh:
+def HEXAGON_A2_satuh:
si_SInst_si <"satuh", int_hexagon_A2_satuh>;
// STYPE / PERM / Swizzle bytes.
-def Hexagon_A2_swiz:
+def HEXAGON_A2_swiz:
si_SInst_si <"swiz", int_hexagon_A2_swiz>;
// STYPE / PERM / Vector align.
// Need custom lowering
-def Hexagon_S2_valignib:
+def HEXAGON_S2_valignib:
di_SInst_didiu3 <"valignb", int_hexagon_S2_valignib>;
-def Hexagon_S2_valignrb:
+def HEXAGON_S2_valignrb:
di_SInst_didiqi <"valignb", int_hexagon_S2_valignrb>;
// STYPE / PERM / Vector round and pack.
-def Hexagon_S2_vrndpackwh:
+def HEXAGON_S2_vrndpackwh:
si_SInst_di <"vrndwh", int_hexagon_S2_vrndpackwh>;
-def Hexagon_S2_vrndpackwhs:
+def HEXAGON_S2_vrndpackwhs:
si_SInst_di_sat <"vrndwh", int_hexagon_S2_vrndpackwhs>;
// STYPE / PERM / Vector saturate and pack.
-def Hexagon_S2_svsathb:
+def HEXAGON_S2_svsathb:
si_SInst_si <"vsathb", int_hexagon_S2_svsathb>;
-def Hexagon_S2_vsathb:
+def HEXAGON_S2_vsathb:
si_SInst_di <"vsathb", int_hexagon_S2_vsathb>;
-def Hexagon_S2_svsathub:
+def HEXAGON_S2_svsathub:
si_SInst_si <"vsathub", int_hexagon_S2_svsathub>;
-def Hexagon_S2_vsathub:
+def HEXAGON_S2_vsathub:
si_SInst_di <"vsathub", int_hexagon_S2_vsathub>;
-def Hexagon_S2_vsatwh:
+def HEXAGON_S2_vsatwh:
si_SInst_di <"vsatwh", int_hexagon_S2_vsatwh>;
-def Hexagon_S2_vsatwuh:
+def HEXAGON_S2_vsatwuh:
si_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh>;
// STYPE / PERM / Vector saturate without pack.
-def Hexagon_S2_vsathb_nopack:
+def HEXAGON_S2_vsathb_nopack:
di_SInst_di <"vsathb", int_hexagon_S2_vsathb_nopack>;
-def Hexagon_S2_vsathub_nopack:
+def HEXAGON_S2_vsathub_nopack:
di_SInst_di <"vsathub", int_hexagon_S2_vsathub_nopack>;
-def Hexagon_S2_vsatwh_nopack:
+def HEXAGON_S2_vsatwh_nopack:
di_SInst_di <"vsatwh", int_hexagon_S2_vsatwh_nopack>;
-def Hexagon_S2_vsatwuh_nopack:
+def HEXAGON_S2_vsatwuh_nopack:
di_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh_nopack>;
// STYPE / PERM / Vector shuffle.
-def Hexagon_S2_shuffeb:
+def HEXAGON_S2_shuffeb:
di_SInst_didi <"shuffeb", int_hexagon_S2_shuffeb>;
-def Hexagon_S2_shuffeh:
+def HEXAGON_S2_shuffeh:
di_SInst_didi <"shuffeh", int_hexagon_S2_shuffeh>;
-def Hexagon_S2_shuffob:
+def HEXAGON_S2_shuffob:
di_SInst_didi <"shuffob", int_hexagon_S2_shuffob>;
-def Hexagon_S2_shuffoh:
+def HEXAGON_S2_shuffoh:
di_SInst_didi <"shuffoh", int_hexagon_S2_shuffoh>;
// STYPE / PERM / Vector splat bytes.
-def Hexagon_S2_vsplatrb:
+def HEXAGON_S2_vsplatrb:
si_SInst_si <"vsplatb", int_hexagon_S2_vsplatrb>;
// STYPE / PERM / Vector splat halfwords.
-def Hexagon_S2_vsplatrh:
+def HEXAGON_S2_vsplatrh:
di_SInst_si <"vsplath", int_hexagon_S2_vsplatrh>;
// STYPE / PERM / Vector splice.
-def HEXAGON_S2_vsplicerb:
+def Hexagon_S2_vsplicerb:
di_SInst_didiqi <"vspliceb",int_hexagon_S2_vsplicerb>;
-def HEXAGON_S2_vspliceib:
+def Hexagon_S2_vspliceib:
di_SInst_didiu3 <"vspliceb",int_hexagon_S2_vspliceib>;
// STYPE / PERM / Sign extend.
-def Hexagon_S2_vsxtbh:
+def HEXAGON_S2_vsxtbh:
di_SInst_si <"vsxtbh", int_hexagon_S2_vsxtbh>;
-def Hexagon_S2_vsxthw:
+def HEXAGON_S2_vsxthw:
di_SInst_si <"vsxthw", int_hexagon_S2_vsxthw>;
// STYPE / PERM / Truncate.
-def Hexagon_S2_vtrunehb:
+def HEXAGON_S2_vtrunehb:
si_SInst_di <"vtrunehb",int_hexagon_S2_vtrunehb>;
-def Hexagon_S2_vtrunohb:
+def HEXAGON_S2_vtrunohb:
si_SInst_di <"vtrunohb",int_hexagon_S2_vtrunohb>;
-def Hexagon_S2_vtrunewh:
+def HEXAGON_S2_vtrunewh:
di_SInst_didi <"vtrunewh",int_hexagon_S2_vtrunewh>;
-def Hexagon_S2_vtrunowh:
+def HEXAGON_S2_vtrunowh:
di_SInst_didi <"vtrunowh",int_hexagon_S2_vtrunowh>;
// STYPE / PERM / Zero extend.
-def Hexagon_S2_vzxtbh:
+def HEXAGON_S2_vzxtbh:
di_SInst_si <"vzxtbh", int_hexagon_S2_vzxtbh>;
-def Hexagon_S2_vzxthw:
+def HEXAGON_S2_vzxthw:
di_SInst_si <"vzxthw", int_hexagon_S2_vzxthw>;
@@ -3172,17 +3208,17 @@ def Hexagon_S2_vzxthw:
*********************************************************************/
// STYPE / PRED / Mask generate from predicate.
-def Hexagon_C2_mask:
+def HEXAGON_C2_mask:
di_SInst_qi <"mask", int_hexagon_C2_mask>;
// STYPE / PRED / Predicate transfer.
-def Hexagon_C2_tfrpr:
+def HEXAGON_C2_tfrpr:
si_SInst_qi <"", int_hexagon_C2_tfrpr>;
-def Hexagon_C2_tfrrp:
+def HEXAGON_C2_tfrrp:
qi_SInst_si <"", int_hexagon_C2_tfrrp>;
// STYPE / PRED / Viterbi pack even and odd predicate bits.
-def Hexagon_C2_vitpack:
+def HEXAGON_C2_vitpack:
si_SInst_qiqi <"vitpack",int_hexagon_C2_vitpack>;
@@ -3191,202 +3227,202 @@ def Hexagon_C2_vitpack:
*********************************************************************/
// STYPE / SHIFT / Shift by immediate.
-def Hexagon_S2_asl_i_r:
+def HEXAGON_S2_asl_i_r:
si_SInst_siu5 <"asl", int_hexagon_S2_asl_i_r>;
-def Hexagon_S2_asr_i_r:
+def HEXAGON_S2_asr_i_r:
si_SInst_siu5 <"asr", int_hexagon_S2_asr_i_r>;
-def Hexagon_S2_lsr_i_r:
+def HEXAGON_S2_lsr_i_r:
si_SInst_siu5 <"lsr", int_hexagon_S2_lsr_i_r>;
-def Hexagon_S2_asl_i_p:
+def HEXAGON_S2_asl_i_p:
di_SInst_diu6 <"asl", int_hexagon_S2_asl_i_p>;
-def Hexagon_S2_asr_i_p:
+def HEXAGON_S2_asr_i_p:
di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p>;
-def Hexagon_S2_lsr_i_p:
+def HEXAGON_S2_lsr_i_p:
di_SInst_diu6 <"lsr", int_hexagon_S2_lsr_i_p>;
// STYPE / SHIFT / Shift by immediate and accumulate.
-def Hexagon_S2_asl_i_r_acc:
+def HEXAGON_S2_asl_i_r_acc:
si_SInst_sisiu5_acc <"asl", int_hexagon_S2_asl_i_r_acc>;
-def Hexagon_S2_asr_i_r_acc:
+def HEXAGON_S2_asr_i_r_acc:
si_SInst_sisiu5_acc <"asr", int_hexagon_S2_asr_i_r_acc>;
-def Hexagon_S2_lsr_i_r_acc:
+def HEXAGON_S2_lsr_i_r_acc:
si_SInst_sisiu5_acc <"lsr", int_hexagon_S2_lsr_i_r_acc>;
-def Hexagon_S2_asl_i_r_nac:
+def HEXAGON_S2_asl_i_r_nac:
si_SInst_sisiu5_nac <"asl", int_hexagon_S2_asl_i_r_nac>;
-def Hexagon_S2_asr_i_r_nac:
+def HEXAGON_S2_asr_i_r_nac:
si_SInst_sisiu5_nac <"asr", int_hexagon_S2_asr_i_r_nac>;
-def Hexagon_S2_lsr_i_r_nac:
+def HEXAGON_S2_lsr_i_r_nac:
si_SInst_sisiu5_nac <"lsr", int_hexagon_S2_lsr_i_r_nac>;
-def Hexagon_S2_asl_i_p_acc:
+def HEXAGON_S2_asl_i_p_acc:
di_SInst_didiu6_acc <"asl", int_hexagon_S2_asl_i_p_acc>;
-def Hexagon_S2_asr_i_p_acc:
+def HEXAGON_S2_asr_i_p_acc:
di_SInst_didiu6_acc <"asr", int_hexagon_S2_asr_i_p_acc>;
-def Hexagon_S2_lsr_i_p_acc:
+def HEXAGON_S2_lsr_i_p_acc:
di_SInst_didiu6_acc <"lsr", int_hexagon_S2_lsr_i_p_acc>;
-def Hexagon_S2_asl_i_p_nac:
+def HEXAGON_S2_asl_i_p_nac:
di_SInst_didiu6_nac <"asl", int_hexagon_S2_asl_i_p_nac>;
-def Hexagon_S2_asr_i_p_nac:
+def HEXAGON_S2_asr_i_p_nac:
di_SInst_didiu6_nac <"asr", int_hexagon_S2_asr_i_p_nac>;
-def Hexagon_S2_lsr_i_p_nac:
+def HEXAGON_S2_lsr_i_p_nac:
di_SInst_didiu6_nac <"lsr", int_hexagon_S2_lsr_i_p_nac>;
// STYPE / SHIFT / Shift by immediate and add.
-def Hexagon_S2_addasl_rrri:
+def HEXAGON_S2_addasl_rrri:
si_SInst_sisiu3 <"addasl", int_hexagon_S2_addasl_rrri>;
// STYPE / SHIFT / Shift by immediate and logical.
-def Hexagon_S2_asl_i_r_and:
+def HEXAGON_S2_asl_i_r_and:
si_SInst_sisiu5_and <"asl", int_hexagon_S2_asl_i_r_and>;
-def Hexagon_S2_asr_i_r_and:
+def HEXAGON_S2_asr_i_r_and:
si_SInst_sisiu5_and <"asr", int_hexagon_S2_asr_i_r_and>;
-def Hexagon_S2_lsr_i_r_and:
+def HEXAGON_S2_lsr_i_r_and:
si_SInst_sisiu5_and <"lsr", int_hexagon_S2_lsr_i_r_and>;
-def Hexagon_S2_asl_i_r_xacc:
+def HEXAGON_S2_asl_i_r_xacc:
si_SInst_sisiu5_xor <"asl", int_hexagon_S2_asl_i_r_xacc>;
-def Hexagon_S2_lsr_i_r_xacc:
+def HEXAGON_S2_lsr_i_r_xacc:
si_SInst_sisiu5_xor <"lsr", int_hexagon_S2_lsr_i_r_xacc>;
-def Hexagon_S2_asl_i_r_or:
+def HEXAGON_S2_asl_i_r_or:
si_SInst_sisiu5_or <"asl", int_hexagon_S2_asl_i_r_or>;
-def Hexagon_S2_asr_i_r_or:
+def HEXAGON_S2_asr_i_r_or:
si_SInst_sisiu5_or <"asr", int_hexagon_S2_asr_i_r_or>;
-def Hexagon_S2_lsr_i_r_or:
+def HEXAGON_S2_lsr_i_r_or:
si_SInst_sisiu5_or <"lsr", int_hexagon_S2_lsr_i_r_or>;
-def Hexagon_S2_asl_i_p_and:
+def HEXAGON_S2_asl_i_p_and:
di_SInst_didiu6_and <"asl", int_hexagon_S2_asl_i_p_and>;
-def Hexagon_S2_asr_i_p_and:
+def HEXAGON_S2_asr_i_p_and:
di_SInst_didiu6_and <"asr", int_hexagon_S2_asr_i_p_and>;
-def Hexagon_S2_lsr_i_p_and:
+def HEXAGON_S2_lsr_i_p_and:
di_SInst_didiu6_and <"lsr", int_hexagon_S2_lsr_i_p_and>;
-def Hexagon_S2_asl_i_p_xacc:
+def HEXAGON_S2_asl_i_p_xacc:
di_SInst_didiu6_xor <"asl", int_hexagon_S2_asl_i_p_xacc>;
-def Hexagon_S2_lsr_i_p_xacc:
+def HEXAGON_S2_lsr_i_p_xacc:
di_SInst_didiu6_xor <"lsr", int_hexagon_S2_lsr_i_p_xacc>;
-def Hexagon_S2_asl_i_p_or:
+def HEXAGON_S2_asl_i_p_or:
di_SInst_didiu6_or <"asl", int_hexagon_S2_asl_i_p_or>;
-def Hexagon_S2_asr_i_p_or:
+def HEXAGON_S2_asr_i_p_or:
di_SInst_didiu6_or <"asr", int_hexagon_S2_asr_i_p_or>;
-def Hexagon_S2_lsr_i_p_or:
+def HEXAGON_S2_lsr_i_p_or:
di_SInst_didiu6_or <"lsr", int_hexagon_S2_lsr_i_p_or>;
// STYPE / SHIFT / Shift right by immediate with rounding.
-def Hexagon_S2_asr_i_r_rnd:
+def HEXAGON_S2_asr_i_r_rnd:
si_SInst_siu5_rnd <"asr", int_hexagon_S2_asr_i_r_rnd>;
-def Hexagon_S2_asr_i_r_rnd_goodsyntax:
+def HEXAGON_S2_asr_i_r_rnd_goodsyntax:
si_SInst_siu5 <"asrrnd", int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
// STYPE / SHIFT / Shift left by immediate with saturation.
-def Hexagon_S2_asl_i_r_sat:
+def HEXAGON_S2_asl_i_r_sat:
si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_i_r_sat>;
// STYPE / SHIFT / Shift by register.
-def Hexagon_S2_asl_r_r:
+def HEXAGON_S2_asl_r_r:
si_SInst_sisi <"asl", int_hexagon_S2_asl_r_r>;
-def Hexagon_S2_asr_r_r:
+def HEXAGON_S2_asr_r_r:
si_SInst_sisi <"asr", int_hexagon_S2_asr_r_r>;
-def Hexagon_S2_lsl_r_r:
+def HEXAGON_S2_lsl_r_r:
si_SInst_sisi <"lsl", int_hexagon_S2_lsl_r_r>;
-def Hexagon_S2_lsr_r_r:
+def HEXAGON_S2_lsr_r_r:
si_SInst_sisi <"lsr", int_hexagon_S2_lsr_r_r>;
-def Hexagon_S2_asl_r_p:
+def HEXAGON_S2_asl_r_p:
di_SInst_disi <"asl", int_hexagon_S2_asl_r_p>;
-def Hexagon_S2_asr_r_p:
+def HEXAGON_S2_asr_r_p:
di_SInst_disi <"asr", int_hexagon_S2_asr_r_p>;
-def Hexagon_S2_lsl_r_p:
+def HEXAGON_S2_lsl_r_p:
di_SInst_disi <"lsl", int_hexagon_S2_lsl_r_p>;
-def Hexagon_S2_lsr_r_p:
+def HEXAGON_S2_lsr_r_p:
di_SInst_disi <"lsr", int_hexagon_S2_lsr_r_p>;
// STYPE / SHIFT / Shift by register and accumulate.
-def Hexagon_S2_asl_r_r_acc:
+def HEXAGON_S2_asl_r_r_acc:
si_SInst_sisisi_acc <"asl", int_hexagon_S2_asl_r_r_acc>;
-def Hexagon_S2_asr_r_r_acc:
+def HEXAGON_S2_asr_r_r_acc:
si_SInst_sisisi_acc <"asr", int_hexagon_S2_asr_r_r_acc>;
-def Hexagon_S2_lsl_r_r_acc:
+def HEXAGON_S2_lsl_r_r_acc:
si_SInst_sisisi_acc <"lsl", int_hexagon_S2_lsl_r_r_acc>;
-def Hexagon_S2_lsr_r_r_acc:
+def HEXAGON_S2_lsr_r_r_acc:
si_SInst_sisisi_acc <"lsr", int_hexagon_S2_lsr_r_r_acc>;
-def Hexagon_S2_asl_r_p_acc:
+def HEXAGON_S2_asl_r_p_acc:
di_SInst_didisi_acc <"asl", int_hexagon_S2_asl_r_p_acc>;
-def Hexagon_S2_asr_r_p_acc:
+def HEXAGON_S2_asr_r_p_acc:
di_SInst_didisi_acc <"asr", int_hexagon_S2_asr_r_p_acc>;
-def Hexagon_S2_lsl_r_p_acc:
+def HEXAGON_S2_lsl_r_p_acc:
di_SInst_didisi_acc <"lsl", int_hexagon_S2_lsl_r_p_acc>;
-def Hexagon_S2_lsr_r_p_acc:
+def HEXAGON_S2_lsr_r_p_acc:
di_SInst_didisi_acc <"lsr", int_hexagon_S2_lsr_r_p_acc>;
-def Hexagon_S2_asl_r_r_nac:
+def HEXAGON_S2_asl_r_r_nac:
si_SInst_sisisi_nac <"asl", int_hexagon_S2_asl_r_r_nac>;
-def Hexagon_S2_asr_r_r_nac:
+def HEXAGON_S2_asr_r_r_nac:
si_SInst_sisisi_nac <"asr", int_hexagon_S2_asr_r_r_nac>;
-def Hexagon_S2_lsl_r_r_nac:
+def HEXAGON_S2_lsl_r_r_nac:
si_SInst_sisisi_nac <"lsl", int_hexagon_S2_lsl_r_r_nac>;
-def Hexagon_S2_lsr_r_r_nac:
+def HEXAGON_S2_lsr_r_r_nac:
si_SInst_sisisi_nac <"lsr", int_hexagon_S2_lsr_r_r_nac>;
-def Hexagon_S2_asl_r_p_nac:
+def HEXAGON_S2_asl_r_p_nac:
di_SInst_didisi_nac <"asl", int_hexagon_S2_asl_r_p_nac>;
-def Hexagon_S2_asr_r_p_nac:
+def HEXAGON_S2_asr_r_p_nac:
di_SInst_didisi_nac <"asr", int_hexagon_S2_asr_r_p_nac>;
-def Hexagon_S2_lsl_r_p_nac:
+def HEXAGON_S2_lsl_r_p_nac:
di_SInst_didisi_nac <"lsl", int_hexagon_S2_lsl_r_p_nac>;
-def Hexagon_S2_lsr_r_p_nac:
+def HEXAGON_S2_lsr_r_p_nac:
di_SInst_didisi_nac <"lsr", int_hexagon_S2_lsr_r_p_nac>;
// STYPE / SHIFT / Shift by register and logical.
-def Hexagon_S2_asl_r_r_and:
+def HEXAGON_S2_asl_r_r_and:
si_SInst_sisisi_and <"asl", int_hexagon_S2_asl_r_r_and>;
-def Hexagon_S2_asr_r_r_and:
+def HEXAGON_S2_asr_r_r_and:
si_SInst_sisisi_and <"asr", int_hexagon_S2_asr_r_r_and>;
-def Hexagon_S2_lsl_r_r_and:
+def HEXAGON_S2_lsl_r_r_and:
si_SInst_sisisi_and <"lsl", int_hexagon_S2_lsl_r_r_and>;
-def Hexagon_S2_lsr_r_r_and:
+def HEXAGON_S2_lsr_r_r_and:
si_SInst_sisisi_and <"lsr", int_hexagon_S2_lsr_r_r_and>;
-def Hexagon_S2_asl_r_r_or:
+def HEXAGON_S2_asl_r_r_or:
si_SInst_sisisi_or <"asl", int_hexagon_S2_asl_r_r_or>;
-def Hexagon_S2_asr_r_r_or:
+def HEXAGON_S2_asr_r_r_or:
si_SInst_sisisi_or <"asr", int_hexagon_S2_asr_r_r_or>;
-def Hexagon_S2_lsl_r_r_or:
+def HEXAGON_S2_lsl_r_r_or:
si_SInst_sisisi_or <"lsl", int_hexagon_S2_lsl_r_r_or>;
-def Hexagon_S2_lsr_r_r_or:
+def HEXAGON_S2_lsr_r_r_or:
si_SInst_sisisi_or <"lsr", int_hexagon_S2_lsr_r_r_or>;
-def Hexagon_S2_asl_r_p_and:
+def HEXAGON_S2_asl_r_p_and:
di_SInst_didisi_and <"asl", int_hexagon_S2_asl_r_p_and>;
-def Hexagon_S2_asr_r_p_and:
+def HEXAGON_S2_asr_r_p_and:
di_SInst_didisi_and <"asr", int_hexagon_S2_asr_r_p_and>;
-def Hexagon_S2_lsl_r_p_and:
+def HEXAGON_S2_lsl_r_p_and:
di_SInst_didisi_and <"lsl", int_hexagon_S2_lsl_r_p_and>;
-def Hexagon_S2_lsr_r_p_and:
+def HEXAGON_S2_lsr_r_p_and:
di_SInst_didisi_and <"lsr", int_hexagon_S2_lsr_r_p_and>;
-def Hexagon_S2_asl_r_p_or:
+def HEXAGON_S2_asl_r_p_or:
di_SInst_didisi_or <"asl", int_hexagon_S2_asl_r_p_or>;
-def Hexagon_S2_asr_r_p_or:
+def HEXAGON_S2_asr_r_p_or:
di_SInst_didisi_or <"asr", int_hexagon_S2_asr_r_p_or>;
-def Hexagon_S2_lsl_r_p_or:
+def HEXAGON_S2_lsl_r_p_or:
di_SInst_didisi_or <"lsl", int_hexagon_S2_lsl_r_p_or>;
-def Hexagon_S2_lsr_r_p_or:
+def HEXAGON_S2_lsr_r_p_or:
di_SInst_didisi_or <"lsr", int_hexagon_S2_lsr_r_p_or>;
// STYPE / SHIFT / Shift by register with saturation.
-def Hexagon_S2_asl_r_r_sat:
+def HEXAGON_S2_asl_r_r_sat:
si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_r_r_sat>;
-def Hexagon_S2_asr_r_r_sat:
+def HEXAGON_S2_asr_r_r_sat:
si_SInst_sisi_sat <"asr", int_hexagon_S2_asr_r_r_sat>;
// STYPE / SHIFT / Table Index.
-def HEXAGON_S2_tableidxb_goodsyntax:
+def Hexagon_S2_tableidxb_goodsyntax:
si_MInst_sisiu4u5 <"tableidxb",int_hexagon_S2_tableidxb_goodsyntax>;
-def HEXAGON_S2_tableidxd_goodsyntax:
+def Hexagon_S2_tableidxd_goodsyntax:
si_MInst_sisiu4u5 <"tableidxd",int_hexagon_S2_tableidxd_goodsyntax>;
-def HEXAGON_S2_tableidxh_goodsyntax:
+def Hexagon_S2_tableidxh_goodsyntax:
si_MInst_sisiu4u5 <"tableidxh",int_hexagon_S2_tableidxh_goodsyntax>;
-def HEXAGON_S2_tableidxw_goodsyntax:
+def Hexagon_S2_tableidxw_goodsyntax:
si_MInst_sisiu4u5 <"tableidxw",int_hexagon_S2_tableidxw_goodsyntax>;
@@ -3396,29 +3432,29 @@ def HEXAGON_S2_tableidxw_goodsyntax:
// STYPE / VH / Vector absolute value halfwords.
// Rdd64=vabsh(Rss64)
-def Hexagon_A2_vabsh:
+def HEXAGON_A2_vabsh:
di_SInst_di <"vabsh", int_hexagon_A2_vabsh>;
-def Hexagon_A2_vabshsat:
+def HEXAGON_A2_vabshsat:
di_SInst_di_sat <"vabsh", int_hexagon_A2_vabshsat>;
// STYPE / VH / Vector shift halfwords by immediate.
// Rdd64=v[asl/asr/lsr]h(Rss64,Rt32)
-def Hexagon_S2_asl_i_vh:
+def HEXAGON_S2_asl_i_vh:
di_SInst_disi <"vaslh", int_hexagon_S2_asl_i_vh>;
-def Hexagon_S2_asr_i_vh:
+def HEXAGON_S2_asr_i_vh:
di_SInst_disi <"vasrh", int_hexagon_S2_asr_i_vh>;
-def Hexagon_S2_lsr_i_vh:
+def HEXAGON_S2_lsr_i_vh:
di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_i_vh>;
// STYPE / VH / Vector shift halfwords by register.
// Rdd64=v[asl/asr/lsl/lsr]h(Rss64,Rt32)
-def Hexagon_S2_asl_r_vh:
+def HEXAGON_S2_asl_r_vh:
di_SInst_disi <"vaslh", int_hexagon_S2_asl_r_vh>;
-def Hexagon_S2_asr_r_vh:
+def HEXAGON_S2_asr_r_vh:
di_SInst_disi <"vasrh", int_hexagon_S2_asr_r_vh>;
-def Hexagon_S2_lsl_r_vh:
+def HEXAGON_S2_lsl_r_vh:
di_SInst_disi <"vlslh", int_hexagon_S2_lsl_r_vh>;
-def Hexagon_S2_lsr_r_vh:
+def HEXAGON_S2_lsr_r_vh:
di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_r_vh>;
@@ -3427,36 +3463,41 @@ def Hexagon_S2_lsr_r_vh:
*********************************************************************/
// STYPE / VW / Vector absolute value words.
-def Hexagon_A2_vabsw:
+def HEXAGON_A2_vabsw:
di_SInst_di <"vabsw", int_hexagon_A2_vabsw>;
-def Hexagon_A2_vabswsat:
+def HEXAGON_A2_vabswsat:
di_SInst_di_sat <"vabsw", int_hexagon_A2_vabswsat>;
// STYPE / VW / Vector shift words by immediate.
// Rdd64=v[asl/asr/lsr]w(Rss64,Rt32)
-def Hexagon_S2_asl_i_vw:
+def HEXAGON_S2_asl_i_vw:
di_SInst_disi <"vaslw", int_hexagon_S2_asl_i_vw>;
-def Hexagon_S2_asr_i_vw:
+def HEXAGON_S2_asr_i_vw:
di_SInst_disi <"vasrw", int_hexagon_S2_asr_i_vw>;
-def Hexagon_S2_lsr_i_vw:
+def HEXAGON_S2_lsr_i_vw:
di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_i_vw>;
// STYPE / VW / Vector shift words by register.
// Rdd64=v[asl/asr/lsl/lsr]w(Rss64,Rt32)
-def Hexagon_S2_asl_r_vw:
+def HEXAGON_S2_asl_r_vw:
di_SInst_disi <"vaslw", int_hexagon_S2_asl_r_vw>;
-def Hexagon_S2_asr_r_vw:
+def HEXAGON_S2_asr_r_vw:
di_SInst_disi <"vasrw", int_hexagon_S2_asr_r_vw>;
-def Hexagon_S2_lsl_r_vw:
+def HEXAGON_S2_lsl_r_vw:
di_SInst_disi <"vlslw", int_hexagon_S2_lsl_r_vw>;
-def Hexagon_S2_lsr_r_vw:
+def HEXAGON_S2_lsr_r_vw:
di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_r_vw>;
// STYPE / VW / Vector shift words with truncate and pack.
-def Hexagon_S2_asr_r_svw_trun:
+def HEXAGON_S2_asr_r_svw_trun:
si_SInst_disi <"vasrw", int_hexagon_S2_asr_r_svw_trun>;
-def Hexagon_S2_asr_i_svw_trun:
+def HEXAGON_S2_asr_i_svw_trun:
si_SInst_diu5 <"vasrw", int_hexagon_S2_asr_i_svw_trun>;
+// LD / Circular loads.
+def HEXAGON_circ_ldd:
+ di_LDInstPI_diu4 <"circ_ldd", int_hexagon_circ_ldd>;
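+// (circ_ldd performs a post-increment 64-bit load with circular (modulo)
+// addressing, hence the LDInstPI post-increment instruction format class.)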
+
include "HexagonIntrinsicsV3.td"
include "HexagonIntrinsicsV4.td"
+include "HexagonIntrinsicsV5.td"
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
index 68eaf68..2788101 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
@@ -12,18 +12,28 @@
// Optimized with intrinsic accumulates
//
def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2),
- (COMBINE_rr
- (Hexagon_M2_maci
- (Hexagon_M2_maci (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)),
- subreg_hireg),
- (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)),
- (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg),
- (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)),
- subreg_loreg))>;
+ (i64
+ (COMBINE_rr
+ (HEXAGON_M2_maci
+ (HEXAGON_M2_maci
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+ subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ subreg_loreg)))),
+ subreg_hireg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg))),
+ (i32
+ (EXTRACT_SUBREG
+ (i64
+ (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+ (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+ subreg_loreg)))), subreg_loreg))))>;
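The rewritten pattern above is the schoolbook 64x64-bit multiply built from
32-bit halves, now with explicit i32/i64 value types on every node: MPYU64
forms the full unsigned lo*lo product, each HEXAGON_M2_maci accumulates one
cross product into its high word, and COMBINE_rr glues the two words back
together. A self-contained C++ sketch of the arithmetic the pattern encodes
(illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    uint64_t mul64_from_halves(uint64_t a, uint64_t b) {
      uint32_t a_lo = uint32_t(a), a_hi = uint32_t(a >> 32);
      uint32_t b_lo = uint32_t(b), b_hi = uint32_t(b >> 32);
      uint64_t lolo = uint64_t(a_lo) * b_lo; // MPYU64: full lo*lo product
      uint32_t hi = uint32_t(lolo >> 32);    // EXTRACT_SUBREG ..., subreg_hireg
      hi += a_lo * b_hi;                     // first HEXAGON_M2_maci accumulate
      hi += b_lo * a_hi;                     // second HEXAGON_M2_maci accumulate
      return (uint64_t(hi) << 32) | uint32_t(lolo); // COMBINE_rr
    }

    int main() {
      uint64_t a = 0x123456789abcdef0ULL, b = 0x0fedcba987654321ULL;
      assert(mul64_from_halves(a, b) == a * b);
    }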
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
new file mode 100644
index 0000000..1d44b52
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -0,0 +1,395 @@
+class sf_SInst_sf<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class si_SInst_sf<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class sf_SInst_si<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class sf_SInst_di<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class sf_SInst_df<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class si_SInst_df<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class df_SInst_sf<string opc, Intrinsic IntID>
+ : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class di_SInst_sf<string opc, Intrinsic IntID>
+ : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class df_SInst_si<string opc, Intrinsic IntID>
+ : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class df_SInst_df<string opc, Intrinsic IntID>
+ : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class di_SInst_df<string opc, Intrinsic IntID>
+ : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+
+class df_SInst_di<string opc, Intrinsic IntID>
+ : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class sf_MInst_sfsf<string opc, Intrinsic IntID>
+ : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class df_MInst_dfdf<string opc, Intrinsic IntID>
+ : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class qi_ALU64_dfdf<string opc, Intrinsic IntID>
+ : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+ [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class qi_ALU64_dfu5<string opc, Intrinsic IntID>
+ : ALU64_ri<(outs PredRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+ [(set PredRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+
+class sf_MInst_sfsfsf_acc<string opc, Intrinsic IntID>
+ : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+ IntRegs:$dst2),
+ !strconcat("$dst += ", !strconcat(opc ,
+ "($src1, $src2)")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$dst2))],
+ "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_nac<string opc, Intrinsic IntID>
+ : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+ IntRegs:$dst2),
+ !strconcat("$dst -= ", !strconcat(opc ,
+ "($src1, $src2)")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$dst2))],
+ "$dst2 = $dst">;
+
+
+class sf_MInst_sfsfsfsi_sc<string opc, Intrinsic IntID>
+ : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$src3),
+ !strconcat("$dst += ", !strconcat(opc ,
+ "($src1, $src2, $src3):scale")),
+ [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$src3))],
+ "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_acc_lib<string opc, Intrinsic IntID>
+ : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+ IntRegs:$dst2),
+ !strconcat("$dst += ", !strconcat(opc ,
+ "($src1, $src2):lib")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$dst2))],
+ "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_nac_lib<string opc, Intrinsic IntID>
+ : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+ IntRegs:$dst2),
+ !strconcat("$dst -= ", !strconcat(opc ,
+ "($src1, $src2):lib")),
+ [(set IntRegs:$dst, (IntID IntRegs:$src1,
+ IntRegs:$src2, IntRegs:$dst2))],
+ "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_acc<string opc, Intrinsic IntID>
+ : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+ DoubleRegs:$dst2),
+ !strconcat("$dst += ", !strconcat(opc ,
+ "($src1, $src2)")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+ DoubleRegs:$src2, DoubleRegs:$dst2))],
+ "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_nac<string opc, Intrinsic IntID>
+ : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+ DoubleRegs:$dst2),
+ !strconcat("$dst -= ", !strconcat(opc ,
+ "($src1, $src2)")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+ DoubleRegs:$src2, DoubleRegs:$dst2))],
+ "$dst2 = $dst">;
+
+
+class df_MInst_dfdfdfsi_sc<string opc, Intrinsic IntID>
+ : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+ DoubleRegs:$src2, IntRegs:$src3),
+ !strconcat("$dst += ", !strconcat(opc ,
+ "($src1, $src2, $src3):scale")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+ DoubleRegs:$src2, IntRegs:$src3))],
+ "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_acc_lib<string opc, Intrinsic IntID>
+ : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+ DoubleRegs:$dst2),
+ !strconcat("$dst += ", !strconcat(opc ,
+ "($src1, $src2):lib")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+ DoubleRegs:$src2, DoubleRegs:$dst2))],
+ "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_nac_lib<string opc, Intrinsic IntID>
+ : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+ DoubleRegs:$dst2),
+ !strconcat("$dst -= ", !strconcat(opc ,
+ "($src1, $src2):lib")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+ DoubleRegs:$src2, DoubleRegs:$dst2))],
+ "$dst2 = $dst">;
+
+class qi_SInst_sfsf<string opc, Intrinsic IntID>
+ : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+ [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_sfu5<string opc, Intrinsic IntID>
+ : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+ [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class sf_ALU64_u10_pos<string opc, Intrinsic IntID>
+ : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
+ [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class sf_ALU64_u10_neg<string opc, Intrinsic IntID>
+ : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
+ [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class df_ALU64_u10_pos<string opc, Intrinsic IntID>
+ : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
+ [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class df_ALU64_u10_neg<string opc, Intrinsic IntID>
+ : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
+ !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
+ [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class di_MInst_diu6<string opc, Intrinsic IntID>
+ : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class di_MInst_diu4_rnd<string opc, Intrinsic IntID>
+ : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")),
+ [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_MInst_diu4_rnd_sat<string opc, Intrinsic IntID>
+ : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd:sat")),
+ [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_SInst_diu4_sat<string opc, Intrinsic IntID>
+ : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+ !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")),
+ [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+
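+// For example (illustrative): a def such as
+//   def HEXAGON_F2_sfadd : sf_MInst_sfsf<"sfadd", int_hexagon_F2_sfadd>;
+// below instantiates an MInst whose assembly string is
+//   $dst = sfadd($src1, $src2)
+// and whose selection pattern matches
+//   (int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2).
+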
+def HEXAGON_C4_fastcorner9:
+ qi_SInst_qiqi <"fastcorner9", int_hexagon_C4_fastcorner9>;
+def HEXAGON_C4_fastcorner9_not:
+ qi_SInst_qiqi <"!fastcorner9", int_hexagon_C4_fastcorner9_not>;
+def HEXAGON_M5_vrmpybuu:
+ di_MInst_didi <"vrmpybu", int_hexagon_M5_vrmpybuu>;
+def HEXAGON_M5_vrmacbuu:
+ di_MInst_dididi_acc <"vrmpybu", int_hexagon_M5_vrmacbuu>;
+def HEXAGON_M5_vrmpybsu:
+ di_MInst_didi <"vrmpybsu", int_hexagon_M5_vrmpybsu>;
+def HEXAGON_M5_vrmacbsu:
+ di_MInst_dididi_acc <"vrmpybsu", int_hexagon_M5_vrmacbsu>;
+def HEXAGON_M5_vmpybuu:
+ di_MInst_sisi <"vmpybu", int_hexagon_M5_vmpybuu>;
+def HEXAGON_M5_vmpybsu:
+ di_MInst_sisi <"vmpybsu", int_hexagon_M5_vmpybsu>;
+def HEXAGON_M5_vmacbuu:
+ di_MInst_disisi_acc <"vmpybu", int_hexagon_M5_vmacbuu>;
+def HEXAGON_M5_vmacbsu:
+ di_MInst_disisi_acc <"vmpybsu", int_hexagon_M5_vmacbsu>;
+def HEXAGON_M5_vdmpybsu:
+ di_MInst_didi_sat <"vdmpybsu", int_hexagon_M5_vdmpybsu>;
+def HEXAGON_M5_vdmacbsu:
+ di_MInst_dididi_acc_sat <"vdmpybsu", int_hexagon_M5_vdmacbsu>;
+def HEXAGON_A5_vaddhubs:
+ si_SInst_didi_sat <"vaddhub", int_hexagon_A5_vaddhubs>;
+def HEXAGON_S5_popcountp:
+ si_SInst_di <"popcount", int_hexagon_S5_popcountp>;
+def HEXAGON_S5_asrhub_rnd_sat_goodsyntax:
+ si_MInst_diu4_rnd_sat <"vasrhub", int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
+def HEXAGON_S5_asrhub_sat:
+ si_SInst_diu4_sat <"vasrhub", int_hexagon_S5_asrhub_sat>;
+def HEXAGON_S5_vasrhrnd_goodsyntax:
+ di_MInst_diu4_rnd <"vasrh", int_hexagon_S5_vasrhrnd_goodsyntax>;
+def HEXAGON_S2_asr_i_p_rnd:
+ di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p_rnd>;
+def HEXAGON_S2_asr_i_p_rnd_goodsyntax:
+ di_MInst_diu6 <"asrrnd", int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
+def HEXAGON_F2_sfadd:
+ sf_MInst_sfsf <"sfadd", int_hexagon_F2_sfadd>;
+def HEXAGON_F2_sfsub:
+ sf_MInst_sfsf <"sfsub", int_hexagon_F2_sfsub>;
+def HEXAGON_F2_sfmpy:
+ sf_MInst_sfsf <"sfmpy", int_hexagon_F2_sfmpy>;
+def HEXAGON_F2_sffma:
+ sf_MInst_sfsfsf_acc <"sfmpy", int_hexagon_F2_sffma>;
+def HEXAGON_F2_sffma_sc:
+ sf_MInst_sfsfsfsi_sc <"sfmpy", int_hexagon_F2_sffma_sc>;
+def HEXAGON_F2_sffms:
+ sf_MInst_sfsfsf_nac <"sfmpy", int_hexagon_F2_sffms>;
+def HEXAGON_F2_sffma_lib:
+ sf_MInst_sfsfsf_acc_lib <"sfmpy", int_hexagon_F2_sffma_lib>;
+def HEXAGON_F2_sffms_lib:
+ sf_MInst_sfsfsf_nac_lib <"sfmpy", int_hexagon_F2_sffms_lib>;
+def HEXAGON_F2_sfcmpeq:
+ qi_SInst_sfsf <"sfcmp.eq", int_hexagon_F2_sfcmpeq>;
+def HEXAGON_F2_sfcmpgt:
+ qi_SInst_sfsf <"sfcmp.gt", int_hexagon_F2_sfcmpgt>;
+def HEXAGON_F2_sfcmpge:
+ qi_SInst_sfsf <"sfcmp.ge", int_hexagon_F2_sfcmpge>;
+def HEXAGON_F2_sfcmpuo:
+ qi_SInst_sfsf <"sfcmp.uo", int_hexagon_F2_sfcmpuo>;
+def HEXAGON_F2_sfmax:
+ sf_MInst_sfsf <"sfmax", int_hexagon_F2_sfmax>;
+def HEXAGON_F2_sfmin:
+ sf_MInst_sfsf <"sfmin", int_hexagon_F2_sfmin>;
+def HEXAGON_F2_sfclass:
+ qi_SInst_sfu5 <"sfclass", int_hexagon_F2_sfclass>;
+def HEXAGON_F2_sfimm_p:
+ sf_ALU64_u10_pos <"sfmake", int_hexagon_F2_sfimm_p>;
+def HEXAGON_F2_sfimm_n:
+ sf_ALU64_u10_neg <"sfmake", int_hexagon_F2_sfimm_n>;
+def HEXAGON_F2_sffixupn:
+ sf_MInst_sfsf <"sffixupn", int_hexagon_F2_sffixupn>;
+def HEXAGON_F2_sffixupd:
+ sf_MInst_sfsf <"sffixupd", int_hexagon_F2_sffixupd>;
+def HEXAGON_F2_sffixupr:
+ sf_SInst_sf <"sffixupr", int_hexagon_F2_sffixupr>;
+def HEXAGON_F2_dfadd:
+ df_MInst_dfdf <"dfadd", int_hexagon_F2_dfadd>;
+def HEXAGON_F2_dfsub:
+ df_MInst_dfdf <"dfsub", int_hexagon_F2_dfsub>;
+def HEXAGON_F2_dfmpy:
+ df_MInst_dfdf <"dfmpy", int_hexagon_F2_dfmpy>;
+def HEXAGON_F2_dffma:
+ df_MInst_dfdfdf_acc <"dfmpy", int_hexagon_F2_dffma>;
+def HEXAGON_F2_dffms:
+ df_MInst_dfdfdf_nac <"dfmpy", int_hexagon_F2_dffms>;
+def HEXAGON_F2_dffma_lib:
+ df_MInst_dfdfdf_acc_lib <"dfmpy", int_hexagon_F2_dffma_lib>;
+def HEXAGON_F2_dffms_lib:
+ df_MInst_dfdfdf_nac_lib <"dfmpy", int_hexagon_F2_dffms_lib>;
+def HEXAGON_F2_dffma_sc:
+ df_MInst_dfdfdfsi_sc <"dfmpy", int_hexagon_F2_dffma_sc>;
+def HEXAGON_F2_dfmax:
+ df_MInst_dfdf <"dfmax", int_hexagon_F2_dfmax>;
+def HEXAGON_F2_dfmin:
+ df_MInst_dfdf <"dfmin", int_hexagon_F2_dfmin>;
+def HEXAGON_F2_dfcmpeq:
+ qi_ALU64_dfdf <"dfcmp.eq", int_hexagon_F2_dfcmpeq>;
+def HEXAGON_F2_dfcmpgt:
+ qi_ALU64_dfdf <"dfcmp.gt", int_hexagon_F2_dfcmpgt>;
+def HEXAGON_F2_dfcmpge:
+ qi_ALU64_dfdf <"dfcmp.ge", int_hexagon_F2_dfcmpge>;
+def HEXAGON_F2_dfcmpuo:
+ qi_ALU64_dfdf <"dfcmp.uo", int_hexagon_F2_dfcmpuo>;
+def HEXAGON_F2_dfclass:
+ qi_ALU64_dfu5 <"dfclass", int_hexagon_F2_dfclass>;
+def HEXAGON_F2_dfimm_p:
+ df_ALU64_u10_pos <"dfmake", int_hexagon_F2_dfimm_p>;
+def HEXAGON_F2_dfimm_n:
+ df_ALU64_u10_neg <"dfmake", int_hexagon_F2_dfimm_n>;
+def HEXAGON_F2_dffixupn:
+ df_MInst_dfdf <"dffixupn", int_hexagon_F2_dffixupn>;
+def HEXAGON_F2_dffixupd:
+ df_MInst_dfdf <"dffixupd", int_hexagon_F2_dffixupd>;
+def HEXAGON_F2_dffixupr:
+ df_SInst_df <"dffixupr", int_hexagon_F2_dffixupr>;
+def HEXAGON_F2_conv_sf2df:
+ df_SInst_sf <"convert_sf2df", int_hexagon_F2_conv_sf2df>;
+def HEXAGON_F2_conv_df2sf:
+ sf_SInst_df <"convert_df2sf", int_hexagon_F2_conv_df2sf>;
+def HEXAGON_F2_conv_uw2sf:
+ sf_SInst_si <"convert_uw2sf", int_hexagon_F2_conv_uw2sf>;
+def HEXAGON_F2_conv_uw2df:
+ df_SInst_si <"convert_uw2df", int_hexagon_F2_conv_uw2df>;
+def HEXAGON_F2_conv_w2sf:
+ sf_SInst_si <"convert_w2sf", int_hexagon_F2_conv_w2sf>;
+def HEXAGON_F2_conv_w2df:
+ df_SInst_si <"convert_w2df", int_hexagon_F2_conv_w2df>;
+def HEXAGON_F2_conv_ud2sf:
+ sf_SInst_di <"convert_ud2sf", int_hexagon_F2_conv_ud2sf>;
+def HEXAGON_F2_conv_ud2df:
+ df_SInst_di <"convert_ud2df", int_hexagon_F2_conv_ud2df>;
+def HEXAGON_F2_conv_d2sf:
+ sf_SInst_di <"convert_d2sf", int_hexagon_F2_conv_d2sf>;
+def HEXAGON_F2_conv_d2df:
+ df_SInst_di <"convert_d2df", int_hexagon_F2_conv_d2df>;
+def HEXAGON_F2_conv_sf2uw:
+ si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw>;
+def HEXAGON_F2_conv_sf2w:
+ si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w>;
+def HEXAGON_F2_conv_sf2ud:
+ di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud>;
+def HEXAGON_F2_conv_sf2d:
+ di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d>;
+def HEXAGON_F2_conv_df2uw:
+ si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw>;
+def HEXAGON_F2_conv_df2w:
+ si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w>;
+def HEXAGON_F2_conv_df2ud:
+ di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud>;
+def HEXAGON_F2_conv_df2d:
+ di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d>;
+def HEXAGON_F2_conv_sf2uw_chop:
+ si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw_chop>;
+def HEXAGON_F2_conv_sf2w_chop:
+ si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w_chop>;
+def HEXAGON_F2_conv_sf2ud_chop:
+ di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud_chop>;
+def HEXAGON_F2_conv_sf2d_chop:
+ di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d_chop>;
+def HEXAGON_F2_conv_df2uw_chop:
+ si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw_chop>;
+def HEXAGON_F2_conv_df2w_chop:
+ si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w_chop>;
+def HEXAGON_F2_conv_df2ud_chop:
+ di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud_chop>;
+def HEXAGON_F2_conv_df2d_chop:
+ di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d_chop>;
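At the source level, these new V5 intrinsics surface through Clang builtins
whose names mirror the int_hexagon_* identifiers above with a
__builtin_HEXAGON_ prefix. A hedged sketch (it assumes the matching Clang-side
builtin definitions are wired up, which is outside this diff):

    // Assumption: __builtin_HEXAGON_F2_sfadd is exposed with a
    // float(float, float) prototype mirroring int_hexagon_F2_sfadd.
    float scalar_fadd(float a, float b) {
      return __builtin_HEXAGON_F2_sfadd(a, b);
    }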
diff --git a/lib/Target/Hexagon/HexagonMCInst.h b/lib/Target/Hexagon/HexagonMCInst.h
new file mode 100644
index 0000000..7a16c24
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMCInst.h
@@ -0,0 +1,41 @@
+//===- HexagonMCInst.h - Hexagon sub-class of MCInst ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInst to allow some VLIW annotation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCINST_H
+#define HEXAGONMCINST_H
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+ class HexagonMCInst: public MCInst {
+ // Packet start and end markers
+ unsigned startPacket: 1, endPacket: 1;
+ const MachineInstr *MachineI;
+ public:
+ explicit HexagonMCInst(): MCInst(),
+ startPacket(0), endPacket(0) {}
+
+ const MachineInstr* getMI() const { return MachineI; }
+
+ void setMI(const MachineInstr *MI) { MachineI = MI; }
+
+ bool isStartPacket() const { return (startPacket); }
+ bool isEndPacket() const { return (endPacket); }
+
+ void setStartPacket(bool yes) { startPacket = yes; }
+ void setEndPacket(bool yes) { endPacket = yes; }
+ };
+}
+
+#endif
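The packet markers above are two 1-bit bitfields packed into a single
unsigned, so the VLIW annotation is essentially free per instruction. A
self-contained mock of the same scheme (illustrative only, independent of the
LLVM headers):

    #include <cassert>

    struct PacketAnnotatedInst {
      unsigned startPacket : 1, endPacket : 1; // VLIW packet boundary flags
      PacketAnnotatedInst() : startPacket(0), endPacket(0) {}
      bool isStartPacket() const { return startPacket; }
      bool isEndPacket() const { return endPacket; }
      void setStartPacket(bool yes) { startPacket = yes; }
      void setEndPacket(bool yes) { endPacket = yes; }
    };

    int main() {
      PacketAnnotatedInst I;
      I.setStartPacket(true); // a single-instruction packet both opens...
      I.setEndPacket(true);   // ...and closes itself
      assert(I.isStartPacket() && I.isEndPacket());
    }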
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index fbb331b..70bddcc 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -49,7 +49,7 @@ void llvm::HexagonLowerToMC(const MachineInstr* MI, MCInst& MCI,
switch (MO.getType()) {
default:
MI->dump();
- assert(0 && "unknown operand type");
+ llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
if (MO.isImplicit()) continue;
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
new file mode 100644
index 0000000..7ece408
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -0,0 +1,647 @@
+//===----- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the NewValueJump pass in Hexagon.
+// Ideally, we should merge this as a peephole pass prior to register
+// allocation, but because we have a spill in between the feeder and new value
+// jump instructions, we are forced to run it after register allocation.
+// Having said that, we should re-attempt to pull this earlier at some point
+// in the future.
+
+// The basic approach looks for a sequence of a predicated jump, the compare
+// instruction that generates the predicate, and the feeder to the predicate.
+// Once it finds all three, it collapses the compare and jump instructions
+// into a single new value jump instruction.
+//
+//
+//===----------------------------------------------------------------------===//
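+//
+// For example (an illustrative sketch; the comments in the code below show
+// the exact machine-level cases this pass reasons about):
+//   r1 = memw(r2)               // feeder
+//   p0 = cmp.eq(r1, #0)         // compare generating the predicate
+//   if (p0.new) jump:t .LBB0_1  // predicated jump
+// collapses into a single new value jump, later packetized with its feeder:
+//   { r1 = memw(r2)
+//     if (cmp.eq(r1.new, #0)) jump:t .LBB0_1 }
+//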
+#define DEBUG_TYPE "hexagon-nvj"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+
+#include <map>
+
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
+
+static cl::opt<int>
+DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, cl::desc(
+ "Maximum number of predicated jumps to be converted to New Value Jump"));
+
+static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
+ cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable New Value Jumps"));
+
+namespace {
+ struct HexagonNewValueJump : public MachineFunctionPass {
+ const HexagonInstrInfo *QII;
+ const HexagonRegisterInfo *QRI;
+
+ public:
+ static char ID;
+
+ HexagonNewValueJump() : MachineFunctionPass(ID) { }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const {
+ return "Hexagon NewValueJump";
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ private:
+
+ };
+
+} // end of anonymous namespace
+
+char HexagonNewValueJump::ID = 0;
+
+// We have identified that this II could be a feeder to an NVJ;
+// verify that it can be.
+static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
+ const TargetRegisterInfo *TRI,
+ MachineBasicBlock::iterator II,
+ MachineBasicBlock::iterator end,
+ MachineBasicBlock::iterator skip,
+ MachineFunction &MF) {
+
+  // A predicated instruction cannot be a feeder to an NVJ.
+ if (QII->isPredicated(II))
+ return false;
+
+ // Bail out if feederReg is a paired register (double regs in
+  // our case). One would think that we could check whether a given
+  // register, cmpReg1 or cmpReg2, is a subregister of feederReg
+  // using -- if (QRI->isSubRegister(feederReg, cmpReg1)) -- logic
+  // before the callsite of this function.
+  // But we cannot, as it comes in the following fashion.
+ // %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+ // %R0<def> = KILL %R0, %D0<imp-use,kill>
+ // %P0<def> = CMPEQri %R0<kill>, 0
+ // Hence, we need to check if it's a KILL instruction.
+ if (II->getOpcode() == TargetOpcode::KILL)
+ return false;
+
+
+  // Make sure there is no 'def' or 'use' of any of the uses of the
+  // feeder insn between its definition (this MI) and the jump (jmpInst),
+  // skipping the compare (cmpInst).
+ // Here's the example.
+ // r21=memub(r22+r24<<#0)
+ // p0 = cmp.eq(r21, #0)
+ // r4=memub(r3+r21<<#0)
+ // if (p0.new) jump:t .LBB29_45
+ // Without this check, it will be converted into
+ // r4=memub(r3+r21<<#0)
+ // r21=memub(r22+r24<<#0)
+ // p0 = cmp.eq(r21, #0)
+ // if (p0.new) jump:t .LBB29_45
+  // and would result in WAR hazards if converted to a New Value Jump.
+
+ for (unsigned i = 0; i < II->getNumOperands(); ++i) {
+ if (II->getOperand(i).isReg() &&
+ (II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
+ MachineBasicBlock::iterator localII = II;
+ ++localII;
+ unsigned Reg = II->getOperand(i).getReg();
+ for (MachineBasicBlock::iterator localBegin = localII;
+ localBegin != end; ++localBegin) {
+        if (localBegin == skip) continue;
+ // Check for Subregisters too.
+ if (localBegin->modifiesRegister(Reg, TRI) ||
+ localBegin->readsRegister(Reg, TRI))
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+// These are the common checks that need to be performed
+// to determine whether
+// 1. the compare instruction can be moved before the jump.
+// 2. the feeder to the compare instruction can be moved before the jump.
+static bool commonChecksToProhibitNewValueJump(bool afterRA,
+ MachineBasicBlock::iterator MII) {
+
+ // If store in path, bail out.
+ if (MII->getDesc().mayStore())
+ return false;
+
+ // if call in path, bail out.
+ if (MII->getOpcode() == Hexagon::CALLv3)
+ return false;
+
+ // if NVJ is running prior to RA, do the following checks.
+ if (!afterRA) {
+    // The following TargetOpcode instructions are not safe for
+    // new value jumps. If they are in the path, bail out.
+    // KILL sets the kill flag on its operand. It also sets up a
+    // single register, out of a pair.
+ // %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+ // %R0<def> = KILL %R0, %D0<imp-use,kill>
+ // %P0<def> = CMPEQri %R0<kill>, 0
+ // PHI can be anything after RA.
+    // COPY can rematerialize things in between the feeder, compare and nvj.
+ if (MII->getOpcode() == TargetOpcode::KILL ||
+ MII->getOpcode() == TargetOpcode::PHI ||
+ MII->getOpcode() == TargetOpcode::COPY)
+ return false;
+
+    // For the following Hexagon pseudo instructions, the "use" and "def"
+    // of registers are set by individual passes in the backend. At this
+    // time, we don't know the scope of usage and definitions of these
+    // instructions.
+ if (MII->getOpcode() == Hexagon::TFR_condset_rr ||
+ MII->getOpcode() == Hexagon::TFR_condset_ii ||
+ MII->getOpcode() == Hexagon::TFR_condset_ri ||
+ MII->getOpcode() == Hexagon::TFR_condset_ir ||
+ MII->getOpcode() == Hexagon::LDriw_pred ||
+ MII->getOpcode() == Hexagon::STriw_pred)
+ return false;
+ }
+
+ return true;
+}
+
+static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
+ const TargetRegisterInfo *TRI,
+ MachineBasicBlock::iterator II,
+ unsigned pReg,
+ bool secondReg,
+ bool optLocation,
+ MachineBasicBlock::iterator end,
+ MachineFunction &MF) {
+
+ MachineInstr *MI = II;
+
+ // If the second operand of the compare is an imm, make sure it's in the
+ // range specified by the arch.
+ if (!secondReg) {
+ int64_t v = MI->getOperand(2).getImm();
+ if (MI->getOpcode() == Hexagon::CMPGEri ||
+ (MI->getOpcode() == Hexagon::CMPGEUri && v > 0))
+ --v;
+
+ if (!(isUInt<5>(v) ||
+ ((MI->getOpcode() == Hexagon::CMPEQri ||
+ MI->getOpcode() == Hexagon::CMPGTri ||
+ MI->getOpcode() == Hexagon::CMPGEri) &&
+ (v == -1))))
+ return false;
+ }
+
+ unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning.
+ cmpReg1 = MI->getOperand(1).getReg();
+
+ if (secondReg) {
+ cmpOp2 = MI->getOperand(2).getReg();
+
+    // Make sure that the second register is not from a COPY.
+    // At the machine code level we don't need this, but if we decide
+    // to move new value jumps prior to RA, we would need this.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+ MachineInstr *def = MRI.getVRegDef(cmpOp2);
+ if (def->getOpcode() == TargetOpcode::COPY)
+ return false;
+ }
+ }
+
+  // Walk the instructions after the compare (predicate def) to the jump,
+  // and make sure the following conditions are satisfied.
+  ++II;
+ for (MachineBasicBlock::iterator localII = II; localII != end;
+ ++localII) {
+
+ // Check 1.
+ // If "common" checks fail, bail out.
+ if (!commonChecksToProhibitNewValueJump(optLocation, localII))
+ return false;
+
+ // Check 2.
+ // If there is a def or use of predicate (result of compare), bail out.
+ if (localII->modifiesRegister(pReg, TRI) ||
+ localII->readsRegister(pReg, TRI))
+ return false;
+
+ // Check 3.
+ // If there is a def of any of the use of the compare (operands of compare),
+ // bail out.
+ // Eg.
+ // p0 = cmp.eq(r2, r0)
+ // r2 = r4
+ // if (p0.new) jump:t .LBB28_3
+ if (localII->modifiesRegister(cmpReg1, TRI) ||
+ (secondReg && localII->modifiesRegister(cmpOp2, TRI)))
+ return false;
+ }
+ return true;
+}
+
+// Given a compare operator, return a matching New Value Jump
+// compare operator. Make sure that MI here is included in
+// HexagonInstrInfo.cpp::isNewValueJumpCandidate
+static unsigned getNewValueJumpOpcode(const MachineInstr *MI, int reg,
+ bool secondRegNewified) {
+ switch (MI->getOpcode()) {
+ case Hexagon::CMPEQrr:
+ return Hexagon::JMP_EQrrPt_nv_V4;
+
+ case Hexagon::CMPEQri: {
+ if (reg >= 0)
+ return Hexagon::JMP_EQriPt_nv_V4;
+ else
+ return Hexagon::JMP_EQriPtneg_nv_V4;
+ }
+
+ case Hexagon::CMPLTrr:
+ case Hexagon::CMPGTrr: {
+ if (secondRegNewified)
+ return Hexagon::JMP_GTrrdnPt_nv_V4;
+ else
+ return Hexagon::JMP_GTrrPt_nv_V4;
+ }
+
+ case Hexagon::CMPGEri: {
+ if (reg >= 1)
+ return Hexagon::JMP_GTriPt_nv_V4;
+ else
+ return Hexagon::JMP_GTriPtneg_nv_V4;
+ }
+
+ case Hexagon::CMPGTri: {
+ if (reg >= 0)
+ return Hexagon::JMP_GTriPt_nv_V4;
+ else
+ return Hexagon::JMP_GTriPtneg_nv_V4;
+ }
+
+ case Hexagon::CMPLTUrr:
+ case Hexagon::CMPGTUrr: {
+ if (secondRegNewified)
+ return Hexagon::JMP_GTUrrdnPt_nv_V4;
+ else
+ return Hexagon::JMP_GTUrrPt_nv_V4;
+ }
+
+ case Hexagon::CMPGTUri:
+ return Hexagon::JMP_GTUriPt_nv_V4;
+
+ case Hexagon::CMPGEUri: {
+ if (reg == 0)
+ return Hexagon::JMP_EQrrPt_nv_V4;
+ else
+ return Hexagon::JMP_GTUriPt_nv_V4;
+ }
+
+ default:
+ llvm_unreachable("Could not find matching New Value Jump instruction.");
+ }
+ // return *some value* to avoid compiler warning
+ return 0;
+}
+
+bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
+
+ DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+ << "********** Function: "
+ << MF.getFunction()->getName() << "\n");
+
+#if 0
+ // for now disable this, if we move NewValueJump before register
+ // allocation we need this information.
+ LiveVariables &LVs = getAnalysis<LiveVariables>();
+#endif
+
+ QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+ QRI =
+ static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+
+ if (!QRI->Subtarget.hasV4TOps() ||
+ DisableNewValueJumps) {
+ return false;
+ }
+
+ int nvjCount = DbgNVJCount;
+ int nvjGenerated = 0;
+
+ // Loop through all the bb's of the function
+ for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+ MBBb != MBBe; ++MBBb) {
+ MachineBasicBlock* MBB = MBBb;
+
+ DEBUG(dbgs() << "** dumping bb ** "
+ << MBB->getNumber() << "\n");
+ DEBUG(MBB->dump());
+ DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+ bool foundJump = false;
+ bool foundCompare = false;
+ bool invertPredicate = false;
+ unsigned predReg = 0; // predicate reg of the jump.
+ unsigned cmpReg1 = 0;
+ int cmpOp2 = 0;
+ bool MO1IsKill = false;
+ bool MO2IsKill = false;
+ MachineBasicBlock::iterator jmpPos;
+ MachineBasicBlock::iterator cmpPos;
+ MachineInstr *cmpInstr = NULL, *jmpInstr = NULL;
+ MachineBasicBlock *jmpTarget = NULL;
+ bool afterRA = false;
+ bool isSecondOpReg = false;
+ bool isSecondOpNewified = false;
+ // Traverse the basic block - bottom up
+ for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
+ MII != E;) {
+ MachineInstr *MI = --MII;
+ if (MI->isDebugValue()) {
+ continue;
+ }
+
+ if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
+ break;
+
+ DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");
+
+ if (!foundJump &&
+ (MI->getOpcode() == Hexagon::JMP_c ||
+ MI->getOpcode() == Hexagon::JMP_cNot ||
+ MI->getOpcode() == Hexagon::JMP_cdnPt ||
+ MI->getOpcode() == Hexagon::JMP_cdnPnt ||
+ MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+ MI->getOpcode() == Hexagon::JMP_cdnNotPnt)) {
+        // This is where we would insert the compare and the
+        // instruction that feeds the compare.
+ jmpPos = MII;
+ jmpInstr = MI;
+ predReg = MI->getOperand(0).getReg();
+ afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+
+        // If the ifconverter had not messed up the kill flags of the
+        // operands, the following check on the kill flag would suffice.
+ // if(!jmpInstr->getOperand(0).isKill()) break;
+
+        // This predicate register is live out of the BB;
+        // this would only work if we could actually use live
+        // variable analysis on phys regs - but LLVM does not
+        // provide LV analysis on phys regs.
+ //if(LVs.isLiveOut(predReg, *MBB)) break;
+
+        // Get all the successors of this block - there will always
+        // be 2. Check if the predicate register is live into those
+        // successors. If yes, we cannot delete the predicate -
+        // we do this only because LLVM does not provide LiveOut
+        // at the BB level.
+ bool predLive = false;
+ for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+ SIE = MBB->succ_end(); SI != SIE; ++SI) {
+ MachineBasicBlock* succMBB = *SI;
+ if (succMBB->isLiveIn(predReg)) {
+ predLive = true;
+ }
+ }
+ if (predLive)
+ break;
+
+ jmpTarget = MI->getOperand(1).getMBB();
+ foundJump = true;
+ if (MI->getOpcode() == Hexagon::JMP_cNot ||
+ MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+ MI->getOpcode() == Hexagon::JMP_cdnNotPnt) {
+ invertPredicate = true;
+ }
+ continue;
+ }
+
+ // No new value jump if there is a barrier. A barrier has to be in its
+ // own packet. A barrier has zero operands. We conservatively bail out
+ // here if we see any instruction with zero operands.
+ if (foundJump && MI->getNumOperands() == 0)
+ break;
+
+ if (foundJump &&
+ !foundCompare &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(0).getReg() == predReg) {
+
+          // Not all compares can be new value compares. Arch Spec: 7.6.1.1
+ if (QII->isNewValueJumpCandidate(MI)) {
+
+ assert((MI->getDesc().isCompare()) &&
+ "Only compare instruction can be collapsed into New Value Jump");
+ isSecondOpReg = MI->getOperand(2).isReg();
+
+ if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
+ afterRA, jmpPos, MF))
+ break;
+
+ cmpInstr = MI;
+ cmpPos = MII;
+ foundCompare = true;
+
+ // We need cmpReg1 and cmpOp2(imm or reg) while building
+ // new value jump instruction.
+ cmpReg1 = MI->getOperand(1).getReg();
+ if (MI->getOperand(1).isKill())
+ MO1IsKill = true;
+
+ if (isSecondOpReg) {
+ cmpOp2 = MI->getOperand(2).getReg();
+ if (MI->getOperand(2).isKill())
+ MO2IsKill = true;
+ } else
+ cmpOp2 = MI->getOperand(2).getImm();
+ continue;
+ }
+ }
+
+ if (foundCompare && foundJump) {
+
+ // If "common" checks fail, bail out on this BB.
+ if (!commonChecksToProhibitNewValueJump(afterRA, MII))
+ break;
+
+ bool foundFeeder = false;
+ MachineBasicBlock::iterator feederPos = MII;
+ if (MI->getOperand(0).isReg() &&
+ MI->getOperand(0).isDef() &&
+ (MI->getOperand(0).getReg() == cmpReg1 ||
+ (isSecondOpReg &&
+ MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {
+
+ unsigned feederReg = MI->getOperand(0).getReg();
+
+ // First try to see if we can get the feeder from the first operand
+          // of the compare. If we cannot, and if secondOpReg is true
+ // (second operand of the compare is also register), try that one.
+ // TODO: Try to come up with some heuristic to figure out which
+ // feeder would benefit.
+
+ if (feederReg == cmpReg1) {
+ if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
+ if (!isSecondOpReg)
+ break;
+ else
+ continue;
+ } else
+ foundFeeder = true;
+ }
+
+ if (!foundFeeder &&
+ isSecondOpReg &&
+ feederReg == (unsigned) cmpOp2)
+ if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
+ break;
+
+ if (isSecondOpReg) {
+ // In case of CMPLT, or CMPLTU, or EQ with the second register
+ // to newify, swap the operands.
+ if (cmpInstr->getOpcode() == Hexagon::CMPLTrr ||
+ cmpInstr->getOpcode() == Hexagon::CMPLTUrr ||
+ (cmpInstr->getOpcode() == Hexagon::CMPEQrr &&
+ feederReg == (unsigned) cmpOp2)) {
+ unsigned tmp = cmpReg1;
+ bool tmpIsKill = MO1IsKill;
+ cmpReg1 = cmpOp2;
+ MO1IsKill = MO2IsKill;
+ cmpOp2 = tmp;
+ MO2IsKill = tmpIsKill;
+ }
+
+            // Now that we have swapped the operands, all we need to check
+            // is whether the second operand (after the swap) is the feeder;
+            // and if it is, make a note.
+ if (feederReg == (unsigned)cmpOp2)
+ isSecondOpNewified = true;
+ }
+
+          // Now that we are moving the feeder closer to the jump,
+          // make sure we are respecting the kill values of
+          // the operands of the feeder.
+
+ bool updatedIsKill = false;
+ for (unsigned i = 0; i < MI->getNumOperands(); i++) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isUse()) {
+ unsigned feederReg = MO.getReg();
+ for (MachineBasicBlock::iterator localII = feederPos,
+ end = jmpPos; localII != end; localII++) {
+ MachineInstr *localMI = localII;
+ for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
+ MachineOperand &localMO = localMI->getOperand(j);
+ if (localMO.isReg() && localMO.isUse() &&
+ localMO.isKill() && feederReg == localMO.getReg()) {
+                  // We found a kill of a use register;
+                  // move the kill flag onto the feeder's operand.
+ localMO.setIsKill(false);
+ MO.setIsKill();
+ updatedIsKill = true;
+ break;
+ }
+ }
+ if (updatedIsKill) break;
+ }
+ }
+ if (updatedIsKill) break;
+ }
+
+ MBB->splice(jmpPos, MI->getParent(), MI);
+ MBB->splice(jmpPos, MI->getParent(), cmpInstr);
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstr *NewMI;
+
+ assert((QII->isNewValueJumpCandidate(cmpInstr)) &&
+ "This compare is not a New Value Jump candidate.");
+ unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
+ isSecondOpNewified);
+ if (invertPredicate)
+ opc = QII->getInvertedPredicatedOpcode(opc);
+
+        // Manage the conversions from CMPGEUri to either CMPEQrr
+        // or CMPGTUri properly. See the Arch spec for CMPGEUri instructions.
+        // This has to come after the call to getNewValueJumpOpcode, since
+        // this conversion may modify the second operand of the compare.
+ if (cmpInstr->getOpcode() == Hexagon::CMPGEUri) {
+ if (cmpOp2 == 0) {
+ cmpOp2 = cmpReg1;
+ MO2IsKill = MO1IsKill;
+ isSecondOpReg = true;
+ } else
+ --cmpOp2;
+ }
+
+ // Manage the conversions from CMPGEri to CMPGTUri properly.
+ // See Arch spec for CMPGEri instructions.
+ if (cmpInstr->getOpcode() == Hexagon::CMPGEri)
+ --cmpOp2;
+
+ if (isSecondOpReg) {
+ NewMI = BuildMI(*MBB, jmpPos, dl,
+ QII->get(opc))
+ .addReg(cmpReg1, getKillRegState(MO1IsKill))
+ .addReg(cmpOp2, getKillRegState(MO2IsKill))
+ .addMBB(jmpTarget);
+ }
+ else {
+ NewMI = BuildMI(*MBB, jmpPos, dl,
+ QII->get(opc))
+ .addReg(cmpReg1, getKillRegState(MO1IsKill))
+ .addImm(cmpOp2)
+ .addMBB(jmpTarget);
+ }
+
+ assert(NewMI && "New Value Jump Instruction Not created!");
+ if (cmpInstr->getOperand(0).isReg() &&
+ cmpInstr->getOperand(0).isKill())
+ cmpInstr->getOperand(0).setIsKill(false);
+ if (cmpInstr->getOperand(1).isReg() &&
+ cmpInstr->getOperand(1).isKill())
+ cmpInstr->getOperand(1).setIsKill(false);
+ cmpInstr->eraseFromParent();
+ jmpInstr->eraseFromParent();
+ ++nvjGenerated;
+ ++NumNVJGenerated;
+ break;
+ }
+ }
+ }
+ }
+
+ return true;
+
+}
+
+FunctionPass *llvm::createHexagonNewValueJump() {
+ return new HexagonNewValueJump();
+}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2a9de92..2c23674 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -63,6 +63,7 @@ const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
return CalleeSavedRegsV2;
case HexagonSubtarget::V3:
case HexagonSubtarget::V4:
+ case HexagonSubtarget::V5:
return CalleeSavedRegsV3;
}
llvm_unreachable("Callee saved registers requested for unknown architecture "
@@ -109,6 +110,7 @@ HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
return CalleeSavedRegClassesV2;
case HexagonSubtarget::V3:
case HexagonSubtarget::V4:
+ case HexagonSubtarget::V5:
return CalleeSavedRegClassesV3;
}
llvm_unreachable("Callee saved register classes requested for unknown "
@@ -179,13 +181,15 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// r0 = add(r30, #10000)
// r0 = memw(r0)
if ( (MI.getOpcode() == Hexagon::LDriw) ||
- (MI.getOpcode() == Hexagon::LDrid) ||
- (MI.getOpcode() == Hexagon::LDrih) ||
- (MI.getOpcode() == Hexagon::LDriuh) ||
- (MI.getOpcode() == Hexagon::LDrib) ||
- (MI.getOpcode() == Hexagon::LDriub) ) {
+ (MI.getOpcode() == Hexagon::LDrid) ||
+ (MI.getOpcode() == Hexagon::LDrih) ||
+ (MI.getOpcode() == Hexagon::LDriuh) ||
+ (MI.getOpcode() == Hexagon::LDrib) ||
+ (MI.getOpcode() == Hexagon::LDriub) ||
+ (MI.getOpcode() == Hexagon::LDriw_f) ||
+ (MI.getOpcode() == Hexagon::LDrid_f)) {
unsigned dstReg = (MI.getOpcode() == Hexagon::LDrid) ?
- *getSubRegisters(MI.getOperand(0).getReg()) :
+ getSubReg(MI.getOperand(0).getReg(), Hexagon::subreg_loreg) :
MI.getOperand(0).getReg();
// Check if offset can fit in addi.
@@ -203,10 +207,13 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(i).ChangeToRegister(dstReg, false, false, true);
MI.getOperand(i+1).ChangeToImmediate(0);
- } else if ((MI.getOpcode() == Hexagon::STriw) ||
+ } else if ((MI.getOpcode() == Hexagon::STriw_indexed) ||
+ (MI.getOpcode() == Hexagon::STriw) ||
(MI.getOpcode() == Hexagon::STrid) ||
(MI.getOpcode() == Hexagon::STrih) ||
- (MI.getOpcode() == Hexagon::STrib)) {
+ (MI.getOpcode() == Hexagon::STrib) ||
+ (MI.getOpcode() == Hexagon::STrid_f) ||
+ (MI.getOpcode() == Hexagon::STriw_f)) {
// For stores, we need a reserved register. Change
// memw(r30 + #10000) = r0 to:
//
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 6cf727b..85355ae 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -73,6 +73,10 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
return true;
}
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ return true;
+ }
+
// Debug information queries.
unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index d44eae3..fe41fc3 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -131,6 +131,9 @@ let Namespace = "Hexagon" in {
def SA1 : Rc<2, "sa1">, DwarfRegNum<[69]>;
def LC1 : Rc<3, "lc1">, DwarfRegNum<[70]>;
+ def M0 : Rc<6, "m0">, DwarfRegNum<[71]>;
+ def M1 : Rc<7, "m1">, DwarfRegNum<[72]>;
+
def PC : Rc<9, "pc">, DwarfRegNum<[32]>; // is the Dwarf number correct?
def GP : Rc<11, "gp">, DwarfRegNum<[33]>; // is the Dwarf number correct?
}
@@ -140,19 +143,15 @@ let Namespace = "Hexagon" in {
// FIXME: the register order should be defined in terms of the preferred
// allocation order...
//
-def IntRegs : RegisterClass<"Hexagon", [i32], 32,
+def IntRegs : RegisterClass<"Hexagon", [i32,f32], 32,
(add (sequence "R%u", 0, 9),
(sequence "R%u", 12, 28),
R10, R11, R29, R30, R31)> {
}
-
-
-def DoubleRegs : RegisterClass<"Hexagon", [i64], 64,
+def DoubleRegs : RegisterClass<"Hexagon", [i64,f64], 64,
(add (sequence "D%u", 0, 4),
- (sequence "D%u", 6, 13), D5, D14, D15)> {
- let SubRegClasses = [(IntRegs subreg_loreg, subreg_hireg)];
-}
+ (sequence "D%u", 6, 13), D5, D14, D15)>;
def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))>
@@ -162,6 +161,7 @@ def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))>
def CRRegs : RegisterClass<"Hexagon", [i32], 32,
(add (sequence "LC%u", 0, 1),
- (sequence "SA%u", 0, 1), PC, GP)> {
+ (sequence "SA%u", 0, 1),
+ (sequence "M%u", 0, 1), PC, GP)> {
let Size = 32;
}
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index 66a00e1..2468f0b 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -1,4 +1,4 @@
-//===- HexagonRemoveExtendArgs.cpp - Remove unecessary argument sign extends =//
+//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends //
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index fbea445..d1076b8 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -13,7 +13,6 @@ def LSUNIT : FuncUnit;
def MUNIT : FuncUnit;
def SUNIT : FuncUnit;
-
// Itinerary classes
def ALU32 : InstrItinClass;
def ALU64 : InstrItinClass;
@@ -24,23 +23,31 @@ def LD : InstrItinClass;
def M : InstrItinClass;
def ST : InstrItinClass;
def S : InstrItinClass;
+def SYS : InstrItinClass;
+def MARKER : InstrItinClass;
def PSEUDO : InstrItinClass;
-
def HexagonItineraries :
- ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [
- InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>,
- InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>,
- InstrItinData<CR , [InstrStage<1, [SUNIT]>]>,
- InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>,
- InstrItinData<JR , [InstrStage<1, [MUNIT]>]>,
- InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>,
- InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>,
- InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>,
- InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>,
- InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>
-]>;
-
+ ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [
+ InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>,
+ InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>,
+ InstrItinData<CR , [InstrStage<1, [SUNIT]>]>,
+ InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>,
+ InstrItinData<JR , [InstrStage<1, [MUNIT]>]>,
+ InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>,
+ InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>,
+ InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>,
+ InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>,
+ InstrItinData<SYS , [InstrStage<1, [LSUNIT]>]>,
+ InstrItinData<MARKER , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>
+ ]>;
+
+def HexagonModel : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItineraries;
+}
//===----------------------------------------------------------------------===//
// V4 Machine Info +
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index 4cf66fe..9b41126 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -23,7 +23,6 @@
// | SLOT3 | XTYPE ALU32 J CR |
// |===========|==================================================|
-
// Functional Units.
def SLOT0 : FuncUnit;
def SLOT1 : FuncUnit;
@@ -34,22 +33,32 @@ def SLOT3 : FuncUnit;
def NV_V4 : InstrItinClass;
def MEM_V4 : InstrItinClass;
// ALU64/M/S Instruction classes of V2 are collectively known as XTYPE in V4.
+def PREFIX : InstrItinClass;
+
+def HexagonItinerariesV4 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3], [], [
+ InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<CR , [InstrStage<1, [SLOT3]>]>,
+ InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<JR , [InstrStage<1, [SLOT2]>]>,
+ InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<SYS , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<MARKER , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>
+ ]>;
-def HexagonItinerariesV4 : ProcessorItineraries<
- [SLOT0, SLOT1, SLOT2, SLOT3], [], [
- InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<JR , [InstrStage<1, [SLOT2]>]>,
- InstrItinData<CR , [InstrStage<1, [SLOT3]>]>,
- InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]>
-]>;
+def HexagonModelV4 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV4;
+}
//===----------------------------------------------------------------------===//
// Hexagon V4 Resource Definitions -
diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
index d10c9f2..a81cd91 100644
--- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
+++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
@@ -14,7 +14,7 @@
// {p0 = cmp.eq(r0,r1)}
// {r3 = mux(p0,#1,#3)}
//
-// This requires two packets. If we use .new predicated immediate transfers,
+// This requires two packets. If we use .new predicated immediate transfers,
// then we can do this in a single packet, e.g.:
//
// {p0 = cmp.eq(r0,r1)
@@ -81,40 +81,126 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
++MII) {
MachineInstr *MI = MII;
- int Opc = MI->getOpcode();
- if (Opc == Hexagon::TFR_condset_rr) {
-
- int DestReg = MI->getOperand(0).getReg();
- int SrcReg1 = MI->getOperand(2).getReg();
- int SrcReg2 = MI->getOperand(3).getReg();
-
- // Minor optimization: do not emit the predicated copy if the source and
- // the destination is the same register
- if (DestReg != SrcReg1) {
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cPt),
- DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+ int Opc1, Opc2;
+ switch(MI->getOpcode()) {
+ case Hexagon::TFR_condset_rr:
+ case Hexagon::TFR_condset_rr_f:
+ case Hexagon::TFR_condset_rr64_f: {
+ int DestReg = MI->getOperand(0).getReg();
+ int SrcReg1 = MI->getOperand(2).getReg();
+ int SrcReg2 = MI->getOperand(3).getReg();
+
+ if (MI->getOpcode() == Hexagon::TFR_condset_rr ||
+ MI->getOpcode() == Hexagon::TFR_condset_rr_f) {
+ Opc1 = Hexagon::TFR_cPt;
+ Opc2 = Hexagon::TFR_cNotPt;
+ }
+ else if (MI->getOpcode() == Hexagon::TFR_condset_rr64_f) {
+ Opc1 = Hexagon::TFR64_cPt;
+ Opc2 = Hexagon::TFR64_cNotPt;
+ }
+
+ // Minor optimization: do not emit the predicated copy if the source
+ // and the destination is the same register.
+ if (DestReg != SrcReg1) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc1),
+ DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+ }
+ if (DestReg != SrcReg2) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc2),
+ DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+ }
+ MII = MBB->erase(MI);
+ --MII;
+ break;
}
- if (DestReg != SrcReg2) {
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cNotPt),
- DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+ case Hexagon::TFR_condset_ri:
+ case Hexagon::TFR_condset_ri_f: {
+ int DestReg = MI->getOperand(0).getReg();
+ int SrcReg1 = MI->getOperand(2).getReg();
+
+ // Do not emit the predicated copy if the source and the destination
+ // is the same register.
+ if (DestReg != SrcReg1) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFR_cPt), DestReg).
+ addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+ }
+ if (MI->getOpcode() == Hexagon::TFR_condset_ri ) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cNotPt), DestReg).
+ addReg(MI->getOperand(1).getReg()).
+ addImm(MI->getOperand(3).getImm());
+ } else if (MI->getOpcode() == Hexagon::TFR_condset_ri_f ) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+ addReg(MI->getOperand(1).getReg()).
+ addFPImm(MI->getOperand(3).getFPImm());
+ }
+
+ MII = MBB->erase(MI);
+ --MII;
+ break;
+ }
+ case Hexagon::TFR_condset_ir:
+ case Hexagon::TFR_condset_ir_f: {
+ int DestReg = MI->getOperand(0).getReg();
+ int SrcReg2 = MI->getOperand(3).getReg();
+
+ if (MI->getOpcode() == Hexagon::TFR_condset_ir ) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cPt), DestReg).
+ addReg(MI->getOperand(1).getReg()).
+ addImm(MI->getOperand(2).getImm());
+ } else if (MI->getOpcode() == Hexagon::TFR_condset_ir_f ) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cPt_f), DestReg).
+ addReg(MI->getOperand(1).getReg()).
+ addFPImm(MI->getOperand(2).getFPImm());
+ }
+
+ // Do not emit the predicated copy if the source and
+ // the destination is the same register.
+ if (DestReg != SrcReg2) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFR_cNotPt), DestReg).
+ addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+ }
+ MII = MBB->erase(MI);
+ --MII;
+ break;
+ }
+ case Hexagon::TFR_condset_ii:
+ case Hexagon::TFR_condset_ii_f: {
+ int DestReg = MI->getOperand(0).getReg();
+ int SrcReg1 = MI->getOperand(1).getReg();
+
+ if (MI->getOpcode() == Hexagon::TFR_condset_ii ) {
+ int Immed1 = MI->getOperand(2).getImm();
+ int Immed2 = MI->getOperand(3).getImm();
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cPt),
+ DestReg).addReg(SrcReg1).addImm(Immed1);
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cNotPt),
+ DestReg).addReg(SrcReg1).addImm(Immed2);
+ } else if (MI->getOpcode() == Hexagon::TFR_condset_ii_f ) {
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cPt_f), DestReg).
+ addReg(SrcReg1).
+ addFPImm(MI->getOperand(2).getFPImm());
+ BuildMI(*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+ addReg(SrcReg1).
+ addFPImm(MI->getOperand(3).getFPImm());
+ }
+ MII = MBB->erase(MI);
+ --MII;
+ break;
}
- MII = MBB->erase(MI);
- --MII;
- } else if (Opc == Hexagon::TFR_condset_ii) {
- int DestReg = MI->getOperand(0).getReg();
- int SrcReg1 = MI->getOperand(1).getReg();
- int Immed1 = MI->getOperand(2).getImm();
- int Immed2 = MI->getOperand(3).getImm();
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cPt),
- DestReg).addReg(SrcReg1).addImm(Immed1);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cNotPt),
- DestReg).addReg(SrcReg1).addImm(Immed2);
- MII = MBB->erase(MI);
- --MII;
}
}
}
-
return true;
}
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 654d336..5d087db 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -13,6 +13,7 @@
#include "HexagonSubtarget.h"
#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -29,11 +30,17 @@ static cl::opt<bool>
EnableMemOps(
"enable-hexagon-memops",
cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed,
- cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
+ cl::desc("Generate V4 memop instructions."));
+
+static cl::opt<bool>
+EnableIEEERndNear(
+ "enable-hexagon-ieee-rnd-near",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Generate non-chopped conversion from fp to int."));
HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
HexagonGenSubtargetInfo(TT, CPU, FS),
- HexagonArchVersion(V1),
+ HexagonArchVersion(V2),
CPUString(CPU.str()) {
ParseSubtargetFeatures(CPU, FS);
@@ -45,18 +52,27 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
break;
case HexagonSubtarget::V4:
break;
+ case HexagonSubtarget::V5:
+ break;
default:
- llvm_unreachable("Unknown Architecture Version.");
+ // If the programmer has not specified a Hexagon version, default
+ // to -mv4.
+ CPUString = "hexagonv4";
+ HexagonArchVersion = HexagonSubtarget::V4;
+ break;
}
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUString);
- // Max issue per cycle == bundle width.
- InstrItins.IssueWidth = 4;
-
if (EnableMemOps)
UseMemOps = true;
else
UseMemOps = false;
+
+ if (EnableIEEERndNear)
+ ModeIEEERndNear = true;
+ else
+ ModeIEEERndNear = false;
}
+
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 3079086..5d9d6d8 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -22,16 +22,18 @@
#include "HexagonGenSubtargetInfo.inc"
#define Hexagon_SMALL_DATA_THRESHOLD 8
+#define Hexagon_SLOTS 4
namespace llvm {
class HexagonSubtarget : public HexagonGenSubtargetInfo {
bool UseMemOps;
+ bool ModeIEEERndNear;
public:
enum HexagonArchEnum {
- V1, V2, V3, V4
+ V1, V2, V3, V4, V5
};
HexagonArchEnum HexagonArchVersion;
@@ -55,7 +57,11 @@ public:
bool hasV3TOps () const { return HexagonArchVersion >= V3; }
bool hasV3TOpsOnly () const { return HexagonArchVersion == V3; }
bool hasV4TOps () const { return HexagonArchVersion >= V4; }
+ bool hasV4TOpsOnly () const { return HexagonArchVersion == V4; }
bool useMemOps () const { return HexagonArchVersion >= V4 && UseMemOps; }
+ bool hasV5TOps () const { return HexagonArchVersion >= V5; }
+ bool hasV5TOpsOnly () const { return HexagonArchVersion == V5; }
+ bool modeIEEERndNear () const { return ModeIEEERndNear; }
bool isSubtargetV2() const { return HexagonArchVersion == V2;}
const std::string &getCPUString () const { return CPUString; }
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 55bbba7..a7b291f 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -55,7 +55,9 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
CodeModel::Model CM,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- DataLayout("e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-a0:0") ,
+ DataLayout("e-p:32:32:32-"
+ "i64:64:64-i32:32:32-i16:16:16-i1:32:32-"
+ "f64:64:64-f32:32:32-a0:0-n32") ,
Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget),
@@ -100,43 +102,47 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
}
bool HexagonPassConfig::addInstSelector() {
- PM->add(createHexagonRemoveExtendOps(getHexagonTargetMachine()));
- PM->add(createHexagonISelDag(getHexagonTargetMachine()));
- PM->add(createHexagonPeephole());
+ addPass(createHexagonRemoveExtendOps(getHexagonTargetMachine()));
+ addPass(createHexagonISelDag(getHexagonTargetMachine()));
+ addPass(createHexagonPeephole());
return false;
}
bool HexagonPassConfig::addPreRegAlloc() {
if (!DisableHardwareLoops) {
- PM->add(createHexagonHardwareLoops());
+ addPass(createHexagonHardwareLoops());
}
-
return false;
}
bool HexagonPassConfig::addPostRegAlloc() {
- PM->add(createHexagonCFGOptimizer(getHexagonTargetMachine()));
+ addPass(createHexagonCFGOptimizer(getHexagonTargetMachine()));
return true;
}
bool HexagonPassConfig::addPreSched2() {
- addPass(IfConverterID);
+ addPass(&IfConverterID);
return true;
}
bool HexagonPassConfig::addPreEmitPass() {
if (!DisableHardwareLoops) {
- PM->add(createHexagonFixupHwLoops());
+ addPass(createHexagonFixupHwLoops());
}
+ addPass(createHexagonNewValueJump());
+
// Expand Spill code for predicate registers.
- PM->add(createHexagonExpandPredSpillCode(getHexagonTargetMachine()));
+ addPass(createHexagonExpandPredSpillCode(getHexagonTargetMachine()));
// Split up TFRcondsets into conditional transfers.
- PM->add(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+ addPass(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+
+ // Create Packets.
+ addPass(createHexagonPacketizer());
return false;
}
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
new file mode 100644
index 0000000..a03ed03
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -0,0 +1,3646 @@
+//===----- HexagonPacketizer.cpp - VLIW packetizer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a simple VLIW packetizer using a DFA. The packetizer works
+// on machine basic blocks. For each instruction I in a BB, the packetizer
+// consults the DFA to see if machine resources are available to execute I.
+// If so, the packetizer checks if I depends on any instruction J in the
+// current packet. If no dependency is found, I is added to the current packet
+// and the machine resources are marked as taken. If any dependency is found,
+// a target API call is made to prune the dependence.
+//
+//===----------------------------------------------------------------------===//
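+//
+// A rough sketch of the packetization loop described above. This is
+// pseudocode only: endPacket(), dependsOnPacket() and pruneDependence() are
+// illustrative names, and the real work is done by
+// VLIWPacketizerList::PacketizeMIs, driven from runOnMachineFunction below.
+//
+//   for (MachineInstr &I : Region) {
+//     if (!ResourceTracker->canReserveResources(&I))
+//       endPacket();                 // no functional unit left in this packet
+//     else if (dependsOnPacket(I) && !pruneDependence(I))
+//       endPacket();                 // dependence could not be pruned
+//     addToPacket(I);                // reserve resources and append I
+//   }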
+#define DEBUG_TYPE "packets"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+
+#include <map>
+
+using namespace llvm;
+
+namespace {
+ class HexagonPacketizer : public MachineFunctionPass {
+
+ public:
+ static char ID;
+ HexagonPacketizer() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const {
+ return "Hexagon Packetizer";
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn);
+ };
+ char HexagonPacketizer::ID = 0;
+
+ class HexagonPacketizerList : public VLIWPacketizerList {
+
+ private:
+
+ // Has the instruction been promoted to a dot-new instruction.
+ bool PromotedToDotNew;
+
+ // Has the instruction been glued to allocframe.
+ bool GlueAllocframeStore;
+
+ // Has the feeder instruction been glued to new value jump.
+ bool GlueToNewValueJump;
+
+ // Check if there is a dependence between some instruction already in this
+ // packet and this instruction.
+ bool Dependence;
+
+ // Only check for dependence if there are resources available to
+ // schedule this instruction.
+ bool FoundSequentialDependence;
+
+ public:
+ // Ctor.
+ HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+ MachineDominatorTree &MDT);
+
+ // initPacketizerState - initialize some internal flags.
+ void initPacketizerState();
+
+ // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+ bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB);
+
+    // isSoloInstruction - return true if instruction MI cannot be packetized
+    // with any other instruction, which means that MI itself is a packet.
+ bool isSoloInstruction(MachineInstr *MI);
+
+ // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+ // together.
+ bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ);
+
+    // isLegalToPruneDependencies - Is it legal to prune the dependence
+    // between SUI and SUJ.
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ);
+
+ MachineBasicBlock::iterator addToPacket(MachineInstr *MI);
+ private:
+ bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
+ bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
+ MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC);
+ bool CanPromoteToDotNew(MachineInstr* MI, SUnit* PacketSU,
+ unsigned DepReg,
+ std::map <MachineInstr*, SUnit*> MIToSUnit,
+ MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC);
+ bool CanPromoteToNewValue(MachineInstr* MI, SUnit* PacketSU,
+ unsigned DepReg,
+ std::map <MachineInstr*, SUnit*> MIToSUnit,
+ MachineBasicBlock::iterator &MII);
+ bool CanPromoteToNewValueStore(MachineInstr* MI, MachineInstr* PacketMI,
+ unsigned DepReg,
+ std::map <MachineInstr*, SUnit*> MIToSUnit);
+ bool DemoteToDotOld(MachineInstr* MI);
+ bool ArePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2,
+ std::map <MachineInstr*, SUnit*> MIToSUnit);
+ bool RestrictingDepExistInPacket(MachineInstr*,
+ unsigned, std::map <MachineInstr*, SUnit*>);
+ bool isNewifiable(MachineInstr* MI);
+ bool isCondInst(MachineInstr* MI);
+ bool IsNewifyStore (MachineInstr* MI);
+ bool tryAllocateResourcesForConstExt(MachineInstr* MI);
+ bool canReserveResourcesForConstExt(MachineInstr *MI);
+ void reserveResourcesForConstExt(MachineInstr* MI);
+ bool isNewValueInst(MachineInstr* MI);
+ bool isDotNewInst(MachineInstr* MI);
+ };
+}
+
+// HexagonPacketizerList Ctor.
+HexagonPacketizerList::HexagonPacketizerList(
+ MachineFunction &MF, MachineLoopInfo &MLI,MachineDominatorTree &MDT)
+ : VLIWPacketizerList(MF, MLI, MDT, true){
+}
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+ // Instantiate the packetizer.
+ HexagonPacketizerList Packetizer(Fn, MLI, MDT);
+
+ // DFA state table should not be empty.
+ assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+ //
+  // Loop over all basic blocks and remove KILL pseudo-instructions.
+  // These instructions confuse the dependence analysis. Consider:
+ // D0 = ... (Insn 0)
+ // R0 = KILL R0, D0 (Insn 1)
+ // R0 = ... (Insn 2)
+ // Here, Insn 1 will result in the dependence graph not emitting an output
+ // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization.
+ //
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ MachineBasicBlock::iterator End = MBB->end();
+ MachineBasicBlock::iterator MI = MBB->begin();
+ while (MI != End) {
+ if (MI->isKill()) {
+ MachineBasicBlock::iterator DeleteMI = MI;
+ ++MI;
+ MBB->erase(DeleteMI);
+ End = MBB->end();
+ continue;
+ }
+ ++MI;
+ }
+ }
+
+ // Loop over all of the basic blocks.
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ // Find scheduling regions and schedule / packetize each region.
+ unsigned RemainingCount = MBB->size();
+ for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+ RegionEnd != MBB->begin();) {
+ // The next region starts above the previous region. Look backward in the
+ // instruction stream until we find the nearest boundary.
+ MachineBasicBlock::iterator I = RegionEnd;
+ for(;I != MBB->begin(); --I, --RemainingCount) {
+ if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
+ break;
+ }
+ I = MBB->begin();
+
+ // Skip empty scheduling regions.
+ if (I == RegionEnd) {
+ RegionEnd = llvm::prior(RegionEnd);
+ --RemainingCount;
+ continue;
+ }
+ // Skip regions with one instruction.
+ if (I == llvm::prior(RegionEnd)) {
+ RegionEnd = llvm::prior(RegionEnd);
+ continue;
+ }
+
+ Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+ RegionEnd = I;
+ }
+ }
+
+ return true;
+}
+
+
+static bool IsIndirectCall(MachineInstr* MI) {
+ return ((MI->getOpcode() == Hexagon::CALLR) ||
+ (MI->getOpcode() == Hexagon::CALLRv3));
+}
+
+// Reserve resources for a constant extender. Trigger an assertion if the
+// reservation fails.
+void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ MachineInstr *PseudoMI = MI->getParent()->getParent()->CreateMachineInstr(
+ QII->get(Hexagon::IMMEXT), MI->getDebugLoc());
+
+ if (ResourceTracker->canReserveResources(PseudoMI)) {
+ ResourceTracker->reserveResources(PseudoMI);
+ MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+ } else {
+ MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+ llvm_unreachable("can not reserve resources for constant extender.");
+ }
+ return;
+}
+
+bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ assert(QII->isExtended(MI) &&
+ "Should only be called for constant extended instructions");
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT),
+ MI->getDebugLoc());
+ bool CanReserve = ResourceTracker->canReserveResources(PseudoMI);
+ MF->DeleteMachineInstr(PseudoMI);
+ return CanReserve;
+}
+
+// Allocate resources (i.e., 4 bytes) for a constant extender. Return true on
+// success; otherwise return false.
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) {
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ MachineInstr *PseudoMI = MI->getParent()->getParent()->CreateMachineInstr(
+ QII->get(Hexagon::IMMEXT), MI->getDebugLoc());
+
+ if (ResourceTracker->canReserveResources(PseudoMI)) {
+ ResourceTracker->reserveResources(PseudoMI);
+ MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+ return true;
+ } else {
+ MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+ return false;
+ }
+}
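+
+// Note: the three routines above all model the constant extender the same
+// way: create a throwaway IMMEXT pseudo-instruction, probe (and possibly
+// reserve) a DFA slot for it, then delete it again. Only the resource
+// reservation, if any, survives the call.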
+
+
+bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI,
+ SDep::Kind DepType,
+ unsigned DepReg) {
+
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ const HexagonRegisterInfo* QRI =
+ (const HexagonRegisterInfo *) TM.getRegisterInfo();
+
+ // Check for lr dependence
+ if (DepReg == QRI->getRARegister()) {
+ return true;
+ }
+
+ if (QII->isDeallocRet(MI)) {
+ if (DepReg == QRI->getFrameRegister() ||
+ DepReg == QRI->getStackRegister())
+ return true;
+ }
+
+ // Check if this is a predicate dependence
+ const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg);
+ if (RC == &Hexagon::PredRegsRegClass) {
+ return true;
+ }
+
+ //
+  // Lastly, check for an operand used in an indirect call.
+  // If we had an attribute for checking whether an instruction is an
+  // indirect call, we could have avoided this relatively brittle
+  // implementation of IsIndirectCall().
+  //
+  // This assumes that the first operand of the CALLr is the function address.
+ //
+ if (IsIndirectCall(MI) && (DepType == SDep::Data)) {
+ MachineOperand MO = MI->getOperand(0);
+ if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool IsRegDependence(const SDep::Kind DepType) {
+ return (DepType == SDep::Data || DepType == SDep::Anti ||
+ DepType == SDep::Output);
+}
+
+static bool IsDirectJump(MachineInstr* MI) {
+ return (MI->getOpcode() == Hexagon::JMP);
+}
+
+static bool IsSchedBarrier(MachineInstr* MI) {
+ switch (MI->getOpcode()) {
+ case Hexagon::BARRIER:
+ return true;
+ }
+ return false;
+}
+
+static bool IsControlFlow(MachineInstr* MI) {
+ return (MI->getDesc().isTerminator() || MI->getDesc().isCall());
+}
+
+bool HexagonPacketizerList::isNewValueInst(MachineInstr* MI) {
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ if (QII->isNewValueJump(MI))
+ return true;
+
+ if (QII->isNewValueStore(MI))
+ return true;
+
+ return false;
+}
+
+// Returns true if an instruction can be promoted to a new-value store; it
+// always returns false for V2 and V3. The switch below lists all the
+// conditional and unconditional stores that can be promoted to new-value
+// stores.
+
+bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) {
+ const HexagonRegisterInfo* QRI =
+ (const HexagonRegisterInfo *) TM.getRegisterInfo();
+ switch (MI->getOpcode())
+ {
+ // store byte
+ case Hexagon::STrib:
+ case Hexagon::STrib_indexed:
+ case Hexagon::STrib_indexed_shl_V4:
+ case Hexagon::STrib_shl_V4:
+ case Hexagon::STrib_GP_V4:
+ case Hexagon::STb_GP_V4:
+ case Hexagon::POST_STbri:
+ case Hexagon::STrib_cPt:
+ case Hexagon::STrib_cdnPt_V4:
+ case Hexagon::STrib_cNotPt:
+ case Hexagon::STrib_cdnNotPt_V4:
+ case Hexagon::STrib_indexed_cPt:
+ case Hexagon::STrib_indexed_cdnPt_V4:
+ case Hexagon::STrib_indexed_cNotPt:
+ case Hexagon::STrib_indexed_cdnNotPt_V4:
+ case Hexagon::STrib_indexed_shl_cPt_V4:
+ case Hexagon::STrib_indexed_shl_cdnPt_V4:
+ case Hexagon::STrib_indexed_shl_cNotPt_V4:
+ case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
+ case Hexagon::POST_STbri_cPt:
+ case Hexagon::POST_STbri_cdnPt_V4:
+ case Hexagon::POST_STbri_cNotPt:
+ case Hexagon::POST_STbri_cdnNotPt_V4:
+ case Hexagon::STb_GP_cPt_V4:
+ case Hexagon::STb_GP_cNotPt_V4:
+ case Hexagon::STb_GP_cdnPt_V4:
+ case Hexagon::STb_GP_cdnNotPt_V4:
+ case Hexagon::STrib_GP_cPt_V4:
+ case Hexagon::STrib_GP_cNotPt_V4:
+ case Hexagon::STrib_GP_cdnPt_V4:
+ case Hexagon::STrib_GP_cdnNotPt_V4:
+
+ // store halfword
+ case Hexagon::STrih:
+ case Hexagon::STrih_indexed:
+ case Hexagon::STrih_indexed_shl_V4:
+ case Hexagon::STrih_shl_V4:
+ case Hexagon::STrih_GP_V4:
+ case Hexagon::STh_GP_V4:
+ case Hexagon::POST_SThri:
+ case Hexagon::STrih_cPt:
+ case Hexagon::STrih_cdnPt_V4:
+ case Hexagon::STrih_cNotPt:
+ case Hexagon::STrih_cdnNotPt_V4:
+ case Hexagon::STrih_indexed_cPt:
+ case Hexagon::STrih_indexed_cdnPt_V4:
+ case Hexagon::STrih_indexed_cNotPt:
+ case Hexagon::STrih_indexed_cdnNotPt_V4:
+ case Hexagon::STrih_indexed_shl_cPt_V4:
+ case Hexagon::STrih_indexed_shl_cdnPt_V4:
+ case Hexagon::STrih_indexed_shl_cNotPt_V4:
+ case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
+ case Hexagon::POST_SThri_cPt:
+ case Hexagon::POST_SThri_cdnPt_V4:
+ case Hexagon::POST_SThri_cNotPt:
+ case Hexagon::POST_SThri_cdnNotPt_V4:
+ case Hexagon::STh_GP_cPt_V4:
+ case Hexagon::STh_GP_cNotPt_V4:
+ case Hexagon::STh_GP_cdnPt_V4:
+ case Hexagon::STh_GP_cdnNotPt_V4:
+ case Hexagon::STrih_GP_cPt_V4:
+ case Hexagon::STrih_GP_cNotPt_V4:
+ case Hexagon::STrih_GP_cdnPt_V4:
+ case Hexagon::STrih_GP_cdnNotPt_V4:
+
+ // store word
+ case Hexagon::STriw:
+ case Hexagon::STriw_indexed:
+ case Hexagon::STriw_indexed_shl_V4:
+ case Hexagon::STriw_shl_V4:
+ case Hexagon::STriw_GP_V4:
+ case Hexagon::STw_GP_V4:
+ case Hexagon::POST_STwri:
+ case Hexagon::STriw_cPt:
+ case Hexagon::STriw_cdnPt_V4:
+ case Hexagon::STriw_cNotPt:
+ case Hexagon::STriw_cdnNotPt_V4:
+ case Hexagon::STriw_indexed_cPt:
+ case Hexagon::STriw_indexed_cdnPt_V4:
+ case Hexagon::STriw_indexed_cNotPt:
+ case Hexagon::STriw_indexed_cdnNotPt_V4:
+ case Hexagon::STriw_indexed_shl_cPt_V4:
+ case Hexagon::STriw_indexed_shl_cdnPt_V4:
+ case Hexagon::STriw_indexed_shl_cNotPt_V4:
+ case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
+ case Hexagon::POST_STwri_cPt:
+ case Hexagon::POST_STwri_cdnPt_V4:
+ case Hexagon::POST_STwri_cNotPt:
+ case Hexagon::POST_STwri_cdnNotPt_V4:
+ case Hexagon::STw_GP_cPt_V4:
+ case Hexagon::STw_GP_cNotPt_V4:
+ case Hexagon::STw_GP_cdnPt_V4:
+ case Hexagon::STw_GP_cdnNotPt_V4:
+ case Hexagon::STriw_GP_cPt_V4:
+ case Hexagon::STriw_GP_cNotPt_V4:
+ case Hexagon::STriw_GP_cdnPt_V4:
+ case Hexagon::STriw_GP_cdnNotPt_V4:
+ return QRI->Subtarget.hasV4TOps();
+ }
+ return false;
+}
+
+static bool IsLoopN(MachineInstr *MI) {
+ return (MI->getOpcode() == Hexagon::LOOP0_i ||
+ MI->getOpcode() == Hexagon::LOOP0_r);
+}
+
+/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a
+/// callee-saved register.
+static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
+ const TargetRegisterInfo *TRI) {
+ for (const uint16_t *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) {
+ unsigned CalleeSavedReg = *CSR;
+ if (MI->modifiesRegister(CalleeSavedReg, TRI))
+ return true;
+ }
+ return false;
+}
+
+// Return the new-value store opcode for a given store opcode.
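+//
+// For example (illustrative assembly, not emitted by this function itself),
+// promoting STriw to STriw_nv_V4 turns
+//     memw(r0 + #0) = r1
+// into the new-value form, which is legal only when the same packet also
+// defines r1:
+//     memw(r0 + #0) = r1.new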
+static int GetDotNewOp(const int opc) {
+ switch (opc) {
+ default: llvm_unreachable("Unknown .new type");
+ // store new value byte
+ case Hexagon::STrib:
+ return Hexagon::STrib_nv_V4;
+
+ case Hexagon::STrib_indexed:
+ return Hexagon::STrib_indexed_nv_V4;
+
+ case Hexagon::STrib_indexed_shl_V4:
+ return Hexagon::STrib_indexed_shl_nv_V4;
+
+ case Hexagon::STrib_shl_V4:
+ return Hexagon::STrib_shl_nv_V4;
+
+ case Hexagon::STrib_GP_V4:
+ return Hexagon::STrib_GP_nv_V4;
+
+ case Hexagon::STb_GP_V4:
+ return Hexagon::STb_GP_nv_V4;
+
+ case Hexagon::POST_STbri:
+ return Hexagon::POST_STbri_nv_V4;
+
+ case Hexagon::STrib_cPt:
+ return Hexagon::STrib_cPt_nv_V4;
+
+ case Hexagon::STrib_cdnPt_V4:
+ return Hexagon::STrib_cdnPt_nv_V4;
+
+ case Hexagon::STrib_cNotPt:
+ return Hexagon::STrib_cNotPt_nv_V4;
+
+ case Hexagon::STrib_cdnNotPt_V4:
+ return Hexagon::STrib_cdnNotPt_nv_V4;
+
+ case Hexagon::STrib_indexed_cPt:
+ return Hexagon::STrib_indexed_cPt_nv_V4;
+
+ case Hexagon::STrib_indexed_cdnPt_V4:
+ return Hexagon::STrib_indexed_cdnPt_nv_V4;
+
+ case Hexagon::STrib_indexed_cNotPt:
+ return Hexagon::STrib_indexed_cNotPt_nv_V4;
+
+ case Hexagon::STrib_indexed_cdnNotPt_V4:
+ return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
+
+ case Hexagon::STrib_indexed_shl_cPt_V4:
+ return Hexagon::STrib_indexed_shl_cPt_nv_V4;
+
+ case Hexagon::STrib_indexed_shl_cdnPt_V4:
+ return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
+
+ case Hexagon::STrib_indexed_shl_cNotPt_V4:
+ return Hexagon::STrib_indexed_shl_cNotPt_nv_V4;
+
+ case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
+ return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
+
+ case Hexagon::POST_STbri_cPt:
+ return Hexagon::POST_STbri_cPt_nv_V4;
+
+ case Hexagon::POST_STbri_cdnPt_V4:
+ return Hexagon::POST_STbri_cdnPt_nv_V4;
+
+ case Hexagon::POST_STbri_cNotPt:
+ return Hexagon::POST_STbri_cNotPt_nv_V4;
+
+ case Hexagon::POST_STbri_cdnNotPt_V4:
+ return Hexagon::POST_STbri_cdnNotPt_nv_V4;
+
+ case Hexagon::STb_GP_cPt_V4:
+ return Hexagon::STb_GP_cPt_nv_V4;
+
+ case Hexagon::STb_GP_cNotPt_V4:
+ return Hexagon::STb_GP_cNotPt_nv_V4;
+
+ case Hexagon::STb_GP_cdnPt_V4:
+ return Hexagon::STb_GP_cdnPt_nv_V4;
+
+ case Hexagon::STb_GP_cdnNotPt_V4:
+ return Hexagon::STb_GP_cdnNotPt_nv_V4;
+
+ case Hexagon::STrib_GP_cPt_V4:
+ return Hexagon::STrib_GP_cPt_nv_V4;
+
+ case Hexagon::STrib_GP_cNotPt_V4:
+ return Hexagon::STrib_GP_cNotPt_nv_V4;
+
+ case Hexagon::STrib_GP_cdnPt_V4:
+ return Hexagon::STrib_GP_cdnPt_nv_V4;
+
+ case Hexagon::STrib_GP_cdnNotPt_V4:
+ return Hexagon::STrib_GP_cdnNotPt_nv_V4;
+
+ // store new value halfword
+ case Hexagon::STrih:
+ return Hexagon::STrih_nv_V4;
+
+ case Hexagon::STrih_indexed:
+ return Hexagon::STrih_indexed_nv_V4;
+
+ case Hexagon::STrih_indexed_shl_V4:
+ return Hexagon::STrih_indexed_shl_nv_V4;
+
+ case Hexagon::STrih_shl_V4:
+ return Hexagon::STrih_shl_nv_V4;
+
+ case Hexagon::STrih_GP_V4:
+ return Hexagon::STrih_GP_nv_V4;
+
+ case Hexagon::STh_GP_V4:
+ return Hexagon::STh_GP_nv_V4;
+
+ case Hexagon::POST_SThri:
+ return Hexagon::POST_SThri_nv_V4;
+
+ case Hexagon::STrih_cPt:
+ return Hexagon::STrih_cPt_nv_V4;
+
+ case Hexagon::STrih_cdnPt_V4:
+ return Hexagon::STrih_cdnPt_nv_V4;
+
+ case Hexagon::STrih_cNotPt:
+ return Hexagon::STrih_cNotPt_nv_V4;
+
+ case Hexagon::STrih_cdnNotPt_V4:
+ return Hexagon::STrih_cdnNotPt_nv_V4;
+
+ case Hexagon::STrih_indexed_cPt:
+ return Hexagon::STrih_indexed_cPt_nv_V4;
+
+ case Hexagon::STrih_indexed_cdnPt_V4:
+ return Hexagon::STrih_indexed_cdnPt_nv_V4;
+
+ case Hexagon::STrih_indexed_cNotPt:
+ return Hexagon::STrih_indexed_cNotPt_nv_V4;
+
+ case Hexagon::STrih_indexed_cdnNotPt_V4:
+ return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
+
+ case Hexagon::STrih_indexed_shl_cPt_V4:
+ return Hexagon::STrih_indexed_shl_cPt_nv_V4;
+
+ case Hexagon::STrih_indexed_shl_cdnPt_V4:
+ return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
+
+ case Hexagon::STrih_indexed_shl_cNotPt_V4:
+ return Hexagon::STrih_indexed_shl_cNotPt_nv_V4;
+
+ case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
+ return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
+
+ case Hexagon::POST_SThri_cPt:
+ return Hexagon::POST_SThri_cPt_nv_V4;
+
+ case Hexagon::POST_SThri_cdnPt_V4:
+ return Hexagon::POST_SThri_cdnPt_nv_V4;
+
+ case Hexagon::POST_SThri_cNotPt:
+ return Hexagon::POST_SThri_cNotPt_nv_V4;
+
+ case Hexagon::POST_SThri_cdnNotPt_V4:
+ return Hexagon::POST_SThri_cdnNotPt_nv_V4;
+
+ case Hexagon::STh_GP_cPt_V4:
+ return Hexagon::STh_GP_cPt_nv_V4;
+
+ case Hexagon::STh_GP_cNotPt_V4:
+ return Hexagon::STh_GP_cNotPt_nv_V4;
+
+ case Hexagon::STh_GP_cdnPt_V4:
+ return Hexagon::STh_GP_cdnPt_nv_V4;
+
+ case Hexagon::STh_GP_cdnNotPt_V4:
+ return Hexagon::STh_GP_cdnNotPt_nv_V4;
+
+ case Hexagon::STrih_GP_cPt_V4:
+ return Hexagon::STrih_GP_cPt_nv_V4;
+
+ case Hexagon::STrih_GP_cNotPt_V4:
+ return Hexagon::STrih_GP_cNotPt_nv_V4;
+
+ case Hexagon::STrih_GP_cdnPt_V4:
+ return Hexagon::STrih_GP_cdnPt_nv_V4;
+
+ case Hexagon::STrih_GP_cdnNotPt_V4:
+ return Hexagon::STrih_GP_cdnNotPt_nv_V4;
+
+ // store new value word
+ case Hexagon::STriw:
+ return Hexagon::STriw_nv_V4;
+
+ case Hexagon::STriw_indexed:
+ return Hexagon::STriw_indexed_nv_V4;
+
+ case Hexagon::STriw_indexed_shl_V4:
+ return Hexagon::STriw_indexed_shl_nv_V4;
+
+ case Hexagon::STriw_shl_V4:
+ return Hexagon::STriw_shl_nv_V4;
+
+ case Hexagon::STriw_GP_V4:
+ return Hexagon::STriw_GP_nv_V4;
+
+ case Hexagon::STw_GP_V4:
+ return Hexagon::STw_GP_nv_V4;
+
+ case Hexagon::POST_STwri:
+ return Hexagon::POST_STwri_nv_V4;
+
+ case Hexagon::STriw_cPt:
+ return Hexagon::STriw_cPt_nv_V4;
+
+ case Hexagon::STriw_cdnPt_V4:
+ return Hexagon::STriw_cdnPt_nv_V4;
+
+ case Hexagon::STriw_cNotPt:
+ return Hexagon::STriw_cNotPt_nv_V4;
+
+ case Hexagon::STriw_cdnNotPt_V4:
+ return Hexagon::STriw_cdnNotPt_nv_V4;
+
+ case Hexagon::STriw_indexed_cPt:
+ return Hexagon::STriw_indexed_cPt_nv_V4;
+
+ case Hexagon::STriw_indexed_cdnPt_V4:
+ return Hexagon::STriw_indexed_cdnPt_nv_V4;
+
+ case Hexagon::STriw_indexed_cNotPt:
+ return Hexagon::STriw_indexed_cNotPt_nv_V4;
+
+ case Hexagon::STriw_indexed_cdnNotPt_V4:
+ return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
+
+ case Hexagon::STriw_indexed_shl_cPt_V4:
+ return Hexagon::STriw_indexed_shl_cPt_nv_V4;
+
+ case Hexagon::STriw_indexed_shl_cdnPt_V4:
+ return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
+
+ case Hexagon::STriw_indexed_shl_cNotPt_V4:
+ return Hexagon::STriw_indexed_shl_cNotPt_nv_V4;
+
+ case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
+ return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
+
+ case Hexagon::POST_STwri_cPt:
+ return Hexagon::POST_STwri_cPt_nv_V4;
+
+ case Hexagon::POST_STwri_cdnPt_V4:
+ return Hexagon::POST_STwri_cdnPt_nv_V4;
+
+ case Hexagon::POST_STwri_cNotPt:
+ return Hexagon::POST_STwri_cNotPt_nv_V4;
+
+ case Hexagon::POST_STwri_cdnNotPt_V4:
+ return Hexagon::POST_STwri_cdnNotPt_nv_V4;
+
+ case Hexagon::STw_GP_cPt_V4:
+ return Hexagon::STw_GP_cPt_nv_V4;
+
+ case Hexagon::STw_GP_cNotPt_V4:
+ return Hexagon::STw_GP_cNotPt_nv_V4;
+
+ case Hexagon::STw_GP_cdnPt_V4:
+ return Hexagon::STw_GP_cdnPt_nv_V4;
+
+ case Hexagon::STw_GP_cdnNotPt_V4:
+ return Hexagon::STw_GP_cdnNotPt_nv_V4;
+
+ case Hexagon::STriw_GP_cPt_V4:
+ return Hexagon::STriw_GP_cPt_nv_V4;
+
+ case Hexagon::STriw_GP_cNotPt_V4:
+ return Hexagon::STriw_GP_cNotPt_nv_V4;
+
+ case Hexagon::STriw_GP_cdnPt_V4:
+ return Hexagon::STriw_GP_cdnPt_nv_V4;
+
+ case Hexagon::STriw_GP_cdnNotPt_V4:
+ return Hexagon::STriw_GP_cdnNotPt_nv_V4;
+ }
+}
+
+// Return the .new predicated version of an instruction.
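+//
+// For example (illustrative assembly), a predicated transfer
+//     if (p0) r1 = r2
+// becomes, when the same packet also produces p0,
+//     if (p0.new) r1 = r2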
+static int GetDotNewPredOp(const int opc) {
+ switch (opc) {
+ default: llvm_unreachable("Unknown .new type");
+ // Conditional stores
+ // Store byte conditionally
+ case Hexagon::STrib_cPt :
+ return Hexagon::STrib_cdnPt_V4;
+
+ case Hexagon::STrib_cNotPt :
+ return Hexagon::STrib_cdnNotPt_V4;
+
+ case Hexagon::STrib_indexed_cPt :
+ return Hexagon::STrib_indexed_cdnPt_V4;
+
+ case Hexagon::STrib_indexed_cNotPt :
+ return Hexagon::STrib_indexed_cdnNotPt_V4;
+
+ case Hexagon::STrib_imm_cPt_V4 :
+ return Hexagon::STrib_imm_cdnPt_V4;
+
+ case Hexagon::STrib_imm_cNotPt_V4 :
+ return Hexagon::STrib_imm_cdnNotPt_V4;
+
+ case Hexagon::POST_STbri_cPt :
+ return Hexagon::POST_STbri_cdnPt_V4;
+
+ case Hexagon::POST_STbri_cNotPt :
+ return Hexagon::POST_STbri_cdnNotPt_V4;
+
+ case Hexagon::STrib_indexed_shl_cPt_V4 :
+ return Hexagon::STrib_indexed_shl_cdnPt_V4;
+
+ case Hexagon::STrib_indexed_shl_cNotPt_V4 :
+ return Hexagon::STrib_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::STb_GP_cPt_V4 :
+ return Hexagon::STb_GP_cdnPt_V4;
+
+ case Hexagon::STb_GP_cNotPt_V4 :
+ return Hexagon::STb_GP_cdnNotPt_V4;
+
+ case Hexagon::STrib_GP_cPt_V4 :
+ return Hexagon::STrib_GP_cdnPt_V4;
+
+ case Hexagon::STrib_GP_cNotPt_V4 :
+ return Hexagon::STrib_GP_cdnNotPt_V4;
+
+ // Store doubleword conditionally
+ case Hexagon::STrid_cPt :
+ return Hexagon::STrid_cdnPt_V4;
+
+ case Hexagon::STrid_cNotPt :
+ return Hexagon::STrid_cdnNotPt_V4;
+
+ case Hexagon::STrid_indexed_cPt :
+ return Hexagon::STrid_indexed_cdnPt_V4;
+
+ case Hexagon::STrid_indexed_cNotPt :
+ return Hexagon::STrid_indexed_cdnNotPt_V4;
+
+ case Hexagon::STrid_indexed_shl_cPt_V4 :
+ return Hexagon::STrid_indexed_shl_cdnPt_V4;
+
+ case Hexagon::STrid_indexed_shl_cNotPt_V4 :
+ return Hexagon::STrid_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::POST_STdri_cPt :
+ return Hexagon::POST_STdri_cdnPt_V4;
+
+ case Hexagon::POST_STdri_cNotPt :
+ return Hexagon::POST_STdri_cdnNotPt_V4;
+
+ case Hexagon::STd_GP_cPt_V4 :
+ return Hexagon::STd_GP_cdnPt_V4;
+
+ case Hexagon::STd_GP_cNotPt_V4 :
+ return Hexagon::STd_GP_cdnNotPt_V4;
+
+ case Hexagon::STrid_GP_cPt_V4 :
+ return Hexagon::STrid_GP_cdnPt_V4;
+
+ case Hexagon::STrid_GP_cNotPt_V4 :
+ return Hexagon::STrid_GP_cdnNotPt_V4;
+
+ // Store halfword conditionally
+ case Hexagon::STrih_cPt :
+ return Hexagon::STrih_cdnPt_V4;
+
+ case Hexagon::STrih_cNotPt :
+ return Hexagon::STrih_cdnNotPt_V4;
+
+ case Hexagon::STrih_indexed_cPt :
+ return Hexagon::STrih_indexed_cdnPt_V4;
+
+ case Hexagon::STrih_indexed_cNotPt :
+ return Hexagon::STrih_indexed_cdnNotPt_V4;
+
+ case Hexagon::STrih_imm_cPt_V4 :
+ return Hexagon::STrih_imm_cdnPt_V4;
+
+ case Hexagon::STrih_imm_cNotPt_V4 :
+ return Hexagon::STrih_imm_cdnNotPt_V4;
+
+ case Hexagon::STrih_indexed_shl_cPt_V4 :
+ return Hexagon::STrih_indexed_shl_cdnPt_V4;
+
+ case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+ return Hexagon::STrih_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::POST_SThri_cPt :
+ return Hexagon::POST_SThri_cdnPt_V4;
+
+ case Hexagon::POST_SThri_cNotPt :
+ return Hexagon::POST_SThri_cdnNotPt_V4;
+
+ case Hexagon::STh_GP_cPt_V4 :
+ return Hexagon::STh_GP_cdnPt_V4;
+
+ case Hexagon::STh_GP_cNotPt_V4 :
+ return Hexagon::STh_GP_cdnNotPt_V4;
+
+ case Hexagon::STrih_GP_cPt_V4 :
+ return Hexagon::STrih_GP_cdnPt_V4;
+
+ case Hexagon::STrih_GP_cNotPt_V4 :
+ return Hexagon::STrih_GP_cdnNotPt_V4;
+
+ // Store word conditionally
+ case Hexagon::STriw_cPt :
+ return Hexagon::STriw_cdnPt_V4;
+
+ case Hexagon::STriw_cNotPt :
+ return Hexagon::STriw_cdnNotPt_V4;
+
+ case Hexagon::STriw_indexed_cPt :
+ return Hexagon::STriw_indexed_cdnPt_V4;
+
+ case Hexagon::STriw_indexed_cNotPt :
+ return Hexagon::STriw_indexed_cdnNotPt_V4;
+
+ case Hexagon::STriw_imm_cPt_V4 :
+ return Hexagon::STriw_imm_cdnPt_V4;
+
+ case Hexagon::STriw_imm_cNotPt_V4 :
+ return Hexagon::STriw_imm_cdnNotPt_V4;
+
+ case Hexagon::STriw_indexed_shl_cPt_V4 :
+ return Hexagon::STriw_indexed_shl_cdnPt_V4;
+
+ case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+ return Hexagon::STriw_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::POST_STwri_cPt :
+ return Hexagon::POST_STwri_cdnPt_V4;
+
+ case Hexagon::POST_STwri_cNotPt :
+ return Hexagon::POST_STwri_cdnNotPt_V4;
+
+ case Hexagon::STw_GP_cPt_V4 :
+ return Hexagon::STw_GP_cdnPt_V4;
+
+ case Hexagon::STw_GP_cNotPt_V4 :
+ return Hexagon::STw_GP_cdnNotPt_V4;
+
+ case Hexagon::STriw_GP_cPt_V4 :
+ return Hexagon::STriw_GP_cdnPt_V4;
+
+ case Hexagon::STriw_GP_cNotPt_V4 :
+ return Hexagon::STriw_GP_cdnNotPt_V4;
+
+  // Conditional Jumps
+ case Hexagon::JMP_c:
+ return Hexagon::JMP_cdnPt;
+
+ case Hexagon::JMP_cNot:
+ return Hexagon::JMP_cdnNotPt;
+
+ case Hexagon::JMPR_cPt:
+ return Hexagon::JMPR_cdnPt_V3;
+
+ case Hexagon::JMPR_cNotPt:
+ return Hexagon::JMPR_cdnNotPt_V3;
+
+ // Conditional Transfers
+ case Hexagon::TFR_cPt:
+ return Hexagon::TFR_cdnPt;
+
+ case Hexagon::TFR_cNotPt:
+ return Hexagon::TFR_cdnNotPt;
+
+ case Hexagon::TFRI_cPt:
+ return Hexagon::TFRI_cdnPt;
+
+ case Hexagon::TFRI_cNotPt:
+ return Hexagon::TFRI_cdnNotPt;
+
+ // Load double word
+ case Hexagon::LDrid_cPt :
+ return Hexagon::LDrid_cdnPt;
+
+ case Hexagon::LDrid_cNotPt :
+ return Hexagon::LDrid_cdnNotPt;
+
+ case Hexagon::LDrid_indexed_cPt :
+ return Hexagon::LDrid_indexed_cdnPt;
+
+ case Hexagon::LDrid_indexed_cNotPt :
+ return Hexagon::LDrid_indexed_cdnNotPt;
+
+ case Hexagon::POST_LDrid_cPt :
+ return Hexagon::POST_LDrid_cdnPt_V4;
+
+ case Hexagon::POST_LDrid_cNotPt :
+ return Hexagon::POST_LDrid_cdnNotPt_V4;
+
+ // Load word
+ case Hexagon::LDriw_cPt :
+ return Hexagon::LDriw_cdnPt;
+
+ case Hexagon::LDriw_cNotPt :
+ return Hexagon::LDriw_cdnNotPt;
+
+ case Hexagon::LDriw_indexed_cPt :
+ return Hexagon::LDriw_indexed_cdnPt;
+
+ case Hexagon::LDriw_indexed_cNotPt :
+ return Hexagon::LDriw_indexed_cdnNotPt;
+
+ case Hexagon::POST_LDriw_cPt :
+ return Hexagon::POST_LDriw_cdnPt_V4;
+
+ case Hexagon::POST_LDriw_cNotPt :
+ return Hexagon::POST_LDriw_cdnNotPt_V4;
+
+ // Load halfword
+ case Hexagon::LDrih_cPt :
+ return Hexagon::LDrih_cdnPt;
+
+ case Hexagon::LDrih_cNotPt :
+ return Hexagon::LDrih_cdnNotPt;
+
+ case Hexagon::LDrih_indexed_cPt :
+ return Hexagon::LDrih_indexed_cdnPt;
+
+ case Hexagon::LDrih_indexed_cNotPt :
+ return Hexagon::LDrih_indexed_cdnNotPt;
+
+ case Hexagon::POST_LDrih_cPt :
+ return Hexagon::POST_LDrih_cdnPt_V4;
+
+ case Hexagon::POST_LDrih_cNotPt :
+ return Hexagon::POST_LDrih_cdnNotPt_V4;
+
+ // Load byte
+ case Hexagon::LDrib_cPt :
+ return Hexagon::LDrib_cdnPt;
+
+ case Hexagon::LDrib_cNotPt :
+ return Hexagon::LDrib_cdnNotPt;
+
+ case Hexagon::LDrib_indexed_cPt :
+ return Hexagon::LDrib_indexed_cdnPt;
+
+ case Hexagon::LDrib_indexed_cNotPt :
+ return Hexagon::LDrib_indexed_cdnNotPt;
+
+ case Hexagon::POST_LDrib_cPt :
+ return Hexagon::POST_LDrib_cdnPt_V4;
+
+ case Hexagon::POST_LDrib_cNotPt :
+ return Hexagon::POST_LDrib_cdnNotPt_V4;
+
+ // Load unsigned halfword
+ case Hexagon::LDriuh_cPt :
+ return Hexagon::LDriuh_cdnPt;
+
+ case Hexagon::LDriuh_cNotPt :
+ return Hexagon::LDriuh_cdnNotPt;
+
+ case Hexagon::LDriuh_indexed_cPt :
+ return Hexagon::LDriuh_indexed_cdnPt;
+
+ case Hexagon::LDriuh_indexed_cNotPt :
+ return Hexagon::LDriuh_indexed_cdnNotPt;
+
+ case Hexagon::POST_LDriuh_cPt :
+ return Hexagon::POST_LDriuh_cdnPt_V4;
+
+ case Hexagon::POST_LDriuh_cNotPt :
+ return Hexagon::POST_LDriuh_cdnNotPt_V4;
+
+ // Load unsigned byte
+ case Hexagon::LDriub_cPt :
+ return Hexagon::LDriub_cdnPt;
+
+ case Hexagon::LDriub_cNotPt :
+ return Hexagon::LDriub_cdnNotPt;
+
+ case Hexagon::LDriub_indexed_cPt :
+ return Hexagon::LDriub_indexed_cdnPt;
+
+ case Hexagon::LDriub_indexed_cNotPt :
+ return Hexagon::LDriub_indexed_cdnNotPt;
+
+ case Hexagon::POST_LDriub_cPt :
+ return Hexagon::POST_LDriub_cdnPt_V4;
+
+ case Hexagon::POST_LDriub_cNotPt :
+ return Hexagon::POST_LDriub_cdnNotPt_V4;
+
+ // V4 indexed+scaled load
+
+ case Hexagon::LDrid_indexed_cPt_V4 :
+ return Hexagon::LDrid_indexed_cdnPt_V4;
+
+ case Hexagon::LDrid_indexed_cNotPt_V4 :
+ return Hexagon::LDrid_indexed_cdnNotPt_V4;
+
+ case Hexagon::LDrid_indexed_shl_cPt_V4 :
+ return Hexagon::LDrid_indexed_shl_cdnPt_V4;
+
+ case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
+ return Hexagon::LDrid_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::LDrib_indexed_cPt_V4 :
+ return Hexagon::LDrib_indexed_cdnPt_V4;
+
+ case Hexagon::LDrib_indexed_cNotPt_V4 :
+ return Hexagon::LDrib_indexed_cdnNotPt_V4;
+
+ case Hexagon::LDrib_indexed_shl_cPt_V4 :
+ return Hexagon::LDrib_indexed_shl_cdnPt_V4;
+
+ case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
+ return Hexagon::LDrib_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::LDriub_indexed_cPt_V4 :
+ return Hexagon::LDriub_indexed_cdnPt_V4;
+
+ case Hexagon::LDriub_indexed_cNotPt_V4 :
+ return Hexagon::LDriub_indexed_cdnNotPt_V4;
+
+ case Hexagon::LDriub_indexed_shl_cPt_V4 :
+ return Hexagon::LDriub_indexed_shl_cdnPt_V4;
+
+ case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
+ return Hexagon::LDriub_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::LDrih_indexed_cPt_V4 :
+ return Hexagon::LDrih_indexed_cdnPt_V4;
+
+ case Hexagon::LDrih_indexed_cNotPt_V4 :
+ return Hexagon::LDrih_indexed_cdnNotPt_V4;
+
+ case Hexagon::LDrih_indexed_shl_cPt_V4 :
+ return Hexagon::LDrih_indexed_shl_cdnPt_V4;
+
+ case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
+ return Hexagon::LDrih_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::LDriuh_indexed_cPt_V4 :
+ return Hexagon::LDriuh_indexed_cdnPt_V4;
+
+ case Hexagon::LDriuh_indexed_cNotPt_V4 :
+ return Hexagon::LDriuh_indexed_cdnNotPt_V4;
+
+ case Hexagon::LDriuh_indexed_shl_cPt_V4 :
+ return Hexagon::LDriuh_indexed_shl_cdnPt_V4;
+
+ case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
+ return Hexagon::LDriuh_indexed_shl_cdnNotPt_V4;
+
+ case Hexagon::LDriw_indexed_cPt_V4 :
+ return Hexagon::LDriw_indexed_cdnPt_V4;
+
+ case Hexagon::LDriw_indexed_cNotPt_V4 :
+ return Hexagon::LDriw_indexed_cdnNotPt_V4;
+
+ case Hexagon::LDriw_indexed_shl_cPt_V4 :
+ return Hexagon::LDriw_indexed_shl_cdnPt_V4;
+
+ case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
+ return Hexagon::LDriw_indexed_shl_cdnNotPt_V4;
+
+ // V4 global address load
+
+ case Hexagon::LDd_GP_cPt_V4:
+ return Hexagon::LDd_GP_cdnPt_V4;
+
+ case Hexagon::LDd_GP_cNotPt_V4:
+ return Hexagon::LDd_GP_cdnNotPt_V4;
+
+ case Hexagon::LDb_GP_cPt_V4:
+ return Hexagon::LDb_GP_cdnPt_V4;
+
+ case Hexagon::LDb_GP_cNotPt_V4:
+ return Hexagon::LDb_GP_cdnNotPt_V4;
+
+ case Hexagon::LDub_GP_cPt_V4:
+ return Hexagon::LDub_GP_cdnPt_V4;
+
+ case Hexagon::LDub_GP_cNotPt_V4:
+ return Hexagon::LDub_GP_cdnNotPt_V4;
+
+ case Hexagon::LDh_GP_cPt_V4:
+ return Hexagon::LDh_GP_cdnPt_V4;
+
+ case Hexagon::LDh_GP_cNotPt_V4:
+ return Hexagon::LDh_GP_cdnNotPt_V4;
+
+ case Hexagon::LDuh_GP_cPt_V4:
+ return Hexagon::LDuh_GP_cdnPt_V4;
+
+ case Hexagon::LDuh_GP_cNotPt_V4:
+ return Hexagon::LDuh_GP_cdnNotPt_V4;
+
+ case Hexagon::LDw_GP_cPt_V4:
+ return Hexagon::LDw_GP_cdnPt_V4;
+
+ case Hexagon::LDw_GP_cNotPt_V4:
+ return Hexagon::LDw_GP_cdnNotPt_V4;
+
+ case Hexagon::LDrid_GP_cPt_V4:
+ return Hexagon::LDrid_GP_cdnPt_V4;
+
+ case Hexagon::LDrid_GP_cNotPt_V4:
+ return Hexagon::LDrid_GP_cdnNotPt_V4;
+
+ case Hexagon::LDrib_GP_cPt_V4:
+ return Hexagon::LDrib_GP_cdnPt_V4;
+
+ case Hexagon::LDrib_GP_cNotPt_V4:
+ return Hexagon::LDrib_GP_cdnNotPt_V4;
+
+ case Hexagon::LDriub_GP_cPt_V4:
+ return Hexagon::LDriub_GP_cdnPt_V4;
+
+ case Hexagon::LDriub_GP_cNotPt_V4:
+ return Hexagon::LDriub_GP_cdnNotPt_V4;
+
+ case Hexagon::LDrih_GP_cPt_V4:
+ return Hexagon::LDrih_GP_cdnPt_V4;
+
+ case Hexagon::LDrih_GP_cNotPt_V4:
+ return Hexagon::LDrih_GP_cdnNotPt_V4;
+
+ case Hexagon::LDriuh_GP_cPt_V4:
+ return Hexagon::LDriuh_GP_cdnPt_V4;
+
+ case Hexagon::LDriuh_GP_cNotPt_V4:
+ return Hexagon::LDriuh_GP_cdnNotPt_V4;
+
+ case Hexagon::LDriw_GP_cPt_V4:
+ return Hexagon::LDriw_GP_cdnPt_V4;
+
+ case Hexagon::LDriw_GP_cNotPt_V4:
+ return Hexagon::LDriw_GP_cdnNotPt_V4;
+
+ // Conditional store new-value byte
+ case Hexagon::STrib_cPt_nv_V4 :
+ return Hexagon::STrib_cdnPt_nv_V4;
+ case Hexagon::STrib_cNotPt_nv_V4 :
+ return Hexagon::STrib_cdnNotPt_nv_V4;
+
+ case Hexagon::STrib_indexed_cPt_nv_V4 :
+ return Hexagon::STrib_indexed_cdnPt_nv_V4;
+ case Hexagon::STrib_indexed_cNotPt_nv_V4 :
+ return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
+
+ case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
+ return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
+ case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
+ return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
+
+ case Hexagon::POST_STbri_cPt_nv_V4 :
+ return Hexagon::POST_STbri_cdnPt_nv_V4;
+ case Hexagon::POST_STbri_cNotPt_nv_V4 :
+ return Hexagon::POST_STbri_cdnNotPt_nv_V4;
+
+ case Hexagon::STb_GP_cPt_nv_V4 :
+ return Hexagon::STb_GP_cdnPt_nv_V4;
+
+ case Hexagon::STb_GP_cNotPt_nv_V4 :
+ return Hexagon::STb_GP_cdnNotPt_nv_V4;
+
+ case Hexagon::STrib_GP_cPt_nv_V4 :
+ return Hexagon::STrib_GP_cdnPt_nv_V4;
+
+ case Hexagon::STrib_GP_cNotPt_nv_V4 :
+ return Hexagon::STrib_GP_cdnNotPt_nv_V4;
+
+ // Conditional store new-value halfword
+ case Hexagon::STrih_cPt_nv_V4 :
+ return Hexagon::STrih_cdnPt_nv_V4;
+ case Hexagon::STrih_cNotPt_nv_V4 :
+ return Hexagon::STrih_cdnNotPt_nv_V4;
+
+ case Hexagon::STrih_indexed_cPt_nv_V4 :
+ return Hexagon::STrih_indexed_cdnPt_nv_V4;
+ case Hexagon::STrih_indexed_cNotPt_nv_V4 :
+ return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
+
+ case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
+ return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
+ case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
+ return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
+
+ case Hexagon::POST_SThri_cPt_nv_V4 :
+ return Hexagon::POST_SThri_cdnPt_nv_V4;
+ case Hexagon::POST_SThri_cNotPt_nv_V4 :
+ return Hexagon::POST_SThri_cdnNotPt_nv_V4;
+
+ case Hexagon::STh_GP_cPt_nv_V4 :
+ return Hexagon::STh_GP_cdnPt_nv_V4;
+
+ case Hexagon::STh_GP_cNotPt_nv_V4 :
+ return Hexagon::STh_GP_cdnNotPt_nv_V4;
+
+ case Hexagon::STrih_GP_cPt_nv_V4 :
+ return Hexagon::STrih_GP_cdnPt_nv_V4;
+
+ case Hexagon::STrih_GP_cNotPt_nv_V4 :
+ return Hexagon::STrih_GP_cdnNotPt_nv_V4;
+
+ // Conditional store new-value word
+ case Hexagon::STriw_cPt_nv_V4 :
+ return Hexagon::STriw_cdnPt_nv_V4;
+ case Hexagon::STriw_cNotPt_nv_V4 :
+ return Hexagon::STriw_cdnNotPt_nv_V4;
+
+ case Hexagon::STriw_indexed_cPt_nv_V4 :
+ return Hexagon::STriw_indexed_cdnPt_nv_V4;
+ case Hexagon::STriw_indexed_cNotPt_nv_V4 :
+ return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
+
+ case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
+ return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
+ case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
+ return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
+
+ case Hexagon::POST_STwri_cPt_nv_V4 :
+ return Hexagon::POST_STwri_cdnPt_nv_V4;
+ case Hexagon::POST_STwri_cNotPt_nv_V4:
+ return Hexagon::POST_STwri_cdnNotPt_nv_V4;
+
+ case Hexagon::STw_GP_cPt_nv_V4 :
+ return Hexagon::STw_GP_cdnPt_nv_V4;
+
+ case Hexagon::STw_GP_cNotPt_nv_V4 :
+ return Hexagon::STw_GP_cdnNotPt_nv_V4;
+
+ case Hexagon::STriw_GP_cPt_nv_V4 :
+ return Hexagon::STriw_GP_cdnPt_nv_V4;
+
+ case Hexagon::STriw_GP_cNotPt_nv_V4 :
+ return Hexagon::STriw_GP_cdnNotPt_nv_V4;
+
+ // Conditional add
+ case Hexagon::ADD_ri_cPt :
+ return Hexagon::ADD_ri_cdnPt;
+ case Hexagon::ADD_ri_cNotPt :
+ return Hexagon::ADD_ri_cdnNotPt;
+
+ case Hexagon::ADD_rr_cPt :
+ return Hexagon::ADD_rr_cdnPt;
+ case Hexagon::ADD_rr_cNotPt :
+ return Hexagon::ADD_rr_cdnNotPt;
+
+ // Conditional logical operations
+ case Hexagon::XOR_rr_cPt :
+ return Hexagon::XOR_rr_cdnPt;
+ case Hexagon::XOR_rr_cNotPt :
+ return Hexagon::XOR_rr_cdnNotPt;
+
+ case Hexagon::AND_rr_cPt :
+ return Hexagon::AND_rr_cdnPt;
+ case Hexagon::AND_rr_cNotPt :
+ return Hexagon::AND_rr_cdnNotPt;
+
+ case Hexagon::OR_rr_cPt :
+ return Hexagon::OR_rr_cdnPt;
+ case Hexagon::OR_rr_cNotPt :
+ return Hexagon::OR_rr_cdnNotPt;
+
+ // Conditional Subtract
+ case Hexagon::SUB_rr_cPt :
+ return Hexagon::SUB_rr_cdnPt;
+ case Hexagon::SUB_rr_cNotPt :
+ return Hexagon::SUB_rr_cdnNotPt;
+
+ // Conditional combine
+ case Hexagon::COMBINE_rr_cPt :
+ return Hexagon::COMBINE_rr_cdnPt;
+ case Hexagon::COMBINE_rr_cNotPt :
+ return Hexagon::COMBINE_rr_cdnNotPt;
+
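+ // Conditional shift operations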
+ case Hexagon::ASLH_cPt_V4 :
+ return Hexagon::ASLH_cdnPt_V4;
+ case Hexagon::ASLH_cNotPt_V4 :
+ return Hexagon::ASLH_cdnNotPt_V4;
+
+ case Hexagon::ASRH_cPt_V4 :
+ return Hexagon::ASRH_cdnPt_V4;
+ case Hexagon::ASRH_cNotPt_V4 :
+ return Hexagon::ASRH_cdnNotPt_V4;
+
+ case Hexagon::SXTB_cPt_V4 :
+ return Hexagon::SXTB_cdnPt_V4;
+ case Hexagon::SXTB_cNotPt_V4 :
+ return Hexagon::SXTB_cdnNotPt_V4;
+
+ case Hexagon::SXTH_cPt_V4 :
+ return Hexagon::SXTH_cdnPt_V4;
+ case Hexagon::SXTH_cNotPt_V4 :
+ return Hexagon::SXTH_cdnNotPt_V4;
+
+ case Hexagon::ZXTB_cPt_V4 :
+ return Hexagon::ZXTB_cdnPt_V4;
+ case Hexagon::ZXTB_cNotPt_V4 :
+ return Hexagon::ZXTB_cdnNotPt_V4;
+
+ case Hexagon::ZXTH_cPt_V4 :
+ return Hexagon::ZXTH_cdnPt_V4;
+ case Hexagon::ZXTH_cNotPt_V4 :
+ return Hexagon::ZXTH_cdnNotPt_V4;
+ }
+}
+
+// Returns true if an instruction can be promoted to a .new predicate
+// or a new-value store.
+bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) {
+ return isCondInst(MI) || IsNewifyStore(MI);
+}
+
+bool HexagonPacketizerList::isCondInst (MachineInstr* MI) {
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ const MCInstrDesc& TID = MI->getDesc();
+ // The IsRegisterJump(MI) check is disabled until bug 5670 is fixed.
+ if (TID.isConditionalBranch() // && !IsRegisterJump(MI)) ||
+ || QII->isConditionalTransfer(MI)
+ || QII->isConditionalALU32(MI)
+ || QII->isConditionalLoad(MI)
+ || QII->isConditionalStore(MI)) {
+ return true;
+ }
+ return false;
+}
+
+
+// Promote an instruction to its .new form.
+// At this time, we have already made a call to CanPromoteToDotNew
+// and made sure that it can *indeed* be promoted.
+bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI,
+ SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC) {
+
+ assert (DepType == SDep::Data);
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+ int NewOpcode;
+ if (RC == &Hexagon::PredRegsRegClass)
+ NewOpcode = GetDotNewPredOp(MI->getOpcode());
+ else
+ NewOpcode = GetDotNewOp(MI->getOpcode());
+ MI->setDesc(QII->get(NewOpcode));
+
+ return true;
+}
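+
+// For illustration only: promotion on the predicate register class rewrites,
+// e.g. (informal assembly, in the same style as the example below),
+//   if (p0) R2 = add(R3, R4)   --->   if (p0.new) R2 = add(R3, R4)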
+
+// Returns the most basic instruction for the .new predicated instructions and
+// new-value stores.
+// For example, all of the following instructions will be converted back to the
+// same instruction:
+// 1) if (p0.new) memw(R0+#0) = R1.new --->
+// 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1
+// 3) if (p0.new) memw(R0+#0) = R1 --->
+//
+// To understand the translation of instruction 1 to its original form, consider
+// a packet with 3 instructions.
+// { p0 = cmp.eq(R0,R1)
+// if (p0.new) R2 = add(R3, R4)
+// R5 = add (R3, R1)
+// }
+// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet
+//
+// This instruction can be part of the previous packet only if both p0 and R2
+// are promoted to .new values. This promotion happens in steps: first the
+// predicate register is promoted to .new, and in the next iteration R2 is
+// promoted. Therefore, if the dependence check fails (due to R5) during the
+// next iteration, the instruction should be converted back to its most basic
+// form.
+
+static int GetDotOldOp(const int opc) {
+ switch (opc) {
+ default: llvm_unreachable("Unknown .old type");
+ case Hexagon::TFR_cdnPt:
+ return Hexagon::TFR_cPt;
+
+ case Hexagon::TFR_cdnNotPt:
+ return Hexagon::TFR_cNotPt;
+
+ case Hexagon::TFRI_cdnPt:
+ return Hexagon::TFRI_cPt;
+
+ case Hexagon::TFRI_cdnNotPt:
+ return Hexagon::TFRI_cNotPt;
+
+ case Hexagon::JMP_cdnPt:
+ return Hexagon::JMP_c;
+
+ case Hexagon::JMP_cdnNotPt:
+ return Hexagon::JMP_cNot;
+
+ case Hexagon::JMPR_cdnPt_V3:
+ return Hexagon::JMPR_cPt;
+
+ case Hexagon::JMPR_cdnNotPt_V3:
+ return Hexagon::JMPR_cNotPt;
+
+ // Load double word
+
+ case Hexagon::LDrid_cdnPt :
+ return Hexagon::LDrid_cPt;
+
+ case Hexagon::LDrid_cdnNotPt :
+ return Hexagon::LDrid_cNotPt;
+
+ case Hexagon::LDrid_indexed_cdnPt :
+ return Hexagon::LDrid_indexed_cPt;
+
+ case Hexagon::LDrid_indexed_cdnNotPt :
+ return Hexagon::LDrid_indexed_cNotPt;
+
+ case Hexagon::POST_LDrid_cdnPt_V4 :
+ return Hexagon::POST_LDrid_cPt;
+
+ case Hexagon::POST_LDrid_cdnNotPt_V4 :
+ return Hexagon::POST_LDrid_cNotPt;
+
+ // Load word
+
+ case Hexagon::LDriw_cdnPt :
+ return Hexagon::LDriw_cPt;
+
+ case Hexagon::LDriw_cdnNotPt :
+ return Hexagon::LDriw_cNotPt;
+
+ case Hexagon::LDriw_indexed_cdnPt :
+ return Hexagon::LDriw_indexed_cPt;
+
+ case Hexagon::LDriw_indexed_cdnNotPt :
+ return Hexagon::LDriw_indexed_cNotPt;
+
+ case Hexagon::POST_LDriw_cdnPt_V4 :
+ return Hexagon::POST_LDriw_cPt;
+
+ case Hexagon::POST_LDriw_cdnNotPt_V4 :
+ return Hexagon::POST_LDriw_cNotPt;
+
+ // Load half
+
+ case Hexagon::LDrih_cdnPt :
+ return Hexagon::LDrih_cPt;
+
+ case Hexagon::LDrih_cdnNotPt :
+ return Hexagon::LDrih_cNotPt;
+
+ case Hexagon::LDrih_indexed_cdnPt :
+ return Hexagon::LDrih_indexed_cPt;
+
+ case Hexagon::LDrih_indexed_cdnNotPt :
+ return Hexagon::LDrih_indexed_cNotPt;
+
+ case Hexagon::POST_LDrih_cdnPt_V4 :
+ return Hexagon::POST_LDrih_cPt;
+
+ case Hexagon::POST_LDrih_cdnNotPt_V4 :
+ return Hexagon::POST_LDrih_cNotPt;
+
+ // Load byte
+
+ case Hexagon::LDrib_cdnPt :
+ return Hexagon::LDrib_cPt;
+
+ case Hexagon::LDrib_cdnNotPt :
+ return Hexagon::LDrib_cNotPt;
+
+ case Hexagon::LDrib_indexed_cdnPt :
+ return Hexagon::LDrib_indexed_cPt;
+
+ case Hexagon::LDrib_indexed_cdnNotPt :
+ return Hexagon::LDrib_indexed_cNotPt;
+
+ case Hexagon::POST_LDrib_cdnPt_V4 :
+ return Hexagon::POST_LDrib_cPt;
+
+ case Hexagon::POST_LDrib_cdnNotPt_V4 :
+ return Hexagon::POST_LDrib_cNotPt;
+
+ // Load unsigned half
+
+ case Hexagon::LDriuh_cdnPt :
+ return Hexagon::LDriuh_cPt;
+
+ case Hexagon::LDriuh_cdnNotPt :
+ return Hexagon::LDriuh_cNotPt;
+
+ case Hexagon::LDriuh_indexed_cdnPt :
+ return Hexagon::LDriuh_indexed_cPt;
+
+ case Hexagon::LDriuh_indexed_cdnNotPt :
+ return Hexagon::LDriuh_indexed_cNotPt;
+
+ case Hexagon::POST_LDriuh_cdnPt_V4 :
+ return Hexagon::POST_LDriuh_cPt;
+
+ case Hexagon::POST_LDriuh_cdnNotPt_V4 :
+ return Hexagon::POST_LDriuh_cNotPt;
+
+ // Load unsigned byte
+ case Hexagon::LDriub_cdnPt :
+ return Hexagon::LDriub_cPt;
+
+ case Hexagon::LDriub_cdnNotPt :
+ return Hexagon::LDriub_cNotPt;
+
+ case Hexagon::LDriub_indexed_cdnPt :
+ return Hexagon::LDriub_indexed_cPt;
+
+ case Hexagon::LDriub_indexed_cdnNotPt :
+ return Hexagon::LDriub_indexed_cNotPt;
+
+ case Hexagon::POST_LDriub_cdnPt_V4 :
+ return Hexagon::POST_LDriub_cPt;
+
+ case Hexagon::POST_LDriub_cdnNotPt_V4 :
+ return Hexagon::POST_LDriub_cNotPt;
+
+ // V4 indexed+scaled Load
+
+ case Hexagon::LDrid_indexed_cdnPt_V4 :
+ return Hexagon::LDrid_indexed_cPt_V4;
+
+ case Hexagon::LDrid_indexed_cdnNotPt_V4 :
+ return Hexagon::LDrid_indexed_cNotPt_V4;
+
+ case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
+ return Hexagon::LDrid_indexed_shl_cPt_V4;
+
+ case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::LDrid_indexed_shl_cNotPt_V4;
+
+ case Hexagon::LDrib_indexed_cdnPt_V4 :
+ return Hexagon::LDrib_indexed_cPt_V4;
+
+ case Hexagon::LDrib_indexed_cdnNotPt_V4 :
+ return Hexagon::LDrib_indexed_cNotPt_V4;
+
+ case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
+ return Hexagon::LDrib_indexed_shl_cPt_V4;
+
+ case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::LDrib_indexed_shl_cNotPt_V4;
+
+ case Hexagon::LDriub_indexed_cdnPt_V4 :
+ return Hexagon::LDriub_indexed_cPt_V4;
+
+ case Hexagon::LDriub_indexed_cdnNotPt_V4 :
+ return Hexagon::LDriub_indexed_cNotPt_V4;
+
+ case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
+ return Hexagon::LDriub_indexed_shl_cPt_V4;
+
+ case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::LDriub_indexed_shl_cNotPt_V4;
+
+ case Hexagon::LDrih_indexed_cdnPt_V4 :
+ return Hexagon::LDrih_indexed_cPt_V4;
+
+ case Hexagon::LDrih_indexed_cdnNotPt_V4 :
+ return Hexagon::LDrih_indexed_cNotPt_V4;
+
+ case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
+ return Hexagon::LDrih_indexed_shl_cPt_V4;
+
+ case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::LDrih_indexed_shl_cNotPt_V4;
+
+ case Hexagon::LDriuh_indexed_cdnPt_V4 :
+ return Hexagon::LDriuh_indexed_cPt_V4;
+
+ case Hexagon::LDriuh_indexed_cdnNotPt_V4 :
+ return Hexagon::LDriuh_indexed_cNotPt_V4;
+
+ case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
+ return Hexagon::LDriuh_indexed_shl_cPt_V4;
+
+ case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::LDriuh_indexed_shl_cNotPt_V4;
+
+ case Hexagon::LDriw_indexed_cdnPt_V4 :
+ return Hexagon::LDriw_indexed_cPt_V4;
+
+ case Hexagon::LDriw_indexed_cdnNotPt_V4 :
+ return Hexagon::LDriw_indexed_cNotPt_V4;
+
+ case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
+ return Hexagon::LDriw_indexed_shl_cPt_V4;
+
+ case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::LDriw_indexed_shl_cNotPt_V4;
+
+ // V4 global address load
+
+ case Hexagon::LDd_GP_cdnPt_V4:
+ return Hexagon::LDd_GP_cPt_V4;
+
+ case Hexagon::LDd_GP_cdnNotPt_V4:
+ return Hexagon::LDd_GP_cNotPt_V4;
+
+ case Hexagon::LDb_GP_cdnPt_V4:
+ return Hexagon::LDb_GP_cPt_V4;
+
+ case Hexagon::LDb_GP_cdnNotPt_V4:
+ return Hexagon::LDb_GP_cNotPt_V4;
+
+ case Hexagon::LDub_GP_cdnPt_V4:
+ return Hexagon::LDub_GP_cPt_V4;
+
+ case Hexagon::LDub_GP_cdnNotPt_V4:
+ return Hexagon::LDub_GP_cNotPt_V4;
+
+ case Hexagon::LDh_GP_cdnPt_V4:
+ return Hexagon::LDh_GP_cPt_V4;
+
+ case Hexagon::LDh_GP_cdnNotPt_V4:
+ return Hexagon::LDh_GP_cNotPt_V4;
+
+ case Hexagon::LDuh_GP_cdnPt_V4:
+ return Hexagon::LDuh_GP_cPt_V4;
+
+ case Hexagon::LDuh_GP_cdnNotPt_V4:
+ return Hexagon::LDuh_GP_cNotPt_V4;
+
+ case Hexagon::LDw_GP_cdnPt_V4:
+ return Hexagon::LDw_GP_cPt_V4;
+
+ case Hexagon::LDw_GP_cdnNotPt_V4:
+ return Hexagon::LDw_GP_cNotPt_V4;
+
+ case Hexagon::LDrid_GP_cdnPt_V4:
+ return Hexagon::LDrid_GP_cPt_V4;
+
+ case Hexagon::LDrid_GP_cdnNotPt_V4:
+ return Hexagon::LDrid_GP_cNotPt_V4;
+
+ case Hexagon::LDrib_GP_cdnPt_V4:
+ return Hexagon::LDrib_GP_cPt_V4;
+
+ case Hexagon::LDrib_GP_cdnNotPt_V4:
+ return Hexagon::LDrib_GP_cNotPt_V4;
+
+ case Hexagon::LDriub_GP_cdnPt_V4:
+ return Hexagon::LDriub_GP_cPt_V4;
+
+ case Hexagon::LDriub_GP_cdnNotPt_V4:
+ return Hexagon::LDriub_GP_cNotPt_V4;
+
+ case Hexagon::LDrih_GP_cdnPt_V4:
+ return Hexagon::LDrih_GP_cPt_V4;
+
+ case Hexagon::LDrih_GP_cdnNotPt_V4:
+ return Hexagon::LDrih_GP_cNotPt_V4;
+
+ case Hexagon::LDriuh_GP_cdnPt_V4:
+ return Hexagon::LDriuh_GP_cPt_V4;
+
+ case Hexagon::LDriuh_GP_cdnNotPt_V4:
+ return Hexagon::LDriuh_GP_cNotPt_V4;
+
+ case Hexagon::LDriw_GP_cdnPt_V4:
+ return Hexagon::LDriw_GP_cPt_V4;
+
+ case Hexagon::LDriw_GP_cdnNotPt_V4:
+ return Hexagon::LDriw_GP_cNotPt_V4;
+
+ // Conditional add
+
+ case Hexagon::ADD_ri_cdnPt :
+ return Hexagon::ADD_ri_cPt;
+ case Hexagon::ADD_ri_cdnNotPt :
+ return Hexagon::ADD_ri_cNotPt;
+
+ case Hexagon::ADD_rr_cdnPt :
+ return Hexagon::ADD_rr_cPt;
+ case Hexagon::ADD_rr_cdnNotPt:
+ return Hexagon::ADD_rr_cNotPt;
+
+ // Conditional logical operations
+
+ case Hexagon::XOR_rr_cdnPt :
+ return Hexagon::XOR_rr_cPt;
+ case Hexagon::XOR_rr_cdnNotPt :
+ return Hexagon::XOR_rr_cNotPt;
+
+ case Hexagon::AND_rr_cdnPt :
+ return Hexagon::AND_rr_cPt;
+ case Hexagon::AND_rr_cdnNotPt :
+ return Hexagon::AND_rr_cNotPt;
+
+ case Hexagon::OR_rr_cdnPt :
+ return Hexagon::OR_rr_cPt;
+ case Hexagon::OR_rr_cdnNotPt :
+ return Hexagon::OR_rr_cNotPt;
+
+ // Conditional Subtract
+
+ case Hexagon::SUB_rr_cdnPt :
+ return Hexagon::SUB_rr_cPt;
+ case Hexagon::SUB_rr_cdnNotPt :
+ return Hexagon::SUB_rr_cNotPt;
+
+ // Conditional combine
+
+ case Hexagon::COMBINE_rr_cdnPt :
+ return Hexagon::COMBINE_rr_cPt;
+ case Hexagon::COMBINE_rr_cdnNotPt :
+ return Hexagon::COMBINE_rr_cNotPt;
+
+// Conditional shift operations
+
+ case Hexagon::ASLH_cdnPt_V4 :
+ return Hexagon::ASLH_cPt_V4;
+ case Hexagon::ASLH_cdnNotPt_V4 :
+ return Hexagon::ASLH_cNotPt_V4;
+
+ case Hexagon::ASRH_cdnPt_V4 :
+ return Hexagon::ASRH_cPt_V4;
+ case Hexagon::ASRH_cdnNotPt_V4 :
+ return Hexagon::ASRH_cNotPt_V4;
+
+ case Hexagon::SXTB_cdnPt_V4 :
+ return Hexagon::SXTB_cPt_V4;
+ case Hexagon::SXTB_cdnNotPt_V4 :
+ return Hexagon::SXTB_cNotPt_V4;
+
+ case Hexagon::SXTH_cdnPt_V4 :
+ return Hexagon::SXTH_cPt_V4;
+ case Hexagon::SXTH_cdnNotPt_V4 :
+ return Hexagon::SXTH_cNotPt_V4;
+
+ case Hexagon::ZXTB_cdnPt_V4 :
+ return Hexagon::ZXTB_cPt_V4;
+ case Hexagon::ZXTB_cdnNotPt_V4 :
+ return Hexagon::ZXTB_cNotPt_V4;
+
+ case Hexagon::ZXTH_cdnPt_V4 :
+ return Hexagon::ZXTH_cPt_V4;
+ case Hexagon::ZXTH_cdnNotPt_V4 :
+ return Hexagon::ZXTH_cNotPt_V4;
+
+ // Store byte
+
+ case Hexagon::STrib_imm_cdnPt_V4 :
+ return Hexagon::STrib_imm_cPt_V4;
+
+ case Hexagon::STrib_imm_cdnNotPt_V4 :
+ return Hexagon::STrib_imm_cNotPt_V4;
+
+ case Hexagon::STrib_cdnPt_nv_V4 :
+ case Hexagon::STrib_cPt_nv_V4 :
+ case Hexagon::STrib_cdnPt_V4 :
+ return Hexagon::STrib_cPt;
+
+ case Hexagon::STrib_cdnNotPt_nv_V4 :
+ case Hexagon::STrib_cNotPt_nv_V4 :
+ case Hexagon::STrib_cdnNotPt_V4 :
+ return Hexagon::STrib_cNotPt;
+
+ case Hexagon::STrib_indexed_cdnPt_V4 :
+ case Hexagon::STrib_indexed_cPt_nv_V4 :
+ case Hexagon::STrib_indexed_cdnPt_nv_V4 :
+ return Hexagon::STrib_indexed_cPt;
+
+ case Hexagon::STrib_indexed_cdnNotPt_V4 :
+ case Hexagon::STrib_indexed_cNotPt_nv_V4 :
+ case Hexagon::STrib_indexed_cdnNotPt_nv_V4 :
+ return Hexagon::STrib_indexed_cNotPt;
+
+ case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
+ case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
+ case Hexagon::STrib_indexed_shl_cdnPt_V4 :
+ return Hexagon::STrib_indexed_shl_cPt_V4;
+
+ case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
+ case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
+ case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::STrib_indexed_shl_cNotPt_V4;
+
+ case Hexagon::POST_STbri_cdnPt_nv_V4 :
+ case Hexagon::POST_STbri_cPt_nv_V4 :
+ case Hexagon::POST_STbri_cdnPt_V4 :
+ return Hexagon::POST_STbri_cPt;
+
+ case Hexagon::POST_STbri_cdnNotPt_nv_V4 :
+ case Hexagon::POST_STbri_cNotPt_nv_V4:
+ case Hexagon::POST_STbri_cdnNotPt_V4 :
+ return Hexagon::POST_STbri_cNotPt;
+
+ case Hexagon::STb_GP_cdnPt_nv_V4:
+ case Hexagon::STb_GP_cdnPt_V4:
+ case Hexagon::STb_GP_cPt_nv_V4:
+ return Hexagon::STb_GP_cPt_V4;
+
+ case Hexagon::STb_GP_cdnNotPt_nv_V4:
+ case Hexagon::STb_GP_cdnNotPt_V4:
+ case Hexagon::STb_GP_cNotPt_nv_V4:
+ return Hexagon::STb_GP_cNotPt_V4;
+
+ case Hexagon::STrib_GP_cdnPt_nv_V4:
+ case Hexagon::STrib_GP_cdnPt_V4:
+ case Hexagon::STrib_GP_cPt_nv_V4:
+ return Hexagon::STrib_GP_cPt_V4;
+
+ case Hexagon::STrib_GP_cdnNotPt_nv_V4:
+ case Hexagon::STrib_GP_cdnNotPt_V4:
+ case Hexagon::STrib_GP_cNotPt_nv_V4:
+ return Hexagon::STrib_GP_cNotPt_V4;
+
+ // Store new-value byte - unconditional
+ case Hexagon::STrib_nv_V4:
+ return Hexagon::STrib;
+
+ case Hexagon::STrib_indexed_nv_V4:
+ return Hexagon::STrib_indexed;
+
+ case Hexagon::STrib_indexed_shl_nv_V4:
+ return Hexagon::STrib_indexed_shl_V4;
+
+ case Hexagon::STrib_shl_nv_V4:
+ return Hexagon::STrib_shl_V4;
+
+ case Hexagon::STrib_GP_nv_V4:
+ return Hexagon::STrib_GP_V4;
+
+ case Hexagon::STb_GP_nv_V4:
+ return Hexagon::STb_GP_V4;
+
+ case Hexagon::POST_STbri_nv_V4:
+ return Hexagon::POST_STbri;
+
+ // Store halfword
+ case Hexagon::STrih_imm_cdnPt_V4 :
+ return Hexagon::STrih_imm_cPt_V4;
+
+ case Hexagon::STrih_imm_cdnNotPt_V4 :
+ return Hexagon::STrih_imm_cNotPt_V4;
+
+ case Hexagon::STrih_cdnPt_nv_V4 :
+ case Hexagon::STrih_cPt_nv_V4 :
+ case Hexagon::STrih_cdnPt_V4 :
+ return Hexagon::STrih_cPt;
+
+ case Hexagon::STrih_cdnNotPt_nv_V4 :
+ case Hexagon::STrih_cNotPt_nv_V4 :
+ case Hexagon::STrih_cdnNotPt_V4 :
+ return Hexagon::STrih_cNotPt;
+
+ case Hexagon::STrih_indexed_cdnPt_nv_V4:
+ case Hexagon::STrih_indexed_cPt_nv_V4 :
+ case Hexagon::STrih_indexed_cdnPt_V4 :
+ return Hexagon::STrih_indexed_cPt;
+
+ case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
+ case Hexagon::STrih_indexed_cNotPt_nv_V4 :
+ case Hexagon::STrih_indexed_cdnNotPt_V4 :
+ return Hexagon::STrih_indexed_cNotPt;
+
+ case Hexagon::STrih_indexed_shl_cdnPt_nv_V4 :
+ case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
+ case Hexagon::STrih_indexed_shl_cdnPt_V4 :
+ return Hexagon::STrih_indexed_shl_cPt_V4;
+
+ case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4 :
+ case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
+ case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::STrih_indexed_shl_cNotPt_V4;
+
+ case Hexagon::POST_SThri_cdnPt_nv_V4 :
+ case Hexagon::POST_SThri_cPt_nv_V4 :
+ case Hexagon::POST_SThri_cdnPt_V4 :
+ return Hexagon::POST_SThri_cPt;
+
+ case Hexagon::POST_SThri_cdnNotPt_nv_V4 :
+ case Hexagon::POST_SThri_cNotPt_nv_V4 :
+ case Hexagon::POST_SThri_cdnNotPt_V4 :
+ return Hexagon::POST_SThri_cNotPt;
+
+ case Hexagon::STh_GP_cdnPt_nv_V4:
+ case Hexagon::STh_GP_cdnPt_V4:
+ case Hexagon::STh_GP_cPt_nv_V4:
+ return Hexagon::STh_GP_cPt_V4;
+
+ case Hexagon::STh_GP_cdnNotPt_nv_V4:
+ case Hexagon::STh_GP_cdnNotPt_V4:
+ case Hexagon::STh_GP_cNotPt_nv_V4:
+ return Hexagon::STh_GP_cNotPt_V4;
+
+ case Hexagon::STrih_GP_cdnPt_nv_V4:
+ case Hexagon::STrih_GP_cdnPt_V4:
+ case Hexagon::STrih_GP_cPt_nv_V4:
+ return Hexagon::STrih_GP_cPt_V4;
+
+ case Hexagon::STrih_GP_cdnNotPt_nv_V4:
+ case Hexagon::STrih_GP_cdnNotPt_V4:
+ case Hexagon::STrih_GP_cNotPt_nv_V4:
+ return Hexagon::STrih_GP_cNotPt_V4;
+
+ // Store new-value halfword - unconditional
+
+ case Hexagon::STrih_nv_V4:
+ return Hexagon::STrih;
+
+ case Hexagon::STrih_indexed_nv_V4:
+ return Hexagon::STrih_indexed;
+
+ case Hexagon::STrih_indexed_shl_nv_V4:
+ return Hexagon::STrih_indexed_shl_V4;
+
+ case Hexagon::STrih_shl_nv_V4:
+ return Hexagon::STrih_shl_V4;
+
+ case Hexagon::STrih_GP_nv_V4:
+ return Hexagon::STrih_GP_V4;
+
+ case Hexagon::STh_GP_nv_V4:
+ return Hexagon::STh_GP_V4;
+
+ case Hexagon::POST_SThri_nv_V4:
+ return Hexagon::POST_SThri;
+
+ // Store word
+
+ case Hexagon::STriw_imm_cdnPt_V4 :
+ return Hexagon::STriw_imm_cPt_V4;
+
+ case Hexagon::STriw_imm_cdnNotPt_V4 :
+ return Hexagon::STriw_imm_cNotPt_V4;
+
+ case Hexagon::STriw_cdnPt_nv_V4 :
+ case Hexagon::STriw_cPt_nv_V4 :
+ case Hexagon::STriw_cdnPt_V4 :
+ return Hexagon::STriw_cPt;
+
+ case Hexagon::STriw_cdnNotPt_nv_V4 :
+ case Hexagon::STriw_cNotPt_nv_V4 :
+ case Hexagon::STriw_cdnNotPt_V4 :
+ return Hexagon::STriw_cNotPt;
+
+ case Hexagon::STriw_indexed_cdnPt_nv_V4 :
+ case Hexagon::STriw_indexed_cPt_nv_V4 :
+ case Hexagon::STriw_indexed_cdnPt_V4 :
+ return Hexagon::STriw_indexed_cPt;
+
+ case Hexagon::STriw_indexed_cdnNotPt_nv_V4 :
+ case Hexagon::STriw_indexed_cNotPt_nv_V4 :
+ case Hexagon::STriw_indexed_cdnNotPt_V4 :
+ return Hexagon::STriw_indexed_cNotPt;
+
+ case Hexagon::STriw_indexed_shl_cdnPt_nv_V4 :
+ case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
+ case Hexagon::STriw_indexed_shl_cdnPt_V4 :
+ return Hexagon::STriw_indexed_shl_cPt_V4;
+
+ case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4 :
+ case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
+ case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::STriw_indexed_shl_cNotPt_V4;
+
+ case Hexagon::POST_STwri_cdnPt_nv_V4 :
+ case Hexagon::POST_STwri_cPt_nv_V4 :
+ case Hexagon::POST_STwri_cdnPt_V4 :
+ return Hexagon::POST_STwri_cPt;
+
+ case Hexagon::POST_STwri_cdnNotPt_nv_V4 :
+ case Hexagon::POST_STwri_cNotPt_nv_V4 :
+ case Hexagon::POST_STwri_cdnNotPt_V4 :
+ return Hexagon::POST_STwri_cNotPt;
+
+ case Hexagon::STw_GP_cdnPt_nv_V4:
+ case Hexagon::STw_GP_cdnPt_V4:
+ case Hexagon::STw_GP_cPt_nv_V4:
+ return Hexagon::STw_GP_cPt_V4;
+
+ case Hexagon::STw_GP_cdnNotPt_nv_V4:
+ case Hexagon::STw_GP_cdnNotPt_V4:
+ case Hexagon::STw_GP_cNotPt_nv_V4:
+ return Hexagon::STw_GP_cNotPt_V4;
+
+ case Hexagon::STriw_GP_cdnPt_nv_V4:
+ case Hexagon::STriw_GP_cdnPt_V4:
+ case Hexagon::STriw_GP_cPt_nv_V4:
+ return Hexagon::STriw_GP_cPt_V4;
+
+ case Hexagon::STriw_GP_cdnNotPt_nv_V4:
+ case Hexagon::STriw_GP_cdnNotPt_V4:
+ case Hexagon::STriw_GP_cNotPt_nv_V4:
+ return Hexagon::STriw_GP_cNotPt_V4;
+
+ // Store new-value word - unconditional
+
+ case Hexagon::STriw_nv_V4:
+ return Hexagon::STriw;
+
+ case Hexagon::STriw_indexed_nv_V4:
+ return Hexagon::STriw_indexed;
+
+ case Hexagon::STriw_indexed_shl_nv_V4:
+ return Hexagon::STriw_indexed_shl_V4;
+
+ case Hexagon::STriw_shl_nv_V4:
+ return Hexagon::STriw_shl_V4;
+
+ case Hexagon::STriw_GP_nv_V4:
+ return Hexagon::STriw_GP_V4;
+
+ case Hexagon::STw_GP_nv_V4:
+ return Hexagon::STw_GP_V4;
+
+ case Hexagon::POST_STwri_nv_V4:
+ return Hexagon::POST_STwri;
+
+ // Store doubleword
+
+ case Hexagon::STrid_cdnPt_V4 :
+ return Hexagon::STrid_cPt;
+
+ case Hexagon::STrid_cdnNotPt_V4 :
+ return Hexagon::STrid_cNotPt;
+
+ case Hexagon::STrid_indexed_cdnPt_V4 :
+ return Hexagon::STrid_indexed_cPt;
+
+ case Hexagon::STrid_indexed_cdnNotPt_V4 :
+ return Hexagon::STrid_indexed_cNotPt;
+
+ case Hexagon::STrid_indexed_shl_cdnPt_V4 :
+ return Hexagon::STrid_indexed_shl_cPt_V4;
+
+ case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
+ return Hexagon::STrid_indexed_shl_cNotPt_V4;
+
+ case Hexagon::POST_STdri_cdnPt_V4 :
+ return Hexagon::POST_STdri_cPt;
+
+ case Hexagon::POST_STdri_cdnNotPt_V4 :
+ return Hexagon::POST_STdri_cNotPt;
+
+ case Hexagon::STd_GP_cdnPt_V4 :
+ return Hexagon::STd_GP_cPt_V4;
+
+ case Hexagon::STd_GP_cdnNotPt_V4 :
+ return Hexagon::STd_GP_cNotPt_V4;
+
+ case Hexagon::STrid_GP_cdnPt_V4 :
+ return Hexagon::STrid_GP_cPt_V4;
+
+ case Hexagon::STrid_GP_cdnNotPt_V4 :
+ return Hexagon::STrid_GP_cNotPt_V4;
+ }
+}
+
+bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) {
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ int NewOpcode = GetDotOldOp(MI->getOpcode());
+ MI->setDesc(QII->get(NewOpcode));
+ return true;
+}
+
+// Returns true if an instruction is predicated on the true sense of its
+// predicate register (e.g. p0) and false if it is predicated on the false
+// sense (e.g. !p0).
+
+static bool GetPredicateSense(MachineInstr* MI,
+ const HexagonInstrInfo *QII) {
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unknown predicate sense of the instruction");
+ case Hexagon::TFR_cPt:
+ case Hexagon::TFR_cdnPt:
+ case Hexagon::TFRI_cPt:
+ case Hexagon::TFRI_cdnPt:
+ case Hexagon::STrib_cPt :
+ case Hexagon::STrib_cdnPt_V4 :
+ case Hexagon::STrib_indexed_cPt :
+ case Hexagon::STrib_indexed_cdnPt_V4 :
+ case Hexagon::STrib_indexed_shl_cPt_V4 :
+ case Hexagon::STrib_indexed_shl_cdnPt_V4 :
+ case Hexagon::POST_STbri_cPt :
+ case Hexagon::POST_STbri_cdnPt_V4 :
+ case Hexagon::STrih_cPt :
+ case Hexagon::STrih_cdnPt_V4 :
+ case Hexagon::STrih_indexed_cPt :
+ case Hexagon::STrih_indexed_cdnPt_V4 :
+ case Hexagon::STrih_indexed_shl_cPt_V4 :
+ case Hexagon::STrih_indexed_shl_cdnPt_V4 :
+ case Hexagon::POST_SThri_cPt :
+ case Hexagon::POST_SThri_cdnPt_V4 :
+ case Hexagon::STriw_cPt :
+ case Hexagon::STriw_cdnPt_V4 :
+ case Hexagon::STriw_indexed_cPt :
+ case Hexagon::STriw_indexed_cdnPt_V4 :
+ case Hexagon::STriw_indexed_shl_cPt_V4 :
+ case Hexagon::STriw_indexed_shl_cdnPt_V4 :
+ case Hexagon::POST_STwri_cPt :
+ case Hexagon::POST_STwri_cdnPt_V4 :
+ case Hexagon::STrib_imm_cPt_V4 :
+ case Hexagon::STrib_imm_cdnPt_V4 :
+ case Hexagon::STrid_cPt :
+ case Hexagon::STrid_cdnPt_V4 :
+ case Hexagon::STrid_indexed_cPt :
+ case Hexagon::STrid_indexed_cdnPt_V4 :
+ case Hexagon::STrid_indexed_shl_cPt_V4 :
+ case Hexagon::STrid_indexed_shl_cdnPt_V4 :
+ case Hexagon::POST_STdri_cPt :
+ case Hexagon::POST_STdri_cdnPt_V4 :
+ case Hexagon::STrih_imm_cPt_V4 :
+ case Hexagon::STrih_imm_cdnPt_V4 :
+ case Hexagon::STriw_imm_cPt_V4 :
+ case Hexagon::STriw_imm_cdnPt_V4 :
+ case Hexagon::JMP_cdnPt :
+ case Hexagon::LDrid_cPt :
+ case Hexagon::LDrid_cdnPt :
+ case Hexagon::LDrid_indexed_cPt :
+ case Hexagon::LDrid_indexed_cdnPt :
+ case Hexagon::POST_LDrid_cPt :
+ case Hexagon::POST_LDrid_cdnPt_V4 :
+ case Hexagon::LDriw_cPt :
+ case Hexagon::LDriw_cdnPt :
+ case Hexagon::LDriw_indexed_cPt :
+ case Hexagon::LDriw_indexed_cdnPt :
+ case Hexagon::POST_LDriw_cPt :
+ case Hexagon::POST_LDriw_cdnPt_V4 :
+ case Hexagon::LDrih_cPt :
+ case Hexagon::LDrih_cdnPt :
+ case Hexagon::LDrih_indexed_cPt :
+ case Hexagon::LDrih_indexed_cdnPt :
+ case Hexagon::POST_LDrih_cPt :
+ case Hexagon::POST_LDrih_cdnPt_V4 :
+ case Hexagon::LDrib_cPt :
+ case Hexagon::LDrib_cdnPt :
+ case Hexagon::LDrib_indexed_cPt :
+ case Hexagon::LDrib_indexed_cdnPt :
+ case Hexagon::POST_LDrib_cPt :
+ case Hexagon::POST_LDrib_cdnPt_V4 :
+ case Hexagon::LDriuh_cPt :
+ case Hexagon::LDriuh_cdnPt :
+ case Hexagon::LDriuh_indexed_cPt :
+ case Hexagon::LDriuh_indexed_cdnPt :
+ case Hexagon::POST_LDriuh_cPt :
+ case Hexagon::POST_LDriuh_cdnPt_V4 :
+ case Hexagon::LDriub_cPt :
+ case Hexagon::LDriub_cdnPt :
+ case Hexagon::LDriub_indexed_cPt :
+ case Hexagon::LDriub_indexed_cdnPt :
+ case Hexagon::POST_LDriub_cPt :
+ case Hexagon::POST_LDriub_cdnPt_V4 :
+ case Hexagon::LDrid_indexed_cPt_V4 :
+ case Hexagon::LDrid_indexed_cdnPt_V4 :
+ case Hexagon::LDrid_indexed_shl_cPt_V4 :
+ case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDrib_indexed_cPt_V4 :
+ case Hexagon::LDrib_indexed_cdnPt_V4 :
+ case Hexagon::LDrib_indexed_shl_cPt_V4 :
+ case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDriub_indexed_cPt_V4 :
+ case Hexagon::LDriub_indexed_cdnPt_V4 :
+ case Hexagon::LDriub_indexed_shl_cPt_V4 :
+ case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDrih_indexed_cPt_V4 :
+ case Hexagon::LDrih_indexed_cdnPt_V4 :
+ case Hexagon::LDrih_indexed_shl_cPt_V4 :
+ case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDriuh_indexed_cPt_V4 :
+ case Hexagon::LDriuh_indexed_cdnPt_V4 :
+ case Hexagon::LDriuh_indexed_shl_cPt_V4 :
+ case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDriw_indexed_cPt_V4 :
+ case Hexagon::LDriw_indexed_cdnPt_V4 :
+ case Hexagon::LDriw_indexed_shl_cPt_V4 :
+ case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
+ case Hexagon::ADD_ri_cPt :
+ case Hexagon::ADD_ri_cdnPt :
+ case Hexagon::ADD_rr_cPt :
+ case Hexagon::ADD_rr_cdnPt :
+ case Hexagon::XOR_rr_cPt :
+ case Hexagon::XOR_rr_cdnPt :
+ case Hexagon::AND_rr_cPt :
+ case Hexagon::AND_rr_cdnPt :
+ case Hexagon::OR_rr_cPt :
+ case Hexagon::OR_rr_cdnPt :
+ case Hexagon::SUB_rr_cPt :
+ case Hexagon::SUB_rr_cdnPt :
+ case Hexagon::COMBINE_rr_cPt :
+ case Hexagon::COMBINE_rr_cdnPt :
+ case Hexagon::ASLH_cPt_V4 :
+ case Hexagon::ASLH_cdnPt_V4 :
+ case Hexagon::ASRH_cPt_V4 :
+ case Hexagon::ASRH_cdnPt_V4 :
+ case Hexagon::SXTB_cPt_V4 :
+ case Hexagon::SXTB_cdnPt_V4 :
+ case Hexagon::SXTH_cPt_V4 :
+ case Hexagon::SXTH_cdnPt_V4 :
+ case Hexagon::ZXTB_cPt_V4 :
+ case Hexagon::ZXTB_cdnPt_V4 :
+ case Hexagon::ZXTH_cPt_V4 :
+ case Hexagon::ZXTH_cdnPt_V4 :
+ case Hexagon::LDrid_GP_cPt_V4 :
+ case Hexagon::LDrib_GP_cPt_V4 :
+ case Hexagon::LDriub_GP_cPt_V4 :
+ case Hexagon::LDrih_GP_cPt_V4 :
+ case Hexagon::LDriuh_GP_cPt_V4 :
+ case Hexagon::LDriw_GP_cPt_V4 :
+ case Hexagon::LDd_GP_cPt_V4 :
+ case Hexagon::LDb_GP_cPt_V4 :
+ case Hexagon::LDub_GP_cPt_V4 :
+ case Hexagon::LDh_GP_cPt_V4 :
+ case Hexagon::LDuh_GP_cPt_V4 :
+ case Hexagon::LDw_GP_cPt_V4 :
+ case Hexagon::STrid_GP_cPt_V4 :
+ case Hexagon::STrib_GP_cPt_V4 :
+ case Hexagon::STrih_GP_cPt_V4 :
+ case Hexagon::STriw_GP_cPt_V4 :
+ case Hexagon::STd_GP_cPt_V4 :
+ case Hexagon::STb_GP_cPt_V4 :
+ case Hexagon::STh_GP_cPt_V4 :
+ case Hexagon::STw_GP_cPt_V4 :
+ case Hexagon::LDrid_GP_cdnPt_V4 :
+ case Hexagon::LDrib_GP_cdnPt_V4 :
+ case Hexagon::LDriub_GP_cdnPt_V4 :
+ case Hexagon::LDrih_GP_cdnPt_V4 :
+ case Hexagon::LDriuh_GP_cdnPt_V4 :
+ case Hexagon::LDriw_GP_cdnPt_V4 :
+ case Hexagon::LDd_GP_cdnPt_V4 :
+ case Hexagon::LDb_GP_cdnPt_V4 :
+ case Hexagon::LDub_GP_cdnPt_V4 :
+ case Hexagon::LDh_GP_cdnPt_V4 :
+ case Hexagon::LDuh_GP_cdnPt_V4 :
+ case Hexagon::LDw_GP_cdnPt_V4 :
+ case Hexagon::STrid_GP_cdnPt_V4 :
+ case Hexagon::STrib_GP_cdnPt_V4 :
+ case Hexagon::STrih_GP_cdnPt_V4 :
+ case Hexagon::STriw_GP_cdnPt_V4 :
+ case Hexagon::STd_GP_cdnPt_V4 :
+ case Hexagon::STb_GP_cdnPt_V4 :
+ case Hexagon::STh_GP_cdnPt_V4 :
+ case Hexagon::STw_GP_cdnPt_V4 :
+ return true;
+
+ case Hexagon::TFR_cNotPt:
+ case Hexagon::TFR_cdnNotPt:
+ case Hexagon::TFRI_cNotPt:
+ case Hexagon::TFRI_cdnNotPt:
+ case Hexagon::STrib_cNotPt :
+ case Hexagon::STrib_cdnNotPt_V4 :
+ case Hexagon::STrib_indexed_cNotPt :
+ case Hexagon::STrib_indexed_cdnNotPt_V4 :
+ case Hexagon::STrib_indexed_shl_cNotPt_V4 :
+ case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::POST_STbri_cNotPt :
+ case Hexagon::POST_STbri_cdnNotPt_V4 :
+ case Hexagon::STrih_cNotPt :
+ case Hexagon::STrih_cdnNotPt_V4 :
+ case Hexagon::STrih_indexed_cNotPt :
+ case Hexagon::STrih_indexed_cdnNotPt_V4 :
+ case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+ case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::POST_SThri_cNotPt :
+ case Hexagon::POST_SThri_cdnNotPt_V4 :
+ case Hexagon::STriw_cNotPt :
+ case Hexagon::STriw_cdnNotPt_V4 :
+ case Hexagon::STriw_indexed_cNotPt :
+ case Hexagon::STriw_indexed_cdnNotPt_V4 :
+ case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+ case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::POST_STwri_cNotPt :
+ case Hexagon::POST_STwri_cdnNotPt_V4 :
+ case Hexagon::STrib_imm_cNotPt_V4 :
+ case Hexagon::STrib_imm_cdnNotPt_V4 :
+ case Hexagon::STrid_cNotPt :
+ case Hexagon::STrid_cdnNotPt_V4 :
+ case Hexagon::STrid_indexed_cdnNotPt_V4 :
+ case Hexagon::STrid_indexed_cNotPt :
+ case Hexagon::STrid_indexed_shl_cNotPt_V4 :
+ case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::POST_STdri_cNotPt :
+ case Hexagon::POST_STdri_cdnNotPt_V4 :
+ case Hexagon::STrih_imm_cNotPt_V4 :
+ case Hexagon::STrih_imm_cdnNotPt_V4 :
+ case Hexagon::STriw_imm_cNotPt_V4 :
+ case Hexagon::STriw_imm_cdnNotPt_V4 :
+ case Hexagon::JMP_cdnNotPt :
+ case Hexagon::LDrid_cNotPt :
+ case Hexagon::LDrid_cdnNotPt :
+ case Hexagon::LDrid_indexed_cNotPt :
+ case Hexagon::LDrid_indexed_cdnNotPt :
+ case Hexagon::POST_LDrid_cNotPt :
+ case Hexagon::POST_LDrid_cdnNotPt_V4 :
+ case Hexagon::LDriw_cNotPt :
+ case Hexagon::LDriw_cdnNotPt :
+ case Hexagon::LDriw_indexed_cNotPt :
+ case Hexagon::LDriw_indexed_cdnNotPt :
+ case Hexagon::POST_LDriw_cNotPt :
+ case Hexagon::POST_LDriw_cdnNotPt_V4 :
+ case Hexagon::LDrih_cNotPt :
+ case Hexagon::LDrih_cdnNotPt :
+ case Hexagon::LDrih_indexed_cNotPt :
+ case Hexagon::LDrih_indexed_cdnNotPt :
+ case Hexagon::POST_LDrih_cNotPt :
+ case Hexagon::POST_LDrih_cdnNotPt_V4 :
+ case Hexagon::LDrib_cNotPt :
+ case Hexagon::LDrib_cdnNotPt :
+ case Hexagon::LDrib_indexed_cNotPt :
+ case Hexagon::LDrib_indexed_cdnNotPt :
+ case Hexagon::POST_LDrib_cNotPt :
+ case Hexagon::POST_LDrib_cdnNotPt_V4 :
+ case Hexagon::LDriuh_cNotPt :
+ case Hexagon::LDriuh_cdnNotPt :
+ case Hexagon::LDriuh_indexed_cNotPt :
+ case Hexagon::LDriuh_indexed_cdnNotPt :
+ case Hexagon::POST_LDriuh_cNotPt :
+ case Hexagon::POST_LDriuh_cdnNotPt_V4 :
+ case Hexagon::LDriub_cNotPt :
+ case Hexagon::LDriub_cdnNotPt :
+ case Hexagon::LDriub_indexed_cNotPt :
+ case Hexagon::LDriub_indexed_cdnNotPt :
+ case Hexagon::POST_LDriub_cNotPt :
+ case Hexagon::POST_LDriub_cdnNotPt_V4 :
+ case Hexagon::LDrid_indexed_cNotPt_V4 :
+ case Hexagon::LDrid_indexed_cdnNotPt_V4 :
+ case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
+ case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDrib_indexed_cNotPt_V4 :
+ case Hexagon::LDrib_indexed_cdnNotPt_V4 :
+ case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
+ case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDriub_indexed_cNotPt_V4 :
+ case Hexagon::LDriub_indexed_cdnNotPt_V4 :
+ case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
+ case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDrih_indexed_cNotPt_V4 :
+ case Hexagon::LDrih_indexed_cdnNotPt_V4 :
+ case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
+ case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDriuh_indexed_cNotPt_V4 :
+ case Hexagon::LDriuh_indexed_cdnNotPt_V4 :
+ case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
+ case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDriw_indexed_cNotPt_V4 :
+ case Hexagon::LDriw_indexed_cdnNotPt_V4 :
+ case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
+ case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::ADD_ri_cNotPt :
+ case Hexagon::ADD_ri_cdnNotPt :
+ case Hexagon::ADD_rr_cNotPt :
+ case Hexagon::ADD_rr_cdnNotPt :
+ case Hexagon::XOR_rr_cNotPt :
+ case Hexagon::XOR_rr_cdnNotPt :
+ case Hexagon::AND_rr_cNotPt :
+ case Hexagon::AND_rr_cdnNotPt :
+ case Hexagon::OR_rr_cNotPt :
+ case Hexagon::OR_rr_cdnNotPt :
+ case Hexagon::SUB_rr_cNotPt :
+ case Hexagon::SUB_rr_cdnNotPt :
+ case Hexagon::COMBINE_rr_cNotPt :
+ case Hexagon::COMBINE_rr_cdnNotPt :
+ case Hexagon::ASLH_cNotPt_V4 :
+ case Hexagon::ASLH_cdnNotPt_V4 :
+ case Hexagon::ASRH_cNotPt_V4 :
+ case Hexagon::ASRH_cdnNotPt_V4 :
+ case Hexagon::SXTB_cNotPt_V4 :
+ case Hexagon::SXTB_cdnNotPt_V4 :
+ case Hexagon::SXTH_cNotPt_V4 :
+ case Hexagon::SXTH_cdnNotPt_V4 :
+ case Hexagon::ZXTB_cNotPt_V4 :
+ case Hexagon::ZXTB_cdnNotPt_V4 :
+ case Hexagon::ZXTH_cNotPt_V4 :
+ case Hexagon::ZXTH_cdnNotPt_V4 :
+
+ case Hexagon::LDrid_GP_cNotPt_V4 :
+ case Hexagon::LDrib_GP_cNotPt_V4 :
+ case Hexagon::LDriub_GP_cNotPt_V4 :
+ case Hexagon::LDrih_GP_cNotPt_V4 :
+ case Hexagon::LDriuh_GP_cNotPt_V4 :
+ case Hexagon::LDriw_GP_cNotPt_V4 :
+ case Hexagon::LDd_GP_cNotPt_V4 :
+ case Hexagon::LDb_GP_cNotPt_V4 :
+ case Hexagon::LDub_GP_cNotPt_V4 :
+ case Hexagon::LDh_GP_cNotPt_V4 :
+ case Hexagon::LDuh_GP_cNotPt_V4 :
+ case Hexagon::LDw_GP_cNotPt_V4 :
+ case Hexagon::STrid_GP_cNotPt_V4 :
+ case Hexagon::STrib_GP_cNotPt_V4 :
+ case Hexagon::STrih_GP_cNotPt_V4 :
+ case Hexagon::STriw_GP_cNotPt_V4 :
+ case Hexagon::STd_GP_cNotPt_V4 :
+ case Hexagon::STb_GP_cNotPt_V4 :
+ case Hexagon::STh_GP_cNotPt_V4 :
+ case Hexagon::STw_GP_cNotPt_V4 :
+ case Hexagon::LDrid_GP_cdnNotPt_V4 :
+ case Hexagon::LDrib_GP_cdnNotPt_V4 :
+ case Hexagon::LDriub_GP_cdnNotPt_V4 :
+ case Hexagon::LDrih_GP_cdnNotPt_V4 :
+ case Hexagon::LDriuh_GP_cdnNotPt_V4 :
+ case Hexagon::LDriw_GP_cdnNotPt_V4 :
+ case Hexagon::LDd_GP_cdnNotPt_V4 :
+ case Hexagon::LDb_GP_cdnNotPt_V4 :
+ case Hexagon::LDub_GP_cdnNotPt_V4 :
+ case Hexagon::LDh_GP_cdnNotPt_V4 :
+ case Hexagon::LDuh_GP_cdnNotPt_V4 :
+ case Hexagon::LDw_GP_cdnNotPt_V4 :
+ case Hexagon::STrid_GP_cdnNotPt_V4 :
+ case Hexagon::STrib_GP_cdnNotPt_V4 :
+ case Hexagon::STrih_GP_cdnNotPt_V4 :
+ case Hexagon::STriw_GP_cdnNotPt_V4 :
+ case Hexagon::STd_GP_cdnNotPt_V4 :
+ case Hexagon::STb_GP_cdnNotPt_V4 :
+ case Hexagon::STh_GP_cdnNotPt_V4 :
+ case Hexagon::STw_GP_cdnNotPt_V4 :
+ return false;
+ }
+ // Return *some value* to avoid a compiler warning.
+ return false;
+}
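+
+// For example (illustrative, using opcodes from the switch above):
+// TFR_cPt is predicated on the true sense ("if (p0) ..."), while
+// TFR_cNotPt is predicated on the false sense ("if (!p0) ...").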
+
+bool HexagonPacketizerList::isDotNewInst(MachineInstr* MI) {
+ if (isNewValueInst(MI))
+ return true;
+
+ switch (MI->getOpcode()) {
+ case Hexagon::TFR_cdnNotPt:
+ case Hexagon::TFR_cdnPt:
+ case Hexagon::TFRI_cdnNotPt:
+ case Hexagon::TFRI_cdnPt:
+ case Hexagon::LDrid_cdnPt :
+ case Hexagon::LDrid_cdnNotPt :
+ case Hexagon::LDrid_indexed_cdnPt :
+ case Hexagon::LDrid_indexed_cdnNotPt :
+ case Hexagon::POST_LDrid_cdnPt_V4 :
+ case Hexagon::POST_LDrid_cdnNotPt_V4 :
+ case Hexagon::LDriw_cdnPt :
+ case Hexagon::LDriw_cdnNotPt :
+ case Hexagon::LDriw_indexed_cdnPt :
+ case Hexagon::LDriw_indexed_cdnNotPt :
+ case Hexagon::POST_LDriw_cdnPt_V4 :
+ case Hexagon::POST_LDriw_cdnNotPt_V4 :
+ case Hexagon::LDrih_cdnPt :
+ case Hexagon::LDrih_cdnNotPt :
+ case Hexagon::LDrih_indexed_cdnPt :
+ case Hexagon::LDrih_indexed_cdnNotPt :
+ case Hexagon::POST_LDrih_cdnPt_V4 :
+ case Hexagon::POST_LDrih_cdnNotPt_V4 :
+ case Hexagon::LDrib_cdnPt :
+ case Hexagon::LDrib_cdnNotPt :
+ case Hexagon::LDrib_indexed_cdnPt :
+ case Hexagon::LDrib_indexed_cdnNotPt :
+ case Hexagon::POST_LDrib_cdnPt_V4 :
+ case Hexagon::POST_LDrib_cdnNotPt_V4 :
+ case Hexagon::LDriuh_cdnPt :
+ case Hexagon::LDriuh_cdnNotPt :
+ case Hexagon::LDriuh_indexed_cdnPt :
+ case Hexagon::LDriuh_indexed_cdnNotPt :
+ case Hexagon::POST_LDriuh_cdnPt_V4 :
+ case Hexagon::POST_LDriuh_cdnNotPt_V4 :
+ case Hexagon::LDriub_cdnPt :
+ case Hexagon::LDriub_cdnNotPt :
+ case Hexagon::LDriub_indexed_cdnPt :
+ case Hexagon::LDriub_indexed_cdnNotPt :
+ case Hexagon::POST_LDriub_cdnPt_V4 :
+ case Hexagon::POST_LDriub_cdnNotPt_V4 :
+
+ case Hexagon::LDrid_indexed_cdnPt_V4 :
+ case Hexagon::LDrid_indexed_cdnNotPt_V4 :
+ case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDrib_indexed_cdnPt_V4 :
+ case Hexagon::LDrib_indexed_cdnNotPt_V4 :
+ case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDriub_indexed_cdnPt_V4 :
+ case Hexagon::LDriub_indexed_cdnNotPt_V4 :
+ case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDrih_indexed_cdnPt_V4 :
+ case Hexagon::LDrih_indexed_cdnNotPt_V4 :
+ case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDriuh_indexed_cdnPt_V4 :
+ case Hexagon::LDriuh_indexed_cdnNotPt_V4 :
+ case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::LDriw_indexed_cdnPt_V4 :
+ case Hexagon::LDriw_indexed_cdnNotPt_V4 :
+ case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
+ case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
+
+// Conditional add
+ case Hexagon::ADD_ri_cdnPt:
+ case Hexagon::ADD_ri_cdnNotPt:
+ case Hexagon::ADD_rr_cdnPt:
+ case Hexagon::ADD_rr_cdnNotPt:
+
+ // Conditional logical operations
+ case Hexagon::XOR_rr_cdnPt :
+ case Hexagon::XOR_rr_cdnNotPt :
+ case Hexagon::AND_rr_cdnPt :
+ case Hexagon::AND_rr_cdnNotPt :
+ case Hexagon::OR_rr_cdnPt :
+ case Hexagon::OR_rr_cdnNotPt :
+
+ // Conditional subtract
+ case Hexagon::SUB_rr_cdnPt :
+ case Hexagon::SUB_rr_cdnNotPt :
+
+ // Conditional combine
+ case Hexagon::COMBINE_rr_cdnPt :
+ case Hexagon::COMBINE_rr_cdnNotPt :
+
+ // Conditional shift operations
+ case Hexagon::ASLH_cdnPt_V4:
+ case Hexagon::ASLH_cdnNotPt_V4:
+ case Hexagon::ASRH_cdnPt_V4:
+ case Hexagon::ASRH_cdnNotPt_V4:
+ case Hexagon::SXTB_cdnPt_V4:
+ case Hexagon::SXTB_cdnNotPt_V4:
+ case Hexagon::SXTH_cdnPt_V4:
+ case Hexagon::SXTH_cdnNotPt_V4:
+ case Hexagon::ZXTB_cdnPt_V4:
+ case Hexagon::ZXTB_cdnNotPt_V4:
+ case Hexagon::ZXTH_cdnPt_V4:
+ case Hexagon::ZXTH_cdnNotPt_V4:
+
+ // Conditional stores
+ case Hexagon::STrib_imm_cdnPt_V4 :
+ case Hexagon::STrib_imm_cdnNotPt_V4 :
+ case Hexagon::STrib_cdnPt_V4 :
+ case Hexagon::STrib_cdnNotPt_V4 :
+ case Hexagon::STrib_indexed_cdnPt_V4 :
+ case Hexagon::STrib_indexed_cdnNotPt_V4 :
+ case Hexagon::POST_STbri_cdnPt_V4 :
+ case Hexagon::POST_STbri_cdnNotPt_V4 :
+ case Hexagon::STrib_indexed_shl_cdnPt_V4 :
+ case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
+
+ // Store doubleword conditionally
+ case Hexagon::STrid_indexed_cdnPt_V4 :
+ case Hexagon::STrid_indexed_cdnNotPt_V4 :
+ case Hexagon::STrid_indexed_shl_cdnPt_V4 :
+ case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::POST_STdri_cdnPt_V4 :
+ case Hexagon::POST_STdri_cdnNotPt_V4 :
+
+ // Store halfword conditionally
+ case Hexagon::STrih_cdnPt_V4 :
+ case Hexagon::STrih_cdnNotPt_V4 :
+ case Hexagon::STrih_indexed_cdnPt_V4 :
+ case Hexagon::STrih_indexed_cdnNotPt_V4 :
+ case Hexagon::STrih_imm_cdnPt_V4 :
+ case Hexagon::STrih_imm_cdnNotPt_V4 :
+ case Hexagon::STrih_indexed_shl_cdnPt_V4 :
+ case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::POST_SThri_cdnPt_V4 :
+ case Hexagon::POST_SThri_cdnNotPt_V4 :
+
+ // Store word conditionally
+ case Hexagon::STriw_cdnPt_V4 :
+ case Hexagon::STriw_cdnNotPt_V4 :
+ case Hexagon::STriw_indexed_cdnPt_V4 :
+ case Hexagon::STriw_indexed_cdnNotPt_V4 :
+ case Hexagon::STriw_imm_cdnPt_V4 :
+ case Hexagon::STriw_imm_cdnNotPt_V4 :
+ case Hexagon::STriw_indexed_shl_cdnPt_V4 :
+ case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
+ case Hexagon::POST_STwri_cdnPt_V4 :
+ case Hexagon::POST_STwri_cdnNotPt_V4 :
+
+ case Hexagon::LDd_GP_cdnPt_V4:
+ case Hexagon::LDd_GP_cdnNotPt_V4:
+ case Hexagon::LDb_GP_cdnPt_V4:
+ case Hexagon::LDb_GP_cdnNotPt_V4:
+ case Hexagon::LDub_GP_cdnPt_V4:
+ case Hexagon::LDub_GP_cdnNotPt_V4:
+ case Hexagon::LDh_GP_cdnPt_V4:
+ case Hexagon::LDh_GP_cdnNotPt_V4:
+ case Hexagon::LDuh_GP_cdnPt_V4:
+ case Hexagon::LDuh_GP_cdnNotPt_V4:
+ case Hexagon::LDw_GP_cdnPt_V4:
+ case Hexagon::LDw_GP_cdnNotPt_V4:
+ case Hexagon::LDrid_GP_cdnPt_V4:
+ case Hexagon::LDrid_GP_cdnNotPt_V4:
+ case Hexagon::LDrib_GP_cdnPt_V4:
+ case Hexagon::LDrib_GP_cdnNotPt_V4:
+ case Hexagon::LDriub_GP_cdnPt_V4:
+ case Hexagon::LDriub_GP_cdnNotPt_V4:
+ case Hexagon::LDrih_GP_cdnPt_V4:
+ case Hexagon::LDrih_GP_cdnNotPt_V4:
+ case Hexagon::LDriuh_GP_cdnPt_V4:
+ case Hexagon::LDriuh_GP_cdnNotPt_V4:
+ case Hexagon::LDriw_GP_cdnPt_V4:
+ case Hexagon::LDriw_GP_cdnNotPt_V4:
+
+ case Hexagon::STrid_GP_cdnPt_V4:
+ case Hexagon::STrid_GP_cdnNotPt_V4:
+ case Hexagon::STrib_GP_cdnPt_V4:
+ case Hexagon::STrib_GP_cdnNotPt_V4:
+ case Hexagon::STrih_GP_cdnPt_V4:
+ case Hexagon::STrih_GP_cdnNotPt_V4:
+ case Hexagon::STriw_GP_cdnPt_V4:
+ case Hexagon::STriw_GP_cdnNotPt_V4:
+ case Hexagon::STd_GP_cdnPt_V4:
+ case Hexagon::STd_GP_cdnNotPt_V4:
+ case Hexagon::STb_GP_cdnPt_V4:
+ case Hexagon::STb_GP_cdnNotPt_V4:
+ case Hexagon::STh_GP_cdnPt_V4:
+ case Hexagon::STh_GP_cdnNotPt_V4:
+ case Hexagon::STw_GP_cdnPt_V4:
+ case Hexagon::STw_GP_cdnNotPt_V4:
+ return true;
+ }
+ return false;
+}
+
+static MachineOperand& GetPostIncrementOperand(MachineInstr *MI,
+ const HexagonInstrInfo *QII) {
+ assert(QII->isPostIncrement(MI) && "Not a post increment operation.");
+#ifndef NDEBUG
+ // A post-increment instruction defines and uses the same register, so that
+ // register appears twice in the operand list. Use a DenseMap to find the
+ // duplicate. Caution: DenseMap initializes with a minimum of 64 buckets,
+ // whereas there are at most 5 operands in a post-increment instruction.
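+ // For example (illustrative): in "r3 = memw(r2++#4)", r2 is both defined
+ // (the incremented address) and used (the base address), so it is the
+ // duplicated register this scan finds.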
+ DenseMap<unsigned, unsigned> DefRegsSet;
+ for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
+ if (MI->getOperand(opNum).isReg() &&
+ MI->getOperand(opNum).isDef()) {
+ DefRegsSet[MI->getOperand(opNum).getReg()] = 1;
+ }
+
+ for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
+ if (MI->getOperand(opNum).isReg() &&
+ MI->getOperand(opNum).isUse()) {
+ if (DefRegsSet[MI->getOperand(opNum).getReg()]) {
+ return MI->getOperand(opNum);
+ }
+ }
+#else
+ if (MI->getDesc().mayLoad()) {
+ // The 2nd operand is always the post-increment operand in a load.
+ assert(MI->getOperand(1).isReg() &&
+ "Post increment operand has to be a register.");
+ return (MI->getOperand(1));
+ }
+ if (MI->getDesc().mayStore()) {
+ // The 1st operand is always the post-increment operand in a store.
+ assert(MI->getOperand(0).isReg() &&
+ "Post increment operand has to be a register.");
+ return (MI->getOperand(0));
+ }
+#endif
+ // We should never get here.
+ llvm_unreachable("mayLoad or mayStore not set for Post Increment operation");
+}
+
+// Get the value being stored.
+static MachineOperand& GetStoreValueOperand(MachineInstr *MI) {
+ // The value being stored is always the last operand.
+ return (MI->getOperand(MI->getNumOperands()-1));
+}
+
+// Can this store be converted to a new-value store?
+// The following restrictions must be respected when converting a store into
+// a new-value store (see the illustrative packet sketch below):
+// 1. If an instruction uses auto-increment, its address register cannot
+//    be a new-value register. Arch Spec 5.4.2.1.
+// 2. If an instruction uses absolute-set addressing mode, its address
+//    register cannot be a new-value register. Arch Spec 5.4.2.1.
+//    TODO: This is not enabled, as absolute-set address mode patterns
+//    are not implemented.
+// 3. If an instruction produces a 64-bit result, its registers cannot be used
+//    as new-value registers. Arch Spec 5.4.2.2.
+// 4. If the instruction that sets a new-value register is conditional, then
+//    the instruction that uses the new-value register must also be
+//    conditional, and both must always have their predicates evaluate
+//    identically. Arch Spec 5.4.2.3.
+// 5. There is an implied restriction that a packet cannot contain another
+//    store if it has a new-value store; corollary: if a packet already
+//    contains a store, it cannot also take a new-value store.
+//    Arch Spec 3.4.4.2.
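+//
+// For illustration only (a hand-written packet sketch, not taken from the
+// Arch Spec), a new-value store that satisfies these rules:
+//   { r0 = add(r1, r2)
+//     memw(r3+#0) = r0.new }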
+bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
+ MachineInstr *PacketMI, unsigned DepReg,
+ std::map <MachineInstr*, SUnit*> MIToSUnit)
+{
+ // Make sure we are looking at the store
+ if (!IsNewifyStore(MI))
+ return false;
+
+ // Make sure the value being stored is the dependent register; otherwise
+ // the store cannot be new-value'ed.
+ if (GetStoreValueOperand(MI).isReg() &&
+ GetStoreValueOperand(MI).getReg() != DepReg)
+ return false;
+
+ const HexagonRegisterInfo* QRI =
+ (const HexagonRegisterInfo *) TM.getRegisterInfo();
+ const MCInstrDesc& MCID = PacketMI->getDesc();
+ // The first operand is always the result.
+
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF);
+
+ // If there is already a store in the packet, a new-value store cannot be
+ // added. Arch Spec 3.4.4.2.
+ for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
+ VE = CurrentPacketMIs.end();
+ (VI != VE); ++VI) {
+ SUnit* PacketSU = MIToSUnit[*VI];
+ if (PacketSU->getInstr()->getDesc().mayStore() ||
+ // If mayStore were set on ALLOCFRAME and DEALLOCFRAME,
+ // these explicit opcode checks would not be needed.
+ PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+ PacketSU->getInstr()->getOpcode() == Hexagon::DEALLOCFRAME)
+ return false;
+ }
+
+ if (PacketRC == &Hexagon::DoubleRegsRegClass) {
+ // New-value store constraint: double registers cannot feed into a
+ // new-value store. Arch Spec 5.4.2.2.
+ return false;
+ }
+
+ // Make sure it's NOT the post increment register that we are going to
+ // new value.
+ if (QII->isPostIncrement(MI) &&
+ MI->getDesc().mayStore() &&
+ GetPostIncrementOperand(MI, QII).getReg() == DepReg) {
+ return false;
+ }
+
+ if (QII->isPostIncrement(PacketMI) &&
+ PacketMI->getDesc().mayLoad() &&
+ GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) {
+ // If the source is a post-increment or uses absolute-set addressing,
+ // it cannot feed into a new-value store:
+ //   r3 = memw(r2++#4)
+ //   memw(r30 + #-1404) = r2.new  -> cannot be a new-value store
+ // Arch Spec 5.4.2.1.
+ return false;
+ }
+
+ // If the source that feeds the store is predicated, the new-value store
+ // must also be predicated.
+ if (QII->isPredicated(PacketMI)) {
+ if (!QII->isPredicated(MI))
+ return false;
+
+ // Check to make sure that both of them will have their predicates
+ // evaluate identically.
+ unsigned predRegNumSrc = 0;
+ unsigned predRegNumDst = 0;
+ const TargetRegisterClass* predRegClass = NULL;
+
+ // Get predicate register used in the source instruction
+ for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
+ if ( PacketMI->getOperand(opNum).isReg())
+ predRegNumSrc = PacketMI->getOperand(opNum).getReg();
+ predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc);
+ if (predRegClass == &Hexagon::PredRegsRegClass) {
+ break;
+ }
+ }
+ assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
+ ("predicate register not found in a predicated PacketMI instruction"));
+
+ // Get predicate register used in new-value store instruction
+ for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
+ if ( MI->getOperand(opNum).isReg())
+ predRegNumDst = MI->getOperand(opNum).getReg();
+ predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst);
+ if (predRegClass == &Hexagon::PredRegsRegClass) {
+ break;
+ }
+ }
+ assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
+ ("predicate register not found in a predicated MI instruction"));
+
+ // New-value register producer and user (store) need to satisfy these
+ // constraints:
+ // 1) Both instructions should be predicated on the same register.
+ // 2) If the producer of the new-value register is .new predicated, then the
+ // store should also be .new predicated; if the producer is not .new
+ // predicated, then the store should not be .new predicated either.
+ // 3) Both the new-value register producer and the user should have the same
+ // predicate sense, i.e., either both should be negated or both non-negated.
+
+ if (( predRegNumDst != predRegNumSrc) ||
+ isDotNewInst(PacketMI) != isDotNewInst(MI) ||
+ GetPredicateSense(MI, QII) != GetPredicateSense(PacketMI, QII)) {
+ return false;
+ }
+ }
+
+ // Make sure that, other than the new-value register, no other register used
+ // by the store instruction has been modified in the same packet. Predicate
+ // registers can be modified, but they must not change between the producer
+ // and the store instruction, as that would make the two conditional on
+ // different values. We already know this holds for all the instructions up
+ // to and including PacketMI; however, we still need to check the remaining
+ // instructions in the packet.
+
+ std::vector<MachineInstr*>::iterator VI;
+ std::vector<MachineInstr*>::iterator VE;
+ unsigned StartCheck = 0;
+
+ for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end();
+ (VI != VE); ++VI) {
+ SUnit* TempSU = MIToSUnit[*VI];
+ MachineInstr* TempMI = TempSU->getInstr();
+
+ // The following condition is true for all the instructions until PacketMI
+ // is reached (StartCheck is set to 0 before the for loop). The StartCheck
+ // flag is 1 for all the instructions after PacketMI.
+ if (TempMI != PacketMI && !StartCheck) // Start processing only after
+ continue; // encountering PacketMI.
+
+ StartCheck = 1;
+ if (TempMI == PacketMI) // We don't want to check PacketMI for dependence
+ continue;
+
+ for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
+ if (MI->getOperand(opNum).isReg() &&
+ TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(),
+ QRI))
+ return false;
+ }
+ }
+
+ // Make sure that for non-post-increment stores:
+ // 1. The only use of the register is DepReg and no other registers.
+ // This handles V4 base+index registers.
+ // The following store cannot be dot new:
+ // Eg. r0 = add(r0, #3)
+ //     memw(r1+r0<<#2) = r0
+ if (!QII->isPostIncrement(MI) &&
+ GetStoreValueOperand(MI).isReg() &&
+ GetStoreValueOperand(MI).getReg() == DepReg) {
+ for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) {
+ if (MI->getOperand(opNum).isReg() &&
+ MI->getOperand(opNum).getReg() == DepReg) {
+ return false;
+ }
+ }
+ // 2. If the data definition comes from an implicit definition of the
+ // register, do not newify the store. Eg.
+ // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
+ // STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
+ for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
+ if (PacketMI->getOperand(opNum).isReg() &&
+ PacketMI->getOperand(opNum).getReg() == DepReg &&
+ PacketMI->getOperand(opNum).isDef() &&
+ PacketMI->getOperand(opNum).isImplicit()) {
+ return false;
+ }
+ }
+ }
+
+ // The store can be dot new.
+ return true;
+}
+
+// Can this MI be promoted to either a new-value store
+// or a new-value jump?
+bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI,
+ SUnit *PacketSU, unsigned DepReg,
+ std::map <MachineInstr*, SUnit*> MIToSUnit,
+ MachineBasicBlock::iterator &MII)
+{
+
+ const HexagonRegisterInfo* QRI =
+ (const HexagonRegisterInfo *) TM.getRegisterInfo();
+ if (!QRI->Subtarget.hasV4TOps() ||
+ !IsNewifyStore(MI))
+ return false;
+
+ MachineInstr *PacketMI = PacketSU->getInstr();
+
+  // Check to see if the store can be new-value'ed.
+ if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit))
+ return true;
+
+  // Check to see if the compare/jump can be new-value'ed.
+  // That is done in a pass of its own; there is no need to check it here.
+ return false;
+}
+
+// Check to see if an instruction can be converted to dot-new.
+// There are three kinds:
+// 1. dot new on predicate - V2/V3/V4
+// 2. dot new on stores NV/ST - V4
+// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
+bool HexagonPacketizerList::CanPromoteToDotNew(MachineInstr *MI,
+ SUnit *PacketSU, unsigned DepReg,
+ std::map <MachineInstr*, SUnit*> MIToSUnit,
+ MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC )
+{
+ // already a dot new instruction
+ if (isDotNewInst(MI) && !IsNewifyStore(MI))
+ return false;
+
+ if (!isNewifiable(MI))
+ return false;
+
+ // predicate .new
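+  // For example (illustrative), a compare and a user of its result can
+  // share a packet through the .new form:
+  //   {
+  //     p0 = cmp.eq(r2, #0)
+  //     if (p0.new) r3 = #1
+  //   }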
+ if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI))
+ return true;
+ else if (RC != &Hexagon::PredRegsRegClass &&
+ !IsNewifyStore(MI)) // MI is not a new-value store
+ return false;
+ else {
+ // Create a dot new machine instruction to see if resources can be
+ // allocated. If not, bail out now.
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ int NewOpcode = GetDotNewOp(MI->getOpcode());
+ const MCInstrDesc &desc = QII->get(NewOpcode);
+ DebugLoc dl;
+ MachineInstr *NewMI =
+ MI->getParent()->getParent()->CreateMachineInstr(desc, dl);
+ bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
+ MI->getParent()->getParent()->DeleteMachineInstr(NewMI);
+
+ if (!ResourcesAvailable)
+ return false;
+
+    // New-value store only; the new-value jump is generated in a pass of
+    // its own.
+ if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Go through the packet instructions and search for an anti-dependency
+// between them and DepReg from MI.
+// Consider this case:
+// Trying to add
+// a) %R1<def> = TFRI_cdNotPt %P3, 2
+// to this packet:
+// {
+// b) %P0<def> = OR_pp %P3<kill>, %P0<kill>
+// c) %P3<def> = TFR_PdRs %R23
+// d) %R1<def> = TFRI_cdnPt %P3, 4
+// }
+// The P3 from a) and d) will be complements after
+// a)'s P3 is converted to .new form.
+// The anti-dependence between c) and b) is irrelevant for this case.
+bool HexagonPacketizerList::RestrictingDepExistInPacket(MachineInstr* MI,
+ unsigned DepReg,
+ std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ SUnit* PacketSUDep = MIToSUnit[MI];
+
+ for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+ VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // We only care about dependencies on predicated instructions.
+    if (!QII->isPredicated(*VIN)) continue;
+
+ // Scheduling Unit for current insn in the packet
+ SUnit* PacketSU = MIToSUnit[*VIN];
+
+ // Look at dependencies between current members of the packet
+ // and predicate defining instruction MI.
+ // Make sure that dependency is on the exact register
+ // we care about.
+ if (PacketSU->isSucc(PacketSUDep)) {
+ for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+ if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) &&
+ (PacketSU->Succs[i].getKind() == SDep::Anti) &&
+ (PacketSU->Succs[i].getReg() == DepReg)) {
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+
+// Given two predicated instructions, this function detects whether
+// the predicates are complements
+bool HexagonPacketizerList::ArePredicatesComplements(MachineInstr* MI1,
+ MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Currently we can only reason about conditional transfers.
+ if (!QII->isConditionalTransfer(MI1) || !QII->isConditionalTransfer(MI2)) {
+ return false;
+ }
+
+ // Scheduling unit for candidate
+ SUnit* SU = MIToSUnit[MI1];
+
+ // One corner case deals with the following scenario:
+ // Trying to add
+ // a) %R24<def> = TFR_cPt %P0, %R25
+ // to this packet:
+ //
+ // {
+ // b) %R25<def> = TFR_cNotPt %P0, %R24
+ // c) %P0<def> = CMPEQri %R26, 1
+ // }
+ //
+  // On a general check, a) and b) are complements, but the
+  // presence of c) will convert a) to .new form, and then
+  // it is no longer a complement.
+  // We attempt to detect this by analyzing existing
+  // dependencies in the packet.
+
+  // Analyze relationships between all existing members of the packet.
+  // Look for an anti-dependency on the same predicate register
+  // as the one used in the candidate.
+ for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+ VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+ // Scheduling Unit for current insn in the packet
+ SUnit* PacketSU = MIToSUnit[*VIN];
+
+ // If this instruction in the packet is succeeded by the candidate...
+ if (PacketSU->isSucc(SU)) {
+ for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        // The corner case exists when there is a true data
+        // dependency between the candidate and one of the current
+        // packet members, the dependency is on a predicate register,
+        // and there already exists an anti-dependency on the same
+        // predicate register in the packet.
+ if (PacketSU->Succs[i].getSUnit() == SU &&
+ Hexagon::PredRegsRegClass.contains(
+ PacketSU->Succs[i].getReg()) &&
+ PacketSU->Succs[i].getKind() == SDep::Data &&
+            // At this point *VIN is known to be a predicate-setting
+            // instruction with a true data dependency to the candidate on
+            // the register we care about - c) in the above example.
+            // Now check whether there is an anti-dependency from c) to any
+            // other instruction in the same packet on the predicate
+            // register of interest.
+ RestrictingDepExistInPacket(*VIN,PacketSU->Succs[i].getReg(),
+ MIToSUnit)) {
+ return false;
+ }
+ }
+ }
+ }
+
+  // If the above case does not apply, check the regular complement
+  // condition: the predicate register must be the same and the
+  // predicate sense must be different.
+  // We also need to differentiate .old vs. .new:
+  // !p0 is not complementary to p0.new.
+ return ((MI1->getOperand(1).getReg() == MI2->getOperand(1).getReg()) &&
+ (GetPredicateSense(MI1, QII) != GetPredicateSense(MI2, QII)) &&
+ (isDotNewInst(MI1) == isDotNewInst(MI2)));
+}
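+
+// For example (illustrative), the following two conditional transfers are
+// complements and may share a packet, since exactly one of them executes:
+//   if (p0) r2 = r3
+//   if (!p0) r2 = r4
+// An .old form is never the complement of a .new form, since the two read
+// the predicate value at different points.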
+
+// initPacketizerState - Initialize packetizer flags
+void HexagonPacketizerList::initPacketizerState() {
+
+ Dependence = false;
+ PromotedToDotNew = false;
+ GlueToNewValueJump = false;
+ GlueAllocframeStore = false;
+ FoundSequentialDependence = false;
+
+ return;
+}
+
+// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+ if (MI->isDebugValue())
+ return true;
+
+ // We must print out inline assembly
+ if (MI->isInlineAsm())
+ return false;
+
+ // We check if MI has any functional units mapped to it.
+ // If it doesn't, we ignore the instruction.
+ const MCInstrDesc& TID = MI->getDesc();
+ unsigned SchedClass = TID.getSchedClass();
+ const InstrStage* IS =
+ ResourceTracker->getInstrItins()->beginStage(SchedClass);
+ unsigned FuncUnits = IS->getUnits();
+ return !FuncUnits;
+}
+
+// isSoloInstruction - Returns true for instructions that must be
+// scheduled in their own packet.
+bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) {
+
+ if (MI->isInlineAsm())
+ return true;
+
+ if (MI->isEHLabel())
+ return true;
+
+ // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
+ // trap, pause, barrier, icinva, isync, and syncht are solo instructions.
+ // They must not be grouped with other instructions in a packet.
+ if (IsSchedBarrier(MI))
+ return true;
+
+ return false;
+}
+
+// isLegalToPacketizeTogether:
+// SUI is the current instruction that is outside of the current packet.
+// SUJ is the current instruction inside the current packet against which
+// SUI will be packetized.
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+ MachineInstr *I = SUI->getInstr();
+ MachineInstr *J = SUJ->getInstr();
+ assert(I && J && "Unable to packetize null instruction!");
+
+ const MCInstrDesc &MCIDI = I->getDesc();
+ const MCInstrDesc &MCIDJ = J->getDesc();
+
+ MachineBasicBlock::iterator II = I;
+
+ const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+ const HexagonRegisterInfo* QRI =
+ (const HexagonRegisterInfo *) TM.getRegisterInfo();
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+ // Inline asm cannot go in the packet.
+ if (I->getOpcode() == Hexagon::INLINEASM)
+ llvm_unreachable("Should not meet inline asm here!");
+
+ if (isSoloInstruction(I))
+ llvm_unreachable("Should not meet solo instr here!");
+
+  // A call that saves the callee-saved registers can only be in a packet
+  // with instructions that do not write to the callee-saved registers.
+ if ((QII->isSaveCalleeSavedRegsCall(I) &&
+ DoesModifyCalleeSavedReg(J, QRI)) ||
+ (QII->isSaveCalleeSavedRegsCall(J) &&
+ DoesModifyCalleeSavedReg(I, QRI))) {
+ Dependence = true;
+ return false;
+ }
+
+ // Two control flow instructions cannot go in the same packet.
+ if (IsControlFlow(I) && IsControlFlow(J)) {
+ Dependence = true;
+ return false;
+ }
+
+ // A LoopN instruction cannot appear in the same packet as a jump or call.
+ if (IsLoopN(I) && ( IsDirectJump(J)
+ || MCIDJ.isCall()
+ || QII->isDeallocRet(J))) {
+ Dependence = true;
+ return false;
+ }
+ if (IsLoopN(J) && ( IsDirectJump(I)
+ || MCIDI.isCall()
+ || QII->isDeallocRet(I))) {
+ Dependence = true;
+ return false;
+ }
+
+ // dealloc_return cannot appear in the same packet as a conditional or
+ // unconditional jump.
+ if (QII->isDeallocRet(I) && ( MCIDJ.isBranch()
+ || MCIDJ.isCall()
+ || MCIDJ.isBarrier())) {
+ Dependence = true;
+ return false;
+ }
+
+
+  // V4 allows dual stores, but it does not allow the second store if the
+  // first store is not in SLOT0. New-value stores, new-value jumps,
+  // dealloc_return and memops always take SLOT0.
+  // Arch spec 3.4.4.2.
+ if (QRI->Subtarget.hasV4TOps()) {
+
+ if (MCIDI.mayStore() && MCIDJ.mayStore() && isNewValueInst(J)) {
+ Dependence = true;
+ return false;
+ }
+
+ if ( (QII->isMemOp(J) && MCIDI.mayStore())
+ || (MCIDJ.mayStore() && QII->isMemOp(I))
+ || (QII->isMemOp(J) && QII->isMemOp(I))) {
+ Dependence = true;
+ return false;
+ }
+
+    // A store cannot go in the same packet as a dealloc_return.
+    if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
+ Dependence = true;
+ return false;
+ }
+
+    // If an instruction feeds a new-value jump, glue it.
+ MachineBasicBlock::iterator NextMII = I;
+ ++NextMII;
+ MachineInstr *NextMI = NextMII;
+
+ if (QII->isNewValueJump(NextMI)) {
+
+ bool secondRegMatch = false;
+ bool maintainNewValueJump = false;
+
+ if (NextMI->getOperand(1).isReg() &&
+ I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+ secondRegMatch = true;
+ maintainNewValueJump = true;
+ }
+
+ if (!secondRegMatch &&
+ I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
+ maintainNewValueJump = true;
+ }
+
+ for (std::vector<MachineInstr*>::iterator
+ VI = CurrentPacketMIs.begin(),
+ VE = CurrentPacketMIs.end();
+ (VI != VE && maintainNewValueJump); ++VI) {
+ SUnit* PacketSU = MIToSUnit[*VI];
+
+        // An NVJ cannot be part of a dual jump - Arch Spec: section 7.8.
+ if (PacketSU->getInstr()->getDesc().isCall()) {
+ Dependence = true;
+ break;
+ }
+ // Validate
+ // 1. Packet does not have a store in it.
+ // 2. If the first operand of the nvj is newified, and the second
+ // operand is also a reg, it (second reg) is not defined in
+ // the same packet.
+ // 3. If the second operand of the nvj is newified, (which means
+ // first operand is also a reg), first reg is not defined in
+ // the same packet.
+ if (PacketSU->getInstr()->getDesc().mayStore() ||
+ PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+ // Check #2.
+ (!secondRegMatch && NextMI->getOperand(1).isReg() &&
+ PacketSU->getInstr()->modifiesRegister(
+ NextMI->getOperand(1).getReg(), QRI)) ||
+ // Check #3.
+ (secondRegMatch &&
+ PacketSU->getInstr()->modifiesRegister(
+ NextMI->getOperand(0).getReg(), QRI))) {
+ Dependence = true;
+ break;
+ }
+ }
+ if (!Dependence)
+ GlueToNewValueJump = true;
+ else
+ return false;
+ }
+ }
+
+ if (SUJ->isSucc(SUI)) {
+ for (unsigned i = 0;
+ (i < SUJ->Succs.size()) && !FoundSequentialDependence;
+ ++i) {
+
+ if (SUJ->Succs[i].getSUnit() != SUI) {
+ continue;
+ }
+
+ SDep::Kind DepType = SUJ->Succs[i].getKind();
+
+ // For direct calls:
+ // Ignore register dependences for call instructions for
+ // packetization purposes except for those due to r31 and
+ // predicate registers.
+ //
+ // For indirect calls:
+ // Same as direct calls + check for true dependences to the register
+ // used in the indirect call.
+ //
+ // We completely ignore Order dependences for call instructions
+ //
+ // For returns:
+ // Ignore register dependences for return instructions like jumpr,
+ // dealloc return unless we have dependencies on the explicit uses
+ // of the registers used by jumpr (like r31) or dealloc return
+ // (like r29 or r30).
+ //
+      // TODO: Currently, jumpr handles only returns through r31, so the
+      // following logic (specifically IsCallDependent) works fine.
+      // Once jumpr is enabled for registers other than r31, the last part
+      // of IsCallDependent, where it handles indirect calls, will need to
+      // be reworked. Bug 6216 is opened for this.
+ //
+ unsigned DepReg = 0;
+ const TargetRegisterClass* RC = NULL;
+ if (DepType == SDep::Data) {
+ DepReg = SUJ->Succs[i].getReg();
+ RC = QRI->getMinimalPhysRegClass(DepReg);
+ }
+ if ((MCIDI.isCall() || MCIDI.isReturn()) &&
+ (!IsRegDependence(DepType) ||
+ !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
+ /* do nothing */
+ }
+
+ // For instructions that can be promoted to dot-new, try to promote.
+ else if ((DepType == SDep::Data) &&
+ CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
+ PromoteToDotNew(I, DepType, II, RC)) {
+ PromotedToDotNew = true;
+ /* do nothing */
+ }
+
+ else if ((DepType == SDep::Data) &&
+ (QII->isNewValueJump(I))) {
+ /* do nothing */
+ }
+
+ // For predicated instructions, if the predicates are complements
+ // then there can be no dependence.
+ else if (QII->isPredicated(I) &&
+ QII->isPredicated(J) &&
+ ArePredicatesComplements(I, J, MIToSUnit)) {
+ /* do nothing */
+
+ }
+ else if (IsDirectJump(I) &&
+ !MCIDJ.isBranch() &&
+ !MCIDJ.isCall() &&
+ (DepType == SDep::Order)) {
+ // Ignore Order dependences between unconditional direct branches
+ // and non-control-flow instructions
+ /* do nothing */
+ }
+ else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
+ (DepType != SDep::Output)) {
+ // Ignore all dependences for jumps except for true and output
+ // dependences
+ /* do nothing */
+ }
+
+      // Ignore output dependences due to superregs. We can
+      // write to two different subregisters of R1:0 for instance
+      // in the same cycle.
+      //
+      // If neither I nor J defines DepReg, then this is a
+      // superfluous output dependence. The dependence must be of the
+      // form:
+      //   R0 = ...
+      //   R1 = ...
+      // and there is an output dependence between the two instructions
+      // with DepReg = D0.
+      // We want to ignore these dependences.
+      // Ideally, the dependence constructor should annotate such
+      // dependences; we could then avoid this relatively expensive check.
+      //
+ else if (DepType == SDep::Output) {
+ // DepReg is the register that's responsible for the dependence.
+ unsigned DepReg = SUJ->Succs[i].getReg();
+
+        // Check whether I or J really defines DepReg.
+ if (I->definesRegister(DepReg) ||
+ J->definesRegister(DepReg)) {
+ FoundSequentialDependence = true;
+ break;
+ }
+ }
+
+ // We ignore Order dependences for
+ // 1. Two loads unless they are volatile.
+ // 2. Two stores in V4 unless they are volatile.
+ else if ((DepType == SDep::Order) &&
+ !I->hasVolatileMemoryRef() &&
+ !J->hasVolatileMemoryRef()) {
+ if (QRI->Subtarget.hasV4TOps() &&
+ // hexagonv4 allows dual store.
+ MCIDI.mayStore() && MCIDJ.mayStore()) {
+ /* do nothing */
+ }
+        // store followed by store -- not OK on V2
+ // store followed by load -- not OK on all (OK if addresses
+ // are not aliased)
+ // load followed by store -- OK on all
+ // load followed by load -- OK on all
+ else if ( !MCIDJ.mayStore()) {
+ /* do nothing */
+ }
+ else {
+ FoundSequentialDependence = true;
+ break;
+ }
+ }
+
+      // For V4, special case ALLOCFRAME. Even though there is a dependency
+      // between ALLOCFRAME and the subsequent store, allow them to be
+      // packetized in the same packet. This implies that the store is using
+      // the caller's SP. Hence, the offset needs to be updated accordingly.
+ else if (DepType == SDep::Data
+ && QRI->Subtarget.hasV4TOps()
+ && J->getOpcode() == Hexagon::ALLOCFRAME
+ && (I->getOpcode() == Hexagon::STrid
+ || I->getOpcode() == Hexagon::STriw
+ || I->getOpcode() == Hexagon::STrib)
+ && I->getOperand(0).getReg() == QRI->getStackRegister()
+ && QII->isValidOffset(I->getOpcode(),
+ I->getOperand(1).getImm() -
+ (FrameSize + HEXAGON_LRFP_SIZE)))
+ {
+ GlueAllocframeStore = true;
+        // Since this store is to be glued with allocframe in the same
+        // packet, it will use the SP of the previous stack frame, i.e.,
+        // the caller's SP. Therefore, we need to recalculate the offset
+        // according to this change.
+ I->getOperand(1).setImm(I->getOperand(1).getImm() -
+ (FrameSize + HEXAGON_LRFP_SIZE));
+ }
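+      // E.g. (illustrative), with FrameSize = 24 and HEXAGON_LRFP_SIZE = 8,
+      // a store at offset #40 relative to the new SP becomes a store at
+      // offset #8 relative to the caller's SP once glued with allocframe.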
+
+ //
+ // Skip over anti-dependences. Two instructions that are
+ // anti-dependent can share a packet
+ //
+ else if (DepType != SDep::Anti) {
+ FoundSequentialDependence = true;
+ break;
+ }
+ }
+
+ if (FoundSequentialDependence) {
+ Dependence = true;
+ return false;
+ }
+ }
+
+ return true;
+}
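+
+// As an illustration of the V4 ordering rules above (not from the source):
+// two stores such as
+//   {
+//     memw(r0+#0) = r2
+//     memw(r1+#0) = r3
+//   }
+// may share a packet on V4, while a store followed by a load from a
+// possibly aliased address must stay in separate packets.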
+
+// isLegalToPruneDependencies
+bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
+ MachineInstr *I = SUI->getInstr();
+ assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
+
+ const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+ if (Dependence) {
+
+ // Check if the instruction was promoted to a dot-new. If so, demote it
+ // back into a dot-old.
+ if (PromotedToDotNew) {
+ DemoteToDotOld(I);
+ }
+
+    // Check if the instruction (must be a store) was glued with an
+    // allocframe instruction. If so, restore its offset to its original
+    // value, i.e. use the current SP instead of the caller's SP.
+ if (GlueAllocframeStore) {
+ I->getOperand(1).setImm(I->getOperand(1).getImm() +
+ FrameSize + HEXAGON_LRFP_SIZE);
+ }
+
+ return false;
+ }
+ return true;
+}
+
+MachineBasicBlock::iterator
+HexagonPacketizerList::addToPacket(MachineInstr *MI) {
+
+ MachineBasicBlock::iterator MII = MI;
+ MachineBasicBlock *MBB = MI->getParent();
+
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+ if (GlueToNewValueJump) {
+
+ ++MII;
+ MachineInstr *nvjMI = MII;
+ assert(ResourceTracker->canReserveResources(MI));
+ ResourceTracker->reserveResources(MI);
+ if (QII->isExtended(MI) &&
+ !tryAllocateResourcesForConstExt(MI)) {
+ endPacket(MBB, MI);
+ ResourceTracker->reserveResources(MI);
+ assert(canReserveResourcesForConstExt(MI) &&
+ "Ensure that there is a slot");
+ reserveResourcesForConstExt(MI);
+ // Reserve resources for new value jump constant extender.
+ assert(canReserveResourcesForConstExt(MI) &&
+ "Ensure that there is a slot");
+ reserveResourcesForConstExt(nvjMI);
+ assert(ResourceTracker->canReserveResources(nvjMI) &&
+ "Ensure that there is a slot");
+
+    } else if ( // An extended instruction takes two slots in the packet.
+      // Try to reserve and allocate 4 bytes in the current packet first.
+ (QII->isExtended(nvjMI)
+ && (!tryAllocateResourcesForConstExt(nvjMI)
+ || !ResourceTracker->canReserveResources(nvjMI)))
+      || // For a non-extended instruction, no need to allocate extra 4 bytes.
+ (!QII->isExtended(nvjMI) &&
+ !ResourceTracker->canReserveResources(nvjMI)))
+ {
+ endPacket(MBB, MI);
+ // A new and empty packet starts.
+      // We are sure that the resource requirements can be satisfied;
+      // therefore, we do not need to call "canReserveResources" anymore.
+ ResourceTracker->reserveResources(MI);
+ if (QII->isExtended(nvjMI))
+ reserveResourcesForConstExt(nvjMI);
+ }
+ // Here, we are sure that "reserveResources" would succeed.
+ ResourceTracker->reserveResources(nvjMI);
+ CurrentPacketMIs.push_back(MI);
+ CurrentPacketMIs.push_back(nvjMI);
+ } else {
+ if ( QII->isExtended(MI)
+ && ( !tryAllocateResourcesForConstExt(MI)
+ || !ResourceTracker->canReserveResources(MI)))
+ {
+ endPacket(MBB, MI);
+ // Check if the instruction was promoted to a dot-new. If so, demote it
+ // back into a dot-old
+ if (PromotedToDotNew) {
+ DemoteToDotOld(MI);
+ }
+ reserveResourcesForConstExt(MI);
+ }
+    // In case "MI" is not an extended insn, the resource availability has
+    // already been checked.
+ ResourceTracker->reserveResources(MI);
+ CurrentPacketMIs.push_back(MI);
+ }
+ return MII;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonPacketizer() {
+ return new HexagonPacketizer();
+}
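+
+// A sketch of typical wiring (assuming the usual TargetPassConfig pre-emit
+// hook):
+//   addPass(createHexagonPacketizer());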
+
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index 47384cd..035afe8 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -15,6 +15,7 @@
#include "Hexagon.h"
#include "HexagonAsmPrinter.h"
#include "HexagonInstPrinter.h"
+#include "HexagonMCInst.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
@@ -37,20 +38,50 @@ StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot) {
+ printInst((const HexagonMCInst*)(MI), O, Annot);
+}
+
+void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
+ StringRef Annot) {
const char packetPadding[] = " ";
const char startPacket = '{',
endPacket = '}';
// TODO: add outer HW loop when it's supported too.
if (MI->getOpcode() == Hexagon::ENDLOOP0) {
- MCInst Nop;
+    // Ending a hardware loop is different from ending a regular packet.
+ assert(MI->isEndPacket() && "Loop end must also end the packet");
+
+ if (MI->isStartPacket()) {
+ // There must be a packet to end a loop.
+ // FIXME: when shuffling is always run, this shouldn't be needed.
+ HexagonMCInst Nop;
+ StringRef NoAnnot;
+
+      Nop.setOpcode(Hexagon::NOP);
+      Nop.setStartPacket(MI->isStartPacket());
+      printInst(&Nop, O, NoAnnot);
+ }
+
+ // Close the packet.
+ if (MI->isEndPacket())
+ O << packetPadding << endPacket;
- O << packetPadding << startPacket << '\n';
- Nop.setOpcode(Hexagon::NOP);
- printInstruction(&Nop, O);
- O << packetPadding << endPacket;
+ printInstruction(MI, O);
+ }
+ else {
+ // Prefix the insn opening the packet.
+ if (MI->isStartPacket())
+ O << packetPadding << startPacket << '\n';
+
+ printInstruction(MI, O);
+
+ // Suffix the insn closing the packet.
+ if (MI->isEndPacket())
+      // Always end the packet on a new line, since the GNU assembler has
+      // issues with a closing brace on the same line as CONST{32,64}.
+ O << '\n' << packetPadding << endPacket;
}
- printInstruction(MI, O);
printAnnotation(O, Annot);
}
@@ -65,22 +96,22 @@ void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
} else if(MO.isImm()) {
printImmOperand(MI, OpNo, O);
} else {
- assert(false && "Unknown operand");
+ llvm_unreachable("Unknown operand");
}
}
-void HexagonInstPrinter::printImmOperand
- (const MCInst *MI, unsigned OpNo, raw_ostream &O) const {
+void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) const {
O << MI->getOperand(OpNo).getImm();
}
void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
+ raw_ostream &O) const {
O << MI->getOperand(OpNo).getImm();
}
-void HexagonInstPrinter::printUnsignedImmOperand
- (const MCInst *MI, unsigned OpNo, raw_ostream &O) const {
+void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI,
+ unsigned OpNo, raw_ostream &O) const {
O << MI->getOperand(OpNo).getImm();
}
@@ -89,13 +120,13 @@ void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo,
O << -MI->getOperand(OpNo).getImm();
}
-void HexagonInstPrinter::printNOneImmOperand
- (const MCInst *MI, unsigned OpNo, raw_ostream &O) const {
+void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) const {
O << -1;
}
-void HexagonInstPrinter::printMEMriOperand
- (const MCInst *MI, unsigned OpNo, raw_ostream &O) const {
+void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) const {
const MCOperand& MO0 = MI->getOperand(OpNo);
const MCOperand& MO1 = MI->getOperand(OpNo + 1);
@@ -103,8 +134,8 @@ void HexagonInstPrinter::printMEMriOperand
O << " + #" << MO1.getImm();
}
-void HexagonInstPrinter::printFrameIndexOperand
- (const MCInst *MI, unsigned OpNo, raw_ostream &O) const {
+void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) const {
const MCOperand& MO0 = MI->getOperand(OpNo);
const MCOperand& MO1 = MI->getOperand(OpNo + 1);
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
index dad4334..902a323 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
@@ -14,6 +14,7 @@
#ifndef HEXAGONINSTPRINTER_H
#define HEXAGONINSTPRINTER_H
+#include "HexagonMCInst.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -25,6 +26,7 @@ namespace llvm {
: MCInstPrinter(MAI, MII, MRI) {}
virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+ void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
virtual StringRef getOpcodeName(unsigned Opcode) const;
void printInstruction(const MCInst *MI, raw_ostream &O);
StringRef getRegName(unsigned RegNo) const;
@@ -33,16 +35,16 @@ namespace llvm {
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
- void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
+ void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) const;
void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
const;
void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
const;
void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
const;
- void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
+ void printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) const;
void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
const;
void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
@@ -55,7 +57,8 @@ namespace llvm {
const;
void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
- void printConstantPool(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+ void printConstantPool(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) const;
void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
{ printSymbol(MI, OpNo, O, true); }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index ed55c3c..7221e90 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -23,14 +23,41 @@ namespace llvm {
/// instruction info tracks.
///
namespace HexagonII {
-
// *** The code below must match HexagonInstrFormat*.td *** //
+ // Insn types.
+ // *** Must match HexagonInstrFormat*.td ***
+ enum Type {
+ TypePSEUDO = 0,
+ TypeALU32 = 1,
+ TypeCR = 2,
+ TypeJR = 3,
+ TypeJ = 4,
+ TypeLD = 5,
+ TypeST = 6,
+ TypeSYSTEM = 7,
+ TypeXTYPE = 8,
+ TypeMEMOP = 9,
+ TypeNV = 10,
+ TypePREFIX = 30, // Such as extenders.
+ TypeMARKER = 31 // Such as end of a HW loop.
+ };
+
+
+
// MCInstrDesc TSFlags
+ // *** Must match HexagonInstrFormat*.td ***
enum {
+ // This 5-bit field describes the insn type.
+ TypePos = 0,
+ TypeMask = 0x1f,
+
+ // Solo instructions.
+ SoloPos = 5,
+ SoloMask = 0x1,
// Predicated instructions.
- PredicatedPos = 1,
+ PredicatedPos = 6,
PredicatedMask = 0x1
};
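+
+  // For example (illustrative), a field is decoded from an instruction's
+  // TSFlags as:
+  //   unsigned type   = (Flags >> TypePos) & TypeMask;
+  //   bool     isPred = (Flags >> PredicatedPos) & PredicatedMask;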
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 8ec5673..8995080 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 Mips PTX PowerPC Sparc X86 XCore
+subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore
; This is a special group whose required libraries are extended (by llvm-build)
; with the best execution engine (the native JIT, if available, or the
diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt
index bf1deef..6c3e8b6 100644
--- a/lib/Target/MBlaze/CMakeLists.txt
+++ b/lib/Target/MBlaze/CMakeLists.txt
@@ -30,6 +30,8 @@ add_llvm_target(MBlazeCodeGen
MBlazeELFWriterInfo.cpp
)
+add_dependencies(LLVMMBlazeCodeGen intrinsics_gen)
+
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td
index b4edff0..c288855 100644
--- a/lib/Target/MBlaze/MBlaze.td
+++ b/lib/Target/MBlaze/MBlaze.td
@@ -50,7 +50,7 @@ def FeatureSqrt : SubtargetFeature<"sqrt", "HasSqrt", "true",
// MBlaze processors supported.
//===----------------------------------------------------------------------===//
-def : Processor<"mblaze", MBlazeGenericItineraries, []>;
+def : Processor<"mblaze", NoItineraries, []>;
def : Processor<"mblaze3", MBlazePipe3Itineraries, []>;
def : Processor<"mblaze5", MBlazePipe5Itineraries, []>;
diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
index 55fffe3..e9f340f 100644
--- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
+++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
@@ -135,7 +135,7 @@ void MBlazeAsmPrinter::printSavedRegsBitmask() {
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
unsigned RegNum = getMBlazeRegisterNumbering(Reg);
- if (MBlaze::GPRRegisterClass->contains(Reg))
+ if (MBlaze::GPRRegClass.contains(Reg))
CPUBitmask |= (1 << RegNum);
}
@@ -187,7 +187,7 @@ void MBlazeAsmPrinter::EmitFunctionBodyEnd() {
//===----------------------------------------------------------------------===//
void MBlazeAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- MBlazeMCInstLower MCInstLowering(OutContext, *Mang, *this);
+ MBlazeMCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
@@ -200,7 +200,13 @@ PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0])
- return true; // Unknown modifier.
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ }
printOperand(MI, OpNo, O);
return false;
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index edfc335..310c25e 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -62,9 +62,9 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
// Set up the register classes
- addRegisterClass(MVT::i32, MBlaze::GPRRegisterClass);
+ addRegisterClass(MVT::i32, &MBlaze::GPRRegClass);
if (Subtarget->hasFPU()) {
- addRegisterClass(MVT::f32, MBlaze::GPRRegisterClass);
+ addRegisterClass(MVT::f32, &MBlaze::GPRRegClass);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
}
@@ -291,12 +291,12 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI,
loop->addSuccessor(finish);
loop->addSuccessor(loop);
- unsigned IAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ unsigned IAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(MBB, dl, TII->get(MBlaze::ANDI), IAMT)
.addReg(MI->getOperand(2).getReg())
.addImm(31);
- unsigned IVAL = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ unsigned IVAL = R.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(MBB, dl, TII->get(MBlaze::ADDIK), IVAL)
.addReg(MI->getOperand(1).getReg())
.addImm(0);
@@ -305,14 +305,14 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI,
.addReg(IAMT)
.addMBB(finish);
- unsigned DST = R.createVirtualRegister(MBlaze::GPRRegisterClass);
- unsigned NDST = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ unsigned DST = R.createVirtualRegister(&MBlaze::GPRRegClass);
+ unsigned NDST = R.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(loop, dl, TII->get(MBlaze::PHI), DST)
.addReg(IVAL).addMBB(MBB)
.addReg(NDST).addMBB(loop);
- unsigned SAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass);
- unsigned NAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ unsigned SAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
+ unsigned NAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(loop, dl, TII->get(MBlaze::PHI), SAMT)
.addReg(IAMT).addMBB(MBB)
.addReg(NAMT).addMBB(loop);
@@ -500,7 +500,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
case MBlaze::LAN32: opcode = MBlaze::AND; break;
}
- finalReg = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass);
start->addSuccessor(exit);
start->addSuccessor(start);
@@ -510,7 +510,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
if (MI->getOpcode() == MBlaze::LAN32) {
unsigned tmp = finalReg;
- finalReg = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(start, dl, TII->get(MBlaze::XORI), finalReg)
.addReg(tmp)
.addImm(-1);
@@ -528,7 +528,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
final->addSuccessor(exit);
final->addSuccessor(start);
- unsigned CMP = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ unsigned CMP = R.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(start, dl, TII->get(MBlaze::CMP), CMP)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(2).getReg());
@@ -543,7 +543,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
}
}
- unsigned CHK = R.createVirtualRegister(MBlaze::GPRRegisterClass);
+ unsigned CHK = R.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(final, dl, TII->get(MBlaze::SWX))
.addReg(finalReg)
.addReg(MI->getOperand(1).getReg())
@@ -681,13 +681,19 @@ static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
/// TODO: isVarArg, isTailCall.
SDValue MBlazeTargetLowering::
-LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
// MBlaze does not yet support tail call optimization
isTailCall = false;
@@ -702,7 +708,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze);
// Get a count of how many bytes are to be pushed on the stack.
@@ -841,7 +847,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv,
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze);
@@ -884,7 +890,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze);
SDValue StackPtr;
@@ -899,9 +905,9 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
- RC = MBlaze::GPRRegisterClass;
+ RC = &MBlaze::GPRRegClass;
else if (RegVT == MVT::f32)
- RC = MBlaze::GPRRegisterClass;
+ RC = &MBlaze::GPRRegClass;
else
llvm_unreachable("RegVT not supported by LowerFormalArguments");
@@ -964,7 +970,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
StackPtr = DAG.getRegister(StackReg, getPointerTy());
// The last register argument that must be saved is MBlaze::R10
- const TargetRegisterClass *RC = MBlaze::GPRRegisterClass;
+ const TargetRegisterClass *RC = &MBlaze::GPRRegClass;
unsigned Begin = getMBlazeRegisterNumbering(MBlaze::R5);
unsigned Start = getMBlazeRegisterNumbering(ArgRegEnd+1);
@@ -1016,7 +1022,7 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
// Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze);
@@ -1124,14 +1130,14 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
- return std::make_pair(0U, MBlaze::GPRRegisterClass);
+ return std::make_pair(0U, &MBlaze::GPRRegClass);
// TODO: These can't possibly be right, but match what was in
// getRegClassForInlineAsmConstraint.
case 'd':
case 'y':
case 'f':
if (VT == MVT::f32)
- return std::make_pair(0U, MBlaze::GPRRegisterClass);
+ return std::make_pair(0U, &MBlaze::GPRRegClass);
}
}
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h
index 6a79fc1..a01fab5 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.h
+++ b/lib/Target/MBlaze/MBlazeISelLowering.h
@@ -132,13 +132,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
index db71434..b5025fc 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
@@ -287,7 +287,7 @@ unsigned MBlazeInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
- GlobalBaseReg = RegInfo.createVirtualRegister(MBlaze::GPRRegisterClass);
+ GlobalBaseReg = RegInfo.createVirtualRegister(&MBlaze::GPRRegClass);
BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),
GlobalBaseReg).addReg(MBlaze::R20);
RegInfo.addLiveIn(MBlaze::R20);
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td
index 02a2157..139bf71 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.td
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.td
@@ -295,7 +295,7 @@ class BranchI<bits<6> op, bits<5> br, string instr_asm> :
// Branch and Link Instructions
//===----------------------------------------------------------------------===//
class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
- TA<op, flags, (outs), (ins GPR:$link, GPR:$target, variable_ops),
+ TA<op, flags, (outs), (ins GPR:$link, GPR:$target),
!strconcat(instr_asm, " $link, $target"),
[], IIC_BRl> {
let ra = br;
@@ -303,7 +303,7 @@ class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
}
class BranchLI<bits<6> op, bits<5> br, string instr_asm> :
- TB<op, (outs), (ins GPR:$link, calltarget:$target, variable_ops),
+ TB<op, (outs), (ins GPR:$link, calltarget:$target),
!strconcat(instr_asm, " $link, $target"),
[], IIC_BRl> {
let ra = br;
diff --git a/lib/Target/MBlaze/MBlazeMCInstLower.h b/lib/Target/MBlaze/MBlazeMCInstLower.h
index 7b97744..8ab2c9a 100644
--- a/lib/Target/MBlaze/MBlazeMCInstLower.h
+++ b/lib/Target/MBlaze/MBlazeMCInstLower.h
@@ -21,18 +21,16 @@ namespace llvm {
class MachineInstr;
class MachineModuleInfoMachO;
class MachineOperand;
- class Mangler;
/// MBlazeMCInstLower - This class is used to lower an MachineInstr
/// into an MCInst.
class LLVM_LIBRARY_VISIBILITY MBlazeMCInstLower {
MCContext &Ctx;
- Mangler &Mang;
AsmPrinter &Printer;
public:
- MBlazeMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
- : Ctx(ctx), Mang(mang), Printer(printer) {}
+ MBlazeMCInstLower(MCContext &ctx, AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer) {}
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td
index 4a3ae5f..cd5691c 100644
--- a/lib/Target/MBlaze/MBlazeSchedule.td
+++ b/lib/Target/MBlaze/MBlazeSchedule.td
@@ -40,11 +40,6 @@ def IIC_WDC : InstrItinClass;
def IIC_Pseudo : InstrItinClass;
//===----------------------------------------------------------------------===//
-// MBlaze generic instruction itineraries.
-//===----------------------------------------------------------------------===//
-def MBlazeGenericItineraries : ProcessorItineraries<[], [], []>;
-
-//===----------------------------------------------------------------------===//
// MBlaze instruction itineraries for three stage pipeline.
//===----------------------------------------------------------------------===//
include "MBlazeSchedule3.td"
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp
index d12d142..dc2ad29 100644
--- a/lib/Target/MBlaze/MBlazeSubtarget.cpp
+++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp
@@ -43,13 +43,6 @@ MBlazeSubtarget::MBlazeSubtarget(const std::string &TT,
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUName);
-
- // Compute the issue width of the MBlaze itineraries
- computeIssueWidth();
-}
-
-void MBlazeSubtarget::computeIssueWidth() {
- InstrItins.IssueWidth = 1;
}
bool MBlazeSubtarget::
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
index 62393d0..5f82f14 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
@@ -68,7 +68,7 @@ TargetPassConfig *MBlazeTargetMachine::createPassConfig(PassManagerBase &PM) {
// Install an instruction selector pass using
// the ISelDag to gen MBlaze code.
bool MBlazePassConfig::addInstSelector() {
- PM->add(createMBlazeISelDag(getMBlazeTargetMachine()));
+ addPass(createMBlazeISelDag(getMBlazeTargetMachine()));
return false;
}
@@ -76,6 +76,6 @@ bool MBlazePassConfig::addInstSelector() {
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
bool MBlazePassConfig::addPreEmitPass() {
- PM->add(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine()));
+ addPass(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine()));
return true;
}
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
index c9b1636..bfd11a0 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
@@ -98,6 +98,7 @@ public:
MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new MBlazeMCCodeEmitter(MCII, STI, Ctx);
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
index ae82c32..7cc96c6 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
@@ -22,6 +22,7 @@ class MCContext;
class MCCodeEmitter;
class MCInstrInfo;
class MCObjectWriter;
+class MCRegisterInfo;
class MCSubtargetInfo;
class Target;
class StringRef;
@@ -30,6 +31,7 @@ class raw_ostream;
extern Target TheMBlazeTarget;
MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index a8f9b52..f9ecaed 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -23,6 +23,8 @@ add_llvm_target(MSP430CodeGen
MSP430MCInstLower.cpp
)
+add_dependencies(LLVMMSP430CodeGen intrinsics_gen)
+
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 1d1094b..86bc183c 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -154,7 +154,7 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
//===----------------------------------------------------------------------===//
void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
- MSP430MCInstLower MCInstLowering(OutContext, *Mang, *this);
+ MSP430MCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 071a2f7..f8b7e14 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -59,13 +59,13 @@ HWMultMode("msp430-hwmult-mode",
MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
TargetLowering(tm, new TargetLoweringObjectFileELF()),
- Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+ Subtarget(*tm.getSubtargetImpl()) {
TD = getTargetData();
// Set up the register classes.
- addRegisterClass(MVT::i8, MSP430::GR8RegisterClass);
- addRegisterClass(MVT::i16, MSP430::GR16RegisterClass);
+ addRegisterClass(MVT::i8, &MSP430::GR8RegClass);
+ addRegisterClass(MVT::i16, &MSP430::GR16RegClass);
// Compute derived properties from the register classes
computeRegisterProperties();
@@ -226,9 +226,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
default: break;
case 'r': // GENERAL_REGS
if (VT == MVT::i8)
- return std::make_pair(0U, MSP430::GR8RegisterClass);
+ return std::make_pair(0U, &MSP430::GR8RegClass);
- return std::make_pair(0U, MSP430::GR16RegisterClass);
+ return std::make_pair(0U, &MSP430::GR16RegClass);
}
}
@@ -266,14 +266,19 @@ MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
}
SDValue
-MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
// MSP430 target does not yet support tail call optimization.
isTailCall = false;
@@ -310,7 +315,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
assert(!isVarArg && "Varargs not supported yet");
@@ -330,8 +335,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
llvm_unreachable(0);
}
case MVT::i16:
- unsigned VReg =
- RegInfo.createVirtualRegister(MSP430::GR16RegisterClass);
+ unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
@@ -391,7 +395,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
// Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
@@ -445,7 +449,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
@@ -568,7 +572,7 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
@@ -1024,27 +1028,27 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
default: llvm_unreachable("Invalid shift opcode!");
case MSP430::Shl8:
Opc = MSP430::SHL8r1;
- RC = MSP430::GR8RegisterClass;
+ RC = &MSP430::GR8RegClass;
break;
case MSP430::Shl16:
Opc = MSP430::SHL16r1;
- RC = MSP430::GR16RegisterClass;
+ RC = &MSP430::GR16RegClass;
break;
case MSP430::Sra8:
Opc = MSP430::SAR8r1;
- RC = MSP430::GR8RegisterClass;
+ RC = &MSP430::GR8RegClass;
break;
case MSP430::Sra16:
Opc = MSP430::SAR16r1;
- RC = MSP430::GR16RegisterClass;
+ RC = &MSP430::GR16RegClass;
break;
case MSP430::Srl8:
Opc = MSP430::SAR8r1c;
- RC = MSP430::GR8RegisterClass;
+ RC = &MSP430::GR8RegClass;
break;
case MSP430::Srl16:
Opc = MSP430::SAR16r1c;
- RC = MSP430::GR16RegisterClass;
+ RC = &MSP430::GR16RegClass;
break;
}
@@ -1072,8 +1076,8 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
LoopBB->addSuccessor(RemBB);
LoopBB->addSuccessor(LoopBB);
- unsigned ShiftAmtReg = RI.createVirtualRegister(MSP430::GR8RegisterClass);
- unsigned ShiftAmtReg2 = RI.createVirtualRegister(MSP430::GR8RegisterClass);
+ unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass);
+ unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
unsigned ShiftReg = RI.createVirtualRegister(RC);
unsigned ShiftReg2 = RI.createVirtualRegister(RC);
unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg();
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index e372f00..d8ad02f 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -152,12 +152,7 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -174,7 +169,6 @@ namespace llvm {
SelectionDAG &DAG) const;
const MSP430Subtarget &Subtarget;
- const MSP430TargetMachine &TM;
const TargetData *TD;
};
} // namespace llvm
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index c03ba47..be332f0 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
: MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
- RI(tm, *this), TM(tm) {}
+ RI(tm, *this) {}
void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index 04f339b..d79f992 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -42,7 +42,6 @@ namespace MSP430II {
class MSP430InstrInfo : public MSP430GenInstrInfo {
const MSP430RegisterInfo RI;
- MSP430TargetMachine &TM;
public:
explicit MSP430InstrInfo(MSP430TargetMachine &TM);
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 4348dd5..f003574 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -210,13 +210,13 @@ let isCall = 1 in
let Defs = [R12W, R13W, R14W, R15W, SRW],
Uses = [SPW] in {
def CALLi : II16i<0x0,
- (outs), (ins i16imm:$dst, variable_ops),
+ (outs), (ins i16imm:$dst),
"call\t$dst", [(MSP430call imm:$dst)]>;
def CALLr : II16r<0x0,
- (outs), (ins GR16:$dst, variable_ops),
+ (outs), (ins GR16:$dst),
"call\t$dst", [(MSP430call GR16:$dst)]>;
def CALLm : II16m<0x0,
- (outs), (ins memsrc:$dst, variable_ops),
+ (outs), (ins memsrc:$dst),
"call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
}
diff --git a/lib/Target/MSP430/MSP430MCInstLower.h b/lib/Target/MSP430/MSP430MCInstLower.h
index 24151e2..794aa56 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.h
+++ b/lib/Target/MSP430/MSP430MCInstLower.h
@@ -21,18 +21,16 @@ namespace llvm {
class MachineInstr;
class MachineModuleInfoMachO;
class MachineOperand;
- class Mangler;
/// MSP430MCInstLower - This class is used to lower an MachineInstr
/// into an MCInst.
class LLVM_LIBRARY_VISIBILITY MSP430MCInstLower {
MCContext &Ctx;
- Mangler &Mang;
AsmPrinter &Printer;
public:
- MSP430MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
- : Ctx(ctx), Mang(mang), Printer(printer) {}
+ MSP430MCInstLower(MCContext &ctx, AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer) {}
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 51ec71a..aed46a2 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -96,7 +96,8 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
const TargetRegisterClass *
-MSP430RegisterInfo::getPointerRegClass(unsigned Kind) const {
+MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
return &MSP430::GR16RegClass;
}
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 82ee499..9ee0a03 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -39,7 +39,8 @@ public:
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
BitVector getReservedRegs(const MachineFunction &MF) const;
- const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const;
+ const TargetRegisterClass*
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index 3f2eb8c..07619d0 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -78,8 +78,4 @@ def GR16 : RegisterClass<"MSP430", [i16], 16,
// Frame pointer, sometimes allocable
FPW,
// Volatile, but not allocable
- PCW, SPW, SRW, CGW)>
-{
- let SubRegClasses = [(GR8 subreg_8bit)];
-}
-
+ PCW, SPW, SRW, CGW)>;
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 3acf96b..817001d 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -60,12 +60,12 @@ TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) {
bool MSP430PassConfig::addInstSelector() {
// Install an instruction selector.
- PM->add(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel()));
+ addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel()));
return false;
}
bool MSP430PassConfig::addPreEmitPass() {
// Must run branch selection immediately preceding the asm printer.
- PM->add(createMSP430BranchSelectionPass());
+ addPass(createMSP430BranchSelectionPass());
return false;
}
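Both hunks above replace direct pokes at the protected PM member with TargetPassConfig's addPass() hook. A standalone sketch, with simplified hypothetical classes, of why routing every insertion through one method is preferable: the config object gets a single choke point for bookkeeping such as logging or disabling passes.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Pass { std::string Name; std::function<void()> Run; };

class PassConfig {
  std::vector<Pass> Pipeline;            // stands in for PassManagerBase
public:
  void addPass(Pass P) {                 // single choke point for insertion
    std::cout << "scheduling " << P.Name << "\n";
    Pipeline.push_back(std::move(P));
  }
  void run() { for (auto &P : Pipeline) P.Run(); }
};

int main() {
  PassConfig PC;
  PC.addPass({"msp430-isel",   []{ std::cout << "select instructions\n"; }});
  PC.addPass({"branch-select", []{ std::cout << "fix branch ranges\n"; }});
  PC.run();
}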
diff --git a/lib/Target/Mips/AsmParser/CMakeLists.txt b/lib/Target/Mips/AsmParser/CMakeLists.txt
index ac21c25..6c7343b 100644
--- a/lib/Target/Mips/AsmParser/CMakeLists.txt
+++ b/lib/Target/Mips/AsmParser/CMakeLists.txt
@@ -1,6 +1,5 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMMipsAsmParser
MipsAsmParser.cpp
)
+add_dependencies(LLVMMipsAsmParser MipsCommonTableGen)
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 0500c5d..aab8a01 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -13,26 +13,33 @@ tablegen(LLVM MipsGenEDInfo.inc -gen-enhanced-disassembly-info)
add_public_tablegen_target(MipsCommonTableGen)
add_llvm_target(MipsCodeGen
+ Mips16FrameLowering.cpp
+ Mips16InstrInfo.cpp
+ Mips16RegisterInfo.cpp
MipsAnalyzeImmediate.cpp
MipsAsmPrinter.cpp
MipsCodeEmitter.cpp
MipsDelaySlotFiller.cpp
- MipsEmitGPRestore.cpp
- MipsExpandPseudo.cpp
MipsJITInfo.cpp
MipsInstrInfo.cpp
MipsISelDAGToDAG.cpp
MipsISelLowering.cpp
MipsFrameLowering.cpp
+ MipsLongBranch.cpp
MipsMCInstLower.cpp
MipsMachineFunction.cpp
MipsRegisterInfo.cpp
+ MipsSEFrameLowering.cpp
+ MipsSEInstrInfo.cpp
+ MipsSERegisterInfo.cpp
MipsSubtarget.cpp
MipsTargetMachine.cpp
MipsTargetObjectFile.cpp
MipsSelectionDAGInfo.cpp
)
+add_dependencies(LLVMMipsCodeGen intrinsics_gen)
+
add_subdirectory(InstPrinter)
add_subdirectory(Disassembler)
add_subdirectory(TargetInfo)
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 78dbc06..042b456 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -13,136 +13,87 @@
#include "Mips.h"
#include "MipsSubtarget.h"
+#include "MipsRegisterInfo.h"
#include "llvm/MC/EDInstInfo.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
-
#include "MipsGenEDInfo.inc"
using namespace llvm;
typedef MCDisassembler::DecodeStatus DecodeStatus;
-/// MipsDisassembler - a disasembler class for Mips32.
-class MipsDisassembler : public MCDisassembler {
+namespace {
+
+/// MipsDisassemblerBase - a disassembler class for Mips.
+class MipsDisassemblerBase : public MCDisassembler {
public:
/// Constructor - Initializes the disassembler.
///
- MipsDisassembler(const MCSubtargetInfo &STI, bool bigEndian) :
- MCDisassembler(STI), isBigEndian(bigEndian) {
- }
-
- ~MipsDisassembler() {
- }
+ MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+ bool bigEndian) :
+ MCDisassembler(STI), RegInfo(Info), isBigEndian(bigEndian) {}
- /// getInstruction - See MCDisassembler.
- DecodeStatus getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
- raw_ostream &vStream,
- raw_ostream &cStream) const;
+ virtual ~MipsDisassemblerBase() {}
/// getEDInfo - See MCDisassembler.
const EDInstInfo *getEDInfo() const;
+ const MCRegisterInfo *getRegInfo() const { return RegInfo; }
+
private:
+ const MCRegisterInfo *RegInfo;
+protected:
bool isBigEndian;
};
-
-/// Mips64Disassembler - a disasembler class for Mips64.
-class Mips64Disassembler : public MCDisassembler {
+/// MipsDisassembler - a disassembler class for Mips32.
+class MipsDisassembler : public MipsDisassemblerBase {
public:
/// Constructor - Initializes the disassembler.
///
- Mips64Disassembler(const MCSubtargetInfo &STI, bool bigEndian) :
- MCDisassembler(STI), isBigEndian(bigEndian) {
- }
-
- ~Mips64Disassembler() {
- }
+ MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+ bool bigEndian) :
+ MipsDisassemblerBase(STI, Info, bigEndian) {}
/// getInstruction - See MCDisassembler.
- DecodeStatus getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
- raw_ostream &vStream,
- raw_ostream &cStream) const;
-
- /// getEDInfo - See MCDisassembler.
- const EDInstInfo *getEDInfo() const;
-
-private:
- bool isBigEndian;
+ virtual DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
};
-const EDInstInfo *MipsDisassembler::getEDInfo() const {
- return instInfoMips;
-}
-
-const EDInstInfo *Mips64Disassembler::getEDInfo() const {
- return instInfoMips;
-}
-
-// Decoder tables for Mips register
-static const unsigned CPURegsTable[] = {
- Mips::ZERO, Mips::AT, Mips::V0, Mips::V1,
- Mips::A0, Mips::A1, Mips::A2, Mips::A3,
- Mips::T0, Mips::T1, Mips::T2, Mips::T3,
- Mips::T4, Mips::T5, Mips::T6, Mips::T7,
- Mips::S0, Mips::S1, Mips::S2, Mips::S3,
- Mips::S4, Mips::S5, Mips::S6, Mips::S7,
- Mips::T8, Mips::T9, Mips::K0, Mips::K1,
- Mips::GP, Mips::SP, Mips::FP, Mips::RA
-};
-static const unsigned FGR32RegsTable[] = {
- Mips::F0, Mips::F1, Mips::F2, Mips::F3,
- Mips::F4, Mips::F5, Mips::F6, Mips::F7,
- Mips::F8, Mips::F9, Mips::F10, Mips::F11,
- Mips::F12, Mips::F13, Mips::F14, Mips::F15,
- Mips::F16, Mips::F17, Mips::F18, Mips::F18,
- Mips::F20, Mips::F21, Mips::F22, Mips::F23,
- Mips::F24, Mips::F25, Mips::F26, Mips::F27,
- Mips::F28, Mips::F29, Mips::F30, Mips::F31
-};
+/// Mips64Disassembler - a disassembler class for Mips64.
+class Mips64Disassembler : public MipsDisassemblerBase {
+public:
+ /// Constructor - Initializes the disassembler.
+ ///
+ Mips64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+ bool bigEndian) :
+ MipsDisassemblerBase(STI, Info, bigEndian) {}
-static const unsigned CPU64RegsTable[] = {
- Mips::ZERO_64, Mips::AT_64, Mips::V0_64, Mips::V1_64,
- Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
- Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64,
- Mips::T4_64, Mips::T5_64, Mips::T6_64, Mips::T7_64,
- Mips::S0_64, Mips::S1_64, Mips::S2_64, Mips::S3_64,
- Mips::S4_64, Mips::S5_64, Mips::S6_64, Mips::S7_64,
- Mips::T8_64, Mips::T9_64, Mips::K0_64, Mips::K1_64,
- Mips::GP_64, Mips::SP_64, Mips::FP_64, Mips::RA_64
+ /// getInstruction - See MCDisassembler.
+ virtual DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
};
-static const unsigned FGR64RegsTable[] = {
- Mips::D0_64, Mips::D1_64, Mips::D2_64, Mips::D3_64,
- Mips::D4_64, Mips::D5_64, Mips::D6_64, Mips::D7_64,
- Mips::D8_64, Mips::D9_64, Mips::D10_64, Mips::D11_64,
- Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
- Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64,
- Mips::D20_64, Mips::D21_64, Mips::D22_64, Mips::D23_64,
- Mips::D24_64, Mips::D25_64, Mips::D26_64, Mips::D27_64,
- Mips::D28_64, Mips::D29_64, Mips::D30_64, Mips::D31_64
-};
+} // end anonymous namespace
-static const unsigned AFGR64RegsTable[] = {
- Mips::D0, Mips::D1, Mips::D2, Mips::D3,
- Mips::D4, Mips::D5, Mips::D6, Mips::D7,
- Mips::D8, Mips::D9, Mips::D10, Mips::D11,
- Mips::D12, Mips::D13, Mips::D14, Mips::D15
-};
+const EDInstInfo *MipsDisassemblerBase::getEDInfo() const {
+ return instInfoMips;
+}
// Forward declare these because the autogenerated code will reference them.
// Definitions are further down.
@@ -239,25 +190,25 @@ extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
static MCDisassembler *createMipsDisassembler(
const Target &T,
const MCSubtargetInfo &STI) {
- return new MipsDisassembler(STI,true);
+ return new MipsDisassembler(STI, T.createMCRegInfo(""), true);
}
static MCDisassembler *createMipselDisassembler(
const Target &T,
const MCSubtargetInfo &STI) {
- return new MipsDisassembler(STI,false);
+ return new MipsDisassembler(STI, T.createMCRegInfo(""), false);
}
static MCDisassembler *createMips64Disassembler(
const Target &T,
const MCSubtargetInfo &STI) {
- return new Mips64Disassembler(STI,true);
+ return new Mips64Disassembler(STI, T.createMCRegInfo(""), true);
}
static MCDisassembler *createMips64elDisassembler(
const Target &T,
const MCSubtargetInfo &STI) {
- return new Mips64Disassembler(STI, false);
+ return new Mips64Disassembler(STI, T.createMCRegInfo(""), false);
}
extern "C" void LLVMInitializeMipsDisassembler() {
@@ -362,6 +313,11 @@ Mips64Disassembler::getInstruction(MCInst &instr,
return MCDisassembler::Fail;
}
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+ const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D);
+ return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo);
+}
+
static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -370,7 +326,8 @@ static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst,
if (RegNo > 31)
return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::CreateReg(CPU64RegsTable[RegNo]));
+ unsigned Reg = getReg(Decoder, Mips::CPU64RegsRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
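The getReg() helper above is what lets the patch delete the per-class static register tables: the decoder now indexes into the register class supplied by MCRegisterInfo, which assumes the class's register order matches the hardware encoding order. A standalone sketch of the lookup with stand-in types (not the real MCRegisterInfo API):

#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical stand-ins for MCRegisterClass / MCRegisterInfo.
struct RegClass { std::vector<unsigned> Regs;
                  const unsigned *begin() const { return Regs.data(); } };
struct RegInfo  { std::vector<RegClass> Classes;
                  const RegClass &getRegClass(unsigned RC) const {
                    return Classes[RC]; } };

static unsigned getReg(const RegInfo &RI, unsigned RC, unsigned RegNo) {
  // Same shape as the patch: *(getRegClass(RC).begin() + RegNo).
  return *(RI.getRegClass(RC).begin() + RegNo);
}

int main() {
  enum { CPURegsClassID = 0 };
  RegInfo RI;
  RI.Classes.push_back({{100, 101, 102, 103}}); // enum values for $zero..$v1, say
  unsigned Encoded = 2;                         // 5-bit field from the instruction
  assert(getReg(RI, CPURegsClassID, Encoded) == 102);
  std::printf("field %u -> register enum %u\n", Encoded,
              getReg(RI, CPURegsClassID, Encoded));
}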
@@ -380,8 +337,8 @@ static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst,
const void *Decoder) {
if (RegNo > 31)
return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::CreateReg(CPURegsTable[RegNo]));
+ unsigned Reg = getReg(Decoder, Mips::CPURegsRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
@@ -392,7 +349,8 @@ static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
if (RegNo > 31)
return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[RegNo]));
+ unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
@@ -403,7 +361,8 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
if (RegNo > 31)
return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::CreateReg(FGR32RegsTable[RegNo]));
+ unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
@@ -420,15 +379,18 @@ static DecodeStatus DecodeMem(MCInst &Inst,
uint64_t Address,
const void *Decoder) {
int Offset = SignExtend32<16>(Insn & 0xffff);
- int Reg = (int)fieldFromInstruction32(Insn, 16, 5);
- int Base = (int)fieldFromInstruction32(Insn, 21, 5);
+ unsigned Reg = fieldFromInstruction32(Insn, 16, 5);
+ unsigned Base = fieldFromInstruction32(Insn, 21, 5);
+
+ Reg = getReg(Decoder, Mips::CPURegsRegClassID, Reg);
+ Base = getReg(Decoder, Mips::CPURegsRegClassID, Base);
if(Inst.getOpcode() == Mips::SC){
- Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg]));
+ Inst.addOperand(MCOperand::CreateReg(Reg));
}
- Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg]));
- Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base]));
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ Inst.addOperand(MCOperand::CreateReg(Base));
Inst.addOperand(MCOperand::CreateImm(Offset));
return MCDisassembler::Success;
@@ -439,11 +401,14 @@ static DecodeStatus DecodeFMem(MCInst &Inst,
uint64_t Address,
const void *Decoder) {
int Offset = SignExtend32<16>(Insn & 0xffff);
- int Reg = (int)fieldFromInstruction32(Insn, 16, 5);
- int Base = (int)fieldFromInstruction32(Insn, 21, 5);
+ unsigned Reg = fieldFromInstruction32(Insn, 16, 5);
+ unsigned Base = fieldFromInstruction32(Insn, 21, 5);
- Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[Reg]));
- Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base]));
+ Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg);
+ Base = getReg(Decoder, Mips::CPURegsRegClassID, Base);
+
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ Inst.addOperand(MCOperand::CreateReg(Base));
Inst.addOperand(MCOperand::CreateImm(Offset));
return MCDisassembler::Success;
@@ -474,10 +439,12 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > 31)
+ if (RegNo > 30 || RegNo % 2)
return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::CreateReg(AFGR64RegsTable[RegNo]));
+ unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
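The tightened guard reflects that AFGR64 registers are pairs of even/odd 32-bit FPU registers, so only the even raw encodings 0-30 are valid and the class index is RegNo / 2. A quick standalone check of the guard:

#include <cstdio>

static bool decodeAFGR64(unsigned RegNo, unsigned &ClassIdx) {
  if (RegNo > 30 || RegNo % 2)   // odd or out-of-range encodings fail
    return false;
  ClassIdx = RegNo / 2;          // D0 <- 0, D1 <- 2, ..., D15 <- 30
  return true;
}

int main() {
  unsigned Idx;
  std::printf("4  -> %s (D%u, expected D2)\n",
              decodeAFGR64(4, Idx) ? "ok" : "fail", Idx);
  std::printf("5  -> %s (odd encodings are invalid)\n",
              decodeAFGR64(5, Idx) ? "ok" : "fail");
  std::printf("32 -> %s (out of range)\n",
              decodeAFGR64(32, Idx) ? "ok" : "fail");
}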
@@ -488,7 +455,7 @@ static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst,
//Currently only hardware register 29 is supported
if (RegNo != 29)
return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::CreateReg(Mips::HWR29));
+ Inst.addOperand(MCOperand::CreateReg(Mips::HWR29_64));
return MCDisassembler::Success;
}
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 6886b17..b38463d 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -13,6 +13,7 @@
#define DEBUG_TYPE "asm-printer"
#include "MipsInstPrinter.h"
+#include "MipsInstrInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -68,8 +69,25 @@ void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot) {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case Mips::RDHWR:
+ case Mips::RDHWR64:
+ O << "\t.set\tpush\n";
+ O << "\t.set\tmips32r2\n";
+ }
+
printInstruction(MI, O);
printAnnotation(O, Annot);
+
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case Mips::RDHWR:
+ case Mips::RDHWR64:
+ O << "\n\t.set\tpop";
+ }
}
static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
@@ -108,6 +126,8 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
case MCSymbolRefExpr::VK_Mips_GOT_DISP: OS << "%got_disp("; break;
case MCSymbolRefExpr::VK_Mips_GOT_PAGE: OS << "%got_page("; break;
case MCSymbolRefExpr::VK_Mips_GOT_OFST: OS << "%got_ofst("; break;
+ case MCSymbolRefExpr::VK_Mips_HIGHER: OS << "%higher("; break;
+ case MCSymbolRefExpr::VK_Mips_HIGHEST: OS << "%highest("; break;
}
OS << SRE->getSymbol();
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 76b839b..3d8a6f9 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -16,7 +16,7 @@
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
-// These enumeration declarations were orignally in MipsInstrInfo.h but
+// These enumeration declarations were originally in MipsInstrInfo.h but
// had to be moved here to avoid circular dependencies between
// LLVMMipsCodeGen and LLVMMipsAsmPrinter.
namespace Mips {
diff --git a/lib/Target/Mips/MCTargetDesc/Makefile b/lib/Target/Mips/MCTargetDesc/Makefile
index 7fe2086..22a2721 100644
--- a/lib/Target/Mips/MCTargetDesc/Makefile
+++ b/lib/Target/Mips/MCTargetDesc/Makefile
@@ -14,3 +14,4 @@ LIBRARYNAME = LLVMMipsDesc
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 9b4caf6..18961fd 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -35,7 +35,13 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
return 0;
case FK_GPRel_4:
case FK_Data_4:
+ case FK_Data_8:
case Mips::fixup_Mips_LO16:
+ case Mips::fixup_Mips_GPOFF_HI:
+ case Mips::fixup_Mips_GPOFF_LO:
+ case Mips::fixup_Mips_GOT_PAGE:
+ case Mips::fixup_Mips_GOT_OFST:
+ case Mips::fixup_Mips_GOT_DISP:
break;
case Mips::fixup_Mips_PC16:
// So far we are only using this type for branches.
@@ -54,9 +60,17 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
break;
case Mips::fixup_Mips_HI16:
case Mips::fixup_Mips_GOT_Local:
- // Get the higher 16-bits. Also add 1 if bit 15 is 1.
+ // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
Value = ((Value + 0x8000) >> 16) & 0xffff;
break;
+ case Mips::fixup_Mips_HIGHER:
+ // Get the 3rd 16-bits.
+ Value = ((Value + 0x80008000LL) >> 32) & 0xffff;
+ break;
+ case Mips::fixup_Mips_HIGHEST:
+ // Get the 4th 16-bits.
+ Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
+ break;
}
return Value;
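The rounding constants deserve a worked example: adding 0x8000 at each lower 16-bit boundary propagates a carry whenever bit 15/31/47 of the discarded remainder is set, so reassembling the pieces with sign-extended lower halves reproduces the original address. A standalone demonstration of all four extractions:

#include <cstdint>
#include <cstdio>

static uint64_t lo(uint64_t V)      { return V & 0xffff; }
static uint64_t hi(uint64_t V)      { return ((V + 0x8000) >> 16) & 0xffff; }
static uint64_t higher(uint64_t V)  { return ((V + 0x80008000ULL) >> 32) & 0xffff; }
static uint64_t highest(uint64_t V) { return ((V + 0x800080008000ULL) >> 48) & 0xffff; }

int main() {
  uint64_t Addr = 0x123456789abcdef0ULL;
  // Bits 15, 31 and 47 of Addr are set, so hi and higher pick up a carry:
  // prints lo=def0 hi=9abd higher=5679 highest=1234.
  std::printf("lo=%04llx hi=%04llx higher=%04llx highest=%04llx\n",
              (unsigned long long)lo(Addr), (unsigned long long)hi(Addr),
              (unsigned long long)higher(Addr),
              (unsigned long long)highest(Addr));
}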
@@ -74,7 +88,8 @@ public:
:MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), Is64Bit(_is64Bit) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createMipsELFObjectWriter(OS, OSType, IsLittle, Is64Bit);
+ return createMipsELFObjectWriter(OS,
+ MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
}
/// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
@@ -115,7 +130,8 @@ public:
CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
}
- uint64_t Mask = ((uint64_t)(-1) >> (64 - getFixupKindInfo(Kind).TargetSize));
+ uint64_t Mask = ((uint64_t)(-1) >>
+ (64 - getFixupKindInfo(Kind).TargetSize));
CurVal |= Value & Mask;
// Write out the fixed up bytes back to the code/data bits.
@@ -156,7 +172,14 @@ public:
{ "fixup_Mips_TLSLDM", 0, 16, 0 },
{ "fixup_Mips_DTPREL_HI", 0, 16, 0 },
{ "fixup_Mips_DTPREL_LO", 0, 16, 0 },
- { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel }
+ { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Mips_GPOFF_HI", 0, 16, 0 },
+ { "fixup_Mips_GPOFF_LO", 0, 16, 0 },
+ { "fixup_Mips_GOT_PAGE", 0, 16, 0 },
+ { "fixup_Mips_GOT_OFST", 0, 16, 0 },
+ { "fixup_Mips_GOT_DISP", 0, 16, 0 },
+ { "fixup_Mips_HIGHER", 0, 16, 0 },
+ { "fixup_Mips_HIGHEST", 0, 16, 0 }
};
if (Kind < FirstTargetFixupKind)
@@ -206,6 +229,14 @@ public:
///
/// \return - True on success.
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ // Reject byte counts that are not a multiple of the 4-byte instruction size.
+ // FIXME: 16 bit instructions are not handled yet here.
+ // We shouldn't be using a hard coded number for instruction size.
+ if (Count % 4) return false;
+
+ uint64_t NumNops = Count / 4;
+ for (uint64_t i = 0; i != NumNops; ++i)
+ OW->Write32(0);
return true;
}
}; // class MipsAsmBackend
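writeNopData() now actually emits padding: the MIPS nop encodes as the all-zero 32-bit word, so Count/4 zero words are written and byte counts that are not a multiple of 4 are rejected (per the FIXME, 16-bit forms are not handled yet). A standalone sketch of the same logic writing into a byte buffer:

#include <cstdint>
#include <cstdio>
#include <vector>

static bool writeNopData(uint64_t Count, std::vector<uint8_t> &Out) {
  if (Count % 4)                             // not a whole number of nop words
    return false;
  for (uint64_t I = 0; I != Count / 4; ++I)
    Out.insert(Out.end(), {0, 0, 0, 0});     // one all-zero nop word
  return true;
}

int main() {
  std::vector<uint8_t> Buf;
  std::printf("8 bytes -> %s, buffer now %zu bytes\n",
              writeNopData(8, Buf) ? "ok" : "fail", Buf.size());
  std::printf("6 bytes -> %s\n", writeNopData(6, Buf) ? "ok" : "fail");
}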
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index fb1c5ce..234455e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -79,7 +79,12 @@ namespace MipsII {
MO_GPOFF_LO,
MO_GOT_DISP,
MO_GOT_PAGE,
- MO_GOT_OFST
+ MO_GOT_OFST,
+
+ /// MO_HIGHER/HIGHEST - Represents the highest or higher half word of a
+ /// 64-bit symbol address.
+ MO_HIGHER,
+ MO_HIGHEST
};
enum {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 2091bec..8e84b3f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -34,7 +34,7 @@ namespace {
class MipsELFObjectWriter : public MCELFObjectTargetWriter {
public:
- MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI);
+ MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64);
virtual ~MipsELFObjectWriter();
@@ -52,9 +52,11 @@ namespace {
};
}
-MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI)
+MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+ bool _isN64)
: MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
- /*HasRelocationAddend*/ false) {}
+ /*HasRelocationAddend*/ false,
+ /*IsN64*/ _isN64) {}
MipsELFObjectWriter::~MipsELFObjectWriter() {}
@@ -101,6 +103,9 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
case FK_Data_4:
Type = ELF::R_MIPS_32;
break;
+ case FK_Data_8:
+ Type = ELF::R_MIPS_64;
+ break;
case FK_GPRel_4:
Type = ELF::R_MIPS_GPREL32;
break;
@@ -148,8 +153,32 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
case Mips::fixup_Mips_PC16:
Type = ELF::R_MIPS_PC16;
break;
+ case Mips::fixup_Mips_GOT_PAGE:
+ Type = ELF::R_MIPS_GOT_PAGE;
+ break;
+ case Mips::fixup_Mips_GOT_OFST:
+ Type = ELF::R_MIPS_GOT_OFST;
+ break;
+ case Mips::fixup_Mips_GOT_DISP:
+ Type = ELF::R_MIPS_GOT_DISP;
+ break;
+ case Mips::fixup_Mips_GPOFF_HI:
+ Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
+ break;
+ case Mips::fixup_Mips_GPOFF_LO:
+ Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+ Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+ Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
+ break;
+ case Mips::fixup_Mips_HIGHER:
+ Type = ELF::R_MIPS_HIGHER;
+ break;
+ case Mips::fixup_Mips_HIGHEST:
+ Type = ELF::R_MIPS_HIGHEST;
+ break;
}
-
return Type;
}
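The GPOFF cases build one of the N64 ABI's composed relocations: up to three relocation types ride in a single entry, applied innermost first (GPREL16, then SUB, then HI16 or LO16). A standalone sketch of the packing, assuming the common byte layout with r_type in the low byte and r_type2/r_type3 above it; that field layout is an illustration here, not quoted from the ABI document:

#include <cstdio>

enum : unsigned {            // ELF r_type values used by the patch
  R_MIPS_GPREL16 = 7, R_MIPS_SUB = 36, R_MIPS_HI16 = 5, R_MIPS_LO16 = 6
};

static unsigned setRType (unsigned V, unsigned T) { return (T & ~0x0000ffu) | (V & 0xff); }
static unsigned setRType2(unsigned V, unsigned T) { return (T & ~0x00ff00u) | ((V & 0xff) << 8); }
static unsigned setRType3(unsigned V, unsigned T) { return (T & ~0xff0000u) | ((V & 0xff) << 16); }

int main() {
  unsigned Type = 0;
  Type = setRType (R_MIPS_GPREL16, Type);   // applied first
  Type = setRType2(R_MIPS_SUB,     Type);   // then subtract
  Type = setRType3(R_MIPS_HI16,    Type);   // then take the high half
  std::printf("composed r_type word: 0x%06x\n", Type);  // 0x052407
}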
@@ -184,10 +213,10 @@ static int CompareOffset(const RelEntry &R0, const RelEntry &R1) {
void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
std::vector<ELFRelocationEntry> &Relocs) {
- // Call the defualt function first. Relocations are sorted in descending
+ // Call the default function first. Relocations are sorted in descending
// order of r_offset.
MCELFObjectTargetWriter::sortRelocs(Asm, Relocs);
-
+
RelLs RelocLs;
std::vector<RelLsIter> Unmatched;
@@ -244,6 +273,7 @@ MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS,
uint8_t OSABI,
bool IsLittleEndian,
bool Is64Bit) {
- MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI);
+ MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI,
+ Is64Bit);
return createELFObjectWriter(MOTW, OS, IsLittleEndian);
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 9b76eda..77faec5 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -95,6 +95,27 @@ namespace Mips {
// PC relative branch fixup resulting in - R_MIPS_PC16
fixup_Mips_Branch_PCRel,
+ // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+ fixup_Mips_GPOFF_HI,
+
+ // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+ fixup_Mips_GPOFF_LO,
+
+ // resulting in - R_MIPS_GOT_PAGE
+ fixup_Mips_GOT_PAGE,
+
+ // resulting in - R_MIPS_GOT_OFST
+ fixup_Mips_GOT_OFST,
+
+ // resulting in - R_MIPS_GOT_DISP
+ fixup_Mips_GOT_DISP,
+
+ // resulting in - R_MIPS_HIGHER
+ fixup_Mips_HIGHER,
+
+ // resulting in - R_MIPS_HIGHEST
+ fixup_Mips_HIGHEST,
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
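Target fixup kinds are numbered consecutively from FirstTargetFixupKind, and the MCFixupKindInfo table in MipsAsmBackend is indexed by Kind - FirstTargetFixupKind, which is why the seven new enumerators above arrive together with seven new table rows in the same order. A standalone sketch of that invariant (the enum values are illustrative):

#include <cassert>
#include <cstdio>

enum Fixups { FirstTargetFixupKind = 128,          // value is illustrative
              fixup_Mips_HIGHER = FirstTargetFixupKind,
              fixup_Mips_HIGHEST,
              LastTargetFixupKind,
              NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind };

struct FixupInfo { const char *Name; unsigned Offset, Size, Flags; };

int main() {
  static const FixupInfo Infos[NumTargetFixupKinds] = {
      {"fixup_Mips_HIGHER",  0, 16, 0},            // must match enum order
      {"fixup_Mips_HIGHEST", 0, 16, 0},
  };
  Fixups Kind = fixup_Mips_HIGHEST;
  assert(Kind >= FirstTargetFixupKind && Kind < LastTargetFixupKind);
  std::printf("%s\n", Infos[Kind - FirstTargetFixupKind].Name);
}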
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 4ed2be0..8dab62d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -91,6 +91,7 @@ public:
} // namespace
MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx)
{
@@ -98,6 +99,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
}
MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx)
{
@@ -179,7 +181,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
} else if (MO.isFPImm()) {
return static_cast<unsigned>(APFloat(MO.getFPImm())
.bitcastToAPInt().getHiBits(32).getLimitedValue());
- }
+ }
// MO must be an Expr.
assert(MO.isExpr());
@@ -193,10 +195,27 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
}
assert (Kind == MCExpr::SymbolRef);
-
+
Mips::Fixups FixupKind = Mips::Fixups(0);
switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
+ default: llvm_unreachable("Unknown fixup kind!");
+ break;
+ case MCSymbolRefExpr::VK_Mips_GPOFF_HI :
+ FixupKind = Mips::fixup_Mips_GPOFF_HI;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GPOFF_LO :
+ FixupKind = Mips::fixup_Mips_GPOFF_LO;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
+ FixupKind = Mips::fixup_Mips_GOT_PAGE;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_OFST :
+ FixupKind = Mips::fixup_Mips_GOT_OFST;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_DISP :
+ FixupKind = Mips::fixup_Mips_GOT_DISP;
+ break;
case MCSymbolRefExpr::VK_Mips_GPREL:
FixupKind = Mips::fixup_Mips_GPREL16;
break;
@@ -236,7 +255,11 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
case MCSymbolRefExpr::VK_Mips_TPREL_LO:
FixupKind = Mips::fixup_Mips_TPREL_LO;
break;
- default:
+ case MCSymbolRefExpr::VK_Mips_HIGHER:
+ FixupKind = Mips::fixup_Mips_HIGHER;
+ break;
+ case MCSymbolRefExpr::VK_Mips_HIGHEST:
+ FixupKind = Mips::fixup_Mips_HIGHEST;
break;
} // switch
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 547ccdd..bfcc2a2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -22,6 +22,7 @@ class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCObjectWriter;
+class MCRegisterInfo;
class MCSubtargetInfo;
class StringRef;
class Target;
@@ -33,9 +34,11 @@ extern Target TheMips64Target;
extern Target TheMips64elTarget;
MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index bafadc8..2963f7e 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -24,9 +24,7 @@ namespace llvm {
FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
- FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM);
- FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM);
-
+ FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
JITCodeEmitter &JCE);
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index cbebe84..8548ae0 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -72,6 +72,9 @@ def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion",
"Mips64r2", "Mips64r2 ISA Support",
[FeatureMips64, FeatureMips32r2]>;
+def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true",
+ "Mips16 mode">;
+
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
@@ -83,6 +86,7 @@ def : Proc<"mips32", [FeatureMips32]>;
def : Proc<"mips32r2", [FeatureMips32r2]>;
def : Proc<"mips64", [FeatureMips64]>;
def : Proc<"mips64r2", [FeatureMips64r2]>;
+def : Proc<"mips16", [FeatureMips16]>;
def MipsAsmWriter : AsmWriter {
string AsmWriterClassName = "InstPrinter";
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
new file mode 100644
index 0000000..030042f
--- /dev/null
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -0,0 +1,87 @@
+//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16FrameLowering.h"
+#include "MipsInstrInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MipsInstrInfo &TII =
+ *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo());
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ uint64_t StackSize = MFI->getStackSize();
+
+ // No need to allocate space on the stack.
+ if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+ // Adjust stack.
+ if (isInt<16>(-StackSize))
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::SaveRaF16)).addImm(StackSize);
+}
+
+void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MipsInstrInfo &TII =
+ *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo());
+ DebugLoc dl = MBBI->getDebugLoc();
+ uint64_t StackSize = MFI->getStackSize();
+
+ if (!StackSize)
+ return;
+
+ // Adjust stack.
+ if (isInt<16>(StackSize))
+ // assumes stacksize multiple of 8
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::RestoreRaF16)).addImm(StackSize);
+}
+
+bool Mips16FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ // FIXME: implement.
+ return true;
+}
+
+bool
+Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ // FIXME: implement.
+ return true;
+}
+
+void Mips16FrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+}
+
+const MipsFrameLowering *
+llvm::createMips16FrameLowering(const MipsSubtarget &ST) {
+ return new Mips16FrameLowering(ST);
+}
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
new file mode 100644
index 0000000..25cc37b
--- /dev/null
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -0,0 +1,43 @@
+//===-- Mips16FrameLowering.h - Mips16 frame lowering ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips16 implementation of the TargetFrameLowering
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16_FRAMEINFO_H
+#define MIPS16_FRAMEINFO_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+class Mips16FrameLowering : public MipsFrameLowering {
+public:
+ explicit Mips16FrameLowering(const MipsSubtarget &STI)
+ : MipsFrameLowering(STI) {}
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td
new file mode 100644
index 0000000..61602b6
--- /dev/null
+++ b/lib/Target/Mips/Mips16InstrFormats.td
@@ -0,0 +1,663 @@
+//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe MIPS instructions format
+//
+// CPU INSTRUCTION FORMATS
+//
+// funct or f Function field
+//
+// immediate or imm 4-, 5-, 8- or 11-bit immediate, branch displacement,
+// or address displacement
+//
+// op 5-bit major operation code
+//
+// rx 3-bit source or destination register
+//
+// ry 3-bit source or destination register
+//
+// rz 3-bit source or destination register
+//
+// sa 3- or 5-bit shift amount
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+//
+class Format16<bits<5> val> {
+ bits<5> Value = val;
+}
+
+def Pseudo16 : Format16<0>;
+def FrmI16 : Format16<1>;
+def FrmRI16 : Format16<2>;
+def FrmRR16 : Format16<3>;
+def FrmRRI16 : Format16<4>;
+def FrmRRR16 : Format16<5>;
+def FrmRRI_A16 : Format16<6>;
+def FrmSHIFT16 : Format16<7>;
+def FrmI8_TYPE16 : Format16<8>;
+def FrmI8_MOVR3216 : Format16<9>;
+def FrmI8_MOV32R16 : Format16<10>;
+def FrmI8_SVRS16 : Format16<11>;
+def FrmJAL16 : Format16<12>;
+def FrmJALX16 : Format16<13>;
+def FrmEXT_I16 : Format16<14>;
+def FrmASMACRO16 : Format16<15>;
+def FrmEXT_RI16 : Format16<16>;
+def FrmEXT_RRI16 : Format16<17>;
+def FrmEXT_RRI_A16 : Format16<18>;
+def FrmEXT_SHIFT16 : Format16<19>;
+def FrmEXT_I816 : Format16<20>;
+def FrmEXT_I8_SVRS16 : Format16<21>;
+def FrmOther16 : Format16<22>; // Instruction w/ a custom format
+
+// Base class for Mips 16 Format
+// This class does not depend on the instruction size
+//
+class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format16 f>: Instruction
+{
+ Format16 Form = f;
+
+ let Namespace = "Mips";
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Itinerary = itin;
+
+ //
+ // Attributes specific to Mips instructions...
+ //
+ bits<5> FormBits = Form.Value;
+
+ // TSFlags layout should be kept in sync with MipsInstrInfo.h.
+ let TSFlags{4-0} = FormBits;
+
+ let Predicates = [InMips16Mode];
+}
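Per the comment above, bits 4-0 of TSFlags carry the Format16 value, so consumers can recover the instruction format with a 5-bit mask. A standalone sketch of the read-back (enumerator values illustrative):

#include <cstdio>

enum Format16 { Pseudo16 = 0, FrmI16 = 1, FrmRI16 = 2, FrmRR16 = 3 };

static Format16 getFormat(unsigned long long TSFlags) {
  return static_cast<Format16>(TSFlags & 0x1f);   // TSFlags{4-0} = FormBits
}

int main() {
  unsigned long long Flags = FrmRR16;             // as emitted by tablegen
  std::printf("format field = %d (FrmRR16 = %d)\n", getFormat(Flags), FrmRR16);
}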
+
+//
+// Generic Mips 16 Format
+//
+class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format16 f>:
+ MipsInst16_Base<outs, ins, asmstr, pattern, itin, f>
+{
+ field bits<16> Inst;
+ bits<5> Opcode = 0;
+
+ // Top 5 bits are the 'opcode' field
+ let Inst{15-11} = Opcode;
+}
+
+//
+// For 32 bit extended instruction forms.
+//
+class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format16 f>:
+ MipsInst16_Base<outs, ins, asmstr, pattern, itin, f>
+{
+ field bits<32> Inst;
+
+}
+
+class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format16 f>:
+ MipsInst16_32<outs, ins, asmstr, pattern, itin, f>
+{
+ let Inst{31-27} = 0b11110;
+}
+
+
+
+// Mips Pseudo Instructions Format
+class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>:
+ MipsInst16<outs, ins, asmstr, pattern, IIPseudo, Pseudo16> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|imm11|>
+//===----------------------------------------------------------------------===//
+
+class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmI16>
+{
+ bits<11> imm11;
+
+ let Opcode = op;
+
+ let Inst{10-0} = imm11;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RI instruction class in Mips : <|opcode|rx|imm8|>
+//===----------------------------------------------------------------------===//
+
+class FRI16<bits<5> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRI16>
+{
+ bits<3> rx;
+ bits<8> imm8;
+
+ let Opcode = op;
+
+ let Inst{10-8} = rx;
+ let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RR instruction class in Mips : <|opcode|rx|ry|funct|>
+//===----------------------------------------------------------------------===//
+
+class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<5> funct;
+
+ let Opcode = 0b11101;
+ let funct = _funct;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-0} = funct;
+}
+
+//
+// For conversion functions.
+//
+class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16>
+{
+ bits<3> rx;
+ bits<3> subfunct;
+ bits<5> funct;
+
+ let Opcode = 0b11101; // RR
+ let funct = _funct;
+ let subfunct = _subfunct;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = subfunct;
+ let Inst{4-0} = funct;
+}
+
+//
+// just used for breakpoint (hardware and software) instructions.
+//
+class FC16<bits<5> _funct, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16>
+{
+ bits<6> _code; // code is a keyword in tablegen
+ bits<5> funct;
+
+ let Opcode = 0b11101; // RR
+ let funct = _funct;
+
+ let Inst{10-5} = _code;
+ let Inst{4-0} = funct;
+}
+
+//
+// J(AL)R(C) subformat
+//
+class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a,
+ dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16>
+{
+ bits<3> rx;
+ bits<1> nd;
+ bits<1> l;
+ bits<1> ra;
+
+ let nd = _nd;
+ let l = _l;
+ let ra = r_a;
+
+ let Opcode = 0b11101;
+
+ let Inst{10-8} = rx;
+ let Inst{7} = nd;
+ let Inst{6} = l;
+ let Inst{5} = ra;
+ let Inst{4-0} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI instruction class in Mips : <|opcode|rx|ry|imm5|>
+//===----------------------------------------------------------------------===//
+
+class FRRI16<bits<5> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI16>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<5> imm5;
+
+ let Opcode = op;
+
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-0} = imm5;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRR instruction class in Mips : <|opcode|rx|ry|rz|f|>
+//===----------------------------------------------------------------------===//
+
+class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRR16>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<3> rz;
+ bits<2> f;
+
+ let Opcode = 0b11100;
+ let f = _f;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-2} = rz;
+ let Inst{1-0} = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI-A instruction class in Mips : <|opcode|rx|ry|f|imm4|>
+//===----------------------------------------------------------------------===//
+
+class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI_A16>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<1> f;
+ bits<4> imm4;
+
+ let Opcode = 0b01000;
+ let f = _f;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4} = f;
+ let Inst{3-0} = imm4;
+}
+
+//===----------------------------------------------------------------------===//
+// Format Shift instruction class in Mips : <|opcode|rx|ry|sa|f|>
+//===----------------------------------------------------------------------===//
+
+class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmSHIFT16>
+{
+ bits<3> rx;
+ bits<3> ry;
+ bits<3> sa;
+ bits<2> f;
+
+ let Opcode = 0b00110;
+ let f = _f;
+
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-2} = sa;
+ let Inst{1-0} = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8 instruction class in Mips : <|opcode|funct|imm8>
+//===----------------------------------------------------------------------===//
+
+class FI816<bits<3> _func, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_TYPE16>
+{
+ bits<3> func;
+ bits<8> imm8;
+
+ let Opcode = 0b01100;
+ let func = _func;
+
+ let Inst{10-8} = func;
+ let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOVR32 instruction class in Mips : <|opcode|func|ry|r32>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOVR3216<dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOVR3216>
+{
+
+ bits<4> ry;
+ bits<4> r32;
+
+ let Opcode = 0b01100;
+
+ let Inst{10-8} = 0b111;
+ let Inst{7-4} = ry;
+ let Inst{3-0} = r32;
+
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOV32R instruction class in Mips : <|opcode|func|r32|rz>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOV32R16<dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOV32R16>
+{
+
+ bits<3> func;
+ bits<5> r32;
+ bits<3> rz;
+
+
+ let Opcode = 0b01100;
+
+ let Inst{10-8} = 0b101;
+ let Inst{7-5} = r32{2-0};
+ let Inst{4-3} = r32{4-3};
+ let Inst{2-0} = rz;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_SVRS instruction class in Mips :
+// <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16>
+{
+ bits<1> s;
+ bits<1> ra = 0;
+ bits<1> s0 = 0;
+ bits<1> s1 = 0;
+ bits<4> framesize = 0;
+
+ let s =_s;
+ let Opcode = 0b01100;
+
+ let Inst{10-8} = 0b100;
+ let Inst{7} = s;
+ let Inst{6} = ra;
+ let Inst{5} = s0;
+ let Inst{4} = s1;
+ let Inst{3-0} = framesize;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format JAL instruction class in Mips16 :
+// <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_32<outs, ins, asmstr, pattern, itin, FrmJAL16>
+{
+ bits<1> X;
+ bits<26> imm26;
+
+
+ let X = _X;
+
+ let Inst{31-27} = 0b00011;
+ let Inst{26} = X;
+ let Inst{25-21} = imm26{20-16};
+ let Inst{20-16} = imm26{25-21};
+ let Inst{15-0} = imm26{15-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|op|0|0|0|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I16>
+{
+ bits<16> imm16;
+ bits<5> eop;
+
+ let eop = _eop;
+
+ let Inst{26-21} = imm16{10-5};
+ let Inst{20-16} = imm16{15-11};
+ let Inst{15-11} = eop;
+ let Inst{10-5} = 0;
+ let Inst{4-0} = imm16{4-0};
+
+}
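The EXT-I scatter above is easy to get wrong, so here is a standalone encoder that applies the same let assignments: bits 10-5 of the immediate land in Inst{26-21}, bits 15-11 in Inst{20-16}, and bits 4-0 stay at the bottom, under the 0b11110 EXTEND major opcode.

#include <cstdint>
#include <cstdio>

static uint32_t encodeExtI(uint16_t Imm16, unsigned Eop) {
  uint32_t Inst = 0;
  Inst |= 0b11110u << 27;                        // EXTEND major opcode
  Inst |= ((Imm16 >> 5)  & 0x3f) << 21;          // Inst{26-21} = imm16{10-5}
  Inst |= ((Imm16 >> 11) & 0x1f) << 16;          // Inst{20-16} = imm16{15-11}
  Inst |= (Eop & 0x1f)           << 11;          // Inst{15-11} = eop
  Inst |= Imm16 & 0x1f;                          // Inst{4-0}  = imm16{4-0}
  return Inst;
}

int main() {
  // 0xABCD = 1010101111001101b: imm{15-11}=10101, imm{10-5}=011110,
  // imm{4-0}=01101; prints 0xf3d5100d.
  std::printf("0x%08x\n", encodeExtI(0xABCD, 0b00010));
}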
+
+//===----------------------------------------------------------------------===//
+// Format ASMACRO instruction class in Mips16 :
+// <EXTEND|select|p4|p3|RRR|p2|p1|p0>
+//===----------------------------------------------------------------------===//
+
+class FASMACRO16<dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmASMACRO16>
+{
+ bits<3> select;
+ bits<3> p4;
+ bits<5> p3;
+ bits<5> RRR = 0b11100;
+ bits<3> p2;
+ bits<3> p1;
+ bits<5> p0;
+
+
+ let Inst{26-24} = select;
+ let Inst{23-21} = p4;
+ let Inst{20-16} = p3;
+ let Inst{15-11} = RRR;
+ let Inst{10-8} = p2;
+ let Inst{7-5} = p1;
+ let Inst{4-0} = p0;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RI instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|op|rx|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RI16>
+{
+ bits<16> imm16;
+ bits<5> op;
+ bits<3> rx;
+
+ let op = _op;
+
+ let Inst{26-21} = imm16{10-5};
+ let Inst{20-16} = imm16{15-11};
+ let Inst{15-11} = op;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = 0;
+ let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|op|rx|ry|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI16>
+{
+ bits<5> op;
+ bits<16> imm16;
+ bits<3> rx;
+ bits<3> ry;
+
+ let op=_op;
+
+ let Inst{26-21} = imm16{10-5};
+ let Inst{20-16} = imm16{15-11};
+ let Inst{15-11} = op;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI-A instruction class in Mips16 :
+// <|EXTEND|imm10:4|imm14:11|RRI-A|rx|ry|f|imm3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI_A16>
+{
+ bits<15> imm15;
+ bits<3> rx;
+ bits<3> ry;
+ bits<1> f;
+
+ let f = _f;
+
+ let Inst{26-20} = imm15{10-4};
+ let Inst{19-16} = imm15{14-11};
+ let Inst{15-11} = 0b01000;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4} = f;
+ let Inst{3-0} = imm15{3-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-SHIFT instruction class in Mips16 :
+// <|EXTEND|sa 4:0|s5|0|SHIFT|rx|ry|0|f>
+//===----------------------------------------------------------------------===//
+
+class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_SHIFT16>
+{
+ bits<6> sa6;
+ bits<3> rx;
+ bits<3> ry;
+ bits<2> f;
+
+ let f = _f;
+
+ let Inst{26-22} = sa6{4-0};
+ let Inst{21} = sa6{5};
+ let Inst{20-16} = 0;
+ let Inst{15-11} = 0b00110;
+ let Inst{10-8} = rx;
+ let Inst{7-5} = ry;
+ let Inst{4-2} = 0;
+ let Inst{1-0} = f;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8 instruction class in Mips16 :
+// <|EXTEND|imm10:5|imm15:11|I8|funct|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I816>
+{
+ bits<16> imm16;
+ bits<5> I8;
+ bits<3> funct;
+
+ let funct = _funct;
+ let I8 = 0b01100;
+
+ let Inst{26-21} = imm16{10-5};
+ let Inst{20-16} = imm16{15-11};
+ let Inst{15-11} = I8;
+ let Inst{10-8} = funct;
+ let Inst{7-5} = 0;
+ let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8_SVRS instruction class in Mips16 :
+// <|EXTEND|xsregs|framesize7:4|aregs|I8|SVRS|s|ra|s0|s1|framesize3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16>
+{
+ bits<3> xsregs =0;
+ bits<8> framesize =0;
+ bits<3> aregs =0;
+ bits<5> I8 = 0b01100;
+ bits<3> SVRS = 0b100;
+ bits<1> s;
+ bits<1> ra = 0;
+ bits<1> s0 = 0;
+ bits<1> s1 = 0;
+
+ let s= s_;
+
+ let Inst{26-24} = xsregs;
+ let Inst{23-20} = framesize{7-4};
+ let Inst{19} = 0;
+ let Inst{18-16} = aregs;
+ let Inst{15-11} = I8;
+ let Inst{10-8} = SVRS;
+ let Inst{7} = s;
+ let Inst{6} = ra;
+ let Inst{5} = s0;
+ let Inst{4} = s1;
+ let Inst{3-0} = framesize{3-0};
+
+
+}
+
+
+
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
new file mode 100644
index 0000000..2bc286b
--- /dev/null
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -0,0 +1,132 @@
+//===-- Mips16InstrInfo.cpp - Mips16 Instruction Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16InstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace llvm;
+
+Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
+ : MipsInstrInfo(tm, /* FIXME: set mips16 unconditional br */ 0),
+ RI(*tm.getSubtargetImpl(), *this) {}
+
+const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
+ return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned Mips16InstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+ return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned Mips16InstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+ return 0;
+}
+
+void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ unsigned Opc = 0, ZeroReg = 0;
+
+ if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
+ if (Mips::CPURegsRegClass.contains(SrcReg))
+ Opc = Mips::Mov32R16;
+ }
+
+ assert(Opc && "Cannot copy registers");
+
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+ if (DestReg)
+ MIB.addReg(DestReg, RegState::Define);
+
+ if (ZeroReg)
+ MIB.addReg(ZeroReg);
+
+ if (SrcReg)
+ MIB.addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void Mips16InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ assert(false && "Implement this function.");
+}
+
+void Mips16InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ assert(false && "Implement this function.");
+}
+
+bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ MachineBasicBlock &MBB = *MI->getParent();
+
+ switch(MI->getDesc().getOpcode()) {
+ default:
+ return false;
+ case Mips::RetRA16:
+ ExpandRetRA16(MBB, MI, Mips::JrRa16);
+ break;
+ }
+
+ MBB.erase(MI);
+ return true;
+}
+
+/// GetOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+ assert(false && "Implement this function.");
+ return 0;
+}
+
+unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+ return 0;
+}
+
+void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned Opc) const {
+ BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
+}
+
+const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) {
+ return new Mips16InstrInfo(TM);
+}
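expandPostRAPseudo() above shows the standard expand-then-erase pattern: a recognized pseudo (RetRA16) is rewritten in place as the real instruction (JrRa16) and the pseudo is erased, while unknown opcodes return false so generic handling can take over. A standalone sketch of the pattern with stand-in types:

#include <cstdio>
#include <iterator>
#include <list>

enum Opcode { RetRA16, JrRa16, Add16 };

static bool expandPostRAPseudo(std::list<Opcode> &Block,
                               std::list<Opcode>::iterator MI) {
  switch (*MI) {
  default:
    return false;                 // not a pseudo we know how to expand
  case RetRA16:
    Block.insert(MI, JrRa16);     // emit the real return-via-$ra jump
    break;
  }
  Block.erase(MI);                // drop the pseudo itself
  return true;
}

int main() {
  std::list<Opcode> Block = {Add16, RetRA16};
  expandPostRAPseudo(Block, std::prev(Block.end()));
  for (Opcode Op : Block)
    std::printf("%s\n", Op == JrRa16 ? "JrRa16" : Op == Add16 ? "Add16" : "?");
}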
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
new file mode 100644
index 0000000..260c5b6
--- /dev/null
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -0,0 +1,76 @@
+//===-- Mips16InstrInfo.h - Mips16 Instruction Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16INSTRUCTIONINFO_H
+#define MIPS16INSTRUCTIONINFO_H
+
+#include "MipsInstrInfo.h"
+#include "Mips16RegisterInfo.h"
+
+namespace llvm {
+
+class Mips16InstrInfo : public MipsInstrInfo {
+ const Mips16RegisterInfo RI;
+
+public:
+ explicit Mips16InstrInfo(MipsTargetMachine &TM);
+
+ virtual const MipsRegisterInfo &getRegisterInfo() const;
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+ virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+
+private:
+ virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+
+ void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned Opc) const;
+};
+
+}
+
+#endif
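The stack-slot predicates documented above feed generic optimizations such as reload folding. A hedged sketch of how a caller consumes them (fragment; MI and TII are assumed to be in scope):

// If MI is a pure reload, isLoadFromStackSlot returns the destination
// register and fills in the frame index; otherwise it returns 0.
int FrameIndex;
if (unsigned DestReg = TII->isLoadFromStackSlot(MI, FrameIndex)) {
  // MI does nothing but load DestReg from FrameIndex, so passes may
  // coalesce or rematerialize it without worrying about side effects.
  (void)DestReg;
}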
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
new file mode 100644
index 0000000..94cf984
--- /dev/null
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -0,0 +1,419 @@
+//===- Mips16InstrInfo.td - Target Description for Mips16 -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips16 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// RRR-type instruction format
+//
+
+class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> :
+ FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
+
+//
+// I8_MOV32R instruction format (used only by MOV32R instruction)
+//
+class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
+ FI8_MOV32R16<(outs CPURegs:$r32), (ins CPU16Regs:$rz),
+ !strconcat(asmstr, "\t$r32, $rz"), [], itin>;
+
+//
+// EXT-RI instruction format
+//
+
+class FEXT_RI16_ins_base<bits<5> _op, string asmstr, string asmstr2,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+ !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>:
+ FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>;
+
+
+class FEXT_2RI16_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+ !strconcat(asmstr, "\t$rx, $imm"), [], itin> {
+ let Constraints = "$rx_ = $rx";
+}
+
+
+//
+// RR-type instruction format
+//
+
+class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRxRxRy16_ins<bits<5> f, string asmstr,
+ InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rz, $ry"),
+ [], itin> {
+ let Constraints = "$rx = $rz";
+}
+
+let rx=0 in
+class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
+ string asmstr, InstrItinClass itin>:
+ FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"),
+ [], itin> ;
+
+//
+// EXT-RRI instruction format
+//
+
+class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+class FEXT_RRI16_mem2_ins<bits<5> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI16<op, (outs ), (ins CPU16Regs:$ry, MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+// EXT-SHIFT instruction format
+//
+class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
+ FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa),
+ !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
+
+//
+// Address operand
+def mem16 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops CPU16Regs, simm16);
+ let EncoderMethod = "getMemEncoding";
+}
+
+//
+// Some general instruction class info
+//
+
+class ArithLogic16Defs<bit isCom=0> {
+ bits<5> shamt = 0;
+ bit isCommutable = isCom;
+ bit isReMaterializable = 1;
+ bit neverHasSideEffects = 1;
+}
+
+//
+// Format: ADDIU rx, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, Extended)
+// To add a constant to a 32-bit integer.
+//
+def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>;
+
+def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>,
+ ArithLogic16Defs<0>;
+
+//
+// Format: ADDIU rx, pc, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended)
+// To add a constant to the program counter.
+//
+def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>;
+//
+// Format: ADDU rz, rx, ry MIPS16e
+// Purpose: Add Unsigned Word (3-Operand)
+// To add 32-bit integers.
+//
+
+def AdduRxRyRz16: FRRR16_ins<0b01, "addu", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: AND rx, ry MIPS16e
+// Purpose: AND
+// To do a bitwise logical AND.
+
+def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: JR ra MIPS16e
+// Purpose: Jump Register Through Register ra
+// To execute a branch to the instruction address in the return
+// address register.
+//
+
+def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu>;
+
+//
+// Format: LB ry, offset(rx) MIPS16e
+// Purpose: Load Byte (Extended)
+// To load a byte from memory as a signed value.
+//
+def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IIAlu>;
+
+//
+// Format: LBU ry, offset(rx) MIPS16e
+// Purpose: Load Byte Unsigned (Extended)
+// To load a byte from memory as an unsigned value.
+//
+def LbuRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, IIAlu>;
+
+//
+// Format: LH ry, offset(rx) MIPS16e
+// Purpose: Load Halfword signed (Extended)
+// To load a halfword from memory as a signed value.
+//
+def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IIAlu>;
+
+//
+// Format: LHU ry, offset(rx) MIPS16e
+// Purpose: Load Halfword unsigned (Extended)
+// To load a halfword from memory as an unsigned value.
+//
+def LhuRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, IIAlu>;
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate (Extended)
+// To load a constant into a GPR.
+//
+def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
+
+//
+// Format: LW ry, offset(rx) MIPS16e
+// Purpose: Load Word (Extended)
+// To load a word from memory as a signed value.
+//
+def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IIAlu>;
+
+//
+// Format: MOVE r32, rz MIPS16e
+// Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def Mov32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+
+//
+// Format: NEG rx, ry MIPS16e
+// Purpose: Negate
+// To negate an integer value.
+//
+def NegRxRy16: FRR16_ins<0b11101, "neg", IIAlu>;
+
+//
+// Format: NOT rx, ry MIPS16e
+// Purpose: Not
+// To complement an integer value
+//
+def NotRxRy16: FRR16_ins<0b01111, "not", IIAlu>;
+
+//
+// Format: OR rx, ry MIPS16e
+// Purpose: Or
+// To do a bitwise logical OR.
+//
+def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize}
+// (All args are optional) MIPS16e
+// Purpose: Restore Registers and Deallocate Stack Frame
+// To deallocate a stack frame before exit from a subroutine,
+// restoring return address and static registers, and adjusting
+// stack
+//
+
+// Fixed form for restoring RA and deallocating the frame.
+// For the direct object emitter, the encoding needs to be adjusted for
+// the frame size.
+//
+let ra=1, s=0,s0=0,s1=0 in
+def RestoreRaF16:
+ FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+ "restore \t$$ra, $frame_size", [], IILoad >;
+
+//
+// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional)
+// MIPS16e
+// Purpose: Save Registers and Set Up Stack Frame
+// To set up a stack frame on entry to a subroutine,
+// saving return address and static registers, and adjusting stack
+//
+let ra=1, s=1,s0=0,s1=0 in
+def SaveRaF16:
+ FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+ "save \t$$ra, $frame_size", [], IILoad >;
+
+//
+// Format: SB ry, offset(rx) MIPS16e
+// Purpose: Store Byte (Extended)
+// To store a byte to memory.
+//
+def SbRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIAlu>;
+
+//
+// Format: SH ry, offset(rx) MIPS16e
+// Purpose: Store Halfword (Extended)
+// To store a halfword to memory.
+//
+def ShRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, IIAlu>;
+
+//
+// Format: SLL rx, ry, sa MIPS16e
+// Purpose: Shift Word Left Logical (Extended)
+// To execute a left-shift of a word by a fixed number of bits—0 to 31 bits.
+//
+def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
+
+//
+// Format: SLLV ry, rx MIPS16e
+// Purpose: Shift Word Left Logical Variable
+// To execute a left-shift of a word by a variable number of bits.
+//
+def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>;
+
+
+//
+// Format: SRAV ry, rx MIPS16e
+// Purpose: Shift Word Right Arithmetic Variable
+// To execute an arithmetic right-shift of a word by a variable
+// number of bits.
+//
+def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
+
+
+//
+// Format: SRA rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Arithmetic (Extended)
+// To execute an arithmetic right-shift of a word by a fixed
+// number of bits—1 to 8 bits.
+//
+def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
+
+
+//
+// Format: SRLV ry, rx MIPS16e
+// Purpose: Shift Word Right Logical Variable
+// To execute a logical right-shift of a word by a variable
+// number of bits.
+//
+def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
+
+
+//
+// Format: SRL rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Logical (Extended)
+// To execute a logical right-shift of a word by a fixed
+// number of bits—1 to 31 bits.
+//
+def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>;
+
+//
+// Format: SUBU rz, rx, ry MIPS16e
+// Purpose: Subtract Unsigned Word
+// To subtract 32-bit integers
+//
+def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>;
+
+//
+// Format: SW ry, offset(rx) MIPS16e
+// Purpose: Store Word (Extended)
+// To store a word to memory.
+//
+def SwRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, IIAlu>;
+
+//
+// Format: XOR rx, ry MIPS16e
+// Purpose: Xor
+// To do a bitwise logical XOR.
+//
+def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIAlu>, ArithLogic16Defs<1>;
+
+class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ let Predicates = [InMips16Mode];
+}
+
+// Unary Arith/Logic
+//
+class ArithLogicU_pat<PatFrag OpNode, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$r),
+ (I CPU16Regs:$r)>;
+
+def: ArithLogicU_pat<not, NotRxRy16>;
+def: ArithLogicU_pat<ineg, NegRxRy16>;
+
+class ArithLogic16_pat<SDNode OpNode, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$l, CPU16Regs:$r),
+ (I CPU16Regs:$l, CPU16Regs:$r)>;
+
+def: ArithLogic16_pat<add, AdduRxRyRz16>;
+def: ArithLogic16_pat<and, AndRxRxRy16>;
+def: ArithLogic16_pat<or, OrRxRxRy16>;
+def: ArithLogic16_pat<sub, SubuRxRyRz16>;
+def: ArithLogic16_pat<xor, XorRxRxRy16>;
+
+// Arithmetic and logical instructions with a register and an immediate operand.
+
+class ArithLogicI16_pat<SDNode OpNode, PatFrag imm_type, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$in, imm_type:$imm),
+ (I CPU16Regs:$in, imm_type:$imm)>;
+
+def: ArithLogicI16_pat<add, immSExt16, AddiuRxRxImmX16>;
+def: ArithLogicI16_pat<shl, immZExt5, SllX16>;
+def: ArithLogicI16_pat<srl, immZExt5, SrlX16>;
+def: ArithLogicI16_pat<sra, immZExt5, SraX16>;
+
+class shift_rotate_reg16_pat<SDNode OpNode, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$r, CPU16Regs:$ra),
+ (I CPU16Regs:$r, CPU16Regs:$ra)>;
+
+def: shift_rotate_reg16_pat<shl, SllvRxRy16>;
+def: shift_rotate_reg16_pat<sra, SravRxRy16>;
+def: shift_rotate_reg16_pat<srl, SrlvRxRy16>;
+
+class LoadM16_pat<PatFrag OpNode, Instruction I> :
+ Mips16Pat<(OpNode addr:$addr), (I addr:$addr)>;
+
+def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16>;
+def: LoadM16_pat<sextloadi16_a, LhRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi16_a, LhuRxRyOffMemX16>;
+def: LoadM16_pat<load_a, LwRxRyOffMemX16>;
+
+class StoreM16_pat<PatFrag OpNode, Instruction I> :
+ Mips16Pat<(OpNode CPU16Regs:$r, addr:$addr), (I CPU16Regs:$r, addr:$addr)>;
+
+def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>;
+def: StoreM16_pat<truncstorei16_a, ShRxRyOffMemX16>;
+def: StoreM16_pat<store_a, SwRxRyOffMemX16>;
+
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=1 in
+def JumpLinkReg16:
+ FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
+ "jalr \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch>;
+
+// Mips16 pseudos
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1,
+ hasExtraSrcRegAllocReq = 1 in
+def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>;
+
+// Small immediates
+def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>;
+
+def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
+ (AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
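Taken together, the Mips16Pat definitions above give instruction selection a MIPS16-only mapping from generic SelectionDAG nodes onto the 16-bit encodings. A rough illustration of the resulting selections (register assignments invented for the example, shown as C++ comments; this is not generated output):

// Illustration only:
//   add i32 %a, %b         -> AdduRxRyRz16          via ArithLogic16_pat<add, ...>
//   shl i32 %a, 3          -> SllX16 with sa = 3    via ArithLogicI16_pat<shl, immZExt5, ...>
//   i32 12345 (immediate)  -> LiRxImmX16            via the immZExt16 pattern
//   ret                    -> RetRA16 pseudo, later expanded to JrRa16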
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
new file mode 100644
index 0000000..c15d1bf
--- /dev/null
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -0,0 +1,111 @@
+//===-- Mips16RegisterInfo.cpp - MIPS16 Register Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16RegisterInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace llvm;
+
+Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST,
+ const TargetInstrInfo &TII)
+ : MipsRegisterInfo(ST, TII) {}
+
+// This function eliminates ADJCALLSTACKDOWN and ADJCALLSTACKUP
+// pseudo instructions.
+void Mips16RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+ MBB.erase(I);
+}
+
+void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+ unsigned OpNo, int FrameIndex,
+ uint64_t StackSize,
+ int64_t SPOffset) const {
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ int MinCSFI = 0;
+ int MaxCSFI = -1;
+
+ if (CSI.size()) {
+ MinCSFI = CSI[0].getFrameIdx();
+ MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+ }
+
+ // The following stack frame objects are always
+ // referenced relative to $sp:
+ // 1. Outgoing arguments.
+ // 2. Pointer to dynamically allocated stack space.
+ // 3. Locations for callee-saved registers.
+ // Everything else is referenced relative to whatever register
+ // getFrameRegister() returns.
+ unsigned FrameReg;
+
+ if (MipsFI->isOutArgFI(FrameIndex) ||
+ (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI))
+ FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
+ else
+ FrameReg = getFrameRegister(MF);
+
+ // Calculate final offset.
+ // - There is no need to change the offset if the frame object is one of the
+ //   following: an outgoing argument, a pointer to dynamically allocated
+ //   stack space, or a $gp restore location.
+ // - If the frame object is an incoming argument, a callee-saved register
+ //   location, or a local variable, its offset must be adjusted by adding
+ //   the size of the stack.
+ int64_t Offset;
+
+ if (MipsFI->isOutArgFI(FrameIndex))
+ Offset = SPOffset;
+ else
+ Offset = SPOffset + (int64_t)StackSize;
+
+ Offset += MI.getOperand(OpNo + 1).getImm();
+
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+
+ MI.getOperand(OpNo).ChangeToRegister(FrameReg, false);
+ MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+}
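A worked example of the offset computation in eliminateFI may help; every number below is invented for illustration:

// Hypothetical local-variable access, O32, not an outgoing-argument FI:
int64_t  SPOffset   = -8;  // offset recorded against the frame index
uint64_t StackSize  = 32;  // final frame size from PEI
int64_t  ImmOperand =  0;  // immediate already on the memory operand
// Local variables take the "add the stack size" branch above:
int64_t Offset = SPOffset + (int64_t)StackSize + ImmOperand;  // == 24
// The memory operand is then rewritten to 24(FrameReg).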
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
new file mode 100644
index 0000000..3f4b3a7
--- /dev/null
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -0,0 +1,37 @@
+//===-- Mips16RegisterInfo.h - Mips16 Register Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16REGISTERINFO_H
+#define MIPS16REGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+
+class Mips16RegisterInfo : public MipsRegisterInfo {
+public:
+ Mips16RegisterInfo(const MipsSubtarget &Subtarget,
+ const TargetInstrInfo &TII);
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+private:
+ virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+ int FrameIndex, uint64_t StackSize,
+ int64_t SPOffset) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 0382869..20fc178 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -49,21 +49,24 @@ class Div64<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>:
Div<op, func, instr_asm, itin, CPU64Regs, [HI64, LO64]>;
multiclass Atomic2Ops64<PatFrag Op, string Opstr> {
- def #NAME# : Atomic2Ops<Op, Opstr, CPU64Regs, CPURegs>, Requires<[NotN64]>;
- def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>, Requires<[IsN64]> {
+ def #NAME# : Atomic2Ops<Op, Opstr, CPU64Regs, CPURegs>,
+ Requires<[NotN64, HasStandardEncoding]>;
+ def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>,
+ Requires<[IsN64, HasStandardEncoding]> {
let isCodeGenOnly = 1;
}
}
multiclass AtomicCmpSwap64<PatFrag Op, string Width> {
- def #NAME# : AtomicCmpSwap<Op, Width, CPU64Regs, CPURegs>, Requires<[NotN64]>;
+ def #NAME# : AtomicCmpSwap<Op, Width, CPU64Regs, CPURegs>,
+ Requires<[NotN64, HasStandardEncoding]>;
def _P8 : AtomicCmpSwap<Op, Width, CPU64Regs, CPU64Regs>,
- Requires<[IsN64]> {
+ Requires<[IsN64, HasStandardEncoding]> {
let isCodeGenOnly = 1;
}
}
}
-let usesCustomInserter = 1, Predicates = [HasMips64],
+let usesCustomInserter = 1, Predicates = [HasMips64, HasStandardEncoding],
DecoderNamespace = "Mips64" in {
defm ATOMIC_LOAD_ADD_I64 : Atomic2Ops64<atomic_load_add_64, "load_add_64">;
defm ATOMIC_LOAD_SUB_I64 : Atomic2Ops64<atomic_load_sub_64, "load_sub_64">;
@@ -106,9 +109,15 @@ def DSRA : shift_rotate_imm64<0x3b, 0x00, "dsra", sra>;
def DSLLV : shift_rotate_reg<0x14, 0x00, "dsllv", shl, CPU64Regs>;
def DSRLV : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>;
def DSRAV : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>;
+let Pattern = []<dag> in {
+def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>;
+def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>;
+def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>;
+}
}
// Rotate Instructions
-let Predicates = [HasMips64r2], DecoderNamespace = "Mips64" in {
+let Predicates = [HasMips64r2, HasStandardEncoding],
+ DecoderNamespace = "Mips64" in {
def DROTR : shift_rotate_imm64<0x3a, 0x01, "drotr", rotr>;
def DROTRV : shift_rotate_reg<0x16, 0x01, "drotrv", rotr, CPU64Regs>;
}
@@ -137,18 +146,34 @@ defm USW64 : StoreM64<0x2b, "usw", truncstorei32_u, 1>;
defm ULD : LoadM64<0x37, "uld", load_u, 1>;
defm USD : StoreM64<0x3f, "usd", store_u, 1>;
+/// load/store left/right
+let isCodeGenOnly = 1 in {
+ defm LWL64 : LoadLeftRightM64<0x22, "lwl", MipsLWL>;
+ defm LWR64 : LoadLeftRightM64<0x26, "lwr", MipsLWR>;
+ defm SWL64 : StoreLeftRightM64<0x2a, "swl", MipsSWL>;
+ defm SWR64 : StoreLeftRightM64<0x2e, "swr", MipsSWR>;
+}
+defm LDL : LoadLeftRightM64<0x1a, "ldl", MipsLDL>;
+defm LDR : LoadLeftRightM64<0x1b, "ldr", MipsLDR>;
+defm SDL : StoreLeftRightM64<0x2c, "sdl", MipsSDL>;
+defm SDR : StoreLeftRightM64<0x2d, "sdr", MipsSDR>;
+
/// Load-linked, Store-conditional
-def LLD : LLBase<0x34, "lld", CPU64Regs, mem>, Requires<[NotN64]>;
-def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>, Requires<[IsN64]> {
+def LLD : LLBase<0x34, "lld", CPU64Regs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
let isCodeGenOnly = 1;
}
-def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>, Requires<[NotN64]>;
-def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, Requires<[IsN64]> {
+def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
let isCodeGenOnly = 1;
}
/// Jump and Branch Instructions
-def JR64 : JumpFR<0x00, 0x08, "jr", CPU64Regs>;
+def JR64 : IndirectBranch<CPU64Regs>;
def BEQ64 : CBranch<0x04, "beq", seteq, CPU64Regs>;
def BNE64 : CBranch<0x05, "bne", setne, CPU64Regs>;
def BGEZ64 : CBranchZero<0x01, 1, "bgez", setge, CPU64Regs>;
@@ -183,74 +208,75 @@ def DCLO : CountLeading1<0x25, "dclo", CPU64Regs>;
def DSBH : SubwordSwap<0x24, 0x2, "dsbh", CPU64Regs>;
def DSHD : SubwordSwap<0x24, 0x5, "dshd", CPU64Regs>;
-def LEA_ADDiu64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>;
+def LEA_ADDiu64 : EffectiveAddress<0x19, "daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>;
}
let Uses = [SP_64], DecoderNamespace = "Mips64" in
-def DynAlloc64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>,
- Requires<[IsN64]> {
- let isCodeGenOnly = 1;
-}
+def DynAlloc64 : EffectiveAddress<0x19, "daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>,
+ Requires<[IsN64, HasStandardEncoding]>;
let DecoderNamespace = "Mips64" in {
def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>;
def DEXT : ExtBase<3, "dext", CPU64Regs>;
def DINS : InsBase<7, "dins", CPU64Regs>;
-def DSLL64_32 : FR<0x3c, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
- "dsll\t$rd, $rt, 32", [], IIAlu>;
-def SLL64_32 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
- "sll\t$rd, $rt, 0", [], IIAlu>;
-let isCodeGenOnly = 1 in
-def SLL64_64 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPU64Regs:$rt),
- "sll\t$rd, $rt, 0", [], IIAlu>;
+let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
+ def DSLL64_32 : FR<0x00, 0x3c, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
+ "dsll\t$rd, $rt, 32", [], IIAlu>;
+ def SLL64_32 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
+ "sll\t$rd, $rt, 0", [], IIAlu>;
+ def SLL64_64 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPU64Regs:$rt),
+ "sll\t$rd, $rt, 0", [], IIAlu>;
+}
}
//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
// extended loads
-let Predicates = [NotN64] in {
- def : Pat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
- def : Pat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
- def : Pat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>;
- def : Pat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>;
- def : Pat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>;
- def : Pat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>;
- def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>;
+let Predicates = [NotN64, HasStandardEncoding] in {
+ def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
+ def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
+ def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>;
+ def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>;
+ def : MipsPat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>;
}
-let Predicates = [IsN64] in {
- def : Pat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>;
- def : Pat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>;
- def : Pat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>;
- def : Pat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>;
- def : Pat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>;
- def : Pat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>;
- def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>;
+let Predicates = [IsN64, HasStandardEncoding] in {
+ def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>;
+ def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>;
+ def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>;
+ def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>;
+ def : MipsPat<(zextloadi32_u addr:$a),
+ (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>;
}
// hi/lo relocs
-def : Pat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
-def : Pat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
-def : Pat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
-def : Pat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
-def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
-
-def : Pat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
-def : Pat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>;
-def : Pat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
-def : Pat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
-def : Pat<(MipsLo tglobaltlsaddr:$in), (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
-
-def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)),
- (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>;
-def : Pat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)),
- (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>;
-def : Pat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)),
- (DADDiu CPU64Regs:$hi, tjumptable:$lo)>;
-def : Pat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)),
- (DADDiu CPU64Regs:$hi, tconstpool:$lo)>;
-def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>;
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+ (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
+
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)),
+ (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)),
+ (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)),
+ (DADDiu CPU64Regs:$hi, tjumptable:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)),
+ (DADDiu CPU64Regs:$hi, tconstpool:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>;
def : WrapperPat<tglobaladdr, DADDiu, CPU64Regs>;
def : WrapperPat<tconstpool, DADDiu, CPU64Regs>;
@@ -270,19 +296,22 @@ defm : SetgePats<CPU64Regs, SLT64, SLTu64>;
defm : SetgeImmPats<CPU64Regs, SLTi64, SLTiu64>;
// select MipsDynAlloc
-def : Pat<(MipsDynAlloc addr:$f), (DynAlloc64 addr:$f)>, Requires<[IsN64]>;
+def : MipsPat<(MipsDynAlloc addr:$f), (DynAlloc64 addr:$f)>,
+ Requires<[IsN64, HasStandardEncoding]>;
// truncate
-def : Pat<(i32 (trunc CPU64Regs:$src)),
- (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>, Requires<[IsN64]>;
+def : MipsPat<(i32 (trunc CPU64Regs:$src)),
+ (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>,
+ Requires<[IsN64, HasStandardEncoding]>;
// 32-to-64-bit extension
-def : Pat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
-def : Pat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>;
-def : Pat<(i64 (sext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
+def : MipsPat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
+def : MipsPat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>;
+def : MipsPat<(i64 (sext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
// Sign extend in register
-def : Pat<(i64 (sext_inreg CPU64Regs:$src, i32)), (SLL64_64 CPU64Regs:$src)>;
+def : MipsPat<(i64 (sext_inreg CPU64Regs:$src, i32)),
+ (SLL64_64 CPU64Regs:$src)>;
-// bswap pattern
-def : Pat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>;
+// bswap pattern
+def : MipsPat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 8206cfc..00ff754 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -13,29 +13,29 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mips-asm-printer"
-#include "MipsAsmPrinter.h"
#include "Mips.h"
+#include "MipsAsmPrinter.h"
#include "MipsInstrInfo.h"
+#include "MipsMCInstLower.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/BasicBlock.h"
-#include "llvm/Instructions.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/InlineAsm.h"
#include "llvm/Instructions.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -43,19 +43,6 @@
using namespace llvm;
-void MipsAsmPrinter::EmitInstrWithMacroNoAT(const MachineInstr *MI) {
- MCInst TmpInst;
-
- MCInstLowering.Lower(MI, TmpInst);
- OutStreamer.EmitRawText(StringRef("\t.set\tmacro"));
- if (MipsFI->getEmitNOAT())
- OutStreamer.EmitRawText(StringRef("\t.set\tat"));
- OutStreamer.EmitInstruction(TmpInst);
- if (MipsFI->getEmitNOAT())
- OutStreamer.EmitRawText(StringRef("\t.set\tnoat"));
- OutStreamer.EmitRawText(StringRef("\t.set\tnomacro"));
-}
-
bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
MipsFI = MF.getInfo<MipsFunctionInfo>();
AsmPrinter::runOnMachineFunction(MF);
@@ -71,84 +58,33 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
- unsigned Opc = MI->getOpcode();
- MCInst TmpInst0;
- SmallVector<MCInst, 4> MCInsts;
-
- switch (Opc) {
- case Mips::ULW:
- case Mips::ULH:
- case Mips::ULHu:
- case Mips::USW:
- case Mips::USH:
- case Mips::ULW_P8:
- case Mips::ULH_P8:
- case Mips::ULHu_P8:
- case Mips::USW_P8:
- case Mips::USH_P8:
- case Mips::ULD:
- case Mips::ULW64:
- case Mips::ULH64:
- case Mips::ULHu64:
- case Mips::USD:
- case Mips::USW64:
- case Mips::USH64:
- case Mips::ULD_P8:
- case Mips::ULW64_P8:
- case Mips::ULH64_P8:
- case Mips::ULHu64_P8:
- case Mips::USD_P8:
- case Mips::USW64_P8:
- case Mips::USH64_P8: {
- if (OutStreamer.hasRawTextSupport()) {
- EmitInstrWithMacroNoAT(MI);
- return;
- }
-
- MCInstLowering.LowerUnalignedLoadStore(MI, MCInsts);
- for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I
- != MCInsts.end(); ++I)
- OutStreamer.EmitInstruction(*I);
-
- return;
- }
- case Mips::CPRESTORE: {
- const MachineOperand &MO = MI->getOperand(0);
- assert(MO.isImm() && "CPRESTORE's operand must be an immediate.");
- int64_t Offset = MO.getImm();
-
- if (OutStreamer.hasRawTextSupport()) {
- if (!isInt<16>(Offset)) {
- EmitInstrWithMacroNoAT(MI);
+ // Direct object specific instruction lowering
+ if (!OutStreamer.hasRawTextSupport())
+ switch (MI->getOpcode()) {
+ case Mips::DSLL:
+ case Mips::DSRL:
+ case Mips::DSRA:
+ assert(MI->getNumOperands() == 3 &&
+ "Invalid no. of machine operands for shift!");
+ assert(MI->getOperand(2).isImm());
+ int64_t Shift = MI->getOperand(2).getImm();
+ if (Shift > 31) {
+ MCInst TmpInst0;
+ MCInstLowering.LowerLargeShift(MI, TmpInst0, Shift - 32);
+ OutStreamer.EmitInstruction(TmpInst0);
return;
}
- } else {
- MCInstLowering.LowerCPRESTORE(Offset, MCInsts);
-
- for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin();
- I != MCInsts.end(); ++I)
- OutStreamer.EmitInstruction(*I);
-
- return;
+ break;
}
- break;
- }
- case Mips::SETGP01: {
- MCInstLowering.LowerSETGP01(MI, MCInsts);
-
- for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin();
- I != MCInsts.end(); ++I)
- OutStreamer.EmitInstruction(*I);
-
- return;
- }
- default:
- break;
- }
+ MachineBasicBlock::const_instr_iterator I = MI;
+ MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
- MCInstLowering.Lower(MI, TmpInst0);
- OutStreamer.EmitInstruction(TmpInst0);
+ do {
+ MCInst TmpInst0;
+ MCInstLowering.Lower(I++, TmpInst0);
+ OutStreamer.EmitInstruction(TmpInst0);
+ } while ((I != E) && I->isInsideBundle());
}
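The DSLL/DSRL/DSRA special case above exists because the hardware encodes only a 5-bit shift amount; amounts of 32 to 63 must use the *32 variants added to Mips64InstrInfo.td earlier in this patch. An illustration (instruction text in comments; exact MCInst contents depend on LowerLargeShift):

// Illustration: for direct object emission,
//   dsll $2, $3, 34     // getOperand(2).getImm() == 34, > 31
// is re-lowered before encoding as
//   dsll32 $2, $3, 2    // LowerLargeShift(MI, TmpInst0, 34 - 32)
// Shift amounts 0-31 fall through to the ordinary bundle-aware
// lowering loop after the switch.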
//===----------------------------------------------------------------------===//
@@ -197,9 +133,9 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
const MachineFrameInfo *MFI = MF->getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
// size of stack area to which FP callee-saved regs are saved.
- unsigned CPURegSize = Mips::CPURegsRegisterClass->getSize();
- unsigned FGR32RegSize = Mips::FGR32RegisterClass->getSize();
- unsigned AFGR64RegSize = Mips::AFGR64RegisterClass->getSize();
+ unsigned CPURegSize = Mips::CPURegsRegClass.getSize();
+ unsigned FGR32RegSize = Mips::FGR32RegClass.getSize();
+ unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize();
bool HasAFGR64Reg = false;
unsigned CSFPRegsSize = 0;
unsigned i, e = CSI.size();
@@ -207,11 +143,11 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
// Set FPU Bitmask.
for (i = 0; i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (Mips::CPURegsRegisterClass->contains(Reg))
+ if (Mips::CPURegsRegClass.contains(Reg))
break;
unsigned RegNum = getMipsRegisterNumbering(Reg);
- if (Mips::AFGR64RegisterClass->contains(Reg)) {
+ if (Mips::AFGR64RegClass.contains(Reg)) {
FPUBitmask |= (3 << RegNum);
CSFPRegsSize += AFGR64RegSize;
HasAFGR64Reg = true;
@@ -283,8 +219,15 @@ const char *MipsAsmPrinter::getCurrentABIString() const {
}
void MipsAsmPrinter::EmitFunctionEntryLabel() {
- if (OutStreamer.hasRawTextSupport())
+ if (OutStreamer.hasRawTextSupport()) {
+ if (Subtarget->inMips16Mode())
+ OutStreamer.EmitRawText(StringRef("\t.set\tmips16"));
+ else
+ OutStreamer.EmitRawText(StringRef("\t.set\tnomips16"));
+ // Leave out until an FSF-released gas has the micromips changes.
+ // OutStreamer.EmitRawText(StringRef("\t.set\tnomicromips"));
OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName()));
+ }
OutStreamer.EmitLabel(CurrentFnSym);
}
@@ -295,10 +238,6 @@ void MipsAsmPrinter::EmitFunctionBodyStart() {
emitFrameDirective();
- bool EmitCPLoad = (MF->getTarget().getRelocationModel() == Reloc::PIC_) &&
- Subtarget->isABI_O32() && MipsFI->globalBaseRegSet() &&
- MipsFI->globalBaseRegFixed();
-
if (OutStreamer.hasRawTextSupport()) {
SmallString<128> Str;
raw_svector_ostream OS(Str);
@@ -306,20 +245,9 @@ void MipsAsmPrinter::EmitFunctionBodyStart() {
OutStreamer.EmitRawText(OS.str());
OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder"));
-
- // Emit .cpload directive if needed.
- if (EmitCPLoad)
- OutStreamer.EmitRawText(StringRef("\t.cpload\t$25"));
-
OutStreamer.EmitRawText(StringRef("\t.set\tnomacro"));
if (MipsFI->getEmitNOAT())
OutStreamer.EmitRawText(StringRef("\t.set\tnoat"));
- } else if (EmitCPLoad) {
- SmallVector<MCInst, 4> MCInsts;
- MCInstLowering.LowerCPLOAD(MCInsts);
- for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin();
- I != MCInsts.end(); ++I)
- OutStreamer.EmitInstruction(*I);
}
}
@@ -382,14 +310,99 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
}
// Print out an operand for an inline asm expression.
-bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant,const char *ExtraCode,
raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
- if (ExtraCode && ExtraCode[0])
- return true; // Unknown modifier.
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
- printOperand(MI, OpNo, O);
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O);
+ case 'X': // hex const int
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << "0x" << StringRef(utohexstr(MO.getImm())).lower();
+ return false;
+ case 'x': // hex const int (low 16 bits)
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << "0x" << StringRef(utohexstr(MO.getImm() & 0xffff)).lower();
+ return false;
+ case 'd': // decimal const int
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << MO.getImm();
+ return false;
+ case 'm': // decimal const int minus 1
+ if ((MO.getType()) != MachineOperand::MO_Immediate)
+ return true;
+ O << MO.getImm() - 1;
+ return false;
+ case 'z': {
+ // $0 if zero, regular printing otherwise
+ if (MO.getType() != MachineOperand::MO_Immediate)
+ return true;
+ int64_t Val = MO.getImm();
+ if (Val)
+ O << Val;
+ else
+ O << "$0";
+ return false;
+ }
+ case 'D': // Second part of a double word register operand
+ case 'L': // Low order register of a double word register operand
+ case 'M': // High order register of a double word register operand
+ {
+ if (OpNum == 0)
+ return true;
+ const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+ if (!FlagsOP.isImm())
+ return true;
+ unsigned Flags = FlagsOP.getImm();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ // Number of registers represented by this operand. We are looking
+ // for 2 in 32-bit mode and 1 in 64-bit mode.
+ if (NumVals != 2) {
+ if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ O << '$' << MipsInstPrinter::getRegisterName(Reg);
+ return false;
+ }
+ return true;
+ }
+
+ unsigned RegOp = OpNum;
+ if (!Subtarget->isGP64bit()){
+ // Endianness reverses which register holds the high or low value
+ // between M and L.
+ switch(ExtraCode[0]) {
+ case 'M':
+ RegOp = (Subtarget->isLittle()) ? OpNum + 1 : OpNum;
+ break;
+ case 'L':
+ RegOp = (Subtarget->isLittle()) ? OpNum : OpNum + 1;
+ break;
+ case 'D': // Always the second part
+ RegOp = OpNum + 1;
+ }
+ if (RegOp >= MI->getNumOperands())
+ return true;
+ const MachineOperand &MO = MI->getOperand(RegOp);
+ if (!MO.isReg())
+ return true;
+ unsigned Reg = MO.getReg();
+ O << '$' << MipsInstPrinter::getRegisterName(Reg);
+ return false;
+ }
+ }
+ }
+ }
+
+ printOperand(MI, OpNum, O);
return false;
}
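The new modifier handling mirrors GCC's MIPS print-operand letters, so user inline asm can address the halves of a 64-bit value that lives in a register pair on 32-bit targets. A hedged usage example (hypothetical user code, assumed to be compiled for 32-bit MIPS):

// 'L'/'M' pick the low/high register of operand 2's pair; endianness
// decides which physical register each one is.
long long In = 0x123456789LL;
unsigned Lo, Hi;
asm("move %0, %L2\n\t"
    "move %1, %M2"
    : "=r"(Lo), "=r"(Hi)
    : "r"(In));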
@@ -398,11 +411,12 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0])
- return true; // Unknown modifier.
+ return true; // Unknown modifier.
const MachineOperand &MO = MI->getOperand(OpNum);
assert(MO.isReg() && "unexpected inline asm memory operand");
O << "0($" << MipsInstPrinter::getRegisterName(MO.getReg()) << ")";
+
return false;
}
@@ -450,7 +464,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
break;
case MachineOperand::MO_BlockAddress: {
- MCSymbol* BA = GetBlockAddressSymbol(MO.getBlockAddress());
+ MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
O << BA->getName();
break;
}
@@ -511,7 +525,7 @@ printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O) {
void MipsAsmPrinter::
printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
const char *Modifier) {
- const MachineOperand& MO = MI->getOperand(opNum);
+ const MachineOperand &MO = MI->getOperand(opNum);
O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm());
}
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 4b7e1d3..8aadefd 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -145,6 +145,58 @@ def RetCC_MipsEABI : CallingConv<[
]>;
//===----------------------------------------------------------------------===//
+// Mips FastCC Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_MipsO32_FastCC : CallingConv<[
+ // f64 arguments are passed in double-precision floating-point registers.
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7, D8, D9]>>,
+
+ // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_MipsN_FastCC : CallingConv<[
+ // Integer arguments are passed in integer registers.
+ CCIfType<[i64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64, T0_64, T1_64,
+ T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+ T8_64, V1_64]>>,
+
+ // f64 arguments are passed in double-precision floating-point registers.
+ CCIfType<[f64], CCAssignToReg<[D0_64, D1_64, D2_64, D3_64, D4_64, D5_64,
+ D6_64, D7_64, D8_64, D9_64, D10_64, D11_64,
+ D12_64, D13_64, D14_64, D15_64, D16_64, D17_64,
+ D18_64, D19_64]>>,
+
+ // Stack parameter slots for i64 and f64 are 64-bit doublewords and
+ // 8-byte aligned.
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_Mips_FastCC : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Integer arguments are passed in integer registers. All scratch registers,
+ // except for AT, V0 and T9, are available to be used as argument registers.
+ CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6,
+ T7, T8, V1]>>,
+
+ // f32 arguments are passed in single-precision floating-point registers.
+ CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10,
+ F11, F12, F13, F14, F15, F16, F17, F18, F19]>>,
+
+ // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>,
+ CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>,
+ CCDelegateTo<CC_MipsN_FastCC>
+]>;
+
+//===----------------------------------------------------------------------===//
// Mips Calling Convention Dispatch
//===----------------------------------------------------------------------===//
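To make the fastcc tables above concrete, here is where CC_Mips_FastCC would place a hypothetical all-integer O32 call (offsets follow from CCAssignToStack<4, 4>; shown as C++ comments):

// fastcc call with 16 i32 arguments (hypothetical):
//   args 1-14 -> A0 A1 A2 A3 T0 T1 T2 T3 T4 T5 T6 T7 T8 V1
//   arg  15   -> 4-byte stack slot, offset 0
//   arg  16   -> 4-byte stack slot, offset 4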
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index 7d81902..cb7022b 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -145,8 +145,8 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB){
MCE.StartMachineBasicBlock(MBB);
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I)
+ for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+ E = MBB->instr_end(); I != E; ++I)
emitInstruction(*I);
}
} while (MCE.finishFunction(MF));
@@ -258,7 +258,7 @@ void MipsCodeEmitter::emitGlobalAddressUnaligned(const GlobalValue *GV,
void MipsCodeEmitter::
emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
- Reloc, ES, 0, 0, false));
+ Reloc, ES, 0, 0));
}
void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index da33680..b12b1f2 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -61,41 +61,54 @@ multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC,
Instruction MOVZInst, Instruction SLTOp,
Instruction SLTuOp, Instruction SLTiOp,
Instruction SLTiuOp> {
- def : Pat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
- def : Pat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
- def : Pat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
- def : Pat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
- def : Pat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
- def : Pat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<
+ (select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<
+ (select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
+ def : MipsPat<
+ (select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
+ def : MipsPat<
+ (select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+ def : MipsPat<
+ (select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
}
multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC,
Instruction MOVZInst, Instruction XOROp> {
- def : Pat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
- def : Pat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F),
- (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>;
+ def : MipsPat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+multiclass MovzPats2<RegisterClass CRC, RegisterClass DRC,
+ Instruction MOVZInst, Instruction XORiOp> {
+ def : MipsPat<
+ (select (i32 (seteq CRC:$lhs, immZExt16:$uimm16)), DRC:$T, DRC:$F),
+ (MOVZInst DRC:$T, (XORiOp CRC:$lhs, immZExt16:$uimm16), DRC:$F)>;
}
multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
Instruction XOROp> {
- def : Pat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
- (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
- def : Pat<(select CRC:$cond, DRC:$T, DRC:$F),
- (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>;
- def : Pat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F),
- (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>;
+ def : MipsPat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+ (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+ def : MipsPat<(select CRC:$cond, DRC:$T, DRC:$F),
+ (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>;
+ def : MipsPat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F),
+ (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>;
}
// Instantiation of instructions.
def MOVZ_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0a, "movz">;
-let Predicates = [HasMips64],DecoderNamespace = "Mips64" in {
+let Predicates = [HasMips64, HasStandardEncoding],
+ DecoderNamespace = "Mips64" in {
def MOVZ_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0a, "movz">;
def MOVZ_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0a, "movz"> {
let isCodeGenOnly = 1;
@@ -106,7 +119,8 @@ let Predicates = [HasMips64],DecoderNamespace = "Mips64" in {
}
def MOVN_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0b, "movn">;
-let Predicates = [HasMips64],DecoderNamespace = "Mips64" in {
+let Predicates = [HasMips64, HasStandardEncoding],
+ DecoderNamespace = "Mips64" in {
def MOVN_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0b, "movn">;
def MOVN_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0b, "movn"> {
let isCodeGenOnly = 1;
@@ -118,21 +132,22 @@ let Predicates = [HasMips64],DecoderNamespace = "Mips64" in {
def MOVZ_I_S : CondMovIntFP<CPURegs, FGR32, 16, 18, "movz.s">;
def MOVZ_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 18, "movz.s">,
- Requires<[HasMips64]> {
+ Requires<[HasMips64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
def MOVN_I_S : CondMovIntFP<CPURegs, FGR32, 16, 19, "movn.s">;
def MOVN_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 19, "movn.s">,
- Requires<[HasMips64]> {
+ Requires<[HasMips64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
-let Predicates = [NotFP64bit] in {
+let Predicates = [NotFP64bit, HasStandardEncoding] in {
def MOVZ_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 18, "movz.d">;
def MOVN_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 19, "movn.d">;
}
-let Predicates = [IsFP64bit],DecoderNamespace = "Mips64" in {
+let Predicates = [IsFP64bit, HasStandardEncoding],
+ DecoderNamespace = "Mips64" in {
def MOVZ_I_D64 : CondMovIntFP<CPURegs, FGR64, 17, 18, "movz.d">;
def MOVZ_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 18, "movz.d"> {
let isCodeGenOnly = 1;
@@ -145,24 +160,25 @@ let Predicates = [IsFP64bit],DecoderNamespace = "Mips64" in {
def MOVT_I : CondMovFPInt<CPURegs, MipsCMovFP_T, 1, "movt">;
def MOVT_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_T, 1, "movt">,
- Requires<[HasMips64]> {
+ Requires<[HasMips64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
def MOVF_I : CondMovFPInt<CPURegs, MipsCMovFP_F, 0, "movf">;
def MOVF_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_F, 0, "movf">,
- Requires<[HasMips64]> {
+ Requires<[HasMips64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">;
def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">;
-let Predicates = [NotFP64bit] in {
+let Predicates = [NotFP64bit, HasStandardEncoding] in {
def MOVT_D32 : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">;
def MOVF_D32 : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">;
}
-let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in {
+let Predicates = [IsFP64bit, HasStandardEncoding],
+ DecoderNamespace = "Mips64" in {
def MOVT_D64 : CondMovFPFP<FGR64, MipsCMovFP_T, 17, 1, "movt.d">;
def MOVF_D64 : CondMovFPFP<FGR64, MipsCMovFP_F, 17, 0, "movf.d">;
}
@@ -170,7 +186,8 @@ let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in {
// Instantiation of conditional move patterns.
defm : MovzPats0<CPURegs, CPURegs, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>;
defm : MovzPats1<CPURegs, CPURegs, MOVZ_I_I, XOR>;
-let Predicates = [HasMips64] in {
+defm : MovzPats2<CPURegs, CPURegs, MOVZ_I_I, XORi>;
+let Predicates = [HasMips64, HasStandardEncoding] in {
defm : MovzPats0<CPURegs, CPU64Regs, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>;
defm : MovzPats0<CPU64Regs, CPURegs, MOVZ_I_I, SLT64, SLTu64, SLTi64,
SLTiu64>;
@@ -179,10 +196,13 @@ let Predicates = [HasMips64] in {
defm : MovzPats1<CPURegs, CPU64Regs, MOVZ_I_I64, XOR>;
defm : MovzPats1<CPU64Regs, CPURegs, MOVZ_I64_I, XOR64>;
defm : MovzPats1<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XOR64>;
+ defm : MovzPats2<CPURegs, CPU64Regs, MOVZ_I_I64, XORi>;
+ defm : MovzPats2<CPU64Regs, CPURegs, MOVZ_I64_I, XORi64>;
+ defm : MovzPats2<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XORi64>;
}
defm : MovnPats<CPURegs, CPURegs, MOVN_I_I, XOR>;
-let Predicates = [HasMips64] in {
+let Predicates = [HasMips64, HasStandardEncoding] in {
defm : MovnPats<CPURegs, CPU64Regs, MOVN_I_I64, XOR>;
defm : MovnPats<CPU64Regs, CPURegs, MOVN_I64_I, XOR64>;
defm : MovnPats<CPU64Regs, CPU64Regs, MOVN_I64_I64, XOR64>;
@@ -191,19 +211,19 @@ let Predicates = [HasMips64] in {
defm : MovzPats0<CPURegs, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>;
defm : MovzPats1<CPURegs, FGR32, MOVZ_I_S, XOR>;
defm : MovnPats<CPURegs, FGR32, MOVN_I_S, XOR>;
-let Predicates = [HasMips64] in {
+let Predicates = [HasMips64, HasStandardEncoding] in {
defm : MovzPats0<CPU64Regs, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64,
SLTiu64>;
defm : MovzPats1<CPU64Regs, FGR32, MOVZ_I64_S, XOR64>;
defm : MovnPats<CPU64Regs, FGR32, MOVN_I64_S, XOR64>;
}
-let Predicates = [NotFP64bit] in {
+let Predicates = [NotFP64bit, HasStandardEncoding] in {
defm : MovzPats0<CPURegs, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>;
defm : MovzPats1<CPURegs, AFGR64, MOVZ_I_D32, XOR>;
defm : MovnPats<CPURegs, AFGR64, MOVN_I_D32, XOR>;
}
-let Predicates = [IsFP64bit] in {
+let Predicates = [IsFP64bit, HasStandardEncoding] in {
defm : MovzPats0<CPURegs, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>;
defm : MovzPats0<CPU64Regs, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64,
SLTiu64>;
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index debf2f1..2bba8a3 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -36,12 +36,21 @@ static cl::opt<bool> EnableDelaySlotFiller(
 cl::desc("Fill the Mips delay slots with useful instructions."),
cl::Hidden);
+// This option can be used to silence complaints by machine verifier passes.
+static cl::opt<bool> SkipDelaySlotFiller(
+ "skip-mips-delay-filler",
+ cl::init(false),
+ cl::desc("Skip MIPS' delay slot filling pass."),
+ cl::Hidden);
+
namespace {
struct Filler : public MachineFunctionPass {
+ typedef MachineBasicBlock::instr_iterator InstrIter;
+ typedef MachineBasicBlock::reverse_instr_iterator ReverseInstrIter;
TargetMachine &TM;
const TargetInstrInfo *TII;
- MachineBasicBlock::iterator LastFiller;
+ InstrIter LastFiller;
static char ID;
Filler(TargetMachine &tm)
@@ -53,6 +62,9 @@ namespace {
bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &F) {
+ if (SkipDelaySlotFiller)
+ return false;
+
bool Changed = false;
for (MachineFunction::iterator FI = F.begin(), FE = F.end();
FI != FE; ++FI)
@@ -61,27 +73,27 @@ namespace {
}
bool isDelayFiller(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator candidate);
+ InstrIter candidate);
- void insertCallUses(MachineBasicBlock::iterator MI,
- SmallSet<unsigned, 32>& RegDefs,
- SmallSet<unsigned, 32>& RegUses);
+ void insertCallUses(InstrIter MI,
+ SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses);
- void insertDefsUses(MachineBasicBlock::iterator MI,
- SmallSet<unsigned, 32>& RegDefs,
- SmallSet<unsigned, 32>& RegUses);
+ void insertDefsUses(InstrIter MI,
+ SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses);
- bool IsRegInSet(SmallSet<unsigned, 32>& RegSet,
+ bool IsRegInSet(SmallSet<unsigned, 32> &RegSet,
unsigned Reg);
- bool delayHasHazard(MachineBasicBlock::iterator candidate,
+ bool delayHasHazard(InstrIter candidate,
bool &sawLoad, bool &sawStore,
SmallSet<unsigned, 32> &RegDefs,
SmallSet<unsigned, 32> &RegUses);
bool
- findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot,
- MachineBasicBlock::iterator &Filler);
+ findDelayInstr(MachineBasicBlock &MBB, InstrIter slot,
+ InstrIter &Filler);
};
@@ -93,14 +105,14 @@ namespace {
bool Filler::
runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- LastFiller = MBB.end();
+ LastFiller = MBB.instr_end();
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+ for (InstrIter I = MBB.instr_begin(); I != MBB.instr_end(); ++I)
if (I->hasDelaySlot()) {
++FilledSlots;
Changed = true;
- MachineBasicBlock::iterator D;
+ InstrIter D;
if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) {
MBB.splice(llvm::next(I), &MBB, D);
@@ -111,6 +123,10 @@ runOnMachineBasicBlock(MachineBasicBlock &MBB) {
// Record the filler instruction that filled the delay slot.
// The instruction after it will be visited in the next iteration.
LastFiller = ++I;
+
+ // Set InsideBundle bit so that the machine verifier doesn't expect this
+ // instruction to be a terminator.
+ LastFiller->setIsInsideBundle();
}
return Changed;
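
For reference, a minimal sketch of the bundling idiom this hunk adopts, written
against this revision's MachineBasicBlock/MachineInstr API (a sketch, not part
of the patch itself):

    // Move a chosen filler instruction into the delay slot of a branch and
    // bundle it with the branch, so the machine verifier does not complain
    // about a non-terminator following a terminator.
    void fillAndBundle(MachineBasicBlock &MBB,
                       MachineBasicBlock::instr_iterator Branch,
                       MachineBasicBlock::instr_iterator Filler) {
      MBB.splice(llvm::next(Branch), &MBB, Filler); // place after the branch
      llvm::next(Branch)->setIsInsideBundle();      // mark as bundled
    }
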
@@ -123,8 +139,8 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
}
bool Filler::findDelayInstr(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator slot,
- MachineBasicBlock::iterator &Filler) {
+ InstrIter slot,
+ InstrIter &Filler) {
SmallSet<unsigned, 32> RegDefs;
SmallSet<unsigned, 32> RegUses;
@@ -133,13 +149,13 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB,
bool sawLoad = false;
bool sawStore = false;
- for (MachineBasicBlock::reverse_iterator I(slot); I != MBB.rend(); ++I) {
+ for (ReverseInstrIter I(slot); I != MBB.instr_rend(); ++I) {
// skip debug value
if (I->isDebugValue())
continue;
// Convert to forward iterator.
- MachineBasicBlock::iterator FI(llvm::next(I).base());
+ InstrIter FI(llvm::next(I).base());
if (I->hasUnmodeledSideEffects()
|| I->isInlineAsm()
@@ -165,7 +181,7 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB,
return false;
}
-bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
+bool Filler::delayHasHazard(InstrIter candidate,
bool &sawLoad, bool &sawStore,
SmallSet<unsigned, 32> &RegDefs,
SmallSet<unsigned, 32> &RegUses) {
@@ -213,9 +229,9 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
}
// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
-void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
- SmallSet<unsigned, 32>& RegDefs,
- SmallSet<unsigned, 32>& RegUses) {
+void Filler::insertDefsUses(InstrIter MI,
+ SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses) {
// If MI is a call or return, just examine the explicit non-variadic operands.
MCInstrDesc MCID = MI->getDesc();
unsigned e = MI->isCall() || MI->isReturn() ? MCID.getNumOperands() :
@@ -240,14 +256,11 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
}
// Returns true if the Reg or its alias is in the RegSet.
-bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg) {
- if (RegSet.count(Reg))
- return true;
- // check Aliased Registers
- for (const uint16_t *Alias = TM.getRegisterInfo()->getAliasSet(Reg);
- *Alias; ++Alias)
- if (RegSet.count(*Alias))
+bool Filler::IsRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg) {
+ // Check Reg and all aliased registers.
+ for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true);
+ AI.isValid(); ++AI)
+ if (RegSet.count(*AI))
return true;
-
return false;
}
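
The rewritten IsRegInSet above relies on MCRegAliasIterator's IncludeSelf flag
to fold the "check Reg itself" case into the alias walk. A hedged,
self-contained sketch of the same idiom:

    #include "llvm/ADT/SmallSet.h"
    #include "llvm/MC/MCRegisterInfo.h"

    // Returns true if Reg, or any register aliasing Reg, is in RegSet.
    // The third constructor argument (true) makes the iterator visit Reg
    // itself before its aliases.
    static bool overlapsSet(const llvm::MCRegisterInfo *MRI, unsigned Reg,
                            const llvm::SmallSet<unsigned, 32> &RegSet) {
      for (llvm::MCRegAliasIterator AI(Reg, MRI, /*IncludeSelf=*/true);
           AI.isValid(); ++AI)
        if (RegSet.count(*AI))
          return true;
      return false;
    }
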
diff --git a/lib/Target/Mips/MipsEmitGPRestore.cpp b/lib/Target/Mips/MipsEmitGPRestore.cpp
deleted file mode 100644
index 119d1a8..0000000
--- a/lib/Target/Mips/MipsEmitGPRestore.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===-- MipsEmitGPRestore.cpp - Emit GP Restore Instruction ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass emits instructions that restore $gp right
-// after jalr instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "emit-gp-restore"
-
-#include "Mips.h"
-#include "MipsTargetMachine.h"
-#include "MipsMachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/ADT/Statistic.h"
-
-using namespace llvm;
-
-namespace {
- struct Inserter : public MachineFunctionPass {
-
- TargetMachine &TM;
- const TargetInstrInfo *TII;
-
- static char ID;
- Inserter(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
-
- virtual const char *getPassName() const {
- return "Mips Emit GP Restore";
- }
-
- bool runOnMachineFunction(MachineFunction &F);
- };
- char Inserter::ID = 0;
-} // end of anonymous namespace
-
-bool Inserter::runOnMachineFunction(MachineFunction &F) {
- MipsFunctionInfo *MipsFI = F.getInfo<MipsFunctionInfo>();
-
- if ((TM.getRelocationModel() != Reloc::PIC_) ||
- (!MipsFI->globalBaseRegFixed()))
- return false;
-
- bool Changed = false;
- int FI = MipsFI->getGPFI();
-
- for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
- MFI != MFE; ++MFI) {
- MachineBasicBlock& MBB = *MFI;
- MachineBasicBlock::iterator I = MFI->begin();
-
- // If MBB is a landing pad, insert instruction that restores $gp after
- // EH_LABEL.
- if (MBB.isLandingPad()) {
- // Find EH_LABEL first.
- for (; I->getOpcode() != TargetOpcode::EH_LABEL; ++I) ;
-
- // Insert lw.
- ++I;
- DebugLoc dl = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
- BuildMI(MBB, I, dl, TII->get(Mips::LW), Mips::GP).addFrameIndex(FI)
- .addImm(0);
- Changed = true;
- }
-
- while (I != MFI->end()) {
- if (I->getOpcode() != Mips::JALR) {
- ++I;
- continue;
- }
-
- DebugLoc dl = I->getDebugLoc();
- // emit lw $gp, ($gp save slot on stack) after jalr
- BuildMI(MBB, ++I, dl, TII->get(Mips::LW), Mips::GP).addFrameIndex(FI)
- .addImm(0);
- Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// createMipsEmitGPRestorePass - Returns a pass that emits instructions that
-/// restores $gp clobbered by jalr instructions.
-FunctionPass *llvm::createMipsEmitGPRestorePass(MipsTargetMachine &tm) {
- return new Inserter(tm);
-}
-
diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp
deleted file mode 100644
index baeae97..0000000
--- a/lib/Target/Mips/MipsExpandPseudo.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//===-- MipsExpandPseudo.cpp - Expand Pseudo Instructions ----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass expands pseudo instructions into target instructions after register
-// allocation but before post-RA scheduling.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mips-expand-pseudo"
-
-#include "Mips.h"
-#include "MipsTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/ADT/Statistic.h"
-
-using namespace llvm;
-
-namespace {
- struct MipsExpandPseudo : public MachineFunctionPass {
-
- TargetMachine &TM;
- const TargetInstrInfo *TII;
-
- static char ID;
- MipsExpandPseudo(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
-
- virtual const char *getPassName() const {
- return "Mips PseudoInstrs Expansion";
- }
-
- bool runOnMachineFunction(MachineFunction &F);
- bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
-
- private:
- void ExpandBuildPairF64(MachineBasicBlock&, MachineBasicBlock::iterator);
- void ExpandExtractElementF64(MachineBasicBlock&,
- MachineBasicBlock::iterator);
- };
- char MipsExpandPseudo::ID = 0;
-} // end of anonymous namespace
-
-bool MipsExpandPseudo::runOnMachineFunction(MachineFunction& F) {
- bool Changed = false;
-
- for (MachineFunction::iterator I = F.begin(); I != F.end(); ++I)
- Changed |= runOnMachineBasicBlock(*I);
-
- return Changed;
-}
-
-bool MipsExpandPseudo::runOnMachineBasicBlock(MachineBasicBlock& MBB) {
-
- bool Changed = false;
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
- const MCInstrDesc& MCid = I->getDesc();
-
- switch(MCid.getOpcode()) {
- default:
- ++I;
- continue;
- case Mips::SETGP2:
- // Convert "setgp2 $globalreg, $t9" to "addu $globalreg, $v0, $t9"
- BuildMI(MBB, I, I->getDebugLoc(), TII->get(Mips::ADDu),
- I->getOperand(0).getReg())
- .addReg(Mips::V0).addReg(I->getOperand(1).getReg());
- break;
- case Mips::BuildPairF64:
- ExpandBuildPairF64(MBB, I);
- break;
- case Mips::ExtractElementF64:
- ExpandExtractElementF64(MBB, I);
- break;
- }
-
- // delete original instr
- MBB.erase(I++);
- Changed = true;
- }
-
- return Changed;
-}
-
-void MipsExpandPseudo::ExpandBuildPairF64(MachineBasicBlock& MBB,
- MachineBasicBlock::iterator I) {
- unsigned DstReg = I->getOperand(0).getReg();
- unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
- const MCInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1);
- DebugLoc dl = I->getDebugLoc();
- const uint16_t* SubReg =
- TM.getRegisterInfo()->getSubRegisters(DstReg);
-
- // mtc1 Lo, $fp
- // mtc1 Hi, $fp + 1
- BuildMI(MBB, I, dl, Mtc1Tdd, *SubReg).addReg(LoReg);
- BuildMI(MBB, I, dl, Mtc1Tdd, *(SubReg + 1)).addReg(HiReg);
-}
-
-void MipsExpandPseudo::ExpandExtractElementF64(MachineBasicBlock& MBB,
- MachineBasicBlock::iterator I) {
- unsigned DstReg = I->getOperand(0).getReg();
- unsigned SrcReg = I->getOperand(1).getReg();
- unsigned N = I->getOperand(2).getImm();
- const MCInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1);
- DebugLoc dl = I->getDebugLoc();
- const uint16_t* SubReg = TM.getRegisterInfo()->getSubRegisters(SrcReg);
-
- BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(*(SubReg + N));
-}
-
-/// createMipsMipsExpandPseudoPass - Returns a pass that expands pseudo
-/// instrs into real instrs
-FunctionPass *llvm::createMipsExpandPseudoPass(MipsTargetMachine &tm) {
- return new MipsExpandPseudo(tm);
-}
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index f8ea3d0..8c0474b 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -15,6 +15,7 @@
#include "MipsAnalyzeImmediate.h"
#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -81,6 +82,14 @@ using namespace llvm;
//
//===----------------------------------------------------------------------===//
+const MipsFrameLowering *MipsFrameLowering::create(MipsTargetMachine &TM,
+ const MipsSubtarget &ST) {
+ if (TM.getSubtargetImpl()->inMips16Mode())
+ return llvm::createMips16FrameLowering(ST);
+
+ return llvm::createMipsSEFrameLowering(ST);
+}
+
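
The new factory above dispatches on the subtarget once, when the target machine
is constructed. A hypothetical usage sketch (the calling context shown here is
assumed, not part of this patch):

    // Inside MipsTargetMachine's constructor, pick the frame lowering that
    // matches the subtarget: Mips16 or standard-encoding ("SE").
    const MipsFrameLowering *FL = MipsFrameLowering::create(TM, Subtarget);
    // Callers only ever see the common MipsFrameLowering interface.
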
// hasFP - Return true if the specified function should have a dedicated frame
// pointer register. This is true if the function has variable sized allocas or
// if frame pointer elimination is disabled.
@@ -89,238 +98,3 @@ bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
}
-
-bool MipsFrameLowering::targetHandlesStackFrameRounding() const {
- return true;
-}
-
-// Build an instruction sequence to load an immediate that is too large to fit
-// in 16-bit and add the result to Reg.
-static void expandLargeImm(unsigned Reg, int64_t Imm, bool IsN64,
- const MipsInstrInfo &TII, MachineBasicBlock& MBB,
- MachineBasicBlock::iterator II, DebugLoc DL) {
- unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi;
- unsigned ADDu = IsN64 ? Mips::DADDu : Mips::ADDu;
- unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO;
- unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT;
- MipsAnalyzeImmediate AnalyzeImm;
- const MipsAnalyzeImmediate::InstSeq &Seq =
- AnalyzeImm.Analyze(Imm, IsN64 ? 64 : 32, false /* LastInstrIsADDiu */);
- MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
-
- // The first instruction can be a LUi, which is different from other
- // instructions (ADDiu, ORI and SLL) in that it does not have a register
- // operand.
- if (Inst->Opc == LUi)
- BuildMI(MBB, II, DL, TII.get(LUi), ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
- else
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- // Build the remaining instructions in Seq.
- for (++Inst; Inst != Seq.end(); ++Inst)
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(Reg).addReg(ATReg);
-}
-
-void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
- MachineBasicBlock &MBB = MF.front();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
- const MipsRegisterInfo *RegInfo =
- static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
- const MipsInstrInfo &TII =
- *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo());
- MachineBasicBlock::iterator MBBI = MBB.begin();
- DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
- bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_);
- unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
- unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
- unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
- unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
- unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
-
- // First, compute final stack size.
- unsigned RegSize = STI.isGP32bit() ? 4 : 8;
- unsigned StackAlign = getStackAlignment();
- unsigned LocalVarAreaOffset = MipsFI->needGPSaveRestore() ?
- (MFI->getObjectOffset(MipsFI->getGPFI()) + RegSize) :
- MipsFI->getMaxCallFrameSize();
- uint64_t StackSize = RoundUpToAlignment(LocalVarAreaOffset, StackAlign) +
- RoundUpToAlignment(MFI->getStackSize(), StackAlign);
-
- // Update stack size
- MFI->setStackSize(StackSize);
-
- // Emit instructions that set the global base register if the target ABI is
- // O32.
- if (isPIC && MipsFI->globalBaseRegSet() && STI.isABI_O32() &&
- !MipsFI->globalBaseRegFixed()) {
- // See MipsInstrInfo.td for explanation.
- MachineBasicBlock *NewEntry = MF.CreateMachineBasicBlock();
- MF.insert(&MBB, NewEntry);
- NewEntry->addSuccessor(&MBB);
-
- // Copy live in registers.
- for (MachineBasicBlock::livein_iterator R = MBB.livein_begin();
- R != MBB.livein_end(); ++R)
- NewEntry->addLiveIn(*R);
-
- BuildMI(*NewEntry, NewEntry->begin(), dl, TII.get(Mips:: SETGP01),
- Mips::V0);
- }
-
- // No need to allocate space on the stack.
- if (StackSize == 0 && !MFI->adjustsStack()) return;
-
- MachineModuleInfo &MMI = MF.getMMI();
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
- MachineLocation DstML, SrcML;
-
- // Adjust stack.
- if (isInt<16>(-StackSize)) // addi sp, sp, (-stacksize)
- BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(-StackSize);
- else { // Expand immediate that doesn't fit in 16-bit.
- MipsFI->setEmitNOAT();
- expandLargeImm(SP, -StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl);
- }
-
- // emit ".cfi_def_cfa_offset StackSize"
- MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
- DstML = MachineLocation(MachineLocation::VirtualFP);
- SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
- Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
-
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
- if (CSI.size()) {
- // Find the instruction past the last instruction that saves a callee-saved
- // register to the stack.
- for (unsigned i = 0; i < CSI.size(); ++i)
- ++MBBI;
-
- // Iterate over list of callee-saved registers and emit .cfi_offset
- // directives.
- MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
-
- for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
- E = CSI.end(); I != E; ++I) {
- int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
- unsigned Reg = I->getReg();
-
- // If Reg is a double precision register, emit two cfa_offsets,
- // one for each of the paired single precision registers.
- if (Mips::AFGR64RegisterClass->contains(Reg)) {
- const uint16_t *SubRegs = RegInfo->getSubRegisters(Reg);
- MachineLocation DstML0(MachineLocation::VirtualFP, Offset);
- MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4);
- MachineLocation SrcML0(*SubRegs);
- MachineLocation SrcML1(*(SubRegs + 1));
-
- if (!STI.isLittle())
- std::swap(SrcML0, SrcML1);
-
- Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0));
- Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1));
- }
- else {
- // Reg is either in CPURegs or FGR32.
- DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
- SrcML = MachineLocation(Reg);
- Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
- }
- }
- }
-
- // if framepointer enabled, set it to point to the stack pointer.
- if (hasFP(MF)) {
- // Insert instruction "move $fp, $sp" at this location.
- BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO);
-
- // emit ".cfi_def_cfa_register $fp"
- MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
- DstML = MachineLocation(FP);
- SrcML = MachineLocation(MachineLocation::VirtualFP);
- Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML));
- }
-
- // Restore GP from the saved stack location
- if (MipsFI->needGPSaveRestore()) {
- unsigned Offset = MFI->getObjectOffset(MipsFI->getGPFI());
- BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)).addImm(Offset)
- .addReg(Mips::GP);
- }
-}
-
-void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const MipsInstrInfo &TII =
- *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo());
- DebugLoc dl = MBBI->getDebugLoc();
- unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
- unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
- unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
- unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
- unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
-
- // if framepointer enabled, restore the stack pointer.
- if (hasFP(MF)) {
- // Find the first instruction that restores a callee-saved register.
- MachineBasicBlock::iterator I = MBBI;
-
- for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
- --I;
-
- // Insert instruction "move $sp, $fp" at this location.
- BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO);
- }
-
- // Get the number of bytes from FrameInfo
- uint64_t StackSize = MFI->getStackSize();
-
- if (!StackSize)
- return;
-
- // Adjust stack.
- if (isInt<16>(StackSize)) // addi sp, sp, (-stacksize)
- BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(StackSize);
- else // Expand immediate that doesn't fit in 16-bit.
- expandLargeImm(SP, StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl);
-}
-
-void MipsFrameLowering::
-processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
- MachineRegisterInfo& MRI = MF.getRegInfo();
- unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
-
- // FIXME: remove this code if register allocator can correctly mark
- // $fp and $ra used or unused.
-
- // Mark $fp and $ra as used or unused.
- if (hasFP(MF))
- MRI.setPhysRegUsed(FP);
-
- // The register allocator might determine $ra is used after seeing
- // instruction "jr $ra", but we do not want PrologEpilogInserter to insert
- // instructions to save/restore $ra unless there is a function call.
- // To correct this, $ra is explicitly marked unused if there is no
- // function call.
- if (MF.getFrameInfo()->hasCalls())
- MRI.setPhysRegUsed(Mips::RA);
- else {
- MRI.setPhysRegUnused(Mips::RA);
- MRI.setPhysRegUnused(Mips::RA_64);
- }
-}
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index bd1d89f..ed7b7fe 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -27,23 +27,19 @@ protected:
public:
explicit MipsFrameLowering(const MipsSubtarget &sti)
- : TargetFrameLowering(StackGrowsDown, sti.hasMips64() ? 16 : 8, 0),
- STI(sti) {
- }
+ : TargetFrameLowering(StackGrowsDown, sti.hasMips64() ? 16 : 8, 0,
+ sti.hasMips64() ? 16 : 8), STI(sti) {}
- bool targetHandlesStackFrameRounding() const;
-
- /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
- /// the function.
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ static const MipsFrameLowering *create(MipsTargetMachine &TM,
+ const MipsSubtarget &ST);
bool hasFP(const MachineFunction &MF) const;
-
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const;
};
+/// Create MipsFrameLowering objects.
+const MipsFrameLowering *createMips16FrameLowering(const MipsSubtarget &ST);
+const MipsFrameLowering *createMipsSEFrameLowering(const MipsSubtarget &ST);
+
} // End llvm namespace
#endif
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index f0651c6..5a97c17 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -125,20 +125,19 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) {
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
- unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
- bool FixGlobalBaseReg = MipsFI->globalBaseRegFixed();
-
- if (Subtarget.isABI_O32() && FixGlobalBaseReg)
- // $gp is the global base register.
- V0 = V1 = GlobalBaseReg;
- else {
- const TargetRegisterClass *RC;
- RC = Subtarget.isABI_N64() ?
- Mips::CPU64RegsRegisterClass : Mips::CPURegsRegisterClass;
-
- V0 = RegInfo.createVirtualRegister(RC);
- V1 = RegInfo.createVirtualRegister(RC);
- }
+ unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+ const TargetRegisterClass *RC;
+
+ if (Subtarget.isABI_N64())
+ RC = (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
+ else if (Subtarget.inMips16Mode())
+ RC = (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+ else
+ RC = (const TargetRegisterClass*)&Mips::CPURegsRegClass;
+
+ V0 = RegInfo.createVirtualRegister(RC);
+ V1 = RegInfo.createVirtualRegister(RC);
+ V2 = RegInfo.createVirtualRegister(RC);
if (Subtarget.isABI_N64()) {
MF.getRegInfo().addLiveIn(Mips::T9_64);
@@ -150,10 +149,25 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) {
const GlobalValue *FName = MF.getFunction();
BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
.addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
- BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0).addReg(Mips::T9_64);
+ BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
+ .addReg(Mips::T9_64);
BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
.addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
- } else if (MF.getTarget().getRelocationModel() == Reloc::Static) {
+ return;
+ }
+
+ if (Subtarget.inMips16Mode()) {
+ BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+ BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
+ BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
+ .addReg(V1).addReg(V2);
+ return;
+ }
+
+ if (MF.getTarget().getRelocationModel() == Reloc::Static) {
// Set global register to __gnu_local_gp.
//
// lui $v0, %hi(__gnu_local_gp)
@@ -162,27 +176,48 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) {
.addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
.addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
- } else {
- MF.getRegInfo().addLiveIn(Mips::T9);
- MBB.addLiveIn(Mips::T9);
-
- if (Subtarget.isABI_N32()) {
- // lui $v0, %hi(%neg(%gp_rel(fname)))
- // addu $v1, $v0, $t9
- // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
- const GlobalValue *FName = MF.getFunction();
- BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
- .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
- BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
- BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
- .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
- } else if (!MipsFI->globalBaseRegFixed()) {
- assert(Subtarget.isABI_O32());
-
- BuildMI(MBB, I, DL, TII.get(Mips::SETGP2), GlobalBaseReg)
- .addReg(Mips::T9);
- }
+ return;
+ }
+
+ MF.getRegInfo().addLiveIn(Mips::T9);
+ MBB.addLiveIn(Mips::T9);
+
+ if (Subtarget.isABI_N32()) {
+ // lui $v0, %hi(%neg(%gp_rel(fname)))
+ // addu $v1, $v0, $t9
+ // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+ const GlobalValue *FName = MF.getFunction();
+ BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+ return;
}
+
+ assert(Subtarget.isABI_O32());
+
+ // For O32 ABI, the following instruction sequence is emitted to initialize
+ // the global base register:
+ //
+ // 0. lui $2, %hi(_gp_disp)
+ // 1. addiu $2, $2, %lo(_gp_disp)
+ // 2. addu $globalbasereg, $2, $t9
+ //
+ // We emit only the last instruction here.
+ //
+ // The GNU linker requires that the first two instructions appear at the
+ // beginning of a function and that no instructions be inserted before or
+ // between them. The two instructions are therefore emitted during lowering
+ // to the MC layer, which prevents any reordering.
+ //
+ // Register $2 (Mips::V0) is added to the list of live-in registers to
+ // ensure that the value defined by instruction 1 (addiu) is valid when
+ // instruction 2 (addu) reads it.
+ MF.getRegInfo().addLiveIn(Mips::V0);
+ MBB.addLiveIn(Mips::V0);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
+ .addReg(Mips::V0).addReg(Mips::T9);
}
bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI,
@@ -207,12 +242,14 @@ bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI,
// Replace uses with ZeroReg.
for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
- E = MRI->use_end(); U != E; ++U) {
+ E = MRI->use_end(); U != E;) {
MachineOperand &MO = U.getOperand();
+ unsigned OpNo = U.getOperandNo();
MachineInstr *MI = MO.getParent();
+ ++U;
// Do not replace if it is a phi's operand or is tied to def operand.
- if (MI->isPHI() || MI->isRegTiedToDefOperand(U.getOperandNo()))
+ if (MI->isPHI() || MI->isRegTiedToDefOperand(OpNo) || MI->isPseudo())
continue;
MO.setReg(ZeroReg);
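
The iterator dance above is the standard use-list idiom: setReg() moves the
operand onto ZeroReg's use list, which would invalidate a use_iterator still
pointing at it. A hedged sketch of the general form:

    // Walk DstReg's uses, advancing the iterator *before* mutating the
    // operand it points at.
    for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
           E = MRI->use_end(); U != E;) {
      MachineOperand &MO = U.getOperand();
      ++U;                // step past MO while U is still valid
      MO.setReg(ZeroReg); // now safe: relinking MO cannot touch U
    }
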
@@ -253,21 +290,6 @@ bool MipsDAGToDAGISel::
SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {
EVT ValTy = Addr.getValueType();
- // If Parent is an unaligned f32 load or store, select a (base + index)
- // floating point load/store instruction (luxc1 or suxc1).
- const LSBaseSDNode* LS = 0;
-
- if (Parent && (LS = dyn_cast<LSBaseSDNode>(Parent))) {
- EVT VT = LS->getMemoryVT();
-
- if (VT.getSizeInBits() / 8 > LS->getAlignment()) {
- assert(TLI.allowsUnalignedMemoryAccesses(VT) &&
- "Unaligned loads/stores not supported for this type.");
- if (VT == MVT::f32)
- return false;
- }
- }
-
// if Address is FI, get the TargetFrameIndex.
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
@@ -316,17 +338,20 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {
// lui $2, %hi($CPI1_0)
// lwc1 $f0, %lo($CPI1_0)($2)
if (Addr.getOperand(1).getOpcode() == MipsISD::Lo) {
- SDValue LoVal = Addr.getOperand(1);
- if (isa<ConstantPoolSDNode>(LoVal.getOperand(0)) ||
- isa<GlobalAddressSDNode>(LoVal.getOperand(0))) {
+ SDValue LoVal = Addr.getOperand(1), Opnd0 = LoVal.getOperand(0);
+ if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+ isa<JumpTableSDNode>(Opnd0)) {
Base = Addr.getOperand(0);
- Offset = LoVal.getOperand(0);
+ Offset = Opnd0;
return true;
}
}
// If an indexed floating point load/store can be emitted, return false.
- if (LS && (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) &&
+ const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
+
+ if (LS &&
+ (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) &&
Subtarget.hasMips32r2Or64())
return false;
}
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index ace47ab..c5207c6 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -81,6 +81,14 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::Sync: return "MipsISD::Sync";
case MipsISD::Ext: return "MipsISD::Ext";
case MipsISD::Ins: return "MipsISD::Ins";
+ case MipsISD::LWL: return "MipsISD::LWL";
+ case MipsISD::LWR: return "MipsISD::LWR";
+ case MipsISD::SWL: return "MipsISD::SWL";
+ case MipsISD::SWR: return "MipsISD::SWR";
+ case MipsISD::LDL: return "MipsISD::LDL";
+ case MipsISD::LDR: return "MipsISD::LDR";
+ case MipsISD::SDL: return "MipsISD::SDL";
+ case MipsISD::SDR: return "MipsISD::SDR";
default: return NULL;
}
}
@@ -98,20 +106,25 @@ MipsTargetLowering(MipsTargetMachine &TM)
setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
// Set up the register classes
- addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass);
+ addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
if (HasMips64)
- addRegisterClass(MVT::i64, Mips::CPU64RegsRegisterClass);
+ addRegisterClass(MVT::i64, &Mips::CPU64RegsRegClass);
+
+ if (Subtarget->inMips16Mode()) {
+ addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
+ addRegisterClass(MVT::i32, &Mips::CPURARegRegClass);
+ }
if (!TM.Options.UseSoftFloat) {
- addRegisterClass(MVT::f32, Mips::FGR32RegisterClass);
+ addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
// When dealing with single precision only, use libcalls
if (!Subtarget->isSingleFloat()) {
if (HasMips64)
- addRegisterClass(MVT::f64, Mips::FGR64RegisterClass);
+ addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
else
- addRegisterClass(MVT::f64, Mips::AFGR64RegisterClass);
+ addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
}
}
@@ -139,15 +152,18 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
if (!TM.Options.NoNaNsFPMath) {
setOperationAction(ISD::FABS, MVT::f32, Custom);
@@ -161,7 +177,14 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
+ }
+
+ if (!HasMips64) {
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
setOperationAction(ISD::SDIV, MVT::i32, Expand);
@@ -192,6 +215,8 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
if (!Subtarget->hasMips32r2())
setOperationAction(ISD::ROTR, MVT::i32, Expand);
@@ -199,9 +224,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
if (!Subtarget->hasMips64r2())
setOperationAction(ISD::ROTR, MVT::i64, Expand);
- setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
@@ -243,9 +265,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
setInsertFencesForAtomic(true);
- if (Subtarget->isSingleFloat())
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
-
if (!Subtarget->hasSEInReg()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -261,6 +280,13 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
}
+ if (HasMips64) {
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
+ setTruncStoreAction(MVT::i64, MVT::i32, Custom);
+ }
+
setTargetDAGCombine(ISD::ADDE);
setTargetDAGCombine(ISD::SUBE);
setTargetDAGCombine(ISD::SDIVREM);
@@ -268,6 +294,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::ADD);
setMinFunctionAlignment(HasMips64 ? 3 : 2);
@@ -276,6 +303,8 @@ MipsTargetLowering(MipsTargetMachine &TM)
setExceptionPointerRegister(IsN64 ? Mips::A0_64 : Mips::A0);
setExceptionSelectorRegister(IsN64 ? Mips::A1_64 : Mips::A1);
+
+ maxStoresPerMemcpy = 16;
}
bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
@@ -284,10 +313,7 @@ bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
switch (SVT) {
case MVT::i64:
case MVT::i32:
- case MVT::i16:
return true;
- case MVT::f32:
- return Subtarget->hasMips32r2Or64();
default:
return false;
}
@@ -305,17 +331,17 @@ EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
// Lo0: initial value of Lo register
// Hi0: initial value of Hi register
// Return true if pattern matching was successful.
-static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) {
+static bool SelectMadd(SDNode *ADDENode, SelectionDAG *CurDAG) {
// ADDENode's second operand must be a flag output of an ADDC node in order
// for the matching to be successful.
- SDNode* ADDCNode = ADDENode->getOperand(2).getNode();
+ SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
if (ADDCNode->getOpcode() != ISD::ADDC)
return false;
SDValue MultHi = ADDENode->getOperand(0);
SDValue MultLo = ADDCNode->getOperand(0);
- SDNode* MultNode = MultHi.getNode();
+ SDNode *MultNode = MultHi.getNode();
unsigned MultOpc = MultHi.getOpcode();
// MultHi and MultLo must be generated by the same node,
@@ -378,17 +404,17 @@ static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) {
// Lo0: initial value of Lo register
// Hi0: initial value of Hi register
// Return true if pattern matching was successful.
-static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) {
+static bool SelectMsub(SDNode *SUBENode, SelectionDAG *CurDAG) {
// SUBENode's second operand must be a flag output of an SUBC node in order
// for the matching to be successful.
- SDNode* SUBCNode = SUBENode->getOperand(2).getNode();
+ SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
if (SUBCNode->getOpcode() != ISD::SUBC)
return false;
SDValue MultHi = SUBENode->getOperand(1);
SDValue MultLo = SUBCNode->getOperand(1);
- SDNode* MultNode = MultHi.getNode();
+ SDNode *MultNode = MultHi.getNode();
unsigned MultOpc = MultHi.getOpcode();
// MultHi and MultLo must be generated by the same node,
@@ -443,9 +469,9 @@ static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) {
return true;
}
-static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG,
+static SDValue PerformADDECombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget* Subtarget) {
+ const MipsSubtarget *Subtarget) {
if (DCI.isBeforeLegalize())
return SDValue();
@@ -456,9 +482,9 @@ static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG,
return SDValue();
}
-static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG,
+static SDValue PerformSUBECombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget* Subtarget) {
+ const MipsSubtarget *Subtarget) {
if (DCI.isBeforeLegalize())
return SDValue();
@@ -469,9 +495,9 @@ static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG,
return SDValue();
}
-static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
+static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget* Subtarget) {
+ const MipsSubtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -546,7 +572,7 @@ static bool InvertFPCondCode(Mips::CondCode CC) {
// Creates and returns an FPCmp node from a setcc node.
// Returns Op if setcc is not a floating point comparison.
-static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) {
+static SDValue CreateFPCmp(SelectionDAG &DAG, const SDValue &Op) {
// must be a SETCC node
if (Op.getOpcode() != ISD::SETCC)
return Op;
@@ -568,7 +594,7 @@ static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) {
}
// Creates and returns a CMovFPT/F node.
-static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True,
+static SDValue CreateCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
SDValue False, DebugLoc DL) {
bool invert = InvertFPCondCode((Mips::CondCode)
cast<ConstantSDNode>(Cond.getOperand(2))
@@ -578,9 +604,9 @@ static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True,
True.getValueType(), True, False, Cond);
}
-static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG& DAG,
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget* Subtarget) {
+ const MipsSubtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -604,16 +630,16 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG& DAG,
const DebugLoc DL = N->getDebugLoc();
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
SDValue True = N->getOperand(1);
-
+
SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
-
+
return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
}
-static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG,
+static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget* Subtarget) {
+ const MipsSubtarget *Subtarget) {
// Pattern match EXT.
// $dst = and ((sra or srl) $src , pos), (2**size - 1)
// => ext $dst, $src, size, pos
@@ -651,9 +677,9 @@ static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG,
DAG.getConstant(SMSize, MVT::i32));
}
-static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG,
+static SDValue PerformORCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget* Subtarget) {
+ const MipsSubtarget *Subtarget) {
// Pattern match INS.
// $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
// where mask1 = (2**size - 1) << pos, mask0 = ~mask1
@@ -705,6 +731,33 @@ static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG,
DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0));
}
+static SDValue PerformADDCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget *Subtarget) {
+ // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Add = N->getOperand(1);
+
+ if (Add.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ SDValue Lo = Add.getOperand(1);
+
+ if ((Lo.getOpcode() != MipsISD::Lo) ||
+ (Lo.getOperand(0).getOpcode() != ISD::TargetJumpTable))
+ return SDValue();
+
+ EVT ValTy = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+
+ SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0),
+ Add.getOperand(0));
+ return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo);
+}
+
SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
SelectionDAG &DAG = DCI.DAG;
@@ -720,11 +773,13 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
case ISD::UDIVREM:
return PerformDivRemCombine(N, DAG, DCI, Subtarget);
case ISD::SELECT:
- return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case ISD::AND:
return PerformANDCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
return PerformORCombine(N, DAG, DCI, Subtarget);
+ case ISD::ADD:
+ return PerformADDCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
@@ -737,19 +792,25 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
- case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FABS: return LowerFABS(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG);
+ case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG, true);
+ case ISD::SRL_PARTS: return LowerShiftRightParts(Op, DAG, false);
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
}
return SDValue();
}
@@ -784,7 +845,7 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
/*
static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB,
DebugLoc dl,
- const MipsSubtarget* Subtarget,
+ const MipsSubtarget *Subtarget,
const TargetInstrInfo *TII,
bool isFPCmp, unsigned Opc) {
// There is no need to expand CMov instructions if target has
@@ -1440,42 +1501,6 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
// Misc Lower Operation implementation
//===----------------------------------------------------------------------===//
SDValue MipsTargetLowering::
-LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
-{
- MachineFunction &MF = DAG.getMachineFunction();
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
- unsigned SP = IsN64 ? Mips::SP_64 : Mips::SP;
-
- assert(getTargetMachine().getFrameLowering()->getStackAlignment() >=
- cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue() &&
- "Cannot lower if the alignment of the allocated space is larger than \
- that of the stack.");
-
- SDValue Chain = Op.getOperand(0);
- SDValue Size = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
-
- // Get a reference from Mips stack pointer
- SDValue StackPointer = DAG.getCopyFromReg(Chain, dl, SP, getPointerTy());
-
- // Subtract the dynamic size from the actual stack size to
- // obtain the new stack size.
- SDValue Sub = DAG.getNode(ISD::SUB, dl, getPointerTy(), StackPointer, Size);
-
- // The Sub result contains the new stack start address, so it
- // must be placed in the stack pointer register.
- Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, SP, Sub, SDValue());
-
- // This node always has two return values: a new stack pointer
- // value and a chain
- SDVTList VTLs = DAG.getVTList(getPointerTy(), MVT::Other);
- SDValue Ptr = DAG.getFrameIndex(MipsFI->getDynAllocFI(), getPointerTy());
- SDValue Ops[] = { Chain, Ptr, Chain.getValue(1) };
-
- return DAG.getNode(MipsISD::DynAlloc, dl, VTLs, Ops, 3);
-}
-
-SDValue MipsTargetLowering::
LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
{
// The first operand is the chain, the second is the condition, the third is
@@ -1512,6 +1537,19 @@ LowerSELECT(SDValue Op, SelectionDAG &DAG) const
Op.getDebugLoc());
}
+SDValue MipsTargetLowering::
+LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
+{
+ DebugLoc DL = Op.getDebugLoc();
+ EVT Ty = Op.getOperand(0).getValueType();
+ SDValue Cond = DAG.getNode(ISD::SETCC, DL, getSetCCResultType(Ty),
+ Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(4));
+
+ return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2),
+ Op.getOperand(3));
+}
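
LowerSELECT_CC above splits the five-operand select_cc node back into its setcc
and select components. A scalar model of the decomposition (a hedged sketch,
with illustrative types and a fixed condition):

    // select_cc(a, b, t, f, setlt)  ==>  select(setcc(a, b, setlt), t, f)
    static float selectCC(float A, float B, float T, float F) {
      bool Cond = (A < B); // the SETCC node
      return Cond ? T : F; // the SELECT node
    }
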
+
SDValue MipsTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = CreateFPCmp(DAG, Op);
@@ -1614,10 +1652,13 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy();
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- // General Dynamic TLS Model
- bool LocalDynamic = GV->hasInternalLinkage();
- unsigned Flag = LocalDynamic ? MipsII::MO_TLSLDM :MipsII::MO_TLSGD;
+ TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+ if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+ // General Dynamic and Local Dynamic TLS Model.
+ unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM
+ : MipsII::MO_TLSGD;
+
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, Flag);
SDValue Argument = DAG.getNode(MipsISD::Wrapper, dl, PtrVT,
GetGlobalReg(DAG, PtrVT), TGA);
@@ -1632,16 +1673,16 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
Entry.Ty = PtrTy;
Args.push_back(Entry);
- std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(DAG.getEntryNode(), PtrTy,
+ TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy,
false, false, false, false, 0, CallingConv::C,
/*isTailCall=*/false, /*doesNotRet=*/false,
/*isReturnValueUsed=*/true,
TlsGetAddr, Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ret = CallResult.first;
- if (!LocalDynamic)
+ if (model != TLSModel::LocalDynamic)
return Ret;
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
@@ -1655,7 +1696,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
}
SDValue Offset;
- if (GV->isDeclaration()) {
+ if (model == TLSModel::InitialExec) {
// Initial Exec TLS Model
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
MipsII::MO_GOTTPREL);
@@ -1666,6 +1707,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
false, false, false, 0);
} else {
// Local Exec TLS Model
+ assert(model == TLSModel::LocalExec);
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
MipsII::MO_TPREL_HI);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
@@ -1942,9 +1984,26 @@ LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
return FrameAddr;
}
+SDValue MipsTargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Check the depth.
+ assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+ "Return address can be determined only for current frame.");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ EVT VT = Op.getValueType();
+ unsigned RA = IsN64 ? Mips::RA_64 : Mips::RA;
+ MFI->setReturnAddressIsTaken(true);
+
+ // Return RA, which contains the return address. Mark it an implicit live-in.
+ unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
+ return DAG.getCopyFromReg(DAG.getEntryNode(), Op.getDebugLoc(), Reg, VT);
+}
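
This lowering is what __builtin_return_address reaches; per the assert, only
depth 0 is supported. A minimal example that exercises it:

    // Compiled for MIPS, this becomes a copy from $ra (marked live-in above).
    void *whoCalledMe() {
      return __builtin_return_address(0);
    }
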
+
// TODO: set SType according to the desired memory barrier behavior.
SDValue
-MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const {
+MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const {
unsigned SType = 0;
DebugLoc dl = Op.getDebugLoc();
return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0),
@@ -1952,7 +2011,7 @@ MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const {
}
SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op,
- SelectionDAG& DAG) const {
+ SelectionDAG &DAG) const {
// FIXME: Need pseudo-fence for 'singlethread' fences
// FIXME: Set SType for weaker fences where supported/appropriate.
unsigned SType = 0;
@@ -1961,6 +2020,210 @@ SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op,
DAG.getConstant(SType, MVT::i32));
}
+SDValue MipsTargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+ SDValue Shamt = Op.getOperand(2);
+
+ // if shamt < 32:
+ // lo = (shl lo, shamt)
+ // hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt))
+ // else:
+ // lo = 0
+ // hi = (shl lo, shamt[4:0])
+ SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+ DAG.getConstant(-1, MVT::i32));
+ SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo,
+ DAG.getConstant(1, MVT::i32));
+ SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, ShiftRight1Lo,
+ Not);
+ SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, Shamt);
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
+ SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, MVT::i32, Lo, Shamt);
+ SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+ DAG.getConstant(0x20, MVT::i32));
+ Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
+ DAG.getConstant(0, MVT::i32), ShiftLeftLo);
+ Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or);
+
+ SDValue Ops[2] = {Lo, Hi};
+ return DAG.getMergeValues(Ops, 2, DL);
+}
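
The select-based DAG above is easier to follow as a plain reference routine. A
hedged C++ model of the same computation (Shamt in [0, 63]; MIPS 32-bit shifts
implicitly use the shift amount mod 32, modeled here with "& 31"):

    #include <cstdint>

    static void shl64Parts(uint32_t Lo, uint32_t Hi, uint32_t Shamt,
                           uint32_t &OutLo, uint32_t &OutHi) {
      uint32_t S = Shamt & 31;
      uint32_t ShiftLeftLo = Lo << S;
      // srl(srl(Lo, 1), ~Shamt): the bits of Lo crossing into the high word.
      uint32_t Carry = (Lo >> 1) >> (31 - S);
      if (Shamt & 0x20) { // Shamt >= 32
        OutLo = 0;
        OutHi = ShiftLeftLo;
      } else {
        OutLo = ShiftLeftLo;
        OutHi = (Hi << S) | Carry;
      }
    }
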
+
+SDValue MipsTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
+ bool IsSRA) const {
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+ SDValue Shamt = Op.getOperand(2);
+
+ // if shamt < 32:
+ // lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt))
+ // if isSRA:
+ // hi = (sra hi, shamt)
+ // else:
+ // hi = (srl hi, shamt)
+ // else:
+ // if isSRA:
+ // lo = (sra hi, shamt[4:0])
+ // hi = (sra hi, 31)
+ // else:
+ // lo = (srl hi, shamt[4:0])
+ // hi = 0
+ SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+ DAG.getConstant(-1, MVT::i32));
+ SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+ DAG.getConstant(1, MVT::i32));
+ SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, ShiftLeft1Hi, Not);
+ SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, Shamt);
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
+ SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+ Hi, Shamt);
+ SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+ DAG.getConstant(0x20, MVT::i32));
+ SDValue Shift31 = DAG.getNode(ISD::SRA, DL, MVT::i32, Hi,
+ DAG.getConstant(31, MVT::i32));
+ Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftRightHi, Or);
+ Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
+ IsSRA ? Shift31 : DAG.getConstant(0, MVT::i32),
+ ShiftRightHi);
+
+ SDValue Ops[2] = {Lo, Hi};
+ return DAG.getMergeValues(Ops, 2, DL);
+}
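
And the mirror image for the right-shift path, covering both the arithmetic
(IsSRA) and logical cases (again a hedged model, not the emitted code):

    #include <cstdint>

    static void shr64Parts(uint32_t Lo, uint32_t Hi, uint32_t Shamt,
                           bool IsSRA, uint32_t &OutLo, uint32_t &OutHi) {
      uint32_t S = Shamt & 31;
      // shl(shl(Hi, 1), ~Shamt): the bits of Hi crossing into the low word.
      uint32_t Carry = (Hi << 1) << (31 - S);
      uint32_t HiShifted = IsSRA ? (uint32_t)((int32_t)Hi >> S) : Hi >> S;
      if (Shamt & 0x20) { // Shamt >= 32
        OutLo = HiShifted;
        OutHi = IsSRA ? (uint32_t)((int32_t)Hi >> 31) : 0;
      } else {
        OutLo = Carry | (Lo >> S);
        OutHi = HiShifted;
      }
    }
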
+
+static SDValue CreateLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
+ SDValue Chain, SDValue Src, unsigned Offset) {
+ SDValue Ptr = LD->getBasePtr();
+ EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT();
+ EVT BasePtrVT = Ptr.getValueType();
+ DebugLoc DL = LD->getDebugLoc();
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+
+ if (Offset)
+ Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+ DAG.getConstant(Offset, BasePtrVT));
+
+ SDValue Ops[] = { Chain, Ptr, Src };
+ return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT,
+ LD->getMemOperand());
+}
+
+// Expand an unaligned 32- or 64-bit integer load node.
+SDValue MipsTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ EVT MemVT = LD->getMemoryVT();
+
+ // Return if load is aligned or if MemVT is neither i32 nor i64.
+ if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
+ ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
+ return SDValue();
+
+ bool IsLittle = Subtarget->isLittle();
+ EVT VT = Op.getValueType();
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT);
+
+ assert((VT == MVT::i32) || (VT == MVT::i64));
+
+ // Expand
+ // (set dst, (i64 (load baseptr)))
+ // to
+ // (set tmp, (ldl (add baseptr, 7), undef))
+ // (set dst, (ldr baseptr, tmp))
+ if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) {
+ SDValue LDL = CreateLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef,
+ IsLittle ? 7 : 0);
+ return CreateLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL,
+ IsLittle ? 0 : 7);
+ }
+
+ SDValue LWL = CreateLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef,
+ IsLittle ? 3 : 0);
+ SDValue LWR = CreateLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL,
+ IsLittle ? 0 : 3);
+
+ // Expand
+ // (set dst, (i32 (load baseptr))) or
+ // (set dst, (i64 (sextload baseptr))) or
+ // (set dst, (i64 (extload baseptr)))
+ // to
+ // (set tmp, (lwl (add baseptr, 3), undef))
+ // (set dst, (lwr baseptr, tmp))
+ if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) ||
+ (ExtType == ISD::EXTLOAD))
+ return LWR;
+
+ assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD));
+
+ // Expand
+ // (set dst, (i64 (zextload baseptr)))
+ // to
+ // (set tmp0, (lwl (add baseptr, 3), undef))
+ // (set tmp1, (lwr baseptr, tmp0))
+ // (set tmp2, (shl tmp1, 32))
+ // (set dst, (srl tmp2, 32))
+ DebugLoc DL = LD->getDebugLoc();
+ SDValue Const32 = DAG.getConstant(32, MVT::i32);
+ SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
+ SDValue Ops[] = { SRL, LWR.getValue(1) };
+ return DAG.getMergeValues(Ops, 2, DL);
+}
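
For the i32 case, a byte-level model of what the LWL/LWR pair leaves in the
destination register (a hedged sketch of the result, not the emitted code):

    #include <cstdint>

    // Intended little-endian expansion (register names illustrative):
    //   lwl $2, 3($4)   // high-order bytes of the unaligned word
    //   lwr $2, 0($4)   // low-order bytes, merged into the same register
    // Byte-level model of the merged result:
    static uint32_t loadUnalignedLE(const uint8_t *P) {
      return (uint32_t)P[0] | ((uint32_t)P[1] << 8) |
             ((uint32_t)P[2] << 16) | ((uint32_t)P[3] << 24);
    }
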
+
+static SDValue CreateStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
+ SDValue Chain, unsigned Offset) {
+ SDValue Ptr = SD->getBasePtr(), Value = SD->getValue();
+ EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType();
+ DebugLoc DL = SD->getDebugLoc();
+ SDVTList VTList = DAG.getVTList(MVT::Other);
+
+ if (Offset)
+ Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+ DAG.getConstant(Offset, BasePtrVT));
+
+ SDValue Ops[] = { Chain, Value, Ptr };
+ return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT,
+ SD->getMemOperand());
+}
+
+// Expand an unaligned 32- or 64-bit integer store node.
+SDValue MipsTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *SD = cast<StoreSDNode>(Op);
+ EVT MemVT = SD->getMemoryVT();
+
+ // Return if store is aligned or if MemVT is neither i32 nor i64.
+ if ((SD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
+ ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
+ return SDValue();
+
+ bool IsLittle = Subtarget->isLittle();
+ SDValue Value = SD->getValue(), Chain = SD->getChain();
+ EVT VT = Value.getValueType();
+
+ // Expand
+ // (store val, baseptr) or
+ // (truncstore val, baseptr)
+ // to
+ // (swl val, (add baseptr, 3))
+ // (swr val, baseptr)
+ if ((VT == MVT::i32) || SD->isTruncatingStore()) {
+ SDValue SWL = CreateStoreLR(MipsISD::SWL, DAG, SD, Chain,
+ IsLittle ? 3 : 0);
+ return CreateStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3);
+ }
+
+ assert(VT == MVT::i64);
+
+ // Expand
+ // (store val, baseptr)
+ // to
+ // (sdl val, (add baseptr, 7))
+ // (sdr val, baseptr)
+ SDValue SDL = CreateStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0);
+ return CreateStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
+}
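The store path mirrors the load path: swl writes the most significant bytes and swr the least significant ones, so together they perform an unaligned store without any read. Emulating that on an ordinary machine needs two aligned read-modify-write accesses; a hedged C++ sketch (little-endian host assumed, helper name invented):

    #include <cstdint>
    #include <cstring>

    // Sketch: combined swl/swr effect on little-endian via aligned RMW.
    void UnalignedStore32LE(uint8_t *P, uint32_t V) {
      uintptr_t A = reinterpret_cast<uintptr_t>(P);
      unsigned Shift = (A & 3) * 8;
      uint8_t *Base = reinterpret_cast<uint8_t *>(A & ~uintptr_t(3));
      if (Shift == 0) {              // aligned: a single word store
        std::memcpy(Base, &V, 4);
        return;
      }
      uint32_t W0, W1, LowMask = (1u << Shift) - 1;
      std::memcpy(&W0, Base, 4);
      std::memcpy(&W1, Base + 4, 4);
      W0 = (W0 & LowMask) | (V << Shift);         // swr: from P to the boundary
      W1 = (W1 & ~LowMask) | (V >> (32 - Shift)); // swl: the remaining high bytes
      std::memcpy(Base, &W0, 4);
      std::memcpy(Base + 4, &W1, 4);
    }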
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -2153,11 +2416,11 @@ static unsigned getNextIntArgReg(unsigned Reg) {
// Write ByVal Arg to arg registers and stack.
static void
-WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl,
- SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass,
- SmallVector<SDValue, 8>& MemOpChains, int& LastFI,
+WriteByValArg(SDValue Chain, DebugLoc dl,
+ SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass,
+ SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
- const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+ const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
MVT PtrType, bool isLittle) {
unsigned LocMemOffset = VA.getLocMemOffset();
unsigned Offset = 0;
@@ -2229,26 +2492,26 @@ WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl,
return;
}
- // Create a fixed object on stack at offset LocMemOffset and copy
- // remaining part of byval arg to it using memcpy.
+ // Copy remaining part of byval arg using memcpy.
SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
DAG.getConstant(Offset, MVT::i32));
- LastFI = MFI->CreateFixedObject(RemainingSize, LocMemOffset, true);
- SDValue Dst = DAG.getFrameIndex(LastFI, PtrType);
- ByValChain = DAG.getMemcpy(ByValChain, dl, Dst, Src,
- DAG.getConstant(RemainingSize, MVT::i32),
- std::min(ByValAlign, (unsigned)4),
- /*isVolatile=*/false, /*AlwaysInline=*/false,
- MachinePointerInfo(0), MachinePointerInfo(0));
+ SDValue Dst = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr,
+ DAG.getIntPtrConstant(LocMemOffset));
+ Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
+ DAG.getConstant(RemainingSize, MVT::i32),
+ std::min(ByValAlign, (unsigned)4),
+ /*isVolatile=*/false, /*AlwaysInline=*/false,
+ MachinePointerInfo(0), MachinePointerInfo(0));
+ MemOpChains.push_back(Chain);
}
// Copy Mips64 byVal arg to registers and stack.
static void
-PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl,
- SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass,
- SmallVector<SDValue, 8>& MemOpChains, int& LastFI,
+PassByValArg64(SDValue Chain, DebugLoc dl,
+ SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass,
+ SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
- const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+ const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
EVT PtrTy, bool isLittle) {
unsigned ByValSize = Flags.getByValSize();
unsigned Alignment = std::min(Flags.getByValAlign(), (unsigned)8);
@@ -2318,30 +2581,35 @@ PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl,
assert(MemCpySize && "MemCpySize must not be zero.");
- // Create a fixed object on stack at offset LocMemOffset and copy
- // remainder of byval arg to it with memcpy.
+  // Copy remainder of byval arg with memcpy.
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrTy, Arg,
DAG.getConstant(Offset, PtrTy));
- LastFI = MFI->CreateFixedObject(MemCpySize, LocMemOffset, true);
- SDValue Dst = DAG.getFrameIndex(LastFI, PtrTy);
- ByValChain = DAG.getMemcpy(ByValChain, dl, Dst, Src,
- DAG.getConstant(MemCpySize, PtrTy), Alignment,
- /*isVolatile=*/false, /*AlwaysInline=*/false,
- MachinePointerInfo(0), MachinePointerInfo(0));
+ SDValue Dst = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr,
+ DAG.getIntPtrConstant(LocMemOffset));
+ Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
+ DAG.getConstant(MemCpySize, PtrTy), Alignment,
+ /*isVolatile=*/false, /*AlwaysInline=*/false,
+ MachinePointerInfo(0), MachinePointerInfo(0));
+ MemOpChains.push_back(Chain);
}
/// LowerCall - function arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
/// TODO: isTailCall.
SDValue
-MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
 // MIPS target does not yet support tail call optimization.
isTailCall = false;
@@ -2356,7 +2624,9 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- if (IsO32)
+ if (CallConv == CallingConv::Fast)
+ CCInfo.AnalyzeCallOperands(Outs, CC_Mips_FastCC);
+ else if (IsO32)
CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32);
else if (HasMips64)
AnalyzeMips64CallOperands(CCInfo, Outs);
@@ -2365,54 +2635,32 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NextStackOffset = CCInfo.getNextStackOffset();
-
- // Chain is the output chain of the last Load/Store or CopyToReg node.
- // ByValChain is the output chain of the last Memcpy node created for copying
- // byval arguments to the stack.
- SDValue Chain, CallSeqStart, ByValChain;
- SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
- Chain = CallSeqStart = DAG.getCALLSEQ_START(InChain, NextStackOffsetVal);
- ByValChain = InChain;
-
- // If this is the first call, create a stack frame object that points to
- // a location to which .cprestore saves $gp.
- if (IsO32 && IsPIC && MipsFI->globalBaseRegFixed() && !MipsFI->getGPFI())
- MipsFI->setGPFI(MFI->CreateFixedObject(4, 0, true));
-
- // Get the frame index of the stack frame object that points to the location
- // of dynamically allocated area on the stack.
- int DynAllocFI = MipsFI->getDynAllocFI();
+ unsigned StackAlignment = TFL->getStackAlignment();
+ NextStackOffset = RoundUpToAlignment(NextStackOffset, StackAlignment);
// Update size of the maximum argument space.
// For O32, a minimum of four words (16 bytes) of argument space is
// allocated.
- if (IsO32)
+ if (IsO32 && (CallConv != CallingConv::Fast))
NextStackOffset = std::max(NextStackOffset, (unsigned)16);
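RoundUpToAlignment performs the same arithmetic the deleted lines below spelled out by hand; a one-line sketch with a sample value (function name invented):

    // Same computation as RoundUpToAlignment(NextStackOffset, StackAlignment).
    unsigned RoundUpSketch(unsigned Offset, unsigned Align) {
      return (Offset + Align - 1) / Align * Align;
    }
    // e.g. RoundUpSketch(20, 8) == 24; for O32 the std::max above then
    // raises anything smaller than 16 bytes to the four-word minimum.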
- unsigned MaxCallFrameSize = MipsFI->getMaxCallFrameSize();
-
- if (MaxCallFrameSize < NextStackOffset) {
- MipsFI->setMaxCallFrameSize(NextStackOffset);
-
- // Set the offsets relative to $sp of the $gp restore slot and dynamically
- // allocated stack space. These offsets must be aligned to a boundary
- // determined by the stack alignment of the ABI.
- unsigned StackAlignment = TFL->getStackAlignment();
- NextStackOffset = (NextStackOffset + StackAlignment - 1) /
- StackAlignment * StackAlignment;
+  // Chain is the output chain of the last Load/Store or CopyToReg node.
+ SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal);
- if (MipsFI->needGPSaveRestore())
- MFI->setObjectOffset(MipsFI->getGPFI(), NextStackOffset);
+ SDValue StackPtr = DAG.getCopyFromReg(Chain, dl,
+ IsN64 ? Mips::SP_64 : Mips::SP,
+ getPointerTy());
- MFI->setObjectOffset(DynAllocFI, NextStackOffset);
- }
+ if (MipsFI->getMaxCallFrameSize() < NextStackOffset)
+ MipsFI->setMaxCallFrameSize(NextStackOffset);
 // With EABI it is possible to have 16 args in registers.
SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
- int FirstFI = -MFI->getNumFixedObjects() - 1, LastFI = 0;
-
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
SDValue Arg = OutVals[i];
@@ -2425,11 +2673,11 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
assert(Flags.getByValSize() &&
"ByVal args of size 0 should have been ignored by front-end.");
if (IsO32)
- WriteByValArg(ByValChain, Chain, dl, RegsToPass, MemOpChains, LastFI,
+ WriteByValArg(Chain, dl, RegsToPass, MemOpChains, StackPtr,
MFI, DAG, Arg, VA, Flags, getPointerTy(),
Subtarget->isLittle());
else
- PassByValArg64(ByValChain, Chain, dl, RegsToPass, MemOpChains, LastFI,
+ PassByValArg64(Chain, dl, RegsToPass, MemOpChains, StackPtr,
MFI, DAG, Arg, VA, Flags, getPointerTy(),
Subtarget->isLittle());
continue;
@@ -2479,29 +2727,14 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
// Register can't get to this point...
assert(VA.isMemLoc());
- // Create the frame index object for this incoming parameter
- LastFI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
- VA.getLocMemOffset(), true);
- SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy());
-
 // Emit an ISD::STORE which stores the
 // parameter value to a stack location.
+ SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset()));
MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
MachinePointerInfo(), false, false, 0));
}
- // Extend range of indices of frame objects for outgoing arguments that were
- // created during this function call. Skip this step if no such objects were
- // created.
- if (LastFI)
- MipsFI->extendOutArgFIRange(FirstFI, LastFI);
-
- // If a memcpy has been created to copy a byval arg to a stack, replace the
- // chain input of CallSeqStart with ByValChain.
- if (InChain != ByValChain)
- DAG.UpdateNodeOperands(CallSeqStart.getNode(), ByValChain,
- NextStackOffsetVal);
-
// Transform all store nodes into one single node because all store
// nodes are independent of each other.
if (!MemOpChains.empty())
@@ -2565,6 +2798,9 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
}
}
+ // T9 register operand.
+ SDValue T9;
+
// T9 should contain the address of the callee function if
 // -relocation-model=pic or it is an indirect call.
if (IsPICCall || !GlobalOrExternal) {
@@ -2572,7 +2808,19 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
unsigned T9Reg = IsN64 ? Mips::T9_64 : Mips::T9;
Chain = DAG.getCopyToReg(Chain, dl, T9Reg, Callee, SDValue(0, 0));
InFlag = Chain.getValue(1);
- Callee = DAG.getRegister(T9Reg, getPointerTy());
+
+ if (Subtarget->inMips16Mode())
+ T9 = DAG.getRegister(T9Reg, getPointerTy());
+ else
+ Callee = DAG.getRegister(T9Reg, getPointerTy());
+ }
+
+ // Insert node "GP copy globalreg" before call to function.
+ // Lazy-binding stubs require GP to point to the GOT.
+ if (IsPICCall) {
+ unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
+ EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+ RegsToPass.push_back(std::make_pair(GPReg, GetGlobalReg(DAG, Ty)));
}
// Build a sequence of copy-to-reg nodes chained together with token
@@ -2600,6 +2848,10 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
+ // Add T9 register operand.
+ if (T9.getNode())
+ Ops.push_back(T9);
+
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
@@ -2613,8 +2865,7 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
- Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(NextStackOffset, true),
+ Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
DAG.getIntPtrConstant(0, true), InFlag);
InFlag = Chain.getValue(1);
@@ -2635,7 +2886,7 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);
@@ -2654,9 +2905,9 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Formal Arguments Calling Convention Implementation
//===----------------------------------------------------------------------===//
static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
- std::vector<SDValue>& OutChains,
+ std::vector<SDValue> &OutChains,
SelectionDAG &DAG, unsigned NumWords, SDValue FIN,
- const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+ const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
const Argument *FuncArg) {
unsigned LocMem = VA.getLocMemOffset();
unsigned FirstWord = LocMem / 4;
@@ -2668,7 +2919,7 @@ static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
break;
unsigned SrcReg = O32IntRegs[CurWord];
- unsigned Reg = AddLiveIn(MF, SrcReg, Mips::CPURegsRegisterClass);
+ unsigned Reg = AddLiveIn(MF, SrcReg, &Mips::CPURegsRegClass);
SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN,
DAG.getConstant(i * 4, MVT::i32));
SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32),
@@ -2681,8 +2932,8 @@ static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
// Create frame object on stack and copy registers used for byval passing to it.
static unsigned
CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl,
- std::vector<SDValue>& OutChains, SelectionDAG &DAG,
- const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+ std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+ const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
MachineFrameInfo *MFI, bool IsRegLoc,
SmallVectorImpl<SDValue> &InVals, MipsFunctionInfo *MipsFI,
EVT PtrTy, const Argument *FuncArg) {
@@ -2705,7 +2956,7 @@ CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl,
// Copy arg registers.
for (unsigned I = 0; (Reg != Mips64IntRegs + 8) && (I < NumRegs);
++Reg, ++I) {
- unsigned VReg = AddLiveIn(MF, *Reg, Mips::CPU64RegsRegisterClass);
+ unsigned VReg = AddLiveIn(MF, *Reg, &Mips::CPU64RegsRegClass);
SDValue StorePtr = DAG.getNode(ISD::ADD, dl, PtrTy, FIN,
DAG.getConstant(I * 8, PtrTy));
SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(VReg, MVT::i64),
@@ -2741,7 +2992,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- if (IsO32)
+ if (CallConv == CallingConv::Fast)
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FastCC);
+ else if (IsO32)
CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32);
else
CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
@@ -2781,13 +3034,13 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
- RC = Mips::CPURegsRegisterClass;
+ RC = &Mips::CPURegsRegClass;
else if (RegVT == MVT::i64)
- RC = Mips::CPU64RegsRegisterClass;
+ RC = &Mips::CPU64RegsRegClass;
else if (RegVT == MVT::f32)
- RC = Mips::FGR32RegisterClass;
+ RC = &Mips::FGR32RegClass;
else if (RegVT == MVT::f64)
- RC = HasMips64 ? Mips::FGR64RegisterClass : Mips::AFGR64RegisterClass;
+ RC = HasMips64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
else
llvm_unreachable("RegVT not supported by FormalArguments Lowering");
@@ -2861,8 +3114,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
const uint16_t *ArgRegs = IsO32 ? O32IntRegs : Mips64IntRegs;
unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumOfRegs);
 int FirstRegSlotOffset = IsO32 ? 0 : -64; // offset of $a0's slot.
- const TargetRegisterClass *RC
- = IsO32 ? Mips::CPURegsRegisterClass : Mips::CPU64RegsRegisterClass;
+ const TargetRegisterClass *RC = IsO32 ?
+ (const TargetRegisterClass*)&Mips::CPURegsRegClass :
+ (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
unsigned RegSize = RC->getSize();
int RegSlotOffset = FirstRegSlotOffset + Idx * RegSize;
@@ -2926,7 +3180,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
 // Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
@@ -2972,11 +3226,10 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
// Return on Mips is always a "jr $ra"
if (Flag.getNode())
- return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
- Chain, DAG.getRegister(Mips::RA, MVT::i32), Flag);
- else // Return Void
- return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
- Chain, DAG.getRegister(Mips::RA, MVT::i32));
+ return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain, Flag);
+
+ // Return Void
+ return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain);
}
//===----------------------------------------------------------------------===//
@@ -2995,13 +3248,19 @@ getConstraintType(const std::string &Constraint) const
// unless generating MIPS16 code.
// 'y' : Equivalent to r; retained for
// backwards compatibility.
- // 'f' : Floating Point registers.
+ // 'c' : A register suitable for use in an indirect
+ // jump. This will always be $25 for -mabicalls.
+ // 'l' : The lo register. 1 word storage.
+ // 'x' : The hilo register pair. Double word storage.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default : break;
case 'd':
case 'y':
case 'f':
+ case 'c':
+ case 'l':
+ case 'x':
return C_RegisterClass;
}
}
@@ -3035,6 +3294,22 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
if (type->isFloatTy())
weight = CW_Register;
break;
+ case 'c': // $25 for indirect jumps
+ case 'l': // lo register
+ case 'x': // hilo register pair
+ if (type->isIntegerTy())
+ weight = CW_SpecificReg;
+ break;
+ case 'I': // signed 16 bit immediate
+ case 'J': // integer zero
+ case 'K': // unsigned 16 bit immediate
+ case 'L': // signed 32 bit immediate where lower 16 bits are 0
+ case 'N': // immediate in the range of -65535 to -1 (inclusive)
+ case 'O': // signed 15 bit immediate (+- 16383)
+    case 'P': // immediate in the range of 1 to 65535 (inclusive)
+ if (isa<ConstantInt>(CallOperandVal))
+ weight = CW_Constant;
+ break;
}
return weight;
}
@@ -3050,30 +3325,152 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
case 'y': // Same as 'r'. Exists for compatibility.
case 'r':
- if (VT == MVT::i32)
- return std::make_pair(0U, Mips::CPURegsRegisterClass);
- assert(VT == MVT::i64 && "Unexpected type.");
- return std::make_pair(0U, Mips::CPU64RegsRegisterClass);
+ if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8)
+ return std::make_pair(0U, &Mips::CPURegsRegClass);
+ if (VT == MVT::i64 && !HasMips64)
+ return std::make_pair(0U, &Mips::CPURegsRegClass);
+ if (VT == MVT::i64 && HasMips64)
+ return std::make_pair(0U, &Mips::CPU64RegsRegClass);
+ // This will generate an error message
+ return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
case 'f':
if (VT == MVT::f32)
- return std::make_pair(0U, Mips::FGR32RegisterClass);
+ return std::make_pair(0U, &Mips::FGR32RegClass);
if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
if (Subtarget->isFP64bit())
- return std::make_pair(0U, Mips::FGR64RegisterClass);
- else
- return std::make_pair(0U, Mips::AFGR64RegisterClass);
+ return std::make_pair(0U, &Mips::FGR64RegClass);
+ return std::make_pair(0U, &Mips::AFGR64RegClass);
}
+ break;
+ case 'c': // register suitable for indirect jump
+ if (VT == MVT::i32)
+ return std::make_pair((unsigned)Mips::T9, &Mips::CPURegsRegClass);
+ assert(VT == MVT::i64 && "Unexpected type.");
+ return std::make_pair((unsigned)Mips::T9_64, &Mips::CPU64RegsRegClass);
+  case 'l': // the lo register
+ if (VT == MVT::i32)
+ return std::make_pair((unsigned)Mips::LO, &Mips::HILORegClass);
+ return std::make_pair((unsigned)Mips::LO64, &Mips::HILO64RegClass);
+  case 'x': // the hilo register pair
+    // FIXME: Not triggering the use of both hi and lo.
+ // This will generate an error message
+ return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
}
}
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}
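For reference, these register constraints are typically exercised from GCC/Clang extended inline asm roughly as follows (a hedged sketch; function names are invented and clobber lists depend on the surrounding code):

    // 'l' ties an output to the lo register; 'c' pins an operand to
    // $25 ($t9), the register indirect jumps must use under -mabicalls.
    int MulLow(int A, int B) {
      int Lo;
      __asm__("mult %1, %2" : "=l"(Lo) : "r"(A), "r"(B) : "hi");
      return Lo;
    }

    void JumpThrough(void (*F)(void)) {
      __asm__ volatile("jr %0" : : "c"(F));
    }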
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+                                  std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break; // This will fall through to the generic implementation
+ case 'I': // Signed 16 bit constant
+ // If this fails, the parent routine will give an error
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ EVT Type = Op.getValueType();
+ int64_t Val = C->getSExtValue();
+ if (isInt<16>(Val)) {
+ Result = DAG.getTargetConstant(Val, Type);
+ break;
+ }
+ }
+ return;
+ case 'J': // integer zero
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ EVT Type = Op.getValueType();
+ int64_t Val = C->getZExtValue();
+ if (Val == 0) {
+ Result = DAG.getTargetConstant(0, Type);
+ break;
+ }
+ }
+ return;
+ case 'K': // unsigned 16 bit immediate
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ EVT Type = Op.getValueType();
+ uint64_t Val = (uint64_t)C->getZExtValue();
+ if (isUInt<16>(Val)) {
+ Result = DAG.getTargetConstant(Val, Type);
+ break;
+ }
+ }
+ return;
+ case 'L': // signed 32 bit immediate where lower 16 bits are 0
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ EVT Type = Op.getValueType();
+ int64_t Val = C->getSExtValue();
+      if (isInt<32>(Val) && ((Val & 0xffff) == 0)) {
+ Result = DAG.getTargetConstant(Val, Type);
+ break;
+ }
+ }
+ return;
+ case 'N': // immediate in the range of -65535 to -1 (inclusive)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ EVT Type = Op.getValueType();
+ int64_t Val = C->getSExtValue();
+ if ((Val >= -65535) && (Val <= -1)) {
+ Result = DAG.getTargetConstant(Val, Type);
+ break;
+ }
+ }
+ return;
+ case 'O': // signed 15 bit immediate
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ EVT Type = Op.getValueType();
+ int64_t Val = C->getSExtValue();
+      if (isInt<15>(Val)) {
+ Result = DAG.getTargetConstant(Val, Type);
+ break;
+ }
+ }
+ return;
+ case 'P': // immediate in the range of 1 to 65535 (inclusive)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ EVT Type = Op.getValueType();
+ int64_t Val = C->getSExtValue();
+ if ((Val <= 65535) && (Val >= 1)) {
+ Result = DAG.getTargetConstant(Val, Type);
+ break;
+ }
+ }
+ return;
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
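A hedged sketch of how the immediate constraints validated above appear at the source level (function names invented; the constants are chosen to fall inside each documented range):

    int AddImm(int X) {
      int R;
      __asm__("addiu %0, %1, %2" : "=r"(R) : "r"(X), "I"(1024));  // 'I': signed 16 bit
      return R;
    }

    int OrImm(int X) {
      int R;
      __asm__("ori %0, %1, %2" : "=r"(R) : "r"(X), "K"(0xffff));  // 'K': unsigned 16 bit
      return R;
    }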
+
bool
MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// The Mips target isn't yet aware of offsets.
return false;
}
+EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsZeroVal,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ if (Subtarget->hasMips64())
+ return MVT::i64;
+
+ return MVT::i32;
+}
+
bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (VT != MVT::f32 && VT != MVT::f64)
return false;
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index c36f40f..95ea8fa 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -79,7 +79,17 @@ namespace llvm {
Sync,
Ext,
- Ins
+ Ins,
+
+ // Load/Store Left/Right nodes.
+ LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LWR,
+ SWL,
+ SWR,
+ LDL,
+ LDR,
+ SDL,
+ SDR
};
}
@@ -122,19 +132,25 @@ namespace llvm {
// Lower Operand specifics
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const;
SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
+ bool IsSRA) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
@@ -144,13 +160,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -176,8 +186,22 @@ namespace llvm {
getRegForInlineAsmConstraint(const std::string &Constraint,
EVT VT) const;
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector. If it is invalid, don't add anything to Ops.
+ virtual void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+ virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsZeroVal,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const;
+
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 14d8f1e..3e78c45 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -54,10 +54,14 @@ let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
// Feature predicates.
//===----------------------------------------------------------------------===//
-def IsFP64bit : Predicate<"Subtarget.isFP64bit()">, AssemblerPredicate<"FeatureFP64Bit">;
-def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">, AssemblerPredicate<"!FeatureFP64Bit">;
-def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, AssemblerPredicate<"FeatureSingleFloat">;
-def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, AssemblerPredicate<"!FeatureSingleFloat">;
+def IsFP64bit : Predicate<"Subtarget.isFP64bit()">,
+ AssemblerPredicate<"FeatureFP64Bit">;
+def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">,
+ AssemblerPredicate<"!FeatureFP64Bit">;
+def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">,
+ AssemblerPredicate<"FeatureSingleFloat">;
+def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">,
+ AssemblerPredicate<"!FeatureSingleFloat">;
// FP immediate patterns.
def fpimm0 : PatLeaf<(fpimm), [{
@@ -97,7 +101,7 @@ class FPStore<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>:
}
// FP indexed load.
class FPIdxLoad<bits<6> funct, string opstr, RegisterClass DRC,
- RegisterClass PRC, PatFrag FOp>:
+ RegisterClass PRC, SDPatternOperator FOp = null_frag>:
FFMemIdx<funct, (outs DRC:$fd), (ins PRC:$base, PRC:$index),
!strconcat(opstr, "\t$fd, $index($base)"),
[(set DRC:$fd, (FOp (add PRC:$base, PRC:$index)))]> {
@@ -106,7 +110,7 @@ class FPIdxLoad<bits<6> funct, string opstr, RegisterClass DRC,
// FP indexed store.
class FPIdxStore<bits<6> funct, string opstr, RegisterClass DRC,
- RegisterClass PRC, PatFrag FOp>:
+ RegisterClass PRC, SDPatternOperator FOp= null_frag>:
FFMemIdx<funct, (outs), (ins DRC:$fs, PRC:$base, PRC:$index),
!strconcat(opstr, "\t$fs, $index($base)"),
[(FOp DRC:$fs, (add PRC:$base, PRC:$index))]> {
@@ -117,15 +121,15 @@ class FPIdxStore<bits<6> funct, string opstr, RegisterClass DRC,
multiclass FFR1_W_M<bits<6> funct, string opstr> {
def _S : FFR1<funct, 16, opstr, "w.s", FGR32, FGR32>;
def _D32 : FFR1<funct, 17, opstr, "w.d", FGR32, AFGR64>,
- Requires<[NotFP64bit]>;
+ Requires<[NotFP64bit, HasStandardEncoding]>;
def _D64 : FFR1<funct, 17, opstr, "w.d", FGR32, FGR64>,
- Requires<[IsFP64bit]> {
+ Requires<[IsFP64bit, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
}
// Instructions that convert an FP value to 64-bit fixed point.
-let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in
+let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64" in
multiclass FFR1_L_M<bits<6> funct, string opstr> {
def _S : FFR1<funct, 16, opstr, "l.s", FGR64, FGR32>;
def _D64 : FFR1<funct, 17, opstr, "l.d", FGR64, FGR64>;
@@ -135,9 +139,9 @@ multiclass FFR1_L_M<bits<6> funct, string opstr> {
multiclass FFR1P_M<bits<6> funct, string opstr, SDNode OpNode> {
def _S : FFR1P<funct, 16, opstr, "s", FGR32, FGR32, OpNode>;
def _D32 : FFR1P<funct, 17, opstr, "d", AFGR64, AFGR64, OpNode>,
- Requires<[NotFP64bit]>;
+ Requires<[NotFP64bit, HasStandardEncoding]>;
def _D64 : FFR1P<funct, 17, opstr, "d", FGR64, FGR64, OpNode>,
- Requires<[IsFP64bit]> {
+ Requires<[IsFP64bit, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
}
@@ -146,9 +150,9 @@ multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> {
let isCommutable = isComm in {
def _S : FFR2P<funct, 16, opstr, "s", FGR32, OpNode>;
def _D32 : FFR2P<funct, 17, opstr, "d", AFGR64, OpNode>,
- Requires<[NotFP64bit]>;
+ Requires<[NotFP64bit, HasStandardEncoding]>;
def _D64 : FFR2P<funct, 17, opstr, "d", FGR64, OpNode>,
- Requires<[IsFP64bit]> {
+ Requires<[IsFP64bit, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
}
@@ -185,13 +189,13 @@ def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>;
def CVT_L_S : FFR1<0x25, 16, "cvt", "l.s", FGR64, FGR32>;
def CVT_L_D64: FFR1<0x25, 17, "cvt", "l.d", FGR64, FGR64>;
-let Predicates = [NotFP64bit] in {
+let Predicates = [NotFP64bit, HasStandardEncoding] in {
def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>;
def CVT_D32_W : FFR1<0x21, 20, "cvt", "d.w", AFGR64, FGR32>;
def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>;
}
-let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in {
+let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64" in {
def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>;
def CVT_S_L : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>;
def CVT_D64_W : FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>;
@@ -199,7 +203,7 @@ let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in {
def CVT_D64_L : FFR1<0x21, 21, "cvt", "d.l", FGR64, FGR64>;
}
-let Predicates = [NoNaNsFPMath] in {
+let Predicates = [NoNaNsFPMath, HasStandardEncoding] in {
defm FABS : FFR1P_M<0x5, "abs", fabs>;
defm FNEG : FFR1P_M<0x7, "neg", fneg>;
}
@@ -242,14 +246,14 @@ def DMTC1 : FFRGPR<0x05, (outs FGR64:$fs), (ins CPU64Regs:$rt),
def FMOV_S : FFR1<0x6, 16, "mov", "s", FGR32, FGR32>;
def FMOV_D32 : FFR1<0x6, 17, "mov", "d", AFGR64, AFGR64>,
- Requires<[NotFP64bit]>;
+ Requires<[NotFP64bit, HasStandardEncoding]>;
def FMOV_D64 : FFR1<0x6, 17, "mov", "d", FGR64, FGR64>,
- Requires<[IsFP64bit]> {
+ Requires<[IsFP64bit, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
/// Floating Point Memory Instructions
-let Predicates = [IsN64], DecoderNamespace = "Mips64" in {
+let Predicates = [IsN64, HasStandardEncoding], DecoderNamespace = "Mips64" in {
def LWC1_P8 : FPLoad<0x31, "lwc1", FGR32, mem64>;
def SWC1_P8 : FPStore<0x39, "swc1", FGR32, mem64>;
def LDC164_P8 : FPLoad<0x35, "ldc1", FGR64, mem64> {
@@ -260,81 +264,91 @@ let Predicates = [IsN64], DecoderNamespace = "Mips64" in {
}
}
-let Predicates = [NotN64] in {
+let Predicates = [NotN64, HasStandardEncoding] in {
def LWC1 : FPLoad<0x31, "lwc1", FGR32, mem>;
def SWC1 : FPStore<0x39, "swc1", FGR32, mem>;
}
-let Predicates = [NotN64, HasMips64], DecoderNamespace = "Mips64" in {
+let Predicates = [NotN64, HasMips64, HasStandardEncoding],
+ DecoderNamespace = "Mips64" in {
def LDC164 : FPLoad<0x35, "ldc1", FGR64, mem>;
def SDC164 : FPStore<0x3d, "sdc1", FGR64, mem>;
}
-let Predicates = [NotN64, NotMips64] in {
+let Predicates = [NotN64, NotMips64, HasStandardEncoding] in {
def LDC1 : FPLoad<0x35, "ldc1", AFGR64, mem>;
def SDC1 : FPStore<0x3d, "sdc1", AFGR64, mem>;
}
// Indexed loads and stores.
-let Predicates = [HasMips32r2Or64] in {
+let Predicates = [HasMips32r2Or64, HasStandardEncoding] in {
def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load_a>;
- def LUXC1 : FPIdxLoad<0x5, "luxc1", FGR32, CPURegs, load_u>;
def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store_a>;
- def SUXC1 : FPIdxStore<0xd, "suxc1", FGR32, CPURegs, store_u>;
}
-let Predicates = [HasMips32r2, NotMips64] in {
+let Predicates = [HasMips32r2, NotMips64, HasStandardEncoding] in {
def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load_a>;
def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store_a>;
}
-let Predicates = [HasMips64, NotN64], DecoderNamespace="Mips64" in {
+let Predicates = [HasMips64, NotN64, HasStandardEncoding], DecoderNamespace="Mips64" in {
def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load_a>;
def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store_a>;
}
// n64
-let Predicates = [IsN64], isCodeGenOnly=1 in {
+let Predicates = [IsN64, HasStandardEncoding], isCodeGenOnly=1 in {
def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load_a>;
- def LUXC1_P8 : FPIdxLoad<0x5, "luxc1", FGR32, CPU64Regs, load_u>;
def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load_a>;
def SWXC1_P8 : FPIdxStore<0x8, "swxc1", FGR32, CPU64Regs, store_a>;
- def SUXC1_P8 : FPIdxStore<0xd, "suxc1", FGR32, CPU64Regs, store_u>;
def SDXC164_P8 : FPIdxStore<0x9, "sdxc1", FGR64, CPU64Regs, store_a>;
}
+// Load/store doubleword indexed unaligned.
+let Predicates = [NotMips64, HasStandardEncoding] in {
+ def LUXC1 : FPIdxLoad<0x5, "luxc1", AFGR64, CPURegs>;
+ def SUXC1 : FPIdxStore<0xd, "suxc1", AFGR64, CPURegs>;
+}
+
+let Predicates = [HasMips64, HasStandardEncoding],
+ DecoderNamespace="Mips64" in {
+ def LUXC164 : FPIdxLoad<0x5, "luxc1", FGR64, CPURegs>;
+ def SUXC164 : FPIdxStore<0xd, "suxc1", FGR64, CPURegs>;
+}
+
 /// Floating-point Arithmetic
defm FADD : FFR2P_M<0x00, "add", fadd, 1>;
defm FDIV : FFR2P_M<0x03, "div", fdiv>;
defm FMUL : FFR2P_M<0x02, "mul", fmul, 1>;
defm FSUB : FFR2P_M<0x01, "sub", fsub>;
-let Predicates = [HasMips32r2] in {
+let Predicates = [HasMips32r2, HasStandardEncoding] in {
def MADD_S : FMADDSUB<0x4, 0, "madd", "s", fadd, FGR32>;
def MSUB_S : FMADDSUB<0x5, 0, "msub", "s", fsub, FGR32>;
}
-let Predicates = [HasMips32r2, NoNaNsFPMath] in {
+let Predicates = [HasMips32r2, NoNaNsFPMath, HasStandardEncoding] in {
def NMADD_S : FNMADDSUB<0x6, 0, "nmadd", "s", fadd, FGR32>;
def NMSUB_S : FNMADDSUB<0x7, 0, "nmsub", "s", fsub, FGR32>;
}
-let Predicates = [HasMips32r2, NotFP64bit] in {
+let Predicates = [HasMips32r2, NotFP64bit, HasStandardEncoding] in {
def MADD_D32 : FMADDSUB<0x4, 1, "madd", "d", fadd, AFGR64>;
def MSUB_D32 : FMADDSUB<0x5, 1, "msub", "d", fsub, AFGR64>;
}
-let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath] in {
+let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStandardEncoding] in {
def NMADD_D32 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, AFGR64>;
def NMSUB_D32 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, AFGR64>;
}
-let Predicates = [HasMips32r2, IsFP64bit], isCodeGenOnly=1 in {
+let Predicates = [HasMips32r2, IsFP64bit, HasStandardEncoding], isCodeGenOnly=1 in {
def MADD_D64 : FMADDSUB<0x4, 1, "madd", "d", fadd, FGR64>;
def MSUB_D64 : FMADDSUB<0x5, 1, "msub", "d", fsub, FGR64>;
}
-let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath], isCodeGenOnly=1 in {
+let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStandardEncoding],
+ isCodeGenOnly=1 in {
def NMADD_D64 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, FGR64>;
def NMSUB_D64 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, FGR64>;
}
@@ -391,8 +405,10 @@ class FCMP<bits<5> fmt, RegisterClass RC, string typestr> :
/// Floating Point Compare
let Defs=[FCR31] in {
def FCMP_S32 : FCMP<0x10, FGR32, "s">;
- def FCMP_D32 : FCMP<0x11, AFGR64, "d">, Requires<[NotFP64bit]>;
- def FCMP_D64 : FCMP<0x11, FGR64, "d">, Requires<[IsFP64bit]> {
+ def FCMP_D32 : FCMP<0x11, AFGR64, "d">,
+ Requires<[NotFP64bit, HasStandardEncoding]>;
+ def FCMP_D64 : FCMP<0x11, FGR64, "d">,
+ Requires<[IsFP64bit, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
}
@@ -400,69 +416,59 @@ let Defs=[FCR31] in {
//===----------------------------------------------------------------------===//
// Floating Point Pseudo-Instructions
//===----------------------------------------------------------------------===//
-def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src),
- "# MOVCCRToCCR", []>;
+def MOVCCRToCCR : PseudoSE<(outs CCR:$dst), (ins CCR:$src),
+ "# MOVCCRToCCR", []>;
// This pseudo instr gets expanded into 2 mtc1 instrs after register
// allocation.
def BuildPairF64 :
- MipsPseudo<(outs AFGR64:$dst),
- (ins CPURegs:$lo, CPURegs:$hi), "",
- [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>;
+ PseudoSE<(outs AFGR64:$dst),
+ (ins CPURegs:$lo, CPURegs:$hi), "",
+ [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>;
// This pseudo instr gets expanded into 2 mfc1 instrs after register
// allocation.
// if n is 0, lower part of src is extracted.
// if n is 1, higher part of src is extracted.
def ExtractElementF64 :
- MipsPseudo<(outs CPURegs:$dst),
- (ins AFGR64:$src, i32imm:$n), "",
- [(set CPURegs:$dst,
- (MipsExtractElementF64 AFGR64:$src, imm:$n))]>;
+ PseudoSE<(outs CPURegs:$dst), (ins AFGR64:$src, i32imm:$n), "",
+ [(set CPURegs:$dst, (MipsExtractElementF64 AFGR64:$src, imm:$n))]>;
//===----------------------------------------------------------------------===//
// Floating Point Patterns
//===----------------------------------------------------------------------===//
-def : Pat<(f32 fpimm0), (MTC1 ZERO)>;
-def : Pat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
-
-def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>;
-def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>;
-
-let Predicates = [NotFP64bit] in {
- def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D32_W (MTC1 CPURegs:$src))>;
- def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
- def : Pat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>;
- def : Pat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>;
-}
-
-let Predicates = [IsFP64bit] in {
- def : Pat<(f64 fpimm0), (DMTC1 ZERO_64)>;
- def : Pat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>;
-
- def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D64_W (MTC1 CPURegs:$src))>;
- def : Pat<(f32 (sint_to_fp CPU64Regs:$src)),
- (CVT_S_L (DMTC1 CPU64Regs:$src))>;
- def : Pat<(f64 (sint_to_fp CPU64Regs:$src)),
- (CVT_D64_L (DMTC1 CPU64Regs:$src))>;
-
- def : Pat<(i32 (fp_to_sint FGR64:$src)), (MFC1 (TRUNC_W_D64 FGR64:$src))>;
- def : Pat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>;
- def : Pat<(i64 (fp_to_sint FGR64:$src)), (DMFC1 (TRUNC_L_D64 FGR64:$src))>;
-
- def : Pat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>;
- def : Pat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>;
-}
-
-// Patterns for unaligned floating point loads and stores.
-let Predicates = [HasMips32r2Or64, NotN64] in {
- def : Pat<(f32 (load_u CPURegs:$addr)), (LUXC1 CPURegs:$addr, ZERO)>;
- def : Pat<(store_u FGR32:$src, CPURegs:$addr),
- (SUXC1 FGR32:$src, CPURegs:$addr, ZERO)>;
-}
-
-let Predicates = [IsN64] in {
- def : Pat<(f32 (load_u CPU64Regs:$addr)), (LUXC1_P8 CPU64Regs:$addr, ZERO_64)>;
- def : Pat<(store_u FGR32:$src, CPU64Regs:$addr),
- (SUXC1_P8 FGR32:$src, CPU64Regs:$addr, ZERO_64)>;
+def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
+
+def : MipsPat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>;
+def : MipsPat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>;
+
+let Predicates = [NotFP64bit, HasStandardEncoding] in {
+ def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
+ (CVT_D32_W (MTC1 CPURegs:$src))>;
+ def : MipsPat<(i32 (fp_to_sint AFGR64:$src)),
+ (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
+ def : MipsPat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>;
+ def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>;
+}
+
+let Predicates = [IsFP64bit, HasStandardEncoding] in {
+ def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>;
+ def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>;
+
+ def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
+ (CVT_D64_W (MTC1 CPURegs:$src))>;
+ def : MipsPat<(f32 (sint_to_fp CPU64Regs:$src)),
+ (CVT_S_L (DMTC1 CPU64Regs:$src))>;
+ def : MipsPat<(f64 (sint_to_fp CPU64Regs:$src)),
+ (CVT_D64_L (DMTC1 CPU64Regs:$src))>;
+
+ def : MipsPat<(i32 (fp_to_sint FGR64:$src)),
+ (MFC1 (TRUNC_W_D64 FGR64:$src))>;
+ def : MipsPat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>;
+ def : MipsPat<(i64 (fp_to_sint FGR64:$src)),
+ (DMFC1 (TRUNC_L_D64 FGR64:$src))>;
+
+ def : MipsPat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>;
+ def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>;
}
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 841eba0..8feb853 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -72,20 +72,33 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
field bits<32> SoftFail = 0;
}
+// Mips32/64 Instruction Format
+class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format f>:
+ MipsInst<outs, ins, asmstr, pattern, itin, f> {
+ let Predicates = [HasStandardEncoding];
+}
+
// Mips Pseudo Instructions Format
class MipsPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
- MipsInst<outs, ins, asmstr, pattern, IIPseudo, Pseudo> {
+ MipsInst<outs, ins, asmstr, pattern, IIPseudo, Pseudo> {
let isCodeGenOnly = 1;
let isPseudo = 1;
}
+// Mips32/64 Pseudo Instruction Format
+class PseudoSE<dag outs, dag ins, string asmstr, list<dag> pattern>:
+ MipsPseudo<outs, ins, asmstr, pattern> {
+ let Predicates = [HasStandardEncoding];
+}
+
//===----------------------------------------------------------------------===//
// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
//===----------------------------------------------------------------------===//
class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
list<dag> pattern, InstrItinClass itin>:
- MipsInst<outs, ins, asmstr, pattern, itin, FrmR>
+ InstSE<outs, ins, asmstr, pattern, itin, FrmR>
{
bits<5> rd;
bits<5> rs;
@@ -108,7 +121,7 @@ class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
//===----------------------------------------------------------------------===//
class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin, FrmI>
+ InstrItinClass itin>: InstSE<outs, ins, asmstr, pattern, itin, FrmI>
{
bits<5> rt;
bits<5> rs;
@@ -123,7 +136,7 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
list<dag> pattern, InstrItinClass itin>:
- MipsInst<outs, ins, asmstr, pattern, itin, FrmI>
+ InstSE<outs, ins, asmstr, pattern, itin, FrmI>
{
bits<5> rs;
bits<5> rt;
@@ -141,7 +154,7 @@ class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
//===----------------------------------------------------------------------===//
class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin, FrmJ>
+ InstrItinClass itin>: InstSE<outs, ins, asmstr, pattern, itin, FrmJ>
{
bits<26> addr;
@@ -169,7 +182,7 @@ class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins,
string asmstr, list<dag> pattern> :
- MipsInst<outs, ins, asmstr, pattern, NoItinerary, FrmFR>
+ InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFR>
{
bits<5> fd;
bits<5> fs;
@@ -193,7 +206,7 @@ class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins,
//===----------------------------------------------------------------------===//
class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
- MipsInst<outs, ins, asmstr, pattern, NoItinerary, FrmFI>
+ InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFI>
{
bits<5> ft;
bits<5> base;
@@ -211,7 +224,7 @@ class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
//===----------------------------------------------------------------------===//
class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> :
- MipsInst<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
+ InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
{
bits<5> fs;
bits<5> ft;
@@ -232,7 +245,7 @@ class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> :
class FCMOV<bits<1> _tf, dag outs, dag ins, string asmstr,
list<dag> pattern> :
- MipsInst<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
+ InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
{
bits<5> rd;
bits<5> rs;
@@ -253,7 +266,7 @@ class FCMOV<bits<1> _tf, dag outs, dag ins, string asmstr,
class FFCMOV<bits<5> _fmt, bits<1> _tf, dag outs, dag ins, string asmstr,
list<dag> pattern> :
- MipsInst<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
+ InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
{
bits<5> fd;
bits<5> fs;
@@ -300,7 +313,7 @@ class FFR2P<bits<6> funct, bits<5> fmt, string opstr,
// Floating point madd/msub/nmadd/nmsub.
class FFMADDSUB<bits<3> funct, bits<3> fmt, dag outs, dag ins, string asmstr,
list<dag> pattern>
- : MipsInst<outs, ins, asmstr, pattern, NoItinerary, FrmOther> {
+ : InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther> {
bits<5> fd;
bits<5> fr;
bits<5> fs;
@@ -318,7 +331,7 @@ class FFMADDSUB<bits<3> funct, bits<3> fmt, dag outs, dag ins, string asmstr,
// FP indexed load/store instructions.
class FFMemIdx<bits<6> funct, dag outs, dag ins, string asmstr,
list<dag> pattern> :
- MipsInst<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
+ InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther>
{
bits<5> base;
bits<5> index;
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index a3a18bf..50e3eb5 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MipsAnalyzeImmediate.h"
#include "MipsInstrInfo.h"
#include "MipsTargetMachine.h"
#include "MipsMachineFunction.h"
@@ -26,67 +27,19 @@
using namespace llvm;
-MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm)
+MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm, unsigned UncondBr)
: MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
- TM(tm), IsN64(TM.getSubtarget<MipsSubtarget>().isABI_N64()),
- RI(*TM.getSubtargetImpl(), *this),
- UncondBrOpc(TM.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J) {}
+ TM(tm), UncondBrOpc(UncondBr) {}
-const MipsRegisterInfo &MipsInstrInfo::getRegisterInfo() const {
- return RI;
-}
-
-static bool isZeroImm(const MachineOperand &op) {
- return op.isImm() && op.getImm() == 0;
-}
-
-/// isLoadFromStackSlot - If the specified machine instruction is a direct
-/// load from a stack slot, return the virtual or physical register number of
-/// the destination along with the FrameIndex of the loaded stack slot. If
-/// not, return 0. This predicate must return 0 if the instruction has
-/// any side effects other than loading from the stack slot.
-unsigned MipsInstrInfo::
-isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
-{
- unsigned Opc = MI->getOpcode();
-
- if ((Opc == Mips::LW) || (Opc == Mips::LW_P8) || (Opc == Mips::LD) ||
- (Opc == Mips::LD_P8) || (Opc == Mips::LWC1) || (Opc == Mips::LWC1_P8) ||
- (Opc == Mips::LDC1) || (Opc == Mips::LDC164) ||
- (Opc == Mips::LDC164_P8)) {
- if ((MI->getOperand(1).isFI()) && // is a stack slot
- (MI->getOperand(2).isImm()) && // the imm is zero
- (isZeroImm(MI->getOperand(2)))) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- }
+const MipsInstrInfo *MipsInstrInfo::create(MipsTargetMachine &TM) {
+ if (TM.getSubtargetImpl()->inMips16Mode())
+ return llvm::createMips16InstrInfo(TM);
- return 0;
+ return llvm::createMipsSEInstrInfo(TM);
}
-/// isStoreToStackSlot - If the specified machine instruction is a direct
-/// store to a stack slot, return the virtual or physical register number of
-/// the source reg along with the FrameIndex of the loaded stack slot. If
-/// not, return 0. This predicate must return 0 if the instruction has
-/// any side effects other than storing to the stack slot.
-unsigned MipsInstrInfo::
-isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
-{
- unsigned Opc = MI->getOpcode();
-
- if ((Opc == Mips::SW) || (Opc == Mips::SW_P8) || (Opc == Mips::SD) ||
- (Opc == Mips::SD_P8) || (Opc == Mips::SWC1) || (Opc == Mips::SWC1_P8) ||
- (Opc == Mips::SDC1) || (Opc == Mips::SDC164) ||
- (Opc == Mips::SDC164_P8)) {
- if ((MI->getOperand(1).isFI()) && // is a stack slot
- (MI->getOperand(2).isImm()) && // the imm is zero
- (isZeroImm(MI->getOperand(2)))) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- }
- return 0;
+bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const {
+ return op.isImm() && op.getImm() == 0;
}
 /// insertNoop - If a data hazard condition is found, insert the target nop
@@ -98,78 +51,8 @@ insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
BuildMI(MBB, MI, DL, get(Mips::NOP));
}
-void MipsInstrInfo::
-copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
- unsigned Opc = 0, ZeroReg = 0;
-
- if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
- if (Mips::CPURegsRegClass.contains(SrcReg))
- Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
- else if (Mips::CCRRegClass.contains(SrcReg))
- Opc = Mips::CFC1;
- else if (Mips::FGR32RegClass.contains(SrcReg))
- Opc = Mips::MFC1;
- else if (SrcReg == Mips::HI)
- Opc = Mips::MFHI, SrcReg = 0;
- else if (SrcReg == Mips::LO)
- Opc = Mips::MFLO, SrcReg = 0;
- }
- else if (Mips::CPURegsRegClass.contains(SrcReg)) { // Copy from CPU Reg.
- if (Mips::CCRRegClass.contains(DestReg))
- Opc = Mips::CTC1;
- else if (Mips::FGR32RegClass.contains(DestReg))
- Opc = Mips::MTC1;
- else if (DestReg == Mips::HI)
- Opc = Mips::MTHI, DestReg = 0;
- else if (DestReg == Mips::LO)
- Opc = Mips::MTLO, DestReg = 0;
- }
- else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
- Opc = Mips::FMOV_S;
- else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
- Opc = Mips::FMOV_D32;
- else if (Mips::FGR64RegClass.contains(DestReg, SrcReg))
- Opc = Mips::FMOV_D64;
- else if (Mips::CCRRegClass.contains(DestReg, SrcReg))
- Opc = Mips::MOVCCRToCCR;
- else if (Mips::CPU64RegsRegClass.contains(DestReg)) { // Copy to CPU64 Reg.
- if (Mips::CPU64RegsRegClass.contains(SrcReg))
- Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
- else if (SrcReg == Mips::HI64)
- Opc = Mips::MFHI64, SrcReg = 0;
- else if (SrcReg == Mips::LO64)
- Opc = Mips::MFLO64, SrcReg = 0;
- else if (Mips::FGR64RegClass.contains(SrcReg))
- Opc = Mips::DMFC1;
- }
- else if (Mips::CPU64RegsRegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
- if (DestReg == Mips::HI64)
- Opc = Mips::MTHI64, DestReg = 0;
- else if (DestReg == Mips::LO64)
- Opc = Mips::MTLO64, DestReg = 0;
- else if (Mips::FGR64RegClass.contains(DestReg))
- Opc = Mips::DMTC1;
- }
-
- assert(Opc && "Cannot copy registers");
-
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
-
- if (DestReg)
- MIB.addReg(DestReg, RegState::Define);
-
- if (ZeroReg)
- MIB.addReg(ZeroReg);
-
- if (SrcReg)
- MIB.addReg(SrcReg, getKillRegState(KillSrc));
-}
-
-static MachineMemOperand* GetMemOperand(MachineBasicBlock &MBB, int FI,
- unsigned Flag) {
+MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
+ unsigned Flag) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
@@ -178,60 +61,6 @@ static MachineMemOperand* GetMemOperand(MachineBasicBlock &MBB, int FI,
MFI.getObjectSize(FI), Align);
}
-void MipsInstrInfo::
-storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- DebugLoc DL;
- if (I != MBB.end()) DL = I->getDebugLoc();
- MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
-
- unsigned Opc = 0;
-
- if (RC == Mips::CPURegsRegisterClass)
- Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
- else if (RC == Mips::CPU64RegsRegisterClass)
- Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
- else if (RC == Mips::FGR32RegisterClass)
- Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
- else if (RC == Mips::AFGR64RegisterClass)
- Opc = Mips::SDC1;
- else if (RC == Mips::FGR64RegisterClass)
- Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
-
- assert(Opc && "Register class not handled!");
- BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
-}
-
-void MipsInstrInfo::
-loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const
-{
- DebugLoc DL;
- if (I != MBB.end()) DL = I->getDebugLoc();
- MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
- unsigned Opc = 0;
-
- if (RC == Mips::CPURegsRegisterClass)
- Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
- else if (RC == Mips::CPU64RegsRegisterClass)
- Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
- else if (RC == Mips::FGR32RegisterClass)
- Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
- else if (RC == Mips::AFGR64RegisterClass)
- Opc = Mips::LDC1;
- else if (RC == Mips::FGR64RegisterClass)
- Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
-
- assert(Opc && "Register class not handled!");
- BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(0)
- .addMemOperand(MMO);
-}
-
MachineInstr*
MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
uint64_t Offset, const MDNode *MDPtr,
@@ -245,42 +74,9 @@ MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
// Branch Analysis
//===----------------------------------------------------------------------===//
-static unsigned GetAnalyzableBrOpc(unsigned Opc) {
- return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ ||
- Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
- Opc == Mips::BEQ64 || Opc == Mips::BNE64 || Opc == Mips::BGTZ64 ||
- Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
- Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::B ||
- Opc == Mips::J) ?
- Opc : 0;
-}
-
-/// GetOppositeBranchOpc - Return the inverse of the specified
-/// opcode, e.g. turning BEQ to BNE.
-unsigned Mips::GetOppositeBranchOpc(unsigned Opc)
-{
- switch (Opc) {
- default: llvm_unreachable("Illegal opcode!");
- case Mips::BEQ: return Mips::BNE;
- case Mips::BNE: return Mips::BEQ;
- case Mips::BGTZ: return Mips::BLEZ;
- case Mips::BGEZ: return Mips::BLTZ;
- case Mips::BLTZ: return Mips::BGEZ;
- case Mips::BLEZ: return Mips::BGTZ;
- case Mips::BEQ64: return Mips::BNE64;
- case Mips::BNE64: return Mips::BEQ64;
- case Mips::BGTZ64: return Mips::BLEZ64;
- case Mips::BGEZ64: return Mips::BLTZ64;
- case Mips::BLTZ64: return Mips::BGEZ64;
- case Mips::BLEZ64: return Mips::BGTZ64;
- case Mips::BC1T: return Mips::BC1F;
- case Mips::BC1F: return Mips::BC1T;
- }
-}
-
-static void AnalyzeCondBr(const MachineInstr* Inst, unsigned Opc,
- MachineBasicBlock *&BB,
- SmallVectorImpl<MachineOperand>& Cond) {
+void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+ MachineBasicBlock *&BB,
+ SmallVectorImpl<MachineOperand> &Cond) const {
assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch");
int NumOp = Inst->getNumExplicitOperands();
@@ -450,7 +246,62 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
{
assert( (Cond.size() && Cond.size() <= 3) &&
"Invalid Mips branch condition!");
- Cond[0].setImm(Mips::GetOppositeBranchOpc(Cond[0].getImm()));
+ Cond[0].setImm(GetOppositeBranchOpc(Cond[0].getImm()));
return false;
}
+/// Return the number of bytes of code the specified instruction may be.
+unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ return MI->getDesc().getSize();
+ case TargetOpcode::INLINEASM: { // Inline Asm: Variable size.
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const char *AsmStr = MI->getOperand(0).getSymbolName();
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ }
+ }
+}
+
+unsigned
+llvm::Mips::loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
+ MachineBasicBlock& MBB,
+ MachineBasicBlock::iterator II, DebugLoc DL,
+ bool LastInstrIsADDiu,
+ MipsAnalyzeImmediate::Inst *LastInst) {
+ MipsAnalyzeImmediate AnalyzeImm;
+ unsigned Size = IsN64 ? 64 : 32;
+ unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi;
+ unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO;
+ unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT;
+
+ const MipsAnalyzeImmediate::InstSeq &Seq =
+ AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+ MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+ if (LastInst && (Seq.size() == 1)) {
+ *LastInst = *Inst;
+ return 0;
+ }
+
+ // The first instruction can be a LUi, which is different from other
+ // instructions (ADDiu, ORI and SLL) in that it does not have a register
+ // operand.
+ if (Inst->Opc == LUi)
+ BuildMI(MBB, II, DL, TII.get(LUi), ATReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+ else
+ BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ // Build the remaining instructions in Seq. Skip the last instruction if
+ // LastInst is not 0.
+ for (++Inst; Inst != Seq.end() - !!LastInst; ++Inst)
+ BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ if (LastInst)
+ *LastInst = *Inst;
+
+ return Seq.size() - !!LastInst;
+}
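
For reference, the common 32-bit case analyzed by loadImmediate above is the classic lui/ori pair. A minimal standalone C++ sketch of how a constant splits into the two 16-bit immediate fields (the printed mnemonics are illustrative; MipsAnalyzeImmediate also produces ADDiu/SLL sequences not modeled here):

#include <cstdint>
#include <cstdio>

// A 32-bit constant with two nonzero halves becomes
// "lui $at, hi16" followed by "ori $at, $at, lo16".
int main() {
  uint32_t Imm = 0x12345678;
  uint16_t Hi = Imm >> 16;     // immediate field of LUi
  uint16_t Lo = Imm & 0xffff;  // immediate field of ORi
  std::printf("lui  $at, 0x%04x\n", Hi);
  std::printf("ori  $at, $at, 0x%04x\n", Lo);
  return 0;
}
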
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 4be727d..7d56259 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -15,6 +15,7 @@
#define MIPSINSTRUCTIONINFO_H
#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
#include "MipsRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -24,87 +25,85 @@
namespace llvm {
-namespace Mips {
- /// GetOppositeBranchOpc - Return the inverse of the specified
- /// opcode, e.g. turning BEQ to BNE.
- unsigned GetOppositeBranchOpc(unsigned Opc);
-}
-
class MipsInstrInfo : public MipsGenInstrInfo {
+protected:
MipsTargetMachine &TM;
- bool IsN64;
- const MipsRegisterInfo RI;
unsigned UncondBrOpc;
+
public:
- explicit MipsInstrInfo(MipsTargetMachine &TM);
+ explicit MipsInstrInfo(MipsTargetMachine &TM, unsigned UncondBrOpc);
- /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
- /// such, whenever a client has an instance of instruction info, it should
- /// always be able to get register info as well (through this method).
- ///
- virtual const MipsRegisterInfo &getRegisterInfo() const;
-
- /// isLoadFromStackSlot - If the specified machine instruction is a direct
- /// load from a stack slot, return the virtual or physical register number of
- /// the destination along with the FrameIndex of the loaded stack slot. If
- /// not, return 0. This predicate must return 0 if the instruction has
- /// any side effects other than loading from the stack slot.
- virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
-
- /// isStoreToStackSlot - If the specified machine instruction is a direct
- /// store to a stack slot, return the virtual or physical register number of
- /// the source reg along with the FrameIndex of the loaded stack slot. If
- /// not, return 0. This predicate must return 0 if the instruction has
- /// any side effects other than storing to the stack slot.
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
+ static const MipsInstrInfo *create(MipsTargetMachine &TM);
/// Branch Analysis
virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const;
- virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-private:
- void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL,
- const SmallVectorImpl<MachineOperand>& Cond) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-public:
virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond,
DebugLoc DL) const;
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
- virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
-
- virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
virtual MachineInstr* emitFrameIndexDebugValue(MachineFunction &MF,
int FrameIx, uint64_t Offset,
const MDNode *MDPtr,
DebugLoc DL) const;
- virtual
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-
/// Insert nop instruction when hazard condition is found
virtual void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const;
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
+
+ virtual unsigned GetOppositeBranchOpc(unsigned Opc) const = 0;
+
+ /// Return the number of bytes of code the specified instruction may be.
+ unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+protected:
+ bool isZeroImm(const MachineOperand &op) const;
+
+ MachineMemOperand *GetMemOperand(MachineBasicBlock &MBB, int FI,
+ unsigned Flag) const;
+
+private:
+ virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const = 0;
+
+ void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+ MachineBasicBlock *&BB,
+ SmallVectorImpl<MachineOperand> &Cond) const;
+
+ void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL,
+ const SmallVectorImpl<MachineOperand>& Cond) const;
};
+namespace Mips {
+ /// Emit a series of instructions to load an immediate. If LastInst is not
+ /// null, all instructions except the last one are emitted, and the last
+ /// instruction's opcode-immediate pair is returned in LastInst instead.
+ /// The function returns the number of MachineInstrs emitted.
+ unsigned
+ loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
+ MachineBasicBlock& MBB, MachineBasicBlock::iterator II,
+ DebugLoc DL, bool LastInstrIsADDiu,
+ MipsAnalyzeImmediate::Inst *LastInst);
+}
+
+/// Create MipsInstrInfo objects.
+const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM);
+const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM);
+
}
#endif
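
The create() factory declared above is how callers obtain the right concrete subclass; the imported sources decide between the Mips16 and SE variants from the subtarget's Mips16 mode (see the InMips16Mode predicate later in this patch). A dependency-free sketch of that dispatch shape (the class names and flag here are illustrative, not LLVM's):

#include <memory>

// One abstract instruction-info base, two concrete variants, and a
// factory that picks between them from a mode flag.
struct InstrInfoBase { virtual ~InstrInfoBase() = default; };
struct SEInstrInfo  : InstrInfoBase {};
struct M16InstrInfo : InstrInfoBase {};

std::unique_ptr<InstrInfoBase> create(bool InMips16Mode) {
  return InMips16Mode ? std::unique_ptr<InstrInfoBase>(new M16InstrInfo())
                      : std::unique_ptr<InstrInfoBase>(new SEInstrInfo());
}
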
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 873d2bd..da15d4d 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -11,17 +11,11 @@
//
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Instruction format superclass
-//===----------------------------------------------------------------------===//
-
-include "MipsInstrFormats.td"
//===----------------------------------------------------------------------===//
// Mips profiles and nodes
//===----------------------------------------------------------------------===//
-def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
@@ -49,6 +43,10 @@ def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
SDTCisVT<2, i32>, SDTCisSameAs<2, 3>,
SDTCisSameAs<0, 4>]>;
+def SDTMipsLoadLR : SDTypeProfile<1, 2,
+ [SDTCisInt<0>, SDTCisPtrTy<1>,
+ SDTCisSameAs<0, 2>]>;
+
// Call
def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
@@ -72,8 +70,7 @@ def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
// Return
-def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain,
- SDNPOptInGlue]>;
+def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
@@ -118,6 +115,23 @@ def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>;
def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>;
def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>;
+def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLWR : SDNode<"MipsISD::LWR", SDTMipsLoadLR,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSWL : SDNode<"MipsISD::SWL", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSWR : SDNode<"MipsISD::SWR", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsLDL : SDNode<"MipsISD::LDL", SDTMipsLoadLR,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLDR : SDNode<"MipsISD::LDR", SDTMipsLoadLR,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSDL : SDNode<"MipsISD::SDL", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
//===----------------------------------------------------------------------===//
// Mips Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
@@ -145,12 +159,26 @@ def IsN64 : Predicate<"Subtarget.isABI_N64()">,
AssemblerPredicate<"FeatureN64">;
def NotN64 : Predicate<"!Subtarget.isABI_N64()">,
AssemblerPredicate<"!FeatureN64">;
+def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">,
+ AssemblerPredicate<"FeatureMips16">;
def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">,
AssemblerPredicate<"FeatureMips32">;
def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">,
AssemblerPredicate<"FeatureMips32">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">,
AssemblerPredicate<"FeatureMips32">;
+def HasStandardEncoding : Predicate<"Subtarget.hasStandardEncoding()">,
+ AssemblerPredicate<"!FeatureMips16">;
+
+class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
+ let Predicates = [HasStandardEncoding];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
//===----------------------------------------------------------------------===//
// Mips Operand, Complex Patterns and Transformations Definitions.
@@ -190,6 +218,7 @@ def mem : Operand<i32> {
def mem64 : Operand<i64> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops CPU64Regs, simm16_64);
+ let EncoderMethod = "getMemEncoding";
}
def mem_ea : Operand<i32> {
@@ -252,7 +281,8 @@ def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
// Mips Address Mode! SDNode frameindex could possibly be a match
// since load and store instructions that access the stack use it.
-def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>;
+def addr :
+ ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>;
//===----------------------------------------------------------------------===//
// Pattern fragment for load/store
@@ -418,21 +448,13 @@ class StoreM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC,
let isPseudo = Pseudo;
}
-// Unaligned Memory Load/Store
-let canFoldAsLoad = 1 in
-class LoadUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>:
- FMem<op, (outs RC:$rt), (ins MemOpnd:$addr), "", [], IILoad> {}
-
-class StoreUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>:
- FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), "", [], IIStore> {}
-
// 32-bit load.
multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode,
bit Pseudo = 0> {
def #NAME# : LoadM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>,
- Requires<[NotN64]>;
+ Requires<[NotN64, HasStandardEncoding]>;
def _P8 : LoadM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>,
- Requires<[IsN64]> {
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
let isCodeGenOnly = 1;
}
@@ -442,31 +464,21 @@ multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode,
multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode,
bit Pseudo = 0> {
def #NAME# : LoadM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>,
- Requires<[NotN64]>;
+ Requires<[NotN64, HasStandardEncoding]>;
def _P8 : LoadM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>,
- Requires<[IsN64]> {
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
let isCodeGenOnly = 1;
}
}
-// 32-bit load.
-multiclass LoadUnAlign32<bits<6> op> {
- def #NAME# : LoadUnAlign<op, CPURegs, mem>,
- Requires<[NotN64]>;
- def _P8 : LoadUnAlign<op, CPURegs, mem64>,
- Requires<[IsN64]> {
- let DecoderNamespace = "Mips64";
- let isCodeGenOnly = 1;
- }
-}
// 32-bit store.
multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode,
bit Pseudo = 0> {
def #NAME# : StoreM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>,
- Requires<[NotN64]>;
+ Requires<[NotN64, HasStandardEncoding]>;
def _P8 : StoreM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>,
- Requires<[IsN64]> {
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
let isCodeGenOnly = 1;
}
@@ -476,20 +488,69 @@ multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode,
multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode,
bit Pseudo = 0> {
def #NAME# : StoreM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>,
- Requires<[NotN64]>;
+ Requires<[NotN64, HasStandardEncoding]>;
def _P8 : StoreM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>,
- Requires<[IsN64]> {
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
let isCodeGenOnly = 1;
}
}
-// 32-bit store.
-multiclass StoreUnAlign32<bits<6> op> {
- def #NAME# : StoreUnAlign<op, CPURegs, mem>,
- Requires<[NotN64]>;
- def _P8 : StoreUnAlign<op, CPURegs, mem64>,
- Requires<[IsN64]> {
+// Load/Store Left/Right
+let canFoldAsLoad = 1 in
+class LoadLeftRight<bits<6> op, string instr_asm, SDNode OpNode,
+ RegisterClass RC, Operand MemOpnd> :
+ FMem<op, (outs RC:$rt), (ins MemOpnd:$addr, RC:$src),
+ !strconcat(instr_asm, "\t$rt, $addr"),
+ [(set RC:$rt, (OpNode addr:$addr, RC:$src))], IILoad> {
+ string Constraints = "$src = $rt";
+}
+
+class StoreLeftRight<bits<6> op, string instr_asm, SDNode OpNode,
+ RegisterClass RC, Operand MemOpnd>:
+ FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr),
+ !strconcat(instr_asm, "\t$rt, $addr"), [(OpNode RC:$rt, addr:$addr)],
+ IIStore>;
+
+// 32-bit load left/right.
+multiclass LoadLeftRightM32<bits<6> op, string instr_asm, SDNode OpNode> {
+ def #NAME# : LoadLeftRight<op, instr_asm, OpNode, CPURegs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+ def _P8 : LoadLeftRight<op, instr_asm, OpNode, CPURegs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
+ let DecoderNamespace = "Mips64";
+ let isCodeGenOnly = 1;
+ }
+}
+
+// 64-bit load left/right.
+multiclass LoadLeftRightM64<bits<6> op, string instr_asm, SDNode OpNode> {
+ def #NAME# : LoadLeftRight<op, instr_asm, OpNode, CPU64Regs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+ def _P8 : LoadLeftRight<op, instr_asm, OpNode, CPU64Regs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
+ let DecoderNamespace = "Mips64";
+ let isCodeGenOnly = 1;
+ }
+}
+
+// 32-bit store left/right.
+multiclass StoreLeftRightM32<bits<6> op, string instr_asm, SDNode OpNode> {
+ def #NAME# : StoreLeftRight<op, instr_asm, OpNode, CPURegs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+ def _P8 : StoreLeftRight<op, instr_asm, OpNode, CPURegs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
+ let DecoderNamespace = "Mips64";
+ let isCodeGenOnly = 1;
+ }
+}
+
+// 64-bit store left/right.
+multiclass StoreLeftRightM64<bits<6> op, string instr_asm, SDNode OpNode> {
+ def #NAME# : StoreLeftRight<op, instr_asm, OpNode, CPU64Regs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+ def _P8 : StoreLeftRight<op, instr_asm, OpNode, CPU64Regs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
let isCodeGenOnly = 1;
}
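
The LoadLeftRight/StoreLeftRight classes above describe MIPS's unaligned-access instruction pairs: on a big-endian target, lwl writes the most-significant bytes of $rt and lwr the least-significant ones, so together they load a word from a possibly unaligned address. A sketch of the pair's net effect (each real instruction only updates part of $rt; this models only the combined result):

#include <cstdint>

// Net effect of "lwl $rt, 0(addr)" + "lwr $rt, 3(addr)" on big-endian
// MIPS: a full 32-bit load from an address with no alignment requirement.
uint32_t unalignedLoadBE(const uint8_t *P) {
  return (uint32_t(P[0]) << 24) | (uint32_t(P[1]) << 16) |
         (uint32_t(P[2]) << 8)  |  uint32_t(P[3]);
}
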
@@ -503,6 +564,7 @@ class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>:
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
+ let Defs = [AT];
}
class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op,
@@ -514,6 +576,7 @@ class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op,
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
+ let Defs = [AT];
}
// SetCC
@@ -541,8 +604,9 @@ class JumpFJ<bits<6> op, string instr_asm>:
let isTerminator=1;
let isBarrier=1;
let hasDelaySlot = 1;
- let Predicates = [RelocStatic];
+ let Predicates = [RelocStatic, HasStandardEncoding];
let DecoderMethod = "DecodeJumpTarget";
+ let Defs = [AT];
}
// Unconditional branch
@@ -555,23 +619,37 @@ class UncondBranch<bits<6> op, string instr_asm>:
let isTerminator = 1;
let isBarrier = 1;
let hasDelaySlot = 1;
- let Predicates = [RelocPIC];
+ let Predicates = [RelocPIC, HasStandardEncoding];
+ let Defs = [AT];
}
-let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1,
- isIndirectBranch = 1 in
-class JumpFR<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>:
- FR<op, func, (outs), (ins RC:$rs),
- !strconcat(instr_asm, "\t$rs"), [(brind RC:$rs)], IIBranch> {
+// Base class for indirect branch and return instruction classes.
+let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
+class JumpFR<RegisterClass RC, list<dag> pattern>:
+ FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", pattern, IIBranch> {
let rt = 0;
let rd = 0;
let shamt = 0;
}
+// Indirect branch
+class IndirectBranch<RegisterClass RC>: JumpFR<RC, [(brind RC:$rs)]> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+}
+
+// Return instruction
+class RetBase<RegisterClass RC>: JumpFR<RC, []> {
+ let isReturn = 1;
+ let isCodeGenOnly = 1;
+ let hasCtrlDep = 1;
+ let hasExtraSrcRegAllocReq = 1;
+}
+
// Jump and Link (Call)
-let isCall=1, hasDelaySlot=1 in {
+let isCall=1, hasDelaySlot=1, Defs = [RA] in {
class JumpLink<bits<6> op, string instr_asm>:
- FJ<op, (outs), (ins calltarget:$target, variable_ops),
+ FJ<op, (outs), (ins calltarget:$target),
!strconcat(instr_asm, "\t$target"), [(MipsJmpLink imm:$target)],
IIBranch> {
let DecoderMethod = "DecodeJumpTarget";
@@ -579,7 +657,7 @@ let isCall=1, hasDelaySlot=1 in {
class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm,
RegisterClass RC>:
- FR<op, func, (outs), (ins RC:$rs, variable_ops),
+ FR<op, func, (outs), (ins RC:$rs),
!strconcat(instr_asm, "\t$rs"), [(MipsJmpLink RC:$rs)], IIBranch> {
let rt = 0;
let rd = 31;
@@ -587,7 +665,7 @@ let isCall=1, hasDelaySlot=1 in {
}
class BranchLink<string instr_asm, bits<5> _rt, RegisterClass RC>:
- FI<0x1, (outs), (ins RC:$rs, brtarget:$imm16, variable_ops),
+ FI<0x1, (outs), (ins RC:$rs, brtarget:$imm16),
!strconcat(instr_asm, "\t$rs, $imm16"), [], IIBranch> {
let rt = _rt;
}
@@ -644,16 +722,18 @@ class MoveToLOHI<bits<6> func, string instr_asm, RegisterClass RC,
let neverHasSideEffects = 1;
}
-class EffectiveAddress<string instr_asm, RegisterClass RC, Operand Mem> :
- FMem<0x09, (outs RC:$rt), (ins Mem:$addr),
- instr_asm, [(set RC:$rt, addr:$addr)], IIAlu>;
+class EffectiveAddress<bits<6> opc, string instr_asm, RegisterClass RC, Operand Mem> :
+ FMem<opc, (outs RC:$rt), (ins Mem:$addr),
+ instr_asm, [(set RC:$rt, addr:$addr)], IIAlu> {
+ let isCodeGenOnly = 1;
+}
// Count Leading Ones/Zeros in Word
class CountLeading0<bits<6> func, string instr_asm, RegisterClass RC>:
FR<0x1c, func, (outs RC:$rd), (ins RC:$rs),
!strconcat(instr_asm, "\t$rd, $rs"),
[(set RC:$rd, (ctlz RC:$rs))], IIAlu>,
- Requires<[HasBitCount]> {
+ Requires<[HasBitCount, HasStandardEncoding]> {
let shamt = 0;
let rt = rd;
}
@@ -662,7 +742,7 @@ class CountLeading1<bits<6> func, string instr_asm, RegisterClass RC>:
FR<0x1c, func, (outs RC:$rd), (ins RC:$rs),
!strconcat(instr_asm, "\t$rd, $rs"),
[(set RC:$rd, (ctlz (not RC:$rs)))], IIAlu>,
- Requires<[HasBitCount]> {
+ Requires<[HasBitCount, HasStandardEncoding]> {
let shamt = 0;
let rt = rd;
}
@@ -675,7 +755,7 @@ class SignExtInReg<bits<5> sa, string instr_asm, ValueType vt,
[(set RC:$rd, (sext_inreg RC:$rt, vt))], NoItinerary> {
let rs = 0;
let shamt = sa;
- let Predicates = [HasSEInReg];
+ let Predicates = [HasSEInReg, HasStandardEncoding];
}
// Subword Swap
@@ -684,7 +764,7 @@ class SubwordSwap<bits<6> func, bits<5> sa, string instr_asm, RegisterClass RC>:
!strconcat(instr_asm, "\t$rd, $rt"), [], NoItinerary> {
let rs = 0;
let shamt = sa;
- let Predicates = [HasSwap];
+ let Predicates = [HasSwap, HasStandardEncoding];
let neverHasSideEffects = 1;
}
@@ -705,7 +785,7 @@ class ExtBase<bits<6> _funct, string instr_asm, RegisterClass RC>:
bits<5> sz;
let rd = sz;
let shamt = pos;
- let Predicates = [HasMips32r2];
+ let Predicates = [HasMips32r2, HasStandardEncoding];
}
class InsBase<bits<6> _funct, string instr_asm, RegisterClass RC>:
@@ -718,20 +798,22 @@ class InsBase<bits<6> _funct, string instr_asm, RegisterClass RC>:
bits<5> sz;
let rd = sz;
let shamt = pos;
- let Predicates = [HasMips32r2];
+ let Predicates = [HasMips32r2, HasStandardEncoding];
let Constraints = "$src = $rt";
}
// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
class Atomic2Ops<PatFrag Op, string Opstr, RegisterClass DRC,
RegisterClass PRC> :
- MipsPseudo<(outs DRC:$dst), (ins PRC:$ptr, DRC:$incr),
- !strconcat("atomic_", Opstr, "\t$dst, $ptr, $incr"),
- [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>;
+ PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$incr),
+ !strconcat("atomic_", Opstr, "\t$dst, $ptr, $incr"),
+ [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>;
multiclass Atomic2Ops32<PatFrag Op, string Opstr> {
- def #NAME# : Atomic2Ops<Op, Opstr, CPURegs, CPURegs>, Requires<[NotN64]>;
- def _P8 : Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>, Requires<[IsN64]> {
+ def #NAME# : Atomic2Ops<Op, Opstr, CPURegs, CPURegs>,
+ Requires<[NotN64, HasStandardEncoding]>;
+ def _P8 : Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>,
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
}
@@ -739,13 +821,15 @@ multiclass Atomic2Ops32<PatFrag Op, string Opstr> {
// Atomic Compare & Swap.
class AtomicCmpSwap<PatFrag Op, string Width, RegisterClass DRC,
RegisterClass PRC> :
- MipsPseudo<(outs DRC:$dst), (ins PRC:$ptr, DRC:$cmp, DRC:$swap),
- !strconcat("atomic_cmp_swap_", Width, "\t$dst, $ptr, $cmp, $swap"),
- [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>;
+ PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$cmp, DRC:$swap),
+ !strconcat("atomic_cmp_swap_", Width, "\t$dst, $ptr, $cmp, $swap"),
+ [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>;
multiclass AtomicCmpSwap32<PatFrag Op, string Width> {
- def #NAME# : AtomicCmpSwap<Op, Width, CPURegs, CPURegs>, Requires<[NotN64]>;
- def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>, Requires<[IsN64]> {
+ def #NAME# : AtomicCmpSwap<Op, Width, CPURegs, CPURegs>,
+ Requires<[NotN64, HasStandardEncoding]>;
+ def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>,
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
}
@@ -767,12 +851,15 @@ class SCBase<bits<6> Opc, string opstring, RegisterClass RC, Operand Mem> :
// Pseudo instructions
//===----------------------------------------------------------------------===//
-// As stack alignment is always done with addiu, we need a 16-bit immediate
-let Defs = [SP], Uses = [SP] in {
-def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins uimm16:$amt),
+// Return RA.
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in
+def RetRA : PseudoSE<(outs), (ins), "", [(MipsRet)]>;
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
"!ADJCALLSTACKDOWN $amt",
[(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP : MipsPseudo<(outs), (ins uimm16:$amt1, uimm16:$amt2),
+def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"!ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -782,31 +869,8 @@ def ADJCALLSTACKUP : MipsPseudo<(outs), (ins uimm16:$amt1, uimm16:$amt2),
// are used, we have the same behavior, but also get a bunch of warnings
// from the assembler.
let neverHasSideEffects = 1 in
-def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc, CPURegs:$gp),
- ".cprestore\t$loc", []>;
-
-// For O32 ABI & PIC & non-fixed global base register, the following instruction
-// seqeunce is emitted to set the global base register:
-//
-// 0. lui $2, %hi(_gp_disp)
-// 1. addiu $2, $2, %lo(_gp_disp)
-// 2. addu $globalbasereg, $2, $t9
-//
-// SETGP01 is emitted during Prologue/Epilogue insertion and then converted to
-// instructions 0 and 1 in the sequence above during MC lowering.
-// SETGP2 is emitted just before register allocation and converted to
-// instruction 2 just prior to post-RA scheduling.
-//
-// These pseudo instructions are needed to ensure no instructions are inserted
-// before or between instructions 0 and 1, which is a limitation imposed by
-// GNU linker.
-
-let isTerminator = 1, isBarrier = 1 in
-def SETGP01 : MipsPseudo<(outs CPURegs:$dst), (ins), "", []>;
-
-let neverHasSideEffects = 1 in
-def SETGP2 : MipsPseudo<(outs CPURegs:$globalreg), (ins CPURegs:$picreg), "",
- []>;
+def CPRESTORE : PseudoSE<(outs), (ins i32imm:$loc, CPURegs:$gp),
+ ".cprestore\t$loc", []>;
let usesCustomInserter = 1 in {
defm ATOMIC_LOAD_ADD_I8 : Atomic2Ops32<atomic_load_add_8, "load_add_8">;
@@ -876,7 +940,7 @@ def SRLV : shift_rotate_reg<0x06, 0x00, "srlv", srl, CPURegs>;
def SRAV : shift_rotate_reg<0x07, 0x00, "srav", sra, CPURegs>;
// Rotate Instructions
-let Predicates = [HasMips32r2] in {
+let Predicates = [HasMips32r2, HasStandardEncoding] in {
def ROTR : shift_rotate_imm32<0x02, 0x01, "rotr", rotr>;
def ROTRV : shift_rotate_reg<0x06, 0x01, "rotrv", rotr, CPURegs>;
}
@@ -899,15 +963,15 @@ defm ULW : LoadM32<0x23, "ulw", load_u, 1>;
defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>;
defm USW : StoreM32<0x2b, "usw", store_u, 1>;
-/// Primitives for unaligned
-defm LWL : LoadUnAlign32<0x22>;
-defm LWR : LoadUnAlign32<0x26>;
-defm SWL : StoreUnAlign32<0x2A>;
-defm SWR : StoreUnAlign32<0x2E>;
+/// load/store left/right
+defm LWL : LoadLeftRightM32<0x22, "lwl", MipsLWL>;
+defm LWR : LoadLeftRightM32<0x26, "lwr", MipsLWR>;
+defm SWL : StoreLeftRightM32<0x2a, "swl", MipsSWL>;
+defm SWR : StoreLeftRightM32<0x2e, "swr", MipsSWR>;
let hasSideEffects = 1 in
-def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype",
- [(MipsSync imm:$stype)], NoItinerary, FrmOther>
+def SYNC : InstSE<(outs), (ins i32imm:$stype), "sync $stype",
+ [(MipsSync imm:$stype)], NoItinerary, FrmOther>
{
bits<5> stype;
let Opcode = 0;
@@ -917,19 +981,23 @@ def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype",
}
/// Load-linked, Store-conditional
-def LL : LLBase<0x30, "ll", CPURegs, mem>, Requires<[NotN64]>;
-def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>, Requires<[IsN64]> {
+def LL : LLBase<0x30, "ll", CPURegs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
-def SC : SCBase<0x38, "sc", CPURegs, mem>, Requires<[NotN64]>;
-def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, Requires<[IsN64]> {
+def SC : SCBase<0x38, "sc", CPURegs, mem>,
+ Requires<[NotN64, HasStandardEncoding]>;
+def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>,
+ Requires<[IsN64, HasStandardEncoding]> {
let DecoderNamespace = "Mips64";
}
/// Jump and Branch Instructions
def J : JumpFJ<0x02, "j">;
-def JR : JumpFR<0x00, 0x08, "jr", CPURegs>;
+def JR : IndirectBranch<CPURegs>;
def B : UncondBranch<0x04, "b">;
def BEQ : CBranch<0x04, "beq", seteq, CPURegs>;
def BNE : CBranch<0x05, "bne", setne, CPURegs>;
@@ -938,15 +1006,16 @@ def BGTZ : CBranchZero<0x07, 0, "bgtz", setgt, CPURegs>;
def BLEZ : CBranchZero<0x06, 0, "blez", setle, CPURegs>;
def BLTZ : CBranchZero<0x01, 0, "bltz", setlt, CPURegs>;
+let rt = 0, rs = 0, isBranch = 1, isTerminator = 1, isBarrier = 1,
+ hasDelaySlot = 1, Defs = [RA] in
+def BAL_BR: FI<0x1, (outs), (ins brtarget:$imm16), "bal\t$imm16", [], IIBranch>;
+
def JAL : JumpLink<0x03, "jal">;
def JALR : JumpLinkReg<0x00, 0x09, "jalr", CPURegs>;
def BGEZAL : BranchLink<"bgezal", 0x11, CPURegs>;
def BLTZAL : BranchLink<"bltzal", 0x10, CPURegs>;
-let isReturn=1, isTerminator=1, hasDelaySlot=1, isCodeGenOnly=1,
- isBarrier=1, hasCtrlDep=1, rd=0, rt=0, shamt=0 in
- def RET : FR <0x00, 0x08, (outs), (ins CPURegs:$target),
- "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>;
+def RET : RetBase<CPURegs>;
/// Multiply and Divide Instructions.
def MULT : Mult32<0x18, "mult", IIImul>;
@@ -978,17 +1047,13 @@ let addr=0 in
// instructions. The same does not happen for stack address copies, so an
// add op with mem ComplexPattern is used and the stack address copy
// can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : EffectiveAddress<"addiu\t$rt, $addr", CPURegs, mem_ea> {
- let isCodeGenOnly = 1;
-}
+def LEA_ADDiu : EffectiveAddress<0x09,"addiu\t$rt, $addr", CPURegs, mem_ea>;
// DynAlloc node points to dynamically allocated stack space.
// $sp is added to the list of implicitly used registers to prevent dead code
// elimination from removing instructions that modify $sp.
let Uses = [SP] in
-def DynAlloc : EffectiveAddress<"addiu\t$rt, $addr", CPURegs, mem_ea> {
- let isCodeGenOnly = 1;
-}
+def DynAlloc : EffectiveAddress<0x09,"addiu\t$rt, $addr", CPURegs, mem_ea>;
// MADD*/MSUB*
def MADD : MArithR<0, "madd", MipsMAdd, 1>;
@@ -999,7 +1064,7 @@ def MSUBU : MArithR<5, "msubu", MipsMSubu>;
// MUL is an assembly macro in the currently used ISAs. In recent ISAs
// it is a real instruction.
def MUL : ArithLogicR<0x1c, 0x02, "mul", mul, IIImul, CPURegs, 1>,
- Requires<[HasMips32]>;
+ Requires<[HasMips32, HasStandardEncoding]>;
def RDHWR : ReadHardware<CPURegs, HWRegs>;
@@ -1011,67 +1076,67 @@ def INS : InsBase<4, "ins", CPURegs>;
//===----------------------------------------------------------------------===//
// Small immediates
-def : Pat<(i32 immSExt16:$in),
- (ADDiu ZERO, imm:$in)>;
-def : Pat<(i32 immZExt16:$in),
- (ORi ZERO, imm:$in)>;
-def : Pat<(i32 immLow16Zero:$in),
- (LUi (HI16 imm:$in))>;
+def : MipsPat<(i32 immSExt16:$in),
+ (ADDiu ZERO, imm:$in)>;
+def : MipsPat<(i32 immZExt16:$in),
+ (ORi ZERO, imm:$in)>;
+def : MipsPat<(i32 immLow16Zero:$in),
+ (LUi (HI16 imm:$in))>;
// Arbitrary immediates
-def : Pat<(i32 imm:$imm),
+def : MipsPat<(i32 imm:$imm),
(ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
-// Carry patterns
-def : Pat<(subc CPURegs:$lhs, CPURegs:$rhs),
- (SUBu CPURegs:$lhs, CPURegs:$rhs)>;
-def : Pat<(addc CPURegs:$lhs, CPURegs:$rhs),
- (ADDu CPURegs:$lhs, CPURegs:$rhs)>;
-def : Pat<(addc CPURegs:$src, immSExt16:$imm),
- (ADDiu CPURegs:$src, imm:$imm)>;
+// Carry patterns
+def : MipsPat<(subc CPURegs:$lhs, CPURegs:$rhs),
+ (SUBu CPURegs:$lhs, CPURegs:$rhs)>;
+def : MipsPat<(addc CPURegs:$lhs, CPURegs:$rhs),
+ (ADDu CPURegs:$lhs, CPURegs:$rhs)>;
+def : MipsPat<(addc CPURegs:$src, immSExt16:$imm),
+ (ADDiu CPURegs:$src, imm:$imm)>;
// Call
-def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
- (JAL tglobaladdr:$dst)>;
-def : Pat<(MipsJmpLink (i32 texternalsym:$dst)),
- (JAL texternalsym:$dst)>;
-//def : Pat<(MipsJmpLink CPURegs:$dst),
-// (JALR CPURegs:$dst)>;
+def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+ (JAL tglobaladdr:$dst)>;
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+ (JAL texternalsym:$dst)>;
+//def : MipsPat<(MipsJmpLink CPURegs:$dst),
+// (JALR CPURegs:$dst)>;
// hi/lo relocs
-def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
-def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
-def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
-def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
-def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
-
-def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
-def : Pat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
-def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
-def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
-def : Pat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
-
-def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
- (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
-def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)),
- (ADDiu CPURegs:$hi, tblockaddress:$lo)>;
-def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
- (ADDiu CPURegs:$hi, tjumptable:$lo)>;
-def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
- (ADDiu CPURegs:$hi, tconstpool:$lo)>;
-def : Pat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
+
+def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
+ (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)),
+ (ADDiu CPURegs:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
+ (ADDiu CPURegs:$hi, tjumptable:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
+ (ADDiu CPURegs:$hi, tconstpool:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
// gp_rel relocs
-def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)),
- (ADDiu CPURegs:$gp, tglobaladdr:$in)>;
-def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)),
- (ADDiu CPURegs:$gp, tconstpool:$in)>;
+def : MipsPat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)),
+ (ADDiu CPURegs:$gp, tglobaladdr:$in)>;
+def : MipsPat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)),
+ (ADDiu CPURegs:$gp, tconstpool:$in)>;
// wrapper_pic
class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
- Pat<(MipsWrapper RC:$gp, node:$in),
- (ADDiuOp RC:$gp, node:$in)>;
+ MipsPat<(MipsWrapper RC:$gp, node:$in),
+ (ADDiuOp RC:$gp, node:$in)>;
def : WrapperPat<tglobaladdr, ADDiu, CPURegs>;
def : WrapperPat<tconstpool, ADDiu, CPURegs>;
@@ -1081,58 +1146,58 @@ def : WrapperPat<tjumptable, ADDiu, CPURegs>;
def : WrapperPat<tglobaltlsaddr, ADDiu, CPURegs>;
// Mips does not have "not", so we expand our way
-def : Pat<(not CPURegs:$in),
- (NOR CPURegs:$in, ZERO)>;
+def : MipsPat<(not CPURegs:$in),
+ (NOR CPURegs:$in, ZERO)>;
// extended loads
-let Predicates = [NotN64] in {
- def : Pat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
- def : Pat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
- def : Pat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>;
- def : Pat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>;
+let Predicates = [NotN64, HasStandardEncoding] in {
+ def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
+ def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
+ def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>;
+ def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>;
}
-let Predicates = [IsN64] in {
- def : Pat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>;
- def : Pat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>;
- def : Pat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>;
- def : Pat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>;
+let Predicates = [IsN64, HasStandardEncoding] in {
+ def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>;
+ def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>;
+ def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>;
+ def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>;
}
// peepholes
-let Predicates = [NotN64] in {
- def : Pat<(store_a (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
- def : Pat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>;
+let Predicates = [NotN64, HasStandardEncoding] in {
+ def : MipsPat<(store_a (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+ def : MipsPat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>;
}
-let Predicates = [IsN64] in {
- def : Pat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
- def : Pat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>;
+let Predicates = [IsN64, HasStandardEncoding] in {
+ def : MipsPat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
+ def : MipsPat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>;
}
// brcond patterns
multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp,
Instruction SLTiuOp, Register ZEROReg> {
-def : Pat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
- (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
-def : Pat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
- (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
+ (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : MipsPat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
+ (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
-def : Pat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
- (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
- (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
- (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
- (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+ (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
- (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
- (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
+ (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
-def : Pat<(brcond RC:$cond, bb:$dst),
- (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
+def : MipsPat<(brcond RC:$cond, bb:$dst),
+ (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
}
defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
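
BrcondPats above synthesizes branches for conditions MIPS has no direct opcode for: the setge rows, for example, lower "branch if a >= b" to a set-on-less-than followed by a branch-on-zero. A standalone sketch of that two-instruction shape (register names in the comments are illustrative):

#include <cstdint>

// "brcond (setge a, b)" per the pattern above:
//   slt $1, $a, $b      ; $1 = (a < b)
//   beq $1, $zero, dst  ; a >= b exactly when $1 == 0
bool takeSetGEBranch(int32_t A, int32_t B) {
  int32_t T = (A < B) ? 1 : 0;  // SLT
  return T == 0;                // BEQ against $zero
}
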
@@ -1140,39 +1205,39 @@ defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
// setcc patterns
multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
Instruction SLTuOp, Register ZEROReg> {
- def : Pat<(seteq RC:$lhs, RC:$rhs),
- (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
- def : Pat<(setne RC:$lhs, RC:$rhs),
- (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
+ def : MipsPat<(seteq RC:$lhs, RC:$rhs),
+ (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
+ def : MipsPat<(setne RC:$lhs, RC:$rhs),
+ (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
}
multiclass SetlePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
- def : Pat<(setle RC:$lhs, RC:$rhs),
- (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>;
- def : Pat<(setule RC:$lhs, RC:$rhs),
- (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>;
+ def : MipsPat<(setle RC:$lhs, RC:$rhs),
+ (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>;
+ def : MipsPat<(setule RC:$lhs, RC:$rhs),
+ (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>;
}
multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
- def : Pat<(setgt RC:$lhs, RC:$rhs),
- (SLTOp RC:$rhs, RC:$lhs)>;
- def : Pat<(setugt RC:$lhs, RC:$rhs),
- (SLTuOp RC:$rhs, RC:$lhs)>;
+ def : MipsPat<(setgt RC:$lhs, RC:$rhs),
+ (SLTOp RC:$rhs, RC:$lhs)>;
+ def : MipsPat<(setugt RC:$lhs, RC:$rhs),
+ (SLTuOp RC:$rhs, RC:$lhs)>;
}
multiclass SetgePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
- def : Pat<(setge RC:$lhs, RC:$rhs),
- (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>;
- def : Pat<(setuge RC:$lhs, RC:$rhs),
- (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>;
+ def : MipsPat<(setge RC:$lhs, RC:$rhs),
+ (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>;
+ def : MipsPat<(setuge RC:$lhs, RC:$rhs),
+ (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>;
}
multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp,
Instruction SLTiuOp> {
- def : Pat<(setge RC:$lhs, immSExt16:$rhs),
- (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
- def : Pat<(setuge RC:$lhs, immSExt16:$rhs),
- (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
+ def : MipsPat<(setge RC:$lhs, immSExt16:$rhs),
+ (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
+ def : MipsPat<(setuge RC:$lhs, immSExt16:$rhs),
+ (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
}
defm : SeteqPats<CPURegs, SLTiu, XOR, SLTu, ZERO>;
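
SeteqPats and friends do the same for boolean results: equality, which has no setcc instruction, becomes xor followed by sltiu against 1, since the xor is zero exactly when the operands match. A sketch:

#include <cstdint>

// "seteq a, b" per the pattern above: "xor $1, $a, $b; sltiu $1, $1, 1".
// XOR yields 0 iff a == b, and an unsigned compare against 1 tests for 0.
uint32_t setEq(uint32_t A, uint32_t B) {
  uint32_t T = A ^ B;      // XOR
  return T < 1 ? 1 : 0;    // SLTiu with immediate 1
}
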
@@ -1182,10 +1247,10 @@ defm : SetgePats<CPURegs, SLT, SLTu>;
defm : SetgeImmPats<CPURegs, SLTi, SLTiu>;
// select MipsDynAlloc
-def : Pat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>;
+def : MipsPat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>;
// bswap pattern
-def : Pat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>;
+def : MipsPat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>;
//===----------------------------------------------------------------------===//
// Floating Point Support
@@ -1195,3 +1260,8 @@ include "MipsInstrFPU.td"
include "Mips64InstrInfo.td"
include "MipsCondMov.td"
+//
+// Mips16
+
+include "Mips16InstrFormats.td"
+include "Mips16InstrInfo.td"
diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp
index 76ca3e1..052046a 100644
--- a/lib/Target/Mips/MipsJITInfo.cpp
+++ b/lib/Target/Mips/MipsJITInfo.cpp
@@ -27,7 +27,52 @@ using namespace llvm;
void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
- report_fatal_error("MipsJITInfo::replaceMachineCodeForFunction");
+ unsigned NewAddr = (intptr_t)New;
+ unsigned OldAddr = (intptr_t)Old;
+ const unsigned NopInstr = 0x0;
+
+ // If the functions are in the same memory segment, insert PC-region branch.
+ if ((NewAddr & 0xF0000000) == ((OldAddr + 4) & 0xF0000000)) {
+ unsigned *OldInstruction = (unsigned *)Old;
+ *OldInstruction = 0x08000000;
+ unsigned JTargetAddr = NewAddr & 0x0FFFFFFC;
+
+ JTargetAddr >>= 2;
+ *OldInstruction |= JTargetAddr;
+
+ // Insert a NOP.
+ OldInstruction++;
+ *OldInstruction = NopInstr;
+
+ sys::Memory::InvalidateInstructionCache(Old, 2 * 4);
+ } else {
+ // We need to clear hint bits from the instruction, in case it is 'jr ra'.
+ const unsigned HintMask = 0xFFFFF83F, ReturnSequence = 0x03e00008;
+ unsigned* CurrentInstr = (unsigned*)Old;
+ unsigned CurrInstrHintClear = (*CurrentInstr) & HintMask;
+ unsigned* NextInstr = CurrentInstr + 1;
+ unsigned NextInstrHintClear = (*NextInstr) & HintMask;
+
+ // Do absolute jump if there are 2 or more instructions before return from
+ // the old function.
+ if ((CurrInstrHintClear != ReturnSequence) &&
+ (NextInstrHintClear != ReturnSequence)) {
+ const unsigned LuiT0Instr = 0x3c080000, AddiuT0Instr = 0x25080000;
+ const unsigned JrT0Instr = 0x01000008;
+ // lui t0, high 16 bit of the NewAddr
+ (*(CurrentInstr++)) = LuiT0Instr | ((NewAddr & 0xffff0000) >> 16);
+ // addiu t0, t0, low 16 bit of the NewAddr
+ (*(CurrentInstr++)) = AddiuT0Instr | (NewAddr & 0x0000ffff);
+ // jr t0
+ (*(CurrentInstr++)) = JrT0Instr;
+ (*CurrentInstr) = NopInstr;
+
+ sys::Memory::InvalidateInstructionCache(Old, 4 * 4);
+ } else {
+ // Unsupported case
+ report_fatal_error("MipsJITInfo::replaceMachineCodeForFunction");
+ }
+ }
}
/// JITCompilerFunction - This contains the address of the JIT function used to
@@ -154,8 +199,8 @@ TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() {
return Result;
}
-void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn,
- JITCodeEmitter &JCE) {
+void *MipsJITInfo::emitFunctionStub(const Function *F, void *Fn,
+ JITCodeEmitter &JCE) {
JCE.emitAlignment(4);
void *Addr = (void*) (JCE.getCurrentPCValue());
if (!sys::Memory::setRangeWritable(Addr, 16))
@@ -193,7 +238,7 @@ void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn,
/// it must rewrite the code to contain the actual addresses of any
/// referenced global symbols.
void MipsJITInfo::relocate(void *Function, MachineRelocation *MR,
- unsigned NumRelocs, unsigned char* GOTBase) {
+ unsigned NumRelocs, unsigned char *GOTBase) {
for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
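
The PC-region patch built by replaceMachineCodeForFunction above encodes a MIPS "j": opcode 0x02 in the top six bits and bits 27..2 of the word-aligned target in the 26-bit instr_index field, which is why that code masks with 0x0FFFFFFC and shifts right by two. A standalone sketch of the encoding:

#include <cassert>
#include <cstdint>

// Encode "j target" the way the patching code does: 0x08000000 is opcode
// 0x02 << 26, and the target supplies its bits 27..2 as instr_index.
uint32_t encodeJ(uint32_t Target) {
  assert((Target & 3) == 0 && "jump targets are word-aligned");
  return 0x08000000u | ((Target & 0x0FFFFFFCu) >> 2);
}
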
diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h
index f4c4ae8..637a318 100644
--- a/lib/Target/Mips/MipsJITInfo.h
+++ b/lib/Target/Mips/MipsJITInfo.h
@@ -45,8 +45,8 @@ class MipsJITInfo : public TargetJITInfo {
/// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
/// small native function that simply calls the function at the specified
/// address.
- virtual void *emitFunctionStub(const Function* F, void *Fn,
- JITCodeEmitter &JCE);
+ virtual void *emitFunctionStub(const Function *F, void *Fn,
+ JITCodeEmitter &JCE);
/// getLazyResolverFunction - Expose the lazy resolver to the JIT.
virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
@@ -55,7 +55,7 @@ class MipsJITInfo : public TargetJITInfo {
/// it must rewrite the code to contain the actual addresses of any
/// referenced global symbols.
virtual void relocate(void *Function, MachineRelocation *MR,
- unsigned NumRelocs, unsigned char* GOTBase);
+ unsigned NumRelocs, unsigned char *GOTBase);
/// Initialize - Initialize internal stage for the function being JITted.
void Initialize(const MachineFunction &MF, bool isPIC) {
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
new file mode 100644
index 0000000..f78203f
--- /dev/null
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -0,0 +1,419 @@
+//===-- MipsLongBranch.cpp - Emit long branches ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands a branch or jump instruction into a long branch if its
+// offset is too large to fit into its immediate field.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-long-branch"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+static cl::opt<bool> SkipLongBranch(
+ "skip-mips-long-branch",
+ cl::init(false),
+ cl::desc("MIPS: Skip long branch pass."),
+ cl::Hidden);
+
+static cl::opt<bool> ForceLongBranch(
+ "force-mips-long-branch",
+ cl::init(false),
+ cl::desc("MIPS: Expand all branches to long format."),
+ cl::Hidden);
+
+namespace {
+ typedef MachineBasicBlock::iterator Iter;
+ typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+ struct MBBInfo {
+ uint64_t Size;
+ bool HasLongBranch;
+ MachineInstr *Br;
+
+ MBBInfo() : Size(0), HasLongBranch(false), Br(0) {}
+ };
+
+ class MipsLongBranch : public MachineFunctionPass {
+
+ public:
+ static char ID;
+ MipsLongBranch(TargetMachine &tm)
+ : MachineFunctionPass(ID), TM(tm),
+ TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())) {}
+
+ virtual const char *getPassName() const {
+ return "Mips Long Branch";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F);
+
+ private:
+ void splitMBB(MachineBasicBlock *MBB);
+ void initMBBInfo();
+ int64_t computeOffset(const MachineInstr *Br);
+ void replaceBranch(MachineBasicBlock &MBB, Iter Br, DebugLoc DL,
+ MachineBasicBlock *MBBOpnd);
+ void expandToLongBranch(MBBInfo &Info);
+
+ const TargetMachine &TM;
+ const MipsInstrInfo *TII;
+ MachineFunction *MF;
+ SmallVector<MBBInfo, 16> MBBInfos;
+ };
+
+ char MipsLongBranch::ID = 0;
+} // end of anonymous namespace
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
+ return new MipsLongBranch(tm);
+}
+
+/// Iterate over list of Br's operands and search for a MachineBasicBlock
+/// operand.
+static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
+ for (unsigned I = 0, E = Br.getDesc().getNumOperands(); I < E; ++I) {
+ const MachineOperand &MO = Br.getOperand(I);
+
+ if (MO.isMBB())
+ return MO.getMBB();
+ }
+
+ assert(false && "This instruction does not have an MBB operand.");
+ return 0;
+}
+
+// Traverse the list of instructions backwards until a non-debug instruction is
+// found or it reaches E.
+static ReverseIter getNonDebugInstr(ReverseIter B, ReverseIter E) {
+ for (; B != E; ++B)
+ if (!B->isDebugValue())
+ return B;
+
+ return E;
+}
+
+// Split MBB if it has two direct jumps/branches.
+void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
+ ReverseIter End = MBB->rend();
+ ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End);
+
+ // Return if MBB has no branch instructions.
+ if ((LastBr == End) ||
+ (!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch()))
+ return;
+
+ ReverseIter FirstBr = getNonDebugInstr(llvm::next(LastBr), End);
+
+ // MBB has only one branch instruction if FirstBr is not a branch
+ // instruction.
+ if ((FirstBr == End) ||
+ (!FirstBr->isConditionalBranch() && !FirstBr->isUnconditionalBranch()))
+ return;
+
+ assert(!FirstBr->isIndirectBranch() && "Unexpected indirect branch found.");
+
+ // Create a new MBB. Move instructions in MBB to the newly created MBB.
+ MachineBasicBlock *NewMBB =
+ MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+
+ // Insert NewMBB and fix control flow.
+ MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
+ NewMBB->transferSuccessors(MBB);
+ NewMBB->removeSuccessor(Tgt);
+ MBB->addSuccessor(NewMBB);
+ MBB->addSuccessor(Tgt);
+ MF->insert(llvm::next(MachineFunction::iterator(MBB)), NewMBB);
+
+ NewMBB->splice(NewMBB->end(), MBB, (++LastBr).base(), MBB->end());
+}
+
+// Fill MBBInfos.
+void MipsLongBranch::initMBBInfo() {
+ // Split the MBBs if they have two branches. Each basic block should have at
+ // most one branch after this loop is executed.
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;)
+ splitMBB(I++);
+
+ MF->RenumberBlocks();
+ MBBInfos.clear();
+ MBBInfos.resize(MF->size());
+
+ for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+
+ // Compute size of MBB.
+ for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
+ MI != MBB->instr_end(); ++MI)
+ MBBInfos[I].Size += TII->GetInstSizeInBytes(&*MI);
+
+ // Search for MBB's branch instruction.
+ ReverseIter End = MBB->rend();
+ ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End);
+
+ if ((Br != End) && !Br->isIndirectBranch() &&
+ (Br->isConditionalBranch() ||
+ (Br->isUnconditionalBranch() &&
+ TM.getRelocationModel() == Reloc::PIC_)))
+ MBBInfos[I].Br = (++Br).base();
+ }
+}
+
+// Compute offset of branch in number of bytes.
+int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
+ int64_t Offset = 0;
+ int ThisMBB = Br->getParent()->getNumber();
+ int TargetMBB = getTargetMBB(*Br)->getNumber();
+
+ // Compute offset of a forward branch.
+ if (ThisMBB < TargetMBB) {
+ for (int N = ThisMBB + 1; N < TargetMBB; ++N)
+ Offset += MBBInfos[N].Size;
+
+ return Offset + 4;
+ }
+
+ // Compute offset of a backward branch.
+ for (int N = ThisMBB; N >= TargetMBB; --N)
+ Offset += MBBInfos[N].Size;
+
+ return -Offset + 4;
+}
+
+// Replace Br with a branch which has the opposite condition code and a
+// MachineBasicBlock operand MBBOpnd.
+void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
+ DebugLoc DL, MachineBasicBlock *MBBOpnd) {
+ unsigned NewOpc = TII->GetOppositeBranchOpc(Br->getOpcode());
+ const MCInstrDesc &NewDesc = TII->get(NewOpc);
+
+ MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc);
+
+ for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) {
+ MachineOperand &MO = Br->getOperand(I);
+
+ if (!MO.isReg()) {
+ assert(MO.isMBB() && "MBB operand expected.");
+ break;
+ }
+
+ MIB.addReg(MO.getReg());
+ }
+
+ MIB.addMBB(MBBOpnd);
+
+ Br->eraseFromParent();
+}
+
+// Expand branch instructions to long branches.
+void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
+ I.HasLongBranch = true;
+
+ bool IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+ unsigned ABI = TM.getSubtarget<MipsSubtarget>().getTargetABI();
+ bool N64 = ABI == MipsSubtarget::N64;
+
+ MachineBasicBlock::iterator Pos;
+ MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
+ DebugLoc DL = I.Br->getDebugLoc();
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
+ MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
+
+ MF->insert(FallThroughMBB, LongBrMBB);
+ MBB->removeSuccessor(TgtMBB);
+ MBB->addSuccessor(LongBrMBB);
+
+ if (IsPIC) {
+ // $longbr:
+ // addiu $sp, $sp, -regsize * 2
+ // sw $ra, 0($sp)
+ // bal $baltgt
+ // sw $a3, regsize($sp)
+ // $baltgt:
+ // lui $a3, %hi($baltgt)
+ // lui $at, %hi($tgt)
+ // addiu $a3, $a3, %lo($baltgt)
+ // addiu $at, $at, %lo($tgt)
+ // subu $at, $at, $a3
+ // addu $at, $ra, $at
+ //
+ // if n64:
+ // lui $a3, %highest($baltgt)
+ // lui $ra, %highest($tgt)
+ // addiu $a3, $a3, %higher($baltgt)
+ // addiu $ra, $ra, %higher($tgt)
+ // dsll $a3, $a3, 32
+ // dsll $ra, $ra, 32
+ // subu $at, $at, $a3
+ // addu $at, $at, $ra
+ //
+ // lw $ra, 0($sp)
+ // lw $a3, regsize($sp)
+ // jr $at
+ // addiu $sp, $sp, regsize * 2
+ // $fallthrough:
+ //
+ MF->getInfo<MipsFunctionInfo>()->setEmitNOAT();
+ MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(FallThroughMBB, BalTgtMBB);
+ LongBrMBB->addSuccessor(BalTgtMBB);
+ BalTgtMBB->addSuccessor(TgtMBB);
+
+ int RegSize = N64 ? 8 : 4;
+ unsigned AT = N64 ? Mips::AT_64 : Mips::AT;
+ unsigned A3 = N64 ? Mips::A3_64 : Mips::A3;
+ unsigned SP = N64 ? Mips::SP_64 : Mips::SP;
+ unsigned RA = N64 ? Mips::RA_64 : Mips::RA;
+ unsigned Load = N64 ? Mips::LD_P8 : Mips::LW;
+ unsigned Store = N64 ? Mips::SD_P8 : Mips::SW;
+ unsigned LUi = N64 ? Mips::LUi64 : Mips::LUi;
+ unsigned ADDiu = N64 ? Mips::DADDiu : Mips::ADDiu;
+ unsigned ADDu = N64 ? Mips::DADDu : Mips::ADDu;
+ unsigned SUBu = N64 ? Mips::SUBu : Mips::SUBu;
+ unsigned JR = N64 ? Mips::JR64 : Mips::JR;
+
+ Pos = LongBrMBB->begin();
+
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
+ .addImm(-RegSize * 2);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(RA).addReg(SP)
+ .addImm(0);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(A3).addReg(SP)
+ .addImm(RegSize)->setIsInsideBundle();
+
+ Pos = BalTgtMBB->begin();
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
+ .addMBB(BalTgtMBB, MipsII::MO_ABS_HI);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), AT)
+ .addMBB(TgtMBB, MipsII::MO_ABS_HI);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
+ .addMBB(BalTgtMBB, MipsII::MO_ABS_LO);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), AT).addReg(AT)
+ .addMBB(TgtMBB, MipsII::MO_ABS_LO);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(RA).addReg(AT);
+
+ if (N64) {
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
+ .addMBB(BalTgtMBB, MipsII::MO_HIGHEST);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), RA)
+ .addMBB(TgtMBB, MipsII::MO_HIGHEST);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
+ .addMBB(BalTgtMBB, MipsII::MO_HIGHER);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), RA).addReg(RA)
+ .addMBB(TgtMBB, MipsII::MO_HIGHER);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), A3).addReg(A3)
+ .addImm(32);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), RA).addReg(RA)
+ .addImm(32);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(AT).addReg(RA);
+ I.Size += 4 * 8;
+ }
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), RA).addReg(SP).addImm(0);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), A3).addReg(SP).addImm(RegSize);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(JR)).addReg(AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
+ .addImm(RegSize * 2)->setIsInsideBundle();
+ I.Size += 4 * 14;
+ } else {
+ // $longbr:
+ // j $tgt
+ // nop
+ // $fallthrough:
+ //
+ Pos = LongBrMBB->begin();
+ LongBrMBB->addSuccessor(TgtMBB);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::NOP))->setIsInsideBundle();
+ I.Size += 4 * 2;
+ }
+
+ if (I.Br->isUnconditionalBranch()) {
+ // Change branch destination.
+ assert(I.Br->getDesc().getNumOperands() == 1);
+ I.Br->RemoveOperand(0);
+ I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB));
+ } else
+ // Change branch destination and reverse condition.
+ replaceBranch(*MBB, I.Br, DL, FallThroughMBB);
+}
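Putting replaceBranch and the non-PIC arm together, an out-of-range conditional branch (registers and labels invented) comes out as:

    original:                    expanded:
      beq $4, $5, $tgt             bne $4, $5, $fallthrough
                                 $longbr:
                                   j   $tgt
                                   nop
                                 $fallthrough:

The inverted bne skips the long-branch block when the original condition is false, so the original control flow is preserved.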
+
+static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
+ MachineBasicBlock &MBB = F.front();
+ MachineBasicBlock::iterator I = MBB.begin();
+ DebugLoc DL = MBB.findDebugLoc(MBB.begin());
+ BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0)
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+ BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0)
+ .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+ MBB.removeLiveIn(Mips::V0);
+}
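The lui/addiu pair rebuilds the 32-bit _gp_disp value from its relocated halves. A worked example, assuming the conventional MIPS definitions %hi(x) = ((x + 0x8000) >> 16) and %lo(x) = the low 16 bits of x (sign-extended by addiu), with a made-up _gp_disp of 0x18004:

    %hi = (0x18004 + 0x8000) >> 16 = 0x2      # lui   $2, 0x2  -> $2 = 0x20000
    %lo = 0x8004, which sign-extends to -0x7ffc   # addiu $2, $2, -0x7ffc
    result: 0x20000 - 0x7ffc = 0x18004

The 0x8000 rounding term in %hi is exactly what cancels that sign extension.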
+
+bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
+ if ((TM.getRelocationModel() == Reloc::PIC_) &&
+ TM.getSubtarget<MipsSubtarget>().isABI_O32() &&
+ F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
+ emitGPDisp(F, TII);
+
+ if (SkipLongBranch)
+ return true;
+
+ MF = &F;
+ initMBBInfo();
+
+ SmallVector<MBBInfo, 16>::iterator I, E = MBBInfos.end();
+ bool EverMadeChange = false, MadeChange = true;
+
+ while (MadeChange) {
+ MadeChange = false;
+
+ for (I = MBBInfos.begin(); I != E; ++I) {
+ // Skip if this MBB doesn't have a branch or the branch has already been
+ // converted to a long branch.
+ if (!I->Br || I->HasLongBranch)
+ continue;
+
+ if (!ForceLongBranch)
+ // Check whether the offset fits into the 16-bit immediate field of branches.
+ if (isInt<16>(computeOffset(I->Br) / 4))
+ continue;
+
+ expandToLongBranch(*I);
+ ++LongBranches;
+ EverMadeChange = MadeChange = true;
+ }
+ }
+
+ if (EverMadeChange)
+ MF->RenumberBlocks();
+
+ return true;
+}
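A minimal, self-contained sketch of the range test that drives this loop; fitsShortBranch is an invented name mirroring the isInt<16>(Offset / 4) check above:

    #include <cstdint>

    // Branch immediates hold a signed 16-bit count of 4-byte words, so a
    // short branch reaches [-32768 * 4, 32767 * 4] bytes, about +/-128 KiB.
    static bool fitsShortBranch(int64_t OffsetBytes) {
      int64_t Words = OffsetBytes / 4;
      return Words >= -32768 && Words <= 32767;
    }
    // fitsShortBranch(150000) == false, so such a branch would be expanded.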
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index 1597b93..d4c5e6d 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
: AsmPrinter(asmprinter) {}
-void MipsMCInstLower::Initialize(Mangler *M, MCContext* C) {
+void MipsMCInstLower::Initialize(Mangler *M, MCContext *C) {
Mang = M;
Ctx = C;
}
@@ -61,6 +61,8 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case MipsII::MO_GOT_DISP: Kind = MCSymbolRefExpr::VK_Mips_GOT_DISP; break;
case MipsII::MO_GOT_PAGE: Kind = MCSymbolRefExpr::VK_Mips_GOT_PAGE; break;
case MipsII::MO_GOT_OFST: Kind = MCSymbolRefExpr::VK_Mips_GOT_OFST; break;
+ case MipsII::MO_HIGHER: Kind = MCSymbolRefExpr::VK_Mips_HIGHER; break;
+ case MipsII::MO_HIGHEST: Kind = MCSymbolRefExpr::VK_Mips_HIGHEST; break;
}
switch (MOTy) {
@@ -70,14 +72,17 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case MachineOperand::MO_GlobalAddress:
Symbol = Mang->getSymbol(MO.getGlobal());
+ Offset += MO.getOffset();
break;
case MachineOperand::MO_BlockAddress:
Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+ Offset += MO.getOffset();
break;
case MachineOperand::MO_ExternalSymbol:
Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+ Offset += MO.getOffset();
break;
case MachineOperand::MO_JumpTableIndex:
@@ -86,8 +91,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case MachineOperand::MO_ConstantPoolIndex:
Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
- if (MO.getOffset())
- Offset += MO.getOffset();
+ Offset += MO.getOffset();
break;
default:
@@ -103,71 +107,23 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
assert(Offset > 0);
const MCConstantExpr *OffsetExpr = MCConstantExpr::Create(Offset, *Ctx);
- const MCBinaryExpr *AddExpr = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
- return MCOperand::CreateExpr(AddExpr);
+ const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
+ return MCOperand::CreateExpr(Add);
}
-static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand& Opnd0,
- const MCOperand& Opnd1,
- const MCOperand& Opnd2 = MCOperand()) {
+/*
+static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand &Opnd0,
+ const MCOperand &Opnd1,
+ const MCOperand &Opnd2 = MCOperand()) {
Inst.setOpcode(Opc);
Inst.addOperand(Opnd0);
Inst.addOperand(Opnd1);
if (Opnd2.isValid())
Inst.addOperand(Opnd2);
}
+*/
-// Lower ".cpload $reg" to
-// "lui $gp, %hi(_gp_disp)"
-// "addiu $gp, $gp, %lo(_gp_disp)"
-// "addu $gp, $gp, $t9"
-void MipsMCInstLower::LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts) {
- MCOperand GPReg = MCOperand::CreateReg(Mips::GP);
- MCOperand T9Reg = MCOperand::CreateReg(Mips::T9);
- StringRef SymName("_gp_disp");
- const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName);
- const MCSymbolRefExpr *MCSym;
-
- MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx);
- MCOperand SymHi = MCOperand::CreateExpr(MCSym);
- MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx);
- MCOperand SymLo = MCOperand::CreateExpr(MCSym);
-
- MCInsts.resize(3);
-
- CreateMCInst(MCInsts[0], Mips::LUi, GPReg, SymHi);
- CreateMCInst(MCInsts[1], Mips::ADDiu, GPReg, GPReg, SymLo);
- CreateMCInst(MCInsts[2], Mips::ADDu, GPReg, GPReg, T9Reg);
-}
-
-// Lower ".cprestore offset" to "sw $gp, offset($sp)".
-void MipsMCInstLower::LowerCPRESTORE(int64_t Offset,
- SmallVector<MCInst, 4>& MCInsts) {
- assert(isInt<32>(Offset) && (Offset >= 0) &&
- "Imm operand of .cprestore must be a non-negative 32-bit value.");
-
- MCOperand SPReg = MCOperand::CreateReg(Mips::SP), BaseReg = SPReg;
- MCOperand GPReg = MCOperand::CreateReg(Mips::GP);
-
- if (!isInt<16>(Offset)) {
- unsigned Hi = ((Offset + 0x8000) >> 16) & 0xffff;
- Offset &= 0xffff;
- MCOperand ATReg = MCOperand::CreateReg(Mips::AT);
- BaseReg = ATReg;
-
- // lui at,hi
- // addu at,at,sp
- MCInsts.resize(2);
- CreateMCInst(MCInsts[0], Mips::LUi, ATReg, MCOperand::CreateImm(Hi));
- CreateMCInst(MCInsts[1], Mips::ADDu, ATReg, ATReg, SPReg);
- }
-
- MCInst Sw;
- CreateMCInst(Sw, Mips::SW, GPReg, BaseReg, MCOperand::CreateImm(Offset));
- MCInsts.push_back(Sw);
-}
-
-MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO,
+MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
unsigned offset) const {
MachineOperandType MOTy = MO.getType();
@@ -205,139 +161,31 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
}
}
-void MipsMCInstLower::LowerUnalignedLoadStore(const MachineInstr *MI,
- SmallVector<MCInst,
- 4>& MCInsts) {
- unsigned Opc = MI->getOpcode();
- MCInst Instr1, Instr2, Instr3, Move;
-
- bool TwoInstructions = false;
-
- assert(MI->getNumOperands() == 3);
- assert(MI->getOperand(0).isReg());
- assert(MI->getOperand(1).isReg());
-
- MCOperand Target = LowerOperand(MI->getOperand(0));
- MCOperand Base = LowerOperand(MI->getOperand(1));
- MCOperand ATReg = MCOperand::CreateReg(Mips::AT);
- MCOperand ZeroReg = MCOperand::CreateReg(Mips::ZERO);
-
- MachineOperand UnLoweredName = MI->getOperand(2);
- MCOperand Name = LowerOperand(UnLoweredName);
-
- Move.setOpcode(Mips::ADDu);
- Move.addOperand(Target);
- Move.addOperand(ATReg);
- Move.addOperand(ZeroReg);
-
- switch (Opc) {
- case Mips::ULW: {
- // FIXME: only works for little endian right now
- MCOperand AdjName = LowerOperand(UnLoweredName, 3);
- if (Base.getReg() == (Target.getReg())) {
- Instr1.setOpcode(Mips::LWL);
- Instr1.addOperand(ATReg);
- Instr1.addOperand(Base);
- Instr1.addOperand(AdjName);
- Instr2.setOpcode(Mips::LWR);
- Instr2.addOperand(ATReg);
- Instr2.addOperand(Base);
- Instr2.addOperand(Name);
- Instr3 = Move;
- } else {
- TwoInstructions = true;
- Instr1.setOpcode(Mips::LWL);
- Instr1.addOperand(Target);
- Instr1.addOperand(Base);
- Instr1.addOperand(AdjName);
- Instr2.setOpcode(Mips::LWR);
- Instr2.addOperand(Target);
- Instr2.addOperand(Base);
- Instr2.addOperand(Name);
- }
+// If the D<shift> instruction has a shift amount that is greater than 31
+// (checked in the calling routine), lower it to a D<shift>32 instruction.
+void MipsMCInstLower::LowerLargeShift(const MachineInstr *MI,
+ MCInst &Inst,
+ int64_t Shift) {
+ // rd (destination register)
+ Inst.addOperand(LowerOperand(MI->getOperand(0)));
+ // rt (source register)
+ Inst.addOperand(LowerOperand(MI->getOperand(1)));
+ // sa minus 32 (the residual shift amount)
+ Inst.addOperand(MCOperand::CreateImm(Shift));
+
+ switch (MI->getOpcode()) {
+ default:
+ // The calling routine passed an opcode this switch does not handle.
+ llvm_unreachable("Unexpected shift instruction");
break;
- }
- case Mips::ULHu: {
- // FIXME: only works for little endian right now
- MCOperand AdjName = LowerOperand(UnLoweredName, 1);
- Instr1.setOpcode(Mips::LBu);
- Instr1.addOperand(ATReg);
- Instr1.addOperand(Base);
- Instr1.addOperand(AdjName);
- Instr2.setOpcode(Mips::LBu);
- Instr2.addOperand(Target);
- Instr2.addOperand(Base);
- Instr2.addOperand(Name);
- Instr3.setOpcode(Mips::INS);
- Instr3.addOperand(Target);
- Instr3.addOperand(ATReg);
- Instr3.addOperand(MCOperand::CreateImm(0x8));
- Instr3.addOperand(MCOperand::CreateImm(0x18));
+ case Mips::DSLL:
+ Inst.setOpcode(Mips::DSLL32);
break;
- }
-
- case Mips::USW: {
- // FIXME: only works for little endian right now
- assert (Base.getReg() != Target.getReg());
- TwoInstructions = true;
- MCOperand AdjName = LowerOperand(UnLoweredName, 3);
- Instr1.setOpcode(Mips::SWL);
- Instr1.addOperand(Target);
- Instr1.addOperand(Base);
- Instr1.addOperand(AdjName);
- Instr2.setOpcode(Mips::SWR);
- Instr2.addOperand(Target);
- Instr2.addOperand(Base);
- Instr2.addOperand(Name);
+ case Mips::DSRL:
+ Inst.setOpcode(Mips::DSRL32);
break;
- }
- case Mips::USH: {
- MCOperand AdjName = LowerOperand(UnLoweredName, 1);
- Instr1.setOpcode(Mips::SB);
- Instr1.addOperand(Target);
- Instr1.addOperand(Base);
- Instr1.addOperand(Name);
- Instr2.setOpcode(Mips::SRL);
- Instr2.addOperand(ATReg);
- Instr2.addOperand(Target);
- Instr2.addOperand(MCOperand::CreateImm(8));
- Instr3.setOpcode(Mips::SB);
- Instr3.addOperand(ATReg);
- Instr3.addOperand(Base);
- Instr3.addOperand(AdjName);
+ case Mips::DSRA:
+ Inst.setOpcode(Mips::DSRA32);
break;
}
- default:
- // FIXME: need to add others
- llvm_unreachable("unaligned instruction not processed");
- }
-
- MCInsts.push_back(Instr1);
- MCInsts.push_back(Instr2);
- if (!TwoInstructions) MCInsts.push_back(Instr3);
-}
-
-// Convert
-// "setgp01 $reg"
-// to
-// "lui $reg, %hi(_gp_disp)"
-// "addiu $reg, $reg, %lo(_gp_disp)"
-void MipsMCInstLower::LowerSETGP01(const MachineInstr *MI,
- SmallVector<MCInst, 4>& MCInsts) {
- const MachineOperand &MO = MI->getOperand(0);
- assert(MO.isReg());
- MCOperand RegOpnd = MCOperand::CreateReg(MO.getReg());
- StringRef SymName("_gp_disp");
- const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName);
- const MCSymbolRefExpr *MCSym;
-
- MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx);
- MCOperand SymHi = MCOperand::CreateExpr(MCSym);
- MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx);
- MCOperand SymLo = MCOperand::CreateExpr(MCSym);
-
- MCInsts.resize(2);
-
- CreateMCInst(MCInsts[0], Mips::LUi, RegOpnd, SymHi);
- CreateMCInst(MCInsts[1], Mips::ADDiu, RegOpnd, RegOpnd, SymLo);
}
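A worked example of the lowering, assuming the standard MIPS64 encodings: a 64-bit logical shift by 40 cannot fit sa = 40 into the 5-bit field, so the caller passes Shift = 40 - 32 = 8 and this routine selects the "+32" opcode:

    dsll $1, $2, 40    =>    dsll32 $1, $2, 8    # hardware shifts by 8 + 32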
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index c1d007d..0abb996 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -31,13 +31,10 @@ class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
MipsAsmPrinter &AsmPrinter;
public:
MipsMCInstLower(MipsAsmPrinter &asmprinter);
- void Initialize(Mangler *mang, MCContext* C);
+ void Initialize(Mangler *mang, MCContext *C);
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
- void LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts);
- void LowerCPRESTORE(int64_t Offset, SmallVector<MCInst, 4>& MCInsts);
- void LowerUnalignedLoadStore(const MachineInstr *MI,
- SmallVector<MCInst, 4>& MCInsts);
- void LowerSETGP01(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts);
+ void LowerLargeShift(const MachineInstr *MI, MCInst &Inst, int64_t Shift);
+
private:
MCOperand LowerSymbolOperand(const MachineOperand &MO,
MachineOperandType MOTy, unsigned Offset) const;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index b00c62b..362173e 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -22,10 +22,6 @@ static cl::opt<bool>
FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
cl::desc("Always use $gp as the global base register."));
-bool MipsFunctionInfo::globalBaseRegFixed() const {
- return FixGlobalBaseReg;
-}
-
bool MipsFunctionInfo::globalBaseRegSet() const {
return GlobalBaseReg;
}
@@ -37,13 +33,13 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {
const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
- if (FixGlobalBaseReg) // $gp is the global base register.
- return GlobalBaseReg = ST.isABI_N64() ? Mips::GP_64 : Mips::GP;
-
const TargetRegisterClass *RC;
- RC = ST.isABI_N64() ?
- Mips::CPU64RegsRegisterClass : Mips::CPURegsRegisterClass;
-
+ if (ST.inMips16Mode())
+ RC = (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+ else
+ RC = ST.isABI_N64() ?
+ (const TargetRegisterClass*)&Mips::CPU64RegsRegClass :
+ (const TargetRegisterClass*)&Mips::CPURegsRegClass;
return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
}
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 0fde55c..df3c4c0 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -14,8 +14,11 @@
#ifndef MIPS_MACHINE_FUNCTION_INFO_H
#define MIPS_MACHINE_FUNCTION_INFO_H
+#include "MipsSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
#include <utility>
namespace llvm {
@@ -45,8 +48,6 @@ class MipsFunctionInfo : public MachineFunctionInfo {
// OutArgFIRange: Range of indices of all frame objects created during call to
// LowerCall except for the frame object for restoring $gp.
std::pair<int, int> InArgFIRange, OutArgFIRange;
- int GPFI; // Index of the frame object for restoring $gp
- mutable int DynAllocFI; // Frame index of dynamically allocated stack area.
unsigned MaxCallFrameSize;
bool EmitNOAT;
@@ -55,8 +56,7 @@ public:
MipsFunctionInfo(MachineFunction& MF)
: MF(MF), SRetReturnReg(0), GlobalBaseReg(0),
VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)),
- OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), DynAllocFI(0),
- MaxCallFrameSize(0), EmitNOAT(false)
+ OutArgFIRange(std::make_pair(-1, 0)), MaxCallFrameSize(0), EmitNOAT(false)
{}
bool isInArgFI(int FI) const {
@@ -74,25 +74,9 @@ public:
OutArgFIRange.second = LastFI;
}
- int getGPFI() const { return GPFI; }
- void setGPFI(int FI) { GPFI = FI; }
- bool needGPSaveRestore() const { return getGPFI(); }
- bool isGPFI(int FI) const { return GPFI && GPFI == FI; }
-
- // The first call to this function creates a frame object for dynamically
- // allocated stack area.
- int getDynAllocFI() const {
- if (!DynAllocFI)
- DynAllocFI = MF.getFrameInfo()->CreateFixedObject(4, 0, true);
-
- return DynAllocFI;
- }
- bool isDynAllocFI(int FI) const { return DynAllocFI && DynAllocFI == FI; }
-
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
- bool globalBaseRegFixed() const;
bool globalBaseRegSet() const;
unsigned getGlobalBaseReg();
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index f30de44..ae6ae3a 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -16,9 +16,11 @@
#include "MipsRegisterInfo.h"
#include "Mips.h"
#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
#include "MipsSubtarget.h"
#include "MipsMachineFunction.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Type.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -35,7 +37,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/DebugInfo.h"
#define GET_REGINFO_TARGET_DESC
#include "MipsGenRegisterInfo.inc"
@@ -54,8 +55,7 @@ unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
/// Mips Callee Saved Registers
const uint16_t* MipsRegisterInfo::
-getCalleeSavedRegs(const MachineFunction *MF) const
-{
+getCalleeSavedRegs(const MachineFunction *MF) const {
if (Subtarget.isSingleFloat())
return CSR_SingleFloatOnly_SaveList;
else if (!Subtarget.hasMips64())
@@ -64,12 +64,11 @@ getCalleeSavedRegs(const MachineFunction *MF) const
return CSR_N32_SaveList;
assert(Subtarget.isABI_N64());
- return CSR_N64_SaveList;
+ return CSR_N64_SaveList;
}
const uint32_t*
-MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const
-{
+MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
if (Subtarget.isSingleFloat())
return CSR_SingleFloatOnly_RegMask;
else if (!Subtarget.hasMips64())
@@ -78,23 +77,21 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const
return CSR_N32_RegMask;
assert(Subtarget.isABI_N64());
- return CSR_N64_RegMask;
+ return CSR_N64_RegMask;
}
BitVector MipsRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
static const uint16_t ReservedCPURegs[] = {
- Mips::ZERO, Mips::AT, Mips::K0, Mips::K1,
- Mips::SP, Mips::FP, Mips::RA
+ Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, Mips::SP
};
static const uint16_t ReservedCPU64Regs[] = {
- Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64,
- Mips::SP_64, Mips::FP_64, Mips::RA_64
+ Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
};
BitVector Reserved(getNumRegs());
- typedef TargetRegisterClass::iterator RegIter;
+ typedef TargetRegisterClass::const_iterator RegIter;
for (unsigned I = 0; I < array_lengthof(ReservedCPURegs); ++I)
Reserved.set(ReservedCPURegs[I]);
@@ -104,31 +101,36 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(ReservedCPU64Regs[I]);
// Reserve all registers in AFGR64.
- for (RegIter Reg = Mips::AFGR64RegisterClass->begin();
- Reg != Mips::AFGR64RegisterClass->end(); ++Reg)
+ for (RegIter Reg = Mips::AFGR64RegClass.begin(),
+ EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
Reserved.set(*Reg);
- }
- else {
+ } else {
// Reserve all registers in CPU64Regs & FGR64.
- for (RegIter Reg = Mips::CPU64RegsRegisterClass->begin();
- Reg != Mips::CPU64RegsRegisterClass->end(); ++Reg)
+ for (RegIter Reg = Mips::CPU64RegsRegClass.begin(),
+ EReg = Mips::CPU64RegsRegClass.end(); Reg != EReg; ++Reg)
Reserved.set(*Reg);
- for (RegIter Reg = Mips::FGR64RegisterClass->begin();
- Reg != Mips::FGR64RegisterClass->end(); ++Reg)
+ for (RegIter Reg = Mips::FGR64RegClass.begin(),
+ EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
Reserved.set(*Reg);
}
- // If GP is dedicated as a global base register, reserve it.
- if (MF.getInfo<MipsFunctionInfo>()->globalBaseRegFixed()) {
- Reserved.set(Mips::GP);
- Reserved.set(Mips::GP_64);
+ // Reserve FP if this function should have a dedicated frame pointer register.
+ if (MF.getTarget().getFrameLowering()->hasFP(MF)) {
+ Reserved.set(Mips::FP);
+ Reserved.set(Mips::FP_64);
}
// Reserve hardware registers.
Reserved.set(Mips::HWR29);
Reserved.set(Mips::HWR29_64);
+ // Reserve RA if in mips16 mode.
+ if (Subtarget.inMips16Mode()) {
+ Reserved.set(Mips::RA);
+ Reserved.set(Mips::RA_64);
+ }
+
return Reserved;
}
@@ -137,13 +139,9 @@ MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
return true;
}
-// This function eliminate ADJCALLSTACKDOWN,
-// ADJCALLSTACKUP pseudo instructions
-void MipsRegisterInfo::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
- MBB.erase(I);
+bool
+MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ return true;
}
// FrameIndex represents objects inside an abstract stack.
@@ -154,8 +152,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
RegScavenger *RS) const {
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
unsigned i = 0;
while (!MI.getOperand(i).isFI()) {
@@ -175,88 +171,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
<< "spOffset : " << spOffset << "\n"
<< "stackSize : " << stackSize << "\n");
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- int MinCSFI = 0;
- int MaxCSFI = -1;
-
- if (CSI.size()) {
- MinCSFI = CSI[0].getFrameIdx();
- MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
- }
-
- // The following stack frame objects are always referenced relative to $sp:
- // 1. Outgoing arguments.
- // 2. Pointer to dynamically allocated stack space.
- // 3. Locations for callee-saved registers.
- // Everything else is referenced relative to whatever register
- // getFrameRegister() returns.
- unsigned FrameReg;
-
- if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isDynAllocFI(FrameIndex) ||
- (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI))
- FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
- else
- FrameReg = getFrameRegister(MF);
-
- // Calculate final offset.
- // - There is no need to change the offset if the frame object is one of the
- // following: an outgoing argument, pointer to a dynamically allocated
- // stack space or a $gp restore location,
- // - If the frame object is any of the following, its offset must be adjusted
- // by adding the size of the stack:
- // incoming argument, callee-saved register location or local variable.
- int64_t Offset;
-
- if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isGPFI(FrameIndex) ||
- MipsFI->isDynAllocFI(FrameIndex))
- Offset = spOffset;
- else
- Offset = spOffset + (int64_t)stackSize;
-
- Offset += MI.getOperand(i+1).getImm();
-
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
-
- // If MI is not a debug value, make sure Offset fits in the 16-bit immediate
- // field.
- if (!MI.isDebugValue() && !isInt<16>(Offset)) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = II->getDebugLoc();
- MipsAnalyzeImmediate AnalyzeImm;
- unsigned Size = Subtarget.isABI_N64() ? 64 : 32;
- unsigned LUi = Subtarget.isABI_N64() ? Mips::LUi64 : Mips::LUi;
- unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
- unsigned ZEROReg = Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
- unsigned ATReg = Subtarget.isABI_N64() ? Mips::AT_64 : Mips::AT;
- const MipsAnalyzeImmediate::InstSeq &Seq =
- AnalyzeImm.Analyze(Offset, Size, true /* LastInstrIsADDiu */);
- MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
-
- MipsFI->setEmitNOAT();
-
- // The first instruction can be a LUi, which is different from other
- // instructions (ADDiu, ORI and SLL) in that it does not have a register
- // operand.
- if (Inst->Opc == LUi)
- BuildMI(MBB, II, DL, TII.get(LUi), ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
- else
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- // Build the remaining instructions in Seq except for the last one.
- for (++Inst; Inst != Seq.end() - 1; ++Inst)
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg);
-
- FrameReg = ATReg;
- Offset = SignExtend64<16>(Inst->ImmOpnd);
- }
-
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Offset);
+ eliminateFI(MI, i, FrameIndex, stackSize, spOffset);
}
unsigned MipsRegisterInfo::
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 0716d29..9a05e94 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -25,10 +25,12 @@ class MipsSubtarget;
class TargetInstrInfo;
class Type;
-struct MipsRegisterInfo : public MipsGenRegisterInfo {
+class MipsRegisterInfo : public MipsGenRegisterInfo {
+protected:
const MipsSubtarget &Subtarget;
const TargetInstrInfo &TII;
+public:
MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii);
/// getRegisterNumbering - Given the enum value for some register, e.g.
@@ -42,16 +44,14 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
void adjustMipsStackFrame(MachineFunction &MF) const;
/// Code Generation virtual methods...
- const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+ const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
const uint32_t *getCallPreservedMask(CallingConv::ID) const;
BitVector getReservedRegs(const MachineFunction &MF) const;
virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
/// Stack Frame Processing Methods
void eliminateFrameIndex(MachineBasicBlock::iterator II,
@@ -65,6 +65,11 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
/// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
+
+private:
+ virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+ int FrameIndex, uint64_t StackSize,
+ int64_t SPOffset) const = 0;
};
} // end namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index ce399a0..b255e42 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -70,8 +70,8 @@ class HWR<bits<5> num, string n> : MipsReg<n> {
let Namespace = "Mips" in {
// General Purpose Registers
- def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<[0]>;
- def AT : MipsGPRReg< 1, "AT">, DwarfRegNum<[1]>;
+ def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>;
+ def AT : MipsGPRReg< 1, "at">, DwarfRegNum<[1]>;
def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<[2]>;
def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<[3]>;
def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<[4]>;
@@ -98,14 +98,14 @@ let Namespace = "Mips" in {
def T9 : MipsGPRReg< 25, "25">, DwarfRegNum<[25]>;
def K0 : MipsGPRReg< 26, "26">, DwarfRegNum<[26]>;
def K1 : MipsGPRReg< 27, "27">, DwarfRegNum<[27]>;
- def GP : MipsGPRReg< 28, "GP">, DwarfRegNum<[28]>;
- def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<[29]>;
- def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<[30]>;
- def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<[31]>;
+ def GP : MipsGPRReg< 28, "gp">, DwarfRegNum<[28]>;
+ def SP : MipsGPRReg< 29, "sp">, DwarfRegNum<[29]>;
+ def FP : MipsGPRReg< 30, "fp">, DwarfRegNum<[30]>;
+ def RA : MipsGPRReg< 31, "ra">, DwarfRegNum<[31]>;
// General Purpose 64-bit Registers
- def ZERO_64 : Mips64GPRReg< 0, "ZERO", [ZERO]>, DwarfRegNum<[0]>;
- def AT_64 : Mips64GPRReg< 1, "AT", [AT]>, DwarfRegNum<[1]>;
+ def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
+ def AT_64 : Mips64GPRReg< 1, "at", [AT]>, DwarfRegNum<[1]>;
def V0_64 : Mips64GPRReg< 2, "2", [V0]>, DwarfRegNum<[2]>;
def V1_64 : Mips64GPRReg< 3, "3", [V1]>, DwarfRegNum<[3]>;
def A0_64 : Mips64GPRReg< 4, "4", [A0]>, DwarfRegNum<[4]>;
@@ -132,97 +132,97 @@ let Namespace = "Mips" in {
def T9_64 : Mips64GPRReg< 25, "25", [T9]>, DwarfRegNum<[25]>;
def K0_64 : Mips64GPRReg< 26, "26", [K0]>, DwarfRegNum<[26]>;
def K1_64 : Mips64GPRReg< 27, "27", [K1]>, DwarfRegNum<[27]>;
- def GP_64 : Mips64GPRReg< 28, "GP", [GP]>, DwarfRegNum<[28]>;
- def SP_64 : Mips64GPRReg< 29, "SP", [SP]>, DwarfRegNum<[29]>;
- def FP_64 : Mips64GPRReg< 30, "FP", [FP]>, DwarfRegNum<[30]>;
- def RA_64 : Mips64GPRReg< 31, "RA", [RA]>, DwarfRegNum<[31]>;
+ def GP_64 : Mips64GPRReg< 28, "gp", [GP]>, DwarfRegNum<[28]>;
+ def SP_64 : Mips64GPRReg< 29, "sp", [SP]>, DwarfRegNum<[29]>;
+ def FP_64 : Mips64GPRReg< 30, "fp", [FP]>, DwarfRegNum<[30]>;
+ def RA_64 : Mips64GPRReg< 31, "ra", [RA]>, DwarfRegNum<[31]>;
/// Mips Single point precision FPU Registers
- def F0 : FPR< 0, "F0">, DwarfRegNum<[32]>;
- def F1 : FPR< 1, "F1">, DwarfRegNum<[33]>;
- def F2 : FPR< 2, "F2">, DwarfRegNum<[34]>;
- def F3 : FPR< 3, "F3">, DwarfRegNum<[35]>;
- def F4 : FPR< 4, "F4">, DwarfRegNum<[36]>;
- def F5 : FPR< 5, "F5">, DwarfRegNum<[37]>;
- def F6 : FPR< 6, "F6">, DwarfRegNum<[38]>;
- def F7 : FPR< 7, "F7">, DwarfRegNum<[39]>;
- def F8 : FPR< 8, "F8">, DwarfRegNum<[40]>;
- def F9 : FPR< 9, "F9">, DwarfRegNum<[41]>;
- def F10 : FPR<10, "F10">, DwarfRegNum<[42]>;
- def F11 : FPR<11, "F11">, DwarfRegNum<[43]>;
- def F12 : FPR<12, "F12">, DwarfRegNum<[44]>;
- def F13 : FPR<13, "F13">, DwarfRegNum<[45]>;
- def F14 : FPR<14, "F14">, DwarfRegNum<[46]>;
- def F15 : FPR<15, "F15">, DwarfRegNum<[47]>;
- def F16 : FPR<16, "F16">, DwarfRegNum<[48]>;
- def F17 : FPR<17, "F17">, DwarfRegNum<[49]>;
- def F18 : FPR<18, "F18">, DwarfRegNum<[50]>;
- def F19 : FPR<19, "F19">, DwarfRegNum<[51]>;
- def F20 : FPR<20, "F20">, DwarfRegNum<[52]>;
- def F21 : FPR<21, "F21">, DwarfRegNum<[53]>;
- def F22 : FPR<22, "F22">, DwarfRegNum<[54]>;
- def F23 : FPR<23, "F23">, DwarfRegNum<[55]>;
- def F24 : FPR<24, "F24">, DwarfRegNum<[56]>;
- def F25 : FPR<25, "F25">, DwarfRegNum<[57]>;
- def F26 : FPR<26, "F26">, DwarfRegNum<[58]>;
- def F27 : FPR<27, "F27">, DwarfRegNum<[59]>;
- def F28 : FPR<28, "F28">, DwarfRegNum<[60]>;
- def F29 : FPR<29, "F29">, DwarfRegNum<[61]>;
- def F30 : FPR<30, "F30">, DwarfRegNum<[62]>;
- def F31 : FPR<31, "F31">, DwarfRegNum<[63]>;
+ def F0 : FPR< 0, "f0">, DwarfRegNum<[32]>;
+ def F1 : FPR< 1, "f1">, DwarfRegNum<[33]>;
+ def F2 : FPR< 2, "f2">, DwarfRegNum<[34]>;
+ def F3 : FPR< 3, "f3">, DwarfRegNum<[35]>;
+ def F4 : FPR< 4, "f4">, DwarfRegNum<[36]>;
+ def F5 : FPR< 5, "f5">, DwarfRegNum<[37]>;
+ def F6 : FPR< 6, "f6">, DwarfRegNum<[38]>;
+ def F7 : FPR< 7, "f7">, DwarfRegNum<[39]>;
+ def F8 : FPR< 8, "f8">, DwarfRegNum<[40]>;
+ def F9 : FPR< 9, "f9">, DwarfRegNum<[41]>;
+ def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
+ def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
+ def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
+ def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
+ def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
+ def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
+ def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
+ def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
+ def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
+ def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
+ def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
+ def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
+ def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
+ def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
+ def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
+ def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
+ def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
+ def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
+ def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
+ def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
+ def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
+ def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;
/// Mips Double point precision FPU Registers (aliased
/// with the single precision to hold 64 bit values)
- def D0 : AFPR< 0, "F0", [F0, F1]>;
- def D1 : AFPR< 2, "F2", [F2, F3]>;
- def D2 : AFPR< 4, "F4", [F4, F5]>;
- def D3 : AFPR< 6, "F6", [F6, F7]>;
- def D4 : AFPR< 8, "F8", [F8, F9]>;
- def D5 : AFPR<10, "F10", [F10, F11]>;
- def D6 : AFPR<12, "F12", [F12, F13]>;
- def D7 : AFPR<14, "F14", [F14, F15]>;
- def D8 : AFPR<16, "F16", [F16, F17]>;
- def D9 : AFPR<18, "F18", [F18, F19]>;
- def D10 : AFPR<20, "F20", [F20, F21]>;
- def D11 : AFPR<22, "F22", [F22, F23]>;
- def D12 : AFPR<24, "F24", [F24, F25]>;
- def D13 : AFPR<26, "F26", [F26, F27]>;
- def D14 : AFPR<28, "F28", [F28, F29]>;
- def D15 : AFPR<30, "F30", [F30, F31]>;
+ def D0 : AFPR< 0, "f0", [F0, F1]>;
+ def D1 : AFPR< 2, "f2", [F2, F3]>;
+ def D2 : AFPR< 4, "f4", [F4, F5]>;
+ def D3 : AFPR< 6, "f6", [F6, F7]>;
+ def D4 : AFPR< 8, "f8", [F8, F9]>;
+ def D5 : AFPR<10, "f10", [F10, F11]>;
+ def D6 : AFPR<12, "f12", [F12, F13]>;
+ def D7 : AFPR<14, "f14", [F14, F15]>;
+ def D8 : AFPR<16, "f16", [F16, F17]>;
+ def D9 : AFPR<18, "f18", [F18, F19]>;
+ def D10 : AFPR<20, "f20", [F20, F21]>;
+ def D11 : AFPR<22, "f22", [F22, F23]>;
+ def D12 : AFPR<24, "f24", [F24, F25]>;
+ def D13 : AFPR<26, "f26", [F26, F27]>;
+ def D14 : AFPR<28, "f28", [F28, F29]>;
+ def D15 : AFPR<30, "f30", [F30, F31]>;
/// Mips Double point precision FPU Registers in MFP64 mode.
- def D0_64 : AFPR64<0, "F0", [F0]>, DwarfRegNum<[32]>;
- def D1_64 : AFPR64<1, "F1", [F1]>, DwarfRegNum<[33]>;
- def D2_64 : AFPR64<2, "F2", [F2]>, DwarfRegNum<[34]>;
- def D3_64 : AFPR64<3, "F3", [F3]>, DwarfRegNum<[35]>;
- def D4_64 : AFPR64<4, "F4", [F4]>, DwarfRegNum<[36]>;
- def D5_64 : AFPR64<5, "F5", [F5]>, DwarfRegNum<[37]>;
- def D6_64 : AFPR64<6, "F6", [F6]>, DwarfRegNum<[38]>;
- def D7_64 : AFPR64<7, "F7", [F7]>, DwarfRegNum<[39]>;
- def D8_64 : AFPR64<8, "F8", [F8]>, DwarfRegNum<[40]>;
- def D9_64 : AFPR64<9, "F9", [F9]>, DwarfRegNum<[41]>;
- def D10_64 : AFPR64<10, "F10", [F10]>, DwarfRegNum<[42]>;
- def D11_64 : AFPR64<11, "F11", [F11]>, DwarfRegNum<[43]>;
- def D12_64 : AFPR64<12, "F12", [F12]>, DwarfRegNum<[44]>;
- def D13_64 : AFPR64<13, "F13", [F13]>, DwarfRegNum<[45]>;
- def D14_64 : AFPR64<14, "F14", [F14]>, DwarfRegNum<[46]>;
- def D15_64 : AFPR64<15, "F15", [F15]>, DwarfRegNum<[47]>;
- def D16_64 : AFPR64<16, "F16", [F16]>, DwarfRegNum<[48]>;
- def D17_64 : AFPR64<17, "F17", [F17]>, DwarfRegNum<[49]>;
- def D18_64 : AFPR64<18, "F18", [F18]>, DwarfRegNum<[50]>;
- def D19_64 : AFPR64<19, "F19", [F19]>, DwarfRegNum<[51]>;
- def D20_64 : AFPR64<20, "F20", [F20]>, DwarfRegNum<[52]>;
- def D21_64 : AFPR64<21, "F21", [F21]>, DwarfRegNum<[53]>;
- def D22_64 : AFPR64<22, "F22", [F22]>, DwarfRegNum<[54]>;
- def D23_64 : AFPR64<23, "F23", [F23]>, DwarfRegNum<[55]>;
- def D24_64 : AFPR64<24, "F24", [F24]>, DwarfRegNum<[56]>;
- def D25_64 : AFPR64<25, "F25", [F25]>, DwarfRegNum<[57]>;
- def D26_64 : AFPR64<26, "F26", [F26]>, DwarfRegNum<[58]>;
- def D27_64 : AFPR64<27, "F27", [F27]>, DwarfRegNum<[59]>;
- def D28_64 : AFPR64<28, "F28", [F28]>, DwarfRegNum<[60]>;
- def D29_64 : AFPR64<29, "F29", [F29]>, DwarfRegNum<[61]>;
- def D30_64 : AFPR64<30, "F30", [F30]>, DwarfRegNum<[62]>;
- def D31_64 : AFPR64<31, "F31", [F31]>, DwarfRegNum<[63]>;
+ def D0_64 : AFPR64<0, "f0", [F0]>, DwarfRegNum<[32]>;
+ def D1_64 : AFPR64<1, "f1", [F1]>, DwarfRegNum<[33]>;
+ def D2_64 : AFPR64<2, "f2", [F2]>, DwarfRegNum<[34]>;
+ def D3_64 : AFPR64<3, "f3", [F3]>, DwarfRegNum<[35]>;
+ def D4_64 : AFPR64<4, "f4", [F4]>, DwarfRegNum<[36]>;
+ def D5_64 : AFPR64<5, "f5", [F5]>, DwarfRegNum<[37]>;
+ def D6_64 : AFPR64<6, "f6", [F6]>, DwarfRegNum<[38]>;
+ def D7_64 : AFPR64<7, "f7", [F7]>, DwarfRegNum<[39]>;
+ def D8_64 : AFPR64<8, "f8", [F8]>, DwarfRegNum<[40]>;
+ def D9_64 : AFPR64<9, "f9", [F9]>, DwarfRegNum<[41]>;
+ def D10_64 : AFPR64<10, "f10", [F10]>, DwarfRegNum<[42]>;
+ def D11_64 : AFPR64<11, "f11", [F11]>, DwarfRegNum<[43]>;
+ def D12_64 : AFPR64<12, "f12", [F12]>, DwarfRegNum<[44]>;
+ def D13_64 : AFPR64<13, "f13", [F13]>, DwarfRegNum<[45]>;
+ def D14_64 : AFPR64<14, "f14", [F14]>, DwarfRegNum<[46]>;
+ def D15_64 : AFPR64<15, "f15", [F15]>, DwarfRegNum<[47]>;
+ def D16_64 : AFPR64<16, "f16", [F16]>, DwarfRegNum<[48]>;
+ def D17_64 : AFPR64<17, "f17", [F17]>, DwarfRegNum<[49]>;
+ def D18_64 : AFPR64<18, "f18", [F18]>, DwarfRegNum<[50]>;
+ def D19_64 : AFPR64<19, "f19", [F19]>, DwarfRegNum<[51]>;
+ def D20_64 : AFPR64<20, "f20", [F20]>, DwarfRegNum<[52]>;
+ def D21_64 : AFPR64<21, "f21", [F21]>, DwarfRegNum<[53]>;
+ def D22_64 : AFPR64<22, "f22", [F22]>, DwarfRegNum<[54]>;
+ def D23_64 : AFPR64<23, "f23", [F23]>, DwarfRegNum<[55]>;
+ def D24_64 : AFPR64<24, "f24", [F24]>, DwarfRegNum<[56]>;
+ def D25_64 : AFPR64<25, "f25", [F25]>, DwarfRegNum<[57]>;
+ def D26_64 : AFPR64<26, "f26", [F26]>, DwarfRegNum<[58]>;
+ def D27_64 : AFPR64<27, "f27", [F27]>, DwarfRegNum<[59]>;
+ def D28_64 : AFPR64<28, "f28", [F28]>, DwarfRegNum<[60]>;
+ def D29_64 : AFPR64<29, "f29", [F29]>, DwarfRegNum<[61]>;
+ def D30_64 : AFPR64<30, "f30", [F30]>, DwarfRegNum<[62]>;
+ def D31_64 : AFPR64<31, "f31", [F31]>, DwarfRegNum<[63]>;
// Hi/Lo registers
def HI : Register<"hi">, DwarfRegNum<[64]>;
@@ -236,6 +236,9 @@ let Namespace = "Mips" in {
// Status flags register
def FCR31 : Register<"31">;
+ // fcc0 register
+ def FCC0 : Register<"fcc0">;
+
// Hardware register $29
def HWR29 : Register<"29">;
def HWR29_64 : Register<"29">;
@@ -246,26 +249,41 @@ let Namespace = "Mips" in {
//===----------------------------------------------------------------------===//
def CPURegs : RegisterClass<"Mips", [i32], 32, (add
+ // Reserved
+ ZERO, AT,
// Return Values and Arguments
V0, V1, A0, A1, A2, A3,
// Not preserved across procedure calls
- T0, T1, T2, T3, T4, T5, T6, T7, T8, T9,
+ T0, T1, T2, T3, T4, T5, T6, T7,
// Callee save
S0, S1, S2, S3, S4, S5, S6, S7,
+ // Not preserved across procedure calls
+ T8, T9,
// Reserved
- ZERO, AT, K0, K1, GP, SP, FP, RA)>;
+ K0, K1, GP, SP, FP, RA)>;
def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add
+ // Reserved
+ ZERO_64, AT_64,
// Return Values and Arguments
V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
// Not preserved across procedure calls
- T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, T8_64, T9_64,
+ T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
// Callee save
S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64,
+ // Not preserved across procedure calls
+ T8_64, T9_64,
// Reserved
- ZERO_64, AT_64, K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)> {
- let SubRegClasses = [(CPURegs sub_32)];
-}
+ K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>;
+
+def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3,
+ // Callee save
+ S0, S1)>;
+
+def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>;
+
// 64bit fp:
// * FGR64 - 32 64-bit registers
@@ -278,26 +296,24 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
// Return Values and Arguments
- D0, D1, D6, D7,
+ D0, D1,
+ // Not preserved across procedure calls
+ D2, D3, D4, D5,
+ // Return Values and Arguments
+ D6, D7,
// Not preserved across procedure calls
- D2, D3, D4, D5, D8, D9,
+ D8, D9,
// Callee save
- D10, D11, D12, D13, D14, D15)> {
- let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)];
-}
+ D10, D11, D12, D13, D14, D15)>;
-def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)> {
- let SubRegClasses = [(FGR32 sub_32)];
-}
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
// Condition Register for floating point operations
-def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31)>;
+def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31, FCC0)>;
// Hi/Lo Registers
def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>;
-def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)> {
- let SubRegClasses = [(HILO sub_32)];
-}
+def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>;
// Hardware registers
def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>;
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
new file mode 100644
index 0000000..1c59847
--- /dev/null
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -0,0 +1,210 @@
+//===-- MipsSEFrameLowering.cpp - Mips32/64 Frame Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEFrameLowering.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MipsRegisterInfo *RegInfo =
+ static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+ unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+ unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+ unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+
+ // First, compute final stack size.
+ uint64_t StackSize = MFI->getStackSize();
+
+ // No need to allocate space on the stack.
+ if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ MachineLocation DstML, SrcML;
+
+ // Adjust stack.
+ TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
+
+ // emit ".cfi_def_cfa_offset StackSize"
+ MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, dl,
+ TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
+ DstML = MachineLocation(MachineLocation::VirtualFP);
+ SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
+ Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ if (CSI.size()) {
+ // Find the instruction past the last instruction that saves a callee-saved
+ // register to the stack.
+ for (unsigned i = 0; i < CSI.size(); ++i)
+ ++MBBI;
+
+ // Iterate over list of callee-saved registers and emit .cfi_offset
+ // directives.
+ MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, dl,
+ TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
+
+ for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+ E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+
+ // If Reg is a double precision register, emit two cfa_offsets,
+ // one for each of the paired single precision registers.
+ if (Mips::AFGR64RegClass.contains(Reg)) {
+ MachineLocation DstML0(MachineLocation::VirtualFP, Offset);
+ MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4);
+ MachineLocation SrcML0(RegInfo->getSubReg(Reg, Mips::sub_fpeven));
+ MachineLocation SrcML1(RegInfo->getSubReg(Reg, Mips::sub_fpodd));
+
+ if (!STI.isLittle())
+ std::swap(SrcML0, SrcML1);
+
+ Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0));
+ Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1));
+ } else {
+ // Reg is either in CPURegs or FGR32.
+ DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
+ SrcML = MachineLocation(Reg);
+ Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+ }
+ }
+ }
+
+ // If the frame pointer is enabled, set it to point to the stack pointer.
+ if (hasFP(MF)) {
+ // Insert instruction "move $fp, $sp" at this location.
+ BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO);
+
+ // emit ".cfi_def_cfa_register $fp"
+ MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, dl,
+ TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
+ DstML = MachineLocation(FP);
+ SrcML = MachineLocation(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML));
+ }
+}
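Illustrative only, with invented offsets: for a 32-byte O32 frame that saves $ra and keeps a frame pointer, the prologue and its CFI annotations would come out roughly as:

    addiu $sp, $sp, -32
    .cfi_def_cfa_offset 32
    sw    $ra, 28($sp)
    .cfi_offset 31, -4            # $ra lives at CFA - 4
    addu  $fp, $sp, $zero         # printed as "move $fp, $sp"
    .cfi_def_cfa_register $fp

The exact save offsets depend on the callee-saved layout computed elsewhere.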
+
+void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ DebugLoc dl = MBBI->getDebugLoc();
+ unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+ unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+ unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+ unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+
+ // If the frame pointer is enabled, restore the stack pointer from it.
+ if (hasFP(MF)) {
+ // Find the first instruction that restores a callee-saved register.
+ MachineBasicBlock::iterator I = MBBI;
+
+ for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
+ --I;
+
+ // Insert instruction "move $sp, $fp" at this location.
+ BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO);
+ }
+
+ // Get the frame size in bytes from the MachineFrameInfo.
+ uint64_t StackSize = MFI->getStackSize();
+
+ if (!StackSize)
+ return;
+
+ // Adjust stack.
+ TII.adjustStackPtr(SP, StackSize, MBB, MBBI);
+}
+
+bool MipsSEFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *EntryBlock = MF->begin();
+ const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ // Add the callee-saved register as live-in. Do not add if the register is
+ // RA and return address is taken, because it has already been added in
+ // method MipsTargetLowering::LowerRETURNADDR.
+ // It's killed at the spill, unless the register is RA and return address
+ // is taken.
+ unsigned Reg = CSI[i].getReg();
+ bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
+ && MF->getFrameInfo()->isReturnAddressTaken();
+ if (!IsRAAndRetAddrIsTaken)
+ EntryBlock->addLiveIn(Reg);
+
+ // Insert the spill to the stack frame.
+ bool IsKill = !IsRAAndRetAddrIsTaken;
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill,
+ CSI[i].getFrameIdx(), RC, TRI);
+ }
+
+ return true;
+}
+
+bool
+MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Reserve the call frame if the maximum call frame size fits into the 16-bit
+ // immediate field and there are no variable-sized objects on the stack.
+ return isInt<16>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects();
+}
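Numeric intuition with made-up sizes: a maximum call frame of 48 bytes passes isInt<16> and gets folded into the fixed frame, while one of 40000 bytes fails the test (40000 > 32767), so the ADJCALLSTACK pseudos survive and $sp is adjusted around each call instead.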
+
+void MipsSEFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+
+ // Mark $fp as used if function has dedicated frame pointer.
+ if (hasFP(MF))
+ MRI.setPhysRegUsed(FP);
+}
+
+const MipsFrameLowering *
+llvm::createMipsSEFrameLowering(const MipsSubtarget &ST) {
+ return new MipsSEFrameLowering(ST);
+}
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
new file mode 100644
index 0000000..6481a0a
--- /dev/null
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -0,0 +1,44 @@
+//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 declaration of the TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSE_FRAMEINFO_H
+#define MIPSSE_FRAMEINFO_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+
+class MipsSEFrameLowering : public MipsFrameLowering {
+public:
+ explicit MipsSEFrameLowering(const MipsSubtarget &STI)
+ : MipsFrameLowering(STI) {}
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
new file mode 100644
index 0000000..eeb1de3
--- /dev/null
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -0,0 +1,320 @@
+//===-- MipsSEInstrInfo.cpp - Mips32/64 Instruction Information -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace llvm;
+
+MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm)
+ : MipsInstrInfo(tm,
+ tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J),
+ RI(*tm.getSubtargetImpl(), *this),
+ IsN64(tm.getSubtarget<MipsSubtarget>().isABI_N64()) {}
+
+const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
+ return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MipsSEInstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+ unsigned Opc = MI->getOpcode();
+
+ if ((Opc == Mips::LW) || (Opc == Mips::LW_P8) || (Opc == Mips::LD) ||
+ (Opc == Mips::LD_P8) || (Opc == Mips::LWC1) || (Opc == Mips::LWC1_P8) ||
+ (Opc == Mips::LDC1) || (Opc == Mips::LDC164) ||
+ (Opc == Mips::LDC164_P8)) {
+ if ((MI->getOperand(1).isFI()) && // is a stack slot
+ (MI->getOperand(2).isImm()) && // the imm is zero
+ (isZeroImm(MI->getOperand(2)))) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ }
+
+ return 0;
+}
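For intuition (operands invented), the pattern matched here is a frame-index load with a zero displacement:

    %vreg0 = LW <fi#3>, 0      matches: FrameIndex = 3, returns %vreg0
    %vreg0 = LW <fi#3>, 8      nonzero offset, so the hook returns 0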
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MipsSEInstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+ unsigned Opc = MI->getOpcode();
+
+ if ((Opc == Mips::SW) || (Opc == Mips::SW_P8) || (Opc == Mips::SD) ||
+ (Opc == Mips::SD_P8) || (Opc == Mips::SWC1) || (Opc == Mips::SWC1_P8) ||
+ (Opc == Mips::SDC1) || (Opc == Mips::SDC164) ||
+ (Opc == Mips::SDC164_P8)) {
+ if ((MI->getOperand(1).isFI()) && // is a stack slot
+ (MI->getOperand(2).isImm()) && // the imm is zero
+ (isZeroImm(MI->getOperand(2)))) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ }
+ return 0;
+}
+
+void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ unsigned Opc = 0, ZeroReg = 0;
+
+ if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
+ if (Mips::CPURegsRegClass.contains(SrcReg))
+ Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
+ else if (Mips::CCRRegClass.contains(SrcReg))
+ Opc = Mips::CFC1;
+ else if (Mips::FGR32RegClass.contains(SrcReg))
+ Opc = Mips::MFC1;
+ else if (SrcReg == Mips::HI)
+ Opc = Mips::MFHI, SrcReg = 0;
+ else if (SrcReg == Mips::LO)
+ Opc = Mips::MFLO, SrcReg = 0;
+ }
+ else if (Mips::CPURegsRegClass.contains(SrcReg)) { // Copy from CPU Reg.
+ if (Mips::CCRRegClass.contains(DestReg))
+ Opc = Mips::CTC1;
+ else if (Mips::FGR32RegClass.contains(DestReg))
+ Opc = Mips::MTC1;
+ else if (DestReg == Mips::HI)
+ Opc = Mips::MTHI, DestReg = 0;
+ else if (DestReg == Mips::LO)
+ Opc = Mips::MTLO, DestReg = 0;
+ }
+ else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_S;
+ else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_D32;
+ else if (Mips::FGR64RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_D64;
+ else if (Mips::CCRRegClass.contains(DestReg, SrcReg))
+ Opc = Mips::MOVCCRToCCR;
+ else if (Mips::CPU64RegsRegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+ if (Mips::CPU64RegsRegClass.contains(SrcReg))
+ Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
+ else if (SrcReg == Mips::HI64)
+ Opc = Mips::MFHI64, SrcReg = 0;
+ else if (SrcReg == Mips::LO64)
+ Opc = Mips::MFLO64, SrcReg = 0;
+ else if (Mips::FGR64RegClass.contains(SrcReg))
+ Opc = Mips::DMFC1;
+ }
+ else if (Mips::CPU64RegsRegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+ if (DestReg == Mips::HI64)
+ Opc = Mips::MTHI64, DestReg = 0;
+ else if (DestReg == Mips::LO64)
+ Opc = Mips::MTLO64, DestReg = 0;
+ else if (Mips::FGR64RegClass.contains(DestReg))
+ Opc = Mips::DMTC1;
+ }
+
+ assert(Opc && "Cannot copy registers");
+
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+ if (DestReg)
+ MIB.addReg(DestReg, RegState::Define);
+
+ if (ZeroReg)
+ MIB.addReg(ZeroReg);
+
+ if (SrcReg)
+ MIB.addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void MipsSEInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+
+ unsigned Opc = 0;
+
+ if (Mips::CPURegsRegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
+ else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
+ else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
+ else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SDC1;
+ else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
+
+ assert(Opc && "Register class not handled!");
+ BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+}
+
+void MipsSEInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const
+{
+ DebugLoc DL;
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+ unsigned Opc = 0;
+
+ if (Mips::CPURegsRegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
+ else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
+ else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
+ else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LDC1;
+ else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
+
+ assert(Opc && "Register class not handled!");
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(0)
+ .addMemOperand(MMO);
+}
+
+bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ MachineBasicBlock &MBB = *MI->getParent();
+
+ switch(MI->getDesc().getOpcode()) {
+ default:
+ return false;
+ case Mips::RetRA:
+ ExpandRetRA(MBB, MI, Mips::RET);
+ break;
+ case Mips::BuildPairF64:
+ ExpandBuildPairF64(MBB, MI);
+ break;
+ case Mips::ExtractElementF64:
+ ExpandExtractElementF64(MBB, MI);
+ break;
+ }
+
+ MBB.erase(MI);
+ return true;
+}
+
+/// GetOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned MipsSEInstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+ switch (Opc) {
+ default: llvm_unreachable("Illegal opcode!");
+ case Mips::BEQ: return Mips::BNE;
+ case Mips::BNE: return Mips::BEQ;
+ case Mips::BGTZ: return Mips::BLEZ;
+ case Mips::BGEZ: return Mips::BLTZ;
+ case Mips::BLTZ: return Mips::BGEZ;
+ case Mips::BLEZ: return Mips::BGTZ;
+ case Mips::BEQ64: return Mips::BNE64;
+ case Mips::BNE64: return Mips::BEQ64;
+ case Mips::BGTZ64: return Mips::BLEZ64;
+ case Mips::BGEZ64: return Mips::BLTZ64;
+ case Mips::BLTZ64: return Mips::BGEZ64;
+ case Mips::BLEZ64: return Mips::BGTZ64;
+ case Mips::BC1T: return Mips::BC1F;
+ case Mips::BC1F: return Mips::BC1T;
+ }
+}
+
+/// Adjust SP by Amount bytes.
+void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+ DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+ unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
+
+ if (isInt<16>(Amount)) // addiu/daddiu $sp, $sp, amount
+ BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
+ else { // Expand immediate that doesn't fit in 16-bit.
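+ // Illustrative expansion (assumes a lui/ori split; the exact sequence is
+ // chosen by MipsAnalyzeImmediate and may differ), e.g. for Amount = 0x12340:
+ // lui $at, 1
+ // ori $at, $at, 0x2340
+ // addu $sp, $sp, $at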
+ unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT;
+
+ MBB.getParent()->getInfo<MipsFunctionInfo>()->setEmitNOAT();
+ Mips::loadImmediate(Amount, STI.isABI_N64(), *this, MBB, I, DL, false, 0);
+ BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(ATReg);
+ }
+}
+
+unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+ return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ ||
+ Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
+ Opc == Mips::BEQ64 || Opc == Mips::BNE64 || Opc == Mips::BGTZ64 ||
+ Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
+ Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::B ||
+ Opc == Mips::J) ?
+ Opc : 0;
+}
+
+void MipsSEInstrInfo::ExpandRetRA(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned Opc) const {
+ BuildMI(MBB, I, I->getDebugLoc(), get(Opc)).addReg(Mips::RA);
+}
+
+void MipsSEInstrInfo::ExpandExtractElementF64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ unsigned DstReg = I->getOperand(0).getReg();
+ unsigned SrcReg = I->getOperand(1).getReg();
+ unsigned N = I->getOperand(2).getImm();
+ const MCInstrDesc& Mfc1Tdd = get(Mips::MFC1);
+ DebugLoc dl = I->getDebugLoc();
+
+ assert(N < 2 && "Invalid immediate");
+ unsigned SubIdx = N ? Mips::sub_fpodd : Mips::sub_fpeven;
+ unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
+
+ BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg);
+}
+
+void MipsSEInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ unsigned DstReg = I->getOperand(0).getReg();
+ unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
+ const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
+ DebugLoc dl = I->getDebugLoc();
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+
+ // mtc1 Lo, $fp (even subregister)
+ // mtc1 Hi, $fp + 1 (odd subregister)
+ BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpeven))
+ .addReg(LoReg);
+ BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpodd))
+ .addReg(HiReg);
+}
+
+const MipsInstrInfo *llvm::createMipsSEInstrInfo(MipsTargetMachine &TM) {
+ return new MipsSEInstrInfo(TM);
+}
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
new file mode 100644
index 0000000..346e74d
--- /dev/null
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -0,0 +1,86 @@
+//===-- MipsSEInstrInfo.h - Mips32/64 Instruction Information ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEINSTRUCTIONINFO_H
+#define MIPSSEINSTRUCTIONINFO_H
+
+#include "MipsInstrInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsSERegisterInfo.h"
+
+namespace llvm {
+
+class MipsSEInstrInfo : public MipsInstrInfo {
+ const MipsSERegisterInfo RI;
+ bool IsN64;
+
+public:
+ explicit MipsSEInstrInfo(MipsTargetMachine &TM);
+
+ virtual const MipsRegisterInfo &getRegisterInfo() const;
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source register along with the FrameIndex of the stored stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+ virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+
+ /// Adjust SP by Amount bytes.
+ void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+private:
+ virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+
+ void ExpandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned Opc) const;
+ void ExpandExtractElementF64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+ void ExpandBuildPairF64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
new file mode 100644
index 0000000..043a1ef
--- /dev/null
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -0,0 +1,138 @@
+//===-- MipsSERegisterInfo.cpp - MIPS32/64 Register Information ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSERegisterInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace llvm;
+
+MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST,
+ const TargetInstrInfo &TII)
+ : MipsRegisterInfo(ST, TII) {}
+
+// This function eliminates the ADJCALLSTACKDOWN and ADJCALLSTACKUP pseudo
+// instructions.
+void MipsSERegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ if (!TFI->hasReservedCallFrame(MF)) {
+ int64_t Amount = I->getOperand(0).getImm();
+
+ if (I->getOpcode() == Mips::ADJCALLSTACKDOWN)
+ Amount = -Amount;
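+ // The stack grows down, so setting up a call frame of N bytes
+ // (ADJCALLSTACKDOWN) is applied as an SP adjustment of -N.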
+
+ const MipsSEInstrInfo *II = static_cast<const MipsSEInstrInfo*>(&TII);
+ unsigned SP = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
+
+ II->adjustStackPtr(SP, Amount, MBB, I);
+ }
+
+ MBB.erase(I);
+}
+
+void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+ unsigned OpNo, int FrameIndex,
+ uint64_t StackSize,
+ int64_t SPOffset) const {
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ int MinCSFI = 0;
+ int MaxCSFI = -1;
+
+ if (CSI.size()) {
+ MinCSFI = CSI[0].getFrameIdx();
+ MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+ }
+
+ // The following stack frame objects are always referenced relative to $sp:
+ // 1. Outgoing arguments.
+ // 2. Pointer to dynamically allocated stack space.
+ // 3. Locations for callee-saved registers.
+ // Everything else is referenced relative to whatever register
+ // getFrameRegister() returns.
+ unsigned FrameReg;
+
+ if (MipsFI->isOutArgFI(FrameIndex) ||
+ (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI))
+ FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
+ else
+ FrameReg = getFrameRegister(MF);
+
+ // Calculate final offset.
+ // - There is no need to change the offset if the frame object is one of the
+// following: an outgoing argument, a pointer to dynamically allocated
+// stack space, or a $gp restore location.
+ // - If the frame object is any of the following, its offset must be adjusted
+ // by adding the size of the stack:
+// an incoming argument, a callee-saved register location, or a local variable.
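+// For instance (illustrative numbers): a local variable at SPOffset -8 in a
+// frame whose StackSize is 32 gets the final offset -8 + 32 = 24.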
+ int64_t Offset;
+
+ if (MipsFI->isOutArgFI(FrameIndex))
+ Offset = SPOffset;
+ else
+ Offset = SPOffset + (int64_t)StackSize;
+
+ Offset += MI.getOperand(OpNo + 1).getImm();
+
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+
+ // If MI is not a debug value, make sure Offset fits in the 16-bit immediate
+ // field.
+ if (!MI.isDebugValue() && !isInt<16>(Offset)) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = II->getDebugLoc();
+ unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+ unsigned ATReg = Subtarget.isABI_N64() ? Mips::AT_64 : Mips::AT;
+ MipsAnalyzeImmediate::Inst LastInst(0, 0);
+
+ MipsFI->setEmitNOAT();
+ Mips::loadImmediate(Offset, Subtarget.isABI_N64(), TII, MBB, II, DL, true,
+ &LastInst);
+ BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg);
+
+ FrameReg = ATReg;
+ Offset = SignExtend64<16>(LastInst.ImmOpnd);
+ }
+
+ MI.getOperand(OpNo).ChangeToRegister(FrameReg, false);
+ MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+}
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
new file mode 100644
index 0000000..4b17b33
--- /dev/null
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -0,0 +1,39 @@
+//===-- MipsSERegisterInfo.h - Mips32/64 Register Information ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEREGISTERINFO_H
+#define MIPSSEREGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+
+class MipsSERegisterInfo : public MipsRegisterInfo {
+public:
+ MipsSERegisterInfo(const MipsSubtarget &Subtarget,
+ const TargetInstrInfo &TII);
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+private:
+ virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+ int FrameIndex, uint64_t StackSize,
+ int64_t SPOffset) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 00347df..11ff809 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -30,7 +30,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false),
- HasMinMax(false), HasSwap(false), HasBitCount(false)
+ HasMinMax(false), HasSwap(false), HasBitCount(false), InMips16Mode(false)
{
std::string CPUName = CPU;
if (CPUName.empty())
@@ -58,9 +58,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
bool
MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
- TargetSubtargetInfo::AntiDepBreakMode& Mode,
- RegClassVector& CriticalPathRCs) const {
- Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ TargetSubtargetInfo::AntiDepBreakMode &Mode,
+ RegClassVector &CriticalPathRCs) const {
+ Mode = TargetSubtargetInfo::ANTIDEP_NONE;
CriticalPathRCs.clear();
CriticalPathRCs.push_back(hasMips64() ?
&Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass);
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 7faf77b..3215c44 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -86,6 +86,9 @@ protected:
// HasBitCount - Count leading '1' and '0' bits.
bool HasBitCount;
+ // InMips16Mode -- can process Mips16 instructions
+ bool InMips16Mode;
+
InstrItineraryData InstrItins;
public:
@@ -124,8 +127,11 @@ public:
bool isSingleFloat() const { return IsSingleFloat; }
bool isNotSingleFloat() const { return !IsSingleFloat; }
bool hasVFPU() const { return HasVFPU; }
+ bool inMips16Mode() const { return InMips16Mode; }
bool isLinux() const { return IsLinux; }
+ bool hasStandardEncoding() const { return !inMips16Mode(); }
+
/// Features related to the presence of specific instructions.
bool hasSEInReg() const { return HasSEInReg; }
bool hasCondMov() const { return HasCondMov; }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 858723b..03a024a 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -13,6 +13,8 @@
#include "MipsTargetMachine.h"
#include "Mips.h"
+#include "MipsFrameLowering.h"
+#include "MipsInstrInfo.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/TargetRegistry.h"
@@ -22,8 +24,8 @@ extern "C" void LLVMInitializeMipsTarget() {
// Register the target.
RegisterTargetMachine<MipsebTargetMachine> X(TheMipsTarget);
RegisterTargetMachine<MipselTargetMachine> Y(TheMipselTarget);
- RegisterTargetMachine<Mips64ebTargetMachine> A(TheMips64Target);
- RegisterTargetMachine<Mips64elTargetMachine> B(TheMips64elTarget);
+ RegisterTargetMachine<MipsebTargetMachine> A(TheMips64Target);
+ RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget);
}
// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
@@ -48,8 +50,8 @@ MipsTargetMachine(const Target &T, StringRef TT,
(Subtarget.isABI_N64() ?
"E-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
"E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
- InstrInfo(*this),
- FrameLowering(Subtarget),
+ InstrInfo(MipsInstrInfo::create(*this)),
+ FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
TLInfo(*this), TSInfo(*this), JITInfo() {
}
@@ -71,24 +73,6 @@ MipselTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL)
: MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-void Mips64ebTargetMachine::anchor() { }
-
-Mips64ebTargetMachine::
-Mips64ebTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-
-void Mips64elTargetMachine::anchor() { }
-
-Mips64elTargetMachine::
-Mips64elTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-
namespace {
/// Mips Code Generator Pass Configuration Options.
class MipsPassConfig : public TargetPassConfig {
@@ -105,8 +89,6 @@ public:
}
virtual bool addInstSelector();
- virtual bool addPreRegAlloc();
- virtual bool addPreSched2();
virtual bool addPreEmitPass();
};
} // namespace
@@ -118,7 +100,7 @@ TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
// Install an instruction selector pass using
// the ISelDag to gen Mips code.
bool MipsPassConfig::addInstSelector() {
- PM->add(createMipsISelDag(getMipsTargetMachine()));
+ addPass(createMipsISelDag(getMipsTargetMachine()));
return false;
}
@@ -126,20 +108,13 @@ bool MipsPassConfig::addInstSelector() {
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
bool MipsPassConfig::addPreEmitPass() {
- PM->add(createMipsDelaySlotFillerPass(getMipsTargetMachine()));
- return true;
-}
+ MipsTargetMachine &TM = getMipsTargetMachine();
+ addPass(createMipsDelaySlotFillerPass(TM));
-bool MipsPassConfig::addPreRegAlloc() {
- // Do not restore $gp if target is Mips64.
- // In N32/64, $gp is a callee-saved register.
- if (!getMipsSubtarget().hasMips64())
- PM->add(createMipsEmitGPRestorePass(getMipsTargetMachine()));
- return true;
-}
+ // NOTE: long branch has not been implemented for mips16.
+ if (TM.getSubtarget<MipsSubtarget>().hasStandardEncoding())
+ addPass(createMipsLongBranchPass(TM));
-bool MipsPassConfig::addPreSched2() {
- PM->add(createMipsExpandPseudoPass(getMipsTargetMachine()));
return true;
}
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 80c00e8..21b49e6 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -25,56 +25,56 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
- class formatted_raw_ostream;
+class formatted_raw_ostream;
+class MipsRegisterInfo;
+
+class MipsTargetMachine : public LLVMTargetMachine {
+ MipsSubtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ const MipsInstrInfo *InstrInfo;
+ const MipsFrameLowering *FrameLowering;
+ MipsTargetLowering TLInfo;
+ MipsSelectionDAGInfo TSInfo;
+ MipsJITInfo JITInfo;
- class MipsTargetMachine : public LLVMTargetMachine {
- MipsSubtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
- MipsInstrInfo InstrInfo;
- MipsFrameLowering FrameLowering;
- MipsTargetLowering TLInfo;
- MipsSelectionDAGInfo TSInfo;
- MipsJITInfo JITInfo;
-
- public:
- MipsTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle);
-
- virtual const MipsInstrInfo *getInstrInfo() const
- { return &InstrInfo; }
- virtual const TargetFrameLowering *getFrameLowering() const
- { return &FrameLowering; }
- virtual const MipsSubtarget *getSubtargetImpl() const
- { return &Subtarget; }
- virtual const TargetData *getTargetData() const
- { return &DataLayout;}
- virtual MipsJITInfo *getJITInfo()
- { return &JITInfo; }
-
-
- virtual const MipsRegisterInfo *getRegisterInfo() const {
- return &InstrInfo.getRegisterInfo();
- }
-
- virtual const MipsTargetLowering *getTargetLowering() const {
- return &TLInfo;
- }
-
- virtual const MipsSelectionDAGInfo* getSelectionDAGInfo() const {
- return &TSInfo;
- }
-
- // Pass Pipeline Configuration
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
- virtual bool addCodeEmitter(PassManagerBase &PM,
- JITCodeEmitter &JCE);
-
- };
+public:
+ MipsTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL,
+ bool isLittle);
+
+ virtual ~MipsTargetMachine() { delete InstrInfo; }
+
+ virtual const MipsInstrInfo *getInstrInfo() const
+ { return InstrInfo; }
+ virtual const TargetFrameLowering *getFrameLowering() const
+ { return FrameLowering; }
+ virtual const MipsSubtarget *getSubtargetImpl() const
+ { return &Subtarget; }
+ virtual const TargetData *getTargetData() const
+ { return &DataLayout;}
+ virtual MipsJITInfo *getJITInfo()
+ { return &JITInfo; }
+
+ virtual const MipsRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo->getRegisterInfo();
+ }
+
+ virtual const MipsTargetLowering *getTargetLowering() const {
+ return &TLInfo;
+ }
+
+ virtual const MipsSelectionDAGInfo* getSelectionDAGInfo() const {
+ return &TSInfo;
+ }
+
+ // Pass Pipeline Configuration
+ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+ virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
+};
-/// MipsebTargetMachine - Mips32 big endian target machine.
+/// MipsebTargetMachine - Mips32/64 big endian target machine.
///
class MipsebTargetMachine : public MipsTargetMachine {
virtual void anchor();
@@ -85,7 +85,7 @@ public:
CodeGenOpt::Level OL);
};
-/// MipselTargetMachine - Mips32 little endian target machine.
+/// MipselTargetMachine - Mips32/64 little endian target machine.
///
class MipselTargetMachine : public MipsTargetMachine {
virtual void anchor();
@@ -96,29 +96,6 @@ public:
CodeGenOpt::Level OL);
};
-/// Mips64ebTargetMachine - Mips64 big endian target machine.
-///
-class Mips64ebTargetMachine : public MipsTargetMachine {
- virtual void anchor();
-public:
- Mips64ebTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-
-/// Mips64elTargetMachine - Mips64 little endian target machine.
-///
-class Mips64elTargetMachine : public MipsTargetMachine {
- virtual void anchor();
-public:
- Mips64elTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
} // End llvm namespace
#endif
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
new file mode 100644
index 0000000..7cb16b4
--- /dev/null
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -0,0 +1,34 @@
+set(LLVM_TARGET_DEFINITIONS NVPTX.td)
+
+
+tablegen(LLVM NVPTXGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM NVPTXGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM NVPTXGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM NVPTXGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(NVPTXCommonTableGen)
+
+set(NVPTXCodeGen_sources
+ NVPTXFrameLowering.cpp
+ NVPTXInstrInfo.cpp
+ NVPTXISelDAGToDAG.cpp
+ NVPTXISelLowering.cpp
+ NVPTXRegisterInfo.cpp
+ NVPTXSubtarget.cpp
+ NVPTXTargetMachine.cpp
+ NVPTXSplitBBatBar.cpp
+ NVPTXLowerAggrCopies.cpp
+ NVPTXutil.cpp
+ NVPTXAllocaHoisting.cpp
+ NVPTXAsmPrinter.cpp
+ NVPTXUtilities.cpp
+ VectorElementize.cpp
+ )
+
+add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
+
+add_dependencies(LLVMNVPTXCodeGen intrinsics_gen)
+
+add_subdirectory(TargetInfo)
+add_subdirectory(InstPrinter)
+add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/NVPTX/InstPrinter/CMakeLists.txt b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000..ae4c751
--- /dev/null
+++ b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMNVPTXAsmPrinter
+ NVPTXInstPrinter.cpp
+ )
+
+add_dependencies(LLVMNVPTXAsmPrinter NVPTXCommonTableGen)
diff --git a/lib/Target/PTX/InstPrinter/LLVMBuild.txt b/lib/Target/NVPTX/InstPrinter/LLVMBuild.txt
index af5d200..032b573 100644
--- a/lib/Target/PTX/InstPrinter/LLVMBuild.txt
+++ b/lib/Target/NVPTX/InstPrinter/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/PTX/InstPrinter/LLVMBuild.txt ---------------*- Conf -*--===;
+;===- ./lib/Target/NVPTX/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -17,7 +17,7 @@
[component_0]
type = Library
-name = PTXAsmPrinter
-parent = PTX
+name = NVPTXAsmPrinter
+parent = NVPTX
required_libraries = MC Support
-add_to_library_groups = PTX
+add_to_library_groups = NVPTX
diff --git a/lib/Target/PTX/InstPrinter/Makefile b/lib/Target/NVPTX/InstPrinter/Makefile
index 0ccfe44..7b78654 100644
--- a/lib/Target/PTX/InstPrinter/Makefile
+++ b/lib/Target/NVPTX/InstPrinter/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/PTX/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+##===- lib/Target/NVPTX/AsmPrinter/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
@@ -7,10 +7,9 @@
#
##===----------------------------------------------------------------------===##
LEVEL = ../../../..
-LIBRARYNAME = LLVMPTXAsmPrinter
+LIBRARYNAME = LLVMNVPTXAsmPrinter
# Hack: we need to include 'main' ptx target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
new file mode 100644
index 0000000..10051c7
--- /dev/null
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -0,0 +1 @@
+// Placeholder
diff --git a/lib/Target/PTX/LLVMBuild.txt b/lib/Target/NVPTX/LLVMBuild.txt
index 15a1eb5..e2d6ed2 100644
--- a/lib/Target/PTX/LLVMBuild.txt
+++ b/lib/Target/NVPTX/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/PTX/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;===- ./lib/Target/NVPTX/LLVMBuild.txt -------------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -20,13 +20,13 @@ subdirectories = InstPrinter MCTargetDesc TargetInfo
[component_0]
type = TargetGroup
-name = PTX
+name = NVPTX
parent = Target
has_asmprinter = 1
[component_1]
type = Library
-name = PTXCodeGen
-parent = PTX
-required_libraries = Analysis AsmPrinter CodeGen Core MC PTXDesc PTXInfo SelectionDAG Support Target TransformUtils
-add_to_library_groups = PTX
+name = NVPTXCodeGen
+parent = NVPTX
+required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXDesc NVPTXInfo SelectionDAG Support Target TransformUtils
+add_to_library_groups = NVPTX
diff --git a/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 0000000..a030d9f
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_llvm_library(LLVMNVPTXDesc
+ NVPTXMCAsmInfo.cpp
+ NVPTXMCTargetDesc.cpp
+ )
+
+add_dependencies(LLVMNVPTXDesc NVPTXCommonTableGen)
+
+# Hack: we need to include 'main' target directory to grab private headers
+#include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt b/lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt
index 19b80c5..01a051a 100644
--- a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/PTX/MCTargetDesc/LLVMBuild.txt --------------*- Conf -*--===;
+;===- ./lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -17,7 +17,7 @@
[component_0]
type = Library
-name = PTXDesc
-parent = PTX
-required_libraries = MC PTXAsmPrinter PTXInfo Support
-add_to_library_groups = PTX
+name = NVPTXDesc
+parent = NVPTX
+required_libraries = MC NVPTXAsmPrinter NVPTXInfo Support
+add_to_library_groups = NVPTX
diff --git a/lib/Target/PTX/MCTargetDesc/Makefile b/lib/Target/NVPTX/MCTargetDesc/Makefile
index 35f5a7b..31d06cb 100644
--- a/lib/Target/PTX/MCTargetDesc/Makefile
+++ b/lib/Target/NVPTX/MCTargetDesc/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/PTX/TargetDesc/Makefile ------------------*- Makefile -*-===##
+##===- lib/Target/NVPTX/TargetDesc/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
@@ -8,7 +8,7 @@
##===----------------------------------------------------------------------===##
LEVEL = ../../../..
-LIBRARYNAME = LLVMPTXDesc
+LIBRARYNAME = LLVMNVPTXDesc
# Hack: we need to include 'main' target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
new file mode 100644
index 0000000..4545838
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -0,0 +1,88 @@
+//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the NVPTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXBASEINFO_H
+#define NVPTXBASEINFO_H
+
+namespace llvm {
+
+enum AddressSpace {
+ ADDRESS_SPACE_GENERIC = 0,
+ ADDRESS_SPACE_GLOBAL = 1,
+ ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space
+ ADDRESS_SPACE_SHARED = 3,
+ ADDRESS_SPACE_CONST = 4,
+ ADDRESS_SPACE_LOCAL = 5,
+
+ // NVVM Internal
+ ADDRESS_SPACE_PARAM = 101
+};
+
+enum PropertyAnnotation {
+ PROPERTY_MAXNTID_X = 0,
+ PROPERTY_MAXNTID_Y,
+ PROPERTY_MAXNTID_Z,
+ PROPERTY_REQNTID_X,
+ PROPERTY_REQNTID_Y,
+ PROPERTY_REQNTID_Z,
+ PROPERTY_MINNCTAPERSM,
+ PROPERTY_ISTEXTURE,
+ PROPERTY_ISSURFACE,
+ PROPERTY_ISSAMPLER,
+ PROPERTY_ISREADONLY_IMAGE_PARAM,
+ PROPERTY_ISWRITEONLY_IMAGE_PARAM,
+ PROPERTY_ISKERNEL_FUNCTION,
+ PROPERTY_ALIGN,
+
+ // last property
+ PROPERTY_LAST
+};
+
+const unsigned AnnotationNameLen = 8; // length of each annotation name
+const char
+PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = {
+ "maxntidx", // PROPERTY_MAXNTID_X
+ "maxntidy", // PROPERTY_MAXNTID_Y
+ "maxntidz", // PROPERTY_MAXNTID_Z
+ "reqntidx", // PROPERTY_REQNTID_X
+ "reqntidy", // PROPERTY_REQNTID_Y
+ "reqntidz", // PROPERTY_REQNTID_Z
+ "minctasm", // PROPERTY_MINNCTAPERSM
+ "texture", // PROPERTY_ISTEXTURE
+ "surface", // PROPERTY_ISSURFACE
+ "sampler", // PROPERTY_ISSAMPLER
+ "rdoimage", // PROPERTY_ISREADONLY_IMAGE_PARAM
+ "wroimage", // PROPERTY_ISWRITEONLY_IMAGE_PARAM
+ "kernel", // PROPERTY_ISKERNEL_FUNCTION
+ "align", // PROPERTY_ALIGN
+
+ // last property
+ "proplast", // PROPERTY_LAST
+};
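+
+// A property's annotation name is obtained by direct indexing, e.g.
+// PropertyAnnotationNames[PROPERTY_MAXNTID_X] yields "maxntidx".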
+
+// name of named metadata used for global annotations
+#if defined(__GNUC__)
+// Because this variable is static, .cpp files that include this header but
+// never use it would otherwise get an unused-variable warning from gcc,
+// hence __attribute__((unused)).
+__attribute__((unused))
+#endif
+static const char* NamedMDForAnnotations = "nvvm.annotations";
+
+}
+
+
+#endif
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
new file mode 100644
index 0000000..1d41665
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -0,0 +1,63 @@
+//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the NVPTXMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+bool CompileForDebugging;
+
+// -debug-compile - Command line option to inform opt and llc passes to
+// compile for debugging
+static cl::opt<bool, true>
+Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden,
+ cl::location(CompileForDebugging),
+ cl::init(false));
+
+void NVPTXMCAsmInfo::anchor() { }
+
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) {
+ Triple TheTriple(TT);
+ if (TheTriple.getArch() == Triple::nvptx64)
+ PointerSize = 8;
+
+ CommentString = "//";
+
+ PrivateGlobalPrefix = "$L__";
+
+ AllowPeriodsInName = false;
+
+ HasSetDirective = false;
+
+ HasSingleParameterDotFile = false;
+
+ InlineAsmStart = " inline asm";
+ InlineAsmEnd = " inline asm";
+
+ SupportsDebugInformation = CompileForDebugging;
+ HasDotTypeDotSizeDirective = false;
+
+ Data8bitsDirective = " .b8 ";
+ Data16bitsDirective = " .b16 ";
+ Data32bitsDirective = " .b32 ";
+ Data64bitsDirective = " .b64 ";
+ PrivateGlobalPrefix = "";
+ ZeroDirective = " .b8";
+ AsciiDirective = " .b8";
+ AscizDirective = " .b8";
+
+ // @TODO: Can we just disable this?
+ GlobalDirective = "\t// .globl\t";
+}
diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 32ca069..82097da 100644
--- a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -1,4 +1,4 @@
-//===-- PTXMCAsmInfo.h - PTX asm properties --------------------*- C++ -*--===//
+//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,24 +7,24 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains the declaration of the PTXMCAsmInfo class.
+// This file contains the declaration of the NVPTXMCAsmInfo class.
//
//===----------------------------------------------------------------------===//
-#ifndef PTX_MCASM_INFO_H
-#define PTX_MCASM_INFO_H
+#ifndef NVPTX_MCASM_INFO_H
+#define NVPTX_MCASM_INFO_H
#include "llvm/MC/MCAsmInfo.h"
namespace llvm {
- class Target;
- class StringRef;
+class Target;
+class StringRef;
- class PTXMCAsmInfo : public MCAsmInfo {
- virtual void anchor();
- public:
- explicit PTXMCAsmInfo(const Target &T, const StringRef &TT);
- };
+class NVPTXMCAsmInfo : public MCAsmInfo {
+ virtual void anchor();
+public:
+ explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT);
+};
} // namespace llvm
-#endif // PTX_MCASM_INFO_H
+#endif // NVPTX_MCASM_INFO_H
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
new file mode 100644
index 0000000..44aa01c
--- /dev/null
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -0,0 +1,91 @@
+//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCTargetDesc.h"
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "NVPTXGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+
+using namespace llvm;
+
+static MCInstrInfo *createNVPTXMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitNVPTXMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ // PTX does not have a return address register.
+ InitNVPTXMCRegisterInfo(X, 0);
+ return X;
+}
+
+static MCSubtargetInfo *createNVPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
+ StringRef FS) {
+ MCSubtargetInfo *X = new MCSubtargetInfo();
+ InitNVPTXMCSubtargetInfo(X, TT, CPU, FS);
+ return X;
+}
+
+static MCCodeGenInfo *createNVPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM, OL);
+ return X;
+}
+
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32);
+ RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32,
+ createNVPTXMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64,
+ createNVPTXMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32,
+ createNVPTXMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64,
+ createNVPTXMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32,
+ createNVPTXMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64,
+ createNVPTXMCSubtargetInfo);
+
+}
diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
index 542638a..af95c76 100644
--- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -1,4 +1,4 @@
-//===-- PTXMCTargetDesc.h - PTX Target Descriptions ------------*- C++ -*-===//
+//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,30 +7,30 @@
//
//===----------------------------------------------------------------------===//
//
-// This file provides PTX specific target descriptions.
+// This file provides NVPTX specific target descriptions.
//
//===----------------------------------------------------------------------===//
-#ifndef PTXMCTARGETDESC_H
-#define PTXMCTARGETDESC_H
+#ifndef NVPTXMCTARGETDESC_H
+#define NVPTXMCTARGETDESC_H
namespace llvm {
class Target;
-extern Target ThePTX32Target;
-extern Target ThePTX64Target;
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
} // End llvm namespace
// Defines symbolic names for PTX registers.
#define GET_REGINFO_ENUM
-#include "PTXGenRegisterInfo.inc"
+#include "NVPTXGenRegisterInfo.inc"
// Defines symbolic names for the PTX instructions.
#define GET_INSTRINFO_ENUM
-#include "PTXGenInstrInfo.inc"
+#include "NVPTXGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
-#include "PTXGenSubtargetInfo.inc"
+#include "NVPTXGenSubtargetInfo.inc"
#endif
diff --git a/lib/Target/PTX/Makefile b/lib/Target/NVPTX/Makefile
index fa09634..8db20eb 100644
--- a/lib/Target/PTX/Makefile
+++ b/lib/Target/NVPTX/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/PTX/Makefile -----------------------------*- Makefile -*-===##
+##===- lib/Target/NVPTX/Makefile ---------------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
@@ -8,15 +8,15 @@
##===----------------------------------------------------------------------===##
LEVEL = ../../..
-LIBRARYNAME = LLVMPTXCodeGen
-TARGET = PTX
+LIBRARYNAME = LLVMNVPTXCodeGen
+TARGET = NVPTX
# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = PTXGenAsmWriter.inc \
- PTXGenDAGISel.inc \
- PTXGenInstrInfo.inc \
- PTXGenRegisterInfo.inc \
- PTXGenSubtargetInfo.inc
+BUILT_SOURCES = NVPTXGenAsmWriter.inc \
+ NVPTXGenDAGISel.inc \
+ NVPTXGenInstrInfo.inc \
+ NVPTXGenRegisterInfo.inc \
+ NVPTXGenSubtargetInfo.inc
DIRS = InstPrinter TargetInfo MCTargetDesc
diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h
new file mode 100644
index 0000000..b568488
--- /dev/null
+++ b/lib/Target/NVPTX/ManagedStringPool.h
@@ -0,0 +1,49 @@
+//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The strings allocated from a managed string pool are owned by the string
+// pool and will be deleted together with the managed string pool.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_SUPPORT_MANAGED_STRING_H
+#define LLVM_SUPPORT_MANAGED_STRING_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <string>
+
+namespace llvm {
+
+/// ManagedStringPool - The strings allocated from a managed string pool are
+/// owned by the string pool and will be deleted together with the managed
+/// string pool.
+class ManagedStringPool {
+ SmallVector<std::string *, 8> Pool;
+
+public:
+ ManagedStringPool() {}
+ ~ManagedStringPool() {
+ SmallVector<std::string *, 8>::iterator Current = Pool.begin();
+ while (Current != Pool.end()) {
+ delete *Current;
+ Current++;
+ }
+ }
+
+ std::string *getManagedString(const char *S) {
+ std::string *Str = new std::string(S);
+ Pool.push_back(Str);
+ return Str;
+ }
+};
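+
+// Typical use (sketch): the returned pointer stays valid for the pool's
+// lifetime.
+// ManagedStringPool Pool;
+// std::string *S = Pool.getManagedString("foo");
+// ... S is deleted when Pool is destroyed.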
+
+}
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
new file mode 100644
index 0000000..a8d082a
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -0,0 +1,137 @@
+//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM NVPTX back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_H
+#define LLVM_TARGET_NVPTX_H
+
+#include "llvm/Value.h"
+#include "llvm/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include <cassert>
+#include <iosfwd>
+
+namespace llvm {
+class NVPTXTargetMachine;
+class FunctionPass;
+class formatted_raw_ostream;
+
+namespace NVPTXCC {
+enum CondCodes {
+ EQ,
+ NE,
+ LT,
+ LE,
+ GT,
+ GE
+};
+}
+
+inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
+ switch (CC) {
+ case NVPTXCC::NE: return "ne";
+ case NVPTXCC::EQ: return "eq";
+ case NVPTXCC::LT: return "lt";
+ case NVPTXCC::LE: return "le";
+ case NVPTXCC::GT: return "gt";
+ case NVPTXCC::GE: return "ge";
+ }
+ llvm_unreachable("Unknown condition code");
+}
+
+FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel);
+FunctionPass *createVectorElementizePass(NVPTXTargetMachine &);
+FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &);
+FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &);
+FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);
+
+bool isImageOrSamplerVal(const Value *, const Module *);
+
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
+
+namespace NVPTX
+{
+enum DrvInterface {
+ NVCL,
+ CUDA,
+ TEST
+};
+
+// A field inside TSFlags needs a shift and a mask. The usage is
+// always as follows:
+// ((TSFlags & fieldMask) >> fieldShift)
+// The enum keeps the mask, the shift, and all valid values of the
+// field in one place.
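+// For example, an instruction's vector instruction type is extracted as
+// ((TSFlags & VecInstTypeMask) >> VecInstTypeShift).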
+enum VecInstType {
+ VecInstTypeShift = 0,
+ VecInstTypeMask = 0xF,
+
+ VecNOP = 0,
+ VecLoad = 1,
+ VecStore = 2,
+ VecBuild = 3,
+ VecShuffle = 4,
+ VecExtract = 5,
+ VecInsert = 6,
+ VecDest = 7,
+ VecOther = 15
+};
+
+enum SimpleMove {
+ SimpleMoveMask = 0x10,
+ SimpleMoveShift = 4
+};
+enum LoadStore {
+ isLoadMask = 0x20,
+ isLoadShift = 5,
+ isStoreMask = 0x40,
+ isStoreShift = 6
+};
+
+namespace PTXLdStInstCode {
+enum AddressSpace{
+ GENERIC = 0,
+ GLOBAL = 1,
+ CONSTANT = 2,
+ SHARED = 3,
+ PARAM = 4,
+ LOCAL = 5
+};
+enum FromType {
+ Unsigned = 0,
+ Signed,
+ Float
+};
+enum VecType {
+ Scalar = 1,
+ V2 = 2,
+ V4 = 4
+};
+}
+}
+} // end namespace llvm
+
+// Defines symbolic names for NVPTX registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the NVPTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
new file mode 100644
index 0000000..ae7710e
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -0,0 +1,44 @@
+//===- NVPTX.td - Describe the NVPTX Target Machine ----------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the NVPTX target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of an explicit feature table.
+// - At least one feature is needed to keep TableGen from generating a
+// zero-sized array in NVPTXGenSubtarget.inc.
+//===----------------------------------------------------------------------===//
+def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"sm_10", [FeatureDummy]>;
+
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+ let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 0000000..668c393
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,48 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "NVPTXAllocaHoisting.h"
+
+namespace llvm {
+
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+ bool functionModified = false;
+ Function::iterator I = function.begin();
+ TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
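+
+ // Each constant-sized alloca found in a later block is moved into the entry
+ // block, just before the entry block's terminator.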
+
+ for (Function::iterator E = function.end(); I != E; ++I) {
+ for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+ AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+ if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+ allocaInst->moveBefore(firstTerminatorInst);
+ functionModified = true;
+ }
+ }
+ }
+
+ return functionModified;
+}
+
+char NVPTXAllocaHoisting::ID = 1;
+RegisterPass<NVPTXAllocaHoisting> X("alloca-hoisting",
+ "Hoisting alloca instructions in non-entry "
+ "blocks to the entry block");
+
+FunctionPass *createAllocaHoisting() {
+ return new NVPTXAllocaHoisting();
+}
+
+} // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 0000000..24b3bd5
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,49 @@
+//===-- AllocaHoisting.h - Hoist allocas to the entry block -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_ALLOCA_HOISTING_H_
+#define NVPTX_ALLOCA_HOISTING_H_
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+class FunctionPass;
+class Function;
+
+// Hoisting the alloca instructions in the non-entry blocks to the entry
+// block.
+class NVPTXAllocaHoisting : public FunctionPass {
+public:
+ static char ID; // Pass ID
+ NVPTXAllocaHoisting() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ AU.addPreserved<MachineFunctionAnalysis>();
+ }
+
+ virtual const char *getPassName() const {
+ return "NVPTX specific alloca hoisting";
+ }
+
+ virtual bool runOnFunction(Function &function);
+};
+
+extern FunctionPass *createAllocaHoisting();
+
+} // end namespace llvm
+
+#endif // NVPTX_ALLOCA_HOISTING_H_
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
new file mode 100644
index 0000000..f2b9616
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -0,0 +1,2064 @@
+//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAsmPrinter.h"
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTXNumRegisters.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/TimeValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Assembly/Writer.h"
+#include "cl_common_defines.h"
+#include <sstream>
+using namespace llvm;
+
+
+#include "NVPTXGenAsmWriter.inc"
+
+bool RegAllocNilUsed = true;
+
+#define DEPOTNAME "__local_depot"
+
+static cl::opt<bool>
+EmitLineNumbers("nvptx-emit-line-numbers",
+ cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
+ cl::init(true));
+
+namespace llvm {
+bool InterleaveSrcInPtx = false;
+}
+
+static cl::opt<bool, true>InterleaveSrc("nvptx-emit-src",
+ cl::ZeroOrMore,
+ cl::desc("NVPTX Specific: Emit source line in ptx file"),
+ cl::location(llvm::InterleaveSrcInPtx));
+
+
+
+
+// @TODO: This is a copy from AsmPrinter.cpp. The function is static, so we
+// cannot just link to the existing version.
+/// LowerConstant - Lower the specified LLVM Constant to an MCExpr.
+///
+using namespace nvptx;
+const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
+ MCContext &Ctx = AP.OutContext;
+
+ if (CV->isNullValue() || isa<UndefValue>(CV))
+ return MCConstantExpr::Create(0, Ctx);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+ return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+ return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+
+ if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
+ return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+
+ const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+ if (CE == 0)
+ llvm_unreachable("Unknown constant value to lower!");
+
+
+ switch (CE->getOpcode()) {
+ default:
+ // If the code isn't optimized, there may be outstanding folding
+ // opportunities. Attempt to fold the expression using TargetData as a
+ // last resort before giving up.
+ if (Constant *C =
+ ConstantFoldConstantExpression(CE, AP.TM.getTargetData()))
+ if (C != CE)
+ return LowerConstant(C, AP);
+
+ // Otherwise report the problem to the user.
+ {
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ WriteAsOperand(OS, CE, /*PrintType=*/false,
+ !AP.MF ? 0 : AP.MF->getFunction()->getParent());
+ report_fatal_error(OS.str());
+ }
+ case Instruction::GetElementPtr: {
+ const TargetData &TD = *AP.TM.getTargetData();
+ // Generate a symbolic expression for the byte address
+ const Constant *PtrVal = CE->getOperand(0);
+ SmallVector<Value*, 8> IdxVec(CE->op_begin()+1, CE->op_end());
+ int64_t Offset = TD.getIndexedOffset(PtrVal->getType(), IdxVec);
+
+ const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
+ if (Offset == 0)
+ return Base;
+
+ // Truncate/sext the offset to the pointer size.
+ if (TD.getPointerSizeInBits() != 64) {
+ int SExtAmount = 64-TD.getPointerSizeInBits();
+ Offset = (Offset << SExtAmount) >> SExtAmount;
+ }
+
+ return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
+ Ctx);
+ }
+
+ case Instruction::Trunc:
+ // We emit the value and depend on the assembler to truncate the generated
+ // expression properly. This is important for differences between
+ // blockaddress labels. Since the two labels are in the same function, it
+ // is reasonable to treat their delta as a 32-bit value.
+ // FALL THROUGH.
+ case Instruction::BitCast:
+ return LowerConstant(CE->getOperand(0), AP);
+
+ case Instruction::IntToPtr: {
+ const TargetData &TD = *AP.TM.getTargetData();
+ // Handle casts to pointers by changing them into casts to the appropriate
+ // integer type. This promotes constant folding and simplifies this code.
+ Constant *Op = CE->getOperand(0);
+ Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+ false/*ZExt*/);
+ return LowerConstant(Op, AP);
+ }
+
+ case Instruction::PtrToInt: {
+ const TargetData &TD = *AP.TM.getTargetData();
+ // Support only foldable casts to/from pointers that can be eliminated by
+ // changing the pointer to the appropriately sized integer type.
+ Constant *Op = CE->getOperand(0);
+ Type *Ty = CE->getType();
+
+ const MCExpr *OpExpr = LowerConstant(Op, AP);
+
+ // We can emit the pointer value into this slot if the slot is an
+ // integer slot equal to the size of the pointer.
+ if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+ return OpExpr;
+
+ // Otherwise the pointer is smaller than the resultant integer, mask off
+ // the high bits so we are sure to get a proper truncation if the input is
+ // a constant expr.
+ unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+ const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx);
+ return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
+ }
+
+ // The MC library also has a right-shift operator, but it isn't consistently
+ // signed or unsigned between different targets.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::Shl:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP);
+ const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP);
+ switch (CE->getOpcode()) {
+ default: llvm_unreachable("Unknown binary operator constant cast expr");
+ case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
+ case Instruction::Sub: return MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+ case Instruction::Mul: return MCBinaryExpr::CreateMul(LHS, RHS, Ctx);
+ case Instruction::SDiv: return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx);
+ case Instruction::SRem: return MCBinaryExpr::CreateMod(LHS, RHS, Ctx);
+ case Instruction::Shl: return MCBinaryExpr::CreateShl(LHS, RHS, Ctx);
+ case Instruction::And: return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx);
+ case Instruction::Or: return MCBinaryExpr::CreateOr (LHS, RHS, Ctx);
+ case Instruction::Xor: return MCBinaryExpr::CreateXor(LHS, RHS, Ctx);
+ }
+ }
+ }
+}
+
+
+void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI)
+{
+ if (!EmitLineNumbers)
+ return;
+ if (ignoreLoc(MI))
+ return;
+
+ DebugLoc curLoc = MI.getDebugLoc();
+
+ if (prevDebugLoc.isUnknown() && curLoc.isUnknown())
+ return;
+
+ if (prevDebugLoc == curLoc)
+ return;
+
+ prevDebugLoc = curLoc;
+
+ if (curLoc.isUnknown())
+ return;
+
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ //const TargetMachine &TM = MF->getTarget();
+
+ const LLVMContext &ctx = MF->getFunction()->getContext();
+ DIScope Scope(curLoc.getScope(ctx));
+
+ if (!Scope.Verify())
+ return;
+
+ StringRef fileName(Scope.getFilename());
+ StringRef dirName(Scope.getDirectory());
+ SmallString<128> FullPathName = dirName;
+ if (!dirName.empty() && !sys::path::is_absolute(fileName)) {
+ sys::path::append(FullPathName, fileName);
+ fileName = FullPathName.str();
+ }
+
+ if (filenameMap.find(fileName.str()) == filenameMap.end())
+ return;
+
+
+ // Emit the line from the source file.
+ if (llvm::InterleaveSrcInPtx)
+ this->emitSrcInText(fileName.str(), curLoc.getLine());
+
+ std::stringstream temp;
+ temp << "\t.loc " << filenameMap[fileName.str()]
+ << " " << curLoc.getLine() << " " << curLoc.getCol();
+ OutStreamer.EmitRawText(Twine(temp.str().c_str()));
+}
+
+void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+ emitLineNumberAsDotLoc(*MI);
+ printInstruction(MI, OS);
+ OutStreamer.EmitRawText(OS.str());
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const Function *F,
+ raw_ostream &O)
+{
+ const TargetData *TD = TM.getTargetData();
+ const TargetLowering *TLI = TM.getTargetLowering();
+
+ Type *Ty = F->getReturnType();
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+ if (Ty->getTypeID() == Type::VoidTyID)
+ return;
+
+ O << " (";
+
+ if (isABI) {
+ if (Ty->isPrimitiveType() || Ty->isIntegerTy()) {
+ unsigned size = 0;
+ if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
+ size = ITy->getBitWidth();
+ if (size < 32) size = 32;
+ } else {
+ assert(Ty->isFloatingPointTy() &&
+ "Floating point type expected here");
+ size = Ty->getPrimitiveSizeInBits();
+ }
+
+ O << ".param .b" << size << " func_retval0";
+ }
+ else if (isa<PointerType>(Ty)) {
+ O << ".param .b" << TLI->getPointerTy().getSizeInBits()
+ << " func_retval0";
+ } else {
+ if ((Ty->getTypeID() == Type::StructTyID) ||
+ isa<VectorType>(Ty)) {
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*TLI, Ty, vtparts);
+ unsigned totalsz = 0;
+ for (unsigned i=0,e=vtparts.size(); i!=e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+ for (unsigned j=0, je=elems; j!=je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 8)) sz = 8;
+ totalsz += sz/8;
+ }
+ }
+ unsigned retAlignment = 0;
+ if (!llvm::getAlign(*F, 0, retAlignment))
+ retAlignment = TD->getABITypeAlignment(Ty);
+ O << ".param .align "
+ << retAlignment
+ << " .b8 func_retval0["
+ << totalsz << "]";
+ } else
+ assert(false &&
+ "Unknown return type");
+ }
+ } else {
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*TLI, Ty, vtparts);
+ unsigned idx = 0;
+ for (unsigned i=0,e=vtparts.size(); i!=e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j=0, je=elems; j!=je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32)) sz = 32;
+ O << ".reg .b" << sz << " func_retval" << idx;
+ if (j<je-1) O << ", ";
+ ++idx;
+ }
+ if (i < e-1)
+ O << ", ";
+ }
+ }
+ O << ") ";
+ return;
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
+ raw_ostream &O) {
+ const Function *F = MF.getFunction();
+ printReturnValStr(F, O);
+}
+
+void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
+ SmallString<128> Str;
+ raw_svector_ostream O(Str);
+
+ // Set up
+ MRI = &MF->getRegInfo();
+ F = MF->getFunction();
+ emitLinkageDirective(F,O);
+ if (llvm::isKernelFunction(*F))
+ O << ".entry ";
+ else {
+ O << ".func ";
+ printReturnValStr(*MF, O);
+ }
+
+ O << *CurrentFnSym;
+
+ emitFunctionParamList(*MF, O);
+
+ if (llvm::isKernelFunction(*F))
+ emitKernelFunctionDirectives(*F, O);
+
+ OutStreamer.EmitRawText(O.str());
+
+ prevDebugLoc = DebugLoc();
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+ unsigned numRegClasses = TRI.getNumRegClasses();
+ VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses+1];
+ OutStreamer.EmitRawText(StringRef("{\n"));
+ setAndEmitFunctionVirtualRegisters(*MF);
+
+ SmallString<128> Str;
+ raw_svector_ostream O(Str);
+ emitDemotedVars(MF->getFunction(), O);
+ OutStreamer.EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
+ OutStreamer.EmitRawText(StringRef("}\n"));
+ delete []VRidGlobal2LocalMap;
+}
+
+
+void
+NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function& F,
+ raw_ostream &O) const {
+ // If the NVVM IR has some of reqntid* specified, then output
+ // the reqntid directive, and set the unspecified ones to 1.
+ // If none of reqntid* is specified, don't output reqntid directive.
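+  // For example (illustrative), reqntidx=16 and reqntidy=16 with no
+  // reqntidz specified yields ".reqntid 16, 16, 1".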
+ unsigned reqntidx, reqntidy, reqntidz;
+ bool specified = false;
+ if (llvm::getReqNTIDx(F, reqntidx) == false) reqntidx = 1;
+ else specified = true;
+ if (llvm::getReqNTIDy(F, reqntidy) == false) reqntidy = 1;
+ else specified = true;
+ if (llvm::getReqNTIDz(F, reqntidz) == false) reqntidz = 1;
+ else specified = true;
+
+ if (specified)
+ O << ".reqntid " << reqntidx << ", "
+ << reqntidy << ", " << reqntidz << "\n";
+
+ // If the NVVM IR has some of maxntid* specified, then output
+ // the maxntid directive, and set the unspecified ones to 1.
+ // If none of maxntid* is specified, don't output maxntid directive.
+ unsigned maxntidx, maxntidy, maxntidz;
+ specified = false;
+ if (llvm::getMaxNTIDx(F, maxntidx) == false) maxntidx = 1;
+ else specified = true;
+ if (llvm::getMaxNTIDy(F, maxntidy) == false) maxntidy = 1;
+ else specified = true;
+ if (llvm::getMaxNTIDz(F, maxntidz) == false) maxntidz = 1;
+ else specified = true;
+
+ if (specified)
+ O << ".maxntid " << maxntidx << ", "
+ << maxntidy << ", " << maxntidz << "\n";
+
+ unsigned mincta;
+ if (llvm::getMinCTASm(F, mincta))
+ O << ".minnctapersm " << mincta << "\n";
+}
+
+void
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
+ raw_ostream &O) {
+ const TargetRegisterClass * RC = MRI->getRegClass(vr);
+ unsigned id = RC->getID();
+
+ std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[id];
+ unsigned mapped_vr = regmap[vr];
+
+ if (!isVec) {
+ O << getNVPTXRegClassStr(RC) << mapped_vr;
+ return;
+ }
+ // Vector virtual register
+ if (getNVPTXVectorSize(RC) == 4)
+ O << "{"
+ << getNVPTXRegClassStr(RC) << mapped_vr << "_0, "
+ << getNVPTXRegClassStr(RC) << mapped_vr << "_1, "
+ << getNVPTXRegClassStr(RC) << mapped_vr << "_2, "
+ << getNVPTXRegClassStr(RC) << mapped_vr << "_3"
+ << "}";
+ else if (getNVPTXVectorSize(RC) == 2)
+ O << "{"
+ << getNVPTXRegClassStr(RC) << mapped_vr << "_0, "
+ << getNVPTXRegClassStr(RC) << mapped_vr << "_1"
+ << "}";
+ else
+ llvm_unreachable("Unsupported vector size");
+}
+
+void
+NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, bool isVec,
+ raw_ostream &O) {
+ getVirtualRegisterName(vr, isVec, O);
+}
+
+void NVPTXAsmPrinter::printVecModifiedImmediate(const MachineOperand &MO,
+ const char *Modifier,
+ raw_ostream &O) {
+ static const char vecelem[] = {'0', '1', '2', '3', '0', '1', '2', '3'};
+ int Imm = (int)MO.getImm();
+ if(0 == strcmp(Modifier, "vecelem"))
+ O << "_" << vecelem[Imm];
+ else if(0 == strcmp(Modifier, "vecv4comm1")) {
+ if((Imm < 0) || (Imm > 3))
+ O << "//";
+ }
+ else if(0 == strcmp(Modifier, "vecv4comm2")) {
+ if((Imm < 4) || (Imm > 7))
+ O << "//";
+ }
+ else if(0 == strcmp(Modifier, "vecv4pos")) {
+ if(Imm < 0) Imm = 0;
+ O << "_" << vecelem[Imm%4];
+ }
+ else if(0 == strcmp(Modifier, "vecv2comm1")) {
+ if((Imm < 0) || (Imm > 1))
+ O << "//";
+ }
+ else if(0 == strcmp(Modifier, "vecv2comm2")) {
+ if((Imm < 2) || (Imm > 3))
+ O << "//";
+ }
+ else if(0 == strcmp(Modifier, "vecv2pos")) {
+ if(Imm < 0) Imm = 0;
+ O << "_" << vecelem[Imm%2];
+ }
+ else
+ llvm_unreachable("Unknown Modifier on immediate operand");
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (MO.getReg() == NVPTX::VRDepot)
+ O << DEPOTNAME << getFunctionNumber();
+ else
+ O << getRegisterName(MO.getReg());
+ } else {
+ if (!Modifier)
+ emitVirtualRegister(MO.getReg(), false, O);
+ else {
+ if (strcmp(Modifier, "vecfull") == 0)
+ emitVirtualRegister(MO.getReg(), true, O);
+ else
+ llvm_unreachable(
+ "Don't know how to handle the modifier on virtual register.");
+ }
+ }
+ return;
+
+ case MachineOperand::MO_Immediate:
+ if (!Modifier)
+ O << MO.getImm();
+ else if (strstr(Modifier, "vec") == Modifier)
+ printVecModifiedImmediate(MO, Modifier, O);
+ else
+ llvm_unreachable("Don't know how to handle modifier on immediate operand");
+ return;
+
+ case MachineOperand::MO_FPImmediate:
+ printFPConstant(MO.getFPImm(), O);
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ O << *Mang->getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_ExternalSymbol: {
+ const char * symbname = MO.getSymbolName();
+ if (strstr(symbname, ".PARAM") == symbname) {
+ unsigned index;
+ sscanf(symbname+6, "%u[];", &index);
+ printParamName(index, O);
+ }
+ else if (strstr(symbname, ".HLPPARAM") == symbname) {
+ unsigned index;
+ sscanf(symbname+9, "%u[];", &index);
+ O << *CurrentFnSym << "_param_" << index << "_offset";
+ }
+ else
+ O << symbname;
+ break;
+ }
+
+ case MachineOperand::MO_MachineBasicBlock:
+ O << *MO.getMBB()->getSymbol();
+ return;
+
+ default:
+ llvm_unreachable("Operand type not supported.");
+ }
+}
+
+void NVPTXAsmPrinter::
+printImplicitDef(const MachineInstr *MI, raw_ostream &O) const {
+#ifndef __OPTIMIZE__
+ O << "\t// Implicit def :";
+ //printOperand(MI, 0);
+ O << "\n";
+#endif
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier) {
+ printOperand(MI, opNum, O);
+
+ if (Modifier && !strcmp(Modifier, "add")) {
+ O << ", ";
+ printOperand(MI, opNum+1, O);
+ } else {
+ if (MI->getOperand(opNum+1).isImm() &&
+ MI->getOperand(opNum+1).getImm() == 0)
+ return; // don't print ',0' or '+0'
+ O << "+";
+ printOperand(MI, opNum+1, O);
+ }
+}
+
+void NVPTXAsmPrinter::printLdStCode(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier)
+{
+ if (Modifier) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ int Imm = (int)MO.getImm();
+ if (!strcmp(Modifier, "volatile")) {
+ if (Imm)
+ O << ".volatile";
+ } else if (!strcmp(Modifier, "addsp")) {
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::GLOBAL: O << ".global"; break;
+ case NVPTX::PTXLdStInstCode::SHARED: O << ".shared"; break;
+ case NVPTX::PTXLdStInstCode::LOCAL: O << ".local"; break;
+ case NVPTX::PTXLdStInstCode::PARAM: O << ".param"; break;
+ case NVPTX::PTXLdStInstCode::CONSTANT: O << ".const"; break;
+ case NVPTX::PTXLdStInstCode::GENERIC:
+ if (!nvptxSubtarget.hasGenericLdSt())
+ O << ".global";
+ break;
+ default:
+ assert("wrong value");
+ }
+ }
+ else if (!strcmp(Modifier, "sign")) {
+ if (Imm==NVPTX::PTXLdStInstCode::Signed)
+ O << "s";
+ else if (Imm==NVPTX::PTXLdStInstCode::Unsigned)
+ O << "u";
+ else
+ O << "f";
+ }
+ else if (!strcmp(Modifier, "vec")) {
+ if (Imm==NVPTX::PTXLdStInstCode::V2)
+ O << ".v2";
+ else if (Imm==NVPTX::PTXLdStInstCode::V4)
+ O << ".v4";
+ }
+ else
+ assert("unknown modifier");
+ }
+ else
+ assert("unknown modifier");
+}
+
+void NVPTXAsmPrinter::emitDeclaration (const Function *F, raw_ostream &O) {
+
+ emitLinkageDirective(F,O);
+ if (llvm::isKernelFunction(*F))
+ O << ".entry ";
+ else
+ O << ".func ";
+ printReturnValStr(F, O);
+ O << *CurrentFnSym << "\n";
+ emitFunctionParamList(F, O);
+ O << ";\n";
+}
+
+static bool usedInGlobalVarDef(const Constant *C)
+{
+ if (!C)
+ return false;
+
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->getName().str() == "llvm.used")
+ return false;
+ return true;
+ }
+
+ for (Value::const_use_iterator ui=C->use_begin(), ue=C->use_end();
+ ui!=ue; ++ui) {
+ const Constant *C = dyn_cast<Constant>(*ui);
+ if (usedInGlobalVarDef(C))
+ return true;
+ }
+ return false;
+}
+
+static bool usedInOneFunc(const User *U, Function const *&oneFunc)
+{
+ if (const GlobalVariable *othergv = dyn_cast<GlobalVariable>(U)) {
+ if (othergv->getName().str() == "llvm.used")
+ return true;
+ }
+
+ if (const Instruction *instr = dyn_cast<Instruction>(U)) {
+ if (instr->getParent() && instr->getParent()->getParent()) {
+ const Function *curFunc = instr->getParent()->getParent();
+ if (oneFunc && (curFunc != oneFunc))
+ return false;
+ oneFunc = curFunc;
+ return true;
+ }
+ else
+ return false;
+ }
+
+ if (const MDNode *md = dyn_cast<MDNode>(U))
+ if (md->hasName() && ((md->getName().str() == "llvm.dbg.gv") ||
+ (md->getName().str() == "llvm.dbg.sp")))
+ return true;
+
+
+ for (User::const_use_iterator ui=U->use_begin(), ue=U->use_end();
+ ui!=ue; ++ui) {
+ if (usedInOneFunc(*ui, oneFunc) == false)
+ return false;
+ }
+ return true;
+}
+
+/* Find out if a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. So the conditions to check are:
+ * 1. Is the global variable in shared address space?
+ * 2. Does it have internal linkage?
+ * 3. Is the global variable referenced only in one function?
+ */
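+// For example (illustrative CUDA): a __shared__ array with internal linkage
+// that is referenced by exactly one kernel can be declared inside that
+// kernel's body (see emitDemotedVars) instead of at module scope.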
+static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
+ if (gv->hasInternalLinkage() == false)
+ return false;
+ const PointerType *Pty = gv->getType();
+ if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
+ return false;
+
+ const Function *oneFunc = 0;
+
+ bool flag = usedInOneFunc(gv, oneFunc);
+ if (flag == false)
+ return false;
+ if (!oneFunc)
+ return false;
+ f = oneFunc;
+ return true;
+}
+
+static bool useFuncSeen(const Constant *C,
+ llvm::DenseMap<const Function *, bool> &seenMap) {
+ for (Value::const_use_iterator ui=C->use_begin(), ue=C->use_end();
+ ui!=ue; ++ui) {
+ if (const Constant *cu = dyn_cast<Constant>(*ui)) {
+ if (useFuncSeen(cu, seenMap))
+ return true;
+ } else if (const Instruction *I = dyn_cast<Instruction>(*ui)) {
+ const BasicBlock *bb = I->getParent();
+ if (!bb) continue;
+ const Function *caller = bb->getParent();
+ if (!caller) continue;
+ if (seenMap.find(caller) != seenMap.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+void NVPTXAsmPrinter::emitDeclarations (Module &M, raw_ostream &O) {
+ llvm::DenseMap<const Function *, bool> seenMap;
+ for (Module::const_iterator FI=M.begin(), FE=M.end();
+ FI!=FE; ++FI) {
+ const Function *F = FI;
+
+ if (F->isDeclaration()) {
+ if (F->use_empty())
+ continue;
+ if (F->getIntrinsicID())
+ continue;
+ CurrentFnSym = Mang->getSymbol(F);
+ emitDeclaration(F, O);
+ continue;
+ }
+ for (Value::const_use_iterator iter=F->use_begin(),
+ iterEnd=F->use_end(); iter!=iterEnd; ++iter) {
+ if (const Constant *C = dyn_cast<Constant>(*iter)) {
+ if (usedInGlobalVarDef(C)) {
+ // The use is in the initialization of a global variable
+ // that is a function pointer, so print a declaration
+ // for the original function
+ CurrentFnSym = Mang->getSymbol(F);
+ emitDeclaration(F, O);
+ break;
+ }
+ // Emit a declaration of this function if the function that
+ // uses this constant expr has already been seen.
+ if (useFuncSeen(C, seenMap)) {
+ CurrentFnSym = Mang->getSymbol(F);
+ emitDeclaration(F, O);
+ break;
+ }
+ }
+
+ if (!isa<Instruction>(*iter)) continue;
+ const Instruction *instr = cast<Instruction>(*iter);
+ const BasicBlock *bb = instr->getParent();
+ if (!bb) continue;
+ const Function *caller = bb->getParent();
+ if (!caller) continue;
+
+      // If a caller has already been seen, then the caller is
+      // appearing in the module before the callee, so print out
+      // a declaration for the callee.
+ if (seenMap.find(caller) != seenMap.end()) {
+ CurrentFnSym = Mang->getSymbol(F);
+ emitDeclaration(F, O);
+ break;
+ }
+ }
+ seenMap[F] = true;
+ }
+}
+
+void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
+ DebugInfoFinder DbgFinder;
+ DbgFinder.processModule(M);
+
+ unsigned i=1;
+ for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+ E = DbgFinder.compile_unit_end(); I != E; ++I) {
+ DICompileUnit DIUnit(*I);
+ StringRef Filename(DIUnit.getFilename());
+ StringRef Dirname(DIUnit.getDirectory());
+ SmallString<128> FullPathName = Dirname;
+ if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+ sys::path::append(FullPathName, Filename);
+ Filename = FullPathName.str();
+ }
+ if (filenameMap.find(Filename.str()) != filenameMap.end())
+ continue;
+ filenameMap[Filename.str()] = i;
+ OutStreamer.EmitDwarfFileDirective(i, "", Filename.str());
+ ++i;
+ }
+
+ for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
+ E = DbgFinder.subprogram_end(); I != E; ++I) {
+ DISubprogram SP(*I);
+ StringRef Filename(SP.getFilename());
+ StringRef Dirname(SP.getDirectory());
+ SmallString<128> FullPathName = Dirname;
+ if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+ sys::path::append(FullPathName, Filename);
+ Filename = FullPathName.str();
+ }
+ if (filenameMap.find(Filename.str()) != filenameMap.end())
+ continue;
+ filenameMap[Filename.str()] = i;
+ ++i;
+ }
+}
+
+bool NVPTXAsmPrinter::doInitialization (Module &M) {
+
+ SmallString<128> Str1;
+ raw_svector_ostream OS1(Str1);
+
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ MMI->AnalyzeModule(M);
+
+  // The parent's doInitialization() is deliberately not called; the pieces
+  // of it that we need are done explicitly below.
+  //bool Result = AsmPrinter::doInitialization(M);
+
+ // Initialize TargetLoweringObjectFile.
+ const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
+ .Initialize(OutContext, TM);
+
+ Mang = new Mangler(OutContext, *TM.getTargetData());
+
+ // Emit header before any dwarf directives are emitted below.
+ emitHeader(M, OS1);
+ OutStreamer.EmitRawText(OS1.str());
+
+
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+ recordAndEmitFilenames(M);
+
+ SmallString<128> Str2;
+ raw_svector_ostream OS2(Str2);
+
+ emitDeclarations(M, OS2);
+
+ // Print out module-level global variables here.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I, OS2);
+
+ OS2 << '\n';
+
+ OutStreamer.EmitRawText(OS2.str());
+ return false; // success
+}
+
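+// For example (an illustrative sketch; "sm_20" is an assumed target name),
+// the header for a 64-bit CUDA module would look like:
+//   .version 3.0
+//   .target sm_20
+//   .address_size 64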
+void NVPTXAsmPrinter::emitHeader (Module &M, raw_ostream &O) {
+ O << "//\n";
+ O << "// Generated by LLVM NVPTX Back-End\n";
+ O << "//\n";
+ O << "\n";
+
+ O << ".version 3.0\n";
+
+ O << ".target ";
+ O << nvptxSubtarget.getTargetName();
+
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::NVCL)
+ O << ", texmode_independent";
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+ if (!nvptxSubtarget.hasDouble())
+ O << ", map_f64_to_f32";
+ }
+
+ if (MAI->doesSupportDebugInformation())
+ O << ", debug";
+
+ O << "\n";
+
+ O << ".address_size ";
+ if (nvptxSubtarget.is64Bit())
+ O << "64";
+ else
+ O << "32";
+ O << "\n";
+
+ O << "\n";
+}
+
+bool NVPTXAsmPrinter::doFinalization(Module &M) {
+  // XXX Temporarily remove global variables so that doFinalization() will not
+  // emit them again (global variables are emitted at the beginning).
+
+ Module::GlobalListType &global_list = M.getGlobalList();
+ int i, n = global_list.size();
+ GlobalVariable **gv_array = new GlobalVariable* [n];
+
+ // first, back-up GlobalVariable in gv_array
+ i = 0;
+ for (Module::global_iterator I = global_list.begin(), E = global_list.end();
+ I != E; ++I)
+ gv_array[i++] = &*I;
+
+ // second, empty global_list
+ while (!global_list.empty())
+ global_list.remove(global_list.begin());
+
+ // call doFinalization
+ bool ret = AsmPrinter::doFinalization(M);
+
+ // now we restore global variables
+ for (i = 0; i < n; i ++)
+ global_list.insert(global_list.end(), gv_array[i]);
+
+ delete[] gv_array;
+ return ret;
+
+
+  //bool Result = AsmPrinter::doFinalization(M);
+  // Instead of calling the parent's doFinalization, we could
+  // clone the parent's doFinalization and customize it here.
+  // Currently, that would mean "#if NVISA"-ing out the EmitGlobals() call in
+  // the parent's doFinalization, which is too intrusive.
+  //
+  // Same for the doInitialization.
+  //return Result;
+}
+
+// This function emits appropriate linkage directives for
+// functions and global variables.
+//
+// extern function declaration -> .extern
+// extern function definition -> .visible
+// external global variable with init -> .visible
+// external without init -> .extern
+// appending -> not allowed, assert.
+
+void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue* V, raw_ostream &O)
+{
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+ if (V->hasExternalLinkage()) {
+      if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(V)) {
+        if (GVar->hasInitializer())
+          O << ".visible ";
+        else
+          O << ".extern ";
+      } else if (V->isDeclaration())
+ O << ".extern ";
+ else
+ O << ".visible ";
+ } else if (V->hasAppendingLinkage()) {
+ std::string msg;
+ msg.append("Error: ");
+ msg.append("Symbol ");
+ if (V->hasName())
+ msg.append(V->getName().str());
+ msg.append("has unsupported appending linkage type");
+ llvm_unreachable(msg.c_str());
+ }
+ }
+}
+
+
+void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable* GVar, raw_ostream &O,
+ bool processDemoted) {
+
+  // Skip metadata
+ if (GVar->hasSection()) {
+ if (GVar->getSection() == "llvm.metadata")
+ return;
+ }
+
+ const TargetData *TD = TM.getTargetData();
+
+ // GlobalVariables are always constant pointers themselves.
+ const PointerType *PTy = GVar->getType();
+ Type *ETy = PTy->getElementType();
+
+ if (GVar->hasExternalLinkage()) {
+ if (GVar->hasInitializer())
+ O << ".visible ";
+ else
+ O << ".extern ";
+ }
+
+ if (llvm::isTexture(*GVar)) {
+ O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n";
+ return;
+ }
+
+ if (llvm::isSurface(*GVar)) {
+ O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n";
+ return;
+ }
+
+ if (GVar->isDeclaration()) {
+ // (extern) declarations, no definition or initializer
+ // Currently the only known declaration is for an automatic __local
+ // (.shared) promoted to global.
+ emitPTXGlobalVariable(GVar, O);
+ O << ";\n";
+ return;
+ }
+
+ if (llvm::isSampler(*GVar)) {
+ O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+ Constant *Initializer = NULL;
+ if (GVar->hasInitializer())
+ Initializer = GVar->getInitializer();
+ ConstantInt *CI = NULL;
+ if (Initializer)
+ CI = dyn_cast<ConstantInt>(Initializer);
+ if (CI) {
+ unsigned sample=CI->getZExtValue();
+
+ O << " = { ";
+
+      for (int i = 0, addr = ((sample & __CLK_ADDRESS_MASK) >>
+                              __CLK_ADDRESS_BASE); i < 3; i++) {
+ O << "addr_mode_" << i << " = ";
+ switch (addr) {
+ case 0: O << "wrap"; break;
+ case 1: O << "clamp_to_border"; break;
+ case 2: O << "clamp_to_edge"; break;
+ case 3: O << "wrap"; break;
+ case 4: O << "mirror"; break;
+ }
+ O <<", ";
+ }
+ O << "filter_mode = ";
+ switch (( sample & __CLK_FILTER_MASK ) >> __CLK_FILTER_BASE ) {
+ case 0: O << "nearest"; break;
+ case 1: O << "linear"; break;
+      case 2: assert(0 && "Anisotropic filtering is not supported");
+              // With asserts disabled, fall through to the default below.
+      default: O << "nearest"; break;
+ }
+ if (!(( sample &__CLK_NORMALIZED_MASK ) >> __CLK_NORMALIZED_BASE)) {
+ O << ", force_unnormalized_coords = 1";
+ }
+ O << " }";
+ }
+
+ O << ";\n";
+ return;
+ }
+
+ if (GVar->hasPrivateLinkage()) {
+
+ if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+ return;
+
+ // FIXME - need better way (e.g. Metadata) to avoid generating this global
+ if (!strncmp(GVar->getName().data(), "filename", 8))
+ return;
+ if (GVar->use_empty())
+ return;
+ }
+
+ const Function *demotedFunc = 0;
+ if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+ O << "// " << GVar->getName().str() << " has been demoted\n";
+ if (localDecls.find(demotedFunc) != localDecls.end())
+ localDecls[demotedFunc].push_back(GVar);
+ else {
+ std::vector<GlobalVariable *> temp;
+ temp.push_back(GVar);
+ localDecls[demotedFunc] = temp;
+ }
+ return;
+ }
+
+ O << ".";
+ emitPTXAddressSpace(PTy->getAddressSpace(), O);
+ if (GVar->getAlignment() == 0)
+ O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+ else
+ O << " .align " << GVar->getAlignment();
+
+
+ if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+ O << " .";
+ O << getPTXFundamentalTypeStr(ETy, false);
+ O << " ";
+ O << *Mang->getSymbol(GVar);
+
+    // PTX allows variable initialization only for constant and global state
+    // spaces.
+ if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST))
+ && GVar->hasInitializer()) {
+ Constant *Initializer = GVar->getInitializer();
+ if (!Initializer->isNullValue()) {
+ O << " = " ;
+ printScalarConstant(Initializer, O);
+ }
+ }
+ } else {
+ unsigned int ElementSize =0;
+
+    // Although PTX has direct support for struct and array types, and LLVM IR
+    // is very similar to PTX, LLVM CodeGen does not support such high-level
+    // field accesses for any target. Structs, arrays and vectors are instead
+    // lowered into arrays of bytes.
+ switch (ETy->getTypeID()) {
+ case Type::StructTyID:
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ ElementSize = TD->getTypeStoreSize(ETy);
+      // PTX allows variable initialization only for constant and
+      // global state spaces.
+ if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST))
+ && GVar->hasInitializer()) {
+ Constant *Initializer = GVar->getInitializer();
+ if (!isa<UndefValue>(Initializer) &&
+ !Initializer->isNullValue()) {
+ AggBuffer aggBuffer(ElementSize, O, *this);
+ bufferAggregateConstant(Initializer, &aggBuffer);
+ if (aggBuffer.numSymbols) {
+ if (nvptxSubtarget.is64Bit()) {
+ O << " .u64 " << *Mang->getSymbol(GVar) <<"[" ;
+ O << ElementSize/8;
+ }
+ else {
+ O << " .u32 " << *Mang->getSymbol(GVar) <<"[" ;
+ O << ElementSize/4;
+ }
+ O << "]";
+ }
+ else {
+ O << " .b8 " << *Mang->getSymbol(GVar) <<"[" ;
+ O << ElementSize;
+ O << "]";
+ }
+ O << " = {" ;
+ aggBuffer.print();
+ O << "}";
+ }
+ else {
+ O << " .b8 " << *Mang->getSymbol(GVar) ;
+ if (ElementSize) {
+ O <<"[" ;
+ O << ElementSize;
+ O << "]";
+ }
+ }
+ }
+ else {
+ O << " .b8 " << *Mang->getSymbol(GVar);
+ if (ElementSize) {
+ O <<"[" ;
+ O << ElementSize;
+ O << "]";
+ }
+ }
+ break;
+ default:
+ assert( 0 && "type not supported yet");
+ }
+
+ }
+ O << ";\n";
+}
+
+void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
+ if (localDecls.find(f) == localDecls.end())
+ return;
+
+ std::vector<GlobalVariable *> &gvars = localDecls[f];
+
+ for (unsigned i=0, e=gvars.size(); i!=e; ++i) {
+ O << "\t// demoted variable\n\t";
+ printModuleLevelGV(gvars[i], O, true);
+ }
+}
+
+void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
+ raw_ostream &O) const {
+ switch (AddressSpace) {
+ case llvm::ADDRESS_SPACE_LOCAL:
+ O << "local" ;
+ break;
+ case llvm::ADDRESS_SPACE_GLOBAL:
+ O << "global" ;
+ break;
+ case llvm::ADDRESS_SPACE_CONST:
+    // This logic should be consistent with that in
+    // getCodeAddrSpace() (NVPTXISelDAGToDAG.cpp)
+ if (nvptxSubtarget.hasGenericLdSt())
+ O << "global" ;
+ else
+ O << "const" ;
+ break;
+ case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+ O << "const" ;
+ break;
+ case llvm::ADDRESS_SPACE_SHARED:
+ O << "shared" ;
+ break;
+ default:
+ llvm_unreachable("unexpected address space");
+ }
+}
+
+std::string NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty,
+ bool useB4PTR) const {
+ switch (Ty->getTypeID()) {
+ default:
+ llvm_unreachable("unexpected type");
+ break;
+ case Type::IntegerTyID: {
+ unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+ if (NumBits == 1)
+ return "pred";
+ else if (NumBits <= 64) {
+ std::string name = "u";
+ return name + utostr(NumBits);
+ } else {
+ llvm_unreachable("Integer too large");
+ break;
+ }
+ break;
+ }
+ case Type::FloatTyID:
+ return "f32";
+ case Type::DoubleTyID:
+ return "f64";
+  case Type::PointerTyID:
+    if (nvptxSubtarget.is64Bit())
+      return useB4PTR ? "b64" : "u64";
+    else
+      return useB4PTR ? "b32" : "u32";
+  }
+  llvm_unreachable("unexpected type");
+  return "";
+}
+
+void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable* GVar,
+ raw_ostream &O) {
+
+ const TargetData *TD = TM.getTargetData();
+
+ // GlobalVariables are always constant pointers themselves.
+ const PointerType *PTy = GVar->getType();
+ Type *ETy = PTy->getElementType();
+
+ O << ".";
+ emitPTXAddressSpace(PTy->getAddressSpace(), O);
+ if (GVar->getAlignment() == 0)
+ O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+ else
+ O << " .align " << GVar->getAlignment();
+
+ if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+ O << " .";
+ O << getPTXFundamentalTypeStr(ETy);
+ O << " ";
+ O << *Mang->getSymbol(GVar);
+ return;
+ }
+
+ int64_t ElementSize =0;
+
+  // Although PTX has direct support for struct and array types, and LLVM IR
+  // is very similar to PTX, LLVM CodeGen does not support such high-level
+  // field accesses for any target. Structs and arrays are instead lowered
+  // into arrays of bytes.
+ switch (ETy->getTypeID()) {
+ case Type::StructTyID:
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ ElementSize = TD->getTypeStoreSize(ETy);
+ O << " .b8 " << *Mang->getSymbol(GVar) <<"[" ;
+ if (ElementSize) {
+ O << itostr(ElementSize) ;
+ }
+ O << "]";
+ break;
+ default:
+ assert( 0 && "type not supported yet");
+ }
+ return ;
+}
+
+
+static unsigned int
+getOpenCLAlignment(const TargetData *TD,
+ Type *Ty) {
+ if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<PointerType>(Ty))
+ return TD->getPrefTypeAlignment(Ty);
+
+ const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+ if (ATy)
+ return getOpenCLAlignment(TD, ATy->getElementType());
+
+ const VectorType *VTy = dyn_cast<VectorType>(Ty);
+ if (VTy) {
+ Type *ETy = VTy->getElementType();
+ unsigned int numE = VTy->getNumElements();
+ unsigned int alignE = TD->getPrefTypeAlignment(ETy);
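+    // OpenCL aligns a 3-element vector like a 4-element one; for example
+    // (illustrative), a float3 is aligned to 16 bytes rather than 12.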
+ if (numE == 3)
+ return 4*alignE;
+ else
+ return numE*alignE;
+ }
+
+ const StructType *STy = dyn_cast<StructType>(Ty);
+ if (STy) {
+ unsigned int alignStruct = 1;
+ // Go through each element of the struct and find the
+ // largest alignment.
+ for (unsigned i=0, e=STy->getNumElements(); i != e; i++) {
+ Type *ETy = STy->getElementType(i);
+ unsigned int align = getOpenCLAlignment(TD, ETy);
+ if (align > alignStruct)
+ alignStruct = align;
+ }
+ return alignStruct;
+ }
+
+ const FunctionType *FTy = dyn_cast<FunctionType>(Ty);
+ if (FTy)
+ return TD->getPointerPrefAlignment();
+ return TD->getPrefTypeAlignment(Ty);
+}
+
+void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
+ int paramIndex, raw_ostream &O) {
+ if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+ (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
+ O << *CurrentFnSym << "_param_" << paramIndex;
+ else {
+ std::string argName = I->getName();
+ const char *p = argName.c_str();
+ while (*p) {
+ if (*p == '.')
+ O << "_";
+ else
+ O << *p;
+ p++;
+ }
+ }
+}
+
+void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
+ Function::const_arg_iterator I, E;
+ int i = 0;
+
+ if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+ (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) {
+ O << *CurrentFnSym << "_param_" << paramIndex;
+ return;
+ }
+
+ for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, i++) {
+ if (i==paramIndex) {
+ printParamName(I, paramIndex, O);
+ return;
+ }
+ }
+ llvm_unreachable("paramIndex out of bound");
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const Function *F,
+ raw_ostream &O) {
+ const TargetData *TD = TM.getTargetData();
+ const AttrListPtr &PAL = F->getAttributes();
+ const TargetLowering *TLI = TM.getTargetLowering();
+ Function::const_arg_iterator I, E;
+ unsigned paramIndex = 0;
+ bool first = true;
+ bool isKernelFunc = llvm::isKernelFunction(*F);
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+ MVT thePointerTy = TLI->getPointerTy();
+
+ O << "(\n";
+
+ for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
+ const Type *Ty = I->getType();
+
+ if (!first)
+ O << ",\n";
+
+ first = false;
+
+ // Handle image/sampler parameters
+ if (llvm::isSampler(*I) || llvm::isImage(*I)) {
+ if (llvm::isImage(*I)) {
+ if (llvm::isImageWriteOnly(*I))
+ O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex;
+ else // Default image is read_only
+ O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex;
+ }
+ else // Should be llvm::isSampler(*I)
+ O << "\t.param .samplerref " << *CurrentFnSym << "_param_"
+ << paramIndex;
+ continue;
+ }
+
+ if (PAL.paramHasAttr(paramIndex+1, Attribute::ByVal) == false) {
+ // Just a scalar
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ if (isKernelFunc) {
+ if (PTy) {
+ // Special handling for pointer arguments to kernel
+ O << "\t.param .u" << thePointerTy.getSizeInBits() << " ";
+
+ if (nvptxSubtarget.getDrvInterface() != NVPTX::CUDA) {
+ Type *ETy = PTy->getElementType();
+ int addrSpace = PTy->getAddressSpace();
+ switch(addrSpace) {
+ default:
+ O << ".ptr ";
+ break;
+ case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+ O << ".ptr .const ";
+ break;
+ case llvm::ADDRESS_SPACE_SHARED:
+ O << ".ptr .shared ";
+ break;
+ case llvm::ADDRESS_SPACE_GLOBAL:
+ case llvm::ADDRESS_SPACE_CONST:
+ O << ".ptr .global ";
+ break;
+ }
+ O << ".align " << (int)getOpenCLAlignment(TD, ETy) << " ";
+ }
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+
+ // non-pointer scalar to kernel func
+ O << "\t.param ."
+ << getPTXFundamentalTypeStr(Ty) << " ";
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+      // Non-kernel function, just print .param .b<size> for ABI
+      // and .reg .b<size> for non-ABI
+ unsigned sz = 0;
+ if (isa<IntegerType>(Ty)) {
+ sz = cast<IntegerType>(Ty)->getBitWidth();
+ if (sz < 32) sz = 32;
+ }
+ else if (isa<PointerType>(Ty))
+ sz = thePointerTy.getSizeInBits();
+ else
+ sz = Ty->getPrimitiveSizeInBits();
+ if (isABI)
+ O << "\t.param .b" << sz << " ";
+ else
+ O << "\t.reg .b" << sz << " ";
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+
+ // param has byVal attribute. So should be a pointer
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ assert(PTy &&
+ "Param with byval attribute should be a pointer type");
+ Type *ETy = PTy->getElementType();
+
+ if (isABI || isKernelFunc) {
+      // Just print .param .align <a> .b8 <name>[size];
+      // <a>  = PAL.getParamAlignment
+      // size = typeallocsize of element type
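+      // For example (illustrative; the parameter name is assumed), a byval
+      // struct of 16 bytes with alignment 4 prints
+      // ".param .align 4 .b8 foo_param_1[16]".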
+ unsigned align = PAL.getParamAlignment(paramIndex+1);
+ unsigned sz = TD->getTypeAllocSize(ETy);
+ O << "\t.param .align " << align
+ << " .b8 ";
+ printParamName(I, paramIndex, O);
+ O << "[" << sz << "]";
+ continue;
+ } else {
+ // Split the ETy into constituent parts and
+ // print .param .b<size> <name> for each part.
+ // Further, if a part is vector, print the above for
+ // each vector element.
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*TLI, ETy, vtparts);
+ for (unsigned i=0,e=vtparts.size(); i!=e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j=0,je=elems; j!=je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32)) sz = 32;
+ O << "\t.reg .b" << sz << " ";
+ printParamName(I, paramIndex, O);
+ if (j<je-1) O << ",\n";
+ ++paramIndex;
+ }
+ if (i<e-1)
+ O << ",\n";
+ }
+      --paramIndex; // compensate: the enclosing loop increments once more
+ continue;
+ }
+ }
+
+ O << "\n)\n";
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF,
+ raw_ostream &O) {
+ const Function *F = MF.getFunction();
+ emitFunctionParamList(F, O);
+}
+
+
+void NVPTXAsmPrinter::
+setAndEmitFunctionVirtualRegisters(const MachineFunction &MF) {
+ SmallString<128> Str;
+ raw_svector_ostream O(Str);
+
+ // Map the global virtual register number to a register class specific
+ // virtual register number starting from 1 with that class.
+ const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+ //unsigned numRegClasses = TRI->getNumRegClasses();
+
+ // Emit the Fake Stack Object
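+  // For example (illustrative), a function with 16 bytes of stack and max
+  // alignment 8 emits ".local .align 8 .b8 __local_depot0[16];" plus the
+  // %SP/%SPL register declarations below.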
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int NumBytes = (int) MFI->getStackSize();
+ if (NumBytes) {
+ O << "\t.local .align " << MFI->getMaxAlignment() << " .b8 \t"
+ << DEPOTNAME
+ << getFunctionNumber() << "[" << NumBytes << "];\n";
+ if (nvptxSubtarget.is64Bit()) {
+ O << "\t.reg .b64 \t%SP;\n";
+ O << "\t.reg .b64 \t%SPL;\n";
+ }
+ else {
+ O << "\t.reg .b32 \t%SP;\n";
+ O << "\t.reg .b32 \t%SPL;\n";
+ }
+ }
+
+  // Go through all virtual registers to establish the mapping between the
+  // global virtual register number and the per-class virtual register number.
+  // We use the per-class virtual register number in the ptx output.
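+  // For example (illustrative), the third f32 virtual register created in a
+  // function maps to per-class number 3 and is printed as "%f3".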
+ unsigned int numVRs = MRI->getNumVirtRegs();
+ for (unsigned i=0; i< numVRs; i++) {
+ unsigned int vr = TRI->index2VirtReg(i);
+ const TargetRegisterClass *RC = MRI->getRegClass(vr);
+ std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[RC->getID()];
+ int n = regmap.size();
+ regmap.insert(std::make_pair(vr, n+1));
+ }
+
+ // Emit register declarations
+ // @TODO: Extract out the real register usage
+ O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+ O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+ O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+ O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+ O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
+ O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+ O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
+
+ // Emit declaration of the virtual registers or 'physical' registers for
+ // each register class
+ //for (unsigned i=0; i< numRegClasses; i++) {
+ // std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[i];
+ // const TargetRegisterClass *RC = TRI->getRegClass(i);
+ // std::string rcname = getNVPTXRegClassName(RC);
+ // std::string rcStr = getNVPTXRegClassStr(RC);
+ // //int n = regmap.size();
+ // if (!isNVPTXVectorRegClass(RC)) {
+ // O << "\t.reg " << rcname << " \t" << rcStr << "<"
+ // << NVPTXNumRegisters << ">;\n";
+ // }
+
+  // Only declare those registers that may be used. And do not emit vector
+  // registers as they are all elementized to scalar registers.
+ //if (n && !isNVPTXVectorRegClass(RC)) {
+ // if (RegAllocNilUsed) {
+ // O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+ // << ">;\n";
+ // }
+ // else {
+ // O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr)
+ // << "<" << 32 << ">;\n";
+ // }
+ //}
+ //}
+
+ OutStreamer.EmitRawText(O.str());
+}
+
+
+void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
+ APFloat APF = APFloat(Fp->getValueAPF()); // make a copy
+ bool ignored;
+ unsigned int numHex;
+ const char *lead;
+
+ if (Fp->getType()->getTypeID()==Type::FloatTyID) {
+ numHex = 8;
+ lead = "0f";
+ APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven,
+ &ignored);
+ } else if (Fp->getType()->getTypeID() == Type::DoubleTyID) {
+ numHex = 16;
+ lead = "0d";
+ APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ &ignored);
+ } else
+ llvm_unreachable("unsupported fp type");
+
+ APInt API = APF.bitcastToAPInt();
+ std::string hexstr(utohexstr(API.getZExtValue()));
+ O << lead;
+ if (hexstr.length() < numHex)
+ O << std::string(numHex - hexstr.length(), '0');
+  O << hexstr;
+}
+
+void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+ O << CI->getValue();
+ return;
+ }
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
+ printFPConstant(CFP, O);
+ return;
+ }
+ if (isa<ConstantPointerNull>(CPV)) {
+ O << "0";
+ return;
+ }
+ if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ O << *Mang->getSymbol(GVar);
+ return;
+ }
+ if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ Value *v = Cexpr->stripPointerCasts();
+ if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ O << *Mang->getSymbol(GVar);
+ return;
+ } else {
+ O << *LowerConstant(CPV, *this);
+ return;
+ }
+ }
+ llvm_unreachable("Not scalar type found in printScalarConstant()");
+}
+
+
+void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
+ AggBuffer *aggBuffer) {
+
+ const TargetData *TD = TM.getTargetData();
+
+ if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
+ int s = TD->getTypeAllocSize(CPV->getType());
+ if (s<Bytes)
+ s = Bytes;
+ aggBuffer->addZeros(s);
+ return;
+ }
+
+ unsigned char *ptr;
+ switch (CPV->getType()->getTypeID()) {
+
+ case Type::IntegerTyID: {
+ const Type *ETy = CPV->getType();
+    if ( ETy == Type::getInt8Ty(CPV->getContext()) ){
+      unsigned char c =
+          (unsigned char)(cast<ConstantInt>(CPV))->getZExtValue();
+      ptr = &c;
+      aggBuffer->addBytes(ptr, 1, Bytes);
+    } else if ( ETy == Type::getInt16Ty(CPV->getContext()) ) {
+      short int16 =
+          (short)(cast<ConstantInt>(CPV))->getZExtValue();
+      ptr = (unsigned char*)&int16;
+      aggBuffer->addBytes(ptr, 2, Bytes);
+ } else if ( ETy == Type::getInt32Ty(CPV->getContext()) ) {
+ if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ int int32 =(int)(constInt->getZExtValue());
+ ptr = (unsigned char*)&int32;
+ aggBuffer->addBytes(ptr, 4, Bytes);
+ break;
+ } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (ConstantInt *constInt =
+ dyn_cast<ConstantInt>(ConstantFoldConstantExpression(
+ Cexpr, TD))) {
+ int int32 =(int)(constInt->getZExtValue());
+ ptr = (unsigned char*)&int32;
+ aggBuffer->addBytes(ptr, 4, Bytes);
+ break;
+ }
+ if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+ Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+ aggBuffer->addSymbol(v);
+ aggBuffer->addZeros(4);
+ break;
+ }
+ }
+ llvm_unreachable("unsupported integer const type");
+ } else if (ETy == Type::getInt64Ty(CPV->getContext()) ) {
+ if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ long long int64 =(long long)(constInt->getZExtValue());
+ ptr = (unsigned char*)&int64;
+ aggBuffer->addBytes(ptr, 8, Bytes);
+ break;
+ } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (ConstantInt *constInt = dyn_cast<ConstantInt>(
+ ConstantFoldConstantExpression(Cexpr, TD))) {
+ long long int64 =(long long)(constInt->getZExtValue());
+ ptr = (unsigned char*)&int64;
+ aggBuffer->addBytes(ptr, 8, Bytes);
+ break;
+ }
+ if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+ Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+ aggBuffer->addSymbol(v);
+ aggBuffer->addZeros(8);
+ break;
+ }
+ }
+ llvm_unreachable("unsupported integer const type");
+ } else
+ llvm_unreachable("unsupported integer const type");
+ break;
+ }
+ case Type::FloatTyID:
+ case Type::DoubleTyID: {
+    ConstantFP *CFP = cast<ConstantFP>(CPV);
+ const Type* Ty = CFP->getType();
+ if (Ty == Type::getFloatTy(CPV->getContext())) {
+ float float32 = (float)CFP->getValueAPF().convertToFloat();
+ ptr = (unsigned char*)&float32;
+ aggBuffer->addBytes(ptr, 4, Bytes);
+ } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+ double float64 = CFP->getValueAPF().convertToDouble();
+ ptr = (unsigned char*)&float64;
+ aggBuffer->addBytes(ptr, 8, Bytes);
+ }
+ else {
+ llvm_unreachable("unsupported fp const type");
+ }
+ break;
+ }
+ case Type::PointerTyID: {
+ if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ aggBuffer->addSymbol(GVar);
+ }
+ else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ Value *v = Cexpr->stripPointerCasts();
+ aggBuffer->addSymbol(v);
+ }
+ unsigned int s = TD->getTypeAllocSize(CPV->getType());
+ aggBuffer->addZeros(s);
+ break;
+ }
+
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ case Type::StructTyID: {
+ if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
+ isa<ConstantStruct>(CPV)) {
+ int ElementSize = TD->getTypeAllocSize(CPV->getType());
+ bufferAggregateConstant(CPV, aggBuffer);
+ if ( Bytes > ElementSize )
+ aggBuffer->addZeros(Bytes-ElementSize);
+ }
+ else if (isa<ConstantAggregateZero>(CPV))
+ aggBuffer->addZeros(Bytes);
+ else
+ llvm_unreachable("Unexpected Constant type");
+ break;
+ }
+
+ default:
+ llvm_unreachable("unsupported type");
+ }
+}
+
+void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV,
+ AggBuffer *aggBuffer) {
+ const TargetData *TD = TM.getTargetData();
+ int Bytes;
+
+ // Old constants
+ if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+ if (CPV->getNumOperands())
+ for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i)
+ bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer);
+ return;
+ }
+
+ if (const ConstantDataSequential *CDS =
+ dyn_cast<ConstantDataSequential>(CPV)) {
+ if (CDS->getNumElements())
+ for (unsigned i = 0; i < CDS->getNumElements(); ++i)
+ bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0,
+ aggBuffer);
+ return;
+ }
+
+
+ if (isa<ConstantStruct>(CPV)) {
+ if (CPV->getNumOperands()) {
+ StructType *ST = cast<StructType>(CPV->getType());
+ for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
+ if ( i == (e - 1))
+ Bytes = TD->getStructLayout(ST)->getElementOffset(0) +
+ TD->getTypeAllocSize(ST)
+ - TD->getStructLayout(ST)->getElementOffset(i);
+ else
+ Bytes = TD->getStructLayout(ST)->getElementOffset(i+1) -
+ TD->getStructLayout(ST)->getElementOffset(i);
+ bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes,
+ aggBuffer);
+ }
+ }
+ return;
+ }
+ llvm_unreachable("unsupported constant type in printAggregateConstant()");
+}
+
+// buildTypeNameMap - Run through symbol table looking for type names.
+//
+
+
+bool NVPTXAsmPrinter::isImageType(const Type *Ty) {
+
+ std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty);
+
+  if (PI != TypeNameMap.end() &&
+      (PI->second == "struct._image1d_t" ||
+       PI->second == "struct._image2d_t" ||
+       PI->second == "struct._image3d_t"))
+    return true;
+
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'r':
+ break;
+ }
+ }
+
+ printOperand(MI, OpNo, O);
+
+ return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier
+
+ O << '[';
+ printMemOperand(MI, OpNo, O);
+ O << ']';
+
+ return false;
+}
+
+bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI)
+{
+ switch(MI.getOpcode()) {
+ default:
+ return false;
+ case NVPTX::CallArgBeginInst: case NVPTX::CallArgEndInst0:
+ case NVPTX::CallArgEndInst1: case NVPTX::CallArgF32:
+ case NVPTX::CallArgF64: case NVPTX::CallArgI16:
+ case NVPTX::CallArgI32: case NVPTX::CallArgI32imm:
+ case NVPTX::CallArgI64: case NVPTX::CallArgI8:
+ case NVPTX::CallArgParam: case NVPTX::CallVoidInst:
+ case NVPTX::CallVoidInstReg: case NVPTX::Callseq_End:
+ case NVPTX::CallVoidInstReg64:
+ case NVPTX::DeclareParamInst: case NVPTX::DeclareRetMemInst:
+ case NVPTX::DeclareRetRegInst: case NVPTX::DeclareRetScalarInst:
+ case NVPTX::DeclareScalarParamInst: case NVPTX::DeclareScalarRegInst:
+ case NVPTX::StoreParamF32: case NVPTX::StoreParamF64:
+ case NVPTX::StoreParamI16: case NVPTX::StoreParamI32:
+ case NVPTX::StoreParamI64: case NVPTX::StoreParamI8:
+ case NVPTX::StoreParamS32I8: case NVPTX::StoreParamU32I8:
+ case NVPTX::StoreParamS32I16: case NVPTX::StoreParamU32I16:
+ case NVPTX::StoreParamScalar2F32: case NVPTX::StoreParamScalar2F64:
+ case NVPTX::StoreParamScalar2I16: case NVPTX::StoreParamScalar2I32:
+ case NVPTX::StoreParamScalar2I64: case NVPTX::StoreParamScalar2I8:
+ case NVPTX::StoreParamScalar4F32: case NVPTX::StoreParamScalar4I16:
+ case NVPTX::StoreParamScalar4I32: case NVPTX::StoreParamScalar4I8:
+ case NVPTX::StoreParamV2F32: case NVPTX::StoreParamV2F64:
+ case NVPTX::StoreParamV2I16: case NVPTX::StoreParamV2I32:
+ case NVPTX::StoreParamV2I64: case NVPTX::StoreParamV2I8:
+ case NVPTX::StoreParamV4F32: case NVPTX::StoreParamV4I16:
+ case NVPTX::StoreParamV4I32: case NVPTX::StoreParamV4I8:
+ case NVPTX::StoreRetvalF32: case NVPTX::StoreRetvalF64:
+ case NVPTX::StoreRetvalI16: case NVPTX::StoreRetvalI32:
+ case NVPTX::StoreRetvalI64: case NVPTX::StoreRetvalI8:
+ case NVPTX::StoreRetvalScalar2F32: case NVPTX::StoreRetvalScalar2F64:
+ case NVPTX::StoreRetvalScalar2I16: case NVPTX::StoreRetvalScalar2I32:
+ case NVPTX::StoreRetvalScalar2I64: case NVPTX::StoreRetvalScalar2I8:
+ case NVPTX::StoreRetvalScalar4F32: case NVPTX::StoreRetvalScalar4I16:
+ case NVPTX::StoreRetvalScalar4I32: case NVPTX::StoreRetvalScalar4I8:
+ case NVPTX::StoreRetvalV2F32: case NVPTX::StoreRetvalV2F64:
+ case NVPTX::StoreRetvalV2I16: case NVPTX::StoreRetvalV2I32:
+ case NVPTX::StoreRetvalV2I64: case NVPTX::StoreRetvalV2I8:
+ case NVPTX::StoreRetvalV4F32: case NVPTX::StoreRetvalV4I16:
+ case NVPTX::StoreRetvalV4I32: case NVPTX::StoreRetvalV4I8:
+ case NVPTX::LastCallArgF32: case NVPTX::LastCallArgF64:
+ case NVPTX::LastCallArgI16: case NVPTX::LastCallArgI32:
+ case NVPTX::LastCallArgI32imm: case NVPTX::LastCallArgI64:
+ case NVPTX::LastCallArgI8: case NVPTX::LastCallArgParam:
+ case NVPTX::LoadParamMemF32: case NVPTX::LoadParamMemF64:
+ case NVPTX::LoadParamMemI16: case NVPTX::LoadParamMemI32:
+ case NVPTX::LoadParamMemI64: case NVPTX::LoadParamMemI8:
+ case NVPTX::LoadParamRegF32: case NVPTX::LoadParamRegF64:
+ case NVPTX::LoadParamRegI16: case NVPTX::LoadParamRegI32:
+ case NVPTX::LoadParamRegI64: case NVPTX::LoadParamRegI8:
+ case NVPTX::LoadParamScalar2F32: case NVPTX::LoadParamScalar2F64:
+ case NVPTX::LoadParamScalar2I16: case NVPTX::LoadParamScalar2I32:
+ case NVPTX::LoadParamScalar2I64: case NVPTX::LoadParamScalar2I8:
+ case NVPTX::LoadParamScalar4F32: case NVPTX::LoadParamScalar4I16:
+ case NVPTX::LoadParamScalar4I32: case NVPTX::LoadParamScalar4I8:
+ case NVPTX::LoadParamV2F32: case NVPTX::LoadParamV2F64:
+ case NVPTX::LoadParamV2I16: case NVPTX::LoadParamV2I32:
+ case NVPTX::LoadParamV2I64: case NVPTX::LoadParamV2I8:
+ case NVPTX::LoadParamV4F32: case NVPTX::LoadParamV4I16:
+ case NVPTX::LoadParamV4I32: case NVPTX::LoadParamV4I8:
+ case NVPTX::PrototypeInst: case NVPTX::DBG_VALUE:
+ return true;
+ }
+ return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() {
+ RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+ RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
+
+
+void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
+ std::stringstream temp;
+ LineReader * reader = this->getReader(filename.str());
+ temp << "\n//";
+ temp << filename.str();
+ temp << ":";
+ temp << line;
+ temp << " ";
+ temp << reader->readLine(line);
+ temp << "\n";
+ this->OutStreamer.EmitRawText(Twine(temp.str()));
+}
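+// A sketch of the annotation emitted above, assuming a hypothetical input
+// foo.cu whose line 42 reads "a[i] = b[i];":
+//
+//   //foo.cu:42 a[i] = b[i];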
+
+
+LineReader *NVPTXAsmPrinter::getReader(std::string filename) {
+ if (reader == NULL) {
+ reader = new LineReader(filename);
+ }
+
+ if (reader->fileName() != filename) {
+ delete reader;
+ reader = new LineReader(filename);
+ }
+
+ return reader;
+}
+
+
+std::string
+LineReader::readLine(unsigned lineNum) {
+  if (lineNum < theCurLine) {
+    theCurLine = 0;
+    fstr.seekg(0, std::ios::beg);
+  }
+  while (theCurLine < lineNum) {
+    fstr.getline(buff, 500);
+    theCurLine++;
+  }
+  return buff;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXAsmPrinter() {
+ RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+ RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
new file mode 100644
index 0000000..6488b14
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -0,0 +1,315 @@
+//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXASMPRINTER_H
+#define NVPTXASMPRINTER_H
+
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include <fstream>
+
+// The PTX syntax and format is very different from that usually seen in a
+// .s file, therefore we are not able to use the MCAsmStreamer interface
+// here.
+//
+// We are handcrafting the output method here.
+//
+// A better approach is to clone the MCAsmStreamer into an MCPTXAsmStreamer
+// (subclass of MCStreamer).
+
+// This is defined in AsmPrinter.cpp.
+// Used to process the constant expressions in initializers.
+namespace nvptx {
+const llvm::MCExpr *LowerConstant(const llvm::Constant *CV,
+                                  llvm::AsmPrinter &AP);
+}
+
+namespace llvm {
+
+class LineReader {
+private:
+  unsigned theCurLine;
+ std::ifstream fstr;
+ char buff[512];
+ std::string theFileName;
+ SmallVector<unsigned, 32> lineOffset;
+public:
+ LineReader(std::string filename) {
+ theCurLine = 0;
+ fstr.open(filename.c_str());
+ theFileName = filename;
+ }
+ std::string fileName() { return theFileName; }
+ ~LineReader() {
+ fstr.close();
+ }
+ std::string readLine(unsigned line);
+};
+
+
+
+class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
+
+
+ class AggBuffer {
+ // Used to buffer the emitted string for initializing global
+ // aggregates.
+ //
+ // Normally an aggregate (array, vector or structure) is emitted
+ // as a u8[]. However, if one element/field of the aggregate
+ // is a non-NULL address, then the aggregate is emitted as u32[]
+ // or u64[].
+ //
+  // We first lay out the aggregate in 'buffer' in bytes, except for
+  // those symbol addresses. For the i-th symbol address in the
+  // aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
+  // are filled with 0s. symbolPosInBuffer[i-1] records its position
+  // in 'buffer', and Symbols[i-1] records the Value*.
+ //
+ // Once we have this AggBuffer setup, we can choose how to print
+ // it out.
+ public:
+ unsigned size; // size of the buffer in bytes
+ unsigned char *buffer; // the buffer
+ unsigned numSymbols; // number of symbol addresses
+ SmallVector<unsigned, 4> symbolPosInBuffer;
+ SmallVector<Value *, 4> Symbols;
+
+ private:
+ unsigned curpos;
+ raw_ostream &O;
+ NVPTXAsmPrinter &AP;
+
+ public:
+    AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP)
+        : O(_O), AP(_AP) {
+ buffer = new unsigned char[_size];
+ size = _size;
+ curpos = 0;
+ numSymbols = 0;
+ }
+ ~AggBuffer() {
+ delete [] buffer;
+ }
+    unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
+      assert((curpos + Num) <= size);
+      assert((curpos + Bytes) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = Ptr[i];
+        curpos++;
+      }
+      for (int i = Num; i < Bytes; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+    unsigned addZeros(int Num) {
+      assert((curpos + Num) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+ void addSymbol(Value *GVar) {
+ symbolPosInBuffer.push_back(curpos);
+ Symbols.push_back(GVar);
+ numSymbols++;
+ }
+ void print() {
+ if (numSymbols == 0) {
+ // print out in bytes
+ for (unsigned i=0; i<size; i++) {
+ if (i)
+ O << ", ";
+ O << (unsigned int)buffer[i];
+ }
+ } else {
+ // print out in 4-bytes or 8-bytes
+ unsigned int pos = 0;
+ unsigned int nSym = 0;
+ unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+ unsigned int nBytes = 4;
+ if (AP.nvptxSubtarget.is64Bit())
+ nBytes = 8;
+ for (pos=0; pos<size; pos+=nBytes) {
+ if (pos)
+ O << ", ";
+ if (pos == nextSymbolPos) {
+ Value *v = Symbols[nSym];
+ if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ MCSymbol *Name = AP.Mang->getSymbol(GVar);
+ O << *Name;
+ }
+ else if (ConstantExpr *Cexpr =
+ dyn_cast<ConstantExpr>(v)) {
+ O << *nvptx::LowerConstant(Cexpr, AP);
+ } else
+ llvm_unreachable("symbol type unknown");
+ nSym++;
+ if (nSym >= numSymbols)
+ nextSymbolPos = size+1;
+ else
+ nextSymbolPos = symbolPosInBuffer[nSym];
+ } else
+ if (nBytes == 4)
+ O << *(unsigned int*)(buffer+pos);
+ else
+ O << *(unsigned long long*)(buffer+pos);
+ }
+ }
+ }
+ };
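+  // A minimal usage sketch (hypothetical 64-bit layout; the real driver is
+  // bufferAggregateConstant() below). For a global of type {i32, i8*}, the
+  // i32 is padded to the 8-byte element size so the pointer lands on an
+  // element boundary:
+  //
+  //   AggBuffer aggBuffer(16, O, *this);
+  //   unsigned char bytes[4] = {1, 0, 0, 0};
+  //   aggBuffer.addBytes(bytes, 4, 8); // the i32 field, zero-padded
+  //   aggBuffer.addSymbol(GVar);       // record the Value* of the pointer
+  //   aggBuffer.addZeros(8);           // zero-fill the pointer's slot
+  //   aggBuffer.print();               // prints the first word, then the
+  //                                    // symbol name in place of its slot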
+
+ friend class AggBuffer;
+
+ virtual void emitSrcInText(StringRef filename, unsigned line);
+
+private:
+ virtual const char *getPassName() const {
+ return "NVPTX Assembly Printer";
+ }
+
+ const Function *F;
+ std::string CurrentFnName;
+
+ void EmitFunctionEntryLabel();
+ void EmitFunctionBodyStart();
+ void EmitFunctionBodyEnd();
+
+ void EmitInstruction(const MachineInstr *);
+
+ void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {}
+
+ void printGlobalVariable(const GlobalVariable *GVar);
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier=0);
+ void printLdStCode(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier=0);
+ void printVecModifiedImmediate(const MachineOperand &MO,
+ const char *Modifier, raw_ostream &O);
+ void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier=0);
+ void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
+  // Definition autogenerated by tablegen.
+ void printInstruction(const MachineInstr *MI, raw_ostream &O);
+ void printModuleLevelGV(GlobalVariable* GVar, raw_ostream &O,
+ bool=false);
+ void printParamName(int paramIndex, raw_ostream &O);
+ void printParamName(Function::const_arg_iterator I, int paramIndex,
+ raw_ostream &O);
+ void emitHeader(Module &M, raw_ostream &O);
+ void emitKernelFunctionDirectives(const Function& F,
+ raw_ostream &O) const;
+ void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
+ void emitFunctionExternParamList(const MachineFunction &MF);
+ void emitFunctionParamList(const Function *, raw_ostream &O);
+ void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
+ void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
+ void emitFunctionTempData(const MachineFunction &MF,
+ unsigned &FrameSize);
+ bool isImageType(const Type *Ty);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &);
+ void printReturnValStr(const Function *, raw_ostream &O);
+ void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
+
+protected:
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+private:
+ std::string CurrentBankselLabelInBasicBlock;
+
+  // This is specific to each MachineFunction.
+  const MachineRegisterInfo *MRI;
+  // The contents are specific to each MachineFunction,
+  // but the size of the array is not.
+ std::map<unsigned, unsigned> *VRidGlobal2LocalMap;
+ // cache the subtarget here.
+ const NVPTXSubtarget &nvptxSubtarget;
+ // Build the map between type name and ID based on module's type
+ // symbol table.
+ std::map<const Type *, std::string> TypeNameMap;
+
+ // List of variables demoted to a function scope.
+ std::map<const Function *, std::vector<GlobalVariable *> > localDecls;
+
+ // To record filename to ID mapping
+ std::map<std::string, unsigned> filenameMap;
+ void recordAndEmitFilenames(Module &);
+
+ void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+ void emitPTXAddressSpace(unsigned int AddressSpace,
+ raw_ostream &O) const;
+  std::string getPTXFundamentalTypeStr(const Type *Ty, bool=true) const;
+  void printScalarConstant(Constant *CPV, raw_ostream &O);
+  void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+  void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+  void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer);
+
+ void printOperandProper(const MachineOperand &MO);
+
+ void emitLinkageDirective(const GlobalValue* V, raw_ostream &O);
+ void emitDeclarations(Module &, raw_ostream &O);
+ void emitDeclaration(const Function *, raw_ostream &O);
+
+ static const char *getRegisterName(unsigned RegNo);
+ void emitDemotedVars(const Function *, raw_ostream &);
+
+ LineReader *reader;
+ LineReader *getReader(std::string);
+public:
+ NVPTXAsmPrinter(TargetMachine &TM,
+ MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer),
+ nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+ CurrentBankselLabelInBasicBlock = "";
+ VRidGlobal2LocalMap = NULL;
+ reader = NULL;
+ }
+
+  ~NVPTXAsmPrinter() {
+    // Delete the cached LineReader, if any.
+    if (reader)
+      delete reader;
+  }
+
+ bool ignoreLoc(const MachineInstr &);
+
+ virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);
+
+ DebugLoc prevDebugLoc;
+ void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 0000000..a9abc00
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,76 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const {
+ return true;
+}
+
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
+ if (MF.getFrameInfo()->hasStackObjects()) {
+ MachineBasicBlock &MBB = MF.front();
+ // Insert "mov.u32 %SP, %Depot"
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+    // This instruction really occurs before the first instruction
+    // in the BB, so we give it no debug location.
+ DebugLoc dl = DebugLoc();
+
+ if (tm.getSubtargetImpl()->hasGenericLdSt()) {
+ // mov %SPL, %depot;
+ // cvta.local %SP, %SPL;
+ if (is64bit) {
+ MachineInstr *MI = BuildMI(MBB, MBBI, dl,
+ tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+ NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
+ BuildMI(MBB, MI, dl,
+ tm.getInstrInfo()->get(NVPTX::IMOV64rr), NVPTX::VRFrameLocal)
+ .addReg(NVPTX::VRDepot);
+ } else {
+ MachineInstr *MI = BuildMI(MBB, MBBI, dl,
+ tm.getInstrInfo()->get(NVPTX::cvta_local_yes),
+ NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
+ BuildMI(MBB, MI, dl,
+ tm.getInstrInfo()->get(NVPTX::IMOV32rr), NVPTX::VRFrameLocal)
+ .addReg(NVPTX::VRDepot);
+ }
+    } else {
+ // mov %SP, %depot;
+ if (is64bit)
+ BuildMI(MBB, MBBI, dl,
+ tm.getInstrInfo()->get(NVPTX::IMOV64rr), NVPTX::VRFrame)
+ .addReg(NVPTX::VRDepot);
+ else
+ BuildMI(MBB, MBBI, dl,
+ tm.getInstrInfo()->get(NVPTX::IMOV32rr), NVPTX::VRFrame)
+ .addReg(NVPTX::VRDepot);
+ }
+ }
+}
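+// For reference, a sketch of the PTX this prologue produces on a 64-bit
+// target (exact register syntax aside): with generic ld/st,
+//
+//   mov.u64 %SPL, %depot;
+//   cvta.local.u64 %SP, %SPL;
+//
+// and without generic ld/st, simply  mov.u64 %SP, %depot;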
+
+void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
new file mode 100644
index 0000000..ee87b39
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -0,0 +1,40 @@
+//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_FRAMELOWERING_H
+#define NVPTX_FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+
+namespace llvm {
+class NVPTXTargetMachine;
+
+class NVPTXFrameLowering : public TargetFrameLowering {
+ NVPTXTargetMachine &tm;
+ bool is64bit;
+
+public:
+ explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
+ tm(_tm), is64bit(_is64bit) {}
+
+ virtual bool hasFP(const MachineFunction &MF) const;
+ virtual void emitPrologue(MachineFunction &MF) const;
+ virtual void emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
new file mode 100644
index 0000000..4e92f0e
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -0,0 +1,683 @@
+//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Instructions.h"
+#include "llvm/Support/raw_ostream.h"
+#include "NVPTXISelDAGToDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/GlobalValue.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-isel"
+
+using namespace llvm;
+
+
+static cl::opt<bool>
+UseFMADInstruction("nvptx-mad-enable",
+ cl::ZeroOrMore,
+ cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
+ cl::init(false));
+
+static cl::opt<int>
+FMAContractLevel("nvptx-fma-level",
+ cl::ZeroOrMore,
+ cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+ " 1: do it 2: do it aggressively"),
+ cl::init(2));
+
+
+static cl::opt<int>
+UsePrecDivF32("nvptx-prec-divf32",
+ cl::ZeroOrMore,
+ cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+ " IEEE Compliant F32 div.rnd if avaiable."),
+ cl::init(2));
+
+/// createNVPTXISelDag - This pass converts a legalized DAG into a
+/// NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel) {
+ return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+: SelectionDAGISel(tm, OptLevel),
+ Subtarget(tm.getSubtarget<NVPTXSubtarget>())
+{
+ // Always do fma.f32 fpcontract if the target supports the instruction.
+ // Always do fma.f64 fpcontract if the target supports the instruction.
+  // Do mad.f32 if nvptx-mad-enable is specified and the target does not
+ // support fma.f32.
+
+ doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
+ doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() &&
+ (FMAContractLevel>=1);
+ doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() &&
+ (FMAContractLevel>=1);
+ doFMAF32AGG = (OptLevel > 0) && Subtarget.hasFMAF32() &&
+ (FMAContractLevel==2);
+ doFMAF64AGG = (OptLevel > 0) && Subtarget.hasFMAF64() &&
+ (FMAContractLevel==2);
+
+ allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
+
+ UseF32FTZ = false;
+
+ doMulWide = (OptLevel > 0);
+
+ // Decide how to translate f32 div
+ do_DIVF32_PREC = UsePrecDivF32;
+  // sm earlier than sm_20 does not support div.rnd; use div.full.
+ if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
+ do_DIVF32_PREC = 1;
+
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+SDNode* NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+ if (N->isMachineOpcode())
+ return NULL; // Already selected.
+
+ SDNode *ResNode = NULL;
+ switch (N->getOpcode()) {
+ case ISD::LOAD:
+ ResNode = SelectLoad(N);
+ break;
+ case ISD::STORE:
+ ResNode = SelectStore(N);
+ break;
+ }
+ if (ResNode)
+ return ResNode;
+ return SelectCode(N);
+}
+
+
+static unsigned int
+getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget)
+{
+ const Value *Src = N->getSrcValue();
+ if (!Src)
+ return NVPTX::PTXLdStInstCode::LOCAL;
+
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
+ switch (PT->getAddressSpace()) {
+ case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
+ case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
+ case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
+ case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+ return NVPTX::PTXLdStInstCode::CONSTANT;
+ case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
+ case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
+ case llvm::ADDRESS_SPACE_CONST:
+ // If the arch supports generic address space, translate it to GLOBAL
+ // for correctness.
+ // If the arch does not support generic address space, then the arch
+      // does not really support ADDRESS_SPACE_CONST, translate it
+      // to CONSTANT for better performance.
+ if (Subtarget.hasGenericLdSt())
+ return NVPTX::PTXLdStInstCode::GLOBAL;
+ else
+ return NVPTX::PTXLdStInstCode::CONSTANT;
+ default: break;
+ }
+ }
+ return NVPTX::PTXLdStInstCode::LOCAL;
+}
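+// For example, a load through an LLVM pointer in llvm::ADDRESS_SPACE_SHARED
+// selects the SHARED code above, which the printer later renders as the
+// ".shared" qualifier on the ld instruction.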
+
+
+SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT LoadedVT = LD->getMemoryVT();
+  SDNode *NVPTXLD = NULL;
+
+ // do not support pre/post inc/dec
+ if (LD->isIndexed())
+ return NULL;
+
+ if (!LoadedVT.isSimple())
+ return NULL;
+
+ // Address Space Setting
+ unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget);
+
+ // Volatile Setting
+  // - .volatile is only available for .global and .shared
+ bool isVolatile = LD->isVolatile();
+ if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
+
+ // Vector Setting
+ MVT SimpleVT = LoadedVT.getSimpleVT();
+ unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ if (SimpleVT.isVector()) {
+ unsigned num = SimpleVT.getVectorNumElements();
+ if (num == 2)
+ vecType = NVPTX::PTXLdStInstCode::V2;
+ else if (num == 4)
+ vecType = NVPTX::PTXLdStInstCode::V4;
+ else
+ return NULL;
+ }
+
+ // Type Setting: fromType + fromTypeWidth
+ //
+  // Sign     : ISD::SEXTLOAD
+  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //            type is integer
+  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+ MVT ScalarVT = SimpleVT.getScalarType();
+ unsigned fromTypeWidth = ScalarVT.getSizeInBits();
+ unsigned int fromType;
+ if ((LD->getExtensionType() == ISD::SEXTLOAD))
+ fromType = NVPTX::PTXLdStInstCode::Signed;
+ else if (ScalarVT.isFloatingPoint())
+ fromType = NVPTX::PTXLdStInstCode::Float;
+ else
+ fromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ // Create the machine instruction DAG
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Addr;
+ SDValue Offset, Base;
+ unsigned Opcode;
+ MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy;
+
+ if (SelectDirectAddr(N1, Addr)) {
+ switch (TargetVT) {
+ case MVT::i8: Opcode = NVPTX::LD_i8_avar; break;
+ case MVT::i16: Opcode = NVPTX::LD_i16_avar; break;
+ case MVT::i32: Opcode = NVPTX::LD_i32_avar; break;
+ case MVT::i64: Opcode = NVPTX::LD_i64_avar; break;
+ case MVT::f32: Opcode = NVPTX::LD_f32_avar; break;
+ case MVT::f64: Opcode = NVPTX::LD_f64_avar; break;
+ case MVT::v2i8: Opcode = NVPTX::LD_v2i8_avar; break;
+ case MVT::v2i16: Opcode = NVPTX::LD_v2i16_avar; break;
+ case MVT::v2i32: Opcode = NVPTX::LD_v2i32_avar; break;
+ case MVT::v2i64: Opcode = NVPTX::LD_v2i64_avar; break;
+ case MVT::v2f32: Opcode = NVPTX::LD_v2f32_avar; break;
+ case MVT::v2f64: Opcode = NVPTX::LD_v2f64_avar; break;
+ case MVT::v4i8: Opcode = NVPTX::LD_v4i8_avar; break;
+ case MVT::v4i16: Opcode = NVPTX::LD_v4i16_avar; break;
+ case MVT::v4i32: Opcode = NVPTX::LD_v4i32_avar; break;
+ case MVT::v4f32: Opcode = NVPTX::LD_v4f32_avar; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(fromType),
+ getI32Imm(fromTypeWidth),
+ Addr, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
+ MVT::Other, Ops, 7);
+ } else if (Subtarget.is64Bit()?
+ SelectADDRsi64(N1.getNode(), N1, Base, Offset):
+ SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+ switch (TargetVT) {
+ case MVT::i8: Opcode = NVPTX::LD_i8_asi; break;
+ case MVT::i16: Opcode = NVPTX::LD_i16_asi; break;
+ case MVT::i32: Opcode = NVPTX::LD_i32_asi; break;
+ case MVT::i64: Opcode = NVPTX::LD_i64_asi; break;
+ case MVT::f32: Opcode = NVPTX::LD_f32_asi; break;
+ case MVT::f64: Opcode = NVPTX::LD_f64_asi; break;
+ case MVT::v2i8: Opcode = NVPTX::LD_v2i8_asi; break;
+ case MVT::v2i16: Opcode = NVPTX::LD_v2i16_asi; break;
+ case MVT::v2i32: Opcode = NVPTX::LD_v2i32_asi; break;
+ case MVT::v2i64: Opcode = NVPTX::LD_v2i64_asi; break;
+ case MVT::v2f32: Opcode = NVPTX::LD_v2f32_asi; break;
+ case MVT::v2f64: Opcode = NVPTX::LD_v2f64_asi; break;
+ case MVT::v4i8: Opcode = NVPTX::LD_v4i8_asi; break;
+ case MVT::v4i16: Opcode = NVPTX::LD_v4i16_asi; break;
+ case MVT::v4i32: Opcode = NVPTX::LD_v4i32_asi; break;
+ case MVT::v4f32: Opcode = NVPTX::LD_v4f32_asi; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(fromType),
+ getI32Imm(fromTypeWidth),
+ Base, Offset, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
+ MVT::Other, Ops, 8);
+ } else if (Subtarget.is64Bit()?
+ SelectADDRri64(N1.getNode(), N1, Base, Offset):
+ SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+ switch (TargetVT) {
+ case MVT::i8: Opcode = NVPTX::LD_i8_ari; break;
+ case MVT::i16: Opcode = NVPTX::LD_i16_ari; break;
+ case MVT::i32: Opcode = NVPTX::LD_i32_ari; break;
+ case MVT::i64: Opcode = NVPTX::LD_i64_ari; break;
+ case MVT::f32: Opcode = NVPTX::LD_f32_ari; break;
+ case MVT::f64: Opcode = NVPTX::LD_f64_ari; break;
+ case MVT::v2i8: Opcode = NVPTX::LD_v2i8_ari; break;
+ case MVT::v2i16: Opcode = NVPTX::LD_v2i16_ari; break;
+ case MVT::v2i32: Opcode = NVPTX::LD_v2i32_ari; break;
+ case MVT::v2i64: Opcode = NVPTX::LD_v2i64_ari; break;
+ case MVT::v2f32: Opcode = NVPTX::LD_v2f32_ari; break;
+ case MVT::v2f64: Opcode = NVPTX::LD_v2f64_ari; break;
+ case MVT::v4i8: Opcode = NVPTX::LD_v4i8_ari; break;
+ case MVT::v4i16: Opcode = NVPTX::LD_v4i16_ari; break;
+ case MVT::v4i32: Opcode = NVPTX::LD_v4i32_ari; break;
+ case MVT::v4f32: Opcode = NVPTX::LD_v4f32_ari; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(fromType),
+ getI32Imm(fromTypeWidth),
+ Base, Offset, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
+ MVT::Other, Ops, 8);
+  } else {
+ switch (TargetVT) {
+ case MVT::i8: Opcode = NVPTX::LD_i8_areg; break;
+ case MVT::i16: Opcode = NVPTX::LD_i16_areg; break;
+ case MVT::i32: Opcode = NVPTX::LD_i32_areg; break;
+ case MVT::i64: Opcode = NVPTX::LD_i64_areg; break;
+ case MVT::f32: Opcode = NVPTX::LD_f32_areg; break;
+ case MVT::f64: Opcode = NVPTX::LD_f64_areg; break;
+ case MVT::v2i8: Opcode = NVPTX::LD_v2i8_areg; break;
+ case MVT::v2i16: Opcode = NVPTX::LD_v2i16_areg; break;
+ case MVT::v2i32: Opcode = NVPTX::LD_v2i32_areg; break;
+ case MVT::v2i64: Opcode = NVPTX::LD_v2i64_areg; break;
+ case MVT::v2f32: Opcode = NVPTX::LD_v2f32_areg; break;
+ case MVT::v2f64: Opcode = NVPTX::LD_v2f64_areg; break;
+ case MVT::v4i8: Opcode = NVPTX::LD_v4i8_areg; break;
+ case MVT::v4i16: Opcode = NVPTX::LD_v4i16_areg; break;
+ case MVT::v4i32: Opcode = NVPTX::LD_v4i32_areg; break;
+ case MVT::v4f32: Opcode = NVPTX::LD_v4f32_areg; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(fromType),
+ getI32Imm(fromTypeWidth),
+ N1, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
+ MVT::Other, Ops, 7);
+ }
+
+ if (NVPTXLD != NULL) {
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ }
+
+ return NVPTXLD;
+}
+
+SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ EVT StoreVT = ST->getMemoryVT();
+ SDNode *NVPTXST = NULL;
+
+ // do not support pre/post inc/dec
+ if (ST->isIndexed())
+ return NULL;
+
+ if (!StoreVT.isSimple())
+ return NULL;
+
+ // Address Space Setting
+ unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget);
+
+ // Volatile Setting
+  // - .volatile is only available for .global and .shared
+ bool isVolatile = ST->isVolatile();
+ if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
+
+ // Vector Setting
+ MVT SimpleVT = StoreVT.getSimpleVT();
+ unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ if (SimpleVT.isVector()) {
+ unsigned num = SimpleVT.getVectorNumElements();
+ if (num == 2)
+ vecType = NVPTX::PTXLdStInstCode::V2;
+ else if (num == 4)
+ vecType = NVPTX::PTXLdStInstCode::V4;
+ else
+ return NULL;
+ }
+
+ // Type Setting: toType + toTypeWidth
+ // - for integer type, always use 'u'
+ //
+ MVT ScalarVT = SimpleVT.getScalarType();
+ unsigned toTypeWidth = ScalarVT.getSizeInBits();
+ unsigned int toType;
+ if (ScalarVT.isFloatingPoint())
+ toType = NVPTX::PTXLdStInstCode::Float;
+ else
+ toType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ // Create the machine instruction DAG
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDValue Addr;
+ SDValue Offset, Base;
+ unsigned Opcode;
+ MVT::SimpleValueType SourceVT =
+ N1.getNode()->getValueType(0).getSimpleVT().SimpleTy;
+
+ if (SelectDirectAddr(N2, Addr)) {
+ switch (SourceVT) {
+ case MVT::i8: Opcode = NVPTX::ST_i8_avar; break;
+ case MVT::i16: Opcode = NVPTX::ST_i16_avar; break;
+ case MVT::i32: Opcode = NVPTX::ST_i32_avar; break;
+ case MVT::i64: Opcode = NVPTX::ST_i64_avar; break;
+ case MVT::f32: Opcode = NVPTX::ST_f32_avar; break;
+ case MVT::f64: Opcode = NVPTX::ST_f64_avar; break;
+ case MVT::v2i8: Opcode = NVPTX::ST_v2i8_avar; break;
+ case MVT::v2i16: Opcode = NVPTX::ST_v2i16_avar; break;
+ case MVT::v2i32: Opcode = NVPTX::ST_v2i32_avar; break;
+ case MVT::v2i64: Opcode = NVPTX::ST_v2i64_avar; break;
+ case MVT::v2f32: Opcode = NVPTX::ST_v2f32_avar; break;
+ case MVT::v2f64: Opcode = NVPTX::ST_v2f64_avar; break;
+ case MVT::v4i8: Opcode = NVPTX::ST_v4i8_avar; break;
+ case MVT::v4i16: Opcode = NVPTX::ST_v4i16_avar; break;
+ case MVT::v4i32: Opcode = NVPTX::ST_v4i32_avar; break;
+ case MVT::v4f32: Opcode = NVPTX::ST_v4f32_avar; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { N1,
+ getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(toType),
+ getI32Imm(toTypeWidth),
+ Addr, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl,
+ MVT::Other, Ops, 8);
+ } else if (Subtarget.is64Bit()?
+ SelectADDRsi64(N2.getNode(), N2, Base, Offset):
+ SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ switch (SourceVT) {
+ case MVT::i8: Opcode = NVPTX::ST_i8_asi; break;
+ case MVT::i16: Opcode = NVPTX::ST_i16_asi; break;
+ case MVT::i32: Opcode = NVPTX::ST_i32_asi; break;
+ case MVT::i64: Opcode = NVPTX::ST_i64_asi; break;
+ case MVT::f32: Opcode = NVPTX::ST_f32_asi; break;
+ case MVT::f64: Opcode = NVPTX::ST_f64_asi; break;
+ case MVT::v2i8: Opcode = NVPTX::ST_v2i8_asi; break;
+ case MVT::v2i16: Opcode = NVPTX::ST_v2i16_asi; break;
+ case MVT::v2i32: Opcode = NVPTX::ST_v2i32_asi; break;
+ case MVT::v2i64: Opcode = NVPTX::ST_v2i64_asi; break;
+ case MVT::v2f32: Opcode = NVPTX::ST_v2f32_asi; break;
+ case MVT::v2f64: Opcode = NVPTX::ST_v2f64_asi; break;
+ case MVT::v4i8: Opcode = NVPTX::ST_v4i8_asi; break;
+ case MVT::v4i16: Opcode = NVPTX::ST_v4i16_asi; break;
+ case MVT::v4i32: Opcode = NVPTX::ST_v4i32_asi; break;
+ case MVT::v4f32: Opcode = NVPTX::ST_v4f32_asi; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { N1,
+ getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(toType),
+ getI32Imm(toTypeWidth),
+ Base, Offset, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl,
+ MVT::Other, Ops, 9);
+ } else if (Subtarget.is64Bit()?
+ SelectADDRri64(N2.getNode(), N2, Base, Offset):
+ SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ switch (SourceVT) {
+ case MVT::i8: Opcode = NVPTX::ST_i8_ari; break;
+ case MVT::i16: Opcode = NVPTX::ST_i16_ari; break;
+ case MVT::i32: Opcode = NVPTX::ST_i32_ari; break;
+ case MVT::i64: Opcode = NVPTX::ST_i64_ari; break;
+ case MVT::f32: Opcode = NVPTX::ST_f32_ari; break;
+ case MVT::f64: Opcode = NVPTX::ST_f64_ari; break;
+ case MVT::v2i8: Opcode = NVPTX::ST_v2i8_ari; break;
+ case MVT::v2i16: Opcode = NVPTX::ST_v2i16_ari; break;
+ case MVT::v2i32: Opcode = NVPTX::ST_v2i32_ari; break;
+ case MVT::v2i64: Opcode = NVPTX::ST_v2i64_ari; break;
+ case MVT::v2f32: Opcode = NVPTX::ST_v2f32_ari; break;
+ case MVT::v2f64: Opcode = NVPTX::ST_v2f64_ari; break;
+ case MVT::v4i8: Opcode = NVPTX::ST_v4i8_ari; break;
+ case MVT::v4i16: Opcode = NVPTX::ST_v4i16_ari; break;
+ case MVT::v4i32: Opcode = NVPTX::ST_v4i32_ari; break;
+ case MVT::v4f32: Opcode = NVPTX::ST_v4f32_ari; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { N1,
+ getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(toType),
+ getI32Imm(toTypeWidth),
+ Base, Offset, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl,
+ MVT::Other, Ops, 9);
+ } else {
+ switch (SourceVT) {
+ case MVT::i8: Opcode = NVPTX::ST_i8_areg; break;
+ case MVT::i16: Opcode = NVPTX::ST_i16_areg; break;
+ case MVT::i32: Opcode = NVPTX::ST_i32_areg; break;
+ case MVT::i64: Opcode = NVPTX::ST_i64_areg; break;
+ case MVT::f32: Opcode = NVPTX::ST_f32_areg; break;
+ case MVT::f64: Opcode = NVPTX::ST_f64_areg; break;
+ case MVT::v2i8: Opcode = NVPTX::ST_v2i8_areg; break;
+ case MVT::v2i16: Opcode = NVPTX::ST_v2i16_areg; break;
+ case MVT::v2i32: Opcode = NVPTX::ST_v2i32_areg; break;
+ case MVT::v2i64: Opcode = NVPTX::ST_v2i64_areg; break;
+ case MVT::v2f32: Opcode = NVPTX::ST_v2f32_areg; break;
+ case MVT::v2f64: Opcode = NVPTX::ST_v2f64_areg; break;
+ case MVT::v4i8: Opcode = NVPTX::ST_v4i8_areg; break;
+ case MVT::v4i16: Opcode = NVPTX::ST_v4i16_areg; break;
+ case MVT::v4i32: Opcode = NVPTX::ST_v4i32_areg; break;
+ case MVT::v4f32: Opcode = NVPTX::ST_v4f32_areg; break;
+ default: return NULL;
+ }
+ SDValue Ops[] = { N1,
+ getI32Imm(isVolatile),
+ getI32Imm(codeAddrSpace),
+ getI32Imm(vecType),
+ getI32Imm(toType),
+ getI32Imm(toTypeWidth),
+ N2, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl,
+ MVT::Other, Ops, 8);
+ }
+
+ if (NVPTXST != NULL) {
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ }
+
+ return NVPTXST;
+}
+
+// SelectDirectAddr - Match a direct address for DAG.
+// A direct address could be a globaladdress or externalsymbol.
+bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
+ // Return true if TGA or ES.
+ if (N.getOpcode() == ISD::TargetGlobalAddress
+ || N.getOpcode() == ISD::TargetExternalSymbol) {
+ Address = N;
+ return true;
+ }
+ if (N.getOpcode() == NVPTXISD::Wrapper) {
+ Address = N.getOperand(0);
+ return true;
+ }
+ if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
+ if (IID == Intrinsic::nvvm_ptr_gen_to_param)
+ if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
+ return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
+ }
+ return false;
+}
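+// For example (sketch), an address computed as
+//   (NVPTXISD::Wrapper (TargetGlobalAddress @g))
+// matches: Address is set to the TargetGlobalAddress node, which is later
+// printed as the symbol name g.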
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset,
+ MVT mvt) {
+ if (Addr.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      SDValue base = Addr.getOperand(0);
+ if (SelectDirectAddr(base, Base)) {
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri_imp(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset,
+ MVT mvt) {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+ Offset = CurDAG->getTargetConstant(0, mvt);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress)
+ return false; // direct calls.
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
+ return false;
+ }
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ // Constant offset from frame ref.
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+ else
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+ return true;
+ }
+ }
+ return false;
+}
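+// For example (sketch): (ISD::ADD %r, 16) matches with Base = %r and
+// Offset = 16, later printed as [%r+16]; a plain FrameIndex matches with
+// Offset = 0.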
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
+ unsigned int spN) const {
+ const Value *Src = NULL;
+  // Even though MemIntrinsicSDNode is a subclass of MemSDNode,
+ // the classof() for MemSDNode does not include MemIntrinsicSDNode
+ // (See SelectionDAGNodes.h). So we need to check for both.
+ if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
+ Src = mN->getSrcValue();
+  } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
+ Src = mN->getSrcValue();
+ }
+ if (!Src)
+ return false;
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return (PT->getAddressSpace() == spN);
+ return false;
+}
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1;
+ switch (ConstraintCode) {
+ default: return true;
+ case 'm': // memory
+ if (SelectDirectAddr(Op, Op0)) {
+ OutOps.push_back(Op0);
+ OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32));
+ return false;
+ }
+ if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ return false;
+ }
+ break;
+ }
+ return true;
+}
+
+// Return true if N is an undef or a constant.
+// If N was undef, return a (i8imm 0) in Retval
+// If N was imm, convert it to i8imm and return in Retval
+// Note: The convert to i8imm is required, otherwise the
+// pattern matcher inserts a bunch of IMOVi8rr to convert
+// the imm to i8imm, and this causes instruction selection
+// to fail.
+bool NVPTXDAGToDAGISel::UndefOrImm(SDValue Op, SDValue N,
+ SDValue &Retval) {
+ if (!(N.getOpcode() == ISD::UNDEF) &&
+ !(N.getOpcode() == ISD::Constant))
+ return false;
+
+ if (N.getOpcode() == ISD::UNDEF)
+ Retval = CurDAG->getTargetConstant(0, MVT::i8);
+ else {
+ ConstantSDNode *cn = cast<ConstantSDNode>(N.getNode());
+ unsigned retval = cn->getZExtValue();
+ Retval = CurDAG->getTargetConstant(retval, MVT::i8);
+ }
+ return true;
+}
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 0000000..ccd69b29
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,105 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-isel"
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Intrinsics.h"
+using namespace llvm;
+
+namespace {
+
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+
+  // If true, generate the corresponding FPCONTRACT. This is
+  // language dependent (i.e. CUDA and OpenCL work differently).
+ bool doFMADF32;
+ bool doFMAF64;
+ bool doFMAF32;
+ bool doFMAF64AGG;
+ bool doFMAF32AGG;
+ bool allowFMA;
+
+ // 0: use div.approx
+ // 1: use div.full
+  // 2: For sm_20 and later, IEEE-compliant div.rnd.f32 can be generated;
+  //    otherwise, use div.full
+ int do_DIVF32_PREC;
+
+ // If true, add .ftz to f32 instructions.
+ // This is only meaningful for sm_20 and later, as the default
+ // is not ftz.
+ // For sm earlier than sm_20, f32 denorms are always ftz by the
+ // hardware.
+  // We always add the .ftz modifier regardless of the sm value
+  // when UseF32FTZ is true.
+ bool UseF32FTZ;
+
+ // If true, generate mul.wide from sext and mul
+ bool doMulWide;
+
+public:
+ explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+ CodeGenOpt::Level OptLevel);
+
+ // Pass Name
+ virtual const char *getPassName() const {
+ return "NVPTX DAG->DAG Pattern Instruction Selection";
+ }
+
+ const NVPTXSubtarget &Subtarget;
+
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+private:
+ // Include the pieces autogenerated from the target description.
+#include "NVPTXGenDAGISel.inc"
+
+ SDNode *Select(SDNode *N);
+ SDNode* SelectLoad(SDNode *N);
+ SDNode* SelectStore(SDNode *N);
+
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ // Match direct address complex pattern.
+ bool SelectDirectAddr(SDValue N, SDValue &Address);
+
+ bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset, MVT mvt);
+ bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+
+ bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset, MVT mvt);
+ bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+
+
+ bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+
+ bool UndefOrImm(SDValue Op, SDValue N, SDValue &Retval);
+
+};
+}
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
new file mode 100644
index 0000000..6ea10ea
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -0,0 +1,1291 @@
+//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Module.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCSectionELF.h"
+#include <sstream>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+static unsigned int uniqueCallSite = 0;
+
+static cl::opt<bool>
+RetainVectorOperands("nvptx-codegen-vectors",
+ cl::desc("NVPTX Specific: Retain LLVM's vectors and generate PTX vectors"),
+ cl::init(true));
+
+static cl::opt<bool>
+sched4reg("nvptx-sched4reg",
+ cl::desc("NVPTX Specific: schedule for register pressue"),
+ cl::init(false));
+
+// NVPTXTargetLowering Constructor.
+NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
+: TargetLowering(TM, new NVPTXTargetObjectFile()),
+ nvTM(&TM),
+ nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+
+  // Always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather than generating calls to memset, memcpy or
+  // memmove.
+ maxStoresPerMemset = (unsigned)0xFFFFFFFF;
+ maxStoresPerMemcpy = (unsigned)0xFFFFFFFF;
+ maxStoresPerMemmove = (unsigned)0xFFFFFFFF;
+
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+
+ // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+ // condition branches.
+ setJumpIsExpensive(true);
+
+  // By default, use Source scheduling
+ if (sched4reg)
+ setSchedulingPreference(Sched::RegPressure);
+ else
+ setSchedulingPreference(Sched::Source);
+
+ addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+ addRegisterClass(MVT::i8, &NVPTX::Int8RegsRegClass);
+ addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+ addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+ addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+ addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+ addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+ if (RetainVectorOperands) {
+ addRegisterClass(MVT::v2f32, &NVPTX::V2F32RegsRegClass);
+ addRegisterClass(MVT::v4f32, &NVPTX::V4F32RegsRegClass);
+ addRegisterClass(MVT::v2i32, &NVPTX::V2I32RegsRegClass);
+ addRegisterClass(MVT::v4i32, &NVPTX::V4I32RegsRegClass);
+ addRegisterClass(MVT::v2f64, &NVPTX::V2F64RegsRegClass);
+ addRegisterClass(MVT::v2i64, &NVPTX::V2I64RegsRegClass);
+ addRegisterClass(MVT::v2i16, &NVPTX::V2I16RegsRegClass);
+ addRegisterClass(MVT::v4i16, &NVPTX::V4I16RegsRegClass);
+ addRegisterClass(MVT::v2i8, &NVPTX::V2I8RegsRegClass);
+ addRegisterClass(MVT::v4i8, &NVPTX::V4I8RegsRegClass);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16 , Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8 , Custom);
+
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i8 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16 , Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i8 , Custom);
+ }
+
+ // Operations not directly supported by NVPTX.
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+ if (nvptxSubtarget.hasROT64()) {
+ setOperationAction(ISD::ROTL , MVT::i64, Legal);
+ setOperationAction(ISD::ROTR , MVT::i64, Legal);
+  } else {
+ setOperationAction(ISD::ROTL , MVT::i64, Expand);
+ setOperationAction(ISD::ROTR , MVT::i64, Expand);
+ }
+ if (nvptxSubtarget.hasROT32()) {
+ setOperationAction(ISD::ROTL , MVT::i32, Legal);
+ setOperationAction(ISD::ROTR , MVT::i32, Legal);
+  } else {
+ setOperationAction(ISD::ROTL , MVT::i32, Expand);
+ setOperationAction(ISD::ROTR , MVT::i32, Expand);
+ }
+
+ setOperationAction(ISD::ROTL , MVT::i16, Expand);
+ setOperationAction(ISD::ROTR , MVT::i16, Expand);
+ setOperationAction(ISD::ROTL , MVT::i8, Expand);
+ setOperationAction(ISD::ROTR , MVT::i8, Expand);
+ setOperationAction(ISD::BSWAP , MVT::i16, Expand);
+ setOperationAction(ISD::BSWAP , MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP , MVT::i64, Expand);
+
+ // Indirect branch is not supported.
+ // This also disables Jump Table creation.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
+
+  // We want to legalize constant-related memmove and memcpy
+  // intrinsics.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ // Turn FP extload into load/fextend
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ // Turn FP truncstore into trunc + store.
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // PTX does not support load / store predicate registers
+ setOperationAction(ISD::LOAD, MVT::i1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setOperationAction(ISD::STORE, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i16, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+
+ // This is legal in NVPTX
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+
+ // TRAP can be lowered to PTX trap
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // By default, CONCAT_VECTORS is implemented via store/load
+  // through the stack. It is slow and uses local memory, so we
+  // custom-lower it.
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i8 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f32 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i16 , Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i8 , Custom);
+
+ // Expand vector int to float and float to int conversions
+ // - For SINT_TO_FP and UINT_TO_FP, the src type
+ // (Node->getOperand(0).getValueType())
+ // is used to determine the action, while for FP_TO_UINT and FP_TO_SINT,
+ // the dest type (Node->getValueType(0)) is used.
+ //
+ // See VectorLegalizer::LegalizeOp() (LegalizeVectorOps.cpp) for the vector
+ // case, and
+ // SelectionDAGLegalize::LegalizeOp() (LegalizeDAG.cpp) for the scalar case.
+ //
+ // That is why v4i32 or v2i32 are used here.
+ //
+ // The expansion for vectors happens in VectorLegalizer::LegalizeOp()
+ // (LegalizeVectorOps.cpp).
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+
+  // Now deduce the information based on the above-mentioned
+  // actions.
+ computeRegisterProperties();
+}
+
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return 0;
+ case NVPTXISD::CALL: return "NVPTXISD::CALL";
+ case NVPTXISD::RET_FLAG: return "NVPTXISD::RET_FLAG";
+ case NVPTXISD::Wrapper: return "NVPTXISD::Wrapper";
+ case NVPTXISD::NVBuiltin: return "NVPTXISD::NVBuiltin";
+ case NVPTXISD::DeclareParam: return "NVPTXISD::DeclareParam";
+ case NVPTXISD::DeclareScalarParam:
+ return "NVPTXISD::DeclareScalarParam";
+ case NVPTXISD::DeclareRet: return "NVPTXISD::DeclareRet";
+ case NVPTXISD::DeclareRetParam: return "NVPTXISD::DeclareRetParam";
+ case NVPTXISD::PrintCall: return "NVPTXISD::PrintCall";
+ case NVPTXISD::LoadParam: return "NVPTXISD::LoadParam";
+ case NVPTXISD::StoreParam: return "NVPTXISD::StoreParam";
+ case NVPTXISD::StoreParamS32: return "NVPTXISD::StoreParamS32";
+ case NVPTXISD::StoreParamU32: return "NVPTXISD::StoreParamU32";
+ case NVPTXISD::MoveToParam: return "NVPTXISD::MoveToParam";
+ case NVPTXISD::CallArgBegin: return "NVPTXISD::CallArgBegin";
+ case NVPTXISD::CallArg: return "NVPTXISD::CallArg";
+ case NVPTXISD::LastCallArg: return "NVPTXISD::LastCallArg";
+ case NVPTXISD::CallArgEnd: return "NVPTXISD::CallArgEnd";
+ case NVPTXISD::CallVoid: return "NVPTXISD::CallVoid";
+ case NVPTXISD::CallVal: return "NVPTXISD::CallVal";
+ case NVPTXISD::CallSymbol: return "NVPTXISD::CallSymbol";
+ case NVPTXISD::Prototype: return "NVPTXISD::Prototype";
+ case NVPTXISD::MoveParam: return "NVPTXISD::MoveParam";
+ case NVPTXISD::MoveRetval: return "NVPTXISD::MoveRetval";
+ case NVPTXISD::MoveToRetval: return "NVPTXISD::MoveToRetval";
+ case NVPTXISD::StoreRetval: return "NVPTXISD::StoreRetval";
+ case NVPTXISD::PseudoUseParam: return "NVPTXISD::PseudoUseParam";
+ case NVPTXISD::RETURN: return "NVPTXISD::RETURN";
+ case NVPTXISD::CallSeqBegin: return "NVPTXISD::CallSeqBegin";
+ case NVPTXISD::CallSeqEnd: return "NVPTXISD::CallSeqEnd";
+ }
+}
+
+
+SDValue
+NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+ return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
+}
+
+std::string NVPTXTargetLowering::getPrototype(Type *retTy,
+ const ArgListTy &Args,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ unsigned retAlignment) const {
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+ std::stringstream O;
+ O << "prototype_" << uniqueCallSite << " : .callprototype ";
+
+ if (retTy->getTypeID() == Type::VoidTyID)
+ O << "()";
+ else {
+ O << "(";
+ if (isABI) {
+ if (retTy->isPrimitiveType() || retTy->isIntegerTy()) {
+ unsigned size = 0;
+ if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
+ size = ITy->getBitWidth();
+ if (size < 32) size = 32;
+ }
+ else {
+ assert(retTy->isFloatingPointTy() &&
+ "Floating point type expected here");
+ size = retTy->getPrimitiveSizeInBits();
+ }
+
+ O << ".param .b" << size << " _";
+ }
+ else if (isa<PointerType>(retTy))
+ O << ".param .b" << getPointerTy().getSizeInBits()
+ << " _";
+ else {
+ if ((retTy->getTypeID() == Type::StructTyID) ||
+ isa<VectorType>(retTy)) {
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*this, retTy, vtparts);
+ unsigned totalsz = 0;
+ for (unsigned i=0,e=vtparts.size(); i!=e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+ for (unsigned j=0, je=elems; j!=je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 8)) sz = 8;
+ totalsz += sz/8;
+ }
+ }
+ O << ".param .align "
+ << retAlignment
+ << " .b8 _["
+ << totalsz << "]";
+ }
+ else {
+ assert(false &&
+ "Unknown return type");
+ }
+ }
+ }
+ else {
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*this, retTy, vtparts);
+ unsigned idx = 0;
+ for (unsigned i=0,e=vtparts.size(); i!=e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j=0, je=elems; j!=je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32)) sz = 32;
+ O << ".reg .b" << sz << " _";
+ if (j<je-1) O << ", ";
+ ++idx;
+ }
+ if (i < e-1)
+ O << ", ";
+ }
+ }
+ O << ") ";
+ }
+ O << "_ (";
+
+ bool first = true;
+ MVT thePointerTy = getPointerTy();
+
+ for (unsigned i=0,e=Args.size(); i!=e; ++i) {
+ const Type *Ty = Args[i].Ty;
+ if (!first) {
+ O << ", ";
+ }
+ first = false;
+
+    if (!Outs[i].Flags.isByVal()) {
+ unsigned sz = 0;
+ if (isa<IntegerType>(Ty)) {
+ sz = cast<IntegerType>(Ty)->getBitWidth();
+ if (sz < 32) sz = 32;
+ }
+ else if (isa<PointerType>(Ty))
+ sz = thePointerTy.getSizeInBits();
+ else
+ sz = Ty->getPrimitiveSizeInBits();
+ if (isABI)
+ O << ".param .b" << sz << " ";
+ else
+ O << ".reg .b" << sz << " ";
+ O << "_";
+ continue;
+ }
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ assert(PTy &&
+ "Param with byval attribute should be a pointer type");
+ Type *ETy = PTy->getElementType();
+
+ if (isABI) {
+ unsigned align = Outs[i].Flags.getByValAlign();
+ unsigned sz = getTargetData()->getTypeAllocSize(ETy);
+ O << ".param .align " << align
+ << " .b8 ";
+ O << "_";
+ O << "[" << sz << "]";
+ continue;
+ }
+ else {
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*this, ETy, vtparts);
+ for (unsigned i=0,e=vtparts.size(); i!=e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j=0,je=elems; j!=je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32)) sz = 32;
+ O << ".reg .b" << sz << " ";
+ O << "_";
+ if (j<je-1) O << ", ";
+ }
+ if (i<e-1)
+ O << ", ";
+ }
+ continue;
+ }
+ }
+ O << ");";
+ return O.str();
+}
+
+
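+// LowerCall - Lower an outgoing call into the glued NVPTX node sequence the
+// asm printer turns into PTX: CALLSEQ_START; a Declare*Param plus
+// StoreParam/MoveToParam per argument; a DeclareRet/DeclareRetParam for the
+// result; an INLINEASM prototype for indirect calls; the
+// PrintCall/CallVoid/CallArg* ops that print the call statement itself;
+// LoadParam nodes for the returned values; and finally CALLSEQ_END.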
+SDValue
+NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ ArgListTy &Args = CLI.Args;
+ Type *retTy = CLI.RetTy;
+ ImmutableCallSite *CS = CLI.CS;
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+ SDValue tempChain = Chain;
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getIntPtrConstant(uniqueCallSite, true));
+ SDValue InFlag = Chain.getValue(1);
+
+ assert((Outs.size() == Args.size()) &&
+ "Unexpected number of arguments to function call");
+ unsigned paramCount = 0;
+  // Declare the .param or .reg variables needed to pass values
+  // to the function.
+ for (unsigned i=0, e=Outs.size(); i!=e; ++i) {
+ EVT VT = Outs[i].VT;
+
+ if (Outs[i].Flags.isByVal() == false) {
+      // Plain scalar
+      // for ABI,     declare .param .b<size> .param<n>;
+      // for non-ABI, declare .reg .b<size> .param<n>;
+ unsigned isReg = 1;
+ if (isABI)
+ isReg = 0;
+ unsigned sz = VT.getSizeInBits();
+ if (VT.isInteger() && (sz < 32)) sz = 32;
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain,
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(sz, MVT::i32),
+ DAG.getConstant(isReg, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+ DeclareParamOps, 5);
+ InFlag = Chain.getValue(1);
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(0, MVT::i32), OutVals[i], InFlag };
+
+ unsigned opcode = NVPTXISD::StoreParam;
+ if (isReg)
+ opcode = NVPTXISD::MoveToParam;
+ else {
+ if (Outs[i].Flags.isZExt())
+ opcode = NVPTXISD::StoreParamU32;
+ else if (Outs[i].Flags.isSExt())
+ opcode = NVPTXISD::StoreParamS32;
+ }
+ Chain = DAG.getNode(opcode, dl, CopyParamVTs, CopyParamOps, 5);
+
+ InFlag = Chain.getValue(1);
+ ++paramCount;
+ continue;
+ }
+ // struct or vector
+ SmallVector<EVT, 16> vtparts;
+ const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
+ assert(PTy &&
+ "Type of a byval parameter should be pointer");
+ ComputeValueVTs(*this, PTy->getElementType(), vtparts);
+
+ if (isABI) {
+ // declare .param .align 16 .b8 .param<n>[<size>];
+ unsigned sz = Outs[i].Flags.getByValSize();
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      // The ByValAlign in Outs[i].Flags is always set at this point, so we
+      // don't need to worry about natural alignment or not.
+      // See TargetLowering::LowerCallTo().
+ SDValue DeclareParamOps[] = { Chain,
+ DAG.getConstant(Outs[i].Flags.getByValAlign(), MVT::i32),
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(sz, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps, 5);
+ InFlag = Chain.getValue(1);
+ unsigned curOffset = 0;
+ for (unsigned j=0,je=vtparts.size(); j!=je; ++j) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[j];
+ if (vtparts[j].isVector()) {
+ elems = vtparts[j].getVectorNumElements();
+ elemtype = vtparts[j].getVectorElementType();
+ }
+ for (unsigned k=0,ke=elems; k!=ke; ++k) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 8)) sz = 8;
+ SDValue srcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ OutVals[i],
+ DAG.getConstant(curOffset,
+ getPointerTy()));
+ SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+ MachinePointerInfo(), false, false, false, 0);
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount,
+ MVT::i32),
+ DAG.getConstant(curOffset, MVT::i32),
+ theVal, InFlag };
+ Chain = DAG.getNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+ CopyParamOps, 5);
+ InFlag = Chain.getValue(1);
+ curOffset += sz/8;
+ }
+ }
+ ++paramCount;
+ continue;
+ }
+    // Non-ABI, struct or vector
+    // Declare a set of .reg .b<size> .param<n> variables
+ unsigned curOffset = 0;
+ for (unsigned j=0,je=vtparts.size(); j!=je; ++j) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[j];
+ if (vtparts[j].isVector()) {
+ elems = vtparts[j].getVectorNumElements();
+ elemtype = vtparts[j].getVectorElementType();
+ }
+ for (unsigned k=0,ke=elems; k!=ke; ++k) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32)) sz = 32;
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain, DAG.getConstant(paramCount,
+ MVT::i32),
+ DAG.getConstant(sz, MVT::i32),
+ DAG.getConstant(1, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+ DeclareParamOps, 5);
+ InFlag = Chain.getValue(1);
+ SDValue srcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i],
+ DAG.getConstant(curOffset,
+ getPointerTy()));
+ SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+ MachinePointerInfo(), false, false, false, 0);
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(0, MVT::i32), theVal,
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::MoveToParam, dl, CopyParamVTs,
+ CopyParamOps, 5);
+ InFlag = Chain.getValue(1);
+ ++paramCount;
+ }
+ }
+ }
+
+ GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+ unsigned retAlignment = 0;
+
+ // Handle Result
+ unsigned retCount = 0;
+ if (Ins.size() > 0) {
+ SmallVector<EVT, 16> resvtparts;
+ ComputeValueVTs(*this, retTy, resvtparts);
+
+    // Declare one .param .align 16 .b8 func_retval0[<size>] for ABI, or
+    // individual .reg .b<size> func_retval<0..> registers for non-ABI
+ unsigned resultsz = 0;
+ for (unsigned i=0,e=resvtparts.size(); i!=e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = resvtparts[i];
+ if (resvtparts[i].isVector()) {
+ elems = resvtparts[i].getVectorNumElements();
+ elemtype = resvtparts[i].getVectorElementType();
+ }
+ for (unsigned j=0,je=elems; j!=je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (isABI == false) {
+ if (elemtype.isInteger() && (sz < 32)) sz = 32;
+ }
+ else {
+ if (elemtype.isInteger() && (sz < 8)) sz = 8;
+ }
+ if (isABI == false) {
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain, DAG.getConstant(2, MVT::i32),
+ DAG.getConstant(sz, MVT::i32),
+ DAG.getConstant(retCount, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+ DeclareRetOps, 5);
+ InFlag = Chain.getValue(1);
+ ++retCount;
+ }
+ resultsz += sz;
+ }
+ }
+ if (isABI) {
+ if (retTy->isPrimitiveType() || retTy->isIntegerTy() ||
+ retTy->isPointerTy() ) {
+        // Scalars need to be at least 32 bits wide
+ if (resultsz < 32)
+ resultsz = 32;
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+ DAG.getConstant(resultsz, MVT::i32),
+ DAG.getConstant(0, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+ DeclareRetOps, 5);
+ InFlag = Chain.getValue(1);
+ }
+ else {
+ if (Func) { // direct call
+ if (!llvm::getAlign(*(CS->getCalledFunction()), 0, retAlignment))
+ retAlignment = getTargetData()->getABITypeAlignment(retTy);
+ } else { // indirect call
+ const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction());
+ if (!llvm::getAlign(*CallI, 0, retAlignment))
+ retAlignment = getTargetData()->getABITypeAlignment(retTy);
+ }
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain, DAG.getConstant(retAlignment,
+ MVT::i32),
+ DAG.getConstant(resultsz/8, MVT::i32),
+ DAG.getConstant(0, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+ DeclareRetOps, 5);
+ InFlag = Chain.getValue(1);
+ }
+ }
+ }
+
+ if (!Func) {
+    // This is the indirect function call case: PTX requires a prototype of
+    // the form
+    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+    // to be emitted, and the label has to be used as the last argument of
+    // the call instruction.
+ // The prototype is embedded in a string and put as the operand for an
+ // INLINEASM SDNode.
+ SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ std::string proto_string = getPrototype(retTy, Args, Outs, retAlignment);
+ const char *asmstr = nvTM->getManagedStrPool()->
+ getManagedString(proto_string.c_str())->c_str();
+ SDValue InlineAsmOps[] = { Chain,
+ DAG.getTargetExternalSymbol(asmstr,
+ getPointerTy()),
+ DAG.getMDNode(0),
+ DAG.getTargetConstant(0, MVT::i32), InFlag };
+ Chain = DAG.getNode(ISD::INLINEASM, dl, InlineAsmVTs, InlineAsmOps, 5);
+ InFlag = Chain.getValue(1);
+ }
+ // Op to just print "call"
+ SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrintCallOps[] = { Chain,
+ DAG.getConstant(isABI ? ((Ins.size()==0) ? 0 : 1)
+ : retCount, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(Func?(NVPTXISD::PrintCallUni):(NVPTXISD::PrintCall), dl,
+ PrintCallVTs, PrintCallOps, 3);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the function name
+ SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps, 3);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the param list
+ SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgBeginOps[] = { Chain, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+ CallArgBeginOps, 2);
+ InFlag = Chain.getValue(1);
+
+ for (unsigned i=0, e=paramCount; i!=e; ++i) {
+ unsigned opcode;
+ if (i==(e-1))
+ opcode = NVPTXISD::LastCallArg;
+ else
+ opcode = NVPTXISD::CallArg;
+ SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+ DAG.getConstant(i, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps, 4);
+ InFlag = Chain.getValue(1);
+ }
+ SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgEndOps[] = { Chain,
+ DAG.getConstant(Func ? 1 : 0, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps,
+ 3);
+ InFlag = Chain.getValue(1);
+
+ if (!Func) {
+ SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrototypeOps[] = { Chain,
+ DAG.getConstant(uniqueCallSite, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps, 3);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Generate loads from param memory/moves from registers for result
+ if (Ins.size() > 0) {
+ if (isABI) {
+ unsigned resoffset = 0;
+ for (unsigned i=0,e=Ins.size(); i!=e; ++i) {
+ unsigned sz = Ins[i].VT.getSizeInBits();
+ if (Ins[i].VT.isInteger() && (sz < 8)) sz = 8;
+ std::vector<EVT> LoadRetVTs;
+ LoadRetVTs.push_back(Ins[i].VT);
+ LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue);
+ std::vector<SDValue> LoadRetOps;
+ LoadRetOps.push_back(Chain);
+ LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+ LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32));
+ LoadRetOps.push_back(InFlag);
+ SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, LoadRetVTs,
+ &LoadRetOps[0], LoadRetOps.size());
+ Chain = retval.getValue(1);
+ InFlag = retval.getValue(2);
+ InVals.push_back(retval);
+ resoffset += sz/8;
+ }
+ }
+ else {
+ SmallVector<EVT, 16> resvtparts;
+ ComputeValueVTs(*this, retTy, resvtparts);
+
+ assert(Ins.size() == resvtparts.size() &&
+ "Unexpected number of return values in non-ABI case");
+ unsigned paramNum = 0;
+ for (unsigned i=0,e=Ins.size(); i!=e; ++i) {
+ assert(EVT(Ins[i].VT) == resvtparts[i] &&
+ "Unexpected EVT type in non-ABI case");
+ unsigned numelems = 1;
+ EVT elemtype = Ins[i].VT;
+ if (Ins[i].VT.isVector()) {
+ numelems = Ins[i].VT.getVectorNumElements();
+ elemtype = Ins[i].VT.getVectorElementType();
+ }
+ std::vector<SDValue> tempRetVals;
+ for (unsigned j=0; j<numelems; ++j) {
+ std::vector<EVT> MoveRetVTs;
+ MoveRetVTs.push_back(elemtype);
+ MoveRetVTs.push_back(MVT::Other); MoveRetVTs.push_back(MVT::Glue);
+ std::vector<SDValue> MoveRetOps;
+ MoveRetOps.push_back(Chain);
+ MoveRetOps.push_back(DAG.getConstant(0, MVT::i32));
+ MoveRetOps.push_back(DAG.getConstant(paramNum, MVT::i32));
+ MoveRetOps.push_back(InFlag);
+ SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, MoveRetVTs,
+ &MoveRetOps[0], MoveRetOps.size());
+ Chain = retval.getValue(1);
+ InFlag = retval.getValue(2);
+ tempRetVals.push_back(retval);
+ ++paramNum;
+ }
+ if (Ins[i].VT.isVector())
+ InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, Ins[i].VT,
+ &tempRetVals[0], tempRetVals.size()));
+ else
+ InVals.push_back(tempRetVals[0]);
+ }
+ }
+ }
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(uniqueCallSite, true),
+ DAG.getIntPtrConstant(uniqueCallSite+1, true),
+ InFlag);
+ uniqueCallSite++;
+
+ // set isTailCall to false for now, until we figure out how to express
+ // tail call optimization in PTX
+ isTailCall = false;
+ return Chain;
+}
+
+// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
+// (see LegalizeDAG.cpp). This is slow and uses local memory. We instead use
+// extract/build_vector, just as LegalizeOp() did in LLVM 2.5.
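+// For example, (v4f32 (concat_vectors %a, %b)) with v2f32 %a and %b becomes
+// (v4f32 (build_vector (extract_vector_elt %a, 0),
+//                      (extract_vector_elt %a, 1),
+//                      (extract_vector_elt %b, 0),
+//                      (extract_vector_elt %b, 1))).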
+SDValue NVPTXTargetLowering::
+LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ SmallVector<SDValue, 8> Ops;
+ unsigned NumOperands = Node->getNumOperands();
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue SubOp = Node->getOperand(i);
+ EVT VVT = SubOp.getNode()->getValueType(0);
+ EVT EltVT = VVT.getVectorElementType();
+ unsigned NumSubElem = VVT.getVectorNumElements();
+ for (unsigned j=0; j < NumSubElem; ++j) {
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+ DAG.getIntPtrConstant(j)));
+ }
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0),
+ &Ops[0], Ops.size());
+}
+
+SDValue NVPTXTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ case ISD::RETURNADDR: return SDValue();
+ case ISD::FRAMEADDR: return SDValue();
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return Op;
+ case ISD::BUILD_VECTOR:
+ case ISD::EXTRACT_SUBVECTOR:
+ return Op;
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ default:
+ llvm_unreachable("Custom lowering not defined for operation");
+ }
+}
+
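+// getExtSymb - Return a TargetExternalSymbol named <inname><idx>; e.g.
+// getParamSymbol(DAG, 3, v) yields the symbol ".PARAM3".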
+SDValue
+NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, int idx,
+ EVT v) const {
+ std::string *name = nvTM->getManagedStrPool()->getManagedString(inname);
+ std::stringstream suffix;
+ suffix << idx;
+ *name += suffix.str();
+ return DAG.getTargetExternalSymbol(name->c_str(), v);
+}
+
+SDValue
+NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+ return getExtSymb(DAG, ".PARAM", idx, v);
+}
+
+SDValue
+NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
+ return getExtSymb(DAG, ".HLPPARAM", idx);
+}
+
+// Check to see if the kernel argument is image*_t or sampler_t
+
+bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
+ static const char *const specialTypes[] = {
+ "struct._image2d_t",
+ "struct._image3d_t",
+ "struct._sampler_t"
+ };
+
+ const Type *Ty = arg->getType();
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+
+ if (!PTy)
+ return false;
+
+ if (!context)
+ return false;
+
+ const StructType *STy = dyn_cast<StructType>(PTy->getElementType());
+ const std::string TypeName = STy ? STy->getName() : "";
+
+ for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i)
+ if (TypeName == specialTypes[i])
+ return true;
+
+ return false;
+}
+
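+// LowerFormalArguments - Lower incoming arguments. Three cases are handled:
+// image/sampler kernel arguments become i32 constants holding the parameter
+// position; plain scalars are loaded from (ABI/kernel) or moved out of
+// (non-ABI) their .param symbols; byval aggregates are either moved as a
+// whole (ABI/kernel) or reassembled element by element into a stack copy.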
+SDValue
+NVPTXTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetData *TD = getTargetData();
+
+ const Function *F = MF.getFunction();
+ const AttrListPtr &PAL = F->getAttributes();
+
+ SDValue Root = DAG.getRoot();
+ std::vector<SDValue> OutChains;
+
+ bool isKernel = llvm::isKernelFunction(*F);
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+ std::vector<Type *> argTypes;
+ std::vector<const Argument *> theArgs;
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I) {
+ theArgs.push_back(I);
+ argTypes.push_back(I->getType());
+ }
+ assert(argTypes.size() == Ins.size() &&
+ "Ins types and function types did not match");
+
+ int idx = 0;
+ for (unsigned i=0, e=Ins.size(); i!=e; ++i, ++idx) {
+ Type *Ty = argTypes[i];
+ EVT ObjectVT = getValueType(Ty);
+ assert(ObjectVT == Ins[i].VT &&
+ "Ins type did not match function type");
+
+    // If the kernel argument is image*_t or sampler_t, convert it to
+    // an i32 constant holding the parameter position. This can later be
+    // matched in the AsmPrinter to output the correct mangled name.
+ if (isImageOrSamplerVal(theArgs[i],
+ (theArgs[i]->getParent() ?
+ theArgs[i]->getParent()->getParent() : 0))) {
+ assert(isKernel && "Only kernels can have image/sampler params");
+ InVals.push_back(DAG.getConstant(i+1, MVT::i32));
+ continue;
+ }
+
+ if (theArgs[i]->use_empty()) {
+ // argument is dead
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, ObjectVT));
+ continue;
+ }
+
+    // In the following cases, assign a node order of "idx+1" to newly
+    // created nodes. The SDNodes for params have to appear in the same
+    // order as their order of appearance in the original function.
+    // "idx+1" holds that order.
+ if (PAL.paramHasAttr(i+1, Attribute::ByVal) == false) {
+ // A plain scalar.
+ if (isABI || isKernel) {
+ // If ABI, load from the param symbol
+ SDValue Arg = getParamSymbol(DAG, idx);
+ Value *srcValue = new Argument(PointerType::get(ObjectVT.getTypeForEVT(
+ F->getContext()),
+ llvm::ADDRESS_SPACE_PARAM));
+ SDValue p = DAG.getLoad(ObjectVT, dl, Root, Arg,
+ MachinePointerInfo(srcValue), false, false,
+ false,
+ TD->getABITypeAlignment(ObjectVT.getTypeForEVT(
+ F->getContext())));
+ if (p.getNode())
+ DAG.AssignOrdering(p.getNode(), idx+1);
+ InVals.push_back(p);
+ }
+ else {
+ // If no ABI, just move the param symbol
+ SDValue Arg = getParamSymbol(DAG, idx, ObjectVT);
+ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+ if (p.getNode())
+ DAG.AssignOrdering(p.getNode(), idx+1);
+ InVals.push_back(p);
+ }
+ continue;
+ }
+
+ // Param has ByVal attribute
+ if (isABI || isKernel) {
+      // Return MoveParam(param symbol).
+      // Ideally, the param symbol could be returned directly, but when the
+      // SDNode builder decides to use it in a CopyToReg(), instruction
+      // selection fails because a TargetExternalSymbol (not lowered) is
+      // target dependent, and CopyToReg assumes the source is lowered.
+ SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+ if (p.getNode())
+ DAG.AssignOrdering(p.getNode(), idx+1);
+ if (isKernel)
+ InVals.push_back(p);
+ else {
+ SDValue p2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
+ DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32),
+ p);
+ InVals.push_back(p2);
+ }
+ } else {
+      // We have to move a set of param symbols to registers, store them
+      // locally, and return the local pointer in InVals.
+ const PointerType *elemPtrType = dyn_cast<PointerType>(argTypes[i]);
+ assert(elemPtrType &&
+ "Byval parameter should be a pointer type");
+ Type *elemType = elemPtrType->getElementType();
+ // Compute the constituent parts
+ SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> offsets;
+ ComputeValueVTs(*this, elemType, vtparts, &offsets, 0);
+ unsigned totalsize = 0;
+ for (unsigned j=0, je=vtparts.size(); j!=je; ++j)
+ totalsize += vtparts[j].getStoreSizeInBits();
+ SDValue localcopy = DAG.getFrameIndex(MF.getFrameInfo()->
+ CreateStackObject(totalsize/8, 16, false),
+ getPointerTy());
+ unsigned sizesofar = 0;
+ std::vector<SDValue> theChains;
+ for (unsigned j=0, je=vtparts.size(); j!=je; ++j) {
+ unsigned numElems = 1;
+ if (vtparts[j].isVector()) numElems = vtparts[j].getVectorNumElements();
+ for (unsigned k=0, ke=numElems; k!=ke; ++k) {
+ EVT tmpvt = vtparts[j];
+ if (tmpvt.isVector()) tmpvt = tmpvt.getVectorElementType();
+ SDValue arg = DAG.getNode(NVPTXISD::MoveParam, dl, tmpvt,
+ getParamSymbol(DAG, idx, tmpvt));
+ SDValue addr = DAG.getNode(ISD::ADD, dl, getPointerTy(), localcopy,
+ DAG.getConstant(sizesofar, getPointerTy()));
+ theChains.push_back(DAG.getStore(Chain, dl, arg, addr,
+ MachinePointerInfo(), false, false, 0));
+ sizesofar += tmpvt.getStoreSizeInBits()/8;
+ ++idx;
+ }
+ }
+ --idx;
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &theChains[0],
+ theChains.size());
+ InVals.push_back(localcopy);
+ }
+ }
+
+  // Clang will check for explicit varargs and issue an error if there are
+  // any; however, it lets code with an implicit vararg list, like f(), pass.
+  // We treat this case as if the arg list is empty.
+ //if (F.isVarArg()) {
+ // assert(0 && "VarArg not supported yet!");
+ //}
+
+ if (!OutChains.empty())
+ DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], OutChains.size()));
+
+ return Chain;
+}
+
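+// LowerReturn - Copy the return value(s) to the PTX return slots. Under the
+// ABI this emits StoreRetval nodes keyed by byte offset; without it,
+// MoveToRetval nodes keyed by return-register index. Vector values are
+// elementized with EXTRACT_VECTOR_ELT first.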
+SDValue
+NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ DebugLoc dl, SelectionDAG &DAG) const {
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+ unsigned sizesofar = 0;
+ unsigned idx = 0;
+ for (unsigned i=0, e=Outs.size(); i!=e; ++i) {
+ SDValue theVal = OutVals[i];
+ EVT theValType = theVal.getValueType();
+ unsigned numElems = 1;
+ if (theValType.isVector()) numElems = theValType.getVectorNumElements();
+ for (unsigned j=0,je=numElems; j!=je; ++j) {
+ SDValue tmpval = theVal;
+ if (theValType.isVector())
+ tmpval = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ theValType.getVectorElementType(),
+ tmpval, DAG.getIntPtrConstant(j));
+ Chain = DAG.getNode(isABI ? NVPTXISD::StoreRetval :NVPTXISD::MoveToRetval,
+ dl, MVT::Other,
+ Chain,
+ DAG.getConstant(isABI ? sizesofar : idx, MVT::i32),
+ tmpval);
+ if (theValType.isVector())
+ sizesofar += theValType.getVectorElementType().getStoreSizeInBits()/8;
+ else
+ sizesofar += theValType.getStoreSizeInBits()/8;
+ ++idx;
+ }
+ }
+
+ return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+void
+NVPTXTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const
+{
+ if (Constraint.length() > 1)
+ return;
+ else
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// NVPTX supports vectors of legal types of any length in intrinsics, because
+// the NVPTX-specific type legalizer will legalize them to the PTX-supported
+// length.
+bool
+NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
+ if (isTypeLegal(VT))
+ return true;
+ if (VT.isVector()) {
+ MVT eVT = VT.getVectorElementType();
+ if (isTypeLegal(eVT))
+ return true;
+ }
+ return false;
+}
+
+
+// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
+// TgtMemIntrinsic because we need information that is only available in the
+// "Value" type of the destination pointer; in particular, the address space
+// information.
+bool
+NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo& Info, const CallInst &I,
+ unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ default:
+ return false;
+
+ case Intrinsic::nvvm_atomic_load_add_f32:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = true;
+ Info.align = 0;
+ return true;
+
+ case Intrinsic::nvvm_atomic_load_inc_32:
+ case Intrinsic::nvvm_atomic_load_dec_32:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = true;
+ Info.align = 0;
+ return true;
+
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p:
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
+ Info.memVT = MVT::i32;
+ else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
+ Info.memVT = getPointerTy();
+ else
+ Info.memVT = MVT::f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 0;
+ return true;
+
+ }
+ return false;
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+/// Used to guide target specific optimizations, like loop strength reduction
+/// (LoopStrengthReduce.cpp) and memory optimization for address mode
+/// (CodeGenPrepare.cpp)
+bool
+NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+
+ // AddrMode - This represents an addressing mode of:
+ // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+ //
+ // The legal address modes are
+ // - [avar]
+ // - [areg]
+ // - [areg+immoff]
+ // - [immAddr]
+
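+  // For example, {BaseGV=g} gives [avar]; {BaseReg} gives [areg];
+  // {BaseReg, BaseOffs=16} gives [areg+16]; {BaseOffs=16} gives [immAddr].
+  // Any mode with a scaled register (r+r, r+2*r, ...) is rejected.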
+ if (AM.BaseGV) {
+ if (AM.BaseOffs || AM.HasBaseReg || AM.Scale)
+ return false;
+ return true;
+ }
+
+ switch (AM.Scale) {
+ case 0: // "r", "r+i" or "i" is allowed
+ break;
+ case 1:
+ if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
+ return false;
+ // Otherwise we have r+i.
+ break;
+ default:
+ // No scale > 1 is allowed
+ return false;
+ }
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+NVPTXTargetLowering::ConstraintType
+NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'r':
+ case 'h':
+ case 'c':
+ case 'l':
+ case 'f':
+ case 'd':
+ case '0':
+ case 'N':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+
+std::pair<unsigned, const TargetRegisterClass*>
+NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'c':
+ return std::make_pair(0U, &NVPTX::Int8RegsRegClass);
+ case 'h':
+ return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+ case 'r':
+ return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
+ case 'l':
+ case 'N':
+ return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+ case 'f':
+ return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
+ case 'd':
+ return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
+ }
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
+ return 4;
+}
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
new file mode 100644
index 0000000..86246e6
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -0,0 +1,144 @@
+//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXISELLOWERING_H
+#define NVPTXISELLOWERING_H
+
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace NVPTXISD {
+enum NodeType {
+ // Start the numbering from where ISD NodeType finishes.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ Wrapper,
+ CALL,
+ RET_FLAG,
+ LOAD_PARAM,
+ NVBuiltin,
+ DeclareParam,
+ DeclareScalarParam,
+ DeclareRetParam,
+ DeclareRet,
+ DeclareScalarRet,
+ LoadParam,
+ StoreParam,
+  StoreParamS32, // to sext and store a <32-bit value, not currently used
+  StoreParamU32, // to zext and store a <32-bit value, not currently used
+ MoveToParam,
+ PrintCall,
+ PrintCallUni,
+ CallArgBegin,
+ CallArg,
+ LastCallArg,
+ CallArgEnd,
+ CallVoid,
+ CallVal,
+ CallSymbol,
+ Prototype,
+ MoveParam,
+ MoveRetval,
+ MoveToRetval,
+ StoreRetval,
+ PseudoUseParam,
+ RETURN,
+ CallSeqBegin,
+ CallSeqEnd,
+ Dummy
+};
+}
+
+//===--------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===--------------------------------------------------------------------===//
+class NVPTXTargetLowering : public TargetLowering {
+public:
+ explicit NVPTXTargetLowering(NVPTXTargetMachine &TM);
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset,
+ SelectionDAG &DAG) const;
+
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ bool isTypeSupportedInIntrinsic(MVT VT) const;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo& Info, const CallInst &I,
+ unsigned Intrinsic) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type.
+ /// Used to guide target specific optimizations, like loop strength
+ /// reduction (LoopStrengthReduce.cpp) and memory optimization for
+ /// address mode (CodeGenPrepare.cpp)
+ virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+
+ /// getFunctionAlignment - Return the Log2 alignment of this function.
+ virtual unsigned getFunctionAlignment(const Function *F) const;
+
+ virtual EVT getSetCCResultType(EVT VT) const {
+ return MVT::i1;
+ }
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ virtual SDValue
+ LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const;
+
+ std::string getPrototype(Type *, const ArgListTy &,
+ const SmallVectorImpl<ISD::OutputArg> &,
+ unsigned retAlignment) const;
+
+ virtual SDValue
+ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
+ SelectionDAG &DAG) const;
+
+ virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ NVPTXTargetMachine *nvTM;
+
+ // PTX always uses 32-bit shift amounts
+ virtual MVT getShiftAmountTy(EVT LHSTy) const {
+ return MVT::i32;
+ }
+
+private:
+ const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
+
+ SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx, EVT =
+ MVT::i32) const;
+ SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT = MVT::i32) const;
+ SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
+
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+};
+} // namespace llvm
+
+#endif // NVPTXISELLOWERING_H
diff --git a/lib/Target/NVPTX/NVPTXInstrFormats.td b/lib/Target/NVPTX/NVPTXInstrFormats.td
new file mode 100644
index 0000000..f11f1b8
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -0,0 +1,43 @@
+//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe NVPTX instruction formats
+//
+//===----------------------------------------------------------------------===//
+
+// Vector instruction type enum
+class VecInstTypeEnum<bits<4> val> {
+ bits<4> Value=val;
+}
+def VecNOP : VecInstTypeEnum<0>;
+
+// Generic NVPTX Format
+
+class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<14> Inst;
+
+ let Namespace = "NVPTX";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+
+ // TSFlagFields
+ bits<4> VecInstType = VecNOP.Value;
+ bit IsSimpleMove = 0;
+ bit IsLoad = 0;
+ bit IsStore = 0;
+
+ let TSFlags{3-0} = VecInstType;
+ let TSFlags{4-4} = IsSimpleMove;
+ let TSFlags{5-5} = IsLoad;
+ let TSFlags{6-6} = IsStore;
+}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
new file mode 100644
index 0000000..cd50deb
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -0,0 +1,326 @@
+//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXTargetMachine.h"
+#define GET_INSTRINFO_CTOR
+#include "NVPTXGenInstrInfo.inc"
+#include "llvm/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include <cstdio>
+
+
+using namespace llvm;
+
+// FIXME: Add subtarget support to this constructor.
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
+: NVPTXGenInstrInfo(),
+ TM(tm),
+ RegInfo(*this, *TM.getSubtargetImpl()) {}
+
+
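+// copyPhysReg - Emit the mov instruction matching the common register class
+// of DestReg and SrcReg: IMOV*rr for the integer classes, FMOV*rr for the
+// float classes, and V*Mov for the vector classes.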
+void NVPTXInstrInfo::copyPhysReg (MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ if (NVPTX::Int32RegsRegClass.contains(DestReg) &&
+ NVPTX::Int32RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::Int8RegsRegClass.contains(DestReg) &&
+ NVPTX::Int8RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::Int1RegsRegClass.contains(DestReg) &&
+ NVPTX::Int1RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::Float32RegsRegClass.contains(DestReg) &&
+ NVPTX::Float32RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::Int16RegsRegClass.contains(DestReg) &&
+ NVPTX::Int16RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::Int64RegsRegClass.contains(DestReg) &&
+ NVPTX::Int64RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::Float64RegsRegClass.contains(DestReg) &&
+ NVPTX::Float64RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V4F32RegsRegClass.contains(DestReg) &&
+ NVPTX::V4F32RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V4f32Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V4I32RegsRegClass.contains(DestReg) &&
+ NVPTX::V4I32RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V4i32Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V2F32RegsRegClass.contains(DestReg) &&
+ NVPTX::V2F32RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V2f32Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V2I32RegsRegClass.contains(DestReg) &&
+ NVPTX::V2I32RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V2i32Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V4I8RegsRegClass.contains(DestReg) &&
+ NVPTX::V4I8RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V4i8Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V2I8RegsRegClass.contains(DestReg) &&
+ NVPTX::V2I8RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V2i8Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V4I16RegsRegClass.contains(DestReg) &&
+ NVPTX::V4I16RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V4i16Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V2I16RegsRegClass.contains(DestReg) &&
+ NVPTX::V2I16RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V2i16Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V2I64RegsRegClass.contains(DestReg) &&
+ NVPTX::V2I64RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V2i64Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (NVPTX::V2F64RegsRegClass.contains(DestReg) &&
+ NVPTX::V2F64RegsRegClass.contains(SrcReg))
+ BuildMI(MBB, I, DL, get(NVPTX::V2f64Mov), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else {
+ llvm_unreachable("Don't know how to copy a register");
+ }
+}
+
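+// isMoveInstr - Decode the IsSimpleMove bit that NVPTXInstrFormats.td packs
+// into TSFlags. For a simple move, operand 0 is the destination register and
+// operand 1 is the source.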
+bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg,
+ unsigned &DestReg) const {
+ // Look for the appropriate part of TSFlags
+ bool isMove = false;
+
+ unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >>
+ NVPTX::SimpleMoveShift;
+ isMove = (TSFlags == 1);
+
+ if (isMove) {
+ MachineOperand dest = MI.getOperand(0);
+ MachineOperand src = MI.getOperand(1);
+ assert(dest.isReg() && "dest of a movrr is not a reg");
+ assert(src.isReg() && "src of a movrr is not a reg");
+
+ SrcReg = src.getReg();
+ DestReg = dest.getReg();
+ return true;
+ }
+
+ return false;
+}
+
+bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const
+{
+ switch (MI.getOpcode()) {
+ default: return false;
+ case NVPTX::INT_PTX_SREG_NTID_X:
+ case NVPTX::INT_PTX_SREG_NTID_Y:
+ case NVPTX::INT_PTX_SREG_NTID_Z:
+ case NVPTX::INT_PTX_SREG_TID_X:
+ case NVPTX::INT_PTX_SREG_TID_Y:
+ case NVPTX::INT_PTX_SREG_TID_Z:
+ case NVPTX::INT_PTX_SREG_CTAID_X:
+ case NVPTX::INT_PTX_SREG_CTAID_Y:
+ case NVPTX::INT_PTX_SREG_CTAID_Z:
+ case NVPTX::INT_PTX_SREG_NCTAID_X:
+ case NVPTX::INT_PTX_SREG_NCTAID_Y:
+ case NVPTX::INT_PTX_SREG_NCTAID_Z:
+ case NVPTX::INT_PTX_SREG_WARPSIZE:
+ return true;
+ }
+}
+
+
+bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
+ unsigned &AddrSpace) const {
+ bool isLoad = false;
+ unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isLoadMask) >>
+ NVPTX::isLoadShift;
+ isLoad = (TSFlags == 1);
+ if (isLoad)
+ AddrSpace = getLdStCodeAddrSpace(MI);
+ return isLoad;
+}
+
+bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
+ unsigned &AddrSpace) const {
+ bool isStore = false;
+ unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isStoreMask) >>
+ NVPTX::isStoreShift;
+ isStore = (TSFlags == 1);
+ if (isStore)
+ AddrSpace = getLdStCodeAddrSpace(MI);
+ return isStore;
+}
+
+
+bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
+ unsigned addrspace = 0;
+ if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
+ return false;
+ if (isLoadInstr(*MI, addrspace))
+ if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+ return false;
+ if (isStoreInstr(*MI, addrspace))
+ if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+ return false;
+ return true;
+}
+
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target). Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+/// just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+/// the destination block.
+/// 3. If this block ends with a conditional branch and it falls through to
+///    a successor block, it sets TBB to be the branch destination block and a
+/// list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+/// 4. If this block ends with a conditional branch and an unconditional
+///    branch, it returns the 'true' destination in TBB, the 'false' destination
+/// in FBB, and a list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool NVPTXInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (LastInst->getOpcode() == NVPTX::GOTO) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if (LastInst->getOpcode() == NVPTX::CBranch) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(1).getMBB();
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
+ }
+ // Otherwise, don't know what this is.
+ return true;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() &&
+ isUnpredicatedTerminator(--I))
+ return true;
+
+  // If the block ends with NVPTX::CBranch followed by NVPTX::GOTO, handle it.
+ if (SecondLastInst->getOpcode() == NVPTX::CBranch &&
+ LastInst->getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ Cond.push_back(SecondLastInst->getOperand(0));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+  // If the block ends with two NVPTX::GOTOs, handle it. The second one is
+  // never executed, so remove it.
+ if (SecondLastInst->getOpcode() == NVPTX::GOTO &&
+ LastInst->getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
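+// RemoveBranch - Erase the trailing GOTO or CBranch and, if the new last
+// instruction is a CBranch, erase that too. Returns the number of
+// instructions removed.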
+unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (I->getOpcode() != NVPTX::GOTO && I->getOpcode() != NVPTX::CBranch)
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (I->getOpcode() != NVPTX::CBranch)
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned
+NVPTXInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "NVPTX branch conditions have at most one component!");
+
+ // One-way branch.
+ if (FBB == 0) {
+ if (Cond.empty()) // Unconditional branch
+ BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
+ else // Conditional branch
+ BuildMI(&MBB, DL, get(NVPTX::CBranch))
+ .addReg(Cond[0].getReg()).addMBB(TBB);
+ return 1;
+ }
+
+ // Two-way Conditional Branch.
+ BuildMI(&MBB, DL, get(NVPTX::CBranch))
+ .addReg(Cond[0].getReg()).addMBB(TBB);
+ BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
+ return 2;
+}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
new file mode 100644
index 0000000..7b8e218
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -0,0 +1,83 @@
+//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXINSTRUCTIONINFO_H
+#define NVPTXINSTRUCTIONINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "NVPTXGenInstrInfo.inc"
+
+namespace llvm {
+
+class NVPTXInstrInfo : public NVPTXGenInstrInfo
+{
+ NVPTXTargetMachine &TM;
+ const NVPTXRegisterInfo RegInfo;
+public:
+ explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
+
+ virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+ /* The following virtual functions are used in register allocation.
+ * They are not implemented because the existing interface and the logic
+ * at the caller side do not work for the elementized vector load and store.
+ *
+ * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ * int &FrameIndex) const;
+ * virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ * int &FrameIndex) const;
+ * virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ * MachineBasicBlock::iterator MBBI,
+ * unsigned SrcReg, bool isKill, int FrameIndex,
+ * const TargetRegisterClass *RC) const;
+ * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ * MachineBasicBlock::iterator MBBI,
+ * unsigned DestReg, int FrameIndex,
+ * const TargetRegisterClass *RC) const;
+ */
+
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const ;
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg,
+ unsigned &DestReg) const;
+ bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+ bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+ bool isReadSpecialReg(MachineInstr &MI) const;
+
+ virtual bool CanTailMerge(const MachineInstr *MI) const ;
+ // Branch analysis.
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const;
+ unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
+ return MI.getOperand(2).getImm();
+ }
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
new file mode 100644
index 0000000..8a410b8
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -0,0 +1,2837 @@
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction
+def NOP : NVPTXInst<(outs), (ins), "", []>;
+
+// List of vector specific properties
+def isVecLD : VecInstTypeEnum<1>;
+def isVecST : VecInstTypeEnum<2>;
+def isVecBuild : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert : VecInstTypeEnum<6>;
+def isVecDest : VecInstTypeEnum<7>;
+def isVecOther : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget.hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget.hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget.hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+ Predicate<"!Subtarget.hasAtomRedGen32() && Subtarget.hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget.hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget.hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget.hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget.hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+ Predicate<"!Subtarget.hasAtomRedGen64() && Subtarget.hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">;
+def hasVote : Predicate<"Subtarget.hasVote()">;
+def hasDouble : Predicate<"Subtarget.hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget.reqPTX20()">;
+def hasLDU : Predicate<"Subtarget.hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"UseF32FTZ">;
+
+def doFMAF32 : Predicate<"doFMAF32">;
+def doFMAF32_ftz : Predicate<"(doFMAF32 && UseF32FTZ)">;
+def doFMAF32AGG : Predicate<"doFMAF32AGG">;
+def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
+def doFMAF64 : Predicate<"doFMAF64">;
+def doFMAF64AGG : Predicate<"doFMAF64AGG">;
+def doFMADF32 : Predicate<"doFMADF32">;
+def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
+
+def doMulWide : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA">;
+def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
+
+def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
+def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
+
+def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+
+def true : Predicate<"1">;
+
+//===----------------------------------------------------------------------===//
+// Special Handling for 8-bit Operands and Operations
+//
+// PTX supports 8-bit signed and unsigned types, but does not support 8-bit
+// operations (like add, shift, etc) except for ld/st/cvt. SASS does not have
+// 8-bit registers.
+//
+// PTX ld, st and cvt instructions permit source and destination data operands
+// to be wider than the instruction-type size, so that narrow values may be
+// loaded, stored, and converted using regular-width registers.
+//
+// So in PTX generation, we
+// - always use 16-bit registers in place of 8-bit registers.
+// (8-bit variables should stay as 8-bit as they represent memory layout.)
+// - for the following 8-bit operations, we sign-ext/zero-ext the 8-bit values
+// before operation
+// . div
+// . rem
+// . neg (sign)
+// . set, setp
+// . shr
+//
+// We are patching the operations by inserting the cvt instructions in the
+// asm strings of the affected instructions.
+//
+// Since vector operations, except for ld/st, are eventually elementized, we
+// do not need to special-case the vector 8-bit operations.
+//
+//
+//===----------------------------------------------------------------------===//
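+
+// The Handle_i8* classes below build the cvt-wrapped asm strings for these
+// 8-bit operations; e.g. an i8 signed divide would use OpcStr=div.s,
+// TypeStr=s16 and CVTStr=cvt.s16.s8, so the divide itself runs on
+// sign-extended 16-bit temporaries.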
+
+// Generate string block like
+// {
+// .reg .s16 %temp1;
+// .reg .s16 %temp2;
+// cvt.s16.s8 %temp1, %a;
+// cvt.s16.s8 %temp2, %b;
+// opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8rr<string OpcStr, string TypeStr, string CVTStr> {
+ string s = !strconcat("{{\n\t",
+ !strconcat(".reg .", !strconcat(TypeStr,
+ !strconcat(" \t%temp1;\n\t",
+ !strconcat(".reg .", !strconcat(TypeStr,
+ !strconcat(" \t%temp2;\n\t",
+ !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+ !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+ !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}"))))))))))));
+}
+
+// Generate string block like
+// {
+// .reg .s16 %temp1;
+// .reg .s16 %temp2;
+// cvt.s16.s8 %temp1, %a;
+// mov.b16 %temp2, %b;
+// cvt.s16.s8 %temp2, %temp2;
+// opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ri<string OpcStr, string TypeStr, string CVTStr> {
+ string s = !strconcat("{{\n\t",
+ !strconcat(".reg .", !strconcat(TypeStr,
+ !strconcat(" \t%temp1;\n\t",
+ !strconcat(".reg .",
+ !strconcat(TypeStr, !strconcat(" \t%temp2;\n\t",
+ !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+ !strconcat("mov.b16 \t%temp2, $b;\n\t",
+ !strconcat(CVTStr, !strconcat(" \t%temp2, %temp2;\n\t",
+ !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+// Generate string block like
+// {
+// .reg .s16 %temp1;
+// .reg .s16 %temp2;
+// mov.b16 %temp1, %b;
+// cvt.s16.s8 %temp1, %temp1;
+// cvt.s16.s8 %temp2, %a;
+// opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ir<string OpcStr, string TypeStr, string CVTStr> {
+ string s = !strconcat("{{\n\t",
+ !strconcat(".reg .", !strconcat(TypeStr,
+ !strconcat(" \t%temp1;\n\t",
+ !strconcat(".reg .", !strconcat(TypeStr,
+ !strconcat(" \t%temp2;\n\t",
+ !strconcat("mov.b16 \t%temp1, $a;\n\t",
+ !strconcat(CVTStr, !strconcat(" \t%temp1, %temp1;\n\t",
+ !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+ !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+multiclass I3<string OpcStr, SDNode OpNode> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int64Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int16Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+ def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+ def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>;
+}
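+// As an illustration (names follow TableGen's defm prefixing), the
+// instantiation
+//   defm ADD : I3<"add.s", add>;
+// in the Integer Arithmetic section below defines ADDi64rr, ADDi64ri,
+// ADDi32rr, ADDi32ri, ADDi16rr, ADDi16ri, ADDi8rr and ADDi8ri; the i8
+// variants simply emit the 16-bit form of the opcode, "add.s16".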
+
+multiclass I3_i8<string OpcStr, SDNode OpNode, string TypeStr, string CVTStr> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int64Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int16Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+ def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ Handle_i8rr<OpcStr, TypeStr, CVTStr>.s,
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+ def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+ Handle_i8ri<OpcStr, TypeStr, CVTStr>.s,
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>;
+}
+
+multiclass I3_noi8<string OpcStr, SDNode OpNode> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int64Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int16Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+}
+
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+multiclass F3<string OpcStr, SDNode OpNode> {
+ def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+ def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA_ftz]>;
+ def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA_ftz]>;
+ def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+}
+
+multiclass F3_rn<string OpcStr, SDNode OpNode> {
+ def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, fpimm:$b))]>;
+ def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>;
+}
+
+multiclass F2<string OpcStr, SDNode OpNode> {
+ def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+ !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+ def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+multiclass ADD_SUB_i1<SDNode OpNode> {
+ def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
+
+
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
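+
+// Note that the carry-generating and carry-consuming forms (addc/adde,
+// subc/sube) are only provided for 32-bit operands here.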
+
+// mul.wide PTX instruction
+def SInt32Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isSignedIntN(32))
+ return true;
+ return false;
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isIntN(32))
+ return true;
+ return false;
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isSignedIntN(16))
+ return true;
+ return false;
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isIntN(16))
+ return true;
+ return false;
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+  // Check that 0 <= v < 32;
+  // only then will the result of (x << v) be i32.
+ if (v.sge(0) && v.slt(32))
+ return true;
+ return false;
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+  // Check that 0 <= v < 16;
+  // only then will the result of (x << v) be i16.
+ if (v.sge(0) && v.slt(16))
+ return true;
+ return false;
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+ const APInt &v = N->getAPIntValue();
+ APInt temp(32, 1);
+ return CurDAG->getTargetConstant(temp.shl(v), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+ const APInt &v = N->getAPIntValue();
+ APInt temp(16, 1);
+ return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
+}]>;
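+
+// Worked example: for (shl (sext x), 3), SHL2MUL32 rewrites the shift
+// amount 3 into the multiplier 1 << 3 = 8, so the patterns below select
+//   mul.wide.s32 $dst, x, 8;
+// producing the full 64-bit product directly.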
+
+def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+ (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+ (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3_noi8<"mul.hi.s", mulhs>;
+defm MULTHU : I3_noi8<"mul.hi.u", mulhu>;
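+// i8 mul.hi has no direct PTX form: below, the operands are widened to
+// 16 bits, multiplied with mul.lo, and the high 8 bits of the product are
+// extracted with a right shift by 8.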
+def MULTHSi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg \t.s16 temp1; \n\t",
+ !strconcat(".reg \t.s16 temp2; \n\t",
+ !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t",
+ !strconcat("cvt.s16.s8 \ttemp2, $b; \n\t",
+ !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t",
+ !strconcat("shr.s16 \t$dst, $dst, 8; \n\t",
+ !strconcat("}}", "")))))))),
+ [(set Int8Regs:$dst, (mulhs Int8Regs:$a, Int8Regs:$b))]>;
+def MULTHSi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg \t.s16 temp1; \n\t",
+ !strconcat(".reg \t.s16 temp2; \n\t",
+ !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t",
+ !strconcat("mov.b16 \ttemp2, $b; \n\t",
+ !strconcat("cvt.s16.s8 \ttemp2, temp2; \n\t",
+ !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t",
+ !strconcat("shr.s16 \t$dst, $dst, 8; \n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int8Regs:$dst, (mulhs Int8Regs:$a, imm:$b))]>;
+def MULTHUi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg \t.u16 temp1; \n\t",
+ !strconcat(".reg \t.u16 temp2; \n\t",
+ !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t",
+ !strconcat("cvt.u16.u8 \ttemp2, $b; \n\t",
+ !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t",
+ !strconcat("shr.u16 \t$dst, $dst, 8; \n\t",
+ !strconcat("}}", "")))))))),
+ [(set Int8Regs:$dst, (mulhu Int8Regs:$a, Int8Regs:$b))]>;
+def MULTHUi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg \t.u16 temp1; \n\t",
+ !strconcat(".reg \t.u16 temp2; \n\t",
+ !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t",
+ !strconcat("mov.b16 \ttemp2, $b; \n\t",
+ !strconcat("cvt.u16.u8 \ttemp2, temp2; \n\t",
+ !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t",
+ !strconcat("shr.u16 \t$dst, $dst, 8; \n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int8Regs:$dst, (mulhu Int8Regs:$a, imm:$b))]>;
+
+
+defm SDIV : I3_i8<"div.s", sdiv, "s16", "cvt.s16.s8">;
+defm UDIV : I3_i8<"div.u", udiv, "u16", "cvt.u16.u8">;
+
+defm SREM : I3_i8<"rem.s", srem, "s16", "cvt.s16.s8">;
+// The ri version will not be selected as DAGCombiner::visitSREM will lower it.
+defm UREM : I3_i8<"rem.u", urem, "u16", "cvt.u16.u8">;
+// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
+
+def MAD8rrr : NVPTXInst<(outs Int8Regs:$dst),
+ (ins Int8Regs:$a, Int8Regs:$b, Int8Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b),
+ Int8Regs:$c))]>;
+def MAD8rri : NVPTXInst<(outs Int8Regs:$dst),
+ (ins Int8Regs:$a, Int8Regs:$b, i8imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b),
+ imm:$c))]>;
+def MAD8rir : NVPTXInst<(outs Int8Regs:$dst),
+ (ins Int8Regs:$a, i8imm:$b, Int8Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b),
+ Int8Regs:$c))]>;
+def MAD8rii : NVPTXInst<(outs Int8Regs:$dst),
+ (ins Int8Regs:$a, i8imm:$b, i8imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b),
+ imm:$c))]>;
+
+def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (add
+ (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>;
+def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (add
+ (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>;
+def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (add
+ (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>;
+def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b),
+ imm:$c))]>;
+
+def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (add
+ (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>;
+def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (add
+ (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>;
+def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (add
+ (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>;
+def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (add
+ (mul Int32Regs:$a, imm:$b), imm:$c))]>;
+
+def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (add
+ (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>;
+def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (add
+ (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>;
+def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (add
+ (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>;
+def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (add
+ (mul Int64Regs:$a, imm:$b), imm:$c))]>;
+
+
+def INEG8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+ !strconcat("cvt.s16.s8 \t$dst, $src;\n\t",
+ "neg.s16 \t$dst, $dst;"),
+ [(set Int8Regs:$dst, (ineg Int8Regs:$src))]>;
+def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "neg.s16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "neg.s32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "neg.s64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+ if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
+ return false;
+ float f = (float)N->getValueAPF().convertToFloat();
+ return (f==1.0f);
+}]>;
+// Constant (double)1.0
+def DoubleConst1 : PatLeaf<(fpimm), [{
+ if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
+ return false;
+ double d = (double)N->getValueAPF().convertToDouble();
+ return (d==1.0);
+}]>;
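+
+// These leaves let a division with constant numerator 1.0 be selected as a
+// reciprocal: e.g. FDIV641r below matches (fdiv DoubleConst1:$a, $b) and
+// emits "rcp.rn.f64 $dst, $b;".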
+
+defm FADD : F3<"add", fadd>;
+defm FSUB : F3<"sub", fsub>;
+defm FMUL : F3<"mul", fmul>;
+
+defm FADD_rn : F3_rn<"add", fadd>;
+defm FSUB_rn : F3_rn<"sub", fsub>;
+defm FMUL_rn : F3_rn<"mul", fmul>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b),
+ "rcp.rn.f64 \t$dst, $b;",
+ [(set Float64Regs:$dst,
+ (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst,
+ (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst,
+ (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
+//
+def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[reqPTX20]>;
+
+
+multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
+ def rrr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst, (fadd
+ (fmul Float32Regs:$a, Float32Regs:$b),
+ Float32Regs:$c))]>, Requires<[Pred]>;
+  // This is to work around a weird bug in Tablegen that does not
+  // automatically generate the permuted rule rrr2 below from the rrr rule
+  // above, so we explicitly add it here. This happens to FMA32 only.
+  // See the comments at FMAD32 and FMA32 for more information.
+ def rrr2 : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst, (fadd Float32Regs:$c,
+ (fmul Float32Regs:$a, Float32Regs:$b)))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst, (fadd
+ (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst, (fadd
+ (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst, (fadd
+ (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>,
+ Requires<[Pred]>;
+}
+
+multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
+ def rrr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst, (fadd
+ (fmul Float64Regs:$a, Float64Regs:$b),
+ Float64Regs:$c))]>, Requires<[Pred]>;
+ def rri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a,
+ Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>;
+ def rir : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst, (fadd
+ (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst, (fadd
+ (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>,
+ Requires<[Pred]>;
+}
+
+// Due to an unknown reason (most likely a bug in tablegen), tablegen does
+// not automatically generate the rrr2 rule from the rrr rule (see
+// FPCONTRACT32) for FMA32, though it does for FMAD32.
+// If we reverse the order of the following two lines, then the rrr2 rule
+// will be generated for FMA32, but the rrr rule will not.
+// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
+defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>;
+defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>;
+defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
+defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
+defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
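+
+// With the above, e.g. (fadd (fmul a, b), c) on f32 selects FMA32rrr and
+// emits "fma.rn.f32 $dst, $a, $b, $c;" when the doFMAF32 predicate holds.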
+
+// b*c-a => fmad(b, c, -a)
+multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
+ (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
+ Requires<[Pred]>;
+}
+
+// a-b*c => fmad(-b,c, a)
+// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c
+// b*c-a => fmad(b, c, -a)
+// - legal because b*c-a <=> b*c+(-a)
+multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)),
+ (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>,
+ Requires<[Pred]>;
+ def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
+ (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
+ Requires<[Pred]>;
+}
+
+// a-b*c => fmad(-b,c, a)
+// b*c-a => fmad(b, c, -a)
+multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)),
+ (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a),
+ (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>,
+ Requires<[Pred]>;
+}
+
+defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
+defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
+defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>;
+defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>;
+defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
+
+def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "sin.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
+def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "cos.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
+
+//-----------------------------------
+// Logical Arithmetic
+//-----------------------------------
+
+multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> {
+ def b1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+ def b8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+ def b8ri: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
+ def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int16Regs:$b))]>;
+ def b16ri: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def b32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def b32ri: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def b64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int64Regs:$b))]>;
+ def b64ri: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+}
+
+defm OR : LOG_FORMAT<"or", or>;
+defm AND : LOG_FORMAT<"and", and>;
+defm XOR : LOG_FORMAT<"xor", xor>;
+
+def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+ "not.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+def NOT8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+ "not.b16 \t$dst, $src;",
+ [(set Int8Regs:$dst, (not Int8Regs:$src))]>;
+def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "not.b16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+def NOT32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "not.b32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+def NOT64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "not.b64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+
+// For shifts, the second src operand must be a 32-bit value
+multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int32Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+ (i32 imm:$b)))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int32Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ (i32 imm:$b)))]>;
+ def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+ Int32Regs:$b))]>;
+ def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+ (i32 imm:$b)))]>;
+}
+
+defm SHL : LSHIFT_FORMAT<"shl.b", shl>;
+
+// For shifts, the second src operand must be a 32-bit value.
+// For the 8-bit forms we also need a cvt to widen the operand being shifted.
+multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode, string CVTStr> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int32Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+ (i32 imm:$b)))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int32Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ (i32 imm:$b)))]>;
+ def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b),
+ !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t",
+ !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+ Int32Regs:$b))]>;
+ def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b),
+ !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t",
+ !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))),
+ [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+ (i32 imm:$b)))]>;
+}
+
+defm SRA : RSHIFT_FORMAT<"shr.s", sra, "cvt.s16.s8">;
+defm SRL : RSHIFT_FORMAT<"shr.u", srl, "cvt.u16.u8">;
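+
+// For example, SRAi8rr first widens with "cvt.s16.s8 $dst, $a;" and then
+// shifts with "shr.s16 $dst, $dst, $b;".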
+
+// 32bit
+def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %lhs;\n\t",
+ !strconcat(".reg .b32 %rhs;\n\t",
+ !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t",
+ !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t",
+ !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))),
+ []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(32-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
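+
+// For instance, (rotl x, 5) selects ROT32imm_sw with $amt1 = 5 and
+// $amt2 = SUB_FRM_32(5) = 27, i.e. the software sequence
+//   shl.b32 %lhs, x, 5; shr.b32 %rhs, x, 27; add.u32 $dst, %lhs, %rhs;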
+
+def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %lhs;\n\t",
+ !strconcat(".reg .b32 %rhs;\n\t",
+ !strconcat(".reg .b32 %amt2;\n\t",
+ !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+ !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %lhs;\n\t",
+ !strconcat(".reg .b32 %rhs;\n\t",
+ !strconcat(".reg .b32 %amt2;\n\t",
+ !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+ !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
+
+// 64bit
+def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+ i32imm:$amt1, i32imm:$amt2),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b64 %lhs;\n\t",
+ !strconcat(".reg .b64 %rhs;\n\t",
+ !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t",
+ !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t",
+ !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))),
+ []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(64-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b64 %lhs;\n\t",
+ !strconcat(".reg .b64 %rhs;\n\t",
+ !strconcat(".reg .u32 %amt2;\n\t",
+ !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+ !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b64 %lhs;\n\t",
+ !strconcat(".reg .b64 %rhs;\n\t",
+ !strconcat(".reg .u32 %amt2;\n\t",
+ !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+ !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+
+//-----------------------------------
+// Data Movement (Load / Store, Move)
+//-----------------------------------
+
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
+ [SDNPWantRoot]>;
+def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
+ [SDNPWantRoot]>;
+
+def MEMri : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops Int32Regs, i32imm);
+}
+def MEMri64 : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops Int64Regs, i64imm);
+}
+
+def imem : Operand<iPTR> {
+ let PrintMethod = "printOperand";
+}
+
+def imemAny : Operand<iPTRAny> {
+ let PrintMethod = "printOperand";
+}
+
+def LdStCode : Operand<i32> {
+ let PrintMethod = "printLdStCode";
+}
+
+def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+
+def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+ "mov.u32 \t$dst, $a;",
+ [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+ "mov.u64 \t$dst, $a;",
+ [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+// copyPhysReg is hard-coded in NVPTXInstrInfo.cpp
+let IsSimpleMove=1 in {
+def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+ "mov.pred \t$dst, $sss;", []>;
+def IMOV8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$sss),
+ "mov.u16 \t$dst, $sss;", []>;
+def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+ "mov.u16 \t$dst, $sss;", []>;
+def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+ "mov.u32 \t$dst, $sss;", []>;
+def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+ "mov.u64 \t$dst, $sss;", []>;
+
+def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "mov.f32 \t$dst, $src;", []>;
+def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+ "mov.f64 \t$dst, $src;", []>;
+}
+def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+ "mov.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV8ri: NVPTXInst<(outs Int8Regs:$dst), (ins i8imm:$src),
+ "mov.u16 \t$dst, $src;",
+ [(set Int8Regs:$dst, imm:$src)]>;
+def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+ "mov.u16 \t$dst, $src;",
+ [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+ "mov.u32 \t$dst, $src;",
+ [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+ "mov.u64 \t$dst, $src;",
+ [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+ "mov.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+ "mov.f64 \t$dst, $src;",
+ [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+ "add.u32 \t$dst, ${addr:add};",
+ [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+ "add.u64 \t$dst, ${addr:add};",
+ [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+// Generate string block like
+// {
+// .reg .pred p;
+// setp.gt.s16 p, %a, %b;
+// selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s sz1=16 sz2=16 d=%dst a=%a b=%b
+class Set_Str<string OpcStr, string sz1, string sz2, string d, string a,
+ string b> {
+ string t1 = "{{\n\t.reg .pred p;\n\t";
+ string t2 = !strconcat(t1 , OpcStr);
+ string t3 = !strconcat(t2 , sz1);
+ string t4 = !strconcat(t3 , " \tp, ");
+ string t5 = !strconcat(t4 , a);
+ string t6 = !strconcat(t5 , ", ");
+ string t7 = !strconcat(t6 , b);
+ string t8 = !strconcat(t7 , ";\n\tselp.s");
+ string t9 = !strconcat(t8 , sz2);
+ string t10 = !strconcat(t9, " \t");
+ string t11 = !strconcat(t10, d);
+ string s = !strconcat(t11, ", -1, 0, p;\n\t}}");
+}
+
+// Generate string block like
+// {
+// .reg .pred p;
+// .reg .s16 %temp1;
+// .reg .s16 %temp2;
+// cvt.s16.s8 %temp1, %a;
+// cvt.s16.s8 %temp2, %b;
+// setp.gt.s16 p, %temp1, %temp2;
+// selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s d=%dst a=%a b=%b type=s16 cvt=cvt.s16.s8
+class Set_Stri8<string OpcStr, string d, string a, string b, string type,
+ string cvt> {
+ string t1 = "{{\n\t.reg .pred p;\n\t";
+ string t2 = !strconcat(t1, ".reg .");
+ string t3 = !strconcat(t2, type);
+ string t4 = !strconcat(t3, " %temp1;\n\t");
+ string t5 = !strconcat(t4, ".reg .");
+ string t6 = !strconcat(t5, type);
+ string t7 = !strconcat(t6, " %temp2;\n\t");
+ string t8 = !strconcat(t7, cvt);
+ string t9 = !strconcat(t8, " \t%temp1, ");
+ string t10 = !strconcat(t9, a);
+ string t11 = !strconcat(t10, ";\n\t");
+ string t12 = !strconcat(t11, cvt);
+ string t13 = !strconcat(t12, " \t%temp2, ");
+ string t14 = !strconcat(t13, b);
+ string t15 = !strconcat(t14, ";\n\t");
+ string t16 = !strconcat(t15, OpcStr);
+ string t17 = !strconcat(t16, "16");
+ string t18 = !strconcat(t17, " \tp, %temp1, %temp2;\n\t");
+ string t19 = !strconcat(t18, "selp.s16 \t");
+ string t20 = !strconcat(t19, d);
+ string s = !strconcat(t20, ", -1, 0, p;\n\t}}");
+}
+
+multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode,
+ string TypeStr, string CVTStr> {
+ def i8rr_toi8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ Set_Stri8<OpcStr, "$dst", "$a", "$b", TypeStr, CVTStr>.s,
+ []>;
+ def i16rr_toi16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+ Int16Regs:$b),
+ Set_Str<OpcStr, "16", "16", "$dst", "$a", "$b">.s,
+ []>;
+ def i32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ Set_Str<OpcStr, "32", "32", "$dst", "$a", "$b">.s,
+ []>;
+ def i64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+ Int64Regs:$b),
+ Set_Str<OpcStr, "64", "64", "$dst", "$a", "$b">.s,
+ []>;
+
+ def i8rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ Handle_i8rr<OpcStr, TypeStr, CVTStr>.s,
+ [(set Int1Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+ def i8ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+ Handle_i8ri<OpcStr, TypeStr, CVTStr>.s,
+ [(set Int1Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
+ def i8ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i8imm:$a, Int8Regs:$b),
+ Handle_i8ir<OpcStr, TypeStr, CVTStr>.s,
+ [(set Int1Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>;
+ def i16rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def i16ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def i16ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i16imm:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>;
+ def i32rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i32imm:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>;
+ def i64rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def i64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i64imm:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>;
+
+ def i8rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+ Handle_i8rr<OpcStr_u32, TypeStr, CVTStr>.s,
+ [(set Int32Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+ def i8ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+ Handle_i8ri<OpcStr_u32, TypeStr, CVTStr>.s,
+ [(set Int32Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
+ def i8ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i8imm:$a, Int8Regs:$b),
+ Handle_i8ir<OpcStr_u32, TypeStr, CVTStr>.s,
+ [(set Int32Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>;
+ def i16rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a,
+ Int16Regs:$b),
+ !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def i16ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def i16ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i16imm:$a, Int16Regs:$b),
+ !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>;
+ def i32rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i32ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, Int32Regs:$b),
+ !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>;
+ def i64rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a,
+ Int64Regs:$b),
+ !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def i64ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i64ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i64imm:$a, Int64Regs:$b),
+ !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>;
+}
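+// As an illustration, the instantiation below
+//   defm ISetSGT : ISET_FORMAT<"setp.gt.s", "set.gt.u32.s", setgt,
+//                              "s16", "cvt.s16.s8">;
+// defines, among others, ISetSGTi32rr_p, which emits
+//   setp.gt.s32 $dst, $a, $b;
+// and ISetSGTi8rr_p, which widens both i8 operands with cvt.s16.s8 before
+// the 16-bit compare.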
+
+multiclass FSET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode> {
+ def f32rr_toi32_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a,
+ Float32Regs:$b),
+ Set_Str<OpcStr, "ftz.f32", "32", "$dst", "$a", "$b">.s,
+ []>, Requires<[doF32FTZ]>;
+ def f32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a,
+ Float32Regs:$b),
+ Set_Str<OpcStr, "f32", "32", "$dst", "$a", "$b">.s,
+ []>;
+ def f64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Float64Regs:$a,
+ Float64Regs:$b),
+ Set_Str<OpcStr, "f64", "64", "$dst", "$a", "$b">.s,
+ []>;
+ def f64rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float64Regs:$a,
+ Float64Regs:$b),
+ Set_Str<OpcStr, "f64", "32", "$dst", "$a", "$b">.s,
+ []>;
+
+  def f32rr_p_ftz: NVPTXInst<(outs Int1Regs:$dst),
+                 (ins Float32Regs:$a, Float32Regs:$b),
+                 !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
+                 [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                 Requires<[doF32FTZ]>;
+ def f32rr_p: NVPTXInst<(outs Int1Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ def f32ri_p_ftz: NVPTXInst<(outs Int1Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+ def f32ir_p_ftz: NVPTXInst<(outs Int1Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
+ !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
+ def f64rr_p: NVPTXInst<(outs Int1Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ def f64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+ def f64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f64imm:$a, Float64Regs:$b),
+ !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>;
+
+ def f32rr_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ def f32rr_u32: NVPTXInst<(outs Int32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ def f32ri_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+ def f32ri_u32: NVPTXInst<(outs Int32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+ def f32ir_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
+ def f32ir_u32: NVPTXInst<(outs Int32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
+ def f64rr_u32: NVPTXInst<(outs Int32Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ def f64ri_u32: NVPTXInst<(outs Int32Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+ def f64ir_u32: NVPTXInst<(outs Int32Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b),
+ !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>;
+}
+
+defm ISetSGT
+: ISET_FORMAT<"setp.gt.s", "set.gt.u32.s", setgt, "s16", "cvt.s16.s8">;
+defm ISetUGT
+: ISET_FORMAT<"setp.gt.u", "set.gt.u32.u", setugt, "u16", "cvt.u16.u8">;
+defm ISetSLT
+: ISET_FORMAT<"setp.lt.s", "set.lt.u32.s", setlt, "s16", "cvt.s16.s8">;
+defm ISetULT
+: ISET_FORMAT<"setp.lt.u", "set.lt.u32.u", setult, "u16", "cvt.u16.u8">;
+defm ISetSGE
+: ISET_FORMAT<"setp.ge.s", "set.ge.u32.s", setge, "s16", "cvt.s16.s8">;
+defm ISetUGE
+: ISET_FORMAT<"setp.ge.u", "set.ge.u32.u", setuge, "u16", "cvt.u16.u8">;
+defm ISetSLE
+: ISET_FORMAT<"setp.le.s", "set.le.u32.s", setle, "s16", "cvt.s16.s8">;
+defm ISetULE
+: ISET_FORMAT<"setp.le.u", "set.le.u32.u", setule, "u16", "cvt.u16.u8">;
+defm ISetSEQ
+: ISET_FORMAT<"setp.eq.s", "set.eq.u32.s", seteq, "s16", "cvt.s16.s8">;
+defm ISetUEQ
+: ISET_FORMAT<"setp.eq.u", "set.eq.u32.u", setueq, "u16", "cvt.u16.u8">;
+defm ISetSNE
+: ISET_FORMAT<"setp.ne.s", "set.ne.u32.s", setne, "s16", "cvt.s16.s8">;
+defm ISetUNE
+: ISET_FORMAT<"setp.ne.u", "set.ne.u32.u", setune, "u16", "cvt.u16.u8">;
+
+def ISetSNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+ (ins Int1Regs:$a, Int1Regs:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>;
+def ISetUNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+ (ins Int1Regs:$a, Int1Regs:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (setune Int1Regs:$a, Int1Regs:$b))]>;
+def ISetSEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+ (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .pred temp;\n\t",
+ !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+ !strconcat("not.pred \t$dst, temp;\n\t}}","")))),
+ [(set Int1Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>;
+def ISetUEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+ (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .pred temp;\n\t",
+ !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+ !strconcat("not.pred \t$dst, temp;\n\t}}","")))),
+ [(set Int1Regs:$dst, (setueq Int1Regs:$a, Int1Regs:$b))]>;
+
+// Compare 2 i1's and produce a u32
+def ISETSNEi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .pred temp;\n\t",
+ !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+ !strconcat("selp.u32 \t$dst, -1, 0, temp;", "\n\t}}")))),
+ [(set Int32Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>;
+def ISETSEQi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .pred temp;\n\t",
+ !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+ !strconcat("selp.u32 \t$dst, 0, -1, temp;", "\n\t}}")))),
+ [(set Int32Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>;
+
+defm FSetGT : FSET_FORMAT<"setp.gt.", "set.gt.u32.", setogt>;
+defm FSetLT : FSET_FORMAT<"setp.lt.", "set.lt.u32.", setolt>;
+defm FSetGE : FSET_FORMAT<"setp.ge.", "set.ge.u32.", setoge>;
+defm FSetLE : FSET_FORMAT<"setp.le.", "set.le.u32.", setole>;
+defm FSetEQ : FSET_FORMAT<"setp.eq.", "set.eq.u32.", setoeq>;
+defm FSetNE : FSET_FORMAT<"setp.ne.", "set.ne.u32.", setone>;
+
+defm FSetUGT : FSET_FORMAT<"setp.gtu.", "set.gtu.u32.", setugt>;
+defm FSetULT : FSET_FORMAT<"setp.ltu.", "set.ltu.u32.", setult>;
+defm FSetUGE : FSET_FORMAT<"setp.geu.", "set.geu.u32.", setuge>;
+defm FSetULE : FSET_FORMAT<"setp.leu.", "set.leu.u32.", setule>;
+defm FSetUEQ : FSET_FORMAT<"setp.equ.", "set.equ.u32.", setueq>;
+defm FSetUNE : FSET_FORMAT<"setp.neu.", "set.neu.u32.", setune>;
+
+defm FSetNUM : FSET_FORMAT<"setp.num.", "set.num.u32.", seto>;
+defm FSetNAN : FSET_FORMAT<"setp.nan.", "set.nan.u32.", setuo>;
+
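+// PTX has no selp on predicate registers, so an i1 select is lowered to
+// (p & a) | (!p & b) using the predicate logic instructions.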
+def SELECTi1rr : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
+ (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
+ (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
+def SELECTi8rr : NVPTXInst<(outs Int8Regs:$dst),
+ (ins Int8Regs:$a, Int8Regs:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, Int8Regs:$b))]>;
+def SELECTi8ri : NVPTXInst<(outs Int8Regs:$dst),
+ (ins Int8Regs:$a, i8imm:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, imm:$b))]>;
+def SELECTi8ir : NVPTXInst<(outs Int8Regs:$dst),
+ (ins i8imm:$a, Int8Regs:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, Int8Regs:$b))]>;
+def SELECTi8ii : NVPTXInst<(outs Int8Regs:$dst),
+ (ins i8imm:$a, i8imm:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTi16rr : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, Int16Regs:$b))]>;
+def SELECTi16ri : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, imm:$b))]>;
+def SELECTi16ir : NVPTXInst<(outs Int16Regs:$dst),
+ (ins i16imm:$a, Int16Regs:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, Int16Regs:$b))]>;
+def SELECTi16ii : NVPTXInst<(outs Int16Regs:$dst),
+ (ins i16imm:$a, i16imm:$b, Int1Regs:$p),
+ "selp.b16 \t$dst, $a, $b, $p;",
+ [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTi32rr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
+ "selp.b32 \t$dst, $a, $b, $p;",
+ [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, Int32Regs:$b))]>;
+def SELECTi32ri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, Int1Regs:$p),
+ "selp.b32 \t$dst, $a, $b, $p;",
+ [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, imm:$b))]>;
+def SELECTi32ir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins i32imm:$a, Int32Regs:$b, Int1Regs:$p),
+ "selp.b32 \t$dst, $a, $b, $p;",
+ [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, Int32Regs:$b))]>;
+def SELECTi32ii : NVPTXInst<(outs Int32Regs:$dst),
+ (ins i32imm:$a, i32imm:$b, Int1Regs:$p),
+ "selp.b32 \t$dst, $a, $b, $p;",
+ [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTi64rr : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, Int1Regs:$p),
+ "selp.b64 \t$dst, $a, $b, $p;",
+ [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, Int64Regs:$b))]>;
+def SELECTi64ri : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, Int1Regs:$p),
+ "selp.b64 \t$dst, $a, $b, $p;",
+ [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, imm:$b))]>;
+def SELECTi64ir : NVPTXInst<(outs Int64Regs:$dst),
+ (ins i64imm:$a, Int64Regs:$b, Int1Regs:$p),
+ "selp.b64 \t$dst, $a, $b, $p;",
+ [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, Int64Regs:$b))]>;
+def SELECTi64ii : NVPTXInst<(outs Int64Regs:$dst),
+ (ins i64imm:$a, i64imm:$b, Int1Regs:$p),
+ "selp.b64 \t$dst, $a, $b, $p;",
+ [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTf32rr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Int1Regs:$p),
+ "selp.f32 \t$dst, $a, $b, $p;",
+ [(set Float32Regs:$dst,
+ (select Int1Regs:$p, Float32Regs:$a, Float32Regs:$b))]>;
+def SELECTf32ri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, Int1Regs:$p),
+ "selp.f32 \t$dst, $a, $b, $p;",
+ [(set Float32Regs:$dst, (select Int1Regs:$p, Float32Regs:$a, fpimm:$b))]>;
+def SELECTf32ir : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b, Int1Regs:$p),
+ "selp.f32 \t$dst, $a, $b, $p;",
+ [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float32Regs:$b))]>;
+def SELECTf32ii : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, f32imm:$b, Int1Regs:$p),
+ "selp.f32 \t$dst, $a, $b, $p;",
+ [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>;
+
+def SELECTf64rr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b, Int1Regs:$p),
+ "selp.f64 \t$dst, $a, $b, $p;",
+ [(set Float64Regs:$dst,
+ (select Int1Regs:$p, Float64Regs:$a, Float64Regs:$b))]>;
+def SELECTf64ri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b, Int1Regs:$p),
+ "selp.f64 \t$dst, $a, $b, $p;",
+ [(set Float64Regs:$dst, (select Int1Regs:$p, Float64Regs:$a, fpimm:$b))]>;
+def SELECTf64ir : NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b, Int1Regs:$p),
+ "selp.f64 \t$dst, $a, $b, $p;",
+ [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float64Regs:$b))]>;
+def SELECTf64ii : NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, f64imm:$b, Int1Regs:$p),
+ "selp.f64 \t $dst, $a, $b, $p;",
+ [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>;
+
+//def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>,
+ SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+def SDTMoveRetvalProfile : SDTypeProfile<0, 1, []>;
+def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+
+def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam",
+ SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam",
+ SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveToParam : SDNode<"NVPTXISD::MoveToParam", SDTStoreParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile,
+ []>;
+def MoveRetval : SDNode<"NVPTXISD::MoveRetval", SDTMoveRetvalProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetval : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def MoveToRetval : SDNode<"NVPTXISD::MoveToRetval", SDTStoreRetvalProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam",
+ SDTPseudoUseParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t$dst, [retval0+$b];"),
+ [(set regclass:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
+
+class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat(!strconcat("mov", opstr),
+ "\t$dst, retval$b;"),
+ [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], $val;"),
+ [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>;
+
+class MoveToParamInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("mov", opstr),
+ "\tparam$a, $val;"),
+ [(MoveToParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>;
+
+class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval0+$a], $val;"),
+ [(StoreRetval (i32 imm:$a), regclass:$val)]>;
+
+class MoveToRetvalInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins i32imm:$num, regclass:$val),
+ !strconcat(!strconcat("mov", opstr),
+ "\tfunc_retval$num, $val;"),
+ [(MoveToRetval (i32 imm:$num), regclass:$val)]>;
+
+class MoveRetvalInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val),
+ !strconcat(!strconcat("mov", opstr),
+ "\tfunc_retval0, $val;"),
+ [(MoveRetval regclass:$val)]>;
+
+def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+"call (retval0), ",
+ [(PrintCall (i32 1))]>;
+def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1), ",
+ [(PrintCall (i32 2))]>;
+def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2), ",
+ [(PrintCall (i32 3))]>;
+def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3), ",
+ [(PrintCall (i32 4))]>;
+def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4), ",
+ [(PrintCall (i32 5))]>;
+def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5), ",
+ [(PrintCall (i32 6))]>;
+def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+ [(PrintCall (i32 7))]>;
+def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call (retval0, retval1, retval2, retval3, retval4",
+ ", retval5, retval6, retval7), "),
+ [(PrintCall (i32 8))]>;
+
+def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ",
+ [(PrintCall (i32 0))]>;
+
+def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins),
+"call.uni (retval0), ",
+ [(PrintCallUni (i32 1))]>;
+def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1), ",
+ [(PrintCallUni (i32 2))]>;
+def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2), ",
+ [(PrintCallUni (i32 3))]>;
+def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3), ",
+ [(PrintCallUni (i32 4))]>;
+def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4), ",
+ [(PrintCallUni (i32 5))]>;
+def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ",
+ [(PrintCallUni (i32 6))]>;
+def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+ [(PrintCallUni (i32 7))]>;
+def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4",
+ ", retval5, retval6, retval7), "),
+ [(PrintCallUni (i32 8))]>;
+
+def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ",
+ [(PrintCallUni (i32 0))]>;
+
+def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
+def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
+def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
+def LoadParamMemI8 : LoadParamMemInst<Int8Regs, ".b8">;
+
+//def LoadParamMemI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b),
+// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t",
+// "cvt.u16.u32\t$dst, temp_param_reg;"),
+// [(set Int16Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
+//def LoadParamMemI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b),
+// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t",
+// "cvt.u16.u32\t$dst, temp_param_reg;"),
+// [(set Int8Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
+
+def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
+def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
+
+def LoadParamRegI64 : LoadParamRegInst<Int64Regs, ".b64">;
+def LoadParamRegI32 : LoadParamRegInst<Int32Regs, ".b32">;
+def LoadParamRegI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b),
+ "cvt.u16.u32\t$dst, retval$b;",
+ [(set Int16Regs:$dst,
+ (LoadParam (i32 0), (i32 imm:$b)))]>;
+def LoadParamRegI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b),
+ "cvt.u16.u32\t$dst, retval$b;",
+ [(set Int8Regs:$dst,
+ (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+def LoadParamRegF32 : LoadParamRegInst<Float32Regs, ".f32">;
+def LoadParamRegF64 : LoadParamRegInst<Float64Regs, ".f64">;
+
+def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
+def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
+
+def StoreParamI16 : NVPTXInst<(outs),
+ (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+ "st.param.b16\t[param$a+$b], $val;",
+ [(StoreParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+
+def StoreParamI8 : NVPTXInst<(outs),
+ (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+ "st.param.b8\t[param$a+$b], $val;",
+ [(StoreParam
+ (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+
+def StoreParamS32I16 : NVPTXInst<(outs),
+ (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+ !strconcat("cvt.s32.s16\ttemp_param_reg, $val;\n\t",
+ "st.param.b32\t[param$a+$b], temp_param_reg;"),
+ [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+def StoreParamU32I16 : NVPTXInst<(outs),
+ (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+ !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
+ "st.param.b32\t[param$a+$b], temp_param_reg;"),
+ [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+
+def StoreParamU32I8 : NVPTXInst<(outs),
+ (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+ !strconcat("cvt.u32.u8\ttemp_param_reg, $val;\n\t",
+ "st.param.b32\t[param$a+$b], temp_param_reg;"),
+ [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+def StoreParamS32I8 : NVPTXInst<(outs),
+ (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+ !strconcat("cvt.s32.s8\ttemp_param_reg, $val;\n\t",
+ "st.param.b32\t[param$a+$b], temp_param_reg;"),
+ [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+
+def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
+
+def MoveToParamI64 : MoveToParamInst<Int64Regs, ".b64">;
+def MoveToParamI32 : MoveToParamInst<Int32Regs, ".b32">;
+def MoveToParamF64 : MoveToParamInst<Float64Regs, ".f64">;
+def MoveToParamF32 : MoveToParamInst<Float32Regs, ".f32">;
+def MoveToParamI16 : NVPTXInst<(outs),
+ (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+ !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
+ "mov.b32\tparam$a, temp_param_reg;"),
+ [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+def MoveToParamI8 : NVPTXInst<(outs),
+ (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+ !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
+ "mov.b32\tparam$a, temp_param_reg;"),
+ [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+
+def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
+def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
+def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
+def StoreRetvalI8 : StoreRetvalInst<Int8Regs, ".b8">;
+
+//def StoreRetvalI16 : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a),
+// !strconcat("\{\n\t",
+// !strconcat(".reg .b32 temp_retval_reg;\n\t",
+// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t",
+// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))),
+// [(StoreRetval (i32 imm:$a), Int16Regs:$val)]>;
+//def StoreRetvalI8 : NVPTXInst<(outs), (ins Int8Regs:$val, i32imm:$a),
+// !strconcat("\{\n\t",
+// !strconcat(".reg .b32 temp_retval_reg;\n\t",
+// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t",
+// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))),
+// [(StoreRetval (i32 imm:$a), Int8Regs:$val)]>;
+
+def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
+def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
+
+def MoveRetvalI64 : MoveRetvalInst<Int64Regs, ".b64">;
+def MoveRetvalI32 : MoveRetvalInst<Int32Regs, ".b32">;
+def MoveRetvalI16 : MoveRetvalInst<Int16Regs, ".b16">;
+def MoveRetvalI8 : MoveRetvalInst<Int8Regs, ".b8">;
+def MoveRetvalF64 : MoveRetvalInst<Float64Regs, ".f64">;
+def MoveRetvalF32 : MoveRetvalInst<Float32Regs, ".f32">;
+
+def MoveToRetvalI64 : MoveToRetvalInst<Int64Regs, ".b64">;
+def MoveToRetvalI32 : MoveToRetvalInst<Int32Regs, ".b32">;
+def MoveToRetvalF64 : MoveToRetvalInst<Float64Regs, ".f64">;
+def MoveToRetvalF32 : MoveToRetvalInst<Float32Regs, ".f32">;
+def MoveToRetvalI16 : NVPTXInst<(outs), (ins i32imm:$num, Int16Regs:$val),
+ "cvt.u32.u16\tfunc_retval$num, $val;",
+ [(MoveToRetval (i32 imm:$num), Int16Regs:$val)]>;
+def MoveToRetvalI8 : NVPTXInst<(outs), (ins i32imm:$num, Int8Regs:$val),
+ "cvt.u32.u16\tfunc_retval$num, $val;",
+ [(MoveToRetval (i32 imm:$num), Int8Regs:$val)]>;
+
+def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
+def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
+def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
+def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
+
+class CallArgInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+ [(CallArg (i32 0), regclass:$a)]>;
+
+class LastCallArgInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$a), "$a",
+ [(LastCallArg (i32 0), regclass:$a)]>;
+
+def CallArgI64 : CallArgInst<Int64Regs>;
+def CallArgI32 : CallArgInst<Int32Regs>;
+def CallArgI16 : CallArgInst<Int16Regs>;
+def CallArgI8 : CallArgInst<Int8Regs>;
+
+def CallArgF64 : CallArgInst<Float64Regs>;
+def CallArgF32 : CallArgInst<Float32Regs>;
+
+def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+def LastCallArgI8 : LastCallArgInst<Int8Regs>;
+
+def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+
+def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
+ [(CallArg (i32 0), (i32 imm:$a))]>;
+def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
+ [(LastCallArg (i32 0), (i32 imm:$a))]>;
+
+def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
+ [(CallArg (i32 1), (i32 imm:$a))]>;
+def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
+ [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr),
+ "$addr, ",
+ [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr),
+ "$addr, ",
+ [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+ "$addr, ",
+ [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val),
+ ", prototype_$val;",
+ [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst : NVPTXInst<(outs),
+ (ins i32imm:$align, i32imm:$size, i32imm:$num),
+ ".param .align $align .b8 retval$num[$size];",
+ [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".param .b$size retval$num;",
+ [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".reg .b$size retval$num;",
+ [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst : NVPTXInst<(outs),
+ (ins i32imm:$align, i32imm:$a, i32imm:$size),
+ ".param .align $align .b8 param$a[$size];",
+ [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".param .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".reg .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+
+class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
+ NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+ !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"),
+ [(set regclass:$dst, (MoveParam regclass:$src))]>;
+
+def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
+def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.u16.u32\t$dst, $src;",
+ [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamI8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+ "cvt.u16.u32\t$dst, $src;",
+ [(set Int8Regs:$dst, (MoveParam Int8Regs:$src))]>;
+def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+
+class PseudoUseParamInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$src),
+ "// Pseudo use of $src",
+ [(PseudoUseParam regclass:$src)]>;
+
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+def PseudoUseParamI8 : PseudoUseParamInst<Int8Regs>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+
+
+//
+// Load / Store Handling
+//
+multiclass LD<NVPTXRegClass regclass> {
+ def _avar : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr];"), []>;
+ def _areg : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr];"), []>;
+ def _ari : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr+$offset];"), []>;
+ def _asi : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr+$offset];"), []>;
+}
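+
+// A sketch of how the modified operands above print: the ${...:modifier}
+// pieces are expanded by the LdStCode operand printer, so the exact strings
+// here are illustrative only. With isVol = volatile, addsp = global, a
+// scalar Vec, Sign = u, and fromWidth = 32, the _avar form would emit
+// something like
+//   ld.volatile.global.u32  %r0, [foo];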
+
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LD_i8 : LD<Int8Regs>;
+defm LD_i16 : LD<Int16Regs>;
+defm LD_i32 : LD<Int32Regs>;
+defm LD_i64 : LD<Int64Regs>;
+defm LD_f32 : LD<Float32Regs>;
+defm LD_f64 : LD<Float64Regs>;
+}
+
+let VecInstType=isVecLD.Value, mayLoad=1, neverHasSideEffects=1 in {
+defm LD_v2i8 : LD<V2I8Regs>;
+defm LD_v4i8 : LD<V4I8Regs>;
+defm LD_v2i16 : LD<V2I16Regs>;
+defm LD_v4i16 : LD<V4I16Regs>;
+defm LD_v2i32 : LD<V2I32Regs>;
+defm LD_v4i32 : LD<V4I32Regs>;
+defm LD_v2f32 : LD<V2F32Regs>;
+defm LD_v4f32 : LD<V4F32Regs>;
+defm LD_v2i64 : LD<V2I64Regs>;
+defm LD_v2f64 : LD<V2F64Regs>;
+}
+
+multiclass ST<NVPTXRegClass regclass> {
+ def _avar : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr], $src;"), []>;
+ def _areg : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr], $src;"), []>;
+ def _ari : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr+$offset], $src;"), []>;
+ def _asi : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr+$offset], $src;"), []>;
+}
+
+let mayStore=1, neverHasSideEffects=1 in {
+defm ST_i8 : ST<Int8Regs>;
+defm ST_i16 : ST<Int16Regs>;
+defm ST_i32 : ST<Int32Regs>;
+defm ST_i64 : ST<Int64Regs>;
+defm ST_f32 : ST<Float32Regs>;
+defm ST_f64 : ST<Float64Regs>;
+}
+
+let VecInstType=isVecST.Value, mayStore=1, neverHasSideEffects=1 in {
+defm ST_v2i8 : ST<V2I8Regs>;
+defm ST_v4i8 : ST<V4I8Regs>;
+defm ST_v2i16 : ST<V2I16Regs>;
+defm ST_v4i16 : ST<V4I16Regs>;
+defm ST_v2i32 : ST<V2I32Regs>;
+defm ST_v4i32 : ST<V4I32Regs>;
+defm ST_v2f32 : ST<V2F32Regs>;
+defm ST_v4f32 : ST<V4F32Regs>;
+defm ST_v2i64 : ST<V2I64Regs>;
+defm ST_v2f64 : ST<V2F64Regs>;
+}
+
+// The following instructions are used only in and after vector
+// elementization. Vector elementization happens at the machine instruction
+// level, so these instructions never appear in the DAG.
+multiclass LD_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+ def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+ def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+ def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+ def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+ regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+ def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+ def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+ []>;
+ def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+ []>;
+}
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LDV_i8 : LD_VEC<Int8Regs>;
+defm LDV_i16 : LD_VEC<Int16Regs>;
+defm LDV_i32 : LD_VEC<Int32Regs>;
+defm LDV_i64 : LD_VEC<Int64Regs>;
+defm LDV_f32 : LD_VEC<Float32Regs>;
+defm LDV_f64 : LD_VEC<Float64Regs>;
+}
+
+multiclass ST_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+ def _v2_areg : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+ def _v2_ari : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+ def _v2_asi : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+ i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+ def _v4_avar : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+ def _v4_areg : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+ def _v4_ari : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+ []>;
+ def _v4_asi : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+ []>;
+}
+let mayStore=1, neverHasSideEffects=1 in {
+defm STV_i8 : ST_VEC<Int8Regs>;
+defm STV_i16 : ST_VEC<Int16Regs>;
+defm STV_i32 : ST_VEC<Int32Regs>;
+defm STV_i64 : ST_VEC<Int64Regs>;
+defm STV_f32 : ST_VEC<Float32Regs>;
+defm STV_f64 : ST_VEC<Float64Regs>;
+}
+
+
+//---- Conversion ----
+
+multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> {
+// FIXME: need to add f16 support
+// def CVTf16i8 :
+// NVPTXInst<(outs Float16Regs:$d), (ins Int8Regs:$a),
+// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "8 \t$d, $a;"),
+// [(set Float16Regs:$d, (OpNode Int8Regs:$a))]>;
+// def CVTf16i16 :
+// NVPTXInst<(outs Float16Regs:$d), (ins Int16Regs:$a),
+// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "16 \t$d, $a;"),
+// [(set Float16Regs:$d, (OpNode Int16Regs:$a))]>;
+// def CVTf16i32 :
+// NVPTXInst<(outs Float16Regs:$d), (ins Int32Regs:$a),
+// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "32 \t$d, $a;"),
+// [(set Float16Regs:$d, (OpNode Int32Regs:$a))]>;
+// def CVTf16i64:
+// NVPTXInst<(outs Float16Regs:$d), (ins Int64Regs:$a),
+// !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"),
+// [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>;
+
+ def CVTf32i1 :
+ NVPTXInst<(outs Float32Regs:$d), (ins Int1Regs:$a),
+ "selp.f32 \t$d, 1.0, 0.0, $a;",
+ [(set Float32Regs:$d, (OpNode Int1Regs:$a))]>;
+ def CVTf32i8 :
+ NVPTXInst<(outs Float32Regs:$d), (ins Int8Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f32.", OpStr), "8 \t$d, $a;"),
+ [(set Float32Regs:$d, (OpNode Int8Regs:$a))]>;
+ def CVTf32i16 :
+ NVPTXInst<(outs Float32Regs:$d), (ins Int16Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f32.", OpStr), "16 \t$d, $a;"),
+ [(set Float32Regs:$d, (OpNode Int16Regs:$a))]>;
+ def CVTf32i32 :
+ NVPTXInst<(outs Float32Regs:$d), (ins Int32Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f32.", OpStr), "32 \t$d, $a;"),
+ [(set Float32Regs:$d, (OpNode Int32Regs:$a))]>;
+ def CVTf32i64:
+ NVPTXInst<(outs Float32Regs:$d), (ins Int64Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"),
+ [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>;
+
+ def CVTf64i1 :
+ NVPTXInst<(outs Float64Regs:$d), (ins Int1Regs:$a),
+ "selp.f64 \t$d, 1.0, 0.0, $a;",
+ [(set Float64Regs:$d, (OpNode Int1Regs:$a))]>;
+ def CVTf64i8 :
+ NVPTXInst<(outs Float64Regs:$d), (ins Int8Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f64.", OpStr), "8 \t$d, $a;"),
+ [(set Float64Regs:$d, (OpNode Int8Regs:$a))]>;
+ def CVTf64i16 :
+ NVPTXInst<(outs Float64Regs:$d), (ins Int16Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f64.", OpStr), "16 \t$d, $a;"),
+ [(set Float64Regs:$d, (OpNode Int16Regs:$a))]>;
+ def CVTf64i32 :
+ NVPTXInst<(outs Float64Regs:$d), (ins Int32Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f64.", OpStr), "32 \t$d, $a;"),
+ [(set Float64Regs:$d, (OpNode Int32Regs:$a))]>;
+ def CVTf64i64:
+ NVPTXInst<(outs Float64Regs:$d), (ins Int64Regs:$a),
+ !strconcat(!strconcat("cvt.rn.f64.", OpStr), "64 \t$d, $a;"),
+ [(set Float64Regs:$d, (OpNode Int64Regs:$a))]>;
+}
+
+defm Sint_to_fp : CVT_INT_TO_FP <"s", sint_to_fp>;
+defm Uint_to_fp : CVT_INT_TO_FP <"u", uint_to_fp>;
+
+multiclass CVT_FP_TO_INT <string OpStr, SDNode OpNode> {
+// FIXME: need to add f16 support
+// def CVTi8f16:
+// NVPTXInst<(outs Int8Regs:$d), (ins Float16Regs:$a),
+// !strconcat(!strconcat("cvt.rzi.", OpStr), "8.f16 $d, $a;"),
+// [(set Int8Regs:$d, (OpNode Float16Regs:$a))]>;
+ def CVTi8f32_ftz:
+ NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"),
+ [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+ def CVTi8f32:
+ NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"),
+ [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>;
+ def CVTi8f64:
+ NVPTXInst<(outs Int8Regs:$d), (ins Float64Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"),
+ [(set Int8Regs:$d, (OpNode Float64Regs:$a))]>;
+
+// FIXME: need to add f16 support
+// def CVTi16f16:
+// NVPTXInst<(outs Int16Regs:$d), (ins Float16Regs:$a),
+// !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f16 \t$d, $a;"),
+// [(set Int16Regs:$d, (OpNode Float16Regs:$a))]>;
+ def CVTi16f32_ftz:
+ NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"),
+ [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+ def CVTi16f32:
+ NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"),
+ [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>;
+ def CVTi16f64:
+ NVPTXInst<(outs Int16Regs:$d), (ins Float64Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"),
+ [(set Int16Regs:$d, (OpNode Float64Regs:$a))]>;
+
+// FIXME: need to add f16 support
+// def CVTi32f16:
+// NVPTXInst<(outs Int32Regs:$d), (ins Float16Regs:$a),
+// !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f16 \t$d, $a;"),
+// [(set Int32Regs:$d, (OpNode Float16Regs:$a))]>;
+ def CVTi32f32_ftz:
+ NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "32.f32 \t$d, $a;"),
+ [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+ def CVTi32f32:
+ NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f32 \t$d, $a;"),
+ [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>;
+ def CVTi32f64:
+ NVPTXInst<(outs Int32Regs:$d), (ins Float64Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f64 \t$d, $a;"),
+ [(set Int32Regs:$d, (OpNode Float64Regs:$a))]>;
+
+// FIXME: need to add f16 support
+// def CVTi64f16:
+// NVPTXInst<(outs Int64Regs:$d), (ins Float16Regs:$a),
+// !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f16 \t$d, $a;"),
+// [(set Int64Regs:$d, (OpNode Float16Regs:$a))]>;
+ def CVTi64f32_ftz:
+ NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "64.f32 \t$d, $a;"),
+ [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+ def CVTi64f32:
+ NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f32 \t$d, $a;"),
+ [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>;
+ def CVTi64f64:
+ NVPTXInst<(outs Int64Regs:$d), (ins Float64Regs:$a),
+ !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f64 \t$d, $a;"),
+ [(set Int64Regs:$d, (OpNode Float64Regs:$a))]>;
+}
+
+defm Fp_to_sint : CVT_FP_TO_INT <"s", fp_to_sint>;
+defm Fp_to_uint : CVT_FP_TO_INT <"u", fp_to_uint>;
+
+multiclass INT_EXTEND_UNSIGNED_1 <SDNode OpNode> {
+ def ext1to8:
+ NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
+ "selp.u16 \t$d, 1, 0, $a;",
+ [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
+ def ext1to16:
+ NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
+ "selp.u16 \t$d, 1, 0, $a;",
+ [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
+ def ext1to32:
+ NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+ "selp.u32 \t$d, 1, 0, $a;",
+ [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+ def ext1to64:
+ NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+ "selp.u64 \t$d, 1, 0, $a;",
+ [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND_SIGNED_1 <SDNode OpNode> {
+ def ext1to8:
+ NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
+ "selp.s16 \t$d, -1, 0, $a;",
+ [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
+ def ext1to16:
+ NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
+ "selp.s16 \t$d, -1, 0, $a;",
+ [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
+ def ext1to32:
+ NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+ "selp.s32 \t$d, -1, 0, $a;",
+ [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+ def ext1to64:
+ NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+ "selp.s64 \t$d, -1, 0, $a;",
+ [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND <string OpStr, SDNode OpNode> {
+  // All Int8Regs are emitted as 16-bit registers in PTX,
+  // and there is no selp.u8 in PTX.
+ def ext8to16:
+ NVPTXInst<(outs Int16Regs:$d), (ins Int8Regs:$a),
+ !strconcat("cvt.", !strconcat(OpStr, !strconcat("16.",
+ !strconcat(OpStr, "8 \t$d, $a;")))),
+ [(set Int16Regs:$d, (OpNode Int8Regs:$a))]>;
+ def ext8to32:
+ NVPTXInst<(outs Int32Regs:$d), (ins Int8Regs:$a),
+ !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+ !strconcat(OpStr, "8 \t$d, $a;")))),
+ [(set Int32Regs:$d, (OpNode Int8Regs:$a))]>;
+ def ext8to64:
+ NVPTXInst<(outs Int64Regs:$d), (ins Int8Regs:$a),
+ !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+ !strconcat(OpStr, "8 \t$d, $a;")))),
+ [(set Int64Regs:$d, (OpNode Int8Regs:$a))]>;
+ def ext16to32:
+ NVPTXInst<(outs Int32Regs:$d), (ins Int16Regs:$a),
+ !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+ !strconcat(OpStr, "16 \t$d, $a;")))),
+ [(set Int32Regs:$d, (OpNode Int16Regs:$a))]>;
+ def ext16to64:
+ NVPTXInst<(outs Int64Regs:$d), (ins Int16Regs:$a),
+ !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+ !strconcat(OpStr, "16 \t$d, $a;")))),
+ [(set Int64Regs:$d, (OpNode Int16Regs:$a))]>;
+ def ext32to64:
+ NVPTXInst<(outs Int64Regs:$d), (ins Int32Regs:$a),
+ !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+ !strconcat(OpStr, "32 \t$d, $a;")))),
+ [(set Int64Regs:$d, (OpNode Int32Regs:$a))]>;
+}
+
+defm Sint_extend_1 : INT_EXTEND_SIGNED_1<sext>;
+defm Zint_extend_1 : INT_EXTEND_UNSIGNED_1<zext>;
+defm Aint_extend_1 : INT_EXTEND_UNSIGNED_1<anyext>;
+
+defm Sint_extend : INT_EXTEND <"s", sext>;
+defm Zint_extend : INT_EXTEND <"u", zext>;
+defm Aint_extend : INT_EXTEND <"u", anyext>;
+
+class TRUNC_to1_asm<string sz> {
+ string s = !strconcat("{{\n\t",
+ !strconcat(".reg ",
+ !strconcat(sz,
+ !strconcat(" temp;\n\t",
+ !strconcat("and",
+ !strconcat(sz,
+ !strconcat("\t temp, $a, 1;\n\t",
+ !strconcat("setp",
+ !strconcat(sz, ".eq \t $d, temp, 1;\n\t}}")))))))));
+}
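+
+// For reference, TRUNC_to1_asm<".b32">.s concatenates to the following
+// (whitespace simplified; '{{' and '}}' print as literal braces):
+//   {
+//   .reg .b32 temp;
+//   and.b32      temp, $a, 1;
+//   setp.b32.eq  $d, temp, 1;
+//   }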
+
+def TRUNC_64to32 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "cvt.u32.u64 \t$d, $a;",
+ [(set Int32Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int64Regs:$a),
+ "cvt.u16.u64 \t$d, $a;",
+ [(set Int16Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int64Regs:$a),
+ "cvt.u8.u64 \t$d, $a;",
+ [(set Int8Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_32to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int32Regs:$a),
+ "cvt.u16.u32 \t$d, $a;",
+ [(set Int16Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_32to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int32Regs:$a),
+ "cvt.u8.u32 \t$d, $a;",
+ [(set Int8Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_16to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int16Regs:$a),
+ "cvt.u8.u16 \t$d, $a;",
+ [(set Int8Regs:$d, (trunc Int16Regs:$a))]>;
+def TRUNC_64to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ TRUNC_to1_asm<".b64">.s,
+ [(set Int1Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_32to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ TRUNC_to1_asm<".b32">.s,
+ [(set Int1Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_16to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int16Regs:$a),
+ TRUNC_to1_asm<".b16">.s,
+ [(set Int1Regs:$d, (trunc Int16Regs:$a))]>;
+def TRUNC_8to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int8Regs:$a),
+ TRUNC_to1_asm<".b16">.s,
+ [(set Int1Regs:$d, (trunc Int8Regs:$a))]>;
+
+// Select instructions
+def : Pat<(select Int32Regs:$pred, Int8Regs:$a, Int8Regs:$b),
+ (SELECTi8rr Int8Regs:$a, Int8Regs:$b, (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+ (SELECTi16rr Int16Regs:$a, Int16Regs:$b,
+ (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+ (SELECTi32rr Int32Regs:$a, Int32Regs:$b,
+ (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+ (SELECTi64rr Int64Regs:$a, Int64Regs:$b,
+ (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+ (SELECTf32rr Float32Regs:$a, Float32Regs:$b,
+ (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+ (SELECTf64rr Float64Regs:$a, Float64Regs:$b,
+ (TRUNC_32to1 Int32Regs:$pred))>;
+
+class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
+ NVPTXRegClass regclassOut> :
+ NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+ !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+ [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+
+def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+
+// Pack a set of smaller int registers into a larger int register.
+def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int8Regs:$s1, Int8Regs:$s2,
+ Int8Regs:$s3, Int8Regs:$s4),
+ !strconcat("{{\n\t.reg .b8\t%t<4>;",
+ !strconcat("\n\tcvt.u8.u8\t%t0, $s1;",
+ !strconcat("\n\tcvt.u8.u8\t%t1, $s2;",
+ !strconcat("\n\tcvt.u8.u8\t%t2, $s3;",
+ !strconcat("\n\tcvt.u8.u8\t%t3, $s4;",
+ "\n\tmov.b32\t$d, {%t0, %t1, %t2, %t3};\n\t}}"))))),
+ []>;
+def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};",
+ []>;
+def V2I8toI16 : NVPTXInst<(outs Int16Regs:$d),
+ (ins Int8Regs:$s1, Int8Regs:$s2),
+ !strconcat("{{\n\t.reg .b8\t%t<2>;",
+ !strconcat("\n\tcvt.u8.u8\t%t0, $s1;",
+ !strconcat("\n\tcvt.u8.u8\t%t1, $s2;",
+ "\n\tmov.b16\t$d, {%t0, %t1};\n\t}}"))),
+ []>;
+def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2),
+ "mov.b32\t$d, {{$s1, $s2}};",
+ []>;
+def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int32Regs:$s1, Int32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};",
+ []>;
+def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};",
+ []>;
+
+// Unpack a larger int register into a set of smaller int registers.
+def I32toV4I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2,
+ Int8Regs:$d3, Int8Regs:$d4),
+ (ins Int32Regs:$s),
+ !strconcat("{{\n\t.reg .b8\t%t<4>;",
+ !strconcat("\n\tmov.b32\t{%t0, %t1, %t2, %t3}, $s;",
+ !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+ !strconcat("\n\tcvt.u8.u8\t$d2, %t1;",
+ !strconcat("\n\tcvt.u8.u8\t$d3, %t2;",
+ "\n\tcvt.u8.u8\t$d4, %t3;\n\t}}"))))),
+ []>;
+def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
+ []>;
+def I16toV2I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2),
+ (ins Int16Regs:$s),
+ !strconcat("{{\n\t.reg .b8\t%t<2>;",
+ !strconcat("\n\tmov.b16\t{%t0, %t1}, $s;",
+ !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+ "\n\tcvt.u8.u8\t$d2, %t1;\n\t}}"))),
+ []>;
+def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+ (ins Int32Regs:$s),
+ "mov.b32\t{{$d1, $d2}}, $s;",
+ []>;
+def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;",
+ []>;
+def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Float64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;",
+ []>;
+
+def FPRound_ftz : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+ "cvt.rn.ftz.f32.f64 \t$d, $a;",
+ [(set Float32Regs:$d, (fround Float64Regs:$a))]>, Requires<[doF32FTZ]>;
+
+def FPRound : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+ "cvt.rn.f32.f64 \t$d, $a;",
+ [(set Float32Regs:$d, (fround Float64Regs:$a))]>;
+
+def FPExtend_ftz : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+ "cvt.ftz.f64.f32 \t$d, $a;",
+ [(set Float64Regs:$d, (fextend Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+
+def FPExtend : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+ "cvt.f64.f32 \t$d, $a;",
+ [(set Float64Regs:$d, (fextend Float32Regs:$a))]>;
+
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+ let isReturn=1, isBarrier=1 in
+ def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+ let isBranch=1 in
+ def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@$a bra \t$target;",
+ [(brcond Int1Regs:$a, bb:$target)]>;
+ let isBranch=1 in
+ def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@!$a bra \t$target;",
+ []>;
+
+ let isBranch=1, isBarrier=1 in
+ def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+ "bra.uni \t$target;",
+ [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target), (CBranch
+ (ISetUNEi32ri_p Int32Regs:$a, 0), bb:$target)>;
+
+// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
+// conditional branch if the target block is the next block, so that the code
+// can fall through to the target block.
+// The inversion is done by 'xor condition, 1', which will be translated to
+// (setne condition, -1).
+// Since PTX supports '@!pred bra target', we should use it.
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+ (CBranchOther Int1Regs:$a, bb:$target)>;
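+
+// Illustrative example of the above: for 'br i1 %p, label %next, label %other'
+// where %next is the fall-through block, the builder hands us
+// brcond (setne %p, -1), and the pattern selects
+//   @!%p bra  $other;
+// instead of first materializing the inverted predicate with xor.pred.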
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+ def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
+ "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst),
+ (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+ (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : NVPTXInst<outs, ins, asmstr, pattern>;
+
+// @TODO: We use some tricks here to emit curly braces. Can we clean this up
+// a bit without TableGen modifications?
+def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
+ "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
+ [(callseq_start timm:$amt)]>;
+def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\n\t//{{\n\t}}// Callseq End $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
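+
+// For reference, Callseq_Start prints as ('{{' and '}}' are the TableGen
+// escapes for literal braces):
+//   // Callseq Start $amt
+//   {
+//   .reg .b32 temp_param_reg;
+//   // <end>}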
+
+// trap instruction
+
+def trapinst : NVPTXInst<(outs), (ins),
+ "trap;",
+ [(trap)]>;
+
+include "NVPTXVector.td"
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following would be more efficient:
+// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
+//   registers
+// - for sm_20, use prmt (with vector scalar mov for the pack and unpack);
+//   sm_20 supports native 32-bit registers, but not native 16-bit registers
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 0000000..028a94b
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,1675 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def immFloat0 : PatLeaf<(fpimm), [{
+ float f = (float)N->getValueAPF().convertToFloat();
+ return (f==0.0f);
+}]>;
+
+def immFloat1 : PatLeaf<(fpimm), [{
+ float f = (float)N->getValueAPF().convertToFloat();
+ return (f==1.0f);
+}]>;
+
+def immDouble0 : PatLeaf<(fpimm), [{
+ double d = (double)N->getValueAPF().convertToDouble();
+ return (d==0.0);
+}]>;
+
+def immDouble1 : PatLeaf<(fpimm), [{
+ double d = (double)N->getValueAPF().convertToDouble();
+ return (d==1.0);
+}]>;
+
+
+
+//-----------------------------------
+// Synchronization Functions
+//-----------------------------------
+def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
+ "bar.sync \t0;",
+ [(int_cuda_syncthreads)]>;
+def INT_BARRIER0 : NVPTXInst<(outs), (ins),
+ "bar.sync \t0;",
+ [(int_nvvm_barrier0)]>;
+def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg .pred \t%p1; \n\t",
+ !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
+ !strconcat("}}", ""))))),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
+def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg .pred \t%p1; \n\t",
+ !strconcat(".reg .pred \t%p2; \n\t",
+ !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t",
+ !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ !strconcat("}}", ""))))))),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
+def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg .pred \t%p1; \n\t",
+ !strconcat(".reg .pred \t%p2; \n\t",
+ !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
+ !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ !strconcat("}}", ""))))))),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+
+
+//-----------------------------------
+// Explicit Memory Fence Functions
+//-----------------------------------
+class MEMBAR<string StrOp, Intrinsic IntOP> :
+ NVPTXInst<(outs), (ins),
+ StrOp, [(IntOP)]>;
+
+def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
+def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
+def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+
+
+//-----------------------------------
+// Math Functions
+//-----------------------------------
+
+// Map min(1.0, max(0.0, x)) to sat(x)
+multiclass SAT<NVPTXRegClass regclass, Operand fimm, Intrinsic IntMinOp,
+ Intrinsic IntMaxOp, PatLeaf f0, PatLeaf f1, string OpStr> {
+
+ // fmin(1.0, fmax(0.0, x)) => sat(x)
+ def SAT11 : NVPTXInst<(outs regclass:$dst),
+ (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+ OpStr,
+ [(set regclass:$dst, (IntMinOp f1:$srcf0 ,
+ (IntMaxOp f0:$srcf1, regclass:$src)))]>;
+
+ // fmin(1.0, fmax(x, 0.0)) => sat(x)
+ def SAT12 : NVPTXInst<(outs regclass:$dst),
+ (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+ OpStr,
+ [(set regclass:$dst, (IntMinOp f1:$srcf0 ,
+ (IntMaxOp regclass:$src, f0:$srcf1)))]>;
+
+ // fmin(fmax(0.0, x), 1.0) => sat(x)
+ def SAT13 : NVPTXInst<(outs regclass:$dst),
+ (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+ OpStr,
+ [(set regclass:$dst, (IntMinOp
+ (IntMaxOp f0:$srcf0, regclass:$src), f1:$srcf1))]>;
+
+ // fmin(fmax(x, 0.0), 1.0) => sat(x)
+ def SAT14 : NVPTXInst<(outs regclass:$dst),
+ (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+ OpStr,
+ [(set regclass:$dst, (IntMinOp
+ (IntMaxOp regclass:$src, f0:$srcf0), f1:$srcf1))]>;
+
+}
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x), because when x
+// is NaN, max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
+// The same holds for fmax/fmin.
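+// Worked example: for x = NaN (PTX fmin/fmax return their non-NaN operand),
+//   fmax(0.0, fmin(NaN, 1.0)) = fmax(0.0, 1.0) = 1.0, but sat(NaN) = 0.0;
+//   fmin(1.0, fmax(0.0, NaN)) = fmin(1.0, 0.0) = 0.0, which matches sat(NaN).
+// Hence only the fmin-outermost forms above are matched.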
+
+defm SAT_fmin_fmax_f : SAT<Float32Regs, f32imm, int_nvvm_fmin_f,
+ int_nvvm_fmax_f, immFloat0, immFloat1,
+ "cvt.sat.f32.f32 \t$dst, $src; \n">;
+defm SAT_fmin_fmax_d : SAT<Float64Regs, f64imm, int_nvvm_fmin_d,
+ int_nvvm_fmax_d, immDouble0, immDouble1,
+ "cvt.sat.f64.f64 \t$dst, $src; \n">;
+
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_RECIP.
+class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
+ NVPTXRegClass src_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
+ OpcStr,
+ [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>;
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1),
+ OpcStr,
+ [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>;
+
+class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
+ NVPTXRegClass s2_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
+ OpcStr,
+ [(set t_regclass:$dst,
+ (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>;
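+// For example, INT_NVVM_MIN_I below instantiates F_MATH_2 with
+// "min.s32 \t$dst, $src0, $src1;" and int_nvvm_min_i, mapping the two-operand
+// intrinsic call directly onto that PTX string.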
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
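+// prmt.b32 assembles $dst from four bytes picked out of the pair
+// {$src1, $src0}, with the low nibbles of $src2 acting as byte selectors (see
+// the PTX ISA description of prmt for the selector encoding).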
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmax_f>;
+def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
+
+def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
+ Float64Regs, Float64Regs, int_nvvm_fmin_d>;
+def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
+ Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
+//
+// Multiplication
+//
+
+def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
+def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
+
+def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
+def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
+
+def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
+def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
+def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
+def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
+def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
+def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
+def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
+def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
+
+def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
+def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
+def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
+def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
+
+def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
+def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
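+// Per the PTX ISA, mul24.lo multiplies the low 24 bits of each operand and
+// keeps the low 32 bits of the 48-bit product.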
+
+//
+// Div
+//
+
+def INT_NVVM_DIV_APPROX_FTZ_F
+ : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
+def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
+
+def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
+def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
+def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
+def INT_NVVM_DIV_RZ_F : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
+def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
+def INT_NVVM_DIV_RM_F : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
+def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
+def INT_NVVM_DIV_RP_F : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
+
+def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
+def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
+def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
+def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
+
+//
+// Brev
+//
+
+def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_brev32>;
+def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
+ int_nvvm_brev64>;
+
+//
+// Sad
+//
+
+def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
+ Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
+def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
+ Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
+
+//
+// Floor Ceil
+//
+
+def INT_NVVM_FLOOR_FTZ_F : F_MATH_1<"cvt.rmi.ftz.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_floor_ftz_f>;
+def INT_NVVM_FLOOR_F : F_MATH_1<"cvt.rmi.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_floor_f>;
+def INT_NVVM_FLOOR_D : F_MATH_1<"cvt.rmi.f64.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_floor_d>;
+
+def INT_NVVM_CEIL_FTZ_F : F_MATH_1<"cvt.rpi.ftz.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ceil_ftz_f>;
+def INT_NVVM_CEIL_F : F_MATH_1<"cvt.rpi.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ceil_f>;
+def INT_NVVM_CEIL_D : F_MATH_1<"cvt.rpi.f64.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_ceil_d>;
+
+//
+// Abs
+//
+
+def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_abs_i>;
+def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
+ int_nvvm_abs_ll>;
+
+def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_fabs_ftz_f>;
+def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_fabs_f>;
+
+def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_fabs_d>;
+
+//
+// Round
+//
+
+def INT_NVVM_ROUND_FTZ_F : F_MATH_1<"cvt.rni.ftz.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_round_ftz_f>;
+def INT_NVVM_ROUND_F : F_MATH_1<"cvt.rni.f32.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_round_f>;
+
+def INT_NVVM_ROUND_D : F_MATH_1<"cvt.rni.f64.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_round_d>;
+
+//
+// Trunc
+//
+
+def INT_NVVM_TRUNC_FTZ_F : F_MATH_1<"cvt.rzi.ftz.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_trunc_ftz_f>;
+def INT_NVVM_TRUNC_F : F_MATH_1<"cvt.rzi.f32.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_trunc_f>;
+
+def INT_NVVM_TRUNC_D : F_MATH_1<"cvt.rzi.f64.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_trunc_d>;
+
+//
+// Saturate
+//
+
+def INT_NVVM_SATURATE_FTZ_F : F_MATH_1<"cvt.sat.ftz.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_saturate_ftz_f>;
+def INT_NVVM_SATURATE_F : F_MATH_1<"cvt.sat.f32.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_saturate_f>;
+
+def INT_NVVM_SATURATE_D : F_MATH_1<"cvt.sat.f64.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_saturate_d>;
+
+//
+// Exp2 Log2
+//
+
+def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>;
+def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
+def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
+def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
+def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
+def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
+
+//
+// Sin Cos
+//
+
+def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>;
+def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sin_approx_f>;
+
+def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>;
+def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
+
+//
+// Fma
+//
+
+def INT_NVVM_FMA_RN_FTZ_F
+ : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>;
+def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>;
+def INT_NVVM_FMA_RZ_FTZ_F
+ : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>;
+def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>;
+def INT_NVVM_FMA_RM_FTZ_F
+ : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>;
+def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>;
+def INT_NVVM_FMA_RP_FTZ_F
+ : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>;
+def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>;
+
+def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>;
+def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>;
+def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>;
+def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>;
+
+//
+// Rcp
+//
+
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>;
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>;
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>;
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>;
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>;
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>;
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>;
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
+
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rn_d>;
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rz_d>;
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rm_d>;
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rp_d>;
+
+def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>;
+
+//
+// Sqrt
+//
+
+def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>;
+def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rn_f>;
+def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>;
+def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rz_f>;
+def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>;
+def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rm_f>;
+def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>;
+def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rp_f>;
+def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>;
+def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>;
+
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rn_d>;
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rz_d>;
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rm_d>;
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rp_d>;
+
+//
+// Rsqrt
+//
+
+def INT_NVVM_RSQRT_APPROX_FTZ_F
+ : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs,
+ int_nvvm_rsqrt_approx_ftz_f>;
+def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>;
+def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>;
+
+//
+// Add
+//
+
+def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
+def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
+def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
+def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
+def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
+def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
+def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
+def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
+
+def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
+def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
+def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
+def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+
+//
+// Convert
+//
+
+def INT_NVVM_D2F_RN_FTZ : F_MATH_1<"cvt.rn.ftz.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rn_ftz>;
+def INT_NVVM_D2F_RN : F_MATH_1<"cvt.rn.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rn>;
+def INT_NVVM_D2F_RZ_FTZ : F_MATH_1<"cvt.rz.ftz.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rz_ftz>;
+def INT_NVVM_D2F_RZ : F_MATH_1<"cvt.rz.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rz>;
+def INT_NVVM_D2F_RM_FTZ : F_MATH_1<"cvt.rm.ftz.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rm_ftz>;
+def INT_NVVM_D2F_RM : F_MATH_1<"cvt.rm.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rm>;
+def INT_NVVM_D2F_RP_FTZ : F_MATH_1<"cvt.rp.ftz.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rp_ftz>;
+def INT_NVVM_D2F_RP : F_MATH_1<"cvt.rp.f32.f64 \t$dst, $src0;",
+ Float32Regs, Float64Regs, int_nvvm_d2f_rp>;
+
+def INT_NVVM_D2I_RN : F_MATH_1<"cvt.rni.s32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2i_rn>;
+def INT_NVVM_D2I_RZ : F_MATH_1<"cvt.rzi.s32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2i_rz>;
+def INT_NVVM_D2I_RM : F_MATH_1<"cvt.rmi.s32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2i_rm>;
+def INT_NVVM_D2I_RP : F_MATH_1<"cvt.rpi.s32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2i_rp>;
+
+def INT_NVVM_D2UI_RN : F_MATH_1<"cvt.rni.u32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2ui_rn>;
+def INT_NVVM_D2UI_RZ : F_MATH_1<"cvt.rzi.u32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2ui_rz>;
+def INT_NVVM_D2UI_RM : F_MATH_1<"cvt.rmi.u32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2ui_rm>;
+def INT_NVVM_D2UI_RP : F_MATH_1<"cvt.rpi.u32.f64 \t$dst, $src0;",
+ Int32Regs, Float64Regs, int_nvvm_d2ui_rp>;
+
+def INT_NVVM_I2D_RN : F_MATH_1<"cvt.rn.f64.s32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_i2d_rn>;
+def INT_NVVM_I2D_RZ : F_MATH_1<"cvt.rz.f64.s32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_i2d_rz>;
+def INT_NVVM_I2D_RM : F_MATH_1<"cvt.rm.f64.s32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_i2d_rm>;
+def INT_NVVM_I2D_RP : F_MATH_1<"cvt.rp.f64.s32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_i2d_rp>;
+
+def INT_NVVM_UI2D_RN : F_MATH_1<"cvt.rn.f64.u32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_ui2d_rn>;
+def INT_NVVM_UI2D_RZ : F_MATH_1<"cvt.rz.f64.u32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_ui2d_rz>;
+def INT_NVVM_UI2D_RM : F_MATH_1<"cvt.rm.f64.u32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_ui2d_rm>;
+def INT_NVVM_UI2D_RP : F_MATH_1<"cvt.rp.f64.u32 \t$dst, $src0;",
+ Float64Regs, Int32Regs, int_nvvm_ui2d_rp>;
+
+def INT_NVVM_F2I_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2i_rn_ftz>;
+def INT_NVVM_F2I_RN : F_MATH_1<"cvt.rni.s32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2i_rn>;
+def INT_NVVM_F2I_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2i_rz_ftz>;
+def INT_NVVM_F2I_RZ : F_MATH_1<"cvt.rzi.s32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2i_rz>;
+def INT_NVVM_F2I_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2i_rm_ftz>;
+def INT_NVVM_F2I_RM : F_MATH_1<"cvt.rmi.s32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2i_rm>;
+def INT_NVVM_F2I_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2i_rp_ftz>;
+def INT_NVVM_F2I_RP : F_MATH_1<"cvt.rpi.s32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2i_rp>;
+
+def INT_NVVM_F2UI_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2ui_rn_ftz>;
+def INT_NVVM_F2UI_RN : F_MATH_1<"cvt.rni.u32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2ui_rn>;
+def INT_NVVM_F2UI_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2ui_rz_ftz>;
+def INT_NVVM_F2UI_RZ : F_MATH_1<"cvt.rzi.u32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2ui_rz>;
+def INT_NVVM_F2UI_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2ui_rm_ftz>;
+def INT_NVVM_F2UI_RM : F_MATH_1<"cvt.rmi.u32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2ui_rm>;
+def INT_NVVM_F2UI_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u32.f32 \t$dst, $src0;",
+ Int32Regs, Float32Regs, int_nvvm_f2ui_rp_ftz>;
+def INT_NVVM_F2UI_RP : F_MATH_1<"cvt.rpi.u32.f32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_f2ui_rp>;
+
+def INT_NVVM_I2F_RN : F_MATH_1<"cvt.rn.f32.s32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_i2f_rn>;
+def INT_NVVM_I2F_RZ : F_MATH_1<"cvt.rz.f32.s32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_i2f_rz>;
+def INT_NVVM_I2F_RM : F_MATH_1<"cvt.rm.f32.s32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_i2f_rm>;
+def INT_NVVM_I2F_RP : F_MATH_1<"cvt.rp.f32.s32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_i2f_rp>;
+
+def INT_NVVM_UI2F_RN : F_MATH_1<"cvt.rn.f32.u32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_ui2f_rn>;
+def INT_NVVM_UI2F_RZ : F_MATH_1<"cvt.rz.f32.u32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_ui2f_rz>;
+def INT_NVVM_UI2F_RM : F_MATH_1<"cvt.rm.f32.u32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_ui2f_rm>;
+def INT_NVVM_UI2F_RP : F_MATH_1<"cvt.rp.f32.u32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_ui2f_rp>;
+
+def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
+ Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
+
+def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b32 %temp; \n\t",
+ !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t",
+ "}}"))),
+ Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b32 %temp; \n\t",
+ !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t",
+ "}}"))),
+ Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
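+// mov.b64 with a braced operand pair packs or unpacks a 64-bit value, the
+// first register holding the low 32 bits; so lohi_i2d builds a double from
+// two words, while d2i_lo/d2i_hi extract the low and high word respectively.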
+
+def INT_NVVM_F2LL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ll_rn_ftz>;
+def INT_NVVM_F2LL_RN : F_MATH_1<"cvt.rni.s64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ll_rn>;
+def INT_NVVM_F2LL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ll_rz_ftz>;
+def INT_NVVM_F2LL_RZ : F_MATH_1<"cvt.rzi.s64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ll_rz>;
+def INT_NVVM_F2LL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ll_rm_ftz>;
+def INT_NVVM_F2LL_RM : F_MATH_1<"cvt.rmi.s64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ll_rm>;
+def INT_NVVM_F2LL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ll_rp_ftz>;
+def INT_NVVM_F2LL_RP : F_MATH_1<"cvt.rpi.s64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ll_rp>;
+
+def INT_NVVM_F2ULL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ull_rn_ftz>;
+def INT_NVVM_F2ULL_RN : F_MATH_1<"cvt.rni.u64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ull_rn>;
+def INT_NVVM_F2ULL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ull_rz_ftz>;
+def INT_NVVM_F2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ull_rz>;
+def INT_NVVM_F2ULL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ull_rm_ftz>;
+def INT_NVVM_F2ULL_RM : F_MATH_1<"cvt.rmi.u64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ull_rm>;
+def INT_NVVM_F2ULL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u64.f32 \t$dst, $src0;",
+ Int64Regs, Float32Regs, int_nvvm_f2ull_rp_ftz>;
+def INT_NVVM_F2ULL_RP : F_MATH_1<"cvt.rpi.u64.f32 \t$dst, $src0;", Int64Regs,
+ Float32Regs, int_nvvm_f2ull_rp>;
+
+def INT_NVVM_D2LL_RN : F_MATH_1<"cvt.rni.s64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ll_rn>;
+def INT_NVVM_D2LL_RZ : F_MATH_1<"cvt.rzi.s64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ll_rz>;
+def INT_NVVM_D2LL_RM : F_MATH_1<"cvt.rmi.s64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ll_rm>;
+def INT_NVVM_D2LL_RP : F_MATH_1<"cvt.rpi.s64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ll_rp>;
+
+def INT_NVVM_D2ULL_RN : F_MATH_1<"cvt.rni.u64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ull_rn>;
+def INT_NVVM_D2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ull_rz>;
+def INT_NVVM_D2ULL_RM : F_MATH_1<"cvt.rmi.u64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ull_rm>;
+def INT_NVVM_D2ULL_RP : F_MATH_1<"cvt.rpi.u64.f64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_d2ull_rp>;
+
+def INT_NVVM_LL2F_RN : F_MATH_1<"cvt.rn.f32.s64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ll2f_rn>;
+def INT_NVVM_LL2F_RZ : F_MATH_1<"cvt.rz.f32.s64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ll2f_rz>;
+def INT_NVVM_LL2F_RM : F_MATH_1<"cvt.rm.f32.s64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ll2f_rm>;
+def INT_NVVM_LL2F_RP : F_MATH_1<"cvt.rp.f32.s64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ll2f_rp>;
+def INT_NVVM_ULL2F_RN : F_MATH_1<"cvt.rn.f32.u64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ull2f_rn>;
+def INT_NVVM_ULL2F_RZ : F_MATH_1<"cvt.rz.f32.u64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ull2f_rz>;
+def INT_NVVM_ULL2F_RM : F_MATH_1<"cvt.rm.f32.u64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ull2f_rm>;
+def INT_NVVM_ULL2F_RP : F_MATH_1<"cvt.rp.f32.u64 \t$dst, $src0;", Float32Regs,
+ Int64Regs, int_nvvm_ull2f_rp>;
+
+def INT_NVVM_LL2D_RN : F_MATH_1<"cvt.rn.f64.s64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ll2d_rn>;
+def INT_NVVM_LL2D_RZ : F_MATH_1<"cvt.rz.f64.s64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ll2d_rz>;
+def INT_NVVM_LL2D_RM : F_MATH_1<"cvt.rm.f64.s64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ll2d_rm>;
+def INT_NVVM_LL2D_RP : F_MATH_1<"cvt.rp.f64.s64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ll2d_rp>;
+def INT_NVVM_ULL2D_RN : F_MATH_1<"cvt.rn.f64.u64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ull2d_rn>;
+def INT_NVVM_ULL2D_RZ : F_MATH_1<"cvt.rz.f64.u64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ull2d_rz>;
+def INT_NVVM_ULL2D_RM : F_MATH_1<"cvt.rm.f64.u64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ull2d_rm>;
+def INT_NVVM_ULL2D_RP : F_MATH_1<"cvt.rp.f64.u64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_ull2d_rp>;
+
+def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b16 %temp;\n\t",
+ !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
+ !strconcat("mov.b16 \t$dst, %temp;\n",
+ "}}")))),
+ Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
+def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b16 %temp;\n\t",
+ !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t",
+ !strconcat("mov.b16 \t$dst, %temp;\n",
+ "}}")))),
+ Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
+
+def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b16 %temp;\n\t",
+ !strconcat("mov.b16 \t%temp, $src0;\n\t",
+ !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t",
+ "}}")))),
+ Float32Regs, Int16Regs, int_nvvm_h2f>;
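+// f16 values are carried in 16-bit integer registers, so the f32<->f16
+// conversions above go through a .b16 temporary instead of converting the
+// operand register directly.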
+
+//
+// Bitcast
+//
+
+def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_bitcast_f2i>;
+def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_bitcast_i2f>;
+
+def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_bitcast_ll2d>;
+def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_bitcast_d2ll>;
+
+//-----------------------------------
+// Atomic Functions
+//-----------------------------------
+
+class ATOMIC_GLOBAL_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+}]>;
+class ATOMIC_SHARED_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+}]>;
+class ATOMIC_GENERIC_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+}]>;
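+// These PatFrag wrappers only match an atomic node whose pointer operand is
+// in the given address space, steering one generic atomic in the IR to the
+// .global, .shared, or generic form of the PTX instruction.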
+
+multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+ string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+ Operand IMMType, SDNode IMM, Predicate Pred> {
+ def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ Requires<[Pred]>;
+ def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+ Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+ string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> {
+ defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, IMM, Pred>;
+ defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, IMM, Pred>;
+}
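+// Each F_ATOMIC_2 instantiation therefore yields four instructions: reg/imm
+// second operand crossed with 32/64-bit pointers. For example,
+// INT_PTX_ATOM_ADD_G_32 below emits "atom.global.add.u32 \t$dst, [$addr], $b;"
+// for all four variants.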
+
+// Has two operands; negates the second one.
+multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+ string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+ Operand IMMType, Predicate Pred> {
+ def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg \t.s",
+ !strconcat(TypeStr,
+ !strconcat(" temp; \n\t",
+ !strconcat("neg.s",
+ !strconcat(TypeStr,
+ !strconcat(" \ttemp, $b; \n\t",
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(".u",
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], temp; \n\t",
+ !strconcat("}}", "")))))))))))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
+ string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
+ Predicate Pred> {
+ defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred> ;
+ defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred> ;
+}
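+// PTX has no atom.sub, so the NEG form negates $b into a temporary and issues
+// an add; e.g. the 32-bit global subtract expands to
+//   {{ .reg .s32 temp; neg.s32 temp, $b;
+//      atom.global.add.u32 $dst, [$addr], temp; }}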
+
+// Has three operands.
+multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+ string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+ Operand IMMType, Predicate Pred> {
+ def reg : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, regclass:$b, regclass:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst,
+ (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+ Requires<[Pred]>;
+ def imm1 : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, IMMType:$b, regclass:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+ Requires<[Pred]>;
+ def imm2 : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, regclass:$b, IMMType:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+ Requires<[Pred]>;
+ def imm3 : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+ Requires<[Pred]>;
+}
+multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+ string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> {
+ defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred>;
+ defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred>;
+}
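+// The three-operand form serves atom.cas below, which takes both a compare
+// value and a swap value; the imm1/imm2/imm3 defs cover every combination of
+// immediate operands.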
+
+// atom_add
+
+def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
+ atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add",
+ atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add",
+ atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add",
+ atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add",
+ atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add",
+ atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
+ atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
+ atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
+ atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+
+// atom_sub
+
+def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
+ atomic_load_sub_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
+ atomic_load_sub_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
+ atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
+ ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
+ atomic_load_sub_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
+ atomic_load_sub_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
+ atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
+ ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>;
+
+// atom_swap
+
+def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_swap_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch",
+ atomic_swap_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch",
+ atomic_swap_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch",
+ atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch",
+ atomic_swap_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch",
+ atomic_swap_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch",
+ atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_max
+
+def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+ ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+ ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
+ atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+ ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
+ atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_min
+
+def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+ ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+ ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
+ atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+ ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
+ atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_inc atom_dec
+
+def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc",
+ atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc",
+ atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc",
+ atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec",
+ atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec",
+ atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec",
+ atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_and
+
+def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
+ atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and",
+ atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
+ atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_or
+
+def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
+ atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or",
+ atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
+ atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+
+// atom_xor
+
+def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
+ atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor",
+ atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
+ atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_cas
+
+def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+
+defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas",
+ atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas",
+ atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas",
+ atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32",
+ ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas",
+ atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas",
+ atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
+ atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
+ ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
+
+
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
+class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> :
+ NVPTXInst<(outs regclassOut:$dst), (ins),
+ OpStr,
+ [(set regclassOut:$dst, (IntOp))]>;
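+// These correspond one-for-one to the CUDA built-ins: %tid is threadIdx,
+// %ntid is blockDim, %ctaid is blockIdx, and %nctaid is gridDim.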
+
+def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_tid_z>;
+
+def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, %ntid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ntid_z>;
+
+def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ctaid_z>;
+
+def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_nctaid_z>;
+
+def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
+ int_nvvm_read_ptx_sreg_warpsize>;
+
+
+//-----------------------------------
+// Support for ldu on sm_20 or later
+//-----------------------------------
+
+// Scalar
+// @TODO: Revisit this; imemAny was changed to imem.
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
+ def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+ !strconcat("ldu.global.", TyStr),
+ [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>;
+ def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+ !strconcat("ldu.global.", TyStr),
+ [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>;
+ def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
+ !strconcat("ldu.global.", TyStr),
+ [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
+ Requires<[hasLDU]>;
+ def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+ !strconcat("ldu.global.", TyStr),
+ [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>;
+ def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+ !strconcat("ldu.global.", TyStr),
+ [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
+}
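+// The five variants cover the PTX addressing modes: pointer in a 32- or
+// 64-bit register (areg/areg64), a direct global symbol (avar), and
+// register+immediate (ari/ari64).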
+
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int8Regs,
+ int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs,
+ int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
+ int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
+ int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs,
+ int_nvvm_ldu_global_f>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs,
+ int_nvvm_ldu_global_f>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
+ int_nvvm_ldu_global_p>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
+ int_nvvm_ldu_global_p>;
+
+// Vector
+
+// Element-wise ("elementized") vector ldu
+multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins Int32Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins Int64Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+}
+
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+ def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins Int32Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins Int64Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE
+ : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int8Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE
+ : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE
+ : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE
+ : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE
+ : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE
+ : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDU_G_v4i8_ELE
+ : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int8Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE
+ : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE
+ : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE
+ : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Float32Regs>;
+
+// Vector ldu
+multiclass VLDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp,
+ NVPTXInst eleInst, NVPTXInst eleInst64> {
+ def _32: NVPTXVecInst<(outs regclass:$result), (ins Int32Regs:$src),
+ !strconcat("ldu.global.", TyStr),
+ [(set regclass:$result, (IntOp Int32Regs:$src))], eleInst>,
+ Requires<[hasLDU]>;
+ def _64: NVPTXVecInst<(outs regclass:$result), (ins Int64Regs:$src),
+ !strconcat("ldu.global.", TyStr),
+ [(set regclass:$result, (IntOp Int64Regs:$src))], eleInst64>,
+ Requires<[hasLDU]>;
+}
+
+let VecInstType=isVecLD.Value in {
+defm INT_PTX_LDU_G_v2i8 : VLDU_G<"v2.u8 \t${result:vecfull}, [$src];",
+ V2I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i8_ELE_32,
+ INT_PTX_LDU_G_v2i8_ELE_64>;
+defm INT_PTX_LDU_G_v4i8 : VLDU_G<"v4.u8 \t${result:vecfull}, [$src];",
+ V4I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i8_ELE_32,
+ INT_PTX_LDU_G_v4i8_ELE_64>;
+defm INT_PTX_LDU_G_v2i16 : VLDU_G<"v2.u16 \t${result:vecfull}, [$src];",
+ V2I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i16_ELE_32,
+ INT_PTX_LDU_G_v2i16_ELE_64>;
+defm INT_PTX_LDU_G_v4i16 : VLDU_G<"v4.u16 \t${result:vecfull}, [$src];",
+ V4I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i16_ELE_32,
+ INT_PTX_LDU_G_v4i16_ELE_64>;
+defm INT_PTX_LDU_G_v2i32 : VLDU_G<"v2.u32 \t${result:vecfull}, [$src];",
+ V2I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i32_ELE_32,
+ INT_PTX_LDU_G_v2i32_ELE_64>;
+defm INT_PTX_LDU_G_v4i32 : VLDU_G<"v4.u32 \t${result:vecfull}, [$src];",
+ V4I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i32_ELE_32,
+ INT_PTX_LDU_G_v4i32_ELE_64>;
+defm INT_PTX_LDU_G_v2f32 : VLDU_G<"v2.f32 \t${result:vecfull}, [$src];",
+ V2F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f32_ELE_32,
+ INT_PTX_LDU_G_v2f32_ELE_64>;
+defm INT_PTX_LDU_G_v4f32 : VLDU_G<"v4.f32 \t${result:vecfull}, [$src];",
+ V4F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v4f32_ELE_32,
+ INT_PTX_LDU_G_v4f32_ELE_64>;
+defm INT_PTX_LDU_G_v2i64 : VLDU_G<"v2.u64 \t${result:vecfull}, [$src];",
+ V2I64Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i64_ELE_32,
+ INT_PTX_LDU_G_v2i64_ELE_64>;
+defm INT_PTX_LDU_G_v2f64 : VLDU_G<"v2.f64 \t${result:vecfull}, [$src];",
+ V2F64Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f64_ELE_32,
+ INT_PTX_LDU_G_v2f64_ELE_64>;
+}
+
+
+
+multiclass NG_TO_G<string Str, Intrinsic Intrin> {
+ def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+ def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+
+// @TODO: Are these actually needed? I believe global addresses will be copied
+// to register values anyway.
+ /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+ [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+ Requires<[hasGenericLdSt]>;
+ def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+ [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+ Requires<[hasGenericLdSt]>;*/
+
+ def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+ def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+
+// @TODO: Are these actually needed? I believe global addresses will be copied
+// to register values anyway.
+ /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
+ def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+}
+
+multiclass G_TO_NG<string Str, Intrinsic Intrin> {
+ def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")),
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+ def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")),
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+ def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+ def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+}
+
+defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
+defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
+defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+
+defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
+defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
+defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
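The cvta patterns fire on the llvm.nvvm.ptr.*.to.gen and llvm.nvvm.ptr.gen.to.* intrinsics, which are overloaded on both pointer types. A hedged sketch of emitting one conversion; the address-space numbers follow the usual NVPTX convention (0 generic, 3 shared), and the result-then-operand overload ordering is an assumption from the intrinsic declaration:

    // Sketch: convert a shared-space pointer to a generic one. On sm_20 the
    // NG_TO_G pattern lowers this to cvta.shared.u32/u64, otherwise to a mov.
    #include "llvm/IRBuilder.h"
    #include "llvm/Intrinsics.h"
    #include "llvm/Module.h"

    using namespace llvm;

    static Value *sharedToGeneric(Module &M, IRBuilder<> &B, Value *SharedPtr) {
      // SharedPtr is assumed to have type i8 addrspace(3)*.
      Type *Tys[] = {
        B.getInt8Ty()->getPointerTo(0),  // result: generic address space
        B.getInt8Ty()->getPointerTo(3)   // operand: shared address space
      };
      Function *Cvta =
          Intrinsic::getDeclaration(&M, Intrinsic::nvvm_ptr_shared_to_gen, Tys);
      return B.CreateCall(Cvta, SharedPtr);
    }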
+
+def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>;
+def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>;
+
+
+
+// @TODO: Revisit this. There is a type
+// contradiction between iPTRAny and iPTR for the def.
+/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen
+ (Wrapper tglobaladdr:$src)))]>;
+def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen
+ (Wrapper tglobaladdr:$src)))]>;*/
+
+
+def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>;
+def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>;
+
+
+// nvvm.ptr.gen.to.param
+def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result),
+ (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result,
+ (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>;
+def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
+ (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result,
+ (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>;
+
+
+// nvvm.move intrinsics
+def nvvm_move_i8 : NVPTXInst<(outs Int8Regs:$r), (ins Int8Regs:$s),
+ "mov.b16 \t$r, $s;",
+ [(set Int8Regs:$r,
+ (int_nvvm_move_i8 Int8Regs:$s))]>;
+def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
+ "mov.b16 \t$r, $s;",
+ [(set Int16Regs:$r,
+ (int_nvvm_move_i16 Int16Regs:$s))]>;
+def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.b32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_i32 Int32Regs:$s))]>;
+def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.b64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_i64 Int64Regs:$s))]>;
+def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
+ "mov.f32 \t$r, $s;",
+ [(set Float32Regs:$r,
+ (int_nvvm_move_float Float32Regs:$s))]>;
+def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
+ "mov.f64 \t$r, $s;",
+ [(set Float64Regs:$r,
+ (int_nvvm_move_double Float64Regs:$s))]>;
+def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.u32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_ptr Int32Regs:$s))]>;
+def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.u64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_ptr Int64Regs:$s))]>;
+
+// @TODO: Are these actually needed, or will we always just see symbols
+// copied to registers first?
+/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s),
+ "mov.u32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_ptr texternalsym:$s))]>;
+def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s),
+ "mov.u64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_ptr texternalsym:$s))]>;*/
+
+
+// MoveParam %r1, param
+// ptr_local_to_gen %r2, %r1
+// ptr_gen_to_local %r3, %r2
+// ->
+// mov %r1, param
+
+// @TODO: Revisit this. There is a type
+// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym
+// instructions are not currently defined. However, we can use the ptr
+// variants and the asm printer will do the right thing.
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+ (MoveParam texternalsym:$src)))),
+ (nvvm_move_ptr64 texternalsym:$src)>;
+def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+ (MoveParam texternalsym:$src)))),
+ (nvvm_move_ptr32 texternalsym:$src)>;
+
+
+//-----------------------------------
+// Compiler Error Warn
+// - Just ignore them in codegen
+//-----------------------------------
+
+def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+ "// llvm.nvvm.compiler.warn()",
+ [(int_nvvm_compiler_warn Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+ "// llvm.nvvm.compiler.warn()",
+ [(int_nvvm_compiler_warn Int64Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+ "// llvm.nvvm.compiler.error()",
+ [(int_nvvm_compiler_error Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+ "// llvm.nvvm.compiler.error()",
+ [(int_nvvm_compiler_error Int64Regs:$a)]>;
+
+
+
+//===-- Old PTX Back-end Intrinsics ---------------------------------------===//
+
+// These intrinsics are handled to retain compatibility with the old backend.
+
+// PTX Special Purpose Register Accessor Intrinsics
+
+class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
+ : NVPTXInst<(outs Int64Regs:$d), (ins),
+ !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
+ [(set Int64Regs:$d, (intop))]>;
+
+class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
+ : NVPTXInst<(outs Int32Regs:$d), (ins),
+ !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
+ [(set Int32Regs:$d, (intop))]>;
+
+// TODO Add read vector-version of special registers
+
+def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
+ int_ptx_read_tid_x>;
+def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
+ int_ptx_read_tid_y>;
+def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
+ int_ptx_read_tid_z>;
+def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
+ int_ptx_read_tid_w>;
+
+def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
+ int_ptx_read_ntid_x>;
+def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
+ int_ptx_read_ntid_y>;
+def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
+ int_ptx_read_ntid_z>;
+def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
+ int_ptx_read_ntid_w>;
+
+def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
+ int_ptx_read_laneid>;
+def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
+ int_ptx_read_warpid>;
+def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
+ int_ptx_read_nwarpid>;
+
+def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
+ int_ptx_read_ctaid_x>;
+def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
+ int_ptx_read_ctaid_y>;
+def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
+ int_ptx_read_ctaid_z>;
+def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
+ int_ptx_read_ctaid_w>;
+
+def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
+ int_ptx_read_nctaid_x>;
+def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
+ int_ptx_read_nctaid_y>;
+def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
+ int_ptx_read_nctaid_z>;
+def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
+ int_ptx_read_nctaid_w>;
+
+def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid",
+ int_ptx_read_smid>;
+def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
+ int_ptx_read_nsmid>;
+def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
+ int_ptx_read_gridid>;
+
+def PTX_READ_LANEMASK_EQ
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
+def PTX_READ_LANEMASK_LE
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
+def PTX_READ_LANEMASK_LT
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
+def PTX_READ_LANEMASK_GE
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
+def PTX_READ_LANEMASK_GT
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
+
+def PTX_READ_CLOCK
+ : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
+def PTX_READ_CLOCK64
+ : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
+
+def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
+def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
+def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
+def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
+
+// PTX Parallel Synchronization and Communication Intrinsics
+
+def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+ [(int_ptx_bar_sync imm:$i)]>;
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
new file mode 100644
index 0000000..56b2372
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -0,0 +1,208 @@
+//===- NVPTXLowerAggrCopies.cpp - Lower aggregate copies -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Lower aggregate copies, memset, memcpy, memmove intrinsics into loops when
+// the size is large or is not a compile-time constant.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetData.h"
+
+using namespace llvm;
+
+namespace llvm {
+FunctionPass *createLowerAggrCopies();
+}
+
+char NVPTXLowerAggrCopies::ID = 0;
+
+// Lower MemTransferInst or load-store pair to loop
+static void convertTransferToLoop(Instruction *splitAt, Value *srcAddr,
+ Value *dstAddr, Value *len,
+ //unsigned numLoads,
+ bool srcVolatile, bool dstVolatile,
+ LLVMContext &Context, Function &F) {
+ Type *indType = len->getType();
+
+ BasicBlock *origBB = splitAt->getParent();
+ BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+ BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+ origBB->getTerminator()->setSuccessor(0, loopBB);
+ IRBuilder<> builder(origBB, origBB->getTerminator());
+
+ // srcAddr and dstAddr are expected to be pointer types; cast<> will
+ // assert if they are not.
+ unsigned srcAS =
+ cast<PointerType>(srcAddr->getType())->getAddressSpace();
+ unsigned dstAS =
+ cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+ // Cast pointers to (char *)
+ srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
+ dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
+
+ IRBuilder<> loop(loopBB);
+ // The loop index (ind) is a phi node.
+ PHINode *ind = loop.CreatePHI(indType, 0);
+ // Incoming value for ind is 0
+ ind->addIncoming(ConstantInt::get(indType, 0), origBB);
+
+ // load from srcAddr+ind
+ Value *val = loop.CreateLoad(loop.CreateGEP(srcAddr, ind), srcVolatile);
+ // store at dstAddr+ind
+ loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), dstVolatile);
+
+ // The value for ind coming from backedge is (ind + 1)
+ Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
+ ind->addIncoming(newind, loopBB);
+
+ loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
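In plain C++ terms the IR built above behaves like the byte-copy loop below; like the IR, where origBB branches unconditionally into loopBB, the body runs at least once. Illustrative only, not code from this patch:

    static void transferLoopEquivalent(const char *src, char *dst,
                                       unsigned long len) {
      unsigned long ind = 0;        // the PHI node, incoming value 0
      do {
        dst[ind] = src[ind];        // load from src+ind, store to dst+ind
        ++ind;                      // newind = ind + 1 on the back edge
      } while (ind < len);          // CreateICmpULT(newind, len)
    }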
+
+// Lower MemSetInst to loop
+static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
+ Value *len, Value *val, LLVMContext &Context,
+ Function &F) {
+ BasicBlock *origBB = splitAt->getParent();
+ BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+ BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+ origBB->getTerminator()->setSuccessor(0, loopBB);
+ IRBuilder<> builder(origBB, origBB->getTerminator());
+
+ unsigned dstAS =
+ cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+ // Cast pointer to the type of value getting stored
+ dstAddr = builder.CreateBitCast(dstAddr,
+ PointerType::get(val->getType(), dstAS));
+
+ IRBuilder<> loop(loopBB);
+ PHINode *ind = loop.CreatePHI(len->getType(), 0);
+ ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
+
+ loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), false);
+
+ Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
+ ind->addIncoming(newind, loopBB);
+
+ loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+ SmallVector<LoadInst *, 4> aggrLoads;
+ SmallVector<MemTransferInst *, 4> aggrMemcpys;
+ SmallVector<MemSetInst *, 4> aggrMemsets;
+
+ TargetData *TD = &getAnalysis<TargetData>();
+ LLVMContext &Context = F.getParent()->getContext();
+
+ //
+ // Collect all the aggrLoads, aggrMemcpys and aggrMemsets.
+ //
+ //const BasicBlock *firstBB = &F.front(); // first BB in F
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ //BasicBlock *bb = BI;
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+ ++II) {
+ if (LoadInst * load = dyn_cast<LoadInst>(II)) {
+
+ if (!load->hasOneUse()) continue;
+
+ if (TD->getTypeStoreSize(load->getType()) < MaxAggrCopySize) continue;
+
+ User *use = *(load->use_begin());
+ if (StoreInst * store = dyn_cast<StoreInst>(use)) {
+ if (store->getOperand(0) != load) //getValueOperand
+ continue;
+ aggrLoads.push_back(load);
+ }
+ } else if (MemTransferInst * intr = dyn_cast<MemTransferInst>(II)) {
+ Value *len = intr->getLength();
+ // If the number of elements being copied is greater
+ // than MaxAggrCopySize, lower it to a loop
+ if (ConstantInt * len_int = dyn_cast<ConstantInt>(len)) {
+ if (len_int->getZExtValue() >= MaxAggrCopySize) {
+ aggrMemcpys.push_back(intr);
+ }
+ } else {
+ // turn variable length memcpy/memmov into loop
+ aggrMemcpys.push_back(intr);
+ }
+ } else if (MemSetInst * memsetintr = dyn_cast<MemSetInst>(II)) {
+ Value *len = memsetintr->getLength();
+ if (ConstantInt * len_int = dyn_cast<ConstantInt>(len)) {
+ if (len_int->getZExtValue() >= MaxAggrCopySize) {
+ aggrMemsets.push_back(memsetintr);
+ }
+ } else {
+ // turn variable length memset into loop
+ aggrMemsets.push_back(memsetintr);
+ }
+ }
+ }
+ }
+ if (aggrLoads.empty() && aggrMemcpys.empty() && aggrMemsets.empty())
+ return false;
+
+ //
+ // Do the transformation of an aggr load/copy/set to a loop
+ //
+ for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
+ LoadInst *load = aggrLoads[i];
+ StoreInst *store = dyn_cast<StoreInst>(*load->use_begin());
+ Value *srcAddr = load->getOperand(0);
+ Value *dstAddr = store->getOperand(1);
+ unsigned numLoads = TD->getTypeStoreSize(load->getType());
+ Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
+
+ convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
+ store->isVolatile(), Context, F);
+
+ store->eraseFromParent();
+ load->eraseFromParent();
+ }
+
+ for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
+ MemTransferInst *cpy = aggrMemcpys[i];
+ Value *len = cpy->getLength();
+ // The LLVM 2.7 form of memcpy does not carry a volatile operand yet,
+ // so we optimistically treat the copy as non-volatile to avoid
+ // emitting unnecessary st.volatile in the PTX output.
+ convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
+ false, Context, F);
+ cpy->eraseFromParent();
+ }
+
+ for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
+ MemSetInst *memsetinst = aggrMemsets[i];
+ Value *len = memsetinst->getLength();
+ Value *val = memsetinst->getValue();
+ convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
+ F);
+ memsetinst->eraseFromParent();
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createLowerAggrCopies() {
+ return new NVPTXLowerAggrCopies();
+}
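For reference, a minimal sketch of driving this pass standalone with the FunctionPassManager of this era; in the backend proper it is wired in via NVPTXPassConfig::addInstSelector() (see NVPTXTargetMachine.cpp later in this import). The driver function itself is an assumption for illustration:

    // Sketch only: assumes llvm/PassManager.h's FunctionPassManager and the
    // pre-3.2 TargetData analysis that the pass declares as required.
    #include "NVPTXLowerAggrCopies.h"
    #include "llvm/Module.h"
    #include "llvm/PassManager.h"
    #include "llvm/Target/TargetData.h"

    using namespace llvm;

    static void lowerAggrCopiesIn(Module &M) {
      FunctionPassManager FPM(&M);
      FPM.add(new TargetData(&M));        // satisfies getAnalysisUsage()
      FPM.add(createLowerAggrCopies());
      FPM.doInitialization();
      for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
        if (!F->isDeclaration())
          FPM.run(*F);
      FPM.doFinalization();
    }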
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
new file mode 100644
index 0000000..ac7f150
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -0,0 +1,47 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVIDIA-specific lowering of
+// aggregate copies.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_LOWER_AGGR_COPIES_H
+#define NVPTX_LOWER_AGGR_COPIES_H
+
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+// The actual transformation class, implemented as a FunctionPass.
+struct NVPTXLowerAggrCopies : public FunctionPass {
+ static char ID;
+
+ NVPTXLowerAggrCopies() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ AU.addPreserved<MachineFunctionAnalysis>();
+ }
+
+ virtual bool runOnFunction(Function &F);
+
+ static const unsigned MaxAggrCopySize = 128;
+
+ virtual const char *getPassName() const {
+ return "Lower aggregate copies/intrinsics into loops";
+ }
+};
+
+extern FunctionPass *createLowerAggrCopies();
+}
+
+#endif
diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.cpp b/lib/Target/NVPTX/NVPTXNumRegisters.h
index 60acfc7..b4a4dbc 100644
--- a/lib/Target/PTX/PTXMachineFunctionInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXNumRegisters.h
@@ -1,4 +1,5 @@
-//===-- PTXMachineFuctionInfo.cpp - PTX machine function info -------------===//
+
+//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,8 +8,13 @@
//
//===----------------------------------------------------------------------===//
-#include "PTXMachineFunctionInfo.h"
+#ifndef NVPTX_NUM_REGISTERS_H
+#define NVPTX_NUM_REGISTERS_H
+
+namespace llvm {
+
+const unsigned NVPTXNumRegisters = 396;
-using namespace llvm;
+}
-void PTXMachineFunctionInfo::anchor() { }
+#endif
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
new file mode 100644
index 0000000..e3cd46f
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -0,0 +1,325 @@
+//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-reg-info"
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+
+using namespace llvm;
+
+namespace llvm
+{
+std::string getNVPTXRegClassName (TargetRegisterClass const *RC) {
+ if (RC == &NVPTX::Float32RegsRegClass) {
+ return ".f32";
+ }
+ else if (RC == &NVPTX::Float64RegsRegClass) {
+ return ".f64";
+ }
+ else if (RC == &NVPTX::Int64RegsRegClass) {
+ return ".s64";
+ }
+ else if (RC == &NVPTX::Int32RegsRegClass) {
+ return ".s32";
+ }
+ else if (RC == &NVPTX::Int16RegsRegClass) {
+ return ".s16";
+ }
+ // Int8Regs become 16-bit registers in PTX
+ else if (RC == &NVPTX::Int8RegsRegClass) {
+ return ".s16";
+ }
+ else if (RC == &NVPTX::Int1RegsRegClass) {
+ return ".pred";
+ }
+ else if (RC == &NVPTX::SpecialRegsRegClass) {
+ return "!Special!";
+ }
+ else if (RC == &NVPTX::V2F32RegsRegClass) {
+ return ".v2.f32";
+ }
+ else if (RC == &NVPTX::V4F32RegsRegClass) {
+ return ".v4.f32";
+ }
+ else if (RC == &NVPTX::V2I32RegsRegClass) {
+ return ".v2.s32";
+ }
+ else if (RC == &NVPTX::V4I32RegsRegClass) {
+ return ".v4.s32";
+ }
+ else if (RC == &NVPTX::V2F64RegsRegClass) {
+ return ".v2.f64";
+ }
+ else if (RC == &NVPTX::V2I64RegsRegClass) {
+ return ".v2.s64";
+ }
+ else if (RC == &NVPTX::V2I16RegsRegClass) {
+ return ".v2.s16";
+ }
+ else if (RC == &NVPTX::V4I16RegsRegClass) {
+ return ".v4.s16";
+ }
+ else if (RC == &NVPTX::V2I8RegsRegClass) {
+ return ".v2.s16";
+ }
+ else if (RC == &NVPTX::V4I8RegsRegClass) {
+ return ".v4.s16";
+ }
+ else {
+ return "INTERNAL";
+ }
+ return "";
+}
+
+std::string getNVPTXRegClassStr (TargetRegisterClass const *RC) {
+ if (RC == &NVPTX::Float32RegsRegClass) {
+ return "%f";
+ }
+ else if (RC == &NVPTX::Float64RegsRegClass) {
+ return "%fd";
+ }
+ else if (RC == &NVPTX::Int64RegsRegClass) {
+ return "%rd";
+ }
+ else if (RC == &NVPTX::Int32RegsRegClass) {
+ return "%r";
+ }
+ else if (RC == &NVPTX::Int16RegsRegClass) {
+ return "%rs";
+ }
+ else if (RC == &NVPTX::Int8RegsRegClass) {
+ return "%rc";
+ }
+ else if (RC == &NVPTX::Int1RegsRegClass) {
+ return "%p";
+ }
+ else if (RC == &NVPTX::SpecialRegsRegClass) {
+ return "!Special!";
+ }
+ else if (RC == &NVPTX::V2F32RegsRegClass) {
+ return "%v2f";
+ }
+ else if (RC == &NVPTX::V4F32RegsRegClass) {
+ return "%v4f";
+ }
+ else if (RC == &NVPTX::V2I32RegsRegClass) {
+ return "%v2r";
+ }
+ else if (RC == &NVPTX::V4I32RegsRegClass) {
+ return "%v4r";
+ }
+ else if (RC == &NVPTX::V2F64RegsRegClass) {
+ return "%v2fd";
+ }
+ else if (RC == &NVPTX::V2I64RegsRegClass) {
+ return "%v2rd";
+ }
+ else if (RC == &NVPTX::V2I16RegsRegClass) {
+ return "%v2s";
+ }
+ else if (RC == &NVPTX::V4I16RegsRegClass) {
+ return "%v4rs";
+ }
+ else if (RC == &NVPTX::V2I8RegsRegClass) {
+ return "%v2rc";
+ }
+ else if (RC == &NVPTX::V4I8RegsRegClass) {
+ return "%v4rc";
+ }
+ else {
+ return "INTERNAL";
+ }
+ return "";
+}
+
+bool isNVPTXVectorRegClass(TargetRegisterClass const *RC) {
+ if (RC->getID() == NVPTX::V2F32RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V2F64RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V2I16RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V2I32RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V2I64RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V2I8RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V4F32RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V4I16RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V4I32RegsRegClassID)
+ return true;
+ if (RC->getID() == NVPTX::V4I8RegsRegClassID)
+ return true;
+ return false;
+}
+
+std::string getNVPTXElemClassName(TargetRegisterClass const *RC) {
+ if (RC->getID() == NVPTX::V2F32RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass);
+ if (RC->getID() == NVPTX::V2F64RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Float64RegsRegClass);
+ if (RC->getID() == NVPTX::V2I16RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass);
+ if (RC->getID() == NVPTX::V2I32RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass);
+ if (RC->getID() == NVPTX::V2I64RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Int64RegsRegClass);
+ if (RC->getID() == NVPTX::V2I8RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass);
+ if (RC->getID() == NVPTX::V4F32RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass);
+ if (RC->getID() == NVPTX::V4I16RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass);
+ if (RC->getID() == NVPTX::V4I32RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass);
+ if (RC->getID() == NVPTX::V4I8RegsRegClassID)
+ return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass);
+ llvm_unreachable("Not a vector register class");
+}
+
+const TargetRegisterClass *getNVPTXElemClass(TargetRegisterClass const *RC) {
+ if (RC->getID() == NVPTX::V2F32RegsRegClassID)
+ return (&NVPTX::Float32RegsRegClass);
+ if (RC->getID() == NVPTX::V2F64RegsRegClassID)
+ return (&NVPTX::Float64RegsRegClass);
+ if (RC->getID() == NVPTX::V2I16RegsRegClassID)
+ return (&NVPTX::Int16RegsRegClass);
+ if (RC->getID() == NVPTX::V2I32RegsRegClassID)
+ return (&NVPTX::Int32RegsRegClass);
+ if (RC->getID() == NVPTX::V2I64RegsRegClassID)
+ return (&NVPTX::Int64RegsRegClass);
+ if (RC->getID() == NVPTX::V2I8RegsRegClassID)
+ return (&NVPTX::Int8RegsRegClass);
+ if (RC->getID() == NVPTX::V4F32RegsRegClassID)
+ return (&NVPTX::Float32RegsRegClass);
+ if (RC->getID() == NVPTX::V4I16RegsRegClassID)
+ return (&NVPTX::Int16RegsRegClass);
+ if (RC->getID() == NVPTX::V4I32RegsRegClassID)
+ return (&NVPTX::Int32RegsRegClass);
+ if (RC->getID() == NVPTX::V4I8RegsRegClassID)
+ return (&NVPTX::Int8RegsRegClass);
+ llvm_unreachable("Not a vector register class");
+}
+
+int getNVPTXVectorSize(TargetRegisterClass const *RC) {
+ if (RC->getID() == NVPTX::V2F32RegsRegClassID)
+ return 2;
+ if (RC->getID() == NVPTX::V2F64RegsRegClassID)
+ return 2;
+ if (RC->getID() == NVPTX::V2I16RegsRegClassID)
+ return 2;
+ if (RC->getID() == NVPTX::V2I32RegsRegClassID)
+ return 2;
+ if (RC->getID() == NVPTX::V2I64RegsRegClassID)
+ return 2;
+ if (RC->getID() == NVPTX::V2I8RegsRegClassID)
+ return 2;
+ if (RC->getID() == NVPTX::V4F32RegsRegClassID)
+ return 4;
+ if (RC->getID() == NVPTX::V4I16RegsRegClassID)
+ return 4;
+ if (RC->getID() == NVPTX::V4I32RegsRegClassID)
+ return 4;
+ if (RC->getID() == NVPTX::V4I8RegsRegClassID)
+ return 4;
+ llvm_unreachable("Not a vector register class");
+}
+}
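A small illustration of how the vector-register helpers above compose; for V4F32Regs this prints "elem=.f32 count=4". The helper below is hypothetical, not part of the patch:

    #include "NVPTXRegisterInfo.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    static void describeVectorClass(const TargetRegisterClass *RC) {
      if (!isNVPTXVectorRegClass(RC))
        return;                     // scalar classes have no element class
      errs() << "elem=" << getNVPTXElemClassName(RC)
             << " count=" << getNVPTXVectorSize(RC) << "\n";
    }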
+
+NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii,
+ const NVPTXSubtarget &st)
+ : NVPTXGenRegisterInfo(0),
+ Is64Bit(st.is64Bit()) {}
+
+#define GET_REGINFO_TARGET_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+/// NVPTX Callee Saved Registers
+const uint16_t* NVPTXRegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const uint16_t CalleeSavedRegs[] = { 0 };
+ return CalleeSavedRegs;
+}
+
+// NVPTX Callee Saved Reg Classes
+const TargetRegisterClass* const*
+NVPTXRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+ return CalleeSavedRegClasses;
+}
+
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ return Reserved;
+}
+
+void NVPTXRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() &&
+ "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ MI.getOperand(i+1).getImm();
+
+ // Use VRFrame (%SP) as the frame pointer.
+ MI.getOperand(i).ChangeToRegister(NVPTX::VRFrame, false);
+ MI.getOperand(i+1).ChangeToImmediate(Offset);
+}
+
+
+int NVPTXRegisterInfo::
+getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ return 0;
+}
+
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return NVPTX::VRFrame;
+}
+
+unsigned NVPTXRegisterInfo::getRARegister() const {
+ return 0;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void NVPTXRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // Simply discard ADJCALLSTACKDOWN,
+ // ADJCALLSTACKUP instructions.
+ MBB.erase(I);
+}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h
new file mode 100644
index 0000000..5951783
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -0,0 +1,92 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXREGISTERINFO_H
+#define NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
+
+namespace llvm {
+
+// Forward Declarations.
+class TargetInstrInfo;
+class NVPTXSubtarget;
+
+class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
+private:
+ bool Is64Bit;
+ // Holds strings that are freed together with the NVPTXRegisterInfo.
+ ManagedStringPool ManagedStrPool;
+
+public:
+ NVPTXRegisterInfo(const TargetInstrInfo &tii,
+ const NVPTXSubtarget &st);
+
+
+ //------------------------------------------------------
+ // Pure virtual functions from TargetRegisterInfo
+ //------------------------------------------------------
+
+ // NVPTX callee saved registers
+ virtual const uint16_t*
+ getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ // NVPTX callee saved register classes
+ virtual const TargetRegisterClass* const *
+ getCalleeSavedRegClasses(const MachineFunction *MF) const;
+
+ virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj,
+ RegScavenger *RS=NULL) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+ virtual unsigned getFrameRegister(const MachineFunction &MF) const;
+ virtual unsigned getRARegister() const;
+
+ ManagedStringPool *getStrPool() const {
+ return const_cast<ManagedStringPool *>(&ManagedStrPool);
+ }
+
+ const char *getName(unsigned RegNo) const {
+ std::stringstream O;
+ O << "reg" << RegNo;
+ return getStrPool()->getManagedString(O.str().c_str())->c_str();
+ }
+
+};
+
+
+std::string getNVPTXRegClassName (const TargetRegisterClass *RC);
+std::string getNVPTXRegClassStr (const TargetRegisterClass *RC);
+bool isNVPTXVectorRegClass (const TargetRegisterClass *RC);
+std::string getNVPTXElemClassName (const TargetRegisterClass *RC);
+int getNVPTXVectorSize (const TargetRegisterClass *RC);
+const TargetRegisterClass *getNVPTXElemClass(const TargetRegisterClass *RC);
+
+} // end namespace llvm
+
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
new file mode 100644
index 0000000..ba15825
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -0,0 +1,108 @@
+//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+class NVPTXReg<string n> : Register<n> {
+ let Namespace = "NVPTX";
+}
+
+class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
+ : RegisterClass <"NVPTX", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+// Special Registers used as stack pointer
+def VRFrame : NVPTXReg<"%SP">;
+def VRFrameLocal : NVPTXReg<"%SPL">;
+
+// Special Registers used as the stack
+def VRDepot : NVPTXReg<"%Depot">;
+
+foreach i = 0-395 in {
+ def P#i : NVPTXReg<"%p"#i>; // Predicate
+ def RC#i : NVPTXReg<"%rc"#i>; // 8-bit
+ def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
+ def R#i : NVPTXReg<"%r"#i>; // 32-bit
+ def RL#i : NVPTXReg<"%rl"#i>; // 64-bit
+ def F#i : NVPTXReg<"%f"#i>; // 32-bit float
+ def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float
+ // Vectors
+ foreach s = [ "2b8", "2b16", "2b32", "2b64", "4b8", "4b16", "4b32" ] in
+ def v#s#_#i : NVPTXReg<"%v"#s#"_"#i>;
+
+ // Arguments
+ def ia#i : NVPTXReg<"%ia"#i>;
+ def la#i : NVPTXReg<"%la"#i>;
+ def fa#i : NVPTXReg<"%fa"#i>;
+ def da#i : NVPTXReg<"%da"#i>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register classes
+//===----------------------------------------------------------------------===//
+def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 395))>;
+def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%u", 0, 395))>;
+def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 395))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 395))>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 395))>;
+def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 395))>;
+def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 395))>;
+def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 395))>;
+def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 395))>;
+def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 395))>;
+def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 395))>;
+
+// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
+
+class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList,
+ NVPTXRegClass sClass,
+ int e,
+ string n>
+ : NVPTXRegClass<regTypes, alignment, regList>
+{
+ NVPTXRegClass scalarClass=sClass;
+ int elems=e;
+ string name=n;
+}
+def V2F32Regs
+ : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%u", 0, 395)),
+ Float32Regs, 2, ".v2.f32">;
+def V4F32Regs
+ : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%u", 0, 395)),
+ Float32Regs, 4, ".v4.f32">;
+def V2I32Regs
+ : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%u", 0, 395)),
+ Int32Regs, 2, ".v2.u32">;
+def V4I32Regs
+ : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%u", 0, 395)),
+ Int32Regs, 4, ".v4.u32">;
+def V2F64Regs
+ : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%u", 0, 395)),
+ Float64Regs, 2, ".v2.f64">;
+def V2I64Regs
+ : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%u", 0, 395)),
+ Int64Regs, 2, ".v2.u64">;
+def V2I16Regs
+ : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%u", 0, 395)),
+ Int16Regs, 2, ".v2.u16">;
+def V4I16Regs
+ : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%u", 0, 395)),
+ Int16Regs, 4, ".v4.u16">;
+def V2I8Regs
+ : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%u", 0, 395)),
+ Int8Regs, 2, ".v2.u8">;
+def V4I8Regs
+ : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%u", 0, 395)),
+ Int8Regs, 4, ".v4.u8">;
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
new file mode 100644
index 0000000..f1ca466
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -0,0 +1,45 @@
+//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTXSection class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_NVPTXSECTION_H
+#define LLVM_NVPTXSECTION_H
+
+#include "llvm/MC/MCSection.h"
+#include "llvm/GlobalVariable.h"
+#include <vector>
+
+namespace llvm {
+/// NVPTXSection - Represents a section in PTX.
+/// PTX does not have sections; this class exists only so that the
+/// AsmPrinter interface can be reused.
+///
+class NVPTXSection : public MCSection {
+
+public:
+ NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
+ ~NVPTXSection() {}
+
+ /// Overridden as a no-op, since NVPTX has its own way of printing a
+ /// switch to a section.
+ virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
+ raw_ostream &OS) const {}
+
+ /// Base address of PTX sections is zero.
+ virtual bool isBaseAddressKnownZero() const { return true; }
+ virtual bool UseCodeAlign() const { return false; }
+ virtual bool isVirtualSection() const { return false; }
+};
+
+} // end namespace llvm
+
+#endif
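These placeholder sections are typically instantiated by the target's object-file lowering (not part of this hunk); the variant and kind arguments below are assumptions for illustration:

    #include "NVPTXSection.h"
    #include "llvm/MC/SectionKind.h"

    using namespace llvm;

    static MCSection *makeTextSection() {
      // PTX has no real sections; the variant/kind only satisfy the
      // MCSection interface, and PrintSwitchToSection emits nothing.
      return new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
    }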
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
new file mode 100644
index 0000000..2836cad
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
@@ -0,0 +1,77 @@
+//===- NVPTXSplitBBatBar.cpp - Split BB at Barrier --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Split basic blocks so that a basic block that contains a barrier instruction
+// only contains the barrier instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/InstIterator.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXSplitBBatBar.h"
+
+using namespace llvm;
+
+namespace llvm {
+FunctionPass *createSplitBBatBarPass();
+}
+
+char NVPTXSplitBBatBar::ID = 0;
+
+bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
+
+ SmallVector<Instruction *, 4> SplitPoints;
+ bool changed = false;
+
+ // Collect all the split points in SplitPoints
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ BasicBlock::iterator IB = BI->begin();
+ BasicBlock::iterator II = IB;
+ BasicBlock::iterator IE = BI->end();
+
+ // Skip the first instruction; no splitting is needed at this
+ // point even if it is a barrier.
+ while (II != IE) {
+ if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
+ Intrinsic::ID id = inst->getIntrinsicID();
+ // If this is a barrier, split at this instruction
+ // and the next instruction.
+ if (llvm::isBarrierIntrinsic(id)) {
+ if (II != IB)
+ SplitPoints.push_back(II);
+ II++;
+ if ((II != IE) && (!II->isTerminator())) {
+ SplitPoints.push_back(II);
+ II++;
+ }
+ continue;
+ }
+ }
+ II++;
+ }
+ }
+
+ for (unsigned i = 0; i != SplitPoints.size(); i++) {
+ changed = true;
+ Instruction *inst = SplitPoints[i];
+ inst->getParent()->splitBasicBlock(inst, "bar_split");
+ }
+
+ return changed;
+}
+
+// This interface will most likely not be necessary, because this pass will
+// not be invoked by the driver, but will be used as a prerequisite to
+// another pass.
+FunctionPass *llvm::createSplitBBatBarPass() {
+ return new NVPTXSplitBBatBar();
+}
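The core primitive here is splitBasicBlock, which cuts a block immediately before a given instruction; applied twice, it leaves a barrier alone in its own bar_split block. A standalone sketch (hypothetical helper; assumes the barrier is neither the first instruction nor immediately followed by the terminator, the same cases the pass guards against):

    #include "llvm/BasicBlock.h"
    #include "llvm/Instruction.h"

    using namespace llvm;

    static void isolateBarrier(Instruction *Barrier) {
      // First cut: the barrier becomes the head of a new block.
      Barrier->getParent()->splitBasicBlock(Barrier, "bar_split");
      // Second cut: everything after the barrier moves to another block.
      Instruction *Next = Barrier->getNextNode();
      if (Next && !Next->isTerminator())
        Next->getParent()->splitBasicBlock(Next, "bar_split");
    }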
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.h b/lib/Target/NVPTX/NVPTXSplitBBatBar.h
new file mode 100644
index 0000000..9e4d5a0
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.h
@@ -0,0 +1,41 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVIDIA-specific declarations for splitting
+// basic blocks at barrier instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_SPLIT_BB_AT_BAR_H
+#define NVPTX_SPLIT_BB_AT_BAR_H
+
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+
+namespace llvm {
+
+// The actual transformation class, implemented as a FunctionPass.
+struct NVPTXSplitBBatBar : public FunctionPass {
+ static char ID;
+
+ NVPTXSplitBBatBar() : FunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<MachineFunctionAnalysis>();
+ }
+ virtual bool runOnFunction(Function &F);
+
+ virtual const char *getPassName() const {
+ return "Split basic blocks at barrier";
+ }
+};
+
+extern FunctionPass *createSplitBBatBarPass();
+}
+
+#endif //NVPTX_SPLIT_BB_AT_BAR_H
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
new file mode 100644
index 0000000..6aadd43
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -0,0 +1,57 @@
+//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXSubtarget.h"
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "NVPTXGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+// Select Driver Interface
+#include "llvm/Support/CommandLine.h"
+namespace {
+cl::opt<NVPTX::DrvInterface>
+DriverInterface(cl::desc("Choose driver interface:"),
+ cl::values(
+ clEnumValN(NVPTX::NVCL, "drvnvcl", "Nvidia OpenCL driver"),
+ clEnumValN(NVPTX::CUDA, "drvcuda", "Nvidia CUDA driver"),
+ clEnumValN(NVPTX::TEST, "drvtest", "Plain Test"),
+ clEnumValEnd),
+ cl::init(NVPTX::NVCL));
+}
+
+NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, bool is64Bit)
+:NVPTXGenSubtargetInfo(TT, "", FS), // Don't pass CPU to subtarget,
+ // because we don't register all
+ // nvptx targets.
+ Is64Bit(is64Bit) {
+
+ drvInterface = DriverInterface;
+
+ // Provide the default CPU if none
+ std::string defCPU = "sm_10";
+
+ // Get the TargetName from the FS if available
+ if (FS.empty() && CPU.empty())
+ TargetName = defCPU;
+ else if (!CPU.empty())
+ TargetName = CPU;
+ else
+ llvm_unreachable("we are not using FeatureStr");
+
+ // Set up the SmVersion
+ SmVersion = atoi(TargetName.c_str()+3);
+}
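The SmVersion computation is simply "skip the sm_ prefix and parse the number". A standalone restatement with assumed inputs:

    #include <cstdlib>
    #include <string>

    static unsigned smVersionOf(const std::string &TargetName) {
      // "sm_10" -> 10, "sm_20" -> 20; no validation, as in the constructor,
      // so callers rely on the default CPU having been supplied first.
      return static_cast<unsigned>(std::atoi(TargetName.c_str() + 3));
    }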
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
new file mode 100644
index 0000000..8f2a629
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -0,0 +1,92 @@
+//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXSUBTARGET_H
+#define NVPTXSUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "NVPTX.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "NVPTXGenSubtargetInfo.inc"
+
+#include <string>
+
+namespace llvm {
+
+class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
+
+ unsigned int SmVersion;
+ std::string TargetName;
+ NVPTX::DrvInterface drvInterface;
+ bool dummy; // For the 'dummy' feature, see NVPTX.td
+ bool Is64Bit;
+
+public:
+ /// This constructor initializes the data members to match those
+ /// of the specified triple, CPU, and feature string.
+ ///
+ NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, bool is64Bit);
+
+ bool hasBrkPt() const { return SmVersion >= 11; }
+ bool hasAtomRedG32() const { return SmVersion >= 11; }
+ bool hasAtomRedS32() const { return SmVersion >= 12; }
+ bool hasAtomRedG64() const { return SmVersion >= 12; }
+ bool hasAtomRedS64() const { return SmVersion >= 20; }
+ bool hasAtomRedGen32() const { return SmVersion >= 20; }
+ bool hasAtomRedGen64() const { return SmVersion >= 20; }
+ bool hasAtomAddF32() const { return SmVersion >= 20; }
+ bool hasVote() const { return SmVersion >= 12; }
+ bool hasDouble() const { return SmVersion >= 13; }
+ bool reqPTX20() const { return SmVersion >= 20; }
+ bool hasF32FTZ() const { return SmVersion >= 20; }
+ bool hasFMAF32() const { return SmVersion >= 20; }
+ bool hasFMAF64() const { return SmVersion >= 13; }
+ bool hasLDU() const { return SmVersion >= 20; }
+ bool hasGenericLdSt() const { return SmVersion >= 20; }
+ inline bool hasHWROT32() const { return false; }
+ inline bool hasSWROT32() const { return true; }
+ inline bool hasROT32() const { return hasHWROT32() || hasSWROT32() ; }
+ inline bool hasROT64() const { return SmVersion >= 20; }
+
+
+ bool is64Bit() const { return Is64Bit; }
+
+ unsigned int getSmVersion() const { return SmVersion; }
+ NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
+ std::string getTargetName() const { return TargetName; }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ std::string getDataLayout() const {
+ const char *p;
+ if (is64Bit())
+ p = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
+ "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
+ "n16:32:64";
+ else
+ p = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
+ "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
+ "n16:32:64";
+
+ return std::string(p);
+ }
+
+};
+
+} // End llvm namespace
+
+#endif // NVPTXSUBTARGET_H
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
new file mode 100644
index 0000000..433f415
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -0,0 +1,133 @@
+//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetMachine.h"
+#include "NVPTX.h"
+#include "NVPTXSplitBBatBar.h"
+#include "NVPTXLowerAggrCopies.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTXAllocaHoisting.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+
+using namespace llvm;
+
+
+extern "C" void LLVMInitializeNVPTXTarget() {
+ // Register the target.
+ RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
+ RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
+
+ RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32);
+ RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64);
+
+}
+
+NVPTXTargetMachine::NVPTXTargetMachine(const Target &T,
+ StringRef TT,
+ StringRef CPU,
+ StringRef FS,
+ const TargetOptions& Options,
+ Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL,
+ bool is64bit)
+: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, is64bit),
+ DataLayout(Subtarget.getDataLayout()),
+ InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering(*this,is64bit)
+/*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
+}
+
+
+
+void NVPTXTargetMachine32::anchor() {}
+
+NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
+}
+
+void NVPTXTargetMachine64::anchor() {}
+
+NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
+}
+
+
+namespace llvm {
+class NVPTXPassConfig : public TargetPassConfig {
+public:
+ NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ NVPTXTargetMachine &getNVPTXTargetMachine() const {
+ return getTM<NVPTXTargetMachine>();
+ }
+
+ virtual bool addInstSelector();
+ virtual bool addPreRegAlloc();
+};
+}
+
+TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
+ NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
+ return PassConfig;
+}
+
+bool NVPTXPassConfig::addInstSelector() {
+  // Lower aggregate copies (memcpy and friends) into explicit loops.
+  addPass(createLowerAggrCopies());
+  // Split basic blocks at barrier intrinsics.
+  addPass(createSplitBBatBarPass());
+  // Hoist allocas out of non-entry blocks into the entry block.
+  addPass(createAllocaHoisting());
+  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+  addPass(createVectorElementizePass(getNVPTXTargetMachine()));
+  return false;
+}
+
+bool NVPTXPassConfig::addPreRegAlloc() {
+ return false;
+}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
new file mode 100644
index 0000000..b3f9cac
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -0,0 +1,125 @@
+//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef NVPTX_TARGETMACHINE_H
+#define NVPTX_TARGETMACHINE_H
+
+#include "NVPTXInstrInfo.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXFrameLowering.h"
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+/// NVPTXTargetMachine
+///
+class NVPTXTargetMachine : public LLVMTargetMachine {
+ NVPTXSubtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ NVPTXInstrInfo InstrInfo;
+ NVPTXTargetLowering TLInfo;
+ TargetSelectionDAGInfo TSInfo;
+
+  // NVPTX does not have any call stack frame, but needs an NVPTX-specific
+  // FrameLowering class because TargetFrameLowering is abstract.
+ NVPTXFrameLowering FrameLowering;
+
+  // Holds strings that can be freed all together with the NVPTXTargetMachine.
+ ManagedStringPool ManagedStrPool;
+
+public:
+ NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OP,
+ bool is64bit);
+
+ virtual const TargetFrameLowering *getFrameLowering() const {
+ return &FrameLowering;
+ }
+ virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetData *getTargetData() const { return &DataLayout;}
+ virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget;}
+
+ virtual const NVPTXRegisterInfo *getRegisterInfo() const {
+ return &(InstrInfo.getRegisterInfo());
+ }
+
+ virtual NVPTXTargetLowering *getTargetLowering() const {
+ return const_cast<NVPTXTargetLowering*>(&TLInfo);
+ }
+
+ virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
+ return &TSInfo;
+ }
+
+ ManagedStringPool *getManagedStrPool() const {
+ return const_cast<ManagedStringPool*>(&ManagedStrPool);
+ }
+
+ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+ // Emission of machine code through JITCodeEmitter is not supported.
+ virtual bool addPassesToEmitMachineCode(PassManagerBase &,
+ JITCodeEmitter &,
+ bool = true) {
+ return true;
+ }
+
+ // Emission of machine code through MCJIT is not supported.
+ virtual bool addPassesToEmitMC(PassManagerBase &,
+ MCContext *&,
+ raw_ostream &,
+ bool = true) {
+ return true;
+ }
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine {
+ virtual void anchor();
+public:
+ NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine {
+ virtual void anchor();
+public:
+ NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 0000000..b5698a2
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,105 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <string>
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
+
+public:
+ NVPTXTargetObjectFile() {}
+ ~NVPTXTargetObjectFile() {
+ delete TextSection;
+ delete DataSection;
+ delete BSSSection;
+ delete ReadOnlySection;
+
+ delete StaticCtorSection;
+ delete StaticDtorSection;
+ delete LSDASection;
+ delete EHFrameSection;
+ delete DwarfAbbrevSection;
+ delete DwarfInfoSection;
+ delete DwarfLineSection;
+ delete DwarfFrameSection;
+ delete DwarfPubTypesSection;
+ delete DwarfDebugInlineSection;
+ delete DwarfStrSection;
+ delete DwarfLocSection;
+ delete DwarfARangesSection;
+ delete DwarfRangesSection;
+ delete DwarfMacroInfoSection;
+ }
+
+ virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+ TextSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getText());
+ DataSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getDataRel());
+ BSSSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getBSS());
+ ReadOnlySection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getReadOnly());
+
+ StaticCtorSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ StaticDtorSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ LSDASection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ EHFrameSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfAbbrevSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfInfoSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfLineSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfFrameSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfPubTypesSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfDebugInlineSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfStrSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfLocSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfARangesSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ DwarfMacroInfoSection = new NVPTXSection(MCSection::SV_ELF,
+ SectionKind::getMetadata());
+ }
+
+ virtual const MCSection *getSectionForConstant(SectionKind Kind) const {
+ return ReadOnlySection;
+ }
+
+ virtual const MCSection *
+ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang,
+ const TargetMachine &TM) const {
+ return DataSection;
+ }
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 0000000..3f52251
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,514 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions used by the NVPTX backend.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Constants.h"
+#include "llvm/Operator.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+//#include <iostream>
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InstIterator.h"
+
+using namespace llvm;
+
+typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
+typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
+typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
+
+ManagedStatic<per_module_annot_t> annotationCache;
+
+
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+ assert(md && "Invalid mdnode for annotation");
+ assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+  // Start at index 1 to skip the global-variable key, and step by 2 so each
+  // iteration lands on the property of a property/value pair.
+ for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+ // property
+ const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+ assert(prop && "Annotation property not a string");
+
+ // value
+ ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i+1));
+ assert(Val && "Value operand not a constant int");
+
+ std::string keyname = prop->getString().str();
+ if (retval.find(keyname) != retval.end())
+ retval[keyname].push_back(Val->getZExtValue());
+ else {
+ std::vector<unsigned> tmp;
+ tmp.push_back(Val->getZExtValue());
+ retval[keyname] = tmp;
+ }
+ }
+}
+
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+ NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
+ if (!NMD)
+ return;
+ key_val_pair_t tmp;
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *elem = NMD->getOperand(i);
+
+ Value *entity = elem->getOperand(0);
+ // entity may be null due to DCE
+ if (!entity)
+ continue;
+ if (entity != gv)
+ continue;
+
+ // accumulate annotations for entity in tmp
+ cacheAnnotationFromMD(elem, tmp);
+ }
+
+ if (tmp.empty()) // no annotations for this gv
+ return;
+
+ if ((*annotationCache).find(m) != (*annotationCache).end())
+ (*annotationCache)[m][gv] = tmp;
+ else {
+ global_val_annot_t tmp1;
+ tmp1[gv] = tmp;
+ (*annotationCache)[m] = tmp1;
+ }
+}
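+
+// Illustrative sketch, not part of the original source: the named metadata
+// walked above is usually shaped like (in this version's IR syntax)
+//
+//   !nvvm.annotations = !{!0}
+//   !0 = metadata !{void ()* @kernel, metadata !"kernel", i32 1}
+//
+// Operand 0 is the annotated entity; the remaining operands come in
+// property/value pairs, which is why the per-MDNode walk starts at index 1
+// and steps by 2.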
+
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+ unsigned &retval) {
+ const Module *m = gv->getParent();
+ if ((*annotationCache).find(m) == (*annotationCache).end())
+ cacheAnnotationFromMD(m, gv);
+ else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+ cacheAnnotationFromMD(m, gv);
+ if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+ return false;
+ retval = (*annotationCache)[m][gv][prop][0];
+ return true;
+}
+
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+ std::vector<unsigned> &retval) {
+ const Module *m = gv->getParent();
+ if ((*annotationCache).find(m) == (*annotationCache).end())
+ cacheAnnotationFromMD(m, gv);
+ else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+ cacheAnnotationFromMD(m, gv);
+ if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+ return false;
+ retval = (*annotationCache)[m][gv][prop];
+ return true;
+}
+
+bool llvm::isTexture(const llvm::Value &val) {
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned annot;
+ if (llvm::findOneNVVMAnnotation(gv,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ISTEXTURE],
+ annot)) {
+ assert((annot == 1) && "Unexpected annotation on a texture symbol");
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isSurface(const llvm::Value &val) {
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned annot;
+ if (llvm::findOneNVVMAnnotation(gv,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSURFACE],
+ annot)) {
+ assert((annot == 1) && "Unexpected annotation on a surface symbol");
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isSampler(const llvm::Value &val) {
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned annot;
+ if (llvm::findOneNVVMAnnotation(gv,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+ annot)) {
+ assert((annot == 1) && "Unexpected annotation on a sampler symbol");
+ return true;
+ }
+ }
+ if (const Argument *arg = dyn_cast<Argument>(&val)) {
+ const Function *func = arg->getParent();
+ std::vector<unsigned> annot;
+ if (llvm::findAllNVVMAnnotation(func,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+ annot)) {
+ if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isImageReadOnly(const llvm::Value &val) {
+ if (const Argument *arg = dyn_cast<Argument>(&val)) {
+ const Function *func = arg->getParent();
+ std::vector<unsigned> annot;
+ if (llvm::findAllNVVMAnnotation(func,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ISREADONLY_IMAGE_PARAM],
+ annot)) {
+ if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isImageWriteOnly(const llvm::Value &val) {
+ if (const Argument *arg = dyn_cast<Argument>(&val)) {
+ const Function *func = arg->getParent();
+ std::vector<unsigned> annot;
+ if (llvm::findAllNVVMAnnotation(func,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ISWRITEONLY_IMAGE_PARAM],
+ annot)) {
+ if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isImage(const llvm::Value &val) {
+ return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val);
+}
+
+std::string llvm::getTextureName(const llvm::Value &val) {
+ assert(val.hasName() && "Found texture variable with no name");
+ return val.getName();
+}
+
+std::string llvm::getSurfaceName(const llvm::Value &val) {
+ assert(val.hasName() && "Found surface variable with no name");
+ return val.getName();
+}
+
+std::string llvm::getSamplerName(const llvm::Value &val) {
+ assert(val.hasName() && "Found sampler variable with no name");
+ return val.getName();
+}
+
+bool llvm::getMaxNTIDx(const Function &F, unsigned &x) {
+ return (llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_X],
+ x));
+}
+
+bool llvm::getMaxNTIDy(const Function &F, unsigned &y) {
+ return (llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Y],
+ y));
+}
+
+bool llvm::getMaxNTIDz(const Function &F, unsigned &z) {
+ return (llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Z],
+ z));
+}
+
+bool llvm::getReqNTIDx(const Function &F, unsigned &x) {
+ return (llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_X],
+ x));
+}
+
+bool llvm::getReqNTIDy(const Function &F, unsigned &y) {
+ return (llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Y],
+ y));
+}
+
+bool llvm::getReqNTIDz(const Function &F, unsigned &z) {
+ return (llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Z],
+ z));
+}
+
+bool llvm::getMinCTASm(const Function &F, unsigned &x) {
+ return (llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_MINNCTAPERSM],
+ x));
+}
+
+bool llvm::isKernelFunction(const Function &F) {
+ unsigned x = 0;
+ bool retval = llvm::findOneNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION],
+ x);
+  if (!retval) {
+    // There is no NVVM metadata; check the calling convention.
+    return F.getCallingConv() == llvm::CallingConv::PTX_Kernel;
+  }
+  return (x == 1);
+}
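+
+// Illustrative note, not part of the original source: absent NVVM metadata,
+// a function defined with the ptx_kernel calling convention, e.g.
+//   define ptx_kernel void @foo()
+// is still treated as a kernel by the check above.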
+
+bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) {
+ std::vector<unsigned> Vs;
+ bool retval = llvm::findAllNVVMAnnotation(&F,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN],
+ Vs);
+  if (!retval)
+    return false;
+  // Each entry packs the parameter index in the high 16 bits and the
+  // alignment in the low 16 bits.
+  for (unsigned i = 0, e = Vs.size(); i < e; i++) {
+    unsigned v = Vs[i];
+    if ((v >> 16) == index) {
+      align = v & 0xFFFF;
+      return true;
+    }
+  }
+ return false;
+}
+
+bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
+ if (MDNode *alignNode = I.getMetadata("callalign")) {
+    // Entries are sorted by parameter index, packed as (index << 16) | align.
+    for (unsigned i = 0, n = alignNode->getNumOperands(); i < n; i++) {
+ if (const ConstantInt *CI =
+ dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
+ unsigned v = CI->getZExtValue();
+ if ( (v>>16) == index ) {
+ align = v & 0xFFFF;
+ return true;
+ }
+ if ( (v>>16) > index ) {
+ return false;
+ }
+ }
+ }
+ }
+ return false;
+}
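+
+// Illustrative usage sketch, not part of the original source (the function F
+// and the index/alignment values are assumed for the example):
+//
+//   unsigned align;
+//   if (llvm::getAlign(F, /*index=*/2, align))
+//     ; // an entry encoded as (2 << 16) | 8 yields align == 8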
+
+bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
+ if ((id == Intrinsic::nvvm_barrier0) ||
+ (id == Intrinsic::nvvm_barrier0_popc) ||
+ (id == Intrinsic::nvvm_barrier0_and) ||
+ (id == Intrinsic::nvvm_barrier0_or) ||
+ (id == Intrinsic::cuda_syncthreads))
+ return true;
+ return false;
+}
+
+// Interface for checking all memory space transfer related intrinsics
+bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
+ if (id == Intrinsic::nvvm_ptr_local_to_gen ||
+ id == Intrinsic::nvvm_ptr_shared_to_gen ||
+ id == Intrinsic::nvvm_ptr_global_to_gen ||
+ id == Intrinsic::nvvm_ptr_constant_to_gen ||
+ id == Intrinsic::nvvm_ptr_gen_to_global ||
+ id == Intrinsic::nvvm_ptr_gen_to_shared ||
+ id == Intrinsic::nvvm_ptr_gen_to_local ||
+ id == Intrinsic::nvvm_ptr_gen_to_constant ||
+ id == Intrinsic::nvvm_ptr_gen_to_param) {
+ return true;
+ }
+
+ return false;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// provide an option to ignore GEP indices so that only the base address is
+// found, which can be used for simple alias disambiguation.
+const Value *llvm::skipPointerTransfer(const Value *V,
+ bool ignore_GEP_indices) {
+ V = V->stripPointerCasts();
+ while (true) {
+ if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+ if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+ V = IS->getArgOperand(0)->stripPointerCasts();
+ continue;
+ }
+ } else if (ignore_GEP_indices)
+ if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ V = GEP->getPointerOperand()->stripPointerCasts();
+ continue;
+ }
+ break;
+ }
+ return V;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// - ignore GEP indices so that only the base address is found, and
+// - track PHI nodes,
+// which can be used for simple alias disambiguation.
+const Value *llvm::skipPointerTransfer(const Value *V,
+ std::set<const Value *> &processed) {
+ if (processed.find(V) != processed.end())
+ return NULL;
+ processed.insert(V);
+
+ const Value *V2 = V->stripPointerCasts();
+ if (V2 != V && processed.find(V2) != processed.end())
+ return NULL;
+ processed.insert(V2);
+
+ V = V2;
+
+ while (true) {
+ if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+ if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+ V = IS->getArgOperand(0)->stripPointerCasts();
+ continue;
+ }
+ } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ V = GEP->getPointerOperand()->stripPointerCasts();
+ continue;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (V != V2 && processed.find(V) != processed.end())
+ return NULL;
+ processed.insert(PN);
+ const Value *common = 0;
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ const Value *pv = PN->getIncomingValue(i);
+ const Value *base = skipPointerTransfer(pv, processed);
+ if (base) {
+ if (common == 0)
+ common = base;
+ else if (common != base)
+ return PN;
+ }
+ }
+ if (common == 0)
+ return PN;
+ V = common;
+ }
+ break;
+ }
+ return V;
+}
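+
+// Illustrative usage sketch, not part of the original source:
+//
+//   std::set<const Value *> seen;
+//   const Value *base = skipPointerTransfer(ptr, seen);
+//   // base is the common underlying object, a PHINode where the incoming
+//   // bases diverge, or NULL if a cycle was encountered.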
+
+
+// The following are some useful utilities for debugging.
+
+BasicBlock *llvm::getParentBlock(Value *v) {
+ if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+ return B;
+
+ if (Instruction *I = dyn_cast<Instruction>(v))
+ return I->getParent();
+
+ return 0;
+}
+
+Function *llvm::getParentFunction(Value *v) {
+ if (Function *F = dyn_cast<Function>(v))
+ return F;
+
+ if (Instruction *I = dyn_cast<Instruction>(v))
+ return I->getParent()->getParent();
+
+ if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+ return B->getParent();
+
+ return 0;
+}
+
+// Dump a block by name
+void llvm::dumpBlock(Value *v, char *blockName) {
+ Function *F = getParentFunction(v);
+ if (F == 0)
+ return;
+
+ for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
+ BasicBlock *B = it;
+ if (strcmp(B->getName().data(), blockName) == 0) {
+ B->dump();
+ return;
+ }
+ }
+}
+
+// Find an instruction by name
+Instruction *llvm::getInst(Value *base, char *instName) {
+ Function *F = getParentFunction(base);
+ if (F == 0)
+ return 0;
+
+ for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+ Instruction *I = &*it;
+ if (strcmp(I->getName().data(), instName) == 0) {
+ return I;
+ }
+ }
+
+ return 0;
+}
+
+// Dump an instruction by name
+void llvm::dumpInst(Value *base, char *instName) {
+ Instruction *I = getInst(base, instName);
+ if (I)
+ I->dump();
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v, std::set<Instruction *> *visited) {
+ if (Instruction *I = dyn_cast<Instruction>(v)) {
+
+ if (visited->find(I) != visited->end())
+ return;
+
+ visited->insert(I);
+
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ dumpInstRec(I->getOperand(i), visited);
+
+ I->dump();
+ }
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v) {
+ std::set<Instruction *> visited;
+
+ dumpInstRec(v, &visited);
+}
+
+// Dump the parent for Instruction, block or function
+void llvm::dumpParent(Value *v) {
+ if (Instruction *I = dyn_cast<Instruction>(v)) {
+ I->getParent()->dump();
+ return;
+ }
+
+ if (BasicBlock *B = dyn_cast<BasicBlock>(v)) {
+ B->getParent()->dump();
+ return;
+ }
+
+ if (Function *F = dyn_cast<Function>(v)) {
+ F->getParent()->dump();
+ return;
+ }
+}
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 0000000..fe6ad55
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,94 @@
+//===-- NVPTXUtilities.h - Utilities ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM specific utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXUTILITIES_H
+#define NVPTXUTILITIES_H
+
+#include "llvm/Value.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include <cstdarg>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+
+#define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly"
+#define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly"
+
+bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
+bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
+ std::vector<unsigned> &);
+
+bool isTexture(const llvm::Value &);
+bool isSurface(const llvm::Value &);
+bool isSampler(const llvm::Value &);
+bool isImage(const llvm::Value &);
+bool isImageReadOnly(const llvm::Value &);
+bool isImageWriteOnly(const llvm::Value &);
+
+std::string getTextureName(const llvm::Value &);
+std::string getSurfaceName(const llvm::Value &);
+std::string getSamplerName(const llvm::Value &);
+
+bool getMaxNTIDx(const llvm::Function &, unsigned &);
+bool getMaxNTIDy(const llvm::Function &, unsigned &);
+bool getMaxNTIDz(const llvm::Function &, unsigned &);
+
+bool getReqNTIDx(const llvm::Function &, unsigned &);
+bool getReqNTIDy(const llvm::Function &, unsigned &);
+bool getReqNTIDz(const llvm::Function &, unsigned &);
+
+bool getMinCTASm(const llvm::Function &, unsigned &);
+bool isKernelFunction(const llvm::Function &);
+
+bool getAlign(const llvm::Function &, unsigned index, unsigned &);
+bool getAlign(const llvm::CallInst &, unsigned index, unsigned &);
+
+bool isBarrierIntrinsic(llvm::Intrinsic::ID);
+
+/// make_vector - Helper function which is useful for building temporary vectors
+/// to pass into type construction of CallInst ctors. This turns a
+/// null-terminated list of pointers (or other value types) into a real live
+/// vector.
+///
+template<typename T>
+inline std::vector<T> make_vector(T A, ...) {
+ va_list Args;
+ va_start(Args, A);
+ std::vector<T> Result;
+ Result.push_back(A);
+ while (T Val = va_arg(Args, T))
+ Result.push_back(Val);
+ va_end(Args);
+ return Result;
+}
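+
+// Illustrative usage sketch, not part of the original source; the argument
+// list must end with a null value of type T:
+//
+//   std::vector<Value *> args = make_vector<Value *>(A, B, (Value *)0);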
+
+bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id);
+const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices);
+const Value *skipPointerTransfer(const Value *V,
+ std::set<const Value *> &processed);
+BasicBlock *getParentBlock(Value *v);
+Function *getParentFunction(Value *v);
+void dumpBlock(Value *v, char *blockName);
+Instruction *getInst(Value *base, char *instName);
+void dumpInst(Value *base, char *instName);
+void dumpInstRec(Value *v, std::set<Instruction *> *visited);
+void dumpInstRec(Value *v);
+void dumpParent(Value *v);
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXVector.td b/lib/Target/NVPTX/NVPTXVector.td
new file mode 100644
index 0000000..775df19
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXVector.td
@@ -0,0 +1,1481 @@
+//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Vector Specific
+//-----------------------------------
+
+//
+// All vector instructions derive from NVPTXVecInst
+//
+
+class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+ NVPTXInst sInst=NOP>
+ : NVPTXInst<outs, ins, asmstr, pattern> {
+ NVPTXInst scalarInst=sInst;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
+// Extract v2i16
+def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+ (ins V2I16Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int16Regs:$dst, (vector_extract
+ (v2i16 V2I16Regs:$src), imm:$c))],
+ IMOV16rr>;
+
+// Extract v4i16
+def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+ (ins V4I16Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int16Regs:$dst, (vector_extract
+ (v4i16 V4I16Regs:$src), imm:$c))],
+ IMOV16rr>;
+
+// Extract v2i8
+def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+ (ins V2I8Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int8Regs:$dst, (vector_extract
+ (v2i8 V2I8Regs:$src), imm:$c))],
+ IMOV8rr>;
+
+// Extract v4i8
+def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+ (ins V4I8Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int8Regs:$dst, (vector_extract
+ (v4i8 V4I8Regs:$src), imm:$c))],
+ IMOV8rr>;
+
+// Extract v2i32
+def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+ (ins V2I32Regs:$src, i8imm:$c),
+ "mov.u32 \t$dst, $src${c:vecelem};",
+ [(set Int32Regs:$dst, (vector_extract
+ (v2i32 V2I32Regs:$src), imm:$c))],
+ IMOV32rr>;
+
+// Extract v2f32
+def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+ (ins V2F32Regs:$src, i8imm:$c),
+ "mov.f32 \t$dst, $src${c:vecelem};",
+ [(set Float32Regs:$dst, (vector_extract
+ (v2f32 V2F32Regs:$src), imm:$c))],
+ FMOV32rr>;
+
+// Extract v2i64
+def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
+ (ins V2I64Regs:$src, i8imm:$c),
+ "mov.u64 \t$dst, $src${c:vecelem};",
+ [(set Int64Regs:$dst, (vector_extract
+ (v2i64 V2I64Regs:$src), imm:$c))],
+ IMOV64rr>;
+
+// Extract v2f64
+def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
+ (ins V2F64Regs:$src, i8imm:$c),
+ "mov.f64 \t$dst, $src${c:vecelem};",
+ [(set Float64Regs:$dst, (vector_extract
+ (v2f64 V2F64Regs:$src), imm:$c))],
+ FMOV64rr>;
+
+// Extract v4i32
+def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+ (ins V4I32Regs:$src, i8imm:$c),
+ "mov.u32 \t$dst, $src${c:vecelem};",
+ [(set Int32Regs:$dst, (vector_extract
+ (v4i32 V4I32Regs:$src), imm:$c))],
+ IMOV32rr>;
+
+// Extract v4f32
+def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+ (ins V4F32Regs:$src, i8imm:$c),
+ "mov.f32 \t$dst, $src${c:vecelem};",
+ [(set Float32Regs:$dst, (vector_extract
+ (v4f32 V4F32Regs:$src), imm:$c))],
+ FMOV32rr>;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in {
+// Insert v2i8
+def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
+ (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c),
+ "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V2I8Regs:$dst,
+ (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))],
+ IMOV8rr>;
+
+// Insert v4i8
+def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
+ (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c),
+ "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V4I8Regs:$dst,
+ (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))],
+ IMOV8rr>;
+
+// Insert v2i16
+def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
+ (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c),
+ "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V2I16Regs:$dst,
+ (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
+
+// Insert v4i16
+def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
+ (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c),
+ "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V4I16Regs:$dst,
+ (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
+
+// Insert v2i32
+def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
+ (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c),
+ "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+ [(set V2I32Regs:$dst,
+ (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
+
+// Insert v2f32
+def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
+ (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c),
+ "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+ [(set V2F32Regs:$dst,
+ (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
+
+// Insert v2i64
+def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
+ (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c),
+ "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u64 \t$dst${c:vecelem}, $val;",
+ [(set V2I64Regs:$dst,
+ (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))],
+ IMOV64rr>;
+
+// Insert v2f64
+def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
+ (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c),
+ "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f64 \t$dst${c:vecelem}, $val;",
+ [(set V2F64Regs:$dst,
+ (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))],
+ FMOV64rr>;
+
+// Insert v4i32
+def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
+ (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c),
+ "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+ [(set V4I32Regs:$dst,
+ (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
+
+// Insert v4f32
+def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
+ (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c),
+ "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+ [(set V4F32Regs:$dst,
+ (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
+}
+
+class BinOpAsmString<string c> {
+ string s = c;
+}
+
+class V4AsmStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(!strconcat(
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"),
+ opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"),
+ opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>;
+
+class V2AsmStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>;
+
+class V4MADStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(!strconcat(
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"),
+ opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"),
+ opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>;
+
+class V2MADStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>;
+
+class V4UnaryStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(!strconcat(
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1;\n\t"),
+ opcode), " \t${dst}_2, ${a}_2;\n\t"),
+ opcode), " \t${dst}_3, ${a}_3;")>;
+
+class V2UnaryStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1;")>;
+
+class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b),
+ asmstr.s,
+ [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))],
+ sInst>;
+
+class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1,
+ NVPTXRegClass regclass2, NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b),
+ asmstr.s,
+ [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))],
+ sInst>;
+
+class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a),
+ asmstr.s,
+ [(set regclass:$dst, (OpNode regclass:$a))], sInst>;
+
+multiclass IntBinVOp<string asmstr, SDNode OpNode,
+ NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst
+ i16op=NOP, NVPTXInst i8op=NOP> {
+ def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs,
+ i64op>;
+ def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs,
+ i32op>;
+ def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs,
+ i32op>;
+ def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs,
+ i16op>;
+ def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs,
+ i16op>;
+ def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs,
+ i8op>;
+ def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs,
+ i8op>;
+}
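+
+// Illustrative note, not part of the original source: a single
+//   defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ...>;
+// expands into VAddV2I64, VAddV4I32, VAddV2I32, ..., one VecBinaryOp per
+// vector register class, each carrying the scalar opcode used when the
+// vector instruction is later elementized.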
+
+multiclass FloatBinVOp<string asmstr, SDNode OpNode,
+ NVPTXInst f64=NOP, NVPTXInst f32=NOP,
+ NVPTXInst f32_ftz=NOP> {
+ def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode,
+ V2F64Regs, f64>;
+ def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+ V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+ def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+ V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+ def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+ V4F32Regs, f32>;
+ def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+ V2F32Regs, f32>;
+}
+
+multiclass IntUnaryVOp<string asmstr, PatFrag OpNode,
+ NVPTXInst i64op=NOP, NVPTXInst i32op=NOP,
+ NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> {
+ def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode,
+ V2I64Regs, i64op>;
+ def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+ V4I32Regs, i32op>;
+ def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+ V2I32Regs, i32op>;
+ def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V4I16Regs, i16op>;
+ def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V2I16Regs, i16op>;
+ def V4I8 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V4I8Regs, i8op>;
+ def V2I8 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V2I8Regs, i8op>;
+}
+
+
+// Integer Arithmetic
+let VecInstType=isVecOther.Value in {
+defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>;
+defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>;
+
+def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs,
+ ADDCCi32rr>;
+def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs,
+ ADDCCi32rr>;
+def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs,
+ SUBCCi32rr>;
+def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs,
+ SUBCCi32rr>;
+def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs,
+ ADDCCCi32rr>;
+def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs,
+ ADDCCCi32rr>;
+def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs,
+ SUBCCCi32rr>;
+def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs,
+ SUBCCCi32rr>;
+
+def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs,
+ SHLi64rr>;
+def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs,
+ SHLi32rr>;
+def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs,
+ SHLi32rr>;
+def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs,
+ SHLi16rr>;
+def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs,
+ SHLi16rr>;
+def ShiftLV2I8 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs, V2I32Regs,
+ SHLi8rr>;
+def ShiftLV4I8 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs, V4I32Regs,
+ SHLi8rr>;
+}
+
+// Convert to v*i32; helpers for the shift patterns below.
+class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>;
+
+class VecCVTStrHelper<string op, string dest, string src> {
+ string s=!strconcat(op, !strconcat("\t",
+ !strconcat(dest, !strconcat(", ", !strconcat(src, ";")))));
+}
+
+class Vec2CVTStr<string op> {
+ string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+ !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s));
+}
+
+class Vec4CVTStr<string op> {
+ string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+ !strconcat("\n\t",
+ !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s,
+ !strconcat("\n\t",
+ !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s,
+ !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s))))));
+}
+
+let VecInstType=isVecOther.Value in {
+def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs,
+ Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs,
+ Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>;
+}
+
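+// PTX shift instructions take a 32-bit shift amount, so the patterns below
+// first convert non-i32 shift-amount vectors with the cvt helpers above.
+// (Explanatory comment added; the rationale is assumed from the PTX ISA,
+// not stated in the original source.)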
+def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+let VecInstType=isVecOther.Value in {
+def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs,
+ SRAi64rr>;
+def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs,
+ SRAi32rr>;
+def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs,
+ SRAi32rr>;
+def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs,
+ SRAi16rr>;
+def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs,
+ SRAi16rr>;
+def ShiftRAV2I8 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs, V2I32Regs,
+ SRAi8rr>;
+def ShiftRAV4I8 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs, V4I32Regs,
+ SRAi8rr>;
+
+def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs,
+ SRLi64rr>;
+def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs,
+ SRLi32rr>;
+def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs,
+ SRLi32rr>;
+def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs,
+ SRLi16rr>;
+def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs,
+ SRLi16rr>;
+def ShiftRLV2I8 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs, V2I32Regs,
+ SRLi8rr>;
+def ShiftRLV4I8 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs, V4I32Regs,
+ SRLi8rr>;
+
+defm VMult : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr,
+ MULTi8rr>;
+defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr,
+ MULTHSi16rr,
+ MULTHSi8rr>;
+defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr,
+ MULTHUi16rr,
+ MULTHUi8rr>;
+defm VSDiv : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr,
+ SDIVi8rr>;
+defm VUDiv : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr,
+ UDIVi8rr>;
+defm VSRem : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr,
+ SREMi8rr>;
+defm VURem : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr,
+ UREMi8rr>;
+}
+
+def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+multiclass VMAD<string asmstr, NVPTXRegClass regclassv4,
+ NVPTXRegClass regclassv2,
+ SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V4 : NVPTXVecInst<(outs regclassv4:$dst),
+ (ins regclassv4:$a, regclassv4:$b, regclassv4:$c),
+ V4MADStr<asmstr>.s,
+ [(set regclassv4:$dst,
+ (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))],
+ sop>,
+ Requires<[Pred]>;
+ def V2 : NVPTXVecInst<(outs regclassv2:$dst),
+ (ins regclassv2:$a, regclassv2:$b, regclassv2:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclassv2:$dst,
+ (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))],
+ sop>,
+ Requires<[Pred]>;
+}
+
+multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V2 : NVPTXVecInst<(outs regclass:$dst),
+ (ins regclass:$a, regclass:$b, regclass:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclass:$dst, (add
+ (mul regclass:$a, regclass:$b), regclass:$c))], sop>,
+ Requires<[Pred]>;
+}
+multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V2 : NVPTXVecInst<(outs regclass:$dst),
+ (ins regclass:$a, regclass:$b, regclass:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclass:$dst, (fadd
+ (fmul regclass:$a, regclass:$b), regclass:$c))], sop>,
+ Requires<[Pred]>;
+}
+
+let VecInstType=isVecOther.Value in {
+defm I8MAD : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>;
+defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr,
+ true>;
+defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr,
+ true>;
+defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>;
+
+defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>;
+
+defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>;
+defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>;
+defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>;
+
+defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+ FMAD32_ftzrrr, doFMADF32_ftz>;
+defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+ FMA32_ftzrrr, doFMAF32_ftz>;
+defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr,
+ doFMADF32>;
+defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr,
+ doFMAF32>;
+defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>;
+}
+
+let VecInstType=isVecOther.Value in {
+def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs,
+ FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs,
+ FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs,
+ FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs,
+ FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs,
+ FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs,
+ FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>;
+def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>;
+def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>;
+}
+
+def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>;
+
+let VecInstType=isVecOther.Value in {
+def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs,
+ FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs,
+ FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>;
+def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, V4F32Regs, FNEGf32>;
+def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>;
+
+// Logical Arithmetic
+defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>;
+defm VOr : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>;
+defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>;
+
+defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>;
+}
+
+
+multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)),
+ (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c, V2F32Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c),
+ (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+defm V2FMAF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>;
+defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>;
+defm V2FMAF32ext : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>;
+defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>;
+
+multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)),
+ (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c, V4F32Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c),
+ (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+defm V4FMAF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>;
+defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>;
+defm V4FMAF32ext : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>;
+defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>;
+
+multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)),
+ (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c),
+ (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>;
+
+class VecModStr<string vecsize, string elem, string extra, string l="">
+{
+ string t1 = !strconcat("${c", elem);
+ string t2 = !strconcat(t1, ":vecv");
+ string t3 = !strconcat(t2, vecsize);
+ string t4 = !strconcat(t3, extra);
+ string t5 = !strconcat(t4, l);
+ string s = !strconcat(t5, "}");
+}
+class ShuffleOneLine<string vecsize, string elem, string type>
+{
+ string t1 = VecModStr<vecsize, elem, "comm", "1">.s;
+ string t2 = !strconcat(t1, "mov.");
+ string t3 = !strconcat(t2, type);
+ string t4 = !strconcat(t3, " \t${dst}_");
+ string t5 = !strconcat(t4, elem);
+ string t6 = !strconcat(t5, ", $src1");
+ string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s);
+ string t8 = !strconcat(t7, ";\n\t");
+ string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s);
+ string t10 = !strconcat(t9, "mov.");
+ string t11 = !strconcat(t10, type);
+ string t12 = !strconcat(t11, " \t${dst}_");
+ string t13 = !strconcat(t12, elem);
+ string t14 = !strconcat(t13, ", $src2");
+ string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s);
+ string s = !strconcat(t15, ";");
+}
+class ShuffleAsmStr2<string type>
+{
+ string t1 = ShuffleOneLine<"2", "0", type>.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string s = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s);
+}
+class ShuffleAsmStr4<string type>
+{
+ string t1 = ShuffleOneLine<"4", "0", type>.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s);
+ string t4 = !strconcat(t3, "\n\t");
+ string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s);
+ string t6 = !strconcat(t5, "\n\t");
+ string s = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s);
+}
+
+let neverHasSideEffects=1, VecInstType=isVecShuffle.Value in {
+def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst),
+ (ins V4F32Regs:$src1, V4F32Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"f32">.s),
+ [], FMOV32rr>;
+
+def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst),
+ (ins V4I32Regs:$src1, V4I32Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u32">.s),
+ [], IMOV32rr>;
+
+def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst),
+ (ins V4I16Regs:$src1, V4I16Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u16">.s),
+ [], IMOV16rr>;
+
+def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst),
+ (ins V4I8Regs:$src1, V4I8Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u16">.s),
+ [], IMOV8rr>;
+
+def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst),
+ (ins V2F32Regs:$src1, V2F32Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"f32">.s),
+ [], FMOV32rr>;
+
+def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst),
+ (ins V2I32Regs:$src1, V2I32Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u32">.s),
+ [], IMOV32rr>;
+
+def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst),
+ (ins V2I8Regs:$src1, V2I8Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u16">.s),
+ [], IMOV8rr>;
+
+def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst),
+ (ins V2I16Regs:$src1, V2I16Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u16">.s),
+ [], IMOV16rr>;
+
+def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst),
+ (ins V2F64Regs:$src1, V2F64Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"f64">.s),
+ [], FMOV64rr>;
+
+def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst),
+ (ins V2I64Regs:$src1, V2I64Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u64">.s),
+ [], IMOV64rr>;
+}
+
+def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(0), MVT::i32);
+}]>;
+def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(1), MVT::i32);
+}]>;
+def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(2), MVT::i32);
+}]>;
+def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(3), MVT::i32);
+}]>;
+
+// The spurious call is here to silence a compiler warning about N being
+// unused.
+def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs),
+ [{ N->getGluedNode(); return true; }]>;
+
+def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)),
+ (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)),
+ (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)),
+ (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)),
+ (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)),
+ (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)),
+ (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)),
+ (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)),
+ (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)),
+ (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)),
+ (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+ NVPTXInst si>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins sclass:$a1, sclass:$a2),
+ !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"),
+ [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))],
+ si>;
+class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+ NVPTXInst si>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4),
+ !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"),
+ [(set vclass:$dst,
+ (build_vector sclass:$a1, sclass:$a2,
+ sclass:$a3, sclass:$a4))], si>;
+
+let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in {
+def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs,
+ FMOV32rr>;
+def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs,
+ FMOV64rr>;
+
+def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs,
+ IMOV32rr>;
+def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs,
+ IMOV64rr>;
+def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs,
+ IMOV16rr>;
+def Build_Vector2_i8 : Build_Vector2<"mov.v2.u16", V2I8Regs, Int8Regs,
+ IMOV8rr>;
+
+def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs,
+ FMOV32rr>;
+
+def Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs,
+ IMOV32rr>;
+def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs,
+ IMOV16rr>;
+def Build_Vector4_i8 : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs,
+ IMOV8rr>;
+}
+
+class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src),
+ !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"),
+ [], sop>;
+
+let isAsCheapAsAMove=1, neverHasSideEffects=1, IsSimpleMove=1,
+ VecInstType=isVecOther.Value in {
+def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>;
+def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>;
+
+def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>;
+def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>;
+
+def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>;
+def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>;
+
+def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>;
+def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>;
+
+def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>;
+def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>;
+}
+
+// extract subvector patterns
+def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
+ SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>;
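+// In the profile above, constraint index 2 refers to the second operand
+// (results are numbered first), i.e. the start index of the subvector, which
+// is required to be pointer-typed. The patterns below only match the constant
+// indices 0 and 2 -- the two halves of a 4-element vector.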
+
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)),
+ (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0),
+ (V4f32Extract V4F32Regs:$src, 1))>;
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)),
+ (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2),
+ (V4f32Extract V4F32Regs:$src, 3))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)),
+ (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0),
+ (V4i32Extract V4I32Regs:$src, 1))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)),
+ (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2),
+ (V4i32Extract V4I32Regs:$src, 3))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)),
+ (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0),
+ (V4i16Extract V4I16Regs:$src, 1))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)),
+ (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2),
+ (V4i16Extract V4I16Regs:$src, 3))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)),
+ (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0),
+ (V4i8Extract V4I8Regs:$src, 1))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)),
+ (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2),
+ (V4i8Extract V4I8Regs:$src, 3))>;
+
+// Select instructions
+class Select_OneLine<string type, string pos> {
+ string t1 = !strconcat("selp.", type);
+ string t2 = !strconcat(t1, " \t${dst}_");
+ string t3 = !strconcat(t2, pos);
+ string t4 = !strconcat(t3, ", ${src1}_");
+ string t5 = !strconcat(t4, pos);
+ string t6 = !strconcat(t5, ", ${src2}_");
+ string t7 = !strconcat(t6, pos);
+ string s = !strconcat(t7, ", $p;");
+}
+
+class Select_Str2<string type> {
+ string t1 = Select_OneLine<type, "0">.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string s = !strconcat(t2, Select_OneLine<type, "1">.s);
+}
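+
+// For illustration, Select_Str2<"b32">.s expands to the two-line string
+// "selp.b32 \t${dst}_0, ${src1}_0, ${src2}_0, $p;\n\t"
+// "selp.b32 \t${dst}_1, ${src1}_1, ${src2}_1, $p;"
+// i.e. one scalar selp per vector element, all guarded by the same predicate.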
+
+class Select_Str4<string type> {
+ string t1 = Select_OneLine<type, "0">.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string t3 = !strconcat(t2, Select_OneLine<type, "1">.s);
+ string t4 = !strconcat(t3, "\n\t");
+ string t5 = !strconcat(t4, Select_OneLine<type, "2">.s);
+ string t6 = !strconcat(t5, "\n\t");
+ string s = !strconcat(t6, Select_OneLine<type, "3">.s);
+}
+
+class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins vclass:$src1, vclass:$src2, Int1Regs:$p),
+ asmstr,
+ [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1,
+ vclass:$src2))],
+ sop>;
+
+let VecInstType=isVecOther.Value in {
+def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>;
+def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>;
+def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>;
+def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>;
+def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>;
+def V4I8_Select : Vec_Select<V4I8Regs, Select_Str4<"b16">.s, SELECTi8rr>;
+def V2I8_Select : Vec_Select<V2I8Regs, Select_Str2<"b16">.s, SELECTi8rr>;
+
+def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>;
+def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>;
+def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>;
+}
+
+// Comparison instructions
+
+// setcc convenience fragments.
+def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOEQ)>;
+def vsetogt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOGT)>;
+def vsetoge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOGE)>;
+def vsetolt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOLT)>;
+def vsetole : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOLE)>;
+def vsetone : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETONE)>;
+def vseto : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETO)>;
+def vsetuo : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUO)>;
+def vsetueq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUEQ)>;
+def vsetugt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUGT)>;
+def vsetuge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUGE)>;
+def vsetult : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETULT)>;
+def vsetule : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETULE)>;
+def vsetune : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUNE)>;
+def vseteq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETEQ)>;
+def vsetgt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETGT)>;
+def vsetge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETGE)>;
+def vsetlt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETLT)>;
+def vsetle : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETLE)>;
+def vsetne : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETNE)>;
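+
+// Each fragment above pins the condition code of a generic setcc, so the
+// comparison patterns below can select on, e.g., ordered greater-than
+// (vsetogt) versus unsigned/unordered greater-than (vsetugt) directly.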
+
+class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass,
+ NVPTXInst sop>
+ : NVPTXVecInst<(outs outrclass:$dst),
+ (ins inrclass:$a, inrclass:$b),
+ "Unsupported",
+ [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))],
+ sop>;
+
+multiclass Vec_Compare_All<PatFrag op,
+ NVPTXInst inst8,
+ NVPTXInst inst16,
+ NVPTXInst inst32,
+ NVPTXInst inst64>
+{
+ def V2I8 : Vec_Compare<op, V2I8Regs, V2I8Regs, inst8>;
+ def V4I8 : Vec_Compare<op, V4I8Regs, V4I8Regs, inst8>;
+ def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>;
+ def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>;
+ def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>;
+ def V4I32 : Vec_Compare<op, V4I32Regs, V4I32Regs, inst32>;
+ def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>;
+}
+
+let VecInstType=isVecOther.Value in {
+ defm VecSGT : Vec_Compare_All<vsetgt, ISetSGTi8rr_toi8, ISetSGTi16rr_toi16,
+ ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>;
+ defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16,
+ ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>;
+ defm VecSLT : Vec_Compare_All<vsetlt, ISetSLTi8rr_toi8, ISetSLTi16rr_toi16,
+ ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>;
+ defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16,
+ ISetULTi32rr_toi32, ISetULTi64rr_toi64>;
+ defm VecSGE : Vec_Compare_All<vsetge, ISetSGEi8rr_toi8, ISetSGEi16rr_toi16,
+ ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>;
+ defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16,
+ ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>;
+ defm VecSLE : Vec_Compare_All<vsetle, ISetSLEi8rr_toi8, ISetSLEi16rr_toi16,
+ ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>;
+ defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16,
+ ISetULEi32rr_toi32, ISetULEi64rr_toi64>;
+ defm VecSEQ : Vec_Compare_All<vseteq, ISetSEQi8rr_toi8, ISetSEQi16rr_toi16,
+ ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>;
+ defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16,
+ ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>;
+ defm VecSNE : Vec_Compare_All<vsetne, ISetSNEi8rr_toi8, ISetSNEi16rr_toi16,
+ ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>;
+ defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16,
+ ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>;
+}
+
+multiclass FVec_Compare_All<PatFrag op,
+ NVPTXInst instf32,
+ NVPTXInst instf64>
+{
+ def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>;
+ def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>;
+ def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>;
+}
+
+let VecInstType=isVecOther.Value in {
+ defm FVecGT : FVec_Compare_All<vsetogt, FSetGTf32rr_toi32,
+ FSetGTf64rr_toi64>;
+ defm FVecLT : FVec_Compare_All<vsetolt, FSetLTf32rr_toi32,
+ FSetLTf64rr_toi64>;
+ defm FVecGE : FVec_Compare_All<vsetoge, FSetGEf32rr_toi32,
+ FSetGEf64rr_toi64>;
+ defm FVecLE : FVec_Compare_All<vsetole, FSetLEf32rr_toi32,
+ FSetLEf64rr_toi64>;
+ defm FVecEQ : FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32,
+ FSetEQf64rr_toi64>;
+ defm FVecNE : FVec_Compare_All<vsetone, FSetNEf32rr_toi32,
+ FSetNEf64rr_toi64>;
+
+ defm FVecUGT : FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32,
+ FSetUGTf64rr_toi64>;
+ defm FVecULT : FVec_Compare_All<vsetult, FSetULTf32rr_toi32,
+ FSetULTf64rr_toi64>;
+ defm FVecUGE : FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32,
+ FSetUGEf64rr_toi64>;
+ defm FVecULE : FVec_Compare_All<vsetule, FSetULEf32rr_toi32,
+ FSetULEf64rr_toi64>;
+ defm FVecUEQ : FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32,
+ FSetUEQf64rr_toi64>;
+ defm FVecUNE : FVec_Compare_All<vsetune, FSetUNEf32rr_toi32,
+ FSetUNEf64rr_toi64>;
+
+ defm FVecNUM : FVec_Compare_All<vseto, FSetNUMf32rr_toi32,
+ FSetNUMf64rr_toi64>;
+ defm FVecNAN : FVec_Compare_All<vsetuo, FSetNANf32rr_toi32,
+ FSetNANf64rr_toi64>;
+}
+
+class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4),
+ (ins i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>;
+
+class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$d1, regclass:$d2),
+ (ins i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t{{$d1, $d2}}, [retval0+$b];"), []>;
+
+
+class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+ i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], {{$s1, $s2}};"), []>;
+
+class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+ i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval+$a], {{$s1, $s2}};"), []>;
+
+def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">;
+def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">;
+def LoadParamScalar4I8 : LoadParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def LoadParamScalar2I64 : LoadParamScalar2Inst<Int64Regs, ".v2.b64">;
+def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">;
+def LoadParamScalar2I16 : LoadParamScalar2Inst<Int16Regs, ".v2.b16">;
+def LoadParamScalar2I8 : LoadParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">;
+def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">;
+def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreParamScalar4I8 : StoreParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreParamScalar2I8 : StoreParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreRetvalScalar4I8 : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreRetvalScalar2I8 : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">;
+
+class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>:
+ NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b),
+ "loadparam : $dst <- [$a, $b]",
+ [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))],
+ sop>;
+
+class StoreParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ "storeparam : [$a, $b] <- $val",
+ [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>;
+
+class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr,
+ NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a),
+ "storeretval : retval[$a] <- $val",
+ [(StoreRetval (i32 imm:$a), regclass:$val)], sop>;
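+
+// The sop argument on these NVPTXVecInst pseudos appears to record the
+// scalar instruction that the VectorElementize pass (VectorElementize.cpp,
+// below) substitutes when it splits a vector op into per-element copies.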
+
+let VecInstType=isVecLD.Value in {
+def LoadParamV4I32 : LoadParamVecInst<V4I32Regs, ".v4.b32",
+ LoadParamScalar4I32>;
+def LoadParamV4I16 : LoadParamVecInst<V4I16Regs, ".v4.b16",
+ LoadParamScalar4I16>;
+def LoadParamV4I8 : LoadParamVecInst<V4I8Regs, ".v4.b8",
+ LoadParamScalar4I8>;
+
+def LoadParamV2I64 : LoadParamVecInst<V2I64Regs, ".v2.b64",
+ LoadParamScalar2I64>;
+def LoadParamV2I32 : LoadParamVecInst<V2I32Regs, ".v2.b32",
+ LoadParamScalar2I32>;
+def LoadParamV2I16 : LoadParamVecInst<V2I16Regs, ".v2.b16",
+ LoadParamScalar2I16>;
+def LoadParamV2I8 : LoadParamVecInst<V2I8Regs, ".v2.b8",
+ LoadParamScalar2I8>;
+
+def LoadParamV4F32 : LoadParamVecInst<V4F32Regs, ".v4.f32",
+ LoadParamScalar4F32>;
+def LoadParamV2F32 : LoadParamVecInst<V2F32Regs, ".v2.f32",
+ LoadParamScalar2F32>;
+def LoadParamV2F64 : LoadParamVecInst<V2F64Regs, ".v2.f64",
+ LoadParamScalar2F64>;
+}
+
+let VecInstType=isVecST.Value in {
+def StoreParamV4I32 : StoreParamVecInst<V4I32Regs, ".v4.b32",
+ StoreParamScalar4I32>;
+def StoreParamV4I16 : StoreParamVecInst<V4I16Regs, ".v4.b16",
+ StoreParamScalar4I16>;
+def StoreParamV4I8 : StoreParamVecInst<V4I8Regs, ".v4.b8",
+ StoreParamScalar4I8>;
+
+def StoreParamV2I64 : StoreParamVecInst<V2I64Regs, ".v2.b64",
+ StoreParamScalar2I64>;
+def StoreParamV2I32 : StoreParamVecInst<V2I32Regs, ".v2.b32",
+ StoreParamScalar2I32>;
+def StoreParamV2I16 : StoreParamVecInst<V2I16Regs, ".v2.b16",
+ StoreParamScalar2I16>;
+def StoreParamV2I8 : StoreParamVecInst<V2I8Regs, ".v2.b8",
+ StoreParamScalar2I8>;
+
+def StoreParamV4F32 : StoreParamVecInst<V4F32Regs, ".v4.f32",
+ StoreParamScalar4F32>;
+def StoreParamV2F32 : StoreParamVecInst<V2F32Regs, ".v2.f32",
+ StoreParamScalar2F32>;
+def StoreParamV2F64 : StoreParamVecInst<V2F64Regs, ".v2.f64",
+ StoreParamScalar2F64>;
+
+def StoreRetvalV4I32 : StoreRetvalVecInst<V4I32Regs, ".v4.b32",
+ StoreRetvalScalar4I32>;
+def StoreRetvalV4I16 : StoreRetvalVecInst<V4I16Regs, ".v4.b16",
+ StoreRetvalScalar4I16>;
+def StoreRetvalV4I8 : StoreRetvalVecInst<V4I8Regs, ".v4.b8",
+ StoreRetvalScalar4I8>;
+
+def StoreRetvalV2I64 : StoreRetvalVecInst<V2I64Regs, ".v2.b64",
+ StoreRetvalScalar2I64>;
+def StoreRetvalV2I32 : StoreRetvalVecInst<V2I32Regs, ".v2.b32",
+ StoreRetvalScalar2I32>;
+def StoreRetvalV2I16 : StoreRetvalVecInst<V2I16Regs, ".v2.b16",
+ StoreRetvalScalar2I16>;
+def StoreRetvalV2I8 : StoreRetvalVecInst<V2I8Regs, ".v2.b8",
+ StoreRetvalScalar2I8>;
+
+def StoreRetvalV4F32 : StoreRetvalVecInst<V4F32Regs, ".v4.f32",
+ StoreRetvalScalar4F32>;
+def StoreRetvalV2F32 : StoreRetvalVecInst<V2F32Regs, ".v2.f32",
+ StoreRetvalScalar2F32>;
+def StoreRetvalV2F64 : StoreRetvalVecInst<V2F64Regs, ".v2.f64",
+ StoreRetvalScalar2F64>;
+
+}
+
+// Int vector to int scalar bit convert
+// v4i8 -> i32
+def : Pat<(i32 (bitconvert V4I8Regs:$s)),
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>;
+// v4i16 -> i64
+def : Pat<(i64 (bitconvert V4I16Regs:$s)),
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+ (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2),
+ (V4i16Extract V4I16Regs:$s,3))>;
+// v2i8 -> i16
+def : Pat<(i16 (bitconvert V2I8Regs:$s)),
+ (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>;
+// v2i16 -> i32
+def : Pat<(i32 (bitconvert V2I16Regs:$s)),
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0),
+ (V2i16Extract V2I16Regs:$s,1))>;
+// v2i32 -> i64
+def : Pat<(i64 (bitconvert V2I32Regs:$s)),
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0),
+ (V2i32Extract V2I32Regs:$s,1))>;
+
+// Int scalar to int vector bit convert
+let VecInstType=isVecDest.Value in {
+// i32 -> v4i8
+def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s),
+ "Error!",
+ [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))],
+ I32toV4I8>;
+// i64 -> v4i16
+def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s),
+ "Error!",
+ [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))],
+ I64toV4I16>;
+// i16 -> v2i8
+def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s),
+ "Error!",
+ [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))],
+ I16toV2I8>;
+// i32 -> v2i16
+def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s),
+ "Error!",
+ [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))],
+ I32toV2I16>;
+// i64 -> v2i32
+def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s),
+ "Error!",
+ [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))],
+ I64toV2I32>;
+}
+
+// Int vector to int vector bit convert
+// v4i8 -> v2i16
+def : Pat<(v2i16 (bitconvert V4I8Regs:$s)),
+ (VecI32toV2I16
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> v2i32
+def : Pat<(v2i32 (bitconvert V4I16Regs:$s)),
+ (VecI64toV2I32
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> v4i8
+def : Pat<(v4i8 (bitconvert V2I16Regs:$s)),
+ (VecI32toV4I8
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> v4i16
+def : Pat<(v4i16 (bitconvert V2I32Regs:$s)),
+ (VecI64toV4I16
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+// v2i64 -> v4i32
+def : Pat<(v4i32 (bitconvert V2I64Regs:$s)),
+ (Build_Vector4_i32
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>;
+// v4i32 -> v2i64
+def : Pat<(v2i64 (bitconvert V4I32Regs:$s)),
+ (Build_Vector2_i64
+ (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)),
+ (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>;
+
+// Fp scalar to fp vector convert
+// f64 -> v2f32
+let VecInstType=isVecDest.Value in {
+def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s),
+ "Error!",
+ [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))],
+ F64toV2F32>;
+}
+
+// Fp vector to fp scalar convert
+// v2f32 -> f64
+def : Pat<(f64 (bitconvert V2F32Regs:$s)),
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>;
+
+// Fp scalar to int vector convert
+// f32 -> v4i8
+def : Pat<(v4i8 (bitconvert Float32Regs:$s)),
+ (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f32 -> v2i16
+def : Pat<(v2i16 (bitconvert Float32Regs:$s)),
+ (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f64 -> v4i16
+def : Pat<(v4i16 (bitconvert Float64Regs:$s)),
+ (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>;
+// f64 -> v2i32
+def : Pat<(v2i32 (bitconvert Float64Regs:$s)),
+ (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>;
+
+// Int vector to fp scalar convert
+// v4i8 -> f32
+def : Pat<(f32 (bitconvert V4I8Regs:$s)),
+ (BITCONVERT_32_I2F
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> f64
+def : Pat<(f64 (bitconvert V4I16Regs:$s)),
+ (BITCONVERT_64_I2F
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> f32
+def : Pat<(f32 (bitconvert V2I16Regs:$s)),
+ (BITCONVERT_32_I2F
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> f64
+def : Pat<(f64 (bitconvert V2I32Regs:$s)),
+ (BITCONVERT_64_I2F
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+
+// Int scalar to fp vector convert
+// i64 -> v2f32
+def : Pat<(v2f32 (bitconvert Int64Regs:$s)),
+ (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>;
+
+// Fp vector to int scalar convert
+// v2f32 -> i64
+def : Pat<(i64 (bitconvert V2F32Regs:$s)),
+ (BITCONVERT_64_F2I
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>;
+
+// Int vector to fp vector convert
+// v2i64 -> v4f32
+def : Pat<(v4f32 (bitconvert V2I64Regs:$s)),
+ (Build_Vector4_f32
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 0)), 0)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 0)), 1)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 1)), 0)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 1)), 1)))>;
+// v2i64 -> v2f64
+def : Pat<(v2f64 (bitconvert V2I64Regs:$s)),
+ (Build_Vector2_f64
+ (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)),
+ (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>;
+// v2i32 -> v2f32
+def : Pat<(v2f32 (bitconvert V2I32Regs:$s)),
+ (Build_Vector2_f32
+ (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)),
+ (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>;
+// v4i32 -> v2f64
+def : Pat<(v2f64 (bitconvert V4I32Regs:$s)),
+ (Build_Vector2_f64
+ (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0),
+ (V4i32Extract V4I32Regs:$s,1))),
+ (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2),
+ (V4i32Extract V4I32Regs:$s,3))))>;
+// v4i32 -> v4f32
+def : Pat<(v4f32 (bitconvert V4I32Regs:$s)),
+ (Build_Vector4_f32
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>;
+// v4i16 -> v2f32
+def : Pat<(v2f32 (bitconvert V4I16Regs:$s)),
+ (VecF64toV2F32 (BITCONVERT_64_I2F
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+ (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2),
+ (V4i16Extract V4I16Regs:$s,3))))>;
+
+// Fp vector to int vector convert
+// v2i64 <- v4f32
+def : Pat<(v2i64 (bitconvert V4F32Regs:$s)),
+ (Build_Vector2_i64
+ (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0),
+ (V4f32Extract V4F32Regs:$s,1))),
+ (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2),
+ (V4f32Extract V4F32Regs:$s,3))))>;
+// v2i64 <- v2f64
+def : Pat<(v2i64 (bitconvert V2F64Regs:$s)),
+ (Build_Vector2_i64
+ (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)),
+ (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,1)))>;
+// v2i32 <- v2f32
+def : Pat<(v2i32 (bitconvert V2F32Regs:$s)),
+ (Build_Vector2_i32
+ (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)),
+ (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>;
+// v4i32 <- v2f64
+def : Pat<(v4i32 (bitconvert V2F64Regs:$s)),
+ (Build_Vector4_i32
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 0)), 0)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 0)), 1)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 1)), 0)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 1)), 1)))>;
+// v4i32 <- v4f32
+def : Pat<(v4i32 (bitconvert V4F32Regs:$s)),
+ (Build_Vector4_i32
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>;
+// v4i16 <- v2f32
+def : Pat<(v4i16 (bitconvert V2F32Regs:$s)),
+ (VecI64toV4I16 (BITCONVERT_64_F2I
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0),
+ (V2f32Extract V2F32Regs:$s,1))))>;
diff --git a/lib/Target/NVPTX/NVPTXutil.cpp b/lib/Target/NVPTX/NVPTXutil.cpp
new file mode 100644
index 0000000..6a0e532
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXutil.cpp
@@ -0,0 +1,92 @@
+//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXutil.h"
+#include "NVPTX.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+bool isParamLoad(const MachineInstr *MI)
+{
+ if ((MI->getOpcode() != NVPTX::LD_i32_avar) &&
+ (MI->getOpcode() != NVPTX::LD_i64_avar))
+ return false;
+ if (!MI->getOperand(2).isImm())
+ return false;
+ if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM)
+ return false;
+ return true;
+}
+
+#define DATA_MASK 0x7f
+#define DIGIT_WIDTH 7
+#define MORE_BYTES 0x80
+
+static int encode_leb128(uint64_t val, int *nbytes,
+ char *space, int splen)
+{
+ char *a;
+ char *end = space + splen;
+
+ a = space;
+ do {
+ unsigned char uc;
+
+ if (a >= end)
+ return 1;
+ uc = val & DATA_MASK;
+ val >>= DIGIT_WIDTH;
+ if (val != 0)
+ uc |= MORE_BYTES;
+ *a = uc;
+ a++;
+ } while (val);
+ *nbytes = a - space;
+ return 0;
+}
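+
+// Worked example for the routine above (standard ULEB128): val = 144 (0x90)
+// is emitted as the two bytes 0x90 0x01 -- first 144 & 0x7f = 0x10 with the
+// MORE_BYTES bit set, then the remaining val of 1 with the high bit clear.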
+
+#undef DATA_MASK
+#undef DIGIT_WIDTH
+#undef MORE_BYTES
+
+uint64_t encode_leb128(const char *str)
+{
+ union { uint64_t x; char a[8]; } temp64;
+
+ temp64.x = 0;
+
+ unsigned e = strlen(str);
+ assert(e <= 8 && "Register name must fit in the 8-byte union");
+ for (unsigned i=0; i!=e; ++i)
+ temp64.a[i] = str[e-1-i];
+
+ char encoded[16];
+ int nbytes;
+
+ int retval = encode_leb128(temp64.x, &nbytes, encoded, 16);
+
+ (void)retval;
+ assert(retval == 0 &&
+ "Encoding to leb128 failed");
+
+ assert(nbytes <= 8 &&
+ "Cannot support register names with leb128 encoding > 8 bytes");
+
+ temp64.x = 0;
+ for (int i=0; i<nbytes; ++i)
+ temp64.a[i] = encoded[i];
+
+ return temp64.x;
+}
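+
+// A rough usage sketch for the routine above ("r0" is a hypothetical register
+// name, not taken from this file): encode_leb128("r0") packs the reversed
+// characters into the low bytes of a uint64_t ('0' in a[0], 'r' in a[1]),
+// LEB128-encodes that value, and returns the encoded bytes repacked into a
+// uint64_t.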
+
+} // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXutil.h b/lib/Target/NVPTX/NVPTXutil.h
new file mode 100644
index 0000000..d1d1171
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXutil.h
@@ -0,0 +1,25 @@
+//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_UTIL_H
+#define LLVM_TARGET_NVPTX_UTIL_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+bool isParamLoad(const MachineInstr *);
+uint64_t encode_leb128(const char *str);
+}
+
+#endif
diff --git a/lib/Target/NVPTX/TargetInfo/CMakeLists.txt b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..0bf1334
--- /dev/null
+++ b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+#include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMNVPTXInfo
+ NVPTXTargetInfo.cpp
+ )
+
+add_dependencies(LLVMNVPTXInfo NVPTXCommonTableGen)
diff --git a/lib/Target/PTX/TargetInfo/LLVMBuild.txt b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt
index 2cc30c4..ef12b0e 100644
--- a/lib/Target/PTX/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/PTX/TargetInfo/LLVMBuild.txt ----------------*- Conf -*--===;
+;===- ./lib/Target/NVPTX/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -17,7 +17,7 @@
[component_0]
type = Library
-name = PTXInfo
-parent = PTX
+name = NVPTXInfo
+parent = NVPTX
required_libraries = MC Support Target
-add_to_library_groups = PTX
+add_to_library_groups = NVPTX
diff --git a/lib/Target/PTX/TargetInfo/Makefile b/lib/Target/NVPTX/TargetInfo/Makefile
index 8619785..8622315 100644
--- a/lib/Target/PTX/TargetInfo/Makefile
+++ b/lib/Target/NVPTX/TargetInfo/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/PTX/TargetInfo/Makefile ------------------*- Makefile -*-===##
+##===- lib/Target/NVPTX/TargetInfo/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
@@ -7,7 +7,7 @@
#
##===----------------------------------------------------------------------===##
LEVEL = ../../../..
-LIBRARYNAME = LLVMPTXInfo
+LIBRARYNAME = LLVMNVPTXInfo
# Hack: we need to include 'main' target directory to grab private headers
CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
diff --git a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
new file mode 100644
index 0000000..f3624b9
--- /dev/null
+++ b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheNVPTXTarget32;
+Target llvm::TheNVPTXTarget64;
+
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+ RegisterTarget<Triple::nvptx> X(TheNVPTXTarget32, "nvptx",
+ "NVIDIA PTX 32-bit");
+ RegisterTarget<Triple::nvptx64> Y(TheNVPTXTarget64, "nvptx64",
+ "NVIDIA PTX 64-bit");
+}
diff --git a/lib/Target/NVPTX/VectorElementize.cpp b/lib/Target/NVPTX/VectorElementize.cpp
new file mode 100644
index 0000000..8043e2d
--- /dev/null
+++ b/lib/Target/NVPTX/VectorElementize.cpp
@@ -0,0 +1,1248 @@
+//===-- VectorElementize.cpp - Convert vector ops to scalar ops -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts operations on vector types to operations on their
+// element types.
+//
+// For generic binary and unary vector instructions, the conversion is simple.
+// Suppose we have
+// av = bv Vop cv
+// where av, bv, and cv are vector virtual registers, and Vop is a vector op.
+// This gets converted to the following:
+// a1 = b1 Sop c1
+// a2 = b2 Sop c2
+//
+// VectorToScalarMap maintains the vector vreg to scalar vreg mapping.
+// For the above example, the map will look as follows:
+// av => [a1, a2]
+// bv => [b1, b2]
+//
+// In addition, initVectorInfo creates the following opcode->opcode map.
+// Vop => Sop
+// OtherVop => OtherSop
+// ...
+//
+// For vector-specific instructions like vecbuild, vecshuffle, etc., the
+// conversion is different. See the comments near the functions with the
+// createVec<...> prefix.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Constant.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class LLVM_LIBRARY_VISIBILITY VectorElementize : public MachineFunctionPass {
+ virtual bool runOnMachineFunction(MachineFunction &F);
+
+ NVPTXTargetMachine &TM;
+ MachineRegisterInfo *MRI;
+ const NVPTXRegisterInfo *RegInfo;
+ const NVPTXInstrInfo *InstrInfo;
+
+ llvm::DenseMap<const TargetRegisterClass *, const TargetRegisterClass *>
+ RegClassMap;
+ llvm::DenseMap<unsigned, bool> SimpleMoveMap;
+
+ llvm::DenseMap<unsigned, SmallVector<unsigned, 4> > VectorToScalarMap;
+
+ bool isVectorInstr(MachineInstr *);
+
+ SmallVector<unsigned, 4> getScalarRegisters(unsigned);
+ unsigned getScalarVersion(unsigned);
+ unsigned getScalarVersion(MachineInstr *);
+
+ bool isVectorRegister(unsigned);
+ const TargetRegisterClass *getScalarRegClass(const TargetRegisterClass *RC);
+ unsigned numCopiesNeeded(MachineInstr *);
+
+ void createLoadCopy(MachineFunction&, MachineInstr *,
+ std::vector<MachineInstr *>&);
+ void createStoreCopy(MachineFunction&, MachineInstr *,
+ std::vector<MachineInstr *>&);
+
+ void createVecDest(MachineFunction&, MachineInstr *,
+ std::vector<MachineInstr *>&);
+
+ void createCopies(MachineFunction&, MachineInstr *,
+ std::vector<MachineInstr *>&);
+
+ unsigned copyProp(MachineFunction&);
+ unsigned removeDeadMoves(MachineFunction&);
+
+ void elementize(MachineFunction&);
+
+ bool isSimpleMove(MachineInstr *);
+
+ void createVecShuffle(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies);
+
+ void createVecExtract(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies);
+
+ void createVecInsert(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies);
+
+ void createVecBuild(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies);
+
+public:
+
+ static char ID; // Pass identification, replacement for typeid
+ VectorElementize(NVPTXTargetMachine &tm)
+ : MachineFunctionPass(ID), TM(tm) {}
+
+ virtual const char *getPassName() const {
+ return "Convert LLVM vector types to their element types";
+ }
+};
+
+char VectorElementize::ID = 1;
+}
+
+static cl::opt<bool>
+RemoveRedundantMoves("nvptx-remove-redundant-moves",
+ cl::desc("NVPTX: Remove redundant moves introduced by vector lowering"),
+ cl::init(true));
+
+#define VECINST(x) ((((x)->getDesc().TSFlags) & NVPTX::VecInstTypeMask) \
+ >> NVPTX::VecInstTypeShift)
+#define ISVECINST(x) (VECINST(x) != NVPTX::VecNOP)
+#define ISVECLOAD(x) (VECINST(x) == NVPTX::VecLoad)
+#define ISVECSTORE(x) (VECINST(x) == NVPTX::VecStore)
+#define ISVECBUILD(x) (VECINST(x) == NVPTX::VecBuild)
+#define ISVECSHUFFLE(x) (VECINST(x) == NVPTX::VecShuffle)
+#define ISVECEXTRACT(x) (VECINST(x) == NVPTX::VecExtract)
+#define ISVECINSERT(x) (VECINST(x) == NVPTX::VecInsert)
+#define ISVECDEST(x) (VECINST(x) == NVPTX::VecDest)
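+
+// These macros decode the VecInstType field that the .td definitions store in
+// TSFlags; e.g. ISVECLOAD(MI) holds exactly when the instruction was defined
+// with VecInstType=isVecLD.Value.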
+
+bool VectorElementize::isSimpleMove(MachineInstr *mi) {
+ if (mi->isCopy())
+ return true;
+ unsigned TSFlags = (mi->getDesc().TSFlags & NVPTX::SimpleMoveMask)
+ >> NVPTX::SimpleMoveShift;
+ return (TSFlags == 1);
+}
+
+bool VectorElementize::isVectorInstr(MachineInstr *mi) {
+ if ((mi->getOpcode() == NVPTX::PHI) ||
+ (mi->getOpcode() == NVPTX::IMPLICIT_DEF) || mi->isCopy()) {
+ MachineOperand dest = mi->getOperand(0);
+ return isVectorRegister(dest.getReg());
+ }
+ return ISVECINST(mi);
+}
+
+unsigned VectorElementize::getScalarVersion(MachineInstr *mi) {
+ return getScalarVersion(mi->getOpcode());
+}
+
+///=============================================================================
+///Instr is assumed to be a vector instruction. For most vector instructions,
+///the size of the destination vector register gives the number of scalar copies
+///needed. For VecStore, the size of getOperand(0) (the stored value) gives the
+///number of scalar copies needed. For VecExtract, the dest is a scalar, so the
+///size of getOperand(1) (the source vector) gives the number of scalar copies
+///needed.
+///=============================================================================
+unsigned VectorElementize::numCopiesNeeded(MachineInstr *Instr) {
+ unsigned numDefs=0;
+ unsigned def;
+ for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+ MachineOperand oper = Instr->getOperand(i);
+
+ if (!oper.isReg()) continue;
+ if (!oper.isDef()) continue;
+ def = i;
+ numDefs++;
+ }
+ assert((numDefs <= 1) && "Only 0 or 1 defs supported");
+
+ if (numDefs == 1) {
+ unsigned regnum = Instr->getOperand(def).getReg();
+ if (ISVECEXTRACT(Instr))
+ regnum = Instr->getOperand(1).getReg();
+ return getNVPTXVectorSize(MRI->getRegClass(regnum));
+ }
+ else if (numDefs == 0) {
+ assert(ISVECSTORE(Instr)
+ && "Only 0 def instruction supported is vector store");
+
+ unsigned regnum = Instr->getOperand(0).getReg();
+ return getNVPTXVectorSize(MRI->getRegClass(regnum));
+ }
+ return 1;
+}
+
+const TargetRegisterClass *VectorElementize::
+getScalarRegClass(const TargetRegisterClass *RC) {
+ assert(isNVPTXVectorRegClass(RC) &&
+ "Not a vector register class");
+ return getNVPTXElemClass(RC);
+}
+
+bool VectorElementize::isVectorRegister(unsigned reg) {
+ const TargetRegisterClass *RC=MRI->getRegClass(reg);
+ return isNVPTXVectorRegClass(RC);
+}
+
+///=============================================================================
+///For every vector register 'v' that is not already in the VectorToScalarMap,
+///create n scalar registers of the corresponding element type, where n
+///is 2 or 4 (getNVPTXVectorSize), and add them to VectorToScalarMap.
+///=============================================================================
+SmallVector<unsigned, 4> VectorElementize::getScalarRegisters(unsigned regnum) {
+ assert(isVectorRegister(regnum) && "Expecting a vector register here");
+ // Create the scalar registers and put them in the map, if not already there.
+ if (VectorToScalarMap.find(regnum) == VectorToScalarMap.end()) {
+ const TargetRegisterClass *vecClass = MRI->getRegClass(regnum);
+ const TargetRegisterClass *scalarClass = getScalarRegClass(vecClass);
+
+ SmallVector<unsigned, 4> temp;
+
+ for (unsigned i=0, e=getNVPTXVectorSize(vecClass); i!=e; ++i)
+ temp.push_back(MRI->createVirtualRegister(scalarClass));
+
+ VectorToScalarMap[regnum] = temp;
+ }
+ return VectorToScalarMap[regnum];
+}
+
+///=============================================================================
+///For a vector load of the form
+///va <= ldv2 [addr]
+///the following multi-output instruction is created:
+///[v1, v2] <= LD [addr]
+///Look at NVPTXVector.td for the definitions of multi output loads.
+///=============================================================================
+void VectorElementize::createLoadCopy(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ copies.push_back(F.CloneMachineInstr(Instr));
+
+ MachineInstr *copy=copies[0];
+ copy->setDesc(InstrInfo->get(getScalarVersion(copy)));
+
+ // Remove the dest, which should be a vector operand.
+ MachineOperand dest = copy->getOperand(0);
+ unsigned regnum = dest.getReg();
+
+ SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum);
+ copy->RemoveOperand(0);
+
+ std::vector<MachineOperand> otherOperands;
+ for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i)
+ otherOperands.push_back(copy->getOperand(i));
+
+ for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i)
+ copy->RemoveOperand(0);
+
+ for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) {
+ copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], true));
+ }
+
+ for (unsigned i=0, e=otherOperands.size(); i!=e; ++i)
+ copy->addOperand(otherOperands[i]);
+
+}
+
+///=============================================================================
+///For a vector store of the form
+///stv2 va, [addr]
+///the following multi-input instruction is created:
+///ST v1, v2, [addr]
+///Look at NVPTXVector.td for the definitions of multi input stores.
+///=============================================================================
+void VectorElementize::createStoreCopy(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ copies.push_back(F.CloneMachineInstr(Instr));
+
+ MachineInstr *copy=copies[0];
+ copy->setDesc(InstrInfo->get(getScalarVersion(copy)));
+
+ MachineOperand src = copy->getOperand(0);
+ unsigned regnum = src.getReg();
+
+ SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum);
+ copy->RemoveOperand(0);
+
+ std::vector<MachineOperand> otherOperands;
+ for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i)
+ otherOperands.push_back(copy->getOperand(i));
+
+ for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i)
+ copy->RemoveOperand(0);
+
+ for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i)
+ copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], false));
+
+ for (unsigned i=0, e=otherOperands.size(); i!=e; ++i)
+ copy->addOperand(otherOperands[i]);
+}
+
+///=============================================================================
+///va <= shufflev2 vb, vc, <i1>, <i2>
+///gets converted to 2 moves into a1 and a2. The sources of the moves depend on
+///i1 and i2: i1 and i2 can belong to the set {0, 1, 2, 3} for shufflev2, and to
+///{0, ..., 7} for shufflev4. For example, if i1=3, i2=0, the move
+///instructions will be
+///a1 <= c2
+///a2 <= b1
+///=============================================================================
+void VectorElementize::createVecShuffle(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ unsigned numcopies=numCopiesNeeded(Instr);
+
+ unsigned destregnum = Instr->getOperand(0).getReg();
+ unsigned src1regnum = Instr->getOperand(1).getReg();
+ unsigned src2regnum = Instr->getOperand(2).getReg();
+
+ SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum);
+ SmallVector<unsigned, 4> src1 = getScalarRegisters(src1regnum);
+ SmallVector<unsigned, 4> src2 = getScalarRegisters(src2regnum);
+
+ DebugLoc DL = Instr->getDebugLoc();
+
+ for (unsigned i=0; i<numcopies; i++) {
+ MachineInstr *copy = BuildMI(F, DL,
+ InstrInfo->get(getScalarVersion(Instr)), dest[i]);
+ MachineOperand which=Instr->getOperand(3+i);
+ assert(which.isImm() && "Shuffle operand not a constant");
+
+ int src=which.getImm();
+ int elem=src%numcopies;
+
+ if (which.getImm() < numcopies)
+ copy->addOperand(MachineOperand::CreateReg(src1[elem], false));
+ else
+ copy->addOperand(MachineOperand::CreateReg(src2[elem], false));
+ copies.push_back(copy);
+ }
+}
+
+///=============================================================================
+///a <= extractv2 va, <i1>
+///gets turned into a simple move to the scalar register a. The source depends
+///on i1.
+///=============================================================================
+void VectorElementize::createVecExtract(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ unsigned srcregnum = Instr->getOperand(1).getReg();
+
+ SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum);
+
+ MachineOperand which = Instr->getOperand(2);
+ assert(which.isImm() && "Extract operand not a constant");
+
+ DebugLoc DL = Instr->getDebugLoc();
+
+ MachineInstr *copy = BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)),
+ Instr->getOperand(0).getReg());
+ copy->addOperand(MachineOperand::CreateReg(src[which.getImm()], false));
+
+ copies.push_back(copy);
+}
+
+///=============================================================================
+///va <= vecinsertv2 vb, c, <i1>
+///This instruction copies all elements of vb to va, except the 'i1'th element.
+///The scalar value c becomes the 'i1'th element of va.
+///This gets translated to 2 (4 for vecinsertv4) moves.
+///=============================================================================
+void VectorElementize::createVecInsert(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ unsigned numcopies=numCopiesNeeded(Instr);
+
+ unsigned destregnum = Instr->getOperand(0).getReg();
+ unsigned srcregnum = Instr->getOperand(1).getReg();
+
+ SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum);
+ SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum);
+
+ MachineOperand which=Instr->getOperand(3);
+ assert(which.isImm() && "Insert operand not a constant");
+ unsigned int elem=which.getImm();
+
+ DebugLoc DL = Instr->getDebugLoc();
+
+ for (unsigned i=0; i<numcopies; i++) {
+ MachineInstr *copy = BuildMI(F, DL,
+ InstrInfo->get(getScalarVersion(Instr)), dest[i]);
+
+ if (i != elem)
+ copy->addOperand(MachineOperand::CreateReg(src[i], false));
+ else
+ copy->addOperand(Instr->getOperand(2));
+
+ copies.push_back(copy);
+ }
+
+}
+
+///=============================================================================
+///va <= buildv2 b1, b2
+///gets translated to
+///a1 <= b1
+///a2 <= b2
+///=============================================================================
+void VectorElementize::createVecBuild(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ unsigned numcopies=numCopiesNeeded(Instr);
+
+ unsigned destregnum = Instr->getOperand(0).getReg();
+
+ SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum);
+
+ DebugLoc DL = Instr->getDebugLoc();
+
+ for (unsigned i=0; i<numcopies; i++) {
+ MachineInstr *copy = BuildMI(F, DL,
+ InstrInfo->get(getScalarVersion(Instr)), dest[i]);
+
+ copy->addOperand(Instr->getOperand(1+i));
+
+ copies.push_back(copy);
+ }
+
+}
+
+///=============================================================================
+///For a tex inst of the form
+///va <= op [scalar operands]
+///the following multi output instruction is created :
+///[v1, v2] <= op' [scalar operands]
+///=============================================================================
+void VectorElementize::createVecDest(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ copies.push_back(F.CloneMachineInstr(Instr));
+
+ MachineInstr *copy=copies[0];
+ copy->setDesc(InstrInfo->get(getScalarVersion(copy)));
+
+ // Remove the dest, which should be a vector operand.
+ MachineOperand dest = copy->getOperand(0);
+ unsigned regnum = dest.getReg();
+
+ SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum);
+ copy->RemoveOperand(0);
+
+ std::vector<MachineOperand> otherOperands;
+ for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i)
+ otherOperands.push_back(copy->getOperand(i));
+
+ for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i)
+ copy->RemoveOperand(0);
+
+ for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i)
+ copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], true));
+
+ for (unsigned i=0, e=otherOperands.size(); i!=e; ++i)
+ copy->addOperand(otherOperands[i]);
+}
+
+///=============================================================================
+///Look at the vector instruction type and dispatch to the createVec<...>
+///function that creates the scalar copies.
+///=============================================================================
+void VectorElementize::createCopies(MachineFunction& F, MachineInstr *Instr,
+ std::vector<MachineInstr *>& copies) {
+ if (ISVECLOAD(Instr)) {
+ createLoadCopy(F, Instr, copies);
+ return;
+ }
+ if (ISVECSTORE(Instr)) {
+ createStoreCopy(F, Instr, copies);
+ return;
+ }
+ if (ISVECSHUFFLE(Instr)) {
+ createVecShuffle(F, Instr, copies);
+ return;
+ }
+ if (ISVECEXTRACT(Instr)) {
+ createVecExtract(F, Instr, copies);
+ return;
+ }
+ if (ISVECINSERT(Instr)) {
+ createVecInsert(F, Instr, copies);
+ return;
+ }
+ if (ISVECDEST(Instr)) {
+ createVecDest(F, Instr, copies);
+ return;
+ }
+ if (ISVECBUILD(Instr)) {
+ createVecBuild(F, Instr, copies);
+ return;
+ }
+
+ unsigned numcopies=numCopiesNeeded(Instr);
+
+ for (unsigned i=0; i<numcopies; ++i)
+ copies.push_back(F.CloneMachineInstr(Instr));
+
+ for (unsigned i=0; i<numcopies; ++i) {
+ MachineInstr *copy = copies[i];
+
+ std::vector<MachineOperand> allOperands;
+ std::vector<bool> isDef;
+
+ for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) {
+ MachineOperand oper = copy->getOperand(j);
+ allOperands.push_back(oper);
+ if (oper.isReg())
+ isDef.push_back(oper.isDef());
+ else
+ isDef.push_back(false);
+ }
+
+ for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j)
+ copy->RemoveOperand(0);
+
+ copy->setDesc(InstrInfo->get(getScalarVersion(Instr)));
+
+ for (unsigned j=0, e=allOperands.size(); j!=e; ++j) {
+ MachineOperand oper=allOperands[j];
+ if (oper.isReg()) {
+ unsigned regnum = oper.getReg();
+ if (isVectorRegister(regnum)) {
+
+ SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum);
+ copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], isDef[j]));
+ }
+ else
+ copy->addOperand(oper);
+ }
+ else
+ copy->addOperand(oper);
+ }
+ }
+}
+
+///=============================================================================
+///Scan through all basic blocks, looking for vector instructions.
+///For each vector instruction I, insert the scalar copies before I, and
+///add I into toRemove vector. Finally remove all instructions in toRemove.
+///=============================================================================
+void VectorElementize::elementize(MachineFunction &F) {
+ for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend();
+ BI!=BE; ++BI) {
+ MachineBasicBlock *BB = &*BI;
+
+ std::vector<MachineInstr *> copies;
+ std::vector<MachineInstr *> toRemove;
+
+ for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end();
+ II!=IE; ++II) {
+ MachineInstr *Instr = &*II;
+
+ if (!isVectorInstr(Instr))
+ continue;
+
+ copies.clear();
+ createCopies(F, Instr, copies);
+ assert(!copies.empty() && "Problem in createCopies");
+ for (unsigned i=0, e=copies.size(); i!=e; ++i)
+ BB->insert(II, copies[i]);
+
+ toRemove.push_back(Instr);
+ }
+ for (unsigned i=0, e=toRemove.size(); i!=e; ++i)
+ F.DeleteMachineInstr(toRemove[i]->getParent()->remove(toRemove[i]));
+ }
+}
+
+///=============================================================================
+///a <= b
+///...
+///...
+///x <= op(a, ...)
+///gets converted to
+///
+///x <= op(b, ...)
+///The original move is still present. This works on SSA form machine code.
+///Note that a <= b should be a simple vreg-to-vreg move instruction.
+///TBD: no replaceOperand-style helper was found, so all operands are removed
+///and re-added, substituting the propagated one while adding.
+///=============================================================================
+unsigned VectorElementize::copyProp(MachineFunction &F) {
+ unsigned numReplacements = 0;
+
+ for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE;
+ ++BI) {
+ MachineBasicBlock *BB = &*BI;
+
+ for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE;
+ ++II) {
+ MachineInstr *Instr = &*II;
+
+ // Don't do copy propagation on PHI, as it would cause unnecessary
+ // live range overlap; debug values are skipped as well.
+ if ((Instr->getOpcode() == TargetOpcode::PHI) ||
+ (Instr->getOpcode() == TargetOpcode::DBG_VALUE))
+ continue;
+
+ bool needsReplacement = false;
+
+ for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+ MachineOperand oper = Instr->getOperand(i);
+ if (!oper.isReg()) continue;
+ if (oper.isDef()) continue;
+ if (!RegInfo->isVirtualRegister(oper.getReg())) continue;
+
+ MachineInstr *defInstr = MRI->getVRegDef(oper.getReg());
+
+ if (!defInstr) continue;
+
+ if (!isSimpleMove(defInstr)) continue;
+
+ MachineOperand defSrc = defInstr->getOperand(1);
+ if (!defSrc.isReg()) continue;
+ if (!RegInfo->isVirtualRegister(defSrc.getReg())) continue;
+
+ needsReplacement = true;
+
+ }
+ if (!needsReplacement) continue;
+
+ numReplacements++;
+
+ std::vector<MachineOperand> operands;
+
+ for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+ MachineOperand oper = Instr->getOperand(i);
+ bool flag = false;
+ do {
+ if (!(oper.isReg()))
+ break;
+ if (oper.isDef())
+ break;
+ if (!(RegInfo->isVirtualRegister(oper.getReg())))
+ break;
+ MachineInstr *defInstr = MRI->getVRegDef(oper.getReg());
+ if (!(isSimpleMove(defInstr)))
+ break;
+ MachineOperand defSrc = defInstr->getOperand(1);
+ if (!(defSrc.isReg()))
+ break;
+ if (!(RegInfo->isVirtualRegister(defSrc.getReg())))
+ break;
+ operands.push_back(defSrc);
+ flag = true;
+ } while (0);
+ if (!flag)
+ operands.push_back(oper);
+ }
+
+ for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i)
+ Instr->RemoveOperand(0);
+ for (unsigned i=0, e=operands.size(); i!=e; ++i)
+ Instr->addOperand(operands[i]);
+
+ }
+ }
+ return numReplacements;
+}
+
+///=============================================================================
+///Look for simple vreg-to-vreg moves whose destination register has no uses,
+///add them to the deadMoves vector, then remove all instructions in deadMoves.
+///=============================================================================
+unsigned VectorElementize::removeDeadMoves(MachineFunction &F) {
+ std::vector<MachineInstr *> deadMoves;
+ for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE;
+ ++BI) {
+ MachineBasicBlock *BB = &*BI;
+
+ for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE;
+ ++II) {
+ MachineInstr *Instr = &*II;
+
+ if (!isSimpleMove(Instr)) continue;
+
+ MachineOperand dest = Instr->getOperand(0);
+ assert(dest.isReg() && "dest of move not a register");
+ assert(RegInfo->isVirtualRegister(dest.getReg()) &&
+ "dest of move not a virtual register");
+
+ if (MRI->use_empty(dest.getReg())) {
+ deadMoves.push_back(Instr);
+ }
+ }
+ }
+
+ for (unsigned i=0, e=deadMoves.size(); i!=e; ++i)
+ F.DeleteMachineInstr(deadMoves[i]->getParent()->remove(deadMoves[i]));
+
+ return deadMoves.size();
+}
+
+///=============================================================================
+///Main function for this pass.
+///=============================================================================
+bool VectorElementize::runOnMachineFunction(MachineFunction &F) {
+ MRI = &F.getRegInfo();
+
+ RegInfo = TM.getRegisterInfo();
+ InstrInfo = TM.getInstrInfo();
+
+ VectorToScalarMap.clear();
+
+ elementize(F);
+
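+ // Iterate copy propagation and dead-move removal to a fixed point: each
+ // round of copyProp can leave more moves with no remaining uses.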
+ if (RemoveRedundantMoves)
+ while (1) {
+ if (copyProp(F) == 0) break;
+ removeDeadMoves(F);
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createVectorElementizePass(NVPTXTargetMachine &tm) {
+ return new VectorElementize(tm);
+}
+
+unsigned VectorElementize::getScalarVersion(unsigned opcode) {
+ if (opcode == NVPTX::PHI)
+ return opcode;
+ if (opcode == NVPTX::IMPLICIT_DEF)
+ return opcode;
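+ // Table mapping each vector pseudo-opcode to the scalar opcode emitted once
+ // per element; the v2 and v4 forms of an operation share one scalar opcode.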
+ switch(opcode) {
+ default: llvm_unreachable("Scalar version not set, fix NVPTXVector.td");
+ case TargetOpcode::COPY: return TargetOpcode::COPY;
+ case NVPTX::AddCCCV2I32: return NVPTX::ADDCCCi32rr;
+ case NVPTX::AddCCCV4I32: return NVPTX::ADDCCCi32rr;
+ case NVPTX::AddCCV2I32: return NVPTX::ADDCCi32rr;
+ case NVPTX::AddCCV4I32: return NVPTX::ADDCCi32rr;
+ case NVPTX::Build_Vector2_f32: return NVPTX::FMOV32rr;
+ case NVPTX::Build_Vector2_f64: return NVPTX::FMOV64rr;
+ case NVPTX::Build_Vector2_i16: return NVPTX::IMOV16rr;
+ case NVPTX::Build_Vector2_i32: return NVPTX::IMOV32rr;
+ case NVPTX::Build_Vector2_i64: return NVPTX::IMOV64rr;
+ case NVPTX::Build_Vector2_i8: return NVPTX::IMOV8rr;
+ case NVPTX::Build_Vector4_f32: return NVPTX::FMOV32rr;
+ case NVPTX::Build_Vector4_i16: return NVPTX::IMOV16rr;
+ case NVPTX::Build_Vector4_i32: return NVPTX::IMOV32rr;
+ case NVPTX::Build_Vector4_i8: return NVPTX::IMOV8rr;
+ case NVPTX::CVTv2i16tov2i32: return NVPTX::Zint_extendext16to32;
+ case NVPTX::CVTv2i64tov2i32: return NVPTX::TRUNC_64to32;
+ case NVPTX::CVTv2i8tov2i32: return NVPTX::Zint_extendext8to32;
+ case NVPTX::CVTv4i16tov4i32: return NVPTX::Zint_extendext16to32;
+ case NVPTX::CVTv4i8tov4i32: return NVPTX::Zint_extendext8to32;
+ case NVPTX::F32MAD_ftzV2: return NVPTX::FMAD32_ftzrrr;
+ case NVPTX::F32MADV2: return NVPTX::FMAD32rrr;
+ case NVPTX::F32MAD_ftzV4: return NVPTX::FMAD32_ftzrrr;
+ case NVPTX::F32MADV4: return NVPTX::FMAD32rrr;
+ case NVPTX::F32FMA_ftzV2: return NVPTX::FMA32_ftzrrr;
+ case NVPTX::F32FMAV2: return NVPTX::FMA32rrr;
+ case NVPTX::F32FMA_ftzV4: return NVPTX::FMA32_ftzrrr;
+ case NVPTX::F32FMAV4: return NVPTX::FMA32rrr;
+ case NVPTX::F64FMAV2: return NVPTX::FMA64rrr;
+ case NVPTX::FVecEQV2F32: return NVPTX::FSetEQf32rr_toi32;
+ case NVPTX::FVecEQV2F64: return NVPTX::FSetEQf64rr_toi64;
+ case NVPTX::FVecEQV4F32: return NVPTX::FSetEQf32rr_toi32;
+ case NVPTX::FVecGEV2F32: return NVPTX::FSetGEf32rr_toi32;
+ case NVPTX::FVecGEV2F64: return NVPTX::FSetGEf64rr_toi64;
+ case NVPTX::FVecGEV4F32: return NVPTX::FSetGEf32rr_toi32;
+ case NVPTX::FVecGTV2F32: return NVPTX::FSetGTf32rr_toi32;
+ case NVPTX::FVecGTV2F64: return NVPTX::FSetGTf64rr_toi64;
+ case NVPTX::FVecGTV4F32: return NVPTX::FSetGTf32rr_toi32;
+ case NVPTX::FVecLEV2F32: return NVPTX::FSetLEf32rr_toi32;
+ case NVPTX::FVecLEV2F64: return NVPTX::FSetLEf64rr_toi64;
+ case NVPTX::FVecLEV4F32: return NVPTX::FSetLEf32rr_toi32;
+ case NVPTX::FVecLTV2F32: return NVPTX::FSetLTf32rr_toi32;
+ case NVPTX::FVecLTV2F64: return NVPTX::FSetLTf64rr_toi64;
+ case NVPTX::FVecLTV4F32: return NVPTX::FSetLTf32rr_toi32;
+ case NVPTX::FVecNANV2F32: return NVPTX::FSetNANf32rr_toi32;
+ case NVPTX::FVecNANV2F64: return NVPTX::FSetNANf64rr_toi64;
+ case NVPTX::FVecNANV4F32: return NVPTX::FSetNANf32rr_toi32;
+ case NVPTX::FVecNEV2F32: return NVPTX::FSetNEf32rr_toi32;
+ case NVPTX::FVecNEV2F64: return NVPTX::FSetNEf64rr_toi64;
+ case NVPTX::FVecNEV4F32: return NVPTX::FSetNEf32rr_toi32;
+ case NVPTX::FVecNUMV2F32: return NVPTX::FSetNUMf32rr_toi32;
+ case NVPTX::FVecNUMV2F64: return NVPTX::FSetNUMf64rr_toi64;
+ case NVPTX::FVecNUMV4F32: return NVPTX::FSetNUMf32rr_toi32;
+ case NVPTX::FVecUEQV2F32: return NVPTX::FSetUEQf32rr_toi32;
+ case NVPTX::FVecUEQV2F64: return NVPTX::FSetUEQf64rr_toi64;
+ case NVPTX::FVecUEQV4F32: return NVPTX::FSetUEQf32rr_toi32;
+ case NVPTX::FVecUGEV2F32: return NVPTX::FSetUGEf32rr_toi32;
+ case NVPTX::FVecUGEV2F64: return NVPTX::FSetUGEf64rr_toi64;
+ case NVPTX::FVecUGEV4F32: return NVPTX::FSetUGEf32rr_toi32;
+ case NVPTX::FVecUGTV2F32: return NVPTX::FSetUGTf32rr_toi32;
+ case NVPTX::FVecUGTV2F64: return NVPTX::FSetUGTf64rr_toi64;
+ case NVPTX::FVecUGTV4F32: return NVPTX::FSetUGTf32rr_toi32;
+ case NVPTX::FVecULEV2F32: return NVPTX::FSetULEf32rr_toi32;
+ case NVPTX::FVecULEV2F64: return NVPTX::FSetULEf64rr_toi64;
+ case NVPTX::FVecULEV4F32: return NVPTX::FSetULEf32rr_toi32;
+ case NVPTX::FVecULTV2F32: return NVPTX::FSetULTf32rr_toi32;
+ case NVPTX::FVecULTV2F64: return NVPTX::FSetULTf64rr_toi64;
+ case NVPTX::FVecULTV4F32: return NVPTX::FSetULTf32rr_toi32;
+ case NVPTX::FVecUNEV2F32: return NVPTX::FSetUNEf32rr_toi32;
+ case NVPTX::FVecUNEV2F64: return NVPTX::FSetUNEf64rr_toi64;
+ case NVPTX::FVecUNEV4F32: return NVPTX::FSetUNEf32rr_toi32;
+ case NVPTX::I16MADV2: return NVPTX::MAD16rrr;
+ case NVPTX::I16MADV4: return NVPTX::MAD16rrr;
+ case NVPTX::I32MADV2: return NVPTX::MAD32rrr;
+ case NVPTX::I32MADV4: return NVPTX::MAD32rrr;
+ case NVPTX::I64MADV2: return NVPTX::MAD64rrr;
+ case NVPTX::I8MADV2: return NVPTX::MAD8rrr;
+ case NVPTX::I8MADV4: return NVPTX::MAD8rrr;
+ case NVPTX::ShiftLV2I16: return NVPTX::SHLi16rr;
+ case NVPTX::ShiftLV2I32: return NVPTX::SHLi32rr;
+ case NVPTX::ShiftLV2I64: return NVPTX::SHLi64rr;
+ case NVPTX::ShiftLV2I8: return NVPTX::SHLi8rr;
+ case NVPTX::ShiftLV4I16: return NVPTX::SHLi16rr;
+ case NVPTX::ShiftLV4I32: return NVPTX::SHLi32rr;
+ case NVPTX::ShiftLV4I8: return NVPTX::SHLi8rr;
+ case NVPTX::ShiftRAV2I16: return NVPTX::SRAi16rr;
+ case NVPTX::ShiftRAV2I32: return NVPTX::SRAi32rr;
+ case NVPTX::ShiftRAV2I64: return NVPTX::SRAi64rr;
+ case NVPTX::ShiftRAV2I8: return NVPTX::SRAi8rr;
+ case NVPTX::ShiftRAV4I16: return NVPTX::SRAi16rr;
+ case NVPTX::ShiftRAV4I32: return NVPTX::SRAi32rr;
+ case NVPTX::ShiftRAV4I8: return NVPTX::SRAi8rr;
+ case NVPTX::ShiftRLV2I16: return NVPTX::SRLi16rr;
+ case NVPTX::ShiftRLV2I32: return NVPTX::SRLi32rr;
+ case NVPTX::ShiftRLV2I64: return NVPTX::SRLi64rr;
+ case NVPTX::ShiftRLV2I8: return NVPTX::SRLi8rr;
+ case NVPTX::ShiftRLV4I16: return NVPTX::SRLi16rr;
+ case NVPTX::ShiftRLV4I32: return NVPTX::SRLi32rr;
+ case NVPTX::ShiftRLV4I8: return NVPTX::SRLi8rr;
+ case NVPTX::SubCCCV2I32: return NVPTX::SUBCCCi32rr;
+ case NVPTX::SubCCCV4I32: return NVPTX::SUBCCCi32rr;
+ case NVPTX::SubCCV2I32: return NVPTX::SUBCCi32rr;
+ case NVPTX::SubCCV4I32: return NVPTX::SUBCCi32rr;
+ case NVPTX::V2F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz;
+ case NVPTX::V2F32Div_prec: return NVPTX::FDIV32rr_prec;
+ case NVPTX::V2F32Div_ftz: return NVPTX::FDIV32rr_ftz;
+ case NVPTX::V2F32Div: return NVPTX::FDIV32rr;
+ case NVPTX::V2F32_Select: return NVPTX::SELECTf32rr;
+ case NVPTX::V2F64Div: return NVPTX::FDIV64rr;
+ case NVPTX::V2F64_Select: return NVPTX::SELECTf64rr;
+ case NVPTX::V2I16_Select: return NVPTX::SELECTi16rr;
+ case NVPTX::V2I32_Select: return NVPTX::SELECTi32rr;
+ case NVPTX::V2I64_Select: return NVPTX::SELECTi64rr;
+ case NVPTX::V2I8_Select: return NVPTX::SELECTi8rr;
+ case NVPTX::V2f32Extract: return NVPTX::FMOV32rr;
+ case NVPTX::V2f32Insert: return NVPTX::FMOV32rr;
+ case NVPTX::V2f32Mov: return NVPTX::FMOV32rr;
+ case NVPTX::V2f64Extract: return NVPTX::FMOV64rr;
+ case NVPTX::V2f64Insert: return NVPTX::FMOV64rr;
+ case NVPTX::V2f64Mov: return NVPTX::FMOV64rr;
+ case NVPTX::V2i16Extract: return NVPTX::IMOV16rr;
+ case NVPTX::V2i16Insert: return NVPTX::IMOV16rr;
+ case NVPTX::V2i16Mov: return NVPTX::IMOV16rr;
+ case NVPTX::V2i32Extract: return NVPTX::IMOV32rr;
+ case NVPTX::V2i32Insert: return NVPTX::IMOV32rr;
+ case NVPTX::V2i32Mov: return NVPTX::IMOV32rr;
+ case NVPTX::V2i64Extract: return NVPTX::IMOV64rr;
+ case NVPTX::V2i64Insert: return NVPTX::IMOV64rr;
+ case NVPTX::V2i64Mov: return NVPTX::IMOV64rr;
+ case NVPTX::V2i8Extract: return NVPTX::IMOV8rr;
+ case NVPTX::V2i8Insert: return NVPTX::IMOV8rr;
+ case NVPTX::V2i8Mov: return NVPTX::IMOV8rr;
+ case NVPTX::V4F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz;
+ case NVPTX::V4F32Div_prec: return NVPTX::FDIV32rr_prec;
+ case NVPTX::V4F32Div_ftz: return NVPTX::FDIV32rr_ftz;
+ case NVPTX::V4F32Div: return NVPTX::FDIV32rr;
+ case NVPTX::V4F32_Select: return NVPTX::SELECTf32rr;
+ case NVPTX::V4I16_Select: return NVPTX::SELECTi16rr;
+ case NVPTX::V4I32_Select: return NVPTX::SELECTi32rr;
+ case NVPTX::V4I8_Select: return NVPTX::SELECTi8rr;
+ case NVPTX::V4f32Extract: return NVPTX::FMOV32rr;
+ case NVPTX::V4f32Insert: return NVPTX::FMOV32rr;
+ case NVPTX::V4f32Mov: return NVPTX::FMOV32rr;
+ case NVPTX::V4i16Extract: return NVPTX::IMOV16rr;
+ case NVPTX::V4i16Insert: return NVPTX::IMOV16rr;
+ case NVPTX::V4i16Mov: return NVPTX::IMOV16rr;
+ case NVPTX::V4i32Extract: return NVPTX::IMOV32rr;
+ case NVPTX::V4i32Insert: return NVPTX::IMOV32rr;
+ case NVPTX::V4i32Mov: return NVPTX::IMOV32rr;
+ case NVPTX::V4i8Extract: return NVPTX::IMOV8rr;
+ case NVPTX::V4i8Insert: return NVPTX::IMOV8rr;
+ case NVPTX::V4i8Mov: return NVPTX::IMOV8rr;
+ case NVPTX::VAddV2I16: return NVPTX::ADDi16rr;
+ case NVPTX::VAddV2I32: return NVPTX::ADDi32rr;
+ case NVPTX::VAddV2I64: return NVPTX::ADDi64rr;
+ case NVPTX::VAddV2I8: return NVPTX::ADDi8rr;
+ case NVPTX::VAddV4I16: return NVPTX::ADDi16rr;
+ case NVPTX::VAddV4I32: return NVPTX::ADDi32rr;
+ case NVPTX::VAddV4I8: return NVPTX::ADDi8rr;
+ case NVPTX::VAddfV2F32: return NVPTX::FADDf32rr;
+ case NVPTX::VAddfV2F32_ftz: return NVPTX::FADDf32rr_ftz;
+ case NVPTX::VAddfV2F64: return NVPTX::FADDf64rr;
+ case NVPTX::VAddfV4F32: return NVPTX::FADDf32rr;
+ case NVPTX::VAddfV4F32_ftz: return NVPTX::FADDf32rr_ftz;
+ case NVPTX::VAndV2I16: return NVPTX::ANDb16rr;
+ case NVPTX::VAndV2I32: return NVPTX::ANDb32rr;
+ case NVPTX::VAndV2I64: return NVPTX::ANDb64rr;
+ case NVPTX::VAndV2I8: return NVPTX::ANDb8rr;
+ case NVPTX::VAndV4I16: return NVPTX::ANDb16rr;
+ case NVPTX::VAndV4I32: return NVPTX::ANDb32rr;
+ case NVPTX::VAndV4I8: return NVPTX::ANDb8rr;
+ case NVPTX::VMulfV2F32_ftz: return NVPTX::FMULf32rr_ftz;
+ case NVPTX::VMulfV2F32: return NVPTX::FMULf32rr;
+ case NVPTX::VMulfV2F64: return NVPTX::FMULf64rr;
+ case NVPTX::VMulfV4F32_ftz: return NVPTX::FMULf32rr_ftz;
+ case NVPTX::VMulfV4F32: return NVPTX::FMULf32rr;
+ case NVPTX::VMultHSV2I16: return NVPTX::MULTHSi16rr;
+ case NVPTX::VMultHSV2I32: return NVPTX::MULTHSi32rr;
+ case NVPTX::VMultHSV2I64: return NVPTX::MULTHSi64rr;
+ case NVPTX::VMultHSV2I8: return NVPTX::MULTHSi8rr;
+ case NVPTX::VMultHSV4I16: return NVPTX::MULTHSi16rr;
+ case NVPTX::VMultHSV4I32: return NVPTX::MULTHSi32rr;
+ case NVPTX::VMultHSV4I8: return NVPTX::MULTHSi8rr;
+ case NVPTX::VMultHUV2I16: return NVPTX::MULTHUi16rr;
+ case NVPTX::VMultHUV2I32: return NVPTX::MULTHUi32rr;
+ case NVPTX::VMultHUV2I64: return NVPTX::MULTHUi64rr;
+ case NVPTX::VMultHUV2I8: return NVPTX::MULTHUi8rr;
+ case NVPTX::VMultHUV4I16: return NVPTX::MULTHUi16rr;
+ case NVPTX::VMultHUV4I32: return NVPTX::MULTHUi32rr;
+ case NVPTX::VMultHUV4I8: return NVPTX::MULTHUi8rr;
+ case NVPTX::VMultV2I16: return NVPTX::MULTi16rr;
+ case NVPTX::VMultV2I32: return NVPTX::MULTi32rr;
+ case NVPTX::VMultV2I64: return NVPTX::MULTi64rr;
+ case NVPTX::VMultV2I8: return NVPTX::MULTi8rr;
+ case NVPTX::VMultV4I16: return NVPTX::MULTi16rr;
+ case NVPTX::VMultV4I32: return NVPTX::MULTi32rr;
+ case NVPTX::VMultV4I8: return NVPTX::MULTi8rr;
+ case NVPTX::VNegV2I16: return NVPTX::INEG16;
+ case NVPTX::VNegV2I32: return NVPTX::INEG32;
+ case NVPTX::VNegV2I64: return NVPTX::INEG64;
+ case NVPTX::VNegV2I8: return NVPTX::INEG8;
+ case NVPTX::VNegV4I16: return NVPTX::INEG16;
+ case NVPTX::VNegV4I32: return NVPTX::INEG32;
+ case NVPTX::VNegV4I8: return NVPTX::INEG8;
+ case NVPTX::VNegv2f32: return NVPTX::FNEGf32;
+ case NVPTX::VNegv2f32_ftz: return NVPTX::FNEGf32_ftz;
+ case NVPTX::VNegv2f64: return NVPTX::FNEGf64;
+ case NVPTX::VNegv4f32: return NVPTX::FNEGf32;
+ case NVPTX::VNegv4f32_ftz: return NVPTX::FNEGf32_ftz;
+ case NVPTX::VNotV2I16: return NVPTX::NOT16;
+ case NVPTX::VNotV2I32: return NVPTX::NOT32;
+ case NVPTX::VNotV2I64: return NVPTX::NOT64;
+ case NVPTX::VNotV2I8: return NVPTX::NOT8;
+ case NVPTX::VNotV4I16: return NVPTX::NOT16;
+ case NVPTX::VNotV4I32: return NVPTX::NOT32;
+ case NVPTX::VNotV4I8: return NVPTX::NOT8;
+ case NVPTX::VOrV2I16: return NVPTX::ORb16rr;
+ case NVPTX::VOrV2I32: return NVPTX::ORb32rr;
+ case NVPTX::VOrV2I64: return NVPTX::ORb64rr;
+ case NVPTX::VOrV2I8: return NVPTX::ORb8rr;
+ case NVPTX::VOrV4I16: return NVPTX::ORb16rr;
+ case NVPTX::VOrV4I32: return NVPTX::ORb32rr;
+ case NVPTX::VOrV4I8: return NVPTX::ORb8rr;
+ case NVPTX::VSDivV2I16: return NVPTX::SDIVi16rr;
+ case NVPTX::VSDivV2I32: return NVPTX::SDIVi32rr;
+ case NVPTX::VSDivV2I64: return NVPTX::SDIVi64rr;
+ case NVPTX::VSDivV2I8: return NVPTX::SDIVi8rr;
+ case NVPTX::VSDivV4I16: return NVPTX::SDIVi16rr;
+ case NVPTX::VSDivV4I32: return NVPTX::SDIVi32rr;
+ case NVPTX::VSDivV4I8: return NVPTX::SDIVi8rr;
+ case NVPTX::VSRemV2I16: return NVPTX::SREMi16rr;
+ case NVPTX::VSRemV2I32: return NVPTX::SREMi32rr;
+ case NVPTX::VSRemV2I64: return NVPTX::SREMi64rr;
+ case NVPTX::VSRemV2I8: return NVPTX::SREMi8rr;
+ case NVPTX::VSRemV4I16: return NVPTX::SREMi16rr;
+ case NVPTX::VSRemV4I32: return NVPTX::SREMi32rr;
+ case NVPTX::VSRemV4I8: return NVPTX::SREMi8rr;
+ case NVPTX::VSubV2I16: return NVPTX::SUBi16rr;
+ case NVPTX::VSubV2I32: return NVPTX::SUBi32rr;
+ case NVPTX::VSubV2I64: return NVPTX::SUBi64rr;
+ case NVPTX::VSubV2I8: return NVPTX::SUBi8rr;
+ case NVPTX::VSubV4I16: return NVPTX::SUBi16rr;
+ case NVPTX::VSubV4I32: return NVPTX::SUBi32rr;
+ case NVPTX::VSubV4I8: return NVPTX::SUBi8rr;
+ case NVPTX::VSubfV2F32_ftz: return NVPTX::FSUBf32rr_ftz;
+ case NVPTX::VSubfV2F32: return NVPTX::FSUBf32rr;
+ case NVPTX::VSubfV2F64: return NVPTX::FSUBf64rr;
+ case NVPTX::VSubfV4F32_ftz: return NVPTX::FSUBf32rr_ftz;
+ case NVPTX::VSubfV4F32: return NVPTX::FSUBf32rr;
+ case NVPTX::VUDivV2I16: return NVPTX::UDIVi16rr;
+ case NVPTX::VUDivV2I32: return NVPTX::UDIVi32rr;
+ case NVPTX::VUDivV2I64: return NVPTX::UDIVi64rr;
+ case NVPTX::VUDivV2I8: return NVPTX::UDIVi8rr;
+ case NVPTX::VUDivV4I16: return NVPTX::UDIVi16rr;
+ case NVPTX::VUDivV4I32: return NVPTX::UDIVi32rr;
+ case NVPTX::VUDivV4I8: return NVPTX::UDIVi8rr;
+ case NVPTX::VURemV2I16: return NVPTX::UREMi16rr;
+ case NVPTX::VURemV2I32: return NVPTX::UREMi32rr;
+ case NVPTX::VURemV2I64: return NVPTX::UREMi64rr;
+ case NVPTX::VURemV2I8: return NVPTX::UREMi8rr;
+ case NVPTX::VURemV4I16: return NVPTX::UREMi16rr;
+ case NVPTX::VURemV4I32: return NVPTX::UREMi32rr;
+ case NVPTX::VURemV4I8: return NVPTX::UREMi8rr;
+ case NVPTX::VXorV2I16: return NVPTX::XORb16rr;
+ case NVPTX::VXorV2I32: return NVPTX::XORb32rr;
+ case NVPTX::VXorV2I64: return NVPTX::XORb64rr;
+ case NVPTX::VXorV2I8: return NVPTX::XORb8rr;
+ case NVPTX::VXorV4I16: return NVPTX::XORb16rr;
+ case NVPTX::VXorV4I32: return NVPTX::XORb32rr;
+ case NVPTX::VXorV4I8: return NVPTX::XORb8rr;
+ case NVPTX::VecSEQV2I16: return NVPTX::ISetSEQi16rr_toi16;
+ case NVPTX::VecSEQV2I32: return NVPTX::ISetSEQi32rr_toi32;
+ case NVPTX::VecSEQV2I64: return NVPTX::ISetSEQi64rr_toi64;
+ case NVPTX::VecSEQV2I8: return NVPTX::ISetSEQi8rr_toi8;
+ case NVPTX::VecSEQV4I16: return NVPTX::ISetSEQi16rr_toi16;
+ case NVPTX::VecSEQV4I32: return NVPTX::ISetSEQi32rr_toi32;
+ case NVPTX::VecSEQV4I8: return NVPTX::ISetSEQi8rr_toi8;
+ case NVPTX::VecSGEV2I16: return NVPTX::ISetSGEi16rr_toi16;
+ case NVPTX::VecSGEV2I32: return NVPTX::ISetSGEi32rr_toi32;
+ case NVPTX::VecSGEV2I64: return NVPTX::ISetSGEi64rr_toi64;
+ case NVPTX::VecSGEV2I8: return NVPTX::ISetSGEi8rr_toi8;
+ case NVPTX::VecSGEV4I16: return NVPTX::ISetSGEi16rr_toi16;
+ case NVPTX::VecSGEV4I32: return NVPTX::ISetSGEi32rr_toi32;
+ case NVPTX::VecSGEV4I8: return NVPTX::ISetSGEi8rr_toi8;
+ case NVPTX::VecSGTV2I16: return NVPTX::ISetSGTi16rr_toi16;
+ case NVPTX::VecSGTV2I32: return NVPTX::ISetSGTi32rr_toi32;
+ case NVPTX::VecSGTV2I64: return NVPTX::ISetSGTi64rr_toi64;
+ case NVPTX::VecSGTV2I8: return NVPTX::ISetSGTi8rr_toi8;
+ case NVPTX::VecSGTV4I16: return NVPTX::ISetSGTi16rr_toi16;
+ case NVPTX::VecSGTV4I32: return NVPTX::ISetSGTi32rr_toi32;
+ case NVPTX::VecSGTV4I8: return NVPTX::ISetSGTi8rr_toi8;
+ case NVPTX::VecSLEV2I16: return NVPTX::ISetSLEi16rr_toi16;
+ case NVPTX::VecSLEV2I32: return NVPTX::ISetSLEi32rr_toi32;
+ case NVPTX::VecSLEV2I64: return NVPTX::ISetSLEi64rr_toi64;
+ case NVPTX::VecSLEV2I8: return NVPTX::ISetSLEi8rr_toi8;
+ case NVPTX::VecSLEV4I16: return NVPTX::ISetSLEi16rr_toi16;
+ case NVPTX::VecSLEV4I32: return NVPTX::ISetSLEi32rr_toi32;
+ case NVPTX::VecSLEV4I8: return NVPTX::ISetSLEi8rr_toi8;
+ case NVPTX::VecSLTV2I16: return NVPTX::ISetSLTi16rr_toi16;
+ case NVPTX::VecSLTV2I32: return NVPTX::ISetSLTi32rr_toi32;
+ case NVPTX::VecSLTV2I64: return NVPTX::ISetSLTi64rr_toi64;
+ case NVPTX::VecSLTV2I8: return NVPTX::ISetSLTi8rr_toi8;
+ case NVPTX::VecSLTV4I16: return NVPTX::ISetSLTi16rr_toi16;
+ case NVPTX::VecSLTV4I32: return NVPTX::ISetSLTi32rr_toi32;
+ case NVPTX::VecSLTV4I8: return NVPTX::ISetSLTi8rr_toi8;
+ case NVPTX::VecSNEV2I16: return NVPTX::ISetSNEi16rr_toi16;
+ case NVPTX::VecSNEV2I32: return NVPTX::ISetSNEi32rr_toi32;
+ case NVPTX::VecSNEV2I64: return NVPTX::ISetSNEi64rr_toi64;
+ case NVPTX::VecSNEV2I8: return NVPTX::ISetSNEi8rr_toi8;
+ case NVPTX::VecSNEV4I16: return NVPTX::ISetSNEi16rr_toi16;
+ case NVPTX::VecSNEV4I32: return NVPTX::ISetSNEi32rr_toi32;
+ case NVPTX::VecSNEV4I8: return NVPTX::ISetSNEi8rr_toi8;
+ case NVPTX::VecShuffle_v2f32: return NVPTX::FMOV32rr;
+ case NVPTX::VecShuffle_v2f64: return NVPTX::FMOV64rr;
+ case NVPTX::VecShuffle_v2i16: return NVPTX::IMOV16rr;
+ case NVPTX::VecShuffle_v2i32: return NVPTX::IMOV32rr;
+ case NVPTX::VecShuffle_v2i64: return NVPTX::IMOV64rr;
+ case NVPTX::VecShuffle_v2i8: return NVPTX::IMOV8rr;
+ case NVPTX::VecShuffle_v4f32: return NVPTX::FMOV32rr;
+ case NVPTX::VecShuffle_v4i16: return NVPTX::IMOV16rr;
+ case NVPTX::VecShuffle_v4i32: return NVPTX::IMOV32rr;
+ case NVPTX::VecShuffle_v4i8: return NVPTX::IMOV8rr;
+ case NVPTX::VecUEQV2I16: return NVPTX::ISetUEQi16rr_toi16;
+ case NVPTX::VecUEQV2I32: return NVPTX::ISetUEQi32rr_toi32;
+ case NVPTX::VecUEQV2I64: return NVPTX::ISetUEQi64rr_toi64;
+ case NVPTX::VecUEQV2I8: return NVPTX::ISetUEQi8rr_toi8;
+ case NVPTX::VecUEQV4I16: return NVPTX::ISetUEQi16rr_toi16;
+ case NVPTX::VecUEQV4I32: return NVPTX::ISetUEQi32rr_toi32;
+ case NVPTX::VecUEQV4I8: return NVPTX::ISetUEQi8rr_toi8;
+ case NVPTX::VecUGEV2I16: return NVPTX::ISetUGEi16rr_toi16;
+ case NVPTX::VecUGEV2I32: return NVPTX::ISetUGEi32rr_toi32;
+ case NVPTX::VecUGEV2I64: return NVPTX::ISetUGEi64rr_toi64;
+ case NVPTX::VecUGEV2I8: return NVPTX::ISetUGEi8rr_toi8;
+ case NVPTX::VecUGEV4I16: return NVPTX::ISetUGEi16rr_toi16;
+ case NVPTX::VecUGEV4I32: return NVPTX::ISetUGEi32rr_toi32;
+ case NVPTX::VecUGEV4I8: return NVPTX::ISetUGEi8rr_toi8;
+ case NVPTX::VecUGTV2I16: return NVPTX::ISetUGTi16rr_toi16;
+ case NVPTX::VecUGTV2I32: return NVPTX::ISetUGTi32rr_toi32;
+ case NVPTX::VecUGTV2I64: return NVPTX::ISetUGTi64rr_toi64;
+ case NVPTX::VecUGTV2I8: return NVPTX::ISetUGTi8rr_toi8;
+ case NVPTX::VecUGTV4I16: return NVPTX::ISetUGTi16rr_toi16;
+ case NVPTX::VecUGTV4I32: return NVPTX::ISetUGTi32rr_toi32;
+ case NVPTX::VecUGTV4I8: return NVPTX::ISetUGTi8rr_toi8;
+ case NVPTX::VecULEV2I16: return NVPTX::ISetULEi16rr_toi16;
+ case NVPTX::VecULEV2I32: return NVPTX::ISetULEi32rr_toi32;
+ case NVPTX::VecULEV2I64: return NVPTX::ISetULEi64rr_toi64;
+ case NVPTX::VecULEV2I8: return NVPTX::ISetULEi8rr_toi8;
+ case NVPTX::VecULEV4I16: return NVPTX::ISetULEi16rr_toi16;
+ case NVPTX::VecULEV4I32: return NVPTX::ISetULEi32rr_toi32;
+ case NVPTX::VecULEV4I8: return NVPTX::ISetULEi8rr_toi8;
+ case NVPTX::VecULTV2I16: return NVPTX::ISetULTi16rr_toi16;
+ case NVPTX::VecULTV2I32: return NVPTX::ISetULTi32rr_toi32;
+ case NVPTX::VecULTV2I64: return NVPTX::ISetULTi64rr_toi64;
+ case NVPTX::VecULTV2I8: return NVPTX::ISetULTi8rr_toi8;
+ case NVPTX::VecULTV4I16: return NVPTX::ISetULTi16rr_toi16;
+ case NVPTX::VecULTV4I32: return NVPTX::ISetULTi32rr_toi32;
+ case NVPTX::VecULTV4I8: return NVPTX::ISetULTi8rr_toi8;
+ case NVPTX::VecUNEV2I16: return NVPTX::ISetUNEi16rr_toi16;
+ case NVPTX::VecUNEV2I32: return NVPTX::ISetUNEi32rr_toi32;
+ case NVPTX::VecUNEV2I64: return NVPTX::ISetUNEi64rr_toi64;
+ case NVPTX::VecUNEV2I8: return NVPTX::ISetUNEi8rr_toi8;
+ case NVPTX::VecUNEV4I16: return NVPTX::ISetUNEi16rr_toi16;
+ case NVPTX::VecUNEV4I32: return NVPTX::ISetUNEi32rr_toi32;
+ case NVPTX::VecUNEV4I8: return NVPTX::ISetUNEi8rr_toi8;
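+
+ // LDU (load-uniform) vector intrinsics map to their per-element (_ELE_)
+ // forms.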
+ case NVPTX::INT_PTX_LDU_G_v2i8_32: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v4i8_32: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v2i16_32: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v4i16_32: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v2i32_32: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v4i32_32: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v2f32_32: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v4f32_32: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v2i64_32: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v2f64_32: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_32;
+ case NVPTX::INT_PTX_LDU_G_v2i8_64: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v4i8_64: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v2i16_64: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v4i16_64: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v2i32_64: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v4i32_64: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v2f32_64: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v4f32_64: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v2i64_64: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_64;
+ case NVPTX::INT_PTX_LDU_G_v2f64_64: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_64;
+
+ case NVPTX::LoadParamV4I32: return NVPTX::LoadParamScalar4I32;
+ case NVPTX::LoadParamV4I16: return NVPTX::LoadParamScalar4I16;
+ case NVPTX::LoadParamV4I8: return NVPTX::LoadParamScalar4I8;
+ case NVPTX::LoadParamV2I64: return NVPTX::LoadParamScalar2I64;
+ case NVPTX::LoadParamV2I32: return NVPTX::LoadParamScalar2I32;
+ case NVPTX::LoadParamV2I16: return NVPTX::LoadParamScalar2I16;
+ case NVPTX::LoadParamV2I8: return NVPTX::LoadParamScalar2I8;
+ case NVPTX::LoadParamV4F32: return NVPTX::LoadParamScalar4F32;
+ case NVPTX::LoadParamV2F32: return NVPTX::LoadParamScalar2F32;
+ case NVPTX::LoadParamV2F64: return NVPTX::LoadParamScalar2F64;
+ case NVPTX::StoreParamV4I32: return NVPTX::StoreParamScalar4I32;
+ case NVPTX::StoreParamV4I16: return NVPTX::StoreParamScalar4I16;
+ case NVPTX::StoreParamV4I8: return NVPTX::StoreParamScalar4I8;
+ case NVPTX::StoreParamV2I64: return NVPTX::StoreParamScalar2I64;
+ case NVPTX::StoreParamV2I32: return NVPTX::StoreParamScalar2I32;
+ case NVPTX::StoreParamV2I16: return NVPTX::StoreParamScalar2I16;
+ case NVPTX::StoreParamV2I8: return NVPTX::StoreParamScalar2I8;
+ case NVPTX::StoreParamV4F32: return NVPTX::StoreParamScalar4F32;
+ case NVPTX::StoreParamV2F32: return NVPTX::StoreParamScalar2F32;
+ case NVPTX::StoreParamV2F64: return NVPTX::StoreParamScalar2F64;
+ case NVPTX::StoreRetvalV4I32: return NVPTX::StoreRetvalScalar4I32;
+ case NVPTX::StoreRetvalV4I16: return NVPTX::StoreRetvalScalar4I16;
+ case NVPTX::StoreRetvalV4I8: return NVPTX::StoreRetvalScalar4I8;
+ case NVPTX::StoreRetvalV2I64: return NVPTX::StoreRetvalScalar2I64;
+ case NVPTX::StoreRetvalV2I32: return NVPTX::StoreRetvalScalar2I32;
+ case NVPTX::StoreRetvalV2I16: return NVPTX::StoreRetvalScalar2I16;
+ case NVPTX::StoreRetvalV2I8: return NVPTX::StoreRetvalScalar2I8;
+ case NVPTX::StoreRetvalV4F32: return NVPTX::StoreRetvalScalar4F32;
+ case NVPTX::StoreRetvalV2F32: return NVPTX::StoreRetvalScalar2F32;
+ case NVPTX::StoreRetvalV2F64: return NVPTX::StoreRetvalScalar2F64;
+ case NVPTX::VecI32toV4I8: return NVPTX::I32toV4I8;
+ case NVPTX::VecI64toV4I16: return NVPTX::I64toV4I16;
+ case NVPTX::VecI16toV2I8: return NVPTX::I16toV2I8;
+ case NVPTX::VecI32toV2I16: return NVPTX::I32toV2I16;
+ case NVPTX::VecI64toV2I32: return NVPTX::I64toV2I32;
+ case NVPTX::VecF64toV2F32: return NVPTX::F64toV2F32;
+
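+ // Load/store variants are keyed by addressing mode; the suffixes denote
+ // avar (variable/symbol), areg (register), ari (register + immediate),
+ // asi (symbol + immediate).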
+ case NVPTX::LD_v2i8_avar: return NVPTX::LDV_i8_v2_avar;
+ case NVPTX::LD_v2i8_areg: return NVPTX::LDV_i8_v2_areg;
+ case NVPTX::LD_v2i8_ari: return NVPTX::LDV_i8_v2_ari;
+ case NVPTX::LD_v2i8_asi: return NVPTX::LDV_i8_v2_asi;
+ case NVPTX::LD_v4i8_avar: return NVPTX::LDV_i8_v4_avar;
+ case NVPTX::LD_v4i8_areg: return NVPTX::LDV_i8_v4_areg;
+ case NVPTX::LD_v4i8_ari: return NVPTX::LDV_i8_v4_ari;
+ case NVPTX::LD_v4i8_asi: return NVPTX::LDV_i8_v4_asi;
+
+ case NVPTX::LD_v2i16_avar: return NVPTX::LDV_i16_v2_avar;
+ case NVPTX::LD_v2i16_areg: return NVPTX::LDV_i16_v2_areg;
+ case NVPTX::LD_v2i16_ari: return NVPTX::LDV_i16_v2_ari;
+ case NVPTX::LD_v2i16_asi: return NVPTX::LDV_i16_v2_asi;
+ case NVPTX::LD_v4i16_avar: return NVPTX::LDV_i16_v4_avar;
+ case NVPTX::LD_v4i16_areg: return NVPTX::LDV_i16_v4_areg;
+ case NVPTX::LD_v4i16_ari: return NVPTX::LDV_i16_v4_ari;
+ case NVPTX::LD_v4i16_asi: return NVPTX::LDV_i16_v4_asi;
+
+ case NVPTX::LD_v2i32_avar: return NVPTX::LDV_i32_v2_avar;
+ case NVPTX::LD_v2i32_areg: return NVPTX::LDV_i32_v2_areg;
+ case NVPTX::LD_v2i32_ari: return NVPTX::LDV_i32_v2_ari;
+ case NVPTX::LD_v2i32_asi: return NVPTX::LDV_i32_v2_asi;
+ case NVPTX::LD_v4i32_avar: return NVPTX::LDV_i32_v4_avar;
+ case NVPTX::LD_v4i32_areg: return NVPTX::LDV_i32_v4_areg;
+ case NVPTX::LD_v4i32_ari: return NVPTX::LDV_i32_v4_ari;
+ case NVPTX::LD_v4i32_asi: return NVPTX::LDV_i32_v4_asi;
+
+ case NVPTX::LD_v2f32_avar: return NVPTX::LDV_f32_v2_avar;
+ case NVPTX::LD_v2f32_areg: return NVPTX::LDV_f32_v2_areg;
+ case NVPTX::LD_v2f32_ari: return NVPTX::LDV_f32_v2_ari;
+ case NVPTX::LD_v2f32_asi: return NVPTX::LDV_f32_v2_asi;
+ case NVPTX::LD_v4f32_avar: return NVPTX::LDV_f32_v4_avar;
+ case NVPTX::LD_v4f32_areg: return NVPTX::LDV_f32_v4_areg;
+ case NVPTX::LD_v4f32_ari: return NVPTX::LDV_f32_v4_ari;
+ case NVPTX::LD_v4f32_asi: return NVPTX::LDV_f32_v4_asi;
+
+ case NVPTX::LD_v2i64_avar: return NVPTX::LDV_i64_v2_avar;
+ case NVPTX::LD_v2i64_areg: return NVPTX::LDV_i64_v2_areg;
+ case NVPTX::LD_v2i64_ari: return NVPTX::LDV_i64_v2_ari;
+ case NVPTX::LD_v2i64_asi: return NVPTX::LDV_i64_v2_asi;
+ case NVPTX::LD_v2f64_avar: return NVPTX::LDV_f64_v2_avar;
+ case NVPTX::LD_v2f64_areg: return NVPTX::LDV_f64_v2_areg;
+ case NVPTX::LD_v2f64_ari: return NVPTX::LDV_f64_v2_ari;
+ case NVPTX::LD_v2f64_asi: return NVPTX::LDV_f64_v2_asi;
+
+ case NVPTX::ST_v2i8_avar: return NVPTX::STV_i8_v2_avar;
+ case NVPTX::ST_v2i8_areg: return NVPTX::STV_i8_v2_areg;
+ case NVPTX::ST_v2i8_ari: return NVPTX::STV_i8_v2_ari;
+ case NVPTX::ST_v2i8_asi: return NVPTX::STV_i8_v2_asi;
+ case NVPTX::ST_v4i8_avar: return NVPTX::STV_i8_v4_avar;
+ case NVPTX::ST_v4i8_areg: return NVPTX::STV_i8_v4_areg;
+ case NVPTX::ST_v4i8_ari: return NVPTX::STV_i8_v4_ari;
+ case NVPTX::ST_v4i8_asi: return NVPTX::STV_i8_v4_asi;
+
+ case NVPTX::ST_v2i16_avar: return NVPTX::STV_i16_v2_avar;
+ case NVPTX::ST_v2i16_areg: return NVPTX::STV_i16_v2_areg;
+ case NVPTX::ST_v2i16_ari: return NVPTX::STV_i16_v2_ari;
+ case NVPTX::ST_v2i16_asi: return NVPTX::STV_i16_v2_asi;
+ case NVPTX::ST_v4i16_avar: return NVPTX::STV_i16_v4_avar;
+ case NVPTX::ST_v4i16_areg: return NVPTX::STV_i16_v4_areg;
+ case NVPTX::ST_v4i16_ari: return NVPTX::STV_i16_v4_ari;
+ case NVPTX::ST_v4i16_asi: return NVPTX::STV_i16_v4_asi;
+
+ case NVPTX::ST_v2i32_avar: return NVPTX::STV_i32_v2_avar;
+ case NVPTX::ST_v2i32_areg: return NVPTX::STV_i32_v2_areg;
+ case NVPTX::ST_v2i32_ari: return NVPTX::STV_i32_v2_ari;
+ case NVPTX::ST_v2i32_asi: return NVPTX::STV_i32_v2_asi;
+ case NVPTX::ST_v4i32_avar: return NVPTX::STV_i32_v4_avar;
+ case NVPTX::ST_v4i32_areg: return NVPTX::STV_i32_v4_areg;
+ case NVPTX::ST_v4i32_ari: return NVPTX::STV_i32_v4_ari;
+ case NVPTX::ST_v4i32_asi: return NVPTX::STV_i32_v4_asi;
+
+ case NVPTX::ST_v2f32_avar: return NVPTX::STV_f32_v2_avar;
+ case NVPTX::ST_v2f32_areg: return NVPTX::STV_f32_v2_areg;
+ case NVPTX::ST_v2f32_ari: return NVPTX::STV_f32_v2_ari;
+ case NVPTX::ST_v2f32_asi: return NVPTX::STV_f32_v2_asi;
+ case NVPTX::ST_v4f32_avar: return NVPTX::STV_f32_v4_avar;
+ case NVPTX::ST_v4f32_areg: return NVPTX::STV_f32_v4_areg;
+ case NVPTX::ST_v4f32_ari: return NVPTX::STV_f32_v4_ari;
+ case NVPTX::ST_v4f32_asi: return NVPTX::STV_f32_v4_asi;
+
+ case NVPTX::ST_v2i64_avar: return NVPTX::STV_i64_v2_avar;
+ case NVPTX::ST_v2i64_areg: return NVPTX::STV_i64_v2_areg;
+ case NVPTX::ST_v2i64_ari: return NVPTX::STV_i64_v2_ari;
+ case NVPTX::ST_v2i64_asi: return NVPTX::STV_i64_v2_asi;
+ case NVPTX::ST_v2f64_avar: return NVPTX::STV_f64_v2_avar;
+ case NVPTX::ST_v2f64_areg: return NVPTX::STV_f64_v2_areg;
+ case NVPTX::ST_v2f64_ari: return NVPTX::STV_f64_v2_ari;
+ case NVPTX::ST_v2f64_asi: return NVPTX::STV_f64_v2_asi;
+ }
+ return 0;
+}
diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h
new file mode 100644
index 0000000..a7347ef
--- /dev/null
+++ b/lib/Target/NVPTX/cl_common_defines.h
@@ -0,0 +1,125 @@
+#ifndef __CL_COMMON_DEFINES_H__
+#define __CL_COMMON_DEFINES_H__
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+
+//
+// Common defines for Image intrinsics
+// Channel order
+enum {
+ CLK_R = 0x10B0,
+ CLK_A = 0x10B1,
+ CLK_RG = 0x10B2,
+ CLK_RA = 0x10B3,
+ CLK_RGB = 0x10B4,
+ CLK_RGBA = 0x10B5,
+ CLK_BGRA = 0x10B6,
+ CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
+ CLK_xRGB = 0x10B7,
+#endif
+
+ CLK_INTENSITY = 0x10B8,
+ CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ ,
+ CLK_Rx = 0x10BA,
+ CLK_RGx = 0x10BB,
+ CLK_RGBx = 0x10BC
+#endif
+};
+
+
+typedef enum clk_channel_type {
+ // valid formats for float return types
+ CLK_SNORM_INT8 = 0x10D0, // four channel RGBA snorm8
+ CLK_SNORM_INT16 = 0x10D1, // four channel RGBA snorm16
+ CLK_UNORM_INT8 = 0x10D2, // four channel RGBA unorm8
+ CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
+ CLK_HALF_FLOAT = 0x10DD, // four channel RGBA half
+ CLK_FLOAT = 0x10DE, // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ CLK_UNORM_SHORT_565 = 0x10D4,
+ CLK_UNORM_SHORT_555 = 0x10D5,
+ CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+ // valid only for integer return types
+ CLK_SIGNED_INT8 = 0x10D7,
+ CLK_SIGNED_INT16 = 0x10D8,
+ CLK_SIGNED_INT32 = 0x10D9,
+ CLK_UNSIGNED_INT8 = 0x10DA,
+ CLK_UNSIGNED_INT16 = 0x10DB,
+ CLK_UNSIGNED_INT32 = 0x10DC,
+
+ // CI SPI for CPU
+ __CLK_UNORM_INT8888, // four channel ARGB unorm8
+ __CLK_UNORM_INT8888R, // four channel BGRA unorm8
+
+ __CLK_VALID_IMAGE_TYPE_COUNT,
+ __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+ __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
+ // represent any image type
+ __CLK_VALID_IMAGE_TYPE_MASK = ( 1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS ) - 1
+} clk_channel_type;
+
+typedef enum clk_sampler_type {
+ __CLK_ADDRESS_BASE = 0,
+ CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
+#endif
+ __CLK_ADDRESS_MASK = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
+ CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+ __CLK_ADDRESS_BITS = 3, // number of bits required to
+ // represent address info
+
+ __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
+ CLK_NORMALIZED_COORDS_FALSE = 0,
+ CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
+ __CLK_NORMALIZED_MASK = CLK_NORMALIZED_COORDS_FALSE |
+ CLK_NORMALIZED_COORDS_TRUE,
+ __CLK_NORMALIZED_BITS = 1, // number of bits required to
+ // represent normalization
+
+ __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE +
+ __CLK_NORMALIZED_BITS,
+ CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
+ CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
+ CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
+ __CLK_FILTER_MASK = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
+ CLK_FILTER_ANISOTROPIC,
+ __CLK_FILTER_BITS = 2, // number of bits required to
+ // represent filter info
+
+ __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
+ CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
+ CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
+ CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
+ __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR |
+ CLK_MIP_ANISOTROPIC,
+ __CLK_MIP_BITS = 2,
+
+ __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
+ __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
+ __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+
+ __CLK_ANISOTROPIC_RATIO_BITS = 5,
+ __CLK_ANISOTROPIC_RATIO_MASK = (int) 0x80000000 >>
+ (__CLK_ANISOTROPIC_RATIO_BITS-1)
+} clk_sampler_type;
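+
+// Worked example of the packing above: a clamped, normalized, linear sampler
+// is CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR
+// = 1 | (1 << 3) | (1 << 4) = 0x19.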
+
+// Memory synchronization
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+#endif // __CL_COMMON_DEFINES_H__
diff --git a/lib/Target/NVPTX/gen-register-defs.py b/lib/Target/NVPTX/gen-register-defs.py
new file mode 100644
index 0000000..ed06668
--- /dev/null
+++ b/lib/Target/NVPTX/gen-register-defs.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+
+num_regs = 396
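+# Registers emitted per class; PTX registers are virtual, but TableGen needs
+# a fixed, finite pool to enumerate.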
+
+outFile = open('NVPTXRegisterInfo.td', 'w')
+
+outFile.write('''
+//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+class NVPTXReg<string n> : Register<n> {
+ let Namespace = "NVPTX";
+}
+
+class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
+ : RegisterClass <"NVPTX", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+// Special Registers used as stack pointer
+def VRFrame : NVPTXReg<"%SP">;
+def VRFrameLocal : NVPTXReg<"%SPL">;
+
+// Special Registers used as the stack
+def VRDepot : NVPTXReg<"%Depot">;
+''')
+
+# Predicates
+outFile.write('''
+//===--- Predicate --------------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def P%d : NVPTXReg<"%%p%d">;\n' % (i, i))
+
+# Int8
+outFile.write('''
+//===--- 8-bit ------------------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def RC%d : NVPTXReg<"%%rc%d">;\n' % (i, i))
+
+# Int16
+outFile.write('''
+//===--- 16-bit -----------------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def RS%d : NVPTXReg<"%%rs%d">;\n' % (i, i))
+
+# Int32
+outFile.write('''
+//===--- 32-bit -----------------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def R%d : NVPTXReg<"%%r%d">;\n' % (i, i))
+
+# Int64
+outFile.write('''
+//===--- 64-bit -----------------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def RL%d : NVPTXReg<"%%rl%d">;\n' % (i, i))
+
+# F32
+outFile.write('''
+//===--- 32-bit float -----------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def F%d : NVPTXReg<"%%f%d">;\n' % (i, i))
+
+# F64
+outFile.write('''
+//===--- 64-bit float -----------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def FL%d : NVPTXReg<"%%fl%d">;\n' % (i, i))
+
+# Vector registers
+outFile.write('''
+//===--- Vector -----------------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def v2b8_%d : NVPTXReg<"%%v2b8_%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def v2b16_%d : NVPTXReg<"%%v2b16_%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def v2b32_%d : NVPTXReg<"%%v2b32_%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def v2b64_%d : NVPTXReg<"%%v2b64_%d">;\n' % (i, i))
+
+for i in range(0, num_regs):
+ outFile.write('def v4b8_%d : NVPTXReg<"%%v4b8_%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def v4b16_%d : NVPTXReg<"%%v4b16_%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def v4b32_%d : NVPTXReg<"%%v4b32_%d">;\n' % (i, i))
+
+# Argument registers
+outFile.write('''
+//===--- Arguments --------------------------------------------------------===//
+''')
+for i in range(0, num_regs):
+ outFile.write('def ia%d : NVPTXReg<"%%ia%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def la%d : NVPTXReg<"%%la%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def fa%d : NVPTXReg<"%%fa%d">;\n' % (i, i))
+for i in range(0, num_regs):
+ outFile.write('def da%d : NVPTXReg<"%%da%d">;\n' % (i, i))
+
+outFile.write('''
+//===----------------------------------------------------------------------===//
+// Register classes
+//===----------------------------------------------------------------------===//
+''')
+
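+# Each class enumerates registers 0..num_regs-1 via TableGen's 'sequence';
+# e.g. with num_regs = 396 the next line expands to:
+#   def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 395))>;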
+outFile.write('def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%%u", 0, %d))>;\n' % (num_regs-1))
+
+outFile.write('def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%%u", 0, %d))>;\n' % (num_regs-1))
+
+outFile.write('def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%%u", 0, %d))>;\n' % (num_regs-1))
+outFile.write('def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%%u", 0, %d))>;\n' % (num_regs-1))
+
+outFile.write('''
+// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
+''')
+
+outFile.write('''
+class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList,
+ NVPTXRegClass sClass,
+ int e,
+ string n>
+ : NVPTXRegClass<regTypes, alignment, regList>
+{
+ NVPTXRegClass scalarClass=sClass;
+ int elems=e;
+ string name=n;
+}
+''')
+
+
+outFile.write('def V2F32Regs\n : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Float32Regs, 2, ".v2.f32">;\n' % (num_regs-1))
+outFile.write('def V4F32Regs\n : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Float32Regs, 4, ".v4.f32">;\n' % (num_regs-1))
+
+outFile.write('def V2I32Regs\n : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Int32Regs, 2, ".v2.u32">;\n' % (num_regs-1))
+outFile.write('def V4I32Regs\n : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Int32Regs, 4, ".v4.u32">;\n' % (num_regs-1))
+
+outFile.write('def V2F64Regs\n : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Float64Regs, 2, ".v2.f64">;\n' % (num_regs-1))
+outFile.write('def V2I64Regs\n : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Int64Regs, 2, ".v2.u64">;\n' % (num_regs-1))
+
+outFile.write('def V2I16Regs\n : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%%u", 0, %d)),\n Int16Regs, 2, ".v2.u16">;\n' % (num_regs-1))
+outFile.write('def V4I16Regs\n : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%%u", 0, %d)),\n Int16Regs, 4, ".v4.u16">;\n' % (num_regs-1))
+
+outFile.write('def V2I8Regs\n : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%%u", 0, %d)),\n Int8Regs, 2, ".v2.u8">;\n' % (num_regs-1))
+outFile.write('def V4I8Regs\n : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%%u", 0, %d)),\n Int8Regs, 4, ".v4.u8">;\n' % (num_regs-1))
+
+outFile.close()
+
+
+outFile = open('NVPTXNumRegisters.h', 'w')
+outFile.write('''
+//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_NUM_REGISTERS_H
+#define NVPTX_NUM_REGISTERS_H
+
+namespace llvm {
+
+const unsigned NVPTXNumRegisters = %d;
+
+}
+
+#endif
+''' % num_regs)
+
+outFile.close()
diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt
deleted file mode 100644
index a3be342..0000000
--- a/lib/Target/PTX/CMakeLists.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-set(LLVM_TARGET_DEFINITIONS PTX.td)
-
-tablegen(LLVM PTXGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM PTXGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM PTXGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM PTXGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM PTXGenSubtargetInfo.inc -gen-subtarget)
-add_public_tablegen_target(PTXCommonTableGen)
-
-add_llvm_target(PTXCodeGen
- PTXAsmPrinter.cpp
- PTXISelDAGToDAG.cpp
- PTXISelLowering.cpp
- PTXInstrInfo.cpp
- PTXFPRoundingModePass.cpp
- PTXFrameLowering.cpp
- PTXMCAsmStreamer.cpp
- PTXMCInstLower.cpp
- PTXMFInfoExtract.cpp
- PTXMachineFunctionInfo.cpp
- PTXParamManager.cpp
- PTXRegAlloc.cpp
- PTXRegisterInfo.cpp
- PTXSelectionDAGInfo.cpp
- PTXSubtarget.cpp
- PTXTargetMachine.cpp
- )
-
-add_subdirectory(TargetInfo)
-add_subdirectory(InstPrinter)
-add_subdirectory(MCTargetDesc)
-
diff --git a/lib/Target/PTX/InstPrinter/CMakeLists.txt b/lib/Target/PTX/InstPrinter/CMakeLists.txt
deleted file mode 100644
index b252893..0000000
--- a/lib/Target/PTX/InstPrinter/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMPTXAsmPrinter
- PTXInstPrinter.cpp
- )
-
-add_dependencies(LLVMPTXAsmPrinter PTXCommonTableGen)
-
diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
deleted file mode 100644
index 1830213..0000000
--- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-//===-- PTXInstPrinter.cpp - Convert PTX MCInst to assembly syntax --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints a PTX MCInst to a .ptx file.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "asm-printer"
-#include "PTXInstPrinter.h"
-#include "MCTargetDesc/PTXBaseInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#include "PTXGenAsmWriter.inc"
-
-PTXInstPrinter::PTXInstPrinter(const MCAsmInfo &MAI,
- const MCInstrInfo &MII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI) :
- MCInstPrinter(MAI, MII, MRI) {
- // Initialize the set of available features.
- setAvailableFeatures(STI.getFeatureBits());
-}
-
-void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- // Decode the register number into type and offset
- unsigned RegSpace = RegNo & 0x7;
- unsigned RegType = (RegNo >> 3) & 0x7;
- unsigned RegOffset = RegNo >> 6;
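- // e.g. RegNo = (5 << 6) | (PTXRegisterType::B32 << 3) | PTXRegisterSpace::Reg
- // = 336 decodes to "%r5".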
-
- // Print the register
- OS << "%";
-
- switch (RegSpace) {
- default:
- llvm_unreachable("Unknown register space!");
- case PTXRegisterSpace::Reg:
- switch (RegType) {
- default:
- llvm_unreachable("Unknown register type!");
- case PTXRegisterType::Pred:
- OS << "p";
- break;
- case PTXRegisterType::B16:
- OS << "rh";
- break;
- case PTXRegisterType::B32:
- OS << "r";
- break;
- case PTXRegisterType::B64:
- OS << "rd";
- break;
- case PTXRegisterType::F32:
- OS << "f";
- break;
- case PTXRegisterType::F64:
- OS << "fd";
- break;
- }
- break;
- case PTXRegisterSpace::Return:
- OS << "ret";
- break;
- case PTXRegisterSpace::Argument:
- OS << "arg";
- break;
- }
-
- OS << RegOffset;
-}
-
-void PTXInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
- StringRef Annot) {
- printPredicate(MI, O);
- switch (MI->getOpcode()) {
- default:
- printInstruction(MI, O);
- break;
- case PTX::CALL:
- printCall(MI, O);
- }
- O << ";";
- printAnnotation(O, Annot);
-}
-
-void PTXInstPrinter::printPredicate(const MCInst *MI, raw_ostream &O) {
- // The last two operands are the predicate operands
- int RegIndex;
- int OpIndex;
-
- if (MI->getOpcode() == PTX::CALL) {
- RegIndex = 0;
- OpIndex = 1;
- } else {
- RegIndex = MI->getNumOperands()-2;
- OpIndex = MI->getNumOperands()-1;
- }
-
- int PredOp = MI->getOperand(OpIndex).getImm();
- if (PredOp == PTXPredicate::None)
- return;
-
- if (PredOp == PTXPredicate::Negate)
- O << '!';
- else
- O << '@';
-
- printOperand(MI, RegIndex, O);
-}
-
-void PTXInstPrinter::printCall(const MCInst *MI, raw_ostream &O) {
- O << "\tcall.uni\t";
- // The first two operands are the predicate slot
- unsigned Index = 2;
- unsigned NumRets = MI->getOperand(Index++).getImm();
-
- if (NumRets > 0) {
- O << "(";
- printOperand(MI, Index++, O);
- for (unsigned i = 1; i < NumRets; ++i) {
- O << ", ";
- printOperand(MI, Index++, O);
- }
- O << "), ";
- }
-
- const MCExpr* Expr = MI->getOperand(Index++).getExpr();
- unsigned NumArgs = MI->getOperand(Index++).getImm();
-
- // if the function call is to printf or puts, change to vprintf
- if (const MCSymbolRefExpr *SymRefExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
- const MCSymbol &Sym = SymRefExpr->getSymbol();
- if (Sym.getName() == "printf" || Sym.getName() == "puts") {
- O << "vprintf";
- } else {
- O << Sym.getName();
- }
- } else {
- O << *Expr;
- }
-
- O << ", (";
-
- if (NumArgs > 0) {
- printOperand(MI, Index++, O);
- for (unsigned i = 1; i < NumArgs; ++i) {
- O << ", ";
- printOperand(MI, Index++, O);
- }
- }
- O << ")";
-}
-
-void PTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm()) {
- O << Op.getImm();
- } else if (Op.isFPImm()) {
- double Imm = Op.getFPImm();
- APFloat FPImm(Imm);
- APInt FPIntImm = FPImm.bitcastToAPInt();
- O << "0D";
- // PTX requires us to output the full 64 bits, even if the number is zero
- if (FPIntImm.getZExtValue() > 0) {
- O << FPIntImm.toString(16, false);
- } else {
- O << "0000000000000000";
- }
- } else if (Op.isReg()) {
- printRegName(O, Op.getReg());
- } else {
- assert(Op.isExpr() && "unknown operand kind in printOperand");
- const MCExpr *Expr = Op.getExpr();
- if (const MCSymbolRefExpr *SymRefExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
- const MCSymbol &Sym = SymRefExpr->getSymbol();
- O << Sym.getName();
- } else {
- O << *Op.getExpr();
- }
- }
-}
-
-void PTXInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- // By definition, operand OpNo+1 is an i32imm
- const MCOperand &Op2 = MI->getOperand(OpNo+1);
- printOperand(MI, OpNo, O);
- if (Op2.getImm() == 0)
- return; // don't print "+0"
- O << "+" << Op2.getImm();
-}
-
-void PTXInstPrinter::printRoundingMode(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- assert (Op.isImm() && "Rounding modes must be immediate values");
- switch (Op.getImm()) {
- default:
- llvm_unreachable("Unknown rounding mode!");
- case PTXRoundingMode::RndDefault:
- llvm_unreachable("FP rounding-mode pass did not handle instruction!");
- case PTXRoundingMode::RndNone:
- // Do not print anything.
- break;
- case PTXRoundingMode::RndNearestEven:
- O << ".rn";
- break;
- case PTXRoundingMode::RndTowardsZero:
- O << ".rz";
- break;
- case PTXRoundingMode::RndNegInf:
- O << ".rm";
- break;
- case PTXRoundingMode::RndPosInf:
- O << ".rp";
- break;
- case PTXRoundingMode::RndApprox:
- O << ".approx";
- break;
- case PTXRoundingMode::RndNearestEvenInt:
- O << ".rni";
- break;
- case PTXRoundingMode::RndTowardsZeroInt:
- O << ".rzi";
- break;
- case PTXRoundingMode::RndNegInfInt:
- O << ".rmi";
- break;
- case PTXRoundingMode::RndPosInfInt:
- O << ".rpi";
- break;
- }
-}
-
diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
deleted file mode 100644
index ea4d504..0000000
--- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===- PTXInstPrinter.h - Convert PTX MCInst to assembly syntax -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints a PTX MCInst to a .ptx file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTXINSTPRINTER_H
-#define PTXINSTPRINTER_H
-
-#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-
-namespace llvm {
-
-class MCOperand;
-
-class PTXInstPrinter : public MCInstPrinter {
-public:
- PTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
- const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
-
- virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
- virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-
- // Autogenerated by tblgen.
- void printInstruction(const MCInst *MI, raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo);
-
- void printPredicate(const MCInst *MI, raw_ostream &O);
- void printCall(const MCInst *MI, raw_ostream &O);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRoundingMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-};
-}
-
-#endif
-
diff --git a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
deleted file mode 100644
index d1fd74c..0000000
--- a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-add_llvm_library(LLVMPTXDesc
- PTXMCTargetDesc.cpp
- PTXMCAsmInfo.cpp
- )
-
-add_dependencies(LLVMPTXDesc PTXCommonTableGen)
diff --git a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
deleted file mode 100644
index a3e0f32..0000000
--- a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
+++ /dev/null
@@ -1,134 +0,0 @@
-//===-- PTXBaseInfo.h - Top level definitions for PTX -----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains small standalone helper functions and enum definitions for
-// the PTX target useful for the compiler back-end and the MC libraries.
-// As such, it deliberately does not include references to LLVM core
-// code gen types, passes, etc.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTXBASEINFO_H
-#define PTXBASEINFO_H
-
-#include "PTXMCTargetDesc.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
- namespace PTXStateSpace {
- enum {
- Global = 0, // default to global state space
- Constant = 1,
- Local = 2,
- Parameter = 3,
- Shared = 4
- };
- } // namespace PTXStateSpace
-
- namespace PTXPredicate {
- enum {
- Normal = 0,
- Negate = 1,
- None = 2
- };
- } // namespace PTXPredicate
-
- /// Namespace to hold all target-specific flags.
- namespace PTXRoundingMode {
- // Instruction Flags
- enum {
- // Rounding Mode Flags
- RndMask = 15,
- RndDefault = 0, // ---
- RndNone = 1, // <NONE>
- RndNearestEven = 2, // .rn
- RndTowardsZero = 3, // .rz
- RndNegInf = 4, // .rm
- RndPosInf = 5, // .rp
- RndApprox = 6, // .approx
- RndNearestEvenInt = 7, // .rni
- RndTowardsZeroInt = 8, // .rzi
- RndNegInfInt = 9, // .rmi
- RndPosInfInt = 10 // .rpi
- };
- } // namespace PTXRoundingMode
-
- namespace PTXRegisterType {
- // Register type encoded in MCOperands
- enum {
- Pred = 0,
- B16,
- B32,
- B64,
- F32,
- F64
- };
- } // namespace PTXRegisterType
-
- namespace PTXRegisterSpace {
- // Register space encoded in MCOperands
- enum {
- Reg = 0,
- Local,
- Param,
- Argument,
- Return
- };
- }
-
- inline static void decodeRegisterName(raw_ostream &OS,
- unsigned EncodedReg) {
- OS << "%";
-
- unsigned RegSpace = EncodedReg & 0x7;
- unsigned RegType = (EncodedReg >> 3) & 0x7;
- unsigned RegOffset = EncodedReg >> 6;
-
- switch (RegSpace) {
- default:
- llvm_unreachable("Unknown register space!");
- case PTXRegisterSpace::Reg:
- switch (RegType) {
- default:
- llvm_unreachable("Unknown register type!");
- case PTXRegisterType::Pred:
- OS << "p";
- break;
- case PTXRegisterType::B16:
- OS << "rh";
- break;
- case PTXRegisterType::B32:
- OS << "r";
- break;
- case PTXRegisterType::B64:
- OS << "rd";
- break;
- case PTXRegisterType::F32:
- OS << "f";
- break;
- case PTXRegisterType::F64:
- OS << "fd";
- break;
- }
- break;
- case PTXRegisterSpace::Return:
- OS << "ret";
- break;
- case PTXRegisterSpace::Argument:
- OS << "arg";
- break;
- }
-
- OS << RegOffset;
- }
-} // namespace llvm
-
-#endif
-
diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp
deleted file mode 100644
index cdfbc80..0000000
--- a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- PTXMCAsmInfo.cpp - PTX asm properties -----------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declarations of the PTXMCAsmInfo properties.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXMCAsmInfo.h"
-#include "llvm/ADT/Triple.h"
-
-using namespace llvm;
-
-void PTXMCAsmInfo::anchor() { }
-
-PTXMCAsmInfo::PTXMCAsmInfo(const Target &T, const StringRef &TT) {
- Triple TheTriple(TT);
- if (TheTriple.getArch() == Triple::ptx64)
- PointerSize = 8;
-
- CommentString = "//";
-
- PrivateGlobalPrefix = "$L__";
-
- AllowPeriodsInName = false;
-
- HasSetDirective = false;
-
- HasDotTypeDotSizeDirective = false;
-
- HasSingleParameterDotFile = false;
-}
diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
deleted file mode 100644
index 08fb970..0000000
--- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-//===-- PTXMCTargetDesc.cpp - PTX Target Descriptions ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides PTX specific target descriptions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXMCTargetDesc.h"
-#include "PTXMCAsmInfo.h"
-#include "InstPrinter/PTXInstPrinter.h"
-#include "llvm/MC/MCCodeGenInfo.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_INSTRINFO_MC_DESC
-#include "PTXGenInstrInfo.inc"
-
-#define GET_SUBTARGETINFO_MC_DESC
-#include "PTXGenSubtargetInfo.inc"
-
-#define GET_REGINFO_MC_DESC
-#include "PTXGenRegisterInfo.inc"
-
-using namespace llvm;
-
-static MCInstrInfo *createPTXMCInstrInfo() {
- MCInstrInfo *X = new MCInstrInfo();
- InitPTXMCInstrInfo(X);
- return X;
-}
-
-static MCRegisterInfo *createPTXMCRegisterInfo(StringRef TT) {
- MCRegisterInfo *X = new MCRegisterInfo();
- // PTX does not have a return address register.
- InitPTXMCRegisterInfo(X, 0);
- return X;
-}
-
-static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
- StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitPTXMCSubtargetInfo(X, TT, CPU, FS);
- return X;
-}
-
-static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- X->InitMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
-static MCInstPrinter *createPTXMCInstPrinter(const Target &T,
- unsigned SyntaxVariant,
- const MCAsmInfo &MAI,
- const MCInstrInfo &MII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI) {
- assert(SyntaxVariant == 0 && "We only have one syntax variant");
- return new PTXInstPrinter(MAI, MII, MRI, STI);
-}
-
-extern "C" void LLVMInitializePTXTargetMC() {
- // Register the MC asm info.
- RegisterMCAsmInfo<PTXMCAsmInfo> X(ThePTX32Target);
- RegisterMCAsmInfo<PTXMCAsmInfo> Y(ThePTX64Target);
-
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(ThePTX32Target, createPTXMCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(ThePTX64Target, createPTXMCCodeGenInfo);
-
- // Register the MC instruction info.
- TargetRegistry::RegisterMCInstrInfo(ThePTX32Target, createPTXMCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(ThePTX64Target, createPTXMCInstrInfo);
-
- // Register the MC register info.
- TargetRegistry::RegisterMCRegInfo(ThePTX32Target, createPTXMCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(ThePTX64Target, createPTXMCRegisterInfo);
-
- // Register the MC subtarget info.
- TargetRegistry::RegisterMCSubtargetInfo(ThePTX32Target,
- createPTXMCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(ThePTX64Target,
- createPTXMCSubtargetInfo);
-
- // Register the MCInstPrinter.
- TargetRegistry::RegisterMCInstPrinter(ThePTX32Target, createPTXMCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(ThePTX64Target, createPTXMCInstPrinter);
-}
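Note: the registration above is what makes the PTX MC components reachable through TargetRegistry; this commit deletes the backend, so after it the lookup simply fails. A consumer-side sketch in the style of this LLVM era (triple spelling and error handling are illustrative, and the signatures are recalled from the 3.1-era API rather than checked against this exact revision):

  #include "llvm/MC/MCAsmInfo.h"
  #include "llvm/MC/MCInstrInfo.h"
  #include "llvm/MC/MCRegisterInfo.h"
  #include "llvm/Support/TargetRegistry.h"
  #include "llvm/Support/TargetSelect.h"
  #include <string>

  using namespace llvm;

  int main() {
    InitializeAllTargetInfos();
    InitializeAllTargetMCs();

    std::string Error;
    const std::string TripleName = "ptx64-unknown-unknown";
    const Target *T = TargetRegistry::lookupTarget(TripleName, Error);
    if (!T)
      return 1; // expected once this commit lands: PTX is gone

    // These call straight back into the create* hooks registered above.
    const MCAsmInfo *MAI = T->createMCAsmInfo(TripleName);
    const MCInstrInfo *MII = T->createMCInstrInfo();
    const MCRegisterInfo *MRI = T->createMCRegInfo(TripleName);
    return (MAI && MII && MRI) ? 0 : 1;
  }
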
diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h
deleted file mode 100644
index ffb92cb..0000000
--- a/lib/Target/PTX/PTX.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the entry points for global functions defined in the LLVM
-// PTX back-end.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_H
-#define PTX_H
-
-#include "MCTargetDesc/PTXBaseInfo.h"
-#include "llvm/Target/TargetMachine.h"
-
-namespace llvm {
- class MachineInstr;
- class MCInst;
- class PTXAsmPrinter;
- class PTXTargetMachine;
- class FunctionPass;
-
- FunctionPass *createPTXISelDag(PTXTargetMachine &TM,
- CodeGenOpt::Level OptLevel);
-
- FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM,
- CodeGenOpt::Level OptLevel);
-
- FunctionPass *createPTXFPRoundingModePass(PTXTargetMachine &TM,
- CodeGenOpt::Level OptLevel);
-
- FunctionPass *createPTXRegisterAllocator();
-
- void LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- PTXAsmPrinter &AP);
-
-} // namespace llvm
-
-#endif // PTX_H
diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td
deleted file mode 100644
index 994a68e..0000000
--- a/lib/Target/PTX/PTX.td
+++ /dev/null
@@ -1,141 +0,0 @@
-//===-- PTX.td - Describe the PTX Target Machine -----------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This is the top level entry point for the PTX target.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Target-independent interfaces
-//===----------------------------------------------------------------------===//
-
-include "llvm/Target/Target.td"
-
-//===----------------------------------------------------------------------===//
-// Subtarget Features
-//===----------------------------------------------------------------------===//
-
-//===- Architectural Features ---------------------------------------------===//
-
-def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true",
- "Do not demote .f64 to .f32">;
-
-def FeatureNoFMA : SubtargetFeature<"no-fma", "SupportsFMA", "false",
- "Disable Fused Multiply-Add">;
-
-//===- PTX Version --------------------------------------------------------===//
-
-def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0",
- "Use PTX Language Version 2.0">;
-
-def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1",
- "Use PTX Language Version 2.1">;
-
-def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2",
- "Use PTX Language Version 2.2">;
-
-def FeaturePTX23 : SubtargetFeature<"ptx23", "PTXVersion", "PTX_VERSION_2_3",
- "Use PTX Language Version 2.3">;
-
-//===- PTX Target ---------------------------------------------------------===//
-
-def FeatureSM10 : SubtargetFeature<"sm10", "PTXTarget", "PTX_SM_1_0",
- "Use Shader Model 1.0">;
-def FeatureSM11 : SubtargetFeature<"sm11", "PTXTarget", "PTX_SM_1_1",
- "Use Shader Model 1.1">;
-def FeatureSM12 : SubtargetFeature<"sm12", "PTXTarget", "PTX_SM_1_2",
- "Use Shader Model 1.2">;
-def FeatureSM13 : SubtargetFeature<"sm13", "PTXTarget", "PTX_SM_1_3",
- "Use Shader Model 1.3">;
-def FeatureSM20 : SubtargetFeature<"sm20", "PTXTarget", "PTX_SM_2_0",
- "Use Shader Model 2.0", [FeatureDouble]>;
-def FeatureSM21 : SubtargetFeature<"sm21", "PTXTarget", "PTX_SM_2_1",
- "Use Shader Model 2.1", [FeatureDouble]>;
-def FeatureSM22 : SubtargetFeature<"sm22", "PTXTarget", "PTX_SM_2_2",
- "Use Shader Model 2.2", [FeatureDouble]>;
-def FeatureSM23 : SubtargetFeature<"sm23", "PTXTarget", "PTX_SM_2_3",
- "Use Shader Model 2.3", [FeatureDouble]>;
-
-def FeatureCOMPUTE10 : SubtargetFeature<"compute10", "PTXTarget",
- "PTX_COMPUTE_1_0",
- "Use Compute Compatibility 1.0">;
-def FeatureCOMPUTE11 : SubtargetFeature<"compute11", "PTXTarget",
- "PTX_COMPUTE_1_1",
- "Use Compute Compatibility 1.1">;
-def FeatureCOMPUTE12 : SubtargetFeature<"compute12", "PTXTarget",
- "PTX_COMPUTE_1_2",
- "Use Compute Compatibility 1.2">;
-def FeatureCOMPUTE13 : SubtargetFeature<"compute13", "PTXTarget",
- "PTX_COMPUTE_1_3",
- "Use Compute Compatibility 1.3">;
-def FeatureCOMPUTE20 : SubtargetFeature<"compute20", "PTXTarget",
- "PTX_COMPUTE_2_0",
- "Use Compute Compatibility 2.0",
- [FeatureDouble]>;
-
-//===----------------------------------------------------------------------===//
-// PTX supported processors
-//===----------------------------------------------------------------------===//
-
-class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
-
-def : Proc<"generic", []>;
-
-// Processor definitions for compute/shader models
-def : Proc<"compute_10", [FeatureCOMPUTE10]>;
-def : Proc<"compute_11", [FeatureCOMPUTE11]>;
-def : Proc<"compute_12", [FeatureCOMPUTE12]>;
-def : Proc<"compute_13", [FeatureCOMPUTE13]>;
-def : Proc<"compute_20", [FeatureCOMPUTE20]>;
-def : Proc<"sm_10", [FeatureSM10]>;
-def : Proc<"sm_11", [FeatureSM11]>;
-def : Proc<"sm_12", [FeatureSM12]>;
-def : Proc<"sm_13", [FeatureSM13]>;
-def : Proc<"sm_20", [FeatureSM20]>;
-def : Proc<"sm_21", [FeatureSM21]>;
-def : Proc<"sm_22", [FeatureSM22]>;
-def : Proc<"sm_23", [FeatureSM23]>;
-
-// Processor definitions for common GPU architectures
-def : Proc<"g80", [FeatureSM10]>;
-def : Proc<"gt200", [FeatureSM13]>;
-def : Proc<"gf100", [FeatureSM20, FeatureDouble]>;
-def : Proc<"fermi", [FeatureSM20, FeatureDouble]>;
-
-//===----------------------------------------------------------------------===//
-// Register File Description
-//===----------------------------------------------------------------------===//
-
-include "PTXRegisterInfo.td"
-
-//===----------------------------------------------------------------------===//
-// Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-include "PTXInstrInfo.td"
-
-def PTXInstrInfo : InstrInfo;
-
-//===----------------------------------------------------------------------===//
-// Assembly printer
-//===----------------------------------------------------------------------===//
-// PTX uses the MC printer for asm output, so make sure the TableGen
-// AsmWriter bits get associated with the correct class.
-def PTXAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// Target Declaration
-//===----------------------------------------------------------------------===//
-
-def PTX : Target {
- let InstructionSet = PTXInstrInfo;
- let AssemblyWriters = [PTXAsmWriter];
-}
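Note: each SubtargetFeature above surfaces as an llc -mattr flag and each Proc as an -mcpu value (for instance -march=ptx64 -mcpu=sm_20 -mattr=+ptx23, with flag spellings following the usual TableGen output), and an implied-feature list such as FeatureSM20's [FeatureDouble] is enabled transitively. A toy C++ model of that transitive resolution (the data comes from the records above; the worklist shape is an assumption, not TableGen's actual implementation):

  #include <iostream>
  #include <map>
  #include <set>
  #include <string>
  #include <vector>

  int main() {
    // feature -> features it implies, per the .td records above.
    std::map<std::string, std::vector<std::string> > Implies;
    Implies["sm20"] = {"double"};
    Implies["sm21"] = {"double"};
    Implies["compute20"] = {"double"};

    std::set<std::string> Enabled;
    std::vector<std::string> Work;
    Work.push_back("sm20"); // e.g. the effect of -mcpu=sm_20
    while (!Work.empty()) {
      std::string F = Work.back();
      Work.pop_back();
      if (!Enabled.insert(F).second)
        continue; // already enabled
      for (size_t I = 0, E = Implies[F].size(); I != E; ++I)
        Work.push_back(Implies[F][I]);
    }
    // FeatureDouble controls whether .f64 is demoted to .f32 (see above).
    std::cout << (Enabled.count("double") ? "f64 kept" : "f64 demoted") << "\n";
    return 0;
  }
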
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp
deleted file mode 100644
index 0b6ac7b..0000000
--- a/lib/Target/PTX/PTXAsmPrinter.cpp
+++ /dev/null
@@ -1,561 +0,0 @@
-//===-- PTXAsmPrinter.cpp - PTX LLVM assembly writer ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a printer that converts from our internal representation
-// of machine-dependent LLVM code to PTX assembly language.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-asm-printer"
-
-#include "PTXAsmPrinter.h"
-#include "PTX.h"
-#include "PTXMachineFunctionInfo.h"
-#include "PTXParamManager.h"
-#include "PTXRegisterInfo.h"
-#include "PTXTargetMachine.h"
-#include "llvm/Argument.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Module.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/DebugInfo.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-static const char PARAM_PREFIX[] = "__param_";
-static const char RETURN_PREFIX[] = "__ret_";
-
-static const char *getRegisterTypeName(unsigned RegType) {
- switch (RegType) {
- default:
- llvm_unreachable("Unknown register type");
- case PTXRegisterType::Pred:
- return ".pred";
- case PTXRegisterType::B16:
- return ".b16";
- case PTXRegisterType::B32:
- return ".b32";
- case PTXRegisterType::B64:
- return ".b64";
- case PTXRegisterType::F32:
- return ".f32";
- case PTXRegisterType::F64:
- return ".f64";
- }
-}
-
-static const char *getStateSpaceName(unsigned addressSpace) {
- switch (addressSpace) {
- default: llvm_unreachable("Unknown state space");
- case PTXStateSpace::Global: return "global";
- case PTXStateSpace::Constant: return "const";
- case PTXStateSpace::Local: return "local";
- case PTXStateSpace::Parameter: return "param";
- case PTXStateSpace::Shared: return "shared";
- }
-}
-
-static const char *getTypeName(Type* type) {
- while (true) {
- switch (type->getTypeID()) {
- default: llvm_unreachable("Unknown type");
- case Type::FloatTyID: return ".f32";
- case Type::DoubleTyID: return ".f64";
- case Type::IntegerTyID:
- switch (type->getPrimitiveSizeInBits()) {
- default: llvm_unreachable("Unknown integer bit-width");
- case 16: return ".u16";
- case 32: return ".u32";
- case 64: return ".u64";
- }
- case Type::ArrayTyID:
- case Type::PointerTyID:
- type = dyn_cast<SequentialType>(type)->getElementType();
- break;
- }
- }
- return NULL;
-}
-
-bool PTXAsmPrinter::doFinalization(Module &M) {
- // XXX Temporarily remove global variables so that doFinalization() will not
- // emit them again (global variables are emitted at the beginning).
-
- Module::GlobalListType &global_list = M.getGlobalList();
- int i, n = global_list.size();
- GlobalVariable **gv_array = new GlobalVariable* [n];
-
- // first, back-up GlobalVariable in gv_array
- i = 0;
- for (Module::global_iterator I = global_list.begin(), E = global_list.end();
- I != E; ++I)
- gv_array[i++] = &*I;
-
- // second, empty global_list
- while (!global_list.empty())
- global_list.remove(global_list.begin());
-
- // call doFinalization
- bool ret = AsmPrinter::doFinalization(M);
-
- // now we restore global variables
- for (i = 0; i < n; i ++)
- global_list.insert(global_list.end(), gv_array[i]);
-
- delete[] gv_array;
- return ret;
-}
-
-void PTXAsmPrinter::EmitStartOfAsmFile(Module &M)
-{
- const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
-
- // Emit the PTX .version and .target attributes
- OutStreamer.EmitRawText(Twine("\t.version ") + ST.getPTXVersionString());
- OutStreamer.EmitRawText(Twine("\t.target ") + ST.getTargetString() +
- (ST.supportsDouble() ? ""
- : ", map_f64_to_f32"));
- // .address_size directive is optional, but it must immediately follow
- // the .target directive if present within a module
- if (ST.supportsPTX23()) {
- const char *addrSize = ST.is64Bit() ? "64" : "32";
- OutStreamer.EmitRawText(Twine("\t.address_size ") + addrSize);
- }
-
- OutStreamer.AddBlankLine();
-
- // Define any .file directives
- DebugInfoFinder DbgFinder;
- DbgFinder.processModule(M);
-
- for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
- E = DbgFinder.compile_unit_end(); I != E; ++I) {
- DICompileUnit DIUnit(*I);
- StringRef FN = DIUnit.getFilename();
- StringRef Dir = DIUnit.getDirectory();
- GetOrCreateSourceID(FN, Dir);
- }
-
- OutStreamer.AddBlankLine();
-
- // declare external functions
- for (Module::const_iterator i = M.begin(), e = M.end();
- i != e; ++i)
- EmitFunctionDeclaration(i);
-
- // declare global variables
- for (Module::const_global_iterator i = M.global_begin(), e = M.global_end();
- i != e; ++i)
- EmitVariableDeclaration(i);
-}
-
-void PTXAsmPrinter::EmitFunctionBodyStart() {
- OutStreamer.EmitRawText(Twine("{"));
-
- const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
- const PTXParamManager &PM = MFI->getParamManager();
-
- // Print register definitions
- SmallString<128> regDefs;
- raw_svector_ostream os(regDefs);
- unsigned numRegs;
-
- // pred
- numRegs = MFI->countRegisters(PTXRegisterType::Pred, PTXRegisterSpace::Reg);
- if (numRegs > 0)
- os << "\t.reg .pred %p<" << numRegs << ">;\n";
-
- // i16
- numRegs = MFI->countRegisters(PTXRegisterType::B16, PTXRegisterSpace::Reg);
- if (numRegs > 0)
- os << "\t.reg .b16 %rh<" << numRegs << ">;\n";
-
- // i32
- numRegs = MFI->countRegisters(PTXRegisterType::B32, PTXRegisterSpace::Reg);
- if (numRegs > 0)
- os << "\t.reg .b32 %r<" << numRegs << ">;\n";
-
- // i64
- numRegs = MFI->countRegisters(PTXRegisterType::B64, PTXRegisterSpace::Reg);
- if (numRegs > 0)
- os << "\t.reg .b64 %rd<" << numRegs << ">;\n";
-
- // f32
- numRegs = MFI->countRegisters(PTXRegisterType::F32, PTXRegisterSpace::Reg);
- if (numRegs > 0)
- os << "\t.reg .f32 %f<" << numRegs << ">;\n";
-
- // f64
- numRegs = MFI->countRegisters(PTXRegisterType::F64, PTXRegisterSpace::Reg);
- if (numRegs > 0)
- os << "\t.reg .f64 %fd<" << numRegs << ">;\n";
-
- // Local params
- for (PTXParamManager::param_iterator i = PM.local_begin(), e = PM.local_end();
- i != e; ++i)
- os << "\t.param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i)
- << ";\n";
-
- OutStreamer.EmitRawText(os.str());
-
-
- const MachineFrameInfo* FrameInfo = MF->getFrameInfo();
- DEBUG(dbgs() << "Have " << FrameInfo->getNumObjects()
- << " frame object(s)\n");
- for (unsigned i = 0, e = FrameInfo->getNumObjects(); i != e; ++i) {
- DEBUG(dbgs() << "Size of object: " << FrameInfo->getObjectSize(i) << "\n");
- if (FrameInfo->getObjectSize(i) > 0) {
- OutStreamer.EmitRawText("\t.local .align " +
- Twine(FrameInfo->getObjectAlignment(i)) +
- " .b8 __local" +
- Twine(i) +
- "[" +
- Twine(FrameInfo->getObjectSize(i)) +
- "];");
- }
- }
-
- //unsigned Index = 1;
- // Print parameter passing params
- //for (PTXMachineFunctionInfo::param_iterator
- // i = MFI->paramBegin(), e = MFI->paramEnd(); i != e; ++i) {
- // std::string def = "\t.param .b";
- // def += utostr(*i);
- // def += " __ret_";
- // def += utostr(Index);
- // Index++;
- // def += ";";
- // OutStreamer.EmitRawText(Twine(def));
- //}
-}
-
-void PTXAsmPrinter::EmitFunctionBodyEnd() {
- OutStreamer.EmitRawText(Twine("}"));
-}
-
-void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- MCInst TmpInst;
- LowerPTXMachineInstrToMCInst(MI, TmpInst, *this);
- OutStreamer.EmitInstruction(TmpInst);
-}
-
-void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
- // Check to see if this is a special global used by LLVM, if so, emit it.
- if (EmitSpecialLLVMGlobal(gv))
- return;
-
- MCSymbol *gvsym = Mang->getSymbol(gv);
-
- assert(gvsym->isUndefined() && "Cannot define a symbol twice!");
-
- SmallString<128> decl;
- raw_svector_ostream os(decl);
-
- // check if it is defined in some other translation unit
- if (gv->isDeclaration())
- os << ".extern ";
-
- // state space: e.g., .global
- os << '.' << getStateSpaceName(gv->getType()->getAddressSpace()) << ' ';
-
- // alignment (optional)
- unsigned alignment = gv->getAlignment();
- if (alignment != 0)
- os << ".align " << gv->getAlignment() << ' ';
-
-
- if (PointerType::classof(gv->getType())) {
- PointerType* pointerTy = dyn_cast<PointerType>(gv->getType());
- Type* elementTy = pointerTy->getElementType();
-
- if (elementTy->isArrayTy()) {
- assert(elementTy->isArrayTy() && "Only pointers to arrays are supported");
-
- ArrayType* arrayTy = dyn_cast<ArrayType>(elementTy);
- elementTy = arrayTy->getElementType();
-
- unsigned numElements = arrayTy->getNumElements();
-
- while (elementTy->isArrayTy()) {
- arrayTy = dyn_cast<ArrayType>(elementTy);
- elementTy = arrayTy->getElementType();
-
- numElements *= arrayTy->getNumElements();
- }
-
- // FIXME: isPrimitiveType() == false for i16?
- assert(elementTy->isSingleValueType() &&
- "Non-primitive types are not handled");
-
- // Find the size of the element in bits
- unsigned elementSize = elementTy->getPrimitiveSizeInBits();
-
- os << ".b" << elementSize << ' ' << gvsym->getName()
- << '[' << numElements << ']';
- } else {
- os << ".b8 " << gvsym->getName() << "[]";
- }
-
- // handle string constants (assume ConstantArray means string)
- if (gv->hasInitializer()) {
- const Constant *C = gv->getInitializer();
- if (const ConstantArray *CA = dyn_cast<ConstantArray>(C)) {
- os << " = {";
-
- for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
- if (i > 0)
- os << ',';
-
- os << "0x";
- os.write_hex(cast<ConstantInt>(CA->getOperand(i))->getZExtValue());
- }
-
- os << '}';
- }
- }
- } else {
- // Note: this is currently the fall-through case and most likely generates
- // incorrect code.
- os << getTypeName(gv->getType()) << ' ' << gvsym->getName();
-
- if (isa<ArrayType>(gv->getType()) || isa<PointerType>(gv->getType()))
- os << "[]";
- }
-
- os << ';';
-
- OutStreamer.EmitRawText(os.str());
- OutStreamer.AddBlankLine();
-}
-
-void PTXAsmPrinter::EmitFunctionEntryLabel() {
- // The function label could have already been emitted if two symbols end up
- // conflicting due to asm renaming. Detect this and emit an error.
- if (!CurrentFnSym->isUndefined())
- report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
- "' label emitted multiple times to assembly file");
-
- const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
- const PTXParamManager &PM = MFI->getParamManager();
- const bool isKernel = MFI->isKernel();
- const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
-
- SmallString<128> decl;
- raw_svector_ostream os(decl);
- os << (isKernel ? ".entry" : ".func");
-
- if (!isKernel) {
- os << " (";
- if (ST.useParamSpaceForDeviceArgs()) {
- for (PTXParamManager::param_iterator i = PM.ret_begin(), e = PM.ret_end(),
- b = i; i != e; ++i) {
- if (i != b)
- os << ", ";
-
- os << ".param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i);
- }
- } else {
- for (PTXMachineFunctionInfo::reg_iterator
- i = MFI->retreg_begin(), e = MFI->retreg_end(), b = i;
- i != e; ++i) {
- if (i != b)
- os << ", ";
-
- os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' '
- << MFI->getRegisterName(*i);
- }
- }
- os << ')';
- }
-
- // Print function name
- os << ' ' << CurrentFnSym->getName() << " (";
-
- const Function *F = MF->getFunction();
-
- // Print parameters
- if (isKernel || ST.useParamSpaceForDeviceArgs()) {
- /*for (PTXParamManager::param_iterator i = PM.arg_begin(), e = PM.arg_end(),
- b = i; i != e; ++i) {
- if (i != b)
- os << ", ";
-
- os << ".param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i);
- }*/
- int Counter = 1;
- for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(),
- b = i; i != e; ++i) {
- if (i != b)
- os << ", ";
- const Type *ArgType = (*i).getType();
- os << ".param .b";
- if (ArgType->isPointerTy()) {
- if (ST.is64Bit())
- os << "64";
- else
- os << "32";
- } else {
- os << ArgType->getPrimitiveSizeInBits();
- }
- if (ArgType->isPointerTy() && ST.emitPtrAttribute()) {
- const PointerType *PtrType = dyn_cast<const PointerType>(ArgType);
- os << " .ptr";
- switch (PtrType->getAddressSpace()) {
- default:
- llvm_unreachable("Unknown address space in argument");
- case PTXStateSpace::Global:
- os << " .global";
- break;
- case PTXStateSpace::Shared:
- os << " .shared";
- break;
- }
- }
- os << " __param_" << Counter++;
- }
- } else {
- for (PTXMachineFunctionInfo::reg_iterator
- i = MFI->argreg_begin(), e = MFI->argreg_end(), b = i;
- i != e; ++i) {
- if (i != b)
- os << ", ";
-
- os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' '
- << MFI->getRegisterName(*i);
- }
- }
- os << ')';
-
- OutStreamer.EmitRawText(os.str());
-}
-
-void PTXAsmPrinter::EmitFunctionDeclaration(const Function* func)
-{
- const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
-
- std::string decl = "";
-
- // hard-coded emission of extern vprintf function
-
- if (func->getName() == "printf" || func->getName() == "puts") {
- decl += ".extern .func (.param .b32 __param_1) vprintf (.param .b";
- if (ST.is64Bit())
- decl += "64";
- else
- decl += "32";
- decl += " __param_2, .param .b";
- if (ST.is64Bit())
- decl += "64";
- else
- decl += "32";
- decl += " __param_3)\n";
- }
-
- OutStreamer.EmitRawText(Twine(decl));
-}
-
-unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName,
- StringRef DirName) {
- // If FE did not provide a file name, then assume stdin.
- if (FileName.empty())
- return GetOrCreateSourceID("<stdin>", StringRef());
-
- // MCStreamer expects the full path name as the filename.
- if (!DirName.empty() && !sys::path::is_absolute(FileName)) {
- SmallString<128> FullPathName = DirName;
- sys::path::append(FullPathName, FileName);
- // Here FullPathName will be copied into StringMap by GetOrCreateSourceID.
- return GetOrCreateSourceID(StringRef(FullPathName), StringRef());
- }
-
- StringMapEntry<unsigned> &Entry = SourceIdMap.GetOrCreateValue(FileName);
- if (Entry.getValue())
- return Entry.getValue();
-
- unsigned SrcId = SourceIdMap.size();
- Entry.setValue(SrcId);
-
- // Print out a .file directive to specify files for .loc directives.
- OutStreamer.EmitDwarfFileDirective(SrcId, "", Entry.getKey());
-
- return SrcId;
-}
-
-MCOperand PTXAsmPrinter::GetSymbolRef(const MachineOperand &MO,
- const MCSymbol *Symbol) {
- const MCExpr *Expr;
- Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
- return MCOperand::CreateExpr(Expr);
-}
-
-MCOperand PTXAsmPrinter::lowerOperand(const MachineOperand &MO) {
- MCOperand MCOp;
- const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
- unsigned EncodedReg;
- switch (MO.getType()) {
- default:
- llvm_unreachable("Unknown operand type");
- case MachineOperand::MO_Register:
- if (MO.getReg() > 0) {
- // Encode the register
- EncodedReg = MFI->getEncodedRegister(MO.getReg());
- } else {
- EncodedReg = 0;
- }
- MCOp = MCOperand::CreateReg(EncodedReg);
- break;
- case MachineOperand::MO_Immediate:
- MCOp = MCOperand::CreateImm(MO.getImm());
- break;
- case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
- MO.getMBB()->getSymbol(), OutContext));
- break;
- case MachineOperand::MO_GlobalAddress:
- MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
- break;
- case MachineOperand::MO_ExternalSymbol:
- MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
- break;
- case MachineOperand::MO_FPImmediate:
- APFloat Val = MO.getFPImm()->getValueAPF();
- bool ignored;
- Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
- MCOp = MCOperand::CreateFPImm(Val.convertToDouble());
- break;
- }
-
- return MCOp;
-}
-
-// Force static initialization.
-extern "C" void LLVMInitializePTXAsmPrinter() {
- RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target);
- RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target);
-}
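Note: the doFinalization() override above avoids double emission by detaching every global from the module, delegating to AsmPrinter::doFinalization(), and then reattaching them, since this printer already emitted the globals in EmitStartOfAsmFile(). A self-contained model of the maneuver (std::list stands in for the module's global list):

  #include <iostream>
  #include <list>
  #include <string>
  #include <vector>

  // Stands in for AsmPrinter::doFinalization(): emits whatever it sees.
  static void baseFinalize(const std::list<std::string> &Globals) {
    for (std::list<std::string>::const_iterator I = Globals.begin(),
                                                E = Globals.end(); I != E; ++I)
      std::cout << "emit " << *I << "\n"; // would be a duplicate emission
  }

  int main() {
    std::list<std::string> Globals; // already emitted at start of file
    Globals.push_back("@a");
    Globals.push_back("@b");

    std::vector<std::string> Backup(Globals.begin(), Globals.end());
    Globals.clear();                              // hide from the base class
    baseFinalize(Globals);                        // emits nothing
    Globals.assign(Backup.begin(), Backup.end()); // restore afterwards
    return Globals.size() == 2 ? 0 : 1;
  }
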
diff --git a/lib/Target/PTX/PTXAsmPrinter.h b/lib/Target/PTX/PTXAsmPrinter.h
deleted file mode 100644
index 74c8d58..0000000
--- a/lib/Target/PTX/PTXAsmPrinter.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//===-- PTXAsmPrinter.h - Print machine code to a PTX file ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// PTX Assembly printer class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTXASMPRINTER_H
-#define PTXASMPRINTER_H
-
-#include "PTX.h"
-#include "PTXTargetMachine.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-
-class MCOperand;
-
-class LLVM_LIBRARY_VISIBILITY PTXAsmPrinter : public AsmPrinter {
-public:
- explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer) {}
-
- const char *getPassName() const { return "PTX Assembly Printer"; }
-
- bool doFinalization(Module &M);
-
- virtual void EmitStartOfAsmFile(Module &M);
- virtual void EmitFunctionBodyStart();
- virtual void EmitFunctionBodyEnd();
- virtual void EmitFunctionEntryLabel();
- virtual void EmitInstruction(const MachineInstr *MI);
-
- unsigned GetOrCreateSourceID(StringRef FileName,
- StringRef DirName);
-
- MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
- MCOperand lowerOperand(const MachineOperand &MO);
-
-private:
- void EmitVariableDeclaration(const GlobalVariable *gv);
- void EmitFunctionDeclaration(const Function* func);
-
- StringMap<unsigned> SourceIdMap;
-}; // class PTXAsmPrinter
-} // namespace llvm
-
-#endif
-
diff --git a/lib/Target/PTX/PTXFPRoundingModePass.cpp b/lib/Target/PTX/PTXFPRoundingModePass.cpp
deleted file mode 100644
index a21d172..0000000
--- a/lib/Target/PTX/PTXFPRoundingModePass.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-//===-- PTXFPRoundingModePass.cpp - Assign rounding modes pass ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a machine function pass that sets appropriate FP rounding
-// modes for all relevant instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-fp-rounding-mode"
-
-#include "PTX.h"
-#include "PTXTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-// NOTE: PTXFPRoundingModePass should be executed just before emission.
-
-namespace {
- /// PTXFPRoundingModePass - Pass to assign appropriate FP rounding modes to
- /// all FP instructions. Essentially, this pass just looks for all FP
- /// instructions that have a rounding mode set to RndDefault, and sets an
- /// appropriate rounding mode based on the target device.
- ///
- class PTXFPRoundingModePass : public MachineFunctionPass {
- private:
- static char ID;
-
- typedef std::pair<unsigned, unsigned> RndModeDesc;
-
- PTXTargetMachine& TargetMachine;
- DenseMap<unsigned, RndModeDesc> Instrs;
-
- public:
- PTXFPRoundingModePass(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
- : MachineFunctionPass(ID),
- TargetMachine(TM) {
- initializeMap();
- }
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- virtual const char *getPassName() const {
- return "PTX FP Rounding Mode Pass";
- }
-
- private:
-
- void initializeMap();
- void processInstruction(MachineInstr &MI);
- }; // class PTXFPRoundingModePass
-} // end anonymous namespace
-
-using namespace llvm;
-
-char PTXFPRoundingModePass::ID = 0;
-
-bool PTXFPRoundingModePass::runOnMachineFunction(MachineFunction &MF) {
- // Look at each basic block
- for (MachineFunction::iterator bbi = MF.begin(), bbe = MF.end(); bbi != bbe;
- ++bbi) {
- MachineBasicBlock &MBB = *bbi;
- // Look at each instruction
- for (MachineBasicBlock::iterator ii = MBB.begin(), ie = MBB.end();
- ii != ie; ++ii) {
- MachineInstr &MI = *ii;
- processInstruction(MI);
- }
- }
- return false;
-}
-
-void PTXFPRoundingModePass::initializeMap() {
- using namespace PTXRoundingMode;
- const PTXSubtarget& ST = TargetMachine.getSubtarget<PTXSubtarget>();
-
- // Build a map of default rounding mode for all instructions that need a
- // rounding mode.
- Instrs[PTX::FADDrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FADDri32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FADDrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FADDri64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FSUBrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FSUBri32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FSUBrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FSUBri64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FMULrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FMULri32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FMULrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FMULri64] = std::make_pair(1U, (unsigned)RndNearestEven);
-
- Instrs[PTX::FNEGrr32] = std::make_pair(1U, (unsigned)RndNone);
- Instrs[PTX::FNEGri32] = std::make_pair(1U, (unsigned)RndNone);
- Instrs[PTX::FNEGrr64] = std::make_pair(1U, (unsigned)RndNone);
- Instrs[PTX::FNEGri64] = std::make_pair(1U, (unsigned)RndNone);
-
- unsigned FDivRndMode = ST.fdivNeedsRoundingMode() ? RndNearestEven : RndNone;
- Instrs[PTX::FDIVrr32] = std::make_pair(1U, FDivRndMode);
- Instrs[PTX::FDIVri32] = std::make_pair(1U, FDivRndMode);
- Instrs[PTX::FDIVrr64] = std::make_pair(1U, FDivRndMode);
- Instrs[PTX::FDIVri64] = std::make_pair(1U, FDivRndMode);
-
- unsigned FMADRndMode = ST.fmadNeedsRoundingMode() ? RndNearestEven : RndNone;
- Instrs[PTX::FMADrrr32] = std::make_pair(1U, FMADRndMode);
- Instrs[PTX::FMADrri32] = std::make_pair(1U, FMADRndMode);
- Instrs[PTX::FMADrii32] = std::make_pair(1U, FMADRndMode);
- Instrs[PTX::FMADrrr64] = std::make_pair(1U, FMADRndMode);
- Instrs[PTX::FMADrri64] = std::make_pair(1U, FMADRndMode);
- Instrs[PTX::FMADrii64] = std::make_pair(1U, FMADRndMode);
-
- Instrs[PTX::FSQRTrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FSQRTri32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FSQRTrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::FSQRTri64] = std::make_pair(1U, (unsigned)RndNearestEven);
-
- Instrs[PTX::FSINrr32] = std::make_pair(1U, (unsigned)RndApprox);
- Instrs[PTX::FSINri32] = std::make_pair(1U, (unsigned)RndApprox);
- Instrs[PTX::FSINrr64] = std::make_pair(1U, (unsigned)RndApprox);
- Instrs[PTX::FSINri64] = std::make_pair(1U, (unsigned)RndApprox);
- Instrs[PTX::FCOSrr32] = std::make_pair(1U, (unsigned)RndApprox);
- Instrs[PTX::FCOSri32] = std::make_pair(1U, (unsigned)RndApprox);
- Instrs[PTX::FCOSrr64] = std::make_pair(1U, (unsigned)RndApprox);
- Instrs[PTX::FCOSri64] = std::make_pair(1U, (unsigned)RndApprox);
-
- Instrs[PTX::CVTu16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTs16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTu16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTs16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTu32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTs32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTu32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTs32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTu64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTs64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTu64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
- Instrs[PTX::CVTs64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
-
- Instrs[PTX::CVTf32u16] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf32s16] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf32u32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf32s32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf32u64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf32s64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf32f64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf64u16] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf64s16] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf64u32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf64s32] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf64u64] = std::make_pair(1U, (unsigned)RndNearestEven);
- Instrs[PTX::CVTf64s64] = std::make_pair(1U, (unsigned)RndNearestEven);
-}
-
-void PTXFPRoundingModePass::processInstruction(MachineInstr &MI) {
- // Is this an instruction that needs a rounding mode?
- if (Instrs.count(MI.getOpcode())) {
- const RndModeDesc &Desc = Instrs[MI.getOpcode()];
- // Get the rounding mode operand
- MachineOperand &Op = MI.getOperand(Desc.first);
- // Update the rounding mode if needed
- if (Op.getImm() == PTXRoundingMode::RndDefault) {
- Op.setImm(Desc.second);
- }
- }
-}
-
-FunctionPass *llvm::createPTXFPRoundingModePass(PTXTargetMachine &TM,
- CodeGenOpt::Level OptLevel) {
- return new PTXFPRoundingModePass(TM, OptLevel);
-}
-
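Note: the pass above is purely table-driven: initializeMap() records, for each FP opcode, which operand holds the rounding mode and which default to install, and processInstruction() patches only operands still set to RndDefault. A compact model of that patching step (the enum values are illustrative, not the backend's actual PTXRoundingMode encodings):

  #include <cassert>
  #include <map>
  #include <utility>
  #include <vector>

  enum Rnd { RndDefault = -1, RndNone = 0, RndNearestEven, RndApprox };
  enum Op { FADD = 1, FNEG = 2 }; // stand-ins for PTX::FADDrr32 etc.

  int main() {
    // opcode -> (operand index of the rounding mode, default mode).
    std::map<int, std::pair<unsigned, int> > Table;
    Table[FADD] = std::make_pair(1U, (int)RndNearestEven); // add rounds .rn
    Table[FNEG] = std::make_pair(1U, (int)RndNone);        // neg is exact

    // A pretend instruction: opcode FADD, operand 1 is the rounding mode.
    int Opcode = FADD;
    std::vector<int> Operands;
    Operands.push_back(0);          // dst
    Operands.push_back(RndDefault); // rounding mode, still unset
    Operands.push_back(0);          // src

    std::map<int, std::pair<unsigned, int> >::iterator It = Table.find(Opcode);
    if (It != Table.end() && Operands[It->second.first] == RndDefault)
      Operands[It->second.first] = It->second.second;

    assert(Operands[1] == RndNearestEven); // the default got installed
    return 0;
  }
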
diff --git a/lib/Target/PTX/PTXFrameLowering.cpp b/lib/Target/PTX/PTXFrameLowering.cpp
deleted file mode 100644
index e6e268e..0000000
--- a/lib/Target/PTX/PTXFrameLowering.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- PTXFrameLowering.cpp - PTX Frame Information ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the PTX implementation of the TargetFrameLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXFrameLowering.h"
-#include "llvm/CodeGen/MachineFunction.h"
-
-using namespace llvm;
-
-void PTXFrameLowering::emitPrologue(MachineFunction &MF) const {
-}
-
-void PTXFrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
-}
diff --git a/lib/Target/PTX/PTXFrameLowering.h b/lib/Target/PTX/PTXFrameLowering.h
deleted file mode 100644
index 831e818..0000000
--- a/lib/Target/PTX/PTXFrameLowering.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-- PTXFrameLowering.h - Define frame lowering for PTX -----*- C++ -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PTX implementation of the TargetFrameLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_FRAMEINFO_H
-#define PTX_FRAMEINFO_H
-
-#include "PTX.h"
-#include "PTXSubtarget.h"
-#include "llvm/Target/TargetFrameLowering.h"
-
-namespace llvm {
- class PTXSubtarget;
-
-class PTXFrameLowering : public TargetFrameLowering {
-protected:
- const PTXSubtarget &STI;
-
-public:
- explicit PTXFrameLowering(const PTXSubtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2),
- STI(sti) {
- }
-
- /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
- /// the function.
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-
- bool hasFP(const MachineFunction &MF) const { return false; }
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp
deleted file mode 100644
index 5c7ee29..0000000
--- a/lib/Target/PTX/PTXISelDAGToDAG.cpp
+++ /dev/null
@@ -1,356 +0,0 @@
-//===-- PTXISelDAGToDAG.cpp - A dag to dag inst selector for PTX ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an instruction selector for the PTX target.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTX.h"
-#include "PTXMachineFunctionInfo.h"
-#include "PTXTargetMachine.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-namespace {
-// PTXDAGToDAGISel - PTX specific code to select PTX machine
-// instructions for SelectionDAG operations.
-class PTXDAGToDAGISel : public SelectionDAGISel {
- public:
- PTXDAGToDAGISel(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel);
-
- virtual const char *getPassName() const {
- return "PTX DAG->DAG Pattern Instruction Selection";
- }
-
- SDNode *Select(SDNode *Node);
-
- // Complex Pattern Selectors.
- bool SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2);
- bool SelectADDRri(SDValue &Addr, SDValue &Base, SDValue &Offset);
- bool SelectADDRii(SDValue &Addr, SDValue &Base, SDValue &Offset);
- bool SelectADDRlocal(SDValue &Addr, SDValue &Base, SDValue &Offset);
-
- // Include the pieces autogenerated from the target description.
-#include "PTXGenDAGISel.inc"
-
- private:
- // We need this only because we can't match the instruction BRAdp
- // pattern (PTXbrcond bb:$d, ...) in PTXInstrInfo.td
- SDNode *SelectBRCOND(SDNode *Node);
-
- SDNode *SelectREADPARAM(SDNode *Node);
- SDNode *SelectWRITEPARAM(SDNode *Node);
- SDNode *SelectFrameIndex(SDNode *Node);
-
- bool isImm(const SDValue &operand);
- bool SelectImm(const SDValue &operand, SDValue &imm);
-
- const PTXSubtarget& getSubtarget() const;
-}; // class PTXDAGToDAGISel
-} // namespace
-
-// createPTXISelDag - This pass converts a legalized DAG into a
-// PTX-specific DAG, ready for instruction scheduling
-FunctionPass *llvm::createPTXISelDag(PTXTargetMachine &TM,
- CodeGenOpt::Level OptLevel) {
- return new PTXDAGToDAGISel(TM, OptLevel);
-}
-
-PTXDAGToDAGISel::PTXDAGToDAGISel(PTXTargetMachine &TM,
- CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(TM, OptLevel) {}
-
-SDNode *PTXDAGToDAGISel::Select(SDNode *Node) {
- switch (Node->getOpcode()) {
- case ISD::BRCOND:
- return SelectBRCOND(Node);
- case PTXISD::READ_PARAM:
- return SelectREADPARAM(Node);
- case PTXISD::WRITE_PARAM:
- return SelectWRITEPARAM(Node);
- case ISD::FrameIndex:
- return SelectFrameIndex(Node);
- default:
- return SelectCode(Node);
- }
-}
-
-SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) {
- assert(Node->getNumOperands() >= 3);
-
- SDValue Chain = Node->getOperand(0);
- SDValue Pred = Node->getOperand(1);
- SDValue Target = Node->getOperand(2); // branch target
- SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::Normal, MVT::i32);
- DebugLoc dl = Node->getDebugLoc();
-
- assert(Target.getOpcode() == ISD::BasicBlock);
- assert(Pred.getValueType() == MVT::i1);
-
- // Emit BRAdp
- SDValue Ops[] = { Target, Pred, PredOp, Chain };
- return CurDAG->getMachineNode(PTX::BRAdp, dl, MVT::Other, Ops, 4);
-}
-
-SDNode *PTXDAGToDAGISel::SelectREADPARAM(SDNode *Node) {
- SDValue Chain = Node->getOperand(0);
- SDValue Index = Node->getOperand(1);
-
- int OpCode;
-
- // Get the type of parameter we are reading
- EVT VT = Node->getValueType(0);
- assert(VT.isSimple() && "READ_PARAM only implemented for MVT types");
-
- MVT Type = VT.getSimpleVT();
-
- if (Type == MVT::i1)
- OpCode = PTX::READPARAMPRED;
- else if (Type == MVT::i16)
- OpCode = PTX::READPARAMI16;
- else if (Type == MVT::i32)
- OpCode = PTX::READPARAMI32;
- else if (Type == MVT::i64)
- OpCode = PTX::READPARAMI64;
- else if (Type == MVT::f32)
- OpCode = PTX::READPARAMF32;
- else {
- assert(Type == MVT::f64 && "Unexpected type!");
- OpCode = PTX::READPARAMF64;
- }
-
- SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
- SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32);
- DebugLoc dl = Node->getDebugLoc();
-
- SDValue Ops[] = { Index, Pred, PredOp, Chain };
- return CurDAG->getMachineNode(OpCode, dl, VT, Ops, 4);
-}
-
-SDNode *PTXDAGToDAGISel::SelectWRITEPARAM(SDNode *Node) {
-
- SDValue Chain = Node->getOperand(0);
- SDValue Value = Node->getOperand(1);
-
- int OpCode;
-
- //Node->dumpr(CurDAG);
-
- // Get the type of parameter we are writing
- EVT VT = Value->getValueType(0);
- assert(VT.isSimple() && "WRITE_PARAM only implemented for MVT types");
-
- MVT Type = VT.getSimpleVT();
-
- if (Type == MVT::i1)
- OpCode = PTX::WRITEPARAMPRED;
- else if (Type == MVT::i16)
- OpCode = PTX::WRITEPARAMI16;
- else if (Type == MVT::i32)
- OpCode = PTX::WRITEPARAMI32;
- else if (Type == MVT::i64)
- OpCode = PTX::WRITEPARAMI64;
- else if (Type == MVT::f32)
- OpCode = PTX::WRITEPARAMF32;
- else if (Type == MVT::f64)
- OpCode = PTX::WRITEPARAMF64;
- else
- llvm_unreachable("Invalid type in SelectWRITEPARAM");
-
- SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
- SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32);
- DebugLoc dl = Node->getDebugLoc();
-
- SDValue Ops[] = { Value, Pred, PredOp, Chain };
- SDNode* Ret = CurDAG->getMachineNode(OpCode, dl, MVT::Other, Ops, 4);
-
- //dbgs() << "SelectWRITEPARAM produced:\n\t";
- //Ret->dumpr(CurDAG);
-
- return Ret;
-}
-
-SDNode *PTXDAGToDAGISel::SelectFrameIndex(SDNode *Node) {
- int FI = cast<FrameIndexSDNode>(Node)->getIndex();
- //dbgs() << "Selecting FrameIndex at index " << FI << "\n";
- //SDValue TFI = CurDAG->getTargetFrameIndex(FI, Node->getValueType(0));
-
- PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
-
- SDValue FrameSymbol = CurDAG->getTargetExternalSymbol(MFI->getFrameSymbol(FI),
- Node->getValueType(0));
-
- return FrameSymbol.getNode();
-}
-
-// Match memory operand of the form [reg+reg]
-bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) {
- if (Addr.getOpcode() != ISD::ADD || Addr.getNumOperands() < 2 ||
- isImm(Addr.getOperand(0)) || isImm(Addr.getOperand(1)))
- return false;
-
- assert(Addr.getValueType().isSimple() && "Type must be simple");
-
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
-
- return true;
-}
-
-// Match memory operand of the form [reg], [imm+reg], and [reg+imm]
-bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base,
- SDValue &Offset) {
- // FrameIndex addresses are handled separately
- //errs() << "SelectADDRri: ";
- //Addr.getNode()->dumpr();
- if (isa<FrameIndexSDNode>(Addr)) {
- //errs() << "Failure\n";
- return false;
- }
-
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
- Base = Addr.getOperand(0);
- if (isa<FrameIndexSDNode>(Base)) {
- //errs() << "Failure\n";
- return false;
- }
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
- //errs() << "Success\n";
- return true;
- }
-
- /*if (Addr.getNumOperands() == 1) {
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
- errs() << "Success\n";
- return true;
- }*/
-
- //errs() << "SelectADDRri fails on: ";
- //Addr.getNode()->dumpr();
-
- if (isImm(Addr)) {
- //errs() << "Failure\n";
- return false;
- }
-
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
-
- //errs() << "Success\n";
- return true;
-
- /*if (Addr.getOpcode() != ISD::ADD) {
- // let SelectADDRii handle the [imm] case
- if (isImm(Addr))
- return false;
- // it is [reg]
-
- assert(Addr.getValueType().isSimple() && "Type must be simple");
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
-
- return true;
- }
-
- if (Addr.getNumOperands() < 2)
- return false;
-
- // let SelectADDRii handle the [imm+imm] case
- if (isImm(Addr.getOperand(0)) && isImm(Addr.getOperand(1)))
- return false;
-
- // try [reg+imm] and [imm+reg]
- for (int i = 0; i < 2; i ++)
- if (SelectImm(Addr.getOperand(1-i), Offset)) {
- Base = Addr.getOperand(i);
- return true;
- }
-
- // neither [reg+imm] nor [imm+reg]
- return false;*/
-}
-
-// Match memory operand of the form [imm+imm] and [imm]
-bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base,
- SDValue &Offset) {
- // is [imm+imm]?
- if (Addr.getOpcode() == ISD::ADD) {
- return SelectImm(Addr.getOperand(0), Base) &&
- SelectImm(Addr.getOperand(1), Offset);
- }
-
- // is [imm]?
- if (SelectImm(Addr, Base)) {
- assert(Addr.getValueType().isSimple() && "Type must be simple");
-
- Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
-
- return true;
- }
-
- return false;
-}
-
-// Match memory operand of the form [reg], [imm+reg], and [reg+imm]
-bool PTXDAGToDAGISel::SelectADDRlocal(SDValue &Addr, SDValue &Base,
- SDValue &Offset) {
- //errs() << "SelectADDRlocal: ";
- //Addr.getNode()->dumpr();
- if (isa<FrameIndexSDNode>(Addr)) {
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
- //errs() << "Success\n";
- return true;
- }
-
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
- Base = Addr.getOperand(0);
- if (!isa<FrameIndexSDNode>(Base)) {
- //errs() << "Failure\n";
- return false;
- }
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
- //errs() << "Offset: ";
- //Offset.getNode()->dumpr();
- //errs() << "Success\n";
- return true;
- }
-
- //errs() << "Failure\n";
- return false;
-}
-
-bool PTXDAGToDAGISel::isImm(const SDValue &operand) {
- return ConstantSDNode::classof(operand.getNode());
-}
-
-bool PTXDAGToDAGISel::SelectImm(const SDValue &operand, SDValue &imm) {
- SDNode *node = operand.getNode();
- if (!ConstantSDNode::classof(node))
- return false;
-
- ConstantSDNode *CN = cast<ConstantSDNode>(node);
- imm = CurDAG->getTargetConstant(*CN->getConstantIntValue(),
- operand.getValueType());
- return true;
-}
-
-const PTXSubtarget& PTXDAGToDAGISel::getSubtarget() const
-{
- return TM.getSubtarget<PTXSubtarget>();
-}
-
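Note: the Select* complex patterns above carve a memory operand into a base and an offset: SelectADDRii handles [imm] and [imm+imm], SelectADDRri handles [reg], [reg+imm] and [imm+reg], SelectADDRlocal handles frame indices, and SelectADDRrr degenerates to the base register with a zero offset. A toy version of the ri case (the operand representation is invented for the example):

  #include <cassert>
  #include <string>

  struct Operand {
    bool IsImm;
    long Imm;        // valid when IsImm
    std::string Reg; // valid when !IsImm
  };

  // Mirrors SelectADDRri's accepted shapes: reg+imm, imm+reg, or a bare reg
  // (which gets a synthesized zero offset, as in the source above).
  static bool selectADDRri(bool IsAdd, const Operand &A, const Operand &B,
                           std::string &Base, long &Offset) {
    if (IsAdd && !A.IsImm && B.IsImm) { Base = A.Reg; Offset = B.Imm; return true; }
    if (IsAdd && A.IsImm && !B.IsImm) { Base = B.Reg; Offset = A.Imm; return true; }
    if (!IsAdd && !A.IsImm)           { Base = A.Reg; Offset = 0;     return true; }
    return false; // [imm] and [imm+imm] are SelectADDRii's job
  }

  int main() {
    Operand R = { false, 0, "%r1" };
    Operand C = { true, 12, "" };
    std::string Base;
    long Offset = -1;
    assert(selectADDRri(true, R, C, Base, Offset));  // [%r1+12]
    assert(Base == "%r1" && Offset == 12);
    assert(selectADDRri(false, R, C, Base, Offset)); // bare [%r1]
    assert(Offset == 0);
    return 0;
  }
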
diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp
deleted file mode 100644
index ef4455b..0000000
--- a/lib/Target/PTX/PTXISelLowering.cpp
+++ /dev/null
@@ -1,522 +0,0 @@
-//===-- PTXISelLowering.cpp - PTX DAG Lowering Implementation -------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the PTXTargetLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXISelLowering.h"
-#include "PTX.h"
-#include "PTXMachineFunctionInfo.h"
-#include "PTXRegisterInfo.h"
-#include "PTXSubtarget.h"
-#include "llvm/Function.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// TargetLowering Implementation
-//===----------------------------------------------------------------------===//
-
-PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
- : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
- // Set up the register classes.
- addRegisterClass(MVT::i1, PTX::RegPredRegisterClass);
- addRegisterClass(MVT::i16, PTX::RegI16RegisterClass);
- addRegisterClass(MVT::i32, PTX::RegI32RegisterClass);
- addRegisterClass(MVT::i64, PTX::RegI64RegisterClass);
- addRegisterClass(MVT::f32, PTX::RegF32RegisterClass);
- addRegisterClass(MVT::f64, PTX::RegF64RegisterClass);
-
- setBooleanContents(ZeroOrOneBooleanContent);
- setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
- setMinFunctionAlignment(2);
-
- // Let LLVM use loads/stores for all mem* operations
- maxStoresPerMemcpy = 4096;
- maxStoresPerMemmove = 4096;
- maxStoresPerMemset = 4096;
-
- ////////////////////////////////////
- /////////// Expansion //////////////
- ////////////////////////////////////
-
- // (any/zero/sign) extload => load + (any/zero/sign) extend
-
- setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
-
- // f32 extload => load + fextend
-
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-
- // f64 truncstore => trunc + store
-
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
- // sign_extend_inreg => sign_extend
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
-
- // br_cc => brcond
-
- setOperationAction(ISD::BR_CC, MVT::Other, Expand);
-
- // select_cc => setcc
-
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
-
- ////////////////////////////////////
- //////////// Legal /////////////////
- ////////////////////////////////////
-
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
-
- ////////////////////////////////////
- //////////// Custom ////////////////
- ////////////////////////////////////
-
- // customize setcc to use bitwise logic if possible
-
- //setOperationAction(ISD::SETCC, MVT::i1, Custom);
- setOperationAction(ISD::SETCC, MVT::i1, Legal);
-
- // customize translation of memory addresses
-
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-
- // Compute derived properties from the register classes
- computeRegisterProperties();
-}
-
-EVT PTXTargetLowering::getSetCCResultType(EVT VT) const {
- return MVT::i1;
-}
-
-SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- switch (Op.getOpcode()) {
- default:
- llvm_unreachable("Unimplemented operand");
- case ISD::SETCC:
- return LowerSETCC(Op, DAG);
- case ISD::GlobalAddress:
- return LowerGlobalAddress(Op, DAG);
- }
-}
-
-const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch (Opcode) {
- default:
- llvm_unreachable("Unknown opcode");
- case PTXISD::COPY_ADDRESS:
- return "PTXISD::COPY_ADDRESS";
- case PTXISD::LOAD_PARAM:
- return "PTXISD::LOAD_PARAM";
- case PTXISD::STORE_PARAM:
- return "PTXISD::STORE_PARAM";
- case PTXISD::READ_PARAM:
- return "PTXISD::READ_PARAM";
- case PTXISD::WRITE_PARAM:
- return "PTXISD::WRITE_PARAM";
- case PTXISD::EXIT:
- return "PTXISD::EXIT";
- case PTXISD::RET:
- return "PTXISD::RET";
- case PTXISD::CALL:
- return "PTXISD::CALL";
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Custom Lower Operation
-//===----------------------------------------------------------------------===//
-
-SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer");
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- SDValue Op2 = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
- //ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-
- // Look for X == 0, X == 1, X != 0, or X != 1
- // We can simplify these to bitwise logic
-
- //if (Op1.getOpcode() == ISD::Constant &&
- // (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
- // cast<ConstantSDNode>(Op1)->isNullValue()) &&
- // (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- //
- // return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1);
- //}
-
- //ConstantSDNode* COp1 = cast<ConstantSDNode>(Op1);
- //if(COp1 && COp1->getZExtValue() == 1) {
- // if(CC == ISD::SETNE) {
- // return DAG.getNode(PTX::XORripreds, dl, MVT::i1, Op0);
- // }
- //}
-
- llvm_unreachable("setcc was not matched by a pattern!");
-
- return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2);
-}
-
-SDValue PTXTargetLowering::
-LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy();
- DebugLoc dl = Op.getDebugLoc();
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-
- assert(PtrVT.isSimple() && "Pointer must be to primitive type.");
-
- SDValue targetGlobal = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
- SDValue movInstr = DAG.getNode(PTXISD::COPY_ADDRESS,
- dl,
- PtrVT.getSimpleVT(),
- targetGlobal);
-
- return movInstr;
-}
-
-//===----------------------------------------------------------------------===//
-// Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-SDValue PTXTargetLowering::
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- if (isVarArg) llvm_unreachable("PTX does not support varargs");
-
- MachineFunction &MF = DAG.getMachineFunction();
- const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
- PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
- PTXParamManager &PM = MFI->getParamManager();
-
- switch (CallConv) {
- default:
- llvm_unreachable("Unsupported calling convention");
- case CallingConv::PTX_Kernel:
- MFI->setKernel(true);
- break;
- case CallingConv::PTX_Device:
- MFI->setKernel(false);
- break;
- }
-
- // We do one of two things here:
- // IsKernel || SM >= 2.0 -> Use param space for arguments
- // SM < 2.0 -> Use registers for arguments
- if (MFI->isKernel() || ST.useParamSpaceForDeviceArgs()) {
- // We just need to emit the proper LOAD_PARAM ISDs
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- assert((!MFI->isKernel() || Ins[i].VT != MVT::i1) &&
- "Kernels cannot take pred operands");
-
- unsigned ParamSize = Ins[i].VT.getStoreSizeInBits();
- unsigned Param = PM.addArgumentParam(ParamSize);
- const std::string &ParamName = PM.getParamName(Param);
- SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
- MVT::Other);
- SDValue ArgValue = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain,
- ParamValue);
- InVals.push_back(ArgValue);
- }
- }
- else {
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- EVT RegVT = Ins[i].VT;
- const TargetRegisterClass* TRC = getRegClassFor(RegVT);
- unsigned RegType;
-
- // Determine which register class we need
- if (RegVT == MVT::i1)
- RegType = PTXRegisterType::Pred;
- else if (RegVT == MVT::i16)
- RegType = PTXRegisterType::B16;
- else if (RegVT == MVT::i32)
- RegType = PTXRegisterType::B32;
- else if (RegVT == MVT::i64)
- RegType = PTXRegisterType::B64;
- else if (RegVT == MVT::f32)
- RegType = PTXRegisterType::F32;
- else if (RegVT == MVT::f64)
- RegType = PTXRegisterType::F64;
- else
- llvm_unreachable("Unknown parameter type");
-
- // Use a unique index in the instruction to prevent instruction folding:
- // without it, READ_PARAM nodes for same-typed parameters would be CSE'd
- // into a single node by the SelectionDAG. Yes, this is a hack.
- SDValue Index = DAG.getTargetConstant(i, MVT::i32);
- unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
- SDValue ArgValue = DAG.getNode(PTXISD::READ_PARAM, dl, RegVT, Chain,
- Index);
-
- InVals.push_back(ArgValue);
-
- MFI->addRegister(Reg, RegType, PTXRegisterSpace::Argument);
- }
- }
-
- return Chain;
-}
-
-SDValue PTXTargetLowering::
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl,
- SelectionDAG &DAG) const {
- if (isVarArg) llvm_unreachable("PTX does not support varargs");
-
- switch (CallConv) {
- default:
- llvm_unreachable("Unsupported calling convention.");
- case CallingConv::PTX_Kernel:
- assert(Outs.size() == 0 && "Kernel must return void.");
- return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain);
- case CallingConv::PTX_Device:
- assert(Outs.size() <= 1 && "Can return at most one value.");
- break;
- }
-
- MachineFunction& MF = DAG.getMachineFunction();
- PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
- PTXParamManager &PM = MFI->getParamManager();
-
- SDValue Flag;
- const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
-
- if (ST.useParamSpaceForDeviceArgs()) {
- assert(Outs.size() < 2 && "Device functions can return at most one value");
-
- if (Outs.size() == 1) {
- unsigned ParamSize = OutVals[0].getValueType().getSizeInBits();
- unsigned Param = PM.addReturnParam(ParamSize);
- const std::string &ParamName = PM.getParamName(Param);
- SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
- MVT::Other);
- Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
- ParamValue, OutVals[0]);
- }
- } else {
- for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
- EVT RegVT = Outs[i].VT;
- const TargetRegisterClass* TRC;
- unsigned RegType;
-
- // Determine which register class we need
- if (RegVT == MVT::i1) {
- TRC = PTX::RegPredRegisterClass;
- RegType = PTXRegisterType::Pred;
- }
- else if (RegVT == MVT::i16) {
- TRC = PTX::RegI16RegisterClass;
- RegType = PTXRegisterType::B16;
- }
- else if (RegVT == MVT::i32) {
- TRC = PTX::RegI32RegisterClass;
- RegType = PTXRegisterType::B32;
- }
- else if (RegVT == MVT::i64) {
- TRC = PTX::RegI64RegisterClass;
- RegType = PTXRegisterType::B64;
- }
- else if (RegVT == MVT::f32) {
- TRC = PTX::RegF32RegisterClass;
- RegType = PTXRegisterType::F32;
- }
- else if (RegVT == MVT::f64) {
- TRC = PTX::RegF64RegisterClass;
- RegType = PTXRegisterType::F64;
- }
- else {
- llvm_unreachable("Unknown parameter type");
- }
-
- unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
-
- SDValue Copy = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i]/*, Flag*/);
- SDValue OutReg = DAG.getRegister(Reg, RegVT);
-
- Chain = DAG.getNode(PTXISD::WRITE_PARAM, dl, MVT::Other, Copy, OutReg);
-
- MFI->addRegister(Reg, RegType, PTXRegisterSpace::Return);
- }
- }
-
- if (Flag.getNode() == 0) {
- return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
- }
- else {
- return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag);
- }
-}
-
-SDValue
-PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
-
- MachineFunction& MF = DAG.getMachineFunction();
- PTXMachineFunctionInfo *PTXMFI = MF.getInfo<PTXMachineFunctionInfo>();
- PTXParamManager &PM = PTXMFI->getParamManager();
- MachineFrameInfo *MFI = MF.getFrameInfo();
-
- assert(getTargetMachine().getSubtarget<PTXSubtarget>().callsAreHandled() &&
- "Calls are not handled for the target device");
-
- // Identify the callee function
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- const Function *function = cast<Function>(GV);
-
- // allow non-device calls only for printf and puts
- bool isPrintf = function->getName() == "printf" ||
- function->getName() == "puts";
-
- assert((isPrintf || function->getCallingConv() == CallingConv::PTX_Device) &&
- "PTX function calls must be to PTX device functions");
-
- unsigned outSize = isPrintf ? 2 : Outs.size();
-
- std::vector<SDValue> Ops;
- // The layout of the ops will be [Chain, #Ins, Ins, Callee, #Outs, Outs]
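- // e.g. with Ins.size() == 1 and outSize == 2, the layout is
- //   Ops = [Chain, 1, InParam0, Callee, 2, OutParam0, OutParam1],
- // so Callee lands at index Ins.size()+2, #Outs at Ins.size()+3, and the
- // i-th out param at i+Ins.size()+4, matching the stores below.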
- Ops.resize(outSize + Ins.size() + 4);
-
- Ops[0] = Chain;
-
- // Identify the callee function
- Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
- Ops[Ins.size()+2] = Callee;
-
- // #Outs
- Ops[Ins.size()+3] = DAG.getTargetConstant(outSize, MVT::i32);
-
- if (isPrintf) {
- // first argument is the address of the global string variable in memory
- unsigned Param0 = PM.addLocalParam(getPointerTy().getSizeInBits());
- SDValue ParamValue0 = DAG.getTargetExternalSymbol(PM.getParamName(Param0).c_str(),
- MVT::Other);
- Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
- ParamValue0, OutVals[0]);
- Ops[Ins.size()+4] = ParamValue0;
-
- // alignment is the maximum size of all the arguments
- unsigned alignment = 0;
- for (unsigned i = 1; i < OutVals.size(); ++i) {
- alignment = std::max(alignment,
- OutVals[i].getValueType().getSizeInBits());
- }
-
- // size is the alignment multiplied by the number of arguments
- unsigned size = alignment * (OutVals.size() - 1);
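- // worked example: printf("%d %f", i, d) passes an i32 and an f64 vararg,
- // so alignment = max(32, 64) = 64 bits and size = 64 * 2 = 128 bits,
- // i.e. a 16-byte stack object with 8-byte alignment below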
-
- // second argument is the address of the stack object (unless no arguments)
- unsigned Param1 = PM.addLocalParam(getPointerTy().getSizeInBits());
- SDValue ParamValue1 = DAG.getTargetExternalSymbol(PM.getParamName(Param1).c_str(),
- MVT::Other);
- Ops[Ins.size()+5] = ParamValue1;
-
- if (size > 0) {
- // create a local stack object to store the arguments
- unsigned StackObject = MFI->CreateStackObject(size / 8, alignment / 8,
- false);
- SDValue FrameIndex = DAG.getFrameIndex(StackObject, getPointerTy());
-
- // store each of the arguments to the stack in turn
- for (unsigned int i = 1; i != OutVals.size(); i++) {
- SDValue FrameAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameIndex,
- DAG.getTargetConstant((i - 1) * 8,
- getPointerTy()));
- Chain = DAG.getStore(Chain, dl, OutVals[i], FrameAddr,
- MachinePointerInfo(),
- false, false, 0);
- }
-
- // copy the address of the local frame index to get the address in non-local space
- SDValue genericAddr = DAG.getNode(PTXISD::COPY_ADDRESS, dl,
- getPointerTy(), FrameIndex);
-
- // store this address in the second argument
- Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
- ParamValue1, genericAddr);
- }
- }
- else {
- // Generate STORE_PARAM nodes for each function argument. In PTX, function
- // arguments are explicitly stored into .param variables and passed as
- // arguments. There is no register/stack-based calling convention in PTX.
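- // A sketch of the PTX expected for a call 'i32 @f(i32 %a)' (parameter
- // names are illustrative):
- //   .param .b32 __localparam_1;
- //   st.param.b32 [__localparam_1], %r1;
- //   .param .b32 __localparam_2;
- //   call.uni (__localparam_2), f, (__localparam_1);
- //   ld.param.b32 %r2, [__localparam_2];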
- for (unsigned i = 0; i != OutVals.size(); ++i) {
- unsigned Size = OutVals[i].getValueType().getSizeInBits();
- unsigned Param = PM.addLocalParam(Size);
- const std::string &ParamName = PM.getParamName(Param);
- SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
- MVT::Other);
- Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
- ParamValue, OutVals[i]);
- Ops[i+Ins.size()+4] = ParamValue;
- }
- }
-
- std::vector<SDValue> InParams;
-
- // Generate list of .param variables to hold the return value(s).
- Ops[1] = DAG.getTargetConstant(Ins.size(), MVT::i32);
- for (unsigned i = 0; i < Ins.size(); ++i) {
- unsigned Size = Ins[i].VT.getStoreSizeInBits();
- unsigned Param = PM.addLocalParam(Size);
- const std::string &ParamName = PM.getParamName(Param);
- SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
- MVT::Other);
- Ops[i+2] = ParamValue;
- InParams.push_back(ParamValue);
- }
-
- Ops[0] = Chain;
-
- // Create the CALL node.
- Chain = DAG.getNode(PTXISD::CALL, dl, MVT::Other, &Ops[0], Ops.size());
-
- // Create the LOAD_PARAM nodes that retrieve the function return value(s).
- for (unsigned i = 0; i < Ins.size(); ++i) {
- SDValue Load = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain,
- InParams[i]);
- InVals.push_back(Load);
- }
-
- return Chain;
-}
-
-unsigned PTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT) {
- // All arguments consist of one "register," regardless of the type.
- return 1;
-}
-
diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h
deleted file mode 100644
index 33220f4..0000000
--- a/lib/Target/PTX/PTXISelLowering.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//===-- PTXISelLowering.h - PTX DAG Lowering Interface ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interfaces that PTX uses to lower LLVM code into a
-// selection DAG.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_ISEL_LOWERING_H
-#define PTX_ISEL_LOWERING_H
-
-#include "llvm/Target/TargetLowering.h"
-
-namespace llvm {
-
-namespace PTXISD {
- enum NodeType {
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
- LOAD_PARAM,
- STORE_PARAM,
- READ_PARAM,
- WRITE_PARAM,
- EXIT,
- RET,
- COPY_ADDRESS,
- CALL
- };
-} // namespace PTXISD
-
-class PTXTargetLowering : public TargetLowering {
- public:
- explicit PTXTargetLowering(TargetMachine &TM);
-
- virtual const char *getTargetNodeName(unsigned Opcode) const;
-
- virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
-
- virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-
- virtual SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
-
- virtual SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl,
- SelectionDAG &DAG) const;
-
- virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
-
- virtual EVT getSetCCResultType(EVT VT) const;
-
- virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT);
-
- private:
- SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
-}; // class PTXTargetLowering
-} // namespace llvm
-
-#endif // PTX_ISEL_LOWERING_H
diff --git a/lib/Target/PTX/PTXInstrFormats.td b/lib/Target/PTX/PTXInstrFormats.td
deleted file mode 100644
index 267e834..0000000
--- a/lib/Target/PTX/PTXInstrFormats.td
+++ /dev/null
@@ -1,51 +0,0 @@
-//===-- PTXInstrFormats.td - PTX Instruction Formats -------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-// Rounding Mode Specifier
-/*class RoundingMode<bits<3> val> {
- bits<3> Value = val;
-}
-
-def RndDefault : RoundingMode<0>;
-def RndNearestEven : RoundingMode<1>;
-def RndNearestZero : RoundingMode<2>;
-def RndNegInf : RoundingMode<3>;
-def RndPosInf : RoundingMode<4>;
-def RndApprox : RoundingMode<5>;*/
-
-
-// Rounding Mode Operand
-def RndMode : Operand<i32> {
- let PrintMethod = "printRoundingMode";
-}
-
-def RndDefault : PatLeaf<(i32 0)>;
-
-// PTX Predicate operand, default to (0, 0) = (zero-reg, none).
-// Leave PrintMethod empty; predicate printing is defined elsewhere.
-def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm),
- (ops (i1 zero_reg), (i32 2))>;
-
-def RndModeOperand : Operand<OtherVT> {
- let MIOperandInfo = (ops i32imm);
-}
-
-// Instruction Types
-let Namespace = "PTX" in {
-
- class InstPTX<dag oops, dag iops, string asmstr, list<dag> pattern>
- : Instruction {
- dag OutOperandList = oops;
- dag InOperandList = !con(iops, (ins pred:$_p));
- let AsmString = asmstr; // Predicate printing is defined elsewhere.
- let Pattern = pattern;
- let isPredicable = 1;
- }
-}
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
deleted file mode 100644
index 443cd54..0000000
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ /dev/null
@@ -1,359 +0,0 @@
-//===-- PTXInstrInfo.cpp - PTX Instruction Information --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the PTX implementation of the TargetInstrInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-instrinfo"
-
-#include "PTXInstrInfo.h"
-#include "PTX.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define GET_INSTRINFO_CTOR
-#include "PTXGenInstrInfo.inc"
-
-using namespace llvm;
-
-PTXInstrInfo::PTXInstrInfo(PTXTargetMachine &_TM)
- : PTXGenInstrInfo(),
- RI(_TM, *this), TM(_TM) {}
-
-static const struct map_entry {
- const TargetRegisterClass *cls;
- const int opcode;
-} map[] = {
- { &PTX::RegI16RegClass, PTX::MOVU16rr },
- { &PTX::RegI32RegClass, PTX::MOVU32rr },
- { &PTX::RegI64RegClass, PTX::MOVU64rr },
- { &PTX::RegF32RegClass, PTX::MOVF32rr },
- { &PTX::RegF64RegClass, PTX::MOVF64rr },
- { &PTX::RegPredRegClass, PTX::MOVPREDrr }
-};
-
-void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DstReg, unsigned SrcReg,
- bool KillSrc) const {
-
- const MachineRegisterInfo& MRI = MBB.getParent()->getRegInfo();
- //assert(MRI.getRegClass(SrcReg) == MRI.getRegClass(DstReg) &&
- // "Invalid register copy between two register classes");
-
- for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i) {
- if (map[i].cls == MRI.getRegClass(DstReg)) {
- const MCInstrDesc &MCID = get(map[i].opcode);
- MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg).
- addReg(SrcReg, getKillRegState(KillSrc));
- AddDefaultPredicate(MI);
- return;
- }
- }
-
- llvm_unreachable("Impossible reg-to-reg copy");
-}
-
-bool PTXInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg,
- const TargetRegisterClass *DstRC,
- const TargetRegisterClass *SrcRC,
- DebugLoc DL) const {
- if (DstRC != SrcRC)
- return false;
-
- for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i)
- if (DstRC == map[i].cls) {
- const MCInstrDesc &MCID = get(map[i].opcode);
- MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg).addReg(SrcReg);
- AddDefaultPredicate(MI);
- return true;
- }
-
- return false;
-}
-
-bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
- switch (MI.getOpcode()) {
- default:
- return false;
- case PTX::MOVU16rr:
- case PTX::MOVU32rr:
- case PTX::MOVU64rr:
- case PTX::MOVF32rr:
- case PTX::MOVF64rr:
- case PTX::MOVPREDrr:
- assert(MI.getNumOperands() >= 2 &&
- MI.getOperand(0).isReg() && MI.getOperand(1).isReg() &&
- "Invalid register-register move instruction");
- SrcSubIdx = DstSubIdx = 0; // No sub-registers
- DstReg = MI.getOperand(0).getReg();
- SrcReg = MI.getOperand(1).getReg();
- return true;
- }
-}
-
-// predicate support
-
-bool PTXInstrInfo::isPredicated(const MachineInstr *MI) const {
- int i = MI->findFirstPredOperandIdx();
- return i != -1 && MI->getOperand(i).getReg() != PTX::NoRegister;
-}
-
-bool PTXInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
- return !isPredicated(MI) && MI->isTerminator();
-}
-
-bool PTXInstrInfo::
-PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const {
- if (Pred.size() < 2)
- llvm_unreachable("fewer than 2 predicate operands provided");
-
- int i = MI->findFirstPredOperandIdx();
- if (i == -1)
- llvm_unreachable("missing predicate operand");
-
- MI->getOperand(i).setReg(Pred[0].getReg());
- MI->getOperand(i+1).setImm(Pred[1].getImm());
-
- return true;
-}
-
-bool PTXInstrInfo::
-SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const {
- const MachineOperand &PredReg1 = Pred1[0];
- const MachineOperand &PredReg2 = Pred2[0];
- if (PredReg1.getReg() != PredReg2.getReg())
- return false;
-
- const MachineOperand &PredOp1 = Pred1[1];
- const MachineOperand &PredOp2 = Pred2[1];
- if (PredOp1.getImm() != PredOp2.getImm())
- return false;
-
- return true;
-}
-
-bool PTXInstrInfo::
-DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- // If an instruction sets a predicate register, it defines a predicate.
-
- // TODO: support the 5-operand format of the setp instruction
-
- if (MI->getNumOperands() < 1)
- return false;
-
- const MachineOperand &MO = MI->getOperand(0);
-
- if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::RegPredRegClass)
- return false;
-
- Pred.push_back(MO);
- Pred.push_back(MachineOperand::CreateImm(PTXPredicate::None));
- return true;
-}
-
-// branch support
-
-bool PTXInstrInfo::
-AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
- // TODO implement cases when AllowModify is true
-
- if (MBB.empty())
- return true;
-
- MachineBasicBlock::iterator iter = MBB.end();
- const MachineInstr& instLast1 = *--iter;
- // for the special case where MBB has only 1 instruction
- const bool IsSizeOne = MBB.size() == 1;
- // if IsSizeOne is true, *--iter is invalid, so we put a dummy value in
- // instLast2, which is still examined below
- const MachineInstr& instLast2 = IsSizeOne ? instLast1 : *--iter;
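- // Worked examples of the four cases handled below (syntax illustrative):
- //   [...; add]                          -> no branch: fall through
- //   [...; bra L1]                       -> unconditional only: TBB = L1
- //   [...; @%p bra L1] + layout succ     -> cond + fall through: TBB = L1
- //   [@%p bra L1; bra L2]                -> cond + uncond: TBB = L1, FBB = L2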
-
- DEBUG(dbgs() << "\n");
- DEBUG(dbgs() << "AnalyzeBranch: opcode: " << instLast1.getOpcode() << "\n");
- DEBUG(dbgs() << "AnalyzeBranch: MBB: " << MBB.getName().str() << "\n");
- DEBUG(dbgs() << "AnalyzeBranch: TBB: " << TBB << "\n");
- DEBUG(dbgs() << "AnalyzeBranch: FBB: " << FBB << "\n");
-
- // this block ends with no branches
- if (!IsAnyKindOfBranch(instLast1)) {
- DEBUG(dbgs() << "AnalyzeBranch: ends with no branch\n");
- return false;
- }
-
- // this block ends with only an unconditional branch
- if (instLast1.isUnconditionalBranch() &&
- // when IsSizeOne is true, the || short-circuits the check of instLast2
- (IsSizeOne || !IsAnyKindOfBranch(instLast2))) {
- DEBUG(dbgs() << "AnalyzeBranch: ends with only uncond branch\n");
- TBB = GetBranchTarget(instLast1);
- return false;
- }
-
- // this block ends with a conditional branch and
- // it falls through to a successor block
- if (instLast1.isConditionalBranch() &&
- IsAnySuccessorAlsoLayoutSuccessor(MBB)) {
- DEBUG(dbgs() << "AnalyzeBranch: ends with cond branch and fall through\n");
- TBB = GetBranchTarget(instLast1);
- int i = instLast1.findFirstPredOperandIdx();
- Cond.push_back(instLast1.getOperand(i));
- Cond.push_back(instLast1.getOperand(i+1));
- return false;
- }
-
- // when IsSizeOne is true, we are done
- if (IsSizeOne)
- return true;
-
- // this block ends with a conditional branch
- // followed by an unconditional branch
- if (instLast2.isConditionalBranch() &&
- instLast1.isUnconditionalBranch()) {
- DEBUG(dbgs() << "AnalyzeBranch: ends with cond and uncond branch\n");
- TBB = GetBranchTarget(instLast2);
- FBB = GetBranchTarget(instLast1);
- int i = instLast2.findFirstPredOperandIdx();
- Cond.push_back(instLast2.getOperand(i));
- Cond.push_back(instLast2.getOperand(i+1));
- return false;
- }
-
- // branch cannot be understood
- DEBUG(dbgs() << "AnalyzeBranch: cannot be understood\n");
- return true;
-}
-
-unsigned PTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
- unsigned count = 0;
- while (!MBB.empty())
- if (IsAnyKindOfBranch(MBB.back())) {
- MBB.pop_back();
- ++count;
- } else
- break;
- DEBUG(dbgs() << "RemoveBranch: MBB: " << MBB.getName().str() << "\n");
- DEBUG(dbgs() << "RemoveBranch: remove " << count << " branch inst\n");
- return count;
-}
-
-unsigned PTXInstrInfo::
-InsertBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond,
- DebugLoc DL) const {
- DEBUG(dbgs() << "InsertBranch: MBB: " << MBB.getName().str() << "\n");
- DEBUG(if (TBB) dbgs() << "InsertBranch: TBB: " << TBB->getName().str()
- << "\n";
- else dbgs() << "InsertBranch: TBB: (NULL)\n");
- DEBUG(if (FBB) dbgs() << "InsertBranch: FBB: " << FBB->getName().str()
- << "\n";
- else dbgs() << "InsertBranch: FBB: (NULL)\n");
- DEBUG(dbgs() << "InsertBranch: Cond size: " << Cond.size() << "\n");
-
- assert(TBB && "TBB is NULL");
-
- if (FBB) {
- BuildMI(&MBB, DL, get(PTX::BRAdp))
- .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(PTX::BRAd))
- .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
- return 2;
- } else if (Cond.size()) {
- BuildMI(&MBB, DL, get(PTX::BRAdp))
- .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm());
- return 1;
- } else {
- BuildMI(&MBB, DL, get(PTX::BRAd))
- .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
- return 1;
- }
-}
-
-// Memory operand folding for spills
-void PTXInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MII,
- unsigned SrcReg, bool isKill, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- llvm_unreachable("storeRegToStackSlot should not be called for PTX");
-}
-
-void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MII,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- llvm_unreachable("loadRegFromStackSlot should not be called for PTX");
-}
-
-// static helper routines
-
-MachineSDNode *PTXInstrInfo::
-GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
- DebugLoc dl, EVT VT, SDValue Op1) {
- SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1);
- SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
- SDValue ops[] = { Op1, predReg, predOp };
- return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops));
-}
-
-MachineSDNode *PTXInstrInfo::
-GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
- DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) {
- SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1);
- SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
- SDValue ops[] = { Op1, Op2, predReg, predOp };
- return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops));
-}
-
-void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) {
- if (MI->findFirstPredOperandIdx() == -1) {
- MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false));
- MI->addOperand(MachineOperand::CreateImm(PTXPredicate::None));
- }
-}
-
-bool PTXInstrInfo::IsAnyKindOfBranch(const MachineInstr& inst) {
- return inst.isTerminator() || inst.isBranch() || inst.isIndirectBranch();
-}
-
-bool PTXInstrInfo::
-IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB) {
- for (MachineBasicBlock::const_succ_iterator
- i = MBB.succ_begin(), e = MBB.succ_end(); i != e; ++i)
- if (MBB.isLayoutSuccessor((const MachineBasicBlock*) &*i))
- return true;
- return false;
-}
-
-MachineBasicBlock *PTXInstrInfo::GetBranchTarget(const MachineInstr& inst) {
- // FIXME: so far, all branch instructions put the destination in the first operand
- const MachineOperand& target = inst.getOperand(0);
- assert(target.isMBB() && "FIXME: detect branch target operand");
- return target.getMBB();
-}
diff --git a/lib/Target/PTX/PTXInstrInfo.h b/lib/Target/PTX/PTXInstrInfo.h
deleted file mode 100644
index fba89c0..0000000
--- a/lib/Target/PTX/PTXInstrInfo.h
+++ /dev/null
@@ -1,133 +0,0 @@
-//===-- PTXInstrInfo.h - PTX Instruction Information ------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the PTX implementation of the TargetInstrInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_INSTR_INFO_H
-#define PTX_INSTR_INFO_H
-
-#include "PTXRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-#define GET_INSTRINFO_HEADER
-#include "PTXGenInstrInfo.inc"
-
-namespace llvm {
-class PTXTargetMachine;
-
-class MachineSDNode;
-class SDValue;
-class SelectionDAG;
-
-class PTXInstrInfo : public PTXGenInstrInfo {
-private:
- const PTXRegisterInfo RI;
- PTXTargetMachine &TM;
-
-public:
- explicit PTXInstrInfo(PTXTargetMachine &_TM);
-
- virtual const PTXRegisterInfo &getRegisterInfo() const { return RI; }
-
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DstReg, unsigned SrcReg,
- bool KillSrc) const;
-
- virtual bool copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg,
- const TargetRegisterClass *DstRC,
- const TargetRegisterClass *SrcRC,
- DebugLoc DL) const;
-
- virtual bool isMoveInstr(const MachineInstr& MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
-
- // predicate support
-
- virtual bool isPredicated(const MachineInstr *MI) const;
-
- virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const;
-
- virtual
- bool PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const;
-
- virtual
- bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const;
-
- virtual bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const;
-
- // PTX is fully-predicable
- virtual bool isPredicable(MachineInstr *MI) const { return true; }
-
- // branch support
-
- virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify = false) const;
-
- virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-
- virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond,
- DebugLoc DL) const;
-
- // Memory operand folding for spills
- // TODO: Implement this eventually and get rid of storeRegToStackSlot and
- // loadRegFromStackSlot. Doing so will get rid of the "stack" registers
- // we currently use to spill, though I doubt the overall effect on ptxas
- // output will be large. I have yet to see a case where ptxas is unable
- // to see through the "stack" register usage and hence generates
- // efficient code anyway.
- // virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- // MachineInstr* MI,
- // const SmallVectorImpl<unsigned> &Ops,
- // int FrameIndex) const;
-
- virtual void storeRegToStackSlot(MachineBasicBlock& MBB,
- MachineBasicBlock::iterator MII,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass* RC,
- const TargetRegisterInfo* TRI) const;
- virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MII,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
-
- // static helper routines
-
- static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
- DebugLoc dl, EVT VT,
- SDValue Op1);
-
- static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
- DebugLoc dl, EVT VT,
- SDValue Op1, SDValue Op2);
-
- static void AddDefaultPredicate(MachineInstr *MI);
-
- static bool IsAnyKindOfBranch(const MachineInstr& inst);
-
- static bool IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB);
-
- static MachineBasicBlock *GetBranchTarget(const MachineInstr& inst);
-}; // class PTXInstrInfo
-} // namespace llvm
-
-#endif // PTX_INSTR_INFO_H
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td
deleted file mode 100644
index bead428..0000000
--- a/lib/Target/PTX/PTXInstrInfo.td
+++ /dev/null
@@ -1,1031 +0,0 @@
-//===-- PTXInstrInfo.td - PTX Instruction defs --------------*- tablegen-*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the PTX instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Instruction format superclass
-//===----------------------------------------------------------------------===//
-
-include "PTXInstrFormats.td"
-
-//===----------------------------------------------------------------------===//
-// Code Generation Predicates
-//===----------------------------------------------------------------------===//
-
-// Shader Model Support
-def FDivNeedsRoundingMode : Predicate<"getSubtarget().fdivNeedsRoundingMode()">;
-def FDivNoRoundingMode : Predicate<"!getSubtarget().fdivNeedsRoundingMode()">;
-def FMadNeedsRoundingMode : Predicate<"getSubtarget().fmadNeedsRoundingMode()">;
-def FMadNoRoundingMode : Predicate<"!getSubtarget().fmadNeedsRoundingMode()">;
-
-// PTX Version Support
-def SupportsPTX21 : Predicate<"getSubtarget().supportsPTX21()">;
-def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">;
-def SupportsPTX22 : Predicate<"getSubtarget().supportsPTX22()">;
-def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">;
-def SupportsPTX23 : Predicate<"getSubtarget().supportsPTX23()">;
-def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">;
-
-// Fused-Multiply Add
-def SupportsFMA : Predicate<"getSubtarget().supportsFMA()">;
-def DoesNotSupportFMA : Predicate<"!getSubtarget().supportsFMA()">;
-
-
-
-// def SDT_PTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
-// def SDT_PTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-
-// def PTXcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PTXCallSeqStart,
-// [SDNPHasChain, SDNPOutGlue]>;
-// def PTXcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PTXCallSeqEnd,
-// [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-
-def PTXcall : SDNode<"PTXISD::CALL", SDTNone,
- [SDNPHasChain, SDNPVariadic, SDNPOptInGlue, SDNPOutGlue]>;
-
-
-// Branch targets have OtherVT type; call targets are i32.
-def brtarget : Operand<OtherVT>;
-def calltarget : Operand<i32>;
-
-//===----------------------------------------------------------------------===//
-// PTX Specific Node Definitions
-//===----------------------------------------------------------------------===//
-
-// PTX allows generic 3-register shifts, e.g. shl r0, r1, r2
-def PTXshl : SDNode<"ISD::SHL", SDTIntBinOp>;
-def PTXsrl : SDNode<"ISD::SRL", SDTIntBinOp>;
-def PTXsra : SDNode<"ISD::SRA", SDTIntBinOp>;
-
-def PTXexit
- : SDNode<"PTXISD::EXIT", SDTNone, [SDNPHasChain]>;
-def PTXret
- : SDNode<"PTXISD::RET", SDTNone,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def PTXcopyaddress
- : SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>;
-
-
-
-//===----------------------------------------------------------------------===//
-// Instruction Class Templates
-//===----------------------------------------------------------------------===//
-
-// For floating-point instructions, we cannot just embed the pattern into the
-// instruction definition since we need to muck around with the rounding mode,
-// and I do not know how to insert constants into instructions directly from
-// pattern matches.
-
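-// A sketch of how such a separately-defined pattern might look (assumed,
-// not part of this file; FADDrr32 comes from the FADD instantiation of
-// PTX_FLOAT_3OP below, and RndDefault is the PatLeaf defined in
-// PTXInstrFormats.td):
-//def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)),
-//          (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>;
-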
-//===- Floating-Point Instructions - 2 Operand Form -----------------------===//
-multiclass PTX_FLOAT_2OP<string opcstr> {
- def rr32 : InstPTX<(outs RegF32:$d),
- (ins RndMode:$r, RegF32:$a),
- !strconcat(opcstr, "$r.f32\t$d, $a"), []>;
- def ri32 : InstPTX<(outs RegF32:$d),
- (ins RndMode:$r, f32imm:$a),
- !strconcat(opcstr, "$r.f32\t$d, $a"), []>;
- def rr64 : InstPTX<(outs RegF64:$d),
- (ins RndMode:$r, RegF64:$a),
- !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
- def ri64 : InstPTX<(outs RegF64:$d),
- (ins RndMode:$r, f64imm:$a),
- !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
-}
-
-//===- Floating-Point Instructions - 3 Operand Form -----------------------===//
-multiclass PTX_FLOAT_3OP<string opcstr> {
- def rr32 : InstPTX<(outs RegF32:$d),
- (ins RndMode:$r, RegF32:$a, RegF32:$b),
- !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>;
- def ri32 : InstPTX<(outs RegF32:$d),
- (ins RndMode:$r, RegF32:$a, f32imm:$b),
- !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>;
- def rr64 : InstPTX<(outs RegF64:$d),
- (ins RndMode:$r, RegF64:$a, RegF64:$b),
- !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
- def ri64 : InstPTX<(outs RegF64:$d),
- (ins RndMode:$r, RegF64:$a, f64imm:$b),
- !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
-}
-
-//===- Floating-Point Instructions - 4 Operand Form -----------------------===//
-multiclass PTX_FLOAT_4OP<string opcstr> {
- def rrr32 : InstPTX<(outs RegF32:$d),
- (ins RndMode:$r, RegF32:$a, RegF32:$b, RegF32:$c),
- !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
- def rri32 : InstPTX<(outs RegF32:$d),
- (ins RndMode:$r, RegF32:$a, RegF32:$b, f32imm:$c),
- !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
- def rii32 : InstPTX<(outs RegF32:$d),
- (ins RndMode:$r, RegF32:$a, f32imm:$b, f32imm:$c),
- !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
- def rrr64 : InstPTX<(outs RegF64:$d),
- (ins RndMode:$r, RegF64:$a, RegF64:$b, RegF64:$c),
- !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
- def rri64 : InstPTX<(outs RegF64:$d),
- (ins RndMode:$r, RegF64:$a, RegF64:$b, f64imm:$c),
- !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
- def rii64 : InstPTX<(outs RegF64:$d),
- (ins RndMode:$r, RegF64:$a, f64imm:$b, f64imm:$c),
- !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
-}
-
-//===- Integer Instructions - 3 Operand Form ------------------------------===//
-multiclass PTX_INT3<string opcstr, SDNode opnode> {
- def rr16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, RegI16:$b),
- !strconcat(opcstr, ".u16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
- def ri16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, i16imm:$b),
- !strconcat(opcstr, ".u16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
- def rr32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, RegI32:$b),
- !strconcat(opcstr, ".u32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
- def ri32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, i32imm:$b),
- !strconcat(opcstr, ".u32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
- def rr64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, RegI64:$b),
- !strconcat(opcstr, ".u64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
- def ri64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, i64imm:$b),
- !strconcat(opcstr, ".u64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
-}
-
-//===- Integer Instructions - 3 Operand Form (Signed) ---------------------===//
-multiclass PTX_INT3_SIGNED<string opcstr, SDNode opnode> {
- def rr16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, RegI16:$b),
- !strconcat(opcstr, ".s16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
- def ri16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, i16imm:$b),
- !strconcat(opcstr, ".s16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
- def rr32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, RegI32:$b),
- !strconcat(opcstr, ".s32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
- def ri32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, i32imm:$b),
- !strconcat(opcstr, ".s32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
- def rr64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, RegI64:$b),
- !strconcat(opcstr, ".s64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
- def ri64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, i64imm:$b),
- !strconcat(opcstr, ".s64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
-}
-
-//===- Bitwise Logic Instructions - 3 Operand Form ------------------------===//
-multiclass PTX_LOGIC<string opcstr, SDNode opnode> {
- def ripreds : InstPTX<(outs RegPred:$d),
- (ins RegPred:$a, i1imm:$b),
- !strconcat(opcstr, ".pred\t$d, $a, $b"),
- [(set RegPred:$d, (opnode RegPred:$a, imm:$b))]>;
- def rrpreds : InstPTX<(outs RegPred:$d),
- (ins RegPred:$a, RegPred:$b),
- !strconcat(opcstr, ".pred\t$d, $a, $b"),
- [(set RegPred:$d, (opnode RegPred:$a, RegPred:$b))]>;
- def rr16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, RegI16:$b),
- !strconcat(opcstr, ".b16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
- def ri16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, i16imm:$b),
- !strconcat(opcstr, ".b16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
- def rr32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, RegI32:$b),
- !strconcat(opcstr, ".b32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
- def ri32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, i32imm:$b),
- !strconcat(opcstr, ".b32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
- def rr64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, RegI64:$b),
- !strconcat(opcstr, ".b64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
- def ri64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, i64imm:$b),
- !strconcat(opcstr, ".b64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
-}
-
-//===- Integer Shift Instructions - 3 Operand Form ------------------------===//
-multiclass PTX_INT3ntnc<string opcstr, SDNode opnode> {
- def rr16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, RegI16:$b),
- !strconcat(opcstr, "16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
- def rr32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, RegI32:$b),
- !strconcat(opcstr, "32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
- def rr64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, RegI64:$b),
- !strconcat(opcstr, "64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
- def ri16 : InstPTX<(outs RegI16:$d),
- (ins RegI16:$a, i16imm:$b),
- !strconcat(opcstr, "16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
- def ri32 : InstPTX<(outs RegI32:$d),
- (ins RegI32:$a, i32imm:$b),
- !strconcat(opcstr, "32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
- def ri64 : InstPTX<(outs RegI64:$d),
- (ins RegI64:$a, i64imm:$b),
- !strconcat(opcstr, "64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
- def ir16 : InstPTX<(outs RegI16:$d),
- (ins i16imm:$a, RegI16:$b),
- !strconcat(opcstr, "16\t$d, $a, $b"),
- [(set RegI16:$d, (opnode imm:$a, RegI16:$b))]>;
- def ir32 : InstPTX<(outs RegI32:$d),
- (ins i32imm:$a, RegI32:$b),
- !strconcat(opcstr, "32\t$d, $a, $b"),
- [(set RegI32:$d, (opnode imm:$a, RegI32:$b))]>;
- def ir64 : InstPTX<(outs RegI64:$d),
- (ins i64imm:$a, RegI64:$b),
- !strconcat(opcstr, "64\t$d, $a, $b"),
- [(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>;
-}
-
-//===- Set Predicate Instructions (Int) - 3/4 Operand Forms ---------------===//
-multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls,
- CondCode cmp, string cmpstr> {
- // TODO support 5-operand format: p|q, a, b, c
-
- def rr
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
- !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
- [(set RegPred:$p, (setcc RC:$a, RC:$b, cmp))]>;
- def ri
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
- !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
- [(set RegPred:$p, (setcc RC:$a, imm:$b, cmp))]>;
-
- def rr_and_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
- def ri_and_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
- RegPred:$c))]>;
- def rr_or_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
- def ri_or_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
- def rr_xor_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
- def ri_xor_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
- RegPred:$c))]>;
-
- def rr_and_not_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp),
- (not RegPred:$c)))]>;
- def ri_and_not_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
- (not RegPred:$c)))]>;
- def rr_or_not_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp),
- (not RegPred:$c)))]>;
- def ri_or_not_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp),
- (not RegPred:$c)))]>;
- def rr_xor_not_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp),
- (not RegPred:$c)))]>;
- def ri_xor_not_r
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
- (not RegPred:$c)))]>;
-}
-
-//===- Set Predicate Instructions (FP) - 3/4 Operand Form -----------------===//
-multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, Operand immcls,
- CondCode ucmp, CondCode ocmp, string cmpstr> {
- // TODO support 5-operand format: p|q, a, b, c
-
- def rr_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
- !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"),
- [(set RegPred:$p, (setcc RC:$a, RC:$b, ucmp))]>;
- def rr_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
- !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
- [(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>;
-
- def ri_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
- !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"),
- [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ucmp))]>;
- def ri_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
- !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
- [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ocmp))]>;
-
- def rr_and_r_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.and.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
- RegPred:$c))]>;
- def rr_and_r_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
- RegPred:$c))]>;
-
- def rr_or_r_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.or.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
- def rr_or_r_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
-
- def rr_xor_r_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.xor.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
- RegPred:$c))]>;
- def rr_xor_r_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname,
- "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
- RegPred:$c))]>;
-
- def rr_and_not_r_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.and.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
- (not RegPred:$c)))]>;
- def rr_and_not_r_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
- (not RegPred:$c)))]>;
-
- def rr_or_not_r_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.or.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp),
- (not RegPred:$c)))]>;
- def rr_or_not_r_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp),
- (not RegPred:$c)))]>;
-
- def rr_xor_not_r_u
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.xor.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
- (not RegPred:$c)))]>;
- def rr_xor_not_r_o
- : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname,
- "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
- (not RegPred:$c)))]>;
-}
-
-//===- Select Predicate Instructions - 4 Operand Form ---------------------===//
-multiclass PTX_SELP<RegisterClass RC, string regclsname, Operand immcls,
- SDNode immnode> {
- def rr
- : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c),
- !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
- [(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>;
- def ri
- : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, immcls:$c),
- !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
- [(set RC:$r, (select RegPred:$a, RC:$b, immnode:$c))]>;
- def ii
- : InstPTX<(outs RC:$r), (ins RegPred:$a, immcls:$b, immcls:$c),
- !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
- [(set RC:$r, (select RegPred:$a, immnode:$b, immnode:$c))]>;
-}
-
-
-
-//===----------------------------------------------------------------------===//
-// Instructions
-//===----------------------------------------------------------------------===//
-
-///===- Integer Arithmetic Instructions -----------------------------------===//
-
-defm ADD : PTX_INT3<"add", add>;
-defm SUB : PTX_INT3<"sub", sub>;
-defm MUL : PTX_INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies
-defm DIV : PTX_INT3<"div", udiv>;
-defm SDIV : PTX_INT3_SIGNED<"div", sdiv>;
-defm REM : PTX_INT3<"rem", urem>;
-
-///===- Floating-Point Arithmetic Instructions ----------------------------===//
-
-// FNEG
-defm FNEG : PTX_FLOAT_2OP<"neg">;
-
-// Standard Binary Operations
-defm FADD : PTX_FLOAT_3OP<"add">;
-defm FSUB : PTX_FLOAT_3OP<"sub">;
-defm FMUL : PTX_FLOAT_3OP<"mul">;
-defm FDIV : PTX_FLOAT_3OP<"div">;
-
-// Multi-operation hybrid instructions
-defm FMAD : PTX_FLOAT_4OP<"mad">, Requires<[SupportsFMA]>;
-
-
-///===- Floating-Point Intrinsic Instructions -----------------------------===//
-
-// SQRT
-def FSQRTrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
- "sqrt$r.f32\t$d, $a", []>;
-def FSQRTri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
- "sqrt$r.f32\t$d, $a", []>;
-def FSQRTrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
- "sqrt$r.f64\t$d, $a", []>;
-def FSQRTri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
- "sqrt$r.f64\t$d, $a", []>;
-
-// SIN
-def FSINrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
- "sin$r.f32\t$d, $a", []>;
-def FSINri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
- "sin$r.f32\t$d, $a", []>;
-def FSINrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
- "sin$r.f64\t$d, $a", []>;
-def FSINri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
- "sin$r.f64\t$d, $a", []>;
-
-// COS
-def FCOSrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
- "cos$r.f32\t$d, $a", []>;
-def FCOSri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
- "cos$r.f32\t$d, $a", []>;
-def FCOSrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
- "cos$r.f64\t$d, $a", []>;
-def FCOSri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
- "cos$r.f64\t$d, $a", []>;
-
-
-
-
-///===- Comparison and Selection Instructions -----------------------------===//
-
-// .setp
-
-// Compare u16
-
-defm SETPEQu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETEQ, "eq">;
-defm SETPNEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETNE, "ne">;
-defm SETPLTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULT, "lt">;
-defm SETPLEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULE, "le">;
-defm SETPGTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGT, "gt">;
-defm SETPGEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGE, "ge">;
-defm SETPLTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLT, "lt">;
-defm SETPLEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLE, "le">;
-defm SETPGTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGT, "gt">;
-defm SETPGEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGE, "ge">;
-
-// Compare u32
-
-defm SETPEQu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETEQ, "eq">;
-defm SETPNEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETNE, "ne">;
-defm SETPLTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULT, "lt">;
-defm SETPLEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULE, "le">;
-defm SETPGTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGT, "gt">;
-defm SETPGEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGE, "ge">;
-defm SETPLTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLT, "lt">;
-defm SETPLEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLE, "le">;
-defm SETPGTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGT, "gt">;
-defm SETPGEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGE, "ge">;
-
-// Compare u64
-
-defm SETPEQu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETEQ, "eq">;
-defm SETPNEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETNE, "ne">;
-defm SETPLTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULT, "lt">;
-defm SETPLEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULE, "le">;
-defm SETPGTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGT, "gt">;
-defm SETPGEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGE, "ge">;
-defm SETPLTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLT, "lt">;
-defm SETPLEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLE, "le">;
-defm SETPGTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGT, "gt">;
-defm SETPGEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGE, "ge">;
-
-// Compare f32
-
-defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUEQ, SETOEQ, "eq">;
-defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUNE, SETONE, "ne">;
-defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULT, SETOLT, "lt">;
-defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULE, SETOLE, "le">;
-defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGT, SETOGT, "gt">;
-defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGE, SETOGE, "ge">;
-
-// Compare f64
-
-defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUEQ, SETOEQ, "eq">;
-defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUNE, SETONE, "ne">;
-defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULT, SETOLT, "lt">;
-defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULE, SETOLE, "le">;
-defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGT, SETOGT, "gt">;
-defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGE, SETOGE, "ge">;
-
-// .selp
-
-defm SELPi16 : PTX_SELP<RegI16, "u16", i16imm, imm>;
-defm SELPi32 : PTX_SELP<RegI32, "u32", i32imm, imm>;
-defm SELPi64 : PTX_SELP<RegI64, "u64", i64imm, imm>;
-defm SELPf32 : PTX_SELP<RegF32, "f32", f32imm, fpimm>;
-defm SELPf64 : PTX_SELP<RegF64, "f64", f64imm, fpimm>;
-
-///===- Logic and Shift Instructions --------------------------------------===//
-
-defm SHL : PTX_INT3ntnc<"shl.b", PTXshl>;
-defm SRL : PTX_INT3ntnc<"shr.u", PTXsrl>;
-defm SRA : PTX_INT3ntnc<"shr.s", PTXsra>;
-
-defm AND : PTX_LOGIC<"and", and>;
-defm OR : PTX_LOGIC<"or", or>;
-defm XOR : PTX_LOGIC<"xor", xor>;
-
-///===- Data Movement and Conversion Instructions -------------------------===//
-
-// any_extend
-// anyext could be implemented in terms of the PTX cvt instructions, but the
-// patterns are currently disabled:
-//def : Pat<(i32 (anyext RegI16:$a)), (CVT_u32_u16 RegI16:$a)>;
-//def : Pat<(i64 (anyext RegI16:$a)), (CVT_u64_u16 RegI16:$a)>;
-//def : Pat<(i64 (anyext RegI32:$a)), (CVT_u64_u32 RegI32:$a)>;
-
-// bitconvert
-// These instructions implement the bit-wise conversion between integer and
-// floating-point types.
-def MOVi32f32
- : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "mov.b32\t$d, $a", []>;
-def MOVf32i32
- : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "mov.b32\t$d, $a", []>;
-def MOVi64f64
- : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "mov.b64\t$d, $a", []>;
-def MOVf64i64
- : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "mov.b64\t$d, $a", []>;
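-
-// A sketch of the selection patterns these moves are meant to back
-// (assumed, not present in this file):
-//def : Pat<(i32 (bitconvert RegF32:$a)), (MOVi32f32 RegF32:$a)>;
-//def : Pat<(f32 (bitconvert RegI32:$a)), (MOVf32i32 RegI32:$a)>;
-//def : Pat<(i64 (bitconvert RegF64:$a)), (MOVi64f64 RegF64:$a)>;
-//def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>;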
-
-let neverHasSideEffects = 1 in {
- def MOVPREDrr
- : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>;
- def MOVU16rr
- : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>;
- def MOVU32rr
- : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>;
- def MOVU64rr
- : InstPTX<(outs RegI64:$d), (ins RegI64:$a), "mov.u64\t$d, $a", []>;
- def MOVF32rr
- : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>;
- def MOVF64rr
- : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>;
-}
-
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
- def MOVPREDri
- : InstPTX<(outs RegPred:$d), (ins i1imm:$a), "mov.pred\t$d, $a",
- [(set RegPred:$d, imm:$a)]>;
- def MOVU16ri
- : InstPTX<(outs RegI16:$d), (ins i16imm:$a), "mov.u16\t$d, $a",
- [(set RegI16:$d, imm:$a)]>;
- def MOVU32ri
- : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
- [(set RegI32:$d, imm:$a)]>;
- def MOVU64ri
- : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
- [(set RegI64:$d, imm:$a)]>;
- def MOVF32ri
- : InstPTX<(outs RegF32:$d), (ins f32imm:$a), "mov.f32\t$d, $a",
- [(set RegF32:$d, fpimm:$a)]>;
- def MOVF64ri
- : InstPTX<(outs RegF64:$d), (ins f64imm:$a), "mov.f64\t$d, $a",
- [(set RegF64:$d, fpimm:$a)]>;
-}
-
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
- def MOVaddr32
- : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
- [(set RegI32:$d, (PTXcopyaddress tglobaladdr:$a))]>;
- def MOVaddr64
- : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
- [(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>;
- def MOVframe32
- : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "cvta.local.u32\t$d, $a",
- [(set RegI32:$d, (PTXcopyaddress frameindex:$a))]>;
- def MOVframe64
- : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "cvta.local.u64\t$d, $a",
- [(set RegI64:$d, (PTXcopyaddress frameindex:$a))]>;
-}
-
-// PTX cvt instructions
-// Note that not all of these may actually be used; we just define all of the
-// possible patterns here (the ones that make sense).
-// FIXME: Can we collapse this somehow into a multiclass def?
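-// One possible shape (an untested sketch; these names are hypothetical and
-// were never part of this backend): parameterize a multiclass on the
-// destination type string and register class, then instantiate it once per
-// destination type.
-//multiclass PTX_CVT_TO<string dstt, RegisterClass DstRC> {
-//  def _u32 : InstPTX<(outs DstRC:$d), (ins RndMode:$r, RegI32:$a),
-//                     !strconcat("cvt$r.", !strconcat(dstt, ".u32\t$d, $a")),
-//                     []>;
-//  // ...and similarly for the remaining source types...
-//}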
-
-// To i16
-def CVTu16u32
- : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", []>;
-def CVTu16u64
- : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a", []>;
-def CVTu16f32
- : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
- "cvt$r.u16.f32\t$d, $a", []>;
-def CVTs16f32
- : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
- "cvt$r.s16.f32\t$d, $a", []>;
-def CVTu16f64
- : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
- "cvt$r.u16.f64\t$d, $a", []>;
-def CVTs16f64
- : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
- "cvt$r.s16.f64\t$d, $a", []>;
-
-// To i32
-def CVTu32u16
- : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", []>;
-def CVTs32s16
- : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.s32.s16\t$d, $a", []>;
-def CVTu32u64
- : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", []>;
-def CVTu32f32
- : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
- "cvt$r.u32.f32\t$d, $a", []>;
-def CVTs32f32
- : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
- "cvt$r.s32.f32\t$d, $a", []>;
-def CVTu32f64
- : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
- "cvt$r.u32.f64\t$d, $a", []>;
-def CVTs32f64
- : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
- "cvt$r.s32.f64\t$d, $a", []>;
-
-// To i64
-def CVTu64u16
- : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", []>;
-def CVTs64s16
- : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.s64.s16\t$d, $a", []>;
-def CVTu64u32
- : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", []>;
-def CVTs64s32
- : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.s64.s32\t$d, $a", []>;
-def CVTu64f32
- : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
- "cvt$r.u64.f32\t$d, $a", []>;
-def CVTs64f32
- : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
- "cvt$r.s64.f32\t$d, $a", []>;
-def CVTu64f64
- : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
- "cvt$r.u64.f64\t$d, $a", []>;
-def CVTs64f64
- : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
- "cvt$r.s64.f64\t$d, $a", []>;
-
-// To f32
-def CVTf32u16
- : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
- "cvt$r.f32.u16\t$d, $a", []>;
-def CVTf32s16
- : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
- "cvt$r.f32.s16\t$d, $a", []>;
-def CVTf32u32
- : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
- "cvt$r.f32.u32\t$d, $a", []>;
-def CVTf32s32
- : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
- "cvt$r.f32.s32\t$d, $a", []>;
-def CVTf32u64
- : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
- "cvt$r.f32.u64\t$d, $a", []>;
-def CVTf32s64
- : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
- "cvt$r.f32.s64\t$d, $a", []>;
-def CVTf32f64
- : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF64:$a),
- "cvt$r.f32.f64\t$d, $a", []>;
-
-// To f64
-def CVTf64u16
- : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
- "cvt$r.f64.u16\t$d, $a", []>;
-def CVTf64s16
- : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
- "cvt$r.f64.s16\t$d, $a", []>;
-def CVTf64u32
- : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
- "cvt$r.f64.u32\t$d, $a", []>;
-def CVTf64s32
- : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
- "cvt$r.f64.s32\t$d, $a", []>;
-def CVTf64u64
- : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
- "cvt$r.f64.u64\t$d, $a", []>;
-def CVTf64s64
- : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
- "cvt$r.f64.s64\t$d, $a", []>;
-def CVTf64f32
- : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a", []>;
-
-///===- Control Flow Instructions ------------------------------------------===//
-
-let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
- def BRAd
- : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>;
-}
-
-let isBranch = 1, isTerminator = 1 in {
- // FIXME: The pattern part is blank because I cannot (or do not yet know
-  // how to) use the first operand of PredicateOperand (a RegPred register) here.
- // When this is revisited, make sure to also look at LowerSETCC and try to
- // fold it into negated predicates, if possible.
- def BRAdp
- : InstPTX<(outs), (ins brtarget:$d), "bra\t$d",
- [/*(brcond pred:$_p, bb:$d)*/]>;
-}
-
-let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
- def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>;
- def RET : InstPTX<(outs), (ins), "ret", [(PTXret)]>;
-}
-
-let hasSideEffects = 1 in {
- def CALL : InstPTX<(outs), (ins), "call", [(PTXcall)]>;
-}
-
-///===- Parameter Passing Pseudo-Instructions -----------------------------===//
-
-def READPARAMPRED : InstPTX<(outs RegPred:$a), (ins i32imm:$b),
- "mov.pred\t$a, %arg$b", []>;
-def READPARAMI16 : InstPTX<(outs RegI16:$a), (ins i32imm:$b),
- "mov.b16\t$a, %arg$b", []>;
-def READPARAMI32 : InstPTX<(outs RegI32:$a), (ins i32imm:$b),
- "mov.b32\t$a, %arg$b", []>;
-def READPARAMI64 : InstPTX<(outs RegI64:$a), (ins i32imm:$b),
- "mov.b64\t$a, %arg$b", []>;
-def READPARAMF32 : InstPTX<(outs RegF32:$a), (ins i32imm:$b),
- "mov.f32\t$a, %arg$b", []>;
-def READPARAMF64 : InstPTX<(outs RegF64:$a), (ins i32imm:$b),
- "mov.f64\t$a, %arg$b", []>;
-
-def WRITEPARAMPRED : InstPTX<(outs), (ins RegPred:$a), "//w", []>;
-def WRITEPARAMI16 : InstPTX<(outs), (ins RegI16:$a), "//w", []>;
-def WRITEPARAMI32 : InstPTX<(outs), (ins RegI32:$a), "//w", []>;
-def WRITEPARAMI64 : InstPTX<(outs), (ins RegI64:$a), "//w", []>;
-def WRITEPARAMF32 : InstPTX<(outs), (ins RegF32:$a), "//w", []>;
-def WRITEPARAMF64 : InstPTX<(outs), (ins RegF64:$a), "//w", []>;
-
-
-//===----------------------------------------------------------------------===//
-// Instruction Selection Patterns
-//===----------------------------------------------------------------------===//
-
-// FADD
-def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)),
- (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>;
-def : Pat<(f32 (fadd RegF32:$a, fpimm:$b)),
- (FADDri32 RndDefault, RegF32:$a, fpimm:$b)>;
-def : Pat<(f64 (fadd RegF64:$a, RegF64:$b)),
- (FADDrr64 RndDefault, RegF64:$a, RegF64:$b)>;
-def : Pat<(f64 (fadd RegF64:$a, fpimm:$b)),
- (FADDri64 RndDefault, RegF64:$a, fpimm:$b)>;
-
-// FSUB
-def : Pat<(f32 (fsub RegF32:$a, RegF32:$b)),
- (FSUBrr32 RndDefault, RegF32:$a, RegF32:$b)>;
-def : Pat<(f32 (fsub RegF32:$a, fpimm:$b)),
- (FSUBri32 RndDefault, RegF32:$a, fpimm:$b)>;
-def : Pat<(f64 (fsub RegF64:$a, RegF64:$b)),
- (FSUBrr64 RndDefault, RegF64:$a, RegF64:$b)>;
-def : Pat<(f64 (fsub RegF64:$a, fpimm:$b)),
- (FSUBri64 RndDefault, RegF64:$a, fpimm:$b)>;
-
-// FMUL
-def : Pat<(f32 (fmul RegF32:$a, RegF32:$b)),
- (FMULrr32 RndDefault, RegF32:$a, RegF32:$b)>;
-def : Pat<(f32 (fmul RegF32:$a, fpimm:$b)),
- (FMULri32 RndDefault, RegF32:$a, fpimm:$b)>;
-def : Pat<(f64 (fmul RegF64:$a, RegF64:$b)),
- (FMULrr64 RndDefault, RegF64:$a, RegF64:$b)>;
-def : Pat<(f64 (fmul RegF64:$a, fpimm:$b)),
- (FMULri64 RndDefault, RegF64:$a, fpimm:$b)>;
-
-// FDIV
-def : Pat<(f32 (fdiv RegF32:$a, RegF32:$b)),
- (FDIVrr32 RndDefault, RegF32:$a, RegF32:$b)>;
-def : Pat<(f32 (fdiv RegF32:$a, fpimm:$b)),
- (FDIVri32 RndDefault, RegF32:$a, fpimm:$b)>;
-def : Pat<(f64 (fdiv RegF64:$a, RegF64:$b)),
- (FDIVrr64 RndDefault, RegF64:$a, RegF64:$b)>;
-def : Pat<(f64 (fdiv RegF64:$a, fpimm:$b)),
- (FDIVri64 RndDefault, RegF64:$a, fpimm:$b)>;
-
-// FMUL+FADD
-def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), RegF32:$c)),
- (FMADrrr32 RndDefault, RegF32:$a, RegF32:$b, RegF32:$c)>,
- Requires<[SupportsFMA]>;
-def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)),
- (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>,
- Requires<[SupportsFMA]>;
-def : Pat<(f32 (fadd (fmul RegF32:$a, fpimm:$b), fpimm:$c)),
-          (FMADrii32 RndDefault, RegF32:$a, fpimm:$b, fpimm:$c)>,
-          Requires<[SupportsFMA]>;
-def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), RegF64:$c)),
- (FMADrrr64 RndDefault, RegF64:$a, RegF64:$b, RegF64:$c)>,
- Requires<[SupportsFMA]>;
-def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), fpimm:$c)),
- (FMADrri64 RndDefault, RegF64:$a, RegF64:$b, fpimm:$c)>,
- Requires<[SupportsFMA]>;
-def : Pat<(f64 (fadd (fmul RegF64:$a, fpimm:$b), fpimm:$c)),
-          (FMADrii64 RndDefault, RegF64:$a, fpimm:$b, fpimm:$c)>,
-          Requires<[SupportsFMA]>;
-
-// FNEG
-def : Pat<(f32 (fneg RegF32:$a)), (FNEGrr32 RndDefault, RegF32:$a)>;
-def : Pat<(f32 (fneg fpimm:$a)), (FNEGri32 RndDefault, fpimm:$a)>;
-def : Pat<(f64 (fneg RegF64:$a)), (FNEGrr64 RndDefault, RegF64:$a)>;
-def : Pat<(f64 (fneg fpimm:$a)), (FNEGri64 RndDefault, fpimm:$a)>;
-
-// FSQRT
-def : Pat<(f32 (fsqrt RegF32:$a)), (FSQRTrr32 RndDefault, RegF32:$a)>;
-def : Pat<(f32 (fsqrt fpimm:$a)), (FSQRTri32 RndDefault, fpimm:$a)>;
-def : Pat<(f64 (fsqrt RegF64:$a)), (FSQRTrr64 RndDefault, RegF64:$a)>;
-def : Pat<(f64 (fsqrt fpimm:$a)), (FSQRTri64 RndDefault, fpimm:$a)>;
-
-// FSIN
-def : Pat<(f32 (fsin RegF32:$a)), (FSINrr32 RndDefault, RegF32:$a)>;
-def : Pat<(f32 (fsin fpimm:$a)), (FSINri32 RndDefault, fpimm:$a)>;
-def : Pat<(f64 (fsin RegF64:$a)), (FSINrr64 RndDefault, RegF64:$a)>;
-def : Pat<(f64 (fsin fpimm:$a)), (FSINri64 RndDefault, fpimm:$a)>;
-
-// FCOS
-def : Pat<(f32 (fcos RegF32:$a)), (FCOSrr32 RndDefault, RegF32:$a)>;
-def : Pat<(f32 (fcos fpimm:$a)), (FCOSri32 RndDefault, fpimm:$a)>;
-def : Pat<(f64 (fcos RegF64:$a)), (FCOSrr64 RndDefault, RegF64:$a)>;
-def : Pat<(f64 (fcos fpimm:$a)), (FCOSri64 RndDefault, fpimm:$a)>;
-
-// Type conversion notes:
-// - PTX does not directly support converting a predicate to a value, so we
-// use a select instruction to select either 0 or 1 (integer or fp) based
-// on the truth value of the predicate.
-// - PTX does not directly support converting to a predicate type, so we fake it
-// by performing a greater-than test between the value and zero. This follows
-// the C convention that any non-zero value is equivalent to 'true'.
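-//
-// For example, an i1-to-i32 zext below selects SELPi32ii (roughly
-// "selp.u32 $d, 1, 0, $a"), and an i32-to-i1 trunc selects SETPGTu32ri
-// (roughly "setp.gt.u32 $p, $a, 0").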
-
-// Conversion to pred
-def : Pat<(i1 (trunc RegI16:$a)), (SETPGTu16ri RegI16:$a, 0)>;
-def : Pat<(i1 (trunc RegI32:$a)), (SETPGTu32ri RegI32:$a, 0)>;
-def : Pat<(i1 (trunc RegI64:$a)), (SETPGTu64ri RegI64:$a, 0)>;
-def : Pat<(i1 (fp_to_uint RegF32:$a)), (SETPGTu32ri (MOVi32f32 RegF32:$a), 0)>;
-def : Pat<(i1 (fp_to_uint RegF64:$a)), (SETPGTu64ri (MOVi64f64 RegF64:$a), 0)>;
-
-// Conversion to u16
-def : Pat<(i16 (anyext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>;
-def : Pat<(i16 (sext RegPred:$a)), (SELPi16ii RegPred:$a, 0xFFFF, 0)>;
-def : Pat<(i16 (zext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>;
-def : Pat<(i16 (trunc RegI32:$a)), (CVTu16u32 RegI32:$a)>;
-def : Pat<(i16 (trunc RegI64:$a)), (CVTu16u64 RegI64:$a)>;
-def : Pat<(i16 (fp_to_uint RegF32:$a)), (CVTu16f32 RndDefault, RegF32:$a)>;
-def : Pat<(i16 (fp_to_sint RegF32:$a)), (CVTs16f32 RndDefault, RegF32:$a)>;
-def : Pat<(i16 (fp_to_uint RegF64:$a)), (CVTu16f64 RndDefault, RegF64:$a)>;
-def : Pat<(i16 (fp_to_sint RegF64:$a)), (CVTs16f64 RndDefault, RegF64:$a)>;
-
-// Conversion to u32
-def : Pat<(i32 (anyext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>;
-def : Pat<(i32 (sext RegPred:$a)), (SELPi32ii RegPred:$a, 0xFFFFFFFF, 0)>;
-def : Pat<(i32 (zext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>;
-def : Pat<(i32 (anyext RegI16:$a)), (CVTu32u16 RegI16:$a)>;
-def : Pat<(i32 (sext RegI16:$a)), (CVTs32s16 RegI16:$a)>;
-def : Pat<(i32 (zext RegI16:$a)), (CVTu32u16 RegI16:$a)>;
-def : Pat<(i32 (trunc RegI64:$a)), (CVTu32u64 RegI64:$a)>;
-def : Pat<(i32 (fp_to_uint RegF32:$a)), (CVTu32f32 RndDefault, RegF32:$a)>;
-def : Pat<(i32 (fp_to_sint RegF32:$a)), (CVTs32f32 RndDefault, RegF32:$a)>;
-def : Pat<(i32 (fp_to_uint RegF64:$a)), (CVTu32f64 RndDefault, RegF64:$a)>;
-def : Pat<(i32 (fp_to_sint RegF64:$a)), (CVTs32f64 RndDefault, RegF64:$a)>;
-def : Pat<(i32 (bitconvert RegF32:$a)), (MOVi32f32 RegF32:$a)>;
-
-// Conversion to u64
-def : Pat<(i64 (anyext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>;
-def : Pat<(i64 (sext RegPred:$a)), (SELPi64ii RegPred:$a,
- 0xFFFFFFFFFFFFFFFF, 0)>;
-def : Pat<(i64 (zext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>;
-def : Pat<(i64 (anyext RegI16:$a)), (CVTu64u16 RegI16:$a)>;
-def : Pat<(i64 (sext RegI16:$a)), (CVTs64s16 RegI16:$a)>;
-def : Pat<(i64 (zext RegI16:$a)), (CVTu64u16 RegI16:$a)>;
-def : Pat<(i64 (anyext RegI32:$a)), (CVTu64u32 RegI32:$a)>;
-def : Pat<(i64 (sext RegI32:$a)), (CVTs64s32 RegI32:$a)>;
-def : Pat<(i64 (zext RegI32:$a)), (CVTu64u32 RegI32:$a)>;
-def : Pat<(i64 (fp_to_uint RegF32:$a)), (CVTu64f32 RndDefault, RegF32:$a)>;
-def : Pat<(i64 (fp_to_sint RegF32:$a)), (CVTs64f32 RndDefault, RegF32:$a)>;
-def : Pat<(i64 (fp_to_uint RegF64:$a)), (CVTu64f64 RndDefault, RegF64:$a)>;
-def : Pat<(i64 (fp_to_sint RegF64:$a)), (CVTs64f64 RndDefault, RegF64:$a)>;
-def : Pat<(i64 (bitconvert RegF64:$a)), (MOVi64f64 RegF64:$a)>;
-
-// Conversion to f32
-def : Pat<(f32 (uint_to_fp RegPred:$a)), (SELPf32rr RegPred:$a,
- (MOVf32i32 0x3F800000), (MOVf32i32 0))>;
-def : Pat<(f32 (uint_to_fp RegI16:$a)), (CVTf32u16 RndDefault, RegI16:$a)>;
-def : Pat<(f32 (sint_to_fp RegI16:$a)), (CVTf32s16 RndDefault, RegI16:$a)>;
-def : Pat<(f32 (uint_to_fp RegI32:$a)), (CVTf32u32 RndDefault, RegI32:$a)>;
-def : Pat<(f32 (sint_to_fp RegI32:$a)), (CVTf32s32 RndDefault, RegI32:$a)>;
-def : Pat<(f32 (uint_to_fp RegI64:$a)), (CVTf32u64 RndDefault, RegI64:$a)>;
-def : Pat<(f32 (sint_to_fp RegI64:$a)), (CVTf32s64 RndDefault, RegI64:$a)>;
-def : Pat<(f32 (fround RegF64:$a)), (CVTf32f64 RndDefault, RegF64:$a)>;
-def : Pat<(f32 (bitconvert RegI32:$a)), (MOVf32i32 RegI32:$a)>;
-
-// Conversion to f64
-def : Pat<(f64 (uint_to_fp RegPred:$a)), (SELPf64rr RegPred:$a,
- (MOVf64i64 0x3F80000000000000), (MOVf64i64 0))>;
-def : Pat<(f64 (uint_to_fp RegI16:$a)), (CVTf64u16 RndDefault, RegI16:$a)>;
-def : Pat<(f64 (sint_to_fp RegI16:$a)), (CVTf64s16 RndDefault, RegI16:$a)>;
-def : Pat<(f64 (uint_to_fp RegI32:$a)), (CVTf64u32 RndDefault, RegI32:$a)>;
-def : Pat<(f64 (sint_to_fp RegI32:$a)), (CVTf64s32 RndDefault, RegI32:$a)>;
-def : Pat<(f64 (uint_to_fp RegI64:$a)), (CVTf64u64 RndDefault, RegI64:$a)>;
-def : Pat<(f64 (sint_to_fp RegI64:$a)), (CVTf64s64 RndDefault, RegI64:$a)>;
-def : Pat<(f64 (fextend RegF32:$a)), (CVTf64f32 RegF32:$a)>;
-def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>;
-
-// setcc - predicate inversion for branch conditions
-def : Pat<(i1 (setcc RegPred:$a, imm:$b, SETNE)),
- (XORripreds RegPred:$a, imm:$b)>;
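-// On i1 values (a != b) is simply (a xor b), so the inverted predicate costs
-// a single xor against the immediate.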
-
-///===- Intrinsic Instructions --------------------------------------------===//
-include "PTXIntrinsicInstrInfo.td"
-
-///===- Load/Store Instructions -------------------------------------------===//
-include "PTXInstrLoadStore.td"
-
diff --git a/lib/Target/PTX/PTXInstrLoadStore.td b/lib/Target/PTX/PTXInstrLoadStore.td
deleted file mode 100644
index 7a62684..0000000
--- a/lib/Target/PTX/PTXInstrLoadStore.td
+++ /dev/null
@@ -1,278 +0,0 @@
-//===- PTXInstrLoadStore.td - PTX Load/Store Instruction Defs -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the PTX load/store instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-
-// Addressing Predicates
-// We have to differentiate between 32- and 64-bit pointer types
-def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">;
-def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">;
-
-//===----------------------------------------------------------------------===//
-// Pattern Fragments for Loads/Stores
-//===----------------------------------------------------------------------===//
-
-def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTXStateSpace::Global;
- return false;
-}]>;
-
-def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTXStateSpace::Constant;
- return false;
-}]>;
-
-def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTXStateSpace::Shared;
- return false;
-}]>;
-
-def store_global
- : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTXStateSpace::Global;
- return false;
-}]>;
-
-def store_shared
- : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTXStateSpace::Shared;
- return false;
-}]>;
-
-// Addressing modes.
-def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
-def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>;
-def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>;
-def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>;
-def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>;
-def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>;
-def ADDRlocal32 : ComplexPattern<i32, 2, "SelectADDRlocal", [], []>;
-def ADDRlocal64 : ComplexPattern<i64, 2, "SelectADDRlocal", [], []>;
-
-// Address operands
-def MEMri32 : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops RegI32, i32imm);
-}
-def MEMri64 : Operand<i64> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops RegI64, i64imm);
-}
-def LOCALri32 : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops i32imm, i32imm);
-}
-def LOCALri64 : Operand<i64> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops i64imm, i64imm);
-}
-def MEMii32 : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops i32imm, i32imm);
-}
-def MEMii64 : Operand<i64> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops i64imm, i64imm);
-}
-// The operand here does not correspond to an actual address, so we
-// can use i32 in 64-bit address modes.
-def MEMpi : Operand<i32> {
- let PrintMethod = "printParamOperand";
- let MIOperandInfo = (ops i32imm);
-}
-def MEMret : Operand<i32> {
- let PrintMethod = "printReturnOperand";
- let MIOperandInfo = (ops i32imm);
-}
-
-
-// Load/store .param space
-def PTXloadparam
- : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
-def PTXstoreparam
- : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
-
-def PTXreadparam
- : SDNode<"PTXISD::READ_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
-def PTXwriteparam
- : SDNode<"PTXISD::WRITE_PARAM", SDTypeProfile<0, 1, []>,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
-
-
-
-//===----------------------------------------------------------------------===//
-// Classes for loads/stores
-//===----------------------------------------------------------------------===//
-multiclass PTX_LD<string opstr, string typestr,
- RegisterClass RC, PatFrag pat_load> {
- def rr32 : InstPTX<(outs RC:$d),
- (ins MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRrr32:$a))]>,
- Requires<[Use32BitAddresses]>;
- def rr64 : InstPTX<(outs RC:$d),
- (ins MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRrr64:$a))]>,
- Requires<[Use64BitAddresses]>;
- def ri32 : InstPTX<(outs RC:$d),
- (ins MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRri32:$a))]>,
- Requires<[Use32BitAddresses]>;
- def ri64 : InstPTX<(outs RC:$d),
- (ins MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRri64:$a))]>,
- Requires<[Use64BitAddresses]>;
- def ii32 : InstPTX<(outs RC:$d),
- (ins MEMii32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRii32:$a))]>,
- Requires<[Use32BitAddresses]>;
- def ii64 : InstPTX<(outs RC:$d),
- (ins MEMii64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRii64:$a))]>,
- Requires<[Use64BitAddresses]>;
-}
-
-multiclass PTX_ST<string opstr, string typestr, RegisterClass RC,
- PatFrag pat_store> {
- def rr32 : InstPTX<(outs),
- (ins RC:$d, MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRrr32:$a)]>,
- Requires<[Use32BitAddresses]>;
- def rr64 : InstPTX<(outs),
- (ins RC:$d, MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRrr64:$a)]>,
- Requires<[Use64BitAddresses]>;
- def ri32 : InstPTX<(outs),
- (ins RC:$d, MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRri32:$a)]>,
- Requires<[Use32BitAddresses]>;
- def ri64 : InstPTX<(outs),
- (ins RC:$d, MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRri64:$a)]>,
- Requires<[Use64BitAddresses]>;
- def ii32 : InstPTX<(outs),
- (ins RC:$d, MEMii32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRii32:$a)]>,
- Requires<[Use32BitAddresses]>;
- def ii64 : InstPTX<(outs),
- (ins RC:$d, MEMii64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRii64:$a)]>,
- Requires<[Use64BitAddresses]>;
-}
-
-multiclass PTX_LOCAL_LD_ST<string typestr, RegisterClass RC> {
- def LDri32 : InstPTX<(outs RC:$d), (ins LOCALri32:$a),
- !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (load_global ADDRlocal32:$a))]>;
- def LDri64 : InstPTX<(outs RC:$d), (ins LOCALri64:$a),
- !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (load_global ADDRlocal64:$a))]>;
- def STri32 : InstPTX<(outs), (ins RC:$d, LOCALri32:$a),
- !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
- [(store_global RC:$d, ADDRlocal32:$a)]>;
- def STri64 : InstPTX<(outs), (ins RC:$d, LOCALri64:$a),
- !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
- [(store_global RC:$d, ADDRlocal64:$a)]>;
-}
-
-multiclass PTX_PARAM_LD_ST<string typestr, RegisterClass RC> {
- let hasSideEffects = 1 in {
- def LDpi : InstPTX<(outs RC:$d), (ins i32imm:$a),
- !strconcat("ld.param", !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (PTXloadparam texternalsym:$a))]>;
- def STpi : InstPTX<(outs), (ins i32imm:$d, RC:$a),
- !strconcat("st.param", !strconcat(typestr, "\t[$d], $a")),
- [(PTXstoreparam texternalsym:$d, RC:$a)]>;
- }
-}
-
-multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> {
- defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>;
- defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>;
- defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>;
- defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>;
- defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>;
-}
-
-multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
- defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>;
- defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>;
- defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>;
- defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>;
- defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>;
-}
-
-
-
-//===----------------------------------------------------------------------===//
-// Instruction definitions for loads/stores
-//===----------------------------------------------------------------------===//
-
-// Global/shared stores
-defm STg : PTX_ST_ALL<"st.global", store_global>;
-defm STs : PTX_ST_ALL<"st.shared", store_shared>;
-
-// Global/shared/constant loads
-defm LDg : PTX_LD_ALL<"ld.global", load_global>;
-defm LDc : PTX_LD_ALL<"ld.const", load_constant>;
-defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
-
-// Param loads/stores
-defm PARAMPRED : PTX_PARAM_LD_ST<".pred", RegPred>;
-defm PARAMU16 : PTX_PARAM_LD_ST<".u16", RegI16>;
-defm PARAMU32 : PTX_PARAM_LD_ST<".u32", RegI32>;
-defm PARAMU64 : PTX_PARAM_LD_ST<".u64", RegI64>;
-defm PARAMF32 : PTX_PARAM_LD_ST<".f32", RegF32>;
-defm PARAMF64 : PTX_PARAM_LD_ST<".f64", RegF64>;
-
-// Local loads/stores
-defm LOCALPRED : PTX_LOCAL_LD_ST<".pred", RegPred>;
-defm LOCALU16 : PTX_LOCAL_LD_ST<".u16", RegI16>;
-defm LOCALU32 : PTX_LOCAL_LD_ST<".u32", RegI32>;
-defm LOCALU64 : PTX_LOCAL_LD_ST<".u64", RegI64>;
-defm LOCALF32 : PTX_LOCAL_LD_ST<".f32", RegF32>;
-defm LOCALF64 : PTX_LOCAL_LD_ST<".f64", RegF64>;
-
diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td
deleted file mode 100644
index 3416f1c..0000000
--- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td
+++ /dev/null
@@ -1,110 +0,0 @@
-//===-- PTXIntrinsicInstrInfo.td - Defines PTX intrinsics --*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines all of the PTX-specific intrinsic instructions.
-//
-//===----------------------------------------------------------------------===//
-
-// PTX Special Purpose Register Accessor Intrinsics
-
-class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
- : InstPTX<(outs RegI64:$d), (ins),
- !strconcat("mov.u64\t$d, %", regname),
- [(set RegI64:$d, (intop))]>;
-
-class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
- : InstPTX<(outs RegI32:$d), (ins),
- !strconcat("mov.u32\t$d, %", regname),
- [(set RegI32:$d, (intop))]>;
-
-// TODO: Add vector versions of the special register reads
-
-//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid",
-// int_ptx_read_tid_r64>;
-def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
- int_ptx_read_tid_x>;
-def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
- int_ptx_read_tid_y>;
-def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
- int_ptx_read_tid_z>;
-def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
- int_ptx_read_tid_w>;
-
-//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid",
-// int_ptx_read_ntid_r64>;
-def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
- int_ptx_read_ntid_x>;
-def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
- int_ptx_read_ntid_y>;
-def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
- int_ptx_read_ntid_z>;
-def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
- int_ptx_read_ntid_w>;
-
-def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
- int_ptx_read_laneid>;
-def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
- int_ptx_read_warpid>;
-def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
- int_ptx_read_nwarpid>;
-
-//def PTX_READ_CTAID_R64 :
-//PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>;
-def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
- int_ptx_read_ctaid_x>;
-def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
- int_ptx_read_ctaid_y>;
-def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
- int_ptx_read_ctaid_z>;
-def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
- int_ptx_read_ctaid_w>;
-
-//def PTX_READ_NCTAID_R64 :
-//PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>;
-def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
- int_ptx_read_nctaid_x>;
-def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
- int_ptx_read_nctaid_y>;
-def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
- int_ptx_read_nctaid_z>;
-def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
- int_ptx_read_nctaid_w>;
-
-def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid",
- int_ptx_read_smid>;
-def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
- int_ptx_read_nsmid>;
-def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
- int_ptx_read_gridid>;
-
-def PTX_READ_LANEMASK_EQ
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
-def PTX_READ_LANEMASK_LE
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
-def PTX_READ_LANEMASK_LT
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
-def PTX_READ_LANEMASK_GE
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
-def PTX_READ_LANEMASK_GT
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
-
-def PTX_READ_CLOCK
- : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
-def PTX_READ_CLOCK64
- : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
-
-def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
-def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
-def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
-def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
-
-// PTX Parallel Synchronization and Communication Intrinsics
-
-def PTX_BAR_SYNC : InstPTX<(outs), (ins i32imm:$i), "bar.sync\t$i",
- [(int_ptx_bar_sync imm:$i)]>;
diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp
deleted file mode 100644
index 3ed67a6..0000000
--- a/lib/Target/PTX/PTXMCAsmStreamer.cpp
+++ /dev/null
@@ -1,556 +0,0 @@
-//===-- PTXMCAsmStreamer.cpp - PTX Text Assembly Output -------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/PathV2.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-namespace {
-class PTXMCAsmStreamer : public MCStreamer {
- formatted_raw_ostream &OS;
- const MCAsmInfo &MAI;
- OwningPtr<MCInstPrinter> InstPrinter;
- OwningPtr<MCCodeEmitter> Emitter;
-
- SmallString<128> CommentToEmit;
- raw_svector_ostream CommentStream;
-
- unsigned IsVerboseAsm : 1;
- unsigned ShowInst : 1;
-
-public:
- PTXMCAsmStreamer(MCContext &Context,
- formatted_raw_ostream &os,
- bool isVerboseAsm, bool useLoc,
- MCInstPrinter *printer,
- MCCodeEmitter *emitter,
- bool showInst)
- : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()),
- InstPrinter(printer), Emitter(emitter), CommentStream(CommentToEmit),
- IsVerboseAsm(isVerboseAsm),
- ShowInst(showInst) {
- if (InstPrinter && IsVerboseAsm)
- InstPrinter->setCommentStream(CommentStream);
- }
-
- ~PTXMCAsmStreamer() {}
-
- inline void EmitEOL() {
- // If we don't have any comments, just emit a \n.
- if (!IsVerboseAsm) {
- OS << '\n';
- return;
- }
- EmitCommentsAndEOL();
- }
- void EmitCommentsAndEOL();
-
- /// isVerboseAsm - Return true if this streamer supports verbose assembly at
- /// all.
- virtual bool isVerboseAsm() const { return IsVerboseAsm; }
-
- /// hasRawTextSupport - We support EmitRawText.
- virtual bool hasRawTextSupport() const { return true; }
-
- /// AddComment - Add a comment that can be emitted to the generated .s
- /// file if applicable as a QoI issue to make the output of the compiler
- /// more readable. This only affects the MCAsmStreamer, and only when
- /// verbose assembly output is enabled.
- virtual void AddComment(const Twine &T);
-
- /// AddEncodingComment - Add a comment showing the encoding of an instruction.
- virtual void AddEncodingComment(const MCInst &Inst);
-
- /// GetCommentOS - Return a raw_ostream that comments can be written to.
- /// Unlike AddComment, you are required to terminate comments with \n if you
- /// use this method.
- virtual raw_ostream &GetCommentOS() {
- if (!IsVerboseAsm)
- return nulls(); // Discard comments unless in verbose asm mode.
- return CommentStream;
- }
-
- /// AddBlankLine - Emit a blank line to a .s file to pretty it up.
- virtual void AddBlankLine() {
- EmitEOL();
- }
-
- /// @name MCStreamer Interface
- /// @{
-
- virtual void ChangeSection(const MCSection *Section);
- virtual void InitSections() { /* PTX does not use sections */ }
-
- virtual void EmitLabel(MCSymbol *Symbol);
-
- virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
-
- virtual void EmitThumbFunc(MCSymbol *Func);
-
- virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
-
- virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol);
-
- virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
- const MCSymbol *LastLabel,
- const MCSymbol *Label,
- unsigned PointerSize);
-
- virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
-
- virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
- virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
- virtual void EmitCOFFSymbolStorageClass(int StorageClass);
- virtual void EmitCOFFSymbolType(int Type);
- virtual void EndCOFFSymbolDef();
- virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value);
- virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment);
-
- /// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol.
- ///
- /// @param Symbol - The common symbol to emit.
- /// @param Size - The size of the common symbol.
- /// @param ByteAlignment - The alignment of the common symbol in bytes.
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment);
-
- virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
- unsigned Size = 0, unsigned ByteAlignment = 0);
-
- virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment = 0);
-
- virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
-
- virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
- unsigned AddrSpace);
- virtual void EmitULEB128Value(const MCExpr *Value);
- virtual void EmitSLEB128Value(const MCExpr *Value);
- virtual void EmitGPRel32Value(const MCExpr *Value);
-
-
- virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue,
- unsigned AddrSpace);
-
- virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
- unsigned ValueSize = 1,
- unsigned MaxBytesToEmit = 0);
-
- virtual void EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit = 0);
-
- virtual bool EmitValueToOffset(const MCExpr *Offset,
- unsigned char Value = 0);
-
- virtual void EmitFileDirective(StringRef Filename);
- virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
- StringRef Filename);
-
- virtual void EmitInstruction(const MCInst &Inst);
-
- /// EmitRawText - If this file is backed by an assembly streamer, this dumps
- /// the specified string in the output .s file. This capability is
- /// indicated by the hasRawTextSupport() predicate.
- virtual void EmitRawText(StringRef String);
-
- virtual void FinishImpl();
-
- /// @}
-
-}; // class PTXMCAsmStreamer
-
-}
-
-/// TODO: Add appropriate implementation of Emit*() methods when needed
-
-void PTXMCAsmStreamer::AddComment(const Twine &T) {
- if (!IsVerboseAsm) return;
-
- // Make sure that CommentStream is flushed.
- CommentStream.flush();
-
- T.toVector(CommentToEmit);
- // Each comment goes on its own line.
- CommentToEmit.push_back('\n');
-
- // Tell the comment stream that the vector changed underneath it.
- CommentStream.resync();
-}
-
-void PTXMCAsmStreamer::EmitCommentsAndEOL() {
- if (CommentToEmit.empty() && CommentStream.GetNumBytesInBuffer() == 0) {
- OS << '\n';
- return;
- }
-
- CommentStream.flush();
- StringRef Comments = CommentToEmit.str();
-
- assert(Comments.back() == '\n' &&
- "Comment array not newline terminated");
- do {
- // Emit a line of comments.
- OS.PadToColumn(MAI.getCommentColumn());
- size_t Position = Comments.find('\n');
- OS << MAI.getCommentString() << ' ' << Comments.substr(0, Position) << '\n';
-
- Comments = Comments.substr(Position+1);
- } while (!Comments.empty());
-
- CommentToEmit.clear();
- // Tell the comment stream that the vector changed underneath it.
- CommentStream.resync();
-}
-
-static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
- assert(Bytes && "Invalid size!");
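-  // Keep only the low 'Bytes' bytes, e.g. truncateToSize(0x1234, 1) == 0x34.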
- return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8));
-}
-
-void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) {
- assert(Section && "Cannot switch to a null section!");
-}
-
-void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
- assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
- assert(getCurrentSection() && "Cannot emit before setting section!");
-
- OS << *Symbol << MAI.getLabelSuffix();
- EmitEOL();
- Symbol->setSection(*getCurrentSection());
-}
-
-void PTXMCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {}
-
-void PTXMCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {}
-
-void PTXMCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
- OS << *Symbol << " = " << *Value;
- EmitEOL();
-
- // FIXME: Lift context changes into super class.
- Symbol->setVariableValue(Value);
-}
-
-void PTXMCAsmStreamer::EmitWeakReference(MCSymbol *Alias,
- const MCSymbol *Symbol) {
- OS << ".weakref " << *Alias << ", " << *Symbol;
- EmitEOL();
-}
-
-void PTXMCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
- const MCSymbol *LastLabel,
- const MCSymbol *Label,
- unsigned PointerSize) {
- report_fatal_error("Unimplemented.");
-}
-
-void PTXMCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
- MCSymbolAttr Attribute) {}
-
-void PTXMCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
-
-void PTXMCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
-
-void PTXMCAsmStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {}
-
-void PTXMCAsmStreamer::EmitCOFFSymbolType(int Type) {}
-
-void PTXMCAsmStreamer::EndCOFFSymbolDef() {}
-
-void PTXMCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
-
-void PTXMCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) {}
-
-void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) {}
-
-void PTXMCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- unsigned Size, unsigned ByteAlignment) {}
-
-void PTXMCAsmStreamer::EmitTBSSSymbol(const MCSection *Section,
- MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) {}
-
-static inline char toOctal(int X) { return (X&7)+'0'; }
-
-static void PrintQuotedString(StringRef Data, raw_ostream &OS) {
- OS << '"';
-
- for (unsigned i = 0, e = Data.size(); i != e; ++i) {
- unsigned char C = Data[i];
- if (C == '"' || C == '\\') {
- OS << '\\' << (char)C;
- continue;
- }
-
- if (isprint((unsigned char)C)) {
- OS << (char)C;
- continue;
- }
-
- switch (C) {
- case '\b': OS << "\\b"; break;
- case '\f': OS << "\\f"; break;
- case '\n': OS << "\\n"; break;
- case '\r': OS << "\\r"; break;
- case '\t': OS << "\\t"; break;
- default:
- OS << '\\';
- OS << toOctal(C >> 6);
- OS << toOctal(C >> 3);
- OS << toOctal(C >> 0);
- break;
- }
- }
-
- OS << '"';
-}
-
-void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
- assert(getCurrentSection() && "Cannot emit contents before setting section!");
- if (Data.empty()) return;
-
- if (Data.size() == 1) {
- OS << MAI.getData8bitsDirective(AddrSpace);
- OS << (unsigned)(unsigned char)Data[0];
- EmitEOL();
- return;
- }
-
- // If the data ends with 0 and the target supports .asciz, use it, otherwise
- // use .ascii
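-  // e.g. "foo\0" becomes .asciz "foo" (the trailing NUL is dropped below),
-  // while "foo" becomes .ascii "foo".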
- if (MAI.getAscizDirective() && Data.back() == 0) {
- OS << MAI.getAscizDirective();
- Data = Data.substr(0, Data.size()-1);
- } else {
- OS << MAI.getAsciiDirective();
- }
-
- OS << ' ';
- PrintQuotedString(Data, OS);
- EmitEOL();
-}
-
-void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
- unsigned AddrSpace) {
- assert(getCurrentSection() && "Cannot emit contents before setting section!");
- const char *Directive = 0;
- switch (Size) {
- default: break;
- case 1: Directive = MAI.getData8bitsDirective(AddrSpace); break;
- case 2: Directive = MAI.getData16bitsDirective(AddrSpace); break;
- case 4: Directive = MAI.getData32bitsDirective(AddrSpace); break;
- case 8:
- Directive = MAI.getData64bitsDirective(AddrSpace);
- // If the target doesn't support 64-bit data, emit as two 32-bit halves.
- if (Directive) break;
- int64_t IntValue;
- if (!Value->EvaluateAsAbsolute(IntValue))
- report_fatal_error("Don't know how to emit this value.");
- if (getContext().getAsmInfo().isLittleEndian()) {
- EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace);
- EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace);
- } else {
- EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace);
- EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace);
- }
- return;
- }
-
- assert(Directive && "Invalid size for machine code value!");
- OS << Directive << *Value;
- EmitEOL();
-}
-
-void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
- assert(MAI.hasLEB128() && "Cannot print a .uleb");
- OS << ".uleb128 " << *Value;
- EmitEOL();
-}
-
-void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
- assert(MAI.hasLEB128() && "Cannot print a .sleb");
- OS << ".sleb128 " << *Value;
- EmitEOL();
-}
-
-void PTXMCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
- assert(MAI.getGPRel32Directive() != 0);
- OS << MAI.getGPRel32Directive() << *Value;
- EmitEOL();
-}
-
-
-/// EmitFill - Emit NumBytes bytes worth of the value specified by
-/// FillValue. This implements directives such as '.space'.
-void PTXMCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
- unsigned AddrSpace) {
- if (NumBytes == 0) return;
-
- if (AddrSpace == 0)
- if (const char *ZeroDirective = MAI.getZeroDirective()) {
- OS << ZeroDirective << NumBytes;
- if (FillValue != 0)
- OS << ',' << (int)FillValue;
- EmitEOL();
- return;
- }
-
- // Emit a byte at a time.
- MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace);
-}
-
-void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment,
- int64_t Value,
- unsigned ValueSize,
- unsigned MaxBytesToEmit) {
- // Some assemblers don't support non-power of two alignments, so we always
- // emit alignments as a power of two if possible.
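-  // The printed operand is Log2_32(ByteAlignment) unless the target's align
-  // directive expects a byte count (MAI.getAlignmentIsInBytes()).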
- if (isPowerOf2_32(ByteAlignment)) {
- switch (ValueSize) {
- default: llvm_unreachable("Invalid size for machine code value!");
- case 1: OS << MAI.getAlignDirective(); break;
- // FIXME: use MAI for this!
- case 2: OS << ".p2alignw "; break;
- case 4: OS << ".p2alignl "; break;
- case 8: llvm_unreachable("Unsupported alignment size!");
- }
-
- if (MAI.getAlignmentIsInBytes())
- OS << ByteAlignment;
- else
- OS << Log2_32(ByteAlignment);
-
- if (Value || MaxBytesToEmit) {
- OS << ", 0x";
- OS.write_hex(truncateToSize(Value, ValueSize));
-
- if (MaxBytesToEmit)
- OS << ", " << MaxBytesToEmit;
- }
- EmitEOL();
- return;
- }
-
- // Non-power of two alignment. This is not widely supported by assemblers.
- // FIXME: Parameterize this based on MAI.
- switch (ValueSize) {
- default: llvm_unreachable("Invalid size for machine code value!");
- case 1: OS << ".balign"; break;
- case 2: OS << ".balignw"; break;
- case 4: OS << ".balignl"; break;
- case 8: llvm_unreachable("Unsupported alignment size!");
- }
-
- OS << ' ' << ByteAlignment;
- OS << ", " << truncateToSize(Value, ValueSize);
- if (MaxBytesToEmit)
- OS << ", " << MaxBytesToEmit;
- EmitEOL();
-}
-
-void PTXMCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit) {}
-
-bool PTXMCAsmStreamer::EmitValueToOffset(const MCExpr *Offset,
- unsigned char Value) {return false;}
-
-
-void PTXMCAsmStreamer::EmitFileDirective(StringRef Filename) {
- assert(MAI.hasSingleParameterDotFile());
- OS << "\t.file\t";
- PrintQuotedString(Filename, OS);
- EmitEOL();
-}
-
-// FIXME: should we inherit from MCAsmStreamer?
-bool PTXMCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo,
- StringRef Directory,
- StringRef Filename) {
- if (!Directory.empty()) {
- if (sys::path::is_absolute(Filename))
- return EmitDwarfFileDirective(FileNo, "", Filename);
- SmallString<128> FullPathName = Directory;
- sys::path::append(FullPathName, Filename);
- return EmitDwarfFileDirective(FileNo, "", FullPathName);
- }
-
- OS << "\t.file\t" << FileNo << ' ';
- PrintQuotedString(Filename, OS);
- EmitEOL();
- return this->MCStreamer::EmitDwarfFileDirective(FileNo, Directory, Filename);
-}
-
-void PTXMCAsmStreamer::AddEncodingComment(const MCInst &Inst) {}
-
-void PTXMCAsmStreamer::EmitInstruction(const MCInst &Inst) {
- assert(getCurrentSection() && "Cannot emit contents before setting section!");
-
- // Show the encoding in a comment if we have a code emitter.
- if (Emitter)
- AddEncodingComment(Inst);
-
- // Show the MCInst if enabled.
- if (ShowInst) {
- Inst.dump_pretty(GetCommentOS(), &MAI, InstPrinter.get(), "\n ");
- GetCommentOS() << "\n";
- }
-
- // If we have an AsmPrinter, use that to print, otherwise print the MCInst.
- if (InstPrinter)
- InstPrinter->printInst(&Inst, OS, "");
- else
- Inst.print(OS, &MAI);
- EmitEOL();
-}
-
-/// EmitRawText - If this file is backed by an assembly streamer, this dumps
-/// the specified string in the output .s file. This capability is
-/// indicated by the hasRawTextSupport() predicate.
-void PTXMCAsmStreamer::EmitRawText(StringRef String) {
- if (!String.empty() && String.back() == '\n')
- String = String.substr(0, String.size()-1);
- OS << String;
- EmitEOL();
-}
-
-void PTXMCAsmStreamer::FinishImpl() {}
-
-namespace llvm {
- MCStreamer *createPTXAsmStreamer(MCContext &Context,
- formatted_raw_ostream &OS,
- bool isVerboseAsm, bool useLoc, bool useCFI,
- bool useDwarfDirectory,
- MCInstPrinter *IP,
- MCCodeEmitter *CE, MCAsmBackend *MAB,
- bool ShowInst) {
- return new PTXMCAsmStreamer(Context, OS, isVerboseAsm, useLoc,
- IP, CE, ShowInst);
- }
-}
diff --git a/lib/Target/PTX/PTXMCInstLower.cpp b/lib/Target/PTX/PTXMCInstLower.cpp
deleted file mode 100644
index 142e639..0000000
--- a/lib/Target/PTX/PTXMCInstLower.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- PTXMCInstLower.cpp - Convert PTX MachineInstr to an MCInst --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains code to lower PTX MachineInstrs to their corresponding
-// MCInst records.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTX.h"
-#include "PTXAsmPrinter.h"
-#include "llvm/Constants.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
-
-void llvm::LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- PTXAsmPrinter &AP) {
- OutMI.setOpcode(MI->getOpcode());
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    OutMI.addOperand(AP.lowerOperand(MO));
- }
-}
-
diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp
deleted file mode 100644
index 172a0e0..0000000
--- a/lib/Target/PTX/PTXMFInfoExtract.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//===-- PTXMFInfoExtract.cpp - Extract PTX machine function info ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an information extractor for PTX machine functions.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-mf-info-extract"
-
-#include "PTX.h"
-#include "PTXTargetMachine.h"
-#include "PTXMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-// NOTE: PTXMFInfoExtract must run after register allocation!
-
-namespace {
-  /// PTXMFInfoExtract - PTX-specific code to extract PTX machine function
-  /// information for PTXAsmPrinter.
- ///
- class PTXMFInfoExtract : public MachineFunctionPass {
- private:
- static char ID;
-
- public:
- PTXMFInfoExtract(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
- : MachineFunctionPass(ID) {}
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- virtual const char *getPassName() const {
- return "PTX Machine Function Info Extractor";
- }
- }; // class PTXMFInfoExtract
-} // end anonymous namespace
-
-char PTXMFInfoExtract::ID = 0;
-
-bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
- PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- // Generate list of all virtual registers used in this function
- for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
- const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
- unsigned RegType;
- if (TRC == PTX::RegPredRegisterClass)
- RegType = PTXRegisterType::Pred;
- else if (TRC == PTX::RegI16RegisterClass)
- RegType = PTXRegisterType::B16;
- else if (TRC == PTX::RegI32RegisterClass)
- RegType = PTXRegisterType::B32;
- else if (TRC == PTX::RegI64RegisterClass)
- RegType = PTXRegisterType::B64;
- else if (TRC == PTX::RegF32RegisterClass)
- RegType = PTXRegisterType::F32;
- else if (TRC == PTX::RegF64RegisterClass)
- RegType = PTXRegisterType::F64;
- else
- llvm_unreachable("Unkown register class.");
- MFI->addRegister(Reg, RegType, PTXRegisterSpace::Reg);
- }
-
- return false;
-}
-
-FunctionPass *llvm::createPTXMFInfoExtract(PTXTargetMachine &TM,
- CodeGenOpt::Level OptLevel) {
- return new PTXMFInfoExtract(TM, OptLevel);
-}
diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h
deleted file mode 100644
index bb7574c..0000000
--- a/lib/Target/PTX/PTXMachineFunctionInfo.h
+++ /dev/null
@@ -1,202 +0,0 @@
-//===-- PTXMachineFunctionInfo.h - PTX machine function info -----*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares PTX-specific per-machine-function information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_MACHINE_FUNCTION_INFO_H
-#define PTX_MACHINE_FUNCTION_INFO_H
-
-#include "PTX.h"
-#include "PTXParamManager.h"
-#include "PTXRegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-/// PTXMachineFunctionInfo - This class is derived from MachineFunction and
-/// contains private PTX target-specific information for each MachineFunction.
-///
-class PTXMachineFunctionInfo : public MachineFunctionInfo {
- virtual void anchor();
- bool IsKernel;
- DenseSet<unsigned> RegArgs;
- DenseSet<unsigned> RegRets;
-
- typedef DenseMap<int, std::string> FrameMap;
-
- FrameMap FrameSymbols;
-
- struct RegisterInfo {
- unsigned Reg;
- unsigned Type;
- unsigned Space;
- unsigned Offset;
- unsigned Encoded;
- };
-
- typedef DenseMap<unsigned, RegisterInfo> RegisterInfoMap;
-
- RegisterInfoMap RegInfo;
-
- PTXParamManager ParamManager;
-
-public:
- typedef DenseSet<unsigned>::const_iterator reg_iterator;
-
- PTXMachineFunctionInfo(MachineFunction &MF)
- : IsKernel(false) {
- }
-
- /// getParamManager - Returns the PTXParamManager instance for this function.
- PTXParamManager& getParamManager() { return ParamManager; }
- const PTXParamManager& getParamManager() const { return ParamManager; }
-
- /// setKernel/isKernel - Gets/sets a flag that indicates if this function is
- /// a PTX kernel function.
- void setKernel(bool _IsKernel=true) { IsKernel = _IsKernel; }
- bool isKernel() const { return IsKernel; }
-
- /// argreg_begin/argreg_end - Returns iterators to the set of registers
- /// containing function arguments.
- reg_iterator argreg_begin() const { return RegArgs.begin(); }
- reg_iterator argreg_end() const { return RegArgs.end(); }
-
- /// retreg_begin/retreg_end - Returns iterators to the set of registers
- /// containing the function return values.
- reg_iterator retreg_begin() const { return RegRets.begin(); }
- reg_iterator retreg_end() const { return RegRets.end(); }
-
- /// addRegister - Adds a virtual register to the set of all used registers
- void addRegister(unsigned Reg, unsigned RegType, unsigned RegSpace) {
- if (!RegInfo.count(Reg)) {
- RegisterInfo Info;
- Info.Reg = Reg;
- Info.Type = RegType;
- Info.Space = RegSpace;
-
- // Determine register offset
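-      // The offset is this register's index among the registers already added
-      // to the same space (and, within the virtual register space, of the
-      // same type).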
- Info.Offset = 0;
- for(RegisterInfoMap::const_iterator i = RegInfo.begin(),
- e = RegInfo.end(); i != e; ++i) {
- const RegisterInfo& RI = i->second;
- if (RI.Space == RegSpace)
- if (RI.Space != PTXRegisterSpace::Reg || RI.Type == Info.Type)
- Info.Offset++;
- }
-
- // Encode the register data into a single register number
- Info.Encoded = (Info.Offset << 6) | (Info.Type << 3) | Info.Space;
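-      // Layout: bits [2:0] = Space, bits [5:3] = Type, bits [6+] = Offset.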
-
- RegInfo[Reg] = Info;
-
- if (RegSpace == PTXRegisterSpace::Argument)
- RegArgs.insert(Reg);
- else if (RegSpace == PTXRegisterSpace::Return)
- RegRets.insert(Reg);
- }
- }
-
- /// countRegisters - Returns the number of registers of the given type and
- /// space.
- unsigned countRegisters(unsigned RegType, unsigned RegSpace) const {
- unsigned Count = 0;
- for(RegisterInfoMap::const_iterator i = RegInfo.begin(), e = RegInfo.end();
- i != e; ++i) {
- const RegisterInfo& RI = i->second;
- if (RI.Type == RegType && RI.Space == RegSpace)
- Count++;
- }
- return Count;
- }
-
- /// getEncodedRegister - Returns the encoded value of the register.
- unsigned getEncodedRegister(unsigned Reg) const {
- return RegInfo.lookup(Reg).Encoded;
- }
-
- /// addRetReg - Adds a register to the set of return-value registers.
- void addRetReg(unsigned Reg) {
- if (!RegRets.count(Reg)) {
- RegRets.insert(Reg);
- }
- }
-
- /// addArgReg - Adds a register to the set of function argument registers.
- void addArgReg(unsigned Reg) {
- RegArgs.insert(Reg);
- }
-
- /// getRegisterName - Returns the name of the specified virtual register. This
- /// name is used during PTX emission.
- std::string getRegisterName(unsigned Reg) const {
- if (RegInfo.count(Reg)) {
- const RegisterInfo& RI = RegInfo.lookup(Reg);
- std::string Name;
- raw_string_ostream NameStr(Name);
- decodeRegisterName(NameStr, RI.Encoded);
- NameStr.flush();
- return Name;
- }
- else if (Reg == PTX::NoRegister)
- return "%noreg";
- else
- llvm_unreachable("Register not in register name map");
- }
-
- /// getEncodedRegisterName - Returns the name of the encoded register.
- std::string getEncodedRegisterName(unsigned EncodedReg) const {
- std::string Name;
- raw_string_ostream NameStr(Name);
- decodeRegisterName(NameStr, EncodedReg);
- NameStr.flush();
- return Name;
- }
-
- /// getRegisterType - Returns the type of the specified virtual register.
- unsigned getRegisterType(unsigned Reg) const {
- if (RegInfo.count(Reg))
- return RegInfo.lookup(Reg).Type;
- else
- llvm_unreachable("Unknown register");
- }
-
- /// getOffsetForRegister - Returns the offset of the virtual register
- unsigned getOffsetForRegister(unsigned Reg) const {
- if (RegInfo.count(Reg))
- return RegInfo.lookup(Reg).Offset;
- else
- return 0;
- }
-
- /// getFrameSymbol - Returns the symbol name for the given FrameIndex.
- const char* getFrameSymbol(int FrameIndex) {
- if (FrameSymbols.count(FrameIndex)) {
- return FrameSymbols.lookup(FrameIndex).c_str();
- } else {
- std::string Name = "__local";
- Name += utostr(FrameIndex);
- // The whole point of caching this name is to ensure the pointer we pass
- // to any getExternalSymbol() calls will remain valid for the lifetime of
- // the back-end instance. This is to work around an issue in SelectionDAG
- // where symbol names are expected to be long-lived strings.
- FrameSymbols[FrameIndex] = Name;
- return FrameSymbols[FrameIndex].c_str();
- }
- }
-}; // class PTXMachineFunctionInfo
-} // namespace llvm
-
-#endif // PTX_MACHINE_FUNCTION_INFO_H
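The packing done by addRegister() in the deleted header above is compact enough to spell out. A minimal sketch of the bit layout, assuming the 3-bit Space and Type fields implied by the shift amounts (the helper names here are hypothetical, not part of the deleted file):

    // Mirrors Encoded = (Offset << 6) | (Type << 3) | Space from addRegister().
    static unsigned encodeReg(unsigned Offset, unsigned Type, unsigned Space) {
      return (Offset << 6) | (Type << 3) | Space;
    }
    static void decodeReg(unsigned Encoded,
                          unsigned &Offset, unsigned &Type, unsigned &Space) {
      Space  = Encoded & 0x7;         // bits [2:0]
      Type   = (Encoded >> 3) & 0x7;  // bits [5:3]
      Offset = Encoded >> 6;          // remaining high bits
    }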
diff --git a/lib/Target/PTX/PTXParamManager.cpp b/lib/Target/PTX/PTXParamManager.cpp
deleted file mode 100644
index cc1cc71..0000000
--- a/lib/Target/PTX/PTXParamManager.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//===-- PTXParamManager.cpp - Manager for .param variables ----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the PTXParamManager class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXParamManager.h"
-#include "PTX.h"
-#include "llvm/ADT/StringExtras.h"
-
-using namespace llvm;
-
-PTXParamManager::PTXParamManager() {
-}
-
-unsigned PTXParamManager::addArgumentParam(unsigned Size) {
- PTXParam Param;
- Param.Type = PTX_PARAM_TYPE_ARGUMENT;
- Param.Size = Size;
-
- std::string Name;
- Name = "__param_";
- Name += utostr(ArgumentParams.size()+1);
- Param.Name = Name;
-
- unsigned Index = AllParams.size();
- AllParams[Index] = Param;
- ArgumentParams.push_back(Index);
-
- return Index;
-}
-
-unsigned PTXParamManager::addReturnParam(unsigned Size) {
- PTXParam Param;
- Param.Type = PTX_PARAM_TYPE_RETURN;
- Param.Size = Size;
-
- std::string Name;
- Name = "__ret_";
- Name += utostr(ReturnParams.size()+1);
- Param.Name = Name;
-
- unsigned Index = AllParams.size();
- AllParams[Index] = Param;
- ReturnParams.push_back(Index);
-
- return Index;
-}
-
-unsigned PTXParamManager::addLocalParam(unsigned Size) {
- PTXParam Param;
- Param.Type = PTX_PARAM_TYPE_LOCAL;
- Param.Size = Size;
-
- std::string Name;
- Name = "__localparam_";
- Name += utostr(LocalParams.size()+1);
- Param.Name = Name;
-
- unsigned Index = AllParams.size();
- AllParams[Index] = Param;
- LocalParams.push_back(Index);
-
- return Index;
-}
-
diff --git a/lib/Target/PTX/PTXParamManager.h b/lib/Target/PTX/PTXParamManager.h
deleted file mode 100644
index 92e7728..0000000
--- a/lib/Target/PTX/PTXParamManager.h
+++ /dev/null
@@ -1,87 +0,0 @@
-//===-- PTXParamManager.h - Manager for .param variables --------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PTXParamManager class, which manages all defined .param
-// variables for a particular function.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_PARAM_MANAGER_H
-#define PTX_PARAM_MANAGER_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include <string>
-
-namespace llvm {
-
-/// PTXParamManager - This class manages all .param variables defined for a
-/// particular function.
-class PTXParamManager {
-private:
-
- /// PTXParamType - Type of a .param variable
- enum PTXParamType {
- PTX_PARAM_TYPE_ARGUMENT,
- PTX_PARAM_TYPE_RETURN,
- PTX_PARAM_TYPE_LOCAL
- };
-
- /// PTXParam - Definition of a PTX .param variable
- struct PTXParam {
- PTXParamType Type;
- unsigned Size;
- std::string Name;
- };
-
- DenseMap<unsigned, PTXParam> AllParams;
- SmallVector<unsigned, 4> ArgumentParams;
- SmallVector<unsigned, 4> ReturnParams;
- SmallVector<unsigned, 4> LocalParams;
-
-public:
-
- typedef SmallVector<unsigned, 4>::const_iterator param_iterator;
-
- PTXParamManager();
-
- param_iterator arg_begin() const { return ArgumentParams.begin(); }
- param_iterator arg_end() const { return ArgumentParams.end(); }
- param_iterator ret_begin() const { return ReturnParams.begin(); }
- param_iterator ret_end() const { return ReturnParams.end(); }
- param_iterator local_begin() const { return LocalParams.begin(); }
- param_iterator local_end() const { return LocalParams.end(); }
-
- /// addArgumentParam - Returns a new .param used as an argument.
- unsigned addArgumentParam(unsigned Size);
-
- /// addReturnParam - Returns a new .param used as a return argument.
- unsigned addReturnParam(unsigned Size);
-
- /// addLocalParam - Returns a new .param used as a local .param variable.
- unsigned addLocalParam(unsigned Size);
-
- /// getParamName - Returns the name of the parameter as a string.
- const std::string &getParamName(unsigned Param) const {
- assert(AllParams.count(Param) == 1 && "Param has not been defined!");
- return AllParams.find(Param)->second.Name;
- }
-
- /// getParamSize - Returns the size of the parameter in bits.
- unsigned getParamSize(unsigned Param) const {
- assert(AllParams.count(Param) == 1 && "Param has not been defined!");
- return AllParams.find(Param)->second.Size;
- }
-
-};
-
-}
-
-#endif
-
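A short usage sketch against the interface above (no longer compilable now that the class is deleted; it only illustrates that parameter indices are dense, the generated names are 1-based, and sizes are recorded in bits):

    #include <cassert>

    PTXParamManager PM;
    unsigned Arg = PM.addArgumentParam(32);   // first argument: "__param_1"
    unsigned Ret = PM.addReturnParam(64);     // first return value: "__ret_1"
    assert(PM.getParamSize(Arg) == 32 && "sizes are stored in bits");
    assert(PM.getParamName(Ret) == "__ret_1");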
diff --git a/lib/Target/PTX/PTXRegAlloc.cpp b/lib/Target/PTX/PTXRegAlloc.cpp
deleted file mode 100644
index 7fd5375..0000000
--- a/lib/Target/PTX/PTXRegAlloc.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-//===-- PTXRegAlloc.cpp - PTX Register Allocator --------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a register allocator for PTX code.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-reg-alloc"
-
-#include "PTX.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/RegAllocRegistry.h"
-
-using namespace llvm;
-
-namespace {
- // Special register allocator for PTX.
- class PTXRegAlloc : public MachineFunctionPass {
- public:
- static char ID;
- PTXRegAlloc() : MachineFunctionPass(ID) {}
-
- virtual const char* getPassName() const {
- return "PTX Register Allocator";
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- virtual bool runOnMachineFunction(MachineFunction &MF) {
- // We do not actually do anything (at least not yet).
- return false;
- }
- };
-
- char PTXRegAlloc::ID = 0;
-
- static RegisterRegAlloc
- ptxRegAlloc("ptx", "PTX register allocator", createPTXRegisterAllocator);
-}
-
-FunctionPass *llvm::createPTXRegisterAllocator() {
- return new PTXRegAlloc();
-}
-
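With the RegisterRegAlloc registration above, the no-op allocator was selectable from the llc command line by its registered name, e.g. llc -regalloc=ptx (an assumed invocation; creation then goes through createPTXRegisterAllocator()).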
diff --git a/lib/Target/PTX/PTXRegisterInfo.cpp b/lib/Target/PTX/PTXRegisterInfo.cpp
deleted file mode 100644
index b6ffd38..0000000
--- a/lib/Target/PTX/PTXRegisterInfo.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- PTXRegisterInfo.cpp - PTX Register Information --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the PTX implementation of the TargetRegisterInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXRegisterInfo.h"
-#include "PTX.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define GET_REGINFO_TARGET_DESC
-#include "PTXGenRegisterInfo.inc"
-
-using namespace llvm;
-
-PTXRegisterInfo::PTXRegisterInfo(PTXTargetMachine &TM,
- const TargetInstrInfo &tii)
- // PTX does not have a return address register.
- : PTXGenRegisterInfo(0), TII(tii) {
-}
-
-void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator /*II*/,
- int /*SPAdj*/,
- RegScavenger * /*RS*/) const {
- llvm_unreachable("FrameIndex should have been previously eliminated!");
-}
diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h
deleted file mode 100644
index 5614ce7..0000000
--- a/lib/Target/PTX/PTXRegisterInfo.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//===-- PTXRegisterInfo.h - PTX Register Information Impl -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the PTX implementation of the MRegisterInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_REGISTER_INFO_H
-#define PTX_REGISTER_INFO_H
-
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/ADT/BitVector.h"
-
-#define GET_REGINFO_HEADER
-#include "PTXGenRegisterInfo.inc"
-
-namespace llvm {
-class PTXTargetMachine;
-class MachineFunction;
-
-struct PTXRegisterInfo : public PTXGenRegisterInfo {
-private:
- const TargetInstrInfo &TII;
-
-public:
- PTXRegisterInfo(PTXTargetMachine &TM,
- const TargetInstrInfo &tii);
-
- virtual const uint16_t
- *getCalleeSavedRegs(const MachineFunction *MF = 0) const {
- static const uint16_t CalleeSavedRegs[] = { 0 };
- return CalleeSavedRegs; // save nothing
- }
-
- virtual BitVector getReservedRegs(const MachineFunction &MF) const {
- BitVector Reserved(getNumRegs());
- return Reserved; // reserve no regs
- }
-
- virtual void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj,
- RegScavenger *RS = NULL) const;
-
- virtual unsigned getFrameRegister(const MachineFunction &MF) const {
- llvm_unreachable("PTX does not have a frame register");
- }
-}; // struct PTXRegisterInfo
-} // namespace llvm
-
-#endif // PTX_REGISTER_INFO_H
diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td
deleted file mode 100644
index e8b262e..0000000
--- a/lib/Target/PTX/PTXRegisterInfo.td
+++ /dev/null
@@ -1,36 +0,0 @@
-//===-- PTXRegisterInfo.td - PTX Register defs -------------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Declarations that describe the PTX register file
-//===----------------------------------------------------------------------===//
-
-class PTXReg<string n> : Register<n> {
- let Namespace = "PTX";
-}
-
-//===----------------------------------------------------------------------===//
-// Registers
-//===----------------------------------------------------------------------===//
-
-// The generated register info code throws warnings for empty register classes
-// (e.g. zero-length arrays), so we use a dummy register here just to prevent
-// these warnings.
-def DUMMY_REG : PTXReg<"R0">;
-
-//===----------------------------------------------------------------------===//
-// Register classes
-//===----------------------------------------------------------------------===//
-def RegPred : RegisterClass<"PTX", [i1], 8, (add DUMMY_REG)>;
-def RegI16 : RegisterClass<"PTX", [i16], 16, (add DUMMY_REG)>;
-def RegI32 : RegisterClass<"PTX", [i32], 32, (add DUMMY_REG)>;
-def RegI64 : RegisterClass<"PTX", [i64], 64, (add DUMMY_REG)>;
-def RegF32 : RegisterClass<"PTX", [f32], 32, (add DUMMY_REG)>;
-def RegF64 : RegisterClass<"PTX", [f64], 64, (add DUMMY_REG)>;
-
diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.cpp b/lib/Target/PTX/PTXSelectionDAGInfo.cpp
deleted file mode 100644
index a116fab..0000000
--- a/lib/Target/PTX/PTXSelectionDAGInfo.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//===-- PTXSelectionDAGInfo.cpp - PTX SelectionDAG Info -------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the PTXSelectionDAGInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-selectiondag-info"
-#include "PTXTargetMachine.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-using namespace llvm;
-
-PTXSelectionDAGInfo::PTXSelectionDAGInfo(const TargetMachine &TM)
- : TargetSelectionDAGInfo(TM),
- Subtarget(&TM.getSubtarget<PTXSubtarget>()) {
-}
-
-PTXSelectionDAGInfo::~PTXSelectionDAGInfo() {
-}
-
-SDValue
-PTXSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const {
- // Do repeated 4-byte loads and stores. To be improved.
- // This requires 4-byte alignment.
- if ((Align & 3) != 0)
- return SDValue();
- // This requires the copy size to be a constant, preferably
- // within a subtarget-specific limit.
- ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
- if (!ConstantSize)
- return SDValue();
- uint64_t SizeVal = ConstantSize->getZExtValue();
- // Always inline memcpys. In PTX, we do not have a C library that provides
- // a memcpy function.
- //if (!AlwaysInline)
- // return SDValue();
-
- unsigned BytesLeft = SizeVal & 3;
- unsigned NumMemOps = SizeVal >> 2;
- unsigned EmittedNumMemOps = 0;
- EVT VT = MVT::i32;
- unsigned VTSize = 4;
- unsigned i = 0;
- const unsigned MAX_LOADS_IN_LDM = 6;
- SDValue TFOps[MAX_LOADS_IN_LDM];
- SDValue Loads[MAX_LOADS_IN_LDM];
- uint64_t SrcOff = 0, DstOff = 0;
- EVT PointerType = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
-
- // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
- // same number of stores. The loads and stores will get combined into
- // ldm/stm later on.
- while (EmittedNumMemOps < NumMemOps) {
- for (i = 0;
- i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
- Loads[i] = DAG.getLoad(VT, dl, Chain,
- DAG.getNode(ISD::ADD, dl, PointerType, Src,
- DAG.getConstant(SrcOff, PointerType)),
- SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
- false, false, 0);
- TFOps[i] = Loads[i].getValue(1);
- SrcOff += VTSize;
- }
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-
- for (i = 0;
- i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
- TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
- DAG.getNode(ISD::ADD, dl, PointerType, Dst,
- DAG.getConstant(DstOff, PointerType)),
- DstPtrInfo.getWithOffset(DstOff),
- isVolatile, false, 0);
- DstOff += VTSize;
- }
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-
- EmittedNumMemOps += i;
- }
-
- if (BytesLeft == 0)
- return Chain;
-
- // Issue loads / stores for the trailing 1 to 3 bytes.
- unsigned BytesLeftSave = BytesLeft;
- i = 0;
- while (BytesLeft) {
- if (BytesLeft >= 2) {
- VT = MVT::i16;
- VTSize = 2;
- } else {
- VT = MVT::i8;
- VTSize = 1;
- }
-
- Loads[i] = DAG.getLoad(VT, dl, Chain,
- DAG.getNode(ISD::ADD, dl, PointerType, Src,
- DAG.getConstant(SrcOff, PointerType)),
- SrcPtrInfo.getWithOffset(SrcOff), false, false,
- false, 0);
- TFOps[i] = Loads[i].getValue(1);
- ++i;
- SrcOff += VTSize;
- BytesLeft -= VTSize;
- }
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-
- i = 0;
- BytesLeft = BytesLeftSave;
- while (BytesLeft) {
- if (BytesLeft >= 2) {
- VT = MVT::i16;
- VTSize = 2;
- } else {
- VT = MVT::i8;
- VTSize = 1;
- }
-
- TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
- DAG.getNode(ISD::ADD, dl, PointerType, Dst,
- DAG.getConstant(DstOff, PointerType)),
- DstPtrInfo.getWithOffset(DstOff), false, false, 0);
- ++i;
- DstOff += VTSize;
- BytesLeft -= VTSize;
- }
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-}
-
-SDValue PTXSelectionDAGInfo::
-EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
- SDValue Chain, SDValue Dst,
- SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
- MachinePointerInfo DstPtrInfo) const {
- llvm_unreachable("memset lowering not implemented for PTX yet");
-}
-
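The size split computed in EmitTargetCodeForMemcpy above is simple fixed-point arithmetic; a worked example with an assumed constant copy size:

    unsigned SizeVal   = 11;           // assumed constant memcpy size
    unsigned NumMemOps = SizeVal >> 2; // 2 full i32 load/store pairs
    unsigned BytesLeft = SizeVal & 3;  // 3-byte tail: one i16, then one i8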
diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.h b/lib/Target/PTX/PTXSelectionDAGInfo.h
deleted file mode 100644
index e0c7167..0000000
--- a/lib/Target/PTX/PTXSelectionDAGInfo.h
+++ /dev/null
@@ -1,53 +0,0 @@
-//===-- PTXSelectionDAGInfo.h - PTX SelectionDAG Info -----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PTX subclass for TargetSelectionDAGInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTXSELECTIONDAGINFO_H
-#define PTXSELECTIONDAGINFO_H
-
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-
-namespace llvm {
-
-/// PTXSelectionDAGInfo - TargetSelectionDAGInfo sub-class for the PTX target.
-/// At the moment, this is mostly just a copy of ARMSelectionDAGInfo.
-class PTXSelectionDAGInfo : public TargetSelectionDAGInfo {
- /// Subtarget - Keep a pointer to the PTXSubtarget around so that we can
- /// make the right decision when generating code for different targets.
- const PTXSubtarget *Subtarget;
-
-public:
- explicit PTXSelectionDAGInfo(const TargetMachine &TM);
- ~PTXSelectionDAGInfo();
-
- virtual
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const;
-
- virtual
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
- SDValue Chain,
- SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align,
- bool isVolatile,
- MachinePointerInfo DstPtrInfo) const;
-};
-
-}
-
-#endif
-
diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp
deleted file mode 100644
index 454f64e..0000000
--- a/lib/Target/PTX/PTXSubtarget.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-//===-- PTXSubtarget.cpp - PTX Subtarget Information ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the PTX specific subclass of TargetSubtargetInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXSubtarget.h"
-#include "PTX.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_SUBTARGETINFO_TARGET_DESC
-#define GET_SUBTARGETINFO_CTOR
-#include "PTXGenSubtargetInfo.inc"
-
-using namespace llvm;
-
-void PTXSubtarget::anchor() { }
-
-PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit)
- : PTXGenSubtargetInfo(TT, CPU, FS),
- PTXTarget(PTX_COMPUTE_1_0),
- PTXVersion(PTX_VERSION_2_0),
- SupportsDouble(false),
- SupportsFMA(true),
- Is64Bit(is64Bit) {
- std::string TARGET = CPU;
- if (TARGET.empty())
- TARGET = "generic";
- ParseSubtargetFeatures(TARGET, FS);
-}
-
-std::string PTXSubtarget::getTargetString() const {
- switch(PTXTarget) {
- default: llvm_unreachable("Unknown PTX target");
- case PTX_SM_1_0: return "sm_10";
- case PTX_SM_1_1: return "sm_11";
- case PTX_SM_1_2: return "sm_12";
- case PTX_SM_1_3: return "sm_13";
- case PTX_SM_2_0: return "sm_20";
- case PTX_SM_2_1: return "sm_21";
- case PTX_SM_2_2: return "sm_22";
- case PTX_SM_2_3: return "sm_23";
- case PTX_COMPUTE_1_0: return "compute_10";
- case PTX_COMPUTE_1_1: return "compute_11";
- case PTX_COMPUTE_1_2: return "compute_12";
- case PTX_COMPUTE_1_3: return "compute_13";
- case PTX_COMPUTE_2_0: return "compute_20";
- }
-}
-
-std::string PTXSubtarget::getPTXVersionString() const {
- switch(PTXVersion) {
- case PTX_VERSION_2_0: return "2.0";
- case PTX_VERSION_2_1: return "2.1";
- case PTX_VERSION_2_2: return "2.2";
- case PTX_VERSION_2_3: return "2.3";
- }
- llvm_unreachable("Invalid PTX version");
-}
diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h
deleted file mode 100644
index ce93fef..0000000
--- a/lib/Target/PTX/PTXSubtarget.h
+++ /dev/null
@@ -1,131 +0,0 @@
-//===-- PTXSubtarget.h - Define Subtarget for the PTX -----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the PTX specific subclass of TargetSubtargetInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_SUBTARGET_H
-#define PTX_SUBTARGET_H
-
-#include "llvm/Target/TargetSubtargetInfo.h"
-
-#define GET_SUBTARGETINFO_HEADER
-#include "PTXGenSubtargetInfo.inc"
-
-namespace llvm {
-class StringRef;
-
- class PTXSubtarget : public PTXGenSubtargetInfo {
- virtual void anchor();
- public:
-
- /**
- * Enumeration of Shader Models supported by the back-end.
- */
- enum PTXTargetEnum {
- PTX_COMPUTE_1_0, /*< Compute Compatibility 1.0 */
- PTX_COMPUTE_1_1, /*< Compute Compatibility 1.1 */
- PTX_COMPUTE_1_2, /*< Compute Compatibility 1.2 */
- PTX_COMPUTE_1_3, /*< Compute Compatibility 1.3 */
- PTX_COMPUTE_2_0, /*< Compute Compatibility 2.0 */
- PTX_LAST_COMPUTE,
-
- PTX_SM_1_0, /*< Shader Model 1.0 */
- PTX_SM_1_1, /*< Shader Model 1.1 */
- PTX_SM_1_2, /*< Shader Model 1.2 */
- PTX_SM_1_3, /*< Shader Model 1.3 */
- PTX_SM_2_0, /*< Shader Model 2.0 */
- PTX_SM_2_1, /*< Shader Model 2.1 */
- PTX_SM_2_2, /*< Shader Model 2.2 */
- PTX_SM_2_3, /*< Shader Model 2.3 */
- PTX_LAST_SM
- };
-
- /**
- * Enumeration of PTX versions supported by the back-end.
- *
- * Currently, PTX 2.0 is the minimum supported version.
- */
- enum PTXVersionEnum {
- PTX_VERSION_2_0, /*< PTX Version 2.0 */
- PTX_VERSION_2_1, /*< PTX Version 2.1 */
- PTX_VERSION_2_2, /*< PTX Version 2.2 */
- PTX_VERSION_2_3 /*< PTX Version 2.3 */
- };
-
- private:
-
- /// Shader Model supported on the target GPU.
- PTXTargetEnum PTXTarget;
-
- /// PTX Language Version.
- PTXVersionEnum PTXVersion;
-
- // The native .f64 type is supported on the hardware.
- bool SupportsDouble;
-
- // Support the fused-multiply add (FMA) and multiply-add (MAD)
- // instructions
- bool SupportsFMA;
-
- // Use .u64 instead of .u32 for addresses.
- bool Is64Bit;
-
- public:
-
- PTXSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit);
-
- // Target architecture accessors
- std::string getTargetString() const;
-
- std::string getPTXVersionString() const;
-
- bool supportsDouble() const { return SupportsDouble; }
-
- bool is64Bit() const { return Is64Bit; }
-
- bool supportsFMA() const { return SupportsFMA; }
-
- bool supportsPTX21() const { return PTXVersion >= PTX_VERSION_2_1; }
-
- bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; }
-
- bool supportsPTX23() const { return PTXVersion >= PTX_VERSION_2_3; }
-
- bool fdivNeedsRoundingMode() const {
- return (PTXTarget >= PTX_SM_1_3 && PTXTarget < PTX_LAST_SM) ||
- (PTXTarget >= PTX_COMPUTE_1_3 && PTXTarget < PTX_LAST_COMPUTE);
- }
-
- bool fmadNeedsRoundingMode() const {
- return (PTXTarget >= PTX_SM_1_3 && PTXTarget < PTX_LAST_SM) ||
- (PTXTarget >= PTX_COMPUTE_1_3 && PTXTarget < PTX_LAST_COMPUTE);
- }
-
- bool useParamSpaceForDeviceArgs() const {
- return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) ||
- (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE);
- }
-
- bool callsAreHandled() const {
- return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) ||
- (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE);
- }
-
- bool emitPtrAttribute() const {
- return PTXVersion >= PTX_VERSION_2_2;
- }
-
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- }; // class PTXSubtarget
-} // namespace llvm
-
-#endif // PTX_SUBTARGET_H
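Every capability predicate in the deleted header above repeats the same range test, which is valid only because the enumerators are declared in increasing order and each family is bounded by a sentinel (PTX_LAST_SM, PTX_LAST_COMPUTE). A sketch of the shared shape, with an invented helper name:

    static bool targetAtLeast(PTXSubtarget::PTXTargetEnum T,
                              PTXSubtarget::PTXTargetEnum MinSM,
                              PTXSubtarget::PTXTargetEnum MinCompute) {
      // The sentinels keep the SM and COMPUTE ranges disjoint.
      return (T >= MinSM && T < PTXSubtarget::PTX_LAST_SM) ||
             (T >= MinCompute && T < PTXSubtarget::PTX_LAST_COMPUTE);
    }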
diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp
deleted file mode 100644
index 97b8de1..0000000
--- a/lib/Target/PTX/PTXTargetMachine.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//===-- PTXTargetMachine.cpp - Define TargetMachine for PTX ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Top-level implementation for the PTX target.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTXTargetMachine.h"
-#include "PTX.h"
-#include "llvm/PassManager.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/Transforms/Scalar.h"
-
-
-using namespace llvm;
-
-namespace llvm {
- MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
- bool isVerboseAsm, bool useLoc,
- bool useCFI, bool useDwarfDirectory,
- MCInstPrinter *InstPrint,
- MCCodeEmitter *CE,
- MCAsmBackend *MAB,
- bool ShowInst);
-}
-
-extern "C" void LLVMInitializePTXTarget() {
-
- RegisterTargetMachine<PTX32TargetMachine> X(ThePTX32Target);
- RegisterTargetMachine<PTX64TargetMachine> Y(ThePTX64Target);
-
- TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer);
- TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer);
-}
-
-namespace {
- const char* DataLayout32 =
- "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
- const char* DataLayout64 =
- "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
-}
-
-// DataLayout and FrameLowering are filled with dummy data
-PTXTargetMachine::PTXTargetMachine(const Target &T,
- StringRef TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool is64Bit)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- DataLayout(is64Bit ? DataLayout64 : DataLayout32),
- Subtarget(TT, CPU, FS, is64Bit),
- FrameLowering(Subtarget),
- InstrInfo(*this),
- TSInfo(*this),
- TLInfo(*this) {
-}
-
-void PTX32TargetMachine::anchor() { }
-
-PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
-}
-
-void PTX64TargetMachine::anchor() { }
-
-PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
-}
-
-namespace llvm {
-/// PTX Code Generator Pass Configuration Options.
-class PTXPassConfig : public TargetPassConfig {
-public:
- PTXPassConfig(PTXTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
-
- PTXTargetMachine &getPTXTargetMachine() const {
- return getTM<PTXTargetMachine>();
- }
-
- bool addInstSelector();
- FunctionPass *createTargetRegisterAllocator(bool);
- void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
- bool addPostRegAlloc();
- void addMachineLateOptimization();
- bool addPreEmitPass();
-};
-} // namespace
-
-TargetPassConfig *PTXTargetMachine::createPassConfig(PassManagerBase &PM) {
- PTXPassConfig *PassConfig = new PTXPassConfig(this, PM);
- PassConfig->disablePass(PrologEpilogCodeInserterID);
- return PassConfig;
-}
-
-bool PTXPassConfig::addInstSelector() {
- PM->add(createPTXISelDag(getPTXTargetMachine(), getOptLevel()));
- return false;
-}
-
-FunctionPass *PTXPassConfig::createTargetRegisterAllocator(bool /*Optimized*/) {
- return createPTXRegisterAllocator();
-}
-
-// Modify the optimized compilation path to bypass optimized register allocation.
-void PTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- addFastRegAlloc(RegAllocPass);
-}
-
-bool PTXPassConfig::addPostRegAlloc() {
- // PTXMFInfoExtract must run after register allocation!
- //PM->add(createPTXMFInfoExtract(getPTXTargetMachine()));
- return false;
-}
-
-/// Add passes that optimize machine instructions after register allocation.
-void PTXPassConfig::addMachineLateOptimization() {
- if (addPass(BranchFolderPassID) != &NoPassID)
- printAndVerify("After BranchFolding");
-
- if (addPass(TailDuplicateID) != &NoPassID)
- printAndVerify("After TailDuplicate");
-}
-
-bool PTXPassConfig::addPreEmitPass() {
- PM->add(createPTXMFInfoExtract(getPTXTargetMachine(), getOptLevel()));
- PM->add(createPTXFPRoundingModePass(getPTXTargetMachine(), getOptLevel()));
- return true;
-}
diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h
deleted file mode 100644
index 278d155..0000000
--- a/lib/Target/PTX/PTXTargetMachine.h
+++ /dev/null
@@ -1,104 +0,0 @@
-//===-- PTXTargetMachine.h - Define TargetMachine for PTX -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the PTX specific subclass of TargetMachine.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_TARGET_MACHINE_H
-#define PTX_TARGET_MACHINE_H
-
-#include "PTXISelLowering.h"
-#include "PTXInstrInfo.h"
-#include "PTXFrameLowering.h"
-#include "PTXSelectionDAGInfo.h"
-#include "PTXSubtarget.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetMachine.h"
-
-namespace llvm {
-class PTXTargetMachine : public LLVMTargetMachine {
- private:
- const TargetData DataLayout;
- PTXSubtarget Subtarget; // has to be initialized before FrameLowering
- PTXFrameLowering FrameLowering;
- PTXInstrInfo InstrInfo;
- PTXSelectionDAGInfo TSInfo;
- PTXTargetLowering TLInfo;
-
- public:
- PTXTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool is64Bit);
-
- virtual const TargetData *getTargetData() const { return &DataLayout; }
-
- virtual const TargetFrameLowering *getFrameLowering() const {
- return &FrameLowering;
- }
-
- virtual const PTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
- virtual const TargetRegisterInfo *getRegisterInfo() const {
- return &InstrInfo.getRegisterInfo(); }
-
- virtual const PTXTargetLowering *getTargetLowering() const {
- return &TLInfo; }
-
- virtual const PTXSelectionDAGInfo* getSelectionDAGInfo() const {
- return &TSInfo;
- }
-
- virtual const PTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
-
- // Emission of machine code through JITCodeEmitter is not supported.
- virtual bool addPassesToEmitMachineCode(PassManagerBase &,
- JITCodeEmitter &,
- bool = true) {
- return true;
- }
-
- // Emission of machine code through MCJIT is not supported.
- virtual bool addPassesToEmitMC(PassManagerBase &,
- MCContext *&,
- raw_ostream &,
- bool = true) {
- return true;
- }
-
- // Pass Pipeline Configuration
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
-}; // class PTXTargetMachine
-
-
-class PTX32TargetMachine : public PTXTargetMachine {
- virtual void anchor();
-public:
-
- PTX32TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-}; // class PTX32TargetMachine
-
-class PTX64TargetMachine : public PTXTargetMachine {
- virtual void anchor();
-public:
-
- PTX64TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-}; // class PTX64TargetMachine
-
-} // namespace llvm
-
-#endif // PTX_TARGET_MACHINE_H
diff --git a/lib/Target/PTX/TargetInfo/CMakeLists.txt b/lib/Target/PTX/TargetInfo/CMakeLists.txt
deleted file mode 100644
index d9a5da3..0000000
--- a/lib/Target/PTX/TargetInfo/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMPTXInfo
- PTXTargetInfo.cpp
- )
-
-add_dependencies(LLVMPTXInfo PTXCommonTableGen)
diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
deleted file mode 100644
index 09a2735..0000000
--- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===-- PTXTargetInfo.cpp - PTX Target Implementation ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTX.h"
-#include "llvm/Module.h"
-#include "llvm/Support/TargetRegistry.h"
-
-using namespace llvm;
-
-Target llvm::ThePTX32Target;
-Target llvm::ThePTX64Target;
-
-extern "C" void LLVMInitializePTXTargetInfo() {
- // see llvm/ADT/Triple.h
- RegisterTarget<Triple::ptx32> X32(ThePTX32Target, "ptx32",
- "PTX (32-bit) [Experimental]");
- RegisterTarget<Triple::ptx64> X64(ThePTX64Target, "ptx64",
- "PTX (64-bit) [Experimental]");
-}
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index bcd8bd2..192d18d 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_target(PowerPCCodeGen
PPCAsmPrinter.cpp
PPCBranchSelector.cpp
PPCCodeEmitter.cpp
+ PPCCTRLoops.cpp
PPCHazardRecognizers.cpp
PPCInstrInfo.cpp
PPCISelDAGToDAG.cpp
@@ -28,6 +29,8 @@ add_llvm_target(PowerPCCodeGen
PPCSelectionDAGInfo.cpp
)
+add_dependencies(LLVMPowerPCCodeGen intrinsics_gen)
+
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 61d23ce..d175e3e 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -86,8 +86,33 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O,
const char *Modifier) {
- assert(Modifier && "Must specify 'cc' or 'reg' as predicate op modifier!");
unsigned Code = MI->getOperand(OpNo).getImm();
+ if (!Modifier) {
+ unsigned CCReg = MI->getOperand(OpNo+1).getReg();
+ unsigned RegNo;
+ switch (CCReg) {
+ default: llvm_unreachable("Unknown CR register");
+ case PPC::CR0: RegNo = 0; break;
+ case PPC::CR1: RegNo = 1; break;
+ case PPC::CR2: RegNo = 2; break;
+ case PPC::CR3: RegNo = 3; break;
+ case PPC::CR4: RegNo = 4; break;
+ case PPC::CR5: RegNo = 5; break;
+ case PPC::CR6: RegNo = 6; break;
+ case PPC::CR7: RegNo = 7; break;
+ }
+
+ // Print the CR bit number. The Code is ((BI << 5) | BO) for a
+ // BCC, but we must have the positive form here (BO == 12)
+ unsigned BI = Code >> 5;
+ assert((Code & 0xF) == 12 &&
+ "BO in predicate bit must have the positive form");
+
+ unsigned Value = 4*RegNo + BI;
+ O << Value;
+ return;
+ }
+
if (StringRef(Modifier) == "cc") {
switch ((PPC::Predicate)Code) {
case PPC::PRED_ALWAYS: return; // Don't print anything for always.
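A worked instance of the arithmetic in the new no-modifier path (the register and bit choices are illustrative): the predicate immediate packs the CR bit index above the branch opcode field, and the printed value is the absolute CR bit number.

    unsigned Code  = (2 << 5) | 12;  // BI = 2 (the EQ bit), BO = 12 (positive form)
    unsigned BI    = Code >> 5;      // recovers 2
    unsigned Value = 4 * 3 + BI;     // on CR3 this is absolute CR bit 14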
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 73fd534..8f1e211 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -42,7 +42,7 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printPredicateOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O, const char *Modifier);
+ raw_ostream &O, const char *Modifier = 0);
void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 5a6827f..f652422 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -77,6 +77,7 @@ public:
} // end anonymous namespace
MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new PPCMCCodeEmitter(MCII, STI, Ctx);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index b7fa064..7162e15 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -22,6 +22,7 @@ class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCObjectWriter;
+class MCRegisterInfo;
class MCSubtargetInfo;
class Target;
class StringRef;
@@ -31,6 +32,7 @@ extern Target ThePPC32Target;
extern Target ThePPC64Target;
MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 24a7178..9103e12 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -30,6 +30,7 @@ namespace llvm {
class AsmPrinter;
class MCInst;
+ FunctionPass *createPPCCTRLoops();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
@@ -50,21 +51,27 @@ namespace llvm {
/// and jumps to external functions on Tiger and earlier.
MO_DARWIN_STUB = 1,
- /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol)
- MO_LO16 = 4, MO_HA16 = 8,
-
/// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
/// the function's picbase, e.g. lo16(symbol-picbase).
- MO_PIC_FLAG = 16,
+ MO_PIC_FLAG = 4,
/// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
/// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
- MO_NLP_FLAG = 32,
+ MO_NLP_FLAG = 8,
/// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
/// symbol with hidden visibility. This causes a different kind of
/// non-lazy-pointer to be generated.
- MO_NLP_HIDDEN_FLAG = 64
+ MO_NLP_HIDDEN_FLAG = 16,
+
+ /// The following are not single-bit flags but distinct values in the
+ /// access-kind field selected by MO_ACCESS_MASK.
+ MO_ACCESS_MASK = 224,
+
+ /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol)
+ MO_LO16 = 32, MO_HA16 = 64,
+
+ MO_TPREL16_HA = 96,
+ MO_TPREL16_LO = 128
};
} // end namespace PPCII
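A small sketch of how the renumbered values combine (all names are from the hunk above): the low bits remain independent single-bit flags, while MO_ACCESS_MASK (224, bits 5 through 7) extracts one multi-bit access kind.

    unsigned Flags  = PPCII::MO_PIC_FLAG | PPCII::MO_LO16;  // 4 | 32 == 36
    unsigned Access = Flags & PPCII::MO_ACCESS_MASK;        // 32, i.e. MO_LO16
    bool PicRel     = (Flags & PPCII::MO_PIC_FLAG) != 0;    // true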
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index c554d39..b7f1688 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -35,6 +35,8 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
+def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
"Enable 64-bit instructions">;
@@ -42,12 +44,14 @@ def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
"Enable 64-bit registers usage for ppc32 [beta]">;
def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
"Enable Altivec instructions">;
-def FeatureGPUL : SubtargetFeature<"gpul","IsGigaProcessor", "true",
- "Enable GPUL instructions">;
+def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
+ "Enable the MFOCRF instruction">;
def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
"Enable the fsqrt instruction">;
def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
"Enable the stfiwx instruction">;
+def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true",
+ "Enable the isel instruction">;
def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true",
"Enable Book E instructions">;
@@ -64,8 +68,10 @@ include "PPCInstrInfo.td"
//
def : Processor<"generic", G3Itineraries, [Directive32]>;
-def : Processor<"440", PPC440Itineraries, [Directive440, FeatureBookE]>;
-def : Processor<"450", PPC440Itineraries, [Directive440, FeatureBookE]>;
+def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
+ FeatureBookE]>;
+def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
+ FeatureBookE]>;
def : Processor<"601", G3Itineraries, [Directive601]>;
def : Processor<"602", G3Itineraries, [Directive602]>;
def : Processor<"603", G3Itineraries, [Directive603]>;
@@ -74,28 +80,37 @@ def : Processor<"603ev", G3Itineraries, [Directive603]>;
def : Processor<"604", G3Itineraries, [Directive604]>;
def : Processor<"604e", G3Itineraries, [Directive604]>;
def : Processor<"620", G3Itineraries, [Directive620]>;
-def : Processor<"g3", G3Itineraries, [Directive7400]>;
+def : Processor<"750", G4Itineraries, [Directive750]>;
+def : Processor<"g3", G3Itineraries, [Directive750]>;
def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>;
def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>;
def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
-def : Processor<"g4+", G4PlusItineraries, [Directive750, FeatureAltivec]>;
-def : Processor<"750", G4Itineraries, [Directive750, FeatureAltivec]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
def : Processor<"970", G5Itineraries,
[Directive970, FeatureAltivec,
- FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+ FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
Feature64Bit /*, Feature64BitRegs */]>;
def : Processor<"g5", G5Itineraries,
[Directive970, FeatureAltivec,
- FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+ FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
Feature64Bit /*, Feature64BitRegs */]>;
-def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE,
- FeatureFSqrt, FeatureSTFIWX,
- Feature64Bit
- /*, Feature64BitRegs */]>;
+def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE,
+ FeatureMFOCRF, FeatureFSqrt,
+ FeatureSTFIWX, FeatureISEL,
+ Feature64Bit
+ /*, Feature64BitRegs */]>;
+def : Processor<"pwr6", G5Itineraries,
+ [DirectivePwr6, FeatureAltivec,
+ FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+ Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"pwr7", G5Itineraries,
+ [DirectivePwr7, FeatureAltivec,
+ FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+ FeatureISEL, Feature64Bit /*, Feature64BitRegs */]>;
def : Processor<"ppc", G3Itineraries, [Directive32]>;
def : Processor<"ppc64", G5Itineraries,
[Directive64, FeatureAltivec,
- FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+ FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
Feature64Bit /*, Feature64BitRegs */]>;
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index fb7aa71..f76b89c 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -22,8 +22,8 @@
#include "PPCSubtarget.h"
#include "InstPrinter/PPCInstPrinter.h"
#include "MCTargetDesc/PPCPredicates.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
#include "llvm/Assembly/Writer.h"
@@ -248,7 +248,9 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
if (ExtraCode[1] != 0) return true; // Unknown modifier.
switch (ExtraCode[0]) {
- default: return true; // Unknown modifier.
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
case 'c': // Don't print "$" before a global var name or constant.
break; // PPC never has a prefix.
case 'L': // Write second word of DImode reference.
@@ -451,11 +453,13 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
"ppc750",
"ppc970",
"ppcA2",
+ "power6",
+ "power7",
"ppc64"
};
unsigned Directive = Subtarget.getDarwinDirective();
- if (Subtarget.isGigaProcessor() && Directive < PPC::DIR_970)
+ if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970)
Directive = PPC::DIR_970;
if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
Directive = PPC::DIR_7400;
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 5f775e1..21a0fb2 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -135,21 +135,33 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
MBBStartOffset += 4;
continue;
}
-
+
// Otherwise, we have to expand it to a long branch.
- // The BCC operands are:
- // 0. PPC branch predicate
- // 1. CR register
- // 2. Target MBB
- PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
- unsigned CRReg = I->getOperand(1).getReg();
-
MachineInstr *OldBranch = I;
DebugLoc dl = OldBranch->getDebugLoc();
-
- // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
- BuildMI(MBB, I, dl, TII->get(PPC::BCC))
- .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+
+ if (I->getOpcode() == PPC::BCC) {
+ // The BCC operands are:
+ // 0. PPC branch predicate
+ // 1. CR register
+ // 2. Target MBB
+ PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+ unsigned CRReg = I->getOperand(1).getReg();
+
+ // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+ BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+ .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+ } else if (I->getOpcode() == PPC::BDNZ) {
+ BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
+ } else if (I->getOpcode() == PPC::BDNZ8) {
+ BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
+ } else if (I->getOpcode() == PPC::BDZ) {
+ BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
+ } else if (I->getOpcode() == PPC::BDZ8) {
+ BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
+ } else {
+ llvm_unreachable("Unhandled branch type!");
+ }
// Uncond branch to the real destination.
I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
new file mode 100644
index 0000000..2a2abb1
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -0,0 +1,724 @@
+//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the PPC branch instructions
+// that decrement and test the count register (CTR) (bdnz and friends).
+// This pass is based on the HexagonHardwareLoops pass.
+//
+// The pattern that defines the induction variable can change depending on
+// prior optimizations. For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for CTR loops:
+// - Countable loops (w/ ind. var for a trip count)
+// - Assumes loops are normalized by IndVarSimplify
+// - Try inner-most loops first
+// - No nested CTR loops.
+// - No function calls in loops.
+//
+// Note: As with unconverted loops, PPCBranchSelector must be run after this
+// pass in order to convert long-displacement jumps into jump pairs.
+//
+//===----------------------------------------------------------------------===//
+
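+// For illustration (an assumed example, not taken from this pass): a loop
+// such as
+//
+//   for (int i = 0; i != n; ++i)
+//     a[i] = b[i] + c[i];
+//
+// is countable, innermost, and call-free, so its compare-and-branch latch
+// can be replaced by a single bdnz once the trip count is moved into CTR.
+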
+#define DEBUG_TYPE "ctrloops"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
+
+namespace {
+ class CountValue;
+ struct PPCCTRLoops : public MachineFunctionPass {
+ MachineLoopInfo *MLI;
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ PPCCTRLoops() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ const char *getPassName() const { return "PPC CTR Loops"; }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ /// getCanonicalInductionVariable - Check to see if the loop has a canonical
+ /// induction variable.
+ /// Should be defined in MachineLoop. Based upon the version in class Loop.
+ void getCanonicalInductionVariable(MachineLoop *L,
+ SmallVector<MachineInstr *, 4> &IVars,
+ SmallVector<MachineInstr *, 4> &IOps) const;
+
+ /// getTripCount - Return a loop-invariant LLVM value indicating the
+ /// number of times the loop will be executed. If the trip-count cannot
+ /// be determined, this returns null.
+ CountValue *getTripCount(MachineLoop *L,
+ SmallVector<MachineInstr *, 2> &OldInsts) const;
+
+ /// isInductionOperation - Return true if the instruction matches the
+ /// pattern for an operation that defines an induction variable.
+ bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const;
+
+ /// isInvalidLoopOperation - Return true if the instruction is not valid within
+ /// a CTR loop.
+ bool isInvalidLoopOperation(const MachineInstr *MI) const;
+
+ /// containsInvalidInstruction - Return true if the loop contains an
+ /// instruction that inhibits using the CTR loop.
+ bool containsInvalidInstruction(MachineLoop *L) const;
+
+ /// convertToCTRLoop - Given a loop, check if we can convert it to a
+ /// CTR loop. If so, then perform the conversion and return true.
+ bool convertToCTRLoop(MachineLoop *L);
+
+ /// isDead - Return true if the instruction is now dead.
+ bool isDead(const MachineInstr *MI,
+ SmallVector<MachineInstr *, 1> &DeadPhis) const;
+
+ /// removeIfDead - Remove the instruction if it is now dead.
+ void removeIfDead(MachineInstr *MI);
+ };
+
+ char PPCCTRLoops::ID = 0;
+
+
+ // CountValue class - Abstraction for a trip count of a loop. A
+ // smaller version of the MachineOperand class without the concerns
+ // of changing the operand representation.
+ class CountValue {
+ public:
+ enum CountValueType {
+ CV_Register,
+ CV_Immediate
+ };
+ private:
+ CountValueType Kind;
+ union Values {
+ unsigned RegNum;
+ int64_t ImmVal;
+ Values(unsigned r) : RegNum(r) {}
+ Values(int64_t i) : ImmVal(i) {}
+ } Contents;
+ bool isNegative;
+
+ public:
+ CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r),
+ isNegative(neg) {}
+ explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i),
+ isNegative(i < 0) {}
+ CountValueType getType() const { return Kind; }
+ bool isReg() const { return Kind == CV_Register; }
+ bool isImm() const { return Kind == CV_Immediate; }
+ bool isNeg() const { return isNegative; }
+
+ unsigned getReg() const {
+ assert(isReg() && "Wrong CountValue accessor");
+ return Contents.RegNum;
+ }
+ void setReg(unsigned Val) {
+ Contents.RegNum = Val;
+ }
+ int64_t getImm() const {
+ assert(isImm() && "Wrong CountValue accessor");
+ if (isNegative) {
+ return -Contents.ImmVal;
+ }
+ return Contents.ImmVal;
+ }
+ void setImm(int64_t Val) {
+ Contents.ImmVal = Val;
+ }
+
+ void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
+ if (isReg()) { OS << PrintReg(getReg()); }
+ if (isImm()) { OS << getImm(); }
+ }
+ };
+} // end anonymous namespace
+
+
+/// isCompareEqualsImm - Returns true if the instruction is a compare equals
+/// instruction with an immediate operand.
+static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp) {
+ if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPDI) {
+ SignedCmp = true;
+ return true;
+ } else if (MI->getOpcode() == PPC::CMPLWI || MI->getOpcode() == PPC::CMPLDI) {
+ SignedCmp = false;
+ return true;
+ }
+
+ return false;
+}
+
+
+/// createPPCCTRLoops - Factory for creating
+/// the CTR loop phase.
+FunctionPass *llvm::createPPCCTRLoops() {
+ return new PPCCTRLoops();
+}
+
+
+bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********* PPC CTR Loops *********\n");
+
+ bool Changed = false;
+
+ // Get the loop information.
+ MLI = &getAnalysis<MachineLoopInfo>();
+ // Get the register information.
+ MRI = &MF.getRegInfo();
+ // Get the target-specific instruction info.
+ TII = MF.getTarget().getInstrInfo();
+
+ for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+ I != E; ++I) {
+ MachineLoop *L = *I;
+ if (!L->getParentLoop()) {
+ Changed |= convertToCTRLoop(L);
+ }
+ }
+
+ return Changed;
+}
+
+/// getCanonicalInductionVariable - Check to see if the loop has a canonical
+/// induction variable. We check for a simple recurrence pattern - an
+/// integer recurrence that decrements by one each time through the loop and
+/// ends at zero. If so, return the phi node that corresponds to it.
+///
+/// Based upon the similar code in LoopInfo except this code is specific to
+/// the machine.
+/// This method assumes that the IndVarSimplify pass has been run by 'opt'.
+///
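+/// For illustration, the pattern being matched is roughly (virtual
+/// registers and blocks are placeholders):
+///   vreg1 = PHI vreg0, <preheader>, vreg2, <latch>
+///   vreg2 = ADDI vreg1, -1
+/// i.e., a recurrence whose backedge value is formed by an addi of the
+/// PHI's result.
+///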
+void
+PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
+ SmallVector<MachineInstr *, 4> &IVars,
+ SmallVector<MachineInstr *, 4> &IOps) const {
+ MachineBasicBlock *TopMBB = L->getTopBlock();
+ MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+ assert(PI != TopMBB->pred_end() &&
+         "Loop must have at least one incoming edge!");
+ MachineBasicBlock *Backedge = *PI++;
+ if (PI == TopMBB->pred_end()) return; // dead loop
+ MachineBasicBlock *Incoming = *PI++;
+ if (PI != TopMBB->pred_end()) return; // multiple backedges?
+
+  // Make sure there is one incoming edge and one backedge, and determine
+  // which is which.
+ if (L->contains(Incoming)) {
+ if (L->contains(Backedge))
+ return;
+ std::swap(Incoming, Backedge);
+ } else if (!L->contains(Backedge))
+ return;
+
+ // Loop over all of the PHI nodes, looking for a canonical induction variable:
+ // - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
+ // - The recurrence comes from the backedge.
+  //   - The definition is an induction operation.
+ for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end();
+ I != E && I->isPHI(); ++I) {
+ MachineInstr *MPhi = &*I;
+ unsigned DefReg = MPhi->getOperand(0).getReg();
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+ // Check each operand for the value from the backedge.
+ MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB();
+      if (L->contains(MBB)) { // operand comes from the backedge
+ // Check if the definition is an induction operation.
+ MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
+ if (isInductionOperation(DI, DefReg)) {
+ IOps.push_back(DI);
+ IVars.push_back(MPhi);
+ }
+ }
+ }
+ }
+ return;
+}
+
+/// getTripCount - Return a loop-invariant LLVM value indicating the
+/// number of times the loop will be executed. The trip count can
+/// be either a register or a constant value. If the trip-count
+/// cannot be determined, this returns null.
+///
+/// We find the trip count from the phi instruction that defines the
+/// induction variable. We follow the links to the CMP instruction
+/// to get the trip count.
+///
+/// Based upon getTripCount in LoopInfo.
+///
+CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
+ SmallVector<MachineInstr *, 2> &OldInsts) const {
+ MachineBasicBlock *LastMBB = L->getExitingBlock();
+ // Don't generate a CTR loop if the loop has more than one exit.
+ if (LastMBB == 0)
+ return 0;
+
+ MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+ if (LastI->getOpcode() != PPC::BCC)
+ return 0;
+
+ // We need to make sure that this compare is defining the condition
+ // register actually used by the terminating branch.
+
+ unsigned PredReg = LastI->getOperand(1).getReg();
+ DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
+
+ unsigned PredCond = LastI->getOperand(0).getImm();
+ if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
+ return 0;
+
+  // Check that the loop has an induction variable.
+ SmallVector<MachineInstr *, 4> IVars, IOps;
+ getCanonicalInductionVariable(L, IVars, IOps);
+ for (unsigned i = 0; i < IVars.size(); ++i) {
+ MachineInstr *IOp = IOps[i];
+ MachineInstr *IV_Inst = IVars[i];
+
+    // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm':
+    //   if Imm is 0, get the count from the PHI operand;
+    //   if Imm is -M, then M is the count;
+    //   otherwise, Imm is the count.
+ MachineOperand *IV_Opnd;
+ const MachineOperand *InitialValue;
+ if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
+ InitialValue = &IV_Inst->getOperand(1);
+ IV_Opnd = &IV_Inst->getOperand(3);
+ } else {
+ InitialValue = &IV_Inst->getOperand(3);
+ IV_Opnd = &IV_Inst->getOperand(1);
+ }
+
+ DEBUG(dbgs() << "Considering:\n");
+ DEBUG(dbgs() << " induction operation: " << *IOp);
+ DEBUG(dbgs() << " induction variable: " << *IV_Inst);
+ DEBUG(dbgs() << " initial value: " << *InitialValue << "\n");
+
+ // Look for the cmp instruction to determine if we
+ // can get a useful trip count. The trip count can
+ // be either a register or an immediate. The location
+ // of the value depends upon the type (reg or imm).
+ for (MachineRegisterInfo::reg_iterator
+ RI = MRI->reg_begin(IV_Opnd->getReg()), RE = MRI->reg_end();
+ RI != RE; ++RI) {
+ IV_Opnd = &RI.getOperand();
+ bool SignedCmp;
+ MachineInstr *MI = IV_Opnd->getParent();
+ if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp) &&
+ MI->getOperand(0).getReg() == PredReg) {
+
+ OldInsts.push_back(MI);
+ OldInsts.push_back(IOp);
+
+ DEBUG(dbgs() << " compare: " << *MI);
+
+ const MachineOperand &MO = MI->getOperand(2);
+ assert(MO.isImm() && "IV Cmp Operand should be an immediate");
+
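+        // cmpwi/cmpdi take a signed 16-bit immediate, so sign-extend it via
+        // the (short) cast; the logical (unsigned) forms use the raw value.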
+ int64_t ImmVal;
+ if (SignedCmp)
+ ImmVal = (short) MO.getImm();
+ else
+ ImmVal = MO.getImm();
+
+ const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
+ assert(L->contains(IV_DefInstr->getParent()) &&
+               "IV definition should occur in loop");
+ int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
+
+ assert(InitialValue->isReg() && "Expecting register for init value");
+ unsigned InitialValueReg = InitialValue->getReg();
+
+ const MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
+
+ // Here we need to look for an immediate load (an li or lis/ori pair).
+ if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
+ DefInstr->getOpcode() == PPC::ORI)) {
+ int64_t start = (short) DefInstr->getOperand(2).getImm();
+ const MachineInstr *DefInstr2 =
+ MRI->getVRegDef(DefInstr->getOperand(0).getReg());
+ if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
+ DefInstr2->getOpcode() == PPC::LIS)) {
+ DEBUG(dbgs() << " initial constant: " << *DefInstr);
+ DEBUG(dbgs() << " initial constant: " << *DefInstr2);
+
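+            // The lis supplies the high 16 bits of the starting value and
+            // the ori the low 16; combine them to recover the full constant.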
+ start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
+
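+            // The trip count is (final value - start) / step; give up
+            // unless the step divides the distance evenly.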
+ int64_t count = ImmVal - start;
+ if ((count % iv_value) != 0) {
+ return 0;
+ }
+ return new CountValue(count/iv_value);
+ }
+ } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
+ DefInstr->getOpcode() == PPC::LI)) {
+ DEBUG(dbgs() << " initial constant: " << *DefInstr);
+
+ int64_t count = ImmVal - int64_t(short(DefInstr->getOperand(1).getImm()));
+ if ((count % iv_value) != 0) {
+ return 0;
+ }
+ return new CountValue(count/iv_value);
+ } else if (iv_value == 1 || iv_value == -1) {
+ // We can't determine a constant starting value.
+ if (ImmVal == 0) {
+ return new CountValue(InitialValueReg, iv_value > 0);
+ }
+ // FIXME: handle non-zero end value.
+ }
+ // FIXME: handle non-unit increments (we might not want to introduce division
+ // but we can handle some 2^n cases with shifts).
+
+ }
+ }
+ }
+ return 0;
+}
+
+/// isInductionOperation - Return true if the operation matches the
+/// pattern that defines an induction variable:
+/// addi iv, c
+///
+bool
+PPCCTRLoops::isInductionOperation(const MachineInstr *MI,
+ unsigned IVReg) const {
+ return ((MI->getOpcode() == PPC::ADDI || MI->getOpcode() == PPC::ADDI8) &&
+ MI->getOperand(1).isReg() && // could be a frame index instead
+ MI->getOperand(1).getReg() == IVReg);
+}
+
+/// isInvalidLoopOperation - Return true if the operation is invalid within
+/// a CTR loop.
+bool
+PPCCTRLoops::isInvalidLoopOperation(const MachineInstr *MI) const {
+
+ // call is not allowed because the callee may use a CTR loop
+ if (MI->getDesc().isCall()) {
+ return true;
+ }
+  // check if the instruction defines the count register (CTR or CTR8)
+ // (this will also catch nested CTR loops)
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() &&
+ (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// containsInvalidInstruction - Return true if the loop contains
+/// an instruction that inhibits the use of the CTR loop.
+///
+bool PPCCTRLoops::containsInvalidInstruction(MachineLoop *L) const {
+ const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = Blocks[i];
+ for (MachineBasicBlock::iterator
+ MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
+ const MachineInstr *MI = &*MII;
+ if (isInvalidLoopOperation(MI)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// isDead returns true if the instruction is dead
+/// (this was essentially copied from DeadMachineInstructionElim::isDead, but
+/// with special cases for inline asm, physical registers and instructions with
+/// side effects removed)
+bool PPCCTRLoops::isDead(const MachineInstr *MI,
+ SmallVector<MachineInstr *, 1> &DeadPhis) const {
+ // Examine each operand.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ unsigned Reg = MO.getReg();
+ if (!MRI->use_nodbg_empty(Reg)) {
+      // This instruction has users, but if the only user is the phi node
+      // for the parent block, and the only use of that phi node is this
+      // instruction, then this instruction is dead: both it and the phi
+      // node can be removed.
+ MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg);
+ if (llvm::next(I) == MRI->use_end() &&
+ I.getOperand().getParent()->isPHI()) {
+ MachineInstr *OnePhi = I.getOperand().getParent();
+
+ for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
+ const MachineOperand &OPO = OnePhi->getOperand(j);
+ if (OPO.isReg() && OPO.isDef()) {
+ unsigned OPReg = OPO.getReg();
+
+ MachineRegisterInfo::use_iterator nextJ;
+ for (MachineRegisterInfo::use_iterator J = MRI->use_begin(OPReg),
+ E = MRI->use_end(); J!=E; J=nextJ) {
+ nextJ = llvm::next(J);
+ MachineOperand& Use = J.getOperand();
+ MachineInstr *UseMI = Use.getParent();
+
+ if (MI != UseMI) {
+ // The phi node has a user that is not MI, bail...
+ return false;
+ }
+ }
+ }
+ }
+
+ DeadPhis.push_back(OnePhi);
+ } else {
+ // This def has a non-debug use. Don't delete the instruction!
+ return false;
+ }
+ }
+ }
+ }
+
+ // If there are no defs with uses, the instruction is dead.
+ return true;
+}
+
+void PPCCTRLoops::removeIfDead(MachineInstr *MI) {
+ // This procedure was essentially copied from DeadMachineInstructionElim
+
+ SmallVector<MachineInstr *, 1> DeadPhis;
+ if (isDead(MI, DeadPhis)) {
+ DEBUG(dbgs() << "CTR looping will remove: " << *MI);
+
+ // It is possible that some DBG_VALUE instructions refer to this
+ // instruction. Examine each def operand for such references;
+ // if found, mark the DBG_VALUE as undef (but don't delete it).
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ MachineRegisterInfo::use_iterator nextI;
+ for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+ E = MRI->use_end(); I!=E; I=nextI) {
+ nextI = llvm::next(I); // I is invalidated by the setReg
+ MachineOperand& Use = I.getOperand();
+ MachineInstr *UseMI = Use.getParent();
+ if (UseMI==MI)
+ continue;
+        if (Use.isDebug()) // this might also be an instr -> phi -> instr
+                           // case, which can also be removed.
+ UseMI->getOperand(0).setReg(0U);
+ }
+ }
+
+ MI->eraseFromParent();
+ for (unsigned i = 0; i < DeadPhis.size(); ++i) {
+ DeadPhis[i]->eraseFromParent();
+ }
+ }
+}
+
+/// convertToCTRLoop - Check if the loop is a candidate for
+/// conversion to a CTR loop. If so, then perform the
+/// transformation.
+///
+/// This function works on innermost loops first. A loop can
+/// be converted if it is a counting loop whose trip count is
+/// either a register value or an immediate.
+///
+/// The code makes several assumptions about the representation
+/// of the loop in LLVM.
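+///
+/// For illustration (instruction names only, operands elided), a loop of
+/// n iterations becomes roughly:
+///   li/lis+ori <r>, n ; mtctr <r> ; loop: ... ; bdnz loop
+/// so the induction variable and its compare can often be deleted.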
+bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
+ bool Changed = false;
+ // Process nested loops first.
+ for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+ Changed |= convertToCTRLoop(*I);
+ }
+ // If a nested loop has been converted, then we can't convert this loop.
+ if (Changed) {
+ return Changed;
+ }
+
+ SmallVector<MachineInstr *, 2> OldInsts;
+ // Are we able to determine the trip count for the loop?
+ CountValue *TripCount = getTripCount(L, OldInsts);
+ if (TripCount == 0) {
+ DEBUG(dbgs() << "failed to get trip count!\n");
+ return false;
+ }
+ // Does the loop contain any invalid instructions?
+ if (containsInvalidInstruction(L)) {
+ return false;
+ }
+ MachineBasicBlock *Preheader = L->getLoopPreheader();
+  // No preheader means there's no place for the loop setup instructions.
+ if (Preheader == 0) {
+ return false;
+ }
+ MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+ DebugLoc dl;
+ if (InsertPos != Preheader->end())
+ dl = InsertPos->getDebugLoc();
+
+ MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate a CTR loop if the loop has more than one exit.
+ if (LastMBB == 0) {
+ return false;
+ }
+ MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+
+ // Determine the loop start.
+ MachineBasicBlock *LoopStart = L->getTopBlock();
+ if (L->getLoopLatch() != LastMBB) {
+ // When the exit and latch are not the same, use the latch block as the
+ // start.
+    // The loop start address is used only after the first iteration, and the
+    // loop latch may contain instructions that need to be executed after the
+    // first iteration.
+ LoopStart = L->getLoopLatch();
+ // Make sure the latch is a successor of the exit, otherwise it won't work.
+ if (!LastMBB->isSuccessor(LoopStart)) {
+ return false;
+ }
+ }
+
+ // Convert the loop to a CTR loop
+ DEBUG(dbgs() << "Change to CTR loop at "; L->dump());
+
+ MachineFunction *MF = LastMBB->getParent();
+ const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
+ bool isPPC64 = Subtarget.isPPC64();
+
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
+
+ unsigned CountReg;
+ if (TripCount->isReg()) {
+ // Create a copy of the loop count register.
+ const TargetRegisterClass *SrcRC =
+ MF->getRegInfo().getRegClass(TripCount->getReg());
+ CountReg = MF->getRegInfo().createVirtualRegister(RC);
+ unsigned CopyOp = (isPPC64 && SrcRC == GPRC) ?
+ (unsigned) PPC::EXTSW_32_64 :
+ (unsigned) TargetOpcode::COPY;
+ BuildMI(*Preheader, InsertPos, dl,
+ TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
+ if (TripCount->isNeg()) {
+ unsigned CountReg1 = CountReg;
+ CountReg = MF->getRegInfo().createVirtualRegister(RC);
+ BuildMI(*Preheader, InsertPos, dl,
+ TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
+ CountReg).addReg(CountReg1);
+ }
+ } else {
+    assert(TripCount->isImm() && "Expecting immediate value for trip count");
+ // Put the trip count in a register for transfer into the count register.
+
+ int64_t CountImm = TripCount->getImm();
+ assert(!TripCount->isNeg() && "Constant trip count must be positive");
+
+ CountReg = MF->getRegInfo().createVirtualRegister(RC);
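+    // A count that does not fit in 16 bits is materialized with a lis/ori
+    // pair (high and low halfwords); otherwise a single li suffices.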
+ if (CountImm > 0xFFFF) {
+ BuildMI(*Preheader, InsertPos, dl,
+ TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS),
+ CountReg).addImm(CountImm >> 16);
+ unsigned CountReg1 = CountReg;
+ CountReg = MF->getRegInfo().createVirtualRegister(RC);
+ BuildMI(*Preheader, InsertPos, dl,
+ TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
+ CountReg).addReg(CountReg1).addImm(CountImm & 0xFFFF);
+ } else {
+ BuildMI(*Preheader, InsertPos, dl,
+ TII->get(isPPC64 ? PPC::LI8 : PPC::LI),
+ CountReg).addImm(CountImm);
+ }
+ }
+
+  // Add the mtctr instruction to the end of the loop preheader.
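+  // (A count materialized from an immediate has no other users, so it is
+  // killed at the mtctr; a register-based count is left alive.)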
+ BuildMI(*Preheader, InsertPos, dl,
+ TII->get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(CountReg,
+ TripCount->isImm() ? RegState::Kill : 0);
+
+  // Make sure the loop start always has a reference in the CFG. We need to
+  // create a BlockAddress operand; for this mechanism to work, both the
+  // MachineBasicBlock and BasicBlock objects need the flag set.
+ LoopStart->setHasAddressTaken();
+ // This line is needed to set the hasAddressTaken flag on the BasicBlock
+ // object
+ BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+ // Replace the loop branch with a bdnz instruction.
+ dl = LastI->getDebugLoc();
+ const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
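+  // The new loop-ending branch implicitly reads and decrements the CTR, so
+  // record the CTR as live-in to the loop's blocks.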
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = Blocks[i];
+ if (MBB != Preheader)
+ MBB->addLiveIn(isPPC64 ? PPC::CTR8 : PPC::CTR);
+ }
+
+ // The loop ends with either:
+ // - a conditional branch followed by an unconditional branch, or
+ // - a conditional branch to the loop start.
+ assert(LastI->getOpcode() == PPC::BCC &&
+ "loop end must start with a BCC instruction");
+ // Either the BCC branches to the beginning of the loop, or it
+ // branches out of the loop and there is an unconditional branch
+ // to the start of the loop.
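+  // (bdnz decrements the CTR and branches when the result is nonzero; bdz
+  // branches when the decremented CTR reaches zero, preserving the sense of
+  // an exit branch.)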
+ MachineBasicBlock *BranchTarget = LastI->getOperand(2).getMBB();
+ BuildMI(*LastMBB, LastI, dl,
+ TII->get((BranchTarget == LoopStart) ?
+ (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+ (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
+
+  // The old conditional branch is no longer needed; just delete it.
+ DEBUG(dbgs() << "Removing old branch: " << *LastI);
+ LastMBB->erase(LastI);
+
+ delete TripCount;
+
+ // The induction operation (add) and the comparison (cmpwi) may now be
+ // unneeded. If these are unneeded, then remove them.
+ for (unsigned i = 0; i < OldInsts.size(); ++i)
+ removeIfDead(OldInsts[i]);
+
+ ++NumCTRLoops;
+ return true;
+}
+
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index b77a80b..c24afa9 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -330,6 +330,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0);
if (HasFP)
+ // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative
+ // offsets of R1 is not allowed.
BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
.addReg(PPC::R31)
.addImm(FPOffset)
@@ -366,9 +368,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC) ,PPC::R0)
.addReg(PPC::R0, RegState::Kill)
.addImm(NegFrameSize);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX))
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
.addReg(PPC::R1, RegState::Kill)
- .addReg(PPC::R1, RegState::Define)
+ .addReg(PPC::R1)
.addReg(PPC::R0);
} else if (isInt<16>(NegFrameSize)) {
BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1)
@@ -381,9 +383,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
.addReg(PPC::R0, RegState::Kill)
.addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX))
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
.addReg(PPC::R1, RegState::Kill)
- .addReg(PPC::R1, RegState::Define)
+ .addReg(PPC::R1)
.addReg(PPC::R0);
}
} else { // PPC64.
@@ -399,9 +401,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0)
.addReg(PPC::X0)
.addImm(NegFrameSize);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX))
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
.addReg(PPC::X1, RegState::Kill)
- .addReg(PPC::X1, RegState::Define)
+ .addReg(PPC::X1)
.addReg(PPC::X0);
} else if (isInt<16>(NegFrameSize)) {
BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
@@ -414,9 +416,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
.addReg(PPC::X0, RegState::Kill)
.addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX))
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
.addReg(PPC::X1, RegState::Kill)
- .addReg(PPC::X1, RegState::Define)
+ .addReg(PPC::X1)
.addReg(PPC::X0);
}
}
@@ -492,7 +494,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
// subregisters of CR2. We just need to emit a move of CR2.
- if (PPC::CRBITRCRegisterClass->contains(Reg))
+ if (PPC::CRBITRCRegClass.contains(Reg))
continue;
MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
@@ -817,7 +819,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (PPC::GPRCRegisterClass->contains(Reg)) {
+ if (PPC::GPRCRegClass.contains(Reg)) {
HasGPSaveArea = true;
GPRegs.push_back(CSI[i]);
@@ -825,7 +827,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
if (Reg < MinGPR) {
MinGPR = Reg;
}
- } else if (PPC::G8RCRegisterClass->contains(Reg)) {
+ } else if (PPC::G8RCRegClass.contains(Reg)) {
HasG8SaveArea = true;
G8Regs.push_back(CSI[i]);
@@ -833,7 +835,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
if (Reg < MinG8R) {
MinG8R = Reg;
}
- } else if (PPC::F8RCRegisterClass->contains(Reg)) {
+ } else if (PPC::F8RCRegClass.contains(Reg)) {
HasFPSaveArea = true;
FPRegs.push_back(CSI[i]);
@@ -842,12 +844,12 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
MinFPR = Reg;
}
// FIXME SVR4: Disable CR save area for now.
- } else if (PPC::CRBITRCRegisterClass->contains(Reg)
- || PPC::CRRCRegisterClass->contains(Reg)) {
+ } else if (PPC::CRBITRCRegClass.contains(Reg) ||
+ PPC::CRRCRegClass.contains(Reg)) {
// HasCRSaveArea = true;
- } else if (PPC::VRSAVERCRegisterClass->contains(Reg)) {
+ } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
HasVRSAVESaveArea = true;
- } else if (PPC::VRRCRegisterClass->contains(Reg)) {
+ } else if (PPC::VRRCRegClass.contains(Reg)) {
HasVRSaveArea = true;
VRegs.push_back(CSI[i]);
@@ -932,8 +934,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (PPC::CRBITRCRegisterClass->contains(Reg) ||
- PPC::CRRCRegisterClass->contains(Reg)) {
+ if (PPC::CRBITRCRegClass.contains(Reg) ||
+ PPC::CRRCRegClass.contains(Reg)) {
int FI = CSI[i].getFrameIdx();
FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
@@ -950,7 +952,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (PPC::VRSAVERCRegisterClass->contains(Reg)) {
+ if (PPC::VRSAVERCRegClass.contains(Reg)) {
int FI = CSI[i].getFrameIdx();
FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 5a04888..a00f686 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -111,6 +111,23 @@ namespace {
/// immediate field. Because preinc imms have already been validated, just
/// accept it.
bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
+ if (isa<ConstantSDNode>(N) || N.getOpcode() == PPCISD::Lo ||
+ N.getOpcode() == ISD::TargetGlobalAddress) {
+ Out = N;
+ return true;
+ }
+
+ return false;
+ }
+
+  /// SelectAddrIdxOffs - Return true if the operand is valid for a preinc
+  /// index field: the complement of SelectAddrImmOffs, rejecting operands
+  /// that would instead fold into the immediate field.
+ bool SelectAddrIdxOffs(SDValue N, SDValue &Out) const {
+ if (isa<ConstantSDNode>(N) || N.getOpcode() == PPCISD::Lo ||
+ N.getOpcode() == ISD::TargetGlobalAddress)
+ return false;
+
Out = N;
return true;
}
@@ -238,11 +255,11 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
DebugLoc dl;
if (PPCLowering.getPointerTy() == MVT::i32) {
- GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass);
+ GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
} else {
- GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass);
+ GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
}
@@ -697,7 +714,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
InFlag).getValue(1);
- if (PPCSubTarget.isGigaProcessor() && OtherCondIdx == -1)
+ if (PPCSubTarget.hasMFOCRF() && OtherCondIdx == -1)
IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
CCReg), 0);
else
@@ -833,7 +850,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
case PPCISD::MFCR: {
SDValue InFlag = N->getOperand(1);
// Use MFOCRF if supported.
- if (PPCSubTarget.isGigaProcessor())
+ if (PPCSubTarget.hasMFOCRF())
return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
N->getOperand(0), InFlag);
else
@@ -915,12 +932,44 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[] = { Offset, Base, Chain };
- // FIXME: PPC64
return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
PPCLowering.getPointerTy(),
MVT::Other, Ops, 3);
} else {
- llvm_unreachable("R+R preindex loads not supported yet!");
+ unsigned Opcode;
+ bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+ if (LD->getValueType(0) != MVT::i64) {
+ // Handle PPC32 integer and normal FP loads.
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
+ case MVT::f64: Opcode = PPC::LFDUX; break;
+ case MVT::f32: Opcode = PPC::LFSUX; break;
+ case MVT::i32: Opcode = PPC::LWZUX; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZUX; break;
+ }
+ } else {
+ assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+ assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) &&
+ "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
+ case MVT::i64: Opcode = PPC::LDUX; break;
+ case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZUX8; break;
+ }
+ }
+
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[] = { Offset, Base, Chain };
+ return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
+ PPCLowering.getPointerTy(),
+ MVT::Other, Ops, 3);
}
}
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 3b24951..aa819ee 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -51,9 +51,11 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
-cl::desc("enable preincrement load/store generation on PPC (experimental)"),
- cl::Hidden);
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
if (TM.getSubtargetImpl()->isDarwin())
@@ -64,6 +66,7 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
: TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
+ const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
setPow2DivIsCheap();
@@ -73,12 +76,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
// arguments are at least 4/8 bytes aligned.
- setMinStackArgumentAlignment(TM.getSubtarget<PPCSubtarget>().isPPC64() ? 8:4);
+ bool isPPC64 = Subtarget->isPPC64();
+ setMinStackArgumentAlignment(isPPC64 ? 8:4);
// Set up the register classes.
- addRegisterClass(MVT::i32, PPC::GPRCRegisterClass);
- addRegisterClass(MVT::f32, PPC::F4RCRegisterClass);
- addRegisterClass(MVT::f64, PPC::F8RCRegisterClass);
+ addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+ addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+ addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -130,17 +134,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
- setOperationAction(ISD::FMA , MVT::f64, Expand);
+ setOperationAction(ISD::FMA , MVT::f64, Legal);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
- setOperationAction(ISD::FMA , MVT::f32, Expand);
+ setOperationAction(ISD::FMA , MVT::f32, Legal);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
// If we're enabling GP optimizations, use hardware square root
- if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) {
+ if (!Subtarget->hasFSQRT()) {
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
}
@@ -226,8 +230,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
- if (TM.getSubtarget<PPCSubtarget>().isSVR4ABI()) {
- if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+ if (Subtarget->isSVR4ABI()) {
+ if (isPPC64) {
// VAARG always uses double-word chunks, so promote anything smaller.
setOperationAction(ISD::VAARG, MVT::i1, Promote);
AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
@@ -271,7 +275,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
- if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+ if (Subtarget->has64BitSupport()) {
// They also have instructions for converting between i64 and fp.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
@@ -290,9 +294,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
}
- if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) {
+ if (Subtarget->use64BitRegs()) {
// 64-bit PowerPC implementations can support i64 types directly
- addRegisterClass(MVT::i64, PPC::G8RCRegisterClass);
+ addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
// BUILD_PAIR can't be handled natively, and should be expanded to shl/or
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// 64-bit PowerPC wants to expand i128 shifts itself.
@@ -306,7 +310,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
- if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) {
+ if (Subtarget->hasAltivec()) {
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -370,12 +374,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
- addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass);
- addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass);
- addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass);
- addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass);
+ addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
+ addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
+ addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
+ addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v4f32, Legal);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -389,8 +394,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
}
- if (TM.getSubtarget<PPCSubtarget>().has64BitSupport())
+ if (Subtarget->has64BitSupport()) {
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+ }
setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
@@ -398,7 +405,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
- if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+ if (isPPC64) {
setStackPointerRegisterToSaveRestore(PPC::X1);
setExceptionPointerRegister(PPC::X3);
setExceptionSelectorRegister(PPC::X4);
@@ -415,7 +422,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setTargetDAGCombine(ISD::BSWAP);
// Darwin long double math library functions have $LDBL128 appended.
- if (TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+ if (Subtarget->isDarwin()) {
setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
@@ -432,6 +439,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
if (PPCSubTarget.isDarwin())
setPrefFunctionAlignment(4);
+ if (isPPC64 && Subtarget->isJITCodeModel())
+ // Temporary workaround for the inability of PPC64 JIT to handle jump
+ // tables.
+ setSupportJumpTables(false);
+
setInsertFencesForAtomic(true);
setSchedulingPreference(Sched::Hybrid);
@@ -902,10 +914,11 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
return true; // [r+i]
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
// Match LOAD (ADD (X, Lo(G))).
- assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+ assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
&& "Cannot handle constant offsets yet!");
Disp = N.getOperand(1).getOperand(0); // The global address.
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+ Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
Disp.getOpcode() == ISD::TargetConstantPool ||
Disp.getOpcode() == ISD::TargetJumpTable);
Base = N.getOperand(0);
@@ -1006,7 +1019,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
if (N.getOpcode() == ISD::ADD) {
short imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
- Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+ Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
} else {
@@ -1015,7 +1028,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
return true; // [r+i]
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
// Match LOAD (ADD (X, Lo(G))).
- assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+ assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
&& "Cannot handle constant offsets yet!");
Disp = N.getOperand(1).getOperand(0); // The global address.
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
@@ -1084,8 +1097,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
- // Disabled by default for now.
- if (!EnablePPCPreinc) return false;
+ if (DisablePPCPreinc) return false;
SDValue Ptr;
EVT VT;
@@ -1103,7 +1115,10 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (VT.isVector())
return false;
- // TODO: Check reg+reg first.
+ if (SelectAddressRegReg(Ptr, Offset, Base, DAG)) {
+ AM = ISD::PRE_INC;
+ return true;
+ }
// LDU/STU use reg+imm*4, others use reg+imm.
if (VT != MVT::i64) {
@@ -1222,6 +1237,30 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
}
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ DebugLoc dl = GA->getDebugLoc();
+ const GlobalValue *GV = GA->getGlobal();
+ EVT PtrVT = getPointerTy();
+ bool is64bit = PPCSubTarget.isPPC64();
+
+ TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+ SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TPREL16_HA);
+ SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TPREL16_LO);
+
+ if (model != TLSModel::LocalExec)
+ llvm_unreachable("only local-exec TLS mode supported");
+ SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+ is64bit ? MVT::i64 : MVT::i32);
+ SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
+ return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
+}
+
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
@@ -1440,13 +1479,16 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Entry.Node = Nest; Args.push_back(Entry);
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
- std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
- false, false, false, false, 0, CallingConv::C,
+ TargetLowering::CallLoweringInfo CLI(Chain,
+ Type::getVoidTy(*DAG.getContext()),
+ false, false, false, false, 0,
+ CallingConv::C,
/*isTailCall=*/false,
- /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+ /*doesNotRet=*/false,
+ /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__trampoline_setup", PtrVT),
Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
}
@@ -1702,7 +1744,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -1721,19 +1763,19 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
default:
llvm_unreachable("ValVT not supported by formal arguments Lowering");
case MVT::i32:
- RC = PPC::GPRCRegisterClass;
+ RC = &PPC::GPRCRegClass;
break;
case MVT::f32:
- RC = PPC::F4RCRegisterClass;
+ RC = &PPC::F4RCRegClass;
break;
case MVT::f64:
- RC = PPC::F8RCRegisterClass;
+ RC = &PPC::F8RCRegClass;
break;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v4f32:
- RC = PPC::VRRCRegisterClass;
+ RC = &PPC::VRRCRegClass;
break;
}
@@ -1763,7 +1805,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
// caller's stack frame, right above the parameter list area.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ByValArgLocs, *DAG.getContext());
+ getTargetMachine(), ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2743,7 +2785,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
@@ -2800,7 +2842,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_PPC);
for (unsigned i = 0; i != RVLocs.size(); ++i)
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
@@ -2864,14 +2906,19 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
}
SDValue
-PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
if (isTailCall)
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
@@ -2921,7 +2968,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -2961,7 +3008,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ByValArgLocs, *DAG.getContext());
+ getTargetMachine(), ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -3485,7 +3532,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
// If this is the first return lowered for this function, add the regs to the
@@ -4559,7 +4606,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
- case ISD::GlobalTLSAddress: llvm_unreachable("TLS not implemented for PPC");
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
@@ -4899,11 +4946,37 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineFunction *F = BB->getParent();
- if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
- MI->getOpcode() == PPC::SELECT_CC_I8 ||
- MI->getOpcode() == PPC::SELECT_CC_F4 ||
- MI->getOpcode() == PPC::SELECT_CC_F8 ||
- MI->getOpcode() == PPC::SELECT_CC_VRRC) {
+ if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+ MI->getOpcode() == PPC::SELECT_CC_I8)) {
+ unsigned OpCode = MI->getOpcode() == PPC::SELECT_CC_I8 ?
+ PPC::ISEL8 : PPC::ISEL;
+ unsigned SelectPred = MI->getOperand(4).getImm();
+ DebugLoc dl = MI->getDebugLoc();
+
+ // The SelectPred is ((BI << 5) | BO) for a BCC
+ unsigned BO = SelectPred & 0xF;
+ assert((BO == 12 || BO == 4) && "invalid predicate BO field for isel");
+
+ unsigned TrueOpNo, FalseOpNo;
+ if (BO == 12) {
+ TrueOpNo = 2;
+ FalseOpNo = 3;
+ } else {
+ TrueOpNo = 3;
+ FalseOpNo = 2;
+ SelectPred = PPC::InvertPredicate((PPC::Predicate)SelectPred);
+ }
+
+ BuildMI(*BB, MI, dl, TII->get(OpCode), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(TrueOpNo).getReg())
+ .addReg(MI->getOperand(FalseOpNo).getReg())
+ .addImm(SelectPred).addReg(MI->getOperand(1).getReg());
+ } else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+ MI->getOpcode() == PPC::SELECT_CC_I8 ||
+ MI->getOpcode() == PPC::SELECT_CC_F4 ||
+ MI->getOpcode() == PPC::SELECT_CC_F8 ||
+ MI->getOpcode() == PPC::SELECT_CC_VRRC) {
+
// The incoming instruction knows the destination vreg to set, the
// condition code register to branch on, the true/false values to
@@ -5612,18 +5685,18 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'b': // R1-R31
case 'r': // R0-R31
if (VT == MVT::i64 && PPCSubTarget.isPPC64())
- return std::make_pair(0U, PPC::G8RCRegisterClass);
- return std::make_pair(0U, PPC::GPRCRegisterClass);
+ return std::make_pair(0U, &PPC::G8RCRegClass);
+ return std::make_pair(0U, &PPC::GPRCRegClass);
case 'f':
if (VT == MVT::f32)
- return std::make_pair(0U, PPC::F4RCRegisterClass);
- else if (VT == MVT::f64)
- return std::make_pair(0U, PPC::F8RCRegisterClass);
+ return std::make_pair(0U, &PPC::F4RCRegClass);
+ if (VT == MVT::f64)
+ return std::make_pair(0U, &PPC::F8RCRegClass);
break;
case 'v':
- return std::make_pair(0U, PPC::VRRCRegisterClass);
+ return std::make_pair(0U, &PPC::VRRCRegClass);
case 'y': // crrc
- return std::make_pair(0U, PPC::CRRCRegisterClass);
+ return std::make_pair(0U, &PPC::CRRCRegClass);
}
}
@@ -5839,11 +5912,30 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
}
}
+/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+/// is expanded to mul + add.
+bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::v4f32:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
- unsigned Directive = PPCSubTarget.getDarwinDirective();
- if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2)
- return Sched::ILP;
+ if (DisableILPPref)
+ return TargetLowering::getSchedulingPreference(N);
- return TargetLowering::getSchedulingPreference(N);
+ return Sched::ILP;
}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 18eb072..b0a013b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -366,6 +366,12 @@ namespace llvm {
bool IsZeroVal, bool MemcpyStrSrc,
MachineFunction &MF) const;
+ /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+ /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+ /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+ /// is expanded to mul + add.
+ virtual bool isFMAFasterThanMulAndAdd(EVT VT) const;
+
private:
SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
@@ -389,6 +395,7 @@ namespace llvm {
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -439,12 +446,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual bool
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 7f67a41..39778a5 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -68,15 +68,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8_Darwin : IForm<18, 0, 1,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func", BrB, []>; // See Pat patterns below.
def BLA8_Darwin : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB, [(PPCcall_Darwin (i64 imm:$func))]>;
}
let Uses = [CTR8, RM] in {
def BCTRL8_Darwin : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB,
[(PPCbctrl_Darwin)]>, Requires<[In64BitMode]>;
}
@@ -88,27 +88,27 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8_ELF : IForm<18, 0, 1,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func", BrB, []>; // See Pat patterns below.
let isCodeGenOnly = 1 in
def BL8_NOP_ELF : IForm_and_DForm_4_zero<18, 0, 1, 24,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func\n\tnop", BrB, []>;
def BLA8_ELF : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB, [(PPCcall_SVR4 (i64 imm:$func))]>;
let isCodeGenOnly = 1 in
def BLA8_NOP_ELF : IForm_and_DForm_4_zero<18, 1, 1, 24,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func\n\tnop", BrB,
[(PPCcall_nop_SVR4 (i64 imm:$func))]>;
}
let Uses = [X11, CTR8, RM] in {
def BCTRL8_ELF : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB,
[(PPCbctrl_SVR4)]>, Requires<[In64BitMode]>;
}
@@ -180,17 +180,17 @@ def STDCX : XForm_1<31, 214, (outs), (ins G8RC:$rS, memrr:$dst),
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi8 :Pseudo< (outs),
- (ins calltarget:$dst, i32imm:$offset, variable_ops),
+ (ins calltarget:$dst, i32imm:$offset),
"#TC_RETURNd8 $dst $offset",
[]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
"#TC_RETURNa8 $func $offset",
[(PPCtc_return (i64 imm:$func), imm:$offset)]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops),
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
"#TC_RETURNr8 $dst $offset",
[]>;
@@ -229,6 +229,15 @@ def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
(TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+ let Defs = [CTR8], Uses = [CTR8] in {
+ def BDZ8 : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst", BrB, []>;
+ def BDNZ8 : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst", BrB, []>;
+ }
+}
+
// 64-bit CR instructions
def MTCRF8 : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins G8RC:$rS),
"mtcrf $FXM, $rS", BrMCRX>,
@@ -256,6 +265,15 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins G8RC:$rS),
PPC970_DGroup_First, PPC970_Unit_FXU;
}
+let Pattern = [(set G8RC:$rT, readcyclecounter)] in
+def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs G8RC:$rT), (ins),
+ "mfspr $rT, 268", SprMFTB>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+// Note that encoding mftb using mfspr is now the preferred form,
+// and has been since at least ISA v2.03. The mftb instruction has
+// now been phased out. Using mfspr, however, is known not to work on
+// the POWER3.
+
let Defs = [X1], Uses = [X1] in
def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),"",
[(set G8RC:$result,
@@ -278,45 +296,37 @@ def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
let PPC970_Unit = 1 in { // FXU Operations.
-// Copies, extends, truncates.
-def OR4To8 : XForm_6<31, 444, (outs G8RC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "or $rA, $rS, $rB", IntGeneral,
- []>;
-def OR8To4 : XForm_6<31, 444, (outs GPRC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "or $rA, $rS, $rB", IntGeneral,
- []>;
-
def LI8 : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
- "li $rD, $imm", IntGeneral,
+ "li $rD, $imm", IntSimple,
[(set G8RC:$rD, immSExt16:$imm)]>;
def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
- "lis $rD, $imm", IntGeneral,
+ "lis $rD, $imm", IntSimple,
[(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
// Logical ops.
def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "nand $rA, $rS, $rB", IntGeneral,
+ "nand $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (not (and G8RC:$rS, G8RC:$rB)))]>;
def AND8 : XForm_6<31, 28, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "and $rA, $rS, $rB", IntGeneral,
+ "and $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (and G8RC:$rS, G8RC:$rB))]>;
def ANDC8: XForm_6<31, 60, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "andc $rA, $rS, $rB", IntGeneral,
+ "andc $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (and G8RC:$rS, (not G8RC:$rB)))]>;
def OR8 : XForm_6<31, 444, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "or $rA, $rS, $rB", IntGeneral,
+ "or $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (or G8RC:$rS, G8RC:$rB))]>;
def NOR8 : XForm_6<31, 124, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "nor $rA, $rS, $rB", IntGeneral,
+ "nor $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (not (or G8RC:$rS, G8RC:$rB)))]>;
def ORC8 : XForm_6<31, 412, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "orc $rA, $rS, $rB", IntGeneral,
+ "orc $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (or G8RC:$rS, (not G8RC:$rB)))]>;
def EQV8 : XForm_6<31, 284, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "eqv $rA, $rS, $rB", IntGeneral,
+ "eqv $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (not (xor G8RC:$rS, G8RC:$rB)))]>;
def XOR8 : XForm_6<31, 316, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
- "xor $rA, $rS, $rB", IntGeneral,
+ "xor $rA, $rS, $rB", IntSimple,
[(set G8RC:$rA, (xor G8RC:$rS, G8RC:$rB))]>;
// Logical ops with immediate.
@@ -329,20 +339,20 @@ def ANDISo8 : DForm_4<29, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
[(set G8RC:$dst, (and G8RC:$src1,imm16ShiftedZExt:$src2))]>,
isDOT;
def ORI8 : DForm_4<24, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
- "ori $dst, $src1, $src2", IntGeneral,
+ "ori $dst, $src1, $src2", IntSimple,
[(set G8RC:$dst, (or G8RC:$src1, immZExt16:$src2))]>;
def ORIS8 : DForm_4<25, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
- "oris $dst, $src1, $src2", IntGeneral,
+ "oris $dst, $src1, $src2", IntSimple,
[(set G8RC:$dst, (or G8RC:$src1, imm16ShiftedZExt:$src2))]>;
def XORI8 : DForm_4<26, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
- "xori $dst, $src1, $src2", IntGeneral,
+ "xori $dst, $src1, $src2", IntSimple,
[(set G8RC:$dst, (xor G8RC:$src1, immZExt16:$src2))]>;
def XORIS8 : DForm_4<27, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
- "xoris $dst, $src1, $src2", IntGeneral,
+ "xoris $dst, $src1, $src2", IntSimple,
[(set G8RC:$dst, (xor G8RC:$src1, imm16ShiftedZExt:$src2))]>;
def ADD8 : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
- "add $rT, $rA, $rB", IntGeneral,
+ "add $rT, $rA, $rB", IntSimple,
[(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>;
let Defs = [CARRY] in {
@@ -355,10 +365,13 @@ def ADDIC8 : DForm_2<12, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
[(set G8RC:$rD, (addc G8RC:$rA, immSExt16:$imm))]>;
}
def ADDI8 : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
- "addi $rD, $rA, $imm", IntGeneral,
+ "addi $rD, $rA, $imm", IntSimple,
+ [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>;
+def ADDI8L : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, symbolLo64:$imm),
+ "addi $rD, $rA, $imm", IntSimple,
[(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>;
def ADDIS8 : DForm_2<15, (outs G8RC:$rD), (ins G8RC:$rA, symbolHi64:$imm),
- "addis $rD, $rA, $imm", IntGeneral,
+ "addis $rD, $rA, $imm", IntSimple,
[(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>;
let Defs = [CARRY] in {
@@ -374,7 +387,7 @@ def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
"subf $rT, $rA, $rB", IntGeneral,
[(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>;
def NEG8 : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA),
- "neg $rT, $rA", IntGeneral,
+ "neg $rT, $rA", IntSimple,
[(set G8RC:$rT, (ineg G8RC:$rA))]>;
let Uses = [CARRY], Defs = [CARRY] in {
def ADDE8 : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
@@ -427,21 +440,21 @@ def SRAD : XForm_6<31, 794, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
}
def EXTSB8 : XForm_11<31, 954, (outs G8RC:$rA), (ins G8RC:$rS),
- "extsb $rA, $rS", IntGeneral,
+ "extsb $rA, $rS", IntSimple,
[(set G8RC:$rA, (sext_inreg G8RC:$rS, i8))]>;
def EXTSH8 : XForm_11<31, 922, (outs G8RC:$rA), (ins G8RC:$rS),
- "extsh $rA, $rS", IntGeneral,
+ "extsh $rA, $rS", IntSimple,
[(set G8RC:$rA, (sext_inreg G8RC:$rS, i16))]>;
def EXTSW : XForm_11<31, 986, (outs G8RC:$rA), (ins G8RC:$rS),
- "extsw $rA, $rS", IntGeneral,
+ "extsw $rA, $rS", IntSimple,
[(set G8RC:$rA, (sext_inreg G8RC:$rS, i32))]>, isPPC64;
/// EXTSW_32 - Just like EXTSW, but works on '32-bit' registers.
def EXTSW_32 : XForm_11<31, 986, (outs GPRC:$rA), (ins GPRC:$rS),
- "extsw $rA, $rS", IntGeneral,
+ "extsw $rA, $rS", IntSimple,
[(set GPRC:$rA, (PPCextsw_32 GPRC:$rS))]>, isPPC64;
def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS),
- "extsw $rA, $rS", IntGeneral,
+ "extsw $rA, $rS", IntSimple,
[(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64;
let Defs = [CARRY] in {
@@ -493,6 +506,10 @@ def RLWINM8 : MForm_2<21,
"rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
[]>;
+def ISEL8 : AForm_1<31, 15,
+ (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB, pred:$cond),
+ "isel $rT, $rA, $rB, $cond", IntGeneral,
+ []>;
} // End FXU Operations.
@@ -529,6 +546,16 @@ def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp
NoEncode<"$ea_result">;
// NO LWAU!
+def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lhaux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lwaux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
}
// Zero extending loads.
@@ -568,6 +595,22 @@ def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
"lwzu $rD, $addr", LdStLoad,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
+
+def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lbzux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lhzux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lwzux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
}
}
@@ -603,6 +646,11 @@ def LDU : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr
[]>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
NoEncode<"$ea_result">;
+def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "ldux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">, isPPC64;
}
def : Pat<(PPCload ixaddr:$src),
@@ -660,6 +708,14 @@ def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "stwu $rS, $ptroff($ptrreg)", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+
def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
s16immX4:$ptroff, ptr_rc:$ptrreg),
"stdu $rS, $ptroff($ptrreg)", LdStSTD,
@@ -668,10 +724,41 @@ def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
isPPC64;
-let mayStore = 1 in
-def STDUX : XForm_8<31, 181, (outs), (ins G8RC:$rS, memrr:$dst),
- "stdux $rS, $dst", LdStSTD,
- []>, isPPC64;
+
+def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res),
+ (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "stbux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti8 G8RC:$rS,
+ ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
+def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res),
+ (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "sthux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti16 G8RC:$rS,
+ ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
+def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res),
+ (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "stwux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti32 G8RC:$rS,
+ ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
+def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res),
+ (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "stdux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked, isPPC64;
// STD_32/STDX_32 - Just like STD/STDX, but uses a '32-bit' input register.
def STD_32 : DSForm_1<62, 0, (outs), (ins GPRC:$rT, memrix:$dst),
@@ -706,11 +793,12 @@ def FCTIDZ : XForm_26<63, 815, (outs F8RC:$frD), (ins F8RC:$frB),
// Extensions and truncates to/from 32-bit regs.
def : Pat<(i64 (zext GPRC:$in)),
- (RLDICL (OR4To8 GPRC:$in, GPRC:$in), 0, 32)>;
+ (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPRC:$in, sub_32),
+ 0, 32)>;
def : Pat<(i64 (anyext GPRC:$in)),
- (OR4To8 GPRC:$in, GPRC:$in)>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPRC:$in, sub_32)>;
def : Pat<(i32 (trunc G8RC:$in)),
- (OR8To4 G8RC:$in, G8RC:$in)>;
+ (EXTRACT_SUBREG G8RC:$in, sub_32)>;
// Extending loads with i64 targets.
def : Pat<(zextloadi1 iaddr:$src),
@@ -765,6 +853,10 @@ def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>;
def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>;
def : Pat<(PPClo tblockaddress:$in, 0), (LI8 tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, G8RC:$in),
+ (ADDIS8 G8RC:$in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, G8RC:$in),
+ (ADDI8L G8RC:$in, tglobaltlsaddr:$g)>;
def : Pat<(add G8RC:$in, (PPChi tglobaladdr:$g, 0)),
(ADDIS8 G8RC:$in, tglobaladdr:$g)>;
def : Pat<(add G8RC:$in, (PPChi tconstpool:$g, 0)),
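One note on the INSERT_SUBREG/EXTRACT_SUBREG rewrites above: in the new zext
pattern, RLDICL is what clears the upper half of the fresh 64-bit register.
With a shift of 0 and a mask starting at bit 32, rldicl reduces to a plain mask:

  rldicl(x, SH=0, MB=32) = rotl64(x, 0) & mask(32, 63) = x & 0xFFFFFFFF

so only the zero-extended low word survives. The anyext and trunc patterns need
no masking at all; they become pure sub_32 subregister moves that the register
coalescer can usually eliminate (the new isCoalescableExtInstr hook in
PPCInstrInfo.cpp below exists for exactly that reason).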
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 6c0f3d3..b0b8423 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -274,15 +274,11 @@ let PPC970_Unit = 5 in { // VALU Operations.
// VA-Form instructions. 3-input AltiVec ops.
def VMADDFP : VAForm_1<46, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
"vmaddfp $vD, $vA, $vC, $vB", VecFP,
- [(set VRRC:$vD, (fadd (fmul VRRC:$vA, VRRC:$vC),
- VRRC:$vB))]>,
- Requires<[FPContractions]>;
+ [(set VRRC:$vD, (fma VRRC:$vA, VRRC:$vC, VRRC:$vB))]>;
def VNMSUBFP: VAForm_1<47, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
"vnmsubfp $vD, $vA, $vC, $vB", VecFP,
- [(set VRRC:$vD, (fsub V_immneg0,
- (fsub (fmul VRRC:$vA, VRRC:$vC),
- VRRC:$vB)))]>,
- Requires<[FPContractions]>;
+ [(set VRRC:$vD, (fneg (fma VRRC:$vA, VRRC:$vC,
+ (fneg VRRC:$vB))))]>;
def VMHADDSHS : VA1a_Int<32, "vmhaddshs", int_ppc_altivec_vmhaddshs>;
def VMHRADDSHS : VA1a_Int<33, "vmhraddshs", int_ppc_altivec_vmhraddshs>;
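Moving VMADDFP/VNMSUBFP onto the target-independent fma node removes the
FPContractions guard (the predicate itself is deleted from PPCInstrInfo.td
later in this patch). The vnmsubfp pattern leans on a sign identity worth
spelling out:

  fneg(fma(a, c, fneg(b))) = -((a * c) + (-b)) = -(a * c - b)

which is exactly the negative multiply-subtract the instruction performs, with
the single rounding of a fused operation.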
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index d8e4b2b..a41a027 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -94,6 +94,12 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
let Inst{31} = lk;
}
+class IForm_ext<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : IForm<opcode, aa, lk, OOL, IOL, asmstr, itin, pattern> {
+ let LI{0-4} = bo;
+}
+
// 1.7.2 B-Form
class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
: I<opcode, OOL, IOL, asmstr, BrB> {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index b45ada9..47f09dc 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -40,6 +40,10 @@ extern cl::opt<bool> DisablePPC64RS;
using namespace llvm;
+static cl::opt<bool>
+DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
+                   cl::desc("Disable analysis for CTR loops"));
+
PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
: PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
@@ -75,6 +79,22 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
return new PPCScoreboardHazardRecognizer(II, DAG);
}
+
+// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
+bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
+ default: return false;
+ case PPC::EXTSW:
+ case PPC::EXTSW_32_64:
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SubIdx = PPC::sub_32;
+ return true;
+ }
+}
+
unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const {
switch (MI->getOpcode()) {
@@ -186,10 +206,14 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
// Branch analysis.
+// Note: if the register operand of the condition is CTR or CTR8, then this
+// is a BDNZ (imm == 1) or BDZ (imm == 0) branch.
bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
+ bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.end();
if (I == MBB.begin())
@@ -221,7 +245,30 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
Cond.push_back(LastInst->getOperand(0));
Cond.push_back(LastInst->getOperand(1));
return false;
+ } else if (LastInst->getOpcode() == PPC::BDNZ8 ||
+ LastInst->getOpcode() == PPC::BDNZ) {
+ if (!LastInst->getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal)
+ return true;
+ TBB = LastInst->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(1));
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ return false;
+ } else if (LastInst->getOpcode() == PPC::BDZ8 ||
+ LastInst->getOpcode() == PPC::BDZ) {
+ if (!LastInst->getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal)
+ return true;
+ TBB = LastInst->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(0));
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ return false;
}
+
// Otherwise, don't know what this is.
return true;
}
@@ -245,6 +292,34 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
Cond.push_back(SecondLastInst->getOperand(1));
FBB = LastInst->getOperand(0).getMBB();
return false;
+ } else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 ||
+ SecondLastInst->getOpcode() == PPC::BDNZ) &&
+ LastInst->getOpcode() == PPC::B) {
+ if (!SecondLastInst->getOperand(0).isMBB() ||
+ !LastInst->getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal)
+ return true;
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(1));
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if ((SecondLastInst->getOpcode() == PPC::BDZ8 ||
+ SecondLastInst->getOpcode() == PPC::BDZ) &&
+ LastInst->getOpcode() == PPC::B) {
+ if (!SecondLastInst->getOperand(0).isMBB() ||
+ !LastInst->getOperand(0).isMBB())
+ return true;
+ if (DisableCTRLoopAnal)
+ return true;
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(0));
+ Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+ true));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
}
// If the block ends with two PPC::Bs, handle it. The second one is not
@@ -273,7 +348,9 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 0;
--I;
}
- if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC)
+ if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
+ I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+ I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
return 0;
// Remove the branch.
@@ -283,7 +360,9 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
if (I == MBB.begin()) return 1;
--I;
- if (I->getOpcode() != PPC::BCC)
+ if (I->getOpcode() != PPC::BCC &&
+ I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+ I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
return 1;
// Remove the branch.
@@ -301,10 +380,16 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
assert((Cond.size() == 2 || Cond.size() == 0) &&
"PPC branch conditions have two components!");
+ bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+
// One-way branch.
if (FBB == 0) {
if (Cond.empty()) // Unconditional branch
BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB);
+ else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+ BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+ (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+ (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
else // Conditional branch
BuildMI(&MBB, DL, get(PPC::BCC))
.addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
@@ -312,8 +397,13 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
}
// Two-way Conditional Branch.
- BuildMI(&MBB, DL, get(PPC::BCC))
- .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+ BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+ (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+ (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
+ else
+ BuildMI(&MBB, DL, get(PPC::BCC))
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
return 2;
}
@@ -354,7 +444,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr*> &NewMIs) const{
DebugLoc DL;
- if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
if (SrcReg != PPC::LR) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
.addReg(SrcReg,
@@ -370,7 +460,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
getKillRegState(isKill)),
FrameIdx));
}
- } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) {
if (SrcReg != PPC::LR8) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
.addReg(SrcReg,
@@ -386,17 +476,17 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
getKillRegState(isKill)),
FrameIdx));
}
- } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
.addReg(SrcReg,
getKillRegState(isKill)),
FrameIdx));
- } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
.addReg(SrcReg,
getKillRegState(isKill)),
FrameIdx));
- } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
(!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
@@ -438,7 +528,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
getKillRegState(isKill)),
FrameIdx));
}
- } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
// FIXME: We use CRi here because there is no mtcrf on a bit. Since the
// backend currently only uses CR1EQ as an individual bit, this should
// not cause any bug. If we need other uses of CR bits, the following
@@ -470,9 +560,9 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
Reg = PPC::CR7;
return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx,
- PPC::CRRCRegisterClass, NewMIs);
+ &PPC::CRRCRegClass, NewMIs);
- } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
// We don't have indexed addressing for vector loads. Emit:
// R0 = ADDI FI#
// STVX VAL, 0, R0
@@ -522,7 +612,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr*> &NewMIs)const{
- if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
if (DestReg != PPC::LR) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
DestReg), FrameIdx));
@@ -531,7 +621,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
PPC::R11), FrameIdx));
NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR)).addReg(PPC::R11));
}
- } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) {
if (DestReg != PPC::LR8) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
FrameIdx));
@@ -540,13 +630,13 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
PPC::X11), FrameIdx));
NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::X11));
}
- } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
FrameIdx));
- } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
FrameIdx));
- } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
(!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
@@ -578,7 +668,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
PPC::MTCRF8 : PPC::MTCRF), DestReg)
.addReg(ScratchReg));
}
- } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
unsigned Reg = 0;
if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT ||
@@ -607,9 +697,9 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
Reg = PPC::CR7;
return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx,
- PPC::CRRCRegisterClass, NewMIs);
+ &PPC::CRRCRegClass, NewMIs);
- } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) {
+ } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
// We don't have indexed addressing for vector loads. Emit:
// R0 = ADDI FI#
// Dest = LVX 0, R0
@@ -665,8 +755,11 @@ PPCInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
bool PPCInstrInfo::
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
- // Leave the CR# the same, but invert the condition.
- Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
+ if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR)
+ Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0);
+ else
+ // Leave the CR# the same, but invert the condition.
+ Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
return false;
}
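With these changes, counter branches travel through the same two-operand
condition encoding as CR branches: Cond[0] is an immediate (1 for BDNZ, 0 for
BDZ) and Cond[1] names CTR or CTR8 in place of a condition register. A minimal
sketch of a consumer of that encoding, mirroring the InsertBranch logic above
(illustrative only, not part of the patch):

  // Pick the opcode for a conditional branch described by Cond.
  static unsigned selectBranchOpcode(const SmallVectorImpl<MachineOperand> &Cond,
                                     bool isPPC64) {
    unsigned CondReg = Cond[1].getReg();
    if (CondReg == PPC::CTR || CondReg == PPC::CTR8)   // counter branch
      return Cond[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
                              : (isPPC64 ? PPC::BDZ8  : PPC::BDZ);
    return PPC::BCC;                                   // ordinary CR branch
  }

This also explains why ReverseBranchCondition only flips Cond[0] between 0 and
1 in the CTR case: BDZ and BDNZ are each other's exact inverses.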
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 7d49aa1..374213e 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -92,6 +92,9 @@ public:
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const;
+ bool isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const;
unsigned isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const;
unsigned isStoreToStackSlot(const MachineInstr *MI,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 748486c..f57f0c9 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -323,7 +323,7 @@ def memri : Operand<iPTR> {
}
def memrr : Operand<iPTR> {
let PrintMethod = "printMemRegReg";
- let MIOperandInfo = (ops ptr_rc, ptr_rc);
+ let MIOperandInfo = (ops ptr_rc:$offreg, ptr_rc:$ptrreg);
}
def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits.
let PrintMethod = "printMemRegImmShifted";
@@ -349,10 +349,10 @@ def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std"
/// This is just the offset part of iaddr, used for preinc.
def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+def xaddroff : ComplexPattern<iPTR, 1, "SelectAddrIdxOffs", [], []>;
//===----------------------------------------------------------------------===//
// PowerPC Instruction Predicate Definitions.
-def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">;
def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">;
def IsBookE : Predicate<"PPCSubTarget.isBookE()">;
@@ -438,6 +438,13 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
"b${cond:cc} ${cond:reg}, $dst"
/*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
+
+ let Defs = [CTR], Uses = [CTR] in {
+ def BDZ : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst", BrB, []>;
+ def BDNZ : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst", BrB, []>;
+ }
}
// Darwin ABI Calls.
@@ -445,15 +452,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL_Darwin : IForm<18, 0, 1,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func", BrB, []>; // See Pat patterns below.
def BLA_Darwin : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB, [(PPCcall_Darwin (i32 imm:$func))]>;
}
let Uses = [CTR, RM] in {
def BCTRL_Darwin : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB,
[(PPCbctrl_Darwin)]>, Requires<[In32BitMode]>;
}
@@ -464,16 +471,16 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL_SVR4 : IForm<18, 0, 1,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func", BrB, []>; // See Pat patterns below.
def BLA_SVR4 : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB,
[(PPCcall_SVR4 (i32 imm:$func))]>;
}
let Uses = [CTR, RM] in {
def BCTRL_SVR4 : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB,
[(PPCbctrl_SVR4)]>, Requires<[In32BitMode]>;
}
@@ -482,18 +489,18 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi :Pseudo< (outs),
- (ins calltarget:$dst, i32imm:$offset, variable_ops),
+ (ins calltarget:$dst, i32imm:$offset),
"#TC_RETURNd $dst $offset",
[]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
"#TC_RETURNa $func $offset",
[(PPCtc_return (i32 imm:$func), imm:$offset)]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset, variable_ops),
+def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
"#TC_RETURNr $dst $offset",
[]>;
@@ -704,6 +711,44 @@ def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
"lfd $rD, $addr", LdStLFD,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
+
+
+// Indexed (r+r) Loads with Update (preinc).
+def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lbzux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lhaux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lhzux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lwzux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lfsux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result),
+ (ins memrr:$addr),
+ "lfdux $rD, $addr", LdStLoad,
+ []>, RegConstraint<"$addr.offreg = $ea_result">,
+ NoEncode<"$ea_result">;
}
}
@@ -815,12 +860,49 @@ def STWX : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst),
"stwx $rS, $dst", LdStStore,
[(store GPRC:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
-
-let mayStore = 1 in {
-def STWUX : XForm_8<31, 183, (outs), (ins GPRC:$rS, GPRC:$rA, GPRC:$rB),
- "stwux $rS, $rA, $rB", LdStStore,
- []>;
-}
+
+def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res),
+ (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "stbux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti8 GPRC:$rS,
+ ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
+def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res),
+ (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "sthux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti16 GPRC:$rS,
+ ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
+def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res),
+ (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "stwux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
+def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res),
+ (ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "stfsux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
+def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res),
+ (ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
+ "stfdux $rS, $ptroff, $ptrreg", LdStStore,
+ [(set ptr_rc:$ea_res,
+ (pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
+ RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
+ PPC970_DGroup_Cracked;
+
def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst),
"sthbrx $rS, $dst", LdStStore,
[(PPCstbrx GPRC:$rS, xoaddr:$dst, i16)]>,
@@ -852,7 +934,10 @@ def SYNC : XForm_24_sync<31, 598, (outs), (ins),
let PPC970_Unit = 1 in { // FXU Operations.
def ADDI : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
- "addi $rD, $rA, $imm", IntGeneral,
+ "addi $rD, $rA, $imm", IntSimple,
+ [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>;
+def ADDIL : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$imm),
+ "addi $rD, $rA, $imm", IntSimple,
[(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>;
let Defs = [CARRY] in {
def ADDIC : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
@@ -864,7 +949,7 @@ def ADDICo : DForm_2<13, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
[]>;
}
def ADDIS : DForm_2<15, (outs GPRC:$rD), (ins GPRC:$rA, symbolHi:$imm),
- "addis $rD, $rA, $imm", IntGeneral,
+ "addis $rD, $rA, $imm", IntSimple,
[(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>;
def LA : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$sym),
"la $rD, $sym($rA)", IntGeneral,
@@ -881,10 +966,10 @@ def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
let isReMaterializable = 1 in {
def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
- "li $rD, $imm", IntGeneral,
+ "li $rD, $imm", IntSimple,
[(set GPRC:$rD, immSExt16:$imm)]>;
def LIS : DForm_2_r0<15, (outs GPRC:$rD), (ins symbolHi:$imm),
- "lis $rD, $imm", IntGeneral,
+ "lis $rD, $imm", IntSimple,
[(set GPRC:$rD, imm16ShiftedSExt:$imm)]>;
}
}
@@ -899,18 +984,18 @@ def ANDISo : DForm_4<29, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
[(set GPRC:$dst, (and GPRC:$src1,imm16ShiftedZExt:$src2))]>,
isDOT;
def ORI : DForm_4<24, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
- "ori $dst, $src1, $src2", IntGeneral,
+ "ori $dst, $src1, $src2", IntSimple,
[(set GPRC:$dst, (or GPRC:$src1, immZExt16:$src2))]>;
def ORIS : DForm_4<25, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
- "oris $dst, $src1, $src2", IntGeneral,
+ "oris $dst, $src1, $src2", IntSimple,
[(set GPRC:$dst, (or GPRC:$src1, imm16ShiftedZExt:$src2))]>;
def XORI : DForm_4<26, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
- "xori $dst, $src1, $src2", IntGeneral,
+ "xori $dst, $src1, $src2", IntSimple,
[(set GPRC:$dst, (xor GPRC:$src1, immZExt16:$src2))]>;
def XORIS : DForm_4<27, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
- "xoris $dst, $src1, $src2", IntGeneral,
+ "xoris $dst, $src1, $src2", IntSimple,
[(set GPRC:$dst, (xor GPRC:$src1,imm16ShiftedZExt:$src2))]>;
-def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntGeneral,
+def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntSimple,
[]>;
def CMPWI : DForm_5_ext<11, (outs CRRC:$crD), (ins GPRC:$rA, s16imm:$imm),
"cmpwi $crD, $rA, $imm", IntCompare>;
@@ -921,28 +1006,28 @@ def CMPLWI : DForm_6_ext<10, (outs CRRC:$dst), (ins GPRC:$src1, u16imm:$src2),
let PPC970_Unit = 1 in { // FXU Operations.
def NAND : XForm_6<31, 476, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "nand $rA, $rS, $rB", IntGeneral,
+ "nand $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (not (and GPRC:$rS, GPRC:$rB)))]>;
def AND : XForm_6<31, 28, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "and $rA, $rS, $rB", IntGeneral,
+ "and $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (and GPRC:$rS, GPRC:$rB))]>;
def ANDC : XForm_6<31, 60, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "andc $rA, $rS, $rB", IntGeneral,
+ "andc $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (and GPRC:$rS, (not GPRC:$rB)))]>;
def OR : XForm_6<31, 444, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "or $rA, $rS, $rB", IntGeneral,
+ "or $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (or GPRC:$rS, GPRC:$rB))]>;
def NOR : XForm_6<31, 124, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "nor $rA, $rS, $rB", IntGeneral,
+ "nor $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (not (or GPRC:$rS, GPRC:$rB)))]>;
def ORC : XForm_6<31, 412, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "orc $rA, $rS, $rB", IntGeneral,
+ "orc $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (or GPRC:$rS, (not GPRC:$rB)))]>;
def EQV : XForm_6<31, 284, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "eqv $rA, $rS, $rB", IntGeneral,
+ "eqv $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (not (xor GPRC:$rS, GPRC:$rB)))]>;
def XOR : XForm_6<31, 316, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
- "xor $rA, $rS, $rB", IntGeneral,
+ "xor $rA, $rS, $rB", IntSimple,
[(set GPRC:$rA, (xor GPRC:$rS, GPRC:$rB))]>;
def SLW : XForm_6<31, 24, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
"slw $rA, $rS, $rB", IntGeneral,
@@ -967,10 +1052,10 @@ def CNTLZW : XForm_11<31, 26, (outs GPRC:$rA), (ins GPRC:$rS),
"cntlzw $rA, $rS", IntGeneral,
[(set GPRC:$rA, (ctlz GPRC:$rS))]>;
def EXTSB : XForm_11<31, 954, (outs GPRC:$rA), (ins GPRC:$rS),
- "extsb $rA, $rS", IntGeneral,
+ "extsb $rA, $rS", IntSimple,
[(set GPRC:$rA, (sext_inreg GPRC:$rS, i8))]>;
def EXTSH : XForm_11<31, 922, (outs GPRC:$rA), (ins GPRC:$rS),
- "extsh $rA, $rS", IntGeneral,
+ "extsh $rA, $rS", IntSimple,
[(set GPRC:$rA, (sext_inreg GPRC:$rS, i16))]>;
def CMPW : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB),
@@ -1115,7 +1200,7 @@ def MFCR : XFXForm_3<31, 19, (outs GPRC:$rT), (ins),
PPC970_MicroCode, PPC970_Unit_CRU;
def MFOCRF: XFXForm_5a<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM),
- "mfcr $rT, $FXM", SprMFCR>,
+ "mfocrf $rT, $FXM", SprMFCR>,
PPC970_DGroup_First, PPC970_Unit_CRU;
// Instructions to manipulate FPSCR. Only long double handling uses these.
@@ -1159,7 +1244,7 @@ let PPC970_Unit = 1 in { // FXU Operations.
// XO-Form instructions. Arithmetic instructions that can set overflow bit
//
def ADD4 : XOForm_1<31, 266, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
- "add $rT, $rA, $rB", IntGeneral,
+ "add $rT, $rA, $rB", IntSimple,
[(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>;
let Defs = [CARRY] in {
def ADDC : XOForm_1<31, 10, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
@@ -1194,7 +1279,7 @@ def SUBFC : XOForm_1<31, 8, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
PPC970_DGroup_Cracked;
}
def NEG : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA),
- "neg $rT, $rA", IntGeneral,
+ "neg $rT, $rA", IntSimple,
[(set GPRC:$rT, (ineg GPRC:$rA))]>;
let Uses = [CARRY], Defs = [CARRY] in {
def ADDE : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
@@ -1226,51 +1311,43 @@ let Uses = [RM] in {
def FMADD : AForm_1<63, 29,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
"fmadd $FRT, $FRA, $FRC, $FRB", FPFused,
- [(set F8RC:$FRT, (fadd (fmul F8RC:$FRA, F8RC:$FRC),
- F8RC:$FRB))]>,
- Requires<[FPContractions]>;
+ [(set F8RC:$FRT,
+ (fma F8RC:$FRA, F8RC:$FRC, F8RC:$FRB))]>;
def FMADDS : AForm_1<59, 29,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
"fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
- [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC),
- F4RC:$FRB))]>,
- Requires<[FPContractions]>;
+ [(set F4RC:$FRT,
+ (fma F4RC:$FRA, F4RC:$FRC, F4RC:$FRB))]>;
def FMSUB : AForm_1<63, 28,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
"fmsub $FRT, $FRA, $FRC, $FRB", FPFused,
- [(set F8RC:$FRT, (fsub (fmul F8RC:$FRA, F8RC:$FRC),
- F8RC:$FRB))]>,
- Requires<[FPContractions]>;
+ [(set F8RC:$FRT,
+ (fma F8RC:$FRA, F8RC:$FRC, (fneg F8RC:$FRB)))]>;
def FMSUBS : AForm_1<59, 28,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
"fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
- [(set F4RC:$FRT, (fsub (fmul F4RC:$FRA, F4RC:$FRC),
- F4RC:$FRB))]>,
- Requires<[FPContractions]>;
+ [(set F4RC:$FRT,
+ (fma F4RC:$FRA, F4RC:$FRC, (fneg F4RC:$FRB)))]>;
def FNMADD : AForm_1<63, 31,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
"fnmadd $FRT, $FRA, $FRC, $FRB", FPFused,
- [(set F8RC:$FRT, (fneg (fadd (fmul F8RC:$FRA, F8RC:$FRC),
- F8RC:$FRB)))]>,
- Requires<[FPContractions]>;
+ [(set F8RC:$FRT,
+ (fneg (fma F8RC:$FRA, F8RC:$FRC, F8RC:$FRB)))]>;
def FNMADDS : AForm_1<59, 31,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
"fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
- [(set F4RC:$FRT, (fneg (fadd (fmul F4RC:$FRA, F4RC:$FRC),
- F4RC:$FRB)))]>,
- Requires<[FPContractions]>;
+ [(set F4RC:$FRT,
+ (fneg (fma F4RC:$FRA, F4RC:$FRC, F4RC:$FRB)))]>;
def FNMSUB : AForm_1<63, 30,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
"fnmsub $FRT, $FRA, $FRC, $FRB", FPFused,
- [(set F8RC:$FRT, (fneg (fsub (fmul F8RC:$FRA, F8RC:$FRC),
- F8RC:$FRB)))]>,
- Requires<[FPContractions]>;
+ [(set F8RC:$FRT, (fneg (fma F8RC:$FRA, F8RC:$FRC,
+ (fneg F8RC:$FRB))))]>;
def FNMSUBS : AForm_1<59, 30,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
"fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
- [(set F4RC:$FRT, (fneg (fsub (fmul F4RC:$FRA, F4RC:$FRC),
- F4RC:$FRB)))]>,
- Requires<[FPContractions]>;
+ [(set F4RC:$FRT, (fneg (fma F4RC:$FRA, F4RC:$FRC,
+ (fneg F4RC:$FRB))))]>;
}
// FSEL is artificially split into 4 and 8-byte forms for the result. To avoid
// having 4 of these, force the comparison to always be an 8-byte double (code
@@ -1321,6 +1398,13 @@ let Uses = [RM] in {
}
let PPC970_Unit = 1 in { // FXU Operations.
+ def ISEL : AForm_1<31, 15,
+ (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB, pred:$cond),
+ "isel $rT, $rA, $rB, $cond", IntGeneral,
+ []>;
+}
+
+let PPC970_Unit = 1 in { // FXU Operations.
// M-Form instructions. rotate and mask instructions.
//
let isCommutable = 1 in {
@@ -1418,6 +1502,10 @@ def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>;
def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, GPRC:$in),
+ (ADDIS GPRC:$in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, GPRC:$in),
+ (ADDIL GPRC:$in, tglobaltlsaddr:$g)>;
def : Pat<(add GPRC:$in, (PPChi tglobaladdr:$g, 0)),
(ADDIS GPRC:$in, tglobaladdr:$g)>;
def : Pat<(add GPRC:$in, (PPChi tconstpool:$g, 0)),
@@ -1427,14 +1515,6 @@ def : Pat<(add GPRC:$in, (PPChi tjumptable:$g, 0)),
def : Pat<(add GPRC:$in, (PPChi tblockaddress:$g, 0)),
(ADDIS GPRC:$in, tblockaddress:$g)>;
-// Fused negative multiply subtract, alternate pattern
-def : Pat<(fsub F8RC:$B, (fmul F8RC:$A, F8RC:$C)),
- (FNMSUB F8RC:$A, F8RC:$C, F8RC:$B)>,
- Requires<[FPContractions]>;
-def : Pat<(fsub F4RC:$B, (fmul F4RC:$A, F4RC:$C)),
- (FNMSUBS F4RC:$A, F4RC:$C, F4RC:$B)>,
- Requires<[FPContractions]>;
-
// Standard shifts. These are represented separately from the real shifts above
// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
// amounts.
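A note on the update-form (u/ux) definitions added here and in PPCInstr64Bit.td
above: the effective-address write-back is modeled as a real output
($ea_result/$ea_res) tied to an input register with RegConstraint and hidden
from the encoding with NoEncode. Giving memrr named suboperands ($offreg,
$ptrreg) is what makes constraints like "$addr.offreg = $ea_result" resolvable
on the indexed loads. In machine terms, a selected pre-increment store is a
single instruction that both stores and updates its base, e.g.:

  stwux r4, r3, r5    # MEM[r3 + r5] = r4 ; r3 = r3 + r5

where r3 appears both as the tied $ptroff input and as the $ea_res output.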
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
index a6528c0..aba2739 100644
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -210,7 +210,7 @@ asm(
".text\n"
".align 2\n"
".globl PPC64CompilationCallback\n"
- ".section \".opd\",\"aw\"\n"
+ ".section \".opd\",\"aw\",@progbits\n"
".align 3\n"
"PPC64CompilationCallback:\n"
".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n"
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 276edcb..19ec993 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -99,10 +99,22 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
MCContext &Ctx = Printer.OutContext;
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
- if (MO.getTargetFlags() & PPCII::MO_LO16)
- RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : MCSymbolRefExpr::VK_PPC_GAS_LO16;
- else if (MO.getTargetFlags() & PPCII::MO_HA16)
- RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : MCSymbolRefExpr::VK_PPC_GAS_HA16;
+ unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK;
+
+ switch (access) {
+ case PPCII::MO_HA16: RefKind = isDarwin ?
+ MCSymbolRefExpr::VK_PPC_DARWIN_HA16 :
+ MCSymbolRefExpr::VK_PPC_GAS_HA16;
+ break;
+ case PPCII::MO_LO16: RefKind = isDarwin ?
+ MCSymbolRefExpr::VK_PPC_DARWIN_LO16 :
+ MCSymbolRefExpr::VK_PPC_GAS_LO16;
+ break;
+ case PPCII::MO_TPREL16_HA: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_HA;
+ break;
+ case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO;
+ break;
+ }
// FIXME: This isn't right, but we don't have a good way to express this in
// the MC Level, see below.
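Switching GetSymbolRef from per-bit flag tests to a masked access field is what
lets the two TLS variants coexist with HA16/LO16 (the dispatch assumes
MO_ACCESS_MASK covers exactly these encodings; the mask's definition lives in
PPCII, outside this hunk). Combined with the tglobaltlsaddr patterns added to
the .td files above, a local-exec TLS access should print roughly as:

  addis r4, r13, sym@tprel@ha
  addi  r4, r4,  sym@tprel@l

with r13 as the thread pointer; treat the exact spelling as illustrative of the
TPREL16_HA/LO relocation kinds rather than as a complete TLS lowering.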
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index ef13571..ab8bf1f 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -89,10 +89,17 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; ImmToIdxMap[PPC::STD_32] = PPC::STDX_32;
}
+bool
+PPCRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ return requiresRegisterScavenging(MF);
+}
+
+
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
const TargetRegisterClass *
-PPCRegisterInfo::getPointerRegClass(unsigned Kind) const {
+PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
if (Subtarget.isPPC64())
return &PPC::G8RCRegClass;
return &PPC::GPRCRegClass;
@@ -192,6 +199,20 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
+bool
+PPCRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
+ switch (RC->getID()) {
+ case PPC::G8RCRegClassID:
+ case PPC::GPRCRegClassID:
+ case PPC::F8RCRegClassID:
+ case PPC::F4RCRegClassID:
+ case PPC::VRRCRegClassID:
+ return true;
+ default:
+ return false;
+ }
+}
+
//===----------------------------------------------------------------------===//
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//
@@ -321,14 +342,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II,
// address of new allocated space.
if (LP64) {
if (requiresRegisterScavenging(MF)) // FIXME (64-bit): Use "true" part.
- BuildMI(MBB, II, dl, TII.get(PPC::STDUX))
+ BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
.addReg(Reg, RegState::Kill)
- .addReg(PPC::X1, RegState::Define)
+ .addReg(PPC::X1)
.addReg(MI.getOperand(1).getReg());
else
- BuildMI(MBB, II, dl, TII.get(PPC::STDUX))
+ BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
.addReg(PPC::X0, RegState::Kill)
- .addReg(PPC::X1, RegState::Define)
+ .addReg(PPC::X1)
.addReg(MI.getOperand(1).getReg());
if (!MI.getOperand(1).isKill())
@@ -342,9 +363,9 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II,
.addImm(maxCallFrameSize)
.addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
} else {
- BuildMI(MBB, II, dl, TII.get(PPC::STWUX))
+ BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
.addReg(Reg, RegState::Kill)
- .addReg(PPC::R1, RegState::Define)
+ .addReg(PPC::R1)
.addReg(MI.getOperand(1).getReg());
if (!MI.getOperand(1).isKill())
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index b1e6a72..152c36d 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -35,7 +35,8 @@ public:
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
- virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const;
+ virtual const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const;
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const;
@@ -46,10 +47,14 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const;
+ virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const;
+
/// requiresRegisterScavenging - We require a register scavenger.
/// FIXME (64-bit): Should be inlined.
bool requiresRegisterScavenging(const MachineFunction &MF) const;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index 0e55313..5ca3876 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -314,12 +314,18 @@ def CRBITRC : RegisterClass<"PPC", [i32], 32,
}
def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
- CR7, CR2, CR3, CR4)> {
- let SubRegClasses = [(CRBITRC sub_lt, sub_gt, sub_eq, sub_un)];
+ CR7, CR2, CR3, CR4)>;
+
+// The CTR registers are not allocatable because they're used by the
+// decrement-and-branch instructions, and thus need to stay live across
+// multiple basic blocks.
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> {
+ let isAllocatable = 0;
+}
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
+ let isAllocatable = 0;
}
-def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)>;
-def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)>;
def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
let CopyCost = -1;
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 8c0a858..6a6ccb9 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -25,6 +25,7 @@ def VFPU : FuncUnit; // vector floating point unit
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for PowerPC
//
+def IntSimple : InstrItinClass;
def IntGeneral : InstrItinClass;
def IntCompare : InstrItinClass;
def IntDivD : InstrItinClass;
@@ -117,17 +118,17 @@ include "PPCScheduleA2.td"
//
// opcode itinerary class
// ====== ===============
-// add IntGeneral
+// add IntSimple
// addc IntGeneral
// adde IntGeneral
-// addi IntGeneral
+// addi IntSimple
// addic IntGeneral
// addic. IntGeneral
-// addis IntGeneral
+// addis IntSimple
// addme IntGeneral
// addze IntGeneral
-// and IntGeneral
-// andc IntGeneral
+// and IntSimple
+// andc IntSimple
// andi. IntGeneral
// andis. IntGeneral
// b BrB
@@ -165,10 +166,10 @@ include "PPCScheduleA2.td"
// eciwx LdStLoad
// ecowx LdStLoad
// eieio LdStLoad
-// eqv IntGeneral
-// extsb IntGeneral
-// extsh IntGeneral
-// extsw IntRotateD
+// eqv IntSimple
+// extsb IntSimple
+// extsh IntSimple
+// extsw IntSimple
// fabs FPGeneral
// fadd FPGeneral
// fadds FPGeneral
@@ -280,13 +281,13 @@ include "PPCScheduleA2.td"
// mulld IntMulHD
// mulli IntMulLI
// mullw IntMulHW
-// nand IntGeneral
-// neg IntGeneral
-// nor IntGeneral
-// or IntGeneral
-// orc IntGeneral
-// ori IntGeneral
-// oris IntGeneral
+// nand IntSimple
+// neg IntSimple
+// nor IntSimple
+// or IntSimple
+// orc IntSimple
+// ori IntSimple
+// oris IntSimple
// rfi SprRFI
// rfid IntRFID
// rldcl IntRotateD
@@ -502,7 +503,7 @@ include "PPCScheduleA2.td"
// vupklsb VecPerm
// vupklsh VecPerm
// vxor VecGeneral
-// xor IntGeneral
-// xori IntGeneral
-// xoris IntGeneral
+// xor IntSimple
+// xori IntSimple
+// xoris IntSimple
//
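Every processor model touched by this patch maps the new IntSimple class to the
same stages as IntGeneral, so no latencies change here; the point of the split
is to let a model issue one-cycle ALU ops (logical ops, li/addi, sign
extensions) more widely than the rest of the integer class. A hypothetical
model exploiting the distinction might read:

  // Hypothetical core: the fourth unit handles only simple integer ops.
  InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
  InstrItinData<IntGeneral, [InstrStage<1, [IU1, IU2, IU3]>]>,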
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index 419faea..cd0fb70 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -108,6 +108,15 @@ def PPC440Itineraries : ProcessorItineraries<
IRACC, IEXE1, IEXE2, IWB, LRACC, JEXE1, JEXE2, JWB, AGEN, CRD, LWB,
FEXE1, FEXE2, FEXE3, FEXE4, FEXE5, FEXE6, FWB, LWARX_Hold],
[GPR_Bypass, FPR_Bypass], [
+ InstrItinData<IntSimple , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [IRACC, LRACC]>,
+ InstrStage<1, [IEXE1, JEXE1]>,
+ InstrStage<1, [IEXE2, JEXE2]>,
+ InstrStage<1, [IWB, JWB]>],
+ [6, 4, 4],
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
InstrItinData<IntGeneral , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -373,26 +382,6 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<1, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTD , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1]>,
- InstrStage<1, [IRACC], 0>,
- InstrStage<4, [LWARX_Hold], 0>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
InstrItinData<LdStSTWCX , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1]>,
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 857ba40..4d4a5d0 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -60,6 +60,17 @@ def PPCA2Itineraries : ProcessorItineraries<
IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6,
FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6],
[CR_Bypass, GPR_Bypass, FPR_Bypass], [
+ InstrItinData<IntSimple , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [10, 7, 7],
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
InstrItinData<IntGeneral , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -159,6 +170,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[10, 7, 7],
[GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotateD , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [10, 7, 7],
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
InstrItinData<IntShift , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -181,6 +203,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[10, 7, 7],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntTrapD , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [10, 7, 7],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<BrB , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -269,6 +302,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLD , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [14, 7],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStStore , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -379,28 +423,6 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[26, 7],
[NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTD , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTDCX , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [26, 7],
- [NoBypass, GPR_Bypass]>,
InstrItinData<LdStSTWCX , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index bc926f7..61e89ed 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -14,6 +14,7 @@
def G3Itineraries : ProcessorItineraries<
[IU1, IU2, FPU1, BPU, SRU, SLU], [], [
+ InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>,
InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>,
InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>,
InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index f7ec1e0..e19ddfa 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -13,6 +13,7 @@
def G4Itineraries : ProcessorItineraries<
[IU1, IU2, SLU, SRU, BPU, FPU1, VIU1, VIU2, VPU, VFPU], [], [
+ InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>,
InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>,
InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>,
InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
index 37ebfc5..e7446cb 100644
--- a/lib/Target/PowerPC/PPCScheduleG4Plus.td
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -16,6 +16,7 @@ def IU4 : FuncUnit; // integer unit 4 (7450 simple)
def G4PlusItineraries : ProcessorItineraries<
[IU1, IU2, IU3, IU4, BPU, SLU, FPU1, VFPU, VIU1, VIU2, VPU], [], [
+ InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
InstrItinData<IntDivW , [InstrStage<23, [IU2]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index d1e40ce..1371499 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -13,6 +13,7 @@
def G5Itineraries : ProcessorItineraries<
[IU1, IU2, SLU, BPU, FPU1, FPU2, VFPU, VIU1, VIU2, VPU], [], [
+ InstrItinData<IntSimple , [InstrStage<2, [IU1, IU2]>]>,
InstrItinData<IntGeneral , [InstrStage<2, [IU1, IU2]>]>,
InstrItinData<IntCompare , [InstrStage<3, [IU1, IU2]>]>,
InstrItinData<IntDivD , [InstrStage<68, [IU1]>]>,
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index f405b47..bb193ac 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -16,6 +16,7 @@
#include "PPC.h"
#include "llvm/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#include <cstdlib>
@@ -25,56 +26,19 @@
using namespace llvm;
-#if defined(__APPLE__)
-#include <mach/mach.h>
-#include <mach/mach_host.h>
-#include <mach/host_info.h>
-#include <mach/machine.h>
-
-/// GetCurrentPowerPCFeatures - Returns the current CPUs features.
-static const char *GetCurrentPowerPCCPU() {
- host_basic_info_data_t hostInfo;
- mach_msg_type_number_t infoCount;
-
- infoCount = HOST_BASIC_INFO_COUNT;
- host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo,
- &infoCount);
-
- if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic";
-
- switch(hostInfo.cpu_subtype) {
- case CPU_SUBTYPE_POWERPC_601: return "601";
- case CPU_SUBTYPE_POWERPC_602: return "602";
- case CPU_SUBTYPE_POWERPC_603: return "603";
- case CPU_SUBTYPE_POWERPC_603e: return "603e";
- case CPU_SUBTYPE_POWERPC_603ev: return "603ev";
- case CPU_SUBTYPE_POWERPC_604: return "604";
- case CPU_SUBTYPE_POWERPC_604e: return "604e";
- case CPU_SUBTYPE_POWERPC_620: return "620";
- case CPU_SUBTYPE_POWERPC_750: return "750";
- case CPU_SUBTYPE_POWERPC_7400: return "7400";
- case CPU_SUBTYPE_POWERPC_7450: return "7450";
- case CPU_SUBTYPE_POWERPC_970: return "970";
- default: ;
- }
-
- return "generic";
-}
-#endif
-
-
PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool is64Bit)
: PPCGenSubtargetInfo(TT, CPU, FS)
, StackAlignment(16)
, DarwinDirective(PPC::DIR_NONE)
- , IsGigaProcessor(false)
+ , HasMFOCRF(false)
, Has64BitSupport(false)
, Use64BitRegs(false)
, IsPPC64(is64Bit)
, HasAltivec(false)
, HasFSQRT(false)
, HasSTFIWX(false)
+ , HasISEL(false)
, IsBookE(false)
, HasLazyResolverStubs(false)
, IsJITCodeModel(false)
@@ -84,9 +48,10 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
std::string CPUName = CPU;
if (CPUName.empty())
CPUName = "generic";
-#if defined(__APPLE__)
+#if (defined(__APPLE__) || defined(__linux__)) && \
+ (defined(__ppc__) || defined(__powerpc__))
if (CPUName == "generic")
- CPUName = GetCurrentPowerPCCPU();
+ CPUName = sys::getHostCPUName();
#endif
// Parse features string.
@@ -146,10 +111,14 @@ bool PPCSubtarget::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const {
- if (DarwinDirective == PPC::DIR_440 || DarwinDirective == PPC::DIR_A2)
- Mode = TargetSubtargetInfo::ANTIDEP_ALL;
- else
- Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ // FIXME: It would be best to use TargetSubtargetInfo::ANTIDEP_ALL here,
+ // but we can't because we can't reassign the cr registers. There is a
+ // dependence between the cr register and the RLWINM instruction used
+ // to extract its value, which the anti-dependency breaker can't currently
+ // see. Maybe we should make a late-expanded pseudo to encode this dependency.
+ // (the relevant code is in PPCDAGToDAGISel::SelectSETCC)
+
+ Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
CriticalPathRCs.clear();
@@ -157,6 +126,9 @@ bool PPCSubtarget::enablePostRAScheduler(
CriticalPathRCs.push_back(&PPC::G8RCRegClass);
else
CriticalPathRCs.push_back(&PPC::GPRCRegClass);
+
+ CriticalPathRCs.push_back(&PPC::F8RCRegClass);
+ CriticalPathRCs.push_back(&PPC::VRRCRegClass);
return OptLevel >= CodeGenOpt::Default;
}
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index a275029..0207c83 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -41,6 +41,8 @@ namespace PPC {
DIR_750,
DIR_970,
DIR_A2,
+ DIR_PWR6,
+ DIR_PWR7,
DIR_64
};
}
@@ -61,13 +63,14 @@ protected:
unsigned DarwinDirective;
/// Used by the ISel to turn on optimizations for POWER4-derived architectures
- bool IsGigaProcessor;
+ bool HasMFOCRF;
bool Has64BitSupport;
bool Use64BitRegs;
bool IsPPC64;
bool HasAltivec;
bool HasFSQRT;
bool HasSTFIWX;
+ bool HasISEL;
bool IsBookE;
bool HasLazyResolverStubs;
bool IsJITCodeModel;
@@ -138,7 +141,8 @@ public:
bool hasFSQRT() const { return HasFSQRT; }
bool hasSTFIWX() const { return HasSTFIWX; }
bool hasAltivec() const { return HasAltivec; }
- bool isGigaProcessor() const { return IsGigaProcessor; }
+ bool hasMFOCRF() const { return HasMFOCRF; }
+ bool hasISEL() const { return HasISEL; }
bool isBookE() const { return IsBookE; }
const Triple &getTargetTriple() const { return TargetTriple; }
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 50f3db8..9805112 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -17,10 +17,15 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
+static cl::
+opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
+ cl::desc("Disable CTR loops for PPC"));
+
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
@@ -81,41 +86,37 @@ public:
return getTM<PPCTargetMachine>();
}
+ virtual bool addPreRegAlloc();
virtual bool addInstSelector();
virtual bool addPreEmitPass();
};
} // namespace
TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
- TargetPassConfig *PassConfig = new PPCPassConfig(this, PM);
+ return new PPCPassConfig(this, PM);
+}
- // Override this for PowerPC. Tail merging happily breaks up instruction issue
- // groups, which typically degrades performance.
- PassConfig->setEnableTailMerge(false);
+bool PPCPassConfig::addPreRegAlloc() {
+ if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCCTRLoops());
- return PassConfig;
+ return false;
}
bool PPCPassConfig::addInstSelector() {
// Install an instruction selector.
- PM->add(createPPCISelDag(getPPCTargetMachine()));
+ addPass(createPPCISelDag(getPPCTargetMachine()));
return false;
}
bool PPCPassConfig::addPreEmitPass() {
// Must run branch selection immediately preceding the asm printer.
- PM->add(createPPCBranchSelectionPass());
+ addPass(createPPCBranchSelectionPass());
return false;
}
bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
JITCodeEmitter &JCE) {
- // FIXME: This should be moved to TargetJITInfo!!
- if (Subtarget.isPPC64())
- // Temporary workaround for the inability of PPC64 JIT to handle jump
- // tables.
- Options.DisableJumpTables = true;
-
// Inform the subtarget that we are in JIT mode. FIXME: does this break macho
// writing?
Subtarget.SetJITMode();
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index 349cd89..b6763aa 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -2,7 +2,6 @@
TODO:
* gpr0 allocation
-* implement do-loop -> bdnz transform
* lmw/stmw pass a la arm load store optimizer for prolog/epilog
===-------------------------------------------------------------------------===
diff --git a/lib/Target/PowerPC/TargetInfo/Makefile b/lib/Target/PowerPC/TargetInfo/Makefile
index a101aa4..2d0560d 100644
--- a/lib/Target/PowerPC/TargetInfo/Makefile
+++ b/lib/Target/PowerPC/TargetInfo/Makefile
@@ -10,6 +10,6 @@ LEVEL = ../../../..
LIBRARYNAME = LLVMPowerPCInfo
# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+override CPPFLAGS += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 093255e..cbfa4cf 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -964,6 +964,12 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
//===---------------------------------------------------------------------===//
+unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
+The & 15 part should be optimized away: (x & 7) + 1 is at most 8, so masking
+with 15 doesn't change the result. Currently not optimized with
+"clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
This was noticed in the entryblock for grokdeclarator in 403.gcc:
%tmp = icmp eq i32 %decl_context, 4
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index ae4af0f..efb10db 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -23,5 +23,7 @@ add_llvm_target(SparcCodeGen
SparcSelectionDAGInfo.cpp
)
+add_dependencies(LLVMSparcCodeGen intrinsics_gen)
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 883aa3a..7bf8c3f 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -279,14 +279,11 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
// Returns true if the Reg or its alias is in the RegSet.
bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
{
- if (RegSet.count(Reg))
- return true;
- // check Aliased Registers
- for (const uint16_t *Alias = TM.getRegisterInfo()->getAliasSet(Reg);
- *Alias; ++ Alias)
- if (RegSet.count(*Alias))
+ // Check Reg and all aliased Registers.
+ for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true);
+ AI.isValid(); ++AI)
+ if (RegSet.count(*AI))
return true;
-
return false;
}
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index c14b3d4..2554862 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -187,7 +187,9 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
if (ExtraCode[1] != 0) return true; // Unknown modifier.
switch (ExtraCode[0]) {
- default: return true; // Unknown modifier.
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
case 'r':
break;
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index 210705e..6b593c9 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -22,10 +22,9 @@ namespace llvm {
class SparcSubtarget;
class SparcFrameLowering : public TargetFrameLowering {
- const SparcSubtarget &STI;
public:
- explicit SparcFrameLowering(const SparcSubtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0), STI(sti) {
+ explicit SparcFrameLowering(const SparcSubtarget &/*sti*/)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {
}
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index c3e6f16..79f7ebd 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -90,7 +90,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- DAG.getTarget(), RVLocs, *DAG.getContext());
+ DAG.getTarget(), RVLocs, *DAG.getContext());
// Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
@@ -160,7 +160,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
const unsigned StackOffset = 92;
@@ -345,21 +345,26 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,
}
SDValue
-SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
// Sparc target does not yet support tail call optimization.
isTailCall = false;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- DAG.getTarget(), ArgLocs, *DAG.getContext());
+ DAG.getTarget(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
// Get the size of the outgoing arguments stack space requirement.
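
The hunk above folds LowerCall's long positional parameter list into a single
CallLoweringInfo argument that is unpacked locally. A minimal standalone sketch
of that parameter-object pattern, with illustrative names and none of
SelectionDAG's real types:

#include <string>
#include <vector>

// Before: lowerCall(callee, args, isVarArg, isTailCall, ...). After:
// one struct, so call sites and overrides survive added fields.
struct CallInfo {
  std::string Callee;
  std::vector<int> Args;
  bool IsVarArg = false;
  bool IsTailCall = false;
};

int lowerCall(CallInfo &CLI) {
  // Unpack by reference, mirroring how the real LowerCall reads CLI.
  std::vector<int> &Args = CLI.Args;
  bool &IsTailCall = CLI.IsTailCall;
  IsTailCall = false; // this target doesn't do tail calls yet
  return static_cast<int>(Args.size());
}

int main() {
  CallInfo CLI;
  CLI.Callee = "f";
  CLI.Args = {1, 2, 3};
  return lowerCall(CLI) == 3 ? 0 : 1;
}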
@@ -590,7 +595,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- DAG.getTarget(), RVLocs, *DAG.getContext());
+ DAG.getTarget(), RVLocs, *DAG.getContext());
RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
@@ -689,9 +694,9 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
: TargetLowering(TM, new TargetLoweringObjectFileELF()) {
// Set up the register classes.
- addRegisterClass(MVT::i32, SP::IntRegsRegisterClass);
- addRegisterClass(MVT::f32, SP::FPRegsRegisterClass);
- addRegisterClass(MVT::f64, SP::DFPRegsRegisterClass);
+ addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
+ addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
+ addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
// Turn FP extload into load/fextend
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
@@ -1259,7 +1264,7 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
- return std::make_pair(0U, SP::IntRegsRegisterClass);
+ return std::make_pair(0U, &SP::IntRegsRegClass);
}
}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index cf43048..09148ea 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -76,12 +76,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index faff468..f8674d0 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -303,13 +303,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (I != MBB.end()) DL = I->getDebugLoc();
// On the order of operands here: think "[FrameIdx + 0] = SrcReg".
- if (RC == SP::IntRegsRegisterClass)
+ if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill));
- else if (RC == SP::FPRegsRegisterClass)
+ else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill));
- else if (RC == SP::DFPRegsRegisterClass)
+ else if (RC == &SP::DFPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill));
else
@@ -324,11 +324,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
- if (RC == SP::IntRegsRegisterClass)
+ if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0);
- else if (RC == SP::FPRegsRegisterClass)
+ else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0);
- else if (RC == SP::DFPRegsRegisterClass)
+ else if (RC == &SP::DFPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0);
else
llvm_unreachable("Can't load this register from stack slot");
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 6357468..ff8d3c5 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -109,9 +109,6 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-void SparcRegisterInfo::
-processFunctionBeforeFrameFinalized(MachineFunction &MF) const {}
-
unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return SP::I6;
}
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index cc25307..9ee12ed 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -34,7 +34,8 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, is64bit),
DataLayout(Subtarget.getDataLayout()),
- TLInfo(*this), TSInfo(*this), InstrInfo(Subtarget),
+ InstrInfo(Subtarget),
+ TLInfo(*this), TSInfo(*this),
FrameLowering(Subtarget) {
}
@@ -59,7 +60,7 @@ TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
}
bool SparcPassConfig::addInstSelector() {
- PM->add(createSparcISelDag(getSparcTargetMachine()));
+ addPass(createSparcISelDag(getSparcTargetMachine()));
return false;
}
@@ -67,8 +68,8 @@ bool SparcPassConfig::addInstSelector() {
/// passes immediately before machine code is emitted. This should return
/// true if -print-machineinstrs should print out the code after the passes.
bool SparcPassConfig::addPreEmitPass(){
- PM->add(createSparcFPMoverPass(getSparcTargetMachine()));
- PM->add(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
+ addPass(createSparcFPMoverPass(getSparcTargetMachine()));
+ addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
return true;
}
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index b203dfa..b2cc624 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -28,9 +28,9 @@ namespace llvm {
class SparcTargetMachine : public LLVMTargetMachine {
SparcSubtarget Subtarget;
const TargetData DataLayout; // Calculates type size & alignment
+ SparcInstrInfo InstrInfo;
SparcTargetLowering TLInfo;
SparcSelectionDAGInfo TSInfo;
- SparcInstrInfo InstrInfo;
SparcFrameLowering FrameLowering;
public:
SparcTargetMachine(const Target &T, StringRef TT,
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index acb7476..cc6dc1e 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -117,8 +117,8 @@ TargetAlignElem::operator==(const TargetAlignElem &rhs) const {
&& TypeBitWidth == rhs.TypeBitWidth);
}
-const TargetAlignElem TargetData::InvalidAlignmentElem =
- TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0);
+const TargetAlignElem
+TargetData::InvalidAlignmentElem = { (AlignTypeEnum)0xFF, 0, 0, 0 };
//===----------------------------------------------------------------------===//
// TargetData Class Implementation
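
The new initializer above also swaps the out-of-range (AlignTypeEnum)-1 cast
for 0xFF. A plausible motivation for the braced form, an assumption rather than
anything the commit states, is that a braced constant initializer gets static
(compile-time) initialization, while the old TargetAlignElem::get() call was a
dynamic initializer subject to cross-TU ordering. A small sketch of the
difference:

// Sketch only: Elem stands in for TargetAlignElem.
struct Elem { unsigned char Kind; unsigned ABI, Pref, Width; };

Elem make(unsigned char K, unsigned A, unsigned P, unsigned W) {
  Elem E = {K, A, P, W};
  return E;
}

// Runs before main, in unspecified order relative to other TUs.
const Elem Dynamic = make(0xFF, 0, 0, 0);
// Baked into the image; safe to read from any initializer.
const Elem Constant = {0xFF, 0, 0, 0};

int main() { return Constant.Kind == Dynamic.Kind ? 0 : 1; }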
diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp
index 440f9ad..f1d1d07 100644
--- a/lib/Target/TargetInstrInfo.cpp
+++ b/lib/Target/TargetInstrInfo.cpp
@@ -21,20 +21,25 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
// TargetInstrInfo
-//===----------------------------------------------------------------------===//
+//
+// Methods that depend on CodeGen are implemented in
+// TargetInstrInfoImpl.cpp. Invoking them without linking libCodeGen raises a
+// link error.
+//===----------------------------------------------------------------------===//
TargetInstrInfo::~TargetInstrInfo() {
}
const TargetRegisterClass*
TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) const {
if (OpNum >= MCID.getNumOperands())
return 0;
short RegClass = MCID.OpInfo[OpNum].RegClass;
if (MCID.OpInfo[OpNum].isLookupPtrRegClass())
- return TRI->getPointerRegClass(RegClass);
+ return TRI->getPointerRegClass(MF, RegClass);
// Instructions like INSERT_SUBREG do not have fixed register classes.
if (RegClass < 0)
@@ -44,54 +49,6 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
return TRI->getRegClass(RegClass);
}
-unsigned
-TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const {
- if (!ItinData || ItinData->isEmpty())
- return 1;
-
- unsigned Class = MI->getDesc().getSchedClass();
- unsigned UOps = ItinData->Itineraries[Class].NumMicroOps;
- if (UOps)
- return UOps;
-
- // The # of u-ops is dynamically determined. The specific target should
- // override this function to return the right number.
- return 1;
-}
-
-int
-TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const {
- if (!ItinData || ItinData->isEmpty())
- return -1;
-
- unsigned DefClass = DefMI->getDesc().getSchedClass();
- unsigned UseClass = UseMI->getDesc().getSchedClass();
- return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
-}
-
-int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
- unsigned *PredCost) const {
- if (!ItinData || ItinData->isEmpty())
- return 1;
-
- return ItinData->getStageLatency(MI->getDesc().getSchedClass());
-}
-
-bool TargetInstrInfo::hasLowDefLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI,
- unsigned DefIdx) const {
- if (!ItinData || ItinData->isEmpty())
- return false;
-
- unsigned DefClass = DefMI->getDesc().getSchedClass();
- int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
- return (DefCycle != -1 && DefCycle <= 1);
-}
-
/// insertNoop - Insert a noop into the instruction stream at the specified
/// point.
void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB,
@@ -99,7 +56,6 @@ void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB,
llvm_unreachable("Target didn't implement insertNoop!");
}
-
/// Measure the specified inline asm to determine an approximation of its
/// length.
/// Comments (which run till the next SeparatorString or newline) do not
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index ec95ad4..8e215a7 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -24,64 +24,72 @@ void TargetLibraryInfo::anchor() { }
const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
{
+ "__cxa_atexit",
+ "__cxa_guard_abort",
+ "__cxa_guard_acquire",
+ "__cxa_guard_release",
+ "__memcpy_chk",
"acos",
- "acosl",
"acosf",
+ "acosl",
"asin",
- "asinl",
"asinf",
+ "asinl",
"atan",
- "atanl",
- "atanf",
"atan2",
- "atan2l",
"atan2f",
+ "atan2l",
+ "atanf",
+ "atanl",
"ceil",
- "ceill",
"ceilf",
+ "ceill",
"copysign",
"copysignf",
"copysignl",
"cos",
- "cosl",
"cosf",
"cosh",
- "coshl",
"coshf",
+ "coshl",
+ "cosl",
"exp",
- "expl",
- "expf",
"exp2",
- "exp2l",
"exp2f",
+ "exp2l",
+ "expf",
+ "expl",
"expm1",
- "expm1l",
"expm1f",
+ "expm1l",
"fabs",
- "fabsl",
"fabsf",
+ "fabsl",
+ "fiprintf",
"floor",
- "floorl",
"floorf",
- "fiprintf",
+ "floorl",
"fmod",
- "fmodl",
"fmodf",
+ "fmodl",
+ "fputc",
"fputs",
"fwrite",
"iprintf",
"log",
- "logl",
- "logf",
- "log2",
- "log2l",
- "log2f",
"log10",
- "log10l",
"log10f",
+ "log10l",
"log1p",
- "log1pl",
"log1pf",
+ "log1pl",
+ "log2",
+ "log2f",
+ "log2l",
+ "logf",
+ "logl",
+ "memchr",
+ "memcmp",
"memcpy",
"memmove",
"memset",
@@ -92,6 +100,8 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"pow",
"powf",
"powl",
+ "putchar",
+ "puts",
"rint",
"rintf",
"rintl",
@@ -99,36 +109,48 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"roundf",
"roundl",
"sin",
- "sinl",
"sinf",
"sinh",
- "sinhl",
"sinhf",
+ "sinhl",
+ "sinl",
"siprintf",
"sqrt",
- "sqrtl",
"sqrtf",
+ "sqrtl",
+ "strcat",
+ "strchr",
+ "strcpy",
+ "strlen",
+ "strncat",
+ "strncmp",
+ "strncpy",
+ "strnlen",
"tan",
- "tanl",
"tanf",
"tanh",
- "tanhl",
"tanhf",
+ "tanhl",
+ "tanl",
"trunc",
"truncf",
- "truncl",
- "__cxa_atexit",
- "__cxa_guard_abort",
- "__cxa_guard_acquire",
- "__cxa_guard_release"
+ "truncl"
};
/// initialize - Initialize the set of available library functions based on the
/// specified target triple. This should be carefully written so that a missing
/// target triple gets a sane set of defaults.
-static void initialize(TargetLibraryInfo &TLI, const Triple &T) {
+static void initialize(TargetLibraryInfo &TLI, const Triple &T,
+ const char **StandardNames) {
initializeTargetLibraryInfoPass(*PassRegistry::getPassRegistry());
+#ifndef NDEBUG
+ // Verify that the StandardNames array is in alphabetical order.
+ for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
+ if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
+ llvm_unreachable("TargetLibraryInfo function names must be sorted");
+ }
+#endif // !NDEBUG
// memset_pattern16 is only available on iOS 3.0 and Mac OS/X 10.5 and later.
if (T.isMacOSX()) {
@@ -240,14 +262,14 @@ TargetLibraryInfo::TargetLibraryInfo() : ImmutablePass(ID) {
// Default to everything being available.
memset(AvailableArray, -1, sizeof(AvailableArray));
- initialize(*this, Triple());
+ initialize(*this, Triple(), StandardNames);
}
TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) {
// Default to everything being available.
memset(AvailableArray, -1, sizeof(AvailableArray));
- initialize(*this, T);
+ initialize(*this, T, StandardNames);
}
TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
@@ -256,6 +278,17 @@ TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
CustomNames = TLI.CustomNames;
}
+bool TargetLibraryInfo::getLibFunc(StringRef funcName,
+ LibFunc::Func &F) const {
+ const char **Start = &StandardNames[0];
+ const char **End = &StandardNames[LibFunc::NumLibFuncs];
+ const char **I = std::lower_bound(Start, End, funcName);
+ if (I != End && *I == funcName) {
+ F = (LibFunc::Func)(I - Start);
+ return true;
+ }
+ return false;
+}
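
getLibFunc above binary-searches the alphabetically sorted StandardNames
table, which is exactly the invariant the NDEBUG check enforces. A
self-contained sketch of the same scheme over a made-up three-entry table:

#include <algorithm>
#include <cstring>
#include <iostream>

enum Func { F_memcpy, F_memset, F_strlen, NumFuncs };

// Must stay sorted by strcmp, or lower_bound gives wrong answers.
const char *Names[NumFuncs] = {"memcpy", "memset", "strlen"};

bool getFunc(const char *Name, Func &F) {
  const char **Start = &Names[0];
  const char **End = &Names[NumFuncs];
  const char **I = std::lower_bound(
      Start, End, Name,
      [](const char *A, const char *B) { return std::strcmp(A, B) < 0; });
  if (I != End && std::strcmp(*I, Name) == 0) {
    F = static_cast<Func>(I - Start); // index doubles as the enum value
    return true;
  }
  return false;
}

int main() {
  Func F;
  std::cout << getFunc("memset", F) << ' ' << F << '\n'; // 1 1
  std::cout << getFunc("printf", F) << '\n';             // 0
}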
/// disableAllFunctions - This disables all builtins, which is used for options
/// like -fno-builtin.
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 2570e0d..b74a0bd 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -152,7 +152,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
// a mergable string section, or general .data if it contains relocations.
if (GVar->isConstant()) {
// If the initializer for the global contains something that requires a
- // relocation, then we may have to drop this into a wriable data section
+ // relocation, then we may have to drop this into a writable data section
// even though it is marked const.
switch (C->getRelocationInfo()) {
case Constant::NoRelocation:
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index b9b2526..3825719 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -11,7 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/GlobalAlias.h"
#include "llvm/GlobalValue.h"
+#include "llvm/GlobalVariable.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -75,25 +77,58 @@ CodeModel::Model TargetMachine::getCodeModel() const {
return CodeGenInfo->getCodeModel();
}
+/// Get the IR-specified TLS model for Var.
+static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) {
+ switch (Var->getThreadLocalMode()) {
+ case GlobalVariable::NotThreadLocal:
+ llvm_unreachable("getSelectedTLSModel for non-TLS variable");
+ break;
+ case GlobalVariable::GeneralDynamicTLSModel:
+ return TLSModel::GeneralDynamic;
+ case GlobalVariable::LocalDynamicTLSModel:
+ return TLSModel::LocalDynamic;
+ case GlobalVariable::InitialExecTLSModel:
+ return TLSModel::InitialExec;
+ case GlobalVariable::LocalExecTLSModel:
+ return TLSModel::LocalExec;
+ }
+ llvm_unreachable("invalid TLS model");
+}
+
TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
- bool isLocal = GV->hasLocalLinkage();
- bool isDeclaration = GV->isDeclaration();
+ // If GV is an alias then use the aliasee for determining
+ // thread-localness.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GV = GA->resolveAliasedGlobal(false);
+ const GlobalVariable *Var = cast<GlobalVariable>(GV);
+
+ bool isLocal = Var->hasLocalLinkage();
+ bool isDeclaration = Var->isDeclaration();
+ bool isPIC = getRelocationModel() == Reloc::PIC_;
+ bool isPIE = Options.PositionIndependentExecutable;
// FIXME: what should we do for protected and internal visibility?
// For variables, is internal different from hidden?
- bool isHidden = GV->hasHiddenVisibility();
+ bool isHidden = Var->hasHiddenVisibility();
- if (getRelocationModel() == Reloc::PIC_ &&
- !Options.PositionIndependentExecutable) {
+ TLSModel::Model Model;
+ if (isPIC && !isPIE) {
if (isLocal || isHidden)
- return TLSModel::LocalDynamic;
+ Model = TLSModel::LocalDynamic;
else
- return TLSModel::GeneralDynamic;
+ Model = TLSModel::GeneralDynamic;
} else {
if (!isDeclaration || isHidden)
- return TLSModel::LocalExec;
+ Model = TLSModel::LocalExec;
else
- return TLSModel::InitialExec;
+ Model = TLSModel::InitialExec;
}
+
+ // If the user specified a more specific model, use that.
+ TLSModel::Model SelectedModel = getSelectedTLSModel(Var);
+ if (SelectedModel > Model)
+ return SelectedModel;
+
+ return Model;
}
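
The rewritten getTLSModel derives a default from PIC/PIE and symbol locality,
then lets a more specific IR-level choice (a larger enum value) override it. A
compact sketch of just that precedence rule, with illustrative enum values and
the alias/NotThreadLocal handling left out:

#include <algorithm>
#include <iostream>

enum Model { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };

Model pickTLSModel(bool isPIC, bool isPIE, bool isLocal, bool isHidden,
                   bool isDeclaration, Model Selected) {
  Model M;
  if (isPIC && !isPIE)
    M = (isLocal || isHidden) ? LocalDynamic : GeneralDynamic;
  else
    M = (!isDeclaration || isHidden) ? LocalExec : InitialExec;
  return std::max(M, Selected); // the stronger user choice wins
}

int main() {
  // PIC shared object, external declaration, user asked for initial-exec:
  std::cout << pickTLSModel(true, false, false, false, true, InitialExec)
            << '\n'; // 2, i.e. InitialExec beats the GeneralDynamic default
}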
/// getOptLevel - Returns the optimization level: None, Less,
@@ -127,4 +162,3 @@ void TargetMachine::setFunctionSections(bool V) {
void TargetMachine::setDataSections(bool V) {
DataSections = V;
}
-
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index 1716423..2395f2b 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -46,6 +46,50 @@ void PrintReg::print(raw_ostream &OS) const {
}
}
+void PrintRegUnit::print(raw_ostream &OS) const {
+ // Generic printout when TRI is missing.
+ if (!TRI) {
+ OS << "Unit~" << Unit;
+ return;
+ }
+
+ // Check for invalid register units.
+ if (Unit >= TRI->getNumRegUnits()) {
+ OS << "BadUnit~" << Unit;
+ return;
+ }
+
+ // Normal units have at least one root.
+ MCRegUnitRootIterator Roots(Unit, TRI);
+ assert(Roots.isValid() && "Unit has no roots.");
+ OS << TRI->getName(*Roots);
+ for (++Roots; Roots.isValid(); ++Roots)
+ OS << '~' << TRI->getName(*Roots);
+}
+
+/// getAllocatableClass - Return the maximal subclass of the given register
+/// class that is allocatable, or NULL.
+const TargetRegisterClass *
+TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
+ if (!RC || RC->isAllocatable())
+ return RC;
+
+ const unsigned *SubClass = RC->getSubClassMask();
+ for (unsigned Base = 0, BaseE = getNumRegClasses();
+ Base < BaseE; Base += 32) {
+ unsigned Idx = Base;
+ for (unsigned Mask = *SubClass++; Mask; Mask >>= 1) {
+ unsigned Offset = CountTrailingZeros_32(Mask);
+ const TargetRegisterClass *SubRC = getRegClass(Idx + Offset);
+ if (SubRC->isAllocatable())
+ return SubRC;
+ Mask >>= Offset;
+ Idx += Offset + 1;
+ }
+ }
+ return NULL;
+}
+
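
getAllocatableClass above walks a word-packed sub-class bit mask, using a
count-trailing-zeros step to jump straight between set bits in ascending ID
order. A standalone sketch of that walk (illustrative bit vector contents,
and assuming GCC/Clang for __builtin_ctz, the builtin underlying
CountTrailingZeros_32):

#include <cstdint>
#include <iostream>
#include <vector>

// Visit every set bit index in a 32-bit-word-packed bit vector, low to high.
template <typename Fn>
void forEachSetBit(const std::vector<uint32_t> &Words, Fn Visit) {
  for (unsigned Base = 0; Base < Words.size() * 32; Base += 32) {
    unsigned Idx = Base;
    for (uint32_t Mask = Words[Base / 32]; Mask; Mask >>= 1) {
      unsigned Offset = __builtin_ctz(Mask); // distance to the next set bit
      Visit(Idx + Offset);
      Mask >>= Offset;
      Idx += Offset + 1;
    }
  }
}

int main() {
  std::vector<uint32_t> BV = {0xAu, 0x80000000u}; // bits 1, 3, 63 set
  forEachSetBit(BV, [](unsigned I) { std::cout << I << ' '; });
  std::cout << '\n'; // 1 3 63
}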
/// getMinimalPhysRegClass - Returns the Register Class of a physical
/// register of the given type, picking the most sub register class of
/// the right type that contains this physreg.
@@ -71,6 +115,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const {
/// registers for the specific register class.
static void getAllocatableSetForRC(const MachineFunction &MF,
const TargetRegisterClass *RC, BitVector &R){
+ assert(RC->isAllocatable() && "invalid for nonallocatable sets");
ArrayRef<uint16_t> Order = RC->getRawAllocationOrder(MF);
for (unsigned i = 0; i != Order.size(); ++i)
R.set(Order[i]);
@@ -80,7 +125,10 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
const TargetRegisterClass *RC) const {
BitVector Allocatable(getNumRegs());
if (RC) {
- getAllocatableSetForRC(MF, RC, Allocatable);
+ // A register class with no allocatable subclass returns an empty set.
+ const TargetRegisterClass *SubClass = getAllocatableClass(RC);
+ if (SubClass)
+ getAllocatableSetForRC(MF, SubClass, Allocatable);
} else {
for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
E = regclass_end(); I != E; ++I)
@@ -95,6 +143,16 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
return Allocatable;
}
+static inline
+const TargetRegisterClass *firstCommonClass(const uint32_t *A,
+ const uint32_t *B,
+ const TargetRegisterInfo *TRI) {
+ for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
+ if (unsigned Common = *A++ & *B++)
+ return TRI->getRegClass(I + CountTrailingZeros_32(Common));
+ return 0;
+}
+
const TargetRegisterClass *
TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
const TargetRegisterClass *B) const {
@@ -106,15 +164,83 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
// Register classes are ordered topologically, so the largest common
// sub-class is the common sub-class with the smallest ID.
- const unsigned *SubA = A->getSubClassMask();
- const unsigned *SubB = B->getSubClassMask();
+ return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this);
+}
- // We could start the search from max(A.ID, B.ID), but we are only going to
- // execute 2-3 iterations anyway.
- for (unsigned Base = 0, BaseE = getNumRegClasses(); Base < BaseE; Base += 32)
- if (unsigned Common = *SubA++ & *SubB++)
- return getRegClass(Base + CountTrailingZeros_32(Common));
+const TargetRegisterClass *
+TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned Idx) const {
+ assert(A && B && "Missing register class");
+ assert(Idx && "Bad sub-register index");
+
+ // Find Idx in the list of super-register indices.
+ for (SuperRegClassIterator RCI(B, this); RCI.isValid(); ++RCI)
+ if (RCI.getSubReg() == Idx)
+ // The bit mask contains all register classes that are projected into B
+ // by Idx. Find a class that is also a sub-class of A.
+ return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this);
+ return 0;
+}
- // No common sub-class exists.
- return NULL;
+const TargetRegisterClass *TargetRegisterInfo::
+getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
+ const TargetRegisterClass *RCB, unsigned SubB,
+ unsigned &PreA, unsigned &PreB) const {
+ assert(RCA && SubA && RCB && SubB && "Invalid arguments");
+
+ // Search all pairs of sub-register indices that project into RCA and RCB
+ // respectively. This is quadratic, but usually the sets are very small. On
+ // most targets like X86, there will only be a single sub-register index
+ // (e.g., sub_16bit projecting into GR16).
+ //
+ // The worst case is a register class like DPR on ARM.
+ // We have indices dsub_0..dsub_7 projecting into that class.
+ //
+ // It is very common that one register class is a sub-register of the other.
+ // Arrange for RCA to be the larger register so the answer will be found in
+ // the first iteration. This makes the search linear for the most common
+ // case.
+ const TargetRegisterClass *BestRC = 0;
+ unsigned *BestPreA = &PreA;
+ unsigned *BestPreB = &PreB;
+ if (RCA->getSize() < RCB->getSize()) {
+ std::swap(RCA, RCB);
+ std::swap(SubA, SubB);
+ std::swap(BestPreA, BestPreB);
+ }
+
+ // Also terminate the search once we have found a register class as small as
+ // RCA.
+ unsigned MinSize = RCA->getSize();
+
+ for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) {
+ unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA);
+ for (SuperRegClassIterator IB(RCB, this, true); IB.isValid(); ++IB) {
+ // Check if a common super-register class exists for this index pair.
+ const TargetRegisterClass *RC =
+ firstCommonClass(IA.getMask(), IB.getMask(), this);
+ if (!RC || RC->getSize() < MinSize)
+ continue;
+
+ // The indexes must compose identically: PreA+SubA == PreB+SubB.
+ unsigned FinalB = composeSubRegIndices(IB.getSubReg(), SubB);
+ if (FinalA != FinalB)
+ continue;
+
+ // Is RC a better candidate than BestRC?
+ if (BestRC && RC->getSize() >= BestRC->getSize())
+ continue;
+
+ // Yes, RC is the smallest super-register seen so far.
+ BestRC = RC;
+ *BestPreA = IA.getSubReg();
+ *BestPreB = IB.getSubReg();
+
+ // Bail early if we reached MinSize. We won't find a better candidate.
+ if (BestRC->getSize() == MinSize)
+ return BestRC;
+ }
+ }
+ return BestRC;
}
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 08c732c..fbbaa9500 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -65,6 +65,10 @@ private:
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out);
+ bool MatchInstruction(SMLoc IDLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ SmallVectorImpl<MCInst> &MCInsts);
+
/// isSrcOp - Returns true if operand is either (%rsi) or %ds:(%rsi)
/// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
bool isSrcOp(X86Operand &Op);
@@ -117,7 +121,7 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
-static bool isImmSExti16i8Value(uint64_t Value) {
+static bool isImmSExti16i8Value(uint64_t Value) {
return (( Value <= 0x000000000000007FULL)||
(0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
(0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
@@ -135,12 +139,12 @@ static bool isImmZExtu32u8Value(uint64_t Value) {
static bool isImmSExti64i8Value(uint64_t Value) {
return (( Value <= 0x000000000000007FULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
}
static bool isImmSExti64i32Value(uint64_t Value) {
return (( Value <= 0x000000007FFFFFFFULL)||
- (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
}
namespace {
@@ -187,7 +191,7 @@ struct X86Operand : public MCParsedAsmOperand {
SMLoc getStartLoc() const { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const { return EndLoc; }
-
+
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
virtual void print(raw_ostream &OS) const {}
@@ -309,28 +313,45 @@ struct X86Operand : public MCParsedAsmOperand {
}
bool isMem() const { return Kind == Memory; }
- bool isMem8() const {
+ bool isMem8() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 8);
}
- bool isMem16() const {
+ bool isMem16() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 16);
}
- bool isMem32() const {
+ bool isMem32() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 32);
}
- bool isMem64() const {
+ bool isMem64() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 64);
}
- bool isMem80() const {
+ bool isMem80() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 80);
}
- bool isMem128() const {
+ bool isMem128() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 128);
}
- bool isMem256() const {
+ bool isMem256() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 256);
}
+ bool isMemVX32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+ bool isMemVX64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+
bool isAbsMem() const {
return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
!getMemIndexReg() && getMemScale() == 1;
@@ -356,26 +377,38 @@ struct X86Operand : public MCParsedAsmOperand {
addExpr(Inst, getImm());
}
- void addMem8Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem8Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem16Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem16Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem80Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem32Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem128Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem64Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem256Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem80Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMemVX32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem128Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMemVY32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem256Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMemVX64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVY64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
void addMemOperands(MCInst &Inst, unsigned N) const {
@@ -467,7 +500,7 @@ bool X86AsmParser::isSrcOp(X86Operand &Op) {
bool X86AsmParser::isDstOp(X86Operand &Op) {
unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI;
- return Op.isMem() &&
+ return Op.isMem() &&
(Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) &&
isa<MCConstantExpr>(Op.Mem.Disp) &&
cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
@@ -611,7 +644,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
if (getLexer().isNot(AsmToken::LBrac))
return ErrorOperand(Start, "Expected '[' token!");
Parser.Lex();
-
+
if (getLexer().is(AsmToken::Identifier)) {
// Parse BaseReg
if (ParseRegister(BaseReg, Start, End)) {
@@ -638,11 +671,11 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
// Handle '[' Scale*IndexReg ']'
Parser.Lex();
SMLoc IdxRegLoc = Parser.getTok().getLoc();
- if (ParseRegister(IndexReg, IdxRegLoc, End))
- return ErrorOperand(IdxRegLoc, "Expected register");
+ if (ParseRegister(IndexReg, IdxRegLoc, End))
+ return ErrorOperand(IdxRegLoc, "Expected register");
Scale = Val;
} else
- return ErrorOperand(Loc, "Unepxeted token");
+ return ErrorOperand(Loc, "Unexpected token");
}
if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) {
@@ -655,8 +688,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
if (getLexer().is(AsmToken::Star)) {
Parser.Lex();
SMLoc IdxRegLoc = Parser.getTok().getLoc();
- if (ParseRegister(IndexReg, IdxRegLoc, End))
- return ErrorOperand(IdxRegLoc, "Expected register");
+ if (ParseRegister(IndexReg, IdxRegLoc, End))
+ return ErrorOperand(IdxRegLoc, "Expected register");
Scale = Val;
} else if (getLexer().is(AsmToken::RBrac)) {
const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext());
@@ -668,7 +701,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
End = Parser.getTok().getLoc();
if (!IndexReg)
ParseRegister(IndexReg, Start, End);
- else if (getParser().ParseExpression(Disp, End)) return 0;
+ else if (getParser().ParseExpression(Disp, End)) return 0;
}
}
@@ -881,7 +914,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
if (getParser().ParseAbsoluteExpression(ScaleVal)){
Error(Loc, "expected scale expression");
return 0;
- }
+ }
// Validate the scale amount.
if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
@@ -916,15 +949,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
// If we have both a base register and an index register make sure they are
// both 64-bit or 32-bit registers.
+ // To support VSIB, IndexReg can be a 128-bit or 256-bit register.
if (BaseReg != 0 && IndexReg != 0) {
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
- !X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
IndexReg != X86::RIZ) {
Error(IndexLoc, "index register is 32-bit, but base register is 64-bit");
return 0;
}
if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
- !X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
IndexReg != X86::EIZ){
Error(IndexLoc, "index register is 64-bit, but base register is 32-bit");
return 0;
@@ -944,7 +980,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
PatchedName != "setb" && PatchedName != "setnb")
PatchedName = PatchedName.substr(0, Name.size()-1);
-
+
// FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
const MCExpr *ExtraImmOp = 0;
if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
@@ -1204,20 +1240,20 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
// Intel syntax
X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[2];
- Operands.pop_back();
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[2];
+ Operands.pop_back();
}
} else {
X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[1];
- Operands.erase(Operands.begin() + 1);
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[1];
+ Operands.erase(Operands.begin() + 1);
}
}
}
-
+
// Transforms "int $3" into "int3" as a size optimization. We can't write an
// instalias with an immediate operand yet.
if (Name == "int" && Operands.size() == 2) {
@@ -1476,6 +1512,18 @@ bool X86AsmParser::
MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out) {
+ SmallVector<MCInst, 2> Insts;
+ bool Error = MatchInstruction(IDLoc, Operands, Insts);
+ if (!Error)
+ for (unsigned i = 0, e = Insts.size(); i != e; ++i)
+ Out.EmitInstruction(Insts[i]);
+ return Error;
+}
+
+bool X86AsmParser::
+MatchInstruction(SMLoc IDLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ SmallVectorImpl<MCInst> &MCInsts) {
assert(!Operands.empty() && "Unexpected empty operand list!");
X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
assert(Op->isToken() && "Leading operand should always be a mnemonic!");
@@ -1491,7 +1539,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
MCInst Inst;
Inst.setOpcode(X86::WAIT);
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst);
+ MCInsts.push_back(Inst);
const char *Repl =
StringSwitch<const char*>(Op->getToken())
@@ -1520,12 +1568,12 @@ MatchAndEmitInstruction(SMLoc IDLoc,
case Match_Success:
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the
- // individual transformations can chain off each other.
+ // individual transformations can chain off each other.
while (processInstruction(Inst, Operands))
;
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst);
+ MCInsts.push_back(Inst);
return false;
case Match_MissingFeature:
Error(IDLoc, "instruction requires a CPU feature not currently enabled");
@@ -1558,12 +1606,12 @@ MatchAndEmitInstruction(SMLoc IDLoc,
// Otherwise, we assume that this may be an integer instruction, which comes
// in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
-
+
// Check for the various suffix matches.
Tmp[Base.size()] = Suffixes[0];
unsigned ErrorInfoIgnore;
unsigned Match1, Match2, Match3, Match4;
-
+
Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
Tmp[Base.size()] = Suffixes[1];
Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
@@ -1583,7 +1631,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
(Match3 == Match_Success) + (Match4 == Match_Success);
if (NumSuccessfulMatches == 1) {
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst);
+ MCInsts.push_back(Inst);
return false;
}
@@ -1673,10 +1721,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
getParser().setAssemblerDialect(1);
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if(Parser.getTok().getString() == "noprefix") {
- // FIXME : Handle noprefix
- Parser.Lex();
+ // FIXME : Handle noprefix
+ Parser.Lex();
} else
- return true;
+ return true;
}
return false;
}
@@ -1691,19 +1739,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
const MCExpr *Value;
if (getParser().ParseExpression(Value))
return true;
-
+
getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/);
-
+
if (getLexer().is(AsmToken::EndOfStatement))
break;
-
+
// FIXME: Improve diagnostic.
if (getLexer().isNot(AsmToken::Comma))
return Error(L, "unexpected token in directive");
Parser.Lex();
}
}
-
+
Parser.Lex();
return false;
}
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index f612e23..b886d46 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -52,6 +52,8 @@ endif()
add_llvm_target(X86CodeGen ${sources})
+add_dependencies(LLVMX86CodeGen intrinsics_gen)
+
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 8278bde..5039887 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -322,7 +322,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
OperandType type = (OperandType)operand.type;
+ bool isBranch = false;
+ uint64_t pcrel = 0;
if (type == TYPE_RELv) {
+ isBranch = true;
+ pcrel = insn.startLocation +
+ insn.immediateOffset + insn.immediateSize;
switch (insn.displacementSize) {
default:
break;
@@ -351,15 +356,15 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
// Special case those X86 instructions that use the imm8 as a set of
// bits, bit count, etc. and are not sign-extended.
if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
- Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
- Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
- Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
- Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
- Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
- Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
- Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
- Opcode != X86::VINSERTPSrr)
- type = TYPE_MOFFS8;
+ Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+ Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+ Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+ Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+ Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+ Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+ Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+ Opcode != X86::VINSERTPSrr)
+ type = TYPE_MOFFS8;
break;
case ENCODING_IW:
type = TYPE_MOFFS16;
@@ -373,8 +378,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
}
}
- bool isBranch = false;
- uint64_t pcrel = 0;
switch (type) {
case TYPE_XMM128:
mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
@@ -495,7 +498,38 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
} else {
baseReg = MCOperand::CreateReg(0);
}
-
+
+ // Check whether we are handling VSIB addressing mode for GATHER.
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+ // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+ // I don't see a way to get the correct IndexReg in readSIB:
+ // We can tell whether it is VSIB or SIB after instruction ID is decoded,
+ // but instruction ID may not be decoded yet when calling readSIB.
+ uint32_t Opcode = mcInst.getOpcode();
+ bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+ Opcode == X86::VGATHERDPDYrm ||
+ Opcode == X86::VGATHERQPDrm ||
+ Opcode == X86::VGATHERDPSrm ||
+ Opcode == X86::VGATHERQPSrm ||
+ Opcode == X86::VPGATHERDQrm ||
+ Opcode == X86::VPGATHERDQYrm ||
+ Opcode == X86::VPGATHERQQrm ||
+ Opcode == X86::VPGATHERDDrm ||
+ Opcode == X86::VPGATHERQDrm);
+ bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+ Opcode == X86::VGATHERDPSYrm ||
+ Opcode == X86::VGATHERQPSYrm ||
+ Opcode == X86::VPGATHERQQYrm ||
+ Opcode == X86::VPGATHERDDYrm ||
+ Opcode == X86::VPGATHERQDYrm);
+ if (IndexIs128 || IndexIs256) {
+ unsigned IndexOffset = insn.sibIndex -
+ (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+ SIBIndex IndexBase = IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+ insn.sibIndex = (SIBIndex)(IndexBase +
+ (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+ }
+
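
The added block re-bases a SIB index that was decoded as a GPR into the
XMM/YMM range once the opcode is known to be a gather, with SIB_INDEX_NONE
standing in for encoded index 4. A sketch of the remapping arithmetic, using
an illustrative enum layout rather than the decoder's real one:

#include <iostream>

enum SIBIndex {
  SIB_NONE,
  SIB_EAX,                // ...15 more 32-bit GPR entries follow
  SIB_RAX = SIB_EAX + 16, // ...15 more 64-bit GPR entries follow
  SIB_XMM0 = SIB_RAX + 16,
  SIB_YMM0 = SIB_XMM0 + 16
};

SIBIndex remapForVSIB(SIBIndex Idx, bool Addr64, bool Index256) {
  unsigned Offset = Idx - (Addr64 ? SIB_RAX : SIB_EAX);
  SIBIndex Base = Index256 ? SIB_YMM0 : SIB_XMM0;
  // "No index" means the SIB index field held 4; VSIB reuses that
  // encoding as XMM4/YMM4 instead.
  return (SIBIndex)(Base + (Idx == SIB_NONE ? 4 : Offset));
}

int main() {
  SIBIndex R = remapForVSIB((SIBIndex)(SIB_EAX + 2), false, false);
  std::cout << R - SIB_XMM0 << '\n'; // 2, i.e. XMM2
}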
if (insn.sibIndex != SIB_INDEX_NONE) {
switch (insn.sibIndex) {
default:
@@ -506,6 +540,8 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
indexReg = MCOperand::CreateReg(X86::x); break;
EA_BASES_32BIT
EA_BASES_64BIT
+ REGS_XMM
+ REGS_YMM
#undef ENTRY
}
} else {
@@ -726,8 +762,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
translateRegister(mcInst, insn.vvvv);
return false;
case ENCODING_DUP:
- return translateOperand(mcInst,
- insn.spec->operands[operand.type - TYPE_DUP0],
+ return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
insn, Dis);
}
}
@@ -753,8 +788,8 @@ static bool translateInstruction(MCInst &mcInst,
insn.numImmediatesTranslated = 0;
for (index = 0; index < X86_MAX_OPERANDS; ++index) {
- if (insn.spec->operands[index].encoding != ENCODING_NONE) {
- if (translateOperand(mcInst, insn.spec->operands[index], insn, Dis)) {
+ if (insn.operands[index].encoding != ENCODING_NONE) {
+ if (translateOperand(mcInst, insn.operands[index], insn, Dis)) {
return true;
}
}
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index c11f51c..0dbfa26 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -20,7 +20,7 @@
// 2. Read the opcode, and determine what kind of opcode it is. The
// disassembler distinguishes four kinds of opcodes, which are enumerated in
// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
-// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
//
// 3. Depending on the opcode type, look in one of four ClassDecision structures
@@ -74,8 +74,8 @@
#ifndef X86DISASSEMBLER_H
#define X86DISASSEMBLER_H
-#define INSTRUCTION_SPECIFIER_FIELDS \
- const char* name;
+#define INSTRUCTION_SPECIFIER_FIELDS \
+ uint16_t operands;
#define INSTRUCTION_IDS \
unsigned instructionIDs;
@@ -88,7 +88,7 @@
#include "llvm/MC/MCDisassembler.h"
namespace llvm {
-
+
class MCInst;
class MCInstrInfo;
class MCSubtargetInfo;
@@ -96,7 +96,7 @@ class MemoryObject;
class raw_ostream;
struct EDInstInfo;
-
+
namespace X86Disassembler {
/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index 6020877..0c92912 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -1495,14 +1495,14 @@ static int readOperands(struct InternalInstruction* insn) {
needVVVV = hasVVVV && (insn->vvvv != 0);
for (index = 0; index < X86_MAX_OPERANDS; ++index) {
- switch (insn->spec->operands[index].encoding) {
+ switch (x86OperandSets[insn->spec->operands][index].encoding) {
case ENCODING_NONE:
break;
case ENCODING_REG:
case ENCODING_RM:
if (readModRM(insn))
return -1;
- if (fixupReg(insn, &insn->spec->operands[index]))
+ if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
return -1;
break;
case ENCODING_CB:
@@ -1524,14 +1524,14 @@ static int readOperands(struct InternalInstruction* insn) {
}
if (readImmediate(insn, 1))
return -1;
- if (insn->spec->operands[index].type == TYPE_IMM3 &&
+ if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 &&
insn->immediates[insn->numImmediatesConsumed - 1] > 7)
return -1;
- if (insn->spec->operands[index].type == TYPE_IMM5 &&
+ if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 &&
insn->immediates[insn->numImmediatesConsumed - 1] > 31)
return -1;
- if (insn->spec->operands[index].type == TYPE_XMM128 ||
- insn->spec->operands[index].type == TYPE_XMM256)
+ if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 ||
+ x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256)
sawRegImm = 1;
break;
case ENCODING_IW:
@@ -1582,7 +1582,7 @@ static int readOperands(struct InternalInstruction* insn) {
needVVVV = 0; /* Mark that we have found a VVVV operand. */
if (!hasVVVV)
return -1;
- if (fixupReg(insn, &insn->spec->operands[index]))
+ if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
return -1;
break;
case ENCODING_DUP:
@@ -1644,6 +1644,8 @@ int decodeInstruction(struct InternalInstruction* insn,
insn->instructionID == 0 ||
readOperands(insn))
return -1;
+
+ insn->operands = &x86OperandSets[insn->spec->operands][0];
insn->length = insn->readerCursor - insn->startLocation;
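
The x86OperandSets rewrites above, together with the uint16_t operands field introduced in the headers below, replace the per-instruction operand array with an index into one shared, deduplicated table. A minimal sketch of that layout (field values made up):

#include <cstdint>
#include <cstdio>

struct OperandSpecifier { uint8_t encoding; uint8_t type; };

// One shared table of operand lists; instructions with identical operand
// shapes point at the same row, shrinking the specifier tables.
static const OperandSpecifier x86OperandSets[][4] = {
  { {0, 0}, {0, 0}, {0, 0}, {0, 0} },   // set 0: no operands
  { {1, 7}, {2, 7}, {0, 0}, {0, 0} },   // set 1: e.g. reg, rm
};

// The specifier now carries only a 16-bit index instead of a full array.
struct InstructionSpecifier { uint16_t operands; };

int main() {
  InstructionSpecifier spec = { 1 };
  const OperandSpecifier *ops = x86OperandSets[spec.operands];
  std::printf("operand 0 encoding: %u\n", (unsigned)ops[0].encoding);
  return 0;
}
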
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index fae309b..797703f 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -19,17 +19,18 @@
#ifdef __cplusplus
extern "C" {
#endif
-
-#define INSTRUCTION_SPECIFIER_FIELDS
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+ uint16_t operands;
#define INSTRUCTION_IDS \
unsigned instructionIDs;
#include "X86DisassemblerDecoderCommon.h"
-
+
#undef INSTRUCTION_SPECIFIER_FIELDS
#undef INSTRUCTION_IDS
-
+
/*
* Accessor functions for various fields of an Intel instruction
*/
@@ -43,7 +44,7 @@ extern "C" {
#define rFromREX(rex) (((rex) & 0x4) >> 2)
#define xFromREX(rex) (((rex) & 0x2) >> 1)
#define bFromREX(rex) ((rex) & 0x1)
-
+
#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7)
#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6)
#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5)
@@ -237,7 +238,7 @@ extern "C" {
ENTRY(YMM13) \
ENTRY(YMM14) \
ENTRY(YMM15)
-
+
#define REGS_SEGMENT \
ENTRY(ES) \
ENTRY(CS) \
@@ -245,7 +246,7 @@ extern "C" {
ENTRY(DS) \
ENTRY(FS) \
ENTRY(GS)
-
+
#define REGS_DEBUG \
ENTRY(DR0) \
ENTRY(DR1) \
@@ -266,12 +267,12 @@ extern "C" {
ENTRY(CR6) \
ENTRY(CR7) \
ENTRY(CR8)
-
+
#define ALL_EA_BASES \
EA_BASES_16BIT \
EA_BASES_32BIT \
EA_BASES_64BIT
-
+
#define ALL_SIB_BASES \
REGS_32BIT \
REGS_64BIT
@@ -290,7 +291,7 @@ extern "C" {
ENTRY(RIP)
/*
- * EABase - All possible values of the base field for effective-address
+ * EABase - All possible values of the base field for effective-address
* computations, a.k.a. the Mod and R/M fields of the ModR/M byte. We
* distinguish between bases (EA_BASE_*) and registers that just happen to be
* referred to when Mod == 0b11 (EA_REG_*).
@@ -305,20 +306,23 @@ typedef enum {
#undef ENTRY
EA_max
} EABase;
-
-/*
+
+/*
* SIBIndex - All possible values of the SIB index field.
* Borrows entries from ALL_EA_BASES with the special case that
* sib is synonymous with NONE.
+ * Vector SIB: index can be XMM or YMM.
*/
typedef enum {
SIB_INDEX_NONE,
#define ENTRY(x) SIB_INDEX_##x,
ALL_EA_BASES
+ REGS_XMM
+ REGS_YMM
#undef ENTRY
SIB_INDEX_max
} SIBIndex;
-
+
/*
* SIBBase - All possible values of the SIB base field.
*/
@@ -350,7 +354,7 @@ typedef enum {
#undef ENTRY
MODRM_REG_max
} Reg;
-
+
/*
* SegmentOverride - All possible segment overrides.
*/
@@ -364,7 +368,7 @@ typedef enum {
SEG_OVERRIDE_GS,
SEG_OVERRIDE_max
} SegmentOverride;
-
+
/*
* VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field
*/
@@ -428,16 +432,16 @@ struct InternalInstruction {
void* dlogArg;
/* General instruction information */
-
+
/* The mode to disassemble for (64-bit, protected, real) */
DisassemblerMode mode;
/* The start of the instruction, usable with the reader */
uint64_t startLocation;
/* The length of the instruction, in bytes */
size_t length;
-
+
/* Prefix state */
-
+
/* 1 if the prefix byte corresponding to the entry is present; 0 if not */
uint8_t prefixPresent[0x100];
/* contains the location (for use with the reader) of the prefix byte */
@@ -453,7 +457,7 @@ struct InternalInstruction {
uint64_t necessaryPrefixLocation;
/* The segment override type */
SegmentOverride segmentOverride;
-
+
/* Sizes of various critical pieces of data, in bytes */
uint8_t registerSize;
uint8_t addressSize;
@@ -464,9 +468,9 @@ struct InternalInstruction {
needed to find relocation entries for adding symbolic operands */
uint8_t displacementOffset;
uint8_t immediateOffset;
-
+
/* opcode state */
-
+
/* The value of the two-byte escape prefix (usually 0x0f) */
uint8_t twoByteEscape;
/* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
@@ -475,16 +479,16 @@ struct InternalInstruction {
uint8_t opcode;
/* The ModR/M byte of the instruction, if it is an opcode extension */
uint8_t modRMExtension;
-
+
/* decode state */
-
+
/* The type of opcode, used for indexing into the array of decode tables */
OpcodeType opcodeType;
/* The instruction ID, extracted from the decode table */
uint16_t instructionID;
/* The specifier for the instruction, from the instruction info table */
const struct InstructionSpecifier *spec;
-
+
/* state for additional bytes, consumed during operand decode. Pattern:
consumed___ indicates that the byte was already consumed and does not
need to be consumed again */
@@ -492,12 +496,12 @@ struct InternalInstruction {
/* The VEX.vvvv field, which contains a third register operand for some AVX
instructions */
Reg vvvv;
-
+
/* The ModR/M byte, which contains most register operands and some portion of
all memory operands */
BOOL consumedModRM;
uint8_t modRM;
-
+
/* The SIB byte, used for more complex 32- or 64-bit memory operands */
BOOL consumedSIB;
uint8_t sib;
@@ -505,19 +509,19 @@ struct InternalInstruction {
/* The displacement, used for memory operands */
BOOL consumedDisplacement;
int32_t displacement;
-
+
/* Immediates. There can be two in some cases */
uint8_t numImmediatesConsumed;
uint8_t numImmediatesTranslated;
uint64_t immediates[2];
-
+
/* A register or immediate operand encoded into the opcode */
BOOL consumedOpcodeModifier;
uint8_t opcodeModifier;
Reg opcodeRegister;
-
+
/* Portions of the ModR/M byte */
-
+
/* These fields determine the allowable values for the ModR/M fields, which
depend on operand and address widths */
EABase eaBaseBase;
@@ -530,11 +534,13 @@ struct InternalInstruction {
EADisplacement eaDisplacement;
/* The reg field always encodes a register */
Reg reg;
-
+
/* SIB state */
SIBIndex sibIndex;
uint8_t sibScale;
SIBBase sibBase;
+
+ const struct OperandSpecifier *operands;
};
/* decodeInstruction - Decode one instruction and store the decoding results in
@@ -568,15 +574,15 @@ int decodeInstruction(struct InternalInstruction* insn,
* @param line - The line number that printed the debug message.
* @param s - The message to print.
*/
-
+
void x86DisassemblerDebug(const char *file,
unsigned line,
const char *s);
const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii);
-#ifdef __cplusplus
+#ifdef __cplusplus
}
#endif
-
+
#endif
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 13e1136..b0a0e1e 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -119,7 +119,7 @@ enum attributeBits {
ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize")
-#define ENUM_ENTRY(n, r, d) n,
+#define ENUM_ENTRY(n, r, d) n,
typedef enum {
INSTRUCTION_CONTEXTS
IC_max
@@ -148,11 +148,11 @@ typedef enum {
* If a ModR/M byte is not required, "required" is left unset, and the values
* for each instructionID are identical.
*/
-
+
typedef uint16_t InstrUID;
/*
- * ModRMDecisionType - describes the type of ModR/M decision, allowing the
+ * ModRMDecisionType - describes the type of ModR/M decision, allowing the
* consumer to determine the number of entries in it.
*
* MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
@@ -172,7 +172,7 @@ typedef uint16_t InstrUID;
ENUM_ENTRY(MODRM_SPLITREG) \
ENUM_ENTRY(MODRM_FULL)
-#define ENUM_ENTRY(n) n,
+#define ENUM_ENTRY(n) n,
typedef enum {
MODRMTYPES
MODRM_max
@@ -180,13 +180,13 @@ typedef enum {
#undef ENUM_ENTRY
/*
- * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
+ * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
* instruction each possible value of the ModR/M byte corresponds to. Once
* this information is known, we have narrowed down to a single instruction.
*/
struct ModRMDecision {
uint8_t modrm_type;
-
+
/* The macro below must be defined wherever this file is included. */
INSTRUCTION_IDS
};
@@ -210,7 +210,7 @@ struct ContextDecision {
struct OpcodeDecision opcodeDecisions[IC_max];
};
-/*
+/*
* Physical encodings of instruction operands.
*/
@@ -244,14 +244,14 @@ struct ContextDecision {
ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
"in type")
-#define ENUM_ENTRY(n, d) n,
+#define ENUM_ENTRY(n, d) n,
typedef enum {
ENCODINGS
ENCODING_max
} OperandEncoding;
#undef ENUM_ENTRY
-/*
+/*
* Semantic interpretations of instruction operands.
*/
@@ -332,14 +332,14 @@ struct ContextDecision {
ENUM_ENTRY(TYPE_DUP4, "operand 4") \
ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
-#define ENUM_ENTRY(n, d) n,
+#define ENUM_ENTRY(n, d) n,
typedef enum {
TYPES
TYPE_max
} OperandType;
#undef ENUM_ENTRY
-/*
+/*
* OperandSpecifier - The specification for how to extract and interpret one
* operand.
*/
@@ -374,8 +374,7 @@ typedef enum {
struct InstructionSpecifier {
uint8_t modifierType;
uint8_t modifierBase;
- struct OperandSpecifier operands[X86_MAX_OPERANDS];
-
+
/* The macro below must be defined wherever this file is included. */
INSTRUCTION_SPECIFIER_FIELDS
};
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index f532019..64ac5e6 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -96,7 +96,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFHWmi:
case X86::VPSHUFHWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ DecodePSHUFHWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFHWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFHWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFHWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
break;
case X86::PSHUFLWri:
@@ -106,7 +116,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFLWmi:
case X86::VPSHUFLWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ DecodePSHUFLWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFLWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFLWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFLWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
break;
@@ -487,6 +507,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
+ case X86::VPERMQYri:
+ case X86::VPERMPDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMQYmi:
+ case X86::VPERMPDYmi:
+ DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index a0bb6dc..db597fb 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -94,40 +94,83 @@ namespace X86II {
MO_PLT,
/// MO_TLSGD - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the offset of the GOT entry with the TLS index structure that contains
+ /// the module number and variable offset for the symbol. Used in the
+ /// general dynamic TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @TLSGD
MO_TLSGD,
+ /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// __tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLD
+ MO_TLSLD,
+
+ /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// ___tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLDM
+ MO_TLSLDM,
+
/// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the offset of the GOT entry with the thread-pointer offset for the
+ /// symbol. Used in the x86-64 initial exec TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @GOTTPOFF
MO_GOTTPOFF,
/// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the absolute address of the GOT entry with the negative thread-pointer
+ /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+ /// model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @INDNTPOFF
MO_INDNTPOFF,
/// MO_TPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the thread-pointer offset for the symbol. Used in the x86-64 local
+ /// exec TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @TPOFF
MO_TPOFF,
+ /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS offset of the symbol. Used
+ /// in the local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @DTPOFF
+ MO_DTPOFF,
+
/// MO_NTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the negative thread-pointer offset for the symbol. Used in the IA32
+ /// local exec TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @NTPOFF
MO_NTPOFF,
+ /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the negative thread-pointer offset for
+ /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTNTPOFF
+ MO_GOTNTPOFF,
+
/// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
/// reference is actually to the "__imp_FOO" symbol. This is used for
/// dllimport linkage on windows.
@@ -438,17 +481,17 @@ namespace X86II {
// getBaseOpcodeFor - This function returns the "base" X86 opcode for the
// specified machine instruction.
//
- static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
return TSFlags >> X86II::OpcodeShift;
}
- static inline bool hasImm(uint64_t TSFlags) {
+ inline bool hasImm(uint64_t TSFlags) {
return (TSFlags & X86II::ImmMask) != 0;
}
/// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
/// of the specified instruction.
- static inline unsigned getSizeOfImm(uint64_t TSFlags) {
+ inline unsigned getSizeOfImm(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8:
@@ -463,7 +506,7 @@ namespace X86II {
/// isImmPCRel - Return true if the immediate of the specified instruction's
/// TSFlags indicates that it is pc relative.
- static inline unsigned isImmPCRel(uint64_t TSFlags) {
+ inline unsigned isImmPCRel(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8PCRel:
@@ -486,9 +529,11 @@ namespace X86II {
/// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]"), it is only
/// counted as one operand.
///
- static inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+ inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this form");
+ case X86II::MRMInitReg:
+ // FIXME: Remove this form.
+ return -1;
default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
case X86II::Pseudo:
case X86II::RawFrm:
@@ -546,7 +591,7 @@ namespace X86II {
/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
/// higher) register? e.g. r8, xmm8, xmm13, etc.
- static inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ inline bool isX86_64ExtendedReg(unsigned RegNo) {
switch (RegNo) {
default: break;
case X86::R8: case X86::R9: case X86::R10: case X86::R11:
@@ -568,7 +613,7 @@ namespace X86II {
return false;
}
- static inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+ inline bool isX86_64NonExtLowByteReg(unsigned reg) {
return (reg == X86::SPL || reg == X86::BPL ||
reg == X86::SIL || reg == X86::DIL);
}
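
The new MO_TLSLD, MO_TLSLDM, MO_DTPOFF, and MO_GOTNTPOFF flags documented above map onto the @TLSLD/@TLSLDM/@DTPOFF/@GOTNTPOFF assembly suffixes (see the X86AsmPrinter hunk later in this patch). A hedged illustration of where the local dynamic model shows up, assuming a GNU-style toolchain:

// Both accesses can share one __tls_get_addr call that fetches the module's
// TLS base; each variable is then addressed as base + its @DTPOFF offset.
// Try: g++ -fPIC -ftls-model=local-dynamic -S tls.cpp
static __thread int a = 1;
static __thread int b = 2;

int sum() { return a + b; }   // one @TLSLD lookup, two @DTPOFF offsets
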
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index afa545c..b0acd7d 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -35,19 +35,6 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
clEnumValEnd));
-static const char *const x86_asm_table[] = {
- "{si}", "S",
- "{di}", "D",
- "{ax}", "a",
- "{cx}", "c",
- "{memory}", "memory",
- "{flags}", "",
- "{dirflag}", "",
- "{fpsr}", "",
- "{fpcr}", "",
- "{cc}", "cc",
- 0,0};
-
void X86MCAsmInfoDarwin::anchor() { }
X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
@@ -55,7 +42,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
if (is64Bit)
PointerSize = 8;
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
@@ -88,7 +74,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
if (T.getArch() == Triple::x86_64)
PointerSize = 8;
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
@@ -106,9 +91,10 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
- // OpenBSD has buggy support for .quad in 32-bit mode, just split into two
- // .words.
- if (T.getOS() == Triple::OpenBSD && T.getArch() == Triple::x86)
+ // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, so just
+ // split it into two .words.
+ if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
+ T.getArch() == Triple::x86)
Data64bitsDirective = 0;
}
@@ -137,7 +123,6 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
PrivateGlobalPrefix = ".L";
}
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
@@ -151,7 +136,6 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
PrivateGlobalPrefix = ".L";
}
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 80990e5..4a38324 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -139,6 +139,7 @@ public:
MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new X86MCCodeEmitter(MCII, STI, Ctx);
@@ -569,7 +570,17 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
}
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
switch (TSFlags & X86II::FormMask) {
case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
case X86II::MRMDestMem: {
@@ -602,11 +613,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
- if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
VEX_R = 0x0;
if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, 1);
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -616,7 +627,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_X = 0x0;
if (HasVEX_4VOp3)
- VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+ // CurOp points to the start of the memory operand; any TIED_TO
+ // operands have already been skipped, and CurOp was incremented past src1.
+ // CurOp + X86::AddrNumOperands will point to src3.
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
break;
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
@@ -961,11 +977,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
// FIXME: This should be handled during MCInst lowering.
unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = 0;
- if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1)
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
++CurOp;
- else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, MCOI::TIED_TO)== 0)
- // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
- --NumOps;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
// Keep track of the current byte being emitted.
unsigned CurByte = 0;
@@ -1037,7 +1056,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SrcRegNum = CurOp + X86::AddrNumOperands;
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
- SrcRegNum++;
+ ++SrcRegNum;
EmitMemModRMByte(MI, CurOp,
GetX86RegNum(MI.getOperand(SrcRegNum)),
@@ -1050,15 +1069,15 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SrcRegNum = CurOp + 1;
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
- SrcRegNum++;
+ ++SrcRegNum;
- if(HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
- SrcRegNum++;
+ if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+ ++SrcRegNum;
EmitRegModRMByte(MI.getOperand(SrcRegNum),
GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
- // 2 operands skipped with HasMemOp4, comensate accordingly
+ // 2 operands skipped with HasMemOp4, compensate accordingly
CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
if (HasVEX_4VOp3)
++CurOp;
@@ -1071,7 +1090,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
++AddrOperands;
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
}
- if(HasMemOp4) // Skip second register source (encoded in I8IMM)
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
++FirstMemOp;
EmitByte(BaseOpcode, CurByte, OS);
@@ -1089,7 +1108,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
- CurOp++;
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
EmitRegModRMByte(MI.getOperand(CurOp++),
(TSFlags & X86II::FormMask)-X86II::MRM0r,
@@ -1100,7 +1119,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
- CurOp++;
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
TSFlags, CurByte, OS, Fixups);
@@ -1149,22 +1168,23 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
}
// If there is a remaining operand, it must be a trailing immediate. Emit it
- // according to the right size for the instruction.
- if (CurOp != NumOps) {
+ // according to the right size for the instruction. Some instructions
+ // (SSE4a extrq and insertq) have two trailing immediates.
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
// The last source register of a 4 operand instruction in AVX is encoded
// in bits[7:4] of an immediate byte.
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
- : CurOp);
- CurOp++;
- bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg());
- unsigned RegNum = (IsExtReg ? (1 << 7) : 0);
- RegNum |= GetX86RegNum(MO) << 4;
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = GetX86RegNum(MO) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
// If there is an additional 5th operand it must be an immediate, which
// is encoded in bits[3:0]
- if(CurOp != NumOps) {
+ if (CurOp != NumOps) {
const MCOperand &MIMM = MI.getOperand(CurOp++);
- if(MIMM.isImm()) {
+ if (MIMM.isImm()) {
unsigned Val = MIMM.getImm();
assert(Val < 16 && "Immediate operand value out of range");
RegNum |= Val;
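
EmitVEXOpcodePrefix and EncodeInstruction above now make the same operand-skipping decision, so the VEX fields and the ModR/M encoding agree on where the source operands start. A standalone sketch of that decision (constraint values illustrative; -1 means "not tied"):

#include <cassert>

// Returns the index of the first non-tied source operand. tiedToN is the
// TIED_TO constraint of operand N, i.e. which def that operand is tied to.
static unsigned firstSourceOp(unsigned numOps, int tiedTo1, int tiedTo2,
                              int tiedToLast) {
  unsigned curOp = 0;
  if (numOps > 1 && tiedTo1 == 0)
    curOp = 1;                 // plain two-address form: skip the dest
  else if (numOps > 3 && tiedTo2 == 0) {
    assert(tiedToLast == 1);   // gather: last operand (mask) tied to mask_wb
    curOp = 2;                 // skip both results: dst and mask_wb
  }
  return curOp;
}

int main() {
  assert(firstSourceOp(3, 0, -1, -1) == 1);   // ordinary two-address op
  assert(firstSourceOp(7, -1, 0, 1) == 2);    // VGATHER-style, 2 TIED_TO
  assert(firstSourceOp(2, -1, -1, -1) == 0);  // nothing tied
  return 0;
}
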
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 9896cbe..4650069 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -76,6 +76,7 @@ namespace X86_MC {
}
MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index a802333..8b87c1f 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -64,13 +64,13 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumLaneElts = NumElts / NumLanes;
- int NewImm = Imm;
+ unsigned NewImm = Imm;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
for (unsigned i = 0; i != NumLaneElts; ++i) {
ShuffleMask.push_back(NewImm % NumLaneElts + l);
@@ -80,48 +80,55 @@ void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- ShuffleMask.push_back(0);
- ShuffleMask.push_back(1);
- ShuffleMask.push_back(2);
- ShuffleMask.push_back(3);
- for (unsigned i = 0; i != 4; ++i) {
- ShuffleMask.push_back(4+(Imm & 3));
- Imm >>= 2;
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + 4 + (NewImm & 3));
+ NewImm >>= 2;
+ }
}
}
-void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- for (unsigned i = 0; i != 4; ++i) {
- ShuffleMask.push_back((Imm & 3));
- Imm >>= 2;
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
}
- ShuffleMask.push_back(4);
- ShuffleMask.push_back(5);
- ShuffleMask.push_back(6);
- ShuffleMask.push_back(7);
}
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
-void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumLaneElts = NumElts / NumLanes;
- int NewImm = Imm;
+ unsigned NewImm = Imm;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- // Part that reads from dest.
- for (unsigned i = 0; i != NumLaneElts/2; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + l);
- NewImm /= NumLaneElts;
- }
- // Part that reads from src.
- for (unsigned i = 0; i != NumLaneElts/2; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + l);
- NewImm /= NumLaneElts;
+ // each half of a lane comes from a different source
+ for (unsigned s = 0; s != NumElts*2; s += NumElts) {
+ for (unsigned i = 0; i != NumLaneElts/2; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+ NewImm /= NumLaneElts;
+ }
}
if (NumLaneElts == 4) NewImm = Imm; // reload imm
}
@@ -130,7 +137,7 @@ void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
/// and punpckh*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -150,7 +157,7 @@ void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
/// and punpckl*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -167,19 +174,26 @@ void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodeVPERM2X128Mask(EVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
if (Imm & 0x88)
return; // Not a shuffle
unsigned HalfSize = VT.getVectorNumElements()/2;
- unsigned FstHalfBegin = (Imm & 0x3) * HalfSize;
- unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
- for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i)
- ShuffleMask.push_back(i);
- for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i)
- ShuffleMask.push_back(i);
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
+ for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
+ ShuffleMask.push_back(i);
+ }
+}
+
+/// DecodeVPERMMask - This decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4-element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != 4; ++i) {
+ ShuffleMask.push_back((Imm >> (2*i)) & 3);
+ }
}
} // llvm namespace
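
A worked example of two of the decoders above, with the loop logic copied into a standalone program: DecodeVPERMMask turns each 2-bit immediate field into a source element index, and the lane-aware DecodePSHUFHWMask repeats the same pattern per 128-bit lane for the new v16i16 case.

#include <cassert>
#include <vector>

static void decodeVPERMMask(unsigned imm, std::vector<int> &mask) {
  for (unsigned i = 0; i != 4; ++i)
    mask.push_back((imm >> (2 * i)) & 3);
}

static void decodePSHUFHWMask(unsigned numElts, unsigned imm,
                              std::vector<int> &mask) {
  for (unsigned l = 0; l != numElts; l += 8) {
    unsigned newImm = imm;
    for (unsigned i = 0; i != 4; ++i)
      mask.push_back(l + i);                 // low quadword passes through
    for (unsigned i = 4; i != 8; ++i) {
      mask.push_back(l + 4 + (newImm & 3));  // high words picked by imm
      newImm >>= 2;
    }
  }
}

int main() {
  std::vector<int> m;
  decodeVPERMMask(0x1B, m);                  // 0b00011011 reverses the quads
  assert((m == std::vector<int>{3, 2, 1, 0}));

  m.clear();
  decodePSHUFHWMask(16, 0x1B, m);            // v16i16: same shuffle per lane
  assert(m[4] == 7 && m[12] == 15);          // first high word of each lane
  return 0;
}
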
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 5b8c6ef..70d8171 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -35,31 +35,35 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
// <0,2> or <0,1,4,5>
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
-void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
/// and punpckh*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
/// and punpckl*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-void DecodeVPERM2X128Mask(EVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
+/// DecodeVPERMMask - This decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4-element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
} // llvm namespace
#endif
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index ecc7b59..dce5b4d 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -26,7 +26,7 @@ class FunctionPass;
class JITCodeEmitter;
class X86TargetMachine;
-/// createX86ISelDag - This pass converts a legalized DAG into a
+/// createX86ISelDag - This pass converts a legalized DAG into a
/// X86-specific DAG, ready for instruction scheduling.
///
FunctionPass *createX86ISelDag(X86TargetMachine &TM,
@@ -36,6 +36,11 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM,
/// register for PIC on x86-32.
FunctionPass* createGlobalBaseRegPass();
+/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
+/// to local-dynamic TLS variables so that the TLS base address for the module
+/// is only fetched once per execution path through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
/// createX86FloatingPointStackifierPass - This function returns a pass which
/// converts floating point register references and pseudo instructions into
/// floating point stack references and physical instructions.
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index b6591d4..6c1a816 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -86,21 +86,24 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
-def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true",
- "Enable carry-less multiplication instructions">;
-def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true",
+def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+ "Enable packed carry-less multiplication instructions",
+ [FeatureSSE2]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
"Enable three-operand fused multiple-add",
[FeatureAVX]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
"Enable four-operand fused multiple-add",
- [FeatureAVX]>;
+ [FeatureAVX, FeatureSSE4A]>;
def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
- "Enable XOP instructions">;
+ "Enable XOP instructions",
+ [FeatureAVX, FeatureSSE4A]>;
def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
"HasVectorUAMem", "true",
"Allow unaligned memory operands on vector/SIMD instructions">;
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
- "Enable AES instructions">;
+ "Enable AES instructions",
+ [FeatureSSE2]>;
def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true",
@@ -128,10 +131,10 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
"Intel Atom processors">;
class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, GenericItineraries, Features>;
+ : ProcessorModel<Name, GenericModel, Features>;
class AtomProc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, AtomItineraries, Features>;
+ : ProcessorModel<Name, AtomModel, Features>;
def : Proc<"generic", []>;
def : Proc<"i386", []>;
@@ -169,25 +172,23 @@ def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B,
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureSlowBTMem, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeatureCLMUL]>;
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
// Sandy Bridge
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
-// FIXME: Disabling AVX for now since it's not ready.
-def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
- FeatureAES, FeatureCLMUL]>;
+def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL]>;
// Ivy Bridge
-def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
- FeatureAES, FeatureCLMUL,
+def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL,
FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>;
// Haswell
-// FIXME: Disabling AVX/AVX2/FMA3 for now since it's not ready.
-def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
- FeatureAES, FeatureCLMUL, FeatureRDRAND,
+def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL, FeatureRDRAND,
FeatureF16C, FeatureFSGSBase,
FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2]>;
+ FeatureBMI2, FeatureFMA]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
@@ -211,21 +212,20 @@ def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
-def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
+def : Proc<"amdfam10", [FeatureSSE4A,
Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
FeaturePOPCNT, FeatureSlowBTMem]>;
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
FeatureLZCNT, FeaturePOPCNT]>;
-// FIXME: Disabling AVX/FMA4 for now since it's not ready.
// Bulldozer
-def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureAES, FeatureCLMUL,
- FeatureXOP, FeatureLZCNT, FeaturePOPCNT]>;
+def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL,
+ FeatureLZCNT, FeaturePOPCNT]>;
// Enhanced Bulldozer
-def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureAES, FeatureCLMUL,
- FeatureXOP, FeatureF16C, FeatureLZCNT,
+def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL,
+ FeatureF16C, FeatureLZCNT,
FeaturePOPCNT, FeatureBMI]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 7db7ccb..db71e27 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -20,10 +20,10 @@
#include "X86TargetMachine.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "llvm/CallingConv.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -186,10 +186,14 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
O << '-' << *MF->getPICBaseSymbol();
break;
case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_TLSLD: O << "@TLSLD"; break;
+ case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
case X86II::MO_GOT: O << "@GOT"; break;
case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
@@ -403,7 +407,9 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO = MI->getOperand(OpNo);
switch (ExtraCode[0]) {
- default: return true; // Unknown modifier.
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
case 'a': // This is an address. Currently only 'i' and 'r' are expected.
if (MO.isImm()) {
O << MO.getImm();
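
With the PrintAsmOperand change above, inline-asm operand modifiers the x86 printer does not recognize now fall through to the target-independent AsmPrinter instead of being rejected outright. A hedged example of a generic modifier this enables, assuming GCC-style inline assembly:

int forty_two() {
  int r;
  // '%c1' prints the constant without the usual '$' punctuation; that is
  // handled by the generic printer, not x86-specific code.
  asm("movl $%c1, %0" : "=r"(r) : "i"(42));
  return r;   // 42
}
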
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index a6ed9ba..35386cd 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -37,15 +37,15 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
virtual const char *getPassName() const {
return "X86 AT&T-Style Assembly Printer";
}
-
+
const X86Subtarget &getSubtarget() const { return *Subtarget; }
virtual void EmitStartOfAsmFile(Module &M);
virtual void EmitEndOfAsmFile(Module &M);
-
+
virtual void EmitInstruction(const MachineInstr *MI);
-
+
void printSymbolOperand(const MachineOperand &MO, raw_ostream &O);
// These methods are used by the tablegen'erated instruction printer.
@@ -71,7 +71,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void printPICLabel(const MachineInstr *MI, unsigned Op, raw_ostream &O);
bool runOnMachineFunction(MachineFunction &F);
-
+
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
index e01ff41..6a6125b 100644
--- a/lib/Target/X86/X86COFFMachineModuleInfo.cpp
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -17,4 +17,3 @@ using namespace llvm;
X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
}
-
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
index 0cec95a..471eb31 100644
--- a/lib/Target/X86/X86COFFMachineModuleInfo.h
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -1,4 +1,4 @@
-//===-- X86COFFMachineModuleInfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
+//===-- X86coffmachinemoduleinfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -33,7 +33,7 @@ public:
void addExternalFunction(MCSymbol* Symbol) {
Externals.insert(Symbol);
}
-
+
typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator;
externals_iterator externals_begin() const { return Externals.begin(); }
externals_iterator externals_end() const { return Externals.end(); }
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index d148989..a6d2709 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -29,10 +29,13 @@ def RetCC_X86Common : CallingConv<[
// up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
// for functions that return two i8 values are currently expected to pack the
// values into an i16 (which uses AX, and thus AL:AH).
- CCIfType<[i8] , CCAssignToReg<[AL, DL]>>,
- CCIfType<[i16], CCAssignToReg<[AX, DX]>>,
- CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
- CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+ //
+ // For code that doesn't care about the ABI, we allow returning more than two
+ // integer values in registers.
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
// Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
// can only be used by ABI non-compliant code. If the target doesn't have XMM
@@ -413,7 +416,7 @@ def CC_X86 : CallingConv<[
// Callee-saved Registers.
//===----------------------------------------------------------------------===//
-def CSR_Ghc : CalleeSavedRegs<(add)>;
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index ee3de9a..d705049 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -53,12 +53,12 @@ namespace {
public:
static char ID;
explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
- : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
+ : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
MCE(mce), PICBaseOffset(0), Is64BitMode(false),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
Emitter(X86TargetMachine &tm, CodeEmitter &mce,
const X86InstrInfo &ii, const TargetData &td, bool is64)
- : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
+ : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
@@ -68,8 +68,20 @@ namespace {
return "X86 Machine Code Emitter";
}
+ void emitOpcodePrefix(uint64_t TSFlags, int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const;
+
+ void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const;
+
+ void emitSegmentOverridePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI) const;
+
void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
-
+
void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<MachineModuleInfo>();
@@ -115,17 +127,17 @@ template<class CodeEmitter>
bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
MMI = &getAnalysis<MachineModuleInfo>();
MCE.setModuleInfo(MMI);
-
+
II = TM.getInstrInfo();
TD = TM.getTargetData();
Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
IsPIC = TM.getRelocationModel() == Reloc::PIC_;
-
+
do {
- DEBUG(dbgs() << "JITTing function '"
+ DEBUG(dbgs() << "JITTing function '"
<< MF.getFunction()->getName() << "'\n");
MCE.startFunction(MF);
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
MCE.StartMachineBasicBlock(MBB);
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
@@ -149,18 +161,18 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
static unsigned determineREX(const MachineInstr &MI) {
unsigned REX = 0;
const MCInstrDesc &Desc = MI.getDesc();
-
+
// Pseudo instructions do not need REX prefix byte.
if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
return 0;
if (Desc.TSFlags & X86II::REX_W)
REX |= 1 << 3;
-
+
unsigned NumOps = Desc.getNumOperands();
if (NumOps) {
bool isTwoAddr = NumOps > 1 &&
- Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
+ Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
unsigned i = isTwoAddr ? 1 : 0;
for (unsigned e = NumOps; i != e; ++i) {
@@ -171,7 +183,7 @@ static unsigned determineREX(const MachineInstr &MI) {
REX |= 0x40;
}
}
-
+
switch (Desc.TSFlags & X86II::FormMask) {
case X86II::MRMInitReg:
if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
@@ -362,7 +374,7 @@ void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
}
template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
unsigned Index,
unsigned Base) {
// SIB byte is in the same format as the ModRMByte...
@@ -378,8 +390,8 @@ void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
}
}
-/// isDisp8 - Return true if this signed displacement fits in a 8-bit
-/// sign-extended field.
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
static bool isDisp8(int Value) {
return Value == (signed char)Value;
}
@@ -388,10 +400,10 @@ static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
const TargetMachine &TM) {
// For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
// mechanism as 32-bit mode.
- if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
!TM.getSubtarget<X86Subtarget>().isTargetDarwin())
return false;
-
+
// Return true if this is a reference to a stub containing the address of the
// global, not the global itself.
return isGlobalStubReference(GVOp.getTargetFlags());
@@ -417,7 +429,7 @@ void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
if (RelocOp->isGlobal()) {
// In 64-bit static small code model, we could potentially emit absolute.
// But it's probably not beneficial. If the MCE supports using RIP directly
- // do it, otherwise fallback to absolute (this is determined by IsPCRel).
+ // do it, otherwise fallback to absolute (this is determined by IsPCRel).
// 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
// 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
@@ -441,7 +453,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
const MachineOperand &Op3 = MI.getOperand(Op+3);
int DispVal = 0;
const MachineOperand *DispForReloc = 0;
-
+
// Figure out what sort of displacement we have to handle here.
if (Op3.isGlobal()) {
DispForReloc = &Op3;
@@ -469,7 +481,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
const MachineOperand &IndexReg = MI.getOperand(Op+2);
unsigned BaseReg = Base.getReg();
-
+
// Handle %rip relative addressing.
if (BaseReg == X86::RIP ||
(Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
@@ -486,7 +498,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
// Is a SIB byte needed?
- // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
// resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
// 2-7) and absolute references.
unsigned BaseRegNo = -1U;
@@ -494,7 +506,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
BaseRegNo = X86_MC::getX86RegNum(BaseReg);
if (// The SIB byte must be used if there is an index register.
- IndexReg.getReg() == 0 &&
+ IndexReg.getReg() == 0 &&
// The SIB byte must be used if the base is ESP/RSP/R12, all of which
// encode to an R/M value of 4, which indicates that a SIB byte is
// present.
@@ -508,7 +520,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
return;
}
-
+
// If the base is not EBP/ESP and there is no displacement, use simple
// indirect register encoding, this handles addresses like [EAX]. The
// encoding for [EBP] with no displacement means [disp32] so we handle it
@@ -517,20 +529,20 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
return;
}
-
+
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
if (!DispForReloc && isDisp8(DispVal)) {
MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
emitConstant(DispVal, 1);
return;
}
-
+
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
return;
}
-
+
// Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
assert(IndexReg.getReg() != X86::ESP &&
IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
@@ -563,7 +575,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
unsigned SS = SSTable[Scale.getImm()];
if (BaseReg == 0) {
- // Handle the SIB byte for the case where there is no base, see Intel
+ // Handle the SIB byte for the case where there is no base, see Intel
// Manual 2A, table 2-7. The displacement has already been output.
unsigned IndexRegNo;
if (IndexReg.getReg())
@@ -596,94 +608,116 @@ static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II,
return Desc;
}
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
- const MCInstrDesc *Desc) {
- DEBUG(dbgs() << MI);
-
- // If this is a pseudo instruction, lower it.
- switch (Desc->getOpcode()) {
- case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break;
- case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break;
- case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break;
- case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break;
- case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break;
- case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break;
- case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break;
- case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break;
- case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break;
- case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break;
- case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
- case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
- case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
- case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break;
- case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
- case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
- case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
- }
-
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
- MCE.processDebugLoc(MI.getDebugLoc(), true);
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
- unsigned Opcode = Desc->Opcode;
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
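
These three helpers feed the address-size check in emitOpcodePrefix below: in 64-bit mode the default address size is 64 bits, so a 32-bit base or index register demands the 0x67 override; outside 64-bit mode the default is 32 bits and a 16-bit memory operand demands it. A condensed sketch of just that decision, with made-up boolean parameters standing in for the real queries:

    #include <cstdio>

    // Sketch of the 0x67 address-size-override rule: the AdSize TSFlag
    // forces it; otherwise it depends on mode and memory-operand width.
    static bool needsAddrSizeOverride(bool AdSize, bool HasMemOperand,
                                      bool Is64BitMode, bool MemIs16Bit,
                                      bool MemIs32Bit) {
      if (AdSize)         return true;
      if (!HasMemOperand) return false;
      return Is64BitMode ? MemIs32Bit : MemIs16Bit;
    }

    int main(void) {
      // A 32-bit address like [ebx] used in 64-bit mode needs the override.
      printf("%d\n", needsAddrSizeOverride(false, true, true, false, true));
      return 0;
    }
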
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
// Emit the lock opcode prefix as needed.
if (Desc->TSFlags & X86II::LOCK)
MCE.emitByte(0xF0);
// Emit segment override opcode prefix as needed.
- switch (Desc->TSFlags & X86II::SegOvrMask) {
- case X86II::FS:
- MCE.emitByte(0x64);
- break;
- case X86II::GS:
- MCE.emitByte(0x65);
- break;
- default: llvm_unreachable("Invalid segment!");
- case 0: break; // No segment override!
- }
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
// Emit the repeat opcode prefix as needed.
if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
MCE.emitByte(0xF3);
- // Emit the operand size opcode prefix as needed.
- if (Desc->TSFlags & X86II::OpSize)
- MCE.emitByte(0x66);
-
// Emit the address size opcode prefix as needed.
- if (Desc->TSFlags & X86II::AdSize)
+ bool need_address_override;
+ if (TSFlags & X86II::AdSize) {
+ need_address_override = true;
+ } else if (MemOperand == -1) {
+ need_address_override = false;
+ } else if (Is64BitMode) {
+ assert(!Is16BitMemOperand(MI, MemOperand));
+ need_address_override = Is32BitMemOperand(MI, MemOperand);
+ } else {
+ assert(!Is64BitMemOperand(MI, MemOperand));
+ need_address_override = Is16BitMemOperand(MI, MemOperand);
+ }
+
+ if (need_address_override)
MCE.emitByte(0x67);
+ // Emit the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ MCE.emitByte(0x66);
+
bool Need0FPrefix = false;
switch (Desc->TSFlags & X86II::Op0Mask) {
- case X86II::TB: // Two-byte opcode prefix
- case X86II::T8: // 0F 38
- case X86II::TA: // 0F 3A
- case X86II::A6: // 0F A6
- case X86II::A7: // 0F A7
- Need0FPrefix = true;
- break;
- case X86II::REP: break; // already handled.
- case X86II::T8XS: // F3 0F 38
- case X86II::XS: // F3 0F
- MCE.emitByte(0xF3);
- Need0FPrefix = true;
- break;
- case X86II::T8XD: // F2 0F 38
- case X86II::TAXD: // F2 0F 3A
- case X86II::XD: // F2 0F
- MCE.emitByte(0xF2);
- Need0FPrefix = true;
- break;
- case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
- case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
- MCE.emitByte(0xD8+
- (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
- >> X86II::Op0Shift));
- break; // Two-byte opcode prefix
- default: llvm_unreachable("Invalid prefix!");
- case 0: break; // No prefix!
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::A6: // 0F A6
+ case X86II::A7: // 0F A7
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::T8XS: // F3 0F 38
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ case X86II::TAXD: // F2 0F 3A
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: llvm_unreachable("Invalid prefix!");
+ case 0: break; // No prefix!
}
// Handle REX prefix.
@@ -697,50 +731,446 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
MCE.emitByte(0x0F);
switch (Desc->TSFlags & X86II::Op0Mask) {
- case X86II::T8XD: // F2 0F 38
- case X86II::T8XS: // F3 0F 38
- case X86II::T8: // 0F 38
- MCE.emitByte(0x38);
- break;
- case X86II::TAXD: // F2 0F 38
- case X86II::TA: // 0F 3A
- MCE.emitByte(0x3A);
- break;
- case X86II::A6: // 0F A6
- MCE.emitByte(0xA6);
- break;
- case X86II::A7: // 0F A7
- MCE.emitByte(0xA7);
- break;
+ case X86II::T8XD: // F2 0F 38
+ case X86II::T8XS: // F3 0F 38
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TAXD: // F2 0F 38
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
+ case X86II::A6: // 0F A6
+ MCE.emitByte(0xA6);
+ break;
+ case X86II::A7: // 0F A7
+ MCE.emitByte(0xA7);
+ break;
+ }
+}
+
+// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+// 0-7 and the difference between the 2 groups is given by the REX prefix.
+// In the VEX prefix, registers are seen sequentially from 0-15 and encoded
+// in 1's complement form, example:
+//
+// ModRM field => XMM9 => 1
+// VEX.VVVV => XMM9 => ~9
+//
+// See table 4-35 of Intel AVX Programming Reference for details.
+static unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+ unsigned OpNum) {
+ unsigned SrcReg = MI.getOperand(OpNum).getReg();
+ unsigned SrcRegNum = X86_MC::getX86RegNum(MI.getOperand(OpNum).getReg());
+ if (X86II::isX86_64ExtendedReg(SrcReg))
+ SrcRegNum |= 8;
+
+ // The registers represented through VEX_VVVV should
+ // be encoded in 1's complement form.
+ return (~SrcRegNum) & 0xf;
+}
+
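
A quick standalone check of the inversion described above (vexVVVV is an illustrative name, and a plain 0-15 register number stands in for the LLVM operand): XMM9's sequential number is 9, so the field holds ~9 & 0xF = 6, while an unused field reads 0b1111.

    #include <cassert>
    #include <cstdio>

    // 1's-complement VVVV encoding, as in getVEXRegisterEncoding above.
    static unsigned char vexVVVV(unsigned SeqRegNum) {
      assert(SeqRegNum < 16 && "VEX.VVVV addresses XMM0-XMM15 only");
      return (~SeqRegNum) & 0xf;
    }

    int main(void) {
      printf("VEX.VVVV for XMM9 = 0x%X\n", (unsigned)vexVVVV(9)); // 0x6
      return 0;
    }
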
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI) const {
+ switch (TSFlags & X86II::SegOvrMask) {
+ default: llvm_unreachable("Invalid segment!");
+ case 0:
+ // No segment override, check for explicit one on memory operand.
+ if (MemOperand != -1) { // If the instruction has a memory operand.
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: MCE.emitByte(0x2E); break;
+ case X86::SS: MCE.emitByte(0x36); break;
+ case X86::DS: MCE.emitByte(0x3E); break;
+ case X86::ES: MCE.emitByte(0x26); break;
+ case X86::FS: MCE.emitByte(0x64); break;
+ case X86::GS: MCE.emitByte(0x65); break;
+ }
+ }
+ break;
+ case X86II::FS:
+ MCE.emitByte(0x64);
+ break;
+ case X86II::GS:
+ MCE.emitByte(0x65);
+ break;
+ }
+}
+
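
The bytes in the switch above are the standard x86 segment-override prefixes; the same mapping in table form, purely for reference (the emitter itself keeps the switch):

    #include <cstdio>

    // The six x86 segment-override prefix bytes emitted above.
    struct SegOverride { const char *Seg; unsigned char Prefix; };
    static const SegOverride Overrides[] = {
      {"CS", 0x2E}, {"SS", 0x36}, {"DS", 0x3E},
      {"ES", 0x26}, {"FS", 0x64}, {"GS", 0x65},
    };

    int main(void) {
      for (unsigned i = 0; i != sizeof(Overrides) / sizeof(Overrides[0]); ++i)
        printf("%s -> 0x%02X\n", Overrides[i].Seg, Overrides[i].Prefix);
      return 0;
    }
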
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+
+ // VEX_R: opcode extension equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX.R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.R=1 (64-bit mode only)
+ //
+ unsigned char VEX_R = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ unsigned char VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX.B=0 (ignored in 32-bit mode)
+ // 0: Same as REX.B=1 (64-bit mode only)
+ //
+ unsigned char VEX_B = 0x1;
+
+ // VEX_W: opcode specific (used like REX.W, or for opcode
+ // extension, or ignored, depending on the opcode byte)
+ unsigned char VEX_W = 0;
+
+ // XOP: Use XOP prefix byte 0x8f instead of VEX.
+ unsigned char XOP = 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ // 0b00100-0b11111: Reserved for future use
+ // 0b01000: XOP map select - 08h instructions with imm byte
+ // 0b01001: XOP map select - 09h instructions with no imm byte
+ unsigned char VEX_5M = 0x1;
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ unsigned char VEX_4V = 0xf;
+
+ // VEX_L (Vector Length):
+ //
+ // 0: scalar or 128-bit vector
+ // 1: 256-bit vector
+ //
+ unsigned char VEX_L = 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ unsigned char VEX_PP = 0;
+
+ // Encode the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ VEX_PP = 0x01;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+ VEX_W = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
+ XOP = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+ VEX_L = 1;
+
+ switch (TSFlags & X86II::Op0Mask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::T8: // 0F 38
+ VEX_5M = 0x2;
+ break;
+ case X86II::TA: // 0F 3A
+ VEX_5M = 0x3;
+ break;
+ case X86II::T8XS: // F3 0F 38
+ VEX_PP = 0x2;
+ VEX_5M = 0x2;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ VEX_PP = 0x3;
+ VEX_5M = 0x2;
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ VEX_PP = 0x3;
+ VEX_5M = 0x3;
+ break;
+ case X86II::XS: // F3 0F
+ VEX_PP = 0x2;
+ break;
+ case X86II::XD: // F2 0F
+ VEX_PP = 0x3;
+ break;
+ case X86II::XOP8:
+ VEX_5M = 0x8;
+ break;
+ case X86II::XOP9:
+ VEX_5M = 0x9;
+ break;
+ case X86II::A6: // Bypass: Not used by VEX
+ case X86II::A7: // Bypass: Not used by VEX
+ case X86II::TB: // Bypass: Not used by VEX
+ case 0:
+ break; // No prefix!
+ }
+
+
+ // Set the vector length to 256-bit if YMM0-YMM15 is used
+ for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+ if (MI.getOperand(i).isImplicit())
+ continue;
+ unsigned SrcReg = MI.getOperand(i).getReg();
+ if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
+ VEX_L = 1;
+ }
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ // Duplicate register.
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestMem: {
+ // MRMDestMem instructions forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ CurOp = X86::AddrNumOperands;
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ const MachineOperand &MO = MI.getOperand(CurOp);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ break;
+ }
+ case X86II::MRMSrcMem:
+ // MRMSrcMem instructions forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 1);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ // MRM[0-9]m instructions forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
+ case X86II::MRMSrcReg:
+ // MRMSrcReg instructions forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestReg:
+ // MRMDestReg instructions forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_R = 0x0;
+ break;
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ // MRM0r-MRM7r instructions forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_B = 0x0;
+ break;
+ default: // RawFrm
+ break;
+ }
+
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+ if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ MCE.emitByte(0xC5);
+ MCE.emitByte(LastByte | (VEX_R << 7));
+ return;
+ }
+
+ // 3 byte VEX prefix
+ MCE.emitByte(XOP ? 0x8F : 0xC4);
+ MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
+ MCE.emitByte(LastByte | (VEX_W << 7));
+}
+
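
To make the byte diagram above concrete, here is the 2-byte form packed by hand. The field values are chosen to match a plausible instance such as vaddps ymm0, ymm0, ymm0 (VEX_R=1, VEX_4V=0b1111 for ~ymm0, VEX_L=1, VEX_PP=0); a worked example, not emitter code:

    #include <cstdio>

    int main(void) {
      // Pack the 2-byte VEX prefix exactly as emitVEXOpcodePrefix does.
      unsigned char VEX_R = 1, VEX_4V = 0xf, VEX_L = 1, VEX_PP = 0;
      unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
      printf("C5 %02X\n", (unsigned)(LastByte | (VEX_R << 7))); // "C5 FC"
      return 0;
    }
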
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
+ const MCInstrDesc *Desc) {
+ DEBUG(dbgs() << MI);
+
+ // If this is a pseudo instruction, lower it.
+ switch (Desc->getOpcode()) {
+ case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break;
+ case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break;
+ case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break;
+ case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break;
+ case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break;
+ case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break;
+ case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break;
+ case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break;
+ case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break;
+ case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break;
+ case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
+ case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
+ case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
+ case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break;
+ case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
+ case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
+ case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
}
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+ unsigned Opcode = Desc->Opcode;
+
// If this is a two-address instruction, skip one of the register operands.
unsigned NumOps = Desc->getNumOperands();
unsigned CurOp = 0;
- if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) != -1)
+ if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
++CurOp;
- else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1,MCOI::TIED_TO)== 0)
- // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
- --NumOps;
+ else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ uint64_t TSFlags = Desc->TSFlags;
+
+ // Is this instruction encoded using the AVX VEX prefix?
+ bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+ // It uses the VEX.VVVV field?
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ const unsigned MemOp4_I8IMMOperand = 2;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+ if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+ if (!HasVEXPrefix)
+ emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+ else
+ emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
- switch (Desc->TSFlags & X86II::FormMask) {
+ switch (TSFlags & X86II::FormMask) {
default:
llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
case X86II::Pseudo:
// Remember the current PC offset, this is the PIC relocation
// base address.
switch (Opcode) {
- default:
+ default:
llvm_unreachable("pseudo instructions should be removed before code"
" emission");
- break;
// Do nothing for Int_MemBarrier - it's just a comment. Add a debug
// to make it slightly easier to see.
case X86::Int_MemBarrier:
DEBUG(dbgs() << "#MEMBARRIER\n");
break;
-
+
case TargetOpcode::INLINEASM:
// We allow inline assembler nodes with empty bodies - they can
// implicitly define registers, which is ok for JIT.
@@ -752,7 +1182,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case TargetOpcode::EH_LABEL:
MCE.emitLabel(MI.getOperand(0).getMCSymbol());
break;
-
+
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
break;
@@ -774,7 +1204,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO = MI.getOperand(CurOp++);
DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
@@ -787,13 +1217,13 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitPCRelativeBlockAddress(MO.getMBB());
break;
}
-
+
if (MO.isGlobal()) {
emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
MO.getOffset(), 0);
break;
}
-
+
if (MO.isSymbol()) {
emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
break;
@@ -804,7 +1234,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
break;
}
-
+
assert(MO.isImm() && "Unknown RawFrm operand!");
if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
// Fix up immediate operand for pc relative calls.
@@ -815,21 +1245,21 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
break;
}
-
+
case X86II::AddRegFrm: {
MCE.emitByte(BaseOpcode +
X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg()));
-
+
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO1 = MI.getOperand(CurOp++);
unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
if (MO1.isImm()) {
emitConstant(MO1.getImm(), Size);
break;
}
-
+
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
if (Opcode == X86::MOV64ri64i32)
@@ -855,46 +1285,57 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitRegModRMByte(MI.getOperand(CurOp).getReg(),
X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg()));
CurOp += 2;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
break;
}
case X86II::MRMDestMem: {
MCE.emitByte(BaseOpcode);
+
+ unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ SrcRegNum++;
emitMemModRMByte(MI, CurOp,
- X86_MC::getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)
- .getReg()));
- CurOp += X86::AddrNumOperands + 1;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
+ X86_MC::getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+ CurOp = SrcRegNum + 1;
break;
}
- case X86II::MRMSrcReg:
+ case X86II::MRMSrcReg: {
MCE.emitByte(BaseOpcode);
- emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+
+ unsigned SrcRegNum = CurOp+1;
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+ ++SrcRegNum;
+
+ emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
- CurOp += 2;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
+ // 2 operands skipped with HasMemOp4, compensate accordingly
+ CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
break;
-
+ }
case X86II::MRMSrcMem: {
int AddrOperands = X86::AddrNumOperands;
+ unsigned FirstMemOp = CurOp+1;
+ if (HasVEX_4V) {
+ ++AddrOperands;
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+ }
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ ++FirstMemOp;
+
+ MCE.emitByte(BaseOpcode);
intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
X86II::getSizeOfImm(Desc->TSFlags) : 0;
-
- MCE.emitByte(BaseOpcode);
- emitMemModRMByte(MI, CurOp+1,
+ emitMemModRMByte(MI, FirstMemOp,
X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
CurOp += AddrOperands + 1;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
+ if (HasVEX_4VOp3)
+ ++CurOp;
break;
}
@@ -902,20 +1343,22 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
MCE.emitByte(BaseOpcode);
emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
(Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO1 = MI.getOperand(CurOp++);
unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
if (MO1.isImm()) {
emitConstant(MO1.getImm(), Size);
break;
}
-
+
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
if (Opcode == X86::MOV64ri32)
@@ -937,8 +1380,10 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ?
- (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
+ (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
MCE.emitByte(BaseOpcode);
@@ -948,14 +1393,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO = MI.getOperand(CurOp++);
unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
if (MO.isImm()) {
emitConstant(MO.getImm(), Size);
break;
}
-
+
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
if (Opcode == X86::MOV64mi32)
@@ -980,7 +1425,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
++CurOp;
break;
-
+
case X86II::MRM_C1:
MCE.emitByte(BaseOpcode);
MCE.emitByte(0xC1);
@@ -1003,6 +1448,33 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
break;
}
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
+ // The last source register of a 4-operand instruction in AVX is encoded
+ // in bits[7:4] of an immediate byte.
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+ const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = X86_MC::getX86RegNum(MO.getReg()) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
+ // If there is an additional 5th operand, it must be an immediate, which
+ // is encoded in bits[3:0].
+ if (CurOp != NumOps) {
+ const MachineOperand &MIMM = MI.getOperand(CurOp++);
+ if (MIMM.isImm()) {
+ unsigned Val = MIMM.getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ RegNum |= Val;
+ }
+ }
+ emitConstant(RegNum, 1);
+ } else {
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86II::getSizeOfImm(Desc->TSFlags));
+ }
+ }
+
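
For reference, the byte built in the VEX_I8IMM branch just above carries the fourth register's 4-bit number in bits 7-4 (the extended-register bit landing in bit 7) and an optional 4-bit immediate in bits 3-0. Worked through for XMM9 with immediate 2 (illustrative values):

    #include <cstdio>

    int main(void) {
      unsigned RegNum = (9 & 7) << 4; // XMM9: low three bits are 1
      RegNum |= 1 << 7;               // XMM8-XMM15 set the top bit
      unsigned Imm = 2;               // optional immediate, bits 3:0
      printf("0x%02X\n", RegNum | Imm); // 0x92
      return 0;
    }
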
if (!MI.isVariadic() && CurOp != NumOps) {
#ifndef NDEBUG
dbgs() << "Cannot encode all operands of: " << MI << "\n";
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 69752c5..e5952aa 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -57,7 +57,9 @@ class X86FastISel : public FastISel {
bool X86ScalarSSEf32;
public:
- explicit X86FastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo) {
+ explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
X86ScalarSSEf64 = Subtarget->hasSSE2();
@@ -155,9 +157,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
// For now, require SSE/SSE2 for performing floating-point operations,
// since x87 requires additional work.
if (VT == MVT::f64 && !X86ScalarSSEf64)
- return false;
+ return false;
if (VT == MVT::f32 && !X86ScalarSSEf32)
- return false;
+ return false;
// Similarly, no f80 support yet.
if (VT == MVT::f80)
return false;
@@ -183,37 +185,37 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
case MVT::i1:
case MVT::i8:
Opc = X86::MOV8rm;
- RC = X86::GR8RegisterClass;
+ RC = &X86::GR8RegClass;
break;
case MVT::i16:
Opc = X86::MOV16rm;
- RC = X86::GR16RegisterClass;
+ RC = &X86::GR16RegClass;
break;
case MVT::i32:
Opc = X86::MOV32rm;
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
break;
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
- RC = X86::RFP32RegisterClass;
+ RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = X86::FR64RegisterClass;
+ RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
- RC = X86::RFP64RegisterClass;
+ RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
@@ -240,7 +242,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
default: return false;
case MVT::i1: {
// Mask out all but lowest bit.
- unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+ unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
Val = AndResult;
@@ -547,13 +549,13 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
if (TLI.getPointerTy() == MVT::i64) {
Opc = X86::MOV64rm;
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
if (Subtarget->isPICStyleRIPRel())
StubAM.Base.Reg = X86::RIP;
} else {
Opc = X86::MOV32rm;
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
}
LoadReg = createResultReg(RC);
@@ -743,7 +745,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
- I->getContext());
+ I->getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
const Value *RV = Ret->getOperand(0);
@@ -1258,7 +1260,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
if (V->getType()->isFloatTy()) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
- unsigned ResultReg = createResultReg(X86::FR64RegisterClass);
+ unsigned ResultReg = createResultReg(&X86::FR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(X86::CVTSS2SDrr), ResultReg)
.addReg(OpReg);
@@ -1277,7 +1279,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
if (V->getType()->isDoubleTy()) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
- unsigned ResultReg = createResultReg(X86::FR32RegisterClass);
+ unsigned ResultReg = createResultReg(&X86::FR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(X86::CVTSD2SSrr), ResultReg)
.addReg(OpReg);
@@ -1314,8 +1316,9 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
if (!Subtarget->is64Bit()) {
// If we're on x86-32, we can't extract an i8 from a general register.
// First issue a copy to GR16_ABCD or GR32_ABCD.
- const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
- ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+ const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ?
+ (const TargetRegisterClass*)&X86::GR16_ABCDRegClass :
+ (const TargetRegisterClass*)&X86::GR32_ABCDRegClass;
unsigned CopyReg = createResultReg(CopyRC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
CopyReg).addReg(InputReg);
@@ -1423,7 +1426,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
return DoSelectCall(&I, "memset");
}
case Intrinsic::stackprotector: {
- // Emit code inline code to store the stack guard onto the stack.
+ // Emit code to store the stack guard onto the stack.
EVT PtrTy = TLI.getPointerTy();
const Value *Op1 = I.getArgOperand(0); // The guard's value.
@@ -1484,7 +1487,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
return false;
// The call to CreateRegs builds two sequential registers, to store the
- // both the the returned values.
+ // two returned values.
unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
.addReg(Reg1).addReg(Reg2);
@@ -1515,6 +1518,22 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
return DoSelectCall(I, 0);
}
+static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
+ const ImmutableCallSite &CS) {
+ if (Subtarget.is64Bit())
+ return 0;
+ if (Subtarget.isTargetWindows())
+ return 0;
+ CallingConv::ID CC = CS.getCallingConv();
+ if (CC == CallingConv::Fast || CC == CallingConv::GHC)
+ return 0;
+ if (!CS.paramHasAttr(1, Attribute::StructRet))
+ return 0;
+ if (CS.paramHasAttr(1, Attribute::InReg))
+ return 0;
+ return 4;
+}
+
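
In words: only a 32-bit, non-Windows target with a default calling convention and a plain (not in-register) sret argument makes the callee pop the hidden return-slot pointer with ret $4. A condensed sketch of the same rule, with illustrative boolean parameters in place of the subtarget and call-site queries:

    #include <cstdio>

    // Condensed form of computeBytesPoppedByCallee: how many argument
    // bytes the callee pops on return.
    static unsigned bytesPoppedByCallee(bool Is64Bit, bool IsWindows,
                                        bool FastOrGHCCC, bool HasSRet,
                                        bool SRetInReg) {
      if (Is64Bit || IsWindows || FastOrGHCCC || !HasSRet || SRetInReg)
        return 0;
      return 4;
    }

    int main(void) {
      printf("%u\n", bytesPoppedByCallee(false, false, false, true, false)); // 4
      return 0;
    }
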
// Select either a call, or an llvm.memcpy/memmove/memset intrinsic
bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
const CallInst *CI = cast<CallInst>(I);
@@ -1548,12 +1567,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Check whether the function can return without sret-demotion.
SmallVector<ISD::OutputArg, 4> Outs;
- SmallVector<uint64_t, 4> Offsets;
GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(),
- Outs, TLI, &Offsets);
+ Outs, TLI);
bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
- *FuncInfo.MF, FTy->isVarArg(),
- Outs, FTy->getContext());
+ *FuncInfo.MF, FTy->isVarArg(),
+ Outs, FTy->getContext());
if (!CanLowerReturn)
return false;
@@ -1667,7 +1685,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
- I->getParent()->getContext());
+ I->getParent()->getContext());
// Allocate shadow area for Win64
if (Subtarget->isTargetWin64())
@@ -1693,7 +1711,6 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
@@ -1737,6 +1754,14 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
ArgVT = VA.getLocVT();
break;
}
+ case CCValAssign::VExt:
+ // VExt has not been implemented, so this should be impossible to reach
+ // for now. However, fall back to SelectionDAG isel once it is implemented.
+ return false;
+ case CCValAssign::Indirect:
+ // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+ // support this.
+ return false;
}
if (VA.isRegLoc()) {
@@ -1838,27 +1863,24 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
MIB.addGlobalAddress(GV, 0, OpFlags);
}
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
// Add an implicit use GOT pointer in EBX.
if (Subtarget->isPICStyleGOT())
- MIB.addReg(X86::EBX);
+ MIB.addReg(X86::EBX, RegState::Implicit);
if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
- MIB.addReg(X86::AL);
+ MIB.addReg(X86::AL, RegState::Implicit);
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i]);
-
- // Add a register mask with the call-preserved registers.
- // Proper defs for return values will be added by setPhysRegsDeadExcept().
- MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+ MIB.addReg(RegArgs[i], RegState::Implicit);
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
- unsigned NumBytesCallee = 0;
- if (!Subtarget->is64Bit() && !Subtarget->isTargetWindows() &&
- CS.paramHasAttr(1, Attribute::StructRet))
- NumBytesCallee = 4;
+ const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesCallee);
@@ -1889,7 +1911,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
SmallVector<unsigned, 4> UsedRegs;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs,
- I->getParent()->getContext());
+ I->getParent()->getContext());
unsigned ResultReg = FuncInfo.CreateRegs(I->getType());
CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -1903,7 +1925,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
RVLocs[i].getLocReg() == X86::ST1)) {
if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) {
CopyVT = MVT::f80;
- CopyReg = createResultReg(X86::RFP80RegisterClass);
+ CopyReg = createResultReg(&X86::RFP80RegClass);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL),
CopyReg);
@@ -2001,37 +2023,37 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
default: return false;
case MVT::i8:
Opc = X86::MOV8rm;
- RC = X86::GR8RegisterClass;
+ RC = &X86::GR8RegClass;
break;
case MVT::i16:
Opc = X86::MOV16rm;
- RC = X86::GR16RegisterClass;
+ RC = &X86::GR16RegClass;
break;
case MVT::i32:
Opc = X86::MOV32rm;
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
break;
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
- RC = X86::RFP32RegisterClass;
+ RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = X86::FR64RegisterClass;
+ RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
- RC = X86::RFP64RegisterClass;
+ RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
@@ -2120,28 +2142,28 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
unsigned Opc = 0;
const TargetRegisterClass *RC = NULL;
switch (VT.SimpleTy) {
- default: return false;
- case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = X86::FsFLD0SS;
- RC = X86::FR32RegisterClass;
- } else {
- Opc = X86::LD_Fp032;
- RC = X86::RFP32RegisterClass;
- }
- break;
- case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = X86::FsFLD0SD;
- RC = X86::FR64RegisterClass;
- } else {
- Opc = X86::LD_Fp064;
- RC = X86::RFP64RegisterClass;
- }
- break;
- case MVT::f80:
- // No f80 support yet.
- return false;
+ default: return false;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ Opc = X86::FsFLD0SS;
+ RC = &X86::FR32RegClass;
+ } else {
+ Opc = X86::LD_Fp032;
+ RC = &X86::RFP32RegClass;
+ }
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64) {
+ Opc = X86::FsFLD0SD;
+ RC = &X86::FR64RegClass;
+ } else {
+ Opc = X86::LD_Fp064;
+ RC = &X86::RFP64RegClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
}
unsigned ResultReg = createResultReg(RC);
@@ -2160,7 +2182,7 @@ bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
if (!X86SelectAddress(LI->getOperand(0), AM))
return false;
- X86InstrInfo &XII = (X86InstrInfo&)TII;
+ const X86InstrInfo &XII = (const X86InstrInfo&)TII;
unsigned Size = TD.getTypeAllocSize(LI->getType());
unsigned Alignment = LI->getAlignment();
@@ -2179,7 +2201,8 @@ bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
namespace llvm {
- FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo) {
- return new X86FastISel(funcInfo);
+ FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new X86FastISel(funcInfo, libInfo);
}
}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index ed1707d..955c75a 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -130,7 +130,7 @@ namespace {
// The hardware keeps track of how many FP registers are live, so we have
// to model that exactly. Usually, each live register corresponds to an
// FP<n> register, but when dealing with calls, returns, and inline
- // assembly, it is sometimes neccesary to have live scratch registers.
+ // assembly, it is sometimes necessary to have live scratch registers.
unsigned Stack[8]; // FP<n> Registers in each stack slot...
unsigned StackTop; // The current top of the FP stack.
@@ -971,7 +971,7 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
// Change from the pseudo instruction to the concrete instruction.
MI->RemoveOperand(0); // Remove the explicit ST(0) operand
MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
-
+
// Result gets pushed on the stack.
pushReg(DestReg);
}
@@ -1015,7 +1015,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
} else {
moveToTop(Reg, I); // Move to the top of the stack...
}
-
+
// Convert from the pseudo instruction to the concrete instruction.
MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
@@ -1297,7 +1297,7 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
MI->RemoveOperand(1);
MI->getOperand(0).setReg(getSTReg(Op1));
MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
-
+
// If we kill the second operand, make sure to pop it from the stack.
if (Op0 != Op1 && KillsOp1) {
// Get this value off of the register stack.
@@ -1714,38 +1714,38 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
// Assert that the top of stack contains the right FP register.
assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
"Top of stack not the right register for RET!");
-
+
// Ok, everything is good, mark the value as not being on the stack
// anymore so that our assertion about the stack being empty at end of
// block doesn't fire.
StackTop = 0;
return;
}
-
+
// Otherwise, we are returning two values:
// 2) If returning the same value for both, we only have one thing in the FP
// stack. Consider: RET FP1, FP1
if (StackTop == 1) {
assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
"Stack misconfiguration for RET!");
-
+
// Duplicate the TOS so that we return it twice. Just pick some other FPx
// register to hold it.
unsigned NewReg = getScratchReg();
duplicateToTop(FirstFPRegOp, NewReg, MI);
FirstFPRegOp = NewReg;
}
-
+
/// Okay we know we have two different FPx operands now:
assert(StackTop == 2 && "Must have two values live!");
-
+
/// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
/// in ST(1). In this case, emit an fxch.
if (getStackEntry(0) == SecondFPRegOp) {
assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
moveToTop(FirstFPRegOp, MI);
}
-
+
/// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
/// ST(1). Just remove both from our understanding of the stack and return.
assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 000e375..2238688 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -45,14 +45,14 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineModuleInfo &MMI = MF.getMMI();
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const TargetRegisterInfo *RegInfo = TM.getRegisterInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
- RI->needsStackRealignment(MF) ||
+ RegInfo->needsStackRealignment(MF) ||
MFI->hasVarSizedObjects() ||
MFI->isFrameAddressTaken() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
- MMI.callsUnwindInit());
+ MMI.callsUnwindInit() || MMI.callsEHReturn());
}
static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) {
@@ -125,8 +125,8 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- for (const uint16_t *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI)
- Uses.insert(*AsI);
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
}
const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
@@ -369,7 +369,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
/// getCompactUnwindRegNum - Get the compact unwind number for a given
/// register. The number corresponds to the enum lists in
/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) {
+static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) {
for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
if (*CURegs == Reg)
return Idx;
@@ -398,13 +398,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
// 4 3
// 5 3
//
- static const unsigned CU32BitRegs[] = {
+ static const uint16_t CU32BitRegs[] = {
X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
};
- static const unsigned CU64BitRegs[] = {
+ static const uint16_t CU64BitRegs[] = {
X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
};
- const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+ const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
@@ -466,13 +466,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
static uint32_t
encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
bool Is64Bit) {
- static const unsigned CU32BitRegs[] = {
+ static const uint16_t CU32BitRegs[] = {
X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
};
- static const unsigned CU64BitRegs[] = {
+ static const uint16_t CU64BitRegs[] = {
X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
};
- const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+ const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
// Encode the registers in the order they were saved, 3-bits per register. The
// registers are numbered from 1 to CU_NUM_SAVED_REGS.
@@ -650,6 +650,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned SlotSize = RegInfo->getSlotSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
unsigned StackPtr = RegInfo->getStackRegister();
+ unsigned BasePtr = RegInfo->getBaseRegister();
DebugLoc DL;
// If we're forcing a stack realignment we can't rely on just the frame
@@ -721,10 +722,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (HasFP) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
- if (RegInfo->needsStackRealignment(MF))
- FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
-
- NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+ if (RegInfo->needsStackRealignment(MF)) {
+ // Callee-saved registers are pushed onto the stack before the stack
+ // is realigned.
+ FrameSize -= X86FI->getCalleeSavedFrameSize();
+ NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+ } else {
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+ }
// Get the offset of the stack slot for the EBP register, which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
@@ -781,19 +786,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
I != E; ++I)
I->addLiveIn(FramePtr);
-
- // Realign stack
- if (RegInfo->needsStackRealignment(MF)) {
- MachineInstr *MI =
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
- .addReg(StackPtr)
- .addImm(-MaxAlign)
- .setMIFlag(MachineInstr::FrameSetup);
-
- // The EFLAGS implicit def is dead.
- MI->getOperand(3).setIsDead();
- }
} else {
NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
}
@@ -823,6 +815,27 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
}
}
+ // Realign stack after we pushed callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+
+ // NOTE: We push the registers before realigning the stack, so
+ // vector callee-saved (xmm) registers may be saved w/o proper
+ // alignment in this way. However, currently these regs are saved in
+ // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
+ // this shouldn't be a problem.
+ if (RegInfo->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+ .addReg(StackPtr)
+ .addImm(-MaxAlign)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+
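
Two alignment idioms recur in this prologue/epilogue code: AND-ing the stack pointer with -MaxAlign aligns it down, and (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign rounds a size up to the next multiple. Worked through for MaxAlign = 16 with illustrative values:

    #include <cstdio>

    int main(void) {
      unsigned MaxAlign = 16;
      unsigned SP = 0x1004;    // unaligned stack pointer
      unsigned FrameSize = 40; // unaligned frame size
      printf("0x%X\n", SP & -MaxAlign);                                 // 0x1000
      printf("%u\n", (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign); // 48
      return 0;
    }
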
DL = MBB.findDebugLoc(MBBI);
// If there is an SUB32ri of ESP immediately before this instruction, merge
@@ -913,6 +926,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
UseLEA, TII, *RegInfo);
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable-sized objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ if (RegInfo->hasBasePointer(MF)) {
+ // Update the base pointer with the current stack pointer.
+ unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
// Mark end of stack pointer adjustment.
MCSymbol *Label = MMI.getContext().CreateTempSymbol();
@@ -997,10 +1022,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (hasFP(MF)) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
- if (RegInfo->needsStackRealignment(MF))
- FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
-
- NumBytes = FrameSize - CSSize;
+ if (RegInfo->needsStackRealignment(MF)) {
+ // Callee-saved registers were pushed onto the stack before the stack
+ // was realigned.
+ FrameSize -= CSSize;
+ NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+ } else {
+ NumBytes = FrameSize - CSSize;
+ }
// Pop EBP.
BuildMI(MBB, MBBI, DL,
@@ -1010,7 +1039,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
// Skip the callee-saved pop instructions.
- MachineBasicBlock::iterator LastCSPop = MBBI;
while (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PI = prior(MBBI);
unsigned Opc = PI->getOpcode();
@@ -1021,6 +1049,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
--MBBI;
}
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
DL = MBBI->getDebugLoc();
@@ -1032,28 +1061,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! The same applies when the stack was
// realigned.
- if (RegInfo->needsStackRealignment(MF)) {
- // We cannot use LEA here, because stack pointer was realigned. We need to
- // deallocate local frame back.
- if (CSSize) {
- emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII,
- *RegInfo);
- MBBI = prior(LastCSPop);
- }
-
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
- StackPtr).addReg(FramePtr);
- } else if (MFI->hasVarSizedObjects()) {
- if (CSSize) {
- unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
- MachineInstr *MI =
- addRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
- FramePtr, false, -CSSize);
- MBB.insert(MBBI, MI);
+ if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
+ if (RegInfo->needsStackRealignment(MF))
+ MBBI = FirstCSPop;
+ if (CSSize != 0) {
+ unsigned Opc = getLEArOpcode(Is64Bit);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, -CSSize);
} else {
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+ unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(FramePtr);
}
} else if (NumBytes) {
@@ -1124,8 +1141,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
MachineInstr *NewMI = prior(MBBI);
- for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
- NewMI->addOperand(MBBI->getOperand(i));
+ NewMI->copyImplicitOps(MBBI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -1142,16 +1158,25 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
- const X86RegisterInfo *RI =
+ const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
const MachineFrameInfo *MFI = MF.getFrameInfo();
int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
uint64_t StackSize = MFI->getStackSize();
- if (RI->needsStackRealignment(MF)) {
+ if (RegInfo->hasBasePointer(MF)) {
+ assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
if (FI < 0) {
// Skip the saved EBP.
- Offset += RI->getSlotSize();
+ return Offset + RegInfo->getSlotSize();
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ } else if (RegInfo->needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + RegInfo->getSlotSize();
} else {
assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
return Offset + StackSize;
@@ -1162,7 +1187,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con
return Offset + StackSize;
// Skip the saved EBP.
- Offset += RI->getSlotSize();
+ Offset += RegInfo->getSlotSize();
// Skip the RETADDR move area
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1174,6 +1199,22 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con
return Offset;
}
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+ // We can't calculate the offset from the frame pointer if the stack is
+ // realigned, so enforce use of the stack or base pointer instead. The base
+ // pointer is used when we have dynamic allocas in addition to dynamic
+ // realignment.
+ if (RegInfo->hasBasePointer(MF))
+ FrameReg = RegInfo->getBaseRegister();
+ else if (RegInfo->needsStackRealignment(MF))
+ FrameReg = RegInfo->getStackRegister();
+ else
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return getFrameIndexOffset(MF, FI);
+}
+
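
The selection order above matters: the base pointer (dynamic allocas plus realignment) beats the stack pointer (realignment alone), which beats the frame pointer. A toy rendering of that priority, with strings standing in for the register getters:

    #include <cstdio>

    // Toy version of the register choice in getFrameIndexReference.
    static const char *chooseFrameReg(bool HasBasePtr, bool NeedsRealign) {
      if (HasBasePtr)   return "base pointer";
      if (NeedsRealign) return "stack pointer";
      return "frame pointer";
    }

    int main(void) {
      printf("%s\n", chooseFrameReg(false, true)); // "stack pointer"
      return 0;
    }
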
bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -1307,6 +1348,10 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
"Slot for EBP register must be last in order to be found!");
(void)FrameIdx;
}
+
+ // Spill the BasePtr if it's used.
+ if (RegInfo->hasBasePointer(MF))
+ MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
}
static bool
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index d55a497..dc515dc 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -60,6 +60,8 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const;
int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const;
uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
};
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8e2b1d6..27195b4 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -60,7 +60,7 @@ namespace {
int Base_FrameIndex;
unsigned Scale;
- SDValue IndexReg;
+ SDValue IndexReg;
int32_t Disp;
SDValue Segment;
const GlobalValue *GV;
@@ -80,11 +80,11 @@ namespace {
bool hasSymbolicDisplacement() const {
return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
}
-
+
bool hasBaseOrIndexReg() const {
return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
}
-
+
/// isRIPRelative - Return true if this addressing mode is already RIP
/// relative.
bool isRIPRelative() const {
@@ -94,7 +94,7 @@ namespace {
return RegNode->getReg() == X86::RIP;
return false;
}
-
+
void setBaseReg(SDValue Reg) {
BaseType = RegBase;
Base_Reg = Reg;
@@ -104,7 +104,7 @@ namespace {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
if (Base_Reg.getNode() != 0)
- Base_Reg.getNode()->dump();
+ Base_Reg.getNode()->dump();
else
dbgs() << "nul";
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
@@ -113,7 +113,7 @@ namespace {
if (IndexReg.getNode() != 0)
IndexReg.getNode()->dump();
else
- dbgs() << "nul";
+ dbgs() << "nul";
dbgs() << " Disp " << Disp << '\n'
<< "GV ";
if (GV)
@@ -187,6 +187,7 @@ namespace {
private:
SDNode *Select(SDNode *N);
+ SDNode *SelectGather(SDNode *N, unsigned Opc);
SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
@@ -212,21 +213,21 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment,
SDValue &NodeWithChain);
-
+
bool TryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
-
+
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
char ConstraintCode,
std::vector<SDValue> &OutOps);
-
+
void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
- inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
+ inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
@@ -425,7 +426,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
void X86DAGToDAGISel::PreprocessISelDAG() {
// OptForSize is used in pattern predicates that isel is matching.
OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize);
-
+
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
@@ -461,7 +462,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
++NumLoadMoved;
continue;
}
-
+
// Lower fpround and fpextend nodes that target the FP stack to be store and
// load to the stack. This is a gross hack. We would like to simply mark
// these as being illegal, but when we do that, legalize produces these when
@@ -472,7 +473,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// FIXME: This should only happen when not compiled with -O0.
if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
continue;
-
+
EVT SrcVT = N->getOperand(0).getValueType();
EVT DstVT = N->getValueType(0);
@@ -495,7 +496,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (N->getConstantOperandVal(1))
continue;
}
-
+
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
@@ -504,10 +505,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
else
MemVT = SrcIsSSE ? SrcVT : DstVT;
-
+
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
DebugLoc dl = N->getDebugLoc();
-
+
// FIXME: optimize the case where the src/dest is a load or store?
SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
N->getOperand(0),
@@ -523,12 +524,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// To avoid invalidating 'I', back it up to the convert node.
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
-
+
// Now that we did that, the node is dead. Increment the iterator to the
// next node to process, then delete N.
++I;
CurDAG->DeleteNode(N);
- }
+ }
}
@@ -583,7 +584,7 @@ bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset,
bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
SDValue Address = N->getOperand(1);
-
+
// load gs:0 -> GS segment register.
// load fs:0 -> FS segment register.
//
@@ -592,7 +593,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
// For more information see http://people.redhat.com/drepper/tls.pdf
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 &&
- Subtarget->isTargetELF())
+ Subtarget->isTargetLinux())
switch (N->getPointerInfo().getAddrSpace()) {
case 256:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -601,7 +602,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
return false;
}
-
+
return true;
}
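
For context on the segment-register matching above (an illustration, not part of the patch): LLVM models %gs/%fs-relative memory with address spaces 256/257, and a zero-offset load there is how the thread pointer is reached in the x86 Linux TLS models. A minimal source-level trigger:

    // Illustration only: in the initial-exec/local-exec TLS models on
    // x86-64 Linux, reading a thread_local compiles to an %fs-relative
    // load (roughly "movl %fs:counter@tpoff, %eax"), which is the
    // pattern served by the GS/FS matching above.
    thread_local int counter = 0;

    int read_counter() { return counter; }
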
@@ -991,7 +992,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
case ISD::SHL:
if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
break;
-
+
if (ConstantSDNode
*CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
unsigned Val = CN->getZExtValue();
@@ -1166,7 +1167,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
return false;
AM = Backup;
-
+
// Try again after commuting the operands.
if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&&
!MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
@@ -1202,7 +1203,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
AM = Backup;
}
break;
-
+
case ISD::AND: {
// Perform some heroic transforms on an and of a constant-count shift
// with a constant to enable use of the scaled offset field.
@@ -1274,7 +1275,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
-
+
if (Parent &&
// This list of opcodes are all the nodes that have an "addr:$ptr" operand
// that are not a MemSDNode, and thus don't have proper addrspace info.
@@ -1289,7 +1290,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == 257)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
}
-
+
if (MatchAddress(N, AM))
return false;
@@ -1335,7 +1336,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
// elements. This is a vector shuffle from the zero vector.
if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
// Check to see if the top elements are all zeros (or bitcast of zeros).
- N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
N.getOperand(0).getNode()->hasOneUse() &&
ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
N.getOperand(0).getOperand(0).hasOneUse() &&
@@ -1410,7 +1411,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
// If it isn't worth using an LEA, reject it.
if (Complexity <= 2)
return false;
-
+
getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -1421,7 +1422,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Disp, SDValue &Segment) {
assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
-
+
X86ISelAddressMode AM;
AM.GV = GA->getGlobal();
AM.Disp += GA->getOffset();
@@ -1434,7 +1435,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
} else {
AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
}
-
+
getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -1448,7 +1449,7 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
!IsProfitableToFold(N, P, P) ||
!IsLegalToFold(N, P, P, OptLevel))
return false;
-
+
return SelectAddr(N.getNode(),
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
@@ -1699,7 +1700,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
if (Node->hasAnyUseOfValue(0))
return 0;
-
+
// Optimize common patterns for __sync_or_and_fetch and similar arith
// operations where the result is not used. This allows us to use the "lock"
// version of the arithmetic instruction.
@@ -1726,14 +1727,14 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
default:
return 0;
}
-
+
bool isCN = false;
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
if (CN && (int32_t)CN->getSExtValue() == CN->getSExtValue()) {
isCN = true;
Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT);
}
-
+
unsigned Opc = 0;
switch (NVT.getSimpleVT().SimpleTy) {
default: return 0;
@@ -1771,7 +1772,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
}
break;
}
-
+
assert(Opc != 0 && "Invalid arith lock transform!");
DebugLoc dl = Node->getDebugLoc();
@@ -1851,7 +1852,7 @@ static bool HasNoSignedComparisonUses(SDNode *N) {
/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
/// is suitable for doing the {load; increment or decrement; store} to modify
/// transformation.
-static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
SDValue StoredVal, SelectionDAG *CurDAG,
LoadSDNode* &LoadNode, SDValue &InputChain) {
@@ -1875,15 +1876,15 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
// Return LoadNode by reference.
LoadNode = cast<LoadSDNode>(Load);
  // Is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
- EVT LdVT = LoadNode->getMemoryVT();
- if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
+ EVT LdVT = LoadNode->getMemoryVT();
+ if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
LdVT != MVT::i8)
return false;
// Is store the only read of the loaded value?
if (!Load.hasOneUse())
return false;
-
+
// Is the address of the store the same as the load?
if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
LoadNode->getOffset() != StoreNode->getOffset())
@@ -1905,6 +1906,20 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
ChainCheck = true;
continue;
}
+
+ // Make sure using Op as part of the chain would not cause a cycle here.
+ // In theory, we could check whether the chain node is a predecessor of
+ // the load. But that can be very expensive. Instead visit the uses and
+ // make sure they all have smaller node id than the load.
+ int LoadId = LoadNode->getNodeId();
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = UI->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != 0)
+ continue;
+ if (UI->getNodeId() > LoadId)
+ return false;
+ }
+
ChainOps.push_back(Op);
}
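
The node-id check added above guards the load/op/store fusion documented at isLoadIncOrDecStore(): folding the chain must not create a cycle in the DAG. A sketch of the pattern being fused (illustration, not from the patch; the function name is ours):

    // Illustration only: a read-modify-write through memory. When the
    // load, INC/DEC, and store pass the checks above, they can select
    // to a single memory-operand instruction such as "incl (%rdi)"
    // instead of a separate load, increment, and store.
    void bump(int *p) { *p += 1; }
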
@@ -1938,12 +1953,44 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
llvm_unreachable("unrecognized size for LdVT");
}
+/// SelectGather - Customized ISel for GATHER operations.
+///
+SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+ // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+ SDValue Chain = Node->getOperand(0);
+ SDValue VSrc = Node->getOperand(2);
+ SDValue Base = Node->getOperand(3);
+ SDValue VIdx = Node->getOperand(4);
+ SDValue VMask = Node->getOperand(5);
+ ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+ if (!Scale)
+ return 0;
+
+ SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+ MVT::Other);
+
+ // Memory Operands: Base, Scale, Index, Disp, Segment
+ SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+ const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
+ Disp, Segment, VMask, Chain};
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ VTs, Ops, array_lengthof(Ops));
+ // Node has 2 outputs: VDst and MVT::Other.
+ // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+ // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+ // of ResNode.
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+ return ResNode;
+}
+
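
A source-level view of what SelectGather() consumes, as an illustration (the intrinsic shown is the C counterpart of llvm.x86.avx2.gather.d.pd.256; the wrapper name is ours):

    // Illustration only: a masked AVX2 gather from C. The arguments
    // line up with the VSrc/Base/VIdx/VMask/Scale operands unpacked
    // above; scale must be a compile-time constant (1, 2, 4, or 8),
    // which is why SelectGather() bails out when the scale operand is
    // not a ConstantSDNode.
    #include <immintrin.h>

    __m256d gather4(const double *base, __m128i idx, __m256d src, __m256d mask) {
      return _mm256_mask_i32gather_pd(src, base, idx, mask, 8);
    }
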
SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
EVT NVT = Node->getValueType(0);
unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
DebugLoc dl = Node->getDebugLoc();
-
+
DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
if (Node->isMachineOpcode()) {
@@ -1953,23 +2000,82 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default: break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_avx2_gather_d_pd:
+ case Intrinsic::x86_avx2_gather_d_pd_256:
+ case Intrinsic::x86_avx2_gather_q_pd:
+ case Intrinsic::x86_avx2_gather_q_pd_256:
+ case Intrinsic::x86_avx2_gather_d_ps:
+ case Intrinsic::x86_avx2_gather_d_ps_256:
+ case Intrinsic::x86_avx2_gather_q_ps:
+ case Intrinsic::x86_avx2_gather_q_ps_256:
+ case Intrinsic::x86_avx2_gather_d_q:
+ case Intrinsic::x86_avx2_gather_d_q_256:
+ case Intrinsic::x86_avx2_gather_q_q:
+ case Intrinsic::x86_avx2_gather_q_q_256:
+ case Intrinsic::x86_avx2_gather_d_d:
+ case Intrinsic::x86_avx2_gather_d_d_256:
+ case Intrinsic::x86_avx2_gather_q_d:
+ case Intrinsic::x86_avx2_gather_q_d_256: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
+ case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+ case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
+ case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
+ case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
+ case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
+ case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
+ case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
+ case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
+ }
+ SDNode *RetVal = SelectGather(Node, Opc);
+ if (RetVal)
+ // We already called ReplaceUses inside SelectGather.
+ return NULL;
+ break;
+ }
+ }
+ break;
+ }
case X86ISD::GlobalBaseReg:
return getGlobalBaseReg();
+
case X86ISD::ATOMOR64_DAG:
- return SelectAtomic64(Node, X86::ATOMOR6432);
case X86ISD::ATOMXOR64_DAG:
- return SelectAtomic64(Node, X86::ATOMXOR6432);
case X86ISD::ATOMADD64_DAG:
- return SelectAtomic64(Node, X86::ATOMADD6432);
case X86ISD::ATOMSUB64_DAG:
- return SelectAtomic64(Node, X86::ATOMSUB6432);
case X86ISD::ATOMNAND64_DAG:
- return SelectAtomic64(Node, X86::ATOMNAND6432);
case X86ISD::ATOMAND64_DAG:
- return SelectAtomic64(Node, X86::ATOMAND6432);
- case X86ISD::ATOMSWAP64_DAG:
- return SelectAtomic64(Node, X86::ATOMSWAP6432);
+ case X86ISD::ATOMSWAP64_DAG: {
+ unsigned Opc;
+ switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
+ case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break;
+ case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break;
+ case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break;
+ case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break;
+ case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
+ case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break;
+ case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
+ }
+ SDNode *RetVal = SelectAtomic64(Node, Opc);
+ if (RetVal)
+ return RetVal;
+ break;
+ }
case ISD::ATOMIC_LOAD_ADD: {
SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
@@ -2013,7 +2119,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val)
break;
- unsigned ShlOp, Op = 0;
+ unsigned ShlOp, Op;
EVT CstVT = NVT;
// Check the minimum bitwidth for the new constant.
@@ -2036,6 +2142,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ShlOp = X86::SHL32ri;
switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
case ISD::AND: Op = X86::AND32ri8; break;
case ISD::OR: Op = X86::OR32ri8; break;
case ISD::XOR: Op = X86::XOR32ri8; break;
@@ -2046,6 +2153,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ShlOp = X86::SHL64ri;
switch (Opcode) {
+ default: llvm_unreachable("Impossible opcode");
case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
@@ -2062,7 +2170,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case X86ISD::UMUL: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
-
+
unsigned LoReg;
switch (NVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
@@ -2071,20 +2179,20 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
}
-
+
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
-
+
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
SDValue Ops[] = {N1, InFlag};
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, 2);
-
+
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
return NULL;
}
-
+
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
SDValue N0 = Node->getOperand(0);
@@ -2128,7 +2236,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
- N0, SDValue()).getValue(1);
+ N0, SDValue()).getValue(1);
if (foldedLoad) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
@@ -2168,7 +2276,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- LoReg, NVT, InFlag);
+ LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
@@ -2181,7 +2289,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceUses(SDValue(Node, 1), Result);
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
-
+
return NULL;
}
@@ -2332,7 +2440,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return NULL;
}
- case X86ISD::CMP: {
+ case X86ISD::CMP:
+ case X86ISD::SUB: {
+    // Sometimes a SUB is used to perform a comparison.
+ if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+ // This node is not a CMP.
+ break;
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
@@ -2449,7 +2562,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// a simple increment or decrement through memory of that value, if the
// uses of the modified value and its address are suitable.
// The DEC64m tablegen pattern is currently not able to match the case where
- // the EFLAGS on the original DEC are used. (This also applies to
+ // the EFLAGS on the original DEC are used. (This also applies to
// {INC,DEC}X{64,32,16,8}.)
// We'll need to improve tablegen to allow flags to be transferred from a
  // node in the pattern to the result node, probably with a new keyword
@@ -2481,7 +2594,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
MemOp[0] = StoreNode->getMemOperand();
MemOp[1] = LoadNode->getMemOperand();
const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
- EVT LdVT = LoadNode->getMemoryVT();
+ EVT LdVT = LoadNode->getMemoryVT();
unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
Node->getDebugLoc(),
@@ -2494,6 +2607,85 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return Result;
}
+
+ // FIXME: Custom handling because TableGen doesn't support multiple implicit
+ // defs in an instruction pattern
+ case X86ISD::PCMPESTRI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue N2 = Node->getOperand(2);
+ SDValue N3 = Node->getOperand(3);
+ SDValue N4 = Node->getOperand(4);
+
+ // Make sure last argument is a constant
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N4);
+ if (!Cst)
+ break;
+
+ uint64_t Imm = Cst->getZExtValue();
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ X86::EAX, N1, SDValue()).getValue(1);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
+ N3, InFlag).getValue(1);
+
+ SDValue Ops[] = { N0, N2, getI8Imm(Imm), InFlag };
+ unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr :
+ X86::PCMPESTRIrr;
+ InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
+ array_lengthof(Ops)), 0);
+
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::ECX, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ }
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::EFLAGS, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 1), Result);
+ }
+
+ return NULL;
+ }
+
+ // FIXME: Custom handling because TableGen doesn't support multiple implicit
+ // defs in an instruction pattern
+ case X86ISD::PCMPISTRI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue N2 = Node->getOperand(2);
+
+ // Make sure last argument is a constant
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N2);
+ if (!Cst)
+ break;
+
+ uint64_t Imm = Cst->getZExtValue();
+
+ SDValue Ops[] = { N0, N1, getI8Imm(Imm) };
+ unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr :
+ X86::PCMPISTRIrr;
+ SDValue InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
+ array_lengthof(Ops)), 0);
+
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::ECX, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ }
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::EFLAGS, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 1), Result);
+ }
+
+ return NULL;
+ }
}
SDNode *ResNode = SelectCode(Node);
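
For context on why the two custom cases above exist (illustration, not from the patch): PCMPESTRI/PCMPISTRI produce two live results, the match index in ECX and condition codes in EFLAGS, and TableGen patterns cannot yet express multiple implicit defs. From C, both results are reached through sibling intrinsics over the same instruction:

    // Illustration only: an SSE4.2 string compare consuming both
    // outputs modeled above. A reasonable compiler folds the two
    // intrinsic calls into one PCMPISTRI, then reads ECX and the
    // carry flag.
    #include <nmmintrin.h>

    int find_any(__m128i haystack, __m128i needles, bool *found) {
      const int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;
      *found = _mm_cmpistrc(haystack, needles, mode) != 0; // CF from EFLAGS
      return _mm_cmpistri(haystack, needles, mode);        // index from ECX
    }
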
@@ -2521,7 +2713,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
return true;
break;
}
-
+
OutOps.push_back(Op0);
OutOps.push_back(Op1);
OutOps.push_back(Op2);
@@ -2530,7 +2722,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
return false;
}
-/// createX86ISelDag - This pass converts a legalized DAG into a
+/// createX86ISelDag - This pass converts a legalized DAG into a
/// X86-specific DAG, ready for instruction scheduling.
///
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 04299f3..ea66a61 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -49,6 +49,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
+#include <cctype>
using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
@@ -62,41 +63,33 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
/// simple subregister reference. Idx is an index in the 128 bits we
/// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
- DebugLoc dl) {
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, DebugLoc dl) {
EVT VT = Vec.getValueType();
- assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
+ assert(VT.is256BitVector() && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
- int Factor = VT.getSizeInBits()/128;
+ unsigned Factor = VT.getSizeInBits()/128;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
if (Vec.getOpcode() == ISD::UNDEF)
- return DAG.getNode(ISD::UNDEF, dl, ResultVT);
-
- if (isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return DAG.getUNDEF(ResultVT);
- // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
- // we can match to VEXTRACTF128.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
+ // we can match to VEXTRACTF128.
+ unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
- // This is the index of the first element of the 128-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
- * ElemsPerChunk);
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ * ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
- SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
- VecIdx);
-
- return Result;
- }
+ SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+ VecIdx);
- return SDValue();
+ return Result;
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
@@ -104,34 +97,41 @@ static SDValue Extract128BitVector(SDValue Vec,
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result,
- SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
DebugLoc dl) {
- if (isa<ConstantSDNode>(Idx)) {
- EVT VT = Vec.getValueType();
- assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
+  // Inserting UNDEF into Result is a no-op; return Result unchanged.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
- EVT ElVT = VT.getVectorElementType();
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- EVT ResultVT = Result.getValueType();
+ EVT VT = Vec.getValueType();
+ assert(VT.is128BitVector() && "Unexpected vector size!");
- // Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
- // This is the index of the first element of the 128-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
- * ElemsPerChunk);
+ // Insert the relevant 128 bits.
+ unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
- Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
- VecIdx);
- return Result;
- }
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+ * ElemsPerChunk);
- return SDValue();
+ SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+ VecIdx);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
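
The new Concat128BitVectors() mirrors a shape that is also expressible with AVX intrinsics; a sketch under that assumption (names ours, not from the patch):

    // Illustration only: build a 256-bit value by inserting two
    // 128-bit halves into an undefined vector, the same
    // INSERT_SUBVECTOR chain Concat128BitVectors() emits, which maps
    // onto VINSERTF128.
    #include <immintrin.h>

    __m256 concat(__m128 lo, __m128 hi) {
      __m256 v = _mm256_insertf128_ps(_mm256_undefined_ps(), lo, 0);
      return _mm256_insertf128_ps(v, hi, 1);
    }
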
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
@@ -140,10 +140,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
if (Subtarget->isTargetEnvMacho()) {
if (is64Bit)
- return new X8664_MachoTargetObjectFile();
+ return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
}
+ if (Subtarget->isTargetLinux())
+ return new X86LinuxTargetObjectFile();
if (Subtarget->isTargetELF())
return new TargetLoweringObjectFileELF();
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
@@ -162,7 +164,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
TD = getTargetData();
// Set up the TargetLowering object.
- static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
+ static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
// X86 is weird, it always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
@@ -171,11 +173,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// For 64-bit since we have so many registers use the ILP scheduler, for
// 32-bit code use the register pressure specific scheduling.
- // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling.
- if (Subtarget->is64Bit())
+ // For Atom, always use ILP scheduling.
+ if (Subtarget->isAtom())
+ setSchedulingPreference(Sched::ILP);
+ else if (Subtarget->is64Bit())
setSchedulingPreference(Sched::ILP);
- else if (Subtarget->isAtom())
- setSchedulingPreference(Sched::Hybrid);
else
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -215,11 +217,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Set up the register classes.
- addRegisterClass(MVT::i8, X86::GR8RegisterClass);
- addRegisterClass(MVT::i16, X86::GR16RegisterClass);
- addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+ addRegisterClass(MVT::i8, &X86::GR8RegClass);
+ addRegisterClass(MVT::i16, &X86::GR16RegClass);
+ addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget->is64Bit())
- addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+ addRegisterClass(MVT::i64, &X86::GR64RegClass);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -345,7 +347,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
- for (unsigned i = 0, e = 4; i != e; ++i) {
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
@@ -492,7 +494,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setShouldFoldAtomicFences(true);
// Expand certain atomics
- for (unsigned i = 0, e = 4; i != e; ++i) {
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
@@ -567,8 +569,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, X86::FR32RegisterClass);
- addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::FR64RegClass);
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS , MVT::f64, Custom);
@@ -599,8 +601,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
} else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, X86::FR32RegisterClass);
- addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
@@ -632,8 +634,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
} else if (!TM.Options.UseSoftFloat) {
// f32 and f64 in x87.
// Set up the FP register classes.
- addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
- addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+ addRegisterClass(MVT::f32, &X86::RFP32RegClass);
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
setOperationAction(ISD::UNDEF, MVT::f32, Expand);
@@ -660,7 +662,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Long double always uses X87.
if (!TM.Options.UseSoftFloat) {
- addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+ addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
@@ -705,8 +707,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+ for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) {
setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
@@ -729,6 +731,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
@@ -764,8 +767,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand);
- for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction((MVT::SimpleValueType)VT,
(MVT::SimpleValueType)InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
@@ -776,7 +779,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
- addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
+ addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
@@ -813,7 +816,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
- addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
@@ -826,18 +829,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
- addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
// FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
// registers cannot be used even for integer operations.
- addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
- addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
- addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
- addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
setOperationAction(ISD::ADD, MVT::v16i8, Legal);
setOperationAction(ISD::ADD, MVT::v8i16, Legal);
@@ -867,27 +869,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
-
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
- for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
- EVT VT = (MVT::SimpleValueType)i;
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Do not attempt to custom lower non-power-of-2 vectors
if (!isPowerOf2_32(VT.getVectorNumElements()))
continue;
// Do not attempt to custom lower non-128-bit vectors
if (!VT.is128BitVector())
continue;
- setOperationAction(ISD::BUILD_VECTOR,
- VT.getSimpleVT().SimpleTy, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE,
- VT.getSimpleVT().SimpleTy, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT,
- VT.getSimpleVT().SimpleTy, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
@@ -903,24 +896,23 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
- for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
- MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
- EVT VT = SVT;
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Do not attempt to promote non-128-bit vectors
if (!VT.is128BitVector())
continue;
- setOperationAction(ISD::AND, SVT, Promote);
- AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
- setOperationAction(ISD::OR, SVT, Promote);
- AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
- setOperationAction(ISD::XOR, SVT, Promote);
- AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
- setOperationAction(ISD::LOAD, SVT, Promote);
- AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
- setOperationAction(ISD::SELECT, SVT, Promote);
- AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v2i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
}
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -1007,16 +999,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
- if (Subtarget->hasSSE42())
- setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
-
if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
- addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
- addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
@@ -1040,13 +1029,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
-
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
@@ -1070,6 +1052,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ if (Subtarget->hasFMA()) {
+ setOperationAction(ISD::FMA, MVT::v8f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f64, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v2f64, Custom);
+ setOperationAction(ISD::FMA, MVT::f32, Custom);
+ setOperationAction(ISD::FMA, MVT::f64, Custom);
+ }
+
if (Subtarget->hasAVX2()) {
setOperationAction(ISD::ADD, MVT::v4i64, Legal);
setOperationAction(ISD::ADD, MVT::v8i32, Legal);
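
A note on the ISD::FMA actions added in the hunk above (a sketch assuming an FMA-capable subtarget, e.g. built with -mfma; not code from the patch): a fused multiply-add written explicitly in source reaches the backend as ISD::FMA, and the Custom lowering lets it select a vfmadd* form.

    // Illustration only: std::fma guarantees a single rounding, so it
    // lowers to ISD::FMA; with the Custom handling above and FMA
    // hardware it can select to a single vfmadd213sd.
    #include <cmath>

    double fused(double a, double b, double c) {
      return std::fma(a, b, c);
    }
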
@@ -1121,60 +1112,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Custom lower several nodes for 256-bit types.
- for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
- EVT VT = SVT;
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
if (VT.is128BitVector())
- setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Do not attempt to custom lower other non-256-bit vectors
if (!VT.is256BitVector())
continue;
- setOperationAction(ISD::BUILD_VECTOR, SVT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
- for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
- MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
- EVT VT = SVT;
+ for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Do not attempt to promote non-256-bit vectors
if (!VT.is256BitVector())
continue;
- setOperationAction(ISD::AND, SVT, Promote);
- AddPromotedToType (ISD::AND, SVT, MVT::v4i64);
- setOperationAction(ISD::OR, SVT, Promote);
- AddPromotedToType (ISD::OR, SVT, MVT::v4i64);
- setOperationAction(ISD::XOR, SVT, Promote);
- AddPromotedToType (ISD::XOR, SVT, MVT::v4i64);
- setOperationAction(ISD::LOAD, SVT, Promote);
- AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64);
- setOperationAction(ISD::SELECT, SVT, Promote);
- AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v4i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v4i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
}
}
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
// of this type with custom code.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
+ for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+ VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
Custom);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
@@ -1218,17 +1209,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
- if (Subtarget->hasBMI())
- setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::XOR);
computeRegisterProperties();
@@ -1243,6 +1238,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setPrefLoopAlignment(4); // 2^4 bytes.
benefitFromCodePlacementOpt = true;
+  // Predictable cmovs don't hurt on Atom because it's in-order.
+ predictableSelectIsExpensive = !Subtarget->isAtom();
+
setPrefFunctionAlignment(4); // 2^4 bytes.
}
@@ -1276,7 +1274,6 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
break;
}
}
- return;
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -1411,18 +1408,19 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{
default:
return TargetLowering::findRepresentativeClass(VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
- RRC = (Subtarget->is64Bit()
- ? X86::GR64RegisterClass : X86::GR32RegisterClass);
+ RRC = Subtarget->is64Bit() ?
+ (const TargetRegisterClass*)&X86::GR64RegClass :
+ (const TargetRegisterClass*)&X86::GR32RegClass;
break;
case MVT::x86mmx:
- RRC = X86::VR64RegisterClass;
+ RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
case MVT::v4f64:
- RRC = X86::VR128RegisterClass;
+ RRC = &X86::VR128RegClass;
break;
}
return std::make_pair(RRC, Cost);
@@ -1457,7 +1455,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
+ MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
@@ -1501,6 +1499,16 @@ X86TargetLowering::LowerReturn(SDValue Chain,
SDValue ValToCopy = OutVals[i];
EVT ValVT = ValToCopy.getValueType();
+ // Promote values to the appropriate types
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt)
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
@@ -1638,7 +1646,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget->is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
@@ -1655,7 +1663,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SDValue Val;
// If this is a call to a function that returns an fp value on the floating
- // point stack, we must guarantee the the value is popped from the stack, so
+ // point stack, we must guarantee the value is popped from the stack, so
// a CopyFromReg is not good enough - the copy instruction may be eliminated
// if the return value is not used. We use the FpPOP_RETVAL instruction
// instead.
@@ -1699,21 +1707,37 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
-static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+enum StructReturnType {
+ NotStructReturn,
+ RegStructReturn,
+ StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
if (Outs.empty())
- return false;
+ return NotStructReturn;
- return Outs[0].Flags.isSRet();
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
}
/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
-static bool
-ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
if (Ins.empty())
- return false;
+ return NotStructReturn;
- return Ins[0].Flags.isSRet();
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
}
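
For background on the three-way classification above (illustration, not from the patch): an aggregate return that does not fit in registers is lowered with a hidden sret pointer. On 32-bit x86 the callee normally pops that hidden pointer (StackStructReturn); if the ABI marks the sret argument inreg, the new RegStructReturn case applies and nothing extra is popped.

    // Illustration only: a return type too large for registers. The
    // caller passes a hidden pointer to storage for "Big"; whether the
    // callee pops that pointer is what callIsStructReturn() now
    // distinguishes.
    struct Big { long long v[4]; };

    Big make_big(long long x) {
      Big b = { { x, x, x, x } };
      return b;
    }
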
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
@@ -1850,19 +1874,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
EVT RegVT = VA.getLocVT();
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
else if (RegVT == MVT::f64)
- RC = X86::FR64RegisterClass;
- else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
- RC = X86::VR256RegisterClass;
- else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
- RC = X86::VR128RegisterClass;
+ RC = &X86::FR64RegClass;
+ else if (RegVT.is256BitVector())
+ RC = &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
- RC = X86::VR64RegisterClass;
+ RC = &X86::VR64RegClass;
else
llvm_unreachable("Unknown argument type!");
@@ -2004,7 +2028,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
DAG.getIntPtrConstant(Offset));
unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -2020,7 +2044,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<SDValue, 11> SaveXMMOps;
SaveXMMOps.push_back(Chain);
- unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+ unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
SaveXMMOps.push_back(ALVal);
@@ -2031,7 +2055,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
- X86::VR128RegisterClass);
+ &X86::VR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
SaveXMMOps.push_back(Val);
}
@@ -2054,7 +2078,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
- ArgsAreStructReturn(Ins))
+ argsAreStructReturn(Ins) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -2127,19 +2151,24 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
}
SDValue
-X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &isTailCall = CLI.IsTailCall;
+ bool isVarArg = CLI.IsVarArg;
+
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isTargetWin64();
bool IsWindows = Subtarget->isTargetWindows();
- bool IsStructRet = CallIsStructReturn(Outs);
+ StructReturnType SR = callIsStructReturn(Outs);
bool IsSibcall = false;
if (MF.getTarget().Options.DisableTailCalls)
@@ -2148,8 +2177,9 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
- Outs, OutVals, Ins, DAG);
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction()->hasStructRetAttr(),
+ Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
@@ -2231,7 +2261,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
- if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
+ if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
@@ -2282,27 +2312,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Build a sequence of copy-to-reg nodes chained together with token chain
- // and flag operands which copy the outgoing args into registers.
- SDValue InFlag;
- // Tail call byval lowering might overwrite argument registers so in case of
- // tail call optimization the copies to registers are lowered later.
- if (!isTailCall)
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
-
if (Subtarget->isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
- Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
- InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -2340,12 +2355,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
- Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
- DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, MVT::i8)));
}
-
// For tail calls lower the arguments to the 'real' stack slot.
if (isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
@@ -2359,8 +2372,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- // Do not flag preceding copytoreg stuff together with the following stuff.
- InFlag = SDValue();
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -2400,19 +2411,20 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains2[0], MemOpChains2.size());
- // Copy arguments to their registers.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
- InFlag =SDValue();
-
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
FPDiff, dl);
}
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
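+ // Doing this after the tail-call argument lowering above keeps byval
+ // lowering from clobbering the argument registers before they are read.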
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
@@ -2514,14 +2526,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
- // Add an implicit use GOT pointer in EBX.
- if (!isTailCall && Subtarget->isPICStyleGOT())
- Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
-
- // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
- if (Is64Bit && isVarArg && !IsWin64)
- Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
-
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
@@ -2551,7 +2555,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
getTargetMachine().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
- IsStructRet)
+ SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
@@ -2743,7 +2747,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2764,7 +2768,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -2778,12 +2782,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs1, *DAG.getContext());
+ getTargetMachine(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs2, *DAG.getContext());
+ getTargetMachine(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
if (RVLocs1.size() != RVLocs2.size())
@@ -2810,7 +2814,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
if (Subtarget->isTargetWin64()) {
@@ -2872,8 +2876,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
FastISel *
-X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
- return X86::createFastISel(funcInfo);
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return X86::createFastISel(funcInfo, libInfo);
}
@@ -2911,6 +2916,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::UNPCKH:
case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:
+ case X86ISD::VPERMI:
return true;
}
}
@@ -3051,10 +3057,12 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, RHS.getValueType());
return X86::COND_NS;
- } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
- } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, RHS.getValueType());
return X86::COND_LE;
@@ -3170,12 +3178,12 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
return false;
}
-/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// from position Pos and ending at Pos+Size, falls within the specified
/// sequential range [L, L+Size), or is undef.
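/// For example (illustrative values): Mask == <4, -1, 6, 7> with Pos == 0,
/// Size == 4 and Low == 4 returns true, since every defined element follows
/// the sequence 4, 5, 6, 7.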
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
- int Pos, int Size, int Low) {
- for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ unsigned Pos, unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
@@ -3194,8 +3202,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
- if (VT != MVT::v8i16)
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+ if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
return false;
// Lower quadword copied in order or undef.
@@ -3204,16 +3212,27 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
// Upper quadword shuffled.
for (unsigned i = 4; i != 8; ++i)
- if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+ if (!isUndefOrInRange(Mask[i], 4, 8))
return false;
+ if (VT == MVT::v16i16) {
+ // Lower quadword copied in order or undef.
+ if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
+ return false;
+
+ // Upper quadword shuffled.
+ for (unsigned i = 12; i != 16; ++i)
+ if (!isUndefOrInRange(Mask[i], 12, 16))
+ return false;
+ }
+
return true;
}
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
- if (VT != MVT::v8i16)
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+ if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
return false;
// Upper quadword copied in order.
@@ -3222,9 +3241,20 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
// Lower quadword shuffled.
for (unsigned i = 0; i != 4; ++i)
- if (Mask[i] >= 4)
+ if (!isUndefOrInRange(Mask[i], 0, 4))
return false;
+ if (VT == MVT::v16i16) {
+ // Upper quadword copied in order.
+ if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
+ return false;
+
+ // Lower quadword shuffled.
+ for (unsigned i = 8; i != 12; ++i)
+ if (!isUndefOrInRange(Mask[i], 8, 12))
+ return false;
+ }
+
return true;
}
@@ -3374,11 +3404,11 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
- unsigned NumElems = VT.getVectorNumElements();
-
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
+ unsigned NumElems = VT.getVectorNumElements();
+
if (NumElems != 4)
return false;
@@ -3393,11 +3423,11 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
- unsigned NumElems = VT.getVectorNumElements();
-
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
+ unsigned NumElems = VT.getVectorNumElements();
+
if (NumElems != 4)
return false;
@@ -3410,7 +3440,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
unsigned NumElems = VT.getVectorNumElements();
@@ -3418,11 +3448,11 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
if (NumElems != 2 && NumElems != 4)
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i + NumElems))
return false;
- for (unsigned i = NumElems/2; i != NumElems; ++i)
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
@@ -3432,23 +3462,71 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
unsigned NumElems = VT.getVectorNumElements();
- if ((NumElems != 2 && NumElems != 4)
- || VT.getSizeInBits() > 128)
+ if (NumElems != 2 && NumElems != 4)
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
- if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems))
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i + e], i + NumElems))
return false;
return true;
}
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ DebugLoc dl = SVOp->getDebugLoc();
+
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return SDValue();
+
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ // These are the special masks that may be optimized.
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ bool MatchEvenMask = true;
+ bool MatchOddMask = true;
+ for (int i = 0; i != 8; ++i) {
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+ MatchEvenMask = false;
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+ MatchOddMask = false;
+ }
+ static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
+ static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
+
+ const int *CompactionMask;
+ if (MatchEvenMask)
+ CompactionMask = CompactionMaskEven;
+ else if (MatchOddMask)
+ CompactionMask = CompactionMaskOdd;
+ else
+ return SDValue();
+
+ SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+ SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
+ UndefNode, CompactionMask);
+ SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
+ UndefNode, CompactionMask);
+ static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+}
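+
+// Worked example (illustrative values): for the even mask
+// <0,8,2,10,4,12,6,14>, each operand is first compacted with
+// <0,2,-1,-1,4,6,-1,-1>, leaving V1's even elements in positions 0,1,4,5
+// of Op0 and V2's even elements in the same positions of Op1; the final
+// shuffle <0,8,1,9,4,12,5,13> then interleaves Op0 and Op1 back into the
+// requested order.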
+
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
@@ -3606,7 +3684,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
if (VT.getVectorElementType().getSizeInBits() < 32)
return false;
- if (VT.getSizeInBits() == 256)
+ if (!VT.is128BitVector())
return false;
unsigned NumElts = VT.getVectorNumElements();
@@ -3628,7 +3706,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
/// The first half comes from the second half of V1 and the second half from the
/// second half of V2.
static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
- if (!HasAVX || VT.getSizeInBits() != 256)
+ if (!HasAVX || !VT.is256BitVector())
return false;
// The shuffle result is divided into half A and half B. In total the two
@@ -3720,9 +3798,10 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
/// element of vector 2 and the other elements to come from vector 1 in order.
static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
bool V2IsSplat = false, bool V2IsUndef = false) {
- unsigned NumOps = VT.getVectorNumElements();
- if (VT.getSizeInBits() == 256)
+ if (!VT.is128BitVector())
return false;
+
+ unsigned NumOps = VT.getVectorNumElements();
if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
return false;
@@ -3788,9 +3867,11 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// version of MOVDDUP.
static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (!HasAVX || !VT.is256BitVector())
+ return false;
- if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts != 4)
return false;
for (unsigned i = 0; i != NumElts/2; ++i)
@@ -3806,7 +3887,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
/// specifies a shuffle of elements that is suitable for input to 128-bit
/// version of MOVDDUP.
static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
unsigned e = VT.getVectorNumElements() / 2;
@@ -3880,9 +3961,8 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
for (unsigned i = 0; i != NumElts; ++i) {
int Elt = N->getMaskElt(i);
if (Elt < 0) continue;
- Elt %= NumLaneElts;
- unsigned ShAmt = i << Shift;
- if (ShAmt >= 8) ShAmt -= 8;
+ Elt &= NumLaneElts - 1;
+ unsigned ShAmt = (i << Shift) % 8;
Mask |= Elt << ShAmt;
}
@@ -3892,30 +3972,48 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+ "Unsupported vector type for PSHUFHW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
unsigned Mask = 0;
- // 8 nodes, but we only care about the last 4.
- for (unsigned i = 7; i >= 4; --i) {
- int Val = N->getMaskElt(i);
- if (Val >= 0)
- Mask |= (Val - 4);
- if (i != 4)
- Mask <<= 2;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the last 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i+4);
+ if (Elt < 0) continue;
+ Elt &= 0x3; // keep only 2 bits.
+ Mask |= Elt << (i * 2);
+ }
}
+
return Mask;
}
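
// Worked example (illustrative values): a v8i16 mask <u,u,u,u,7,6,5,4>
// visits elements 4..7 in reverse, storing 3,2,1,0 at bit pairs 0,2,4,6
// for an immediate of 0b00011011 = 0x1B; the identity upper half
// <u,u,u,u,4,5,6,7> encodes as 0xE4.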
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+ "Unsupported vector type for PSHUFHW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
unsigned Mask = 0;
- // 8 nodes, but we only care about the first 4.
- for (int i = 3; i >= 0; --i) {
- int Val = N->getMaskElt(i);
- if (Val >= 0)
- Mask |= Val;
- if (i != 0)
- Mask <<= 2;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the first 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i);
+ if (Elt < 0) continue;
+ Elt &= 0x3; // keep only 2 bits.
+ Mask |= Elt << (i * 2);
+ }
}
+
return Mask;
}
@@ -4016,13 +4114,14 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i != NumElems; ++i) {
- int idx = SVOp->getMaskElt(i);
- if (idx < 0)
- MaskVec.push_back(idx);
- else if (idx < (int)NumElems)
- MaskVec.push_back(idx + NumElems);
- else
- MaskVec.push_back(idx - NumElems);
+ int Idx = SVOp->getMaskElt(i);
+ if (Idx >= 0) {
+ if (Idx < (int)NumElems)
+ Idx += NumElems;
+ else
+ Idx -= NumElems;
+ }
+ MaskVec.push_back(Idx);
}
return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
SVOp->getOperand(0), &MaskVec[0]);
@@ -4033,7 +4132,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
if (VT.getVectorNumElements() != 4)
return false;
@@ -4090,7 +4189,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
@@ -4107,7 +4206,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
- for (unsigned i = NumElems/2; i != NumElems; ++i)
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i+NumElems))
return false;
return true;
@@ -4159,11 +4258,12 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
+ unsigned Size = VT.getSizeInBits();
// Always build SSE zero vectors as <4 x i32> bitcasted
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
- if (VT.getSizeInBits() == 128) { // SSE
+ if (Size == 128) { // SSE
if (Subtarget->hasSSE2()) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -4171,7 +4271,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
- } else if (VT.getSizeInBits() == 256) { // AVX
+ } else if (Size == 256) { // AVX
if (Subtarget->hasAVX2()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
@@ -4183,7 +4283,9 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
}
- }
+ } else
+ llvm_unreachable("Unexpected vector type");
+
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@@ -4194,25 +4296,22 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- assert((VT.is128BitVector() || VT.is256BitVector())
- && "Expected a 128-bit or 256-bit vector type");
+ unsigned Size = VT.getSizeInBits();
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
SDValue Vec;
- if (VT.getSizeInBits() == 256) {
+ if (Size == 256) {
if (HasAVX2) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
} else { // AVX
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
- Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
- Vec = Insert128BitVector(InsV, Vec,
- DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
}
- } else {
+ } else if (Size == 128) {
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- }
+ } else
+ llvm_unreachable("Unexpected vector type");
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@@ -4255,9 +4354,8 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
- unsigned Half = NumElems/2;
SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != Half; ++i) {
+ for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
Mask.push_back(i + Half);
Mask.push_back(i + NumElems + Half);
}
@@ -4289,15 +4387,14 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
EVT VT = V.getValueType();
DebugLoc dl = V.getDebugLoc();
- assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
- && "Vector size not supported");
+ unsigned Size = VT.getSizeInBits();
- if (VT.getSizeInBits() == 128) {
+ if (Size == 128) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
&SplatMask[0]);
- } else {
+ } else if (Size == 256) {
 // To use VPERMILPS to splat scalars, the second half of indices must
// refer to the higher part, which is a duplication of the lower one,
// because VPERMILPS can only handle in-lane permutations.
@@ -4307,7 +4404,8 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
&SplatMask[0]);
- }
+ } else
+ llvm_unreachable("Vector size not supported");
return DAG.getNode(ISD::BITCAST, dl, VT, V);
}
@@ -4328,9 +4426,8 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
if (Size == 256) {
- unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
- V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
- if (Idx > 0)
+ V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+ if (EltNo >= NumElems/2)
EltNo -= NumElems/2;
}
@@ -4346,10 +4443,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// into the low and high part. This is necessary because we want
// to use VPERM* to shuffle the vectors
if (Size == 256) {
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- V1 = Insert128BitVector(InsV, V1,
- DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
}
return getLegalSplat(DAG, V1, EltNo);
@@ -4377,7 +4471,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated.
/// Sets IsUnary to true if it only uses one source.
-static bool getTargetShuffleMask(SDNode *N, EVT VT,
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
SDValue ImmN;
@@ -4408,12 +4502,17 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT,
break;
case X86ISD::PSHUFHW:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VPERMI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -4473,20 +4572,21 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- unsigned NumElems = VT.getVectorNumElements();
+ MVT ShufVT = V.getValueType().getSimpleVT();
+ unsigned NumElems = ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SDValue ImmN;
bool IsUnary;
- if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt < 0)
- return DAG.getUNDEF(VT.getVectorElementType());
+ return DAG.getUNDEF(ShufVT.getVectorElementType());
SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
- : N->getOperand(1);
+ : N->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
@@ -4631,7 +4731,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
 // Although the logic below supports any bit width, there are no
// shift instructions which handle more than 128-bit vectors.
- if (SVOp->getValueType(0).getSizeInBits() > 128)
+ if (!SVOp->getValueType(0).is128BitVector())
return false;
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -4726,7 +4826,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
- assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
+ assert(VT.is128BitVector() && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
@@ -4794,7 +4894,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
@@ -4802,7 +4902,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
false, false, false, 0);
SmallVector<int, 8> Mask;
- for (int i = 0; i < NumElems; ++i)
+ for (unsigned i = 0; i != NumElems; ++i)
Mask.push_back(EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
@@ -4866,8 +4966,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
LDBase->getPointerInfo(),
LDBase->isVolatile(), LDBase->isNonTemporal(),
LDBase->isInvariant(), LDBase->getAlignment());
- } else if (NumElems == 4 && LastLoadedElt == 1 &&
- DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+ }
+ if (NumElems == 4 && LastLoadedElt == 1 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
@@ -4896,6 +4997,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for broadcast.");
+
SDValue Ld;
bool ConstSplatVal;
@@ -4930,8 +5034,17 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
return SDValue();
SDValue Sc = Op.getOperand(0);
- if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
- return SDValue();
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+ if (!Subtarget->hasAVX2())
+ return SDValue();
+
+ // Use the register form of the broadcast instruction available on AVX2.
+ if (VT.is256BitVector())
+ Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+ }
Ld = Sc.getOperand(0);
ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
@@ -4946,8 +5059,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
}
}
- bool Is256 = VT.getSizeInBits() == 256;
- bool Is128 = VT.getSizeInBits() == 128;
+ bool Is256 = VT.is256BitVector();
 // Handle broadcasting a single constant scalar from the constant pool
// into a vector. On Sandybridge it is still better to load a constant vector
@@ -4957,9 +5069,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
assert(!CVT.isVector() && "Must not broadcast a vector type");
unsigned ScalarSize = CVT.getSizeInBits();
- if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) ||
- (Is128 && (ScalarSize == 32))) {
-
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
const Constant *C = 0;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
@@ -4971,40 +5081,32 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
SDValue CP = DAG.getConstantPool(C, getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
- // The scalar source must be a normal load.
- if (!ISD::isNormalLoad(Ld.getNode()))
- return SDValue();
-
- // Reject loads that have uses of the chain result
- if (Ld->hasAnyUseOfValue(1))
- return SDValue();
-
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
unsigned ScalarSize = Ld.getValueType().getSizeInBits();
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
+ // Handle AVX2 in-register broadcasts.
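+ // (VBROADCAST with a register source is an AVX2-only form; the AVX1
+ // variants accept only memory operands.)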
+ if (!IsLoad && Subtarget->hasAVX2() &&
+ (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 32))
+ // The scalar source must be a normal load.
+ if (!IsLoad)
+ return SDValue();
+
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
 // The integer check is needed for the 64-bit into 128-bit case so it doesn't match
- // double since there is vbroadcastsd xmm
+ // double since there is no vbroadcastsd xmm
if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
-
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
+ if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
@@ -5102,8 +5204,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Mask.push_back(Idx);
for (unsigned i = 1; i != VecElts; ++i)
Mask.push_back(i);
- Item = DAG.getVectorShuffle(VecVT, dl, Item,
- DAG.getUNDEF(Item.getValueType()),
+ Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
&Mask[0]);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5120,12 +5221,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
Item, DAG.getIntPtrConstant(0));
}
- assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
@@ -5134,12 +5235,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
- Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
- DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
} else {
- assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5171,7 +5271,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i < NumElems; i++)
+ for (unsigned i = 0; i != NumElems; ++i)
MaskVec.push_back(i == Idx ? 0 : 1);
return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
}
@@ -5199,7 +5299,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
SmallVector<SDValue, 32> V;
for (unsigned i = 0; i != NumElems; ++i)
V.push_back(Op.getOperand(i));
@@ -5212,10 +5312,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
NumElems/2);
// Recreate the wider vector with the lower and upper part.
- SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
@@ -5283,7 +5380,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
}
- if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+ if (Values.size() > 1 && VT.is128BitVector()) {
// Check for a build vector of consecutive loads.
for (unsigned i = 0; i < NumElems; ++i)
V[i] = Op.getOperand(i);
@@ -5344,62 +5441,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
-// them in a MMX register. This is better than doing a stack convert.
-static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
- EVT ResVT = Op.getValueType();
-
- assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
- ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
- int Mask[2];
- SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
- SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
- InVec = Op.getOperand(1);
- if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
- unsigned NumElts = ResVT.getVectorNumElements();
- VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
- VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
- InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
- } else {
- InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
- SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
- Mask[0] = 0; Mask[1] = 2;
- VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
- }
- return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
-}
-
// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
EVT ResVT = Op.getValueType();
- assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+ assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
SDValue
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- EVT ResVT = Op.getValueType();
-
assert(Op.getNumOperands() == 2);
- assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
- "Unsupported CONCAT_VECTORS for value type");
-
- // We support concatenate two MMX registers and place them in a MMX register.
- // This is better than doing a stack convert.
- if (ResVT.is128BitVector())
- return LowerMMXCONCAT_VECTORS(Op, DAG);
// 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
@@ -5407,75 +5466,64 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
}
// Try to lower a shuffle node into a simple blend instruction.
-static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op,
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- EVT VT = Op.getValueType();
- EVT InVT = V1.getValueType();
- int MaskSize = VT.getVectorNumElements();
- int InSize = InVT.getVectorNumElements();
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ unsigned NumElems = VT.getVectorNumElements();
if (!Subtarget->hasSSE41())
return SDValue();
- if (MaskSize != InSize)
- return SDValue();
-
- int ISDNo = 0;
+ unsigned ISDNo = 0;
MVT OpTy;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v8i16:
- ISDNo = X86ISD::BLENDPW;
- OpTy = MVT::v8i16;
- break;
+ ISDNo = X86ISD::BLENDPW;
+ OpTy = MVT::v8i16;
+ break;
case MVT::v4i32:
case MVT::v4f32:
- ISDNo = X86ISD::BLENDPS;
- OpTy = MVT::v4f32;
- break;
+ ISDNo = X86ISD::BLENDPS;
+ OpTy = MVT::v4f32;
+ break;
case MVT::v2i64:
case MVT::v2f64:
- ISDNo = X86ISD::BLENDPD;
- OpTy = MVT::v2f64;
- break;
+ ISDNo = X86ISD::BLENDPD;
+ OpTy = MVT::v2f64;
+ break;
case MVT::v8i32:
case MVT::v8f32:
- if (!Subtarget->hasAVX())
- return SDValue();
- ISDNo = X86ISD::BLENDPS;
- OpTy = MVT::v8f32;
- break;
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ ISDNo = X86ISD::BLENDPS;
+ OpTy = MVT::v8f32;
+ break;
case MVT::v4i64:
case MVT::v4f64:
- if (!Subtarget->hasAVX())
- return SDValue();
- ISDNo = X86ISD::BLENDPD;
- OpTy = MVT::v4f64;
- break;
- case MVT::v16i16:
- if (!Subtarget->hasAVX2())
- return SDValue();
- ISDNo = X86ISD::BLENDPW;
- OpTy = MVT::v16i16;
- break;
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ ISDNo = X86ISD::BLENDPD;
+ OpTy = MVT::v4f64;
+ break;
}
assert(ISDNo && "Invalid Op Number");
unsigned MaskVals = 0;
- for (int i = 0; i < MaskSize; ++i) {
+ for (unsigned i = 0; i != NumElems; ++i) {
int EltIdx = SVOp->getMaskElt(i);
- if (EltIdx == i || EltIdx == -1)
+ if (EltIdx == (int)i || EltIdx < 0)
MaskVals |= (1<<i);
- else if (EltIdx == (i + MaskSize))
+ else if (EltIdx == (int)(i + NumElems))
 continue; // Bit is set to zero.
- else return SDValue();
+ else
+ return SDValue();
}
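 // Worked example (illustrative values): a v4f32 shuffle with mask <0,5,2,7>
 // takes elements 0 and 2 from V1 and elements 1 and 3 from V2, so the loop
 // above yields MaskVals == 0b0101.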
V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
@@ -5629,13 +5677,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
bool TwoInputs = V1Used && V2Used;
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
- if (TwoInputs && (EltIdx >= 16)) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+ int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
+ int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
}
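 // An index byte of 0x80 has its high bit set, which PSHUFB treats as
 // "zero this destination byte"; that masks out lanes sourced from the
 // other input before the two half-results are merged.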
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
@@ -5649,13 +5694,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
pshufbMask.clear();
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
- if (EltIdx < 16) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+ int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
}
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
@@ -5731,10 +5773,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
- SDValue ExtOp = (EltIdx < 8)
- ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
- DAG.getIntPtrConstant(EltIdx))
- : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+ SDValue ExtOp = (EltIdx < 8) ?
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+ DAG.getIntPtrConstant(EltIdx)) :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
DAG.getIntPtrConstant(EltIdx - 8));
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
DAG.getIntPtrConstant(i));
@@ -5755,21 +5797,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
DebugLoc dl = SVOp->getDebugLoc();
ArrayRef<int> MaskVals = SVOp->getMask();
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
- // FIXME: kill V2Only once shuffles are canonicalized by getNode.
- bool V1Only = true;
- bool V2Only = true;
- for (unsigned i = 0; i < 16; ++i) {
- int EltIdx = MaskVals[i];
- if (EltIdx < 0)
- continue;
- if (EltIdx < 16)
- V2Only = false;
- else
- V1Only = false;
- }
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
@@ -5781,23 +5813,16 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
// so that we can OR them together.
- bool TwoInputs = !(V1Only || V2Only);
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
- if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
+ if (EltIdx < 0 || EltIdx >= 16)
+ EltIdx = 0x80;
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
- // If all the elements are from V2, assign it to V1 and return after
- // building the first pshufb.
- if (V2Only)
- V1 = V2;
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
- if (!TwoInputs)
+ if (V2IsUndef)
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
@@ -5805,11 +5830,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
pshufbMask.clear();
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
- if (EltIdx < 16) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
@@ -5822,7 +5844,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
// the 16 different words that comprise the two doublequadword input vectors.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
- SDValue NewV = V2Only ? V2 : V1;
+ SDValue NewV = V1;
for (int i = 0; i != 8; ++i) {
int Elt0 = MaskVals[i*2];
int Elt1 = MaskVals[i*2+1];
@@ -5832,9 +5854,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
continue;
// This word of the result is already in the correct place, skip it.
- if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
- continue;
- if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+ if ((Elt0 == i*2) && (Elt1 == i*2+1))
continue;
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
@@ -5896,41 +5916,37 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG, DebugLoc dl) {
- EVT VT = SVOp->getValueType(0);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
unsigned NumElems = VT.getVectorNumElements();
- unsigned NewWidth = (NumElems == 4) ? 2 : 4;
- EVT NewVT;
- switch (VT.getSimpleVT().SimpleTy) {
+ MVT NewVT;
+ unsigned Scale;
+ switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected!");
- case MVT::v4f32: NewVT = MVT::v2f64; break;
- case MVT::v4i32: NewVT = MVT::v2i64; break;
- case MVT::v8i16: NewVT = MVT::v4i32; break;
- case MVT::v16i8: NewVT = MVT::v4i32; break;
+ case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
+ case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
+ case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
+ case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
+ case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+ case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
}
- int Scale = NumElems / NewWidth;
SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i < NumElems; i += Scale) {
+ for (unsigned i = 0; i != NumElems; i += Scale) {
int StartIdx = -1;
- for (int j = 0; j < Scale; ++j) {
+ for (unsigned j = 0; j != Scale; ++j) {
int EltIdx = SVOp->getMaskElt(i+j);
if (EltIdx < 0)
continue;
- if (StartIdx == -1)
- StartIdx = EltIdx - (EltIdx % Scale);
- if (EltIdx != StartIdx + j)
+ if (StartIdx < 0)
+ StartIdx = (EltIdx / Scale);
+ if (EltIdx != (int)(StartIdx*Scale + j))
return SDValue();
}
- if (StartIdx == -1)
- MaskVec.push_back(-1);
- else
- MaskVec.push_back(StartIdx / Scale);
+ MaskVec.push_back(StartIdx);
}
- V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+ SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+ SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}
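
// Worked example (illustrative values): a v8i16 mask <0,1,2,3,8,9,10,11>
// moves whole 32-bit pairs, so with Scale == 2 it becomes the v4i32 shuffle
// <0,1,4,5> over the bitcast operands; a mask beginning <0,2,...> splits a
// wider element and returns SDValue() instead.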
@@ -5973,6 +5989,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
/// which could not be matched by any known target specific shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+ SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
EVT VT = SVOp->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
@@ -5981,14 +6002,15 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
DebugLoc dl = SVOp->getDebugLoc();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
- SDValue Shufs[2];
+ SDValue Output[2];
SmallVector<int, 16> Mask;
for (unsigned l = 0; l < 2; ++l) {
// Build a shuffle mask for the output, discovering on the fly which
// input vectors to use as shuffle operands (recorded in InputUsed).
// If building a suitable shuffle vector proves too hard, then bail
- // out with useBuildVector set.
+ // out with UseBuildVector set.
+ bool UseBuildVector = false;
int InputUsed[2] = { -1, -1 }; // Not yet discovered.
unsigned LaneStart = l * NumLaneElems;
for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -6020,38 +6042,61 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
if (OpNo >= array_lengthof(InputUsed)) {
- // More than two input vectors used! Give up.
- return SDValue();
+ // More than two input vectors used! Give up on trying to create a
+ // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+ UseBuildVector = true;
+ break;
}
// Add the mask index for the new shuffle vector.
Mask.push_back(Idx + OpNo * NumLaneElems);
}
- if (InputUsed[0] < 0) {
+ if (UseBuildVector) {
+ SmallVector<SDValue, 16> SVOps;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ // The mask element. This indexes into the input.
+ int Idx = SVOp->getMaskElt(i+LaneStart);
+ if (Idx < 0) {
+ SVOps.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+
+ // The input vector this mask element indexes into.
+ int Input = Idx / NumElems;
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NumElems;
+
+ // Extract the vector element by hand.
+ SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ SVOp->getOperand(Input),
+ DAG.getIntPtrConstant(Idx)));
+ }
+
+ // Construct the output using a BUILD_VECTOR.
+ Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
+ SVOps.size());
+ } else if (InputUsed[0] < 0) {
// No input vectors were used! The result is undefined.
- Shufs[l] = DAG.getUNDEF(NVT);
+ Output[l] = DAG.getUNDEF(NVT);
} else {
SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
- DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32),
- DAG, dl);
+ (InputUsed[0] % 2) * NumLaneElems,
+ DAG, dl);
// If only one input was used, use an undefined vector for the other.
SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
- DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32),
- DAG, dl);
+ (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
// At least one input vector was used. Create a new shuffle vector.
- Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+ Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
}
Mask.clear();
}
// Concatenate the result back
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0],
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32),
- DAG, dl);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
@@ -6063,7 +6108,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
DebugLoc dl = SVOp->getDebugLoc();
EVT VT = SVOp->getValueType(0);
- assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+ assert(VT.is128BitVector() && "Unsupported vector size");
std::pair<int, int> Locs[4];
int Mask1[] = { -1, -1, -1, -1 };
@@ -6107,7 +6152,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
- } else if (NumLo == 3 || NumHi == 3) {
+ }
+
+ if (NumLo == 3 || NumHi == 3) {
// Otherwise, we must have three elements from one vector, call it X, and
// one element from the other, call it Y. First, use a shufps to build an
// intermediate vector with the one element from Y and the element from X
@@ -6143,17 +6190,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
Mask1[2] = HiIndex & 1 ? 6 : 4;
Mask1[3] = HiIndex & 1 ? 4 : 6;
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
- } else {
- Mask1[0] = HiIndex & 1 ? 2 : 0;
- Mask1[1] = HiIndex & 1 ? 0 : 2;
- Mask1[2] = PermMask[2];
- Mask1[3] = PermMask[3];
- if (Mask1[2] >= 0)
- Mask1[2] += 4;
- if (Mask1[3] >= 0)
- Mask1[3] += 4;
- return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
}
+
+ Mask1[0] = HiIndex & 1 ? 2 : 0;
+ Mask1[1] = HiIndex & 1 ? 0 : 2;
+ Mask1[2] = PermMask[2];
+ Mask1[3] = PermMask[3];
+ if (Mask1[2] >= 0)
+ Mask1[2] += 4;
+ if (Mask1[3] >= 0)
+ Mask1[3] += 4;
+ return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
}
// Break it into (shuffle shuffle_hi, shuffle_lo).
@@ -6302,7 +6349,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
- // If we don't care about the second element, procede to use movss.
+ // If we don't care about the second element, proceed to use movss.
if (SVOp->getMaskElt(1) != -1)
return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
}
@@ -6360,7 +6407,8 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
- if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
+ VT == MVT::v16i16 || VT == MVT::v32i8) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
@@ -6564,11 +6612,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 // new vector_shuffle with the corrected mask.
SmallVector<int, 8> NewMask(M.begin(), M.end());
NormalizeMask(NewMask, NumElems);
- if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+ if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
- } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+ if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
- }
}
if (Commuted) {
@@ -6605,12 +6652,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
}
- if (isPSHUFHWMask(M, VT))
+ if (isPSHUFHWMask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
getShufflePSHUFHWImmediate(SVOp),
DAG);
- if (isPSHUFLWMask(M, VT))
+ if (isPSHUFLWMask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
getShufflePSHUFLWImmediate(SVOp),
DAG);
@@ -6647,7 +6694,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG);
+ SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
if (BlendOp.getNode())
return BlendOp;
@@ -6689,7 +6736,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// Handle all 128-bit wide vectors with 4 elements, and match them with
// several different shuffle types.
- if (NumElems == 4 && VT.getSizeInBits() == 128)
+ if (NumElems == 4 && VT.is128BitVector())
return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
// Handle general 256-bit shuffles
@@ -6705,7 +6752,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
- if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+ if (!Op.getOperand(0).getValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
@@ -6714,7 +6761,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT.getSizeInBits() == 16) {
+ }
+
+ if (VT.getSizeInBits() == 16) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
// If Idx is 0, it's cheaper to do a move instead of a pextrw.
if (Idx == 0)
@@ -6729,7 +6778,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT == MVT::f32) {
+ }
+
+ if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
// the result back to FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
@@ -6749,7 +6800,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
Op.getOperand(0)),
Op.getOperand(1));
return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
- } else if (VT == MVT::i32 || VT == MVT::i64) {
+ }
+
+ if (VT == MVT::i32 || VT == MVT::i64) {
// ExtractPS/pextrq works with constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
@@ -6769,22 +6822,22 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
- if (VecVT.getSizeInBits() == 256) {
+ if (VecVT.is256BitVector()) {
DebugLoc dl = Op.getNode()->getDebugLoc();
unsigned NumElems = VecVT.getVectorNumElements();
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- bool Upper = IdxVal >= NumElems/2;
- Vec = Extract128BitVector(Vec,
- DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+ Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ if (IdxVal >= NumElems/2)
+ IdxVal -= NumElems/2;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
+ DAG.getConstant(IdxVal, MVT::i32));
}
- assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
+ assert(VecVT.is128BitVector() && "Unexpected vector length");
if (Subtarget->hasSSE41()) {
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
@@ -6811,7 +6864,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT.getSizeInBits() == 32) {
+ }
+
+ if (VT.getSizeInBits() == 32) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
if (Idx == 0)
return Op;
@@ -6823,7 +6878,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0));
- } else if (VT.getSizeInBits() == 64) {
+ }
+
+ if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
@@ -6856,7 +6913,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- if (VT.getSizeInBits() == 256)
+ if (!VT.is128BitVector())
return SDValue();
if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
@@ -6876,7 +6933,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
- } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ }
+
+ if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into these
// bits. For example (insert (extract, 3), 2) could be matched by putting
@@ -6889,8 +6948,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
// Create this as a scalar to vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
- } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) &&
- isa<ConstantSDNode>(N2)) {
+ }
+
+ if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
// PINSR* works with constant index.
return Op;
}
@@ -6909,23 +6969,22 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
// If this is a 256-bit vector result, first extract the 128-bit vector,
// insert the element into the extracted half and then place it back.
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
// Get the desired 128-bit vector half.
unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
- bool Upper = IdxVal >= NumElems/2;
- SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
- SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
+ SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired half.
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
- N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
+ bool Upper = IdxVal >= NumElems/2;
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+ DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
// Insert the changed part back to the 256-bit vector
- return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
+ return Insert128BitVector(N0, V, IdxVal, DAG, dl);
}
if (Subtarget->hasSSE41())
@@ -6954,7 +7013,7 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
- if (OpVT.getSizeInBits() > 128) {
+ if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
EVT VT128 = EVT::getVectorVT(*Context,
OpVT.getVectorElementType(),
@@ -6963,19 +7022,16 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
- return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
- DAG.getConstant(0, MVT::i32),
- DAG, dl);
+ return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
- if (Op.getValueType() == MVT::v1i64 &&
+ if (OpVT == MVT::v1i64 &&
Op.getOperand(0).getValueType() == MVT::i64)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
- "Expected an SSE type!");
- return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ return DAG.getNode(ISD::BITCAST, dl, OpVT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
}
@@ -6989,9 +7045,11 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue Vec = Op.getNode()->getOperand(0);
SDValue Idx = Op.getNode()->getOperand(1);
- if (Op.getNode()->getValueType(0).getSizeInBits() == 128
- && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
- return Extract128BitVector(Vec, Idx, DAG, dl);
+ if (Op.getNode()->getValueType(0).is128BitVector() &&
+ Vec.getNode()->getValueType(0).is256BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Extract128BitVector(Vec, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7008,9 +7066,11 @@ X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
- if (Op.getNode()->getValueType(0).getSizeInBits() == 256
- && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
- return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
+ if (Op.getNode()->getValueType(0).is256BitVector() &&
+ SubVec.getNode()->getValueType(0).is128BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7219,7 +7279,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
- unsigned char OperandFlags) {
+ unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
DebugLoc dl = GA->getDebugLoc();
@@ -7227,12 +7287,16 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
GA->getValueType(0),
GA->getOffset(),
OperandFlags);
+
+ X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+ : X86ISD::TLSADDR;
+
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
- Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
} else {
SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -7264,11 +7328,49 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
X86::RAX, X86II::MO_TLSGD);
}
-// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
-// "local exec" model.
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT,
+ bool is64Bit) {
+ DebugLoc dl = GA->getDebugLoc();
+
+ // Get the start address of the TLS block for this module.
+ X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+ .getInfo<X86MachineFunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ SDValue Base;
+ if (is64Bit) {
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+ X86II::MO_TLSLD, /*LocalDynamic=*/true);
+ } else {
+ SDValue InFlag;
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+ Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+ X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+ }
+
+ // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+ // of Base.
+
+ // Build x@dtpoff.
+ unsigned char OperandFlags = X86II::MO_DTPOFF;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ // Add x@dtpoff to the base.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
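+
+// A minimal sketch of the resulting local-dynamic access on x86-64
+// (assumed typical ELF codegen, shown for illustration only):
+//   leaq  x@tlsld(%rip), %rdi
+//   callq __tls_get_addr@PLT     ; TLS base of this module returned in %rax
+//   leaq  x@dtpoff(%rax), %rax   ; add x@dtpoff to the base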
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
- bool is64Bit) {
+ bool is64Bit, bool isPIC) {
DebugLoc dl = GA->getDebugLoc();
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
@@ -7286,25 +7388,36 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
- } else if (is64Bit) {
- assert(model == TLSModel::InitialExec);
- OperandFlags = X86II::MO_GOTTPOFF;
- WrapperKind = X86ISD::WrapperRIP;
+ } else if (model == TLSModel::InitialExec) {
+ if (is64Bit) {
+ OperandFlags = X86II::MO_GOTTPOFF;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else {
+ OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+ }
} else {
- assert(model == TLSModel::InitialExec);
- OperandFlags = X86II::MO_INDNTPOFF;
+ llvm_unreachable("Unexpected model");
}
- // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
- // exec)
+ // emit "addl x@ntpoff,%eax" (local exec)
+ // or "addl x@indntpoff,%eax" (initial exec)
+ // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
- if (model == TLSModel::InitialExec)
+ if (model == TLSModel::InitialExec) {
+ if (isPIC && !is64Bit) {
+ Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+ Offset);
+ }
+
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false,
+ 0);
+ }
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
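// A minimal sketch of the 32-bit PIC initial-exec form enabled above
// (assumed typical ELF codegen, for illustration only):
//   movl x@gotntpoff(%ebx), %eax  ; negative TP offset loaded from the GOT
//   movl %gs:(%eax), %eax         ; %gs-relative load adds the thread pointer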
@@ -7318,29 +7431,26 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = GA->getGlobal();
if (Subtarget->isTargetELF()) {
- // TODO: implement the "local dynamic" model
- // TODO: implement the "initial exec"model for pic executables
-
- // If GV is an alias then use the aliasee for determining
- // thread-localness.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->resolveAliasedGlobal(false);
-
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
- case TLSModel::LocalDynamic: // not implemented
if (Subtarget->is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
-
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+ Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
- Subtarget->is64Bit());
+ Subtarget->is64Bit(),
+ getTargetMachine().getRelocationModel() == Reloc::PIC_);
}
- } else if (Subtarget->isTargetDarwin()) {
+ llvm_unreachable("Unknown TLS model.");
+ }
+
+ if (Subtarget->isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
@@ -7383,7 +7493,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
Chain.getValue(1));
- } else if (Subtarget->isTargetWindows()) {
+ }
+
+ if (Subtarget->isTargetWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -7429,7 +7541,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
false, false, false, 0);
SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
- getPointerTy());
+ getPointerTy());
IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
@@ -7600,9 +7712,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
#ifdef __SSE3__
- haddpd %xmm0, %xmm0
+ haddpd %xmm0, %xmm0
#else
- pshufd $0x4e, %xmm0, %xmm1
+ pshufd $0x4e, %xmm0, %xmm1
addpd %xmm1, %xmm0
#endif
*/
@@ -7693,12 +7805,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
// Handle final rounding.
EVT DestVT = Op.getValueType();
- if (DestVT.bitsLT(MVT::f64)) {
+ if (DestVT.bitsLT(MVT::f64))
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
DAG.getIntPtrConstant(0));
- } else if (DestVT.bitsGT(MVT::f64)) {
+ if (DestVT.bitsGT(MVT::f64))
return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
- }
// Handle final rounding.
return Sub;
@@ -7719,10 +7830,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
EVT DstVT = Op.getValueType();
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG);
- else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
- else if (Subtarget->is64Bit() &&
- SrcVT == MVT::i64 && DstVT == MVT::f32)
+ if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
@@ -7899,9 +8009,9 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
- else
- // The node is the result.
- return FIST;
+
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
@@ -7916,9 +8026,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
- else
- // The node is the result.
- return FIST;
+
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFABS(SDValue Op,
@@ -7931,7 +8041,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op,
EltVT = VT.getVectorElementType();
Constant *C;
if (EltVT == MVT::f64) {
- C = ConstantVector::getSplat(2,
+ C = ConstantVector::getSplat(2,
ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
} else {
C = ConstantVector::getSplat(4,
@@ -7965,15 +8075,15 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
if (VT.isVector()) {
- MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
+ MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::XOR, dl, XORVT,
- DAG.getNode(ISD::BITCAST, dl, XORVT,
- Op.getOperand(0)),
- DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
- } else {
- return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+ DAG.getNode(ISD::BITCAST, dl, XORVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
}
+
+ return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}
SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
@@ -8172,7 +8282,9 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
// Otherwise use a regular EFLAGS-setting instruction.
switch (Op.getNode()->getOpcode()) {
default: llvm_unreachable("unexpected operator!");
- case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::SUB:
+ Opcode = X86ISD::SUB;
+ break;
case ISD::OR: Opcode = X86ISD::OR; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
@@ -8198,6 +8310,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, Op.getValueType()));
+ if (Opcode == X86ISD::CMP) {
+ SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0),
+ Op.getOperand(1));
+ // We can't replace uses of the SUB with the CMP.
+ // The SUB node will be removed later because it has no uses.
+ return SDValue(New.getNode(), 0);
+ }
+
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0; i != NumOperands; ++i)
@@ -8217,9 +8337,41 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
return EmitTest(Op0, X86CC, DAG);
DebugLoc dl = Op0.getDebugLoc();
+ if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+ Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+ Op0, Op1);
+ return SDValue(Sub.getNode(), 1);
+ }
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
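// For illustration (assumed codegen, not part of this patch): when a-b is
// both tested and used, the single X86ISD::SUB above serves both roles, e.g.
//   subl %esi, %edi     ; sets EFLAGS and yields a-b; no separate cmpl needed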
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+ SelectionDAG &DAG) const {
+ // If the subtarget does not support the FUCOMI instruction, floating-point
+ // comparisons have to be converted.
+ if (Subtarget->hasCMov() ||
+ Cmp.getOpcode() != X86ISD::CMP ||
+ !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ return Cmp;
+
+ // The instruction selector will select an FUCOM instruction instead of
+ // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+ // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ DebugLoc dl = Cmp.getDebugLoc();
+ SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+ SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+ DAG.getConstant(8, MVT::i8));
+ SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+ return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
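+
+// On hardware without FUCOMI, the sequence built above is roughly (assumed
+// codegen, for illustration only):
+//   fucompp            ; compare; condition bits land in FPSW
+//   fnstsw %ax         ; X86ISD::FNSTSW16r: copy FPSW into %ax
+//   sahf               ; X86ISD::SAHF: transfer AH into EFLAGS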
+
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
@@ -8341,6 +8493,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}
@@ -8350,24 +8503,22 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+ assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
DebugLoc dl = Op.getDebugLoc();
SDValue CC = Op.getOperand(2);
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType().getSimpleVT();
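// For illustration (assumed DAG shape): a v8i32 setcc is thus rewritten as
//   (concat_vectors (setcc v4i32 LHS1, RHS1, CC),
//                   (setcc v4i32 LHS2, RHS2, CC))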
@@ -8389,10 +8540,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
if (isFP) {
- unsigned SSECC = 8;
+#ifndef NDEBUG
EVT EltVT = Op0.getValueType().getVectorElementType();
- assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+ unsigned SSECC;
bool Swap = false;
// SSE Condition code mapping:
@@ -8405,7 +8558,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// 6 - NLE
// 7 - ORD
switch (SetCCOpcode) {
- default: break;
+ default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETOGT:
@@ -8419,33 +8572,33 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETUO: SSECC = 3; break;
case ISD::SETUNE:
case ISD::SETNE: SSECC = 4; break;
- case ISD::SETULE: Swap = true;
+ case ISD::SETULE: Swap = true; // Fallthrough
case ISD::SETUGE: SSECC = 5; break;
- case ISD::SETULT: Swap = true;
+ case ISD::SETULT: Swap = true; // Fallthrough
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
+ case ISD::SETUEQ:
+ case ISD::SETONE: SSECC = 8; break;
}
if (Swap)
std::swap(Op0, Op1);
// In the two special cases we can't handle, emit two comparisons.
if (SSECC == 8) {
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
if (SetCCOpcode == ISD::SETUEQ) {
- SDValue UNORD, EQ;
- UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(3, MVT::i8));
- EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(0, MVT::i8));
- return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
- } else if (SetCCOpcode == ISD::SETONE) {
- SDValue ORD, NEQ;
- ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(7, MVT::i8));
- NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(4, MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+ CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+ } else {
+ assert(SetCCOpcode == ISD::SETONE);
+ CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
}
- llvm_unreachable("Illegal FP comparison");
+
+ SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(CC0, MVT::i8));
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(CC1, MVT::i8));
+ return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
}
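// For illustration (assumed DAG shape): SETUEQ becomes
//   (or (cmpp Op0, Op1, 3 /*UNORD*/), (cmpp Op0, Op1, 0 /*EQ*/))
// and SETONE becomes
//   (and (cmpp Op0, Op1, 7 /*ORD*/), (cmpp Op0, Op1, 4 /*NEQ*/))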
// Handle all other FP comparisons here.
return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
@@ -8453,17 +8606,17 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
}
// Break 256-bit integer vector compare into smaller ones.
- if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+ if (VT.is256BitVector() && !Subtarget->hasAVX2())
return Lower256IntVSETCC(Op, DAG);
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
- unsigned Opc = 0;
+ unsigned Opc;
bool Swap = false, Invert = false, FlipSigns = false;
switch (SetCCOpcode) {
- default: break;
+ default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: Invert = true;
case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
case ISD::SETLT: Swap = true;
@@ -8480,10 +8633,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
- if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
- return SDValue();
- if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
- return SDValue();
+ if (VT == MVT::v2i64) {
+ if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
+ return SDValue();
+ if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
+ return SDValue();
+ }
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
@@ -8510,7 +8665,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getNode()->getOpcode();
- if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+ Opc == X86ISD::SAHF)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD ||
@@ -8542,6 +8698,16 @@ static bool isAllOnes(SDValue V) {
return C && C->isAllOnesValue();
}
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+ if (V.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue VOp0 = V.getOperand(0);
+ unsigned InBits = VOp0.getValueSizeInBits();
+ unsigned Bits = V.getValueSizeInBits();
+ return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
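+
+// For illustration (assumed IR shape): given
+//   %t = trunc i32 %x to i8       ; bits 31..8 of %x known zero
+// the truncate can be looked through and %x tested directly in the wider,
+// natively supported type.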
+
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool addTest = true;
SDValue Cond = Op.getOperand(0);
@@ -8572,8 +8738,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
+ // Apply further optimizations for special cases
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
+ if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
+ if (YC->isNullValue() &&
+ (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+ DAG.getConstant(0, CmpOp0.getValueType()),
+ CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
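+ // For illustration (assumed codegen): (x != 0) ? -1 : 0 then becomes
+ //   negl %ecx            ; CF = (x != 0)
+ //   sbbl %eax, %eax      ; EAX = CF ? -1 : 0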
+
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
@@ -8654,9 +8837,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- // Look pass the truncate.
- if (Cond.getOpcode() == ISD::TRUNCATE)
- Cond = Cond.getOperand(0);
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -8679,7 +8862,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
- if (Cond.getOpcode() == X86ISD::CMP) {
+ if (Cond.getOpcode() == X86ISD::SUB) {
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
@@ -8918,6 +9102,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
@@ -8947,6 +9132,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
@@ -8960,9 +9146,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- // Look pass the truncate.
- if (Cond.getOpcode() == ISD::TRUNCATE)
- Cond = Cond.getOperand(0);
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -8980,6 +9166,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DAG);
}
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
}
@@ -9018,7 +9205,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
const Function *F = MF.getFunction();
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; I++)
+ I != E; ++I)
if (I->hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
@@ -9201,12 +9388,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
if (isa<ConstantSDNode>(ShAmt)) {
+ // Constant may be a TargetConstant. Use a regular constant.
+ uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
switch (Opc) {
default: llvm_unreachable("Unknown target vector shift node");
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
- return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getConstant(ShiftAmt, MVT::i32));
}
}
@@ -9223,10 +9413,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
SDValue ShOps[4];
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, MVT::i32);
- ShOps[2] = DAG.getUNDEF(MVT::i32);
- ShOps[3] = DAG.getUNDEF(MVT::i32);
+ ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
- ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
+
+ // The return type has to be a 128-bit type with the same element
+ // type as the input type.
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+ ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
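// For illustration (assumed DAG shape): shifting v16i16 by a scalar yields
//   (vshl v16i16 SrcOp,
//         (bitcast v8i16 (build_vector i32 ShAmt, 0, undef, undef)))
// i.e. the shift amount stays a 128-bit vector even for a 256-bit SrcOp.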
@@ -9261,8 +9456,8 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_sse2_ucomigt_sd:
case Intrinsic::x86_sse2_ucomige_sd:
case Intrinsic::x86_sse2_ucomineq_sd: {
- unsigned Opc = 0;
- ISD::CondCode CC = ISD::SETCC_INVALID;
+ unsigned Opc;
+ ISD::CondCode CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse_comieq_ss:
@@ -9336,245 +9531,102 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- // XOP comparison intrinsics
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq: {
- unsigned CC = 0;
- unsigned Opc = 0;
-
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- CC = 0;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- CC = 0;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- CC = 1;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- CC = 1;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- CC = 2;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- CC = 2;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- CC = 3;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- CC = 3;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- CC = 4;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- CC = 4;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- CC = 5;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- CC = 5;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- CC = 6;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- CC = 6;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- CC = 7;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq:
- CC = 7;
- Opc = X86ISD::VPCOMU;
- break;
- }
-
- SDValue LHS = Op.getOperand(1);
- SDValue RHS = Op.getOperand(2);
- return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
- DAG.getConstant(CC, MVT::i8));
- }
// Arithmetic intrinsics.
case Intrinsic::x86_sse2_pmulu_dq:
case Intrinsic::x86_avx2_pmulu_dq:
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+
+ // SSE3/AVX horizontal add/sub intrinsics
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
case Intrinsic::x86_avx_hadd_ps_256:
case Intrinsic::x86_avx_hadd_pd_256:
- return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse3_hsub_ps:
case Intrinsic::x86_sse3_hsub_pd:
case Intrinsic::x86_avx_hsub_ps_256:
case Intrinsic::x86_avx_hsub_pd_256:
- return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_ssse3_phadd_w_128:
case Intrinsic::x86_ssse3_phadd_d_128:
case Intrinsic::x86_avx2_phadd_w:
case Intrinsic::x86_avx2_phadd_d:
- return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_ssse3_phsub_w_128:
case Intrinsic::x86_ssse3_phsub_d_128:
case Intrinsic::x86_avx2_phsub_w:
- case Intrinsic::x86_avx2_phsub_d:
- return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
+ case Intrinsic::x86_avx2_phsub_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse3_hadd_ps:
+ case Intrinsic::x86_sse3_hadd_pd:
+ case Intrinsic::x86_avx_hadd_ps_256:
+ case Intrinsic::x86_avx_hadd_pd_256:
+ Opcode = X86ISD::FHADD;
+ break;
+ case Intrinsic::x86_sse3_hsub_ps:
+ case Intrinsic::x86_sse3_hsub_pd:
+ case Intrinsic::x86_avx_hsub_ps_256:
+ case Intrinsic::x86_avx_hsub_pd_256:
+ Opcode = X86ISD::FHSUB;
+ break;
+ case Intrinsic::x86_ssse3_phadd_w_128:
+ case Intrinsic::x86_ssse3_phadd_d_128:
+ case Intrinsic::x86_avx2_phadd_w:
+ case Intrinsic::x86_avx2_phadd_d:
+ Opcode = X86ISD::HADD;
+ break;
+ case Intrinsic::x86_ssse3_phsub_w_128:
+ case Intrinsic::x86_ssse3_phsub_d_128:
+ case Intrinsic::x86_avx2_phsub_w:
+ case Intrinsic::x86_avx2_phsub_d:
+ Opcode = X86ISD::HSUB;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // AVX2 variable shift intrinsics
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_d_256:
case Intrinsic::x86_avx2_psllv_q_256:
- return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_avx2_psrlv_d:
case Intrinsic::x86_avx2_psrlv_q:
case Intrinsic::x86_avx2_psrlv_d_256:
case Intrinsic::x86_avx2_psrlv_q_256:
- return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_avx2_psrav_d_256: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ Opcode = ISD::SHL;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ Opcode = ISD::SRL;
+ break;
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ Opcode = ISD::SRA;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::x86_ssse3_psign_b_128:
case Intrinsic::x86_ssse3_psign_w_128:
case Intrinsic::x86_ssse3_psign_d_128:
@@ -9583,15 +9635,18 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx2_psign_d:
return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::x86_sse41_insertps:
return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
case Intrinsic::x86_avx_vperm2f128_ps_256:
case Intrinsic::x86_avx_vperm2f128_pd_256:
case Intrinsic::x86_avx_vperm2f128_si_256:
case Intrinsic::x86_avx2_vperm2i128:
return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
// Operands intentionally swapped. Mask is last operand to intrinsic,
@@ -9621,7 +9676,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
bool IsTestPacked = false;
- unsigned X86CC = 0;
+ unsigned X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx_vtestz_ps:
@@ -9672,44 +9727,93 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx2_psll_w:
case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:
- return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_sse2_psrl_d:
case Intrinsic::x86_sse2_psrl_q:
case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx2_psrl_d:
case Intrinsic::x86_avx2_psrl_q:
- return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_sse2_psra_d:
case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
+ case Intrinsic::x86_avx2_psra_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ Opcode = X86ISD::VSHL;
+ break;
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ Opcode = X86ISD::VSRL;
+ break;
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ Opcode = X86ISD::VSRA;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // SSE/AVX immediate shift intrinsics
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
- return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
- return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
+ case Intrinsic::x86_avx2_psrai_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ Opcode = X86ISD::VSHLI;
+ break;
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ Opcode = X86ISD::VSRLI;
+ break;
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ Opcode = X86ISD::VSRAI;
+ break;
+ }
+ return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
+ }
+
// Fix vector shift instructions where the last operand is a non-immediate
// i32 value.
case Intrinsic::x86_mmx_pslli_w:
@@ -9724,8 +9828,9 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
if (isa<ConstantSDNode>(ShAmt))
return SDValue();
- unsigned NewIntNo = 0;
+ unsigned NewIntNo;
switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_mmx_pslli_w:
NewIntNo = Intrinsic::x86_mmx_psll_w;
break;
@@ -9750,7 +9855,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_mmx_psrai_d:
NewIntNo = Intrinsic::x86_mmx_psra_d;
break;
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
}
// The vector shift intrinsics with scalars use 32b shift amounts but
@@ -9766,6 +9870,116 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(NewIntNo, MVT::i32),
Op.getOperand(1), ShAmt);
}
+ case Intrinsic::x86_sse42_pcmpistria128:
+ case Intrinsic::x86_sse42_pcmpestria128:
+ case Intrinsic::x86_sse42_pcmpistric128:
+ case Intrinsic::x86_sse42_pcmpestric128:
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ case Intrinsic::x86_sse42_pcmpistris128:
+ case Intrinsic::x86_sse42_pcmpestris128:
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ case Intrinsic::x86_sse42_pcmpestriz128: {
+ unsigned Opcode;
+ unsigned X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse42_pcmpistria128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpestria128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpistric128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpestric128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpistris128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpestris128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse42_pcmpestriz128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_E;
+ break;
+ }
+ SmallVector<SDValue, 5> NewOps;
+ NewOps.append(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8),
+ SDValue(PCMP.getNode(), 1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
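+ // For illustration (assumed codegen): pcmpistria, for example, is roughly
+ //   pcmpistri $imm, %xmm1, %xmm0   ; comparison result flags in EFLAGS
+ //   seta %al                       ; COND_A: CF==0 && ZF==0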
+
+ case Intrinsic::x86_sse42_pcmpistri128:
+ case Intrinsic::x86_sse42_pcmpestri128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+ Opcode = X86ISD::PCMPISTRI;
+ else
+ Opcode = X86ISD::PCMPESTRI;
+
+ SmallVector<SDValue, 5> NewOps;
+ NewOps.append(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ }
+ }
+}
+
+SDValue
+X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ // RDRAND intrinsics.
+ case Intrinsic::x86_rdrand_16:
+ case Intrinsic::x86_rdrand_32:
+ case Intrinsic::x86_rdrand_64: {
+ // Emit the node with the right value type.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+ SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
+
+ // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
+ // return the value from Rand, which is always 0, cast to i32.
+ SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, Op->getValueType(1)),
+ DAG.getConstant(X86::COND_B, MVT::i32),
+ SDValue(Result.getNode(), 1) };
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+ DAG.getVTList(Op->getValueType(1), MVT::Glue),
+ Ops, 4);
+
+ // Return { result, isValid, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+ SDValue(Result.getNode(), 2));
+ }
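+
+ // For illustration (assumed codegen): the CMOV above corresponds roughly to
+ //   rdrand %ecx          ; CF=1 iff a hardware random value was delivered
+ //   movl $1, %eax
+ //   cmovael %ecx, %eax   ; CF=0: take %ecx, which is 0 on failure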
}
}
@@ -9816,7 +10030,6 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
@@ -9833,7 +10046,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
- MF.getRegInfo().addLiveOut(StoreAddrReg);
return DAG.getNode(X86ISD::EH_RETURN, dl,
MVT::Other,
@@ -10149,23 +10361,21 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+ assert(VT.is256BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
DebugLoc dl = Op.getDebugLoc();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -10176,14 +10386,14 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType().getSizeInBits() == 256 &&
+ assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType().getSizeInBits() == 256 &&
+ assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
@@ -10193,7 +10403,7 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+ if (VT.is256BitVector() && !Subtarget->hasAVX2())
return Lower256IntArith(Op, DAG);
assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
@@ -10310,6 +10520,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
+ llvm_unreachable("Unknown shift opcode.");
}
if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
@@ -10353,6 +10564,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
+ llvm_unreachable("Unknown shift opcode.");
}
}
}
@@ -10421,15 +10633,14 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
}
// Decompose 256-bit shifts into smaller 128-bit shifts.
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
- SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+ SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
// Recreate the shift amount vectors
SDValue Amt1, Amt2;
@@ -10448,9 +10659,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
&Amt2Csts[0], NumElems/2);
} else {
// Variable shift amount
- Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
- Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+ Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
}
// Issue new vector shifts for the smaller types
@@ -10560,20 +10770,18 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return SDValue();
if (!Subtarget->hasAVX2()) {
// needs to be split
- int NumElems = VT.getVectorNumElements();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+ unsigned NumElems = VT.getVectorNumElements();
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
EVT ExtraEltVT = ExtraVT.getVectorElementType();
- int ExtraNumElems = ExtraVT.getVectorNumElements();
+ unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
ExtraNumElems/2);
SDValue Extra = DAG.getValueType(ExtraVT);
@@ -10859,6 +11067,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
@@ -11013,7 +11222,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Regs64bit ? X86::RBX : X86::EBX,
swapInL, cpInH.getValue(1));
swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
- Regs64bit ? X86::RCX : X86::ECX,
+ Regs64bit ? X86::RCX : X86::ECX,
swapInH, swapInL.getValue(1));
SDValue Ops[] = { swapInH.getValue(0),
N->getOperand(1),
@@ -11118,10 +11327,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
@@ -11190,6 +11401,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
+ case X86ISD::SAHF: return "X86ISD::SAHF";
+ case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::FMADD: return "X86ISD::FMADD";
+ case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
+ case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
}
}
@@ -11258,6 +11477,15 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
return true;
}
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return Imm == (int32_t)Imm;
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ // Can also use sub to handle negated immediates.
+ return Imm == (int32_t)Imm;
+}
+
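+// A minimal sketch (not part of this patch) of the round-trip test used by
+// both hooks above: an i64 immediate is encodable iff truncating it to
+// 32 bits and sign-extending back reproduces it, because x86-64 ALU
+// immediates are 32-bit sign-extended fields. Assumes a 32-bit int.
+static bool fitsInSExt32(long long Imm) {
+  return Imm == (long long)(int)Imm; // true exactly for -2^31 .. 2^31-1
+}
+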
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
@@ -11300,8 +11528,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
isMOVLMask(M, VT) ||
isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
isPSHUFDMask(M, VT) ||
- isPSHUFHWMask(M, VT) ||
- isPSHUFLWMask(M, VT) ||
+ isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) ||
+ isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) ||
isPALIGNRMask(M, VT, Subtarget) ||
isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
@@ -11316,7 +11544,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// FIXME: This collection of masks seems suspect.
if (NumElts == 2)
return true;
- if (NumElts == 4 && VT.getSizeInBits() == 128) {
+ if (NumElts == 4 && VT.is128BitVector()) {
return (isMOVLMask(Mask, VT) ||
isCommutedMOVLMask(Mask, VT, true) ||
isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
@@ -11460,7 +11688,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
// result in out1, out2
// fallthrough -->nextMBB
- const TargetRegisterClass *RC = X86::GR32RegisterClass;
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
const unsigned LoadOpc = X86::MOV32rm;
const unsigned NotOpc = X86::NOT32r;
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -11662,7 +11890,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
int valArgIndx = lastAddrIndx + 1;
- unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
for (int i=0; i <= lastAddrIndx; ++i)
(*MIB).addOperand(*argOpers[i]);
@@ -11672,7 +11900,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
argOpers[valArgIndx]->isImm()) &&
"invalid operand");
- unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
if (argOpers[valArgIndx]->isReg())
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
else
@@ -11687,7 +11915,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
MIB.addReg(t2);
// Generate movc
- unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
MIB.addReg(t2);
MIB.addReg(t1);
@@ -11742,8 +11970,7 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
MIB.addOperand(Op);
}
BuildMI(*BB, MI, dl,
- TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
- MI->getOperand(0).getReg())
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
.addReg(X86::XMM0);
MI->eraseFromParent();
@@ -11776,24 +12003,6 @@ X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
}
MachineBasicBlock *
-X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- // First arg in ECX, the second in EAX.
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI->getOperand(0).getReg());
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI->getOperand(1).getReg());
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
-
- MI->eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
MachineInstr *MI,
MachineBasicBlock *MBB) const {
@@ -12306,8 +12515,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
+ .addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
@@ -12517,7 +12727,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Load the old value of the high byte of the control word...
unsigned OldCW =
- F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+ F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
CWFrameIdx);
@@ -12596,8 +12806,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Thread synchronization.
case X86::MONITOR:
return EmitMonitor(MI, BB);
- case X86::MWAIT:
- return EmitMwait(MI, BB);
// Atomic Lowering.
case X86::ATOMAND32:
@@ -12605,25 +12813,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMOR32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
X86::OR32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMXOR32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
X86::XOR32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMNAND32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
X86::AND32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass, true);
+ &X86::GR32RegClass, true);
case X86::ATOMMIN32:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
case X86::ATOMMAX32:
@@ -12638,25 +12846,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMOR16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
X86::OR16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMXOR16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
X86::XOR16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMNAND16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
X86::AND16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass, true);
+ &X86::GR16RegClass, true);
case X86::ATOMMIN16:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
case X86::ATOMMAX16:
@@ -12671,25 +12879,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMOR8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
X86::OR8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMXOR8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
X86::XOR8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMNAND8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
X86::AND8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass, true);
+ &X86::GR8RegClass, true);
// FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
// This group is for 64-bit host.
case X86::ATOMAND64:
@@ -12697,25 +12905,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMOR64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
X86::OR64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMXOR64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
X86::XOR64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMNAND64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
X86::AND64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass, true);
+ &X86::GR64RegClass, true);
case X86::ATOMMIN64:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
case X86::ATOMMAX64:
@@ -12870,10 +13078,10 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
/// inserting the result into the low part of a new 256-bit vector
static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
// vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+ for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
SVOp->getMaskElt(j) >= 0)
return false;
@@ -12886,10 +13094,10 @@ static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
/// inserting the result into the high part of a new 256-bit vector
static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
// vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+ for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
SVOp->getMaskElt(j) >= 0)
return false;
@@ -12906,7 +13114,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -12931,30 +13139,31 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// To match the shuffle mask, the first half of the mask should
// be exactly the first vector, and all the rest a splat with the
// first element of the second one.
- for (int i = 0; i < NumElems/2; ++i)
+ for (unsigned i = 0; i != NumElems/2; ++i)
if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
!isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
return SDValue();
// If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
- SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
- SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
- Ld->getMemoryVT(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- false/*isVolatile*/, true/*ReadMem*/,
- false/*WriteMem*/);
- return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
- }
+ if (Ld->hasNUsesOfValue(1, 0)) {
+ SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+ SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+ Ld->getMemoryVT(),
+ Ld->getPointerInfo(),
+ Ld->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+ return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+ }
+ }
// Emit a zeroed vector and insert the desired subvector on its
// first half.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
- DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
@@ -12964,18 +13173,15 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
if (isShuffleHigh128VectorInsertLow(SVOp)) {
- SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
- V, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
// vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (isShuffleLow128VectorInsertHigh(SVOp)) {
- SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
- V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
return DCI.CombineTo(N, InsV);
}
@@ -12995,12 +13201,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+ if (Subtarget->hasAVX() && VT.is256BitVector() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE)
return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
// Only handle 128 wide vector from here on.
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return SDValue();
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
@@ -13014,16 +13220,17 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
}
-/// PerformTruncateCombine - Converts truncate operation to
+/// PerformTruncateCombine - Converts truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate 256-bit vector to 128-bit vector
-SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
DAGCombinerInfo &DCI) const {
if (!DCI.isBeforeLegalizeOps())
return SDValue();
- if (!Subtarget->hasAVX()) return SDValue();
+ if (!Subtarget->hasAVX())
+ return SDValue();
EVT VT = N->getValueType(0);
SDValue Op = N->getOperand(0);
@@ -13032,55 +13239,102 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+ if (Subtarget->hasAVX2()) {
+ // AVX2: v4i64 -> v4i32
+
+ // VPERMD
+ static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
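+ // (Note, not in the patch: after the bitcast to v8i32, lanes {0,2,4,6}
+ // hold the low halves of the four i64 elements, so VPERMD with this mask
+ // gathers the truncated values into the low 128 bits.)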
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
+ Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
+ ShufMask);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // AVX: v4i64 -> v4i32
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(2));
+ DAG.getIntPtrConstant(2));
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
// PSHUFD
- int ShufMask1[] = {0, 2, 0, 0};
+ static const int ShufMask1[] = {0, 2, 0, 0};
- OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
- ShufMask1);
- OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
- ShufMask1);
+ SDValue Undef = DAG.getUNDEF(VT);
+ OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
// MOVLHPS
- int ShufMask2[] = {0, 1, 4, 5};
+ static const int ShufMask2[] = {0, 1, 4, 5};
return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
}
+
if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+ if (Subtarget->hasAVX2()) {
+ // AVX2: v8i32 -> v8i16
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
+
+ // PSHUFB
+ SmallVector<SDValue,32> pshufbMask;
+ for (unsigned i = 0; i < 2; ++i) {
+ pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+ for (unsigned j = 0; j < 8; ++j)
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ }
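+ // (Note, not in the patch: bytes {0,1,4,5,8,9,12,13} select the low i16
+ // of each i32 within a 128-bit lane, while 0x80 sets a PSHUFB mask
+ // byte's high bit, which zeroes that destination byte.)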
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
+ &pshufbMask[0], 32);
+ Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
+
+ static const int ShufMask[] = {0, 2, -1, -1};
+ Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64),
+ &ShufMask[0]);
+
+ Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+ DAG.getIntPtrConstant(0));
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+ }
+
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(4));
+ DAG.getIntPtrConstant(4));
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
// PSHUFB
- int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
- -1, -1, -1, -1, -1, -1, -1, -1};
+ static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
- OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
- DAG.getUNDEF(MVT::v16i8),
- ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
- DAG.getUNDEF(MVT::v16i8),
- ShufMask1);
+ SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
// MOVLHPS
- int ShufMask2[] = {0, 1, 4, 5};
+ static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
@@ -13127,7 +13381,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 16> ShuffleMask;
bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle))
+ if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+ UnaryShuffle))
return SDValue();
// Select the input vector, guarding against out of range extract vector.
@@ -13276,8 +13531,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
-
-
DebugLoc DL = N->getDebugLoc();
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
@@ -13559,9 +13812,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// to simplify previous instructions.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
- !DCI.isBeforeLegalize() &&
- TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+ // Don't optimize vector selects that map to mask registers.
+ if (BitWidth == 1)
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -13576,6 +13833,88 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
+// condition code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+ // Quit unless this is a CMP, or a SUB whose value result is unused.
+ if (Cmp.getOpcode() != X86ISD::CMP &&
+ (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+ return SDValue();
+
+ // Quit if not used as a boolean value.
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ // Check CMP operands. One of them should be 0 or 1 and the other should
+ // be a SETCC or a value extended from it.
+ SDValue Op1 = Cmp.getOperand(0);
+ SDValue Op2 = Cmp.getOperand(1);
+
+ SDValue SetCC;
+ const ConstantSDNode* C = 0;
+ bool needOppositeCond = (CC == X86::COND_E);
+
+ if ((C = dyn_cast<ConstantSDNode>(Op1)))
+ SetCC = Op2;
+ else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+ SetCC = Op1;
+ else // Quit if neither operand is a constant.
+ return SDValue();
+
+ if (C->getZExtValue() == 1)
+ needOppositeCond = !needOppositeCond;
+ else if (C->getZExtValue() != 0)
+ // Quit if the constant is neither 0 nor 1.
+ return SDValue();
+
+ // Skip 'zext' node.
+ if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
+ SetCC = SetCC.getOperand(0);
+
+ // Quit if not SETCC.
+ // FIXME: So far we only handle the boolean value generated from SETCC. If
+ // there are other ways to generate boolean values, we need to handle them
+ // here as well.
+ if (SetCC.getOpcode() != X86ISD::SETCC)
+ return SDValue();
+
+ // Set the condition code, or its opposite if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+
+ return SetCC.getOperand(1);
+}
+
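+// A minimal sketch (not part of this patch) of the boolean algebra the
+// combine above implements, where c is the i1 value produced by SETCC:
+// (c == 1) and (c != 0) are simply c; (c == 0) and (c != 1) are !c.
+static bool combinedCondition(bool c, bool cmpAgainstOne, bool ccIsEQ) {
+  bool needOpposite = (ccIsEQ != cmpAgainstOne); // mirrors needOppositeCond
+  return needOpposite ? !c : c;                  // the CMP+SETCC fold away
+}
+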
+static bool IsValidFCMOVCondition(X86::CondCode CC) {
+ switch (CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_AE:
+ case X86::COND_A:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
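+// (Note, not in the patch: these eight are exactly the conditions the x87
+// FCMOV family can encode -- FCMOVB/E/BE/U and their negated forms; signed
+// conditions such as COND_L have no FCMOV encoding.)
+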
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
@@ -13589,6 +13928,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
+
if (CC == X86::COND_E || CC == X86::COND_NE) {
switch (Cond.getOpcode()) {
default: break;
@@ -13600,6 +13940,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ SDValue Flags;
+
+ Flags = BoolTestSetCCCombine(Cond, CC);
+ if (Flags.getNode() &&
+ // Extra check as FCMOV only supports a subset of X86 cond.
+ (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
+ SDValue Ops[] = { FalseOp, TrueOp,
+ DAG.getConstant(CC, MVT::i8), Flags };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+ Ops, array_lengthof(Ops));
+ }
+
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
@@ -14022,7 +14374,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) {
// Sometimes the operand may come from a insert_subvector building a 256-bit
// allones vector
- if (VT.getSizeInBits() == 256 &&
+ if (VT.is256BitVector() &&
N->getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
@@ -14260,6 +14612,41 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // Since X86 does not have CMOV for 8-bit integers, we don't convert
+ // 8-bit integer abs to NEG and CMOV.
+ if (VT.isInteger() && VT.getSizeInBits() == 8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+ // and change it to SUB and CMOV.
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+ N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA &&
+ N1.getOperand(0) == N0.getOperand(0))
+ if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+ if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+ // Generate SUB & CMOV.
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, VT), N0.getOperand(0));
+
+ SDValue Ops[] = { N0.getOperand(0), Neg,
+ DAG.getConstant(X86::COND_GE, MVT::i8),
+ SDValue(Neg.getNode(), 1) };
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
+ Ops, array_lengthof(Ops));
+ }
+ return SDValue();
+}
+
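+// A minimal sketch (not part of this patch) of the source pattern matched
+// above: the classic branch-free abs, where Y = X >> 31 is the SRA node.
+// XOR(ADD(X, Y), Y) equals |X| in two's complement, and the combine
+// replaces it with the cheaper SUB + CMOV sequence.
+static int absViaSra(int X) {
+  int Y = X >> 31;    // all-ones if X is negative, zero otherwise
+  return (X + Y) ^ Y; // matched as XOR(ADD(X, Y), Y)
+}
+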
// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -14267,6 +14654,16 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (Subtarget->hasCMov()) {
+ SDValue RV = performIntegerAbsCombine(N, DAG);
+ if (RV.getNode())
+ return RV;
+ }
+
+ // The rest of this combine forms BLSMSK, so bail out unless BMI is
+ // available.
+ if (!Subtarget->hasBMI())
+ return SDValue();
+
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
@@ -14292,7 +14689,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
@@ -14314,63 +14712,94 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
unsigned RegSz = RegVT.getSizeInBits();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
- // All sizes must be a power of two
- if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
- // Attempt to load the original value using a single load op.
- // Find a scalar type which is equal to the loaded word size.
+ // All sizes must be a power of two.
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+ return SDValue();
+
+ // Attempt to load the original value using scalar loads.
+ // Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
- break;
}
}
- // Proceed if a load word is found.
- if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+ // On 32-bit systems we can't load 64-bit integers; try bitcasting to f64.
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+ (64 <= MemSz))
+ SclrLoadTy = MVT::f64;
+ // Calculate the number of scalar loads that we need to perform
+ // in order to load our vector from memory.
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+ // Represent our vector as a sequence of elements of the largest scalar
+ // type that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
RegSz/SclrLoadTy.getSizeInBits());
+ // Represent the data using the same element type that is stored in
+ // memory. In practice, we "widen" MemVT.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
RegSz/MemVT.getScalarType().getSizeInBits());
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
- // Perform a single load.
- SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+ "Invalid vector type");
+
+ // We can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
- // Insert the word loaded into a vector.
- SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- LoadUnitVecVT, ScalarLoad);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ptr, Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ Chains.push_back(ScalarLoad.getValue(1));
+ // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+ // another round of DAGCombining.
+ if (i == 0)
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+ else
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+ ScalarLoad, DAG.getIntPtrConstant(i));
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
- SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
- ScalarInVector);
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
unsigned SizeRatio = RegSz/MemSz;
// Redistribute the loaded elements into the different locations.
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio] = i;
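+ // (Sketch, not in the patch: for NumElems = 4 and SizeRatio = 4 the mask
+ // is {0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1} -- loaded element
+ // i moves to wide lane i*SizeRatio and every other lane stays undef.)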
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(SlicedVec.getValueType()),
- ShuffleVec.data());
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
// Bitcast to the requested type.
Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
// Replace the original load with the new sequence
// and return the new chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
- return SDValue(ScalarLoad.getNode(), 1);
+ return DCI.CombineTo(N, Shuff, TF, true);
}
return SDValue();
@@ -14387,13 +14816,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we are saving a concatenation of two XMM registers, perform two stores.
- // This is better in Sandy Bridge cause one 256-bit mem op is done via two
- // 128-bit ones. If in the future the cost becomes only one memory access the
- // first version would be better.
- if (VT.getSizeInBits() == 256 &&
- StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
- StoredVal.getNumOperands() == 2) {
-
+ // On Sandy Bridge, 256-bit memory operations are executed by two
+ // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+ // memory operation.
+ if (VT.is256BitVector() && !Subtarget->hasAVX2() &&
+ StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+ StoredVal.getNumOperands() == 2) {
SDValue Value0 = StoredVal.getOperand(0);
SDValue Value1 = StoredVal.getOperand(1);
@@ -14438,14 +14866,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
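+ // (Note, not in the patch: this is the inverse of the load-widening
+ // mask -- wide lane i*SizeRatio carries narrow element i, so the shuffle
+ // compacts the live elements to the bottom of the register.)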
- // Can't shuffle using an illegal type
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec.data());
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
@@ -14454,13 +14884,18 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
+ // On 32-bit systems we can't store 64-bit integers; try bitcasting to f64.
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+ (64 <= NumElems * ToSz))
+ StoreType = MVT::f64;
+
// Bitcast the original vector into a vector of store-size units
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+ StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
@@ -14469,7 +14904,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
- for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+ for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
DAG.getIntPtrConstant(i));
@@ -14818,18 +15253,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps())
return SDValue();
- if (!Subtarget->hasAVX())
+ if (!Subtarget->hasAVX())
return SDValue();
- // Optimize vectors in AVX mode
- // Sign extend v8i16 to v8i32 and
- // v4i32 to v4i64
- //
- // Divide input vector into two parts
- // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
- // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
- // concat the vectors to original VT
-
EVT VT = N->getValueType(0);
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
@@ -14838,23 +15264,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
(VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
+
+ // Optimize vectors in AVX mode
+ // Sign extend v8i16 to v8i32 and
+ // v4i32 to v4i64
+ //
+ // Divide input vector into two parts
+ // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+ // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+ // concat the vectors to original VT
+
unsigned NumElems = OpVT.getVectorNumElements();
+ SDValue Undef = DAG.getUNDEF(OpVT);
+
SmallVector<int,8> ShufMask1(NumElems, -1);
- for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask1[i] = i;
- SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
- ShufMask1.data());
+ SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
SmallVector<int,8> ShufMask2(NumElems, -1);
- for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask2[i] = i + NumElems/2;
- SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
- ShufMask2.data());
+ SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
VT.getVectorNumElements()/2);
- OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+ OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
@@ -14862,7 +15302,42 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
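// The split sign-extension above in scalar form (sketch, not part of this
// patch): extend each half with vpmovsx, then concatenate the halves.
static void sextSplit(const short In[8], int Out[8]) {
  for (int i = 0; i != 4; ++i) Out[i] = In[i];         // low half, vpmovsxwd
  for (int i = 0; i != 4; ++i) Out[4 + i] = In[4 + i]; // high half
}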
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+ return SDValue();
+
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+ SDValue C = N->getOperand(2);
+
+ bool NegA = (A.getOpcode() == ISD::FNEG);
+ bool NegB = (B.getOpcode() == ISD::FNEG);
+ bool NegC = (C.getOpcode() == ISD::FNEG);
+
+ // The multiplication is negated when exactly one of NegA, NegB is set.
+ bool NegMul = (NegA != NegB);
+ if (NegA)
+ A = A.getOperand(0);
+ if (NegB)
+ B = B.getOperand(0);
+ if (NegC)
+ C = C.getOperand(0);
+
+ unsigned Opcode;
+ if (!NegMul)
+ Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+ else
+ Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+ return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
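+// The opcode selection above in scalar form (sketch, not part of this
+// patch): the product is negated when exactly one multiplicand carries an
+// FNEG, and a negated addend selects the *SUB flavor.
+static double fmaFolded(double A, double B, double C,
+                        bool NegA, bool NegB, bool NegC) {
+  double Mul = A * B;
+  if (NegA != NegB) Mul = -Mul;    // FNMADD / FNMSUB
+  return NegC ? Mul - C : Mul + C; // *SUB vs *ADD
+}
+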
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
@@ -14887,6 +15362,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, VT));
}
+
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
@@ -14899,50 +15375,139 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
- if (Subtarget->hasAVX()) {
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
- if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
- SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
- SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
- SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
- EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- VT.getVectorNumElements()/2);
+ SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+ SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
+ SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
- OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
- }
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+ return SDValue();
+}
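+
+// The unpack-with-zero trick above in scalar terms (sketch, not part of
+// this patch): interleaving source lanes with zero lanes is exactly a
+// per-lane zero-extension. Low half of a v8i16 -> v8i32 widening shown.
+static void zextLowViaUnpackl(const unsigned short In[8], unsigned Out[4]) {
+  for (int i = 0; i != 4; ++i)
+    Out[i] = In[i]; // punpcklwd(In, Zero): each i16 gains 16 zero bits
+}
+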
+// Optimize x == -y --> x+y == 0
+// x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ LHS.getValueType(), RHS, LHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ RHS.getValueType(), LHS, RHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
return SDValue();
}
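// A plain-C sketch (not part of this patch) of the rewrite above. In the
// DAG the add wraps, so the fold is exact even where -y would overflow:
static bool eqNegBefore(int x, int y) { return x == -y; }
static bool eqNegAfter(int x, int y) { return (x + y) == 0; } // one ADD sets the flags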
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
- unsigned X86CC = N->getConstantOperandVal(0);
- SDValue EFLAG = N->getOperand(1);
DebugLoc DL = N->getDebugLoc();
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+ SDValue EFLAGS = N->getOperand(1);
// Materialize "setb reg" as "sbb reg,reg", since it can be extended without
// a zext and produces an all-ones bit which is more useful than 0/1 in some
// cases.
- if (X86CC == X86::COND_B)
+ if (CC == X86::COND_B)
return DAG.getNode(ISD::AND, DL, MVT::i8,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8), EFLAG),
+ DAG.getConstant(CC, MVT::i8), EFLAGS),
DAG.getConstant(1, MVT::i8));
+ SDValue Flags;
+
+ Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+ }
+
+ return SDValue();
+}
+
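+// A minimal sketch (not part of this patch) of the COND_B materialization
+// above: "sbb r,r" computes r - r - CF, i.e. all-ones when the carry flag
+// is set and zero otherwise, so masking with 1 recovers the setb result.
+static unsigned setbViaSbb(unsigned r, bool carry) {
+  unsigned v = r - r - (carry ? 1u : 0u); // SETCC_CARRY: 0 or 0xFFFFFFFF
+  return v & 1u;                          // the 0/1 boolean
+}
+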
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest = N->getOperand(1);
+ SDValue EFLAGS = N->getOperand(3);
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+ SDValue Flags;
+
+ Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+ Flags);
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = N->getOperand(0);
+ EVT InVT = Op0->getValueType(0);
+
+ // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
+ if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+ // Notice that we use SINT_TO_FP because we know that the high bits
+ // are zero and SINT_TO_FP is better supported by the hardware.
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+ }
+
return SDValue();
}
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
const X86TargetLowering *XTLI) {
SDValue Op0 = N->getOperand(0);
+ EVT InVT = Op0->getValueType(0);
+
+ // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+ if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+ }
+
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (Op0.getOpcode() == ISD::LOAD) {
@@ -14961,6 +15526,20 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // v4i8 = FP_TO_SINT(x) -> v4i8 = TRUNCATE(v4i32 = FP_TO_SINT(x))
+ if (VT == MVT::v8i8 || VT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
+ }
+
+ return SDValue();
+}
+
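+// Scalar form (sketch, not part of this patch) of the FP_TO_SINT narrowing
+// above: convert at a legal lane width, then truncate each lane. Assumes
+// the input is in range for the conversion.
+static signed char fpToI8(float f) {
+  int wide = (int)f;        // FP_TO_SINT at the legal v4i32 width
+  return (signed char)wide; // TRUNCATE down to the requested i8 lanes
+}
+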
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -15095,9 +15674,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
- case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -15105,10 +15686,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
- case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
+ case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
+ case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:
case X86ISD::UNPCKH:
@@ -15123,6 +15707,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+ case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
}
return SDValue();
@@ -15652,55 +16237,55 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// in the normal allocation?
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget->is64Bit()) {
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, X86::GR32RegisterClass);
- else if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16RegisterClass);
- else if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8RegisterClass);
- else if (VT == MVT::i64 || VT == MVT::f64)
- return std::make_pair(0U, X86::GR64RegisterClass);
- break;
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i64 || VT == MVT::f64)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
}
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
- else if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
- else if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
- else if (VT == MVT::i64)
- return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8RegisterClass);
+ return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16RegisterClass);
+ return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
- return std::make_pair(0U, X86::GR32RegisterClass);
- return std::make_pair(0U, X86::GR64RegisterClass);
+ return std::make_pair(0U, &X86::GR32RegClass);
+ return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || !Subtarget->is64Bit())
- return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
- return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
- return std::make_pair(0U, X86::RFP32RegisterClass);
+ return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
- return std::make_pair(0U, X86::RFP64RegisterClass);
- return std::make_pair(0U, X86::RFP80RegisterClass);
+ return std::make_pair(0U, &X86::RFP64RegClass);
+ return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget->hasMMX()) break;
- return std::make_pair(0U, X86::VR64RegisterClass);
+ return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget->hasSSE2()) break;
// FALL THROUGH.
@@ -15712,10 +16297,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
- return std::make_pair(0U, X86::FR32RegisterClass);
+ return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
- return std::make_pair(0U, X86::FR64RegisterClass);
+ return std::make_pair(0U, &X86::FR64RegClass);
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
@@ -15723,7 +16308,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
- return std::make_pair(0U, X86::VR128RegisterClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
@@ -15731,8 +16316,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
- return std::make_pair(0U, X86::VR256RegisterClass);
-
+ return std::make_pair(0U, &X86::VR256RegClass);
}
break;
}
@@ -15755,28 +16339,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Constraint[6] == '}') {
Res.first = X86::ST0+Constraint[4]-'0';
- Res.second = X86::RFP80RegisterClass;
+ Res.second = &X86::RFP80RegClass;
return Res;
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint)) {
Res.first = X86::ST0;
- Res.second = X86::RFP80RegisterClass;
+ Res.second = &X86::RFP80RegClass;
return Res;
}
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint)) {
Res.first = X86::EFLAGS;
- Res.second = X86::CCRRegisterClass;
+ Res.second = &X86::CCRRegClass;
return Res;
}
// 'A' means EAX + EDX.
if (Constraint == "A") {
Res.first = X86::EAX;
- Res.second = X86::GR32_ADRegisterClass;
+ Res.second = &X86::GR32_ADRegClass;
return Res;
}
return Res;
@@ -15792,7 +16376,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
// really want an 8-bit or 32-bit register, map to the appropriate register
// class and return the appropriate register.
- if (Res.second == X86::GR16RegisterClass) {
+ if (Res.second == &X86::GR16RegClass) {
if (VT == MVT::i8) {
unsigned DestReg = 0;
switch (Res.first) {
@@ -15804,7 +16388,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR8RegisterClass;
+ Res.second = &X86::GR8RegClass;
}
} else if (VT == MVT::i32) {
unsigned DestReg = 0;
@@ -15821,7 +16405,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR32RegisterClass;
+ Res.second = &X86::GR32RegClass;
}
} else if (VT == MVT::i64) {
unsigned DestReg = 0;
@@ -15838,22 +16422,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR64RegisterClass;
+ Res.second = &X86::GR64RegClass;
}
}
- } else if (Res.second == X86::FR32RegisterClass ||
- Res.second == X86::FR64RegisterClass ||
- Res.second == X86::VR128RegisterClass) {
+ } else if (Res.second == &X86::FR32RegClass ||
+ Res.second == &X86::FR64RegClass ||
+ Res.second == &X86::VR128RegClass) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
- if (VT == MVT::f32)
- Res.second = X86::FR32RegisterClass;
- else if (VT == MVT::f64)
- Res.second = X86::FR64RegisterClass;
- else if (X86::VR128RegisterClass->hasType(VT))
- Res.second = X86::VR128RegisterClass;
+
+ if (VT == MVT::f32 || VT == MVT::i32)
+ Res.second = &X86::FR32RegClass;
+ else if (VT == MVT::f64 || VT == MVT::i64)
+ Res.second = &X86::FR64RegClass;
+ else if (X86::VR128RegClass.hasType(VT))
+ Res.second = &X86::VR128RegClass;
+ else if (X86::VR256RegClass.hasType(VT))
+ Res.second = &X86::VR256RegClass;
}
return Res;
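
Note: the remapping above is user-visible through inline asm — the same SSE-register constraint must land in FR32, FR64, VR128, or (new in this patch) VR256 depending on the operand type. A minimal illustrative sketch, hypothetical user code not taken from this patch (the __m256 case assumes AVX is enabled):

  #include <immintrin.h>

  // Each operand uses the same SSE-register constraint "x"; the backend
  // picks the register class from the value type, per the code above.
  float  in_fr32 (float  a) { asm("" : "+x"(a)); return a; }  // FR32
  double in_fr64 (double a) { asm("" : "+x"(a)); return a; }  // FR64
  __m128 in_vr128(__m128 a) { asm("" : "+x"(a)); return a; }  // VR128
  __m256 in_vr256(__m256 a) { asm("" : "+x"(a)); return a; }  // VR256 (new case)
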
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 09116e8..9123ebd 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -137,10 +137,6 @@ namespace llvm {
/// relative displacements.
WrapperRIP,
- /// MOVQ2DQ - Copies a 64-bit value from an MMX vector to the low word
- /// of an XMM vector, with the high word zero filled.
- MOVQ2DQ,
-
/// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector
/// to an MMX vector. If you think this is too close to the previous
/// mnemonic, so do I; blame Intel.
@@ -207,6 +203,10 @@ namespace llvm {
// TLSADDR - Thread Local Storage.
TLSADDR,
+ // TLSBASEADDR - Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
// TLSCALL - Thread Local Storage. When calling to an OS provided
// thunk at the address from an earlier relocation.
TLSCALL,
@@ -242,9 +242,6 @@ namespace llvm {
// PCMP* - Vector integer comparisons.
PCMPEQ, PCMPGT,
- // VPCOM, VPCOMU - XOP Vector integer comparisons.
- VPCOM, VPCOMU,
-
// ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
@@ -293,6 +290,14 @@ namespace llvm {
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
+ // FMA nodes
+ FMADD,
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded
// with control flow.
@@ -315,6 +320,19 @@ namespace llvm {
SFENCE,
LFENCE,
+ // FNSTSW16r - Store FP status word into i16 register.
+ FNSTSW16r,
+
+ // SAHF - Store contents of %ah into %eflags.
+ SAHF,
+
+ // RDRAND - Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // PCMP*STRI
+ PCMPISTRI,
+ PCMPESTRI,
+
// ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
// ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
// Atomic 64-bit binary operations.
@@ -558,6 +576,18 @@ namespace llvm {
/// by AM is legal for this target, for a load/store of the specified type.
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ virtual bool isLegalICmpImmediate(int64_t Imm) const;
+
+ /// isLegalAddImmediate - Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ virtual bool isLegalAddImmediate(int64_t Imm) const;
+
/// isTruncateFree - Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
/// register EAX to i16 by referencing its sub-register AX.
@@ -575,6 +605,12 @@ namespace llvm {
virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+ /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+ /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+ /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+ /// is expanded to mul + add.
+ virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
/// from i32 to i8 but not from i32 to i16.
@@ -634,7 +670,8 @@ namespace llvm {
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
- virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const;
+ virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const;
/// getStackCookieLocation - Return true if the target stores stack
/// protector cookies at a fixed offset in some non-standard address
@@ -761,6 +798,7 @@ namespace llvm {
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
@@ -797,12 +835,7 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -822,9 +855,9 @@ namespace llvm {
virtual bool
CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const;
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const;
void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned NewOp) const;
@@ -909,10 +942,14 @@ namespace llvm {
/// equivalent, for use with the given x86 condition code.
SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
SelectionDAG &DAG) const;
+
+ /// Convert a comparison if required by the subtarget.
+ SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
};
namespace X86 {
- FastISel *createFastISel(FunctionLoweringInfo &funcInfo);
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
}
}
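
Note: the two new hooks declared above tell target-independent passes (e.g. loop strength reduction) when an immediate can be folded directly into a compare or add instead of being materialized in a register. On x86 both instructions take a sign-extended 32-bit immediate, so a plausible implementation — a sketch, not necessarily the exact bodies in this revision — is:

  #include "llvm/Support/MathExtras.h"

  // cmp/add r64, imm32 sign-extends the immediate, so any value that fits
  // in a signed 32-bit field needs no separate mov-immediate.
  bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
    return isInt<32>(Imm);
  }
  bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
    return isInt<32>(Imm);
  }
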
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 0eee083..f790611 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1132,8 +1132,10 @@ defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
X86xor_flag, xor, 1, 0>;
defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
X86add_flag, add, 1, 1>;
+let isCompare = 1 in {
defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
X86sub_flag, sub, 0, 0>;
+}
// Arithmetic.
let Uses = [EFLAGS] in {
@@ -1143,7 +1145,9 @@ let Uses = [EFLAGS] in {
0, 0>;
}
+let isCompare = 1 in {
defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
//===----------------------------------------------------------------------===//
@@ -1154,7 +1158,7 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
(X86cmp (and_su node:$lhs, node:$rhs), 0)>;
-let Defs = [EFLAGS] in {
+let isCompare = 1, Defs = [EFLAGS] in {
let isCommutable = 1 in {
def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index fa1d676..aaef4a4 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -55,11 +55,11 @@ struct X86AddressMode {
: BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
Base.Reg = 0;
}
-
-
+
+
void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
-
+
if (BaseType == X86AddressMode::RegBase)
MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
false, false, false, 0, false));
@@ -67,16 +67,16 @@ struct X86AddressMode {
assert(BaseType == X86AddressMode::FrameIndexBase);
MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
}
-
+
MO.push_back(MachineOperand::CreateImm(Scale));
MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
false, false, false, 0, false));
-
+
if (GV)
MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
else
MO.push_back(MachineOperand::CreateImm(Disp));
-
+
MO.push_back(MachineOperand::CreateReg(0, false, false,
false, false, false, 0, false));
}
@@ -122,7 +122,7 @@ static inline const MachineInstrBuilder &
addFullAddress(const MachineInstrBuilder &MIB,
const X86AddressMode &AM) {
assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
-
+
if (AM.BaseType == X86AddressMode::RegBase)
MIB.addReg(AM.Base.Reg);
else {
@@ -135,7 +135,7 @@ addFullAddress(const MachineInstrBuilder &MIB,
MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
else
MIB.addImm(AM.Disp);
-
+
return MIB.addReg(0);
}
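
Note: the five MachineOperands appended above encode the canonical x86 memory reference — base register, scale immediate, index register, displacement (or global address), and segment register. A hypothetical use building roughly [RBX + 4*RCX + 16], a sketch rather than code from this patch:

  X86AddressMode AM;              // defaults: RegBase, Scale = 1
  AM.Base.Reg = X86::RBX;
  AM.Scale    = 4;
  AM.IndexReg = X86::RCX;
  AM.Disp     = 16;
  // Appends all five operands to a memory-form instruction:
  addFullAddress(BuildMI(MBB, MI, DL, TII.get(X86::MOV64rm), DstReg), AM);
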
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 6f9e849..99c2b8f 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -375,11 +375,16 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [ESP] in
+ Uses = [ESP] in {
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
[(X86tlsaddr tls32addr:$sym)]>,
Requires<[In32BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addr32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[In32BitMode]>;
+}
// All calls clobber the non-callee saved registers. RSP is marked as
// a use to prevent stack-pointer assignments that appear immediately
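
Note: the TLS_base_addr32/64 pseudos added here back the new X86tlsbaseaddr node. Under the local-dynamic TLS model the module's TLS block address is fetched once and each variable becomes a constant offset from it, instead of one general-dynamic call per variable. Illustrative C++, assuming PIC and the local-dynamic model:

  static thread_local int a;
  static thread_local int b;

  // One TLS_base_addr call retrieves the module's TLS base; both loads
  // below then use base + constant-offset addressing.
  int sum() { return a + b; }
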
@@ -389,11 +394,16 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [RSP] in
+ Uses = [RSP] in {
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_base_addr64",
+ [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+ Requires<[In64BitMode]>;
+}
// Darwin TLS Support
// For i386, the address of the thunk is passed on the stack, on return the
@@ -1008,8 +1018,8 @@ def : Pat<(X86call (i64 texternalsym:$dst)),
(CALL64pcrel32 texternalsym:$dst)>;
// tailcall stuff
-def : Pat<(X86tcret GR32_TC:$dst, imm:$off),
- (TCRETURNri GR32_TC:$dst, imm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In32BitMode]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
@@ -1623,6 +1633,12 @@ def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
(SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
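
Note: the new X86sub_flag patterns above let (0 - x) select the one-operand NEG even when its flags result is consumed, rather than materializing a zero for SUB. An illustrative C++ sketch (codegen shown is plausible, not verified against this revision):

  // NEG computes 0 - x and sets EFLAGS from that subtraction, so the
  // negation and the sign test below can plausibly share one negl.
  int abs_ish(int x) {
    int n = -x;
    return n < 0 ? x : n;
  }
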
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index bf11fde..b0c27c8 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -18,16 +18,16 @@
// Return instructions.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP in {
- def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ def RET : I <0xC3, RawFrm, (outs), (ins),
"ret",
[(X86retflag 0)], IIC_RET>;
- def RETW : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}",
[], IIC_RET>, OpSize;
- def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret\t$amt",
[(X86retflag timm:$amt)], IIC_RET_IMM>;
- def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret{w}\t$amt",
[], IIC_RET_IMM>, OpSize;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
@@ -148,12 +148,12 @@ let isCall = 1 in
// registers are added manually.
let Uses = [ESP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i32imm_pcrel:$dst,variable_ops),
+ (outs), (ins i32imm_pcrel:$dst),
"call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>;
- def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
Requires<[In32BitMode]>;
- def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>,
Requires<[In32BitMode]>;
@@ -174,7 +174,7 @@ let isCall = 1 in
// callw for 16 bit code for the assembler.
let isAsmParserOnly = 1 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
- (outs), (ins i16imm_pcrel:$dst, variable_ops),
+ (outs), (ins i16imm_pcrel:$dst),
"callw\t$dst", []>, OpSize;
}
@@ -185,23 +185,23 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1 in
let Uses = [ESP] in {
def TCRETURNdi : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>;
+ (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
def TCRETURNri : PseudoI<(outs),
- (ins GR32_TC:$dst, i32imm:$offset, variable_ops), []>;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
let mayLoad = 1 in
def TCRETURNmi : PseudoI<(outs),
- (ins i32mem_TC:$dst, i32imm:$offset, variable_ops), []>;
+ (ins i32mem_TC:$dst, i32imm:$offset), []>;
// FIXME: These should be pseudo instructions that are lowered when going to
// mcinst.
def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i32imm_pcrel:$dst, variable_ops),
+ (ins i32imm_pcrel:$dst),
"jmp\t$dst # TAILCALL",
[], IIC_JMP_REL>;
- def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops),
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
let mayLoad = 1 in
- def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops),
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
"jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
}
@@ -218,14 +218,14 @@ let isCall = 1, Uses = [RSP] in {
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ (outs), (ins i64i32imm_pcrel:$dst),
"call{q}\t$dst", [], IIC_CALL_RI>,
Requires<[In64BitMode]>;
- def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)],
IIC_CALL_RI>,
Requires<[In64BitMode]>;
- def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
IIC_CALL_MEM>,
Requires<[In64BitMode]>;
@@ -240,7 +240,7 @@ let isCall = 1, isCodeGenOnly = 1 in
let Defs = [RAX, R10, R11, RSP, EFLAGS],
Uses = [RSP] in {
def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ (outs), (ins i64i32imm_pcrel:$dst),
"call{q}\t$dst", [], IIC_CALL_RI>,
Requires<[IsWin64]>;
}
@@ -250,21 +250,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
let Uses = [RSP],
usesCustomInserter = 1 in {
def TCRETURNdi64 : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
[]>;
def TCRETURNri64 : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset, variable_ops), []>;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
- (ins i64mem_TC:$dst, i32imm:$offset, variable_ops), []>;
+ (ins i64mem_TC:$dst, i32imm:$offset), []>;
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst, variable_ops),
+ (ins i64i32imm_pcrel:$dst),
"jmp\t$dst # TAILCALL", [], IIC_JMP_REL>;
- def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops),
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
let mayLoad = 1 in
- def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops),
+ def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
"jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
}
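
Note: TCRETURNri/TAILJMPr now take ptr_rc_tailcall instead of GR32_TC, matching the 64-bit variants, and the open-ended variable_ops lists are dropped from calls and returns throughout (call clobbers and uses being modeled separately appears to be the rationale). What the TAILJMP forms produce, illustratively:

  int callee(int);

  // A sibling call: emitted as 'jmp callee # TAILCALL' (TAILJMPd/TAILJMPd64)
  // instead of call+ret, when tail-call optimization applies.
  int caller(int x) { return callee(x + 1); }
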
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 0d5490a..2eb454d 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -39,12 +39,15 @@ let neverHasSideEffects = 1 in {
// Sign/Zero extenders
+let neverHasSideEffects = 1 in {
def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
"movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
TB, OpSize;
+let mayLoad = 1 in
def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
"movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
TB, OpSize;
+} // neverHasSideEffects = 1
def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB;
@@ -59,12 +62,15 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
[(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
TB;
+let neverHasSideEffects = 1 in {
def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
"movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
TB, OpSize;
+let mayLoad = 1 in
def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
"movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
TB, OpSize;
+} // neverHasSideEffects = 1
def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB;
@@ -82,6 +88,7 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
// except that they use GR32_NOREX for the output operand register class
// instead of GR32. This allows them to operate on h registers on x86-64.
+let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
@@ -91,6 +98,7 @@ def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
[], IIC_MOVZX>, TB;
+}
// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
// operand, which makes it a rare instruction with an 8-bit register
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index d57937b..265b4bb 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -15,83 +15,245 @@
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
+let Constraints = "$src1 = $dst" in {
multiclass fma3p_rm<bits<8> opc, string OpcodeStr> {
+let neverHasSideEffects = 1 in {
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>;
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+ let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>;
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>;
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+ let mayLoad = 1 in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>;
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+} // neverHasSideEffects = 1
}
+// Intrinsic for 213 pattern
+multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ Intrinsic Int128, Intrinsic Int256, SDNode Op213,
+ ValueType OpVT128, ValueType OpVT256> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
+ VR128:$src3))]>;
+
+ def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2,
+ VR128:$src1, VR128:$src3)))]>;
+
+ def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
+ (MemFrag128 addr:$src3)))]>;
+
+ def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2, VR128:$src1,
+ (MemFrag128 addr:$src3))))]>;
+
+
+ def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1,
+ VR256:$src3))]>;
+
+ def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (OpVT256 (Op213 VR256:$src2, VR256:$src1,
+ VR256:$src3)))]>;
+
+ def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3)))]>;
+
+ def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (Op213 VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3))))]>;
+}
+} // Constraints = "$src1 = $dst"
+
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpcodeStr, string PackTy> {
- defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
- defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>;
- defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
+ string OpcodeStr, string PackTy,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ Intrinsic Int128, Intrinsic Int256, SDNode Op,
+ ValueType OpTy128, ValueType OpTy256> {
+ defm r213 : fma3p_rm_int <opc213, !strconcat(OpcodeStr,
+ !strconcat("213", PackTy)), MemFrag128, MemFrag256,
+ Int128, Int256, Op, OpTy128, OpTy256>;
+ defm r132 : fma3p_rm <opc132,
+ !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
+ defm r231 : fma3p_rm <opc231,
+ !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
}
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">;
- defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">;
- defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">;
- defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">;
+ defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfmadd_ps,
+ int_x86_fma_vfmadd_ps_256, X86Fmadd,
+ v4f32, v8f32>;
+ defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfmsub_ps,
+ int_x86_fma_vfmsub_ps_256, X86Fmsub,
+ v4f32, v8f32>;
+ defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
+ memopv4f32, memopv8f32,
+ int_x86_fma_vfmaddsub_ps,
+ int_x86_fma_vfmaddsub_ps_256, X86Fmaddsub,
+ v4f32, v8f32>;
+ defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
+ memopv4f32, memopv8f32,
+ int_x86_fma_vfmsubadd_ps,
+ int_x86_fma_vfmaddsub_ps_256, X86Fmsubadd,
+ v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W;
- defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W;
- defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W;
- defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W;
+ defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfmadd_pd,
+ int_x86_fma_vfmadd_pd_256, X86Fmadd, v2f64,
+ v4f64>, VEX_W;
+ defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfmsub_pd,
+ int_x86_fma_vfmsub_pd_256, X86Fmsub, v2f64,
+ v4f64>, VEX_W;
+ defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
+ memopv2f64, memopv4f64,
+ int_x86_fma_vfmaddsub_pd,
+ int_x86_fma_vfmaddsub_pd_256, X86Fmaddsub,
+ v2f64, v4f64>, VEX_W;
+ defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
+ memopv2f64, memopv4f64,
+ int_x86_fma_vfmsubadd_pd,
+ int_x86_fma_vfmsubadd_pd_256, X86Fmsubadd,
+ v2f64, v4f64>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">;
- defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">;
+ defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfnmadd_ps,
+ int_x86_fma_vfnmadd_ps_256, X86Fnmadd, v4f32,
+ v8f32>;
+ defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfnmsub_ps,
+ int_x86_fma_vfnmsub_ps_256, X86Fnmsub, v4f32,
+ v8f32>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W;
- defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W;
+ defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfnmadd_pd,
+ int_x86_fma_vfnmadd_pd_256, X86Fnmadd, v2f64,
+ v4f64>, VEX_W;
+ defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
+ memopv2f64,
+ memopv4f64, int_x86_fma_vfnmsub_pd,
+ int_x86_fma_vfnmsub_pd_256, X86Fnmsub, v2f64,
+ v4f64>, VEX_W;
}
-multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> {
- def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>;
- def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>;
+let Constraints = "$src1 = $dst" in {
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
+ RegisterClass RC> {
+let neverHasSideEffects = 1 in {
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+} // neverHasSideEffects = 1
}
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic IntId,
+ RegisterClass RC, SDNode OpNode, ValueType OpVT> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
+ VR128:$src3))]>;
+ def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>;
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+}
+} // Constraints = "$src1 = $dst"
+
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpcodeStr> {
- defm SSr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132ss"), f32mem>;
- defm SSr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213ss"), f32mem>;
- defm SSr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231ss"), f32mem>;
- defm SDr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132sd"), f64mem>, VEX_W;
- defm SDr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213sd"), f64mem>, VEX_W;
- defm SDr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231sd"), f64mem>, VEX_W;
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
+ defm SSr132 : fma3s_rm<opc132, !strconcat(OpStr, "132ss"), f32mem, FR32>;
+ defm SSr231 : fma3s_rm<opc231, !strconcat(OpStr, "231ss"), f32mem, FR32>;
+ defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>,
+ VEX_W;
+ defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>,
+ VEX_W;
+ defm SSr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213ss"), ssmem,
+ sse_load_f32, IntF32, FR32, OpNode, f32>;
+ defm SDr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213sd"), sdmem,
+ sse_load_f64, IntF64, FR64, OpNode, f64>, VEX_W;
}
-defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">, VEX_LIG;
-defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub">, VEX_LIG;
+defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+ int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+ int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+ int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+ int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
-defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd">, VEX_LIG;
-defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub">, VEX_LIG;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
@@ -178,43 +340,47 @@ let isCodeGenOnly = 1 in {
} // isCodeGenOnly = 1
}
+let Predicates = [HasFMA4] in {
+
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma4_vfmadd_ss>;
+ int_x86_fma_vfmadd_ss>;
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma4_vfmadd_sd>;
-defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma4_vfmadd_ps,
- int_x86_fma4_vfmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma4_vfmadd_pd,
- int_x86_fma4_vfmadd_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfmadd_sd>;
+defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps,
+ int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>;
+defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd,
+ int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma4_vfmsub_ss>;
+ int_x86_fma_vfmsub_ss>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma4_vfmsub_sd>;
-defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma4_vfmsub_ps,
- int_x86_fma4_vfmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma4_vfmsub_pd,
- int_x86_fma4_vfmsub_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfmsub_sd>;
+defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps,
+ int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>;
+defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd,
+ int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma4_vfnmadd_ss>;
+ int_x86_fma_vfnmadd_ss>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma4_vfnmadd_sd>;
-defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma4_vfnmadd_ps,
- int_x86_fma4_vfnmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma4_vfnmadd_pd,
- int_x86_fma4_vfnmadd_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfnmadd_sd>;
+defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps,
+ int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>;
+defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd,
+ int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma4_vfnmsub_ss>;
+ int_x86_fma_vfnmsub_ss>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma4_vfnmsub_sd>;
-defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma4_vfnmsub_ps,
- int_x86_fma4_vfnmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma4_vfnmsub_pd,
- int_x86_fma4_vfnmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma4_vfmaddsub_ps,
- int_x86_fma4_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma4_vfmaddsub_pd,
- int_x86_fma4_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma4_vfmsubadd_ps,
- int_x86_fma4_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma4_vfmsubadd_pd,
- int_x86_fma4_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfnmsub_sd>;
+defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps,
+ int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>;
+defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd,
+ int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>;
+defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps,
+ int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
+defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd,
+ int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
+defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps,
+ int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
+defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd,
+ int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+} // HasFMA4
+
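
Note: the digits in the FMA3 mnemonics name which operands feed the multiply and the add, with $dst doubling as $src1; only the 213 form is given intrinsic/DAG patterns above, while 132/231 remain pattern-less assembler forms. For the scalar-single case:

  vfmadd132ss: dst = dst  * src3 + src2
  vfmadd213ss: dst = src2 * dst  + src3   (the form with patterns here)
  vfmadd231ss: dst = src2 * src3 + dst

A user-level sketch reaching these through the FMA intrinsics (assumes compilation with FMA enabled):

  #include <immintrin.h>

  __m128 fma_ss(__m128 a, __m128 b, __m128 c) {
    return _mm_fmadd_ss(a, b, c);   // a*b + c with a single rounding
  }
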
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index a13887e..568726e 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -27,6 +27,7 @@ def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
SDTCisVT<2, OtherVT>]>;
def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
SDTCisVT<2, OtherVT>]>;
+def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -41,6 +42,7 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
SDNPMemOperand]>;
+def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
@@ -203,6 +205,7 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
}
}
+let Defs = [FPSW] in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
@@ -213,6 +216,7 @@ defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
defm MUL : FPBinary<fmul, MRM1m, "mul">;
defm DIV : FPBinary<fdiv, MRM6m, "div">;
defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+}
class FPST0rInst<bits<8> o, string asm>
: FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
@@ -257,6 +261,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
}
+let Defs = [FPSW] in {
defm CHS : FPUnary<fneg, 0xE0, "fchs">;
defm ABS : FPUnary<fabs, 0xE1, "fabs">;
defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
@@ -269,6 +274,7 @@ def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
}
def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+} // Defs = [FPSW]
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
@@ -316,6 +322,7 @@ multiclass FPCMov<PatLeaf cc> {
Requires<[HasCMov]>;
}
+let Defs = [FPSW] in {
let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
defm CMOVB : FPCMov<X86_COND_B>;
defm CMOVBE : FPCMov<X86_COND_BE>;
@@ -416,24 +423,40 @@ def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
}
let mayLoad = 1 in {
-def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
-def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
-def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
-def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
-def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
-def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+ IIC_FLD>;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+ IIC_FLD>;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+ IIC_FLD80>;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+ IIC_FILD>;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+ IIC_FILD>;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+ IIC_FILD>;
}
let mayStore = 1 in {
-def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
-def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
-def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
-def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
-def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
-def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
-def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
-def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
-def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
-def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+ IIC_FST>;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+ IIC_FST>;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+ IIC_FST>;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+ IIC_FST>;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+ IIC_FST80>;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+ IIC_FIST>;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+ IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+ IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+ IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+ IIC_FIST>;
}
// FISTTP requires SSE3 even though it's a FPStack op.
@@ -459,17 +482,23 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
} // Predicates = [HasSSE3]
let mayStore = 1 in {
-def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
-def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+ IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+ IIC_FST>;
def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
- "fisttp{ll}\t$dst">;
+ "fisttp{ll}\t$dst", IIC_FST>;
}
// FP Stack manipulation instructions.
-def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9;
-def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD;
-def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD;
-def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9;
+def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op",
+ IIC_FLD>, D9;
+def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op",
+ IIC_FST>, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op",
+ IIC_FST>, DD;
+def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op",
+ IIC_FXCH>, D9;
// Floating point constant loads.
let isReMaterializable = 1 in {
@@ -487,20 +516,21 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9;
-def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9;
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9;
// Floating point compares.
-let Defs = [EFLAGS] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- []>; // FPSW = cmp ST(0) with ST(i)
+ [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- []>; // FPSW = cmp ST(0) with ST(i)
+ [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- []>; // FPSW = cmp ST(0) with ST(i)
-
+ [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // Defs = [FPSW]
+
// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
@@ -509,85 +539,94 @@ def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
[(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
}
-let Defs = [EFLAGS], Uses = [ST0] in {
+let Defs = [FPSW], Uses = [ST0] in {
def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
(outs), (ins RST:$reg),
- "fucom\t$reg">, DD;
+ "fucom\t$reg", IIC_FUCOM>, DD;
def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
(outs), (ins RST:$reg),
- "fucomp\t$reg">, DD;
+ "fucomp\t$reg", IIC_FUCOM>, DD;
def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
(outs), (ins),
- "fucompp">, DA;
+ "fucompp", IIC_FUCOM>, DA;
+}
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
(outs), (ins RST:$reg),
- "fucomi\t$reg">, DB;
+ "fucomi\t$reg", IIC_FUCOMI>, DB;
def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
(outs), (ins RST:$reg),
- "fucompi\t$reg">, DF;
+ "fucompi\t$reg", IIC_FUCOMI>, DF;
}
+let Defs = [EFLAGS, FPSW] in {
def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
- "fcomi\t$reg">, DB;
+ "fcomi\t$reg", IIC_FCOMI>, DB;
def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
- "fcompi\t$reg">, DF;
+ "fcompi\t$reg", IIC_FCOMI>, DF;
+}
// Floating point flag ops.
-let Defs = [AX] in
-def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags
- (outs), (ins), "fnstsw %ax", []>, DF;
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags
+ (outs), (ins), "fnstsw %ax",
+ [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
- [(X86fp_cwd_get16 addr:$dst)]>;
+ [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
let mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
- (outs), (ins i16mem:$dst), "fldcw\t$dst", []>;
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>;
// FPU control instructions
-def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", []>, DB;
+let Defs = [FPSW] in
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB;
def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
- "ffree\t$reg">, DD;
+ "ffree\t$reg", IIC_FFREE>, DD;
// Clear exceptions
-def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", []>, DB;
+let Defs = [FPSW] in
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB;
// Operandless floating-point instructions for the disassembler.
-def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
-
-def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", []>, D9;
-def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", []>, D9;
-def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", []>, D9;
-def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", []>, D9;
-def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", []>, D9;
-def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", []>, D9;
-def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", []>, D9;
-def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", []>, D9;
-def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", []>, D9;
-def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", []>, D9;
-def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", []>, D9;
-def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", []>, D9;
-def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", []>, D9;
-def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", []>, D9;
-def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", []>, D9;
-def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", []>, D9;
-def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", []>, D9;
-def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", []>, D9;
-def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", []>, D9;
-def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", []>, D9;
-def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE;
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9;
+def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9;
+def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9;
+def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9;
+def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9;
+def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9;
+def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9;
+def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9;
+def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9;
+def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9;
+def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9;
+def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9;
+def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9;
+def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9;
+def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9;
+def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9;
+def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9;
+def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9;
+def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9;
+def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9;
+def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE;
def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
- "fxsave\t$dst", []>, TB;
+ "fxsave\t$dst", [], IIC_FXSAVE>, TB;
def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
- "fxsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W,
+ Requires<[In64BitMode]>;
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", []>, TB;
+ "fxrstor\t$src", [], IIC_FXRSTOR>, TB;
def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstorq\t$src", []>, TB, REX_W, Requires<[In64BitMode]>;
+ "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W,
+ Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
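
Note: the FNSTSW16r and SAHF nodes (together with the FPSW defs threaded through this file) model the classic pre-FCOMI way of getting an x87 compare result into EFLAGS. Roughly what a compare lowers to on targets without FUCOMI — illustrative only:

  // fucompp       ; compare ST(0) with ST(1), result lands in FPSW
  // fnstsw %ax    ; FNSTSW16r: copy FPSW into AX
  // sahf          ; SAHF: move AH into EFLAGS
  // jb/setb ...   ; ordinary integer flag tests from here on
  bool less(long double a, long double b) { return a < b; }
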
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index b387090..81b4f81 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -255,8 +255,9 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
// FPStack Instruction Templates:
// FPI - Floating Point Instruction template.
-class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
- : I<o, F, outs, ins, asm, []> {}
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, [], itin> {}
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
@@ -365,6 +366,7 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
//
// SDI - SSE2 instructions with XD prefix.
// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// S2SI - SSE2 instructions with XS prefix.
// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
// PDI - SSE2 instructions with TB and OpSize prefixes.
// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
@@ -377,8 +379,11 @@ class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
-class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
@@ -392,6 +397,10 @@ class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
Requires<[HasAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+ Requires<[HasAVX]>;
class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
@@ -503,29 +512,29 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSE2, HasAES]>;
+ Requires<[HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- Requires<[HasSSE2, HasAES]>;
+ Requires<[HasAES]>;
-// CLMUL Instruction Templates
-class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- OpSize, Requires<[HasSSE2, HasCLMUL]>;
+ OpSize, Requires<[HasPCLMUL]>;
-class AVXCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>;
+ OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin>, T8,
- OpSize, VEX_4V, Requires<[HasFMA3]>;
+ OpSize, VEX_4V, Requires<[HasFMA]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 35801e4..d13167b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,9 +71,14 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",
SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisOpSmallerThanOp<1, 0> ]>>;
+
def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
-
+
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
@@ -102,13 +107,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
-def X86vpcom : SDNode<"X86ISD::VPCOM",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
-def X86vpcomu : SDNode<"X86ISD::VPCOMU",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
-
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<1,2>]>>;
@@ -127,7 +125,10 @@ def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+ SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
@@ -162,9 +163,26 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
-def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>;
-def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>;
-def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>;
+def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>;
+def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>;
+def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>;
+def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
+
+def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
+ SDTCisVT<4, i8>]>;
+def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
+ SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
+ SDTCisVT<6, i8>]>;
+
+def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
+def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
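+
+// Both string-compare nodes produce an i32 index and an i32 flags result:
+// PCMPISTRI takes two v16i8 strings plus an i8 control immediate, while
+// PCMPESTRI also takes the explicit i32 lengths of the two strings.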
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
@@ -304,7 +322,7 @@ def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
}]>;
def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
+ (st node:$val, node:$ptr), [{
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
return ST->isNonTemporal() && !ST->isTruncatingStore() &&
ST->getAddressingMode() == ISD::UNINDEXED &&
@@ -313,7 +331,7 @@ def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
}]>;
def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
+ (st node:$val, node:$ptr), [{
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
return ST->isNonTemporal() &&
ST->getAlignment() < 16;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index b12c1db..cca04e5 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -54,38 +55,39 @@ ReMatPICStubLoad("remat-pic-stub-load",
enum {
// Select which memory operand is being unfolded.
- // (stored in bits 0 - 7)
+ // (stored in bits 0 - 3)
TB_INDEX_0 = 0,
TB_INDEX_1 = 1,
TB_INDEX_2 = 2,
- TB_INDEX_MASK = 0xff,
-
- // Minimum alignment required for load/store.
- // Used for RegOp->MemOp conversion.
- // (stored in bits 8 - 15)
- TB_ALIGN_SHIFT = 8,
- TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
- TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
- TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
- TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT,
+ TB_INDEX_3 = 3,
+ TB_INDEX_MASK = 0xf,
// Do not insert the reverse map (MemOp -> RegOp) into the table.
// This may be needed because there is a many -> one mapping.
- TB_NO_REVERSE = 1 << 16,
+ TB_NO_REVERSE = 1 << 4,
// Do not insert the forward map (RegOp -> MemOp) into the table.
// This is needed for Native Client, which prohibits branch
// instructions from using a memory operand.
- TB_NO_FORWARD = 1 << 17,
+ TB_NO_FORWARD = 1 << 5,
- TB_FOLDED_LOAD = 1 << 18,
- TB_FOLDED_STORE = 1 << 19
+ TB_FOLDED_LOAD = 1 << 6,
+ TB_FOLDED_STORE = 1 << 7,
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
};
struct X86OpTblEntry {
uint16_t RegOp;
uint16_t MemOp;
- uint32_t Flags;
+ uint16_t Flags;
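+ // The unfold index now lives in bits 0-3, the boolean flags in bits 4-7,
+ // and the alignment in bits 8-15, so 16 bits are sufficient per entry.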
};
X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
@@ -408,20 +410,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
{ X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
{ X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
- { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, TB_ALIGN_16 },
- { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, TB_ALIGN_16 },
- { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, TB_ALIGN_16 },
- { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 },
{ X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
{ X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
- { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
- { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
- { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
- { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
- { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
- { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
{ X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
@@ -492,14 +484,20 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
// AVX 128-bit versions of foldable instructions
{ X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
{ X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
- { X86::Int_VCVTDQ2PDrr, X86::Int_VCVTDQ2PDrm, TB_ALIGN_16 },
- { X86::Int_VCVTDQ2PSrr, X86::Int_VCVTDQ2PSrm, TB_ALIGN_16 },
- { X86::Int_VCVTPD2DQrr, X86::Int_VCVTPD2DQrm, TB_ALIGN_16 },
- { X86::Int_VCVTPD2PSrr, X86::Int_VCVTPD2PSrm, TB_ALIGN_16 },
- { X86::Int_VCVTPS2DQrr, X86::Int_VCVTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_VCVTPS2PDrr, X86::Int_VCVTPS2PDrm, 0 },
{ X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
{ X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
{ X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE },
{ X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
@@ -535,6 +533,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 },
{ X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+
// AVX 256-bit foldable instructions
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
@@ -543,6 +543,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
{ X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 },
{ X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 },
+
// AVX2 foldable instructions
{ X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 },
{ X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 },
@@ -558,6 +559,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 },
{ X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 },
{ X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -671,6 +674,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::IMUL64rr, X86::IMUL64rm, 0 },
{ X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
{ X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
{ X86::MAXPDrr_Int, X86::MAXPDrm_Int, TB_ALIGN_16 },
{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
@@ -808,17 +817,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
- { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm, 0 },
- { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::Int_VCVTTSD2SIrr, X86::Int_VCVTTSD2SIrm, 0 },
- { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm, 0 },
- { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::Int_VCVTTSS2SIrr, X86::Int_VCVTTSS2SIrm, 0 },
- { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
- { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, TB_ALIGN_16 },
{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
@@ -1122,6 +1121,158 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
// Index 2, folded load
Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}
+
+ static const X86OpTblEntry OpTbl3[] = {
+ // FMA foldable instructions
+ { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 },
+ { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 },
+ { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 },
+ { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 },
+ { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 },
+ { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, 0 },
+ { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, 0 },
+
+ { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 },
+ { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 },
+ { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 },
+ { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 },
+ { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr213r_Int, X86::VFMADDPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDPDr213r_Int, X86::VFMADDPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDPSr213rY_Int, X86::VFMADDPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDPDr213rY_Int, X86::VFMADDPDr213mY_Int, TB_ALIGN_32 },
+
+ { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 },
+ { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 },
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 },
+ { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 },
+ { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 },
+ { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, 0 },
+ { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, 0 },
+
+ { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr213r_Int, X86::VFNMADDPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMADDPDr213r_Int, X86::VFNMADDPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMADDPSr213rY_Int, X86::VFNMADDPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFNMADDPDr213rY_Int, X86::VFNMADDPDr213mY_Int, TB_ALIGN_32 },
+
+ { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 },
+ { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 },
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 },
+ { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 },
+ { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 },
+ { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, 0 },
+ { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, 0 },
+
+ { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr213r_Int, X86::VFMSUBPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBPDr213r_Int, X86::VFMSUBPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBPSr213rY_Int, X86::VFMSUBPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBPDr213rY_Int, X86::VFMSUBPDr213mY_Int, TB_ALIGN_32 },
+
+ { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 },
+ { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 },
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 },
+ { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 },
+ { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 },
+ { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, 0 },
+ { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, 0 },
+
+ { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr213r_Int, X86::VFNMSUBPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr213r_Int, X86::VFNMSUBPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr213rY_Int, X86::VFNMSUBPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr213rY_Int, X86::VFNMSUBPDr213mY_Int, TB_ALIGN_32 },
+
+ { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr213r_Int, X86::VFMADDSUBPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr213r_Int, X86::VFMADDSUBPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr213rY_Int, X86::VFMADDSUBPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr213rY_Int, X86::VFMADDSUBPDr213mY_Int, TB_ALIGN_32 },
+
+ { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr213r_Int, X86::VFMSUBADDPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr213r_Int, X86::VFMSUBADDPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr213rY_Int, X86::VFMSUBADDPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr213rY_Int, X86::VFMSUBADDPDr213mY_Int, TB_ALIGN_32 },
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
+ unsigned RegOp = OpTbl3[i].RegOp;
+ unsigned MemOp = OpTbl3[i].MemOp;
+ unsigned Flags = OpTbl3[i].Flags;
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 3, folded load
+ Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+ }
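+ // Every OpTbl3 entry folds a load into operand index 3; the packed FMA
+ // forms additionally require 16-byte (XMM) or 32-byte (YMM) alignment.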
+
}
void
@@ -1312,6 +1463,9 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
/// regIsPICBase - Return true if register is PIC base (i.e., defined by
/// X86::MOVPC32r).
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+ // Don't waste compile time scanning use-def chains of physregs.
+ if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+ return false;
bool isPICBase = false;
for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
E = MRI.def_end(); I != E; ++I) {
@@ -1369,16 +1523,7 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
return false;
const MachineFunction &MF = *MI->getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- bool isPICBase = false;
- for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
- E = MRI.def_end(); I != E; ++I) {
- MachineInstr *DefMI = I.getOperand().getParent();
- if (DefMI->getOpcode() != X86::MOVPC32r)
- return false;
- assert(!isPICBase && "More than one PIC base?");
- isPICBase = true;
- }
- return isPICBase;
+ return regIsPICBase(BaseReg, MRI);
}
return false;
}
@@ -1782,12 +1927,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+ (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src,
- MIOpc == X86::INC64r ? X86::GR64_NOSPRegisterClass :
- X86::GR32_NOSPRegisterClass))
+ !MF.getRegInfo().constrainRegClass(Src, RC))
return 0;
NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
@@ -1812,11 +1958,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+ (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src,
- MIOpc == X86::DEC64r ? X86::GR64_NOSPRegisterClass :
- X86::GR32_NOSPRegisterClass))
+ !MF.getRegInfo().constrainRegClass(Src, RC))
return 0;
NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
@@ -1844,10 +1991,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
const TargetRegisterClass *RC;
if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
Opc = X86::LEA64r;
- RC = X86::GR64_NOSPRegisterClass;
+ RC = &X86::GR64_NOSPRegClass;
} else {
Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- RC = X86::GR32_NOSPRegisterClass;
+ RC = &X86::GR32_NOSPRegClass;
}
@@ -1863,6 +2010,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
.addReg(Dest, RegState::Define |
getDeadRegState(isDead)),
Src, isKill, Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI->getOperand(1).isUndef();
+ bool isUndef2 = MI->getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
if (LV && isKill2)
LV->replaceKillInstruction(Src2, MI, NewMI);
break;
@@ -2079,7 +2233,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
}
}
-static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
switch (BrOpc) {
default: return X86::COND_INVALID;
case X86::JE_4: return X86::COND_E;
@@ -2101,6 +2255,84 @@ static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
}
}
+/// getCondFromSETOpc - Return the condition code of a SET opcode.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::SETAr: case X86::SETAm: return X86::COND_A;
+ case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+ case X86::SETBr: case X86::SETBm: return X86::COND_B;
+ case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+ case X86::SETEr: case X86::SETEm: return X86::COND_E;
+ case X86::SETGr: case X86::SETGm: return X86::COND_G;
+ case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+ case X86::SETLr: case X86::SETLm: return X86::COND_L;
+ case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+ case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+ case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+ case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+ case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+ case X86::SETOr: case X86::SETOm: return X86::COND_O;
+ case X86::SETPr: case X86::SETPm: return X86::COND_P;
+ case X86::SETSr: case X86::SETSm: return X86::COND_S;
+ }
+}
+
+/// getCondFromCMovOpc - Return the condition code of a CMOV opcode.
+static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
+ case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr:
+ return X86::COND_A;
+ case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+ return X86::COND_AE;
+ case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm:
+ case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr:
+ return X86::COND_B;
+ case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+ return X86::COND_BE;
+ case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm:
+ case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr:
+ return X86::COND_E;
+ case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm:
+ case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr:
+ return X86::COND_G;
+ case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+ case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+ return X86::COND_GE;
+ case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm:
+ case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr:
+ return X86::COND_L;
+ case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+ case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+ return X86::COND_LE;
+ case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+ return X86::COND_NE;
+ case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+ case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+ return X86::COND_NO;
+ case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+ return X86::COND_NP;
+ case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+ case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+ return X86::COND_NS;
+ case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm:
+ case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr:
+ return X86::COND_O;
+ case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm:
+ case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr:
+ return X86::COND_P;
+ case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm:
+ case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr:
+ return X86::COND_S;
+ }
+}
+
unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
switch (CC) {
default: llvm_unreachable("Illegal condition code!");
@@ -2147,6 +2379,101 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
}
}
+/// getSwappedCondition - Assuming the flags are set by MI(a,b), return
+/// the condition code that results if we modify the instructions such
+/// that the flags are instead set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: return X86::COND_INVALID;
+ case X86::COND_E: return X86::COND_E;
+ case X86::COND_NE: return X86::COND_NE;
+ case X86::COND_L: return X86::COND_G;
+ case X86::COND_LE: return X86::COND_GE;
+ case X86::COND_G: return X86::COND_L;
+ case X86::COND_GE: return X86::COND_LE;
+ case X86::COND_B: return X86::COND_A;
+ case X86::COND_BE: return X86::COND_AE;
+ case X86::COND_A: return X86::COND_B;
+ case X86::COND_AE: return X86::COND_BE;
+ }
+}
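+
+// Only the equality and ordered (signed/unsigned) conditions have a swapped
+// equivalent; sign, parity, and overflow (S/NS/P/NP/O/NO) do not, so they
+// fall through to COND_INVALID above.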
+
+/// getSETFromCond - Return a set opcode for the given condition and
+/// whether it has a memory operand.
+static unsigned getSETFromCond(X86::CondCode CC,
+ bool HasMemoryOperand) {
+ static const unsigned Opc[16][2] = {
+ { X86::SETAr, X86::SETAm },
+ { X86::SETAEr, X86::SETAEm },
+ { X86::SETBr, X86::SETBm },
+ { X86::SETBEr, X86::SETBEm },
+ { X86::SETEr, X86::SETEm },
+ { X86::SETGr, X86::SETGm },
+ { X86::SETGEr, X86::SETGEm },
+ { X86::SETLr, X86::SETLm },
+ { X86::SETLEr, X86::SETLEm },
+ { X86::SETNEr, X86::SETNEm },
+ { X86::SETNOr, X86::SETNOm },
+ { X86::SETNPr, X86::SETNPm },
+ { X86::SETNSr, X86::SETNSm },
+ { X86::SETOr, X86::SETOm },
+ { X86::SETPr, X86::SETPm },
+ { X86::SETSr, X86::SETSm }
+ };
+
+ assert(CC < 16 && "Can only handle standard cond codes");
+ return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
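+
+// Note: the rows of the opcode tables above and below are assumed to mirror
+// the X86::CondCode enumeration order (COND_A through COND_S); the asserts
+// only check that CC is in range.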
+
+/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
+static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand) {
+ static const unsigned Opc[32][3] = {
+ { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
+ { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+ { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
+ { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+ { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr },
+ { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr },
+ { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+ { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr },
+ { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+ { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+ { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+ { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+ { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+ { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr },
+ { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr },
+ { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr },
+ { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm },
+ { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+ { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm },
+ { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+ { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm },
+ { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm },
+ { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+ { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm },
+ { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+ { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+ { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+ { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+ { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+ { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm },
+ { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm },
+ { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm }
+ };
+
+ assert(CC < 16 && "Can only handle standard cond codes");
+ unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+ switch(RegBytes) {
+ default: llvm_unreachable("Illegal register size!");
+ case 2: return Opc[Idx][0];
+ case 4: return Opc[Idx][1];
+ case 8: return Opc[Idx][2];
+ }
+}
+
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
if (!MI->isTerminator()) return false;
@@ -2213,7 +2540,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// Handle conditional branches.
- X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
+ X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
if (BranchCode == X86::COND_INVALID)
return true; // Can't handle indirect branch.
@@ -2311,7 +2638,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
if (I->isDebugValue())
continue;
if (I->getOpcode() != X86::JMP_4 &&
- GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
break;
// Remove the branch.
I->eraseFromParent();
@@ -2371,6 +2698,56 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
return Count;
}
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+ // Not all subtargets have cmov instructions.
+ if (!TM.getSubtarget<X86Subtarget>().hasCMov())
+ return false;
+ if (Cond.size() != 1)
+ return false;
+ // We cannot do the composite conditions, at least not in SSA form.
+ if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+ return false;
+
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC)
+ return false;
+
+ // We have cmov instructions for 16-, 32-, and 64-bit general-purpose registers.
+ if (X86::GR16RegClass.hasSubClassEq(RC) ||
+ X86::GR32RegClass.hasSubClassEq(RC) ||
+ X86::GR64RegClass.hasSubClassEq(RC)) {
+ // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+ // Bridge. Probably Ivy Bridge as well.
+ CondCycles = 2;
+ TrueCycles = 2;
+ FalseCycles = 2;
+ return true;
+ }
+
+ // Can't do vectors.
+ return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(Cond.size() == 1 && "Invalid Cond array");
+ unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+ MRI.getRegClass(DstReg)->getSize(),
+ false/*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
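+
+// Note the operand order above: the rr-form CMOV keeps its first source when
+// the condition is false and copies the second when it is true, so FalseReg
+// is added first and TrueReg second.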
+
/// isHReg - Test if the given register is a physical h register.
static bool isHReg(unsigned Reg) {
return X86::GR8_ABCD_HRegClass.contains(Reg);
@@ -2637,6 +3014,464 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
NewMIs.push_back(MIB);
}
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(1).getImm();
+ return true;
+ // A SUB can be used to perform a comparison.
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = MI->getOperand(2).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(2).getImm();
+ return true;
+ case X86::CMP64rr:
+ case X86::CMP32rr:
+ case X86::CMP16rr:
+ case X86::CMP8rr:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = MI->getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ SrcReg = MI->getOperand(0).getReg();
+ if (MI->getOperand(1).getReg() != SrcReg) return false;
+ // Compare against zero.
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ }
+ return false;
+}
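+
+// For example, a CMP32ri comparing EAX against 4 yields SrcReg = EAX,
+// SrcReg2 = 0, CmpMask = ~0, CmpValue = 4. SUBs are analyzable because
+// they set EFLAGS exactly as the corresponding CMP would.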
+
+/// isRedundantFlagInstr - Check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcReg2: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ if (((FlagI->getOpcode() == X86::CMP64rr &&
+ OI->getOpcode() == X86::SUB64rr) ||
+ (FlagI->getOpcode() == X86::CMP32rr &&
+ OI->getOpcode() == X86::SUB32rr)||
+ (FlagI->getOpcode() == X86::CMP16rr &&
+ OI->getOpcode() == X86::SUB16rr)||
+ (FlagI->getOpcode() == X86::CMP8rr &&
+ OI->getOpcode() == X86::SUB8rr)) &&
+ ((OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) ||
+ (OI->getOperand(1).getReg() == SrcReg2 &&
+ OI->getOperand(2).getReg() == SrcReg)))
+ return true;
+
+ if (((FlagI->getOpcode() == X86::CMP64ri32 &&
+ OI->getOpcode() == X86::SUB64ri32) ||
+ (FlagI->getOpcode() == X86::CMP64ri8 &&
+ OI->getOpcode() == X86::SUB64ri8) ||
+ (FlagI->getOpcode() == X86::CMP32ri &&
+ OI->getOpcode() == X86::SUB32ri) ||
+ (FlagI->getOpcode() == X86::CMP32ri8 &&
+ OI->getOpcode() == X86::SUB32ri8) ||
+ (FlagI->getOpcode() == X86::CMP16ri &&
+ OI->getOpcode() == X86::SUB16ri) ||
+ (FlagI->getOpcode() == X86::CMP16ri8 &&
+ OI->getOpcode() == X86::SUB16ri8) ||
+ (FlagI->getOpcode() == X86::CMP8ri &&
+ OI->getOpcode() == X86::SUB8ri)) &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
+
+/// isDefConvertible - Check whether the definition can be converted
+/// to remove a comparison against zero.
+inline static bool isDefConvertible(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+ case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
+ case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
+ case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+ case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
+ case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
+ case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+ case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
+ case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
+ case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
+ case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+ case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
+ case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
+ case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
+ case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
+ case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
+ case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
+ case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ return true;
+ }
+}
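+
+// These SUB/ADD/AND/XOR/OR forms all set ZF and SF from their result, so a
+// following compare of that result against zero is redundant. Conditions
+// that read CF or OF are rejected later in optimizeCompareInstr, since those
+// flags are not set the way a compare-with-zero would set them.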
+
+/// optimizeCompareInstr - Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
+bool X86InstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+ int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ // Check whether we can replace SUB with CMP.
+ unsigned NewOpcode = 0;
+ switch (CmpInstr->getOpcode()) {
+ default: break;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr: {
+ if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+ return false;
+ // There is no use of the destination register; we can replace SUB with CMP.
+ switch (CmpInstr->getOpcode()) {
+ default: llvm_unreachable(0);
+ case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
+ case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
+ case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
+ case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
+ case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
+ case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
+ case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
+ case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
+ case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+ case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
+ case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
+ case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
+ case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
+ case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
+ case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
+ }
+ CmpInstr->setDesc(get(NewOpcode));
+ CmpInstr->RemoveOperand(0);
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false;
+ }
+ }
+
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+
+ // I points at CmpInstr; Def points at MI, the unique definition of SrcReg.
+ MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+ // If we are comparing against zero, check whether we can use MI to update
+ // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+ bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+ if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() ||
+ !isDefConvertible(MI)))
+ return false;
+
+ // We are searching for an earlier instruction that can make CmpInstr
+ // redundant and that instruction will be saved in Sub.
+ MachineInstr *Sub = NULL;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // We iterate backwards, starting from the instruction before CmpInstr, and
+ // stop when we reach the definition of a source register or are done with
+ // the BB. RI points to the instruction before CmpInstr.
+ // If the definition is in this basic block, RE points to the definition;
+ // otherwise, RE is the rend of the basic block.
+ MachineBasicBlock::reverse_iterator
+ RI = MachineBasicBlock::reverse_iterator(I),
+ RE = CmpInstr->getParent() == MI->getParent() ?
+ MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
+ CmpInstr->getParent()->rend();
+ MachineInstr *Movr0Inst = 0;
+ for (; RI != RE; ++RI) {
+ MachineInstr *Instr = &*RI;
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (!IsCmpZero &&
+ isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+ Sub = Instr;
+ break;
+ }
+
+ if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
+ Instr->readsRegister(X86::EFLAGS, TRI)) {
+ // This instruction modifies or uses EFLAGS.
+
+ // MOV32r0 etc. are implemented with xor, which clobbers the condition
+ // codes. They are safe to move up if the definition of EFLAGS is dead
+ // and no earlier instruction reads or writes EFLAGS.
+ if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 ||
+ Instr->getOpcode() == X86::MOV16r0 ||
+ Instr->getOpcode() == X86::MOV32r0 ||
+ Instr->getOpcode() == X86::MOV64r0) &&
+ Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = Instr;
+ continue;
+ }
+
+ // We can't remove CmpInstr.
+ return false;
+ }
+ }
+
+ // Return false if no candidates exist.
+ if (!IsCmpZero && !Sub)
+ return false;
+
+ bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
+
+ // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+ // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+ // If we are done with the basic block, we need to check whether EFLAGS is
+ // live-out.
+ bool IsSafe = false;
+ SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+ MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
+ for (++I; I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+ bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+ // If this instruction both uses and updates EFLAGS, we must examine the use.
+ if (!UseEFLAGS && ModifyEFLAGS) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again.
+ IsSafe = true;
+ break;
+ }
+ if (!UseEFLAGS && !ModifyEFLAGS)
+ continue;
+
+ // EFLAGS is used by this instruction.
+ X86::CondCode OldCC;
+ bool OpcIsSET = false;
+ if (IsCmpZero || IsSwapped) {
+ // We decode the condition code from the opcode.
+ if (Instr.isBranch())
+ OldCC = getCondFromBranchOpc(Instr.getOpcode());
+ else {
+ OldCC = getCondFromSETOpc(Instr.getOpcode());
+ if (OldCC != X86::COND_INVALID)
+ OpcIsSET = true;
+ else
+ OldCC = getCondFromCMovOpc(Instr.getOpcode());
+ }
+ if (OldCC == X86::COND_INVALID) return false;
+ }
+ if (IsCmpZero) {
+ switch (OldCC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ // CF and OF are used; we can't perform this optimization.
+ return false;
+ }
+ } else if (IsSwapped) {
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+ // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ // We swap the condition code and synthesize the new opcode.
+ X86::CondCode NewCC = getSwappedCondition(OldCC);
+ if (NewCC == X86::COND_INVALID) return false;
+
+ // Synthesize the new opcode.
+ bool HasMemoryOperand = Instr.hasOneMemOperand();
+ unsigned NewOpc;
+ if (Instr.isBranch())
+ NewOpc = GetCondBranchFromCond(NewCC);
+ else if(OpcIsSET)
+ NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+ else {
+ unsigned DstReg = Instr.getOperand(0).getReg();
+ NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+ HasMemoryOperand);
+ }
+
+ // Push the MachineInstr to OpsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // instructions will be modified.
+ OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+ }
+ if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+ IsSafe = true;
+ break;
+ }
+ }
+
+ // If EFLAGS is neither killed nor re-defined, we should check whether it
+ // is live-out. If it is live-out, do not optimize.
+ if ((IsCmpZero || IsSwapped) && !IsSafe) {
+ MachineBasicBlock *MBB = CmpInstr->getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+
+ // The instruction to be updated is either Sub or MI.
+ Sub = IsCmpZero ? MI : Sub;
+ // Move Movr0Inst to the place right before Sub.
+ if (Movr0Inst) {
+ Sub->getParent()->remove(Movr0Inst);
+ Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+ }
+
+ // Make sure the Sub instruction defines EFLAGS.
+ assert(Sub->getNumOperands() >= 2 &&
+ Sub->getOperand(Sub->getNumOperands()-1).isReg() &&
+ Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS &&
+ "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
+ Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true);
+ CmpInstr->eraseFromParent();
+
+ // Modify the condition code of instructions in OpsToUpdate.
+ for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
+ OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
+ return true;
+}
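+
+// Illustrative IsCmpZero case (pseudo-assembly):
+//   subl %esi, %edi    // defines %edi and sets EFLAGS
+//   testl %edi, %edi   // CmpInstr: compares the same value against zero
+//   je target
+// The test is erased and the branch reuses the EFLAGS set by the sub.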
+
+/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// operand at the use. We fold the load instruction if the load defines a
+/// virtual register, the virtual register is used once in the same BB, and
+/// the instructions in between do not load or store and have no side
+/// effects.
+MachineInstr* X86InstrInfo::
+optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ if (FoldAsLoadDefReg == 0)
+ return 0;
+ // To be conservative, if there exists another load, clear the load candidate.
+ if (MI->mayLoad()) {
+ FoldAsLoadDefReg = 0;
+ return 0;
+ }
+
+ // Check whether we can move DefMI here.
+ DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+ assert(DefMI);
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(this, 0, SawStore))
+ return 0;
+
+ // We try to commute MI if possible.
+ unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
+ for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
+ // Collect information about virtual register operands of MI.
+ unsigned SrcOperandId = 0;
+ bool FoundSrcOperand = false;
+ for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg != FoldAsLoadDefReg)
+ continue;
+ // Do not fold if we have a subreg use or a def or multiple uses.
+ if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+ return 0;
+
+ SrcOperandId = i;
+ FoundSrcOperand = true;
+ }
+ if (!FoundSrcOperand) return 0;
+
+ // Check whether we can fold the def into SrcOperandId.
+ SmallVector<unsigned, 8> Ops;
+ Ops.push_back(SrcOperandId);
+ MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+ if (FoldMI) {
+ FoldAsLoadDefReg = 0;
+ return FoldMI;
+ }
+
+ if (Idx == 1) {
+ // MI was changed but it didn't help; commute it back!
+ commuteInstruction(MI, false);
+ return 0;
+ }
+
+ // Check whether we can commute MI and enable folding.
+ if (MI->isCommutable()) {
+ MachineInstr *NewMI = commuteInstruction(MI, false);
+ // Unable to commute.
+ if (!NewMI) return 0;
+ if (NewMI != MI) {
+ // New instruction. It doesn't need to be kept.
+ NewMI->eraseFromParent();
+ return 0;
+ }
+ }
+ }
+ return 0;
+}
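+
+// The commute loop above makes at most two folding attempts: one on MI as
+// written and, if MI is commutable, one more after commuting it in place;
+// if the second attempt also fails, MI is commuted back and left unchanged.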
+
/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
/// instruction with two undef reads of the register being defined. This is
/// used for mapping:
@@ -2795,6 +3630,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
OpcodeTablePtr = &RegOp2MemOpTable1;
} else if (i == 2) {
OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (i == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
}
// If table selected...
@@ -2809,7 +3646,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return NULL;
bool NarrowToMOV32rm = false;
if (Size) {
- unsigned RCSize = getRegClass(MI->getDesc(), i, &RI)->getSize();
+ unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
if (Size < RCSize) {
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
@@ -3202,7 +4039,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
UnfoldStore &= FoldedStore;
const MCInstrDesc &MCID = get(Opc);
- const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
if (!MI->hasOneMemOperand() &&
RC == &X86::VR128RegClass &&
!TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
@@ -3297,7 +4134,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
// Emit the store instruction.
if (UnfoldStore) {
- const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI);
+ const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
std::pair<MachineInstr::mmo_iterator,
MachineInstr::mmo_iterator> MMOs =
MF.extractStoreMemRefs(MI->memoperands_begin(),
@@ -3323,7 +4160,8 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
bool FoldedStore = I->second.second & TB_FOLDED_STORE;
const MCInstrDesc &MCID = get(Opc);
- const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
unsigned NumDefs = MCID.NumDefs;
std::vector<SDValue> AddrOps;
std::vector<SDValue> BeforeOps;
@@ -3344,7 +4182,6 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
// Emit the load instruction.
SDNode *Load = 0;
- MachineFunction &MF = DAG.getMachineFunction();
if (FoldedLoad) {
EVT VT = *RC->vt_begin();
std::pair<MachineInstr::mmo_iterator,
@@ -3371,7 +4208,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
std::vector<EVT> VTs;
const TargetRegisterClass *DstRC = 0;
if (MCID.getNumDefs() > 0) {
- DstRC = getRegClass(MCID, 0, &RI);
+ DstRC = getRegClass(MCID, 0, &RI, MF);
VTs.push_back(*DstRC->vt_begin());
}
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
@@ -3625,7 +4462,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
// Create the register. The code to initialize it is inserted
// later, by the CGBR pass (below).
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+ GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
X86FI->setGlobalBaseReg(GlobalBaseReg);
return GlobalBaseReg;
}
@@ -3835,7 +4672,7 @@ namespace {
unsigned PC;
if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
- PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+ PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
else
PC = GlobalBaseReg;
@@ -3869,3 +4706,117 @@ namespace {
char CGBR::ID = 0;
FunctionPass*
llvm::createGlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+ struct LDTLSCleanup : public MachineFunctionPass {
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+ // No point folding accesses if there aren't at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0);
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+ // If TLSBaseAddrReg is non-zero, then use that register to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions.
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ if (TLSBaseAddrReg)
+ I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ else
+ I = SetRegister(I, &TLSBaseAddrReg);
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+ I != E; ++I) {
+ Changed |= VisitNode(*I, TLSBaseAddrReg);
+ }
+
+ return Changed;
+ }
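+
+ // Note that TLSBaseAddrReg is passed by value, so a register created in
+ // one block is reused only in that block and in the blocks it dominates.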
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF->getTarget());
+ const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+ const X86InstrInfo *TII = TM->getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ is64Bit ? X86::RAX : X86::EAX)
+ .addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I->eraseFromParent();
+
+ return Copy;
+ }
+
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF->getTarget());
+ const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+ const X86InstrInfo *TII = TM->getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+ ? &X86::GR64RegClass
+ : &X86::GR32RegClass);
+
+ // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+ MachineInstr *Next = I->getNextNode();
+ MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ *TLSBaseAddrReg)
+ .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+ return Copy;
+ }
+
+ virtual const char *getPassName() const {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index b23d756..b6f69af 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -128,8 +128,8 @@ class X86InstrInfo : public X86GenInstrInfo {
X86TargetMachine &TM;
const X86RegisterInfo RI;
- /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
- /// RegOp2MemOpTable2 - Load / store folding opcode maps.
+ /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
///
typedef DenseMap<unsigned,
std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
@@ -137,6 +137,7 @@ class X86InstrInfo : public X86GenInstrInfo {
RegOp2MemOpTableType RegOp2MemOpTable0;
RegOp2MemOpTableType RegOp2MemOpTable1;
RegOp2MemOpTableType RegOp2MemOpTable2;
+ RegOp2MemOpTableType RegOp2MemOpTable3;
/// MemOp2RegOpTable - Load / store unfolding opcode map.
///
@@ -144,9 +145,9 @@ class X86InstrInfo : public X86GenInstrInfo {
std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
MemOp2RegOpTableType MemOp2RegOpTable;
- void AddTableEntry(RegOp2MemOpTableType &R2MTable,
- MemOp2RegOpTableType &M2RTable,
- unsigned RegOp, unsigned MemOp, unsigned Flags);
+ static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags);
public:
explicit X86InstrInfo(X86TargetMachine &tm);
@@ -218,6 +219,14 @@ public:
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond,
DebugLoc DL) const;
+ virtual bool canInsertSelect(const MachineBasicBlock&,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned, unsigned, int&, int&, int&) const;
+ virtual void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const;
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
@@ -363,6 +372,33 @@ public:
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI, unsigned UseIdx) const;
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if it has two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const;
+
+ /// optimizeCompareInstr - Check if there exists an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way as
+ /// Compare; remove Compare if possible.
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const;
+
+ /// optimizeLoadInstr - Try to remove the load by folding it into a register
+ /// operand at its use. The load is folded only if the def and the use are in
+ /// the same BB. We look at a single load and check whether it can be folded
+ /// into MI. FoldAsLoadDefReg is the virtual register defined by the load we
+ /// are trying to fold. DefMI returns the machine instruction that defines
+ /// FoldAsLoadDefReg, and the function returns the machine instruction
+ /// generated by the fold.
+ virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const;
+
private:
MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
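// Two of the hooks declared above encode peephole reasoning worth seeing
// at the source level. A hedged sketch, not from the patch; the assembly
// in the comments is what this lowering aims for, not guaranteed output.

// optimizeCompareInstr: SUB updates EFLAGS exactly as CMP on the same
// operands would, so an explicit compare whose flags are already live is
// removable:
//     subl %esi, %edi        # flags = a - b
//     cmpl %esi, %edi        # redundant; deleted by the peephole
int keep_flags(int a, int b) {
  int d = a - b;             // SUB defines EFLAGS
  return (a != b) ? d : 0;   // CMP on the same operands is redundant
}

// optimizeLoadInstr: a single-use load in the same BB folds into its
// user, e.g. mov (%rdi),%eax + cmp %eax,%esi -> cmp (%rdi),%esi.
bool fold_load(const int *p, int b) {
  int a = *p;                // one use, same BB as the compare
  return a == b;             // the compare can absorb the load
}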
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 6a25312..d293156 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -63,6 +63,10 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2,
[SDTCisInt<0>,
SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -95,6 +99,8 @@ def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
@@ -131,6 +137,11 @@ def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
+
+def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
@@ -199,6 +210,9 @@ def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
[SDNPHasChain]>;
@@ -278,6 +292,20 @@ def X86Mem256AsmOperand : AsmOperandClass {
let Name = "Mem256"; let PredicateMethod = "isMem256";
}
+// Gather mem operands
+def X86MemVX32Operand : AsmOperandClass {
+ let Name = "MemVX32"; let PredicateMethod = "isMemVX32";
+}
+def X86MemVY32Operand : AsmOperandClass {
+ let Name = "MemVY32"; let PredicateMethod = "isMemVY32";
+}
+def X86MemVX64Operand : AsmOperandClass {
+ let Name = "MemVX64"; let PredicateMethod = "isMemVX64";
+}
+def X86MemVY64Operand : AsmOperandClass {
+ let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
+}
+
def X86AbsMemAsmOperand : AsmOperandClass {
let Name = "AbsMem";
let SuperClasses = [X86MemAsmOperand];
@@ -316,6 +344,20 @@ def f128mem : X86MemOperand<"printf128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
def f256mem : X86MemOperand<"printf256mem">{
let ParserMatchClass = X86Mem256AsmOperand; }
+
+// Gather mem operands
+def vx32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+ let ParserMatchClass = X86MemVX32Operand; }
+def vy32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY32Operand; }
+def vx64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+ let ParserMatchClass = X86MemVX64Operand; }
+def vy64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY64Operand; }
}
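// A hedged illustration, not from the patch, of why these operands carry
// a vector register where ordinary memory operands carry a GPR index:
// AVX2 gathers address memory as base + scale * vindex with a whole
// XMM/YMM register of indices. The intrinsic matching vy32mem's
// VPGATHERDD form (assuming an AVX2 target):

#include <immintrin.h>

__m256i gather8(const int *base, __m256i idx) {
  // loads base[idx[i]] for each of the eight i32 lanes; scale = 4 bytes
  return _mm256_i32gather_epi32(base, idx, /*scale=*/4);
}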
// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
@@ -328,7 +370,7 @@ def i8mem_NOREX : Operand<i64> {
}
// GPRs available for tailcall.
-// It represents GR64_TC or GR64_TCW64.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
def ptr_rc_tailcall : PointerLikeRegClass<2>;
// Special i32mem for addresses of load folding tail calls. These are not
@@ -336,7 +378,8 @@ def ptr_rc_tailcall : PointerLikeRegClass<2>;
// after callee-saved registers are popped.
def i32mem_TC : Operand<i32> {
let PrintMethod = "printi32mem";
- let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+ i32imm, i8imm);
let ParserMatchClass = X86Mem32AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -487,6 +530,9 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
[tglobaltlsaddr], []>;
+def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex,
X86WrapperRIP], []>;
@@ -494,6 +540,9 @@ def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
[tglobaltlsaddr], []>;
+def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
def HasCMov : Predicate<"Subtarget->hasCMov()">;
@@ -514,8 +563,8 @@ def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
-def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">;
-def HasFMA3 : Predicate<"Subtarget->hasFMA3()">;
+def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA : Predicate<"Subtarget->hasFMA()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
@@ -680,25 +729,27 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
// Nop
let neverHasSideEffects = 1 in {
- def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
- "nop{w}\t$zero", []>, TB, OpSize;
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize;
def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
- "nop{l}\t$zero", []>, TB;
+ "nop{l}\t$zero", [], IIC_NOP>, TB;
}
// Constructing a stack frame.
def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
- "enter\t$len, $lvl", []>;
+ "enter\t$len, $lvl", [], IIC_ENTER>;
let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
def LEAVE : I<0xC9, RawFrm,
- (outs), (ins), "leave", []>, Requires<[In32BitMode]>;
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In32BitMode]>;
let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
def LEAVE64 : I<0xC9, RawFrm,
- (outs), (ins), "leave", []>, Requires<[In64BitMode]>;
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
@@ -706,41 +757,49 @@ def LEAVE64 : I<0xC9, RawFrm,
let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
let mayLoad = 1 in {
-def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
- OpSize;
-def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
-def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
- OpSize;
-def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>,
- OpSize;
-def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
-def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>;
-
-def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize;
-def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>,
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG16>, OpSize;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG>, OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+ IIC_POP_MEM>, OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+ IIC_POP_MEM>;
+
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
Requires<[In32BitMode]>;
}
let mayStore = 1 in {
-def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
- OpSize;
-def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
-def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
- OpSize;
-def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>,
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+ IIC_PUSH_MEM>,
OpSize;
-def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
-def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+ IIC_PUSH_MEM>;
def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
- "push{l}\t$imm", []>;
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{w}\t$imm", []>, OpSize;
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize;
def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
- "push{l}\t$imm", []>;
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
-def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize;
-def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>,
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+ OpSize;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
Requires<[In32BitMode]>;
}
@@ -749,44 +808,48 @@ def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>,
let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
let mayLoad = 1 in {
def POP64r : I<0x58, AddRegFrm,
- (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
-def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
-def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>;
+ (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+ IIC_POP_REG>;
+def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [],
+ IIC_POP_MEM>;
}
let mayStore = 1 in {
def PUSH64r : I<0x50, AddRegFrm,
- (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
-def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
-def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>;
+ (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+ IIC_PUSH_REG>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+ IIC_PUSH_MEM>;
}
}
let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
- "push{q}\t$imm", []>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{q}\t$imm", []>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
- "push{q}\t$imm", []>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
}
let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
-def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>,
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
Requires<[In64BitMode]>;
let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
-def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>,
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
Requires<[In64BitMode]>;
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad=1, neverHasSideEffects=1 in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", []>,
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>,
Requires<[In32BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore=1, neverHasSideEffects=1 in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", []>,
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>,
Requires<[In32BitMode]>;
}
@@ -794,84 +857,92 @@ let Constraints = "$src = $dst" in { // GR32 = bswap GR32
def BSWAP32r : I<0xC8, AddRegFrm,
(outs GR32:$dst), (ins GR32:$src),
"bswap{l}\t$dst",
- [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+ [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB;
def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"bswap{q}\t$dst",
- [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+ [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
} // Constraints = "$src = $dst"
// Bit scan instructions.
let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB, OpSize;
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+ IIC_BSF>, TB, OpSize;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB,
- OpSize;
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+ IIC_BSF>, TB, OpSize;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+ IIC_BSF>, TB;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+ IIC_BSF>, TB;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+ IIC_BSF>, TB;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB, OpSize;
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+ TB, OpSize;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB,
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+ IIC_BSR>, TB,
OpSize;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+ IIC_BSR>, TB;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+ IIC_BSR>, TB;
} // Defs = [EFLAGS]
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
-def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", []>;
-def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", []>, OpSize;
-def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", []>;
-def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>;
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>;
+def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>;
}
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
-def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", []>;
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>;
let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
-def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", []>, OpSize;
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize;
let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
-def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", []>;
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>;
let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
-def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>;
+def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>;
-def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", []>;
-def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", []>, OpSize;
-def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", []>;
-def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>;
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>;
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>;
-def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", []>;
-def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", []>, OpSize;
-def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", []>;
-def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>;
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>;
//===----------------------------------------------------------------------===//
@@ -880,64 +951,64 @@ def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
let neverHasSideEffects = 1 in {
def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", []>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, imm:$src)]>;
+ [(set GR8:$dst, imm:$src)], IIC_MOV>;
def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, imm:$src)]>, OpSize;
+ [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, imm:$src)]>;
+ [(set GR32:$dst, imm:$src)], IIC_MOV>;
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, imm:$src)]>;
+ [(set GR64:$dst, imm:$src)], IIC_MOV>;
def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, i64immSExt32:$src)]>;
+ [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
}
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 imm:$src), addr:$dst)]>;
+ [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+ [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm:$src), addr:$dst)]>;
+ [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>;
def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32:$src, addr:$dst)]>;
+ [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
/// 32-bit absolute address. These are only valid in x86-32 mode.
def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{b}\t{$src, %al|AL, $src}", []>,
+ "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
- "mov{w}\t{$src, %ax|AL, $src}", []>, OpSize,
+ "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
- "mov{l}\t{$src, %eax|EAX, $src}", []>,
+ "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{b}\t{%al, $dst|$dst, AL}", []>,
+ "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
- "mov{w}\t{%ax, $dst|$dst, AL}", []>, OpSize,
+ "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
- "mov{l}\t{%eax, $dst|$dst, EAX}", []>,
+ "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
// FIXME: These definitions are utterly broken
@@ -958,42 +1029,42 @@ def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
let isCodeGenOnly = 1 in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
- "mov{b}\t{$src, $dst|$dst, $src}", []>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, (loadi8 addr:$src))]>;
+ [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize;
+ [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize;
def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (loadi32 addr:$src))]>;
+ [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>;
def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (load addr:$src))]>;
+ [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
}
def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store GR8:$src, addr:$dst)]>;
+ [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store GR16:$src, addr:$dst)]>, OpSize;
+ [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize;
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store GR32:$src, addr:$dst)]>;
+ [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>;
def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store GR64:$src, addr:$dst)]>;
+ [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
// that they can be used for copying and storing h registers, which can't be
@@ -1002,24 +1073,28 @@ let isCodeGenOnly = 1 in {
let neverHasSideEffects = 1 in
def MOV8rr_NOREX : I<0x88, MRMDestReg,
(outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>;
let mayStore = 1 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>;
let mayLoad = 1, neverHasSideEffects = 1,
canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>;
}
// Condition code ops, incl. set if equal/not equal/...
-let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in
-def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
+ [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
-def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
+ IIC_AHF>; // AH = flags
//===----------------------------------------------------------------------===//
@@ -1028,13 +1103,14 @@ def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags
let Defs = [EFLAGS] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, OpSize, TB;
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ OpSize, TB;
def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB;
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
// Unlike with the register+register form, the memory+register form of the
// bt instruction does not ignore the high bits of the index. From ISel's
@@ -1045,31 +1121,33 @@ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi16 addr:$src1), GR16:$src2),
// (implicit EFLAGS)]
- []
+ [], IIC_BT_MR
>, OpSize, TB, Requires<[FastBTMem]>;
def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi32 addr:$src1), GR32:$src2),
// (implicit EFLAGS)]
- []
+ [], IIC_BT_MR
>, TB, Requires<[FastBTMem]>;
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi64 addr:$src1), GR64:$src2),
// (implicit EFLAGS)]
- []
+ [], IIC_BT_MR
>, TB;
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
- OpSize, TB;
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+ IIC_BT_RI>, OpSize, TB;
def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+ IIC_BT_RI>, TB;
def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+ IIC_BT_RI>, TB;
// Note that these instructions don't need FastBTMem because that
// only applies when the other operand is in a register. When it's
@@ -1077,91 +1155,103 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
- ]>, OpSize, TB;
+ ], IIC_BT_MI>, OpSize, TB;
def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
- ]>, TB;
+ ], IIC_BT_MI>, TB;
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
- i64immSExt8:$src2))]>, TB;
+ i64immSExt8:$src2))], IIC_BT_MI>, TB;
def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
} // Defs = [EFLAGS]
@@ -1175,89 +1265,106 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
let Constraints = "$val = $dst" in {
def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
"xchg{b}\t{$val, $ptr|$ptr, $val}",
- [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>;
+ [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))],
+ IIC_XCHG_MEM>;
def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr),
"xchg{w}\t{$val, $ptr|$ptr, $val}",
- [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>,
+ [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))],
+ IIC_XCHG_MEM>,
OpSize;
def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr),
"xchg{l}\t{$val, $ptr|$ptr, $val}",
- [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>;
+ [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))],
+ IIC_XCHG_MEM>;
def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr),
"xchg{q}\t{$val, $ptr|$ptr, $val}",
- [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>;
+ [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))],
+ IIC_XCHG_MEM>;
def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
- "xchg{b}\t{$val, $src|$src, $val}", []>;
+ "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
- "xchg{w}\t{$val, $src|$src, $val}", []>, OpSize;
+ "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize;
def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
- "xchg{l}\t{$val, $src|$src, $val}", []>;
+ "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
- "xchg{q}\t{$val, $src|$src, $val}", []>;
+ "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
}
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|AX, $src}", []>, OpSize;
+ "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In32BitMode]>;
+ "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ Requires<[In32BitMode]>;
// Uses GR32_NOAX in 64-bit mode to prevent it from being encoded as the 0x90 NOP.
// xchg %eax, %eax clears the upper 32 bits of RAX, so it is not a NOP.
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In64BitMode]>;
+ "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ Requires<[In64BitMode]>;
def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
- "xchg{q}\t{$src, %rax|RAX, $src}", []>;
+ "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>;
def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize;
def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
let mayLoad = 1, mayStore = 1 in {
def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize;
def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
}
def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG8>, TB;
def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB, OpSize;
def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
let mayLoad = 1, mayStore = 1 in {
def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM8>, TB;
def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB, OpSize;
def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
- "cmpxchg8b\t$dst", []>, TB;
+ "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
- "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>;
+ "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+ TB, Requires<[HasCmpxchg16b]>;
@@ -1281,69 +1388,75 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
// String manipulation instructions
-def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>;
-def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", []>, OpSize;
-def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", []>;
-def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>;
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;
-def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", []>;
-def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", []>, OpSize;
-def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", []>;
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;
// Flag instructions
-def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
-def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
-def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
-def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
-def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
-def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
-def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
-def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
// Table lookup instructions
-def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>;
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>;
// ASCII Adjust After Addition
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>, Requires<[In32BitMode]>;
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+ Requires<[In32BitMode]>;
// ASCII Adjust AX Before Division
// sets AL, AH and EFLAGS and uses AL and AH
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
- "aad\t$src", []>, Requires<[In32BitMode]>;
+ "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>;
// ASCII Adjust AX After Multiply
// sets AL, AH and EFLAGS and uses AL
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
- "aam\t$src", []>, Requires<[In32BitMode]>;
+ "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>;
// ASCII Adjust AL After Subtraction
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>, Requires<[In32BitMode]>;
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+ Requires<[In32BitMode]>;
// Decimal Adjust AL after Addition
// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
-def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>, Requires<[In32BitMode]>;
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+ Requires<[In32BitMode]>;
// Decimal Adjust AL after Subtraction
// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
-def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>, Requires<[In32BitMode]>;
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+ Requires<[In32BitMode]>;
// Check Array Index Against Bounds
def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "bound\t{$src, $dst|$dst, $src}", []>, OpSize,
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize,
Requires<[In32BitMode]>;
def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "bound\t{$src, $dst|$dst, $src}", []>,
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>,
Requires<[In32BitMode]>;
// Adjust RPL Field of Segment Selector
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst),
- "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+ Requires<[In32BitMode]>;
def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst),
- "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+ Requires<[In32BitMode]>;
//===----------------------------------------------------------------------===//
// MOVBE Instructions
@@ -1351,22 +1464,28 @@ def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst),
let Predicates = [HasMOVBE] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, OpSize, T8;
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+ OpSize, T8;
def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, T8;
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+ T8;
def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, T8;
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+ T8;
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR16:$src), addr:$dst)]>, OpSize, T8;
+ [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+ OpSize, T8;
def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR32:$src), addr:$dst)]>, T8;
+ [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+ T8;
def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR64:$src), addr:$dst)]>, T8;
+ [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+ T8;
}
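// A hedged sketch, not from the patch, of the source pattern these defs
// target (__builtin_bswap32/__builtin_memcpy are GCC/Clang builtins, and
// the single-instruction result assumes a MOVBE-enabled target): a
// byte-swapped load matches (bswap (loadi32 addr)) and becomes one
// movbe instead of a mov plus a bswap.

#include <cstdint>

uint32_t load_be32(const uint8_t *p) {
  uint32_t v;
  __builtin_memcpy(&v, p, sizeof(v));  // unaligned-safe 32-bit load
  return __builtin_bswap32(v);         // folds into MOVBE32rm
}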
//===----------------------------------------------------------------------===//
@@ -1374,11 +1493,14 @@ let Predicates = [HasMOVBE] in {
//
let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
- "rdrand{w}\t$dst", []>, OpSize, TB;
+ "rdrand{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
- "rdrand{l}\t$dst", []>, TB;
+ "rdrand{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
- "rdrand{q}\t$dst", []>, TB;
+ "rdrand{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
}
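// A hedged usage sketch, not from the patch (assumes <immintrin.h> and
// an -mrdrnd target): RDRAND reports success in CF, which is why these
// patterns now produce EFLAGS as a second result; the intrinsic returns
// that carry flag directly.

#include <immintrin.h>

bool rand32(unsigned int *out) {
  for (int tries = 0; tries < 10; ++tries)
    if (_rdrand32_step(out))  // returns 1 iff CF was set
      return true;
  return false;               // hardware reported no entropy
}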
//===----------------------------------------------------------------------===//
@@ -1774,9 +1896,9 @@ def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
// We accept "fnstsw %eax" even though it only writes %ax.
-def : InstAlias<"fnstsw %eax", (FNSTSW8r)>;
-def : InstAlias<"fnstsw %al" , (FNSTSW8r)>;
-def : InstAlias<"fnstsw" , (FNSTSW8r)>;
+def : InstAlias<"fnstsw %eax", (FNSTSW16r)>;
+def : InstAlias<"fnstsw %al" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw" , (FNSTSW16r)>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 63f96b6..c8f40bb 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -20,71 +20,130 @@
// MMX Multiclasses
//===----------------------------------------------------------------------===//
+def MMX_INTALU_ITINS : OpndItins<
+ IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+def MMX_INTALUQ_ITINS : OpndItins<
+ IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
+>;
+
+def MMX_PHADDSUBW : OpndItins<
+ IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
+>;
+
+def MMX_PHADDSUBD : OpndItins<
+ IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
+>;
+
+def MMX_PMUL_ITINS : OpndItins<
+ IIC_MMX_PMUL, IIC_MMX_PMUL
+>;
+
+def MMX_PSADBW_ITINS : OpndItins<
+ IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+def MMX_MISC_FUNC_ITINS : OpndItins<
+ IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
+>;
+
+def MMX_SHIFT_ITINS : ShiftOpndItins<
+ IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
+>;
+
+def MMX_UNPCK_H_ITINS : OpndItins<
+ IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
+>;
+
+def MMX_UNPCK_L_ITINS : OpndItins<
+ IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
+>;
+
+def MMX_PCK_ITINS : OpndItins<
+ IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
+>;
+
+def MMX_PSHUF_ITINS : OpndItins<
+ IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+
+def MMX_CVT_PD_ITINS : OpndItins<
+ IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
+>;
+
+def MMX_CVT_PS_ITINS : OpndItins<
+ IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
+>;
+
let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
// When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
- bit Commutable = 0> {
+ OpndItins itins, bit Commutable = 0> {
def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr> {
let isCommutable = Commutable;
}
def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>;
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>;
}
multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, Intrinsic IntId,
- Intrinsic IntId2> {
+ Intrinsic IntId2, ShiftOpndItins itins> {
def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>;
def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>;
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>;
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>;
+ [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>;
}
}
/// Unary MMX instructions requiring SSSE3.
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64> {
+ Intrinsic IntId64, OpndItins itins> {
def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst, (IntId64 VR64:$src))]>;
+ [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>;
def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst,
- (IntId64 (bitconvert (memopmmx addr:$src))))]>;
+ (IntId64 (bitconvert (memopmmx addr:$src))))],
+ itins.rm>;
}
/// Binary MMX instructions requiring SSSE3.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64> {
+ Intrinsic IntId64, OpndItins itins> {
let isCommutable = 0 in
def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>;
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>;
def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
(IntId64 VR64:$src1,
- (bitconvert (memopmmx addr:$src2))))]>;
+ (bitconvert (memopmmx addr:$src2))))], itins.rm>;
}
}
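
The MMX_*_ITINS defs introduced at the top of this hunk instantiate two small helper classes that bundle one InstrItinClass per operand form; the multiclasses below then pick itins.rr, itins.rm, or itins.ri for each variant they create. A minimal sketch of those helpers, assuming the shape they have in X86InstrSSE.td at this revision:

// Bundles one itinerary per operand form; multiclasses select the
// matching field when instantiating each register/memory/immediate def.
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
}

class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
                     InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}
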
@@ -103,13 +162,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, Domain d> {
+ string asm, OpndItins itins, Domain d> {
def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
[(set DstRC:$dst, (Int SrcRC:$src))],
- IIC_DEFAULT, d>;
+ itins.rr, d>;
def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
[(set DstRC:$dst, (Int (ld_frag addr:$src)))],
- IIC_DEFAULT, d>;
+ itins.rm, d>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -139,22 +198,24 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector GR32:$src)))]>;
+ (x86mmx (scalar_to_vector GR32:$src)))],
+ IIC_MMX_MOV_MM_RM>;
let canFoldAsLoad = 1 in
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst,
- (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>;
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_MMX_MOV_MM_RM>;
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", []>;
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>;
def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", []>;
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_REG_MM>;
let neverHasSideEffects = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
"movd\t{$src, $dst|$dst, $src}",
- []>;
+ [], IIC_MMX_MOV_MM_RM>;
// These are 64 bit moves, but since the OS X assembler doesn't
// recognize a register-register movq, we write them as
@@ -163,197 +224,276 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
(outs GR64:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR64:$dst,
- (bitconvert VR64:$src))]>;
+ (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (bitconvert GR64:$src))]>;
+ (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>;
let neverHasSideEffects = 1 in
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
- "movq\t{$src, $dst|$dst, $src}", []>;
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
let canFoldAsLoad = 1 in
def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (load_mmx addr:$src))]>;
+ [(set VR64:$dst, (load_mmx addr:$src))],
+ IIC_MMX_MOVQ_RM>;
def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (x86mmx VR64:$src), addr:$dst)]>;
+ [(store (x86mmx VR64:$src), addr:$dst)],
+ IIC_MMX_MOVQ_RM>;
def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (bitconvert
(i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))))))]>;
+ (iPTR 0))))))],
+ IIC_MMX_MOVQ_RR>;
-def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+def MMX_MOVQ2DQrr : S2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))]>;
+ (i64 (bitconvert (x86mmx VR64:$src))))))],
+ IIC_MMX_MOVQ_RR>;
let neverHasSideEffects = 1 in
-def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
- (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVQ2FR64rr: S2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
- (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", []>;
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
- [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+ IIC_MMX_MOVQ_RM>;
let AddedComplexity = 15 in
// movd to MMX register zero-extends
def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))]>;
+ (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
+ IIC_MMX_MOV_MM_RM>;
let AddedComplexity = 20 in
def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
(ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (X86vzmovl (x86mmx
- (scalar_to_vector (loadi32 addr:$src))))))]>;
+ (scalar_to_vector (loadi32 addr:$src))))))],
+ IIC_MMX_MOV_MM_RM>;
// Arithmetic Instructions
-defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b>;
-defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w>;
-defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d>;
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+ MMX_INTALU_ITINS>;
// -- Addition
-defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, 1>;
-defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, 1>;
-defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, 1>;
-defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, 1>;
-defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
-defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
-
-defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
-defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
-
-defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w>;
-defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d>;
-defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw>;
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+ MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+ MMX_PHADDSUBW>;
// -- Subtraction
-defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b>;
-defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w>;
-defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d>;
-defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q>;
-
-defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
-defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
-
-defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
-defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
-
-defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w>;
-defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d>;
-defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw>;
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+ MMX_INTALUQ_ITINS, 1>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+ MMX_PHADDSUBW>;
// -- Multiplication
-defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, 1>;
-
-defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, 1>;
-defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
-defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
+ MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+ MMX_PMUL_ITINS, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+ MMX_PMUL_ITINS, 1>;
let isCommutable = 1 in
defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
- int_x86_ssse3_pmul_hr_sw>;
+ int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>;
// -- Miscellanea
-defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+ MMX_PMUL_ITINS, 1>;
defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
- int_x86_ssse3_pmadd_ub_sw>;
-defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
-defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
-
-defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
-defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
-
-defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
-defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
-
-defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
-
-defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b>;
-defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w>;
-defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d>;
+ int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+ MMX_PSADBW_ITINS, 1>;
+
+defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+ MMX_MISC_FUNC_ITINS>;
let Constraints = "$src1 = $dst" in
defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
// Logical Instructions
-defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>;
-defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>;
-defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>;
-defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>;
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+ MMX_INTALU_ITINS>;
// Shift Instructions
defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
- int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
- int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>;
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
- int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>;
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+ MMX_SHIFT_ITINS>;
defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
- int_x86_mmx_psll_w, int_x86_mmx_pslli_w>;
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+ MMX_SHIFT_ITINS>;
defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
- int_x86_mmx_psll_d, int_x86_mmx_pslli_d>;
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+ MMX_SHIFT_ITINS>;
defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
- int_x86_mmx_psll_q, int_x86_mmx_pslli_q>;
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
- int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
- int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+ MMX_SHIFT_ITINS>;
// Comparison Instructions
-defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
-defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
-defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
-
-defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
-defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
-defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+ MMX_INTALU_ITINS>;
// -- Unpack Instructions
defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
- int_x86_mmx_punpckhbw>;
+ int_x86_mmx_punpckhbw,
+ MMX_UNPCK_H_ITINS>;
defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
- int_x86_mmx_punpckhwd>;
+ int_x86_mmx_punpckhwd,
+ MMX_UNPCK_H_ITINS>;
defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
- int_x86_mmx_punpckhdq>;
+ int_x86_mmx_punpckhdq,
+ MMX_UNPCK_H_ITINS>;
defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
- int_x86_mmx_punpcklbw>;
+ int_x86_mmx_punpcklbw,
+ MMX_UNPCK_L_ITINS>;
defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
- int_x86_mmx_punpcklwd>;
+ int_x86_mmx_punpcklwd,
+ MMX_UNPCK_L_ITINS>;
defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
- int_x86_mmx_punpckldq>;
+ int_x86_mmx_punpckldq,
+ MMX_UNPCK_L_ITINS>;
// -- Pack Instructions
-defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
-defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
-defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+ MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+ MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+ MMX_PCK_ITINS>;
// -- Shuffle Instructions
-defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b>;
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+ MMX_PSHUF_ITINS>;
def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
- (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>;
+ (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+ IIC_MMX_PSHUF>;
def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
- imm:$src2))]>;
-
+ imm:$src2))],
+ IIC_MMX_PSHUF>;
@@ -361,24 +501,24 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
- SSEPackedSingle>, TB;
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
- SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
- SSEPackedSingle>, TB;
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
- SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
- SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
let Constraints = "$src1 = $dst" in {
defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
int_x86_sse_cvtpi2ps,
i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, TB;
+ SSEPackedSingle>, TB;
}
// Extract / Insert
@@ -386,14 +526,16 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
(outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
- (iPTR imm:$src2)))]>;
+ (iPTR imm:$src2)))],
+ IIC_MMX_PEXTR>;
let Constraints = "$src1 = $dst" in {
def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
(outs VR64:$dst),
(ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- GR32:$src2, (iPTR imm:$src3)))]>;
+ GR32:$src2, (iPTR imm:$src3)))],
+ IIC_MMX_PINSRW>;
def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
(outs VR64:$dst),
@@ -401,7 +543,8 @@ let Constraints = "$src1 = $dst" in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
(i32 (anyext (loadi16 addr:$src2))),
- (iPTR imm:$src3)))]>;
+ (iPTR imm:$src3)))],
+ IIC_MMX_PINSRW>;
}
// Mask creation
@@ -411,20 +554,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
(int_x86_mmx_pmovmskb VR64:$src))]>;
-// MMX to XMM for vector types
-def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
- [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
- (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))),
- (v2i64 (MOVQI2PQIrm addr:$src))>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq
- (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
- (v2i64 (MOVDI2PDIrm addr:$src))>;
-
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
[SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
@@ -439,11 +568,13 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
let Uses = [EDI] in
def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+ IIC_MMX_MASKMOV>;
let Uses = [RDI] in
def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+ IIC_MMX_MASKMOV>;
// 64-bit bit convert.
def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
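
Taken together, the changes in this file thread an itinerary argument through every MMX multiclass. For example, the instantiation defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, MMX_INTALU_ITINS, 1>; now expands, inside the surrounding let Constraints = "$src1 = $dst" block, to roughly the following two defs (a sketch obtained by substituting the template arguments into the multiclass body shown earlier in this diff):

def MMX_PADDBirr : MMXI<0xFC, MRMSrcReg, (outs VR64:$dst),
                        (ins VR64:$src1, VR64:$src2),
                        "paddb\t{$src2, $dst|$dst, $src2}",
                        [(set VR64:$dst, (int_x86_mmx_padd_b VR64:$src1,
                                                             VR64:$src2))],
                        IIC_MMX_ALU_RR> {
  let isCommutable = 1;
}
def MMX_PADDBirm : MMXI<0xFC, MRMSrcMem, (outs VR64:$dst),
                        (ins VR64:$src1, i64mem:$src2),
                        "paddb\t{$src2, $dst|$dst, $src2}",
                        [(set VR64:$dst, (int_x86_mmx_padd_b VR64:$src1,
                              (bitconvert (load_mmx addr:$src2))))],
                        IIC_MMX_ALU_RM>;
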
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 65e3c1e..e4c35b9 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -245,9 +245,9 @@ multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
// A vector extract of the first f32/f64 position is a subregister copy
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
- (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
- (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+ (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
@@ -283,14 +283,14 @@ def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
- (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
- (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
- (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
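
Throughout this file the sub_ss and sub_sd subregister indices are retired: a move between FR32/FR64 and the low lane of VR128 is now written as COPY_TO_REGCLASS instead of an INSERT_SUBREG/EXTRACT_SUBREG pair. The idiom, repeated from the hunk above with the post-isel effect noted in comments (the vreg numbering is illustrative only):

// Promote a scalar by changing register class; no IMPLICIT_DEF of the
// wide register is needed anymore.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// After instruction selection this becomes a single cross-class copy:
//   %vreg1 = COPY %vreg0   ; %vreg1 in VR128, %vreg0 in FR32
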
@@ -562,59 +562,57 @@ let Predicates = [HasAVX] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
(VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (VMOVSSrr (v4f32 (V_SET0)),
- (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+ (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (VMOVSSrr (v4i32 (V_SET0)),
- (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+ (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
(VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSrr (v4f32 (V_SET0)),
- (EXTRACT_SUBREG (v8f32 VR256:$src), sub_ss)), sub_xmm)>;
+ (VMOVSSrr (v4f32 (V_SET0)),
+ (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSrr (v4i32 (V_SET0)),
- (EXTRACT_SUBREG (v8i32 VR256:$src), sub_ss)), sub_xmm)>;
+ (VMOVSSrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
}
let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register; the AVX
  // versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  // MOVSDrm already zeros the high parts of the register; the AVX
  // versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
- (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>;
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
@@ -628,70 +626,68 @@ let Predicates = [HasAVX] in {
sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
// Move low f64 and clear high bits.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSDrr (v2f64 (V_SET0)),
- (EXTRACT_SUBREG (v4f64 VR256:$src), sub_sd)), sub_xmm)>;
+ (VMOVSDrr (v2f64 (V_SET0)),
+ (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSDrr (v2i64 (V_SET0)),
- (EXTRACT_SUBREG (v4i64 VR256:$src), sub_sd)), sub_xmm)>;
+ (VMOVSDrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
// Extract and store.
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
- (VMOVSSmr addr:$dst,
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
addr:$dst),
- (VMOVSDmr addr:$dst,
- (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+ (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
(VMOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+ (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
(VMOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+ (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
// 256-bit variants
def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
- (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_ss),
- (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_ss)), sub_xmm)>;
+ (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
- (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_ss),
- (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_ss)), sub_xmm)>;
+ (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
// Shuffle with VMOVSD
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
- sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
- sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
- (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_sd),
- (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_sd)), sub_xmm)>;
+ (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
- (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_sd),
- (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_sd)), sub_xmm)>;
+ (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
+ sub_xmm)>;
// FIXME: Instead of an X86Movlps there should be an X86Movsd here, the problem
@@ -699,17 +695,13 @@ let Predicates = [HasAVX] in {
// it has two uses through a bitcast. One use disappears at isel time and the
// fold opportunity reappears.
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),
- sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),
- sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
- sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
- sub_sd))>;
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
let Predicates = [HasSSE1] in {
@@ -719,37 +711,31 @@ let Predicates = [HasSSE1] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
(MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0)),
- (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+ (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0)),
- (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+ (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
}
let AddedComplexity = 20 in {
- // MOVSSrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG.
+ // MOVSSrm already zeros the high parts of the register.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
}
// Extract and store.
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
- (MOVSSmr addr:$dst,
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
// Shuffle with MOVSS
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
let Predicates = [HasSSE2] in {
@@ -761,50 +747,46 @@ let Predicates = [HasSSE2] in {
}
let AddedComplexity = 20 in {
- // MOVSDrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG.
+ // MOVSDrm already zeros the high parts of the register.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
}
// Extract and store.
def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
addr:$dst),
- (MOVSDmr addr:$dst,
- (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+ (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;
// Shuffle with MOVSD
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
// FIXME: Instead of an X86Movlps there should be an X86Movsd here, the problem
// is during lowering, where it's not possible to recognize the fold because
// it has two uses through a bitcast. One use disappears at isel time and the
// fold opportunity reappears.
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
//===----------------------------------------------------------------------===//
@@ -1416,14 +1398,15 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, Domain d, OpndItins itins> {
- def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (OpNode SrcRC:$src))],
- itins.rr, d>;
- def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
- itins.rm, d>;
+ X86MemOperand x86memop, string asm, Domain d,
+ OpndItins itins> {
+let neverHasSideEffects = 1 in {
+ def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [], itins.rr, d>;
+ let mayLoad = 1 in
+ def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [], itins.rm, d>;
+}
}
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
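
The rewritten sse12_cvt_p drops its SDNode and PatFrag parameters: the packed conversions it creates are now pattern-less, so their behavior has to be spelled out with neverHasSideEffects and mayLoad rather than inferred from a pattern. Instantiation then looks like the cvtdq2ps defs later in this patch, e.g.:

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                TB, Requires<[HasSSE2]>;
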
@@ -1443,7 +1426,7 @@ defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
SSE_CVT_SS2SI_32>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
@@ -1451,7 +1434,7 @@ defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
SSE_CVT_SD2SI>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>,
XD, VEX, VEX_W, VEX_LIG;
@@ -1465,11 +1448,14 @@ defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">,
XD, VEX_4V, VEX_LIG;
-defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
- XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
XD, VEX_4V, VEX_W, VEX_LIG;
+def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrr FR64:$dst, FR64:$src1, GR32:$src)>;
+def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+
let Predicates = [HasAVX], AddedComplexity = 1 in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -1519,14 +1505,14 @@ defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
// and/or XMM operand(s).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+ Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
string asm, OpndItins itins> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>;
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm>;
+ [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>;
}
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -1548,30 +1534,31 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
itins.rm>;
}
-defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
- f128mem, load, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
+ int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si{l}",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
- int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si",
- SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+ int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si{q}",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
- f128mem, load, "cvtsd2si{l}", SSE_CVT_SD2SI>, XD;
+ sdmem, sse_load_f64, "cvtsd2si{l}", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
- f128mem, load, "cvtsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W;
+ sdmem, sse_load_f64, "cvtsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W;
defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss",
SSE_CVT_Scalar, 0>, XS, VEX_4V;
defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss",
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
SSE_CVT_Scalar, 0>, XS, VEX_4V,
VEX_W;
defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd",
SSE_CVT_Scalar, 0>, XD, VEX_4V;
defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd",
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
SSE_CVT_Scalar, 0>, XD,
VEX_4V, VEX_W;
@@ -1587,94 +1574,71 @@ let Constraints = "$src1 = $dst" in {
"cvtsi2sd", SSE_CVT_Scalar>, XD;
defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64,
- "cvtsi2sd", SSE_CVT_Scalar>, XD, REX_W;
+ "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
}
/// SSE 1 Only
// Aliases for intrinsics
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
- f32mem, load, "cvttss2si",
+ ssmem, sse_load_f32, "cvttss2si",
SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
- int_x86_sse_cvttss2si64, f32mem, load,
- "cvttss2si", SSE_CVT_SS2SI_64>,
- XS, VEX, VEX_W;
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si{q}", SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
- f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>,
- XD, VEX;
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
- int_x86_sse2_cvttsd2si64, f128mem, load,
- "cvttsd2si", SSE_CVT_SD2SI>,
- XD, VEX, VEX_W;
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si{q}", SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W;
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
- f32mem, load, "cvttss2si",
+ ssmem, sse_load_f32, "cvttss2si",
SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
- int_x86_sse_cvttss2si64, f32mem, load,
- "cvttss2si{q}", SSE_CVT_SS2SI_64>,
- XS, REX_W;
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si{q}", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
- f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>,
- XD;
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
- int_x86_sse2_cvttsd2si64, f128mem, load,
- "cvttsd2si{q}", SSE_CVT_SD2SI>,
- XD, REX_W;
-
-let Pattern = []<dag> in {
-defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
- "cvtss2si{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
-defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
- "cvtss2si\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
-defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
-defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
-}
-
-let Pattern = []<dag> in {
-defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
- "cvtss2si{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_32>, XS;
-defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/,
- "cvtss2si{q}\t{$src, $dst|$dst, $src}",
- SSE_CVT_SS2SI_64>, XS, REX_W;
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W;
+
+defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si{l}",
+ SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si{q}",
+ SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si{l}",
+ SSE_CVT_SS2SI_32>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si{q}",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
+
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ TB, VEX, Requires<[HasAVX]>;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>,
+ TB, VEX, Requires<[HasAVX]>;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
-                    TB; /* PD SSE3 form is available */
-}
-
-let Predicates = [HasAVX] in {
- def : Pat<(int_x86_sse_cvtss2si VR128:$src),
- (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
- def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
- (VCVTSS2SIrm addr:$src)>;
- def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
- (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
- def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
- (VCVTSS2SI64rm addr:$src)>;
-}
-
-let Predicates = [HasSSE1] in {
- def : Pat<(int_x86_sse_cvtss2si VR128:$src),
- (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
- def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
- (CVTSS2SIrm addr:$src)>;
- def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
- (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
- def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
- (CVTSS2SI64rm addr:$src)>;
-}
+ TB, Requires<[HasSSE2]>;
/// SSE 2 Only
// Convert scalar double to scalar single
+let neverHasSideEffects = 1 in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR64:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
@@ -1685,6 +1649,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
+}
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
Requires<[HasAVX]>;
@@ -1700,17 +1665,37 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
XD,
Requires<[HasSSE2, OptForSize]>;
-defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
- int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss",
- SSE_CVT_Scalar, 0>,
- XS, VEX_4V;
-let Constraints = "$src1 = $dst" in
-defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
- int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss",
- SSE_CVT_Scalar>, XS;
+def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>;
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>;
+
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XD, Requires<[HasSSE2]>;
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+ VR128:$src1, sse_load_f64:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XD, Requires<[HasSSE2]>;
+}
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
+let neverHasSideEffects = 1 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR32:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1722,19 +1707,21 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
+}
-let Predicates = [HasAVX] in {
+let AddedComplexity = 1 in { // give AVX priority
def : Pat<(f64 (fextend FR32:$src)),
- (VCVTSS2SDrr FR32:$src, FR32:$src)>;
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
- def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
-}
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (MOVSSrm addr:$src))>,
- Requires<[HasAVX, OptForSpeed]>;
+ def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+ def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ Requires<[HasAVX, OptForSpeed]>;
+} // AddedComplexity = 1
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
@@ -1760,190 +1747,146 @@ def : Pat<(extloadf32 addr:$src),
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V,
- Requires<[HasAVX]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- (load addr:$src2)))],
- IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V,
- Requires<[HasAVX]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS,
- Requires<[HasSSE2]>;
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- (load addr:$src2)))],
- IIC_SSE_CVT_Scalar_RM>, XS,
- Requires<[HasSSE2]>;
-}
-
-// Convert doubleword to packed single/double fp
-// SSE2 instructions without OpSize prefix
-def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
- IIC_SSE_CVT_PS_RR>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vcvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PS_RM>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
- IIC_SSE_CVT_PS_RR>,
- TB, Requires<[HasSSE2]>;
-def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PS_RM>,
- TB, Requires<[HasSSE2]>;
-
-// FIXME: why the non-intrinsic version is described as SSE3?
-// SSE2 instructions with XS prefix
-def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XS, VEX, Requires<[HasAVX]>;
-def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>,
- XS, VEX, Requires<[HasAVX]>;
-def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XS, Requires<[HasSSE2]>;
-def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>,
- XS, Requires<[HasSSE2]>;
-
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+ IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>;
+}
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, VEX;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
IIC_SSE_CVT_PS_RR>, VEX;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>;
-def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>,
- VEX;
-def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
-def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>;
-def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PS_RM>;
-
-// SSE2 packed instructions with XD prefix
-def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XD, VEX, Requires<[HasAVX]>;
-def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- XD, VEX, Requires<[HasAVX]>;
-def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XD, Requires<[HasSSE2]>;
-def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- XD, Requires<[HasSSE2]>;
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ VEX;
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
+def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>,
+ VEX, VEX_L;
+def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
+}
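// With a bare xmm destination the memory form is ambiguous between a 128-bit
// and a 256-bit source, so the x/y-suffixed mnemonics make the width explicit
// and the aliases map the plain spelling onto the register forms, e.g.:
//   vcvtpd2dqx (%rax), %xmm0   ; 128-bit memory source
//   vcvtpd2dqy (%rax), %xmm0   ; 256-bit memory source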
+
+def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>;
+def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
-def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvttps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
-def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
-def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
-def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
- (memopv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
-
-def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvttps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>;
-def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvttps2dq (memop addr:$src)))],
- IIC_SSE_CVT_PS_RM>;
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+ (memopv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX;
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>;
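// cvttps2dq converts with truncation toward zero, which is exactly the
// semantics of a C-style float-to-int cast; that is why the fp_to_sint
// patterns below select it. A hedged sketch of the memory-form equivalent:
//   def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
//             (CVTTPS2DQrm addr:$src)>;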
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (Int_VCVTDQ2PSrr VR128:$src)>;
+ (VCVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (Int_VCVTDQ2PSrm addr:$src)>;
+ (VCVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ (VCVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(VCVTTPS2DQrr VR128:$src)>;
@@ -1963,9 +1906,14 @@ let Predicates = [HasAVX] in {
let Predicates = [HasSSE2] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (Int_CVTDQ2PSrr VR128:$src)>;
+ (CVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (Int_CVTDQ2PSrm addr:$src)>;
+ (CVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ (CVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(CVTTPS2DQrr VR128:$src)>;
@@ -1978,183 +1926,186 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
[(set VR128:$dst,
(int_x86_sse2_cvttpd2dq VR128:$src))],
IIC_SSE_CVT_PD_RR>, VEX;
-let isCodeGenOnly = 1 in
-def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX;
-def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
-def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
-def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
-def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
+ "cvttpd2dqx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memopv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
+def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+ (VCVTTPD2DQYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memopv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>;
// Convert packed single to packed double
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>, TB, VEX;
+let neverHasSideEffects = 1, mayLoad = 1 in
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
IIC_SSE_CVT_PD_RR>, TB, VEX;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PD_RM>, TB, VEX;
}
+
+let Predicates = [HasSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}", [],
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>, TB;
+let neverHasSideEffects = 1, mayLoad = 1 in
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, TB;
+}
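// The memory form carries no selection pattern; mayLoad = 1 and
// neverHasSideEffects = 1 describe it to the scheduler directly. The deleted
// Int_*rm definitions below folded a pattern of the shape
//   [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]
// which, for a v4f32 intrinsic argument, implies a full 128-bit load from an
// f64mem operand; that mismatch is presumably what this change avoids.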
-def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd
- (load addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- TB, Requires<[HasSSE2]>;
-def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd
- (load addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- TB, Requires<[HasSSE2]>;
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX] in {
+let neverHasSideEffects = 1, mayLoad = 1 in
+def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ []>, VEX;
+def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX;
+def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtdq2_pd_256
+ (bitconvert (memopv2i64 addr:$src))))]>, VEX;
+def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX;
+}
+
+let neverHasSideEffects = 1, mayLoad = 1 in
+def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+                      IIC_SSE_CVT_PD_RM>;
+def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>;
+
+// AVX 256-bit register conversion intrinsics
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+ (VCVTDQ2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VCVTDQ2PDYrm addr:$src)>;
+} // Predicates = [HasAVX]
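// Moving the intrinsic into the instruction's own pattern makes standalone
// intrinsic Pat<>s unnecessary; had they been kept, they would look like the
// (now deleted) pair further down, e.g.:
//   def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src),
//             (VCVTDQ2PDYrr VR128:$src)>;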
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
IIC_SSE_CVT_PD_RR>, VEX;
-def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
-def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2psx\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2psx\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2psx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2psy\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "cvtpd2psy\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
+def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
+
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
IIC_SSE_CVT_PD_RR>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>;
-def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
-def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
-def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
-def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
-
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
-def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
- (VCVTDQ2PSYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
- (VCVTDQ2PSYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src),
- (VCVTPD2PSYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)),
- (VCVTPD2PSYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src),
- (VCVTPS2DQYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)),
- (VCVTPS2DQYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src),
- (VCVTPS2PDYrr VR128:$src)>;
-def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)),
- (VCVTPS2PDYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src),
- (VCVTTPD2DQYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)),
- (VCVTTPD2DQYrm addr:$src)>;
-
-// Match fround and fextend for 128/256-bit conversions
-def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
- (VCVTPD2PSYrr VR256:$src)>;
-def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
- (VCVTPD2PSYrm addr:$src)>;
-
-def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
- (VCVTPS2PDYrr VR128:$src)>;
-def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
- (VCVTPS2PDYrm addr:$src)>;
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
+ (VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
+ (VCVTDQ2PSYrm addr:$src)>;
+
+ // Match fround and fextend for 128/256-bit conversions
+ def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+ (VCVTPD2PSYrr VR256:$src)>;
+ def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+ (VCVTPD2PSYrm addr:$src)>;
+
+ def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+ (VCVTPS2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+ (VCVTPS2PDYrm addr:$src)>;
+}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
@@ -2587,17 +2538,13 @@ let Predicates = [HasAVX] in {
OpSize, VEX;
def : Pat<(i32 (X86fgetsign FR32:$src)),
- (VMOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
- sub_ss))>;
+ (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
- (VMOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
- sub_ss))>;
+ (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
- (VMOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
- sub_sd))>;
+ (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
- (VMOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
- sub_sd))>;
+ (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>;
// Assembler Only
def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
@@ -2622,17 +2569,17 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
SSEPackedDouble>, TB, OpSize;
def : Pat<(i32 (X86fgetsign FR32:$src)),
- (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
- sub_ss))>, Requires<[HasSSE1]>;
+ (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+ Requires<[HasSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
- (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
- sub_ss))>, Requires<[HasSSE1]>;
+ (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+ Requires<[HasSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
- (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
- sub_sd))>, Requires<[HasSSE2]>;
+ (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+ Requires<[HasSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
- (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
- sub_sd))>, Requires<[HasSSE2]>;
+ (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+ Requires<[HasSSE2]>;
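// COPY_TO_REGCLASS reinterprets the FR32/FR64 scalar as a VR128 value in
// place of the old IMPLICIT_DEF + INSERT_SUBREG sequence; since scalars
// already live in xmm registers, the copy is normally coalesced away. The
// idiom in the abstract (SomeInst is a placeholder, not a real definition):
//   (SomeInst (COPY_TO_REGCLASS FR32:$src, VR128))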
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
@@ -3230,34 +3177,30 @@ def : Pat<(f32 (X86frcp (load addr:$src))),
let Predicates = [HasAVX], AddedComplexity = 1 in {
def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
- (VSQRTSSr (f32 (IMPLICIT_DEF)),
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
- sub_ss)>;
+ (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR32)),
+ VR128)>;
def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
(VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
- (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
- (VSQRTSDr (f64 (IMPLICIT_DEF)),
- (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd)),
- sub_sd)>;
+ (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR64)),
+ VR128)>;
def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
(VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
- (VRSQRTSSr (f32 (IMPLICIT_DEF)),
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
- sub_ss)>;
+ (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR32)),
+ VR128)>;
def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
(VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
def : Pat<(int_x86_sse_rcp_ss VR128:$src),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
- (VRCPSSr (f32 (IMPLICIT_DEF)),
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
- sub_ss)>;
+ (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128:$src, FR32)),
+ VR128)>;
def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
(VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
}
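// The rewritten patterns run the COPY_TO_REGCLASS idiom in both directions:
// VR128 -> FR32/FR64 to feed the scalar instruction, then back to VR128 for
// the intrinsic's vector result. The (f32 (IMPLICIT_DEF)) first operand
// leaves the pass-through upper elements unspecified, as before.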
@@ -3336,13 +3279,6 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
IIC_SSE_MOVNT>, VEX;
}
-def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src),
- (VMOVNTDQYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src),
- (VMOVNTPDYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
- (VMOVNTPSYmr addr:$dst, VR256:$src)>;
-
let AddedComplexity = 400 in { // Prefer non-temporal versions
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
@@ -4610,7 +4546,7 @@ def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
// Bitcast FR64 <-> GR64
//
let Predicates = [HasAVX] in
-def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
VEX;
@@ -4623,7 +4559,7 @@ def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, VEX;
-def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
IIC_SSE_MOVDQ>;
@@ -4897,80 +4833,6 @@ def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
//===---------------------------------------------------------------------===//
-// SSE3 - Conversion Instructions
-//===---------------------------------------------------------------------===//
-
-// Convert Packed Double FP to Packed DW Integers
-let Predicates = [HasAVX] in {
-// The assembler can recognize rr 256-bit instructions by seeing a ymm
-// register, but the same isn't true when using memory operands instead.
-// Provide other assembly rr and rm forms to address this explicitly.
-def VCVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTPD2DQXrYr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-
-// XMM only
-def VCVTPD2DQXrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTPD2DQXrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
-
-// YMM only
-def VCVTPD2DQYrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
-}
-
-def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>;
-def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>;
-
-def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
- (VCVTTPD2DQYrr VR256:$src)>;
-def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
- (VCVTTPD2DQYrm addr:$src)>;
-
-// Convert Packed DW Integers to Packed Double FP
-let Predicates = [HasAVX] in {
-def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDYrm : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-}
-
-def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>;
-def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>;
-
-// AVX 256-bit register conversion intrinsics
-def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src),
- (VCVTDQ2PDYrr VR128:$src)>;
-def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))),
- (VCVTDQ2PDYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src),
- (VCVTPD2DQYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)),
- (VCVTPD2DQYrm addr:$src)>;
-
-def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
- (VCVTDQ2PDYrr VR128:$src)>;
-def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (VCVTDQ2PDYrm addr:$src)>;
-
-//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
@@ -5580,16 +5442,14 @@ let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
[(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
Requires<[HasSSE3]>;
-def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2),
- [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>,
- Requires<[HasSSE3]>;
}
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
-def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", [], IIC_SSE_MWAIT>,
+def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
TB, Requires<[HasSSE3]>;
def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
@@ -5730,14 +5590,26 @@ let Predicates = [HasSSE41] in {
(PMOVZXDQrm addr:$src)>;
}
+let Predicates = [HasAVX2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
+ (VPMOVZXDQYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
+ (VPMOVZXWDYrr VR128:$src)>;
+ }
+
+ def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+}
+
let Predicates = [HasAVX] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
}
let Predicates = [HasSSE41] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
}
@@ -6608,15 +6480,15 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
- VR128, memopv4f32, i128mem, 0>, VEX_4V;
+ VR128, memopv4f32, f128mem, 0>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
- int_x86_avx_blend_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V;
+ int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
- VR128, memopv2f64, i128mem, 0>, VEX_4V;
+ VR128, memopv2f64, f128mem, 0>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
- int_x86_avx_blend_pd_256, VR256, memopv4f64, i256mem, 0>, VEX_4V;
+ int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V;
}
defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem, 0>, VEX_4V;
@@ -6625,10 +6497,10 @@ let Predicates = [HasAVX] in {
}
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
- VR128, memopv4f32, i128mem, 0>, VEX_4V;
+ VR128, memopv4f32, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
- VR128, memopv2f64, i128mem, 0>, VEX_4V;
+ VR128, memopv2f64, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, memopv8f32, i256mem, 0>, VEX_4V;
@@ -6647,10 +6519,10 @@ let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in
defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
- VR128, memopv4f32, i128mem>;
+ VR128, memopv4f32, f128mem>;
let ExeDomain = SSEPackedDouble in
defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
- VR128, memopv2f64, i128mem>;
+ VR128, memopv2f64, f128mem>;
defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem>;
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
@@ -6658,10 +6530,10 @@ let Constraints = "$src1 = $dst" in {
}
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
- VR128, memopv4f32, i128mem>;
+ VR128, memopv4f32, f128mem>;
let ExeDomain = SSEPackedDouble in
defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
- VR128, memopv2f64, i128mem>;
+ VR128, memopv2f64, f128mem>;
}
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
@@ -6687,15 +6559,15 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
-defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem,
+defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
memopv2f64, int_x86_sse41_blendvpd>;
-defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem,
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
memopv4f64, int_x86_avx_blendv_pd_256>;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
-defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem,
+defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
memopv4f32, int_x86_sse41_blendvps>;
-defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem,
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
memopv8f32, int_x86_avx_blendv_ps_256>;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
@@ -6766,7 +6638,7 @@ let Predicates = [HasAVX2] in {
/// SS41I_ternary_int - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- Intrinsic IntId> {
+ X86MemOperand x86memop, Intrinsic IntId> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
@@ -6775,7 +6647,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
OpSize;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2),
+ (ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst,
@@ -6785,14 +6657,28 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
}
let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64,
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
int_x86_sse41_blendvpd>;
let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32,
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
int_x86_sse41_blendvps>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64,
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
int_x86_sse41_pblendvb>;
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
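// These aliases let the implicit xmm0 operand be spelled out, so both of the
// following assemble to the same encoding (AT&T syntax):
//   blendvps %xmm0, %xmm2, %xmm1
//   blendvps %xmm2, %xmm1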
let Predicates = [HasSSE41] in {
def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
(v16i8 VR128:$src2))),
@@ -6955,81 +6841,42 @@ let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
}
// Packed Compare Implicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS] in {
- multiclass SS42AI_pcmpistri<Intrinsic IntId128, string asm = "pcmpistri"> {
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+ multiclass SS42AI_pcmpistri<string asm> {
def rr : SS42AI<0x63, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- [(set ECX, (IntId128 VR128:$src1, VR128:$src2, imm:$src3)),
- (implicit EFLAGS)]>, OpSize;
+ []>, OpSize;
+ let mayLoad = 1 in
def rm : SS42AI<0x63, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)),
- (implicit EFLAGS)]>, OpSize;
+ []>, OpSize;
}
}
-let Predicates = [HasAVX] in {
-defm VPCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">,
- VEX;
-defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">,
- VEX;
-defm VPCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128, "vpcmpistri">,
- VEX;
-defm VPCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128, "vpcmpistri">,
- VEX;
-defm VPCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128, "vpcmpistri">,
- VEX;
-defm VPCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128, "vpcmpistri">,
- VEX;
-}
-
-defm PCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>;
-defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>;
-defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>;
-defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>;
-defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>;
-defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>;
+let Predicates = [HasAVX] in
+defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
// Packed Compare Explicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in {
- multiclass SS42AI_pcmpestri<Intrinsic IntId128, string asm = "pcmpestri"> {
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+ multiclass SS42AI_pcmpestri<string asm> {
def rr : SS42AI<0x61, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src3, i8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)),
- (implicit EFLAGS)]>, OpSize;
+ []>, OpSize;
+ let mayLoad = 1 in
def rm : SS42AI<0x61, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src3, i8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- [(set ECX,
- (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)),
- (implicit EFLAGS)]>, OpSize;
+ []>, OpSize;
}
}
-let Predicates = [HasAVX] in {
-defm VPCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">,
- VEX;
-defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">,
- VEX;
-defm VPCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128, "vpcmpestri">,
- VEX;
-defm VPCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128, "vpcmpestri">,
- VEX;
-defm VPCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128, "vpcmpestri">,
- VEX;
-defm VPCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128, "vpcmpestri">,
- VEX;
-}
-
-defm PCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>;
-defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>;
-defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>;
-defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>;
-defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>;
-defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
+let Predicates = [HasAVX] in
+defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
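// The per-condition-code variants (A/C/O/S/Z) collapse into one mnemonic:
// ECX and EFLAGS stay listed as defs, the TableGen patterns are gone, and
// with neverHasSideEffects/mayLoad the flag-specific intrinsics are
// presumably matched by custom selection code instead. Example use:
//   pcmpestri $0x1a, %xmm2, %xmm1   ; lengths in EAX/EDX, index out in ECX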
//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
@@ -7204,52 +7051,50 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
OpSize;
//===----------------------------------------------------------------------===//
-// CLMUL Instructions
+// PCLMUL Instructions
//===----------------------------------------------------------------------===//
-// Carry-less Multiplication instructions
-let neverHasSideEffects = 1 in {
// AVX carry-less Multiplication instructions
-def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>;
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
-let mayLoad = 1 in
-def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>;
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))]>;
+// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
-def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>;
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
-let mayLoad = 1 in
-def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>;
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))]>;
} // Constraints = "$src1 = $dst"
-} // neverHasSideEffects = 1
multiclass pclmul_alias<string asm, int immop> {
- def : InstAlias<!strconcat("pclmul", asm,
- "dq {$src, $dst|$dst, $src}"),
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
(PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;
- def : InstAlias<!strconcat("pclmul", asm,
- "dq {$src, $dst|$dst, $src}"),
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
(PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;
- def : InstAlias<!strconcat("vpclmul", asm,
+ def : InstAlias<!strconcat("vpclmul", asm,
"dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
(VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;
- def : InstAlias<!strconcat("vpclmul", asm,
+ def : InstAlias<!strconcat("vpclmul", asm,
"dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
(VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
}
@@ -7259,6 +7104,45 @@ defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst),
+ (ins VR128:$src, i8imm:$len, i8imm:$idx),
+ "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+ [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
+ imm:$idx))]>, TB, OpSize;
+def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "extrq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+ VR128:$mask))]>, TB, OpSize;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
+ "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+ [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
+ VR128:$src2, imm:$len, imm:$idx))]>, XD;
+def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "insertq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+ VR128:$mask))]>, XD;
+}
+
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movntss\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movntsd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+}
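// SSE4A in brief: EXTRQ/EXTRQI extract a bit field (length, index) from the
// low quadword, INSERTQ/INSERTQI insert one, and MOVNTSS/MOVNTSD are scalar
// non-temporal stores. The immediate forms take the field as $len/$idx; the
// register forms roughly pack the same information (plus, for INSERTQ, the
// field bits themselves) into $mask. Immediate-form example (AT&T order):
//   extrq $4, $8, %xmm0   ; extract an 8-bit field starting at bit 4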
+
+//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//
@@ -7286,7 +7170,7 @@ let ExeDomain = SSEPackedSingle in {
int_x86_avx_vbroadcast_ss_256>;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
+def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
int_x86_avx_vbroadcast_sd_256>;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256>;
@@ -7298,8 +7182,8 @@ let ExeDomain = SSEPackedSingle in {
int_x86_avx2_vbroadcast_ss_ps_256>;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
- int_x86_avx2_vbroadcast_sd_pd_256>;
+def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
+ int_x86_avx2_vbroadcast_sd_pd_256>;
let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
@@ -7595,7 +7479,6 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
-let Predicates = [HasAVX, HasF16C] in {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (Int VR128:$src))]>,
@@ -7604,27 +7487,26 @@ let Predicates = [HasAVX, HasF16C] in {
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
}
-}
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
-let Predicates = [HasAVX, HasF16C] in {
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32i8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
TA, OpSize, VEX;
- let neverHasSideEffects = 1, mayLoad = 1 in
- def mr : Ii8<0x1D, MRMDestMem, (outs x86memop:$dst),
- (ins RC:$src1, i32i8imm:$src2),
+ let neverHasSideEffects = 1, mayStore = 1 in
+ def mr : Ii8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
TA, OpSize, VEX;
}
-}
-defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
-defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>;
-defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
-defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>;
+let Predicates = [HasAVX, HasF16C] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>;
+}
//===----------------------------------------------------------------------===//
// AVX2 Instructions
@@ -7711,6 +7593,49 @@ let Predicates = [HasAVX2] in {
(VPBROADCASTQrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
(VPBROADCASTQYrm addr:$src)>;
+
+ def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBrr VR128:$src)>;
+ def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBYrr VR128:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWrr VR128:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWYrr VR128:$src)>;
+ def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDrr VR128:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDYrr VR128:$src)>;
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQYrr VR128:$src)>;
+ def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSrr VR128:$src)>;
+ def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSYrr VR128:$src)>;
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VBROADCASTSDYrr VR128:$src)>;
+
+  // Provide a fallback in case the load node used in the patterns above
+  // has additional users, which prevents those patterns from being selected.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ }
}
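// On AVX2 a register-source broadcast is a single instruction, so the
// fallback only needs a COPY_TO_REGCLASS to view the scalar as VR128;
// AddedComplexity = 20 gives these patterns priority over more generic
// lowerings of the same node.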
// AVX1 broadcast patterns
@@ -7718,16 +7643,42 @@ let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
- (VBROADCASTSDrm addr:$src)>;
+ (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
- (VBROADCASTSDrm addr:$src)>;
-
+ (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
+
+  // Provide a fallback in case the load node used in the patterns above
+  // has additional users, which prevents those patterns from being selected.
+ let AddedComplexity = 20 in {
+ // 128bit broadcasts:
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+ }
}
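// AVX1 has no register-source broadcast, so the fallback synthesizes one:
// VPSHUFDri with immediate 0 (dword selectors 0,0,0,0) splats a 32-bit
// element, immediate 0x44 (binary 01'00'01'00, selectors 0,1,0,1) splats the
// low 64-bit pair, and VINSERTF128rr duplicates the xmm result into both
// halves of the ymm register.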
//===----------------------------------------------------------------------===//
@@ -7820,8 +7771,8 @@ let neverHasSideEffects = 1 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>,
- VEX_4V;
+ []>, VEX_4V;
+let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i128mem:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -7954,3 +7905,30 @@ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256> {
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+ (ins VR128:$src1, memop128:$src2, VR128:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+ (ins RC256:$src1, memop256:$src2, RC256:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3, VEX_L;
+}
+
+let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+}
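// Gathers produce two results: the loaded vector and a written-back mask
// (cleared lane by lane as elements arrive), hence the tied $src1/$dst and
// $mask/$mask_wb constraints. The vx/vy memory operands name the index
// vector width; vx64mem is assumed here to mean an xmm index vector
// addressing 64-bit elements.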
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index bddba6c..ea716bf 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -14,7 +14,8 @@
//===----------------------------------------------------------------------===//
let Defs = [RAX, RDX] in
- def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB;
+ def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+ TB;
let Defs = [RAX, RCX, RDX] in
def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
@@ -26,14 +27,17 @@ let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
}
-def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
-def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
// Interrupt and SysCall Instructions.
let Uses = [EFLAGS] in
def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
- [(int_x86_int (i8 3))]>;
+ [(int_x86_int (i8 3))], IIC_INT3>;
+
+def : Pat<(debugtrap),
+ (INT3)>;
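// llvm.debugtrap lowers to the one-byte int3, e.g. IR such as
//   call void @llvm.debugtrap()
// selects to a single INT3 instruction.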
// The long form of "int $3" turns into int3 as a size optimization.
// FIXME: This doesn't work because InstAlias can't match immediate constants.
@@ -41,23 +45,25 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
- [(int_x86_int imm:$trap)]>;
+ [(int_x86_int imm:$trap)], IIC_INT>;
-def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
-def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB;
-def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", []>, TB,
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
Requires<[In64BitMode]>;
-def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB;
-
-def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB;
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+ IIC_SYS_ENTER_EXIT>, TB;
def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB,
Requires<[In64BitMode]>;
-def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize;
-def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>;
-def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>,
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>;
+def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
Requires<[In64BitMode]>;
@@ -66,73 +72,73 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>,
//
let Defs = [AL], Uses = [DX] in
def IN8rr : I<0xEC, RawFrm, (outs), (ins),
- "in{b}\t{%dx, %al|AL, DX}", []>;
+ "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>;
let Defs = [AX], Uses = [DX] in
def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|AX, DX}", []>, OpSize;
+ "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize;
let Defs = [EAX], Uses = [DX] in
def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|EAX, DX}", []>;
+ "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
- "in{b}\t{$port, %al|AL, $port}", []>;
+ "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{w}\t{$port, %ax|AX, $port}", []>, OpSize;
+ "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{l}\t{$port, %eax|EAX, $port}", []>;
+ "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>;
let Uses = [DX, AL] in
def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
- "out{b}\t{%al, %dx|DX, AL}", []>;
+ "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>;
let Uses = [DX, AX] in
def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|DX, AX}", []>, OpSize;
+ "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize;
let Uses = [DX, EAX] in
def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|DX, EAX}", []>;
+ "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
- "out{b}\t{%al, $port|$port, AL}", []>;
+ "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{w}\t{%ax, $port|$port, AX}", []>, OpSize;
+ "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{l}\t{%eax, $port|$port, EAX}", []>;
+ "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>;
-def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", []>;
-def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", []>, OpSize;
-def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", []>;
+def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
+def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize;
+def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>;
//===----------------------------------------------------------------------===//
// Moves to and from debug registers
def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
//===----------------------------------------------------------------------===//
// Moves to and from control registers
def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
//===----------------------------------------------------------------------===//
// Segment override instruction prefixes
@@ -150,254 +156,265 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
//
def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize;
def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize;
def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize;
def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize;
def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
//===----------------------------------------------------------------------===//
// Segmentation support instructions.
-def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB;
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize;
def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize;
// The i16mem operand in LAR32rm and the GR32 operand in LAR32rr are not typos.
def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
// The i16mem operand in LAR64rm and the GR32 operand in LAR64rr are not typos.
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize;
def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize;
def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
-def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB;
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+ [], IIC_INVLPG>, TB;
def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
- "str{w}\t$dst", []>, TB, OpSize;
+ "str{w}\t$dst", [], IIC_STR>, TB, OpSize;
def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
- "str{l}\t$dst", []>, TB;
+ "str{l}\t$dst", [], IIC_STR>, TB;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
- "str{q}\t$dst", []>, TB;
+ "str{q}\t$dst", [], IIC_STR>, TB;
def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
- "str{w}\t$dst", []>, TB;
+ "str{w}\t$dst", [], IIC_STR>, TB;
def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
- "ltr{w}\t$src", []>, TB;
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
- "ltr{w}\t$src", []>, TB;
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t{%cs|CS}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t{%ss|SS}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t{%ss|SS}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t{%ds|DS}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t{%ds|DS}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t{%es|ES}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t{%es|ES}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t{%fs|FS}", []>, OpSize, TB;
+ "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t{%fs|FS}", []>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t{%gs|GS}", []>, OpSize, TB;
+ "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t{%gs|GS}", []>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t{%fs|FS}", []>, TB;
+ "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB;
def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t{%gs|GS}", []>, TB;
+ "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB;
// No "pop cs" instruction.
def POPSS16 : I<0x17, RawFrm, (outs), (ins),
- "pop{w}\t{%ss|SS}", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ OpSize, Requires<[In32BitMode]>;
def POPSS32 : I<0x17, RawFrm, (outs), (ins),
- "pop{l}\t{%ss|SS}", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ Requires<[In32BitMode]>;
def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
- "pop{w}\t{%ds|DS}", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%ds|DS}", [], IIC_POP_SR>,
+ OpSize, Requires<[In32BitMode]>;
def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
- "pop{l}\t{%ds|DS}", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%ds|DS}", [], IIC_POP_SR>,
+ Requires<[In32BitMode]>;
def POPES16 : I<0x07, RawFrm, (outs), (ins),
- "pop{w}\t{%es|ES}", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%es|ES}", [], IIC_POP_SR>,
+ OpSize, Requires<[In32BitMode]>;
def POPES32 : I<0x07, RawFrm, (outs), (ins),
- "pop{l}\t{%es|ES}", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%es|ES}", [], IIC_POP_SR>,
+ Requires<[In32BitMode]>;
def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t{%fs|FS}", []>, OpSize, TB;
+ "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB;
def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t{%fs|FS}", []>, TB , Requires<[In32BitMode]>;
+ "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t{%fs|FS}", []>, TB;
+ "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB;
def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t{%gs|GS}", []>, OpSize, TB;
+ "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB;
def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t{%gs|GS}", []>, TB , Requires<[In32BitMode]>;
+ "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t{%gs|GS}", []>, TB;
+ "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB;
def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lds{l}\t{$src, $dst|$dst, $src}", []>;
+ "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lss{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lss{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "les{l}\t{$src, $dst|$dst, $src}", []>;
+ "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
- "verr\t$seg", []>, TB;
+ "verr\t$seg", [], IIC_VERR>, TB;
def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
- "verr\t$seg", []>, TB;
+ "verr\t$seg", [], IIC_VERR>, TB;
def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
- "verw\t$seg", []>, TB;
+ "verw\t$seg", [], IIC_VERW_MEM>, TB;
def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
- "verw\t$seg", []>, TB;
+ "verw\t$seg", [], IIC_VERW_REG>, TB;
//===----------------------------------------------------------------------===//
// Descriptor-table support instructions
def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
- "sgdtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "sgdtw\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
- "sgdt\t$dst", []>, TB;
+ "sgdt\t$dst", [], IIC_SGDT>, TB;
def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
- "sidtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "sidtw\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>;
def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
"sidt\t$dst", []>, TB;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
- "sldt{w}\t$dst", []>, TB, OpSize;
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize;
def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
- "sldt{w}\t$dst", []>, TB;
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
- "sldt{l}\t$dst", []>, TB;
+ "sldt{l}\t$dst", [], IIC_SLDT>, TB;
// LLDT is not interpreted specially in 64-bit mode because there is no sign
// extension.
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
- "sldt{q}\t$dst", []>, TB;
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
- "sldt{q}\t$dst", []>, TB;
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "lgdtw\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>;
def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt\t$src", []>, TB;
+ "lgdt\t$src", [], IIC_LGDT>, TB;
def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "lidtw\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>;
def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt\t$src", []>, TB;
+ "lidt\t$src", [], IIC_LIDT>, TB;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
- "lldt{w}\t$src", []>, TB;
+ "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
- "lldt{w}\t$src", []>, TB;
+ "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
//===----------------------------------------------------------------------===//
// Specialized register support
-def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
-def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
-def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
- "smsw{w}\t$dst", []>, OpSize, TB;
+ "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB;
def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
- "smsw{l}\t$dst", []>, TB;
+ "smsw{l}\t$dst", [], IIC_SMSW>, TB;
// no m form encodable; use SMSW16m
def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
- "smsw{q}\t$dst", []>, TB;
+ "smsw{q}\t$dst", [], IIC_SMSW>, TB;
// For memory operands, there is only a 16-bit form
def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
- "smsw{w}\t$dst", []>, TB;
+ "smsw{w}\t$dst", [], IIC_SMSW>, TB;
def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
- "lmsw{w}\t$src", []>, TB;
+ "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
- "lmsw{w}\t$src", []>, TB;
+ "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
-def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
+def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
//===----------------------------------------------------------------------===//
// Cache instructions
-def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
-def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB;
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
//===----------------------------------------------------------------------===//
// XSAVE instructions
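
The X86InstrSystem.td hunks above are largely mechanical: each system
instruction gains an instruction-itinerary class (IIC_*) so subtarget
scheduling models can attach latencies to it; encodings and selection
patterns are otherwise unchanged. The one semantic addition is the
(debugtrap) pattern, which lowers the llvm.debugtrap intrinsic (clang's
__builtin_debugtrap()) to INT3. As a usage sketch, RDTSC -- the first
instruction touched -- is reachable from C++ via a compiler intrinsic
(assuming GCC or Clang and <x86intrin.h>):

#include <x86intrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t start = __rdtsc();                // compiles to RDTSC (EDX:EAX)
  volatile int sink = 0;
  for (int i = 0; i < 1000; ++i) sink += i;  // work being timed
  uint64_t end = __rdtsc();
  std::printf("elapsed cycles: %llu\n",
              static_cast<unsigned long long>(end - start));
  return 0;
}
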
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 6a8f0c8..6d3548f 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -17,17 +17,17 @@
// 66 0F 38 80
def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In32BitMode]>;
def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In64BitMode]>;
// 66 0F 38 81
def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In32BitMode]>;
def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In64BitMode]>;
// 0F 01 C1
def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
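
The only change to X86InstrVMX.td is replacing the space after each mnemonic
with \t in the asm string templates, matching the convention used throughout
these files: the template carries both AT&T and Intel operand orders, and the
tab is what separates the mnemonic column from the operands when the
instruction is printed. A sketch of the convention (plain C++ string
literals, not an LLVM API):

// Template shape: "mnemonic\t{AT&T operands|Intel operands}"
const char *Before = "invept {$src2, $src1|$src1, $src2}";  // space after mnemonic
const char *After  = "invept\t{$src2, $src1|$src1, $src2}"; // tab, as the printer expects
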
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 65bbcb5..8ec2c68 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -15,7 +15,7 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int VR128:$src))]>, VEX;
- def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
}
@@ -36,27 +36,19 @@ let isAsmParserOnly = 1 in {
defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
- defm VFRCZPS : xop2op<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
- defm VFRCZPD : xop2op<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
}
// Scalar load 2 addr operand instructions
-let Constraints = "$src1 = $dst" in {
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
Operand memop, ComplexPattern mem_cpat> {
- def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1,
- VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX;
- def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1,
- memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1,
- (bitconvert mem_cpat:$src2)))]>, VEX;
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
}
-} // Constraints = "$src1 = $dst"
-
let isAsmParserOnly = 1 in {
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
ssmem, sse_load_f32>;
@@ -64,12 +56,26 @@ let isAsmParserOnly = 1 in {
sdmem, sse_load_f64>;
}
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
+}
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
+ [(set VR256:$dst, (Int VR256:$src))]>, VEX;
def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
@@ -88,13 +94,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2),
+ (ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
VEX_4V, VEX_W;
def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src1, VR128:$src2),
+ (ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
@@ -116,25 +122,23 @@ let isAsmParserOnly = 1 in {
defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
}
-multiclass xop3opimm<bits<8> opc, string OpcodeStr> {
- let neverHasSideEffects = 1 in {
- def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX;
- let mayLoad = 1 in
- def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX;
- }
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
}
let isAsmParserOnly = 1 in {
- defm VPROTW : xop3opimm<0xC1, "vprotw">;
- defm VPROTQ : xop3opimm<0xC3, "vprotq">;
- defm VPROTD : xop3opimm<0xC2, "vprotd">;
- defm VPROTB : xop3opimm<0xC0, "vprotb">;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+ defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
}
// Instruction where second source can be memory, but third must be register
@@ -146,7 +150,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR128:$dst,
(Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
@@ -170,32 +174,31 @@ let isAsmParserOnly = 1 in {
}
// Instruction where second source can be memory, third must be imm8
-multiclass xop4opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType VT> {
+multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (VT (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, VEX_4V;
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ VEX_4V;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, i8imm:$src3),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (VT (OpNode VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
- imm:$src3)))]>, VEX_4V;
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ imm:$src3))]>, VEX_4V;
}
let isAsmParserOnly = 1 in {
- defm VPCOMB : xop4opimm<0xCC, "vpcomb", X86vpcom, v16i8>;
- defm VPCOMW : xop4opimm<0xCD, "vpcomw", X86vpcom, v8i16>;
- defm VPCOMD : xop4opimm<0xCE, "vpcomd", X86vpcom, v4i32>;
- defm VPCOMQ : xop4opimm<0xCF, "vpcomq", X86vpcom, v2i64>;
- defm VPCOMUB : xop4opimm<0xEC, "vpcomub", X86vpcomu, v16i8>;
- defm VPCOMUW : xop4opimm<0xED, "vpcomuw", X86vpcomu, v8i16>;
- defm VPCOMUD : xop4opimm<0xEE, "vpcomud", X86vpcomu, v4i32>;
- defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", X86vpcomu, v2i64>;
+ defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+ defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+ defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+ defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+ defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+ defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+ defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+ defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
}
// Instruction where either second or third source can be memory
@@ -207,7 +210,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
VEX_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
@@ -215,7 +218,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
(bitconvert (memopv2i64 addr:$src3))))]>,
VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
@@ -237,7 +240,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
VEX_4V, VEX_I8IMM;
def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
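
Two things happen in the X86InstrXOP.td hunks: the 128-bit integer forms
switch their memory operands from f128mem to i128mem so they are treated as
integer memory references, and the immediate rotate and compare forms
(VPROT*, VPCOM*) gain selection patterns from the corresponding
int_x86_xop_* intrinsics instead of being assembler-only. A rough scalar
model of what the immediate form of vprotd computes per 32-bit lane (C++20,
illustrative only; the name is hypothetical):

#include <bit>
#include <cstddef>
#include <cstdint>

// Scalar model of vprotd with an immediate count: rotate each 32-bit
// element left by `count` (what the int_x86_xop_vprotdi pattern selects).
void vprotd_model(uint32_t *dst, const uint32_t *src, size_t lanes,
                  int count) {
  for (size_t i = 0; i < lanes; ++i)
    dst[i] = std::rotl(src[i], count);
}
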
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
index c76d3cc..d7c08df 100644
--- a/lib/Target/X86/X86JITInfo.h
+++ b/lib/Target/X86/X86JITInfo.h
@@ -65,7 +65,7 @@ namespace llvm {
/// referenced global symbols.
virtual void relocate(void *Function, MachineRelocation *MR,
unsigned NumRelocs, unsigned char* GOTBase);
-
+
/// allocateThreadLocalMemory - Each target has its own way of
/// handling thread local variables. This method returns a value only
/// meaningful to the target.
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index b578e8d..9c0ce4e 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -46,12 +46,12 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference");
SmallString<128> Name;
-
+
if (!MO.isGlobal()) {
assert(MO.isSymbol());
Name += MAI.getGlobalPrefix();
Name += MO.getSymbolName();
- } else {
+ } else {
const GlobalValue *GV = MO.getGlobal();
bool isImplicitlyPrivate = false;
if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
@@ -59,7 +59,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
isImplicitlyPrivate = true;
-
+
Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
}
@@ -110,7 +110,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
getMachOMMI().getFnStubEntry(Sym);
if (StubSym.getPointer())
return Sym;
-
+
if (MO.isGlobal()) {
StubSym =
MachineModuleInfoImpl::
@@ -135,7 +135,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
// lot of extra uniquing.
const MCExpr *Expr = 0;
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
-
+
switch (MO.getTargetFlags()) {
default: llvm_unreachable("Unknown target flag on GV operand");
case X86II::MO_NO_FLAG: // No flag.
@@ -144,7 +144,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case X86II::MO_DLLIMPORT:
case X86II::MO_DARWIN_STUB:
break;
-
+
case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
case X86II::MO_TLVP_PIC_BASE:
Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
@@ -156,10 +156,14 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
break;
case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+ case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+ case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+ case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+ case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
@@ -169,7 +173,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::Create(Sym, Ctx);
// Subtract the pic base.
- Expr = MCBinaryExpr::CreateSub(Expr,
+ Expr = MCBinaryExpr::CreateSub(Expr,
MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx),
Ctx);
if (MO.isJTI() && MAI.hasSetDirective()) {
@@ -183,10 +187,10 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
}
break;
}
-
+
if (Expr == 0)
Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
-
+
if (!MO.isJTI() && MO.getOffset())
Expr = MCBinaryExpr::CreateAdd(Expr,
MCConstantExpr::Create(MO.getOffset(), Ctx),
@@ -207,10 +211,10 @@ static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
// Convert registers in the addr mode according to subreg64.
for (unsigned i = 0; i != 4; ++i) {
if (!MI->getOperand(OpNo+i).isReg()) continue;
-
+
unsigned Reg = MI->getOperand(OpNo+i).getReg();
if (Reg == 0) continue;
-
+
MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
}
}
@@ -276,7 +280,7 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
return;
// Check whether this is an absolute address.
- // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+ // FIXME: We know TLVP symbol refs aren't, but there should be a better way
// to do this here.
bool Absolute = true;
if (Inst.getOperand(AddrOp).isExpr()) {
@@ -285,7 +289,7 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
Absolute = false;
}
-
+
if (Absolute &&
(Inst.getOperand(AddrBase + 0).getReg() != 0 ||
Inst.getOperand(AddrBase + 2).getReg() != 0 ||
@@ -302,10 +306,10 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
-
+
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
-
+
MCOperand MCOp;
switch (MO.getType()) {
default:
@@ -341,10 +345,10 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
// Ignore call clobbers.
continue;
}
-
+
OutMI.addOperand(MCOp);
}
-
+
// Handle a few special cases to eliminate operand modifiers.
ReSimplify:
switch (OutMI.getOpcode()) {
@@ -421,7 +425,7 @@ ReSimplify:
case X86::TAILJMPd:
case X86::TAILJMPd64: Opcode = X86::JMP_1; break;
}
-
+
MCOperand Saved = OutMI.getOperand(0);
OutMI = MCInst();
OutMI.setOpcode(Opcode);
@@ -441,7 +445,7 @@ ReSimplify:
case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
-
+
// The assembler backend wants to see branches in their small form and relax
// them to their large form. The JIT can only handle the large form because
// it does not do relaxation. For now, translate the large form to the
@@ -550,17 +554,38 @@ ReSimplify:
static void LowerTlsAddr(MCStreamer &OutStreamer,
X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
- bool is64Bits = MI.getOpcode() == X86::TLS_addr64;
+
+ bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
+
+ bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
+
MCContext &context = OutStreamer.getContext();
- if (is64Bits) {
+ if (needsPadding) {
MCInst prefix;
prefix.setOpcode(X86::DATA16_PREFIX);
OutStreamer.EmitInstruction(prefix);
}
+
+ MCSymbolRefExpr::VariantKind SRVK;
+ switch (MI.getOpcode()) {
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
+ }
+
MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
- const MCSymbolRefExpr *symRef =
- MCSymbolRefExpr::Create(sym, MCSymbolRefExpr::VK_TLSGD, context);
+ const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context);
MCInst LEA;
if (is64Bits) {
@@ -571,6 +596,14 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
LEA.addOperand(MCOperand::CreateReg(0)); // index
LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
LEA.addOperand(MCOperand::CreateReg(0)); // seg
+ } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base
+ LEA.addOperand(MCOperand::CreateImm(1)); // scale
+ LEA.addOperand(MCOperand::CreateReg(0)); // index
+ LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::CreateReg(0)); // seg
} else {
LEA.setOpcode(X86::LEA32r);
LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
@@ -582,7 +615,7 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
}
OutStreamer.EmitInstruction(LEA);
- if (is64Bits) {
+ if (needsPadding) {
MCInst prefix;
prefix.setOpcode(X86::DATA16_PREFIX);
OutStreamer.EmitInstruction(prefix);
@@ -609,8 +642,6 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
}
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
- OutStreamer.EmitCodeRegion();
-
X86MCInstLower MCInstLowering(Mang, *MF, *this);
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
@@ -646,6 +677,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TLS_addr32:
case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
return LowerTlsAddr(OutStreamer, MCInstLowering, *MI);
case X86::MOVPC32r: {
@@ -655,7 +688,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
// call "L1$pb"
// "L1$pb":
// popl %esi
-
+
// Emit the call.
MCSymbol *PICBase = MF->getPICBaseSymbol();
TmpInst.setOpcode(X86::CALLpcrel32);
@@ -664,43 +697,43 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(PICBase,
OutContext)));
OutStreamer.EmitInstruction(TmpInst);
-
+
// Emit the label.
OutStreamer.EmitLabel(PICBase);
-
+
// popl $reg
TmpInst.setOpcode(X86::POP32r);
TmpInst.getOperand(0) = MCOperand::CreateReg(MI->getOperand(0).getReg());
OutStreamer.EmitInstruction(TmpInst);
return;
}
-
+
case X86::ADD32ri: {
// Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
break;
-
+
// Okay, we have something like:
// EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
-
+
// For this, we want to print something like:
// MYGLOBAL + (. - PICBASE)
// However, we can't generate a ".", so just emit a new label here and refer
// to it.
MCSymbol *DotSym = OutContext.CreateTempSymbol();
OutStreamer.EmitLabel(DotSym);
-
+
// Now that we have emitted the label, lower the complex operand expression.
MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
-
+
const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
const MCExpr *PICBase =
MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), OutContext);
DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
-
- DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
+
+ DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
DotExpr, OutContext);
-
+
MCInst TmpInst;
TmpInst.setOpcode(X86::ADD32ri);
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
@@ -710,9 +743,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
}
-
+
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
OutStreamer.EmitInstruction(TmpInst);
}
-
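
LowerTlsAddr now handles TLS_base_addr32/64 alongside TLS_addr32/64, i.e.
the local-dynamic TLS model in addition to general-dynamic: a single
__tls_get_addr call (relocated with VK_TLSLD, or VK_TLSLDM in 32-bit mode)
fetches this module's TLS base, and each variable is then addressed with a
constant DTPOFF offset, while the data16 padding around the call is only
emitted for the 64-bit general-dynamic form (TLS_addr64). A source-level
sketch of where local-dynamic pays off (the compiler chooses the model;
this shape typically gets it for internal-linkage TLS under -fPIC):

// Built with -fPIC, both variables live in this module's TLS block, so a
// single TLS_base_addr sequence (one __tls_get_addr call) can be emitted
// and each access becomes TLS-base + constant DTPOFF offset.
static thread_local int counter = 0;
static thread_local int high_water = 0;

int bump(int n) {
  counter += n;               // first access pays for the base computation
  if (counter > high_water)
    high_water = counter;     // later accesses reuse the same TLS base
  return counter;
}
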
diff --git a/lib/Target/X86/X86MCInstLower.h b/lib/Target/X86/X86MCInstLower.h
index 40df3db..b4d4cfd 100644
--- a/lib/Target/X86/X86MCInstLower.h
+++ b/lib/Target/X86/X86MCInstLower.h
@@ -25,7 +25,7 @@ namespace llvm {
class Mangler;
class TargetMachine;
class X86AsmPrinter;
-
+
/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY X86MCInstLower {
MCContext &Ctx;
@@ -37,12 +37,12 @@ class LLVM_LIBRARY_VISIBILITY X86MCInstLower {
public:
X86MCInstLower(Mangler *mang, const MachineFunction &MF,
X86AsmPrinter &asmprinter);
-
+
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
-
+
private:
MachineModuleInfoMachO &getMachOMMI() const;
};
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index c747109..78d20ce 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -24,7 +24,7 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
virtual void anchor();
/// ForceFramePointer - True if the function is required to use a frame
- /// pointer for reasons other than it containing dynamic allocation or
+ /// pointer for reasons other than it containing dynamic allocation or
/// that FP elimination is turned off. For example, Cygwin main function
/// contains stack pointer re-alignment code which requires FP.
bool ForceFramePointer;
@@ -66,6 +66,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// ArgumentStackSize - The number of bytes on stack consumed by the arguments
/// being passed on the stack.
unsigned ArgumentStackSize;
+ /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+ unsigned NumLocalDynamics;
public:
X86MachineFunctionInfo() : ForceFramePointer(false),
@@ -79,8 +81,9 @@ public:
RegSaveFrameIndex(0),
VarArgsGPOffset(0),
VarArgsFPOffset(0),
- ArgumentStackSize(0) {}
-
+ ArgumentStackSize(0),
+ NumLocalDynamics(0) {}
+
explicit X86MachineFunctionInfo(MachineFunction &MF)
: ForceFramePointer(false),
CalleeSavedFrameSize(0),
@@ -93,9 +96,10 @@ public:
RegSaveFrameIndex(0),
VarArgsGPOffset(0),
VarArgsFPOffset(0),
- ArgumentStackSize(0) {}
-
- bool getForceFramePointer() const { return ForceFramePointer;}
+ ArgumentStackSize(0),
+ NumLocalDynamics(0) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
@@ -130,6 +134,10 @@ public:
unsigned getArgumentStackSize() const { return ArgumentStackSize; }
void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index b56025f..877b8f6 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -50,6 +50,10 @@ ForceStackAlign("force-align-stack",
" needed for the function."),
cl::init(false), cl::Hidden);
+cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames"));
+
X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
const TargetInstrInfo &tii)
: X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit()
@@ -73,6 +77,10 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
StackPtr = X86::ESP;
FramePtr = X86::EBP;
}
+ // Use a callee-saved register as the base pointer. These registers must
+ // not conflict with any ABI requirements. For example, in 32-bit PIC mode
+ // the GOT address must be in EBX before function calls through the PLT.
+ BasePtr = Is64Bit ? X86::RBX : X86::ESI;
}
/// getCompactUnwindRegNum - This function maps the register to the number for
@@ -90,6 +98,12 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
return -1;
}
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // Only enable when post-RA scheduling is enabled and this is needed.
+ return TM.getSubtargetImpl()->postRAScheduler();
+}
+
int
X86RegisterInfo::getSEHRegNum(unsigned i) const {
int reg = X86_MC::getX86RegNum(i);
@@ -146,7 +160,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
// The GR8_NOREX class is always used in a way that won't be constrained to a
// sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
// full GR8 class.
- if (RC == X86::GR8_NOREXRegisterClass)
+ if (RC == &X86::GR8_NOREXRegClass)
return RC;
const TargetRegisterClass *Super = RC;
@@ -175,7 +189,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
}
const TargetRegisterClass *
-X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
switch (Kind) {
default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
case 0: // Normal GPRs.
@@ -238,7 +253,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
if (ghcCall)
- return CSR_Ghc_SaveList;
+ return CSR_NoRegs_SaveList;
if (Is64Bit) {
if (IsWin64)
return CSR_Win64_SaveList;
@@ -254,7 +269,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
if (CC == CallingConv::GHC)
- return CSR_Ghc_RegMask;
+ return CSR_NoRegs_RegMask;
if (!Is64Bit)
return CSR_32_RegMask;
if (IsWin64)
@@ -268,21 +283,33 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Set the stack-pointer register and its aliases as reserved.
Reserved.set(X86::RSP);
- Reserved.set(X86::ESP);
- Reserved.set(X86::SP);
- Reserved.set(X86::SPL);
+ for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+ Reserved.set(*I);
// Set the instruction pointer register and its aliases as reserved.
Reserved.set(X86::RIP);
- Reserved.set(X86::EIP);
- Reserved.set(X86::IP);
+ for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+ Reserved.set(*I);
// Set the frame-pointer register and its aliases as reserved if needed.
if (TFI->hasFP(MF)) {
Reserved.set(X86::RBP);
- Reserved.set(X86::EBP);
- Reserved.set(X86::BP);
- Reserved.set(X86::BPL);
+ for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ const uint32_t* RegMask = getCallPreservedMask(CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+ "Stack realignment in presence of dynamic allocas is not supported with"
+ "this calling convention.");
+
+ Reserved.set(getBaseRegister());
+ for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+ Reserved.set(*I);
}
// Mark the segment registers as reserved.
@@ -293,6 +320,16 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::FS);
Reserved.set(X86::GS);
+ // Mark the floating point stack registers as reserved.
+ Reserved.set(X86::ST0);
+ Reserved.set(X86::ST1);
+ Reserved.set(X86::ST2);
+ Reserved.set(X86::ST3);
+ Reserved.set(X86::ST4);
+ Reserved.set(X86::ST5);
+ Reserved.set(X86::ST6);
+ Reserved.set(X86::ST7);
+
// Reserve the registers that only exist in 64-bit mode.
if (!Is64Bit) {
// These 8-bit registers are part of the x86-64 extension even though their
@@ -308,14 +345,13 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
X86::R8, X86::R9, X86::R10, X86::R11,
X86::R12, X86::R13, X86::R14, X86::R15
};
- for (const uint16_t *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI)
- Reserved.set(Reg);
+ for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
// XMM8, XMM9, ...
assert(X86::XMM15 == X86::XMM8+7);
- for (const uint16_t *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI;
- ++AI)
- Reserved.set(Reg);
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
}
}
@@ -326,10 +362,36 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment and there are dynamic allocas, we can't
+ // reference off of the stack pointer, so we reserve a base pointer.
+ if (needsStackRealignment(MF) && MFI->hasVarSizedObjects())
+ return true;
+
+ return false;
+}
+
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- return (MF.getTarget().Options.RealignStack &&
- !MFI->hasVarSizedObjects());
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ if (!MF.getTarget().Options.RealignStack)
+ return false;
+
+ // Stack realignment requires a frame pointer. If we already started
+ // register allocation with frame pointer elimination, it is too late now.
+ if (!MRI->canReserveReg(FramePtr))
+ return false;
+
+ // If a base pointer is necessary, check that it isn't too late to
+ // reserve it.
+ if (MFI->hasVarSizedObjects())
+ return MRI->canReserveReg(BasePtr);
+ return true;
}
bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
@@ -339,13 +401,6 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
F->hasFnAttr(Attribute::StackAlignment));
- // FIXME: Currently we don't support stack realignment for functions with
- // variable-sized allocas.
- // FIXME: It's more complicated than this...
- if (0 && requiresRealignment && MFI->hasVarSizedObjects())
- report_fatal_error(
- "Stack realignment in presence of dynamic allocas is not supported");
-
// If we've requested that we force align the stack do so now.
if (ForceStackAlign)
return canRealignStack(MF);
@@ -485,7 +540,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned Opc = MI.getOpcode();
bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
- if (needsStackRealignment(MF))
+ if (hasBasePointer(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+ else if (needsStackRealignment(MF))
BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
else if (AfterFPPop)
BasePtr = StackPtr;
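
hasBasePointer() above fires only when a function both needs stack
realignment and has variable-sized frame objects: realignment makes offsets
from the frame pointer to locals non-constant, and dynamic allocas make
offsets from the stack pointer non-constant, so a third register (RBX in
64-bit mode, ESI in 32-bit) anchors the fixed part of the frame. A sketch of
a function that would take this path (assuming GCC or Clang;
__builtin_alloca is compiler-specific):

#include <cstring>

// alignas(64) exceeds the default 16-byte stack alignment, forcing
// realignment; __builtin_alloca gives the frame a variable-sized object.
// Together these make eliminateFrameIndex address locals off the base
// pointer instead of SP or FP.
int needs_base_pointer(int n) {
  alignas(64) int aligned_buf[16] = {0};
  int *dynamic = static_cast<int *>(__builtin_alloca(n * sizeof(int)));
  std::memset(dynamic, 0, n * sizeof(int));
  for (int i = 0; i < n; ++i)
    dynamic[i] = aligned_buf[i % 16] + i;
  return dynamic[n - 1];
}
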
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index bee0393..1bc32cb 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -50,6 +50,11 @@ private:
///
unsigned FramePtr;
+ /// BasePtr - X86 physical register used as a base pointer in complex stack
+ /// frames, i.e., when a third base register beyond SP and FP is needed
+ /// because of variable-sized stack objects.
+ unsigned BasePtr;
+
public:
X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
@@ -65,7 +70,8 @@ public:
int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const;
/// Code Generation virtual methods...
- ///
+ ///
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
/// getMatchingSuperRegClass - Return a subclass of the specified register
/// class A so that each register in it has a sub-register of the
@@ -82,7 +88,8 @@ public:
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
/// values.
- const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
/// getCrossCopyRegClass - Returns a legal register class to copy a register
/// in the specified class to or from. Returns NULL if it is possible to copy
@@ -104,6 +111,8 @@ public:
/// register scavenger to determine what registers are free.
BitVector getReservedRegs(const MachineFunction &MF) const;
+ bool hasBasePointer(const MachineFunction &MF) const;
+
bool canRealignStack(const MachineFunction &MF) const;
bool needsStackRealignment(const MachineFunction &MF) const;
@@ -121,6 +130,7 @@ public:
// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const;
unsigned getStackRegister() const { return StackPtr; }
+ unsigned getBaseRegister() const { return BasePtr; }
// FIXME: Move to FrameInfo
unsigned getSlotSize() const { return SlotSize; }
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 5263a49..edc7184 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -23,9 +23,6 @@ let Namespace = "X86" in {
def sub_8bit_hi : SubRegIndex;
def sub_16bit : SubRegIndex;
def sub_32bit : SubRegIndex;
-
- def sub_ss : SubRegIndex;
- def sub_sd : SubRegIndex;
def sub_xmm : SubRegIndex;
@@ -163,8 +160,6 @@ let Namespace = "X86" in {
def FP6 : Register<"fp6">;
// XMM Registers, used by the various SSE instruction set extensions.
- // The sub_ss and sub_sd subregs are the same registers with another regclass.
- let CompositeIndices = [(sub_ss), (sub_sd)] in {
def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>;
def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>;
def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>;
@@ -184,7 +179,7 @@ let Namespace = "X86" in {
def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
- }}
+ } // CostPerUse
// YMM Registers, used by AVX instructions
let SubRegIndices = [sub_xmm] in {
@@ -223,6 +218,9 @@ let Namespace = "X86" in {
def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>;
def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>;
+ // Floating-point status word
+ def FPSW : Register<"fpsw">;
+
// Status flags register
def EFLAGS : Register<"flags">;
@@ -296,26 +294,18 @@ def GR8 : RegisterClass<"X86", [i8], 8,
def GR16 : RegisterClass<"X86", [i16], 16,
(add AX, CX, DX, SI, DI, BX, BP, SP,
- R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)];
-}
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
def GR32 : RegisterClass<"X86", [i32], 32,
(add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
- R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
// RIP isn't really a register and it can't be used anywhere except in an
// address, but it doesn't cause trouble.
def GR64 : RegisterClass<"X86", [i64], 64,
(add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
- RBX, R14, R15, R12, R13, RBP, RSP, RIP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
- (GR16 sub_16bit),
- (GR32 sub_32bit)];
-}
+ RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
// Segment registers for use by MOV instructions (and others) that have a
// segment register as one operand. Always contain a 16-bit segment
@@ -336,30 +326,12 @@ def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
// operations.
def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
-def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)];
-}
-def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit),
- (GR8_ABCD_H sub_8bit_hi),
- (GR16_ABCD sub_16bit)];
-}
-def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit),
- (GR8_ABCD_H sub_8bit_hi),
- (GR16_ABCD sub_16bit),
- (GR32_ABCD sub_32bit)];
-}
-def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
- R8, R9, R11, RIP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
- (GR16 sub_16bit),
- (GR32_TC sub_32bit)];
-}
-
+ R8, R9, R11, RIP)>;
def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
R8, R9, R11)>;
@@ -373,64 +345,36 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8,
}
// GR16_NOREX - GR16 registers which do not require a REX prefix.
def GR16_NOREX : RegisterClass<"X86", [i16], 16,
- (add AX, CX, DX, SI, DI, BX, BP, SP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)];
-}
+ (add AX, CX, DX, SI, DI, BX, BP, SP)>;
// GR32_NOREX - GR32 registers which do not require a REX prefix.
def GR32_NOREX : RegisterClass<"X86", [i32], 32,
- (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit)];
-}
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
// GR64_NOREX - GR64 registers which do not require a REX prefix.
def GR64_NOREX : RegisterClass<"X86", [i64], 64,
- (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit),
- (GR32_NOREX sub_32bit)];
-}
+ (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
// to clear upper 32-bits of RAX so is not a NOP.
-def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
// GR32_NOSP - GR32 registers except ESP.
-def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
// GR64_NOSP - GR64 registers except RSP (and RIP).
-def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
- (GR16 sub_16bit),
- (GR32_NOSP sub_32bit)];
-}
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
// ESP.
def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
- (and GR32_NOREX, GR32_NOSP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit)];
-}
+ (and GR32_NOREX, GR32_NOSP)>;
// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
- (and GR64_NOREX, GR64_NOSP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit),
- (GR32_NOREX_NOSP sub_32bit)];
-}
+ (and GR64_NOREX, GR64_NOSP)>;
// A class to support the 'A' assembler constraint: EAX then EDX.
-def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit),
- (GR8_ABCD_H sub_8bit_hi),
- (GR16_ABCD sub_16bit)];
-}
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
@@ -458,17 +402,16 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
// Generic vector registers: VR64 and VR128.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- 128, (add FR32)> {
- let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)];
-}
-
+ 128, (add FR32)>;
def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- 256, (sequence "YMM%u", 0, 15)> {
- let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)];
-}
+ 256, (sequence "YMM%u", 0, 15)>;
// Status flags registers.
def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
let CopyCost = -1; // Don't allow copying of status registers.
let isAllocatable = 0;
}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
index 857becf..0333056 100644
--- a/lib/Target/X86/X86Relocations.h
+++ b/lib/Target/X86/X86Relocations.h
@@ -21,7 +21,7 @@ namespace llvm {
/// RelocationType - An enum for the x86 relocation codes. Note that
/// the terminology here doesn't follow x86 convention - word means
/// 32-bit and dword means 64-bit. The relocations will be treated
- /// by JIT or ObjectCode emitters, this is transparent to the x86 code
+ /// by JIT or ObjectCode emitters; this is transparent to the x86 code
/// emitter but JIT and ObjectCode will treat them differently
enum RelocationType {
/// reloc_pcrel_word - PC relative relocation, add the relocated value to
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 17f4efd..c14407f 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Instruction Itinerary classes used for X86
+// Instruction Itinerary classes used for X86
def IIC_DEFAULT : InstrItinClass;
def IIC_ALU_MEM : InstrItinClass;
def IIC_ALU_NONMEM : InstrItinClass;
@@ -253,6 +253,42 @@ def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
def IIC_CMPX_LOCK : InstrItinClass;
def IIC_CMPX_LOCK_8 : InstrItinClass;
def IIC_CMPX_LOCK_8B : InstrItinClass;
@@ -261,13 +297,185 @@ def IIC_CMPX_LOCK_16B : InstrItinClass;
def IIC_XADD_LOCK_MEM : InstrItinClass;
def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BSF : InstrItinClass;
+def IIC_BSR : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
-def GenericItineraries : ProcessorItineraries<[], [], []>;
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendants, including Nehalem and SandyBridge, have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are buffered
+// so adjacent micro-ops don't directly compete.
+//
+// MinLatency=0 indicates that RAW dependencies can be decoded in the
+// same cycle.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericModel contains no instruction itineraries.
+def GenericModel : SchedMachineModel {
+ let IssueWidth = 4;
+ let MinLatency = 0;
+ let LoadLatency = 4;
+ let HighLatency = 10;
+}
include "X86ScheduleAtom.td"
-
-
-
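A hedged C++ reduction of how these four knobs are typically consumed (illustrative only; the real consumers are the machine scheduler and the TableGen'd itineraries): an explicit InstrItinData entry wins, otherwise the model supplies flat defaults:

  struct MachineModel {
    unsigned IssueWidth;  // instructions the decoders accept per cycle
    unsigned MinLatency;  // 0 => dependent ops may issue in the same cycle
    unsigned LoadLatency; // default def-to-use latency assumed for loads
    unsigned HighLatency; // pessimistic default for known-slow opcodes
  };

  // Stub standing in for an itinerary lookup: -1 means no entry defined.
  int itineraryLatency(unsigned Opcode) { (void)Opcode; return -1; }

  unsigned expectedLatency(const MachineModel &M, unsigned Opcode,
                           bool IsLoad, bool IsHighLatencyDef) {
    int It = itineraryLatency(Opcode);
    if (It >= 0) return (unsigned)It;  // explicit InstrItinData entry
    if (IsHighLatencyDef) return M.HighLatency;
    if (IsLoad) return M.LoadLatency;
    return M.MinLatency;               // 0 for GenericModel
  }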
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 77d4e56..8710261 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -106,7 +106,7 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
// set
- InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
// jcc
InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
@@ -294,12 +294,237 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+ // MMX MOVs
+ InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // other MMX
+ InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+ // conversions
+ // from/to PD
+ InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // from/to PI
+ InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>]>,
+
InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >
+ InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+ InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+ InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+ // System instructions
+ InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+ InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+ InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+ InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >,
+ InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+ InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+ // worst case for mov REG_CRx
+ InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+ // LAR
+ InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >,
+ // LSL
+ InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+ // push control register, segment registers
+ InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+ // pop control register, segment registers
+ InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+ // VERR, VERW
+ InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+ // WRMSR, RDMSR
+ InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+ // SMSW, LMSW
+ InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+ InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+ InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
]>;
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MinLatency = 1; // InstrStage cycles override MinLatency.
+ // OperandCycles may be used for expected latency.
+ let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
+ let HighLatency = 30; // Expected, may be overridden by OperandCycles.
+
+ let Itineraries = AtomItineraries;
+}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 9a04e35..00edcbc 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -38,7 +38,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
// If to a segment-relative address space, use the default lowering.
if (DstPtrInfo.getAddrSpace() >= 256)
return SDValue();
-
+
// If not DWORD aligned or size is more than the threshold, call the library.
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
@@ -62,13 +62,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
Args.push_back(Entry);
Entry.Node = Size;
Args.push_back(Entry);
- std::pair<SDValue,SDValue> CallResult =
- TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()),
false, false, false, false,
0, CallingConv::C, /*isTailCall=*/false,
/*doesNotRet=*/false, /*isReturnValueUsed=*/false,
DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
DAG, dl);
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(CLI);
return CallResult.second;
}
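This hunk (and the matching XCore hunks later in the patch) is the parameter-object refactoring: the many-argument LowerCallTo overload becomes a CallLoweringInfo bundle plus a one-argument call, so options can grow without touching every call site. A self-contained sketch of the pattern with hypothetical names (not the LLVM types):

  #include <string>
  #include <utility>

  struct CallInfo {              // stands in for CallLoweringInfo
    std::string Callee;
    bool IsTailCall = false;
    bool DoesNotReturn = false;
    bool IsReturnValueUsed = true;
  };

  std::pair<int, int> lowerCallTo(const CallInfo &CI) {
    // One bag of options replaces a dozen positional parameters.
    return {0, CI.IsReturnValueUsed ? 1 : 0};
  }

  int main() {
    CallInfo CI;
    CI.Callee = "bzero";
    return lowerCallTo(CI).second;
  }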
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index ed1a409..9087852 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -39,10 +39,10 @@ unsigned char X86Subtarget::
ClassifyBlockAddressReference() const {
if (isPICStyleGOT()) // 32-bit ELF targets.
return X86II::MO_GOTOFF;
-
+
if (isPICStyleStubPIC()) // Darwin/32 in PIC mode.
return X86II::MO_PIC_BASE_OFFSET;
-
+
// Direct static reference to label.
return X86II::MO_NO_FLAG;
}
@@ -69,7 +69,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// Large model never uses stubs.
if (TM.getCodeModel() == CodeModel::Large)
return X86II::MO_NO_FLAG;
-
+
if (isTargetDarwin()) {
// If symbol visibility is hidden, the extra load is not needed if
// target is x86-64 or the symbol is definitely defined in the current
@@ -87,18 +87,18 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
return X86II::MO_NO_FLAG;
}
-
+
if (isPICStyleGOT()) { // 32-bit ELF targets.
// Extra load is needed for all externally visible.
if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
return X86II::MO_GOTOFF;
return X86II::MO_GOT;
}
-
+
if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode.
// Determine whether we have a stub reference and/or whether the reference
// is relative to the PIC base or not.
-
+
// If this is a strong reference to a definition, it is definitely not
// through a stub.
if (!isDecl && !GV->isWeakForLinker())
@@ -108,26 +108,26 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// normal $non_lazy_ptr stub because this symbol might be resolved late.
if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
-
+
// If symbol visibility is hidden, we have a stub for common symbol
// references and external declarations.
if (isDecl || GV->hasCommonLinkage()) {
// Hidden $non_lazy_ptr reference.
return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
}
-
+
// Otherwise, no stub.
return X86II::MO_PIC_BASE_OFFSET;
}
-
+
if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode.
// Determine whether we have a stub reference.
-
+
// If this is a strong reference to a definition, it is definitely not
// through a stub.
if (!isDecl && !GV->isWeakForLinker())
return X86II::MO_NO_FLAG;
-
+
// Unless we have a symbol with hidden visibility, we have to go through a
// normal $non_lazy_ptr stub because this symbol might be resolved late.
if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
@@ -136,7 +136,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// Otherwise, no stub.
return X86II::MO_NO_FLAG;
}
-
+
// Direct static reference to global.
return X86II::MO_NO_FLAG;
}
@@ -196,33 +196,32 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
- // FIXME: AVX codegen support is not ready.
- //if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); }
+ if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); }
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
- if (IsIntel && ((ECX >> 1) & 0x1)) {
- HasCLMUL = true;
- ToggleFeature(X86::FeatureCLMUL);
+ if ((ECX >> 1) & 0x1) {
+ HasPCLMUL = true;
+ ToggleFeature(X86::FeaturePCLMUL);
}
- if (IsIntel && ((ECX >> 12) & 0x1)) {
- HasFMA3 = true;
- ToggleFeature(X86::FeatureFMA3);
+ if ((ECX >> 12) & 0x1) {
+ HasFMA = true;
+ ToggleFeature(X86::FeatureFMA);
}
if (IsIntel && ((ECX >> 22) & 0x1)) {
HasMOVBE = true;
ToggleFeature(X86::FeatureMOVBE);
}
- if (IsIntel && ((ECX >> 23) & 0x1)) {
+ if ((ECX >> 23) & 0x1) {
HasPOPCNT = true;
ToggleFeature(X86::FeaturePOPCNT);
}
- if (IsIntel && ((ECX >> 25) & 0x1)) {
+ if ((ECX >> 25) & 0x1) {
HasAES = true;
ToggleFeature(X86::FeatureAES);
}
- if (IsIntel && ((ECX >> 29) & 0x1)) {
+ if ((ECX >> 29) & 0x1) {
HasF16C = true;
ToggleFeature(X86::FeatureF16C);
}
@@ -247,15 +246,22 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
}
// If it's Nehalem, unaligned memory access is fast.
- // FIXME: Nehalem is family 6. Also include Westmere and later processors?
- if (Family == 15 && Model == 26) {
+ // Include Westmere and Sandy Bridge as well.
+ // FIXME: add later processors.
+ if (IsIntel && ((Family == 6 && Model == 26) ||
+ (Family == 6 && Model == 44) ||
+ (Family == 6 && Model == 42))) {
IsUAMemFast = true;
ToggleFeature(X86::FeatureFastUAMem);
}
// Set processor type. Currently only Atom is detected.
- if (Family == 6 && Model == 28) {
+ if (Family == 6 &&
+ (Model == 28 || Model == 38 || Model == 39
+ || Model == 53 || Model == 54)) {
X86ProcFamily = IntelAtom;
+
+ UseLeaForSP = true;
ToggleFeature(X86::FeatureLeaForSP);
}
@@ -289,9 +295,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
}
}
- if (IsIntel && MaxLevel >= 7) {
+ if (MaxLevel >= 7) {
if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) {
- if (EBX & 0x1) {
+ if (IsIntel && (EBX & 0x1)) {
HasFSGSBase = true;
ToggleFeature(X86::FeatureFSGSBase);
}
@@ -299,12 +305,11 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasBMI = true;
ToggleFeature(X86::FeatureBMI);
}
- // FIXME: AVX2 codegen support is not ready.
- //if ((EBX >> 5) & 0x1) {
- // X86SSELevel = AVX2;
- // ToggleFeature(X86::FeatureAVX2);
- //}
- if ((EBX >> 8) & 0x1) {
+ if (IsIntel && ((EBX >> 5) & 0x1)) {
+ X86SSELevel = AVX2;
+ ToggleFeature(X86::FeatureAVX2);
+ }
+ if (IsIntel && ((EBX >> 8) & 0x1)) {
HasBMI2 = true;
ToggleFeature(X86::FeatureBMI2);
}
@@ -313,7 +318,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
}
X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS,
+ const std::string &FS,
unsigned StackAlignOverride, bool is64Bit)
: X86GenSubtargetInfo(TT, CPU, FS)
, X86ProcFamily(Others)
@@ -325,8 +330,8 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, HasPOPCNT(false)
, HasSSE4A(false)
, HasAES(false)
- , HasCLMUL(false)
- , HasFMA3(false)
+ , HasPCLMUL(false)
+ , HasFMA(false)
, HasFMA4(false)
, HasXOP(false)
, HasMOVBE(false)
@@ -395,10 +400,10 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
}
}
- if (X86ProcFamily == IntelAtom) {
+ if (X86ProcFamily == IntelAtom)
PostRAScheduler = true;
- InstrItins = getInstrItineraryForCPU(CPUName);
- }
+
+ InstrItins = getInstrItineraryForCPU(CPUName);
// It's important to keep the MCSubtargetInfo feature bits in sync with
// target data structure which is shared with MC code emitter, etc.
@@ -424,9 +429,7 @@ bool X86Subtarget::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const {
- //TODO: change back to ANTIDEP_CRITICAL when the
- // X86 subtarget properly sets up post RA liveness.
- Mode = TargetSubtargetInfo::ANTIDEP_NONE;
+ Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
CriticalPathRCs.clear();
return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
}
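The detection changes above drop most of the IsIntel gates, rename CLMUL/FMA3 to PCLMUL/FMA, and enable AVX/AVX2. A hedged sketch of the raw CPUID reads they rest on (GCC/Clang inline asm, x86 only; the bit positions are copied from the hunk, everything else is assumed):

  #include <cstdio>

  static void cpuid(unsigned Leaf, unsigned &A, unsigned &B,
                    unsigned &C, unsigned &D) {
    __asm__ __volatile__("cpuid"
                         : "=a"(A), "=b"(B), "=c"(C), "=d"(D)
                         : "a"(Leaf), "c"(0u));
  }

  int main() {
    unsigned A, B, C, D;
    cpuid(1, A, B, C, D);  // leaf 1: family/model in EAX, features in ECX
    unsigned Family = (A >> 8) & 0xf;
    unsigned Model = (A >> 4) & 0xf;
    if (Family == 6 || Family == 0xf)
      Model += ((A >> 16) & 0xf) << 4;  // extended model bits
    std::printf("family %u model %u PCLMUL:%u FMA:%u POPCNT:%u AES:%u AVX:%u\n",
                Family, Model, (C >> 1) & 1, (C >> 12) & 1,
                (C >> 23) & 1, (C >> 25) & 1, (C >> 28) & 1);
    return 0;
  }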
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 7fd832b..6841c5b 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -55,7 +55,7 @@ protected:
/// X86ProcFamily - X86 processor family: Intel Atom, and others
X86ProcFamilyEnum X86ProcFamily;
-
+
/// PICStyle - Which PIC style to use
///
PICStyles::Style PICStyle;
@@ -85,11 +85,11 @@ protected:
/// HasAES - Target has AES instructions
bool HasAES;
- /// HasCLMUL - Target has carry-less multiplication
- bool HasCLMUL;
+ /// HasPCLMUL - Target has carry-less multiplication
+ bool HasPCLMUL;
- /// HasFMA3 - Target has 3-operand fused multiply-add
- bool HasFMA3;
+ /// HasFMA - Target has 3-operand fused multiply-add
+ bool HasFMA;
/// HasFMA4 - Target has 4-operand fused multiply-add
bool HasFMA4;
@@ -149,7 +149,7 @@ protected:
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
-
+
/// Instruction itineraries for scheduling
InstrItineraryData InstrItins;
@@ -203,8 +203,8 @@ public:
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasPOPCNT() const { return HasPOPCNT; }
bool hasAES() const { return HasAES; }
- bool hasCLMUL() const { return HasCLMUL; }
- bool hasFMA3() const { return HasFMA3; }
+ bool hasPCLMUL() const { return HasPCLMUL; }
+ bool hasFMA() const { return HasFMA; }
bool hasFMA4() const { return HasFMA4; }
bool hasXOP() const { return HasXOP; }
bool hasMOVBE() const { return HasMOVBE; }
@@ -307,6 +307,8 @@ public:
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const;
+ bool postRAScheduler() const { return PostRAScheduler; }
+
/// getInstrItins - Return the instruction itineraries based on the
/// subtarget selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 89c3884..b7ba568 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -140,39 +140,48 @@ public:
} // namespace
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new X86PassConfig(this, PM);
+ X86PassConfig *PC = new X86PassConfig(this, PM);
+
+ if (Subtarget.hasCMov())
+ PC->enablePass(&EarlyIfConverterID);
+
+ return PC;
}
bool X86PassConfig::addInstSelector() {
// Install an instruction selector.
- PM->add(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+ addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+ // For ELF, cleanup any local-dynamic TLS accesses.
+ if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+ addPass(createCleanupLocalDynamicTLSPass());
// For 32-bit, prepend instructions to set the "global base reg" for PIC.
if (!getX86Subtarget().is64Bit())
- PM->add(createGlobalBaseRegPass());
+ addPass(createGlobalBaseRegPass());
return false;
}
bool X86PassConfig::addPreRegAlloc() {
- PM->add(createX86MaxStackAlignmentHeuristicPass());
+ addPass(createX86MaxStackAlignmentHeuristicPass());
return false; // -print-machineinstr shouldn't print after this.
}
bool X86PassConfig::addPostRegAlloc() {
- PM->add(createX86FloatingPointStackifierPass());
+ addPass(createX86FloatingPointStackifierPass());
return true; // -print-machineinstr should print after this.
}
bool X86PassConfig::addPreEmitPass() {
bool ShouldPrint = false;
if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
- PM->add(createExecutionDependencyFixPass(&X86::VR128RegClass));
+ addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
ShouldPrint = true;
}
if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
- PM->add(createX86IssueVZeroUpperPass());
+ addPass(createX86IssueVZeroUpperPass());
ShouldPrint = true;
}
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 718f35e..92aee0d 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -9,16 +9,19 @@
#include "X86TargetObjectFile.h"
#include "X86TargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
using namespace llvm;
using namespace dwarf;
-const MCExpr *X8664_MachoTargetObjectFile::
+const MCExpr *X86_64MachoTargetObjectFile::
getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI, unsigned Encoding,
MCStreamer &Streamer) const {
@@ -37,8 +40,14 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer);
}
-MCSymbol *X8664_MachoTargetObjectFile::
+MCSymbol *X86_64MachoTargetObjectFile::
getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI) const {
return Mang->getSymbol(GV);
}
+
+void
+X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
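The only thing the new X86LinuxTargetObjectFile does is pass TM.Options.UseInitArray to InitializeELF, which selects .init_array/.fini_array over the legacy .ctors/.dtors sections for static constructors. A hedged way to observe the effect (hypothetical test file; -fuse-init-array is the matching clang driver flag):

  // ctor.cpp: build with and without -fuse-init-array, then compare
  //   objdump -h ctor.o | grep -E 'init_array|ctors'
  struct Announce {
    Announce() {}  // runs before main, via .init_array or .ctors
  };
  static Announce TheAnnouncer;

  int main() { return 0; }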
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index a02a368..2d320c5 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -16,9 +16,9 @@
namespace llvm {
- /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin
+ /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
/// x86-64.
- class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
public:
virtual const MCExpr *
getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
@@ -32,6 +32,12 @@ namespace llvm {
MachineModuleInfo *MMI) const;
};
+ /// X86LinuxTargetObjectFile - This implementation is used for linux x86
+ /// and x86-64.
+ class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+ virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ };
+
} // end namespace llvm
#endif
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 2fd78a7..80b75dc 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -145,7 +145,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
// to insert any VZEROUPPER instructions. This is constant-time, so it is
// cheap in the common case of no ymm use.
bool YMMUsed = false;
- const TargetRegisterClass *RC = X86::VR256RegisterClass;
+ const TargetRegisterClass *RC = &X86::VR256RegClass;
for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
i != e; i++) {
if (MRI.isPhysRegUsed(*i)) {
@@ -205,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
}
- // The entry MBB for the function may set the inital state to dirty if
+ // The entry MBB for the function may set the initial state to dirty if
// the function receives any YMM incoming arguments
if (MBB == MF.begin()) {
EntryState = ST_CLEAN;
@@ -222,7 +222,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
DebugLoc dl = I->getDebugLoc();
bool isControlFlow = MI->isCall() || MI->isReturn();
- // Shortcut: don't need to check regular instructions in dirty state.
+ // Shortcut: don't need to check regular instructions in dirty state.
if (!isControlFlow && CurState == ST_DIRTY)
continue;
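For context on the state being tracked here: the pass marks a block dirty once the upper halves of the YMM registers may hold data, and inserts VZEROUPPER before calls and returns while dirty, avoiding the AVX-to-legacy-SSE transition penalty. A hedged reduction of that state machine (illustrative, not the pass's real interface):

  enum State { ST_CLEAN, ST_DIRTY };

  // One instruction step; InsertVZU is set when a vzeroupper must be
  // placed before a call/return issued in the dirty state.
  State step(State S, bool WritesYMM, bool IsCallOrReturn, bool &InsertVZU) {
    InsertVZU = (S == ST_DIRTY) && IsCallOrReturn;
    if (InsertVZU)
      S = ST_CLEAN;   // vzeroupper clears the upper lanes
    if (WritesYMM)
      S = ST_DIRTY;   // any YMM def dirties the state again
    return S;
  }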
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index 0d59572..ca94f03 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -22,5 +22,7 @@ add_llvm_target(XCoreCodeGen
XCoreSelectionDAGInfo.cpp
)
+add_dependencies(LLVMXCoreCodeGen intrinsics_gen)
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 8906b24..c76866f 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -18,9 +18,9 @@
#include "XCoreSubtarget.h"
#include "XCoreTargetMachine.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -260,7 +260,17 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,const char *ExtraCode,
raw_ostream &O) {
- printOperand(MI, OpNo, O);
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand.
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ }
+ }
+
+ printOperand(MI, OpNo, O);
return false;
}
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 50fda58..a4e5647 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -78,8 +78,7 @@ static void storeToStack(MachineBasicBlock &MBB,
//===----------------------------------------------------------------------===//
XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0),
- STI(sti) {
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) {
// Do nothing
}
@@ -341,7 +340,7 @@ XCoreFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
MachineFrameInfo *MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
- const TargetRegisterClass *RC = XCore::GRRegsRegisterClass;
+ const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
if (LRUsed) {
MF.getRegInfo().setPhysRegUnused(XCore::LR);
@@ -372,8 +371,3 @@ XCoreFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
false));
}
}
-
-void XCoreFrameLowering::
-processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
-
-}
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 4c51aa5..db1bbb6 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -22,7 +22,6 @@ namespace llvm {
class XCoreSubtarget;
class XCoreFrameLowering: public TargetFrameLowering {
- const XCoreSubtarget &STI;
public:
XCoreFrameLowering(const XCoreSubtarget &STI);
@@ -45,8 +44,6 @@ namespace llvm {
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
- void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
-
//! Stack slot size (4 bytes)
static int stackSlotSize() {
return 4;
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index fdf2b78..8643ffc 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -66,7 +66,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
Subtarget(*XTM.getSubtargetImpl()) {
// Set up the register classes.
- addRegisterClass(MVT::i32, XCore::GRRegsRegisterClass);
+ addRegisterClass(MVT::i32, &XCore::GRRegsRegClass);
// Compute derived properties from the register classes
computeRegisterProperties();
@@ -485,12 +485,12 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Entry.Node = BasePtr;
Args.push_back(Entry);
- std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, IntPtrTy, false, false,
+ TargetLowering::CallLoweringInfo CLI(Chain, IntPtrTy, false, false,
false, false, 0, CallingConv::C, /*isTailCall=*/false,
/*doesNotRet=*/false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
Args, DAG, DL);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ops[] =
{ CallResult.first, CallResult.second };
@@ -547,12 +547,13 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
Entry.Node = Value;
Args.push_back(Entry);
- std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false,
+ TargetLowering::CallLoweringInfo CLI(Chain,
+ Type::getVoidTy(*DAG.getContext()), false, false,
false, false, 0, CallingConv::C, /*isTailCall=*/false,
/*doesNotRet=*/false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
}
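Both misaligned-access hunks above now build their libcall through CallLoweringInfo; at the source level the lowering amounts to calls into the XCore runtime helpers (signatures assumed from the symbol names in the hunks, not taken from a header):

  extern "C" int __misaligned_load(void *Addr);          // assumed signature
  extern "C" void __misaligned_store(void *Addr, int V); // assumed signature

  int copy_misaligned_word(char *Dst, char *Src) {
    int V = __misaligned_load(Src + 1);  // pointer with no alignment guarantee
    __misaligned_store(Dst + 1, V);
    return V;
  }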
@@ -873,14 +874,19 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
/// XCore call implementation
SDValue
-XCoreTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool isVarArg = CLI.IsVarArg;
+
// XCore target does not yet support tail call optimization.
isTailCall = false;
@@ -913,7 +919,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// The ABI dictates there should be one stack slot available to the callee
// on function entry (for saving lr).
@@ -1036,7 +1042,7 @@ XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
@@ -1096,7 +1102,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
@@ -1121,8 +1127,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
llvm_unreachable(0);
}
case MVT::i32:
- unsigned VReg = RegInfo.createVirtualRegister(
- XCore::GRRegsRegisterClass);
+ unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
}
@@ -1172,8 +1177,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
offset -= StackSlotSize;
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
// Move argument from phys reg -> virt reg
- unsigned VReg = RegInfo.createVirtualRegister(
- XCore::GRRegsRegisterClass);
+ unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
RegInfo.addLiveIn(ArgRegs[i], VReg);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
// Move argument from virt reg -> stack
@@ -1201,7 +1205,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
bool XCoreTargetLowering::
CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
@@ -1222,7 +1226,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
// Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
@@ -1606,12 +1610,12 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
std::pair<unsigned, const TargetRegisterClass*>
XCoreTargetLowering::
getRegForInlineAsmConstraint(const std::string &Constraint,
- EVT VT) const {
+ EVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default : break;
case 'r':
- return std::make_pair(0U, XCore::GRRegsRegisterClass);
+ return std::make_pair(0U, &XCore::GRRegsRegClass);
}
}
// Use the default implementation in TargetLowering to convert the register
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 0b63ecd..2874f00 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -151,7 +151,7 @@ namespace llvm {
// Inline asm support
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- EVT VT) const;
+ EVT VT) const;
// Expand specifics
SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
@@ -174,12 +174,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -191,7 +186,7 @@ namespace llvm {
virtual bool
CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
LLVMContext &Context) const;
};
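
The LowerCall change above collapses a long positional parameter list into a
single TargetLowering::CallLoweringInfo argument. A rough sketch of the
pattern, with invented field names standing in for the real ones:

    #include <vector>

    struct CallLoweringState {     // stand-in for TargetLowering::CallLoweringInfo
      bool IsVarArg = false;
      bool IsTailCall = false;
      std::vector<int> Outs;       // outgoing-argument descriptors
      std::vector<int> Ins;        // incoming-result descriptors
    };

    // One aggregate parameter replaces the positional list, so adding a
    // field no longer touches every target's override.
    static int LowerCall(CallLoweringState &CLI, std::vector<int> &InVals) {
      if (CLI.IsTailCall)
        CLI.IsTailCall = false;    // e.g. XCore: tail calls unsupported
      InVals.assign(CLI.Ins.begin(), CLI.Ins.end());
      return 0;
    }

    int main() {
      CallLoweringState CLI;
      CLI.Ins = {1, 2};
      std::vector<int> InVals;
      return LowerCall(CLI, InVals) == 0 && InVals.size() == 2 ? 0 : 1;
    }
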
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index b25a08d..ae646a2 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -741,14 +741,12 @@ let isCall=1,
// All calls clobber the link register and the non-callee-saved registers:
Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
def BL_u10 : _FU10<
- (outs),
- (ins calltarget:$target, variable_ops),
+ (outs), (ins calltarget:$target),
"bl $target",
[(XCoreBranchLink immU10:$target)]>;
def BL_lu10 : _FLU10<
- (outs),
- (ins calltarget:$target, variable_ops),
+ (outs), (ins calltarget:$target),
"bl $target",
[(XCoreBranchLink immU20:$target)]>;
}
@@ -796,7 +794,7 @@ def MKMSK_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$size),
def MKMSK_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$size),
"mkmsk $dst, $size",
- [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), 0xffffffff))]>;
+ [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>;
def GETR_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$type),
"getr $dst, $type",
@@ -950,10 +948,10 @@ def ENDIN_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
// dgetreg
def MSYNC_1r : _F1R<(outs), (ins GRRegs:$i),
"msync res[$i]",
- [(int_xcore_msync GRRegs:$i)]>;
+ [(int_xcore_msync GRRegs:$i)]>;
def MJOIN_1r : _F1R<(outs), (ins GRRegs:$i),
"mjoin res[$i]",
- [(int_xcore_mjoin GRRegs:$i)]>;
+ [(int_xcore_mjoin GRRegs:$i)]>;
let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
def BAU_1r : _F1R<(outs), (ins GRRegs:$addr),
@@ -988,7 +986,7 @@ def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src),
let isCall=1,
// All calls clobber the link register and the non-callee-saved registers:
Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
-def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops),
+def BLA_1r : _F1R<(outs), (ins GRRegs:$addr),
"bla $addr",
[(XCoreBranchLink GRRegs:$addr)]>;
}
@@ -1038,7 +1036,7 @@ def GETET_0R : _F0R<(outs), (ins),
def SSYNC_0r : _F0R<(outs), (ins),
"ssync",
- [(int_xcore_ssync)]>;
+ [(int_xcore_ssync)]>;
let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
hasSideEffects = 1 in
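
The MKMSK_2r pattern above now spells the addend as -1 rather than 0xffffffff;
both denote the same 32-bit value, so adding it is the same as subtracting one
and the selected pattern still computes (1 << size) - 1. A quick standalone
check of that equivalence:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t s = 0; s < 32; ++s) {
        // Adding 0xffffffff is subtracting 1 (mod 2^32): both give the mask.
        uint32_t old_form = (uint32_t(1) << s) + 0xffffffffu;
        uint32_t new_form = (uint32_t(1) << s) - 1u;
        assert(old_form == new_form);
      }
      return 0;
    }
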
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index f3b4b4c..cdd0a08 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -92,6 +92,11 @@ XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
}
bool
+XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ return requiresRegisterScavenging(MF);
+}
+
+bool
XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
return false;
}
@@ -205,8 +210,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned Reg = MI.getOperand(0).getReg();
bool isKill = MI.getOpcode() == XCore::STWFI && MI.getOperand(0).isKill();
- assert(XCore::GRRegsRegisterClass->contains(Reg) &&
- "Unexpected register operand");
+ assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
MachineBasicBlock &MBB = *MI.getParent();
@@ -217,7 +221,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (!RS)
report_fatal_error("eliminateFrameIndex Frame size too big: " +
Twine(Offset));
- unsigned ScratchReg = RS->scavengeRegister(XCore::GRRegsRegisterClass, II,
+ unsigned ScratchReg = RS->scavengeRegister(&XCore::GRRegsRegClass, II,
SPAdj);
loadConstant(MBB, II, ScratchReg, Offset, dl);
switch (MI.getOpcode()) {
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 7391cfd..c4dcb6b 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -50,6 +50,8 @@ public:
bool requiresRegisterScavenging(const MachineFunction &MF) const;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
bool useFPForScavengingIndex(const MachineFunction &MF) const;
void eliminateCallFramePseudoInstr(MachineFunction &MF,
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 5afd5a1..11ec86b 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -55,7 +55,7 @@ TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
}
bool XCorePassConfig::addInstSelector() {
- PM->add(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel()));
+ addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel()));
return false;
}
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index e160f63..b94dd69 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -245,10 +245,7 @@ static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix,
const ArgPromotion::IndicesVector &Longer) {
if (Prefix.size() > Longer.size())
return false;
- for (unsigned i = 0, e = Prefix.size(); i != e; ++i)
- if (Prefix[i] != Longer[i])
- return false;
- return true;
+ return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
}
@@ -616,8 +613,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// Recompute the parameter attributes list based on the new arguments for
// the function.
- NF->setAttributes(AttrListPtr::get(AttributesVec.begin(),
- AttributesVec.end()));
+ NF->setAttributes(AttrListPtr::get(AttributesVec));
AttributesVec.clear();
F->getParent()->getFunctionList().insert(F, NF);
@@ -734,13 +730,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
Args, "", Call);
cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(),
- AttributesVec.end()));
+ cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec));
} else {
New = CallInst::Create(NF, Args, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(),
- AttributesVec.end()));
+ cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec));
if (cast<CallInst>(Call)->isTailCall())
cast<CallInst>(New)->setTailCall();
}
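
The rewritten IsPrefix above is the standard prefix-check idiom: bail out on
length first, then compare element-wise with std::equal. A self-contained
version with a simplified element type:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // The size check must precede std::equal, which reads Prefix.size()
    // elements starting at Longer.begin().
    static bool isPrefix(const std::vector<unsigned> &Prefix,
                         const std::vector<unsigned> &Longer) {
      if (Prefix.size() > Longer.size())
        return false;
      return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
    }

    int main() {
      assert(isPrefix({1, 2}, {1, 2, 3}));
      assert(!isPrefix({1, 3}, {1, 2, 3}));
      assert(!isPrefix({1, 2, 3, 4}, {1, 2, 3}));
      return 0;
    }
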
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 58b3551..3f6b1de 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -20,3 +20,5 @@ add_llvm_library(LLVMipo
StripDeadPrototypes.cpp
StripSymbols.cpp
)
+
+add_dependencies(LLVMipo intrinsics_gen)
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 95aef27..fd23a93 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -238,7 +238,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
AttributesVec.push_back(PAL.getSlot(i));
if (Attributes FnAttrs = PAL.getFnAttributes())
AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
- PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end());
+ PAL = AttrListPtr::get(AttributesVec);
}
Instruction *New;
@@ -753,8 +753,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
// Reconstruct the AttributesList based on the vector we constructed.
- AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(),
- AttributesVec.end());
+ AttrListPtr NewPAL = AttrListPtr::get(AttributesVec);
// Create the new function type based on the recomputed parameters.
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -816,8 +815,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
// Reconstruct the AttributesList based on the vector we constructed.
- AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec.begin(),
- AttributesVec.end());
+ AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec);
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
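
The repeated rewrites from AttrListPtr::get(AttributesVec.begin(),
AttributesVec.end()) to AttrListPtr::get(AttributesVec) in this import reflect
the API growing a container overload. A generic sketch of that change (the
names here are invented, not the LLVM signature):

    #include <cstddef>
    #include <vector>

    struct AttrList { std::size_t Count; };

    // Old shape: iterator pair.
    template <class It>
    static AttrList getList(It First, It Last) {
      return AttrList{static_cast<std::size_t>(Last - First)};
    }

    // New shape: the whole container, forwarding to the old form.
    static AttrList getList(const std::vector<int> &V) {
      return getList(V.begin(), V.end());
    }

    int main() {
      std::vector<int> Attrs = {1, 2, 3};
      return getList(Attrs).Count == 3 ? 0 : 1;  // call sites shrink to one arg
    }
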
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index d9911bf..4c7f0ed 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -53,12 +53,12 @@ namespace {
I != E; ++I) {
if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
I->setInitializer(0);
- } else {
- if (I->hasAvailableExternallyLinkage())
- continue;
- if (I->getName() == "llvm.global_ctors")
- continue;
- }
+ } else {
+ if (I->hasAvailableExternallyLinkage())
+ continue;
+ if (I->getName() == "llvm.global_ctors")
+ continue;
+ }
if (I->hasLocalLinkage())
I->setVisibility(GlobalValue::HiddenVisibility);
@@ -69,10 +69,10 @@ namespace {
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
I->deleteBody();
- } else {
- if (I->hasAvailableExternallyLinkage())
- continue;
- }
+ } else {
+ if (I->hasAvailableExternallyLinkage())
+ continue;
+ }
if (I->hasLocalLinkage())
I->setVisibility(GlobalValue::HiddenVisibility);
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 2b427aa..18c1c7b 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -65,7 +65,7 @@ bool GlobalDCE::runOnModule(Module &M) {
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
Changed |= RemoveUnusedGlobalValue(*I);
// Functions with external linkage are needed if they have a body
- if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+ if (!I->isDiscardableIfUnused() &&
!I->isDeclaration() && !I->hasAvailableExternallyLinkage())
GlobalIsNeeded(I);
}
@@ -75,7 +75,7 @@ bool GlobalDCE::runOnModule(Module &M) {
Changed |= RemoveUnusedGlobalValue(*I);
// Externally visible & appending globals are needed, if they have an
// initializer.
- if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+ if (!I->isDiscardableIfUnused() &&
!I->isDeclaration() && !I->hasAvailableExternallyLinkage())
GlobalIsNeeded(I);
}
@@ -84,7 +84,7 @@ bool GlobalDCE::runOnModule(Module &M) {
I != E; ++I) {
Changed |= RemoveUnusedGlobalValue(*I);
// Externally visible aliases are needed.
- if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage())
+ if (!I->isDiscardableIfUnused())
GlobalIsNeeded(I);
}
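
GlobalDCE's open-coded linkage tests above collapse into
GlobalValue::isDiscardableIfUnused(). A simplified model of the predicate (the
real linkage set is larger; this only shows the consolidation):

    #include <cassert>

    enum class Linkage { External, Internal, Private, LinkOnce };

    // A definition may be dropped when nothing outside could observe it.
    static bool isDiscardableIfUnused(Linkage L) {
      return L == Linkage::Internal || L == Linkage::Private ||
             L == Linkage::LinkOnce;
    }

    int main() {
      assert(!isDiscardableIfUnused(Linkage::External));  // must be kept
      assert(isDiscardableIfUnused(Linkage::LinkOnce));   // droppable if unused
      return 0;
    }
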
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 1522aa4..6d950d2 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -254,6 +254,8 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
GS.StoredType = GlobalStatus::isStored;
}
}
+ } else if (isa<BitCastInst>(I)) {
+ if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
} else if (isa<GetElementPtrInst>(I)) {
if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
} else if (isa<SelectInst>(I)) {
@@ -294,6 +296,168 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
return false;
}
+/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker
+/// as a root? If so, we might not really want to eliminate the stores to it.
+static bool isLeakCheckerRoot(GlobalVariable *GV) {
+ // A global variable is a root if it is a pointer, or could plausibly contain
+ // a pointer. There are two challenges; one is that we could have a struct
+ // the has an inner member which is a pointer. We recurse through the type to
+ // detect these (up to a point). The other is that we may actually be a union
+ // of a pointer and another type, and so our LLVM type is an integer which
+ // gets converted into a pointer, or our type is an [i8 x #] with a pointer
+ // potentially contained here.
+
+ if (GV->hasPrivateLinkage())
+ return false;
+
+ SmallVector<Type *, 4> Types;
+ Types.push_back(cast<PointerType>(GV->getType())->getElementType());
+
+ unsigned Limit = 20;
+ do {
+ Type *Ty = Types.pop_back_val();
+ switch (Ty->getTypeID()) {
+ default: break;
+ case Type::PointerTyID: return true;
+ case Type::ArrayTyID:
+ case Type::VectorTyID: {
+ SequentialType *STy = cast<SequentialType>(Ty);
+ Types.push_back(STy->getElementType());
+ break;
+ }
+ case Type::StructTyID: {
+ StructType *STy = cast<StructType>(Ty);
+ if (STy->isOpaque()) return true;
+ for (StructType::element_iterator I = STy->element_begin(),
+ E = STy->element_end(); I != E; ++I) {
+ Type *InnerTy = *I;
+ if (isa<PointerType>(InnerTy)) return true;
+ if (isa<CompositeType>(InnerTy))
+ Types.push_back(InnerTy);
+ }
+ break;
+ }
+ }
+ if (--Limit == 0) return true;
+ } while (!Types.empty());
+ return false;
+}
+
+/// Given a value that is stored to a global but never read, determine whether
+/// it's safe to remove the store and the chain of computation that feeds the
+/// store.
+static bool IsSafeComputationToRemove(Value *V) {
+ do {
+ if (isa<Constant>(V))
+ return true;
+ if (!V->hasOneUse())
+ return false;
+ if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
+ isa<GlobalValue>(V))
+ return false;
+ if (isAllocationFn(V))
+ return true;
+
+ Instruction *I = cast<Instruction>(V);
+ if (I->mayHaveSideEffects())
+ return false;
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllConstantIndices())
+ return false;
+ } else if (I->getNumOperands() != 1) {
+ return false;
+ }
+
+ V = I->getOperand(0);
+ } while (1);
+}
+
+/// CleanupPointerRootUsers - This GV is a pointer root. Loop over all users
+/// of the global and clean up any that obviously don't assign the global a
+/// value that isn't dynamically allocated.
+///
+static bool CleanupPointerRootUsers(GlobalVariable *GV) {
+ // A brief explanation of leak checkers. The goal is to find bugs where
+ // pointers are forgotten, causing an accumulating growth in memory
+ // usage over time. The common strategy for leak checkers is to whitelist the
+ // memory pointed to by globals at exit. This is popular because it also
+ // solves another problem where the main thread of a C++ program may shut down
+ // before other threads that are still expecting to use those globals. To
+ // handle that case, we expect the program may create a singleton and never
+ // destroy it.
+
+ bool Changed = false;
+
+ // If Dead[n].first is the only use of a malloc result, we can delete its
+ // chain of computation and the store to the global in Dead[n].second.
+ SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
+
+ // Constants can't be pointers to dynamically allocated memory.
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+ UI != E;) {
+ User *U = *UI++;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ Value *V = SI->getValueOperand();
+ if (isa<Constant>(V)) {
+ Changed = true;
+ SI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, SI));
+ }
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
+ if (isa<Constant>(MSI->getValue())) {
+ Changed = true;
+ MSI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MSI));
+ }
+ } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
+ GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
+ if (MemSrc && MemSrc->isConstant()) {
+ Changed = true;
+ MTI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MTI));
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ if (SafeToDestroyConstant(C)) {
+ C->destroyConstant();
+ // This could have invalidated UI, start over from scratch.
+ Dead.clear();
+ CleanupPointerRootUsers(GV);
+ return true;
+ }
+ }
+ }
+
+ for (int i = 0, e = Dead.size(); i != e; ++i) {
+ if (IsSafeComputationToRemove(Dead[i].first)) {
+ Dead[i].second->eraseFromParent();
+ Instruction *I = Dead[i].first;
+ do {
+ if (isAllocationFn(I))
+ break;
+ Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
+ if (!J)
+ break;
+ I->eraseFromParent();
+ I = J;
+ } while (1);
+ I->eraseFromParent();
+ }
+ }
+
+ return Changed;
+}
+
/// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all
/// users of the global, cleaning up the obvious ones. This is largely just a
/// quick scan over the use list to clean up the easy and obvious cruft. This
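
The new isLeakCheckerRoot above is a bounded worklist walk over the type tree:
any pointer, opaque struct, or overly deep aggregate is conservatively treated
as a possible root. A standalone model of the same walk (the Type
representation here is invented):

    #include <cassert>
    #include <vector>

    struct Type {
      enum Kind { Int, Pointer, Array, Struct, OpaqueStruct } K;
      std::vector<const Type *> Elems;   // element/member types
    };

    static bool mayHoldPointer(const Type *Root) {
      std::vector<const Type *> Work{Root};
      unsigned Limit = 20;               // same bound as the pass
      while (!Work.empty()) {
        const Type *T = Work.back();
        Work.pop_back();
        switch (T->K) {
        case Type::Pointer:      return true;  // definitely a root
        case Type::OpaqueStruct: return true;  // unknown contents: be safe
        case Type::Array:
        case Type::Struct:
          for (const Type *E : T->Elems) Work.push_back(E);
          break;
        case Type::Int:          break;
        }
        if (--Limit == 0) return true;         // too deep: give up conservatively
      }
      return false;
    }

    int main() {
      Type I{Type::Int, {}}, P{Type::Pointer, {}};
      Type S{Type::Struct, {&I, &P}};
      assert(mayHoldPointer(&S));
      assert(!mayHoldPointer(&I));
      return 0;
    }
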
@@ -517,7 +681,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
GlobalVariable::InternalLinkage,
In, GV->getName()+"."+Twine(i),
- GV->isThreadLocal(),
+ GV->getThreadLocalMode(),
GV->getType()->getAddressSpace());
Globals.insert(GV, NGV);
NewGlobals.push_back(NGV);
@@ -550,7 +714,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
GlobalVariable::InternalLinkage,
In, GV->getName()+"."+Twine(i),
- GV->isThreadLocal(),
+ GV->getThreadLocalMode(),
GV->getType()->getAddressSpace());
Globals.insert(GV, NGV);
NewGlobals.push_back(NGV);
@@ -810,13 +974,18 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
// If we nuked all of the loads, then none of the stores are needed either,
// nor is the global.
if (AllNonStoreUsesGone) {
- DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
- CleanupConstantGlobalUsers(GV, 0, TD, TLI);
+ if (isLeakCheckerRoot(GV)) {
+ Changed |= CleanupPointerRootUsers(GV);
+ } else {
+ Changed = true;
+ CleanupConstantGlobalUsers(GV, 0, TD, TLI);
+ }
if (GV->use_empty()) {
+ DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
+ Changed = true;
GV->eraseFromParent();
++NumDeleted;
}
- Changed = true;
}
return Changed;
}
@@ -866,7 +1035,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
UndefValue::get(GlobalType),
GV->getName()+".body",
GV,
- GV->isThreadLocal());
+ GV->getThreadLocalMode());
// If there are bitcast users of the malloc (which is typical, usually we have
// a malloc + bitcast) then replace them with uses of the new global. Update
@@ -899,7 +1068,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
GlobalValue::InternalLinkage,
ConstantInt::getFalse(GV->getContext()),
- GV->getName()+".init", GV->isThreadLocal());
+ GV->getName()+".init", GV->getThreadLocalMode());
bool InitBoolUsed = false;
// Loop over all uses of GV, processing them in turn.
@@ -1321,7 +1490,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
PFieldTy, false, GlobalValue::InternalLinkage,
Constant::getNullValue(PFieldTy),
GV->getName() + ".f" + Twine(FieldNo), GV,
- GV->isThreadLocal());
+ GV->getThreadLocalMode());
FieldGlobals.push_back(NGV);
unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
@@ -1567,8 +1736,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
CI->replaceAllUsesWith(Cast);
CI->eraseFromParent();
- CI = dyn_cast<BitCastInst>(Malloc) ?
- extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc);
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
+ CI = cast<CallInst>(BCI->getOperand(0));
+ else
+ CI = cast<CallInst>(Malloc);
}
GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD);
@@ -1645,7 +1816,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
GlobalValue::InternalLinkage,
ConstantInt::getFalse(GV->getContext()),
GV->getName()+".b",
- GV->isThreadLocal());
+ GV->getThreadLocalMode());
GV->getParent()->getGlobalList().insert(GV, NewGV);
Constant *InitVal = GV->getInitializer();
@@ -1716,7 +1887,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
/// possible. If we make a change, return true.
bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
Module::global_iterator &GVI) {
- if (!GV->hasLocalLinkage())
+ if (!GV->isDiscardableIfUnused())
return false;
// Do more involved optimizations if the global is internal.
@@ -1729,6 +1900,9 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
return true;
}
+ if (!GV->hasLocalLinkage())
+ return false;
+
SmallPtrSet<const PHINode*, 16> PHIUsers;
GlobalStatus GS;
@@ -1787,10 +1961,15 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
if (!GS.isLoaded) {
DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV);
- // Delete any stores we can find to the global. We may not be able to
- // make it completely dead though.
- bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(),
- TD, TLI);
+ bool Changed;
+ if (isLeakCheckerRoot(GV)) {
+ // Delete any constant stores to the global.
+ Changed = CleanupPointerRootUsers(GV);
+ } else {
+ // Delete any stores we can find to the global. We may not be able to
+ // make it completely dead though.
+ Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI);
+ }
// If the global is dead now, delete it.
if (GV->use_empty()) {
@@ -1838,7 +2017,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
if (GV->use_empty()) {
DEBUG(dbgs() << " *** Substituting initializer allowed us to "
- << "simplify all users and delete global!\n");
+ << "simplify all users and delete global!\n");
GV->eraseFromParent();
++NumDeleted;
} else {
@@ -1870,6 +2049,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
/// function, changing them to FastCC.
static void ChangeCalleesToFastCall(Function *F) {
for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+ if (isa<BlockAddress>(*UI))
+ continue;
CallSite User(cast<Instruction>(*UI));
User.setCallingConv(CallingConv::Fast);
}
@@ -1890,6 +2071,8 @@ static AttrListPtr StripNest(const AttrListPtr &Attrs) {
static void RemoveNestAttribute(Function *F) {
F->setAttributes(StripNest(F->getAttributes()));
for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+ if (isa<BlockAddress>(*UI))
+ continue;
CallSite User(cast<Instruction>(*UI));
User.setAttributes(StripNest(User.getAttributes()));
}
@@ -2045,7 +2228,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
// Create the new global and insert it next to the existing list.
GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(),
GCL->getLinkage(), CA, "",
- GCL->isThreadLocal());
+ GCL->getThreadLocalMode());
GCL->getParent()->getGlobalList().insert(GCL, NGV);
NGV->takeName(GCL);
@@ -2701,7 +2884,7 @@ static bool EvaluateStaticConstructor(Function *F, const TargetData *TD,
<< " stores.\n");
for (DenseMap<Constant*, Constant*>::const_iterator I =
Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end();
- I != E; ++I)
+ I != E; ++I)
CommitValueTo(I->second, I->first);
for (SmallPtrSet<GlobalVariable*, 8>::const_iterator I =
Eval.getInvariants().begin(), E = Eval.getInvariants().end();
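
The many isThreadLocal() to getThreadLocalMode() substitutions in this file
matter because a rebuilt global must keep the original TLS model, not merely
the fact that it was thread-local. A small model of the difference (the
enumerator names follow LLVM's; the surrounding code is invented):

    #include <cassert>

    enum class TLSMode { NotThreadLocal, GeneralDynamic, LocalDynamic,
                         InitialExec, LocalExec };

    struct Global {
      TLSMode Mode = TLSMode::NotThreadLocal;
      bool isThreadLocal() const { return Mode != TLSMode::NotThreadLocal; }
    };

    int main() {
      Global GV;
      GV.Mode = TLSMode::InitialExec;

      Global Bad;                                 // old style: bool round-trip
      Bad.Mode = GV.isThreadLocal() ? TLSMode::GeneralDynamic
                                    : TLSMode::NotThreadLocal;
      Global Good;                                // new style: copy the mode
      Good.Mode = GV.Mode;

      assert(Good.Mode == TLSMode::InitialExec);  // model preserved
      assert(Bad.Mode != GV.Mode);                // demoted to the default model
      return 0;
    }
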
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index dc9cbfb..712888a 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -36,7 +36,7 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
STATISTIC(NumMergedAllocas, "Number of allocas merged together");
-// This weirdly named statistic tracks the number of times that, when attemting
+// This weirdly named statistic tracks the number of times that, when attempting
// to inline a function A into B, we analyze the callers of B in order to see
// if those would be more profitable and blocked inline steps.
STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
@@ -201,19 +201,22 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
}
unsigned Inliner::getInlineThreshold(CallSite CS) const {
- int thres = InlineThreshold;
+ int thres = InlineThreshold; // -inline-threshold or else selected by
+ // overall opt level
- // Listen to optsize when -inline-limit is not given.
+ // If -inline-threshold is not given, listen to the optsize attribute when it
+ // would decrease the threshold.
Function *Caller = CS.getCaller();
- if (Caller && !Caller->isDeclaration() &&
- Caller->hasFnAttr(Attribute::OptimizeForSize) &&
- InlineLimit.getNumOccurrences() == 0)
+ bool OptSize = Caller && !Caller->isDeclaration() &&
+ Caller->hasFnAttr(Attribute::OptimizeForSize);
+ if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres)
thres = OptSizeThreshold;
- // Listen to inlinehint when it would increase the threshold.
+ // Listen to the inlinehint attribute when it would increase the threshold.
Function *Callee = CS.getCalledFunction();
- if (HintThreshold > thres && Callee && !Callee->isDeclaration() &&
- Callee->hasFnAttr(Attribute::InlineHint))
+ bool InlineHint = Callee && !Callee->isDeclaration() &&
+ Callee->hasFnAttr(Attribute::InlineHint);
+ if (InlineHint && HintThreshold > thres)
thres = HintThreshold;
return thres;
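
In prose, the reworked getInlineThreshold: start from the base threshold, let
a caller marked optsize lower it (and only when -inline-threshold was not
given explicitly), and let a callee marked inlinehint raise it. A standalone
paraphrase; the numeric defaults below are illustrative only:

    #include <cassert>

    static int inlineThreshold(int Base, bool ThresholdFlagGiven,
                               bool CallerOptSize, bool CalleeInlineHint,
                               int OptSizeThreshold, int HintThreshold) {
      int Thres = Base;
      if (!ThresholdFlagGiven && CallerOptSize && OptSizeThreshold < Thres)
        Thres = OptSizeThreshold;                // optsize may only decrease
      if (CalleeInlineHint && HintThreshold > Thres)
        Thres = HintThreshold;                   // inlinehint may only increase
      return Thres;
    }

    int main() {
      // optsize lowers 225 -> 75, but an explicit flag wins.
      assert(inlineThreshold(225, false, true, false, 75, 325) == 75);
      assert(inlineThreshold(225, true,  true, false, 75, 325) == 225);
      // inlinehint raises the threshold, never lowers it.
      assert(inlineThreshold(225, false, false, true, 75, 325) == 325);
      assert(inlineThreshold(400, false, false, true, 75, 325) == 400);
      return 0;
    }
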
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 4f96afe4..97d7cdc 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -24,7 +24,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/ADT/Statistic.h"
#include <fstream>
#include <set>
@@ -132,7 +132,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
if (ShouldExtractLoop) {
if (NumLoops == 0) return Changed;
--NumLoops;
- if (ExtractLoop(DT, L) != 0) {
+ CodeExtractor Extractor(DT, *L);
+ if (Extractor.extractCodeRegion() != 0) {
Changed = true;
// After extraction, the loop is replaced by a function call, so
// we shouldn't try to run any more loop passes on it.
@@ -296,7 +297,7 @@ bool BlockExtractorPass::runOnModule(Module &M) {
if (const InvokeInst *II =
dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator()))
BlocksToExtractVec.push_back(II->getUnwindDest());
- ExtractBasicBlock(BlocksToExtractVec);
+ CodeExtractor(BlocksToExtractVec).extractCodeRegion();
}
return !BlocksToExtract.empty();
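
Both extraction call sites now build a CodeExtractor object and invoke
extractCodeRegion() on it, replacing the ExtractLoop/ExtractBasicBlock free
functions. A skeleton of that object-style API (all types are stand-ins):

    #include <vector>

    struct Function {};
    struct Region { std::vector<int> Blocks; };

    class Extractor {
      Region R;
    public:
      explicit Extractor(const Region &Rgn) : R(Rgn) {}
      // Returns the outlined function, or nullptr when the region is empty.
      Function *extractRegion() {
        if (R.Blocks.empty())
          return nullptr;
        static Function Outlined;   // placeholder for the real clone
        return &Outlined;
      }
    };

    int main() {
      Region R{{1, 2, 3}};
      return Extractor(R).extractRegion() != nullptr ? 0 : 1;
    }
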
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 0b01c38..9f70f66 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -45,22 +45,22 @@
#define DEBUG_TYPE "mergefunc"
#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Constants.h"
+#include "llvm/IRBuilder.h"
#include "llvm/InlineAsm.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Operator.h"
#include "llvm/Pass.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
@@ -389,7 +389,7 @@ bool FunctionComparator::enumerate(const Value *V1, const Value *V2) {
if (!C2) return false;
// TODO: constant expressions with GEP or references to F1 or F2.
if (C1->isNullValue() && C2->isNullValue() &&
- isEquivalentType(C1->getType(), C2->getType()))
+ isEquivalentType(C1->getType(), C2->getType()))
return true;
// Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1
// then they must have equal bit patterns.
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index d9d1d10..9c9910b 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -19,7 +19,7 @@
#include "llvm/Pass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CFG.h"
using namespace llvm;
@@ -122,7 +122,8 @@ Function* PartialInliner::unswitchFunction(Function* F) {
DT.runOnFunction(*duplicateFunction);
// Extract the body of the if.
- Function* extractedFunction = ExtractCodeRegion(DT, toExtract);
+ Function* extractedFunction
+ = CodeExtractor(toExtract, &DT).extractCodeRegion();
InlineFunctionInfo IFI;
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index b5caa9a..80bfc1c 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -22,11 +22,12 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/TypeFinder.h"
#include "llvm/ValueSymbolTable.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/DenseMap.h"
@@ -175,8 +176,8 @@ static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
// Strip any named types of their names.
static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
- std::vector<StructType*> StructTypes;
- M.findUsedStructTypes(StructTypes);
+ TypeFinder StructTypes;
+ StructTypes.run(M, false);
for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
StructType *STy = StructTypes[i];
diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt
index d070ccc..72cfe2c 100644
--- a/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/lib/Transforms/InstCombine/CMakeLists.txt
@@ -13,3 +13,5 @@ add_llvm_library(LLVMInstCombine
InstCombineSimplifyDemanded.cpp
InstCombineVectorOps.cpp
)
+
+add_dependencies(LLVMInstCombine intrinsics_gen)
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 199df51..0d5ef90 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -11,11 +11,11 @@
#define INSTCOMBINE_INSTCOMBINE_H
#include "InstCombineWorklist.h"
+#include "llvm/IRBuilder.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/TargetFolder.h"
@@ -187,7 +187,7 @@ public:
Instruction *visitPHINode(PHINode &PN);
Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
Instruction *visitAllocaInst(AllocaInst &AI);
- Instruction *visitMalloc(Instruction &FI);
+ Instruction *visitAllocSite(Instruction &FI);
Instruction *visitFree(CallInst &FI);
Instruction *visitLoadInst(LoadInst &LI);
Instruction *visitStoreInst(StoreInst &SI);
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 05e702f..99b62f8 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -170,10 +170,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// -A + B --> B - A
// -A + -B --> -(A + B)
if (Value *LHSV = dyn_castNegVal(LHS)) {
- if (Value *RHSV = dyn_castNegVal(RHS)) {
- Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum");
- return BinaryOperator::CreateNeg(NewAdd);
- }
+ if (!isa<Constant>(RHS))
+ if (Value *RHSV = dyn_castNegVal(RHS)) {
+ Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum");
+ return BinaryOperator::CreateNeg(NewAdd);
+ }
return BinaryOperator::CreateSub(RHS, LHSV);
}
@@ -329,6 +330,20 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
}
+ // Check for (x & y) + (x ^ y)
+ {
+ Value *A = 0, *B = 0;
+ if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
+ (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
+ match(LHS, m_And(m_Specific(B), m_Specific(A)))))
+ return BinaryOperator::CreateOr(A, B);
+
+ if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
+ (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
+ match(RHS, m_And(m_Specific(B), m_Specific(A)))))
+ return BinaryOperator::CreateOr(A, B);
+ }
+
return Changed ? &I : 0;
}
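
The new fold above rests on a bit identity: x & y holds the bits the operands
share, x ^ y holds the bits where they differ, the two are disjoint, so their
sum never carries and equals x | y. An exhaustive 8-bit check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned y = 0; y < 256; ++y)
          assert((uint8_t)((x & y) + (x ^ y)) == (uint8_t)(x | y));
      return 0;
    }
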
@@ -406,66 +421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
}
-/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
-/// code necessary to compute the offset from the base pointer (without adding
-/// in the base pointer). Return the result as a signed integer of intptr size.
-Value *InstCombiner::EmitGEPOffset(User *GEP) {
- TargetData &TD = *getTargetData();
- gep_type_iterator GTI = gep_type_begin(GEP);
- Type *IntPtrTy = TD.getIntPtrType(GEP->getContext());
- Value *Result = Constant::getNullValue(IntPtrTy);
-
- // If the GEP is inbounds, we know that none of the addressing operations will
- // overflow in an unsigned sense.
- bool isInBounds = cast<GEPOperator>(GEP)->isInBounds();
-
- // Build a mask for high order bits.
- unsigned IntPtrWidth = TD.getPointerSizeInBits();
- uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
-
- for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
- ++i, ++GTI) {
- Value *Op = *i;
- uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
- if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
- if (OpC->isZero()) continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
-
- if (Size)
- Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size),
- GEP->getName()+".offs");
- continue;
- }
-
- Constant *Scale = ConstantInt::get(IntPtrTy, Size);
- Constant *OC =
- ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
- Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/);
- // Emit an add instruction.
- Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs");
- continue;
- }
- // Convert to correct type.
- if (Op->getType() != IntPtrTy)
- Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c");
- if (Size != 1) {
- // We'll let instcombine(mul) convert this to a shl if possible.
- Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size),
- GEP->getName()+".idx", isInBounds /*NUW*/);
- }
-
- // Emit an add instruction.
- Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs");
- }
- return Result;
-}
-
-
-
-
/// Optimize pointer differences into the same array into a size. Consider:
/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
@@ -589,11 +544,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- // C - zext(bool) -> bool ? C - 1 : C
- if (ZExtInst *ZI = dyn_cast<ZExtInst>(Op1))
- if (ZI->getSrcTy()->isIntegerTy(1))
- return SelectInst::Create(ZI->getOperand(0), SubOne(C), C);
-
// C-(X+C2) --> (C-C2)-X
ConstantInt *C2;
if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2))))
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 0dbe11d..7d0af0d 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -986,19 +986,23 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
bool Op1Ordered;
unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
+ // uno && ord -> false
+ if (Op0Pred == 0 && Op1Pred == 0 && Op0Ordered != Op1Ordered)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
if (Op1Pred == 0) {
std::swap(LHS, RHS);
std::swap(Op0Pred, Op1Pred);
std::swap(Op0Ordered, Op1Ordered);
}
if (Op0Pred == 0) {
- // uno && ueq -> uno && (uno || eq) -> ueq
+ // uno && ueq -> uno && (uno || eq) -> uno
// ord && olt -> ord && (ord && lt) -> olt
- if (Op0Ordered == Op1Ordered)
+ if (!Op0Ordered && (Op0Ordered == Op1Ordered))
+ return LHS;
+ if (Op0Ordered && (Op0Ordered == Op1Ordered))
return RHS;
// uno && oeq -> uno && (ord && eq) -> false
- // uno && ord -> false
if (!Op0Ordered)
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
// ord && ueq -> ord && (uno || eq) -> oeq
@@ -1932,10 +1936,15 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
// A | ( A ^ B) -> A | B
// A | (~A ^ B) -> A | ~B
+ // (A & B) | (A ^ B)
if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
if (Op0 == A || Op0 == B)
return BinaryOperator::CreateOr(A, B);
+ if (match(Op0, m_And(m_Specific(A), m_Specific(B))) ||
+ match(Op0, m_And(m_Specific(B), m_Specific(A))))
+ return BinaryOperator::CreateOr(A, B);
+
if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
Value *Not = Builder->CreateNot(B, B->getName()+".not");
return BinaryOperator::CreateOr(Not, Op0);
@@ -2212,7 +2221,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Op0I && Op1I && Op0I->isShift() &&
Op0I->getOpcode() == Op1I->getOpcode() &&
Op0I->getOperand(1) == Op1I->getOperand(1) &&
- (Op1I->hasOneUse() || Op1I->hasOneUse())) {
+ (Op0I->hasOneUse() || Op1I->hasOneUse())) {
Value *NewOp =
Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0),
Op0I->getName());
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 77e4727..d34fab1 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -172,8 +172,6 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (isFreeCall(&CI))
return visitFree(CI);
- if (isMalloc(&CI))
- return visitMalloc(CI);
// If the caller function is nounwind, mark the call as nounwind, even if the
// callee isn't.
@@ -246,78 +244,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::objectsize: {
- // We need target data for just about everything so depend on it.
- if (!TD) break;
-
- Type *ReturnTy = CI.getType();
- uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL;
-
- // Get to the real allocated thing and offset as fast as possible.
- Value *Op1 = II->getArgOperand(0)->stripPointerCasts();
-
- uint64_t Offset = 0;
- uint64_t Size = -1ULL;
-
- // Try to look through constant GEPs.
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1)) {
- if (!GEP->hasAllConstantIndices()) break;
-
- // Get the current byte offset into the thing. Use the original
- // operand in case we're looking through a bitcast.
- SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end());
- if (!GEP->getPointerOperandType()->isPointerTy())
- return 0;
- Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops);
-
- Op1 = GEP->getPointerOperand()->stripPointerCasts();
-
- // Make sure we're not a constant offset from an external
- // global.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1))
- if (!GV->hasDefinitiveInitializer()) break;
- }
-
- // If we've stripped down to a single global variable that we
- // can know the size of then just return that.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) {
- if (GV->hasDefinitiveInitializer()) {
- Constant *C = GV->getInitializer();
- Size = TD->getTypeAllocSize(C->getType());
- } else {
- // Can't determine size of the GV.
- Constant *RetVal = ConstantInt::get(ReturnTy, DontKnow);
- return ReplaceInstUsesWith(CI, RetVal);
- }
- } else if (AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) {
- // Get alloca size.
- if (AI->getAllocatedType()->isSized()) {
- Size = TD->getTypeAllocSize(AI->getAllocatedType());
- if (AI->isArrayAllocation()) {
- const ConstantInt *C = dyn_cast<ConstantInt>(AI->getArraySize());
- if (!C) break;
- Size *= C->getZExtValue();
- }
- }
- } else if (CallInst *MI = extractMallocCall(Op1)) {
- // Get allocation size.
- Type* MallocType = getMallocAllocatedType(MI);
- if (MallocType && MallocType->isSized())
- if (Value *NElems = getMallocArraySize(MI, TD, true))
- if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
- Size = NElements->getZExtValue() * TD->getTypeAllocSize(MallocType);
- }
-
- // Do not return "I don't know" here. Later optimization passes could
- // make it possible to evaluate objectsize to a constant.
- if (Size == -1ULL)
- break;
-
- if (Size < Offset) {
- // Out of bound reference? Negative index normalized to large
- // index? Just return "I don't know".
- return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, DontKnow));
- }
- return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, Size-Offset));
+ uint64_t Size;
+ if (getObjectSize(II->getArgOperand(0), Size, TD))
+ return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size));
+ return 0;
}
case Intrinsic::bswap:
// bswap(bswap(x)) -> x
@@ -694,6 +624,57 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::arm_neon_vmulls:
+ case Intrinsic::arm_neon_vmullu: {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+
+ // Handle mul by zero first:
+ if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+ return ReplaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
+ }
+
+ // Check for constant LHS & RHS - in this case we just simplify.
+ bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu);
+ VectorType *NewVT = cast<VectorType>(II->getType());
+ unsigned NewWidth = NewVT->getElementType()->getIntegerBitWidth();
+ if (ConstantDataVector *CV0 = dyn_cast<ConstantDataVector>(Arg0)) {
+ if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) {
+ VectorType* VT = cast<VectorType>(CV0->getType());
+ SmallVector<Constant*, 4> NewElems;
+ for (unsigned i = 0; i < VT->getNumElements(); ++i) {
+ APInt CV0E =
+ (cast<ConstantInt>(CV0->getAggregateElement(i)))->getValue();
+ CV0E = Zext ? CV0E.zext(NewWidth) : CV0E.sext(NewWidth);
+ APInt CV1E =
+ (cast<ConstantInt>(CV1->getAggregateElement(i)))->getValue();
+ CV1E = Zext ? CV1E.zext(NewWidth) : CV1E.sext(NewWidth);
+ NewElems.push_back(
+ ConstantInt::get(NewVT->getElementType(), CV0E * CV1E));
+ }
+ return ReplaceInstUsesWith(CI, ConstantVector::get(NewElems));
+ }
+
+ // Couldn't simplify - canonicalize constant to the RHS.
+ std::swap(Arg0, Arg1);
+ }
+
+ // Handle mul by one:
+ if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) {
+ if (ConstantInt *Splat =
+ dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) {
+ if (Splat->isOne()) {
+ if (Zext)
+ return CastInst::CreateZExtOrBitCast(Arg0, II->getType());
+ // else
+ return CastInst::CreateSExtOrBitCast(Arg0, II->getType());
+ }
+ }
+ }
+
+ break;
+ }
+
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
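
The constant-folding path for vmulls/vmullu above widens each constant lane
first (sext for the signed form, zext for the unsigned one) and multiplies in
the wider type. A scalar model of one lane:

    #include <cassert>
    #include <cstdint>

    int main() {
      int16_t A = -3, B = 100;

      int32_t Signed = (int32_t)A * (int32_t)B;          // vmulls lane: sext
      assert(Signed == -300);

      uint32_t Unsigned =                                // vmullu lane: zext
          (uint32_t)(uint16_t)A * (uint32_t)(uint16_t)B;
      assert(Unsigned == 65533u * 100u);                 // zext(-3) = 65533
      return 0;
    }
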
@@ -711,7 +692,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
TerminatorInst *TI = II->getParent()->getTerminator();
bool CannotRemove = false;
for (++BI; &*BI != TI; ++BI) {
- if (isa<AllocaInst>(BI) || isMalloc(BI)) {
+ if (isa<AllocaInst>(BI)) {
CannotRemove = true;
break;
}
@@ -814,7 +795,7 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) {
if (CI->getCalledFunction() == 0) return 0;
InstCombineFortifiedLibCalls Simplifier(this);
- Simplifier.fold(CI, TD);
+ Simplifier.fold(CI, TD, TLI);
return Simplifier.NewInstruction;
}
@@ -898,6 +879,9 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) {
// visitCallSite - Improvements for call and invoke instructions.
//
Instruction *InstCombiner::visitCallSite(CallSite CS) {
+ if (isAllocLikeFn(CS.getInstruction()))
+ return visitAllocSite(*CS.getInstruction());
+
bool Changed = false;
// If the callee is a pointer to a function, attempt to move any casts to the
@@ -933,24 +917,24 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
}
if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- // This instruction is not reachable, just remove it. We insert a store to
- // undef so that we know that this code is not reachable, despite the fact
- // that we can't modify the CFG here.
- new StoreInst(ConstantInt::getTrue(Callee->getContext()),
- UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
- CS.getInstruction());
-
// If CS does not return void then replaceAllUsesWith undef.
// This allows ValueHandlers and custom metadata to adjust itself.
if (!CS.getInstruction()->getType()->isVoidTy())
ReplaceInstUsesWith(*CS.getInstruction(),
UndefValue::get(CS.getInstruction()->getType()));
- if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
- // Don't break the CFG, insert a dummy cond branch.
- BranchInst::Create(II->getNormalDest(), II->getUnwindDest(),
- ConstantInt::getTrue(Callee->getContext()), II);
+ if (isa<InvokeInst>(CS.getInstruction())) {
+ // Can't remove an invoke because we cannot change the CFG.
+ return 0;
}
+
+ // This instruction is not reachable, just remove it. We insert a store to
+ // undef so that we know that this code is not reachable, despite the fact
+ // that we can't modify the CFG here.
+ new StoreInst(ConstantInt::getTrue(Callee->getContext()),
+ UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
+ CS.getInstruction());
+
return EraseInstFromFunction(*CS.getInstruction());
}
@@ -1194,8 +1178,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (NewRetTy->isVoidTy())
Caller->setName(""); // Void type should not have a name.
- const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec.begin(),
- attrVec.end());
+ const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec);
Instruction *NC;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
@@ -1367,8 +1350,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
NestF->getType() == PointerType::getUnqual(NewFTy) ?
NestF : ConstantExpr::getBitCast(NestF,
PointerType::getUnqual(NewFTy));
- const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(),
- NewAttrs.end());
+ const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs);
Instruction *NewCaller;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 39279f4..555b442 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -34,7 +34,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
// Cannot look past anything that might overflow.
OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
- if (OBI && !OBI->hasNoUnsignedWrap()) {
+ if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
Scale = 1;
Offset = 0;
return Val;
@@ -648,10 +648,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
if (!I) return false;
// If the input is a truncate from the destination type, we can trivially
- // eliminate it, even if it has multiple uses.
- // FIXME: This is currently disabled until codegen can handle this without
- // pessimizing code, PR5997.
- if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+ // eliminate it.
+ if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
return true;
// We can't extend or shrink something that has multiple uses: doing so would
@@ -992,11 +990,8 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false;
- // If this is a truncate from the dest type, we can trivially eliminate it,
- // even if it has multiple uses.
- // FIXME: This is currently disabled until codegen can handle this without
- // pessimizing code, PR5997.
- if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+ // If this is a truncate from the dest type, we can trivially eliminate it.
+ if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
return true;
// We can't extend or shrink something that has multiple uses: doing so would
@@ -1341,10 +1336,9 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
// non-type-safe code.
if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
GEP->hasAllConstantIndices()) {
- // We are guaranteed to get a constant from EmitGEPOffset.
- ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP));
- int64_t Offset = OffsetV->getSExtValue();
-
+ SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end());
+ int64_t Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops);
+
// Get the base pointer input of the bitcast, and the type it points to.
Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
Type *GEPIdxTy =
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index ab2987f..bdd310e 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1035,7 +1035,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
// Pull in the high bits from known-ones set.
APInt NewRHS = RHS->getValue().zext(SrcBits);
- NewRHS |= KnownOne;
+ NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits);
return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
ConstantInt::get(ICI.getContext(), NewRHS));
}
@@ -2580,10 +2580,25 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
}
+ // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
+ // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
+ ConstantInt *Cst1;
+ if ((Op0->hasOneUse() &&
+ match(Op0, m_ZExt(m_Value(A))) &&
+ match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
+ (Op1->hasOneUse() &&
+ match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
+ match(Op1, m_ZExt(m_Value(A))))) {
+ APInt Pow2 = Cst1->getValue() + 1;
+ if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
+ Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
+ return new ICmpInst(I.getPredicate(), A,
+ Builder->CreateTrunc(B, A->getType()));
+ }
+
// Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
// "icmp (and X, mask), cst"
uint64_t ShAmt = 0;
- ConstantInt *Cst1;
if (Op0->hasOneUse() &&
match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A),
m_ConstantInt(ShAmt))))) &&
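
The new icmp transform above replaces a widening compare with a narrowing
one: when the mask is exactly (1 << width(A)) - 1, comparing zext(A) against
B & mask is the same as comparing A against trunc(B). A spot check with 8-bit
A and 32-bit B:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Bs[] = {0u, 0x7fu, 0x80u, 0x1234u, 0xdeadbeefu,
                             0xffffffffu};
      for (uint32_t a = 0; a < 256; ++a)    // a plays the role of zext(A)
        for (uint32_t b : Bs)
          assert((a == (b & 0xffu)) == ((uint8_t)a == (uint8_t)b));
      return 0;
    }
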
@@ -2809,7 +2824,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
case ICmpInst::ICMP_UGE:
// (float)int >= -4.4 --> true
// (float)int >= 4.4 --> int > 4
- if (!RHS.isNegative())
+ if (RHS.isNegative())
return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
Pred = ICmpInst::ICMP_UGT;
break;
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index b2f2e24..c485844 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -22,72 +22,6 @@ using namespace llvm;
STATISTIC(NumDeadStore, "Number of dead stores eliminated");
-// Try to kill dead allocas by walking through its uses until we see some use
-// that could escape. This is a conservative analysis which tries to handle
-// GEPs, bitcasts, stores, and no-op intrinsics. These tend to be the things
-// left after inlining and SROA finish chewing on an alloca.
-static Instruction *removeDeadAlloca(InstCombiner &IC, AllocaInst &AI) {
- SmallVector<Instruction *, 4> Worklist, DeadStores;
- Worklist.push_back(&AI);
- do {
- Instruction *PI = Worklist.pop_back_val();
- for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end();
- UI != UE; ++UI) {
- Instruction *I = cast<Instruction>(*UI);
- switch (I->getOpcode()) {
- default:
- // Give up the moment we see something we can't handle.
- return 0;
-
- case Instruction::GetElementPtr:
- case Instruction::BitCast:
- Worklist.push_back(I);
- continue;
-
- case Instruction::Call:
- // We can handle a limited subset of calls to no-op intrinsics.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_value:
- case Intrinsic::invariant_start:
- case Intrinsic::invariant_end:
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- continue;
- default:
- return 0;
- }
- }
- // Reject everything else.
- return 0;
-
- case Instruction::Store: {
- // Stores into the alloca are only live if the alloca is live.
- StoreInst *SI = cast<StoreInst>(I);
- // We can eliminate atomic stores, but not volatile.
- if (SI->isVolatile())
- return 0;
- // The store is only trivially safe if the pointer is the destination
- // as opposed to the value. We're conservative here and don't check for
- // the case where we store the address of a dead alloca into a dead
- // alloca.
- if (SI->getPointerOperand() != PI)
- return 0;
- DeadStores.push_back(I);
- continue;
- }
- }
- }
- } while (!Worklist.empty());
-
- // The alloca is dead. Kill off all the stores to it, and then replace it
- // with undef.
- while (!DeadStores.empty())
- IC.EraseInstFromFunction(*DeadStores.pop_back_val());
- return IC.ReplaceInstUsesWith(AI, UndefValue::get(AI.getType()));
-}
-
Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Ensure that the alloca array size argument has type intptr_t, so that
// any casting is exposed early.
@@ -106,7 +40,6 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
Type *NewTy =
ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
- assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
New->setAlignment(AI.getAlignment());
@@ -135,22 +68,54 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
}
}
- if (TD && isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) {
- // If alloca'ing a zero byte object, replace the alloca with a null pointer.
- // Note that we only do this for alloca's, because malloc should allocate
- // and return a unique pointer, even for a zero byte allocation.
- if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
- return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
-
+ if (TD && AI.getAllocatedType()->isSized()) {
// If the alignment is 0 (unspecified), assign it the preferred alignment.
if (AI.getAlignment() == 0)
AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
+
+ // Move all alloca's of zero byte objects to the entry block and merge them
+ // together. Note that we only do this for alloca's, because malloc should
+ // allocate and return a unique pointer, even for a zero byte allocation.
+ if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) {
+ // For a zero sized alloca there is no point in doing an array allocation.
+ // This is helpful if the array size is a complicated expression not used
+ // elsewhere.
+ if (AI.isArrayAllocation()) {
+ AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1));
+ return &AI;
+ }
+
+ // Get the first instruction in the entry block.
+ BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
+ Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
+ if (FirstInst != &AI) {
+ // If the entry block doesn't start with a zero-size alloca then move
+ // this one to the start of the entry block. There is no problem with
+      // dominance as the array size was already forced to a constant.
+ AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
+ if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
+ TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
+ AI.moveBefore(FirstInst);
+ return &AI;
+ }
+
+ // Replace this zero-sized alloca with the one at the start of the entry
+ // block after ensuring that the address will be aligned enough for both
+ // types.
+ unsigned MaxAlign =
+ std::max(TD->getPrefTypeAlignment(EntryAI->getAllocatedType()),
+ TD->getPrefTypeAlignment(AI.getAllocatedType()));
+ EntryAI->setAlignment(MaxAlign);
+ if (AI.getType() != EntryAI->getType())
+ return new BitCastInst(EntryAI, AI.getType());
+ return ReplaceInstUsesWith(AI, EntryAI);
+ }
+ }
}
- // Try to aggressively remove allocas which are only used for GEPs, lifetime
- // markers, and stores. This happens when SROA iteratively promotes stores
- // out of the alloca, and we need to cleanup after it.
- return removeDeadAlloca(*this, AI);
+ // At last, use the generic allocation site handler to aggressively remove
+ // unused allocas.
+ return visitAllocSite(AI);
}
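
For reference, the merge policy above reduces to taking the maximum of the two preferred alignments, so that the surviving entry-block alloca is adequately aligned for both allocated types; a minimal plain-C++ sketch with a hypothetical helper, not code from this patch:

    #include <algorithm>

    struct ZeroSizedAlloca { unsigned PrefAlign; };

    // The surviving alloca must satisfy the preferred alignment of every
    // zero-sized alloca folded into it; uses of the folded alloca are then
    // rewritten (through a bitcast when the pointer types differ).
    unsigned mergeAlignment(ZeroSizedAlloca &Entry, const ZeroSizedAlloca &Other) {
      Entry.PrefAlign = std::max(Entry.PrefAlign, Other.PrefAlign);
      return Entry.PrefAlign;
    }
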
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 5168e2a..35a0bbb 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -464,9 +464,12 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
{ const APInt *CI; Value *N;
- if (match(Op1, m_Shl(m_Power2(CI), m_Value(N)))) {
+ if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) ||
+ match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) {
if (*CI != 1)
N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2()));
+ if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
+ N = Builder->CreateZExt(N, Z->getDestTy());
if (I.isExact())
return BinaryOperator::CreateExactLShr(Op0, N);
return BinaryOperator::CreateLShr(Op0, N);
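
The new match also accepts the divisor behind a zext. The identity underlying the fold, checked over a small range in plain C++ (a sketch, not LLVM code; here C1 = 4, so C2 = 2):

    #include <cassert>
    #include <cstdint>

    // X udiv (C1 << N) with C1 == 1 << C2 is the same as X >> (N + C2).
    uint32_t foldedUDiv(uint32_t X, uint32_t N, uint32_t C2) {
      return X >> (N + C2);
    }

    int main() {
      for (uint32_t X = 0; X < 4096; ++X)
        for (uint32_t N = 0; N < 8; ++N)
          assert(X / ((1u << 2) << N) == foldedUDiv(X, N, 2));
    }
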
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index e727b2c..291e800 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -129,6 +129,12 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
if (TI->isCast()) {
if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType())
return 0;
+ // The select condition may be a vector. We may only change the operand
+ // type if the vector width remains the same (and matches the condition).
+ Type *CondTy = SI.getCondition()->getType();
+ if (CondTy->isVectorTy() && CondTy->getVectorNumElements() !=
+ FI->getOperand(0)->getType()->getVectorNumElements())
+ return 0;
} else {
return 0; // unknown unary op.
}
@@ -498,7 +504,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
// NOTE: if we wanted to, this is where to detect integer MIN/MAX
- if (isa<Constant>(CmpRHS)) {
+ if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
// Transform (X == C) ? X : Y -> (X == C) ? C : Y
SI.setOperand(1, CmpRHS);
@@ -875,12 +881,16 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
if (TrueSI->getCondition() == CondVal) {
+ if (SI.getTrueValue() == TrueSI->getTrueValue())
+ return 0;
SI.setOperand(1, TrueSI->getTrueValue());
return &SI;
}
}
if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
if (FalseSI->getCondition() == CondVal) {
+ if (SI.getFalseValue() == FalseSI->getFalseValue())
+ return 0;
SI.setOperand(2, FalseSI->getFalseValue());
return &SI;
}
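
The two early-outs added above exist to guarantee progress: without them, setOperand would rewrite the select into an identical instruction, InstCombine would requeue it, and the pass would loop forever. The guard pattern in miniature, as a plain-C++ sketch with hypothetical names:

    #include <cassert>

    // Returns true only when the operand actually changed.
    bool setOperandIfChanged(int &Slot, int NewVal) {
      if (Slot == NewVal)
        return false;   // no change: report no progress, breaking the cycle
      Slot = NewVal;
      return true;
    }

    int main() {
      int Op = 7;
      // A driver that reruns while progress is reported terminates at once.
      assert(!setOperandIfChanged(Op, 7));
    }
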
@@ -893,5 +903,16 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return &SI;
}
+ if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) {
+ unsigned VWidth = VecTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&SI, AllOnesEltMask, UndefElts)) {
+ if (V != &SI)
+ return ReplaceInstUsesWith(SI, V);
+ return &SI;
+ }
+ }
+
return 0;
}
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index b31049e..4bb2403 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -151,7 +151,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
// We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't
// profitable unless we know the and'd out bits are already zero.
- if (CI->getZExtValue() > NumBits) {
+ if (CI->getValue().ult(TypeWidth) && CI->getZExtValue() > NumBits) {
unsigned LowBits = CI->getZExtValue() - NumBits;
if (MaskedValueIsZero(I->getOperand(0),
APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
@@ -529,6 +529,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
ShiftOp = 0;
if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
+
+ // This is a constant shift of a constant shift. Be careful about hiding
+ // shl instructions behind bit masks. They are used to represent multiplies
+ // by a constant, and it is important that simple arithmetic expressions
+ // are still recognizable by scalar evolution.
+ //
+ // The transforms applied to shl are very similar to the transforms applied
+ // to mul by constant. We can be more aggressive about optimizing right
+ // shifts.
+ //
+ // Combinations of right and left shifts will still be optimized in
+ // DAGCombine where scalar evolution no longer applies.
+
ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits);
@@ -554,13 +567,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
}
if (ShiftAmt1 == ShiftAmt2) {
- // If we have ((X >>? C) << C), turn this into X & (-1 << C).
- if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl) {
- APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1));
- return BinaryOperator::CreateAnd(X,
- ConstantInt::get(I.getContext(),Mask));
- }
// If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
if (I.getOpcode() == Instruction::LShr &&
ShiftOp->getOpcode() == Instruction::Shl) {
@@ -570,28 +576,23 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
}
} else if (ShiftAmt1 < ShiftAmt2) {
uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1;
-
- // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2)
+
+ // (X >>?,exact C1) << C2 --> X << (C2-C1)
+ // The inexact version is deferred to DAGCombine so we don't hide shl
+ // behind a bit mask.
if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl) {
+ ShiftOp->getOpcode() != Instruction::Shl &&
+ ShiftOp->isExact()) {
assert(ShiftOp->getOpcode() == Instruction::LShr ||
ShiftOp->getOpcode() == Instruction::AShr);
ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- if (ShiftOp->isExact()) {
- // (X >>?,exact C1) << C2 --> X << (C2-C1)
- BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
- X, ShiftDiffCst);
- NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
- return NewShl;
- }
- Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
-
- APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(Shift,
- ConstantInt::get(I.getContext(),Mask));
+ BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
+ X, ShiftDiffCst);
+ NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return NewShl;
}
-
+
// (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2)
if (I.getOpcode() == Instruction::LShr &&
ShiftOp->getOpcode() == Instruction::Shl) {
@@ -627,24 +628,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
assert(ShiftAmt2 < ShiftAmt1);
uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2;
- // (X >>? C1) << C2 --> X >>? (C1-C2) & (-1 << C2)
+ // (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
+ // The inexact version is deferred to DAGCombine so we don't hide shl
+ // behind a bit mask.
if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl) {
+ ShiftOp->getOpcode() != Instruction::Shl &&
+ ShiftOp->isExact()) {
ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- if (ShiftOp->isExact()) {
- // (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
- BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(),
- X, ShiftDiffCst);
- NewShr->setIsExact(true);
- return NewShr;
- }
- Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(),
- X, ShiftDiffCst);
- APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(Shift,
- ConstantInt::get(I.getContext(),Mask));
+ BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(),
+ X, ShiftDiffCst);
+ NewShr->setIsExact(true);
+ return NewShr;
}
-
+
// (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2)
if (I.getOpcode() == Instruction::LShr &&
ShiftOp->getOpcode() == Instruction::Shl) {
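
Both rewrites above rest on the exactness guarantee: an exact right shift drops no 1-bits, so it can be undone, or partially undone, by a left shift without a mask. The C1 < C2 direction as a plain-C++ sketch (not LLVM code):

    #include <cassert>
    #include <cstdint>

    // (X >>exact C1) << C2 == X << (C2 - C1), given C1 < C2 and the low C1
    // bits of X all zero (which is exactly what "exact" asserts).
    uint32_t foldedShl(uint32_t X, unsigned C1, unsigned C2) {
      assert(C1 < C2 && (X & ((1u << C1) - 1)) == 0);
      return X << (C2 - C1);
    }

    int main() {
      for (uint32_t Y = 0; Y < 4096; ++Y)
        assert((((Y << 3) >> 3) << 5) == foldedShl(Y << 3, 3, 5));
    }
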
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 125c74a..54be8ed 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -989,6 +989,29 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
}
break;
}
+ case Instruction::Select: {
+ APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts);
+ if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) {
+ for (unsigned i = 0; i < VWidth; i++) {
+ if (CV->getAggregateElement(i)->isNullValue())
+ LeftDemanded.clearBit(i);
+ else
+ RightDemanded.clearBit(i);
+ }
+ }
+
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded,
+ UndefElts, Depth+1);
+ if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded,
+ UndefElts2, Depth+1);
+ if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; }
+
+ // Output elements are undefined if both are undefined.
+ UndefElts &= UndefElts2;
+ break;
+ }
case Instruction::BitCast: {
// Vector->vector casts only.
VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
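
The new Select case splits the demanded mask per lane: with a constant condition vector, each result lane reads from exactly one operand, so the corresponding lane of the other operand is not demanded. The mask split in plain C++ (a sketch with a fixed width of 4):

    #include <bitset>

    void splitDemanded(const std::bitset<4> &Demanded, const bool Cond[4],
                       std::bitset<4> &TrueDemanded,
                       std::bitset<4> &FalseDemanded) {
      TrueDemanded = FalseDemanded = Demanded;
      for (unsigned i = 0; i != 4; ++i) {
        if (Cond[i])
          FalseDemanded.reset(i);  // lane i comes from the true operand
        else
          TrueDemanded.reset(i);   // lane i comes from the false operand
      }
    }
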
@@ -1074,6 +1097,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2;
break;
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+ UndefElts, Depth+1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+ break;
case Instruction::Call: {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 066b2ec..68ecd51 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -87,30 +87,34 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
}
+Value *InstCombiner::EmitGEPOffset(User *GEP) {
+ return llvm::EmitGEPOffset(Builder, *getTargetData(), GEP);
+}
+
/// ShouldChangeType - Return true if it is desirable to convert a computation
/// from 'From' to 'To'. We don't want to convert from a legal to an illegal
/// type for example, or from a smaller to a larger illegal type.
bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
assert(From->isIntegerTy() && To->isIntegerTy());
-
+
// If we don't have TD, we don't know if the source/dest are legal.
if (!TD) return false;
-
+
unsigned FromWidth = From->getPrimitiveSizeInBits();
unsigned ToWidth = To->getPrimitiveSizeInBits();
bool FromLegal = TD->isLegalInteger(FromWidth);
bool ToLegal = TD->isLegalInteger(ToWidth);
-
+
// If this is a legal integer from type, and the result would be an illegal
// type, don't do the transformation.
if (FromLegal && !ToLegal)
return false;
-
+
// Otherwise, if both are illegal, do not increase the size of the result. We
// do allow things like i160 -> i64, but not i64 -> i160.
if (!FromLegal && !ToLegal && ToWidth > FromWidth)
return false;
-
+
return true;
}
@@ -127,7 +131,7 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
// We reason about Add and Sub Only.
Instruction::BinaryOps Opcode = I.getOpcode();
- if (Opcode != Instruction::Add &&
+ if (Opcode != Instruction::Add &&
Opcode != Instruction::Sub) {
return false;
}
@@ -203,7 +207,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
if (MaintainNoSignedWrap(I, B, C) &&
- (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) {
+ (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) {
// Note: this is only valid because SimplifyBinOp doesn't look at
// the operands to Op0.
I.clearSubclassOptionalData();
@@ -211,7 +215,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
} else {
I.clearSubclassOptionalData();
}
-
+
Changed = true;
++NumReassoc;
continue;
@@ -540,7 +544,7 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
Value *Op0 = SO, *Op1 = ConstOperand;
if (!ConstIsRHS)
std::swap(Op0, Op1);
-
+
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
return IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1,
SO->getName()+".op");
@@ -579,7 +583,7 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements())
return 0;
}
-
+
Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this);
Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this);
@@ -599,7 +603,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
unsigned NumPHIValues = PN->getNumIncomingValues();
if (NumPHIValues == 0)
return 0;
-
+
// We normally only transform phis with a single use. However, if a PHI has
// multiple uses and they are all the same operation, we can fold *all* of the
// uses into the PHI.
@@ -613,7 +617,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
}
// Otherwise, we can replace *all* users with the new PHI we form.
}
-
+
// Check to see if all of the operands of the PHI are simple constants
// (constantint/constantfp/undef). If there is one non-constant value,
// remember the BB it is in. If there is more than one or if *it* is a PHI,
@@ -627,7 +631,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
if (isa<PHINode>(InVal)) return 0; // Itself a phi.
if (NonConstBB) return 0; // More than one non-const value.
-
+
NonConstBB = PN->getIncomingBlock(i);
// If the InVal is an invoke at the end of the pred block, then we can't
@@ -635,14 +639,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
if (II->getParent() == NonConstBB)
return 0;
-
+
// If the incoming non-constant value is in I's block, we will remove one
// instruction, but insert another equivalent one, leading to infinite
// instcombine.
if (NonConstBB == I.getParent())
return 0;
}
-
+
// If there is exactly one non-constant value, we can insert a copy of the
// operation in that block. However, if this is a critical edge, we would be
  // inserting the computation on some other paths (e.g. inside a loop).  Only
@@ -656,12 +660,12 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
InsertNewInstBefore(NewPN, *PN);
NewPN->takeName(PN);
-
+
// If we are going to have to insert a new computation, do so right before the
// predecessors terminator.
if (NonConstBB)
Builder->SetInsertPoint(NonConstBB->getTerminator());
-
+
// Next, add all of the operands to the PHI.
if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
// We only currently try to fold the condition of a select when it is a phi,
@@ -706,20 +710,20 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
PN->getIncomingValue(i), C, "phitmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
- } else {
+ } else {
CastInst *CI = cast<CastInst>(&I);
Type *RetTy = CI->getType();
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV;
if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
- else
+ else
InV = Builder->CreateCast(CI->getOpcode(),
PN->getIncomingValue(i), I.getType(), "phitmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
}
-
+
for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
UI != E; ) {
Instruction *User = cast<Instruction>(*UI++);
@@ -734,11 +738,11 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
/// or not there is a sequence of GEP indices into the type that will land us at
/// the specified offset. If so, fill them into NewIndices and return the
/// resultant element type, otherwise return null.
-Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
+Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
SmallVectorImpl<Value*> &NewIndices) {
if (!TD) return 0;
if (!Ty->isSized()) return 0;
-
+
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
@@ -747,7 +751,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
Offset -= FirstIdx*TySize;
-
+
// Handle hosts where % returns negative instead of values [0..TySize).
if (Offset < 0) {
--FirstIdx;
@@ -756,24 +760,24 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
}
assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
}
-
+
NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
-
+
// Index into the types. If we fail, set OrigBase to null.
while (Offset) {
// Indexing into tail padding between struct/array elements.
if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty))
return 0;
-
+
if (StructType *STy = dyn_cast<StructType>(Ty)) {
const StructLayout *SL = TD->getStructLayout(STy);
assert(Offset < (int64_t)SL->getSizeInBytes() &&
"Offset must stay within the indexed type");
-
+
unsigned Elt = SL->getElementContainingOffset(Offset);
NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
Elt));
-
+
Offset -= SL->getElementOffset(Elt);
Ty = STy->getElementType(Elt);
} else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
@@ -787,7 +791,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
return 0;
}
}
-
+
return Ty;
}
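
The walk above repeatedly picks the element containing the remaining offset and recurses with the remainder; for structs that is getElementContainingOffset on the struct layout. A simplified, flat-struct analog in plain C++ (hypothetical helper, not the pass's code):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Field byte-offsets are sorted; return the field whose range holds Offset.
    unsigned elementContainingOffset(const std::vector<uint64_t> &FieldOffsets,
                                     uint64_t Offset) {
      unsigned Elt = 0;
      while (Elt + 1 < FieldOffsets.size() && FieldOffsets[Elt + 1] <= Offset)
        ++Elt;
      return Elt;
    }

    int main() {
      std::vector<uint64_t> Offsets = {0, 4, 8};  // e.g. { i32, float, double }
      assert(elementContainingOffset(Offsets, 6) == 1);
      // The caller would then recurse into field 1 with offset 6 - 4 == 2.
    }
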
@@ -948,7 +952,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Res->setIsInBounds(GEP.isInBounds());
return Res;
}
-
+
if (ArrayType *XATy =
dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){
// GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
@@ -981,16 +985,16 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// V and GEP are both pointer types --> BitCast
return new BitCastInst(NewGEP, GEP.getType());
}
-
+
// Transform things like:
// getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
// (where tmp = 8*tmp2) into:
// getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
-
+
if (TD && SrcElTy->isArrayTy() && ResElTy->isIntegerTy(8)) {
uint64_t ArrayEltSize =
TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
-
+
// Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We
// allow either a mul, shift, or constant here.
Value *NewIdx = 0;
@@ -1015,7 +1019,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
NewIdx = Inst->getOperand(0);
}
}
-
+
// If the index will be to exactly the right offset with the scale taken
// out, perform the transformation. Note, we don't know whether Scale is
// signed or not. We'll use unsigned version of division/modulo
@@ -1054,10 +1058,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
!isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices() &&
StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
- // Determine how much the GEP moves the pointer. We are guaranteed to get
- // a constant back from EmitGEPOffset.
- ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP));
- int64_t Offset = OffsetV->getSExtValue();
+ // Determine how much the GEP moves the pointer.
+ SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end());
+ int64_t Offset = TD->getIndexedOffset(GEP.getPointerOperandType(), Ops);
// If this GEP instruction doesn't move the pointer, just replace the GEP
// with a bitcast of the real input to the dest type.
@@ -1065,7 +1068,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
if (isa<AllocaInst>(BCI->getOperand(0)) ||
- isMalloc(BCI->getOperand(0))) {
+ isAllocationFn(BCI->getOperand(0))) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
@@ -1078,7 +1081,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
return new BitCastInst(BCI->getOperand(0), GEP.getType());
}
-
+
// Otherwise, if the offset is non-zero, we need to find out if there is a
// field at Offset in 'A's type. If so, we can pull the cast through the
// GEP.
@@ -1089,68 +1092,103 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Value *NGEP = GEP.isInBounds() ?
Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) :
Builder->CreateGEP(BCI->getOperand(0), NewIndices);
-
+
if (NGEP->getType() == GEP.getType())
return ReplaceInstUsesWith(GEP, NGEP);
NGEP->takeName(&GEP);
return new BitCastInst(NGEP, GEP.getType());
}
}
- }
-
+ }
+
return 0;
}
-static bool IsOnlyNullComparedAndFreed(Value *V, SmallVectorImpl<WeakVH> &Users,
- int Depth = 0) {
- if (Depth == 8)
- return false;
+static bool
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) {
+ SmallVector<Instruction*, 4> Worklist;
+ Worklist.push_back(AI);
- for (Value::use_iterator UI = V->use_begin(), UE = V->use_end();
- UI != UE; ++UI) {
- User *U = *UI;
- if (isFreeCall(U)) {
- Users.push_back(U);
- continue;
- }
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) {
- if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) {
- Users.push_back(ICI);
+ do {
+ Instruction *PI = Worklist.pop_back_val();
+ for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); UI != UE;
+ ++UI) {
+ Instruction *I = cast<Instruction>(*UI);
+ switch (I->getOpcode()) {
+ default:
+ // Give up the moment we see something we can't handle.
+ return false;
+
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ Users.push_back(I);
+ Worklist.push_back(I);
continue;
- }
- }
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
- if (IsOnlyNullComparedAndFreed(BCI, Users, Depth+1)) {
- Users.push_back(BCI);
+
+ case Instruction::ICmp: {
+ ICmpInst *ICI = cast<ICmpInst>(I);
+ // We can fold eq/ne comparisons with null to false/true, respectively.
+ if (!ICI->isEquality() || !isa<ConstantPointerNull>(ICI->getOperand(1)))
+ return false;
+ Users.push_back(I);
continue;
}
- }
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
- if (IsOnlyNullComparedAndFreed(GEPI, Users, Depth+1)) {
- Users.push_back(GEPI);
+
+ case Instruction::Call:
+ // Ignore no-op and store intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ case Intrinsic::memset: {
+ MemIntrinsic *MI = cast<MemIntrinsic>(II);
+ if (MI->isVolatile() || MI->getRawDest() != PI)
+ return false;
+ }
+ // fall through
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_value:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::objectsize:
+ Users.push_back(I);
+ continue;
+ }
+ }
+
+ if (isFreeCall(I)) {
+ Users.push_back(I);
+ continue;
+ }
+ return false;
+
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ if (SI->isVolatile() || SI->getPointerOperand() != PI)
+ return false;
+ Users.push_back(I);
continue;
}
- }
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- Users.push_back(II);
- continue;
}
+ llvm_unreachable("missing a return?");
}
- return false;
- }
+ } while (!Worklist.empty());
return true;
}
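
The rewrite replaces the old bounded recursion (the Depth == 8 cutoff) with an explicit worklist over the transitive users, so arbitrarily deep bitcast/GEP chains are handled. The shape of the walk, reduced to hypothetical types in plain C++ (the real code classifies LLVM instructions instead):

    #include <vector>

    struct Node { bool Removable; std::vector<Node*> Users; };

    bool allTransitiveUsersRemovable(Node *Root) {
      std::vector<Node*> Worklist;
      Worklist.push_back(Root);
      do {
        Node *N = Worklist.back();
        Worklist.pop_back();
        for (Node *U : N->Users) {
          if (!U->Removable)
            return false;        // give up at the first user we can't handle
          Worklist.push_back(U); // the use graph of an alloc site is acyclic
        }
      } while (!Worklist.empty());
      return true;
    }
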
-Instruction *InstCombiner::visitMalloc(Instruction &MI) {
+Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
  // If we have a malloc call which is only used in any number of comparisons
// to null and free calls, delete the calls and replace the comparisons with
// true or false as appropriate.
SmallVector<WeakVH, 64> Users;
- if (IsOnlyNullComparedAndFreed(&MI, Users)) {
+ if (isAllocSiteRemovable(&MI, Users)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
Instruction *I = cast_or_null<Instruction>(&*Users[i]);
if (!I) continue;
@@ -1161,9 +1199,23 @@ Instruction *InstCombiner::visitMalloc(Instruction &MI) {
C->isFalseWhenEqual()));
} else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
ReplaceInstUsesWith(*I, UndefValue::get(I->getType()));
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::objectsize) {
+ ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
+ uint64_t DontKnow = CI->isZero() ? -1ULL : 0;
+ ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow));
+ }
}
EraseInstFromFunction(*I);
}
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
+ // Replace invoke with a NOP intrinsic to maintain the original CFG
+ Module *M = II->getParent()->getParent()->getParent();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
+ InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
+ ArrayRef<Value *>(), "", II->getParent());
+ }
return EraseInstFromFunction(MI);
}
return 0;
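
The objectsize handling above relies on the intrinsic's second argument selecting min/max semantics; once the allocation is deleted, the call folds to the "don't know" value for that mode. The encoding as a plain-C++ sketch:

    #include <cstdint>

    // Second argument of llvm.objectsize: false requests the maximum size,
    // true requests the minimum size.
    uint64_t objectSizeDontKnow(bool Min) {
      return Min ? 0 : ~0ULL;   // unknown max is -1, unknown min is 0
    }
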
@@ -1181,7 +1233,7 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
return EraseInstFromFunction(FI);
}
-
+
// If we have 'free null' delete the instruction. This can happen in stl code
// when lots of inlining happens.
if (isa<ConstantPointerNull>(Op))
@@ -1207,14 +1259,14 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
  // Canonicalize fcmp_one -> fcmp_oeq
FCmpInst::Predicate FPred; Value *Y;
- if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
+ if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
TrueDest, FalseDest)) &&
BI.getCondition()->hasOneUse())
if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
FPred == FCmpInst::FCMP_OGE) {
FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
Cond->setPredicate(FCmpInst::getInversePredicate(FPred));
-
+
// Swap Destinations and condition.
BI.swapSuccessors();
Worklist.Add(Cond);
@@ -1280,7 +1332,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
}
return 0; // Can't handle other constants
}
-
+
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
// We're extracting from an insertvalue instruction, compare the indices
const unsigned *exti, *exte, *insi, *inse;
@@ -1329,7 +1381,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
// %E = extractvalue { i32, { i32 } } %I, 1, 0
// with
// %E extractvalue { i32 } { i32 42 }, 0
- return ExtractValueInst::Create(IV->getInsertedValueOperand(),
+ return ExtractValueInst::Create(IV->getInsertedValueOperand(),
makeArrayRef(exti, exte));
}
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) {
@@ -1349,7 +1401,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
EraseInstFromFunction(*II);
return BinaryOperator::CreateAdd(LHS, RHS);
}
-
+
// If the normal result of the add is dead, and the RHS is a constant,
// we can transform this into a range comparison.
// overflow = uadd a, -4 --> overflow = icmp ugt a, 3
@@ -1798,7 +1850,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
/// many instructions are dead or constant). Additionally, if we find a branch
/// whose condition is a known constant, we only visit the reachable successors.
///
-static bool AddReachableCodeToWorklist(BasicBlock *BB,
+static bool AddReachableCodeToWorklist(BasicBlock *BB,
SmallPtrSet<BasicBlock*, 64> &Visited,
InstCombiner &IC,
const TargetData *TD,
@@ -1812,13 +1864,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
do {
BB = Worklist.pop_back_val();
-
+
// We have now visited this block! If we've already been here, ignore it.
if (!Visited.insert(BB)) continue;
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
Instruction *Inst = BBI++;
-
+
// DCE instruction if trivially dead.
if (isInstructionTriviallyDead(Inst)) {
++NumDeadInst;
@@ -1826,7 +1878,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
Inst->eraseFromParent();
continue;
}
-
+
// ConstantProp instruction if trivially constant.
if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0)))
if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) {
@@ -1837,7 +1889,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
Inst->eraseFromParent();
continue;
}
-
+
if (TD) {
// See if we can constant fold its operands.
for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end();
@@ -1881,17 +1933,17 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
Worklist.push_back(ReachableBB);
continue;
}
-
+
// Otherwise it is the default destination.
Worklist.push_back(SI->getDefaultDest());
continue;
}
}
-
+
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
Worklist.push_back(TI->getSuccessor(i));
} while (!Worklist.empty());
-
+
// Once we've found all of the instructions to add to instcombine's worklist,
// add them in reverse order. This way instcombine will visit from the top
// of the function down. This jives well with the way that it adds all uses
@@ -1899,13 +1951,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
// some N^2 behavior in pathological cases.
IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0],
InstrsForInstCombineWorklist.size());
-
+
return MadeIRChange;
}
bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
MadeIRChange = false;
-
+
DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
@@ -1976,13 +2028,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
BasicBlock *BB = I->getParent();
Instruction *UserInst = cast<Instruction>(I->use_back());
BasicBlock *UserParent;
-
+
// Get the block the use occurs in.
if (PHINode *PN = dyn_cast<PHINode>(UserInst))
UserParent = PN->getIncomingBlock(I->use_begin().getUse());
else
UserParent = UserInst->getParent();
-
+
if (UserParent != BB) {
bool UserIsSuccessor = false;
// See if the user is one of our successors.
@@ -2004,7 +2056,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
// Now that we have an instruction, try combining it to simplify it.
Builder->SetInsertPoint(I->getParent(), I);
Builder->SetCurrentDebugLocation(I->getDebugLoc());
-
+
#ifndef NDEBUG
std::string OrigI;
#endif
@@ -2069,14 +2121,14 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
bool InstCombiner::runOnFunction(Function &F) {
TD = getAnalysisIfAvailable<TargetData>();
TLI = &getAnalysis<TargetLibraryInfo>();
-
+
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
- IRBuilder<true, TargetFolder, InstCombineIRInserter>
+ IRBuilder<true, TargetFolder, InstCombineIRInserter>
TheBuilder(F.getContext(), TargetFolder(TD),
InstCombineIRInserter(Worklist));
Builder = &TheBuilder;
-
+
bool EverMadeChange = false;
// Lower dbg.declare intrinsics otherwise their value may be clobbered
@@ -2087,7 +2139,7 @@ bool InstCombiner::runOnFunction(Function &F) {
unsigned Iteration = 0;
while (DoOneIteration(F, Iteration++))
EverMadeChange = true;
-
+
Builder = 0;
return EverMadeChange;
}
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b43b9e5..bf35eac 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -16,20 +16,23 @@
#define DEBUG_TYPE "asan"
#include "FunctionBlackList.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Function.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
#include "llvm/Target/TargetData.h"
@@ -37,7 +40,6 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Type.h"
#include <string>
#include <algorithm>
@@ -47,6 +49,7 @@ using namespace llvm;
static const uint64_t kDefaultShadowScale = 3;
static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
+static const uint64_t kDefaultShadowOffsetAndroid = 0;
static const size_t kMaxStackMallocSize = 1 << 16; // 64K
static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
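
These constants define ASan's direct shadow mapping: every 1 << scale application bytes share one shadow byte at a fixed offset. The address computation as plain arithmetic (a sketch; the pass builds the same expression in IR):

    #include <cstdint>

    uintptr_t memToShadow(uintptr_t Addr, unsigned Scale, uintptr_t Offset) {
      return (Addr >> Scale) + Offset;
    }
    // With the defaults above (Scale == 3, 32-bit Offset == 1 << 29), address
    // 0x1000 maps to shadow byte 0x200 + 0x20000000.
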
@@ -70,6 +73,9 @@ static const int kAsanStackMidRedzoneMagic = 0xf2;
static const int kAsanStackRightRedzoneMagic = 0xf3;
static const int kAsanStackPartialRedzoneMagic = 0xf4;
+// Access sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
// Command-line flags.
// This flag may need to be replaced with -f[no-]asan-reads.
@@ -77,6 +83,17 @@ static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
cl::desc("instrument read instructions"), cl::Hidden, cl::init(true));
static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes",
cl::desc("instrument write instructions"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInstrumentAtomics("asan-instrument-atomics",
+ cl::desc("instrument atomic instructions (rmw, cmpxchg)"),
+ cl::Hidden, cl::init(true));
+// This flag limits the number of instructions to be instrumented
+// in any given BB. Normally, this should be set to unlimited (INT_MAX),
+// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
+// set it to 10000.
+static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb",
+ cl::init(10000),
+ cl::desc("maximal number of instructions to instrument in any given BB"),
+ cl::Hidden);
// This flag may need to be replaced with -f[no]asan-stack.
static cl::opt<bool> ClStack("asan-stack",
cl::desc("Handle stack memory"), cl::Hidden, cl::init(true));
@@ -125,18 +142,29 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
namespace {
+/// An object of this type is created while instrumenting every function.
+struct AsanFunctionContext {
+ AsanFunctionContext(Function &Function) : F(Function) { }
+
+ Function &F;
+};
+
/// AddressSanitizer: instrument the code in module to find memory bugs.
struct AddressSanitizer : public ModulePass {
AddressSanitizer();
virtual const char *getPassName() const;
- void instrumentMop(Instruction *I);
- void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB,
+ void instrumentMop(AsanFunctionContext &AFC, Instruction *I);
+ void instrumentAddress(AsanFunctionContext &AFC,
+ Instruction *OrigIns, IRBuilder<> &IRB,
Value *Addr, uint32_t TypeSize, bool IsWrite);
- Instruction *generateCrashCode(IRBuilder<> &IRB, Value *Addr,
- bool IsWrite, uint32_t TypeSize);
- bool instrumentMemIntrinsic(MemIntrinsic *MI);
- void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr,
- Value *Size,
+ Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue, uint32_t TypeSize);
+ Instruction *generateCrashCode(BasicBlock *BB, Value *Addr, Value *PC,
+ bool IsWrite, size_t AccessSizeIndex);
+ bool instrumentMemIntrinsic(AsanFunctionContext &AFC, MemIntrinsic *MI);
+ void instrumentMemIntrinsicParam(AsanFunctionContext &AFC,
+ Instruction *OrigIns, Value *Addr,
+ Value *Size,
Instruction *InsertBefore, bool IsWrite);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
bool handleFunction(Module &M, Function &F);
@@ -144,7 +172,6 @@ struct AddressSanitizer : public ModulePass {
bool poisonStackInFunction(Module &M, Function &F);
virtual bool runOnModule(Module &M);
bool insertGlobalRedzones(Module &M);
- BranchInst *splitBlockAndInsertIfThen(Instruction *SplitBefore, Value *Cmp);
static char ID; // Pass identification, replacement for typeid
private:
@@ -163,11 +190,11 @@ struct AddressSanitizer : public ModulePass {
return getAlignedSize(SizeInBytes);
}
+ Function *checkInterfaceFunction(Constant *FuncOrBitcast);
void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
Value *ShadowBase, bool DoPoison);
bool LooksLikeCodeInBug11395(Instruction *I);
- Module *CurrentModule;
LLVMContext *C;
TargetData *TD;
uint64_t MappingOffset;
@@ -180,7 +207,11 @@ struct AddressSanitizer : public ModulePass {
Function *AsanInitFunction;
Instruction *CtorInsertBefore;
OwningPtr<FunctionBlackList> BL;
+ // This array is indexed by AccessIsWrite and log2(AccessSize).
+ Function *AsanErrorCallback[2][kNumberOfAccessSizes];
+ InlineAsm *EmptyAsm;
};
+
} // namespace
char AddressSanitizer::ID = 0;
@@ -196,6 +227,12 @@ const char *AddressSanitizer::getPassName() const {
return "AddressSanitizer";
}
+static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
+ size_t Res = CountTrailingZeros_32(TypeSize / 8);
+ assert(Res < kNumberOfAccessSizes);
+ return Res;
+}
+
// Create a constant for Str so that we can pass it to the run-time lib.
static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) {
Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
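
TypeSizeToSizeIndex above maps an access size in bits to its slot in the AsanErrorCallback array via log2 of the byte size. A portable plain-C++ sketch of the same mapping:

    #include <cassert>

    unsigned typeSizeToSizeIndex(unsigned TypeSizeInBits) {
      unsigned Bytes = TypeSizeInBits / 8, Idx = 0;
      while (Bytes > 1) { Bytes >>= 1; ++Idx; } // ctz of a power of two
      return Idx;
    }

    int main() {
      assert(typeSizeToSizeIndex(8) == 0);    // 1-byte accesses
      assert(typeSizeToSizeIndex(32) == 2);   // 4-byte accesses
      assert(typeSizeToSizeIndex(128) == 4);  // 16-byte accesses
    }
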
@@ -206,29 +243,32 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) {
// Split the basic block and insert an if-then code.
// Before:
// Head
-// SplitBefore
+// Cmp
// Tail
// After:
// Head
// if (Cmp)
-// NewBasicBlock
-// SplitBefore
+// ThenBlock
// Tail
//
-// Returns the NewBasicBlock's terminator.
-BranchInst *AddressSanitizer::splitBlockAndInsertIfThen(
- Instruction *SplitBefore, Value *Cmp) {
+// If ThenBlock is zero, a new block is created and its terminator is returned.
+// Otherwise 0 is returned.
+static BranchInst *splitBlockAndInsertIfThen(Value *Cmp,
+ BasicBlock *ThenBlock = 0) {
+ Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode();
BasicBlock *Head = SplitBefore->getParent();
BasicBlock *Tail = Head->splitBasicBlock(SplitBefore);
TerminatorInst *HeadOldTerm = Head->getTerminator();
- BasicBlock *NewBasicBlock =
- BasicBlock::Create(*C, "", Head->getParent());
- BranchInst *HeadNewTerm = BranchInst::Create(/*ifTrue*/NewBasicBlock,
- /*ifFalse*/Tail,
- Cmp);
+ BranchInst *CheckTerm = 0;
+ if (!ThenBlock) {
+ LLVMContext &C = Head->getParent()->getParent()->getContext();
+ ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ CheckTerm = BranchInst::Create(Tail, ThenBlock);
+ }
+ BranchInst *HeadNewTerm =
+ BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp);
ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
- BranchInst *CheckTerm = BranchInst::Create(Tail, NewBasicBlock);
return CheckTerm;
}
@@ -242,12 +282,13 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
MappingOffset));
}
-void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns,
+void AddressSanitizer::instrumentMemIntrinsicParam(
+ AsanFunctionContext &AFC, Instruction *OrigIns,
Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) {
// Check the first byte.
{
IRBuilder<> IRB(InsertBefore);
- instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite);
+ instrumentAddress(AFC, OrigIns, IRB, Addr, 8, IsWrite);
}
// Check the last byte.
{
@@ -257,15 +298,16 @@ void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns,
SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne);
- instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite);
+ instrumentAddress(AFC, OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite);
}
}
// Instrument memset/memmove/memcpy
-bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC,
+ MemIntrinsic *MI) {
Value *Dst = MI->getDest();
MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI);
- Value *Src = MemTran ? MemTran->getSource() : NULL;
+ Value *Src = MemTran ? MemTran->getSource() : 0;
Value *Length = MI->getLength();
Constant *ConstLength = dyn_cast<Constant>(Length);
@@ -277,26 +319,46 @@ bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
IRBuilder<> IRB(InsertBefore);
Value *Cmp = IRB.CreateICmpNE(Length,
- Constant::getNullValue(Length->getType()));
- InsertBefore = splitBlockAndInsertIfThen(InsertBefore, Cmp);
+ Constant::getNullValue(Length->getType()));
+ InsertBefore = splitBlockAndInsertIfThen(Cmp);
}
- instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true);
+ instrumentMemIntrinsicParam(AFC, MI, Dst, Length, InsertBefore, true);
if (Src)
- instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false);
+ instrumentMemIntrinsicParam(AFC, MI, Src, Length, InsertBefore, false);
return true;
}
-static Value *getLDSTOperand(Instruction *I) {
+// If I is an interesting memory access, return the PointerOperand
+// and set IsWrite. Otherwise return NULL.
+static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!ClInstrumentReads) return NULL;
+ *IsWrite = false;
return LI->getPointerOperand();
}
- return cast<StoreInst>(*I).getPointerOperand();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!ClInstrumentWrites) return NULL;
+ *IsWrite = true;
+ return SI->getPointerOperand();
+ }
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ if (!ClInstrumentAtomics) return NULL;
+ *IsWrite = true;
+ return RMW->getPointerOperand();
+ }
+ if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!ClInstrumentAtomics) return NULL;
+ *IsWrite = true;
+ return XCHG->getPointerOperand();
+ }
+ return NULL;
}
-void AddressSanitizer::instrumentMop(Instruction *I) {
- int IsWrite = isa<StoreInst>(*I);
- Value *Addr = getLDSTOperand(I);
+void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) {
+ bool IsWrite;
+ Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
+ assert(Addr);
if (ClOpt && ClOptGlobals && isa<GlobalVariable>(Addr)) {
// We are accessing a global scalar variable. Nothing to catch here.
return;
@@ -314,22 +376,57 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
}
IRBuilder<> IRB(I);
- instrumentAddress(I, IRB, Addr, TypeSize, IsWrite);
+ instrumentAddress(AFC, I, IRB, Addr, TypeSize, IsWrite);
+}
+
+// Validate the result of Module::getOrInsertFunction called for an interface
+// function of AddressSanitizer. If the instrumented module defines a function
+// with the same name, their prototypes must match, otherwise
+// getOrInsertFunction returns a bitcast.
+Function *AddressSanitizer::checkInterfaceFunction(Constant *FuncOrBitcast) {
+ if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast);
+ FuncOrBitcast->dump();
+ report_fatal_error("trying to redefine an AddressSanitizer "
+ "interface function");
}
Instruction *AddressSanitizer::generateCrashCode(
- IRBuilder<> &IRB, Value *Addr, bool IsWrite, uint32_t TypeSize) {
- // IsWrite and TypeSize are encoded in the function name.
- std::string FunctionName = std::string(kAsanReportErrorTemplate) +
- (IsWrite ? "store" : "load") + itostr(TypeSize / 8);
- Value *ReportWarningFunc = CurrentModule->getOrInsertFunction(
- FunctionName, IRB.getVoidTy(), IntptrTy, NULL);
- CallInst *Call = IRB.CreateCall(ReportWarningFunc, Addr);
- Call->setDoesNotReturn();
+ BasicBlock *BB, Value *Addr, Value *PC,
+ bool IsWrite, size_t AccessSizeIndex) {
+ IRBuilder<> IRB(BB->getFirstNonPHI());
+ CallInst *Call;
+ if (PC)
+ Call = IRB.CreateCall2(AsanErrorCallback[IsWrite][AccessSizeIndex],
+ Addr, PC);
+ else
+ Call = IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr);
+ // We don't do Call->setDoesNotReturn() because the BB already has
+  // an UnreachableInst at the end.
+ // This EmptyAsm is required to avoid callback merge.
+ IRB.CreateCall(EmptyAsm);
return Call;
}
-void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
+Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue,
+ uint32_t TypeSize) {
+ size_t Granularity = 1 << MappingScale;
+ // Addr & (Granularity - 1)
+ Value *LastAccessedByte = IRB.CreateAnd(
+ AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+ // (Addr & (Granularity - 1)) + size - 1
+ if (TypeSize / 8 > 1)
+ LastAccessedByte = IRB.CreateAdd(
+ LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
+ // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+ LastAccessedByte = IRB.CreateIntCast(
+ LastAccessedByte, IRB.getInt8Ty(), false);
+ // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+ return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
+void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC,
+ Instruction *OrigIns,
IRBuilder<> &IRB, Value *Addr,
uint32_t TypeSize, bool IsWrite) {
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
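
What createSlowPathCmp, defined above, computes over one shadow granule: a shadow value k in 1..7 means only the first k bytes of the 8-byte granule are addressable, so the access faults when its last byte index reaches k. As plain C++ (a sketch, with Granularity == 8):

    #include <cstdint>

    bool slowPathFaults(uintptr_t Addr, unsigned AccessBytes, int8_t Shadow) {
      unsigned LastByte = (Addr & 7) + AccessBytes - 1;
      return (int8_t)LastByte >= Shadow;   // signed compare, as in the IR
    }
    // e.g. a 4-byte load at Addr % 8 == 2 with Shadow == 6 touches byte 5 and
    // passes; the same load at Addr % 8 == 4 touches byte 7 and faults.
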
@@ -344,31 +441,25 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
- Instruction *CheckTerm = splitBlockAndInsertIfThen(
- cast<Instruction>(Cmp)->getNextNode(), Cmp);
- IRBuilder<> IRB2(CheckTerm);
+ BasicBlock *CrashBlock = BasicBlock::Create(*C, "crash_bb", &AFC.F);
+ new UnreachableInst(*C, CrashBlock);
+ size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+ Instruction *Crash =
+ generateCrashCode(CrashBlock, AddrLong, 0, IsWrite, AccessSizeIndex);
+ Crash->setDebugLoc(OrigIns->getDebugLoc());
size_t Granularity = 1 << MappingScale;
if (TypeSize < 8 * Granularity) {
- // Addr & (Granularity - 1)
- Value *Lower3Bits = IRB2.CreateAnd(
- AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
- // (Addr & (Granularity - 1)) + size - 1
- Value *LastAccessedByte = IRB2.CreateAdd(
- Lower3Bits, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
- // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
- LastAccessedByte = IRB2.CreateIntCast(
- LastAccessedByte, IRB.getInt8Ty(), false);
- // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
- Value *Cmp2 = IRB2.CreateICmpSGE(LastAccessedByte, ShadowValue);
-
- CheckTerm = splitBlockAndInsertIfThen(CheckTerm, Cmp2);
- }
-
- IRBuilder<> IRB1(CheckTerm);
- Instruction *Crash = generateCrashCode(IRB1, AddrLong, IsWrite, TypeSize);
- Crash->setDebugLoc(OrigIns->getDebugLoc());
- ReplaceInstWithInst(CheckTerm, new UnreachableInst(*C));
+ BranchInst *CheckTerm = splitBlockAndInsertIfThen(Cmp);
+ assert(CheckTerm->isUnconditional());
+ BasicBlock *NextBB = CheckTerm->getSuccessor(0);
+ IRB.SetInsertPoint(CheckTerm);
+ Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
+ BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
+ ReplaceInstWithInst(CheckTerm, NewTerm);
+ } else {
+ splitBlockAndInsertIfThen(Cmp, CrashBlock);
+ }
}
// This function replaces all global variables with new variables that have
@@ -473,7 +564,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
// Create a new global variable with enough space for a redzone.
GlobalVariable *NewGlobal = new GlobalVariable(
M, NewTy, G->isConstant(), G->getLinkage(),
- NewInitializer, "", G, G->isThreadLocal());
+ NewInitializer, "", G, G->getThreadLocalMode());
NewGlobal->copyAttributesFrom(G);
NewGlobal->setAlignment(RedzoneSize);
@@ -501,7 +592,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage,
ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
- Function *AsanRegisterGlobals = cast<Function>(M.getOrInsertFunction(
+ Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
@@ -516,8 +607,10 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB));
- Function *AsanUnregisterGlobals = cast<Function>(M.getOrInsertFunction(
- kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+ Function *AsanUnregisterGlobals =
+ checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnregisterGlobalsName,
+ IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
IRB_Dtor.CreateCall2(AsanUnregisterGlobals,
@@ -537,7 +630,6 @@ bool AddressSanitizer::runOnModule(Module &M) {
return false;
BL.reset(new FunctionBlackList(ClBlackListFile));
- CurrentModule = &M;
C = &(M.getContext());
LongSize = TD->getPointerSizeInBits();
IntptrTy = Type::getIntNTy(*C, LongSize);
@@ -551,13 +643,33 @@ bool AddressSanitizer::runOnModule(Module &M) {
// call __asan_init in the module ctor.
IRBuilder<> IRB(CtorInsertBefore);
- AsanInitFunction = cast<Function>(
+ AsanInitFunction = checkInterfaceFunction(
M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL));
AsanInitFunction->setLinkage(Function::ExternalLinkage);
IRB.CreateCall(AsanInitFunction);
- MappingOffset = LongSize == 32
- ? kDefaultShadowOffset32 : kDefaultShadowOffset64;
+ // Create __asan_report* callbacks.
+ for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ // IsWrite and TypeSize are encoded in the function name.
+ std::string FunctionName = std::string(kAsanReportErrorTemplate) +
+ (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex);
+ // If we are merging crash callbacks, they have two parameters.
+ AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>(
+ M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
+ }
+ }
+ // We insert an empty inline asm after __asan_report* to avoid callback merge.
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+ StringRef(""), StringRef(""),
+ /*hasSideEffects=*/true);
+
+ llvm::Triple targetTriple(M.getTargetTriple());
+ bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::ANDROIDEABI;
+
+ MappingOffset = isAndroid ? kDefaultShadowOffsetAndroid :
+ (LongSize == 32 ? kDefaultShadowOffset32 : kDefaultShadowOffset64);
if (ClMappingOffsetLog >= 0) {
if (ClMappingOffsetLog == 0) {
// special case
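
The callbacks created above encode IsWrite and the access size in the symbol name. Assuming kAsanReportErrorTemplate expands to "__asan_report_" (the constant's value is not shown in this hunk), the generated set looks like this plain-C++ sketch:

    #include <string>

    std::string reportName(bool IsWrite, unsigned AccessSizeIndex) {
      return std::string("__asan_report_") + (IsWrite ? "store" : "load")
           + std::to_string(1u << AccessSizeIndex);   // 1, 2, 4, 8, 16
    }
    // yields __asan_report_load1 ... __asan_report_load16 and
    // __asan_report_store1 ... __asan_report_store16.
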
@@ -640,17 +752,17 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) {
SmallSet<Value*, 16> TempsToInstrument;
SmallVector<Instruction*, 16> ToInstrument;
SmallVector<Instruction*, 8> NoReturnCalls;
+ bool IsWrite;
// Fill the set of memory operations to instrument.
for (Function::iterator FI = F.begin(), FE = F.end();
FI != FE; ++FI) {
TempsToInstrument.clear();
+ int NumInsnsPerBB = 0;
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
if (LooksLikeCodeInBug11395(BI)) return false;
- if ((isa<LoadInst>(BI) && ClInstrumentReads) ||
- (isa<StoreInst>(BI) && ClInstrumentWrites)) {
- Value *Addr = getLDSTOperand(BI);
+ if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) {
if (ClOpt && ClOptSameTemp) {
if (!TempsToInstrument.insert(Addr))
continue; // We've seen this temp in the current BB.
@@ -668,19 +780,24 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) {
continue;
}
ToInstrument.push_back(BI);
+ NumInsnsPerBB++;
+ if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB)
+ break;
}
}
+ AsanFunctionContext AFC(F);
+
// Instrument.
int NumInstrumented = 0;
for (size_t i = 0, n = ToInstrument.size(); i != n; i++) {
Instruction *Inst = ToInstrument[i];
if (ClDebugMin < 0 || ClDebugMax < 0 ||
(NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
- if (isa<StoreInst>(Inst) || isa<LoadInst>(Inst))
- instrumentMop(Inst);
+ if (isInterestingMemoryAccess(Inst, &IsWrite))
+ instrumentMop(AFC, Inst);
else
- instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+ instrumentMemIntrinsic(AFC, cast<MemIntrinsic>(Inst));
}
NumInstrumented++;
}
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
new file mode 100644
index 0000000..09e0f14
--- /dev/null
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -0,0 +1,209 @@
+//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that instruments the code to perform run-time
+// bounds checking on loads, stores, and other memory intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "bounds-checking"
+#include "llvm/IRBuilder.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/TargetFolder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Instrumentation.h"
+using namespace llvm;
+
+static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
+ cl::desc("Use one trap block per function"));
+
+STATISTIC(ChecksAdded, "Bounds checks added");
+STATISTIC(ChecksSkipped, "Bounds checks skipped");
+STATISTIC(ChecksUnable, "Bounds checks unable to add");
+
+typedef IRBuilder<true, TargetFolder> BuilderTy;
+
+namespace {
+ struct BoundsChecking : public FunctionPass {
+ static char ID;
+
+ BoundsChecking(unsigned _Penalty = 5) : FunctionPass(ID), Penalty(_Penalty){
+ initializeBoundsCheckingPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ }
+
+ private:
+ const TargetData *TD;
+ ObjectSizeOffsetEvaluator *ObjSizeEval;
+ BuilderTy *Builder;
+ Instruction *Inst;
+ BasicBlock *TrapBB;
+ unsigned Penalty;
+
+ BasicBlock *getTrapBB();
+ void emitBranchToTrap(Value *Cmp = 0);
+ bool computeAllocSize(Value *Ptr, APInt &Offset, Value* &OffsetValue,
+ APInt &Size, Value* &SizeValue);
+ bool instrument(Value *Ptr, Value *Val);
+ };
+}
+
+char BoundsChecking::ID = 0;
+INITIALIZE_PASS(BoundsChecking, "bounds-checking", "Run-time bounds checking",
+ false, false)
+
+
+/// getTrapBB - create a basic block that traps. All failing bounds checks
+/// branch to this block. With -bounds-checking-single-trap, a single trap
+/// block is shared per function.
+BasicBlock *BoundsChecking::getTrapBB() {
+ if (TrapBB && SingleTrapBB)
+ return TrapBB;
+
+ Function *Fn = Inst->getParent()->getParent();
+ BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint();
+ TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
+ Builder->SetInsertPoint(TrapBB);
+
+ llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
+ CallInst *TrapCall = Builder->CreateCall(F);
+ TrapCall->setDoesNotReturn();
+ TrapCall->setDoesNotThrow();
+ TrapCall->setDebugLoc(Inst->getDebugLoc());
+ Builder->CreateUnreachable();
+
+ Builder->SetInsertPoint(PrevInsertPoint);
+ return TrapBB;
+}
+
+
+/// emitBranchToTrap - emit a branch instruction to a trap block.
+/// If Cmp is non-null, perform a jump only if its value evaluates to true.
+void BoundsChecking::emitBranchToTrap(Value *Cmp) {
+  // a constant comparison decides the branch statically: a false constant
+  // drops the check, a true one becomes an unconditional branch to the trap
+ ConstantInt *C = dyn_cast_or_null<ConstantInt>(Cmp);
+ if (C) {
+ ++ChecksSkipped;
+ if (!C->getZExtValue())
+ return;
+ else
+ Cmp = 0; // unconditional branch
+ }
+
+ Instruction *Inst = Builder->GetInsertPoint();
+ BasicBlock *OldBB = Inst->getParent();
+ BasicBlock *Cont = OldBB->splitBasicBlock(Inst);
+ OldBB->getTerminator()->eraseFromParent();
+
+ if (Cmp)
+ BranchInst::Create(getTrapBB(), Cont, Cmp, OldBB);
+ else
+ BranchInst::Create(getTrapBB(), OldBB);
+}
+
+
+/// instrument - adds run-time bounds checks to memory accessing instructions.
+/// Ptr is the pointer that will be read/written, and InstVal is either the
+/// result from the load or the value being stored. It is used to determine the
+/// size of memory block that is touched.
+/// Returns true if any change was made to the IR, false otherwise.
+bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
+ uint64_t NeededSize = TD->getTypeStoreSize(InstVal->getType());
+ DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+ << " bytes\n");
+
+ SizeOffsetEvalType SizeOffset = ObjSizeEval->compute(Ptr);
+
+ if (!ObjSizeEval->bothKnown(SizeOffset)) {
+ ++ChecksUnable;
+ return false;
+ }
+
+ Value *Size = SizeOffset.first;
+ Value *Offset = SizeOffset.second;
+ ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
+
+ IntegerType *IntTy = TD->getIntPtrType(Inst->getContext());
+ Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+
+ // three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+ // . Size - Offset >= NeededSize (unsigned)
+ //
+ // optimization: if Size >= 0 (signed), skip 1st check
+  // FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows
+ Value *ObjSize = Builder->CreateSub(Size, Offset);
+ Value *Cmp2 = Builder->CreateICmpULT(Size, Offset);
+ Value *Cmp3 = Builder->CreateICmpULT(ObjSize, NeededSizeVal);
+ Value *Or = Builder->CreateOr(Cmp2, Cmp3);
+ if (!SizeCI || SizeCI->getValue().slt(0)) {
+ Value *Cmp1 = Builder->CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
+ Or = Builder->CreateOr(Cmp1, Or);
+ }
+ emitBranchToTrap(Or);
+
+ ++ChecksAdded;
+ return true;
+}
+
+bool BoundsChecking::runOnFunction(Function &F) {
+ TD = &getAnalysis<TargetData>();
+
+ TrapBB = 0;
+ BuilderTy TheBuilder(F.getContext(), TargetFolder(TD));
+ Builder = &TheBuilder;
+ ObjectSizeOffsetEvaluator TheObjSizeEval(TD, F.getContext());
+ ObjSizeEval = &TheObjSizeEval;
+
+  // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for
+  // memory-touching instructions
+ std::vector<Instruction*> WorkList;
+ for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+ Instruction *I = &*i;
+ if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) ||
+ isa<AtomicRMWInst>(I))
+ WorkList.push_back(I);
+ }
+
+ bool MadeChange = false;
+ for (std::vector<Instruction*>::iterator i = WorkList.begin(),
+ e = WorkList.end(); i != e; ++i) {
+ Inst = *i;
+
+ Builder->SetInsertPoint(Inst);
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ MadeChange |= instrument(LI->getPointerOperand(), LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ MadeChange |= instrument(SI->getPointerOperand(), SI->getValueOperand());
+ } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ MadeChange |= instrument(AI->getPointerOperand(),AI->getCompareOperand());
+ } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
+ MadeChange |= instrument(AI->getPointerOperand(), AI->getValOperand());
+ } else {
+ llvm_unreachable("unknown Instruction type");
+ }
+ }
+ return MadeChange;
+}
+
+FunctionPass *llvm::createBoundsCheckingPass(unsigned Penalty) {
+ return new BoundsChecking(Penalty);
+}
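The three checks that instrument() emits combine into a single safety predicate; a minimal C++ sketch of that predicate, with hypothetical names and the same signed/unsigned mix as the pass, is:

#include <cstdint>

// The negation of this predicate is what instrument() branches on. Size and
// Offset stand in for the ObjectSizeOffsetEvaluator results; NeededSize is
// the store size of the accessed type.
static bool accessInBounds(int64_t Size, int64_t Offset, uint64_t NeededSize) {
  if (Offset < 0)                                // Offset >= 0
    return false;
  if (uint64_t(Size) < uint64_t(Offset))         // Size >= Offset (unsigned)
    return false;
  return uint64_t(Size - Offset) >= NeededSize;  // room for the access itself
}

Since the pass registers itself as "bounds-checking", it should be reachable via opt -bounds-checking for quick experiments once the patch is applied.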
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index e4c8cf1..00de882 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_library(LLVMInstrumentation
AddressSanitizer.cpp
+ BoundsChecking.cpp
EdgeProfiling.cpp
FunctionBlackList.cpp
GCOVProfiling.cpp
@@ -9,3 +10,5 @@ add_llvm_library(LLVMInstrumentation
ProfilingUtils.cpp
ThreadSanitizer.cpp
)
+
+add_dependencies(LLVMInstrumentation intrinsics_gen)
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 96e5d5b..264a6a6 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -18,22 +18,23 @@
#include "ProfilingUtils.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Instructions.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugLoc.h"
-#include "llvm/Support/InstIterator.h"
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Support/PathV2.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/UniqueVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/PathV2.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <string>
#include <utility>
using namespace llvm;
@@ -57,7 +58,6 @@ namespace {
virtual const char *getPassName() const {
return "GCOV Profiler";
}
-
private:
bool runOnModule(Module &M);
@@ -90,6 +90,7 @@ namespace {
// list.
void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *,
MDNode *>, 8> &);
+ void insertIndirectCounterIncrement();
std::string mangleName(DICompileUnit CU, std::string NewStem);
@@ -421,6 +422,7 @@ bool GCOVProfiler::emitProfileArcs() {
if (!CU_Nodes) return false;
bool Result = false;
+ bool InsertIndCounterIncrCode = false;
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
DICompileUnit CU(CU_Nodes->getOperand(i));
DIArray SPs = CU.getSubprograms();
@@ -446,7 +448,7 @@ bool GCOVProfiler::emitProfileArcs() {
new GlobalVariable(*M, CounterTy, false,
GlobalValue::InternalLinkage,
Constant::getNullValue(CounterTy),
- "__llvm_gcov_ctr", 0, false, 0);
+ "__llvm_gcov_ctr");
CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP));
UniqueVector<BasicBlock *> ComplexEdgePreds;
@@ -507,15 +509,21 @@ bool GCOVProfiler::emitProfileArcs() {
Value *CounterPtrArray =
Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
i * ComplexEdgePreds.size());
+
+ // Build code to increment the counter.
+ InsertIndCounterIncrCode = true;
Builder.CreateCall2(getIncrementIndirectCounterFunc(),
EdgeState, CounterPtrArray);
- // clear the predecessor number
- Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState);
}
}
}
+
insertCounterWriteout(CountersBySP);
}
+
+ if (InsertIndCounterIncrCode)
+ insertIndirectCounterIncrement();
+
return Result;
}
@@ -574,13 +582,14 @@ Constant *GCOVProfiler::getStartFileFunc() {
}
Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
+ Type *Int32Ty = Type::getInt32Ty(*Ctx);
+ Type *Int64Ty = Type::getInt64Ty(*Ctx);
Type *Args[] = {
- Type::getInt32PtrTy(*Ctx), // uint32_t *predecessor
- Type::getInt64PtrTy(*Ctx)->getPointerTo(), // uint64_t **state_table_row
+ Int32Ty->getPointerTo(), // uint32_t *predecessor
+ Int64Ty->getPointerTo()->getPointerTo() // uint64_t **counters
};
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
- Args, false);
- return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy);
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ return M->getOrInsertFunction("__llvm_gcov_indirect_counter_increment", FTy);
}
Constant *GCOVProfiler::getEmitFunctionFunc() {
@@ -588,8 +597,7 @@ Constant *GCOVProfiler::getEmitFunctionFunc() {
Type::getInt32Ty(*Ctx), // uint32_t ident
Type::getInt8PtrTy(*Ctx), // const char *function_name
};
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
- Args, false);
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
}
@@ -665,5 +673,75 @@ void GCOVProfiler::insertCounterWriteout(
}
Builder.CreateRetVoid();
- InsertProfilingShutdownCall(WriteoutF, M);
+ // Create a small bit of code that registers the "__llvm_gcov_writeout"
+ // function to be executed at exit.
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_init", M);
+ F->setUnnamedAddr(true);
+ F->setLinkage(GlobalValue::InternalLinkage);
+ F->addFnAttr(Attribute::NoInline);
+
+ BB = BasicBlock::Create(*Ctx, "entry", F);
+ Builder.SetInsertPoint(BB);
+
+ FTy = FunctionType::get(Type::getInt32Ty(*Ctx),
+ PointerType::get(FTy, 0), false);
+ Constant *AtExitFn = M->getOrInsertFunction("atexit", FTy);
+ Builder.CreateCall(AtExitFn, WriteoutF);
+ Builder.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
+
+void GCOVProfiler::insertIndirectCounterIncrement() {
+ Function *Fn =
+ cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc());
+ Fn->setUnnamedAddr(true);
+ Fn->setLinkage(GlobalValue::InternalLinkage);
+ Fn->addFnAttr(Attribute::NoInline);
+
+ Type *Int32Ty = Type::getInt32Ty(*Ctx);
+ Type *Int64Ty = Type::getInt64Ty(*Ctx);
+ Constant *NegOne = ConstantInt::get(Int32Ty, 0xffffffff);
+
+ // Create basic blocks for function.
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn);
+ IRBuilder<> Builder(BB);
+
+ BasicBlock *PredNotNegOne = BasicBlock::Create(*Ctx, "", Fn);
+ BasicBlock *CounterEnd = BasicBlock::Create(*Ctx, "", Fn);
+ BasicBlock *Exit = BasicBlock::Create(*Ctx, "exit", Fn);
+
+ // uint32_t pred = *predecessor;
+ // if (pred == 0xffffffff) return;
+ Argument *Arg = Fn->arg_begin();
+ Arg->setName("predecessor");
+ Value *Pred = Builder.CreateLoad(Arg, "pred");
+ Value *Cond = Builder.CreateICmpEQ(Pred, NegOne);
+ BranchInst::Create(Exit, PredNotNegOne, Cond, BB);
+
+ Builder.SetInsertPoint(PredNotNegOne);
+
+ // uint64_t *counter = counters[pred];
+ // if (!counter) return;
+ Value *ZExtPred = Builder.CreateZExt(Pred, Int64Ty);
+ Arg = llvm::next(Fn->arg_begin());
+ Arg->setName("counters");
+ Value *GEP = Builder.CreateGEP(Arg, ZExtPred);
+ Value *Counter = Builder.CreateLoad(GEP, "counter");
+ Cond = Builder.CreateICmpEQ(Counter,
+ Constant::getNullValue(Int64Ty->getPointerTo()));
+ Builder.CreateCondBr(Cond, Exit, CounterEnd);
+
+ // ++*counter;
+ Builder.SetInsertPoint(CounterEnd);
+ Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter),
+ ConstantInt::get(Int64Ty, 1));
+ Builder.CreateStore(Add, Counter);
+ Builder.CreateBr(Exit);
+
+ // Fill in the exit block.
+ Builder.SetInsertPoint(Exit);
+ Builder.CreateRetVoid();
}
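The IR built in insertIndirectCounterIncrement() follows its inline pseudo-code comments block for block; as a cross-check, the generated function behaves like this C++ sketch (illustrative names only):

#include <cstdint>

// C-level equivalent of the generated __llvm_gcov_indirect_counter_increment:
// bail out when no predecessor was recorded or no counter row exists,
// otherwise bump the 64-bit edge counter.
static void indirectCounterIncrement(uint32_t *predecessor,
                                     uint64_t **counters) {
  uint32_t pred = *predecessor;
  if (pred == 0xffffffffu) // no predecessor recorded yet
    return;
  uint64_t *counter = counters[pred];
  if (!counter)            // no counter allocated for this edge
    return;
  ++*counter;
}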
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index c7266e2..1e0b4a3 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -20,11 +20,12 @@ using namespace llvm;
/// initializeInstrumentation - Initialize all passes in the Instrumentation
/// library.
void llvm::initializeInstrumentation(PassRegistry &Registry) {
+ initializeAddressSanitizerPass(Registry);
+ initializeBoundsCheckingPass(Registry);
initializeEdgeProfilerPass(Registry);
+ initializeGCOVProfilerPass(Registry);
initializeOptimalEdgeProfilerPass(Registry);
initializePathProfilerPass(Registry);
- initializeGCOVProfilerPass(Registry);
- initializeAddressSanitizerPass(Registry);
initializeThreadSanitizerPass(Registry);
}
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
index b214796..cc27146 100644
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ b/lib/Transforms/Instrumentation/PathProfiling.cpp
@@ -55,11 +55,11 @@
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
+#include "llvm/TypeBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TypeBuilder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Instrumentation.h"
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 8bb337e..dc0fa71 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -22,73 +22,73 @@
#define DEBUG_TYPE "tsan"
#include "FunctionBlackList.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Intrinsics.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/Metadata.h"
#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Type.h"
using namespace llvm;
static cl::opt<std::string> ClBlackListFile("tsan-blacklist",
cl::desc("Blacklist file"), cl::Hidden);
-static cl::opt<bool> ClPrintStats("tsan-print-stats",
- cl::desc("Print ThreadSanitizer instrumentation stats"), cl::Hidden);
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOmittedReadsBeforeWrite,
+ "Number of reads ignored due to following writes");
+STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
+STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumOmittedReadsFromConstantGlobals,
+ "Number of reads from constant globals");
+STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
namespace {
-// Stats counters for ThreadSanitizer instrumentation.
-struct ThreadSanitizerStats {
- size_t NumInstrumentedReads;
- size_t NumInstrumentedWrites;
- size_t NumOmittedReadsBeforeWrite;
- size_t NumAccessesWithBadSize;
- size_t NumInstrumentedVtableWrites;
- size_t NumOmittedReadsFromConstantGlobals;
- size_t NumOmittedReadsFromVtable;
-};
-
/// ThreadSanitizer: instrument the code in module to find races.
struct ThreadSanitizer : public FunctionPass {
ThreadSanitizer();
+ const char *getPassName() const;
bool runOnFunction(Function &F);
bool doInitialization(Module &M);
- bool doFinalization(Module &M);
- bool instrumentLoadOrStore(Instruction *I);
static char ID; // Pass identification, replacement for typeid.
private:
- void choseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local,
- SmallVectorImpl<Instruction*> &All);
+ bool instrumentLoadOrStore(Instruction *I);
+ bool instrumentAtomic(Instruction *I);
+ void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local,
+ SmallVectorImpl<Instruction*> &All);
bool addrPointsToConstantData(Value *Addr);
+ int getMemoryAccessFuncIndex(Value *Addr);
TargetData *TD;
OwningPtr<FunctionBlackList> BL;
+ IntegerType *OrdTy;
// Callbacks to run-time library are computed in doInitialization.
- Value *TsanFuncEntry;
- Value *TsanFuncExit;
+ Function *TsanFuncEntry;
+ Function *TsanFuncExit;
  // Access sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
- Value *TsanRead[kNumberOfAccessSizes];
- Value *TsanWrite[kNumberOfAccessSizes];
- Value *TsanVptrUpdate;
-
- // Stats are modified w/o synchronization.
- ThreadSanitizerStats stats;
+ Function *TsanRead[kNumberOfAccessSizes];
+ Function *TsanWrite[kNumberOfAccessSizes];
+ Function *TsanAtomicLoad[kNumberOfAccessSizes];
+ Function *TsanAtomicStore[kNumberOfAccessSizes];
+ Function *TsanVptrUpdate;
};
} // namespace
@@ -97,6 +97,10 @@ INITIALIZE_PASS(ThreadSanitizer, "tsan",
"ThreadSanitizer: detects data races.",
false, false)
+const char *ThreadSanitizer::getPassName() const {
+ return "ThreadSanitizer";
+}
+
ThreadSanitizer::ThreadSanitizer()
: FunctionPass(ID),
TD(NULL) {
@@ -106,12 +110,18 @@ FunctionPass *llvm::createThreadSanitizerPass() {
return new ThreadSanitizer();
}
+static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
+ if (Function *F = dyn_cast<Function>(FuncOrBitcast))
+ return F;
+ FuncOrBitcast->dump();
+ report_fatal_error("ThreadSanitizer interface function redefined");
+}
+
bool ThreadSanitizer::doInitialization(Module &M) {
TD = getAnalysisIfAvailable<TargetData>();
if (!TD)
return false;
BL.reset(new FunctionBlackList(ClBlackListFile));
- memset(&stats, 0, sizeof(stats));
// Always insert a call to __tsan_init into the module's CTORs.
IRBuilder<> IRB(M.getContext());
@@ -120,38 +130,38 @@ bool ThreadSanitizer::doInitialization(Module &M) {
appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
// Initialize the callbacks.
- TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), NULL);
- TsanFuncExit = M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(),
- NULL);
+ TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+ TsanFuncExit = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_func_exit", IRB.getVoidTy(), NULL));
+ OrdTy = IRB.getInt32Ty();
for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
- SmallString<32> ReadName("__tsan_read");
- ReadName += itostr(1 << i);
- TsanRead[i] = M.getOrInsertFunction(ReadName, IRB.getVoidTy(),
- IRB.getInt8PtrTy(), NULL);
- SmallString<32> WriteName("__tsan_write");
- WriteName += itostr(1 << i);
- TsanWrite[i] = M.getOrInsertFunction(WriteName, IRB.getVoidTy(),
- IRB.getInt8PtrTy(), NULL);
- }
- TsanVptrUpdate = M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- NULL);
- return true;
-}
+ const size_t ByteSize = 1 << i;
+ const size_t BitSize = ByteSize * 8;
+ SmallString<32> ReadName("__tsan_read" + itostr(ByteSize));
+ TsanRead[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
-bool ThreadSanitizer::doFinalization(Module &M) {
- if (ClPrintStats) {
- errs() << "ThreadSanitizerStats " << M.getModuleIdentifier()
- << ": wr " << stats.NumInstrumentedWrites
- << "; rd " << stats.NumInstrumentedReads
- << "; vt " << stats.NumInstrumentedVtableWrites
- << "; bs " << stats.NumAccessesWithBadSize
- << "; rbw " << stats.NumOmittedReadsBeforeWrite
- << "; rcg " << stats.NumOmittedReadsFromConstantGlobals
- << "; rvt " << stats.NumOmittedReadsFromVtable
- << "\n";
+ SmallString<32> WriteName("__tsan_write" + itostr(ByteSize));
+ TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+
+ Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) +
+ "_load");
+ TsanAtomicLoad[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ AtomicLoadName, Ty, PtrTy, OrdTy, NULL));
+
+ SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) +
+ "_store");
+ TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy,
+ NULL));
}
+ TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), NULL));
return true;
}
@@ -173,13 +183,13 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
if (GV->isConstant()) {
      // Reads from constant globals cannot race with any writes.
- stats.NumOmittedReadsFromConstantGlobals++;
+ NumOmittedReadsFromConstantGlobals++;
return true;
}
} else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) {
if (isVtableAccess(L)) {
      // Reads from a vtable pointer cannot race with any writes.
- stats.NumOmittedReadsFromVtable++;
+ NumOmittedReadsFromVtable++;
return true;
}
}
@@ -197,7 +207,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
//
// 'Local' is a vector of insns within the same BB (no calls between).
// 'All' is a vector of insns that will be instrumented.
-void ThreadSanitizer::choseInstructionsToInstrument(
+void ThreadSanitizer::chooseInstructionsToInstrument(
SmallVectorImpl<Instruction*> &Local,
SmallVectorImpl<Instruction*> &All) {
SmallSet<Value*, 8> WriteTargets;
@@ -212,7 +222,7 @@ void ThreadSanitizer::choseInstructionsToInstrument(
Value *Addr = Load->getPointerOperand();
if (WriteTargets.count(Addr)) {
// We will write to this temp, so no reason to analyze the read.
- stats.NumOmittedReadsBeforeWrite++;
+ NumOmittedReadsBeforeWrite++;
continue;
}
if (addrPointsToConstantData(Addr)) {
@@ -225,12 +235,27 @@ void ThreadSanitizer::choseInstructionsToInstrument(
Local.clear();
}
+static bool isAtomic(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isAtomic() && LI->getSynchScope() == CrossThread;
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isAtomic() && SI->getSynchScope() == CrossThread;
+ if (isa<AtomicRMWInst>(I))
+ return true;
+ if (isa<AtomicCmpXchgInst>(I))
+ return true;
+ if (FenceInst *FI = dyn_cast<FenceInst>(I))
+ return FI->getSynchScope() == CrossThread;
+ return false;
+}
+
bool ThreadSanitizer::runOnFunction(Function &F) {
if (!TD) return false;
if (BL->isIn(F)) return false;
SmallVector<Instruction*, 8> RetVec;
SmallVector<Instruction*, 8> AllLoadsAndStores;
SmallVector<Instruction*, 8> LocalLoadsAndStores;
+ SmallVector<Instruction*, 8> AtomicAccesses;
bool Res = false;
bool HasCalls = false;
@@ -240,16 +265,18 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
BasicBlock &BB = *FI;
for (BasicBlock::iterator BI = BB.begin(), BE = BB.end();
BI != BE; ++BI) {
- if (isa<LoadInst>(BI) || isa<StoreInst>(BI))
+ if (isAtomic(BI))
+ AtomicAccesses.push_back(BI);
+ else if (isa<LoadInst>(BI) || isa<StoreInst>(BI))
LocalLoadsAndStores.push_back(BI);
else if (isa<ReturnInst>(BI))
RetVec.push_back(BI);
else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) {
HasCalls = true;
- choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
}
}
- choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
}
// We have collected all loads and stores.
@@ -261,6 +288,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
}
+ // Instrument atomic memory accesses.
+ for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) {
+ Res |= instrumentAtomic(AtomicAccesses[i]);
+ }
+
// Instrument function entry/exit points if there were instrumented accesses.
if (Res || HasCalls) {
IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
@@ -283,29 +315,98 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
Value *Addr = IsWrite
? cast<StoreInst>(I)->getPointerOperand()
: cast<LoadInst>(I)->getPointerOperand();
- Type *OrigPtrTy = Addr->getType();
- Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
- assert(OrigTy->isSized());
- uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy);
- if (TypeSize != 8 && TypeSize != 16 &&
- TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
- stats.NumAccessesWithBadSize++;
- // Ignore all unusual sizes.
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
return false;
- }
if (IsWrite && isVtableAccess(I)) {
+ DEBUG(dbgs() << " VPTR : " << *I << "\n");
Value *StoredValue = cast<StoreInst>(I)->getValueOperand();
+    // StoredValue does not necessarily have a pointer type.
+ if (isa<IntegerType>(StoredValue->getType()))
+ StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
+ // Call TsanVptrUpdate.
IRB.CreateCall2(TsanVptrUpdate,
IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy()));
- stats.NumInstrumentedVtableWrites++;
+ NumInstrumentedVtableWrites++;
return true;
}
- size_t Idx = CountTrailingZeros_32(TypeSize / 8);
- assert(Idx < kNumberOfAccessSizes);
Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
- if (IsWrite) stats.NumInstrumentedWrites++;
- else stats.NumInstrumentedReads++;
+ if (IsWrite) NumInstrumentedWrites++;
+ else NumInstrumentedReads++;
+ return true;
+}
+
+static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+ uint32_t v = 0;
+ switch (ord) {
+ case NotAtomic: assert(false);
+ case Unordered: // Fall-through.
+ case Monotonic: v = 1 << 0; break;
+ // case Consume: v = 1 << 1; break; // Not specified yet.
+ case Acquire: v = 1 << 2; break;
+ case Release: v = 1 << 3; break;
+ case AcquireRelease: v = 1 << 4; break;
+ case SequentiallyConsistent: v = 1 << 5; break;
+ }
+ return IRB->getInt32(v);
+}
+
+bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
+ IRBuilder<> IRB(I);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Value *Addr = LI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ createOrdering(&IRB, LI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicLoad[Idx],
+ ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Addr = SI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(SI->getValueOperand(), Ty, false),
+ createOrdering(&IRB, SI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicStore[Idx],
+ ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+ } else if (isa<AtomicRMWInst>(I)) {
+ // FIXME: Not yet supported.
+ } else if (isa<AtomicCmpXchgInst>(I)) {
+ // FIXME: Not yet supported.
+ } else if (isa<FenceInst>(I)) {
+ // FIXME: Not yet supported.
+ }
return true;
}
+
+int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) {
+ Type *OrigPtrTy = Addr->getType();
+ Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+ assert(OrigTy->isSized());
+ uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy);
+ if (TypeSize != 8 && TypeSize != 16 &&
+ TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
+ NumAccessesWithBadSize++;
+ // Ignore all unusual sizes.
+ return -1;
+ }
+ size_t Idx = CountTrailingZeros_32(TypeSize / 8);
+ assert(Idx < kNumberOfAccessSizes);
+ return Idx;
+}
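Two small mappings carry the new atomic support: the size-to-callback index from getMemoryAccessFuncIndex(), and the one-hot ordering constant from createOrdering(). A standalone sketch of both, using a compiler builtin in place of CountTrailingZeros_32 (illustrative, not the pass's code):

#include <cassert>
#include <cstdint>

// Access sizes of 1, 2, 4, 8, 16 bytes map to callback indices 0..4;
// anything else is rejected, mirroring getMemoryAccessFuncIndex().
static int accessSizeIndex(uint32_t TypeSizeInBits) {
  if (TypeSizeInBits != 8 && TypeSizeInBits != 16 && TypeSizeInBits != 32 &&
      TypeSizeInBits != 64 && TypeSizeInBits != 128)
    return -1;
  int Idx = __builtin_ctz(TypeSizeInBits / 8); // log2 of the byte size
  assert(Idx < 5 && "index out of range");
  return Idx;
}

// One-hot encoding handed to the __tsan_atomic* callbacks, as in
// createOrdering(); note the gap at 1<<1 reserved for Consume.
enum Ordering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };
static uint32_t encodeOrdering(Ordering O) {
  switch (O) {
  case Monotonic:      return 1u << 0;
  case Acquire:        return 1u << 2;
  case Release:        return 1u << 3;
  case AcquireRelease: return 1u << 4;
  case SeqCst:         return 1u << 5;
  }
  return 0;
}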
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index ba214d1..b344952 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -9,7 +9,7 @@
//
// This file implements the Aggressive Dead Code Elimination pass. This pass
// optimistically assumes that all instructions are dead until proven otherwise,
-// allowing it to eliminate dead computations that other DCE passes do not
+// allowing it to eliminate dead computations that other DCE passes do not
// catch, particularly involving loop computations.
//
//===----------------------------------------------------------------------===//
@@ -36,13 +36,13 @@ namespace {
ADCE() : FunctionPass(ID) {
initializeADCEPass(*PassRegistry::getPassRegistry());
}
-
+
virtual bool runOnFunction(Function& F);
-
+
virtual void getAnalysisUsage(AnalysisUsage& AU) const {
AU.setPreservesCFG();
}
-
+
};
}
@@ -52,7 +52,7 @@ INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false)
bool ADCE::runOnFunction(Function& F) {
SmallPtrSet<Instruction*, 128> alive;
SmallVector<Instruction*, 128> worklist;
-
+
// Collect the set of "root" instructions that are known live.
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
if (isa<TerminatorInst>(I.getInstructionIterator()) ||
@@ -62,7 +62,7 @@ bool ADCE::runOnFunction(Function& F) {
alive.insert(I.getInstructionIterator());
worklist.push_back(I.getInstructionIterator());
}
-
+
// Propagate liveness backwards to operands.
while (!worklist.empty()) {
Instruction* curr = worklist.pop_back_val();
@@ -72,7 +72,7 @@ bool ADCE::runOnFunction(Function& F) {
if (alive.insert(Inst))
worklist.push_back(Inst);
}
-
+
// The inverse of the live set is the dead set. These are those instructions
// which have no side effects and do not influence the control flow or return
// value of the function, and may therefore be deleted safely.
@@ -82,7 +82,7 @@ bool ADCE::runOnFunction(Function& F) {
worklist.push_back(I.getInstructionIterator());
I->dropAllReferences();
}
-
+
for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(),
E = worklist.end(); I != E; ++I) {
++NumRemoved;
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index d660c72..a01e066 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -32,3 +32,5 @@ add_llvm_library(LLVMScalarOpts
Sink.cpp
TailRecursionElimination.cpp
)
+
+add_dependencies(LLVMScalarOpts intrinsics_gen)
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 9a5423f..bc87106 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -18,32 +18,32 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
#include "llvm/InlineAsm.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Transforms/Utils/AddrModeMatcher.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -60,6 +60,7 @@ STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
+STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
static cl::opt<bool> DisableBranchOpts(
"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
@@ -70,6 +71,10 @@ static cl::opt<bool> DisableDeleteDeadBlocks(
"disable-cgp-delete-dead-blocks", cl::Hidden, cl::init(false),
cl::desc("Disable deleting dead blocks in CodeGenPrepare"));
+static cl::opt<bool> DisableSelectToBranch(
+ "disable-cgp-select2branch", cl::Hidden, cl::init(false),
+ cl::desc("Disable select to branch conversion."));
+
namespace {
class CodeGenPrepare : public FunctionPass {
/// TLI - Keep a pointer of a TargetLowering to consult for determining
@@ -78,7 +83,7 @@ namespace {
const TargetLibraryInfo *TLInfo;
DominatorTree *DT;
ProfileInfo *PFI;
-
+
/// CurInstIterator - As we scan instructions optimizing them, this is the
/// next instruction to optimize. Xforms that can invalidate this should
/// update it.
@@ -93,6 +98,9 @@ namespace {
/// be updated.
bool ModifiedDT;
+ /// OptSize - True if optimizing for size.
+ bool OptSize;
+
public:
static char ID; // Pass identification, replacement for typeid
explicit CodeGenPrepare(const TargetLowering *tli = 0)
@@ -108,6 +116,7 @@ namespace {
}
private:
+ bool EliminateFallThrough(Function &F);
bool EliminateMostlyEmptyBlocks(Function &F);
bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
void EliminateMostlyEmptyBlock(BasicBlock *BB);
@@ -118,6 +127,7 @@ namespace {
bool OptimizeCallInst(CallInst *CI);
bool MoveExtToFormExtLoad(Instruction *I);
bool OptimizeExtUses(Instruction *I);
+ bool OptimizeSelectInst(SelectInst *SI);
bool DupRetToEnableTailCallOpts(ReturnInst *RI);
bool PlaceDbgValues(Function &F);
};
@@ -141,13 +151,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
TLInfo = &getAnalysis<TargetLibraryInfo>();
DT = getAnalysisIfAvailable<DominatorTree>();
PFI = getAnalysisIfAvailable<ProfileInfo>();
+ OptSize = F.hasFnAttr(Attribute::OptimizeForSize);
// First pass, eliminate blocks that contain only PHI nodes and an
// unconditional branch.
EverMadeChange |= EliminateMostlyEmptyBlocks(F);
  // If llvm.dbg.value is far away from the value then iSel may not be able
-  // handle it properly. iSel will drop llvm.dbg.value if it can not 
+  // to handle it properly; iSel will drop llvm.dbg.value if it cannot
// find a node corresponding to the value.
EverMadeChange |= PlaceDbgValues(F);
@@ -182,6 +193,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
I = WorkList.begin(), E = WorkList.end(); I != E; ++I)
DeleteDeadBlock(*I);
+ // Merge pairs of basic blocks with unconditional branches, connected by
+ // a single edge.
+ if (EverMadeChange || MadeChange)
+ MadeChange |= EliminateFallThrough(F);
+
if (MadeChange)
ModifiedDT = true;
EverMadeChange |= MadeChange;
@@ -193,6 +209,39 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
return EverMadeChange;
}
+/// EliminateFallThrough - Merge basic blocks which are connected
+/// by a single edge, where one of the basic blocks has a single successor
+/// pointing to the other basic block, which has a single predecessor.
+bool CodeGenPrepare::EliminateFallThrough(Function &F) {
+ bool Changed = false;
+ // Scan all of the blocks in the function, except for the entry block.
+ for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
+ BasicBlock *BB = I++;
+ // If the destination block has a single pred, then this is a trivial
+ // edge, just collapse it.
+ BasicBlock *SinglePred = BB->getSinglePredecessor();
+
+ if (!SinglePred || SinglePred == BB) continue;
+
+ BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
+ if (Term && !Term->isConditional()) {
+ Changed = true;
+ // Remember if SinglePred was the entry block of the function.
+ // If so, we will need to move BB back to the entry position.
+ bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+ MergeBasicBlockIntoOnlyPred(BB, this);
+
+ if (isEntry && BB != &BB->getParent()->getEntryBlock())
+ BB->moveBefore(&BB->getParent()->getEntryBlock());
+
+ // We have erased a block. Update the iterator.
+ I = BB;
+ DEBUG(dbgs() << "Merged:\n"<< *SinglePred << "\n\n\n");
+ }
+ }
+ return Changed;
+}
+
/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes,
/// debug info directives, and an unconditional branch. Passes before isel
/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for
@@ -326,7 +375,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
if (isEntry && BB != &BB->getParent()->getEntryBlock())
BB->moveBefore(&BB->getParent()->getEntryBlock());
-
+
DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
return;
}
@@ -537,7 +586,7 @@ protected:
bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
BasicBlock *BB = CI->getParent();
-
+
// Lower inline assembly if we can.
  // If we find an inline asm expression, and if the target knows how to
// lower it to normal LLVM code, do so now.
@@ -554,19 +603,19 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
if (OptimizeInlineAsmInst(CI))
return true;
}
-
+
// Lower all uses of llvm.objectsize.*
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
Type *ReturnTy = CI->getType();
- Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
+ Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
// Substituting this can cause recursive simplifications, which can
// invalidate our iterator. Use a WeakVH to hold onto it in case this
// happens.
WeakVH IterHandle(CurInstIterator);
-
+
replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0,
TLInfo, ModifiedDT ? 0 : DT);
@@ -594,13 +643,13 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
// We'll need TargetData from here on out.
const TargetData *TD = TLI ? TLI->getTargetData() : 0;
if (!TD) return false;
-
+
// Lower all default uses of _chk calls. This is very similar
// to what InstCombineCalls does, but here we are only lowering calls
// that have the default "don't know" as the objectsize. Anything else
// should be left alone.
CodeGenPrepareFortifiedLibCalls Simplifier;
- return Simplifier.fold(CI, TD);
+ return Simplifier.fold(CI, TD, TLInfo);
}
/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
@@ -635,10 +684,18 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
if (!TLI)
return false;
+ PHINode *PN = 0;
+ BitCastInst *BCI = 0;
Value *V = RI->getReturnValue();
- PHINode *PN = V ? dyn_cast<PHINode>(V) : NULL;
- if (V && !PN)
- return false;
+ if (V) {
+ BCI = dyn_cast<BitCastInst>(V);
+ if (BCI)
+ V = BCI->getOperand(0);
+
+ PN = dyn_cast<PHINode>(V);
+ if (!PN)
+ return false;
+ }
BasicBlock *BB = RI->getParent();
if (PN && PN->getParent() != BB)
@@ -656,6 +713,9 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
if (PN) {
BasicBlock::iterator BI = BB->begin();
do { ++BI; } while (isa<DbgInfoIntrinsic>(BI));
+ if (&*BI == BCI)
+ // Also skip over the bitcast.
+ ++BI;
if (&*BI != RI)
return false;
} else {
@@ -750,13 +810,13 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *AccessTy) {
Value *Repl = Addr;
-
- // Try to collapse single-value PHI nodes. This is necessary to undo
+
+ // Try to collapse single-value PHI nodes. This is necessary to undo
// unprofitable PRE transformations.
SmallVector<Value*, 8> worklist;
SmallPtrSet<Value*, 16> Visited;
worklist.push_back(Addr);
-
+
// Use a worklist to iteratively look through PHI nodes, and ensure that
// the addressing mode obtained from the non-PHI roots of the graph
// are equivalent.
@@ -768,20 +828,20 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
while (!worklist.empty()) {
Value *V = worklist.back();
worklist.pop_back();
-
+
// Break use-def graph loops.
if (!Visited.insert(V)) {
Consensus = 0;
break;
}
-
+
// For a PHI node, push all of its incoming values.
if (PHINode *P = dyn_cast<PHINode>(V)) {
for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i)
worklist.push_back(P->getIncomingValue(i));
continue;
}
-
+
// For non-PHIs, determine the addressing mode being computed.
SmallVector<Instruction*, 16> NewAddrModeInsts;
ExtAddrMode NewAddrMode =
@@ -816,15 +876,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
}
continue;
}
-
+
Consensus = 0;
break;
}
-
+
// If the addressing mode couldn't be determined, or if multiple different
// ones were determined, bail out now.
if (!Consensus) return false;
-
+
  // Check to see if any of the instructions subsumed by this addr mode are
// non-local to I's BB.
bool AnyNonLocal = false;
@@ -933,7 +993,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// Use a WeakVH to hold onto it in case this happens.
WeakVH IterHandle(CurInstIterator);
BasicBlock *BB = CurInstIterator->getParent();
-
+
RecursivelyDeleteTriviallyDeadInstructions(Repl);
if (IterHandle != CurInstIterator) {
@@ -945,7 +1005,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// This address is now available for reassignment, so erase the table
// entry; we don't want to match some completely different instruction.
SunkAddrs[Addr] = 0;
- }
+ }
}
++NumMemoryInsts;
return true;
@@ -957,12 +1017,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
bool MadeChange = false;
- TargetLowering::AsmOperandInfoVector
+ TargetLowering::AsmOperandInfoVector
TargetConstraints = TLI->ParseConstraints(CS);
unsigned ArgNo = 0;
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
-
+
// Compute the constraint code and ConstraintType to use.
TLI->ComputeConstraintToUse(OpInfo, SDValue());
@@ -1091,6 +1151,79 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
return MadeChange;
}
+/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be
+/// turned into an explicit branch.
+static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
+ // FIXME: This should use the same heuristics as IfConversion to determine
+ // whether a select is better represented as a branch. This requires that
+ // branch probability metadata is preserved for the select, which is not the
+ // case currently.
+
+ CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+
+  // If the branch is predicted right, an out-of-order CPU can avoid blocking on
+ // the compare. Emit cmovs on compares with a memory operand as branches to
+ // avoid stalls on the load from memory. If the compare has more than one use
+ // there's probably another cmov or setcc around so it's not worth emitting a
+ // branch.
+ if (!Cmp)
+ return false;
+
+ Value *CmpOp0 = Cmp->getOperand(0);
+ Value *CmpOp1 = Cmp->getOperand(1);
+
+ // We check that the memory operand has one use to avoid uses of the loaded
+ // value directly after the compare, making branches unprofitable.
+ return Cmp->hasOneUse() &&
+ ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
+ (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()));
+}
+
+
+bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
+ // If we have a SelectInst that will likely profit from branch prediction,
+ // turn it into a branch.
+ if (DisableSelectToBranch || OptSize || !TLI ||
+ !TLI->isPredictableSelectExpensive())
+ return false;
+
+ if (!SI->getCondition()->getType()->isIntegerTy(1) ||
+ !isFormingBranchFromSelectProfitable(SI))
+ return false;
+
+ ModifiedDT = true;
+
+ // First, we split the block containing the select into 2 blocks.
+ BasicBlock *StartBlock = SI->getParent();
+ BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI));
+ BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+
+ // Create a new block serving as the landing pad for the branch.
+ BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid",
+ NextBlock->getParent(), NextBlock);
+
+ // Move the unconditional branch from the block with the select in it into our
+ // landing pad block.
+ StartBlock->getTerminator()->eraseFromParent();
+ BranchInst::Create(NextBlock, SmallBlock);
+
+ // Insert the real conditional branch based on the original condition.
+ BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI);
+
+ // The select itself is replaced with a PHI Node.
+ PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin());
+ PN->takeName(SI);
+ PN->addIncoming(SI->getTrueValue(), StartBlock);
+ PN->addIncoming(SI->getFalseValue(), SmallBlock);
+ SI->replaceAllUsesWith(PN);
+ SI->eraseFromParent();
+
+ // Instruct OptimizeBlock to skip to the next block.
+ CurInstIterator = StartBlock->end();
+ ++NumSelectsExpanded;
+ return true;
+}
+
bool CodeGenPrepare::OptimizeInst(Instruction *I) {
if (PHINode *P = dyn_cast<PHINode>(I)) {
// It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -1104,7 +1237,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
}
return false;
}
-
+
if (CastInst *CI = dyn_cast<CastInst>(I)) {
// If the source of the cast is a constant, then this should have
// already been constant folded. The only reason NOT to constant fold
@@ -1124,23 +1257,23 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
}
return false;
}
-
+
if (CmpInst *CI = dyn_cast<CmpInst>(I))
return OptimizeCmpExpression(CI);
-
+
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (TLI)
return OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
return false;
}
-
+
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
if (TLI)
return OptimizeMemoryInst(I, SI->getOperand(1),
SI->getOperand(0)->getType());
return false;
}
-
+
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
if (GEPI->hasAllZeroIndices()) {
/// The GEP operand must be a pointer, so must its result -> BitCast
@@ -1154,13 +1287,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
}
return false;
}
-
+
if (CallInst *CI = dyn_cast<CallInst>(I))
return OptimizeCallInst(CI);
if (ReturnInst *RI = dyn_cast<ReturnInst>(I))
return DupRetToEnableTailCallOpts(RI);
+ if (SelectInst *SI = dyn_cast<SelectInst>(I))
+ return OptimizeSelectInst(SI);
+
return false;
}
@@ -1179,7 +1315,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
}
// If llvm.dbg.value is far away from the value then iSel may not be able
-// handle it properly. iSel will drop llvm.dbg.value if it can not 
+// to handle it properly; iSel will drop llvm.dbg.value if it cannot
// find a node corresponding to the value.
bool CodeGenPrepare::PlaceDbgValues(Function &F) {
bool MadeChange = false;
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index c8c5360..8b1283f 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -32,7 +32,7 @@
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;
@@ -71,7 +71,7 @@ namespace {
bool HandleFree(CallInst *F);
bool handleEndBlock(BasicBlock &BB);
void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
- SmallPtrSet<Value*, 16> &DeadStackObjects);
+ SmallSetVector<Value*, 16> &DeadStackObjects);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
@@ -106,7 +106,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
///
static void DeleteDeadInstruction(Instruction *I,
MemoryDependenceAnalysis &MD,
- SmallPtrSet<Value*, 16> *ValueSet = 0) {
+ SmallSetVector<Value*, 16> *ValueSet = 0) {
SmallVector<Instruction*, 32> NowDeadInsts;
NowDeadInsts.push_back(I);
@@ -136,7 +136,7 @@ static void DeleteDeadInstruction(Instruction *I,
DeadInst->eraseFromParent();
- if (ValueSet) ValueSet->erase(DeadInst);
+ if (ValueSet) ValueSet->remove(DeadInst);
} while (!NowDeadInsts.empty());
}
@@ -248,7 +248,7 @@ static bool isShortenable(Instruction *I) {
// Don't shorten stores for now
if (isa<StoreInst>(I))
return false;
-
+
IntrinsicInst *II = cast<IntrinsicInst>(I);
switch (II->getIntrinsicID()) {
default: return false;
@@ -275,33 +275,9 @@ static Value *getStoredPointerOperand(Instruction *I) {
}
static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) {
- const TargetData *TD = AA.getTargetData();
-
- if (const CallInst *CI = extractMallocCall(V)) {
- if (const ConstantInt *C = dyn_cast<ConstantInt>(CI->getArgOperand(0)))
- return C->getZExtValue();
- }
-
- if (TD == 0)
- return AliasAnalysis::UnknownSize;
-
- if (const AllocaInst *A = dyn_cast<AllocaInst>(V)) {
- // Get size information for the alloca
- if (const ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize()))
- return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType());
- }
-
- if (const Argument *A = dyn_cast<Argument>(V)) {
- if (A->hasByValAttr())
- if (PointerType *PT = dyn_cast<PointerType>(A->getType()))
- return TD->getTypeAllocSize(PT->getElementType());
- }
-
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
- if (!GV->mayBeOverridden())
- return TD->getTypeAllocSize(GV->getType()->getElementType());
- }
-
+ uint64_t Size;
+ if (getObjectSize(V, Size, AA.getTargetData()))
+ return Size;
return AliasAnalysis::UnknownSize;
}
@@ -316,7 +292,7 @@ namespace {
/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location
/// completely overwrites a store to the 'Earlier' location.
-/// 'OverwriteEnd' if the end of the 'Earlier' location is completely
+/// 'OverwriteEnd' if the end of the 'Earlier' location is completely
/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined
static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
const AliasAnalysis::Location &Earlier,
@@ -339,7 +315,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
if (AA.getTargetData() == 0 &&
Later.Ptr->getType() == Earlier.Ptr->getType())
return OverwriteComplete;
-
+
return OverwriteUnknown;
}
@@ -402,10 +378,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
//
// We have to be careful here as *Off is signed while *.Size is unsigned.
if (EarlierOff >= LaterOff &&
- Later.Size > Earlier.Size &&
+ Later.Size >= Earlier.Size &&
uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
return OverwriteComplete;
-
+
// The other interesting case is if the later store overwrites the end of
// the earlier store
//
@@ -544,11 +520,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
// If we find a write that is a) removable (i.e., non-volatile), b) is
// completely obliterated by the store to 'Loc', and c) which we know that
// 'Inst' doesn't load from, then we can remove it.
- if (isRemovable(DepWrite) &&
+ if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) {
- int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA,
- DepWriteOffset, InstWriteOffset);
+ int64_t InstWriteOffset, DepWriteOffset;
+ OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA,
+ DepWriteOffset, InstWriteOffset);
if (OR == OverwriteComplete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
@@ -557,7 +533,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
DeleteDeadInstruction(DepWrite, *MD);
++NumFastStores;
MadeChange = true;
-
+
// DeleteDeadInstruction can delete the current instruction in loop
// cases, reset BBI.
BBI = Inst;
@@ -575,16 +551,16 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
unsigned DepWriteAlign = DepIntrinsic->getAlignment();
if (llvm::isPowerOf2_64(InstWriteOffset) ||
((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
-
+
DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: "
- << *DepWrite << "\n KILLER (offset "
- << InstWriteOffset << ", "
+ << *DepWrite << "\n KILLER (offset "
+ << InstWriteOffset << ", "
<< DepLoc.Size << ")"
<< *Inst << '\n');
-
+
Value* DepWriteLength = DepIntrinsic->getLength();
Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
- InstWriteOffset -
+ InstWriteOffset -
DepWriteOffset);
DepIntrinsic->setLength(TrimmedLength);
MadeChange = true;
@@ -694,19 +670,18 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Keep track of all of the stack objects that are dead at the end of the
// function.
- SmallPtrSet<Value*, 16> DeadStackObjects;
+ SmallSetVector<Value*, 16> DeadStackObjects;
// Find all of the alloca'd pointers in the entry block.
BasicBlock *Entry = BB.getParent()->begin();
for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
- DeadStackObjects.insert(AI);
+ if (isa<AllocaInst>(I))
+ DeadStackObjects.insert(I);
// Okay, so these are dead heap objects, but if the pointer never escapes
// then it's leaked by this function anyways.
- if (CallInst *CI = extractMallocCall(I))
- if (!PointerMayBeCaptured(CI, true, true))
- DeadStackObjects.insert(CI);
+ else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true))
+ DeadStackObjects.insert(I);
}
// Treat byval arguments the same, stores to them are dead at the end of the
@@ -723,14 +698,30 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// If we find a store, check to see if it points into a dead stack value.
if (hasMemoryWrite(BBI) && isRemovable(BBI)) {
// See through pointer-to-pointer bitcasts
- Value *Pointer = GetUnderlyingObject(getStoredPointerOperand(BBI));
+ SmallVector<Value *, 4> Pointers;
+ GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers);
// Stores to stack values are valid candidates for removal.
- if (DeadStackObjects.count(Pointer)) {
+ bool AllDead = true;
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end(); I != E; ++I)
+ if (!DeadStackObjects.count(*I)) {
+ AllDead = false;
+ break;
+ }
+
+ if (AllDead) {
Instruction *Dead = BBI++;
DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
- << *Dead << "\n Object: " << *Pointer << '\n');
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end(); I != E; ++I) {
+ dbgs() << **I;
+ if (llvm::next(I) != E)
+ dbgs() << ", ";
+ }
+ dbgs() << '\n');
// DCE instructions only used to calculate that store.
DeleteDeadInstruction(Dead, *MD, &DeadStackObjects);
@@ -749,17 +740,19 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
continue;
}
- if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) {
- DeadStackObjects.erase(A);
- continue;
- }
-
- if (CallInst *CI = extractMallocCall(BBI)) {
- DeadStackObjects.erase(CI);
+ if (isa<AllocaInst>(BBI)) {
+ // Remove allocas from the list of dead stack objects; there can't be
+ // any references before the definition.
+ DeadStackObjects.remove(BBI);
continue;
}
if (CallSite CS = cast<Value>(BBI)) {
+ // Remove allocation function calls from the list of dead stack objects;
+ // there can't be any references before the definition.
+ if (isAllocLikeFn(BBI))
+ DeadStackObjects.remove(BBI);
+
// If this call does not access memory, it can't be loading any of our
// pointers.
if (AA->doesNotAccessMemory(CS))
@@ -768,7 +761,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// If the call might load from any of our allocas, then any store above
// the call is live.
SmallVector<Value*, 8> LiveAllocas;
- for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(),
+ for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(),
E = DeadStackObjects.end(); I != E; ++I) {
// See if the call site touches it.
AliasAnalysis::ModRefResult A =
@@ -780,12 +773,12 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(),
E = LiveAllocas.end(); I != E; ++I)
- DeadStackObjects.erase(*I);
+ DeadStackObjects.remove(*I);
// If all of the allocas were clobbered by the call then we're not going
// to find anything else to process.
if (DeadStackObjects.empty())
- return MadeChange;
+ break;
continue;
}
@@ -827,7 +820,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
/// of the stack objects in the DeadStackObjects set. If so, they become live
/// because the location is being loaded.
void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
- SmallPtrSet<Value*, 16> &DeadStackObjects) {
+ SmallSetVector<Value*, 16> &DeadStackObjects) {
const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr);
// A constant can't be in the dead pointer set.
@@ -837,12 +830,12 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
// If the kill pointer can be easily reduced to an alloca, don't bother doing
// extraneous AA queries.
if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
- DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer));
+ DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
return;
}
SmallVector<Value*, 16> NowLive;
- for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(),
+ for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(),
E = DeadStackObjects.end(); I != E; ++I) {
// See if the loaded location could alias the stack location.
AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA));
@@ -852,5 +845,5 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end();
I != E; ++I)
- DeadStackObjects.erase(*I);
+ DeadStackObjects.remove(*I);
}
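
The recurring SmallPtrSet-to-SmallSetVector substitution in this file is about determinism: iteration order over a pointer set depends on pointer values, so any transformation driven by that order can vary from run to run. A set-vector deduplicates like a set but iterates in insertion order. A stripped-down sketch of the idea — this is not LLVM's SmallSetVector, just its shape:

#include <algorithm>
#include <unordered_set>
#include <vector>

template <typename T> class SetVector {
  std::vector<T> Order;       // deterministic iteration order
  std::unordered_set<T> Seen; // fast membership test
public:
  bool insert(const T &V) {
    if (!Seen.insert(V).second)
      return false; // duplicate: order is unchanged
    Order.push_back(V);
    return true;
  }
  bool count(const T &V) const { return Seen.count(V) != 0; }
  void remove(const T &V) { // linear scan, like the real SetVector::remove
    if (Seen.erase(V))
      Order.erase(std::find(Order.begin(), Order.end(), V));
  }
  bool empty() const { return Order.empty(); }
  typename std::vector<T>::const_iterator begin() const { return Order.begin(); }
  typename std::vector<T>::const_iterator end() const { return Order.end(); }
};

int main() {
  SetVector<int> S;
  S.insert(3); S.insert(1); S.insert(3); // second 3 is ignored
  S.remove(3);
  return S.count(1) && !S.count(3) ? 0 : 1;
}

The linear-time remove is the trade-off; it is tolerable here because removals are comparatively rare on these paths.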
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index f3c92d6..9759549 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -39,7 +39,7 @@ static unsigned getHash(const void *V) {
}
//===----------------------------------------------------------------------===//
-// SimpleValue
+// SimpleValue
//===----------------------------------------------------------------------===//
namespace {
@@ -47,16 +47,16 @@ namespace {
/// scoped hash table.
struct SimpleValue {
Instruction *Inst;
-
+
SimpleValue(Instruction *I) : Inst(I) {
assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
}
-
+
bool isSentinel() const {
return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
}
-
+
static bool canHandle(Instruction *Inst) {
// This can only handle non-void readnone functions.
if (CallInst *CI = dyn_cast<CallInst>(Inst))
@@ -90,7 +90,7 @@ template<> struct DenseMapInfo<SimpleValue> {
unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
Instruction *Inst = Val.Inst;
-
+
// Hash in all of the operands as pointers.
unsigned Res = 0;
for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
@@ -126,13 +126,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
if (LHS.isSentinel() || RHS.isSentinel())
return LHSI == RHSI;
-
+
if (LHSI->getOpcode() != RHSI->getOpcode()) return false;
return LHSI->isIdenticalTo(RHSI);
}
//===----------------------------------------------------------------------===//
-// CallValue
+// CallValue
//===----------------------------------------------------------------------===//
namespace {
@@ -140,21 +140,21 @@ namespace {
/// the scoped hash table.
struct CallValue {
Instruction *Inst;
-
+
CallValue(Instruction *I) : Inst(I) {
assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
}
-
+
bool isSentinel() const {
return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
}
-
+
static bool canHandle(Instruction *Inst) {
// Don't value number anything that returns void.
if (Inst->getType()->isVoidTy())
return false;
-
+
CallInst *CI = dyn_cast<CallInst>(Inst);
if (CI == 0 || !CI->onlyReadsMemory())
return false;
@@ -168,7 +168,7 @@ namespace llvm {
template<> struct isPodLike<CallValue> {
static const bool value = true;
};
-
+
template<> struct DenseMapInfo<CallValue> {
static inline CallValue getEmptyKey() {
return DenseMapInfo<Instruction*>::getEmptyKey();
@@ -189,7 +189,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
"Cannot value number calls with metadata operands");
Res ^= getHash(Inst->getOperand(i)) << (i & 0xF);
}
-
+
// Mix in the opcode.
return (Res << 1) ^ Inst->getOpcode();
}
@@ -203,11 +203,11 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
//===----------------------------------------------------------------------===//
-// EarlyCSE pass.
+// EarlyCSE pass.
//===----------------------------------------------------------------------===//
namespace {
-
+
/// EarlyCSE - This pass does a simple depth-first walk over the dominator
/// tree, eliminating trivially redundant instructions and using instsimplify
/// to canonicalize things as it goes. It is intended to be fast and catch
@@ -223,14 +223,14 @@ public:
ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>,
AllocatorTy> ScopedHTType;
-
+
/// AvailableValues - This scoped hash table contains the current values of
/// all of our simple scalar expressions. As we walk down the domtree, we
/// look to see if instructions are in this: if so, we replace them with what
/// we find, otherwise we insert them so that dominated values can succeed in
/// their lookup.
ScopedHTType *AvailableValues;
-
+
/// AvailableLoads - This scoped hash table contains the current values
/// of loads. This allows us to get efficient access to dominating loads when
/// we have a fully redundant load. In addition to the most recent load, we
@@ -243,15 +243,15 @@ public:
typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>,
DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType;
LoadHTType *AvailableLoads;
-
+
/// AvailableCalls - This scoped hash table contains the current values
/// of read-only call values. It uses the same generation count as loads.
typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType;
CallHTType *AvailableCalls;
-
+
/// CurrentGeneration - This is the current generation of the memory value.
unsigned CurrentGeneration;
-
+
static char ID;
explicit EarlyCSE() : FunctionPass(ID) {
initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
@@ -326,7 +326,7 @@ private:
};
bool processNode(DomTreeNode *Node);
-
+
// This transformation requires dominator and postdominator info
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTree>();
@@ -350,7 +350,7 @@ INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)
bool EarlyCSE::processNode(DomTreeNode *Node) {
BasicBlock *BB = Node->getBlock();
-
+
// If this block has a single predecessor, then the predecessor is the parent
// of the domtree node and all of the live out memory values are still current
// in this block. If this block has multiple predecessors, then they could
@@ -359,20 +359,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// predecessors.
if (BB->getSinglePredecessor() == 0)
++CurrentGeneration;
-
+
/// LastStore - Keep track of the last non-volatile store that we saw... for
/// as long as there is no instruction that reads memory. If we see a store
/// to the same location, we delete the dead store. This zaps trivial dead
/// stores which can occur in bitfield code among other things.
StoreInst *LastStore = 0;
-
+
bool Changed = false;
// See if any instructions in the block can be eliminated. If so, do it. If
// not, add them to AvailableValues.
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
Instruction *Inst = I++;
-
+
// Dead instructions should just be removed.
if (isInstructionTriviallyDead(Inst)) {
DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
@@ -381,7 +381,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++NumSimplify;
continue;
}
-
+
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) {
@@ -392,7 +392,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++NumSimplify;
continue;
}
-
+
// If this is a simple instruction that we can value number, process it.
if (SimpleValue::canHandle(Inst)) {
// See if the instruction has an available value. If so, use it.
@@ -404,12 +404,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++NumCSE;
continue;
}
-
+
// Otherwise, just remember that this value is available.
AvailableValues->insert(Inst, Inst);
continue;
}
-
+
// If this is a non-volatile load, process it.
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
// Ignore volatile loads.
@@ -417,7 +417,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
LastStore = 0;
continue;
}
-
+
// If we have an available version of this load, and if it is the right
// generation, replace this instruction.
std::pair<Value*, unsigned> InVal =
@@ -431,18 +431,18 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++NumCSELoad;
continue;
}
-
+
// Otherwise, remember that we have this instruction.
AvailableLoads->insert(Inst->getOperand(0),
std::pair<Value*, unsigned>(Inst, CurrentGeneration));
LastStore = 0;
continue;
}
-
+
// If this instruction may read from memory, forget LastStore.
if (Inst->mayReadFromMemory())
LastStore = 0;
-
+
// If this is a read-only call, process it.
if (CallValue::canHandle(Inst)) {
// If we have an available version of this call, and if it is the right
@@ -457,19 +457,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
++NumCSECall;
continue;
}
-
+
// Otherwise, remember that we have this instruction.
AvailableCalls->insert(Inst,
std::pair<Value*, unsigned>(Inst, CurrentGeneration));
continue;
}
-
+
// Okay, this isn't something we can CSE at all. Check to see if it is
// something that could modify memory. If so, our available memory values
// cannot be used so bump the generation count.
if (Inst->mayWriteToMemory()) {
++CurrentGeneration;
-
+
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// We do a trivial form of DSE if there are two stores to the same
// location with no intervening loads. Delete the earlier store.
@@ -483,7 +483,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
LastStore = 0;
continue;
}
-
+
// Okay, we just invalidated anything we knew about loaded values. Try
// to salvage *something* by remembering that the stored value is a live
// version of the pointer. It is safe to forward from volatile stores
@@ -491,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// the store.
AvailableLoads->insert(SI->getPointerOperand(),
std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration));
-
+
// Remember that this was the last store we saw for DSE.
if (SI->isSimple())
LastStore = SI;
@@ -509,7 +509,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
TD = getAnalysisIfAvailable<TargetData>();
TLI = &getAnalysis<TargetLibraryInfo>();
DT = &getAnalysis<DominatorTree>();
-
+
// Tables that the pass uses when walking the domtree.
ScopedHTType AVTable;
AvailableValues = &AVTable;
@@ -517,7 +517,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
AvailableLoads = &LoadTable;
CallHTType CallTable;
AvailableCalls = &CallTable;
-
+
CurrentGeneration = 0;
bool Changed = false;
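
The EarlyCSE hunks above are whitespace-only, but the surrounding code rests on one idea worth spelling out: available loads and calls are remembered together with CurrentGeneration, the counter is bumped whenever an instruction may write memory, and a remembered value is reused only if its generation still matches. A tiny standalone model of that invalidation scheme (hypothetical names):

#include <cassert>
#include <map>
#include <string>
#include <utility>

struct AvailableLoads {
  std::map<std::string, std::pair<int, unsigned>> Table; // ptr -> (value, gen)
  unsigned CurrentGeneration = 0;

  void recordLoad(const std::string &Ptr, int V) {
    Table[Ptr] = {V, CurrentGeneration};
  }
  // A prior load is reusable only if nothing may have written memory since.
  bool lookup(const std::string &Ptr, int &V) const {
    auto It = Table.find(Ptr);
    if (It == Table.end() || It->second.second != CurrentGeneration)
      return false;
    V = It->second.first;
    return true;
  }
  void mayWriteToMemory() { ++CurrentGeneration; } // invalidates everything
};

int main() {
  AvailableLoads AL;
  int V;
  AL.recordLoad("p", 42);
  assert(AL.lookup("p", V) && V == 42); // redundant load -> reuse 42
  AL.mayWriteToMemory();                // a store intervenes
  assert(!AL.lookup("p", V));           // stale generation -> must reload
}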
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index fb733ad..120175d 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -18,8 +18,15 @@
#define DEBUG_TYPE "gvn"
#include "llvm/Transforms/Scalar.h"
#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
+#include "llvm/Metadata.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/Dominators.h"
@@ -30,20 +37,14 @@
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
using namespace PatternMatch;
@@ -59,6 +60,11 @@ static cl::opt<bool> EnablePRE("enable-pre",
cl::init(true), cl::Hidden);
static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+// Maximum allowed recursion depth.
+static cl::opt<uint32_t>
+MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+ cl::desc("Max recurse depth (default = 1000)"));
+
//===----------------------------------------------------------------------===//
// ValueTable Class
//===----------------------------------------------------------------------===//
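
The new MaxRecurseDepth option bounds IsValueFullyAvailableInBlock so a pathologically deep predecessor chain cannot overflow the stack: past the limit the walk answers "not fully available", which is conservative but safe. A standalone sketch of a depth-capped availability walk over a toy graph (hypothetical names):

#include <map>
#include <vector>

static const unsigned MaxRecurseDepth = 1000;

// Node -> predecessors. A value is fully available in a node when it is
// already known there, or available in all of its predecessors.
using Graph = std::map<int, std::vector<int>>;

bool fullyAvailable(const Graph &G, int Node, std::map<int, bool> &Known,
                    unsigned Depth) {
  if (Depth > MaxRecurseDepth)
    return false; // give up conservatively instead of overflowing the stack
  auto It = Known.find(Node);
  if (It != Known.end())
    return It->second;
  Known[Node] = true; // optimistic assumption, as in the real code
  for (int Pred : G.at(Node))
    if (!fullyAvailable(G, Pred, Known, Depth + 1))
      return Known[Node] = false; // undo the optimism and bail out
  return true;
}

int main() {
  Graph G = {{0, {}}, {1, {0}}, {2, {1}}};
  std::map<int, bool> Known = {{0, true}}; // seed: value defined in node 0
  return fullyAvailable(G, 2, Known, 0) ? 0 : 1;
}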
@@ -167,7 +173,7 @@ Expression ValueTable::create_expression(Instruction *I) {
if (e.varargs[0] > e.varargs[1])
std::swap(e.varargs[0], e.varargs[1]);
}
-
+
if (CmpInst *C = dyn_cast<CmpInst>(I)) {
// Sort the operand value numbers so x<y and y>x get the same value number.
CmpInst::Predicate Predicate = C->getPredicate();
@@ -181,7 +187,7 @@ Expression ValueTable::create_expression(Instruction *I) {
II != IE; ++II)
e.varargs.push_back(*II);
}
-
+
return e;
}
@@ -385,7 +391,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
valueNumbering[V] = nextValueNumber;
return nextValueNumber++;
}
-
+
Instruction* I = cast<Instruction>(V);
Expression exp;
switch (I->getOpcode()) {
@@ -501,17 +507,17 @@ namespace {
const TargetLibraryInfo *TLI;
ValueTable VN;
-
+
/// LeaderTable - A mapping from value numbers to lists of Value*'s that
/// have that value number. Use findLeader to query it.
struct LeaderTableEntry {
Value *Val;
- BasicBlock *BB;
+ const BasicBlock *BB;
LeaderTableEntry *Next;
};
DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
BumpPtrAllocator TableAllocator;
-
+
SmallVector<Instruction*, 8> InstrsToErase;
public:
static char ID; // Pass identification, replacement for typeid
@@ -521,14 +527,14 @@ namespace {
}
bool runOnFunction(Function &F);
-
+
/// markInstructionForDeletion - This removes the specified instruction from
/// our various maps and marks it for deletion.
void markInstructionForDeletion(Instruction *I) {
VN.erase(I);
InstrsToErase.push_back(I);
}
-
+
const TargetData *getTargetData() const { return TD; }
DominatorTree &getDominatorTree() const { return *DT; }
AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
@@ -536,32 +542,32 @@ namespace {
private:
/// addToLeaderTable - Push a new Value to the LeaderTable onto the list for
/// its value number.
- void addToLeaderTable(uint32_t N, Value *V, BasicBlock *BB) {
+ void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
LeaderTableEntry &Curr = LeaderTable[N];
if (!Curr.Val) {
Curr.Val = V;
Curr.BB = BB;
return;
}
-
+
LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
Node->Val = V;
Node->BB = BB;
Node->Next = Curr.Next;
Curr.Next = Node;
}
-
+
/// removeFromLeaderTable - Scan the list of values corresponding to a given
- /// value number, and remove the given value if encountered.
- void removeFromLeaderTable(uint32_t N, Value *V, BasicBlock *BB) {
+ /// value number, and remove the given instruction if encountered.
+ void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
LeaderTableEntry* Prev = 0;
LeaderTableEntry* Curr = &LeaderTable[N];
- while (Curr->Val != V || Curr->BB != BB) {
+ while (Curr->Val != I || Curr->BB != BB) {
Prev = Curr;
Curr = Curr->Next;
}
-
+
if (Prev) {
Prev->Next = Curr->Next;
} else {
@@ -591,7 +597,7 @@ namespace {
AU.addPreserved<DominatorTree>();
AU.addPreserved<AliasAnalysis>();
}
-
+
// Helper functions
// FIXME: eliminate or document these better
@@ -602,13 +608,13 @@ namespace {
void dump(DenseMap<uint32_t, Value*> &d);
bool iterateOnFunction(Function &F);
bool performPRE(Function &F);
- Value *findLeader(BasicBlock *BB, uint32_t num);
+ Value *findLeader(const BasicBlock *BB, uint32_t num);
void cleanupGlobalSets();
void verifyRemoved(const Instruction *I) const;
bool splitCriticalEdges();
unsigned replaceAllDominatedUsesWith(Value *From, Value *To,
- BasicBlock *Root);
- bool propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root);
+ const BasicBlock *Root);
+ bool propagateEquality(Value *LHS, Value *RHS, const BasicBlock *Root);
};
char GVN::ID = 0;
@@ -647,7 +653,11 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
/// 3) we are speculating for this block and have used that to speculate for
/// other blocks.
static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
- DenseMap<BasicBlock*, char> &FullyAvailableBlocks) {
+ DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+ uint32_t RecurseDepth) {
+ if (RecurseDepth > MaxRecurseDepth)
+ return false;
+
// Optimistically assume that the block is fully available and check to see
// if we already know about this block in one lookup.
std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
@@ -673,7 +683,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
// If the value isn't fully available in one of our predecessors, then it
// isn't fully available in this block either. Undo our previous
// optimistic assumption and bail out.
- if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
+ if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
goto SpeculationFailure;
return true;
@@ -725,15 +735,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
StoredVal->getType()->isStructTy() ||
StoredVal->getType()->isArrayTy())
return false;
-
+
// The store has to be at least as big as the load.
if (TD.getTypeSizeInBits(StoredVal->getType()) <
TD.getTypeSizeInBits(LoadTy))
return false;
-
+
return true;
}
-
+
/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and
/// then a load from a must-aliased pointer of a different type, try to coerce
@@ -741,80 +751,80 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
/// InsertPt is the place to insert new instructions.
///
/// If we can't do it, return null.
-static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
+static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
Type *LoadedTy,
Instruction *InsertPt,
const TargetData &TD) {
if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD))
return 0;
-
+
// If this is already the right type, just return it.
Type *StoredValTy = StoredVal->getType();
-
+
uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy);
uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy);
-
+
// If the store and reload are the same size, we can always reuse it.
if (StoreSize == LoadSize) {
// Pointer to Pointer -> use bitcast.
if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy())
return new BitCastInst(StoredVal, LoadedTy, "", InsertPt);
-
+
// Convert source pointers to integers, which can be bitcast.
if (StoredValTy->isPointerTy()) {
StoredValTy = TD.getIntPtrType(StoredValTy->getContext());
StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
}
-
+
Type *TypeToCastTo = LoadedTy;
if (TypeToCastTo->isPointerTy())
TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext());
-
+
if (StoredValTy != TypeToCastTo)
StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt);
-
+
// Cast to pointer if the load needs a pointer type.
if (LoadedTy->isPointerTy())
StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt);
-
+
return StoredVal;
}
-
+
// If the loaded value is smaller than the available value, then we can
// extract out a piece from it. If the available value is too small, then we
// can't do anything.
assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail");
-
+
// Convert source pointers to integers, which can be manipulated.
if (StoredValTy->isPointerTy()) {
StoredValTy = TD.getIntPtrType(StoredValTy->getContext());
StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
}
-
+
// Convert vectors and fp to integer, which can be manipulated.
if (!StoredValTy->isIntegerTy()) {
StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize);
StoredVal = new BitCastInst(StoredVal, StoredValTy, "", InsertPt);
}
-
+
// If this is a big-endian system, we need to shift the value down to the low
// bits so that a truncate will work.
if (TD.isBigEndian()) {
Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize);
StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt);
}
-
+
// Truncate the integer to the right size now.
Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt);
-
+
if (LoadedTy == NewIntTy)
return StoredVal;
-
+
// If the result is a pointer, inttoptr.
if (LoadedTy->isPointerTy())
return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt);
-
+
// Otherwise, bitcast.
return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt);
}
@@ -835,13 +845,13 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
// to transform them. We need to be able to bitcast to integer.
if (LoadTy->isStructTy() || LoadTy->isArrayTy())
return -1;
-
+
int64_t StoreOffset = 0, LoadOffset = 0;
Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD);
Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD);
if (StoreBase != LoadBase)
return -1;
-
+
// If the load and store are to the exact same address, they should have been
// a must alias. AA must have gotten confused.
// FIXME: Study to see if/when this happens. One case is forwarding a memset
@@ -856,18 +866,18 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
abort();
}
#endif
-
+
// If the load and store don't overlap at all, the store doesn't provide
// anything to the load. In this case, they really don't alias at all, AA
// must have gotten confused.
uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);
-
+
if ((WriteSizeInBits & 7) | (LoadSize & 7))
return -1;
uint64_t StoreSize = WriteSizeInBits >> 3; // Convert to bytes.
LoadSize >>= 3;
-
-
+
+
bool isAAFailure = false;
if (StoreOffset < LoadOffset)
isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
@@ -885,7 +895,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
#endif
return -1;
}
-
+
// If the Load isn't completely contained within the stored bits, we don't
// have all the bits to feed it. We could do something crazy in the future
// (issue a smaller load then merge the bits in) but this seems unlikely to be
@@ -893,11 +903,11 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
if (StoreOffset > LoadOffset ||
StoreOffset+StoreSize < LoadOffset+LoadSize)
return -1;
-
+
// Okay, we can do this transformation. Return the number of bytes into the
// store that the load is.
return LoadOffset-StoreOffset;
-}
+}
/// AnalyzeLoadFromClobberingStore - This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.
@@ -923,23 +933,23 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
// Cannot handle reading from store of first-class aggregate yet.
if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
return -1;
-
+
Value *DepPtr = DepLI->getPointerOperand();
uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType());
int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD);
if (R != -1) return R;
-
+
// If we have a load/load clobber and DepLI can be widened to cover this load,
// then we should widen it!
int64_t LoadOffs = 0;
const Value *LoadBase =
GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD);
unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
-
+
unsigned Size = MemoryDependenceAnalysis::
getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD);
if (Size == 0) return -1;
-
+
return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD);
}
@@ -958,29 +968,29 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
if (MI->getIntrinsicID() == Intrinsic::memset)
return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
MemSizeInBits, TD);
-
+
// If we have a memcpy/memmove, the only case we can handle is if this is a
// copy from constant memory. In that case, we can read directly from the
// constant memory.
MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
+
Constant *Src = dyn_cast<Constant>(MTI->getSource());
if (Src == 0) return -1;
-
+
GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD));
if (GV == 0 || !GV->isConstant()) return -1;
-
+
// See if the access is within the bounds of the transfer.
int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
MI->getDest(), MemSizeInBits, TD);
if (Offset == -1)
return Offset;
-
+
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
Src = ConstantExpr::getBitCast(Src,
llvm::Type::getInt8PtrTy(Src->getContext()));
- Constant *OffsetCst =
+ Constant *OffsetCst =
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
@@ -988,7 +998,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
return Offset;
return -1;
}
-
+
/// GetStoreValueForLoad - This function is called when we have a
/// memdep query of a load that ends up being a clobbering store. This means
@@ -999,32 +1009,32 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
Type *LoadTy,
Instruction *InsertPt, const TargetData &TD){
LLVMContext &Ctx = SrcVal->getType()->getContext();
-
+
uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8;
-
+
IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
-
+
// Compute which bits of the stored value are being used by the load. Convert
// to an integer type to start with.
if (SrcVal->getType()->isPointerTy())
SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx));
if (!SrcVal->getType()->isIntegerTy())
SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
-
+
// Shift the bits to the least significant depending on endianness.
unsigned ShiftAmt;
if (TD.isLittleEndian())
ShiftAmt = Offset*8;
else
ShiftAmt = (StoreSize-LoadSize-Offset)*8;
-
+
if (ShiftAmt)
SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
-
+
if (LoadSize != StoreSize)
SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
-
+
return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
}
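
GetStoreValueForLoad, lightly reformatted above, extracts the loaded bytes from a wider stored value by shifting the interesting bits down to the least significant position and truncating; the shift amount depends on endianness. The same arithmetic on plain integers (hypothetical names):

#include <cassert>
#include <cstdint>

// Extract LoadSize bytes at byte Offset from a StoreSize-byte stored value.
uint64_t extractBits(uint64_t Stored, unsigned StoreSize, unsigned LoadSize,
                     unsigned Offset, bool LittleEndian) {
  unsigned ShiftAmt = LittleEndian ? Offset * 8
                                   : (StoreSize - LoadSize - Offset) * 8;
  uint64_t V = Stored >> ShiftAmt;                            // shift down
  return LoadSize == 8 ? V : V & ((1ULL << (LoadSize * 8)) - 1); // truncate
}

int main() {
  // 8-byte store of 0x0102030405060708; load 2 bytes at byte offset 1.
  uint64_t S = 0x0102030405060708ULL;
  assert(extractBits(S, 8, 2, 1, true) == 0x0607);  // little-endian layout
  assert(extractBits(S, 8, 2, 1, false) == 0x0203); // big-endian layout
}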
@@ -1051,14 +1061,14 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
NewLoadSize = NextPowerOf2(NewLoadSize);
Value *PtrVal = SrcVal->getPointerOperand();
-
+
// Insert the new load after the old load. This ensures that subsequent
// memdep queries will find the new load. We can't easily remove the old
// load completely because it is already in the value numbering table.
IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- Type *DestPTy =
+ Type *DestPTy =
IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
- DestPTy = PointerType::get(DestPTy,
+ DestPTy = PointerType::get(DestPTy,
cast<PointerType>(PtrVal->getType())->getAddressSpace());
Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
@@ -1068,7 +1078,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
+
// Replace uses of the original load with the wider load. On a big endian
// system, we need to shift down to get the relevant bits.
Value *RV = NewLoad;
@@ -1077,7 +1087,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
RV = Builder.CreateTrunc(RV, SrcVal->getType());
SrcVal->replaceAllUsesWith(RV);
-
+
// We would like to use gvn.markInstructionForDeletion here, but we can't
// because the load is already memoized into the leader map table that GVN
// tracks. It is potentially possible to remove the load from the table,
@@ -1086,7 +1096,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
gvn.getMemDep().removeInstruction(SrcVal);
SrcVal = NewLoad;
}
-
+
return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD);
}
@@ -1100,7 +1110,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;
IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
-
+
// We know that this method is only called when the mem transfer fully
// provides the bits for the load.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
@@ -1109,9 +1119,9 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
Value *Val = MSI->getValue();
if (LoadSize != 1)
Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
-
+
Value *OneElt = Val;
-
+
// Splat the value out to the right number of bits.
for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
// If we can double the number of bytes set, do it.
@@ -1121,16 +1131,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
NumBytesSet <<= 1;
continue;
}
-
+
// Otherwise insert one byte at a time.
Value *ShVal = Builder.CreateShl(Val, 1*8);
Val = Builder.CreateOr(OneElt, ShVal);
++NumBytesSet;
}
-
+
return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD);
}
-
+
// Otherwise, this is a memcpy/memmove from a constant global.
MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
Constant *Src = cast<Constant>(MTI->getSource());
@@ -1139,7 +1149,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
// offset applied as appropriate.
Src = ConstantExpr::getBitCast(Src,
llvm::Type::getInt8PtrTy(Src->getContext()));
- Constant *OffsetCst =
+ Constant *OffsetCst =
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
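
When a load is fully covered by a memset, GetMemInstValueForLoad has to materialize LoadSize copies of the memset byte. The splat loop above doubles the number of set bytes per iteration while it can, then falls back to appending one byte at a time. A standalone sketch of that splat (hypothetical names):

#include <cassert>
#include <cstdint>

// Splat one memset byte across LoadSize bytes, doubling where possible.
uint64_t splatByte(uint8_t Byte, unsigned LoadSize) {
  uint64_t Val = Byte, OneElt = Byte;
  for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
    if (NumBytesSet * 2 <= LoadSize) {        // double the set bytes
      Val = (Val << (NumBytesSet * 8)) | Val;
      NumBytesSet <<= 1;
      continue;
    }
    Val = OneElt | (Val << 8);                // otherwise add one byte
    ++NumBytesSet;
  }
  return Val;
}

int main() {
  assert(splatByte(0xAB, 4) == 0xABABABABULL);
  assert(splatByte(0xAB, 3) == 0xABABABULL); // 1 -> 2 -> 3 bytes
}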
@@ -1156,13 +1166,13 @@ struct AvailableValueInBlock {
LoadVal, // A value produced by a load.
MemIntrin // A memory intrinsic which is loaded from.
};
-
+
/// V - The value that is live out of the block.
PointerIntPair<Value *, 2, ValType> Val;
-
+
/// Offset - The byte offset in Val that is interesting for the load query.
unsigned Offset;
-
+
static AvailableValueInBlock get(BasicBlock *BB, Value *V,
unsigned Offset = 0) {
AvailableValueInBlock Res;
@@ -1182,7 +1192,7 @@ struct AvailableValueInBlock {
Res.Offset = Offset;
return Res;
}
-
+
static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI,
unsigned Offset = 0) {
AvailableValueInBlock Res;
@@ -1201,17 +1211,17 @@ struct AvailableValueInBlock {
assert(isSimpleValue() && "Wrong accessor");
return Val.getPointer();
}
-
+
LoadInst *getCoercedLoadValue() const {
assert(isCoercedLoadValue() && "Wrong accessor");
return cast<LoadInst>(Val.getPointer());
}
-
+
MemIntrinsic *getMemIntrinValue() const {
assert(isMemIntrinValue() && "Wrong accessor");
return cast<MemIntrinsic>(Val.getPointer());
}
-
+
/// MaterializeAdjustedValue - Emit code into this block to adjust the value
/// defined here to the specified type. This handles various coercion cases.
Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const {
@@ -1223,7 +1233,7 @@ struct AvailableValueInBlock {
assert(TD && "Need target data to handle type mismatch case");
Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(),
*TD);
-
+
DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
<< *getSimpleValue() << '\n'
<< *Res << '\n' << "\n\n\n");
@@ -1235,7 +1245,7 @@ struct AvailableValueInBlock {
} else {
Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(),
gvn);
-
+
DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
<< *getCoercedLoadValue() << '\n'
<< *Res << '\n' << "\n\n\n");
@@ -1258,12 +1268,12 @@ struct AvailableValueInBlock {
/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
/// construct SSA form, allowing us to eliminate LI. This returns the value
/// that should be used at LI's definition site.
-static Value *ConstructSSAForLoadSet(LoadInst *LI,
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
GVN &gvn) {
// Check for the fully redundant, dominating load case. In this case, we can
// just use the dominating value directly.
- if (ValuesPerBlock.size() == 1 &&
+ if (ValuesPerBlock.size() == 1 &&
gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
LI->getParent()))
return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
@@ -1272,29 +1282,29 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
SmallVector<PHINode*, 8> NewPHIs;
SSAUpdater SSAUpdate(&NewPHIs);
SSAUpdate.Initialize(LI->getType(), LI->getName());
-
+
Type *LoadTy = LI->getType();
-
+
for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) {
const AvailableValueInBlock &AV = ValuesPerBlock[i];
BasicBlock *BB = AV.BB;
-
+
if (SSAUpdate.HasValueForBlock(BB))
continue;
SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn));
}
-
+
// Perform PHI construction.
Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
-
+
// If new PHI nodes were created, notify alias analysis.
if (V->getType()->isPointerTy()) {
AliasAnalysis *AA = gvn.getAliasAnalysis();
-
+
for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
AA->copyValue(LI, NewPHIs[i]);
-
+
// Now that we've copied information to the new PHIs, scan through
// them again and inform alias analysis that we've added potentially
// escaping uses to any values that are operands to these PHIs.
@@ -1366,7 +1376,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// the pointer operand of the load if PHI translation occurs. Make sure
// to consider the right address.
Value *Address = Deps[i].getAddress();
-
+
// If the dependence is to a store that writes to a superset of the bits
// read by the load, we can extract the bits we need for the load from the
// stored value.
@@ -1382,7 +1392,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
}
}
}
-
+
// Check to see if we have something like this:
// load i32* P
// load i8* (P+1)
@@ -1394,7 +1404,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(),
LI->getPointerOperand(),
DepLI, *TD);
-
+
if (Offset != -1) {
ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
Offset));
@@ -1413,10 +1423,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
Offset));
continue;
- }
+ }
}
}
-
+
UnavailableBlocks.push_back(DepBB);
continue;
}
@@ -1426,14 +1436,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
Instruction *DepInst = DepInfo.getInst();
// Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMalloc(DepInst) ||
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) ||
// Loading immediately after lifetime begin -> undef.
isLifetimeStart(DepInst)) {
ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
UndefValue::get(LI->getType())));
continue;
}
-
+
if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
// Reject loads and stores that are to the same address but are of
// different types if we have to.
@@ -1451,7 +1461,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
S->getValueOperand()));
continue;
}
-
+
if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
// If the types mismatch and we can't handle it, reject reuse of the load.
if (LD->getType() != LI->getType()) {
@@ -1460,12 +1470,12 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){
UnavailableBlocks.push_back(DepBB);
continue;
- }
+ }
}
ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD));
continue;
}
-
+
UnavailableBlocks.push_back(DepBB);
continue;
}
@@ -1479,7 +1489,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// its value. Insert PHIs and remove the fully redundant value now.
if (UnavailableBlocks.empty()) {
DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
-
+
// Perform PHI construction.
Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
LI->replaceAllUsesWith(V);
@@ -1522,10 +1532,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
return false;
if (Blockers.count(TmpBB))
return false;
-
+
// If any of these blocks has more than one successor (i.e. if the edge we
- // just traversed was critical), then there are other paths through this
- // block along which the load may not be anticipated. Hoisting the load
+ // just traversed was critical), then there are other paths through this
+ // block along which the load may not be anticipated. Hoisting the load
// above this block would be adding the load to execution paths along
// which it was not previously executed.
if (TmpBB->getTerminator()->getNumSuccessors() != 1)
@@ -1570,7 +1580,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
PI != E; ++PI) {
BasicBlock *Pred = *PI;
- if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) {
+ if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
continue;
}
PredLoads[Pred] = 0;
@@ -1603,7 +1613,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
unsigned NumUnavailablePreds = PredLoads.size();
assert(NumUnavailablePreds != 0 &&
"Fully available value should be eliminated above!");
-
+
// If this load is unavailable in multiple predecessors, reject it.
// FIXME: If we could restructure the CFG, we could make a common pred with
// all the preds that don't have an available LI and insert a new load into
@@ -1680,10 +1690,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
DEBUG(if (!NewInsts.empty())
dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
<< *NewInsts.back() << '\n');
-
+
// Assign value numbers to the new instructions.
for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) {
- // FIXME: We really _ought_ to insert these value numbers into their
+ // FIXME: We really _ought_ to insert these value numbers into their
// parent's availability map. However, in doing so, we risk getting into
// ordering issues. If a block hasn't been processed yet, we would be
// marking a value as AVAIL-IN, which isn't what we intend.
@@ -1725,6 +1735,53 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
return true;
}
+static void patchReplacementInstruction(Value *Repl, Instruction *I) {
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ BinaryOperator *Op = dyn_cast<BinaryOperator>(I);
+ BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl);
+ if (Op && ReplOp && isa<OverflowingBinaryOperator>(Op) &&
+ isa<OverflowingBinaryOperator>(ReplOp)) {
+ if (ReplOp->hasNoSignedWrap() && !Op->hasNoSignedWrap())
+ ReplOp->setHasNoSignedWrap(false);
+ if (ReplOp->hasNoUnsignedWrap() && !Op->hasNoUnsignedWrap())
+ ReplOp->setHasNoUnsignedWrap(false);
+ }
+ if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) {
+ SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
+ ReplInst->getAllMetadataOtherThanDebugLoc(Metadata);
+ for (int i = 0, n = Metadata.size(); i < n; ++i) {
+ unsigned Kind = Metadata[i].first;
+ MDNode *IMD = I->getMetadata(Kind);
+ MDNode *ReplMD = Metadata[i].second;
+ switch(Kind) {
+ default:
+ ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata
+ break;
+ case LLVMContext::MD_dbg:
+ llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
+ case LLVMContext::MD_tbaa:
+ ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD));
+ break;
+ case LLVMContext::MD_range:
+ ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD));
+ break;
+ case LLVMContext::MD_prof:
+ llvm_unreachable("MD_prof in a non terminator instruction");
+ break;
+ case LLVMContext::MD_fpmath:
+ ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD));
+ break;
+ }
+ }
+ }
+}
+
+static void patchAndReplaceAllUsesWith(Value *Repl, Instruction *I) {
+ patchReplacementInstruction(Repl, I);
+ I->replaceAllUsesWith(Repl);
+}
+
/// processLoad - Attempt to eliminate a load, first by eliminating it
/// locally, and then attempting non-local elimination if that fails.
bool GVN::processLoad(LoadInst *L) {
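
The new patchReplacementInstruction guards a subtle soundness point: when GVN replaces an instruction with an "equal" one, the survivor may carry stronger claims — nsw/nuw flags, tbaa or range metadata — than the instruction it replaces, and those must be dropped or weakened to the most generic form common to both. A minimal standalone model of the flag intersection (hypothetical types; the metadata cases follow the same pattern via the getMostGeneric* helpers):

struct WrapFlags {
  bool NoSignedWrap;
  bool NoUnsignedWrap;
};

// The replacement may keep a flag only if both instructions had it;
// keeping a flag the replaced instruction lacked would be unsound.
WrapFlags mergeForReplacement(WrapFlags Repl, WrapFlags Orig) {
  return {Repl.NoSignedWrap && Orig.NoSignedWrap,
          Repl.NoUnsignedWrap && Orig.NoUnsignedWrap};
}

int main() {
  WrapFlags Repl = {true, true};  // survivor: add nsw nuw
  WrapFlags Orig = {false, true}; // replaced: add nuw
  WrapFlags Merged = mergeForReplacement(Repl, Orig);
  return (!Merged.NoSignedWrap && Merged.NoUnsignedWrap) ? 0 : 1;
}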
@@ -1738,7 +1795,7 @@ bool GVN::processLoad(LoadInst *L) {
markInstructionForDeletion(L);
return true;
}
-
+
// ... to a pointer that has been loaded from before...
MemDepResult Dep = MD->getDependency(L);
@@ -1764,7 +1821,7 @@ bool GVN::processLoad(LoadInst *L) {
AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
L->getType(), L, *TD);
}
-
+
// Check to see if we have something like this:
// load i32* P
// load i8* (P+1)
@@ -1774,14 +1831,14 @@ bool GVN::processLoad(LoadInst *L) {
// we have the first instruction in the entry block.
if (DepLI == L)
return false;
-
+
int Offset = AnalyzeLoadFromClobberingLoad(L->getType(),
L->getPointerOperand(),
DepLI, *TD);
if (Offset != -1)
AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
}
-
+
// If the clobbering value is a memset/memcpy/memmove, see if we can forward
// a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
@@ -1791,11 +1848,11 @@ bool GVN::processLoad(LoadInst *L) {
if (Offset != -1)
AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD);
}
-
+
if (AvailVal) {
DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n'
<< *AvailVal << '\n' << *L << "\n\n\n");
-
+
// Replace the load!
L->replaceAllUsesWith(AvailVal);
if (AvailVal->getType()->isPointerTy())
@@ -1805,7 +1862,7 @@ bool GVN::processLoad(LoadInst *L) {
return true;
}
}
-
+
// If the value isn't available, don't do anything!
if (Dep.isClobber()) {
DEBUG(
@@ -1835,7 +1892,7 @@ bool GVN::processLoad(LoadInst *L) {
Instruction *DepInst = Dep.getInst();
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
Value *StoredVal = DepSI->getValueOperand();
-
+
// The store and load are to a must-aliased pointer, but they may not
// actually have the same type. See if we know how to reuse the stored
// value (depending on its type).
@@ -1845,11 +1902,11 @@ bool GVN::processLoad(LoadInst *L) {
L, *TD);
if (StoredVal == 0)
return false;
-
+
DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
<< '\n' << *L << "\n\n\n");
}
- else
+ else
return false;
}
@@ -1864,7 +1921,7 @@ bool GVN::processLoad(LoadInst *L) {
if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
Value *AvailableVal = DepLI;
-
+
// The loads are of a must-aliased pointer, but they may not actually have
// the same type. See if we know how to reuse the previously loaded value
// (depending on its type).
@@ -1874,16 +1931,16 @@ bool GVN::processLoad(LoadInst *L) {
L, *TD);
if (AvailableVal == 0)
return false;
-
+
DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
<< "\n" << *L << "\n\n\n");
}
- else
+ else
return false;
}
-
+
// Remove it!
- L->replaceAllUsesWith(AvailableVal);
+ patchAndReplaceAllUsesWith(AvailableVal, L);
if (DepLI->getType()->isPointerTy())
MD->invalidateCachedPointerInfo(DepLI);
markInstructionForDeletion(L);
@@ -1894,13 +1951,13 @@ bool GVN::processLoad(LoadInst *L) {
// If this load really doesn't depend on anything, then we must be loading an
// undef value. This can happen when loading for a fresh allocation with no
// intervening stores, for example.
- if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) {
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) {
L->replaceAllUsesWith(UndefValue::get(L->getType()));
markInstructionForDeletion(L);
++NumGVNLoad;
return true;
}
-
+
// If this load occurs either right after a lifetime begin,
// then the loaded value is undefined.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) {
@@ -1915,28 +1972,28 @@ bool GVN::processLoad(LoadInst *L) {
return false;
}
-// findLeader - In order to find a leader for a given value number at a
+// findLeader - In order to find a leader for a given value number at a
// specific basic block, we first obtain the list of all Values for that number,
-// and then scan the list to find one whose block dominates the block in
+// and then scan the list to find one whose block dominates the block in
// question. This is fast because dominator tree queries consist of only
// a few comparisons of DFS numbers.
-Value *GVN::findLeader(BasicBlock *BB, uint32_t num) {
+Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
LeaderTableEntry Vals = LeaderTable[num];
if (!Vals.Val) return 0;
-
+
Value *Val = 0;
if (DT->dominates(Vals.BB, BB)) {
Val = Vals.Val;
if (isa<Constant>(Val)) return Val;
}
-
+
LeaderTableEntry* Next = Vals.Next;
while (Next) {
if (DT->dominates(Next->BB, BB)) {
if (isa<Constant>(Next->Val)) return Next->Val;
if (!Val) Val = Next->Val;
}
-
+
Next = Next->Next;
}
@@ -1947,7 +2004,7 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) {
/// use is dominated by the given basic block. Returns the number of uses that
/// were replaced.
unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
- BasicBlock *Root) {
+ const BasicBlock *Root) {
unsigned Count = 0;
for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
UI != UE; ) {
@@ -1973,7 +2030,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
/// propagateEquality - The given values are known to be equal in every block
/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
/// 'RHS' everywhere in the scope. Returns whether a change was made.
-bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) {
+bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlock *Root) {
SmallVector<std::pair<Value*, Value*>, 4> Worklist;
Worklist.push_back(std::make_pair(LHS, RHS));
bool Changed = false;
@@ -2012,9 +2069,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) {
DT->properlyDominates(cast<Instruction>(RHS)->getParent(), Root)) &&
"Instruction doesn't dominate scope!");
- // If value numbering later deduces that an instruction in the scope is equal
- // to 'LHS' then ensure it will be turned into 'RHS'.
- addToLeaderTable(LVN, RHS, Root);
+ // If value numbering later sees that an instruction in the scope is equal
+ // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
+ // the invariant that instructions only occur in the leader table for their
+ // own value number (this is used by removeFromLeaderTable), do not do this
+ // if RHS is an instruction (if an instruction in the scope is morphed into
+ // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+ // using the leader table is about compiling faster, not optimizing better).
+ if (!isa<Instruction>(RHS))
+ addToLeaderTable(LVN, RHS, Root);
// Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
// LHS always has at least one use that is not dominated by Root, this will
@@ -2180,7 +2243,7 @@ bool GVN::processInstruction(Instruction *I) {
// Instructions with void type don't return a value, so there's
// no point in trying to find redundancies in them.
if (I->getType()->isVoidTy()) return false;
-
+
uint32_t NextNum = VN.getNextUnusedValueNumber();
unsigned Num = VN.lookup_or_add(I);
@@ -2198,7 +2261,7 @@ bool GVN::processInstruction(Instruction *I) {
addToLeaderTable(Num, I, I->getParent());
return false;
}
-
+
// Perform fast-path value-number based elimination of values inherited from
// dominators.
Value *repl = findLeader(I->getParent(), Num);
@@ -2207,9 +2270,9 @@ bool GVN::processInstruction(Instruction *I) {
addToLeaderTable(Num, I, I->getParent());
return false;
}
-
+
// Remove it!
- I->replaceAllUsesWith(repl);
+ patchAndReplaceAllUsesWith(repl, I);
if (MD && repl->getType()->isPointerTy())
MD->invalidateCachedPointerInfo(repl);
markInstructionForDeletion(I);
@@ -2234,7 +2297,7 @@ bool GVN::runOnFunction(Function& F) {
// optimization opportunities.
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
BasicBlock *BB = FI++;
-
+
bool removedBlock = MergeBlockIntoPredecessor(BB, this);
if (removedBlock) ++NumGVNBlocks;
@@ -2391,7 +2454,7 @@ bool GVN::performPRE(Function &F) {
// we would need to insert instructions in more than one pred.
if (NumWithout != 1 || NumWith == 0)
continue;
-
+
// Don't do PRE across indirect branch.
if (isa<IndirectBrInst>(PREPred->getTerminator()))
continue;
@@ -2467,7 +2530,7 @@ bool GVN::performPRE(Function &F) {
unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
}
-
+
if (MD)
MD->invalidateCachedPointerInfo(Phi);
}
@@ -2504,7 +2567,7 @@ bool GVN::splitCriticalEdges() {
/// iterateOnFunction - Executes one iteration of GVN
bool GVN::iterateOnFunction(Function &F) {
cleanupGlobalSets();
-
+
// Top-down walk of the dominator tree
bool Changed = false;
#if 0
@@ -2539,7 +2602,7 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
const LeaderTableEntry *Node = &I->second;
assert(Node->Val != Inst && "Inst still in value numbering scope!");
-
+
while (Node->Next) {
Node = Node->Next;
assert(Node->Val != Inst && "Inst still in value numbering scope!");
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
index c2bd6e6..b36a3cb 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -12,7 +12,7 @@
// global). Such a transformation can significantly reduce the register pressure
// when many globals are involved.
//
-// For example, consider the code which touches several global variables at
+// For example, consider the code which touches several global variables at
// once:
//
// static int foo[N], bar[N], baz[N];
@@ -208,8 +208,8 @@ bool GlobalMerge::doInitialization(Module &M) {
if (BSSGlobals.size() > 1)
Changed |= doMerge(BSSGlobals, M, false);
- // FIXME: This currently breaks the EH processing due to way how the
- // typeinfo detection works. We might want to detect the TIs and ignore
+ // FIXME: This currently breaks the EH processing due to way how the
+ // typeinfo detection works. We might want to detect the TIs and ignore
// them in the future.
// if (ConstGlobals.size() > 1)
// Changed |= doMerge(ConstGlobals, M, true);
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index a9ba657..37f8bdf 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1215,21 +1215,26 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
return 0;
}
-/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show
-/// that the current exit test is already sufficiently canonical.
-static bool needsLFTR(Loop *L, DominatorTree *DT) {
+/// Return the compare guarding the loop latch, or NULL for unrecognized tests.
+static ICmpInst *getLoopTest(Loop *L) {
assert(L->getExitingBlock() && "expected loop exit");
BasicBlock *LatchBlock = L->getLoopLatch();
// Don't bother with LFTR if the loop is not properly simplified.
if (!LatchBlock)
- return false;
+ return 0;
BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
assert(BI && "expected exit branch");
+ return dyn_cast<ICmpInst>(BI->getCondition());
+}
+
+/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show
+/// that the current exit test is already sufficiently canonical.
+static bool needsLFTR(Loop *L, DominatorTree *DT) {
// Do LFTR to simplify the exit condition to an ICMP.
- ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ ICmpInst *Cond = getLoopTest(L);
if (!Cond)
return true;
@@ -1259,6 +1264,48 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) {
return Phi != getLoopPhiForCounter(IncV, L, DT);
}
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
+ unsigned Depth) {
+ if (isa<Constant>(V))
+ return !isa<UndefValue>(V);
+
+ if (Depth >= 6)
+ return false;
+
+ // Conservatively handle non-constant non-instructions. For example, Arguments
+ // may be undef.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ // Load and return values may be undef.
+ if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+ return false;
+
+ // Optimistically handle other instructions.
+ for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+ if (!Visited.insert(*OI))
+ continue;
+ if (!hasConcreteDefImpl(*OI, Visited, Depth+1))
+ return false;
+ }
+ return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+ SmallPtrSet<Value*, 8> Visited;
+ Visited.insert(V);
+ return hasConcreteDefImpl(V, Visited, 0);
+}
+
/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to
/// be rewritten) loop exit test.
static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
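
hasConcreteDef above is a bounded, memoized operand walk: constants are concrete unless they are undef, loads and calls are conservatively assumed to hide undef, and anything else is concrete only if all of its operands are. The same shape over a toy expression graph (hypothetical names):

#include <set>
#include <vector>

struct Node {
  enum Kind { Constant, Undef, Load, Op } K;
  std::vector<const Node *> Operands;
};

bool hasConcreteDefImpl(const Node *N, std::set<const Node *> &Visited,
                        unsigned Depth) {
  if (N->K == Node::Constant)
    return true;
  if (N->K == Node::Undef)
    return false;
  if (Depth >= 6)
    return false; // bound the walk, like the real code
  if (N->K == Node::Load)
    return false; // loads (and calls) may produce undef
  for (const Node *Operand : N->Operands) {
    if (!Visited.insert(Operand).second)
      continue; // already checked
    if (!hasConcreteDefImpl(Operand, Visited, Depth + 1))
      return false;
  }
  return true;
}

bool hasConcreteDef(const Node *N) {
  std::set<const Node *> Visited = {N};
  return hasConcreteDefImpl(N, Visited, 0);
}

int main() {
  Node C{Node::Constant, {}}, U{Node::Undef, {}};
  Node Add{Node::Op, {&C, &C}}, Mix{Node::Op, {&C, &U}};
  return (hasConcreteDef(&Add) && !hasConcreteDef(&Mix)) ? 0 : 1;
}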
@@ -1283,6 +1330,8 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
/// valid count without scaling the address stride, so it remains a pointer
/// expression as far as SCEV is concerned.
///
+/// Currently only valid for LFTR. See the comments on hasConcreteDef below.
+///
/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
///
/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
@@ -1331,6 +1380,19 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
if (getLoopPhiForCounter(IncV, L, DT) != Phi)
continue;
+ // Avoid reusing a potentially undef value to compute other values that may
+ // have originally had a concrete definition.
+ if (!hasConcreteDef(Phi)) {
+ // We explicitly allow unknown phis as long as they are already used by
+ // the loop test. In this case we assume that performing LFTR could not
+ // increase the number of undef users.
+ if (ICmpInst *Cond = getLoopTest(L)) {
+ if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT)
+ && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+ continue;
+ }
+ }
+ }
const SCEV *Init = AR->getStart();
if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
@@ -1347,7 +1409,7 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
// If two IVs both count from zero or both count from nonzero then the
// narrower is likely a dead phi that has been widened. Use the wider phi
// to allow the other to be eliminated.
- if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+ else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
continue;
}
BestPhi = Phi;
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 429b61b..dd42c59 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -670,6 +670,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
Condition = SI->getCondition();
} else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+ // Can't thread indirect branch with no successors.
+ if (IB->getNumSuccessors() == 0) return false;
Condition = IB->getAddress()->stripPointerCasts();
Preference = WantBlockAddress;
} else {
@@ -859,7 +861,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// If all of the loads and stores that feed the value have the same TBAA tag,
// then we can propagate it onto any newly inserted loads.
- MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa);
SmallPtrSet<BasicBlock*, 8> PredsScanned;
typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
@@ -885,7 +887,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
OneUnavailablePred = PredBB;
continue;
}
-
+
// If tbaa tags disagree or are not present, forget about them.
if (TBAATag != ThisTBAATag) TBAATag = 0;
@@ -949,7 +951,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
NewVal->setDebugLoc(LI->getDebugLoc());
if (TBAATag)
NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-
+
AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
}
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 8795cd8..0192e92 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -175,7 +175,9 @@ namespace {
bool canSinkOrHoistInst(Instruction &I);
bool isNotUsedInLoop(Instruction &I);
- void PromoteAliasSet(AliasSet &AS);
+ void PromoteAliasSet(AliasSet &AS,
+ SmallVectorImpl<BasicBlock*> &ExitBlocks,
+ SmallVectorImpl<Instruction*> &InsertPts);
};
}
@@ -256,10 +258,13 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can.
if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ SmallVector<Instruction *, 8> InsertPts;
+
// Loop over all of the alias sets in the tracker object.
for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
I != E; ++I)
- PromoteAliasSet(*I);
+ PromoteAliasSet(*I, ExitBlocks, InsertPts);
}
// Clear out loops state information for the next iteration
@@ -618,6 +623,11 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) {
if (!DT->dominates(Inst.getParent(), ExitBlocks[i]))
return false;
+ // As a degenerate case, if the loop is statically infinite then we haven't
+ // proven anything since there are no exit blocks.
+ if (ExitBlocks.empty())
+ return false;
+
return true;
}
@@ -626,6 +636,7 @@ namespace {
Value *SomePtr; // Designated pointer to store to.
SmallPtrSet<Value*, 4> &PointerMustAliases;
SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
+ SmallVectorImpl<Instruction*> &LoopInsertPts;
AliasSetTracker &AST;
DebugLoc DL;
int Alignment;
@@ -633,11 +644,12 @@ namespace {
LoopPromoter(Value *SP,
const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
SmallPtrSet<Value*, 4> &PMA,
- SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast,
- DebugLoc dl, int alignment)
+ SmallVectorImpl<BasicBlock*> &LEB,
+ SmallVectorImpl<Instruction*> &LIP,
+ AliasSetTracker &ast, DebugLoc dl, int alignment)
: LoadAndStorePromoter(Insts, S), SomePtr(SP),
- PointerMustAliases(PMA), LoopExitBlocks(LEB), AST(ast), DL(dl),
- Alignment(alignment) {}
+ PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP),
+ AST(ast), DL(dl), Alignment(alignment) {}
virtual bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction*> &) const {
@@ -657,7 +669,7 @@ namespace {
for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
BasicBlock *ExitBlock = LoopExitBlocks[i];
Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
- Instruction *InsertPos = ExitBlock->getFirstInsertionPt();
+ Instruction *InsertPos = LoopInsertPts[i];
StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
@@ -679,7 +691,9 @@ namespace {
/// looping over the stores in the loop, looking for stores to Must pointers
/// which are loop invariant.
///
-void LICM::PromoteAliasSet(AliasSet &AS) {
+void LICM::PromoteAliasSet(AliasSet &AS,
+ SmallVectorImpl<BasicBlock*> &ExitBlocks,
+ SmallVectorImpl<Instruction*> &InsertPts) {
// We can promote this alias set if it has a store, if it is a "Must" alias
// set, if the pointer is loop invariant, and if we are not eliminating any
// volatile loads or stores.
@@ -789,14 +803,20 @@ void LICM::PromoteAliasSet(AliasSet &AS) {
// location is better than none.
DebugLoc DL = LoopUses[0]->getDebugLoc();
- SmallVector<BasicBlock*, 8> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
+ // Figure out the loop exits and their insertion points, if this is the
+ // first promotion.
+ if (ExitBlocks.empty()) {
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ InsertPts.resize(ExitBlocks.size());
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt();
+ }
// We use the SSAUpdater interface to insert phi nodes as required.
SmallVector<PHINode*, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- *CurAST, DL, Alignment);
+ InsertPts, *CurAST, DL, Alignment);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
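The thread running through this LICM change is one lazy cache shared by every promoted alias set. A standalone sketch of the pattern (toy Block type, hypothetical names): the first call populates the exit blocks and per-exit insertion points, and later calls reuse them untouched, so the exit scan is paid once per loop rather than once per alias set.

#include <cstddef>
#include <vector>

struct Block { Block *FirstInsertionPt; };

// Stand-in for Loop::getUniqueExitBlocks().
static void getUniqueExitBlocks(std::vector<Block*> &Out) { /* fill */ }

static void promoteAliasSet(std::vector<Block*> &ExitBlocks,
                            std::vector<Block*> &InsertPts) {
  if (ExitBlocks.empty()) {                // first promotion fills the cache
    getUniqueExitBlocks(ExitBlocks);
    InsertPts.resize(ExitBlocks.size());
    for (size_t i = 0, e = ExitBlocks.size(); i != e; ++i)
      InsertPts[i] = ExitBlocks[i]->FirstInsertionPt;
  }
  // ... the promoter then stores to InsertPts[i] in ExitBlocks[i] ...
}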
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index f7f3298..3771f5a 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -32,10 +32,10 @@ namespace {
LoopDeletion() : LoopPass(ID) {
initializeLoopDeletionPass(*PassRegistry::getPassRegistry());
}
-
+
// Possibly eliminate loop L if it is dead.
bool runOnLoop(Loop* L, LPPassManager& LPM);
-
+
bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks,
SmallVector<BasicBlock*, 4>& exitBlocks,
bool &Changed, BasicBlock *Preheader);
@@ -46,7 +46,7 @@ namespace {
AU.addRequired<ScalarEvolution>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
-
+
AU.addPreserved<ScalarEvolution>();
AU.addPreserved<DominatorTree>();
AU.addPreserved<LoopInfo>();
@@ -55,7 +55,7 @@ namespace {
}
};
}
-
+
char LoopDeletion::ID = 0;
INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
"Delete dead loops", false, false)
@@ -79,7 +79,7 @@ bool LoopDeletion::IsLoopDead(Loop* L,
SmallVector<BasicBlock*, 4>& exitBlocks,
bool &Changed, BasicBlock *Preheader) {
BasicBlock* exitBlock = exitBlocks[0];
-
+
// Make sure that all PHI entries coming from the loop are loop invariant.
// Because the code is in LCSSA form, any values used outside of the loop
// must pass through a PHI in the exit block, meaning that this check is
@@ -97,14 +97,14 @@ bool LoopDeletion::IsLoopDead(Loop* L,
if (incoming != P->getIncomingValueForBlock(exitingBlocks[i]))
return false;
}
-
+
if (Instruction* I = dyn_cast<Instruction>(incoming))
if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
return false;
++BI;
}
-
+
// Make sure that no instructions in the block have potential side-effects.
// This includes instructions that could write to memory, and loads that are
// marked volatile. This could be made more aggressive by using aliasing
@@ -117,23 +117,23 @@ bool LoopDeletion::IsLoopDead(Loop* L,
return false;
}
}
-
+
return true;
}
/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
-/// observable behavior of the program other than finite running time. Note
+/// observable behavior of the program other than finite running time. Note
/// we do ensure that this never removes a loop that might be infinite, as doing
/// so could change the halting/non-halting nature of a program.
/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
/// in order to make various safety checks work.
bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
- // We can only remove the loop if there is a preheader that we can
+ // We can only remove the loop if there is a preheader that we can
// branch from after removing it.
BasicBlock* preheader = L->getLoopPreheader();
if (!preheader)
return false;
-
+
// If LoopSimplify form is not available, stay out of trouble.
if (!L->hasDedicatedExits())
return false;
@@ -142,36 +142,36 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
// they would already have been removed in earlier executions of this pass.
if (L->begin() != L->end())
return false;
-
+
SmallVector<BasicBlock*, 4> exitingBlocks;
L->getExitingBlocks(exitingBlocks);
-
+
SmallVector<BasicBlock*, 4> exitBlocks;
L->getUniqueExitBlocks(exitBlocks);
-
+
// We require that the loop only have a single exit block. Otherwise, we'd
// be in the situation of needing to be able to solve statically which exit
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
if (exitBlocks.size() != 1)
return false;
-
+
// Finally, we have to check that the loop really is dead.
bool Changed = false;
if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
return Changed;
-
+
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S))
return Changed;
-
+
// Now that we know the removal is safe, remove the loop by changing the
- // branch from the preheader to go to the single exit block.
+ // branch from the preheader to go to the single exit block.
BasicBlock* exitBlock = exitBlocks[0];
-
+
// Because we're deleting a large chunk of code at once, the sequence in which
// we remove things is very important to avoid invalidation issues. Don't
// mess with this unless you have good reason and know what you're doing.
@@ -197,7 +197,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
P->removeIncomingValue(exitingBlocks[i]);
++BI;
}
-
+
// Update the dominator tree and remove the instructions and blocks that will
// be deleted from the reference counting scheme.
DominatorTree& DT = getAnalysis<DominatorTree>();
@@ -211,7 +211,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
DE = ChildNodes.end(); DI != DE; ++DI) {
DT.changeImmediateDominator(*DI, DT[preheader]);
}
-
+
ChildNodes.clear();
DT.eraseNode(*LI);
@@ -219,7 +219,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
// delete it freely later.
(*LI)->dropAllReferences();
}
-
+
// Erase the instructions and the blocks without having to worry
// about ordering because we already dropped the references.
// NOTE: This iteration is safe because erasing the block does not remove its
@@ -236,13 +236,13 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
E = blocks.end(); I != E; ++I)
loopInfo.removeBlock(*I);
-
+
// The last step is to inform the loop pass manager that we've
// eliminated this loop.
LPM.deleteLoopFromQueue(L);
Changed = true;
-
+
++NumDeleted;
-
+
return Changed;
}
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index ad15cbb..ac1082c 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -43,20 +43,20 @@
#define DEBUG_TYPE "loop-idiom"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/IRBuilder.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Module.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
@@ -173,7 +173,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) {
bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
CurLoop = L;
- // Disable loop idiom recognition if the function's name is a common idiom.
+ // Disable loop idiom recognition if the function's name is a common idiom.
StringRef Name = L->getHeader()->getParent()->getName();
if (Name == "memset" || Name == "memcpy")
return false;
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index f0f05e6..982400c 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -48,7 +48,7 @@ namespace {
}
};
}
-
+
char LoopInstSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
"Simplify instructions in loops", false, false)
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 59aace9..7eeb152 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -418,12 +418,13 @@ bool LoopRotate::rotateLoop(Loop *L) {
}
// Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore. Split the edge to form a real preheader.
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this);
NewPH->setName(NewHeader->getName() + ".lr.ph");
- // Preserve canonical loop form, which means that 'Exit' should have only one
- // predecessor.
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor.
BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this);
ExitSplit->moveBefore(Exit);
} else {
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index b085b00..b14a713 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1308,8 +1308,8 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM,
return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
case LSRUse::Special:
- // Only handle -1 scales, or no scale.
- return AM.Scale == 0 || AM.Scale == -1;
+ // Special case Basic to handle -1 scales.
+ return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0;
}
llvm_unreachable("Invalid LSRUse Kind!");
@@ -1439,7 +1439,41 @@ struct IVInc {
// IVChain - The list of IV increments in program order.
// We typically add the head of a chain without finding subsequent links.
-typedef SmallVector<IVInc,1> IVChain;
+struct IVChain {
+ SmallVector<IVInc,1> Incs;
+ const SCEV *ExprBase;
+
+ IVChain() : ExprBase(0) {}
+
+ IVChain(const IVInc &Head, const SCEV *Base)
+ : Incs(1, Head), ExprBase(Base) {}
+
+ typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
+
+ // begin - return the first increment in the chain.
+ const_iterator begin() const {
+ assert(!Incs.empty());
+ return llvm::next(Incs.begin());
+ }
+ const_iterator end() const {
+ return Incs.end();
+ }
+
+ // hasIncs - Returns true if this chain contains any increments.
+ bool hasIncs() const { return Incs.size() >= 2; }
+
+ // add - Add an IVInc to the end of this chain.
+ void add(const IVInc &X) { Incs.push_back(X); }
+
+ // tailUserInst - Returns the last UserInst in the chain.
+ Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+ // isProfitableIncrement - Returns true if IncExpr can be profitably added to
+ // this chain.
+ bool isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution&);
+};
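One subtlety of the new struct is its iteration contract: Incs[0] is the head user, not an increment, and begin() deliberately starts one past it. A minimal standalone restatement with a toy element type:

#include <cassert>
#include <vector>

struct MiniChain {
  std::vector<int> Incs;                    // Incs[0] is the chain head
  std::vector<int>::const_iterator begin() const {
    assert(!Incs.empty());
    return Incs.begin() + 1;                // [begin(), end()) = increments only
  }
  std::vector<int>::const_iterator end() const { return Incs.end(); }
  bool hasIncs() const { return Incs.size() >= 2; }
};

This is why the loops later in the patch switch from llvm::next(Chain.begin()) to plain Chain.begin().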
/// ChainUsers - Helper for CollectChains to track multiple IV increment uses.
/// Distinguish between FarUsers that definitely cross IV increments and
@@ -2160,7 +2194,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
return &LU;
// This is the formula where all the registers and symbols matched;
// there aren't going to be any others. Since we declined it, we
- // can skip the rest of the formulae and procede to the next LSRUse.
+ // can skip the rest of the formulae and proceed to the next LSRUse.
break;
}
}
@@ -2319,41 +2353,23 @@ static const SCEV *getExprBase(const SCEV *S) {
/// increment will be an offset relative to the same base. We allow such offsets
/// to be used as chain increments as long as they are not obviously expensive
/// to expand using real instructions.
-static const SCEV *
-getProfitableChainIncrement(Value *NextIV, Value *PrevIV,
- const IVChain &Chain, Loop *L,
- ScalarEvolution &SE, const TargetLowering *TLI) {
- // Prune the solution space aggressively by checking that both IV operands
- // are expressions that operate on the same unscaled SCEVUnknown. This
- // "base" will be canceled by the subsequent getMinusSCEV call. Checking first
- // avoids creating extra SCEV expressions.
- const SCEV *OperExpr = SE.getSCEV(NextIV);
- const SCEV *PrevExpr = SE.getSCEV(PrevIV);
- if (getExprBase(OperExpr) != getExprBase(PrevExpr) && !StressIVChain)
- return 0;
-
- const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
- if (!SE.isLoopInvariant(IncExpr, L))
- return 0;
-
- // We are not able to expand an increment unless it is loop invariant,
- // however, the following checks are purely for profitability.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution &SE) {
+ // Aggressively form chains when -stress-ivchain.
if (StressIVChain)
- return IncExpr;
+ return true;
// Do not replace a constant offset from IV head with a nonconstant IV
// increment.
if (!isa<SCEVConstant>(IncExpr)) {
- const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Chain[0].IVOperand));
+ const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
return 0;
}
SmallPtrSet<const SCEV*, 8> Processed;
- if (isHighCostExpansion(IncExpr, Processed, SE))
- return 0;
-
- return IncExpr;
+ return !isHighCostExpansion(IncExpr, Processed, SE);
}
/// Return true if the number of registers needed for the chain is estimated to
@@ -2372,18 +2388,18 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
if (StressIVChain)
return true;
- if (Chain.size() <= 2)
+ if (!Chain.hasIncs())
return false;
if (!Users.empty()) {
- DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " users:\n";
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
E = Users.end(); I != E; ++I) {
dbgs() << " " << **I << "\n";
});
return false;
}
- assert(!Chain.empty() && "empty IV chains are not allowed");
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
// The chain itself may require a register, so initialize cost to 1.
int cost = 1;
@@ -2391,15 +2407,15 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
// A complete chain likely eliminates the need for keeping the original IV in
// a register. LSR does not currently know how to form a complete chain unless
// the header phi already exists.
- if (isa<PHINode>(Chain.back().UserInst)
- && SE.getSCEV(Chain.back().UserInst) == Chain[0].IncExpr) {
+ if (isa<PHINode>(Chain.tailUserInst())
+ && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
--cost;
}
const SCEV *LastIncExpr = 0;
unsigned NumConstIncrements = 0;
unsigned NumVarIncrements = 0;
unsigned NumReusedIncrements = 0;
- for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
+ for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
I != E; ++I) {
if (I->IncExpr->isZero())
@@ -2435,7 +2451,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
// the stride.
cost -= NumReusedIncrements;
- DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " Cost: " << cost << "\n");
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
return cost < 0;
}
@@ -2446,25 +2463,39 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
SmallVectorImpl<ChainUsers> &ChainUsersVec) {
// When IVs are used as types of varying widths, they are generally converted
// to a wider type with some uses remaining narrow under a (free) trunc.
- Value *NextIV = getWideOperand(IVOper);
+ Value *const NextIV = getWideOperand(IVOper);
+ const SCEV *const OperExpr = SE.getSCEV(NextIV);
+ const SCEV *const OperExprBase = getExprBase(OperExpr);
// Visit all existing chains. Check if its IVOper can be computed as a
// profitable loop invariant increment from the last link in the Chain.
unsigned ChainIdx = 0, NChains = IVChainVec.size();
const SCEV *LastIncExpr = 0;
for (; ChainIdx < NChains; ++ChainIdx) {
- Value *PrevIV = getWideOperand(IVChainVec[ChainIdx].back().IVOperand);
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ // Prune the solution space aggressively by checking that both IV operands
+ // are expressions that operate on the same unscaled SCEVUnknown. This
+ // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+ // first avoids creating extra SCEV expressions.
+ if (!StressIVChain && Chain.ExprBase != OperExprBase)
+ continue;
+
+ Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
if (!isCompatibleIVType(PrevIV, NextIV))
continue;
// A phi node terminates a chain.
- if (isa<PHINode>(UserInst)
- && isa<PHINode>(IVChainVec[ChainIdx].back().UserInst))
+ if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+ continue;
+
+ // The increment must be loop-invariant so it can be kept in a register.
+ const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+ const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+ if (!SE.isLoopInvariant(IncExpr, L))
continue;
- if (const SCEV *IncExpr =
- getProfitableChainIncrement(NextIV, PrevIV, IVChainVec[ChainIdx],
- L, SE, TLI)) {
+ if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
LastIncExpr = IncExpr;
break;
}
@@ -2478,24 +2509,24 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
DEBUG(dbgs() << "IV Chain Limit\n");
return;
}
- LastIncExpr = SE.getSCEV(NextIV);
+ LastIncExpr = OperExpr;
// IVUsers may have skipped over sign/zero extensions. We don't currently
// attempt to form chains involving extensions unless they can be hoisted
// into this loop's AddRec.
if (!isa<SCEVAddRecExpr>(LastIncExpr))
return;
++NChains;
- IVChainVec.resize(NChains);
+ IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+ OperExprBase));
ChainUsersVec.resize(NChains);
- DEBUG(dbgs() << "IV Head: (" << *UserInst << ") IV=" << *LastIncExpr
- << "\n");
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
+ } else {
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
+ // Add this IV user to the end of the chain.
+ IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
}
- else
- DEBUG(dbgs() << "IV Inc: (" << *UserInst << ") IV+" << *LastIncExpr
- << "\n");
-
- // Add this IV user to the end of the chain.
- IVChainVec[ChainIdx].push_back(IVInc(UserInst, IVOper, LastIncExpr));
SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
// This chain's NearUsers become FarUsers.
@@ -2551,6 +2582,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
/// loop latch. This will discover chains on side paths, but requires
/// maintaining multiple copies of the Chains state.
void LSRInstance::CollectChains() {
+ DEBUG(dbgs() << "Collecting IV Chains.\n");
SmallVector<ChainUsers, 8> ChainUsersVec;
SmallVector<BasicBlock *,8> LatchPath;
@@ -2622,10 +2654,10 @@ void LSRInstance::CollectChains() {
}
void LSRInstance::FinalizeChain(IVChain &Chain) {
- assert(!Chain.empty() && "empty IV chains are not allowed");
- DEBUG(dbgs() << "Final Chain: " << *Chain[0].UserInst << "\n");
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+ DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
- for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
+ for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
I != E; ++I) {
DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n");
User::op_iterator UseI =
@@ -2659,7 +2691,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) {
// Find the new IVOperand for the head of the chain. It may have been replaced
// by LSR.
- const IVInc &Head = Chain[0];
+ const IVInc &Head = Chain.Incs[0];
User::op_iterator IVOpEnd = Head.UserInst->op_end();
User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
IVOpEnd, L, SE);
@@ -2691,7 +2723,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
Type *IVTy = IVSrc->getType();
Type *IntTy = SE.getEffectiveSCEVType(IVTy);
const SCEV *LeftOverExpr = 0;
- for (IVChain::const_iterator IncI = llvm::next(Chain.begin()),
+ for (IVChain::const_iterator IncI = Chain.begin(),
IncE = Chain.end(); IncI != IncE; ++IncI) {
Instruction *InsertPt = IncI->UserInst;
@@ -2736,7 +2768,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
}
// If LSR created a new, wider phi, we may also replace its postinc. We only
// do this if we also found a wide value for the head of the chain.
- if (isa<PHINode>(Chain.back().UserInst)) {
+ if (isa<PHINode>(Chain.tailUserInst())) {
for (BasicBlock::iterator I = L->getHeader()->begin();
PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
if (!isCompatibleIVType(Phi, IVSrc))
@@ -2804,7 +2836,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
// x == y --> x - y == 0
const SCEV *N = SE.getSCEV(NV);
- if (SE.isLoopInvariant(N, L)) {
+ if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) {
// S is normalized, so normalize N before folding it into S
// to keep the result normalized.
N = TransformForPostIncUse(Normalize, N, CI, 0,
@@ -2974,42 +3006,64 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
/// CollectSubexprs - Split S into subexpressions which can be pulled out into
/// separate registers. If C is non-null, multiply each subexpression by C.
-static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
- SmallVectorImpl<const SCEV *> &Ops,
- const Loop *L,
- ScalarEvolution &SE) {
+///
+/// Return remainder expression after factoring the subexpressions captured by
+/// Ops. If Ops is complete, return NULL.
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+ SmallVectorImpl<const SCEV *> &Ops,
+ const Loop *L,
+ ScalarEvolution &SE,
+ unsigned Depth = 0) {
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return S;
+
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
// Break out add operands.
for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
- I != E; ++I)
- CollectSubexprs(*I, C, Ops, L, SE);
- return;
+ I != E; ++I) {
+ const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ }
+ return NULL;
} else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// Split a non-zero base out of an addrec.
- if (!AR->getStart()->isZero()) {
- CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
- AR->getStepRecurrence(SE),
- AR->getLoop(),
- //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- SCEV::FlagAnyWrap),
- C, Ops, L, SE);
- CollectSubexprs(AR->getStart(), C, Ops, L, SE);
- return;
+ if (AR->getStart()->isZero())
+ return S;
+
+ const SCEV *Remainder = CollectSubexprs(AR->getStart(),
+ C, Ops, L, SE, Depth+1);
+ // Split the non-zero AddRec unless it is part of a nested recurrence that
+ // does not pertain to this loop.
+ if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ Remainder = NULL;
+ }
+ if (Remainder != AR->getStart()) {
+ if (!Remainder)
+ Remainder = SE.getConstant(AR->getType(), 0);
+ return SE.getAddRecExpr(Remainder,
+ AR->getStepRecurrence(SE),
+ AR->getLoop(),
+ //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
}
} else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
// Break (C * (a + b + c)) into C*a + C*b + C*c.
- if (Mul->getNumOperands() == 2)
- if (const SCEVConstant *Op0 =
- dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
- CollectSubexprs(Mul->getOperand(1),
- C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
- Ops, L, SE);
- return;
- }
+ if (Mul->getNumOperands() != 2)
+ return S;
+ if (const SCEVConstant *Op0 =
+ dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
+ C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
+ const SCEV *Remainder =
+ CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(SE.getMulExpr(C, Remainder));
+ return NULL;
+ }
}
-
- // Otherwise use the value itself, optionally with a scale applied.
- Ops.push_back(C ? SE.getMulExpr(C, S) : S);
+ return S;
}
/// GenerateReassociations - Split out subexpressions from adds and the bases of
@@ -3024,7 +3078,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
const SCEV *BaseReg = Base.BaseRegs[i];
SmallVector<const SCEV *, 8> AddOps;
- CollectSubexprs(BaseReg, 0, AddOps, L, SE);
+ const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE);
+ if (Remainder)
+ AddOps.push_back(Remainder);
if (AddOps.size() == 1) continue;
@@ -4236,13 +4292,6 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
}
- // Flush the operand list to suppress SCEVExpander hoisting.
- if (!Ops.empty()) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
- }
-
// Expand the ScaledReg portion.
Value *ICmpScaledV = 0;
if (F.AM.Scale != 0) {
@@ -4264,23 +4313,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
} else {
// Otherwise just expand the scaled register and an explicit scale,
// which is expected to be matched as part of the address.
+
+ // Flush the operand list to suppress SCEVExpander hoisting address modes.
+ if (!Ops.empty() && LU.Kind == LSRUse::Address) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
ScaledS = SE.getMulExpr(ScaledS,
SE.getConstant(ScaledS->getType(), F.AM.Scale));
Ops.push_back(ScaledS);
-
- // Flush the operand list to suppress SCEVExpander hoisting.
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
}
}
// Expand the GV portion.
if (F.AM.BaseGV) {
+ // Flush the operand list to suppress SCEVExpander hoisting.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
Ops.push_back(SE.getUnknown(F.AM.BaseGV));
+ }
- // Flush the operand list to suppress SCEVExpander hoisting.
+ // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+ // unfolded offsets. LSR assumes they both live next to their uses.
+ if (!Ops.empty()) {
Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
Ops.clear();
Ops.push_back(SE.getUnknown(FullV));
@@ -4485,7 +4545,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
// Mark phi nodes that terminate chains so the expander tries to reuse them.
for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
- if (PHINode *PN = dyn_cast<PHINode>(ChainI->back().UserInst))
+ if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst()))
Rewriter.setChainedPhi(PN);
}
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 930980f..58f7739 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1214,8 +1214,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// See if instruction simplification can hack this up. This is common for
// things like "select false, X, Y" after unswitching made the condition be
- // 'false'.
- if (Value *V = SimplifyInstruction(I, 0, 0, DT))
+ // 'false'. TODO: update the domtree properly so we can pass it here.
+ if (Value *V = SimplifyInstruction(I))
if (LI->replacementPreservesLCSSAForm(I, V)) {
ReplaceUsesOfWith(I, V, Worklist, L, LPM);
continue;
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp
index 689bbe9..7419a65 100644
--- a/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -15,9 +15,9 @@
#define DEBUG_TYPE "loweratomic"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Pass.h"
-#include "llvm/Support/IRBuilder.h"
using namespace llvm;
static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
@@ -25,12 +25,12 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
Value *Ptr = CXI->getPointerOperand();
Value *Cmp = CXI->getCompareOperand();
Value *Val = CXI->getNewValOperand();
-
+
LoadInst *Orig = Builder.CreateLoad(Ptr);
Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
Value *Res = Builder.CreateSelect(Equal, Val, Orig);
Builder.CreateStore(Res, Ptr);
-
+
CXI->replaceAllUsesWith(Orig);
CXI->eraseFromParent();
return true;
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a87cce3..2a5ee33 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -15,21 +15,21 @@
#define DEBUG_TYPE "memcpyopt"
#include "llvm/Transforms/Scalar.h"
#include "llvm/GlobalVariable.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;
@@ -44,7 +44,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
gep_type_iterator GTI = gep_type_begin(GEP);
for (unsigned i = 1; i != Idx; ++i, ++GTI)
/*skip along*/;
-
+
// Compute the offset implied by the rest of the indices.
int64_t Offset = 0;
for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
@@ -58,7 +58,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
continue;
}
-
+
// Otherwise, we have a sequential type like an array or vector. Multiply
// the index by the ElementSize.
uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
@@ -77,7 +77,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
Ptr2 = Ptr2->stripPointerCasts();
GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
-
+
bool VariableIdxFound = false;
// If one pointer is a GEP and the other isn't, then see if the GEP is a
@@ -91,7 +91,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD);
return !VariableIdxFound;
}
-
+
// Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
// base. After that base, they may have some number of common (and
// potentially variable) indices. After that they handle some constant
@@ -99,7 +99,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
// handle no other case.
if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
return false;
-
+
// Skip any common indices and track the GEP types.
unsigned Idx = 1;
for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
@@ -109,7 +109,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
if (VariableIdxFound) return false;
-
+
Offset = Offset2-Offset1;
return true;
}
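The arithmetic behind IsPointerOffset is easy to check by hand. A standalone sketch for the flat-array case (hypothetical names; the real code also walks struct layouts and bails on variable indices): each pointer's trailing constant indices fold to a byte offset, and the difference of the two offsets relates the pointers.

#include <cstddef>
#include <vector>

// Fold trailing constant indices of a GEP-like access into bytes,
// assuming a sequential type with a fixed element size.
static long byteOffset(const std::vector<long> &Idx, long ElemSize) {
  long Off = 0;
  for (size_t i = 0, e = Idx.size(); i != e; ++i)
    Off += Idx[i] * ElemSize;
  return Off;
}

// Example: p1 = base+{2}, p2 = base+{5}, 4-byte elements:
// byteOffset gives 20 - 8 = 12, so p2 = p1 + 12.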
@@ -128,19 +128,19 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
namespace {
struct MemsetRange {
// Start/End - A semi range that describes the span that this range covers.
- // The range is closed at the start and open at the end: [Start, End).
+ // The range is closed at the start and open at the end: [Start, End).
int64_t Start, End;
/// StartPtr - The getelementptr instruction that points to the start of the
/// range.
Value *StartPtr;
-
+
/// Alignment - The known alignment of the first store.
unsigned Alignment;
-
+
/// TheStores - The actual stores that make up this range.
SmallVector<Instruction*, 16> TheStores;
-
+
bool isProfitableToUseMemset(const TargetData &TD) const;
};
@@ -152,17 +152,17 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
// If there is nothing to merge, don't do anything.
if (TheStores.size() < 2) return false;
-
+
// If any of the stores are a memset, then it is always good to extend the
// memset.
for (unsigned i = 0, e = TheStores.size(); i != e; ++i)
if (!isa<StoreInst>(TheStores[i]))
return true;
-
+
// Assume that the code generator is capable of merging pairs of stores
// together if it wants to.
if (TheStores.size() == 2) return false;
-
+
// If we have fewer than 8 stores, it can still be worthwhile to do this.
// For example, merging 4 i8 stores into an i32 store is useful almost always.
// However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
@@ -175,15 +175,15 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
// actually reducing the number of stores used.
unsigned Bytes = unsigned(End-Start);
unsigned NumPointerStores = Bytes/TD.getPointerSize();
-
+
// Assume the remaining bytes if any are done a byte at a time.
unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
-
+
// If we will reduce the # stores (according to this heuristic), do the
// transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
// etc.
return TheStores.size() > NumPointerStores+NumByteStores;
-}
+}
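The heuristic above is pure arithmetic once the early exits are out of the way, and a worked example makes the threshold concrete. A standalone restatement (ignoring the always-extend-an-existing-memset case): with 4-byte pointers, sixteen 1-byte stores over 16 bytes model as 4 pointer stores and 0 byte stores, and 16 > 4 takes the merge, while two stores never merge.

// Minimal sketch of the store-merging profitability test above.
static bool profitableAsMemset(unsigned NumStores, unsigned Bytes,
                               unsigned PtrSize) {
  if (NumStores < 2) return false;   // nothing to merge
  if (NumStores == 2) return false;  // codegen can already pair two stores
  unsigned NumPointerStores = Bytes / PtrSize;
  unsigned NumByteStores = Bytes - NumPointerStores * PtrSize;
  return NumStores > NumPointerStores + NumByteStores;
}
// profitableAsMemset(16, 16, 4) == true; profitableAsMemset(3, 12, 4) == false.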
namespace {
@@ -195,12 +195,12 @@ class MemsetRanges {
const TargetData &TD;
public:
MemsetRanges(const TargetData &td) : TD(td) {}
-
+
typedef std::list<MemsetRange>::const_iterator const_iterator;
const_iterator begin() const { return Ranges.begin(); }
const_iterator end() const { return Ranges.end(); }
bool empty() const { return Ranges.empty(); }
-
+
void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
addStore(OffsetFromFirst, SI);
@@ -210,21 +210,21 @@ public:
void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType());
-
+
addRange(OffsetFromFirst, StoreSize,
SI->getPointerOperand(), SI->getAlignment(), SI);
}
-
+
void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
}
-
+
void addRange(int64_t Start, int64_t Size, Value *Ptr,
unsigned Alignment, Instruction *Inst);
};
-
+
} // end anon namespace
@@ -240,10 +240,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
unsigned Alignment, Instruction *Inst) {
int64_t End = Start+Size;
range_iterator I = Ranges.begin(), E = Ranges.end();
-
+
while (I != E && Start > I->End)
++I;
-
+
// We now know that I == E, in which case we didn't find anything to merge
// with, or that Start <= I->End. If End < I->Start or I == E, then we need
// to insert a new range. Handle this now.
@@ -256,18 +256,18 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
R.TheStores.push_back(Inst);
return;
}
-
+
// This store overlaps with I, add it.
I->TheStores.push_back(Inst);
-
+
// At this point, we may have an interval that completely contains our store.
// If so, just add it to the interval and return.
if (I->Start <= Start && I->End >= End)
return;
-
+
// Now we know that Start <= I->End and End >= I->Start so the range overlaps
// but is not entirely contained within the range.
-
+
// See if this store extends the start of the range. In this case, it couldn't
// possibly cause it to join the prior range, because otherwise we would have
// stopped on *it*.
@@ -276,7 +276,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
I->StartPtr = Ptr;
I->Alignment = Alignment;
}
-
+
// Now we know that Start <= I->End and Start >= I->Start (so the startpoint
// is in or right at the end of I), and that End >= I->Start. Extend I out to
// End.
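The interval bookkeeping in addRange follows a classic sorted-list merge. A trimmed standalone sketch of the two cases visible above (disjoint insert, and extending an overlapped range; the real code additionally folds in any following ranges the extension now touches):

#include <list>

struct Range { long Start, End; };   // semi-open [Start, End)

static void addRange(std::list<Range> &Ranges, long Start, long End) {
  std::list<Range>::iterator I = Ranges.begin(), E = Ranges.end();
  while (I != E && Start > I->End)
    ++I;                             // first range that might overlap
  if (I == E || End < I->Start) {    // disjoint: insert a fresh range
    Range R; R.Start = Start; R.End = End;
    Ranges.insert(I, R);
    return;
  }
  if (Start < I->Start) I->Start = Start;  // grow the left edge
  if (End > I->End) I->End = End;          // grow the right edge
}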
@@ -325,7 +325,7 @@ namespace {
AU.addPreserved<AliasAnalysis>();
AU.addPreserved<MemoryDependenceAnalysis>();
}
-
+
// Helper functions
bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
@@ -341,7 +341,7 @@ namespace {
bool iterateOnFunction(Function &F);
};
-
+
char MemCpyOpt::ID = 0;
}
@@ -361,16 +361,16 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
/// some other patterns to fold away. In particular, this looks for stores to
/// neighboring locations of memory. If it sees enough consecutive ones, it
/// attempts to merge them together into a memcpy/memset.
-Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
+Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
Value *StartPtr, Value *ByteVal) {
if (TD == 0) return 0;
-
+
// Okay, so we now have a single store of a splattable value. Scan to find
// all subsequent stores of the same value at offsets from the same pointer.
// Join these together into ranges, so we can decide whether contiguous blocks
// are stored.
MemsetRanges Ranges(*TD);
-
+
BasicBlock::iterator BI = StartInst;
for (++BI; !isa<TerminatorInst>(BI); ++BI) {
if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
@@ -381,43 +381,43 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
break;
continue;
}
-
+
if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
// If this is a store, see if we can merge it in.
if (!NextStore->isSimple()) break;
-
+
// Check to see if this stored value is of the same byte-splattable value.
if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
break;
-
+
// Check to see if this store is to a constant offset from the start ptr.
int64_t Offset;
if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(),
Offset, *TD))
break;
-
+
Ranges.addStore(Offset, NextStore);
} else {
MemSetInst *MSI = cast<MemSetInst>(BI);
-
+
if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
!isa<ConstantInt>(MSI->getLength()))
break;
-
+
// Check to see if this store is to a constant offset from the start ptr.
int64_t Offset;
if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD))
break;
-
+
Ranges.addMemSet(Offset, MSI);
}
}
-
+
// If we have no ranges, then we just had a single store with nothing that
// could be merged in. This is a very common case of course.
if (Ranges.empty())
return 0;
-
+
// If we had at least one store that could be merged in, add the starting
// store as well. We try to avoid this unless there is at least something
// interesting as a small compile-time optimization.
@@ -434,28 +434,28 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
I != E; ++I) {
const MemsetRange &Range = *I;
-
+
if (Range.TheStores.size() == 1) continue;
-
+
// If it is profitable to lower this range to memset, do so now.
if (!Range.isProfitableToUseMemset(*TD))
continue;
-
+
// Otherwise, we do want to transform this! Create a new memset.
// Get the starting pointer of the block.
StartPtr = Range.StartPtr;
-
+
// Determine alignment
unsigned Alignment = Range.Alignment;
if (Alignment == 0) {
- Type *EltType =
+ Type *EltType =
cast<PointerType>(StartPtr->getType())->getElementType();
Alignment = TD->getABITypeAlignment(EltType);
}
-
- AMemSet =
+
+ AMemSet =
Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
-
+
DEBUG(dbgs() << "Replace stores:\n";
for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
dbgs() << *Range.TheStores[i] << '\n';
@@ -473,14 +473,14 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
}
++NumMemSetInfer;
}
-
+
return AMemSet;
}
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (!SI->isSimple()) return false;
-
+
if (TD == 0) return false;
// Detect cases where we're performing call slot forwarding, but
@@ -510,7 +510,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (C) {
bool changed = performCallSlotOptzn(LI,
- SI->getPointerOperand()->stripPointerCasts(),
+ SI->getPointerOperand()->stripPointerCasts(),
LI->getPointerOperand()->stripPointerCasts(),
TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
if (changed) {
@@ -524,10 +524,10 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
}
}
}
-
+
// There are two cases that are interesting for this code to handle: memcpy
// and memset. Right now we only handle memset.
-
+
// Ensure that the value being stored is something that can be memset a byte
// at a time, like "0" or "-1" of any width, as well as things like
// 0xA0A0A0A0 and 0.0.
@@ -537,7 +537,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
BBI = I; // Don't invalidate iterator.
return true;
}
-
+
return false;
}
@@ -662,7 +662,11 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
// the use analysis, we also need to know that it does not sneakily
// access dest. We rely on AA to figure this out for us.
AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
- if (AA.getModRefInfo(C, cpyDest, srcSize) != AliasAnalysis::NoModRef)
+ AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize);
+ // If necessary, perform additional analysis.
+ if (MR != AliasAnalysis::NoModRef)
+ MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
+ if (MR != AliasAnalysis::NoModRef)
return false;
// All the checks have passed, so do the transformation.
@@ -676,7 +680,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
if (CS.getArgument(i)->getType() == cpyDest->getType())
CS.setArgument(i, cpyDest);
else
- CS.setArgument(i, CastInst::CreatePointerCast(cpyDest,
+ CS.setArgument(i, CastInst::CreatePointerCast(cpyDest,
CS.getArgument(i)->getType(), cpyDest->getName(), C));
}
@@ -697,14 +701,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
/// processMemCpyMemCpyDependence - We've found that the (upward scanning)
/// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to
/// copy from MDep's input if we can. MSize is the size of M's copy.
-///
+///
bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
uint64_t MSize) {
// We can only transforms memcpy's where the dest of one is the source of the
// other.
if (M->getSource() != MDep->getDest() || MDep->isVolatile())
return false;
-
+
// If dep instruction is reading from our current input, then it is a noop
// transfer and substituting the input won't change this instruction. Just
// ignore the input and let someone else zap MDep. This handles cases like:
@@ -712,14 +716,14 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
// memcpy(b <- a)
if (M->getSource() == MDep->getSource())
return false;
-
+
// Second, the lengths of the two memcpys must be the same, or the preceding
// one must be larger than the following one.
ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
return false;
-
+
AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
// Verify that the copied-from memory doesn't change in between the two
@@ -739,23 +743,23 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
false, M, M->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
-
+
// If the dest of the second might alias the source of the first, then the
// source and dest might overlap. We still want to eliminate the intermediate
// value, but we have to generate a memmove instead of memcpy.
bool UseMemMove = false;
if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep)))
UseMemMove = true;
-
+
// If all checks passed, then we can transform M.
-
+
// Make sure to use the lesser of the alignment of the source and the dest
// since we're changing where we're reading from, but don't want to increase
// the alignment past what can be read from or written to.
// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.
unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
-
+
IRBuilder<> Builder(M);
if (UseMemMove)
Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
@@ -835,13 +839,13 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
if (!TLI->has(LibFunc::memmove))
return false;
-
+
// See if the pointers alias.
if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M)))
return false;
-
+
DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
-
+
// If not, then we know we can transform this.
Module *Mod = M->getParent()->getParent()->getParent();
Type *ArgTys[3] = { M->getRawDest()->getType(),
@@ -857,7 +861,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
++NumMoveToCpy;
return true;
}
-
+
/// processByValArgument - This is called on every byval argument in call sites.
bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
if (TD == 0) return false;
@@ -880,7 +884,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
if (MDep == 0 || MDep->isVolatile() ||
ByValArg->stripPointerCasts() != MDep->getDest())
return false;
-
+
// The length of the memcpy must be larger than or equal to the byval's size.
ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize)
@@ -890,13 +894,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
// then it is some target specific value that we can't know.
unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
if (ByValAlign == 0) return false;
-
+
// If it is greater than the memcpy, then we check to see if we can force the
// source of the memcpy to the alignment we need. If we fail, we bail out.
if (MDep->getAlignment() < ByValAlign &&
getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign)
return false;
-
+
// Verify that the copied-from memory doesn't change in between the memcpy and
// the byval call.
// memcpy(a <- b)
@@ -911,16 +915,16 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
false, CS.getInstruction(), MDep->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
-
+
Value *TmpCast = MDep->getSource();
if (MDep->getSource()->getType() != ByValArg->getType())
TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
"tmpcast", CS.getInstruction());
-
+
DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
<< " " << *MDep << "\n"
<< " " << *CS.getInstruction() << "\n");
-
+
// Otherwise we're good! Update the byval argument.
CS.setArgument(ArgNo, TmpCast);
++NumMemCpyInstr;
@@ -936,9 +940,9 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
// Avoid invalidating the iterator.
Instruction *I = BI++;
-
+
bool RepeatInstruction = false;
-
+
if (StoreInst *SI = dyn_cast<StoreInst>(I))
MadeChange |= processStore(SI, BI);
else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
@@ -960,7 +964,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
}
}
}
-
+
return MadeChange;
}
@@ -972,19 +976,19 @@ bool MemCpyOpt::runOnFunction(Function &F) {
MD = &getAnalysis<MemoryDependenceAnalysis>();
TD = getAnalysisIfAvailable<TargetData>();
TLI = &getAnalysis<TargetLibraryInfo>();
-
+
// If we don't have at least memset and memcpy, there is little point in doing
// anything here. These are required by a freestanding implementation, so if
// even they are disabled, there is no point in trying hard.
if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
return false;
-
+
while (1) {
if (!iterateOnFunction(F))
break;
MadeChange = true;
}
-
+
MD = 0;
return MadeChange;
}
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
index 7e3e69b..3222f20 100644
--- a/lib/Transforms/Scalar/ObjCARC.cpp
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -20,7 +20,7 @@
// This file also defines a simple ARC-aware AliasAnalysis.
//
// WARNING: This file knows about certain library functions. It recognizes them
-// by name, and hardwires knowedge of their semantics.
+// by name, and hardwires knowledge of their semantics.
//
// WARNING: This file knows about how certain Objective-C library functions are
// used. Naive LLVM IR transformations which would otherwise be
@@ -29,18 +29,8 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "objc-arc"
-#include "llvm/Function.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
using namespace llvm;
// A handy option to enable/disable all optimizations in this file.
@@ -141,6 +131,13 @@ namespace {
// ARC Utilities.
//===----------------------------------------------------------------------===//
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/ADT/StringSwitch.h"
+
namespace {
/// InstructionClass - A simple classification for instructions.
enum InstructionClass {
@@ -299,22 +296,23 @@ static InstructionClass GetInstructionClass(const Value *V) {
// None of the intrinsic functions do objc_release. For intrinsics, the
// only question is whether or not they may be users.
switch (F->getIntrinsicID()) {
- case 0: break;
- case Intrinsic::bswap: case Intrinsic::ctpop:
- case Intrinsic::ctlz: case Intrinsic::cttz:
case Intrinsic::returnaddress: case Intrinsic::frameaddress:
case Intrinsic::stacksave: case Intrinsic::stackrestore:
case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend:
+ case Intrinsic::objectsize: case Intrinsic::prefetch:
+ case Intrinsic::stackprotector:
+ case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64:
+ case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa:
+ case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext:
+ case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline:
+ case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start: case Intrinsic::invariant_end:
// Don't let dbg info affect our results.
case Intrinsic::dbg_declare: case Intrinsic::dbg_value:
// Short cut: Some intrinsics obviously don't use ObjC pointers.
return IC_None;
default:
- for (Function::const_arg_iterator AI = F->arg_begin(),
- AE = F->arg_end(); AI != AE; ++AI)
- if (IsPotentialUse(AI))
- return IC_User;
- return IC_None;
+ break;
}
}
return GetCallSiteClass(CI);
@@ -382,14 +380,14 @@ static InstructionClass GetBasicInstructionClass(const Value *V) {
return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User;
}
-/// IsRetain - Test if the the given class is objc_retain or
+/// IsRetain - Test if the given class is objc_retain or
/// equivalent.
static bool IsRetain(InstructionClass Class) {
return Class == IC_Retain ||
Class == IC_RetainRV;
}
-/// IsAutorelease - Test if the the given class is objc_autorelease or
+/// IsAutorelease - Test if the given class is objc_autorelease or
/// equivalent.
static bool IsAutorelease(InstructionClass Class) {
return Class == IC_Autorelease ||
@@ -444,7 +442,7 @@ static bool IsNoThrow(InstructionClass Class) {
Class == IC_AutoreleasepoolPop;
}
-/// EraseInstruction - Erase the given instruction. ObjC calls return their
+/// EraseInstruction - Erase the given instruction. Many ObjC calls return their
/// argument verbatim, so if it's such a call and the return value has users,
/// replace them with the argument value.
static void EraseInstruction(Instruction *CI) {
@@ -565,9 +563,8 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
return Arg;
}
- // If we found an identifiable object but it has multiple uses, but they
- // are trivial uses, we can still consider this to be a single-use
- // value.
+ // If we found an identifiable object but it has multiple uses, but they are
+ // trivial uses, we can still consider this to be a single-use value.
if (IsObjCIdentifiedObject(Arg)) {
for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
UI != UE; ++UI) {
@@ -692,7 +689,7 @@ namespace {
/// specified pass info.
virtual void *getAdjustedAnalysisPointer(const void *PI) {
if (PI == &AliasAnalysis::ID)
- return (AliasAnalysis*)this;
+ return static_cast<AliasAnalysis *>(this);
return this;
}
@@ -815,7 +812,7 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
case IC_FusedRetainAutorelease:
case IC_FusedRetainAutoreleaseRV:
// These functions don't access any memory visible to the compiler.
- // Note that this doesn't include objc_retainBlock, becuase it updates
+ // Note that this doesn't include objc_retainBlock, because it updates
// pointers when it copies block data.
return NoModRef;
default:
@@ -915,6 +912,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
//===----------------------------------------------------------------------===//
#include "llvm/Constants.h"
+#include "llvm/ADT/STLExtras.h"
namespace {
/// ObjCARCAPElim - Autorelease pool elimination.
@@ -922,8 +920,8 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
virtual bool runOnModule(Module &M);
- bool MayAutorelease(CallSite CS, unsigned Depth = 0);
- bool OptimizeBB(BasicBlock *BB);
+ static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0);
+ static bool OptimizeBB(BasicBlock *BB);
public:
static char ID;
@@ -949,15 +947,16 @@ void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const {
/// MayAutorelease - Interprocedurally determine if calls made by the
/// given call site can possibly produce autoreleases.
-bool ObjCARCAPElim::MayAutorelease(CallSite CS, unsigned Depth) {
- if (Function *Callee = CS.getCalledFunction()) {
+bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {
+ if (const Function *Callee = CS.getCalledFunction()) {
if (Callee->isDeclaration() || Callee->mayBeOverridden())
return true;
- for (Function::iterator I = Callee->begin(), E = Callee->end();
+ for (Function::const_iterator I = Callee->begin(), E = Callee->end();
I != E; ++I) {
- BasicBlock *BB = I;
- for (BasicBlock::iterator J = BB->begin(), F = BB->end(); J != F; ++J)
- if (CallSite JCS = CallSite(J))
+ const BasicBlock *BB = I;
+ for (BasicBlock::const_iterator J = BB->begin(), F = BB->end();
+ J != F; ++J)
+ if (ImmutableCallSite JCS = ImmutableCallSite(J))
// This recursion depth limit is arbitrary. It's just great
// enough to cover known interesting testcases.
if (Depth < 3 &&
@@ -992,7 +991,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
Push = 0;
break;
case IC_CallOrUser:
- if (MayAutorelease(CallSite(Inst)))
+ if (MayAutorelease(ImmutableCallSite(Inst)))
Push = 0;
break;
default:
@@ -1033,7 +1032,11 @@ bool ObjCARCAPElim::runOnModule(Module &M) {
Value *Op = *OI;
// llvm.global_ctors is an array of pairs where the second members
// are constructor functions.
- Function *F = cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+ Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+ // If the user used a constructor function with the wrong signature and
+ // it got bitcasted or whatever, look the other way.
+ if (!F)
+ continue;
// Only look at function definitions.
if (F->isDeclaration())
continue;
@@ -1089,14 +1092,10 @@ bool ObjCARCAPElim::runOnModule(Module &M) {
// TODO: Delete release+retain pairs (rare).
-#include "llvm/GlobalAlias.h"
-#include "llvm/Constants.h"
#include "llvm/LLVMContext.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/CFG.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/DenseSet.h"
STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
@@ -1144,22 +1143,13 @@ bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) {
// If the values are Selects with the same condition, we can do a more precise
// check: just check for relations between the values on corresponding arms.
if (const SelectInst *SB = dyn_cast<SelectInst>(B))
- if (A->getCondition() == SB->getCondition()) {
- if (related(A->getTrueValue(), SB->getTrueValue()))
- return true;
- if (related(A->getFalseValue(), SB->getFalseValue()))
- return true;
- return false;
- }
+ if (A->getCondition() == SB->getCondition())
+ return related(A->getTrueValue(), SB->getTrueValue()) ||
+ related(A->getFalseValue(), SB->getFalseValue());
// Check both arms of the Select node individually.
- if (related(A->getTrueValue(), B))
- return true;
- if (related(A->getFalseValue(), B))
- return true;
-
- // The arms both checked out.
- return false;
+ return related(A->getTrueValue(), B) ||
+ related(A->getFalseValue(), B);
}
bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) {
@@ -1357,12 +1347,6 @@ namespace {
/// with the "tail" keyword.
bool IsTailCallRelease;
- /// Partial - True of we've seen an opportunity for partial RR elimination,
- /// such as pushing calls into a CFG triangle or into one side of a
- /// CFG diamond.
- /// TODO: Consider moving this to PtrState.
- bool Partial;
-
/// ReleaseMetadata - If the Calls are objc_release calls and they all have
/// a clang.imprecise_release tag, this is the metadata tag.
MDNode *ReleaseMetadata;
@@ -1377,7 +1361,7 @@ namespace {
RRInfo() :
KnownSafe(false), IsRetainBlock(false),
- IsTailCallRelease(false), Partial(false),
+ IsTailCallRelease(false),
ReleaseMetadata(0) {}
void clear();
@@ -1388,7 +1372,6 @@ void RRInfo::clear() {
KnownSafe = false;
IsRetainBlock = false;
IsTailCallRelease = false;
- Partial = false;
ReleaseMetadata = 0;
Calls.clear();
ReverseInsertPts.clear();
@@ -1398,36 +1381,39 @@ namespace {
/// PtrState - This class summarizes several per-pointer runtime properties
/// which are propagated through the flow graph.
class PtrState {
- /// RefCount - The known minimum number of reference count increments.
- unsigned RefCount;
-
/// NestCount - The known minimum level of retain+release nesting.
unsigned NestCount;
+ /// KnownPositiveRefCount - True if the reference count is known to
+ /// be incremented.
+ bool KnownPositiveRefCount;
+
+ /// Partial - True if we've seen an opportunity for partial RR elimination,
+ /// such as pushing calls into a CFG triangle or into one side of a
+ /// CFG diamond.
+ bool Partial;
+
/// Seq - The current position in the sequence.
- Sequence Seq;
+ Sequence Seq : 8;
public:
/// RRI - Unidirectional information about the current sequence.
/// TODO: Encapsulate this better.
RRInfo RRI;
- PtrState() : RefCount(0), NestCount(0), Seq(S_None) {}
-
- void SetAtLeastOneRefCount() {
- if (RefCount == 0) RefCount = 1;
- }
+ PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false),
+ Seq(S_None) {}
- void IncrementRefCount() {
- if (RefCount != UINT_MAX) ++RefCount;
+ void SetKnownPositiveRefCount() {
+ KnownPositiveRefCount = true;
}
- void DecrementRefCount() {
- if (RefCount != 0) --RefCount;
+ void ClearRefCount() {
+ KnownPositiveRefCount = false;
}
bool IsKnownIncremented() const {
- return RefCount > 0;
+ return KnownPositiveRefCount;
}
void IncrementNestCount() {
@@ -1451,7 +1437,12 @@ namespace {
}
void ClearSequenceProgress() {
- Seq = S_None;
+ ResetSequenceProgress(S_None);
+ }
+
+ void ResetSequenceProgress(Sequence NewSeq) {
+ Seq = NewSeq;
+ Partial = false;
RRI.clear();
}
@@ -1462,7 +1453,7 @@ namespace {
void
PtrState::Merge(const PtrState &Other, bool TopDown) {
Seq = MergeSeqs(Seq, Other.Seq, TopDown);
- RefCount = std::min(RefCount, Other.RefCount);
+ KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
NestCount = std::min(NestCount, Other.NestCount);
// We can't merge a plain objc_retain with an objc_retainBlock.
@@ -1471,31 +1462,31 @@ PtrState::Merge(const PtrState &Other, bool TopDown) {
// If we're not in a sequence (anymore), drop all associated state.
if (Seq == S_None) {
+ Partial = false;
RRI.clear();
- } else if (RRI.Partial || Other.RRI.Partial) {
+ } else if (Partial || Other.Partial) {
// If we're doing a merge on a path that's previously seen a partial
// merge, conservatively drop the sequence, to avoid doing partial
// RR elimination. If the branch predicates for the two merge differ,
// mixing them is unsafe.
- Seq = S_None;
- RRI.clear();
+ ClearSequenceProgress();
} else {
// Conservatively merge the ReleaseMetadata information.
if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
RRI.ReleaseMetadata = 0;
RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
- RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease;
+ RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
+ Other.RRI.IsTailCallRelease;
RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
// Merge the insert point sets. If there are any differences,
// that makes this a partial merge.
- RRI.Partial = RRI.ReverseInsertPts.size() !=
- Other.RRI.ReverseInsertPts.size();
+ Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size();
for (SmallPtrSet<Instruction *, 2>::const_iterator
I = Other.RRI.ReverseInsertPts.begin(),
E = Other.RRI.ReverseInsertPts.end(); I != E; ++I)
- RRI.Partial |= RRI.ReverseInsertPts.insert(*I);
+ Partial |= RRI.ReverseInsertPts.insert(*I);
}
}
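The rewritten Merge treats "known positive ref count" as a fact that must hold on every incoming path, so booleans meet with AND while counts still meet with min. A minimal standalone sketch of that meet, using a toy struct rather than the real PtrState (all names here are illustrative, not LLVM types):

#include <algorithm>
#include <cassert>

// Simplified stand-in for the per-pointer state merged at CFG joins.
struct ToyPtrState {
  unsigned NestCount;          // known minimum retain+release nesting
  bool KnownPositiveRefCount;  // proven on *every* incoming path?

  void merge(const ToyPtrState &Other) {
    // A fact survives a join only if both predecessors establish it:
    // booleans meet with AND, counts meet with min.
    KnownPositiveRefCount = KnownPositiveRefCount &&
                            Other.KnownPositiveRefCount;
    NestCount = std::min(NestCount, Other.NestCount);
  }
};

int main() {
  ToyPtrState A = {2, true};   // path 1: two nested retains, count positive
  ToyPtrState B = {1, false};  // path 2: nothing proven about the count
  A.merge(B);
  assert(A.NestCount == 1 && !A.KnownPositiveRefCount);
  return 0;
}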
@@ -1521,6 +1512,11 @@ namespace {
/// known about a pointer at the top of each block.
MapTy PerPtrBottomUp;
+ /// Preds, Succs - Effective successors and predecessors of the current
+ /// block (this ignores ignorable edges and ignored backedges).
+ SmallVector<BasicBlock *, 2> Preds;
+ SmallVector<BasicBlock *, 2> Succs;
+
public:
BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
@@ -1578,14 +1574,22 @@ namespace {
/// entry to an exit which pass through this block. This is only valid
/// after both the top-down and bottom-up traversals are complete.
unsigned GetAllPathCount() const {
+ assert(TopDownPathCount != 0);
+ assert(BottomUpPathCount != 0);
return TopDownPathCount * BottomUpPathCount;
}
- /// IsVisitedTopDown - Test whether the block for this BBState has been
- /// visited by the top-down portion of the algorithm.
- bool isVisitedTopDown() const {
- return TopDownPathCount != 0;
- }
+ // Specialized CFG utilities.
+ typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
+ edge_iterator pred_begin() { return Preds.begin(); }
+ edge_iterator pred_end() { return Preds.end(); }
+ edge_iterator succ_begin() { return Succs.begin(); }
+ edge_iterator succ_end() { return Succs.end(); }
+
+ void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
+ void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
+
+ bool isExit() const { return Succs.empty(); }
};
}
@@ -1783,12 +1787,9 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
if (!RetainRVCallee) {
LLVMContext &C = M->getContext();
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- std::vector<Type *> Params;
- Params.push_back(I8X);
- FunctionType *FTy =
- FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
RetainRVCallee =
M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
Attributes);
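This hunk, and the parallel callee getters below, swap a mutated std::vector/AttrListPtr pair for a stack array and a chained builder while keeping the lazily-created-declaration shape. A hedged sketch of that caching pattern, with a toy declareFunction standing in for Module::getOrInsertFunction (the toy types are not LLVM API):

#include <string>
#include <iostream>

// Toy stand-ins; the real code uses llvm::Module::getOrInsertFunction
// and llvm::AttrListPtr. Names below are invented for illustration.
struct ToyFunction { std::string Name; };

static ToyFunction *declareFunction(const std::string &Name) {
  std::cout << "declaring " << Name << "\n";  // happens once per name
  return new ToyFunction{Name};
}

// Lazily create the declaration the first time it is requested and
// cache it, mirroring getRetainRVCallee() above.
static ToyFunction *getRetainRVCallee() {
  static ToyFunction *Callee = nullptr;
  if (!Callee)
    Callee = declareFunction("objc_retainAutoreleasedReturnValue");
  return Callee;
}

int main() {
  ToyFunction *A = getRetainRVCallee();
  ToyFunction *B = getRetainRVCallee();  // cached, no second declaration
  return A == B ? 0 : 1;
}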
@@ -1800,12 +1801,9 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
if (!AutoreleaseRVCallee) {
LLVMContext &C = M->getContext();
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- std::vector<Type *> Params;
- Params.push_back(I8X);
- FunctionType *FTy =
- FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
AutoreleaseRVCallee =
M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
Attributes);
@@ -1816,10 +1814,8 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
if (!ReleaseCallee) {
LLVMContext &C = M->getContext();
- std::vector<Type *> Params;
- Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
ReleaseCallee =
M->getOrInsertFunction(
"objc_release",
@@ -1832,10 +1828,8 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
Constant *ObjCARCOpt::getRetainCallee(Module *M) {
if (!RetainCallee) {
LLVMContext &C = M->getContext();
- std::vector<Type *> Params;
- Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
RetainCallee =
M->getOrInsertFunction(
"objc_retain",
@@ -1848,16 +1842,14 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) {
Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
if (!RetainBlockCallee) {
LLVMContext &C = M->getContext();
- std::vector<Type *> Params;
- Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
- AttrListPtr Attributes;
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
// objc_retainBlock is not nounwind because it calls user copy constructors
// which could theoretically throw.
RetainBlockCallee =
M->getOrInsertFunction(
"objc_retainBlock",
FunctionType::get(Params[0], Params, /*isVarArg=*/false),
- Attributes);
+ AttrListPtr());
}
return RetainBlockCallee;
}
@@ -1865,10 +1857,8 @@ Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
if (!AutoreleaseCallee) {
LLVMContext &C = M->getContext();
- std::vector<Type *> Params;
- Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
AutoreleaseCallee =
M->getOrInsertFunction(
"objc_autorelease",
@@ -2153,13 +2143,13 @@ static bool isNoopInstruction(const Instruction *I) {
/// objc_retainAutoreleasedReturnValue if the operand is a return value.
void
ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
- CallSite CS(GetObjCArg(Retain));
- Instruction *Call = CS.getInstruction();
+ ImmutableCallSite CS(GetObjCArg(Retain));
+ const Instruction *Call = CS.getInstruction();
if (!Call) return;
if (Call->getParent() != Retain->getParent()) return;
// Check that the call is next to the retain.
- BasicBlock::iterator I = Call;
+ BasicBlock::const_iterator I = Call;
++I;
while (isNoopInstruction(I)) ++I;
if (&*I != Retain)
@@ -2172,25 +2162,24 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
}
/// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into
-/// objc_retain if the operand is not a return value. Or, if it can be
-/// paired with an objc_autoreleaseReturnValue, delete the pair and
-/// return true.
+/// objc_retain if the operand is not a return value. Or, if it can be paired
+/// with an objc_autoreleaseReturnValue, delete the pair and return true.
bool
ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
// Check for the argument being from an immediately preceding call or invoke.
- Value *Arg = GetObjCArg(RetainRV);
- CallSite CS(Arg);
- if (Instruction *Call = CS.getInstruction()) {
+ const Value *Arg = GetObjCArg(RetainRV);
+ ImmutableCallSite CS(Arg);
+ if (const Instruction *Call = CS.getInstruction()) {
if (Call->getParent() == RetainRV->getParent()) {
- BasicBlock::iterator I = Call;
+ BasicBlock::const_iterator I = Call;
++I;
while (isNoopInstruction(I)) ++I;
if (&*I == RetainRV)
return false;
- } else if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
BasicBlock *RetainRVParent = RetainRV->getParent();
if (II->getNormalDest() == RetainRVParent) {
- BasicBlock::iterator I = RetainRVParent->begin();
+ BasicBlock::const_iterator I = RetainRVParent->begin();
while (isNoopInstruction(I)) ++I;
if (&*I == RetainRV)
return false;
@@ -2418,7 +2407,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
// These can always be moved up.
break;
case IC_Release:
- // These can't be moved across things that care about the retain count.
+ // These can't be moved across things that care about the retain
+ // count.
FindDependencies(NeedsPositiveRetainCount, Arg,
Inst->getParent(), Inst,
DependingInstructions, Visited, PA);
@@ -2500,13 +2490,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
for (; SI != SE; ++SI) {
Sequence SuccSSeq = S_None;
bool SuccSRRIKnownSafe = false;
- // If VisitBottomUp has visited this successor, take what we know about it.
- DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI);
- if (BBI != BBStates.end()) {
- const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
- SuccSSeq = SuccS.GetSeq();
- SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
- }
+ // If VisitBottomUp has pointer information for this successor, take
+ // what we know about it.
+ DenseMap<const BasicBlock *, BBState>::iterator BBI =
+ BBStates.find(*SI);
+ assert(BBI != BBStates.end());
+ const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ SuccSSeq = SuccS.GetSeq();
+ SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
switch (SuccSSeq) {
case S_None:
case S_CanRelease: {
@@ -2553,13 +2544,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
for (; SI != SE; ++SI) {
Sequence SuccSSeq = S_None;
bool SuccSRRIKnownSafe = false;
- // If VisitBottomUp has visited this successor, take what we know about it.
- DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI);
- if (BBI != BBStates.end()) {
- const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
- SuccSSeq = SuccS.GetSeq();
- SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
- }
+ // If VisitBottomUp has pointer information for this successor, take
+ // what we know about it.
+ DenseMap<const BasicBlock *, BBState>::iterator BBI =
+ BBStates.find(*SI);
+ assert(BBI != BBStates.end());
+ const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ SuccSSeq = SuccS.GetSeq();
+ SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
switch (SuccSSeq) {
case S_None: {
if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
@@ -2617,16 +2609,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease)
NestingDetected = true;
- S.RRI.clear();
-
MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
- S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release);
+ S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
S.RRI.ReleaseMetadata = ReleaseMetadata;
S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented();
S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
S.RRI.Calls.insert(Inst);
- S.IncrementRefCount();
S.IncrementNestCount();
break;
}
@@ -2641,8 +2630,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
Arg = GetObjCArg(Inst);
PtrState &S = MyStates.getPtrBottomUpState(Arg);
- S.DecrementRefCount();
- S.SetAtLeastOneRefCount();
+ S.SetKnownPositiveRefCount();
S.DecrementNestCount();
switch (S.GetSeq()) {
@@ -2692,7 +2680,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
// Check for possible releases.
if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
- S.DecrementRefCount();
+ S.ClearRefCount();
switch (Seq) {
case S_Use:
S.SetSeq(S_CanRelease);
@@ -2759,37 +2747,20 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
// Merge the states from each successor to compute the initial state
// for the current block.
- const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
- succ_const_iterator SI(TI), SE(TI, false);
- if (SI == SE)
- MyStates.SetAsExit();
- else {
- // If the terminator is an invoke marked with the
- // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
- // ignored, for ARC purposes.
- if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
- --SE;
-
- do {
- const BasicBlock *Succ = *SI++;
- if (Succ == BB)
- continue;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
- // If we haven't seen this node yet, then we've found a CFG cycle.
- // Be optimistic here; it's CheckForCFGHazards' job detect trouble.
- if (I == BBStates.end())
- continue;
- MyStates.InitFromSucc(I->second);
- while (SI != SE) {
- Succ = *SI++;
- if (Succ != BB) {
- I = BBStates.find(Succ);
- if (I != BBStates.end())
- MyStates.MergeSucc(I->second);
- }
- }
- break;
- } while (SI != SE);
+ for (BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end()); SI != SE; ++SI) {
+ const BasicBlock *Succ = *SI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.InitFromSucc(I->second);
+ ++SI;
+ for (; SI != SE; ++SI) {
+ Succ = *SI;
+ I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.MergeSucc(I->second);
+ }
+ break;
}
// Visit all the instructions, bottom-up.
@@ -2803,15 +2774,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
}
- // If there's a predecessor with an invoke, visit the invoke as
- // if it were part of this block, since we can't insert code after
- // an invoke in its own block, and we don't want to split critical
- // edges.
- for (pred_iterator PI(BB), PE(BB, false); PI != PE; ++PI) {
+ // If there's a predecessor with an invoke, visit the invoke as if it were
+ // part of this block, since we can't insert code after an invoke in its own
+ // block, and we don't want to split critical edges.
+ for (BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end()); PI != PE; ++PI) {
BasicBlock *Pred = *PI;
- TerminatorInst *PredTI = cast<TerminatorInst>(&Pred->back());
- if (isa<InvokeInst>(PredTI))
- NestingDetected |= VisitInstructionBottomUp(PredTI, BB, Retains, MyStates);
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+ NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
}
return NestingDetected;
@@ -2851,25 +2821,23 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
if (S.GetSeq() == S_Retain)
NestingDetected = true;
- S.SetSeq(S_Retain);
- S.RRI.clear();
+ S.ResetSequenceProgress(S_Retain);
S.RRI.IsRetainBlock = Class == IC_RetainBlock;
- // Don't check S.IsKnownIncremented() here because it's not
- // sufficient.
+ // Don't check S.IsKnownIncremented() here because it's not sufficient.
S.RRI.KnownSafe = S.IsKnownNested();
S.RRI.Calls.insert(Inst);
}
- S.SetAtLeastOneRefCount();
- S.IncrementRefCount();
S.IncrementNestCount();
- return NestingDetected;
+
+ // A retain can be a potential use; proceed to the generic checking
+ // code below.
+ break;
}
case IC_Release: {
Arg = GetObjCArg(Inst);
PtrState &S = MyStates.getPtrTopDownState(Arg);
- S.DecrementRefCount();
S.DecrementNestCount();
switch (S.GetSeq()) {
@@ -2916,7 +2884,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
// Check for possible releases.
if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
- S.DecrementRefCount();
+ S.ClearRefCount();
switch (Seq) {
case S_Retain:
S.SetSeq(S_CanRelease);
@@ -2967,41 +2935,21 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
// Merge the states from each predecessor to compute the initial state
// for the current block.
- const_pred_iterator PI(BB), PE(BB, false);
- if (PI == PE)
- MyStates.SetAsEntry();
- else
- do {
- unsigned OperandNo = PI.getOperandNo();
- const Use &Us = PI.getUse();
- ++PI;
-
- // Skip invoke unwind edges on invoke instructions marked with
- // clang.arc.no_objc_arc_exceptions.
- if (const InvokeInst *II = dyn_cast<InvokeInst>(Us.getUser()))
- if (OperandNo == II->getNumArgOperands() + 2 &&
- II->getMetadata(NoObjCARCExceptionsMDKind))
- continue;
-
- const BasicBlock *Pred = cast<TerminatorInst>(Us.getUser())->getParent();
- if (Pred == BB)
- continue;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
- // If we haven't seen this node yet, then we've found a CFG cycle.
- // Be optimistic here; it's CheckForCFGHazards' job detect trouble.
- if (I == BBStates.end() || !I->second.isVisitedTopDown())
- continue;
- MyStates.InitFromPred(I->second);
- while (PI != PE) {
- Pred = *PI++;
- if (Pred != BB) {
- I = BBStates.find(Pred);
- if (I != BBStates.end() && I->second.isVisitedTopDown())
- MyStates.MergePred(I->second);
- }
- }
- break;
- } while (PI != PE);
+ for (BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end()); PI != PE; ++PI) {
+ const BasicBlock *Pred = *PI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.InitFromPred(I->second);
+ ++PI;
+ for (; PI != PE; ++PI) {
+ Pred = *PI;
+ I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.MergePred(I->second);
+ }
+ break;
+ }
// Visit all the instructions, top-down.
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
@@ -3016,73 +2964,82 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
static void
ComputePostOrders(Function &F,
SmallVectorImpl<BasicBlock *> &PostOrder,
- SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder) {
- /// Backedges - Backedges detected in the DFS. These edges will be
- /// ignored in the reverse-CFG DFS, so that loops with multiple exits will be
- /// traversed in the desired order.
- DenseSet<std::pair<BasicBlock *, BasicBlock *> > Backedges;
-
+ SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
+ unsigned NoObjCARCExceptionsMDKind,
+ DenseMap<const BasicBlock *, BBState> &BBStates) {
/// Visited - The visited set, for doing DFS walks.
SmallPtrSet<BasicBlock *, 16> Visited;
// Do DFS, computing the PostOrder.
SmallPtrSet<BasicBlock *, 16> OnStack;
SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
+
+ // Functions always have exactly one entry block, and we don't have
+ // any other block that we treat like an entry block.
BasicBlock *EntryBB = &F.getEntryBlock();
- SuccStack.push_back(std::make_pair(EntryBB, succ_begin(EntryBB)));
+ BBState &MyStates = BBStates[EntryBB];
+ MyStates.SetAsEntry();
+ TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+ SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
Visited.insert(EntryBB);
OnStack.insert(EntryBB);
do {
dfs_next_succ:
- TerminatorInst *TI = cast<TerminatorInst>(&SuccStack.back().first->back());
- succ_iterator End = succ_iterator(TI, true);
- while (SuccStack.back().second != End) {
- BasicBlock *BB = *SuccStack.back().second++;
- if (Visited.insert(BB)) {
- SuccStack.push_back(std::make_pair(BB, succ_begin(BB)));
- OnStack.insert(BB);
+ BasicBlock *CurrBB = SuccStack.back().first;
+ TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
+ succ_iterator SE(TI, false);
+
+ // If the terminator is an invoke marked with the
+ // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
+ // ignored, for ARC purposes.
+ if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
+ --SE;
+
+ while (SuccStack.back().second != SE) {
+ BasicBlock *SuccBB = *SuccStack.back().second++;
+ if (Visited.insert(SuccBB)) {
+ TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
+ SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBState &SuccStates = BBStates[SuccBB];
+ SuccStates.addPred(CurrBB);
+ OnStack.insert(SuccBB);
goto dfs_next_succ;
}
- if (OnStack.count(BB))
- Backedges.insert(std::make_pair(SuccStack.back().first, BB));
+
+ if (!OnStack.count(SuccBB)) {
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBStates[SuccBB].addPred(CurrBB);
+ }
}
- OnStack.erase(SuccStack.back().first);
- PostOrder.push_back(SuccStack.pop_back_val().first);
+ OnStack.erase(CurrBB);
+ PostOrder.push_back(CurrBB);
+ SuccStack.pop_back();
} while (!SuccStack.empty());
Visited.clear();
- // Compute the exits, which are the starting points for reverse-CFG DFS.
- // This includes blocks where all the successors are backedges that
- // we're skipping.
- SmallVector<BasicBlock *, 4> Exits;
+ // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+ // Functions may have many exits, and there are also blocks which we treat
+ // as exits due to ignored edges.
+ SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
- BasicBlock *BB = I;
- TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
- for (succ_iterator SI(TI), SE(TI, true); SI != SE; ++SI)
- if (!Backedges.count(std::make_pair(BB, *SI)))
- goto HasNonBackedgeSucc;
- Exits.push_back(BB);
- HasNonBackedgeSucc:;
- }
+ BasicBlock *ExitBB = I;
+ BBState &MyStates = BBStates[ExitBB];
+ if (!MyStates.isExit())
+ continue;
- // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
- SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> PredStack;
- for (SmallVectorImpl<BasicBlock *>::iterator I = Exits.begin(), E = Exits.end();
- I != E; ++I) {
- BasicBlock *ExitBB = *I;
- PredStack.push_back(std::make_pair(ExitBB, pred_begin(ExitBB)));
+ MyStates.SetAsExit();
+
+ PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin()));
Visited.insert(ExitBB);
while (!PredStack.empty()) {
reverse_dfs_next_succ:
- pred_iterator End = pred_end(PredStack.back().first);
- while (PredStack.back().second != End) {
+ BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+ while (PredStack.back().second != PE) {
BasicBlock *BB = *PredStack.back().second++;
- // Skip backedges detected in the forward-CFG DFS.
- if (Backedges.count(std::make_pair(BB, PredStack.back().first)))
- continue;
if (Visited.insert(BB)) {
- PredStack.push_back(std::make_pair(BB, pred_begin(BB)));
+ PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
goto reverse_dfs_next_succ;
}
}
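ComputePostOrders now performs the forward DFS with an explicit (block, next-successor) stack, recording effective edges in BBStates as it goes. A self-contained sketch of the same explicit-stack postorder over a toy adjacency-list CFG, with the goto replaced by a plain loop (a simplification, not the real traversal):

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Compute a DFS postorder of a digraph given as adjacency lists, using
// an explicit (node, next-successor-index) stack like SuccStack above.
static std::vector<int> postOrder(const std::vector<std::vector<int>> &Succs,
                                  int Entry) {
  std::vector<bool> Visited(Succs.size(), false);
  std::vector<std::pair<int, std::size_t>> Stack;
  std::vector<int> Order;
  Stack.push_back({Entry, 0});
  Visited[Entry] = true;
  while (!Stack.empty()) {
    auto &[BB, NextSucc] = Stack.back();
    if (NextSucc < Succs[BB].size()) {
      int S = Succs[BB][NextSucc++];
      if (!Visited[S]) {
        Visited[S] = true;
        Stack.push_back({S, 0});  // descend into the unvisited successor
      }
    } else {
      Order.push_back(BB);        // all successors done: emit in postorder
      Stack.pop_back();
    }
  }
  return Order;
}

int main() {
  // 0 -> 1 -> 2 and 0 -> 2: postorder is 2, 1, 0.
  std::vector<std::vector<int>> Succs = {{1, 2}, {2}, {}};
  for (int BB : postOrder(Succs, 0))
    std::cout << BB << ' ';
  std::cout << '\n';
  return 0;
}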
@@ -3105,7 +3062,9 @@ ObjCARCOpt::Visit(Function &F,
// function exit point, and we want to ignore selected cycle edges.
SmallVector<BasicBlock *, 16> PostOrder;
SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
- ComputePostOrders(F, PostOrder, ReverseCFGPostOrder);
+ ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+ NoObjCARCExceptionsMDKind,
+ BBStates);
// Use reverse-postorder on the reverse CFG for bottom-up.
bool BottomUpNestingDetected = false;
@@ -3214,7 +3173,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
// not being managed by ObjC reference counting, so we can delete pairs
// regardless of what possible decrements or uses lie between them.
bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
-
+
// A constant pointer can't be pointing to an object on the heap. It may
// be reference-counted, but it won't be deleted.
if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
@@ -3375,6 +3334,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
// Ok, everything checks out and we're all set. Let's move some code!
Changed = true;
+ assert(OldCount != 0 && "Unreachable code?");
AnyPairsCompletelyEliminated = NewCount == 0;
NumRRs += OldCount - NewCount;
MoveCalls(Arg, RetainsToMove, ReleasesToMove,
@@ -3515,7 +3475,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
for (Value::use_iterator UI = Alloca->use_begin(),
UE = Alloca->use_end(); UI != UE; ++UI) {
- Instruction *UserInst = cast<Instruction>(*UI);
+ const Instruction *UserInst = cast<Instruction>(*UI);
switch (GetBasicInstructionClass(UserInst)) {
case IC_InitWeak:
case IC_StoreWeak:
@@ -3529,8 +3489,18 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
for (Value::use_iterator UI = Alloca->use_begin(),
UE = Alloca->use_end(); UI != UE; ) {
CallInst *UserInst = cast<CallInst>(*UI++);
- if (!UserInst->use_empty())
- UserInst->replaceAllUsesWith(UserInst->getArgOperand(0));
+ switch (GetBasicInstructionClass(UserInst)) {
+ case IC_InitWeak:
+ case IC_StoreWeak:
+ // These functions return their second argument.
+ UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+ break;
+ case IC_DestroyWeak:
+ // No return value.
+ break;
+ default:
+ llvm_unreachable("alloca really is used!");
+ }
UserInst->eraseFromParent();
}
Alloca->eraseFromParent();
@@ -3598,8 +3568,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
if (!Autorelease)
goto next_block;
- InstructionClass AutoreleaseClass =
- GetBasicInstructionClass(Autorelease);
+ InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
if (!IsAutorelease(AutoreleaseClass))
goto next_block;
if (GetObjCArg(Autorelease) != Arg)
@@ -3690,7 +3659,7 @@ bool ObjCARCOpt::doInitialization(Module &M) {
// Intuitively, objc_retain and others are nocapture, however in practice
// they are not, because they return their argument value. And objc_release
- // calls finalizers.
+ // calls finalizers which can have arbitrary side effects.
// These are initialized lazily.
RetainRVCallee = 0;
@@ -3742,8 +3711,8 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
while (OptimizeSequences(F)) {}
// Optimizations if objc_autorelease is used.
- if (UsedInThisFunction &
- ((1 << IC_Autorelease) | (1 << IC_AutoreleaseRV)))
+ if (UsedInThisFunction & ((1 << IC_Autorelease) |
+ (1 << IC_AutoreleaseRV)))
OptimizeReturns(F);
return Changed;
@@ -3791,7 +3760,7 @@ namespace {
/// StoreStrongCalls - The set of inserted objc_storeStrong calls. If
/// at the end of walking the function we have found no alloca
/// instructions, these calls can be marked "tail".
- DenseSet<CallInst *> StoreStrongCalls;
+ SmallPtrSet<CallInst *, 8> StoreStrongCalls;
Constant *getStoreStrongCallee(Module *M);
Constant *getRetainAutoreleaseCallee(Module *M);
@@ -3842,13 +3811,11 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
LLVMContext &C = M->getContext();
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
Type *I8XX = PointerType::getUnqual(I8X);
- std::vector<Type *> Params;
- Params.push_back(I8XX);
- Params.push_back(I8X);
+ Type *Params[] = { I8XX, I8X };
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
- Attributes.addAttr(1, Attribute::NoCapture);
+ AttrListPtr Attributes = AttrListPtr()
+ .addAttr(~0u, Attribute::NoUnwind)
+ .addAttr(1, Attribute::NoCapture);
StoreStrongCallee =
M->getOrInsertFunction(
@@ -3863,12 +3830,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
if (!RetainAutoreleaseCallee) {
LLVMContext &C = M->getContext();
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- std::vector<Type *> Params;
- Params.push_back(I8X);
- FunctionType *FTy =
- FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
RetainAutoreleaseCallee =
M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes);
}
@@ -3879,12 +3843,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
if (!RetainAutoreleaseRVCallee) {
LLVMContext &C = M->getContext();
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- std::vector<Type *> Params;
- Params.push_back(I8X);
- FunctionType *FTy =
- FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes;
- Attributes.addAttr(~0u, Attribute::NoUnwind);
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
RetainAutoreleaseRVCallee =
M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
Attributes);
@@ -3892,8 +3853,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
return RetainAutoreleaseRVCallee;
}
-/// ContractAutorelease - Merge an autorelease with a retain into a fused
-/// call.
+/// ContractAutorelease - Merge an autorelease with a retain into a fused call.
bool
ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
InstructionClass Class,
@@ -3954,18 +3914,41 @@ void ObjCARCContract::ContractRelease(Instruction *Release,
BasicBlock *BB = Release->getParent();
if (Load->getParent() != BB) return;
- // Walk down to find the store.
+ // Walk down to find the store and the release, which may be in either order.
BasicBlock::iterator I = Load, End = BB->end();
++I;
AliasAnalysis::Location Loc = AA->getLocation(Load);
- while (I != End &&
- (&*I == Release ||
- IsRetain(GetBasicInstructionClass(I)) ||
- !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod)))
- ++I;
- StoreInst *Store = dyn_cast<StoreInst>(I);
- if (!Store || !Store->isSimple()) return;
- if (Store->getPointerOperand() != Loc.Ptr) return;
+ StoreInst *Store = 0;
+ bool SawRelease = false;
+ for (; !Store || !SawRelease; ++I) {
+ if (I == End)
+ return;
+
+ Instruction *Inst = I;
+ if (Inst == Release) {
+ SawRelease = true;
+ continue;
+ }
+
+ InstructionClass Class = GetBasicInstructionClass(Inst);
+
+ // Unrelated retains are harmless.
+ if (IsRetain(Class))
+ continue;
+
+ if (Store) {
+ // The store is the point where we're going to put the objc_storeStrong,
+ // so make sure there are no uses after it.
+ if (CanUse(Inst, Load, PA, Class))
+ return;
+ } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) {
+ // We are moving the load down to the store, so check for anything
+ // else which writes to the memory between the load and the store.
+ Store = dyn_cast<StoreInst>(Inst);
+ if (!Store || !Store->isSimple()) return;
+ if (Store->getPointerOperand() != Loc.Ptr) return;
+ }
+ }
Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand());
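The loop above keeps scanning until it has seen both the store and the release, whichever comes first, and bails on any instruction that would make the objc_storeStrong rewrite unsafe. A small model of that scan under invented instruction kinds (the hazard rules here are simplified relative to the real CanUse/getModRefInfo checks):

#include <cstdio>
#include <vector>

enum class Kind { Store, Release, Retain, OtherWrite, Other };

// Scan forward until both the store and the release have been seen,
// in either order, returning false on a hazard (mirrors ContractRelease).
static bool findStoreAndRelease(const std::vector<Kind> &Insts) {
  bool SawStore = false, SawRelease = false;
  for (Kind K : Insts) {
    if (SawStore && SawRelease)
      break;
    switch (K) {
    case Kind::Release:    SawRelease = true; break;
    case Kind::Retain:     break;             // unrelated retains are harmless
    case Kind::Store:      SawStore = true;   break;
    case Kind::OtherWrite: return false;      // clobbers the loaded location
    case Kind::Other:      break;
    }
  }
  return SawStore && SawRelease;
}

int main() {
  std::printf("%d\n", findStoreAndRelease(
      {Kind::Release, Kind::Retain, Kind::Store}));      // 1: either order ok
  std::printf("%d\n", findStoreAndRelease(
      {Kind::Store, Kind::OtherWrite, Kind::Release}));  // 0: hazard
  return 0;
}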
@@ -4053,7 +4036,8 @@ bool ObjCARCContract::runOnFunction(Function &F) {
// It seems that functions which "return twice" are also unsafe for the
// "tail" argument, because they are setjmp, which could need to
// return to an earlier stack state.
- bool TailOkForStoreStrongs = !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
+ bool TailOkForStoreStrongs = !F.isVarArg() &&
+ !F.callsFunctionThatReturnsTwice();
// For ObjC library calls which return their argument, replace uses of the
// argument with uses of the call return value, if it dominates the use. This
@@ -4083,8 +4067,22 @@ bool ObjCARCContract::runOnFunction(Function &F) {
if (!RetainRVMarker)
break;
BasicBlock::iterator BBI = Inst;
- --BBI;
- while (isNoopInstruction(BBI)) --BBI;
+ BasicBlock *InstParent = Inst->getParent();
+
+ // Step up to see if the call immediately precedes the RetainRV call.
+ // If it's an invoke, we have to cross a block boundary. And we have
+ // to carefully dodge no-op instructions.
+ do {
+ if (&*BBI == InstParent->begin()) {
+ BasicBlock *Pred = InstParent->getSinglePredecessor();
+ if (!Pred)
+ goto decline_rv_optimization;
+ BBI = Pred->getTerminator();
+ break;
+ }
+ --BBI;
+ } while (isNoopInstruction(BBI));
+
if (&*BBI == GetObjCArg(Inst)) {
Changed = true;
InlineAsm *IA =
@@ -4094,6 +4092,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
/*Constraints=*/"", /*hasSideEffects=*/true);
CallInst::Create(IA, "", Inst);
}
+ decline_rv_optimization:
break;
}
case IC_InitWeak: {
@@ -4143,25 +4142,21 @@ bool ObjCARCContract::runOnFunction(Function &F) {
// trivially dominate itself, which would lead us to rewriting its
// argument in terms of its return value, which would lead to
// infinite loops in GetObjCArg.
- if (DT->isReachableFromEntry(U) &&
- DT->dominates(Inst, U)) {
+ if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
Changed = true;
Instruction *Replacement = Inst;
Type *UseTy = U.get()->getType();
if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
// For PHI nodes, insert the bitcast in the predecessor block.
- unsigned ValNo =
- PHINode::getIncomingValueNumForOperand(OperandNo);
- BasicBlock *BB =
- PHI->getIncomingBlock(ValNo);
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+ BasicBlock *BB = PHI->getIncomingBlock(ValNo);
if (Replacement->getType() != UseTy)
Replacement = new BitCastInst(Replacement, UseTy, "",
&BB->back());
// While we're here, rewrite all edges for this PHI, rather
// than just one use at a time, to minimize the number of
// bitcasts we emit.
- for (unsigned i = 0, e = PHI->getNumIncomingValues();
- i != e; ++i)
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
if (PHI->getIncomingBlock(i) == BB) {
// Keep the UI iterator valid.
if (&PHI->getOperandUse(
@@ -4179,8 +4174,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
}
}
- // If Arg is a no-op casted pointer, strip one level of casts and
- // iterate.
+ // If Arg is a no-op casted pointer, strip one level of casts and iterate.
if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
Arg = BI->getOperand(0);
else if (isa<GEPOperator>(Arg) &&
@@ -4197,7 +4191,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
// If this function has no escaping allocas or suspicious vararg usage,
// objc_storeStrong calls can be marked with the "tail" keyword.
if (TailOkForStoreStrongs)
- for (DenseSet<CallInst *>::iterator I = StoreStrongCalls.begin(),
+ for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(),
E = StoreStrongCalls.end(); I != E; ++I)
(*I)->setTailCall();
StoreStrongCalls.clear();
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 5de00d1..09687d8 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -26,21 +26,23 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/DenseMap.h"
#include <algorithm>
using namespace llvm;
-STATISTIC(NumLinear , "Number of insts linearized");
STATISTIC(NumChanged, "Number of insts reassociated");
STATISTIC(NumAnnihil, "Number of expr tree annihilated");
STATISTIC(NumFactor , "Number of multiplies factored");
@@ -70,13 +72,51 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
}
}
#endif
-
+
+namespace {
+ /// \brief Utility class representing a base and exponent pair which form one
+ /// factor of some product.
+ struct Factor {
+ Value *Base;
+ unsigned Power;
+
+ Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {}
+
+ /// \brief Sort factors by their Base.
+ struct BaseSorter {
+ bool operator()(const Factor &LHS, const Factor &RHS) {
+ return LHS.Base < RHS.Base;
+ }
+ };
+
+ /// \brief Compare factors for equal bases.
+ struct BaseEqual {
+ bool operator()(const Factor &LHS, const Factor &RHS) {
+ return LHS.Base == RHS.Base;
+ }
+ };
+
+ /// \brief Sort factors in descending order by their power.
+ struct PowerDescendingSorter {
+ bool operator()(const Factor &LHS, const Factor &RHS) {
+ return LHS.Power > RHS.Power;
+ }
+ };
+
+ /// \brief Compare factors for equal powers.
+ struct PowerEqual {
+ bool operator()(const Factor &LHS, const Factor &RHS) {
+ return LHS.Power == RHS.Power;
+ }
+ };
+ };
+}
+
namespace {
class Reassociate : public FunctionPass {
DenseMap<BasicBlock*, unsigned> RankMap;
DenseMap<AssertingVH<Value>, unsigned> ValueRankMap;
- SmallVector<WeakVH, 8> RedoInsts;
- SmallVector<WeakVH, 8> DeadInsts;
+ SetVector<AssertingVH<Instruction> > RedoInsts;
bool MadeChange;
public:
static char ID; // Pass identification, replacement for typeid
@@ -92,18 +132,19 @@ namespace {
private:
void BuildRankMap(Function &F);
unsigned getRank(Value *V);
- Value *ReassociateExpression(BinaryOperator *I);
- void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops,
- unsigned Idx = 0);
+ void ReassociateExpression(BinaryOperator *I);
+ void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
Value *OptimizeExpression(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops);
Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
- void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
- void LinearizeExpr(BinaryOperator *I);
+ bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors);
+ Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ SmallVectorImpl<Factor> &Factors);
+ Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
Value *RemoveFactorFromExpression(Value *V, Value *Factor);
- void ReassociateInst(BasicBlock::iterator &BBI);
-
- void RemoveDeadBinaryOp(Value *V);
+ void EraseInst(Instruction *I);
+ void OptimizeInst(Instruction *I);
};
}
@@ -114,28 +155,24 @@ INITIALIZE_PASS(Reassociate, "reassociate",
// Public interface to the Reassociate pass
FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
-void Reassociate::RemoveDeadBinaryOp(Value *V) {
- Instruction *Op = dyn_cast<Instruction>(V);
- if (!Op || !isa<BinaryOperator>(Op))
- return;
-
- Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1);
-
- ValueRankMap.erase(Op);
- DeadInsts.push_back(Op);
- RemoveDeadBinaryOp(LHS);
- RemoveDeadBinaryOp(RHS);
+/// isReassociableOp - Return true if V is an instruction of the specified
+/// opcode and if it only has one use.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+ if (V->hasOneUse() && isa<Instruction>(V) &&
+ cast<Instruction>(V)->getOpcode() == Opcode)
+ return cast<BinaryOperator>(V);
+ return 0;
}
-
static bool isUnmovableInstruction(Instruction *I) {
if (I->getOpcode() == Instruction::PHI ||
+ I->getOpcode() == Instruction::LandingPad ||
I->getOpcode() == Instruction::Alloca ||
I->getOpcode() == Instruction::Load ||
I->getOpcode() == Instruction::Invoke ||
(I->getOpcode() == Instruction::Call &&
!isa<DbgInfoIntrinsic>(I)) ||
- I->getOpcode() == Instruction::UDiv ||
+ I->getOpcode() == Instruction::UDiv ||
I->getOpcode() == Instruction::SDiv ||
I->getOpcode() == Instruction::FDiv ||
I->getOpcode() == Instruction::URem ||
@@ -198,211 +235,572 @@ unsigned Reassociate::getRank(Value *V) {
return ValueRankMap[I] = Rank;
}
-/// isReassociableOp - Return true if V is an instruction of the specified
-/// opcode and if it only has one use.
-static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
- if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) &&
- cast<Instruction>(V)->getOpcode() == Opcode)
- return cast<BinaryOperator>(V);
- return 0;
-}
-
/// LowerNegateToMultiply - Replace 0-X with X*-1.
///
-static Instruction *LowerNegateToMultiply(Instruction *Neg,
- DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) {
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
Constant *Cst = Constant::getAllOnesValue(Neg->getType());
- Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
- ValueRankMap.erase(Neg);
+ BinaryOperator *Res =
+ BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
+ Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op.
Res->takeName(Neg);
Neg->replaceAllUsesWith(Res);
Res->setDebugLoc(Neg->getDebugLoc());
- Neg->eraseFromParent();
return Res;
}
-// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'.
-// Note that if D is also part of the expression tree that we recurse to
-// linearize it as well. Besides that case, this does not recurse into A,B, or
-// C.
-void Reassociate::LinearizeExpr(BinaryOperator *I) {
- BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
- BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1));
- assert(isReassociableOp(LHS, I->getOpcode()) &&
- isReassociableOp(RHS, I->getOpcode()) &&
- "Not an expression that needs linearization?");
-
- DEBUG(dbgs() << "Linear" << *LHS << '\n' << *RHS << '\n' << *I << '\n');
-
- // Move the RHS instruction to live immediately before I, avoiding breaking
- // dominator properties.
- RHS->moveBefore(I);
-
- // Move operands around to do the linearization.
- I->setOperand(1, RHS->getOperand(0));
- RHS->setOperand(0, LHS);
- I->setOperand(0, RHS);
-
- // Conservatively clear all the optional flags, which may not hold
- // after the reassociation.
- I->clearSubclassOptionalData();
- LHS->clearSubclassOptionalData();
- RHS->clearSubclassOptionalData();
-
- ++NumLinear;
- MadeChange = true;
- DEBUG(dbgs() << "Linearized: " << *I << '\n');
-
- // If D is part of this expression tree, tail recurse.
- if (isReassociableOp(I->getOperand(1), I->getOpcode()))
- LinearizeExpr(I);
+/// CarmichaelShift - Returns k such that lambda(2^Bitwidth) = 2^k, where lambda
+/// is the Carmichael function. This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+ if (Bitwidth < 3)
+ return Bitwidth - 1;
+ return Bitwidth - 2;
+}
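A quick brute-force check of the identity the comment relies on, x^(2^k) ≡ 1 (mod 2^Bitwidth) for every odd x with k = CarmichaelShift(Bitwidth), here for Bitwidth = 8 and k = 6:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned K = 6;                    // CarmichaelShift(8) == 8 - 2
  for (unsigned X = 1; X < 256; X += 2) {  // every odd 8-bit value
    uint8_t P = (uint8_t)X;
    for (unsigned I = 0; I < K; ++I)
      P = (uint8_t)(P * P);                // P = X^(2^K) mod 2^8
    assert(P == 1);
  }
  return 0;
}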
+
+/// IncorporateWeight - Add the extra weight 'RHS' to the existing weight 'LHS',
+/// reducing the combined weight using any special properties of the operation.
+/// The existing weight LHS represents the computation X op X op ... op X where
+/// X occurs LHS times. The combined weight represents X op X op ... op X with
+/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
+/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
+/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+ // If we were working with infinite precision arithmetic then the combined
+ // weight would be LHS + RHS. But we are using finite precision arithmetic,
+ // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
+ // for nilpotent operations and addition, but not for idempotent operations
+ // and multiplication), so it is important to correctly reduce the combined
+ // weight back into range if wrapping would be wrong.
+
+ // If RHS is zero then the weight didn't change.
+ if (RHS.isMinValue())
+ return;
+ // If LHS is zero then the combined weight is RHS.
+ if (LHS.isMinValue()) {
+ LHS = RHS;
+ return;
+ }
+ // From this point on we know that neither LHS nor RHS is zero.
+
+ if (Instruction::isIdempotent(Opcode)) {
+ // Idempotent means X op X === X, so any non-zero weight is equivalent to a
+ // weight of 1. Keeping weights at zero or one also means that wrapping is
+ // not a problem.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ return; // Return a weight of 1.
+ }
+ if (Instruction::isNilpotent(Opcode)) {
+ // Nilpotent means X op X === 0, so reduce weights modulo 2.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ LHS = 0; // 1 + 1 === 0 modulo 2.
+ return;
+ }
+ if (Opcode == Instruction::Add) {
+ // TODO: Reduce the weight by exploiting nsw/nuw?
+ LHS += RHS;
+ return;
+ }
+
+ assert(Opcode == Instruction::Mul && "Unknown associative operation!");
+ unsigned Bitwidth = LHS.getBitWidth();
+ // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
+ // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
+ // bit number x, since either x is odd in which case x^CM = 1, or x is even in
+ // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
+ // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
+ // which by a happy accident means that they can always be represented using
+ // Bitwidth bits.
+ // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
+ // the Carmichael number).
+ if (Bitwidth > 3) {
+ /// CM - The value of Carmichael's lambda function.
+ APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+ // Any weight W >= Threshold can be replaced with W - CM.
+ APInt Threshold = CM + Bitwidth;
+ assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
+ // For Bitwidth 4 or more the following sum does not overflow.
+ LHS += RHS;
+ while (LHS.uge(Threshold))
+ LHS -= CM;
+ } else {
+ // To avoid problems with overflow do everything the same as above but using
+ // a larger type.
+ unsigned CM = 1U << CarmichaelShift(Bitwidth);
+ unsigned Threshold = CM + Bitwidth;
+ assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
+ "Weights not reduced!");
+ unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
+ while (Total >= Threshold)
+ Total -= CM;
+ LHS = Total;
+ }
}
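The special cases above reduce combined weights to small residues: idempotent opcodes collapse any nonzero weight to 1, nilpotent opcodes reduce mod 2, and Add simply accumulates. A toy version over plain unsigned instead of APInt (Mul's Carmichael reduction is omitted):

#include <cassert>

enum class Op { And, Xor, Add };

// Combine two occurrence counts of X under "X op X op ... op X",
// mirroring IncorporateWeight for idempotent/nilpotent opcodes.
static unsigned combineWeights(unsigned LHS, unsigned RHS, Op O) {
  if (RHS == 0) return LHS;
  if (LHS == 0) return RHS;
  switch (O) {
  case Op::And: return 1;                // idempotent: X & X == X
  case Op::Xor: return (LHS + RHS) % 2;  // nilpotent:  X ^ X == 0
  case Op::Add: return LHS + RHS;        // weights simply accumulate
  }
  return 0;
}

int main() {
  assert(combineWeights(1, 1, Op::And) == 1);  // X & X & ... stays X
  assert(combineWeights(1, 1, Op::Xor) == 0);  // X ^ X cancels
  assert(combineWeights(3, 4, Op::Add) == 7);  // 3X + 4X == 7X
  return 0;
}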
+/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C
+/// is repeated Weight times.
+static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C,
+ APInt Weight) {
+ // For addition the result can be efficiently computed as the product of the
+ // constant and the weight.
+ if (Opcode == Instruction::Add)
+ return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight));
+
+ // The weight might be huge, so compute by repeated squaring to ensure that
+ // compile time is proportional to the logarithm of the weight.
+ Constant *Result = 0;
+ Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc.
+ // Visit the bits in Weight.
+ while (Weight != 0) {
+ // If the current bit in Weight is non-zero do Result = Result op Power.
+ if (Weight[0])
+ Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power;
+ // Move on to the next bit if any more are non-zero.
+ Weight = Weight.lshr(1);
+ if (Weight.isMinValue())
+ break;
+ // Square the power.
+ Power = ConstantExpr::get(Opcode, Power, Power);
+ }
+
+ assert(Result && "Only positive weights supported!");
+ return Result;
+}
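The bit-scanning loop above is the classic square-and-multiply scheme, so the number of op applications is logarithmic in Weight rather than linear. The same loop sketched for plain integer multiplication (a simplification of the Constant-based version):

#include <cassert>
#include <cstdint>

// Compute C op C op ... op C (Weight copies) for op == Mul, i.e. C^Weight,
// in O(log Weight) multiplies by scanning Weight's bits from LSB to MSB.
static uint64_t repeatedApply(uint64_t C, uint64_t Weight) {
  uint64_t Result = 1;
  uint64_t Power = C;      // successively C, C*C, (C*C)*(C*C), ...
  while (Weight != 0) {
    if (Weight & 1)        // this bit contributes Power to the result
      Result *= Power;
    Weight >>= 1;
    if (Weight == 0)
      break;               // avoid one final, unused squaring
    Power *= Power;
  }
  return Result;
}

int main() {
  assert(repeatedApply(3, 5) == 243);    // 3^5
  assert(repeatedApply(2, 10) == 1024);  // 2^10
  return 0;
}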
-/// LinearizeExprTree - Given an associative binary expression tree, traverse
-/// all of the uses putting it into canonical form. This forces a left-linear
-/// form of the expression (((a+b)+c)+d), and collects information about the
-/// rank of the non-tree operands.
+typedef std::pair<Value*, APInt> RepeatedValue;
+
+/// LinearizeExprTree - Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs). The
+/// original expression is the same as
+/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
+/// op
+/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
+/// op
+/// ...
+/// op
+/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and
+/// they are all non-constant except possibly for the last one, which if it is
+/// constant will have weight one (Ops[N].second === 1).
+///
+/// This routine may modify the function, in which case it returns 'true'. The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different. Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node. Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+///   expression graph        node names
+///
+///                     +        |      I
+///                    / \       |
+///                   +   +      |    A,  B
+///                  / \ / \     |
+///                 *   +   *    |  C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |    F,  G
+///
+/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
///
-/// NOTE: These intentionally destroys the expression tree operands (turning
-/// them into undef values) to reduce #uses of the values. This means that the
-/// caller MUST use something like RewriteExprTree to put the values back in.
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
///
-void Reassociate::LinearizeExprTree(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression. This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef. If it is B's operand then this gives:
+///
+///                     +        |      I
+///                    / \       |
+///                   +   +      |    A,  B - operand of B replaced with undef
+///                  / \   \     |
+///                 *   +   *    |  C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |    F,  G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
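+///
+/// A typical caller therefore looks like this (compare ReassociateExpression
+/// and RemoveFactorFromExpression below):
+///
+///   SmallVector<RepeatedValue, 8> Tree;
+///   MadeChange |= LinearizeExprTree(I, Tree);
+///   SmallVector<ValueEntry, 8> Ops;
+///   for (unsigned i = 0, e = Tree.size(); i != e; ++i)
+///     Ops.append(Tree[i].second.getZExtValue(),
+///                ValueEntry(getRank(Tree[i].first), Tree[i].first));
+///   // ... reorder or simplify Ops ...
+///   RewriteExprTree(I, Ops);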
+
+static bool LinearizeExprTree(BinaryOperator *I,
+ SmallVectorImpl<RepeatedValue> &Ops) {
+ DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
+ assert(Instruction::isAssociative(Opcode) &&
+ Instruction::isCommutative(Opcode) &&
+ "Expected an associative and commutative operation!");
+ // If we see an absorbing element then the entire expression must be equal to
+ // it. For example, if this is a multiplication expression and zero occurs as
+ // an operand somewhere in it then the result of the expression must be zero.
+ Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType());
+
+ // Visit all operands of the expression, keeping track of their weight (the
+ // number of paths from the expression root to the operand, or if you like
+ // the number of times that operand occurs in the linearized expression).
+ // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+ // while A has weight two.
+
+ // Worklist of non-leaf nodes (their operands are in the expression too) along
+ // with their weights, representing a certain number of paths to the operator.
+ // If an operator occurs in the worklist multiple times then we found multiple
+ // ways to get to it.
+ SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
+ Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+ bool MadeChange = false;
+
+ // Leaves of the expression are values that either aren't the right kind of
+ // operation (eg: a constant, or a multiply in an add tree), or are, but have
+ // some uses that are not inside the expression. For example, in I = X + X,
+ // X = A + B, the value X has two uses (by I) that are in the expression. If
+ // X has any other uses, for example in a return instruction, then we consider
+ // X to be a leaf, and won't analyze it further. When we first visit a value,
+ // if it has more than one use then at first we conservatively consider it to
+ // be a leaf. Later, as the expression is explored, we may discover some more
+ // uses of the value from inside the expression. If all uses turn out to be
+ // from within the expression (and the value is a binary operator of the right
+ // kind) then the value is no longer considered to be a leaf, and its operands
+ // are explored.
+
+ // Leaves - Keeps track of the set of putative leaves as well as the number of
+ // paths to each leaf seen so far.
+ typedef DenseMap<Value*, APInt> LeafMap;
+ LeafMap Leaves; // Leaf -> Total weight so far.
+ SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
- // First step, linearize the expression if it is in ((A+B)+(C+D)) form.
- BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode);
- BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode);
+#ifndef NDEBUG
+ SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+ while (!Worklist.empty()) {
+ std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
+ I = P.first; // We examine the operands of this binary operator.
+
+ for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
+ Value *Op = I->getOperand(OpIdx);
+ APInt Weight = P.second; // Number of paths to this operand.
+ DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+ // If the expression contains an absorbing element then there is no need
+ // to analyze it further: it must evaluate to the absorbing element.
+ if (Op == Absorber && !Weight.isMinValue()) {
+ Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1)));
+ return MadeChange;
+ }
- // If this is a multiply expression tree and it contains internal negations,
- // transform them into multiplies by -1 so they can be reassociated.
- if (I->getOpcode() == Instruction::Mul) {
- if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) {
- LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap);
- LHSBO = isReassociableOp(LHS, Opcode);
- }
- if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) {
- RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap);
- RHSBO = isReassociableOp(RHS, Opcode);
+ // If this is a binary operation of the right kind with only one use then
+ // add its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ assert(Visited.insert(Op) && "Not first visit!");
+ DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ Worklist.push_back(std::make_pair(BO, Weight));
+ continue;
+ }
+
+ // Appears to be a leaf. Is the operand already in the set of leaves?
+ LeafMap::iterator It = Leaves.find(Op);
+ if (It == Leaves.end()) {
+ // Not in the leaf map. Must be the first time we saw this operand.
+ assert(Visited.insert(Op) && "Not first visit!");
+ if (!Op->hasOneUse()) {
+ // This value has uses not accounted for by the expression, so it is
+ // not safe to modify. Mark it as being a leaf.
+ DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ continue;
+ }
+ // No uses outside the expression, try morphing it.
+ } else {
+ // Already in the leaf map.
+ assert(Visited.count(Op) && "In leaf map but not visited!");
+
+ // Update the number of paths to the leaf.
+ IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0 // TODO: Re-enable once PR13021 is fixed.
+ // The leaf already has one use from inside the expression. As we want
+ // exactly one such use, drop this new use of the leaf.
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+ I->setOperand(OpIdx, UndefValue::get(I->getType()));
+ MadeChange = true;
+
+ // If the leaf is a binary operation of the right kind and we now see
+ // that its multiple original uses were in fact all by nodes belonging
+ // to the expression, then no longer consider it to be a leaf and add
+ // its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ Worklist.push_back(std::make_pair(BO, It->second));
+ Leaves.erase(It);
+ continue;
+ }
+#endif
+
+ // If we still have uses that are not accounted for by the expression
+ // then it is not safe to modify the value.
+ if (!Op->hasOneUse())
+ continue;
+
+ // No uses outside the expression, try morphing it.
+ Weight = It->second;
+ Leaves.erase(It); // Since the value may be morphed below.
+ }
+
+ // At this point we have a value which, first of all, is not a binary
+ // expression of the right kind, and secondly, is only used inside the
+ // expression. This means that it can safely be modified. See if we
+ // can usefully morph it into an expression of the right kind.
+ assert((!isa<Instruction>(Op) ||
+ cast<Instruction>(Op)->getOpcode() != Opcode) &&
+ "Should have been handled above!");
+ assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+ // If this is a multiply expression, turn any internal negations into
+ // multiplies by -1 so they can be reassociated.
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Op);
+ if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) {
+ DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ BO = LowerNegateToMultiply(BO);
+ DEBUG(dbgs() << *BO << '\n');
+ Worklist.push_back(std::make_pair(BO, Weight));
+ MadeChange = true;
+ continue;
+ }
+
+ // Failed to morph into an expression of the right type. This really is
+ // a leaf.
+ DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
}
}
- if (!LHSBO) {
- if (!RHSBO) {
- // Neither the LHS or RHS as part of the tree, thus this is a leaf. As
- // such, just remember these operands and their rank.
- Ops.push_back(ValueEntry(getRank(LHS), LHS));
- Ops.push_back(ValueEntry(getRank(RHS), RHS));
-
- // Clear the leaves out.
- I->setOperand(0, UndefValue::get(I->getType()));
- I->setOperand(1, UndefValue::get(I->getType()));
- return;
+ // The leaves, repeated according to their weights, represent the linearized
+ // form of the expression.
+ Constant *Cst = 0; // Accumulate constants here.
+ for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+ Value *V = LeafOrder[i];
+ LeafMap::iterator It = Leaves.find(V);
+ if (It == Leaves.end())
+ // Node initially thought to be a leaf wasn't.
+ continue;
+ assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+ APInt Weight = It->second;
+ if (Weight.isMinValue())
+ // Leaf already output or weight reduction eliminated it.
+ continue;
+ // Ensure the leaf is only output once.
+ It->second = 0;
+ // Glob all constants together into Cst.
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ C = EvaluateRepeatedConstant(Opcode, C, Weight);
+ Cst = Cst ? ConstantExpr::get(Opcode, Cst, C) : C;
+ continue;
}
-
- // Turn X+(Y+Z) -> (Y+Z)+X
- std::swap(LHSBO, RHSBO);
- std::swap(LHS, RHS);
- bool Success = !I->swapOperands();
- assert(Success && "swapOperands failed");
- (void)Success;
- MadeChange = true;
- } else if (RHSBO) {
- // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not
- // part of the expression tree.
- LinearizeExpr(I);
- LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0));
- RHS = I->getOperand(1);
- RHSBO = 0;
+ // Add non-constant values to Ops.
+ Ops.push_back(std::make_pair(V, Weight));
}
- // Okay, now we know that the LHS is a nested expression and that the RHS is
- // not. Perform reassociation.
- assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!");
-
- // Move LHS right before I to make sure that the tree expression dominates all
- // values.
- LHSBO->moveBefore(I);
+ // Add any constants back into Ops, all globbed together and reduced to having
+ // weight 1 for the convenience of users.
+ Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
+ if (Cst && Cst != Identity) {
+ // If combining multiple constants resulted in the absorber then the entire
+ // expression must evaluate to the absorber.
+ if (Cst == Absorber)
+ Ops.clear();
+ Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1)));
+ }
- // Linearize the expression tree on the LHS.
- LinearizeExprTree(LHSBO, Ops);
+ // For nilpotent operations or addition there may be no operands, for example
+ // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+ // in both cases the weight reduces to 0 causing the value to be skipped.
+ if (Ops.empty()) {
+ assert(Identity && "Associative operation without identity!");
+ Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1)));
+ }
- // Remember the RHS operand and its rank.
- Ops.push_back(ValueEntry(getRank(RHS), RHS));
-
- // Clear the RHS leaf out.
- I->setOperand(1, UndefValue::get(I->getType()));
+ return MadeChange;
}
// RewriteExprTree - Now that the operands for this expression tree are
-// linearized and optimized, emit them in-order. This function is written to be
-// tail recursive.
+// linearized and optimized, emit them in-order.
void Reassociate::RewriteExprTree(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops,
- unsigned i) {
- if (i+2 == Ops.size()) {
- if (I->getOperand(0) != Ops[i].Op ||
- I->getOperand(1) != Ops[i+1].Op) {
- Value *OldLHS = I->getOperand(0);
- DEBUG(dbgs() << "RA: " << *I << '\n');
- I->setOperand(0, Ops[i].Op);
- I->setOperand(1, Ops[i+1].Op);
-
- // Clear all the optional flags, which may not hold after the
- // reassociation if the expression involved more than just this operation.
- if (Ops.size() != 2)
- I->clearSubclassOptionalData();
-
- DEBUG(dbgs() << "TO: " << *I << '\n');
+ SmallVectorImpl<ValueEntry> &Ops) {
+ assert(Ops.size() > 1 && "Single values should be used directly!");
+
+ // Since our optimizations never increase the number of operations, the new
+ // expression can always be written by reusing the existing binary operators
+ // from the original expression tree, without creating any new instructions,
+ // though the rewritten expression may have a completely different topology.
+ // We take care to not change anything if the new expression will be the same
+ // as the original. If more than trivial changes (like commuting operands)
+ // were made then we are obliged to clear out any optional subclass data like
+ // nsw flags.
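+ //
+ // For example, if Ops is [c, b, a] and the original tree was ((x op y) op z),
+ // then the root keeps its left child, gets 'c' as its new right operand, and
+ // the reused inner node is rewritten to 'b op a', yielding (b op a) op c
+ // without allocating any new instructions.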
+
+ /// NodesToRewrite - Nodes from the original expression available for writing
+ /// the new expression into.
+ SmallVector<BinaryOperator*, 8> NodesToRewrite;
+ unsigned Opcode = I->getOpcode();
+ BinaryOperator *Op = I;
+
+ // ExpressionChanged - Non-null if the rewritten expression differs from the
+ // original in some non-trivial way, requiring the clearing of optional flags.
+ // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+ BinaryOperator *ExpressionChanged = 0;
+ for (unsigned i = 0; ; ++i) {
+ // The last operation (which comes earliest in the IR) is special as both
+ // operands will come from Ops, rather than just one with the other being
+ // a subexpression.
+ if (i+2 == Ops.size()) {
+ Value *NewLHS = Ops[i].Op;
+ Value *NewRHS = Ops[i+1].Op;
+ Value *OldLHS = Op->getOperand(0);
+ Value *OldRHS = Op->getOperand(1);
+
+ if (NewLHS == OldLHS && NewRHS == OldRHS)
+ // Nothing changed, leave it alone.
+ break;
+
+ if (NewLHS == OldRHS && NewRHS == OldLHS) {
+ // The order of the operands was reversed. Swap them.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->swapOperands();
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ break;
+ }
+
+ // The new operation differs non-trivially from the original. Overwrite
+ // the old operands with the new ones.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewLHS != OldLHS) {
+ if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(0, NewLHS);
+ }
+ if (NewRHS != OldRHS) {
+ if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+
+ break;
+ }
+
+ // Not the last operation. The left-hand side will be a sub-expression
+ // while the right-hand side will be the current element of Ops.
+ Value *NewRHS = Ops[i].Op;
+ if (NewRHS != Op->getOperand(1)) {
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewRHS == Op->getOperand(0)) {
+ // The new right-hand side was already present as the left operand. If
+ // we are lucky then swapping the operands will sort out both of them.
+ Op->swapOperands();
+ } else {
+ // Overwrite with the new right-hand side.
+ if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ ExpressionChanged = Op;
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
-
- // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3)
- // delete the extra, now dead, nodes.
- RemoveDeadBinaryOp(OldLHS);
}
- return;
- }
- assert(i+2 < Ops.size() && "Ops index out of range!");
- if (I->getOperand(1) != Ops[i].Op) {
- DEBUG(dbgs() << "RA: " << *I << '\n');
- I->setOperand(1, Ops[i].Op);
+ // Now deal with the left-hand side. If this is already an operation node
+ // from the original expression then just rewrite the rest of the expression
+ // into it.
+ if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) {
+ Op = BO;
+ continue;
+ }
- // Conservatively clear all the optional flags, which may not hold
- // after the reassociation.
- I->clearSubclassOptionalData();
+ // Otherwise, grab a spare node from the original expression and use that as
+ // the left-hand side. If there are no nodes left then the optimizers made
+ // an expression with more nodes than the original! This usually means that
+ // they did something stupid but it might mean that the problem was just too
+ // hard (finding the minimal number of multiplications needed to realize a
+ // multiplication expression is NP-complete). Whatever the reason, smart or
+ // stupid, create a new node if there are none left.
+ BinaryOperator *NewOp;
+ if (NodesToRewrite.empty()) {
+ Constant *Undef = UndefValue::get(I->getType());
+ NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+ Undef, Undef, "", I);
+ } else {
+ NewOp = NodesToRewrite.pop_back_val();
+ }
- DEBUG(dbgs() << "TO: " << *I << '\n');
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->setOperand(0, NewOp);
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
+ Op = NewOp;
}
-
- BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
- assert(LHS->getOpcode() == I->getOpcode() &&
- "Improper expression tree!");
-
- // Compactify the tree instructions together with each other to guarantee
- // that the expression tree is dominated by all of Ops.
- LHS->moveBefore(I);
- RewriteExprTree(LHS, Ops, i+1);
-}
-
+ // If the expression changed non-trivially then clear out all subclass data
+ // starting from the operator specified in ExpressionChanged, and compactify
+ // the operators to just before the expression root to guarantee that the
+ // expression tree is dominated by all of Ops.
+ if (ExpressionChanged)
+ do {
+ ExpressionChanged->clearSubclassOptionalData();
+ if (ExpressionChanged == I)
+ break;
+ ExpressionChanged->moveBefore(I);
+ ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin());
+ } while (1);
+
+ // Throw away any left over nodes from the original expression.
+ for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+ RedoInsts.insert(NodesToRewrite[i]);
+}
-// NegateValue - Insert instructions before the instruction pointed to by BI,
-// that computes the negative version of the value specified. The negative
-// version of the value is returned, and BI is left pointing at the instruction
-// that should be processed next by the reassociation pass.
-//
+/// NegateValue - Insert instructions before the instruction pointed to by BI,
+/// that computes the negative version of the value specified. The negative
+/// version of the value is returned, and BI is left pointing at the instruction
+/// that should be processed next by the reassociation pass.
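+///
+/// For example, negating a single-use %x = add i32 %a, %b rewrites it in
+/// place, roughly as follows (value names illustrative):
+///
+///   %a.neg = sub i32 0, %a
+///   %b.neg = sub i32 0, %b
+///   %x.neg = add i32 %a.neg, %b.neg   ; the original add, renamed and reused
+///
+/// pushing the negation down to the leaves so the adds stay exposed.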
static Value *NegateValue(Value *V, Instruction *BI) {
if (Constant *C = dyn_cast<Constant>(V))
return ConstantExpr::getNeg(C);
-
+
// We are trying to expose opportunity for reassociation. One of the things
// that we want to do to achieve this is to push a negation as deep into an
// expression chain as possible, to expose the add instructions. In practice,
@@ -412,22 +810,21 @@ static Value *NegateValue(Value *V, Instruction *BI) {
// the constants. We assume that instcombine will clean up the mess later if
// we introduce tons of unnecessary negation instructions.
//
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (I->getOpcode() == Instruction::Add && I->hasOneUse()) {
- // Push the negates through the add.
- I->setOperand(0, NegateValue(I->getOperand(0), BI));
- I->setOperand(1, NegateValue(I->getOperand(1), BI));
-
- // We must move the add instruction here, because the neg instructions do
- // not dominate the old add instruction in general. By moving it, we are
- // assured that the neg instructions we just inserted dominate the
- // instruction we are about to insert after them.
- //
- I->moveBefore(BI);
- I->setName(I->getName()+".neg");
- return I;
- }
-
+ if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) {
+ // Push the negates through the add.
+ I->setOperand(0, NegateValue(I->getOperand(0), BI));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI));
+
+ // We must move the add instruction here, because the neg instructions do
+ // not dominate the old add instruction in general. By moving it, we are
+ // assured that the neg instructions we just inserted dominate the
+ // instruction we are about to insert after them.
+ //
+ I->moveBefore(BI);
+ I->setName(I->getName()+".neg");
+ return I;
+ }
+
// Okay, we need to materialize a negated version of V with an instruction.
// Scan the use lists of V to see if we have one already.
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
@@ -443,7 +840,7 @@ static Value *NegateValue(Value *V, Instruction *BI) {
// Verify that the negate is in this function, V might be a constant expr.
if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
continue;
-
+
BasicBlock::iterator InsertPt;
if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
@@ -471,7 +868,7 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {
// If this is a negation, we can't split it up!
if (BinaryOperator::isNeg(Sub))
return false;
-
+
// Don't bother to break this up unless either the LHS is an associable add or
// subtract or if this is only used by one.
if (isReassociableOp(Sub->getOperand(0), Instruction::Add) ||
@@ -480,19 +877,18 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {
if (isReassociableOp(Sub->getOperand(1), Instruction::Add) ||
isReassociableOp(Sub->getOperand(1), Instruction::Sub))
return true;
- if (Sub->hasOneUse() &&
+ if (Sub->hasOneUse() &&
(isReassociableOp(Sub->use_back(), Instruction::Add) ||
isReassociableOp(Sub->use_back(), Instruction::Sub)))
return true;
-
+
return false;
}
/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is
/// only used by an add, transform this into (X+(0-Y)) to promote better
/// reassociation.
-static Instruction *BreakUpSubtract(Instruction *Sub,
- DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) {
+static BinaryOperator *BreakUpSubtract(Instruction *Sub) {
// Convert a subtract into an add and a neg instruction. This allows sub
// instructions to be commuted with other add instructions.
//
@@ -500,15 +896,15 @@ static Instruction *BreakUpSubtract(Instruction *Sub,
// and set it as the RHS of the add instruction we just made.
//
Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
- Instruction *New =
+ BinaryOperator *New =
BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
+ Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
New->takeName(Sub);
// Everyone now refers to the add instruction.
- ValueRankMap.erase(Sub);
Sub->replaceAllUsesWith(New);
New->setDebugLoc(Sub->getDebugLoc());
- Sub->eraseFromParent();
DEBUG(dbgs() << "Negated: " << *New << '\n');
return New;
@@ -517,32 +913,23 @@ static Instruction *BreakUpSubtract(Instruction *Sub,
/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used
/// by one, change this into a multiply by a constant to assist with further
/// reassociation.
-static Instruction *ConvertShiftToMul(Instruction *Shl,
- DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) {
- // If an operand of this shift is a reassociable multiply, or if the shift
- // is used by a reassociable multiply or add, turn into a multiply.
- if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) ||
- (Shl->hasOneUse() &&
- (isReassociableOp(Shl->use_back(), Instruction::Mul) ||
- isReassociableOp(Shl->use_back(), Instruction::Add)))) {
- Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
- MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
-
- Instruction *Mul =
- BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
- ValueRankMap.erase(Shl);
- Mul->takeName(Shl);
- Shl->replaceAllUsesWith(Mul);
- Mul->setDebugLoc(Shl->getDebugLoc());
- Shl->eraseFromParent();
- return Mul;
- }
- return 0;
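+/// For example (names illustrative):
+///
+///   %shl = shl i32 %X, 4    -->    %shl = mul i32 %X, 16
+///
+/// so that the rest of the pass can treat the shift as an ordinary multiply.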
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+ Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+ MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+
+ BinaryOperator *Mul =
+ BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+ Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
+ Mul->takeName(Shl);
+ Shl->replaceAllUsesWith(Mul);
+ Mul->setDebugLoc(Shl->getDebugLoc());
+ return Mul;
}
-// Scan backwards and forwards among values with the same rank as element i to
-// see if X exists. If X does not exist, return i. This is useful when
-// scanning for 'x' when we see '-x' because they both get the same rank.
+/// FindInOperandList - Scan backwards and forwards among values with the same
+/// rank as element i to see if X exists. If X does not exist, return i. This
+/// is useful when scanning for 'x' when we see '-x' because they both get the
+/// same rank.
static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
Value *X) {
unsigned XRank = Ops[i].Rank;
@@ -562,22 +949,29 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
static Value *EmitAddTreeOfValues(Instruction *I,
SmallVectorImpl<WeakVH> &Ops){
if (Ops.size() == 1) return Ops.back();
-
+
Value *V1 = Ops.back();
Ops.pop_back();
Value *V2 = EmitAddTreeOfValues(I, Ops);
return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
}
-/// RemoveFactorFromExpression - If V is an expression tree that is a
+/// RemoveFactorFromExpression - If V is an expression tree that is a
/// multiplication sequence, and if this sequence contains a multiply by Factor,
/// remove Factor from the tree and return the new tree.
Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
if (!BO) return 0;
-
+
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(BO, Tree);
SmallVector<ValueEntry, 8> Factors;
- LinearizeExprTree(BO, Factors);
+ Factors.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Factors.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
bool FoundFactor = false;
bool NeedsNegate = false;
@@ -587,7 +981,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
Factors.erase(Factors.begin()+i);
break;
}
-
+
// If this is a negative version of this factor, remove it.
if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor))
if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
@@ -597,29 +991,28 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
break;
}
}
-
+
if (!FoundFactor) {
// Make sure to restore the operands to the expression tree.
RewriteExprTree(BO, Factors);
return 0;
}
-
+
BasicBlock::iterator InsertPt = BO; ++InsertPt;
-
+
// If this was just a single multiply, remove the multiply and return the only
// remaining operand.
if (Factors.size() == 1) {
- ValueRankMap.erase(BO);
- DeadInsts.push_back(BO);
+ RedoInsts.insert(BO);
V = Factors[0].Op;
} else {
RewriteExprTree(BO, Factors);
V = BO;
}
-
+
if (NeedsNegate)
V = BinaryOperator::CreateNeg(V, "neg", InsertPt);
-
+
return V;
}
@@ -629,31 +1022,16 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
/// Ops is the top-level list of add operands we're trying to factor.
static void FindSingleUseMultiplyFactors(Value *V,
SmallVectorImpl<Value*> &Factors,
- const SmallVectorImpl<ValueEntry> &Ops,
- bool IsRoot) {
- BinaryOperator *BO;
- if (!(V->hasOneUse() || V->use_empty()) || // More than one use.
- !(BO = dyn_cast<BinaryOperator>(V)) ||
- BO->getOpcode() != Instruction::Mul) {
+ const SmallVectorImpl<ValueEntry> &Ops) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+ if (!BO) {
Factors.push_back(V);
return;
}
-
- // If this value has a single use because it is another input to the add
- // tree we're reassociating and we dropped its use, it actually has two
- // uses and we can't factor it.
- if (!IsRoot) {
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- if (Ops[i].Op == V) {
- Factors.push_back(V);
- return;
- }
- }
-
-
+
// Otherwise, add the LHS and RHS to the list of factors.
- FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops, false);
- FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops, false);
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
}
/// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor'
@@ -673,12 +1051,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
if (FoundX != i) {
if (Opcode == Instruction::And) // ...&X&~X = 0
return Constant::getNullValue(X->getType());
-
+
if (Opcode == Instruction::Or) // ...|X|~X = -1
return Constant::getAllOnesValue(X->getType());
}
}
-
+
// Next, check for duplicate pairs of values, which we assume are next to
// each other, due to our sorting criteria.
assert(i < Ops.size());
@@ -690,12 +1068,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
++NumAnnihil;
continue;
}
-
+
// Drop pairs of values for Xor.
assert(Opcode == Instruction::Xor);
if (e == 2)
return Constant::getNullValue(Ops[0].Op->getType());
-
+
// Y ^ X^X -> Y
Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
i -= 1; e -= 2;
@@ -728,46 +1106,46 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
Ops.erase(Ops.begin()+i);
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
-
+
DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
++NumFactor;
-
+
// Insert a new multiply.
Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound);
Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I);
-
+
// Now that we have inserted a multiply, optimize it. This allows us to
// handle cases that require multiple factoring steps, such as this:
// (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
- RedoInsts.push_back(Mul);
-
+ RedoInsts.insert(cast<Instruction>(Mul));
+
// If every add operand was a duplicate, return the multiply.
if (Ops.empty())
return Mul;
-
+
// Otherwise, we had some input that didn't have the dupe, such as
// "A + A + B" -> "A*2 + B". Add the new multiply to the list of
// things being added by this operation.
Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
-
+
--i;
e = Ops.size();
continue;
}
-
+
// Check for X and -X in the operand list.
if (!BinaryOperator::isNeg(TheOp))
continue;
-
+
Value *X = BinaryOperator::getNegArgument(TheOp);
unsigned FoundX = FindInOperandList(Ops, i, X);
if (FoundX == i)
continue;
-
+
// Remove X and -X from the operand list.
if (Ops.size() == 2)
return Constant::getNullValue(X->getType());
-
+
Ops.erase(Ops.begin()+i);
if (i < FoundX)
--FoundX;
@@ -778,37 +1156,37 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
--i; // Revisit element.
e -= 2; // Removed two elements.
}
-
+
// Scan the operand list, checking to see if there are any common factors
// between operands. Consider something like A*A+A*B*C+D. We would like to
// reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
// To efficiently find this, we count the number of times a factor occurs
// for any ADD operands that are MULs.
DenseMap<Value*, unsigned> FactorOccurrences;
-
+
// Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
// where they are actually the same multiply.
unsigned MaxOcc = 0;
Value *MaxOccVal = 0;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op);
- if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty())
+ BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+ if (!BOp)
continue;
-
+
// Compute all of the factors of this added value.
SmallVector<Value*, 8> Factors;
- FindSingleUseMultiplyFactors(BOp, Factors, Ops, true);
+ FindSingleUseMultiplyFactors(BOp, Factors, Ops);
assert(Factors.size() > 1 && "Bad linearize!");
-
+
// Add one to FactorOccurrences for each unique factor in this op.
SmallPtrSet<Value*, 8> Duplicates;
for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
Value *Factor = Factors[i];
if (!Duplicates.insert(Factor)) continue;
-
+
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
-
+
// If Factor is a negative constant, add the negated value as a factor
// because we can percolate the negate out. Watch for minint, which
// cannot be positivified.
@@ -817,13 +1195,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
assert(!Duplicates.count(Factor) &&
"Shouldn't have two constant factors, missed a canonicalize");
-
+
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
}
}
}
-
+
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
@@ -831,16 +1209,16 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
// Create a new instruction that uses the MaxOccVal twice. If we don't do
// this, we could otherwise run into situations where removing a factor
- // from an expression will drop a use of maxocc, and this can cause
+ // from an expression will drop a use of maxocc, and this can cause
// RemoveFactorFromExpression on successive values to behave differently.
Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
SmallVector<WeakVH, 4> NewMulOps;
for (unsigned i = 0; i != Ops.size(); ++i) {
// Only try to remove factors from expressions we're allowed to.
- BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op);
- if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty())
+ BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+ if (!BOp)
continue;
-
+
if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
// The factorized operand may occur several times. Convert them all in
// one fell swoop.
@@ -854,7 +1232,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
--i;
}
}
-
+
// No need for extra uses anymore.
delete DummyInst;
@@ -866,26 +1244,201 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
// A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
(void)NumAddedValues;
- V = ReassociateExpression(cast<BinaryOperator>(V));
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ RedoInsts.insert(VI);
// Create the multiply.
- Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
+ Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
// Rerun associate on the multiply in case the inner expression turned into
// a multiply. We want to make sure that we keep things in canonical form.
- V2 = ReassociateExpression(cast<BinaryOperator>(V2));
-
+ RedoInsts.insert(V2);
+
// If every add operand included the factor (e.g. "A*B + A*C"), then the
// entire result expression is just the multiply "A*(B+C)".
if (Ops.empty())
return V2;
-
+
// Otherwise, we had some input that didn't have the factor, such as
// "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
// things being added by this operation.
Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
}
-
+
+ return 0;
+}
+
+namespace {
+ /// \brief Predicate tests whether a ValueEntry's op is in a map.
+ struct IsValueInMap {
+ const DenseMap<Value *, unsigned> &Map;
+
+ IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {}
+
+ bool operator()(const ValueEntry &Entry) {
+ return Map.find(Entry.Op) != Map.end();
+ }
+ };
+}
+
+/// \brief Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power.
+///
+/// (x*x) -> [(x, 2)]
+/// ((x*x)*x) -> [(x, 3)]
+/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
+ // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+ // Compute the sum of powers of simplifiable factors.
+ unsigned FactorPowerSum = 0;
+ for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ // Track for simplification all factors which occur 2 or more times.
+ if (Count > 1)
+ FactorPowerSum += Count;
+ }
+
+ // We can only simplify factors if the sum of the powers of our simplifiable
+ // factors is 4 or higher. When that is the case, we will *always* have
+ // a simplification. This is an important invariant to prevent cyclically
+ // trying to simplify already minimal formations.
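+ // For example, (x*x)*y has FactorPowerSum 2 and is left alone, while
+ // x*x*y*y has FactorPowerSum 4 and is always simplifiable: with t = x*y it
+ // becomes t*t, trading three multiplies for two.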
+ if (FactorPowerSum < 4)
+ return false;
+
+ // Now gather the simplifiable factors, removing them from Ops.
+ FactorPowerSum = 0;
+ for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ if (Count == 1)
+ continue;
+ // Move an even number of occurrences to Factors.
+ Count &= ~1U;
+ Idx -= Count;
+ FactorPowerSum += Count;
+ Factors.push_back(Factor(Op, Count));
+ Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+ }
+
+ // None of the adjustments above should have reduced the sum of factor powers
+ // below our minimum of '4'.
+ assert(FactorPowerSum >= 4);
+
+ std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+ return true;
+}
+
+/// \brief Build a tree of multiplies, computing the product of Ops.
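+///
+/// Operands are consumed from the back of Ops, so for example Ops = [a, b, c]
+/// produces the left-heavy tree (c*b)*a, roughly (names illustrative):
+///
+///   %t0 = mul i32 %c, %b
+///   %t1 = mul i32 %t0, %a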
+static Value *buildMultiplyTree(IRBuilder<> &Builder,
+ SmallVectorImpl<Value*> &Ops) {
+ if (Ops.size() == 1)
+ return Ops.back();
+
+ Value *LHS = Ops.pop_back_val();
+ do {
+ LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+ } while (!Ops.empty());
+
+ return LHS;
+}
+
+/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value.
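+///
+/// For example, Factors = [(x, 3), (y, 2)], i.e. x^3 * y^2: the odd power
+/// contributes x to the outer product, the halved powers become [(x, 1),
+/// (y, 1)], the equal-power bases fold into t = x*y, and the result is
+/// emitted as (t*t)*x, three multiplies instead of the naive four.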
+Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ SmallVectorImpl<Factor> &Factors) {
+ assert(Factors[0].Power);
+ SmallVector<Value *, 4> OuterProduct;
+ for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+ Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+ if (Factors[Idx].Power != Factors[LastIdx].Power) {
+ LastIdx = Idx;
+ continue;
+ }
+
+ // We want to multiply across all the factors with the same power so that
+ // we can raise them to that power as a single entity. Build a mini tree
+ // for that.
+ SmallVector<Value *, 4> InnerProduct;
+ InnerProduct.push_back(Factors[LastIdx].Base);
+ do {
+ InnerProduct.push_back(Factors[Idx].Base);
+ ++Idx;
+ } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+ // Reset the base value of the first factor to the new expression tree.
+ // We'll remove all the factors with the same power in a second pass.
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+ if (Instruction *MI = dyn_cast<Instruction>(M))
+ RedoInsts.insert(MI);
+
+ LastIdx = Idx;
+ }
+ // Unique factors with equal powers -- we've folded them into the first one's
+ // base.
+ Factors.erase(std::unique(Factors.begin(), Factors.end(),
+ Factor::PowerEqual()),
+ Factors.end());
+
+ // Iteratively collect the base of each factor with an add power into the
+ // outer product, and halve each power in preparation for squaring the
+ // expression.
+ for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+ if (Factors[Idx].Power & 1)
+ OuterProduct.push_back(Factors[Idx].Base);
+ Factors[Idx].Power >>= 1;
+ }
+ if (Factors[0].Power) {
+ Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+ OuterProduct.push_back(SquareRoot);
+ OuterProduct.push_back(SquareRoot);
+ }
+ if (OuterProduct.size() == 1)
+ return OuterProduct.front();
+
+ Value *V = buildMultiplyTree(Builder, OuterProduct);
+ return V;
+}
+
+Value *Reassociate::OptimizeMul(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // We can only optimize the multiplies when there are at least four operands,
+ // such that a balanced tree might require fewer total multiplies.
+ if (Ops.size() < 4)
+ return 0;
+
+ // Try to turn linear trees of multiplies without other uses of the
+ // intermediate stages into minimal multiply DAGs with perfect sub-expression
+ // re-use.
+ SmallVector<Factor, 4> Factors;
+ if (!collectMultiplyFactors(Ops, Factors))
+ return 0; // All distinct factors, so nothing left for us to do.
+
+ IRBuilder<> Builder(I);
+ Value *V = buildMinimalMultiplyDAG(Builder, Factors);
+ if (Ops.empty())
+ return V;
+
+ ValueEntry NewEntry = ValueEntry(getRank(V), V);
+ Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
return 0;
}
@@ -893,95 +1446,105 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Now that we have the linearized expression tree, try to optimize it.
// Start by folding any constants that we found.
- bool IterateOptimization = false;
if (Ops.size() == 1) return Ops[0].Op;
unsigned Opcode = I->getOpcode();
-
- if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op))
- if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) {
- Ops.pop_back();
- Ops.back().Op = ConstantExpr::get(Opcode, V1, V2);
- return OptimizeExpression(I, Ops);
- }
-
- // Check for destructive annihilation due to a constant being used.
- if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op))
- switch (Opcode) {
- default: break;
- case Instruction::And:
- if (CstVal->isZero()) // X & 0 -> 0
- return CstVal;
- if (CstVal->isAllOnesValue()) // X & -1 -> X
- Ops.pop_back();
- break;
- case Instruction::Mul:
- if (CstVal->isZero()) { // X * 0 -> 0
- ++NumAnnihil;
- return CstVal;
- }
-
- if (cast<ConstantInt>(CstVal)->isOne())
- Ops.pop_back(); // X * 1 -> X
- break;
- case Instruction::Or:
- if (CstVal->isAllOnesValue()) // X | -1 -> -1
- return CstVal;
- // FALLTHROUGH!
- case Instruction::Add:
- case Instruction::Xor:
- if (CstVal->isZero()) // X [|^+] 0 -> X
- Ops.pop_back();
- break;
- }
- if (Ops.size() == 1) return Ops[0].Op;
// Handle destructive annihilation due to identities between elements in the
// argument list here.
+ unsigned NumOps = Ops.size();
switch (Opcode) {
default: break;
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
- unsigned NumOps = Ops.size();
+ case Instruction::Xor:
if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
return Result;
- IterateOptimization |= Ops.size() != NumOps;
break;
- }
- case Instruction::Add: {
- unsigned NumOps = Ops.size();
+ case Instruction::Add:
if (Value *Result = OptimizeAdd(I, Ops))
return Result;
- IterateOptimization |= Ops.size() != NumOps;
- }
+ break;
+ case Instruction::Mul:
+ if (Value *Result = OptimizeMul(I, Ops))
+ return Result;
break;
- //case Instruction::Mul:
}
- if (IterateOptimization)
+ if (Ops.size() != NumOps)
return OptimizeExpression(I, Ops);
return 0;
}
+/// EraseInst - Zap the given instruction, adding interesting operands to the
+/// work list.
+void Reassociate::EraseInst(Instruction *I) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+ // Erase the dead instruction.
+ ValueRankMap.erase(I);
+ RedoInsts.remove(I);
+ I->eraseFromParent();
+ // Optimize its operands.
+ SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
+ // If this is a node in an expression tree, climb to the expression root
+ // and add that since that's where optimization actually happens.
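+ // For example, if Op is %a = add %x, %y whose only use is %s = add %a, %b,
+ // and %s's only use is %t = add %s, %c, then the root %t is queued rather
+ // than %a itself.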
+ unsigned Opcode = Op->getOpcode();
+ while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode &&
+ Visited.insert(Op))
+ Op = Op->use_back();
+ RedoInsts.insert(Op);
+ }
+}
+
+/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed.
+void Reassociate::OptimizeInst(Instruction *I) {
+ // Only consider operations that we understand.
+ if (!isa<BinaryOperator>(I))
+ return;
-/// ReassociateInst - Inspect and reassociate the instruction at the
-/// given position, post-incrementing the position.
-void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) {
- Instruction *BI = BBI++;
- if (BI->getOpcode() == Instruction::Shl &&
- isa<ConstantInt>(BI->getOperand(1)))
- if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) {
+ if (I->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(I->getOperand(1)))
+ // If an operand of this shift is a reassociable multiply, or if the shift
+ // is used by a reassociable multiply or add, turn into a multiply.
+ if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+ (I->hasOneUse() &&
+ (isReassociableOp(I->use_back(), Instruction::Mul) ||
+ isReassociableOp(I->use_back(), Instruction::Add)))) {
+ Instruction *NI = ConvertShiftToMul(I);
+ RedoInsts.insert(I);
MadeChange = true;
- BI = NI;
+ I = NI;
+ }
+
+ // Floating point binary operators are not associative, but we can still
+ // commute (some) of them, to canonicalize the order of their operands.
+ // This can potentially expose more CSE opportunities, and makes writing
+ // other transformations simpler.
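+ // For example, (fmul %b, %a) is canonicalized to (fmul %a, %b) when %a has
+ // the lower rank, making equivalent products textually identical.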
+ if (I->getType()->isFloatingPointTy() || I->getType()->isVectorTy()) {
+ // FAdd and FMul can be commuted.
+ if (I->getOpcode() != Instruction::FMul &&
+ I->getOpcode() != Instruction::FAdd)
+ return;
+
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ unsigned LHSRank = getRank(LHS);
+ unsigned RHSRank = getRank(RHS);
+
+ // Sort the operands by rank.
+ if (RHSRank < LHSRank) {
+ I->setOperand(0, RHS);
+ I->setOperand(1, LHS);
}
- // Reject cases where it is pointless to do this.
- if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() ||
- BI->getType()->isVectorTy())
- return; // Floating point ops are not associative.
+ return;
+ }
// Do not reassociate boolean (i1) expressions. We want to preserve the
// original order of evaluation for short-circuited comparisons that
@@ -989,58 +1552,66 @@ void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) {
// is not further optimized, it is likely to be transformed back to a
// short-circuited form for code gen, and the source order may have been
// optimized for the most likely conditions.
- if (BI->getType()->isIntegerTy(1))
+ if (I->getType()->isIntegerTy(1))
return;
// If this is a subtract instruction which is not already in negate form,
// see if we can convert it to X+-Y.
- if (BI->getOpcode() == Instruction::Sub) {
- if (ShouldBreakUpSubtract(BI)) {
- BI = BreakUpSubtract(BI, ValueRankMap);
- // Reset the BBI iterator in case BreakUpSubtract changed the
- // instruction it points to.
- BBI = BI;
- ++BBI;
+ if (I->getOpcode() == Instruction::Sub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I);
+ RedoInsts.insert(I);
MadeChange = true;
- } else if (BinaryOperator::isNeg(BI)) {
+ I = NI;
+ } else if (BinaryOperator::isNeg(I)) {
// Otherwise, this is a negation. See if the operand is a multiply tree
// and if this is not an inner node of a multiply tree.
- if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
- (!BI->hasOneUse() ||
- !isReassociableOp(BI->use_back(), Instruction::Mul))) {
- BI = LowerNegateToMultiply(BI, ValueRankMap);
+ if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->use_back(), Instruction::Mul))) {
+ Instruction *NI = LowerNegateToMultiply(I);
+ RedoInsts.insert(I);
MadeChange = true;
+ I = NI;
}
}
}
- // If this instruction is a commutative binary operator, process it.
- if (!BI->isAssociative()) return;
- BinaryOperator *I = cast<BinaryOperator>(BI);
+ // If this instruction is an associative binary operator, process it.
+ if (!I->isAssociative()) return;
+ BinaryOperator *BO = cast<BinaryOperator>(I);
// If this is an interior node of a reassociable tree, ignore it until we
// get to the root of the tree, to avoid N^2 analysis.
- if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode()))
+ unsigned Opcode = BO->getOpcode();
+ if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode)
return;
- // If this is an add tree that is used by a sub instruction, ignore it
+ // If this is an add tree that is used by a sub instruction, ignore it
// until we process the subtract.
- if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
- cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+ cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub)
return;
- ReassociateExpression(I);
+ ReassociateExpression(BO);
}
-Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
-
+void Reassociate::ReassociateExpression(BinaryOperator *I) {
+
// First, walk the expression tree, linearizing the tree, collecting the
// operand information.
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(I, Tree);
SmallVector<ValueEntry, 8> Ops;
- LinearizeExprTree(I, Ops);
-
+ Ops.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Ops.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
-
+
// Now that we have linearized the tree to a list and have gathered all of
// the operands and their ranks, sort the operands by their rank. Use a
// stable_sort so that values with equal ranks will have their relative
@@ -1048,21 +1619,24 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
// this sorts so that the highest ranking values end up at the beginning of
// the vector.
std::stable_sort(Ops.begin(), Ops.end());
-
+
// OptimizeExpression - Now that we have the expression tree in a convenient
// sorted form, optimize it globally if possible.
if (Value *V = OptimizeExpression(I, Ops)) {
+ if (V == I)
+ // Self-referential expression in unreachable code.
+ return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
I->replaceAllUsesWith(V);
if (Instruction *VI = dyn_cast<Instruction>(V))
VI->setDebugLoc(I->getDebugLoc());
- RemoveDeadBinaryOp(I);
+ RedoInsts.insert(I);
++NumAnnihil;
- return V;
+ return;
}
-
+
// We want to sink immediates as deeply as possible except in the case where
// this is a multiply tree used only by an add, and the immediate is a -1.
// In this case we reassociate to put the negation on the outside so that we
@@ -1074,51 +1648,57 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
ValueEntry Tmp = Ops.pop_back_val();
Ops.insert(Ops.begin(), Tmp);
}
-
+
DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
-
+
if (Ops.size() == 1) {
+ if (Ops[0].Op == I)
+ // Self-referential expression in unreachable code.
+ return;
+
// This expression tree simplified to something that isn't a tree,
// eliminate it.
I->replaceAllUsesWith(Ops[0].Op);
if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
OI->setDebugLoc(I->getDebugLoc());
- RemoveDeadBinaryOp(I);
- return Ops[0].Op;
+ RedoInsts.insert(I);
+ return;
}
-
+
// Now that we ordered and optimized the expressions, splat them back into
// the expression tree, removing any unneeded nodes.
RewriteExprTree(I, Ops);
- return I;
}
-
bool Reassociate::runOnFunction(Function &F) {
- // Recalculate the rank map for F
+ // Calculate the rank map for F
BuildRankMap(F);
MadeChange = false;
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
- for (BasicBlock::iterator BBI = FI->begin(); BBI != FI->end(); )
- ReassociateInst(BBI);
-
- // Now that we're done, revisit any instructions which are likely to
- // have secondary reassociation opportunities.
- while (!RedoInsts.empty())
- if (Value *V = RedoInsts.pop_back_val()) {
- BasicBlock::iterator BBI = cast<Instruction>(V);
- ReassociateInst(BBI);
- }
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ // Optimize every instruction in the basic block.
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; )
+ if (isInstructionTriviallyDead(II)) {
+ EraseInst(II++);
+ } else {
+ OptimizeInst(II);
+ assert(II->getParent() == BI && "Moved to a different block!");
+ ++II;
+ }
- // Now that we're done, delete any instructions which are no longer used.
- while (!DeadInsts.empty())
- if (Value *V = DeadInsts.pop_back_val())
- RecursivelyDeleteTriviallyDeadInstructions(V);
+ // If this produced extra instructions to optimize, handle them now.
+ while (!RedoInsts.empty()) {
+ Instruction *I = RedoInsts.pop_back_val();
+ if (isInstructionTriviallyDead(I))
+ EraseInst(I);
+ else
+ OptimizeInst(I);
+ }
+ }
// We are done with the rank map.
RankMap.clear();
ValueRankMap.clear();
+
return MadeChange;
}
-
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index 47afc77..ea1de63 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file demotes all registers to memory references. It is intented to be
+// This file demotes all registers to memory references. It is intended to be
// the inverse of PromoteMemoryToRegister. By converting to loads, the only
// values live across basic blocks are allocas and loads before phi nodes.
// It is intended that this should make CFG hacking much easier.
@@ -59,7 +59,7 @@ namespace {
virtual bool runOnFunction(Function &F);
};
}
-
+
char RegToMem::ID = 0;
INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
false, false)
@@ -68,25 +68,25 @@ INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
false, false)
bool RegToMem::runOnFunction(Function &F) {
- if (F.isDeclaration())
+ if (F.isDeclaration())
return false;
-
+
// Insert all new allocas into entry block.
BasicBlock *BBEntry = &F.getEntryBlock();
assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
"Entry block to function must not have predecessors!");
-
+
// Find first non-alloca instruction and create insertion point. This is
  // safe if the block is well-formed: it always has a terminator; otherwise
  // we'll get an assertion.
BasicBlock::iterator I = BBEntry->begin();
while (isa<AllocaInst>(I)) ++I;
-
+
CastInst *AllocaInsertionPoint =
new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())),
Type::getInt32Ty(F.getContext()),
"reg2mem alloca point", I);
-
+
// Find the escaped instructions. But don't create stack slots for
// allocas in entry block.
std::list<Instruction*> WorkList;
@@ -99,15 +99,15 @@ bool RegToMem::runOnFunction(Function &F) {
WorkList.push_front(&*iib);
}
}
-
+
// Demote escaped instructions
NumRegsDemoted += WorkList.size();
- for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+ for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
ile = WorkList.end(); ilb != ile; ++ilb)
DemoteRegToStack(**ilb, false, AllocaInsertionPoint);
-
+
WorkList.clear();
-
+
// Find all phi's
for (Function::iterator ibb = F.begin(), ibe = F.end();
ibb != ibe; ++ibb)
@@ -115,19 +115,18 @@ bool RegToMem::runOnFunction(Function &F) {
iib != iie; ++iib)
if (isa<PHINode>(iib))
WorkList.push_front(&*iib);
-
+
// Demote phi nodes
NumPhisDemoted += WorkList.size();
- for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+ for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
ile = WorkList.end(); ilb != ile; ++ilb)
DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint);
-
+
return true;
}
// createDemoteRegisterToMemory - Provide an entry point to create this pass.
-//
char &llvm::DemoteRegisterToMemoryID = RegToMem::ID;
FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
return new RegToMem();
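
For orientation, the demotion this pass performs is easiest to picture in source terms. A hedged C++ analogy (the pass actually rewrites LLVM IR, not C++):

int before(bool c) {
  int v = c ? 1 : 2; // v is live across "blocks" (a phi, in SSA terms)
  return v + 1;
}

int after(bool c) {
  int slot;        // the alloca reg2mem creates in the entry block
  if (c)
    slot = 1;      // every definition becomes a store to the slot
  else
    slot = 2;
  return slot + 1; // every use becomes a load from the slot
}
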
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 16b64a5..2c39aab 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -409,7 +409,7 @@ private:
if (Constant *C = dyn_cast<Constant>(V)) {
Constant *Elt = C->getAggregateElement(i);
-
+
if (Elt == 0)
LV.markOverdefined(); // Unknown sort of constant.
else if (isa<UndefValue>(Elt))
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 7d65bcc..48318c8 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
// implements several scalar transformations over the LLVM intermediate
// representation, including the C bindings for that library.
//
@@ -24,7 +24,7 @@
using namespace llvm;
-/// initializeScalarOptsPasses - Initialize all passes linked into the
+/// initializeScalarOptsPasses - Initialize all passes linked into the
/// ScalarOpts library.
void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCEPass(Registry);
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 026fea1..6637126 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -22,33 +22,34 @@
#define DEBUG_TYPE "scalarrepl"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
+#include "llvm/Operator.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/DebugInfo.h"
-#include "llvm/Analysis/DIBuilder.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
STATISTIC(NumReplaced, "Number of allocas broken up");
@@ -59,12 +60,25 @@ STATISTIC(NumGlobals, "Number of allocas copied from constant global");
namespace {
struct SROA : public FunctionPass {
- SROA(int T, bool hasDT, char &ID)
+ SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT)
: FunctionPass(ID), HasDomTree(hasDT) {
if (T == -1)
SRThreshold = 128;
else
SRThreshold = T;
+ if (ST == -1)
+ StructMemberThreshold = 32;
+ else
+ StructMemberThreshold = ST;
+ if (AT == -1)
+ ArrayElementThreshold = 8;
+ else
+ ArrayElementThreshold = AT;
+ if (SLT == -1)
+ // Do not limit the scalar integer load size if no threshold is given.
+ ScalarLoadThreshold = -1;
+ else
+ ScalarLoadThreshold = SLT;
}
bool runOnFunction(Function &F);
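
The constructor above applies one convention four times: a caller-supplied value of -1 selects the built-in default. Restated with a hypothetical helper (pickThreshold does not exist in the tree; the defaults match the code above):

static unsigned pickThreshold(int Requested, unsigned Default) {
  return Requested == -1 ? Default : static_cast<unsigned>(Requested);
}

// Defaults as visible in the patch:
//   SRThreshold           = pickThreshold(T,   128);
//   StructMemberThreshold = pickThreshold(ST,  32);
//   ArrayElementThreshold = pickThreshold(AT,  8);
//   ScalarLoadThreshold   = pickThreshold(SLT, ~0u); // -1 == unlimited
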
@@ -86,11 +100,11 @@ namespace {
struct AllocaInfo {
/// The alloca to promote.
AllocaInst *AI;
-
+
/// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite
/// looping and avoid redundant work.
SmallPtrSet<PHINode*, 8> CheckedPHIs;
-
+
/// isUnsafe - This is set to true if the alloca cannot be SROA'd.
bool isUnsafe : 1;
@@ -104,19 +118,32 @@ namespace {
/// ever accessed, or false if the alloca is only accessed with mem
/// intrinsics or load/store that only access the entire alloca at once.
bool hasSubelementAccess : 1;
-
+
/// hasALoadOrStore - This is true if there are any loads or stores to it.
/// The alloca may just be accessed with memcpy, for example, which would
/// not set this.
bool hasALoadOrStore : 1;
-
+
explicit AllocaInfo(AllocaInst *ai)
: AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
hasSubelementAccess(false), hasALoadOrStore(false) {}
};
+  /// SRThreshold - The maximum alloca size to be considered for SROA.
unsigned SRThreshold;
+ /// StructMemberThreshold - The maximum number of members a struct can
+ /// contain to be considered for SROA.
+ unsigned StructMemberThreshold;
+
+ /// ArrayElementThreshold - The maximum number of elements an array can
+ /// have to be considered for SROA.
+ unsigned ArrayElementThreshold;
+
+ /// ScalarLoadThreshold - The maximum size in bits of scalars to load when
+  /// converting to a scalar integer.
+ unsigned ScalarLoadThreshold;
+
void MarkUnsafe(AllocaInfo &I, Instruction *User) {
I.isUnsafe = true;
DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n');
@@ -155,19 +182,21 @@ namespace {
SmallVector<AllocaInst*, 32> &NewElts);
void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
SmallVector<AllocaInst*, 32> &NewElts);
+ bool ShouldAttemptScalarRepl(AllocaInst *AI);
static MemTransferInst *isOnlyCopiedFromConstantGlobal(
AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete);
};
-
+
// SROA_DT - SROA that uses DominatorTree.
struct SROA_DT : public SROA {
static char ID;
public:
- SROA_DT(int T = -1) : SROA(T, true, ID) {
+ SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+ SROA(T, true, ID, ST, AT, SLT) {
initializeSROA_DTPass(*PassRegistry::getPassRegistry());
}
-
+
// getAnalysisUsage - This pass does not require any passes, but we know it
// will not alter the CFG, so say so.
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -175,22 +204,23 @@ namespace {
AU.setPreservesCFG();
}
};
-
+
// SROA_SSAUp - SROA that uses SSAUpdater.
struct SROA_SSAUp : public SROA {
static char ID;
public:
- SROA_SSAUp(int T = -1) : SROA(T, false, ID) {
+ SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+ SROA(T, false, ID, ST, AT, SLT) {
initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
}
-
+
// getAnalysisUsage - This pass does not require any passes, but we know it
// will not alter the CFG, so say so.
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
}
};
-
+
}
char SROA_DT::ID = 0;
@@ -209,10 +239,15 @@ INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
// Public interface to the ScalarReplAggregates pass
FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
- bool UseDomTree) {
+ bool UseDomTree,
+ int StructMemberThreshold,
+ int ArrayElementThreshold,
+ int ScalarLoadThreshold) {
if (UseDomTree)
- return new SROA_DT(Threshold);
- return new SROA_SSAUp(Threshold);
+ return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold,
+ ScalarLoadThreshold);
+ return new SROA_SSAUp(Threshold, StructMemberThreshold,
+ ArrayElementThreshold, ScalarLoadThreshold);
}
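
An illustrative call into the widened factory, assuming only the declaration from llvm/Transforms/Scalar.h is in scope; passing -1 for each threshold keeps the defaults shown earlier in the patch:

#include "llvm/Transforms/Scalar.h" // declares the factory below

llvm::FunctionPass *makeSROA() {
  return llvm::createScalarReplAggregatesPass(
      /*Threshold=*/-1, /*UseDomTree=*/true,
      /*StructMemberThreshold=*/-1, /*ArrayElementThreshold=*/-1,
      /*ScalarLoadThreshold=*/-1); // -1 keeps every knob at its default
}
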
@@ -228,6 +263,7 @@ class ConvertToScalarInfo {
/// AllocaSize - The size of the alloca being considered in bytes.
unsigned AllocaSize;
const TargetData &TD;
+ unsigned ScalarLoadThreshold;
/// IsNotTrivial - This is set to true if there is some access to the object
/// which means that mem2reg can't promote it.
@@ -258,28 +294,38 @@ class ConvertToScalarInfo {
/// isn't possible to turn into a vector type, it gets set to VoidTy.
VectorType *VectorTy;
- /// HadNonMemTransferAccess - True if there is at least one access to the
+ /// HadNonMemTransferAccess - True if there is at least one access to the
/// alloca that is not a MemTransferInst. We don't want to turn structs into
/// large integers unless there is some potential for optimization.
bool HadNonMemTransferAccess;
+ /// HadDynamicAccess - True if some element of this alloca was dynamic.
+ /// We don't yet have support for turning a dynamic access into a large
+ /// integer.
+ bool HadDynamicAccess;
+
public:
- explicit ConvertToScalarInfo(unsigned Size, const TargetData &td)
- : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown),
- VectorTy(0), HadNonMemTransferAccess(false) { }
+ explicit ConvertToScalarInfo(unsigned Size, const TargetData &td,
+ unsigned SLT)
+ : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false),
+ ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false),
+ HadDynamicAccess(false) { }
AllocaInst *TryConvert(AllocaInst *AI);
private:
- bool CanConvertToScalar(Value *V, uint64_t Offset);
+ bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx);
void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
- void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
+ void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
+ Value *NonConstantIdx);
Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
- uint64_t Offset, IRBuilder<> &Builder);
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder);
Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
- uint64_t Offset, IRBuilder<> &Builder);
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder);
};
} // end anonymous namespace.
@@ -290,7 +336,7 @@ private:
AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
// If we can't convert this scalar, or if mem2reg can trivially do it, bail
// out.
- if (!CanConvertToScalar(AI, 0) || !IsNotTrivial)
+ if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial)
return 0;
// If an alloca has only memset / memcpy uses, it may still have an Unknown
@@ -315,16 +361,27 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
NewTy = VectorTy; // Use the vector type.
} else {
unsigned BitWidth = AllocaSize * 8;
+
+ // Do not convert to scalar integer if the alloca size exceeds the
+ // scalar load threshold.
+ if (BitWidth > ScalarLoadThreshold)
+ return 0;
+
if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
!HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth))
return 0;
+  // Dynamic accesses on integers aren't yet supported. They would need us to
+  // shift by a dynamic amount, and we might not even know whether a left or a
+  // right shift is required.
+ if (ScalarKind == Integer && HadDynamicAccess)
+ return 0;
DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
// Create and insert the integer alloca.
NewTy = IntegerType::get(AI->getContext(), BitWidth);
}
AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
- ConvertUsesToScalar(AI, NewAI, 0);
+ ConvertUsesToScalar(AI, NewAI, 0, 0);
return NewAI;
}
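
The two new bail-outs in TryConvert apply to the integer fallback path only; a standalone restatement as one predicate (toy free function, not the real member):

static bool mayUseScalarInteger(unsigned AllocaSizeBytes,
                                unsigned ScalarLoadThreshold,
                                bool HadDynamicAccess) {
  unsigned BitWidth = AllocaSizeBytes * 8;
  if (BitWidth > ScalarLoadThreshold)
    return false; // too big to load/store as a single integer
  if (HadDynamicAccess)
    return false; // would need a variable shift amount; unsupported
  return true;
}
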
@@ -411,7 +468,8 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
///
/// If we see at least one access to the value that is as a vector type, set the
/// SawVec flag.
-bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
+bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
+ Value* NonConstantIdx) {
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
Instruction *User = cast<Instruction>(*UI);
@@ -441,24 +499,35 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
if (!onlyUsedByLifetimeMarkers(BCI))
IsNotTrivial = true; // Can't be mem2reg'd.
- if (!CanConvertToScalar(BCI, Offset))
+ if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
return false;
continue;
}
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
    // If this is a GEP with variable indices, we can't handle it.
- if (!GEP->hasAllConstantIndices())
+ PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
+ if (!PtrTy)
return false;
// Compute the offset that this GEP adds to the pointer.
SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
- if (!GEP->getPointerOperandType()->isPointerTy())
- return false;
- uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
+ Value *GEPNonConstantIdx = 0;
+ if (!GEP->hasAllConstantIndices()) {
+ if (!isa<VectorType>(PtrTy->getElementType()))
+ return false;
+ if (NonConstantIdx)
+ return false;
+ GEPNonConstantIdx = Indices.pop_back_val();
+ if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
+ return false;
+ HadDynamicAccess = true;
+ } else
+ GEPNonConstantIdx = NonConstantIdx;
+ uint64_t GEPOffset = TD.getIndexedOffset(PtrTy,
Indices);
// See if all uses can be converted.
- if (!CanConvertToScalar(GEP, Offset+GEPOffset))
+ if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
return false;
IsNotTrivial = true; // Can't be mem2reg'd.
HadNonMemTransferAccess = true;
@@ -468,6 +537,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
// If this is a constant sized memset of a constant value (e.g. 0) we can
// handle it.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+ // Store to dynamic index.
+ if (NonConstantIdx)
+ return false;
// Store of constant value.
if (!isa<ConstantInt>(MSI->getValue()))
return false;
@@ -492,6 +564,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
// If this is a memcpy or memmove into or out of the whole allocation, we
// can handle it like a load or store of the scalar type.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+ // Store to dynamic index.
+ if (NonConstantIdx)
+ return false;
ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0)
return false;
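
Collecting the conditions scattered above: a GEP chain may carry at most one non-constant index, it must be the trailing operand, it must be 32 bits wide, and it must index into a vector. A sketch of that checklist over toy data:

#include <cstddef>
#include <vector>

struct GepIdx { bool IsConstant; bool IsI32; bool IndexesVector; };

static bool acceptDynamicGep(const std::vector<GepIdx> &Idx,
                             bool ChainAlreadyDynamic) {
  for (std::size_t i = 0; i < Idx.size(); ++i) {
    if (Idx[i].IsConstant)
      continue;
    // One dynamic index per chain, trailing position only, i32 only,
    // and only when it steps into a vector.
    if (i + 1 != Idx.size() || ChainAlreadyDynamic ||
        !Idx[i].IsI32 || !Idx[i].IndexesVector)
      return false;
  }
  return true;
}
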
@@ -523,12 +598,13 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right. By the end of this, there should be no uses of Ptr.
void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
- uint64_t Offset) {
+ uint64_t Offset,
+ Value* NonConstantIdx) {
while (!Ptr->use_empty()) {
Instruction *User = cast<Instruction>(Ptr->use_back());
if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
- ConvertUsesToScalar(CI, NewAI, Offset);
+ ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
CI->eraseFromParent();
continue;
}
@@ -536,9 +612,16 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
// Compute the offset that this GEP adds to the pointer.
SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+ Value* GEPNonConstantIdx = 0;
+ if (!GEP->hasAllConstantIndices()) {
+ assert(!NonConstantIdx &&
+ "Dynamic GEP reading from dynamic GEP unsupported");
+ GEPNonConstantIdx = Indices.pop_back_val();
+ } else
+ GEPNonConstantIdx = NonConstantIdx;
uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
Indices);
- ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
+ ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx);
GEP->eraseFromParent();
continue;
}
@@ -549,7 +632,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// The load is a bit extract from NewAI shifted right by Offset bits.
Value *LoadedVal = Builder.CreateLoad(NewAI);
Value *NewLoadVal
- = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
+ = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
+ NonConstantIdx, Builder);
LI->replaceAllUsesWith(NewLoadVal);
LI->eraseFromParent();
continue;
@@ -559,7 +643,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
assert(SI->getOperand(0) != Ptr && "Consistency error!");
Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
- Builder);
+ NonConstantIdx, Builder);
Builder.CreateStore(New, NewAI);
SI->eraseFromParent();
@@ -574,6 +658,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// transform it into a store of the expanded constant value.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
assert(MSI->getRawDest() == Ptr && "Consistency error!");
+ assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
unsigned NumBytes = static_cast<unsigned>(SNumBytes);
@@ -590,7 +675,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
Value *New = ConvertScalar_InsertValue(
ConstantInt::get(User->getContext(), APVal),
- Old, Offset, Builder);
+ Old, Offset, 0, Builder);
Builder.CreateStore(New, NewAI);
// If the load we just inserted is now dead, then the memset overwrote
@@ -606,6 +691,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// can handle it like a load or store of the scalar type.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
assert(Offset == 0 && "must be store to start of alloca");
+ assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");
// If the source and destination are both to the same alloca, then this is
// a noop copy-to-self, just delete it. Otherwise, emit a load and store
@@ -678,7 +764,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
/// shifted to the right.
Value *ConvertToScalarInfo::
ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
- uint64_t Offset, IRBuilder<> &Builder) {
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder) {
// If the load is of the whole new alloca, no conversion is needed.
Type *FromType = FromVal->getType();
if (FromType == ToType && Offset == 0)
@@ -700,7 +787,17 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
}
// Return the element extracted out of it.
- Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt));
+ Value *Idx;
+ if (NonConstantIdx) {
+ if (Elt)
+ Idx = Builder.CreateAdd(NonConstantIdx,
+ Builder.getInt32(Elt),
+ "dyn.offset");
+ else
+ Idx = NonConstantIdx;
+ } else
+ Idx = Builder.getInt32(Elt);
+ Value *V = Builder.CreateExtractElement(FromVal, Idx);
if (V->getType() != ToType)
V = Builder.CreateBitCast(V, ToType);
return V;
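
The "dyn.offset" add above is plain index arithmetic: the constant element implied by the byte offset is folded into the runtime index so the extract (and the matching insert later) addresses the intended lane. In isolation:

// Lane addressed by a dynamic extract/insert: the constant element
// derived from Offset/EltSize, plus the runtime index.
static unsigned effectiveLane(unsigned ConstElt, unsigned RuntimeIdx) {
  return ConstElt + RuntimeIdx; // e.g. base element 2, dynamic i -> 2 + i
}
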
@@ -709,23 +806,27 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
// If ToType is a first class aggregate, extract out each of the pieces and
// use insertvalue's to form the FCA.
if (StructType *ST = dyn_cast<StructType>(ToType)) {
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into struct types not supported");
const StructLayout &Layout = *TD.getStructLayout(ST);
Value *Res = UndefValue::get(ST);
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
Offset+Layout.getElementOffsetInBits(i),
- Builder);
+ 0, Builder);
Res = Builder.CreateInsertValue(Res, Elt, i);
}
return Res;
}
if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into array types not supported");
uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
Value *Res = UndefValue::get(AT);
for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
- Offset+i*EltSize, Builder);
+ Offset+i*EltSize, 0, Builder);
Res = Builder.CreateInsertValue(Res, Elt, i);
}
return Res;
@@ -791,9 +892,14 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.
+///
+/// NonConstantIdx is an index value if there was a GEP with a non-constant
+/// index value. If this is 0 then all GEPs used to find this insert address
+/// are constant.
Value *ConvertToScalarInfo::
ConvertScalar_InsertValue(Value *SV, Value *Old,
- uint64_t Offset, IRBuilder<> &Builder) {
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder) {
// Convert the stored type to the actual type, shift it left to insert
// then 'or' into place.
Type *AllocaType = Old->getType();
@@ -814,26 +920,40 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
SV = Builder.CreateBitCast(SV, EltTy);
uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy);
unsigned Elt = Offset/EltSize;
- return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt));
+ Value *Idx;
+ if (NonConstantIdx) {
+ if (Elt)
+ Idx = Builder.CreateAdd(NonConstantIdx,
+ Builder.getInt32(Elt),
+ "dyn.offset");
+ else
+ Idx = NonConstantIdx;
+ } else
+ Idx = Builder.getInt32(Elt);
+ return Builder.CreateInsertElement(Old, SV, Idx);
}
// If SV is a first-class aggregate value, insert each value recursively.
if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into struct types not supported");
const StructLayout &Layout = *TD.getStructLayout(ST);
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
Value *Elt = Builder.CreateExtractValue(SV, i);
Old = ConvertScalar_InsertValue(Elt, Old,
Offset+Layout.getElementOffsetInBits(i),
- Builder);
+ 0, Builder);
}
return Old;
}
if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into array types not supported");
uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
Value *Elt = Builder.CreateExtractValue(SV, i);
- Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
+ Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder);
}
return Old;
}
@@ -935,7 +1055,7 @@ public:
AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
DIBuilder *DB)
: LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {}
-
+
void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
// Remember which alloca we're promoting (for isInstInList).
this->AI = AI;
@@ -950,18 +1070,18 @@ public:
LoadAndStorePromoter::run(Insts);
AI->eraseFromParent();
- for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(),
+ for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(),
E = DDIs.end(); I != E; ++I) {
DbgDeclareInst *DDI = *I;
DDI->eraseFromParent();
}
- for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(),
+ for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(),
E = DVIs.end(); I != E; ++I) {
DbgValueInst *DVI = *I;
DVI->eraseFromParent();
}
}
-
+
virtual bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction*> &Insts) const {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
@@ -970,7 +1090,7 @@ public:
}
virtual void updateDebugInfo(Instruction *Inst) const {
- for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
+ for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
E = DDIs.end(); I != E; ++I) {
DbgDeclareInst *DDI = *I;
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
@@ -978,7 +1098,7 @@ public:
else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
ConvertDebugDeclareToDebugValue(DDI, LI, *DIB);
}
- for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
+ for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
E = DVIs.end(); I != E; ++I) {
DbgValueInst *DVI = *I;
Value *Arg = NULL;
@@ -1021,12 +1141,12 @@ public:
static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
-
+
for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
UI != UE; ++UI) {
LoadInst *LI = dyn_cast<LoadInst>(*UI);
if (LI == 0 || !LI->isSimple()) return false;
-
+
    // Both operands to the select need to be dereferenceable, either absolutely
// (e.g. allocas) or at this point because we can see other accesses to it.
if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
@@ -1036,7 +1156,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
LI->getAlignment(), TD))
return false;
}
-
+
return true;
}
@@ -1067,20 +1187,20 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
UI != UE; ++UI) {
LoadInst *LI = dyn_cast<LoadInst>(*UI);
if (LI == 0 || !LI->isSimple()) return false;
-
+
// For now we only allow loads in the same block as the PHI. This is a
// common case that happens when instcombine merges two loads through a PHI.
if (LI->getParent() != BB) return false;
-
+
// Ensure that there are no instructions between the PHI and the load that
// could store.
for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI)
if (BBI->mayWriteToMemory())
return false;
-
+
MaxAlign = std::max(MaxAlign, LI->getAlignment());
}
-
+
// Okay, we know that we have one or more loads in the same block as the PHI.
// We can transform this if it is safe to push the loads into the predecessor
// blocks. The only thing to watch out for is that we can't put a possibly
@@ -1108,10 +1228,10 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
if (InVal->isDereferenceablePointer() ||
isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD))
continue;
-
+
return false;
}
-
+
return true;
}
@@ -1123,7 +1243,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
SetVector<Instruction*, SmallVector<Instruction*, 4>,
SmallPtrSet<Instruction*, 4> > InstsToRewrite;
-
+
for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
UI != UE; ++UI) {
User *U = *UI;
@@ -1132,7 +1252,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
return false;
continue;
}
-
+
if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
if (SI->getOperand(0) == AI || !SI->isSimple())
return false; // Don't allow a store OF the AI, only INTO the AI.
@@ -1146,7 +1266,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
Value *Result = SI->getOperand(1+CI->isZero());
SI->replaceAllUsesWith(Result);
SI->eraseFromParent();
-
+
// This is very rare and we just scrambled the use list of AI, start
// over completely.
return tryToMakeAllocaBePromotable(AI, TD);
@@ -1156,33 +1276,33 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
// loads, then we can transform this by rewriting the select.
if (!isSafeSelectToSpeculate(SI, TD))
return false;
-
+
InstsToRewrite.insert(SI);
continue;
}
-
+
if (PHINode *PN = dyn_cast<PHINode>(U)) {
if (PN->use_empty()) { // Dead PHIs can be stripped.
InstsToRewrite.insert(PN);
continue;
}
-
+
// If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
// in the pred blocks, then we can transform this by rewriting the PHI.
if (!isSafePHIToSpeculate(PN, TD))
return false;
-
+
InstsToRewrite.insert(PN);
continue;
}
-
+
if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
if (onlyUsedByLifetimeMarkers(BCI)) {
InstsToRewrite.insert(BCI);
continue;
}
}
-
+
return false;
}
@@ -1190,7 +1310,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
// we're done!
if (InstsToRewrite.empty())
return true;
-
+
// If we have instructions that need to be rewritten for this to be promotable
// take care of it now.
for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
@@ -1211,13 +1331,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
// loads with a new select.
while (!SI->use_empty()) {
LoadInst *LI = cast<LoadInst>(SI->use_back());
-
+
IRBuilder<> Builder(LI);
- LoadInst *TrueLoad =
+ LoadInst *TrueLoad =
Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
- LoadInst *FalseLoad =
+ LoadInst *FalseLoad =
Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
-
+
// Transfer alignment and TBAA info if present.
TrueLoad->setAlignment(LI->getAlignment());
FalseLoad->setAlignment(LI->getAlignment());
@@ -1225,18 +1345,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
}
-
+
Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
V->takeName(LI);
LI->replaceAllUsesWith(V);
LI->eraseFromParent();
}
-
+
// Now that all the loads are gone, the select is gone too.
SI->eraseFromParent();
continue;
}
-
+
// Otherwise, we have a PHI node which allows us to push the loads into the
// predecessors.
PHINode *PN = cast<PHINode>(InstsToRewrite[i]);
@@ -1244,7 +1364,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
PN->eraseFromParent();
continue;
}
-
+
Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
PN->getName()+".ld", PN);
@@ -1254,18 +1374,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
LoadInst *SomeLoad = cast<LoadInst>(PN->use_back());
MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
unsigned Align = SomeLoad->getAlignment();
-
+
// Rewrite all loads of the PN to use the new PHI.
while (!PN->use_empty()) {
LoadInst *LI = cast<LoadInst>(PN->use_back());
LI->replaceAllUsesWith(NewPN);
LI->eraseFromParent();
}
-
+
// Inject loads into all of the pred blocks. Keep track of which blocks we
// insert them into in case we have multiple edges from the same block.
DenseMap<BasicBlock*, LoadInst*> InsertedLoads;
-
+
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *Pred = PN->getIncomingBlock(i);
LoadInst *&Load = InsertedLoads[Pred];
@@ -1276,13 +1396,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
Load->setAlignment(Align);
if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
}
-
+
NewPN->addIncoming(Load, Pred);
}
-
+
PN->eraseFromParent();
}
-
+
++NumAdjusted;
return true;
}
@@ -1315,7 +1435,7 @@ bool SROA::performPromotion(Function &F) {
SSAUpdater SSA;
for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
AllocaInst *AI = Allocas[i];
-
+
// Build list of instructions to promote.
for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
UI != E; ++UI)
@@ -1334,18 +1454,36 @@ bool SROA::performPromotion(Function &F) {
/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
/// SROA. It must be a struct or array type with a small number of elements.
-static bool ShouldAttemptScalarRepl(AllocaInst *AI) {
+bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
Type *T = AI->getAllocatedType();
- // Do not promote any struct into more than 32 separate vars.
+ // Do not promote any struct that has too many members.
if (StructType *ST = dyn_cast<StructType>(T))
- return ST->getNumElements() <= 32;
- // Arrays are much less likely to be safe for SROA; only consider
- // them if they are very small.
+ return ST->getNumElements() <= StructMemberThreshold;
+ // Do not promote any array that has too many elements.
if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getNumElements() <= 8;
+ return AT->getNumElements() <= ArrayElementThreshold;
return false;
}
+/// getPointeeAlignment - Compute the minimum alignment of the value pointed
+/// to by the given pointer.
+static unsigned getPointeeAlignment(Value *V, const TargetData &TD) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::BitCast ||
+ (CE->getOpcode() == Instruction::GetElementPtr &&
+ cast<GEPOperator>(CE)->hasAllZeroIndices()))
+ return getPointeeAlignment(CE->getOperand(0), TD);
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ if (!GV->isDeclaration())
+ return TD.getPreferredAlignment(GV);
+
+ if (PointerType *PT = dyn_cast<PointerType>(V->getType()))
+ return TD.getABITypeAlignment(PT->getElementType());
+
+ return 0;
+}
+
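
getPointeeAlignment recurses through alignment-preserving wrappers, prefers the known alignment of a defined global, and otherwise falls back to the pointee type's ABI alignment; the result feeds the new gate in performScalarRepl below, which forwards a constant global only when the alloca demanded no more alignment than the global provides. A sketch over toy data (the real code walks ConstantExprs and GlobalVariables):

struct PtrInfo {
  const PtrInfo *Stripped = nullptr; // bitcast / all-zero-GEP operand
  unsigned GlobalPreferredAlign = 0; // >0 for a defined global
  unsigned AbiAlignOfPointee = 0;
};

static unsigned pointeeAlignment(const PtrInfo &P) {
  if (P.Stripped)
    return pointeeAlignment(*P.Stripped);
  if (P.GlobalPreferredAlign)
    return P.GlobalPreferredAlign;
  return P.AbiAlignOfPointee; // 0 when nothing is known
}

// The gate added in performScalarRepl, in these terms:
static bool safeToForwardGlobal(unsigned AllocaAlign, const PtrInfo &Src) {
  return AllocaAlign <= pointeeAlignment(Src);
}
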
// performScalarRepl - This algorithm is a simple worklist driven algorithm,
// which runs on all of the alloca instructions in the function, removing them
@@ -1379,23 +1517,26 @@ bool SROA::performScalarRepl(Function &F) {
continue;
// Check to see if this allocation is only modified by a memcpy/memmove from
- // a constant global. If this is the case, we can change all users to use
+ // a constant global whose alignment is equal to or exceeds that of the
+ // allocation. If this is the case, we can change all users to use
// the constant global instead. This is commonly produced by the CFE by
// constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
// is only subsequently read.
SmallVector<Instruction *, 4> ToDelete;
if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) {
- DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n');
- DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
- for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
- ToDelete[i]->eraseFromParent();
- Constant *TheSrc = cast<Constant>(Copy->getSource());
- AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
- Copy->eraseFromParent(); // Don't mutate the global.
- AI->eraseFromParent();
- ++NumGlobals;
- Changed = true;
- continue;
+ if (AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) {
+ DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n');
+ DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
+ ToDelete[i]->eraseFromParent();
+ Constant *TheSrc = cast<Constant>(Copy->getSource());
+ AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
+ Copy->eraseFromParent(); // Don't mutate the global.
+ AI->eraseFromParent();
+ ++NumGlobals;
+ Changed = true;
+ continue;
+ }
}
// Check to see if we can perform the core SROA transformation. We cannot
@@ -1425,8 +1566,8 @@ bool SROA::performScalarRepl(Function &F) {
// promoted itself. If so, we don't want to transform it needlessly. Note
// that we can't just check based on the type: the alloca may be of an i32
// but that has pointer arithmetic to set byte 3 of it or something.
- if (AllocaInst *NewAI =
- ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) {
+ if (AllocaInst *NewAI = ConvertToScalarInfo(
+ (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) {
NewAI->takeName(AI);
AI->eraseFromParent();
++NumConverted;
@@ -1531,12 +1672,12 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
LIType, false, Info, LI, true /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
-
+
} else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
// Store is ok if storing INTO the pointer, not storing the pointer
if (!SI->isSimple() || SI->getOperand(0) == I)
return MarkUnsafe(Info, User);
-
+
Type *SIType = SI->getOperand(0)->getType();
isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
SIType, true, Info, SI, true /*AllowWholeAccess*/);
@@ -1553,7 +1694,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
if (Info.isUnsafe) return;
}
}
-
+
/// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer
/// derived from the alloca, we can often still split the alloca into elements.
@@ -1570,10 +1711,10 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
if (PHINode *PN = dyn_cast<PHINode>(I))
if (!Info.CheckedPHIs.insert(PN))
return;
-
+
for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
Instruction *User = cast<Instruction>(*UI);
-
+
if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
isSafePHISelectUseForScalarRepl(BC, Offset, Info);
} else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
@@ -1590,12 +1731,12 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
LIType, false, Info, LI, false /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
-
+
} else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
// Store is ok if storing INTO the pointer, not storing the pointer
if (!SI->isSimple() || SI->getOperand(0) == I)
return MarkUnsafe(Info, User);
-
+
Type *SIType = SI->getOperand(0)->getType();
isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
SIType, true, Info, SI, false /*AllowWholeAccess*/);
@@ -1619,6 +1760,8 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI);
if (GEPIt == E)
return;
+ bool NonConstant = false;
+ unsigned NonConstantIdxSize = 0;
// Walk through the GEP type indices, checking the types that this indexes
// into.
@@ -1628,15 +1771,30 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
continue;
ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand());
- if (!IdxVal)
- return MarkUnsafe(Info, GEPI);
+ if (!IdxVal) {
+      // Non-constant GEPs are only a problem on arrays, structs, and
+      // pointers; vectors can be dynamically indexed.
+      // FIXME: Add support for dynamic indexing on arrays. This should be
+      // OK on any subarray of the alloca array, e.g., a[0][i] is OK, but
+      // a[i][0] isn't.
+ if (!(*GEPIt)->isVectorTy())
+ return MarkUnsafe(Info, GEPI);
+ NonConstant = true;
+ NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt);
+ }
}
// Compute the offset due to this GEP and check if the alloca has a
// component element at that offset.
SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  // If this GEP is non-constant then the last operand must have been a
+ // dynamic index into a vector. Pop this now as it has no impact on the
+ // constant part of the offset.
+ if (NonConstant)
+ Indices.pop_back();
Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
- if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0))
+ if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset,
+ NonConstantIdxSize))
MarkUnsafe(Info, GEPI);
}
@@ -1741,6 +1899,12 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
if (Offset >= AT->getNumElements() * EltSize)
return false;
Offset %= EltSize;
+ } else if (VectorType *VT = dyn_cast<VectorType>(T)) {
+ EltTy = VT->getElementType();
+ EltSize = TD->getTypeAllocSize(EltTy);
+ if (Offset >= VT->getNumElements() * EltSize)
+ return false;
+ Offset %= EltSize;
} else {
return false;
}
@@ -1766,12 +1930,12 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
RewriteBitCast(BC, AI, Offset, NewElts);
continue;
}
-
+
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
RewriteGEP(GEPI, AI, Offset, NewElts);
continue;
}
-
+
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
uint64_t MemSize = Length->getZExtValue();
@@ -1790,10 +1954,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
}
continue;
}
-
+
if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
Type *LIType = LI->getType();
-
+
if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
// Replace:
// %res = load { i32, i32 }* %alloc
@@ -1819,7 +1983,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
}
continue;
}
-
+
if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
Value *Val = SI->getOperand(0);
Type *SIType = Val->getType();
@@ -1846,16 +2010,16 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
}
continue;
}
-
+
if (isa<SelectInst>(User) || isa<PHINode>(User)) {
- // If we have a PHI user of the alloca itself (as opposed to a GEP or
+ // If we have a PHI user of the alloca itself (as opposed to a GEP or
// bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to
// the new pointer.
if (!isa<AllocaInst>(I)) continue;
-
+
assert(Offset == 0 && NewElts[0] &&
"Direct alloca use should have a zero offset");
-
+
// If we have a use of the alloca, we know the derived uses will be
// utilizing just the first element of the scalarized result. Insert a
// bitcast of the first alloca before the user as required.
@@ -1908,9 +2072,16 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
Offset -= Layout->getElementOffset(Idx);
IdxTy = Type::getInt32Ty(T->getContext());
return Idx;
+ } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+ T = AT->getElementType();
+ uint64_t EltSize = TD->getTypeAllocSize(T);
+ Idx = Offset / EltSize;
+ Offset -= Idx * EltSize;
+ IdxTy = Type::getInt64Ty(T->getContext());
+ return Idx;
}
- ArrayType *AT = cast<ArrayType>(T);
- T = AT->getElementType();
+ VectorType *VT = cast<VectorType>(T);
+ T = VT->getElementType();
uint64_t EltSize = TD->getTypeAllocSize(T);
Idx = Offset / EltSize;
Offset -= Idx * EltSize;
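
FindElementAndOffset's handling of the homogeneous cases (arrays, and vectors after this patch) is a single divide-and-remainder step; restated standalone:

#include <cstdint>

static std::uint64_t splitOffset(std::uint64_t &Offset,
                                 std::uint64_t EltSize) {
  std::uint64_t Idx = Offset / EltSize; // element the offset lands in
  Offset -= Idx * EltSize;              // leftover offset inside it
  return Idx;
}
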
@@ -1925,6 +2096,13 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
SmallVector<AllocaInst*, 32> &NewElts) {
uint64_t OldOffset = Offset;
SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  // If the GEP was dynamic then it must have been a dynamic vector lookup.
+  // In that case the dynamic index must be the last GEP operand, so set it
+  // aside until we've found the constant part of the offset, then add it
+  // back in at the end.
+ Value* NonConstantIdx = 0;
+ if (!GEPI->hasAllConstantIndices())
+ NonConstantIdx = Indices.pop_back_val();
Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
@@ -1951,6 +2129,17 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy);
NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
}
+ if (NonConstantIdx) {
+ Type* GepTy = T;
+ // This GEP has a dynamic index. We need to add "i32 0" to index through
+ // any structs or arrays in the original type until we get to the vector
+ // to index.
+ while (!isa<VectorType>(GepTy)) {
+ NewArgs.push_back(Constant::getNullValue(i32Ty));
+ GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U);
+ }
+ NewArgs.push_back(NonConstantIdx);
+ }
Instruction *Val = NewElts[Idx];
if (NewArgs.size() > 1) {
Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
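
The zero-index loop above rebuilds a valid GEP: descend through every aggregate level wrapping the vector with "i32 0", then append the dynamic lane last. A list-building sketch (AggregateDepth is assumed known; the real code derives it by walking the type):

#include <vector>

static std::vector<int> rebuildIndices(std::vector<int> ConstantPart,
                                       unsigned AggregateDepth,
                                       int DynamicLane) {
  for (unsigned d = 0; d != AggregateDepth; ++d)
    ConstantPart.push_back(0);         // the "i32 0" steps in the patch
  ConstantPart.push_back(DynamicLane); // NonConstantIdx goes last
  return ConstantPart;
}
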
@@ -2202,7 +2391,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
IRBuilder<> Builder(SI);
-
+
// Handle tail padding by extending the operand
if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
SrcVal = Builder.CreateZExt(SrcVal,
@@ -2464,7 +2653,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
return false;
}
}
-
+
return true;
}
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index a66b3e3..d13e4ab 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -67,7 +67,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
// nodes.
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
(*SI)->removePredecessor(BB);
-
+
// Insert a call to llvm.trap right before this. This turns the undefined
// behavior into a hard fail instead of falling through into random code.
if (UseLLVMTrap) {
@@ -77,7 +77,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
CallTrap->setDebugLoc(I->getDebugLoc());
}
new UnreachableInst(I->getContext(), I);
-
+
// All instructions after this are dead.
BasicBlock::iterator BBI = I, BBE = BB->end();
while (BBI != BBE) {
@@ -89,7 +89,6 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
/// ChangeToCall - Convert the specified invoke into a normal call.
static void ChangeToCall(InvokeInst *II) {
- BasicBlock *BB = II->getParent();
SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
NewCall->takeName(II);
@@ -102,19 +101,19 @@ static void ChangeToCall(InvokeInst *II) {
BranchInst::Create(II->getNormalDest(), II);
// Update PHI nodes in the unwind destination
- II->getUnwindDest()->removePredecessor(BB);
- BB->getInstList().erase(II);
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ II->eraseFromParent();
}
static bool MarkAliveBlocks(BasicBlock *BB,
SmallPtrSet<BasicBlock*, 128> &Reachable) {
-
+
SmallVector<BasicBlock*, 128> Worklist;
Worklist.push_back(BB);
bool Changed = false;
do {
BB = Worklist.pop_back_val();
-
+
if (!Reachable.insert(BB))
continue;
@@ -136,7 +135,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
break;
}
}
-
+
// Store to undef and store to null are undefined and used to signal that
// they should be changed to unreachable by passes that can't modify the
// CFG.
@@ -145,7 +144,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
if (SI->isVolatile()) continue;
Value *Ptr = SI->getOperand(1);
-
+
if (isa<UndefValue>(Ptr) ||
(isa<ConstantPointerNull>(Ptr) &&
SI->getPointerAddressSpace() == 0)) {
@@ -157,11 +156,22 @@ static bool MarkAliveBlocks(BasicBlock *BB,
}
// Turn invokes that call 'nounwind' functions into ordinary calls.
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
- if (II->doesNotThrow()) {
- ChangeToCall(II);
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ Value *Callee = II->getCalledValue();
+ if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ ChangeToUnreachable(II, true);
+ Changed = true;
+ } else if (II->doesNotThrow()) {
+ if (II->use_empty() && II->onlyReadsMemory()) {
+          // The call result is unused and the callee only reads memory, so
+          // drop the invoke and jump straight to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ II->eraseFromParent();
+ } else
+ ChangeToCall(II);
Changed = true;
}
+ }
Changed |= ConstantFoldTerminator(BB, true);
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
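
The enlarged invoke handling above makes three distinct decisions; summarized as a hedged decision function (toy booleans standing in for the IR queries):

enum class InvokeFix { ToUnreachable, DropEntirely, ToCall, Keep };

// Decision order mirrors the code above.
static InvokeFix classifyInvoke(bool CalleeIsNullOrUndef, bool NoUnwind,
                                bool UnusedAndReadOnly) {
  if (CalleeIsNullOrUndef)
    return InvokeFix::ToUnreachable; // calling null/undef is undefined
  if (!NoUnwind)
    return InvokeFix::Keep;          // may throw; the invoke must stay
  if (UnusedAndReadOnly)
    return InvokeFix::DropEntirely;  // dead call: branch straight through
  return InvokeFix::ToCall;          // nounwind: plain call + branch
}
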
@@ -170,38 +180,38 @@ static bool MarkAliveBlocks(BasicBlock *BB,
return Changed;
}
-/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even
-/// if they are in a dead cycle. Return true if a change was made, false
+/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
/// otherwise.
static bool RemoveUnreachableBlocksFromFn(Function &F) {
SmallPtrSet<BasicBlock*, 128> Reachable;
bool Changed = MarkAliveBlocks(F.begin(), Reachable);
-
+
// If there are unreachable blocks in the CFG...
if (Reachable.size() == F.size())
return Changed;
-
+
assert(Reachable.size() < F.size());
NumSimpl += F.size()-Reachable.size();
-
+
// Loop over all of the basic blocks that are not reachable, dropping all of
// their internal references...
for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
if (Reachable.count(BB))
continue;
-
+
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
if (Reachable.count(*SI))
(*SI)->removePredecessor(BB);
BB->dropAllReferences();
}
-
+
for (Function::iterator I = ++F.begin(); I != F.end();)
if (!Reachable.count(I))
I = F.getBasicBlockList().erase(I);
else
++I;
-
+
return true;
}
@@ -209,17 +219,17 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) {
/// node) return blocks, merge them together to promote recursive block merging.
static bool MergeEmptyReturnBlocks(Function &F) {
bool Changed = false;
-
+
BasicBlock *RetBlock = 0;
-
+
// Scan all the blocks in the function, looking for empty return blocks.
for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
BasicBlock &BB = *BBI++;
-
+
// Only look at return blocks.
ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
if (Ret == 0) continue;
-
+
// Only look at the block if it is empty or the only other thing in it is a
// single PHI node that is the operand to the return.
if (Ret != &BB.front()) {
@@ -241,21 +251,21 @@ static bool MergeEmptyReturnBlocks(Function &F) {
RetBlock = &BB;
continue;
}
-
+
// Otherwise, we found a duplicate return block. Merge the two.
Changed = true;
-
+
// Case when there is no input to the return or when the returned values
// agree is trivial. Note that they can't agree if there are phis in the
// blocks.
if (Ret->getNumOperands() == 0 ||
- Ret->getOperand(0) ==
+ Ret->getOperand(0) ==
cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
BB.replaceAllUsesWith(RetBlock);
BB.eraseFromParent();
continue;
}
-
+
// If the canonical return block has no PHI node, create one now.
PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
if (RetBlockPHI == 0) {
@@ -264,12 +274,12 @@ static bool MergeEmptyReturnBlocks(Function &F) {
RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
std::distance(PB, PE), "merge",
&RetBlock->front());
-
+
for (pred_iterator PI = PB; PI != PE; ++PI)
RetBlockPHI->addIncoming(InVal, *PI);
RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
}
-
+
// Turn BB into a block that just unconditionally branches to the return
// block. This handles the case when the two return blocks have a common
// predecessor but that return different things.
@@ -277,7 +287,7 @@ static bool MergeEmptyReturnBlocks(Function &F) {
BB.getTerminator()->eraseFromParent();
BranchInst::Create(RetBlock, &BB);
}
-
+
return Changed;
}
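
The effect of MergeEmptyReturnBlocks, restated in source terms (illustrative only; the pass operates on IR):

int before(bool c) {
  if (c) return 1; // two distinct return blocks
  return 2;
}

int after(bool c) {
  int merge = c ? 1 : 2; // the phi the pass synthesizes in the
  return merge;          // single canonical return block
}
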
@@ -288,7 +298,7 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
bool LocalChange = true;
while (LocalChange) {
LocalChange = false;
-
+
// Loop over all of the basic blocks and remove them if they are unneeded...
//
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
@@ -317,7 +327,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
// IterativeSimplifyCFG can (rarely) make some loops dead. If this happens,
// RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should
// iterate between the two optimizations. We structure the code like this to
- // avoid reruning IterativeSimplifyCFG if the second pass of
+  // avoid rerunning IterativeSimplifyCFG if the second pass of
// RemoveUnreachableBlocksFromFn doesn't do anything.
if (!RemoveUnreachableBlocksFromFn(F))
return true;
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index f7b6941..f110320 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -18,20 +18,20 @@
#define DEBUG_TYPE "simplify-libcalls"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host!
using namespace llvm;
@@ -100,7 +100,7 @@ static bool IsOnlyUsedInZeroEqualityComparison(Value *V) {
}
return true;
}
-
+
static bool CallHasFloatingPointArgument(const CallInst *CI) {
for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end();
it != e; ++it) {
@@ -157,14 +157,15 @@ struct StrCatOpt : public LibCallOptimization {
// These optimizations require TargetData.
if (!TD) return 0;
- EmitStrLenMemCpy(Src, Dst, Len, B);
- return Dst;
+ return EmitStrLenMemCpy(Src, Dst, Len, B);
}
- void EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) {
+ Value *EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) {
// We need to find the end of the destination string. That's where the
// memory is to be moved to. We just generate a call to strlen.
- Value *DstLen = EmitStrLen(Dst, B, TD);
+ Value *DstLen = EmitStrLen(Dst, B, TD, TLI);
+ if (!DstLen)
+ return 0;
// Now that we have the destination's length, we must index into the
// destination's pointer to get the actual memcpy destination (end of
@@ -175,6 +176,7 @@ struct StrCatOpt : public LibCallOptimization {
// concatenation for us. Make a memcpy to copy the nul byte with align = 1.
B.CreateMemCpy(CpyDst, Src,
ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1);
+ return Dst;
}
};
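
A pattern that recurs through this file's changes: the Emit* helpers now take TargetLibraryInfo and may return null when the required libcall is unavailable, so every caller must propagate the failure rather than assume success. Schematically (toy types, not the real emitter signatures):

static int *emitHelper(bool LibcallAvailable, int *Result) {
  return LibcallAvailable ? Result : nullptr; // helper may decline
}

static int *tryFold(bool LibcallAvailable, int *Dst) {
  int *V = emitHelper(LibcallAvailable, Dst);
  if (!V)
    return nullptr; // give up; the original libcall stays as-is
  return Dst;       // fold performed
}
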
@@ -221,8 +223,7 @@ struct StrNCatOpt : public StrCatOpt {
// strncat(x, s, c) -> strcat(x, s)
// s is constant so the strcat can be optimized further
- EmitStrLenMemCpy(Src, Dst, SrcLen, B);
- return Dst;
+ return EmitStrLenMemCpy(Src, Dst, SrcLen, B);
}
};
@@ -254,9 +255,9 @@ struct StrChrOpt : public LibCallOptimization {
return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
ConstantInt::get(TD->getIntPtrType(*Context), Len),
- B, TD);
+ B, TD, TLI);
}
-
+
// Otherwise, the character is a constant, see if the first argument is
// a string literal. If so, we can constant fold.
StringRef Str;
@@ -299,7 +300,7 @@ struct StrRChrOpt : public LibCallOptimization {
if (!getConstantStringInfo(SrcStr, Str)) {
// strrchr(s, 0) -> strchr(s, 0)
if (TD && CharC->isZero())
- return EmitStrChr(SrcStr, '\0', B, TD);
+ return EmitStrChr(SrcStr, '\0', B, TD, TLI);
return 0;
}
@@ -355,7 +356,7 @@ struct StrCmpOpt : public LibCallOptimization {
return EmitMemCmp(Str1P, Str2P,
ConstantInt::get(TD->getIntPtrType(*Context),
- std::min(Len1, Len2)), B, TD);
+ std::min(Len1, Len2)), B, TD, TLI);
}
return 0;
@@ -391,7 +392,7 @@ struct StrNCmpOpt : public LibCallOptimization {
return ConstantInt::get(CI->getType(), 0);
if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
- return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD);
+ return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI);
StringRef Str1, Str2;
bool HasStr1 = getConstantStringInfo(Str1P, Str1);
@@ -447,11 +448,10 @@ struct StrCpyOpt : public LibCallOptimization {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
- if (OptChkCall)
- EmitMemCpyChk(Dst, Src,
- ConstantInt::get(TD->getIntPtrType(*Context), Len),
- CI->getArgOperand(2), B, TD);
- else
+ if (!OptChkCall ||
+ !EmitMemCpyChk(Dst, Src,
+ ConstantInt::get(TD->getIntPtrType(*Context), Len),
+ CI->getArgOperand(2), B, TD, TLI))
B.CreateMemCpy(Dst, Src,
ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
return Dst;
@@ -459,6 +459,51 @@ struct StrCpyOpt : public LibCallOptimization {
};
//===---------------------------------------===//
+// 'stpcpy' Optimizations
+
+struct StpCpyOpt : public LibCallOptimization {
+ bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall.
+
+ StpCpyOpt(bool c) : OptChkCall(c) {}
+
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "stpcpy" function prototype.
+ unsigned NumParams = OptChkCall ? 3 : 2;
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != NumParams ||
+ FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != B.getInt8PtrTy())
+ return 0;
+
+ // These optimizations require TargetData.
+ if (!TD) return 0;
+
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
+ Value *StrLen = EmitStrLen(Src, B, TD, TLI);
+ return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
+ }
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+
+ Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len);
+ Value *DstEnd = B.CreateGEP(Dst,
+ ConstantInt::get(TD->getIntPtrType(*Context),
+ Len - 1));
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Make a memcpy to copy the nul byte with align = 1.
+ if (!OptChkCall || !EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B,
+ TD, TLI))
+ B.CreateMemCpy(Dst, Src, LenV, 1);
+ return DstEnd;
+ }
+};
+
+//===---------------------------------------===//
// 'strncpy' Optimizations
struct StrNCpyOpt : public LibCallOptimization {
@@ -565,7 +610,7 @@ struct StrPBrkOpt : public LibCallOptimization {
// strpbrk(s, "a") -> strchr(s, 'a')
if (TD && HasS2 && S2.size() == 1)
- return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD);
+ return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI);
return 0;
}
@@ -654,7 +699,7 @@ struct StrCSpnOpt : public LibCallOptimization {
// strcspn(s, "") -> strlen(s)
if (TD && HasS2 && S2.empty())
- return EmitStrLen(CI->getArgOperand(0), B, TD);
+ return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
return 0;
}
@@ -678,9 +723,13 @@ struct StrStrOpt : public LibCallOptimization {
// fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
- Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD);
+ Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
+ if (!StrLen)
+ return 0;
Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
- StrLen, B, TD);
+ StrLen, B, TD, TLI);
+ if (!StrNCmp)
+ return 0;
for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
UI != UE; ) {
ICmpInst *Old = cast<ICmpInst>(*UI++);
@@ -716,9 +765,10 @@ struct StrStrOpt : public LibCallOptimization {
}
// fold strstr(x, "y") -> strchr(x, 'y').
- if (HasStr2 && ToFindStr.size() == 1)
- return B.CreateBitCast(EmitStrChr(CI->getArgOperand(0),
- ToFindStr[0], B, TD), CI->getType());
+ if (HasStr2 && ToFindStr.size() == 1) {
+ Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
+ return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
+ }
return 0;
}
};
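A note on the pattern running through the hunks above: the BuildLibCalls emitters (EmitStrLen, EmitMemCmp, EmitStrChr, and friends) now consult TargetLibraryInfo and return null when the target lacks the routine, so every fold must check the returned value before substituting it. A minimal standalone sketch of that contract, in plain C++ with illustrative names rather than the LLVM API:

    #include <cstdio>
    #include <set>
    #include <string>

    // Stand-in for TargetLibraryInfo: records which library calls exist.
    struct LibInfoStub {
      std::set<std::string> Available;
      bool has(const std::string &Fn) const { return Available.count(Fn) != 0; }
    };

    // Stand-in for an emitter such as EmitStrLen: refuses to emit a call
    // to a routine the target does not provide.
    const char *emitStrLen(const LibInfoStub &TLI) {
      if (!TLI.has("strlen"))
        return 0;               // caller must treat this as "fold failed"
      return "strlen call";     // placeholder for the created CallInst
    }

    int main() {
      LibInfoStub TLI;
      TLI.Available.insert("strlen");
      if (const char *Call = emitStrLen(TLI)) // the null check the folds
        std::printf("emitted: %s\n", Call);   // above now perform
      return 0;
    }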
@@ -1135,8 +1185,8 @@ struct PrintFOpt : public LibCallOptimization {
// printf("x") -> putchar('x'), even for '%'.
if (FormatStr.size() == 1) {
- Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD);
- if (CI->use_empty()) return CI;
+ Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI);
+ if (CI->use_empty() || !Res) return Res;
return B.CreateIntCast(Res, CI->getType(), true);
}
@@ -1147,26 +1197,26 @@ struct PrintFOpt : public LibCallOptimization {
// pass to be run after this pass, to merge duplicate strings.
FormatStr = FormatStr.drop_back();
Value *GV = B.CreateGlobalString(FormatStr, "str");
- EmitPutS(GV, B, TD);
- return CI->use_empty() ? (Value*)CI :
- ConstantInt::get(CI->getType(), FormatStr.size()+1);
+ Value *NewCI = EmitPutS(GV, B, TD, TLI);
+ return (CI->use_empty() || !NewCI) ?
+ NewCI :
+ ConstantInt::get(CI->getType(), FormatStr.size()+1);
}
// Optimize specific format strings.
// printf("%c", chr) --> putchar(chr)
if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
CI->getArgOperand(1)->getType()->isIntegerTy()) {
- Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD);
- if (CI->use_empty()) return CI;
+ Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI);
+ if (CI->use_empty() || !Res) return Res;
return B.CreateIntCast(Res, CI->getType(), true);
}
// printf("%s\n", str) --> puts(str)
if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
CI->getArgOperand(1)->getType()->isPointerTy()) {
- EmitPutS(CI->getArgOperand(1), B, TD);
- return CI;
+ return EmitPutS(CI->getArgOperand(1), B, TD, TLI);
}
return 0;
}
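For reference, the printf folds in these hunks expressed as C source-level equivalences (a sketch; the pass performs them on IR, and only when the corresponding emitter above succeeds):

    #include <cstdio>

    int main() {
      std::printf("x");          // printf("x")       -> putchar('x')
      std::putchar('x');
      std::printf("%c", 'y');    // printf("%c", c)   -> putchar(c)
      std::putchar('y');
      std::printf("%s\n", "hi"); // printf("%s\n", s) -> puts(s),
      std::puts("hi");           //   valid when the result is unused
      return 0;
    }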
@@ -1253,7 +1303,9 @@ struct SPrintFOpt : public LibCallOptimization {
// sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0;
- Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD);
+ Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI);
+ if (!Len)
+ return 0;
Value *IncLen = B.CreateAdd(Len,
ConstantInt::get(Len->getType(), 1),
"leninc");
@@ -1320,8 +1372,8 @@ struct FWriteOpt : public LibCallOptimization {
// This optimisation is only valid, if the return value is unused.
if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
- EmitFPutC(Char, CI->getArgOperand(3), B, TD);
- return ConstantInt::get(CI->getType(), 1);
+ Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
}
return 0;
@@ -1346,10 +1398,10 @@ struct FPutsOpt : public LibCallOptimization {
// fputs(s,F) --> fwrite(s,1,strlen(s),F)
uint64_t Len = GetStringLength(CI->getArgOperand(0));
if (!Len) return 0;
- EmitFWrite(CI->getArgOperand(0),
- ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
- CI->getArgOperand(1), B, TD, TLI);
- return CI; // Known to have no uses (see above).
+ // Known to have no uses (see above).
+ return EmitFWrite(CI->getArgOperand(0),
+ ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
+ CI->getArgOperand(1), B, TD, TLI);
}
};
@@ -1373,11 +1425,11 @@ struct FPrintFOpt : public LibCallOptimization {
// These optimizations require TargetData.
if (!TD) return 0;
- EmitFWrite(CI->getArgOperand(1),
- ConstantInt::get(TD->getIntPtrType(*Context),
- FormatStr.size()),
- CI->getArgOperand(0), B, TD, TLI);
- return ConstantInt::get(CI->getType(), FormatStr.size());
+ Value *NewCI = EmitFWrite(CI->getArgOperand(1),
+ ConstantInt::get(TD->getIntPtrType(*Context),
+ FormatStr.size()),
+ CI->getArgOperand(0), B, TD, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
}
// The remaining optimizations require the format string to be "%s" or "%c"
@@ -1390,16 +1442,16 @@ struct FPrintFOpt : public LibCallOptimization {
if (FormatStr[1] == 'c') {
// fprintf(F, "%c", chr) --> fputc(chr, F)
if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
- EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD);
- return ConstantInt::get(CI->getType(), 1);
+ Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
+ TD, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
}
if (FormatStr[1] == 's') {
// fprintf(F, "%s", str) --> fputs(str, F)
if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
return 0;
- EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
- return CI;
+ return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
}
return 0;
}
@@ -1450,8 +1502,8 @@ struct PutsOpt : public LibCallOptimization {
if (Str.empty() && CI->use_empty()) {
// puts("") -> putchar('\n')
- Value *Res = EmitPutChar(B.getInt32('\n'), B, TD);
- if (CI->use_empty()) return CI;
+ Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI);
+ if (CI->use_empty() || !Res) return Res;
return B.CreateIntCast(Res, CI->getType(), true);
}
@@ -1470,12 +1522,15 @@ namespace {
///
class SimplifyLibCalls : public FunctionPass {
TargetLibraryInfo *TLI;
-
+
StringMap<LibCallOptimization*> Optimizations;
// String and Memory LibCall Optimizations
StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr;
- StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk;
- StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk;
+ StrCmpOpt StrCmp; StrNCmpOpt StrNCmp;
+ StrCpyOpt StrCpy; StrCpyOpt StrCpyChk;
+ StpCpyOpt StpCpy; StpCpyOpt StpCpyChk;
+ StrNCpyOpt StrNCpy;
+ StrLenOpt StrLen; StrPBrkOpt StrPBrk;
StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr;
MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
// Math Library Optimizations
@@ -1487,11 +1542,12 @@ namespace {
SPrintFOpt SPrintF; PrintFOpt PrintF;
FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF;
PutsOpt Puts;
-
+
bool Modified; // This is only used by doInitialization.
public:
static char ID; // Pass identification
- SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) {
+ SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true),
+ StpCpy(false), StpCpyChk(true) {
initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
@@ -1542,6 +1598,7 @@ void SimplifyLibCalls::InitOptimizations() {
Optimizations["strncmp"] = &StrNCmp;
Optimizations["strcpy"] = &StrCpy;
Optimizations["strncpy"] = &StrNCpy;
+ Optimizations["stpcpy"] = &StpCpy;
Optimizations["strlen"] = &StrLen;
Optimizations["strpbrk"] = &StrPBrk;
Optimizations["strtol"] = &StrTo;
@@ -1561,6 +1618,7 @@ void SimplifyLibCalls::InitOptimizations() {
// _chk variants of String and Memory LibCall Optimizations.
Optimizations["__strcpy_chk"] = &StrCpyChk;
+ Optimizations["__stpcpy_chk"] = &StpCpyChk;
// Math Library Optimizations
Optimizations["cosf"] = &Cos;
@@ -1717,7 +1775,7 @@ void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) {
void SimplifyLibCalls::inferPrototypeAttributes(Function &F) {
FunctionType *FTy = F.getFunctionType();
-
+
StringRef Name = F.getName();
switch (Name[0]) {
case 's':
@@ -1746,6 +1804,7 @@ void SimplifyLibCalls::inferPrototypeAttributes(Function &F) {
Name == "strtold" ||
Name == "strncat" ||
Name == "strncpy" ||
+ Name == "stpncpy" ||
Name == "strtoull") {
if (FTy->getNumParams() < 2 ||
!FTy->getParamType(1)->isPointerTy())
@@ -2406,10 +2465,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) {
// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
//
-// stpcpy:
-// * stpcpy(str, "literal") ->
-// llvm.memcpy(str,"literal",strlen("literal")+1,1)
-//
// strchr:
// * strchr(p, 0) -> strlen(p)
// tan, tanf, tanl:
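Background for the new StpCpyOpt: stpcpy behaves like strcpy but returns a pointer to the destination's terminating nul, which is why the fold can lower a known-length copy to a memcpy that includes the nul plus pointer arithmetic for the return value (and why the stpcpy TODO entry is dropped from the list above). A standalone sketch of the semantics being exploited, with a hypothetical helper name:

    #include <cassert>
    #include <cstring>

    // What stpcpy(Dst, Src) computes when strlen(Src) is known.
    char *stpcpyLike(char *Dst, const char *Src) {
      std::size_t Len = std::strlen(Src);
      std::memcpy(Dst, Src, Len + 1); // copy the nul byte as well
      return Dst + Len;               // the DstEnd value the fold returns
    }

    int main() {
      char Buf[8];
      char *End = stpcpyLike(Buf, "abc");
      assert(End == Buf + 3 && *End == '\0');
      return 0;
    }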
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index ef65c0a..34f1d6c 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -27,6 +27,7 @@
using namespace llvm;
STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
namespace {
class Sinking : public FunctionPass {
@@ -39,9 +40,9 @@ namespace {
Sinking() : FunctionPass(ID) {
initializeSinkingPass(*PassRegistry::getPassRegistry());
}
-
+
virtual bool runOnFunction(Function &F);
-
+
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
FunctionPass::getAnalysisUsage(AU);
@@ -55,9 +56,10 @@ namespace {
bool ProcessBlock(BasicBlock &BB);
bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores);
bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
+ bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
};
} // end anonymous namespace
-
+
char Sinking::ID = 0;
INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
@@ -69,7 +71,7 @@ FunctionPass *llvm::createSinkingPass() { return new Sinking(); }
/// AllUsesDominatedByBlock - Return true if all uses of the specified value
/// occur in blocks dominated by the specified block.
-bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
+bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
BasicBlock *BB) const {
// Ignoring debug uses is necessary so debug info doesn't affect the code.
// This may leave a referencing dbg_value in the original block, before
@@ -98,20 +100,19 @@ bool Sinking::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfo>();
AA = &getAnalysis<AliasAnalysis>();
- bool EverMadeChange = false;
-
- while (1) {
- bool MadeChange = false;
+ bool MadeChange, EverMadeChange = false;
+ do {
+ MadeChange = false;
+ DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
// Process all basic blocks.
- for (Function::iterator I = F.begin(), E = F.end();
+ for (Function::iterator I = F.begin(), E = F.end();
I != E; ++I)
MadeChange |= ProcessBlock(*I);
-
- // If this iteration over the code changed anything, keep iterating.
- if (!MadeChange) break;
- EverMadeChange = true;
- }
+ EverMadeChange |= MadeChange;
+ NumSinkIter++;
+ } while (MadeChange);
+
return EverMadeChange;
}
@@ -120,8 +121,8 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {
if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false;
// Don't bother sinking code out of unreachable blocks. In addition to being
- // unprofitable, it can also lead to infinite looping, because in an unreachable
- // loop there may be nowhere to stop.
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
if (!DT->isReachableFromEntry(&BB)) return false;
bool MadeChange = false;
@@ -133,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {
SmallPtrSet<Instruction *, 8> Stores;
do {
Instruction *Inst = I; // The instruction to sink.
-
+
// Predecrement I (if it's not begin) so that it isn't invalidated by
// sinking.
ProcessedBegin = I == BB.begin();
@@ -145,10 +146,10 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {
if (SinkInstruction(Inst, Stores))
++NumSunk, MadeChange = true;
-
+
// If we just processed the first instruction in the block, we're done.
} while (!ProcessedBegin);
-
+
return MadeChange;
}
@@ -174,6 +175,45 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
return true;
}
+/// IsAcceptableTarget - Return true if it is possible to sink the instruction
+/// into the specified basic block.
+bool Sinking::IsAcceptableTarget(Instruction *Inst,
+ BasicBlock *SuccToSinkTo) const {
+ assert(Inst && "Instruction to be sunk is null");
+ assert(SuccToSinkTo && "Candidate sink target is null");
+
+ // It is not possible to sink an instruction into its own block. This can
+ // happen with loops.
+ if (Inst->getParent() == SuccToSinkTo)
+ return false;
+
+ // If the block has multiple predecessors, this would introduce computation
+ // on different code paths. We could split the critical edge, but for now we
+ // just punt.
+ // FIXME: Split critical edges if not backedges.
+ if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
+ // We cannot sink a load across a critical edge - there may be stores in
+ // other code paths.
+ if (!isSafeToSpeculativelyExecute(Inst))
+ return false;
+
+ // We don't want to sink across a critical edge if we don't dominate the
+ // successor. We could be introducing calculations to new code paths.
+ if (!DT->dominates(Inst->getParent(), SuccToSinkTo))
+ return false;
+
+ // Don't sink instructions into a loop.
+ Loop *succ = LI->getLoopFor(SuccToSinkTo);
+ Loop *cur = LI->getLoopFor(Inst->getParent());
+ if (succ != 0 && succ != cur)
+ return false;
+ }
+
+ // Finally, check that all the uses of the instruction are actually
+ // dominated by the candidate.
+ return AllUsesDominatedByBlock(Inst, SuccToSinkTo);
+}
+
/// SinkInstruction - Determine whether it is safe to sink the specified
/// instruction out of its current block into a successor.
bool Sinking::SinkInstruction(Instruction *Inst,
@@ -181,7 +221,7 @@ bool Sinking::SinkInstruction(Instruction *Inst,
// Check if it's safe to move the instruction.
if (!isSafeToMove(Inst, AA, Stores))
return false;
-
+
// FIXME: This should include support for sinking instructions within the
// block they are currently in to shorten the live ranges. We often get
// instructions sunk into the top of a large block, but it would be better to
@@ -189,86 +229,42 @@ bool Sinking::SinkInstruction(Instruction *Inst,
// be careful not to *increase* register pressure though, e.g. sinking
// "x = y + z" down if it kills y and z would increase the live ranges of y
// and z and only shrink the live range of x.
-
- // Loop over all the operands of the specified instruction. If there is
- // anything we can't handle, bail out.
- BasicBlock *ParentBlock = Inst->getParent();
-
+
// SuccToSinkTo - This is the successor to sink this instruction to, once we
// decide.
BasicBlock *SuccToSinkTo = 0;
-
- // FIXME: This picks a successor to sink into based on having one
- // successor that dominates all the uses. However, there are cases where
- // sinking can happen but where the sink point isn't a successor. For
- // example:
- // x = computation
- // if () {} else {}
- // use x
- // the instruction could be sunk over the whole diamond for the
- // if/then/else (or loop, etc), allowing it to be sunk into other blocks
- // after that.
-
+
// Instructions can only be sunk if all their uses are in blocks
// dominated by one of the successors.
- // Look at all the successors and decide which one
- // we should sink to.
- for (succ_iterator SI = succ_begin(ParentBlock),
- E = succ_end(ParentBlock); SI != E; ++SI) {
- if (AllUsesDominatedByBlock(Inst, *SI)) {
- SuccToSinkTo = *SI;
- break;
- }
+ // Look at all the blocks dominated by this one and see if we can sink into one.
+ DomTreeNode *DTN = DT->getNode(Inst->getParent());
+ for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
+ I != E && SuccToSinkTo == 0; ++I) {
+ BasicBlock *Candidate = (*I)->getBlock();
+ if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
+ IsAcceptableTarget(Inst, Candidate))
+ SuccToSinkTo = Candidate;
+ }
+
+ // If no suitable postdominator was found, look at all the successors and
+ // decide which one we should sink to, if any.
+ for (succ_iterator I = succ_begin(Inst->getParent()),
+ E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) {
+ if (IsAcceptableTarget(Inst, *I))
+ SuccToSinkTo = *I;
}
-
+
// If we couldn't find a block to sink to, ignore this instruction.
if (SuccToSinkTo == 0)
return false;
-
- // It is not possible to sink an instruction into its own block. This can
- // happen with loops.
- if (Inst->getParent() == SuccToSinkTo)
- return false;
-
- DEBUG(dbgs() << "Sink instr " << *Inst);
- DEBUG(dbgs() << "to block ";
- WriteAsOperand(dbgs(), SuccToSinkTo, false));
-
- // If the block has multiple predecessors, this would introduce computation on
- // a path that it doesn't already exist. We could split the critical edge,
- // but for now we just punt.
- // FIXME: Split critical edges if not backedges.
- if (SuccToSinkTo->getUniquePredecessor() != ParentBlock) {
- // We cannot sink a load across a critical edge - there may be stores in
- // other code paths.
- if (!isSafeToSpeculativelyExecute(Inst)) {
- DEBUG(dbgs() << " *** PUNTING: Wont sink load along critical edge.\n");
- return false;
- }
- // We don't want to sink across a critical edge if we don't dominate the
- // successor. We could be introducing calculations to new code paths.
- if (!DT->dominates(ParentBlock, SuccToSinkTo)) {
- DEBUG(dbgs() << " *** PUNTING: Critical edge found\n");
- return false;
- }
-
- // Don't sink instructions into a loop.
- if (LI->isLoopHeader(SuccToSinkTo)) {
- DEBUG(dbgs() << " *** PUNTING: Loop header found\n");
- return false;
- }
+ DEBUG(dbgs() << "Sink" << *Inst << " (";
+ WriteAsOperand(dbgs(), Inst->getParent(), false);
+ dbgs() << " -> ";
+ WriteAsOperand(dbgs(), SuccToSinkTo, false);
+ dbgs() << ")\n");
- // Otherwise we are OK with sinking along a critical edge.
- DEBUG(dbgs() << "Sinking along critical edge.\n");
- }
-
- // Determine where to insert into. Skip phi nodes.
- BasicBlock::iterator InsertPos = SuccToSinkTo->begin();
- while (InsertPos != SuccToSinkTo->end() && isa<PHINode>(InsertPos))
- ++InsertPos;
-
// Move the instruction.
- Inst->moveBefore(InsertPos);
+ Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt());
return true;
}
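Two structural points in the Sink changes above: the driver now reruns the whole sweep until an iteration makes no change (counted by NumSinkIter), and candidate sink targets are chosen first among the blocks immediately dominated by the instruction's block, falling back to plain successors, with both paths filtered through IsAcceptableTarget. The driver's fixed-point shape as a standalone sketch, with illustrative names:

    #include <cstdio>

    // Stand-in for one ProcessBlock sweep over the function; returns true
    // while it still finds something to sink.
    static bool processAllBlocks(int &WorkLeft) {
      if (WorkLeft == 0)
        return false;
      --WorkLeft;
      return true;
    }

    int main() {
      int WorkLeft = 3; // pretend three instructions can still be sunk
      unsigned NumSinkIter = 0;
      bool MadeChange, EverMadeChange = false;
      do {              // same do/while shape as the new runOnFunction
        MadeChange = processAllBlocks(WorkLeft);
        EverMadeChange |= MadeChange;
        ++NumSinkIter;
      } while (MadeChange);
      std::printf("iterations: %u changed: %d\n", NumSinkIter,
                  EverMadeChange ? 1 : 0);
      return 0;
    }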
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index e21eb9d..6557d63 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -172,7 +172,7 @@ bool TailCallElim::runOnFunction(Function &F) {
FunctionContainsEscapingAllocas |=
CheckForEscapingAllocas(BB, CannotTCETailMarkedCall);
}
-
+
/// FIXME: The code generator produces really bad code when an 'escaping
/// alloca' is changed from being a static alloca to being a dynamic alloca.
/// Until this is resolved, disable this transformation if that would ever
@@ -234,7 +234,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
// call does not mod/ref the memory location being processed.
if (I->mayHaveSideEffects()) // This also handles volatile loads.
return false;
-
+
if (LoadInst *L = dyn_cast<LoadInst>(I)) {
// Loads may always be moved above calls without side effects.
if (CI->mayHaveSideEffects()) {
@@ -364,7 +364,7 @@ TailCallElim::FindTRECandidate(Instruction *TI,
if (&BB->front() == TI) // Make sure there is something before the terminator.
return 0;
-
+
// Scan backwards from the return, checking to see if there is a tail call in
// this block. If so, set CI to it.
CallInst *CI = 0;
@@ -388,10 +388,10 @@ TailCallElim::FindTRECandidate(Instruction *TI,
// double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
// and disable this xform in this case, because the code generator will
// lower the call to fabs into inline code.
- if (BB == &F->getEntryBlock() &&
+ if (BB == &F->getEntryBlock() &&
FirstNonDbg(BB->front()) == CI &&
FirstNonDbg(llvm::next(BB->begin())) == TI &&
- callIsSmall(F)) {
+ callIsSmall(CI)) {
// A single-block function with just a call and a return. Check that
// the arguments match.
CallSite::arg_iterator I = CallSite(CI).arg_begin(),
@@ -432,7 +432,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
BasicBlock::iterator BBI = CI;
for (++BBI; &*BBI != Ret; ++BBI) {
if (CanMoveAboveCall(BBI, CI)) continue;
-
+
// If we can't move the instruction above the call, it might be because it
// is an associative and commutative operation that could be transformed
// using accumulator recursion elimination. Check to see if this is the
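On the callIsSmall change above: the guard exists because a one-block function whose body is just a call and a return can look tail-recursive when the callee lowers to inline code, and passing the call instruction rather than the enclosing function matches an interface that now inspects the call site. The comment's own example, spelled out as compilable code (this relies on the GCC/Clang __builtin_fabs extension; the wrapper is renamed here to avoid clashing with the libm declaration):

    #include <cstdio>

    // If this wrapper were itself named fabs, the builtin could lower to a
    // call to fabs and the body would look like a trivial self-call, which
    // is exactly the case tail recursion elimination must not rewrite.
    double fabsWrapper(double f) {
      return __builtin_fabs(f); // usually lowered inline, not a real call
    }

    int main() {
      std::printf("%f\n", fabsWrapper(-2.5)); // prints 2.500000
      return 0;
    }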
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 3859a1a..2679b93 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -659,10 +659,26 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
// If the return instruction returns a value, and if the value was a
// PHI node in "BB", propagate the right value into the return.
for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
- i != e; ++i)
- if (PHINode *PN = dyn_cast<PHINode>(*i))
- if (PN->getParent() == BB)
- *i = PN->getIncomingValueForBlock(Pred);
+ i != e; ++i) {
+ Value *V = *i;
+ Instruction *NewBC = 0;
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
+ // Return value might be bitcasted. Clone and insert it before the
+ // return instruction.
+ V = BCI->getOperand(0);
+ NewBC = BCI->clone();
+ Pred->getInstList().insert(NewRet, NewBC);
+ *i = NewBC;
+ }
+ if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (PN->getParent() == BB) {
+ if (NewBC)
+ NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
+ else
+ *i = PN->getIncomingValueForBlock(Pred);
+ }
+ }
+ }
// Update any PHI nodes in the returning block to realize that we no
// longer branch to them.
@@ -671,12 +687,3 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
return cast<ReturnInst>(NewRet);
}
-/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a
-/// given basic block.
-DebugLoc llvm::GetFirstDebugLocInBasicBlock(const BasicBlock *BB) {
- if (const Instruction *I = BB->getFirstNonPHI())
- return I->getDebugLoc();
- // Scanning entire block may be too expensive, if the first instruction
- // does not have valid location info.
- return DebugLoc();
-}
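The new loop in FoldReturnIntoUncondBranch handles a return value that reaches the return through a bitcast of a PHI: the cast is cloned into the predecessor and repointed at that predecessor's incoming value. A rough source-level analogue of why the cast has to travel with each incoming value (plain C++, purely illustrative):

    #include <cstdio>

    // One merged exit that casts the merged pointer:
    //   P = phi(A, B); return (const void *)P;
    // folds into per-predecessor returns, each casting its own value,
    // which is what cloning the bitcast per predecessor accomplishes.
    static const void *pickErased(bool UseA, const int *A, const long *B) {
      if (UseA)
        return static_cast<const void *>(A); // cast cloned into branch 1
      return static_cast<const void *>(B);   // cast cloned into branch 2
    }

    int main() {
      int X = 1; long Y = 2;
      std::printf("%p %p\n", pickErased(true, &X, &Y),
                  pickErased(false, &X, &Y));
      return 0;
    }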
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 2a8e9b8..6b04e3d 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -122,7 +122,7 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB
/// is the new loop exit block, and DestBB is the old loop exit, now the
/// successor of SplitBB.
-static void createPHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds,
+static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
BasicBlock *SplitBB,
BasicBlock *DestBB) {
// SplitBB shouldn't have anything non-trivial in it yet.
@@ -341,11 +341,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
"Split point for loop exit is contained in loop!");
// Update LCSSA form in the newly created exit block.
- if (P->mustPreserveAnalysisID(LCSSAID)) {
- SmallVector<BasicBlock *, 1> OrigPred;
- OrigPred.push_back(TIBB);
- createPHIsForSplitLoopExit(OrigPred, NewBB, DestBB);
- }
+ if (P->mustPreserveAnalysisID(LCSSAID))
+ createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
// For each unique exit block...
// FIXME: This code is functionally equivalent to the corresponding
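The simplification above works because ArrayRef is implicitly constructible from a single element, so the caller passes TIBB directly instead of building a one-entry SmallVector. A minimal stand-in for the relevant constructors (not the real llvm::ArrayRef):

    #include <cstdio>

    template <typename T> class MiniArrayRef {
      const T *Data;
      unsigned Length;
    public:
      MiniArrayRef(const T &OneElt) : Data(&OneElt), Length(1) {}  // 1 elem
      MiniArrayRef(const T *D, unsigned N) : Data(D), Length(N) {} // N elems
      unsigned size() const { return Length; }
      const T &operator[](unsigned I) const { return Data[I]; }
    };

    static unsigned countPreds(MiniArrayRef<int> Preds) { return Preds.size(); }

    int main() {
      int TIBB = 7; // imagine a BasicBlock *
      std::printf("%u\n", countPreds(TIBB)); // implicit 1-element conversion
      return 0;
    }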
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index a808303..e13fd71 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -12,18 +12,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Type.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Intrinsics.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
+#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
-#include "llvm/Support/IRBuilder.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/ADT/SmallString.h"
using namespace llvm;
@@ -34,7 +34,11 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
/// EmitStrLen - Emit a call to the strlen function to the builder, for the
/// specified pointer. This always returns an integer value of size intptr_t.
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) {
+Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::strlen))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
@@ -42,7 +46,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) {
Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI, 2),
+ Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
NULL);
@@ -53,18 +57,48 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) {
return CI;
}
+/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the
+/// specified pointer. Ptr is required to be some pointer type, MaxLen must
+/// be of size_t type, and the return value has 'intptr_t' type.
+Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
+ const TargetData *TD, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::strnlen))
+ return 0;
+
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ AttributeWithIndex AWI[2];
+ AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+ AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
+ Attribute::NoUnwind);
+
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Constant *StrNLen = M->getOrInsertFunction("strnlen", AttrListPtr::get(AWI),
+ TD->getIntPtrType(Context),
+ B.getInt8PtrTy(),
+ TD->getIntPtrType(Context),
+ NULL);
+ CallInst *CI = B.CreateCall2(StrNLen, CastToCStr(Ptr, B), MaxLen, "strnlen");
+ if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
/// EmitStrChr - Emit a call to the strchr function to the builder, for the
/// specified pointer and character. Ptr is required to be some pointer type,
/// and the return value has 'i8*' type.
Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
- const TargetData *TD) {
+ const TargetData *TD, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::strchr))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI =
AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
Type *I8Ptr = B.getInt8PtrTy();
Type *I32Ty = B.getInt32Ty();
- Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1),
+ Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(AWI),
I8Ptr, I8Ptr, I32Ty, NULL);
CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
ConstantInt::get(I32Ty, C), "strchr");
@@ -75,7 +109,11 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
/// EmitStrNCmp - Emit a call to the strncmp function to the builder.
Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
- IRBuilder<> &B, const TargetData *TD) {
+ IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::strncmp))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
@@ -84,7 +122,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI, 3),
+ Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -101,13 +139,17 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
/// specified pointer arguments.
Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
- const TargetData *TD, StringRef Name) {
+ const TargetData *TD, const TargetLibraryInfo *TLI,
+ StringRef Name) {
+ if (!TLI->has(LibFunc::strcpy))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2),
+ Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI),
I8Ptr, I8Ptr, I8Ptr, NULL);
CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
Name);
@@ -119,13 +161,17 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
/// specified pointer arguments.
Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
- IRBuilder<> &B, const TargetData *TD, StringRef Name) {
+ IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI, StringRef Name) {
+ if (!TLI->has(LibFunc::strncpy))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2),
+ Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI),
I8Ptr, I8Ptr, I8Ptr,
Len->getType(), NULL);
CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
@@ -139,13 +185,17 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
/// are pointers.
Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilder<> &B, const TargetData *TD) {
+ IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::memcpy_chk))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI;
AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCpy = M->getOrInsertFunction("__memcpy_chk",
- AttrListPtr::get(&AWI, 1),
+ AttrListPtr::get(AWI),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -162,12 +212,16 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
- Value *Len, IRBuilder<> &B, const TargetData *TD) {
+ Value *Len, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::memchr))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI;
AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(&AWI, 1),
+ Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
B.getInt32Ty(),
@@ -183,7 +237,11 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
/// EmitMemCmp - Emit a call to the memcmp function.
Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
- Value *Len, IRBuilder<> &B, const TargetData *TD) {
+ Value *Len, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::memcmp))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
@@ -192,7 +250,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI, 3),
+ Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -236,7 +294,11 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
/// is an integer.
-Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD) {
+Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::putchar))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
B.getInt32Ty(), NULL);
@@ -254,33 +316,40 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD) {
/// EmitPutS - Emit a call to the puts function. This assumes that Str is
/// some pointer.
-void llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD) {
+Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::puts))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
- Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI, 2),
+ Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
NULL);
CallInst *CI = B.CreateCall(PutS, CastToCStr(Str, B), "puts");
if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
-
+ return CI;
}
/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
/// an integer and File is a pointer to FILE.
-void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
- const TargetData *TD) {
+Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
+ const TargetData *TD, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::fputc))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
Constant *F;
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI, 2),
+ F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI),
B.getInt32Ty(),
B.getInt32Ty(), File->getType(),
NULL);
@@ -295,12 +364,16 @@ void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
CI->setCallingConv(Fn->getCallingConv());
+ return CI;
}
/// EmitFPutS - Emit a call to the fputs function. Str is required to be a
/// pointer and File is a pointer to FILE.
-void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI) {
+Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
+ const TargetData *TD, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::fputs))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
@@ -309,7 +382,7 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
StringRef FPutsName = TLI->getName(LibFunc::fputs);
Constant *F;
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI, 3),
+ F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
File->getType(), NULL);
@@ -321,13 +394,17 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
CI->setCallingConv(Fn->getCallingConv());
+ return CI;
}
/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
-void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
- IRBuilder<> &B, const TargetData *TD,
- const TargetLibraryInfo *TLI) {
+Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
+ IRBuilder<> &B, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc::fwrite))
+ return 0;
+
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
@@ -337,7 +414,7 @@ void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
StringRef FWriteName = TLI->getName(LibFunc::fwrite);
Constant *F;
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI, 3),
+ F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
TD->getIntPtrType(Context),
@@ -354,11 +431,13 @@ void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
CI->setCallingConv(Fn->getCallingConv());
+ return CI;
}
SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { }
-bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) {
+bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
// We really need TargetData for later.
if (!TD) return false;
@@ -446,7 +525,9 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) {
// string lengths for varying.
if (isFoldable(2, 1, true)) {
Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD,
- Name.substr(2, 6));
+ TLI, Name.substr(2, 6));
+ if (!Ret)
+ return false;
replaceCall(Ret);
return true;
}
@@ -464,7 +545,10 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) {
if (isFoldable(3, 2, false)) {
Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TD, Name.substr(2, 7));
+ CI->getArgOperand(2), B, TD, TLI,
+ Name.substr(2, 7));
+ if (!Ret)
+ return false;
replaceCall(Ret);
return true;
}
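Among the additions in this file is EmitStrNLen, an availability-gated emitter for strnlen. For reference, strnlen is strlen with a scan cap; a standalone sketch of the semantics, with a hypothetical helper name:

    #include <cassert>
    #include <cstddef>

    // Like strlen, but never examines more than MaxLen characters.
    std::size_t strnlenLike(const char *S, std::size_t MaxLen) {
      std::size_t N = 0;
      while (N < MaxLen && S[N] != '\0')
        ++N;
      return N;
    }

    int main() {
      assert(strnlenLike("abcdef", 3) == 3); // capped by MaxLen
      assert(strnlenLike("ab", 16) == 2);    // stops at the nul
      return 0;
    }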
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 7f5cb5e..4ff31ca 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -29,3 +29,5 @@ add_llvm_library(LLVMTransformUtils
Utils.cpp
ValueMapper.cpp
)
+
+add_dependencies(LLVMTransformUtils intrinsics_gen)
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 20052a4..99237b8 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -15,6 +15,7 @@
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
@@ -28,7 +29,6 @@
#include "llvm/Transforms/Utils/ValueMapper.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/ADT/SmallVector.h"
#include <map>
using namespace llvm;
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index a0e027b..1dac6b5 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -53,7 +53,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
I->isConstant(), I->getLinkage(),
(Constant*) 0, I->getName(),
(GlobalVariable*) 0,
- I->isThreadLocal(),
+ I->getThreadLocalMode(),
I->getType()->getAddressSpace());
GV->copyAttributesFrom(I);
VMap[I] = GV;
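The one-line CloneModule fix above preserves a global's full thread-local mode instead of collapsing it to a yes/no flag, so a cloned module keeps TLS models such as initial-exec. A source-level illustration using the GCC/Clang tls_model attribute (the model names are the standard ELF ones):

    // Before the fix, a clone remembered only "thread-local or not",
    // losing the distinction between these two variables.
    __thread int DefaultModelTLS;
    __attribute__((tls_model("initial-exec"))) __thread int InitialExecTLS;

    int main() {
      return DefaultModelTLS + InitialExecTLS;
    }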
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index e8c0b80..c545cd6 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
@@ -23,6 +23,8 @@
#include "llvm/Pass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Support/CommandLine.h"
@@ -43,61 +45,139 @@ static cl::opt<bool>
AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
cl::desc("Aggregate arguments to code-extracted functions"));
-namespace {
- class CodeExtractor {
- typedef SetVector<Value*> Values;
- SetVector<BasicBlock*> BlocksToExtract;
- DominatorTree* DT;
- bool AggregateArgs;
- unsigned NumExitBlocks;
- Type *RetTy;
- public:
- CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false)
- : DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {}
-
- Function *ExtractCodeRegion(ArrayRef<BasicBlock*> code);
-
- bool isEligible(ArrayRef<BasicBlock*> code);
-
- private:
- /// definedInRegion - Return true if the specified value is defined in the
- /// extracted region.
- bool definedInRegion(Value *V) const {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (BlocksToExtract.count(I->getParent()))
- return true;
- return false;
- }
+/// \brief Test whether a block is valid for extraction.
+static bool isBlockValidForExtraction(const BasicBlock &BB) {
+ // Landing pads must be in the function where they were inserted for cleanup.
+ if (BB.isLandingPad())
+ return false;
- /// definedInCaller - Return true if the specified value is defined in the
- /// function being code extracted, but not in the region being extracted.
- /// These values must be passed in as live-ins to the function.
- bool definedInCaller(Value *V) const {
- if (isa<Argument>(V)) return true;
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (!BlocksToExtract.count(I->getParent()))
- return true;
+ // Don't hoist code containing allocas, invokes, or vastarts.
+ for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+ if (isa<AllocaInst>(I) || isa<InvokeInst>(I))
return false;
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (const Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::vastart)
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Build a set of blocks to extract if the input blocks are viable.
+template <typename IteratorT>
+static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin,
+ IteratorT BBEnd) {
+ SetVector<BasicBlock *> Result;
+
+ assert(BBBegin != BBEnd);
+
+ // Loop over the blocks, adding them to our set-vector, and aborting with an
+ // empty set if we encounter invalid blocks.
+ for (IteratorT I = BBBegin, E = BBEnd; I != E; ++I) {
+ if (!Result.insert(*I))
+ llvm_unreachable("Repeated basic blocks in extraction input");
+
+ if (!isBlockValidForExtraction(**I)) {
+ Result.clear();
+ return Result;
}
+ }
+
+#ifndef NDEBUG
+ for (SetVector<BasicBlock *>::iterator I = llvm::next(Result.begin()),
+ E = Result.end();
+ I != E; ++I)
+ for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I);
+ PI != PE; ++PI)
+ assert(Result.count(*PI) &&
+ "No blocks in this region may have entries from outside the region"
+ " except for the first block!");
+#endif
+
+ return Result;
+}
+
+/// \brief Helper to call buildExtractionBlockSet with an ArrayRef.
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs) {
+ return buildExtractionBlockSet(BBs.begin(), BBs.end());
+}
+
+/// \brief Helper to call buildExtractionBlockSet with a RegionNode.
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(const RegionNode &RN) {
+ if (!RN.isSubRegion())
+ // Just a single BasicBlock.
+ return buildExtractionBlockSet(RN.getNodeAs<BasicBlock>());
- void severSplitPHINodes(BasicBlock *&Header);
- void splitReturnBlocks();
- void findInputsOutputs(Values &inputs, Values &outputs);
+ const Region &R = *RN.getNodeAs<Region>();
- Function *constructFunction(const Values &inputs,
- const Values &outputs,
- BasicBlock *header,
- BasicBlock *newRootNode, BasicBlock *newHeader,
- Function *oldFunction, Module *M);
+ return buildExtractionBlockSet(R.block_begin(), R.block_end());
+}
- void moveCodeToFunction(Function *newFunction);
+CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs)
+ : DT(0), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+ Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
+ bool AggregateArgs)
+ : DT(DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+ Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs)
+ : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+ Blocks(buildExtractionBlockSet(L.getBlocks())), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, const RegionNode &RN,
+ bool AggregateArgs)
+ : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+ Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {}
+
+/// definedInRegion - Return true if the specified value is defined in the
+/// extracted region.
+static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
- void emitCallAndSwitchStatement(Function *newFunction,
- BasicBlock *newHeader,
- Values &inputs,
- Values &outputs);
+/// definedInCaller - Return true if the specified value is defined in the
+/// function being code extracted, but not in the region being extracted.
+/// These values must be passed in as live-ins to the function.
+static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (isa<Argument>(V)) return true;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (!Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
- };
+void CodeExtractor::findInputsOutputs(ValueSet &Inputs,
+ ValueSet &Outputs) const {
+ for (SetVector<BasicBlock *>::const_iterator I = Blocks.begin(),
+ E = Blocks.end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+
+ // If a used value is defined outside the region, it's an input. If an
+ // instruction is used outside the region, it's an output.
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
+ II != IE; ++II) {
+ for (User::op_iterator OI = II->op_begin(), OE = II->op_end();
+ OI != OE; ++OI)
+ if (definedInCaller(Blocks, *OI))
+ Inputs.insert(*OI);
+
+ for (Value::use_iterator UI = II->use_begin(), UE = II->use_end();
+ UI != UE; ++UI)
+ if (!definedInRegion(Blocks, *UI)) {
+ Outputs.insert(II);
+ break;
+ }
+ }
+ }
}
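The rewritten findInputsOutputs applies one rule per operand and per use: a value used inside the region but defined outside is an input, and a value defined inside but used outside is an output. The same rule on a toy def/use graph (standalone C++, illustrative only):

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
      // Each "instruction" maps its name to its operand names; the
      // extraction region contains b and c, with a defined before the
      // region and d after it.
      std::map<std::string, std::vector<std::string> > Insts;
      Insts["a"] = std::vector<std::string>();
      Insts["b"] = std::vector<std::string>(1, "a");
      Insts["c"] = std::vector<std::string>(1, "b");
      Insts["d"] = std::vector<std::string>(1, "c");
      std::set<std::string> Region;
      Region.insert("b");
      Region.insert("c");

      std::set<std::string> Inputs, Outputs;
      for (std::map<std::string, std::vector<std::string> >::iterator
               I = Insts.begin(), E = Insts.end(); I != E; ++I) {
        bool InRegion = Region.count(I->first) != 0;
        for (unsigned Op = 0; Op != I->second.size(); ++Op) {
          const std::string &V = I->second[Op];
          if (InRegion && !Region.count(V))
            Inputs.insert(V);   // defined outside, used inside: live-in
          if (!InRegion && Region.count(V))
            Outputs.insert(V);  // defined inside, used outside: live-out
        }
      }
      std::cout << "input: " << *Inputs.begin()    // prints a
                << " output: " << *Outputs.begin() // prints c
                << "\n";
      return 0;
    }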
/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the
@@ -115,7 +195,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
// than one entry from outside the region. If so, we need to sever the
// header block into two.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (BlocksToExtract.count(PN->getIncomingBlock(i)))
+ if (Blocks.count(PN->getIncomingBlock(i)))
++NumPredsFromRegion;
else
++NumPredsOutsideRegion;
@@ -136,8 +216,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
// We only want to code extract the second block now, and it becomes the new
// header of the region.
BasicBlock *OldPred = Header;
- BlocksToExtract.remove(OldPred);
- BlocksToExtract.insert(NewBB);
+ Blocks.remove(OldPred);
+ Blocks.insert(NewBB);
Header = NewBB;
// Okay, update dominator sets. The blocks that dominate the new one are the
@@ -152,7 +232,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
// Loop over all of the predecessors of OldPred that are in the region,
// changing them to branch to NewBB instead.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+ if (Blocks.count(PN->getIncomingBlock(i))) {
TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
TI->replaceUsesOfWith(OldPred, NewBB);
}
@@ -170,7 +250,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
// Loop over all of the incoming value in PN, moving them to NewPN if they
// are from the extracted region.
for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
- if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+ if (Blocks.count(PN->getIncomingBlock(i))) {
NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
PN->removeIncomingValue(i);
--i;
@@ -181,8 +261,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
}
void CodeExtractor::splitReturnBlocks() {
- for (SetVector<BasicBlock*>::iterator I = BlocksToExtract.begin(),
- E = BlocksToExtract.end(); I != E; ++I)
+ for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end();
+ I != E; ++I)
if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) {
BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret");
if (DT) {
@@ -203,45 +283,11 @@ void CodeExtractor::splitReturnBlocks() {
}
}
-// findInputsOutputs - Find inputs to, outputs from the code region.
-//
-void CodeExtractor::findInputsOutputs(Values &inputs, Values &outputs) {
- std::set<BasicBlock*> ExitBlocks;
- for (SetVector<BasicBlock*>::const_iterator ci = BlocksToExtract.begin(),
- ce = BlocksToExtract.end(); ci != ce; ++ci) {
- BasicBlock *BB = *ci;
-
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- // If a used value is defined outside the region, it's an input. If an
- // instruction is used outside the region, it's an output.
- for (User::op_iterator O = I->op_begin(), E = I->op_end(); O != E; ++O)
- if (definedInCaller(*O))
- inputs.insert(*O);
-
- // Consider uses of this instruction (outputs).
- for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
- UI != E; ++UI)
- if (!definedInRegion(*UI)) {
- outputs.insert(I);
- break;
- }
- } // for: insts
-
- // Keep track of the exit blocks from the region.
- TerminatorInst *TI = BB->getTerminator();
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (!BlocksToExtract.count(TI->getSuccessor(i)))
- ExitBlocks.insert(TI->getSuccessor(i));
- } // for: basic blocks
-
- NumExitBlocks = ExitBlocks.size();
-}
-
/// constructFunction - make a function based on inputs and outputs, as follows:
/// f(in0, ..., inN, out0, ..., outN)
///
-Function *CodeExtractor::constructFunction(const Values &inputs,
- const Values &outputs,
+Function *CodeExtractor::constructFunction(const ValueSet &inputs,
+ const ValueSet &outputs,
BasicBlock *header,
BasicBlock *newRootNode,
BasicBlock *newHeader,
@@ -261,15 +307,15 @@ Function *CodeExtractor::constructFunction(const Values &inputs,
std::vector<Type*> paramTy;
// Add the types of the input values to the function's argument list
- for (Values::const_iterator i = inputs.begin(),
- e = inputs.end(); i != e; ++i) {
+ for (ValueSet::const_iterator i = inputs.begin(), e = inputs.end();
+ i != e; ++i) {
const Value *value = *i;
DEBUG(dbgs() << "value used in func: " << *value << "\n");
paramTy.push_back(value->getType());
}
// Add the types of the output values to the function's argument list.
- for (Values::const_iterator I = outputs.begin(), E = outputs.end();
+ for (ValueSet::const_iterator I = outputs.begin(), E = outputs.end();
I != E; ++I) {
DEBUG(dbgs() << "instr used in func: " << **I << "\n");
if (AggregateArgs)
@@ -326,7 +372,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs,
for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end();
use != useE; ++use)
if (Instruction* inst = dyn_cast<Instruction>(*use))
- if (BlocksToExtract.count(inst->getParent()))
+ if (Blocks.count(inst->getParent()))
inst->replaceUsesOfWith(inputs[i], RewriteVal);
}
@@ -347,7 +393,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs,
// The BasicBlock which contains the branch is not in the region
// modify the branch target to a new block
if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
- if (!BlocksToExtract.count(TI->getParent()) &&
+ if (!Blocks.count(TI->getParent()) &&
TI->getParent()->getParent() == oldFunction)
TI->replaceUsesOfWith(header, newHeader);
@@ -373,7 +419,7 @@ static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) {
/// necessary.
void CodeExtractor::
emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
- Values &inputs, Values &outputs) {
+ ValueSet &inputs, ValueSet &outputs) {
// Emit a call to the new function, passing in: *pointer to struct (if
// aggregating parameters), or plain inputs and allocated memory for outputs
std::vector<Value*> params, StructValues, ReloadOutputs, Reloads;
@@ -381,14 +427,14 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
LLVMContext &Context = newFunction->getContext();
// Add inputs as params, or to be filled into the struct
- for (Values::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i)
+ for (ValueSet::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i)
if (AggregateArgs)
StructValues.push_back(*i);
else
params.push_back(*i);
// Create allocas for the outputs
- for (Values::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) {
+ for (ValueSet::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) {
if (AggregateArgs) {
StructValues.push_back(*i);
} else {
@@ -403,7 +449,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
AllocaInst *Struct = 0;
if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
std::vector<Type*> ArgTypes;
- for (Values::iterator v = StructValues.begin(),
+ for (ValueSet::iterator v = StructValues.begin(),
ve = StructValues.end(); v != ve; ++v)
ArgTypes.push_back((*v)->getType());
@@ -458,7 +504,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end());
for (unsigned u = 0, e = Users.size(); u != e; ++u) {
Instruction *inst = cast<Instruction>(Users[u]);
- if (!BlocksToExtract.count(inst->getParent()))
+ if (!Blocks.count(inst->getParent()))
inst->replaceUsesOfWith(outputs[i], load);
}
}
@@ -476,11 +522,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
std::map<BasicBlock*, BasicBlock*> ExitBlockMap;
unsigned switchVal = 0;
- for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(),
- e = BlocksToExtract.end(); i != e; ++i) {
+ for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(),
+ e = Blocks.end(); i != e; ++i) {
TerminatorInst *TI = (*i)->getTerminator();
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (!BlocksToExtract.count(TI->getSuccessor(i))) {
+ if (!Blocks.count(TI->getSuccessor(i))) {
BasicBlock *OldTarget = TI->getSuccessor(i);
// add a new basic block which returns the appropriate value
BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
@@ -618,18 +664,19 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
TheSwitch->setCondition(call);
TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
// Remove redundant case
- TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
+ SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1);
+ TheSwitch->removeCase(ToBeRemoved);
break;
}
}
void CodeExtractor::moveCodeToFunction(Function *newFunction) {
- Function *oldFunc = (*BlocksToExtract.begin())->getParent();
+ Function *oldFunc = (*Blocks.begin())->getParent();
Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
- for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(),
- e = BlocksToExtract.end(); i != e; ++i) {
+ for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(),
+ e = Blocks.end(); i != e; ++i) {
// Delete the basic block from the old function, and the list of blocks
oldBlocks.remove(*i);
@@ -638,47 +685,15 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) {
}
}
-/// ExtractRegion - Removes a loop from a function, replaces it with a call to
-/// new function. Returns pointer to the new function.
-///
-/// algorithm:
-///
-/// find inputs and outputs for the region
-///
-/// for inputs: add to function as args, map input instr* to arg#
-/// for outputs: add allocas for scalars,
-/// add to func as args, map output instr* to arg#
-///
-/// rewrite func to use argument #s instead of instr*
-///
-/// for each scalar output in the function: at every exit, store intermediate
-/// computed result back into memory.
-///
-Function *CodeExtractor::
-ExtractCodeRegion(ArrayRef<BasicBlock*> code) {
- if (!isEligible(code))
+Function *CodeExtractor::extractCodeRegion() {
+ if (!isEligible())
return 0;
- // 1) Find inputs, outputs
- // 2) Construct new function
- // * Add allocas for defs, pass as args by reference
- // * Pass in uses as args
- // 3) Move code region, add call instr to func
- //
- BlocksToExtract.insert(code.begin(), code.end());
-
- Values inputs, outputs;
+ ValueSet inputs, outputs;
// Assumption: this is a single-entry code region, and the header is the first
// block in the region.
- BasicBlock *header = code[0];
-
- for (unsigned i = 1, e = code.size(); i != e; ++i)
- for (pred_iterator PI = pred_begin(code[i]), E = pred_end(code[i]);
- PI != E; ++PI)
- assert(BlocksToExtract.count(*PI) &&
- "No blocks in this region may have entries from outside the region"
- " except for the first block!");
+ BasicBlock *header = *Blocks.begin();
// If we have to split PHI nodes or the entry block, do so now.
severSplitPHINodes(header);
@@ -703,6 +718,14 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) {
// Find inputs to, outputs from the code region.
findInputsOutputs(inputs, outputs);
+ SmallPtrSet<BasicBlock *, 1> ExitBlocks;
+ for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end();
+ I != E; ++I)
+ for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI)
+ if (!Blocks.count(*SI))
+ ExitBlocks.insert(*SI);
+ NumExitBlocks = ExitBlocks.size();
+
// Construct new function based on inputs/outputs & add allocas for all defs.
Function *newFunction = constructFunction(inputs, outputs, header,
newFuncRoot,
@@ -718,7 +741,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) {
for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
PHINode *PN = cast<PHINode>(I);
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (!BlocksToExtract.count(PN->getIncomingBlock(i)))
+ if (!Blocks.count(PN->getIncomingBlock(i)))
PN->setIncomingBlock(i, newFuncRoot);
}
@@ -732,7 +755,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) {
PHINode *PN = cast<PHINode>(I);
std::set<BasicBlock*> ProcessedPreds;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+ if (Blocks.count(PN->getIncomingBlock(i))) {
if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second)
PN->setIncomingBlock(i, codeReplacer);
else {
@@ -754,44 +777,3 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) {
report_fatal_error("verifyFunction failed!"));
return newFunction;
}
-
-bool CodeExtractor::isEligible(ArrayRef<BasicBlock*> code) {
- // Deny a single basic block that's a landing pad block.
- if (code.size() == 1 && code[0]->isLandingPad())
- return false;
-
- // Deny code region if it contains allocas or vastarts.
- for (ArrayRef<BasicBlock*>::iterator BB = code.begin(), e=code.end();
- BB != e; ++BB)
- for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end();
- I != Ie; ++I)
- if (isa<AllocaInst>(*I))
- return false;
- else if (const CallInst *CI = dyn_cast<CallInst>(I))
- if (const Function *F = CI->getCalledFunction())
- if (F->getIntrinsicID() == Intrinsic::vastart)
- return false;
- return true;
-}
-
-
-/// ExtractCodeRegion - Slurp a sequence of basic blocks into a brand new
-/// function.
-///
-Function* llvm::ExtractCodeRegion(DominatorTree &DT,
- ArrayRef<BasicBlock*> code,
- bool AggregateArgs) {
- return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code);
-}
-
-/// ExtractLoop - Slurp a natural loop into a brand new function.
-///
-Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) {
- return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks());
-}
-
-/// ExtractBasicBlock - Slurp a basic block into a brand new function.
-///
-Function* llvm::ExtractBasicBlock(ArrayRef<BasicBlock*> BBs, bool AggregateArgs){
- return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(BBs);
-}
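With the free-standing ExtractCodeRegion/ExtractLoop/ExtractBasicBlock wrappers deleted, callers now construct a CodeExtractor up front and invoke extractCodeRegion() with no arguments. A minimal sketch of the new call pattern, assuming the constructors declared in the refactored CodeExtractor.h header (not shown in this diff); outlineBlocks is a hypothetical helper:

    #include "llvm/Transforms/Utils/CodeExtractor.h"
    using namespace llvm;

    // Sketch only: outline a single-entry region into a fresh function.
    // The blocks are captured at construction time; extractCodeRegion()
    // itself returns 0 when the region is ineligible (landing pad,
    // allocas, va_start), so no separate eligibility call is needed.
    static Function *outlineBlocks(ArrayRef<BasicBlock*> BBs,
                                   DominatorTree &DT) {
      CodeExtractor CE(BBs, &DT, /*AggregateArgs=*/false);
      return CE.extractCodeRegion();
    }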
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index d2b167a..89e89e7 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -13,22 +13,22 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Attributes.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Intrinsics.h"
-#include "llvm/Attributes.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/CallSite.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/IRBuilder.h"
using namespace llvm;
bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI,
@@ -43,10 +43,10 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI,
namespace {
/// A class for recording information about inlining through an invoke.
class InvokeInliningInfo {
- BasicBlock *OuterResumeDest; //< Destination of the invoke's unwind.
- BasicBlock *InnerResumeDest; //< Destination for the callee's resume.
- LandingPadInst *CallerLPad; //< LandingPadInst associated with the invoke.
- PHINode *InnerEHValuesPHI; //< PHI for EH values from landingpad insts.
+ BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind.
+ BasicBlock *InnerResumeDest; ///< Destination for the callee's resume.
+ LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke.
+ PHINode *InnerEHValuesPHI; ///< PHI for EH values from landingpad insts.
SmallVector<Value*, 8> UnwindDestPHIValues;
public:
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index d1c4d59..bed7d72 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -14,31 +14,31 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Constants.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
-#include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
#include "llvm/Metadata.h"
#include "llvm/Operator.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/DebugInfo.h"
-#include "llvm/Analysis/DIBuilder.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -169,16 +169,21 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
// Otherwise, we can fold this switch into a conditional branch
// instruction if it has only one non-default destination.
SwitchInst::CaseIt FirstCase = SI->case_begin();
- Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
- FirstCase.getCaseValue(), "cond");
-
- // Insert the new branch.
- Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(),
- SI->getDefaultDest());
-
- // Delete the old switch.
- SI->eraseFromParent();
- return true;
+ IntegersSubset& Case = FirstCase.getCaseValueEx();
+ if (Case.isSingleNumber()) {
+ // FIXME: Currently works with ConstantInt-based numbers.
+ Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+ Case.getSingleNumber(0).toConstantInt(),
+ "cond");
+
+ // Insert the new branch.
+ Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(),
+ SI->getDefaultDest());
+
+ // Delete the old switch.
+ SI->eraseFromParent();
+ return true;
+ }
}
return false;
}
@@ -260,7 +265,7 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) {
return isa<UndefValue>(II->getArgOperand(1));
}
- if (extractMallocCall(I)) return true;
+ if (isAllocLikeFn(I)) return true;
if (CallInst *CI = isFreeCall(I))
if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
@@ -700,7 +705,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
CollisionMap[PN] = Old;
break;
}
- // Procede to the next PHI in the list.
+ // Proceed to the next PHI in the list.
OtherPN = I->second;
}
}
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index e15497a..2023750 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -95,9 +95,11 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
// Erase basic block from the function...
// ScalarEvolution holds references to loop exit blocks.
- if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) {
- if (Loop *L = LI->getLoopFor(BB))
- SE->forgetLoop(L);
+ if (LPM) {
+ if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) {
+ if (Loop *L = LI->getLoopFor(BB))
+ SE->forgetLoop(L);
+ }
}
LI->removeBlock(BB);
BB->eraseFromParent();
@@ -204,9 +206,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
// Notify ScalarEvolution that the loop will be substantially changed,
// if not outright eliminated.
- ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
- if (SE)
- SE->forgetLoop(L);
+ if (LPM) {
+ ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
+ if (SE)
+ SE->forgetLoop(L);
+ }
// If we know the trip count, we know the multiple...
unsigned BreakoutTrip = 0;
@@ -405,24 +409,26 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
}
}
- // FIXME: Reconstruct dom info, because it is not preserved properly.
- // Incrementally updating domtree after loop unrolling would be easy.
- if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>())
- DT->runOnFunction(*L->getHeader()->getParent());
-
- // Simplify any new induction variables in the partially unrolled loop.
- if (SE && !CompletelyUnroll) {
- SmallVector<WeakVH, 16> DeadInsts;
- simplifyLoopIVs(L, SE, LPM, DeadInsts);
-
- // Aggressively clean up dead instructions that simplifyLoopIVs already
- // identified. Any remaining should be cleaned up below.
- while (!DeadInsts.empty())
- if (Instruction *Inst =
- dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ if (LPM) {
+ // FIXME: Reconstruct dom info, because it is not preserved properly.
+ // Incrementally updating domtree after loop unrolling would be easy.
+ if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>())
+ DT->runOnFunction(*L->getHeader()->getParent());
+
+ // Simplify any new induction variables in the partially unrolled loop.
+ ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
+ if (SE && !CompletelyUnroll) {
+ SmallVector<WeakVH, 16> DeadInsts;
+ simplifyLoopIVs(L, SE, LPM, DeadInsts);
+
+ // Aggressively clean up dead instructions that simplifyLoopIVs already
+ // identified. Any remaining should be cleaned up below.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
}
-
// At this point, the code is well formed. We now do a quick sweep over the
// inserted code, doing constant propagation and dead code elimination as we
// go.
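Every LPM use in UnrollLoop is now behind a null check, so the utility can be driven outside a LoopPassManager at the price of skipping the ScalarEvolution and DominatorTree updates. A hedged sketch of such a caller; the full parameter list is assumed here (only the leading parameters are visible in the hunk header above), so treat the signature as illustrative:

    // Illustrative only: unroll with no LoopPassManager available.
    // Analyses normally fetched through the LPM are simply not updated.
    static bool unrollWithoutLPM(Loop *L, unsigned Count, unsigned TripCount,
                                 unsigned TripMultiple, LoopInfo *LI) {
      // Passing LPM == 0 is now legal; every use above is null-checked.
      return UnrollLoop(L, Count, TripCount, /*AllowRuntime=*/false,
                        TripMultiple, LI, /*LPM=*/0);
    }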
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 3aa6bef..67e17f4 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -131,7 +131,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
/// There are two value maps that are defined and used. VMap is
/// for the values in the current loop instance. LVMap contains
/// the values from the last loop instance. We need the LVMap values
-/// to update the inital values for the current loop instance.
+/// to update the initial values for the current loop instance.
///
static void CloneLoopBlocks(Loop *L,
bool FirstCopy,
@@ -237,6 +237,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
// Use Scalar Evolution to compute the trip count. This allows more
// loops to be unrolled than relying on induction var simplification
+ if (!LPM)
+ return false;
ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
if (SE == 0)
return false;
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index c70ced1..02bdcda 100644
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -12,18 +12,19 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "lower-expect-intrinsic"
+#include "llvm/BasicBlock.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/LLVMContext.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/MDBuilder.h"
#include "llvm/Metadata.h"
#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
#include <vector>
using namespace llvm;
@@ -70,24 +71,18 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) {
if (!ExpectedValue)
return false;
- LLVMContext &Context = CI->getContext();
- Type *Int32Ty = Type::getInt32Ty(Context);
-
SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue);
- std::vector<Value *> Vec;
- unsigned n = SI->getNumCases();
- Vec.resize(n + 1 + 1); // +1 for MDString and +1 for default case
-
- Vec[0] = MDString::get(Context, "branch_weights");
- Vec[1] = ConstantInt::get(Int32Ty, Case == SI->case_default() ?
- LikelyBranchWeight : UnlikelyBranchWeight);
- for (unsigned i = 0; i < n; ++i) {
- Vec[i + 1 + 1] = ConstantInt::get(Int32Ty, i == Case.getCaseIndex() ?
- LikelyBranchWeight : UnlikelyBranchWeight);
- }
+ unsigned n = SI->getNumCases(); // +1 for default case.
+ std::vector<uint32_t> Weights(n + 1);
- MDNode *WeightsNode = llvm::MDNode::get(Context, Vec);
- SI->setMetadata(LLVMContext::MD_prof, WeightsNode);
+ Weights[0] = Case == SI->case_default() ? LikelyBranchWeight
+ : UnlikelyBranchWeight;
+ for (unsigned i = 0; i != n; ++i)
+ Weights[i + 1] = i == Case.getCaseIndex() ? LikelyBranchWeight
+ : UnlikelyBranchWeight;
+
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(CI->getContext()).createBranchWeights(Weights));
SI->setCondition(ArgValue);
return true;
@@ -120,20 +115,17 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) {
if (!ExpectedValue)
return false;
- LLVMContext &Context = CI->getContext();
- Type *Int32Ty = Type::getInt32Ty(Context);
- bool Likely = ExpectedValue->isOne();
+ MDBuilder MDB(CI->getContext());
+ MDNode *Node;
 // If the expect value is equal to 1, it means that we are more likely to take
 // branch 0; otherwise branch 1 is more likely.
- Value *Ops[] = {
- MDString::get(Context, "branch_weights"),
- ConstantInt::get(Int32Ty, Likely ? LikelyBranchWeight : UnlikelyBranchWeight),
- ConstantInt::get(Int32Ty, Likely ? UnlikelyBranchWeight : LikelyBranchWeight)
- };
+ if (ExpectedValue->isOne())
+ Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+ else
+ Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
- MDNode *WeightsNode = MDNode::get(Context, Ops);
- BI->setMetadata(LLVMContext::MD_prof, WeightsNode);
+ BI->setMetadata(LLVMContext::MD_prof, Node);
CmpI->setOperand(0, ArgValue);
return true;
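Both rewrites above route through MDBuilder instead of hand-assembling an MDString plus ConstantInt operands. As a self-contained illustration of that helper (a sketch; markLikelyTaken is not part of the patch):

    #include "llvm/Instructions.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/MDBuilder.h"
    using namespace llvm;

    // Attach "branch_weights" profile metadata to a conditional branch,
    // biasing the true edge. Only the ratio of the weights matters to
    // consumers of the metadata.
    static void markLikelyTaken(BranchInst *BI) {
      MDBuilder MDB(BI->getContext());
      BI->setMetadata(LLVMContext::MD_prof,
                      MDB.createBranchWeights(/*TrueWeight=*/64,
                                              /*FalseWeight=*/4));
    }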
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index a16130d..1547439 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -66,18 +66,6 @@ namespace {
BasicBlock* OrigBlock, BasicBlock* Default);
unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
};
-
- /// The comparison function for sorting the switch case values in the vector.
- /// WARNING: Case ranges should be disjoint!
- struct CaseCmp {
- bool operator () (const LowerSwitch::CaseRange& C1,
- const LowerSwitch::CaseRange& C2) {
-
- const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
- const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
- return CI1->getValue().slt(CI2->getValue());
- }
- };
}
char LowerSwitch::ID = 0;
@@ -159,7 +147,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
Function::iterator FI = OrigBlock;
F->getBasicBlockList().insert(++FI, NewNode);
- ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
+ ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT,
Val, Pivot.Low, "Pivot");
NewNode->getInstList().push_back(Comp);
BranchInst::Create(LBranch, RBranch, Comp, NewNode);
@@ -234,40 +222,34 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
// Clusterify - Transform simple list of Cases into list of CaseRange's
unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
- unsigned numCmps = 0;
+
+ IntegersSubsetToBB TheClusterifier;
// Start with "simple" cases
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
- Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
- i.getCaseSuccessor()));
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ BasicBlock *SuccBB = i.getCaseSuccessor();
+ IntegersSubset CaseRanges = i.getCaseValueEx();
+ TheClusterifier.add(CaseRanges, SuccBB);
+ }
- std::sort(Cases.begin(), Cases.end(), CaseCmp());
-
- // Merge case into clusters
- if (Cases.size()>=2)
- for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
- int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
- int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
- BasicBlock* nextBB = J->BB;
- BasicBlock* currentBB = I->BB;
-
- // If the two neighboring cases go to the same destination, merge them
- // into a single case.
- if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
- I->High = J->High;
- J = Cases.erase(J);
- } else {
- I = J++;
- }
- }
-
- for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
- if (I->Low != I->High)
+ TheClusterifier.optimize();
+
+ size_t numCmps = 0;
+ for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(),
+ e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
+ IntegersSubsetToBB::Cluster &C = *i;
+
+ // FIXME: Currently works with ConstantInt-based numbers.
+ // Changing it to APInt-based is too heavy a change for this commit.
+ Cases.push_back(CaseRange(C.first.getLow().toConstantInt(),
+ C.first.getHigh().toConstantInt(), C.second));
+ if (!C.first.isSingleNumber())
// A range counts double, since it requires two compares.
++numCmps;
}
- return numCmps;
+ return numCmps;
}
// processSwitchInst - Replace the specified switch instruction with a sequence
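For reference, the sort-and-merge that the deleted CaseCmp code performed, and that IntegersSubsetToBB::optimize() now subsumes, amounts to coalescing sorted, adjacent case values that share a destination. A standalone sketch of the idea, detached from the LLVM types:

    #include <algorithm>
    #include <utility>
    #include <vector>

    struct Range { long Lo, Hi; int Dest; };  // [Lo,Hi] -> destination id

    // Sort single-value cases, then merge neighbours with the same
    // destination: {1->A, 2->A, 4->A} becomes {[1,2]->A, [4,4]->A}.
    static std::vector<Range>
    clusterify(std::vector<std::pair<long, int> > Cases) {
      std::sort(Cases.begin(), Cases.end());
      std::vector<Range> Out;
      for (size_t i = 0, e = Cases.size(); i != e; ++i) {
        if (!Out.empty() && Out.back().Dest == Cases[i].second &&
            Out.back().Hi + 1 == Cases[i].first) {
          Out.back().Hi = Cases[i].first;   // extend the current cluster
        } else {
          Range R = { Cases[i].first, Cases[i].first, Cases[i].second };
          Out.push_back(R);
        }
      }
      return Out;
    }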
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 8491c55..dbcf3b2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -14,8 +14,8 @@
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Module.h"
-#include "llvm/Support/IRBuilder.h"
using namespace llvm;
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 2357d81..dd5e20e 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -28,14 +28,14 @@
#define DEBUG_TYPE "mem2reg"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/DIBuilder.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Metadata.h"
#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/DebugInfo.h"
-#include "llvm/Analysis/DIBuilder.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index e60a41b..e568a61 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -190,8 +190,11 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
return V;
}
- // Set DebugLoc.
- InsertedPHI->setDebugLoc(GetFirstDebugLocInBasicBlock(BB));
+ // Set the DebugLoc of the inserted PHI, if available.
+ DebugLoc DL;
+ if (const Instruction *I = BB->getFirstNonPHI())
+ DL = I->getDebugLoc();
+ InsertedPHI->setDebugLoc(DL);
// If the client wants to know about all new instructions, tell it.
if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
@@ -211,6 +214,11 @@ void SSAUpdater::RewriteUse(Use &U) {
else
V = GetValueInMiddleOfBlock(User->getParent());
+ // Notify the users of the existing value that it is being replaced.
+ Value *OldVal = U.get();
+ if (OldVal != V && OldVal->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(OldVal, V);
+
U.set(V);
}
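The ValueIsRAUWd call added above is the generic value-handle broadcast: anything holding a WeakVH or CallbackVH on the old value is told about the substitution before the use is overwritten. The guard on its own, assuming only the API in llvm/Support/ValueHandle.h:

    #include "llvm/Support/ValueHandle.h"
    #include "llvm/Value.h"
    using namespace llvm;

    // Replace one use while letting registered value handles observe the
    // old->new substitution first (mirrors the logic added to RewriteUse).
    static void replaceUse(Use &U, Value *NewV) {
      Value *OldV = U.get();
      if (OldV != NewV && OldV->hasValueHandle()) // cheap common-case skip
        ValueHandleBase::ValueIsRAUWd(OldV, NewV);
      U.set(NewV);
    }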
@@ -230,28 +238,6 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
U.set(V);
}
-/// PHIiter - Iterator for PHI operands. This is used for the PHI_iterator
-/// in the SSAUpdaterImpl template.
-namespace {
- class PHIiter {
- private:
- PHINode *PHI;
- unsigned idx;
-
- public:
- explicit PHIiter(PHINode *P) // begin iterator
- : PHI(P), idx(0) {}
- PHIiter(PHINode *P, bool) // end iterator
- : PHI(P), idx(PHI->getNumIncomingValues()) {}
-
- PHIiter &operator++() { ++idx; return *this; }
- bool operator==(const PHIiter& x) const { return idx == x.idx; }
- bool operator!=(const PHIiter& x) const { return !operator==(x); }
- Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
- BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
- };
-}
-
/// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template,
/// specialized for SSAUpdater.
namespace llvm {
@@ -266,9 +252,26 @@ public:
static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
- typedef PHIiter PHI_iterator;
- static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
- static inline PHI_iterator PHI_end(PhiT *PHI) {
+ class PHI_iterator {
+ private:
+ PHINode *PHI;
+ unsigned idx;
+
+ public:
+ explicit PHI_iterator(PHINode *P) // begin iterator
+ : PHI(P), idx(0) {}
+ PHI_iterator(PHINode *P, bool) // end iterator
+ : PHI(P), idx(PHI->getNumIncomingValues()) {}
+
+ PHI_iterator &operator++() { ++idx; return *this; }
+ bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+ bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+ Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
+ BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
+ };
+
+ static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+ static PHI_iterator PHI_end(PhiT *PHI) {
return PHI_iterator(PHI, true);
}
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 66dd2c9..518df7c 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -16,29 +16,30 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
+#include "llvm/MDBuilder.h"
#include "llvm/Metadata.h"
#include "llvm/Operator.h"
#include "llvm/Type.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/NoFolder.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <set>
#include <map>
@@ -55,12 +56,26 @@ DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
STATISTIC(NumSpeculations, "Number of speculative executed instructions");
namespace {
+ /// ValueEqualityComparisonCase - Represents a case of a switch.
+ struct ValueEqualityComparisonCase {
+ ConstantInt *Value;
+ BasicBlock *Dest;
+
+ ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
+ : Value(Value), Dest(Dest) {}
+
+ bool operator<(ValueEqualityComparisonCase RHS) const {
+ // Comparing pointers is ok as we only rely on the order for uniquing.
+ return Value < RHS.Value;
+ }
+ };
+
class SimplifyCFGOpt {
const TargetData *const TD;
Value *isValueEqualityComparison(TerminatorInst *TI);
BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
- std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases);
+ std::vector<ValueEqualityComparisonCase> &Cases);
bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
BasicBlock *Pred,
IRBuilder<> &Builder);
@@ -107,6 +122,47 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
return true;
}
+/// isProfitableToFoldUnconditional - Return true if it is safe and profitable
+/// to merge these two terminator instructions together, where SI1 is an
+/// unconditional branch. PhiNodes will store all PHI nodes in common
+/// successors.
+///
+static bool isProfitableToFoldUnconditional(BranchInst *SI1,
+ BranchInst *SI2,
+ Instruction *Cond,
+ SmallVectorImpl<PHINode*> &PhiNodes) {
+ if (SI1 == SI2) return false; // Can't merge with self!
+ assert(SI1->isUnconditional() && SI2->isConditional());
+
+ // We fold the unconditional branch if we can easily update all PHI nodes in
+ // common successors:
+ // 1> We have a constant incoming value for the conditional branch;
+ // 2> We have "Cond" as the incoming value for the unconditional branch;
+ // 3> SI2->getCondition() and Cond have same operands.
+ CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition());
+ if (!Ci2) return false;
+ if (!(Cond->getOperand(0) == Ci2->getOperand(0) &&
+ Cond->getOperand(1) == Ci2->getOperand(1)) &&
+ !(Cond->getOperand(0) == Ci2->getOperand(1) &&
+ Cond->getOperand(1) == Ci2->getOperand(0)))
+ return false;
+
+ BasicBlock *SI1BB = SI1->getParent();
+ BasicBlock *SI2BB = SI2->getParent();
+ SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
+ if (SI1Succs.count(*I))
+ for (BasicBlock::iterator BBI = (*I)->begin();
+ isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ if (PN->getIncomingValueForBlock(SI1BB) != Cond ||
+ !isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB)))
+ return false;
+ PhiNodes.push_back(PN);
+ }
+ return true;
+}
+
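Condition 3> above tolerates swapped operand order; isolated, the operand test reads as follows (a restatement of the checks just added, not new API — note it deliberately compares only the operands, not the compare predicates themselves):

    #include "llvm/Instruction.h"
    using namespace llvm;

    // True if A and B read the same pair of operands, in either order;
    // this is the test applied to Cond and SI2's compare above.
    static bool sameOperandsEitherOrder(const Instruction *A,
                                        const Instruction *B) {
      return (A->getOperand(0) == B->getOperand(0) &&
              A->getOperand(1) == B->getOperand(1)) ||
             (A->getOperand(0) == B->getOperand(1) &&
              A->getOperand(1) == B->getOperand(0));
    }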
/// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will
/// now be entries in it from the 'NewPred' block. The values that will be
/// flowing into the PHI nodes will be the same as those coming in from
@@ -476,21 +532,22 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
/// decode all of the 'cases' that it represents and return the 'default' block.
BasicBlock *SimplifyCFGOpt::
GetValueEqualityComparisonCases(TerminatorInst *TI,
- std::vector<std::pair<ConstantInt*,
- BasicBlock*> > &Cases) {
+ std::vector<ValueEqualityComparisonCase>
+ &Cases) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
Cases.reserve(SI->getNumCases());
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
- Cases.push_back(std::make_pair(i.getCaseValue(),
- i.getCaseSuccessor()));
+ Cases.push_back(ValueEqualityComparisonCase(i.getCaseValue(),
+ i.getCaseSuccessor()));
return SI->getDefaultDest();
}
BranchInst *BI = cast<BranchInst>(TI);
ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
- Cases.push_back(std::make_pair(GetConstantInt(ICI->getOperand(1), TD),
- BI->getSuccessor(ICI->getPredicate() ==
- ICmpInst::ICMP_NE)));
+ BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
+ Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1),
+ TD),
+ Succ));
return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
}
@@ -498,9 +555,9 @@ GetValueEqualityComparisonCases(TerminatorInst *TI,
/// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries
/// in the list that match the specified block.
static void EliminateBlockCases(BasicBlock *BB,
- std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases) {
+ std::vector<ValueEqualityComparisonCase> &Cases) {
for (unsigned i = 0, e = Cases.size(); i != e; ++i)
- if (Cases[i].second == BB) {
+ if (Cases[i].Dest == BB) {
Cases.erase(Cases.begin()+i);
--i; --e;
}
@@ -509,9 +566,9 @@ static void EliminateBlockCases(BasicBlock *BB,
/// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as
/// well.
static bool
-ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
- std::vector<std::pair<ConstantInt*, BasicBlock*> > &C2) {
- std::vector<std::pair<ConstantInt*, BasicBlock*> > *V1 = &C1, *V2 = &C2;
+ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
+ std::vector<ValueEqualityComparisonCase> &C2) {
+ std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
// Make V1 be smaller than V2.
if (V1->size() > V2->size())
@@ -520,9 +577,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
if (V1->size() == 0) return false;
if (V1->size() == 1) {
// Just scan V2.
- ConstantInt *TheVal = (*V1)[0].first;
+ ConstantInt *TheVal = (*V1)[0].Value;
for (unsigned i = 0, e = V2->size(); i != e; ++i)
- if (TheVal == (*V2)[i].first)
+ if (TheVal == (*V2)[i].Value)
return true;
}
@@ -531,9 +588,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
array_pod_sort(V2->begin(), V2->end());
unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
while (i1 != e1 && i2 != e2) {
- if ((*V1)[i1].first == (*V2)[i2].first)
+ if ((*V1)[i1].Value == (*V2)[i2].Value)
return true;
- if ((*V1)[i1].first < (*V2)[i2].first)
+ if ((*V1)[i1].Value < (*V2)[i2].Value)
++i1;
else
++i2;
@@ -559,13 +616,13 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
if (ThisVal != PredVal) return false; // Different predicates.
// Find out information about when control will move from Pred to TI's block.
- std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+ std::vector<ValueEqualityComparisonCase> PredCases;
BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
PredCases);
EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
// Find information about how control leaves this block.
- std::vector<std::pair<ConstantInt*, BasicBlock*> > ThisCases;
+ std::vector<ValueEqualityComparisonCase> ThisCases;
BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
@@ -587,7 +644,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
(void) NI;
// Remove PHI node entries for the dead edge.
- ThisCases[0].second->removePredecessor(TI->getParent());
+ ThisCases[0].Dest->removePredecessor(TI->getParent());
DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
<< "Through successor TI: " << *TI << "Leaving: " << *NI << "\n");
@@ -600,7 +657,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
// Okay, TI has cases that are statically dead, prune them away.
SmallPtrSet<Constant*, 16> DeadCases;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- DeadCases.insert(PredCases[i].first);
+ DeadCases.insert(PredCases[i].Value);
DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
<< "Through successor TI: " << *TI);
@@ -622,10 +679,10 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
ConstantInt *TIV = 0;
BasicBlock *TIBB = TI->getParent();
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- if (PredCases[i].second == TIBB) {
+ if (PredCases[i].Dest == TIBB) {
if (TIV != 0)
return false; // Cannot handle multiple values coming to this block.
- TIV = PredCases[i].first;
+ TIV = PredCases[i].Value;
}
assert(TIV && "No edge from pred to succ?");
@@ -633,8 +690,8 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
// BB. Find out which successor will unconditionally be branched to.
BasicBlock *TheRealDest = 0;
for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
- if (ThisCases[i].first == TIV) {
- TheRealDest = ThisCases[i].second;
+ if (ThisCases[i].Value == TIV) {
+ TheRealDest = ThisCases[i].Dest;
break;
}
@@ -702,10 +759,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
if (PCV == CV && SafeToMergeTerminators(TI, PTI)) {
// Figure out which 'cases' to copy from SI to PSI.
- std::vector<std::pair<ConstantInt*, BasicBlock*> > BBCases;
+ std::vector<ValueEqualityComparisonCase> BBCases;
BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
- std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+ std::vector<ValueEqualityComparisonCase> PredCases;
BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
// Based on whether the default edge from PTI goes to BB or not, fill in
@@ -718,8 +775,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// that don't occur in PTI, or that branch to BB will be activated.
std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- if (PredCases[i].second != BB)
- PTIHandled.insert(PredCases[i].first);
+ if (PredCases[i].Dest != BB)
+ PTIHandled.insert(PredCases[i].Value);
else {
// The default destination is BB, we don't need explicit targets.
std::swap(PredCases[i], PredCases.back());
@@ -734,10 +791,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
NewSuccessors.push_back(BBDefault);
}
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
- if (!PTIHandled.count(BBCases[i].first) &&
- BBCases[i].second != BBDefault) {
+ if (!PTIHandled.count(BBCases[i].Value) &&
+ BBCases[i].Dest != BBDefault) {
PredCases.push_back(BBCases[i]);
- NewSuccessors.push_back(BBCases[i].second);
+ NewSuccessors.push_back(BBCases[i].Dest);
}
} else {
@@ -746,8 +803,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// activated.
std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- if (PredCases[i].second == BB) {
- PTIHandled.insert(PredCases[i].first);
+ if (PredCases[i].Dest == BB) {
+ PTIHandled.insert(PredCases[i].Value);
std::swap(PredCases[i], PredCases.back());
PredCases.pop_back();
--i; --e;
@@ -756,11 +813,11 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// Okay, now we know which constants were sent to BB from the
// predecessor. Figure out where they will all go now.
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
- if (PTIHandled.count(BBCases[i].first)) {
+ if (PTIHandled.count(BBCases[i].Value)) {
// If this is one we are capable of getting...
PredCases.push_back(BBCases[i]);
- NewSuccessors.push_back(BBCases[i].second);
- PTIHandled.erase(BBCases[i].first);// This constant is taken care of
+ NewSuccessors.push_back(BBCases[i].Dest);
+ PTIHandled.erase(BBCases[i].Value);// This constant is taken care of
}
// If there are any constants vectored to BB that TI doesn't handle,
@@ -768,7 +825,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
PTIHandled.begin(),
E = PTIHandled.end(); I != E; ++I) {
- PredCases.push_back(std::make_pair(*I, BBDefault));
+ PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
NewSuccessors.push_back(BBDefault);
}
}
@@ -792,7 +849,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
PredCases.size());
NewSI->setDebugLoc(PTI->getDebugLoc());
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- NewSI->addCase(PredCases[i].first, PredCases[i].second);
+ NewSI->addCase(PredCases[i].Value, PredCases[i].Dest);
EraseTerminatorInstAndDCECond(PTI);
@@ -1273,7 +1330,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
return false;
}
- // If we folded the the first phi, PN dangles at this point. Refresh it. If
+ // If we folded the first phi, PN dangles at this point. Refresh it. If
// we ran out of PHIs then we simplified them all.
PN = dyn_cast<PHINode>(BB->begin());
if (PN == 0) return true;
@@ -1490,6 +1547,23 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D,
return Result;
}
+/// checkCSEInPredecessor - Return true if the given instruction is available
+/// in its predecessor block. If so, the instruction will be removed.
+///
+static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
+ if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))
+ return false;
+ for (BasicBlock::iterator I = PB->begin(), E = PB->end(); I != E; I++) {
+ Instruction *PBI = &*I;
+ // Check whether Inst and PBI generate the same value.
+ if (Inst->isIdenticalTo(PBI)) {
+ Inst->replaceAllUsesWith(PBI);
+ Inst->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
/// FoldBranchToCommonDest - If this basic block is simple enough, and if a
/// predecessor branches to us and one of our successors, fold the block into
@@ -1497,7 +1571,36 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D,
bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
BasicBlock *BB = BI->getParent();
- Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+ Instruction *Cond = 0;
+ if (BI->isConditional())
+ Cond = dyn_cast<Instruction>(BI->getCondition());
+ else {
+ // For unconditional branch, check for a simple CFG pattern, where
+ // BB has a single predecessor and BB's successor is also its predecessor's
+ // successor. If such a pattern exists, check for CSE between BB and its
+ // predecessor.
+ if (BasicBlock *PB = BB->getSinglePredecessor())
+ if (BranchInst *PBI = dyn_cast<BranchInst>(PB->getTerminator()))
+ if (PBI->isConditional() &&
+ (BI->getSuccessor(0) == PBI->getSuccessor(0) ||
+ BI->getSuccessor(0) == PBI->getSuccessor(1))) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end();
+ I != E; ) {
+ Instruction *Curr = I++;
+ if (isa<CmpInst>(Curr)) {
+ Cond = Curr;
+ break;
+ }
+ // Quit if we can't remove this instruction.
+ if (!checkCSEInPredecessor(Curr, PB))
+ return false;
+ }
+ }
+
+ if (Cond == 0)
+ return false;
+ }
+
if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
Cond->getParent() != BB || !Cond->hasOneUse())
return false;
@@ -1549,7 +1652,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Finally, don't infinitely unroll conditional loops.
BasicBlock *TrueDest = BI->getSuccessor(0);
- BasicBlock *FalseDest = BI->getSuccessor(1);
+ BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0;
if (TrueDest == BB || FalseDest == BB)
return false;
@@ -1560,23 +1663,33 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Check that we have two conditional branches. If there is a PHI node in
// the common successor, verify that the same value flows in from both
// blocks.
- if (PBI == 0 || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI))
+ SmallVector<PHINode*, 4> PHIs;
+ if (PBI == 0 || PBI->isUnconditional() ||
+ (BI->isConditional() &&
+ !SafeToMergeTerminators(BI, PBI)) ||
+ (!BI->isConditional() &&
+ !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs)))
continue;
// Determine if the two branches share a common destination.
Instruction::BinaryOps Opc;
bool InvertPredCond = false;
- if (PBI->getSuccessor(0) == TrueDest)
- Opc = Instruction::Or;
- else if (PBI->getSuccessor(1) == FalseDest)
- Opc = Instruction::And;
- else if (PBI->getSuccessor(0) == FalseDest)
- Opc = Instruction::And, InvertPredCond = true;
- else if (PBI->getSuccessor(1) == TrueDest)
- Opc = Instruction::Or, InvertPredCond = true;
- else
- continue;
+ if (BI->isConditional()) {
+ if (PBI->getSuccessor(0) == TrueDest)
+ Opc = Instruction::Or;
+ else if (PBI->getSuccessor(1) == FalseDest)
+ Opc = Instruction::And;
+ else if (PBI->getSuccessor(0) == FalseDest)
+ Opc = Instruction::And, InvertPredCond = true;
+ else if (PBI->getSuccessor(1) == TrueDest)
+ Opc = Instruction::Or, InvertPredCond = true;
+ else
+ continue;
+ } else {
+ if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest)
+ continue;
+ }
// Ensure that any values used in the bonus instruction are also used
// by the terminator of the predecessor. This means that those values
@@ -1652,17 +1765,69 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
New->takeName(Cond);
Cond->setName(New->getName()+".old");
- Instruction *NewCond =
- cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
+ if (BI->isConditional()) {
+ Instruction *NewCond =
+ cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
New, "or.cond"));
- PBI->setCondition(NewCond);
- if (PBI->getSuccessor(0) == BB) {
- AddPredecessorToBlock(TrueDest, PredBlock, BB);
- PBI->setSuccessor(0, TrueDest);
- }
- if (PBI->getSuccessor(1) == BB) {
- AddPredecessorToBlock(FalseDest, PredBlock, BB);
- PBI->setSuccessor(1, FalseDest);
+ PBI->setCondition(NewCond);
+
+ if (PBI->getSuccessor(0) == BB) {
+ AddPredecessorToBlock(TrueDest, PredBlock, BB);
+ PBI->setSuccessor(0, TrueDest);
+ }
+ if (PBI->getSuccessor(1) == BB) {
+ AddPredecessorToBlock(FalseDest, PredBlock, BB);
+ PBI->setSuccessor(1, FalseDest);
+ }
+ } else {
+ // Update PHI nodes in the common successors.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ ConstantInt *PBI_C = cast<ConstantInt>(
+ PHIs[i]->getIncomingValueForBlock(PBI->getParent()));
+ assert(PBI_C->getType()->isIntegerTy(1));
+ Instruction *MergedCond = 0;
+ if (PBI->getSuccessor(0) == TrueDest) {
+ // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value)
+ // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value)
+ // is false: !PBI_Cond and BI_Value
+ Instruction *NotCond =
+ cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
+ "not.cond"));
+ MergedCond =
+ cast<Instruction>(Builder.CreateBinOp(Instruction::And,
+ NotCond, New,
+ "and.cond"));
+ if (PBI_C->isOne())
+ MergedCond =
+ cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
+ PBI->getCondition(), MergedCond,
+ "or.cond"));
+ } else {
+ // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C)
+ // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond)
+ // is false: PBI_Cond and BI_Value
+ MergedCond =
+ cast<Instruction>(Builder.CreateBinOp(Instruction::And,
+ PBI->getCondition(), New,
+ "and.cond"));
+ if (PBI_C->isOne()) {
+ Instruction *NotCond =
+ cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
+ "not.cond"));
+ MergedCond =
+ cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
+ NotCond, MergedCond,
+ "or.cond"));
+ }
+ }
+ // Update PHI Node.
+ PHIs[i]->setIncomingValue(PHIs[i]->getBasicBlockIndex(PBI->getParent()),
+ MergedCond);
+ }
+ // Change PBI from Conditional to Unconditional.
+ BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI);
+ EraseTerminatorInstAndDCECond(PBI);
+ PBI = New_PBI;
}
// TODO: If BB is reachable from all paths through PredBlock, then we
@@ -1670,7 +1835,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Merge probability data into PredBlock's branch.
APInt A, B, C, D;
- if (ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) {
+ if (PBI->isConditional() && BI->isConditional() &&
+ ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) {
// Given IR which does:
// bbA:
// br i1 %x, label %bbB, label %bbC
@@ -1740,12 +1906,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
ProbTrue = ProbTrue.udiv(GCD);
ProbFalse = ProbFalse.udiv(GCD);
- LLVMContext &Context = BI->getContext();
- Value *Ops[3];
- Ops[0] = BI->getMetadata(LLVMContext::MD_prof)->getOperand(0);
- Ops[1] = ConstantInt::get(Context, ProbTrue);
- Ops[2] = ConstantInt::get(Context, ProbFalse);
- PBI->setMetadata(LLVMContext::MD_prof, MDNode::get(Context, Ops));
+ MDBuilder MDB(BI->getContext());
+ MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(),
+ ProbFalse.getZExtValue());
+ PBI->setMetadata(LLVMContext::MD_prof, N);
} else {
PBI->setMetadata(LLVMContext::MD_prof, NULL);
}
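Before the merged weights reach createBranchWeights they are reduced by their GCD, since only the ratio is meaningful and smaller values keep the zero-extended i32 operands out of overflow territory; e.g. raw products 6:10 become 3:5. A toy version of that normalization (the code above does the same via APInt):

    #include <stdint.h>

    // Reduce a weight pair by its GCD, as done above prior to emitting
    // the "branch_weights" metadata.
    static void reduceWeights(uint64_t &T, uint64_t &F) {
      uint64_t A = T, B = F;
      while (B) { uint64_t R = A % B; A = B; B = R; } // A = gcd(T, F)
      if (A) { T /= A; F /= A; }
    }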
@@ -2758,6 +2922,12 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
return true;
}
+ // If this basic block is ONLY a compare and a branch, and if a predecessor
+ // branches to us and our successor, fold the comparison into the
+ // predecessor and use logical operations to update the incoming value
+ // for PHI nodes in common successor.
+ if (FoldBranchToCommonDest(BI))
+ return SimplifyCFG(BB) | true;
return false;
}
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 4030bef..5d673f1 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -16,7 +16,6 @@
#define DEBUG_TYPE "indvars"
#include "llvm/Instructions.h"
-#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -44,7 +43,6 @@ namespace {
class SimplifyIndvar {
Loop *L;
LoopInfo *LI;
- DominatorTree *DT;
ScalarEvolution *SE;
const TargetData *TD; // May be NULL
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 9d62306..62d23cb 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -23,6 +23,7 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
+#include "llvm/Metadata.h"
#include "llvm/Pass.h"
#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
@@ -41,6 +42,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <map>
@@ -66,6 +68,10 @@ static cl::opt<unsigned>
MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
cl::desc("The maximum number of pairing iterations"));
+static cl::opt<bool>
+Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to form non-2^n-length vectors"));
+
static cl::opt<unsigned>
MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
cl::desc("The maximum number of pairable instructions per group"));
@@ -76,6 +82,10 @@ MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
" a full cycle check"));
static cl::opt<bool>
+NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize boolean (i1) values"));
+
+static cl::opt<bool>
NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
cl::desc("Don't try to vectorize integer values"));
@@ -104,6 +114,10 @@ NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
cl::desc("Don't try to vectorize select instructions"));
static cl::opt<bool>
+NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize comparison instructions"));
+
+static cl::opt<bool>
NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
cl::desc("Don't try to vectorize getelementptr instructions"));
@@ -182,12 +196,12 @@ namespace {
// FIXME: const correct?
- bool vectorizePairs(BasicBlock &BB);
+ bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
bool getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts);
+ std::vector<Value *> &PairableInsts, bool NonPow2Len);
void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
@@ -211,7 +225,7 @@ namespace {
bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
bool areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore);
+ bool IsSimpleLoadStore, bool NonPow2Len);
bool trackUsesOfI(DenseSet<Value *> &Users,
AliasSetTracker &WriteSet, Instruction *I,
@@ -263,26 +277,32 @@ namespace {
bool UseCycleCheck);
Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool &FlipMemInputs);
+ Instruction *J, unsigned o, bool FlipMemInputs);
void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
- unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
- unsigned IdxOffset, std::vector<Constant*> &Mask);
+ unsigned MaskOffset, unsigned NumInElem,
+ unsigned NumInElem1, unsigned IdxOffset,
+ std::vector<Constant*> &Mask);
Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
Instruction *J);
+ bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
+ unsigned o, Value *&LOp, unsigned numElemL,
+ Type *ArgTypeL, Type *ArgTypeR,
+ unsigned IdxOff = 0);
+
Value *getReplacementInput(LLVMContext& Context, Instruction *I,
Instruction *J, unsigned o, bool FlipMemInputs);
void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
- bool &FlipMemInputs);
+ bool FlipMemInputs);
void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
Instruction *J, Instruction *K,
Instruction *&InsertionPt, Instruction *&K1,
- Instruction *&K2, bool &FlipMemInputs);
+ Instruction *&K2, bool FlipMemInputs);
void collectPairLoadMoveSet(BasicBlock &BB,
DenseMap<Value *, Value *> &ChosenPairs,
@@ -294,6 +314,10 @@ namespace {
DenseMap<Value *, Value *> &ChosenPairs,
std::multimap<Value *, Value *> &LoadMoveSet);
+ void collectPtrInfo(std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<Value *> &LowPtrInsts);
+
bool canMoveUsesOfIAfterJ(BasicBlock &BB,
std::multimap<Value *, Value *> &LoadMoveSet,
Instruction *I, Instruction *J);
@@ -303,12 +327,15 @@ namespace {
Instruction *&InsertionPt,
Instruction *I, Instruction *J);
+ void combineMetadata(Instruction *K, const Instruction *J);
+
bool vectorizeBB(BasicBlock &BB) {
bool changed = false;
// Iterate a sufficient number of times to merge types of size 1 bit,
// then 2 bits, then 4, etc. up to half of the target vector width of the
// target vector register.
- for (unsigned v = 2, n = 1;
+ unsigned n = 1;
+ for (unsigned v = 2;
v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter);
v *= 2, ++n) {
DEBUG(dbgs() << "BBV: fusing loop #" << n <<
@@ -320,6 +347,16 @@ namespace {
break;
}
+ if (changed && !Pow2LenOnly) {
+ ++n;
+ for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
+ DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
+ n << " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (!vectorizePairs(BB, true)) break;
+ }
+ }
+
DEBUG(dbgs() << "BBV: done!\n");
return changed;
}
@@ -341,15 +378,43 @@ namespace {
AU.setPreservesCFG();
}
- // This returns the vector type that holds a pair of the provided type.
- // If the provided type is already a vector, then its length is doubled.
- static inline VectorType *getVecTypeForPair(Type *ElemTy) {
+ static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
+ assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
+ "Cannot form vector from incompatible scalar types");
+ Type *STy = ElemTy->getScalarType();
+
+ unsigned numElem;
if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
- unsigned numElem = VTy->getNumElements();
- return VectorType::get(ElemTy->getScalarType(), numElem*2);
+ numElem = VTy->getNumElements();
+ } else {
+ numElem = 1;
}
- return VectorType::get(ElemTy, 2);
+ if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
+ numElem += VTy->getNumElements();
+ } else {
+ numElem += 1;
+ }
+
+ return VectorType::get(STy, numElem);
+ }
+
+ static inline void getInstructionTypes(Instruction *I,
+ Type *&T1, Type *&T2) {
+ if (isa<StoreInst>(I)) {
+ // For stores, it is the value type, not the pointer type that matters
+ // because the value is what will come from a vector register.
+
+ Value *IVal = cast<StoreInst>(I)->getValueOperand();
+ T1 = IVal->getType();
+ } else {
+ T1 = I->getType();
+ }
+
+ if (I->isCast())
+ T2 = cast<CastInst>(I)->getSrcTy();
+ else
+ T2 = T1;
}
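
To make the relaxed pairing concrete, here is the element-count arithmetic getVecTypeForPair now performs, with a small struct standing in for LLVM's type objects (the struct and helper are illustrations, not the LLVM API):

#include <cassert>
#include <cstdio>

// Illustrative stand-in for an LLVM type: a scalar kind plus an element
// count (0 means "not a vector", i.e. a scalar of that kind).
struct TypeSketch {
  int ScalarKind;    // e.g. 0 = i32, 1 = float, ...
  unsigned NumElem;  // 0 for scalars, >0 for vectors
};

// Mirrors getVecTypeForPair: scalars count as one element, vectors as
// their length, and the two members may now have *different* lengths.
static TypeSketch vecTypeForPair(TypeSketch A, TypeSketch B) {
  assert(A.ScalarKind == B.ScalarKind &&
         "Cannot form vector from incompatible scalar types");
  unsigned numElem = (A.NumElem ? A.NumElem : 1) +
                     (B.NumElem ? B.NumElem : 1);
  return TypeSketch{A.ScalarKind, numElem};
}

int main() {
  TypeSketch V2 = {0, 2}, V3 = {0, 3}, S = {0, 0};
  std::printf("%u\n", vecTypeForPair(V2, V3).NumElem); // 5: a non-pow2 length
  std::printf("%u\n", vecTypeForPair(S, V2).NumElem);  // 3
  return 0;
}
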
// Returns the weight associated with the provided value. A chain of
@@ -385,8 +450,7 @@ namespace {
// true if the offset could be determined to be some constant value.
// For example, if OffsetInElmts == 1, then J accesses the memory directly
// after I; if OffsetInElmts == -1 then I accesses the memory
- // directly after J. This function assumes that both instructions
- // have the same type.
+ // directly after J.
bool getPairPtrInfo(Instruction *I, Instruction *J,
Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
int64_t &OffsetInElmts) {
@@ -418,7 +482,12 @@ namespace {
Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
- assert(VTy == cast<PointerType>(JPtr->getType())->getElementType());
+ Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+ if (VTy != VTy2 && Offset < 0) {
+ int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
+ OffsetInElmts = Offset/VTy2TSS;
+ return (abs64(Offset) % VTy2TSS) == 0;
+ }
OffsetInElmts = Offset/VTyTSS;
return (abs64(Offset) % VTyTSS) == 0;
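
In sketch form, the pointer analysis above reduces to integer arithmetic: divide the byte offset between the accesses by an element store size, using J's size when the types differ and J is the lower access (the case this patch adds). The helper below is illustrative, not the LLVM API.

#include <cstdint>

// OffsetBytes: (address of J's access) - (address of I's access).
// Returns true when the offset is an exact multiple of the chosen
// element store size; the multiple itself goes in OffsetInElmts.
static bool offsetInElements(int64_t OffsetBytes,
                             int64_t ITypeStoreSize, int64_t JTypeStoreSize,
                             int64_t &OffsetInElmts) {
  int64_t TSS = (ITypeStoreSize != JTypeStoreSize && OffsetBytes < 0)
                    ? JTypeStoreSize : ITypeStoreSize;
  OffsetInElmts = OffsetBytes / TSS;
  int64_t Abs = OffsetBytes < 0 ? -OffsetBytes : OffsetBytes;
  return (Abs % TSS) == 0;
}
// e.g. a load directly after an 8-byte load: OffsetBytes == 8,
// TSS == 8, so OffsetInElmts == 1 and the pair is adjacent.
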
@@ -471,7 +540,7 @@ namespace {
// This function implements one vectorization iteration on the provided
// basic block. It returns true if the block is changed.
- bool BBVectorize::vectorizePairs(BasicBlock &BB) {
+ bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
bool ShouldContinue;
BasicBlock::iterator Start = BB.getFirstInsertionPt();
@@ -482,7 +551,7 @@ namespace {
std::vector<Value *> PairableInsts;
std::multimap<Value *, Value *> CandidatePairs;
ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
- PairableInsts);
+ PairableInsts, NonPow2Len);
if (PairableInsts.empty()) continue;
// Now we have a map of all of the pairable instructions and we need to
@@ -529,6 +598,10 @@ namespace {
// passes should coalesce the build/extract combinations.
fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
+
+ // It is important to clean up here so that future iterations of this
+ // function have less work to do.
+ (void) SimplifyInstructionsInBlock(&BB, TD);
return true;
}
@@ -567,6 +640,9 @@ namespace {
} else if (isa<SelectInst>(I)) {
if (!Config.VectorizeSelect)
return false;
+ } else if (isa<CmpInst>(I)) {
+ if (!Config.VectorizeCmp)
+ return false;
} else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
if (!Config.VectorizeGEP)
return false;
@@ -584,41 +660,39 @@ namespace {
return false;
Type *T1, *T2;
- if (isa<StoreInst>(I)) {
- // For stores, it is the value type, not the pointer type that matters
- // because the value is what will come from a vector register.
-
- Value *IVal = cast<StoreInst>(I)->getValueOperand();
- T1 = IVal->getType();
- } else {
- T1 = I->getType();
- }
-
- if (I->isCast())
- T2 = cast<CastInst>(I)->getSrcTy();
- else
- T2 = T1;
+ getInstructionTypes(I, T1, T2);
// Not every type can be vectorized...
if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
!(VectorType::isValidElementType(T2) || T2->isVectorTy()))
return false;
- if (!Config.VectorizeInts
- && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
- return false;
-
+ if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) {
+ if (!Config.VectorizeBools)
+ return false;
+ } else {
+ if (!Config.VectorizeInts
+ && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
+ return false;
+ }
+
if (!Config.VectorizeFloats
&& (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
return false;
+ // Don't vectorize target-specific types.
+ if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
+ return false;
+ if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
+ return false;
+
if ((!Config.VectorizePointers || TD == 0) &&
(T1->getScalarType()->isPointerTy() ||
T2->getScalarType()->isPointerTy()))
return false;
- if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 ||
- T2->getPrimitiveSizeInBits() > Config.VectorBits/2)
+ if (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
+ T2->getPrimitiveSizeInBits() >= Config.VectorBits)
return false;
return true;
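
Two width gates cooperate here: the per-instruction check above now rejects only types that already fill a whole register (the bound changed from VectorBits/2 to VectorBits because pair members may differ in length), while areInstsCompatible below rejects pairs whose fused width would overflow the register. A sketch of that pair-level gate, with plain bit counts standing in for LLVM types:

#include <algorithm>

// Pair-level width gate from areInstsCompatible: the combined result
// width and the combined source width (they differ for casts) must both
// fit within the vector register.
static bool pairFitsRegister(unsigned IT1Bits, unsigned IT2Bits,
                             unsigned JT1Bits, unsigned JT2Bits,
                             unsigned VectorBits) {
  unsigned MaxTypeBits = std::max(IT1Bits + JT1Bits, IT2Bits + JT2Bits);
  return MaxTypeBits <= VectorBits;
}
// e.g. pairing a <2 x i32> with a <3 x i32>: 64 + 96 = 160 bits, so the
// pair is rejected for a 128-bit register but accepted for 256 bits.
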
@@ -629,36 +703,25 @@ namespace {
// that I has already been determined to be vectorizable and that J is not
// in the use tree of I.
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore) {
+ bool IsSimpleLoadStore, bool NonPow2Len) {
DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
" <-> " << *J << "\n");
// Loads and stores can be merged if they have different alignments,
// but are otherwise the same.
- LoadInst *LI, *LJ;
- StoreInst *SI, *SJ;
- if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) {
- if (I->getType() != J->getType())
- return false;
+ if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
+ (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
+ return false;
- if (LI->getPointerOperand()->getType() !=
- LJ->getPointerOperand()->getType() ||
- LI->isVolatile() != LJ->isVolatile() ||
- LI->getOrdering() != LJ->getOrdering() ||
- LI->getSynchScope() != LJ->getSynchScope())
- return false;
- } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) {
- if (SI->getValueOperand()->getType() !=
- SJ->getValueOperand()->getType() ||
- SI->getPointerOperand()->getType() !=
- SJ->getPointerOperand()->getType() ||
- SI->isVolatile() != SJ->isVolatile() ||
- SI->getOrdering() != SJ->getOrdering() ||
- SI->getSynchScope() != SJ->getSynchScope())
- return false;
- } else if (!J->isSameOperationAs(I)) {
+ Type *IT1, *IT2, *JT1, *JT2;
+ getInstructionTypes(I, IT1, IT2);
+ getInstructionTypes(J, JT1, JT2);
+ unsigned MaxTypeBits = std::max(
+ IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
+ IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
+ if (MaxTypeBits > Config.VectorBits)
return false;
- }
+
// FIXME: handle addsub-type operations!
if (IsSimpleLoadStore) {
@@ -668,8 +731,11 @@ namespace {
if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
OffsetInElmts) && abs64(OffsetInElmts) == 1) {
if (Config.AlignedOnly) {
- Type *aType = isa<StoreInst>(I) ?
+ Type *aTypeI = isa<StoreInst>(I) ?
cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+ Type *aTypeJ = isa<StoreInst>(J) ?
+ cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+
// An aligned load or store is possible only if the instruction
// with the lower offset has an alignment suitable for the
// vector type.
@@ -677,7 +743,7 @@ namespace {
unsigned BottomAlignment = IAlignment;
if (OffsetInElmts < 0) BottomAlignment = JAlignment;
- Type *VType = getVecTypeForPair(aType);
+ Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
if (BottomAlignment < VecAlignment)
return false;
@@ -685,11 +751,6 @@ namespace {
} else {
return false;
}
- } else if (isa<ShuffleVectorInst>(I)) {
- // Only merge two shuffles if they're both constant
- return isa<Constant>(I->getOperand(2)) &&
- isa<Constant>(J->getOperand(2));
- // FIXME: We may want to vectorize non-constant shuffles also.
}
// The powi intrinsic is special because only the first argument is
@@ -772,7 +833,7 @@ namespace {
bool BBVectorize::getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts) {
+ std::vector<Value *> &PairableInsts, bool NonPow2Len) {
BasicBlock::iterator E = BB.end();
if (Start == E) return false;
@@ -808,7 +869,7 @@ namespace {
// J does not use I, and comes before the first use of I, so it can be
// merged with I if the instructions are compatible.
- if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue;
+ if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue;
// J is a candidate for merging with I.
if (!PairableInsts.size() ||
@@ -1430,24 +1491,27 @@ namespace {
// instruction that fuses I with J.
Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
Instruction *I, Instruction *J, unsigned o,
- bool &FlipMemInputs) {
+ bool FlipMemInputs) {
Value *IPtr, *JPtr;
unsigned IAlignment, JAlignment;
int64_t OffsetInElmts;
+
+ // Note: the analysis might fail here, which is why FlipMemInputs has
+ // been precomputed (OffsetInElmts must be unused here).
(void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
OffsetInElmts);
// The pointer value is taken to be the one with the lowest offset.
Value *VPtr;
- if (OffsetInElmts > 0) {
+ if (!FlipMemInputs) {
VPtr = IPtr;
} else {
- FlipMemInputs = true;
VPtr = JPtr;
}
- Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType();
- Type *VArgType = getVecTypeForPair(ArgType);
+ Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
+ Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+ Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
Type *VArgPtrType = PointerType::get(VArgType,
cast<PointerType>(IPtr->getType())->getAddressSpace());
return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
@@ -1455,15 +1519,17 @@ namespace {
}
void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
- unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
- unsigned IdxOffset, std::vector<Constant*> &Mask) {
- for (unsigned v = 0; v < NumElem/2; ++v) {
+ unsigned MaskOffset, unsigned NumInElem,
+ unsigned NumInElem1, unsigned IdxOffset,
+ std::vector<Constant*> &Mask) {
+ unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
+ for (unsigned v = 0; v < NumElem1; ++v) {
int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
if (m < 0) {
Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
} else {
unsigned mm = m + (int) IdxOffset;
- if (m >= (int) NumInElem)
+ if (m >= (int) NumInElem1)
mm += (int) NumInElem;
Mask[v+MaskOffset] =
@@ -1479,8 +1545,11 @@ namespace {
// This is the shuffle mask. We need to append the second
// mask to the first, and the numbers need to be adjusted.
- Type *ArgType = I->getType();
- Type *VArgType = getVecTypeForPair(ArgType);
+ Type *ArgTypeI = I->getType();
+ Type *ArgTypeJ = J->getType();
+ Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+ unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();
// Get the total number of elements in the fused vector type.
// By definition, this must equal the number of elements in
@@ -1488,19 +1557,81 @@ namespace {
unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
std::vector<Constant*> Mask(NumElem);
- Type *OpType = I->getOperand(0)->getType();
- unsigned NumInElem = cast<VectorType>(OpType)->getNumElements();
+ Type *OpTypeI = I->getOperand(0)->getType();
+ unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
+ Type *OpTypeJ = J->getOperand(0)->getType();
+ unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
+
+ // The fused vector will be:
+ // -----------------------------------------------------
+ // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
+ // -----------------------------------------------------
+ // from which we'll extract NumElem total elements (where the first NumElemI
+ // of them come from the mask in I and the remainder come from the mask
+ // in J).
// For the mask from the first pair...
- fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask);
+ fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
+ 0, Mask);
// For the mask from the second pair...
- fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem,
- Mask);
+ fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
+ NumInElemI, Mask);
return ConstantVector::get(Mask);
}
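
To see the renumbering concretely, here is the same index arithmetic over plain integers (no LLVM types; -1 stands for an undef lane), fusing a 2-wide shuffle of 2-element inputs with a 3-wide shuffle of 3-element inputs:

#include <cstdio>
#include <vector>

// Renumber one source mask into the fused operand space, as
// fillNewShuffleMask does: undef stays undef (-1 here), and an index
// into an instruction's second input must additionally skip over the
// other instruction's elements in the concatenated operand.
static void fillMaskSketch(const std::vector<int> &SrcMask,
                           unsigned MaskOffset, unsigned NumInElem,
                           unsigned NumInElem1, unsigned IdxOffset,
                           std::vector<int> &Mask) {
  for (unsigned v = 0; v < SrcMask.size(); ++v) {
    int m = SrcMask[v];
    if (m < 0) { Mask[v + MaskOffset] = -1; continue; }
    unsigned mm = m + IdxOffset;
    if (m >= (int) NumInElem1)
      mm += NumInElem;  // hop over the other pair member's input elements
    Mask[v + MaskOffset] = (int) mm;
  }
}

int main() {
  // I: 2-wide shuffle of 2-element inputs. J: 3-wide shuffle of
  // 3-element inputs. Fused operands: | 2 (I) | 3 (J) | 2 (I) | 3 (J) |.
  std::vector<int> MaskI = {0, 2}, MaskJ = {1, 3, 5}, Mask(5);
  fillMaskSketch(MaskI, 0, /*NumInElem=*/3, /*NumInElem1=*/2, /*IdxOffset=*/0, Mask);
  fillMaskSketch(MaskJ, 2, /*NumInElem=*/2, /*NumInElem1=*/3, /*IdxOffset=*/2, Mask);
  for (int m : Mask) std::printf("%d ", m);  // prints: 0 5 3 7 9
  std::printf("\n");
  return 0;
}
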
+ bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, Value *&LOp,
+ unsigned numElemL,
+ Type *ArgTypeL, Type *ArgTypeH,
+ unsigned IdxOff) {
+ bool ExpandedIEChain = false;
+ if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
+ // If we have a pure insertelement chain, then this can be rewritten
+ // into a chain that directly builds the larger type.
+ bool PureChain = true;
+ InsertElementInst *LIENext = LIE;
+ do {
+ if (!isa<UndefValue>(LIENext->getOperand(0)) &&
+ !isa<InsertElementInst>(LIENext->getOperand(0))) {
+ PureChain = false;
+ break;
+ }
+ } while ((LIENext =
+ dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
+
+ if (PureChain) {
+ SmallVector<Value *, 8> VectElemts(numElemL,
+ UndefValue::get(ArgTypeL->getScalarType()));
+ InsertElementInst *LIENext = LIE;
+ do {
+ unsigned Idx =
+ cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
+ VectElemts[Idx] = LIENext->getOperand(1);
+ } while ((LIENext =
+ dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
+
+ LIENext = 0;
+ Value *LIEPrev = UndefValue::get(ArgTypeH);
+ for (unsigned i = 0; i < numElemL; ++i) {
+ if (isa<UndefValue>(VectElemts[i])) continue;
+ LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
+ ConstantInt::get(Type::getInt32Ty(Context),
+ i + IdxOff),
+ getReplacementName(I, true, o, i+1));
+ LIENext->insertBefore(J);
+ LIEPrev = LIENext;
+ }
+
+ LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
+ ExpandedIEChain = true;
+ }
+ }
+
+ return ExpandedIEChain;
+ }
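
A toy model of what expandIEChain does once a pure chain is found: replay the (index, value) writes into a wider vector, offset by IdxOff, leaving untouched lanes undef. The container-based sketch below is illustrative only; the real code rebuilds an insertelement chain in IR.

#include <cstdio>
#include <utility>
#include <vector>

// Toy model of a pure insertelement chain: a list of (index, value)
// writes applied to an undef vector; -1 stands for an undef lane.
static std::vector<int>
expandChainSketch(const std::vector<std::pair<unsigned, int> > &Chain,
                  unsigned WideLen, unsigned IdxOff) {
  std::vector<int> Wide(WideLen, -1);
  for (unsigned i = 0; i < Chain.size(); ++i)
    Wide[Chain[i].first + IdxOff] = Chain[i].second;
  return Wide;
}

int main() {
  // A 3-element vector built as insert(insert(insert(undef,7,0),8,1),9,2),
  // re-based into a 5-element fused vector at offset 1.
  std::vector<std::pair<unsigned, int> > Chain;
  Chain.push_back(std::make_pair(0u, 7));
  Chain.push_back(std::make_pair(1u, 8));
  Chain.push_back(std::make_pair(2u, 9));
  std::vector<int> W = expandChainSketch(Chain, 5, 1);
  for (unsigned i = 0; i < W.size(); ++i) std::printf("%d ", W[i]);
  std::printf("\n");  // prints: -1 7 8 9 -1
  return 0;
}
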
+
// Returns the value to be used as the specified operand of the vector
// instruction that fuses I with J.
Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
@@ -1508,84 +1639,333 @@ namespace {
Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
- // Compute the fused vector type for this operand
- Type *ArgType = I->getOperand(o)->getType();
- VectorType *VArgType = getVecTypeForPair(ArgType);
+ // Compute the fused vector type for this operand
+ Type *ArgTypeI = I->getOperand(o)->getType();
+ Type *ArgTypeJ = J->getOperand(o)->getType();
+ VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
Instruction *L = I, *H = J;
+ Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
if (FlipMemInputs) {
L = J;
H = I;
+ ArgTypeL = ArgTypeJ;
+ ArgTypeH = ArgTypeI;
}
- if (ArgType->isVectorTy()) {
- unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
- std::vector<Constant*> Mask(numElem);
- for (unsigned v = 0; v < numElem; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ unsigned numElemL;
+ if (ArgTypeL->isVectorTy())
+ numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
+ else
+ numElemL = 1;
- Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
- H->getOperand(o),
- ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- BV->insertBefore(J);
- return BV;
+ unsigned numElemH;
+ if (ArgTypeH->isVectorTy())
+ numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
+ else
+ numElemH = 1;
+
+ Value *LOp = L->getOperand(o);
+ Value *HOp = H->getOperand(o);
+ unsigned numElem = VArgType->getNumElements();
+
+ // First, we check if we can reuse the "original" vector outputs (if these
+ // exist). We might need a shuffle.
+ ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
+ ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
+ ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
+ ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
+
+ // FIXME: If we're fusing shuffle instructions, then we can't apply this
+ // optimization. The input vectors to the shuffle might be a different
+ // length from the shuffle outputs. Unfortunately, the replacement
+ // shuffle mask has already been formed, and the mask entries are sensitive
+ // to the sizes of the inputs.
+ bool IsSizeChangeShuffle =
+ isa<ShuffleVectorInst>(L) &&
+ (LOp->getType() != L->getType() || HOp->getType() != H->getType());
+
+ if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
+ // We can have at most two unique vector inputs.
+ bool CanUseInputs = true;
+ Value *I1, *I2 = 0;
+ if (LEE) {
+ I1 = LEE->getOperand(0);
+ } else {
+ I1 = LSV->getOperand(0);
+ I2 = LSV->getOperand(1);
+ if (I2 == I1 || isa<UndefValue>(I2))
+ I2 = 0;
+ }
+
+ if (HEE) {
+ Value *I3 = HEE->getOperand(0);
+ if (!I2 && I3 != I1)
+ I2 = I3;
+ else if (I3 != I1 && I3 != I2)
+ CanUseInputs = false;
+ } else {
+ Value *I3 = HSV->getOperand(0);
+ if (!I2 && I3 != I1)
+ I2 = I3;
+ else if (I3 != I1 && I3 != I2)
+ CanUseInputs = false;
+
+ if (CanUseInputs) {
+ Value *I4 = HSV->getOperand(1);
+ if (!isa<UndefValue>(I4)) {
+ if (!I2 && I4 != I1)
+ I2 = I4;
+ else if (I4 != I1 && I4 != I2)
+ CanUseInputs = false;
+ }
+ }
+ }
+
+ if (CanUseInputs) {
+ unsigned LOpElem =
+ cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
+ ->getNumElements();
+ unsigned HOpElem =
+ cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
+ ->getNumElements();
+
+ // We have one or two input vectors. We need to map each index of the
+ // operands to the index of the original vector.
+ SmallVector<std::pair<int, int>, 8> II(numElem);
+ for (unsigned i = 0; i < numElemL; ++i) {
+ int Idx, INum;
+ if (LEE) {
+ Idx =
+ cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
+ INum = LEE->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx = LSV->getMaskValue(i);
+ if (Idx < (int) LOpElem) {
+ INum = LSV->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx -= LOpElem;
+ INum = LSV->getOperand(1) == I1 ? 0 : 1;
+ }
+ }
+
+ II[i] = std::pair<int, int>(Idx, INum);
+ }
+ for (unsigned i = 0; i < numElemH; ++i) {
+ int Idx, INum;
+ if (HEE) {
+ Idx =
+ cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
+ INum = HEE->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx = HSV->getMaskValue(i);
+ if (Idx < (int) HOpElem) {
+ INum = HSV->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx -= HOpElem;
+ INum = HSV->getOperand(1) == I1 ? 0 : 1;
+ }
+ }
+
+ II[i + numElemL] = std::pair<int, int>(Idx, INum);
+ }
+
+ // We now have an array which tells us from which index of which
+ // input vector each element of the operand comes.
+ VectorType *I1T = cast<VectorType>(I1->getType());
+ unsigned I1Elem = I1T->getNumElements();
+
+ if (!I2) {
+ // In this case there is only one underlying vector input. Check for
+ // the trivial case where we can use the input directly.
+ if (I1Elem == numElem) {
+ bool ElemInOrder = true;
+ for (unsigned i = 0; i < numElem; ++i) {
+ if (II[i].first != (int) i && II[i].first != -1) {
+ ElemInOrder = false;
+ break;
+ }
+ }
+
+ if (ElemInOrder)
+ return I1;
+ }
+
+ // A shuffle is needed.
+ std::vector<Constant *> Mask(numElem);
+ for (unsigned i = 0; i < numElem; ++i) {
+ int Idx = II[i].first;
+ if (Idx == -1)
+ Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
+ else
+ Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+ }
+
+ Instruction *S =
+ new ShuffleVectorInst(I1, UndefValue::get(I1T),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ S->insertBefore(J);
+ return S;
+ }
+
+ VectorType *I2T = cast<VectorType>(I2->getType());
+ unsigned I2Elem = I2T->getNumElements();
+
+ // This input comes from two distinct vectors. The first step is to
+ // make sure that both vectors are the same length. If not, the
+ // smaller one will need to grow before they can be shuffled together.
+ if (I1Elem < I2Elem) {
+ std::vector<Constant *> Mask(I2Elem);
+ unsigned v = 0;
+ for (; v < I1Elem; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < I2Elem; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ Instruction *NewI1 =
+ new ShuffleVectorInst(I1, UndefValue::get(I1T),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o, 1));
+ NewI1->insertBefore(J);
+ I1 = NewI1;
+ I1T = I2T;
+ I1Elem = I2Elem;
+ } else if (I1Elem > I2Elem) {
+ std::vector<Constant *> Mask(I1Elem);
+ unsigned v = 0;
+ for (; v < I2Elem; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < I1Elem; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ Instruction *NewI2 =
+ new ShuffleVectorInst(I2, UndefValue::get(I2T),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o, 1));
+ NewI2->insertBefore(J);
+ I2 = NewI2;
+ I2T = I1T;
+ I2Elem = I1Elem;
+ }
+
+ // Now that both I1 and I2 are the same length we can shuffle them
+ // together (and use the result).
+ std::vector<Constant *> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v) {
+ if (II[v].first == -1) {
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+ } else {
+ int Idx = II[v].first + II[v].second * I1Elem;
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+ }
+ }
+
+ Instruction *NewOp =
+ new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ NewOp->insertBefore(J);
+ return NewOp;
+ }
}
- // If these two inputs are the output of another vector instruction,
- // then we should use that output directly. It might be necessary to
- // permute it first. [When pairings are fused recursively, you can
- // end up with cases where a large vector is decomposed into scalars
- // using extractelement instructions, then built into size-2
- // vectors using insertelement and the into larger vectors using
- // shuffles. InstCombine does not simplify all of these cases well,
- // and so we make sure that shuffles are generated here when possible.
- ExtractElementInst *LEE
- = dyn_cast<ExtractElementInst>(L->getOperand(o));
- ExtractElementInst *HEE
- = dyn_cast<ExtractElementInst>(H->getOperand(o));
-
- if (LEE && HEE &&
- LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
- VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType());
- unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue();
- unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue();
- if (LEE->getOperand(0) == HEE->getOperand(0)) {
- if (LowIndx == 0 && HighIndx == 1)
- return LEE->getOperand(0);
-
- std::vector<Constant*> Mask(2);
- Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
- Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
-
- Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
- UndefValue::get(EEType),
- ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- BV->insertBefore(J);
- return BV;
+ Type *ArgType = ArgTypeL;
+ if (numElemL < numElemH) {
+ if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
+ ArgTypeL, VArgType, 1)) {
+ // This is another short-circuit case: we're combining a scalar into
+ // a vector that is formed by an IE chain. We've just expanded the IE
+ // chain, now insert the scalar and we're done.
+
+ Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
+ getReplacementName(I, true, o));
+ S->insertBefore(J);
+ return S;
+ } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
+ ArgTypeH)) {
+ // The two vector inputs to the shuffle must be the same length,
+ // so extend the smaller vector to be the same length as the larger one.
+ Instruction *NLOp;
+ if (numElemL > 1) {
+
+ std::vector<Constant *> Mask(numElemH);
+ unsigned v = 0;
+ for (; v < numElemL; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < numElemH; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o, 1));
+ } else {
+ NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
+ getReplacementName(I, true, o, 1));
+ }
+
+ NLOp->insertBefore(J);
+ LOp = NLOp;
}
- std::vector<Constant*> Mask(2);
- HighIndx += EEType->getNumElements();
- Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
- Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+ ArgType = ArgTypeH;
+ } else if (numElemL > numElemH) {
+ if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
+ ArgTypeH, VArgType)) {
+ Instruction *S =
+ InsertElementInst::Create(LOp, HOp,
+ ConstantInt::get(Type::getInt32Ty(Context),
+ numElemL),
+ getReplacementName(I, true, o));
+ S->insertBefore(J);
+ return S;
+ } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
+ ArgTypeL)) {
+ Instruction *NHOp;
+ if (numElemH > 1) {
+ std::vector<Constant *> Mask(numElemL);
+ unsigned v = 0;
+ for (; v < numElemH; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < numElemL; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o, 1));
+ } else {
+ NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
+ getReplacementName(I, true, o, 1));
+ }
+
+ NHOp->insertBefore(J);
+ HOp = NHOp;
+ }
+ }
- Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
- HEE->getOperand(0),
- ConstantVector::get(Mask),
- getReplacementName(I, true, o));
+ if (ArgType->isVectorTy()) {
+ unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+ std::vector<Constant*> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v) {
+ unsigned Idx = v;
+ // If the low vector was expanded, we need to skip the extra
+ // undefined entries.
+ if (v >= numElemL && numElemH > numElemL)
+ Idx += (numElemH - numElemL);
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+ }
+
+ Instruction *BV = new ShuffleVectorInst(LOp, HOp,
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
BV->insertBefore(J);
return BV;
}
Instruction *BV1 = InsertElementInst::Create(
- UndefValue::get(VArgType),
- L->getOperand(o), CV0,
+ UndefValue::get(VArgType), LOp, CV0,
getReplacementName(I, true, o, 1));
BV1->insertBefore(I);
- Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
- CV1,
+ Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
getReplacementName(I, true, o, 2));
BV2->insertBefore(J);
return BV2;
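
When the low input had to be padded with trailing undefs to match the high input's length, the final fuse mask must hop over that padding; a worked sketch of the index adjustment:

#include <cstdio>
#include <vector>

// Final fuse mask when LOp was padded from numElemL real lanes up to
// numElemH lanes with undefs: lanes past numElemL must skip the padding.
static std::vector<int> fuseMask(unsigned numElemL, unsigned numElemH) {
  unsigned numElem = numElemL + numElemH;
  std::vector<int> Mask(numElem);
  for (unsigned v = 0; v < numElem; ++v) {
    unsigned Idx = v;
    if (v >= numElemL && numElemH > numElemL)
      Idx += (numElemH - numElemL);  // hop over the undef padding in LOp
    Mask[v] = (int) Idx;
  }
  return Mask;
}

int main() {
  // L has 2 real elements padded to 3; H has 3. Shuffle lanes: padded L
  // occupies 0-2 (lane 2 undef) and H occupies 3-5.
  for (int m : fuseMask(2, 3)) std::printf("%d ", m);  // prints: 0 1 3 4 5
  std::printf("\n");
  return 0;
}
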
@@ -1596,8 +1976,7 @@ namespace {
void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
Instruction *I, Instruction *J,
SmallVector<Value *, 3> &ReplacedOperands,
- bool &FlipMemInputs) {
- FlipMemInputs = false;
+ bool FlipMemInputs) {
unsigned NumOperands = I->getNumOperands();
for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
@@ -1616,10 +1995,10 @@ namespace {
BasicBlock &BB = *I->getParent();
Module *M = BB.getParent()->getParent();
- Type *ArgType = I->getType();
- Type *VArgType = getVecTypeForPair(ArgType);
+ Type *ArgTypeI = I->getType();
+ Type *ArgTypeJ = J->getType();
+ Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
- // FIXME: is it safe to do this here?
ReplacedOperands[o] = Intrinsic::getDeclaration(M,
(Intrinsic::ID) IID, VArgType);
continue;
@@ -1648,36 +2027,60 @@ namespace {
Instruction *J, Instruction *K,
Instruction *&InsertionPt,
Instruction *&K1, Instruction *&K2,
- bool &FlipMemInputs) {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
-
+ bool FlipMemInputs) {
if (isa<StoreInst>(I)) {
AA->replaceWithNewValue(I, K);
AA->replaceWithNewValue(J, K);
} else {
Type *IType = I->getType();
- Type *VType = getVecTypeForPair(IType);
+ Type *JType = J->getType();
+
+ VectorType *VType = getVecTypeForPair(IType, JType);
+ unsigned numElem = VType->getNumElements();
+
+ unsigned numElemI, numElemJ;
+ if (IType->isVectorTy())
+ numElemI = cast<VectorType>(IType)->getNumElements();
+ else
+ numElemI = 1;
+
+ if (JType->isVectorTy())
+ numElemJ = cast<VectorType>(JType)->getNumElements();
+ else
+ numElemJ = 1;
if (IType->isVectorTy()) {
- unsigned numElem = cast<VectorType>(IType)->getNumElements();
- std::vector<Constant*> Mask1(numElem), Mask2(numElem);
- for (unsigned v = 0; v < numElem; ++v) {
- Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
- }
+ std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
+ for (unsigned v = 0; v < numElemI; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
+ }
- K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(
- FlipMemInputs ? Mask2 : Mask1),
- getReplacementName(K, false, 1));
- K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(
- FlipMemInputs ? Mask1 : Mask2),
- getReplacementName(K, false, 2));
+ K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(
+ FlipMemInputs ? Mask2 : Mask1),
+ getReplacementName(K, false, 1));
} else {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
getReplacementName(K, false, 1));
+ }
+
+ if (JType->isVectorTy()) {
+ std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
+ for (unsigned v = 0; v < numElemJ; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
+ }
+
+ K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(
+ FlipMemInputs ? Mask1 : Mask2),
+ getReplacementName(K, false, 2));
+ } else {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
getReplacementName(K, false, 2));
}
@@ -1778,6 +2181,61 @@ namespace {
}
}
+ // As with the aliasing information, SCEV can also change because of
+ // vectorization. This information is used to compute relative pointer
+ // offsets; the necessary information will be cached here prior to
+ // fusion.
+ void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<Value *> &LowPtrInsts) {
+ for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+ PIE = PairableInsts.end(); PI != PIE; ++PI) {
+ DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
+ if (P == ChosenPairs.end()) continue;
+
+ Instruction *I = cast<Instruction>(P->first);
+ Instruction *J = cast<Instruction>(P->second);
+
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ continue;
+
+ Value *IPtr, *JPtr;
+ unsigned IAlignment, JAlignment;
+ int64_t OffsetInElmts;
+ if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ OffsetInElmts) || abs64(OffsetInElmts) != 1)
+ llvm_unreachable("Pre-fusion pointer analysis failed");
+
+ Value *LowPI = (OffsetInElmts > 0) ? I : J;
+ LowPtrInsts.insert(LowPI);
+ }
+ }
+
+ // When the first instruction in each pair is cloned, it will inherit its
+ // parent's metadata. This metadata must be combined with that of the other
+ // instruction in a safe way.
+ void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
+ SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
+ K->getAllMetadataOtherThanDebugLoc(Metadata);
+ for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
+ unsigned Kind = Metadata[i].first;
+ MDNode *JMD = J->getMetadata(Kind);
+ MDNode *KMD = Metadata[i].second;
+
+ switch (Kind) {
+ default:
+ K->setMetadata(Kind, 0); // Remove unknown metadata
+ break;
+ case LLVMContext::MD_tbaa:
+ K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+ break;
+ case LLVMContext::MD_fpmath:
+ K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+ break;
+ }
+ }
+ }
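
The merge policy is deliberately conservative: kinds the pass does not understand are removed outright, and the known kinds (tbaa, fpmath) are widened to their most generic common form. A toy sketch of that policy, with strings standing in for MDNodes and "both sides agree" standing in for the real getMostGenericTBAA/getMostGenericFPMath queries:

#include <map>
#include <string>

enum MDKind { TBAA, FPMath, Unknown1, Unknown2 };

// Toy model: metadata is a map from kind to a string; an absent key
// means "no metadata of that kind". "Most generic" is simplified here
// to "keep only when both sides carry the same node".
static void combineMetadataSketch(std::map<MDKind, std::string> &K,
                                  const std::map<MDKind, std::string> &J) {
  for (std::map<MDKind, std::string>::iterator It = K.begin();
       It != K.end();) {
    std::map<MDKind, std::string>::const_iterator JIt = J.find(It->first);
    bool BothAgree = JIt != J.end() && JIt->second == It->second;
    switch (It->first) {
    case TBAA:
    case FPMath:
      if (BothAgree) { ++It; break; }  // keep the common (generic) node
      It = K.erase(It);                // otherwise widen to "none"
      break;
    default:
      It = K.erase(It);                // unknown kind: always drop
      break;
    }
  }
}
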
+
// This function fuses the chosen instruction pairs into vector instructions,
// taking care to preserve any needed scalar outputs; it then reorders the
// remaining instructions as needed (users of the first member of the pair
@@ -1804,6 +2262,9 @@ namespace {
std::multimap<Value *, Value *> LoadMoveSet;
collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
+ DenseSet<Value *> LowPtrInsts;
+ collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts);
+
DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
@@ -1843,7 +2304,10 @@ namespace {
continue;
}
- bool FlipMemInputs;
+ bool FlipMemInputs = false;
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end());
+
unsigned NumOperands = I->getNumOperands();
SmallVector<Value *, 3> ReplacedOperands(NumOperands);
getReplacementInputsForPair(Context, I, J, ReplacedOperands,
@@ -1855,7 +2319,9 @@ namespace {
if (I->hasName()) K->takeName(I);
if (!isa<StoreInst>(K))
- K->mutateType(getVecTypeForPair(I->getType()));
+ K->mutateType(getVecTypeForPair(I->getType(), J->getType()));
+
+ combineMetadata(K, J);
for (unsigned o = 0; o < NumOperands; ++o)
K->setOperand(o, ReplacedOperands[o]);
@@ -1947,6 +2413,7 @@ llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
//===----------------------------------------------------------------------===//
VectorizeConfig::VectorizeConfig() {
VectorBits = ::VectorBits;
+ VectorizeBools = !::NoBools;
VectorizeInts = !::NoInts;
VectorizeFloats = !::NoFloats;
VectorizePointers = !::NoPointers;
@@ -1954,6 +2421,7 @@ VectorizeConfig::VectorizeConfig() {
VectorizeMath = !::NoMath;
VectorizeFMA = !::NoFMA;
VectorizeSelect = !::NoSelect;
+ VectorizeCmp = !::NoCmp;
VectorizeGEP = !::NoGEP;
VectorizeMemOps = !::NoMemOps;
AlignedOnly = ::AlignedOnly;
@@ -1963,6 +2431,7 @@ VectorizeConfig::VectorizeConfig() {
SplatBreaksChain = ::SplatBreaksChain;
MaxInsts = ::MaxInsts;
MaxIter = ::MaxIter;
+ Pow2LenOnly = ::Pow2LenOnly;
NoMemOpBoost = ::NoMemOpBoost;
FastDep = ::FastDep;
}
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 4b66930..06cf1e4 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -2,3 +2,5 @@ add_llvm_library(LLVMVectorize
BBVectorize.cpp
Vectorize.cpp
)
+
+add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 7b39efb..7ef1131 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -20,11 +20,13 @@
#include "llvm/LLVMContext.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/InlineAsm.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Operator.h"
#include "llvm/Module.h"
+#include "llvm/TypeFinder.h"
#include "llvm/ValueSymbolTable.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallString.h"
@@ -99,7 +101,11 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
bool NeedsQuotes = isdigit(Name[0]);
if (!NeedsQuotes) {
for (unsigned i = 0, e = Name.size(); i != e; ++i) {
- char C = Name[i];
+ // By making this unsigned, the value passed in to isalnum will always be
+ // in the range 0-255. This is important when building with MSVC because
+ // its implementation will assert. This situation can arise when dealing
+ // with UTF-8 multibyte characters.
+ unsigned char C = Name[i];
if (!isalnum(C) && C != '-' && C != '.' && C != '_') {
NeedsQuotes = true;
break;
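
The comment above describes real undefined behaviour; a minimal standalone reproduction of the fix (the string content is an arbitrary example):

#include <cctype>
#include <cstdio>

int main() {
  const char Name[] = "caf\xC3\xA9";  // "café" in UTF-8
  for (unsigned i = 0; Name[i]; ++i) {
    // Plain char may be signed, so Name[i] can be negative for UTF-8
    // multibyte characters; passing a negative value (other than EOF)
    // to isalnum is undefined behaviour and asserts in MSVC's
    // implementation. The unsigned char copy keeps it in 0-255.
    unsigned char C = Name[i];
    std::printf("byte %u: isalnum=%d\n", i, std::isalnum(C) != 0);
  }
  return 0;
}
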
@@ -140,7 +146,7 @@ class TypePrinting {
public:
/// NamedTypes - The named types that are used by the current module.
- std::vector<StructType*> NamedTypes;
+ TypeFinder NamedTypes;
/// NumberedTypes - The numbered types, along with their value.
DenseMap<StructType*, unsigned> NumberedTypes;
@@ -159,7 +165,7 @@ public:
void TypePrinting::incorporateTypes(const Module &M) {
- M.findUsedStructTypes(NamedTypes);
+ NamedTypes.run(M, false);
// The list of struct types we got back includes all the struct types; split
// the unnamed ones out into a numbering and remove the anonymous structs.
@@ -708,8 +714,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
- if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf ||
- &CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle ||
+ if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle ||
&CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble) {
// We would like to output the FP constant value in exponential notation,
// but we cannot do this if doing so will lose precision. Check here to
@@ -759,16 +764,20 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
return;
}
- // Some form of long double. These appear as a magic letter identifying
- // the type, then a fixed number of hex digits.
+ // Either half, or some form of long double.
+ // These appear as a magic letter identifying the type, then a
+ // fixed number of hex digits.
Out << "0x";
+ // Bit position, in the current word, of the next nibble to print.
+ int shiftcount;
+
if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) {
Out << 'K';
// api needed to prevent premature destruction
APInt api = CFP->getValueAPF().bitcastToAPInt();
const uint64_t* p = api.getRawData();
uint64_t word = p[1];
- int shiftcount=12;
+ shiftcount = 12;
int width = api.getBitWidth();
for (int j=0; j<width; j+=4, shiftcount-=4) {
unsigned int nibble = (word>>shiftcount) & 15;
@@ -784,17 +793,21 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
}
}
return;
- } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad)
+ } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) {
+ shiftcount = 60;
Out << 'L';
- else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble)
+ } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) {
+ shiftcount = 60;
Out << 'M';
- else
+ } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) {
+ shiftcount = 12;
+ Out << 'H';
+ } else
llvm_unreachable("Unsupported floating point type");
// api needed to prevent premature destruction
APInt api = CFP->getValueAPF().bitcastToAPInt();
const uint64_t* p = api.getRawData();
uint64_t word = *p;
- int shiftcount=60;
int width = api.getBitWidth();
for (int j=0; j<width; j+=4, shiftcount-=4) {
unsigned int nibble = (word>>shiftcount) & 15;
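
The shiftcount values encode where the topmost nibble sits in the word being printed: 12 for the 16 significant bits of a half (and of the x87 format's top word), 60 for a full 64-bit word. A standalone sketch of the nibble walk (a single word only, for illustration):

#include <cstdint>
#include <cstdio>

// Print `width` bits of `word` as hex nibbles, most significant first,
// starting at bit position `shiftcount` -- the same walk the writer
// does over an APInt's raw words.
static void printNibbles(uint64_t word, int shiftcount, int width) {
  for (int j = 0; j < width; j += 4, shiftcount -= 4)
    std::printf("%X", (unsigned)((word >> shiftcount) & 15));
  std::printf("\n");
}

int main() {
  printNibbles(0x3C00u, 12, 16);  // an IEEE half 1.0: prints 3C00
  return 0;
}
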
@@ -1369,6 +1382,26 @@ static void PrintVisibility(GlobalValue::VisibilityTypes Vis,
}
}
+static void PrintThreadLocalModel(GlobalVariable::ThreadLocalMode TLM,
+ formatted_raw_ostream &Out) {
+ switch (TLM) {
+ case GlobalVariable::NotThreadLocal:
+ break;
+ case GlobalVariable::GeneralDynamicTLSModel:
+ Out << "thread_local ";
+ break;
+ case GlobalVariable::LocalDynamicTLSModel:
+ Out << "thread_local(localdynamic) ";
+ break;
+ case GlobalVariable::InitialExecTLSModel:
+ Out << "thread_local(initialexec) ";
+ break;
+ case GlobalVariable::LocalExecTLSModel:
+ Out << "thread_local(localexec) ";
+ break;
+ }
+}
+
void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
if (GV->isMaterializable())
Out << "; Materializable\n";
@@ -1381,8 +1414,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
PrintLinkage(GV->getLinkage(), Out);
PrintVisibility(GV->getVisibility(), Out);
+ PrintThreadLocalModel(GV->getThreadLocalMode(), Out);
- if (GV->isThreadLocal()) Out << "thread_local ";
if (unsigned AddressSpace = GV->getType()->getAddressSpace())
Out << "addrspace(" << AddressSpace << ") ";
if (GV->hasUnnamedAddr()) Out << "unnamed_addr ";
@@ -2004,19 +2037,22 @@ static void WriteMDNodeComment(const MDNode *Node,
formatted_raw_ostream &Out) {
if (Node->getNumOperands() < 1)
return;
- ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Node->getOperand(0));
- if (!CI) return;
- APInt Val = CI->getValue();
- APInt Tag = Val & ~APInt(Val.getBitWidth(), LLVMDebugVersionMask);
- if (Val.ult(LLVMDebugVersion11))
+
+ Value *Op = Node->getOperand(0);
+ if (!Op || !isa<ConstantInt>(Op) || cast<ConstantInt>(Op)->getBitWidth() < 32)
+ return;
+
+ DIDescriptor Desc(Node);
+ if (Desc.getVersion() < LLVMDebugVersion11)
return;
+ unsigned Tag = Desc.getTag();
Out.PadToColumn(50);
- if (Tag == dwarf::DW_TAG_user_base)
+ if (dwarf::TagString(Tag)) {
+ Out << "; ";
+ Desc.print(Out);
+ } else if (Tag == dwarf::DW_TAG_user_base) {
Out << "; [ DW_TAG_user_base ]";
- else if (Tag.isIntN(32)) {
- if (const char *TagName = dwarf::TagString(Tag.getZExtValue()))
- Out << "; [ " << TagName << " ]";
}
}
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
index c05132b..c8219eb 100644
--- a/lib/VMCore/Attributes.cpp
+++ b/lib/VMCore/Attributes.cpp
@@ -88,6 +88,9 @@ std::string Attribute::getAsString(Attributes Attrs) {
Result += utostr(Attribute::getAlignmentFromAttrs(Attrs));
Result += " ";
}
+ if (Attrs & Attribute::IANSDialect)
+ Result += "ia_nsdialect ";
+
// Trim the trailing space.
assert(!Result.empty() && "Unknown attribute!");
Result.erase(Result.end()-1);
@@ -131,8 +134,8 @@ class AttributeListImpl : public FoldingSetNode {
public:
SmallVector<AttributeWithIndex, 4> Attrs;
- AttributeListImpl(const AttributeWithIndex *Attr, unsigned NumAttrs)
- : Attrs(Attr, Attr+NumAttrs) {
+ AttributeListImpl(ArrayRef<AttributeWithIndex> attrs)
+ : Attrs(attrs.begin(), attrs.end()) {
RefCount = 0;
}
@@ -150,13 +153,12 @@ public:
}
void Profile(FoldingSetNodeID &ID) const {
- Profile(ID, Attrs.data(), Attrs.size());
+ Profile(ID, Attrs);
}
- static void Profile(FoldingSetNodeID &ID, const AttributeWithIndex *Attr,
- unsigned NumAttrs) {
- for (unsigned i = 0; i != NumAttrs; ++i) {
- ID.AddInteger(Attr[i].Attrs.Raw());
- ID.AddInteger(Attr[i].Index);
+ static void Profile(FoldingSetNodeID &ID, ArrayRef<AttributeWithIndex> Attrs){
+ for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
+ ID.AddInteger(Attrs[i].Attrs.Raw());
+ ID.AddInteger(Attrs[i].Index);
}
}
};
@@ -168,13 +170,13 @@ AttributeListImpl::~AttributeListImpl() {
}
-AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) {
+AttrListPtr AttrListPtr::get(ArrayRef<AttributeWithIndex> Attrs) {
// If there are no attributes then return a null AttributesList pointer.
- if (NumAttrs == 0)
+ if (Attrs.empty())
return AttrListPtr();
#ifndef NDEBUG
- for (unsigned i = 0; i != NumAttrs; ++i) {
+ for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
assert(Attrs[i].Attrs != Attribute::None &&
"Pointless attribute!");
assert((!i || Attrs[i-1].Index < Attrs[i].Index) &&
@@ -184,7 +186,7 @@ AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs)
// Otherwise, build a key to look up the existing attributes.
FoldingSetNodeID ID;
- AttributeListImpl::Profile(ID, Attrs, NumAttrs);
+ AttributeListImpl::Profile(ID, Attrs);
void *InsertPos;
sys::SmartScopedLock<true> Lock(*ALMutex);
@@ -195,7 +197,7 @@ AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs)
// If we didn't find any existing attributes of the same shape then
// create a new one and insert it.
if (!PAL) {
- PAL = new AttributeListImpl(Attrs, NumAttrs);
+ PAL = new AttributeListImpl(Attrs);
AttributesLists->InsertNode(PAL, InsertPos);
}
@@ -308,7 +310,7 @@ AttrListPtr AttrListPtr::addAttr(unsigned Idx, Attributes Attrs) const {
OldAttrList.begin()+i, OldAttrList.end());
}
- return get(NewAttrList.data(), NewAttrList.size());
+ return get(NewAttrList);
}
AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const {
@@ -343,7 +345,7 @@ AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const {
NewAttrList.insert(NewAttrList.end(),
OldAttrList.begin()+i, OldAttrList.end());
- return get(NewAttrList.data(), NewAttrList.size());
+ return get(NewAttrList);
}
void AttrListPtr::dump() const {
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 2e16372..094ca75 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -14,17 +14,32 @@
#include "llvm/AutoUpgrade.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instruction.h"
+#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IRBuilder.h"
#include <cstring>
using namespace llvm;
+// Upgrade the declarations of the SSE4.1 functions whose arguments have
+// changed their type from v4f32 to v2i64.
+static bool UpgradeSSE41Function(Function* F, Intrinsic::ID IID,
+ Function *&NewFn) {
+ // Check whether this is an old version of the function, which received
+ // v4f32 arguments.
+ Type *Arg0Type = F->getFunctionType()->getParamType(0);
+ if (Arg0Type != VectorType::get(Type::getFloatTy(F->getContext()), 4))
+ return false;
+
+ // Yes, it's old; replace it with the new version.
+ F->setName(F->getName() + ".old");
+ NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+ return true;
+}
static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
assert(F && "Illegal to upgrade a non-existent Function.");
@@ -37,6 +52,27 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
switch (Name[0]) {
default: break;
+ case 'a': {
+ if (Name.startswith("arm.neon.vclz")) {
+ Type* args[2] = {
+ F->arg_begin()->getType(),
+ Type::getInt1Ty(F->getContext())
+ };
+ // Can't use Intrinsic::getDeclaration here as it adds a ".i1" to
+ // the end of the name. Change name from llvm.arm.neon.vclz.* to
+ // llvm.ctlz.*
+ FunctionType* fType = FunctionType::get(F->getReturnType(), args, false);
+ NewFn = Function::Create(fType, F->getLinkage(),
+ "llvm.ctlz." + Name.substr(14), F->getParent());
+ return true;
+ }
+ if (Name.startswith("arm.neon.vcnt")) {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
+ F->arg_begin()->getType());
+ return true;
+ }
+ break;
+ }
case 'c': {
if (Name.startswith("ctlz.") && F->arg_size() == 1) {
F->setName(Name + ".old");
@@ -57,17 +93,49 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name.startswith("x86.sse2.pcmpgt.") ||
Name.startswith("x86.avx2.pcmpeq.") ||
Name.startswith("x86.avx2.pcmpgt.") ||
- Name.startswith("x86.avx.vpermil.")) {
+ Name.startswith("x86.avx.vpermil.") ||
+ Name == "x86.avx.movnt.dq.256" ||
+ Name == "x86.avx.movnt.pd.256" ||
+ Name == "x86.avx.movnt.ps.256" ||
+ (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
NewFn = 0;
return true;
}
+ // SSE4.1 ptest functions may have an old signature.
+ if (Name.startswith("x86.sse41.ptest")) {
+ if (Name == "x86.sse41.ptestc")
+ return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn);
+ if (Name == "x86.sse41.ptestz")
+ return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn);
+ if (Name == "x86.sse41.ptestnzc")
+ return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
+ }
+ // frcz.ss/sd may need to have an argument dropped
+ if (Name.startswith("x86.xop.vfrcz.ss") && F->arg_size() == 2) {
+ F->setName(Name + ".old");
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::x86_xop_vfrcz_ss);
+ return true;
+ }
+ if (Name.startswith("x86.xop.vfrcz.sd") && F->arg_size() == 2) {
+ F->setName(Name + ".old");
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::x86_xop_vfrcz_sd);
+ return true;
+ }
+ // Fix the FMA4 intrinsics to remove the 4
+ if (Name.startswith("x86.fma4.")) {
+ F->setName("llvm.x86.fma" + Name.substr(8));
+ NewFn = F;
+ return true;
+ }
break;
}
}
- // This may not belong here. This function is effectively being overloaded
- // to both detect an intrinsic which needs upgrading, and to provide the
- // upgraded form of the intrinsic. We should perhaps have two separate
+ // This may not belong here. This function is effectively being overloaded
+ // to both detect an intrinsic which needs upgrading, and to provide the
+ // upgraded form of the intrinsic. We should perhaps have two separate
// functions for this.
return false;
}
@@ -89,8 +157,8 @@ bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) {
return false;
}
-// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
-// upgraded intrinsic. All argument and return casting must be provided in
+// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
+// upgraded intrinsic. All argument and return casting must be provided in
// order to seamlessly integrate with existing context.
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Function *F = CI->getCalledFunction();
@@ -118,15 +186,85 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
"pcmpgt");
// need to sign extend since icmp returns vector of i1
Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+ } else if (Name == "llvm.x86.avx.movnt.dq.256" ||
+ Name == "llvm.x86.avx.movnt.ps.256" ||
+ Name == "llvm.x86.avx.movnt.pd.256") {
+ IRBuilder<> Builder(C);
+ Builder.SetInsertPoint(CI->getParent(), CI);
+
+ Module *M = F->getParent();
+ SmallVector<Value *, 1> Elts;
+ Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+ MDNode *Node = MDNode::get(C, Elts);
+
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ // Convert the type of the pointer to a pointer to the stored type.
+ Value *BC = Builder.CreateBitCast(Arg0,
+ PointerType::getUnqual(Arg1->getType()),
+ "cast");
+ StoreInst *SI = Builder.CreateStore(Arg1, BC);
+ SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+ SI->setAlignment(16);
+
+ // Remove intrinsic.
+ CI->eraseFromParent();
+ return;
+ } else if (Name.startswith("llvm.x86.xop.vpcom")) {
+ Intrinsic::ID intID;
+ if (Name.endswith("ub"))
+ intID = Intrinsic::x86_xop_vpcomub;
+ else if (Name.endswith("uw"))
+ intID = Intrinsic::x86_xop_vpcomuw;
+ else if (Name.endswith("ud"))
+ intID = Intrinsic::x86_xop_vpcomud;
+ else if (Name.endswith("uq"))
+ intID = Intrinsic::x86_xop_vpcomuq;
+ else if (Name.endswith("b"))
+ intID = Intrinsic::x86_xop_vpcomb;
+ else if (Name.endswith("w"))
+ intID = Intrinsic::x86_xop_vpcomw;
+ else if (Name.endswith("d"))
+ intID = Intrinsic::x86_xop_vpcomd;
+ else if (Name.endswith("q"))
+ intID = Intrinsic::x86_xop_vpcomq;
+ else
+ llvm_unreachable("Unknown suffix");
+
+ Name = Name.substr(18); // strip off "llvm.x86.xop.vpcom"
+ unsigned Imm;
+ if (Name.startswith("lt"))
+ Imm = 0;
+ else if (Name.startswith("le"))
+ Imm = 1;
+ else if (Name.startswith("gt"))
+ Imm = 2;
+ else if (Name.startswith("ge"))
+ Imm = 3;
+ else if (Name.startswith("eq"))
+ Imm = 4;
+ else if (Name.startswith("ne"))
+ Imm = 5;
+ else if (Name.startswith("true"))
+ Imm = 6;
+ else if (Name.startswith("false"))
+ Imm = 7;
+ else
+ llvm_unreachable("Unknown condition");
+
+ Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID);
+ Rep = Builder.CreateCall3(VPCOM, CI->getArgOperand(0),
+ CI->getArgOperand(1), Builder.getInt8(Imm));
} else {
bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
- if (Name.startswith("llvm.x86.avx.vpermil.pd.256"))
+ if (Name == "llvm.x86.avx.vpermil.pd.256")
PD256 = true;
- else if (Name.startswith("llvm.x86.avx.vpermil.pd"))
+ else if (Name == "llvm.x86.avx.vpermil.pd")
PD128 = true;
- else if (Name.startswith("llvm.x86.avx.vpermil.ps.256"))
+ else if (Name == "llvm.x86.avx.vpermil.ps.256")
PS256 = true;
- else if (Name.startswith("llvm.x86.avx.vpermil.ps"))
+ else if (Name == "llvm.x86.avx.vpermil.ps")
PS128 = true;
if (PD256 || PD128 || PS256 || PS128) {
@@ -162,6 +300,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
return;
}
+ std::string Name = CI->getName().str();
+ CI->setName(Name + ".old");
+
switch (NewFn->getIntrinsicID()) {
default:
llvm_unreachable("Unknown function for CallInst upgrade.");
@@ -170,12 +311,60 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::cttz:
assert(CI->getNumArgOperands() == 1 &&
"Mismatch between function args and call args");
- StringRef Name = CI->getName();
- CI->setName(Name + ".old");
CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
Builder.getFalse(), Name));
CI->eraseFromParent();
return;
+
+ case Intrinsic::arm_neon_vclz: {
+ // Change name from llvm.arm.neon.vclz.* to llvm.ctlz.*
+ CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
+ Builder.getFalse(),
+ "llvm.ctlz." + Name.substr(14)));
+ CI->eraseFromParent();
+ return;
+ }
+ case Intrinsic::ctpop: {
+ CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(0)));
+ CI->eraseFromParent();
+ return;
+ }
+
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+ CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(1),
+ Name));
+ CI->eraseFromParent();
+ return;
+
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_sse41_ptestnzc: {
+ // The arguments for these intrinsics used to be v4f32, and changed
+ // to v2i64. This is purely a nop, since those are bitwise intrinsics.
+ // So, the only thing required is a bitcast for both arguments.
+ // First, check the arguments have the old type.
+ Value *Arg0 = CI->getArgOperand(0);
+ if (Arg0->getType() != VectorType::get(Type::getFloatTy(C), 4))
+ return;
+
+ // Old intrinsic, add bitcasts
+ Value *Arg1 = CI->getArgOperand(1);
+
+ Value *BC0 =
+ Builder.CreateBitCast(Arg0,
+ VectorType::get(Type::getInt64Ty(C), 2),
+ "cast");
+ Value *BC1 =
+ Builder.CreateBitCast(Arg1,
+ VectorType::get(Type::getInt64Ty(C), 2),
+ "cast");
+
+ CallInst* NewCall = Builder.CreateCall2(NewFn, BC0, BC1, Name);
+ CI->replaceAllUsesWith(NewCall);
+ CI->eraseFromParent();
+ return;
+ }
}
}
diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt
index e1efcda..6a20be6 100644
--- a/lib/VMCore/CMakeLists.txt
+++ b/lib/VMCore/CMakeLists.txt
@@ -8,7 +8,9 @@ add_llvm_library(LLVMCore
ConstantFold.cpp
Constants.cpp
Core.cpp
+ DebugInfo.cpp
DebugLoc.cpp
+ DIBuilder.cpp
Dominators.cpp
Function.cpp
GCOV.cpp
@@ -29,6 +31,7 @@ add_llvm_library(LLVMCore
PassRegistry.cpp
PrintModulePass.cpp
Type.cpp
+ TypeFinder.cpp
Use.cpp
User.cpp
Value.cpp
@@ -36,3 +39,14 @@ add_llvm_library(LLVMCore
ValueTypes.cpp
Verifier.cpp
)
+
+# Workaround: It takes over 20 minutes to compile with msvc10.
+# FIXME: Suppressing optimizations for core libraries would not be a good thing.
+if( MSVC_VERSION EQUAL 1600 )
+set_property(
+ SOURCE Function.cpp
+ PROPERTY COMPILE_FLAGS "/Og-"
+ )
+endif()
+
+add_dependencies(LLVMCore intrinsics_gen)
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
index b743287..8e82876 100644
--- a/lib/VMCore/ConstantFold.cpp
+++ b/lib/VMCore/ConstantFold.cpp
@@ -55,13 +55,12 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
Type *DstEltTy = DstTy->getElementType();
- // Check to verify that all elements of the input are simple.
SmallVector<Constant*, 16> Result;
+ Type *Ty = IntegerType::get(CV->getContext(), 32);
for (unsigned i = 0; i != NumElts; ++i) {
- Constant *C = CV->getAggregateElement(i);
- if (C == 0) return 0;
+ Constant *C =
+ ConstantExpr::getExtractElement(CV, ConstantInt::get(Ty, i));
C = ConstantExpr::getBitCast(C, DstEltTy);
- if (isa<ConstantExpr>(C)) return 0;
Result.push_back(C);
}
@@ -553,9 +552,12 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
SmallVector<Constant*, 16> res;
VectorType *DestVecTy = cast<VectorType>(DestTy);
Type *DstEltTy = DestVecTy->getElementType();
- for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i)
- res.push_back(ConstantExpr::getCast(opc,
- V->getAggregateElement(i), DstEltTy));
+ Type *Ty = IntegerType::get(V->getContext(), 32);
+ for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) {
+ Constant *C =
+ ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i));
+ res.push_back(ConstantExpr::getCast(opc, C, DstEltTy));
+ }
return ConstantVector::get(res);
}
@@ -696,12 +698,13 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
// If the condition is a vector constant, fold the result elementwise.
if (ConstantVector *CondV = dyn_cast<ConstantVector>(Cond)) {
SmallVector<Constant*, 16> Result;
+ Type *Ty = IntegerType::get(CondV->getContext(), 32);
for (unsigned i = 0, e = V1->getType()->getVectorNumElements(); i != e;++i){
ConstantInt *Cond = dyn_cast<ConstantInt>(CondV->getOperand(i));
if (Cond == 0) break;
- Constant *Res = (Cond->getZExtValue() ? V2 : V1)->getAggregateElement(i);
- if (Res == 0) break;
+ Constant *V = Cond->isNullValue() ? V2 : V1;
+ Constant *Res = ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i));
Result.push_back(Res);
}
@@ -721,12 +724,12 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
if (ConstantExpr *TrueVal = dyn_cast<ConstantExpr>(V1)) {
if (TrueVal->getOpcode() == Instruction::Select)
if (TrueVal->getOperand(0) == Cond)
- return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2);
+ return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2);
}
if (ConstantExpr *FalseVal = dyn_cast<ConstantExpr>(V2)) {
if (FalseVal->getOpcode() == Instruction::Select)
if (FalseVal->getOperand(0) == Cond)
- return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2));
+ return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2));
}
return 0;
@@ -760,16 +763,16 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
const APInt &IdxVal = CIdx->getValue();
SmallVector<Constant*, 16> Result;
+ Type *Ty = IntegerType::get(Val->getContext(), 32);
for (unsigned i = 0, e = Val->getType()->getVectorNumElements(); i != e; ++i){
if (i == IdxVal) {
Result.push_back(Elt);
continue;
}
- if (Constant *C = Val->getAggregateElement(i))
- Result.push_back(C);
- else
- return 0;
+ Constant *C =
+ ConstantExpr::getExtractElement(Val, ConstantInt::get(Ty, i));
+ Result.push_back(C);
}
return ConstantVector::get(Result);
@@ -801,11 +804,15 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1,
Constant *InElt;
if (unsigned(Elt) >= SrcNumElts*2)
InElt = UndefValue::get(EltTy);
- else if (unsigned(Elt) >= SrcNumElts)
- InElt = V2->getAggregateElement(Elt - SrcNumElts);
- else
- InElt = V1->getAggregateElement(Elt);
- if (InElt == 0) return 0;
+ else if (unsigned(Elt) >= SrcNumElts) {
+ Type *Ty = IntegerType::get(V2->getContext(), 32);
+ InElt =
+ ConstantExpr::getExtractElement(V2,
+ ConstantInt::get(Ty, Elt - SrcNumElts));
+ } else {
+ Type *Ty = IntegerType::get(V1->getContext(), 32);
+ InElt = ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, Elt));
+ }
Result.push_back(InElt);
}
@@ -1130,16 +1137,17 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
} else if (VectorType *VTy = dyn_cast<VectorType>(C1->getType())) {
// Perform elementwise folding.
SmallVector<Constant*, 16> Result;
+ Type *Ty = IntegerType::get(VTy->getContext(), 32);
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
- Constant *LHS = C1->getAggregateElement(i);
- Constant *RHS = C2->getAggregateElement(i);
- if (LHS == 0 || RHS == 0) break;
+ Constant *LHS =
+ ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
+ Constant *RHS =
+ ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
}
- if (Result.size() == VTy->getNumElements())
- return ConstantVector::get(Result);
+ return ConstantVector::get(Result);
}
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
@@ -1697,17 +1705,18 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
// If we can constant fold the comparison of each element, constant fold
// the whole vector comparison.
SmallVector<Constant*, 4> ResElts;
+ Type *Ty = IntegerType::get(C1->getContext(), 32);
// Compare the elements, producing an i1 result or constant expr.
for (unsigned i = 0, e = C1->getType()->getVectorNumElements(); i != e;++i){
- Constant *C1E = C1->getAggregateElement(i);
- Constant *C2E = C2->getAggregateElement(i);
- if (C1E == 0 || C2E == 0) break;
+ Constant *C1E =
+ ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
+ Constant *C2E =
+ ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E));
}
- if (ResElts.size() == C1->getType()->getVectorNumElements())
- return ConstantVector::get(ResElts);
+ return ConstantVector::get(ResElts);
}
if (C1->getType()->isFloatingPointTy()) {
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
index 6dbc144..a4e21e1 100644
--- a/lib/VMCore/Constants.cpp
+++ b/lib/VMCore/Constants.cpp
@@ -46,7 +46,7 @@ bool Constant::isNegativeZeroValue() const {
// Floating point values have an explicit -0.0 value.
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
return CFP->isZero() && CFP->isNegative();
-
+
// Otherwise, just use +0.0.
return isNullValue();
}
@@ -55,7 +55,7 @@ bool Constant::isNullValue() const {
// 0 is null.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
return CI->isZero();
-
+
// +0.0 is null.
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
return CFP->isZero() && !CFP->isNegative();
@@ -161,19 +161,19 @@ Constant *Constant::getAllOnesValue(Type *Ty) {
Constant *Constant::getAggregateElement(unsigned Elt) const {
if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(this))
return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : 0;
-
+
if (const ConstantArray *CA = dyn_cast<ConstantArray>(this))
return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : 0;
-
+
if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : 0;
-
+
if (const ConstantAggregateZero *CAZ =dyn_cast<ConstantAggregateZero>(this))
return CAZ->getElementValue(Elt);
-
+
if (const UndefValue *UV = dyn_cast<UndefValue>(this))
return UV->getElementValue(Elt);
-
+
if (const ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(this))
return Elt < CDS->getNumElements() ? CDS->getElementAsConstant(Elt) : 0;
return 0;
@@ -222,10 +222,10 @@ bool Constant::canTrap() const {
// The only thing that could possibly trap are constant exprs.
const ConstantExpr *CE = dyn_cast<ConstantExpr>(this);
if (!CE) return false;
-
- // ConstantExpr traps if any operands can trap.
+
+ // ConstantExpr traps if any operands can trap.
for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
- if (CE->getOperand(i)->canTrap())
+ if (CE->getOperand(i)->canTrap())
return true;
// Otherwise, only specific operations can trap.
@@ -252,7 +252,7 @@ bool Constant::isConstantUsed() const {
const Constant *UC = dyn_cast<Constant>(*UI);
if (UC == 0 || isa<GlobalValue>(UC))
return true;
-
+
if (UC->isConstantUsed())
return true;
}
@@ -302,12 +302,12 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const {
cast<BlockAddress>(RHS->getOperand(0))->getFunction())
return NoRelocation;
}
-
+
PossibleRelocationsTy Result = NoRelocation;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
Result = std::max(Result,
cast<Constant>(getOperand(i))->getRelocationInfo());
-
+
return Result;
}
@@ -316,14 +316,14 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const {
/// constantexpr.
static bool removeDeadUsersOfConstant(const Constant *C) {
if (isa<GlobalValue>(C)) return false; // Cannot remove this
-
+
while (!C->use_empty()) {
const Constant *User = dyn_cast<Constant>(C->use_back());
if (!User) return false; // Non-constant usage;
if (!removeDeadUsersOfConstant(User))
return false; // Constant wasn't dead
}
-
+
const_cast<Constant*>(C)->destroyConstant();
return true;
}
@@ -343,7 +343,7 @@ void Constant::removeDeadConstantUsers() const {
++I;
continue;
}
-
+
if (!removeDeadUsersOfConstant(User)) {
// If the constant wasn't dead, remember that this was the last live use
// and move on to the next constant.
@@ -351,7 +351,7 @@ void Constant::removeDeadConstantUsers() const {
++I;
continue;
}
-
+
// If the constant was dead, then the iterator is invalidated.
if (LastNonDeadUser == E) {
I = use_begin();
@@ -485,7 +485,7 @@ static const fltSemantics *TypeToFloatSemantics(Type *Ty) {
return &APFloat::x87DoubleExtended;
else if (Ty->isFP128Ty())
return &APFloat::IEEEquad;
-
+
assert(Ty->isPPC_FP128Ty() && "Unknown FP format");
return &APFloat::PPCDoubleDouble;
}
@@ -497,7 +497,7 @@ void ConstantFP::anchor() { }
/// 2.0/1.0 etc, that are known-valid both as double and as the target format.
Constant *ConstantFP::get(Type *Ty, double V) {
LLVMContext &Context = Ty->getContext();
-
+
APFloat FV(V);
bool ignored;
FV.convert(*TypeToFloatSemantics(Ty->getScalarType()),
@@ -550,11 +550,11 @@ Constant *ConstantFP::getZeroValueForNegation(Type *Ty) {
// ConstantFP accessors.
ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
DenseMapAPFloatKeyInfo::KeyTy Key(V);
-
+
LLVMContextImpl* pImpl = Context.pImpl;
-
+
ConstantFP *&Slot = pImpl->FPConstants[Key];
-
+
if (!Slot) {
Type *Ty;
if (&V.getSemantics() == &APFloat::IEEEhalf)
@@ -574,7 +574,7 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
}
Slot = new ConstantFP(Ty, V);
}
-
+
return Slot;
}
@@ -695,7 +695,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) {
"Wrong type in array element initializer");
}
LLVMContextImpl *pImpl = Ty->getContext().pImpl;
-
+
// If this is an all-zero array, return a ConstantAggregateZero object. If
  // all undef, return an UndefValue; if "all simple", then return a
// ConstantDataArray.
@@ -751,7 +751,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) {
return ConstantDataArray::get(C->getContext(), Elts);
}
}
-
+
if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
if (CFP->getType()->isFloatTy()) {
SmallVector<float, 16> Elts;
@@ -788,7 +788,7 @@ StructType *ConstantStruct::getTypeForElements(LLVMContext &Context,
SmallVector<Type*, 16> EltTypes(VecSize);
for (unsigned i = 0; i != VecSize; ++i)
EltTypes[i] = V[i]->getType();
-
+
return StructType::get(Context, EltTypes, Packed);
}
@@ -833,12 +833,12 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
isUndef = false;
}
}
- }
+ }
if (isZero)
return ConstantAggregateZero::get(ST);
if (isUndef)
return UndefValue::get(ST);
-
+
return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
}
@@ -881,12 +881,12 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) {
break;
}
}
-
+
if (isZero)
return ConstantAggregateZero::get(T);
if (isUndef)
return UndefValue::get(T);
-
+
// Check to see if all of the elements are ConstantFP or ConstantInt and if
// the element type is compatible with ConstantDataVector. If so, use it.
if (ConstantDataSequential::isElementTypeCompatible(C->getType())) {
@@ -932,7 +932,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) {
return ConstantDataVector::get(C->getContext(), Elts);
}
}
-
+
if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
if (CFP->getType()->isFloatTy()) {
SmallVector<float, 16> Elts;
@@ -955,7 +955,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) {
}
}
}
-
+
// Otherwise, the element type isn't compatible with ConstantDataVector, or
  // the operand list contains a ConstantExpr or something else strange.
return pImpl->VectorConstants.getOrCreate(T, V);
@@ -967,7 +967,7 @@ Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) {
if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) &&
ConstantDataSequential::isElementTypeCompatible(V->getType()))
return ConstantDataVector::getSplat(NumElts, V);
-
+
SmallVector<Constant*, 32> Elts(NumElts, V);
return get(Elts);
}
@@ -1039,7 +1039,7 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
SmallVector<Constant*, 8> NewOps;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
NewOps.push_back(i == OpNo ? Op : getOperand(i));
-
+
return getWithOperands(NewOps);
}
@@ -1052,7 +1052,7 @@ getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const {
bool AnyChange = Ty != getType();
for (unsigned i = 0; i != Ops.size(); ++i)
AnyChange |= Ops[i] != getOperand(i);
-
+
if (!AnyChange) // No operands changed, return self.
return const_cast<ConstantExpr*>(this);
@@ -1177,7 +1177,7 @@ ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) {
ConstantAggregateZero *&Entry = Ty->getContext().pImpl->CAZConstants[Ty];
if (Entry == 0)
Entry = new ConstantAggregateZero(Ty);
-
+
return Entry;
}
@@ -1232,7 +1232,7 @@ ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) {
ConstantPointerNull *&Entry = Ty->getContext().pImpl->CPNConstants[Ty];
if (Entry == 0)
Entry = new ConstantPointerNull(Ty);
-
+
return Entry;
}
@@ -1252,7 +1252,7 @@ UndefValue *UndefValue::get(Type *Ty) {
UndefValue *&Entry = Ty->getContext().pImpl->UVConstants[Ty];
if (Entry == 0)
Entry = new UndefValue(Ty);
-
+
return Entry;
}
@@ -1277,7 +1277,7 @@ BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) {
F->getContext().pImpl->BlockAddresses[std::make_pair(F, BB)];
if (BA == 0)
BA = new BlockAddress(F, BB);
-
+
assert(BA->getFunction() == F && "Basic block moved between functions");
return BA;
}
@@ -1305,19 +1305,19 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) {
// case, we have to remove the map entry.
Function *NewF = getFunction();
BasicBlock *NewBB = getBasicBlock();
-
+
if (U == &Op<0>())
NewF = cast<Function>(To);
else
NewBB = cast<BasicBlock>(To);
-
+
// See if the 'new' entry already exists, if not, just update this in place
// and return early.
BlockAddress *&NewBA =
getContext().pImpl->BlockAddresses[std::make_pair(NewF, NewBB)];
if (NewBA == 0) {
getBasicBlock()->AdjustBlockAddressRefCount(-1);
-
+
    // Remove the old entry; this can't cause the map to rehash (just a
// tombstone will get added).
getContext().pImpl->BlockAddresses.erase(std::make_pair(getFunction(),
@@ -1331,10 +1331,10 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) {
// Otherwise, I do need to replace this with an existing value.
assert(NewBA != this && "I didn't contain From!");
-
+
// Everyone using this now uses the replacement.
replaceAllUsesWith(NewBA);
-
+
destroyConstant();
}
@@ -1355,10 +1355,10 @@ static inline Constant *getFoldedCast(
// Look up the constant in the table first to ensure uniqueness
std::vector<Constant*> argVec(1, C);
ExprMapKeyType Key(opc, argVec);
-
+
return pImpl->ExprConstants.getOrCreate(Ty, Key);
}
-
+
Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
Instruction::CastOps opc = Instruction::CastOps(oc);
assert(Instruction::isCast(opc) && "opcode out of range");
@@ -1381,7 +1381,7 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
case Instruction::IntToPtr: return getIntToPtr(C, Ty);
case Instruction::BitCast: return getBitCast(C, Ty);
}
-}
+}
Constant *ConstantExpr::getZExtOrBitCast(Constant *C, Type *Ty) {
if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -1572,11 +1572,11 @@ Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy) {
Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) {
assert(CastInst::castIsValid(Instruction::BitCast, C, DstTy) &&
"Invalid constantexpr bitcast!");
-
+
// It is common to ask for a bitcast of a value to its own type, handle this
// speedily.
if (C->getType() == DstTy) return C;
-
+
return getFoldedCast(Instruction::BitCast, C, DstTy);
}
@@ -1588,7 +1588,7 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
"Invalid opcode in binary constant expression");
assert(C1->getType() == C2->getType() &&
"Operand types in binary constant expression should match");
-
+
#ifndef NDEBUG
switch (Opcode) {
case Instruction::Add:
@@ -1649,11 +1649,11 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
if (Constant *FC = ConstantFoldBinaryInstruction(Opcode, C1, C2))
return FC; // Fold a few common cases.
-
+
std::vector<Constant*> argVec(1, C1);
argVec.push_back(C2);
ExprMapKeyType Key(Opcode, argVec, 0, Flags);
-
+
LLVMContextImpl *pImpl = C1->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(C1->getType(), Key);
}
@@ -1703,7 +1703,7 @@ Constant *ConstantExpr::getOffsetOf(Type* Ty, Constant *FieldNo) {
Constant *ConstantExpr::getCompare(unsigned short Predicate,
Constant *C1, Constant *C2) {
assert(C1->getType() == C2->getType() && "Op types should be identical!");
-
+
switch (Predicate) {
default: llvm_unreachable("Invalid CmpInst predicate");
case CmpInst::FCMP_FALSE: case CmpInst::FCMP_OEQ: case CmpInst::FCMP_OGT:
@@ -1713,7 +1713,7 @@ Constant *ConstantExpr::getCompare(unsigned short Predicate,
case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: case CmpInst::FCMP_UNE:
case CmpInst::FCMP_TRUE:
return getFCmp(Predicate, C1, C2);
-
+
case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE:
case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_SLT:
@@ -1732,7 +1732,7 @@ Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2) {
argVec[1] = V1;
argVec[2] = V2;
ExprMapKeyType Key(Instruction::Select, argVec);
-
+
LLVMContextImpl *pImpl = C->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(V1->getType(), Key);
}
@@ -1747,7 +1747,7 @@ Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs,
assert(Ty && "GEP indices invalid!");
unsigned AS = C->getType()->getPointerAddressSpace();
Type *ReqTy = Ty->getPointerTo(AS);
-
+
assert(C->getType()->isPointerTy() &&
"Non-pointer type for constant GetElementPtr expression");
// Look up the constant in the table first to ensure uniqueness
@@ -1758,7 +1758,7 @@ Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs,
ArgVec.push_back(cast<Constant>(Idxs[i]));
const ExprMapKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
InBounds ? GEPOperator::IsInBounds : 0);
-
+
LLVMContextImpl *pImpl = C->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
}
@@ -1815,15 +1815,15 @@ Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx) {
"Tried to create extractelement operation on non-vector type!");
assert(Idx->getType()->isIntegerTy(32) &&
"Extractelement index must be i32 type!");
-
+
if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx))
return FC; // Fold a few common cases.
-
+
// Look up the constant in the table first to ensure uniqueness
std::vector<Constant*> ArgVec(1, Val);
ArgVec.push_back(Idx);
const ExprMapKeyType Key(Instruction::ExtractElement,ArgVec);
-
+
LLVMContextImpl *pImpl = Val->getContext().pImpl;
Type *ReqTy = Val->getType()->getVectorElementType();
return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
@@ -1845,7 +1845,7 @@ Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt,
ArgVec.push_back(Elt);
ArgVec.push_back(Idx);
const ExprMapKeyType Key(Instruction::InsertElement,ArgVec);
-
+
LLVMContextImpl *pImpl = Val->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(Val->getType(), Key);
}
@@ -1867,7 +1867,7 @@ Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2,
ArgVec.push_back(V2);
ArgVec.push_back(Mask);
const ExprMapKeyType Key(Instruction::ShuffleVector,ArgVec);
-
+
LLVMContextImpl *pImpl = ShufTy->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(ShufTy, Key);
}
@@ -1892,7 +1892,7 @@ Constant *ConstantExpr::getExtractValue(Constant *Agg,
Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs);
(void)ReqTy;
assert(ReqTy && "extractvalue indices invalid!");
-
+
assert(Agg->getType()->isFirstClassType() &&
"Non-first-class type for constant extractvalue expression");
Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs);
@@ -2007,6 +2007,47 @@ Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2, bool isExact) {
isExact ? PossiblyExactOperator::IsExact : 0);
}
+/// getBinOpIdentity - Return the identity for the given binary operation,
+/// i.e. a constant C such that X op C = X and C op X = X for every X. It
+/// returns null if the operator doesn't have an identity.
+Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) {
+ switch (Opcode) {
+ default:
+ // Doesn't have an identity.
+ return 0;
+
+ case Instruction::Add:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return Constant::getNullValue(Ty);
+
+ case Instruction::Mul:
+ return ConstantInt::get(Ty, 1);
+
+ case Instruction::And:
+ return Constant::getAllOnesValue(Ty);
+ }
+}
+
+/// getBinOpAbsorber - Return the absorbing element for the given binary
+/// operation, i.e. a constant C such that X op C = C and C op X = C for
+/// every X. For example, this returns zero for integer multiplication.
+/// It returns null if the operator doesn't have an absorbing element.
+Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) {
+ switch (Opcode) {
+ default:
+ // Doesn't have an absorber.
+ return 0;
+
+ case Instruction::Or:
+ return Constant::getAllOnesValue(Ty);
+
+ case Instruction::And:
+ case Instruction::Mul:
+ return Constant::getNullValue(Ty);
+ }
+}
+
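+
A usage sketch for the two helpers, assuming an existing LLVMContext. Note that sub, shl, and friends get null from getBinOpIdentity because the contract above demands a two-sided identity:

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    #include "llvm/Instruction.h"
    using namespace llvm;

    void identityDemo(LLVMContext &Ctx) {
      Type *I32 = Type::getInt32Ty(Ctx);

      Constant *AddId = ConstantExpr::getBinOpIdentity(Instruction::Add, I32); // i32 0
      Constant *MulId = ConstantExpr::getBinOpIdentity(Instruction::Mul, I32); // i32 1
      Constant *AndId = ConstantExpr::getBinOpIdentity(Instruction::And, I32); // i32 -1
      // Null: 0 is only a right identity for sub (X - 0 == X, but 0 - X != X).
      Constant *SubId = ConstantExpr::getBinOpIdentity(Instruction::Sub, I32);

      Constant *MulAb = ConstantExpr::getBinOpAbsorber(Instruction::Mul, I32); // i32 0
      Constant *OrAb  = ConstantExpr::getBinOpAbsorber(Instruction::Or,  I32); // i32 -1
      (void)AddId; (void)MulId; (void)AndId; (void)SubId; (void)MulAb; (void)OrAb;
    }
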
// destroyConstant - Remove the constant from the constant table...
//
void ConstantExpr::destroyConstant() {
@@ -2107,7 +2148,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) {
// Do a lookup to see if we have already formed one of these.
StringMap<ConstantDataSequential*>::MapEntryTy &Slot =
Ty->getContext().pImpl->CDSConstants.GetOrCreateValue(Elements);
-
+
// The bucket can point to a linked list of different CDS's that have the same
// body but different types. For example, 0,0,0,1 could be a 4 element array
// of i8, or a 1-element array of i32. They'll both end up in the same
@@ -2117,7 +2158,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) {
Entry = &Node->Next, Node = *Entry)
if (Node->getType() == Ty)
return Node;
-
+
// Okay, we didn't get a hit. Create a node of the right class, link it in,
// and return it.
if (isa<ArrayType>(Ty))
@@ -2131,7 +2172,7 @@ void ConstantDataSequential::destroyConstant() {
// Remove the constant from the StringMap.
StringMap<ConstantDataSequential*> &CDSConstants =
getType()->getContext().pImpl->CDSConstants;
-
+
StringMap<ConstantDataSequential*>::iterator Slot =
CDSConstants.find(getRawDataValues());
@@ -2158,11 +2199,11 @@ void ConstantDataSequential::destroyConstant() {
}
}
}
-
+
// If we were part of a list, make sure that we don't delete the list that is
// still owned by the uniquing map.
Next = 0;
-
+
// Finally, actually delete it.
destroyConstantImpl();
}
@@ -2172,27 +2213,33 @@ void ConstantDataSequential::destroyConstant() {
/// can return a ConstantAggregateZero object.
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint8_t> Elts) {
Type *Ty = ArrayType::get(Type::getInt8Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*1), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
Type *Ty = ArrayType::get(Type::getInt16Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*2), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
Type *Ty = ArrayType::get(Type::getInt32Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
Type *Ty = ArrayType::get(Type::getInt64Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<float> Elts) {
Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) {
Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
}
/// getString - This method constructs a CDS and initializes it with a text
@@ -2202,9 +2249,12 @@ Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) {
/// to disable this behavior.
Constant *ConstantDataArray::getString(LLVMContext &Context,
StringRef Str, bool AddNull) {
- if (!AddNull)
- return get(Context, ArrayRef<uint8_t>((uint8_t*)Str.data(), Str.size()));
-
+ if (!AddNull) {
+ const uint8_t *Data = reinterpret_cast<const uint8_t *>(Str.data());
+ return get(Context, ArrayRef<uint8_t>(const_cast<uint8_t *>(Data),
+ Str.size()));
+ }
+
SmallVector<uint8_t, 64> ElementVals;
ElementVals.append(Str.begin(), Str.end());
ElementVals.push_back(0);
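
A usage sketch for getString, assuming AddNull defaults to true in the era's header declaration:

    #include "llvm/Constants.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    void stringDemo(LLVMContext &Ctx) {
      // [3 x i8] c"hi\00": the terminator is appended by default.
      Constant *CStr = ConstantDataArray::getString(Ctx, "hi");

      // [2 x i8] c"hi": pass AddNull = false to keep only the raw bytes.
      Constant *Raw = ConstantDataArray::getString(Ctx, "hi", false);
      (void)CStr; (void)Raw;
    }
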
@@ -2216,27 +2266,33 @@ Constant *ConstantDataArray::getString(LLVMContext &Context,
/// can return a ConstantAggregateZero object.
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint8_t> Elts){
Type *Ty = VectorType::get(Type::getInt8Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*1), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
Type *Ty = VectorType::get(Type::getInt16Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*2), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
Type *Ty = VectorType::get(Type::getInt32Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
Type *Ty = VectorType::get(Type::getInt64Ty(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<float> Elts) {
Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) {
Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
- return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty);
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
}
Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
@@ -2281,15 +2337,19 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const {
assert(isa<IntegerType>(getElementType()) &&
"Accessor can only be used when element is an integer");
const char *EltPtr = getElementPointer(Elt);
-
+
  // The data is stored in host byte order; make sure to cast back to the right
// type to load with the right endianness.
switch (getElementType()->getIntegerBitWidth()) {
default: llvm_unreachable("Invalid bitwidth for CDS");
- case 8: return *(uint8_t*)EltPtr;
- case 16: return *(uint16_t*)EltPtr;
- case 32: return *(uint32_t*)EltPtr;
- case 64: return *(uint64_t*)EltPtr;
+ case 8:
+ return *const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(EltPtr));
+ case 16:
+ return *const_cast<uint16_t *>(reinterpret_cast<const uint16_t *>(EltPtr));
+ case 32:
+ return *const_cast<uint32_t *>(reinterpret_cast<const uint32_t *>(EltPtr));
+ case 64:
+ return *const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(EltPtr));
}
}
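
A round-trip sketch for the accessor above: the get() routines earlier in this file stored the bytes in host order, so they read back unchanged on any host (3.2-era headers assumed):

    #include <stdint.h>
    #include "llvm/Constants.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    void readBackDemo(LLVMContext &Ctx) {
      uint16_t Vals[] = {0x1234, 0xbeef};
      Constant *C = ConstantDataVector::get(Ctx, Vals);

      // Stored and re-read through the same host-byte-order view: 0x1234.
      uint64_t E0 = cast<ConstantDataSequential>(C)->getElementAsInteger(0);
      (void)E0;
    }
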
@@ -2301,8 +2361,14 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
switch (getElementType()->getTypeID()) {
default:
llvm_unreachable("Accessor can only be used when element is float/double!");
- case Type::FloatTyID: return APFloat(*(float*)EltPtr);
- case Type::DoubleTyID: return APFloat(*(double*)EltPtr);
+ case Type::FloatTyID: {
+    const float *FloatPtr = reinterpret_cast<const float *>(EltPtr);
+    return APFloat(*const_cast<float *>(FloatPtr));
+ }
+ case Type::DoubleTyID: {
+ const double *DoublePtr = reinterpret_cast<const double *>(EltPtr);
+ return APFloat(*const_cast<double *>(DoublePtr));
+ }
}
}
@@ -2311,7 +2377,8 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
float ConstantDataSequential::getElementAsFloat(unsigned Elt) const {
assert(getElementType()->isFloatTy() &&
"Accessor can only be used when element is a 'float'");
- return *(float*)getElementPointer(Elt);
+ const float *EltPtr = reinterpret_cast<const float *>(getElementPointer(Elt));
+ return *const_cast<float *>(EltPtr);
}
/// getElementAsDouble - If this is a sequential container of doubles, return
@@ -2319,7 +2386,9 @@ float ConstantDataSequential::getElementAsFloat(unsigned Elt) const {
double ConstantDataSequential::getElementAsDouble(unsigned Elt) const {
assert(getElementType()->isDoubleTy() &&
"Accessor can only be used when element is a 'float'");
- return *(double*)getElementPointer(Elt);
+ const double *EltPtr =
+ reinterpret_cast<const double *>(getElementPointer(Elt));
+ return *const_cast<double *>(EltPtr);
}
/// getElementAsConstant - Return a Constant for a specified index's element.
@@ -2328,7 +2397,7 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const {
Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
if (getElementType()->isFloatTy() || getElementType()->isDoubleTy())
return ConstantFP::get(getContext(), getElementAsAPFloat(Elt));
-
+
return ConstantInt::get(getElementType(), getElementAsInteger(Elt));
}
@@ -2342,12 +2411,12 @@ bool ConstantDataSequential::isString() const {
bool ConstantDataSequential::isCString() const {
if (!isString())
return false;
-
+
StringRef Str = getAsString();
-
+
// The last value must be nul.
if (Str.back() != 0) return false;
-
+
// Other elements must be non-nul.
return Str.drop_back().find(0) == StringRef::npos;
}
@@ -2356,13 +2425,13 @@ bool ConstantDataSequential::isCString() const {
/// elements have the same value, return that value. Otherwise return NULL.
Constant *ConstantDataVector::getSplatValue() const {
const char *Base = getRawDataValues().data();
-
+
// Compare elements 1+ to the 0'th element.
unsigned EltSize = getElementByteSize();
for (unsigned i = 1, e = getNumElements(); i != e; ++i)
if (memcmp(Base, Base+i*EltSize, EltSize))
return 0;
-
+
// If they're all the same, return the 0th one as a representative.
return getElementAsConstant(0);
}
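
Because getSplatValue() is a raw memcmp over the element bytes, the test is exact: a uniform vector yields its element, anything else yields null. A sketch with arbitrary values:

    #include <stdint.h>
    #include "llvm/Constants.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    void splatDemo(LLVMContext &Ctx) {
      uint16_t Uniform[] = {7, 7, 7, 7};
      Constant *A = ConstantDataVector::get(Ctx, Uniform);
      Constant *S = cast<ConstantDataVector>(A)->getSplatValue(); // i16 7

      uint16_t Mixed[] = {7, 7, 7, 8};
      Constant *B = ConstantDataVector::get(Ctx, Mixed);
      Constant *N = cast<ConstantDataVector>(B)->getSplatValue(); // null
      (void)S; (void)N;
    }
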
@@ -2393,10 +2462,10 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
Lookup.first = cast<ArrayType>(getType());
Values.reserve(getNumOperands()); // Build replacement array.
- // Fill values with the modified operands of the constant array. Also,
+ // Fill values with the modified operands of the constant array. Also,
// compute whether this turns into an all-zeros array.
unsigned NumUpdated = 0;
-
+
// Keep track of whether all the values in the array are "ToC".
bool AllSame = true;
for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
@@ -2408,7 +2477,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
Values.push_back(Val);
AllSame &= Val == ToC;
}
-
+
Constant *Replacement = 0;
if (AllSame && ToC->isNullValue()) {
Replacement = ConstantAggregateZero::get(getType());
@@ -2419,7 +2488,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
Lookup.second = makeArrayRef(Values);
LLVMContextImpl::ArrayConstantsTy::MapTy::iterator I =
pImpl->ArrayConstants.find(Lookup);
-
+
if (I != pImpl->ArrayConstants.map_end()) {
Replacement = I->first;
} else {
@@ -2428,7 +2497,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
// old with the new, then deleting the old... just update the current one
// in place!
pImpl->ArrayConstants.remove(this);
-
+
// Update to the new value. Optimize for the case when we have a single
// operand that we're changing, but handle bulk updates efficiently.
if (NumUpdated == 1) {
@@ -2445,13 +2514,13 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
return;
}
}
-
+
// Otherwise, I do need to replace this with an existing value.
assert(Replacement != this && "I didn't contain From!");
-
+
// Everyone using this now uses the replacement.
replaceAllUsesWith(Replacement);
-
+
// Delete the old constant!
destroyConstant();
}
@@ -2468,8 +2537,8 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
LLVMContextImpl::StructConstantsTy::LookupKey Lookup;
Lookup.first = cast<StructType>(getType());
Values.reserve(getNumOperands()); // Build replacement struct.
-
- // Fill values with the modified operands of the constant struct. Also,
+
+ // Fill values with the modified operands of the constant struct. Also,
// compute whether this turns into an all-zeros struct.
bool isAllZeros = false;
bool isAllUndef = false;
@@ -2492,9 +2561,9 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
Values.push_back(cast<Constant>(O->get()));
}
Values[OperandToUpdate] = ToC;
-
+
LLVMContextImpl *pImpl = getContext().pImpl;
-
+
Constant *Replacement = 0;
if (isAllZeros) {
Replacement = ConstantAggregateZero::get(getType());
@@ -2505,7 +2574,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
Lookup.second = makeArrayRef(Values);
LLVMContextImpl::StructConstantsTy::MapTy::iterator I =
pImpl->StructConstants.find(Lookup);
-
+
if (I != pImpl->StructConstants.map_end()) {
Replacement = I->first;
} else {
@@ -2514,19 +2583,19 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
// old with the new, then deleting the old... just update the current one
// in place!
pImpl->StructConstants.remove(this);
-
+
// Update to the new value.
setOperand(OperandToUpdate, ToC);
pImpl->StructConstants.insert(this);
return;
}
}
-
+
assert(Replacement != this && "I didn't contain From!");
-
+
// Everyone using this now uses the replacement.
replaceAllUsesWith(Replacement);
-
+
// Delete the old constant!
destroyConstant();
}
@@ -2534,7 +2603,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To,
Use *U) {
assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
-
+
SmallVector<Constant*, 8> Values;
Values.reserve(getNumOperands()); // Build replacement array...
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
@@ -2542,13 +2611,13 @@ void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To,
if (Val == From) Val = cast<Constant>(To);
Values.push_back(Val);
}
-
+
Constant *Replacement = get(Values);
assert(Replacement != this && "I didn't contain From!");
-
+
// Everyone using this now uses the replacement.
replaceAllUsesWith(Replacement);
-
+
// Delete the old constant!
destroyConstant();
}
@@ -2557,19 +2626,19 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV,
Use *U) {
assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!");
Constant *To = cast<Constant>(ToV);
-
+
SmallVector<Constant*, 8> NewOps;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
Constant *Op = getOperand(i);
NewOps.push_back(Op == From ? To : Op);
}
-
+
Constant *Replacement = getWithOperands(NewOps);
assert(Replacement != this && "I didn't contain From!");
-
+
// Everyone using this now uses the replacement.
replaceAllUsesWith(Replacement);
-
+
// Delete the old constant!
destroyConstant();
}
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
index a9cca22..972db3c 100644
--- a/lib/VMCore/Core.cpp
+++ b/lib/VMCore/Core.cpp
@@ -115,6 +115,25 @@ void LLVMDumpModule(LLVMModuleRef M) {
unwrap(M)->dump();
}
+LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
+ char **ErrorMessage) {
+ std::string error;
+ raw_fd_ostream dest(Filename, error);
+ if (!error.empty()) {
+ *ErrorMessage = strdup(error.c_str());
+ return true;
+ }
+
+ unwrap(M)->print(dest, NULL);
+
+ if (!error.empty()) {
+ *ErrorMessage = strdup(error.c_str());
+ return true;
+ }
+ dest.flush();
+ return false;
+}
+
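+
A caller-side sketch for the new C API entry point: it returns true (nonzero) on failure and hands ownership of the message to the caller, which should release it with the existing LLVMDisposeMessage; the output file name is illustrative:

    #include <stdio.h>
    #include "llvm-c/Core.h"

    static int dumpModule(LLVMModuleRef M) {
      char *Err = NULL;
      if (LLVMPrintModuleToFile(M, "out.ll", &Err)) {
        fprintf(stderr, "write failed: %s\n", Err);
        LLVMDisposeMessage(Err); // message was strdup'd for us above
        return 1;
      }
      return 0;
    }
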
/*--.. Operations on inline assembler ......................................--*/
void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) {
unwrap(M)->setModuleInlineAsm(StringRef(Asm));
@@ -1191,7 +1210,7 @@ LLVMValueRef LLVMAddGlobalInAddressSpace(LLVMModuleRef M, LLVMTypeRef Ty,
unsigned AddressSpace) {
return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false,
GlobalValue::ExternalLinkage, 0, Name, 0,
- false, AddressSpace));
+ GlobalVariable::NotThreadLocal, AddressSpace));
}
LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) {
diff --git a/lib/Analysis/DIBuilder.cpp b/lib/VMCore/DIBuilder.cpp
index 85913b1..f5894e9 100644
--- a/lib/Analysis/DIBuilder.cpp
+++ b/lib/VMCore/DIBuilder.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/DIBuilder.h"
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/DIBuilder.h"
#include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Module.h"
#include "llvm/ADT/STLExtras.h"
@@ -47,16 +47,16 @@ void DIBuilder::finalize() {
DIType(TempSubprograms).replaceAllUsesWith(SPs);
for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
DISubprogram SP(SPs.getElement(i));
+ SmallVector<Value *, 4> Variables;
if (NamedMDNode *NMD = getFnSpecificMDNode(M, SP)) {
- SmallVector<Value *, 4> Variables;
for (unsigned ii = 0, ee = NMD->getNumOperands(); ii != ee; ++ii)
Variables.push_back(NMD->getOperand(ii));
- if (MDNode *Temp = SP.getVariablesNodes()) {
- DIArray AV = getOrCreateArray(Variables);
- DIType(Temp).replaceAllUsesWith(AV);
- }
NMD->eraseFromParent();
}
+ if (MDNode *Temp = SP.getVariablesNodes()) {
+ DIArray AV = getOrCreateArray(Variables);
+ DIType(Temp).replaceAllUsesWith(AV);
+ }
}
DIArray GVs = getOrCreateArray(AllGVs);
@@ -101,7 +101,7 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_compile_unit),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
ConstantInt::get(Type::getInt32Ty(VMContext), Lang),
MDString::get(VMContext, Filename),
MDString::get(VMContext, Directory),
@@ -163,7 +163,7 @@ DIType DIBuilder::createNullPtrType(StringRef Name) {
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
- ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Encoding
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0) // Encoding
};
return DIType(MDNode::get(VMContext, Elts));
}
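
With the DWARF tag now a parameter, both C++ reference flavors share one entry point; the rvalue case is what the isDerivedType()/Verify() updates in DebugInfo.cpp below enable. A sketch, assuming an existing builder and a verified type:

    #include "llvm/DIBuilder.h"
    #include "llvm/DebugInfo.h"
    #include "llvm/Support/Dwarf.h"
    using namespace llvm;

    void referenceDemo(DIBuilder &DIB, DIType T) {
      DIType LRef = DIB.createReferenceType(dwarf::DW_TAG_reference_type, T);        // T&
      DIType RRef = DIB.createReferenceType(dwarf::DW_TAG_rvalue_reference_type, T); // T&&
      (void)LRef; (void)RRef;
    }
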
@@ -229,12 +229,13 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
return DIType(MDNode::get(VMContext, Elts));
}
-/// createReferenceType - Create debugging information entry for a reference.
-DIType DIBuilder::createReferenceType(DIType RTy) {
+/// createReferenceType - Create debugging information entry for a reference
+/// type.
+DIType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
assert(RTy.Verify() && "Unable to create reference type");
// References are encoded in DIDerivedType format.
Value *Elts[] = {
- GetTagConstant(VMContext, dwarf::DW_TAG_reference_type),
+ GetTagConstant(VMContext, Tag),
NULL, // TheCU,
NULL, // Name
NULL, // Filename
@@ -387,11 +388,11 @@ DIType DIBuilder::createObjCIVar(StringRef Name,
/// createObjCProperty - Create debugging information entry for Objective-C
/// property.
DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
- DIFile File, unsigned LineNumber,
+ DIFile File, unsigned LineNumber,
StringRef GetterName,
StringRef SetterName,
unsigned PropertyAttributes,
- DIType Ty) {
+ DIType Ty) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property),
MDString::get(VMContext, Name),
@@ -405,33 +406,6 @@ DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
return DIObjCProperty(MDNode::get(VMContext, Elts));
}
-/// createClassType - Create debugging information entry for a class.
-DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
- DIFile File, unsigned LineNumber,
- uint64_t SizeInBits, uint64_t AlignInBits,
- uint64_t OffsetInBits, unsigned Flags,
- DIType DerivedFrom, DIArray Elements,
- MDNode *VTableHolder, MDNode *TemplateParams) {
- // TAG_class_type is encoded in DICompositeType format.
- Value *Elts[] = {
- GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
- getNonCompileUnitScope(Context),
- MDString::get(VMContext, Name),
- File,
- ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
- ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
- ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
- ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits),
- ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
- DerivedFrom,
- Elements,
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- VTableHolder,
- TemplateParams
- };
- return DIType(MDNode::get(VMContext, Elts));
-}
-
/// createTemplateTypeParameter - Create debugging information for template
/// type parameter.
DITemplateTypeParameter
@@ -470,6 +444,34 @@ DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
return DITemplateValueParameter(MDNode::get(VMContext, Elts));
}
+/// createClassType - Create debugging information entry for a class.
+DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
+ DIFile File, unsigned LineNumber,
+ uint64_t SizeInBits, uint64_t AlignInBits,
+ uint64_t OffsetInBits, unsigned Flags,
+ DIType DerivedFrom, DIArray Elements,
+ MDNode *VTableHolder,
+ MDNode *TemplateParams) {
+ // TAG_class_type is encoded in DICompositeType format.
+ Value *Elts[] = {
+ GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
+ getNonCompileUnitScope(Context),
+ MDString::get(VMContext, Name),
+ File,
+ ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+ ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+ ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+ ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits),
+ ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+ DerivedFrom,
+ Elements,
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+ VTableHolder,
+ TemplateParams
+ };
+ return DIType(MDNode::get(VMContext, Elts));
+}
+
/// createStructType - Create debugging information entry for a struct.
DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name,
DIFile File, unsigned LineNumber,
@@ -490,7 +492,7 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name,
NULL,
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext))
};
return DIType(MDNode::get(VMContext, Elts));
}
@@ -515,7 +517,7 @@ DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
NULL,
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext))
};
return DIType(MDNode::get(VMContext, Elts));
}
@@ -525,9 +527,9 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
// TAG_subroutine_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
MDString::get(VMContext, ""),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
ConstantInt::get(Type::getInt64Ty(VMContext), 0),
ConstantInt::get(Type::getInt64Ty(VMContext), 0),
@@ -536,7 +538,7 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
NULL,
ParameterTypes,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext))
};
return DIType(MDNode::get(VMContext, Elts));
}
@@ -547,7 +549,8 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
DIFile File, unsigned LineNumber,
uint64_t SizeInBits,
uint64_t AlignInBits,
- DIArray Elements) {
+ DIArray Elements,
+ DIType ClassType, unsigned Flags) {
// TAG_enumeration_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
@@ -558,11 +561,11 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- NULL,
+ ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+ ClassType,
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext))
};
MDNode *Node = MDNode::get(VMContext, Elts);
AllEnumTypes.push_back(Node);
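
A call sketch for the widened signature; the scope, file, and enumerator array are assumed to already exist, and passing a null DIType plus zero flags reproduces the old behavior (the NULL element and hard-coded 0 this hunk replaces):

    #include "llvm/DIBuilder.h"
    #include "llvm/DebugInfo.h"
    using namespace llvm;

    DIType makeEnum(DIBuilder &DIB, DIDescriptor Scope, DIFile File,
                    DIArray Enumerators) {
      return DIB.createEnumerationType(Scope, "Color", File, /*LineNumber=*/1,
                                       /*SizeInBits=*/32, /*AlignInBits=*/32,
                                       Enumerators, /*ClassType=*/DIType(),
                                       /*Flags=*/0);
    }
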
@@ -586,7 +589,7 @@ DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
Ty,
Subscripts,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext))
};
return DIType(MDNode::get(VMContext, Elts));
}
@@ -608,7 +611,7 @@ DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
Ty,
Subscripts,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext))
};
return DIType(MDNode::get(VMContext, Elts));
}
@@ -677,12 +680,13 @@ DIType DIBuilder::createTemporaryType(DIFile F) {
/// createForwardDecl - Create a temporary forward-declared type that
/// can be RAUW'd if the full type is seen.
-DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIFile F,
+DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name,
+ DIDescriptor Scope, DIFile F,
unsigned Line, unsigned RuntimeLang) {
// Create a temporary MDNode.
Value *Elts[] = {
GetTagConstant(VMContext, Tag),
- NULL, // TheCU
+ getNonCompileUnitScope(Scope),
MDString::get(VMContext, Name),
F,
ConstantInt::get(Type::getInt32Ty(VMContext), Line),
@@ -703,7 +707,7 @@ DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIFile F,
/// getOrCreateArray - Get a DIArray, create one if required.
DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) {
if (Elements.empty()) {
- Value *Null = llvm::Constant::getNullValue(Type::getInt32Ty(VMContext));
+ Value *Null = Constant::getNullValue(Type::getInt32Ty(VMContext));
return DIArray(MDNode::get(VMContext, Null));
}
return DIArray(MDNode::get(VMContext, Elements));
@@ -724,10 +728,10 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Hi) {
/// createGlobalVariable - Create a new descriptor for the specified global.
DIGlobalVariable DIBuilder::
createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
- DIType Ty, bool isLocalToUnit, llvm::Value *Val) {
+ DIType Ty, bool isLocalToUnit, Value *Val) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_variable),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
NULL, // TheCU,
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
@@ -749,10 +753,10 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
DIGlobalVariable DIBuilder::
createStaticVariable(DIDescriptor Context, StringRef Name,
StringRef LinkageName, DIFile F, unsigned LineNumber,
- DIType Ty, bool isLocalToUnit, llvm::Value *Val) {
+ DIType Ty, bool isLocalToUnit, Value *Val) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_variable),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
@@ -783,7 +787,7 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))),
Ty,
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
- Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext))
};
MDNode *Node = MDNode::get(VMContext, Elts);
if (AlwaysPreserve) {
@@ -812,8 +816,8 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext),
(LineNo | (ArgNo << 24))));
Elts.push_back(Ty);
- Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)));
- Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)));
+ Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
+ Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
Elts.append(Addr.begin(), Addr.end());
return DIVariable(MDNode::get(VMContext, Elts));
@@ -838,7 +842,7 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
@@ -887,7 +891,7 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
@@ -904,9 +908,9 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
Fn,
TParam,
- llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
THolder,
- // FIXME: Do we want to use a different scope lines?
+ // FIXME: Do we want to use different scope/lines?
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
};
MDNode *Node = MDNode::get(VMContext, Elts);
diff --git a/lib/Analysis/DebugInfo.cpp b/lib/VMCore/DebugInfo.cpp
index f61a8f3..c8f8f7d 100644
--- a/lib/Analysis/DebugInfo.cpp
+++ b/lib/VMCore/DebugInfo.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Intrinsics.h"
@@ -112,16 +112,16 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const {
}
unsigned DIVariable::getNumAddrElements() const {
- if (getVersion() <= llvm::LLVMDebugVersion8)
+ if (getVersion() <= LLVMDebugVersion8)
return DbgNode->getNumOperands()-6;
- if (getVersion() == llvm::LLVMDebugVersion9)
+ if (getVersion() == LLVMDebugVersion9)
return DbgNode->getNumOperands()-7;
return DbgNode->getNumOperands()-8;
}
/// getInlinedAt - If this variable is inlined then return inline location.
MDNode *DIVariable::getInlinedAt() const {
- if (getVersion() <= llvm::LLVMDebugVersion9)
+ if (getVersion() <= LLVMDebugVersion9)
return NULL;
return dyn_cast_or_null<MDNode>(DbgNode->getOperand(7));
}
@@ -150,6 +150,7 @@ bool DIDescriptor::isDerivedType() const {
case dwarf::DW_TAG_typedef:
case dwarf::DW_TAG_pointer_type:
case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_rvalue_reference_type:
case dwarf::DW_TAG_const_type:
case dwarf::DW_TAG_volatile_type:
case dwarf::DW_TAG_restrict_type:
@@ -399,11 +400,13 @@ bool DIType::Verify() const {
unsigned Tag = getTag();
if (!isBasicType() && Tag != dwarf::DW_TAG_const_type &&
Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_pointer_type &&
- Tag != dwarf::DW_TAG_reference_type && Tag != dwarf::DW_TAG_restrict_type
- && Tag != dwarf::DW_TAG_vector_type && Tag != dwarf::DW_TAG_array_type
- && Tag != dwarf::DW_TAG_enumeration_type
- && Tag != dwarf::DW_TAG_subroutine_type
- && getFilename().empty())
+ Tag != dwarf::DW_TAG_reference_type &&
+ Tag != dwarf::DW_TAG_rvalue_reference_type &&
+ Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_vector_type &&
+ Tag != dwarf::DW_TAG_array_type &&
+ Tag != dwarf::DW_TAG_enumeration_type &&
+ Tag != dwarf::DW_TAG_subroutine_type &&
+ getFilename().empty())
return false;
return true;
}
@@ -500,27 +503,28 @@ bool DINameSpace::Verify() const {
uint64_t DIDerivedType::getOriginalTypeSize() const {
unsigned Tag = getTag();
- if (Tag == dwarf::DW_TAG_member || Tag == dwarf::DW_TAG_typedef ||
- Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type ||
- Tag == dwarf::DW_TAG_restrict_type) {
- DIType BaseType = getTypeDerivedFrom();
- // If this type is not derived from any type then take conservative
- // approach.
- if (!BaseType.isValid())
- return getSizeInBits();
- // If this is a derived type, go ahead and get the base type, unless
- // it's a reference then it's just the size of the field. Pointer types
- // have no need of this since they're a different type of qualification
- // on the type.
- if (BaseType.getTag() == dwarf::DW_TAG_reference_type)
- return getSizeInBits();
- else if (BaseType.isDerivedType())
- return DIDerivedType(BaseType).getOriginalTypeSize();
- else
- return BaseType.getSizeInBits();
- }
+ if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+ Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+ Tag != dwarf::DW_TAG_restrict_type)
+ return getSizeInBits();
+
+ DIType BaseType = getTypeDerivedFrom();
+
+  // If this type is not derived from any type then take a conservative approach.
+ if (!BaseType.isValid())
+ return getSizeInBits();
+
+  // If this is a derived type, go ahead and get the base type, unless it's a
+  // reference, in which case it's just the size of the field. Pointer types
+  // have no need of this since they're a different kind of qualification on
+  // the type.
+ if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
+ BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
+ return getSizeInBits();
+
+ if (BaseType.isDerivedType())
+ return DIDerivedType(BaseType).getOriginalTypeSize();
- return getSizeInBits();
+ return BaseType.getSizeInBits();
}
/// getObjCProperty - Return property node, if this ivar is associated with one.
@@ -538,7 +542,7 @@ bool DIVariable::isInlinedFnArgument(const Function *CurFn) {
return false;
  // This variable is not an inlined function argument if its scope
  // does not describe the current function.
- return !(DISubprogram(getContext()).describes(CurFn));
+ return !DISubprogram(getContext()).describes(CurFn);
}
/// describes - Return true if this subprogram provides debugging
@@ -660,257 +664,6 @@ DIArray DICompileUnit::getGlobalVariables() const {
return DIArray();
}
-//===----------------------------------------------------------------------===//
-// DIDescriptor: vtable anchors for all descriptors.
-//===----------------------------------------------------------------------===//
-
-void DIScope::anchor() { }
-
-void DICompileUnit::anchor() { }
-
-void DIFile::anchor() { }
-
-void DIType::anchor() { }
-
-void DIBasicType::anchor() { }
-
-void DIDerivedType::anchor() { }
-
-void DICompositeType::anchor() { }
-
-void DISubprogram::anchor() { }
-
-void DILexicalBlock::anchor() { }
-
-void DINameSpace::anchor() { }
-
-void DILexicalBlockFile::anchor() { }
-
-//===----------------------------------------------------------------------===//
-// DIDescriptor: dump routines for all descriptors.
-//===----------------------------------------------------------------------===//
-
-
-/// print - Print descriptor.
-void DIDescriptor::print(raw_ostream &OS) const {
- OS << "[" << dwarf::TagString(getTag()) << "] ";
- OS.write_hex((intptr_t) &*DbgNode) << ']';
-}
-
-/// print - Print compile unit.
-void DICompileUnit::print(raw_ostream &OS) const {
- if (getLanguage())
- OS << " [" << dwarf::LanguageString(getLanguage()) << "] ";
-
- OS << " [" << getDirectory() << "/" << getFilename() << "]";
-}
-
-/// print - Print type.
-void DIType::print(raw_ostream &OS) const {
- if (!DbgNode) return;
-
- StringRef Res = getName();
- if (!Res.empty())
- OS << " [" << Res << "] ";
-
- unsigned Tag = getTag();
- OS << " [" << dwarf::TagString(Tag) << "] ";
-
- // TODO : Print context
- OS << " ["
- << "line " << getLineNumber() << ", "
- << getSizeInBits() << " bits, "
- << getAlignInBits() << " bit alignment, "
- << getOffsetInBits() << " bit offset"
- << "] ";
-
- if (isPrivate())
- OS << " [private] ";
- else if (isProtected())
- OS << " [protected] ";
-
- if (isForwardDecl())
- OS << " [fwd] ";
-
- if (isBasicType())
- DIBasicType(DbgNode).print(OS);
- else if (isDerivedType()) {
- DIDerivedType DTy = DIDerivedType(DbgNode);
- DTy.print(OS);
- DICompositeType CTy = getDICompositeType(DTy);
- if (CTy.Verify())
- CTy.print(OS);
- }
- else if (isCompositeType())
- DICompositeType(DbgNode).print(OS);
- else {
- OS << "Invalid DIType\n";
- return;
- }
-
- OS << "\n";
-}
-
-/// print - Print basic type.
-void DIBasicType::print(raw_ostream &OS) const {
- OS << " [" << dwarf::AttributeEncodingString(getEncoding()) << "] ";
-}
-
-/// print - Print derived type.
-void DIDerivedType::print(raw_ostream &OS) const {
- OS << "\n\t Derived From: ";
- getTypeDerivedFrom().print(OS);
- OS << "\n\t";
-}
-
-/// print - Print composite type.
-void DICompositeType::print(raw_ostream &OS) const {
- DIArray A = getTypeArray();
- OS << " [" << A.getNumElements() << " elements]";
-}
-
-/// print - Print subprogram.
-void DISubprogram::print(raw_ostream &OS) const {
- StringRef Res = getName();
- if (!Res.empty())
- OS << " [" << Res << "] ";
-
- unsigned Tag = getTag();
- OS << " [" << dwarf::TagString(Tag) << "] ";
-
- // TODO : Print context
- OS << " [" << getLineNumber() << "] ";
-
- if (isLocalToUnit())
- OS << " [local] ";
-
- if (isDefinition())
- OS << " [def] ";
-
- if (getScopeLineNumber() != getLineNumber())
- OS << " [Scope: " << getScopeLineNumber() << "] ";
-
- OS << "\n";
-}
-
-/// print - Print global variable.
-void DIGlobalVariable::print(raw_ostream &OS) const {
- OS << " [";
- StringRef Res = getName();
- if (!Res.empty())
- OS << " [" << Res << "] ";
-
- unsigned Tag = getTag();
- OS << " [" << dwarf::TagString(Tag) << "] ";
-
- // TODO : Print context
- OS << " [" << getLineNumber() << "] ";
-
- if (isLocalToUnit())
- OS << " [local] ";
-
- if (isDefinition())
- OS << " [def] ";
-
- if (isGlobalVariable())
- DIGlobalVariable(DbgNode).print(OS);
- OS << "]\n";
-}
-
-static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
- const LLVMContext &Ctx) {
- if (!DL.isUnknown()) { // Print source line info.
- DIScope Scope(DL.getScope(Ctx));
- // Omit the directory, because it's likely to be long and uninteresting.
- if (Scope.Verify())
- CommentOS << Scope.getFilename();
- else
- CommentOS << "<unknown>";
- CommentOS << ':' << DL.getLine();
- if (DL.getCol() != 0)
- CommentOS << ':' << DL.getCol();
- DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx));
- if (!InlinedAtDL.isUnknown()) {
- CommentOS << " @[ ";
- printDebugLoc(InlinedAtDL, CommentOS, Ctx);
- CommentOS << " ]";
- }
- }
-}
-
-void DIVariable::printExtendedName(raw_ostream &OS) const {
- const LLVMContext &Ctx = DbgNode->getContext();
- StringRef Res = getName();
- if (!Res.empty())
- OS << Res << "," << getLineNumber();
- if (MDNode *InlinedAt = getInlinedAt()) {
- DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt);
- if (!InlinedAtDL.isUnknown()) {
- OS << " @[";
- printDebugLoc(InlinedAtDL, OS, Ctx);
- OS << "]";
- }
- }
-}
-
-/// print - Print variable.
-void DIVariable::print(raw_ostream &OS) const {
- StringRef Res = getName();
- if (!Res.empty())
- OS << " [" << Res << "] ";
-
- OS << " [" << getLineNumber() << "] ";
- getType().print(OS);
- OS << "\n";
-
- // FIXME: Dump complex addresses
-}
-
-/// dump - Print descriptor to dbgs() with a newline.
-void DIDescriptor::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print compile unit to dbgs() with a newline.
-void DICompileUnit::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print type to dbgs() with a newline.
-void DIType::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print basic type to dbgs() with a newline.
-void DIBasicType::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print derived type to dbgs() with a newline.
-void DIDerivedType::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print composite type to dbgs() with a newline.
-void DICompositeType::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print subprogram to dbgs() with a newline.
-void DISubprogram::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print global variable.
-void DIGlobalVariable::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
-/// dump - Print variable.
-void DIVariable::dump() const {
- print(dbgs()); dbgs() << '\n';
-}
-
/// fixupObjcLikeName - Replace the special characters used in typical
/// Objective-C names with '.' in the given string.
static void fixupObjcLikeName(StringRef Str, SmallVectorImpl<char> &Out) {
@@ -981,11 +734,50 @@ DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
// Insert inlined scope as 7th element.
for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
i == 7 ?
- Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))):
+ Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))):
Elts.push_back(DV->getOperand(i));
return DIVariable(MDNode::get(VMContext, Elts));
}
+/// getDISubprogram - Find subprogram that is enclosing this scope.
+DISubprogram llvm::getDISubprogram(const MDNode *Scope) {
+ DIDescriptor D(Scope);
+ if (D.isSubprogram())
+ return DISubprogram(Scope);
+
+ if (D.isLexicalBlockFile())
+ return getDISubprogram(DILexicalBlockFile(Scope).getContext());
+
+ if (D.isLexicalBlock())
+ return getDISubprogram(DILexicalBlock(Scope).getContext());
+
+ return DISubprogram();
+}
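
The hoisted getDISubprogram is a recursive walk up the scope chain. A minimal standalone mimic, with three scope kinds standing in for the MDNode-backed descriptors:

#include <cassert>

enum Kind { Subprogram, LexicalBlock, Other };

struct Scope {
  Kind K;
  const Scope *Context; // enclosing scope for lexical blocks
};

// Walk outward until a subprogram is reached; any other kind ends the walk.
const Scope *enclosingSubprogram(const Scope *S) {
  if (!S)
    return nullptr;
  if (S->K == Subprogram)
    return S;
  if (S->K == LexicalBlock)
    return enclosingSubprogram(S->Context);
  return nullptr;
}

int main() {
  Scope SP{Subprogram, nullptr};
  Scope Inner{LexicalBlock, &SP};
  Scope Nested{LexicalBlock, &Inner};
  assert(enclosingSubprogram(&Nested) == &SP);
}
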
+
+/// getDICompositeType - Find underlying composite type.
+DICompositeType llvm::getDICompositeType(DIType T) {
+ if (T.isCompositeType())
+ return DICompositeType(T);
+
+ if (T.isDerivedType())
+ return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom());
+
+ return DICompositeType();
+}
+
+/// isSubprogramContext - Return true if Context is either a subprogram
+/// or another context nested inside a subprogram.
+bool llvm::isSubprogramContext(const MDNode *Context) {
+ if (!Context)
+ return false;
+ DIDescriptor D(Context);
+ if (D.isSubprogram())
+ return true;
+ if (D.isType())
+ return isSubprogramContext(DIType(Context).getContext());
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// DebugInfoFinder implementations.
//===----------------------------------------------------------------------===//
@@ -1188,42 +980,189 @@ bool DebugInfoFinder::addSubprogram(DISubprogram SP) {
return true;
}
-/// getDISubprogram - Find subprogram that is enclosing this scope.
-DISubprogram llvm::getDISubprogram(const MDNode *Scope) {
- DIDescriptor D(Scope);
- if (D.isSubprogram())
- return DISubprogram(Scope);
+//===----------------------------------------------------------------------===//
+// DIDescriptor: dump routines for all descriptors.
+//===----------------------------------------------------------------------===//
- if (D.isLexicalBlockFile())
- return getDISubprogram(DILexicalBlockFile(Scope).getContext());
-
- if (D.isLexicalBlock())
- return getDISubprogram(DILexicalBlock(Scope).getContext());
+/// dump - Print descriptor to dbgs() with a newline.
+void DIDescriptor::dump() const {
+ print(dbgs()); dbgs() << '\n';
+}
- return DISubprogram();
+/// print - Print descriptor.
+void DIDescriptor::print(raw_ostream &OS) const {
+ if (!DbgNode) return;
+
+ if (const char *Tag = dwarf::TagString(getTag()))
+ OS << "[ " << Tag << " ]";
+
+ if (this->isSubrange()) {
+ DISubrange(DbgNode).printInternal(OS);
+ } else if (this->isCompileUnit()) {
+ DICompileUnit(DbgNode).printInternal(OS);
+ } else if (this->isFile()) {
+ DIFile(DbgNode).printInternal(OS);
+ } else if (this->isEnumerator()) {
+ DIEnumerator(DbgNode).printInternal(OS);
+ } else if (this->isBasicType()) {
+ DIType(DbgNode).printInternal(OS);
+ } else if (this->isDerivedType()) {
+ DIDerivedType(DbgNode).printInternal(OS);
+ } else if (this->isCompositeType()) {
+ DICompositeType(DbgNode).printInternal(OS);
+ } else if (this->isSubprogram()) {
+ DISubprogram(DbgNode).printInternal(OS);
+ } else if (this->isGlobalVariable()) {
+ DIGlobalVariable(DbgNode).printInternal(OS);
+ } else if (this->isVariable()) {
+ DIVariable(DbgNode).printInternal(OS);
+ } else if (this->isObjCProperty()) {
+ DIObjCProperty(DbgNode).printInternal(OS);
+ } else if (this->isScope()) {
+ DIScope(DbgNode).printInternal(OS);
+ }
}
-/// getDICompositeType - Find underlying composite type.
-DICompositeType llvm::getDICompositeType(DIType T) {
- if (T.isCompositeType())
- return DICompositeType(T);
+void DISubrange::printInternal(raw_ostream &OS) const {
+ OS << " [" << getLo() << ", " << getHi() << ']';
+}
- if (T.isDerivedType())
- return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom());
+void DIScope::printInternal(raw_ostream &OS) const {
+ OS << " [" << getDirectory() << "/" << getFilename() << ']';
+}
- return DICompositeType();
+void DICompileUnit::printInternal(raw_ostream &OS) const {
+ DIScope::printInternal(OS);
+ if (unsigned Lang = getLanguage())
+ OS << " [" << dwarf::LanguageString(Lang) << ']';
}
-/// isSubprogramContext - Return true if Context is either a subprogram
-/// or another context nested inside a subprogram.
-bool llvm::isSubprogramContext(const MDNode *Context) {
- if (!Context)
- return false;
- DIDescriptor D(Context);
- if (D.isSubprogram())
- return true;
- if (D.isType())
- return isSubprogramContext(DIType(Context).getContext());
- return false;
+void DIEnumerator::printInternal(raw_ostream &OS) const {
+ OS << " [" << getName() << " :: " << getEnumValue() << ']';
+}
+
+void DIType::printInternal(raw_ostream &OS) const {
+ if (!DbgNode) return;
+
+ StringRef Res = getName();
+ if (!Res.empty())
+ OS << " [" << Res << "]";
+
+ // TODO: Print context?
+
+ OS << " [line " << getLineNumber()
+ << ", size " << getSizeInBits()
+ << ", align " << getAlignInBits()
+ << ", offset " << getOffsetInBits();
+ if (isBasicType())
+ if (const char *Enc =
+ dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
+ OS << ", enc " << Enc;
+ OS << "]";
+
+ if (isPrivate())
+ OS << " [private]";
+ else if (isProtected())
+ OS << " [protected]";
+
+ if (isForwardDecl())
+ OS << " [fwd]";
+}
+
+void DIDerivedType::printInternal(raw_ostream &OS) const {
+ DIType::printInternal(OS);
+ OS << " [from " << getTypeDerivedFrom().getName() << ']';
+}
+
+void DICompositeType::printInternal(raw_ostream &OS) const {
+ DIType::printInternal(OS);
+ DIArray A = getTypeArray();
+ OS << " [" << A.getNumElements() << " elements]";
+}
+
+void DISubprogram::printInternal(raw_ostream &OS) const {
+ // TODO : Print context
+ OS << " [line " << getLineNumber() << ']';
+
+ if (isLocalToUnit())
+ OS << " [local]";
+
+ if (isDefinition())
+ OS << " [def]";
+
+ if (getScopeLineNumber() != getLineNumber())
+ OS << " [scope " << getScopeLineNumber() << "]";
+
+ StringRef Res = getName();
+ if (!Res.empty())
+ OS << " [" << Res << ']';
+}
+
+void DIGlobalVariable::printInternal(raw_ostream &OS) const {
+ StringRef Res = getName();
+ if (!Res.empty())
+ OS << " [" << Res << ']';
+
+ OS << " [line " << getLineNumber() << ']';
+
+ // TODO : Print context
+
+ if (isLocalToUnit())
+ OS << " [local]";
+
+ if (isDefinition())
+ OS << " [def]";
+}
+
+void DIVariable::printInternal(raw_ostream &OS) const {
+ StringRef Res = getName();
+ if (!Res.empty())
+ OS << " [" << Res << ']';
+
+ OS << " [line " << getLineNumber() << ']';
+}
+
+void DIObjCProperty::printInternal(raw_ostream &OS) const {
+ StringRef Name = getObjCPropertyName();
+ if (!Name.empty())
+ OS << " [" << Name << ']';
+
+ OS << " [line " << getLineNumber()
+ << ", properties " << getUnsignedField(6) << ']';
}
+static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
+ const LLVMContext &Ctx) {
+ if (!DL.isUnknown()) { // Print source line info.
+ DIScope Scope(DL.getScope(Ctx));
+ // Omit the directory, because it's likely to be long and uninteresting.
+ if (Scope.Verify())
+ CommentOS << Scope.getFilename();
+ else
+ CommentOS << "<unknown>";
+ CommentOS << ':' << DL.getLine();
+ if (DL.getCol() != 0)
+ CommentOS << ':' << DL.getCol();
+ DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx));
+ if (!InlinedAtDL.isUnknown()) {
+ CommentOS << " @[ ";
+ printDebugLoc(InlinedAtDL, CommentOS, Ctx);
+ CommentOS << " ]";
+ }
+ }
+}
+
+void DIVariable::printExtendedName(raw_ostream &OS) const {
+ const LLVMContext &Ctx = DbgNode->getContext();
+ StringRef Res = getName();
+ if (!Res.empty())
+ OS << Res << "," << getLineNumber();
+ if (MDNode *InlinedAt = getInlinedAt()) {
+ DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt);
+ if (!InlinedAtDL.isUnknown()) {
+ OS << " @[";
+ printDebugLoc(InlinedAtDL, OS, Ctx);
+ OS << "]";
+ }
+ }
+}
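
The relocated printDebugLoc/printExtendedName pair renders an inlining chain by recursing through the inlined-at links. A self-contained sketch of the same shape, with a plain struct standing in for DebugLoc and DIScope:

#include <iostream>
#include <string>

struct Loc {
  std::string File;
  unsigned Line, Col;
  const Loc *InlinedAt; // where this location was inlined, if anywhere
};

void printLoc(const Loc &L, std::ostream &OS) {
  OS << L.File << ':' << L.Line;
  if (L.Col != 0)
    OS << ':' << L.Col;
  if (L.InlinedAt) { // recurse through the inlining chain
    OS << " @[ ";
    printLoc(*L.InlinedAt, OS);
    OS << " ]";
  }
}

int main() {
  Loc Caller{"main.c", 12, 3, nullptr};
  Loc Inlined{"util.h", 40, 5, &Caller};
  printLoc(Inlined, std::cout); // prints: util.h:40:5 @[ main.c:12:3 ]
  std::cout << '\n';
}
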
diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp
index 9013d28..c6a3053 100644
--- a/lib/VMCore/DebugLoc.cpp
+++ b/lib/VMCore/DebugLoc.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/DebugLoc.h"
+#include "llvm/DebugInfo.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "LLVMContextImpl.h"
using namespace llvm;
@@ -114,34 +115,19 @@ MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const {
/// getFromDILocation - Translate the DILocation quad into a DebugLoc.
DebugLoc DebugLoc::getFromDILocation(MDNode *N) {
- if (N == 0 || N->getNumOperands() != 4) return DebugLoc();
-
- MDNode *Scope = dyn_cast_or_null<MDNode>(N->getOperand(2));
+ DILocation Loc(N);
+ MDNode *Scope = Loc.getScope();
if (Scope == 0) return DebugLoc();
-
- unsigned LineNo = 0, ColNo = 0;
- if (ConstantInt *Line = dyn_cast_or_null<ConstantInt>(N->getOperand(0)))
- LineNo = Line->getZExtValue();
- if (ConstantInt *Col = dyn_cast_or_null<ConstantInt>(N->getOperand(1)))
- ColNo = Col->getZExtValue();
-
- return get(LineNo, ColNo, Scope, dyn_cast_or_null<MDNode>(N->getOperand(3)));
+ return get(Loc.getLineNumber(), Loc.getColumnNumber(), Scope,
+ Loc.getOrigLocation());
}
/// getFromDILexicalBlock - Translate the DILexicalBlock into a DebugLoc.
DebugLoc DebugLoc::getFromDILexicalBlock(MDNode *N) {
- if (N == 0 || N->getNumOperands() < 3) return DebugLoc();
-
- MDNode *Scope = dyn_cast_or_null<MDNode>(N->getOperand(1));
+ DILexicalBlock LexBlock(N);
+ MDNode *Scope = LexBlock.getContext();
if (Scope == 0) return DebugLoc();
-
- unsigned LineNo = 0, ColNo = 0;
- if (ConstantInt *Line = dyn_cast_or_null<ConstantInt>(N->getOperand(2)))
- LineNo = Line->getZExtValue();
- if (ConstantInt *Col = dyn_cast_or_null<ConstantInt>(N->getOperand(3)))
- ColNo = Col->getZExtValue();
-
- return get(LineNo, ColNo, Scope, NULL);
+ return get(LexBlock.getLineNumber(), LexBlock.getColumnNumber(), Scope, NULL);
}
void DebugLoc::dump(const LLVMContext &Ctx) const {
@@ -164,22 +150,10 @@ void DebugLoc::dump(const LLVMContext &Ctx) const {
// DenseMap specialization
//===----------------------------------------------------------------------===//
-DebugLoc DenseMapInfo<DebugLoc>::getEmptyKey() {
- return DebugLoc::getEmptyKey();
-}
-
-DebugLoc DenseMapInfo<DebugLoc>::getTombstoneKey() {
- return DebugLoc::getTombstoneKey();
-}
-
unsigned DenseMapInfo<DebugLoc>::getHashValue(const DebugLoc &Key) {
return static_cast<unsigned>(hash_combine(Key.LineCol, Key.ScopeIdx));
}
-bool DenseMapInfo<DebugLoc>::isEqual(const DebugLoc &LHS, const DebugLoc &RHS) {
- return LHS == RHS;
-}
-
//===----------------------------------------------------------------------===//
// LLVMContextImpl Implementation
//===----------------------------------------------------------------------===//
diff --git a/lib/VMCore/Dominators.cpp b/lib/VMCore/Dominators.cpp
index 219e631..682d928 100644
--- a/lib/VMCore/Dominators.cpp
+++ b/lib/VMCore/Dominators.cpp
@@ -39,6 +39,22 @@ static cl::opt<bool,true>
VerifyDomInfoX("verify-dom-info", cl::location(VerifyDomInfo),
cl::desc("Verify dominator info (time consuming)"));
+namespace llvm {
+ class BasicBlockEdge {
+ const BasicBlock *Start;
+ const BasicBlock *End;
+ public:
+ BasicBlockEdge(const BasicBlock *Start_, const BasicBlock *End_) :
+ Start(Start_), End(End_) { }
+ const BasicBlock *getStart() const {
+ return Start;
+ }
+ const BasicBlock *getEnd() const {
+ return End;
+ }
+ };
+}
+
//===----------------------------------------------------------------------===//
// DominatorTree Implementation
//===----------------------------------------------------------------------===//
@@ -142,12 +158,22 @@ bool DominatorTree::dominates(const Instruction *Def,
// Invoke results are only usable in the normal destination, not in the
// exceptional destination.
BasicBlock *NormalDest = II->getNormalDest();
- if (!dominates(NormalDest, UseBB))
+ BasicBlockEdge E(DefBB, NormalDest);
+ return dominates(E, UseBB);
+}
+
+bool DominatorTree::dominates(const BasicBlockEdge &BBE,
+ const BasicBlock *UseBB) const {
+ // If the BB the edge ends in doesn't dominate the use BB, then the
+ // edge also doesn't.
+ const BasicBlock *Start = BBE.getStart();
+ const BasicBlock *End = BBE.getEnd();
+ if (!dominates(End, UseBB))
return false;
- // Simple case: if the normal destination has a single predecessor, the
- // fact that it dominates the use block implies that we also do.
- if (NormalDest->getSinglePredecessor())
+ // Simple case: if the end BB has a single predecessor, the fact that it
+ // dominates the use block implies that the edge also does.
+ if (End->getSinglePredecessor())
return true;
// The normal edge from the invoke is critical. Conceptually, what we would
@@ -170,29 +196,40 @@ bool DominatorTree::dominates(const Instruction *Def,
// trivially dominates itself, so we only have to find if it dominates the
// other predecessors. Since the only way out of X is via NormalDest, X can
// only properly dominate a node if NormalDest dominates that node too.
- for (pred_iterator PI = pred_begin(NormalDest),
- E = pred_end(NormalDest); PI != E; ++PI) {
+ for (const_pred_iterator PI = pred_begin(End), E = pred_end(End);
+ PI != E; ++PI) {
const BasicBlock *BB = *PI;
- if (BB == DefBB)
+ if (BB == Start)
continue;
- if (!DT->isReachableFromEntry(BB))
- continue;
-
- if (!dominates(NormalDest, BB))
+ if (!dominates(End, BB))
return false;
}
return true;
}
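
The new BasicBlockEdge overload generalizes the old invoke-only check: an edge (Start, End) dominates a block when End dominates it and every other predecessor of End is itself dominated by End. A standalone sketch of just that rule, with block dominance supplied by the caller as a toy stand-in for DominatorTree:

#include <cassert>
#include <functional>
#include <string>
#include <vector>

struct Block {
  std::string Name;
  std::vector<const Block *> Preds;
};

// Edge (Start -> End) dominates Use iff End dominates Use and no path can
// reach End except through the edge or back through End itself.
bool edgeDominates(const Block *Start, const Block *End, const Block *Use,
                   const std::function<bool(const Block *, const Block *)> &dom) {
  if (!dom(End, Use))
    return false;
  if (End->Preds.size() == 1) // single predecessor: the edge is the only way in
    return true;
  for (const Block *P : End->Preds)
    if (P != Start && !dom(End, P))
      return false;
  return true;
}

int main() {
  Block Entry{"entry", {}};
  Block Normal{"normal", {&Entry}};        // sole predecessor: entry
  Block Merge{"merge", {&Entry, &Normal}}; // reachable two ways
  auto dom = [](const Block *A, const Block *B) {
    return A == B || A->Name == "entry";   // entry dominates everything here
  };
  assert(edgeDominates(&Entry, &Normal, &Normal, dom));
  // merge's other predecessor (normal) is not dominated by merge, so the
  // edge entry->merge does not dominate uses in merge.
  assert(!edgeDominates(&Entry, &Merge, &Merge, dom));
}
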
-bool DominatorTree::dominates(const Instruction *Def,
+bool DominatorTree::dominates(const BasicBlockEdge &BBE,
const Use &U) const {
- Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ // A PHI in the end of the edge is dominated by it.
+ PHINode *PN = dyn_cast<PHINode>(UserInst);
+ if (PN && PN->getParent() == BBE.getEnd() &&
+ PN->getIncomingBlock(U) == BBE.getStart())
+ return true;
- // Instructions do not dominate non-instructions.
- if (!UserInst)
- return false;
+ // Otherwise use the edge-dominates-block query, which
+ // handles the crazy critical edge cases properly.
+ const BasicBlock *UseBB;
+ if (PN)
+ UseBB = PN->getIncomingBlock(U);
+ else
+ UseBB = UserInst->getParent();
+ return dominates(BBE, UseBB);
+}
+bool DominatorTree::dominates(const Instruction *Def,
+ const Use &U) const {
+ Instruction *UserInst = cast<Instruction>(U.getUser());
const BasicBlock *DefBB = Def->getParent();
// Determine the block in which the use happens. PHI nodes use
@@ -218,17 +255,9 @@ bool DominatorTree::dominates(const Instruction *Def,
// their own block, except possibly a phi, so we don't need to
// walk the block in any case.
if (const InvokeInst *II = dyn_cast<InvokeInst>(Def)) {
- // A PHI in the normal successor using the invoke's return value is
- // dominated by the invoke's return value.
- if (isa<PHINode>(UserInst) &&
- UserInst->getParent() == II->getNormalDest() &&
- cast<PHINode>(UserInst)->getIncomingBlock(U) == DefBB)
- return true;
-
- // Otherwise use the instruction-dominates-block query, which
- // handles the crazy case of an invoke with a critical edge
- // properly.
- return dominates(Def, UseBB);
+ BasicBlock *NormalDest = II->getNormalDest();
+ BasicBlockEdge E(DefBB, NormalDest);
+ return dominates(E, U);
}
// If the def and use are in different blocks, do a simple CFG dominator
diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp
index af6344e..2e0b316 100644
--- a/lib/VMCore/Function.cpp
+++ b/lib/VMCore/Function.cpp
@@ -29,7 +29,6 @@
#include "llvm/ADT/StringExtras.h"
using namespace llvm;
-
// Explicit instantiations of SymbolTableListTraits since some of the methods
// are not in the public header file...
template class llvm::SymbolTableListTraits<Argument, Function>;
@@ -358,17 +357,239 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
return Result;
}
-FunctionType *Intrinsic::getType(LLVMContext &Context,
- ID id, ArrayRef<Type*> Tys) {
- Type *ResultTy = NULL;
- SmallVector<Type*, 8> ArgTys;
- bool IsVarArg = false;
+
+/// IIT_Info - These are enumerators that describe the entries returned by the
+/// getIntrinsicInfoTableEntries function.
+///
+/// NOTE: This must be kept in sync with the copy in TblGen/IntrinsicEmitter!
+enum IIT_Info {
+ // Common values should be encoded with 0-15.
+ IIT_Done = 0,
+ IIT_I1 = 1,
+ IIT_I8 = 2,
+ IIT_I16 = 3,
+ IIT_I32 = 4,
+ IIT_I64 = 5,
+ IIT_F32 = 6,
+ IIT_F64 = 7,
+ IIT_V2 = 8,
+ IIT_V4 = 9,
+ IIT_V8 = 10,
+ IIT_V16 = 11,
+ IIT_V32 = 12,
+ IIT_MMX = 13,
+ IIT_PTR = 14,
+ IIT_ARG = 15,
-#define GET_INTRINSIC_GENERATOR
+ // Values from 16+ are only encodable with the inefficient encoding.
+ IIT_METADATA = 16,
+ IIT_EMPTYSTRUCT = 17,
+ IIT_STRUCT2 = 18,
+ IIT_STRUCT3 = 19,
+ IIT_STRUCT4 = 20,
+ IIT_STRUCT5 = 21,
+ IIT_EXTEND_VEC_ARG = 22,
+ IIT_TRUNC_VEC_ARG = 23,
+ IIT_ANYPTR = 24
+};
+
+
+static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
+ SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
+ IIT_Info Info = IIT_Info(Infos[NextElt++]);
+ unsigned StructElts = 2;
+ using namespace Intrinsic;
+
+ switch (Info) {
+ case IIT_Done:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0));
+ return;
+ case IIT_MMX:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0));
+ return;
+ case IIT_METADATA:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Metadata, 0));
+ return;
+ case IIT_F32:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Float, 0));
+ return;
+ case IIT_F64:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Double, 0));
+ return;
+ case IIT_I1:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 1));
+ return;
+ case IIT_I8:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8));
+ return;
+ case IIT_I16:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer,16));
+ return;
+ case IIT_I32:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 32));
+ return;
+ case IIT_I64:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 64));
+ return;
+ case IIT_V2:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ case IIT_V4:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 4));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ case IIT_V8:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 8));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ case IIT_V16:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 16));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ case IIT_V32:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 32));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ case IIT_PTR:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ case IIT_ANYPTR: { // [ANYPTR addrspace, subtype]
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer,
+ Infos[NextElt++]));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ }
+ case IIT_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Argument, ArgInfo));
+ return;
+ }
+ case IIT_EXTEND_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::ExtendVecArgument,
+ ArgInfo));
+ return;
+ }
+ case IIT_TRUNC_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::TruncVecArgument,
+ ArgInfo));
+ return;
+ }
+ case IIT_EMPTYSTRUCT:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0));
+ return;
+ case IIT_STRUCT5: ++StructElts; // FALL THROUGH.
+ case IIT_STRUCT4: ++StructElts; // FALL THROUGH.
+ case IIT_STRUCT3: ++StructElts; // FALL THROUGH.
+ case IIT_STRUCT2: {
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct,StructElts));
+
+ for (unsigned i = 0; i != StructElts; ++i)
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
+ }
+ }
+ llvm_unreachable("unhandled");
+}
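
DecodeIITType is a recursive-descent walk over a prefix code: compound entries such as vectors and pointers consume the entries that follow them. A minimal standalone sketch over a three-code subset (the real tables are generated into Intrinsics.gen):

#include <cassert>
#include <string>
#include <vector>

// Subset of the IIT_Info codes above.
enum { I32 = 4, V4 = 9, PTR = 14 };

std::string decode(const std::vector<unsigned char> &Infos, unsigned &Next) {
  switch (Infos[Next++]) {
  case I32: return "i32";
  case V4:  return "<4 x " + decode(Infos, Next) + ">"; // element type follows
  case PTR: return decode(Infos, Next) + "*";           // pointee type follows
  }
  return "?";
}

int main() {
  // [PTR, V4, I32] reads as "pointer to <4 x i32>"; each compound code
  // consumes the codes after it, exactly as DecodeIITType recurses.
  std::vector<unsigned char> Infos{PTR, V4, I32};
  unsigned Next = 0;
  assert(decode(Infos, Next) == "<4 x i32>*");
}
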
+
+
+#define GET_INTRINSIC_GENERATOR_GLOBAL
#include "llvm/Intrinsics.gen"
-#undef GET_INTRINSIC_GENERATOR
+#undef GET_INTRINSIC_GENERATOR_GLOBAL
+
+void Intrinsic::getIntrinsicInfoTableEntries(ID id,
+ SmallVectorImpl<IITDescriptor> &T){
+ // Check to see if the intrinsic's type was expressible by the table.
+ unsigned TableVal = IIT_Table[id-1];
+
+ // Decode the TableVal into an array of IITValues.
+ SmallVector<unsigned char, 8> IITValues;
+ ArrayRef<unsigned char> IITEntries;
+ unsigned NextElt = 0;
+ if ((TableVal >> 31) != 0) {
+ // This is an offset into the IIT_LongEncodingTable.
+ IITEntries = IIT_LongEncodingTable;
+
+ // Strip sentinel bit.
+ NextElt = (TableVal << 1) >> 1;
+ } else {
+ // Decode the TableVal into an array of IITValues. If the entry was encoded
+ // into a single word in the table itself, decode it now.
+ do {
+ IITValues.push_back(TableVal & 0xF);
+ TableVal >>= 4;
+ } while (TableVal);
+
+ IITEntries = IITValues;
+ NextElt = 0;
+ }
- return FunctionType::get(ResultTy, ArgTys, IsVarArg);
+ // Okay, decode the table into the output vector of IITDescriptors.
+ DecodeIITType(NextElt, IITEntries, T);
+ while (NextElt != IITEntries.size() && IITEntries[NextElt] != 0)
+ DecodeIITType(NextElt, IITEntries, T);
+}
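
In the common case the whole signature fits in one 32-bit table entry, four bits per code, lowest nibble first. A standalone illustration of that unpacking (the sentinel bit 31, which redirects to the long-encoding table, is ignored here):

#include <cassert>
#include <vector>

std::vector<unsigned char> unpackNibbles(unsigned TableVal) {
  std::vector<unsigned char> Vals;
  do {
    Vals.push_back(TableVal & 0xF); // lowest nibble is the first code
    TableVal >>= 4;
  } while (TableVal);
  return Vals;
}

int main() {
  // 0x514 unpacks to 4, 1, 5, i.e. IIT_I32, IIT_I1, IIT_I64: in this toy
  // reading, an intrinsic returning i32 and taking (i1, i64), since the
  // first decoded descriptor is the result type.
  std::vector<unsigned char> V = unpackNibbles(0x514);
  assert(V.size() == 3 && V[0] == 4 && V[1] == 1 && V[2] == 5);
}
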
+
+
+static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
+ ArrayRef<Type*> Tys, LLVMContext &Context) {
+ using namespace Intrinsic;
+ IITDescriptor D = Infos.front();
+ Infos = Infos.slice(1);
+
+ switch (D.Kind) {
+ case IITDescriptor::Void: return Type::getVoidTy(Context);
+ case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
+ case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
+ case IITDescriptor::Float: return Type::getFloatTy(Context);
+ case IITDescriptor::Double: return Type::getDoubleTy(Context);
+
+ case IITDescriptor::Integer:
+ return IntegerType::get(Context, D.Integer_Width);
+ case IITDescriptor::Vector:
+ return VectorType::get(DecodeFixedType(Infos, Tys, Context),D.Vector_Width);
+ case IITDescriptor::Pointer:
+ return PointerType::get(DecodeFixedType(Infos, Tys, Context),
+ D.Pointer_AddressSpace);
+ case IITDescriptor::Struct: {
+ Type *Elts[5];
+ assert(D.Struct_NumElements <= 5 && "Can't handle this yet");
+ for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
+ Elts[i] = DecodeFixedType(Infos, Tys, Context);
+ return StructType::get(Context, ArrayRef<Type*>(Elts,D.Struct_NumElements));
+ }
+
+ case IITDescriptor::Argument:
+ return Tys[D.getArgumentNumber()];
+ case IITDescriptor::ExtendVecArgument:
+ return VectorType::getExtendedElementVectorType(cast<VectorType>(
+ Tys[D.getArgumentNumber()]));
+
+ case IITDescriptor::TruncVecArgument:
+ return VectorType::getTruncatedElementVectorType(cast<VectorType>(
+ Tys[D.getArgumentNumber()]));
+ }
+ llvm_unreachable("unhandled");
+}
+
+
+
+FunctionType *Intrinsic::getType(LLVMContext &Context,
+ ID id, ArrayRef<Type*> Tys) {
+ SmallVector<IITDescriptor, 8> Table;
+ getIntrinsicInfoTableEntries(id, Table);
+
+ ArrayRef<IITDescriptor> TableRef = Table;
+ Type *ResultTy = DecodeFixedType(TableRef, Tys, Context);
+
+ SmallVector<Type*, 8> ArgTys;
+ while (!TableRef.empty())
+ ArgTys.push_back(DecodeFixedType(TableRef, Tys, Context));
+
+ return FunctionType::get(ResultTy, ArgTys, false);
}
bool Intrinsic::isOverloaded(ID id) {
@@ -400,7 +621,8 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) {
bool Function::hasAddressTaken(const User* *PutOffender) const {
for (Value::const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) {
const User *U = *I;
- // FIXME: Check for blockaddress, which does not take the address.
+ if (isa<BlockAddress>(U))
+ continue;
if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
return PutOffender ? (*PutOffender = U, true) : true;
ImmutableCallSite CS(cast<Instruction>(U));
@@ -439,4 +661,3 @@ bool Function::callsFunctionThatReturnsTwice() const {
return false;
}
-// vim: sw=2 ai
diff --git a/lib/VMCore/GCOV.cpp b/lib/VMCore/GCOV.cpp
index 595c452..003a5d4 100644
--- a/lib/VMCore/GCOV.cpp
+++ b/lib/VMCore/GCOV.cpp
@@ -64,7 +64,7 @@ bool GCOVFile::read(GCOVBuffer &Buffer) {
/// dump - Dump GCOVFile content on standard out for debugging purposes.
void GCOVFile::dump() {
for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
- E = Functions.end(); I != E; ++I)
+ E = Functions.end(); I != E; ++I)
(*I)->dump();
}
@@ -72,7 +72,7 @@ void GCOVFile::dump() {
/// reading .gcno and .gcda files.
void GCOVFile::collectLineCounts(FileInfo &FI) {
for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
- E = Functions.end(); I != E; ++I)
+ E = Functions.end(); I != E; ++I)
(*I)->collectLineCounts(FI);
FI.print();
}
@@ -143,7 +143,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
StringRef Filename = Buff.readString();
if (Buff.getCursor() == (Size - 4)) break;
while (uint32_t L = Buff.readInt())
- Block->addLine(Filename, L);
+ Block->addLine(Filename, L);
}
Buff.readInt(); // flag
}
@@ -154,7 +154,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
void GCOVFunction::dump() {
outs() << "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
- E = Blocks.end(); I != E; ++I)
+ E = Blocks.end(); I != E; ++I)
(*I)->dump();
}
@@ -162,7 +162,7 @@ void GCOVFunction::dump() {
/// reading .gcno and .gcda files.
void GCOVFunction::collectLineCounts(FileInfo &FI) {
for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
- E = Blocks.end(); I != E; ++I)
+ E = Blocks.end(); I != E; ++I)
(*I)->collectLineCounts(FI);
}
@@ -186,7 +186,7 @@ void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) {
/// reading .gcno and .gcda files.
void GCOVBlock::collectLineCounts(FileInfo &FI) {
for (StringMap<GCOVLines *>::iterator I = Lines.begin(),
- E = Lines.end(); I != E; ++I)
+ E = Lines.end(); I != E; ++I)
I->second->collectLineCounts(FI, I->first(), Counter);
}
@@ -196,14 +196,14 @@ void GCOVBlock::dump() {
if (!Edges.empty()) {
outs() << "\tEdges : ";
for (SmallVector<uint32_t, 16>::iterator I = Edges.begin(), E = Edges.end();
- I != E; ++I)
+ I != E; ++I)
outs() << (*I) << ",";
outs() << "\n";
}
if (!Lines.empty()) {
outs() << "\tLines : ";
for (StringMap<GCOVLines *>::iterator LI = Lines.begin(),
- LE = Lines.end(); LI != LE; ++LI) {
+ LE = Lines.end(); LI != LE; ++LI) {
outs() << LI->first() << " -> ";
LI->second->dump();
outs() << "\n";
@@ -217,16 +217,16 @@ void GCOVBlock::dump() {
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename,
- uint32_t Count) {
+ uint32_t Count) {
for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
- E = Lines.end(); I != E; ++I)
+ E = Lines.end(); I != E; ++I)
FI.addLineCount(Filename, *I, Count);
}
/// dump - Dump GCOVLines content on standard out for debugging purposes.
void GCOVLines::dump() {
for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
- E = Lines.end(); I != E; ++I)
+ E = Lines.end(); I != E; ++I)
outs() << (*I) << ",";
}
@@ -266,12 +266,12 @@ void FileInfo::print() {
StringRef AllLines = Buff.take()->getBuffer();
for (unsigned i = 0, e = L.size(); i != e; ++i) {
if (L[i])
- outs() << L[i] << ":\t";
+ outs() << L[i] << ":\t";
else
- outs() << " :\t";
+ outs() << " :\t";
std::pair<StringRef, StringRef> P = AllLines.split('\n');
if (AllLines != P.first)
- outs() << P.first;
+ outs() << P.first;
outs() << "\n";
AllLines = P.second;
}
diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp
index 4254fb2..c428b88 100644
--- a/lib/VMCore/Globals.cpp
+++ b/lib/VMCore/Globals.cpp
@@ -82,12 +82,12 @@ bool GlobalValue::isDeclaration() const {
GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
Constant *InitVal, const Twine &Name,
- bool ThreadLocal, unsigned AddressSpace)
- : GlobalValue(PointerType::get(Ty, AddressSpace),
+ ThreadLocalMode TLMode, unsigned AddressSpace)
+ : GlobalValue(PointerType::get(Ty, AddressSpace),
Value::GlobalVariableVal,
OperandTraits<GlobalVariable>::op_begin(this),
InitVal != 0, Link, Name),
- isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) {
+ isConstantGlobal(constant), threadLocalMode(TLMode) {
if (InitVal) {
assert(InitVal->getType() == Ty &&
"Initializer should be the same type as the GlobalVariable!");
@@ -100,13 +100,13 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
LinkageTypes Link, Constant *InitVal,
const Twine &Name,
- GlobalVariable *Before, bool ThreadLocal,
+ GlobalVariable *Before, ThreadLocalMode TLMode,
unsigned AddressSpace)
- : GlobalValue(PointerType::get(Ty, AddressSpace),
+ : GlobalValue(PointerType::get(Ty, AddressSpace),
Value::GlobalVariableVal,
OperandTraits<GlobalVariable>::op_begin(this),
InitVal != 0, Link, Name),
- isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) {
+ isConstantGlobal(constant), threadLocalMode(TLMode) {
if (InitVal) {
assert(InitVal->getType() == Ty &&
"Initializer should be the same type as the GlobalVariable!");
diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp
index b459234..5c4e6d9 100644
--- a/lib/VMCore/IRBuilder.cpp
+++ b/lib/VMCore/IRBuilder.cpp
@@ -12,9 +12,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
using namespace llvm;
@@ -28,7 +28,7 @@ Value *IRBuilderBase::CreateGlobalString(StringRef Str, const Twine &Name) {
Module &M = *BB->getParent()->getParent();
GlobalVariable *GV = new GlobalVariable(M, StrConstant->getType(),
true, GlobalValue::PrivateLinkage,
- StrConstant, "", 0, false);
+ StrConstant);
GV->setName(Name);
GV->setUnnamedAddr(true);
return GV;
@@ -120,13 +120,13 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
- "lifetime.start only applies to pointers.");
+ "lifetime.start only applies to pointers.");
Ptr = getCastedInt8PtrValue(Ptr);
if (!Size)
Size = getInt64(-1);
else
assert(Size->getType() == getInt64Ty() &&
- "lifetime.start requires the size to be an i64");
+ "lifetime.start requires the size to be an i64");
Value *Ops[] = { Size, Ptr };
Module *M = BB->getParent()->getParent();
Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start);
@@ -135,13 +135,13 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
- "lifetime.end only applies to pointers.");
+ "lifetime.end only applies to pointers.");
Ptr = getCastedInt8PtrValue(Ptr);
if (!Size)
Size = getInt64(-1);
else
assert(Size->getType() == getInt64Ty() &&
- "lifetime.end requires the size to be an i64");
+ "lifetime.end requires the size to be an i64");
Value *Ops[] = { Size, Ptr };
Module *M = BB->getParent()->getParent();
Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp
index 5449714..66379a0 100644
--- a/lib/VMCore/Instruction.cpp
+++ b/lib/VMCore/Instruction.cpp
@@ -226,34 +226,52 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const {
RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() &&
RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() &&
RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope();
-
+ if (const PHINode *thisPHI = dyn_cast<PHINode>(this)) {
+ const PHINode *otherPHI = cast<PHINode>(I);
+ for (unsigned i = 0, e = thisPHI->getNumOperands(); i != e; ++i) {
+ if (thisPHI->getIncomingBlock(i) != otherPHI->getIncomingBlock(i))
+ return false;
+ }
+ return true;
+ }
return true;
}
// isSameOperationAs
// This should be kept in sync with isEquivalentOperation in
// lib/Transforms/IPO/MergeFunctions.cpp.
-bool Instruction::isSameOperationAs(const Instruction *I) const {
+bool Instruction::isSameOperationAs(const Instruction *I,
+ unsigned flags) const {
+ bool IgnoreAlignment = flags & CompareIgnoringAlignment;
+ bool UseScalarTypes = flags & CompareUsingScalarTypes;
+
if (getOpcode() != I->getOpcode() ||
getNumOperands() != I->getNumOperands() ||
- getType() != I->getType())
+ (UseScalarTypes ?
+ getType()->getScalarType() != I->getType()->getScalarType() :
+ getType() != I->getType()))
return false;
// We have two instructions of identical opcode and #operands. Check to see
// if all operands are the same type
for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
- if (getOperand(i)->getType() != I->getOperand(i)->getType())
+ if (UseScalarTypes ?
+ getOperand(i)->getType()->getScalarType() !=
+ I->getOperand(i)->getType()->getScalarType() :
+ getOperand(i)->getType() != I->getOperand(i)->getType())
return false;
// Check special state that is a part of some instructions.
if (const LoadInst *LI = dyn_cast<LoadInst>(this))
return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
- LI->getAlignment() == cast<LoadInst>(I)->getAlignment() &&
+ (LI->getAlignment() == cast<LoadInst>(I)->getAlignment() ||
+ IgnoreAlignment) &&
LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
if (const StoreInst *SI = dyn_cast<StoreInst>(this))
return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
- SI->getAlignment() == cast<StoreInst>(I)->getAlignment() &&
+ (SI->getAlignment() == cast<StoreInst>(I)->getAlignment() ||
+ IgnoreAlignment) &&
SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
if (const CmpInst *CI = dyn_cast<CmpInst>(this))
@@ -388,6 +406,29 @@ bool Instruction::isCommutative(unsigned op) {
}
}
+/// isIdempotent - Return true if the instruction is idempotent:
+///
+/// Idempotent operators satisfy: x op x === x
+///
+/// In LLVM, the And and Or operators are idempotent.
+///
+bool Instruction::isIdempotent(unsigned Opcode) {
+ return Opcode == And || Opcode == Or;
+}
+
+/// isNilpotent - Return true if the instruction is nilpotent:
+///
+/// Nilpotent operators satisfy: x op x === Id,
+///
+/// where Id is the identity for the operator, i.e. a constant such that
+/// x op Id === x and Id op x === x for all x.
+///
+/// In LLVM, the Xor operator is nilpotent.
+///
+bool Instruction::isNilpotent(unsigned Opcode) {
+ return Opcode == Xor;
+}
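
The two predicates encode simple bitwise identities, which a quick standalone check makes concrete:

#include <cassert>

int main() {
  for (unsigned x : {0u, 1u, 0xDEADBEEFu}) {
    assert((x & x) == x);  // And is idempotent: x op x == x
    assert((x | x) == x);  // Or is idempotent as well
    assert((x ^ x) == 0u); // Xor is nilpotent: x op x == the identity, 0
  }
}
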
+
Instruction *Instruction::clone() const {
Instruction *New = clone_impl();
New->SubclassOptionalData = SubclassOptionalData;
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 6c5db32..9af98e8 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -161,8 +161,14 @@ Value *PHINode::hasConstantValue() const {
// Exploit the fact that phi nodes always have at least one entry.
Value *ConstantValue = getIncomingValue(0);
for (unsigned i = 1, e = getNumIncomingValues(); i != e; ++i)
- if (getIncomingValue(i) != ConstantValue)
- return 0; // Incoming values not all the same.
+ if (getIncomingValue(i) != ConstantValue && getIncomingValue(i) != this) {
+ if (ConstantValue != this)
+ return 0; // Incoming values not all the same.
+ // The case where the first value is this PHI.
+ ConstantValue = getIncomingValue(i);
+ }
+ if (ConstantValue == this)
+ return UndefValue::get(getType());
return ConstantValue;
}
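
The updated loop lets a PHI that lists itself as an incoming value still fold: self-references are skipped, and when every operand is the PHI itself the result is undef. A standalone mimic, with raw pointers standing in for Value* and the PHI's own address as the self sentinel:

#include <cassert>
#include <vector>

// Returns the unique non-self incoming value, nullptr when two distinct
// non-self values appear, or `self` when every operand is the PHI itself
// (the caller then produces undef).
const int *phiConstantValue(const int *self, const std::vector<const int *> &In) {
  const int *C = In[0];
  for (size_t i = 1, e = In.size(); i != e; ++i)
    if (In[i] != C && In[i] != self) {
      if (C != self)
        return nullptr; // two distinct non-self incoming values
      C = In[i];        // the first operand was the PHI itself
    }
  return C;
}

int main() {
  int x = 0, p = 0; // p's address stands in for the PHI node
  const int *self = &p;
  assert(phiConstantValue(self, {self, &x}) == &x);     // [ %p, %x ] -> %x
  assert(phiConstantValue(self, {self, self}) == self); // all-self -> undef
  int y = 0;
  assert(phiConstantValue(self, {&x, &y}) == nullptr);  // conflicting values
}
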
@@ -3158,6 +3164,7 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
OL[i] = InOL[i];
OL[i+1] = InOL[i+1];
}
+ TheSubsets = SI.TheSubsets;
SubclassOptionalData = SI.SubclassOptionalData;
}
@@ -3169,6 +3176,16 @@ SwitchInst::~SwitchInst() {
/// addCase - Add an entry to the switch instruction...
///
void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
+ IntegersSubsetToBB Mapping;
+
+ // FIXME: Currently we work with ConstantInt-based cases.
+ // So initialize the IntItem container directly from ConstantInt.
+ Mapping.add(IntItem::fromConstantInt(OnVal));
+ IntegersSubset CaseRanges = Mapping.getCase();
+ addCase(CaseRanges, Dest);
+}
+
+void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
unsigned NewCaseIdx = getNumCases();
unsigned OpNo = NumOperands;
if (OpNo+2 > ReservedSpace)
@@ -3176,14 +3193,17 @@ void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
// Initialize some new operands.
assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
NumOperands = OpNo+2;
- CaseIt Case(this, NewCaseIdx);
- Case.setValue(OnVal);
+
+ SubsetsIt TheSubsetsIt = TheSubsets.insert(TheSubsets.end(), OnVal);
+
+ CaseIt Case(this, NewCaseIdx, TheSubsetsIt);
+ Case.updateCaseValueOperand(OnVal);
Case.setSuccessor(Dest);
}
/// removeCase - This method removes the specified case and its successor
/// from the switch instruction.
-void SwitchInst::removeCase(CaseIt i) {
+void SwitchInst::removeCase(CaseIt& i) {
unsigned idx = i.getCaseIndex();
assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
@@ -3200,6 +3220,16 @@ void SwitchInst::removeCase(CaseIt i) {
// Nuke the last value.
OL[NumOps-2].set(0);
OL[NumOps-2+1].set(0);
+
+ // Do the same with TheCases collection:
+ if (i.SubsetIt != --TheSubsets.end()) {
+ *i.SubsetIt = TheSubsets.back();
+ TheSubsets.pop_back();
+ } else {
+ TheSubsets.pop_back();
+ i.SubsetIt = TheSubsets.end();
+ }
+
NumOperands = NumOps-2;
}
diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp
index 090b09a..95e5a8b 100644
--- a/lib/VMCore/Metadata.cpp
+++ b/lib/VMCore/Metadata.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/STLExtras.h"
#include "SymbolTableListTraitsImpl.h"
+#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/LeakDetector.h"
#include "llvm/Support/ValueHandle.h"
using namespace llvm;
@@ -66,7 +67,11 @@ public:
MDNodeOperand(Value *V) : CallbackVH(V) {}
~MDNodeOperand() {}
- void set(Value *V) { this->setValPtr(V); }
+ void set(Value *V) {
+ unsigned IsFirst = this->getValPtrInt();
+ this->setValPtr(V);
+ this->setAsFirstOperand(IsFirst);
+ }
/// setAsFirstOperand - Accessor method to mark the operand as the first in
/// the list.
@@ -95,7 +100,7 @@ void MDNodeOperand::allUsesReplacedWith(Value *NV) {
static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) {
// Use <= instead of < to permit a one-past-the-end address.
assert(Op <= N->getNumOperands() && "Invalid operand number");
- return reinterpret_cast<MDNodeOperand*>(N+1)+Op;
+ return reinterpret_cast<MDNodeOperand*>(N + 1) + Op;
}
void MDNode::replaceOperandWith(unsigned i, Value *Val) {
@@ -122,7 +127,6 @@ MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal)
}
}
-
/// ~MDNode - Destroy MDNode.
MDNode::~MDNode() {
assert((getSubclassDataFromValue() & DestroyFlag) != 0 &&
@@ -196,7 +200,7 @@ const Function *MDNode::getFunction() const {
// destroy - Delete this node. Only when there are no uses.
void MDNode::destroy() {
setValueSubclassData(getSubclassDataFromValue() | DestroyFlag);
- // Placement delete, the free the memory.
+ // Placement delete, then free the memory.
this->~MDNode();
free(this);
}
@@ -247,7 +251,7 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals,
}
// Coallocate space for the node and Operands together, then placement new.
- void *Ptr = malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand));
+ void *Ptr = malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
N = new (Ptr) MDNode(Context, Vals, isFunctionLocal);
// Cache the operand hash.
@@ -275,7 +279,7 @@ MDNode *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals) {
MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) {
MDNode *N =
- (MDNode *)malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand));
+ (MDNode *)malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
N = new (N) MDNode(Context, Vals, FL_No);
N->setValueSubclassData(N->getSubclassDataFromValue() |
NotUniquedBit);
@@ -398,6 +402,155 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
}
}
+MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
+ if (!A || !B)
+ return NULL;
+
+ if (A == B)
+ return A;
+
+ SmallVector<MDNode *, 4> PathA;
+ MDNode *T = A;
+ while (T) {
+ PathA.push_back(T);
+ T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+ }
+
+ SmallVector<MDNode *, 4> PathB;
+ T = B;
+ while (T) {
+ PathB.push_back(T);
+ T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+ }
+
+ int IA = PathA.size() - 1;
+ int IB = PathB.size() - 1;
+
+ MDNode *Ret = 0;
+ while (IA >= 0 && IB >=0) {
+ if (PathA[IA] == PathB[IB])
+ Ret = PathA[IA];
+ else
+ break;
+ --IA;
+ --IB;
+ }
+ return Ret;
+}
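
getMostGenericTBAA collects each node's path to its root and then scans from the roots downward for the deepest shared node. A self-contained sketch of that walk, with a bare parent pointer in place of operand 1 of the TBAA node:

#include <cassert>
#include <vector>

struct Node { const Node *Parent; }; // roots have Parent == nullptr

const Node *mostGenericCommon(const Node *A, const Node *B) {
  if (!A || !B) return nullptr;
  if (A == B) return A;
  std::vector<const Node *> PA, PB;
  for (const Node *T = A; T; T = T->Parent) PA.push_back(T);
  for (const Node *T = B; T; T = T->Parent) PB.push_back(T);
  // Walk from the roots toward the leaves; the last node the two paths
  // share is the most generic common type. Distinct roots give nullptr.
  const Node *Ret = nullptr;
  int IA = PA.size() - 1, IB = PB.size() - 1;
  while (IA >= 0 && IB >= 0 && PA[IA] == PB[IB]) {
    Ret = PA[IA];
    --IA; --IB;
  }
  return Ret;
}

int main() {
  Node Root{nullptr}, Char{&Root}, Int{&Root}, IntPtr{&Int};
  assert(mostGenericCommon(&IntPtr, &Int) == &Int);
  assert(mostGenericCommon(&Char, &IntPtr) == &Root);
}
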
+
+MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
+ if (!A || !B)
+ return NULL;
+
+ APFloat AVal = cast<ConstantFP>(A->getOperand(0))->getValueAPF();
+ APFloat BVal = cast<ConstantFP>(B->getOperand(0))->getValueAPF();
+ if (AVal.compare(BVal) == APFloat::cmpLessThan)
+ return A;
+ return B;
+}
+
+static bool isContiguous(const ConstantRange &A, const ConstantRange &B) {
+ return A.getUpper() == B.getLower() || A.getLower() == B.getUpper();
+}
+
+static bool canBeMerged(const ConstantRange &A, const ConstantRange &B) {
+ return !A.intersectWith(B).isEmptySet() || isContiguous(A, B);
+}
+
+static bool tryMergeRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+ ConstantInt *High) {
+ ConstantRange NewRange(Low->getValue(), High->getValue());
+ unsigned Size = EndPoints.size();
+ APInt LB = cast<ConstantInt>(EndPoints[Size - 2])->getValue();
+ APInt LE = cast<ConstantInt>(EndPoints[Size - 1])->getValue();
+ ConstantRange LastRange(LB, LE);
+ if (canBeMerged(NewRange, LastRange)) {
+ ConstantRange Union = LastRange.unionWith(NewRange);
+ Type *Ty = High->getType();
+ EndPoints[Size - 2] = ConstantInt::get(Ty, Union.getLower());
+ EndPoints[Size - 1] = ConstantInt::get(Ty, Union.getUpper());
+ return true;
+ }
+ return false;
+}
+
+static void addRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+ ConstantInt *High) {
+ if (!EndPoints.empty())
+ if (tryMergeRange(EndPoints, Low, High))
+ return;
+
+ EndPoints.push_back(Low);
+ EndPoints.push_back(High);
+}
+
+MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
+ // Given two ranges, we want to compute the union of the ranges. This
+ // is slightly complicated by having to combine the intervals and merge
+ // the ones that overlap.
+
+ if (!A || !B)
+ return NULL;
+
+ if (A == B)
+ return A;
+
+ // First, walk both lists in order of the lower boundary of each interval.
+ // At each step, try to merge the new interval into the last one we added.
+ SmallVector<Value*, 4> EndPoints;
+ int AI = 0;
+ int BI = 0;
+ int AN = A->getNumOperands() / 2;
+ int BN = B->getNumOperands() / 2;
+ while (AI < AN && BI < BN) {
+ ConstantInt *ALow = cast<ConstantInt>(A->getOperand(2 * AI));
+ ConstantInt *BLow = cast<ConstantInt>(B->getOperand(2 * BI));
+
+ if (ALow->getValue().slt(BLow->getValue())) {
+ addRange(EndPoints, ALow, cast<ConstantInt>(A->getOperand(2 * AI + 1)));
+ ++AI;
+ } else {
+ addRange(EndPoints, BLow, cast<ConstantInt>(B->getOperand(2 * BI + 1)));
+ ++BI;
+ }
+ }
+ while (AI < AN) {
+ addRange(EndPoints, cast<ConstantInt>(A->getOperand(2 * AI)),
+ cast<ConstantInt>(A->getOperand(2 * AI + 1)));
+ ++AI;
+ }
+ while (BI < BN) {
+ addRange(EndPoints, cast<ConstantInt>(B->getOperand(2 * BI)),
+ cast<ConstantInt>(B->getOperand(2 * BI + 1)));
+ ++BI;
+ }
+
+ // If we have more than 2 ranges (4 endpoints) we have to try to merge
+ // the last and first ones.
+ unsigned Size = EndPoints.size();
+ if (Size > 4) {
+ ConstantInt *FB = cast<ConstantInt>(EndPoints[0]);
+ ConstantInt *FE = cast<ConstantInt>(EndPoints[1]);
+ if (tryMergeRange(EndPoints, FB, FE)) {
+ for (unsigned i = 0; i < Size - 2; ++i) {
+ EndPoints[i] = EndPoints[i + 2];
+ }
+ EndPoints.resize(Size - 2);
+ }
+ }
+
+ // If in the end we have a single range, it is possible that it is now the
+ // full range. Just drop the metadata in that case.
+ if (EndPoints.size() == 2) {
+ ConstantRange Range(cast<ConstantInt>(EndPoints[0])->getValue(),
+ cast<ConstantInt>(EndPoints[1])->getValue());
+ if (Range.isFullSet())
+ return NULL;
+ }
+
+ return MDNode::get(A->getContext(), EndPoints);
+}
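
The merge step treats two ranges as joinable when they intersect or merely touch. A minimal sketch of that criterion over half-open [Lo, Hi) ranges, using plain ints rather than APInt and ignoring the wrapped ranges that ConstantRange also supports:

#include <cassert>
#include <utility>

using Range = std::pair<int, int>; // [first, second)

static bool contiguous(Range A, Range B) {
  return A.second == B.first || A.first == B.second;
}
static bool overlaps(Range A, Range B) {
  return A.first < B.second && B.first < A.second;
}
static bool canBeMerged(Range A, Range B) {
  return overlaps(A, B) || contiguous(A, B);
}

int main() {
  assert(canBeMerged({0, 2}, {1, 5}));  // overlap -> union is [0,5)
  assert(canBeMerged({0, 2}, {2, 5}));  // touching endpoints merge too
  assert(!canBeMerged({0, 2}, {3, 5})); // the gap at 2 keeps them separate
}
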
+
//===----------------------------------------------------------------------===//
// NamedMDNode implementation.
//
diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp
index 3c67191..5b5176b 100644
--- a/lib/VMCore/Module.cpp
+++ b/lib/VMCore/Module.cpp
@@ -65,20 +65,20 @@ Module::~Module() {
Module::Endianness Module::getEndianness() const {
StringRef temp = DataLayout;
Module::Endianness ret = AnyEndianness;
-
+
while (!temp.empty()) {
std::pair<StringRef, StringRef> P = getToken(temp, "-");
-
+
StringRef token = P.first;
temp = P.second;
-
+
if (token[0] == 'e') {
ret = LittleEndian;
} else if (token[0] == 'E') {
ret = BigEndian;
}
}
-
+
return ret;
}
@@ -86,13 +86,13 @@ Module::Endianness Module::getEndianness() const {
Module::PointerSize Module::getPointerSize() const {
StringRef temp = DataLayout;
Module::PointerSize ret = AnyPointerSize;
-
+
while (!temp.empty()) {
std::pair<StringRef, StringRef> TmpP = getToken(temp, "-");
temp = TmpP.second;
TmpP = getToken(TmpP.first, ":");
StringRef token = TmpP.second, signalToken = TmpP.first;
-
+
if (signalToken[0] == 'p') {
int size = 0;
getToken(token, ":").first.getAsInteger(10, size);
@@ -102,7 +102,7 @@ Module::PointerSize Module::getPointerSize() const {
ret = Pointer64;
}
}
-
+
return ret;
}
@@ -164,9 +164,9 @@ Constant *Module::getOrInsertFunction(StringRef Name,
// right type.
if (F->getType() != PointerType::getUnqual(Ty))
return ConstantExpr::getBitCast(F, PointerType::getUnqual(Ty));
-
+
// Otherwise, we just found the existing function or a prototype.
- return F;
+ return F;
}
Constant *Module::getOrInsertTargetIntrinsic(StringRef Name,
@@ -183,13 +183,12 @@ Constant *Module::getOrInsertTargetIntrinsic(StringRef Name,
}
// Otherwise, we just found the existing function or a prototype.
- return F;
+ return F;
}
Constant *Module::getOrInsertFunction(StringRef Name,
FunctionType *Ty) {
- AttrListPtr AttributeList = AttrListPtr::get((AttributeWithIndex *)0, 0);
- return getOrInsertFunction(Name, Ty, AttributeList);
+ return getOrInsertFunction(Name, Ty, AttrListPtr());
}
// getOrInsertFunction - Look up the specified function in the module symbol
@@ -229,9 +228,9 @@ Constant *Module::getOrInsertFunction(StringRef Name,
va_end(Args);
// Build the function type and chain to the other getOrInsertFunction...
- return getOrInsertFunction(Name,
+ return getOrInsertFunction(Name,
FunctionType::get(RetTy, ArgTys, false),
- AttrListPtr::get((AttributeWithIndex *)0, 0));
+ AttrListPtr());
}
// getFunction - Look up the specified function in the module symbol table.
@@ -254,7 +253,7 @@ Function *Module::getFunction(StringRef Name) const {
///
GlobalVariable *Module::getGlobalVariable(StringRef Name,
bool AllowLocal) const {
- if (GlobalVariable *Result =
+ if (GlobalVariable *Result =
dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
if (AllowLocal || !Result->hasLocalLinkage())
return Result;
@@ -282,7 +281,7 @@ Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
// right type.
if (GV->getType() != PointerType::getUnqual(Ty))
return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty));
-
+
// Otherwise, we just found the existing function or a prototype.
return GV;
}
@@ -299,7 +298,7 @@ GlobalAlias *Module::getNamedAlias(StringRef Name) const {
}
/// getNamedMetadata - Return the first NamedMDNode in the module with the
-/// specified name. This method returns null if a NamedMDNode with the
+/// specified name. This method returns null if a NamedMDNode with the
/// specified name is not found.
NamedMDNode *Module::getNamedMetadata(const Twine &Name) const {
SmallString<256> NameData;
@@ -307,8 +306,8 @@ NamedMDNode *Module::getNamedMetadata(const Twine &Name) const {
return static_cast<StringMap<NamedMDNode*> *>(NamedMDSymTab)->lookup(NameRef);
}
-/// getOrInsertNamedMetadata - Return the first named MDNode in the module
-/// with the specified name. This method returns a new NamedMDNode if a
+/// getOrInsertNamedMetadata - Return the first named MDNode in the module
+/// with the specified name. This method returns a new NamedMDNode if a
/// NamedMDNode with the specified name is not found.
NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) {
NamedMDNode *&NMD =
@@ -468,128 +467,3 @@ void Module::removeLibrary(StringRef Lib) {
return;
}
}
-
-//===----------------------------------------------------------------------===//
-// Type finding functionality.
-//===----------------------------------------------------------------------===//
-
-namespace {
- /// TypeFinder - Walk over a module, identifying all of the types that are
- /// used by the module.
- class TypeFinder {
- // To avoid walking constant expressions multiple times and other IR
- // objects, we keep several helper maps.
- DenseSet<const Value*> VisitedConstants;
- DenseSet<Type*> VisitedTypes;
-
- std::vector<StructType*> &StructTypes;
- public:
- TypeFinder(std::vector<StructType*> &structTypes)
- : StructTypes(structTypes) {}
-
- void run(const Module &M) {
- // Get types from global variables.
- for (Module::const_global_iterator I = M.global_begin(),
- E = M.global_end(); I != E; ++I) {
- incorporateType(I->getType());
- if (I->hasInitializer())
- incorporateValue(I->getInitializer());
- }
-
- // Get types from aliases.
- for (Module::const_alias_iterator I = M.alias_begin(),
- E = M.alias_end(); I != E; ++I) {
- incorporateType(I->getType());
- if (const Value *Aliasee = I->getAliasee())
- incorporateValue(Aliasee);
- }
-
- SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst;
-
- // Get types from functions.
- for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
- incorporateType(FI->getType());
-
- for (Function::const_iterator BB = FI->begin(), E = FI->end();
- BB != E;++BB)
- for (BasicBlock::const_iterator II = BB->begin(),
- E = BB->end(); II != E; ++II) {
- const Instruction &I = *II;
- // Incorporate the type of the instruction and all its operands.
- incorporateType(I.getType());
- for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
- OI != OE; ++OI)
- incorporateValue(*OI);
-
- // Incorporate types hiding in metadata.
- I.getAllMetadataOtherThanDebugLoc(MDForInst);
- for (unsigned i = 0, e = MDForInst.size(); i != e; ++i)
- incorporateMDNode(MDForInst[i].second);
- MDForInst.clear();
- }
- }
-
- for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
- E = M.named_metadata_end(); I != E; ++I) {
- const NamedMDNode *NMD = I;
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
- incorporateMDNode(NMD->getOperand(i));
- }
- }
-
- private:
- void incorporateType(Type *Ty) {
- // Check to see if we've already visited this type.
- if (!VisitedTypes.insert(Ty).second)
- return;
-
- // If this is a structure or opaque type, add a name for the type.
- if (StructType *STy = dyn_cast<StructType>(Ty))
- StructTypes.push_back(STy);
-
- // Recursively walk all contained types.
- for (Type::subtype_iterator I = Ty->subtype_begin(),
- E = Ty->subtype_end(); I != E; ++I)
- incorporateType(*I);
- }
-
- /// incorporateValue - This method is used to walk operand lists finding
- /// types hiding in constant expressions and other operands that won't be
- /// walked in other ways. GlobalValues, basic blocks, instructions, and
- /// inst operands are all explicitly enumerated.
- void incorporateValue(const Value *V) {
- if (const MDNode *M = dyn_cast<MDNode>(V))
- return incorporateMDNode(M);
- if (!isa<Constant>(V) || isa<GlobalValue>(V)) return;
-
- // Already visited?
- if (!VisitedConstants.insert(V).second)
- return;
-
- // Check this type.
- incorporateType(V->getType());
-
- // Look in operands for types.
- const User *U = cast<User>(V);
- for (Constant::const_op_iterator I = U->op_begin(),
- E = U->op_end(); I != E;++I)
- incorporateValue(*I);
- }
-
- void incorporateMDNode(const MDNode *V) {
-
- // Already visited?
- if (!VisitedConstants.insert(V).second)
- return;
-
- // Look in operands for types.
- for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i)
- if (Value *Op = V->getOperand(i))
- incorporateValue(Op);
- }
- };
-} // end anonymous namespace
-
-void Module::findUsedStructTypes(std::vector<StructType*> &StructTypes) const {
- TypeFinder(StructTypes).run(*this);
-}
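
The two hunks above delete the file-local TypeFinder helper together with its only entry point, Module::findUsedStructTypes(); the walker reappears as the public TypeFinder class in lib/VMCore/TypeFinder.cpp later in this patch, where it additionally visits function arguments and can restrict itself to named structs. A minimal migration sketch for out-of-tree callers, assuming the accessors declared in llvm/TypeFinder.h:

    #include "llvm/Module.h"
    #include "llvm/TypeFinder.h"
    using namespace llvm;

    // Previously: std::vector<StructType*> Tys; M.findUsedStructTypes(Tys);
    static void collectStructTypes(const Module &M,
                                   std::vector<StructType*> &Tys) {
      TypeFinder Finder;
      Finder.run(M, /*onlyNamed=*/false); // walk globals, aliases, functions
      Tys.assign(Finder.begin(), Finder.end());
    }
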
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index 28fbaa6..4530c04 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -478,8 +478,7 @@ PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
/// Set pass P as the last user of the given analysis passes.
void
-PMTopLevelManager::setLastUser(const SmallVectorImpl<Pass *> &AnalysisPasses,
- Pass *P) {
+PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
unsigned PDepth = 0;
if (P->getResolver())
PDepth = P->getResolver()->getPMDataManager().getDepth();
@@ -594,6 +593,26 @@ void PMTopLevelManager::schedulePass(Pass *P) {
Pass *AnalysisPass = findAnalysisPass(*I);
if (!AnalysisPass) {
const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+
+ if (PI == NULL) {
+ // Pass P is not in the global PassRegistry
+ dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n";
+ dbgs() << "Verify if there is a pass dependency cycle." << "\n";
+ dbgs() << "Required Passes:" << "\n";
+ for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
+ E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
+ Pass *AnalysisPass2 = findAnalysisPass(*I2);
+ if (AnalysisPass2) {
+ dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
+ }
+ else {
+ dbgs() << "\t" << "Error: Required pass not found! Possible causes:" << "\n";
+ dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)" << "\n";
+ dbgs() << "\t\t" << "- Corruption of the global PassRegistry" << "\n";
+ }
+ }
+ }
+
assert(PI && "Expected required passes to be initialized");
AnalysisPass = PI->createPass();
if (P->getPotentialPassManagerType () ==
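
The added block turns a bare assertion failure into a usable diagnostic when schedulePass() cannot find a required analysis in the global PassRegistry, typically because the pass was never registered. For reference, a minimal sketch of the classic registration idiom that keeps a pass out of this error path; MyPass, "my-pass", and the description string are placeholders, not part of the patch:

    #include "llvm/Function.h"
    #include "llvm/Pass.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    namespace {
      struct MyPass : public FunctionPass {
        static char ID; // Pass identification, replacement for typeid.
        MyPass() : FunctionPass(ID) {}
        virtual bool runOnFunction(Function &F) {
          errs() << "visiting " << F.getName() << "\n";
          return false; // IR is not modified.
        }
      };
    }

    char MyPass::ID = 0;
    // Registration puts the pass into the PassRegistry that
    // PMTopLevelManager::schedulePass() consults above.
    static RegisterPass<MyPass> X("my-pass", "Example pass",
                                  false /*CFGOnly*/, false /*is_analysis*/);
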
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index c6f3558..5e9a00f 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -464,19 +464,26 @@ void StructType::setBody(ArrayRef<Type*> Elements, bool isPacked) {
void StructType::setName(StringRef Name) {
if (Name == getName()) return;
- // If this struct already had a name, remove its symbol table entry.
- if (SymbolTableEntry) {
- getContext().pImpl->NamedStructTypes.erase(getName());
- SymbolTableEntry = 0;
- }
-
+ StringMap<StructType *> &SymbolTable = getContext().pImpl->NamedStructTypes;
+ typedef StringMap<StructType *>::MapEntryTy EntryTy;
+
+ // If this struct already had a name, remove its symbol table entry. Don't
+ // delete the data yet because it may be part of the new name.
+ if (SymbolTableEntry)
+ SymbolTable.remove((EntryTy *)SymbolTableEntry);
+
// If this is just removing the name, we're done.
- if (Name.empty())
+ if (Name.empty()) {
+ if (SymbolTableEntry) {
+ // Delete the old string data.
+ ((EntryTy *)SymbolTableEntry)->Destroy(SymbolTable.getAllocator());
+ SymbolTableEntry = 0;
+ }
return;
+ }
// Look up the entry for the name.
- StringMapEntry<StructType*> *Entry =
- &getContext().pImpl->NamedStructTypes.GetOrCreateValue(Name);
+ EntryTy *Entry = &getContext().pImpl->NamedStructTypes.GetOrCreateValue(Name);
// While we have a name collision, try a random rename.
if (Entry->getValue()) {
@@ -497,7 +504,10 @@ void StructType::setName(StringRef Name) {
// Okay, we found an entry that isn't used. It's us!
Entry->setValue(this);
-
+
+ // Delete the old string data.
+ if (SymbolTableEntry)
+ ((EntryTy *)SymbolTableEntry)->Destroy(SymbolTable.getAllocator());
SymbolTableEntry = Entry;
}
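
The rewritten StructType::setName() unlinks the old symbol-table entry with remove() but defers freeing its string data until after the new entry exists. The reason is in the added comment: the incoming Name may be a StringRef into the old entry's own storage, so the old erase-then-lookup order could read freed memory. A hypothetical call pattern that exercises exactly that aliasing:

    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    void shrinkName(LLVMContext &Ctx) {
      StructType *STy = StructType::create(Ctx, "mystruct.extra");
      // getName() returns a StringRef into the symbol-table entry itself.
      // Freeing that entry before the GetOrCreateValue() lookup (as the old
      // code did) would leave this argument dangling while still in use.
      STy->setName(STy->getName().substr(0, 8)); // -> "mystruct"
    }
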
diff --git a/lib/VMCore/TypeFinder.cpp b/lib/VMCore/TypeFinder.cpp
new file mode 100644
index 0000000..4de649f
--- /dev/null
+++ b/lib/VMCore/TypeFinder.cpp
@@ -0,0 +1,148 @@
+//===-- TypeFinder.cpp - Implement the TypeFinder class -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TypeFinder class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TypeFinder.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Metadata.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+void TypeFinder::run(const Module &M, bool onlyNamed) {
+ OnlyNamed = onlyNamed;
+
+ // Get types from global variables.
+ for (Module::const_global_iterator I = M.global_begin(),
+ E = M.global_end(); I != E; ++I) {
+ incorporateType(I->getType());
+ if (I->hasInitializer())
+ incorporateValue(I->getInitializer());
+ }
+
+ // Get types from aliases.
+ for (Module::const_alias_iterator I = M.alias_begin(),
+ E = M.alias_end(); I != E; ++I) {
+ incorporateType(I->getType());
+ if (const Value *Aliasee = I->getAliasee())
+ incorporateValue(Aliasee);
+ }
+
+ // Get types from functions.
+ SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst;
+ for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
+ incorporateType(FI->getType());
+
+ // First incorporate the arguments.
+ for (Function::const_arg_iterator AI = FI->arg_begin(),
+ AE = FI->arg_end(); AI != AE; ++AI)
+ incorporateValue(AI);
+
+ for (Function::const_iterator BB = FI->begin(), E = FI->end();
+ BB != E;++BB)
+ for (BasicBlock::const_iterator II = BB->begin(),
+ E = BB->end(); II != E; ++II) {
+ const Instruction &I = *II;
+
+ // Incorporate the type of the instruction.
+ incorporateType(I.getType());
+
+ // Incorporate non-instruction operand types. (We are incorporating all
+ // instructions with this loop.)
+ for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
+ OI != OE; ++OI)
+ if (!isa<Instruction>(OI))
+ incorporateValue(*OI);
+
+ // Incorporate types hiding in metadata.
+ I.getAllMetadataOtherThanDebugLoc(MDForInst);
+ for (unsigned i = 0, e = MDForInst.size(); i != e; ++i)
+ incorporateMDNode(MDForInst[i].second);
+
+ MDForInst.clear();
+ }
+ }
+
+ for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end(); I != E; ++I) {
+ const NamedMDNode *NMD = I;
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+ incorporateMDNode(NMD->getOperand(i));
+ }
+}
+
+void TypeFinder::clear() {
+ VisitedConstants.clear();
+ VisitedTypes.clear();
+ StructTypes.clear();
+}
+
+/// incorporateType - This method adds the type to the list of used structures
+/// if it's not in there already.
+void TypeFinder::incorporateType(Type *Ty) {
+ // Check to see if we've already visited this type.
+ if (!VisitedTypes.insert(Ty).second)
+ return;
+
+ // If this is a structure or opaque type, add a name for the type.
+ if (StructType *STy = dyn_cast<StructType>(Ty))
+ if (!OnlyNamed || STy->hasName())
+ StructTypes.push_back(STy);
+
+ // Recursively walk all contained types.
+ for (Type::subtype_iterator I = Ty->subtype_begin(),
+ E = Ty->subtype_end(); I != E; ++I)
+ incorporateType(*I);
+}
+
+/// incorporateValue - This method is used to walk operand lists finding types
+/// hiding in constant expressions and other operands that won't be walked in
+/// other ways. GlobalValues, basic blocks, instructions, and inst operands are
+/// all explicitly enumerated.
+void TypeFinder::incorporateValue(const Value *V) {
+ if (const MDNode *M = dyn_cast<MDNode>(V))
+ return incorporateMDNode(M);
+
+ if (!isa<Constant>(V) || isa<GlobalValue>(V)) return;
+
+ // Already visited?
+ if (!VisitedConstants.insert(V).second)
+ return;
+
+ // Check this type.
+ incorporateType(V->getType());
+
+ // If this is an instruction, we incorporate it separately.
+ if (isa<Instruction>(V))
+ return;
+
+ // Look in operands for types.
+ const User *U = cast<User>(V);
+ for (Constant::const_op_iterator I = U->op_begin(),
+ E = U->op_end(); I != E;++I)
+ incorporateValue(*I);
+}
+
+/// incorporateMDNode - This method is used to walk the operands of an MDNode to
+/// find types hiding within.
+void TypeFinder::incorporateMDNode(const MDNode *V) {
+ // Already visited?
+ if (!VisitedConstants.insert(V).second)
+ return;
+
+ // Look in operands for types.
+ for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i)
+ if (Value *Op = V->getOperand(i))
+ incorporateValue(Op);
+}
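
Relative to the deleted Module helper, run() gains the onlyNamed mode used when only identified (named) structs matter, e.g. for printing. A short sketch, assuming TypeFinder exposes the usual size()/begin()/end() accessors in llvm/TypeFinder.h:

    #include "llvm/Module.h"
    #include "llvm/TypeFinder.h"
    using namespace llvm;

    static unsigned countNamedStructs(const Module &M) {
      TypeFinder Finder;
      Finder.run(M, /*onlyNamed=*/true); // skips literal/anonymous structs
      return Finder.size();
    }
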
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index 4006b2c..d871108 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -686,6 +686,9 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
#endif
}
-/// ~CallbackVH. Empty, but defined here to avoid emitting the vtable
-/// more than once.
-CallbackVH::~CallbackVH() {}
+// Default implementation for CallbackVH.
+void CallbackVH::allUsesReplacedWith(Value *) {}
+
+void CallbackVH::deleted() {
+ setValPtr(NULL);
+}
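
The patch replaces the out-of-line CallbackVH destructor with out-of-line definitions of the two default callbacks, which now serve as the vtable anchor: deleted() clears the handle and allUsesReplacedWith() does nothing, so subclasses override only what they need. A minimal subclass sketch, assuming just the llvm/Support/ValueHandle.h interface:

    #include "llvm/Support/ValueHandle.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    class LoggingVH : public CallbackVH {
    public:
      explicit LoggingVH(Value *V) : CallbackVH(V) {}
      virtual void deleted() {
        errs() << "tracked value deleted\n";
        CallbackVH::deleted(); // default behavior: setValPtr(NULL)
      }
      virtual void allUsesReplacedWith(Value *New) {
        errs() << "tracked value RAUW'd to " << New->getName() << "\n";
      }
    };
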
diff --git a/lib/VMCore/ValueTypes.cpp b/lib/VMCore/ValueTypes.cpp
index 9a8e185..d1ca953 100644
--- a/lib/VMCore/ValueTypes.cpp
+++ b/lib/VMCore/ValueTypes.cpp
@@ -71,6 +71,10 @@ bool EVT::isExtended512BitVector() const {
return isExtendedVector() && getSizeInBits() == 512;
}
+bool EVT::isExtended1024BitVector() const {
+ return isExtendedVector() && getSizeInBits() == 1024;
+}
+
EVT EVT::getExtendedVectorElementType() const {
assert(isExtended() && "Type is not extended!");
return EVT::getEVT(cast<VectorType>(LLVMTy)->getElementType());
@@ -128,10 +132,12 @@ std::string EVT::getEVTString() const {
case MVT::v2i32: return "v2i32";
case MVT::v4i32: return "v4i32";
case MVT::v8i32: return "v8i32";
+ case MVT::v16i32: return "v16i32";
case MVT::v1i64: return "v1i64";
case MVT::v2i64: return "v2i64";
case MVT::v4i64: return "v4i64";
case MVT::v8i64: return "v8i64";
+ case MVT::v16i64: return "v16i64";
case MVT::v2f32: return "v2f32";
case MVT::v2f16: return "v2f16";
case MVT::v4f32: return "v4f32";
@@ -177,10 +183,12 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v2i32: return VectorType::get(Type::getInt32Ty(Context), 2);
case MVT::v4i32: return VectorType::get(Type::getInt32Ty(Context), 4);
case MVT::v8i32: return VectorType::get(Type::getInt32Ty(Context), 8);
+ case MVT::v16i32: return VectorType::get(Type::getInt32Ty(Context), 16);
case MVT::v1i64: return VectorType::get(Type::getInt64Ty(Context), 1);
case MVT::v2i64: return VectorType::get(Type::getInt64Ty(Context), 2);
case MVT::v4i64: return VectorType::get(Type::getInt64Ty(Context), 4);
case MVT::v8i64: return VectorType::get(Type::getInt64Ty(Context), 8);
+ case MVT::v16i64: return VectorType::get(Type::getInt64Ty(Context), 16);
case MVT::v2f16: return VectorType::get(Type::getHalfTy(Context), 2);
case MVT::v2f32: return VectorType::get(Type::getFloatTy(Context), 2);
case MVT::v4f32: return VectorType::get(Type::getFloatTy(Context), 4);
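
Together with the corresponding MVT additions, the new cases cover the wider v16i32 (512-bit) and v16i64 (1024-bit) simple types, so such vectors round-trip between IR types and EVTs without falling into the extended-type path. A small self-check; the asserts are illustrative:

    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/CodeGen/ValueTypes.h"
    #include <cassert>
    using namespace llvm;

    void roundTrip(LLVMContext &Ctx) {
      Type *Ty = VectorType::get(Type::getInt64Ty(Ctx), 16); // v16i64
      EVT VT = EVT::getEVT(Ty);
      assert(VT.getEVTString() == "v16i64");
      assert(VT.getSizeInBits() == 1024);
      assert(VT.getTypeForEVT(Ctx) == Ty);
    }
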
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index 47baef3..6851246 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -68,6 +68,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -293,8 +294,9 @@ namespace {
void VerifyCallSite(CallSite CS);
bool PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty,
int VT, unsigned ArgNo, std::string &Suffix);
- void VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F,
- unsigned RetNum, unsigned ParamNum, ...);
+ bool VerifyIntrinsicType(Type *Ty,
+ ArrayRef<Intrinsic::IITDescriptor> &Infos,
+ SmallVectorImpl<Type*> &ArgTys);
void VerifyParameterAttrs(Attributes Attrs, Type *Ty,
bool isReturnValue, const Value *V);
void VerifyFunctionAttrs(FunctionType *FT, const AttrListPtr &Attrs,
@@ -804,14 +806,29 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
// Check to make sure that all of the constants in the switch instruction
// have the same type as the switched-on value.
Type *SwitchTy = SI.getCondition()->getType();
- SmallPtrSet<ConstantInt*, 32> Constants;
+ IntegerType *IntTy = cast<IntegerType>(SwitchTy);
+ IntegersSubsetToBB Mapping;
+ std::map<IntegersSubset::Range, unsigned> RangeSetMap;
for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
- Assert1(i.getCaseValue()->getType() == SwitchTy,
- "Switch constants must all be same type as switch value!", &SI);
- Assert2(Constants.insert(i.getCaseValue()),
- "Duplicate integer as switch case", &SI, i.getCaseValue());
+ IntegersSubset CaseRanges = i.getCaseValueEx();
+ for (unsigned ri = 0, rie = CaseRanges.getNumItems(); ri < rie; ++ri) {
+ IntegersSubset::Range r = CaseRanges.getItem(ri);
+ Assert1(((const APInt&)r.getLow()).getBitWidth() == IntTy->getBitWidth(),
+ "Switch constants must all be same type as switch value!", &SI);
+ Assert1(((const APInt&)r.getHigh()).getBitWidth() == IntTy->getBitWidth(),
+ "Switch constants must all be same type as switch value!", &SI);
+ Mapping.add(r);
+ RangeSetMap[r] = i.getCaseIndex();
+ }
}
-
+
+ IntegersSubsetToBB::RangeIterator errItem;
+ if (!Mapping.verify(errItem)) {
+ unsigned CaseIndex = RangeSetMap[errItem->first];
+ SwitchInst::CaseIt i(&SI, CaseIndex);
+ Assert2(false, "Duplicate integer as switch case", &SI, i.getCaseValueEx());
+ }
+
visitTerminatorInst(SI);
}
@@ -1346,6 +1363,10 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
visitInstruction(GEP);
}
+static bool isContiguous(const ConstantRange &A, const ConstantRange &B) {
+ return A.getUpper() == B.getLower() || A.getLower() == B.getUpper();
+}
+
void Verifier::visitLoadInst(LoadInst &LI) {
PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType());
Assert1(PTy, "Load operand must be a pointer.", &LI);
@@ -1367,6 +1388,8 @@ void Verifier::visitLoadInst(LoadInst &LI) {
Assert1(NumOperands % 2 == 0, "Unfinished range!", Range);
unsigned NumRanges = NumOperands / 2;
Assert1(NumRanges >= 1, "It should have at least one range!", Range);
+
+ ConstantRange LastRange(1); // Dummy initial value
for (unsigned i = 0; i < NumRanges; ++i) {
ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i));
Assert1(Low, "The lower limit must be an integer!", Low);
@@ -1375,9 +1398,35 @@ void Verifier::visitLoadInst(LoadInst &LI) {
Assert1(High->getType() == Low->getType() &&
High->getType() == ElTy, "Range types must match load type!",
&LI);
- Assert1(High->getValue() != Low->getValue(), "Range must not be empty!",
+
+ APInt HighV = High->getValue();
+ APInt LowV = Low->getValue();
+ ConstantRange CurRange(LowV, HighV);
+ Assert1(!CurRange.isEmptySet() && !CurRange.isFullSet(),
+ "Range must not be empty!", Range);
+ if (i != 0) {
+ Assert1(CurRange.intersectWith(LastRange).isEmptySet(),
+ "Intervals are overlapping", Range);
+ Assert1(LowV.sgt(LastRange.getLower()), "Intervals are not in order",
+ Range);
+ Assert1(!isContiguous(CurRange, LastRange), "Intervals are contiguous",
+ Range);
+ }
+ LastRange = ConstantRange(LowV, HighV);
+ }
+ if (NumRanges > 2) {
+ APInt FirstLow =
+ dyn_cast<ConstantInt>(Range->getOperand(0))->getValue();
+ APInt FirstHigh =
+ dyn_cast<ConstantInt>(Range->getOperand(1))->getValue();
+ ConstantRange FirstRange(FirstLow, FirstHigh);
+ Assert1(FirstRange.intersectWith(LastRange).isEmptySet(),
+ "Intervals are overlapping", Range);
+ Assert1(!isContiguous(FirstRange, LastRange), "Intervals are contiguous",
Range);
}
+
+
}
visitInstruction(LI);
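
The tightened verifier now requires !range operands to be [Lo, Hi) pairs of the load's type that are non-empty, sorted, non-overlapping, and non-contiguous, and, when there are more than two ranges, re-checks the first pair against the last to handle a wrapped final range. A sketch of attaching metadata that satisfies these rules; the bounds are purely illustrative:

    #include "llvm/Constants.h"
    #include "llvm/Instructions.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/Metadata.h"
    using namespace llvm;

    // Marks an i8 load as yielding a value in [1, 17) or [32, 48).
    void addRangeMD(LoadInst *LI) {
      LLVMContext &Ctx = LI->getContext();
      Type *Int8 = Type::getInt8Ty(Ctx);
      Value *Ops[] = {
        ConstantInt::get(Int8, 1),  ConstantInt::get(Int8, 17),
        ConstantInt::get(Int8, 32), ConstantInt::get(Int8, 48),
      };
      LI->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, Ops));
    }
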
@@ -1487,7 +1536,7 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) {
// landing pad block may be branched to only by the unwind edge of an invoke.
for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator());
- Assert1(II && II->getUnwindDest() == BB,
+ Assert1(II && II->getUnwindDest() == BB && II->getNormalDest() != BB,
"Block containing LandingPadInst must be jumped to "
"only by the unwind edge of an invoke.", &LPI);
}
@@ -1526,53 +1575,9 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) {
void Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
Instruction *Op = cast<Instruction>(I.getOperand(i));
- BasicBlock *BB = I.getParent();
- BasicBlock *OpBlock = Op->getParent();
- PHINode *PN = dyn_cast<PHINode>(&I);
-
- // DT can handle non phi instructions for us.
- if (!PN) {
- // Definition must dominate use unless use is unreachable!
- Assert2(InstsInThisBlock.count(Op) || !DT->isReachableFromEntry(BB) ||
- DT->dominates(Op, &I),
- "Instruction does not dominate all uses!", Op, &I);
- return;
- }
- // Check that a definition dominates all of its uses.
- if (InvokeInst *II = dyn_cast<InvokeInst>(Op)) {
- // Invoke results are only usable in the normal destination, not in the
- // exceptional destination.
- BasicBlock *NormalDest = II->getNormalDest();
-
-
- // PHI nodes differ from other nodes because they actually "use" the
- // value in the predecessor basic blocks they correspond to.
- BasicBlock *UseBlock = BB;
- unsigned j = PHINode::getIncomingValueNumForOperand(i);
- UseBlock = PN->getIncomingBlock(j);
- Assert2(UseBlock, "Invoke operand is PHI node with bad incoming-BB",
- Op, &I);
-
- if (UseBlock == OpBlock) {
- // Special case of a phi node in the normal destination or the unwind
- // destination.
- Assert2(BB == NormalDest || !DT->isReachableFromEntry(UseBlock),
- "Invoke result not available in the unwind destination!",
- Op, &I);
- } else {
- Assert2(DT->dominates(II, UseBlock) ||
- !DT->isReachableFromEntry(UseBlock),
- "Invoke result does not dominate all uses!", Op, &I);
- }
- }
-
- // PHI nodes are more difficult than other nodes because they actually
- // "use" the value in the predecessor basic blocks they correspond to.
- unsigned j = PHINode::getIncomingValueNumForOperand(i);
- BasicBlock *PredBB = PN->getIncomingBlock(j);
- Assert2(PredBB && (DT->dominates(OpBlock, PredBB) ||
- !DT->isReachableFromEntry(PredBB)),
+ const Use &U = I.getOperandUse(i);
+ Assert2(InstsInThisBlock.count(Op) || DT->dominates(Op, U),
"Instruction does not dominate all uses!", Op, &I);
}
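
The hand-rolled PHI and invoke special-casing is gone: verifyDominatesUse() now leans on the dominates(const Instruction*, const Use&) overload, which already treats a PHI operand as a use at the end of the corresponding incoming block and understands invoke results. A sketch of the pattern the new code relies on, assuming only the Dominators.h interface the hunk itself calls:

    #include "llvm/Analysis/Dominators.h"
    #include "llvm/Instruction.h"
    using namespace llvm;

    // For PHIs, dominates(Def, U) evaluates the use at the end of the
    // incoming block, so one query covers what the deleted code did by hand.
    static bool defDominatesUse(DominatorTree &DT, Instruction *Def,
                                Instruction &UserInst, unsigned OpNo) {
      const Use &U = UserInst.getOperandUse(OpNo);
      return DT.dominates(Def, U);
    }
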
@@ -1631,8 +1636,11 @@ void Verifier::visitInstruction(Instruction &I) {
if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
// Check to make sure that the "address of" an intrinsic function is never
// taken.
- Assert1(!F->isIntrinsic() || (i + 1 == e && isa<CallInst>(I)),
+ Assert1(!F->isIntrinsic() || i == (isa<CallInst>(I) ? e-1 : 0),
"Cannot take the address of an intrinsic!", &I);
+ Assert1(!F->isIntrinsic() || isa<CallInst>(I) ||
+ F->getIntrinsicID() == Intrinsic::donothing,
+ "Cannot invoke an intrinsinc other than donothing", &I);
Assert1(F->getParent() == Mod, "Referencing function in another module!",
&I);
} else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) {
@@ -1673,10 +1681,85 @@ void Verifier::visitInstruction(Instruction &I) {
InstsInThisBlock.insert(&I);
}
-// Flags used by TableGen to mark intrinsic parameters with the
-// LLVMExtendedElementVectorType and LLVMTruncatedElementVectorType classes.
-static const unsigned ExtendedElementVectorType = 0x40000000;
-static const unsigned TruncatedElementVectorType = 0x20000000;
+/// VerifyIntrinsicType - Verify that the specified type (which comes from an
+/// intrinsic argument or return value) matches the type constraints specified
+/// by the .td file (e.g. an "any integer" argument really is an integer).
+///
+/// This returns true on error but does not print a message.
+bool Verifier::VerifyIntrinsicType(Type *Ty,
+ ArrayRef<Intrinsic::IITDescriptor> &Infos,
+ SmallVectorImpl<Type*> &ArgTys) {
+ using namespace Intrinsic;
+
+ // If we ran out of descriptors, there are too many arguments.
+ if (Infos.empty()) return true;
+ IITDescriptor D = Infos.front();
+ Infos = Infos.slice(1);
+
+ switch (D.Kind) {
+ case IITDescriptor::Void: return !Ty->isVoidTy();
+ case IITDescriptor::MMX: return !Ty->isX86_MMXTy();
+ case IITDescriptor::Metadata: return !Ty->isMetadataTy();
+ case IITDescriptor::Float: return !Ty->isFloatTy();
+ case IITDescriptor::Double: return !Ty->isDoubleTy();
+ case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width);
+ case IITDescriptor::Vector: {
+ VectorType *VT = dyn_cast<VectorType>(Ty);
+ return VT == 0 || VT->getNumElements() != D.Vector_Width ||
+ VerifyIntrinsicType(VT->getElementType(), Infos, ArgTys);
+ }
+ case IITDescriptor::Pointer: {
+ PointerType *PT = dyn_cast<PointerType>(Ty);
+ return PT == 0 || PT->getAddressSpace() != D.Pointer_AddressSpace ||
+ VerifyIntrinsicType(PT->getElementType(), Infos, ArgTys);
+ }
+
+ case IITDescriptor::Struct: {
+ StructType *ST = dyn_cast<StructType>(Ty);
+ if (ST == 0 || ST->getNumElements() != D.Struct_NumElements)
+ return true;
+
+ for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
+ if (VerifyIntrinsicType(ST->getElementType(i), Infos, ArgTys))
+ return true;
+ return false;
+ }
+
+ case IITDescriptor::Argument:
+ // Two cases here - If this is the second occurrence of an argument, verify
+ // that the later instance matches the previous instance.
+ if (D.getArgumentNumber() < ArgTys.size())
+ return Ty != ArgTys[D.getArgumentNumber()];
+
+ // Otherwise, if this is the first instance of an argument, record it and
+ // verify the "Any" kind.
+ assert(D.getArgumentNumber() == ArgTys.size() && "Table consistency error");
+ ArgTys.push_back(Ty);
+
+ switch (D.getArgumentKind()) {
+ case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy();
+ case IITDescriptor::AK_AnyFloat: return !Ty->isFPOrFPVectorTy();
+ case IITDescriptor::AK_AnyVector: return !isa<VectorType>(Ty);
+ case IITDescriptor::AK_AnyPointer: return !isa<PointerType>(Ty);
+ }
+ llvm_unreachable("all argument kinds not covered");
+
+ case IITDescriptor::ExtendVecArgument:
+ // This may only be used when referring to a previous vector argument.
+ return D.getArgumentNumber() >= ArgTys.size() ||
+ !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getExtendedElementVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+
+ case IITDescriptor::TruncVecArgument:
+ // This may only be used when referring to a previous vector argument.
+ return D.getArgumentNumber() >= ArgTys.size() ||
+ !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getTruncatedElementVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ }
+ llvm_unreachable("unhandled");
+}
/// visitIntrinsicFunction - Allow intrinsics to be verified in different ways.
///
@@ -1685,10 +1768,30 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
Assert1(IF->isDeclaration(), "Intrinsic functions should never be defined!",
IF);
-#define GET_INTRINSIC_VERIFIER
-#include "llvm/Intrinsics.gen"
-#undef GET_INTRINSIC_VERIFIER
-
+ // Verify that the intrinsic prototype lines up with what the .td files
+ // describe.
+ FunctionType *IFTy = IF->getFunctionType();
+ Assert1(!IFTy->isVarArg(), "Intrinsic prototypes are not varargs", IF);
+
+ SmallVector<Intrinsic::IITDescriptor, 8> Table;
+ getIntrinsicInfoTableEntries(ID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+ SmallVector<Type *, 4> ArgTys;
+ Assert1(!VerifyIntrinsicType(IFTy->getReturnType(), TableRef, ArgTys),
+ "Intrinsic has incorrect return type!", IF);
+ for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i)
+ Assert1(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys),
+ "Intrinsic has incorrect argument type!", IF);
+ Assert1(TableRef.empty(), "Intrinsic has too few arguments!", IF);
+
+ // Now that we have the intrinsic ID and the actual argument types (and we
+ // know they are legal for the intrinsic!) get the intrinsic name through the
+ // usual means. This allows us to verify the mangling of argument types into
+ // the name.
+ Assert1(Intrinsic::getName(ID, ArgTys) == IF->getName(),
+ "Intrinsic name not mangled correctly for type arguments!", IF);
+
// If the intrinsic takes MDNode arguments, verify that they are either global
// or are local to *this* function.
for (unsigned i = 0, e = CI.getNumArgOperands(); i != e; ++i)
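
Verification is now table-driven: TableGen emits a compact IITDescriptor stream per intrinsic, VerifyIntrinsicType() consumes it against the declared prototype, and the recorded overload types are fed back through Intrinsic::getName() to validate the mangled symbol. A sketch of the naming convention that last assertion enforces; the intrinsic and vector type are illustrative:

    #include "llvm/DerivedTypes.h"
    #include "llvm/Intrinsics.h"
    #include "llvm/Module.h"
    using namespace llvm;

    // Overloaded intrinsics mangle their type arguments into the symbol
    // name; a declaration whose name disagrees is now rejected.
    Function *declareCtlz(Module &M) {
      Type *V4I32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
      Function *F = Intrinsic::getDeclaration(&M, Intrinsic::ctlz, V4I32);
      // F->getName() == "llvm.ctlz.v4i32"
      return F;
    }
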
@@ -1772,261 +1875,6 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
}
}
-/// Produce a string to identify an intrinsic parameter or return value.
-/// The ArgNo value numbers the return values from 0 to NumRets-1 and the
-/// parameters beginning with NumRets.
-///
-static std::string IntrinsicParam(unsigned ArgNo, unsigned NumRets) {
- if (ArgNo >= NumRets)
- return "Intrinsic parameter #" + utostr(ArgNo - NumRets);
- if (NumRets == 1)
- return "Intrinsic result type";
- return "Intrinsic result type #" + utostr(ArgNo);
-}
-
-bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty,
- int VT, unsigned ArgNo, std::string &Suffix) {
- FunctionType *FTy = F->getFunctionType();
-
- unsigned NumElts = 0;
- Type *EltTy = Ty;
- VectorType *VTy = dyn_cast<VectorType>(Ty);
- if (VTy) {
- EltTy = VTy->getElementType();
- NumElts = VTy->getNumElements();
- }
-
- Type *RetTy = FTy->getReturnType();
- StructType *ST = dyn_cast<StructType>(RetTy);
- unsigned NumRetVals;
- if (RetTy->isVoidTy())
- NumRetVals = 0;
- else if (ST)
- NumRetVals = ST->getNumElements();
- else
- NumRetVals = 1;
-
- if (VT < 0) {
- int Match = ~VT;
-
- // Check flags that indicate a type that is an integral vector type with
- // elements that are larger or smaller than the elements of the matched
- // type.
- if ((Match & (ExtendedElementVectorType |
- TruncatedElementVectorType)) != 0) {
- IntegerType *IEltTy = dyn_cast<IntegerType>(EltTy);
- if (!VTy || !IEltTy) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not "
- "an integral vector type.", F);
- return false;
- }
- // Adjust the current Ty (in the opposite direction) rather than
- // the type being matched against.
- if ((Match & ExtendedElementVectorType) != 0) {
- if ((IEltTy->getBitWidth() & 1) != 0) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " vector "
- "element bit-width is odd.", F);
- return false;
- }
- Ty = VectorType::getTruncatedElementVectorType(VTy);
- } else
- Ty = VectorType::getExtendedElementVectorType(VTy);
- Match &= ~(ExtendedElementVectorType | TruncatedElementVectorType);
- }
-
- if (Match <= static_cast<int>(NumRetVals - 1)) {
- if (ST)
- RetTy = ST->getElementType(Match);
-
- if (Ty != RetTy) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not "
- "match return type.", F);
- return false;
- }
- } else {
- if (Ty != FTy->getParamType(Match - NumRetVals)) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not "
- "match parameter %" + utostr(Match - NumRetVals) + ".", F);
- return false;
- }
- }
- } else if (VT == MVT::iAny) {
- if (!EltTy->isIntegerTy()) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not "
- "an integer type.", F);
- return false;
- }
-
- unsigned GotBits = cast<IntegerType>(EltTy)->getBitWidth();
- Suffix += ".";
-
- if (EltTy != Ty)
- Suffix += "v" + utostr(NumElts);
-
- Suffix += "i" + utostr(GotBits);
-
- // Check some constraints on various intrinsics.
- switch (ID) {
- default: break; // Not everything needs to be checked.
- case Intrinsic::bswap:
- if (GotBits < 16 || GotBits % 16 != 0) {
- CheckFailed("Intrinsic requires even byte width argument", F);
- return false;
- }
- break;
- }
- } else if (VT == MVT::fAny) {
- if (!EltTy->isFloatingPointTy()) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not "
- "a floating-point type.", F);
- return false;
- }
-
- Suffix += ".";
-
- if (EltTy != Ty)
- Suffix += "v" + utostr(NumElts);
-
- Suffix += EVT::getEVT(EltTy).getEVTString();
- } else if (VT == MVT::vAny) {
- if (!VTy) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a vector type.",
- F);
- return false;
- }
- Suffix += ".v" + utostr(NumElts) + EVT::getEVT(EltTy).getEVTString();
- } else if (VT == MVT::iPTR) {
- if (!Ty->isPointerTy()) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a "
- "pointer and a pointer is required.", F);
- return false;
- }
- } else if (VT == MVT::iPTRAny) {
- // Outside of TableGen, we don't distinguish iPTRAny (to any address space)
- // and iPTR. In the verifier, we can not distinguish which case we have so
- // allow either case to be legal.
- if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
- EVT PointeeVT = EVT::getEVT(PTyp->getElementType(), true);
- if (PointeeVT == MVT::Other) {
- CheckFailed("Intrinsic has pointer to complex type.");
- return false;
- }
- Suffix += ".p" + utostr(PTyp->getAddressSpace()) +
- PointeeVT.getEVTString();
- } else {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a "
- "pointer and a pointer is required.", F);
- return false;
- }
- } else if (EVT((MVT::SimpleValueType)VT).isVector()) {
- EVT VVT = EVT((MVT::SimpleValueType)VT);
-
- // If this is a vector argument, verify the number and type of elements.
- if (VVT.getVectorElementType() != EVT::getEVT(EltTy)) {
- CheckFailed("Intrinsic prototype has incorrect vector element type!", F);
- return false;
- }
-
- if (VVT.getVectorNumElements() != NumElts) {
- CheckFailed("Intrinsic prototype has incorrect number of "
- "vector elements!", F);
- return false;
- }
- } else if (EVT((MVT::SimpleValueType)VT).getTypeForEVT(Ty->getContext()) !=
- EltTy) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is wrong!", F);
- return false;
- } else if (EltTy != Ty) {
- CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is a vector "
- "and a scalar is required.", F);
- return false;
- }
-
- return true;
-}
-
-/// VerifyIntrinsicPrototype - TableGen emits calls to this function into
-/// Intrinsics.gen. This implements a little state machine that verifies the
-/// prototype of intrinsics.
-void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F,
- unsigned NumRetVals,
- unsigned NumParams, ...) {
- va_list VA;
- va_start(VA, NumParams);
- FunctionType *FTy = F->getFunctionType();
-
- // For overloaded intrinsics, the Suffix of the function name must match the
- // types of the arguments. This variable keeps track of the expected
- // suffix, to be checked at the end.
- std::string Suffix;
-
- if (FTy->getNumParams() + FTy->isVarArg() != NumParams) {
- CheckFailed("Intrinsic prototype has incorrect number of arguments!", F);
- return;
- }
-
- Type *Ty = FTy->getReturnType();
- StructType *ST = dyn_cast<StructType>(Ty);
-
- if (NumRetVals == 0 && !Ty->isVoidTy()) {
- CheckFailed("Intrinsic should return void", F);
- return;
- }
-
- // Verify the return types.
- if (ST && ST->getNumElements() != NumRetVals) {
- CheckFailed("Intrinsic prototype has incorrect number of return types!", F);
- return;
- }
-
- for (unsigned ArgNo = 0; ArgNo != NumRetVals; ++ArgNo) {
- int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative.
-
- if (ST) Ty = ST->getElementType(ArgNo);
- if (!PerformTypeCheck(ID, F, Ty, VT, ArgNo, Suffix))
- break;
- }
-
- // Verify the parameter types.
- for (unsigned ArgNo = 0; ArgNo != NumParams; ++ArgNo) {
- int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative.
-
- if (VT == MVT::isVoid && ArgNo > 0) {
- if (!FTy->isVarArg())
- CheckFailed("Intrinsic prototype has no '...'!", F);
- break;
- }
-
- if (!PerformTypeCheck(ID, F, FTy->getParamType(ArgNo), VT,
- ArgNo + NumRetVals, Suffix))
- break;
- }
-
- va_end(VA);
-
- // For intrinsics without pointer arguments, if we computed a Suffix then the
- // intrinsic is overloaded and we need to make sure that the name of the
- // function is correct. We add the suffix to the name of the intrinsic and
- // compare against the given function name. If they are not the same, the
- // function name is invalid. This ensures that overloading of intrinsics
- // uses a sane and consistent naming convention. Note that intrinsics with
- // pointer argument may or may not be overloaded so we will check assuming it
- // has a suffix and not.
- if (!Suffix.empty()) {
- std::string Name(Intrinsic::getName(ID));
- if (Name + Suffix != F->getName()) {
- CheckFailed("Overloaded intrinsic has incorrect suffix: '" +
- F->getName().substr(Name.length()) + "'. It should be '" +
- Suffix + "'", F);
- }
- }
-
- // Check parameter attributes.
- Assert1(F->getAttributes() == Intrinsic::getAttributes(ID),
- "Intrinsic has wrong parameter attributes!", F);
-}
-
-
//===----------------------------------------------------------------------===//
// Implement the public interfaces to this file...
//===----------------------------------------------------------------------===//
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index 415530e..dac6373 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -4,7 +4,8 @@
file(GLOB entries *)
foreach(entry ${entries})
if(IS_DIRECTORY ${entry} AND EXISTS ${entry}/CMakeLists.txt)
- if(NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt)
+ if((NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt) AND
+ (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libcxx))
add_subdirectory(${entry})
endif()
endif()
diff --git a/projects/sample/Makefile.llvm.config.in b/projects/sample/Makefile.llvm.config.in
index 697660c..9a85b3d 100644
--- a/projects/sample/Makefile.llvm.config.in
+++ b/projects/sample/Makefile.llvm.config.in
@@ -157,8 +157,6 @@ GAS := @GAS@
POD2HTML := @POD2HTML@
POD2MAN := @POD2MAN@
PDFROFF := @PDFROFF@
-RUNTEST := @RUNTEST@
-TCLSH := @TCLSH@
ZIP := @ZIP@
HAVE_PTHREAD := @HAVE_PTHREAD@
diff --git a/projects/sample/Makefile.llvm.rules b/projects/sample/Makefile.llvm.rules
index 6e04724..a655302 100644
--- a/projects/sample/Makefile.llvm.rules
+++ b/projects/sample/Makefile.llvm.rules
@@ -463,9 +463,6 @@ ifndef LLVM_TBLGEN
endif
endif
LLVM_CONFIG := $(LLVMToolDir)/llvm-config
-ifndef LLVMLD
-LLVMLD := $(LLVMToolDir)/llvm-ld$(EXEEXT)
-endif
ifndef LLVMDIS
LLVMDIS := $(LLVMToolDir)/llvm-dis$(EXEEXT)
endif
@@ -1869,20 +1866,9 @@ check::
$(EchoCmd) No test directory ; \
fi
+# An alias dating from when both lit and DejaGNU test runners were used.
check-lit:: check
-check-dg::
- $(Verb) if test -d "$(PROJ_OBJ_ROOT)/test" ; then \
- if test -f "$(PROJ_OBJ_ROOT)/test/Makefile" ; then \
- $(EchoCmd) Running test suite ; \
- $(MAKE) -C $(PROJ_OBJ_ROOT)/test check-local-dg ; \
- else \
- $(EchoCmd) No Makefile in test directory ; \
- fi ; \
- else \
- $(EchoCmd) No test directory ; \
- fi
-
check-all::
$(Verb) if test -d "$(PROJ_OBJ_ROOT)/test" ; then \
if test -f "$(PROJ_OBJ_ROOT)/test/Makefile" ; then \
diff --git a/projects/sample/autoconf/config.guess b/projects/sample/autoconf/config.guess
index cc726cd..f7dd69e 100755
--- a/projects/sample/autoconf/config.guess
+++ b/projects/sample/autoconf/config.guess
@@ -1,9 +1,10 @@
#! /bin/sh
# Attempt to guess a canonical system name.
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-# 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+# 2011 Free Software Foundation, Inc.
-timestamp='2003-02-22'
+timestamp='2011-08-20'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -17,23 +18,25 @@ timestamp='2003-02-22'
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.
-# Originally written by Per Bothner <per@bothner.com>.
-# Please send patches to <config-patches@gnu.org>. Submit a context
-# diff and a properly formatted ChangeLog entry.
+
+# Originally written by Per Bothner. Please send patches (context
+# diff format) to <config-patches@gnu.org> and include a ChangeLog
+# entry.
#
# This script attempts to guess a canonical system name similar to
# config.sub. If it succeeds, it prints the system name on stdout, and
# exits with 0. Otherwise, it exits with 1.
#
-# The plan is that this can be called by configure scripts if you
-# don't specify an explicit build system type.
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
me=`echo "$0" | sed -e 's,.*/,,'`
@@ -53,8 +56,9 @@ version="\
GNU config.guess ($timestamp)
Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001
-Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free
+Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -66,11 +70,11 @@ Try \`$me --help' for more information."
while test $# -gt 0 ; do
case $1 in
--time-stamp | --time* | -t )
- echo "$timestamp" ; exit 0 ;;
+ echo "$timestamp" ; exit ;;
--version | -v )
- echo "$version" ; exit 0 ;;
+ echo "$version" ; exit ;;
--help | --h* | -h )
- echo "$usage"; exit 0 ;;
+ echo "$usage"; exit ;;
-- ) # Stop option processing
shift; break ;;
- ) # Use stdin as input.
@@ -104,8 +108,9 @@ set_cc_for_build='
trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
: ${TMPDIR=/tmp} ;
- { tmp=`(umask 077 && mktemp -d -q "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
{ test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
{ echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
dummy=$tmp/dummy ;
tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
@@ -122,7 +127,7 @@ case $CC_FOR_BUILD,$HOST_CC,$CC in
;;
,,*) CC_FOR_BUILD=$CC ;;
,*,*) CC_FOR_BUILD=$HOST_CC ;;
-esac ;'
+esac ; set_cc_for_build= ;'
# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
# (ghazi@noc.rutgers.edu 1994-08-24)
@@ -157,6 +162,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
arm*) machine=arm-unknown ;;
sh3el) machine=shl-unknown ;;
sh3eb) machine=sh-unknown ;;
+ sh5el) machine=sh5le-unknown ;;
*) machine=${UNAME_MACHINE_ARCH}-unknown ;;
esac
# The Operating System including object format, if it has switched
@@ -165,7 +171,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
arm*|i386|m68k|ns32k|sh3*|sparc|vax)
eval $set_cc_for_build
if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
- | grep __ELF__ >/dev/null
+ | grep -q __ELF__
then
# Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
# Return netbsd for either. FIX?
@@ -175,7 +181,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
fi
;;
*)
- os=netbsd
+ os=netbsd
;;
esac
# The OS release
@@ -195,50 +201,32 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
# contains redundant information, the shorter form:
# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
echo "${machine}-${os}${release}"
- exit 0 ;;
- amiga:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- arc:OpenBSD:*:*)
- echo mipsel-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- hp300:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- mac68k:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- macppc:OpenBSD:*:*)
- echo powerpc-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- mvme68k:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- mvme88k:OpenBSD:*:*)
- echo m88k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- mvmeppc:OpenBSD:*:*)
- echo powerpc-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- pmax:OpenBSD:*:*)
- echo mipsel-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- sgi:OpenBSD:*:*)
- echo mipseb-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- sun3:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- wgrisc:OpenBSD:*:*)
- echo mipsel-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:OpenBSD:*:*)
- echo ${UNAME_MACHINE}-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
+ UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+ echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
+ exit ;;
+ *:ekkoBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
+ exit ;;
+ *:SolidBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
+ exit ;;
+ macppc:MirBSD:*:*)
+ echo powerpc-unknown-mirbsd${UNAME_RELEASE}
+ exit ;;
+ *:MirBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
+ exit ;;
alpha:OSF1:*:*)
- if test $UNAME_RELEASE = "V4.0"; then
+ case $UNAME_RELEASE in
+ *4.0)
UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
- fi
+ ;;
+ *5.*)
+ UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+ ;;
+ esac
# According to Compaq, /usr/sbin/psrinfo has been available on
# OSF/1 and Tru64 systems produced since 1995. I hope that
# covers most systems running today. This code pipes the CPU
@@ -276,39 +264,52 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
"EV7.9 (21364A)")
UNAME_MACHINE="alphaev79" ;;
esac
+ # A Pn.n version is a patched version.
# A Vn.n version is a released version.
# A Tn.n version is a released field test version.
# A Xn.n version is an unreleased experimental baselevel.
# 1.2 uses "1.2" for uname -r.
- echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
- exit 0 ;;
+ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+ # Reset EXIT trap before exiting to avoid spurious non-zero exit code.
+ exitcode=$?
+ trap '' 0
+ exit $exitcode ;;
Alpha\ *:Windows_NT*:*)
# How do we know it's Interix rather than the generic POSIX subsystem?
# Should we change UNAME_MACHINE based on the output of uname instead
# of the specific Alpha model?
echo alpha-pc-interix
- exit 0 ;;
+ exit ;;
21064:Windows_NT:50:3)
echo alpha-dec-winnt3.5
- exit 0 ;;
+ exit ;;
Amiga*:UNIX_System_V:4.0:*)
echo m68k-unknown-sysv4
- exit 0;;
+ exit ;;
*:[Aa]miga[Oo][Ss]:*:*)
echo ${UNAME_MACHINE}-unknown-amigaos
- exit 0 ;;
+ exit ;;
*:[Mm]orph[Oo][Ss]:*:*)
echo ${UNAME_MACHINE}-unknown-morphos
- exit 0 ;;
+ exit ;;
*:OS/390:*:*)
echo i370-ibm-openedition
- exit 0 ;;
+ exit ;;
+ *:z/VM:*:*)
+ echo s390-ibm-zvmoe
+ exit ;;
+ *:OS400:*:*)
+ echo powerpc-ibm-os400
+ exit ;;
arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
echo arm-acorn-riscix${UNAME_RELEASE}
- exit 0;;
+ exit ;;
+ arm:riscos:*:*|arm:RISCOS:*:*)
+ echo arm-unknown-riscos
+ exit ;;
SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
echo hppa1.1-hitachi-hiuxmpp
- exit 0;;
+ exit ;;
Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
if test "`(/bin/universe) 2>/dev/null`" = att ; then
@@ -316,29 +317,51 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
else
echo pyramid-pyramid-bsd
fi
- exit 0 ;;
+ exit ;;
NILE*:*:*:dcosx)
echo pyramid-pyramid-svr4
- exit 0 ;;
- DRS?6000:UNIX_SV:4.2*:7*)
+ exit ;;
+ DRS?6000:unix:4.0:6*)
+ echo sparc-icl-nx6
+ exit ;;
+ DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
case `/usr/bin/uname -p` in
- sparc) echo sparc-icl-nx7 && exit 0 ;;
+ sparc) echo sparc-icl-nx7; exit ;;
esac ;;
+ s390x:SunOS:*:*)
+ echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit ;;
sun4H:SunOS:5.*:*)
echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
+ exit ;;
sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
- i86pc:SunOS:5.*:*)
- echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
+ exit ;;
+ i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
+ echo i386-pc-auroraux${UNAME_RELEASE}
+ exit ;;
+ i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+ eval $set_cc_for_build
+ SUN_ARCH="i386"
+ # If there is a compiler, see if it is configured for 64-bit objects.
+ # Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
+ # This test works for both compilers.
+ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+ if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
+ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+ grep IS_64BIT_ARCH >/dev/null
+ then
+ SUN_ARCH="x86_64"
+ fi
+ fi
+ echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit ;;
sun4*:SunOS:6*:*)
# According to config.sub, this is the proper way to canonicalize
# SunOS6. Hard to guess exactly what SunOS6 will be like, but
# it's likely to be more like Solaris than SunOS4.
echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
+ exit ;;
sun4*:SunOS:*:*)
case "`/usr/bin/arch -k`" in
Series*|S4*)
@@ -347,10 +370,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
esac
# Japanese Language versions have a version number like `4.1.3-JL'.
echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
- exit 0 ;;
+ exit ;;
sun3*:SunOS:*:*)
echo m68k-sun-sunos${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
sun*:*:4.2BSD:*)
UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
@@ -362,10 +385,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
echo sparc-sun-sunos${UNAME_RELEASE}
;;
esac
- exit 0 ;;
+ exit ;;
aushp:SunOS:*:*)
echo sparc-auspex-sunos${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
# The situation for MiNT is a little confusing. The machine name
# can be virtually everything (everything which is not
# "atarist" or "atariste" at least should have a processor
@@ -375,38 +398,41 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
# MiNT. But MiNT is downward compatible to TOS, so this should
# be no problem.
atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
- echo m68k-atari-mint${UNAME_RELEASE}
- exit 0 ;;
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit ;;
atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
echo m68k-atari-mint${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
- echo m68k-atari-mint${UNAME_RELEASE}
- exit 0 ;;
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit ;;
milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
- echo m68k-milan-mint${UNAME_RELEASE}
- exit 0 ;;
+ echo m68k-milan-mint${UNAME_RELEASE}
+ exit ;;
hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
- echo m68k-hades-mint${UNAME_RELEASE}
- exit 0 ;;
+ echo m68k-hades-mint${UNAME_RELEASE}
+ exit ;;
*:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
- echo m68k-unknown-mint${UNAME_RELEASE}
- exit 0 ;;
+ echo m68k-unknown-mint${UNAME_RELEASE}
+ exit ;;
+ m68k:machten:*:*)
+ echo m68k-apple-machten${UNAME_RELEASE}
+ exit ;;
powerpc:machten:*:*)
echo powerpc-apple-machten${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
RISC*:Mach:*:*)
echo mips-dec-mach_bsd4.3
- exit 0 ;;
+ exit ;;
RISC*:ULTRIX:*:*)
echo mips-dec-ultrix${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
VAX*:ULTRIX*:*:*)
echo vax-dec-ultrix${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
2020:CLIX:*:* | 2430:CLIX:*:*)
echo clipper-intergraph-clix${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
mips:*:*:UMIPS | mips:*:*:RISCos)
eval $set_cc_for_build
sed 's/^ //' << EOF >$dummy.c
@@ -430,35 +456,36 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exit (-1);
}
EOF
- $CC_FOR_BUILD -o $dummy $dummy.c \
- && $dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \
- && exit 0
+ $CC_FOR_BUILD -o $dummy $dummy.c &&
+ dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+ SYSTEM_NAME=`$dummy $dummyarg` &&
+ { echo "$SYSTEM_NAME"; exit; }
echo mips-mips-riscos${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
Motorola:PowerMAX_OS:*:*)
echo powerpc-motorola-powermax
- exit 0 ;;
+ exit ;;
Motorola:*:4.3:PL8-*)
echo powerpc-harris-powermax
- exit 0 ;;
+ exit ;;
Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
echo powerpc-harris-powermax
- exit 0 ;;
+ exit ;;
Night_Hawk:Power_UNIX:*:*)
echo powerpc-harris-powerunix
- exit 0 ;;
+ exit ;;
m88k:CX/UX:7*:*)
echo m88k-harris-cxux7
- exit 0 ;;
+ exit ;;
m88k:*:4*:R4*)
echo m88k-motorola-sysv4
- exit 0 ;;
+ exit ;;
m88k:*:3*:R3*)
echo m88k-motorola-sysv3
- exit 0 ;;
+ exit ;;
AViiON:dgux:*:*)
- # DG/UX returns AViiON for all architectures
- UNAME_PROCESSOR=`/usr/bin/uname -p`
+ # DG/UX returns AViiON for all architectures
+ UNAME_PROCESSOR=`/usr/bin/uname -p`
if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
then
if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
@@ -471,29 +498,29 @@ EOF
else
echo i586-dg-dgux${UNAME_RELEASE}
fi
- exit 0 ;;
+ exit ;;
M88*:DolphinOS:*:*) # DolphinOS (SVR3)
echo m88k-dolphin-sysv3
- exit 0 ;;
+ exit ;;
M88*:*:R3*:*)
# Delta 88k system running SVR3
echo m88k-motorola-sysv3
- exit 0 ;;
+ exit ;;
XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
echo m88k-tektronix-sysv3
- exit 0 ;;
+ exit ;;
Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
echo m68k-tektronix-bsd
- exit 0 ;;
+ exit ;;
*:IRIX*:*:*)
echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
- exit 0 ;;
+ exit ;;
????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
- echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id
- exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX '
+ echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id
+ exit ;; # Note that: echo "'`uname -s`'" gives 'AIX '
i*86:AIX:*:*)
echo i386-ibm-aix
- exit 0 ;;
+ exit ;;
ia64:AIX:*:*)
if [ -x /usr/bin/oslevel ] ; then
IBM_REV=`/usr/bin/oslevel`
@@ -501,7 +528,7 @@ EOF
IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
fi
echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
- exit 0 ;;
+ exit ;;
*:AIX:2:3)
if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
eval $set_cc_for_build
@@ -516,15 +543,19 @@ EOF
exit(0);
}
EOF
- $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0
- echo rs6000-ibm-aix3.2.5
+ if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+ then
+ echo "$SYSTEM_NAME"
+ else
+ echo rs6000-ibm-aix3.2.5
+ fi
elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
echo rs6000-ibm-aix3.2.4
else
echo rs6000-ibm-aix3.2
fi
- exit 0 ;;
- *:AIX:*:[45])
+ exit ;;
+ *:AIX:*:[4567])
IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
IBM_ARCH=rs6000
@@ -537,28 +568,28 @@ EOF
IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
fi
echo ${IBM_ARCH}-ibm-aix${IBM_REV}
- exit 0 ;;
+ exit ;;
*:AIX:*:*)
echo rs6000-ibm-aix
- exit 0 ;;
+ exit ;;
ibmrt:4.4BSD:*|romp-ibm:BSD:*)
echo romp-ibm-bsd4.4
- exit 0 ;;
+ exit ;;
ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and
echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to
- exit 0 ;; # report: romp-ibm BSD 4.3
+ exit ;; # report: romp-ibm BSD 4.3
*:BOSX:*:*)
echo rs6000-bull-bosx
- exit 0 ;;
+ exit ;;
DPX/2?00:B.O.S.:*:*)
echo m68k-bull-sysv3
- exit 0 ;;
+ exit ;;
9000/[34]??:4.3bsd:1.*:*)
echo m68k-hp-bsd
- exit 0 ;;
+ exit ;;
hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
echo m68k-hp-bsd4.4
- exit 0 ;;
+ exit ;;
9000/[34678]??:HP-UX:*:*)
HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
case "${UNAME_MACHINE}" in
@@ -567,52 +598,52 @@ EOF
9000/[678][0-9][0-9])
if [ -x /usr/bin/getconf ]; then
sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
- sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
- case "${sc_cpu_version}" in
- 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
- 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
- 532) # CPU_PA_RISC2_0
- case "${sc_kernel_bits}" in
- 32) HP_ARCH="hppa2.0n" ;;
- 64) HP_ARCH="hppa2.0w" ;;
+ sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+ case "${sc_cpu_version}" in
+ 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+ 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+ 532) # CPU_PA_RISC2_0
+ case "${sc_kernel_bits}" in
+ 32) HP_ARCH="hppa2.0n" ;;
+ 64) HP_ARCH="hppa2.0w" ;;
'') HP_ARCH="hppa2.0" ;; # HP-UX 10.20
- esac ;;
- esac
+ esac ;;
+ esac
fi
if [ "${HP_ARCH}" = "" ]; then
eval $set_cc_for_build
- sed 's/^ //' << EOF >$dummy.c
+ sed 's/^ //' << EOF >$dummy.c
- #define _HPUX_SOURCE
- #include <stdlib.h>
- #include <unistd.h>
+ #define _HPUX_SOURCE
+ #include <stdlib.h>
+ #include <unistd.h>
- int main ()
- {
- #if defined(_SC_KERNEL_BITS)
- long bits = sysconf(_SC_KERNEL_BITS);
- #endif
- long cpu = sysconf (_SC_CPU_VERSION);
+ int main ()
+ {
+ #if defined(_SC_KERNEL_BITS)
+ long bits = sysconf(_SC_KERNEL_BITS);
+ #endif
+ long cpu = sysconf (_SC_CPU_VERSION);
- switch (cpu)
- {
- case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
- case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
- case CPU_PA_RISC2_0:
- #if defined(_SC_KERNEL_BITS)
- switch (bits)
- {
- case 64: puts ("hppa2.0w"); break;
- case 32: puts ("hppa2.0n"); break;
- default: puts ("hppa2.0"); break;
- } break;
- #else /* !defined(_SC_KERNEL_BITS) */
- puts ("hppa2.0"); break;
- #endif
- default: puts ("hppa1.0"); break;
- }
- exit (0);
- }
+ switch (cpu)
+ {
+ case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+ case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+ case CPU_PA_RISC2_0:
+ #if defined(_SC_KERNEL_BITS)
+ switch (bits)
+ {
+ case 64: puts ("hppa2.0w"); break;
+ case 32: puts ("hppa2.0n"); break;
+ default: puts ("hppa2.0"); break;
+ } break;
+ #else /* !defined(_SC_KERNEL_BITS) */
+ puts ("hppa2.0"); break;
+ #endif
+ default: puts ("hppa1.0"); break;
+ }
+ exit (0);
+ }
EOF
(CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
test -z "$HP_ARCH" && HP_ARCH=hppa
@@ -620,9 +651,19 @@ EOF
esac
if [ ${HP_ARCH} = "hppa2.0w" ]
then
- # avoid double evaluation of $set_cc_for_build
- test -n "$CC_FOR_BUILD" || eval $set_cc_for_build
- if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E -) | grep __LP64__ >/dev/null
+ eval $set_cc_for_build
+
+ # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+ # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler
+ # generating 64-bit code. GNU and HP use different nomenclature:
+ #
+ # $ CC_FOR_BUILD=cc ./config.guess
+ # => hppa2.0w-hp-hpux11.23
+ # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+ # => hppa64-hp-hpux11.23
+
+ if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+ grep -q __LP64__
then
HP_ARCH="hppa2.0w"
else
@@ -630,11 +671,11 @@ EOF
fi
fi
echo ${HP_ARCH}-hp-hpux${HPUX_REV}
- exit 0 ;;
+ exit ;;
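The `__LP64__` probe above relies on the preprocessor expanding any macro it predefines: a 64-bit-targeting compiler rewrites the token to its value, so the `grep` comes up empty. The same test in isolation, with a hypothetical `$CC` standing in for `$CC_FOR_BUILD`:

    if echo __LP64__ | ${CC:-cc} -E - 2>/dev/null | grep -q __LP64__
    then echo "32-bit code: token survived, __LP64__ undefined"   # -> hppa2.0w
    else echo "64-bit code: token was expanded away"              # -> hppa64
    fi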
ia64:HP-UX:*:*)
HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
echo ia64-hp-hpux${HPUX_REV}
- exit 0 ;;
+ exit ;;
3050*:HI-UX:*:*)
eval $set_cc_for_build
sed 's/^ //' << EOF >$dummy.c
@@ -662,205 +703,253 @@ EOF
exit (0);
}
EOF
- $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0
+ $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+ { echo "$SYSTEM_NAME"; exit; }
echo unknown-hitachi-hiuxwe2
- exit 0 ;;
+ exit ;;
9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
echo hppa1.1-hp-bsd
- exit 0 ;;
+ exit ;;
9000/8??:4.3bsd:*:*)
echo hppa1.0-hp-bsd
- exit 0 ;;
+ exit ;;
*9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
echo hppa1.0-hp-mpeix
- exit 0 ;;
+ exit ;;
hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
echo hppa1.1-hp-osf
- exit 0 ;;
+ exit ;;
hp8??:OSF1:*:*)
echo hppa1.0-hp-osf
- exit 0 ;;
+ exit ;;
i*86:OSF1:*:*)
if [ -x /usr/sbin/sysversion ] ; then
echo ${UNAME_MACHINE}-unknown-osf1mk
else
echo ${UNAME_MACHINE}-unknown-osf1
fi
- exit 0 ;;
+ exit ;;
parisc*:Lites*:*:*)
echo hppa1.1-hp-lites
- exit 0 ;;
+ exit ;;
C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
echo c1-convex-bsd
- exit 0 ;;
+ exit ;;
C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
if getsysinfo -f scalar_acc
then echo c32-convex-bsd
else echo c2-convex-bsd
fi
- exit 0 ;;
+ exit ;;
C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
echo c34-convex-bsd
- exit 0 ;;
+ exit ;;
C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
echo c38-convex-bsd
- exit 0 ;;
+ exit ;;
C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
echo c4-convex-bsd
- exit 0 ;;
+ exit ;;
CRAY*Y-MP:*:*:*)
echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
- exit 0 ;;
+ exit ;;
CRAY*[A-Z]90:*:*:*)
echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
-e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
-e 's/\.[^.]*$/.X/'
- exit 0 ;;
+ exit ;;
CRAY*TS:*:*:*)
echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
- exit 0 ;;
+ exit ;;
CRAY*T3E:*:*:*)
echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
- exit 0 ;;
+ exit ;;
CRAY*SV1:*:*:*)
echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
- exit 0 ;;
+ exit ;;
*:UNICOS/mp:*:*)
- echo nv1-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
- exit 0 ;;
+ echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+ exit ;;
F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
- FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
- FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
- echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
- exit 0 ;;
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+ echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+ exit ;;
+ 5000:UNIX_System_V:4.*:*)
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+ echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+ exit ;;
i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
sparc*:BSD/OS:*:*)
echo sparc-unknown-bsdi${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:BSD/OS:*:*)
echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:FreeBSD:*:*)
- # Determine whether the default compiler uses glibc.
- eval $set_cc_for_build
- sed 's/^ //' << EOF >$dummy.c
- #include <features.h>
- #if __GLIBC__ >= 2
- LIBC=gnu
- #else
- LIBC=
- #endif
-EOF
- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=`
- echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`${LIBC:+-$LIBC}
- exit 0 ;;
+ UNAME_PROCESSOR=`/usr/bin/uname -p`
+ case ${UNAME_PROCESSOR} in
+ amd64)
+ echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+ *)
+ echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+ esac
+ exit ;;
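The rewritten FreeBSD arm drops the old glibc probe entirely: it trusts `uname -p` and only normalizes the processor name, since GNU tooling spells FreeBSD/amd64 as `x86_64`. On a hypothetical 9.0-RELEASE/amd64 machine the pieces combine like this:

    uname -p                                 # -> amd64, mapped to x86_64
    echo 9.0-RELEASE | sed -e 's/[-(].*//'   # -> 9.0
    # result: x86_64-unknown-freebsd9.0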
i*:CYGWIN*:*)
echo ${UNAME_MACHINE}-pc-cygwin
- exit 0 ;;
- i*:MINGW*:*)
+ exit ;;
+ *:MINGW*:*)
echo ${UNAME_MACHINE}-pc-mingw32
- exit 0 ;;
+ exit ;;
+ i*:windows32*:*)
+ # uname -m includes "-pc" on this system.
+ echo ${UNAME_MACHINE}-mingw32
+ exit ;;
i*:PW*:*)
echo ${UNAME_MACHINE}-pc-pw32
- exit 0 ;;
- x86:Interix*:3*)
- echo i586-pc-interix3
- exit 0 ;;
+ exit ;;
+ *:Interix*:*)
+ case ${UNAME_MACHINE} in
+ x86)
+ echo i586-pc-interix${UNAME_RELEASE}
+ exit ;;
+ authenticamd | genuineintel | EM64T)
+ echo x86_64-unknown-interix${UNAME_RELEASE}
+ exit ;;
+ IA64)
+ echo ia64-unknown-interix${UNAME_RELEASE}
+ exit ;;
+ esac ;;
[345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
echo i${UNAME_MACHINE}-pc-mks
- exit 0 ;;
+ exit ;;
+ 8664:Windows_NT:*)
+ echo x86_64-pc-mks
+ exit ;;
i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
# How do we know it's Interix rather than the generic POSIX subsystem?
# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
# UNAME_MACHINE based on the output of uname instead of i386?
echo i586-pc-interix
- exit 0 ;;
+ exit ;;
i*:UWIN*:*)
echo ${UNAME_MACHINE}-pc-uwin
- exit 0 ;;
+ exit ;;
+ amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+ echo x86_64-unknown-cygwin
+ exit ;;
p*:CYGWIN*:*)
echo powerpcle-unknown-cygwin
- exit 0 ;;
+ exit ;;
prep*:SunOS:5.*:*)
echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
+ exit ;;
*:GNU:*:*)
+ # the GNU system
echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
- exit 0 ;;
+ exit ;;
+ *:GNU/*:*:*)
+ # other systems with GNU libc and userland
+ echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+ exit ;;
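For the new `GNU/*` arm, the `sed`/`tr` pipeline strips the `GNU/` prefix from `uname -s` and lowercases the remainder. For instance, on a made-up GNU/kFreeBSD box:

    echo GNU/kFreeBSD | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'   # -> kfreebsd
    # combined with machine and release: x86_64-unknown-kfreebsd8.1-gnu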
i*86:Minix:*:*)
echo ${UNAME_MACHINE}-pc-minix
- exit 0 ;;
+ exit ;;
+ alpha:Linux:*:*)
+ case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+ EV5) UNAME_MACHINE=alphaev5 ;;
+ EV56) UNAME_MACHINE=alphaev56 ;;
+ PCA56) UNAME_MACHINE=alphapca56 ;;
+ PCA57) UNAME_MACHINE=alphapca56 ;;
+ EV6) UNAME_MACHINE=alphaev6 ;;
+ EV67) UNAME_MACHINE=alphaev67 ;;
+ EV68*) UNAME_MACHINE=alphaev68 ;;
+ esac
+ objdump --private-headers /bin/sh | grep -q ld.so.1
+ if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+ echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+ exit ;;
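The relocated alpha arm reads the CPU subtype out of `/proc/cpuinfo`, then uses `objdump` to check whether `/bin/sh` still links against the old `ld.so.1` (the `libc1` suffix). The `sed` one-liner prints only the value after the colon on the `cpu model` line; on sample input:

    printf 'cpu model\t: EV56\n' | sed -n '/^cpu model/s/^.*: \(.*\)/\1/p'   # -> EV56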
arm*:Linux:*:*)
+ eval $set_cc_for_build
+ if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+ | grep -q __ARM_EABI__
+ then
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ else
+ if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+ | grep -q __ARM_PCS_VFP
+ then
+ echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+ else
+ echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
+ fi
+ fi
+ exit ;;
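The new ARM arm applies the same token-survival trick twice: if `__ARM_EABI__` is not predefined, the token passes through the preprocessor untouched and the old-ABI `linux-gnu` triplet wins; otherwise `__ARM_PCS_VFP` separates soft-float `gnueabi` from hard-float `gnueabihf`. Condensed, with a hypothetical `$CC`:

    undef() { echo "$1" | ${CC:-cc} -E - 2>/dev/null | grep -q "$1"; }   # true if $1 is NOT predefined
    if   undef __ARM_EABI__;  then echo linux-gnu
    elif undef __ARM_PCS_VFP; then echo linux-gnueabi
    else                           echo linux-gnueabihf
    fi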
+ avr32*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
- exit 0 ;;
- ia64:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-gnu
- exit 0 ;;
- m68*:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-gnu
- exit 0 ;;
- mips:Linux:*:*)
+ exit ;;
+ cris:Linux:*:*)
+ echo cris-axis-linux-gnu
+ exit ;;
+ crisv32:Linux:*:*)
+ echo crisv32-axis-linux-gnu
+ exit ;;
+ frv:Linux:*:*)
+ echo frv-unknown-linux-gnu
+ exit ;;
+ i*86:Linux:*:*)
+ LIBC=gnu
eval $set_cc_for_build
sed 's/^ //' << EOF >$dummy.c
- #undef CPU
- #undef mips
- #undef mipsel
- #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
- CPU=mipsel
- #else
- #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
- CPU=mips
- #else
- CPU=
- #endif
+ #ifdef __dietlibc__
+ LIBC=dietlibc
#endif
EOF
- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=`
- test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0
- ;;
- mips64:Linux:*:*)
+ eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+ echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+ exit ;;
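The dietlibc check here (and the merged mips/mips64 endianness check just below) uses a common config.guess idiom: embed shell assignments in a C file under `#ifdef` guards, run only the preprocessor, and `eval` whatever assignments survive. A self-contained sketch, assuming a working `cc`:

    LIBC=gnu
    cat > conftest.c <<'EOF'
    #ifdef __dietlibc__
    LIBC=dietlibc
    #endif
    EOF
    eval `cc -E conftest.c 2>/dev/null | grep '^LIBC'`   # no-op unless the guard held
    echo "libc flavor: $LIBC"
    rm -f conftest.c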
+ ia64:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ exit ;;
+ m32r*:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ exit ;;
+ m68*:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ exit ;;
+ mips:Linux:*:* | mips64:Linux:*:*)
eval $set_cc_for_build
sed 's/^ //' << EOF >$dummy.c
#undef CPU
- #undef mips64
- #undef mips64el
+ #undef ${UNAME_MACHINE}
+ #undef ${UNAME_MACHINE}el
#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
- CPU=mips64el
+ CPU=${UNAME_MACHINE}el
#else
#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
- CPU=mips64
+ CPU=${UNAME_MACHINE}
#else
CPU=
#endif
#endif
EOF
- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=`
- test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0
+ eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
+ test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
;;
- ppc:Linux:*:*)
- echo powerpc-unknown-linux-gnu
- exit 0 ;;
- ppc64:Linux:*:*)
- echo powerpc64-unknown-linux-gnu
- exit 0 ;;
- alpha:Linux:*:*)
- case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
- EV5) UNAME_MACHINE=alphaev5 ;;
- EV56) UNAME_MACHINE=alphaev56 ;;
- PCA56) UNAME_MACHINE=alphapca56 ;;
- PCA57) UNAME_MACHINE=alphapca56 ;;
- EV6) UNAME_MACHINE=alphaev6 ;;
- EV67) UNAME_MACHINE=alphaev67 ;;
- EV68*) UNAME_MACHINE=alphaev68 ;;
- esac
- objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
- if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
- echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
- exit 0 ;;
+ or32:Linux:*:*)
+ echo or32-unknown-linux-gnu
+ exit ;;
+ padre:Linux:*:*)
+ echo sparc-unknown-linux-gnu
+ exit ;;
+ parisc64:Linux:*:* | hppa64:Linux:*:*)
+ echo hppa64-unknown-linux-gnu
+ exit ;;
parisc:Linux:*:* | hppa:Linux:*:*)
# Look for CPU level
case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
@@ -868,106 +957,71 @@ EOF
PA8*) echo hppa2.0-unknown-linux-gnu ;;
*) echo hppa-unknown-linux-gnu ;;
esac
- exit 0 ;;
- parisc64:Linux:*:* | hppa64:Linux:*:*)
- echo hppa64-unknown-linux-gnu
- exit 0 ;;
+ exit ;;
+ ppc64:Linux:*:*)
+ echo powerpc64-unknown-linux-gnu
+ exit ;;
+ ppc:Linux:*:*)
+ echo powerpc-unknown-linux-gnu
+ exit ;;
s390:Linux:*:* | s390x:Linux:*:*)
echo ${UNAME_MACHINE}-ibm-linux
- exit 0 ;;
+ exit ;;
+ sh64*:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ exit ;;
sh*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
- exit 0 ;;
+ exit ;;
sparc:Linux:*:* | sparc64:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
- exit 0 ;;
+ exit ;;
+ tile*:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ exit ;;
+ vax:Linux:*:*)
+ echo ${UNAME_MACHINE}-dec-linux-gnu
+ exit ;;
x86_64:Linux:*:*)
echo x86_64-unknown-linux-gnu
- exit 0 ;;
- i*86:Linux:*:*)
- # The BFD linker knows what the default object file format is, so
- # first see if it will tell us. cd to the root directory to prevent
- # problems with other programs or directories called `ld' in the path.
- # Set LC_ALL=C to ensure ld outputs messages in English.
- ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
- | sed -ne '/supported targets:/!d
- s/[ ][ ]*/ /g
- s/.*supported targets: *//
- s/ .*//
- p'`
- case "$ld_supported_targets" in
- elf32-i386)
- TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
- ;;
- a.out-i386-linux)
- echo "${UNAME_MACHINE}-pc-linux-gnuaout"
- exit 0 ;;
- coff-i386)
- echo "${UNAME_MACHINE}-pc-linux-gnucoff"
- exit 0 ;;
- "")
- # Either a pre-BFD a.out linker (linux-gnuoldld) or
- # one that does not give us useful --help.
- echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
- exit 0 ;;
- esac
- # Determine whether the default compiler is a.out or elf
- eval $set_cc_for_build
- sed 's/^ //' << EOF >$dummy.c
- #include <features.h>
- #ifdef __ELF__
- # ifdef __GLIBC__
- # if __GLIBC__ >= 2
- LIBC=gnu
- # else
- LIBC=gnulibc1
- # endif
- # else
- LIBC=gnulibc1
- # endif
- #else
- #ifdef __INTEL_COMPILER
- LIBC=gnu
- #else
- LIBC=gnuaout
- #endif
- #endif
-EOF
- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=`
- test x"${LIBC}" != x && echo "${UNAME_MACHINE}-pc-linux-${LIBC}" && exit 0
- test x"${TENTATIVE}" != x && echo "${TENTATIVE}" && exit 0
- ;;
+ exit ;;
+ xtensa*:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
+ exit ;;
i*86:DYNIX/ptx:4*:*)
# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
# earlier versions are messed up and put the nodename in both
# sysname and nodename.
echo i386-sequent-sysv4
- exit 0 ;;
+ exit ;;
i*86:UNIX_SV:4.2MP:2.*)
- # Unixware is an offshoot of SVR4, but it has its own version
- # number series starting with 2...
- # I am not positive that other SVR4 systems won't match this,
+ # Unixware is an offshoot of SVR4, but it has its own version
+ # number series starting with 2...
+ # I am not positive that other SVR4 systems won't match this,
# I just have to hope. -- rms.
- # Use sysv4.2uw... so that sysv4* matches it.
+ # Use sysv4.2uw... so that sysv4* matches it.
echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
- exit 0 ;;
+ exit ;;
i*86:OS/2:*:*)
# If we were able to find `uname', then EMX Unix compatibility
# is probably installed.
echo ${UNAME_MACHINE}-pc-os2-emx
- exit 0 ;;
+ exit ;;
i*86:XTS-300:*:STOP)
echo ${UNAME_MACHINE}-unknown-stop
- exit 0 ;;
+ exit ;;
i*86:atheos:*:*)
echo ${UNAME_MACHINE}-unknown-atheos
- exit 0 ;;
- i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+ exit ;;
+ i*86:syllable:*:*)
+ echo ${UNAME_MACHINE}-pc-syllable
+ exit ;;
+ i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
echo i386-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
i*86:*DOS:*:*)
echo ${UNAME_MACHINE}-pc-msdosdjgpp
- exit 0 ;;
+ exit ;;
i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
@@ -975,15 +1029,16 @@ EOF
else
echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
fi
- exit 0 ;;
- i*86:*:5:[78]*)
+ exit ;;
+ i*86:*:5:[678]*)
+ # UnixWare 7.x, OpenUNIX and OpenServer 6.
case `/bin/uname -X | grep "^Machine"` in
*486*) UNAME_MACHINE=i486 ;;
*Pentium) UNAME_MACHINE=i586 ;;
*Pent*|*Celeron) UNAME_MACHINE=i686 ;;
esac
echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
- exit 0 ;;
+ exit ;;
i*86:*:3.2:*)
if test -f /usr/options/cb.name; then
UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
@@ -1001,73 +1056,86 @@ EOF
else
echo ${UNAME_MACHINE}-pc-sysv32
fi
- exit 0 ;;
+ exit ;;
pc:*:*:*)
# Left here for compatibility:
- # uname -m prints for DJGPP always 'pc', but it prints nothing about
- # the processor, so we play safe by assuming i386.
- echo i386-pc-msdosdjgpp
- exit 0 ;;
+ # uname -m prints for DJGPP always 'pc', but it prints nothing about
+ # the processor, so we play safe by assuming i586.
+ # Note: whatever this is, it MUST be the same as what config.sub
+ # prints for the "djgpp" host, or else GDB configury will decide that
+ # this is a cross-build.
+ echo i586-pc-msdosdjgpp
+ exit ;;
Intel:Mach:3*:*)
echo i386-pc-mach3
- exit 0 ;;
+ exit ;;
paragon:*:*:*)
echo i860-intel-osf1
- exit 0 ;;
+ exit ;;
i860:*:4.*:*) # i860-SVR4
if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
else # Add other i860-SVR4 vendors below as they are discovered.
echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4
fi
- exit 0 ;;
+ exit ;;
mini*:CTIX:SYS*5:*)
# "miniframe"
echo m68010-convergent-sysv
- exit 0 ;;
+ exit ;;
mc68k:UNIX:SYSTEM5:3.51m)
echo m68k-convergent-sysv
- exit 0 ;;
+ exit ;;
M680?0:D-NIX:5.3:*)
echo m68k-diab-dnix
- exit 0 ;;
- M68*:*:R3V[567]*:*)
- test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;;
- 3[34]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0)
+ exit ;;
+ M68*:*:R3V[5678]*:*)
+ test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+ 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
OS_REL=''
test -r /etc/.relid \
&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
- && echo i486-ncr-sysv4.3${OS_REL} && exit 0
+ && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
- && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;;
+ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
- /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
- && echo i486-ncr-sysv4 && exit 0 ;;
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+ && { echo i486-ncr-sysv4; exit; } ;;
+ NCR*:*:4.2:* | MPRAS*:*:4.2:*)
+ OS_REL='.3'
+ test -r /etc/.relid \
+ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+ && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+ /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+ && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
+ /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
+ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
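The new NCR/MPRAS arm defaults `OS_REL` to `.3` and, when `/etc/.relid` is readable, overrides it with the two-digit release found in the third whitespace-separated field of that file (prefixed with a dot). The `sed` capture, on a made-up `.relid` line:

    echo 'id 4.0 30 extra' | sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p'   # -> 30, giving OS_REL=.30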
m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
echo m68k-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
mc68030:UNIX_System_V:4.*:*)
echo m68k-atari-sysv4
- exit 0 ;;
+ exit ;;
TSUNAMI:LynxOS:2.*:*)
echo sparc-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
rs6000:LynxOS:2.*:*)
echo rs6000-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
- PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+ exit ;;
+ PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
echo powerpc-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
SM[BE]S:UNIX_SV:*:*)
echo mips-dde-sysv${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
RM*:ReliantUNIX-*:*:*)
echo mips-sni-sysv4
- exit 0 ;;
+ exit ;;
RM*:SINIX-*:*:*)
echo mips-sni-sysv4
- exit 0 ;;
+ exit ;;
*:SINIX-*:*:*)
if uname -p 2>/dev/null >/dev/null ; then
UNAME_MACHINE=`(uname -p) 2>/dev/null`
@@ -1075,68 +1143,94 @@ EOF
else
echo ns32k-sni-sysv
fi
- exit 0 ;;
- PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
- # says <Richard.M.Bartel@ccMail.Census.GOV>
- echo i586-unisys-sysv4
- exit 0 ;;
+ exit ;;
+ PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+ # says <Richard.M.Bartel@ccMail.Census.GOV>
+ echo i586-unisys-sysv4
+ exit ;;
*:UNIX_System_V:4*:FTX*)
# From Gerald Hewes <hewes@openmarket.com>.
# How about differentiating between stratus architectures? -djm
echo hppa1.1-stratus-sysv4
- exit 0 ;;
+ exit ;;
*:*:*:FTX*)
# From seanf@swdc.stratus.com.
echo i860-stratus-sysv4
- exit 0 ;;
+ exit ;;
+ i*86:VOS:*:*)
+ # From Paul.Green@stratus.com.
+ echo ${UNAME_MACHINE}-stratus-vos
+ exit ;;
*:VOS:*:*)
# From Paul.Green@stratus.com.
echo hppa1.1-stratus-vos
- exit 0 ;;
+ exit ;;
mc68*:A/UX:*:*)
echo m68k-apple-aux${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
news*:NEWS-OS:6*:*)
echo mips-sony-newsos6
- exit 0 ;;
+ exit ;;
R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
if [ -d /usr/nec ]; then
- echo mips-nec-sysv${UNAME_RELEASE}
+ echo mips-nec-sysv${UNAME_RELEASE}
else
- echo mips-unknown-sysv${UNAME_RELEASE}
+ echo mips-unknown-sysv${UNAME_RELEASE}
fi
- exit 0 ;;
+ exit ;;
BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only.
echo powerpc-be-beos
- exit 0 ;;
+ exit ;;
BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only.
echo powerpc-apple-beos
- exit 0 ;;
+ exit ;;
BePC:BeOS:*:*) # BeOS running on Intel PC compatible.
echo i586-pc-beos
- exit 0 ;;
+ exit ;;
+ BePC:Haiku:*:*) # Haiku running on Intel PC compatible.
+ echo i586-pc-haiku
+ exit ;;
SX-4:SUPER-UX:*:*)
echo sx4-nec-superux${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
SX-5:SUPER-UX:*:*)
echo sx5-nec-superux${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
SX-6:SUPER-UX:*:*)
echo sx6-nec-superux${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
+ SX-7:SUPER-UX:*:*)
+ echo sx7-nec-superux${UNAME_RELEASE}
+ exit ;;
+ SX-8:SUPER-UX:*:*)
+ echo sx8-nec-superux${UNAME_RELEASE}
+ exit ;;
+ SX-8R:SUPER-UX:*:*)
+ echo sx8r-nec-superux${UNAME_RELEASE}
+ exit ;;
Power*:Rhapsody:*:*)
echo powerpc-apple-rhapsody${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:Rhapsody:*:*)
echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:Darwin:*:*)
- case `uname -p` in
- *86) UNAME_PROCESSOR=i686 ;;
- powerpc) UNAME_PROCESSOR=powerpc ;;
+ UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
+ case $UNAME_PROCESSOR in
+ i386)
+ eval $set_cc_for_build
+ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+ if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+ grep IS_64BIT_ARCH >/dev/null
+ then
+ UNAME_PROCESSOR="x86_64"
+ fi
+ fi ;;
+ unknown) UNAME_PROCESSOR=powerpc ;;
esac
echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
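The Darwin arm now takes `uname -p` at face value and, for `i386`, asks the build compiler whether it defaults to 64-bit: the three `echo`s synthesize a tiny translation unit in which `IS_64BIT_ARCH` survives preprocessing only under `__LP64__`. Stand-alone, with a hypothetical `$CC`:

    if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
       ${CC:-cc} -E - 2>/dev/null | grep -q IS_64BIT_ARCH
    then echo x86_64-apple-darwin   # compiler defaults to 64-bit
    else echo i386-apple-darwin     # compiler defaults to 32-bit
    fi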
*:procnto*:*:* | *:QNX:[0123456789]*:*)
UNAME_PROCESSOR=`uname -p`
if test "$UNAME_PROCESSOR" = "x86"; then
@@ -1144,22 +1238,28 @@ EOF
UNAME_MACHINE=pc
fi
echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:QNX:*:4*)
echo i386-pc-qnx
- exit 0 ;;
- NSR-[DGKLNPTVW]:NONSTOP_KERNEL:*:*)
+ exit ;;
+ NEO-?:NONSTOP_KERNEL:*:*)
+ echo neo-tandem-nsk${UNAME_RELEASE}
+ exit ;;
+ NSE-?:NONSTOP_KERNEL:*:*)
+ echo nse-tandem-nsk${UNAME_RELEASE}
+ exit ;;
+ NSR-?:NONSTOP_KERNEL:*:*)
echo nsr-tandem-nsk${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:NonStop-UX:*:*)
echo mips-compaq-nonstopux
- exit 0 ;;
+ exit ;;
BS2000:POSIX*:*:*)
echo bs2000-siemens-sysv
- exit 0 ;;
+ exit ;;
DS/*:UNIX_System_V:*:*)
echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
- exit 0 ;;
+ exit ;;
*:Plan9:*:*)
# "uname -m" is not consistent, so use $cputype instead. 386
# is converted to i386 for consistency with other x86
@@ -1170,25 +1270,50 @@ EOF
UNAME_MACHINE="$cputype"
fi
echo ${UNAME_MACHINE}-unknown-plan9
- exit 0 ;;
+ exit ;;
*:TOPS-10:*:*)
echo pdp10-unknown-tops10
- exit 0 ;;
+ exit ;;
*:TENEX:*:*)
echo pdp10-unknown-tenex
- exit 0 ;;
+ exit ;;
KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
echo pdp10-dec-tops20
- exit 0 ;;
+ exit ;;
XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
echo pdp10-xkl-tops20
- exit 0 ;;
+ exit ;;
*:TOPS-20:*:*)
echo pdp10-unknown-tops20
- exit 0 ;;
+ exit ;;
*:ITS:*:*)
echo pdp10-unknown-its
- exit 0 ;;
+ exit ;;
+ SEI:*:*:SEIUX)
+ echo mips-sei-seiux${UNAME_RELEASE}
+ exit ;;
+ *:DragonFly:*:*)
+ echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+ exit ;;
+ *:*VMS:*:*)
+ UNAME_MACHINE=`(uname -p) 2>/dev/null`
+ case "${UNAME_MACHINE}" in
+ A*) echo alpha-dec-vms ; exit ;;
+ I*) echo ia64-dec-vms ; exit ;;
+ V*) echo vax-dec-vms ; exit ;;
+ esac ;;
+ *:XENIX:*:SysV)
+ echo i386-pc-xenix
+ exit ;;
+ i*86:skyos:*:*)
+ echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+ exit ;;
+ i*86:rdos:*:*)
+ echo ${UNAME_MACHINE}-pc-rdos
+ exit ;;
+ i*86:AROS:*:*)
+ echo ${UNAME_MACHINE}-pc-aros
+ exit ;;
esac
#echo '(No uname command or uname output not recognized.)' 1>&2
@@ -1211,16 +1336,16 @@ main ()
#include <sys/param.h>
printf ("m68k-sony-newsos%s\n",
#ifdef NEWSOS4
- "4"
+ "4"
#else
- ""
+ ""
#endif
- ); exit (0);
+ ); exit (0);
#endif
#endif
#if defined (__arm) && defined (__acorn) && defined (__unix)
- printf ("arm-acorn-riscix"); exit (0);
+ printf ("arm-acorn-riscix\n"); exit (0);
#endif
#if defined (hp300) && !defined (hpux)
@@ -1309,11 +1434,12 @@ main ()
}
EOF
-$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && $dummy && exit 0
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+ { echo "$SYSTEM_NAME"; exit; }
# Apollos put the system type in the environment.
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; }
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
# Convex versions that predate uname can use getsysinfo(1)
@@ -1322,22 +1448,22 @@ then
case `getsysinfo -f cpu_type` in
c1*)
echo c1-convex-bsd
- exit 0 ;;
+ exit ;;
c2*)
if getsysinfo -f scalar_acc
then echo c32-convex-bsd
else echo c2-convex-bsd
fi
- exit 0 ;;
+ exit ;;
c34*)
echo c34-convex-bsd
- exit 0 ;;
+ exit ;;
c38*)
echo c38-convex-bsd
- exit 0 ;;
+ exit ;;
c4*)
echo c4-convex-bsd
- exit 0 ;;
+ exit ;;
esac
fi
@@ -1348,7 +1474,9 @@ This script, last modified $timestamp, has failed to recognize
the operating system you are using. It is advised that you
download the most up to date version of the config scripts from
- ftp://ftp.gnu.org/pub/gnu/config/
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+and
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
If the version you run ($0) is already up to date, please
send the following data and any information you think might be
diff --git a/projects/sample/autoconf/config.sub b/projects/sample/autoconf/config.sub
index 9772e87..9942491 100755
--- a/projects/sample/autoconf/config.sub
+++ b/projects/sample/autoconf/config.sub
@@ -1,9 +1,10 @@
#! /bin/sh
# Configuration validation subroutine script.
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-# 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+# 2011 Free Software Foundation, Inc.
-timestamp='2003-02-22'
+timestamp='2011-11-02'
# This file is (in principle) common to ALL GNU software.
# The presence of a machine in this file suggests that SOME GNU software
@@ -21,22 +22,26 @@ timestamp='2003-02-22'
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330,
-# Boston, MA 02111-1307, USA.
-
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
+#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.
+
# Please send patches to <config-patches@gnu.org>. Submit a context
-# diff and a properly formatted ChangeLog entry.
+# diff and a properly formatted GNU ChangeLog entry.
#
# Configuration subroutine to validate and canonicalize a configuration type.
# Supply the specified configuration type as an argument.
# If it is invalid, we print an error message on stderr and exit with code 1.
# Otherwise, we print the canonical config type on stdout and succeed.
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+
# This file is supposed to be the same for all GNU packages
# and recognize all the CPU types, system types and aliases
# that are meaningful with *any* GNU software.
@@ -70,8 +75,9 @@ Report bugs and patches to <config-patches@gnu.org>."
version="\
GNU config.sub ($timestamp)
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001
-Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free
+Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -83,11 +89,11 @@ Try \`$me --help' for more information."
while test $# -gt 0 ; do
case $1 in
--time-stamp | --time* | -t )
- echo "$timestamp" ; exit 0 ;;
+ echo "$timestamp" ; exit ;;
--version | -v )
- echo "$version" ; exit 0 ;;
+ echo "$version" ; exit ;;
--help | --h* | -h )
- echo "$usage"; exit 0 ;;
+ echo "$usage"; exit ;;
-- ) # Stop option processing
shift; break ;;
- ) # Use stdin as input.
@@ -99,7 +105,7 @@ while test $# -gt 0 ; do
*local*)
# First pass through any local machine types.
echo $1
- exit 0;;
+ exit ;;
* )
break ;;
@@ -118,7 +124,11 @@ esac
# Here we must recognize all the valid KERNEL-OS combinations.
maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
case $maybe_os in
- nto-qnx* | linux-gnu* | freebsd*-gnu* | netbsd*-gnu* | storm-chaos* | os2-emx* | rtmk-nova*)
+ nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
+ linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
+ knetbsd*-gnu* | netbsd*-gnu* | \
+ kopensolaris*-gnu* | \
+ storm-chaos* | os2-emx* | rtmk-nova*)
os=-$maybe_os
basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
;;
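config.sub's first step splits a possible KERNEL-OS suffix off the input triplet; the whitelist above now also admits Android, the uClibc/newlib/dietlibc variants, and the GNU/kBSD ports. The two `sed` captures behave like so:

    echo x86_64-unknown-linux-gnu | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'   # os   -> linux-gnu
    echo x86_64-unknown-linux-gnu | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'   # rest -> x86_64-unknown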
@@ -144,10 +154,13 @@ case $os in
-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
- -apple | -axis)
+ -apple | -axis | -knuth | -cray | -microblaze)
os=
basic_machine=$1
;;
+ -bluegene*)
+ os=-cnk
+ ;;
-sim | -cisco | -oki | -wec | -winbond)
os=
basic_machine=$1
@@ -162,13 +175,17 @@ case $os in
os=-chorusos
basic_machine=$1
;;
- -chorusrdb)
- os=-chorusrdb
+ -chorusrdb)
+ os=-chorusrdb
basic_machine=$1
- ;;
+ ;;
-hiux*)
os=-hiuxwe2
;;
+ -sco6)
+ os=-sco5v6
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
-sco5)
os=-sco3.2v5
basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
@@ -185,6 +202,10 @@ case $os in
# Don't forget version if it is 3.2v4 or newer.
basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
;;
+ -sco5v6*)
+ # Don't forget version if it is 3.2v4 or newer.
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
-sco*)
os=-sco3.2v2
basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
@@ -228,53 +249,97 @@ case $basic_machine in
| a29k \
| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
- | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr \
- | clipper \
+ | am33_2.0 \
+ | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
+ | be32 | be64 \
+ | bfin \
+ | c4x | clipper \
| d10v | d30v | dlx | dsp16xx \
- | fr30 | frv \
+ | fido | fr30 | frv \
+ | hexagon \
| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
| i370 | i860 | i960 | ia64 \
- | ip2k \
- | m32r | m68000 | m68k | m88k | mcore \
+ | ip2k | iq2000 \
+ | le32 | le64 \
+ | lm32 \
+ | m32c | m32r | m32rle | m68000 | m68k | m88k \
+ | maxq | mb | microblaze | mcore | mep | metag \
| mips | mipsbe | mipseb | mipsel | mipsle \
| mips16 \
| mips64 | mips64el \
- | mips64vr | mips64vrel \
+ | mips64octeon | mips64octeonel \
| mips64orion | mips64orionel \
+ | mips64r5900 | mips64r5900el \
+ | mips64vr | mips64vrel \
| mips64vr4100 | mips64vr4100el \
| mips64vr4300 | mips64vr4300el \
| mips64vr5000 | mips64vr5000el \
+ | mips64vr5900 | mips64vr5900el \
| mipsisa32 | mipsisa32el \
| mipsisa32r2 | mipsisa32r2el \
| mipsisa64 | mipsisa64el \
+ | mipsisa64r2 | mipsisa64r2el \
| mipsisa64sb1 | mipsisa64sb1el \
| mipsisa64sr71k | mipsisa64sr71kel \
| mipstx39 | mipstx39el \
| mn10200 | mn10300 \
+ | moxie \
+ | mt \
| msp430 \
+ | nds32 | nds32le | nds32be \
+ | nios | nios2 \
| ns16k | ns32k \
- | openrisc | or32 \
+ | open8 \
+ | or32 \
| pdp10 | pdp11 | pj | pjl \
- | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
+ | powerpc | powerpc64 | powerpc64le | powerpcle \
| pyramid \
- | sh | sh[1234] | sh[23]e | sh[34]eb | shbe | shle | sh[1234]le | sh3ele \
+ | rx \
+ | score \
+ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
| sh64 | sh64le \
- | sparc | sparc64 | sparc86x | sparclet | sparclite | sparcv9 | sparcv9b \
- | strongarm \
- | tahoe | thumb | tic80 | tron \
- | v850 | v850e \
+ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
+ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \
+ | spu \
+ | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
+ | ubicom32 \
+ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
| we32k \
- | x86 | xscale | xstormy16 | xtensa \
- | z8k)
+ | x86 | xc16x | xstormy16 | xtensa \
+ | z8k | z80)
basic_machine=$basic_machine-unknown
;;
- m6811 | m68hc11 | m6812 | m68hc12)
+ c54x)
+ basic_machine=tic54x-unknown
+ ;;
+ c55x)
+ basic_machine=tic55x-unknown
+ ;;
+ c6x)
+ basic_machine=tic6x-unknown
+ ;;
+ m6811 | m68hc11 | m6812 | m68hc12 | picochip)
# Motorola 68HC11/12.
basic_machine=$basic_machine-unknown
os=-none
;;
m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
;;
+ ms1)
+ basic_machine=mt-unknown
+ ;;
+
+ strongarm | thumb | xscale)
+ basic_machine=arm-unknown
+ ;;
+
+ xscaleeb)
+ basic_machine=armeb-unknown
+ ;;
+
+ xscaleel)
+ basic_machine=armel-unknown
+ ;;
# We use `pc' rather than `unknown'
# because (1) that's what they normally are, and
@@ -294,54 +359,75 @@ case $basic_machine in
| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
| arm-* | armbe-* | armle-* | armeb-* | armv*-* \
- | avr-* \
- | bs2000-* \
- | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \
- | clipper-* | cydra-* \
+ | avr-* | avr32-* \
+ | be32-* | be64-* \
+ | bfin-* | bs2000-* \
+ | c[123]* | c30-* | [cjt]90-* | c4x-* \
+ | clipper-* | craynv-* | cydra-* \
| d10v-* | d30v-* | dlx-* \
| elxsi-* \
- | f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \
+ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
| h8300-* | h8500-* \
+ | hexagon-* \
| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
| i*86-* | i860-* | i960-* | ia64-* \
- | ip2k-* \
- | m32r-* \
+ | ip2k-* | iq2000-* \
+ | le32-* | le64-* \
+ | lm32-* \
+ | m32c-* | m32r-* | m32rle-* \
| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
- | m88110-* | m88k-* | mcore-* \
+ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
| mips16-* \
| mips64-* | mips64el-* \
- | mips64vr-* | mips64vrel-* \
+ | mips64octeon-* | mips64octeonel-* \
| mips64orion-* | mips64orionel-* \
+ | mips64r5900-* | mips64r5900el-* \
+ | mips64vr-* | mips64vrel-* \
| mips64vr4100-* | mips64vr4100el-* \
| mips64vr4300-* | mips64vr4300el-* \
| mips64vr5000-* | mips64vr5000el-* \
+ | mips64vr5900-* | mips64vr5900el-* \
| mipsisa32-* | mipsisa32el-* \
| mipsisa32r2-* | mipsisa32r2el-* \
| mipsisa64-* | mipsisa64el-* \
+ | mipsisa64r2-* | mipsisa64r2el-* \
| mipsisa64sb1-* | mipsisa64sb1el-* \
| mipsisa64sr71k-* | mipsisa64sr71kel-* \
| mipstx39-* | mipstx39el-* \
+ | mmix-* \
+ | mt-* \
| msp430-* \
- | none-* | np1-* | nv1-* | ns16k-* | ns32k-* \
+ | nds32-* | nds32le-* | nds32be-* \
+ | nios-* | nios2-* \
+ | none-* | np1-* | ns16k-* | ns32k-* \
+ | open8-* \
| orion-* \
| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
- | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
+ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
| pyramid-* \
- | romp-* | rs6000-* \
- | sh-* | sh[1234]-* | sh[23]e-* | sh[34]eb-* | shbe-* \
+ | romp-* | rs6000-* | rx-* \
+ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
- | sparc-* | sparc64-* | sparc86x-* | sparclet-* | sparclite-* \
- | sparcv9-* | sparcv9b-* | strongarm-* | sv1-* | sx?-* \
- | tahoe-* | thumb-* \
+ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
+ | sparclite-* \
+ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
+ | tahoe-* \
| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+ | tile*-* \
| tron-* \
- | v850-* | v850e-* | vax-* \
+ | ubicom32-* \
+ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
+ | vax-* \
| we32k-* \
- | x86-* | x86_64-* | xps100-* | xscale-* | xstormy16-* \
- | xtensa-* \
+ | x86-* | x86_64-* | xc16x-* | xps100-* \
+ | xstormy16-* | xtensa*-* \
| ymp-* \
- | z8k-*)
+ | z8k-* | z80-*)
+ ;;
+ # Recognize the basic CPU types without company name, with glob match.
+ xtensa*)
+ basic_machine=$basic_machine-unknown
;;
# Recognize the various machine names and aliases which stand
# for a CPU type and a company and sometimes even an OS.
@@ -359,6 +445,9 @@ case $basic_machine in
basic_machine=a29k-amd
os=-udi
;;
+ abacus)
+ basic_machine=abacus-unknown
+ ;;
adobe68k)
basic_machine=m68010-adobe
os=-scout
@@ -373,6 +462,12 @@ case $basic_machine in
basic_machine=a29k-none
os=-bsd
;;
+ amd64)
+ basic_machine=x86_64-pc
+ ;;
+ amd64-*)
+ basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
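The `sed 's/^[^-]*-//'` idiom used by the new `amd64-*` arm (and by many arms below) keeps the vendor/OS tail while the CPU field is rewritten. For example:

    echo amd64-portbld | sed 's/^[^-]*-//'   # -> portbld, yielding x86_64-portbld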
amdahl)
basic_machine=580-amdahl
os=-sysv
@@ -396,6 +491,10 @@ case $basic_machine in
basic_machine=m68k-apollo
os=-bsd
;;
+ aros)
+ basic_machine=i386-pc
+ os=-aros
+ ;;
aux)
basic_machine=m68k-apple
os=-aux
@@ -404,10 +503,35 @@ case $basic_machine in
basic_machine=ns32k-sequent
os=-dynix
;;
+ blackfin)
+ basic_machine=bfin-unknown
+ os=-linux
+ ;;
+ blackfin-*)
+ basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
+ os=-linux
+ ;;
+ bluegene*)
+ basic_machine=powerpc-ibm
+ os=-cnk
+ ;;
+ c54x-*)
+ basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ c55x-*)
+ basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ c6x-*)
+ basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
c90)
basic_machine=c90-cray
os=-unicos
;;
+ cegcc)
+ basic_machine=arm-unknown
+ os=-cegcc
+ ;;
convex-c1)
basic_machine=c1-convex
os=-bsd
@@ -432,12 +556,27 @@ case $basic_machine in
basic_machine=j90-cray
os=-unicos
;;
+ craynv)
+ basic_machine=craynv-cray
+ os=-unicosmp
+ ;;
+ cr16 | cr16-*)
+ basic_machine=cr16-unknown
+ os=-elf
+ ;;
crds | unos)
basic_machine=m68k-crds
;;
+ crisv32 | crisv32-* | etraxfs*)
+ basic_machine=crisv32-axis
+ ;;
cris | cris-* | etrax*)
basic_machine=cris-axis
;;
+ crx)
+ basic_machine=crx-unknown
+ os=-elf
+ ;;
da30 | da30-*)
basic_machine=m68k-da30
;;
@@ -460,6 +599,14 @@ case $basic_machine in
basic_machine=m88k-motorola
os=-sysv3
;;
+ dicos)
+ basic_machine=i686-pc
+ os=-dicos
+ ;;
+ djgpp)
+ basic_machine=i586-pc
+ os=-msdosdjgpp
+ ;;
dpx20 | dpx20-*)
basic_machine=rs6000-bull
os=-bosx
@@ -610,6 +757,14 @@ case $basic_machine in
basic_machine=m68k-isi
os=-sysv
;;
+ m68knommu)
+ basic_machine=m68k-unknown
+ os=-linux
+ ;;
+ m68knommu-*)
+ basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
+ os=-linux
+ ;;
m88k-omron*)
basic_machine=m88k-omron
;;
@@ -621,10 +776,17 @@ case $basic_machine in
basic_machine=ns32k-utek
os=-sysv
;;
+ microblaze)
+ basic_machine=microblaze-xilinx
+ ;;
mingw32)
basic_machine=i386-pc
os=-mingw32
;;
+ mingw32ce)
+ basic_machine=arm-unknown
+ os=-mingw32ce
+ ;;
miniframe)
basic_machine=m68000-convergent
;;
@@ -638,10 +800,6 @@ case $basic_machine in
mips3*)
basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
;;
- mmix*)
- basic_machine=mmix-knuth
- os=-mmixware
- ;;
monitor)
basic_machine=m68k-rom68k
os=-coff
@@ -654,10 +812,17 @@ case $basic_machine in
basic_machine=i386-pc
os=-msdos
;;
+ ms1-*)
+ basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
+ ;;
mvs)
basic_machine=i370-ibm
os=-mvs
;;
+ nacl)
+ basic_machine=le32-unknown
+ os=-nacl
+ ;;
ncr3000)
basic_machine=i486-ncr
os=-sysv4
@@ -722,9 +887,11 @@ case $basic_machine in
np1)
basic_machine=np1-gould
;;
- nv1)
- basic_machine=nv1-cray
- os=-unicosmp
+ neo-tandem)
+ basic_machine=neo-tandem
+ ;;
+ nse-tandem)
+ basic_machine=nse-tandem
;;
nsr-tandem)
basic_machine=nsr-tandem
@@ -733,9 +900,12 @@ case $basic_machine in
basic_machine=hppa1.1-oki
os=-proelf
;;
- or32 | or32-*)
+ openrisc | openrisc-*)
basic_machine=or32-unknown
- os=-coff
+ ;;
+ os400)
+ basic_machine=powerpc-ibm
+ os=-os400
;;
OSE68000 | ose68000)
basic_machine=m68000-ericsson
@@ -753,6 +923,14 @@ case $basic_machine in
basic_machine=i860-intel
os=-osf
;;
+ parisc)
+ basic_machine=hppa-unknown
+ os=-linux
+ ;;
+ parisc-*)
+ basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
+ os=-linux
+ ;;
pbd)
basic_machine=sparc-tti
;;
@@ -762,32 +940,45 @@ case $basic_machine in
pc532 | pc532-*)
basic_machine=ns32k-pc532
;;
+ pc98)
+ basic_machine=i386-pc
+ ;;
+ pc98-*)
+ basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
pentium | p5 | k5 | k6 | nexgen | viac3)
basic_machine=i586-pc
;;
pentiumpro | p6 | 6x86 | athlon | athlon_*)
basic_machine=i686-pc
;;
- pentiumii | pentium2)
+ pentiumii | pentium2 | pentiumiii | pentium3)
basic_machine=i686-pc
;;
+ pentium4)
+ basic_machine=i786-pc
+ ;;
pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
;;
pentiumpro-* | p6-* | 6x86-* | athlon-*)
basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
;;
- pentiumii-* | pentium2-*)
+ pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
;;
+ pentium4-*)
+ basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
pn)
basic_machine=pn-gould
;;
power) basic_machine=power-ibm
;;
- ppc) basic_machine=powerpc-unknown
+ ppc | ppcbe) basic_machine=powerpc-unknown
;;
- ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ppc-* | ppcbe-*)
+ basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
;;
ppcle | powerpclittle | ppc-le | powerpc-little)
basic_machine=powerpcle-unknown
@@ -812,6 +1003,10 @@ case $basic_machine in
basic_machine=i586-unknown
os=-pw32
;;
+ rdos)
+ basic_machine=i386-pc
+ os=-rdos
+ ;;
rom68k)
basic_machine=m68k-rom68k
os=-coff
@@ -838,6 +1033,14 @@ case $basic_machine in
sb1el)
basic_machine=mipsisa64sb1el-unknown
;;
+ sde)
+ basic_machine=mipsisa32-sde
+ os=-elf
+ ;;
+ sei)
+ basic_machine=mips-sei
+ os=-seiux
+ ;;
sequent)
basic_machine=i386-sequent
;;
@@ -845,6 +1048,12 @@ case $basic_machine in
basic_machine=sh-hitachi
os=-hms
;;
+ sh5el)
+ basic_machine=sh5le-unknown
+ ;;
+ sh64)
+ basic_machine=sh64-unknown
+ ;;
sparclite-wrs | simso-wrs)
basic_machine=sparclite-wrs
os=-vxworks
@@ -863,6 +1072,9 @@ case $basic_machine in
basic_machine=i860-stratus
os=-sysv4
;;
+ strongarm-* | thumb-*)
+ basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
sun2)
basic_machine=m68000-sun
;;
@@ -919,21 +1131,9 @@ case $basic_machine in
basic_machine=t90-cray
os=-unicos
;;
- tic4x | c4x*)
- basic_machine=tic4x-unknown
- os=-coff
- ;;
- tic54x | c54x*)
- basic_machine=tic54x-unknown
- os=-coff
- ;;
- tic55x | c55x*)
- basic_machine=tic55x-unknown
- os=-coff
- ;;
- tic6x | c6x*)
- basic_machine=tic6x-unknown
- os=-coff
+ tile*)
+ basic_machine=$basic_machine-unknown
+ os=-linux-gnu
;;
tx39)
basic_machine=mipstx39-unknown
@@ -948,6 +1148,10 @@ case $basic_machine in
tower | tower-32)
basic_machine=m68k-ncr
;;
+ tpf)
+ basic_machine=s390x-ibm
+ os=-tpf
+ ;;
udi29k)
basic_machine=a29k-amd
os=-udi
@@ -991,9 +1195,16 @@ case $basic_machine in
basic_machine=hppa1.1-winbond
os=-proelf
;;
+ xbox)
+ basic_machine=i686-pc
+ os=-mingw32
+ ;;
xps | xps100)
basic_machine=xps100-honeywell
;;
+ xscale-* | xscalee[bl]-*)
+ basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'`
+ ;;
ymp)
basic_machine=ymp-cray
os=-unicos
@@ -1002,6 +1213,10 @@ case $basic_machine in
basic_machine=z8k-unknown
os=-sim
;;
+ z80-*-coff)
+ basic_machine=z80-unknown
+ os=-sim
+ ;;
none)
basic_machine=none-none
os=-none
@@ -1021,6 +1236,9 @@ case $basic_machine in
romp)
basic_machine=romp-ibm
;;
+ mmix)
+ basic_machine=mmix-knuth
+ ;;
rs6000)
basic_machine=rs6000-ibm
;;
@@ -1037,13 +1255,10 @@ case $basic_machine in
we32k)
basic_machine=we32k-att
;;
- sh3 | sh4 | sh[34]eb | sh[1234]le | sh[23]ele)
+ sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
basic_machine=sh-unknown
;;
- sh64)
- basic_machine=sh64-unknown
- ;;
- sparc | sparcv9 | sparcv9b)
+ sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
basic_machine=sparc-sun
;;
cydra)
@@ -1087,9 +1302,12 @@ esac
if [ x"$os" != x"" ]
then
case $os in
- # First match some system type aliases
- # that might get confused with valid system types.
+ # First match some system type aliases
+ # that might get confused with valid system types.
# -solaris* is a basic system type, with this one exception.
+ -auroraux)
+ os=-auroraux
+ ;;
-solaris1 | -solaris1.*)
os=`echo $os | sed -e 's|solaris1|sunos4|'`
;;
@@ -1110,25 +1328,31 @@ case $os in
# Each alternative MUST END IN A *, to match a version number.
# -sysv* is not here because it comes later, after sysvr4.
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
- | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
- | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
+ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
+ | -sym* | -kopensolaris* \
| -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
- | -aos* \
+ | -aos* | -aros* \
| -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
| -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
- | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
- | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
+ | -openbsd* | -solidbsd* \
+ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
+ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
| -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
| -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
- | -chorusos* | -chorusrdb* \
+ | -chorusos* | -chorusrdb* | -cegcc* \
| -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
- | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \
+ | -mingw32* | -linux-gnu* | -linux-android* \
+ | -linux-newlib* | -linux-uclibc* \
+ | -uxpv* | -beos* | -mpeix* | -udk* \
| -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
| -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
| -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
| -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
| -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
- | -powermax* | -dnix*)
+ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
+ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
# Remember, each alternative MUST END IN *, to match a version number.
;;
-qnx*)
@@ -1146,12 +1370,15 @@ case $os in
os=`echo $os | sed -e 's|nto|nto-qnx|'`
;;
-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
- | -windows* | -osx | -abug | -netware* | -os9* | -beos* \
+ | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
| -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
;;
-mac*)
os=`echo $os | sed -e 's|mac|macos|'`
;;
+ -linux-dietlibc)
+ os=-linux-dietlibc
+ ;;
-linux*)
os=`echo $os | sed -e 's|linux|linux-gnu|'`
;;
@@ -1164,6 +1391,9 @@ case $os in
-opened*)
os=-openedition
;;
+ -os400*)
+ os=-os400
+ ;;
-wince*)
os=-wince
;;
@@ -1185,6 +1415,9 @@ case $os in
-atheos*)
os=-atheos
;;
+ -syllable*)
+ os=-syllable
+ ;;
-386bsd)
os=-bsd
;;
@@ -1207,6 +1440,9 @@ case $os in
-sinix*)
os=-sysv4
;;
+ -tpf*)
+ os=-tpf
+ ;;
-triton*)
os=-sysv3
;;
@@ -1243,6 +1479,14 @@ case $os in
-kaos*)
os=-kaos
;;
+ -zvmoe)
+ os=-zvmoe
+ ;;
+ -dicos*)
+ os=-dicos
+ ;;
+ -nacl*)
+ ;;
-none)
;;
*)
@@ -1265,6 +1509,12 @@ else
# system, and we'll never get to this point.
case $basic_machine in
+ score-*)
+ os=-elf
+ ;;
+ spu-*)
+ os=-elf
+ ;;
*-acorn)
os=-riscix1.2
;;
@@ -1274,6 +1524,18 @@ case $basic_machine in
arm*-semi)
os=-aout
;;
+ c4x-* | tic4x-*)
+ os=-coff
+ ;;
+ tic54x-*)
+ os=-coff
+ ;;
+ tic55x-*)
+ os=-coff
+ ;;
+ tic6x-*)
+ os=-coff
+ ;;
# This must come before the *-dec entry.
pdp10-*)
os=-tops20
@@ -1299,6 +1561,9 @@ case $basic_machine in
m68*-cisco)
os=-aout
;;
+ mep-*)
+ os=-elf
+ ;;
mips*-cisco)
os=-elf
;;
@@ -1317,9 +1582,15 @@ case $basic_machine in
*-be)
os=-beos
;;
+ *-haiku)
+ os=-haiku
+ ;;
*-ibm)
os=-aix
;;
+ *-knuth)
+ os=-mmixware
+ ;;
*-wec)
os=-proelf
;;
@@ -1422,7 +1693,7 @@ case $basic_machine in
-sunos*)
vendor=sun
;;
- -aix*)
+ -cnk*|-aix*)
vendor=ibm
;;
-beos*)
@@ -1452,9 +1723,15 @@ case $basic_machine in
-mvs* | -opened*)
vendor=ibm
;;
+ -os400*)
+ vendor=ibm
+ ;;
-ptx*)
vendor=sequent
;;
+ -tpf*)
+ vendor=ibm
+ ;;
-vxsim* | -vxworks* | -windiss*)
vendor=wrs
;;
@@ -1479,7 +1756,7 @@ case $basic_machine in
esac
echo $basic_machine$os
-exit 0
+exit
# Local variables:
# eval: (add-hook 'write-file-hooks 'time-stamp)
diff --git a/projects/sample/autoconf/configure.ac b/projects/sample/autoconf/configure.ac
index c3a49d5..bd0b16a 100644
--- a/projects/sample/autoconf/configure.ac
+++ b/projects/sample/autoconf/configure.ac
@@ -305,11 +305,12 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
arm*-*) llvm_cv_target_arch="ARM" ;;
mips-*) llvm_cv_target_arch="Mips" ;;
+ mipsel-*) llvm_cv_target_arch="Mips" ;;
xcore-*) llvm_cv_target_arch="XCore" ;;
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
- ptx-*) llvm_cv_target_arch="PTX" ;;
+ nvptx-*) llvm_cv_target_arch="NVPTX" ;;
*) llvm_cv_target_arch="Unknown" ;;
esac])
@@ -456,7 +457,7 @@ else
MSP430) AC_SUBST(TARGET_HAS_JIT,0) ;;
Hexagon) AC_SUBST(TARGET_HAS_JIT,0) ;;
MBlaze) AC_SUBST(TARGET_HAS_JIT,0) ;;
- PTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
+ NVPTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
*) AC_SUBST(TARGET_HAS_JIT,0) ;;
esac
fi
@@ -567,13 +568,13 @@ TARGETS_TO_BUILD=""
AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
[Build specific host targets: all or target1,target2,... Valid targets are:
host, x86, x86_64, sparc, powerpc, arm, mips, spu, hexagon,
- xcore, msp430, ptx, cbe, and cpp (default=all)]),,
+ xcore, msp430, nvptx, cbe, and cpp (default=all)]),,
enableval=all)
if test "$enableval" = host-only ; then
enableval=host
fi
case "$enableval" in
- all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze PTX" ;;
+ all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze NVPTX" ;;
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
case "$a_target" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -588,7 +589,7 @@ case "$enableval" in
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
cpp) TARGETS_TO_BUILD="CppBackend $TARGETS_TO_BUILD" ;;
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
- ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
host) case "$llvm_cv_target_arch" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -601,7 +602,7 @@ case "$enableval" in
XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;;
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
- PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
*) AC_MSG_ERROR([Can not set target to build]) ;;
esac ;;
*) AC_MSG_ERROR([Unrecognized target $a_target]) ;;
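With the rename, `nvptx` replaces `ptx` wherever a target list is accepted; for example:

    ./configure --enable-targets=x86,arm,nvptx   # build the X86, ARM and NVPTX backends
    ./configure --enable-targets=host            # build only the host architecture's backend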
diff --git a/projects/sample/autoconf/m4/config_makefile.m4 b/projects/sample/autoconf/m4/config_makefile.m4
index d9bfcb8..b1eaffd 100644
--- a/projects/sample/autoconf/m4/config_makefile.m4
+++ b/projects/sample/autoconf/m4/config_makefile.m4
@@ -4,6 +4,6 @@
#
AC_DEFUN([AC_CONFIG_MAKEFILE],
[AC_CONFIG_COMMANDS($1,
- [${srcdir}/autoconf/mkinstalldirs `dirname $1`
- ${SHELL} ${srcdir}/autoconf/install-sh -m 0644 -c ${srcdir}/$1 $1])
+ [${llvm_src}/autoconf/mkinstalldirs `dirname $1`
+ ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/$1 $1])
])
diff --git a/projects/sample/autoconf/m4/func_isinf.m4 b/projects/sample/autoconf/m4/func_isinf.m4
index 5c000f8..22ba81d 100644
--- a/projects/sample/autoconf/m4/func_isinf.m4
+++ b/projects/sample/autoconf/m4/func_isinf.m4
@@ -1,5 +1,5 @@
#
-# This function determins if the the isinf function isavailable on this
+# This function determines if the isinf function is available on this
# platform.
#
AC_DEFUN([AC_FUNC_ISINF],[
diff --git a/projects/sample/autoconf/m4/huge_val.m4 b/projects/sample/autoconf/m4/huge_val.m4
index 7ef9dca..6c9a22e 100644
--- a/projects/sample/autoconf/m4/huge_val.m4
+++ b/projects/sample/autoconf/m4/huge_val.m4
@@ -1,12 +1,12 @@
#
-# This function determins if the the HUGE_VAL macro is compilable with the
+# This function determines if the HUGE_VAL macro is compilable with the
# -pedantic switch or not. XCode < 2.4.1 doesn't get it right.
#
AC_DEFUN([AC_HUGE_VAL_CHECK],[
AC_CACHE_CHECK([for HUGE_VAL sanity], [ac_cv_huge_val_sanity],[
AC_LANG_PUSH([C++])
ac_save_CXXFLAGS=$CXXFLAGS
- CXXFLAGS+=" -pedantic"
+ CXXFLAGS="$CXXFLAGS -pedantic"
AC_RUN_IFELSE(
AC_LANG_PROGRAM(
[#include <math.h>],
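`CXXFLAGS+=" -pedantic"` is a bash extension; configure scripts run under plain `sh`, where `+=` is not a valid assignment and the line is parsed as a command name instead. Under a strictly POSIX shell such as dash:

    dash -c 'CXXFLAGS+=" -pedantic"'                             # fails: not a POSIX assignment
    dash -c 'CXXFLAGS="$CXXFLAGS -pedantic"; echo "$CXXFLAGS"'   # portable spelling, as patched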
diff --git a/projects/sample/autoconf/m4/link_options.m4 b/projects/sample/autoconf/m4/link_options.m4
index 4c5f2f4..57da4a0 100644
--- a/projects/sample/autoconf/m4/link_options.m4
+++ b/projects/sample/autoconf/m4/link_options.m4
@@ -10,7 +10,7 @@ AC_DEFUN([AC_LINK_GET_VERSION],
# Check for ld64.
if (echo "$version_string" | grep -q "ld64"); then
- llvm_cv_link_version=$(echo "$version_string" | sed -e "s#.*ld64-\([^ ]*\)#\1#")
+ llvm_cv_link_version=$(echo "$version_string" | sed -e "s#.*ld64-\([^ ]*\)\( (.*)\)\{0,1\}#\1#")
else
llvm_cv_link_version=$(echo "$version_string" | sed -e "s#[^0-9]*\([0-9.]*\).*#\1#")
fi
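The old pattern left any parenthesized suffix that newer linkers append to the ld64 banner inside the captured version; the added `\( (.*)\)\{0,1\}` optionally swallows it. Against a hypothetical banner string:

    v='@(#)PROGRAM:ld PROJECT:ld64-127.2 (xyz)'
    echo "$v" | sed -e "s#.*ld64-\([^ ]*\)#\1#"                   # old: 127.2 (xyz)
    echo "$v" | sed -e "s#.*ld64-\([^ ]*\)\( (.*)\)\{0,1\}#\1#"   # new: 127.2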
diff --git a/projects/sample/autoconf/m4/rand48.m4 b/projects/sample/autoconf/m4/rand48.m4
index 56705d8..76f08fa 100644
--- a/projects/sample/autoconf/m4/rand48.m4
+++ b/projects/sample/autoconf/m4/rand48.m4
@@ -1,5 +1,5 @@
#
-# This function determins if the the srand48,drand48,lrand48 functions are
+# This function determines if the srand48, drand48, lrand48 functions are
# available on this platform.
#
AC_DEFUN([AC_FUNC_RAND48],[
diff --git a/projects/sample/autoconf/m4/visibility_inlines_hidden.m4 b/projects/sample/autoconf/m4/visibility_inlines_hidden.m4
index 42ddbe9..b1cc42a 100644
--- a/projects/sample/autoconf/m4/visibility_inlines_hidden.m4
+++ b/projects/sample/autoconf/m4/visibility_inlines_hidden.m4
@@ -8,8 +8,10 @@ AC_DEFUN([AC_CXX_USE_VISIBILITY_INLINES_HIDDEN],
[llvm_cv_cxx_visibility_inlines_hidden],
[ AC_LANG_PUSH([C++])
oldcxxflags="$CXXFLAGS"
- CXXFLAGS="$CXXFLAGS -fvisibility-inlines-hidden"
- AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+ CXXFLAGS="$CXXFLAGS -O0 -fvisibility-inlines-hidden -Werror"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+ [template <typename T> struct X { void __attribute__((noinline)) f() {} };],
+ [X<int>().f();])],
[llvm_cv_cxx_visibility_inlines_hidden=yes],[llvm_cv_cxx_visibility_inlines_hidden=no])
CXXFLAGS="$oldcxxflags"
AC_LANG_POP([C++])
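The hardened probe compiles a template whose inline member is forced out of line (`noinline`) at `-O0`, under `-Werror`, so a compiler that merely warns that `-fvisibility-inlines-hidden` is unsupported now fails the check instead of passing it. Roughly the equivalent command line, assuming `$CXX`:

    cat > conftest.cpp <<'EOF'
    template <typename T> struct X { void __attribute__((noinline)) f() {} };
    int main() { X<int>().f(); return 0; }
    EOF
    ${CXX:-c++} -O0 -fvisibility-inlines-hidden -Werror -c conftest.cpp -o /dev/null \
        && echo yes || echo no
    rm -f conftest.cpp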
diff --git a/projects/sample/configure b/projects/sample/configure
index 7c5e2ee..df08c7c 100755
--- a/projects/sample/configure
+++ b/projects/sample/configure
@@ -1402,7 +1402,7 @@ Optional Features:
--enable-targets Build specific host targets: all or
target1,target2,... Valid targets are: host, x86,
x86_64, sparc, powerpc, arm, mips, spu, hexagon,
- xcore, msp430, ptx, cbe, and cpp (default=all)
+ xcore, msp430, nvptx, cbe, and cpp (default=all)
--enable-bindings Build specific language bindings:
all,auto,none,{binding-name} (default=auto)
--enable-libffi Check for the presence of libffi (default is NO)
@@ -3841,11 +3841,12 @@ else
powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
arm*-*) llvm_cv_target_arch="ARM" ;;
mips-*) llvm_cv_target_arch="Mips" ;;
+ mipsel-*) llvm_cv_target_arch="Mips" ;;
xcore-*) llvm_cv_target_arch="XCore" ;;
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
- ptx-*) llvm_cv_target_arch="PTX" ;;
+ nvptx-*) llvm_cv_target_arch="NVPTX" ;;
*) llvm_cv_target_arch="Unknown" ;;
esac
fi
@@ -5068,7 +5069,7 @@ else
;;
MBlaze) TARGET_HAS_JIT=0
;;
- PTX) TARGET_HAS_JIT=0
+ NVPTX) TARGET_HAS_JIT=0
;;
*) TARGET_HAS_JIT=0
;;
@@ -5254,7 +5255,7 @@ if test "$enableval" = host-only ; then
enableval=host
fi
case "$enableval" in
- all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze PTX" ;;
+ all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze NVPTX" ;;
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
case "$a_target" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -5269,7 +5270,7 @@ case "$enableval" in
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
cpp) TARGETS_TO_BUILD="CppBackend $TARGETS_TO_BUILD" ;;
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
- ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
host) case "$llvm_cv_target_arch" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
@@ -5282,7 +5283,7 @@ case "$enableval" in
XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;;
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
- PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
+ NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
*) { { echo "$as_me:$LINENO: error: Can not set target to build" >&5
echo "$as_me: error: Can not set target to build" >&2;}
{ (exit 1); exit 1; }; } ;;
@@ -7626,7 +7627,7 @@ else
# Check for ld64.
if (echo "$version_string" | grep -q "ld64"); then
- llvm_cv_link_version=$(echo "$version_string" | sed -e "s#.*ld64-\([^ ]*\)#\1#")
+ llvm_cv_link_version=$(echo "$version_string" | sed -e "s#.*ld64-\([^ ]*\)\( (.*)\)\{0,1\}#\1#")
else
llvm_cv_link_version=$(echo "$version_string" | sed -e "s#[^0-9]*\([0-9.]*\).*#\1#")
fi
@@ -10307,7 +10308,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<EOF
-#line 10303 "configure"
+#line 10311 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -15968,7 +15969,7 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
ac_save_CXXFLAGS=$CXXFLAGS
- CXXFLAGS+=" -pedantic"
+ CXXFLAGS="$CXXFLAGS -pedantic"
if test "$cross_compiling" = yes; then
ac_cv_huge_val_sanity=yes
else
@@ -20794,18 +20795,18 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
oldcxxflags="$CXXFLAGS"
- CXXFLAGS="$CXXFLAGS -fvisibility-inlines-hidden"
+ CXXFLAGS="$CXXFLAGS -O0 -fvisibility-inlines-hidden -Werror"
cat >conftest.$ac_ext <<_ACEOF
/* confdefs.h. */
_ACEOF
cat confdefs.h >>conftest.$ac_ext
cat >>conftest.$ac_ext <<_ACEOF
/* end confdefs.h. */
-
+template <typename T> struct X { void __attribute__((noinline)) f() {} };
int
main ()
{
-
+X<int>().f();
;
return 0;
}
@@ -22092,16 +22093,16 @@ echo "$as_me: executing $ac_file commands" >&6;}
case $ac_file$ac_mode in
- "Makefile":C) ${srcdir}/autoconf/mkinstalldirs `dirname Makefile`
- ${SHELL} ${srcdir}/autoconf/install-sh -m 0644 -c ${srcdir}/Makefile Makefile ;;
- "lib/Makefile":C) ${srcdir}/autoconf/mkinstalldirs `dirname lib/Makefile`
- ${SHELL} ${srcdir}/autoconf/install-sh -m 0644 -c ${srcdir}/lib/Makefile lib/Makefile ;;
- "lib/sample/Makefile":C) ${srcdir}/autoconf/mkinstalldirs `dirname lib/sample/Makefile`
- ${SHELL} ${srcdir}/autoconf/install-sh -m 0644 -c ${srcdir}/lib/sample/Makefile lib/sample/Makefile ;;
- "tools/Makefile":C) ${srcdir}/autoconf/mkinstalldirs `dirname tools/Makefile`
- ${SHELL} ${srcdir}/autoconf/install-sh -m 0644 -c ${srcdir}/tools/Makefile tools/Makefile ;;
- "tools/sample/Makefile":C) ${srcdir}/autoconf/mkinstalldirs `dirname tools/sample/Makefile`
- ${SHELL} ${srcdir}/autoconf/install-sh -m 0644 -c ${srcdir}/tools/sample/Makefile tools/sample/Makefile ;;
+ "Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname Makefile`
+ ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/Makefile Makefile ;;
+ "lib/Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname lib/Makefile`
+ ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/lib/Makefile lib/Makefile ;;
+ "lib/sample/Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname lib/sample/Makefile`
+ ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/lib/sample/Makefile lib/sample/Makefile ;;
+ "tools/Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname tools/Makefile`
+ ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/tools/Makefile tools/Makefile ;;
+ "tools/sample/Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname tools/sample/Makefile`
+ ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/tools/sample/Makefile tools/sample/Makefile ;;
esac
done # for ac_tag
diff --git a/runtime/libprofile/CommonProfiling.c b/runtime/libprofile/CommonProfiling.c
index d55f51c..acc17ce 100644
--- a/runtime/libprofile/CommonProfiling.c
+++ b/runtime/libprofile/CommonProfiling.c
@@ -65,6 +65,16 @@ int save_arguments(int argc, const char **argv) {
for (Length = 0, i = 0; i != (unsigned)argc; ++i)
Length += strlen(argv[i])+1;
+ /* Defensively check for a zero length, even though this is unlikely
+ * to happen in practice. This avoids calling malloc() below with a
+ * size of 0.
+ */
+ if (Length == 0) {
+ SavedArgs = 0;
+ SavedArgsLength = 0;
+ return argc;
+ }
+
SavedArgs = (char*)malloc(Length);
for (Length = 0, i = 0; i != (unsigned)argc; ++i) {
unsigned Len = strlen(argv[i]);
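
A self-contained C sketch of the guard this hunk adds (names mirror the patch; the argv copy loop is elided). malloc(0) has an implementation-defined result, either NULL or a unique non-NULL pointer, so the runtime now skips the allocation entirely when there is nothing to save:

  #include <stdlib.h>
  #include <string.h>

  static char *SavedArgs = 0;
  static unsigned SavedArgsLength = 0;

  int save_arguments(int argc, const char **argv) {
    unsigned Length, i;
    for (Length = 0, i = 0; i != (unsigned)argc; ++i)
      Length += strlen(argv[i]) + 1;
    /* Length == 0 only when argc == 0; bail out before malloc(0). */
    if (Length == 0) {
      SavedArgs = 0;
      SavedArgsLength = 0;
      return argc;
    }
    SavedArgs = (char *)malloc(Length);
    /* ... copy each argv[i] into SavedArgs as in the original loop ... */
    SavedArgsLength = Length;
    return argc;
  }
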
diff --git a/runtime/libprofile/GCDAProfiling.c b/runtime/libprofile/GCDAProfiling.c
index 3a48bb2..f2dc4f7 100644
--- a/runtime/libprofile/GCDAProfiling.c
+++ b/runtime/libprofile/GCDAProfiling.c
@@ -43,10 +43,8 @@ static void write_int32(uint32_t i) {
}
static void write_int64(uint64_t i) {
- uint32_t lo, hi;
- lo = i >> 0;
- hi = i >> 32;
-
+ uint32_t lo = i >> 0;
+ uint32_t hi = i >> 32;
write_int32(lo);
write_int32(hi);
}
@@ -76,7 +74,6 @@ static char *mangle_filename(const char *orig_filename) {
strcpy(filename, prefix);
strcat(filename, "/");
strcat(filename, orig_filename);
-
return filename;
}
@@ -85,17 +82,16 @@ static void recursive_mkdir(const char *filename) {
int i, e;
for (i = 1, e = strlen(filename); i != e; ++i) {
- if (filename[i] == '/') {
- pathname = malloc(i + 1);
- strncpy(pathname, filename, i);
- pathname[i] = '\0';
+ if (filename[i] != '/') continue;
+ pathname = malloc(i + 1);
+ strncpy(pathname, filename, i);
+ pathname[i] = '\0';
#ifdef _WIN32
- _mkdir(pathname);
+ _mkdir(pathname);
#else
- mkdir(pathname, 0750); /* some of these will fail, ignore it. */
+ mkdir(pathname, 0750); /* some of these will fail, ignore it. */
#endif
- free(pathname);
- }
+ free(pathname);
}
}
@@ -111,17 +107,18 @@ void llvm_gcda_start_file(const char *orig_filename) {
char *filename;
filename = mangle_filename(orig_filename);
recursive_mkdir(filename);
- output_file = fopen(filename, "wb");
+ output_file = fopen(filename, "w+b");
if (!output_file) {
const char *cptr = strrchr(orig_filename, '/');
- output_file = fopen(cptr ? cptr + 1 : orig_filename, "wb");
+ output_file = fopen(cptr ? cptr + 1 : orig_filename, "w+b");
if (!output_file) {
- fprintf(stderr, "LLVM profiling runtime: while opening '%s': ",
+ fprintf(stderr, "LLVM profiling runtime: cannot open '%s': ",
cptr ? cptr + 1 : orig_filename);
perror("");
- exit(1);
+ free(filename);
+ return;
}
}
@@ -167,6 +164,7 @@ void llvm_gcda_emit_function(uint32_t ident, const char *function_name) {
#ifdef DEBUG_GCDAPROFILING
printf("llvmgcda: function id=%x\n", ident);
#endif
+ if (!output_file) return;
/* function tag */
fwrite("\0\0\0\1", 4, 1, output_file);
@@ -179,23 +177,24 @@ void llvm_gcda_emit_function(uint32_t ident, const char *function_name) {
void llvm_gcda_emit_arcs(uint32_t num_counters, uint64_t *counters) {
uint32_t i;
- /* counter #1 (arcs) tag */
+
+ /* Counter #1 (arcs) tag */
+ if (!output_file) return;
fwrite("\0\0\xa1\1", 4, 1, output_file);
write_int32(num_counters * 2);
- for (i = 0; i < num_counters; ++i) {
+ for (i = 0; i < num_counters; ++i)
write_int64(counters[i]);
- }
#ifdef DEBUG_GCDAPROFILING
printf("llvmgcda: %u arcs\n", num_counters);
- for (i = 0; i < num_counters; ++i) {
+ for (i = 0; i < num_counters; ++i)
printf("llvmgcda: %llu\n", (unsigned long long)counters[i]);
- }
#endif
}
void llvm_gcda_end_file() {
/* Write out EOF record. */
+ if (!output_file) return;
fwrite("\0\0\0\0\0\0\0\0", 8, 1, output_file);
fclose(output_file);
output_file = NULL;
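
Taken together, the GCDAProfiling.c hunks change the failure mode: if neither the mangled path nor the bare filename can be opened, output_file stays NULL and every writer returns early, so the instrumented program keeps running instead of calling exit(1). A condensed C sketch of that pattern (function name shortened from llvm_gcda_emit_arcs; record layout as in the patch, with each 64-bit counter written low word first):

  #include <stdio.h>
  #include <stdint.h>

  static FILE *output_file = NULL; /* stays NULL when fopen() fails */

  static void write_int32(uint32_t v) {
    fwrite(&v, 4, 1, output_file);
  }

  void emit_arcs(uint32_t num_counters, uint64_t *counters) {
    uint32_t i;
    if (!output_file) return;                /* degraded mode: drop record */
    fwrite("\0\0\xa1\1", 4, 1, output_file); /* counter #1 (arcs) tag */
    write_int32(num_counters * 2);           /* payload size in 32-bit words */
    for (i = 0; i < num_counters; ++i) {
      write_int32((uint32_t)counters[i]);         /* low word first */
      write_int32((uint32_t)(counters[i] >> 32)); /* then high word */
    }
  }
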
diff --git a/test/Analysis/BasicAA/2003-02-26-AccessSizeTest.ll b/test/Analysis/BasicAA/2003-02-26-AccessSizeTest.ll
index 1c2d910..45f6088 100644
--- a/test/Analysis/BasicAA/2003-02-26-AccessSizeTest.ll
+++ b/test/Analysis/BasicAA/2003-02-26-AccessSizeTest.ll
@@ -2,17 +2,19 @@
; is performed. It is not legal to delete the second load instruction because
; the value computed by the first load instruction is changed by the store.
-; RUN: opt < %s -basicaa -gvn -instcombine -S | grep DONOTREMOVE
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
define i32 @test() {
- %A = alloca i32
- store i32 0, i32* %A
- %X = load i32* %A
- %B = bitcast i32* %A to i8*
- %C = getelementptr i8* %B, i64 1
- store i8 1, i8* %C ; Aliases %A
- %Y.DONOTREMOVE = load i32* %A
- %Z = sub i32 %X, %Y.DONOTREMOVE
- ret i32 %Z
+; CHECK: %Y.DONOTREMOVE = load i32* %A
+; CHECK: %Z = sub i32 0, %Y.DONOTREMOVE
+ %A = alloca i32
+ store i32 0, i32* %A
+ %X = load i32* %A
+ %B = bitcast i32* %A to i8*
+ %C = getelementptr i8* %B, i64 1
+ store i8 1, i8* %C ; Aliases %A
+ %Y.DONOTREMOVE = load i32* %A
+ %Z = sub i32 %X, %Y.DONOTREMOVE
+ ret i32 %Z
}
diff --git a/test/Analysis/BasicAA/2003-04-22-GEPProblem.ll b/test/Analysis/BasicAA/2003-04-22-GEPProblem.ll
index 5d20077..78f74a0 100644
--- a/test/Analysis/BasicAA/2003-04-22-GEPProblem.ll
+++ b/test/Analysis/BasicAA/2003-04-22-GEPProblem.ll
@@ -1,15 +1,14 @@
-; RUN: opt < %s -basicaa -gvn -instcombine -S | grep sub
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
; BasicAA was incorrectly concluding that P1 and P2 didn't conflict!
define i32 @test(i32 *%Ptr, i64 %V) {
- %P2 = getelementptr i32* %Ptr, i64 1
- %P1 = getelementptr i32* %Ptr, i64 %V
- %X = load i32* %P1
- store i32 5, i32* %P2
-
- %Y = load i32* %P1
-
- %Z = sub i32 %X, %Y
- ret i32 %Z
+; CHECK: sub i32 %X, %Y
+ %P2 = getelementptr i32* %Ptr, i64 1
+ %P1 = getelementptr i32* %Ptr, i64 %V
+ %X = load i32* %P1
+ store i32 5, i32* %P2
+ %Y = load i32* %P1
+ %Z = sub i32 %X, %Y
+ ret i32 %Z
}
diff --git a/test/Analysis/BasicAA/2003-09-19-LocalArgument.ll b/test/Analysis/BasicAA/2003-09-19-LocalArgument.ll
index 56e3339..fd4c239 100644
--- a/test/Analysis/BasicAA/2003-09-19-LocalArgument.ll
+++ b/test/Analysis/BasicAA/2003-09-19-LocalArgument.ll
@@ -1,6 +1,9 @@
; In this test, a local alloca cannot alias an incoming argument.
-; RUN: opt < %s -basicaa -gvn -instcombine -S | not grep sub
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
+
+; CHECK: define i32 @test
+; CHECK-NEXT: ret i32 0
define i32 @test(i32* %P) {
%X = alloca i32
diff --git a/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll b/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll
index 010a4588..768411e 100644
--- a/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll
+++ b/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll
@@ -1,11 +1,13 @@
; This testcase consists of alias relations which should be completely
; resolvable by basicaa.
-; RUN: opt < %s -basicaa -aa-eval -print-may-aliases -disable-output \
-; RUN: |& not grep May:
+; RUN: opt < %s -basicaa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
%T = type { i32, [10 x i8] }
+; CHECK: Function: test
+; CHECK-NOT: May:
+
define void @test(%T* %P) {
%A = getelementptr %T* %P, i64 0
%B = getelementptr %T* %P, i64 0, i32 0
diff --git a/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll b/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll
index ce01db6..b7bbf77 100644
--- a/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll
+++ b/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll
@@ -1,13 +1,15 @@
; This testcase consists of alias relations which should be completely
; resolvable by basicaa, but require analysis of getelementptr constant exprs.
-; RUN: opt < %s -basicaa -aa-eval -print-may-aliases -disable-output \
-; RUN: |& not grep May:
+; RUN: opt < %s -basicaa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
%T = type { i32, [10 x i8] }
@G = external global %T
+; CHECK: Function: test
+; CHECK-NOT: May:
+
define void @test() {
%D = getelementptr %T* @G, i64 0, i32 0
%E = getelementptr %T* @G, i64 0, i32 1, i64 5
diff --git a/test/Analysis/BasicAA/2004-07-28-MustAliasbug.ll b/test/Analysis/BasicAA/2004-07-28-MustAliasbug.ll
index 56e4ed0..578aa594 100644
--- a/test/Analysis/BasicAA/2004-07-28-MustAliasbug.ll
+++ b/test/Analysis/BasicAA/2004-07-28-MustAliasbug.ll
@@ -1,10 +1,11 @@
-; RUN: opt < %s -basicaa -dse -S | grep {store i32 0}
+; RUN: opt < %s -basicaa -dse -S | FileCheck %s
define void @test({i32,i32 }* %P) {
- %Q = getelementptr {i32,i32}* %P, i32 1
- %X = getelementptr {i32,i32}* %Q, i32 0, i32 1
- %Y = getelementptr {i32,i32}* %Q, i32 1, i32 1
- store i32 0, i32* %X
- store i32 1, i32* %Y
- ret void
+; CHECK: store i32 0, i32* %X
+ %Q = getelementptr {i32,i32}* %P, i32 1
+ %X = getelementptr {i32,i32}* %Q, i32 0, i32 1
+ %Y = getelementptr {i32,i32}* %Q, i32 1, i32 1
+ store i32 0, i32* %X
+ store i32 1, i32* %Y
+ ret void
}
diff --git a/test/Analysis/BasicAA/2006-03-03-BadArraySubscript.ll b/test/Analysis/BasicAA/2006-03-03-BadArraySubscript.ll
index 8320594..06a804c 100644
--- a/test/Analysis/BasicAA/2006-03-03-BadArraySubscript.ll
+++ b/test/Analysis/BasicAA/2006-03-03-BadArraySubscript.ll
@@ -1,7 +1,9 @@
-; RUN: opt < %s -basicaa -aa-eval -disable-output |& grep {2 no alias respon}
+; RUN: opt < %s -basicaa -aa-eval -disable-output 2>&1 | FileCheck %s
; TEST that A[1][0] may alias A[0][i].
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+; CHECK: 2 no alias responses
+
define void @test(i32 %N) {
entry:
%X = alloca [3 x [3 x i32]] ; <[3 x [3 x i32]]*> [#uses=4]
diff --git a/test/Analysis/BasicAA/2007-01-13-BasePointerBadNoAlias.ll b/test/Analysis/BasicAA/2007-01-13-BasePointerBadNoAlias.ll
index 917bf25..46b6aaf 100644
--- a/test/Analysis/BasicAA/2007-01-13-BasePointerBadNoAlias.ll
+++ b/test/Analysis/BasicAA/2007-01-13-BasePointerBadNoAlias.ll
@@ -1,9 +1,5 @@
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
; PR1109
-; RUN: opt < %s -basicaa -gvn -instcombine -S | \
-; RUN: grep {sub i32}
-; RUN: opt < %s -basicaa -gvn -instcombine -S | \
-; RUN: not grep {ret i32 0}
-; END.
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin8"
@@ -20,6 +16,9 @@ target triple = "i686-apple-darwin8"
%struct.head_type = type { [2 x %struct.LIST], %struct.FIRST_UNION, %struct.SECOND_UNION, %struct.THIRD_UNION, %struct.FOURTH_UNION, %struct.rec*, { %struct.rec* }, %struct.rec*, %struct.rec*, %struct.rec*, %struct.rec*, %struct.rec*, %struct.rec*, %struct.rec*, %struct.rec*, i32 }
%struct.rec = type { %struct.head_type }
+; CHECK: define i32 @test
+; CHECK: %Z = sub i32 %A, %Q
+; CHECK: ret i32 %Z
define i32 @test(%struct.closure_type* %tmp18169) {
%tmp18174 = getelementptr %struct.closure_type* %tmp18169, i32 0, i32 4, i32 0, i32 0 ; <i32*> [#uses=2]
diff --git a/test/Analysis/BasicAA/2007-08-01-NoAliasAndCalls.ll b/test/Analysis/BasicAA/2007-08-01-NoAliasAndCalls.ll
index e6a26e3..2a6f5b9 100644
--- a/test/Analysis/BasicAA/2007-08-01-NoAliasAndCalls.ll
+++ b/test/Analysis/BasicAA/2007-08-01-NoAliasAndCalls.ll
@@ -1,6 +1,7 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {MayAlias:.*i32\\* %., i32\\* %.} | grep {%x} | grep {%y}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
-declare i32* @unclear(i32* %a)
+; CHECK: Function: foo
+; CHECK: MayAlias: i32* %x, i32* %y
define void @foo(i32* noalias %x) {
%y = call i32* @unclear(i32* %x)
@@ -8,3 +9,5 @@ define void @foo(i32* noalias %x) {
store i32 0, i32* %y
ret void
}
+
+declare i32* @unclear(i32* %a)
diff --git a/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll b/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll
index 7f33fa4..4be793e 100644
--- a/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll
+++ b/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll
@@ -1,6 +1,9 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {9 no alias}
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {6 may alias}
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {MayAlias:.*i32\\* %Ipointer, i32\\* %Jpointer}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; CHECK: Function: foo
+; CHECK: MayAlias: i32* %Ipointer, i32* %Jpointer
+; CHECK: 9 no alias responses
+; CHECK: 6 may alias responses
define void @foo(i32* noalias %p, i32* noalias %q, i32 %i, i32 %j) {
%Ipointer = getelementptr i32* %p, i32 %i
diff --git a/test/Analysis/BasicAA/2007-08-05-GetOverloadedModRef.ll b/test/Analysis/BasicAA/2007-08-05-GetOverloadedModRef.ll
index 5b81c17..ec0e2bd 100644
--- a/test/Analysis/BasicAA/2007-08-05-GetOverloadedModRef.ll
+++ b/test/Analysis/BasicAA/2007-08-05-GetOverloadedModRef.ll
@@ -1,11 +1,10 @@
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
; PR1600
-; RUN: opt < %s -basicaa -gvn -instcombine -S | \
-; RUN: grep {ret i32 0}
-; END.
declare i16 @llvm.cttz.i16(i16, i1)
define i32 @test(i32* %P, i16* %Q) {
+; CHECK: ret i32 0
%A = load i16* %Q ; <i16> [#uses=1]
%x = load i32* %P ; <i32> [#uses=1]
%B = call i16 @llvm.cttz.i16( i16 %A, i1 true ) ; <i16> [#uses=1]
diff --git a/test/Analysis/BasicAA/2007-10-24-ArgumentsGlobals.ll b/test/Analysis/BasicAA/2007-10-24-ArgumentsGlobals.ll
index 78f24b5..429160e 100644
--- a/test/Analysis/BasicAA/2007-10-24-ArgumentsGlobals.ll
+++ b/test/Analysis/BasicAA/2007-10-24-ArgumentsGlobals.ll
@@ -1,10 +1,12 @@
-; RUN: opt < %s -basicaa -gvn -dce -S | grep tmp7
+; RUN: opt < %s -basicaa -gvn -dce -S | FileCheck %s
%struct.A = type { i32 }
%struct.B = type { %struct.A }
@a = global %struct.B zeroinitializer ; <%struct.B*> [#uses=2]
define i32 @_Z3fooP1A(%struct.A* %b) {
+; CHECK: %tmp7 = load
+; CHECK: ret i32 %tmp7
entry:
store i32 1, i32* getelementptr (%struct.B* @a, i32 0, i32 0, i32 0), align 8
%tmp4 = getelementptr %struct.A* %b, i32 0, i32 0 ;<i32*> [#uses=1]
diff --git a/test/Analysis/BasicAA/2008-04-15-Byval.ll b/test/Analysis/BasicAA/2008-04-15-Byval.ll
index 2069401..428189a 100644
--- a/test/Analysis/BasicAA/2008-04-15-Byval.ll
+++ b/test/Analysis/BasicAA/2008-04-15-Byval.ll
@@ -1,10 +1,11 @@
-; RUN: opt < %s -std-compile-opts -S | grep store
+; RUN: opt < %s -std-compile-opts -S | FileCheck %s
; ModuleID = 'small2.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
%struct.x = type { [4 x i32] }
define void @foo(%struct.x* byval align 4 %X) nounwind {
+; CHECK: store i32 2, i32* %tmp1
entry:
%tmp = getelementptr %struct.x* %X, i32 0, i32 0 ; <[4 x i32]*> [#uses=1]
%tmp1 = getelementptr [4 x i32]* %tmp, i32 0, i32 3 ; <i32*> [#uses=1]
diff --git a/test/Analysis/BasicAA/2008-11-23-NoaliasRet.ll b/test/Analysis/BasicAA/2008-11-23-NoaliasRet.ll
index c9e553d..3db9a3f 100644
--- a/test/Analysis/BasicAA/2008-11-23-NoaliasRet.ll
+++ b/test/Analysis/BasicAA/2008-11-23-NoaliasRet.ll
@@ -1,7 +1,9 @@
-; RUN: opt < %s -basicaa -aa-eval |& grep {1 no alias response}
+; RUN: opt < %s -basicaa -aa-eval -disable-output 2>&1 | FileCheck %s
declare noalias i32* @_Znwj(i32 %x) nounwind
+; CHECK: 1 no alias response
+
define i32 @foo() {
%A = call i32* @_Znwj(i32 4)
%B = call i32* @_Znwj(i32 4)
diff --git a/test/Analysis/BasicAA/2009-03-04-GEPNoalias.ll b/test/Analysis/BasicAA/2009-03-04-GEPNoalias.ll
index 3ab5d03..add7dee 100644
--- a/test/Analysis/BasicAA/2009-03-04-GEPNoalias.ll
+++ b/test/Analysis/BasicAA/2009-03-04-GEPNoalias.ll
@@ -1,8 +1,9 @@
-; RUN: opt < %s -basicaa -gvn -S | grep load
+; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
declare noalias i32* @noalias()
define i32 @test(i32 %x) {
+; CHECK: load i32* %a
%a = call i32* @noalias()
store i32 1, i32* %a
%b = getelementptr i32* %a, i32 %x
diff --git a/test/Analysis/BasicAA/2009-10-13-GEP-BaseNoAlias.ll b/test/Analysis/BasicAA/2009-10-13-GEP-BaseNoAlias.ll
index 17db2fd..c546d68 100644
--- a/test/Analysis/BasicAA/2009-10-13-GEP-BaseNoAlias.ll
+++ b/test/Analysis/BasicAA/2009-10-13-GEP-BaseNoAlias.ll
@@ -1,10 +1,13 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {NoAlias:.*%P,.*@Z}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
; If GEP base doesn't alias Z, then GEP doesn't alias Z.
; rdar://7282591
@Y = common global i32 0
@Z = common global i32 0
+; CHECK: Function: foo
+; CHECK: NoAlias: i32* %P, i32* @Z
+
define void @foo(i32 %cond) nounwind {
entry:
%a = alloca i32
diff --git a/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll b/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
index 7b5584e..6656980 100644
--- a/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
+++ b/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
@@ -1,8 +1,10 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {1 partial alias}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
; PR7959
target datalayout = "e-p:32:32:32"
+; CHECK: 1 partial alias response
+
define i32 @test(i32* %tab, i32 %indvar) nounwind {
%tmp31 = mul i32 %indvar, -2
%tmp32 = add i32 %tmp31, 30
diff --git a/test/Analysis/BasicAA/args-rets-allocas-loads.ll b/test/Analysis/BasicAA/args-rets-allocas-loads.ll
index c7b43ec..066f46b 100644
--- a/test/Analysis/BasicAA/args-rets-allocas-loads.ll
+++ b/test/Analysis/BasicAA/args-rets-allocas-loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -aa-eval -print-all-alias-modref-info -disable-output < %s |& FileCheck %s
+; RUN: opt -basicaa -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
declare void @callee(double* %callee_arg)
declare void @nocap_callee(double* nocapture %nocap_callee_arg)
diff --git a/test/Analysis/BasicAA/byval.ll b/test/Analysis/BasicAA/byval.ll
index 2aba753..673fee0 100644
--- a/test/Analysis/BasicAA/byval.ll
+++ b/test/Analysis/BasicAA/byval.ll
@@ -1,17 +1,17 @@
-; RUN: opt < %s -basicaa -gvn -S | grep {ret i32 1}
+; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
%struct.x = type { i32, i32, i32, i32 }
@g = weak global i32 0 ; <i32*> [#uses=1]
define i32 @foo(%struct.x* byval %a) nounwind {
-entry:
- %tmp1 = tail call i32 (...)* @bar( %struct.x* %a ) nounwind ; <i32> [#uses=0]
- %tmp2 = getelementptr %struct.x* %a, i32 0, i32 0 ; <i32*> [#uses=2]
- store i32 1, i32* %tmp2, align 4
- store i32 2, i32* @g, align 4
- %tmp4 = load i32* %tmp2, align 4 ; <i32> [#uses=1]
- ret i32 %tmp4
+; CHECK: ret i32 1
+ %tmp1 = tail call i32 (...)* @bar( %struct.x* %a ) nounwind ; <i32> [#uses=0]
+ %tmp2 = getelementptr %struct.x* %a, i32 0, i32 0 ; <i32*> [#uses=2]
+ store i32 1, i32* %tmp2, align 4
+ store i32 2, i32* @g, align 4
+ %tmp4 = load i32* %tmp2, align 4 ; <i32> [#uses=1]
+ ret i32 %tmp4
}
declare i32 @bar(...)
diff --git a/test/Analysis/BasicAA/cas.ll b/test/Analysis/BasicAA/cas.ll
index 754309c..d0cd9f4 100644
--- a/test/Analysis/BasicAA/cas.ll
+++ b/test/Analysis/BasicAA/cas.ll
@@ -1,8 +1,9 @@
-; RUN: opt < %s -basicaa -gvn -instcombine -S | grep {ret i32 0}
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
@flag0 = internal global i32 zeroinitializer
@turn = internal global i32 zeroinitializer
+; CHECK: ret i32 0
define i32 @main() {
%a = load i32* @flag0
diff --git a/test/Analysis/BasicAA/constant-over-index.ll b/test/Analysis/BasicAA/constant-over-index.ll
index 48ef259..232533c 100644
--- a/test/Analysis/BasicAA/constant-over-index.ll
+++ b/test/Analysis/BasicAA/constant-over-index.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info |& FileCheck %s
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info 2>&1 | FileCheck %s
; PR4267
; CHECK: MayAlias: double* %p.0.i.0, double* %p3
diff --git a/test/Analysis/BasicAA/dag.ll b/test/Analysis/BasicAA/dag.ll
index 501f4c3..1d2f6f1 100644
--- a/test/Analysis/BasicAA/dag.ll
+++ b/test/Analysis/BasicAA/dag.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info |& FileCheck %s
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Analysis/BasicAA/empty.ll b/test/Analysis/BasicAA/empty.ll
index 7b06780..dfc79f9 100644
--- a/test/Analysis/BasicAA/empty.ll
+++ b/test/Analysis/BasicAA/empty.ll
@@ -1,8 +1,10 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output \
-; RUN: |& grep {NoAlias: \{\}\\* \[%\]p, \{\}\\* \[%\]q}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+; CHECK: Function: foo:
+; CHECK-NEXT: NoAlias: {}* %p, {}* %q
+
define void @foo({}* %p, {}* %q) {
store {} {}, {}* %p
store {} {}, {}* %q
diff --git a/test/Analysis/BasicAA/full-store-partial-alias.ll b/test/Analysis/BasicAA/full-store-partial-alias.ll
index 4fa6375..2c34fd5 100644
--- a/test/Analysis/BasicAA/full-store-partial-alias.ll
+++ b/test/Analysis/BasicAA/full-store-partial-alias.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -tbaa -basicaa -gvn < %s | grep {ret i32 %}
-; RUN: opt -S -tbaa -gvn < %s | grep {ret i32 0}
+; RUN: opt -S -tbaa -basicaa -gvn < %s | FileCheck -check-prefix=BASICAA %s
+; RUN: opt -S -tbaa -gvn < %s | FileCheck %s
; rdar://8875631, rdar://8875069
; BasicAA should notice that the store stores to the entire %u object,
@@ -14,6 +14,8 @@ target datalayout = "e-p:64:64:64"
@endianness_test = global i64 1, align 8
define i32 @signbit(double %x) nounwind {
+; BASICAA: ret i32 %tmp5.lobit
+; CHECK: ret i32 0
entry:
%u = alloca %union.anon, align 8
%tmp9 = getelementptr inbounds %union.anon* %u, i64 0, i32 0
diff --git a/test/Analysis/BasicAA/gcsetest.ll b/test/Analysis/BasicAA/gcsetest.ll
index a903362..db557b7 100644
--- a/test/Analysis/BasicAA/gcsetest.ll
+++ b/test/Analysis/BasicAA/gcsetest.ll
@@ -2,12 +2,15 @@
; disambiguating some obvious cases. All loads should be removable in
; this testcase.
-; RUN: opt < %s -basicaa -gvn -instcombine -dce -S \
-; RUN: | not grep load
+; RUN: opt < %s -basicaa -gvn -instcombine -dce -S | FileCheck %s
@A = global i32 7
@B = global i32 8
+; CHECK: define i32 @test()
+; CHECK-NEXT: store i32 123, i32* @B
+; CHECK-NEXT: ret i32 0
+
define i32 @test() {
%A1 = load i32* @A
@@ -18,6 +21,14 @@ define i32 @test() {
ret i32 %X
}
+; CHECK: define i32 @test2()
+; CHECK-NEXT: br label %Loop
+; CHECK: Loop:
+; CHECK-NEXT: store i32 0, i32* @B
+; CHECK-NEXT: br i1 true, label %out, label %Loop
+; CHECK: out:
+; CHECK-NEXT: ret i32 0
+
define i32 @test2() {
%A1 = load i32* @A
br label %Loop
@@ -36,6 +47,10 @@ out:
declare void @external()
+; CHECK: define i32 @test3()
+; CHECK-NEXT: call void @external()
+; CHECK-NEXT: ret i32 7
+
define i32 @test3() {
%X = alloca i32
store i32 7, i32* %X
diff --git a/test/Analysis/BasicAA/gep-alias.ll b/test/Analysis/BasicAA/gep-alias.ll
index 4bb2832..9c2c7ee 100644
--- a/test/Analysis/BasicAA/gep-alias.ll
+++ b/test/Analysis/BasicAA/gep-alias.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -instcombine -S |& FileCheck %s
+; RUN: opt < %s -basicaa -gvn -instcombine -S 2>&1 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/Analysis/BasicAA/getmodrefinfo-cs-cs.ll b/test/Analysis/BasicAA/getmodrefinfo-cs-cs.ll
index ebd349a..f0f1a63 100644
--- a/test/Analysis/BasicAA/getmodrefinfo-cs-cs.ll
+++ b/test/Analysis/BasicAA/getmodrefinfo-cs-cs.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& FileCheck %s
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
; CHECK: Just Ref: call void @ro() <-> call void @f0()
diff --git a/test/Analysis/BasicAA/must-and-partial.ll b/test/Analysis/BasicAA/must-and-partial.ll
index 93b6184..58139ff 100644
--- a/test/Analysis/BasicAA/must-and-partial.ll
+++ b/test/Analysis/BasicAA/must-and-partial.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info |& FileCheck %s
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info 2>&1 | FileCheck %s
; When merging MustAlias and PartialAlias, merge to PartialAlias
; instead of MayAlias.
diff --git a/test/Analysis/BasicAA/no-escape-call.ll b/test/Analysis/BasicAA/no-escape-call.ll
index ccabce9..b93db6e 100644
--- a/test/Analysis/BasicAA/no-escape-call.ll
+++ b/test/Analysis/BasicAA/no-escape-call.ll
@@ -1,9 +1,10 @@
-; RUN: opt < %s -basicaa -gvn -instcombine -S | grep {ret i1 true}
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
; PR2436
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
define i1 @foo(i32 %i) nounwind {
+; CHECK: ret i1 true
entry:
%arr = alloca [10 x i8*] ; <[10 x i8*]*> [#uses=1]
%tmp2 = call i8* @getPtr( ) nounwind ; <i8*> [#uses=2]
diff --git a/test/Analysis/BasicAA/nocapture.ll b/test/Analysis/BasicAA/nocapture.ll
index 7970fbb..a8658ec 100644
--- a/test/Analysis/BasicAA/nocapture.ll
+++ b/test/Analysis/BasicAA/nocapture.ll
@@ -1,8 +1,9 @@
-; RUN: opt < %s -basicaa -gvn -instcombine -S | grep {ret i32 0}
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
declare i32* @test(i32* nocapture)
define i32 @test2() {
+; CHECK: ret i32 0
%P = alloca i32
%Q = call i32* @test(i32* %P)
%a = load i32* %P
diff --git a/test/Analysis/BasicAA/phi-aa.ll b/test/Analysis/BasicAA/phi-aa.ll
index 50fd5cd..6aa26c1 100644
--- a/test/Analysis/BasicAA/phi-aa.ll
+++ b/test/Analysis/BasicAA/phi-aa.ll
@@ -1,10 +1,12 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {NoAlias:.*%P,.*@Z}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
; rdar://7282591
@X = common global i32 0
@Y = common global i32 0
@Z = common global i32 0
+; CHECK: NoAlias: i32* %P, i32* @Z
+
define void @foo(i32 %cond) nounwind {
entry:
%"alloca point" = bitcast i32 0 to i32
diff --git a/test/Analysis/BasicAA/phi-and-select.ll b/test/Analysis/BasicAA/phi-and-select.ll
index 0ed4a2c..b8fee00 100644
--- a/test/Analysis/BasicAA/phi-and-select.ll
+++ b/test/Analysis/BasicAA/phi-and-select.ll
@@ -1,8 +1,17 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output \
-; RUN: |& grep {NoAlias: double\\* \[%\]a, double\\* \[%\]b\$} | count 4
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
; BasicAA should detect NoAliases in PHIs and Selects.
+; CHECK: Function: foo
+; CHECK: NoAlias: double* %a, double* %b
+; CHECK: Function: bar
+; CHECK: NoAlias: double* %a, double* %b
+; CHECK: Function: qux
+; CHECK: NoAlias: double* %a, double* %b
+; CHECK: Function: fin
+; CHECK: NoAlias: double* %a, double* %b
+; CHECK: ===== Alias Analysis Evaluator Report =====
+
; Two PHIs in the same block.
define void @foo(i1 %m, double* noalias %x, double* noalias %y) {
entry:
diff --git a/test/Analysis/BasicAA/pure-const-dce.ll b/test/Analysis/BasicAA/pure-const-dce.ll
index 54e6e79..266e607 100644
--- a/test/Analysis/BasicAA/pure-const-dce.ll
+++ b/test/Analysis/BasicAA/pure-const-dce.ll
@@ -1,7 +1,25 @@
-; RUN: opt < %s -basicaa -gvn -S | grep TestConst | count 2
-; RUN: opt < %s -basicaa -gvn -S | grep TestPure | count 3
-; RUN: opt < %s -basicaa -gvn -S | grep TestNone | count 4
-@g = global i32 0 ; <i32*> [#uses=1]
+; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
+
+@g = global i32 0
+
+; CHECK: @test
+; CHECK: entry
+; CHECK: %tmp0 = call i32 @TestConst(i32 5) readnone
+; CHECK-NEXT: %tmp1 = call i32 @TestPure(i32 6) readonly
+; CHECK-NEXT: %tmp2 = call i32 @TestNone(i32 7)
+; CHECK-NEXT: store i32 1, i32* @g
+; CHECK-NEXT: %tmp5 = call i32 @TestPure(i32 6) readonly
+; CHECK-NEXT: %tmp7 = call i32 @TestNone(i32 7)
+; CHECK-NEXT: %tmp8 = call i32 @TestNone(i32 7)
+; CHECK-NEXT: %sum0 = add i32 %tmp0, %tmp1
+; CHECK-NEXT: %sum1 = add i32 %sum0, %tmp2
+; CHECK-NEXT: %sum2 = add i32 %sum1, %tmp0
+; CHECK-NEXT: %sum3 = add i32 %sum2, %tmp0
+; CHECK-NEXT: %sum4 = add i32 %sum3, %tmp5
+; CHECK-NEXT: %sum5 = add i32 %sum4, %tmp5
+; CHECK-NEXT: %sum6 = add i32 %sum5, %tmp7
+; CHECK-NEXT: %sum7 = add i32 %sum6, %tmp8
+; CHECK-NEXT: ret i32 %sum7
define i32 @test() {
entry:
diff --git a/test/Analysis/BasicAA/tailcall-modref.ll b/test/Analysis/BasicAA/tailcall-modref.ll
index f7d6c57..ebeb28c 100644
--- a/test/Analysis/BasicAA/tailcall-modref.ll
+++ b/test/Analysis/BasicAA/tailcall-modref.ll
@@ -1,11 +1,7 @@
-; RUN: opt < %s -basicaa -gvn -instcombine |\
-; RUN: llvm-dis | grep {ret i32 0}
-
-declare void @foo(i32*)
-
-declare void @bar()
+; RUN: opt < %s -basicaa -gvn -instcombine -S | FileCheck %s
define i32 @test() {
+; CHECK: ret i32 0
%A = alloca i32 ; <i32*> [#uses=3]
call void @foo( i32* %A )
%X = load i32* %A ; <i32> [#uses=1]
@@ -14,3 +10,7 @@ define i32 @test() {
%Z = sub i32 %X, %Y ; <i32> [#uses=1]
ret i32 %Z
}
+
+declare void @foo(i32*)
+
+declare void @bar()
diff --git a/test/Analysis/CallGraph/2008-09-09-DirectCall.ll b/test/Analysis/CallGraph/2008-09-09-DirectCall.ll
index 784d6c7..595cc42 100644
--- a/test/Analysis/CallGraph/2008-09-09-DirectCall.ll
+++ b/test/Analysis/CallGraph/2008-09-09-DirectCall.ll
@@ -1,5 +1,9 @@
-; RUN: opt < %s -print-callgraph -disable-output |& \
-; RUN: grep {calls function 'callee'} | count 2
+; RUN: opt < %s -print-callgraph -disable-output 2>&1 | FileCheck %s
+
+; CHECK: Call graph node <<null function>>
+; CHECK: CS<{{.*}}> calls function 'callee'
+; CHECK: Call graph node for function: 'caller'
+; CHECK: CS<{{.*}}> calls function 'callee'
define internal void @callee(...) {
entry:
diff --git a/test/Analysis/CallGraph/2008-09-09-UsedByGlobal.ll b/test/Analysis/CallGraph/2008-09-09-UsedByGlobal.ll
index 0c5ef92..ac95188 100644
--- a/test/Analysis/CallGraph/2008-09-09-UsedByGlobal.ll
+++ b/test/Analysis/CallGraph/2008-09-09-UsedByGlobal.ll
@@ -1,7 +1,9 @@
-; RUN: opt < %s -print-callgraph -disable-output |& grep {calls function}
+; RUN: opt < %s -print-callgraph -disable-output 2>&1 | FileCheck %s
@a = global void ()* @f ; <void ()**> [#uses=0]
+; CHECK: calls function 'f'
+
define internal void @f() {
unreachable
}
diff --git a/test/Analysis/CallGraph/no-intrinsics.ll b/test/Analysis/CallGraph/no-intrinsics.ll
index 272a559..450dce5 100644
--- a/test/Analysis/CallGraph/no-intrinsics.ll
+++ b/test/Analysis/CallGraph/no-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -print-callgraph -disable-output |& FileCheck %s
+; RUN: opt < %s -print-callgraph -disable-output 2>&1 | FileCheck %s
; Check that intrinsics aren't added to the call graph
diff --git a/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll b/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll
index e31f416..45efc42 100644
--- a/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll
+++ b/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll
@@ -1,7 +1,8 @@
-; RUN: opt < %s -domtree -break-crit-edges -analyze \
-; RUN: -domtree | grep {3.*%brtrue }
+; RUN: opt < %s -domtree -break-crit-edges -analyze -domtree | FileCheck %s
; PR932
+; CHECK: [3] %brtrue {1,2}
+
declare void @use1(i32)
define void @f(i32 %i, i1 %c) {
diff --git a/test/Analysis/GlobalsModRef/2008-09-03-ReadGlobals.ll b/test/Analysis/GlobalsModRef/2008-09-03-ReadGlobals.ll
index 17ace8a..d51c159 100644
--- a/test/Analysis/GlobalsModRef/2008-09-03-ReadGlobals.ll
+++ b/test/Analysis/GlobalsModRef/2008-09-03-ReadGlobals.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -globalsmodref-aa -gvn -S | grep call | count 2
+; RUN: opt < %s -globalsmodref-aa -gvn -S | FileCheck %s
@g = internal global i32 0 ; <i32*> [#uses=2]
@@ -8,6 +8,8 @@ define i32 @r() {
}
define i32 @f() {
+; CHECK: call i32 @e()
+; CHECK: call i32 @e()
entry:
%tmp = call i32 @e( ) ; <i32> [#uses=1]
store i32 %tmp, i32* @g
diff --git a/test/Analysis/GlobalsModRef/aliastest.ll b/test/Analysis/GlobalsModRef/aliastest.ll
index 75af4dc..4cfed71 100644
--- a/test/Analysis/GlobalsModRef/aliastest.ll
+++ b/test/Analysis/GlobalsModRef/aliastest.ll
@@ -1,7 +1,12 @@
-; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -S | not grep load
+; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -S | FileCheck %s
+
@X = internal global i32 4 ; <i32*> [#uses=1]
define i32 @test(i32* %P) {
+; CHECK: @test
+; CHECK-NEXT: store i32 7, i32* %P
+; CHECK-NEXT: store i32 12, i32* @X
+; CHECK-NEXT: ret i32 7
store i32 7, i32* %P
store i32 12, i32* @X
%V = load i32* %P ; <i32> [#uses=1]
diff --git a/test/Analysis/GlobalsModRef/chaining-analysis.ll b/test/Analysis/GlobalsModRef/chaining-analysis.ll
index 431b2a6..aeb76e4 100644
--- a/test/Analysis/GlobalsModRef/chaining-analysis.ll
+++ b/test/Analysis/GlobalsModRef/chaining-analysis.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -S | not grep load
+; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -S | FileCheck %s
; This test requires the use of previous analyses to determine that
; doesnotmodX does not modify X (because 'sin' doesn't).
@@ -8,6 +8,10 @@
declare double @sin(double) readnone
define i32 @test(i32* %P) {
+; CHECK: @test
+; CHECK-NEXT: store i32 12, i32* @X
+; CHECK-NEXT: call double @doesnotmodX(double 1.000000e+00)
+; CHECK-NEXT: ret i32 12
store i32 12, i32* @X
call double @doesnotmodX( double 1.000000e+00 ) ; <double>:1 [#uses=0]
%V = load i32* @X ; <i32> [#uses=1]
diff --git a/test/Analysis/GlobalsModRef/indirect-global.ll b/test/Analysis/GlobalsModRef/indirect-global.ll
index 826f55c..48ac6dd 100644
--- a/test/Analysis/GlobalsModRef/indirect-global.ll
+++ b/test/Analysis/GlobalsModRef/indirect-global.ll
@@ -1,9 +1,7 @@
-; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -instcombine -S | \
-; RUN: grep {ret i32 0}
+; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -instcombine -S | FileCheck %s
@G = internal global i32* null ; <i32**> [#uses=3]
-
declare i8* @malloc(i32)
define void @test() {
%a = call i8* @malloc(i32 4)
@@ -13,6 +11,7 @@ define void @test() {
}
define i32 @test1(i32* %P) {
+; CHECK: ret i32 0
%g1 = load i32** @G ; <i32*> [#uses=2]
%h1 = load i32* %g1 ; <i32> [#uses=1]
store i32 123, i32* %P
diff --git a/test/Analysis/GlobalsModRef/modreftest.ll b/test/Analysis/GlobalsModRef/modreftest.ll
index 3a02a94a..3eed916 100644
--- a/test/Analysis/GlobalsModRef/modreftest.ll
+++ b/test/Analysis/GlobalsModRef/modreftest.ll
@@ -1,7 +1,12 @@
-; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -S | not grep load
+; RUN: opt < %s -basicaa -globalsmodref-aa -gvn -S | FileCheck %s
+
@X = internal global i32 4 ; <i32*> [#uses=2]
define i32 @test(i32* %P) {
+; CHECK: @test
+; CHECK-NEXT: store i32 12, i32* @X
+; CHECK-NEXT: call void @doesnotmodX()
+; CHECK-NEXT: ret i32 12
store i32 12, i32* @X
call void @doesnotmodX( )
%V = load i32* @X ; <i32> [#uses=1]
diff --git a/test/Analysis/GlobalsModRef/purecse.ll b/test/Analysis/GlobalsModRef/purecse.ll
index 994aff8..e030417 100644
--- a/test/Analysis/GlobalsModRef/purecse.ll
+++ b/test/Analysis/GlobalsModRef/purecse.ll
@@ -1,6 +1,5 @@
; Test that pure functions are cse'd away
-; RUN: opt < %s -globalsmodref-aa -gvn -instcombine | \
-; RUN: llvm-dis | not grep sub
+; RUN: opt < %s -globalsmodref-aa -gvn -instcombine -S | FileCheck %s
define i32 @pure(i32 %X) {
%Y = add i32 %X, 1 ; <i32> [#uses=1]
@@ -8,6 +7,8 @@ define i32 @pure(i32 %X) {
}
define i32 @test1(i32 %X) {
+; CHECK: %A = call i32 @pure(i32 %X)
+; CHECK-NEXT: ret i32 0
%A = call i32 @pure( i32 %X ) ; <i32> [#uses=1]
%B = call i32 @pure( i32 %X ) ; <i32> [#uses=1]
%C = sub i32 %A, %B ; <i32> [#uses=1]
@@ -15,6 +16,9 @@ define i32 @test1(i32 %X) {
}
define i32 @test2(i32 %X, i32* %P) {
+; CHECK: %A = call i32 @pure(i32 %X)
+; CHECK-NEXT: store i32 %X, i32* %P
+; CHECK-NEXT: ret i32 0
%A = call i32 @pure( i32 %X ) ; <i32> [#uses=1]
store i32 %X, i32* %P ;; Does not invalidate 'pure' call.
%B = call i32 @pure( i32 %X ) ; <i32> [#uses=1]
diff --git a/test/Analysis/GlobalsModRef/volatile-instrs.ll b/test/Analysis/GlobalsModRef/volatile-instrs.ll
new file mode 100644
index 0000000..49bce67
--- /dev/null
+++ b/test/Analysis/GlobalsModRef/volatile-instrs.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -basicaa -dse -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.anon = type { i32, i32, i32 }
+@b = global %struct.anon { i32 1, i32 0, i32 0 }, align 4
+@c = common global i32 0, align 4
+@a = common global %struct.anon zeroinitializer, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+
+; Make sure that the initial memcpy call does not go away
+; because the volatile load is in the way. PR12899
+
+; CHECK: main_entry:
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64
+
+define i32 @main() nounwind uwtable ssp {
+main_entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.anon* @b to i8*), i8* bitcast (%struct.anon* @a to i8*), i64 12, i32 4, i1 false)
+ %0 = load volatile i32* getelementptr inbounds (%struct.anon* @b, i64 0, i32 0), align 4, !tbaa !0
+ store i32 %0, i32* @c, align 4, !tbaa !0
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.anon* @b to i8*), i8* bitcast (%struct.anon* @a to i8*), i64 12, i32 4, i1 false) nounwind
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %0) nounwind
+ ret i32 0
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll b/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
index 9355aee..7119007 100644
--- a/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
+++ b/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
@@ -2,7 +2,7 @@
; not a child of the loopentry.6 loop.
;
; RUN: opt < %s -analyze -loops | \
-; RUN: grep {^ Loop at depth 4 containing: %loopentry.7<header><latch><exiting>}
+; RUN: grep "^ Loop at depth 4 containing: %loopentry.7<header><latch><exiting>"
define void @getAndMoveToFrontDecode() {
br label %endif.2
diff --git a/test/Analysis/RegionInfo/block_sort.ll b/test/Analysis/RegionInfo/block_sort.ll
index faec45a..ac77ab3 100644
--- a/test/Analysis/RegionInfo/block_sort.ll
+++ b/test/Analysis/RegionInfo/block_sort.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats -analyze < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats -analyze < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @BZ2_blockSort() nounwind {
start:
diff --git a/test/Analysis/RegionInfo/cond_loop.ll b/test/Analysis/RegionInfo/cond_loop.ll
index 2ce57c3..1145ffd 100644
--- a/test/Analysis/RegionInfo/cond_loop.ll
+++ b/test/Analysis/RegionInfo/cond_loop.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
5:
diff --git a/test/Analysis/RegionInfo/condition_complicated.ll b/test/Analysis/RegionInfo/condition_complicated.ll
index 7ca5c7c..6b39880 100644
--- a/test/Analysis/RegionInfo/condition_complicated.ll
+++ b/test/Analysis/RegionInfo/condition_complicated.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define internal fastcc zeroext i8 @handle_compress() nounwind {
end165:
diff --git a/test/Analysis/RegionInfo/condition_complicated_2.ll b/test/Analysis/RegionInfo/condition_complicated_2.ll
index 5fa940a..f551108 100644
--- a/test/Analysis/RegionInfo/condition_complicated_2.ll
+++ b/test/Analysis/RegionInfo/condition_complicated_2.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define internal fastcc void @compress() nounwind {
end33:
diff --git a/test/Analysis/RegionInfo/condition_forward_edge.ll b/test/Analysis/RegionInfo/condition_forward_edge.ll
index 098c9b6..5e4d9d2 100644
--- a/test/Analysis/RegionInfo/condition_forward_edge.ll
+++ b/test/Analysis/RegionInfo/condition_forward_edge.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/condition_same_exit.ll b/test/Analysis/RegionInfo/condition_same_exit.ll
index 1b88596..e48413a 100644
--- a/test/Analysis/RegionInfo/condition_same_exit.ll
+++ b/test/Analysis/RegionInfo/condition_same_exit.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/condition_simple.ll b/test/Analysis/RegionInfo/condition_simple.ll
index 19b154b..00d9ed2 100644
--- a/test/Analysis/RegionInfo/condition_simple.ll
+++ b/test/Analysis/RegionInfo/condition_simple.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/exit_in_condition.ll b/test/Analysis/RegionInfo/exit_in_condition.ll
index 3b152d2..b84abec 100644
--- a/test/Analysis/RegionInfo/exit_in_condition.ll
+++ b/test/Analysis/RegionInfo/exit_in_condition.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define internal fastcc zeroext i8 @handle_compress() nounwind {
entry:
diff --git a/test/Analysis/RegionInfo/infinite_loop.ll b/test/Analysis/RegionInfo/infinite_loop.ll
index 59cead4..8e58828 100644
--- a/test/Analysis/RegionInfo/infinite_loop.ll
+++ b/test/Analysis/RegionInfo/infinite_loop.ll
@@ -1,5 +1,5 @@
; RUN: opt -regions -analyze < %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/infinite_loop_2.ll b/test/Analysis/RegionInfo/infinite_loop_2.ll
index 80c69b7a..a8227e3 100644
--- a/test/Analysis/RegionInfo/infinite_loop_2.ll
+++ b/test/Analysis/RegionInfo/infinite_loop_2.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/infinite_loop_3.ll b/test/Analysis/RegionInfo/infinite_loop_3.ll
index 74ceafb..b09c9c1 100644
--- a/test/Analysis/RegionInfo/infinite_loop_3.ll
+++ b/test/Analysis/RegionInfo/infinite_loop_3.ll
@@ -1,8 +1,8 @@
; RUN: opt -regions -analyze < %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/infinite_loop_4.ll b/test/Analysis/RegionInfo/infinite_loop_4.ll
index fd56af1..681c305 100644
--- a/test/Analysis/RegionInfo/infinite_loop_4.ll
+++ b/test/Analysis/RegionInfo/infinite_loop_4.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/loop_with_condition.ll b/test/Analysis/RegionInfo/loop_with_condition.ll
index d1d6898..08d2ba8 100644
--- a/test/Analysis/RegionInfo/loop_with_condition.ll
+++ b/test/Analysis/RegionInfo/loop_with_condition.ll
@@ -1,8 +1,8 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/loops_1.ll b/test/Analysis/RegionInfo/loops_1.ll
index d4bf3cc..6449949 100644
--- a/test/Analysis/RegionInfo/loops_1.ll
+++ b/test/Analysis/RegionInfo/loops_1.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define internal fastcc zeroext i8 @loops_1() nounwind {
entry:
diff --git a/test/Analysis/RegionInfo/loops_2.ll b/test/Analysis/RegionInfo/loops_2.ll
index 07aa7c3..dc4a1ad 100644
--- a/test/Analysis/RegionInfo/loops_2.ll
+++ b/test/Analysis/RegionInfo/loops_2.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @meread_() nounwind {
entry:
diff --git a/test/Analysis/RegionInfo/mix_1.ll b/test/Analysis/RegionInfo/mix_1.ll
index 829c157..1474e03 100644
--- a/test/Analysis/RegionInfo/mix_1.ll
+++ b/test/Analysis/RegionInfo/mix_1.ll
@@ -1,8 +1,8 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @a_linear_impl_fig_1() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/multiple_exiting_edge.ll b/test/Analysis/RegionInfo/multiple_exiting_edge.ll
index 7bc0e46..8de6472 100644
--- a/test/Analysis/RegionInfo/multiple_exiting_edge.ll
+++ b/test/Analysis/RegionInfo/multiple_exiting_edge.ll
@@ -1,5 +1,5 @@
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @normal_condition_0() nounwind {
bb38: ; preds = %bb34, %bb34, %bb37
diff --git a/test/Analysis/RegionInfo/nested_loops.ll b/test/Analysis/RegionInfo/nested_loops.ll
index 9d8c455..a3707a1 100644
--- a/test/Analysis/RegionInfo/nested_loops.ll
+++ b/test/Analysis/RegionInfo/nested_loops.ll
@@ -1,8 +1,8 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define internal fastcc zeroext i8 @handle_compress() nounwind {
entry:
diff --git a/test/Analysis/RegionInfo/next.ll b/test/Analysis/RegionInfo/next.ll
index 377a84d..890b4f2 100644
--- a/test/Analysis/RegionInfo/next.ll
+++ b/test/Analysis/RegionInfo/next.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @MAIN__() nounwind {
entry:
diff --git a/test/Analysis/RegionInfo/paper.ll b/test/Analysis/RegionInfo/paper.ll
index 00b544b..96c87e0 100644
--- a/test/Analysis/RegionInfo/paper.ll
+++ b/test/Analysis/RegionInfo/paper.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define void @a_linear_impl_fig_1() nounwind {
0:
diff --git a/test/Analysis/RegionInfo/two_loops_same_header.ll b/test/Analysis/RegionInfo/two_loops_same_header.ll
index a97182b..e75661e 100644
--- a/test/Analysis/RegionInfo/two_loops_same_header.ll
+++ b/test/Analysis/RegionInfo/two_loops_same_header.ll
@@ -1,7 +1,7 @@
; RUN: opt -regions -analyze < %s | FileCheck %s
-; RUN: opt -regions -stats < %s |& FileCheck -check-prefix=STAT %s
-; RUN: opt -regions -print-region-style=bb -analyze < %s |& FileCheck -check-prefix=BBIT %s
-; RUN: opt -regions -print-region-style=rn -analyze < %s |& FileCheck -check-prefix=RNIT %s
+; RUN: opt -regions -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s
+; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s
define internal fastcc zeroext i8 @handle_compress() nounwind {
entry:
diff --git a/test/Analysis/ScalarEvolution/2007-07-15-NegativeStride.ll b/test/Analysis/ScalarEvolution/2007-07-15-NegativeStride.ll
index 7ff130f..e0c5583 100644
--- a/test/Analysis/ScalarEvolution/2007-07-15-NegativeStride.ll
+++ b/test/Analysis/ScalarEvolution/2007-07-15-NegativeStride.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: -scalar-evolution-max-iterations=0 | grep {Loop %bb: backedge-taken count is 100}
+; RUN: -scalar-evolution-max-iterations=0 | grep "Loop %bb: backedge-taken count is 100"
; PR1533
@array = weak global [101 x i32] zeroinitializer, align 32 ; <[100 x i32]*> [#uses=1]
diff --git a/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll b/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
index ab96243..036abf5 100644
--- a/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
+++ b/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalar-evolution -analyze | grep {Loop %bb: backedge-taken count is (-1 + (-1 \\* %x) + %y)}
+; RUN: opt < %s -scalar-evolution -analyze | grep "Loop %bb: backedge-taken count is (-1 + (-1 \* %x) + %y)"
; PR1597
define i32 @f(i32 %x, i32 %y) {
diff --git a/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll b/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll
index b678fee..a3192b9 100644
--- a/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll
+++ b/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: -scalar-evolution-max-iterations=0 | grep {backedge-taken count is 13}
+; RUN: -scalar-evolution-max-iterations=0 | grep "backedge-taken count is 13"
; PR1706
define i32 @f() {
diff --git a/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll b/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll
index fe3a7f4..d0644f7 100644
--- a/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll
+++ b/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalar-evolution -analyze | grep {Loop %header: backedge-taken count is (0 smax %n)}
+; RUN: opt < %s -scalar-evolution -analyze | grep "Loop %header: backedge-taken count is (0 smax %n)"
define void @foo(i32 %n) {
entry:
diff --git a/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll b/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll
index bcc124d..41734d7 100644
--- a/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll
+++ b/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: -scalar-evolution-max-iterations=0 | grep {backedge-taken count is 61}
+; RUN: -scalar-evolution-max-iterations=0 | grep "backedge-taken count is 61"
; PR2364
define i32 @func_6() nounwind {
diff --git a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll
index 9db9b71..5cf17a2 100644
--- a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll
+++ b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution |& not grep smax
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | not grep smax
; PR2261
@lut = common global [256 x i8] zeroinitializer, align 32 ; <[256 x i8]*> [#uses=1]
diff --git a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll
index 1847665..195dfaa 100644
--- a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll
+++ b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution |& not grep smax
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | not grep smax
; PR2070
define i32 @a(i32 %x) nounwind {
diff --git a/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll b/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll
index 86e07ec4..cbf200e 100644
--- a/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll
+++ b/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: -scalar-evolution-max-iterations=0 | grep {backedge-taken count is 113}
+; RUN: -scalar-evolution-max-iterations=0 | grep "backedge-taken count is 113"
; PR2088
define void @fun() {
diff --git a/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll b/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll
index 335bbaf..c25e4a3 100644
--- a/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll
+++ b/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -analyze -scalar-evolution |& \
-; RUN: grep {Loop %bb: backedge-taken count is (7 + (-1 \\* %argc))}
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | \
+; RUN: grep "Loop %bb: backedge-taken count is (7 + (-1 \* %argc))"
define i32 @main(i32 %argc, i8** %argv) nounwind {
entry:
diff --git a/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll b/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
index db527fe..56a8343 100644
--- a/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
+++ b/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: | grep {Loop %bb: Unpredictable backedge-taken count\\.}
+; RUN: | grep "Loop %bb: Unpredictable backedge-taken count\."
; ScalarEvolution can't compute a trip count because it doesn't know if
; dividing by the stride will have a remainder. This could theoretically
diff --git a/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll b/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
index fa9f21a..aaf6770 100644
--- a/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
+++ b/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution |& grep {/u 3}
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | grep "/u 3"
; XFAIL: *
; This is a tricky testcase for unsigned wrap detection which ScalarEvolution
diff --git a/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll b/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll
index 25a0434..a1b3b71 100644
--- a/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll
+++ b/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution | grep {backedge-taken count is 255}
+; RUN: opt < %s -analyze -scalar-evolution | grep "backedge-taken count is 255"
define i32 @foo(i32 %x, i32 %y, i32* %lam, i32* %alp) nounwind {
bb1.thread:
diff --git a/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll b/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll
index 8152e98..bb14919 100644
--- a/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll
+++ b/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -analyze -scalar-evolution |& \
-; RUN: grep {(((-1 \\* %i0) + (100005 smax %i0)) /u 5)}
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | \
+; RUN: grep "(((-1 * %i0) + (100005 smax %i0)) /u 5)"
; XFAIL: *
define i32 @foo0(i32 %i0) nounwind {
diff --git a/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll b/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll
index 3eaa492..7000626 100644
--- a/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll
+++ b/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution |& grep {/u 5}
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | grep "/u 5"
; XFAIL: *
define i8 @foo0(i8 %i0) nounwind {
diff --git a/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll b/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll
index cc2a2e4..82f2608 100644
--- a/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll
+++ b/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution | not grep {/u -1}
+; RUN: opt < %s -analyze -scalar-evolution | not grep "/u -1"
; PR3275
@g_16 = external global i16 ; <i16*> [#uses=3]
diff --git a/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll b/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll
index c2e108a..ebd9f73 100644
--- a/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll
+++ b/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution | grep {(trunc i} | not grep ext
+; RUN: opt < %s -analyze -scalar-evolution | grep "(trunc i" | not grep ext
define i16 @test1(i8 %x) {
%A = sext i8 %x to i32
diff --git a/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll b/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll
index dc7bd29..8a78043 100644
--- a/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll
+++ b/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -scalar-evolution | grep {count is 2}
+; RUN: opt < %s -analyze -scalar-evolution | grep "count is 2"
; PR3171
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
diff --git a/test/Analysis/ScalarEvolution/2012-05-18-LoopPredRecurse.ll b/test/Analysis/ScalarEvolution/2012-05-18-LoopPredRecurse.ll
new file mode 100644
index 0000000..52e6683
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/2012-05-18-LoopPredRecurse.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -iv-users -S -disable-output
+;
+; PR12868: Infinite recursion:
+; getUDivExpr()->getZeroExtendExpr()->isLoopBackedgeGuardedBy()
+;
+; We actually want SCEV simplification to fail gracefully in this
+; case, so there's no output to check, just the absence of stack overflow.
+
+@c = common global i8 0, align 1
+
+define i32 @func() {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i8 [ -1, %entry ], [ %inc, %for.body ]
+ %ui.0 = phi i32 [ undef, %entry ], [ %div, %for.body ]
+ %tobool = icmp eq i8 %storemerge, 0
+ br i1 %tobool, label %for.end, label %for.body
+
+for.body: ; preds = %for.cond
+ %conv = sext i8 %storemerge to i32
+ %div = lshr i32 %conv, 1
+ %tobool2 = icmp eq i32 %div, 0
+ %inc = add i8 %storemerge, 1
+ br i1 %tobool2, label %for.cond, label %for.end
+
+for.end: ; preds = %for.body, %for.cond
+ ret i32 0
+}
diff --git a/test/Analysis/ScalarEvolution/2012-05-29-MulAddRec.ll b/test/Analysis/ScalarEvolution/2012-05-29-MulAddRec.ll
new file mode 100644
index 0000000..eee4ec4
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/2012-05-29-MulAddRec.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -S -indvars -loop-unroll | FileCheck %s
+;
+; loop-unroll fully unrolls the inner loop, creating an interesting
+; chain of multiplication. indvars forces SCEV to run again on the
+; outer loop. While reducing the recurrence at %mul3, unsigned integer overflow
+; causes one of the terms to reach zero. This forces all multiples in
+; the recurrence to be zero, reducing the whole thing to a constant expression.
+;
+; PR12929: cast<Ty>() argument of incompatible type
+
+; CHECK: @func
+; CHECK: for.cond:
+; CHECK: %inc1 = phi i8 [ 0, %entry ], [ %0, %for.body ]
+; CHECK: br label %for.body
+
+; CHECK: for.body:
+; CHECK: %inc.9 = add i8 %inc.8, 1
+; CHECK: %0 = add i8 %inc1, 10
+; CHECK: br label %for.cond
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+define void @func() noreturn nounwind uwtable ssp {
+entry:
+ br label %for.cond
+
+for.cond.loopexit: ; preds = %for.body
+ %mul.lcssa = phi i8 [ %mul, %for.body ]
+ %0 = add i8 %inc1, 10
+ %indvars.iv.next = add i8 %indvars.iv, 10
+ br label %for.cond
+
+for.cond: ; preds = %for.cond.loopexit, %entry
+ %indvars.iv = phi i8 [ %indvars.iv.next, %for.cond.loopexit ], [ 10, %entry ]
+ %mul3 = phi i8 [ undef, %entry ], [ %mul.lcssa, %for.cond.loopexit ]
+ %inc1 = phi i8 [ 0, %entry ], [ %0, %for.cond.loopexit ]
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.cond
+ %inc26 = phi i8 [ %inc1, %for.cond ], [ %inc, %for.body ]
+ %mul45 = phi i8 [ %mul3, %for.cond ], [ %mul, %for.body ]
+ %inc = add i8 %inc26, 1
+ %mul = mul i8 %inc26, %mul45
+ %exitcond = icmp ne i8 %inc, %indvars.iv
+ br i1 %exitcond, label %for.body, label %for.cond.loopexit
+}
diff --git a/test/Analysis/ScalarEvolution/SolveQuadraticEquation.ll b/test/Analysis/ScalarEvolution/SolveQuadraticEquation.ll
index 06f1b6f..e946d7a 100644
--- a/test/Analysis/ScalarEvolution/SolveQuadraticEquation.ll
+++ b/test/Analysis/ScalarEvolution/SolveQuadraticEquation.ll
@@ -80,3 +80,24 @@ for.cond539.preheader:
unreachable
}
; CHECK: Determining loop execution counts for: @test3
+
+; PR13489
+; We used to crash on this too.
+
+define void @test4() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %v2.02 = phi i64 [ 2, %entry ], [ %phitmp, %for.body ]
+ %v1.01 = phi i64 [ -2, %entry ], [ %sub1, %for.body ]
+ %sub1 = sub i64 %v1.01, %v2.02
+ %phitmp = add i64 %v2.02, 2
+ %tobool = icmp eq i64 %sub1, %phitmp
+ br i1 %tobool, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; CHECK: Determining loop execution counts for: @test4
diff --git a/test/Analysis/ScalarEvolution/and-xor.ll b/test/Analysis/ScalarEvolution/and-xor.ll
index 1772573..06f4a85 100644
--- a/test/Analysis/ScalarEvolution/and-xor.ll
+++ b/test/Analysis/ScalarEvolution/and-xor.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -scalar-evolution -analyze \
-; RUN: | grep {\\--> (zext} | count 2
+; RUN: | grep "\--> (zext" | count 2
define i32 @foo(i32 %x) {
%n = and i32 %x, 255
diff --git a/test/Analysis/ScalarEvolution/avoid-smax-0.ll b/test/Analysis/ScalarEvolution/avoid-smax-0.ll
index 24275f9..3d15c78 100644
--- a/test/Analysis/ScalarEvolution/avoid-smax-0.ll
+++ b/test/Analysis/ScalarEvolution/avoid-smax-0.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalar-evolution -analyze | grep {Loop %bb3: backedge-taken count is (-1 + %n)}
+; RUN: opt < %s -scalar-evolution -analyze | grep "Loop %bb3: backedge-taken count is (-1 + %n)"
; We don't want to use a max in the trip count expression in
; this testcase.
diff --git a/test/Analysis/ScalarEvolution/div-overflow.ll b/test/Analysis/ScalarEvolution/div-overflow.ll
index 4f6f1e2..2846797 100644
--- a/test/Analysis/ScalarEvolution/div-overflow.ll
+++ b/test/Analysis/ScalarEvolution/div-overflow.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -scalar-evolution -analyze \
-; RUN: | grep {\\--> ((-128 \\* %a) /u -128)}
+; RUN: | grep "\--> ((-128 \* %a) /u -128)"
; Don't let ScalarEvolution fold this div away.
diff --git a/test/Analysis/ScalarEvolution/how-far-to-zero.ll b/test/Analysis/ScalarEvolution/how-far-to-zero.ll
new file mode 100644
index 0000000..07af88f
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/how-far-to-zero.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+; PR13228
+define void @f() nounwind uwtable readnone {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond, %entry
+ %c.0 = phi i8 [ 1, %entry ], [ 0, %for.cond ]
+ %i.0 = phi i8 [ 0, %entry ], [ %inc, %for.cond ]
+ %lnot = icmp eq i8 %i.0, 0
+ %inc = add i8 %i.0, 1
+ br i1 %lnot, label %for.cond, label %while.cond
+
+while.cond: ; preds = %while.body, %for.cond
+ %b.2 = phi i8 [ %add, %while.body ], [ 0, %for.cond ]
+ br i1 undef, label %while.end, label %while.body
+
+while.body: ; preds = %while.cond
+ %add = add i8 %b.2, %c.0
+ %tobool7 = icmp eq i8 %add, 0
+ br i1 %tobool7, label %while.end, label %while.cond
+
+while.end: ; preds = %while.body, %while.cond
+ ret void
+}
+;CHECK: Loop %while.cond: <multiple exits> Unpredictable backedge-taken count.
diff --git a/test/Analysis/ScalarEvolution/scev-aa.ll b/test/Analysis/ScalarEvolution/scev-aa.ll
index dd5a66c..a0abbb7 100644
--- a/test/Analysis/ScalarEvolution/scev-aa.ll
+++ b/test/Analysis/ScalarEvolution/scev-aa.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -scev-aa -aa-eval -print-all-alias-modref-info \
-; RUN: |& FileCheck %s
+; RUN: 2>&1 | FileCheck %s
; At the time of this writing, -basicaa misses the example of the form
; A[i+(j+1)] != A[i+j], which can arise from multi-dimensional array references,
diff --git a/test/Analysis/ScalarEvolution/sext-inreg.ll b/test/Analysis/ScalarEvolution/sext-inreg.ll
index 23e1210..8b3d641 100644
--- a/test/Analysis/ScalarEvolution/sext-inreg.ll
+++ b/test/Analysis/ScalarEvolution/sext-inreg.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -analyze -scalar-evolution > %t
-; RUN: grep {sext i57 \{0,+,199\}<%bb> to i64} %t | count 1
-; RUN: grep {sext i59 \{0,+,199\}<%bb> to i64} %t | count 1
+; RUN: grep "sext i57 {0,+,199}<%bb> to i64" %t | count 1
+; RUN: grep "sext i59 {0,+,199}<%bb> to i64" %t | count 1
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.6"
diff --git a/test/Analysis/ScalarEvolution/sext-iv-1.ll b/test/Analysis/ScalarEvolution/sext-iv-1.ll
index 9063cbb..c34596d 100644
--- a/test/Analysis/ScalarEvolution/sext-iv-1.ll
+++ b/test/Analysis/ScalarEvolution/sext-iv-1.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -scalar-evolution -analyze \
-; RUN: | grep { --> (sext i. \{.\*,+,.\*\}<%bb1> to i64)} | count 5
+; RUN: | grep " --> (sext i. {.*,+,.*}<%bb1> to i64)" | count 5
; Don't convert (sext {...,+,...}) to {sext(...),+,sext(...)} in cases
; where the trip count is not within range.
diff --git a/test/Analysis/ScalarEvolution/smax.ll b/test/Analysis/ScalarEvolution/smax.ll
index 15dd744..eceb429 100644
--- a/test/Analysis/ScalarEvolution/smax.ll
+++ b/test/Analysis/ScalarEvolution/smax.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -analyze -scalar-evolution | grep smax | count 2
; RUN: opt < %s -analyze -scalar-evolution | grep \
-; RUN: {%. smax %. smax %.}
+; RUN: "%. smax %. smax %."
; PR1614
define i32 @x(i32 %a, i32 %b, i32 %c) {
diff --git a/test/Analysis/ScalarEvolution/trip-count.ll b/test/Analysis/ScalarEvolution/trip-count.ll
index cb4e267..94f6882 100644
--- a/test/Analysis/ScalarEvolution/trip-count.ll
+++ b/test/Analysis/ScalarEvolution/trip-count.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: -scalar-evolution-max-iterations=0 | grep {backedge-taken count is 10000}
+; RUN: -scalar-evolution-max-iterations=0 | grep "backedge-taken count is 10000"
; PR1101
@A = weak global [1000 x i32] zeroinitializer, align 32
diff --git a/test/Analysis/ScalarEvolution/trip-count2.ll b/test/Analysis/ScalarEvolution/trip-count2.ll
index e26cbea..d84e99f 100644
--- a/test/Analysis/ScalarEvolution/trip-count2.ll
+++ b/test/Analysis/ScalarEvolution/trip-count2.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution | \
-; RUN: grep {backedge-taken count is 4}
+; RUN: grep "backedge-taken count is 4"
; PR1101
@A = weak global [1000 x i32] zeroinitializer, align 32
diff --git a/test/Analysis/ScalarEvolution/trip-count3.ll b/test/Analysis/ScalarEvolution/trip-count3.ll
index 1bf86ae..0cb6c95 100644
--- a/test/Analysis/ScalarEvolution/trip-count3.ll
+++ b/test/Analysis/ScalarEvolution/trip-count3.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -scalar-evolution -analyze \
-; RUN: | grep {Loop %bb3\\.i: Unpredictable backedge-taken count\\.}
+; RUN: | grep "Loop %bb3\.i: Unpredictable backedge-taken count\."
; ScalarEvolution can't compute a trip count because it doesn't know if
; dividing by the stride will have a remainder. This could theoretically
diff --git a/test/Analysis/ScalarEvolution/trip-count4.ll b/test/Analysis/ScalarEvolution/trip-count4.ll
index 116f62d..c02ae14 100644
--- a/test/Analysis/ScalarEvolution/trip-count4.ll
+++ b/test/Analysis/ScalarEvolution/trip-count4.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: | grep {sext.*trunc.*Exits: 11}
+; RUN: | grep "sext.*trunc.*Exits: 11"
; ScalarEvolution should be able to compute a loop exit value for %indvar.i8.
diff --git a/test/Analysis/ScalarEvolution/trip-count5.ll b/test/Analysis/ScalarEvolution/trip-count5.ll
index 1194a1d..68a1ae1 100644
--- a/test/Analysis/ScalarEvolution/trip-count5.ll
+++ b/test/Analysis/ScalarEvolution/trip-count5.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -analyze -scalar-evolution > %t
; RUN: grep sext %t | count 2
-; RUN: not grep {(sext} %t
+; RUN: not grep "(sext" %t
; ScalarEvolution should be able to compute a maximum trip count
; value sufficient to fold away both sext casts.
diff --git a/test/Analysis/ScalarEvolution/trip-count6.ll b/test/Analysis/ScalarEvolution/trip-count6.ll
index 956fb81..882f552 100644
--- a/test/Analysis/ScalarEvolution/trip-count6.ll
+++ b/test/Analysis/ScalarEvolution/trip-count6.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: | grep {max backedge-taken count is 1\$}
+; RUN: | grep "max backedge-taken count is 1$"
@mode_table = global [4 x i32] zeroinitializer ; <[4 x i32]*> [#uses=1]
diff --git a/test/Analysis/ScalarEvolution/trip-count7.ll b/test/Analysis/ScalarEvolution/trip-count7.ll
index a8b797e..2bcb9e9 100644
--- a/test/Analysis/ScalarEvolution/trip-count7.ll
+++ b/test/Analysis/ScalarEvolution/trip-count7.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: | grep {Loop %bb7.i: Unpredictable backedge-taken count\\.}
+; RUN: | grep "Loop %bb7.i: Unpredictable backedge-taken count\."
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/Analysis/ScalarEvolution/trip-count8.ll b/test/Analysis/ScalarEvolution/trip-count8.ll
index ac5ee60..005162b 100644
--- a/test/Analysis/ScalarEvolution/trip-count8.ll
+++ b/test/Analysis/ScalarEvolution/trip-count8.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -analyze -scalar-evolution \
-; RUN: | grep {Loop %for\\.body: backedge-taken count is (-1 + \[%\]ecx)}
+; RUN: | grep "Loop %for\.body: backedge-taken count is (-1 + [%]ecx)"
; PR4599
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/Analysis/ScalarEvolution/xor-and.ll b/test/Analysis/ScalarEvolution/xor-and.ll
index c0530bb..4ab2f39 100644
--- a/test/Analysis/ScalarEvolution/xor-and.ll
+++ b/test/Analysis/ScalarEvolution/xor-and.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -scalar-evolution -analyze \
-; RUN: | grep {\\--> (zext i4 (-8 + (trunc i64 (8 \\* %x) to i4)) to i64)}
+; RUN: | grep "\--> (zext i4 (-8 + (trunc i64 (8 \* %x) to i4)) to i64)"
; ScalarEvolution shouldn't try to analyze %z into something like
; --> (zext i4 (-1 + (-1 * (trunc i64 (8 * %x) to i4))) to i64)
diff --git a/test/Assembler/2003-04-15-ConstantInitAssertion.ll b/test/Assembler/2003-04-15-ConstantInitAssertion.ll
index fa6b807..dddbdb1 100644
--- a/test/Assembler/2003-04-15-ConstantInitAssertion.ll
+++ b/test/Assembler/2003-04-15-ConstantInitAssertion.ll
@@ -1,4 +1,5 @@
-; RUN: not llvm-as < %s >/dev/null |& grep {struct initializer doesn't match struct element type}
+; RUN: not llvm-as < %s >/dev/null 2> %t
+; RUN: grep "struct initializer doesn't match struct element type" %t
; Test the case of a malformed constant initializer
; This should cause an assembler error, not an assertion failure!
constant { i32 } { float 1.0 }
diff --git a/test/Assembler/2003-05-21-MalformedShiftCrash.ll b/test/Assembler/2003-05-21-MalformedShiftCrash.ll
index a845d89..1d4ac40 100644
--- a/test/Assembler/2003-05-21-MalformedShiftCrash.ll
+++ b/test/Assembler/2003-05-21-MalformedShiftCrash.ll
@@ -1,4 +1,5 @@
; Found by inspection of the code
-; RUN: not llvm-as < %s > /dev/null |& grep {constexpr requires integer operands}
+; RUN: not llvm-as < %s > /dev/null 2> %t
+; RUN: grep "constexpr requires integer operands" %t
global i32 ashr (float 1.0, float 2.0)
diff --git a/test/Assembler/2003-05-21-MalformedStructCrash.ll b/test/Assembler/2003-05-21-MalformedStructCrash.ll
index 8d20e070..44d3e23 100644
--- a/test/Assembler/2003-05-21-MalformedStructCrash.ll
+++ b/test/Assembler/2003-05-21-MalformedStructCrash.ll
@@ -1,4 +1,5 @@
; Found by inspection of the code
-; RUN: not llvm-as < %s > /dev/null |& grep {initializer with struct type has wrong # elements}
+; RUN: not llvm-as < %s > /dev/null 2> %t
+; RUN: grep "initializer with struct type has wrong # elements" %t
global {} { i32 7, float 1.0, i32 7, i32 8 }
diff --git a/test/Assembler/2003-06-17-InvokeDisassemble.ll b/test/Assembler/2003-06-17-InvokeDisassemble.ll
deleted file mode 100644
index 8a9670e..0000000
--- a/test/Assembler/2003-06-17-InvokeDisassemble.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis
-
-define void @test() {
- invoke void @test( )
- to label %Next unwind label %Next
-
-Next: ; preds = %0, %0
- %lpad = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
- cleanup
- ret void
-}
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/Assembler/2003-11-12-ConstantExprCast.ll b/test/Assembler/2003-11-12-ConstantExprCast.ll
index 149fef2..47a5353 100644
--- a/test/Assembler/2003-11-12-ConstantExprCast.ll
+++ b/test/Assembler/2003-11-12-ConstantExprCast.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis | not grep { bitcast (}
+; RUN: llvm-as < %s | llvm-dis | not grep " bitcast ("
@.Base64_1 = external constant [4 x i8] ; <[4 x i8]*> [#uses=1]
diff --git a/test/Assembler/2003-11-24-SymbolTableCrash.ll b/test/Assembler/2003-11-24-SymbolTableCrash.ll
index 041b0d9..28fd301 100644
--- a/test/Assembler/2003-11-24-SymbolTableCrash.ll
+++ b/test/Assembler/2003-11-24-SymbolTableCrash.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& grep {multiple definition}
+; RUN: not llvm-as < %s 2>&1 | grep "multiple definition"
define void @test() {
%tmp.1 = add i32 0, 1
diff --git a/test/Assembler/2004-01-11-getelementptrfolding.ll b/test/Assembler/2004-01-11-getelementptrfolding.ll
index c22aede..5249d0e 100644
--- a/test/Assembler/2004-01-11-getelementptrfolding.ll
+++ b/test/Assembler/2004-01-11-getelementptrfolding.ll
@@ -1,5 +1,5 @@
; RUN: llvm-as < %s | llvm-dis | \
-; RUN: not grep {getelementptr.*getelementptr}
+; RUN: not grep "getelementptr.*getelementptr"
%struct.TTriangleItem = type { i8*, i8*, [3 x %struct.TUVVertex] }
%struct.TUVVertex = type { i16, i16, i16, i16 }
diff --git a/test/Assembler/2004-03-30-UnclosedFunctionCrash.ll b/test/Assembler/2004-03-30-UnclosedFunctionCrash.ll
index 775b755..9f24f1a 100644
--- a/test/Assembler/2004-03-30-UnclosedFunctionCrash.ll
+++ b/test/Assembler/2004-03-30-UnclosedFunctionCrash.ll
@@ -1,3 +1,3 @@
-; RUN: not llvm-as %s |& grep {found end of file when expecting more instructions}
+; RUN: not llvm-as %s 2>&1 | grep "found end of file when expecting more instructions"
define void @foo() {
diff --git a/test/Assembler/2004-11-28-InvalidTypeCrash.ll b/test/Assembler/2004-11-28-InvalidTypeCrash.ll
index 40648fd..4db5b74 100644
--- a/test/Assembler/2004-11-28-InvalidTypeCrash.ll
+++ b/test/Assembler/2004-11-28-InvalidTypeCrash.ll
@@ -1,4 +1,4 @@
; Test for PR463. This program is erroneous, but should not crash llvm-as.
-; RUN: not llvm-as %s -o /dev/null |& grep {use of undefined type named 'struct.none'}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "use of undefined type named 'struct.none'"
@.FOO = internal global %struct.none zeroinitializer
diff --git a/test/Assembler/2006-09-28-CrashOnInvalid.ll b/test/Assembler/2006-09-28-CrashOnInvalid.ll
index a203c6a..6041bdf 100644
--- a/test/Assembler/2006-09-28-CrashOnInvalid.ll
+++ b/test/Assembler/2006-09-28-CrashOnInvalid.ll
@@ -1,6 +1,7 @@
; Test for PR902. This program is erroneous, but should not crash llvm-as.
; This tests that a simple error is caught and processed correctly.
-; RUN: not llvm-as < %s >/dev/null |& grep {floating point constant invalid for type}
+; RUN: not llvm-as < %s >/dev/null 2> %t
+; RUN: grep "floating point constant invalid for type" %t
define void @test() {
add i32 1, 2.0
diff --git a/test/Assembler/2007-01-02-Undefined-Arg-Type.ll b/test/Assembler/2007-01-02-Undefined-Arg-Type.ll
index a39de1c..184e543 100644
--- a/test/Assembler/2007-01-02-Undefined-Arg-Type.ll
+++ b/test/Assembler/2007-01-02-Undefined-Arg-Type.ll
@@ -1,5 +1,5 @@
; The assembler should catch an undefined argument type.
-; RUN: not llvm-as %s -o /dev/null |& grep {use of undefined type named 'typedef.bc_struct'}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "use of undefined type named 'typedef.bc_struct'"
; %typedef.bc_struct = type opaque
diff --git a/test/Assembler/2007-01-16-CrashOnBadCast.ll b/test/Assembler/2007-01-16-CrashOnBadCast.ll
index 81f5458..aa74144 100644
--- a/test/Assembler/2007-01-16-CrashOnBadCast.ll
+++ b/test/Assembler/2007-01-16-CrashOnBadCast.ll
@@ -1,5 +1,5 @@
; PR1117
-; RUN: not llvm-as %s -o /dev/null |& grep {invalid cast opcode for cast from}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "invalid cast opcode for cast from"
define i8* @nada(i64 %X) {
%result = trunc i64 %X to i8*
diff --git a/test/Assembler/2007-01-16-CrashOnBadCast2.ll b/test/Assembler/2007-01-16-CrashOnBadCast2.ll
index c05c609..479bef7 100644
--- a/test/Assembler/2007-01-16-CrashOnBadCast2.ll
+++ b/test/Assembler/2007-01-16-CrashOnBadCast2.ll
@@ -1,4 +1,4 @@
; PR1117
-; RUN: not llvm-as %s -o /dev/null |& grep {invalid cast opcode for cast from}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "invalid cast opcode for cast from"
@X = constant i8* trunc (i64 0 to i8*)
diff --git a/test/Assembler/2007-03-18-InvalidNumberedVar.ll b/test/Assembler/2007-03-18-InvalidNumberedVar.ll
index b2193b1..0f6b24d 100644
--- a/test/Assembler/2007-03-18-InvalidNumberedVar.ll
+++ b/test/Assembler/2007-03-18-InvalidNumberedVar.ll
@@ -1,5 +1,6 @@
; PR 1258
-; RUN: not llvm-as < %s >/dev/null |& grep {'%0' defined with type 'i1'}
+; RUN: not llvm-as < %s >/dev/null 2> %t
+; RUN: grep "'%0' defined with type 'i1'" %t
define i32 @test1(i32 %a, i32 %b) {
entry:
diff --git a/test/Assembler/2007-03-19-NegValue.ll b/test/Assembler/2007-03-19-NegValue.ll
index e90cf35..64eb3cb 100644
--- a/test/Assembler/2007-03-19-NegValue.ll
+++ b/test/Assembler/2007-03-19-NegValue.ll
@@ -1,5 +1,5 @@
; Test whether negative values > 64 bits retain their negativeness.
-; RUN: llvm-as < %s | llvm-dis | grep {add i65.*, -1}
+; RUN: llvm-as < %s | llvm-dis | grep "add i65.*, -1"
define i65 @testConsts(i65 %N) {
%a = add i65 %N, -1
diff --git a/test/Assembler/2007-04-20-AlignedLoad.ll b/test/Assembler/2007-04-20-AlignedLoad.ll
index f0217ae..98a5428 100644
--- a/test/Assembler/2007-04-20-AlignedLoad.ll
+++ b/test/Assembler/2007-04-20-AlignedLoad.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis | grep {align 1024}
+; RUN: llvm-as < %s | llvm-dis | grep "align 1024"
define i32 @test(i32* %arg) {
entry:
diff --git a/test/Assembler/2007-04-20-AlignedStore.ll b/test/Assembler/2007-04-20-AlignedStore.ll
index 1b08c48..9e4dd9f 100644
--- a/test/Assembler/2007-04-20-AlignedStore.ll
+++ b/test/Assembler/2007-04-20-AlignedStore.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis | grep {align 1024}
+; RUN: llvm-as < %s | llvm-dis | grep "align 1024"
define void @test(i32* %arg) {
entry:
diff --git a/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll b/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll
index c26d9eb..b0ca1aa 100644
--- a/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll
+++ b/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis | grep {icmp.*test_weak.*null}
+; RUN: llvm-as < %s | llvm-dis | grep "icmp.*test_weak.*null"
; PR1358
@G = global i1 icmp ne (i32 (...)* @test_weak, i32 (...)* null)
diff --git a/test/Assembler/2007-08-06-AliasInvalid.ll b/test/Assembler/2007-08-06-AliasInvalid.ll
index 9409598..3abdc41 100644
--- a/test/Assembler/2007-08-06-AliasInvalid.ll
+++ b/test/Assembler/2007-08-06-AliasInvalid.ll
@@ -1,4 +1,5 @@
-; RUN: not llvm-as < %s > /dev/null |& grep {expected top-level entity}
+; RUN: not llvm-as < %s > /dev/null 2> %t
+; RUN: grep "expected top-level entity" %t
; PR1577
@anInt = global i32 1
diff --git a/test/Assembler/2007-09-29-GC.ll b/test/Assembler/2007-09-29-GC.ll
index 789a0fe..9aefd0b 100644
--- a/test/Assembler/2007-09-29-GC.ll
+++ b/test/Assembler/2007-09-29-GC.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llvm-dis | grep {@f.*gc.*shadowstack}
-; RUN: llvm-as < %s | llvm-dis | grep {@g.*gc.*java}
+; RUN: llvm-as < %s | llvm-dis | grep "@f.*gc.*shadowstack"
+; RUN: llvm-as < %s | llvm-dis | grep "@g.*gc.*java"
define void @f() gc "shadowstack" {
entry:
diff --git a/test/Assembler/2007-12-11-AddressSpaces.ll b/test/Assembler/2007-12-11-AddressSpaces.ll
index 0eb4a79..7c9b5b5 100644
--- a/test/Assembler/2007-12-11-AddressSpaces.ll
+++ b/test/Assembler/2007-12-11-AddressSpaces.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llvm-dis | grep {addrspace(33)} | count 7
-; RUN: llvm-as < %s | llvm-dis | grep {addrspace(42)} | count 2
-; RUN: llvm-as < %s | llvm-dis | grep {addrspace(66)} | count 2
-; RUN: llvm-as < %s | llvm-dis | grep {addrspace(11)} | count 6
-; RUN: llvm-as < %s | llvm-dis | grep {addrspace(22)} | count 5
+; RUN: llvm-as < %s | llvm-dis | grep "addrspace(33)" | count 7
+; RUN: llvm-as < %s | llvm-dis | grep "addrspace(42)" | count 2
+; RUN: llvm-as < %s | llvm-dis | grep "addrspace(66)" | count 2
+; RUN: llvm-as < %s | llvm-dis | grep "addrspace(11)" | count 6
+; RUN: llvm-as < %s | llvm-dis | grep "addrspace(22)" | count 5
%struct.mystruct = type { i32, i32 addrspace(33)*, i32, i32 addrspace(33)* }
@input = weak addrspace(42) global %struct.mystruct zeroinitializer ; <%struct.mystruct addrspace(42)*> [#uses=1]
diff --git a/test/Assembler/2008-02-18-IntPointerCrash.ll b/test/Assembler/2008-02-18-IntPointerCrash.ll
index 5a661ad..4a33c36 100644
--- a/test/Assembler/2008-02-18-IntPointerCrash.ll
+++ b/test/Assembler/2008-02-18-IntPointerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as %s |& grep {integer constant must have integer type}
+; RUN: not llvm-as %s 2>&1 | grep "integer constant must have integer type"
; PR2060
define i8* @foo() {
diff --git a/test/Assembler/2008-09-02-FunctionNotes2.ll b/test/Assembler/2008-09-02-FunctionNotes2.ll
index 8a49e89..97351e2 100644
--- a/test/Assembler/2008-09-02-FunctionNotes2.ll
+++ b/test/Assembler/2008-09-02-FunctionNotes2.ll
@@ -1,5 +1,5 @@
; Test function notes
-; RUN: not llvm-as %s -o /dev/null |& grep "Attributes noinline alwaysinline are incompatible"
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Attributes noinline alwaysinline are incompatible"
define void @fn1() alwaysinline noinline {
ret void
}
diff --git a/test/Assembler/ConstantExprFold.ll b/test/Assembler/ConstantExprFold.ll
index d3d374a..fc18ce7 100644
--- a/test/Assembler/ConstantExprFold.ll
+++ b/test/Assembler/ConstantExprFold.ll
@@ -1,7 +1,7 @@
; This test checks to make sure that constant exprs fold in some simple
; situations
-; RUN: llvm-as < %s | llvm-dis | not grep {(}
+; RUN: llvm-as < %s | llvm-dis | not grep "("
@A = global i64 0
diff --git a/test/Assembler/extractvalue-invalid-idx.ll b/test/Assembler/extractvalue-invalid-idx.ll
index 9a215f7..b5a398c 100644
--- a/test/Assembler/extractvalue-invalid-idx.ll
+++ b/test/Assembler/extractvalue-invalid-idx.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& FileCheck %s
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
; PR4170
; CHECK: invalid indices for extractvalue
diff --git a/test/Assembler/getelementptr_struct.ll b/test/Assembler/getelementptr_struct.ll
index bfebf29..0293672 100644
--- a/test/Assembler/getelementptr_struct.ll
+++ b/test/Assembler/getelementptr_struct.ll
@@ -1,4 +1,5 @@
-; RUN: not llvm-as < %s >/dev/null |& FileCheck %s
+; RUN: not llvm-as < %s >/dev/null 2> %t
+; RUN: FileCheck %s < %t
; Test the case of an incorrect indices type into a struct
; CHECK: invalid getelementptr indices
diff --git a/test/Assembler/half-constprop.ll b/test/Assembler/half-constprop.ll
new file mode 100644
index 0000000..03ccdda
--- /dev/null
+++ b/test/Assembler/half-constprop.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | opt -O3 | llvm-dis | FileCheck %s
+; Testing half constant propagation.
+
+define half @abc() nounwind {
+entry:
+ %a = alloca half, align 2
+ %b = alloca half, align 2
+ %.compoundliteral = alloca float, align 4
+ store half 0xH4200, half* %a, align 2
+ store half 0xH4B9A, half* %b, align 2
+ %tmp = load half* %a, align 2
+ %tmp1 = load half* %b, align 2
+ %add = fadd half %tmp, %tmp1
+; CHECK: 0xH4C8D
+ ret half %add
+}
+
diff --git a/test/Assembler/half-conv.ll b/test/Assembler/half-conv.ll
new file mode 100644
index 0000000..bf9ae57
--- /dev/null
+++ b/test/Assembler/half-conv.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | opt -O3 | llvm-dis | FileCheck %s
+; Testing half to float conversion.
+
+define float @abc() nounwind {
+entry:
+ %a = alloca half, align 2
+ %.compoundliteral = alloca float, align 4
+ store half 0xH4C8D, half* %a, align 2
+ %tmp = load half* %a, align 2
+ %conv = fpext half %tmp to float
+; CHECK: 0x4032340000000000
+ ret float %conv
+}
diff --git a/test/Assembler/half.ll b/test/Assembler/half.ll
new file mode 100644
index 0000000..63ad392
--- /dev/null
+++ b/test/Assembler/half.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; Basic smoke test for half type.
+
+; CHECK: define half @halftest
+define half @halftest(half %A0) {
+; CHECK: ret half %A0
+ ret half %A0
+}
diff --git a/test/Assembler/insertvalue-invalid-idx.ll b/test/Assembler/insertvalue-invalid-idx.ll
index 355d4e8..74642f4 100644
--- a/test/Assembler/insertvalue-invalid-idx.ll
+++ b/test/Assembler/insertvalue-invalid-idx.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& FileCheck %s
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
; CHECK: invalid indices for insertvalue
diff --git a/test/Assembler/invalid_cast.ll b/test/Assembler/invalid_cast.ll
index f682835..91e81c7 100644
--- a/test/Assembler/invalid_cast.ll
+++ b/test/Assembler/invalid_cast.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& FileCheck %s
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
; CHECK: invalid cast opcode for cast from '<4 x i64>' to '<3 x i8>'
diff --git a/test/Assembler/invalid_cast2.ll b/test/Assembler/invalid_cast2.ll
index a01b935..5ce9546 100644
--- a/test/Assembler/invalid_cast2.ll
+++ b/test/Assembler/invalid_cast2.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& FileCheck %s
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
; CHECK: invalid cast opcode for cast from '<4 x i64>' to 'i8'
diff --git a/test/Assembler/tls-models.ll b/test/Assembler/tls-models.ll
new file mode 100644
index 0000000..42f2496
--- /dev/null
+++ b/test/Assembler/tls-models.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+
+; CHECK: @a = thread_local global i32 0
+; CHECK: @b = thread_local(localdynamic) global i32 0
+; CHECK: @c = thread_local(initialexec) global i32 0
+; CHECK: @d = thread_local(localexec) global i32 0
+
+@a = thread_local global i32 0
+@b = thread_local(localdynamic) global i32 0
+@c = thread_local(initialexec) global i32 0
+@d = thread_local(localexec) global i32 0
diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml
index 9329286..b8eb6d3 100644
--- a/test/Bindings/Ocaml/vmcore.ml
+++ b/test/Bindings/Ocaml/vmcore.ml
@@ -84,7 +84,7 @@ let test_target () =
(*===-- Constants ---------------------------------------------------------===*)
let test_constants () =
- (* RUN: grep {const_int.*i32.*-1} < %t.ll
+ (* RUN: grep "const_int.*i32.*-1" < %t.ll
*)
group "int";
let c = const_int i32_type (-1) in
@@ -92,44 +92,44 @@ let test_constants () =
insist (i32_type = type_of c);
insist (is_constant c);
- (* RUN: grep {const_sext_int.*i64.*-1} < %t.ll
+ (* RUN: grep "const_sext_int.*i64.*-1" < %t.ll
*)
group "sext int";
let c = const_int i64_type (-1) in
ignore (define_global "const_sext_int" c m);
insist (i64_type = type_of c);
- (* RUN: grep {const_zext_int64.*i64.*4294967295} < %t.ll
+ (* RUN: grep "const_zext_int64.*i64.*4294967295" < %t.ll
*)
group "zext int64";
let c = const_of_int64 i64_type (Int64.of_string "4294967295") false in
ignore (define_global "const_zext_int64" c m);
insist (i64_type = type_of c);
- (* RUN: grep {const_int_string.*i32.*-1} < %t.ll
+ (* RUN: grep "const_int_string.*i32.*-1" < %t.ll
*)
group "int string";
let c = const_int_of_string i32_type "-1" 10 in
ignore (define_global "const_int_string" c m);
insist (i32_type = type_of c);
- (* RUN: grep {const_string.*"cruel\\\\00world"} < %t.ll
+ (* RUN: grep 'const_string.*"cruel\00world"' < %t.ll
*)
group "string";
let c = const_string context "cruel\000world" in
ignore (define_global "const_string" c m);
insist ((array_type i8_type 11) = type_of c);
- (* RUN: grep {const_stringz.*"hi\\\\00again\\\\00"} < %t.ll
+ (* RUN: grep 'const_stringz.*"hi\00again\00"' < %t.ll
*)
group "stringz";
let c = const_stringz context "hi\000again" in
ignore (define_global "const_stringz" c m);
insist ((array_type i8_type 9) = type_of c);
- (* RUN: grep {const_single.*2.75} < %t.ll
- * RUN: grep {const_double.*3.1459} < %t.ll
- * RUN: grep {const_double_string.*1.25} < %t.ll
+ (* RUN: grep "const_single.*2.75" < %t.ll
+ * RUN: grep "const_double.*3.1459" < %t.ll
+ * RUN: grep "const_double_string.*1.25" < %t.ll
*)
begin group "real";
let cs = const_float float_type 2.75 in
@@ -150,14 +150,14 @@ let test_constants () =
let three = const_int i32_type 3 in
let four = const_int i32_type 4 in
- (* RUN: grep {const_array.*\\\[i32 3, i32 4\\\]} < %t.ll
+ (* RUN: grep "const_array.*[i32 3, i32 4]" < %t.ll
*)
group "array";
let c = const_array i32_type [| three; four |] in
ignore (define_global "const_array" c m);
insist ((array_type i32_type 2) = (type_of c));
- (* RUN: grep {const_vector.*<i16 1, i16 2.*>} < %t.ll
+ (* RUN: grep "const_vector.*<i16 1, i16 2.*>" < %t.ll
*)
group "vector";
let c = const_vector [| one; two; one; two;
@@ -165,7 +165,7 @@ let test_constants () =
ignore (define_global "const_vector" c m);
insist ((vector_type i16_type 8) = (type_of c));
- (* RUN: grep {const_structure.*.i16 1, i16 2, i32 3, i32 4} < %t.ll
+ (* RUN: grep "const_structure.*.i16 1, i16 2, i32 3, i32 4" < %t.ll
*)
group "structure";
let c = const_struct context [| one; two; three; four |] in
@@ -173,27 +173,27 @@ let test_constants () =
insist ((struct_type context [| i16_type; i16_type; i32_type; i32_type |])
= (type_of c));
- (* RUN: grep {const_null.*zeroinit} < %t.ll
+ (* RUN: grep "const_null.*zeroinit" < %t.ll
*)
group "null";
let c = const_null (packed_struct_type context [| i1_type; i8_type; i64_type;
double_type |]) in
ignore (define_global "const_null" c m);
- (* RUN: grep {const_all_ones.*-1} < %t.ll
+ (* RUN: grep "const_all_ones.*-1" < %t.ll
*)
group "all ones";
let c = const_all_ones i64_type in
ignore (define_global "const_all_ones" c m);
group "pointer null"; begin
- (* RUN: grep {const_pointer_null = global i64\\* null} < %t.ll
+ (* RUN: grep "const_pointer_null = global i64* null" < %t.ll
*)
let c = const_pointer_null (pointer_type i64_type) in
ignore (define_global "const_pointer_null" c m);
end;
- (* RUN: grep {const_undef.*undef} < %t.ll
+ (* RUN: grep "const_undef.*undef" < %t.ll
*)
group "undef";
let c = undef i1_type in
@@ -202,35 +202,35 @@ let test_constants () =
insist (is_undef c);
group "constant arithmetic";
- (* RUN: grep {@const_neg = global i64 sub} < %t.ll
- * RUN: grep {@const_nsw_neg = global i64 sub nsw } < %t.ll
- * RUN: grep {@const_nuw_neg = global i64 sub nuw } < %t.ll
- * RUN: grep {@const_fneg = global double fsub } < %t.ll
- * RUN: grep {@const_not = global i64 xor } < %t.ll
- * RUN: grep {@const_add = global i64 add } < %t.ll
- * RUN: grep {@const_nsw_add = global i64 add nsw } < %t.ll
- * RUN: grep {@const_nuw_add = global i64 add nuw } < %t.ll
- * RUN: grep {@const_fadd = global double fadd } < %t.ll
- * RUN: grep {@const_sub = global i64 sub } < %t.ll
- * RUN: grep {@const_nsw_sub = global i64 sub nsw } < %t.ll
- * RUN: grep {@const_nuw_sub = global i64 sub nuw } < %t.ll
- * RUN: grep {@const_fsub = global double fsub } < %t.ll
- * RUN: grep {@const_mul = global i64 mul } < %t.ll
- * RUN: grep {@const_nsw_mul = global i64 mul nsw } < %t.ll
- * RUN: grep {@const_nuw_mul = global i64 mul nuw } < %t.ll
- * RUN: grep {@const_fmul = global double fmul } < %t.ll
- * RUN: grep {@const_udiv = global i64 udiv } < %t.ll
- * RUN: grep {@const_sdiv = global i64 sdiv } < %t.ll
- * RUN: grep {@const_exact_sdiv = global i64 sdiv exact } < %t.ll
- * RUN: grep {@const_fdiv = global double fdiv } < %t.ll
- * RUN: grep {@const_urem = global i64 urem } < %t.ll
- * RUN: grep {@const_srem = global i64 srem } < %t.ll
- * RUN: grep {@const_frem = global double frem } < %t.ll
- * RUN: grep {@const_and = global i64 and } < %t.ll
- * RUN: grep {@const_or = global i64 or } < %t.ll
- * RUN: grep {@const_xor = global i64 xor } < %t.ll
- * RUN: grep {@const_icmp = global i1 icmp sle } < %t.ll
- * RUN: grep {@const_fcmp = global i1 fcmp ole } < %t.ll
+ (* RUN: grep "@const_neg = global i64 sub" < %t.ll
+ * RUN: grep "@const_nsw_neg = global i64 sub nsw " < %t.ll
+ * RUN: grep "@const_nuw_neg = global i64 sub nuw " < %t.ll
+ * RUN: grep "@const_fneg = global double fsub " < %t.ll
+ * RUN: grep "@const_not = global i64 xor " < %t.ll
+ * RUN: grep "@const_add = global i64 add " < %t.ll
+ * RUN: grep "@const_nsw_add = global i64 add nsw " < %t.ll
+ * RUN: grep "@const_nuw_add = global i64 add nuw " < %t.ll
+ * RUN: grep "@const_fadd = global double fadd " < %t.ll
+ * RUN: grep "@const_sub = global i64 sub " < %t.ll
+ * RUN: grep "@const_nsw_sub = global i64 sub nsw " < %t.ll
+ * RUN: grep "@const_nuw_sub = global i64 sub nuw " < %t.ll
+ * RUN: grep "@const_fsub = global double fsub " < %t.ll
+ * RUN: grep "@const_mul = global i64 mul " < %t.ll
+ * RUN: grep "@const_nsw_mul = global i64 mul nsw " < %t.ll
+ * RUN: grep "@const_nuw_mul = global i64 mul nuw " < %t.ll
+ * RUN: grep "@const_fmul = global double fmul " < %t.ll
+ * RUN: grep "@const_udiv = global i64 udiv " < %t.ll
+ * RUN: grep "@const_sdiv = global i64 sdiv " < %t.ll
+ * RUN: grep "@const_exact_sdiv = global i64 sdiv exact " < %t.ll
+ * RUN: grep "@const_fdiv = global double fdiv " < %t.ll
+ * RUN: grep "@const_urem = global i64 urem " < %t.ll
+ * RUN: grep "@const_srem = global i64 srem " < %t.ll
+ * RUN: grep "@const_frem = global double frem " < %t.ll
+ * RUN: grep "@const_and = global i64 and " < %t.ll
+ * RUN: grep "@const_or = global i64 or " < %t.ll
+ * RUN: grep "@const_xor = global i64 xor " < %t.ll
+ * RUN: grep "@const_icmp = global i1 icmp sle " < %t.ll
+ * RUN: grep "@const_fcmp = global i1 fcmp ole " < %t.ll
*)
let void_ptr = pointer_type i8_type in
let five = const_int i64_type 5 in
@@ -269,18 +269,18 @@ let test_constants () =
ignore (define_global "const_fcmp" (const_fcmp Fcmp.Ole ffoldbomb ffive) m);
group "constant casts";
- (* RUN: grep {const_trunc.*trunc} < %t.ll
- * RUN: grep {const_sext.*sext} < %t.ll
- * RUN: grep {const_zext.*zext} < %t.ll
- * RUN: grep {const_fptrunc.*fptrunc} < %t.ll
- * RUN: grep {const_fpext.*fpext} < %t.ll
- * RUN: grep {const_uitofp.*uitofp} < %t.ll
- * RUN: grep {const_sitofp.*sitofp} < %t.ll
- * RUN: grep {const_fptoui.*fptoui} < %t.ll
- * RUN: grep {const_fptosi.*fptosi} < %t.ll
- * RUN: grep {const_ptrtoint.*ptrtoint} < %t.ll
- * RUN: grep {const_inttoptr.*inttoptr} < %t.ll
- * RUN: grep {const_bitcast.*bitcast} < %t.ll
+ (* RUN: grep "const_trunc.*trunc" < %t.ll
+ * RUN: grep "const_sext.*sext" < %t.ll
+ * RUN: grep "const_zext.*zext" < %t.ll
+ * RUN: grep "const_fptrunc.*fptrunc" < %t.ll
+ * RUN: grep "const_fpext.*fpext" < %t.ll
+ * RUN: grep "const_uitofp.*uitofp" < %t.ll
+ * RUN: grep "const_sitofp.*sitofp" < %t.ll
+ * RUN: grep "const_fptoui.*fptoui" < %t.ll
+ * RUN: grep "const_fptosi.*fptosi" < %t.ll
+ * RUN: grep "const_ptrtoint.*ptrtoint" < %t.ll
+ * RUN: grep "const_inttoptr.*inttoptr" < %t.ll
+ * RUN: grep "const_bitcast.*bitcast" < %t.ll
*)
let i128_type = integer_type context 128 in
ignore (define_global "const_trunc" (const_trunc (const_add foldbomb five)
@@ -302,12 +302,12 @@ let test_constants () =
ignore (define_global "const_bitcast" (const_bitcast ffoldbomb i64_type) m);
group "misc constants";
- (* RUN: grep {const_size_of.*getelementptr.*null} < %t.ll
- * RUN: grep {const_gep.*getelementptr} < %t.ll
- * RUN: grep {const_select.*select} < %t.ll
- * RUN: grep {const_extractelement.*extractelement} < %t.ll
- * RUN: grep {const_insertelement.*insertelement} < %t.ll
- * RUN: grep {const_shufflevector = global <4 x i32> <i32 0, i32 1, i32 1, i32 0>} < %t.ll
+ (* RUN: grep "const_size_of.*getelementptr.*null" < %t.ll
+ * RUN: grep "const_gep.*getelementptr" < %t.ll
+ * RUN: grep "const_select.*select" < %t.ll
+ * RUN: grep "const_extractelement.*extractelement" < %t.ll
+ * RUN: grep "const_insertelement.*insertelement" < %t.ll
+ * RUN: grep "const_shufflevector = global <4 x i32> <i32 0, i32 1, i32 1, i32 0>" < %t.ll
*)
ignore (define_global "const_size_of" (size_of (pointer_type i8_type)) m);
ignore (define_global "const_gep" (const_gep foldbomb_gv [| five |]) m);
@@ -356,7 +356,7 @@ let test_global_values () =
let (++) x f = f x; x in
let zero32 = const_null i32_type in
- (* RUN: grep {GVal01} < %t.ll
+ (* RUN: grep "GVal01" < %t.ll
*)
group "naming";
let g = define_global "TEMPORARY" zero32 m in
@@ -364,28 +364,28 @@ let test_global_values () =
set_value_name "GVal01" g;
insist ("GVal01" = value_name g);
- (* RUN: grep {GVal02.*linkonce} < %t.ll
+ (* RUN: grep "GVal02.*linkonce" < %t.ll
*)
group "linkage";
let g = define_global "GVal02" zero32 m ++
set_linkage Linkage.Link_once in
insist (Linkage.Link_once = linkage g);
- (* RUN: grep {GVal03.*Hanalei} < %t.ll
+ (* RUN: grep "GVal03.*Hanalei" < %t.ll
*)
group "section";
let g = define_global "GVal03" zero32 m ++
set_section "Hanalei" in
insist ("Hanalei" = section g);
- (* RUN: grep {GVal04.*hidden} < %t.ll
+ (* RUN: grep "GVal04.*hidden" < %t.ll
*)
group "visibility";
let g = define_global "GVal04" zero32 m ++
set_visibility Visibility.Hidden in
insist (Visibility.Hidden = visibility g);
- (* RUN: grep {GVal05.*align 128} < %t.ll
+ (* RUN: grep "GVal05.*align 128" < %t.ll
*)
group "alignment";
let g = define_global "GVal05" zero32 m ++
@@ -400,7 +400,7 @@ let test_global_variables () =
let fourty_two32 = const_int i32_type 42 in
group "declarations"; begin
- (* RUN: grep {GVar01.*external} < %t.ll
+ (* RUN: grep "GVar01.*external" < %t.ll
*)
insist (None == lookup_global "GVar01" m);
let g = declare_global i32_type "GVar01" m in
@@ -422,8 +422,8 @@ let test_global_variables () =
end;
group "definitions"; begin
- (* RUN: grep {GVar02.*42} < %t.ll
- * RUN: grep {GVar03.*42} < %t.ll
+ (* RUN: grep "GVar02.*42" < %t.ll
+ * RUN: grep "GVar03.*42" < %t.ll
*)
let g = define_global "GVar02" fourty_two32 m in
let g2 = declare_global i32_type "GVar03" m ++
@@ -440,20 +440,20 @@ let test_global_variables () =
insist ((global_initializer g) == (global_initializer g2));
end;
- (* RUN: grep {GVar04.*thread_local} < %t.ll
+ (* RUN: grep "GVar04.*thread_local" < %t.ll
*)
group "threadlocal";
let g = define_global "GVar04" fourty_two32 m ++
set_thread_local true in
insist (is_thread_local g);
- (* RUN: grep -v {GVar05} < %t.ll
+ (* RUN: grep -v "GVar05" < %t.ll
*)
group "delete";
let g = define_global "GVar05" fourty_two32 m in
delete_global g;
- (* RUN: grep -v {ConstGlobalVar.*constant} < %t.ll
+ (* RUN: grep -v "ConstGlobalVar.*constant" < %t.ll
*)
group "constant";
let g = define_global "ConstGlobalVar" fourty_two32 m in
@@ -542,7 +542,7 @@ let test_users () =
(*===-- Aliases -----------------------------------------------------------===*)
let test_aliases () =
- (* RUN: grep {@alias = alias i32\\* @aliasee} < %t.ll
+ (* RUN: grep "@alias = alias i32* @aliasee" < %t.ll
*)
let v = declare_global i32_type "aliasee" m in
ignore (add_alias m (pointer_type i32_type) v "alias")
@@ -554,7 +554,7 @@ let test_functions () =
let ty = function_type i32_type [| i32_type; i64_type |] in
let ty2 = function_type i8_type [| i8_type; i64_type |] in
- (* RUN: grep {declare i32 @Fn1\(i32, i64\)} < %t.ll
+ (* RUN: grep "declare i32 @Fn1\(i32, i64\)" < %t.ll
*)
begin group "declare";
insist (None = lookup_function "Fn1" m);
@@ -570,13 +570,13 @@ let test_functions () =
insist (m == global_parent fn)
end;
- (* RUN: grep -v {Fn2} < %t.ll
+ (* RUN: grep -v "Fn2" < %t.ll
*)
group "delete";
let fn = declare_function "Fn2" ty m in
delete_function fn;
- (* RUN: grep {define.*Fn3} < %t.ll
+ (* RUN: grep "define.*Fn3" < %t.ll
*)
group "define";
let fn = define_function "Fn3" ty m in
@@ -584,7 +584,7 @@ let test_functions () =
insist (1 = Array.length (basic_blocks fn));
ignore (build_unreachable (builder_at_end context (entry_block fn)));
- (* RUN: grep {define.*Fn4.*Param1.*Param2} < %t.ll
+ (* RUN: grep "define.*Fn4.*Param1.*Param2" < %t.ll
*)
group "params";
let fn = define_function "Fn4" ty m in
@@ -598,7 +598,7 @@ let test_functions () =
set_value_name "Param2" params.(1);
ignore (build_unreachable (builder_at_end context (entry_block fn)));
- (* RUN: grep {fastcc.*Fn5} < %t.ll
+ (* RUN: grep "fastcc.*Fn5" < %t.ll
*)
group "callconv";
let fn = define_function "Fn5" ty m in
@@ -608,7 +608,7 @@ let test_functions () =
ignore (build_unreachable (builder_at_end context (entry_block fn)));
begin group "gc";
- (* RUN: grep {Fn6.*gc.*shadowstack} < %t.ll
+ (* RUN: grep "Fn6.*gc.*shadowstack" < %t.ll
*)
let fn = define_function "Fn6" ty m in
insist (None = gc fn);
@@ -694,7 +694,7 @@ let test_params () =
let test_basic_blocks () =
let ty = function_type void_type [| |] in
- (* RUN: grep {Bb1} < %t.ll
+ (* RUN: grep "Bb1" < %t.ll
*)
group "entry";
let fn = declare_function "X" ty m in
@@ -825,7 +825,7 @@ let test_builder () =
group "ret void";
begin
- (* RUN: grep {ret void} < %t.ll
+ (* RUN: grep "ret void" < %t.ll
*)
let fty = function_type void_type [| |] in
let fn = declare_function "X6" fty m in
@@ -835,7 +835,7 @@ let test_builder () =
group "ret aggregate";
begin
- (* RUN: grep {ret \{ i8, i64 \} \{ i8 4, i64 5 \}} < %t.ll
+ (* RUN: grep "ret { i8, i64 } { i8 4, i64 5 }" < %t.ll
*)
let sty = struct_type context [| i8_type; i64_type |] in
let fty = function_type sty [| |] in
@@ -895,14 +895,14 @@ let test_builder () =
end;
group "ret"; begin
- (* RUN: grep {ret.*P1} < %t.ll
+ (* RUN: grep "ret.*P1" < %t.ll
*)
let ret = build_ret p1 atentry in
position_before ret atentry
end;
group "br"; begin
- (* RUN: grep {br.*Bb02} < %t.ll
+ (* RUN: grep "br.*Bb02" < %t.ll
*)
let bb02 = append_block context "Bb02" fn in
let b = builder_at_end context bb02 in
@@ -910,7 +910,7 @@ let test_builder () =
end;
group "cond_br"; begin
- (* RUN: grep {br.*build_br.*Bb03.*Bb00} < %t.ll
+ (* RUN: grep "br.*build_br.*Bb03.*Bb00" < %t.ll
*)
let bb03 = append_block context "Bb03" fn in
let b = builder_at_end context bb03 in
@@ -919,8 +919,8 @@ let test_builder () =
end;
group "switch"; begin
- (* RUN: grep {switch.*P1.*SwiBlock3} < %t.ll
- * RUN: grep {2,.*SwiBlock2} < %t.ll
+ (* RUN: grep "switch.*P1.*SwiBlock3" < %t.ll
+ * RUN: grep "2,.*SwiBlock2" < %t.ll
*)
let bb1 = append_block context "SwiBlock1" fn in
let bb2 = append_block context "SwiBlock2" fn in
@@ -934,9 +934,9 @@ let test_builder () =
end;
group "malloc/free"; begin
- (* RUN: grep {call.*@malloc(i32 ptrtoint} < %t.ll
- * RUN: grep {call.*@free(i8\*} < %t.ll
- * RUN: grep {call.*@malloc(i32 %} < %t.ll
+ (* RUN: grep "call.*@malloc(i32 ptrtoint" < %t.ll
+ * RUN: grep "call.*@free(i8*" < %t.ll
+ * RUN: grep "call.*@malloc(i32 %" < %t.ll
*)
let bb1 = append_block context "MallocBlock1" fn in
let m1 = (build_malloc (pointer_type i32_type) "m1"
@@ -947,7 +947,7 @@ let test_builder () =
end;
group "indirectbr"; begin
- (* RUN: grep {indirectbr i8\\* blockaddress(@X7, %IBRBlock2), \\\[label %IBRBlock2, label %IBRBlock3\\\]} < %t.ll
+ (* RUN: grep "indirectbr i8* blockaddress(@X7, %IBRBlock2), [label %IBRBlock2, label %IBRBlock3]" < %t.ll
*)
let bb1 = append_block context "IBRBlock1" fn in
@@ -964,8 +964,8 @@ let test_builder () =
end;
group "invoke"; begin
- (* RUN: grep {build_invoke.*invoke.*P1.*P2} < %t.ll
- * RUN: grep {to.*Bb04.*unwind.*Bblpad} < %t.ll
+ (* RUN: grep "build_invoke.*invoke.*P1.*P2" < %t.ll
+ * RUN: grep "to.*Bb04.*unwind.*Bblpad" < %t.ll
*)
let bb04 = append_block context "Bb04" fn in
let b = builder_at_end context bb04 in
@@ -973,7 +973,7 @@ let test_builder () =
end;
group "unreachable"; begin
- (* RUN: grep {unreachable} < %t.ll
+ (* RUN: grep "unreachable" < %t.ll
*)
let bb06 = append_block context "Bb06" fn in
let b = builder_at_end context bb06 in
@@ -984,36 +984,36 @@ let test_builder () =
let bb07 = append_block context "Bb07" fn in
let b = builder_at_end context bb07 in
- (* RUN: grep {%build_add = add i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_nsw_add = add nsw i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_nuw_add = add nuw i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_fadd = fadd float %F1, %F2} < %t.ll
- * RUN: grep {%build_sub = sub i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_nsw_sub = sub nsw i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_nuw_sub = sub nuw i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_fsub = fsub float %F1, %F2} < %t.ll
- * RUN: grep {%build_mul = mul i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_nsw_mul = mul nsw i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_nuw_mul = mul nuw i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_fmul = fmul float %F1, %F2} < %t.ll
- * RUN: grep {%build_udiv = udiv i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_sdiv = sdiv i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_exact_sdiv = sdiv exact i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_fdiv = fdiv float %F1, %F2} < %t.ll
- * RUN: grep {%build_urem = urem i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_srem = srem i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_frem = frem float %F1, %F2} < %t.ll
- * RUN: grep {%build_shl = shl i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_lshl = lshr i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_ashl = ashr i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_and = and i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_or = or i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_xor = xor i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_neg = sub i32 0, %P1} < %t.ll
- * RUN: grep {%build_nsw_neg = sub nsw i32 0, %P1} < %t.ll
- * RUN: grep {%build_nuw_neg = sub nuw i32 0, %P1} < %t.ll
- * RUN: grep {%build_fneg = fsub float .*0.*, %F1} < %t.ll
- * RUN: grep {%build_not = xor i32 %P1, -1} < %t.ll
+ (* RUN: grep "%build_add = add i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_nsw_add = add nsw i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_nuw_add = add nuw i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_fadd = fadd float %F1, %F2" < %t.ll
+ * RUN: grep "%build_sub = sub i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_nsw_sub = sub nsw i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_nuw_sub = sub nuw i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_fsub = fsub float %F1, %F2" < %t.ll
+ * RUN: grep "%build_mul = mul i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_nsw_mul = mul nsw i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_nuw_mul = mul nuw i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_fmul = fmul float %F1, %F2" < %t.ll
+ * RUN: grep "%build_udiv = udiv i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_sdiv = sdiv i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_exact_sdiv = sdiv exact i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_fdiv = fdiv float %F1, %F2" < %t.ll
+ * RUN: grep "%build_urem = urem i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_srem = srem i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_frem = frem float %F1, %F2" < %t.ll
+ * RUN: grep "%build_shl = shl i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_lshl = lshr i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_ashl = ashr i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_and = and i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_or = or i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_xor = xor i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_neg = sub i32 0, %P1" < %t.ll
+ * RUN: grep "%build_nsw_neg = sub nsw i32 0, %P1" < %t.ll
+ * RUN: grep "%build_nuw_neg = sub nuw i32 0, %P1" < %t.ll
+ * RUN: grep "%build_fneg = fsub float .*0.*, %F1" < %t.ll
+ * RUN: grep "%build_not = xor i32 %P1, -1" < %t.ll
*)
ignore (build_add p1 p2 "build_add" b);
ignore (build_nsw_add p1 p2 "build_nsw_add" b);
@@ -1052,13 +1052,13 @@ let test_builder () =
let bb08 = append_block context "Bb08" fn in
let b = builder_at_end context bb08 in
- (* RUN: grep {%build_alloca = alloca i32} < %t.ll
- * RUN: grep {%build_array_alloca = alloca i32, i32 %P2} < %t.ll
- * RUN: grep {%build_load = load i32\\* %build_array_alloca} < %t.ll
- * RUN: grep {store i32 %P2, i32\\* %build_alloca} < %t.ll
- * RUN: grep {%build_gep = getelementptr i32\\* %build_array_alloca, i32 %P2} < %t.ll
- * RUN: grep {%build_in_bounds_gep = getelementptr inbounds i32\\* %build_array_alloca, i32 %P2} < %t.ll
- * RUN: grep {%build_struct_gep = getelementptr inbounds.*%build_alloca2, i32 0, i32 1} < %t.ll
+ (* RUN: grep "%build_alloca = alloca i32" < %t.ll
+ * RUN: grep "%build_array_alloca = alloca i32, i32 %P2" < %t.ll
+ * RUN: grep "%build_load = load i32* %build_array_alloca" < %t.ll
+ * RUN: grep "store i32 %P2, i32* %build_alloca" < %t.ll
+ * RUN: grep "%build_gep = getelementptr i32* %build_array_alloca, i32 %P2" < %t.ll
+ * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2" < %t.ll
+ * RUN: grep "%build_struct_gep = getelementptr inbounds.*%build_alloca2, i32 0, i32 1" < %t.ll
*)
let alloca = build_alloca i32_type "build_alloca" b in
let array_alloca = build_array_alloca i32_type p2 "build_array_alloca" b in
@@ -1090,30 +1090,30 @@ let test_builder () =
group "casts"; begin
let void_ptr = pointer_type i8_type in
- (* RUN: grep {%build_trunc = trunc i32 %P1 to i8} < %t.ll
- * RUN: grep {%build_trunc2 = trunc i32 %P1 to i8} < %t.ll
- * RUN: grep {%build_trunc3 = trunc i32 %P1 to i8} < %t.ll
- * RUN: grep {%build_zext = zext i8 %build_trunc to i32} < %t.ll
- * RUN: grep {%build_zext2 = zext i8 %build_trunc to i32} < %t.ll
- * RUN: grep {%build_sext = sext i32 %build_zext to i64} < %t.ll
- * RUN: grep {%build_sext2 = sext i32 %build_zext to i64} < %t.ll
- * RUN: grep {%build_sext3 = sext i32 %build_zext to i64} < %t.ll
- * RUN: grep {%build_uitofp = uitofp i64 %build_sext to float} < %t.ll
- * RUN: grep {%build_sitofp = sitofp i32 %build_zext to double} < %t.ll
- * RUN: grep {%build_fptoui = fptoui float %build_uitofp to i32} < %t.ll
- * RUN: grep {%build_fptosi = fptosi double %build_sitofp to i64} < %t.ll
- * RUN: grep {%build_fptrunc = fptrunc double %build_sitofp to float} < %t.ll
- * RUN: grep {%build_fptrunc2 = fptrunc double %build_sitofp to float} < %t.ll
- * RUN: grep {%build_fpext = fpext float %build_fptrunc to double} < %t.ll
- * RUN: grep {%build_fpext2 = fpext float %build_fptrunc to double} < %t.ll
- * RUN: grep {%build_inttoptr = inttoptr i32 %P1 to i8\\*} < %t.ll
- * RUN: grep {%build_ptrtoint = ptrtoint i8\\* %build_inttoptr to i64} < %t.ll
- * RUN: grep {%build_ptrtoint2 = ptrtoint i8\\* %build_inttoptr to i64} < %t.ll
- * RUN: grep {%build_bitcast = bitcast i64 %build_ptrtoint to double} < %t.ll
- * RUN: grep {%build_bitcast2 = bitcast i64 %build_ptrtoint to double} < %t.ll
- * RUN: grep {%build_bitcast3 = bitcast i64 %build_ptrtoint to double} < %t.ll
- * RUN: grep {%build_bitcast4 = bitcast i64 %build_ptrtoint to double} < %t.ll
- * RUN: grep {%build_pointercast = bitcast i8\\* %build_inttoptr to i16\\*} < %t.ll
+ (* RUN: grep "%build_trunc = trunc i32 %P1 to i8" < %t.ll
+ * RUN: grep "%build_trunc2 = trunc i32 %P1 to i8" < %t.ll
+ * RUN: grep "%build_trunc3 = trunc i32 %P1 to i8" < %t.ll
+ * RUN: grep "%build_zext = zext i8 %build_trunc to i32" < %t.ll
+ * RUN: grep "%build_zext2 = zext i8 %build_trunc to i32" < %t.ll
+ * RUN: grep "%build_sext = sext i32 %build_zext to i64" < %t.ll
+ * RUN: grep "%build_sext2 = sext i32 %build_zext to i64" < %t.ll
+ * RUN: grep "%build_sext3 = sext i32 %build_zext to i64" < %t.ll
+ * RUN: grep "%build_uitofp = uitofp i64 %build_sext to float" < %t.ll
+ * RUN: grep "%build_sitofp = sitofp i32 %build_zext to double" < %t.ll
+ * RUN: grep "%build_fptoui = fptoui float %build_uitofp to i32" < %t.ll
+ * RUN: grep "%build_fptosi = fptosi double %build_sitofp to i64" < %t.ll
+ * RUN: grep "%build_fptrunc = fptrunc double %build_sitofp to float" < %t.ll
+ * RUN: grep "%build_fptrunc2 = fptrunc double %build_sitofp to float" < %t.ll
+ * RUN: grep "%build_fpext = fpext float %build_fptrunc to double" < %t.ll
+ * RUN: grep "%build_fpext2 = fpext float %build_fptrunc to double" < %t.ll
+ * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8*" < %t.ll
+ * RUN: grep "%build_ptrtoint = ptrtoint i8* %build_inttoptr to i64" < %t.ll
+ * RUN: grep "%build_ptrtoint2 = ptrtoint i8* %build_inttoptr to i64" < %t.ll
+ * RUN: grep "%build_bitcast = bitcast i64 %build_ptrtoint to double" < %t.ll
+ * RUN: grep "%build_bitcast2 = bitcast i64 %build_ptrtoint to double" < %t.ll
+ * RUN: grep "%build_bitcast3 = bitcast i64 %build_ptrtoint to double" < %t.ll
+ * RUN: grep "%build_bitcast4 = bitcast i64 %build_ptrtoint to double" < %t.ll
+ * RUN: grep "%build_pointercast = bitcast i8* %build_inttoptr to i16*" < %t.ll
*)
let inst28 = build_trunc p1 i8_type "build_trunc" atentry in
let inst29 = build_zext inst28 i32_type "build_zext" atentry in
@@ -1143,13 +1143,13 @@ let test_builder () =
end;
group "comparisons"; begin
- (* RUN: grep {%build_icmp_ne = icmp ne i32 %P1, %P2} < %t.ll
- * RUN: grep {%build_icmp_sle = icmp sle i32 %P2, %P1} < %t.ll
- * RUN: grep {%build_fcmp_false = fcmp false float %F1, %F2} < %t.ll
- * RUN: grep {%build_fcmp_true = fcmp true float %F2, %F1} < %t.ll
- * RUN: grep {%build_is_null.*= icmp eq.*%X0,.*null} < %t.ll
- * RUN: grep {%build_is_not_null = icmp ne i8\\* %X1, null} < %t.ll
- * RUN: grep {%build_ptrdiff} < %t.ll
+ (* RUN: grep "%build_icmp_ne = icmp ne i32 %P1, %P2" < %t.ll
+ * RUN: grep "%build_icmp_sle = icmp sle i32 %P2, %P1" < %t.ll
+ * RUN: grep "%build_fcmp_false = fcmp false float %F1, %F2" < %t.ll
+ * RUN: grep "%build_fcmp_true = fcmp true float %F2, %F1" < %t.ll
+ * RUN: grep "%build_is_null.*= icmp eq.*%X0,.*null" < %t.ll
+ * RUN: grep "%build_is_not_null = icmp ne i8* %X1, null" < %t.ll
+ * RUN: grep "%build_ptrdiff" < %t.ll
*)
ignore (build_icmp Icmp.Ne p1 p2 "build_icmp_ne" atentry);
ignore (build_icmp Icmp.Sle p2 p1 "build_icmp_sle" atentry);
@@ -1165,14 +1165,14 @@ let test_builder () =
end;
group "miscellaneous"; begin
- (* RUN: grep {%build_call = tail call cc63 i32 @.*(i32 signext %P2, i32 %P1)} < %t.ll
- * RUN: grep {%build_select = select i1 %build_icmp, i32 %P1, i32 %P2} < %t.ll
- * RUN: grep {%build_va_arg = va_arg i8\\*\\* null, i32} < %t.ll
- * RUN: grep {%build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2} < %t.ll
- * RUN: grep {%build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2} < %t.ll
- * RUN: grep {%build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>} < %t.ll
- * RUN: grep {%build_insertvalue0 = insertvalue.*%bl, i32 1, 0} < %t.ll
- * RUN: grep {%build_extractvalue = extractvalue.*%build_insertvalue1, 1} < %t.ll
+ (* RUN: grep "%build_call = tail call cc63 i32 @.*(i32 signext %P2, i32 %P1)" < %t.ll
+ * RUN: grep "%build_select = select i1 %build_icmp, i32 %P1, i32 %P2" < %t.ll
+ * RUN: grep "%build_va_arg = va_arg i8** null, i32" < %t.ll
+ * RUN: grep "%build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2" < %t.ll
+ * RUN: grep "%build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2" < %t.ll
+ * RUN: grep "%build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>" < %t.ll
+ * RUN: grep "%build_insertvalue0 = insertvalue.*%bl, i32 1, 0" < %t.ll
+ * RUN: grep "%build_extractvalue = extractvalue.*%build_insertvalue1, 1" < %t.ll
*)
let ci = build_call fn [| p2; p1 |] "build_call" atentry in
insist (CallConv.c = instruction_call_conv ci);
@@ -1215,8 +1215,8 @@ let test_builder () =
end;
group "metadata"; begin
- (* RUN: grep {%metadata = add i32 %P1, %P2, !test !0} < %t.ll
- * RUN: grep {!0 = metadata !\{i32 1, metadata !"metadata test"\}} < %t.ll
+ (* RUN: grep '%metadata = add i32 %P1, %P2, !test !0' < %t.ll
+ * RUN: grep '!0 = metadata !{i32 1, metadata !"metadata test"}' < %t.ll
*)
let i = build_add p1 p2 "metadata" atentry in
insist ((has_metadata i) = false);
@@ -1240,8 +1240,8 @@ let test_builder () =
end;
group "dbg"; begin
- (* RUN: grep {%dbg = add i32 %P1, %P2, !dbg !1} < %t.ll
- * RUN: grep {!1 = metadata !\{i32 2, i32 3, metadata !2, metadata !2\}} < %t.ll
+ (* RUN: grep "%dbg = add i32 %P1, %P2, !dbg !1" < %t.ll
+ * RUN: grep "!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}" < %t.ll
*)
insist ((current_debug_location atentry) = None);
@@ -1261,7 +1261,7 @@ let test_builder () =
end;
group "phi"; begin
- (* RUN: grep {PhiNode.*P1.*PhiBlock1.*P2.*PhiBlock2} < %t.ll
+ (* RUN: grep "PhiNode.*P1.*PhiBlock1.*P2.*PhiBlock2" < %t.ll
*)
let b1 = append_block context "PhiBlock1" fn in
let b2 = append_block context "PhiBlock2" fn in
diff --git a/test/Bitcode/2012-05-07-SwitchInstRangesSupport.ll b/test/Bitcode/2012-05-07-SwitchInstRangesSupport.ll
new file mode 100644
index 0000000..583b9a8
--- /dev/null
+++ b/test/Bitcode/2012-05-07-SwitchInstRangesSupport.ll
@@ -0,0 +1,33 @@
+; RUN: rm -f %t.bc
+; RUN: rm -f %t.ll
+; RUN: rm -f %t2.bc
+; RUN: rm -f %t2.ll
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-dis %t.bc -o - | tail -n +2 > %t.ll
+; RUN: llvm-as %t.ll -o %t2.bc
+; RUN: llvm-dis %t2.bc -o - | tail -n +2 > %t2.ll
+; RUN: llvm-diff %t.ll %t2.ll
+
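+; Assemble and disassemble this file twice (tail strips the ModuleID header)
+; and use llvm-diff to verify that the switch instructions, including the one
+; with i256 case values, survive the bitcode round trip unchanged.
+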
+define void @test() {
+ %mem = alloca i32
+ store i32 2, i32* %mem
+ %c = load i32* %mem
+ switch i32 %c, label %exit [
+ i32 1, label %exit
+ i32 2, label %exit
+ ]
+exit:
+ ret void
+}
+define void @test_wide() {
+ %mem = alloca i256
+ store i256 2, i256* %mem
+ %c = load i256* %mem
+ switch i256 %c, label %exit [
+ i256 123456789012345678901234567890, label %exit
+ i256 2, label %exit
+ ]
+exit:
+ ret void
+}
+
diff --git a/test/Bitcode/arm32_neon_vcnt_upgrade.ll b/test/Bitcode/arm32_neon_vcnt_upgrade.ll
new file mode 100644
index 0000000..10b9284
--- /dev/null
+++ b/test/Bitcode/arm32_neon_vcnt_upgrade.ll
@@ -0,0 +1,21 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; Tests vclz and vcnt
+
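+; The ARM-specific llvm.arm.neon.vclz and vcnt intrinsics should be
+; auto-upgraded to the generic llvm.ctlz and llvm.ctpop intrinsics, as the
+; CHECK lines verify.
+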
+define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
+;CHECK: @vclz16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
+;CHECK: {{call.*@llvm.ctlz.v4i16\(<4 x i16>.*, i1 false}}
+ ret <4 x i16> %tmp2
+}
+
+define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
+;CHECK: @vcnt8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
+;CHECK: call <8 x i8> @llvm.ctpop.v8i8(<8 x i8>
+ ret <8 x i8> %tmp2
+}
+
+declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll
new file mode 100644
index 0000000..502e967
--- /dev/null
+++ b/test/Bitcode/attributes.ll
@@ -0,0 +1,164 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; PR12696
+
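+; Each function below carries a single function or parameter attribute; the
+; llvm-as | llvm-dis round trip must print it back verbatim.
+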
+define void @f1(i8 zeroext)
+; CHECK: define void @f1(i8 zeroext)
+{
+ ret void;
+}
+
+define void @f2(i8 signext)
+; CHECK: define void @f2(i8 signext)
+{
+ ret void;
+}
+
+define void @f3() noreturn
+; CHECK: define void @f3() noreturn
+{
+ ret void;
+}
+
+define void @f4(i8 inreg)
+; CHECK: define void @f4(i8 inreg)
+{
+ ret void;
+}
+
+define void @f5(i8* sret)
+; CHECK: define void @f5(i8* sret)
+{
+ ret void;
+}
+
+define void @f6() nounwind
+; CHECK: define void @f6() nounwind
+{
+ ret void;
+}
+
+define void @f7(i8* noalias)
+; CHECK: define void @f7(i8* noalias)
+{
+ ret void;
+}
+
+define void @f8(i8* byval)
+; CHECK: define void @f8(i8* byval)
+{
+ ret void;
+}
+
+define void @f9(i8* nest)
+; CHECK: define void @f9(i8* nest)
+{
+ ret void;
+}
+
+define void @f10() readnone
+; CHECK: define void @f10() readnone
+{
+ ret void;
+}
+
+define void @f11() readonly
+; CHECK: define void @f11() readonly
+{
+ ret void;
+}
+
+define void @f12() noinline
+; CHECK: define void @f12() noinline
+{
+ ret void;
+}
+
+define void @f13() alwaysinline
+; CHECK: define void @f13() alwaysinline
+{
+ ret void;
+}
+
+define void @f14() optsize
+; CHECK: define void @f14() optsize
+{
+ ret void;
+}
+
+define void @f15() ssp
+; CHECK: define void @f15() ssp
+{
+ ret void;
+}
+
+define void @f16() sspreq
+; CHECK: define void @f16() sspreq
+{
+ ret void;
+}
+
+define void @f17(i8 align 4)
+; CHECK: define void @f17(i8 align 4)
+{
+ ret void;
+}
+
+define void @f18(i8* nocapture)
+; CHECK: define void @f18(i8* nocapture)
+{
+ ret void;
+}
+
+define void @f19() noredzone
+; CHECK: define void @f19() noredzone
+{
+ ret void;
+}
+
+define void @f20() noimplicitfloat
+; CHECK: define void @f20() noimplicitfloat
+{
+ ret void;
+}
+
+define void @f21() naked
+; CHECK: define void @f21() naked
+{
+ ret void;
+}
+
+define void @f22() inlinehint
+; CHECK: define void @f22() inlinehint
+{
+ ret void;
+}
+
+define void @f23() alignstack(4)
+; CHECK: define void @f23() alignstack(4)
+{
+ ret void;
+}
+
+define void @f24() returns_twice
+; CHECK: define void @f24() returns_twice
+{
+ ret void;
+}
+
+define void @f25() uwtable
+; CHECK: define void @f25() uwtable
+{
+ ret void;
+}
+
+define void @f26() nonlazybind
+; CHECK: define void @f26() nonlazybind
+{
+ ret void;
+}
+
+define void @f27() address_safety
+; CHECK: define void @f27() address_safety
+{
+ ret void;
+}
diff --git a/test/Bitcode/null-type.ll b/test/Bitcode/null-type.ll
index b972753..8502b0d 100644
--- a/test/Bitcode/null-type.ll
+++ b/test/Bitcode/null-type.ll
@@ -1,4 +1,5 @@
-; RUN: not llvm-dis < %s.bc > /dev/null |& FileCheck %s
+; RUN: not llvm-dis < %s.bc > /dev/null 2> %t
+; RUN: FileCheck %s < %t
; PR8494
; CHECK: Invalid MODULE_CODE_FUNCTION record
diff --git a/test/Bitcode/ptest-new.ll b/test/Bitcode/ptest-new.ll
new file mode 100644
index 0000000..276fb7a
--- /dev/null
+++ b/test/Bitcode/ptest-new.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
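+; These calls already use the <2 x i64> overloads of the SSE4.1 ptest
+; intrinsics, so they must pass through the assembler and disassembler
+; unchanged.
+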
+define i32 @foo(<2 x i64> %bar) nounwind {
+entry:
+; CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64>
+ %res1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %bar, <2 x i64> %bar)
+; CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64>
+ %res2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %bar, <2 x i64> %bar)
+; CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64>
+ %res3 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %bar, <2 x i64> %bar)
+ %add1 = add i32 %res1, %res2
+ %add2 = add i32 %add1, %res2
+ ret i32 %add2
+}
+
+; CHECK: declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+; CHECK: declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+; CHECK: declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/Bitcode/ptest-old.ll b/test/Bitcode/ptest-old.ll
new file mode 100644
index 0000000..fc6ed8e
--- /dev/null
+++ b/test/Bitcode/ptest-old.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
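+; The calls and declarations below use the old <4 x float> overloads of the
+; SSE4.1 ptest intrinsics; the CHECK lines verify that they are auto-upgraded
+; to the <2 x i64> form.
+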
+define i32 @foo(<4 x float> %bar) nounwind {
+entry:
+; CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64>
+ %res1 = call i32 @llvm.x86.sse41.ptestc(<4 x float> %bar, <4 x float> %bar)
+; CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64>
+ %res2 = call i32 @llvm.x86.sse41.ptestz(<4 x float> %bar, <4 x float> %bar)
+; CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64>
+ %res3 = call i32 @llvm.x86.sse41.ptestnzc(<4 x float> %bar, <4 x float> %bar)
+ %add1 = add i32 %res1, %res2
+ %add2 = add i32 %add1, %res2
+ ret i32 %add2
+}
+
+; CHECK: declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+; CHECK: declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+; CHECK: declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare i32 @llvm.x86.sse41.ptestc(<4 x float>, <4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse41.ptestz(<4 x float>, <4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse41.ptestnzc(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8cebb7c..991cc9d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,81 +1,30 @@
-foreach(c ${LLVM_TARGETS_TO_BUILD})
- set(TARGETS_BUILT "${TARGETS_BUILT} ${c}")
-endforeach(c)
-set(TARGETS_TO_BUILD ${TARGETS_BUILT})
-
-# FIXME: This won't work for project files; we need to use a --param.
-set(LLVM_LIBS_DIR "${LLVM_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}")
-set(SHLIBEXT "${LTDL_SHLIB_EXT}")
-
-set(SHLIBDIR "${LLVM_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}")
-
-if(BUILD_SHARED_LIBS)
- set(LLVM_SHARED_LIBS_ENABLED "1")
-else()
- set(LLVM_SHARED_LIBS_ENABLED "0")
-endif(BUILD_SHARED_LIBS)
-
-if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
- set(SHLIBPATH_VAR "DYLD_LIBRARY_PATH")
-else() # Default for all other unix like systems.
- # CMake hardcodes the library location using rpath.
- # Therefore LD_LIBRARY_PATH is not required to run binaries in the
- # build dir. We pass it anyway.
- set(SHLIBPATH_VAR "LD_LIBRARY_PATH")
-endif()
-
-set(LIT_ARGS "${LLVM_LIT_ARGS}")
-separate_arguments(LIT_ARGS)
-
-configure_file(
- ${CMAKE_CURRENT_SOURCE_DIR}/site.exp.in
- ${CMAKE_CURRENT_BINARY_DIR}/site.exp)
-
-MAKE_DIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/Unit)
-
-# Configuration-time: See Unit/lit.site.cfg.in
-set(LLVM_BUILD_MODE "%(build_mode)s")
-
-set(LLVM_SOURCE_DIR ${LLVM_MAIN_SRC_DIR})
-set(LLVM_BINARY_DIR ${LLVM_BINARY_DIR})
-set(LLVM_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}/%(build_config)s")
-set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
-set(ENABLE_SHARED ${LLVM_SHARED_LIBS_ENABLED})
-set(SHLIBPATH_VAR ${SHLIBPATH_VAR})
-
-if(LLVM_ENABLE_ASSERTIONS AND NOT MSVC_IDE)
- set(ENABLE_ASSERTIONS "1")
-else()
- set(ENABLE_ASSERTIONS "0")
-endif()
-
-configure_file(
+configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
- @ONLY)
-configure_file(
+ )
+configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.in
${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg
- @ONLY)
-
-add_custom_target(check
- COMMAND ${PYTHON_EXECUTABLE}
- ${LLVM_SOURCE_DIR}/utils/lit/lit.py
- --param llvm_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
- --param llvm_unit_site_config=${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg
- --param build_config=${CMAKE_CFG_INTDIR}
- --param build_mode=${RUNTIME_BUILD_MODE}
- ${LIT_ARGS}
- ${CMAKE_CURRENT_BINARY_DIR}
- COMMENT "Running LLVM regression tests")
-
-add_custom_target(check.deps)
-add_dependencies(check check.deps)
-add_dependencies(check.deps
- UnitTests
- BugpointPasses LLVMHello
- llc lli llvm-ar llvm-as llvm-dis llvm-extract llvm-dwarfdump
- llvm-ld llvm-link llvm-mc llvm-nm llvm-objdump llvm-readobj
- macho-dump opt
- FileCheck count not json-bench)
-set_target_properties(check.deps PROPERTIES FOLDER "Tests")
+ )
+
+add_lit_testsuite(check-llvm "Running the LLVM regression tests"
+ ${CMAKE_CURRENT_BINARY_DIR}
+ PARAMS llvm_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
+ llvm_unit_site_config=${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg
+ DEPENDS UnitTests
+ BugpointPasses LLVMHello
+ llc lli llvm-ar llvm-as
+ llvm-diff
+ llvm-dis llvm-extract llvm-dwarfdump
+ llvm-link llvm-mc llvm-nm llvm-objdump llvm-readobj
+ macho-dump opt
+ FileCheck count not
+ yaml2obj
+ )
+set_target_properties(check-llvm PROPERTIES FOLDER "Tests")
+
+# Set up a legacy alias for 'check-llvm'. This will likely become an alias
+# for 'check-all' at some point in the future.
+add_custom_target(check)
+add_dependencies(check check-llvm)
+set_target_properties(check PROPERTIES FOLDER "Tests")
diff --git a/test/CodeGen/ARM/2007-03-13-InstrSched.ll b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
index 33f935e..a63cdd4 100644
--- a/test/CodeGen/ARM/2007-03-13-InstrSched.ll
+++ b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \
; RUN: -mattr=+v6 | grep r9
; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \
-; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats |& grep asm-printer
+; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats 2>&1 | grep asm-printer
; | grep 35
define void @test(i32 %tmp56222, i32 %tmp36224, i32 %tmp46223, i32 %i.0196.0.ph, i32 %tmp8, i32* %tmp1011, i32** %tmp1, i32* %d2.1.out, i32* %d3.1.out, i32* %d0.1.out, i32* %d1.1.out) {
diff --git a/test/CodeGen/ARM/2007-04-03-PEIBug.ll b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
index b543c57..8d3337c 100644
--- a/test/CodeGen/ARM/2007-04-03-PEIBug.ll
+++ b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | not grep {add.*#0}
+; RUN: llc < %s -march=arm | not grep "add.*#0"
define i32 @foo() {
entry:
diff --git a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
index d2eb85d..670048b 100644
--- a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
+++ b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | not grep {str.*\\!}
+; RUN: llc < %s -march=arm | not grep "str.*\!"
%struct.shape_edge_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32 }
%struct.shape_path_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
index fd2f462..3754db0 100644
--- a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
+++ b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast -optimize-regalloc=0
; PR1925
%struct.encode_aux_nearestmatch = type { i32*, i32*, i32*, i32*, i32, i32 }
diff --git a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
index 44da8e7..5fbed0d 100644
--- a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast
+; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast -optimize-regalloc=0
; PR1925
%"struct.kc::impl_Ccode_option" = type { %"struct.kc::impl_abstract_phylum" }
diff --git a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
index 3526722..7342f69 100644
--- a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
+++ b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep {swi 107}
+; RUN: llc < %s -march=arm | grep "swi 107"
define i32 @_swilseek(i32) nounwind {
entry:
diff --git a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
index 813bf3c..7d4cc6e 100644
--- a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
+++ b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=fast -verify-machineinstrs
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs
target triple = "arm-pc-linux-gnu"
; This test case would accidentally use the same physreg for two virtregs
diff --git a/test/CodeGen/ARM/2011-12-14-machine-sink.ll b/test/CodeGen/ARM/2011-12-14-machine-sink.ll
index 5ce600d..b21bb00 100644
--- a/test/CodeGen/ARM/2011-12-14-machine-sink.ll
+++ b/test/CodeGen/ARM/2011-12-14-machine-sink.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
; Radar 10266272
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
target triple = "thumbv7-apple-ios4.0.0"
diff --git a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
index 872eca3..f1c85f1 100644
--- a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
+++ b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
@@ -60,8 +60,16 @@ for.end: ; preds = %entry
ret void
}
+; Check that pseudo-expansion preserves <undef> flags.
+define void @foo3(i8* %p) nounwind ssp {
+entry:
+ tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
+ ret void
+}
+
declare arm_aapcs_vfpcc void @bar(i8*, float, float, float)
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
!0 = metadata !{metadata !"omnipotent char", metadata !1}
!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll
new file mode 100644
index 0000000..b3a7e34
--- /dev/null
+++ b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -relocation-model=pic -disable-fp-elim -mcpu=cortex-a8 < %s
+
+; CodeGen's SplitCriticalEdge() shouldn't try to break an edge to a landing pad.
+; rdar://11300144
+
+%0 = type opaque
+%class.FunctionInterpreter.3.15.31 = type { %class.Parser.1.13.29, %class.Parser.1.13.29*, %struct.ParserVariable.2.14.30*, i32 }
+%class.Parser.1.13.29 = type { i32 (...)**, %class.Parser.1.13.29* }
+%struct.ParserVariable.2.14.30 = type opaque
+%struct.ParseErrorMsg.0.12.28 = type { i32, i32, i32 }
+
+@_ZTI13ParseErrorMsg = external hidden unnamed_addr constant { i8*, i8* }
+@"OBJC_IVAR_$_MUMathExpressionDoubleBased.mInterpreter" = external hidden global i32, section "__DATA, __objc_ivar", align 4
+@"\01L_OBJC_SELECTOR_REFERENCES_14" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+declare void @__cxa_end_catch()
+
+declare void @_ZSt9terminatev()
+
+define hidden double @t(%0* %self, i8* nocapture %_cmd) optsize ssp {
+entry:
+ %call = invoke double undef(%class.FunctionInterpreter.3.15.31* undef) optsize
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+ catch i8* bitcast ({ i8*, i8* }* @_ZTI13ParseErrorMsg to i8*)
+ br i1 undef, label %catch, label %eh.resume
+
+catch: ; preds = %lpad
+ invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %struct.ParseErrorMsg.0.12.28*)*)(i8* undef, i8* undef, %struct.ParseErrorMsg.0.12.28* undef) optsize
+ to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2: ; preds = %catch
+ br label %try.cont
+
+try.cont: ; preds = %invoke.cont2, %entry
+ %value.0 = phi double [ 0x7FF8000000000000, %invoke.cont2 ], [ %call, %entry ]
+ ret double %value.0
+
+lpad1: ; preds = %catch
+ %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+eh.resume: ; preds = %lpad1, %lpad
+ resume { i8*, i32 } undef
+
+terminate.lpad: ; preds = %lpad1
+ %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+ catch i8* null
+ unreachable
+}
+
+declare i32 @__gxx_personality_sj0(...)
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
diff --git a/test/CodeGen/ARM/2012-05-29-TailDupBug.ll b/test/CodeGen/ARM/2012-05-29-TailDupBug.ll
new file mode 100644
index 0000000..1a57f04
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-29-TailDupBug.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -verify-machineinstrs < %s
+
+; Teach tail duplication to update the live-in set to appease the verifier.
+; rdar://11538365
+
+%struct.__CFString.2 = type opaque
+
+declare void @CFRelease(i8*)
+
+define hidden fastcc i32 @t() ssp {
+entry:
+ %mylocale.i.i = alloca [256 x i8], align 1
+ br i1 undef, label %return, label %CFStringIsHyphenationAvailableForLocale.exit
+
+CFStringIsHyphenationAvailableForLocale.exit: ; preds = %entry
+ br i1 undef, label %return, label %if.end
+
+if.end: ; preds = %CFStringIsHyphenationAvailableForLocale.exit
+ br i1 undef, label %if.end8.thread.i, label %if.then.i
+
+if.then.i: ; preds = %if.end
+ br i1 undef, label %if.end8.thread.i, label %if.end8.i
+
+if.end8.thread.i: ; preds = %if.then.i, %if.end
+ unreachable
+
+if.end8.i: ; preds = %if.then.i
+ br i1 undef, label %if.then11.i, label %__CFHyphenationPullTokenizer.exit
+
+if.then11.i: ; preds = %if.end8.i
+ unreachable
+
+__CFHyphenationPullTokenizer.exit: ; preds = %if.end8.i
+ br i1 undef, label %if.end68, label %if.then3
+
+if.then3: ; preds = %__CFHyphenationPullTokenizer.exit
+ br i1 undef, label %cond.end, label %cond.false
+
+cond.false: ; preds = %if.then3
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %if.then3
+ br i1 undef, label %while.end, label %while.body
+
+while.body: ; preds = %cond.end
+ unreachable
+
+while.end: ; preds = %cond.end
+ br i1 undef, label %if.end5.i, label %if.then.i16
+
+if.then.i16: ; preds = %while.end
+ br i1 undef, label %if.then4.i, label %if.end5.i
+
+if.then4.i: ; preds = %if.then.i16
+ br i1 false, label %cleanup.thread, label %if.end.i20
+
+if.end5.i: ; preds = %if.then.i16, %while.end
+ unreachable
+
+if.end.i20: ; preds = %if.then4.i
+ br label %for.body.i146.i
+
+for.body.i146.i: ; preds = %for.body.i146.i, %if.end.i20
+ br i1 undef, label %if.end20.i, label %for.body.i146.i
+
+if.end20.i: ; preds = %for.body.i146.i
+ br i1 undef, label %cleanup.thread, label %if.end23.i
+
+if.end23.i: ; preds = %if.end20.i
+ br label %for.body.i94.i
+
+for.body.i94.i: ; preds = %for.body.i94.i, %if.end23.i
+ br i1 undef, label %if.then28.i, label %for.body.i94.i
+
+if.then28.i: ; preds = %for.body.i94.i
+ br i1 undef, label %cond.true.i26, label %land.lhs.true
+
+cond.true.i26: ; preds = %if.then28.i
+ br label %land.lhs.true
+
+land.lhs.true: ; preds = %cond.true.i26, %if.then28.i
+ br i1 false, label %cleanup.thread, label %if.end35
+
+if.end35: ; preds = %land.lhs.true
+ br i1 undef, label %cleanup.thread, label %if.end45
+
+if.end45: ; preds = %if.end35
+ br i1 undef, label %if.then50, label %if.end.i37
+
+if.end.i37: ; preds = %if.end45
+ br label %if.then50
+
+if.then50: ; preds = %if.end.i37, %if.end45
+ br i1 undef, label %__CFHyphenationGetHyphensForString.exit, label %if.end.i
+
+if.end.i: ; preds = %if.then50
+ br i1 undef, label %cleanup.i, label %cond.true.i
+
+cond.true.i: ; preds = %if.end.i
+ br i1 undef, label %for.cond16.preheader.i, label %for.cond57.preheader.i
+
+for.cond16.preheader.i: ; preds = %cond.true.i
+ %cmp1791.i = icmp sgt i32 undef, 1
+ br i1 %cmp1791.i, label %for.body18.i, label %for.cond57.preheader.i
+
+for.cond57.preheader.i: ; preds = %for.cond16.preheader.i, %cond.true.i
+ %sub69.i = add i32 undef, -2
+ br label %cleanup.i
+
+for.body18.i: ; preds = %for.cond16.preheader.i
+ store i16 0, i16* undef, align 2
+ br label %while.body.i
+
+while.body.i: ; preds = %while.body.i, %for.body18.i
+ br label %while.body.i
+
+cleanup.i: ; preds = %for.cond57.preheader.i, %if.end.i
+ br label %__CFHyphenationGetHyphensForString.exit
+
+__CFHyphenationGetHyphensForString.exit: ; preds = %cleanup.i, %if.then50
+ %retval.1.i = phi i32 [ 0, %cleanup.i ], [ -1, %if.then50 ]
+ %phitmp = bitcast %struct.__CFString.2* null to i8*
+ br label %if.end68
+
+cleanup.thread: ; preds = %if.end35, %land.lhs.true, %if.end20.i, %if.then4.i
+ call void @llvm.stackrestore(i8* null)
+ br label %return
+
+if.end68: ; preds = %__CFHyphenationGetHyphensForString.exit, %__CFHyphenationPullTokenizer.exit
+ %hyphenCount.2 = phi i32 [ %retval.1.i, %__CFHyphenationGetHyphensForString.exit ], [ 0, %__CFHyphenationPullTokenizer.exit ]
+ %_token.1 = phi i8* [ %phitmp, %__CFHyphenationGetHyphensForString.exit ], [ undef, %__CFHyphenationPullTokenizer.exit ]
+ call void @CFRelease(i8* %_token.1)
+ br label %return
+
+return: ; preds = %if.end68, %cleanup.thread, %CFStringIsHyphenationAvailableForLocale.exit, %entry
+ %retval.1 = phi i32 [ %hyphenCount.2, %if.end68 ], [ -1, %CFStringIsHyphenationAvailableForLocale.exit ], [ -1, %cleanup.thread ], [ -1, %entry ]
+ ret i32 %retval.1
+}
+
+declare void @llvm.stackrestore(i8*) nounwind
diff --git a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
new file mode 100644
index 0000000..b05ec63
--- /dev/null
+++ b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -o /dev/null "-mtriple=thumbv7-apple-ios" -debug-only=post-RA-sched 2> %t
+; RUN: FileCheck %s < %t
+; REQUIRES: asserts
+; Make sure that may-alias store-load dependencies have one-cycle latency
+; regardless of whether or not they are barriers.
+
+; CHECK: ** List Scheduling
+; CHECK: SU(2){{.*}}STR{{.*}}Volatile
+; CHECK-NOT: ch SU
+; CHECK: ch SU(3): Latency=1
+; CHECK-NOT: ch SU
+; CHECK: SU(3){{.*}}LDR{{.*}}Volatile
+; CHECK-NOT: ch SU
+; CHECK: ch SU(2): Latency=1
+; CHECK-NOT: ch SU
+; CHECK: ** List Scheduling
+; CHECK: SU(2){{.*}}STR{{.*}}
+; CHECK-NOT: ch SU
+; CHECK: ch SU(3): Latency=1
+; CHECK-NOT: ch SU
+; CHECK: SU(3){{.*}}LDR{{.*}}
+; CHECK-NOT: ch SU
+; CHECK: ch SU(2): Latency=1
+; CHECK-NOT: ch SU
+define i32 @f1(i32* nocapture %p1, i32* nocapture %p2) nounwind {
+entry:
+ store volatile i32 65540, i32* %p1, align 4, !tbaa !0
+ %0 = load volatile i32* %p2, align 4, !tbaa !0
+ ret i32 %0
+}
+
+define i32 @f2(i32* nocapture %p1, i32* nocapture %p2) nounwind {
+entry:
+ store i32 65540, i32* %p1, align 4, !tbaa !0
+ %0 = load i32* %p2, align 4, !tbaa !0
+ ret i32 %0
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll b/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll
new file mode 100644
index 0000000..e4ad45b
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll
@@ -0,0 +1,174 @@
+; RUN: llc < %s
+; PR13377
+
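+; Compile-only test: llc must not crash (there is no FileCheck). The file
+; name suggests the original failure involved spilling and reloading
+; D-register triples in the vector code below.
+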
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+%0 = type { <4 x float> }
+
+define arm_aapcs_vfpcc void @foo(float, i1 zeroext, i1 zeroext) nounwind uwtable {
+ br i1 undef, label %4, label %5
+
+; <label>:4 ; preds = %3
+ unreachable
+
+; <label>:5 ; preds = %3
+ br i1 undef, label %7, label %6
+
+; <label>:6 ; preds = %5
+ unreachable
+
+; <label>:7 ; preds = %5
+ br i1 undef, label %8, label %10
+
+; <label>:8 ; preds = %7
+ br i1 undef, label %9, label %10
+
+; <label>:9 ; preds = %8
+ br i1 undef, label %11, label %10
+
+; <label>:10 ; preds = %9, %8, %7
+ unreachable
+
+; <label>:11 ; preds = %9
+ br i1 undef, label %13, label %12
+
+; <label>:12 ; preds = %11
+ unreachable
+
+; <label>:13 ; preds = %11
+ br i1 undef, label %15, label %14
+
+; <label>:14 ; preds = %13
+ unreachable
+
+; <label>:15 ; preds = %13
+ br i1 undef, label %18, label %16
+
+; <label>:16 ; preds = %15
+ br i1 undef, label %17, label %18
+
+; <label>:17 ; preds = %16
+ unreachable
+
+; <label>:18 ; preds = %16, %15
+ br i1 undef, label %68, label %19
+
+; <label>:19 ; preds = %18
+ br label %20
+
+; <label>:20 ; preds = %20, %19
+ br i1 undef, label %21, label %20
+
+; <label>:21 ; preds = %20
+ br i1 undef, label %22, label %68
+
+; <label>:22 ; preds = %21
+ br i1 undef, label %23, label %24
+
+; <label>:23 ; preds = %22
+ unreachable
+
+; <label>:24 ; preds = %22
+ br i1 undef, label %26, label %25
+
+; <label>:25 ; preds = %24
+ unreachable
+
+; <label>:26 ; preds = %24
+ br i1 undef, label %28, label %27
+
+; <label>:27 ; preds = %26
+ unreachable
+
+; <label>:28 ; preds = %26
+ br i1 undef, label %29, label %30, !prof !0
+
+; <label>:29 ; preds = %28
+ br label %30
+
+; <label>:30 ; preds = %29, %28
+ br i1 undef, label %31, label %32, !prof !0
+
+; <label>:31 ; preds = %30
+ br label %32
+
+; <label>:32 ; preds = %31, %30
+ br i1 undef, label %34, label %33
+
+; <label>:33 ; preds = %32
+ unreachable
+
+; <label>:34 ; preds = %32
+ br i1 undef, label %35, label %36, !prof !0
+
+; <label>:35 ; preds = %34
+ br label %36
+
+; <label>:36 ; preds = %35, %34
+ br i1 undef, label %37, label %38, !prof !0
+
+; <label>:37 ; preds = %36
+ br label %38
+
+; <label>:38 ; preds = %37, %36
+ br i1 undef, label %39, label %67
+
+; <label>:39 ; preds = %38
+ br i1 undef, label %40, label %41
+
+; <label>:40 ; preds = %39
+ br i1 undef, label %64, label %41
+
+; <label>:41 ; preds = %40, %39
+ br i1 undef, label %64, label %42
+
+; <label>:42 ; preds = %41
+ %43 = fadd <4 x float> undef, undef
+ %44 = fadd <4 x float> undef, undef
+ %45 = fmul <4 x float> undef, undef
+ %46 = fmul <4 x float> %45, %43
+ %47 = fmul <4 x float> undef, %44
+ %48 = load <4 x float>* undef, align 8, !tbaa !1
+ %49 = bitcast <4 x float> %48 to <2 x i64>
+ %50 = shufflevector <2 x i64> %49, <2 x i64> undef, <1 x i32> <i32 1>
+ %51 = bitcast <1 x i64> %50 to <2 x float>
+ %52 = shufflevector <2 x float> %51, <2 x float> undef, <4 x i32> zeroinitializer
+ %53 = bitcast <4 x float> %52 to <2 x i64>
+ %54 = shufflevector <2 x i64> %53, <2 x i64> undef, <1 x i32> zeroinitializer
+ %55 = bitcast <1 x i64> %54 to <2 x float>
+ %56 = extractelement <2 x float> %55, i32 0
+ %57 = insertelement <4 x float> undef, float %56, i32 2
+ %58 = insertelement <4 x float> %57, float 1.000000e+00, i32 3
+ %59 = fsub <4 x float> %47, %58
+ %60 = fmul <4 x float> undef, undef
+ %61 = fmul <4 x float> %59, %60
+ %62 = fmul <4 x float> %61, <float 6.000000e+01, float 6.000000e+01, float 6.000000e+01, float 6.000000e+01>
+ %63 = fadd <4 x float> %47, %62
+ store <4 x float> %46, <4 x float>* undef, align 8, !tbaa !1
+ call arm_aapcs_vfpcc void @bar(%0* undef, float 0.000000e+00) nounwind
+ call arm_aapcs_vfpcc void @bar(%0* undef, float 0.000000e+00) nounwind
+ store <4 x float> %63, <4 x float>* undef, align 8, !tbaa !1
+ unreachable
+
+; <label>:64 ; preds = %41, %40
+ br i1 undef, label %65, label %66
+
+; <label>:65 ; preds = %64
+ unreachable
+
+; <label>:66 ; preds = %64
+ unreachable
+
+; <label>:67 ; preds = %38
+ unreachable
+
+; <label>:68 ; preds = %21, %18
+ ret void
+}
+
+declare arm_aapcs_vfpcc void @bar(%0*, float)
+
+!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM/2012-08-08-legalize-unaligned.ll b/test/CodeGen/ARM/2012-08-08-legalize-unaligned.ll
new file mode 100644
index 0000000..bdcd1b6
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-08-legalize-unaligned.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s
+; PR13111
+
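+; Compile-only test: llc must not crash while legalizing the unaligned
+; <4 x i8> load and store below.
+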
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
+target triple = "armv7-none-linux-gnueabi"
+
+define void @test_hi_char8() noinline {
+entry:
+ %0 = load <4 x i8>* undef, align 1
+ store <4 x i8> %0, <4 x i8>* null, align 4
+ ret void
+}
diff --git a/test/CodeGen/ARM/2012-08-09-neon-extload.ll b/test/CodeGen/ARM/2012-08-09-neon-extload.ll
new file mode 100644
index 0000000..b55f1ca
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-09-neon-extload.ll
@@ -0,0 +1,102 @@
+; RUN: llc -mtriple=armv7-none-linux-gnueabi < %s | FileCheck %s
+
+@var_v2i8 = global <2 x i8> zeroinitializer
+@var_v4i8 = global <4 x i8> zeroinitializer
+
+@var_v2i16 = global <2 x i16> zeroinitializer
+@var_v4i16 = global <4 x i16> zeroinitializer
+
+@var_v2i32 = global <2 x i32> zeroinitializer
+@var_v4i32 = global <4 x i32> zeroinitializer
+
+@var_v2i64 = global <2 x i64> zeroinitializer
+
+define void @test_v2i8tov2i32() {
+; CHECK: test_v2i8tov2i32:
+
+ %i8val = load <2 x i8>* @var_v2i8
+
+ %i32val = sext <2 x i8> %i8val to <2 x i32>
+ store <2 x i32> %i32val, <2 x i32>* @var_v2i32
+; CHECK: vld1.16 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :16]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}}
+
+ ret void
+}
+
+define void @test_v2i8tov2i64() {
+; CHECK: test_v2i8tov2i64:
+
+ %i8val = load <2 x i8>* @var_v2i8
+
+ %i64val = sext <2 x i8> %i8val to <2 x i64>
+ store <2 x i64> %i64val, <2 x i64>* @var_v2i64
+; CHECK: vld1.16 {d{{[0-9]+}}[0]}, [{{r[0-9]+}}, :16]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK: vmovl.s32 {{q[0-9]+}}, {{d[0-9]+}}
+
+; %i64val = sext <2 x i8> %i8val to <2 x i64>
+; store <2 x i64> %i64val, <2 x i64>* @var_v2i64
+
+ ret void
+}
+
+define void @test_v4i8tov4i16() {
+; CHECK: test_v4i8tov4i16:
+
+ %i8val = load <4 x i8>* @var_v4i8
+
+ %i16val = sext <4 x i8> %i8val to <4 x i16>
+ store <4 x i16> %i16val, <4 x i16>* @var_v4i16
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK-NOT: vmovl.s16
+
+ ret void
+; CHECK: bx lr
+}
+
+define void @test_v4i8tov4i32() {
+; CHECK: test_v4i8tov4i32:
+
+ %i8val = load <4 x i8>* @var_v4i8
+
+ %i16val = sext <4 x i8> %i8val to <4 x i32>
+ store <4 x i32> %i16val, <4 x i32>* @var_v4i32
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}}
+
+ ret void
+}
+
+define void @test_v2i16tov2i32() {
+; CHECK: test_v2i16tov2i32:
+
+ %i16val = load <2 x i16>* @var_v2i16
+
+ %i32val = sext <2 x i16> %i16val to <2 x i32>
+ store <2 x i32> %i32val, <2 x i32>* @var_v2i32
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]]
+; CHECK-NOT: vmovl
+
+ ret void
+; CHECK: bx lr
+}
+
+define void @test_v2i16tov2i64() {
+; CHECK: test_v2i16tov2i64:
+
+ %i16val = load <2 x i16>* @var_v2i16
+
+ %i64val = sext <2 x i16> %i16val to <2 x i64>
+ store <2 x i64> %i64val, <2 x i64>* @var_v2i64
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s32 {{q[0-9]+}}, d[[LOAD]]
+
+ ret void
+}
diff --git a/test/CodeGen/ARM/2012-08-13-bfi.ll b/test/CodeGen/ARM/2012-08-13-bfi.ll
new file mode 100644
index 0000000..8263833
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-13-bfi.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=thumb -mcpu=cortex-a8 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
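+; The function converts a nibble to its ASCII hex digit; the select below
+; must not be folded into a BFI instruction.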
+; CHECK: foo
+; CHECK-NOT: bfi
+; CHECK: bx
+define i32 @foo(i8 zeroext %i) nounwind uwtable readnone ssp {
+ %1 = and i8 %i, 15
+ %2 = zext i8 %1 to i32
+ %3 = icmp ult i8 %1, 10
+ %4 = or i32 %2, 48
+ %5 = add nsw i32 %2, 55
+ %6 = select i1 %3, i32 %4, i32 %5
+ ret i32 %6
+}
diff --git a/test/CodeGen/ARM/addrmode.ll b/test/CodeGen/ARM/addrmode.ll
index 9ccff07..6da9089 100644
--- a/test/CodeGen/ARM/addrmode.ll
+++ b/test/CodeGen/ARM/addrmode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -stats |& grep asm-printer | grep 4
+; RUN: llc < %s -march=arm -stats 2>&1 | grep asm-printer | grep 4
define i32 @t1(i32 %a) {
%b = mul i32 %a, 9
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index 31c5007..d668334 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=arm-linux-gnueabi -o %t
-; RUN: grep { = } %t | count 5
+; RUN: grep " = " %t | count 5
; RUN: grep globl %t | count 4
; RUN: grep weak %t | count 1
diff --git a/test/CodeGen/ARM/arm-modifier.ll b/test/CodeGen/ARM/arm-modifier.ll
index 396de37..5e12d8e 100644
--- a/test/CodeGen/ARM/arm-modifier.ll
+++ b/test/CodeGen/ARM/arm-modifier.ll
@@ -57,3 +57,12 @@ store i64 %0, i64* @f3_var, align 4
store i64 %1, i64* @f3_var, align 4
ret void
}
+
+define i64 @f4(i64* %val) nounwind {
+entry:
+ ;CHECK: f4
+ ;CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], {{r[0-9]?[13579]}}, [r0]
+ ;CHECK: mov r0, [[REG1]]
+ %0 = tail call i64 asm sideeffect "ldrexd $0, ${0:H}, [$1]", "=&r,r,*Qo"(i64* %val, i64* %val) nounwind
+ ret i64 %0
+}
diff --git a/test/CodeGen/ARM/bicZext.ll b/test/CodeGen/ARM/bicZext.ll
new file mode 100644
index 0000000..cf4b7ba
--- /dev/null
+++ b/test/CodeGen/ARM/bicZext.ll
@@ -0,0 +1,19 @@
+; RUN: llc %s -o - | FileCheck %s
+; ModuleID = 'bic.c'
+target triple = "thumbv7-apple-ios3.0.0"
+
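+; Masking with -4 clears the two low bits, so the backend should select BIC
+; with immediate #3 rather than an AND, as the CHECK patterns require.
+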
+define zeroext i16 @foo16(i16 zeroext %f) nounwind readnone optsize ssp {
+entry:
+ ; CHECK: .thumb_func _foo16
+ ; CHECK: {{bic[^#]*#3}}
+ %and = and i16 %f, -4
+ ret i16 %and
+}
+
+define i32 @foo32(i32 %f) nounwind readnone optsize ssp {
+entry:
+ ; CHECK: .thumb_func _foo32
+ ; CHECK: {{bic[^#]*#3}}
+ %and = and i32 %f, -4
+ ret i32 %and
+}
diff --git a/test/CodeGen/ARM/call_nolink.ll b/test/CodeGen/ARM/call_nolink.ll
index efe29d8..00b1688 100644
--- a/test/CodeGen/ARM/call_nolink.ll
+++ b/test/CodeGen/ARM/call_nolink.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN: not grep {bx lr}
+; RUN: not grep "bx lr"
%struct.anon = type { i32 (i32, i32, i32)*, i32, i32, [3 x i32], i8*, i8*, i8* }
@r = external global [14 x i32] ; <[14 x i32]*> [#uses=4]
diff --git a/test/CodeGen/ARM/cmn.ll b/test/CodeGen/ARM/cmn.ll
new file mode 100644
index 0000000..ef73165
--- /dev/null
+++ b/test/CodeGen/ARM/cmn.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple thumbv7-apple-ios | FileCheck %s
+; <rdar://problem/7569620>
+
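+; Comparisons against a negative immediate or a negated value should select
+; CMN rather than materializing the operand with MVN and using CMP.
+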
+define i32 @compare_i_gt(i32 %a) {
+entry:
+; CHECK: compare_i_gt
+; CHECK-NOT: mvn
+; CHECK: cmn
+ %cmp = icmp sgt i32 %a, -78
+ %. = zext i1 %cmp to i32
+ ret i32 %.
+}
+
+define i32 @compare_r_eq(i32 %a, i32 %b) {
+entry:
+; CHECK: compare_r_eq
+; CHECK: cmn
+ %sub = sub nsw i32 0, %b
+ %cmp = icmp eq i32 %a, %sub
+ %. = zext i1 %cmp to i32
+ ret i32 %.
+}
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
new file mode 100644
index 0000000..fb0f4c6
--- /dev/null
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios0.0.0"
+
+; CHECK: f
+; The vld2 and vst2 are not aligned with respect to each other; the second Q
+; register loaded is the first one stored.
+; The coalescer must find a super-register larger than QQ to eliminate the copy
+; setting up the vst2 data.
+; CHECK: vld2
+; CHECK-NOT: vorr
+; CHECK-NOT: vmov
+; CHECK: vst2
+define void @f(float* %p, i32 %c) nounwind ssp {
+entry:
+ %0 = bitcast float* %p to i8*
+ %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
+ %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+ %add.ptr = getelementptr inbounds float* %p, i32 8
+ %1 = bitcast float* %add.ptr to i8*
+ tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> undef, i32 4)
+ ret void
+}
+
+; CHECK: f1
+; FIXME: This function still has copies.
+define void @f1(float* %p, i32 %c) nounwind ssp {
+entry:
+ %0 = bitcast float* %p to i8*
+ %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
+ %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+ %add.ptr = getelementptr inbounds float* %p, i32 8
+ %1 = bitcast float* %add.ptr to i8*
+ %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
+ %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
+ tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> %vld2215, i32 4)
+ ret void
+}
+
+; CHECK: f2
+; FIXME: This function still has copies.
+define void @f2(float* %p, i32 %c) nounwind ssp {
+entry:
+ %0 = bitcast float* %p to i8*
+ %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
+ %vld224 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+ br label %do.body
+
+do.body: ; preds = %do.body, %entry
+ %qq0.0.1.0 = phi <4 x float> [ %vld224, %entry ], [ %vld2216, %do.body ]
+ %c.addr.0 = phi i32 [ %c, %entry ], [ %dec, %do.body ]
+ %p.addr.0 = phi float* [ %p, %entry ], [ %add.ptr, %do.body ]
+ %add.ptr = getelementptr inbounds float* %p.addr.0, i32 8
+ %1 = bitcast float* %add.ptr to i8*
+ %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
+ %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
+ %vld2216 = extractvalue { <4 x float>, <4 x float> } %vld22, 1
+ tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %qq0.0.1.0, <4 x float> %vld2215, i32 4)
+ %dec = add nsw i32 %c.addr.0, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %do.end, label %do.body
+
+do.end: ; preds = %do.body
+ ret void
+}
+
+declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
diff --git a/test/CodeGen/ARM/crash-greedy.ll b/test/CodeGen/ARM/crash-greedy.ll
index 8a865e2..a3d49f6 100644
--- a/test/CodeGen/ARM/crash-greedy.ll
+++ b/test/CodeGen/ARM/crash-greedy.ll
@@ -82,3 +82,49 @@ if.then195: ; preds = %if.then84
if.end251: ; preds = %if.then195, %if.then84, %entry
ret void
}
+
+; Coalescer failure: removeCopyByCommutingDef leaves a bad kill flag
+; behind.
+define void @rdar11950722() nounwind readonly optsize ssp align 2 {
+entry:
+ br i1 undef, label %land.lhs.true7, label %lor.lhs.false.i
+
+lor.lhs.false.i:
+ br i1 undef, label %if.then10.i, label %land.lhs.true7
+
+if.then10.i:
+ %xFlags.1.i = select i1 undef, i32 0, i32 undef
+ br i1 undef, label %land.lhs.true33.i, label %f.exit
+
+land.lhs.true33.i:
+ %and26.i = and i32 %xFlags.1.i, 8
+ %cmp27.i = icmp eq i32 %and26.i, 0
+ %and29.i = and i32 %xFlags.1.i, 2147483645
+ %xFlags.1.and29.i = select i1 %cmp27.i, i32 %xFlags.1.i, i32 %and29.i
+ %and34.i = and i32 %xFlags.1.i, 8
+ %cmp35.i = icmp eq i32 %and34.i, 0
+ %and37.i = and i32 %xFlags.1.i, 2147483645
+ %yFlags.1.and37.i = select i1 %cmp35.i, i32 %xFlags.1.i, i32 %and37.i
+ br label %f.exit
+
+f.exit:
+ %xFlags.3.i = phi i32 [ %xFlags.1.and29.i, %land.lhs.true33.i ], [ %xFlags.1.i, %if.then10.i ]
+ %yFlags.2.i = phi i32 [ %yFlags.1.and37.i, %land.lhs.true33.i ], [ %xFlags.1.i, %if.then10.i ]
+ %cmp40.i = icmp eq i32 %xFlags.3.i, %yFlags.2.i
+ br i1 %cmp40.i, label %land.lhs.true7, label %land.end
+
+land.lhs.true7:
+ br i1 undef, label %land.lhs.true34, label %lor.lhs.false27
+
+lor.lhs.false27:
+ br i1 undef, label %land.lhs.true34, label %land.end
+
+land.lhs.true34:
+ br i1 undef, label %land.end, label %lor.lhs.false44
+
+lor.lhs.false44:
+ ret void
+
+land.end:
+ ret void
+}
diff --git a/test/CodeGen/ARM/cse-libcalls.ll b/test/CodeGen/ARM/cse-libcalls.ll
index 1d011be..62b9e43 100644
--- a/test/CodeGen/ARM/cse-libcalls.ll
+++ b/test/CodeGen/ARM/cse-libcalls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep {bl.\*__ltdf} | count 1
+; RUN: llc < %s -march=arm | grep "bl.*__ltdf" | count 1
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/ARM/data-in-code-annotations.ll b/test/CodeGen/ARM/data-in-code-annotations.ll
new file mode 100644
index 0000000..a66a9d1
--- /dev/null
+++ b/test/CodeGen/ARM/data-in-code-annotations.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
+
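+; On Darwin, constants and jump tables placed in the text section must be
+; bracketed with .data_region/.end_data_region so tools can tell data from code.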
+define double @f1() nounwind {
+; CHECK: f1:
+; CHECK: .data_region
+; CHECK: .long 1413754129
+; CHECK: .long 1074340347
+; CHECK: .end_data_region
+ ret double 0x400921FB54442D11
+}
+
+
+define i32 @f2() {
+; CHECK: f2:
+; CHECK: .data_region jt32
+; CHECK: .end_data_region
+
+entry:
+ switch i32 undef, label %return [
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb6
+ i32 3, label %sw.bb13
+ i32 4, label %sw.bb20
+ ]
+
+sw.bb: ; preds = %entry
+ br label %return
+
+sw.bb6: ; preds = %entry
+ br label %return
+
+sw.bb13: ; preds = %entry
+ br label %return
+
+sw.bb20: ; preds = %entry
+ %div = sdiv i32 undef, undef
+ br label %return
+
+return: ; preds = %sw.bb20, %sw.bb13, %sw.bb6, %sw.bb, %entry
+ %retval.0 = phi i32 [ %div, %sw.bb20 ], [ undef, %sw.bb13 ], [ undef, %sw.bb6 ], [ undef, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index 9bdae43..4f4ff8e 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -3,16 +3,17 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
target triple = "thumbv7-apple-macosx10.6.7"
;CHECK: vadd.f32 q4, q8, q8
-;CHECK-NEXT: Ltmp
-;CHECK-NEXT: @DEBUG_VALUE: y <- Q4+0
-;CHECK-NEXT: @DEBUG_VALUE: x <- Q4+0
+;CHECK-NEXT: Ltmp1
+
+;CHECK: @DEBUG_VALUE: x <- Q4+0
+;CHECK-NEXT: @DEBUG_VALUE: y <- Q4+0
@.str = external constant [13 x i8]
declare <4 x float> @test0001(float) nounwind readnone ssp
-define i32 @main(i32 %argc, i8** nocapture %argv) nounwind ssp {
+define i32 @main(i32 %argc, i8** nocapture %argv, i1 %cond) nounwind ssp {
entry:
br label %for.body9
@@ -21,7 +22,7 @@ for.body9: ; preds = %for.body9, %entry
tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
%add20 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28), !dbg !39
- br i1 undef, label %for.end54, label %for.body9, !dbg !44
+ br i1 %cond, label %for.end54, label %for.body9, !dbg !44
for.end54: ; preds = %for.body9
%tmp115 = extractelement <4 x float> %add19, i32 1
@@ -52,7 +53,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!7 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
!8 = metadata !{metadata !9}
!9 = metadata !{i32 589857, i64 0, i64 3} ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null} ; [ DW_TAG_subprogram ]
!11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
!12 = metadata !{metadata !13}
!13 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
index 49c4103..7fbf8f4 100644
--- a/test/CodeGen/ARM/divmod.ll
+++ b/test/CodeGen/ARM/divmod.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-apple-ios5.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s
define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
entry:
@@ -56,3 +56,17 @@ bb1:
declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readnone
declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
+
+; rdar://11714607
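+; The udiv and urem of the same operands should be combined into a single
+; __udivmodsi4 libcall rather than separate division calls.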
+define i32 @howmany(i32 %x, i32 %y) nounwind {
+entry:
+; CHECK: howmany:
+; CHECK: bl ___udivmodsi4
+; CHECK-NOT: ___udivsi3
+ %rem = urem i32 %x, %y
+ %div = udiv i32 %x, %y
+ %not.cmp = icmp ne i32 %rem, 0
+ %add = zext i1 %not.cmp to i32
+ %cond = add i32 %add, %div
+ ret i32 %cond
+}
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index 45c322d..bcb4ee7 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+neon | FileCheck %s -check-prefix=NFP0
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
define float @test(float %a, float %b) {
entry:
%dum = fadd float %a, %b
- %0 = tail call float @fabsf(float %dum)
+ %0 = tail call float @fabsf(float %dum) readnone
%dum1 = fadd float %0, %b
ret float %dum1
}
diff --git a/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll b/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll
new file mode 100644
index 0000000..14721a4
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+
+; Fast-isel can't handle non-double multi-reg retvals.
+; This test just checks to make sure we don't hit the assert in FinishCall.
+define <16 x i8> @foo() nounwind ssp {
+entry:
+ ret <16 x i8> zeroinitializer
+}
+
+define void @t1() nounwind ssp {
+entry:
+; ARM: @t1
+; THUMB: @t1
+ %call = call <16 x i8> @foo()
+ ret void
+}
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index dd460b2..edc805a 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
define i32 @t0(i1 zeroext %a) nounwind {
%1 = zext i1 %a to i32
@@ -99,6 +101,11 @@ entry:
; ARM: uxtb r9, r12
; ARM: str r9, [sp, #4]
; ARM: bl _bar
+; ARM-LONG: @t10
+; ARM-LONG: movw lr, :lower16:L_bar$non_lazy_ptr
+; ARM-LONG: movt lr, :upper16:L_bar$non_lazy_ptr
+; ARM-LONG: ldr lr, [lr]
+; ARM-LONG: blx lr
; THUMB: @t10
; THUMB: movs r0, #0
; THUMB: movt r0, #0
@@ -121,8 +128,96 @@ entry:
; THUMB: uxtb.w r9, r12
; THUMB: str.w r9, [sp, #4]
; THUMB: bl _bar
+; THUMB-LONG: @t10
+; THUMB-LONG: movw lr, :lower16:L_bar$non_lazy_ptr
+; THUMB-LONG: movt lr, :upper16:L_bar$non_lazy_ptr
+; THUMB-LONG: ldr.w lr, [lr]
+; THUMB-LONG: blx lr
%call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
ret i32 0
}
declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+define i32 @bar0(i32 %i) nounwind {
+ ret i32 0
+}
+
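+; Indirect call through a function pointer: fast-isel materializes the callee
+; address with movw/movt and calls it with blx.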
+define void @foo3() uwtable {
+; ARM: movw r0, #0
+; ARM: movw r1, :lower16:_bar0
+; ARM: movt r1, :upper16:_bar0
+; ARM: blx r1
+; THUMB: movs r0, #0
+; THUMB: movw r1, :lower16:_bar0
+; THUMB: movt r1, :upper16:_bar0
+; THUMB: blx r1
+ %fptr = alloca i32 (i32)*, align 8
+ store i32 (i32)* @bar0, i32 (i32)** %fptr, align 8
+ %1 = load i32 (i32)** %fptr, align 8
+ %call = call i32 %1(i32 0)
+ ret void
+}
+
+define i32 @LibCall(i32 %a, i32 %b) {
+entry:
+; ARM: LibCall
+; ARM: bl ___udivsi3
+; ARM-LONG: LibCall
+; ARM-LONG: movw r2, :lower16:L___udivsi3$non_lazy_ptr
+; ARM-LONG: movt r2, :upper16:L___udivsi3$non_lazy_ptr
+; ARM-LONG: ldr r2, [r2]
+; ARM-LONG: blx r2
+; THUMB: LibCall
+; THUMB: bl ___udivsi3
+; THUMB-LONG: LibCall
+; THUMB-LONG: movw r2, :lower16:L___udivsi3$non_lazy_ptr
+; THUMB-LONG: movt r2, :upper16:L___udivsi3$non_lazy_ptr
+; THUMB-LONG: ldr r2, [r2]
+; THUMB-LONG: blx r2
+ %tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
+ ret i32 %tmp1
+}
+
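+; Variadic call: the first arguments are passed in r0-r3 and the remaining
+; ones are stored to the outgoing argument area before the call.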
+define i32 @VarArg() nounwind {
+entry:
+ %i = alloca i32, align 4
+ %j = alloca i32, align 4
+ %k = alloca i32, align 4
+ %m = alloca i32, align 4
+ %n = alloca i32, align 4
+ %tmp = alloca i32, align 4
+ %0 = load i32* %i, align 4
+ %1 = load i32* %j, align 4
+ %2 = load i32* %k, align 4
+ %3 = load i32* %m, align 4
+ %4 = load i32* %n, align 4
+; ARM: VarArg
+; ARM: mov r7, sp
+; ARM: movw r0, #5
+; ARM: ldr r1, [r7, #-4]
+; ARM: ldr r2, [r7, #-8]
+; ARM: ldr r3, [r7, #-12]
+; ARM: ldr r9, [sp, #16]
+; ARM: ldr r12, [sp, #12]
+; ARM: str r9, [sp]
+; ARM: str r12, [sp, #4]
+; ARM: bl _CallVariadic
+; THUMB: mov r7, sp
+; THUMB: movs r0, #5
+; THUMB: movt r0, #0
+; THUMB: ldr r1, [sp, #28]
+; THUMB: ldr r2, [sp, #24]
+; THUMB: ldr r3, [sp, #20]
+; THUMB: ldr.w r9, [sp, #16]
+; THUMB: ldr.w r12, [sp, #12]
+; THUMB: str.w r9, [sp]
+; THUMB: str.w r12, [sp, #4]
+; THUMB: bl _CallVariadic
+ %call = call i32 (i32, ...)* @CallVariadic(i32 5, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
+ store i32 %call, i32* %tmp, align 4
+ %5 = load i32* %tmp, align 4
+ ret i32 %5
+}
+
+declare i32 @CallVariadic(i32, ...)
diff --git a/test/CodeGen/ARM/fast-isel-frameaddr.ll b/test/CodeGen/ARM/fast-isel-frameaddr.ll
new file mode 100644
index 0000000..8f7b294
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-frameaddr.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-ARM
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-ARM
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-THUMB2
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-THUMB2
+
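+; llvm.frameaddress(N) is lowered to the frame pointer (r7 on Darwin, r11 on
+; ARM Linux) followed by N loads walking up the saved frame-pointer chain.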
+define i8* @frameaddr_index0() nounwind {
+entry:
+; DARWIN-ARM: frameaddr_index0:
+; DARWIN-ARM: push {r7}
+; DARWIN-ARM: mov r7, sp
+; DARWIN-ARM: mov r0, r7
+
+; DARWIN-THUMB2: frameaddr_index0:
+; DARWIN-THUMB2: str r7, [sp, #-4]!
+; DARWIN-THUMB2: mov r7, sp
+; DARWIN-THUMB2: mov r0, r7
+
+; LINUX-ARM: frameaddr_index0:
+; LINUX-ARM: push {r11}
+; LINUX-ARM: mov r11, sp
+; LINUX-ARM: mov r0, r11
+
+; LINUX-THUMB2: frameaddr_index0:
+; LINUX-THUMB2: str r7, [sp, #-4]!
+; LINUX-THUMB2: mov r7, sp
+; LINUX-THUMB2: mov r0, r7
+
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+define i8* @frameaddr_index1() nounwind {
+entry:
+; DARWIN-ARM: frameaddr_index1:
+; DARWIN-ARM: push {r7}
+; DARWIN-ARM: mov r7, sp
+; DARWIN-ARM: mov r0, r7
+; DARWIN-ARM: ldr r0, [r0]
+
+; DARWIN-THUMB2: frameaddr_index1:
+; DARWIN-THUMB2: str r7, [sp, #-4]!
+; DARWIN-THUMB2: mov r7, sp
+; DARWIN-THUMB2: mov r0, r7
+; DARWIN-THUMB2: ldr r0, [r0]
+
+; LINUX-ARM: frameaddr_index1:
+; LINUX-ARM: push {r11}
+; LINUX-ARM: mov r11, sp
+; LINUX-ARM: mov r0, r11
+; LINUX-ARM: ldr r0, [r0]
+
+; LINUX-THUMB2: frameaddr_index1:
+; LINUX-THUMB2: str r7, [sp, #-4]!
+; LINUX-THUMB2: mov r7, sp
+; LINUX-THUMB2: mov r0, r7
+; LINUX-THUMB2: ldr r0, [r0]
+
+ %0 = call i8* @llvm.frameaddress(i32 1)
+ ret i8* %0
+}
+
+define i8* @frameaddr_index3() nounwind {
+entry:
+; DARWIN-ARM: frameaddr_index3:
+; DARWIN-ARM: push {r7}
+; DARWIN-ARM: mov r7, sp
+; DARWIN-ARM: mov r0, r7
+; DARWIN-ARM: ldr r0, [r0]
+; DARWIN-ARM: ldr r0, [r0]
+; DARWIN-ARM: ldr r0, [r0]
+
+; DARWIN-THUMB2: frameaddr_index3:
+; DARWIN-THUMB2: str r7, [sp, #-4]!
+; DARWIN-THUMB2: mov r7, sp
+; DARWIN-THUMB2: mov r0, r7
+; DARWIN-THUMB2: ldr r0, [r0]
+; DARWIN-THUMB2: ldr r0, [r0]
+; DARWIN-THUMB2: ldr r0, [r0]
+
+; LINUX-ARM: frameaddr_index3:
+; LINUX-ARM: push {r11}
+; LINUX-ARM: mov r11, sp
+; LINUX-ARM: mov r0, r11
+; LINUX-ARM: ldr r0, [r0]
+; LINUX-ARM: ldr r0, [r0]
+; LINUX-ARM: ldr r0, [r0]
+
+; LINUX-THUMB2: frameaddr_index3:
+; LINUX-THUMB2: str r7, [sp, #-4]!
+; LINUX-THUMB2: mov r7, sp
+; LINUX-THUMB2: mov r0, r7
+; LINUX-THUMB2: ldr r0, [r0]
+; LINUX-THUMB2: ldr r0, [r0]
+; LINUX-THUMB2: ldr r0, [r0]
+
+ %0 = call i8* @llvm.frameaddress(i32 3)
+ ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index e6bdfa7..b73fcef 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
@message1 = global [60 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 1
@temp = common global [60 x i8] zeroinitializer, align 1
@@ -13,6 +15,11 @@ define void @t1() nounwind ssp {
; ARM: movw r2, #10
; ARM: uxtb r1, r1
; ARM: bl _memset
+; ARM-LONG: t1
+; ARM-LONG: movw r3, :lower16:L_memset$non_lazy_ptr
+; ARM-LONG: movt r3, :upper16:L_memset$non_lazy_ptr
+; ARM-LONG: ldr r3, [r3]
+; ARM-LONG: blx r3
; THUMB: t1
; THUMB: movw r0, :lower16:_message1
; THUMB: movt r0, :upper16:_message1
@@ -23,6 +30,11 @@ define void @t1() nounwind ssp {
; THUMB: movt r2, #0
; THUMB: uxtb r1, r1
; THUMB: bl _memset
+; THUMB-LONG: t1
+; THUMB-LONG: movw r3, :lower16:L_memset$non_lazy_ptr
+; THUMB-LONG: movt r3, :upper16:L_memset$non_lazy_ptr
+; THUMB-LONG: ldr r3, [r3]
+; THUMB-LONG: blx r3
call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @message1, i32 0, i32 5), i8 64, i32 10, i32 1, i1 false)
ret void
}
@@ -41,6 +53,11 @@ define void @t2() nounwind ssp {
; ARM: mov r0, r1
; ARM: ldr r1, [sp] @ 4-byte Reload
; ARM: bl _memcpy
+; ARM-LONG: t2
+; ARM-LONG: movw r3, :lower16:L_memcpy$non_lazy_ptr
+; ARM-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr
+; ARM-LONG: ldr r3, [r3]
+; ARM-LONG: blx r3
; THUMB: t2
; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr
; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr
@@ -51,6 +68,11 @@ define void @t2() nounwind ssp {
; THUMB: movt r2, #0
; THUMB: mov r0, r1
; THUMB: bl _memcpy
+; THUMB-LONG: t2
+; THUMB-LONG: movw r3, :lower16:L_memcpy$non_lazy_ptr
+; THUMB-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr
+; THUMB-LONG: ldr r3, [r3]
+; THUMB-LONG: blx r3
call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 17, i32 1, i1 false)
ret void
}
@@ -67,6 +89,11 @@ define void @t3() nounwind ssp {
; ARM: movw r2, #10
; ARM: mov r0, r1
; ARM: bl _memmove
+; ARM-LONG: t3
+; ARM-LONG: movw r3, :lower16:L_memmove$non_lazy_ptr
+; ARM-LONG: movt r3, :upper16:L_memmove$non_lazy_ptr
+; ARM-LONG: ldr r3, [r3]
+; ARM-LONG: blx r3
; THUMB: t3
; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr
; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr
@@ -77,6 +104,11 @@ define void @t3() nounwind ssp {
; THUMB: movt r2, #0
; THUMB: mov r0, r1
; THUMB: bl _memmove
+; THUMB-LONG: t3
+; THUMB-LONG: movw r3, :lower16:L_memmove$non_lazy_ptr
+; THUMB-LONG: movt r3, :upper16:L_memmove$non_lazy_ptr
+; THUMB-LONG: ldr r3, [r3]
+; THUMB-LONG: blx r3
call void @llvm.memmove.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
ret void
}
diff --git a/test/CodeGen/ARM/fast-isel-shifter.ll b/test/CodeGen/ARM/fast-isel-shifter.ll
new file mode 100644
index 0000000..111818b
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-shifter.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+
+define i32 @shl() nounwind ssp {
+entry:
+; ARM: shl
+; ARM: lsl r0, r0, #2
+ %shl = shl i32 -1, 2
+ ret i32 %shl
+}
+
+define i32 @shl_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ARM: shl_reg
+; ARM: lsl r0, r0, r1
+ %shl = shl i32 %src1, %src2
+ ret i32 %shl
+}
+
+define i32 @lshr() nounwind ssp {
+entry:
+; ARM: lshr
+; ARM: lsr r0, r0, #2
+ %lshr = lshr i32 -1, 2
+ ret i32 %lshr
+}
+
+define i32 @lshr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ARM: lshr_reg
+; ARM: lsr r0, r0, r1
+ %lshr = lshr i32 %src1, %src2
+ ret i32 %lshr
+}
+
+define i32 @ashr() nounwind ssp {
+entry:
+; ARM: ashr
+; ARM: asr r0, r0, #2
+ %ashr = ashr i32 -1, 2
+ ret i32 %ashr
+}
+
+define i32 @ashr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ARM: ashr_reg
+; ARM: asr r0, r0, r1
+ %ashr = ashr i32 %src1, %src2
+ ret i32 %ashr
+}
+
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 417e2d9..ecd5fe2 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -226,3 +226,15 @@ define i32 @urem_fold(i32 %a) nounwind {
%rem = urem i32 %a, 32
ret i32 %rem
}
+
+define i32 @test7() noreturn nounwind {
+entry:
+; ARM: @test7
+; THUMB: @test7
+; ARM: trap
+; THUMB: trap
+ tail call void @llvm.trap( )
+ unreachable
+}
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/fcopysign.ll b/test/CodeGen/ARM/fcopysign.ll
index 27fa2b0..5511d24 100644
--- a/test/CodeGen/ARM/fcopysign.ll
+++ b/test/CodeGen/ARM/fcopysign.ll
@@ -11,7 +11,7 @@ entry:
; HARD: test1:
; HARD: vmov.i32 [[REG1:(d[0-9]+)]], #0x80000000
; HARD: vbsl [[REG1]], d
- %0 = tail call float @copysignf(float %x, float %y) nounwind
+ %0 = tail call float @copysignf(float %x, float %y) nounwind readnone
ret float %0
}
@@ -25,7 +25,7 @@ entry:
; HARD: vmov.i32 [[REG2:(d[0-9]+)]], #0x80000000
; HARD: vshl.i64 [[REG2]], [[REG2]], #32
; HARD: vbsl [[REG2]], d1, d0
- %0 = tail call double @copysign(double %x, double %y) nounwind
+ %0 = tail call double @copysign(double %x, double %y) nounwind readnone
ret double %0
}
@@ -36,7 +36,7 @@ entry:
; SOFT: vshl.i64 [[REG3]], [[REG3]], #32
; SOFT: vbsl [[REG3]],
%0 = fmul double %x, %y
- %1 = tail call double @copysign(double %0, double %z) nounwind
+ %1 = tail call double @copysign(double %0, double %z) nounwind readnone
ret double %1
}
diff --git a/test/CodeGen/ARM/floorf.ll b/test/CodeGen/ARM/floorf.ll
new file mode 100644
index 0000000..492fc36
--- /dev/null
+++ b/test/CodeGen/ARM/floorf.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=arm-unknown-unknown < %s | FileCheck %s
+
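+; Calls to floorf/ceilf/truncf with constant arguments marked readnone should
+; be constant-folded away; no libcall may remain.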
+; CHECK: test1
+define float @test1() nounwind uwtable readnone ssp {
+; CHECK-NOT: floorf
+ %foo = call float @floorf(float 0x4000CCCCC0000000) nounwind readnone
+ ret float %foo
+}
+
+; CHECK: test2
+define float @test2() nounwind uwtable readnone ssp {
+; CHECK-NOT: ceilf
+ %foo = call float @ceilf(float 0x4000CCCCC0000000) nounwind readnone
+ ret float %foo
+}
+
+; CHECK: test3
+define float @test3() nounwind uwtable readnone ssp {
+; CHECK-NOT: truncf
+ %foo = call float @truncf(float 0x4000CCCCC0000000) nounwind readnone
+ ret float %foo
+}
+
+declare float @floorf(float) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare float @truncf(float) nounwind readnone
+
+
+
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index bc118b8..3c3182b 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -21,3 +21,12 @@ entry:
; CORTEXA8: vmul.f32 d0, d1, d0
; CORTEXA9: test:
; CORTEXA9: vmul.f32 s{{.}}, s{{.}}, s{{.}}
+
+; VFP2: test2
+define float @test2(float %a) nounwind {
+; VFP2-NOT: mul
+; VFP2: mov pc, lr
+ %ret = fmul float %a, 1.0
+ ret float %ret
+}
+
diff --git a/test/CodeGen/ARM/fparith.ll b/test/CodeGen/ARM/fparith.ll
index ce6d6b2..40ea33b 100644
--- a/test/CodeGen/ARM/fparith.ll
+++ b/test/CodeGen/ARM/fparith.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 | FileCheck %s
define float @f1(float %a, float %b) {
;CHECK: f1:
@@ -84,7 +84,7 @@ define float @f11(float %a) {
;CHECK: f11:
;CHECK: bic
entry:
- %tmp1 = call float @fabsf( float %a ) ; <float> [#uses=1]
+ %tmp1 = call float @fabsf( float %a ) readnone ; <float> [#uses=1]
ret float %tmp1
}
@@ -94,7 +94,7 @@ define double @f12(double %a) {
;CHECK: f12:
;CHECK: vabs.f64
entry:
- %tmp1 = call double @fabs( double %a ) ; <double> [#uses=1]
+ %tmp1 = call double @fabs( double %a ) readnone ; <double> [#uses=1]
ret double %tmp1
}
diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll
index 802d1b8..303d165 100644
--- a/test/CodeGen/ARM/fusedMAC.ll
+++ b/test/CodeGen/ARM/fusedMAC.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -fp-contract=fast | FileCheck %s
; Check generated fused MAC and MLS.
define double @fusedMACTest1(double %d1, double %d2, double %d3) {
@@ -138,8 +138,16 @@ entry:
; CHECK: vfms.f64
%tmp1 = fsub double -0.0, %b
%tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone
- %tmp3 = fsub double -0.0, %tmp2
- ret double %tmp3
+ ret double %tmp2
+}
+
+define float @test_fnms_f32(float %a, float %b, float* %c) nounwind readnone ssp {
+; CHECK: test_fnms_f32
+; CHECK: vfnms.f32
+ %tmp1 = load float* %c, align 4
+ %tmp2 = fsub float -0.0, %tmp1
+ %tmp3 = tail call float @llvm.fma.f32(float %a, float %b, float %tmp2) nounwind readnone
+ ret float %tmp3
}
define double @test_fnms_f64(double %a, double %b, double %c) nounwind readnone ssp {
@@ -158,7 +166,8 @@ entry:
; CHECK: vfnms.f64
%tmp1 = fsub double -0.0, %b
%tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone
- ret double %tmp2
+ %tmp3 = fsub double -0.0, %tmp2
+ ret double %tmp3
}
define double @test_fnma_f64(double %a, double %b, double %c) nounwind readnone ssp {
@@ -180,6 +189,36 @@ entry:
ret double %tmp3
}
+define float @test_fma_const_fold(float %a, float %b) nounwind {
+; CHECK: test_fma_const_fold
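+; fma(a, 1.0, b) folds to fadd a, b, so neither vfma nor vmul should appear.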
+; CHECK-NOT: vfma
+; CHECK-NOT: vmul
+; CHECK: vadd
+ %ret = call float @llvm.fma.f32(float %a, float 1.0, float %b)
+ ret float %ret
+}
+
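+; The constant multiplicand is canonicalized into the second operand, loaded
+; into a register with vmov.f32, and used by the register-register vfma.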
+define float @test_fma_canonicalize(float %a, float %b) nounwind {
+; CHECK: test_fma_canonicalize
+; CHECK: vmov.f32 [[R1:s[0-9]+]], #2.000000e+00
+; CHECK: vfma.f32 {{s[0-9]+}}, {{s[0-9]+}}, [[R1]]
+ %ret = call float @llvm.fma.f32(float 2.0, float %a, float %b)
+ ret float %ret
+}
+
+; Check that very wide vector fma ops can be split into legal fma ops.
+define void @test_fma_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>* %p) nounwind readnone ssp {
+; CHECK: test_fma_v8f32
+; CHECK: vfma.f32
+; CHECK: vfma.f32
+entry:
+ %call = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone
+ store <8 x float> %call, <8 x float>* %p, align 16
+ ret void
+}
+
+
declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/iabs.ll b/test/CodeGen/ARM/iabs.ll
index 89e309d..600a8c2 100644
--- a/test/CodeGen/ARM/iabs.ll
+++ b/test/CodeGen/ARM/iabs.ll
@@ -10,7 +10,25 @@ define i32 @test(i32 %a) {
%b = icmp sgt i32 %a, -1
%abs = select i1 %b, i32 %a, i32 %tmp1neg
ret i32 %abs
-; CHECK: movs r0, r0
+; CHECK: cmp
; CHECK: rsbmi r0, r0, #0
; CHECK: bx lr
}
+
+; rdar://11633193
+;; 3 instructions will be generated for abs(a-b):
+;; subs
+;; rsbmi
+;; bx
+define i32 @test2(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK: test2
+; CHECK: subs
+; CHECK-NEXT: rsbmi
+; CHECK-NEXT: bx
+ %sub = sub nsw i32 %a, %b
+ %cmp = icmp sgt i32 %sub, -1
+ %sub1 = sub nsw i32 0, %sub
+ %cond = select i1 %cmp, i32 %sub, i32 %sub1
+ ret i32 %cond
+}
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index 3f8fd75..73b546d 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast | FileCheck %s -check-prefix=M3
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3
; rdar://6949835
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY
@@ -18,7 +18,6 @@ entry:
; M3: t:
; M3-NOT: ldrd
-; M3: ldm.w r2, {r2, r3}
%0 = load i64** @b, align 4
%1 = load i64* %0, align 4
diff --git a/test/CodeGen/ARM/lsr-scale-addr-mode.ll b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
index 8130019..0c8d387 100644
--- a/test/CodeGen/ARM/lsr-scale-addr-mode.ll
+++ b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep lsl | grep -F {lsl #2\]}
+; RUN: llc < %s -march=arm | grep lsl | grep -F "lsl #2]"
; Should use scaled addressing mode.
define void @sintzero(i32* %a) nounwind {
diff --git a/test/CodeGen/ARM/movt-movw-global.ll b/test/CodeGen/ARM/movt-movw-global.ll
index 991d728..bbedea1 100644
--- a/test/CodeGen/ARM/movt-movw-global.ll
+++ b/test/CodeGen/ARM/movt-movw-global.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=armv7-eabi | FileCheck %s -check-prefix=EABI
-; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS
-; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=pic | FileCheck %s -check-prefix=IOS-PIC
-; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=static | FileCheck %s -check-prefix=IOS-STATIC
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-eabi | FileCheck %s -check-prefix=EABI
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=pic | FileCheck %s -check-prefix=IOS-PIC
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=static | FileCheck %s -check-prefix=IOS-STATIC
@foo = common global i32 0
diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll
index de48fee..4a82c36 100644
--- a/test/CodeGen/ARM/neon_div.ll
+++ b/test/CodeGen/ARM/neon_div.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source -disable-post-ra | FileCheck %s
define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrecpe.f32
-;CHECK: vrecpe.f32
;CHECK: vmovn.i32
+;CHECK: vrecpe.f32
;CHECK: vmovn.i32
;CHECK: vmovn.i16
%tmp1 = load <8 x i8>* %A
@@ -15,10 +15,10 @@ define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
define <8 x i8> @udivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrecpe.f32
;CHECK: vrecps.f32
+;CHECK: vmovn.i32
;CHECK: vrecpe.f32
;CHECK: vrecps.f32
;CHECK: vmovn.i32
-;CHECK: vmovn.i32
;CHECK: vqmovun.s16
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll
index b4da552..df98e23 100644
--- a/test/CodeGen/ARM/opt-shuff-tstore.ll
+++ b/test/CodeGen/ARM/opt-shuff-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -mattr=+neon < %s | FileCheck %s
; CHECK: func_4_8
; CHECK: vst1.32
diff --git a/test/CodeGen/ARM/pr13249.ll b/test/CodeGen/ARM/pr13249.ll
new file mode 100644
index 0000000..4bc8810
--- /dev/null
+++ b/test/CodeGen/ARM/pr13249.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple armv7--linux-gnueabi
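+; This test has no CHECK lines; it only verifies that the function compiles
+; without crashing (PR13249).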
+
+define arm_aapcscc i8* @__strtok_r_1c(i8* %arg, i8 signext %arg1, i8** nocapture %arg2) nounwind {
+bb:
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb
+ %tmp = phi i8* [ %tmp5, %bb3 ], [ %arg, %bb ]
+ %tmp4 = load i8* %tmp, align 1
+ %tmp5 = getelementptr inbounds i8* %tmp, i32 1
+ br i1 undef, label %bb3, label %bb7
+
+bb7: ; preds = %bb13, %bb3
+ %tmp8 = phi i8 [ %tmp14, %bb13 ], [ %tmp4, %bb3 ]
+ %tmp9 = phi i8* [ %tmp12, %bb13 ], [ %tmp, %bb3 ]
+ %tmp10 = icmp ne i8 %tmp8, %arg1
+ %tmp12 = getelementptr inbounds i8* %tmp9, i32 1
+ br i1 %tmp10, label %bb13, label %bb15
+
+bb13: ; preds = %bb7
+ %tmp14 = load i8* %tmp12, align 1
+ br label %bb7
+
+bb15: ; preds = %bb7
+ store i8* %tmp9, i8** %arg2, align 4
+ ret i8* %tmp
+}
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 3e07da8..418d4f3 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -113,3 +113,29 @@ entry:
call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, [2 x i32], i32, float)*)(i8* undef, i8* undef, [2 x i32] %tmp493, i32 0, float 1.000000e+00) optsize
ret void
}
+
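+; Converting an i1 compare result to float must not go through the
+; floatsisf/floatunsisf libcalls.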
+; CHECK: f10
+define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+; CHECK-NOT: floatsisf
+ %1 = icmp eq i32 %a, %b
+ %2 = zext i1 %1 to i32
+ %3 = sitofp i32 %2 to float
+ ret float %3
+}
+
+; CHECK: f11
+define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+; CHECK-NOT: floatsisf
+ %1 = icmp eq i32 %a, %b
+ %2 = sitofp i1 %1 to float
+ ret float %2
+}
+
+; CHECK: f12
+define float @f12(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+; CHECK-NOT: floatunsisf
+ %1 = icmp eq i32 %a, %b
+ %2 = uitofp i1 %1 to float
+ ret float %2
+}
+
diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll
new file mode 100644
index 0000000..99df0d4
--- /dev/null
+++ b/test/CodeGen/ARM/smml.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
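+; Make sure we do not select smmls here: smmls subtracts the full 64-bit
+; product before taking the high half, which is not equivalent to subtracting
+; the truncated shifted product.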
+define i32 @f(i32 %a, i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+; CHECK-NOT: smmls
+ %conv4 = zext i32 %a to i64
+ %conv1 = sext i32 %b to i64
+ %conv2 = sext i32 %c to i64
+ %mul = mul nsw i64 %conv2, %conv1
+ %shr5 = lshr i64 %mul, 32
+ %sub = sub nsw i64 %conv4, %shr5
+ %conv3 = trunc i64 %sub to i32
+ ret i32 %conv3
+}
diff --git a/test/CodeGen/ARM/str_pre-2.ll b/test/CodeGen/ARM/str_pre-2.ll
index 983ba45..5ce2bce 100644
--- a/test/CodeGen/ARM/str_pre-2.ll
+++ b/test/CodeGen/ARM/str_pre-2.ll
@@ -1,13 +1,12 @@
-; RUN: llc < %s -mtriple=armv6-linux-gnu -regalloc=basic | FileCheck %s
-
-; The greedy register allocator uses a single CSR here, invalidating the test.
+; RUN: llc < %s -mtriple=armv6-linux-gnu | FileCheck %s
@b = external global i64*
define i64 @t(i64 %a) nounwind readonly {
entry:
-; CHECK: push {lr}
-; CHECK: pop {lr}
+; CHECK: push {r4, r5, lr}
+; CHECK: pop {r4, r5, pc}
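+; The inline asm clobber of r4 and r5 forces the prologue to save them, so the
+; push/pop checks no longer depend on which register allocator is used.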
+ call void asm sideeffect "", "~{r4},~{r5}"() nounwind
%0 = load i64** @b, align 4
%1 = load i64* %0, align 4
%2 = mul i64 %1, %a
diff --git a/test/CodeGen/ARM/str_pre.ll b/test/CodeGen/ARM/str_pre.ll
index e56e3f2..d8b3f0e 100644
--- a/test/CodeGen/ARM/str_pre.ll
+++ b/test/CodeGen/ARM/str_pre.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm | \
-; RUN: grep {str.*\\!} | count 2
+; RUN: grep "str.*\!" | count 2
define void @test1(i32* %X, i32* %A, i32** %dest) {
%B = load i32* %A ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll
new file mode 100644
index 0000000..99ba475
--- /dev/null
+++ b/test/CodeGen/ARM/struct_byval.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s
+
+; rdar://9877866
+%struct.SmallStruct = type { i32, [8 x i32], [37 x i8] }
+%struct.LargeStruct = type { i32, [1001 x i8], [300 x i32] }
+
+define i32 @f() nounwind ssp {
+entry:
+; CHECK: f:
+; CHECK: ldr
+; CHECK: str
+; CHECK-NOT: bne
+ %st = alloca %struct.SmallStruct, align 4
+ %call = call i32 @e1(%struct.SmallStruct* byval %st)
+ ret i32 0
+}
+
+; Generate a loop for large struct byval
+define i32 @g() nounwind ssp {
+entry:
+; CHECK: g:
+; CHECK: ldr
+; CHECK: sub
+; CHECK: str
+; CHECK: bne
+ %st = alloca %struct.LargeStruct, align 4
+ %call = call i32 @e2(%struct.LargeStruct* byval %st)
+ ret i32 0
+}
+
+; Generate a loop using NEON instructions
+define i32 @h() nounwind ssp {
+entry:
+; CHECK: h:
+; CHECK: vld1
+; CHECK: sub
+; CHECK: vst1
+; CHECK: bne
+ %st = alloca %struct.LargeStruct, align 16
+ %call = call i32 @e3(%struct.LargeStruct* byval align 16 %st)
+ ret i32 0
+}
+
+declare i32 @e1(%struct.SmallStruct* nocapture byval %in) nounwind
+declare i32 @e2(%struct.LargeStruct* nocapture byval %in) nounwind
+declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
new file mode 100644
index 0000000..6fcbdee
--- /dev/null
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+
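+; When a sub computes the value that a compare would test, the compare can be
+; folded into a flags-setting subs and the separate cmp removed.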
+define i32 @f(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: f:
+; CHECK: subs
+; CHECK-NOT: cmp
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub nsw i32 %a, %b
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %sub.
+}
+
+define i32 @g(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: g:
+; CHECK: subs
+; CHECK-NOT: cmp
+ %cmp = icmp slt i32 %a, %b
+ %sub = sub nsw i32 %b, %a
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %sub.
+}
+
+define i32 @h(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: h:
+; CHECK: subs
+; CHECK-NOT: cmp
+ %cmp = icmp sgt i32 %a, 3
+ %sub = sub nsw i32 %a, 3
+ %sub. = select i1 %cmp, i32 %sub, i32 %b
+ ret i32 %sub.
+}
+
+; rdar://11725965
+define i32 @i(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK: i:
+; CHECK: subs
+; CHECK-NOT: cmp
+ %cmp = icmp ult i32 %a, %b
+ %sub = sub i32 %b, %a
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %sub.
+}
+; If CPSR is live-out, we can't remove the cmp if there exists a
+; swapped sub.
+define i32 @j(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: j:
+; CHECK: sub
+; CHECK: cmp
+ %cmp = icmp eq i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = icmp sgt i32 %b, %a
+ %sel = select i1 %cmp2, i32 %sub, i32 %a
+ ret i32 %sel
+
+if.else:
+ ret i32 %sub
+}
diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll
index 06ea703..474043a 100644
--- a/test/CodeGen/ARM/sub.ll
+++ b/test/CodeGen/ARM/sub.ll
@@ -36,3 +36,15 @@ entry:
%sel = select i1 %cmp, i32 1, i32 %sub
ret i32 %sel
}
+
+; rdar://11726136
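+; add x, -65535 should be selected as a sub of the movw-materialized constant
+; 65535, avoiding a movt/add sequence.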
+define i32 @f5(i32 %x) {
+entry:
+; CHECK: f5
+; CHECK: movw r1, #65535
+; CHECK-NOT: movt
+; CHECK-NOT: add
+; CHECK: sub r0, r0, r1
+ %sub = add i32 %x, -65535
+ ret i32 %sub
+}
diff --git a/test/CodeGen/ARM/thread_pointer.ll b/test/CodeGen/ARM/thread_pointer.ll
index 3143387..c403fa5 100644
--- a/test/CodeGen/ARM/thread_pointer.ll
+++ b/test/CodeGen/ARM/thread_pointer.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN: grep {__aeabi_read_tp}
+; RUN: grep "__aeabi_read_tp"
define i8* @test() {
entry:
diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index 28fd469..a25352c 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll
@@ -3,10 +3,10 @@
define i32 @test(i32 %a, i32 %b) {
entry:
-; CHECK: movs.w
+; CHECK: cmp
; CHECK-NEXT: it mi
; CHECK-NEXT: rsbmi
-; CHECK-NEXT: movs.w
+; CHECK-NEXT: cmp
; CHECK-NEXT: it mi
; CHECK-NEXT: rsbmi
%cmp1 = icmp slt i32 %a, 0
diff --git a/test/CodeGen/ARM/tls-models.ll b/test/CodeGen/ARM/tls-models.ll
new file mode 100644
index 0000000..a5f3c90
--- /dev/null
+++ b/test/CodeGen/ARM/tls-models.ll
@@ -0,0 +1,117 @@
+; RUN: llc -march=arm -mtriple=arm-linux-gnueabi < %s | FileCheck -check-prefix=CHECK-NONPIC %s
+; RUN: llc -march=arm -mtriple=arm-linux-gnueabi -relocation-model=pic < %s | FileCheck -check-prefix=CHECK-PIC %s
+
+
+@external_gd = external thread_local global i32
+@internal_gd = internal thread_local global i32 42
+
+@external_ld = external thread_local(localdynamic) global i32
+@internal_ld = internal thread_local(localdynamic) global i32 42
+
+@external_ie = external thread_local(initialexec) global i32
+@internal_ie = internal thread_local(initialexec) global i32 42
+
+@external_le = external thread_local(localexec) global i32
+@internal_le = internal thread_local(localexec) global i32 42
+
+; ----- no model specified -----
+
+define i32* @f1() {
+entry:
+ ret i32* @external_gd
+
+ ; Non-PIC code can use initial exec; PIC code has to use general dynamic.
+ ; CHECK-NONPIC: f1:
+ ; CHECK-NONPIC: external_gd(gottpoff)
+ ; CHECK-PIC: f1:
+ ; CHECK-PIC: external_gd(tlsgd)
+}
+
+define i32* @f2() {
+entry:
+ ret i32* @internal_gd
+
+ ; Non-PIC code can use local exec; PIC code can use local dynamic,
+ ; but that is not implemented, so it falls back to general dynamic.
+ ; CHECK-NONPIC: f2:
+ ; CHECK-NONPIC: internal_gd(tpoff)
+ ; CHECK-PIC: f2:
+ ; CHECK-PIC: internal_gd(tlsgd)
+}
+
+
+; ----- localdynamic specified -----
+
+define i32* @f3() {
+entry:
+ ret i32* @external_ld
+
+ ; Non-PIC code can use initial exec; PIC code should use local dynamic,
+ ; but that is not implemented, so it falls back to general dynamic.
+ ; CHECK-NONPIC: f3:
+ ; CHECK-NONPIC: external_ld(gottpoff)
+ ; CHECK-PIC: f3:
+ ; CHECK-PIC: external_ld(tlsgd)
+}
+
+define i32* @f4() {
+entry:
+ ret i32* @internal_ld
+
+ ; Non-PIC code can use local exec; PIC code can use local dynamic,
+ ; but that is not implemented, so it falls back to general dynamic.
+ ; CHECK-NONPIC: f4:
+ ; CHECK-NONPIC: internal_ld(tpoff)
+ ; CHECK-PIC: f4:
+ ; CHECK-PIC: internal_ld(tlsgd)
+}
+
+
+; ----- initialexec specified -----
+
+define i32* @f5() {
+entry:
+ ret i32* @external_ie
+
+ ; Non-PIC and PIC code will use initial exec as specified.
+ ; CHECK-NONPIC: f5:
+ ; CHECK-NONPIC: external_ie(gottpoff)
+ ; CHECK-PIC: f5:
+ ; CHECK-PIC: external_ie(gottpoff)
+}
+
+define i32* @f6() {
+entry:
+ ret i32* @internal_ie
+
+ ; Non-PIC code can use local exec; PIC code uses initial exec as specified.
+ ; CHECK-NONPIC: f6:
+ ; CHECK-NONPIC: internal_ie(tpoff)
+ ; CHECK-PIC: f6:
+ ; CHECK-PIC: internal_ie(gottpoff)
+}
+
+
+; ----- localexec specified -----
+
+define i32* @f7() {
+entry:
+ ret i32* @external_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; CHECK-NONPIC: f7:
+ ; CHECK-NONPIC: external_le(tpoff)
+ ; CHECK-PIC: f7:
+ ; CHECK-PIC: external_le(tpoff)
+}
+
+define i32* @f8() {
+entry:
+ ret i32* @internal_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; CHECK-NONPIC: f8:
+ ; CHECK-NONPIC: internal_le(tpoff)
+ ; CHECK-PIC: f8:
+ ; CHECK-PIC: internal_le(tpoff)
+}
diff --git a/test/CodeGen/ARM/tls1.ll b/test/CodeGen/ARM/tls1.ll
index 1087094..ec4278c 100644
--- a/test/CodeGen/ARM/tls1.ll
+++ b/test/CodeGen/ARM/tls1.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN: grep {i(tpoff)}
+; RUN: grep "i(tpoff)"
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN: grep {__aeabi_read_tp}
+; RUN: grep "__aeabi_read_tp"
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi \
-; RUN: -relocation-model=pic | grep {__tls_get_addr}
+; RUN: -relocation-model=pic | grep "__tls_get_addr"
@i = thread_local global i32 15 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/ARM/tls3.ll b/test/CodeGen/ARM/tls3.ll
index df7a4ca..e0e944f 100644
--- a/test/CodeGen/ARM/tls3.ll
+++ b/test/CodeGen/ARM/tls3.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN: grep {tbss}
+; RUN: grep "tbss"
%struct.anon = type { i32, i32 }
@teste = internal thread_local global %struct.anon zeroinitializer ; <%struct.anon*> [#uses=1]
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
new file mode 100644
index 0000000..4e227dd
--- /dev/null
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -0,0 +1,21 @@
+; Tests for the two-address instruction pass.
+; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s
+
+define void @PR13378() nounwind {
+; This was originally a crasher trying to schedule the instructions.
+; CHECK: PR13378:
+; CHECK: vldmia
+; CHECK-NEXT: vmov.f32
+; CHECK-NEXT: vstmia
+; CHECK-NEXT: vstmia
+; CHECK-NEXT: vmov.f32
+; CHECK-NEXT: vstmia
+
+entry:
+ %0 = load <4 x float>* undef
+ store <4 x float> zeroinitializer, <4 x float>* undef
+ store <4 x float> %0, <4 x float>* undef
+ %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 3
+ store <4 x float> %1, <4 x float>* undef
+ unreachable
+}
diff --git a/test/CodeGen/ARM/unsafe-fsub.ll b/test/CodeGen/ARM/unsafe-fsub.ll
new file mode 100644
index 0000000..3a4477d
--- /dev/null
+++ b/test/CodeGen/ARM/unsafe-fsub.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=SAFE %s
+; RUN: llc -march=arm -mcpu=cortex-a9 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=FAST %s
+
+target triple = "armv7-apple-ios"
+
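+; x*y - x*y only folds to 0.0 under unsafe-fp-math, since the product could be
+; NaN or infinity; the safe version must keep the vmul and vsub.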
+; SAFE: test
+; FAST: test
+define float @test(float %x, float %y) {
+entry:
+; SAFE: vmul.f32
+; SAFE: vsub.f32
+; FAST: mov r0, #0
+ %0 = fmul float %x, %y
+ %1 = fsub float %0, %0
+ ret float %1
+}
+
+
diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll
index 450f90d..9f55c24 100644
--- a/test/CodeGen/ARM/vcnt.ll
+++ b/test/CodeGen/ARM/vcnt.ll
@@ -1,79 +1,80 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; NB: this tests vcnt, vclz, and vcls
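+; vcnt and vclz are now matched from the generic llvm.ctpop/llvm.ctlz
+; intrinsics instead of the ARM-specific llvm.arm.neon.* forms.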
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
;CHECK: vcnt8:
-;CHECK: vcnt.8
+;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
%tmp1 = load <8 x i8>* %A
- %tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
+ %tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
;CHECK: vcntQ8:
-;CHECK: vcnt.8
+;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
%tmp1 = load <16 x i8>* %A
- %tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
+ %tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
-declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
;CHECK: vclz8:
-;CHECK: vclz.i8
+;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
%tmp1 = load <8 x i8>* %A
- %tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
+ %tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
;CHECK: vclz16:
-;CHECK: vclz.i16
+;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
%tmp1 = load <4 x i16>* %A
- %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
+ %tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
;CHECK: vclz32:
-;CHECK: vclz.i32
+;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
%tmp1 = load <2 x i32>* %A
- %tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
+ %tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
;CHECK: vclzQ8:
-;CHECK: vclz.i8
+;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
%tmp1 = load <16 x i8>* %A
- %tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
+ %tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
;CHECK: vclzQ16:
-;CHECK: vclz.i16
+;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
%tmp1 = load <8 x i16>* %A
- %tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
+ %tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
;CHECK: vclzQ32:
-;CHECK: vclz.i32
+;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
%tmp1 = load <4 x i32>* %A
- %tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
+ %tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
ret <4 x i32> %tmp2
}
-declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
;CHECK: vclss8:
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index 1ec36da..8fd3db2 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -20,7 +20,9 @@ define float @f(<4 x i16>* nocapture %in) {
; CHECK: g:
define float @g(<4 x i8>* nocapture %in) {
- ; CHECK: vldr
+; Note: vld1 here is reasonably important. Mixing VFP and NEON
+; instructions is bad on some cores.
+ ; CHECK: vld1
; CHECK: vmovl.u8
; CHECK: vmovl.u16
%1 = load <4 x i8>* %in
@@ -47,7 +49,9 @@ define <4 x i8> @h(<4 x float> %v) {
; CHECK: i:
define <4 x i8> @i(<4 x i8>* %x) {
- ; CHECK: vldr
+; Note: vld1 here is reasonably important. Mixing VFP and NEON
+; instructions is bad on some cores.
+ ; CHECK: vld1
; CHECK: vmovl.s8
; CHECK: vmovl.s16
; CHECK: vrecpe
diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll
index 49a6982..7a4b34f 100644
--- a/test/CodeGen/ARM/vfp.ll
+++ b/test/CodeGen/ARM/vfp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 -disable-post-ra | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+vfp2 -disable-post-ra -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 -disable-post-ra -regalloc=basic | FileCheck %s
define void @test(float* %P, double* %D) {
%A = load float* %P ; <float> [#uses=1]
@@ -17,11 +17,11 @@ define void @test_abs(float* %P, double* %D) {
;CHECK: test_abs:
%a = load float* %P ; <float> [#uses=1]
;CHECK: vabs.f32
- %b = call float @fabsf( float %a ) ; <float> [#uses=1]
+ %b = call float @fabsf( float %a ) readnone ; <float> [#uses=1]
store float %b, float* %P
%A = load double* %D ; <double> [#uses=1]
;CHECK: vabs.f64
- %B = call double @fabs( double %A ) ; <double> [#uses=1]
+ %B = call double @fabs( double %A ) readnone ; <double> [#uses=1]
store double %B, double* %D
ret void
}
diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll
index 61d73c1..c69473f 100644
--- a/test/CodeGen/ARM/vlddup.ll
+++ b/test/CodeGen/ARM/vlddup.ll
@@ -75,12 +75,12 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind {
ret <8 x i8> %tmp5
}
-define <4 x i16> @vld2dupi16(i16* %A) nounwind {
+define <4 x i16> @vld2dupi16(i8* %A) nounwind {
;CHECK: vld2dupi16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld2.16 {d16[], d17[]}, [r0]
- %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
@@ -94,7 +94,8 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
;CHECK: vld2dupi16_update:
;CHECK: vld2.16 {d16[], d17[]}, [r1]!
%A = load i16** %ptr
- %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %A2 = bitcast i16* %A to i8*
+ %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
@@ -105,11 +106,11 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
ret <4 x i16> %tmp5
}
-define <2 x i32> @vld2dupi32(i32* %A) nounwind {
+define <2 x i32> @vld2dupi32(i8* %A) nounwind {
;CHECK: vld2dupi32:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld2.32 {d16[], d17[]}, [r0, :64]
- %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
+ %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
%tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
@@ -119,8 +120,8 @@ define <2 x i32> @vld2dupi32(i32* %A) nounwind {
}
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
-declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
-declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
@@ -144,11 +145,11 @@ define <8 x i8> @vld3dupi8_update(i8** %ptr, i32 %inc) nounwind {
ret <8 x i8> %tmp8
}
-define <4 x i16> @vld3dupi16(i16* %A) nounwind {
+define <4 x i16> @vld3dupi16(i8* %A) nounwind {
;CHECK: vld3dupi16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0]
- %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
+ %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
@@ -161,7 +162,7 @@ define <4 x i16> @vld3dupi16(i16* %A) nounwind {
}
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
@@ -171,7 +172,8 @@ define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
;CHECK: vld4dupi16_update:
;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
%A = load i16** %ptr
- %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
+ %A2 = bitcast i16* %A to i8*
+ %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
%tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
@@ -188,12 +190,12 @@ define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
ret <4 x i16> %tmp11
}
-define <2 x i32> @vld4dupi32(i32* %A) nounwind {
+define <2 x i32> @vld4dupi32(i8* %A) nounwind {
;CHECK: vld4dupi32:
;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0, :64]
- %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
+ %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
@@ -208,5 +210,5 @@ define <2 x i32> @vld4dupi32(i32* %A) nounwind {
ret <2 x i32> %tmp11
}
-declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 61d89bb..74628f0 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -525,3 +525,77 @@ define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) {
%3 = extractelement <8 x i16> %2, i32 0
ret i16 %3
}
+
+; A constant build_vector created for a vmull with half-width elements must
+; not introduce illegal types. <rdar://problem/11324364>
+define void @vmull_buildvector() nounwind optsize ssp align 2 {
+; CHECK: vmull_buildvector
+entry:
+ br i1 undef, label %for.end179, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.loopexit: ; preds = %for.body33, %for.body
+ br i1 undef, label %for.end179, label %for.body
+
+for.body: ; preds = %for.cond.loopexit, %for.body.lr.ph
+ br i1 undef, label %for.cond.loopexit, label %for.body33.lr.ph
+
+for.body33.lr.ph: ; preds = %for.body
+ %.sub = select i1 undef, i32 0, i32 undef
+ br label %for.body33
+
+for.body33: ; preds = %for.body33, %for.body33.lr.ph
+ %add45 = add i32 undef, undef
+ %vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* undef, i32 1)
+ %0 = load i32** undef, align 4
+ %shuffle.i250 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+ %1 = bitcast <1 x i64> %shuffle.i250 to <8 x i8>
+ %vmovl.i249 = zext <8 x i8> %1 to <8 x i16>
+ %shuffle.i246 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+ %shuffle.i240 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> <i32 1>
+ %2 = bitcast <1 x i64> %shuffle.i240 to <8 x i8>
+ %3 = bitcast <16 x i8> undef to <2 x i64>
+ %vmovl.i237 = zext <8 x i8> undef to <8 x i16>
+ %shuffle.i234 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+ %shuffle.i226 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+ %vmovl.i225 = zext <8 x i8> undef to <8 x i16>
+ %mul.i223 = mul <8 x i16> %vmovl.i249, %vmovl.i249
+ %vshl_n = shl <8 x i16> %mul.i223, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ %vqsub2.i216 = tail call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind
+ %mul.i209 = mul <8 x i16> undef, <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>
+ %vshr_n130 = lshr <8 x i16> undef, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vshr_n134 = lshr <8 x i16> %mul.i209, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %sub.i205 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n130
+ %sub.i203 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n134
+ %add.i200 = add <8 x i16> %sub.i205, <i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96>
+ %add.i198 = add <8 x i16> %add.i200, %sub.i203
+ %mul.i194 = mul <8 x i16> %add.i198, %vmovl.i237
+ %mul.i191 = mul <8 x i16> %vshr_n130, undef
+ %add.i192 = add <8 x i16> %mul.i191, %mul.i194
+ %mul.i187 = mul <8 x i16> %vshr_n134, undef
+ %add.i188 = add <8 x i16> %mul.i187, %add.i192
+ %mul.i185 = mul <8 x i16> undef, undef
+ %add.i186 = add <8 x i16> %mul.i185, undef
+ %vrshr_n160 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i188, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %vrshr_n163 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i186, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %mul.i184 = mul <8 x i16> undef, %vrshr_n160
+ %mul.i181 = mul <8 x i16> undef, %vmovl.i225
+ %add.i182 = add <8 x i16> %mul.i181, %mul.i184
+ %vrshr_n170 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i182, <8 x i16> <i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7>)
+ %vqmovn1.i180 = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %vrshr_n170) nounwind
+ %4 = bitcast <8 x i8> %vqmovn1.i180 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %4, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %5 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ store <16 x i8> %5, <16 x i8>* undef, align 16
+ %add177 = add nsw i32 undef, 16
+ br i1 undef, label %for.body33, label %for.cond.loopexit
+
+for.end179: ; preds = %for.cond.loopexit, %entry
+ ret void
+}
+
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
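For context on vmull_buildvector above: the ARM backend forms vmull from a mul whose operands are both extended from half-width vectors, and a constant operand reaches that pattern as a build_vector that must still be expressible at the half width. A minimal standalone sketch of the legal shape (hypothetical function, not from this patch):

; Sketch: a mul of two operands widened from <8 x i8> is matched as vmull.u8;
; the splat-of-3 constant is the half-width build_vector the comment refers to.
define <8 x i16> @vmull_sketch(<8 x i8> %a) nounwind {
  %wide = zext <8 x i8> %a to <8 x i16>
  %m = mul <8 x i16> %wide, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %m
}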
diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll
index e3372a0..f117ab2 100644
--- a/test/CodeGen/ARM/vst3.ll
+++ b/test/CodeGen/ARM/vst3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -disable-arm-fast-isel -O0 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -fast-isel=0 -O0 | FileCheck %s
define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst3i8:
diff --git a/test/CodeGen/CPP/2007-06-16-Funcname.ll b/test/CodeGen/CPP/2007-06-16-Funcname.ll
index 71fea12..16e9798 100644
--- a/test/CodeGen/CPP/2007-06-16-Funcname.ll
+++ b/test/CodeGen/CPP/2007-06-16-Funcname.ll
@@ -5,3 +5,4 @@ define void @foo() {
ret void
}
+
diff --git a/test/CodeGen/CellSPU/fcmp32.ll b/test/CodeGen/CellSPU/fcmp32.ll
index c14fd7b..f6b028d 100644
--- a/test/CodeGen/CellSPU/fcmp32.ll
+++ b/test/CodeGen/CellSPU/fcmp32.ll
@@ -1,4 +1,4 @@
-; RUN: llc --march=cellspu %s -o - | FileCheck %s
+; RUN: llc --mtriple=cellspu-unknown-elf %s -o - | FileCheck %s
; Exercise the floating point comparison operators for f32:
@@ -15,8 +15,8 @@ define i1 @fcmp_eq(float %arg1, float %arg2) {
define i1 @fcmp_mag_eq(float %arg1, float %arg2) {
; CHECK: fcmeq
; CHECK: bi $lr
- %1 = call float @fabsf(float %arg1)
- %2 = call float @fabsf(float %arg2)
+ %1 = call float @fabsf(float %arg1) readnone
+ %2 = call float @fabsf(float %arg2) readnone
%3 = fcmp oeq float %1, %2
ret i1 %3
}
diff --git a/test/CodeGen/CellSPU/fneg-fabs.ll b/test/CodeGen/CellSPU/fneg-fabs.ll
index 1e5e3b3..6e01906 100644
--- a/test/CodeGen/CellSPU/fneg-fabs.ll
+++ b/test/CodeGen/CellSPU/fneg-fabs.ll
@@ -32,11 +32,11 @@ declare double @fabs(double)
declare float @fabsf(float)
define double @fabs_dp(double %X) {
- %Y = call double @fabs( double %X )
+ %Y = call double @fabs( double %X ) readnone
ret double %Y
}
define float @fabs_sp(float %X) {
- %Y = call float @fabsf( float %X )
+ %Y = call float @fabsf( float %X ) readnone
ret float %Y
}
diff --git a/test/CodeGen/CellSPU/icmp16.ll b/test/CodeGen/CellSPU/icmp16.ll
index 32b1261..2f9b091 100644
--- a/test/CodeGen/CellSPU/icmp16.ll
+++ b/test/CodeGen/CellSPU/icmp16.ll
@@ -1,14 +1,4 @@
-; RUN: llc < %s -march=cellspu > %t1.s
-; RUN: grep ilh %t1.s | count 15
-; RUN: grep ceqh %t1.s | count 29
-; RUN: grep ceqhi %t1.s | count 13
-; RUN: grep clgth %t1.s | count 15
-; RUN: grep cgth %t1.s | count 14
-; RUN: grep cgthi %t1.s | count 6
-; RUN: grep {selb\t\\\$3, \\\$6, \\\$5, \\\$3} %t1.s | count 7
-; RUN: grep {selb\t\\\$3, \\\$5, \\\$6, \\\$3} %t1.s | count 3
-; RUN: grep {selb\t\\\$3, \\\$5, \\\$4, \\\$3} %t1.s | count 17
-; RUN: grep {selb\t\\\$3, \\\$4, \\\$5, \\\$3} %t1.s | count 6
+; RUN: llc < %s -march=cellspu | FileCheck %s
target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
target triple = "spu"
@@ -27,6 +17,10 @@ target triple = "spu"
; i16 integer comparisons:
define i16 @icmp_eq_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_eq_select_i16:
+; CHECK: ceqh
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp eq i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -34,12 +28,22 @@ entry:
}
define i1 @icmp_eq_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_eq_setcc_i16:
+; CHECK: ilhu
+; CHECK: ceqh
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp eq i16 %arg1, %arg2
ret i1 %A
}
define i16 @icmp_eq_immed01_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_eq_immed01_i16:
+; CHECK: ceqhi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i16 %arg1, 511
%B = select i1 %A, i16 %val1, i16 %val2
@@ -47,6 +51,10 @@ entry:
}
define i16 @icmp_eq_immed02_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_eq_immed02_i16:
+; CHECK: ceqhi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i16 %arg1, -512
%B = select i1 %A, i16 %val1, i16 %val2
@@ -54,6 +62,10 @@ entry:
}
define i16 @icmp_eq_immed03_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_eq_immed03_i16:
+; CHECK: ceqhi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i16 %arg1, -1
%B = select i1 %A, i16 %val1, i16 %val2
@@ -61,6 +73,11 @@ entry:
}
define i16 @icmp_eq_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_eq_immed04_i16:
+; CHECK: ilh
+; CHECK: ceqh
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i16 %arg1, 32768
%B = select i1 %A, i16 %val1, i16 %val2
@@ -68,6 +85,10 @@ entry:
}
define i16 @icmp_ne_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ne_select_i16:
+; CHECK: ceqh
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp ne i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -75,12 +96,23 @@ entry:
}
define i1 @icmp_ne_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ne_setcc_i16:
+; CHECK: ceqh
+; CHECK: ilhu
+; CHECK: xorhi
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ne i16 %arg1, %arg2
ret i1 %A
}
define i16 @icmp_ne_immed01_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ne_immed01_i16:
+; CHECK: ceqhi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i16 %arg1, 511
%B = select i1 %A, i16 %val1, i16 %val2
@@ -88,6 +120,10 @@ entry:
}
define i16 @icmp_ne_immed02_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ne_immed02_i16:
+; CHECK: ceqhi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i16 %arg1, -512
%B = select i1 %A, i16 %val1, i16 %val2
@@ -95,6 +131,10 @@ entry:
}
define i16 @icmp_ne_immed03_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ne_immed03_i16:
+; CHECK: ceqhi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i16 %arg1, -1
%B = select i1 %A, i16 %val1, i16 %val2
@@ -102,6 +142,11 @@ entry:
}
define i16 @icmp_ne_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ne_immed04_i16:
+; CHECK: ilh
+; CHECK: ceqh
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i16 %arg1, 32768
%B = select i1 %A, i16 %val1, i16 %val2
@@ -109,6 +154,10 @@ entry:
}
define i16 @icmp_ugt_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ugt_select_i16:
+; CHECK: clgth
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp ugt i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -116,12 +165,22 @@ entry:
}
define i1 @icmp_ugt_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ugt_setcc_i16:
+; CHECK: ilhu
+; CHECK: clgth
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ugt i16 %arg1, %arg2
ret i1 %A
}
define i16 @icmp_ugt_immed01_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ugt_immed01_i16:
+; CHECK: clgthi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i16 %arg1, 500
%B = select i1 %A, i16 %val1, i16 %val2
@@ -129,6 +188,10 @@ entry:
}
define i16 @icmp_ugt_immed02_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ugt_immed02_i16:
+; CHECK: ceqhi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ugt i16 %arg1, 0
%B = select i1 %A, i16 %val1, i16 %val2
@@ -136,6 +199,10 @@ entry:
}
define i16 @icmp_ugt_immed03_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ugt_immed03_i16:
+; CHECK: clgthi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i16 %arg1, 65024
%B = select i1 %A, i16 %val1, i16 %val2
@@ -143,6 +210,11 @@ entry:
}
define i16 @icmp_ugt_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ugt_immed04_i16:
+; CHECK: ilh
+; CHECK: clgth
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i16 %arg1, 32768
%B = select i1 %A, i16 %val1, i16 %val2
@@ -150,6 +222,12 @@ entry:
}
define i16 @icmp_uge_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_uge_select_i16:
+; CHECK: ceqh
+; CHECK: clgth
+; CHECK: or
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp uge i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -157,6 +235,14 @@ entry:
}
define i1 @icmp_uge_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_uge_setcc_i16:
+; CHECK: ceqh
+; CHECK: clgth
+; CHECK: ilhu
+; CHECK: or
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp uge i16 %arg1, %arg2
ret i1 %A
@@ -169,6 +255,12 @@ entry:
;; they'll ever be generated.
define i16 @icmp_ult_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ult_select_i16:
+; CHECK: ceqh
+; CHECK: clgth
+; CHECK: nor
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp ult i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -176,12 +268,26 @@ entry:
}
define i1 @icmp_ult_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ult_setcc_i16:
+; CHECK: ceqh
+; CHECK: clgth
+; CHECK: ilhu
+; CHECK: nor
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ult i16 %arg1, %arg2
ret i1 %A
}
define i16 @icmp_ult_immed01_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ult_immed01_i16:
+; CHECK: ceqhi
+; CHECK: clgthi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i16 %arg1, 511
%B = select i1 %A, i16 %val1, i16 %val2
@@ -189,6 +295,12 @@ entry:
}
define i16 @icmp_ult_immed02_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ult_immed02_i16:
+; CHECK: ceqhi
+; CHECK: clgthi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i16 %arg1, 65534
%B = select i1 %A, i16 %val1, i16 %val2
@@ -196,6 +308,12 @@ entry:
}
define i16 @icmp_ult_immed03_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ult_immed03_i16:
+; CHECK: ceqhi
+; CHECK: clgthi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i16 %arg1, 65024
%B = select i1 %A, i16 %val1, i16 %val2
@@ -203,6 +321,13 @@ entry:
}
define i16 @icmp_ult_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ult_immed04_i16:
+; CHECK: ilh
+; CHECK: ceqh
+; CHECK: clgth
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i16 %arg1, 32769
%B = select i1 %A, i16 %val1, i16 %val2
@@ -210,6 +335,10 @@ entry:
}
define i16 @icmp_ule_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ule_select_i16:
+; CHECK: clgth
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp ule i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -217,6 +346,13 @@ entry:
}
define i1 @icmp_ule_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_ule_setcc_i16:
+; CHECK: clgth
+; CHECK: ilhu
+; CHECK: xorhi
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ule i16 %arg1, %arg2
ret i1 %A
@@ -229,6 +365,10 @@ entry:
;; they'll ever be generated.
define i16 @icmp_sgt_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sgt_select_i16:
+; CHECK: cgth
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp sgt i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -236,12 +376,22 @@ entry:
}
define i1 @icmp_sgt_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sgt_setcc_i16:
+; CHECK: ilhu
+; CHECK: cgth
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp sgt i16 %arg1, %arg2
ret i1 %A
}
define i16 @icmp_sgt_immed01_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sgt_immed01_i16:
+; CHECK: cgthi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i16 %arg1, 511
%B = select i1 %A, i16 %val1, i16 %val2
@@ -249,6 +399,10 @@ entry:
}
define i16 @icmp_sgt_immed02_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sgt_immed02_i16:
+; CHECK: cgthi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i16 %arg1, -1
%B = select i1 %A, i16 %val1, i16 %val2
@@ -256,6 +410,10 @@ entry:
}
define i16 @icmp_sgt_immed03_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sgt_immed03_i16:
+; CHECK: cgthi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i16 %arg1, -512
%B = select i1 %A, i16 %val1, i16 %val2
@@ -263,6 +421,11 @@ entry:
}
define i16 @icmp_sgt_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sgt_immed04_i16:
+; CHECK: ilh
+; CHECK: ceqh
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp sgt i16 %arg1, 32768
%B = select i1 %A, i16 %val1, i16 %val2
@@ -270,6 +433,12 @@ entry:
}
define i16 @icmp_sge_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sge_select_i16:
+; CHECK: ceqh
+; CHECK: cgth
+; CHECK: or
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp sge i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -277,6 +446,14 @@ entry:
}
define i1 @icmp_sge_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sge_setcc_i16:
+; CHECK: ceqh
+; CHECK: cgth
+; CHECK: ilhu
+; CHECK: or
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp sge i16 %arg1, %arg2
ret i1 %A
@@ -289,6 +466,12 @@ entry:
;; they'll ever be generated.
define i16 @icmp_slt_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_slt_select_i16:
+; CHECK: ceqh
+; CHECK: cgth
+; CHECK: nor
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp slt i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -296,12 +479,26 @@ entry:
}
define i1 @icmp_slt_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_slt_setcc_i16:
+; CHECK: ceqh
+; CHECK: cgth
+; CHECK: ilhu
+; CHECK: nor
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp slt i16 %arg1, %arg2
ret i1 %A
}
define i16 @icmp_slt_immed01_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_slt_immed01_i16:
+; CHECK: ceqhi
+; CHECK: cgthi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i16 %arg1, 511
%B = select i1 %A, i16 %val1, i16 %val2
@@ -309,6 +506,12 @@ entry:
}
define i16 @icmp_slt_immed02_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_slt_immed02_i16:
+; CHECK: ceqhi
+; CHECK: cgthi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i16 %arg1, -512
%B = select i1 %A, i16 %val1, i16 %val2
@@ -316,6 +519,12 @@ entry:
}
define i16 @icmp_slt_immed03_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_slt_immed03_i16:
+; CHECK: ceqhi
+; CHECK: cgthi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i16 %arg1, -1
%B = select i1 %A, i16 %val1, i16 %val2
@@ -323,6 +532,10 @@ entry:
}
define i16 @icmp_slt_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_slt_immed04_i16:
+; CHECK: lr
+; CHECK-NEXT: bi
+
entry:
%A = icmp slt i16 %arg1, 32768
%B = select i1 %A, i16 %val1, i16 %val2
@@ -330,6 +543,10 @@ entry:
}
define i16 @icmp_sle_select_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sle_select_i16:
+; CHECK: cgth
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp sle i16 %arg1, %arg2
%B = select i1 %A, i16 %val1, i16 %val2
@@ -337,6 +554,13 @@ entry:
}
define i1 @icmp_sle_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwind {
+; CHECK: icmp_sle_setcc_i16:
+; CHECK: cgth
+; CHECK: ilhu
+; CHECK: xorhi
+; CHECK: iohl
+; CHECK-NEXT: bi
+
entry:
%A = icmp sle i16 %arg1, %arg2
ret i1 %A
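The icmp16.ll rewrite above illustrates the conversion pattern used across these CellSPU tests: whole-file grep|count totals are replaced by per-function FileCheck labels, so a failure now points at the exact function whose codegen changed. A minimal sketch of the new idiom (hypothetical test, mirroring icmp_eq_select_i16 above):

; RUN: llc < %s -march=cellspu | FileCheck %s
define i16 @eq_select_sketch(i16 %a, i16 %b) nounwind {
; CHECK: eq_select_sketch:
; CHECK: ceqh
; CHECK: selb
entry:
  %c = icmp eq i16 %a, %b
  %r = select i1 %c, i16 %a, i16 %b
  ret i16 %r
}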
diff --git a/test/CodeGen/CellSPU/icmp32.ll b/test/CodeGen/CellSPU/icmp32.ll
index ccbb5f7..1794f4c 100644
--- a/test/CodeGen/CellSPU/icmp32.ll
+++ b/test/CodeGen/CellSPU/icmp32.ll
@@ -1,14 +1,4 @@
-; RUN: llc < %s -march=cellspu > %t1.s
-; RUN: grep ila %t1.s | count 6
-; RUN: grep ceq %t1.s | count 28
-; RUN: grep ceqi %t1.s | count 12
-; RUN: grep clgt %t1.s | count 16
-; RUN: grep clgti %t1.s | count 6
-; RUN: grep cgt %t1.s | count 16
-; RUN: grep cgti %t1.s | count 6
-; RUN: grep {selb\t\\\$3, \\\$6, \\\$5, \\\$3} %t1.s | count 7
-; RUN: grep {selb\t\\\$3, \\\$5, \\\$6, \\\$3} %t1.s | count 3
-; RUN: grep {selb\t\\\$3, \\\$5, \\\$4, \\\$3} %t1.s | count 20
+; RUN: llc < %s -march=cellspu | FileCheck %s
target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
target triple = "spu"
@@ -27,6 +17,10 @@ target triple = "spu"
; i32 integer comparisons:
define i32 @icmp_eq_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_eq_select_i32:
+; CHECK: ceq
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp eq i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -34,12 +28,22 @@ entry:
}
define i1 @icmp_eq_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_eq_setcc_i32:
+; CHECK: ilhu
+; CHECK: ceq
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp eq i32 %arg1, %arg2
ret i1 %A
}
define i32 @icmp_eq_immed01_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_eq_immed01_i32:
+; CHECK: ceqi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i32 %arg1, 511
%B = select i1 %A, i32 %val1, i32 %val2
@@ -47,6 +51,10 @@ entry:
}
define i32 @icmp_eq_immed02_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_eq_immed02_i32:
+; CHECK: ceqi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i32 %arg1, -512
%B = select i1 %A, i32 %val1, i32 %val2
@@ -54,6 +62,10 @@ entry:
}
define i32 @icmp_eq_immed03_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_eq_immed03_i32:
+; CHECK: ceqi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i32 %arg1, -1
%B = select i1 %A, i32 %val1, i32 %val2
@@ -61,6 +73,11 @@ entry:
}
define i32 @icmp_eq_immed04_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_eq_immed04_i32:
+; CHECK: ila
+; CHECK: ceq
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i32 %arg1, 32768
%B = select i1 %A, i32 %val1, i32 %val2
@@ -68,6 +85,10 @@ entry:
}
define i32 @icmp_ne_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ne_select_i32:
+; CHECK: ceq
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp ne i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -75,12 +96,23 @@ entry:
}
define i1 @icmp_ne_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ne_setcc_i32:
+; CHECK: ceq
+; CHECK: ilhu
+; CHECK: xori
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ne i32 %arg1, %arg2
ret i1 %A
}
define i32 @icmp_ne_immed01_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ne_immed01_i32:
+; CHECK: ceqi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i32 %arg1, 511
%B = select i1 %A, i32 %val1, i32 %val2
@@ -88,6 +120,10 @@ entry:
}
define i32 @icmp_ne_immed02_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ne_immed02_i32:
+; CHECK: ceqi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i32 %arg1, -512
%B = select i1 %A, i32 %val1, i32 %val2
@@ -95,6 +131,10 @@ entry:
}
define i32 @icmp_ne_immed03_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ne_immed03_i32:
+; CHECK: ceqi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i32 %arg1, -1
%B = select i1 %A, i32 %val1, i32 %val2
@@ -102,6 +142,11 @@ entry:
}
define i32 @icmp_ne_immed04_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ne_immed04_i32:
+; CHECK: ila
+; CHECK: ceq
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i32 %arg1, 32768
%B = select i1 %A, i32 %val1, i32 %val2
@@ -109,6 +154,10 @@ entry:
}
define i32 @icmp_ugt_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ugt_select_i32:
+; CHECK: clgt
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp ugt i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -116,12 +165,22 @@ entry:
}
define i1 @icmp_ugt_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ugt_setcc_i32:
+; CHECK: ilhu
+; CHECK: clgt
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ugt i32 %arg1, %arg2
ret i1 %A
}
define i32 @icmp_ugt_immed01_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ugt_immed01_i32:
+; CHECK: clgti
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i32 %arg1, 511
%B = select i1 %A, i32 %val1, i32 %val2
@@ -129,6 +188,10 @@ entry:
}
define i32 @icmp_ugt_immed02_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ugt_immed02_i32:
+; CHECK: clgti
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i32 %arg1, 4294966784
%B = select i1 %A, i32 %val1, i32 %val2
@@ -136,6 +199,10 @@ entry:
}
define i32 @icmp_ugt_immed03_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ugt_immed03_i32:
+; CHECK: clgti
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i32 %arg1, 4294967293
%B = select i1 %A, i32 %val1, i32 %val2
@@ -143,6 +210,11 @@ entry:
}
define i32 @icmp_ugt_immed04_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ugt_immed04_i32:
+; CHECK: ila
+; CHECK: clgt
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i32 %arg1, 32768
%B = select i1 %A, i32 %val1, i32 %val2
@@ -150,6 +222,12 @@ entry:
}
define i32 @icmp_uge_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_uge_select_i32:
+; CHECK: ceq
+; CHECK: clgt
+; CHECK: or
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp uge i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -157,6 +235,14 @@ entry:
}
define i1 @icmp_uge_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_uge_setcc_i32:
+; CHECK: ceq
+; CHECK: clgt
+; CHECK: ilhu
+; CHECK: or
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp uge i32 %arg1, %arg2
ret i1 %A
@@ -169,6 +255,12 @@ entry:
;; they'll ever be generated.
define i32 @icmp_ult_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ult_select_i32:
+; CHECK: ceq
+; CHECK: clgt
+; CHECK: nor
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp ult i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -176,12 +268,26 @@ entry:
}
define i1 @icmp_ult_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ult_setcc_i32:
+; CHECK: ceq
+; CHECK: clgt
+; CHECK: ilhu
+; CHECK: nor
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ult i32 %arg1, %arg2
ret i1 %A
}
define i32 @icmp_ult_immed01_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ult_immed01_i32:
+; CHECK: ceqi
+; CHECK: clgti
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i32 %arg1, 511
%B = select i1 %A, i32 %val1, i32 %val2
@@ -189,6 +295,12 @@ entry:
}
define i32 @icmp_ult_immed02_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ult_immed02_i32:
+; CHECK: ceqi
+; CHECK: clgti
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i32 %arg1, 4294966784
%B = select i1 %A, i32 %val1, i32 %val2
@@ -196,6 +308,12 @@ entry:
}
define i32 @icmp_ult_immed03_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ult_immed03_i32:
+; CHECK: ceqi
+; CHECK: clgti
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i32 %arg1, 4294967293
%B = select i1 %A, i32 %val1, i32 %val2
@@ -203,6 +321,11 @@ entry:
}
define i32 @icmp_ult_immed04_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ult_immed04_i32:
+; CHECK: rotmi
+; CHECK: ceqi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i32 %arg1, 32768
%B = select i1 %A, i32 %val1, i32 %val2
@@ -210,6 +333,10 @@ entry:
}
define i32 @icmp_ule_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ule_select_i32:
+; CHECK: clgt
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp ule i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -217,6 +344,13 @@ entry:
}
define i1 @icmp_ule_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_ule_setcc_i32:
+; CHECK: clgt
+; CHECK: ilhu
+; CHECK: xori
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp ule i32 %arg1, %arg2
ret i1 %A
@@ -229,6 +363,10 @@ entry:
;; they'll ever be generated.
define i32 @icmp_sgt_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sgt_select_i32:
+; CHECK: cgt
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp sgt i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -236,12 +374,22 @@ entry:
}
define i1 @icmp_sgt_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sgt_setcc_i32:
+; CHECK: ilhu
+; CHECK: cgt
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp sgt i32 %arg1, %arg2
ret i1 %A
}
define i32 @icmp_sgt_immed01_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sgt_immed01_i32:
+; CHECK: cgti
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i32 %arg1, 511
%B = select i1 %A, i32 %val1, i32 %val2
@@ -249,6 +397,10 @@ entry:
}
define i32 @icmp_sgt_immed02_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sgt_immed02_i32:
+; CHECK: cgti
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i32 %arg1, 4294966784
%B = select i1 %A, i32 %val1, i32 %val2
@@ -256,6 +408,10 @@ entry:
}
define i32 @icmp_sgt_immed03_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sgt_immed03_i32:
+; CHECK: cgti
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i32 %arg1, 4294967293
%B = select i1 %A, i32 %val1, i32 %val2
@@ -263,6 +419,11 @@ entry:
}
define i32 @icmp_sgt_immed04_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sgt_immed04_i32:
+; CHECK: ila
+; CHECK: cgt
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i32 %arg1, 32768
%B = select i1 %A, i32 %val1, i32 %val2
@@ -270,6 +431,12 @@ entry:
}
define i32 @icmp_sge_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sge_select_i32:
+; CHECK: ceq
+; CHECK: cgt
+; CHECK: or
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp sge i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -277,6 +444,14 @@ entry:
}
define i1 @icmp_sge_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sge_setcc_i32:
+; CHECK: ceq
+; CHECK: cgt
+; CHECK: ilhu
+; CHECK: or
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp sge i32 %arg1, %arg2
ret i1 %A
@@ -289,6 +464,12 @@ entry:
;; they'll ever be generated.
define i32 @icmp_slt_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_slt_select_i32:
+; CHECK: ceq
+; CHECK: cgt
+; CHECK: nor
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp slt i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -296,12 +477,26 @@ entry:
}
define i1 @icmp_slt_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_slt_setcc_i32:
+; CHECK: ceq
+; CHECK: cgt
+; CHECK: ilhu
+; CHECK: nor
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp slt i32 %arg1, %arg2
ret i1 %A
}
define i32 @icmp_slt_immed01_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_slt_immed01_i32:
+; CHECK: ceqi
+; CHECK: cgti
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i32 %arg1, 511
%B = select i1 %A, i32 %val1, i32 %val2
@@ -309,6 +504,12 @@ entry:
}
define i32 @icmp_slt_immed02_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_slt_immed02_i32:
+; CHECK: ceqi
+; CHECK: cgti
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i32 %arg1, -512
%B = select i1 %A, i32 %val1, i32 %val2
@@ -316,6 +517,12 @@ entry:
}
define i32 @icmp_slt_immed03_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_slt_immed03_i32:
+; CHECK: ceqi
+; CHECK: cgti
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i32 %arg1, -1
%B = select i1 %A, i32 %val1, i32 %val2
@@ -323,6 +530,13 @@ entry:
}
define i32 @icmp_slt_immed04_i32(i32 %arg1, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_slt_immed04_i32:
+; CHECK: ila
+; CHECK: ceq
+; CHECK: cgt
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i32 %arg1, 32768
%B = select i1 %A, i32 %val1, i32 %val2
@@ -330,6 +544,10 @@ entry:
}
define i32 @icmp_sle_select_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sle_select_i32:
+; CHECK: cgt
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp sle i32 %arg1, %arg2
%B = select i1 %A, i32 %val1, i32 %val2
@@ -337,6 +555,13 @@ entry:
}
define i1 @icmp_sle_setcc_i32(i32 %arg1, i32 %arg2, i32 %val1, i32 %val2) nounwind {
+; CHECK: icmp_sle_setcc_i32:
+; CHECK: cgt
+; CHECK: ilhu
+; CHECK: xori
+; CHECK: iohl
+; CHECK: shufb
+
entry:
%A = icmp sle i32 %arg1, %arg2
ret i1 %A
diff --git a/test/CodeGen/CellSPU/icmp8.ll b/test/CodeGen/CellSPU/icmp8.ll
index 5517d10..1db641e 100644
--- a/test/CodeGen/CellSPU/icmp8.ll
+++ b/test/CodeGen/CellSPU/icmp8.ll
@@ -1,13 +1,4 @@
-; RUN: llc < %s -march=cellspu > %t1.s
-; RUN: grep ceqb %t1.s | count 24
-; RUN: grep ceqbi %t1.s | count 12
-; RUN: grep clgtb %t1.s | count 11
-; RUN: grep cgtb %t1.s | count 13
-; RUN: grep cgtbi %t1.s | count 5
-; RUN: grep {selb\t\\\$3, \\\$6, \\\$5, \\\$3} %t1.s | count 7
-; RUN: grep {selb\t\\\$3, \\\$5, \\\$6, \\\$3} %t1.s | count 3
-; RUN: grep {selb\t\\\$3, \\\$5, \\\$4, \\\$3} %t1.s | count 11
-; RUN: grep {selb\t\\\$3, \\\$4, \\\$5, \\\$3} %t1.s | count 4
+; RUN: llc < %s -march=cellspu | FileCheck %s
target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
target triple = "spu"
@@ -26,6 +17,10 @@ target triple = "spu"
; i8 integer comparisons:
define i8 @icmp_eq_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_eq_select_i8:
+; CHECK: ceqb
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp eq i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -33,12 +28,20 @@ entry:
}
define i1 @icmp_eq_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_eq_setcc_i8:
+; CHECK: ceqb
+; CHECK-NEXT: bi
+
entry:
%A = icmp eq i8 %arg1, %arg2
ret i1 %A
}
define i8 @icmp_eq_immed01_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_eq_immed01_i8:
+; CHECK: ceqbi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i8 %arg1, 127
%B = select i1 %A, i8 %val1, i8 %val2
@@ -46,6 +49,10 @@ entry:
}
define i8 @icmp_eq_immed02_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_eq_immed02_i8:
+; CHECK: ceqbi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i8 %arg1, -128
%B = select i1 %A, i8 %val1, i8 %val2
@@ -53,6 +60,10 @@ entry:
}
define i8 @icmp_eq_immed03_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_eq_immed03_i8:
+; CHECK: ceqbi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp eq i8 %arg1, -1
%B = select i1 %A, i8 %val1, i8 %val2
@@ -60,6 +71,10 @@ entry:
}
define i8 @icmp_ne_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ne_select_i8:
+; CHECK: ceqb
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp ne i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -67,12 +82,21 @@ entry:
}
define i1 @icmp_ne_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ne_setcc_i8:
+; CHECK: ceqb
+; CHECK: xorbi
+; CHECK-NEXT: bi
+
entry:
%A = icmp ne i8 %arg1, %arg2
ret i1 %A
}
define i8 @icmp_ne_immed01_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ne_immed01_i8:
+; CHECK: ceqbi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i8 %arg1, 127
%B = select i1 %A, i8 %val1, i8 %val2
@@ -80,6 +104,10 @@ entry:
}
define i8 @icmp_ne_immed02_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ne_immed02_i8:
+; CHECK: ceqbi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i8 %arg1, -128
%B = select i1 %A, i8 %val1, i8 %val2
@@ -87,6 +115,10 @@ entry:
}
define i8 @icmp_ne_immed03_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ne_immed03_i8:
+; CHECK: ceqbi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp ne i8 %arg1, -1
%B = select i1 %A, i8 %val1, i8 %val2
@@ -94,6 +126,10 @@ entry:
}
define i8 @icmp_ugt_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ugt_select_i8:
+; CHECK: clgtb
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp ugt i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -101,12 +137,20 @@ entry:
}
define i1 @icmp_ugt_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ugt_setcc_i8:
+; CHECK: clgtb
+; CHECK-NEXT: bi
+
entry:
%A = icmp ugt i8 %arg1, %arg2
ret i1 %A
}
define i8 @icmp_ugt_immed01_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ugt_immed01_i8:
+; CHECK: clgtbi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ugt i8 %arg1, 126
%B = select i1 %A, i8 %val1, i8 %val2
@@ -114,6 +158,12 @@ entry:
}
define i8 @icmp_uge_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_uge_select_i8:
+; CHECK: ceqb
+; CHECK: clgtb
+; CHECK: or
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp uge i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -121,6 +171,12 @@ entry:
}
define i1 @icmp_uge_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_uge_setcc_i8:
+; CHECK: ceqb
+; CHECK: clgtb
+; CHECK: or
+; CHECK-NEXT: bi
+
entry:
%A = icmp uge i8 %arg1, %arg2
ret i1 %A
@@ -133,6 +189,12 @@ entry:
;; they'll ever be generated.
define i8 @icmp_ult_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ult_select_i8:
+; CHECK: ceqb
+; CHECK: clgtb
+; CHECK: nor
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp ult i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -140,12 +202,24 @@ entry:
}
define i1 @icmp_ult_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ult_setcc_i8:
+; CHECK: ceqb
+; CHECK: clgtb
+; CHECK: nor
+; CHECK-NEXT: bi
+
entry:
%A = icmp ult i8 %arg1, %arg2
ret i1 %A
}
define i8 @icmp_ult_immed01_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ult_immed01_i8:
+; CHECK: ceqbi
+; CHECK: clgtbi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i8 %arg1, 253
%B = select i1 %A, i8 %val1, i8 %val2
@@ -153,6 +227,12 @@ entry:
}
define i8 @icmp_ult_immed02_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ult_immed02_i8:
+; CHECK: ceqbi
+; CHECK: clgtbi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp ult i8 %arg1, 129
%B = select i1 %A, i8 %val1, i8 %val2
@@ -160,6 +240,10 @@ entry:
}
define i8 @icmp_ule_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ule_select_i8:
+; CHECK: clgtb
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp ule i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -167,6 +251,11 @@ entry:
}
define i1 @icmp_ule_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_ule_setcc_i8:
+; CHECK: clgtb
+; CHECK: xorbi
+; CHECK-NEXT: bi
+
entry:
%A = icmp ule i8 %arg1, %arg2
ret i1 %A
@@ -179,6 +268,10 @@ entry:
;; they'll ever be generated.
define i8 @icmp_sgt_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sgt_select_i8:
+; CHECK: cgtb
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp sgt i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -186,12 +279,20 @@ entry:
}
define i1 @icmp_sgt_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sgt_setcc_i8:
+; CHECK: cgtb
+; CHECK-NEXT: bi
+
entry:
%A = icmp sgt i8 %arg1, %arg2
ret i1 %A
}
define i8 @icmp_sgt_immed01_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sgt_immed01_i8:
+; CHECK: cgtbi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i8 %arg1, 96
%B = select i1 %A, i8 %val1, i8 %val2
@@ -199,6 +300,10 @@ entry:
}
define i8 @icmp_sgt_immed02_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sgt_immed02_i8:
+; CHECK: cgtbi
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp sgt i8 %arg1, -1
%B = select i1 %A, i8 %val1, i8 %val2
@@ -206,6 +311,10 @@ entry:
}
define i8 @icmp_sgt_immed03_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sgt_immed03_i8:
+; CHECK: ceqbi
+; CHECK: selb $3, $4, $5, $3
+
entry:
%A = icmp sgt i8 %arg1, -128
%B = select i1 %A, i8 %val1, i8 %val2
@@ -213,6 +322,12 @@ entry:
}
define i8 @icmp_sge_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sge_select_i8:
+; CHECK: ceqb
+; CHECK: cgtb
+; CHECK: or
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp sge i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -220,6 +335,12 @@ entry:
}
define i1 @icmp_sge_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sge_setcc_i8:
+; CHECK: ceqb
+; CHECK: cgtb
+; CHECK: or
+; CHECK-NEXT: bi
+
entry:
%A = icmp sge i8 %arg1, %arg2
ret i1 %A
@@ -232,6 +353,12 @@ entry:
;; they'll ever be generated.
define i8 @icmp_slt_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_slt_select_i8:
+; CHECK: ceqb
+; CHECK: cgtb
+; CHECK: nor
+; CHECK: selb $3, $6, $5, $3
+
entry:
%A = icmp slt i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -239,12 +366,24 @@ entry:
}
define i1 @icmp_slt_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_slt_setcc_i8:
+; CHECK: ceqb
+; CHECK: cgtb
+; CHECK: nor
+; CHECK-NEXT: bi
+
entry:
%A = icmp slt i8 %arg1, %arg2
ret i1 %A
}
define i8 @icmp_slt_immed01_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_slt_immed01_i8:
+; CHECK: ceqbi
+; CHECK: cgtbi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i8 %arg1, 96
%B = select i1 %A, i8 %val1, i8 %val2
@@ -252,6 +391,12 @@ entry:
}
define i8 @icmp_slt_immed02_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_slt_immed02_i8:
+; CHECK: ceqbi
+; CHECK: cgtbi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i8 %arg1, -120
%B = select i1 %A, i8 %val1, i8 %val2
@@ -259,6 +404,12 @@ entry:
}
define i8 @icmp_slt_immed03_i8(i8 %arg1, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_slt_immed03_i8:
+; CHECK: ceqbi
+; CHECK: cgtbi
+; CHECK: nor
+; CHECK: selb $3, $5, $4, $3
+
entry:
%A = icmp slt i8 %arg1, -1
%B = select i1 %A, i8 %val1, i8 %val2
@@ -266,6 +417,10 @@ entry:
}
define i8 @icmp_sle_select_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sle_select_i8:
+; CHECK: cgtb
+; CHECK: selb $3, $5, $6, $3
+
entry:
%A = icmp sle i8 %arg1, %arg2
%B = select i1 %A, i8 %val1, i8 %val2
@@ -273,6 +428,11 @@ entry:
}
define i1 @icmp_sle_setcc_i8(i8 %arg1, i8 %arg2, i8 %val1, i8 %val2) nounwind {
+; CHECK: icmp_sle_setcc_i8:
+; CHECK: cgtb
+; CHECK: xorbi
+; CHECK-NEXT: bi
+
entry:
%A = icmp sle i8 %arg1, %arg2
ret i1 %A
diff --git a/test/CodeGen/CellSPU/shift_ops.ll b/test/CodeGen/CellSPU/shift_ops.ll
index f4aad44..1ccc356 100644
--- a/test/CodeGen/CellSPU/shift_ops.ll
+++ b/test/CodeGen/CellSPU/shift_ops.ll
@@ -1,20 +1,20 @@
; RUN: llc < %s -march=cellspu > %t1.s
-; RUN: grep {shlh } %t1.s | count 10
-; RUN: grep {shlhi } %t1.s | count 3
-; RUN: grep {shl } %t1.s | count 10
-; RUN: grep {shli } %t1.s | count 3
-; RUN: grep {xshw } %t1.s | count 5
-; RUN: grep {and } %t1.s | count 15
-; RUN: grep {andi } %t1.s | count 4
-; RUN: grep {rotmi } %t1.s | count 4
-; RUN: grep {rotqmbyi } %t1.s | count 1
-; RUN: grep {rotqmbii } %t1.s | count 2
-; RUN: grep {rotqmby } %t1.s | count 1
-; RUN: grep {rotqmbi } %t1.s | count 2
-; RUN: grep {rotqbyi } %t1.s | count 1
-; RUN: grep {rotqbii } %t1.s | count 2
-; RUN: grep {rotqbybi } %t1.s | count 1
-; RUN: grep {sfi } %t1.s | count 6
+; RUN: grep "shlh " %t1.s | count 10
+; RUN: grep "shlhi " %t1.s | count 3
+; RUN: grep "shl " %t1.s | count 10
+; RUN: grep "shli " %t1.s | count 3
+; RUN: grep "xshw " %t1.s | count 5
+; RUN: grep "and " %t1.s | count 15
+; RUN: grep "andi " %t1.s | count 4
+; RUN: grep "rotmi " %t1.s | count 4
+; RUN: grep "rotqmbyi " %t1.s | count 1
+; RUN: grep "rotqmbii " %t1.s | count 2
+; RUN: grep "rotqmby " %t1.s | count 1
+; RUN: grep "rotqmbi " %t1.s | count 2
+; RUN: grep "rotqbyi " %t1.s | count 1
+; RUN: grep "rotqbii " %t1.s | count 2
+; RUN: grep "rotqbybi " %t1.s | count 1
+; RUN: grep "sfi " %t1.s | count 6
; RUN: cat %t1.s | FileCheck %s
target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
diff --git a/test/CodeGen/CellSPU/stores.ll b/test/CodeGen/CellSPU/stores.ll
index 6ca5b08..43f8776 100644
--- a/test/CodeGen/CellSPU/stores.ll
+++ b/test/CodeGen/CellSPU/stores.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=cellspu > %t1.s
-; RUN: grep {stqd.*0(\$3)} %t1.s | count 4
-; RUN: grep {stqd.*16(\$3)} %t1.s | count 4
+; RUN: grep 'stqd.*0($3)' %t1.s | count 4
+; RUN: grep 'stqd.*16($3)' %t1.s | count 4
; RUN: grep 16256 %t1.s | count 2
; RUN: grep 16384 %t1.s | count 1
; RUN: grep 771 %t1.s | count 4
@@ -8,7 +8,7 @@
; RUN: grep 1799 %t1.s | count 2
; RUN: grep 1543 %t1.s | count 5
; RUN: grep 1029 %t1.s | count 3
-; RUN: grep {shli.*, 4} %t1.s | count 4
+; RUN: grep 'shli.*, 4' %t1.s | count 4
; RUN: grep stqx %t1.s | count 4
; RUN: grep ilhu %t1.s | count 11
; RUN: grep iohl %t1.s | count 8
diff --git a/test/CodeGen/CellSPU/trunc.ll b/test/CodeGen/CellSPU/trunc.ll
index d161852..e4c8fb4 100644
--- a/test/CodeGen/CellSPU/trunc.ll
+++ b/test/CodeGen/CellSPU/trunc.ll
@@ -1,19 +1,19 @@
; RUN: llc < %s -march=cellspu > %t1.s
; RUN: grep shufb %t1.s | count 19
-; RUN: grep {ilhu.*1799} %t1.s | count 1
-; RUN: grep {ilhu.*771} %t1.s | count 2
-; RUN: grep {ilhu.*1543} %t1.s | count 1
-; RUN: grep {ilhu.*1029} %t1.s | count 1
-; RUN: grep {ilhu.*515} %t1.s | count 1
-; RUN: grep {ilhu.*3855} %t1.s | count 1
-; RUN: grep {ilhu.*3599} %t1.s | count 1
-; RUN: grep {ilhu.*3085} %t1.s | count 1
-; RUN: grep {iohl.*3855} %t1.s | count 1
-; RUN: grep {iohl.*3599} %t1.s | count 2
-; RUN: grep {iohl.*1543} %t1.s | count 2
-; RUN: grep {iohl.*771} %t1.s | count 2
-; RUN: grep {iohl.*515} %t1.s | count 1
-; RUN: grep {iohl.*1799} %t1.s | count 1
+; RUN: grep "ilhu.*1799" %t1.s | count 1
+; RUN: grep "ilhu.*771" %t1.s | count 2
+; RUN: grep "ilhu.*1543" %t1.s | count 1
+; RUN: grep "ilhu.*1029" %t1.s | count 1
+; RUN: grep "ilhu.*515" %t1.s | count 1
+; RUN: grep "ilhu.*3855" %t1.s | count 1
+; RUN: grep "ilhu.*3599" %t1.s | count 1
+; RUN: grep "ilhu.*3085" %t1.s | count 1
+; RUN: grep "iohl.*3855" %t1.s | count 1
+; RUN: grep "iohl.*3599" %t1.s | count 2
+; RUN: grep "iohl.*1543" %t1.s | count 2
+; RUN: grep "iohl.*771" %t1.s | count 2
+; RUN: grep "iohl.*515" %t1.s | count 1
+; RUN: grep "iohl.*1799" %t1.s | count 1
; RUN: grep lqa %t1.s | count 1
; RUN: grep cbd %t1.s | count 4
; RUN: grep chd %t1.s | count 3
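The quoting changes in shift_ops.ll, stores.ll, and trunc.ll above follow one rule: the old Tcl-style {...} quoting inherited from the DejaGNU harness is replaced with ordinary shell quoting for lit, using single quotes where the pattern contains characters such as $ that double quotes would treat specially. Illustrative RUN lines only, not taken from the patch:

; RUN: llc < %s -march=cellspu > %t1.s
; RUN: grep "ilhu.*42" %t1.s | count 1
; RUN: grep 'stqd.*0($3)' %t1.s | count 1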
diff --git a/test/CodeGen/Generic/2006-09-02-LocalAllocCrash.ll b/test/CodeGen/Generic/2006-09-02-LocalAllocCrash.ll
index 928edc4..2dc5c16 100644
--- a/test/CodeGen/Generic/2006-09-02-LocalAllocCrash.ll
+++ b/test/CodeGen/Generic/2006-09-02-LocalAllocCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=fast
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0
%struct.CHESS_POSITION = type { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i32, i8, i8, [64 x i8], i8, i8, i8, i8, i8 }
@search = external global %struct.CHESS_POSITION ; <%struct.CHESS_POSITION*> [#uses=2]
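A note on the RUN-line change above (the same pair is added to edge-bundles-blockIDs.ll further down): with the optimizing register allocator now the default, -regalloc=fast alone appears insufficient to select the -O0 allocation path, so -optimize-regalloc=0 is added to pin it explicitly. Illustrative RUN line only, not taken from the patch:

; RUN: llc < %s -regalloc=fast -optimize-regalloc=0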
diff --git a/test/CodeGen/Generic/2009-06-03-UnreachableSplitPad.ll b/test/CodeGen/Generic/2009-06-03-UnreachableSplitPad.ll
deleted file mode 100644
index ad418f7..0000000
--- a/test/CodeGen/Generic/2009-06-03-UnreachableSplitPad.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s
-; PR4317
-
-declare i32 @b()
-
-define void @a() {
-entry:
- ret void
-
-dummy:
- invoke i32 @b() to label %reg unwind label %reg
-
-reg:
- %lpad = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
- catch i8* null
- ret void
-}
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/Generic/2012-06-08-APIntCrash.ll b/test/CodeGen/Generic/2012-06-08-APIntCrash.ll
new file mode 100644
index 0000000..2c096bf
--- /dev/null
+++ b/test/CodeGen/Generic/2012-06-08-APIntCrash.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s
+
+define void @test1(<8 x i32>* %ptr)
+{
+ %1 = load <8 x i32>* %ptr, align 32
+ %2 = and <8 x i32> %1, <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 -1>
+ store <8 x i32> %2, <8 x i32>* %ptr, align 16
+ ret void
+}
diff --git a/test/CodeGen/Generic/2012-07-15-BuildVectorPromote.ll b/test/CodeGen/Generic/2012-07-15-BuildVectorPromote.ll
new file mode 100644
index 0000000..6591c64
--- /dev/null
+++ b/test/CodeGen/Generic/2012-07-15-BuildVectorPromote.ll
@@ -0,0 +1,8 @@
+; RUN: llc -mcpu=corei7 < %s
+; We don't care about the output, just that it doesn't crash
+
+define <1 x i1> @buildvec_promote() {
+ %cmp = icmp ule <1 x i32> undef, undef
+ %sel = select i1 undef, <1 x i1> undef, <1 x i1> %cmp
+ ret <1 x i1> %sel
+}
diff --git a/test/CodeGen/Generic/asm-large-immediate.ll b/test/CodeGen/Generic/asm-large-immediate.ll
index 605665b..891bbc9 100644
--- a/test/CodeGen/Generic/asm-large-immediate.ll
+++ b/test/CodeGen/Generic/asm-large-immediate.ll
@@ -1,8 +1,10 @@
-; RUN: llc < %s | grep 68719476738
+; RUN: llc < %s | FileCheck %s
define void @test() {
entry:
+; CHECK: /* result: 68719476738 */
tail call void asm sideeffect "/* result: ${0:c} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+; CHECK: /* result: -68719476738 */
+ tail call void asm sideeffect "/* result: ${0:n} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
ret void
}
-
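The updated asm-large-immediate.ll exercises two inline-asm operand modifiers: ${0:c} prints the raw immediate and ${0:n} prints its negation, which is exactly what the two CHECK lines assert. A minimal sketch with a hypothetical function name:

define void @asm_modifier_sketch() nounwind {
entry:
  ; Emits "/* c: 12 n: -12 */" into the assembly output.
  tail call void asm sideeffect "/* c: ${0:c} n: ${0:n} */", "i,~{dirflag},~{fpsr},~{flags}"(i64 12)
  ret void
}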
diff --git a/test/CodeGen/Generic/donothing.ll b/test/CodeGen/Generic/donothing.ll
new file mode 100644
index 0000000..d6ba138
--- /dev/null
+++ b/test/CodeGen/Generic/donothing.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s | FileCheck %s
+
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_call_unexpected(i8*)
+declare void @llvm.donothing() readnone
+
+; CHECK: f1
+define void @f1() nounwind uwtable ssp {
+entry:
+; CHECK-NOT: donothing
+ invoke void @llvm.donothing()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+ ret void
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %1 = extractvalue { i8*, i32 } %0, 0
+ tail call void @__cxa_call_unexpected(i8* %1) noreturn nounwind
+ unreachable
+}
+
+; CHECK: f2
+define void @f2() nounwind {
+entry:
+; CHECK-NOT: donothing
+ call void @llvm.donothing()
+ ret void
+}
diff --git a/test/CodeGen/Generic/edge-bundles-blockIDs.ll b/test/CodeGen/Generic/edge-bundles-blockIDs.ll
index b4ae415..d86c758 100644
--- a/test/CodeGen/Generic/edge-bundles-blockIDs.ll
+++ b/test/CodeGen/Generic/edge-bundles-blockIDs.ll
@@ -1,6 +1,6 @@
 ; Make sure EdgeBundles handles the case when the function size is less than
; the number of block IDs.
-; RUN: llc -regalloc=fast < %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 < %s
define void @foo() nounwind {
entry:
diff --git a/test/CodeGen/Generic/print-after.ll b/test/CodeGen/Generic/print-after.ll
new file mode 100644
index 0000000..7505907
--- /dev/null
+++ b/test/CodeGen/Generic/print-after.ll
@@ -0,0 +1,6 @@
+; RUN: not llc --help-hidden 2>&1 | FileCheck %s
+
+; CHECK: -print-after
+; CHECK-NOT: -print-after-all
+; CHECK: =simple-register-coalescing
+; CHECK: -print-after-all
diff --git a/test/CodeGen/Generic/print-machineinstrs.ll b/test/CodeGen/Generic/print-machineinstrs.ll
new file mode 100644
index 0000000..75dceb5
--- /dev/null
+++ b/test/CodeGen/Generic/print-machineinstrs.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -O3 -debug-pass=Structure -print-machineinstrs=branch-folder -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc < %s -O3 -debug-pass=Structure -print-machineinstrs -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc < %s -O3 -debug-pass=Structure -print-machineinstrs= -o /dev/null 2>&1 | FileCheck %s
+
+define i64 @foo(i64 %a, i64 %b) nounwind {
+; CHECK: -branch-folder -print-machineinstrs
+; CHECK: Control Flow Optimizer
+; CHECK-NEXT: MachineFunction Printer
+; CHECK: Machine code for function foo:
+ %c = add i64 %a, %b
+ %d = trunc i64 %c to i32
+ %e = zext i32 %d to i64
+ ret i64 %e
+}
diff --git a/test/CodeGen/Generic/stop-after.ll b/test/CodeGen/Generic/stop-after.ll
new file mode 100644
index 0000000..557e097
--- /dev/null
+++ b/test/CodeGen/Generic/stop-after.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -debug-pass=Structure -stop-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP
+; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START
+
+; STOP: -loop-reduce -print-module
+; STOP: Loop Strength Reduction
+; STOP-NEXT: Machine Function Analysis
+
+; START: -machine-branch-prob -gc-lowering
+; START: FunctionPass Manager
+; START-NEXT: Lower Garbage Collection Instructions
diff --git a/test/CodeGen/Generic/undef-phi.ll b/test/CodeGen/Generic/undef-phi.ll
new file mode 100644
index 0000000..10899f9
--- /dev/null
+++ b/test/CodeGen/Generic/undef-phi.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -verify-machineinstrs -verify-coalescing
+;
+; This function has a PHI with one undefined input. Verify that PHIElimination
+; inserts an IMPLICIT_DEF instruction in the predecessor so all paths to the use
+; pass through a def.
+
+%struct.xx_stack = type { i32, %struct.xx_stack* }
+
+define i32 @push(%struct.xx_stack* %stack) nounwind uwtable readonly ssp {
+entry:
+ %tobool1 = icmp eq %struct.xx_stack* %stack, null
+ br i1 %tobool1, label %for.end, label %for.body
+
+for.body:
+ %stack.addr.02 = phi %struct.xx_stack* [ %0, %for.body ], [ %stack, %entry ]
+ %next = getelementptr inbounds %struct.xx_stack* %stack.addr.02, i64 0, i32 1
+ %0 = load %struct.xx_stack** %next, align 8
+ %tobool = icmp eq %struct.xx_stack* %0, null
+ br i1 %tobool, label %for.end, label %for.body
+
+for.end:
+ %top.0.lcssa = phi %struct.xx_stack* [ undef, %entry ], [ %stack.addr.02, %for.body ]
+ %first = getelementptr inbounds %struct.xx_stack* %top.0.lcssa, i64 0, i32 0
+ %1 = load i32* %first, align 4
+ ret i32 %1
+}
diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll
index 69002e0..e9ac8b6 100644
--- a/test/CodeGen/Hexagon/args.ll
+++ b/test/CodeGen/Hexagon/args.ll
@@ -1,5 +1,4 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
; CHECK: r[[T0:[0-9]+]] = #7
; CHECK: memw(r29 + #0) = r[[T0]]
; CHECK: r0 = #1
diff --git a/test/CodeGen/Hexagon/combine.ll b/test/CodeGen/Hexagon/combine.ll
index 36abd74..7219985 100644
--- a/test/CodeGen/Hexagon/combine.ll
+++ b/test/CodeGen/Hexagon/combine.ll
@@ -1,5 +1,4 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
; CHECK: combine(r{{[0-9]+}}, r{{[0-9]+}})
@j = external global i32
diff --git a/test/CodeGen/Hexagon/convertdptoint.ll b/test/CodeGen/Hexagon/convertdptoint.ll
new file mode 100644
index 0000000..fa068c4
--- /dev/null
+++ b/test/CodeGen/Hexagon/convertdptoint.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate conversion from double precision floating point
+; to 32-bit int value in IEEE compliant mode in V5.
+
+; CHECK: r{{[0-9]+}} = convert_df2w(r{{[0-9]+}}:{{[0-9]+}}):chop
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ %a = alloca double, align 8
+ %b = alloca double, align 8
+ %c = alloca double, align 8
+ store i32 0, i32* %retval
+ store double 1.540000e+01, double* %a, align 8
+ store double 9.100000e+00, double* %b, align 8
+ %0 = load double* %a, align 8
+ %1 = load double* %b, align 8
+ %add = fadd double %0, %1
+ store double %add, double* %c, align 8
+ %2 = load double* %c, align 8
+ %conv = fptosi double %2 to i32
+ store i32 %conv, i32* %i, align 4
+ %3 = load i32* %i, align 4
+ ret i32 %3
+}
diff --git a/test/CodeGen/Hexagon/convertdptoll.ll b/test/CodeGen/Hexagon/convertdptoll.ll
new file mode 100644
index 0000000..1b4dd86
--- /dev/null
+++ b/test/CodeGen/Hexagon/convertdptoll.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate conversion from double precision floating point
+; to 64-bit integer value in IEEE compliant mode in V5.
+
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = convert_df2d(r{{[0-9]+}}:{{[0-9]+}}):chop
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i64, align 8
+ %a = alloca double, align 8
+ %b = alloca double, align 8
+ %c = alloca double, align 8
+ store i32 0, i32* %retval
+ store double 1.540000e+01, double* %a, align 8
+ store double 9.100000e+00, double* %b, align 8
+ %0 = load double* %a, align 8
+ %1 = load double* %b, align 8
+ %add = fadd double %0, %1
+ store double %add, double* %c, align 8
+ %2 = load double* %c, align 8
+ %conv = fptosi double %2 to i64
+ store i64 %conv, i64* %i, align 8
+ %3 = load i64* %i, align 8
+ %conv1 = trunc i64 %3 to i32
+ ret i32 %conv1
+}
diff --git a/test/CodeGen/Hexagon/convertsptoint.ll b/test/CodeGen/Hexagon/convertsptoint.ll
new file mode 100644
index 0000000..b8a9d6c
--- /dev/null
+++ b/test/CodeGen/Hexagon/convertsptoint.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate conversion from single precision floating point
+; to 32-bit int value in IEEE compliant mode in V5.
+
+; CHECK: r{{[0-9]+}} = convert_sf2w(r{{[0-9]+}}):chop
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ %c = alloca float, align 4
+ store i32 0, i32* %retval
+ store float 0x402ECCCCC0000000, float* %a, align 4
+ store float 0x4022333340000000, float* %b, align 4
+ %0 = load float* %a, align 4
+ %1 = load float* %b, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %c, align 4
+ %2 = load float* %c, align 4
+ %conv = fptosi float %2 to i32
+ store i32 %conv, i32* %i, align 4
+ %3 = load i32* %i, align 4
+ ret i32 %3
+}
diff --git a/test/CodeGen/Hexagon/convertsptoll.ll b/test/CodeGen/Hexagon/convertsptoll.ll
new file mode 100644
index 0000000..1c4df94
--- /dev/null
+++ b/test/CodeGen/Hexagon/convertsptoll.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate conversion from single precision floating point
+; to 64-bit int value in IEEE compliant mode in V5.
+
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = convert_sf2d(r{{[0-9]+}})
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i64, align 8
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ %c = alloca float, align 4
+ store i32 0, i32* %retval
+ store float 0x402ECCCCC0000000, float* %a, align 4
+ store float 0x4022333340000000, float* %b, align 4
+ %0 = load float* %a, align 4
+ %1 = load float* %b, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %c, align 4
+ %2 = load float* %c, align 4
+ %conv = fptosi float %2 to i64
+ store i64 %conv, i64* %i, align 8
+ %3 = load i64* %i, align 8
+ %conv1 = trunc i64 %3 to i32
+ ret i32 %conv1
+}
diff --git a/test/CodeGen/Hexagon/dadd.ll b/test/CodeGen/Hexagon/dadd.ll
new file mode 100644
index 0000000..602978a
--- /dev/null
+++ b/test/CodeGen/Hexagon/dadd.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate double precision floating point add in V5.
+
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = dfadd(r{{[0-9]+}}:{{[0-9]+}}, r{{[0-9]+}}:{{[0-9]+}})
+
+
+define i32 @main() nounwind {
+entry:
+ %a = alloca double, align 8
+ %b = alloca double, align 8
+ %c = alloca double, align 8
+ store double 1.540000e+01, double* %a, align 8
+ store double 9.100000e+00, double* %b, align 8
+ %0 = load double* %a, align 8
+ %1 = load double* %b, align 8
+ %add = fadd double %0, %1
+ store double %add, double* %c, align 8
+ ret i32 0
+}
diff --git a/test/CodeGen/Hexagon/dmul.ll b/test/CodeGen/Hexagon/dmul.ll
new file mode 100644
index 0000000..d743773
--- /dev/null
+++ b/test/CodeGen/Hexagon/dmul.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate double precision floating point multiply in V5.
+
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = dfmpy(r{{[0-9]+}}:{{[0-9]+}}, r{{[0-9]+}}:{{[0-9]+}})
+
+define i32 @main() nounwind {
+entry:
+ %a = alloca double, align 8
+ %b = alloca double, align 8
+ %c = alloca double, align 8
+ store double 1.540000e+01, double* %a, align 8
+ store double 9.100000e+00, double* %b, align 8
+ %0 = load double* %b, align 8
+ %1 = load double* %a, align 8
+ %mul = fmul double %0, %1
+ store double %mul, double* %c, align 8
+ ret i32 0
+}
diff --git a/test/CodeGen/Hexagon/double.ll b/test/CodeGen/Hexagon/double.ll
index 04c2ec1..c3b6f37 100644
--- a/test/CodeGen/Hexagon/double.ll
+++ b/test/CodeGen/Hexagon/double.ll
@@ -1,5 +1,4 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
; CHECK: __hexagon_adddf3
; CHECK: __hexagon_subdf3
diff --git a/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
new file mode 100644
index 0000000..54e7ce3
--- /dev/null
+++ b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-hexagon-ieee-rnd-near < %s | FileCheck %s
+; Check that we generate conversion from double precision floating point
+; to 32-bit int value in IEEE rounding to the nearest mode in V5.
+
+; CHECK: r{{[0-9]+}} = convert_df2w(r{{[0-9]+}}:{{[0-9]+}})
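+; (No :chop suffix here: under -enable-hexagon-ieee-rnd-near the conversion
+; rounds to nearest instead of truncating.)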
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ %a = alloca double, align 8
+ %b = alloca double, align 8
+ %c = alloca double, align 8
+ store i32 0, i32* %retval
+ store double 1.540000e+01, double* %a, align 8
+ store double 9.100000e+00, double* %b, align 8
+ %0 = load double* %a, align 8
+ %1 = load double* %b, align 8
+ %add = fadd double %0, %1
+ store double %add, double* %c, align 8
+ %2 = load double* %c, align 8
+ %conv = fptosi double %2 to i32
+ store i32 %conv, i32* %i, align 4
+ %3 = load i32* %i, align 4
+ ret i32 %3
+}
diff --git a/test/CodeGen/Hexagon/dsub.ll b/test/CodeGen/Hexagon/dsub.ll
new file mode 100644
index 0000000..4f9d39e
--- /dev/null
+++ b/test/CodeGen/Hexagon/dsub.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate double precision floating point subtract in V5.
+
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = dfsub(r{{[0-9]+}}:{{[0-9]+}}, r{{[0-9]+}}:{{[0-9]+}})
+
+define i32 @main() nounwind {
+entry:
+ %a = alloca double, align 8
+ %b = alloca double, align 8
+ %c = alloca double, align 8
+ store double 1.540000e+01, double* %a, align 8
+ store double 9.100000e+00, double* %b, align 8
+ %0 = load double* %b, align 8
+ %1 = load double* %a, align 8
+ %sub = fsub double %0, %1
+ store double %sub, double* %c, align 8
+ ret i32 0
+}
diff --git a/test/CodeGen/Hexagon/dualstore.ll b/test/CodeGen/Hexagon/dualstore.ll
new file mode 100644
index 0000000..9b27dda
--- /dev/null
+++ b/test/CodeGen/Hexagon/dualstore.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate dual stores in one packet in V4
+
+; CHECK: memw(r{{[0-9]+}} + #{{[0-9]+}}) = r{{[0-9]+}}
+; CHECK-NEXT: memw(r{{[0-9]+}} + #{{[0-9]+}}) = r{{[0-9]+}}
+; CHECK-NEXT: }
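+; (Hexagon assembly groups instructions into VLIW packets delimited by
+; braces; the CHECK-NEXT lines require both stores to be issued in the
+; same packet, which the trailing "}" closes.)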
+
+@Reg = global i32 0, align 4
+define i32 @main() nounwind {
+entry:
+ %number= alloca i32, align 4
+ store i32 500000, i32* %number, align 4
+ %number1= alloca i32, align 4
+ store i32 100000, i32* %number1, align 4
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Hexagon/fadd.ll b/test/CodeGen/Hexagon/fadd.ll
new file mode 100644
index 0000000..b95e147
--- /dev/null
+++ b/test/CodeGen/Hexagon/fadd.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate single precision floating point add in V5.
+
+; CHECK: r{{[0-9]+}} = sfadd(r{{[0-9]+}}, r{{[0-9]+}})
+
+define i32 @main() nounwind {
+entry:
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ %c = alloca float, align 4
+ store float 0x402ECCCCC0000000, float* %a, align 4
+ store float 0x4022333340000000, float* %b, align 4
+ %0 = load float* %a, align 4
+ %1 = load float* %b, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %c, align 4
+ ret i32 0
+}
diff --git a/test/CodeGen/Hexagon/fcmp.ll b/test/CodeGen/Hexagon/fcmp.ll
new file mode 100644
index 0000000..e7b649e
--- /dev/null
+++ b/test/CodeGen/Hexagon/fcmp.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate floating point compare in V5
+
+; CHECK: p{{[0-2]+}} = sfcmp.{{.}}
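+; (The fcmp ogt below lowers to a predicate-setting sfcmp; the {{.}}
+; wildcard accepts whichever sfcmp variant the backend selects.)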
+
+define i32 @foo(float %y) nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %y.addr = alloca float, align 4
+ store float %y, float* %y.addr, align 4
+ %0 = load float* %y.addr, align 4
+ %cmp = fcmp ogt float %0, 0x406AD7EFA0000000
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32* %retval
+ br label %return
+
+if.else: ; preds = %entry
+ store i32 2, i32* %retval
+ br label %return
+
+return: ; preds = %if.else, %if.then
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %a = alloca float, align 4
+ store i32 0, i32* %retval
+ store float 0x40012E0A00000000, float* %a, align 4
+ %0 = load float* %a, align 4
+ %call = call i32 @foo(float %0)
+ ret i32 %call
+}
diff --git a/test/CodeGen/Hexagon/float.ll b/test/CodeGen/Hexagon/float.ll
index 51acf2e..bec9f58 100644
--- a/test/CodeGen/Hexagon/float.ll
+++ b/test/CodeGen/Hexagon/float.ll
@@ -1,5 +1,4 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
; CHECK: __hexagon_addsf3
; CHECK: __hexagon_subsf3
diff --git a/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll b/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
new file mode 100644
index 0000000..bec9f58
--- /dev/null
+++ b/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; CHECK: __hexagon_addsf3
+; CHECK: __hexagon_subsf3
+
+define void @foo(float* %acc, float %num, float %num2) nounwind {
+entry:
+ %acc.addr = alloca float*, align 4
+ %num.addr = alloca float, align 4
+ %num2.addr = alloca float, align 4
+ store float* %acc, float** %acc.addr, align 4
+ store float %num, float* %num.addr, align 4
+ store float %num2, float* %num2.addr, align 4
+ %0 = load float** %acc.addr, align 4
+ %1 = load float* %0
+ %2 = load float* %num.addr, align 4
+ %add = fadd float %1, %2
+ %3 = load float* %num2.addr, align 4
+ %sub = fsub float %add, %3
+ %4 = load float** %acc.addr, align 4
+ store float %sub, float* %4
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/fmul.ll b/test/CodeGen/Hexagon/fmul.ll
new file mode 100644
index 0000000..4766845
--- /dev/null
+++ b/test/CodeGen/Hexagon/fmul.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate single precision floating point multiply in V5.
+
+; CHECK: r{{[0-9]+}} = sfmpy(r{{[0-9]+}}, r{{[0-9]+}})
+
+
+define i32 @main() nounwind {
+entry:
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ %c = alloca float, align 4
+ store float 0x402ECCCCC0000000, float* %a, align 4
+ store float 0x4022333340000000, float* %b, align 4
+ %0 = load float* %b, align 4
+ %1 = load float* %a, align 4
+ %mul = fmul float %0, %1
+ store float %mul, float* %c, align 4
+ ret i32 0
+}
diff --git a/test/CodeGen/Hexagon/frame.ll b/test/CodeGen/Hexagon/frame.ll
index c0a9fda..dc87c73 100644
--- a/test/CodeGen/Hexagon/frame.ll
+++ b/test/CodeGen/Hexagon/frame.ll
@@ -1,5 +1,4 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
@num = external global i32
@acc = external global i32
diff --git a/test/CodeGen/Hexagon/fsub.ll b/test/CodeGen/Hexagon/fsub.ll
new file mode 100644
index 0000000..07c866f
--- /dev/null
+++ b/test/CodeGen/Hexagon/fsub.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Check that we generate single precision floating point subtract in V5.
+
+; CHECK: r{{[0-9]+}} = sfsub(r{{[0-9]+}}, r{{[0-9]+}})
+
+define i32 @main() nounwind {
+entry:
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ %c = alloca float, align 4
+ store float 0x402ECCCCC0000000, float* %a, align 4
+ store float 0x4022333340000000, float* %b, align 4
+ %0 = load float* %b, align 4
+ %1 = load float* %a, align 4
+ %sub = fsub float %0, %1
+ store float %sub, float* %c, align 4
+ ret i32 0
+}
diff --git a/test/CodeGen/Hexagon/fusedandshift.ll b/test/CodeGen/Hexagon/fusedandshift.ll
new file mode 100644
index 0000000..022b3c6
--- /dev/null
+++ b/test/CodeGen/Hexagon/fusedandshift.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate a fused logical AND with shift instruction.
+
+; CHECK: r{{[0-9]+}} = and(#15, lsr(r{{[0-9]+}}, #{{[0-9]+}}))
+
+define i32 @main(i16* %a, i16* %b) nounwind {
+ entry:
+ %0 = load i16* %a, align 2
+ %conv1 = sext i16 %0 to i32
+ %shr1 = ashr i32 %conv1, 3
+ %and1 = and i32 %shr1, 15
+ %conv2 = trunc i32 %and1 to i16
+ store i16 %conv2, i16* %b, align 2
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Hexagon/macint.ll b/test/CodeGen/Hexagon/macint.ll
new file mode 100644
index 0000000..b3b9d0e
--- /dev/null
+++ b/test/CodeGen/Hexagon/macint.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate integer multiply accumulate.
+
+; CHECK: r{{[0-9]+}} += mpyi(r{{[0-9]+}}, r{{[0-9]+}})
+
+define i32 @main(i32* %a, i32* %b) nounwind {
+ entry:
+ %0 = load i32* %a, align 4
+ %div = udiv i32 %0, 10000
+ %rem = urem i32 %div, 10
+ store i32 %rem, i32* %b, align 4
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Hexagon/mpy.ll b/test/CodeGen/Hexagon/mpy.ll
index afd6fc6..d5c5ae3 100644
--- a/test/CodeGen/Hexagon/mpy.ll
+++ b/test/CodeGen/Hexagon/mpy.ll
@@ -1,5 +1,4 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
; CHECK: += mpyi
define void @foo(i32 %acc, i32 %num, i32 %num2) nounwind {
diff --git a/test/CodeGen/Hexagon/newvaluejump.ll b/test/CodeGen/Hexagon/newvaluejump.ll
new file mode 100644
index 0000000..9c7ca55
--- /dev/null
+++ b/test/CodeGen/Hexagon/newvaluejump.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate a new-value jump.
+
+@i = global i32 0, align 4
+@j = global i32 10, align 4
+
+define i32 @foo(i32 %a) nounwind {
+entry:
+; CHECK: if (cmp.eq(r{{[0-9]+}}.new, #0)) jump{{.}}
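+; (In a new-value jump the compare reads a register, tagged .new, that is
+; produced by another instruction in the same packet.)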
+ %addr1 = alloca i32, align 4
+ %addr2 = alloca i32, align 4
+ %0 = load i32* @i, align 4
+ store i32 %0, i32* %addr1, align 4
+ call void @bar(i32 1, i32 2)
+ %1 = load i32* @j, align 4
+ %tobool = icmp ne i32 %1, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then:
+ call void @baz(i32 1, i32 2)
+ br label %if.end
+
+if.else:
+ call void @guy(i32 10, i32 20)
+ br label %if.end
+
+if.end:
+ ret i32 0
+}
+
+declare void @guy(i32, i32)
+declare void @bar(i32, i32)
+declare void @baz(i32, i32)
diff --git a/test/CodeGen/Hexagon/newvaluejump2.ll b/test/CodeGen/Hexagon/newvaluejump2.ll
new file mode 100644
index 0000000..3d50ea5
--- /dev/null
+++ b/test/CodeGen/Hexagon/newvaluejump2.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate a register-register new-value jump, with one of
+; the registers produced as a new value.
+
+@Reg = common global i8 0, align 1
+define i32 @main() nounwind {
+entry:
+; CHECK: if (cmp.gt(r{{[0-9]+}}.new, r{{[0-9]+}})) jump:{{[t|nt]}} .LBB{{[0-9]+}}_{{[0-9]+}}
+ %Reg2 = alloca i8, align 1
+ %0 = load i8* %Reg2, align 1
+ %conv0 = zext i8 %0 to i32
+ %1 = load i8* @Reg, align 1
+ %conv1 = zext i8 %1 to i32
+ %tobool = icmp sle i32 %conv0, %conv1
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then:
+ call void @bar(i32 1, i32 2)
+ br label %if.end
+
+if.else:
+ call void @baz(i32 10, i32 20)
+ br label %if.end
+
+if.end:
+ ret i32 0
+}
+
+declare void @bar(i32, i32)
+declare void @baz(i32, i32)
diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll
new file mode 100644
index 0000000..ab69b22
--- /dev/null
+++ b/test/CodeGen/Hexagon/newvaluestore.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate a new-value store packet in V4
+
+@i = global i32 0, align 4
+@j = global i32 10, align 4
+@k = global i32 100, align 4
+
+define i32 @main() nounwind {
+entry:
+; CHECK: memw(r{{[0-9]+}} + #{{[0-9]+}}) = r{{[0-9]+}}.new
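+; (A new-value store writes a result, tagged .new, that was produced earlier
+; in the same packet.)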
+ %number1 = alloca i32, align 4
+ %number2 = alloca i32, align 4
+ %number3 = alloca i32, align 4
+ %0 = load i32* @i, align 4
+ store i32 %0, i32* %number1, align 4
+ %1 = load i32* @j, align 4
+ store i32 %1, i32* %number2, align 4
+ %2 = load i32* @k, align 4
+ store i32 %2, i32* %number3, align 4
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/Hexagon/opt-fabs.ll b/test/CodeGen/Hexagon/opt-fabs.ll
new file mode 100644
index 0000000..31b56fd
--- /dev/null
+++ b/test/CodeGen/Hexagon/opt-fabs.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv5 < %s | FileCheck %s
+; Optimize fabsf to clrbit in V5.
+
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
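+; (Bit 31 of an IEEE-754 single is the sign bit, so fabsf reduces to
+; clearing that one bit.)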
+
+define float @my_fabsf(float %x) nounwind {
+entry:
+ %x.addr = alloca float, align 4
+ store float %x, float* %x.addr, align 4
+ %0 = load float* %x.addr, align 4
+ %call = call float @fabsf(float %0) readnone
+ ret float %call
+}
+
+declare float @fabsf(float)
diff --git a/test/CodeGen/Hexagon/opt-fneg.ll b/test/CodeGen/Hexagon/opt-fneg.ll
new file mode 100644
index 0000000..479b4b6
--- /dev/null
+++ b/test/CodeGen/Hexagon/opt-fneg.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Optimize fneg to togglebit in V5.
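+; (Flipping bit 31 toggles the IEEE-754 sign bit; fneg and multiplication
+; by -1.0, as in @baz below, both reduce to this single bit toggle.)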
+
+define float @foo(float %x) nounwind {
+entry:
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #31)
+ %x.addr = alloca float, align 4
+ store float %x, float* %x.addr, align 4
+ %0 = load float* %x.addr, align 4
+ %sub = fsub float -0.000000e+00, %0
+ ret float %sub
+}
+
+define float @bar(float %x) nounwind {
+entry:
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #31)
+ %sub = fsub float -0.000000e+00, %x
+ ret float %sub
+}
+
+define float @baz(float %x) nounwind {
+entry:
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #31)
+ %conv1 = fmul float %x, -1.000000e+00
+ ret float %conv1
+}
diff --git a/test/CodeGen/Hexagon/simpletailcall.ll b/test/CodeGen/Hexagon/simpletailcall.ll
new file mode 100644
index 0000000..2876404
--- /dev/null
+++ b/test/CodeGen/Hexagon/simpletailcall.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; CHECK: foo_empty
+; CHECK-NOT: allocframe
+; CHECK-NOT: memd(r29
+; CHECK: jump bar_empty
+
+define void @foo_empty(i32 %h) nounwind {
+entry:
+ %add = add nsw i32 %h, 3
+ %call = tail call i32 bitcast (i32 (...)* @bar_empty to i32 (i32)*)(i32 %add) nounwind
+ ret void
+}
+
+declare i32 @bar_empty(...)
diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll
index c63a3ba..2e4ab63 100644
--- a/test/CodeGen/Hexagon/static.ll
+++ b/test/CodeGen/Hexagon/static.ll
@@ -1,13 +1,12 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
@num = external global i32
@acc = external global i32
@val = external global i32
-; CHECK: CONST32(#num)
-; CHECK: CONST32(#acc)
-; CHECK: CONST32(#val)
+; CHECK: memw(##num)
+; CHECK: memw(##acc)
+; CHECK: memw(##val)
define void @foo() nounwind {
entry:
diff --git a/test/CodeGen/Hexagon/struct_args.ll b/test/CodeGen/Hexagon/struct_args.ll
index 2c962d0..e488f33 100644
--- a/test/CodeGen/Hexagon/struct_args.ll
+++ b/test/CodeGen/Hexagon/struct_args.ll
@@ -1,6 +1,6 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: r1:0 = or(r{{[0-9]}}:{{[0-9]}}, r{{[0-9]}}:{{[0-9]}})
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; CHECK: r{{[0-9]}}:{{[0-9]}} = combine(r{{[0-9]}}, r{{[0-9]}})
+; CHECK: r{{[0-9]}}:{{[0-9]}} |= asl(r{{[0-9]}}:{{[0-9]}}, #32)
%struct.small = type { i32, i32 }
diff --git a/test/CodeGen/Hexagon/struct_args_large.ll b/test/CodeGen/Hexagon/struct_args_large.ll
index 69de4f6..f09fd10 100644
--- a/test/CodeGen/Hexagon/struct_args_large.ll
+++ b/test/CodeGen/Hexagon/struct_args_large.ll
@@ -1,8 +1,7 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
; CHECK: r[[T0:[0-9]+]] = CONST32(#s2)
-; CHECK: r[[T1:[0-9]+]] = memw(r[[T0]] + #0)
-; CHECK: memw(r29 + #0) = r[[T1]]
+; CHECK: memw(r29 + #0) = r{{.}}
+; CHECK: memw(r29+#8) = r{{.}}
%struct.large = type { i64, i64 }
diff --git a/test/CodeGen/Hexagon/vaddh.ll b/test/CodeGen/Hexagon/vaddh.ll
index 788e474..01d2041 100644
--- a/test/CodeGen/Hexagon/vaddh.ll
+++ b/test/CodeGen/Hexagon/vaddh.ll
@@ -1,5 +1,4 @@
-; RUN: true
-; DISABLED: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
; CHECK: vaddh(r{{[0-9]+}}, r{{[0-9]+}})
@j = external global i32
diff --git a/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll b/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll
index b92477b..c3d69c7 100644
--- a/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll
+++ b/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll
@@ -5,9 +5,9 @@ target triple = "msp430-unknown-linux-gnu"
define msp430_intrcc void @foo() nounwind {
entry:
- %fa = call i16* @llvm.frameaddress(i32 0)
- store i16 0, i16* %fa
+ %fa = call i8* @llvm.frameaddress(i32 0)
+ store i8 0, i8* %fa
ret void
}
-declare i16* @llvm.frameaddress(i32)
+declare i8* @llvm.frameaddress(i32)
diff --git a/test/CodeGen/MSP430/Inst8rr.ll b/test/CodeGen/MSP430/Inst8rr.ll
index 45342e2..b9c17d9 100644
--- a/test/CodeGen/MSP430/Inst8rr.ll
+++ b/test/CodeGen/MSP430/Inst8rr.ll
@@ -4,7 +4,7 @@ target triple = "msp430-generic-generic"
define i8 @mov(i8 %a, i8 %b) nounwind {
; CHECK: mov:
-; CHECK: mov.b r14, r15
+; CHECK: mov.{{[bw]}} r14, r15
ret i8 %b
}
diff --git a/test/CodeGen/Mips/2008-07-23-fpcmp.ll b/test/CodeGen/Mips/2008-07-23-fpcmp.ll
index 519e4b9..9c547f1 100644
--- a/test/CodeGen/Mips/2008-07-23-fpcmp.ll
+++ b/test/CodeGen/Mips/2008-07-23-fpcmp.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=mips -o %t
-; RUN: grep {c\\..*\\.s} %t | count 3
-; RUN: grep {bc1\[tf\]} %t | count 3
+; RUN: grep "c\..*\.s" %t | count 3
+; RUN: grep "bc1[tf]" %t | count 3
; FIXME: Disabled because branch instructions are generated where
; conditional move instructions are expected.
diff --git a/test/CodeGen/Mips/2008-07-29-icmp.ll b/test/CodeGen/Mips/2008-07-29-icmp.ll
index e85a749..e88e3d3 100644
--- a/test/CodeGen/Mips/2008-07-29-icmp.ll
+++ b/test/CodeGen/Mips/2008-07-29-icmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=mips | grep {b\[ne\]\[eq\]} | count 1
+; RUN: llc < %s -march=mips | grep "b[ne][eq]" | count 1
; FIXME: Disabled because branch instructions are generated where
; conditional move instructions are expected.
diff --git a/test/CodeGen/Mips/2010-07-20-Switch.ll b/test/CodeGen/Mips/2010-07-20-Switch.ll
index aaf6767..261fe9d 100644
--- a/test/CodeGen/Mips/2010-07-20-Switch.ll
+++ b/test/CodeGen/Mips/2010-07-20-Switch.ll
@@ -7,19 +7,22 @@ entry:
%x = alloca i32, align 4 ; <i32*> [#uses=2]
store volatile i32 2, i32* %x, align 4
%0 = load volatile i32* %x, align 4 ; <i32> [#uses=1]
-; STATIC-O32: lui $[[R0:[0-9]+]], %hi($JTI0_0)
-; STATIC-O32: addiu ${{[0-9]+}}, $[[R0]], %lo($JTI0_0)
-; STATIC-O32: sll ${{[0-9]+}}, ${{[0-9]+}}, 2
-; PIC-O32: lw $[[R0:[0-9]+]], %got($JTI0_0)
-; PIC-O32: addiu ${{[0-9]+}}, $[[R0]], %lo($JTI0_0)
-; PIC-O32: sll ${{[0-9]+}}, ${{[0-9]+}}, 2
-; PIC-O32: addu $[[R1:[0-9]+]], ${{[0-9]+}}, $gp
-; PIC-O32: jr $[[R1]]
-; PIC-N64: ld $[[R0:[0-9]+]], %got_page($JTI0_0)
-; PIC-N64: daddiu ${{[0-9]+}}, $[[R0]], %got_ofst($JTI0_0)
-; PIC-N64: dsll ${{[0-9]+}}, ${{[0-9]+}}, 3
-; PIC-N64: daddu $[[R1:[0-9]+]], ${{[0-9]+}}, $gp
-; PIC-N64: jr $[[R1]]
+; STATIC-O32: sll $[[R0:[0-9]+]], ${{[0-9]+}}, 2
+; STATIC-O32: lui $[[R1:[0-9]+]], %hi($JTI0_0)
+; STATIC-O32: addu $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; STATIC-O32: lw $[[R3:[0-9]+]], %lo($JTI0_0)($[[R2]])
+; PIC-O32: sll $[[R0:[0-9]+]], ${{[0-9]+}}, 2
+; PIC-O32: lw $[[R1:[0-9]+]], %got($JTI0_0)
+; PIC-O32: addu $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; PIC-O32: lw $[[R4:[0-9]+]], %lo($JTI0_0)($[[R2]])
+; PIC-O32: addu $[[R5:[0-9]+]], $[[R4:[0-9]+]]
+; PIC-O32: jr $[[R5]]
+; PIC-N64: dsll $[[R0:[0-9]+]], ${{[0-9]+}}, 3
+; PIC-N64: ld $[[R1:[0-9]+]], %got_page($JTI0_0)
+; PIC-N64: daddu $[[R2:[0-9]+]], $[[R0:[0-9]+]], $[[R1]]
+; PIC-N64: ld $[[R4:[0-9]+]], %got_ofst($JTI0_0)($[[R2]])
+; PIC-N64: daddu $[[R5:[0-9]+]], $[[R4:[0-9]+]]
+; PIC-N64: jr $[[R5]]
switch i32 %0, label %bb4 [
i32 0, label %bb5
i32 1, label %bb1
@@ -30,7 +33,6 @@ entry:
bb1: ; preds = %entry
ret i32 2
-; CHECK: STATIC-O32: $BB0_2
bb2: ; preds = %entry
ret i32 0
diff --git a/test/CodeGen/Mips/alloca.ll b/test/CodeGen/Mips/alloca.ll
index 15c73e2..29f43c8 100644
--- a/test/CodeGen/Mips/alloca.ll
+++ b/test/CodeGen/Mips/alloca.ll
@@ -4,14 +4,10 @@ define i32 @twoalloca(i32 %size) nounwind {
entry:
; CHECK: subu $[[T0:[0-9]+]], $sp, $[[SZ:[0-9]+]]
; CHECK: addu $sp, $zero, $[[T0]]
-; CHECK: addiu $[[T1:[0-9]+]], $sp, [[OFF:[0-9]+]]
; CHECK: subu $[[T2:[0-9]+]], $sp, $[[SZ]]
; CHECK: addu $sp, $zero, $[[T2]]
-; CHECK: addiu $[[T3:[0-9]+]], $sp, [[OFF]]
-; CHECK: lw $[[T4:[0-9]+]], %call16(foo)($gp)
-; CHECK: addu $25, $zero, $[[T4]]
-; CHECK: addu $4, $zero, $[[T1]]
-; CHECK: jalr $25
+; CHECK: addu $4, $zero, $[[T0]]
+; CHECK: addu $4, $zero, $[[T2]]
%tmp1 = alloca i8, i32 %size, align 4
%add.ptr = getelementptr inbounds i8* %tmp1, i32 5
store i8 97, i8* %add.ptr, align 1
@@ -31,14 +27,9 @@ declare i32 @foo(i8*)
define i32 @alloca2(i32 %size) nounwind {
entry:
-; dynamic allocated stack area and $gp restore slot have the same offsets
-; relative to $sp.
-;
; CHECK: alloca2
-; CHECK: .cprestore [[OFF:[0-9]+]]
-; CHECK: subu $[[T0:[0-9]+]], $sp, $[[SZ:[0-9]+]]
+; CHECK: subu $[[T0:[0-9]+]], $sp
; CHECK: addu $sp, $zero, $[[T0]]
-; CHECK: addiu $[[T1:[0-9]+]], $sp, [[OFF]]
%tmp1 = alloca i8, i32 %size, align 4
%0 = bitcast i8* %tmp1 to i32*
@@ -46,7 +37,7 @@ entry:
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
-; CHECK: addiu $4, $[[T1]], 40
+; CHECK: addiu $4, $[[T0]], 40
%add.ptr = getelementptr inbounds i8* %tmp1, i32 40
%1 = bitcast i8* %add.ptr to i32*
@@ -56,7 +47,7 @@ if.then: ; preds = %entry
br label %if.end
if.else: ; preds = %entry
-; CHECK: addiu $4, $[[T1]], 12
+; CHECK: addiu $4, $[[T0]], 12
%add.ptr5 = getelementptr inbounds i8* %tmp1, i32 12
%2 = bitcast i8* %add.ptr5 to i32*
@@ -64,7 +55,7 @@ if.else: ; preds = %entry
br label %if.end
if.end: ; preds = %if.else, %if.then
-; CHECK: lw $5, 0($[[T1]])
+; CHECK: lw $5, 0($[[T0]])
; CHECK: lw $25, %call16(printf)
%.pre-phi = phi i32* [ %2, %if.else ], [ %.pre, %if.then ]
diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll
index bc5bcc3..8ec5d93 100644
--- a/test/CodeGen/Mips/analyzebranch.ll
+++ b/test/CodeGen/Mips/analyzebranch.ll
@@ -2,9 +2,8 @@
define double @foo(double %a, double %b) nounwind readnone {
entry:
-; CHECK: bc1f $BB0_2
+; CHECK: bc1f $BB
; CHECK: nop
-; CHECK: # BB#1:
%cmp = fcmp ogt double %a, 0.000000e+00
br i1 %cmp, label %if.end6, label %if.else
@@ -26,9 +25,8 @@ return: ; preds = %if.else, %if.end6
define void @f1(float %f) nounwind {
entry:
-; CHECK: bc1f $BB1_1
+; CHECK: bc1f $BB
; CHECK: nop
-; CHECK: # BB#2:
%cmp = fcmp une float %f, 0.000000e+00
br i1 %cmp, label %if.then, label %if.end
diff --git a/test/CodeGen/Mips/and1.ll b/test/CodeGen/Mips/and1.ll
new file mode 100644
index 0000000..4ff1204
--- /dev/null
+++ b/test/CodeGen/Mips/and1.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @x, align 4
+ %1 = load i32* @y, align 4
+ %and = and i32 %0, %1
+; 16: and ${{[0-9]+}}, ${{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0), i32 %and)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/asm-large-immediate.ll b/test/CodeGen/Mips/asm-large-immediate.ll
new file mode 100644
index 0000000..246fff6
--- /dev/null
+++ b/test/CodeGen/Mips/asm-large-immediate.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+define void @test() {
+entry:
+; CHECK: /* result: 68719476738 */
+ tail call void asm sideeffect "/* result: ${0:c} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+; CHECK: /* result: -68719476738 */
+ tail call void asm sideeffect "/* result: ${0:n} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index a4763b1..050689d 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -8,7 +8,7 @@ entry:
ret i32 %0
; CHECK: AtomicLoadAdd32:
-; CHECK: lw $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(x)
; CHECK: $[[BB0:[A-Z_0-9]+]]:
; CHECK: ll $[[R1:[0-9]+]], 0($[[R0]])
; CHECK: addu $[[R2:[0-9]+]], $[[R1]], $4
@@ -22,7 +22,7 @@ entry:
ret i32 %0
; CHECK: AtomicLoadNand32:
-; CHECK: lw $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(x)
; CHECK: $[[BB0:[A-Z_0-9]+]]:
; CHECK: ll $[[R1:[0-9]+]], 0($[[R0]])
; CHECK: and $[[R3:[0-9]+]], $[[R1]], $4
@@ -40,7 +40,7 @@ entry:
ret i32 %0
; CHECK: AtomicSwap32:
-; CHECK: lw $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(x)
; CHECK: $[[BB0:[A-Z_0-9]+]]:
; CHECK: ll ${{[0-9]+}}, 0($[[R0]])
; CHECK: sc $[[R2:[0-9]+]], 0($[[R0]])
@@ -56,7 +56,7 @@ entry:
ret i32 %0
; CHECK: AtomicCmpSwap32:
-; CHECK: lw $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(x)
; CHECK: $[[BB0:[A-Z_0-9]+]]:
; CHECK: ll $2, 0($[[R0]])
; CHECK: bne $2, $4, $[[BB1:[A-Z_0-9]+]]
@@ -75,7 +75,7 @@ entry:
ret i8 %0
; CHECK: AtomicLoadAdd8:
-; CHECK: lw $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(y)
; CHECK: addiu $[[R1:[0-9]+]], $zero, -4
; CHECK: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
; CHECK: andi $[[R3:[0-9]+]], $[[R0]], 3
@@ -106,7 +106,7 @@ entry:
ret i8 %0
; CHECK: AtomicLoadSub8:
-; CHECK: lw $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(y)
; CHECK: addiu $[[R1:[0-9]+]], $zero, -4
; CHECK: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
; CHECK: andi $[[R3:[0-9]+]], $[[R0]], 3
@@ -137,7 +137,7 @@ entry:
ret i8 %0
; CHECK: AtomicLoadNand8:
-; CHECK: lw $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(y)
; CHECK: addiu $[[R1:[0-9]+]], $zero, -4
; CHECK: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
; CHECK: andi $[[R3:[0-9]+]], $[[R0]], 3
@@ -169,7 +169,7 @@ entry:
ret i8 %0
; CHECK: AtomicSwap8:
-; CHECK: lw $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(y)
; CHECK: addiu $[[R1:[0-9]+]], $zero, -4
; CHECK: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
; CHECK: andi $[[R3:[0-9]+]], $[[R0]], 3
@@ -198,7 +198,7 @@ entry:
ret i8 %0
; CHECK: AtomicCmpSwap8:
-; CHECK: lw $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(y)
; CHECK: addiu $[[R1:[0-9]+]], $zero, -4
; CHECK: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
; CHECK: andi $[[R3:[0-9]+]], $[[R0]], 3
@@ -242,3 +242,19 @@ entry:
; CHECK: sync 0
}
+; make sure that this assertion in
+; TwoAddressInstructionPass::TryInstructionTransform does not fail:
+;
+; line 1203: assert(TargetRegisterInfo::isVirtualRegister(regB) &&
+;
+; it failed when MipsDAGToDAGISel::ReplaceUsesWithZeroReg replaced an
+; operand of an atomic instruction with register $zero.
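+; (Presumably the i32 0 swap value in the cmpxchg below is the operand that
+; was rewritten to $zero.)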
+@a = external global i32
+
+define i32 @zeroreg() nounwind {
+entry:
+ %0 = cmpxchg i32* @a, i32 1, i32 0 seq_cst
+ %1 = icmp eq i32 %0, 1
+ %conv = zext i1 %1 to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index 03254a9..3af899a 100755
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -5,10 +5,12 @@
@i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4
@i3 = common global i32* null, align 4
-; O32: lw ${{[0-9]+}}, %got(i3)($gp)
-; O32: addiu ${{[0-9]+}}, $gp, %got(i1)
-; N64: ld ${{[0-9]+}}, %got_disp(i3)($gp)
-; N64: daddiu ${{[0-9]+}}, $gp, %got_disp(i1)
+; O32: lw $[[R0:[0-9]+]], %got(i3)
+; O32: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(i1)
+; O32: movn $[[R0]], $[[R1]], ${{[0-9]+}}
+; N64: ldr $[[R0:[0-9]+]]
+; N64: ld $[[R1:[0-9]+]], %got_disp(i1)
+; N64: movn $[[R0]], $[[R1]], ${{[0-9]+}}
define i32* @cmov1(i32 %s) nounwind readonly {
entry:
%tobool = icmp ne i32 %s, 0
@@ -21,12 +23,12 @@ entry:
@d = global i32 0, align 4
; O32: cmov2:
-; O32: addiu $[[R1:[0-9]+]], $gp, %got(d)
-; O32: addiu $[[R0:[0-9]+]], $gp, %got(c)
+; O32: addiu $[[R1:[0-9]+]], ${{[a-z0-9]+}}, %got(d)
+; O32: addiu $[[R0:[0-9]+]], ${{[a-z0-9]+}}, %got(c)
; O32: movn $[[R1]], $[[R0]], ${{[0-9]+}}
; N64: cmov2:
-; N64: daddiu $[[R1:[0-9]+]], $gp, %got_disp(d)
-; N64: daddiu $[[R0:[0-9]+]], $gp, %got_disp(c)
+; N64: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d)
+; N64: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got_disp(c)
; N64: movn $[[R1]], $[[R0]], ${{[0-9]+}}
define i32 @cmov2(i32 %s) nounwind readonly {
entry:
@@ -37,3 +39,23 @@ entry:
ret i32 %cond
}
+; O32: cmov3:
+; O32: xori $[[R0:[0-9]+]], ${{[0-9]+}}, 234
+; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+define i32 @cmov3(i32 %a, i32 %b, i32 %c) nounwind readnone {
+entry:
+ %cmp = icmp eq i32 %a, 234
+ %cond = select i1 %cmp, i32 %b, i32 %c
+ ret i32 %cond
+}
+
+; N64: cmov4:
+; N64: xori $[[R0:[0-9]+]], ${{[0-9]+}}, 234
+; N64: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+define i64 @cmov4(i32 %a, i64 %b, i64 %c) nounwind readnone {
+entry:
+ %cmp = icmp eq i32 %a, 234
+ %cond = select i1 %cmp, i64 %b, i64 %c
+ ret i64 %cond
+}
+
diff --git a/test/CodeGen/Mips/cprestore.ll b/test/CodeGen/Mips/cprestore.ll
index 57d022f..a618b67 100644
--- a/test/CodeGen/Mips/cprestore.ll
+++ b/test/CodeGen/Mips/cprestore.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; DISABLED: llc -march=mipsel < %s | FileCheck %s
+; RUN: false
+; XFAIL: *
; CHECK: .set macro
; CHECK: .set at
diff --git a/test/CodeGen/Mips/eh.ll b/test/CodeGen/Mips/eh.ll
index 2e2f9a4..d14150a 100644
--- a/test/CodeGen/Mips/eh.ll
+++ b/test/CodeGen/Mips/eh.ll
@@ -15,7 +15,6 @@ entry:
; CHECK-EB: .cfi_offset 53, -8
; CHECK-EB: .cfi_offset 52, -4
; CHECK-EL: .cfi_offset 31, -12
-; CHECK-EL: .cprestore
%exception = tail call i8* @__cxa_allocate_exception(i32 8) nounwind
%0 = bitcast i8* %exception to double*
@@ -25,7 +24,6 @@ entry:
lpad: ; preds = %entry
; CHECK-EL: # %lpad
-; CHECK-EL: lw $gp
; CHECK-EL: bne $5
%exn.val = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
diff --git a/test/CodeGen/Mips/fabs.ll b/test/CodeGen/Mips/fabs.ll
index b296ab3..49d8a72 100644
--- a/test/CodeGen/Mips/fabs.ll
+++ b/test/CodeGen/Mips/fabs.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=32
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=64
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2
-; RUN: llc < %s -march=mipsel -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s -check-prefix=NO-NAN
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 | FileCheck %s -check-prefix=32
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2
+; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=64
+; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s -check-prefix=NO-NAN
define float @foo0(float %a) nounwind readnone {
entry:
diff --git a/test/CodeGen/Mips/fastcc.ll b/test/CodeGen/Mips/fastcc.ll
new file mode 100644
index 0000000..82919e7
--- /dev/null
+++ b/test/CodeGen/Mips/fastcc.ll
@@ -0,0 +1,253 @@
+; RUN: llc < %s -march=mipsel | FileCheck %s
+
+@gi0 = external global i32
+@gi1 = external global i32
+@gi2 = external global i32
+@gi3 = external global i32
+@gi4 = external global i32
+@gi5 = external global i32
+@gi6 = external global i32
+@gi7 = external global i32
+@gi8 = external global i32
+@gi9 = external global i32
+@gi10 = external global i32
+@gi11 = external global i32
+@gi12 = external global i32
+@gi13 = external global i32
+@gi14 = external global i32
+@gi15 = external global i32
+@gi16 = external global i32
+@gfa0 = external global float
+@gfa1 = external global float
+@gfa2 = external global float
+@gfa3 = external global float
+@gfa4 = external global float
+@gfa5 = external global float
+@gfa6 = external global float
+@gfa7 = external global float
+@gfa8 = external global float
+@gfa9 = external global float
+@gfa10 = external global float
+@gfa11 = external global float
+@gfa12 = external global float
+@gfa13 = external global float
+@gfa14 = external global float
+@gfa15 = external global float
+@gfa16 = external global float
+@gfa17 = external global float
+@gfa18 = external global float
+@gfa19 = external global float
+@gfa20 = external global float
+@gf0 = external global float
+@gf1 = external global float
+@gf2 = external global float
+@gf3 = external global float
+@gf4 = external global float
+@gf5 = external global float
+@gf6 = external global float
+@gf7 = external global float
+@gf8 = external global float
+@gf9 = external global float
+@gf10 = external global float
+@gf11 = external global float
+@gf12 = external global float
+@gf13 = external global float
+@gf14 = external global float
+@gf15 = external global float
+@gf16 = external global float
+@gf17 = external global float
+@gf18 = external global float
+@gf19 = external global float
+@gf20 = external global float
+@g0 = external global i32
+@g1 = external global i32
+@g2 = external global i32
+@g3 = external global i32
+@g4 = external global i32
+@g5 = external global i32
+@g6 = external global i32
+@g7 = external global i32
+@g8 = external global i32
+@g9 = external global i32
+@g10 = external global i32
+@g11 = external global i32
+@g12 = external global i32
+@g13 = external global i32
+@g14 = external global i32
+@g15 = external global i32
+@g16 = external global i32
+
+define void @caller0() nounwind {
+entry:
+; CHECK: caller0
+; CHECK: lw $3
+; CHECK: lw $24
+; CHECK: lw $15
+; CHECK: lw $14
+; CHECK: lw $13
+; CHECK: lw $12
+; CHECK: lw $11
+; CHECK: lw $10
+; CHECK: lw $9
+; CHECK: lw $8
+; CHECK: lw $7
+; CHECK: lw $6
+; CHECK: lw $5
+; CHECK: lw $4
+
+ %0 = load i32* @gi0, align 4
+ %1 = load i32* @gi1, align 4
+ %2 = load i32* @gi2, align 4
+ %3 = load i32* @gi3, align 4
+ %4 = load i32* @gi4, align 4
+ %5 = load i32* @gi5, align 4
+ %6 = load i32* @gi6, align 4
+ %7 = load i32* @gi7, align 4
+ %8 = load i32* @gi8, align 4
+ %9 = load i32* @gi9, align 4
+ %10 = load i32* @gi10, align 4
+ %11 = load i32* @gi11, align 4
+ %12 = load i32* @gi12, align 4
+ %13 = load i32* @gi13, align 4
+ %14 = load i32* @gi14, align 4
+ %15 = load i32* @gi15, align 4
+ %16 = load i32* @gi16, align 4
+ tail call fastcc void @callee0(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16)
+ ret void
+}
+
+define internal fastcc void @callee0(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) nounwind noinline {
+entry:
+; CHECK: callee0
+; CHECK: sw $4
+; CHECK: sw $5
+; CHECK: sw $6
+; CHECK: sw $7
+; CHECK: sw $8
+; CHECK: sw $9
+; CHECK: sw $10
+; CHECK: sw $11
+; CHECK: sw $12
+; CHECK: sw $13
+; CHECK: sw $14
+; CHECK: sw $15
+; CHECK: sw $24
+; CHECK: sw $3
+
+ store i32 %a0, i32* @g0, align 4
+ store i32 %a1, i32* @g1, align 4
+ store i32 %a2, i32* @g2, align 4
+ store i32 %a3, i32* @g3, align 4
+ store i32 %a4, i32* @g4, align 4
+ store i32 %a5, i32* @g5, align 4
+ store i32 %a6, i32* @g6, align 4
+ store i32 %a7, i32* @g7, align 4
+ store i32 %a8, i32* @g8, align 4
+ store i32 %a9, i32* @g9, align 4
+ store i32 %a10, i32* @g10, align 4
+ store i32 %a11, i32* @g11, align 4
+ store i32 %a12, i32* @g12, align 4
+ store i32 %a13, i32* @g13, align 4
+ store i32 %a14, i32* @g14, align 4
+ store i32 %a15, i32* @g15, align 4
+ store i32 %a16, i32* @g16, align 4
+ ret void
+}
+
+define void @caller1(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8, float %a9, float %a10, float %a11, float %a12, float %a13, float %a14, float %a15, float %a16, float %a17, float %a18, float %a19, float %a20) nounwind {
+entry:
+; CHECK: caller1
+; CHECK: lwc1 $f19
+; CHECK: lwc1 $f18
+; CHECK: lwc1 $f17
+; CHECK: lwc1 $f16
+; CHECK: lwc1 $f15
+; CHECK: lwc1 $f14
+; CHECK: lwc1 $f13
+; CHECK: lwc1 $f12
+; CHECK: lwc1 $f11
+; CHECK: lwc1 $f10
+; CHECK: lwc1 $f9
+; CHECK: lwc1 $f8
+; CHECK: lwc1 $f7
+; CHECK: lwc1 $f6
+; CHECK: lwc1 $f5
+; CHECK: lwc1 $f4
+; CHECK: lwc1 $f3
+; CHECK: lwc1 $f2
+; CHECK: lwc1 $f1
+; CHECK: lwc1 $f0
+
+ %0 = load float* @gfa0, align 4
+ %1 = load float* @gfa1, align 4
+ %2 = load float* @gfa2, align 4
+ %3 = load float* @gfa3, align 4
+ %4 = load float* @gfa4, align 4
+ %5 = load float* @gfa5, align 4
+ %6 = load float* @gfa6, align 4
+ %7 = load float* @gfa7, align 4
+ %8 = load float* @gfa8, align 4
+ %9 = load float* @gfa9, align 4
+ %10 = load float* @gfa10, align 4
+ %11 = load float* @gfa11, align 4
+ %12 = load float* @gfa12, align 4
+ %13 = load float* @gfa13, align 4
+ %14 = load float* @gfa14, align 4
+ %15 = load float* @gfa15, align 4
+ %16 = load float* @gfa16, align 4
+ %17 = load float* @gfa17, align 4
+ %18 = load float* @gfa18, align 4
+ %19 = load float* @gfa19, align 4
+ %20 = load float* @gfa20, align 4
+ tail call fastcc void @callee1(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16, float %17, float %18, float %19, float %20)
+ ret void
+}
+
+define internal fastcc void @callee1(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8, float %a9, float %a10, float %a11, float %a12, float %a13, float %a14, float %a15, float %a16, float %a17, float %a18, float %a19, float %a20) nounwind noinline {
+entry:
+; CHECK: callee1
+; CHECK: swc1 $f0
+; CHECK: swc1 $f1
+; CHECK: swc1 $f2
+; CHECK: swc1 $f3
+; CHECK: swc1 $f4
+; CHECK: swc1 $f5
+; CHECK: swc1 $f6
+; CHECK: swc1 $f7
+; CHECK: swc1 $f8
+; CHECK: swc1 $f9
+; CHECK: swc1 $f10
+; CHECK: swc1 $f11
+; CHECK: swc1 $f12
+; CHECK: swc1 $f13
+; CHECK: swc1 $f14
+; CHECK: swc1 $f15
+; CHECK: swc1 $f16
+; CHECK: swc1 $f17
+; CHECK: swc1 $f18
+; CHECK: swc1 $f19
+
+ store float %a0, float* @gf0, align 4
+ store float %a1, float* @gf1, align 4
+ store float %a2, float* @gf2, align 4
+ store float %a3, float* @gf3, align 4
+ store float %a4, float* @gf4, align 4
+ store float %a5, float* @gf5, align 4
+ store float %a6, float* @gf6, align 4
+ store float %a7, float* @gf7, align 4
+ store float %a8, float* @gf8, align 4
+ store float %a9, float* @gf9, align 4
+ store float %a10, float* @gf10, align 4
+ store float %a11, float* @gf11, align 4
+ store float %a12, float* @gf12, align 4
+ store float %a13, float* @gf13, align 4
+ store float %a14, float* @gf14, align 4
+ store float %a15, float* @gf15, align 4
+ store float %a16, float* @gf16, align 4
+ store float %a17, float* @gf17, align 4
+ store float %a18, float* @gf18, align 4
+ store float %a19, float* @gf19, align 4
+ store float %a20, float* @gf20, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/fp-indexed-ls.ll b/test/CodeGen/Mips/fp-indexed-ls.ll
index 08bd6e7..1c4a3fd 100644
--- a/test/CodeGen/Mips/fp-indexed-ls.ll
+++ b/test/CodeGen/Mips/fp-indexed-ls.ll
@@ -28,7 +28,7 @@ entry:
define float @foo2(i32 %b, i32 %c) nounwind readonly {
entry:
-; CHECK: luxc1
+; CHECK-NOT: luxc1
%arrayidx1 = getelementptr inbounds [4 x %struct.S]* @s, i32 0, i32 %b, i32 0, i32 %c
%0 = load float* %arrayidx1, align 1
ret float %0
@@ -54,7 +54,7 @@ entry:
define void @foo5(i32 %b, i32 %c) nounwind {
entry:
-; CHECK: suxc1
+; CHECK-NOT: suxc1
%0 = load float* @gf, align 4
%arrayidx1 = getelementptr inbounds [4 x %struct.S]* @s, i32 0, i32 %b, i32 0, i32 %c
store float %0, float* %arrayidx1, align 1
@@ -64,7 +64,7 @@ entry:
define double @foo6(i32 %b, i32 %c) nounwind readonly {
entry:
; CHECK: foo6
-; CHECK-NOT: ldxc1
+; CHECK-NOT: luxc1
%arrayidx1 = getelementptr inbounds [4 x %struct.S2]* @s2, i32 0, i32 %b, i32 0, i32 %c
%0 = load double* %arrayidx1, align 1
ret double %0
@@ -73,7 +73,7 @@ entry:
define void @foo7(i32 %b, i32 %c) nounwind {
entry:
; CHECK: foo7
-; CHECK-NOT: sdxc1
+; CHECK-NOT: suxc1
%0 = load double* @gd, align 8
%arrayidx1 = getelementptr inbounds [4 x %struct.S2]* @s2, i32 0, i32 %b, i32 0, i32 %c
store double %0, double* %arrayidx1, align 1
@@ -83,7 +83,7 @@ entry:
define float @foo8() nounwind readonly {
entry:
; CHECK: foo8
-; CHECK: luxc1
+; CHECK-NOT: luxc1
%0 = load float* getelementptr inbounds (%struct.S3* @s3, i32 0, i32 1), align 1
ret float %0
}
@@ -91,7 +91,7 @@ entry:
define void @foo9(float %f) nounwind {
entry:
; CHECK: foo9
-; CHECK: suxc1
+; CHECK-NOT: suxc1
store float %f, float* getelementptr inbounds (%struct.S3* @s3, i32 0, i32 1), align 1
ret void
}
diff --git a/test/CodeGen/Mips/fp-spill-reload.ll b/test/CodeGen/Mips/fp-spill-reload.ll
new file mode 100644
index 0000000..f9887a5
--- /dev/null
+++ b/test/CodeGen/Mips/fp-spill-reload.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+; Check that $fp is not reserved.
+
+define void @foo0(i32* nocapture %b) nounwind {
+entry:
+; CHECK: sw $fp
+; CHECK: lw $fp
+ %0 = load i32* %b, align 4
+ %arrayidx.1 = getelementptr inbounds i32* %b, i32 1
+ %1 = load i32* %arrayidx.1, align 4
+ %add.1 = add nsw i32 %1, 1
+ %arrayidx.2 = getelementptr inbounds i32* %b, i32 2
+ %2 = load i32* %arrayidx.2, align 4
+ %add.2 = add nsw i32 %2, 2
+ %arrayidx.3 = getelementptr inbounds i32* %b, i32 3
+ %3 = load i32* %arrayidx.3, align 4
+ %add.3 = add nsw i32 %3, 3
+ %arrayidx.4 = getelementptr inbounds i32* %b, i32 4
+ %4 = load i32* %arrayidx.4, align 4
+ %add.4 = add nsw i32 %4, 4
+ %arrayidx.5 = getelementptr inbounds i32* %b, i32 5
+ %5 = load i32* %arrayidx.5, align 4
+ %add.5 = add nsw i32 %5, 5
+ %arrayidx.6 = getelementptr inbounds i32* %b, i32 6
+ %6 = load i32* %arrayidx.6, align 4
+ %add.6 = add nsw i32 %6, 6
+ %arrayidx.7 = getelementptr inbounds i32* %b, i32 7
+ %7 = load i32* %arrayidx.7, align 4
+ %add.7 = add nsw i32 %7, 7
+ call void @foo2(i32 %0, i32 %add.1, i32 %add.2, i32 %add.3, i32 %add.4, i32 %add.5, i32 %add.6, i32 %add.7) nounwind
+ call void bitcast (void (...)* @foo1 to void ()*)() nounwind
+ call void @foo2(i32 %0, i32 %add.1, i32 %add.2, i32 %add.3, i32 %add.4, i32 %add.5, i32 %add.6, i32 %add.7) nounwind
+ ret void
+}
+
+declare void @foo2(i32, i32, i32, i32, i32, i32, i32, i32)
+
+declare void @foo1(...)
+
diff --git a/test/CodeGen/Mips/global-pointer-reg.ll b/test/CodeGen/Mips/global-pointer-reg.ll
index 174d1f9..1c0eb01 100644
--- a/test/CodeGen/Mips/global-pointer-reg.ll
+++ b/test/CodeGen/Mips/global-pointer-reg.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=mipsel -mips-fix-global-base-reg=false | FileCheck %s
+; DISABLED: llc < %s -march=mipsel -mips-fix-global-base-reg=false | FileCheck %s
+; RUN: false
+; XFAIL: *
@g0 = external global i32
@g1 = external global i32
diff --git a/test/CodeGen/Mips/gprestore.ll b/test/CodeGen/Mips/gprestore.ll
index ee7e131..cbcf0c9 100644
--- a/test/CodeGen/Mips/gprestore.ll
+++ b/test/CodeGen/Mips/gprestore.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=mips < %s | FileCheck %s
+; DISABLED: llc -march=mips < %s | FileCheck %s
+; RUN: false
+; XFAIL: *
@p = external global i32
@q = external global i32
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
new file mode 100644
index 0000000..bee93ac
--- /dev/null
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C1
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C2
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PE
+;
+; re-enable this when mips16's jalr is fixed.
+; DISABLED: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR
+
+
+@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0))
+ ret i32 0
+
+; SR: .set mips16 # @main
+
+; SR: save $ra, [[FS:[0-9]+]]
+; PE: li $[[T1:[0-9]+]], %hi(_gp_disp)
+; PE: addiu $[[T2:[0-9]+]], $pc, %lo(_gp_disp)
+; PE: sll $[[T3:[0-9]+]], $[[T1]], 16
+; C1: lw ${{[0-9]+}}, %got($.str)(${{[0-9]+}})
+; C2: lw ${{[0-9]+}}, %call16(printf)(${{[0-9]+}})
+; C1: addiu ${{[0-9]+}}, %lo($.str)
+; C2: move $25, ${{[0-9]+}}
+; C1: move $gp, ${{[0-9]+}}
+; C1: jalr ${{[0-9]+}}
+; SR: restore $ra, [[FS]]
+; PE: li $2, 0
+; PE: jr $ra
+
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll
new file mode 100644
index 0000000..f9e53cb
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll
@@ -0,0 +1,15 @@
+;
+;This is a negative test. The constant value given for the constraint
+;is greater than 16 bits.
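+;('I' accepts a signed 16-bit immediate; the value 1048576 used below is
+;0x100000, which does not fit in 16 bits.)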
+;
+; RUN: not llc -march=mipsel < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+define i32 @main() nounwind {
+entry:
+
+;CHECK-ERRORS: error: invalid operand for inline asm constraint 'I'
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i32 7, i32 1048576) nounwind
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll
new file mode 100644
index 0000000..1fdf672
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll
@@ -0,0 +1,16 @@
+;
+;This is a negative test. The constant value given for the constraint (J)
+;is non-zero (3).
+;
+; RUN: not llc -march=mipsel < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+define i32 @main() nounwind {
+entry:
+
+;CHECK-ERRORS: error: invalid operand for inline asm constraint 'J'
+
+ tail call i32 asm "addi $0,$1,$2", "=r,r,J"(i32 1024, i32 3) nounwind
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-K.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-K.ll
new file mode 100644
index 0000000..3baf437
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-K.ll
@@ -0,0 +1,16 @@
+;
+;This is a negative test. The constant value given for the constraint (K)
+;is greater than 16 bits (0x00100000).
+;
+; RUN: not llc -march=mipsel < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+define i32 @main() nounwind {
+entry:
+
+;CHECK-ERRORS: error: invalid operand for inline asm constraint 'K'
+
+ tail call i32 asm "addu $0,$1,$2", "=r,r,K"(i32 1024, i32 1048576) nounwind
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll
new file mode 100644
index 0000000..49dcc87
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll
@@ -0,0 +1,16 @@
+;
+;This is a negative test. The constant value given for the constraint (L)
+;is non-zero in the lower 16 bits (0x00100003).
+;
+; RUN: not llc -march=mipsel < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+define i32 @main() nounwind {
+entry:
+
+;CHECK-ERRORS: error: invalid operand for inline asm constraint 'L'
+
+ tail call i32 asm "addi $0,$1,$2", "=r,r,L"(i32 7, i32 1048579) nounwind
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll
new file mode 100644
index 0000000..770669d
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll
@@ -0,0 +1,17 @@
+
+;This is a negative test. The constraint (N) accepts an immediate in the
+;range of -65535 to -1 (inclusive).
+;Our example uses the out-of-range positive value 3.
+;
+; RUN: not llc -march=mipsel < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+define i32 @main() nounwind {
+entry:
+
+;CHECK-ERRORS: error: invalid operand for inline asm constraint 'N'
+
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,N"(i32 7, i32 3) nounwind
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll
new file mode 100644
index 0000000..cd4431a
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll
@@ -0,0 +1,16 @@
+;
+;This is a negative test. The constraint (O) accepts a signed 15-bit
+;immediate (+- 16383).
+;Our example uses the out-of-range positive value 16384.
+;
+; RUN: not llc -march=mipsel < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+define i32 @main() nounwind {
+entry:
+
+;CHECK-ERRORS: error: invalid operand for inline asm constraint 'O'
+
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,O"(i32 undef, i32 16384) nounwind
+ ret i32 0
+}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll
new file mode 100644
index 0000000..0a4739e
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll
@@ -0,0 +1,16 @@
+;
+; This is a negative test. The constraint (P) accepts a constant in the
+; range of 1 to 65535 inclusive.
+; Our example uses the out-of-range value 655536.
+;
+; RUN: not llc -march=mipsel < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+define i32 @main() nounwind {
+entry:
+
+;CHECK-ERRORS: error: invalid operand for inline asm constraint 'P'
+
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,P"(i32 undef, i32 655536) nounwind
+ ret i32 0
+}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
new file mode 100644
index 0000000..94ded30
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
@@ -0,0 +1,44 @@
+; Positive test for inline register constraints
+;
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+define i32 @main() nounwind {
+entry:
+
+; r with char
+;CHECK: #APP
+;CHECK: addi ${{[0-9]+}},${{[0-9]+}},23
+;CHECK: #NO_APP
+ tail call i8 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i8 27, i8 23) nounwind
+
+; r with short
+;CHECK: #APP
+;CHECK: addi ${{[0-9]+}},${{[0-9]+}},13
+;CHECK: #NO_APP
+ tail call i16 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i16 17, i16 13) nounwind
+
+; r with int
+;CHECK: #APP
+;CHECK: addi ${{[0-9]+}},${{[0-9]+}},3
+;CHECK: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i32 7, i32 3) nounwind
+
+; Now c with 1024: make sure register $25 is picked
+; CHECK: #APP
+; CHECK: addi $25,${{[0-9]+}},1024
+; CHECK: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=c,c,I"(i32 4194304, i32 1024) nounwind
+
+; Now l with 1024: make sure register lo is picked. We do this by checking
+; that the instruction following the inline-asm block is a mflo, which pulls
+; the value out of lo.
+; CHECK: #APP
+; CHECK-NEXT: mtlo ${{[0-9]+}}
+; CHECK-NEXT: madd ${{[0-9]+}},${{[0-9]+}}
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: mflo ${{[0-9]+}}
+ %bosco = alloca i32, align 4
+ call i32 asm sideeffect "\09mtlo $3 \0A\09\09madd $1,$2 ", "=l,r,r,r"(i32 7, i32 6, i32 44) nounwind
+ store volatile i32 %4, i32* %bosco, align 4
+
+ ret i32 0
+}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
new file mode 100644
index 0000000..7870666
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
@@ -0,0 +1,20 @@
+;
+; Register constraint "r" shouldn't take long long unless
+; The target is 64 bit.
+;
+;
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
+
+
+define i32 @main() nounwind {
+entry:
+
+
+; r with long long
+;CHECK: #APP
+;CHECK: addi ${{[0-9]+}},${{[0-9]+}},3
+;CHECK: #NO_APP
+ tail call i64 asm sideeffect "addi $0,$1,$2", "=r,r,i"(i64 7, i64 3) nounwind
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/inlineasm-operand-code.ll b/test/CodeGen/Mips/inlineasm-operand-code.ll
new file mode 100644
index 0000000..0197899
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-operand-code.ll
@@ -0,0 +1,153 @@
+; Positive test for inline register constraints
+;
+; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=CHECK_LITTLE_32 %s
+; RUN: llc -march=mips < %s | FileCheck -check-prefix=CHECK_BIG_32 %s
+
+%union.u_tag = type { i64 }
+%struct.anon = type { i32, i32 }
+@uval = common global %union.u_tag zeroinitializer, align 8
+
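+; Summary of the operand print modifiers exercised below, for an i32 input
+; of -3: X prints full-width hex (0xfffffffffffffffd), x prints the low
+; 16 bits in hex (0xfffd), d prints signed decimal (-3), m prints -4, and
+; z prints -3 but substitutes the $0 register for a zero operand.
+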
+; X with -3
+define i32 @constraint_X() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_X:
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},0xfffffffffffffffd
+;CHECK_LITTLE_32: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,${2:X}", "=r,r,I"(i32 7, i32 -3) ;
+ ret i32 0
+}
+
+; x with -3
+define i32 @constraint_x() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_x:
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},0xfffd
+;CHECK_LITTLE_32: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,${2:x}", "=r,r,I"(i32 7, i32 -3) ;
+ ret i32 0
+}
+
+; d with -3
+define i32 @constraint_d() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_d:
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-3
+;CHECK_LITTLE_32: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,${2:d}", "=r,r,I"(i32 7, i32 -3) ;
+ ret i32 0
+}
+
+; m with -3 (here -3 prints as -4, i.e. the operand minus 1)
+define i32 @constraint_m() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_m:
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-4
+;CHECK_LITTLE_32: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,${2:m}", "=r,r,I"(i32 7, i32 -3) ;
+ ret i32 0
+}
+
+; z with -3
+define i32 @constraint_z() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_z:
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-3
+;CHECK_LITTLE_32: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,${2:z}", "=r,r,I"(i32 7, i32 -3) ;
+
+; z with 0 (the z modifier prints $0, the zero register, for a zero operand)
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},$0
+;CHECK_LITTLE_32: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,${2:z}", "=r,r,I"(i32 7, i32 0) nounwind
+ ret i32 0
+}
+
+; a long long in 32-bit mode (used to assert)
+define i32 @constraint_longlong() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_longlong:
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},3
+;CHECK_LITTLE_32: #NO_APP
+ tail call i64 asm sideeffect "addi $0,$1,$2 \0A\09", "=r,r,X"(i64 1229801703532086340, i64 3) nounwind
+ ret i32 0
+}
+
+; D: in little endian, the source register is the word 4 bytes into the long long
+define i32 @constraint_D() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_D:
+;CHECK_LITTLE_32: lw ${{[0-9]+}}, %got(uval)(${{[0-9,a-z]+}})
+;CHECK_LITTLE_32: lw $[[SECOND:[0-9]+]], 4(${{[0-9]+}})
+;CHECK_LITTLE_32: lw $[[FIRST:[0-9]+]], 0(${{[0-9]+}})
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: or ${{[0-9]+}},$[[SECOND]],${{[0-9]+}}
+;CHECK_LITTLE_32: #NO_APP
+
+; D: in big endian, the source register is also the word 4 bytes into the long long
+;CHECK_BIG_32: constraint_D:
+;CHECK_BIG_32: lw ${{[0-9]+}}, %got(uval)(${{[0-9,a-z]+}})
+;CHECK_BIG_32: lw $[[SECOND:[0-9]+]], 4(${{[0-9]+}})
+;CHECK_BIG_32: lw $[[FIRST:[0-9]+]], 0(${{[0-9]+}})
+;CHECK_BIG_32: #APP
+;CHECK_BIG_32: or ${{[0-9]+}},$[[SECOND]],${{[0-9]+}}
+;CHECK_BIG_32: #NO_APP
+ %bosco = load i64* getelementptr inbounds (%union.u_tag* @uval, i32 0, i32 0), align 8
+ %trunc1 = trunc i64 %bosco to i32
+ tail call i32 asm sideeffect "or $0,${1:D},$2", "=r,r,r"(i64 %bosco, i32 %trunc1) nounwind
+ ret i32 0
+}
+
+; L: in little endian, the source register is the word 0 bytes into the long long
+define i32 @constraint_L() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_L:
+;CHECK_LITTLE_32: lw ${{[0-9]+}}, %got(uval)(${{[0-9,a-z]+}})
+;CHECK_LITTLE_32: lw $[[SECOND:[0-9]+]], 4(${{[0-9]+}})
+;CHECK_LITTLE_32: lw $[[FIRST:[0-9]+]], 0(${{[0-9]+}})
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: or ${{[0-9]+}},$[[FIRST]],${{[0-9]+}}
+;CHECK_LITTLE_32: #NO_APP
+; L: in big endian, the source register is the word 4 bytes into the long long
+;CHECK_BIG_32: constraint_L:
+;CHECK_BIG_32: lw ${{[0-9]+}}, %got(uval)(${{[0-9,a-z]+}})
+;CHECK_BIG_32: lw $[[SECOND:[0-9]+]], 4(${{[0-9]+}})
+;CHECK_BIG_32: lw $[[FIRST:[0-9]+]], 0(${{[0-9]+}})
+;CHECK_BIG_32: #APP
+;CHECK_BIG_32: or ${{[0-9]+}},$[[SECOND]],${{[0-9]+}}
+;CHECK_BIG_32: #NO_APP
+ %bosco = load i64* getelementptr inbounds (%union.u_tag* @uval, i32 0, i32 0), align 8
+ %trunc1 = trunc i64 %bosco to i32
+ tail call i32 asm sideeffect "or $0,${1:L},$2", "=r,r,r"(i64 %bosco, i32 %trunc1) nounwind
+ ret i32 0
+}
+
+; M: in little endian, the source register is the word 4 bytes into the long long
+define i32 @constraint_M() nounwind {
+entry:
+;CHECK_LITTLE_32: constraint_M:
+;CHECK_LITTLE_32: lw ${{[0-9]+}}, %got(uval)(${{[0-9,a-z]+}})
+;CHECK_LITTLE_32: lw $[[SECOND:[0-9]+]], 4(${{[0-9]+}})
+;CHECK_LITTLE_32: lw $[[FIRST:[0-9]+]], 0(${{[0-9]+}})
+;CHECK_LITTLE_32: #APP
+;CHECK_LITTLE_32: or ${{[0-9]+}},$[[SECOND]],${{[0-9]+}}
+;CHECK_LITTLE_32: #NO_APP
+; M: in big endian, the source register is the word 0 bytes into the long long
+;CHECK_BIG_32: constraint_M:
+;CHECK_BIG_32: lw ${{[0-9]+}}, %got(uval)(${{[0-9,a-z]+}})
+;CHECK_BIG_32: lw $[[SECOND:[0-9]+]], 4(${{[0-9]+}})
+;CHECK_BIG_32: lw $[[FIRST:[0-9]+]], 0(${{[0-9]+}})
+;CHECK_BIG_32: #APP
+;CHECK_BIG_32: or ${{[0-9]+}},$[[FIRST]],${{[0-9]+}}
+;CHECK_BIG_32: #NO_APP
+ %bosco = load i64* getelementptr inbounds (%union.u_tag* @uval, i32 0, i32 0), align 8
+ %trunc1 = trunc i64 %bosco to i32
+ tail call i32 asm sideeffect "or $0,${1:M},$2", "=r,r,r"(i64 %bosco, i32 %trunc1) nounwind
+ ret i32 0
+}
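+
+; Worked sketch tying the three modifiers together (illustrative value): if
+; @uval held 0x0000000A0000000B, L would print the register holding the
+; least-significant word 0x0000000B, M the one holding 0x0000000A, and D the
+; register for the word at the higher memory address on either endianness.
+; A hypothetical combined use:
+;
+;   define i32 @modifier_sketch(i64 %v) nounwind {
+;   entry:
+;     %r = tail call i32 asm sideeffect "or $0,${1:L},${1:M}", "=r,r"(i64 %v) nounwind
+;     ret i32 %r
+;   }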
diff --git a/test/CodeGen/Mips/inlineasm_constraint.ll b/test/CodeGen/Mips/inlineasm_constraint.ll
new file mode 100644
index 0000000..5adec3b
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm_constraint.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+define i32 @main() nounwind {
+entry:
+
+; First I with short
+; CHECK: #APP
+; CHECK: addi ${{[0-9]+}},${{[0-9]+}},4096
+; CHECK: #NO_APP
+ tail call i16 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i16 7, i16 4096) nounwind
+
+; Then I with int
+; CHECK: #APP
+; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3
+; CHECK: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i32 7, i32 -3) nounwind
+
+; Now J with 0
+; CHECK: #APP
+; CHECK: addi ${{[0-9]+}},${{[0-9]+}},0
+; CHECK: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,$2\0A\09 ", "=r,r,J"(i32 7, i16 0) nounwind
+
+; Now K with 64
+; CHECK: #APP
+; CHECK: addu ${{[0-9]+}},${{[0-9]+}},64
+; CHECK: #NO_APP
+ tail call i16 asm sideeffect "addu $0,$1,$2\0A\09 ", "=r,r,K"(i16 7, i16 64) nounwind
+
+; Now L with 0x00100000
+; CHECK: #APP
+; CHECK: add ${{[0-9]+}},${{[0-9]+}},${{[0-9]+}}
+; CHECK: #NO_APP
+ tail call i32 asm sideeffect "add $0,$1,$3\0A\09", "=r,r,L,r"(i32 7, i32 1048576, i32 0) nounwind
+
+; Now N with -3
+; CHECK: #APP
+; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3
+; CHECK: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,N"(i32 7, i32 -3) nounwind
+
+; Now O with -3
+; CHECK: #APP
+; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3
+; CHECK: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,O"(i32 7, i16 -3) nounwind
+
+; Now P with 65535
+; CHECK: #APP
+; CHECK: addi ${{[0-9]+}},${{[0-9]+}},65535
+; CHECK: #NO_APP
+ tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,P"(i32 7, i32 65535) nounwind
+
+ ret i32 0
+}
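+
+; For reference, the immediate ranges these constraints accept (stated
+; informally, matching the usual MIPS constraint definitions):
+;   I: signed 16-bit     J: integer zero      K: unsigned 16-bit
+;   L: low 16 bits zero  N: -65535 to -1      O: signed 15-bit
+;   P: 1 to 65535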
diff --git a/test/CodeGen/Mips/inlineasmmemop.ll b/test/CodeGen/Mips/inlineasmmemop.ll
index 4b31a88..1c7c443 100644
--- a/test/CodeGen/Mips/inlineasmmemop.ll
+++ b/test/CodeGen/Mips/inlineasmmemop.ll
@@ -11,7 +11,7 @@ entry:
; CHECK: #APP
; CHECK: lw $[[T3:[0-9]+]], 0($[[T0]])
; CHECK: #NO_APP
-; CHECK: lw $[[T1:[0-9]+]], %got(g1)($gp)
+; CHECK: lw $[[T1:[0-9]+]], %got(g1)
; CHECK: sw $[[T3]], 0($[[T1]])
%l1 = alloca i32, align 4
diff --git a/test/CodeGen/Mips/internalfunc.ll b/test/CodeGen/Mips/internalfunc.ll
index 434b386..863375a 100644
--- a/test/CodeGen/Mips/internalfunc.ll
+++ b/test/CodeGen/Mips/internalfunc.ll
@@ -6,7 +6,7 @@
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
entry:
-; CHECK: lw $[[R0:[0-9]+]], %got(f2)($gp)
+; CHECK: lw $[[R0:[0-9]+]], %got(f2)
; CHECK: addiu $25, $[[R0]], %lo(f2)
tail call fastcc void @f2()
ret i32 0
@@ -14,7 +14,7 @@ entry:
define void @caller(i32 %a0, i32 %a1) nounwind {
entry:
-; CHECK: lw $[[R1:[0-9]+]], %got(caller.sf1)($gp)
+; CHECK: lw $[[R1:[0-9]+]], %got(caller.sf1)
; CHECK: lw $25, %lo(caller.sf1)($[[R1]])
%tobool = icmp eq i32 %a1, 0
br i1 %tobool, label %if.end, label %if.then
@@ -25,9 +25,9 @@ if.then: ; preds = %entry
br label %if.end
if.end: ; preds = %entry, %if.then
-; CHECK: lw $[[R2:[0-9]+]], %got(sf2)($gp)
+; CHECK: lw $[[R2:[0-9]+]], %got(sf2)
; CHECK: addiu ${{[0-9]+}}, $[[R2]], %lo(sf2)
-; CHECK: lw $[[R3:[0-9]+]], %got(caller.sf1)($gp)
+; CHECK: lw $[[R3:[0-9]+]], %got(caller.sf1)
; CHECK: sw ${{[0-9]+}}, %lo(caller.sf1)($[[R3]])
%tobool3 = icmp ne i32 %a0, 0
%tmp4 = load void (...)** @gf1, align 4
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index b7c9a9c..2e54879 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -6,10 +6,9 @@
define void @f() nounwind {
entry:
-; CHECK: lui $at, 65534
-; CHECK: addiu $at, $at, -24
+; CHECK: lui $at, 65535
+; CHECK: addiu $at, $at, -16
; CHECK: addu $sp, $sp, $at
-; CHECK: .cprestore 65536
%agg.tmp = alloca %struct.S1, align 1
%tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/Mips/lb1.ll b/test/CodeGen/Mips/lb1.ll
new file mode 100644
index 0000000..aac2767
--- /dev/null
+++ b/test/CodeGen/Mips/lb1.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@c = global i8 -1, align 1
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %i = alloca i32, align 4
+ %0 = load i8* @c, align 1
+; 16: lb ${{[0-9]+}}, 0(${{[0-9]+}})
+ %conv = sext i8 %0 to i32
+ store i32 %conv, i32* %i, align 4
+ %1 = load i32* %i, align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/lbu1.ll b/test/CodeGen/Mips/lbu1.ll
new file mode 100644
index 0000000..63e0cca
--- /dev/null
+++ b/test/CodeGen/Mips/lbu1.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@c = global i8 97, align 1
+@.str = private unnamed_addr constant [5 x i8] c"%c \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %i = alloca i32, align 4
+ %0 = load i8* @c, align 1
+ %conv = zext i8 %0 to i32
+; 16: lbu ${{[0-9]+}}, 0(${{[0-9]+}})
+ store i32 %conv, i32* %i, align 4
+ %1 = load i8* @c, align 1
+ %conv1 = zext i8 %1 to i32
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %conv1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/lh1.ll b/test/CodeGen/Mips/lh1.ll
new file mode 100644
index 0000000..1f95b09
--- /dev/null
+++ b/test/CodeGen/Mips/lh1.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@s = global i16 -1, align 2
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %i = alloca i32, align 4
+ %0 = load i16* @s, align 2
+ %conv = sext i16 %0 to i32
+; 16: lh ${{[0-9]+}}, 0(${{[0-9]+}})
+ store i32 %conv, i32* %i, align 4
+ %1 = load i32* %i, align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/lhu1.ll b/test/CodeGen/Mips/lhu1.ll
new file mode 100644
index 0000000..0cfcede
--- /dev/null
+++ b/test/CodeGen/Mips/lhu1.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+
+@s = global i16 255, align 2
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %i = alloca i32, align 4
+ %0 = load i16* @s, align 2
+ %conv = zext i16 %0 to i32
+; 16: lhu ${{[0-9]+}}, 0(${{[0-9]+}})
+ store i32 %conv, i32* %i, align 4
+ %1 = load i32* %i, align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll
new file mode 100644
index 0000000..d0928ee
--- /dev/null
+++ b/test/CodeGen/Mips/load-store-left-right.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=EL %s
+; RUN: llc -march=mips < %s | FileCheck -check-prefix=EB %s
+
+%struct.SI = type { i32 }
+
+@si = common global %struct.SI zeroinitializer, align 1
+
+define i32 @foo_load_i() nounwind readonly {
+entry:
+; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; EL: lwr $[[R0]], 0($[[R1]])
+; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; EB: lwr $[[R0]], 3($[[R1]])
+
+ %0 = load i32* getelementptr inbounds (%struct.SI* @si, i32 0, i32 0), align 1
+ ret i32 %0
+}
+
+define void @foo_store_i(i32 %a) nounwind {
+entry:
+; EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; EL: swr $[[R0]], 0($[[R1]])
+; EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; EB: swr $[[R0]], 3($[[R1]])
+
+ store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i32 0, i32 0), align 1
+ ret void
+}
+
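+; Why the offsets flip between endiannesses (a brief note): lwl/swl move the
+; high-order bytes of the register and lwr/swr the low-order bytes, and the
+; most-significant byte of the word at address A sits at A+3 on little endian
+; but at A on big endian; hence lwl 3/lwr 0 above versus lwl 0/lwr 3.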
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
new file mode 100644
index 0000000..0227b88
--- /dev/null
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=mipsel -force-mips-long-branch < %s | FileCheck %s -check-prefix=O32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch < %s | FileCheck %s -check-prefix=N64
+
+@g0 = external global i32
+
+define void @foo1(i32 %s) nounwind {
+entry:
+; O32: bal
+; N64: bal
+; N64: highest
+; N64: higher
+
+ %tobool = icmp eq i32 %s, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %0 = load i32* @g0, align 4
+ %add = add nsw i32 %0, 12
+ store i32 %add, i32* @g0, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
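+
+; Note: both expansions use bal to capture the PC; the N64 one additionally
+; materializes a 64-bit offset, hence the %highest/%higher relocations
+; checked above.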
+
diff --git a/test/CodeGen/Mips/machineverifier.ll b/test/CodeGen/Mips/machineverifier.ll
new file mode 100644
index 0000000..c673fe5
--- /dev/null
+++ b/test/CodeGen/Mips/machineverifier.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=mipsel -verify-machineinstrs
+; Make sure the machine verifier accepts that, once the delay slot filler pass
+; has run, the last instruction of a basic block need not be the terminator.
+
+@g = external global i32
+
+define void @foo() nounwind {
+entry:
+ %0 = load i32* @g, align 4
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %add = add nsw i32 %0, 10
+ store i32 %add, i32* @g, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/memcpy.ll b/test/CodeGen/Mips/memcpy.ll
new file mode 100644
index 0000000..39764a9
--- /dev/null
+++ b/test/CodeGen/Mips/memcpy.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+%struct.S1 = type { i32, [41 x i8] }
+
+@.str = private unnamed_addr constant [31 x i8] c"abcdefghijklmnopqrstuvwxyzABCD\00", align 1
+
+define void @foo1(%struct.S1* %s1, i8 signext %n) nounwind {
+entry:
+; CHECK-NOT: call16(memcpy
+
+ %arraydecay = getelementptr inbounds %struct.S1* %s1, i32 0, i32 1, i32 0
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %arraydecay, i8* getelementptr inbounds ([31 x i8]* @.str, i32 0, i32 0), i32 31, i32 1, i1 false)
+ %arrayidx = getelementptr inbounds %struct.S1* %s1, i32 0, i32 1, i32 40
+ store i8 %n, i8* %arrayidx, align 1
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
diff --git a/test/CodeGen/Mips/mips64-fp-indexed-ls.ll b/test/CodeGen/Mips/mips64-fp-indexed-ls.ll
index 09745fb..bbdc05c 100644
--- a/test/CodeGen/Mips/mips64-fp-indexed-ls.ll
+++ b/test/CodeGen/Mips/mips64-fp-indexed-ls.ll
@@ -30,7 +30,7 @@ entry:
define float @foo2(i32 %b, i32 %c) nounwind readonly {
entry:
-; CHECK: luxc1
+; CHECK-NOT: luxc1
%idxprom = zext i32 %c to i64
%idxprom1 = zext i32 %b to i64
%arrayidx2 = getelementptr inbounds [4 x %struct.S]* @s, i64 0, i64 %idxprom1, i32 0, i64 %idxprom
@@ -60,7 +60,7 @@ entry:
define void @foo5(i32 %b, i32 %c) nounwind {
entry:
-; CHECK: suxc1
+; CHECK-NOT: suxc1
%0 = load float* @gf, align 4
%idxprom = zext i32 %c to i64
%idxprom1 = zext i32 %b to i64
@@ -72,7 +72,7 @@ entry:
define double @foo6(i32 %b, i32 %c) nounwind readonly {
entry:
; CHECK: foo6
-; CHECK-NOT: ldxc1
+; CHECK-NOT: luxc1
%idxprom = zext i32 %c to i64
%idxprom1 = zext i32 %b to i64
%arrayidx2 = getelementptr inbounds [4 x %struct.S2]* @s2, i64 0, i64 %idxprom1, i32 0, i64 %idxprom
@@ -83,7 +83,7 @@ entry:
define void @foo7(i32 %b, i32 %c) nounwind {
entry:
; CHECK: foo7
-; CHECK-NOT: sdxc1
+; CHECK-NOT: suxc1
%0 = load double* @gd, align 8
%idxprom = zext i32 %c to i64
%idxprom1 = zext i32 %b to i64
@@ -95,7 +95,7 @@ entry:
define float @foo8() nounwind readonly {
entry:
; CHECK: foo8
-; CHECK: luxc1
+; CHECK-NOT: luxc1
%0 = load float* getelementptr inbounds (%struct.S3* @s3, i64 0, i32 1), align 1
ret float %0
}
@@ -103,7 +103,7 @@ entry:
define void @foo9(float %f) nounwind {
entry:
; CHECK: foo9
-; CHECK: suxc1
+; CHECK-NOT: suxc1
store float %f, float* getelementptr inbounds (%struct.S3* @s3, i64 0, i32 1), align 1
ret void
}
diff --git a/test/CodeGen/Mips/mips64load-store-left-right.ll b/test/CodeGen/Mips/mips64load-store-left-right.ll
new file mode 100644
index 0000000..4561429
--- /dev/null
+++ b/test/CodeGen/Mips/mips64load-store-left-right.ll
@@ -0,0 +1,73 @@
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=EL %s
+; RUN: llc -march=mips64 -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=EB %s
+
+%struct.SLL = type { i64 }
+%struct.SI = type { i32 }
+%struct.SUI = type { i32 }
+
+@sll = common global %struct.SLL zeroinitializer, align 1
+@si = common global %struct.SI zeroinitializer, align 1
+@sui = common global %struct.SUI zeroinitializer, align 1
+
+define i64 @foo_load_ll() nounwind readonly {
+entry:
+; EL: ldl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
+; EL: ldr $[[R0]], 0($[[R1]])
+; EB: ldl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; EB: ldr $[[R0]], 7($[[R1]])
+
+ %0 = load i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
+ ret i64 %0
+}
+
+define i64 @foo_load_i() nounwind readonly {
+entry:
+; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; EL: lwr $[[R0]], 0($[[R1]])
+; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; EB: lwr $[[R0]], 3($[[R1]])
+
+ %0 = load i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
+ %conv = sext i32 %0 to i64
+ ret i64 %conv
+}
+
+define i64 @foo_load_ui() nounwind readonly {
+entry:
+; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; EL: lwr $[[R0]], 0($[[R1]])
+; EL: daddiu $[[R2:[0-9]+]], $zero, 1
+; EL: dsll $[[R3:[0-9]+]], $[[R2]], 32
+; EL: daddiu $[[R4:[0-9]+]], $[[R3]], -1
+; EL: and ${{[0-9]+}}, $[[R0]], $[[R4]]
+; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; EB: lwr $[[R0]], 3($[[R1]])
+
+
+ %0 = load i32* getelementptr inbounds (%struct.SUI* @sui, i64 0, i32 0), align 1
+ %conv = zext i32 %0 to i64
+ ret i64 %conv
+}
+
+define void @foo_store_ll(i64 %a) nounwind {
+entry:
+; EL: sdl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
+; EL: sdr $[[R0]], 0($[[R1]])
+; EB: sdl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; EB: sdr $[[R0]], 7($[[R1]])
+
+ store i64 %a, i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
+ ret void
+}
+
+define void @foo_store_i(i32 %a) nounwind {
+entry:
+; EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; EL: swr $[[R0]], 0($[[R1]])
+; EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; EB: swr $[[R0]], 3($[[R1]])
+
+ store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/neg1.ll b/test/CodeGen/Mips/neg1.ll
new file mode 100644
index 0000000..281e626
--- /dev/null
+++ b/test/CodeGen/Mips/neg1.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 10, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %sub = sub nsw i32 0, %0
+; 16: neg ${{[0-9]+}}, ${{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %sub)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/not1.ll b/test/CodeGen/Mips/not1.ll
new file mode 100644
index 0000000..2163b23
--- /dev/null
+++ b/test/CodeGen/Mips/not1.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @x, align 4
+ %neg = xor i32 %0, -1
+; 16: not ${{[0-9]+}}, ${{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0), i32 %neg)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/null.ll b/test/CodeGen/Mips/null.ll
new file mode 100644
index 0000000..7beae99
--- /dev/null
+++ b/test/CodeGen/Mips/null.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=mipsel -mcpu=mips16 < %s | FileCheck %s -check-prefix=16
+
+
+define i32 @main() nounwind {
+entry:
+ ret i32 0
+
+; 16: .set mips16 # @main
+
+
+; 16: jr $ra
+
+}
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index c5cbc7a..eac0d80 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -10,19 +10,19 @@
define void @f1() nounwind {
entry:
-; CHECK: lw $[[R1:[0-9]+]], %got(f1.s1)($gp)
+; CHECK: lw $[[R1:[0-9]+]], %got(f1.s1)
; CHECK: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1)
; CHECK: lw $[[R6:[0-9]+]], 28($[[R0]])
-; CHECK: lw $[[R5:[0-9]+]], 24($[[R0]])
-; CHECK: lw $[[R4:[0-9]+]], 20($[[R0]])
-; CHECK: lw $[[R3:[0-9]+]], 16($[[R0]])
-; CHECK: lw $[[R7:[0-9]+]], 12($[[R0]])
-; CHECK: lw $[[R2:[0-9]+]], 8($[[R0]])
; CHECK: sw $[[R6]], 36($sp)
+; CHECK: lw $[[R5:[0-9]+]], 24($[[R0]])
; CHECK: sw $[[R5]], 32($sp)
+; CHECK: lw $[[R4:[0-9]+]], 20($[[R0]])
; CHECK: sw $[[R4]], 28($sp)
+; CHECK: lw $[[R3:[0-9]+]], 16($[[R0]])
; CHECK: sw $[[R3]], 24($sp)
+; CHECK: lw $[[R7:[0-9]+]], 12($[[R0]])
; CHECK: sw $[[R7]], 20($sp)
+; CHECK: lw $[[R2:[0-9]+]], 8($[[R0]])
; CHECK: sw $[[R2]], 16($sp)
; CHECK: lw $7, 4($[[R0]])
; CHECK: lw $6, %lo(f1.s1)($[[R1]])
@@ -43,16 +43,16 @@ declare void @callee3(float, %struct.S3* byval, %struct.S1* byval)
define void @f2(float %f, %struct.S1* nocapture byval %s1) nounwind {
entry:
-; CHECK: addiu $sp, $sp, -56
-; CHECK: sw $7, 68($sp)
-; CHECK: sw $6, 64($sp)
-; CHECK: lw $4, 88($sp)
-; CHECK: ldc1 $f[[F0:[0-9]+]], 80($sp)
-; CHECK: lw $[[R3:[0-9]+]], 72($sp)
-; CHECK: lw $[[R4:[0-9]+]], 76($sp)
-; CHECK: lw $[[R2:[0-9]+]], 68($sp)
-; CHECK: lh $[[R1:[0-9]+]], 66($sp)
-; CHECK: lb $[[R0:[0-9]+]], 64($sp)
+; CHECK: addiu $sp, $sp, -48
+; CHECK: sw $7, 60($sp)
+; CHECK: sw $6, 56($sp)
+; CHECK: lw $4, 80($sp)
+; CHECK: ldc1 $f[[F0:[0-9]+]], 72($sp)
+; CHECK: lw $[[R3:[0-9]+]], 64($sp)
+; CHECK: lw $[[R4:[0-9]+]], 68($sp)
+; CHECK: lw $[[R2:[0-9]+]], 60($sp)
+; CHECK: lh $[[R1:[0-9]+]], 58($sp)
+; CHECK: lb $[[R0:[0-9]+]], 56($sp)
; CHECK: sw $[[R0]], 32($sp)
; CHECK: sw $[[R1]], 28($sp)
; CHECK: sw $[[R2]], 24($sp)
@@ -80,13 +80,13 @@ declare void @callee4(i32, double, i64, i32, i16 signext, i8 signext, float)
define void @f3(%struct.S2* nocapture byval %s2) nounwind {
entry:
-; CHECK: addiu $sp, $sp, -56
-; CHECK: sw $7, 68($sp)
-; CHECK: sw $6, 64($sp)
-; CHECK: sw $5, 60($sp)
-; CHECK: sw $4, 56($sp)
-; CHECK: lw $4, 56($sp)
-; CHECK: lw $[[R0:[0-9]+]], 68($sp)
+; CHECK: addiu $sp, $sp, -48
+; CHECK: sw $7, 60($sp)
+; CHECK: sw $6, 56($sp)
+; CHECK: sw $5, 52($sp)
+; CHECK: sw $4, 48($sp)
+; CHECK: lw $4, 48($sp)
+; CHECK: lw $[[R0:[0-9]+]], 60($sp)
; CHECK: sw $[[R0]], 24($sp)
%arrayidx = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 0
@@ -99,13 +99,13 @@ entry:
define void @f4(float %f, %struct.S3* nocapture byval %s3, %struct.S1* nocapture byval %s1) nounwind {
entry:
-; CHECK: addiu $sp, $sp, -56
-; CHECK: sw $7, 68($sp)
-; CHECK: sw $6, 64($sp)
-; CHECK: sw $5, 60($sp)
-; CHECK: lw $4, 68($sp)
-; CHECK: lw $[[R1:[0-9]+]], 88($sp)
-; CHECK: lb $[[R0:[0-9]+]], 60($sp)
+; CHECK: addiu $sp, $sp, -48
+; CHECK: sw $7, 60($sp)
+; CHECK: sw $6, 56($sp)
+; CHECK: sw $5, 52($sp)
+; CHECK: lw $4, 60($sp)
+; CHECK: lw $[[R1:[0-9]+]], 80($sp)
+; CHECK: lb $[[R0:[0-9]+]], 52($sp)
; CHECK: sw $[[R0]], 32($sp)
; CHECK: sw $[[R1]], 24($sp)
diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll
index 4a3d9ab..35332b6 100644
--- a/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -1,6 +1,5 @@
; RUN: llc -march=mipsel -pre-RA-sched=source < %s | FileCheck %s
-
; All test functions do the same thing - they return the first variable
; argument.
diff --git a/test/CodeGen/Mips/or1.ll b/test/CodeGen/Mips/or1.ll
new file mode 100644
index 0000000..b1c3696
--- /dev/null
+++ b/test/CodeGen/Mips/or1.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @x, align 4
+ %1 = load i32* @y, align 4
+ %or = or i32 %0, %1
+; 16: or ${{[0-9]+}}, ${{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0), i32 %or)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/ra-allocatable.ll b/test/CodeGen/Mips/ra-allocatable.ll
new file mode 100644
index 0000000..7621788
--- /dev/null
+++ b/test/CodeGen/Mips/ra-allocatable.ll
@@ -0,0 +1,288 @@
+; RUN: llc < %s -march=mipsel | FileCheck %s
+
+@a0 = external global i32
+@b0 = external global i32*
+@a1 = external global i32
+@b1 = external global i32*
+@a2 = external global i32
+@b2 = external global i32*
+@a3 = external global i32
+@b3 = external global i32*
+@a4 = external global i32
+@b4 = external global i32*
+@a5 = external global i32
+@b5 = external global i32*
+@a6 = external global i32
+@b6 = external global i32*
+@a7 = external global i32
+@b7 = external global i32*
+@a8 = external global i32
+@b8 = external global i32*
+@a9 = external global i32
+@b9 = external global i32*
+@a10 = external global i32
+@b10 = external global i32*
+@a11 = external global i32
+@b11 = external global i32*
+@a12 = external global i32
+@b12 = external global i32*
+@a13 = external global i32
+@b13 = external global i32*
+@a14 = external global i32
+@b14 = external global i32*
+@a15 = external global i32
+@b15 = external global i32*
+@a16 = external global i32
+@b16 = external global i32*
+@a17 = external global i32
+@b17 = external global i32*
+@a18 = external global i32
+@b18 = external global i32*
+@a19 = external global i32
+@b19 = external global i32*
+@a20 = external global i32
+@b20 = external global i32*
+@a21 = external global i32
+@b21 = external global i32*
+@a22 = external global i32
+@b22 = external global i32*
+@a23 = external global i32
+@b23 = external global i32*
+@a24 = external global i32
+@b24 = external global i32*
+@a25 = external global i32
+@b25 = external global i32*
+@a26 = external global i32
+@b26 = external global i32*
+@a27 = external global i32
+@b27 = external global i32*
+@a28 = external global i32
+@b28 = external global i32*
+@a29 = external global i32
+@b29 = external global i32*
+@c0 = external global i32*
+@c1 = external global i32*
+@c2 = external global i32*
+@c3 = external global i32*
+@c4 = external global i32*
+@c5 = external global i32*
+@c6 = external global i32*
+@c7 = external global i32*
+@c8 = external global i32*
+@c9 = external global i32*
+@c10 = external global i32*
+@c11 = external global i32*
+@c12 = external global i32*
+@c13 = external global i32*
+@c14 = external global i32*
+@c15 = external global i32*
+@c16 = external global i32*
+@c17 = external global i32*
+@c18 = external global i32*
+@c19 = external global i32*
+@c20 = external global i32*
+@c21 = external global i32*
+@c22 = external global i32*
+@c23 = external global i32*
+@c24 = external global i32*
+@c25 = external global i32*
+@c26 = external global i32*
+@c27 = external global i32*
+@c28 = external global i32*
+@c29 = external global i32*
+
+define i32 @f1() nounwind {
+entry:
+; CHECK: sw $ra, {{[0-9]+}}($sp) # 4-byte Folded Spill
+; CHECK: $ra
+; CHECK: lw $ra, {{[0-9]+}}($sp) # 4-byte Folded Reload
+; CHECK: jr $ra
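+;
+; The long sequence of loads and stores below creates enough register pressure
+; that $ra is allocated like any other GPR, so its incoming value has to be
+; spilled and reloaded around the body, which is what the checks verify.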
+
+ %0 = load i32* @a0, align 4, !tbaa !0
+ %1 = load i32** @b0, align 4, !tbaa !3
+ store i32 %0, i32* %1, align 4, !tbaa !0
+ %2 = load i32* @a1, align 4, !tbaa !0
+ %3 = load i32** @b1, align 4, !tbaa !3
+ store i32 %2, i32* %3, align 4, !tbaa !0
+ %4 = load i32* @a2, align 4, !tbaa !0
+ %5 = load i32** @b2, align 4, !tbaa !3
+ store i32 %4, i32* %5, align 4, !tbaa !0
+ %6 = load i32* @a3, align 4, !tbaa !0
+ %7 = load i32** @b3, align 4, !tbaa !3
+ store i32 %6, i32* %7, align 4, !tbaa !0
+ %8 = load i32* @a4, align 4, !tbaa !0
+ %9 = load i32** @b4, align 4, !tbaa !3
+ store i32 %8, i32* %9, align 4, !tbaa !0
+ %10 = load i32* @a5, align 4, !tbaa !0
+ %11 = load i32** @b5, align 4, !tbaa !3
+ store i32 %10, i32* %11, align 4, !tbaa !0
+ %12 = load i32* @a6, align 4, !tbaa !0
+ %13 = load i32** @b6, align 4, !tbaa !3
+ store i32 %12, i32* %13, align 4, !tbaa !0
+ %14 = load i32* @a7, align 4, !tbaa !0
+ %15 = load i32** @b7, align 4, !tbaa !3
+ store i32 %14, i32* %15, align 4, !tbaa !0
+ %16 = load i32* @a8, align 4, !tbaa !0
+ %17 = load i32** @b8, align 4, !tbaa !3
+ store i32 %16, i32* %17, align 4, !tbaa !0
+ %18 = load i32* @a9, align 4, !tbaa !0
+ %19 = load i32** @b9, align 4, !tbaa !3
+ store i32 %18, i32* %19, align 4, !tbaa !0
+ %20 = load i32* @a10, align 4, !tbaa !0
+ %21 = load i32** @b10, align 4, !tbaa !3
+ store i32 %20, i32* %21, align 4, !tbaa !0
+ %22 = load i32* @a11, align 4, !tbaa !0
+ %23 = load i32** @b11, align 4, !tbaa !3
+ store i32 %22, i32* %23, align 4, !tbaa !0
+ %24 = load i32* @a12, align 4, !tbaa !0
+ %25 = load i32** @b12, align 4, !tbaa !3
+ store i32 %24, i32* %25, align 4, !tbaa !0
+ %26 = load i32* @a13, align 4, !tbaa !0
+ %27 = load i32** @b13, align 4, !tbaa !3
+ store i32 %26, i32* %27, align 4, !tbaa !0
+ %28 = load i32* @a14, align 4, !tbaa !0
+ %29 = load i32** @b14, align 4, !tbaa !3
+ store i32 %28, i32* %29, align 4, !tbaa !0
+ %30 = load i32* @a15, align 4, !tbaa !0
+ %31 = load i32** @b15, align 4, !tbaa !3
+ store i32 %30, i32* %31, align 4, !tbaa !0
+ %32 = load i32* @a16, align 4, !tbaa !0
+ %33 = load i32** @b16, align 4, !tbaa !3
+ store i32 %32, i32* %33, align 4, !tbaa !0
+ %34 = load i32* @a17, align 4, !tbaa !0
+ %35 = load i32** @b17, align 4, !tbaa !3
+ store i32 %34, i32* %35, align 4, !tbaa !0
+ %36 = load i32* @a18, align 4, !tbaa !0
+ %37 = load i32** @b18, align 4, !tbaa !3
+ store i32 %36, i32* %37, align 4, !tbaa !0
+ %38 = load i32* @a19, align 4, !tbaa !0
+ %39 = load i32** @b19, align 4, !tbaa !3
+ store i32 %38, i32* %39, align 4, !tbaa !0
+ %40 = load i32* @a20, align 4, !tbaa !0
+ %41 = load i32** @b20, align 4, !tbaa !3
+ store i32 %40, i32* %41, align 4, !tbaa !0
+ %42 = load i32* @a21, align 4, !tbaa !0
+ %43 = load i32** @b21, align 4, !tbaa !3
+ store i32 %42, i32* %43, align 4, !tbaa !0
+ %44 = load i32* @a22, align 4, !tbaa !0
+ %45 = load i32** @b22, align 4, !tbaa !3
+ store i32 %44, i32* %45, align 4, !tbaa !0
+ %46 = load i32* @a23, align 4, !tbaa !0
+ %47 = load i32** @b23, align 4, !tbaa !3
+ store i32 %46, i32* %47, align 4, !tbaa !0
+ %48 = load i32* @a24, align 4, !tbaa !0
+ %49 = load i32** @b24, align 4, !tbaa !3
+ store i32 %48, i32* %49, align 4, !tbaa !0
+ %50 = load i32* @a25, align 4, !tbaa !0
+ %51 = load i32** @b25, align 4, !tbaa !3
+ store i32 %50, i32* %51, align 4, !tbaa !0
+ %52 = load i32* @a26, align 4, !tbaa !0
+ %53 = load i32** @b26, align 4, !tbaa !3
+ store i32 %52, i32* %53, align 4, !tbaa !0
+ %54 = load i32* @a27, align 4, !tbaa !0
+ %55 = load i32** @b27, align 4, !tbaa !3
+ store i32 %54, i32* %55, align 4, !tbaa !0
+ %56 = load i32* @a28, align 4, !tbaa !0
+ %57 = load i32** @b28, align 4, !tbaa !3
+ store i32 %56, i32* %57, align 4, !tbaa !0
+ %58 = load i32* @a29, align 4, !tbaa !0
+ %59 = load i32** @b29, align 4, !tbaa !3
+ store i32 %58, i32* %59, align 4, !tbaa !0
+ %60 = load i32* @a0, align 4, !tbaa !0
+ %61 = load i32** @c0, align 4, !tbaa !3
+ store i32 %60, i32* %61, align 4, !tbaa !0
+ %62 = load i32* @a1, align 4, !tbaa !0
+ %63 = load i32** @c1, align 4, !tbaa !3
+ store i32 %62, i32* %63, align 4, !tbaa !0
+ %64 = load i32* @a2, align 4, !tbaa !0
+ %65 = load i32** @c2, align 4, !tbaa !3
+ store i32 %64, i32* %65, align 4, !tbaa !0
+ %66 = load i32* @a3, align 4, !tbaa !0
+ %67 = load i32** @c3, align 4, !tbaa !3
+ store i32 %66, i32* %67, align 4, !tbaa !0
+ %68 = load i32* @a4, align 4, !tbaa !0
+ %69 = load i32** @c4, align 4, !tbaa !3
+ store i32 %68, i32* %69, align 4, !tbaa !0
+ %70 = load i32* @a5, align 4, !tbaa !0
+ %71 = load i32** @c5, align 4, !tbaa !3
+ store i32 %70, i32* %71, align 4, !tbaa !0
+ %72 = load i32* @a6, align 4, !tbaa !0
+ %73 = load i32** @c6, align 4, !tbaa !3
+ store i32 %72, i32* %73, align 4, !tbaa !0
+ %74 = load i32* @a7, align 4, !tbaa !0
+ %75 = load i32** @c7, align 4, !tbaa !3
+ store i32 %74, i32* %75, align 4, !tbaa !0
+ %76 = load i32* @a8, align 4, !tbaa !0
+ %77 = load i32** @c8, align 4, !tbaa !3
+ store i32 %76, i32* %77, align 4, !tbaa !0
+ %78 = load i32* @a9, align 4, !tbaa !0
+ %79 = load i32** @c9, align 4, !tbaa !3
+ store i32 %78, i32* %79, align 4, !tbaa !0
+ %80 = load i32* @a10, align 4, !tbaa !0
+ %81 = load i32** @c10, align 4, !tbaa !3
+ store i32 %80, i32* %81, align 4, !tbaa !0
+ %82 = load i32* @a11, align 4, !tbaa !0
+ %83 = load i32** @c11, align 4, !tbaa !3
+ store i32 %82, i32* %83, align 4, !tbaa !0
+ %84 = load i32* @a12, align 4, !tbaa !0
+ %85 = load i32** @c12, align 4, !tbaa !3
+ store i32 %84, i32* %85, align 4, !tbaa !0
+ %86 = load i32* @a13, align 4, !tbaa !0
+ %87 = load i32** @c13, align 4, !tbaa !3
+ store i32 %86, i32* %87, align 4, !tbaa !0
+ %88 = load i32* @a14, align 4, !tbaa !0
+ %89 = load i32** @c14, align 4, !tbaa !3
+ store i32 %88, i32* %89, align 4, !tbaa !0
+ %90 = load i32* @a15, align 4, !tbaa !0
+ %91 = load i32** @c15, align 4, !tbaa !3
+ store i32 %90, i32* %91, align 4, !tbaa !0
+ %92 = load i32* @a16, align 4, !tbaa !0
+ %93 = load i32** @c16, align 4, !tbaa !3
+ store i32 %92, i32* %93, align 4, !tbaa !0
+ %94 = load i32* @a17, align 4, !tbaa !0
+ %95 = load i32** @c17, align 4, !tbaa !3
+ store i32 %94, i32* %95, align 4, !tbaa !0
+ %96 = load i32* @a18, align 4, !tbaa !0
+ %97 = load i32** @c18, align 4, !tbaa !3
+ store i32 %96, i32* %97, align 4, !tbaa !0
+ %98 = load i32* @a19, align 4, !tbaa !0
+ %99 = load i32** @c19, align 4, !tbaa !3
+ store i32 %98, i32* %99, align 4, !tbaa !0
+ %100 = load i32* @a20, align 4, !tbaa !0
+ %101 = load i32** @c20, align 4, !tbaa !3
+ store i32 %100, i32* %101, align 4, !tbaa !0
+ %102 = load i32* @a21, align 4, !tbaa !0
+ %103 = load i32** @c21, align 4, !tbaa !3
+ store i32 %102, i32* %103, align 4, !tbaa !0
+ %104 = load i32* @a22, align 4, !tbaa !0
+ %105 = load i32** @c22, align 4, !tbaa !3
+ store i32 %104, i32* %105, align 4, !tbaa !0
+ %106 = load i32* @a23, align 4, !tbaa !0
+ %107 = load i32** @c23, align 4, !tbaa !3
+ store i32 %106, i32* %107, align 4, !tbaa !0
+ %108 = load i32* @a24, align 4, !tbaa !0
+ %109 = load i32** @c24, align 4, !tbaa !3
+ store i32 %108, i32* %109, align 4, !tbaa !0
+ %110 = load i32* @a25, align 4, !tbaa !0
+ %111 = load i32** @c25, align 4, !tbaa !3
+ store i32 %110, i32* %111, align 4, !tbaa !0
+ %112 = load i32* @a26, align 4, !tbaa !0
+ %113 = load i32** @c26, align 4, !tbaa !3
+ store i32 %112, i32* %113, align 4, !tbaa !0
+ %114 = load i32* @a27, align 4, !tbaa !0
+ %115 = load i32** @c27, align 4, !tbaa !3
+ store i32 %114, i32* %115, align 4, !tbaa !0
+ %116 = load i32* @a28, align 4, !tbaa !0
+ %117 = load i32** @c28, align 4, !tbaa !3
+ store i32 %116, i32* %117, align 4, !tbaa !0
+ %118 = load i32* @a29, align 4, !tbaa !0
+ %119 = load i32** @c29, align 4, !tbaa !3
+ store i32 %118, i32* %119, align 4, !tbaa !0
+ %120 = load i32* @a0, align 4, !tbaa !0
+ ret i32 %120
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/Mips/rdhwr-directives.ll b/test/CodeGen/Mips/rdhwr-directives.ll
new file mode 100644
index 0000000..27010d4
--- /dev/null
+++ b/test/CodeGen/Mips/rdhwr-directives.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=static < %s | FileCheck %s
+
+@a = external thread_local global i32
+
+define i32 @foo() nounwind readonly {
+entry:
+; CHECK: .set push
+; CHECK: .set mips32r2
+; CHECK: rdhwr
+; CHECK: .set pop
+
+ %0 = load i32* @a, align 4
+ ret i32 %0
+}
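+
+; rdhwr $3, $29 reads the user-local (TLS pointer) hardware register; the
+; encoding is mips32r2, while this module targets plain mips32, so the
+; .set push/.set mips32r2/.set pop directives raise the ISA level just for it.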
+
diff --git a/test/CodeGen/Mips/return_address.ll b/test/CodeGen/Mips/return_address.ll
new file mode 100644
index 0000000..e1c9241
--- /dev/null
+++ b/test/CodeGen/Mips/return_address.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+define i8* @f1() nounwind {
+entry:
+ %0 = call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+
+; CHECK: addu $2, $zero, $ra
+}
+
+define i8* @f2() nounwind {
+entry:
+ call void @g()
+ %0 = call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+
+; CHECK: addu $[[R0:[0-9]+]], $zero, $ra
+; CHECK: jal
+; CHECK: addu $2, $zero, $[[R0]]
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
+declare void @g()
diff --git a/test/CodeGen/Mips/sb1.ll b/test/CodeGen/Mips/sb1.ll
new file mode 100644
index 0000000..e1a28d4
--- /dev/null
+++ b/test/CodeGen/Mips/sb1.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 97, align 4
+@c = common global i8 0, align 1
+@.str = private unnamed_addr constant [8 x i8] c"%i %c \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %conv = trunc i32 %0 to i8
+ store i8 %conv, i8* @c, align 1
+ %1 = load i32* @i, align 4
+ %2 = load i8* @c, align 1
+ %conv1 = sext i8 %2 to i32
+; 16: sb ${{[0-9]+}}, 0(${{[0-9]+}})
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %1, i32 %conv1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/selectcc.ll b/test/CodeGen/Mips/selectcc.ll
new file mode 100644
index 0000000..a17517e
--- /dev/null
+++ b/test/CodeGen/Mips/selectcc.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mipsel < %s
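+; Compile-only test (no FileCheck): verifies that select_cc lowering for f32
+; and f64 comparisons completes without crashing.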
+
+@gf0 = external global float
+@gf1 = external global float
+@gd0 = external global double
+@gd1 = external global double
+
+define float @select_cc_f32(float %a, float %b) nounwind {
+entry:
+ store float 0.000000e+00, float* @gf0, align 4
+ store float 1.000000e+00, float* @gf1, align 4
+ %cmp = fcmp olt float %a, %b
+ %conv = zext i1 %cmp to i32
+ %conv1 = sitofp i32 %conv to float
+ ret float %conv1
+}
+
+define double @select_cc_f64(double %a, double %b) nounwind {
+entry:
+ store double 0.000000e+00, double* @gd0, align 8
+ store double 1.000000e+00, double* @gd1, align 8
+ %cmp = fcmp olt double %a, %b
+ %conv = zext i1 %cmp to i32
+ %conv1 = sitofp i32 %conv to double
+ ret double %conv1
+}
+
diff --git a/test/CodeGen/Mips/sh1.ll b/test/CodeGen/Mips/sh1.ll
new file mode 100644
index 0000000..1746ae2
--- /dev/null
+++ b/test/CodeGen/Mips/sh1.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 97, align 4
+@s = common global i16 0, align 2
+@.str = private unnamed_addr constant [9 x i8] c"%i %hi \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %conv = trunc i32 %0 to i16
+ store i16 %conv, i16* @s, align 2
+ %1 = load i32* @i, align 4
+ %2 = load i16* @s, align 2
+ %conv1 = sext i16 %2 to i32
+; 16: sh ${{[0-9]+}}, 0(${{[0-9]+}})
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %1, i32 %conv1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/shift-parts.ll b/test/CodeGen/Mips/shift-parts.ll
new file mode 100644
index 0000000..38cbf28
--- /dev/null
+++ b/test/CodeGen/Mips/shift-parts.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+define i64 @shl0(i64 %a, i32 %b) nounwind readnone {
+entry:
+; CHECK: shl0
+; CHECK-NOT: lw $25, %call16(__
+ %sh_prom = zext i32 %b to i64
+ %shl = shl i64 %a, %sh_prom
+ ret i64 %shl
+}
+
+define i64 @shr1(i64 %a, i32 %b) nounwind readnone {
+entry:
+; CHECK: shr1
+; CHECK-NOT: lw $25, %call16(__
+ %sh_prom = zext i32 %b to i64
+ %shr = lshr i64 %a, %sh_prom
+ ret i64 %shr
+}
+
+define i64 @sra2(i64 %a, i32 %b) nounwind readnone {
+entry:
+; CHECK: sra2
+; CHECK-NOT: lw $25, %call16(__
+ %sh_prom = zext i32 %b to i64
+ %shr = ashr i64 %a, %sh_prom
+ ret i64 %shr
+}
+
diff --git a/test/CodeGen/Mips/sitofp-selectcc-opt.ll b/test/CodeGen/Mips/sitofp-selectcc-opt.ll
new file mode 100644
index 0000000..576cbd8
--- /dev/null
+++ b/test/CodeGen/Mips/sitofp-selectcc-opt.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+@foo12.d4 = internal unnamed_addr global double 0.000000e+00, align 8
+
+define double @foo12(i32 %a, i32, i64 %b) nounwind {
+entry:
+; Check that this transformation doesn't happen:
+; (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0, cc)
+;
+; CHECK-NOT: # double -1.000000e+00
+
+ %tobool1 = icmp ne i32 %a, 0
+ %not.tobool = icmp ne i64 %b, 0
+ %tobool1. = or i1 %tobool1, %not.tobool
+ %lor.ext = zext i1 %tobool1. to i32
+ %conv = sitofp i32 %lor.ext to double
+ %1 = load double* @foo12.d4, align 8
+ %add = fadd double %conv, %1
+ store double %add, double* @foo12.d4, align 8
+ ret double %add
+}
+
diff --git a/test/CodeGen/Mips/sll1.ll b/test/CodeGen/Mips/sll1.ll
new file mode 100644
index 0000000..fdcd38c
--- /dev/null
+++ b/test/CodeGen/Mips/sll1.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 10, align 4
+@j = global i32 0, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+; 16: sll ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
+ %0 = load i32* @i, align 4
+ %shl = shl i32 %0, 4
+; 16: sll ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
+ store i32 %shl, i32* @j, align 4
+ %1 = load i32* @j, align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/sll2.ll b/test/CodeGen/Mips/sll2.ll
new file mode 100644
index 0000000..c2af454
--- /dev/null
+++ b/test/CodeGen/Mips/sll2.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 10, align 4
+@j = global i32 4, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %shl = shl i32 %0, %1
+; 16: sllv ${{[0-9]+}}, ${{[0-9]+}}
+ store i32 %shl, i32* @i, align 4
+ %2 = load i32* @j, align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %2)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/sra1.ll b/test/CodeGen/Mips/sra1.ll
new file mode 100644
index 0000000..15bf8d6
--- /dev/null
+++ b/test/CodeGen/Mips/sra1.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 -354, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %shr = ashr i32 %0, 3
+; 16: sra ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %shr)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/sra2.ll b/test/CodeGen/Mips/sra2.ll
new file mode 100644
index 0000000..26bf19d
--- /dev/null
+++ b/test/CodeGen/Mips/sra2.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 -354, align 4
+@j = global i32 3, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %shr = ashr i32 %0, %1
+; 16: srav ${{[0-9]+}}, ${{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %shr)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/srl1.ll b/test/CodeGen/Mips/srl1.ll
new file mode 100644
index 0000000..3474283
--- /dev/null
+++ b/test/CodeGen/Mips/srl1.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 10654, align 4
+@j = global i32 0, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %shr = lshr i32 %0, 4
+; 16: srl ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
+ store i32 %shr, i32* @j, align 4
+ %1 = load i32* @j, align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %1)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/srl2.ll b/test/CodeGen/Mips/srl2.ll
new file mode 100644
index 0000000..26ec092
--- /dev/null
+++ b/test/CodeGen/Mips/srl2.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 10654, align 4
+@j = global i32 0, align 4
+@k = global i32 4, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @k, align 4
+ %shr = lshr i32 %0, %1
+; 16: srlv ${{[0-9]+}}, ${{[0-9]+}}
+ store i32 %shr, i32* @j, align 4
+ %2 = load i32* @j, align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %2)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/stacksize.ll b/test/CodeGen/Mips/stacksize.ll
new file mode 100644
index 0000000..42021b2
--- /dev/null
+++ b/test/CodeGen/Mips/stacksize.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck %s
+
+define i32 @foo(i32 %a) nounwind readnone {
+entry:
+; Check that the stack size is zero.
+; CHECK-NOT: addiu $sp, $sp
+ %add = add nsw i32 %a, 1
+ ret i32 %add
+}
diff --git a/test/CodeGen/Mips/sub1.ll b/test/CodeGen/Mips/sub1.ll
new file mode 100644
index 0000000..195750b
--- /dev/null
+++ b/test/CodeGen/Mips/sub1.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 10, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %sub = sub nsw i32 %0, 5
+; 16: addiu ${{[0-9]+}}, -{{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %sub)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/sub2.ll b/test/CodeGen/Mips/sub2.ll
new file mode 100644
index 0000000..4f6bfcc
--- /dev/null
+++ b/test/CodeGen/Mips/sub2.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 10, align 4
+@j = global i32 20, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @i, align 4
+ %sub = sub nsw i32 %0, %1
+; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %sub)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/swzero.ll b/test/CodeGen/Mips/swzero.ll
index da1e036..9f91a39 100644
--- a/test/CodeGen/Mips/swzero.ll
+++ b/test/CodeGen/Mips/swzero.ll
@@ -4,7 +4,8 @@
define void @zero_u(%struct.unaligned* nocapture %p) nounwind {
entry:
-; CHECK: usw $zero
+; CHECK: swl $zero
+; CHECK: swr $zero
%x = getelementptr inbounds %struct.unaligned* %p, i32 0, i32 0
store i32 0, i32* %x, align 1
ret void
diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll
new file mode 100644
index 0000000..d681091
--- /dev/null
+++ b/test/CodeGen/Mips/tls-alias.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
+
+@foo = thread_local global i32 42
+@bar = hidden alias i32* @foo
+
+define i32* @zed() {
+; CHECK: __tls_get_addr
+; CHECK-NEXT: %tlsgd(bar)
+ ret i32* @bar
+}
diff --git a/test/CodeGen/Mips/tls-models.ll b/test/CodeGen/Mips/tls-models.ll
new file mode 100644
index 0000000..8f5789e
--- /dev/null
+++ b/test/CodeGen/Mips/tls-models.ll
@@ -0,0 +1,113 @@
+; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=CHECK-PIC %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck -check-prefix=CHECK-NONPIC %s
+
+@external_gd = external thread_local global i32
+@internal_gd = internal thread_local global i32 42
+
+@external_ld = external thread_local(localdynamic) global i32
+@internal_ld = internal thread_local(localdynamic) global i32 42
+
+@external_ie = external thread_local(initialexec) global i32
+@internal_ie = internal thread_local(initialexec) global i32 42
+
+@external_le = external thread_local(localexec) global i32
+@internal_le = internal thread_local(localexec) global i32 42
+
+; ----- no model specified -----
+
+define i32* @f1() {
+entry:
+ ret i32* @external_gd
+
+ ; Non-PIC code can use initial-exec, PIC code has to use general dynamic.
+ ; CHECK-NONPIC: f1:
+ ; CHECK-NONPIC: %gottprel
+ ; CHECK-PIC: f1:
+ ; CHECK-PIC: %tlsgd
+}
+
+define i32* @f2() {
+entry:
+ ret i32* @internal_gd
+
+ ; Non-PIC code can use local exec, PIC code can use local dynamic.
+ ; CHECK-NONPIC: f2:
+ ; CHECK-NONPIC: %tprel_hi
+ ; CHECK-PIC: f2:
+ ; CHECK-PIC: %tlsldm
+}
+
+
+; ----- localdynamic specified -----
+
+define i32* @f3() {
+entry:
+ ret i32* @external_ld
+
+ ; Non-PIC code can use initial exec, PIC should use local dynamic.
+ ; CHECK-NONPIC: f3:
+ ; CHECK-NONPIC: %gottprel
+ ; CHECK-PIC: f3:
+ ; CHECK-PIC: %tlsldm
+}
+
+define i32* @f4() {
+entry:
+ ret i32* @internal_ld
+
+ ; Non-PIC code can use local exec, PIC code can use local dynamic.
+ ; CHECK-NONPIC: f4:
+ ; CHECK-NONPIC: %tprel_hi
+ ; CHECK-PIC: f4:
+ ; CHECK-PIC: %tlsldm
+}
+
+
+; ----- initialexec specified -----
+
+define i32* @f5() {
+entry:
+ ret i32* @external_ie
+
+ ; Non-PIC and PIC code will use initial exec as specified.
+ ; CHECK-NONPIC: f5:
+ ; CHECK-NONPIC: %gottprel
+ ; CHECK-PIC: f5:
+ ; CHECK-PIC: %gottprel
+}
+
+define i32* @f6() {
+entry:
+ ret i32* @internal_ie
+
+  ; Non-PIC code can use local exec; PIC code uses initial exec as specified.
+ ; CHECK-NONPIC: f6:
+ ; CHECK-NONPIC: %tprel_hi
+ ; CHECK-PIC: f6:
+ ; CHECK-PIC: %gottprel
+}
+
+
+; ----- localexec specified -----
+
+define i32* @f7() {
+entry:
+ ret i32* @external_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; CHECK-NONPIC: f7:
+ ; CHECK-NONPIC: %tprel_hi
+ ; CHECK-PIC: f7:
+ ; CHECK-PIC: %tprel_hi
+}
+
+define i32* @f8() {
+entry:
+ ret i32* @internal_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; CHECK-NONPIC: f8:
+ ; CHECK-NONPIC: %tprel_hi
+ ; CHECK-PIC: f8:
+ ; CHECK-PIC: %tprel_hi
+}
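+
+; Relocation summary of the cases above:
+;
+;                            non-PIC      PIC
+;   no model,     external   %gottprel    %tlsgd
+;   no model,     internal   %tprel_hi    %tlsldm
+;   localdynamic, external   %gottprel    %tlsldm
+;   localdynamic, internal   %tprel_hi    %tlsldm
+;   initialexec,  either     %gottprel*   %gottprel
+;   localexec,    either     %tprel_hi    %tprel_hi
+;
+;   (*) internal initialexec relaxes to %tprel_hi in non-PIC code.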
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
index a3c4768..a7ddb96 100644
--- a/test/CodeGen/Mips/tls.ll
+++ b/test/CodeGen/Mips/tls.ll
@@ -13,8 +13,9 @@ entry:
; CHECK: f1:
-; PIC: lw $25, %call16(__tls_get_addr)($gp)
-; PIC: addiu $4, $gp, %tlsgd(t1)
+; PIC: addu $[[R0:[a-z0-9]+]], $2, $25
+; PIC: lw $25, %call16(__tls_get_addr)($[[R0]])
+; PIC: addiu $4, $[[R0]], %tlsgd(t1)
; PIC: jalr $25
; PIC: lw $2, 0($2)
@@ -35,18 +36,19 @@ entry:
; CHECK: f2:
-; PIC: lw $25, %call16(__tls_get_addr)($gp)
-; PIC: addiu $4, $gp, %tlsgd(t2)
+; PIC: addu $[[R0:[a-z0-9]+]], $2, $25
+; PIC: lw $25, %call16(__tls_get_addr)($[[R0]])
+; PIC: addiu $4, $[[R0]], %tlsgd(t2)
; PIC: jalr $25
; PIC: lw $2, 0($2)
; STATICGP: lui $[[R0:[0-9]+]], %hi(__gnu_local_gp)
; STATICGP: addiu $[[GP:[0-9]+]], $[[R0]], %lo(__gnu_local_gp)
; STATICGP: lw ${{[0-9]+}}, %gottprel(t2)($[[GP]])
-; STATIC: lui $gp, %hi(__gnu_local_gp)
-; STATIC: addiu $gp, $gp, %lo(__gnu_local_gp)
+; STATIC: lui $[[R0:[0-9]+]], %hi(__gnu_local_gp)
+; STATIC: addiu $[[GP:[0-9]+]], $[[R0]], %lo(__gnu_local_gp)
; STATIC: rdhwr $3, $29
-; STATIC: lw $[[R0:[0-9]+]], %gottprel(t2)($gp)
+; STATIC: lw $[[R0:[0-9]+]], %gottprel(t2)($[[GP]])
; STATIC: addu $[[R1:[0-9]+]], $3, $[[R0]]
; STATIC: lw $2, 0($[[R1]])
}
@@ -57,7 +59,7 @@ define i32 @f3() nounwind {
entry:
; CHECK: f3:
-; PIC: addiu $4, $gp, %tlsldm(f3.i)
+; PIC: addiu $4, ${{[a-z0-9]+}}, %tlsldm(f3.i)
; PIC: jalr $25
; PIC: lui $[[R0:[0-9]+]], %dtprel_hi(f3.i)
; PIC: addu $[[R1:[0-9]+]], $[[R0]], $2
diff --git a/test/CodeGen/Mips/unalignedload.ll b/test/CodeGen/Mips/unalignedload.ll
index 6a087ba..7f880b6 100644
--- a/test/CodeGen/Mips/unalignedload.ll
+++ b/test/CodeGen/Mips/unalignedload.ll
@@ -9,27 +9,17 @@
define void @foo1() nounwind {
entry:
-; CHECK-EL: ulhu $4, 2
-; CHECK-EL: lw $25, %call16(foo2)
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(s4)
-; CHECK-EL: lbu $[[R1:[0-9]+]], 6($[[R0]])
-; CHECK-EL: sll $[[R3:[0-9]+]], $[[R1]], 16
-; CHECK-EL: ulhu $[[R2:[0-9]+]], 4($[[R0]])
-; CHECK-EL: or $5, $[[R2]], $[[R3]]
-; CHECK-EL: ulw $4, 0($[[R0]])
-; CHECK-EL: lw $25, %call16(foo4)
+; CHECK-EL: lbu ${{[0-9]+}}, 2($[[R0:[0-9]+]])
+; CHECK-EL: lbu ${{[0-9]+}}, 3($[[R0]])
+; CHECK-EL: jalr
+; CHECK-EL: lwl $[[R1:[0-9]+]], 3($[[R2:[0-9]+]])
+; CHECK-EL: lwr $[[R1]], 0($[[R2]])
-; CHECK-EB: ulhu $[[R0:[0-9]+]], 2
-; CHECK-EB: sll $4, $[[R0]], 16
-; CHECK-EB: lw $25, %call16(foo2)
-; CHECK-EB: lw $[[R1:[0-9]+]], %got(s4)
-; CHECK-EB: lbu $[[R3:[0-9]+]], 6($[[R1]])
-; CHECK-EB: sll $[[R5:[0-9]+]], $[[R3]], 8
-; CHECK-EB: ulhu $[[R2:[0-9]+]], 4($[[R1]])
-; CHECK-EB: sll $[[R4:[0-9]+]], $[[R2]], 16
-; CHECK-EB: or $5, $[[R4]], $[[R5]]
-; CHECK-EB: ulw $4, 0($[[R1]])
-; CHECK-EB: lw $25, %call16(foo4)
+; CHECK-EB: lbu ${{[0-9]+}}, 3($[[R0:[0-9]+]])
+; CHECK-EB: lbu ${{[0-9]+}}, 2($[[R0]])
+; CHECK-EB: jalr
+; CHECK-EB: lwl $[[R1:[0-9]+]], 0($[[R2:[0-9]+]])
+; CHECK-EB: lwr $[[R1]], 3($[[R2]])
tail call void @foo2(%struct.S1* byval getelementptr inbounds (%struct.S2* @s2, i32 0, i32 1)) nounwind
tail call void @foo4(%struct.S4* byval @s4) nounwind
diff --git a/test/CodeGen/Mips/xor1.ll b/test/CodeGen/Mips/xor1.ll
new file mode 100644
index 0000000..f2c1316
--- /dev/null
+++ b/test/CodeGen/Mips/xor1.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @x, align 4
+ %1 = load i32* @y, align 4
+ %xor = xor i32 %0, %1
+; 16: xor ${{[0-9]+}}, ${{[0-9]+}}
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0), i32 %xor)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/zeroreg.ll b/test/CodeGen/Mips/zeroreg.ll
index b890e1d..79ed609 100644
--- a/test/CodeGen/Mips/zeroreg.ll
+++ b/test/CodeGen/Mips/zeroreg.ll
@@ -4,8 +4,7 @@
define i32 @foo0(i32 %s) nounwind readonly {
entry:
-; CHECK-NOT: addiu
-; CHECK: movn
+; CHECK: movn ${{[0-9]+}}, $zero
%tobool = icmp ne i32 %s, 0
%0 = load i32* @g1, align 4, !tbaa !0
%cond = select i1 %tobool, i32 0, i32 %0
@@ -14,8 +13,7 @@ entry:
define i32 @foo1(i32 %s) nounwind readonly {
entry:
-; CHECK-NOT: addiu
-; CHECK: movz
+; CHECK: movz ${{[0-9]+}}, $zero
%tobool = icmp ne i32 %s, 0
%0 = load i32* @g1, align 4, !tbaa !0
%cond = select i1 %tobool, i32 %0, i32 0
diff --git a/test/CodeGen/NVPTX/annotations.ll b/test/CodeGen/NVPTX/annotations.ll
new file mode 100644
index 0000000..d93f688
--- /dev/null
+++ b/test/CodeGen/NVPTX/annotations.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
+@texture = internal addrspace(1) global i64 0, align 8
+; CHECK: .global .texref texture
+@surface = internal addrspace(1) global i64 0, align 8
+; CHECK: .global .surfref surface
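+; (the nvvm.annotations entries at the end of this file mark these globals as
+; texture and surface references)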
+
+
+; CHECK: .entry kernel_func_maxntid
+define void @kernel_func_maxntid(float* %a) {
+; CHECK: .maxntid 10, 20, 30
+; CHECK: ret
+ ret void
+}
+
+; CHECK: .entry kernel_func_reqntid
+define void @kernel_func_reqntid(float* %a) {
+; CHECK: .reqntid 11, 22, 33
+; CHECK: ret
+ ret void
+}
+
+; CHECK: .entry kernel_func_minctasm
+define void @kernel_func_minctasm(float* %a) {
+; CHECK: .minnctapersm 42
+; CHECK: ret
+ ret void
+}
+
+
+
+!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8}
+
+!1 = metadata !{void (float*)* @kernel_func_maxntid, metadata !"kernel", i32 1}
+!2 = metadata !{void (float*)* @kernel_func_maxntid,
+ metadata !"maxntidx", i32 10,
+ metadata !"maxntidy", i32 20,
+ metadata !"maxntidz", i32 30}
+
+!3 = metadata !{void (float*)* @kernel_func_reqntid, metadata !"kernel", i32 1}
+!4 = metadata !{void (float*)* @kernel_func_reqntid,
+ metadata !"reqntidx", i32 11,
+ metadata !"reqntidy", i32 22,
+ metadata !"reqntidz", i32 33}
+
+!5 = metadata !{void (float*)* @kernel_func_minctasm, metadata !"kernel", i32 1}
+!6 = metadata !{void (float*)* @kernel_func_minctasm,
+ metadata !"minctasm", i32 42}
+
+!7 = metadata !{i64 addrspace(1)* @texture, metadata !"texture", i32 1}
+!8 = metadata !{i64 addrspace(1)* @surface, metadata !"surface", i32 1}
diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm10.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm10.ll
new file mode 100644
index 0000000..73c77f5
--- /dev/null
+++ b/test/CodeGen/NVPTX/arithmetic-fp-sm10.ll
@@ -0,0 +1,72 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+
+;; These tests should run for all targets
+
+;;===-- Basic instruction selection tests ---------------------------------===;;
+
+
+;;; f64
+
+define double @fadd_f64(double %a, double %b) {
+; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fadd double %a, %b
+ ret double %ret
+}
+
+define double @fsub_f64(double %a, double %b) {
+; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fsub double %a, %b
+ ret double %ret
+}
+
+define double @fmul_f64(double %a, double %b) {
+; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fmul double %a, %b
+ ret double %ret
+}
+
+define double @fdiv_f64(double %a, double %b) {
+; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fdiv double %a, %b
+ ret double %ret
+}
+
+;; PTX does not have a floating-point rem instruction
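+;; (so frem is left untested here; it would have to be expanded to a software
+;; sequence or a call instead)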
+
+
+;;; f32
+
+define float @fadd_f32(float %a, float %b) {
+; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fadd float %a, %b
+ ret float %ret
+}
+
+define float @fsub_f32(float %a, float %b) {
+; CHECK: sub.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fsub float %a, %b
+ ret float %ret
+}
+
+define float @fmul_f32(float %a, float %b) {
+; CHECK: mul.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fmul float %a, %b
+ ret float %ret
+}
+
+define float @fdiv_f32(float %a, float %b) {
+; CHECK: div.full.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fdiv float %a, %b
+ ret float %ret
+}
+
+;; PTX does not have a floating-point rem instruction
diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
new file mode 100644
index 0000000..e474fa4
--- /dev/null
+++ b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
@@ -0,0 +1,72 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+;; These tests should run for all targets
+
+;;===-- Basic instruction selection tests ---------------------------------===;;
+
+
+;;; f64
+
+define double @fadd_f64(double %a, double %b) {
+; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fadd double %a, %b
+ ret double %ret
+}
+
+define double @fsub_f64(double %a, double %b) {
+; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fsub double %a, %b
+ ret double %ret
+}
+
+define double @fmul_f64(double %a, double %b) {
+; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fmul double %a, %b
+ ret double %ret
+}
+
+define double @fdiv_f64(double %a, double %b) {
+; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: ret
+ %ret = fdiv double %a, %b
+ ret double %ret
+}
+
+;; PTX does not have a floating-point rem instruction
+
+
+;;; f32
+
+define float @fadd_f32(float %a, float %b) {
+; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fadd float %a, %b
+ ret float %ret
+}
+
+define float @fsub_f32(float %a, float %b) {
+; CHECK: sub.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fsub float %a, %b
+ ret float %ret
+}
+
+define float @fmul_f32(float %a, float %b) {
+; CHECK: mul.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fmul float %a, %b
+ ret float %ret
+}
+
+define float @fdiv_f32(float %a, float %b) {
+; CHECK: div.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
+; CHECK: ret
+ %ret = fdiv float %a, %b
+ ret float %ret
+}
+
+;; PTX does not have a floating-point rem instruction
diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll
new file mode 100644
index 0000000..529f849
--- /dev/null
+++ b/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -0,0 +1,295 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+;; These tests should run for all targets
+
+;;===-- Basic instruction selection tests ---------------------------------===;;
+
+
+;;; i64
+
+define i64 @add_i64(i64 %a, i64 %b) {
+; CHECK: add.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = add i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @sub_i64(i64 %a, i64 %b) {
+; CHECK: sub.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = sub i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @mul_i64(i64 %a, i64 %b) {
+; CHECK: mul.lo.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = mul i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @sdiv_i64(i64 %a, i64 %b) {
+; CHECK: div.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = sdiv i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @udiv_i64(i64 %a, i64 %b) {
+; CHECK: div.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = udiv i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @srem_i64(i64 %a, i64 %b) {
+; CHECK: rem.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = srem i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @urem_i64(i64 %a, i64 %b) {
+; CHECK: rem.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = urem i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @and_i64(i64 %a, i64 %b) {
+; CHECK: and.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = and i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @or_i64(i64 %a, i64 %b) {
+; CHECK: or.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = or i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @xor_i64(i64 %a, i64 %b) {
+; CHECK: xor.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %ret = xor i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @shl_i64(i64 %a, i64 %b) {
+; PTX requires 32-bit shift amount
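+; (the i64 shift amount is therefore truncated to a 32-bit %r operand below)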
+; CHECK: shl.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = shl i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @ashr_i64(i64 %a, i64 %b) {
+; PTX requires 32-bit shift amount
+; CHECK: shr.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = ashr i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @lshr_i64(i64 %a, i64 %b) {
+; PTX requires 32-bit shift amount
+; CHECK: shr.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = lshr i64 %a, %b
+ ret i64 %ret
+}
+
+
+;;; i32
+
+define i32 @add_i32(i32 %a, i32 %b) {
+; CHECK: add.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = add i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @sub_i32(i32 %a, i32 %b) {
+; CHECK: sub.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = sub i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @mul_i32(i32 %a, i32 %b) {
+; CHECK: mul.lo.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = mul i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @sdiv_i32(i32 %a, i32 %b) {
+; CHECK: div.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = sdiv i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @udiv_i32(i32 %a, i32 %b) {
+; CHECK: div.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = udiv i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @srem_i32(i32 %a, i32 %b) {
+; CHECK: rem.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = srem i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @urem_i32(i32 %a, i32 %b) {
+; CHECK: rem.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = urem i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @and_i32(i32 %a, i32 %b) {
+; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = and i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @or_i32(i32 %a, i32 %b) {
+; CHECK: or.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = or i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @xor_i32(i32 %a, i32 %b) {
+; CHECK: xor.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = xor i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @shl_i32(i32 %a, i32 %b) {
+; CHECK: shl.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = shl i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @ashr_i32(i32 %a, i32 %b) {
+; CHECK: shr.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = ashr i32 %a, %b
+ ret i32 %ret
+}
+
+define i32 @lshr_i32(i32 %a, i32 %b) {
+; CHECK: shr.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = lshr i32 %a, %b
+ ret i32 %ret
+}
+
+;;; i16
+
+define i16 @add_i16(i16 %a, i16 %b) {
+; CHECK: add.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = add i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @sub_i16(i16 %a, i16 %b) {
+; CHECK: sub.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = sub i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @mul_i16(i16 %a, i16 %b) {
+; CHECK: mul.lo.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = mul i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @sdiv_i16(i16 %a, i16 %b) {
+; CHECK: div.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = sdiv i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @udiv_i16(i16 %a, i16 %b) {
+; CHECK: div.u16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = udiv i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @srem_i16(i16 %a, i16 %b) {
+; CHECK: rem.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = srem i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @urem_i16(i16 %a, i16 %b) {
+; CHECK: rem.u16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = urem i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @and_i16(i16 %a, i16 %b) {
+; CHECK: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = and i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @or_i16(i16 %a, i16 %b) {
+; CHECK: or.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = or i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @xor_i16(i16 %a, i16 %b) {
+; CHECK: xor.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %ret = xor i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @shl_i16(i16 %a, i16 %b) {
+; PTX requires 32-bit shift amount
+; CHECK: shl.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = shl i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @ashr_i16(i16 %a, i16 %b) {
+; PTX requires 32-bit shift amount
+; CHECK: shr.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = ashr i16 %a, %b
+ ret i16 %ret
+}
+
+define i16 @lshr_i16(i16 %a, i16 %b) {
+; PTX requires 32-bit shift amount
+; CHECK: shr.u16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %ret = lshr i16 %a, %b
+ ret i16 %ret
+}
diff --git a/test/CodeGen/NVPTX/calling-conv.ll b/test/CodeGen/NVPTX/calling-conv.ll
new file mode 100644
index 0000000..968203e
--- /dev/null
+++ b/test/CodeGen/NVPTX/calling-conv.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
+;; Kernel function using ptx_kernel calling conv
+
+; CHECK: .entry kernel_func
+define ptx_kernel void @kernel_func(float* %a) {
+; CHECK: ret
+ ret void
+}
+
+;; Device function
+; CHECK: .func device_func
+define void @device_func(float* %a) {
+; CHECK: ret
+ ret void
+}
+
+;; Kernel function using NVVM metadata
+; CHECK: .entry metadata_kernel
+define void @metadata_kernel(float* %a) {
+; CHECK: ret
+ ret void
+}
+
+
+!nvvm.annotations = !{!1}
+
+!1 = metadata !{void (float*)* @metadata_kernel, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll
new file mode 100644
index 0000000..12fc754
--- /dev/null
+++ b/test/CodeGen/NVPTX/compare-int.ll
@@ -0,0 +1,389 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+;; These tests should run for all targets
+
+;;===-- Basic instruction selection tests ---------------------------------===;;
+
+
+;;; i64
+
+define i64 @icmp_eq_i64(i64 %a, i64 %b) {
+; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp eq i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_ne_i64(i64 %a, i64 %b) {
+; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ne i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
+; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ugt i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_uge_i64(i64 %a, i64 %b) {
+; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp uge i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_ult_i64(i64 %a, i64 %b) {
+; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ult i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_ule_i64(i64 %a, i64 %b) {
+; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ule i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
+; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sgt i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_sge_i64(i64 %a, i64 %b) {
+; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sge i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_slt_i64(i64 %a, i64 %b) {
+; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp slt i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+define i64 @icmp_sle_i64(i64 %a, i64 %b) {
+; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sle i64 %a, %b
+ %ret = zext i1 %cmp to i64
+ ret i64 %ret
+}
+
+;;; i32
+
+define i32 @icmp_eq_i32(i32 %a, i32 %b) {
+; CHECK: setp.eq.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp eq i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_ne_i32(i32 %a, i32 %b) {
+; CHECK: setp.ne.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ne i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_ugt_i32(i32 %a, i32 %b) {
+; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ugt i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_uge_i32(i32 %a, i32 %b) {
+; CHECK: setp.ge.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp uge i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_ult_i32(i32 %a, i32 %b) {
+; CHECK: setp.lt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ult i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_ule_i32(i32 %a, i32 %b) {
+; CHECK: setp.le.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ule i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_sgt_i32(i32 %a, i32 %b) {
+; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sgt i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_sge_i32(i32 %a, i32 %b) {
+; CHECK: setp.ge.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sge i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_slt_i32(i32 %a, i32 %b) {
+; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp slt i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @icmp_sle_i32(i32 %a, i32 %b) {
+; CHECK: setp.le.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sle i32 %a, %b
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+
+;;; i16
+
+define i16 @icmp_eq_i16(i16 %a, i16 %b) {
+; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp eq i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_ne_i16(i16 %a, i16 %b) {
+; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ne i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_ugt_i16(i16 %a, i16 %b) {
+; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ugt i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_uge_i16(i16 %a, i16 %b) {
+; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp uge i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_ult_i16(i16 %a, i16 %b) {
+; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ult i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_ule_i16(i16 %a, i16 %b) {
+; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ule i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_sgt_i16(i16 %a, i16 %b) {
+; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sgt i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_sge_i16(i16 %a, i16 %b) {
+; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sge i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_slt_i16(i16 %a, i16 %b) {
+; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp slt i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i16 @icmp_sle_i16(i16 %a, i16 %b) {
+; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sle i16 %a, %b
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+
+;;; i8
+
+define i8 @icmp_eq_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
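+; (the i8 operands are first widened into 16-bit %temp registers)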
+; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp eq i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_ne_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ne i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_ugt_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ugt i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_uge_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp uge i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_ult_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ult i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_ule_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp ule i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_sgt_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sgt i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_sge_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sge i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_slt_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp slt i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
+
+define i8 @icmp_sle_i8(i8 %a, i8 %b) {
+; Comparison happens in 16-bit registers
+; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
+; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: ret
+ %cmp = icmp sle i8 %a, %b
+ %ret = zext i1 %cmp to i8
+ ret i8 %ret
+}
diff --git a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll
new file mode 100644
index 0000000..21c8437
--- /dev/null
+++ b/test/CodeGen/NVPTX/convert-fp.ll
@@ -0,0 +1,146 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
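+; Note: fp-to-int conversions use the .rzi (round-toward-zero) modifier, while
+; int-to-fp conversions use .rn (round-to-nearest-even), as the checks below show.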
+define i16 @cvt_i16_f32(float %x) {
+; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fptoui float %x to i16
+ ret i16 %a
+}
+
+define i16 @cvt_i16_f64(double %x) {
+; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: ret;
+ %a = fptoui double %x to i16
+ ret i16 %a
+}
+
+define i32 @cvt_i32_f32(float %x) {
+; CHECK: cvt.rzi.u32.f32 %r{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fptoui float %x to i32
+ ret i32 %a
+}
+
+define i32 @cvt_i32_f64(double %x) {
+; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: ret;
+ %a = fptoui double %x to i32
+ ret i32 %a
+}
+
+
+define i64 @cvt_i64_f32(float %x) {
+; CHECK: cvt.rzi.u64.f32 %rl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fptoui float %x to i64
+ ret i64 %a
+}
+
+define i64 @cvt_i64_f64(double %x) {
+; CHECK: cvt.rzi.u64.f64 %rl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: ret;
+ %a = fptoui double %x to i64
+ ret i64 %a
+}
+
+define float @cvt_f32_i16(i16 %x) {
+; CHECK: cvt.rn.f32.u16 %f{{[0-9]+}}, %rs{{[0-9]+}};
+; CHECK: ret;
+ %a = uitofp i16 %x to float
+ ret float %a
+}
+
+define float @cvt_f32_i32(i32 %x) {
+; CHECK: cvt.rn.f32.u32 %f{{[0-9]+}}, %r{{[0-9]+}};
+; CHECK: ret;
+ %a = uitofp i32 %x to float
+ ret float %a
+}
+
+define float @cvt_f32_i64(i64 %x) {
+; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: ret;
+ %a = uitofp i64 %x to float
+ ret float %a
+}
+
+define float @cvt_f32_f64(double %x) {
+; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: ret;
+ %a = fptrunc double %x to float
+ ret float %a
+}
+
+define float @cvt_f32_s16(i16 %x) {
+; CHECK: cvt.rn.f32.s16 %f{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %a = sitofp i16 %x to float
+ ret float %a
+}
+
+define float @cvt_f32_s32(i32 %x) {
+; CHECK: cvt.rn.f32.s32 %f{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %a = sitofp i32 %x to float
+ ret float %a
+}
+
+define float @cvt_f32_s64(i64 %x) {
+; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %a = sitofp i64 %x to float
+ ret float %a
+}
+
+define double @cvt_f64_i16(i16 %x) {
+; CHECK: cvt.rn.f64.u16 %fl{{[0-9]+}}, %rs{{[0-9]+}};
+; CHECK: ret;
+ %a = uitofp i16 %x to double
+ ret double %a
+}
+
+define double @cvt_f64_i32(i32 %x) {
+; CHECK: cvt.rn.f64.u32 %fl{{[0-9]+}}, %r{{[0-9]+}};
+; CHECK: ret;
+ %a = uitofp i32 %x to double
+ ret double %a
+}
+
+define double @cvt_f64_i64(i64 %x) {
+; CHECK: cvt.rn.f64.u64 %fl{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: ret;
+ %a = uitofp i64 %x to double
+ ret double %a
+}
+
+define double @cvt_f64_f32(float %x) {
+; CHECK: cvt.f64.f32 %fl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fpext float %x to double
+ ret double %a
+}
+
+define double @cvt_f64_s16(i16 %x) {
+; CHECK: cvt.rn.f64.s16 %fl{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %a = sitofp i16 %x to double
+ ret double %a
+}
+
+define double @cvt_f64_s32(i32 %x) {
+; CHECK: cvt.rn.f64.s32 %fl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %a = sitofp i32 %x to double
+ ret double %a
+}
+
+define double @cvt_f64_s64(i64 %x) {
+; CHECK: cvt.rn.f64.s64 %fl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %a = sitofp i64 %x to double
+ ret double %a
+}
diff --git a/test/CodeGen/NVPTX/convert-int-sm10.ll b/test/CodeGen/NVPTX/convert-int-sm10.ll
new file mode 100644
index 0000000..20716f9
--- /dev/null
+++ b/test/CodeGen/NVPTX/convert-int-sm10.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+
+
+; i16
+
+define i16 @cvt_i16_i32(i32 %x) {
+; CHECK: cvt.u16.u32 %rs{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %a = trunc i32 %x to i16
+ ret i16 %a
+}
+
+define i16 @cvt_i16_i64(i64 %x) {
+; CHECK: cvt.u16.u64 %rs{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %a = trunc i64 %x to i16
+ ret i16 %a
+}
+
+
+
+; i32
+
+define i32 @cvt_i32_i16(i16 %x) {
+; CHECK: cvt.u32.u16 %r{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %a = zext i16 %x to i32
+ ret i32 %a
+}
+
+define i32 @cvt_i32_i64(i64 %x) {
+; CHECK: cvt.u32.u64 %r{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: ret
+ %a = trunc i64 %x to i32
+ ret i32 %a
+}
+
+
+
+; i64
+
+define i64 @cvt_i64_i16(i16 %x) {
+; CHECK: cvt.u64.u16 %rl{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: ret
+ %a = zext i16 %x to i64
+ ret i64 %a
+}
+
+define i64 @cvt_i64_i32(i32 %x) {
+; CHECK: cvt.u64.u32 %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: ret
+ %a = zext i32 %x to i64
+ ret i64 %a
+}
diff --git a/test/CodeGen/NVPTX/convert-int-sm20.ll b/test/CodeGen/NVPTX/convert-int-sm20.ll
new file mode 100644
index 0000000..fad240e
--- /dev/null
+++ b/test/CodeGen/NVPTX/convert-int-sm20.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
+;; Integer conversions happen implicitly by loading/storing the proper types
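+;; (e.g. trunc i32 -> i16 is realized as a 16-bit ld.param of the argument,
+;; as the checks below verify)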
+
+
+; i16
+
+define i16 @cvt_i16_i32(i32 %x) {
+; CHECK: ld.param.u16 %rs[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}]
+; CHECK: st.param.b16 [func_retval{{[0-9]+}}+0], %rs[[R0]]
+; CHECK: ret
+ %a = trunc i32 %x to i16
+ ret i16 %a
+}
+
+define i16 @cvt_i16_i64(i64 %x) {
+; CHECK: ld.param.u16 %rs[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}]
+; CHECK: st.param.b16 [func_retval{{[0-9]+}}+0], %rs[[R0]]
+; CHECK: ret
+ %a = trunc i64 %x to i16
+ ret i16 %a
+}
+
+
+
+; i32
+
+define i32 @cvt_i32_i16(i16 %x) {
+; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}]
+; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]]
+; CHECK: ret
+ %a = zext i16 %x to i32
+ ret i32 %a
+}
+
+define i32 @cvt_i32_i64(i64 %x) {
+; CHECK: ld.param.u32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}]
+; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]]
+; CHECK: ret
+ %a = trunc i64 %x to i32
+ ret i32 %a
+}
+
+
+
+; i64
+
+define i64 @cvt_i64_i16(i16 %x) {
+; CHECK: ld.param.u16 %rl[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ret
+ %a = zext i16 %x to i64
+ ret i64 %a
+}
+
+define i64 @cvt_i64_i32(i32 %x) {
+; CHECK: ld.param.u32 %rl[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ret
+ %a = zext i32 %x to i64
+ ret i64 %a
+}
diff --git a/test/CodeGen/NVPTX/fma-disable.ll b/test/CodeGen/NVPTX/fma-disable.ll
new file mode 100644
index 0000000..bdd7401
--- /dev/null
+++ b/test/CodeGen/NVPTX/fma-disable.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
+
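+; With -nvptx-fma-level=0 the mul/add pairs below stay separate; level 1 allows
+; contracting them into a single fma.rn.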
+define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {
+entry:
+; FMA: fma.rn.f32
+; MUL: mul.rn.f32
+; MUL: add.rn.f32
+ %a = fmul float %x, %y
+ %b = fadd float %a, %z
+ ret float %b
+}
+
+define ptx_device double @test_mul_add_d(double %x, double %y, double %z) {
+entry:
+; FMA: fma.rn.f64
+; MUL: mul.rn.f64
+; MUL: add.rn.f64
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ ret double %b
+}
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
new file mode 100644
index 0000000..4ef1a9a
--- /dev/null
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+define ptx_device float @t1_f32(float %x, float %y, float %z) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul float %x, %y
+ %b = fadd float %a, %z
+ ret float %b
+}
+
+define ptx_device double @t1_f64(double %x, double %y, double %z) {
+; CHECK: fma.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ ret double %b
+}
diff --git a/test/CodeGen/PTX/intrinsic.ll b/test/CodeGen/NVPTX/intrinsic-old.ll
index 9f37ead..1c9879c 100644
--- a/test/CodeGen/PTX/intrinsic.ll
+++ b/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -1,231 +1,234 @@
-; RUN: llc < %s -march=ptx32 -mattr=+ptx20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
define ptx_device i32 @test_tid_x() {
-; CHECK: mov.u32 %ret0, %tid.x;
+; CHECK: mov.u32 %r0, %tid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.x()
ret i32 %x
}
define ptx_device i32 @test_tid_y() {
-; CHECK: mov.u32 %ret0, %tid.y;
+; CHECK: mov.u32 %r0, %tid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.y()
ret i32 %x
}
define ptx_device i32 @test_tid_z() {
-; CHECK: mov.u32 %ret0, %tid.z;
+; CHECK: mov.u32 %r0, %tid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.z()
ret i32 %x
}
define ptx_device i32 @test_tid_w() {
-; CHECK: mov.u32 %ret0, %tid.w;
+; CHECK: mov.u32 %r0, %tid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.w()
ret i32 %x
}
define ptx_device i32 @test_ntid_x() {
-; CHECK: mov.u32 %ret0, %ntid.x;
+; CHECK: mov.u32 %r0, %ntid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.x()
ret i32 %x
}
define ptx_device i32 @test_ntid_y() {
-; CHECK: mov.u32 %ret0, %ntid.y;
+; CHECK: mov.u32 %r0, %ntid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.y()
ret i32 %x
}
define ptx_device i32 @test_ntid_z() {
-; CHECK: mov.u32 %ret0, %ntid.z;
+; CHECK: mov.u32 %r0, %ntid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.z()
ret i32 %x
}
define ptx_device i32 @test_ntid_w() {
-; CHECK: mov.u32 %ret0, %ntid.w;
+; CHECK: mov.u32 %r0, %ntid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.w()
ret i32 %x
}
define ptx_device i32 @test_laneid() {
-; CHECK: mov.u32 %ret0, %laneid;
+; CHECK: mov.u32 %r0, %laneid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.laneid()
ret i32 %x
}
define ptx_device i32 @test_warpid() {
-; CHECK: mov.u32 %ret0, %warpid;
+; CHECK: mov.u32 %r0, %warpid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.warpid()
ret i32 %x
}
define ptx_device i32 @test_nwarpid() {
-; CHECK: mov.u32 %ret0, %nwarpid;
+; CHECK: mov.u32 %r0, %nwarpid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nwarpid()
ret i32 %x
}
define ptx_device i32 @test_ctaid_x() {
-; CHECK: mov.u32 %ret0, %ctaid.x;
+; CHECK: mov.u32 %r0, %ctaid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.x()
ret i32 %x
}
define ptx_device i32 @test_ctaid_y() {
-; CHECK: mov.u32 %ret0, %ctaid.y;
+; CHECK: mov.u32 %r0, %ctaid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.y()
ret i32 %x
}
define ptx_device i32 @test_ctaid_z() {
-; CHECK: mov.u32 %ret0, %ctaid.z;
+; CHECK: mov.u32 %r0, %ctaid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.z()
ret i32 %x
}
define ptx_device i32 @test_ctaid_w() {
-; CHECK: mov.u32 %ret0, %ctaid.w;
+; CHECK: mov.u32 %r0, %ctaid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.w()
ret i32 %x
}
define ptx_device i32 @test_nctaid_x() {
-; CHECK: mov.u32 %ret0, %nctaid.x;
+; CHECK: mov.u32 %r0, %nctaid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.x()
ret i32 %x
}
define ptx_device i32 @test_nctaid_y() {
-; CHECK: mov.u32 %ret0, %nctaid.y;
+; CHECK: mov.u32 %r0, %nctaid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.y()
ret i32 %x
}
define ptx_device i32 @test_nctaid_z() {
-; CHECK: mov.u32 %ret0, %nctaid.z;
+; CHECK: mov.u32 %r0, %nctaid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.z()
ret i32 %x
}
define ptx_device i32 @test_nctaid_w() {
-; CHECK: mov.u32 %ret0, %nctaid.w;
+; CHECK: mov.u32 %r0, %nctaid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.w()
ret i32 %x
}
define ptx_device i32 @test_smid() {
-; CHECK: mov.u32 %ret0, %smid;
+; CHECK: mov.u32 %r0, %smid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.smid()
ret i32 %x
}
define ptx_device i32 @test_nsmid() {
-; CHECK: mov.u32 %ret0, %nsmid;
+; CHECK: mov.u32 %r0, %nsmid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nsmid()
ret i32 %x
}
define ptx_device i32 @test_gridid() {
-; CHECK: mov.u32 %ret0, %gridid;
+; CHECK: mov.u32 %r0, %gridid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.gridid()
ret i32 %x
}
define ptx_device i32 @test_lanemask_eq() {
-; CHECK: mov.u32 %ret0, %lanemask_eq;
+; CHECK: mov.u32 %r0, %lanemask_eq;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.eq()
ret i32 %x
}
define ptx_device i32 @test_lanemask_le() {
-; CHECK: mov.u32 %ret0, %lanemask_le;
+; CHECK: mov.u32 %r0, %lanemask_le;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.le()
ret i32 %x
}
define ptx_device i32 @test_lanemask_lt() {
-; CHECK: mov.u32 %ret0, %lanemask_lt;
+; CHECK: mov.u32 %r0, %lanemask_lt;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.lt()
ret i32 %x
}
define ptx_device i32 @test_lanemask_ge() {
-; CHECK: mov.u32 %ret0, %lanemask_ge;
+; CHECK: mov.u32 %r0, %lanemask_ge;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.ge()
ret i32 %x
}
define ptx_device i32 @test_lanemask_gt() {
-; CHECK: mov.u32 %ret0, %lanemask_gt;
+; CHECK: mov.u32 %r0, %lanemask_gt;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.gt()
ret i32 %x
}
define ptx_device i32 @test_clock() {
-; CHECK: mov.u32 %ret0, %clock;
+; CHECK: mov.u32 %r0, %clock;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.clock()
ret i32 %x
}
define ptx_device i64 @test_clock64() {
-; CHECK: mov.u64 %ret0, %clock64;
+; CHECK: mov.u64 %rl0, %clock64;
; CHECK: ret;
%x = call i64 @llvm.ptx.read.clock64()
ret i64 %x
}
define ptx_device i32 @test_pm0() {
-; CHECK: mov.u32 %ret0, %pm0;
+; CHECK: mov.u32 %r0, %pm0;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm0()
ret i32 %x
}
define ptx_device i32 @test_pm1() {
-; CHECK: mov.u32 %ret0, %pm1;
+; CHECK: mov.u32 %r0, %pm1;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm1()
ret i32 %x
}
define ptx_device i32 @test_pm2() {
-; CHECK: mov.u32 %ret0, %pm2;
+; CHECK: mov.u32 %r0, %pm2;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm2()
ret i32 %x
}
define ptx_device i32 @test_pm3() {
-; CHECK: mov.u32 %ret0, %pm3;
+; CHECK: mov.u32 %r0, %pm3;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm3()
ret i32 %x
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
new file mode 100644
index 0000000..afab60c
--- /dev/null
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+define ptx_device float @test_fabsf(float %f) {
+; CHECK: abs.f32 %f0, %f0;
+; CHECK: ret;
+ %x = call float @llvm.fabs.f32(float %f)
+ ret float %x
+}
+
+define ptx_device double @test_fabs(double %d) {
+; CHECK: abs.f64 %fl0, %fl0;
+; CHECK: ret;
+ %x = call double @llvm.fabs.f64(double %d)
+ ret double %x
+}
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll
new file mode 100644
index 0000000..d1f5093d
--- /dev/null
+++ b/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -0,0 +1,173 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s --check-prefix=PTX64
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+
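+;; NVPTX address spaces: 1 = global, 3 = shared, 5 = local (0 is the generic space)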
+;; i8
+define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
+; PTX32: ld.global.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.global.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i8 addrspace(1)* %ptr
+ ret i8 %a
+}
+
+define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
+; PTX32: ld.shared.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.shared.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i8 addrspace(3)* %ptr
+ ret i8 %a
+}
+
+define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
+; PTX32: ld.local.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.local.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i8 addrspace(5)* %ptr
+ ret i8 %a
+}
+
+;; i16
+define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
+; PTX32: ld.global.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.global.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i16 addrspace(1)* %ptr
+ ret i16 %a
+}
+
+define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
+; PTX32: ld.shared.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.shared.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i16 addrspace(3)* %ptr
+ ret i16 %a
+}
+
+define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
+; PTX32: ld.local.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.local.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i16 addrspace(5)* %ptr
+ ret i16 %a
+}
+
+;; i32
+define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
+; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i32 addrspace(1)* %ptr
+ ret i32 %a
+}
+
+define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
+; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i32 addrspace(3)* %ptr
+ ret i32 %a
+}
+
+define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
+; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i32 addrspace(5)* %ptr
+ ret i32 %a
+}
+
+;; i64
+define i64 @ld_global_i64(i64 addrspace(1)* %ptr) {
+; PTX32: ld.global.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.global.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i64 addrspace(1)* %ptr
+ ret i64 %a
+}
+
+define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) {
+; PTX32: ld.shared.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.shared.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i64 addrspace(3)* %ptr
+ ret i64 %a
+}
+
+define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
+; PTX32: ld.local.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.local.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i64 addrspace(5)* %ptr
+ ret i64 %a
+}
+
+;; f32
+define float @ld_global_f32(float addrspace(1)* %ptr) {
+; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load float addrspace(1)* %ptr
+ ret float %a
+}
+
+define float @ld_shared_f32(float addrspace(3)* %ptr) {
+; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load float addrspace(3)* %ptr
+ ret float %a
+}
+
+define float @ld_local_f32(float addrspace(5)* %ptr) {
+; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load float addrspace(5)* %ptr
+ ret float %a
+}
+
+;; f64
+define double @ld_global_f64(double addrspace(1)* %ptr) {
+; PTX32: ld.global.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.global.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load double addrspace(1)* %ptr
+ ret double %a
+}
+
+define double @ld_shared_f64(double addrspace(3)* %ptr) {
+; PTX32: ld.shared.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.shared.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load double addrspace(3)* %ptr
+ ret double %a
+}
+
+define double @ld_local_f64(double addrspace(5)* %ptr) {
+; PTX32: ld.local.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.local.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load double addrspace(5)* %ptr
+ ret double %a
+}
diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll
new file mode 100644
index 0000000..81a5216
--- /dev/null
+++ b/test/CodeGen/NVPTX/ld-generic.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+
+;; i8
+define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
+; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i8 addrspace(0)* %ptr
+ ret i8 %a
+}
+
+;; i16
+define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
+; PTX32: ld.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i16 addrspace(0)* %ptr
+ ret i16 %a
+}
+
+;; i32
+define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
+; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i32 addrspace(0)* %ptr
+ ret i32 %a
+}
+
+;; i64
+define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
+; PTX32: ld.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load i64 addrspace(0)* %ptr
+ ret i64 %a
+}
+
+;; f32
+define float @ld_global_f32(float addrspace(0)* %ptr) {
+; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load float addrspace(0)* %ptr
+ ret float %a
+}
+
+;; f64
+define double @ld_global_f64(double addrspace(0)* %ptr) {
+; PTX32: ld.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ret
+; PTX64: ld.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ret
+ %a = load double addrspace(0)* %ptr
+ ret double %a
+}
diff --git a/test/CodeGen/PTX/lit.local.cfg b/test/CodeGen/NVPTX/lit.local.cfg
index e748f7f..7180c84 100644
--- a/test/CodeGen/PTX/lit.local.cfg
+++ b/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,6 +1,5 @@
config.suffixes = ['.ll', '.c', '.cpp']
targets = set(config.root.targets_to_build.split())
-if not 'PTX' in targets:
+if 'NVPTX' not in targets:
config.unsupported = True
-
diff --git a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll
new file mode 100644
index 0000000..ab6f423
--- /dev/null
+++ b/test/CodeGen/NVPTX/simple-call.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
+
+; CHECK: .func ({{.*}}) device_func
+define float @device_func(float %a) noinline {
+ %ret = fmul float %a, %a
+ ret float %ret
+}
+
+; CHECK: .entry kernel_func
+define void @kernel_func(float* %a) {
+ %val = load float* %a
+; CHECK: call.uni (retval0),
+; CHECK: device_func,
+ %mul = call float @device_func(float %val)
+ store float %mul, float* %a
+ ret void
+}
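+; (call.uni is the uniform, i.e. non-divergent, form of the PTX call instruction)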
+
+
+
+!nvvm.annotations = !{!1}
+
+!1 = metadata !{void (float*)* @kernel_func, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll
new file mode 100644
index 0000000..54e04ae
--- /dev/null
+++ b/test/CodeGen/NVPTX/st-addrspace.ll
@@ -0,0 +1,179 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s --check-prefix=PTX64
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+
+;; i8
+
+define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
+; PTX32: st.global.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX64: ret
+ store i8 %a, i8 addrspace(1)* %ptr
+ ret void
+}
+
+define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) {
+; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX64: ret
+ store i8 %a, i8 addrspace(3)* %ptr
+ ret void
+}
+
+define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) {
+; PTX32: st.local.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX64: ret
+ store i8 %a, i8 addrspace(5)* %ptr
+ ret void
+}
+
+;; i16
+
+define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) {
+; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.global.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: ret
+ store i16 %a, i16 addrspace(1)* %ptr
+ ret void
+}
+
+define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) {
+; PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.shared.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: ret
+ store i16 %a, i16 addrspace(3)* %ptr
+ ret void
+}
+
+define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) {
+; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.local.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: ret
+ store i16 %a, i16 addrspace(5)* %ptr
+ ret void
+}
+
+;; i32
+
+define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) {
+; PTX32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.global.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: ret
+ store i32 %a, i32 addrspace(1)* %ptr
+ ret void
+}
+
+define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) {
+; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.shared.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: ret
+ store i32 %a, i32 addrspace(3)* %ptr
+ ret void
+}
+
+define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) {
+; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.local.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: ret
+ store i32 %a, i32 addrspace(5)* %ptr
+ ret void
+}
+
+;; i64
+
+define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) {
+; PTX32: st.global.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.global.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: ret
+ store i64 %a, i64 addrspace(1)* %ptr
+ ret void
+}
+
+define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) {
+; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.shared.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: ret
+ store i64 %a, i64 addrspace(3)* %ptr
+ ret void
+}
+
+define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) {
+; PTX32: st.local.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.local.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: ret
+ store i64 %a, i64 addrspace(5)* %ptr
+ ret void
+}
+
+;; f32
+
+define void @st_global_f32(float addrspace(1)* %ptr, float %a) {
+; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.global.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: ret
+ store float %a, float addrspace(1)* %ptr
+ ret void
+}
+
+define void @st_shared_f32(float addrspace(3)* %ptr, float %a) {
+; PTX32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.shared.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: ret
+ store float %a, float addrspace(3)* %ptr
+ ret void
+}
+
+define void @st_local_f32(float addrspace(5)* %ptr, float %a) {
+; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.local.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: ret
+ store float %a, float addrspace(5)* %ptr
+ ret void
+}
+
+;; f64
+
+define void @st_global_f64(double addrspace(1)* %ptr, double %a) {
+; PTX32: st.global.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.global.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: ret
+ store double %a, double addrspace(1)* %ptr
+ ret void
+}
+
+define void @st_shared_f64(double addrspace(3)* %ptr, double %a) {
+; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.shared.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: ret
+ store double %a, double addrspace(3)* %ptr
+ ret void
+}
+
+define void @st_local_f64(double addrspace(5)* %ptr, double %a) {
+; PTX32: st.local.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.local.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: ret
+ store double %a, double addrspace(5)* %ptr
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll
new file mode 100644
index 0000000..59a1fe0
--- /dev/null
+++ b/test/CodeGen/NVPTX/st-generic.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+
+;; i8
+
+define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
+; PTX32: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
+; PTX64: ret
+ store i8 %a, i8 addrspace(0)* %ptr
+ ret void
+}
+
+;; i16
+
+define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
+; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: ret
+ store i16 %a, i16 addrspace(0)* %ptr
+ ret void
+}
+
+;; i32
+
+define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
+; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: ret
+ store i32 %a, i32 addrspace(0)* %ptr
+ ret void
+}
+
+;; i64
+
+define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
+; PTX32: st.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: ret
+ store i64 %a, i64 addrspace(0)* %ptr
+ ret void
+}
+
+;; f32
+
+define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
+; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: ret
+ store float %a, float addrspace(0)* %ptr
+ ret void
+}
+
+;; f64
+
+define void @st_global_f64(double addrspace(0)* %ptr, double %a) {
+; PTX32: st.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: ret
+; PTX64: st.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: ret
+ store double %a, double addrspace(0)* %ptr
+ ret void
+}
diff --git a/test/CodeGen/PTX/20110926-sitofp.ll b/test/CodeGen/PTX/20110926-sitofp.ll
deleted file mode 100644
index 38d35c5..0000000
--- a/test/CodeGen/PTX/20110926-sitofp.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-@A = common global [1536 x [1536 x float]] zeroinitializer, align 4
-@B = common global [1536 x [1536 x float]] zeroinitializer, align 4
-
-define internal ptx_device void @init_array(i32 %x, i32 %y) {
- %arrayidx103 = getelementptr [1536 x [1536 x float]]* @A, i32 0, i32 %x, i32 %y
- %arrayidx224 = getelementptr [1536 x [1536 x float]]* @B, i32 0, i32 %x, i32 %y
- %mul5 = mul i32 %x, %y
- %rem = srem i32 %mul5, 1024
- %add = add nsw i32 %rem, 1
-; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}}
- %conv = sitofp i32 %add to double
- %div = fmul double %conv, 5.000000e-01
- %conv7 = fptrunc double %div to float
- store float %conv7, float* %arrayidx103, align 4
- %rem14 = srem i32 %mul5, 1024
- %add15 = add nsw i32 %rem14, 1
- %conv16 = sitofp i32 %add15 to double
- %div17 = fmul double %conv16, 5.000000e-01
- %conv18 = fptrunc double %div17 to float
- store float %conv18, float* %arrayidx224, align 4
- ret void
-}
diff --git a/test/CodeGen/PTX/add.ll b/test/CodeGen/PTX/add.ll
deleted file mode 100644
index 8b10d11..0000000
--- a/test/CodeGen/PTX/add.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device i16 @t1_u16(i16 %x, i16 %y) {
-; CHECK: add.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}}, %rh{{[0-9]+}};
-; CHECK: ret;
- %z = add i16 %x, %y
- ret i16 %z
-}
-
-define ptx_device i32 @t1_u32(i32 %x, i32 %y) {
-; CHECK: add.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: ret;
- %z = add i32 %x, %y
- ret i32 %z
-}
-
-define ptx_device i64 @t1_u64(i64 %x, i64 %y) {
-; CHECK: add.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}};
-; CHECK: ret;
- %z = add i64 %x, %y
- ret i64 %z
-}
-
-define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: add.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
-; CHECK: ret;
- %z = fadd float %x, %y
- ret float %z
-}
-
-define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: add.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
-; CHECK: ret;
- %z = fadd double %x, %y
- ret double %z
-}
-
-define ptx_device i16 @t2_u16(i16 %x) {
-; CHECK: add.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}}, 1;
-; CHECK: ret;
- %z = add i16 %x, 1
- ret i16 %z
-}
-
-define ptx_device i32 @t2_u32(i32 %x) {
-; CHECK: add.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, 1;
-; CHECK: ret;
- %z = add i32 %x, 1
- ret i32 %z
-}
-
-define ptx_device i64 @t2_u64(i64 %x) {
-; CHECK: add.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}}, 1;
-; CHECK: ret;
- %z = add i64 %x, 1
- ret i64 %z
-}
-
-define ptx_device float @t2_f32(float %x) {
-; CHECK: add.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, 0D3FF0000000000000;
-; CHECK: ret;
- %z = fadd float %x, 1.0
- ret float %z
-}
-
-define ptx_device double @t2_f64(double %x) {
-; CHECK: add.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, 0D3FF0000000000000;
-; CHECK: ret;
- %z = fadd double %x, 1.0
- ret double %z
-}
diff --git a/test/CodeGen/PTX/aggregates.ll b/test/CodeGen/PTX/aggregates.ll
deleted file mode 100644
index 3fc0c40..0000000
--- a/test/CodeGen/PTX/aggregates.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=sm20 | FileCheck %s
-; XFAIL: *
-
-%complex = type { float, float }
-
-define ptx_device %complex @complex_add(%complex %a, %complex %b) {
-entry:
-; CHECK: ld.param.f32 r[[R0:[0-9]+]], [__param_1];
-; CHECK-NEXT: ld.param.f32 r[[R2:[0-9]+]], [__param_3];
-; CHECK-NEXT: ld.param.f32 r[[R1:[0-9]+]], [__param_2];
-; CHECK-NEXT: ld.param.f32 r[[R3:[0-9]+]], [__param_4];
-; CHECK-NEXT: add.rn.f32 r[[R0]], r[[R0]], r[[R2]];
-; CHECK-NEXT: add.rn.f32 r[[R1]], r[[R1]], r[[R3]];
-; CHECK-NEXT: ret;
- %a.real = extractvalue %complex %a, 0
- %a.imag = extractvalue %complex %a, 1
- %b.real = extractvalue %complex %b, 0
- %b.imag = extractvalue %complex %b, 1
- %ret.real = fadd float %a.real, %b.real
- %ret.imag = fadd float %a.imag, %b.imag
- %ret.0 = insertvalue %complex undef, float %ret.real, 0
- %ret.1 = insertvalue %complex %ret.0, float %ret.imag, 1
- ret %complex %ret.1
-}
diff --git a/test/CodeGen/PTX/bitwise.ll b/test/CodeGen/PTX/bitwise.ll
deleted file mode 100644
index 1403a23..0000000
--- a/test/CodeGen/PTX/bitwise.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-; preds
-
-define ptx_device i32 @t1_and_preds(i1 %x, i1 %y) {
-; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
- %c = and i1 %x, %y
- %d = zext i1 %c to i32
- ret i32 %d
-}
-
-define ptx_device i32 @t1_or_preds(i1 %x, i1 %y) {
-; CHECK: or.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
- %a = or i1 %x, %y
- %b = zext i1 %a to i32
- ret i32 %b
-}
-
-define ptx_device i32 @t1_xor_preds(i1 %x, i1 %y) {
-; CHECK: xor.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
- %a = xor i1 %x, %y
- %b = zext i1 %a to i32
- ret i32 %b
-}
diff --git a/test/CodeGen/PTX/bra.ll b/test/CodeGen/PTX/bra.ll
deleted file mode 100644
index 464c29c..0000000
--- a/test/CodeGen/PTX/bra.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device void @test_bra_direct() {
-; CHECK: bra $L__BB0_1;
-entry:
- br label %loop
-loop:
- br label %loop
-}
-
-define ptx_device i32 @test_bra_cond_direct(i32 %x, i32 %y) {
-entry:
-; CHECK: setp.le.u32 %p0, %r[[R0:[0-9]+]], %r[[R1:[0-9]+]]
- %p = icmp ugt i32 %x, %y
-; CHECK-NEXT: @%p0 bra
-; CHECK-NOT: bra
- br i1 %p, label %clause.if, label %clause.else
-clause.if:
-; CHECK: mov.u32 %ret{{[0-9]+}}, %r[[R0]]
- ret i32 %x
-clause.else:
-; CHECK: mov.u32 %ret{{[0-9]+}}, %r[[R1]]
- ret i32 %y
-}
diff --git a/test/CodeGen/PTX/cvt.ll b/test/CodeGen/PTX/cvt.ll
deleted file mode 100644
index f55070a..0000000
--- a/test/CodeGen/PTX/cvt.ll
+++ /dev/null
@@ -1,290 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-; preds
-; (note: we convert back to i32 to return)
-
-define ptx_device i32 @cvt_pred_i16(i16 %x, i1 %y) {
-; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rh{{[0-9]+}}, 0
-; CHECK: and.pred %p2, %p[[P0:[0-9]+]], %p{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0:[0-9]+]];
-; CHECK: ret;
- %a = trunc i16 %x to i1
- %b = and i1 %a, %y
- %c = zext i1 %b to i32
- ret i32 %c
-}
-
-define ptx_device i32 @cvt_pred_i32(i32 %x, i1 %y) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 0
-; CHECK: and.pred %p2, %p[[P0:[0-9]+]], %p{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0:[0-9]+]];
-; CHECK: ret;
- %a = trunc i32 %x to i1
- %b = and i1 %a, %y
- %c = zext i1 %b to i32
- ret i32 %c
-}
-
-define ptx_device i32 @cvt_pred_i64(i64 %x, i1 %y) {
-; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, 0
-; CHECK: and.pred %p2, %p[[P0:[0-9]+]], %p{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0:[0-9]+]];
-; CHECK: ret;
- %a = trunc i64 %x to i1
- %b = and i1 %a, %y
- %c = zext i1 %b to i32
- ret i32 %c
-}
-
-define ptx_device i32 @cvt_pred_f32(float %x, i1 %y) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 0
-; CHECK: and.pred %p2, %p[[P0:[0-9]+]], %p{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0:[0-9]+]];
-; CHECK: ret;
- %a = fptoui float %x to i1
- %b = and i1 %a, %y
- %c = zext i1 %b to i32
- ret i32 %c
-}
-
-define ptx_device i32 @cvt_pred_f64(double %x, i1 %y) {
-; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, 0
-; CHECK: and.pred %p2, %p[[P0:[0-9]+]], %p{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0:[0-9]+]];
-; CHECK: ret;
- %a = fptoui double %x to i1
- %b = and i1 %a, %y
- %c = zext i1 %b to i32
- ret i32 %c
-}
-
-; i16
-
-define ptx_device i16 @cvt_i16_preds(i1 %x) {
-; CHECK: selp.u16 %ret{{[0-9]+}}, 1, 0, %p{{[0-9]+}};
-; CHECK: ret;
- %a = zext i1 %x to i16
- ret i16 %a
-}
-
-define ptx_device i16 @cvt_i16_i32(i32 %x) {
-; CHECK: cvt.u16.u32 %ret{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: ret;
- %a = trunc i32 %x to i16
- ret i16 %a
-}
-
-define ptx_device i16 @cvt_i16_i64(i64 %x) {
-; CHECK: cvt.u16.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}};
-; CHECK: ret;
- %a = trunc i64 %x to i16
- ret i16 %a
-}
-
-define ptx_device i16 @cvt_i16_f32(float %x) {
-; CHECK: cvt.rzi.u16.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fptoui float %x to i16
- ret i16 %a
-}
-
-define ptx_device i16 @cvt_i16_f64(double %x) {
-; CHECK: cvt.rzi.u16.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fptoui double %x to i16
- ret i16 %a
-}
-
-; i32
-
-define ptx_device i32 @cvt_i32_preds(i1 %x) {
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p{{[0-9]+}};
-; CHECK: ret;
- %a = zext i1 %x to i32
- ret i32 %a
-}
-
-define ptx_device i32 @cvt_i32_i16(i16 %x) {
-; CHECK: cvt.u32.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}};
-; CHECK: ret;
- %a = zext i16 %x to i32
- ret i32 %a
-}
-
-define ptx_device i32 @cvt_i32_i64(i64 %x) {
-; CHECK: cvt.u32.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}};
-; CHECK: ret;
- %a = trunc i64 %x to i32
- ret i32 %a
-}
-
-define ptx_device i32 @cvt_i32_f32(float %x) {
-; CHECK: cvt.rzi.u32.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fptoui float %x to i32
- ret i32 %a
-}
-
-define ptx_device i32 @cvt_i32_f64(double %x) {
-; CHECK: cvt.rzi.u32.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fptoui double %x to i32
- ret i32 %a
-}
-
-; i64
-
-define ptx_device i64 @cvt_i64_preds(i1 %x) {
-; CHECK: selp.u64 %ret{{[0-9]+}}, 1, 0, %p{{[0-9]+}};
-; CHECK: ret;
- %a = zext i1 %x to i64
- ret i64 %a
-}
-
-define ptx_device i64 @cvt_i64_i16(i16 %x) {
-; CHECK: cvt.u64.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}};
-; CHECK: ret;
- %a = zext i16 %x to i64
- ret i64 %a
-}
-
-define ptx_device i64 @cvt_i64_i32(i32 %x) {
-; CHECK: cvt.u64.u32 %ret{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: ret;
- %a = zext i32 %x to i64
- ret i64 %a
-}
-
-define ptx_device i64 @cvt_i64_f32(float %x) {
-; CHECK: cvt.rzi.u64.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fptoui float %x to i64
- ret i64 %a
-}
-
-define ptx_device i64 @cvt_i64_f64(double %x) {
-; CHECK: cvt.rzi.u64.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fptoui double %x to i64
- ret i64 %a
-}
-
-; f32
-
-define ptx_device float @cvt_f32_preds(i1 %x) {
-; CHECK: mov.b32 %f0, 0;
-; CHECK: mov.b32 %f1, 1065353216;
-; CHECK: selp.f32 %ret{{[0-9]+}}, %f1, %f0, %p{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i1 %x to float
- ret float %a
-}
-
-define ptx_device float @cvt_f32_i16(i16 %x) {
-; CHECK: cvt.rn.f32.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i16 %x to float
- ret float %a
-}
-
-define ptx_device float @cvt_f32_i32(i32 %x) {
-; CHECK: cvt.rn.f32.u32 %ret{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i32 %x to float
- ret float %a
-}
-
-define ptx_device float @cvt_f32_i64(i64 %x) {
-; CHECK: cvt.rn.f32.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i64 %x to float
- ret float %a
-}
-
-define ptx_device float @cvt_f32_f64(double %x) {
-; CHECK: cvt.rn.f32.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fptrunc double %x to float
- ret float %a
-}
-
-define ptx_device float @cvt_f32_s16(i16 %x) {
-; CHECK: cvt.rn.f32.s16 %ret{{[0-9]+}}, %rh{{[0-9]+}}
-; CHECK: ret
- %a = sitofp i16 %x to float
- ret float %a
-}
-
-define ptx_device float @cvt_f32_s32(i32 %x) {
-; CHECK: cvt.rn.f32.s32 %ret{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: ret
- %a = sitofp i32 %x to float
- ret float %a
-}
-
-define ptx_device float @cvt_f32_s64(i64 %x) {
-; CHECK: cvt.rn.f32.s64 %ret{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: ret
- %a = sitofp i64 %x to float
- ret float %a
-}
-
-; f64
-
-define ptx_device double @cvt_f64_preds(i1 %x) {
-; CHECK: mov.b64 %fd0, 0;
-; CHECK: mov.b64 %fd1, 4575657221408423936;
-; CHECK: selp.f64 %ret{{[0-9]+}}, %fd1, %fd0, %p{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i1 %x to double
- ret double %a
-}
-
-define ptx_device double @cvt_f64_i16(i16 %x) {
-; CHECK: cvt.rn.f64.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i16 %x to double
- ret double %a
-}
-
-define ptx_device double @cvt_f64_i32(i32 %x) {
-; CHECK: cvt.rn.f64.u32 %ret{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i32 %x to double
- ret double %a
-}
-
-define ptx_device double @cvt_f64_i64(i64 %x) {
-; CHECK: cvt.rn.f64.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}};
-; CHECK: ret;
- %a = uitofp i64 %x to double
- ret double %a
-}
-
-define ptx_device double @cvt_f64_f32(float %x) {
-; CHECK: cvt.f64.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fpext float %x to double
- ret double %a
-}
-
-define ptx_device double @cvt_f64_s16(i16 %x) {
-; CHECK: cvt.rn.f64.s16 %ret{{[0-9]+}}, %rh{{[0-9]+}}
-; CHECK: ret
- %a = sitofp i16 %x to double
- ret double %a
-}
-
-define ptx_device double @cvt_f64_s32(i32 %x) {
-; CHECK: cvt.rn.f64.s32 %ret{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: ret
- %a = sitofp i32 %x to double
- ret double %a
-}
-
-define ptx_device double @cvt_f64_s64(i64 %x) {
-; CHECK: cvt.rn.f64.s64 %ret{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: ret
- %a = sitofp i64 %x to double
- ret double %a
-}
diff --git a/test/CodeGen/PTX/exit.ll b/test/CodeGen/PTX/exit.ll
deleted file mode 100644
index 7816c80..0000000
--- a/test/CodeGen/PTX/exit.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_kernel void @t1() {
-; CHECK: exit;
-; CHECK-NOT: ret;
- ret void
-}
-
-define ptx_kernel void @t2(i32* %p, i32 %x) {
- store i32 %x, i32* %p
-; CHECK: exit;
-; CHECK-NOT: ret;
- ret void
-}
diff --git a/test/CodeGen/PTX/fdiv-sm10.ll b/test/CodeGen/PTX/fdiv-sm10.ll
deleted file mode 100644
index e1013be..0000000
--- a/test/CodeGen/PTX/fdiv-sm10.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=+sm10 | FileCheck %s
-
-define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: div.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fdiv float %x, %y
- ret float %a
-}
-
-define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: div.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fdiv double %x, %y
- ret double %a
-}
diff --git a/test/CodeGen/PTX/fdiv-sm13.ll b/test/CodeGen/PTX/fdiv-sm13.ll
deleted file mode 100644
index 1afa2eb..0000000
--- a/test/CodeGen/PTX/fdiv-sm13.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=+sm13 | FileCheck %s
-
-define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: div.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fdiv float %x, %y
- ret float %a
-}
-
-define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: div.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fdiv double %x, %y
- ret double %a
-}
diff --git a/test/CodeGen/PTX/fneg.ll b/test/CodeGen/PTX/fneg.ll
deleted file mode 100644
index 2b76e63..0000000
--- a/test/CodeGen/PTX/fneg.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device float @t1_f32(float %x) {
-; CHECK: neg.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %y = fsub float -0.000000e+00, %x
- ret float %y
-}
-
-define ptx_device double @t1_f64(double %x) {
-; CHECK: neg.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %y = fsub double -0.000000e+00, %x
- ret double %y
-}
diff --git a/test/CodeGen/PTX/ld.ll b/test/CodeGen/PTX/ld.ll
deleted file mode 100644
index e55820d..0000000
--- a/test/CodeGen/PTX/ld.ll
+++ /dev/null
@@ -1,382 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-;CHECK: .extern .global .b16 array_i16[10];
-@array_i16 = external global [10 x i16]
-
-;CHECK: .extern .const .b16 array_constant_i16[10];
-@array_constant_i16 = external addrspace(1) constant [10 x i16]
-
-;CHECK: .extern .shared .b16 array_shared_i16[10];
-@array_shared_i16 = external addrspace(4) global [10 x i16]
-
-;CHECK: .extern .global .b32 array_i32[10];
-@array_i32 = external global [10 x i32]
-
-;CHECK: .extern .const .b32 array_constant_i32[10];
-@array_constant_i32 = external addrspace(1) constant [10 x i32]
-
-;CHECK: .extern .shared .b32 array_shared_i32[10];
-@array_shared_i32 = external addrspace(4) global [10 x i32]
-
-;CHECK: .extern .global .b64 array_i64[10];
-@array_i64 = external global [10 x i64]
-
-;CHECK: .extern .const .b64 array_constant_i64[10];
-@array_constant_i64 = external addrspace(1) constant [10 x i64]
-
-;CHECK: .extern .shared .b64 array_shared_i64[10];
-@array_shared_i64 = external addrspace(4) global [10 x i64]
-
-;CHECK: .extern .global .b32 array_float[10];
-@array_float = external global [10 x float]
-
-;CHECK: .extern .const .b32 array_constant_float[10];
-@array_constant_float = external addrspace(1) constant [10 x float]
-
-;CHECK: .extern .shared .b32 array_shared_float[10];
-@array_shared_float = external addrspace(4) global [10 x float]
-
-;CHECK: .extern .global .b64 array_double[10];
-@array_double = external global [10 x double]
-
-;CHECK: .extern .const .b64 array_constant_double[10];
-@array_constant_double = external addrspace(1) constant [10 x double]
-
-;CHECK: .extern .shared .b64 array_shared_double[10];
-@array_shared_double = external addrspace(4) global [10 x double]
-
-
-define ptx_device i16 @t1_u16(i16* %p) {
-entry:
-;CHECK: ld.global.u16 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
-;CHECK: ret;
- %x = load i16* %p
- ret i16 %x
-}
-
-define ptx_device i32 @t1_u32(i32* %p) {
-entry:
-;CHECK: ld.global.u32 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
-;CHECK: ret;
- %x = load i32* %p
- ret i32 %x
-}
-
-define ptx_device i64 @t1_u64(i64* %p) {
-entry:
-;CHECK: ld.global.u64 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
-;CHECK: ret;
- %x = load i64* %p
- ret i64 %x
-}
-
-define ptx_device float @t1_f32(float* %p) {
-entry:
-;CHECK: ld.global.f32 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
-;CHECK: ret;
- %x = load float* %p
- ret float %x
-}
-
-define ptx_device double @t1_f64(double* %p) {
-entry:
-;CHECK: ld.global.f64 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
-;CHECK: ret;
- %x = load double* %p
- ret double %x
-}
-
-define ptx_device i16 @t2_u16(i16* %p) {
-entry:
-;CHECK: ld.global.u16 %ret{{[0-9]+}}, [%r{{[0-9]+}}+2];
-;CHECK: ret;
- %i = getelementptr i16* %p, i32 1
- %x = load i16* %i
- ret i16 %x
-}
-
-define ptx_device i32 @t2_u32(i32* %p) {
-entry:
-;CHECK: ld.global.u32 %ret{{[0-9]+}}, [%r{{[0-9]+}}+4];
-;CHECK: ret;
- %i = getelementptr i32* %p, i32 1
- %x = load i32* %i
- ret i32 %x
-}
-
-define ptx_device i64 @t2_u64(i64* %p) {
-entry:
-;CHECK: ld.global.u64 %ret{{[0-9]+}}, [%r{{[0-9]+}}+8];
-;CHECK: ret;
- %i = getelementptr i64* %p, i32 1
- %x = load i64* %i
- ret i64 %x
-}
-
-define ptx_device float @t2_f32(float* %p) {
-entry:
-;CHECK: ld.global.f32 %ret{{[0-9]+}}, [%r{{[0-9]+}}+4];
-;CHECK: ret;
- %i = getelementptr float* %p, i32 1
- %x = load float* %i
- ret float %x
-}
-
-define ptx_device double @t2_f64(double* %p) {
-entry:
-;CHECK: ld.global.f64 %ret{{[0-9]+}}, [%r{{[0-9]+}}+8];
-;CHECK: ret;
- %i = getelementptr double* %p, i32 1
- %x = load double* %i
- ret double %x
-}
-
-define ptx_device i16 @t3_u16(i16* %p, i32 %q) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 1;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: ld.global.u16 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
- %i = getelementptr i16* %p, i32 %q
- %x = load i16* %i
- ret i16 %x
-}
-
-define ptx_device i32 @t3_u32(i32* %p, i32 %q) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 2;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: ld.global.u32 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
- %i = getelementptr i32* %p, i32 %q
- %x = load i32* %i
- ret i32 %x
-}
-
-define ptx_device i64 @t3_u64(i64* %p, i32 %q) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 3;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: ld.global.u64 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
- %i = getelementptr i64* %p, i32 %q
- %x = load i64* %i
- ret i64 %x
-}
-
-define ptx_device float @t3_f32(float* %p, i32 %q) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 2;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: ld.global.f32 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
- %i = getelementptr float* %p, i32 %q
- %x = load float* %i
- ret float %x
-}
-
-define ptx_device double @t3_f64(double* %p, i32 %q) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 3;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: ld.global.f64 %ret{{[0-9]+}}, [%r{{[0-9]+}}];
- %i = getelementptr double* %p, i32 %q
- %x = load double* %i
- ret double %x
-}
-
-define ptx_device i16 @t4_global_u16() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i16;
-;CHECK: ld.global.u16 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 0
- %x = load i16* %i
- ret i16 %x
-}
-
-define ptx_device i32 @t4_global_u32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i32;
-;CHECK: ld.global.u32 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 0
- %x = load i32* %i
- ret i32 %x
-}
-
-define ptx_device i64 @t4_global_u64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i64;
-;CHECK: ld.global.u64 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 0
- %x = load i64* %i
- ret i64 %x
-}
-
-define ptx_device float @t4_global_f32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_float;
-;CHECK: ld.global.f32 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x float]* @array_float, i32 0, i32 0
- %x = load float* %i
- ret float %x
-}
-
-define ptx_device double @t4_global_f64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_double;
-;CHECK: ld.global.f64 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x double]* @array_double, i32 0, i32 0
- %x = load double* %i
- ret double %x
-}
-
-define ptx_device i16 @t4_const_u16() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_constant_i16;
-;CHECK: ld.const.u16 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i16] addrspace(1)* @array_constant_i16, i32 0, i32 0
- %x = load i16 addrspace(1)* %i
- ret i16 %x
-}
-
-define ptx_device i32 @t4_const_u32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_constant_i32;
-;CHECK: ld.const.u32 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i32] addrspace(1)* @array_constant_i32, i32 0, i32 0
- %x = load i32 addrspace(1)* %i
- ret i32 %x
-}
-
-define ptx_device i64 @t4_const_u64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_constant_i64;
-;CHECK: ld.const.u64 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i64] addrspace(1)* @array_constant_i64, i32 0, i32 0
- %x = load i64 addrspace(1)* %i
- ret i64 %x
-}
-
-define ptx_device float @t4_const_f32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_constant_float;
-;CHECK: ld.const.f32 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x float] addrspace(1)* @array_constant_float, i32 0, i32 0
- %x = load float addrspace(1)* %i
- ret float %x
-}
-
-define ptx_device double @t4_const_f64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_constant_double;
-;CHECK: ld.const.f64 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x double] addrspace(1)* @array_constant_double, i32 0, i32 0
- %x = load double addrspace(1)* %i
- ret double %x
-}
-
-define ptx_device i16 @t4_shared_u16() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_i16;
-;CHECK: ld.shared.u16 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i16] addrspace(4)* @array_shared_i16, i32 0, i32 0
- %x = load i16 addrspace(4)* %i
- ret i16 %x
-}
-
-define ptx_device i32 @t4_shared_u32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_i32;
-;CHECK: ld.shared.u32 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i32] addrspace(4)* @array_shared_i32, i32 0, i32 0
- %x = load i32 addrspace(4)* %i
- ret i32 %x
-}
-
-define ptx_device i64 @t4_shared_u64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_i64;
-;CHECK: ld.shared.u64 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x i64] addrspace(4)* @array_shared_i64, i32 0, i32 0
- %x = load i64 addrspace(4)* %i
- ret i64 %x
-}
-
-define ptx_device float @t4_shared_f32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_float;
-;CHECK: ld.shared.f32 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0
- %x = load float addrspace(4)* %i
- ret float %x
-}
-
-define ptx_device double @t4_shared_f64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_double;
-;CHECK: ld.shared.f64 %ret{{[0-9]+}}, [%r[[R0]]];
-;CHECK: ret;
- %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0
- %x = load double addrspace(4)* %i
- ret double %x
-}
-
-define ptx_device i16 @t5_u16() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i16;
-;CHECK: ld.global.u16 %ret{{[0-9]+}}, [%r[[R0]]+2];
-;CHECK: ret;
- %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 1
- %x = load i16* %i
- ret i16 %x
-}
-
-define ptx_device i32 @t5_u32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i32;
-;CHECK: ld.global.u32 %ret{{[0-9]+}}, [%r[[R0]]+4];
-;CHECK: ret;
- %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 1
- %x = load i32* %i
- ret i32 %x
-}
-
-define ptx_device i64 @t5_u64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i64;
-;CHECK: ld.global.u64 %ret{{[0-9]+}}, [%r[[R0]]+8];
-;CHECK: ret;
- %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 1
- %x = load i64* %i
- ret i64 %x
-}
-
-define ptx_device float @t5_f32() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_float;
-;CHECK: ld.global.f32 %ret{{[0-9]+}}, [%r[[R0]]+4];
-;CHECK: ret;
- %i = getelementptr [10 x float]* @array_float, i32 0, i32 1
- %x = load float* %i
- ret float %x
-}
-
-define ptx_device double @t5_f64() {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_double;
-;CHECK: ld.global.f64 %ret{{[0-9]+}}, [%r[[R0]]+8];
-;CHECK: ret;
- %i = getelementptr [10 x double]* @array_double, i32 0, i32 1
- %x = load double* %i
- ret double %x
-}
diff --git a/test/CodeGen/PTX/llvm-intrinsic.ll b/test/CodeGen/PTX/llvm-intrinsic.ll
deleted file mode 100644
index e73ad25..0000000
--- a/test/CodeGen/PTX/llvm-intrinsic.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=+ptx20 | FileCheck %s
-
-define ptx_device float @test_sqrt_f32(float %x) {
-entry:
-; CHECK: sqrt.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %y = call float @llvm.sqrt.f32(float %x)
- ret float %y
-}
-
-define ptx_device double @test_sqrt_f64(double %x) {
-entry:
-; CHECK: sqrt.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %y = call double @llvm.sqrt.f64(double %x)
- ret double %y
-}
-
-define ptx_device float @test_sin_f32(float %x) {
-entry:
-; CHECK: sin.approx.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %y = call float @llvm.sin.f32(float %x)
- ret float %y
-}
-
-define ptx_device double @test_sin_f64(double %x) {
-entry:
-; CHECK: sin.approx.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %y = call double @llvm.sin.f64(double %x)
- ret double %y
-}
-
-define ptx_device float @test_cos_f32(float %x) {
-entry:
-; CHECK: cos.approx.f32 %ret{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %y = call float @llvm.cos.f32(float %x)
- ret float %y
-}
-
-define ptx_device double @test_cos_f64(double %x) {
-entry:
-; CHECK: cos.approx.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %y = call double @llvm.cos.f64(double %x)
- ret double %y
-}
-
-declare float @llvm.sqrt.f32(float)
-declare double @llvm.sqrt.f64(double)
-declare float @llvm.sin.f32(float)
-declare double @llvm.sin.f64(double)
-declare float @llvm.cos.f32(float)
-declare double @llvm.cos.f64(double)
diff --git a/test/CodeGen/PTX/mad-disabling.ll b/test/CodeGen/PTX/mad-disabling.ll
deleted file mode 100644
index 603c3ba..0000000
--- a/test/CodeGen/PTX/mad-disabling.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20 | FileCheck %s -check-prefix=FMA
-; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20,+no-fma | FileCheck %s -check-prefix=MUL
-; RUN: llc < %s -march=ptx64 -mattr=+ptx20,+sm20 | FileCheck %s -check-prefix=FMA
-; RUN: llc < %s -march=ptx64 -mattr=+ptx20,+sm20,+no-fma | FileCheck %s -check-prefix=MUL
-
-define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {
-entry:
-; FMA: mad.rn.f32
-; MUL: mul.rn.f32
-; MUL: add.rn.f32
- %a = fmul float %x, %y
- %b = fadd float %a, %z
- ret float %b
-}
-
-define ptx_device double @test_mul_add_d(double %x, double %y, double %z) {
-entry:
-; FMA: mad.rn.f64
-; MUL: mul.rn.f64
-; MUL: add.rn.f64
- %a = fmul double %x, %y
- %b = fadd double %a, %z
- ret double %b
-}
diff --git a/test/CodeGen/PTX/mad.ll b/test/CodeGen/PTX/mad.ll
deleted file mode 100644
index cc28e3f..0000000
--- a/test/CodeGen/PTX/mad.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=+sm13 | FileCheck %s
-
-define ptx_device float @t1_f32(float %x, float %y, float %z) {
-; CHECK: mad.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fmul float %x, %y
- %b = fadd float %a, %z
- ret float %b
-}
-
-define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: mad.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fmul double %x, %y
- %b = fadd double %a, %z
- ret double %b
-}
diff --git a/test/CodeGen/PTX/mov.ll b/test/CodeGen/PTX/mov.ll
deleted file mode 100644
index 9e501be..0000000
--- a/test/CodeGen/PTX/mov.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device i16 @t1_u16() {
-; CHECK: mov.u16 %ret{{[0-9]+}}, 0;
-; CHECK: ret;
- ret i16 0
-}
-
-define ptx_device i32 @t1_u32() {
-; CHECK: mov.u32 %ret{{[0-9]+}}, 0;
-; CHECK: ret;
- ret i32 0
-}
-
-define ptx_device i64 @t1_u64() {
-; CHECK: mov.u64 %ret{{[0-9]+}}, 0;
-; CHECK: ret;
- ret i64 0
-}
-
-define ptx_device float @t1_f32() {
-; CHECK: mov.f32 %ret{{[0-9]+}}, 0D0000000000000000;
-; CHECK: ret;
- ret float 0.0
-}
-
-define ptx_device double @t1_f64() {
-; CHECK: mov.f64 %ret{{[0-9]+}}, 0D0000000000000000;
-; CHECK: ret;
- ret double 0.0
-}
-
-define ptx_device i16 @t2_u16(i16 %x) {
-; CHECK: mov.b16 %ret{{[0-9]+}}, %arg{{[0-9]+}};
-; CHECK: ret;
- ret i16 %x
-}
-
-define ptx_device i32 @t2_u32(i32 %x) {
-; CHECK: mov.b32 %ret{{[0-9]+}}, %arg{{[0-9]+}};
-; CHECK: ret;
- ret i32 %x
-}
-
-define ptx_device i64 @t2_u64(i64 %x) {
-; CHECK: mov.b64 %ret{{[0-9]+}}, %arg{{[0-9]+}};
-; CHECK: ret;
- ret i64 %x
-}
-
-define ptx_device float @t3_f32(float %x) {
-; CHECK: mov.f32 %ret{{[0-9]+}}, %arg{{[0-9]+}};
-; CHECK: ret;
- ret float %x
-}
-
-define ptx_device double @t3_f64(double %x) {
-; CHECK: mov.f64 %ret{{[0-9]+}}, %arg{{[0-9]+}};
-; CHECK: ret;
- ret double %x
-}
-
diff --git a/test/CodeGen/PTX/mul.ll b/test/CodeGen/PTX/mul.ll
deleted file mode 100644
index 91949db..0000000
--- a/test/CodeGen/PTX/mul.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-;define ptx_device i32 @t1(i32 %x, i32 %y) {
-; %z = mul i32 %x, %y
-; ret i32 %z
-;}
-
-;define ptx_device i32 @t2(i32 %x) {
-; %z = mul i32 %x, 1
-; ret i32 %z
-;}
-
-define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: mul.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
-; CHECK: ret;
- %z = fmul float %x, %y
- ret float %z
-}
-
-define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: mul.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
-; CHECK: ret;
- %z = fmul double %x, %y
- ret double %z
-}
-
-define ptx_device float @t2_f32(float %x) {
-; CHECK: mul.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, 0D4014000000000000;
-; CHECK: ret;
- %z = fmul float %x, 5.0
- ret float %z
-}
-
-define ptx_device double @t2_f64(double %x) {
-; CHECK: mul.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, 0D4014000000000000;
-; CHECK: ret;
- %z = fmul double %x, 5.0
- ret double %z
-}
diff --git a/test/CodeGen/PTX/options.ll b/test/CodeGen/PTX/options.ll
deleted file mode 100644
index 0fb6602..0000000
--- a/test/CodeGen/PTX/options.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=ptx20 | grep ".version 2.0"
-; RUN: llc < %s -march=ptx32 -mattr=ptx21 | grep ".version 2.1"
-; RUN: llc < %s -march=ptx32 -mattr=ptx22 | grep ".version 2.2"
-; RUN: llc < %s -march=ptx32 -mattr=ptx23 | grep ".version 2.3"
-; RUN: llc < %s -march=ptx32 -mattr=sm10 | grep ".target sm_10"
-; RUN: llc < %s -march=ptx32 -mattr=sm13 | grep ".target sm_13"
-; RUN: llc < %s -march=ptx32 -mattr=sm20 | grep ".target sm_20"
-; RUN: llc < %s -march=ptx32 -mattr=ptx23 | grep ".address_size 32"
-; RUN: llc < %s -march=ptx64 -mattr=ptx23 | grep ".address_size 64"
-
-define ptx_device void @t1() {
- ret void
-}
diff --git a/test/CodeGen/PTX/parameter-order.ll b/test/CodeGen/PTX/parameter-order.ll
deleted file mode 100644
index 377f173..0000000
--- a/test/CodeGen/PTX/parameter-order.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-; CHECK: .func (.reg .b32 %ret{{[0-9]+}}) test_parameter_order (.reg .f32 %arg{{[0-9]+}}, .reg .b32 %arg{{[0-9]+}}, .reg .b32 %arg{{[0-9]+}}, .reg .f32 %arg{{[0-9]+}})
-define ptx_device i32 @test_parameter_order(float %a, i32 %b, i32 %c, float %d) {
-; CHECK: sub.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
- %result = sub i32 %b, %c
- ret i32 %result
-}
diff --git a/test/CodeGen/PTX/printf.ll b/test/CodeGen/PTX/printf.ll
deleted file mode 100644
index f901b20..0000000
--- a/test/CodeGen/PTX/printf.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=ptx64 -mattr=+ptx20,+sm20 | FileCheck %s
-
-declare i32 @printf(i8*, ...)
-
-@str = private unnamed_addr constant [6 x i8] c"test\0A\00"
-
-define ptx_device void @t1_printf() {
-; CHECK: mov.u64 %rd{{[0-9]+}}, $L__str;
-; CHECK: call.uni (__localparam_{{[0-9]+}}), vprintf, (__localparam_{{[0-9]+}}, __localparam_{{[0-9]+}});
-; CHECK: ret;
- %1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @str, i64 0, i64 0))
- ret void
-}
-
-@str2 = private unnamed_addr constant [11 x i8] c"test = %f\0A\00"
-
-define ptx_device void @t2_printf() {
-; CHECK: .local .align 8 .b8 __local{{[0-9]+}}[{{[0-9]+}}];
-; CHECK: mov.u64 %rd{{[0-9]+}}, $L__str2;
-; CHECK: cvta.local.u64 %rd{{[0-9]+}}, __local{{[0-9+]}};
-; CHECK: call.uni (__localparam_{{[0-9]+}}), vprintf, (__localparam_{{[0-9]+}}, __localparam_{{[0-9]+}});
-; CHECK: ret;
- %1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @str2, i64 0, i64 0), double 0x3FF3333340000000)
- ret void
-}
diff --git a/test/CodeGen/PTX/ret.ll b/test/CodeGen/PTX/ret.ll
deleted file mode 100644
index ba0523f..0000000
--- a/test/CodeGen/PTX/ret.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device void @t1() {
-; CHECK: ret;
-; CHECK-NOT: exit;
- ret void
-}
diff --git a/test/CodeGen/PTX/selp.ll b/test/CodeGen/PTX/selp.ll
deleted file mode 100644
index aa7ce85..0000000
--- a/test/CodeGen/PTX/selp.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device i32 @test_selp_i32(i1 %x, i32 %y, i32 %z) {
-; CHECK: selp.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}};
- %a = select i1 %x, i32 %y, i32 %z
- ret i32 %a
-}
-
-define ptx_device i64 @test_selp_i64(i1 %x, i64 %y, i64 %z) {
-; CHECK: selp.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}, %p{{[0-9]+}};
- %a = select i1 %x, i64 %y, i64 %z
- ret i64 %a
-}
-
-define ptx_device float @test_selp_f32(i1 %x, float %y, float %z) {
-; CHECK: selp.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %p{{[0-9]+}};
- %a = select i1 %x, float %y, float %z
- ret float %a
-}
-
-define ptx_device double @test_selp_f64(i1 %x, double %y, double %z) {
-; CHECK: selp.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %p{{[0-9]+}};
- %a = select i1 %x, double %y, double %z
- ret double %a
-}
diff --git a/test/CodeGen/PTX/setp.ll b/test/CodeGen/PTX/setp.ll
deleted file mode 100644
index 646abab..0000000
--- a/test/CodeGen/PTX/setp.ll
+++ /dev/null
@@ -1,206 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device i32 @test_setp_eq_u32_rr(i32 %x, i32 %y) {
-; CHECK: setp.eq.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp eq i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_ne_u32_rr(i32 %x, i32 %y) {
-; CHECK: setp.ne.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ne i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_lt_u32_rr(i32 %x, i32 %y) {
-; CHECK: setp.lt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ult i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_le_u32_rr(i32 %x, i32 %y) {
-; CHECK: setp.le.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ule i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_gt_u32_rr(i32 %x, i32 %y) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ugt i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_ge_u32_rr(i32 %x, i32 %y) {
-; CHECK: setp.ge.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp uge i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_lt_s32_rr(i32 %x, i32 %y) {
-; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp slt i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_le_s32_rr(i32 %x, i32 %y) {
-; CHECK: setp.le.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp sle i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_gt_s32_rr(i32 %x, i32 %y) {
-; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp sgt i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_ge_s32_rr(i32 %x, i32 %y) {
-; CHECK: setp.ge.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp sge i32 %x, %y
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_eq_u32_ri(i32 %x) {
-; CHECK: setp.eq.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp eq i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_ne_u32_ri(i32 %x) {
-; CHECK: setp.ne.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ne i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_lt_u32_ri(i32 %x) {
-; CHECK: setp.eq.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 0;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ult i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_le_u32_ri(i32 %x) {
-; CHECK: setp.lt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 2;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ule i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_gt_u32_ri(i32 %x) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp ugt i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_ge_u32_ri(i32 %x) {
-; CHECK: setp.ne.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 0;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp uge i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_lt_s32_ri(i32 %x) {
-; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp slt i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_le_s32_ri(i32 %x) {
-; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 2;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp sle i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_gt_s32_ri(i32 %x) {
-; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp sgt i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_ge_s32_ri(i32 %x) {
-; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 0;
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p[[P0]];
-; CHECK: ret;
- %p = icmp sge i32 %x, 1
- %z = zext i1 %p to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_4_op_format_1(i32 %x, i32 %y, i32 %u, i32 %v) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: setp.eq.and.u32 %p1, %r{{[0-9]+}}, %r{{[0-9]+}}, %p[[P0]];
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p1;
-; CHECK: ret;
- %c = icmp eq i32 %x, %y
- %d = icmp ugt i32 %u, %v
- %e = and i1 %c, %d
- %z = zext i1 %e to i32
- ret i32 %z
-}
-
-define ptx_device i32 @test_setp_4_op_format_2(i32 %x, i32 %y, i32 %w) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, 0;
-; CHECK: setp.eq.and.u32 %p1, %r{{[0-9]+}}, %r{{[0-9]+}}, !%p[[P0]];
-; CHECK: selp.u32 %ret{{[0-9]+}}, 1, 0, %p1;
-; CHECK: ret;
- %c = trunc i32 %w to i1
- %d = icmp eq i32 %x, %y
- %e = xor i1 %c, 1
- %f = and i1 %d, %e
- %z = zext i1 %f to i32
- ret i32 %z
-}
diff --git a/test/CodeGen/PTX/shl.ll b/test/CodeGen/PTX/shl.ll
deleted file mode 100644
index d9fe2cd..0000000
--- a/test/CodeGen/PTX/shl.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device i32 @t1(i32 %x, i32 %y) {
-; CHECK: shl.b32 %ret{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
- %z = shl i32 %x, %y
-; CHECK: ret;
- ret i32 %z
-}
-
-define ptx_device i32 @t2(i32 %x) {
-; CHECK: shl.b32 %ret{{[0-9]+}}, %r{{[0-9]+}}, 3
- %z = shl i32 %x, 3
-; CHECK: ret;
- ret i32 %z
-}
-
-define ptx_device i32 @t3(i32 %x) {
-; CHECK: shl.b32 %ret{{[0-9]+}}, 3, %r{{[0-9]+}}
- %z = shl i32 3, %x
-; CHECK: ret;
- ret i32 %z
-}
diff --git a/test/CodeGen/PTX/shr.ll b/test/CodeGen/PTX/shr.ll
deleted file mode 100644
index eb4666f..0000000
--- a/test/CodeGen/PTX/shr.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device i32 @t1(i32 %x, i32 %y) {
-; CHECK: shr.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
- %z = lshr i32 %x, %y
-; CHECK: ret;
- ret i32 %z
-}
-
-define ptx_device i32 @t2(i32 %x) {
-; CHECK: shr.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, 3
- %z = lshr i32 %x, 3
-; CHECK: ret;
- ret i32 %z
-}
-
-define ptx_device i32 @t3(i32 %x) {
-; CHECK: shr.u32 %ret{{[0-9]+}}, 3, %r{{[0-9]+}}
- %z = lshr i32 3, %x
-; CHECK: ret;
- ret i32 %z
-}
-
-define ptx_device i32 @t4(i32 %x, i32 %y) {
-; CHECK: shr.s32 %ret{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
- %z = ashr i32 %x, %y
-; CHECK: ret;
- ret i32 %z
-}
-
-define ptx_device i32 @t5(i32 %x) {
-; CHECK: shr.s32 %ret{{[0-9]+}}, %r{{[0-9]+}}, 3
- %z = ashr i32 %x, 3
-; CHECK: ret;
- ret i32 %z
-}
-
-define ptx_device i32 @t6(i32 %x) {
-; CHECK: shr.s32 %ret{{[0-9]+}}, -3, %r{{[0-9]+}}
- %z = ashr i32 -3, %x
-; CHECK: ret;
- ret i32 %z
-}
diff --git a/test/CodeGen/PTX/simple-call.ll b/test/CodeGen/PTX/simple-call.ll
deleted file mode 100644
index 77ea29e..0000000
--- a/test/CodeGen/PTX/simple-call.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=sm20 | FileCheck %s
-
-define ptx_device void @test_add(float %x, float %y) {
-; CHECK: ret;
- %z = fadd float %x, %y
- ret void
-}
-
-define ptx_device float @test_call(float %x, float %y) {
- %a = fadd float %x, %y
-; CHECK: call.uni test_add, (__localparam_{{[0-9]+}}, __localparam_{{[0-9]+}});
- call void @test_add(float %a, float %y)
- ret float %a
-}
-
-define ptx_device float @test_compute(float %x, float %y) {
-; CHECK: ret;
- %z = fadd float %x, %y
- ret float %z
-}
-
-define ptx_device float @test_call_compute(float %x, float %y) {
-; CHECK: call.uni (__localparam_{{[0-9]+}}), test_compute, (__localparam_{{[0-9]+}}, __localparam_{{[0-9]+}})
- %z = call float @test_compute(float %x, float %y)
- ret float %z
-}
-
diff --git a/test/CodeGen/PTX/st.ll b/test/CodeGen/PTX/st.ll
deleted file mode 100644
index c794363..0000000
--- a/test/CodeGen/PTX/st.ll
+++ /dev/null
@@ -1,337 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-;CHECK: .extern .global .b16 array_i16[10];
-@array_i16 = external global [10 x i16]
-
-;CHECK: .extern .const .b16 array_constant_i16[10];
-@array_constant_i16 = external addrspace(1) constant [10 x i16]
-
-;CHECK: .extern .shared .b16 array_shared_i16[10];
-@array_shared_i16 = external addrspace(4) global [10 x i16]
-
-;CHECK: .extern .global .b32 array_i32[10];
-@array_i32 = external global [10 x i32]
-
-;CHECK: .extern .const .b32 array_constant_i32[10];
-@array_constant_i32 = external addrspace(1) constant [10 x i32]
-
-;CHECK: .extern .shared .b32 array_shared_i32[10];
-@array_shared_i32 = external addrspace(4) global [10 x i32]
-
-;CHECK: .extern .global .b64 array_i64[10];
-@array_i64 = external global [10 x i64]
-
-;CHECK: .extern .const .b64 array_constant_i64[10];
-@array_constant_i64 = external addrspace(1) constant [10 x i64]
-
-;CHECK: .extern .shared .b64 array_shared_i64[10];
-@array_shared_i64 = external addrspace(4) global [10 x i64]
-
-;CHECK: .extern .global .b32 array_float[10];
-@array_float = external global [10 x float]
-
-;CHECK: .extern .const .b32 array_constant_float[10];
-@array_constant_float = external addrspace(1) constant [10 x float]
-
-;CHECK: .extern .shared .b32 array_shared_float[10];
-@array_shared_float = external addrspace(4) global [10 x float]
-
-;CHECK: .extern .global .b64 array_double[10];
-@array_double = external global [10 x double]
-
-;CHECK: .extern .const .b64 array_constant_double[10];
-@array_constant_double = external addrspace(1) constant [10 x double]
-
-;CHECK: .extern .shared .b64 array_shared_double[10];
-@array_shared_double = external addrspace(4) global [10 x double]
-
-
-define ptx_device void @t1_u16(i16* %p, i16 %x) {
-entry:
-;CHECK: st.global.u16 [%r{{[0-9]+}}], %rh{{[0-9]+}};
-;CHECK: ret;
- store i16 %x, i16* %p
- ret void
-}
-
-define ptx_device void @t1_u32(i32* %p, i32 %x) {
-entry:
-;CHECK: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}};
-;CHECK: ret;
- store i32 %x, i32* %p
- ret void
-}
-
-define ptx_device void @t1_u64(i64* %p, i64 %x) {
-entry:
-;CHECK: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}};
-;CHECK: ret;
- store i64 %x, i64* %p
- ret void
-}
-
-define ptx_device void @t1_f32(float* %p, float %x) {
-entry:
-;CHECK: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}};
-;CHECK: ret;
- store float %x, float* %p
- ret void
-}
-
-define ptx_device void @t1_f64(double* %p, double %x) {
-entry:
-;CHECK: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}};
-;CHECK: ret;
- store double %x, double* %p
- ret void
-}
-
-define ptx_device void @t2_u16(i16* %p, i16 %x) {
-entry:
-;CHECK: st.global.u16 [%r{{[0-9]+}}+2], %rh{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr i16* %p, i32 1
- store i16 %x, i16* %i
- ret void
-}
-
-define ptx_device void @t2_u32(i32* %p, i32 %x) {
-entry:
-;CHECK: st.global.u32 [%r{{[0-9]+}}+4], %r{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr i32* %p, i32 1
- store i32 %x, i32* %i
- ret void
-}
-
-define ptx_device void @t2_u64(i64* %p, i64 %x) {
-entry:
-;CHECK: st.global.u64 [%r{{[0-9]+}}+8], %rd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr i64* %p, i32 1
- store i64 %x, i64* %i
- ret void
-}
-
-define ptx_device void @t2_f32(float* %p, float %x) {
-entry:
-;CHECK: st.global.f32 [%r{{[0-9]+}}+4], %f{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr float* %p, i32 1
- store float %x, float* %i
- ret void
-}
-
-define ptx_device void @t2_f64(double* %p, double %x) {
-entry:
-;CHECK: st.global.f64 [%r{{[0-9]+}}+8], %fd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr double* %p, i32 1
- store double %x, double* %i
- ret void
-}
-
-define ptx_device void @t3_u16(i16* %p, i32 %q, i16 %x) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 1;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: st.global.u16 [%r{{[0-9]+}}], %rh{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr i16* %p, i32 %q
- store i16 %x, i16* %i
- ret void
-}
-
-define ptx_device void @t3_u32(i32* %p, i32 %q, i32 %x) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 2;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr i32* %p, i32 %q
- store i32 %x, i32* %i
- ret void
-}
-
-define ptx_device void @t3_u64(i64* %p, i32 %q, i64 %x) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 3;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr i64* %p, i32 %q
- store i64 %x, i64* %i
- ret void
-}
-
-define ptx_device void @t3_f32(float* %p, i32 %q, float %x) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 2;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr float* %p, i32 %q
- store float %x, float* %i
- ret void
-}
-
-define ptx_device void @t3_f64(double* %p, i32 %q, double %x) {
-entry:
-;CHECK: shl.b32 %r[[R0:[0-9]+]], %r{{[0-9]+}}, 3;
-;CHECK: add.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r[[R0]];
-;CHECK: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr double* %p, i32 %q
- store double %x, double* %i
- ret void
-}
-
-define ptx_device void @t4_global_u16(i16 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i16;
-;CHECK: st.global.u16 [%r[[R0]]], %rh{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i16]* @array_i16, i16 0, i16 0
- store i16 %x, i16* %i
- ret void
-}
-
-define ptx_device void @t4_global_u32(i32 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i32;
-;CHECK: st.global.u32 [%r[[R0]]], %r{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 0
- store i32 %x, i32* %i
- ret void
-}
-
-define ptx_device void @t4_global_u64(i64 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i64;
-;CHECK: st.global.u64 [%r[[R0]]], %rd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 0
- store i64 %x, i64* %i
- ret void
-}
-
-define ptx_device void @t4_global_f32(float %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_float;
-;CHECK: st.global.f32 [%r[[R0]]], %f{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x float]* @array_float, i32 0, i32 0
- store float %x, float* %i
- ret void
-}
-
-define ptx_device void @t4_global_f64(double %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_double;
-;CHECK: st.global.f64 [%r[[R0]]], %fd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x double]* @array_double, i32 0, i32 0
- store double %x, double* %i
- ret void
-}
-
-define ptx_device void @t4_shared_u16(i16 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_i16;
-;CHECK: st.shared.u16 [%r[[R0]]], %rh{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i16] addrspace(4)* @array_shared_i16, i32 0, i32 0
- store i16 %x, i16 addrspace(4)* %i
- ret void
-}
-
-define ptx_device void @t4_shared_u32(i32 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_i32;
-;CHECK: st.shared.u32 [%r[[R0]]], %r{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i32] addrspace(4)* @array_shared_i32, i32 0, i32 0
- store i32 %x, i32 addrspace(4)* %i
- ret void
-}
-
-define ptx_device void @t4_shared_u64(i64 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_i64;
-;CHECK: st.shared.u64 [%r[[R0]]], %rd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i64] addrspace(4)* @array_shared_i64, i32 0, i32 0
- store i64 %x, i64 addrspace(4)* %i
- ret void
-}
-
-define ptx_device void @t4_shared_f32(float %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_float;
-;CHECK: st.shared.f32 [%r[[R0]]], %f{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0
- store float %x, float addrspace(4)* %i
- ret void
-}
-
-define ptx_device void @t4_shared_f64(double %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_shared_double;
-;CHECK: st.shared.f64 [%r[[R0]]], %fd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0
- store double %x, double addrspace(4)* %i
- ret void
-}
-
-define ptx_device void @t5_u16(i16 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i16;
-;CHECK: st.global.u16 [%r[[R0]]+2], %rh{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 1
- store i16 %x, i16* %i
- ret void
-}
-
-define ptx_device void @t5_u32(i32 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i32;
-;CHECK: st.global.u32 [%r[[R0]]+4], %r{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 1
- store i32 %x, i32* %i
- ret void
-}
-
-define ptx_device void @t5_u64(i64 %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_i64;
-;CHECK: st.global.u64 [%r[[R0]]+8], %rd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 1
- store i64 %x, i64* %i
- ret void
-}
-
-define ptx_device void @t5_f32(float %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_float;
-;CHECK: st.global.f32 [%r[[R0]]+4], %f{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x float]* @array_float, i32 0, i32 1
- store float %x, float* %i
- ret void
-}
-
-define ptx_device void @t5_f64(double %x) {
-entry:
-;CHECK: mov.u32 %r[[R0:[0-9]+]], array_double;
-;CHECK: st.global.f64 [%r[[R0]]+8], %fd{{[0-9]+}};
-;CHECK: ret;
- %i = getelementptr [10 x double]* @array_double, i32 0, i32 1
- store double %x, double* %i
- ret void
-}
diff --git a/test/CodeGen/PTX/stack-object.ll b/test/CodeGen/PTX/stack-object.ll
deleted file mode 100644
index 65f8ee2..0000000
--- a/test/CodeGen/PTX/stack-object.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=ptx32 -mattr=sm20 | FileCheck %s
-
-define ptx_device float @stack1(float %a) {
- ; CHECK: .local .align 4 .b8 __local0[4];
- %a.2 = alloca float, align 4
- ; CHECK: st.local.f32 [__local0], %f0
- store float %a, float* %a.2
- %a.3 = load float* %a.2
- ret float %a.3
-}
-
-define ptx_device float @stack1_align8(float %a) {
- ; CHECK: .local .align 8 .b8 __local0[4];
- %a.2 = alloca float, align 8
- ; CHECK: st.local.f32 [__local0], %f0
- store float %a, float* %a.2
- %a.3 = load float* %a.2
- ret float %a.3
-}
diff --git a/test/CodeGen/PTX/sub.ll b/test/CodeGen/PTX/sub.ll
deleted file mode 100644
index 7ac886a..0000000
--- a/test/CodeGen/PTX/sub.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc < %s -march=ptx32 | FileCheck %s
-
-define ptx_device i16 @t1_u16(i16 %x, i16 %y) {
-; CHECK: sub.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}}, %rh{{[0-9]+}};
-; CHECK: ret;
- %z = sub i16 %x, %y
- ret i16 %z
-}
-
-define ptx_device i32 @t1_u32(i32 %x, i32 %y) {
-; CHECK: sub.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}};
-; CHECK: ret;
- %z = sub i32 %x, %y
- ret i32 %z
-}
-
-define ptx_device i64 @t1_u64(i64 %x, i64 %y) {
-; CHECK: sub.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}};
-; CHECK: ret;
- %z = sub i64 %x, %y
- ret i64 %z
-}
-
-define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: sub.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
-; CHECK: ret;
- %z = fsub float %x, %y
- ret float %z
-}
-
-define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: sub.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
-; CHECK: ret;
- %z = fsub double %x, %y
- ret double %z
-}
-
-define ptx_device i16 @t2_u16(i16 %x) {
-; CHECK: add.u16 %ret{{[0-9]+}}, %rh{{[0-9]+}}, -1;
-; CHECK: ret;
- %z = sub i16 %x, 1
- ret i16 %z
-}
-
-define ptx_device i32 @t2_u32(i32 %x) {
-; CHECK: add.u32 %ret{{[0-9]+}}, %r{{[0-9]+}}, -1;
-; CHECK: ret;
- %z = sub i32 %x, 1
- ret i32 %z
-}
-
-define ptx_device i64 @t2_u64(i64 %x) {
-; CHECK: add.u64 %ret{{[0-9]+}}, %rd{{[0-9]+}}, -1;
-; CHECK: ret;
- %z = sub i64 %x, 1
- ret i64 %z
-}
-
-define ptx_device float @t2_f32(float %x) {
-; CHECK: add.rn.f32 %ret{{[0-9]+}}, %f{{[0-9]+}}, 0DBFF0000000000000;
-; CHECK: ret;
- %z = fsub float %x, 1.0
- ret float %z
-}
-
-define ptx_device double @t2_f64(double %x) {
-; CHECK: add.rn.f64 %ret{{[0-9]+}}, %fd{{[0-9]+}}, 0DBFF0000000000000;
-; CHECK: ret;
- %z = fsub double %x, 1.0
- ret double %z
-}
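Everything under test/CodeGen/PTX above is removed wholesale, consistent with the experimental PTX backend being dropped upstream in favor of NVPTX around the time of this import. For a test that were carried forward rather than deleted, the RUN lines would migrate roughly as follows; this mapping is an assumption for illustration and is not taken from this commit:

; Removed PTX form:
; RUN: llc < %s -march=ptx32 | FileCheck %s
; NVPTX equivalent (assumed target/cpu spelling):
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s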
diff --git a/test/CodeGen/PowerPC/2005-09-02-LegalizeDuplicatesCalls.ll b/test/CodeGen/PowerPC/2005-09-02-LegalizeDuplicatesCalls.ll
index 5d1df46..4373660 100644
--- a/test/CodeGen/PowerPC/2005-09-02-LegalizeDuplicatesCalls.ll
+++ b/test/CodeGen/PowerPC/2005-09-02-LegalizeDuplicatesCalls.ll
@@ -1,7 +1,7 @@
; This function should have exactly one call to fixdfdi, no more!
; RUN: llc < %s -march=ppc32 -mattr=-64bit | \
-; RUN: grep {bl .*fixdfdi} | count 1
+; RUN: grep "bl .*fixdfdi" | count 1
define double @test2(double %tmp.7705) {
%mem_tmp.2.0.in = fptosi double %tmp.7705 to i64 ; <i64> [#uses=1]
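This and the following PowerPC updates rewrite grep patterns from the old Tcl-style brace quoting to double quotes, matching lit's shell-style RUN-line parsing; the patterns themselves are unchanged. The general shape, taken from the hunk above:

; RUN: ... | grep {bl .*fixdfdi} | count 1     old Tcl-style quoting
; RUN: ... | grep "bl .*fixdfdi" | count 1     shell-style quoting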
diff --git a/test/CodeGen/PowerPC/2006-01-11-darwin-fp-argument.ll b/test/CodeGen/PowerPC/2006-01-11-darwin-fp-argument.ll
index 97bb48e..aeb28af 100644
--- a/test/CodeGen/PowerPC/2006-01-11-darwin-fp-argument.ll
+++ b/test/CodeGen/PowerPC/2006-01-11-darwin-fp-argument.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | not grep {, f1}
+; RUN: llc < %s | not grep ", f1"
target datalayout = "E-p:32:32"
target triple = "powerpc-apple-darwin8.2.0"
diff --git a/test/CodeGen/PowerPC/2006-04-05-splat-ish.ll b/test/CodeGen/PowerPC/2006-04-05-splat-ish.ll
index 969772e..7e84538 100644
--- a/test/CodeGen/PowerPC/2006-04-05-splat-ish.ll
+++ b/test/CodeGen/PowerPC/2006-04-05-splat-ish.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g5 | \
-; RUN: grep {vspltish v.*, 10}
+; RUN: grep "vspltish v.*, 10"
define void @test(<8 x i16>* %P) {
%tmp = load <8 x i16>* %P ; <<8 x i16>> [#uses=1]
diff --git a/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll b/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll
index 86fd947..73736c5 100644
--- a/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll
+++ b/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 | grep {foo r3, r4}
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 | grep {bari r3, 47}
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 | grep "foo r3, r4"
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 | grep "bari r3, 47"
; PR1351
diff --git a/test/CodeGen/PowerPC/2007-04-30-InlineAsmEarlyClobber.ll b/test/CodeGen/PowerPC/2007-04-30-InlineAsmEarlyClobber.ll
index 3489477..53231b4 100644
--- a/test/CodeGen/PowerPC/2007-04-30-InlineAsmEarlyClobber.ll
+++ b/test/CodeGen/PowerPC/2007-04-30-InlineAsmEarlyClobber.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s | FileCheck %s
-; RUN: llc < %s -regalloc=fast | FileCheck %s
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; The first argument of subfc must not be the same as any other register.
; CHECK: subfc [[REG:r.]],
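Several RUN lines below make the same change as this one; the pattern suggests that -regalloc=fast alone no longer disables the optimizing allocation pipeline, so -optimize-regalloc=0 is added alongside it to keep exercising the fast, non-optimizing path. A hedged sketch of the combined invocation:

; Select the fast allocator and explicitly switch off the optimizing
; register-allocation pipeline (both flags together restore the old behavior):
; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 | FileCheck %s
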
diff --git a/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll b/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll
index c141551..382ba1f 100644
--- a/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll
+++ b/test/CodeGen/PowerPC/2007-05-22-tailmerge-3.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=ppc32 | grep bl.*baz | count 2
-; RUN: llc < %s -march=ppc32 | grep bl.*quux | count 2
-; RUN: llc < %s -march=ppc32 -enable-tail-merge | grep bl.*baz | count 1
-; RUN: llc < %s -march=ppc32 -enable-tail-merge=1 | grep bl.*quux | count 1
-; Check that tail merging is not the default on ppc, and that -enable-tail-merge works.
+; RUN: llc < %s -march=ppc32 -enable-tail-merge=0 | grep bl.*baz | count 2
+; RUN: llc < %s -march=ppc32 -enable-tail-merge=0 | grep bl.*quux | count 2
+; RUN: llc < %s -march=ppc32 | grep bl.*baz | count 1
+; RUN: llc < %s -march=ppc32 | grep bl.*quux | count 1
+; Check that tail merging is the default on ppc, and that -enable-tail-merge works.
; ModuleID = 'tail.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll b/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll
index 72e93a9..b85792c 100644
--- a/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll
+++ b/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll
@@ -1,7 +1,7 @@
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "powerpc-apple-darwin8.8.0"
-; RUN: llc < %s -march=ppc32 | grep {rlwinm r3, r3, 23, 30, 30}
+; RUN: llc < %s -march=ppc32 | grep "rlwinm r3, r3, 23, 30, 30"
; PR1473
define zeroext i8 @foo(i16 zeroext %a) {
diff --git a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll
index 556a4a1c40..a60d11c 100644
--- a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll
+++ b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64-apple-darwin9 -regalloc=fast -relocation-model=pic
+; RUN: llc < %s -mtriple=powerpc64-apple-darwin9 -regalloc=fast -optimize-regalloc=0 -relocation-model=pic
%struct.NSError = type opaque
%struct.NSManagedObjectContext = type opaque
diff --git a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll
index b3b9280..3d1a328 100644
--- a/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll
+++ b/test/CodeGen/PowerPC/2007-10-21-LocalRegAllocAssert2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64-apple-darwin9 -regalloc=fast -relocation-model=pic
+; RUN: llc < %s -mtriple=powerpc64-apple-darwin9 -regalloc=fast -optimize-regalloc=0 -relocation-model=pic
%struct.NSError = type opaque
%struct.NSManagedObjectContext = type opaque
diff --git a/test/CodeGen/PowerPC/2008-02-09-LocalRegAllocAssert.ll b/test/CodeGen/PowerPC/2008-02-09-LocalRegAllocAssert.ll
index e03bd9e..e28a3e0 100644
--- a/test/CodeGen/PowerPC/2008-02-09-LocalRegAllocAssert.ll
+++ b/test/CodeGen/PowerPC/2008-02-09-LocalRegAllocAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -regalloc=fast
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -regalloc=fast -optimize-regalloc=0
define i32 @bork(i64 %foo, i64 %bar) {
entry:
diff --git a/test/CodeGen/PowerPC/2009-08-17-inline-asm-addr-mode-breakage.ll b/test/CodeGen/PowerPC/2009-08-17-inline-asm-addr-mode-breakage.ll
index 6a3c440..84aa40c 100644
--- a/test/CodeGen/PowerPC/2009-08-17-inline-asm-addr-mode-breakage.ll
+++ b/test/CodeGen/PowerPC/2009-08-17-inline-asm-addr-mode-breakage.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin10 -mcpu=g5 | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin10 -mcpu=g5 -disable-ppc-ilp-pref | FileCheck %s
; ModuleID = '<stdin>'
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
target triple = "powerpc-apple-darwin10.0"
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index 6b31397..0003a17 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5 -mtriple=powerpc-apple-darwin10.0 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mcpu=g5 -mtriple=powerpc-apple-darwin10.0 | FileCheck %s
; ModuleID = 'nn.c'
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
target triple = "powerpc-apple-darwin11.0"
@@ -9,7 +9,9 @@ target triple = "powerpc-apple-darwin11.0"
define void @foo() nounwind ssp {
entry:
-; CHECK: mtctr r12
+; Better: mtctr r12
+; CHECK: mr r12, [[REG:r[0-9]+]]
+; CHECK: mtctr [[REG]]
%0 = load void (...)** @p, align 4 ; <void (...)*> [#uses=1]
call void (...)* %0() nounwind
br label %return
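The updated CHECK lines use a FileCheck pattern variable: [[REG:r[0-9]+]] captures whichever register receives the copy of r12, and the bare [[REG]] reference requires the mtctr to use that same register. The "Better:" comment records that branching through r12 directly would be the preferred codegen. The capture-and-reuse idiom in isolation:

; Capture a register name on first match, then require it to be reused:
; CHECK: mr r12, [[REG:r[0-9]+]]
; CHECK: mtctr [[REG]]
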
diff --git a/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll b/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
index 6161b55..47d985c 100644
--- a/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
+++ b/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=g4 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=g4 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=g4 -disable-ppc-ilp-pref | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=g4 -disable-ppc-ilp-pref | FileCheck %s
; ModuleID = 'tsc.c'
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
diff --git a/test/CodeGen/PowerPC/Frames-leaf.ll b/test/CodeGen/PowerPC/Frames-leaf.ll
index c2e1d6b..7b1c464 100644
--- a/test/CodeGen/PowerPC/Frames-leaf.ll
+++ b/test/CodeGen/PowerPC/Frames-leaf.ll
@@ -1,35 +1,35 @@
; RUN: llc < %s -march=ppc32 | \
-; RUN: not grep {stw r31, 20(r1)}
+; RUN: not grep "stw r31, 20(r1)"
; RUN: llc < %s -march=ppc32 | \
-; RUN: not grep {stwu r1, -.*(r1)}
+; RUN: not grep "stwu r1, -.*(r1)"
; RUN: llc < %s -march=ppc32 | \
-; RUN: not grep {addi r1, r1, }
+; RUN: not grep "addi r1, r1, "
; RUN: llc < %s -march=ppc32 | \
-; RUN: not grep {lwz r31, 20(r1)}
+; RUN: not grep "lwz r31, 20(r1)"
; RUN: llc < %s -march=ppc32 -disable-fp-elim | \
-; RUN: not grep {stw r31, 20(r1)}
+; RUN: not grep "stw r31, 20(r1)"
; RUN: llc < %s -march=ppc32 -disable-fp-elim | \
-; RUN: not grep {stwu r1, -.*(r1)}
+; RUN: not grep "stwu r1, -.*(r1)"
; RUN: llc < %s -march=ppc32 -disable-fp-elim | \
-; RUN: not grep {addi r1, r1, }
+; RUN: not grep "addi r1, r1, "
; RUN: llc < %s -march=ppc32 -disable-fp-elim | \
-; RUN: not grep {lwz r31, 20(r1)}
+; RUN: not grep "lwz r31, 20(r1)"
; RUN: llc < %s -march=ppc64 | \
-; RUN: not grep {std r31, 40(r1)}
+; RUN: not grep "std r31, 40(r1)"
; RUN: llc < %s -march=ppc64 | \
-; RUN: not grep {stdu r1, -.*(r1)}
+; RUN: not grep "stdu r1, -.*(r1)"
; RUN: llc < %s -march=ppc64 | \
-; RUN: not grep {addi r1, r1, }
+; RUN: not grep "addi r1, r1, "
; RUN: llc < %s -march=ppc64 | \
-; RUN: not grep {ld r31, 40(r1)}
+; RUN: not grep "ld r31, 40(r1)"
; RUN: llc < %s -march=ppc64 -disable-fp-elim | \
-; RUN: not grep {stw r31, 40(r1)}
+; RUN: not grep "stw r31, 40(r1)"
; RUN: llc < %s -march=ppc64 -disable-fp-elim | \
-; RUN: not grep {stdu r1, -.*(r1)}
+; RUN: not grep "stdu r1, -.*(r1)"
; RUN: llc < %s -march=ppc64 -disable-fp-elim | \
-; RUN: not grep {addi r1, r1, }
+; RUN: not grep "addi r1, r1, "
; RUN: llc < %s -march=ppc64 -disable-fp-elim | \
-; RUN: not grep {ld r31, 40(r1)}
+; RUN: not grep "ld r31, 40(r1)"
define i32* @f1() {
%tmp = alloca i32, i32 2 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/PowerPC/Frames-small.ll b/test/CodeGen/PowerPC/Frames-small.ll
index ecd5ecd..0f6bd10 100644
--- a/test/CodeGen/PowerPC/Frames-small.ll
+++ b/test/CodeGen/PowerPC/Frames-small.ll
@@ -1,26 +1,26 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -o %t1
-; RUN: not grep {stw r31, -4(r1)} %t1
-; RUN: grep {stwu r1, -16448(r1)} %t1
-; RUN: grep {addi r1, r1, 16448} %t1
+; RUN: not grep "stw r31, -4(r1)" %t1
+; RUN: grep "stwu r1, -16448(r1)" %t1
+; RUN: grep "addi r1, r1, 16448" %t1
; RUN: llc < %s -march=ppc32 | \
-; RUN: not grep {lwz r31, -4(r1)}
+; RUN: not grep "lwz r31, -4(r1)"
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim \
; RUN: -o %t2
-; RUN: grep {stw r31, -4(r1)} %t2
-; RUN: grep {stwu r1, -16448(r1)} %t2
-; RUN: grep {addi r1, r1, 16448} %t2
-; RUN: grep {lwz r31, -4(r1)} %t2
+; RUN: grep "stw r31, -4(r1)" %t2
+; RUN: grep "stwu r1, -16448(r1)" %t2
+; RUN: grep "addi r1, r1, 16448" %t2
+; RUN: grep "lwz r31, -4(r1)" %t2
; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -o %t3
-; RUN: not grep {std r31, -8(r1)} %t3
-; RUN: grep {stdu r1, -16496(r1)} %t3
-; RUN: grep {addi r1, r1, 16496} %t3
-; RUN: not grep {ld r31, -8(r1)} %t3
+; RUN: not grep "std r31, -8(r1)" %t3
+; RUN: grep "stdu r1, -16496(r1)" %t3
+; RUN: grep "addi r1, r1, 16496" %t3
+; RUN: not grep "ld r31, -8(r1)" %t3
; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -disable-fp-elim \
; RUN: -o %t4
-; RUN: grep {std r31, -8(r1)} %t4
-; RUN: grep {stdu r1, -16512(r1)} %t4
-; RUN: grep {addi r1, r1, 16512} %t4
-; RUN: grep {ld r31, -8(r1)} %t4
+; RUN: grep "std r31, -8(r1)" %t4
+; RUN: grep "stdu r1, -16512(r1)" %t4
+; RUN: grep "addi r1, r1, 16512" %t4
+; RUN: grep "ld r31, -8(r1)" %t4
define i32* @f1() {
%tmp = alloca i32, i32 4095 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/PowerPC/LargeAbsoluteAddr.ll b/test/CodeGen/PowerPC/LargeAbsoluteAddr.ll
index 7b0d69c..6f985c8 100644
--- a/test/CodeGen/PowerPC/LargeAbsoluteAddr.ll
+++ b/test/CodeGen/PowerPC/LargeAbsoluteAddr.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin | \
-; RUN: grep {stw r4, 32751}
+; RUN: grep "stw r4, 32751"
; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin | \
-; RUN: grep {stw r4, 32751}
+; RUN: grep "stw r4, 32751"
; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin | \
-; RUN: grep {std r4, 9024}
+; RUN: grep "std r4, 9024"
define void @test() nounwind {
store i32 0, i32* inttoptr (i64 48725999 to i32*)
diff --git a/test/CodeGen/PowerPC/a2-fp-basic.ll b/test/CodeGen/PowerPC/a2-fp-basic.ll
index 932ad7a..de3aa7c 100644
--- a/test/CodeGen/PowerPC/a2-fp-basic.ll
+++ b/test/CodeGen/PowerPC/a2-fp-basic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck %s
+; RUN: llc < %s -march=ppc64 -mcpu=a2 -fp-contract=fast | FileCheck %s
%0 = type { double, double }
diff --git a/test/CodeGen/PowerPC/and-imm.ll b/test/CodeGen/PowerPC/and-imm.ll
index 64a45e5..6fd484b 100644
--- a/test/CodeGen/PowerPC/and-imm.ll
+++ b/test/CodeGen/PowerPC/and-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 | not grep {ori\\|lis}
+; RUN: llc < %s -march=ppc32 | not grep "ori\|lis"
; andi. r3, r3, 32769
define i32 @test(i32 %X) {
diff --git a/test/CodeGen/PowerPC/big-endian-actual-args.ll b/test/CodeGen/PowerPC/big-endian-actual-args.ll
index 009f468..898ad7c 100644
--- a/test/CodeGen/PowerPC/big-endian-actual-args.ll
+++ b/test/CodeGen/PowerPC/big-endian-actual-args.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN: grep {addc 4, 4, 6}
+; RUN: grep "addc 4, 4, 6"
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN: grep {adde 3, 3, 5}
+; RUN: grep "adde 3, 3, 5"
define i64 @foo(i64 %x, i64 %y) {
%z = add i64 %x, %y
diff --git a/test/CodeGen/PowerPC/big-endian-call-result.ll b/test/CodeGen/PowerPC/big-endian-call-result.ll
index fe85404..760833c 100644
--- a/test/CodeGen/PowerPC/big-endian-call-result.ll
+++ b/test/CodeGen/PowerPC/big-endian-call-result.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN: grep {addic 4, 4, 1}
+; RUN: grep "addic 4, 4, 1"
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN: grep {addze 3, 3}
+; RUN: grep "addze 3, 3"
declare i64 @foo()
diff --git a/test/CodeGen/PowerPC/branch-opt.ll b/test/CodeGen/PowerPC/branch-opt.ll
index cc02e40..dda1538 100644
--- a/test/CodeGen/PowerPC/branch-opt.ll
+++ b/test/CodeGen/PowerPC/branch-opt.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=ppc32 | \
-; RUN: grep {b LBB.*} | count 4
+; RUN: grep "b LBB.*" | count 4
target datalayout = "E-p:32:32"
target triple = "powerpc-apple-darwin8.7.0"
diff --git a/test/CodeGen/PowerPC/calls.ll b/test/CodeGen/PowerPC/calls.ll
index 29bcb20..dcdda57 100644
--- a/test/CodeGen/PowerPC/calls.ll
+++ b/test/CodeGen/PowerPC/calls.ll
@@ -1,11 +1,11 @@
; Test various forms of calls.
; RUN: llc < %s -march=ppc32 | \
-; RUN: grep {bl } | count 1
+; RUN: grep "bl " | count 1
; RUN: llc < %s -march=ppc32 | \
-; RUN: grep {bctrl} | count 1
+; RUN: grep "bctrl" | count 1
; RUN: llc < %s -march=ppc32 | \
-; RUN: grep {bla } | count 1
+; RUN: grep "bla " | count 1
declare void @foo()
diff --git a/test/CodeGen/PowerPC/coalesce-ext.ll b/test/CodeGen/PowerPC/coalesce-ext.ll
new file mode 100644
index 0000000..cc80f83
--- /dev/null
+++ b/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=ppc64 -mtriple=powerpc64-apple-darwin < %s | FileCheck %s
+; Check that the peephole optimizer knows about sext and zext instructions.
+; CHECK: test1sext
+define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
+ %C = add i64 %A, %B
+ ; CHECK: add [[SUM:r[0-9]+]], r3, r4
+ %D = trunc i64 %C to i32
+ %E = shl i64 %C, 32
+ %F = ashr i64 %E, 32
+ ; CHECK: extsw [[EXT:r[0-9]+]], [[SUM]]
+ store volatile i64 %F, i64 *%P2
+ ; CHECK: std [[EXT]]
+ store volatile i32 %D, i32* %P
+ ; Reuse the low bits of the extended register; don't extend the live range of SUM.

+ ; CHECK: stw [[EXT]]
+ ret i32 %D
+}
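The new test pins down the peephole optimizer's knowledge that extsw leaves the low 32 bits intact, so the truncated store can reuse the extended register instead of keeping the full add result live. A hypothetical zero-extension analogue of the same idea (not part of this import):

define i64 @test1zext(i64 %A, i64 %B, i32* %P, i64* %P2) nounwind {
  %C = add i64 %A, %B
  %D = trunc i64 %C to i32
  ; Masking with 0xFFFFFFFF is the zext of the truncated low word.
  %E = and i64 %C, 4294967295
  store volatile i64 %E, i64* %P2
  ; The low word of the masked register already holds %D, so it can be reused.
  store volatile i32 %D, i32* %P
  ret i64 %E
}
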
diff --git a/test/CodeGen/PowerPC/compare-simm.ll b/test/CodeGen/PowerPC/compare-simm.ll
index 92d1dbe..94c5c02 100644
--- a/test/CodeGen/PowerPC/compare-simm.ll
+++ b/test/CodeGen/PowerPC/compare-simm.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | \
-; RUN: grep {cmpwi cr0, r3, -1}
+; RUN: grep "cmpwi cr0, r3, -1"
define i32 @test(i32 %x) nounwind {
%c = icmp eq i32 %x, -1
diff --git a/test/CodeGen/PowerPC/constants.ll b/test/CodeGen/PowerPC/constants.ll
index 8901e02..9efca91 100644
--- a/test/CodeGen/PowerPC/constants.ll
+++ b/test/CodeGen/PowerPC/constants.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -march=ppc32 | \
; RUN: grep ori | count 3
; RUN: llc < %s -march=ppc32 | \
-; RUN: grep {li } | count 4
+; RUN: grep "li " | count 4
define i32 @f1() {
entry:
diff --git a/test/CodeGen/PowerPC/ctrloop-reg.ll b/test/CodeGen/PowerPC/ctrloop-reg.ll
new file mode 100644
index 0000000..874e571
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-reg.ll
@@ -0,0 +1,87 @@
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211 = type { %union.v.0.48.90.114.120.138.144.150.156.162.168.174.180.210, i16, i16 }
+%union.v.0.48.90.114.120.138.144.150.156.162.168.174.180.210 = type { i64 }
+%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215 = type { i8*, i8*, i8*, i32, i8, i8, i64, %struct.stream_procs.2.50.92.116.122.140.146.152.158.164.170.176.182.212, i32, %struct._IO_FILE.4.52.94.118.124.142.148.154.160.166.172.178.184.214*, %struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*, i16, i32 }
+%struct.stream_procs.2.50.92.116.122.140.146.152.158.164.170.176.182.212 = type { i32 (%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*)*, i32 (%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*, i8)*, i32 (%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*, i64*)*, i32 (%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*, i64)*, i32 (%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*)*, i32 (%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*)* }
+%struct._IO_FILE.4.52.94.118.124.142.148.154.160.166.172.178.184.214 = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker.3.51.93.117.123.141.147.153.159.165.171.177.183.213*, %struct._IO_FILE.4.52.94.118.124.142.148.154.160.166.172.178.184.214*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker.3.51.93.117.123.141.147.153.159.165.171.177.183.213 = type { %struct._IO_marker.3.51.93.117.123.141.147.153.159.165.171.177.183.213*, %struct._IO_FILE.4.52.94.118.124.142.148.154.160.166.172.178.184.214*, i32 }
+
+@special_ops = external global [7 x i32 (%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)*], align 8
+@ostack = external global [520 x %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211], align 8
+@osbot = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, align 8
+@osp = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, align 8
+@ostop = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, align 8
+@osp_nargs = external global [6 x %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*], align 8
+@estack = external global [150 x %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211], align 8
+@esp = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, align 8
+@estop = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, align 8
+@dstack = external global [20 x %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211], align 8
+@dsp = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, align 8
+@dstop = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, align 8
+@name_errordict = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211
+@name_ErrorNames = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211
+@error_object = external global %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211, align 8
+
+declare i32 @zadd(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)
+
+declare i32 @zdup(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)
+
+declare i32 @zexch(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)
+
+declare i32 @zifelse(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)
+
+declare i32 @zle(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)
+
+declare i32 @zpop(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)
+
+declare i32 @zsub(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*)
+
+declare void @interp_init(i32) nounwind
+
+declare void @interp_fix_op(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211* nocapture) nounwind
+
+define i32 @interpret(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211* %pref, i32 %user_errors) nounwind {
+entry:
+ %erref = alloca %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211, align 8
+ br i1 undef, label %retry.us, label %retry
+
+retry.us: ; preds = %if.end18, %retry, %retry, %retry, %retry, %entry
+ ret i32 undef
+
+retry: ; preds = %if.end18, %entry
+ %0 = phi %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211* [ null, %entry ], [ %erref, %if.end18 ]
+ %call = call i32 @interp(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211* %0)
+ switch i32 %call, label %if.end18 [
+ i32 -3, label %retry.us
+ i32 -5, label %retry.us
+ i32 -16, label %retry.us
+ i32 -25, label %retry.us
+ ]
+
+if.end18: ; preds = %retry
+ br i1 false, label %retry.us, label %retry
+}
+
+; CHECK: @interpret
+
+declare i32 @interp_exit(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211* nocapture) nounwind readnone
+
+declare i32 @interp(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*) nounwind
+
+declare i32 @dict_lookup(%struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211*, %struct.ref_s.1.49.91.115.121.139.145.151.157.163.169.175.181.211**)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+declare i32 @obj_compare(...)
+
+declare i32 @file_check_read(...)
+
+declare i32 @scan_token(...)
+
+declare i32 @file_close(...)
+
+declare void @sread_string(%struct.stream_s.5.53.95.119.125.143.149.155.161.167.173.179.185.215*, i8*, i32)
diff --git a/test/CodeGen/PowerPC/ctrloop-s000.ll b/test/CodeGen/PowerPC/ctrloop-s000.ll
new file mode 100644
index 0000000..dcea06f
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-s000.ll
@@ -0,0 +1,156 @@
+; ModuleID = 'tsc_s000.c'
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@Y = common global [16000 x double] zeroinitializer, align 32
+@X = common global [16000 x double] zeroinitializer, align 32
+@Z = common global [16000 x double] zeroinitializer, align 32
+@U = common global [16000 x double] zeroinitializer, align 32
+@V = common global [16000 x double] zeroinitializer, align 32
+@aa = common global [256 x [256 x double]] zeroinitializer, align 32
+@bb = common global [256 x [256 x double]] zeroinitializer, align 32
+@cc = common global [256 x [256 x double]] zeroinitializer, align 32
+@array = common global [65536 x double] zeroinitializer, align 32
+@x = common global [16000 x double] zeroinitializer, align 32
+@temp = common global double 0.000000e+00, align 8
+@temp_int = common global i32 0, align 4
+@a = common global [16000 x double] zeroinitializer, align 32
+@b = common global [16000 x double] zeroinitializer, align 32
+@c = common global [16000 x double] zeroinitializer, align 32
+@d = common global [16000 x double] zeroinitializer, align 32
+@e = common global [16000 x double] zeroinitializer, align 32
+@tt = common global [256 x [256 x double]] zeroinitializer, align 32
+@indx = common global [16000 x i32] zeroinitializer, align 32
+@xx = common global double* null, align 8
+@yy = common global double* null, align 8
+
+define i32 @s000() nounwind {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.end, %entry
+ %nl.010 = phi i32 [ 0, %entry ], [ %inc7, %for.end ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next.15, %for.body3 ]
+ %arrayidx = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv
+ %0 = load double* %arrayidx, align 32, !tbaa !0
+ %add = fadd double %0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv
+ store double %add, double* %arrayidx5, align 32, !tbaa !0
+ %indvars.iv.next11 = or i64 %indvars.iv, 1
+ %arrayidx.1 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next11
+ %1 = load double* %arrayidx.1, align 8, !tbaa !0
+ %add.1 = fadd double %1, 1.000000e+00
+ %arrayidx5.1 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next11
+ store double %add.1, double* %arrayidx5.1, align 8, !tbaa !0
+ %indvars.iv.next.112 = or i64 %indvars.iv, 2
+ %arrayidx.2 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.112
+ %2 = load double* %arrayidx.2, align 16, !tbaa !0
+ %add.2 = fadd double %2, 1.000000e+00
+ %arrayidx5.2 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.112
+ store double %add.2, double* %arrayidx5.2, align 16, !tbaa !0
+ %indvars.iv.next.213 = or i64 %indvars.iv, 3
+ %arrayidx.3 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.213
+ %3 = load double* %arrayidx.3, align 8, !tbaa !0
+ %add.3 = fadd double %3, 1.000000e+00
+ %arrayidx5.3 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.213
+ store double %add.3, double* %arrayidx5.3, align 8, !tbaa !0
+ %indvars.iv.next.314 = or i64 %indvars.iv, 4
+ %arrayidx.4 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.314
+ %4 = load double* %arrayidx.4, align 32, !tbaa !0
+ %add.4 = fadd double %4, 1.000000e+00
+ %arrayidx5.4 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.314
+ store double %add.4, double* %arrayidx5.4, align 32, !tbaa !0
+ %indvars.iv.next.415 = or i64 %indvars.iv, 5
+ %arrayidx.5 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.415
+ %5 = load double* %arrayidx.5, align 8, !tbaa !0
+ %add.5 = fadd double %5, 1.000000e+00
+ %arrayidx5.5 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.415
+ store double %add.5, double* %arrayidx5.5, align 8, !tbaa !0
+ %indvars.iv.next.516 = or i64 %indvars.iv, 6
+ %arrayidx.6 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.516
+ %6 = load double* %arrayidx.6, align 16, !tbaa !0
+ %add.6 = fadd double %6, 1.000000e+00
+ %arrayidx5.6 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.516
+ store double %add.6, double* %arrayidx5.6, align 16, !tbaa !0
+ %indvars.iv.next.617 = or i64 %indvars.iv, 7
+ %arrayidx.7 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.617
+ %7 = load double* %arrayidx.7, align 8, !tbaa !0
+ %add.7 = fadd double %7, 1.000000e+00
+ %arrayidx5.7 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.617
+ store double %add.7, double* %arrayidx5.7, align 8, !tbaa !0
+ %indvars.iv.next.718 = or i64 %indvars.iv, 8
+ %arrayidx.8 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.718
+ %8 = load double* %arrayidx.8, align 32, !tbaa !0
+ %add.8 = fadd double %8, 1.000000e+00
+ %arrayidx5.8 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.718
+ store double %add.8, double* %arrayidx5.8, align 32, !tbaa !0
+ %indvars.iv.next.819 = or i64 %indvars.iv, 9
+ %arrayidx.9 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.819
+ %9 = load double* %arrayidx.9, align 8, !tbaa !0
+ %add.9 = fadd double %9, 1.000000e+00
+ %arrayidx5.9 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.819
+ store double %add.9, double* %arrayidx5.9, align 8, !tbaa !0
+ %indvars.iv.next.920 = or i64 %indvars.iv, 10
+ %arrayidx.10 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.920
+ %10 = load double* %arrayidx.10, align 16, !tbaa !0
+ %add.10 = fadd double %10, 1.000000e+00
+ %arrayidx5.10 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.920
+ store double %add.10, double* %arrayidx5.10, align 16, !tbaa !0
+ %indvars.iv.next.1021 = or i64 %indvars.iv, 11
+ %arrayidx.11 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1021
+ %11 = load double* %arrayidx.11, align 8, !tbaa !0
+ %add.11 = fadd double %11, 1.000000e+00
+ %arrayidx5.11 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1021
+ store double %add.11, double* %arrayidx5.11, align 8, !tbaa !0
+ %indvars.iv.next.1122 = or i64 %indvars.iv, 12
+ %arrayidx.12 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1122
+ %12 = load double* %arrayidx.12, align 32, !tbaa !0
+ %add.12 = fadd double %12, 1.000000e+00
+ %arrayidx5.12 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1122
+ store double %add.12, double* %arrayidx5.12, align 32, !tbaa !0
+ %indvars.iv.next.1223 = or i64 %indvars.iv, 13
+ %arrayidx.13 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1223
+ %13 = load double* %arrayidx.13, align 8, !tbaa !0
+ %add.13 = fadd double %13, 1.000000e+00
+ %arrayidx5.13 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1223
+ store double %add.13, double* %arrayidx5.13, align 8, !tbaa !0
+ %indvars.iv.next.1324 = or i64 %indvars.iv, 14
+ %arrayidx.14 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1324
+ %14 = load double* %arrayidx.14, align 16, !tbaa !0
+ %add.14 = fadd double %14, 1.000000e+00
+ %arrayidx5.14 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1324
+ store double %add.14, double* %arrayidx5.14, align 16, !tbaa !0
+ %indvars.iv.next.1425 = or i64 %indvars.iv, 15
+ %arrayidx.15 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1425
+ %15 = load double* %arrayidx.15, align 8, !tbaa !0
+ %add.15 = fadd double %15, 1.000000e+00
+ %arrayidx5.15 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1425
+ store double %add.15, double* %arrayidx5.15, align 8, !tbaa !0
+ %indvars.iv.next.15 = add i64 %indvars.iv, 16
+ %lftr.wideiv.15 = trunc i64 %indvars.iv.next.15 to i32
+ %exitcond.15 = icmp eq i32 %lftr.wideiv.15, 16000
+ br i1 %exitcond.15, label %for.end, label %for.body3
+
+for.end: ; preds = %for.body3
+ %call = tail call i32 @dummy(double* getelementptr inbounds ([16000 x double]* @X, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Y, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Z, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @U, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @V, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @aa, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @bb, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @cc, i64 0, i64 0), double 0.000000e+00) nounwind
+ %inc7 = add nsw i32 %nl.010, 1
+ %exitcond = icmp eq i32 %inc7, 400000
+ br i1 %exitcond, label %for.end8, label %for.cond1.preheader
+
+for.end8: ; preds = %for.end
+ ret i32 0
+
+; CHECK: @s000
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+declare i32 @dummy(double*, double*, double*, double*, double*, [256 x double]*, [256 x double]*, [256 x double]*, double)
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/ctrloop-sums.ll b/test/CodeGen/PowerPC/ctrloop-sums.ll
new file mode 100644
index 0000000..eae8c38
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-sums.ll
@@ -0,0 +1,134 @@
+; ModuleID = 'SingleSource/Regression/C/sumarray2d.c'
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@.str = private unnamed_addr constant [23 x i8] c"Sum(Array[%d,%d] = %d\0A\00", align 1
+
+define i32 @SumArray([100 x i32]* nocapture %Array, i32 %NumI, i32 %NumJ) nounwind readonly {
+entry:
+ %cmp12 = icmp eq i32 %NumI, 0
+ br i1 %cmp12, label %for.end8, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp29 = icmp eq i32 %NumJ, 0
+ br i1 %cmp29, label %for.inc6, label %for.body3.lr.ph.us
+
+for.inc6.us: ; preds = %for.body3.us
+ %indvars.iv.next17 = add i64 %indvars.iv16, 1
+ %lftr.wideiv18 = trunc i64 %indvars.iv.next17 to i32
+ %exitcond19 = icmp eq i32 %lftr.wideiv18, %NumI
+ br i1 %exitcond19, label %for.end8, label %for.body3.lr.ph.us
+
+for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us
+ %indvars.iv = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next, %for.body3.us ]
+ %Result.111.us = phi i32 [ %Result.014.us, %for.body3.lr.ph.us ], [ %add.us, %for.body3.us ]
+ %arrayidx5.us = getelementptr inbounds [100 x i32]* %Array, i64 %indvars.iv16, i64 %indvars.iv
+ %0 = load i32* %arrayidx5.us, align 4, !tbaa !0
+ %add.us = add nsw i32 %0, %Result.111.us
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %NumJ
+ br i1 %exitcond, label %for.inc6.us, label %for.body3.us
+
+for.body3.lr.ph.us: ; preds = %for.inc6.us, %for.cond1.preheader.lr.ph
+ %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc6.us ], [ 0, %for.cond1.preheader.lr.ph ]
+ %Result.014.us = phi i32 [ %add.us, %for.inc6.us ], [ 0, %for.cond1.preheader.lr.ph ]
+ br label %for.body3.us
+
+for.inc6: ; preds = %for.inc6, %for.cond1.preheader.lr.ph
+ %i.013 = phi i32 [ %inc7, %for.inc6 ], [ 0, %for.cond1.preheader.lr.ph ]
+ %inc7 = add i32 %i.013, 1
+ %exitcond20 = icmp eq i32 %inc7, %NumI
+ br i1 %exitcond20, label %for.end8, label %for.inc6
+
+for.end8: ; preds = %for.inc6.us, %for.inc6, %entry
+ %Result.0.lcssa = phi i32 [ 0, %entry ], [ %add.us, %for.inc6.us ], [ 0, %for.inc6 ]
+ ret i32 %Result.0.lcssa
+; CHECK: @SumArray
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+define i32 @main() nounwind {
+entry:
+ %Array = alloca [100 x [100 x i32]], align 4
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ]
+ %0 = trunc i64 %indvars.iv33 to i32
+ %sub = sub i32 0, %0
+ %arrayidx2 = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33
+ store i32 %sub, i32* %arrayidx2, align 4, !tbaa !0
+ %indvars.iv.next34 = add i64 %indvars.iv33, 1
+ %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+ %exitcond36 = icmp eq i32 %lftr.wideiv35, 100
+ br i1 %exitcond36, label %for.cond6.preheader, label %for.body
+
+for.cond6.preheader: ; preds = %for.body, %for.inc17
+ %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ]
+ br label %for.body8
+
+for.body8: ; preds = %for.inc14, %for.cond6.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ]
+ %1 = trunc i64 %indvars.iv to i32
+ %2 = trunc i64 %indvars.iv29 to i32
+ %cmp9 = icmp eq i32 %1, %2
+ br i1 %cmp9, label %for.inc14, label %if.then
+
+if.then: ; preds = %for.body8
+ %3 = add i64 %indvars.iv, %indvars.iv29
+ %arrayidx13 = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv
+ %4 = trunc i64 %3 to i32
+ store i32 %4, i32* %arrayidx13, align 4, !tbaa !0
+ br label %for.inc14
+
+for.inc14: ; preds = %for.body8, %if.then
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32
+ %exitcond28 = icmp eq i32 %lftr.wideiv27, 100
+ br i1 %exitcond28, label %for.inc17, label %for.body8
+
+for.inc17: ; preds = %for.inc14
+ %indvars.iv.next30 = add i64 %indvars.iv29, 1
+ %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+ %exitcond32 = icmp eq i32 %lftr.wideiv31, 100
+ br i1 %exitcond32, label %for.body3.lr.ph.us.i, label %for.cond6.preheader
+
+for.inc6.us.i: ; preds = %for.body3.us.i
+ %indvars.iv.next17.i = add i64 %indvars.iv16.i, 1
+ %lftr.wideiv24 = trunc i64 %indvars.iv.next17.i to i32
+ %exitcond25 = icmp eq i32 %lftr.wideiv24, 100
+ br i1 %exitcond25, label %SumArray.exit, label %for.body3.lr.ph.us.i
+
+for.body3.us.i: ; preds = %for.body3.lr.ph.us.i, %for.body3.us.i
+ %indvars.iv.i = phi i64 [ 0, %for.body3.lr.ph.us.i ], [ %indvars.iv.next.i, %for.body3.us.i ]
+ %Result.111.us.i = phi i32 [ %Result.014.us.i, %for.body3.lr.ph.us.i ], [ %add.us.i, %for.body3.us.i ]
+ %arrayidx5.us.i = getelementptr inbounds [100 x [100 x i32]]* %Array, i64 0, i64 %indvars.iv16.i, i64 %indvars.iv.i
+ %5 = load i32* %arrayidx5.us.i, align 4, !tbaa !0
+ %add.us.i = add nsw i32 %5, %Result.111.us.i
+ %indvars.iv.next.i = add i64 %indvars.iv.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 100
+ br i1 %exitcond, label %for.inc6.us.i, label %for.body3.us.i
+
+for.body3.lr.ph.us.i: ; preds = %for.inc17, %for.inc6.us.i
+ %indvars.iv16.i = phi i64 [ %indvars.iv.next17.i, %for.inc6.us.i ], [ 0, %for.inc17 ]
+ %Result.014.us.i = phi i32 [ %add.us.i, %for.inc6.us.i ], [ 0, %for.inc17 ]
+ br label %for.body3.us.i
+
+SumArray.exit: ; preds = %for.inc6.us.i
+ %call20 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8]* @.str, i64 0, i64 0), i32 100, i32 100, i32 %add.us.i) nounwind
+ ret i32 0
+
+; CHECK: @main
+; CHECK: mtctr
+; CHECK: bdnz
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/ctrloops.ll b/test/CodeGen/PowerPC/ctrloops.ll
new file mode 100644
index 0000000..4b6f7b9
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloops.ll
@@ -0,0 +1,79 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-freebsd10.0"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@a = common global i32 0, align 4
+
+define void @test1(i32 %c) nounwind {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %0 = load volatile i32* @a, align 4, !tbaa !0
+ %add = add nsw i32 %0, %c
+ store volatile i32 %add, i32* @a, align 4, !tbaa !0
+ %inc = add nsw i32 %i.01, 1
+ %exitcond = icmp eq i32 %inc, 2048
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+; CHECK: @test1
+; CHECK-NOT: or 3, 3, 3
+; CHECK: mtctr
+; CHECK-NOT: addi
+; CHECK-NOT: cmplwi
+; CHECK: bdnz
+}
+
+define void @test2(i32 %c, i32 %d) nounwind {
+entry:
+ %cmp1 = icmp sgt i32 %d, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %0 = load volatile i32* @a, align 4, !tbaa !0
+ %add = add nsw i32 %0, %c
+ store volatile i32 %add, i32* @a, align 4, !tbaa !0
+ %inc = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %inc, %d
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+; CHECK: @test2
+; CHECK: mtctr
+; CHECK-NOT: addi
+; CHECK-NOT: cmplwi
+; CHECK: bdnz
+}
+
+define void @test3(i32 %c, i32 %d) nounwind {
+entry:
+ %cmp1 = icmp sgt i32 %d, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %mul = mul nsw i32 %i.02, %c
+ %0 = load volatile i32* @a, align 4, !tbaa !0
+ %add = add nsw i32 %0, %mul
+ store volatile i32 %add, i32* @a, align 4, !tbaa !0
+ %inc = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %inc, %d
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+; CHECK: @test3
+; CHECK: mtctr
+; CHECK-NOT: addi
+; CHECK-NOT: cmplwi
+; CHECK: bdnz
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
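These ctrloop tests cover the new counter-loop formation: a loop whose trip count is known before entry is converted to use the CTR register, replacing the add/compare/branch sequence with mtctr plus bdnz (decrement CTR, branch while nonzero). The expected shape of the emitted loop, with hypothetical register numbers and labels:

;   li 4, 2048          # materialize the trip count
;   mtctr 4             # move it into the CTR register
; .LBB0_1:
;   ...                 # loop body, no addi/cmplwi needed
;   bdnz .LBB0_1        # decrement CTR and branch while it is nonzero
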
diff --git a/test/CodeGen/PowerPC/darwin-labels.ll b/test/CodeGen/PowerPC/darwin-labels.ll
index af23369..56f7782 100644
--- a/test/CodeGen/PowerPC/darwin-labels.ll
+++ b/test/CodeGen/PowerPC/darwin-labels.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {foo bar":}
+; RUN: llc < %s | grep 'foo bar":'
target datalayout = "E-p:32:32"
target triple = "powerpc-apple-darwin8.2.0"
diff --git a/test/CodeGen/PowerPC/fabs.ll b/test/CodeGen/PowerPC/fabs.ll
index 6ef740f..ddcce745 100644
--- a/test/CodeGen/PowerPC/fabs.ll
+++ b/test/CodeGen/PowerPC/fabs.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin | grep {fabs f1, f1}
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin | grep "fabs f1, f1"
define double @fabs(double %f) {
entry:
- %tmp2 = tail call double @fabs( double %f ) ; <double> [#uses=1]
+ %tmp2 = tail call double @fabs( double %f ) readnone ; <double> [#uses=1]
ret double %tmp2
}
diff --git a/test/CodeGen/PowerPC/fma.ll b/test/CodeGen/PowerPC/fma.ll
index 815c72c..27496f7 100644
--- a/test/CodeGen/PowerPC/fma.ll
+++ b/test/CodeGen/PowerPC/fma.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 | \
-; RUN: egrep {fn?madd|fn?msub} | count 8
+; RUN: llc < %s -march=ppc32 -fp-contract=fast | \
+; RUN: egrep "fn?madd|fn?msub" | count 8
define double @test_FMADD1(double %A, double %B, double %C) {
%D = fmul double %A, %B ; <double> [#uses=1]
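fmul followed by fadd/fsub is only contracted into fmadd/fmsub when fusion is permitted, and the new -fp-contract=fast flag opts the whole invocation in; without it these eight patterns would no longer be counted. Code that wants fusion unconditionally can request it through the llvm.fma intrinsic instead, sketched here with a hypothetical function name:

declare double @llvm.fma.f64(double, double, double)

define double @test_FMADD_intrinsic(double %A, double %B, double %C) {
  ; Explicitly fused multiply-add; no -fp-contract flag is needed.
  %D = call double @llvm.fma.f64(double %A, double %B, double %C)
  ret double %D
}
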
diff --git a/test/CodeGen/PowerPC/fnabs.ll b/test/CodeGen/PowerPC/fnabs.ll
index bbd5c71..9fa2dcb 100644
--- a/test/CodeGen/PowerPC/fnabs.ll
+++ b/test/CodeGen/PowerPC/fnabs.ll
@@ -3,7 +3,7 @@
declare double @fabs(double)
define double @test(double %X) {
- %Y = call double @fabs( double %X ) ; <double> [#uses=1]
+ %Y = call double @fabs( double %X ) readnone ; <double> [#uses=1]
%Z = fsub double -0.000000e+00, %Y ; <double> [#uses=1]
ret double %Z
}
diff --git a/test/CodeGen/PowerPC/fsqrt.ll b/test/CodeGen/PowerPC/fsqrt.ll
index 74a8725..bf8c4a2 100644
--- a/test/CodeGen/PowerPC/fsqrt.ll
+++ b/test/CodeGen/PowerPC/fsqrt.ll
@@ -2,13 +2,13 @@
; otherwise.
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=+fsqrt | \
-; RUN: grep {fsqrt f1, f1}
+; RUN: grep "fsqrt f1, f1"
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g5 | \
-; RUN: grep {fsqrt f1, f1}
+; RUN: grep "fsqrt f1, f1"
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=-fsqrt | \
-; RUN: not grep {fsqrt f1, f1}
+; RUN: not grep "fsqrt f1, f1"
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g4 | \
-; RUN: not grep {fsqrt f1, f1}
+; RUN: not grep "fsqrt f1, f1"
declare double @llvm.sqrt.f64(double)
diff --git a/test/CodeGen/PowerPC/iabs.ll b/test/CodeGen/PowerPC/iabs.ll
index a43f09c..7d089bb 100644
--- a/test/CodeGen/PowerPC/iabs.ll
+++ b/test/CodeGen/PowerPC/iabs.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 -stats |& \
-; RUN: grep {4 .*Number of machine instrs printed}
+; RUN: llc < %s -march=ppc32 -stats 2>&1 | \
+; RUN: grep "4 .*Number of machine instrs printed"
;; Integer absolute value, should produce something as good as:
;; srawi r2, r3, 31
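The |& operator is a csh-ism (also a bash 4 extension) that pipes stderr along with stdout; lit's portable shell does not implement it, so the statistics output, which llc prints to stderr, is folded in with an explicit redirection. The equivalent forms:

; Old form, csh only:   llc < %s -march=ppc32 -stats |& grep ...
; Portable replacement: merge stderr into stdout before the pipe:
; RUN: llc < %s -march=ppc32 -stats 2>&1 | \
; RUN:   grep "4 .*Number of machine instrs printed"
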
diff --git a/test/CodeGen/PowerPC/isel.ll b/test/CodeGen/PowerPC/isel.ll
new file mode 100644
index 0000000..ed494c5
--- /dev/null
+++ b/test/CodeGen/PowerPC/isel.ll
@@ -0,0 +1,23 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc -mcpu=a2 < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+
+define i64 @test1(i64 %a, i64 %b, i64 %c, i64 %d) {
+entry:
+ %p = icmp uge i64 %a, %b
+ %x = select i1 %p, i64 %c, i64 %d
+ ret i64 %x
+; CHECK: @test1
+; CHECK: isel
+}
+
+define i32 @test2(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+ %p = icmp uge i32 %a, %b
+ %x = select i1 %p, i32 %c, i32 %d
+ ret i32 %x
+; CHECK: @test2
+; CHECK: isel
+}
+
diff --git a/test/CodeGen/PowerPC/ispositive.ll b/test/CodeGen/PowerPC/ispositive.ll
index 4161e34..78cdf4a 100644
--- a/test/CodeGen/PowerPC/ispositive.ll
+++ b/test/CodeGen/PowerPC/ispositive.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | \
-; RUN: grep {srwi r3, r3, 31}
+; RUN: grep "srwi r3, r3, 31"
define i32 @test1(i32 %X) {
entry:
diff --git a/test/CodeGen/PowerPC/lbzux.ll b/test/CodeGen/PowerPC/lbzux.ll
new file mode 100644
index 0000000..12f1d1f
--- /dev/null
+++ b/test/CodeGen/PowerPC/lbzux.ll
@@ -0,0 +1,49 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s | FileCheck %s
+
+define fastcc void @allocateSpace(i1 %cond1, i1 %cond2) nounwind {
+entry:
+ %0 = load i8** undef, align 8, !tbaa !0
+ br i1 undef, label %return, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ br i1 undef, label %if.end7, label %return
+
+if.end7: ; preds = %lor.lhs.false
+ br i1 undef, label %if.then15, label %if.end71
+
+if.then15: ; preds = %if.end7
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %if.then15
+ %idxprom17 = sext i32 0 to i64
+ %arrayidx18 = getelementptr inbounds i8* %0, i64 %idxprom17
+ %or = or i32 undef, undef
+ br i1 %cond1, label %if.end71, label %while.body
+
+while.body: ; preds = %while.cond
+ br i1 %cond2, label %while.cond, label %if.then45
+
+if.then45: ; preds = %while.body
+ %idxprom48139 = zext i32 %or to i64
+ %arrayidx49 = getelementptr inbounds i8* %0, i64 %idxprom48139
+ %1 = bitcast i8* %arrayidx49 to i16*
+ %2 = bitcast i8* %arrayidx18 to i16*
+ %3 = load i16* %1, align 1
+ store i16 %3, i16* %2, align 1
+ br label %return
+
+if.end71: ; preds = %while.cond, %if.end7
+ unreachable
+
+return: ; preds = %if.then45, %lor.lhs.false, %entry
+ ret void
+
+; CHECK: @allocateSpace
+; CHECK: lbzux
+}
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/long-compare.ll b/test/CodeGen/PowerPC/long-compare.ll
index 94c2526..915595f 100644
--- a/test/CodeGen/PowerPC/long-compare.ll
+++ b/test/CodeGen/PowerPC/long-compare.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=ppc32 | grep cntlzw
; RUN: llc < %s -march=ppc32 | not grep xori
-; RUN: llc < %s -march=ppc32 | not grep {li }
-; RUN: llc < %s -march=ppc32 | not grep {mr }
+; RUN: llc < %s -march=ppc32 | not grep "li "
+; RUN: llc < %s -march=ppc32 | not grep "mr "
define i1 @test(i64 %x) {
%tmp = icmp ult i64 %x, 4294967296
diff --git a/test/CodeGen/PowerPC/lsr-postinc-pos.ll b/test/CodeGen/PowerPC/lsr-postinc-pos.ll
index f441e42..42472c5 100644
--- a/test/CodeGen/PowerPC/lsr-postinc-pos.ll
+++ b/test/CodeGen/PowerPC/lsr-postinc-pos.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -print-lsr-output |& FileCheck %s
+; RUN: llc < %s -print-lsr-output 2>&1 | FileCheck %s
; The icmp is a post-inc use, and the increment is in %bb11, but the
; scevgep needs to be inserted in %bb so that it is dominated by %t.
diff --git a/test/CodeGen/PowerPC/mem_update.ll b/test/CodeGen/PowerPC/mem_update.ll
index 17e7e28..39af11a 100644
--- a/test/CodeGen/PowerPC/mem_update.ll
+++ b/test/CodeGen/PowerPC/mem_update.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=ppc32 -enable-ppc-preinc | \
+; RUN: llc < %s -march=ppc32 | \
; RUN: not grep addi
-; RUN: llc < %s -march=ppc64 -enable-ppc-preinc | \
+; RUN: llc < %s -march=ppc64 | \
; RUN: not grep addi
@Glob = global i64 4
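Dropping -enable-ppc-preinc here suggests that update-form (pre-increment) addressing is now generated by default, so no flag is needed for the test's expectation that the address arithmetic gets folded into the memory operation:

; Update-form loads/stores are on by default; the separate addi should be folded:
; RUN: llc < %s -march=ppc32 | not grep addi
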
diff --git a/test/CodeGen/PowerPC/no-dead-strip.ll b/test/CodeGen/PowerPC/no-dead-strip.ll
index 3459413..6320e28 100644
--- a/test/CodeGen/PowerPC/no-dead-strip.ll
+++ b/test/CodeGen/PowerPC/no-dead-strip.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {no_dead_strip.*_X}
+; RUN: llc < %s | grep "no_dead_strip.*_X"
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "powerpc-apple-darwin8.8.0"
diff --git a/test/CodeGen/PowerPC/ppc440-fp-basic.ll b/test/CodeGen/PowerPC/ppc440-fp-basic.ll
index 1fad2fa..77b726c 100644
--- a/test/CodeGen/PowerPC/ppc440-fp-basic.ll
+++ b/test/CodeGen/PowerPC/ppc440-fp-basic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mcpu=440 | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mcpu=440 -fp-contract=fast | FileCheck %s
%0 = type { double, double }
diff --git a/test/CodeGen/PowerPC/ppc64-cyclecounter.ll b/test/CodeGen/PowerPC/ppc64-cyclecounter.ll
new file mode 100644
index 0000000..38406ca
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-cyclecounter.ll
@@ -0,0 +1,15 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s | FileCheck %s
+
+define i64 @test1() nounwind {
+entry:
+ %r = call i64 @llvm.readcyclecounter()
+ ret i64 %r
+}
+
+; CHECK: @test1
+; CHECK: mfspr 3, 268
+
+declare i64 @llvm.readcyclecounter()
+
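SPR 268 is the PowerPC time base; on a 64-bit target a single mfspr into r3 reads the full 64-bit counter, which is how llvm.readcyclecounter is lowered here. A small hypothetical helper that measures the delta between two reads:

define i64 @time_delta() nounwind {
entry:
  %t0 = call i64 @llvm.readcyclecounter()
  %t1 = call i64 @llvm.readcyclecounter()
  %d = sub i64 %t1, %t0
  ret i64 %d
}

declare i64 @llvm.readcyclecounter()
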
diff --git a/test/CodeGen/PowerPC/retaddr.ll b/test/CodeGen/PowerPC/retaddr.ll
index cf16b4c..c931dfe 100644
--- a/test/CodeGen/PowerPC/retaddr.ll
+++ b/test/CodeGen/PowerPC/retaddr.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=ppc32 | grep mflr
; RUN: llc < %s -march=ppc32 | grep lwz
-; RUN: llc < %s -march=ppc64 | grep {ld r., 16(r1)}
+; RUN: llc < %s -march=ppc64 | grep "ld r., 16(r1)"
target triple = "powerpc-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/rlwimi-commute.ll b/test/CodeGen/PowerPC/rlwimi-commute.ll
index 6410c63..3f90008 100644
--- a/test/CodeGen/PowerPC/rlwimi-commute.ll
+++ b/test/CodeGen/PowerPC/rlwimi-commute.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=ppc32 | grep rlwimi
-; RUN: llc < %s -march=ppc32 | not grep {or }
+; RUN: llc < %s -march=ppc32 | not grep "or "
; Make sure there are no register-register copies here.
diff --git a/test/CodeGen/PowerPC/rlwimi3.ll b/test/CodeGen/PowerPC/rlwimi3.ll
index 05d37bf..7efdbe9 100644
--- a/test/CodeGen/PowerPC/rlwimi3.ll
+++ b/test/CodeGen/PowerPC/rlwimi3.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 -stats |& \
-; RUN: grep {Number of machine instrs printed} | grep 12
+; RUN: llc < %s -march=ppc32 -stats 2>&1 | \
+; RUN: grep "Number of machine instrs printed" | grep 12
define i16 @Trans16Bit(i32 %srcA, i32 %srcB, i32 %alpha) {
%tmp1 = shl i32 %srcA, 15 ; <i32> [#uses=1]
diff --git a/test/CodeGen/PowerPC/seteq-0.ll b/test/CodeGen/PowerPC/seteq-0.ll
index 688b29a..7319583 100644
--- a/test/CodeGen/PowerPC/seteq-0.ll
+++ b/test/CodeGen/PowerPC/seteq-0.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | \
-; RUN: grep {srwi r., r., 5}
+; RUN: grep "srwi r., r., 5"
define i32 @eq0(i32 %a) {
%tmp.1 = icmp eq i32 %a, 0 ; <i1> [#uses=1]
diff --git a/test/CodeGen/PowerPC/small-arguments.ll b/test/CodeGen/PowerPC/small-arguments.ll
index b4767b0..19ca098 100644
--- a/test/CodeGen/PowerPC/small-arguments.ll
+++ b/test/CodeGen/PowerPC/small-arguments.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 | not grep {extsh\\|rlwinm}
+; RUN: llc < %s -march=ppc32 | not grep "extsh\|rlwinm"
declare signext i16 @foo()
diff --git a/test/CodeGen/PowerPC/stack-protector.ll b/test/CodeGen/PowerPC/stack-protector.ll
index 2020361..810630f6 100644
--- a/test/CodeGen/PowerPC/stack-protector.ll
+++ b/test/CodeGen/PowerPC/stack-protector.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=ppc32 < %s -o - | grep {__stack_chk_guard}
-; RUN: llc -march=ppc32 < %s -o - | grep {__stack_chk_fail}
+; RUN: llc -march=ppc32 < %s -o - | grep "__stack_chk_guard"
+; RUN: llc -march=ppc32 < %s -o - | grep "__stack_chk_fail"
@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
diff --git a/test/CodeGen/PowerPC/stwu-gta.ll b/test/CodeGen/PowerPC/stwu-gta.ll
new file mode 100644
index 0000000..4febe7e
--- /dev/null
+++ b/test/CodeGen/PowerPC/stwu-gta.ll
@@ -0,0 +1,22 @@
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+; RUN: llc < %s | FileCheck %s
+
+%class.Two.0.5 = type { i32, i32, i32 }
+
+@foo = external global %class.Two.0.5, align 4
+
+define void @_GLOBAL__I_a() nounwind section ".text.startup" {
+entry:
+ store i32 5, i32* getelementptr inbounds (%class.Two.0.5* @foo, i32 0, i32 0), align 4, !tbaa !0
+ store i32 6, i32* getelementptr inbounds (%class.Two.0.5* @foo, i32 0, i32 1), align 4, !tbaa !0
+ ret void
+}
+
+; CHECK: @_GLOBAL__I_a
+; CHECK-NOT: stwux
+; CHECK: stwu
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
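The CHECK-NOT/CHECK pair distinguishes the two update-form stores: stwu takes an immediate displacement while stwux takes the offset from a register, and for a global whose fields sit at small known offsets the immediate form is the right choice. Their semantics, with hypothetical operands:

;   stwu 5, 8(4)     # store r5 to r4+8, then r4 = r4+8   (immediate form)
;   stwux 5, 4, 6    # store r5 to r4+r6, then r4 = r4+r6 (indexed form)
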
diff --git a/test/CodeGen/PowerPC/stwu8.ll b/test/CodeGen/PowerPC/stwu8.ll
new file mode 100644
index 0000000..897bfc6
--- /dev/null
+++ b/test/CodeGen/PowerPC/stwu8.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%class.spell_checker.21.103.513.538 = type { %"class.std::map.20.102.512.537" }
+%"class.std::map.20.102.512.537" = type { %"class.std::_Rb_tree.19.101.511.536" }
+%"class.std::_Rb_tree.19.101.511.536" = type { %"struct.std::_Rb_tree<std::pair<const char *, const char *>, std::pair<const std::pair<const char *, const char *>, int>, std::_Select1st<std::pair<const std::pair<const char *, const char *>, int>>, std::less<std::pair<const char *, const char *>>, std::allocator<std::pair<const std::pair<const char *, const char *>, int>> >::_Rb_tree_impl.18.100.510.535" }
+%"struct.std::_Rb_tree<std::pair<const char *, const char *>, std::pair<const std::pair<const char *, const char *>, int>, std::_Select1st<std::pair<const std::pair<const char *, const char *>, int>>, std::less<std::pair<const char *, const char *>>, std::allocator<std::pair<const std::pair<const char *, const char *>, int>> >::_Rb_tree_impl.18.100.510.535" = type { %"struct.std::less.16.98.508.533", %"struct.std::_Rb_tree_node_base.17.99.509.534", i64 }
+%"struct.std::less.16.98.508.533" = type { i8 }
+%"struct.std::_Rb_tree_node_base.17.99.509.534" = type { i32, %"struct.std::_Rb_tree_node_base.17.99.509.534"*, %"struct.std::_Rb_tree_node_base.17.99.509.534"*, %"struct.std::_Rb_tree_node_base.17.99.509.534"* }
+
+define void @test1(%class.spell_checker.21.103.513.538* %this) unnamed_addr align 2 {
+entry:
+ %_M_header.i.i.i.i.i.i = getelementptr inbounds %class.spell_checker.21.103.513.538* %this, i64 0, i32 0, i32 0, i32 0, i32 1
+ %0 = bitcast %"struct.std::_Rb_tree_node_base.17.99.509.534"* %_M_header.i.i.i.i.i.i to i8*
+ call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 40, i32 4, i1 false) nounwind
+ store %"struct.std::_Rb_tree_node_base.17.99.509.534"* %_M_header.i.i.i.i.i.i, %"struct.std::_Rb_tree_node_base.17.99.509.534"** undef, align 8, !tbaa !0
+ unreachable
+}
+
+; CHECK: @test1
+; CHECK: stwu
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/stwux.ll b/test/CodeGen/PowerPC/stwux.ll
new file mode 100644
index 0000000..737e9d9
--- /dev/null
+++ b/test/CodeGen/PowerPC/stwux.ll
@@ -0,0 +1,47 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc < %s | FileCheck %s
+
+@multvec_i = external unnamed_addr global [100 x i32], align 4
+
+define fastcc void @subs_STMultiExceptIntern() nounwind {
+entry:
+ br i1 undef, label %while.body.lr.ph, label %return
+
+while.body.lr.ph: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %if.end12, %while.body.lr.ph
+ %i.0240 = phi i32 [ -1, %while.body.lr.ph ], [ %i.1, %if.end12 ]
+ br i1 undef, label %if.end12, label %if.then
+
+if.then: ; preds = %while.body
+ br label %if.end12
+
+if.end12: ; preds = %if.then, %while.body
+ %i.1 = phi i32 [ %i.0240, %while.body ], [ undef, %if.then ]
+ br i1 undef, label %while.body, label %while.end
+
+while.end: ; preds = %if.end12
+ br i1 undef, label %return, label %if.end15
+
+if.end15: ; preds = %while.end
+ %idxprom.i.i230 = sext i32 %i.1 to i64
+ %arrayidx18 = getelementptr inbounds [100 x i32]* @multvec_i, i64 0, i64 %idxprom.i.i230
+ store i32 0, i32* %arrayidx18, align 4
+ br i1 undef, label %while.body21, label %while.end90
+
+while.body21: ; preds = %if.end15
+ unreachable
+
+while.end90: ; preds = %if.end15
+ store i32 0, i32* %arrayidx18, align 4
+ br label %return
+
+return: ; preds = %while.end90, %while.end, %entry
+ ret void
+
+; CHECK: @subs_STMultiExceptIntern
+; CHECK: stwux
+}
+
diff --git a/test/CodeGen/PowerPC/tls.ll b/test/CodeGen/PowerPC/tls.ll
new file mode 100644
index 0000000..713893b
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls.ll
@@ -0,0 +1,16 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-freebsd10.0"
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+
+@a = thread_local global i32 0, align 4
+
+;CHECK: localexec:
+define i32 @localexec() nounwind {
+entry:
+;CHECK: addis [[REG1:[0-9]+]], 13, a@tprel@ha
+;CHECK-NEXT: li [[REG2:[0-9]+]], 42
+;CHECK-NEXT: addi [[REG1]], [[REG1]], a@tprel@l
+;CHECK-NEXT: stw [[REG2]], 0([[REG1]])
+ store i32 42, i32* @a, align 4
+ ret i32 0
+}
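Aside (editorial, not part of the imported patch): the checked sequence is the local-exec TLS model, in which the offset of @a from the thread pointer (r13 on ppc64) is a link-time constant, so addis/addi materialize the high-adjusted and low halves of a@tprel directly. A read of the same variable in the same file would be expected to lower to the same addis/addi pair followed by a load; hypothetical companion test, illustrative name:

; Hypothetical companion test: expect addis/addi of a@tprel, then lwz.
define i32 @localexec_load() nounwind {
entry:
  %val = load i32* @a, align 4
  ret i32 %val
}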
diff --git a/test/CodeGen/PowerPC/trampoline.ll b/test/CodeGen/PowerPC/trampoline.ll
index 91b2011..3ea46f5 100644
--- a/test/CodeGen/PowerPC/trampoline.ll
+++ b/test/CodeGen/PowerPC/trampoline.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 | grep {__trampoline_setup}
+; RUN: llc < %s -march=ppc32 | grep "__trampoline_setup"
module asm "\09.lazy_reference .objc_class_name_NSImageRep"
module asm "\09.objc_class_name_NSBitmapImageRep=0"
diff --git a/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll b/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll
index 015c086..7e58ec0 100644
--- a/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll
+++ b/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin -mattr=+altivec -disable-ppc-ilp-pref | FileCheck %s
; Formerly this did byte loads and word stores.
@a = external global <16 x i8>
@b = external global <16 x i8>
diff --git a/test/CodeGen/SPARC/2012-05-01-LowerArguments.ll b/test/CodeGen/SPARC/2012-05-01-LowerArguments.ll
new file mode 100644
index 0000000..a607f10
--- /dev/null
+++ b/test/CodeGen/SPARC/2012-05-01-LowerArguments.ll
@@ -0,0 +1,13 @@
+; Just check that this doesn't crash:
+; RUN: llc < %s
+; PR2960
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f128:128:128"
+target triple = "sparc-unknown-linux-gnu"
+ %"5tango4core9Exception11IOException" = type { [5 x i8*]*, i8*, { i64, i8* }, { i64, i8* }, i64, %"6Object7Monitor"*, %"5tango4core9Exception11IOException"* }
+ %"6Object7Monitor" = type { [3 x i8*]*, i8* }
+
+define fastcc %"5tango4core9Exception11IOException"* @_D5tango4core9Exception13TextException5_ctorMFAaZC5tango4core9Exception13TextException(%"5tango4core9Exception11IOException"* %this, { i64, i8* } %msg) {
+entry_tango.core.Exception.TextException.this:
+ unreachable
+}
diff --git a/test/CodeGen/SPARC/private.ll b/test/CodeGen/SPARC/private.ll
index f06ccd0..38cea4c 100644
--- a/test/CodeGen/SPARC/private.ll
+++ b/test/CodeGen/SPARC/private.ll
@@ -1,14 +1,11 @@
; Test to make sure that the 'private' is used correctly.
;
-; RUN: llc < %s -march=sparc > %t
-; RUN: grep .foo: %t
-; RUN: grep call.*\.foo %t
-; RUN: grep .baz: %t
-; RUN: grep ld.*\.baz %t
+; RUN: llc < %s -march=sparc | FileCheck %s
define private void @foo() {
ret void
}
+; CHECK: [[FOO:\..*foo]]:
@baz = private global i32 4
@@ -17,3 +14,8 @@ define i32 @bar() {
%1 = load i32* @baz, align 4
ret i32 %1
}
+
+; CHECK: call [[FOO]]
+; CHECK: ld {{.+}}[[BAZ:\..*baz]]
+
+; CHECK: [[BAZ]]
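Aside (editorial, not part of the imported patch): private linkage gives a symbol the assembler's private-label prefix, captured above by the [[FOO]] and [[BAZ]] FileCheck variables (a leading dot on this target), and keeps it out of the object's symbol table. Internal linkage, by contrast, emits an ordinary local symbol under its plain name; hypothetical contrast, illustrative name:

; Hypothetical contrast: an internal function keeps its unprefixed name,
; so the emitted label would be expected to be "qux:".
define internal void @qux() {
  ret void
}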
diff --git a/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll b/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll
new file mode 100644
index 0000000..a4c05d2
--- /dev/null
+++ b/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=thumbv6-apple-ios -mcpu=cortex-m0 < %s | FileCheck %s
+; Cortex-M0 doesn't have 32-bit Thumb2 instructions (except for dmb, mrs, etc.)
+; rdar://11331541
+
+define i32 @t(i32 %a) nounwind {
+; CHECK: t:
+; CHECK: asrs [[REG1:(r[0-9]+)]], [[REG2:(r[0-9]+)]], #31
+; CHECK: eors [[REG1]], [[REG2]]
+ %tmp0 = ashr i32 %a, 31
+ %tmp1 = xor i32 %tmp0, %a
+ ret i32 %tmp1
+}
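Aside (editorial, not part of the imported patch): the asrs/eors pair checked above is the first half of the branchless abs idiom, abs(a) = (a ^ (a >> 31)) - (a >> 31); the test stops at the xor because that already reproduces the encoding bug. The full idiom, which must likewise stay within 16-bit encodings on Cortex-M0, would look like this hypothetical extension:

; Hypothetical full integer-abs pattern, illustrative only.
define i32 @iabs(i32 %a) nounwind {
  %sign = ashr i32 %a, 31        ; 0 for a >= 0, -1 for a < 0
  %xor = xor i32 %sign, %a       ; a, or ~a when a is negative
  %abs = sub i32 %xor, %sign     ; ~a + 1 = -a when a is negative
  ret i32 %abs
}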
diff --git a/test/CodeGen/Thumb/asmprinter-bug.ll b/test/CodeGen/Thumb/asmprinter-bug.ll
index f73f93d..18e11ba 100644
--- a/test/CodeGen/Thumb/asmprinter-bug.ll
+++ b/test/CodeGen/Thumb/asmprinter-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv6-apple-darwin10 | grep rsbs | grep {#0}
+; RUN: llc < %s -mtriple=thumbv6-apple-darwin10 | grep rsbs | grep "#0"
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
%struct.__sFILEX = type opaque
diff --git a/test/CodeGen/Thumb/frame_thumb.ll b/test/CodeGen/Thumb/frame_thumb.ll
index 0cac755..6cc4dd1 100644
--- a/test/CodeGen/Thumb/frame_thumb.ll
+++ b/test/CodeGen/Thumb/frame_thumb.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=thumb-apple-darwin \
-; RUN: -disable-fp-elim | not grep {r11}
+; RUN: -disable-fp-elim | not grep "r11"
; RUN: llc < %s -mtriple=thumb-linux-gnueabi \
-; RUN: -disable-fp-elim | not grep {r11}
+; RUN: -disable-fp-elim | not grep "r11"
define i32 @f() {
entry:
diff --git a/test/CodeGen/Thumb/iabs.ll b/test/CodeGen/Thumb/iabs.ll
index d03b5b2..2e77660 100644
--- a/test/CodeGen/Thumb/iabs.ll
+++ b/test/CodeGen/Thumb/iabs.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -stats |& \
-; RUN: grep {4 .*Number of machine instrs printed}
+; RUN: llc < %s -march=thumb -stats 2>&1 | \
+; RUN: grep "4 .*Number of machine instrs printed"
;; Integer absolute value, should produce something as good as:
;; Thumb:
diff --git a/test/CodeGen/Thumb2/2010-01-06-TailDuplicateLabels.ll b/test/CodeGen/Thumb2/2010-01-06-TailDuplicateLabels.ll
index af7d716..348e9d3 100644
--- a/test/CodeGen/Thumb2/2010-01-06-TailDuplicateLabels.ll
+++ b/test/CodeGen/Thumb2/2010-01-06-TailDuplicateLabels.ll
@@ -1,4 +1,4 @@
-; RUN: llc -relocation-model=pic < %s | grep {:$} | sort | uniq -d | count 0
+; RUN: llc -relocation-model=pic < %s | grep ":$" | sort | uniq -d | count 0
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
target triple = "thumbv7-apple-darwin10"
diff --git a/test/CodeGen/Thumb2/constant-islands.ll b/test/CodeGen/Thumb2/constant-islands.ll
index 19d2385..255b709 100644
--- a/test/CodeGen/Thumb2/constant-islands.ll
+++ b/test/CodeGen/Thumb2/constant-islands.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=arm -mcpu=cortex-a8 -O0 -filetype=obj -o %t.o
; RUN: llc < %s -march=thumb -mcpu=cortex-a8 -O0 -filetype=obj -o %t.o
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 -O2 -filetype=obj -o %t.o
-; RUN: llc < %s -march=thumb -mcpu=cortex-a8 -O2 -filetype=obj -o %t.o
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 -O2 -filetype=obj -verify-machineinstrs -o %t.o
+; RUN: llc < %s -march=thumb -mcpu=cortex-a8 -O2 -filetype=obj -verify-machineinstrs -o %t.o
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
target triple = "thumbv7-apple-ios"
diff --git a/test/CodeGen/Thumb2/inflate-regs.ll b/test/CodeGen/Thumb2/inflate-regs.ll
new file mode 100644
index 0000000..d8a558c
--- /dev/null
+++ b/test/CodeGen/Thumb2/inflate-regs.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mcpu=cortex-a8 | FileCheck %s
+target triple = "thumbv7-apple-ios"
+
+; CHECK: local_split
+;
+; The load must go into d0-d15, which are all clobbered by the asm.
+; RAGreedy should split the range and use d16-d31 to avoid a spill.
+;
+; CHECK: vldr s
+; CHECK-NOT: vstr
+; CHECK: vadd.f32
+; CHECK-NOT: vstr
+; CHECK: vorr
+; CHECK: vstr s
+define void @local_split(float* nocapture %p) nounwind ssp {
+entry:
+ %x = load float* %p, align 4
+ %a = fadd float %x, 1.0
+ tail call void asm sideeffect "", "~{d0},~{d1},~{d2},~{d3},~{d4},~{d5},~{d6},~{d7},~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"() nounwind
+ store float %a, float* %p, align 4
+ ret void
+}
+
+; CHECK: global_split
+;
+; Same thing, but across basic blocks.
+;
+; CHECK: vldr s
+; CHECK-NOT: vstr
+; CHECK: vadd.f32
+; CHECK-NOT: vstr
+; CHECK: vorr
+; CHECK: vstr s
+define void @global_split(float* nocapture %p1, float* nocapture %p2) nounwind ssp {
+entry:
+ %0 = load float* %p1, align 4
+ %add = fadd float %0, 1.000000e+00
+ tail call void asm sideeffect "", "~{d0},~{d1},~{d2},~{d3},~{d4},~{d5},~{d6},~{d7},~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"() nounwind
+ %cmp = fcmp ogt float %add, 0.000000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store float %add, float* %p2, align 4
+ br label %if.end
+
+if.end:
+ store float %add, float* %p1, align 4
+ ret void
+}
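Aside (editorial, not part of the imported patch): the split into d16-d31 only works because the clobber list stops at d15. If the asm clobbered all of d0-d31, no FP register would survive it, and a vstr/vldr spill-reload pair around the asm would be expected instead of a live-range split; hypothetical contrast, illustrative name:

; Hypothetical contrast: every d-register clobbered, so a spill is expected.
define void @no_split(float* nocapture %p) nounwind ssp {
entry:
  %x = load float* %p, align 4
  %a = fadd float %x, 1.000000e+00
  tail call void asm sideeffect "", "~{d0},~{d1},~{d2},~{d3},~{d4},~{d5},~{d6},~{d7},~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15},~{d16},~{d17},~{d18},~{d19},~{d20},~{d21},~{d22},~{d23},~{d24},~{d25},~{d26},~{d27},~{d28},~{d29},~{d30},~{d31}"() nounwind
  store float %a, float* %p, align 4
  ret void
}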
diff --git a/test/CodeGen/Thumb2/inlineasm.ll b/test/CodeGen/Thumb2/inlineasm.ll
new file mode 100644
index 0000000..30f28f8
--- /dev/null
+++ b/test/CodeGen/Thumb2/inlineasm.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -filetype=obj
+
+target triple = "thumbv7-none--eabi"
+
+define void @t1() nounwind {
+entry:
+ call void asm sideeffect "mov r0, r1", ""() nounwind
+ ret void
+}
diff --git a/test/CodeGen/Thumb2/large-call.ll b/test/CodeGen/Thumb2/large-call.ll
index aef6f85..61c477a 100644
--- a/test/CodeGen/Thumb2/large-call.ll
+++ b/test/CodeGen/Thumb2/large-call.ll
@@ -3,17 +3,18 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
target triple = "thumbv7-apple-ios0.0.0"
; This test case would clobber the outgoing call arguments by writing to the
-; emergency spill slot at [sp, #4] without adjusting the stack pointer first.
+; emergency spill slots at [sp, #4] or [sp, #8] without adjusting the stack
+; pointer first.
; CHECK: main
; CHECK: vmov.f64
; Adjust SP for the large call
; CHECK: sub sp,
-; CHECK: mov [[FR:r[0-9]+]], sp
-; Store to call frame + #4
-; CHECK: str{{.*\[}}[[FR]], #4]
+; Store to call frame + #8
+; CHECK: vstr{{.*\[}}sp, #8]
; Don't clobber that store until the call.
; CHECK-NOT: [sp, #4]
+; CHECK-NOT: [sp, #8]
; CHECK: variadic
define i32 @main() ssp {
diff --git a/test/CodeGen/Thumb2/thumb2-cmn.ll b/test/CodeGen/Thumb2/thumb2-cmn.ll
index df221b9..67b07e6 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
-; test as 'mov.w r0, #0'. So far, that requires physreg joining.
+; These tests could be improved by 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'.
define i1 @f1(i32 %a, i32 %b) {
%nb = sub i32 0, %b
@@ -9,7 +9,7 @@ define i1 @f1(i32 %a, i32 %b) {
ret i1 %tmp
}
; CHECK: f1:
-; CHECK: cmn.w r0, r1
+; CHECK: cmn {{.*}}, r1
define i1 @f2(i32 %a, i32 %b) {
%nb = sub i32 0, %b
@@ -17,7 +17,7 @@ define i1 @f2(i32 %a, i32 %b) {
ret i1 %tmp
}
; CHECK: f2:
-; CHECK: cmn.w r0, r1
+; CHECK: cmn {{.*}}, r1
define i1 @f3(i32 %a, i32 %b) {
%nb = sub i32 0, %b
@@ -25,7 +25,7 @@ define i1 @f3(i32 %a, i32 %b) {
ret i1 %tmp
}
; CHECK: f3:
-; CHECK: cmn.w r0, r1
+; CHECK: cmn {{.*}}, r1
define i1 @f4(i32 %a, i32 %b) {
%nb = sub i32 0, %b
@@ -33,7 +33,7 @@ define i1 @f4(i32 %a, i32 %b) {
ret i1 %tmp
}
; CHECK: f4:
-; CHECK: cmn.w r0, r1
+; CHECK: cmn {{.*}}, r1
define i1 @f5(i32 %a, i32 %b) {
%tmp = shl i32 %b, 5
@@ -42,7 +42,7 @@ define i1 @f5(i32 %a, i32 %b) {
ret i1 %tmp1
}
; CHECK: f5:
-; CHECK: cmn.w r0, r1, lsl #5
+; CHECK: cmn.w {{.*}}, r1, lsl #5
define i1 @f6(i32 %a, i32 %b) {
%tmp = lshr i32 %b, 6
@@ -51,7 +51,7 @@ define i1 @f6(i32 %a, i32 %b) {
ret i1 %tmp1
}
; CHECK: f6:
-; CHECK: cmn.w r0, r1, lsr #6
+; CHECK: cmn.w {{.*}}, r1, lsr #6
define i1 @f7(i32 %a, i32 %b) {
%tmp = ashr i32 %b, 7
@@ -60,7 +60,7 @@ define i1 @f7(i32 %a, i32 %b) {
ret i1 %tmp1
}
; CHECK: f7:
-; CHECK: cmn.w r0, r1, asr #7
+; CHECK: cmn.w {{.*}}, r1, asr #7
define i1 @f8(i32 %a, i32 %b) {
%l8 = shl i32 %a, 24
@@ -71,5 +71,15 @@ define i1 @f8(i32 %a, i32 %b) {
ret i1 %tmp1
}
; CHECK: f8:
-; CHECK: cmn.w r0, r0, ror #8
+; CHECK: cmn.w {{.*}}, {{.*}}, ror #8
+
+define void @f9(i32 %a, i32 %b) nounwind optsize {
+ tail call void asm sideeffect "cmn.w r0, r1", ""() nounwind, !srcloc !0
+ ret void
+}
+
+!0 = metadata !{i32 81}
+
+; CHECK: f9:
+; CHECK: cmn.w r0, r1
diff --git a/test/CodeGen/Thumb2/thumb2-cmp.ll b/test/CodeGen/Thumb2/thumb2-cmp.ll
index da12114..4ce7acc 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
-; test as 'mov.w r0, #0'. So far, that requires physreg joining.
+; These tests would be improved by 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'.
; 0x000000bb = 187
define i1 @f1(i32 %a) {
; CHECK: f1:
-; CHECK: cmp r0, #187
+; CHECK: cmp {{.*}}, #187
%tmp = icmp ne i32 %a, 187
ret i1 %tmp
}
@@ -14,7 +14,7 @@ define i1 @f1(i32 %a) {
; 0x00aa00aa = 11141290
define i1 @f2(i32 %a) {
; CHECK: f2:
-; CHECK: cmp.w r0, #11141290
+; CHECK: cmp.w {{.*}}, #11141290
%tmp = icmp eq i32 %a, 11141290
ret i1 %tmp
}
@@ -22,7 +22,7 @@ define i1 @f2(i32 %a) {
; 0xcc00cc00 = 3422604288
define i1 @f3(i32 %a) {
; CHECK: f3:
-; CHECK: cmp.w r0, #-872363008
+; CHECK: cmp.w {{.*}}, #-872363008
%tmp = icmp ne i32 %a, 3422604288
ret i1 %tmp
}
@@ -30,7 +30,7 @@ define i1 @f3(i32 %a) {
; 0xdddddddd = 3722304989
define i1 @f4(i32 %a) {
; CHECK: f4:
-; CHECK: cmp.w r0, #-572662307
+; CHECK: cmp.w {{.*}}, #-572662307
%tmp = icmp ne i32 %a, 3722304989
ret i1 %tmp
}
@@ -38,7 +38,7 @@ define i1 @f4(i32 %a) {
; 0x00110000 = 1114112
define i1 @f5(i32 %a) {
; CHECK: f5:
-; CHECK: cmp.w r0, #1114112
+; CHECK: cmp.w {{.*}}, #1114112
%tmp = icmp eq i32 %a, 1114112
ret i1 %tmp
}
@@ -46,7 +46,7 @@ define i1 @f5(i32 %a) {
; Check that we don't do an invalid (a > b) --> !(a < b + 1) transform.
;
; CHECK: f6:
-; CHECK-NOT: cmp.w r0, #-2147483648
+; CHECK-NOT: cmp.w {{.*}}, #-2147483648
; CHECK: bx lr
define i32 @f6(i32 %a) {
%tmp = icmp sgt i32 %a, 2147483647
diff --git a/test/CodeGen/Thumb2/thumb2-cmp2.ll b/test/CodeGen/Thumb2/thumb2-cmp2.ll
index 15052e0..f6790de 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp2.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp2.ll
@@ -1,25 +1,25 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
-; test as 'mov.w r0, #0'. So far, that requires physreg joining.
+; These tests would be improved by 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'.
define i1 @f1(i32 %a, i32 %b) {
; CHECK: f1:
-; CHECK: cmp r0, r1
+; CHECK: cmp {{.*}}, r1
%tmp = icmp ne i32 %a, %b
ret i1 %tmp
}
define i1 @f2(i32 %a, i32 %b) {
; CHECK: f2:
-; CHECK: cmp r0, r1
+; CHECK: cmp {{.*}}, r1
%tmp = icmp eq i32 %a, %b
ret i1 %tmp
}
define i1 @f6(i32 %a, i32 %b) {
; CHECK: f6:
-; CHECK: cmp.w r0, r1, lsl #5
+; CHECK: cmp.w {{.*}}, r1, lsl #5
%tmp = shl i32 %b, 5
%tmp1 = icmp eq i32 %tmp, %a
ret i1 %tmp1
@@ -27,7 +27,7 @@ define i1 @f6(i32 %a, i32 %b) {
define i1 @f7(i32 %a, i32 %b) {
; CHECK: f7:
-; CHECK: cmp.w r0, r1, lsr #6
+; CHECK: cmp.w {{.*}}, r1, lsr #6
%tmp = lshr i32 %b, 6
%tmp1 = icmp ne i32 %tmp, %a
ret i1 %tmp1
@@ -35,7 +35,7 @@ define i1 @f7(i32 %a, i32 %b) {
define i1 @f8(i32 %a, i32 %b) {
; CHECK: f8:
-; CHECK: cmp.w r0, r1, asr #7
+; CHECK: cmp.w {{.*}}, r1, asr #7
%tmp = ashr i32 %b, 7
%tmp1 = icmp eq i32 %a, %tmp
ret i1 %tmp1
@@ -43,7 +43,7 @@ define i1 @f8(i32 %a, i32 %b) {
define i1 @f9(i32 %a, i32 %b) {
; CHECK: f9:
-; CHECK: cmp.w r0, r0, ror #8
+; CHECK: cmp.w {{.*}}, {{.*}}, ror #8
%l8 = shl i32 %a, 24
%r8 = lshr i32 %a, 8
%tmp = or i32 %l8, %r8
diff --git a/test/CodeGen/Thumb2/thumb2-jtb.ll b/test/CodeGen/Thumb2/thumb2-jtb.ll
index 7e1655f..0748b9b3 100644
--- a/test/CodeGen/Thumb2/thumb2-jtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-jtb.ll
@@ -1,9 +1,15 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-adjust-jump-tables=0 | not grep tbb
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-adjust-jump-tables=0 | FileCheck %s
; Do not use tbb / tbh if any destination is before the jumptable.
; rdar://7102917
define i16 @main__getopt_internal_2E_exit_2E_ce(i32, i1 %b) nounwind {
+; CHECK: main__getopt_internal_2E_exit_2E_ce
+; CHECK-NOT: tbb
+; CHECK-NOT: tbh
+; 32-bit jump tables use explicit branches, not data regions, so make sure
+; we don't annotate this region.
+; CHECK-NOT: data_region
entry:
br i1 %b, label %codeRepl127.exitStub, label %newFuncRoot
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_post.ll b/test/CodeGen/Thumb2/thumb2-ldr_post.ll
index d1af4ba..2178eec 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_post.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_post.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2 | \
-; RUN: grep {ldr.*\\\[.*\],} | count 1
+; RUN: grep "ldr.*\[.*\]," | count 1
define i32 @test(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b ; <i32> [#uses=2]
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_pre.ll b/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
index 9cc3f4a..601c0b5 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2 | \
-; RUN: grep {ldr.*\\!} | count 3
+; RUN: grep "ldr.*\!" | count 3
; RUN: llc < %s -march=thumb -mattr=+thumb2 | \
-; RUN: grep {ldrsb.*\\!} | count 1
+; RUN: grep "ldrsb.*\!" | count 1
define i32* @test1(i32* %X, i32* %dest) {
%Y = getelementptr i32* %X, i32 4 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/Thumb2/thumb2-rev16.ll b/test/CodeGen/Thumb2/thumb2-rev16.ll
index 39b6ac3..10cd539 100644
--- a/test/CodeGen/Thumb2/thumb2-rev16.ll
+++ b/test/CodeGen/Thumb2/thumb2-rev16.ll
@@ -1,7 +1,7 @@
; XFAIL: *
; FIXME: the rev16 pattern is not matching
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | grep {rev16\\W*r\[0-9\]*,\\W*r\[0-9\]*} | count 1
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | grep "rev16\W*r[0-9]*,\W*r[0-9]*" | count 1
; 0xff00ff00 = 4278255360
; 0x00ff00ff = 16711935
diff --git a/test/CodeGen/Thumb2/thumb2-ror.ll b/test/CodeGen/Thumb2/thumb2-ror.ll
index 590c333..5ad92cd 100644
--- a/test/CodeGen/Thumb2/thumb2-ror.ll
+++ b/test/CodeGen/Thumb2/thumb2-ror.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-
+; RUN: llc < %s -march=thumb | FileCheck %s -check-prefix=THUMB1
; CHECK: f1:
; CHECK: ror.w r0, r0, #22
@@ -13,6 +13,8 @@ define i32 @f1(i32 %a) {
; CHECK: f2:
; CHECK-NOT: and
; CHECK: ror
+; THUMB1: f2
+; THUMB1: and
define i32 @f2(i32 %v, i32 %nbits) {
entry:
%and = and i32 %nbits, 31
@@ -21,4 +23,4 @@ entry:
%shl = shl i32 %v, %sub
%or = or i32 %shl, %shr
ret i32 %or
-} \ No newline at end of file
+}
diff --git a/test/CodeGen/Thumb2/thumb2-tbb.ll b/test/CodeGen/Thumb2/thumb2-tbb.ll
index 5dc3cc3..a9d71d6 100644
--- a/test/CodeGen/Thumb2/thumb2-tbb.ll
+++ b/test/CodeGen/Thumb2/thumb2-tbb.ll
@@ -5,7 +5,9 @@ define void @bar(i32 %n.u) {
entry:
; CHECK: bar:
; CHECK: tbb
-; CHECK: .align 1
+; CHECK: .data_region jt8
+; CHECK: .end_data_region
+; CHECK-NEXT: .align 1
switch i32 %n.u, label %bb12 [i32 1, label %bb i32 2, label %bb6 i32 4, label %bb7 i32 5, label %bb8 i32 6, label %bb10 i32 7, label %bb1 i32 8, label %bb3 i32 9, label %bb4 i32 10, label %bb9 i32 11, label %bb2 i32 12, label %bb5 i32 13, label %bb11 ]
bb:
diff --git a/test/CodeGen/Thumb2/thumb2-teq.ll b/test/CodeGen/Thumb2/thumb2-teq.ll
index 00c928f..d453f46 100644
--- a/test/CodeGen/Thumb2/thumb2-teq.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
-; test as 'mov.w r0, #0'. So far, that requires physreg joining.
+; These tests would be improved by 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'.
; 0x000000bb = 187
define i1 @f2(i32 %a) {
@@ -10,7 +10,7 @@ define i1 @f2(i32 %a) {
ret i1 %tmp1
}
; CHECK: f2:
-; CHECK: teq.w r0, #187
+; CHECK: teq.w {{.*}}, #187
; 0x00aa00aa = 11141290
define i1 @f3(i32 %a) {
@@ -19,7 +19,7 @@ define i1 @f3(i32 %a) {
ret i1 %tmp1
}
; CHECK: f3:
-; CHECK: teq.w r0, #11141290
+; CHECK: teq.w {{.*}}, #11141290
; 0xcc00cc00 = 3422604288
define i1 @f6(i32 %a) {
@@ -28,7 +28,7 @@ define i1 @f6(i32 %a) {
ret i1 %tmp1
}
; CHECK: f6:
-; CHECK: teq.w r0, #-872363008
+; CHECK: teq.w {{.*}}, #-872363008
; 0xdddddddd = 3722304989
define i1 @f7(i32 %a) {
@@ -37,7 +37,7 @@ define i1 @f7(i32 %a) {
ret i1 %tmp1
}
; CHECK: f7:
-; CHECK: teq.w r0, #-572662307
+; CHECK: teq.w {{.*}}, #-572662307
; 0xdddddddd = 3722304989
define i1 @f8(i32 %a) {
@@ -53,5 +53,5 @@ define i1 @f10(i32 %a) {
ret i1 %tmp1
}
; CHECK: f10:
-; CHECK: teq.w r0, #1114112
+; CHECK: teq.w {{.*}}, #1114112
diff --git a/test/CodeGen/Thumb2/thumb2-teq2.ll b/test/CodeGen/Thumb2/thumb2-teq2.ll
index 8acae90..27ecad8 100644
--- a/test/CodeGen/Thumb2/thumb2-teq2.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq2.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
-; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
+; These tests would be improved by 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'.
define i1 @f2(i32 %a, i32 %b) {
; CHECK: f2
-; CHECK: teq.w r0, r1
+; CHECK: teq.w {{.*}}, r1
%tmp = xor i32 %a, %b
%tmp1 = icmp eq i32 %tmp, 0
ret i1 %tmp1
@@ -13,7 +13,7 @@ define i1 @f2(i32 %a, i32 %b) {
define i1 @f4(i32 %a, i32 %b) {
; CHECK: f4
-; CHECK: teq.w r0, r1
+; CHECK: teq.w {{.*}}, r1
%tmp = xor i32 %a, %b
%tmp1 = icmp eq i32 0, %tmp
ret i1 %tmp1
@@ -21,7 +21,7 @@ define i1 @f4(i32 %a, i32 %b) {
define i1 @f6(i32 %a, i32 %b) {
; CHECK: f6
-; CHECK: teq.w r0, r1, lsl #5
+; CHECK: teq.w {{.*}}, r1, lsl #5
%tmp = shl i32 %b, 5
%tmp1 = xor i32 %a, %tmp
%tmp2 = icmp eq i32 %tmp1, 0
@@ -30,7 +30,7 @@ define i1 @f6(i32 %a, i32 %b) {
define i1 @f7(i32 %a, i32 %b) {
; CHECK: f7
-; CHECK: teq.w r0, r1, lsr #6
+; CHECK: teq.w {{.*}}, r1, lsr #6
%tmp = lshr i32 %b, 6
%tmp1 = xor i32 %a, %tmp
%tmp2 = icmp eq i32 %tmp1, 0
@@ -39,7 +39,7 @@ define i1 @f7(i32 %a, i32 %b) {
define i1 @f8(i32 %a, i32 %b) {
; CHECK: f8
-; CHECK: teq.w r0, r1, asr #7
+; CHECK: teq.w {{.*}}, r1, asr #7
%tmp = ashr i32 %b, 7
%tmp1 = xor i32 %a, %tmp
%tmp2 = icmp eq i32 %tmp1, 0
@@ -48,7 +48,7 @@ define i1 @f8(i32 %a, i32 %b) {
define i1 @f9(i32 %a, i32 %b) {
; CHECK: f9
-; CHECK: teq.w r0, r0, ror #8
+; CHECK: teq.w {{.*}}, {{.*}}, ror #8
%l8 = shl i32 %a, 24
%r8 = lshr i32 %a, 8
%tmp = or i32 %l8, %r8
diff --git a/test/CodeGen/Thumb2/thumb2-tst.ll b/test/CodeGen/Thumb2/thumb2-tst.ll
index 43e208c..67fe82e 100644
--- a/test/CodeGen/Thumb2/thumb2-tst.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
-; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
+; These tests would be improved by 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'.
; 0x000000bb = 187
define i1 @f2(i32 %a) {
@@ -10,7 +10,7 @@ define i1 @f2(i32 %a) {
ret i1 %tmp1
}
; CHECK: f2:
-; CHECK: tst.w r0, #187
+; CHECK: tst.w {{.*}}, #187
; 0x00aa00aa = 11141290
define i1 @f3(i32 %a) {
@@ -19,7 +19,7 @@ define i1 @f3(i32 %a) {
ret i1 %tmp1
}
; CHECK: f3:
-; CHECK: tst.w r0, #11141290
+; CHECK: tst.w {{.*}}, #11141290
; 0xcc00cc00 = 3422604288
define i1 @f6(i32 %a) {
@@ -28,7 +28,7 @@ define i1 @f6(i32 %a) {
ret i1 %tmp1
}
; CHECK: f6:
-; CHECK: tst.w r0, #-872363008
+; CHECK: tst.w {{.*}}, #-872363008
; 0xdddddddd = 3722304989
define i1 @f7(i32 %a) {
@@ -37,7 +37,7 @@ define i1 @f7(i32 %a) {
ret i1 %tmp1
}
; CHECK: f7:
-; CHECK: tst.w r0, #-572662307
+; CHECK: tst.w {{.*}}, #-572662307
; 0x00110000 = 1114112
define i1 @f10(i32 %a) {
@@ -46,4 +46,4 @@ define i1 @f10(i32 %a) {
ret i1 %tmp1
}
; CHECK: f10:
-; CHECK: tst.w r0, #1114112
+; CHECK: tst.w {{.*}}, #1114112
diff --git a/test/CodeGen/Thumb2/thumb2-tst2.ll b/test/CodeGen/Thumb2/thumb2-tst2.ll
index bfe016f..e3fe792 100644
--- a/test/CodeGen/Thumb2/thumb2-tst2.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst2.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
-; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
+; These tests would be improved by 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'.
define i1 @f2(i32 %a, i32 %b) {
; CHECK: f2:
-; CHECK: tst r0, r1
+; CHECK: tst {{.*}}, r1
%tmp = and i32 %a, %b
%tmp1 = icmp eq i32 %tmp, 0
ret i1 %tmp1
@@ -13,7 +13,7 @@ define i1 @f2(i32 %a, i32 %b) {
define i1 @f4(i32 %a, i32 %b) {
; CHECK: f4:
-; CHECK: tst r0, r1
+; CHECK: tst {{.*}}, r1
%tmp = and i32 %a, %b
%tmp1 = icmp eq i32 0, %tmp
ret i1 %tmp1
@@ -21,7 +21,7 @@ define i1 @f4(i32 %a, i32 %b) {
define i1 @f6(i32 %a, i32 %b) {
; CHECK: f6:
-; CHECK: tst.w r0, r1, lsl #5
+; CHECK: tst.w {{.*}}, r1, lsl #5
%tmp = shl i32 %b, 5
%tmp1 = and i32 %a, %tmp
%tmp2 = icmp eq i32 %tmp1, 0
@@ -30,7 +30,7 @@ define i1 @f6(i32 %a, i32 %b) {
define i1 @f7(i32 %a, i32 %b) {
; CHECK: f7:
-; CHECK: tst.w r0, r1, lsr #6
+; CHECK: tst.w {{.*}}, r1, lsr #6
%tmp = lshr i32 %b, 6
%tmp1 = and i32 %a, %tmp
%tmp2 = icmp eq i32 %tmp1, 0
@@ -39,7 +39,7 @@ define i1 @f7(i32 %a, i32 %b) {
define i1 @f8(i32 %a, i32 %b) {
; CHECK: f8:
-; CHECK: tst.w r0, r1, asr #7
+; CHECK: tst.w {{.*}}, r1, asr #7
%tmp = ashr i32 %b, 7
%tmp1 = and i32 %a, %tmp
%tmp2 = icmp eq i32 %tmp1, 0
@@ -48,7 +48,7 @@ define i1 @f8(i32 %a, i32 %b) {
define i1 @f9(i32 %a, i32 %b) {
; CHECK: f9:
-; CHECK: tst.w r0, r0, ror #8
+; CHECK: tst.w {{.*}}, {{.*}}, ror #8
%l8 = shl i32 %a, 24
%r8 = lshr i32 %a, 8
%tmp = or i32 %l8, %r8
diff --git a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
index 03189aa..61e849e 100644
--- a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
@@ -1,15 +1,22 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=cortex-a8 | FileCheck %s --check-prefix=A8
+; RUN: llc < %s -march=thumb -mcpu=cortex-m3 | FileCheck %s --check-prefix=M3
+; rdar://11318438
define zeroext i8 @test1(i32 %A.u) {
-; CHECK: test1
-; CHECK: uxtb r0, r0
+; A8: test1
+; A8: uxtb r0, r0
%B.u = trunc i32 %A.u to i8
ret i8 %B.u
}
define zeroext i32 @test2(i32 %A.u, i32 %B.u) {
-; CHECK: test2
-; CHECK: uxtab r0, r0, r1
+; A8: test2
+; A8: uxtab r0, r0, r1
+
+; M3: test2
+; M3: uxtb r1, r1
+; M3-NOT: uxtab
+; M3: add r0, r1
%C.u = trunc i32 %B.u to i8
%D.u = zext i8 %C.u to i32
%E.u = add i32 %A.u, %D.u
@@ -17,8 +24,8 @@ define zeroext i32 @test2(i32 %A.u, i32 %B.u) {
}
define zeroext i32 @test3(i32 %A.u) {
-; CHECK: test3
-; CHECK: uxth.w r0, r0, ror #8
+; A8: test3
+; A8: uxth.w r0, r0, ror #8
%B.u = lshr i32 %A.u, 8
%C.u = shl i32 %A.u, 24
%D.u = or i32 %B.u, %C.u
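Aside (editorial, not part of the imported patch): the A8/M3 split above reflects the t2xtpk target feature, which gates the extend-and-accumulate forms (uxtab/uxtah) and the rotated extends; Cortex-A8 advertises it while Cortex-M3 does not in this revision, so the M3 path falls back to a plain extend plus add. The halfword analogue of test2 would be expected to behave the same way; hypothetical, illustrative name:

; Hypothetical halfword analogue: uxtah on A8, uxth + add on M3.
define zeroext i32 @test2h(i32 %A.u, i32 %B.u) {
  %C.u = trunc i32 %B.u to i16
  %D.u = zext i16 %C.u to i32
  %E.u = add i32 %A.u, %D.u
  ret i32 %E.u
}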
diff --git a/test/CodeGen/Thumb2/tls1.ll b/test/CodeGen/Thumb2/tls1.ll
index 1e55557..d91e3b3 100644
--- a/test/CodeGen/Thumb2/tls1.ll
+++ b/test/CodeGen/Thumb2/tls1.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi | \
-; RUN: grep {i(tpoff)}
+; RUN: grep "i(tpoff)"
; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi | \
-; RUN: grep {__aeabi_read_tp}
+; RUN: grep "__aeabi_read_tp"
; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi \
-; RUN: -relocation-model=pic | grep {__tls_get_addr}
+; RUN: -relocation-model=pic | grep "__tls_get_addr"
@i = thread_local global i32 15 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
index 2484860..0af2445 100644
--- a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
+++ b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
@@ -3,7 +3,7 @@
; it makes a ton of annoying overlapping live ranges. This code should not
; cause spills!
;
-; RUN: llc < %s -march=x86 -stats |& not grep spilled
+; RUN: llc < %s -march=x86 -stats 2>&1 | not grep spilled
target datalayout = "e-p:32:32"
diff --git a/test/CodeGen/X86/2003-11-03-GlobalBool.ll b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
index 8b0a185..f201b98 100644
--- a/test/CodeGen/X86/2003-11-03-GlobalBool.ll
+++ b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
@@ -1,4 +1,4 @@
; RUN: llc < %s -march=x86 | \
-; RUN: not grep {.byte\[\[:space:\]\]*true}
+; RUN: not grep ".byte[[:space:]]*true"
@X = global i1 true ; <i1*> [#uses=0]
diff --git a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
index fea2b54..dde210b 100644
--- a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
+++ b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep {(%esp}
-; RUN: llc < %s -march=x86 | grep {pushl %ebp} | count 1
-; RUN: llc < %s -march=x86 | grep {popl %ebp} | count 1
+; RUN: llc < %s -march=x86 | grep "(%esp"
+; RUN: llc < %s -march=x86 | grep "pushl %ebp" | count 1
+; RUN: llc < %s -march=x86 | grep "popl %ebp" | count 1
declare i8* @llvm.returnaddress(i32)
diff --git a/test/CodeGen/X86/2004-03-30-Select-Max.ll b/test/CodeGen/X86/2004-03-30-Select-Max.ll
index c44d10a..e22aa6a 100644
--- a/test/CodeGen/X86/2004-03-30-Select-Max.ll
+++ b/test/CodeGen/X86/2004-03-30-Select-Max.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep {j\[lgbe\]}
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; CHECK-NOT: {{j[lgbe]}}
define i32 @max(i32 %A, i32 %B) nounwind {
%gt = icmp sgt i32 %A, %B ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
index dc69ef8..f8bf099 100644
--- a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep {subl.*%esp}
+; RUN: llc < %s -march=x86 | not grep "subl.*%esp"
define i32 @f(i32 %a, i32 %b) {
%tmp.2 = mul i32 %a, %a ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
index 0421896..1a3d749 100644
--- a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& \
+; RUN: llc < %s -march=x86 -stats 2>&1 | \
; RUN: grep asm-printer | grep 7
define i32 @g(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
index 8783a11..fb1262a 100644
--- a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin8 -relocation-model=static > %t
-; RUN: grep {movl _last} %t | count 1
-; RUN: grep {cmpl.*_last} %t | count 1
+; RUN: grep "movl _last" %t | count 1
+; RUN: grep "cmpl.*_last" %t | count 1
@block = external global i8* ; <i8**> [#uses=1]
@last = external global i32 ; <i32*> [#uses=3]
diff --git a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
index b045329..5cba3ef 100644
--- a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
+++ b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats |& \
-; RUN: not grep {Number of register spills}
+; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | \
+; RUN: not grep "Number of register spills"
; END.
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
index 7d0a6ab..1c75f93 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static -stats |& \
+; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
; RUN: grep asm-printer | grep 14
;
@size20 = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched2.ll b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
index 23954d7..95eefa1 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched2.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& \
+; RUN: llc < %s -march=x86 -stats 2>&1 | \
; RUN: grep asm-printer | grep 13
define void @_ZN9__gnu_cxx9hashtableISt4pairIKPKciES3_NS_4hashIS3_EESt10_Select1stIS5_E5eqstrSaIiEE14find_or_insertERKS5__cond_true456.i(i8* %tmp435.i, i32* %tmp449.i.out) nounwind {
diff --git a/test/CodeGen/X86/2006-05-08-InstrSched.ll b/test/CodeGen/X86/2006-05-08-InstrSched.ll
index d58d638..3419d01 100644
--- a/test/CodeGen/X86/2006-05-08-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-08-InstrSched.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | not grep {subl.*%esp}
+; RUN: llc < %s -march=x86 -relocation-model=static | not grep "subl.*%esp"
@A = external global i16* ; <i16**> [#uses=1]
@B = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll
index 38bca28..37c5107 100644
--- a/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2 -stats -realign-stack=0 |&\
-; RUN: grep {asm-printer} | grep 35
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats -realign-stack=0 2>&1 | \
+; RUN: grep "asm-printer" | grep 35
target datalayout = "e-p:32:32"
define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
diff --git a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
index 3159cec..c5c74d1 100644
--- a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
+++ b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
@@ -1,7 +1,7 @@
; PR850
; RUN: llc < %s -march=x86 -x86-asm-syntax=att > %t
-; RUN: grep {movl 4(%eax),%ebp} %t
-; RUN: grep {movl 0(%eax), %ebx} %t
+; RUN: grep "movl 4(%eax),%ebp" %t
+; RUN: grep "movl 0(%eax), %ebx" %t
define i32 @foo(i32 %__s.i.i, i32 %tmp5.i.i, i32 %tmp6.i.i, i32 %tmp7.i.i, i32 %tmp8.i.i) {
%tmp9.i.i = call i32 asm sideeffect "push %ebp\0Apush %ebx\0Amovl 4($2),%ebp\0Amovl 0($2), %ebx\0Amovl $1,%eax\0Aint $$0x80\0Apop %ebx\0Apop %ebp", "={ax},i,0,{cx},{dx},{si},{di}"( i32 192, i32 %__s.i.i, i32 %tmp5.i.i, i32 %tmp6.i.i, i32 %tmp7.i.i, i32 %tmp8.i.i ) ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
index a19d8f7..56d5f2f 100644
--- a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
+++ b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -mcpu=i386 | \
-; RUN: not grep {movl %eax, %edx}
+; RUN: not grep "movl %eax, %edx"
define i32 @foo(i32 %t, i32 %C) {
entry:
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index 6ec9a48..a58c9b1 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -52,8 +52,8 @@ entry:
%tmp21 = load double* %tmp20 ; <double> [#uses=1]
%tmp.upgrd.6 = getelementptr [9 x i8]* @str, i32 0, i64 0 ; <i8*> [#uses=1]
%tmp.upgrd.7 = call i32 (i8*, ...)* @printf( i8* %tmp.upgrd.6, double %tmp21, double %tmp19 ) ; <i32> [#uses=0]
- br label %return
-return: ; preds = %entry
+ br label %finish
+finish:
%retval.upgrd.8 = load i32* %retval ; <i32> [#uses=1]
ret i32 %retval.upgrd.8
}
diff --git a/test/CodeGen/X86/2006-11-17-IllegalMove.ll b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
index affb7af..783d9f9 100644
--- a/test/CodeGen/X86/2006-11-17-IllegalMove.ll
+++ b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep movb %t | count 2
-; RUN: grep {movzb\[wl\]} %t
+; RUN: grep movb %t | count 1
+; RUN: grep "movzb[wl]" %t
define void @handle_vector_size_attribute() nounwind {
diff --git a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
index a228898..04d4b8e 100644
--- a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
+++ b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86-64 > %t
-; RUN: not grep {,%rsp)} %t
+; RUN: not grep ",%rsp)" %t
; PR1103
target datalayout = "e-p:64:64"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
index 3312e01..3b2e443 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {mov %gs:72, %eax}
+; RUN: llc < %s -march=x86 | grep "mov %gs:72, %eax"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
index c1b1ad1..18b06dc 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mcpu=yonah -march=x86 | \
-; RUN: grep {cmpltsd %xmm0, %xmm0}
+; RUN: grep "cmpltsd %xmm0, %xmm0"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll b/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
index 85a2ecc..cae68c9 100644
--- a/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
+++ b/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | not grep {bsrl.*10}
+; RUN: llc < %s | not grep "bsrl.*10"
; PR1356
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
index deb3999..c3d7e8a 100644
--- a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
+++ b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -disable-fp-elim | not grep {addl .12, %esp}
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -disable-fp-elim | not grep "addl .12, %esp"
; PR1398
%struct.S = type { i32, i32 }
diff --git a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
index 77291f0..aa0ee5d 100644
--- a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
+++ b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {movsbl}
+; RUN: llc < %s -march=x86 | grep "movsbl"
@X = global i32 0 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2007-09-05-InvalidAsm.ll b/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
index 5acb051..e81534b 100644
--- a/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
+++ b/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -x86-asm-syntax=intel | not grep {lea\[\[:space:\]\]R}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -x86-asm-syntax=intel | FileCheck %s
+; CHECK-NOT: lea R
%struct.AGenericCall = type { %struct.AGenericManager*, %struct.ComponentParameters*, i32* }
%struct.AGenericManager = type <{ i8 }>
diff --git a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
index 228a915..56a109a 100644
--- a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
+++ b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=static | grep {foo str$}
+; RUN: llc < %s -relocation-model=static | grep "foo str$"
; PR1761
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-pc-linux"
diff --git a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
index 2e95082..99df20d 100644
--- a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
+++ b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic | grep {(%esp)} | count 2
+; RUN: llc < %s -march=x86 -mcpu=generic | grep "(%esp)" | count 2
; PR1872
%struct.c34007g__designated___XUB = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
index 266fd7b..39af931 100644
--- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
+++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
@@ -10,10 +10,10 @@
%struct.indexentry = type { i32, i8*, i8*, i8*, i8*, i8* }
-define i32 @_bfd_stab_section_find_nearest_line(i32 %offset) nounwind {
+define i32 @_bfd_stab_section_find_nearest_line(i32 %offset, i1 %cond) nounwind {
entry:
%tmp910 = add i32 0, %offset ; <i32> [#uses=1]
- br i1 true, label %bb951, label %bb917
+ br i1 %cond, label %bb951, label %bb917
bb917: ; preds = %entry
ret i32 0
@@ -21,7 +21,7 @@ bb917: ; preds = %entry
bb951: ; preds = %bb986, %entry
%tmp955 = sdiv i32 0, 2 ; <i32> [#uses=3]
%tmp961 = getelementptr %struct.indexentry* null, i32 %tmp955, i32 0 ; <i32*> [#uses=1]
- br i1 true, label %bb986, label %bb967
+ br i1 %cond, label %bb986, label %bb967
bb967: ; preds = %bb951
ret i32 0
diff --git a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
index 0091397..9584b71 100644
--- a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
+++ b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -regalloc=fast
+; RUN: llc < %s -march=x86 -mattr=+sse2 -regalloc=fast -optimize-regalloc=0
define void @SolveCubic(double %a, double %b, double %c, double %d, i32* %solutions, double* %x) {
entry:
diff --git a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
index bdacf50..a1b973d 100644
--- a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
+++ b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats |& grep {Number of block tails merged} | grep 16
+; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | grep "Number of block tails merged" | grep 16
; PR1909
@.str = internal constant [48 x i8] c"transformed bounds: (%.2f, %.2f), (%.2f, %.2f)\0A\00" ; <[48 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
index 5115e48..a52b365 100644
--- a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
+++ b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s | grep {a:} | not grep ax
-; RUN: llc < %s | grep {b:} | not grep ax
+; RUN: llc < %s | grep "a:" | not grep ax
+; RUN: llc < %s | grep "b:" | not grep ax
; PR2078
; The clobber list says that "ax" is clobbered. Make sure that eax isn't
; allocated to the input/output register.
diff --git a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
index da02907..9185a36 100644
--- a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
+++ b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=fast -march=x86 -mattr=+mmx | grep esi
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -march=x86 -mattr=+mmx | grep esi
; PR2082
; Local register allocator was refusing to use ESI, EDI, and EBP so it ran out of
; registers.
diff --git a/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll b/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
index 4dc3a10..5ca7e3e 100644
--- a/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
+++ b/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -asm-verbose | grep {#} | not grep -v {##}
+; RUN: llc < %s -mtriple=i386-apple-darwin -asm-verbose | grep "#" | not grep -v "##"
%struct.AGenericCall = type { %struct.AGenericManager*, %struct.ComponentParameters*, i32* }
%struct.AGenericManager = type <{ i8 }>
diff --git a/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index 109069e..3a1de11 100644
--- a/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -disable-cgp-branch-opts | grep movw | not grep {, %e}
+; RUN: llc < %s -mtriple=i386-apple-darwin -disable-cgp-branch-opts | grep movw | not grep ", %e"
%struct.DBC_t = type { i32, i8*, i16, %struct.DBC_t*, i8*, i8*, i8*, i8*, i8*, %struct.DBC_t*, i32, i32, i32, i32, i8*, i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, i32, i16, i16, i32*, i8, i16, %struct.DRVOPT*, i16 }
%struct.DRVOPT = type { i16, i32, i8, %struct.DRVOPT* }
diff --git a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index 859041e..f244793 100644
--- a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep xorl | grep {%e}
+; RUN: llc < %s -mtriple=i386-apple-darwin | grep xorl | grep "%e"
; Make sure xorl operands are 32-bit registers.
%struct.tm = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8* }
diff --git a/test/CodeGen/X86/2008-04-28-CoalescerBug.ll b/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
index 5b97eb7..7c04206 100644
--- a/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movl > %t
-; RUN: not grep {r\[abcd\]x} %t
-; RUN: not grep {r\[ds\]i} %t
-; RUN: not grep {r\[bs\]p} %t
+; RUN: not grep "r[abcd]x" %t
+; RUN: not grep "r[ds]i" %t
+; RUN: not grep "r[bs]p" %t
%struct.BITMAP = type { i16, i16, i32, i32, i32, i32, i32, i32, i8*, i8* }
%struct.BltData = type { float, float, float, float }
diff --git a/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll b/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
index c068f8a..4e73b5a 100644
--- a/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
+++ b/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -regalloc=fast
+; RUN: llc < %s -mtriple=i386-apple-darwin -regalloc=fast -optimize-regalloc=0
@_ZTVN10Evaluation10GridOutputILi3EEE = external constant [5 x i32 (...)*] ; <[5 x i32 (...)*]*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-08-06-CmpStride.ll b/test/CodeGen/X86/2008-08-06-CmpStride.ll
index 99cb856..bdac8fd 100644
--- a/test/CodeGen/X86/2008-08-06-CmpStride.ll
+++ b/test/CodeGen/X86/2008-08-06-CmpStride.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s -o - | grep {cmpl \\$\[1\], %}
+; RUN: llc -march=x86-64 < %s -o - | grep "cmpl \$[1], %"
@.str = internal constant [4 x i8] c"%d\0A\00"
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
index 1d27fc5..c63c890 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
@@ -1,15 +1,36 @@
; Check that eh_return & unwind_init were properly lowered
-; RUN: llc < %s | grep %ebp | count 9
-; RUN: llc < %s | grep %ecx | count 5
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "i386-pc-linux"
-define i8* @test(i32 %a, i8* %b) {
+; CHECK: test1
+; CHECK: pushl %ebp
+define i8* @test1(i32 %a, i8* %b) {
entry:
call void @llvm.eh.unwind.init()
%foo = alloca i32
call void @llvm.eh.return.i32(i32 %a, i8* %b)
+; CHECK: movl 12(%ebp), %[[ECX:e..]]
+; CHECK: movl 8(%ebp), %[[EAX:e..]]
+; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
+; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
+; CHECK: movl %[[ECX2]], %esp
+; CHECK: ret
+ unreachable
+}
+
+; CHECK: test2
+; CHECK: pushl %ebp
+define i8* @test2(i32 %a, i8* %b) {
+entry:
+ call void @llvm.eh.return.i32(i32 %a, i8* %b)
+; CHECK: movl 12(%ebp), %[[ECX:e..]]
+; CHECK: movl 8(%ebp), %[[EAX:e..]]
+; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
+; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
+; CHECK: movl %[[ECX2]], %esp
+; CHECK: ret
unreachable
}
diff --git a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
index 86e50c9..4b2774b 100644
--- a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
+++ b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; %0 must not be put in EAX or EDX.
; In the first asm, $0 and $2 must not be put in EAX.
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 6867ae7..5c2fbee 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; RUN: llc < %s -march=x86 -regalloc=basic | FileCheck %s
; RUN: llc < %s -march=x86 -regalloc=greedy | FileCheck %s
diff --git a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
index 421b931..e504bc3 100644
--- a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
+++ b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o - | not grep {ucomiss\[^,\]*esp}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -o - | not grep "ucomiss[^,]*esp"
define void @f(float %wt) {
entry:
diff --git a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
index 9d144a4..b2cf34cd 100644
--- a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -stats |& FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -stats 2>&1 | FileCheck %s
; Now this test spills one register. But a reload in the loop is cheaper than
; the divsd so it's a win.
@@ -17,8 +17,7 @@ bb: ; preds = %bb, %entry
; CHECK: %bb30.loopexit
; CHECK: divsd %xmm0
; CHECK: movsd %xmm0, 16(%esp)
-; CHECK: .align
-; CHECK-NEXT: %bb3
+; CHECK: %bb3
bb3: ; preds = %bb30.loopexit, %bb25, %bb3
%2 = load i32* null, align 4 ; <i32> [#uses=1]
%3 = mul i32 %2, 0 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2008-12-23-crazy-address.ll b/test/CodeGen/X86/2008-12-23-crazy-address.ll
index 2edcaea..0e95c9e 100644
--- a/test/CodeGen/X86/2008-12-23-crazy-address.ll
+++ b/test/CodeGen/X86/2008-12-23-crazy-address.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | grep {lea.*X.*esp} | count 2
+; RUN: llc < %s -march=x86 -relocation-model=static | grep "lea.*X.*esp" | count 2
@X = external global [0 x i32]
diff --git a/test/CodeGen/X86/2009-01-31-BigShift2.ll b/test/CodeGen/X86/2009-01-31-BigShift2.ll
index 3e42553..b478f27 100644
--- a/test/CodeGen/X86/2009-01-31-BigShift2.ll
+++ b/test/CodeGen/X86/2009-01-31-BigShift2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {mov.*56}
+; RUN: llc < %s -march=x86 | grep "mov.*56"
; PR3449
define void @test(<8 x double>* %P, i64* %Q) nounwind {
diff --git a/test/CodeGen/X86/2009-02-25-CommuteBug.ll b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
index 7ea6998..9cbf350 100644
--- a/test/CodeGen/X86/2009-02-25-CommuteBug.ll
+++ b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& not grep commuted
+; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | not grep commuted
; rdar://6608609
define <2 x double> @t(<2 x double> %A, <2 x double> %B, <2 x double> %C) nounwind readnone {
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 0b5b7bd..d50fe6f 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -stats |& grep {8 machine-licm}
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn -stats 2>&1 | grep "5 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn | FileCheck %s
; rdar://6627786
; rdar://7792037
diff --git a/test/CodeGen/X86/2009-03-12-CPAlignBug.ll b/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
index 3564f01..847a43f 100644
--- a/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
+++ b/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | not grep {.space}
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | not grep ".space"
; rdar://6668548
declare double @llvm.sqrt.f64(double) nounwind readonly
diff --git a/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
index 8bbdb0e..d934ec9 100644
--- a/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
+++ b/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-linux -relocation-model=static -o /dev/null -stats -info-output-file - > %t
; RUN: not grep spill %t
-; RUN: not grep {%rsp} %t
-; RUN: not grep {%rbp} %t
+; RUN: not grep "%rsp" %t
+; RUN: not grep "%rbp" %t
; The register-pressure scheduler should be able to schedule this in a
; way that does not require spills.
diff --git a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
index f46eed4..ad18a0c 100644
--- a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
+++ b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of modref unfolded}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats 2>&1 | grep "Number of modref unfolded"
; XFAIL: *
; 69408 removed the opportunity for this optimization to work
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
index 9f5a8c5..5cb05e8 100644
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN: -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
+; RUN: -mcpu=generic -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
; RUN: FileCheck %s
; rdar://6808032
diff --git a/test/CodeGen/X86/2009-04-24.ll b/test/CodeGen/X86/2009-04-24.ll
index d6ed0c4..08bf9e3 100644
--- a/test/CodeGen/X86/2009-04-24.ll
+++ b/test/CodeGen/X86/2009-04-24.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -regalloc=fast -relocation-model=pic > %t2
-; RUN: grep {leaq.*TLSGD} %t2
-; RUN: grep {__tls_get_addr} %t2
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -regalloc=fast -optimize-regalloc=0 -relocation-model=pic > %t2
+; RUN: grep "leaq.*TLSGD" %t2
+; RUN: grep "__tls_get_addr" %t2
; PR4004
@i = thread_local global i32 15
diff --git a/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll b/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
index a2fd2e4..a6ed74b 100644
--- a/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
+++ b/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl.*%ebx, 8(%esi)}
+; RUN: llc < %s | grep "movl.*%ebx, 8(%esi)"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.0"
diff --git a/test/CodeGen/X86/2009-05-30-ISelBug.ll b/test/CodeGen/X86/2009-05-30-ISelBug.ll
index af552d4..fe04272 100644
--- a/test/CodeGen/X86/2009-05-30-ISelBug.ll
+++ b/test/CodeGen/X86/2009-05-30-ISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | not grep {movzbl %\[abcd\]h,}
+; RUN: llc < %s -march=x86-64 | not grep "movzbl %[abcd]h,"
define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(i32*, i32 %c_nblock_used.2.i, i32 %.reload51, i32* %.out, i32* %.out1, i32* %.out2, i32* %.out3) nounwind {
newFuncRoot:
diff --git a/test/CodeGen/X86/20090313-signext.ll b/test/CodeGen/X86/20090313-signext.ll
index de930d5..b8effa6 100644
--- a/test/CodeGen/X86/20090313-signext.ll
+++ b/test/CodeGen/X86/20090313-signext.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86-64 -relocation-model=pic > %t
-; RUN: grep {movswl %ax, %edi} %t
-; RUN: grep {movw (%rax), %ax} %t
+; RUN: grep "movswl %ax, %edi" %t
+; RUN: grep "movw (%rax), %ax" %t
; XFAIL: *
@x = common global i16 0
diff --git a/test/CodeGen/X86/2010-01-19-OptExtBug.ll b/test/CodeGen/X86/2010-01-19-OptExtBug.ll
index cd8960b..eb4a5c0 100644
--- a/test/CodeGen/X86/2010-01-19-OptExtBug.ll
+++ b/test/CodeGen/X86/2010-01-19-OptExtBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -relocation-model=pic -disable-fp-elim -stats |& not grep ext-opt
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -relocation-model=pic -disable-fp-elim -stats 2>&1 | not grep ext-opt
define fastcc i8* @S_scan_str(i8* %start, i32 %keep_quoted, i32 %keep_delims) nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll b/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
index 90eb84d..35f2339 100644
--- a/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
+++ b/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast %s -o %t
+; RUN: llc -regalloc=fast -optimize-regalloc=0 %s -o %t
; PR7066
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
index 36a99d6..eb0b150 100644
--- a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
+++ b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast -verify-machineinstrs < %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin"
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 4639866..9b47bb7 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast < %s | FileCheck %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 < %s | FileCheck %s
; PR7382
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
index c6f4b49..be10ad5 100644
--- a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
+++ b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
@@ -12,9 +12,9 @@ declare hidden fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalData
; Avoid hoisting the test above loads or copies
; CHECK: %entry
-; CHECK: cmpq
+; CHECK: test
; CHECK-NOT: mov
-; CHECK: jb
+; CHECK: je
define i32 @cti_op_eq(i8** nocapture %args) nounwind ssp {
entry:
%0 = load i8** null, align 8
diff --git a/test/CodeGen/X86/2011-04-19-sclr-bb.ll b/test/CodeGen/X86/2011-04-19-sclr-bb.ll
new file mode 100644
index 0000000..771e4b3
--- /dev/null
+++ b/test/CodeGen/X86/2011-04-19-sclr-bb.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+
+; Make sure that values of illegal types are not scalarized between basic blocks.
+;CHECK: test
+;CHECK-NOT: pinsrw
+;CHECK-NOT: pextrb
+;CHECK: ret
+define void @test(i1 %cond) {
+ENTRY:
+ br label %LOOP
+LOOP:
+ %vec1 = phi <4 x i1> [ %vec1_or_2, %LOOP ], [ zeroinitializer, %ENTRY ]
+ %vec2 = phi <4 x i1> [ %vec2_and_1, %LOOP ], [ zeroinitializer, %ENTRY ]
+ %vec1_or_2 = or <4 x i1> %vec1, %vec2
+ %vec2_and_1 = and <4 x i1> %vec2, %vec1
+ br i1 %cond, label %LOOP, label %EXIT
+
+EXIT:
+ ret void
+}
+
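An editorial aside on the CHECK-NOT lines in the test above (this note, like the others below, is not part of the imported patch): <4 x i1> is not a legal register type on x86, so if the backend loses track of such a value at a basic-block boundary it falls back to rebuilding it one lane at a time, which shows up as pinsrw/pextrb element traffic. A hedged sketch of the failure mode being guarded against:

; The two phis carry illegal <4 x i1> values around the loop; naive
; legalization at the block boundary would expand each one roughly as
;   %lane0 = extractelement <4 x i1> %v, i32 0              ; -> pextrb
;   ...
;   %v.new = insertelement <4 x i1> undef, i1 %lane0, i32 0 ; -> pinsrw
; instead of keeping the whole vector live in a single register.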
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
index bf7f583..ce63c74 100644
--- a/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse | FileCheck %s
define float @chainfail1(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
entry:
diff --git a/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
index a51dad0..47ef693 100644
--- a/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
+++ b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -disable-fp-elim -relocation-model=pic -stats |& FileCheck %s
+; RUN: llc < %s -O0 -disable-fp-elim -relocation-model=pic -stats 2>&1 | FileCheck %s
;
; This test should not cause any spilling with RAFast.
;
diff --git a/test/CodeGen/X86/2011-09-18-sse2cmp.ll b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
index 844d674..a6f428f 100644
--- a/test/CodeGen/X86/2011-09-18-sse2cmp.ll
+++ b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=x86 -mcpu=yonah -promote-elements -mattr=+sse2,-sse41 | FileCheck %s
+;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse41 | FileCheck %s
;CHECK: @max
;CHECK: cmplepd
diff --git a/test/CodeGen/X86/2011-09-21-setcc-bug.ll b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
index ed5649c..4daf678 100644
--- a/test/CodeGen/X86/2011-09-21-setcc-bug.ll
+++ b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41
; Make sure we are not crashing on this code.
diff --git a/test/CodeGen/X86/2011-10-11-srl.ll b/test/CodeGen/X86/2011-10-11-srl.ll
index cf9d36f..6c6d340 100644
--- a/test/CodeGen/X86/2011-10-11-srl.ll
+++ b/test/CodeGen/X86/2011-10-11-srl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -promote-elements -mattr=-sse41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-sse41
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-12-15-vec_shift.ll b/test/CodeGen/X86/2011-12-15-vec_shift.ll
index 6f9188c..dc3a08b 100644
--- a/test/CodeGen/X86/2011-12-15-vec_shift.ll
+++ b/test/CodeGen/X86/2011-12-15-vec_shift.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mattr=+sse41 < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
-; RUN: llc -march=x86-64 -mattr=-sse41 < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
+; RUN: llc -march=x86-64 -mattr=+sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
+; RUN: llc -march=x86-64 -mattr=-sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
; Test case for r146671
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2012-02-20-MachineCPBug.ll b/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
index 557d49d..477b4de 100644
--- a/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
+++ b/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=core2 -mattr=+sse | FileCheck %s
; PR11940: Do not optimize away movb %al, %ch
%struct.APInt = type { i64* }
diff --git a/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll b/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
index 101ecca..18a3313 100644
--- a/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
+++ b/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -stats |& \
-; RUN: not grep {Number of machine instructions hoisted out of loops post regalloc}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -stats 2>&1 | \
+; RUN: not grep "Number of machine instructions hoisted out of loops post regalloc"
; rdar://11095580
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9543587..9a66b67 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,8 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK: func:
-;CHECK: vmovups
+;CHECK: vpxor
+;CHECK: vinsertf128
;CHECK: vpshufd
;CHECK: vpshufd
;CHECK: vmulps
diff --git a/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll b/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll
new file mode 100644
index 0000000..171c3f1
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=source | FileCheck %s
+
+; Teach the two-address pass to update the "source" map so it doesn't perform a
+; non-profitable commute using outdated info. The test case would still fail
+; because of a poor pre-RA schedule; that will be fixed by the MI scheduler.
+; rdar://11472010
+define i32 @t(i32 %mask) nounwind readnone ssp {
+entry:
+; CHECK: t:
+; CHECK-NOT: mov
+ %sub = add i32 %mask, -65535
+ %shr = lshr i32 %sub, 23
+ %and = and i32 %mask, 1
+ %add = add i32 %shr, %and
+ ret i32 %add
+}
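A hedged sketch of the hazard the comment above describes (editorial, not part of the patch): most x86 integer ALU instructions are two-address, so the destination is also a source and is destroyed:

;   addl %ecx, %eax    ; %eax = %eax + %ecx  -- clobbers %eax
;   addl %eax, %ecx    ; %ecx = %ecx + %eax  -- clobbers %ecx
; Commuting the add therefore decides which operand survives. If the
; two-address pass commutes based on a stale "source" map, the value
; still needed afterwards may be the one clobbered, forcing the extra
; register copy that the CHECK-NOT above forbids.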
diff --git a/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll b/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll
new file mode 100644
index 0000000..837fbc0
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -verify-coalescing
+; PR12892
+;
+; Dead code elimination during coalescing causes a live range to split into two
+; virtual registers. Stale identity copies that had already been joined were
+; interfering with the liveness computations.
+
+target triple = "i386-pc-linux-gnu"
+
+define void @_ZN4llvm17AsmMatcherEmitter3runERNS_11raw_ostreamE() align 2 {
+ invoke void @_ZNK4llvm13CodeGenTarget12getAsmParserEv()
+ to label %1 unwind label %5
+
+; <label>:1 ; preds = %0
+ invoke void @_ZNK4llvm6Record16getValueAsStringENS_9StringRefE()
+ to label %4 unwind label %2
+
+; <label>:2 ; preds = %1
+ %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ unreachable
+
+; <label>:4 ; preds = %1
+ invoke void @_ZN4llvm18isCurrentDebugTypeEPKc()
+ to label %12 unwind label %7
+
+; <label>:5 ; preds = %0
+ %6 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %33
+
+; <label>:7 ; preds = %4
+ %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %9
+
+; <label>:9 ; preds = %28, %7
+ %10 = phi { i8*, i32 } [ %29, %28 ], [ %8, %7 ]
+ %11 = extractvalue { i8*, i32 } %10, 1
+ invoke fastcc void @_ZN12_GLOBAL__N_114AsmMatcherInfoD2Ev()
+ to label %32 unwind label %35
+
+; <label>:12 ; preds = %4
+ invoke void @_ZNK4llvm13CodeGenTarget10getRegBankEv()
+ to label %13 unwind label %16
+
+; <label>:13 ; preds = %12
+ br label %14
+
+; <label>:14 ; preds = %20, %13
+ %15 = icmp eq i32 undef, 0
+ br i1 %15, label %20, label %18
+
+; <label>:16 ; preds = %12
+ %17 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %26
+
+; <label>:18 ; preds = %14
+ invoke void @_ZNSs4_Rep9_S_createEjjRKSaIcE()
+ to label %19 unwind label %21
+
+; <label>:19 ; preds = %18
+ unreachable
+
+; <label>:20 ; preds = %14
+ br label %14
+
+; <label>:21 ; preds = %18
+ %22 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ %23 = extractvalue { i8*, i32 } %22, 1
+ br i1 undef, label %26, label %24
+
+; <label>:24 ; preds = %21
+ br i1 undef, label %25, label %26
+
+; <label>:25 ; preds = %24
+ unreachable
+
+; <label>:26 ; preds = %24, %21, %16
+ %27 = phi i32 [ 0, %16 ], [ %23, %21 ], [ %23, %24 ]
+ invoke void @_ZNSt6vectorISt4pairISsSsESaIS1_EED1Ev()
+ to label %28 unwind label %30
+
+; <label>:28 ; preds = %26
+ %29 = insertvalue { i8*, i32 } undef, i32 %27, 1
+ br label %9
+
+; <label>:30 ; preds = %26
+ %31 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ unreachable
+
+; <label>:32 ; preds = %9
+ br label %33
+
+; <label>:33 ; preds = %32, %5
+ %34 = phi i32 [ undef, %5 ], [ %11, %32 ]
+ unreachable
+
+; <label>:35 ; preds = %9
+ %36 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ unreachable
+}
+
+declare void @_ZNK4llvm13CodeGenTarget12getAsmParserEv()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZNK4llvm6Record16getValueAsStringENS_9StringRefE()
+
+declare void @_ZN4llvm18isCurrentDebugTypeEPKc()
+
+declare fastcc void @_ZN12_GLOBAL__N_114AsmMatcherInfoD2Ev() unnamed_addr inlinehint align 2
+
+declare hidden void @_ZNSt6vectorISt4pairISsSsESaIS1_EED1Ev() unnamed_addr align 2
+
+declare void @_ZNSs4_Rep9_S_createEjjRKSaIcE()
+
+declare void @_ZNK4llvm13CodeGenTarget10getRegBankEv()
diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll
new file mode 100644
index 0000000..1c1e8e2
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-19-avx2-store.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
+
+define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
+entry:
+ ; CHECK: vmovaps
+ ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
+ ; CHECK: vmovups
+ %A = load <4 x i32>* %Ap
+ %B = load <4 x i32>* %Bp
+ %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i32> %Z, <8 x i32>* %P, align 16
+ ret void
+}
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
new file mode 100644
index 0000000..906b748
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 | FileCheck %s
+
+; CHECK: load_store
+define void @load_store(<4 x i16>* %in) {
+entry:
+; CHECK: movsd
+ %A27 = load <4 x i16>* %in, align 4
+ %A28 = add <4 x i16> %A27, %A27
+; CHECK: movlpd
+ store <4 x i16> %A28, <4 x i16>* %in, align 4
+ ret void
+; CHECK: ret
+}
+
+; Make sure that we store a 64-bit value, even on 32-bit systems.
+;CHECK: store_64
+define void @store_64(<2 x i32>* %ptr) {
+BB:
+ store <2 x i32> zeroinitializer, <2 x i32>* %ptr
+ ret void
+;CHECK: movlpd
+;CHECK: ret
+}
+
+;CHECK: load_64
+define <2 x i32> @load_64(<2 x i32>* %ptr) {
+BB:
+ %t = load <2 x i32>* %ptr
+ ret <2 x i32> %t
+;CHECK: movsd
+;CHECK: ret
+}
diff --git a/test/CodeGen/X86/2012-07-10-shufnorm.ll b/test/CodeGen/X86/2012-07-10-shufnorm.ll
new file mode 100644
index 0000000..e39df58
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-10-shufnorm.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx | FileCheck %s
+
+; CHECK: ocl
+define void @ocl() {
+entry:
+ %vext = shufflevector <2 x double> zeroinitializer, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit = shufflevector <8 x double> %vext, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit1 = insertelement <8 x double> %vecinit, double undef, i32 2
+ %vecinit3 = insertelement <8 x double> %vecinit1, double undef, i32 3
+ %vecinit5 = insertelement <8 x double> %vecinit3, double 0.000000e+00, i32 4
+ %vecinit9 = shufflevector <8 x double> %vecinit5, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 10>
+ store <8 x double> %vecinit9, <8 x double>* undef
+ ret void
+; CHECK: vxorps
+; CHECK: ret
+}
+
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
new file mode 100644
index 0000000..3b7a8a7
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+
+declare x86_fastcallcc i64 @barrier()
+
+;CHECK: bcast_fold
+;CHECK: vmovaps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]]
+;CHECK: barrier
+;CHECK: vbroadcastss [[SPILLED]], %ymm0
+;CHECK: ret
+define <8 x float> @bcast_fold( float* %A) {
+BB:
+ %A0 = load float* %A
+ %tt3 = call x86_fastcallcc i64 @barrier()
+ br i1 undef, label %work, label %exit
+
+work:
+ %A1 = insertelement <8 x float> undef, float %A0, i32 0
+ %A2 = shufflevector <8 x float> %A1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %A2
+
+exit:
+ ret <8 x float> undef
+}
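A short editorial note on the FileCheck syntax used in this test: [[NAME:regex]] captures the text matched by regex under NAME, and a later bare [[NAME]] must match exactly the same text again. A minimal sketch under that reading (hypothetical operands, not taken from real test output):

;CHECK: vmovaps %xmm0, [[SLOT:[0-9]+\(%esp\)]]
;CHECK: vbroadcastss [[SLOT]], %ymm0
; The spill slot captured at the store must reappear verbatim at the
; broadcast, i.e. the broadcast is folded from the same stack slot.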
diff --git a/test/CodeGen/X86/2012-07-15-tconst_shl.ll b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
new file mode 100644
index 0000000..46eca76
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+avx2
+; Make sure that we are not crashing.
+
+define <16 x i32> @autogen_SD34717() {
+BB:
+ %Shuff7 = shufflevector <16 x i32> zeroinitializer, <16 x i32> zeroinitializer, <16 x i32> <i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 undef, i32 22, i32 24, i32 26, i32 28, i32 30, i32 undef>
+ %B9 = lshr <16 x i32> zeroinitializer, %Shuff7
+ ret <16 x i32> %B9
+}
diff --git a/test/CodeGen/X86/2012-07-15-vshl.ll b/test/CodeGen/X86/2012-07-15-vshl.ll
new file mode 100644
index 0000000..cd0fef4
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-vshl.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx
+; PR13352
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define void @f_f() nounwind {
+allocas:
+ br label %for_loop29
+
+for_loop29: ; preds = %safe_if_after_true, %allocas
+ %indvars.iv596 = phi i64 [ %indvars.iv.next597, %safe_if_after_true ], [ 0, %allocas ]
+ %0 = trunc i64 %indvars.iv596 to i32
+ %smear.15 = insertelement <16 x i32> undef, i32 %0, i32 15
+ %bitop = lshr <16 x i32> zeroinitializer, %smear.15
+ %bitop35 = and <16 x i32> %bitop, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %bitop35_to_bool = icmp ne <16 x i32> %bitop35, zeroinitializer
+ %val_to_boolvec32 = sext <16 x i1> %bitop35_to_bool to <16 x i32>
+ %floatmask.i526 = bitcast <16 x i32> %val_to_boolvec32 to <16 x float>
+ %mask1.i529 = shufflevector <16 x float> %floatmask.i526, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %"internal_mask&function_mask41_any" = icmp eq i32 undef, 0
+ br i1 %"internal_mask&function_mask41_any", label %safe_if_after_true, label %safe_if_run_true
+
+safe_if_after_true: ; preds = %for_loop29
+ %indvars.iv.next597 = add i64 %indvars.iv596, 1
+ br label %for_loop29
+
+safe_if_run_true: ; preds = %for_loop29
+ %blend1.i583 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> undef, <8 x float> undef, <8 x float> %mask1.i529) nounwind
+ unreachable
+}
+
diff --git a/test/CodeGen/X86/2012-07-16-LeaUndef.ll b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
new file mode 100644
index 0000000..9e5cbd2
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD2543() {
+A:
+ %E83 = add i32 0, 1
+ %E820 = add i32 0, undef
+ br label %C
+C:
+ %B908 = add i32 %E83, %E820
+ store i32 %B908, i32* undef
+ %Sl2391 = select i1 undef, i32 undef, i32 %E83
+ %Cmp3114 = icmp ne i32 %Sl2391, undef
+ br i1 %Cmp3114, label %C, label %G
+G:
+ ret void
+}
diff --git a/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
new file mode 100644
index 0000000..17533a1
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD3100() {
+BB:
+ %FC123 = fptoui float 0x40693F5D00000000 to i1
+ br i1 %FC123, label %V, label %W
+
+V:
+ ret void
+W:
+ ret void
+}
diff --git a/test/CodeGen/X86/2012-07-17-vtrunc.ll b/test/CodeGen/X86/2012-07-17-vtrunc.ll
new file mode 100644
index 0000000..2de2f97
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-17-vtrunc.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD33189483() {
+BB:
+ br label %CF76
+
+CF76: ; preds = %CF76, %BB
+ %Shuff13 = shufflevector <4 x i64> zeroinitializer, <4 x i64> undef, <4 x i32> zeroinitializer
+ %Tr16 = trunc <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i1>
+ %E19 = extractelement <8 x i1> %Tr16, i32 2
+ br i1 %E19, label %CF76, label %CF78
+
+CF78: ; preds = %CF78, %CF76
+ %BC = bitcast <4 x i64> %Shuff13 to <4 x double>
+ br label %CF78
+}
diff --git a/test/CodeGen/X86/2012-07-23-select_cc.ll b/test/CodeGen/X86/2012-07-23-select_cc.ll
new file mode 100644
index 0000000..33fcb12
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-23-select_cc.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR 13428
+
+declare void @use(double)
+
+define void @test() {
+entry:
+ call void @use(double 1.000000e+00)
+ %A = icmp eq i64 undef, 2
+ %B = zext i1 %A to i32
+ %C = sitofp i32 %B to double
+ call void @use(double %C)
+ call void @use(double 0.000000e+00)
+ unreachable
+}
diff --git a/test/CodeGen/X86/2012-08-07-CmpISelBug.ll b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
new file mode 100644
index 0000000..000b853
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+
+; Cmp lowering should not look past the truncate unless the high bits are known
+; zero.
+; rdar://12027825
+
+define void @foo(i8 %arg4, i32 %arg5, i32* %arg14) nounwind {
+bb:
+; CHECK: foo:
+; CHECK-NOT: testl
+; CHECK: testb
+ %tmp48 = zext i8 %arg4 to i32
+ %tmp49 = and i32 %tmp48, 32
+ %tmp50 = add i32 %tmp49, 1593371643
+ %tmp55 = sub i32 %tmp50, 0
+ %tmp56 = add i32 %tmp55, 7787538
+ %tmp57 = xor i32 %tmp56, 1601159181
+ %tmp58 = xor i32 %arg5, 1601159181
+ %tmp59 = and i32 %tmp57, %tmp58
+ %tmp60 = add i32 %tmp59, -1263900958
+ %tmp67 = sub i32 %tmp60, 0
+ %tmp103 = xor i32 %tmp56, 13
+ %tmp104 = trunc i32 %tmp103 to i8
+ %tmp105 = sub i8 0, %tmp104
+ %tmp106 = add i8 %tmp105, -103
+ %tmp113 = sub i8 %tmp106, 0
+ %tmp114 = add i8 %tmp113, -72
+ %tmp141 = icmp ne i32 %tmp67, -1263900958
+ %tmp142 = select i1 %tmp141, i8 %tmp114, i8 undef
+ %tmp143 = xor i8 %tmp142, 81
+ %tmp144 = zext i8 %tmp143 to i32
+ %tmp145 = add i32 %tmp144, 2062143348
+ %tmp152 = sub i32 %tmp145, 0
+ store i32 %tmp152, i32* %arg14
+ ret void
+}
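As an editorial illustration of the invariant stated in the comment at the top of this test (the function below is not part of the import), a compare of a truncated value is only equivalent to a compare of the wide value when the dropped high bits are known to be zero. For %x = 256 the low byte is 0, so the two compares below disagree:

define i1 @narrow_vs_wide(i32 %x) nounwind {
entry:
  %t = trunc i32 %x to i8         ; drops the high 24 bits
  %narrow = icmp eq i8 %t, 0      ; true for %x = 256
  %wide = icmp eq i32 %x, 0       ; false for %x = 256
  %differ = xor i1 %narrow, %wide
  ret i1 %differ                  ; true exactly when the compares disagree
}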
diff --git a/test/CodeGen/X86/4char-promote.ll b/test/CodeGen/X86/4char-promote.ll
index 386057f..4f1a859 100644
--- a/test/CodeGen/X86/4char-promote.ll
+++ b/test/CodeGen/X86/4char-promote.ll
@@ -1,11 +1,12 @@
; A test for checking PR 9623
-;RUN: llc -march=x86-64 -mcpu=corei7 -promote-elements < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
target triple = "x86_64-apple-darwin"
-; CHECK: pmulld
-; CHECK: paddd
-; CHECK: movdqa
+; CHECK: pmulld
+; CHECK: paddd
+; CHECK-NOT: movdqa
+; CHECK: ret
define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) {
entry:
diff --git a/test/CodeGen/X86/MachineSink-PHIUse.ll b/test/CodeGen/X86/MachineSink-PHIUse.ll
index 3758fd8..3314168 100644
--- a/test/CodeGen/X86/MachineSink-PHIUse.ll
+++ b/test/CodeGen/X86/MachineSink-PHIUse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-appel-darwin -disable-cgp-branch-opts -stats |& grep {machine-sink}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-cgp-branch-opts -stats 2>&1 | grep "machine-sink"
define fastcc void @t() nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index 8e871f4..03d2e47 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -1,8 +1,6 @@
; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -join-physregs | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 -join-physregs | FileCheck %s -check-prefix=X64
-
-; Some of these tests depend on -join-physregs to commute instructions.
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
; The immediate can be encoded in a smaller way if the
; instruction is a sub instead of an add.
@@ -101,9 +99,9 @@ define {i32, i1} @test7(i32 %v1, i32 %v2) nounwind {
}
; X64: test7:
-; X64: addl %e[[A1]], %eax
+; X64: addl %e[[A1]], %e
; X64-NEXT: setb %dl
-; X64-NEXT: ret
+; X64: ret
; PR5443
define {i64, i1} @test8(i64 %left, i64 %right) nounwind {
diff --git a/test/CodeGen/X86/addr-label-difference.ll b/test/CodeGen/X86/addr-label-difference.ll
index 49abd8a..15fbec5 100644
--- a/test/CodeGen/X86/addr-label-difference.ll
+++ b/test/CodeGen/X86/addr-label-difference.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | grep {__TEXT,__const}
+; RUN: llc %s -o - | grep "__TEXT,__const"
; PR5929
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin10.0"
diff --git a/test/CodeGen/X86/aligned-comm.ll b/test/CodeGen/X86/aligned-comm.ll
index 7715869..eab02cc 100644
--- a/test/CodeGen/X86/aligned-comm.ll
+++ b/test/CodeGen/X86/aligned-comm.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep {array,16512,7}
-; RUN: llc < %s -mtriple=i386-apple-darwin9 | grep {array,16512,7}
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep "array,16512,7"
+; RUN: llc < %s -mtriple=i386-apple-darwin9 | grep "array,16512,7"
; Darwin 9+ should get alignment on common symbols.
@array = common global [4128 x i32] zeroinitializer, align 128
diff --git a/test/CodeGen/X86/alignment-2.ll b/test/CodeGen/X86/alignment-2.ll
index cc709b5..1f9e85c 100644
--- a/test/CodeGen/X86/alignment-2.ll
+++ b/test/CodeGen/X86/alignment-2.ll
@@ -18,7 +18,9 @@
define signext i8 @do_lo_list() nounwind optsize ssp {
bb:
; CHECK: do_lo_list
-; CHECK-NOT: movaps
+; Make sure we do not use movaps for the global variable.
+; It is okay to use movaps for writing the local variable on the stack.
+; CHECK-NOT: movaps {{[0-9]*}}(%{{[a-z]*}}), {{%xmm[0-9]}}
%myopt = alloca %struct.printQueryOpt, align 4
%tmp = bitcast %struct.printQueryOpt* %myopt to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* bitcast (%struct.printQueryOpt* getelementptr inbounds (%struct._psqlSettings* @pset, i32 0, i32 4) to i8*), i32 76, i32 4, i1 false)
diff --git a/test/CodeGen/X86/alloca-align-rounding-32.ll b/test/CodeGen/X86/alloca-align-rounding-32.ll
index c0f1a18..a45284e 100644
--- a/test/CodeGen/X86/alloca-align-rounding-32.ll
+++ b/test/CodeGen/X86/alloca-align-rounding-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | grep and | count 1
+; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
declare void @bar(<2 x i64>* %n)
@@ -6,10 +6,15 @@ define void @foo(i32 %h) {
%p = alloca <2 x i64>, i32 %h
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo
+; CHECK-NOT: andl $-32, %eax
}
define void @foo2(i32 %h) {
%p = alloca <2 x i64>, i32 %h, align 32
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo2
+; CHECK: andl $-32, %esp
+; CHECK: andl $-32, %eax
}
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 3c87dbf..3d76fb0 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | grep and | count 1
+; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | FileCheck %s
declare void @bar(<2 x i64>* %n)
@@ -6,10 +6,15 @@ define void @foo(i64 %h) {
%p = alloca <2 x i64>, i64 %h
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo
+; CHECK-NOT: andq $-32, %rax
}
define void @foo2(i64 %h) {
%p = alloca <2 x i64>, i64 %h, align 32
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo2
+; CHECK: andq $-32, %rsp
+; CHECK: andq $-32, %rax
}
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index a3dc85f..640237d 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding -join-physregs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding | FileCheck %s
; PR8365
; CHECK: andl $-64, %edi # encoding: [0x83,0xe7,0xc0]
diff --git a/test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll b/test/CodeGen/X86/asm-reg-type-mismatch.ll
index f0d46a0..47accdb 100644
--- a/test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll
+++ b/test/CodeGen/X86/asm-reg-type-mismatch.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mcpu=core2 | grep xorps | count 2
-; RUN: llc < %s -mcpu=core2 | not grep movap
+; RUN: llc < %s -mcpu=core2 | FileCheck %s
; PR2715
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -11,8 +10,22 @@ target triple = "x86_64-unknown-linux-gnu"
%struct.nsXPTCVariant = type { %struct.nsXPTCMiniVariant, i8*, %struct.nsXPTType, i8 }
%struct.nsXPTType = type { %struct.XPTTypeDescriptorPrefix }
-define i32 @XPTC_InvokeByIndex(%struct.nsISupports* %that, i32 %methodIndex, i32 %paramCount, %struct.nsXPTCVariant* %params) nounwind {
+define i32 @test1(%struct.nsISupports* %that, i32 %methodIndex, i32 %paramCount, %struct.nsXPTCVariant* %params) nounwind {
entry:
call void asm sideeffect "", "{xmm0},{xmm1},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},~{dirflag},~{fpsr},~{flags}"( double undef, double undef, double undef, double 1.0, double undef, double 0.0, double undef, double 0.0 ) nounwind
ret i32 0
+ ; CHECK: test1
+ ; CHECK-NOT: movap
+ ; CHECK: xorps
+ ; CHECK: xorps
+ ; CHECK-NOT: movap
+}
+
+define i64 @test2() nounwind {
+entry:
+ %0 = tail call i64 asm sideeffect "movq $1, $0", "={xmm7},*m,~{dirflag},~{fpsr},~{flags}"(i64* null) nounwind
+ ret i64 %0
+ ; CHECK: test2
+ ; CHECK: movq {{.*}}, %xmm7
+ ; CHECK: movd %xmm7, %rax
}
diff --git a/test/CodeGen/X86/atom-lea-sp.ll b/test/CodeGen/X86/atom-lea-sp.ll
index 5942788..19482e1 100644
--- a/test/CodeGen/X86/atom-lea-sp.ll
+++ b/test/CodeGen/X86/atom-lea-sp.ll
@@ -1,15 +1,15 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck -check-prefix=atom %s
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck -check-prefix=ATOM %s
; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck %s
declare void @use_arr(i8*)
declare void @many_params(i32, i32, i32, i32, i32, i32)
define void @test1() nounwind {
-; atom: test1:
-; atom: leal -1052(%esp), %esp
-; atom-NOT: sub
-; atom: call
-; atom: leal 1052(%esp), %esp
+; ATOM: test1:
+; ATOM: leal -1052(%esp), %esp
+; ATOM-NOT: sub
+; ATOM: call
+; ATOM: leal 1052(%esp), %esp
; CHECK: test1:
; CHECK: subl
@@ -22,10 +22,10 @@ define void @test1() nounwind {
}
define void @test2() nounwind {
-; atom: test2:
-; atom: leal -28(%esp), %esp
-; atom: call
-; atom: leal 28(%esp), %esp
+; ATOM: test2:
+; ATOM: leal -28(%esp), %esp
+; ATOM: call
+; ATOM: leal 28(%esp), %esp
; CHECK: test2:
; CHECK-NOT: lea
@@ -34,9 +34,9 @@ define void @test2() nounwind {
}
define void @test3() nounwind {
-; atom: test3:
-; atom: leal -8(%esp), %esp
-; atom: leal 8(%esp), %esp
+; ATOM: test3:
+; ATOM: leal -8(%esp), %esp
+; ATOM: leal 8(%esp), %esp
; CHECK: test3:
; CHECK-NOT: lea
diff --git a/test/CodeGen/X86/atom-sched.ll b/test/CodeGen/X86/atom-sched.ll
index 4dd9a9e..0d97e85 100644
--- a/test/CodeGen/X86/atom-sched.ll
+++ b/test/CodeGen/X86/atom-sched.ll
@@ -1,9 +1,6 @@
-; XFAIL: *
; RUN: llc <%s -O2 -mcpu=atom -march=x86 -relocation-model=static | FileCheck -check-prefix=atom %s
; RUN: llc <%s -O2 -mcpu=core2 -march=x86 -relocation-model=static | FileCheck %s
;
-; FIXME: Atom's scheduler is temporarily disabled.
-; XFAIL: *
@a = common global i32 0, align 4
@b = common global i32 0, align 4
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index 7c5abe2..152bece 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index 7729491..188efe2 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -promote-elements -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; AVX128 tests:
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index b334932..c44beb4 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK: vaesdec
@@ -1154,7 +1154,7 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: seta
%res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1165,7 +1165,7 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: sbbl
%res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1176,7 +1176,7 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: seto
%res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1187,7 +1187,7 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: sets
%res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1198,7 +1198,7 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: sete
%res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1209,6 +1209,7 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestrm
+ ; CHECK-NOT: vmov
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1226,7 +1227,7 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: seta
%res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1235,7 +1236,7 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: sbbl
%res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1244,7 +1245,7 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: seto
%res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1253,7 +1254,7 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: sets
%res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1262,7 +1263,7 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: sete
%res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1271,6 +1272,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistrm
+ ; CHECK-NOT: vmov
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2555,3 +2557,36 @@ define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+
+; CHECK: movntdq
+define void @movnt_dq(i8* %p, <4 x i64> %a1) nounwind {
+ %a2 = add <4 x i64> %a1, <i64 1, i64 1, i64 1, i64 1>
+ tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a2) nounwind
+ ret void
+}
+declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
+
+; CHECK: movntps
+define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
+ tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
+ ret void
+}
+declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
+
+; CHECK: movntpd
+define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
+  ; The fadd forces the value into the double execution domain, so movntpd is selected.
+ %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+ tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
+ ret void
+}
+declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
+
+
+; Check for pclmulqdq
+define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK: vpclmulqdq
+ %res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-minmax.ll b/test/CodeGen/X86/avx-minmax.ll
index 7c58820..eff9251 100644
--- a/test/CodeGen/X86/avx-minmax.ll
+++ b/test/CodeGen/X86/avx-minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
; UNSAFE: maxpd:
; UNSAFE: vmaxpd {{.+}}, %xmm
diff --git a/test/CodeGen/X86/avx-shuffle-x86_32.ll b/test/CodeGen/X86/avx-shuffle-x86_32.ll
index 5268ec3a..e203c4e 100755
--- a/test/CodeGen/X86/avx-shuffle-x86_32.ll
+++ b/test/CodeGen/X86/avx-shuffle-x86_32.ll
@@ -4,5 +4,5 @@ define <4 x i64> @test1(<4 x i64> %a) nounwind {
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
ret <4 x i64>%b
; CHECK: test1:
- ; CHECK: vinsertf128
+ ; CHECK-NOT: vinsertf128
}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 16c447b..9b41709 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -90,8 +90,8 @@ define i32 @test9(<4 x i32> %a) nounwind {
; Extract a value which is the result of an undef mask.
define i32 @test10(<4 x i32> %a) nounwind {
; CHECK: @test10
-; CHECK-NEXT: #
-; CHECK-NEXT: ret
+; CHECK-NOT: {{^[^#]*[a-z]}}
+; CHECK: ret
%b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%r = extractelement <8 x i32> %b, i32 2
ret i32 %r
@@ -149,17 +149,26 @@ entry:
}
; PR12413
+; CHECK: shuf1
+; CHECK: vpshufb
+; CHECK: vpshufb
; CHECK: vpshufb
; CHECK: vpshufb
+define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
+entry:
+ %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+ ret <32 x i8> %0
+}
+
+; Handle the case where only half of the 256 bits is splittable.
+; CHECK: shuf2
; CHECK: vpshufb
; CHECK: vpshufb
-define <32 x i8> @shuf(<32 x i8> %inval1, <32 x i8> %inval2) {
+; CHECK: vpextrb
+; CHECK: vpextrb
+define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0,
-i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32
-22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32
-42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32
-62>
+ %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
ret <32 x i8> %0
}
@@ -202,3 +211,40 @@ define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
%t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
ret <4 x i64> %t
}
+
+; CHECK: narrow
+; CHECK: vpermilps
+; CHECK: ret
+define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
+ %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
+ ret <16 x i16> %t
+}
+
+;CHECK: test17
+;CHECK-NOT: vinsertf128
+;CHECK: ret
+define <8 x float> @test17(<4 x float> %y) {
+ %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x float> %x
+}
+
+; CHECK: test18
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
+ %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x float>%S
+}
+
+; CHECK: test19
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
+ %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x float>%S
+}
+
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 148ae73..0d403d4 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -112,3 +112,32 @@ entry:
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
+
+; CHECK: _RR
+; CHECK: vbroadcastss (%
+; CHECK: ret
+define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
+entry:
+ %q = load float* %ptr, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ ; force a chain
+ %j = load i32* %k, align 4
+ store i32 %j, i32* undef
+ ret <4 x float> %vecinit6.i
+}
+
+
+; CHECK: _RR2
+; CHECK: vbroadcastss (%
+; CHECK: ret
+define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
+entry:
+ %q = load float* %ptr, align 4
+ %v = insertelement <4 x float> undef, float %q, i32 0
+ %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %t
+}
+
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
new file mode 100755
index 0000000..b474913
--- /dev/null
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; CHECK: trunc4
+; CHECK: vpermd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
+ %B = trunc <4 x i64> %A to <4 x i32>
+ ret <4 x i32>%B
+}
+
+; CHECK: trunc8
+; CHECK: vpshufb
+; CHECK-NOT: vinsert
+; CHECK: ret
+
+define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
+ %B = trunc <8 x i32> %A to <8 x i16>
+ ret <8 x i16>%B
+}
+
+; CHECK: sext4
+; CHECK: vpmovsxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @sext4(<4 x i32> %A) nounwind {
+ %B = sext <4 x i32> %A to <4 x i64>
+ ret <4 x i64>%B
+}
+
+; CHECK: sext8
+; CHECK: vpmovsxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @sext8(<8 x i16> %A) nounwind {
+ %B = sext <8 x i16> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+
+; CHECK: zext4
+; CHECK: vpmovzxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @zext4(<4 x i32> %A) nounwind {
+ %B = zext <4 x i32> %A to <4 x i64>
+ ret <4 x i64>%B
+}
+
+; CHECK: zext8
+; CHECK: vpmovzxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @zext8(<8 x i16> %A) nounwind {
+ %B = zext <8 x i16> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+; CHECK: zext_8i8_8i32
+; CHECK: vpmovzxwd
+; CHECK: vpand
+; CHECK: ret
+define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
+ %B = zext <8 x i8> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+
+
+
+
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 3f27a02..a6141b0 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -976,3 +976,182 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
ret void
}
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
+ <4 x i32> %idx, <2 x double> %mask) {
+ ; CHECK: vgatherdpd
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
+ i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
+ <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x double> %mask) {
+ ; CHECK: vgatherdpd
+ %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
+ <4 x i32>, <4 x double>, i8) nounwind readonly
+
+define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
+ <2 x i64> %idx, <2 x double> %mask) {
+ ; CHECK: vgatherqpd
+ %res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
+ i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
+ <2 x i64>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x double> %mask) {
+ ; CHECK: vgatherqpd
+ %res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
+ <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x float> %mask) {
+ ; CHECK: vgatherdps
+ %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
+ <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
+ <8 x i32> %idx, <8 x float> %mask) {
+ ; CHECK: vgatherdps
+ %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
+ i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
+ <8 x i32>, <8 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
+ <2 x i64> %idx, <4 x float> %mask) {
+ ; CHECK: vgatherqps
+ %res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
+ i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
+ <2 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x float> %mask) {
+ ; CHECK: vgatherqps
+ %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
+ <4 x i64>, <4 x float>, i8) nounwind readonly
+
+define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
+ <4 x i32> %idx, <2 x i64> %mask) {
+ ; CHECK: vpgatherdq
+ %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
+ i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
+ <4 x i32>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x i64> %mask) {
+ ; CHECK: vpgatherdq
+ %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
+ <4 x i32>, <4 x i64>, i8) nounwind readonly
+
+define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
+ <2 x i64> %idx, <2 x i64> %mask) {
+ ; CHECK: vpgatherqq
+ %res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
+ i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
+ <2 x i64>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x i64> %mask) {
+ ; CHECK: vpgatherqq
+ %res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
+ <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x i32> %mask) {
+ ; CHECK: vpgatherdd
+ %res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
+ <4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
+ <8 x i32> %idx, <8 x i32> %mask) {
+ ; CHECK: vpgatherdd
+ %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
+ i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
+ <8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
+ <2 x i64> %idx, <4 x i32> %mask) {
+ ; CHECK: vpgatherqd
+ %res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
+ i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
+ <2 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x i32> %mask) {
+ ; CHECK: vpgatherqd
+ %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
+ <4 x i64>, <4 x i32>, i8) nounwind readonly
+
+; PR13298
+define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a,
+ <8 x i32> %idx, <8 x float> %mask,
+ float* nocapture %out) {
+; CHECK: test_gather_mask
+; CHECK: vmovdqa %ymm2, [[DEST:%.*]]
+; CHECK: vgatherdps [[DEST]]
+;; gather with mask
+ %a_i8 = bitcast float* %a to i8*
+ %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
+ i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
+
+;; for debugging, we'll just dump out the mask
+ %out_ptr = bitcast float * %out to <8 x float> *
+ store <8 x float> %mask, <8 x float> * %out_ptr, align 4
+
+ ret <8 x float> %res
+}
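An editorial note on why the vmovdqa checked above is needed (a hedged reading of the AVX2 gather semantics): vgatherdps uses its mask operand destructively, clearing it as elements are gathered, so roughly:

;   res[i] = (mask[i] < 0) ? mem[base + idx[i]*scale] : src[i]
;   mask   = 0 once the instruction completes
; Gathering directly out of %ymm2 would therefore destroy %mask, which
; this test still has to store afterwards, hence the copy to a scratch
; register first.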
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
new file mode 100644
index 0000000..c5899fa
--- /dev/null
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; Make sure that we don't match this shuffle using the YMM form of vpblendw.
+; The vpblendw immediate mask has to be identical for both halves of the YMM,
+; so two vpblendw instructions are needed here.
+
+; CHECK: blendw1
+; CHECK: vpblendw
+; CHECK: vpblendw
+; CHECK: ret
+define <16 x i16> @blendw1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+ %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+ ret <16 x i16> %t
+}
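A worked reading of the blendw1 mask above (editorial; assumes the usual vpblendw imm8 encoding in which bit i selects the second source for word i of a 128-bit half): the shuffle takes words 1, 2, 4, and 15 from %b.

;   low half:  words {1,2,4} from %b          -> imm8 = 0b00010110 = 0x16
;   high half: word 15 = word 7 of that half  -> imm8 = 0b10000000 = 0x80
; The YMM form of vpblendw replicates a single imm8 across both halves,
; and 0x16 != 0x80, so two 128-bit-style blends are required, matching
; the two vpblendw lines checked above.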
+
+; CHECK: vpshufhw $27, %ymm
+define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
+entry:
+ %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i16> %shuffle.i
+}
+
+; CHECK: vpshuflw $27, %ymm
+define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp {
+entry:
+ %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle.i
+}
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 1a78414..b804233 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -160,6 +160,15 @@ entry:
ret <8 x i32> %g
}
+; CHECK: V113
+; CHECK: vbroadcastss
+; CHECK: ret
+define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
+entry:
+ %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
+ ret <8 x float> %g
+}
+
; CHECK: _e2
; CHECK: vbroadcastss
; CHECK: ret
@@ -179,9 +188,170 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
%vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
%vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
- %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 3
- %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 3
- %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 3
- %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 3
+ %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
+ %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
+ %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
+ %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
ret <8 x i8> %vecinit7.i
}
+
+
+define void @crash() nounwind alwaysinline {
+WGLoopsEntry:
+ br i1 undef, label %ret, label %footer329VF
+
+footer329VF:
+ %A.0.inVF = fmul float undef, 6.553600e+04
+ %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
+ %A.0VF = fptosi float %A.0.inVF to i32
+ %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
+ %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %1 = and i32 %A.0VF, 65535
+ %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
+ %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
+ br i1 undef, label %preload1201VF, label %footer349VF
+
+preload1201VF:
+ br label %footer349VF
+
+footer349VF:
+ %2 = mul nsw <8 x i32> undef, %0
+ %3 = mul nsw <8 x i32> undef, %vector1099VF
+ br label %footer329VF
+
+ret:
+ ret void
+}
+
+; CHECK: _inreg0
+; CHECK: broadcastss
+; CHECK: ret
+define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <8 x i32> undef, i32 %scalar, i32 0
+ %wide = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %wide
+}
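+; (AVX2 added register-source forms of the broadcast instructions; under
+; plain AVX, vbroadcastss could only take a memory operand, so splat
+; shuffles like these previously required a shuffle sequence.)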
+
+; CHECK: _inreg1
+; CHECK: broadcastss
+; CHECK: ret
+define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <8 x float> undef, float %scalar, i32 0
+ %wide = shufflevector <8 x float> %in, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %wide
+}
+
+; CHECK: _inreg2
+; CHECK: broadcastss
+; CHECK: ret
+define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <4 x float> undef, float %scalar, i32 0
+ %wide = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %wide
+}
+
+; CHECK: _inreg3
+; CHECK: broadcastsd
+; CHECK: ret
+define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <4 x double> undef, double %scalar, i32 0
+ %wide = shufflevector <4 x double> %in, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %wide
+}
+
+;CHECK: _inreg8xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <8 x float> @_inreg8xfloat(<8 x float> %a) {
+ %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %b
+}
+
+;CHECK: _inreg4xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <4 x float> @_inreg4xfloat(<4 x float> %a) {
+ %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %b
+}
+
+;CHECK: _inreg16xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
+ %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %b
+}
+
+;CHECK: _inreg8xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
+ %b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %b
+}
+
+
+;CHECK: _inreg4xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
+ %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %b
+}
+
+;CHECK: _inreg2xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
+ %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %b
+}
+
+;CHECK: _inreg4xdouble
+;CHECK: vbroadcastsd
+;CHECK: ret
+define <4 x double> @_inreg4xdouble(<4 x double> %a) {
+ %b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %b
+}
+
+;CHECK: _inreg2xdouble
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x double> @_inreg2xdouble(<2 x double> %a) {
+ %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %b
+}
+
+;CHECK: _inreg8xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
+ %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %b
+}
+
+;CHECK: _inreg4xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
+ %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %b
+}
+
+;CHECK: _inreg32xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
+ %b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
+ ret <32 x i8> %b
+}
+
+;CHECK: _inreg16xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
+ %b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %b
+}
diff --git a/test/CodeGen/X86/basic-promote-integers.ll b/test/CodeGen/X86/basic-promote-integers.ll
index c80f2b0..fce6b7f 100644
--- a/test/CodeGen/X86/basic-promote-integers.ll
+++ b/test/CodeGen/X86/basic-promote-integers.ll
@@ -1,7 +1,7 @@
; Test that vectors are scalarized/lowered correctly
; (with both legalization methods).
-; RUN: llc -march=x86 -promote-elements < %s
-; RUN: llc -march=x86 < %s
+; RUN: llc -march=x86 < %s
+; RUN: llc -march=x86 < %s
; A simple test to check copyToParts and copyFromParts.
diff --git a/test/CodeGen/X86/bigstructret.ll b/test/CodeGen/X86/bigstructret.ll
index 633995d..3c499fa 100644
--- a/test/CodeGen/X86/bigstructret.ll
+++ b/test/CodeGen/X86/bigstructret.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -march=x86 -o %t
-; RUN: grep "movl .24601, 12(%ecx)" %t
-; RUN: grep "movl .48, 8(%ecx)" %t
-; RUN: grep "movl .24, 4(%ecx)" %t
-; RUN: grep "movl .12, (%ecx)" %t
+; RUN: llc < %s -march=x86 | FileCheck %s
%0 = type { i32, i32, i32, i32 }
+%1 = type { i1, i1, i1, i32 }
-define internal fastcc %0 @ReturnBigStruct() nounwind readnone {
+; CHECK: ReturnBigStruct
+; CHECK: movl $24601, 12(%ecx)
+; CHECK: movl $48, 8(%ecx)
+; CHECK: movl $24, 4(%ecx)
+; CHECK: movl $12, (%ecx)
+
+define fastcc %0 @ReturnBigStruct() nounwind readnone {
entry:
%0 = insertvalue %0 zeroinitializer, i32 12, 0
%1 = insertvalue %0 %0, i32 24, 1
@@ -15,3 +18,17 @@ entry:
ret %0 %3
}
+; CHECK: ReturnBigStruct2
+; CHECK: movl $48, 4(%ecx)
+; CHECK: movb $1, 2(%ecx)
+; CHECK: movb $1, 1(%ecx)
+; CHECK: movb $0, (%ecx)
+
+define fastcc %1 @ReturnBigStruct2() nounwind readnone {
+entry:
+ %0 = insertvalue %1 zeroinitializer, i1 false, 0
+ %1 = insertvalue %1 %0, i1 true, 1
+ %2 = insertvalue %1 %1, i1 true, 2
+ %3 = insertvalue %1 %2, i32 48, 3
+ ret %1 %3
+}
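+; (Each i1 field is stored as a full byte, so the three flags occupy
+; offsets 0 through 2 and the i32 lands at the 4-byte-aligned offset 4,
+; matching the stores checked above.)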
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 3a10c70..11f811f 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
; In this test we check that sign-extend of the mask bit is performed by
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index fc7b638..5534712 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -7,10 +7,15 @@ define i32 @test_ifchains(i32 %i, i32* %a, i32 %b) {
; that is not expected to run.
; CHECK: test_ifchains:
; CHECK: %entry
+; CHECK-NOT: .align
; CHECK: %else1
+; CHECK-NOT: .align
; CHECK: %else2
+; CHECK-NOT: .align
; CHECK: %else3
+; CHECK-NOT: .align
; CHECK: %else4
+; CHECK-NOT: .align
; CHECK: %exit
; CHECK: %then1
; CHECK: %then2
@@ -76,8 +81,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
; Check that we sink cold loop blocks after the hot loop body.
; CHECK: test_loop_cold_blocks:
; CHECK: %entry
+; CHECK-NOT: .align
; CHECK: %unlikely1
+; CHECK-NOT: .align
; CHECK: %unlikely2
+; CHECK: .align
; CHECK: %body1
; CHECK: %body2
; CHECK: %body3
@@ -634,7 +642,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
;
; CHECK: test_unnatural_cfg_backwards_inner_loop
; CHECK: %entry
-; CHECK: %body
+; CHECK: [[BODY:# BB#[0-9]+]]:
; CHECK: %loop2b
; CHECK: %loop1
; CHECK: %loop2a
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
new file mode 100644
index 0000000..0cb9fd9
--- /dev/null
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+
+define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
+ %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
+ %t2 = icmp ne i32 %t1, 0
+ %t3 = select i1 %t2, i32 %a, i32 %b
+ ret i32 %t3
+; CHECK: foo
+; CHECK: ptest
+; CHECK-NOT: testl
+; CHECK: cmov
+; CHECK: ret
+}
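+; (ptest already leaves the comparison result in ZF, so the cmov can
+; consume the flags directly; materializing the i32 result and testing it
+; again would be redundant.)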
+
+define i32 @bar(<2 x i64> %c) {
+entry:
+ %0 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
+ %1 = icmp ne i32 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry
+ ret i32 1
+; CHECK: bar
+; CHECK: ptest
+; CHECK-NOT: testl
+; CHECK: jne
+; CHECK: ret
+}
+
+define i32 @bax(<2 x i64> %c) {
+ %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
+ %t2 = icmp eq i32 %t1, 1
+ %t3 = zext i1 %t2 to i32
+ ret i32 %t3
+; CHECK: bax
+; CHECK: ptest
+; CHECK-NOT: cmpl
+; CHECK: ret
+}
+
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/X86/br-fold.ll b/test/CodeGen/X86/br-fold.ll
index 2c37194..5223463 100644
--- a/test/CodeGen/X86/br-fold.ll
+++ b/test/CodeGen/X86/br-fold.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=x86-64 < %s | FileCheck %s
; CHECK: orq
-; CHECK-NEXT: LBB0_1
+; CHECK-NEXT: %bb8.i329
@_ZN11xercesc_2_513SchemaSymbols21fgURI_SCHEMAFORSCHEMAE = external constant [33 x i16], align 32 ; <[33 x i16]*> [#uses=1]
@_ZN11xercesc_2_56XMLUni16fgNotationStringE = external constant [9 x i16], align 16 ; <[9 x i16]*> [#uses=1]
diff --git a/test/CodeGen/X86/break-anti-dependencies.ll b/test/CodeGen/X86/break-anti-dependencies.ll
index 93b2043..c942614 100644
--- a/test/CodeGen/X86/break-anti-dependencies.ll
+++ b/test/CodeGen/X86/break-anti-dependencies.ll
@@ -1,10 +1,12 @@
; Without list-burr scheduling we may not see the difference in codegen here.
-; RUN: llc < %s -march=x86-64 -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
-; RUN: grep {%xmm0} %t | count 14
-; RUN: not grep {%xmm1} %t
-; RUN: llc < %s -march=x86-64 -post-RA-scheduler -break-anti-dependencies=critical > %t
-; RUN: grep {%xmm0} %t | count 7
-; RUN: grep {%xmm1} %t | count 7
+; Use a subtarget that has post-RA scheduling enabled because the anti-dependency
+; breaker requires liveness information to be kept.
+; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
+; RUN: grep "%xmm0" %t | count 14
+; RUN: not grep "%xmm1" %t
+; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
+; RUN: grep "%xmm0" %t | count 7
+; RUN: grep "%xmm1" %t | count 7
define void @goo(double* %r, double* %p, double* %q) nounwind {
entry:
diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll
index 2dee575..4d80189 100644
--- a/test/CodeGen/X86/break-sse-dep.ll
+++ b/test/CodeGen/X86/break-sse-dep.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s
define double @t1(float* nocapture %x) nounwind readonly ssp {
entry:
@@ -34,8 +34,7 @@ entry:
define double @squirt(double* %x) nounwind {
entry:
; CHECK: squirt:
-; CHECK: movsd ([[A0]]), %xmm0
-; CHECK: sqrtsd %xmm0, %xmm0
+; CHECK: sqrtsd ([[A0]]), %xmm0
%z = load double* %x
%t = call double @llvm.sqrt.f64(double %z)
ret double %t
diff --git a/test/CodeGen/X86/call-imm.ll b/test/CodeGen/X86/call-imm.ll
index 3857fb1..38cda4d 100644
--- a/test/CodeGen/X86/call-imm.ll
+++ b/test/CodeGen/X86/call-imm.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | grep {call.*12345678}
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | not grep {call.*12345678}
-; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | grep {call.*12345678}
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | grep "call.*12345678"
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | not grep "call.*12345678"
+; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | grep "call.*12345678"
; Call to immediate is not safe on x86-64 unless we *know* that the
; call will be within 32-bits pcrel from the dest immediate.
-; RUN: llc < %s -march=x86-64 | grep {call.*\\*%rax}
+; RUN: llc < %s -march=x86-64 | grep "call.*\*%rax"
; PR3666
; PR3773
diff --git a/test/CodeGen/X86/cfstring.ll b/test/CodeGen/X86/cfstring.ll
index 7420ce7..8cdd59e 100644
--- a/test/CodeGen/X86/cfstring.ll
+++ b/test/CodeGen/X86/cfstring.ll
@@ -4,7 +4,7 @@
%0 = type opaque
%struct.NSConstantString = type { i32*, i32, i8*, i32 }
-; Make sure that the string ends up the the correct section.
+; Make sure that the string ends up the correct section.
; CHECK: .section __TEXT,__cstring
; CHECK-NEXT: l_.str3:
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
new file mode 100644
index 0000000..780746a
--- /dev/null
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s
+
+; cmp with single-use load, should not form cmov.
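+; (A predictable branch keeps the load off the critical path, whereas a
+; cmov would make the load part of the dependence chain; this is presumably
+; why the select is turned into a branch here.)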
+define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
+ %load = load double* %b, align 8
+ %cmp = fcmp olt double %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test1:
+; CHECK: ucomisd
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Sanity check: no load.
+define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
+ %cmp = fcmp ogt double %a, %b
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test2:
+; CHECK: ucomisd
+; CHECK: cmov
+}
+
+; Multiple uses of %a, should not form cmov.
+define i32 @test3(i32 %a, i32* nocapture %b, i32 %x) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %a, i32 %x
+ ret i32 %cond
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Multiple uses of the load.
+define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ %add = add i32 %cond, %load
+ ret i32 %add
+; CHECK: test4:
+; CHECK: cmpl
+; CHECK: cmov
+}
+
+; Multiple uses of the cmp.
+define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cmp1 = icmp ugt i32 %load, %a
+ %cond = select i1 %cmp1, i32 %a, i32 %y
+ %cond5 = select i1 %cmp, i32 %cond, i32 %x
+ ret i32 %cond5
+; CHECK: test5:
+; CHECK: cmpl
+; CHECK: cmov
+; CHECK: cmov
+}
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 2e7ffbf..ed25c82 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
entry:
; CHECK: test1:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
+; CHECK: movl $12, %eax
+; CHECK-NEXT: btl
; CHECK-NEXT: cmovael (%rcx), %eax
; CHECK-NEXT: ret
@@ -19,8 +19,8 @@ entry:
define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
entry:
; CHECK: test2:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
+; CHECK: movl $12, %eax
+; CHECK-NEXT: btl
; CHECK-NEXT: cmovbl (%rcx), %eax
; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index ef5e353..eb06327 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -90,3 +90,64 @@ F:
; CHECK: encoding: [0x48,0x83,0x7c,0x24,0xf8,0x00]
}
+; rdar://11866926
+define i32 @test7(i64 %res) nounwind {
+entry:
+; CHECK: test7:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: sete
+ %lnot = icmp ult i64 %res, 4294967296
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
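+; (res < 2^32 is equivalent to the upper 32 bits of res being zero, so a
+; shift-and-test avoids materializing the 64-bit immediate with movabsq.)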
+
+define i32 @test8(i64 %res) nounwind {
+entry:
+; CHECK: test8:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: cmpq $3, %rdi
+ %lnot = icmp ult i64 %res, 12884901888
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
+
+define i32 @test9(i64 %res) nounwind {
+entry:
+; CHECK: test9:
+; CHECK-NOT: movabsq
+; CHECK: shrq $33, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: sete
+ %lnot = icmp ult i64 %res, 8589934592
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
+
+define i32 @test10(i64 %res) nounwind {
+entry:
+; CHECK: test10:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: setne
+ %lnot = icmp uge i64 %res, 4294967296
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
+
+; rdar://9758774
+define i32 @test11(i64 %l) nounwind {
+entry:
+; CHECK: test11:
+; CHECK-NOT: movabsq
+; CHECK-NOT: andq
+; CHECK: shrq $47, %rdi
+; CHECK: cmpq $1, %rdi
+ %shr.mask = and i64 %l, -140737488355328
+ %cmp = icmp eq i64 %shr.mask, 140737488355328
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/X86/coalesce-esp.ll b/test/CodeGen/X86/coalesce-esp.ll
index a584876..4004379 100644
--- a/test/CodeGen/X86/coalesce-esp.ll
+++ b/test/CodeGen/X86/coalesce-esp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl %esp, %ebp}
+; RUN: llc < %s | grep "movl %esp, %ebp"
; PR4572
; Don't coalesce with %esp if it would end up putting %esp in
diff --git a/test/CodeGen/X86/coalescer-commute2.ll b/test/CodeGen/X86/coalescer-commute2.ll
index 6e5c1cf..e45437c 100644
--- a/test/CodeGen/X86/coalescer-commute2.ll
+++ b/test/CodeGen/X86/coalescer-commute2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -join-physregs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=nehalem | FileCheck %s
; CHECK-NOT: mov
; CHECK: paddw
; CHECK-NOT: mov
@@ -26,14 +26,3 @@ entry:
%tmp10 = bitcast <8 x i16> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp10
}
-
-
-; The coalescer should commute the add to avoid a copy.
-define <4 x float> @test3(<4 x float> %V) {
-entry:
- %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
- <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
- %add = fadd <4 x float> %tmp8, %V
- ret <4 x float> %add
-}
-
diff --git a/test/CodeGen/X86/coalescer-dce2.ll b/test/CodeGen/X86/coalescer-dce2.ll
new file mode 100644
index 0000000..bbbf09b
--- /dev/null
+++ b/test/CodeGen/X86/coalescer-dce2.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -verify-coalescing
+; PR12911
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+@b = common global i32 0, align 4
+@h = common global i32 0, align 4
+@f = common global i32 0, align 4
+@g = common global i32 0, align 4
+@a = common global i16 0, align 2
+@e = common global i32 0, align 4
+
+define void @fn1() nounwind uwtable ssp {
+entry:
+ %0 = load i32* @d, align 4
+ %tobool72 = icmp eq i32 %0, 0
+ br i1 %tobool72, label %for.end32, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %1 = load i32* @c, align 4
+ %tobool2 = icmp eq i32 %1, 0
+ %2 = load i32* @b, align 4
+ %cmp = icmp sgt i32 %2, 0
+ %conv = zext i1 %cmp to i32
+ %3 = load i32* @g, align 4
+ %tobool4 = icmp eq i32 %3, 0
+ %4 = load i16* @a, align 2
+ %tobool9 = icmp eq i16 %4, 0
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond25.loopexit.us-lcssa.us-lcssa, %if.end.us50, %if.end.us, %if.end.us.us, %for.cond1.preheader.lr.ph
+ %j.073 = phi i32 [ undef, %for.cond1.preheader.lr.ph ], [ %j.1.us.us, %if.end.us.us ], [ %j.1.us, %if.end.us ], [ %j.073, %for.cond25.loopexit.us-lcssa.us-lcssa ], [ %j.1.us36, %if.end.us50 ]
+ br i1 %tobool2, label %for.cond1.preheader.split.us, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+
+for.cond1.preheader.for.cond1.preheader.split_crit_edge: ; preds = %for.cond1.preheader
+ br i1 %tobool9, label %if.end.us50, label %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+
+for.cond1.preheader.split.us: ; preds = %for.cond1.preheader
+ br i1 %tobool9, label %cond.end.us.us, label %cond.end.us
+
+cond.false18.us.us: ; preds = %if.end.us.us
+ %5 = load i32* @f, align 4
+ %sext76 = shl i32 %5, 16
+ %phitmp75 = ashr exact i32 %sext76, 16
+ br label %cond.end.us.us
+
+if.end.us.us: ; preds = %cond.end.us.us, %if.then.us.us
+ br i1 %tobool4, label %cond.false18.us.us, label %for.cond1.preheader
+
+if.then.us.us: ; preds = %cond.end.us.us
+ store i32 0, i32* @f, align 4
+ br label %if.end.us.us
+
+cond.end.us.us: ; preds = %cond.false18.us.us, %for.cond1.preheader.split.us
+ %j.1.us.us = phi i32 [ %j.073, %for.cond1.preheader.split.us ], [ %phitmp75, %cond.false18.us.us ]
+ store i32 %conv, i32* @h, align 4
+ br i1 %cmp, label %if.then.us.us, label %if.end.us.us
+
+cond.end21.us: ; preds = %land.lhs.true12.us, %cond.false18.us
+ %cond22.us = phi i16 [ %add.us, %cond.false18.us ], [ %4, %land.lhs.true12.us ]
+ %conv24.us = sext i16 %cond22.us to i32
+ br label %cond.end.us
+
+cond.false18.us: ; preds = %if.end6.us, %land.lhs.true12.us
+ %add.us = add i16 %4, %conv7.us
+ br label %cond.end21.us
+
+land.lhs.true12.us: ; preds = %if.end6.us
+ %conv10.us = sext i16 %conv7.us to i32
+ %sub.us = sub nsw i32 0, %conv10.us
+ %cmp14.us = icmp slt i32 %sub.us, 1
+ br i1 %cmp14.us, label %cond.end21.us, label %cond.false18.us
+
+if.end6.us: ; preds = %if.end.us
+ %6 = load i32* @f, align 4
+ %conv7.us = trunc i32 %6 to i16
+ %tobool11.us = icmp eq i16 %conv7.us, 0
+ br i1 %tobool11.us, label %cond.false18.us, label %land.lhs.true12.us
+
+if.end.us: ; preds = %cond.end.us, %if.then.us
+ br i1 %tobool4, label %if.end6.us, label %for.cond1.preheader
+
+if.then.us: ; preds = %cond.end.us
+ store i32 0, i32* @f, align 4
+ br label %if.end.us
+
+cond.end.us: ; preds = %cond.end21.us, %for.cond1.preheader.split.us
+ %j.1.us = phi i32 [ %conv24.us, %cond.end21.us ], [ %j.073, %for.cond1.preheader.split.us ]
+ store i32 %conv, i32* @h, align 4
+ br i1 %cmp, label %if.then.us, label %if.end.us
+
+for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge: ; preds = %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+ br i1 %tobool4, label %if.end6.us65, label %for.cond25.loopexit.us-lcssa.us-lcssa
+
+cond.false18.us40: ; preds = %if.end.us50
+ %7 = load i32* @f, align 4
+ %sext = shl i32 %7, 16
+ %phitmp = ashr exact i32 %sext, 16
+ br label %if.end.us50
+
+if.end.us50: ; preds = %cond.false18.us40, %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+ %j.1.us36 = phi i32 [ %j.073, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ], [ %phitmp, %cond.false18.us40 ]
+ store i32 0, i32* @h, align 4
+ br i1 %tobool4, label %cond.false18.us40, label %for.cond1.preheader
+
+if.end6.us65: ; preds = %if.end6.us65, %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+ store i32 0, i32* @h, align 4
+ br label %if.end6.us65
+
+for.cond25.loopexit.us-lcssa.us-lcssa: ; preds = %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+ store i32 0, i32* @h, align 4
+ br label %for.cond1.preheader
+
+for.end32: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/X86/coalescer-identity.ll b/test/CodeGen/X86/coalescer-identity.ll
new file mode 100644
index 0000000..9c72ee6
--- /dev/null
+++ b/test/CodeGen/X86/coalescer-identity.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -verify-coalescing
+; PR12927
+target triple = "x86_64-apple-macosx10.8.0"
+
+; This is a case where removeCopyByCommutingDef() creates an identity copy that
+; joinCopy must then deal with correctly.
+
+@s = common global i16 0, align 2
+@g1 = common global i32 0, align 4
+@g2 = common global i32 0, align 4
+@g0 = common global i32 0, align 4
+
+define void @func() nounwind uwtable ssp {
+for.body.lr.ph:
+ %0 = load i32* @g2, align 4, !tbaa !0
+ %tobool6 = icmp eq i32 %0, 0
+ %s.promoted = load i16* @s, align 2
+ %.pre = load i32* @g1, align 4, !tbaa !0
+ br i1 %tobool6, label %for.body.us, label %for.body
+
+for.body.us: ; preds = %for.body.lr.ph, %for.inc.us
+ %1 = phi i32 [ %3, %for.inc.us ], [ %.pre, %for.body.lr.ph ]
+ %dec13.us = phi i16 [ %dec12.us, %for.inc.us ], [ %s.promoted, %for.body.lr.ph ]
+ %i.011.us = phi i32 [ %inc.us, %for.inc.us ], [ undef, %for.body.lr.ph ]
+ %v.010.us = phi i32 [ %phitmp.us, %for.inc.us ], [ 1, %for.body.lr.ph ]
+ %tobool1.us = icmp ne i32 %v.010.us, 0
+ %2 = zext i1 %tobool1.us to i16
+ %lnot.ext.us = xor i16 %2, 1
+ %add.us = add i16 %dec13.us, %lnot.ext.us
+ %conv3.us = zext i16 %add.us to i32
+ %add4.us = sub i32 0, %1
+ %tobool5.us = icmp eq i32 %conv3.us, %add4.us
+ br i1 %tobool5.us, label %for.inc.us, label %if.then7.us
+
+for.inc.us: ; preds = %cond.end.us, %for.body.us
+ %3 = phi i32 [ %1, %for.body.us ], [ %4, %cond.end.us ]
+ %dec12.us = phi i16 [ %add.us, %for.body.us ], [ %dec.us, %cond.end.us ]
+ %inc.us = add i32 %i.011.us, 1
+ %phitmp.us = udiv i32 %v.010.us, 12
+ %tobool.us = icmp eq i32 %inc.us, 0
+ br i1 %tobool.us, label %for.end, label %for.body.us
+
+cond.end.us: ; preds = %if.then7.us, %cond.false.us
+ %4 = phi i32 [ 0, %cond.false.us ], [ %1, %if.then7.us ]
+ %cond.us = phi i32 [ 0, %cond.false.us ], [ %v.010.us, %if.then7.us ]
+ store i32 %cond.us, i32* @g0, align 4, !tbaa !0
+ br label %for.inc.us
+
+cond.false.us: ; preds = %if.then7.us
+ store i32 0, i32* @g1, align 4, !tbaa !0
+ br label %cond.end.us
+
+if.then7.us: ; preds = %for.body.us
+ %dec.us = add i16 %add.us, -1
+ br i1 %tobool1.us, label %cond.end.us, label %cond.false.us
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %dec13 = phi i16 [ %dec12, %for.body ], [ %s.promoted, %for.body.lr.ph ]
+ %i.011 = phi i32 [ %inc, %for.body ], [ undef, %for.body.lr.ph ]
+ %v.010 = phi i32 [ %phitmp, %for.body ], [ 1, %for.body.lr.ph ]
+ %tobool1 = icmp eq i32 %v.010, 0
+ %lnot.ext = zext i1 %tobool1 to i16
+ %add = add i16 %dec13, %lnot.ext
+ %conv3 = zext i16 %add to i32
+ %add4 = sub i32 0, %.pre
+ %not.tobool5 = icmp ne i32 %conv3, %add4
+ %dec = sext i1 %not.tobool5 to i16
+ %dec12 = add i16 %add, %dec
+ %inc = add i32 %i.011, 1
+ %phitmp = udiv i32 %v.010, 12
+ %tobool = icmp eq i32 %inc, 0
+ br i1 %tobool, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc.us, %for.body
+ %dec12.lcssa = phi i16 [ %dec12.us, %for.inc.us ], [ %dec12, %for.body ]
+ store i16 %dec12.lcssa, i16* @s, align 2
+ ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll
index f979945..26318dd 100644
--- a/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/test/CodeGen/X86/constant-pool-sharing.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
; llc should share constant pool entries between this integer vector
; and this floating-point vector since they have the same encoding.
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
new file mode 100644
index 0000000..b578896
--- /dev/null
+++ b/test/CodeGen/X86/constructor.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s
+; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck --check-prefix=INIT-ARRAY %s
+@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }]
+
+define void @f() {
+entry:
+ ret void
+}
+
+define void @g() {
+entry:
+ ret void
+}
+
+; CTOR: .section .ctors.65520,"aw",@progbits
+; CTOR-NEXT: .align 8
+; CTOR-NEXT: .quad g
+; CTOR-NEXT: .section .ctors,"aw",@progbits
+; CTOR-NEXT: .align 8
+; CTOR-NEXT: .quad f
+
+; INIT-ARRAY: .section .init_array.15,"aw",@init_array
+; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .quad g
+; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array
+; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .quad f
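+; (The legacy .ctors list is walked backwards at runtime, so priority N is
+; encoded as section suffix 65535-N, 65520 for priority 15 above, while
+; .init_array keeps the priority value as-is.)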
diff --git a/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll b/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
index b82348b..064ee36 100644
--- a/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
+++ b/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
-; RUN: llc < %s -mtriple=x86_64-win32 -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -mtriple=x86_64-linux -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -mtriple=x86_64-win32 -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
; STATS: 9 asm-printer
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index cf6e27d..9badfc8 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86 %s -o -
-; RUN: llc -march=x86-64 %s -o -
+; RUN: llc -march=x86 < %s -verify-machineinstrs
+; RUN: llc -march=x86-64 < %s -verify-machineinstrs
; PR6497
@@ -391,3 +391,54 @@ if.end:
%t11 = tail call i64 asm sideeffect "foo", "=*m,=A,{bx},{cx},1,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %t6, i32 0, i32 0, i64 0) nounwind
ret void
}
+
+; Avoid emitting wrong kill flags from InstrEmitter.
+; InstrEmitter::EmitSubregNode() may steal virtual registers from already
+; emitted blocks when isCoalescableExtInstr points out the opportunity.
+; Make sure kill flags are cleared on the virtual register that has just become global.
+define i64 @ov_read(i8* %vf, i8* nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, i32* %bitstream) nounwind uwtable ssp {
+entry:
+ br i1 undef, label %return, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br i1 undef, label %if.then3, label %if.end7
+
+if.then3: ; preds = %while.body.preheader
+ %0 = load i32* undef, align 4
+ br i1 undef, label %land.lhs.true.i255, label %if.end7
+
+land.lhs.true.i255: ; preds = %if.then3
+ br i1 undef, label %if.then.i256, label %if.end7
+
+if.then.i256: ; preds = %land.lhs.true.i255
+ %sub.i = sub i32 0, %0
+ %conv = sext i32 %sub.i to i64
+ br i1 undef, label %if.end7, label %while.end
+
+if.end7: ; preds = %if.then.i256, %land.lhs.true.i255, %if.then3, %while.body.preheader
+ unreachable
+
+while.end: ; preds = %if.then.i256
+ %cmp18 = icmp sgt i32 %sub.i, 0
+ %.conv = select i1 %cmp18, i64 -131, i64 %conv
+ ret i64 %.conv
+
+return: ; preds = %entry
+ ret i64 -131
+}
+
+; The tail call to a varargs function sets %AL.
+; uitofp expands to an FCMOV instruction which splits the basic block.
+; Make sure the live range of %AL isn't split.
+@.str = private unnamed_addr constant { [1 x i8], [63 x i8] } zeroinitializer, align 32
+define void @pr13188(i64* nocapture %this) uwtable ssp address_safety align 2 {
+entry:
+ %x7 = load i64* %this, align 8
+ %sub = add i64 %x7, -1
+ %conv = uitofp i64 %sub to float
+ %div = fmul float %conv, 5.000000e-01
+ %conv2 = fpext float %div to double
+ tail call void (...)* @_Z6PrintFz(i8* getelementptr inbounds ({ [1 x i8], [63 x i8] }* @.str, i64 0, i32 0, i64 0), double %conv2)
+ ret void
+}
+declare void @_Z6PrintFz(...)
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 6406cc7..0a3dfca 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll
index c3c7990..af69531 100644
--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats |& grep asm-printer | grep 14
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 14
define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind {
entry:
diff --git a/test/CodeGen/X86/dbg-merge-loc-entry.ll b/test/CodeGen/X86/dbg-merge-loc-entry.ll
index c35935f..d1e349f 100644
--- a/test/CodeGen/X86/dbg-merge-loc-entry.ll
+++ b/test/CodeGen/X86/dbg-merge-loc-entry.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-apple-darwin8"
;CHECK-NEXT: .short Lset
;CHECK-NEXT: Ltmp
;CHECK-NEXT: .byte 85 ## DW_OP_reg5
-;CHECK-NEXT: Ltmp5
+;CHECK-NEXT: Ltmp
;CHECK-NEXT: .quad 0
;CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/X86/dbg-value-range.ll b/test/CodeGen/X86/dbg-value-range.ll
index 28d873b..6b16865 100644
--- a/test/CodeGen/X86/dbg-value-range.ll
+++ b/test/CodeGen/X86/dbg-value-range.ll
@@ -1,5 +1,4 @@
; RUN: llc -mtriple=x86_64-apple-darwin10 < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-darwin10 -regalloc=basic -join-physregs < %s | FileCheck %s
%struct.a = type { i32 }
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index e577ecb..8e7c13d 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -71,3 +71,24 @@ define i32 @test7(i32 %x) nounwind {
; CHECK-NOT: shrl
; CHECK: ret
}
+
+; PR13326
+define i8 @test8(i8 %x) nounwind {
+ %div = udiv i8 %x, 78
+ ret i8 %div
+; CHECK: test8:
+; CHECK: shrb %
+; CHECK: imull $211
+; CHECK: shrl $13
+; CHECK: ret
+}
+
+define i8 @test9(i8 %x) nounwind {
+ %div = udiv i8 %x, 116
+ ret i8 %div
+; CHECK: test9:
+; CHECK: shrb $2
+; CHECK: imull $71
+; CHECK: shrl $11
+; CHECK: ret
+}
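+; (Both checks match the usual unsigned divide-by-constant lowering: shift
+; off low bits, multiply by a fixed-point approximation of the reciprocal,
+; and shift the product down, so no udiv/div instruction is emitted.)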
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
new file mode 100644
index 0000000..c5e47fa
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -0,0 +1,237 @@
+; RUN: llc < %s -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -force-align-stack -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
+; rdar://11496434
+
+; no VLAs or dynamic alignment
+define i32 @t1() nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ call void @t1_helper(i32* %a) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t1
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: leaq [[OFFSET:[0-9]*]](%rsp), %rdi
+; CHECK: callq _t1_helper
+; CHECK: movl [[OFFSET]](%rsp), %eax
+; CHECK: addl $13, %eax
+}
+
+declare void @t1_helper(i32*)
+
+; dynamic realignment
+define i32 @t2() nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %v = alloca <8 x float>, align 32
+ call void @t2_helper(i32* %a, <8 x float>* %v) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t2
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: leaq {{[0-9]*}}(%rsp), %rdi
+; CHECK: leaq {{[0-9]*}}(%rsp), %rsi
+; CHECK: callq _t2_helper
+;
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t2_helper(i32*, <8 x float>*)
+
+; VLAs
+define i32 @t3(i64 %sz) nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t3_helper(i32* %a, i32* %vla) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t3
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %rbx
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %rbp
+}
+
+declare void @t3_helper(i32*, i32*)
+
+; VLAs + Dynamic realignment
+define i32 @t4(i64 %sz) nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %v = alloca <8 x float>, align 32
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t4_helper(i32* %a, i32* %vla, <8 x float>* %v) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t4
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %r14
+; CHECK: pushq %rbx
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; CHECK: movq %rsp, %rbx
+;
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdi
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdx
+; CHECK: callq _t4_helper
+;
+; CHECK: leaq -16(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %r14
+; CHECK: popq %rbp
+}
+
+declare void @t4_helper(i32*, i32*, <8 x float>*)
+
+; Dynamic realignment + Spill
+define i32 @t5(float* nocapture %f) nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %0 = bitcast float* %f to <8 x float>*
+ %1 = load <8 x float>* %0, align 32
+ call void @t5_helper1(i32* %a) nounwind
+ call void @t5_helper2(<8 x float> %1) nounwind
+ %2 = load i32* %a, align 4
+ %add = add nsw i32 %2, 13
+ ret i32 %add
+
+; CHECK: _t5
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: vmovaps (%rdi), [[AVXREG:%ymm[0-9]+]]
+; CHECK: vmovaps [[AVXREG]], (%rsp)
+; CHECK: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK: callq _t5_helper1
+; CHECK: vmovaps (%rsp), %ymm0
+; CHECK: callq _t5_helper2
+; CHECK: movl {{[0-9]+}}(%rsp), %eax
+;
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t5_helper1(i32*)
+
+declare void @t5_helper2(<8 x float>)
+
+; VLAs + Dynamic realignment + Spill
+; FIXME: RA has already reserved RBX, so we can't do dynamic realignment.
+define i32 @t6(i64 %sz, float* nocapture %f) nounwind uwtable ssp {
+entry:
+; CHECK: _t6
+ %a = alloca i32, align 4
+ %0 = bitcast float* %f to <8 x float>*
+ %1 = load <8 x float>* %0, align 32
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t6_helper1(i32* %a, i32* %vla) nounwind
+ call void @t6_helper2(<8 x float> %1) nounwind
+ %2 = load i32* %a, align 4
+ %add = add nsw i32 %2, 13
+ ret i32 %add
+}
+
+declare void @t6_helper1(i32*, i32*)
+
+declare void @t6_helper2(<8 x float>)
+
+; VLAs + Dynamic realignment + byval
+; The byval adjusts the sp after the prolog, but if we're restoring the sp from
+; the base pointer we use the original adjustment.
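+; (Concretely: the byval copy grows the stack after the prologue, and since
+; the epilogue below restores %rsp from %rbp there is no matching addq,
+; which is what the CHECK-NOT verifies.)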
+%struct.struct_t = type { [5 x i32] }
+
+define void @t7(i32 %size, %struct.struct_t* byval align 8 %arg1) nounwind uwtable {
+entry:
+ %x = alloca i32, align 32
+ store i32 0, i32* %x, align 32
+ %0 = zext i32 %size to i64
+ %vla = alloca i32, i64 %0, align 16
+ %1 = load i32* %x, align 32
+ call void @bar(i32 %1, i32* %vla, %struct.struct_t* byval align 8 %arg1)
+ ret void
+
+; CHECK: _t7
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %rbx
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; CHECK: movq %rsp, %rbx
+
+; Stack adjustment for byval
+; CHECK: subq {{.*}}, %rsp
+; CHECK: callq _bar
+; CHECK-NOT: addq {{.*}}, %rsp
+; CHECK: leaq -8(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %rbp
+}
+
+declare i8* @llvm.stacksave() nounwind
+
+declare void @bar(i32, i32*, %struct.struct_t* byval align 8)
+
+declare void @llvm.stackrestore(i8*) nounwind
+
+
+; Test when forcing stack alignment
+define i32 @t8() nounwind uwtable {
+entry:
+ %a = alloca i32, align 4
+ call void @t1_helper(i32* %a) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; FORCE-ALIGN: _t8
+; FORCE-ALIGN: movq %rsp, %rbp
+; FORCE-ALIGN: andq $-32, %rsp
+; FORCE-ALIGN-NEXT: subq $32, %rsp
+; FORCE-ALIGN: movq %rbp, %rsp
+; FORCE-ALIGN: popq %rbp
+}
+
+; VLAs
+define i32 @t9(i64 %sz) nounwind uwtable {
+entry:
+ %a = alloca i32, align 4
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t3_helper(i32* %a, i32* %vla) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; FORCE-ALIGN: _t9
+; FORCE-ALIGN: pushq %rbp
+; FORCE-ALIGN: movq %rsp, %rbp
+; FORCE-ALIGN: pushq %rbx
+; FORCE-ALIGN: andq $-32, %rsp
+; FORCE-ALIGN: subq $32, %rsp
+; FORCE-ALIGN: movq %rsp, %rbx
+
+; FORCE-ALIGN: leaq -8(%rbp), %rsp
+; FORCE-ALIGN: popq %rbx
+; FORCE-ALIGN: popq %rbp
+}
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
new file mode 100644
index 0000000..7883ffa
--- /dev/null
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -enable-early-ifcvt -stress-early-ifcvt | FileCheck %s
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: mm2
+define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ br label %do.body
+
+; CHECK: do.body
+; Loop body has no branches before the backedge.
+; CHECK-NOT: LBB
+do.body:
+ %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ]
+ %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ]
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ]
+ %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ]
+ %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1
+ %0 = load i32* %p.addr.0, align 4
+ %cmp = icmp sgt i32 %0, %max.0
+ br i1 %cmp, label %do.cond, label %if.else
+
+if.else:
+ %cmp1 = icmp slt i32 %0, %min.0
+ %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0
+ br label %do.cond
+
+do.cond:
+ %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ]
+ %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ]
+; CHECK: decl %esi
+; CHECK: jne LBB
+ %dec = add i32 %n.addr.0, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %do.end, label %do.body
+
+do.end:
+ %sub = sub nsw i32 %max.1, %min.1
+ ret i32 %sub
+}
+
+; CHECK: multipreds
+; Deal with alternative tail predecessors
+; CHECK-NOT: LBB
+; CHECK: cmov
+; CHECK-NOT: LBB
+; CHECK: cmov
+; CHECK-NOT: LBB
+; CHECK: fprintf
+
+define void @multipreds(i32 %sw) nounwind uwtable ssp {
+entry:
+ switch i32 %sw, label %if.then29 [
+ i32 0, label %if.then37
+ i32 127, label %if.end41
+ ]
+
+if.then29:
+ br label %if.end41
+
+if.then37:
+ br label %if.end41
+
+if.end41:
+ %exit_status.0 = phi i32 [ 2, %if.then29 ], [ 0, %if.then37 ], [ 66, %entry ]
+ call void (...)* @fprintf(i32 %exit_status.0) nounwind
+ unreachable
+}
+
+declare void @fprintf(...) nounwind
diff --git a/test/CodeGen/X86/epilogue.ll b/test/CodeGen/X86/epilogue.ll
index 0f16a64..090680e 100644
--- a/test/CodeGen/X86/epilogue.ll
+++ b/test/CodeGen/X86/epilogue.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | not grep lea
-; RUN: llc < %s -mcpu=generic -march=x86 | grep {movl %ebp}
+; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+
+; CHECK-NOT: lea{{.*}}(%esp)
+; CHECK: {{(mov.* %ebp, %esp)|(lea.*\(%ebp\), %esp)}}
declare void @bar(<2 x i64>* %n)
diff --git a/test/CodeGen/X86/extractps.ll b/test/CodeGen/X86/extractps.ll
index 14778f0..9e1a375 100644
--- a/test/CodeGen/X86/extractps.ll
+++ b/test/CodeGen/X86/extractps.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -mcpu=penryn > %t
; RUN: not grep movd %t
-; RUN: grep {movss %xmm} %t | count 1
-; RUN: grep {extractps \\\$1, %xmm0, } %t | count 1
+; RUN: grep "movss %xmm" %t | count 1
+; RUN: grep "extractps \$1, %xmm0, " %t | count 1
; PR2647
external global float, align 16 ; <float*>:0 [#uses=2]
diff --git a/test/CodeGen/X86/fabs.ll b/test/CodeGen/X86/fabs.ll
index 9ded7e0..af1867f 100644
--- a/test/CodeGen/X86/fabs.ll
+++ b/test/CodeGen/X86/fabs.ll
@@ -1,28 +1,54 @@
; Make sure this testcase codegens to the fabs instruction, not a call to fabsf
-; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3,-sse | grep fabs\$ | \
-; RUN: count 2
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | \
-; RUN: grep fabs\$ | count 3
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse2,-sse3,-sse | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=UNSAFE
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -O0 | FileCheck %s --check-prefix=NOOPT
declare float @fabsf(float)
declare x86_fp80 @fabsl(x86_fp80)
+; CHECK: test1:
+; UNSAFE: test1:
+; NOOPT: test1:
define float @test1(float %X) {
- %Y = call float @fabsf(float %X)
+ %Y = call float @fabsf(float %X) readnone
ret float %Y
}
+; CHECK: {{^[ \t]+fabs$}}
+; UNSAFE: {{^[ \t]+fabs$}}
+; CHECK-NOT: fabs
+; UNSAFE-NOT: fabs
+; NOOPT-NOT: fabsf
+
+; CHECK: test2:
+; UNSAFE: test2:
+; NOOPT: test2:
define double @test2(double %X) {
%Y = fcmp oge double %X, -0.0
%Z = fsub double -0.0, %X
%Q = select i1 %Y, double %X, double %Z
ret double %Q
}
+; fabs is not used here.
+; CHECK-NOT: fabs
+; NOOPT-NOT: fabs
+
+; UNSAFE: {{^[ \t]+fabs$}}
+; UNSAFE-NOT: fabs
+
+; CHECK: test3:
+; UNSAFE: test3:
+; NOOPT: test3:
define x86_fp80 @test3(x86_fp80 %X) {
- %Y = call x86_fp80 @fabsl(x86_fp80 %X)
+ %Y = call x86_fp80 @fabsl(x86_fp80 %X) readnone
ret x86_fp80 %Y
}
+; CHECK: {{^[ \t]+fabs$}}
+; UNSAFE: {{^[ \t]+fabs$}}
+; NOOPT: {{^[ \t]+fabs$}}
-
+; CHECK-NOT: fabs
+; UNSAFE-NOT: fabs
+; NOOPT-NOT: fabs
diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
index e4982f0..14cb136 100644
--- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mcpu=generic -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {add ESP, 8}
+; RUN: grep "add ESP, 8"
target triple = "i686-pc-linux-gnu"
diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll
index 323c853..b3adb80 100644
--- a/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/test/CodeGen/X86/fast-isel-constpool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel | grep {LCPI0_0(%rip)}
+; RUN: llc < %s -fast-isel | grep "LCPI0_0(%rip)"
; Make sure fast isel uses rip-relative addressing when required.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin9.0"
diff --git a/test/CodeGen/X86/fast-isel-gv.ll b/test/CodeGen/X86/fast-isel-gv.ll
index 34f8b38..cb2464e 100644
--- a/test/CodeGen/X86/fast-isel-gv.ll
+++ b/test/CodeGen/X86/fast-isel-gv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel | grep {_kill@GOTPCREL(%rip)}
+; RUN: llc < %s -fast-isel | grep "_kill@GOTPCREL(%rip)"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin10.0"
@f = global i8 (...)* @kill ; <i8 (...)**> [#uses=1]
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index 8db1936..52b1e85 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin -mcpu=generic | FileCheck %s
+; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin -mcpu=atom | FileCheck -check-prefix=ATOM %s
@src = external global i32
@@ -18,6 +19,13 @@ entry:
; CHECK: movl %eax, (%ecx)
; CHECK: ret
+; ATOM: loadgv:
+; ATOM: movl L_src$non_lazy_ptr, %ecx
+; ATOM: movl (%ecx), %eax
+; ATOM: addl (%ecx), %eax
+; ATOM: movl %eax, (%ecx)
+; ATOM: ret
+
}
%stuff = type { i32 (...)** }
@@ -31,4 +39,8 @@ entry:
; CHECK: movl $0, %eax
; CHECK: movl L_LotsStuff$non_lazy_ptr, %ecx
+; ATOM: _t:
+; ATOM: movl L_LotsStuff$non_lazy_ptr, %ecx
+; ATOM: movl $0, %eax
+
}
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index b9598bb..19f3888 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -46,3 +46,17 @@ entry:
; CHECK: addl $40
}
declare void @test3sret(%struct.a* sret)
+
+; Check that fast-isel sret works with fastcc (and does not callee-pop)
+define void @test4() nounwind ssp {
+entry:
+ %tmp = alloca %struct.a, align 8
+ call fastcc void @test4fastccsret(%struct.a* sret %tmp)
+ ret void
+; CHECK: test4:
+; CHECK: subl $28
+; CHECK: leal (%esp), %ecx
+; CHECK: calll _test4fastccsret
+; CHECK: addl $28
+}
+declare fastcc void @test4fastccsret(%struct.a* sret)
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index c88d529..132df2b 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86 -mattr=sse2
-; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -march=x86 -mattr=sse2
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=x86_64-apple-darwin10
; This tests very minimal fast-isel functionality.
@@ -117,3 +117,11 @@ define i64* @life() nounwind {
ret i64* %a3
}
+declare void @llvm.donothing() readnone
+
+; CHECK: donada
+define void @donada() nounwind {
+entry:
+ call void @llvm.donothing()
+ ret void
+}
diff --git a/test/CodeGen/X86/fastcc-byval.ll b/test/CodeGen/X86/fastcc-byval.ll
index 52b3e57..f1204d6 100644
--- a/test/CodeGen/X86/fastcc-byval.ll
+++ b/test/CodeGen/X86/fastcc-byval.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt=false | grep {movl\[\[:space:\]\]*8(%esp), %eax} | count 2
+; RUN: llc < %s -tailcallopt=false | grep "movl[[:space:]]*8(%esp), %eax" | count 2
; PR3122
; rdar://6400815
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 5deedb9..b0c1d0a 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
-; CHECK: _fmaf
+; CHECK-FMA-INST: vfmadd213ss
+; CHECK-FMA-CALL: _fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -11,7 +14,8 @@ entry:
}
; CHECK: test_f64
-; CHECK: _fma
+; CHECK-FMA-INST: vfmadd213sd
+; CHECK-FMA-CALL: _fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
new file mode 100755
index 0000000..90529e0
--- /dev/null
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+
+define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmadd213ss %xmm
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
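+; (In the 213 form the digits name the operand roles: dst = src2 * src1 +
+; src3; the 132 and 231 forms permute which operands are multiplied and
+; which is added.)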
+
+define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmadd213ps
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ ; CHECK: fmadd213ps {{.*\(%r.*}}, %ymm
+ %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmadd213ss %xmm
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmadd213ps
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ ; CHECK: fnmadd213ps {{.*\(%r.*}}, %ymm
+ %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmsub213ss
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmsub213ps
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmsub213ss
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmsub213ps
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+;;;;
+
+define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmadd213sd
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmadd213pd
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmadd213sd
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmadd213pd
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+
+
+define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmsub213sd
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmsub213pd
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmsub213sd
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmsub213pd
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index 5ed03ef..fd414b3 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,295 +1,295 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
; VFMADD
-define < 4 x float > @test_x86_fma4_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmaddss
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
; CHECK: vfmaddss (%{{.*}})
%x = load float *%a2
%y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
; CHECK: vfmaddss %{{.*}}, (%{{.*}})
%x = load float *%a1
%y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmaddsd
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
; CHECK: vfmaddsd (%{{.*}})
%x = load double *%a2
%y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
%x = load double *%a1
%y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmaddps
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
; CHECK: vfmaddps (%{{.*}})
%x = load <4 x float>* %a2
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
; CHECK: vfmaddps %{{.*}}, (%{{.*}})
%x = load <4 x float>* %a1
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmaddpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
; CHECK: vfmaddpd (%{{.*}})
%x = load <2 x double>* %a2
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
%x = load <2 x double>* %a1
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmaddps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmaddpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUB
-define < 4 x float > @test_x86_fma4_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmsubss
- %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmsubsd
- %res = call < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmsubps
- %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmsubpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmsubps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmsubpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMADD
-define < 4 x float > @test_x86_fma4_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmaddss
- %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmaddsd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmaddps
- %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmaddpd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfnmaddps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfnmaddpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMSUB
-define < 4 x float > @test_x86_fma4_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmsubss
- %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmsubsd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmsubps
- %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmsubpd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfnmsubps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfnmsubpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMADDSUB
-define < 4 x float > @test_x86_fma4_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmaddsubps
- %res = call < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmaddsubpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmaddsubps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmaddsubpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUBADD
-define < 4 x float > @test_x86_fma4_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmsubaddps
- %res = call < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmsubaddpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmsubaddps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmsubaddpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
new file mode 100644
index 0000000..5d97a87
--- /dev/null
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+
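+; With -fp-contract=fast the backend is allowed to fuse a separate fmul/fadd
+; (or fmul/fsub) pair into a single FMA node; each test below checks one such
+; fused pattern. In the FMA3 mnemonics the 132/213/231 suffix encodes the
+; operand order, e.g. fmadd213 computes op2 * op1 + op3, written back to op1.
+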
+; CHECK: test_x86_fmadd_ps
+; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fadd <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps
+; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps
+; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %a2, %x
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps
+; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
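+; (The fsub from the <-0.0, ...> constant above is the canonical IR negation
+; of %x, so the function computes -(a0*a1) - a2, which is exactly an fnmsub.)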
+
+; CHECK: test_x86_fmadd_ps_y
+; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fadd <8 x float> %x, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_y
+; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fsub <8 x float> %x, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps_y
+; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fsub <8 x float> %a2, %x
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps_y
+; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %y = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+ %res = fsub <8 x float> %y, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmadd_pd_y
+; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+ %x = fmul <4 x double> %a0, %a1
+ %res = fadd <4 x double> %x, %a2
+ ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd_y
+; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+ %x = fmul <4 x double> %a0, %a1
+ %res = fsub <4 x double> %x, %a2
+ ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd
+; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ %x = fmul <2 x double> %a0, %a1
+ %res = fsub <2 x double> %x, %a2
+ ret <2 x double> %res
+}
+
+; CHECK: test_x86_fnmadd_ss
+; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
+ %x = fmul float %a0, %a1
+ %res = fsub float %a2, %x
+ ret float %res
+}
+
+; CHECK: test_x86_fnmadd_sd
+; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
+ %x = fmul double %a0, %a1
+ %res = fsub double %a2, %x
+ ret double %res
+}
+
+; CHECK: test_x86_fmsub_sd
+; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
+ %x = fmul double %a0, %a1
+ %res = fsub double %x, %a2
+ ret double %res
+}
+
+; CHECK: test_x86_fnmsub_ss
+; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
+ %x = fsub float -0.000000e+00, %a0
+ %y = fmul float %x, %a1
+ %res = fsub float %y, %a2
+ ret float %res
+}
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index e03cb7e..c961f75 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -45,3 +45,29 @@ L:
}
+; rdar://10554090
+; The xor in the exit block will be CSE'd, and the load will be folded into the xor in the entry block.
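+; Both the entry xor and %shr.i.i19 compute %0 ^ %1, so after CSE a single
+; xor remains and the load of *%Q is folded into it as a memory operand
+; (the "xorl (%eax)," checked below).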
+define i1 @test3(i32* %P, i32* %Q) nounwind {
+; CHECK: test3:
+; CHECK: movl 8(%esp), %eax
+; CHECK: xorl (%eax),
+; CHECK: j
+; CHECK-NOT: xor
+entry:
+ %0 = load i32* %P, align 4
+ %1 = load i32* %Q, align 4
+ %2 = xor i32 %0, %1
+ %3 = and i32 %2, 65535
+ %4 = icmp eq i32 %3, 0
+ br i1 %4, label %exit, label %land.end
+
+exit:
+ %shr.i.i19 = xor i32 %1, %0
+ %5 = and i32 %shr.i.i19, 2147418112
+ %6 = icmp eq i32 %5, 0
+ br label %land.end
+
+land.end:
+ %7 = phi i1 [ %6, %exit ], [ false, %entry ]
+ ret i1 %7
+}
diff --git a/test/CodeGen/X86/fold-pcmpeqd-1.ll b/test/CodeGen/X86/fold-pcmpeqd-1.ll
index cc4198d..d850630 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-1.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-1.ll
@@ -1,11 +1,16 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep pcmpeqd %t | count 1
-; RUN: grep xor %t | count 1
-; RUN: not grep LCP %t
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
define <2 x double> @foo() nounwind {
ret <2 x double> bitcast (<2 x i64><i64 -1, i64 -1> to <2 x double>)
+; CHECK: foo:
+; CHECK: pcmpeqd %xmm0, %xmm0
+; CHECK-NOT: %xmm
+; CHECK: ret
}
define <2 x double> @bar() nounwind {
ret <2 x double> bitcast (<2 x i64><i64 0, i64 0> to <2 x double>)
+; CHECK: bar:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NOT: %xmm
+; CHECK: ret
}
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
new file mode 100644
index 0000000..2ada194
--- /dev/null
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -0,0 +1,70 @@
+; This test attempts to detect when we request forced re-alignment of the
+; stack to an alignment greater than the ABI would otherwise provide. We
+; arbitrarily force alignment up to 32 bytes for i386, hoping that this will
+; exceed any ABI provisions.
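+; (For reference: the traditional i386 SysV ABI guarantees only 4-byte stack
+; alignment at function entry, well below the 32 bytes requested here.)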
+;
+; RUN: llc < %s -mcpu=generic -force-align-stack -stack-alignment=32 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define i32 @f(i8* %p) nounwind {
+entry:
+ %0 = load i8* %p
+ %conv = sext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i64 @g(i32 %i) nounwind {
+; CHECK: g:
+; CHECK: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl
+; CHECK-NEXT: pushl
+; CHECK-NEXT: andl $-32, %esp
+; CHECK-NEXT: subl $32, %esp
+;
+; Now setup the base pointer (%esi).
+; CHECK-NEXT: movl %esp, %esi
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; The next adjustment of the stack is due to the alloca.
+; CHECK: movl %{{...}}, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; Next we set up the call to memset, and then undo the stack adjustment.
+; CHECK: subl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+; CHECK: calll memset
+; CHECK-NEXT: addl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; Next we set up the call to 'f'.
+; CHECK: subl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+; CHECK: calll f
+; CHECK-NEXT: addl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; Restore %esp from %ebp (the frame pointer) and subtract the size of the
+; callee-saved register area so those registers can be popped.
+; This restores the state prior to stack realignment and the allocation of VLAs.
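+; (The -8 below covers the two callee-saved registers pushed in the prologue,
+; 4 bytes each on i386.)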
+; CHECK-NOT: popl
+; CHECK: leal -8(%ebp), %esp
+; CHECK-NEXT: popl
+; CHECK-NEXT: popl
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: ret
+
+entry:
+ br label %if.then
+
+if.then:
+ %0 = alloca i8, i32 %i
+ call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 %i, i32 1, i1 false)
+ %call = call i32 @f(i8* %0)
+ %conv = sext i32 %call to i64
+ ret i64 %conv
+}
+
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1) nounwind
diff --git a/test/CodeGen/X86/fp-immediate-shorten.ll b/test/CodeGen/X86/fp-immediate-shorten.ll
index cafc61a..62d8100 100644
--- a/test/CodeGen/X86/fp-immediate-shorten.ll
+++ b/test/CodeGen/X86/fp-immediate-shorten.ll
@@ -1,7 +1,7 @@
;; Test that this FP immediate is stored in the constant pool as a float.
; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3 | \
-; RUN: grep {.long.1123418112}
+; RUN: grep ".long.1123418112"
define double @D() {
ret double 1.230000e+02
diff --git a/test/CodeGen/X86/fp-in-intregs.ll b/test/CodeGen/X86/fp-in-intregs.ll
index 6966cf0..1f5121d 100644
--- a/test/CodeGen/X86/fp-in-intregs.ll
+++ b/test/CodeGen/X86/fp-in-intregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-macosx -mcpu=yonah | FileCheck %s
; CHECK-NOT: {{((xor|and)ps|movd)}}
; These operations should be done in integer registers, eliminating constant
diff --git a/test/CodeGen/X86/fp-stack-compare-cmov.ll b/test/CodeGen/X86/fp-stack-compare-cmov.ll
new file mode 100644
index 0000000..b457fbc
--- /dev/null
+++ b/test/CodeGen/X86/fp-stack-compare-cmov.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; PR1012
+
+define float @foo(float* %col.2.0) {
+; CHECK: fucompi
+; CHECK: fcmov
+ %tmp = load float* %col.2.0
+ %tmp16 = fcmp olt float %tmp, 0.000000e+00
+ %tmp20 = fsub float -0.000000e+00, %tmp
+ %iftmp.2.0 = select i1 %tmp16, float %tmp20, float %tmp
+ ret float %iftmp.2.0
+}
diff --git a/test/CodeGen/X86/fp-stack-compare.ll b/test/CodeGen/X86/fp-stack-compare.ll
index f3998b6..a8557ad 100644
--- a/test/CodeGen/X86/fp-stack-compare.ll
+++ b/test/CodeGen/X86/fp-stack-compare.ll
@@ -1,8 +1,11 @@
; RUN: llc < %s -march=x86 -mcpu=i386 | FileCheck %s
-; PR1012
+; PR6679
define float @foo(float* %col.2.0) {
-; CHECK: fucompi
+; CHECK: fucomp
+; CHECK-NOT: fucompi
+; CHECK: j
+; CHECK-NOT: fcmov
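+; (fucompi and fcmov require a P6-class CPU, so for -mcpu=i386 a plain fucomp
+; followed by a conditional branch must be used instead.)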
%tmp = load float* %col.2.0
%tmp16 = fcmp olt float %tmp, 0.000000e+00
%tmp20 = fsub float -0.000000e+00, %tmp
diff --git a/test/CodeGen/X86/fp-stack-ret.ll b/test/CodeGen/X86/fp-stack-ret.ll
index 1307f70..2733117 100644
--- a/test/CodeGen/X86/fp-stack-ret.ll
+++ b/test/CodeGen/X86/fp-stack-ret.ll
@@ -22,7 +22,7 @@ define fastcc double @test2(<2 x double> %A) {
; CHECK: test3
; CHECK: sub{{.*}}%esp
-; CHECLK-NOT: xmm
+; CHECK-NOT: xmm
define fastcc double @test3(<4 x float> %A) {
%B = bitcast <4 x float> %A to <2 x double>
%C = call fastcc double @test2(<2 x double> %B)
diff --git a/test/CodeGen/X86/fp_load_fold.ll b/test/CodeGen/X86/fp_load_fold.ll
index 0145069..a2cea5e 100644
--- a/test/CodeGen/X86/fp_load_fold.ll
+++ b/test/CodeGen/X86/fp_load_fold.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep -i ST | not grep {fadd\\|fsub\\|fdiv\\|fmul}
+; RUN: grep -i ST | not grep "fadd\|fsub\|fdiv\|fmul"
; Test that the load of the memory location is folded into the operation.
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index ff9b1b0..1344cdc 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -1,9 +1,17 @@
-; RUN: llc < %s -march=x86 >%t
-
-; RUN: grep {addl \\\$4,} %t | count 3
-; RUN: not grep {,%} %t
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
+; ATOM: foo
+; ATOM: addl
+; ATOM: leal
+; ATOM: leal
+
+; CHECK: foo
+; CHECK: addl
+; CHECK: addl
+; CHECK: addl
+
entry:
%0 = icmp sgt i32 %N, 0 ; <i1> [#uses=1]
br i1 %0, label %bb, label %return
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index 4a6927f..72a5096 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-win32 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s
; rdar://7398554
; When doing vector gather-scatter index calculation with 32-bit indices,
diff --git a/test/CodeGen/X86/gs-fold.ll b/test/CodeGen/X86/gs-fold.ll
new file mode 100644
index 0000000..dbec76b
--- /dev/null
+++ b/test/CodeGen/X86/gs-fold.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-freebsd | FileCheck %s --check-prefix=CHECK-FBSD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=CHECK-LINUX
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%struct.thread = type { i32, i32, i32, i32 }
+
+define i32 @test() nounwind uwtable {
+entry:
+ %0 = load volatile %struct.thread* addrspace(256)* null
+ %c = getelementptr inbounds %struct.thread* %0, i64 0, i32 2
+ %1 = load i32* %c, align 4
+ ret i32 %1
+}
+
+; Check that we do not assume that %gs:0 holds the address of the gs base when not targeting Linux
+; CHECK-FBSD: movq %gs:0, %rax
+; CHECK-FBSD: movl 8(%rax), %eax
+; Check that we do assume that %gs:0 holds the address of the gs base when targeting Linux
+; CHECK-LINUX: movl %gs:8, %eax
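+; (On Linux the pointer load and the field load collapse into a single
+; instruction: movq %gs:0, %rax + movl 8(%rax), %eax => movl %gs:8, %eax.)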
+
diff --git a/test/CodeGen/X86/h-register-addressing-32.ll b/test/CodeGen/X86/h-register-addressing-32.ll
index 76ffd66..968a9e8 100644
--- a/test/CodeGen/X86/h-register-addressing-32.ll
+++ b/test/CodeGen/X86/h-register-addressing-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {movzbl %\[abcd\]h,} | count 7
+; RUN: llc < %s -march=x86 | grep "movzbl %[abcd]h," | count 7
; Use h-register extract and zero-extend.
diff --git a/test/CodeGen/X86/h-register-addressing-64.ll b/test/CodeGen/X86/h-register-addressing-64.ll
index 98817f3..a19fca5 100644
--- a/test/CodeGen/X86/h-register-addressing-64.ll
+++ b/test/CodeGen/X86/h-register-addressing-64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {movzbl %\[abcd\]h,} | count 7
+; RUN: llc < %s -march=x86-64 | grep "movzbl %[abcd]h," | count 7
; Use h-register extract and zero-extend.
diff --git a/test/CodeGen/X86/h-registers-1.ll b/test/CodeGen/X86/h-registers-1.ll
index 402cdfe..903c453 100644
--- a/test/CodeGen/X86/h-registers-1.ll
+++ b/test/CodeGen/X86/h-registers-1.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-linux > %t
-; RUN: grep {movzbl %\[abcd\]h,} %t | count 8
-; RUN: grep {%\[abcd\]h} %t | not grep {%r\[\[:digit:\]\]*d}
+; RUN: grep "movzbl %[abcd]h," %t | count 8
+; RUN: grep "%[abcd]h" %t | not grep "%r[[:digit:]]*d"
; LLVM creates virtual registers for values live across blocks
; based on the type of the value. Make sure that the extracts
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index 4289fa7..74ecd04 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -stats -O2 |& grep "1 machine-licm"
+; RUN: llc < %s -stats -O2 2>&1 | grep "1 machine-licm"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.2"
diff --git a/test/CodeGen/X86/iabs.ll b/test/CodeGen/X86/iabs.ll
index a8ba015..9196cce 100644
--- a/test/CodeGen/X86/iabs.ll
+++ b/test/CodeGen/X86/iabs.ll
@@ -1,13 +1,17 @@
-; RUN: llc < %s -march=x86-64 -stats |& \
-; RUN: grep {5 .*Number of machine instrs printed}
+; RUN: llc < %s -march=x86-64 | FileCheck %s
;; Integer absolute value, should produce something at least as good as:
-;; movl %edi, %ecx
-;; sarl $31, %ecx
-;; leal (%rdi,%rcx), %eax
-;; xorl %ecx, %eax
+;; movl %edi, %eax
+;; negl %eax
+;; cmovll %edi, %eax
;; ret
+; rdar://10695237
define i32 @test(i32 %a) nounwind {
+; CHECK: test:
+; CHECK: mov
+; CHECK-NEXT: neg
+; CHECK-NEXT: cmov
+; CHECK-NEXT: ret
%tmp1neg = sub i32 0, %a
%b = icmp sgt i32 %a, -1
%abs = select i1 %b, i32 %a, i32 %tmp1neg
diff --git a/test/CodeGen/X86/illegal-vector-args-return.ll b/test/CodeGen/X86/illegal-vector-args-return.ll
index cecf77a..62a21f4 100644
--- a/test/CodeGen/X86/illegal-vector-args-return.ll
+++ b/test/CodeGen/X86/illegal-vector-args-return.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {mulpd %xmm3, %xmm1}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {mulpd %xmm2, %xmm0}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {addps %xmm3, %xmm1}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {addps %xmm2, %xmm0}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm3, %xmm1"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm2, %xmm0"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm3, %xmm1"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm2, %xmm0"
define <4 x double> @foo(<4 x double> %x, <4 x double> %z) {
%y = fmul <4 x double> %x, %z
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
index 134d6e9..747a589 100644
--- a/test/CodeGen/X86/inline-asm-error.ll
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -march x86 -regalloc=fast < %s 2> %t1
+; RUN: not llc -march x86 -regalloc=fast -optimize-regalloc=0 < %s 2> %t1
; RUN: not llc -march x86 -regalloc=basic < %s 2> %t2
; RUN: not llc -march x86 -regalloc=greedy < %s 2> %t3
; RUN: FileCheck %s < %t1
diff --git a/test/CodeGen/X86/inline-asm-modifier-n.ll b/test/CodeGen/X86/inline-asm-modifier-n.ll
index 5e76b6c..b069c46 100644
--- a/test/CodeGen/X86/inline-asm-modifier-n.ll
+++ b/test/CodeGen/X86/inline-asm-modifier-n.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep { 37}
+; RUN: llc < %s -march=x86 | grep " 37"
; rdar://7008959
define void @bork() nounwind {
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index eef6c2f..e6eb9ef 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -43,3 +43,12 @@ entry:
%0 = tail call i8 asm sideeffect "xchg $0, $1", "=r,*m,0,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %p, i1 %desired) nounwind
ret void
}
+
+; <rdar://problem/11542429>
+; The constrained GR32_ABCD register class of the 'q' constraint requires
+; special handling after the preceding outputs used up eax-edx.
+define void @constrain_abcd(i8* %h) nounwind ssp {
+entry:
+ %0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/inreg.ll b/test/CodeGen/X86/inreg.ll
new file mode 100644
index 0000000..6653cfb
--- /dev/null
+++ b/test/CodeGen/X86/inreg.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=i686-pc-linux -mcpu=corei7 | FileCheck --check-prefix=DAG %s
+; RUN: llc < %s -mtriple=i686-pc-linux -mcpu=corei7 -O0 | FileCheck --check-prefix=FAST %s
+
+%struct.s1 = type { double, float }
+
+define void @g1() nounwind {
+entry:
+ %tmp = alloca %struct.s1, align 4
+ call void @f(%struct.s1* inreg sret %tmp, i32 inreg 41, i32 inreg 42, i32 43)
+ ret void
+ ; DAG: g1:
+ ; DAG: subl $[[AMT:.*]], %esp
+ ; DAG-NEXT: $43, (%esp)
+ ; DAG-NEXT: leal 16(%esp), %eax
+ ; DAG-NEXT: movl $41, %edx
+ ; DAG-NEXT: movl $42, %ecx
+ ; DAG-NEXT: calll f
+ ; DAG-NEXT: addl $[[AMT]], %esp
+ ; DAG-NEXT: ret
+
+ ; FAST: g1:
+ ; FAST: subl $[[AMT:.*]], %esp
+ ; FAST-NEXT: leal 8(%esp), %eax
+ ; FAST-NEXT: movl $41, %edx
+ ; FAST-NEXT: movl $42, %ecx
+ ; FAST: $43, (%esp)
+ ; FAST: calll f
+ ; FAST-NEXT: addl $[[AMT]], %esp
+ ; FAST: ret
+}
+
+declare void @f(%struct.s1* inreg sret, i32 inreg, i32 inreg, i32)
+
+%struct.s2 = type {}
+
+define void @g2(%struct.s2* inreg sret %agg.result) nounwind {
+entry:
+ ret void
+ ; DAG: g2
+ ; DAG-NOT: ret $4
+ ; DAG: .size g2
+
+ ; FAST: g2
+ ; FAST-NOT: ret $4
+ ; FAST: .size g2
+}
diff --git a/test/CodeGen/X86/isel-sink2.ll b/test/CodeGen/X86/isel-sink2.ll
index 5ed0e00..b162666 100644
--- a/test/CodeGen/X86/isel-sink2.ll
+++ b/test/CodeGen/X86/isel-sink2.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 > %t
-; RUN: grep {movb.7(%...)} %t
+; RUN: grep "movb.7(%...)" %t
; RUN: not grep leal %t
define i8 @test(i32 *%P) nounwind {
diff --git a/test/CodeGen/X86/ispositive.ll b/test/CodeGen/X86/ispositive.ll
index 8adf723..b1d1a20 100644
--- a/test/CodeGen/X86/ispositive.ll
+++ b/test/CodeGen/X86/ispositive.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {shrl.*31}
+; RUN: llc < %s -march=x86 | grep "shrl.*31"
define i32 @test1(i32 %X) {
entry:
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index dbd133c..48e2106 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -22,6 +22,7 @@ declare i32 @bar(...)
declare i32 @baz(...)
; rdar://10633221
+; rdar://11355268
define i32 @g(i32 %a, i32 %b) nounwind {
entry:
; CHECK: g:
@@ -32,3 +33,223 @@ entry:
%cond = select i1 %cmp, i32 %sub, i32 0
ret i32 %cond
}
+
+; rdar://10734411
+define i32 @h(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: h:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp slt i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+define i32 @i(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: i:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+define i32 @j(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: j:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp ugt i32 %a, %b
+ %sub = sub i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+define i32 @k(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: k:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp ult i32 %b, %a
+ %sub = sub i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+; redundant cmp instruction
+define i32 @l(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l:
+; CHECK-NOT: cmp
+ %cmp = icmp slt i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 %a
+ ret i32 %cond
+}
+define i32 @m(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: m:
+; CHECK-NOT: cmp
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %b, i32 %sub
+ ret i32 %cond
+}
+; Even when a swapped sub exists, we can't remove the cmp if EFLAGS is
+; live-out of the block.
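+; (a-b and b-a set ZF identically, but SF, OF and CF differ whenever a != b,
+; so flags from a swapped sub cannot feed a signed condition in another
+; block.)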
+define i32 @l2(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l2:
+; CHECK: cmp
+ %cmp = icmp eq i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = icmp sgt i32 %b, %a
+ %sel = select i1 %cmp2, i32 %sub, i32 %a
+ ret i32 %sel
+
+if.else:
+ ret i32 %sub
+}
+define i32 @l3(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l3:
+; CHECK: sub
+; CHECK-NOT: cmp
+; CHECK: jge
+ %cmp = icmp sgt i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ ret i32 %sub
+
+if.else:
+ %add = add nsw i32 %sub, 1
+ ret i32 %add
+}
+; rdar://11830760
+; When a MOV32r0 ("Movr0") is scheduled between the sub and the cmp, it must be moved above the sub.
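+; (MOV32r0 is materialized as a flag-clobbering "xorl %reg, %reg", so leaving
+; it between the flag-setting sub and its user would clobber EFLAGS.)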
+define i32 @l4(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l4:
+; CHECK: xor
+; CHECK: sub
+; CHECK-NOT: cmp
+ %cmp = icmp sgt i32 %b, %a
+ %sub = sub i32 %a, %b
+ %.sub = select i1 %cmp, i32 0, i32 %sub
+ ret i32 %.sub
+}
+; rdar://11540023
+define i32 @n(i32 %x, i32 %y) nounwind {
+entry:
+; CHECK: n:
+; CHECK-NOT: sub
+; CHECK: cmp
+ %sub = sub nsw i32 %x, %y
+ %cmp = icmp slt i32 %sub, 0
+ %y.x = select i1 %cmp, i32 %y, i32 %x
+ ret i32 %y.x
+}
+; PR13046
+define void @o() nounwind uwtable {
+entry:
+ %0 = load i16* undef, align 2
+ br i1 undef, label %if.then.i, label %if.end.i
+
+if.then.i: ; preds = %entry
+ unreachable
+
+if.end.i: ; preds = %entry
+ br i1 undef, label %sw.bb, label %sw.default
+
+sw.bb: ; preds = %if.end.i
+ br i1 undef, label %if.then44, label %if.end29
+
+if.end29: ; preds = %sw.bb
+; CHECK: o:
+; CHECK: cmp
+ %1 = urem i16 %0, 10
+ %cmp25 = icmp eq i16 %1, 0
+ %. = select i1 %cmp25, i16 2, i16 0
+ br i1 %cmp25, label %if.then44, label %sw.default
+
+sw.default: ; preds = %if.end29, %if.end.i
+ br i1 undef, label %if.then.i96, label %if.else.i97
+
+if.then.i96: ; preds = %sw.default
+ unreachable
+
+if.else.i97: ; preds = %sw.default
+ unreachable
+
+if.then44: ; preds = %if.end29, %sw.bb
+ %aModeRefSel.1.ph = phi i16 [ %., %if.end29 ], [ 3, %sw.bb ]
+ br i1 undef, label %if.then.i103, label %if.else.i104
+
+if.then.i103: ; preds = %if.then44
+ unreachable
+
+if.else.i104: ; preds = %if.then44
+ ret void
+}
+; rdar://11855129
+define i32 @p(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: p:
+; CHECK-NOT: test
+; CHECK: cmovs
+ %add = add nsw i32 %b, %a
+ %cmp = icmp sgt i32 %add, 0
+ %add. = select i1 %cmp, i32 %add, i32 0
+ ret i32 %add.
+}
+; PR13475
+; If we have "sub a, b" and "cmp b, a", and the result of the cmp is used
+; by an sbb, we should not optimize the cmp away.
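+; (sbb consumes CF: CF of a-b is set iff a < b unsigned, while CF of b-a is
+; set iff b < a, so reusing the swapped sub's flags would yield the wrong
+; borrow.)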
+define i32 @q(i32 %j.4, i32 %w, i32 %el) {
+; CHECK: q:
+; CHECK: sub
+; CHECK: cmp
+; CHECK-NEXT: sbb
+ %tmp532 = add i32 %j.4, %w
+ %tmp533 = icmp ugt i32 %tmp532, %el
+ %tmp534 = icmp ult i32 %w, %el
+ %or.cond = and i1 %tmp533, %tmp534
+ %tmp535 = sub i32 %el, %w
+ %j.5 = select i1 %or.cond, i32 %tmp535, i32 %j.4
+ ret i32 %j.5
+}
+; rdar://11873276
+define i8* @r(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK: r:
+; CHECK: sub
+; CHECK-NOT: cmp
+; CHECK: j
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 8
+ %cmp = icmp slt i32 %0, %size
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, %size
+ store i32 %sub, i32* %offset, align 8
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}
diff --git a/test/CodeGen/X86/label-redefinition.ll b/test/CodeGen/X86/label-redefinition.ll
index 9ad33e0..9e88a18 100644
--- a/test/CodeGen/X86/label-redefinition.ll
+++ b/test/CodeGen/X86/label-redefinition.ll
@@ -1,5 +1,5 @@
; PR7054
-; RUN: not llc %s -o - |& grep {'_foo' label emitted multiple times to assembly}
+; RUN: not llc %s -o - 2>&1 | grep "'_foo' label emitted multiple times to assembly"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin10.0.0"
diff --git a/test/CodeGen/X86/large-global.ll b/test/CodeGen/X86/large-global.ll
new file mode 100644
index 0000000..7cb974b
--- /dev/null
+++ b/test/CodeGen/X86/large-global.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+; rdar://11729134
+
+; EmitZerofill was incorrectly expecting a 32-bit "size", so 26214400000
+; was printed as 444596224.
+
+%struct.X = type { [25000 x i8] }
+
+@gArray = global [1048576 x %struct.X] zeroinitializer, align 16
+
+; CHECK: .zerofill __DATA,__common,_gArray,26214400000,4
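+;
+; Size arithmetic: 1048576 structs * 25000 bytes = 26214400000 bytes, and
+; 26214400000 mod 2^32 = 444596224, which is exactly the truncated value the
+; broken 32-bit path used to print.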
diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll
index 6930350..43f69b0 100644
--- a/test/CodeGen/X86/lea-2.ll
+++ b/test/CodeGen/X86/lea-2.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {lea EAX, DWORD PTR \\\[... + 4\\*... - 5\\\]}
+; RUN: grep "lea EAX, DWORD PTR \[... + 4\*... - 5\]"
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
; RUN: not grep add
diff --git a/test/CodeGen/X86/liveness-local-regalloc.ll b/test/CodeGen/X86/liveness-local-regalloc.ll
index b469d083..721f545 100644
--- a/test/CodeGen/X86/liveness-local-regalloc.ll
+++ b/test/CodeGen/X86/liveness-local-regalloc.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O3 -regalloc=fast -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs -mtriple=x86_64-apple-darwin10
; <rdar://problem/7755473>
+; PR12821
%0 = type { i32, i8*, i8*, %1*, i8*, i64, i64, i32, i32, i32, i32, [1024 x i8] }
%1 = type { i8*, i32, i32, i16, i16, %2, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %2, %3*, i32, [3 x i8], [1 x i8], %2, i32, i64 }
@@ -58,3 +59,34 @@ infloop: ; preds = %infloop, %bb3
infloop1: ; preds = %infloop1, %bb5
br label %infloop1
}
+
+
+; RAFast would forget to add a super-register <imp-def> when rewriting:
+; %vreg10:sub_32bit<def,read-undef> = COPY %R9D<kill>
+; This trips up the machine code verifier.
+define void @autogen_SD24657(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca <16 x i16>
+ %A3 = alloca double
+ %A2 = alloca <2 x i8>
+ %A1 = alloca i1
+ %A = alloca i32
+ %L = load i8* %0
+ store i8 -37, i8* %0
+ %E = extractelement <4 x i64> zeroinitializer, i32 2
+ %Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+ %I = insertelement <2 x i8> <i8 -1, i8 -1>, i8 %5, i32 1
+ %B = fadd float 0x45CDF5B1C0000000, 0x45CDF5B1C0000000
+ %FC = uitofp i32 275048 to double
+ %Sl = select i1 true, <2 x i8> %I, <2 x i8> <i8 -1, i8 -1>
+ %Cmp = icmp slt i64 0, %E
+ br label %CF
+
+CF: ; preds = %BB
+ store i8 %5, i8* %0
+ store <2 x i8> %I, <2 x i8>* %A2
+ store i8 %5, i8* %0
+ store i8 %5, i8* %0
+ store i8 %5, i8* %0
+ ret void
+}
diff --git a/test/CodeGen/X86/loop-blocks.ll b/test/CodeGen/X86/loop-blocks.ll
index d14102f..4bd162b 100644
--- a/test/CodeGen/X86/loop-blocks.ll
+++ b/test/CodeGen/X86/loop-blocks.ll
@@ -41,7 +41,6 @@ done:
; CHECK-NEXT: align
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: callq bar99
-; CHECK-NEXT: align
; CHECK-NEXT: .LBB1_1:
; CHECK-NEXT: callq body
@@ -79,7 +78,6 @@ exit:
; CHECK-NEXT: .LBB2_5:
; CHECK-NEXT: callq block_a_true_func
; CHECK-NEXT: callq block_a_merge_func
-; CHECK-NEXT: align
; CHECK-NEXT: .LBB2_1:
; CHECK-NEXT: callq body
;
@@ -139,13 +137,13 @@ exit:
; CHECK-NEXT: align
; CHECK-NEXT: .LBB3_7:
; CHECK-NEXT: callq bar100
-; CHECK-NEXT: align
; CHECK-NEXT: .LBB3_1:
; CHECK-NEXT: callq loop_header
; CHECK: jl .LBB3_7
; CHECK: jge .LBB3_3
; CHECK-NEXT: callq bar101
; CHECK-NEXT: jmp .LBB3_1
+; CHECK-NEXT: align
; CHECK-NEXT: .LBB3_3:
; CHECK: jge .LBB3_4
; CHECK-NEXT: callq bar102
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index ebda9f2..8a81f70 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -1,10 +1,16 @@
-; RUN: llc -mtriple=x86_64-darwin < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin -mcpu=generic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
; CHECK: t:
; CHECK: decq
-; CHECK-NEXT: movl (
+; CHECK-NEXT: movl (%r9,%rax,4), %eax
; CHECK-NEXT: jne
+; ATOM: t:
+; ATOM: movl (%r9,%rax,4), %eax
+; ATOM-NEXT: decq
+; ATOM-NEXT: jne
+
@Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
@Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
@Te3 = external global [256 x i32] ; <[256 x i32]*> [#uses=2]
@@ -149,6 +155,13 @@ bb2: ; preds = %bb
; CHECK: jne
; CHECK: ret
+; ATOM: f:
+; ATOM: %for.body
+; ATOM: incl [[IV:%e..]]
+; ATOM: cmpl $1, [[IV]]
+; ATOM: jne
+; ATOM: ret
+
define i32 @f(i32 %i, i32* nocapture %a) nounwind uwtable readonly ssp {
entry:
%cmp4 = icmp eq i32 %i, 1
diff --git a/test/CodeGen/X86/lsr-reuse-trunc.ll b/test/CodeGen/X86/lsr-reuse-trunc.ll
index 1f87089..276dab7 100644
--- a/test/CodeGen/X86/lsr-reuse-trunc.ll
+++ b/test/CodeGen/X86/lsr-reuse-trunc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck %s
; Full strength reduction wouldn't reduce register pressure, so LSR should
; stick with indexing here.
diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll
index c9ed3e5..6566f56 100644
--- a/test/CodeGen/X86/lsr-static-addr.ll
+++ b/test/CodeGen/X86/lsr-static-addr.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=atom -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck -check-prefix=ATOM %s
; CHECK: xorl %eax, %eax
; CHECK: movsd .LCPI0_0(%rip), %xmm0
@@ -9,6 +10,15 @@
; CHECK-NEXT: movsd
; CHECK-NEXT: incq %rax
+; ATOM: movsd .LCPI0_0(%rip), %xmm0
+; ATOM: xorl %eax, %eax
+; ATOM: align
+; ATOM-NEXT: BB0_2:
+; ATOM-NEXT: movsd A(,%rax,8)
+; ATOM-NEXT: mulsd
+; ATOM-NEXT: movsd
+; ATOM-NEXT: incq %rax
+
@A = external global [0 x double]
define void @foo(i64 %n) nounwind {
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index a757cde..d171fd5 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -99,3 +99,60 @@ return: ; preds = %if.end, %entry
%retval.0 = phi i32 [ 1, %entry ], [ %., %if.end ]
ret i32 %retval.0
}
+
+; rdar://11393714
+define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp {
+; CHECK: %entry
+; CHECK: xorl
+; CHECK: %preheader
+; CHECK: %do.body
+; CHECK-NOT: xorl
+; CHECK: %do.cond
+; CHECK-NOT: xorl
+; CHECK: %return
+entry:
+ %cmp = icmp eq i64 %n, 0
+ br i1 %cmp, label %return, label %preheader
+
+preheader:
+ %conv2 = and i32 %c, 255
+ br label %do.body
+
+do.body:
+ %n.addr.0 = phi i64 [ %dec, %do.cond ], [ %n, %preheader ]
+ %p.0 = phi i8* [ %incdec.ptr, %do.cond ], [ %s, %preheader ]
+ %cmp3 = icmp eq i32 %a, %conv2
+ br i1 %cmp3, label %return, label %do.cond
+
+do.cond:
+ %incdec.ptr = getelementptr inbounds i8* %p.0, i64 1
+ %dec = add i64 %n.addr.0, -1
+ %cmp6 = icmp eq i64 %dec, 0
+ br i1 %cmp6, label %return, label %do.body
+
+return:
+ %retval.0 = phi i8* [ null, %entry ], [ null, %do.cond ], [ %p.0, %do.body ]
+ ret i8* %retval.0
+}
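+; The xor that materializes the null return value should be emitted once in
+; the entry block and must not reappear in the loop blocks, which is what the
+; CHECK-NOT lines above verify.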
+
+; PR13578
+@t2_global = external global i32
+
+declare i1 @t2_func()
+
+define i32 @t2() {
+ store i32 42, i32* @t2_global
+ %c = call i1 @t2_func()
+ br i1 %c, label %a, label %b
+
+a:
+ %l = load i32* @t2_global
+ ret i32 %l
+
+b:
+ ret i32 0
+
+; CHECK: t2:
+; CHECK: t2_global@GOTPCREL(%rip)
+; CHECK-NOT: t2_global@GOTPCREL(%rip)
+}
diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll
index 80103d1..0015df0 100644
--- a/test/CodeGen/X86/mem-promote-integers.ll
+++ b/test/CodeGen/X86/mem-promote-integers.ll
@@ -1,8 +1,8 @@
; Test the basic functionality of integer element promotions of different types.
; This test checks passing of arguments, loading from and storing to memory,
; and basic arithmetic.
-; RUN: llc -march=x86 -promote-elements < %s
-; RUN: llc -march=x86-64 -promote-elements < %s
+; RUN: llc -march=x86 < %s
+; RUN: llc -march=x86-64 < %s
define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) {
%bb = load <1 x i8>* %b
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index f4bc1bb..723d1d8 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -disable-simplify-libcalls -mtriple=x86_64-linux | FileCheck %s --check-prefix=NOBUILTIN
; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
; This tests codegen time inlining/optimization of memcmp
@@ -23,6 +24,8 @@ return: ; preds = %entry
; CHECK: memcmp2:
; CHECK: movw ([[A0:%rdi|%rcx]]), %ax
; CHECK: cmpw ([[A1:%rsi|%rdx]]), %ax
+; NOBUILTIN: memcmp2:
+; NOBUILTIN: callq
}
define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind {
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
index 689f7bf..206cb33 100644
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ b/test/CodeGen/X86/mmx-punpckhdq.ll
@@ -3,7 +3,7 @@
define void @bork(<1 x i64>* %x) {
; CHECK: bork
-; CHECK: pextrd
+; CHECK: movlpd
entry:
%tmp2 = load <1 x i64>* %x ; <<1 x i64>> [#uses=1]
%tmp6 = bitcast <1 x i64> %tmp2 to <2 x i32> ; <<2 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index aeb540f..65ee7b1 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -55,4 +55,20 @@ entry:
; X64: ret
}
+; The two loads here look identical to the selection DAG, except for their
+; address spaces. Make sure they aren't CSE'd.
+define i32 @test_no_cse() nounwind readonly {
+entry:
+ %tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1]
+ %tmp1 = load i32* %tmp ; <i32> [#uses=1]
+ %tmp2 = load i32* addrspace(257)* getelementptr (i32* addrspace(257)* inttoptr (i32 72 to i32* addrspace(257)*), i32 31) ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp4 = add i32 %tmp1, %tmp3
+ ret i32 %tmp4
+}
+; X32: test_no_cse:
+; X32: movl %gs:196
+; X32: movl %fs:196
+; X32: ret
+
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/multiple-loop-post-inc.ll b/test/CodeGen/X86/multiple-loop-post-inc.ll
index 4f7e28a..9f7d036 100644
--- a/test/CodeGen/X86/multiple-loop-post-inc.ll
+++ b/test/CodeGen/X86/multiple-loop-post-inc.ll
@@ -1,9 +1,9 @@
-; RUN: llc -asm-verbose=false -disable-branch-fold -disable-code-place -disable-tail-duplicate -march=x86-64 < %s | FileCheck %s
+; RUN: llc -asm-verbose=false -disable-branch-fold -disable-code-place -disable-tail-duplicate -march=x86-64 -mcpu=nehalem < %s | FileCheck %s
; rdar://7236213
-
-; Xfailed now that scheduler 2-address hack is disabled a lea is generated.
-; The code isn't any worse though.
-; XFAIL: *
+;
+; The scheduler's 2-address hack has been disabled; until the machine
+; scheduler develops an equivalent heuristic, there is no strong guarantee
+; that this test will pass.
; CodeGen shouldn't require any lea instructions inside the marked loop.
; It should properly set up post-increment uses and do coalescing for
diff --git a/test/CodeGen/X86/neg_cmp.ll b/test/CodeGen/X86/neg_cmp.ll
new file mode 100644
index 0000000..866514e
--- /dev/null
+++ b/test/CodeGen/X86/neg_cmp.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+; rdar://11245199
+; PR12545
+define void @f(i32 %x, i32 %y) nounwind uwtable ssp {
+entry:
+; CHECK: f:
+; CHECK-NOT: neg
+; CHECK: add
+ %sub = sub i32 0, %y
+ %cmp = icmp eq i32 %x, %sub
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @g() nounwind
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
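+; Rationale: for equality, x == -y is equivalent to x + y == 0 in modular
+; arithmetic, so the comparison against the negated value can be folded into
+; an add and the neg instruction disappears.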
+
+declare void @g()
diff --git a/test/CodeGen/X86/opt-shuff-tstore.ll b/test/CodeGen/X86/opt-shuff-tstore.ll
index fc24913..3e72084 100644
--- a/test/CodeGen/X86/opt-shuff-tstore.ll
+++ b/test/CodeGen/X86/opt-shuff-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s -promote-elements -mattr=+sse2,+sse41 | FileCheck %s
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s -mattr=+sse2,+sse41 | FileCheck %s
; CHECK: func_4_8
; A single memory write
diff --git a/test/CodeGen/X86/overlap-shift.ll b/test/CodeGen/X86/overlap-shift.ll
index d185af1..e987495 100644
--- a/test/CodeGen/X86/overlap-shift.ll
+++ b/test/CodeGen/X86/overlap-shift.ll
@@ -7,7 +7,7 @@
; Check that the shift gets turned into an LEA.
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: not grep {mov E.X, E.X}
+; RUN: not grep "mov E.X, E.X"
@G = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/pass-three.ll b/test/CodeGen/X86/pass-three.ll
new file mode 100644
index 0000000..23005c7
--- /dev/null
+++ b/test/CodeGen/X86/pass-three.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.3.0"
+
+
+define { i8*, i64, i64* } @copy_3(i8* %a, i64 %b, i64* %c) nounwind {
+entry:
+ %0 = insertvalue { i8*, i64, i64* } undef, i8* %a, 0
+ %1 = insertvalue { i8*, i64, i64* } %0, i64 %b, 1
+ %2 = insertvalue { i8*, i64, i64* } %1, i64* %c, 2
+ ret { i8*, i64, i64* } %2
+}
+
+; CHECK: copy_3:
+; CHECK-NOT: (%rdi)
+; CHECK: ret
diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll
index d48a331..f958b6b 100644
--- a/test/CodeGen/X86/peep-vector-extract-insert.ll
+++ b/test/CodeGen/X86/peep-vector-extract-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {xorps %xmm0, %xmm0} | count 2
+; RUN: llc < %s -march=x86-64 | grep "xorps %xmm0, %xmm0" | count 2
define float @foo(<4 x float> %a) {
%b = insertelement <4 x float> %a, float 0.0, i32 3
diff --git a/test/CodeGen/X86/phi-immediate-factoring.ll b/test/CodeGen/X86/phi-immediate-factoring.ll
index ef02af2..476bb10 100644
--- a/test/CodeGen/X86/phi-immediate-factoring.ll
+++ b/test/CodeGen/X86/phi-immediate-factoring.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& grep {Number of blocks eliminated} | grep 6
+; RUN: llc < %s -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
; PR1296
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/phielim-split.ll b/test/CodeGen/X86/phielim-split.ll
new file mode 100644
index 0000000..aa47735
--- /dev/null
+++ b/test/CodeGen/X86/phielim-split.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target triple = "x86_64-apple-macosx10.8.0"
+
+; The critical edge from for.cond to if.end2 should be split to avoid injecting
+; copies into the loop. The use of %b after the loop causes interference that
+; makes a copy necessary.
+; <rdar://problem/11561842>
+;
+; CHECK: split_loop_exit
+; CHECK: %for.cond
+; CHECK-NOT: mov
+; CHECK: je
+
+define i32 @split_loop_exit(i32 %a, i32 %b, i8* nocapture %p) nounwind uwtable readonly ssp {
+entry:
+ %cmp = icmp sgt i32 %a, 10
+ br i1 %cmp, label %for.cond, label %if.end2
+
+for.cond: ; preds = %entry, %for.cond
+ %p.addr.0 = phi i8* [ %incdec.ptr, %for.cond ], [ %p, %entry ]
+ %incdec.ptr = getelementptr inbounds i8* %p.addr.0, i64 1
+ %0 = load i8* %p.addr.0, align 1
+ %tobool = icmp eq i8 %0, 0
+ br i1 %tobool, label %for.cond, label %if.end2
+
+if.end2: ; preds = %for.cond, %entry
+ %r.0 = phi i32 [ %a, %entry ], [ %b, %for.cond ]
+ %add = add nsw i32 %r.0, %b
+ ret i32 %add
+}
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index 8b9ea17..37eca1c 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -regalloc=fast | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -regalloc=fast | FileCheck %s
-; CHECKed instructions should be the same with or without -O0.
+; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
+; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
+; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.
@.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
@@ -15,6 +16,19 @@ entry:
; CHECK: movl %ebx, 40(%esp)
; CHECK-NOT: movl
; CHECK: addl %ebx, %eax
+
+; On Intel Atom, the scheduler moves a movl instruction used for the printf
+; call so that it follows 'movl 24(%esp), %eax'.
+; ATOM: movl 24(%esp), %eax
+; ATOM: movl
+; ATOM: movl %eax, 36(%esp)
+; ATOM-NOT: movl
+; ATOM: movl 28(%esp), %ebx
+; ATOM-NOT: movl
+; ATOM: movl %ebx, 40(%esp)
+; ATOM-NOT: movl
+; ATOM: addl %ebx, %eax
+
%retval = alloca i32 ; <i32*> [#uses=2]
%"%ebx" = alloca i32 ; <i32*> [#uses=1]
%"%eax" = alloca i32 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 4162015..984d7e5 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,10 +1,14 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -join-physregs | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; XFAIL: *
; rdar://5571034
; This requires physreg joining, %vreg13 is live everywhere:
; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
+;
+; This test is XFAIL until the register allocator understands trivial physreg
+; interference. <rdar://9802098>
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK: foo:
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index d8ed4c0..da4af81 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,9 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 -join-physregs > %t
+; RUN: llc < %s -march=x86 -mattr=sse41 -mcpu=nehalem -stack-alignment=16 > %t
; RUN: grep pmul %t | count 12
; RUN: grep mov %t | count 11
-; The f() arguments in %xmm0 and %xmm1 cause an extra movdqa without -join-physregs.
-
define <4 x i32> @a(<4 x i32> %i) nounwind {
%A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
ret <4 x i32> %A
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index cc1df2f..800fbed 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -105,8 +105,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
entry:
%G = load <2 x i8*>* %p
;CHECK: movl
-;CHECK: movd
-;CHECK: pinsrd
+;CHECK: movsd
%T = bitcast <2 x i8*> %G to <2 x i32*>
;CHECK: ret
ret <2 x i32*> %T
diff --git a/test/CodeGen/X86/pr11415.ll b/test/CodeGen/X86/pr11415.ll
index e1fa032..6c32a22 100644
--- a/test/CodeGen/X86/pr11415.ll
+++ b/test/CodeGen/X86/pr11415.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-pc-linux %s -o - -regalloc=fast | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux %s -o - -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; We used to consider the early clobber in the second asm statement as
; defining %0 before it was read. This caused us to omit the
diff --git a/test/CodeGen/X86/pr11468.ll b/test/CodeGen/X86/pr11468.ll
new file mode 100644
index 0000000..f7e9adb
--- /dev/null
+++ b/test/CodeGen/X86/pr11468.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -force-align-stack -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; PR11468
+
+define void @f(i64 %sz) uwtable {
+entry:
+ %a = alloca i32, align 32
+ store volatile i32 0, i32* %a, align 32
+ ; force to push r14 on stack
+ call void asm sideeffect "nop", "~{r14},~{dirflag},~{fpsr},~{flags}"() nounwind, !srcloc !0
+ ret void
+
+; CHECK: _f
+; CHECK: pushq %rbp
+; CHECK: .cfi_offset %rbp, -16
+; CHECK: movq %rsp, %rbp
+; CHECK: .cfi_def_cfa_register %rbp
+
+; We first push the register on the stack and then realign the stack, so
+; that the .cfi_offset value is correct.
+; CHECK: pushq %r14
+; CHECK: andq $-32, %rsp
+; CHECK: .cfi_offset %r14, -24
+
+; Restore %rsp from %rbp and subtract the total size of the saved registers.
+; CHECK: leaq -8(%rbp), %rsp
+
+; Pop saved registers.
+; CHECK: popq %r14
+; CHECK: popq %rbp
+}
+
+!0 = metadata !{i32 125}
+
diff --git a/test/CodeGen/X86/pr12889.ll b/test/CodeGen/X86/pr12889.ll
new file mode 100644
index 0000000..331d8f9
--- /dev/null
+++ b/test/CodeGen/X86/pr12889.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=x86
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@c0 = common global i8 0, align 1
+
+define void @func() nounwind uwtable {
+entry:
+ %0 = load i8* @c0, align 1, !tbaa !0
+ %tobool = icmp ne i8 %0, 0
+ %conv = zext i1 %tobool to i8
+ %storemerge = shl nuw nsw i8 %conv, %conv
+ store i8 %storemerge, i8* @c0, align 1
+ ret void
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/pr13209.ll b/test/CodeGen/X86/pr13209.ll
new file mode 100644
index 0000000..1c93163
--- /dev/null
+++ b/test/CodeGen/X86/pr13209.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+
+; CHECK: pr13209:
+; CHECK-NOT: mov
+; CHECK: .size pr13209
+
+define zeroext i1 @pr13209(i8** %x, i8*** %jumpTable) nounwind {
+if.end51:
+ br label %indirectgoto.preheader
+indirectgoto.preheader:
+ %frombool.i5915.ph = phi i8 [ undef, %if.end51 ], [ %frombool.i5917, %jit_return ]
+ br label %indirectgoto
+do.end165:
+ %tmp92 = load i8** %x, align 8
+ br label %indirectgoto
+do.end209:
+ %tmp104 = load i8** %x, align 8
+ br label %indirectgoto
+do.end220:
+ %tmp107 = load i8** %x, align 8
+ br label %indirectgoto
+do.end231:
+ %tmp110 = load i8** %x, align 8
+ br label %indirectgoto
+do.end242:
+ %tmp113 = load i8** %x, align 8
+ br label %indirectgoto
+do.end253:
+ %tmp116 = load i8** %x, align 8
+ br label %indirectgoto
+do.end286:
+ %tmp125 = load i8** %x, align 8
+ br label %indirectgoto
+do.end297:
+ %tmp128 = load i8** %x, align 8
+ br label %indirectgoto
+do.end308:
+ %tmp131 = load i8** %x, align 8
+ br label %indirectgoto
+do.end429:
+ %tmp164 = load i8** %x, align 8
+ br label %indirectgoto
+do.end440:
+ %tmp167 = load i8** %x, align 8
+ br label %indirectgoto
+do.body482:
+ br i1 false, label %indirectgoto, label %do.body495
+do.body495:
+ br label %indirectgoto
+do.end723:
+ br label %inline_return
+inline_return:
+ %frombool.i5917 = phi i8 [ 0, %if.end5571 ], [ %frombool.i5915, %do.end723 ]
+ br label %jit_return
+jit_return:
+ br label %indirectgoto.preheader
+L_JSOP_UINT24:
+ %tmp864 = load i8** %x, align 8
+ br label %indirectgoto
+L_JSOP_THROWING:
+ %tmp1201 = load i8** %x, align 8
+ br label %indirectgoto
+do.body4936:
+ %tmp1240 = load i8** %x, align 8
+ br label %indirectgoto
+do.body5184:
+ %tmp1340 = load i8** %x, align 8
+ br label %indirectgoto
+if.end5571:
+ br label %inline_return
+indirectgoto:
+ %frombool.i5915 = phi i8 [ 0, %do.body495 ],[ 0, %do.body482 ] , [ %frombool.i5915, %do.body4936 ],[ %frombool.i5915, %do.body5184 ], [ %frombool.i5915, %L_JSOP_UINT24 ], [ %frombool.i5915, %do.end286 ], [ %frombool.i5915, %do.end297 ], [ %frombool.i5915, %do.end308 ], [ %frombool.i5915, %do.end429 ], [ %frombool.i5915, %do.end440 ], [ %frombool.i5915, %L_JSOP_THROWING ], [ %frombool.i5915, %do.end253 ], [ %frombool.i5915, %do.end242 ], [ %frombool.i5915, %do.end231 ], [ %frombool.i5915, %do.end220 ], [ %frombool.i5915, %do.end209 ],[ %frombool.i5915, %do.end165 ], [ %frombool.i5915.ph, %indirectgoto.preheader ]
+ indirectbr i8* null, [ label %if.end5571, label %do.end165, label %do.end209, label %do.end220, label %do.end231, label %do.end242, label %do.end253, label %do.end723, label %L_JSOP_THROWING, label %do.end440, label %do.end429, label %do.end308, label %do.end297, label %do.end286, label %L_JSOP_UINT24, label %do.body5184, label %do.body4936, label %do.body482]
+}
diff --git a/test/CodeGen/X86/pr13220.ll b/test/CodeGen/X86/pr13220.ll
new file mode 100644
index 0000000..b9ac4b6
--- /dev/null
+++ b/test/CodeGen/X86/pr13220.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=x86 < %s
+; PR13220
+
+define <8 x i32> @foo(<8 x i96> %x) {
+ %a = lshr <8 x i96> %x, <i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1>
+ %b = trunc <8 x i96> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+define <8 x i32> @bar(<8 x i97> %x) {
+ %a = lshr <8 x i97> %x, <i97 1, i97 1, i97 1, i97 1, i97 1, i97 1, i97 1, i97 1>
+ %b = trunc <8 x i97> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+define <8 x i32> @bax() {
+ %a = lshr <8 x i96> <i96 4, i96 4, i96 4, i96 4, i96 4, i96 4, i96 4, i96 4>, <i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1>
+ %b = trunc <8 x i96> %a to <8 x i32>
+ ret <8 x i32> %b
+}
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
new file mode 100644
index 0000000..faaec26
--- /dev/null
+++ b/test/CodeGen/X86/pr13577.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=x86-64
+
+define x86_fp80 @foo(x86_fp80 %a) {
+ %1 = tail call x86_fp80 @copysignl(x86_fp80 0xK7FFF8000000000000000, x86_fp80 %a) nounwind readnone
+ ret x86_fp80 %1
+}
+
+declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index afd7114..f0e31f7 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {xorps.\*sp} | count 1
+; RUN: llc < %s -march=x86 -mattr=+sse2 | grep "xorps.*sp" | count 1
; PR2656
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/pr3522.ll b/test/CodeGen/X86/pr3522.ll
index 1122530..d8f3778 100644
--- a/test/CodeGen/X86/pr3522.ll
+++ b/test/CodeGen/X86/pr3522.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& not grep {instructions sunk}
+; RUN: llc < %s -march=x86 -stats 2>&1 | not grep "instructions sunk"
; PR3522
target triple = "i386-pc-linux-gnu"
diff --git a/test/CodeGen/X86/promote-trunc.ll b/test/CodeGen/X86/promote-trunc.ll
index 4211d82..40a58b0 100644
--- a/test/CodeGen/X86/promote-trunc.ll
+++ b/test/CodeGen/X86/promote-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -promote-elements < %s -march=x86-64
+; RUN: llc < %s -march=x86-64
define<4 x i8> @func_8_64() {
%F = load <4 x i64>* undef
diff --git a/test/CodeGen/X86/rd-mod-wr-eflags.ll b/test/CodeGen/X86/rd-mod-wr-eflags.ll
index faca3d7..8ef9b5d 100644
--- a/test/CodeGen/X86/rd-mod-wr-eflags.ll
+++ b/test/CodeGen/X86/rd-mod-wr-eflags.ll
@@ -177,3 +177,49 @@ if.end4:
return:
ret void
}
+
+; Deal with TokenFactor chain
+; rdar://11236106
+@foo = external global i64*, align 8
+
+define void @test3() nounwind ssp {
+entry:
+; CHECK: test3:
+; CHECK: decq 16(%rax)
+ %0 = load i64** @foo, align 8
+ %arrayidx = getelementptr inbounds i64* %0, i64 2
+ %1 = load i64* %arrayidx, align 8
+ %dec = add i64 %1, -1
+ store i64 %dec, i64* %arrayidx, align 8
+ %cmp = icmp eq i64 %dec, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @baz() nounwind
+ br label %if.end
+
+if.end:
+ ret void
+}
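+; The load/decrement/store plus compare-with-zero above should fold into a
+; single memory-operand decq whose EFLAGS result feeds the branch, even though
+; the chain runs through a TokenFactor node.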
+
+declare void @baz()
+
+; Avoid creating a cycle in the DAG which would trigger an assert in the
+; scheduler.
+; PR12565
+; rdar://11451474
+@x = external global i32, align 4
+@y = external global i32, align 4
+@z = external global i32, align 4
+
+define void @test4() nounwind uwtable ssp {
+entry:
+ %0 = load i32* @x, align 4
+ %1 = load i32* @y, align 4
+ %dec = add nsw i32 %1, -1
+ store i32 %dec, i32* @y, align 4
+ %tobool.i = icmp ne i32 %dec, 0
+ %cond.i = select i1 %tobool.i, i32 %0, i32 0
+ store i32 %cond.i, i32* @z, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
new file mode 100644
index 0000000..e2224a6
--- /dev/null
+++ b/test/CodeGen/X86/rdrand.ll
@@ -0,0 +1,85 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx-i -mattr=+rdrand | FileCheck %s
+declare {i16, i32} @llvm.x86.rdrand.16()
+declare {i32, i32} @llvm.x86.rdrand.32()
+declare {i64, i32} @llvm.x86.rdrand.64()
+
+define i32 @_rdrand16_step(i16* %random_val) {
+ %call = call {i16, i32} @llvm.x86.rdrand.16()
+ %randval = extractvalue {i16, i32} %call, 0
+ store i16 %randval, i16* %random_val
+ %isvalid = extractvalue {i16, i32} %call, 1
+ ret i32 %isvalid
+; CHECK: _rdrand16_step:
+; CHECK: rdrandw %ax
+; CHECK: movw %ax, (%r[[A0:di|cx]])
+; CHECK: movzwl %ax, %ecx
+; CHECK: movl $1, %eax
+; CHECK: cmovael %ecx, %eax
+; CHECK: ret
+}
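+; Note on the cmov: rdrand reports success in CF and, per the ISA, leaves the
+; destination register zero when no random value is available, so moving the
+; zero-extended result over the constant 1 when CF is clear yields the
+; 'isvalid' flag without a branch.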
+
+define i32 @_rdrand32_step(i32* %random_val) {
+ %call = call {i32, i32} @llvm.x86.rdrand.32()
+ %randval = extractvalue {i32, i32} %call, 0
+ store i32 %randval, i32* %random_val
+ %isvalid = extractvalue {i32, i32} %call, 1
+ ret i32 %isvalid
+; CHECK: _rdrand32_step:
+; CHECK: rdrandl %e[[T0:[a-z]+]]
+; CHECK: movl %e[[T0]], (%r[[A0]])
+; CHECK: movl $1, %eax
+; CHECK: cmovael %e[[T0]], %eax
+; CHECK: ret
+}
+
+define i32 @_rdrand64_step(i64* %random_val) {
+ %call = call {i64, i32} @llvm.x86.rdrand.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ store i64 %randval, i64* %random_val
+ %isvalid = extractvalue {i64, i32} %call, 1
+ ret i32 %isvalid
+; CHECK: _rdrand64_step:
+; CHECK: rdrandq %r[[T1:[a-z]+]]
+; CHECK: movq %r[[T1]], (%r[[A0]])
+; CHECK: movl $1, %eax
+; CHECK: cmovael %e[[T1]], %eax
+; CHECK: ret
+}
+
+; Check that MachineCSE doesn't eliminate duplicate rdrand instructions.
+define i32 @CSE() nounwind {
+ %rand1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v1 = extractvalue { i32, i32 } %rand1, 0
+ %rand2 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v2 = extractvalue { i32, i32 } %rand2, 0
+ %add = add i32 %v2, %v1
+ ret i32 %add
+; CHECK: CSE:
+; CHECK: rdrandl
+; CHECK: rdrandl
+}
+
+; Check that MachineLICM doesn't hoist rdrand instructions.
+define void @loop(i32* %p, i32 %n) nounwind {
+entry:
+ %tobool1 = icmp eq i32 %n, 0
+ br i1 %tobool1, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %p.addr.03 = phi i32* [ %incdec.ptr, %while.body ], [ %p, %entry ]
+ %n.addr.02 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+ %dec = add nsw i32 %n.addr.02, -1
+ %incdec.ptr = getelementptr inbounds i32* %p.addr.03, i64 1
+ %rand = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v1 = extractvalue { i32, i32 } %rand, 0
+ store i32 %v1, i32* %p.addr.03, align 4
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+; CHECK: loop:
+; CHECK-NOT: rdrandl
+; CHECK: This Inner Loop Header: Depth=1
+; CHECK: rdrandl
+}
diff --git a/test/CodeGen/X86/regpressure.ll b/test/CodeGen/X86/regpressure.ll
index e0b5f7a..52d7b56 100644
--- a/test/CodeGen/X86/regpressure.ll
+++ b/test/CodeGen/X86/regpressure.ll
@@ -1,8 +1,8 @@
;; Both functions in this testcase should codegen to the same function, and
;; neither of them should require spilling anything to the stack.
-; RUN: llc < %s -march=x86 -stats |& \
-; RUN: not grep {Number of register spills}
+; RUN: llc < %s -march=x86 -stats 2>&1 | \
+; RUN: not grep "Number of register spills"
;; This can be compiled to use three registers if the loads are not
;; folded into the multiplies, 2 registers otherwise.
diff --git a/test/CodeGen/X86/remat-fold-load.ll b/test/CodeGen/X86/remat-fold-load.ll
new file mode 100644
index 0000000..de77ad3
--- /dev/null
+++ b/test/CodeGen/X86/remat-fold-load.ll
@@ -0,0 +1,143 @@
+; RUN: llc < %s -disable-fp-elim -verify-coalescing
+; PR13414
+;
+; During coalescing, remat triggers DCE which deletes the penultimate use of a
+; load. This load should not be folded into the remaining use because it is not
+; safe to move, and it would extend the live range of the address.
+;
+; LiveRangeEdit::foldAsLoad() doesn't extend live ranges, so -verify-coalescing
+; catches the problem.
+
+target triple = "i386-unknown-linux-gnu"
+
+%type_a = type { %type_a*, %type_b }
+%type_b = type { %type_c, i32 }
+%type_c = type { i32, %type_d }
+%type_d = type { i64 }
+%type_e = type { %type_c, i64 }
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define linkonce_odr void @test() nounwind {
+entry:
+ br i1 undef, label %while.end.while.end26_crit_edge, label %while.body12.lr.ph
+
+while.end.while.end26_crit_edge: ; preds = %entry
+ br label %while.end26
+
+while.body12.lr.ph: ; preds = %entry
+ br label %while.body12
+
+while.body12: ; preds = %if.end24, %while.body12.lr.ph
+ %tmp = phi %type_a* [ undef, %while.body12.lr.ph ], [ %tmp18, %if.end24 ]
+ %ins151154161 = phi i128 [ 0, %while.body12.lr.ph ], [ %phitmp, %if.end24 ]
+ %ins135156160 = phi i128 [ 0, %while.body12.lr.ph ], [ %phitmp158, %if.end24 ]
+ %ins151 = or i128 0, %ins151154161
+ %cmp.i.i.i.i.i67 = icmp sgt i32 undef, 8
+ br i1 %cmp.i.i.i.i.i67, label %if.then.i.i.i.i71, label %if.else.i.i.i.i74
+
+if.then.i.i.i.i71: ; preds = %while.body12
+ %call4.i.i.i.i68 = call noalias i8* @malloc(i32 undef) nounwind
+ %tmp1 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 0, i32 1
+ %buf_6.i.i.i.i70 = bitcast %type_d* %tmp1 to i8**
+ %tmp2 = load i8** %buf_6.i.i.i.i70, align 4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* %tmp2, i32 undef, i32 1, i1 false) nounwind
+ unreachable
+
+if.else.i.i.i.i74: ; preds = %while.body12
+ %i_.i.i.i.i72 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 0, i32 1, i32 0
+ %tmp3 = load i64* %i_.i.i.i.i72, align 4
+ %tmp4 = zext i64 %tmp3 to i128
+ %tmp5 = shl nuw nsw i128 %tmp4, 32
+ %ins148 = or i128 %tmp5, %ins151
+ %second3.i.i76 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 1
+ %tmp6 = load i32* %second3.i.i76, align 4
+ %tmp7 = zext i32 %tmp6 to i128
+ %tmp8 = shl nuw i128 %tmp7, 96
+ %mask144 = and i128 %ins148, 79228162495817593519834398720
+ %tmp9 = load %type_e** undef, align 4
+ %len_.i.i.i.i86 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 0
+ %tmp10 = load i32* %len_.i.i.i.i86, align 4
+ %tmp11 = zext i32 %tmp10 to i128
+ %ins135 = or i128 %tmp11, %ins135156160
+ %cmp.i.i.i.i.i88 = icmp sgt i32 %tmp10, 8
+ br i1 %cmp.i.i.i.i.i88, label %if.then.i.i.i.i92, label %if.else.i.i.i.i95
+
+if.then.i.i.i.i92: ; preds = %if.else.i.i.i.i74
+ %call4.i.i.i.i89 = call noalias i8* @malloc(i32 %tmp10) nounwind
+ %ins126 = or i128 0, %ins135
+ %tmp12 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 1
+ %buf_6.i.i.i.i91 = bitcast %type_d* %tmp12 to i8**
+ %tmp13 = load i8** %buf_6.i.i.i.i91, align 4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %call4.i.i.i.i89, i8* %tmp13, i32 %tmp10, i32 1, i1 false) nounwind
+ br label %A
+
+if.else.i.i.i.i95: ; preds = %if.else.i.i.i.i74
+ %i_.i.i.i.i93 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 1, i32 0
+ br label %A
+
+A: ; preds = %if.else.i.i.i.i95, %if.then.i.i.i.i92
+ %ins135157 = phi i128 [ %ins126, %if.then.i.i.i.i92 ], [ undef, %if.else.i.i.i.i95 ]
+ %second3.i.i97 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 1
+ %tmp14 = load i64* %second3.i.i97, align 4
+ %tmp15 = trunc i64 %tmp14 to i32
+ %cmp.i99 = icmp sgt i32 %tmp6, %tmp15
+ %tmp16 = trunc i128 %ins135157 to i32
+ %cmp.i.i.i.i.i.i101 = icmp sgt i32 %tmp16, 8
+ br i1 %cmp.i.i.i.i.i.i101, label %if.then.i.i.i.i.i103, label %B
+
+if.then.i.i.i.i.i103: ; preds = %A
+ unreachable
+
+B: ; preds = %A
+ %tmp17 = trunc i128 %ins148 to i32
+ %cmp.i.i.i.i.i.i83 = icmp sgt i32 %tmp17, 8
+ br i1 %cmp.i.i.i.i.i.i83, label %if.then.i.i.i.i.i85, label %C
+
+if.then.i.i.i.i.i85: ; preds = %B
+ unreachable
+
+C: ; preds = %B
+ br i1 %cmp.i99, label %if.then17, label %if.end24
+
+if.then17: ; preds = %C
+ br i1 false, label %if.then.i.i.i.i.i43, label %D
+
+if.then.i.i.i.i.i43: ; preds = %if.then17
+ unreachable
+
+D: ; preds = %if.then17
+ br i1 undef, label %if.then.i.i.i.i.i, label %E
+
+if.then.i.i.i.i.i: ; preds = %D
+ unreachable
+
+E: ; preds = %D
+ br label %if.end24
+
+if.end24: ; preds = %E, %C
+ %phitmp = or i128 %tmp8, %mask144
+ %phitmp158 = or i128 undef, undef
+ %tmp18 = load %type_a** undef, align 4
+ %tmp19 = load %type_a** undef, align 4
+ %cmp.i49 = icmp eq %type_a* %tmp18, %tmp19
+ br i1 %cmp.i49, label %while.cond10.while.end26_crit_edge, label %while.body12
+
+while.cond10.while.end26_crit_edge: ; preds = %if.end24
+ %.pre = load %type_e** undef, align 4
+ br label %while.end26
+
+while.end26: ; preds = %while.cond10.while.end26_crit_edge, %while.end.while.end26_crit_edge
+ br i1 undef, label %while.body.lr.ph.i, label %F
+
+while.body.lr.ph.i: ; preds = %while.end26
+ br label %while.body.i
+
+while.body.i: ; preds = %while.body.i, %while.body.lr.ph.i
+ br i1 false, label %while.body.i, label %F
+
+F: ; preds = %while.body.i, %while.end26
+ ret void
+}
+
+declare noalias i8* @malloc(i32) nounwind
diff --git a/test/CodeGen/X86/remat-scalar-zero.ll b/test/CodeGen/X86/remat-scalar-zero.ll
index 75f438d..f6095a7 100644
--- a/test/CodeGen/X86/remat-scalar-zero.ll
+++ b/test/CodeGen/X86/remat-scalar-zero.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu > %t
; RUN: not grep xor %t
; RUN: not grep movap %t
-; RUN: grep {\\.quad.*0} %t
+; RUN: grep "\.quad.*0" %t
; Remat should be able to fold the zero constant into the div instructions
; as a constant-pool load.
diff --git a/test/CodeGen/X86/reverse_branches.ll b/test/CodeGen/X86/reverse_branches.ll
new file mode 100644
index 0000000..9772125
--- /dev/null
+++ b/test/CodeGen/X86/reverse_branches.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+@.str2 = private unnamed_addr constant [7 x i8] c"memchr\00", align 1
+@.str3 = private unnamed_addr constant [11 x i8] c"bsd_memchr\00", align 1
+@str4 = private unnamed_addr constant [5 x i8] c"Bug!\00"
+
+; Make sure that at the end of do.cond.i we branch back to do.body.i first,
+; to keep the inner loop tight.
+define i32 @test_branches_order() uwtable ssp {
+; CHECK: test_branches_order:
+; CHECK: [[L0:LBB0_[0-9]+]]: ## %do.body.i
+; CHECK: je
+; CHECK: %do.cond.i
+; CHECK: jne [[L0]]
+; CHECK: jmp
+; CHECK: %exit
+entry:
+ %strs = alloca [1000 x [1001 x i8]], align 16
+ br label %for.cond
+
+for.cond:
+ %j.0 = phi i32 [ 0, %entry ], [ %inc10, %for.inc9 ]
+ %cmp = icmp slt i32 %j.0, 1000
+ br i1 %cmp, label %for.cond1, label %for.end11
+
+for.cond1:
+ %indvars.iv50 = phi i64 [ %indvars.iv.next51, %for.body3 ], [ 0, %for.cond ]
+ %0 = trunc i64 %indvars.iv50 to i32
+ %cmp2 = icmp slt i32 %0, 1000
+ br i1 %cmp2, label %for.body3, label %for.inc9
+
+for.body3:
+ %arraydecay = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv50, i64 0
+ %call = call i8* @memchr(i8* %arraydecay, i32 120, i64 1000)
+ %add.ptr = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv50, i64 %indvars.iv50
+ %cmp7 = icmp eq i8* %call, %add.ptr
+ %indvars.iv.next51 = add i64 %indvars.iv50, 1
+ br i1 %cmp7, label %for.cond1, label %if.then
+
+if.then:
+ %puts = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @str4, i64 0, i64 0))
+ call void @exit(i32 1) noreturn
+ unreachable
+
+for.inc9:
+ %inc10 = add nsw i32 %j.0, 1
+ br label %for.cond
+
+for.end11:
+ %puts42 = call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @.str2, i64 0, i64 0))
+ br label %for.cond14
+
+for.cond14:
+ %j13.0 = phi i32 [ 0, %for.end11 ], [ %inc39, %for.inc38 ]
+ %cmp15 = icmp slt i32 %j13.0, 1000
+ br i1 %cmp15, label %for.cond18, label %for.end40
+
+for.cond18:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %exit ], [ 0, %for.cond14 ]
+ %1 = trunc i64 %indvars.iv to i32
+ %cmp19 = icmp slt i32 %1, 1000
+ br i1 %cmp19, label %for.body20, label %for.inc38
+
+for.body20:
+ %arraydecay24 = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv, i64 0
+ br label %do.body.i
+
+do.body.i:
+ %n.addr.0.i = phi i64 [ %dec.i, %do.cond.i ], [ 1000, %for.body20 ]
+ %p.0.i = phi i8* [ %incdec.ptr.i, %do.cond.i ], [ %arraydecay24, %for.body20 ]
+ %2 = load i8* %p.0.i, align 1
+ %cmp3.i = icmp eq i8 %2, 120
+ br i1 %cmp3.i, label %exit, label %do.cond.i
+
+do.cond.i:
+ %incdec.ptr.i = getelementptr inbounds i8* %p.0.i, i64 1
+ %dec.i = add i64 %n.addr.0.i, -1
+ %cmp5.i = icmp eq i64 %dec.i, 0
+ br i1 %cmp5.i, label %if.then32, label %do.body.i
+
+exit:
+ %add.ptr30 = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv, i64 %indvars.iv
+ %cmp31 = icmp eq i8* %p.0.i, %add.ptr30
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ br i1 %cmp31, label %for.cond18, label %if.then32
+
+if.then32:
+ %puts43 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @str4, i64 0, i64 0))
+ call void @exit(i32 1) noreturn
+ unreachable
+
+for.inc38:
+ %inc39 = add nsw i32 %j13.0, 1
+ br label %for.cond14
+
+for.end40:
+ %puts44 = call i32 @puts(i8* getelementptr inbounds ([11 x i8]* @.str3, i64 0, i64 0))
+ ret i32 0
+}
+
+declare i8* @memchr(i8*, i32, i64) nounwind readonly
+declare void @exit(i32) noreturn
+declare i32 @puts(i8* nocapture) nounwind
+
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 1e20273..1173001 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {ro\[rl\]} | count 12
+; RUN: grep "ro[rl]" | count 12
define i32 @rotl32(i32 %A, i8 %Amt) {
%shift.upgrd.1 = zext i8 %Amt to i32 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/rounding-ops.ll b/test/CodeGen/X86/rounding-ops.ll
index 0dd74ea..51fcf64 100644
--- a/test/CodeGen/X86/rounding-ops.ll
+++ b/test/CodeGen/X86/rounding-ops.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
define float @test1(float %x) nounwind {
%call = tail call float @floorf(float %x) nounwind readnone
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index 5ce08aa..d68b00b 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -51,14 +51,14 @@ false:
; X64-NEXT: callq __morestack
; X64-NEXT: ret
-; X64: movq %rsp, %rdi
-; X64-NEXT: subq %rax, %rdi
-; X64-NEXT: cmpq %rdi, %fs:112
+; X64: movq %rsp, %[[RDI:rdi|rax]]
+; X64-NEXT: subq %{{.*}}, %[[RDI]]
+; X64-NEXT: cmpq %[[RDI]], %fs:112
-; X64: movq %rdi, %rsp
+; X64: movq %[[RDI]], %rsp
-; X64: movq %rax, %rdi
+; X64: movq %{{.*}}, %rdi
; X64-NEXT: callq __morestack_allocate_stack_space
-; X64-NEXT: movq %rax, %rdi
+; X64: movq %rax, %rdi
}
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index f465a4f..2e39473 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=atom | FileCheck -check-prefix=ATOM %s
; PR5757
%0 = type { i64, i32 }
@@ -12,6 +13,10 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; CHECK: test1:
; CHECK: cmovneq %rdi, %rsi
; CHECK: movl (%rsi), %eax
+
+; ATOM: test1:
+; ATOM: cmovneq %rdi, %rsi
+; ATOM: movl (%rsi), %eax
}
@@ -31,6 +36,10 @@ bb91: ; preds = %bb84
; CHECK: test2:
; CHECK: movnew
; CHECK: movswl
+
+; ATOM: test2:
+; ATOM: movnew
+; ATOM: movswl
}
declare i1 @return_false()
@@ -44,6 +53,9 @@ entry:
ret float %iftmp.0.0
; CHECK: test3:
; CHECK: movss {{.*}},4), %xmm0
+
+; ATOM: test3:
+; ATOM: movss {{.*}},4), %xmm0
}
define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
@@ -55,6 +67,9 @@ entry:
ret i8 %2
; CHECK: test4:
; CHECK: movsbl ({{.*}},4), %eax
+
+; ATOM: test4:
+; ATOM: movsbl ({{.*}},4), %eax
}
define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
@@ -62,6 +77,8 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
store <2 x i16> %x, <2 x i16>* %p
ret void
; CHECK: test5:
+
+; ATOM: test5:
}
define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
@@ -79,6 +96,12 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK: ret
; CHECK: mulps
; CHECK: ret
+
+; ATOM: test6:
+; ATOM: je
+; ATOM: ret
+; ATOM: mulps
+; ATOM: ret
}
; Select with fp80's
@@ -89,6 +112,10 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK: test7:
; CHECK: leaq
; CHECK: fldt (%r{{.}}x,%r{{.}}x)
+
+; ATOM: test7:
+; ATOM: leaq
+; ATOM: fldt (%r{{.}}x,%r{{.}}x)
}
; widening select v6i32 and then a sub
@@ -97,8 +124,10 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
%val = sub <6 x i32> %x, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
store <6 x i32> %val, <6 x i32>* %dst.addr
ret void
-
+
; CHECK: test8:
+
+; ATOM: test8:
}
@@ -113,6 +142,12 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test9:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
;; Same as test9
@@ -125,6 +160,12 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test9a:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
@@ -137,6 +178,12 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test9b:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
;; Select between -1 and 1.
@@ -149,6 +196,12 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq $1, %rax
; CHECK: ret
+
+; ATOM: test10:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq $1, %rax
+; ATOM: ret
}
@@ -163,6 +216,13 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: notq %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test11:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: notq %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
@@ -175,6 +235,13 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: notq %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test11a:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: notq %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
@@ -189,10 +256,16 @@ entry:
%call = tail call noalias i8* @_Znam(i64 %D) nounwind noredzone
ret i8* %call
; CHECK: test12:
-; CHECK: mulq
; CHECK: movq $-1, %rdi
+; CHECK: mulq
; CHECK: cmovnoq %rax, %rdi
; CHECK: jmp __Znam
+
+; ATOM: test12:
+; ATOM: mulq
+; ATOM: movq $-1, %rdi
+; ATOM: cmovnoq %rax, %rdi
+; ATOM: jmp __Znam
}
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
@@ -205,6 +278,11 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; CHECK: cmpl
; CHECK-NEXT: sbbl
; CHECK-NEXT: ret
+
+; ATOM: test13:
+; ATOM: cmpl
+; ATOM-NEXT: sbbl
+; ATOM-NEXT: ret
}
define i32 @test14(i32 %a, i32 %b) nounwind {
@@ -216,5 +294,53 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: sbbl
; CHECK-NEXT: notl
; CHECK-NEXT: ret
+
+; ATOM: test14:
+; ATOM: cmpl
+; ATOM-NEXT: sbbl
+; ATOM-NEXT: notl
+; ATOM-NEXT: ret
+}
+
+; rdar://10961709
+define i32 @test15(i32 %x) nounwind {
+entry:
+ %cmp = icmp ne i32 %x, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+; CHECK: test15:
+; CHECK: negl
+; CHECK: sbbl
+
+; ATOM: test15:
+; ATOM: negl
+; ATOM: sbbl
}
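+
+; In test15-test17 the neg/sbb pair computes the sign extension directly:
+; neg sets the carry flag exactly when %x is non-zero, and sbb of a register
+; with itself then produces 0 - CF, i.e. -1 when %x != 0 and 0 otherwise.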
+define i64 @test16(i64 %x) nounwind uwtable readnone ssp {
+entry:
+ %cmp = icmp ne i64 %x, 0
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+; CHECK: test16:
+; CHECK: negq
+; CHECK: sbbq
+
+; ATOM: test16:
+; ATOM: negq
+; ATOM: sbbq
+}
+
+define i16 @test17(i16 %x) nounwind {
+entry:
+ %cmp = icmp ne i16 %x, 0
+ %sub = sext i1 %cmp to i16
+ ret i16 %sub
+; CHECK: test17:
+; CHECK: negw
+; CHECK: sbbw
+
+; ATOM: test17:
+; ATOM: negw
+; ATOM: sbbw
+}
diff --git a/test/CodeGen/X86/selectiondag-cse.ll b/test/CodeGen/X86/selectiondag-cse.ll
new file mode 100644
index 0000000..a653a1c
--- /dev/null
+++ b/test/CodeGen/X86/selectiondag-cse.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s
+; PR12599
+;
+; This bitcode causes the X86 target to make changes to the DAG during
+; selection in MatchAddressRecursively. The edit triggers CSE, which can
+; delete both the current node and yet-to-be-selected nodes.
+;
+; SelectionDAGISel::DoInstructionSelection must handle that.
+;
+target triple = "x86_64-apple-macosx"
+
+%0 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i8**, i32, i32***, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], i32****, i32***, i32***, i32***, i32****, i32****, %1*, %2*, %9*, i32*, i32*, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i16******, i16******, i16******, i16******, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [32 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %10*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double**, double***, i32***, double**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x [2 x i32]], [2 x i32], i32, i32, i16, i32, i32, i32, i32, i32 }
+%1 = type { i32, i32, [100 x %2*], i32, float, float, float }
+%2 = type { i32, i32, i32, i32, i32, i32, %3*, %6*, %8*, i32, i32*, i32*, i32*, i32, i32*, i32*, i32*, i32 (i32)*, [3 x [2 x i32]] }
+%3 = type { %4*, %5, %5 }
+%4 = type { i32, i32, i8, i32, i32, i8, i8, i32, i32, i8*, i32 }
+%5 = type { i32, i32, i32, i32, i32, i8*, i32*, i32, i32 }
+%6 = type { [3 x [11 x %7]], [2 x [9 x %7]], [2 x [10 x %7]], [2 x [6 x %7]], [4 x %7], [4 x %7], [3 x %7] }
+%7 = type { i16, i8, i64 }
+%8 = type { [2 x %7], [4 x %7], [3 x [4 x %7]], [10 x [4 x %7]], [10 x [15 x %7]], [10 x [15 x %7]], [10 x [5 x %7]], [10 x [5 x %7]], [10 x [15 x %7]], [10 x [15 x %7]] }
+%9 = type { i32, i32, i32, [2 x i32], i32, [8 x i32], %9*, %9*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+%10 = type { i32, i32, i32, i32, i32, %10* }
+
+@images = external hidden global %0, align 8
+
+define hidden fastcc void @Mode_Decision_for_4x4IntraBlocks() nounwind uwtable ssp {
+bb4:
+ %tmp = or i208 undef, 0
+ br i1 undef, label %bb35, label %bb5
+
+bb5:
+ %tmp6 = add i32 0, 2
+ %tmp7 = lshr i208 %tmp, 80
+ %tmp8 = trunc i208 %tmp7 to i32
+ %tmp9 = and i32 %tmp8, 65535
+ %tmp10 = shl nuw nsw i32 %tmp9, 1
+ %tmp11 = add i32 0, 2
+ %tmp12 = add i32 %tmp11, 0
+ %tmp13 = add i32 %tmp12, %tmp10
+ %tmp14 = lshr i32 %tmp13, 2
+ %tmp15 = trunc i32 %tmp14 to i16
+ store i16 %tmp15, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 3, i64 0, i64 3), align 2
+ %tmp16 = lshr i208 %tmp, 96
+ %tmp17 = trunc i208 %tmp16 to i32
+ %tmp18 = and i32 %tmp17, 65535
+ %tmp19 = add i32 %tmp18, 2
+ %tmp20 = add i32 %tmp19, 0
+ %tmp21 = add i32 %tmp20, 0
+ %tmp22 = lshr i32 %tmp21, 2
+ %tmp23 = trunc i32 %tmp22 to i16
+ store i16 %tmp23, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 3, i64 2, i64 3), align 2
+ %tmp24 = add i32 %tmp6, %tmp9
+ %tmp25 = add i32 %tmp24, 0
+ %tmp26 = lshr i32 %tmp25, 2
+ %tmp27 = trunc i32 %tmp26 to i16
+ store i16 %tmp27, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 7, i64 1, i64 2), align 4
+ %tmp28 = lshr i208 %tmp, 80
+ %tmp29 = shl nuw nsw i208 %tmp28, 1
+ %tmp30 = trunc i208 %tmp29 to i32
+ %tmp31 = and i32 %tmp30, 131070
+ %tmp32 = add i32 %tmp12, %tmp31
+ %tmp33 = lshr i32 %tmp32, 2
+ %tmp34 = trunc i32 %tmp33 to i16
+ store i16 %tmp34, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 7, i64 1, i64 3), align 2
+ br label %bb35
+
+bb35: ; preds = %bb5, %bb4
+ unreachable
+}
diff --git a/test/CodeGen/X86/sext-setcc-self.ll b/test/CodeGen/X86/sext-setcc-self.ll
new file mode 100644
index 0000000..23d66a2
--- /dev/null
+++ b/test/CodeGen/X86/sext-setcc-self.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=x86-64 -mcpu=nehalem -asm-verbose=false < %s | FileCheck %s
+
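+; A floating-point compare of a value with itself folds to a constant: the
+; unordered predicates ueq, uge, and ule always hold, since x either equals
+; itself or is NaN (unordered), while the strict ordered predicates one, ogt,
+; and olt never hold. Hence the all-ones pcmpeqd versus the all-zeros xorps.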
+define <4 x i32> @test_ueq(<4 x float> %in) {
+entry:
+ ; CHECK: pcmpeqd %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp ueq <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_uge(<4 x float> %in) {
+entry:
+ ; CHECK: pcmpeqd %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp uge <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_ule(<4 x float> %in) {
+entry:
+ ; CHECK: pcmpeqd %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp ule <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_one(<4 x float> %in) {
+entry:
+ ; CHECK: xorps %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp one <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_ogt(<4 x float> %in) {
+entry:
+ ; CHECK: xorps %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp ogt <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_olt(<4 x float> %in) {
+entry:
+ ; CHECK: xorps %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp olt <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
diff --git a/test/CodeGen/X86/shift-and.ll b/test/CodeGen/X86/shift-and.ll
index b747cc5..1de9151 100644
--- a/test/CodeGen/X86/shift-and.ll
+++ b/test/CodeGen/X86/shift-and.ll
@@ -1,13 +1,27 @@
-; RUN: llc < %s -march=x86 | grep and | count 2
-; RUN: llc < %s -march=x86-64 | not grep and
+; RUN: llc < %s -mtriple=i386-apple-macosx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s --check-prefix=X64
define i32 @t1(i32 %t, i32 %val) nounwind {
+; X32: t1:
+; X32-NOT: andl
+; X32: shll
+
+; X64: t1:
+; X64-NOT: andl
+; X64: shll
%shamt = and i32 %t, 31
%res = shl i32 %val, %shamt
ret i32 %res
}
define i32 @t2(i32 %t, i32 %val) nounwind {
+; X32: t2:
+; X32-NOT: andl
+; X32: shll
+
+; X64: t2:
+; X64-NOT: andl
+; X64: shll
%shamt = and i32 %t, 63
%res = shl i32 %val, %shamt
ret i32 %res
@@ -16,6 +30,13 @@ define i32 @t2(i32 %t, i32 %val) nounwind {
@X = internal global i16 0
define void @t3(i16 %t) nounwind {
+; X32: t3:
+; X32-NOT: andl
+; X32: sarw
+
+; X64: t3:
+; X64-NOT: andl
+; X64: sarw
%shamt = and i16 %t, 31
%tmp = load i16* @X
%tmp1 = ashr i16 %tmp, %shamt
@@ -24,13 +45,34 @@ define void @t3(i16 %t) nounwind {
}
define i64 @t4(i64 %t, i64 %val) nounwind {
+; X64: t4:
+; X64-NOT: and
+; X64: shrq
%shamt = and i64 %t, 63
%res = lshr i64 %val, %shamt
ret i64 %res
}
define i64 @t5(i64 %t, i64 %val) nounwind {
+; X64: t5:
+; X64-NOT: and
+; X64: shrq
%shamt = and i64 %t, 191
%res = lshr i64 %val, %shamt
ret i64 %res
}
+
+
+; rdar://11866926
+define i64 @t6(i64 %key, i64* nocapture %val) nounwind {
+entry:
+; X64: t6:
+; X64-NOT: movabsq
+; X64: decq
+; X64: andq
+ %shr = lshr i64 %key, 3
+ %0 = load i64* %val, align 8
+ %sub = add i64 %0, 2305843009213693951
+ %and = and i64 %sub, %shr
+ ret i64 %and
+}
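A hedged reading of t6 above: 2305843009213693951 is 2^61-1, and %shr can
have no bits set above bit 60 because %key was shifted right by 3. In every
bit position the final and can observe, adding 2^61-1 therefore equals
subtracting 1, which is what the expected decq/andq encodes. An illustrative
equivalent form, not part of the patch:

define i64 @t6_equiv(i64 %key, i64* nocapture %val) nounwind {
entry:
  %shr = lshr i64 %key, 3     ; only bits 0..60 of %shr can be set
  %0 = load i64* %val, align 8
  %sub = add i64 %0, -1       ; same as adding 2^61-1 in the low 61 bits
  %and = and i64 %sub, %shr   ; the mask discards the differing high bits
  ret i64 %and
}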
diff --git a/test/CodeGen/X86/shift-coalesce.ll b/test/CodeGen/X86/shift-coalesce.ll
index d38f9a8..4f27e97 100644
--- a/test/CodeGen/X86/shift-coalesce.ll
+++ b/test/CodeGen/X86/shift-coalesce.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {shld.*CL}
+; RUN: grep "shld.*CL"
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: not grep {mov CL, BL}
+; RUN: not grep "mov CL, BL"
; PR687
diff --git a/test/CodeGen/X86/shift-double.ll b/test/CodeGen/X86/shift-double.ll
index 5adee7c..8d2b290 100644
--- a/test/CodeGen/X86/shift-double.ll
+++ b/test/CodeGen/X86/shift-double.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {sh\[lr\]d} | count 5
+; RUN: grep "sh[lr]d" | count 5
define i64 @test1(i64 %X, i8 %C) {
%shift.upgrd.1 = zext i8 %C to i64 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll
index 3ea6011..c518cdd 100644
--- a/test/CodeGen/X86/shift-folding.ll
+++ b/test/CodeGen/X86/shift-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -verify-coalescing | FileCheck %s
define i32* @test1(i32* %P, i32 %X) {
; CHECK: test1:
diff --git a/test/CodeGen/X86/shl_elim.ll b/test/CodeGen/X86/shl_elim.ll
index 0827221..83e1eb5 100644
--- a/test/CodeGen/X86/shl_elim.ll
+++ b/test/CodeGen/X86/shl_elim.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep {movl 8(.esp), %eax}
-; RUN: llc < %s -march=x86 | grep {shrl .eax}
-; RUN: llc < %s -march=x86 | grep {movswl .ax, .eax}
+; RUN: llc < %s -march=x86 | grep "movl 8(.esp), %eax"
+; RUN: llc < %s -march=x86 | grep "shrl .eax"
+; RUN: llc < %s -march=x86 | grep "movswl .ax, .eax"
define i32 @test1(i64 %a) nounwind {
%tmp29 = lshr i64 %a, 24 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 13f9329..1479be1 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,8 +1,6 @@
; Make sure this testcase codegens to the sin and cos instructions, not calls
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | \
-; RUN: grep sin\$ | count 3
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | \
-; RUN: grep cos\$ | count 3
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=SIN
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=COS
declare float @sinf(float) readonly
@@ -10,39 +8,59 @@ declare double @sin(double) readonly
declare x86_fp80 @sinl(x86_fp80) readonly
+; SIN: test1:
define float @test1(float %X) {
%Y = call float @sinf(float %X) readonly
ret float %Y
}
+; SIN: {{^[ \t]*fsin$}}
+; SIN-NOT: fsin
+
+; SIN: test2:
define double @test2(double %X) {
%Y = call double @sin(double %X) readonly
ret double %Y
}
+; SIN: {{^[ \t]*fsin$}}
+
+; SIN-NOT: fsin
+; SIN: test3:
define x86_fp80 @test3(x86_fp80 %X) {
%Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
ret x86_fp80 %Y
}
+; SIN: {{^[ \t]*fsin$}}
+; SIN-NOT: fsin
+; COS-NOT: fcos
declare float @cosf(float) readonly
declare double @cos(double) readonly
declare x86_fp80 @cosl(x86_fp80) readonly
+
+; SIN: test4:
+; COS: test3:
define float @test4(float %X) {
%Y = call float @cosf(float %X) readonly
ret float %Y
}
+; COS: {{^[ \t]*fcos}}
define double @test5(double %X) {
%Y = call double @cos(double %X) readonly
ret double %Y
}
+; COS: {{^[ \t]*fcos}}
define x86_fp80 @test6(x86_fp80 %X) {
%Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
ret x86_fp80 %Y
}
+; COS: {{^[ \t]*fcos}}
+; SIN-NOT: fsin
+; COS-NOT: fcos
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 7957eb8..649cd61 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -post-RA-scheduler=true | FileCheck %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true | FileCheck %s
; Currently, floating-point selects are lowered to CFG triangles.
; This means that one side of the select is always unconditionally
diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
new file mode 100644
index 0000000..c600f925
--- /dev/null
+++ b/test/CodeGen/X86/sink-out-of-loop.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+; A MOV32ri sits inside a loop, in a block with two successors: one inside
+; the same loop and one outside it. We should be able to sink the MOV32ri
+; out of the loop; a reduced sketch follows this test.
+; rdar://11980766
+define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp {
+; CHECK: sink_succ
+; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader
+; CHECK: %exit
+; CHECK-NOT: movl
+; CHECK: jne [[OUTER_LN1]]
+; CHECK: movl
+; CHECK: [[LN2:LBB0_[0-9]+]]: ## %for.body2
+; CHECK: jne [[LN2]]
+; CHECK: ret
+entry:
+ br label %preheader
+
+preheader:
+ %i.127 = phi i32 [ 0, %entry ], [ %inc9, %exit ]
+ br label %for.body1.lr
+
+for.body1.lr:
+ %iv30 = phi i32 [ 1, %preheader ], [ %iv.next31, %for.inc40.i ]
+ br label %for.body1
+
+for.body1:
+ %iv.i = phi i64 [ 0, %for.body1.lr ], [ %iv.next.i, %for.body1 ]
+ %iv.next.i = add i64 %iv.i, 1
+ %lftr.wideiv32 = trunc i64 %iv.next.i to i32
+ %exitcond33 = icmp eq i32 %lftr.wideiv32, %iv30
+ br i1 %exitcond33, label %for.inc40.i, label %for.body1
+
+for.inc40.i:
+ %iv.next31 = add i32 %iv30, 1
+ %exitcond49.i = icmp eq i32 %iv.next31, 32
+ br i1 %exitcond49.i, label %exit, label %for.body1.lr
+
+exit:
+ %inc9 = add nsw i32 %i.127, 1
+ %exitcond34 = icmp eq i32 %inc9, 10
+ br i1 %exitcond34, label %for.body2, label %preheader
+
+for.body2:
+ %iv = phi i64 [ %iv.next, %for.body2 ], [ 0, %exit ]
+ %iv.next = add i64 %iv, 1
+ %lftr.wideiv = trunc i64 %iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 2048
+ br i1 %exitcond, label %for.end20, label %for.body2
+
+for.end20:
+ ret i32 0
+}
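A reduced sketch of the pattern being tested, with a hypothetical function
not taken from the patch: the immediate is used only in the exit block, so
the MOV32ri that materializes it should end up outside the loop instead of
executing on every iteration.

define i32 @sink_sketch(i32 %n) nounwind {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ; 12345 is live only here; if a MOV32ri is needed to materialize it,
  ; machine sinking should place that instruction in this block.
  %r = mul i32 %i.next, 12345
  ret i32 %r
}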
diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll
index 81a072f..980f18c 100644
--- a/test/CodeGen/X86/splat-scalar-load.ll
+++ b/test/CodeGen/X86/splat-scalar-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s
; rdar://7434544
define <2 x i64> @t2() nounwind {
diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll
index 118e393..71a42f4 100644
--- a/test/CodeGen/X86/sse-align-12.ll
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s
; CHECK: a:
; CHECK: movdqu
diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll
index d1e07c8..c99287b 100644
--- a/test/CodeGen/X86/sse-domains.ll
+++ b/test/CodeGen/X86/sse-domains.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -mcpu=nehalem | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 1112440..3839e87 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -promote-elements | FileCheck %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=UNSAFE %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=FINITE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
; Some of these patterns can be matched as SSE min or max. Some of
; then can be matched provided that the operands are swapped.
@@ -8,13 +8,10 @@
; and a conditional branch.
; The naming convention is {,x_,y_}{o,u}{gt,lt,ge,le}{,_inverse}
-; x_ : use 0.0 instead of %y
-; y_ : use -0.0 instead of %y
+; _x: use 0.0 instead of %y
+; _y: use -0.0 instead of %y
; _inverse : swap the arms of the select.
-; Some of these tests depend on -join-physregs commuting instructions to
-; eliminate copies.
-
; CHECK: ogt:
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
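Decoding one name under the new convention, quoting a function that appears
later in this test: ogt_inverse_y compares ogt against -0.0 (the "_y" form)
with the arms of the select swapped (the "_inverse" form).

define double @ogt_inverse_y(double %x) nounwind {
  %c = fcmp ogt double %x, -0.000000e+00
  %d = select i1 %c, double -0.000000e+00, double %x
  ret double %d
}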
@@ -139,147 +136,147 @@ define double @ole_inverse(double %x, double %y) nounwind {
ret double %d
}
-; CHECK: x_ogt:
+; CHECK: ogt_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ogt:
+; UNSAFE: ogt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ogt:
+; FINITE: ogt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ogt(double %x) nounwind {
+define double @ogt_x(double %x) nounwind {
%c = fcmp ogt double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_olt:
+; CHECK: olt_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_olt:
+; UNSAFE: olt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_olt:
+; FINITE: olt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_olt(double %x) nounwind {
+define double @olt_x(double %x) nounwind {
%c = fcmp olt double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ogt_inverse:
+; CHECK: ogt_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ogt_inverse:
+; UNSAFE: ogt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ogt_inverse:
+; FINITE: ogt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ogt_inverse(double %x) nounwind {
+define double @ogt_inverse_x(double %x) nounwind {
%c = fcmp ogt double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_olt_inverse:
+; CHECK: olt_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_olt_inverse:
+; UNSAFE: olt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_olt_inverse:
+; FINITE: olt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_olt_inverse(double %x) nounwind {
+define double @olt_inverse_x(double %x) nounwind {
%c = fcmp olt double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_oge:
+; CHECK: oge_x:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_oge:
+; UNSAFE: oge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_oge:
+; FINITE: oge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_oge(double %x) nounwind {
+define double @oge_x(double %x) nounwind {
%c = fcmp oge double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ole:
+; CHECK: ole_x:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ole:
+; UNSAFE: ole_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ole:
+; FINITE: ole_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ole(double %x) nounwind {
+define double @ole_x(double %x) nounwind {
%c = fcmp ole double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_oge_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_oge_inverse:
+; CHECK: oge_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: oge_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_oge_inverse:
+; FINITE: oge_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_oge_inverse(double %x) nounwind {
+define double @oge_inverse_x(double %x) nounwind {
%c = fcmp oge double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_ole_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ole_inverse:
+; CHECK: ole_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: ole_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ole_inverse:
+; FINITE: ole_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ole_inverse(double %x) nounwind {
+define double @ole_inverse_x(double %x) nounwind {
%c = fcmp ole double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
@@ -411,419 +408,419 @@ define double @ule_inverse(double %x, double %y) nounwind {
ret double %d
}
-; CHECK: x_ugt:
+; CHECK: ugt_x:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ugt:
+; UNSAFE: ugt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ugt:
+; FINITE: ugt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ugt(double %x) nounwind {
+define double @ugt_x(double %x) nounwind {
%c = fcmp ugt double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ult:
+; CHECK: ult_x:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_ult:
+; UNSAFE: ult_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ult:
+; FINITE: ult_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ult(double %x) nounwind {
+define double @ult_x(double %x) nounwind {
%c = fcmp ult double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ugt_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ugt_inverse:
+; CHECK: ugt_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: ugt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ugt_inverse:
+; FINITE: ugt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ugt_inverse(double %x) nounwind {
+define double @ugt_inverse_x(double %x) nounwind {
%c = fcmp ugt double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_ult_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_ult_inverse:
+; CHECK: ult_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: ult_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ult_inverse:
+; FINITE: ult_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ult_inverse(double %x) nounwind {
+define double @ult_inverse_x(double %x) nounwind {
%c = fcmp ult double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_uge:
+; CHECK: uge_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_uge:
+; UNSAFE: uge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_uge:
+; FINITE: uge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_uge(double %x) nounwind {
+define double @uge_x(double %x) nounwind {
%c = fcmp uge double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ule:
+; CHECK: ule_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ule:
+; UNSAFE: ule_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ule:
+; FINITE: ule_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ule(double %x) nounwind {
+define double @ule_x(double %x) nounwind {
%c = fcmp ule double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_uge_inverse:
+; CHECK: uge_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_uge_inverse:
+; UNSAFE: uge_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_uge_inverse:
+; FINITE: uge_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_uge_inverse(double %x) nounwind {
+define double @uge_inverse_x(double %x) nounwind {
%c = fcmp uge double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_ule_inverse:
+; CHECK: ule_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ule_inverse:
+; UNSAFE: ule_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ule_inverse:
+; FINITE: ule_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ule_inverse(double %x) nounwind {
+define double @ule_inverse_x(double %x) nounwind {
%c = fcmp ule double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ogt:
+; CHECK: ogt_y:
; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ogt:
+; UNSAFE: ogt_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ogt:
+; FINITE: ogt_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ogt(double %x) nounwind {
+define double @ogt_y(double %x) nounwind {
%c = fcmp ogt double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_olt:
+; CHECK: olt_y:
; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_olt:
+; UNSAFE: olt_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_olt:
+; FINITE: olt_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_olt(double %x) nounwind {
+define double @olt_y(double %x) nounwind {
%c = fcmp olt double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ogt_inverse:
+; CHECK: ogt_inverse_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ogt_inverse:
+; UNSAFE: ogt_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ogt_inverse:
+; FINITE: ogt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ogt_inverse(double %x) nounwind {
+define double @ogt_inverse_y(double %x) nounwind {
%c = fcmp ogt double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_olt_inverse:
+; CHECK: olt_inverse_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_olt_inverse:
+; UNSAFE: olt_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_olt_inverse:
+; FINITE: olt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_olt_inverse(double %x) nounwind {
+define double @olt_inverse_y(double %x) nounwind {
%c = fcmp olt double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_oge:
+; CHECK: oge_y:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_oge:
+; UNSAFE: oge_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_oge:
+; FINITE: oge_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_oge(double %x) nounwind {
+define double @oge_y(double %x) nounwind {
%c = fcmp oge double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ole:
+; CHECK: ole_y:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ole:
+; UNSAFE: ole_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ole:
+; FINITE: ole_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ole(double %x) nounwind {
+define double @ole_y(double %x) nounwind {
%c = fcmp ole double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_oge_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_oge_inverse:
+; CHECK: oge_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: oge_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_oge_inverse:
+; FINITE: oge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_oge_inverse(double %x) nounwind {
+define double @oge_inverse_y(double %x) nounwind {
%c = fcmp oge double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ole_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ole_inverse:
+; CHECK: ole_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: ole_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ole_inverse:
+; FINITE: ole_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ole_inverse(double %x) nounwind {
+define double @ole_inverse_y(double %x) nounwind {
%c = fcmp ole double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ugt:
+; CHECK: ugt_y:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ugt:
+; UNSAFE: ugt_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ugt:
+; FINITE: ugt_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ugt(double %x) nounwind {
+define double @ugt_y(double %x) nounwind {
%c = fcmp ugt double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ult:
+; CHECK: ult_y:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_ult:
+; UNSAFE: ult_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ult:
+; FINITE: ult_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ult(double %x) nounwind {
+define double @ult_y(double %x) nounwind {
%c = fcmp ult double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ugt_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ugt_inverse:
+; CHECK: ugt_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: ugt_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ugt_inverse:
+; FINITE: ugt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ugt_inverse(double %x) nounwind {
+define double @ugt_inverse_y(double %x) nounwind {
%c = fcmp ugt double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ult_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_ult_inverse:
+; CHECK: ult_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: ult_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ult_inverse:
+; FINITE: ult_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ult_inverse(double %x) nounwind {
+define double @ult_inverse_y(double %x) nounwind {
%c = fcmp ult double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_uge:
+; CHECK: uge_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_uge:
+; UNSAFE: uge_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_uge:
+; FINITE: uge_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_uge(double %x) nounwind {
+define double @uge_y(double %x) nounwind {
%c = fcmp uge double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ule:
+; CHECK: ule_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ule:
+; UNSAFE: ule_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ule:
+; FINITE: ule_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ule(double %x) nounwind {
+define double @ule_y(double %x) nounwind {
%c = fcmp ule double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_uge_inverse:
+; CHECK: uge_inverse_y:
; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_uge_inverse:
+; UNSAFE: uge_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_uge_inverse:
+; FINITE: uge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_uge_inverse(double %x) nounwind {
+define double @uge_inverse_y(double %x) nounwind {
%c = fcmp uge double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ule_inverse:
+; CHECK: ule_inverse_y:
; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ule_inverse:
+; UNSAFE: ule_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ule_inverse:
+; FINITE: ule_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ule_inverse(double %x) nounwind {
+define double @ule_inverse_y(double %x) nounwind {
%c = fcmp ule double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 5ea1b4d..48638b3 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -249,9 +249,10 @@ entry:
; X64: t16:
; X64: pextrw $8, %xmm0, %eax
; X64: pslldq $2, %xmm0
-; X64: movd %xmm0, %ecx
-; X64: pextrw $1, %xmm0, %edx
-; X64: pinsrw $0, %ecx, %xmm0
+; X64: pextrw $1, %xmm0, %ecx
+; X64: movzbl %cl, %ecx
+; X64: orl %eax, %ecx
+; X64: pinsrw $1, %ecx, %xmm0
; X64: ret
}
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index 1a1017d..a2a0deb 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
;CHECK: vsel_float
;CHECK: blendvps
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 54264b1..c6f9f0c 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X64
@g16 = external global i16
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
new file mode 100644
index 0000000..076e213
--- /dev/null
+++ b/test/CodeGen/X86/sse4a.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4a | FileCheck %s
+
+define void @test1(i8* %p, <4 x float> %a) nounwind optsize ssp {
+; CHECK: test1:
+; CHECK: movntss
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
+define void @test2(i8* %p, <2 x double> %a) nounwind optsize ssp {
+; CHECK: test2:
+; CHECK: movntsd
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+
+define <2 x i64> @test3(<2 x i64> %x) nounwind uwtable ssp {
+; CHECK: test3:
+; CHECK: extrq
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
+ ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
+
+define <2 x i64> @test4(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test4:
+; CHECK: extrq
+ %1 = bitcast <2 x i64> %y to <16 x i8>
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
+ ret <2 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
+
+define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test5:
+; CHECK: insertq
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 5, i8 6)
+ ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
+
+define <2 x i64> @test6(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test6:
+; CHECK: insertq
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+ ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind
diff --git a/test/CodeGen/X86/sse_reload_fold.ll b/test/CodeGen/X86/sse_reload_fold.ll
index a57fa58..fd8db3b 100644
--- a/test/CodeGen/X86/sse_reload_fold.ll
+++ b/test/CodeGen/X86/sse_reload_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic |& FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
; CHECK: fail
; CHECK-NOT: fail
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index f6c13ec..0ddb237 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -10,11 +10,11 @@ target triple = "i686-apple-darwin8"
define void @test({ double, double }* byval %z, double* %P) nounwind {
entry:
%tmp3 = load double* @G, align 16 ; <double> [#uses=1]
- %tmp4 = tail call double @fabs( double %tmp3 ) ; <double> [#uses=1]
+ %tmp4 = tail call double @fabs( double %tmp3 ) readnone ; <double> [#uses=1]
store volatile double %tmp4, double* %P
%tmp = getelementptr { double, double }* %z, i32 0, i32 0 ; <double*> [#uses=1]
%tmp1 = load volatile double* %tmp, align 8 ; <double> [#uses=1]
- %tmp2 = tail call double @fabs( double %tmp1 ) ; <double> [#uses=1]
+ %tmp2 = tail call double @fabs( double %tmp1 ) readnone ; <double> [#uses=1]
; CHECK: andpd{{.*}}4(%esp), %xmm
%tmp6 = fadd double %tmp4, %tmp2 ; <double> [#uses=1]
store volatile double %tmp6, double* %P, align 8
diff --git a/test/CodeGen/X86/stack-protector-linux.ll b/test/CodeGen/X86/stack-protector.ll
index fe2a9c5..c075114 100644
--- a/test/CodeGen/X86/stack-protector-linux.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -1,8 +1,8 @@
; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | grep %gs:
; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %fs:
; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %gs:
-; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep {__stack_chk_guard}
-; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep {__stack_chk_fail}
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_guard"
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_fail"
@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/store_op_load_fold2.ll b/test/CodeGen/X86/store_op_load_fold2.ll
index 8313166..6e4fe90 100644
--- a/test/CodeGen/X86/store_op_load_fold2.ll
+++ b/test/CodeGen/X86/store_op_load_fold2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=att | FileCheck %s -check-prefix=ATT
-; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s -check-prefix=INTEL
+; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7 -x86-asm-syntax=att | FileCheck %s -check-prefix=ATT
+; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7 -x86-asm-syntax=intel | FileCheck %s -check-prefix=INTEL
target datalayout = "e-p:32:32"
%struct.Macroblock = type { i32, i32, i32, i32, i32, [8 x i32], %struct.Macroblock*, %struct.Macroblock*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/subreg-to-reg-1.ll b/test/CodeGen/X86/subreg-to-reg-1.ll
index a297728..4f31ab5 100644
--- a/test/CodeGen/X86/subreg-to-reg-1.ll
+++ b/test/CodeGen/X86/subreg-to-reg-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {leal .*), %e.\*} | count 1
+; RUN: llc < %s -march=x86-64 | grep "leal .*), %e.*" | count 1
; Don't eliminate or coalesce away the explicit zero-extension!
; This is currently using an leal because of a 3-addressification detail,
diff --git a/test/CodeGen/X86/subreg-to-reg-4.ll b/test/CodeGen/X86/subreg-to-reg-4.ll
index 0ea5541..0693789 100644
--- a/test/CodeGen/X86/subreg-to-reg-4.ll
+++ b/test/CodeGen/X86/subreg-to-reg-4.ll
@@ -5,7 +5,7 @@
; RUN: not grep negq %t
; RUN: not grep addq %t
; RUN: not grep subq %t
-; RUN: not grep {movl %} %t
+; RUN: not grep "movl %" %t
; Utilize implicit zero-extension on x86-64 to eliminate explicit
; zero-extensions. Shrink 64-bit adds to 32-bit when the high
diff --git a/test/CodeGen/X86/switch-order-weight.ll b/test/CodeGen/X86/switch-order-weight.ll
new file mode 100644
index 0000000..0fdd56d
--- /dev/null
+++ b/test/CodeGen/X86/switch-order-weight.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=x86_64-apple-darwin11 < %s | FileCheck %s
+
+; Check that the cases that lead to unreachable blocks are tested after the
+; "10" case.
+
+define void @test1(i32 %x) nounwind uwtable ssp {
+entry:
+ switch i32 %x, label %if.end7 [
+ i32 0, label %if.then
+ i32 10, label %if.then2
+ i32 20, label %if.then5
+ ]
+
+; CHECK: test1:
+; CHECK-NOT: unr
+; CHECK: cmpl $10
+; CHECK: bar
+; CHECK: cmpl $20
+
+if.then:
+ tail call void @unr(i32 23) noreturn nounwind
+ unreachable
+
+if.then2:
+ tail call void @bar(i32 42) nounwind
+ br label %if.end7
+
+if.then5:
+ tail call void @unr(i32 5) noreturn nounwind
+ unreachable
+
+if.end7:
+ ret void
+}
+
+declare void @unr(i32) noreturn
+
+declare void @bar(i32)
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
new file mode 100644
index 0000000..7030753
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.4.0"
+
+declare i64 @testi()
+
+define i64 @test_trivial() {
+ %A = tail call i64 @testi()
+ ret i64 %A
+}
+; CHECK: test_trivial:
+; CHECK: jmp _testi ## TAILCALL
+
+
+define i64 @test_noop_bitcast() {
+ %A = tail call i64 @testi()
+ %B = bitcast i64 %A to i64
+ ret i64 %B
+}
+; CHECK: test_noop_bitcast:
+; CHECK: jmp _testi ## TAILCALL
+
+
+; Tail call shouldn't be blocked by no-op inttoptr.
+define i8* @test_inttoptr() {
+ %A = tail call i64 @testi()
+ %B = inttoptr i64 %A to i8*
+ ret i8* %B
+}
+
+; CHECK: test_inttoptr:
+; CHECK: jmp _testi ## TAILCALL
+
+
+declare <4 x float> @testv()
+
+define <4 x i32> @test_vectorbitcast() {
+ %A = tail call <4 x float> @testv()
+ %B = bitcast <4 x float> %A to <4 x i32>
+ ret <4 x i32> %B
+}
+; CHECK: test_vectorbitcast:
+; CHECK: jmp _testv ## TAILCALL
+
+
+declare { i64, i64 } @testp()
+
+define {i64, i64} @test_pair_trivial() {
+ %A = tail call { i64, i64} @testp()
+ ret { i64, i64} %A
+}
+; CHECK: test_pair_trivial:
+; CHECK: jmp _testp ## TAILCALL
+
+
+
+define {i64, i64} @test_pair_trivial_extract() {
+ %A = tail call { i64, i64} @testp()
+ %x = extractvalue { i64, i64} %A, 0
+ %y = extractvalue { i64, i64} %A, 1
+
+ %b = insertvalue {i64, i64} undef, i64 %x, 0
+ %c = insertvalue {i64, i64} %b, i64 %y, 1
+
+ ret { i64, i64} %c
+}
+
+; CHECK: test_pair_trivial_extract:
+; CHECK: jmp _testp ## TAILCALL
+
+define {i8*, i64} @test_pair_conv_extract() {
+ %A = tail call { i64, i64} @testp()
+ %x = extractvalue { i64, i64} %A, 0
+ %y = extractvalue { i64, i64} %A, 1
+
+ %x1 = inttoptr i64 %x to i8*
+
+ %b = insertvalue {i8*, i64} undef, i8* %x1, 0
+ %c = insertvalue {i8*, i64} %b, i64 %y, 1
+
+ ret { i8*, i64} %c
+}
+
+; CHECK: test_pair_conv_extract:
+; CHECK: jmp _testp ## TAILCALL
+
+
+
+; PR13006
+define { i64, i64 } @crash(i8* %this) {
+ %c = tail call { i64, i64 } @testp()
+ %mrv7 = insertvalue { i64, i64 } %c, i64 undef, 1
+ ret { i64, i64 } %mrv7
+}
+
+
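By contrast, a conversion that changes the returned value leaves the caller
with work to do after the callee returns, so it must block the jmp-based
tail call. A hypothetical negative case, not part of the patch, reusing the
@testi declaration above:

define i64 @test_add_blocks_tailcall() {
  %A = tail call i64 @testi()
  %B = add i64 %A, 1   ; the caller must execute this after @testi returns
  ret i64 %B
}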
diff --git a/test/CodeGen/X86/tailcall-cgp-dup.ll b/test/CodeGen/X86/tailcall-cgp-dup.ll
new file mode 100644
index 0000000..a80b90f
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; Teach CGP to dup returns to enable tail call optimization.
+; rdar://9147433
+
+define i32 @foo(i32 %x) nounwind ssp {
+; CHECK: foo:
+entry:
+ switch i32 %x, label %return [
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb1
+ i32 3, label %sw.bb3
+ i32 4, label %sw.bb5
+ i32 5, label %sw.bb7
+ i32 6, label %sw.bb9
+ ]
+
+sw.bb: ; preds = %entry
+; CHECK: jmp _f1
+ %call = tail call i32 @f1() nounwind
+ br label %return
+
+sw.bb1: ; preds = %entry
+; CHECK: jmp _f2
+ %call2 = tail call i32 @f2() nounwind
+ br label %return
+
+sw.bb3: ; preds = %entry
+; CHECK: jmp _f3
+ %call4 = tail call i32 @f3() nounwind
+ br label %return
+
+sw.bb5: ; preds = %entry
+; CHECK: jmp _f4
+ %call6 = tail call i32 @f4() nounwind
+ br label %return
+
+sw.bb7: ; preds = %entry
+; CHECK: jmp _f5
+ %call8 = tail call i32 @f5() nounwind
+ br label %return
+
+sw.bb9: ; preds = %entry
+; CHECK: jmp _f6
+ %call10 = tail call i32 @f6() nounwind
+ br label %return
+
+return: ; preds = %entry, %sw.bb9, %sw.bb7, %sw.bb5, %sw.bb3, %sw.bb1, %sw.bb
+ %retval.0 = phi i32 [ %call10, %sw.bb9 ], [ %call8, %sw.bb7 ], [ %call6, %sw.bb5 ], [ %call4, %sw.bb3 ], [ %call2, %sw.bb1 ], [ %call, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @f1()
+
+declare i32 @f2()
+
+declare i32 @f3()
+
+declare i32 @f4()
+
+declare i32 @f5()
+
+declare i32 @f6()
+
+; rdar://11958338
+%0 = type opaque
+
+declare i8* @bar(i8*) uwtable optsize noinline ssp
+
+define hidden %0* @thingWithValue(i8* %self) uwtable ssp {
+entry:
+; CHECK: thingWithValue:
+; CHECK: jmp _bar
+ br i1 undef, label %if.then.i, label %if.else.i
+
+if.then.i: ; preds = %entry
+ br label %someThingWithValue.exit
+
+if.else.i: ; preds = %entry
+ %call4.i = tail call i8* @bar(i8* undef) optsize
+ br label %someThingWithValue.exit
+
+someThingWithValue.exit: ; preds = %if.else.i, %if.then.i
+ %retval.0.in.i = phi i8* [ undef, %if.then.i ], [ %call4.i, %if.else.i ]
+ %retval.0.i = bitcast i8* %retval.0.in.i to %0*
+ ret %0* %retval.0.i
+}
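A sketch of the CodeGenPrepare rewrite being exercised, in hypothetical IR
that is not part of the patch. Before the rewrite, both calls branch to a
shared return block, so neither call is immediately followed by a return:

define i32 @before_dup(i1 %c) {
entry:
  br i1 %c, label %a, label %b
a:
  %ra = tail call i32 @f1()
  br label %exit
b:
  %rb = tail call i32 @f2()
  br label %exit
exit:
  %r = phi i32 [ %ra, %a ], [ %rb, %b ]
  ret i32 %r
}

After CGP duplicates the return into each predecessor, every call sits in
tail position and can be emitted as a jmp:

define i32 @after_dup(i1 %c) {
entry:
  br i1 %c, label %a, label %b
a:
  %ra = tail call i32 @f1()
  ret i32 %ra
b:
  %rb = tail call i32 @f2()
  ret i32 %rb
}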
diff --git a/test/CodeGen/X86/tailcall-i1.ll b/test/CodeGen/X86/tailcall-i1.ll
deleted file mode 100644
index 8ef1f11..0000000
--- a/test/CodeGen/X86/tailcall-i1.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-define fastcc i1 @i1test(i32, i32, i32, i32) {
- entry:
- %4 = tail call fastcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
- ret i1 %4
-}
diff --git a/test/CodeGen/X86/tailcall-largecode.ll b/test/CodeGen/X86/tailcall-largecode.ll
index c3f4278..e9b8721 100644
--- a/test/CodeGen/X86/tailcall-largecode.ll
+++ b/test/CodeGen/X86/tailcall-largecode.ll
@@ -49,6 +49,11 @@ define fastcc i32 @direct_manyargs() {
; CHECK: pushq
; Pass the stack argument.
; CHECK: movl $7, 16(%rsp)
+; This is the large code model, so &manyargs_callee may not fit into
+; the jmp instruction. Put it into a register which won't be clobbered
+; while restoring callee-saved registers and won't be used for passing
+; arguments.
+; CHECK: movabsq $manyargs_callee, %rax
; Pass the register arguments, in the right registers.
; CHECK: movl $1, %edi
; CHECK: movl $2, %esi
@@ -56,11 +61,6 @@ define fastcc i32 @direct_manyargs() {
; CHECK: movl $4, %ecx
; CHECK: movl $5, %r8d
; CHECK: movl $6, %r9d
-; This is the large code model, so &manyargs_callee may not fit into
-; the jmp instruction. Put it into R11, which won't be clobbered
-; while restoring callee-saved registers and won't be used for passing
-; arguments.
-; CHECK: movabsq $manyargs_callee, %rax
; Adjust the stack to "return".
; CHECK: popq
; And tail-call to the target.
diff --git a/test/CodeGen/X86/tailcall-void.ll b/test/CodeGen/X86/tailcall-void.ll
deleted file mode 100644
index 4e578d1..0000000
--- a/test/CodeGen/X86/tailcall-void.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-define fastcc void @i1test(i32, i32, i32, i32) {
- entry:
- tail call fastcc void @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
- ret void
-}
diff --git a/test/CodeGen/X86/tailcall1.ll b/test/CodeGen/X86/tailcall.ll
index f7ff5d5..36a38e0 100644
--- a/test/CodeGen/X86/tailcall1.ll
+++ b/test/CodeGen/X86/tailcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL | count 5
+; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL | count 7
; With -tailcallopt, CodeGen guarantees a tail call optimization
; for all of these.
@@ -38,3 +38,15 @@ define fastcc i32 @noret() nounwind {
tail call fastcc void @does_not_return()
unreachable
}
+
+define fastcc void @void_test(i32, i32, i32, i32) {
+ entry:
+ tail call fastcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret void
+}
+
+define fastcc i1 @i1test(i32, i32, i32, i32) {
+ entry:
+ %4 = tail call fastcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret i1 %4
+}
diff --git a/test/CodeGen/X86/tailcallbyval.ll b/test/CodeGen/X86/tailcallbyval.ll
index 03d6f94..118eee6 100644
--- a/test/CodeGen/X86/tailcallbyval.ll
+++ b/test/CodeGen/X86/tailcallbyval.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-; RUN: llc < %s -march=x86 -tailcallopt | grep {movl\[\[:space:\]\]*4(%esp), %eax} | count 1
+; RUN: llc < %s -march=x86 -tailcallopt | grep "movl[[:space:]]*4(%esp), %eax" | count 1
%struct.s = type {i32, i32, i32, i32, i32, i32, i32, i32,
i32, i32, i32, i32, i32, i32, i32, i32,
i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll
new file mode 100644
index 0000000..ba5f8f8
--- /dev/null
+++ b/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+
+; Gather non-machine-specific tests for the transformations in
+; CodeGen/SelectionDAG/TargetLowering. The SDNodes that these
+; transformations act on can't easily be inspected directly, so
+; check against X86 assembler output instead.
+
+; rdar://11195364 A problem with the transformation:
+; If all of the demanded bits on one side are known, and all of the set
+; bits on that side are also known to be set on the other side, turn this
+; into an AND, as we know the bits will be cleared.
+; The known-set (one) bits of the two operands of %xor1 are not the same, so
+; the transformation should not occur.
+define void @foo(i32 %i32In1, i32 %i32In2, i32 %i32In3, i32 %i32In4,
+ i32 %i32In5, i32 %i32In6, i32* %i32StarOut, i1 %i1In1,
+ i32* %i32SelOut) nounwind {
+ %and3 = and i32 %i32In1, 1362779777
+ %or2 = or i32 %i32In2, %i32In3
+ %and2 = and i32 %or2, 1362779777
+ %xor3 = xor i32 %and3, %and2
+ ; CHECK: shll
+ %shl1 = shl i32 %xor3, %i32In4
+ %sub1 = sub i32 %or2, %shl1
+ %add1 = add i32 %sub1, %i32In5
+ %and1 = and i32 %add1, 1
+ %xor2 = xor i32 %and1, 1
+ %or1 = or i32 %xor2, 364806994 ;0x15BE8352
+ ; CHECK-NOT: andl $96239955
+ %xor1 = xor i32 %or1, 268567040 ;0x10020200
+ ; force an output so not DCE'd
+ store i32 %xor1, i32* %i32StarOut
+ ; force not fast isel by using a select
+ %i32SelVal = select i1 %i1In1, i32 %i32In1, i32 %xor1
+ store i32 %i32SelVal, i32* %i32SelOut
+ ; CHECK: ret
+ ret void
+}
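A hypothetical positive case for the transformation quoted above, not part
of the patch: here every set bit of the xor constant is known one on the
other side, so the xor may legally become an and that clears those bits.

define i32 @xor_to_and_sketch(i32 %x) nounwind {
  %o = or i32 %x, 15   ; bits 0..3 of %o are known one
  %r = xor i32 %o, 6   ; 6's set bits (1 and 2) are among the known ones,
  ret i32 %r           ; so this may fold to: and i32 %o, -7
}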
diff --git a/test/CodeGen/X86/thiscall-struct-return.ll b/test/CodeGen/X86/thiscall-struct-return.ll
index a7be483..0507cb8 100644
--- a/test/CodeGen/X86/thiscall-struct-return.ll
+++ b/test/CodeGen/X86/thiscall-struct-return.ll
@@ -10,7 +10,7 @@ declare x86_thiscallcc void @_ZNK1C6MediumEv(%struct.M* noalias sret %agg.result
define void @testv() nounwind {
; CHECK: testv:
-; CHECK: leal
+; CHECK: leal 16(%esp), %esi
; CHECK-NEXT: movl %esi, (%esp)
; CHECK-NEXT: calll _ZN1CC1Ev
; CHECK: leal 8(%esp), %eax
@@ -29,7 +29,7 @@ entry:
define void @test2v() nounwind {
; CHECK: test2v:
-; CHECK: leal
+; CHECK: leal 16(%esp), %esi
; CHECK-NEXT: movl %esi, (%esp)
; CHECK-NEXT: calll _ZN1CC1Ev
; CHECK: leal 8(%esp), %eax
diff --git a/test/CodeGen/X86/tls-local-dynamic.ll b/test/CodeGen/X86/tls-local-dynamic.ll
new file mode 100644
index 0000000..c5fd16b
--- /dev/null
+++ b/test/CodeGen/X86/tls-local-dynamic.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s
+
+@x = internal thread_local global i32 0, align 4
+@y = internal thread_local global i32 0, align 4
+
+; get_x and get_y are here to prevent x and y from being optimized away as 0
+
+define i32* @get_x() {
+entry:
+ ret i32* @x
+; FIXME: This function uses a single thread-local variable,
+; so we might want to fall back to general-dynamic here.
+; CHECK: get_x:
+; CHECK: leaq x@TLSLD(%rip), %rdi
+; CHECK-NEXT: callq __tls_get_addr@PLT
+; CHECK: x@DTPOFF
+}
+
+define i32* @get_y() {
+entry:
+ ret i32* @y
+}
+
+define i32 @f(i32 %i) {
+entry:
+ %cmp = icmp eq i32 %i, 1
+ br i1 %cmp, label %return, label %if.else
+; This bb does not access TLS, so it should not call __tls_get_addr.
+; CHECK: f:
+; CHECK-NOT: __tls_get_addr
+; CHECK: je
+
+
+if.else:
+ %0 = load i32* @x, align 4
+ %cmp1 = icmp eq i32 %i, 2
+ br i1 %cmp1, label %if.then2, label %return
+; Now we call __tls_get_addr.
+; CHECK: # %if.else
+; CHECK: leaq x@TLSLD(%rip), %rdi
+; CHECK-NEXT: callq __tls_get_addr@PLT
+; CHECK: x@DTPOFF
+
+
+if.then2:
+ %1 = load i32* @y, align 4
+ %add = add nsw i32 %1, %0
+ br label %return
+; This accesses TLS, but is dominated by the previous block, so it should
+; not have to call __tls_get_addr again.
+; CHECK: # %if.then2
+; CHECK-NOT: __tls_get_addr
+; CHECK: y@DTPOFF
+
+
+return:
+ %retval.0 = phi i32 [ %add, %if.then2 ], [ 5, %entry ], [ %0, %if.else ]
+ ret i32 %retval.0
+}
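The amortization that makes local-dynamic attractive, shown as an
illustration rather than patch content: one __tls_get_addr call covers any
number of module-local variables, and each access then adds only a DTPOFF
displacement (the f5 test added to tls-pic.ll below pins down exactly this
pattern):

define i32 @sum_xy() {
entry:
  %a = load i32* @x, align 4   ; leaq x@TLSLD(%rip); callq __tls_get_addr@PLT
  %b = load i32* @y, align 4   ; reuses the same base, just y@DTPOFF(%rax)
  %s = add i32 %a, %b
  ret i32 %s
}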
diff --git a/test/CodeGen/X86/tls-models.ll b/test/CodeGen/X86/tls-models.ll
new file mode 100644
index 0000000..7c527e2
--- /dev/null
+++ b/test/CodeGen/X86/tls-models.ll
@@ -0,0 +1,166 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64_PIC %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32_PIC %s
+
+; Darwin always uses the same model.
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=DARWIN %s
+
+@external_gd = external thread_local global i32
+@internal_gd = internal thread_local global i32 42
+
+@external_ld = external thread_local(localdynamic) global i32
+@internal_ld = internal thread_local(localdynamic) global i32 42
+
+@external_ie = external thread_local(initialexec) global i32
+@internal_ie = internal thread_local(initialexec) global i32 42
+
+@external_le = external thread_local(localexec) global i32
+@internal_le = internal thread_local(localexec) global i32 42
+
+; ----- no model specified -----
+
+define i32* @f1() {
+entry:
+ ret i32* @external_gd
+
+ ; Non-PIC code can use initial-exec; PIC code has to use general dynamic.
+ ; X64: f1:
+ ; X64: external_gd@GOTTPOFF
+ ; X32: f1:
+ ; X32: external_gd@INDNTPOFF
+ ; X64_PIC: f1:
+ ; X64_PIC: external_gd@TLSGD
+ ; X32_PIC: f1:
+ ; X32_PIC: external_gd@TLSGD
+ ; DARWIN: f1:
+ ; DARWIN: _external_gd@TLVP
+}
+
+define i32* @f2() {
+entry:
+ ret i32* @internal_gd
+
+ ; Non-PIC code can use local exec; PIC code can use local dynamic.
+ ; X64: f2:
+ ; X64: internal_gd@TPOFF
+ ; X32: f2:
+ ; X32: internal_gd@NTPOFF
+ ; X64_PIC: f2:
+ ; X64_PIC: internal_gd@TLSLD
+ ; X32_PIC: f2:
+ ; X32_PIC: internal_gd@TLSLDM
+ ; DARWIN: f2:
+ ; DARWIN: _internal_gd@TLVP
+}
+
+
+; ----- localdynamic specified -----
+
+define i32* @f3() {
+entry:
+ ret i32* @external_ld
+
+ ; Non-PIC code can use initial exec; PIC code uses local dynamic as specified.
+ ; X64: f3:
+ ; X64: external_ld@GOTTPOFF
+ ; X32: f3:
+ ; X32: external_ld@INDNTPOFF
+ ; X64_PIC: f3:
+ ; X64_PIC: external_ld@TLSLD
+ ; X32_PIC: f3:
+ ; X32_PIC: external_ld@TLSLDM
+ ; DARWIN: f3:
+ ; DARWIN: _external_ld@TLVP
+}
+
+define i32* @f4() {
+entry:
+ ret i32* @internal_ld
+
+ ; Non-PIC code can use local exec; PIC code can use local dynamic.
+ ; X64: f4:
+ ; X64: internal_ld@TPOFF
+ ; X32: f4:
+ ; X32: internal_ld@NTPOFF
+ ; X64_PIC: f4:
+ ; X64_PIC: internal_ld@TLSLD
+ ; X32_PIC: f4:
+ ; X32_PIC: internal_ld@TLSLDM
+ ; DARWIN: f4:
+ ; DARWIN: _internal_ld@TLVP
+}
+
+
+; ----- initialexec specified -----
+
+define i32* @f5() {
+entry:
+ ret i32* @external_ie
+
+ ; Non-PIC and PIC code will use initial exec as specified.
+ ; X64: f5:
+ ; X64: external_ie@GOTTPOFF
+ ; X32: f5:
+ ; X32: external_ie@INDNTPOFF
+ ; X64_PIC: f5:
+ ; X64_PIC: external_ie@GOTTPOFF
+ ; X32_PIC: f5:
+ ; X32_PIC: external_ie@GOTNTPOFF
+ ; DARWIN: f5:
+ ; DARWIN: _external_ie@TLVP
+}
+
+define i32* @f6() {
+entry:
+ ret i32* @internal_ie
+
+ ; Non-PIC code can use local exec; PIC code uses initial exec as specified.
+ ; X64: f6:
+ ; X64: internal_ie@TPOFF
+ ; X32: f6:
+ ; X32: internal_ie@NTPOFF
+ ; X64_PIC: f6:
+ ; X64_PIC: internal_ie@GOTTPOFF
+ ; X32_PIC: f6:
+ ; X32_PIC: internal_ie@GOTNTPOFF
+ ; DARWIN: f6:
+ ; DARWIN: _internal_ie@TLVP
+}
+
+
+; ----- localexec specified -----
+
+define i32* @f7() {
+entry:
+ ret i32* @external_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; X64: f7:
+ ; X64: external_le@TPOFF
+ ; X32: f7:
+ ; X32: external_le@NTPOFF
+ ; X64_PIC: f7:
+ ; X64_PIC: external_le@TPOFF
+ ; X32_PIC: f7:
+ ; X32_PIC: external_le@NTPOFF
+ ; DARWIN: f7:
+ ; DARWIN: _external_le@TLVP
+}
+
+define i32* @f8() {
+entry:
+ ret i32* @internal_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; X64: f8:
+ ; X64: internal_le@TPOFF
+ ; X32: f8:
+ ; X32: internal_le@NTPOFF
+ ; X64_PIC: f8:
+ ; X64_PIC: internal_le@TPOFF
+ ; X32_PIC: f8:
+ ; X32_PIC: internal_le@NTPOFF
+ ; DARWIN: f8:
+ ; DARWIN: _internal_le@TLVP
+}
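Summarizing the expectations encoded by f1 through f8 (Darwin always uses
its single TLVP model and is omitted):

  requested model   linkage    non-PIC chosen   PIC chosen
  (default)         external   initial-exec     general-dynamic
  (default)         internal   local-exec       local-dynamic
  localdynamic      external   initial-exec     local-dynamic
  localdynamic      internal   local-exec       local-dynamic
  initialexec       external   initial-exec     initial-exec
  initialexec       internal   local-exec       initial-exec
  localexec         external   local-exec       local-exec
  localexec         internal   local-exec       local-exec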
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index b83416d..51c3d23 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -2,6 +2,8 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s
@i = thread_local global i32 15
+@j = internal thread_local global i32 42
+@k = internal thread_local global i32 42
define i32 @f1() {
entry:
@@ -64,4 +66,22 @@ entry:
; X64: callq __tls_get_addr@PLT
+define i32 @f5() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @k, align 4
+ %add = add nsw i32 %0, %1
+ ret i32 %add
+}
+; X32: f5:
+; X32: leal {{[jk]}}@TLSLDM(%ebx)
+; X32-NEXT: calll ___tls_get_addr@PLT
+; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
+; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+
+; X64: f5:
+; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
+; X64-NEXT: callq __tls_get_addr@PLT
+; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
+; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll
index e2e58a54..3fca9f5 100644
--- a/test/CodeGen/X86/tls-pie.ll
+++ b/test/CodeGen/X86/tls-pie.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
; RUN: | FileCheck -check-prefix=X64 %s
@i = thread_local global i32 15
@@ -35,7 +35,12 @@ entry:
define i32 @f3() {
; X32: f3:
-; X32: movl i2@INDNTPOFF, %eax
+; X32: calll .L{{[0-9]+}}$pb
+; X32-NEXT: .L{{[0-9]+}}$pb:
+; X32-NEXT: popl %eax
+; X32-NEXT: .Ltmp{{[0-9]+}}:
+; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %eax
+; X32-NEXT: movl i2@GOTNTPOFF(%eax), %eax
; X32-NEXT: movl %gs:(%eax), %eax
; X32-NEXT: ret
; X64: f3:
@@ -50,8 +55,13 @@ entry:
define i32* @f4() {
; X32: f4:
-; X32: movl %gs:0, %eax
-; X32-NEXT: addl i2@INDNTPOFF, %eax
+; X32: calll .L{{[0-9]+}}$pb
+; X32-NEXT: .L{{[0-9]+}}$pb:
+; X32-NEXT: popl %ecx
+; X32-NEXT: .Ltmp{{[0-9]+}}:
+; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %ecx
+; X32-NEXT: movl %gs:0, %eax
+; X32-NEXT: addl i2@GOTNTPOFF(%ecx), %eax
; X32-NEXT: ret
; X64: f4:
; X64: movq %fs:0, %rax
diff --git a/test/CodeGen/X86/trap.ll b/test/CodeGen/X86/trap.ll
index 03ae6bf..3f44be0 100644
--- a/test/CodeGen/X86/trap.ll
+++ b/test/CodeGen/X86/trap.ll
@@ -1,9 +1,21 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep ud2
-define i32 @test() noreturn nounwind {
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+
+; CHECK: test0:
+; CHECK: ud2
+define i32 @test0() noreturn nounwind {
entry:
tail call void @llvm.trap( )
unreachable
}
+; CHECK: test1:
+; CHECK: int3
+define i32 @test1() noreturn nounwind {
+entry:
+ tail call void @llvm.debugtrap( )
+ unreachable
+}
+
declare void @llvm.trap() nounwind
+declare void @llvm.debugtrap() nounwind
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 57d6e97..9877d7b 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
;CHECK: load_2_i8
; A single 16-bit load
diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll b/test/CodeGen/X86/twoaddr-coalesce-2.ll
index 6f16a25..af6d47a 100644
--- a/test/CodeGen/X86/twoaddr-coalesce-2.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& \
-; RUN: grep {twoaddrinstr} | grep {Number of instructions aggressively commuted}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \
+; RUN: grep "twoaddrinstr" | grep "Number of instructions aggressively commuted"
; rdar://6480363
target triple = "i386-apple-darwin9.6"
diff --git a/test/CodeGen/X86/twoaddr-pass-sink.ll b/test/CodeGen/X86/twoaddr-pass-sink.ll
index 077fee0..513c304 100644
--- a/test/CodeGen/X86/twoaddr-pass-sink.ll
+++ b/test/CodeGen/X86/twoaddr-pass-sink.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& grep {Number of 3-address instructions sunk}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | grep "Number of 3-address instructions sunk"
define void @t2(<2 x i64>* %vDct, <2 x i64>* %vYp, i8* %skiplist, <2 x i64> %a1) nounwind {
entry:
diff --git a/test/CodeGen/X86/uint_to_fp.ll b/test/CodeGen/X86/uint_to_fp.ll
index 41ee194..0536eb0 100644
--- a/test/CodeGen/X86/uint_to_fp.ll
+++ b/test/CodeGen/X86/uint_to_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep {sub.*esp}
+; RUN: llc < %s -march=x86 -mcpu=yonah | not grep "sub.*esp"
; RUN: llc < %s -march=x86 -mcpu=yonah | grep cvtsi2ss
; rdar://6034396
diff --git a/test/CodeGen/X86/umul-with-carry.ll b/test/CodeGen/X86/umul-with-carry.ll
index 7416051..56fdadb 100644
--- a/test/CodeGen/X86/umul-with-carry.ll
+++ b/test/CodeGen/X86/umul-with-carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {jc} | count 1
+; RUN: llc < %s -march=x86 | grep "jc" | count 1
; XFAIL: *
; FIXME: umul-with-overflow not supported yet.
diff --git a/test/CodeGen/X86/unwindraise.ll b/test/CodeGen/X86/unwindraise.ll
new file mode 100644
index 0000000..a438723
--- /dev/null
+++ b/test/CodeGen/X86/unwindraise.ll
@@ -0,0 +1,252 @@
+; RUN: llc < %s -verify-machineinstrs
+; PR13188
+;
+; The _Unwind_RaiseException function can return both normally and via eh.return.
+; This causes confusion about the function live-out registers, since the two
+; different ways of returning have different return values.
+;
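+; As a minimal sketch (illustration only; %off and %handler are hypothetical
+; names), the two exits look like:
+;   ret i32 %retval                                  ; ordinary return
+; versus:
+;   call void @llvm.eh.return.i64(i64 %off, i8* %handler)
+;   unreachable                                      ; eh.return exit
+; The two paths leave different sets of registers live-out, which is what
+; -verify-machineinstrs exercises here.
+;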
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-freebsd9.0"
+
+%struct._Unwind_Context = type { [18 x i8*], i8*, i8*, i8*, %struct.dwarf_eh_bases, i64, i64, i64, [18 x i8] }
+%struct.dwarf_eh_bases = type { i8*, i8*, i8* }
+%struct._Unwind_FrameState = type { %struct.frame_state_reg_info, i64, i64, i8*, i32, i8*, i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)*, i64, i64, i64, i8, i8, i8, i8, i8* }
+%struct.frame_state_reg_info = type { [18 x %struct.anon], %struct.frame_state_reg_info* }
+%struct.anon = type { %union.anon, i32 }
+%union.anon = type { i64 }
+%struct._Unwind_Exception = type { i64, void (i32, %struct._Unwind_Exception*)*, i64, i64 }
+
+@dwarf_reg_size_table = external hidden unnamed_addr global [18 x i8], align 16
+
+declare void @abort() noreturn
+
+declare fastcc i32 @uw_frame_state_for(%struct._Unwind_Context*, %struct._Unwind_FrameState*) uwtable
+
+define hidden i32 @_Unwind_RaiseException(%struct._Unwind_Exception* %exc) uwtable {
+entry:
+ %fs.i = alloca %struct._Unwind_FrameState, align 8
+ %this_context = alloca %struct._Unwind_Context, align 8
+ %cur_context = alloca %struct._Unwind_Context, align 8
+ %fs = alloca %struct._Unwind_FrameState, align 8
+ call void @llvm.eh.unwind.init()
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ %1 = call i8* @llvm.returnaddress(i32 0)
+ call fastcc void @uw_init_context_1(%struct._Unwind_Context* %this_context, i8* %0, i8* %1)
+ %2 = bitcast %struct._Unwind_Context* %cur_context to i8*
+ %3 = bitcast %struct._Unwind_Context* %this_context to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 240, i32 8, i1 false)
+ %personality = getelementptr inbounds %struct._Unwind_FrameState* %fs, i64 0, i32 6
+ %retaddr_column.i = getelementptr inbounds %struct._Unwind_FrameState* %fs, i64 0, i32 9
+ %flags.i.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 5
+ %ra.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 2
+ %exception_class = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 0
+ br label %while.body
+
+while.body: ; preds = %uw_update_context.exit, %entry
+ %call = call fastcc i32 @uw_frame_state_for(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs)
+ switch i32 %call, label %do.end21 [
+ i32 5, label %do.end21.loopexit46
+ i32 0, label %if.end3
+ ]
+
+if.end3: ; preds = %while.body
+ %4 = load i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)** %personality, align 8, !tbaa !0
+ %tobool = icmp eq i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)* %4, null
+ br i1 %tobool, label %if.end13, label %if.then4
+
+if.then4: ; preds = %if.end3
+ %5 = load i64* %exception_class, align 8, !tbaa !3
+ %call6 = call i32 %4(i32 1, i32 1, i64 %5, %struct._Unwind_Exception* %exc, %struct._Unwind_Context* %cur_context)
+ switch i32 %call6, label %do.end21.loopexit46 [
+ i32 6, label %while.end
+ i32 8, label %if.end13
+ ]
+
+if.end13: ; preds = %if.then4, %if.end3
+ call fastcc void @uw_update_context_1(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs)
+ %6 = load i64* %retaddr_column.i, align 8, !tbaa !3
+ %conv.i = trunc i64 %6 to i32
+ %cmp.i.i.i = icmp slt i32 %conv.i, 18
+ br i1 %cmp.i.i.i, label %cond.end.i.i.i, label %cond.true.i.i.i
+
+cond.true.i.i.i: ; preds = %if.end13
+ call void @abort() noreturn
+ unreachable
+
+cond.end.i.i.i: ; preds = %if.end13
+ %sext.i = shl i64 %6, 32
+ %idxprom.i.i.i = ashr exact i64 %sext.i, 32
+ %arrayidx.i.i.i = getelementptr inbounds [18 x i8]* @dwarf_reg_size_table, i64 0, i64 %idxprom.i.i.i
+ %7 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1
+ %arrayidx2.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 0, i64 %idxprom.i.i.i
+ %8 = load i8** %arrayidx2.i.i.i, align 8, !tbaa !0
+ %9 = load i64* %flags.i.i.i.i, align 8, !tbaa !3
+ %and.i.i.i.i = and i64 %9, 4611686018427387904
+ %tobool.i.i.i = icmp eq i64 %and.i.i.i.i, 0
+ br i1 %tobool.i.i.i, label %if.end.i.i.i, label %land.lhs.true.i.i.i
+
+land.lhs.true.i.i.i: ; preds = %cond.end.i.i.i
+ %arrayidx4.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 8, i64 %idxprom.i.i.i
+ %10 = load i8* %arrayidx4.i.i.i, align 1, !tbaa !1
+ %tobool6.i.i.i = icmp eq i8 %10, 0
+ br i1 %tobool6.i.i.i, label %if.end.i.i.i, label %if.then.i.i.i
+
+if.then.i.i.i: ; preds = %land.lhs.true.i.i.i
+ %11 = ptrtoint i8* %8 to i64
+ br label %uw_update_context.exit
+
+if.end.i.i.i: ; preds = %land.lhs.true.i.i.i, %cond.end.i.i.i
+ %cmp8.i.i.i = icmp eq i8 %7, 8
+ br i1 %cmp8.i.i.i, label %if.then10.i.i.i, label %cond.true14.i.i.i
+
+if.then10.i.i.i: ; preds = %if.end.i.i.i
+ %12 = bitcast i8* %8 to i64*
+ %13 = load i64* %12, align 8, !tbaa !3
+ br label %uw_update_context.exit
+
+cond.true14.i.i.i: ; preds = %if.end.i.i.i
+ call void @abort() noreturn
+ unreachable
+
+uw_update_context.exit: ; preds = %if.then10.i.i.i, %if.then.i.i.i
+ %retval.0.i.i.i = phi i64 [ %11, %if.then.i.i.i ], [ %13, %if.then10.i.i.i ]
+ %14 = inttoptr i64 %retval.0.i.i.i to i8*
+ store i8* %14, i8** %ra.i, align 8, !tbaa !0
+ br label %while.body
+
+while.end: ; preds = %if.then4
+ %private_1 = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 2
+ store i64 0, i64* %private_1, align 8, !tbaa !3
+ %15 = load i8** %ra.i, align 8, !tbaa !0
+ %16 = ptrtoint i8* %15 to i64
+ %private_2 = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 3
+ store i64 %16, i64* %private_2, align 8, !tbaa !3
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 240, i32 8, i1 false)
+ %17 = bitcast %struct._Unwind_FrameState* %fs.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %17)
+ %personality.i = getelementptr inbounds %struct._Unwind_FrameState* %fs.i, i64 0, i32 6
+ %retaddr_column.i22 = getelementptr inbounds %struct._Unwind_FrameState* %fs.i, i64 0, i32 9
+ br label %while.body.i
+
+while.body.i: ; preds = %uw_update_context.exit44, %while.end
+ %call.i = call fastcc i32 @uw_frame_state_for(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs.i)
+ %18 = load i8** %ra.i, align 8, !tbaa !0
+ %19 = ptrtoint i8* %18 to i64
+ %20 = load i64* %private_2, align 8, !tbaa !3
+ %cmp.i = icmp eq i64 %19, %20
+ %cmp2.i = icmp eq i32 %call.i, 0
+ br i1 %cmp2.i, label %if.end.i, label %do.end21
+
+if.end.i: ; preds = %while.body.i
+ %21 = load i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)** %personality.i, align 8, !tbaa !0
+ %tobool.i = icmp eq i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)* %21, null
+ br i1 %tobool.i, label %if.end12.i, label %if.then3.i
+
+if.then3.i: ; preds = %if.end.i
+ %or.i = select i1 %cmp.i, i32 6, i32 2
+ %22 = load i64* %exception_class, align 8, !tbaa !3
+ %call5.i = call i32 %21(i32 1, i32 %or.i, i64 %22, %struct._Unwind_Exception* %exc, %struct._Unwind_Context* %cur_context)
+ switch i32 %call5.i, label %do.end21 [
+ i32 7, label %do.body19
+ i32 8, label %if.end12.i
+ ]
+
+if.end12.i: ; preds = %if.then3.i, %if.end.i
+ br i1 %cmp.i, label %cond.true.i, label %cond.end.i
+
+cond.true.i: ; preds = %if.end12.i
+ call void @abort() noreturn
+ unreachable
+
+cond.end.i: ; preds = %if.end12.i
+ call fastcc void @uw_update_context_1(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs.i)
+ %23 = load i64* %retaddr_column.i22, align 8, !tbaa !3
+ %conv.i23 = trunc i64 %23 to i32
+ %cmp.i.i.i24 = icmp slt i32 %conv.i23, 18
+ br i1 %cmp.i.i.i24, label %cond.end.i.i.i33, label %cond.true.i.i.i25
+
+cond.true.i.i.i25: ; preds = %cond.end.i
+ call void @abort() noreturn
+ unreachable
+
+cond.end.i.i.i33: ; preds = %cond.end.i
+ %sext.i26 = shl i64 %23, 32
+ %idxprom.i.i.i27 = ashr exact i64 %sext.i26, 32
+ %arrayidx.i.i.i28 = getelementptr inbounds [18 x i8]* @dwarf_reg_size_table, i64 0, i64 %idxprom.i.i.i27
+ %24 = load i8* %arrayidx.i.i.i28, align 1, !tbaa !1
+ %arrayidx2.i.i.i29 = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 0, i64 %idxprom.i.i.i27
+ %25 = load i8** %arrayidx2.i.i.i29, align 8, !tbaa !0
+ %26 = load i64* %flags.i.i.i.i, align 8, !tbaa !3
+ %and.i.i.i.i31 = and i64 %26, 4611686018427387904
+ %tobool.i.i.i32 = icmp eq i64 %and.i.i.i.i31, 0
+ br i1 %tobool.i.i.i32, label %if.end.i.i.i39, label %land.lhs.true.i.i.i36
+
+land.lhs.true.i.i.i36: ; preds = %cond.end.i.i.i33
+ %arrayidx4.i.i.i34 = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 8, i64 %idxprom.i.i.i27
+ %27 = load i8* %arrayidx4.i.i.i34, align 1, !tbaa !1
+ %tobool6.i.i.i35 = icmp eq i8 %27, 0
+ br i1 %tobool6.i.i.i35, label %if.end.i.i.i39, label %if.then.i.i.i37
+
+if.then.i.i.i37: ; preds = %land.lhs.true.i.i.i36
+ %28 = ptrtoint i8* %25 to i64
+ br label %uw_update_context.exit44
+
+if.end.i.i.i39: ; preds = %land.lhs.true.i.i.i36, %cond.end.i.i.i33
+ %cmp8.i.i.i38 = icmp eq i8 %24, 8
+ br i1 %cmp8.i.i.i38, label %if.then10.i.i.i40, label %cond.true14.i.i.i41
+
+if.then10.i.i.i40: ; preds = %if.end.i.i.i39
+ %29 = bitcast i8* %25 to i64*
+ %30 = load i64* %29, align 8, !tbaa !3
+ br label %uw_update_context.exit44
+
+cond.true14.i.i.i41: ; preds = %if.end.i.i.i39
+ call void @abort() noreturn
+ unreachable
+
+uw_update_context.exit44: ; preds = %if.then10.i.i.i40, %if.then.i.i.i37
+ %retval.0.i.i.i42 = phi i64 [ %28, %if.then.i.i.i37 ], [ %30, %if.then10.i.i.i40 ]
+ %31 = inttoptr i64 %retval.0.i.i.i42 to i8*
+ store i8* %31, i8** %ra.i, align 8, !tbaa !0
+ br label %while.body.i
+
+do.body19: ; preds = %if.then3.i
+ call void @llvm.lifetime.end(i64 -1, i8* %17)
+ %call20 = call fastcc i64 @uw_install_context_1(%struct._Unwind_Context* %this_context, %struct._Unwind_Context* %cur_context)
+ %32 = load i8** %ra.i, align 8, !tbaa !0
+ call void @llvm.eh.return.i64(i64 %call20, i8* %32)
+ unreachable
+
+do.end21.loopexit46: ; preds = %if.then4, %while.body
+ %retval.0.ph = phi i32 [ 3, %if.then4 ], [ 5, %while.body ]
+ br label %do.end21
+
+do.end21: ; preds = %do.end21.loopexit46, %if.then3.i, %while.body.i, %while.body
+ %retval.0 = phi i32 [ %retval.0.ph, %do.end21.loopexit46 ], [ 3, %while.body ], [ 2, %while.body.i ], [ 2, %if.then3.i ]
+ ret i32 %retval.0
+}
+
+declare void @llvm.eh.unwind.init() nounwind
+
+declare fastcc void @uw_init_context_1(%struct._Unwind_Context*, i8*, i8*) uwtable
+
+declare i8* @llvm.eh.dwarf.cfa(i32) nounwind
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+declare fastcc i64 @uw_install_context_1(%struct._Unwind_Context*, %struct._Unwind_Context*) uwtable
+
+declare void @llvm.eh.return.i64(i64, i8*) nounwind
+
+declare fastcc void @uw_update_context_1(%struct._Unwind_Context*, %struct._Unwind_FrameState* nocapture) uwtable
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"long", metadata !1}
diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
index ae3f55a..569586a 100644
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ b/test/CodeGen/X86/v-binop-widen2.ll
@@ -1,9 +1,16 @@
-; RUN: llc -march=x86 -mattr=+sse < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
%vec = type <6 x float>
; CHECK: divss
; CHECK: divss
; CHECK: divps
+
+; The scheduler produces a different instruction order on Intel Atom.
+; ATOM: divps
+; ATOM: divss
+; ATOM: divss
+
define %vec @vecdiv( %vec %p1, %vec %p2)
{
%result = fdiv %vec %p1, %p2
diff --git a/test/CodeGen/X86/vec_call.ll b/test/CodeGen/X86/vec_call.ll
index f2fc7e7..e0862ca 100644
--- a/test/CodeGen/X86/vec_call.ll
+++ b/test/CodeGen/X86/vec_call.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
-; RUN: grep {subl.*60}
+; RUN: grep "subl.*60"
; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
-; RUN: grep {movaps.*32}
+; RUN: grep "movaps.*32"
define void @test() {
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
new file mode 100644
index 0000000..08eb16f
--- /dev/null
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+;CHECK: foo1_8
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <8 x float> @foo1_8(<8 x i8> %src) {
+ %res = sitofp <8 x i8> %src to <8 x float>
+ ret <8 x float> %res
+}
+
+;CHECK: foo1_4
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <4 x float> @foo1_4(<4 x i8> %src) {
+ %res = sitofp <4 x i8> %src to <4 x float>
+ ret <4 x float> %res
+}
+
+;CHECK: foo2_8
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <8 x float> @foo2_8(<8 x i8> %src) {
+ %res = uitofp <8 x i8> %src to <8 x float>
+ ret <8 x float> %res
+}
+
+;CHECK: foo2_4
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <4 x float> @foo2_4(<4 x i8> %src) {
+ %res = uitofp <4 x i8> %src to <4 x float>
+ ret <4 x float> %res
+}
+
+;CHECK: foo3_8
+;CHECK: vcvttps2dq
+;CHECK: ret
+define <8 x i8> @foo3_8(<8 x float> %src) {
+ %res = fptosi <8 x float> %src to <8 x i8>
+ ret <8 x i8> %res
+}
+;CHECK: foo3_4
+;CHECK: vcvttps2dq
+;CHECK: ret
+define <4 x i8> @foo3_4(<4 x float> %src) {
+ %res = fptosi <4 x float> %src to <4 x i8>
+ ret <4 x i8> %res
+}
+
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 91777f7..46d6a23 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
entry:
; CHECK: cfi_def_cfa_offset
; CHECK-NOT: set
-; CHECK: movzwl
-; CHECK: movzwl
+; CHECK: punpcklwd
; CHECK: pshufd
; CHECK: pshufb
%shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index 39c9b77..367dd27 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i386-apple-darwin | FileCheck %s
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 2951193..565be7a 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep {(%esp,%eax,4)} | count 4
+; RUN: llc < %s -march=x86 -mcpu=yonah | grep "(%esp,%eax,4)" | count 4
; Inserts and extracts with variable indices must be lowered
; to memory accesses.
diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll
index de3b36f..2a4864a 100644
--- a/test/CodeGen/X86/vec_insert-6.ll
+++ b/test/CodeGen/X86/vec_insert-6.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pslldq
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
define <4 x float> @t3(<4 x float>* %P) nounwind {
%tmp1 = load <4 x float>* %P
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index ada17e0..d1d7608 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t
; RUN: grep pshufd %t | count 2
define <4 x float> @test(float %a) nounwind {
diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
index 3656e5f..b8ec0cf 100644
--- a/test/CodeGen/X86/vec_set-9.ll
+++ b/test/CodeGen/X86/vec_set-9.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86-64 | grep movd | count 1
-; RUN: llc < %s -march=x86-64 | grep {movlhps.*%xmm0, %xmm0}
+; RUN: llc < %s -march=x86-64 | grep "movlhps.*%xmm0, %xmm0"
define <2 x i64> @test3(i64 %A) nounwind {
entry:
diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
index 06f38ed..09d4c1a 100644
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ b/test/CodeGen/X86/vec_shuffle-16.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
; sse: t1:
; sse2: t1:
diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll
index 861a1cc..b26f920 100644
--- a/test/CodeGen/X86/vec_shuffle-19.ll
+++ b/test/CodeGen/X86/vec_shuffle-19.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
+; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
; PR2485
define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind {
diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
index dec98c7..0aff822 100644
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ b/test/CodeGen/X86/vec_shuffle-27.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
; ModuleID = 'vec_shuffle-27.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
@@ -35,4 +35,4 @@ entry:
store <4 x i64> %vect1487, <4 x i64>* %ap
store <4 x i64> %vect1488, <4 x i64>* %bp
ret void;
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
index 7f0fcb5..f5083b4 100644
--- a/test/CodeGen/X86/vec_shuffle-35.ll
+++ b/test/CodeGen/X86/vec_shuffle-35.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
-; RUN: grep pextrw %t | count 13
-; RUN: grep pinsrw %t | count 14
+; RUN: grep pextrw %t | count 12
+; RUN: grep pinsrw %t | count 13
; RUN: grep rolw %t | count 13
; RUN: not grep esp %t
; RUN: not grep ebp %t
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
index 8090afc..9a06015 100644
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ b/test/CodeGen/X86/vec_shuffle-36.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse41 | FileCheck %s
define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; CHECK: pshufb
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
index 430aa04..ed285f9 100644
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ b/test/CodeGen/X86/vec_shuffle-37.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll
index 96ef883..ec196df 100644
--- a/test/CodeGen/X86/vec_shuffle-38.ll
+++ b/test/CodeGen/X86/vec_shuffle-38.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
; CHECK: unpcklpd
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
index 55531e3..ee8d2d5 100644
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ b/test/CodeGen/X86/vec_shuffle-39.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
; rdar://10050222, rdar://10134392
define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll
index cde5ae9..f105de4 100644
--- a/test/CodeGen/X86/vec_splat-2.ll
+++ b/test/CodeGen/X86/vec_splat-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd | count 1
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd | count 1
define void @test(<2 x i64>* %P, i8 %x) nounwind {
%tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
index 649b85c..feacc42 100644
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ b/test/CodeGen/X86/vec_splat-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t
; RUN: grep punpcklwd %t | count 4
; RUN: grep punpckhwd %t | count 4
; RUN: grep "pshufd" %t | count 8
diff --git a/test/CodeGen/X86/vec_splat-4.ll b/test/CodeGen/X86/vec_splat-4.ll
index d9941e6..374acfa 100644
--- a/test/CodeGen/X86/vec_splat-4.ll
+++ b/test/CodeGen/X86/vec_splat-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t
; RUN: grep punpcklbw %t | count 16
; RUN: grep punpckhbw %t | count 16
; RUN: grep "pshufd" %t | count 16
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
index a87fbd0..24d8487 100644
--- a/test/CodeGen/X86/vec_splat.ll
+++ b/test/CodeGen/X86/vec_splat.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd
-; RUN: llc < %s -march=x86 -mattr=+sse3 | grep movddup
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse3 | grep movddup
define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 3bd3f7b..c294df5 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -70,3 +70,17 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; CHECK: call
; CHECK: roundss $4, %xmm{{.*}}, %xmm0
}
+
+; PR13576
+define <2 x double> @test5() nounwind uwtable readnone noinline {
+entry:
+ %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
+ ret <2 x double> %0
+; CHECK: test5:
+; CHECK: mov
+; CHECK: mov
+; CHECK: cvtsi2sd
+}
+
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll
index 4955156..e775750 100644
--- a/test/CodeGen/X86/vshift-1.ll
+++ b/test/CodeGen/X86/vshift-1.ll
@@ -16,7 +16,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
entry:
; CHECK: shift1b:
; CHECK: movd
-; CHECK-NEXT: psllq
+; CHECK: psllq
%0 = insertelement <2 x i64> undef, i64 %amt, i32 0
%1 = insertelement <2 x i64> %0, i64 %amt, i32 1
%shl = shl <2 x i64> %val, %1
@@ -38,7 +38,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift2b:
; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
%0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%1 = insertelement <4 x i32> %0, i32 %amt, i32 1
%2 = insertelement <4 x i32> %1, i32 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll
index 9a9b419..9496893 100644
--- a/test/CodeGen/X86/vshift-2.ll
+++ b/test/CodeGen/X86/vshift-2.ll
@@ -16,7 +16,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
entry:
; CHECK: shift1b:
; CHECK: movd
-; CHECK-NEXT: psrlq
+; CHECK: psrlq
%0 = insertelement <2 x i64> undef, i64 %amt, i32 0
%1 = insertelement <2 x i64> %0, i64 %amt, i32 1
%lshr = lshr <2 x i64> %val, %1
@@ -37,7 +37,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift2b:
; CHECK: movd
-; CHECK-NEXT: psrld
+; CHECK: psrld
%0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%1 = insertelement <4 x i32> %0, i32 %amt, i32 1
%2 = insertelement <4 x i32> %1, i32 %amt, i32 2
@@ -63,7 +63,7 @@ entry:
; CHECK: shift3b:
; CHECK: movzwl
; CHECK: movd
-; CHECK-NEXT: psrlw
+; CHECK: psrlw
%0 = insertelement <8 x i16> undef, i16 %amt, i32 0
%1 = insertelement <8 x i16> %0, i16 %amt, i32 1
%2 = insertelement <8 x i16> %0, i16 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-3.ll b/test/CodeGen/X86/vshift-3.ll
index 8e8a9aa..b2b48b9 100644
--- a/test/CodeGen/X86/vshift-3.ll
+++ b/test/CodeGen/X86/vshift-3.ll
@@ -28,7 +28,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift2b:
; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
%0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%1 = insertelement <4 x i32> %0, i32 %amt, i32 1
%2 = insertelement <4 x i32> %1, i32 %amt, i32 2
@@ -52,7 +52,7 @@ entry:
; CHECK: shift3b:
; CHECK: movzwl
; CHECK: movd
-; CHECK-NEXT: psraw
+; CHECK: psraw
%0 = insertelement <8 x i16> undef, i16 %amt, i32 0
%1 = insertelement <8 x i16> %0, i16 %amt, i32 1
%2 = insertelement <8 x i16> %0, i16 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-5.ll b/test/CodeGen/X86/vshift-5.ll
index cb254ae..f6c311d 100644
--- a/test/CodeGen/X86/vshift-5.ll
+++ b/test/CodeGen/X86/vshift-5.ll
@@ -6,7 +6,7 @@ define void @shift5a(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
entry:
; CHECK: shift5a:
; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
%amt = load i32* %pamt
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -20,7 +20,7 @@ define void @shift5b(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
entry:
; CHECK: shift5b:
; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
%amt = load i32* %pamt
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -34,7 +34,7 @@ define void @shift5c(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift5c:
; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
%shl = shl <4 x i32> %val, %shamt
@@ -47,7 +47,7 @@ define void @shift5d(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift5d:
; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
%shr = ashr <4 x i32> %val, %shamt
diff --git a/test/CodeGen/X86/widen_arith-3.ll b/test/CodeGen/X86/widen_arith-3.ll
index f55b184..d86042a 100644
--- a/test/CodeGen/X86/widen_arith-3.ll
+++ b/test/CodeGen/X86/widen_arith-3.ll
@@ -2,7 +2,6 @@
; CHECK: incl
; CHECK: incl
; CHECK: incl
-; CHECK: addl
; Widen a v3i16 to v8i16 to do a vector add
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 4330aae..ebdfea9 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,7 +1,14 @@
-; RUN: llc -march=x86 -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s
+
; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movd
+; CHECK: movl
+; CHECK: movlpd
+
+; The scheduler produces a different instruction order on Intel Atom.
+; ATOM: movl
+; ATOM: paddd
+; ATOM: movlpd
; bitcast a v4i16 to v2i32
diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll
index 5c695ea..3979ce4 100644
--- a/test/CodeGen/X86/widen_cast-2.ll
+++ b/test/CodeGen/X86/widen_cast-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
; CHECK: pextrd
; CHECK: pextrd
; CHECK: movd
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index 136578d..9086d3a 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,9 +1,8 @@
; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
; CHECK: movl
-; CHECK: movd
+; CHECK: movlpd
; bitcast an i64 to v2i32
-
define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
entry:
%conv = bitcast i64 %src to <2 x i32>
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index affd796..1158e04 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
; CHECK-NOT: cvtsi2ss
; unsigned to float v7i16 to v7f32
diff --git a/test/CodeGen/X86/widen_extract-1.ll b/test/CodeGen/X86/widen_extract-1.ll
index 4bcac58..8672742 100644
--- a/test/CodeGen/X86/widen_extract-1.ll
+++ b/test/CodeGen/X86/widen_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
; widen extract subvector
define void @convert(<2 x double>* %dst.addr, <3 x double> %src) {
diff --git a/test/CodeGen/X86/widen_load-0.ll b/test/CodeGen/X86/widen_load-0.ll
index 4aeec91..d543728 100644
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@@ -1,18 +1,12 @@
; RUN: llc < %s -o - -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -o - -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s -check-prefix=WIN64
; PR4891
; Both loads should happen before either store.
-; CHECK: movd ({{.*}}), {{.*}}
-; CHECK: movd ({{.*}}), {{.*}}
-; CHECK: movd {{.*}}, ({{.*}})
-; CHECK: movd {{.*}}, ({{.*}})
-
-; WIN64: movd ({{.*}}), {{.*}}
-; WIN64: movd ({{.*}}), {{.*}}
-; WIN64: movd {{.*}}, ({{.*}})
-; WIN64: movd {{.*}}, ({{.*}})
+; CHECK: movl ({{.*}}), {{.*}}
+; CHECK: movl ({{.*}}), {{.*}}
+; CHECK: movl {{.*}}, ({{.*}})
+; CHECK: movl {{.*}}, ({{.*}})
define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwind {
entry:
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index a961c6a..cc11e4c 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,12 +1,9 @@
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
; PR8777
; PR8778
-; Passing the same value in two registers creates a false interference that
-; only -join-physregs resolves. It could also be handled by a parallel copy.
-
define i64 @foo(i64 %n, i64 %x) nounwind {
entry:
@@ -31,19 +28,19 @@ entry:
%buf1 = alloca i8, i64 %n, align 1
-; M64: leaq 15(%rcx), %rax
+; M64: leaq 15(%{{.*}}), %rax
; M64: andq $-16, %rax
; M64: callq ___chkstk
; M64-NOT: %rsp
; M64: movq %rsp, %rax
-; W64: leaq 15(%rcx), %rax
+; W64: leaq 15(%{{.*}}), %rax
; W64: andq $-16, %rax
; W64: callq __chkstk
; W64: subq %rax, %rsp
; W64: movq %rsp, %rax
-; EFI: leaq 15(%rcx), [[R1:%r.*]]
+; EFI: leaq 15(%{{.*}}), [[R1:%r.*]]
; EFI: andq $-16, [[R1]]
; EFI: movq %rsp, [[R64:%r.*]]
; EFI: subq [[R1]], [[R64]]
diff --git a/test/CodeGen/X86/x86-64-arg.ll b/test/CodeGen/X86/x86-64-arg.ll
index ec8dd8e..9a959e8 100644
--- a/test/CodeGen/X86/x86-64-arg.ll
+++ b/test/CodeGen/X86/x86-64-arg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl %edi, %eax}
+; RUN: llc < %s | grep "movl %edi, %eax"
; The input value is already sign extended; don't re-extend it.
; This testcase corresponds to:
; int test(short X) { return (int)X; }
diff --git a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
index 79316f2..902c9d5 100644
--- a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
+++ b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s | not grep rsp
-; RUN: llc < %s | grep cvttsd2siq
+; RUN: llc < %s -mcpu=nehalem | not grep rsp
+; RUN: llc < %s -mcpu=nehalem | grep cvttsd2siq
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin8"
diff --git a/test/CodeGen/X86/x86-64-pic-1.ll b/test/CodeGen/X86/x86-64-pic-1.ll
index 46f6d33..46cd4f8 100644
--- a/test/CodeGen/X86/x86-64-pic-1.ll
+++ b/test/CodeGen/X86/x86-64-pic-1.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq f@PLT} %t1
+; RUN: grep "callq f@PLT" %t1
define void @g() {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index b6f82e2..3ec172b 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq g@PLT} %t1
+; RUN: grep "callq g@PLT" %t1
@g = alias weak i32 ()* @f
diff --git a/test/CodeGen/X86/x86-64-pic-11.ll b/test/CodeGen/X86/x86-64-pic-11.ll
index 4db331c..fd64beb 100644
--- a/test/CodeGen/X86/x86-64-pic-11.ll
+++ b/test/CodeGen/X86/x86-64-pic-11.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq __fixunsxfti@PLT} %t1
+; RUN: grep "callq __fixunsxfti@PLT" %t1
define i128 @f(x86_fp80 %a) nounwind {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-2.ll b/test/CodeGen/X86/x86-64-pic-2.ll
index 1ce2de7..f3f7b1d 100644
--- a/test/CodeGen/X86/x86-64-pic-2.ll
+++ b/test/CodeGen/X86/x86-64-pic-2.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq f} %t1
-; RUN: not grep {callq f@PLT} %t1
+; RUN: grep "callq f" %t1
+; RUN: not grep "callq f@PLT" %t1
define void @g() {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-3.ll b/test/CodeGen/X86/x86-64-pic-3.ll
index aa3c888..ba93378 100644
--- a/test/CodeGen/X86/x86-64-pic-3.ll
+++ b/test/CodeGen/X86/x86-64-pic-3.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq f} %t1
-; RUN: not grep {callq f@PLT} %t1
+; RUN: grep "callq f" %t1
+; RUN: not grep "callq f@PLT" %t1
define void @g() {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-4.ll b/test/CodeGen/X86/x86-64-pic-4.ll
index 90fc119..33b08c4 100644
--- a/test/CodeGen/X86/x86-64-pic-4.ll
+++ b/test/CodeGen/X86/x86-64-pic-4.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movq a@GOTPCREL(%rip),} %t1
+; RUN: grep "movq a@GOTPCREL(%rip)," %t1
@a = global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-5.ll b/test/CodeGen/X86/x86-64-pic-5.ll
index 6369bde..234bc0d 100644
--- a/test/CodeGen/X86/x86-64-pic-5.ll
+++ b/test/CodeGen/X86/x86-64-pic-5.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movl a(%rip),} %t1
+; RUN: grep "movl a(%rip)," %t1
; RUN: not grep GOTPCREL %t1
@a = hidden global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-6.ll b/test/CodeGen/X86/x86-64-pic-6.ll
index 6e19ad3..ae5b583 100644
--- a/test/CodeGen/X86/x86-64-pic-6.ll
+++ b/test/CodeGen/X86/x86-64-pic-6.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movl a(%rip),} %t1
+; RUN: grep "movl a(%rip)," %t1
; RUN: not grep GOTPCREL %t1
@a = internal global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-7.ll b/test/CodeGen/X86/x86-64-pic-7.ll
index 4d98ee6..de240a3 100644
--- a/test/CodeGen/X86/x86-64-pic-7.ll
+++ b/test/CodeGen/X86/x86-64-pic-7.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movq f@GOTPCREL(%rip),} %t1
+; RUN: grep "movq f@GOTPCREL(%rip)," %t1
define void ()* @g() nounwind {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-8.ll b/test/CodeGen/X86/x86-64-pic-8.ll
index d3b567c..db35c33 100644
--- a/test/CodeGen/X86/x86-64-pic-8.ll
+++ b/test/CodeGen/X86/x86-64-pic-8.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {leaq f(%rip),} %t1
+; RUN: grep "leaq f(%rip)," %t1
; RUN: not grep GOTPCREL %t1
define void ()* @g() {
diff --git a/test/CodeGen/X86/x86-64-pic-9.ll b/test/CodeGen/X86/x86-64-pic-9.ll
index 0761031..6daea84 100644
--- a/test/CodeGen/X86/x86-64-pic-9.ll
+++ b/test/CodeGen/X86/x86-64-pic-9.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {leaq f(%rip),} %t1
+; RUN: grep "leaq f(%rip)," %t1
; RUN: not grep GOTPCREL %t1
define void ()* @g() nounwind {
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index a2521b0..8af782c 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -875,37 +875,37 @@ define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
}
declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
-define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0, <4 x float> %a1) {
+define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0) {
; CHECK-NOT: mov
; CHECK: vfrczss
- %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0, <4 x float> %a1) ;
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0) ;
ret <4 x float> %res
}
-define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(<4 x float> %a0, float* %a1) {
+define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
; CHECK-NOT: mov
; CHECK: vfrczss
- %elem = load float* %a1
+ %elem = load float* %a0
%vec = insertelement <4 x float> undef, float %elem, i32 0
- %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0, <4 x float> %vec) ;
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %vec) ;
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
-define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0, <2 x double> %a1) {
+define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0) {
; CHECK-NOT: mov
; CHECK: vfrczsd
- %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0, <2 x double> %a1) ;
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0) ;
ret <2 x double> %res
}
-define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(<2 x double> %a0, double* %a1) {
+define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
; CHECK-NOT: mov
; CHECK: vfrczsd
- %elem = load double* %a1
+ %elem = load double* %a0
%vec = insertelement <2 x double> undef, double %elem, i32 0
- %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0, <2 x double> %vec) ;
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %vec) ;
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_pd(<2 x double> %a0) {
; CHECK: vfrczpd
@@ -967,3 +967,59 @@ define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
}
declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
+define <16 x i8> @test_int_x86_xop_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: vpcomb
+ %res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: vpcomw
+ %res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: vpcomd
+ %res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: vpcomq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: vpcomub
+ %res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: vpcomuw
+ %res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: vpcomud
+ %res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: vpcomuq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
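+
+; Note: the trailing i8 immediate selects the VPCOM comparison predicate
+; (0=lt, 1=le, 2=gt, 3=ge, 4=eq, 5=ne, 6=false, 7=true); every call above
+; uses 0, i.e. a "less than" compare.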
+
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index ddc4cab..996bfc4 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
; Though it is undefined, we want xor undef,undef to produce zero.
define <4 x i32> @test1() nounwind {
@@ -31,7 +31,7 @@ entry:
; X64: test3:
; X64: notl
; X64: andl
-; X64: shrl %eax
+; X64: shrl
; X64: ret
; X32: test3:
diff --git a/test/CodeGen/XCore/mkmsk.ll b/test/CodeGen/XCore/mkmsk.ll
new file mode 100644
index 0000000..377612b
--- /dev/null
+++ b/test/CodeGen/XCore/mkmsk.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+define i32 @f(i32) nounwind {
+; CHECK: f:
+; CHECK: mkmsk r0, r0
+; CHECK-NEXT: retsp 0
+entry:
+ %1 = shl i32 1, %0
+ %2 = add i32 %1, -1
+ ret i32 %2
+}
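+
+; Worked example (illustrative): with r0 = 3, %1 = 1 << 3 = 8 and
+; %2 = 8 - 1 = 7 = 0b111, a 3-bit mask, which is exactly what
+; "mkmsk r0, r0" computes in a single instruction.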
diff --git a/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64
new file mode 100755
index 0000000..fe20c8e
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64
Binary files differ
diff --git a/test/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64
new file mode 100755
index 0000000..ce4af7f
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64
Binary files differ
diff --git a/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64
new file mode 100755
index 0000000..7c17304
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64
Binary files differ
diff --git a/test/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64
new file mode 100755
index 0000000..8848708
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64
Binary files differ
diff --git a/test/DebugInfo/X86/DW_AT_location-reference.ll b/test/DebugInfo/X86/DW_AT_location-reference.ll
new file mode 100644
index 0000000..3be9aba
--- /dev/null
+++ b/test/DebugInfo/X86/DW_AT_location-reference.ll
@@ -0,0 +1,111 @@
+; RUN: llc -O1 -mtriple=x86_64-apple-darwin < %s | FileCheck -check-prefix=DARWIN %s
+; RUN: llc -O1 -mtriple=x86_64-pc-linux-gnu < %s | FileCheck -check-prefix=LINUX %s
+; PR9493
+; Adapted from the original test case in r127757.
+; We use 'llc -O1' to induce variable 'x' to live in different locations.
+; We don't actually care where 'x' lives, or what exact optimizations get
+; used; as long as 'x' moves around, we're fine.
+
+; // The variable 'x' lives in different locations, so it needs an entry in
+; // the .debug_loc table section, referenced by DW_AT_location.
+; // This ref is not relocatable on Darwin, but is relocatable elsewhere.
+; extern int g(int, int);
+; extern int a;
+;
+; void f(void) {
+; int x;
+; a = g(0, 0);
+; x = 1;
+; while (x & 1) { x *= a; }
+; a = g(x, 0);
+; x = 2;
+; while (x & 2) { x *= a; }
+; a = g(0, x);
+; }
+
+; // The 'x' variable and its symbol reference location
+; DARWIN: DW_TAG_variable
+; DARWIN-NEXT: ## DW_AT_name
+; DARWIN-NEXT: .long Lset{{[0-9]+}}
+; DARWIN-NEXT: ## DW_AT_decl_file
+; DARWIN-NEXT: ## DW_AT_decl_line
+; DARWIN-NEXT: ## DW_AT_type
+; DARWIN-NEXT: Lset{{[0-9]+}} = Ldebug_loc{{[0-9]+}}-Lsection_debug_loc ## DW_AT_location
+; DARWIN-NEXT: .long Lset{{[0-9]+}}
+
+; LINUX: DW_TAG_variable
+; LINUX-NEXT: # DW_AT_name
+; LINUX-NEXT: # DW_AT_decl_file
+; LINUX-NEXT: # DW_AT_decl_line
+; LINUX-NEXT: # DW_AT_type
+; LINUX-NEXT: .long .Ldebug_loc{{[0-9]+}} # DW_AT_location
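+
+; The difference checked above: Darwin emits the DW_AT_location reference as
+; an assembly-time difference (Lset = Ldebug_loc - Lsection_debug_loc), while
+; Linux emits a plain .long carrying a relocation against .Ldebug_loc.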
+
+
+; ModuleID = 'simple.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
+
+@a = external global i32
+
+define void @f() nounwind {
+entry:
+ %call = tail call i32 @g(i32 0, i32 0) nounwind, !dbg !8
+ store i32 %call, i32* @a, align 4, !dbg !8, !tbaa !9
+ tail call void @llvm.dbg.value(metadata !12, i64 0, metadata !5), !dbg !13
+ br label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %x.017 = phi i32 [ 1, %entry ], [ %mul, %while.body ]
+ %mul = mul nsw i32 %call, %x.017, !dbg !14
+ %and = and i32 %mul, 1, !dbg !14
+ %tobool = icmp eq i32 %and, 0, !dbg !14
+ br i1 %tobool, label %while.end, label %while.body, !dbg !14
+
+while.end: ; preds = %while.body
+ tail call void @llvm.dbg.value(metadata !{i32 %mul}, i64 0, metadata !5), !dbg !14
+ %call4 = tail call i32 @g(i32 %mul, i32 0) nounwind, !dbg !15
+ store i32 %call4, i32* @a, align 4, !dbg !15, !tbaa !9
+ tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !5), !dbg !17
+ br label %while.body9
+
+while.body9: ; preds = %while.end, %while.body9
+ %x.116 = phi i32 [ 2, %while.end ], [ %mul12, %while.body9 ]
+ %mul12 = mul nsw i32 %call4, %x.116, !dbg !18
+ %and7 = and i32 %mul12, 2, !dbg !18
+ %tobool8 = icmp eq i32 %and7, 0, !dbg !18
+ br i1 %tobool8, label %while.end13, label %while.body9, !dbg !18
+
+while.end13: ; preds = %while.body9
+ tail call void @llvm.dbg.value(metadata !{i32 %mul12}, i64 0, metadata !5), !dbg !18
+ %call15 = tail call i32 @g(i32 0, i32 %mul12) nounwind, !dbg !19
+ store i32 %call15, i32* @a, align 4, !dbg !19, !tbaa !9
+ ret void, !dbg !20
+}
+
+declare i32 @g(i32, i32)
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0}
+!llvm.dbg.lv.f = !{!5}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"simple.c", metadata !"/home/rengol01/temp/tests/dwarf/relocation", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"simple.c", metadata !"/home/rengol01/temp/tests/dwarf/relocation", metadata !"clang version 3.0 (trunk)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
+!5 = metadata !{i32 590080, metadata !6, metadata !"x", metadata !1, i32 5, metadata !7, i32 0} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{i32 589835, metadata !0, i32 4, i32 14, metadata !1, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 6, i32 3, metadata !6, null}
+!9 = metadata !{metadata !"int", metadata !10}
+!10 = metadata !{metadata !"omnipotent char", metadata !11}
+!11 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!12 = metadata !{i32 1}
+!13 = metadata !{i32 7, i32 3, metadata !6, null}
+!14 = metadata !{i32 8, i32 3, metadata !6, null}
+!15 = metadata !{i32 9, i32 3, metadata !6, null}
+!16 = metadata !{i32 2}
+!17 = metadata !{i32 10, i32 3, metadata !6, null}
+!18 = metadata !{i32 11, i32 3, metadata !6, null}
+!19 = metadata !{i32 12, i32 3, metadata !6, null}
+!20 = metadata !{i32 13, i32 1, metadata !6, null}
diff --git a/test/DebugInfo/X86/aligned_stack_var.ll b/test/DebugInfo/X86/aligned_stack_var.ll
new file mode 100644
index 0000000..9e6c7ff
--- /dev/null
+++ b/test/DebugInfo/X86/aligned_stack_var.ll
@@ -0,0 +1,42 @@
+; RUN: llc %s -mtriple=x86_64-pc-linux-gnu -O0 -filetype=obj -o %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; If the stack is realigned, we shouldn't describe the locations of local
+; variables by giving an offset from the frame pointer (%rbp):
+; push %rbp
+; mov %rsp,%rbp
+; and ALIGNMENT,%rsp ; (%rsp and %rbp are different now)
+; It's better to use an offset from %rsp instead.
+
+; DW_AT_location of variable "x" shouldn't be equal to
+; (DW_OP_fbreg: .*): DW_OP_fbreg has code 0x91
+
+; CHECK: {{0x.* DW_TAG_variable}}
+; CHECK-NOT: {{DW_AT_location.*DW_FORM_block1.*0x.*91}}
+; CHECK: NULL
+
+define void @_Z3runv() nounwind uwtable {
+entry:
+ %x = alloca i32, align 32
+ call void @llvm.dbg.declare(metadata !{i32* %x}, metadata !9), !dbg !12
+ ret void, !dbg !13
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test.cc", metadata !"/home/samsonov/debuginfo", metadata !"clang version 3.2 (trunk 155696:155697) (llvm/trunk 155696)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"run", metadata !"run", metadata !"_Z3runv", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"test.cc", metadata !"/home/samsonov/debuginfo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{null}
+!9 = metadata !{i32 786688, metadata !10, metadata !"x", metadata !6, i32 2, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{i32 786443, metadata !5, i32 1, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!12 = metadata !{i32 2, i32 7, metadata !10, null}
+!13 = metadata !{i32 3, i32 1, metadata !10, null}
diff --git a/test/DebugInfo/X86/enum-class.ll b/test/DebugInfo/X86/enum-class.ll
new file mode 100644
index 0000000..6eb715d
--- /dev/null
+++ b/test/DebugInfo/X86/enum-class.ll
@@ -0,0 +1,45 @@
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+@a = global i32 0, align 4
+@b = global i64 0, align 8
+@c = global i32 0, align 4
+
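+; A plausible C++11 source, reconstructed from the metadata below (an
+; assumption; the original foo.cpp is not included here):
+;   enum class A { A1 = 1 };                  // underlying type int
+;   enum class B : unsigned long { B1 = 1 };  // underlying type unsigned long
+;   enum C { C1 = 1 };                        // plain enum, no DW_AT_enum_class
+;   A a; B b; C c;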
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157269) (llvm/trunk 157264)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !15, metadata !15, metadata !17} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{metadata !3, metadata !8, metadata !12}
+!3 = metadata !{i32 786436, null, metadata !"A", metadata !4, i32 1, i64 32, i64 32, i32 0, i32 0, metadata !5, metadata !6, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!4 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ]
+!5 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786472, metadata !"A1", i64 1} ; [ DW_TAG_enumerator ]
+!8 = metadata !{i32 786436, null, metadata !"B", metadata !4, i32 2, i64 64, i64 64, i32 0, i32 0, metadata !9, metadata !10, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!9 = metadata !{i32 786468, null, metadata !"long unsigned int", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !11}
+!11 = metadata !{i32 786472, metadata !"B1", i64 1} ; [ DW_TAG_enumerator ]
+!12 = metadata !{i32 786436, null, metadata !"C", metadata !4, i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!13 = metadata !{metadata !14}
+!14 = metadata !{i32 786472, metadata !"C1", i64 1} ; [ DW_TAG_enumerator ]
+!15 = metadata !{metadata !16}
+!16 = metadata !{i32 0}
+!17 = metadata !{metadata !18}
+!18 = metadata !{metadata !19, metadata !20, metadata !21}
+!19 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !4, i32 4, metadata !3, i32 0, i32 1, i32* @a} ; [ DW_TAG_variable ]
+!20 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !4, i32 5, metadata !8, i32 0, i32 1, i64* @b} ; [ DW_TAG_variable ]
+!21 = metadata !{i32 786484, i32 0, null, metadata !"c", metadata !"c", metadata !"", metadata !4, i32 6, metadata !12, i32 0, i32 1, i32* @c} ; [ DW_TAG_variable ]
+
+; CHECK: DW_TAG_enumeration_type [3]
+; CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x0026 => {0x00000026})
+; CHECK: DW_AT_enum_class [DW_FORM_flag] (0x01)
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[{{.*}}] = "A")
+
+; CHECK: DW_TAG_enumeration_type [3] *
+; CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x0057 => {0x00000057})
+; CHECK: DW_AT_enum_class [DW_FORM_flag] (0x01)
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[{{.*}}] = "B")
+
+; CHECK: DW_TAG_enumeration_type [6]
+; CHECK-NOT: DW_AT_enum_class
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[{{.*}}] = "C")
diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll
new file mode 100644
index 0000000..c2dacea
--- /dev/null
+++ b/test/DebugInfo/X86/enum-fwd-decl.ll
@@ -0,0 +1,22 @@
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+@e = global i16 0, align 2
+
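+; A plausible C++11 source (an assumption based on the 16-bit size and the
+; DW_AT_declaration check below): an opaque enum with a fixed underlying type.
+;   enum E : short;
+;   E e;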
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157772) (llvm/trunk 157761)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !6, metadata !6, metadata !7} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{metadata !3}
+!3 = metadata !{i32 786436, null, metadata !"E", metadata !4, i32 1, i64 16, i64 16, i32 0, i32 4, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!4 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ]
+!5 = metadata !{i32 0}
+!6 = metadata !{metadata !5}
+!7 = metadata !{metadata !8}
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !4, i32 2, metadata !3, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ]
+
+; CHECK: DW_TAG_enumeration_type
+; CHECK-NEXT: DW_AT_name
+; CHECK-NEXT: DW_AT_byte_size
+; CHECK-NEXT: DW_AT_declaration
diff --git a/test/DebugInfo/X86/op_deref.ll b/test/DebugInfo/X86/op_deref.ll
new file mode 100644
index 0000000..c84b2e6
--- /dev/null
+++ b/test/DebugInfo/X86/op_deref.ll
@@ -0,0 +1,89 @@
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000067] = "vla")
+; FIXME: The location here needs to be fixed, but llvm-dwarfdump doesn't handle
+; DW_AT_location lists yet.
+; CHECK: DW_AT_location [DW_FORM_data4] (0x00000000)
+
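+; A plausible C source, reconstructed from the IR below (an assumption; the
+; original bar.c is not included here):
+;   void testVLAwithSize(int s) {
+;     int vla[s];
+;     int i;
+;     for (i = 0; i < s; i++)
+;       vla[i] = i * i;
+;   }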
+define void @testVLAwithSize(i32 %s) nounwind uwtable ssp {
+entry:
+ %s.addr = alloca i32, align 4
+ %saved_stack = alloca i8*
+ %i = alloca i32, align 4
+ store i32 %s, i32* %s.addr, align 4
+ call void @llvm.dbg.declare(metadata !{i32* %s.addr}, metadata !10), !dbg !11
+ %0 = load i32* %s.addr, align 4, !dbg !12
+ %1 = zext i32 %0 to i64, !dbg !12
+ %2 = call i8* @llvm.stacksave(), !dbg !12
+ store i8* %2, i8** %saved_stack, !dbg !12
+ %vla = alloca i32, i64 %1, align 16, !dbg !12
+ call void @llvm.dbg.declare(metadata !{i32* %vla}, metadata !14), !dbg !18
+ call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !19), !dbg !20
+ store i32 0, i32* %i, align 4, !dbg !21
+ br label %for.cond, !dbg !21
+
+for.cond: ; preds = %for.inc, %entry
+ %3 = load i32* %i, align 4, !dbg !21
+ %4 = load i32* %s.addr, align 4, !dbg !21
+ %cmp = icmp slt i32 %3, %4, !dbg !21
+ br i1 %cmp, label %for.body, label %for.end, !dbg !21
+
+for.body: ; preds = %for.cond
+ %5 = load i32* %i, align 4, !dbg !23
+ %6 = load i32* %i, align 4, !dbg !23
+ %mul = mul nsw i32 %5, %6, !dbg !23
+ %7 = load i32* %i, align 4, !dbg !23
+ %idxprom = sext i32 %7 to i64, !dbg !23
+ %arrayidx = getelementptr inbounds i32* %vla, i64 %idxprom, !dbg !23
+ store i32 %mul, i32* %arrayidx, align 4, !dbg !23
+ br label %for.inc, !dbg !25
+
+for.inc: ; preds = %for.body
+ %8 = load i32* %i, align 4, !dbg !26
+ %inc = add nsw i32 %8, 1, !dbg !26
+ store i32 %inc, i32* %i, align 4, !dbg !26
+ br label %for.cond, !dbg !26
+
+for.end: ; preds = %for.cond
+ %9 = load i8** %saved_stack, !dbg !27
+ call void @llvm.stackrestore(i8* %9), !dbg !27
+ ret void, !dbg !27
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare i8* @llvm.stacksave() nounwind
+
+declare void @llvm.stackrestore(i8*) nounwind
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"bar.c", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 156005) (llvm/trunk 156000)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"testVLAwithSize", metadata !"testVLAwithSize", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @testVLAwithSize, null, null, metadata !1, i32 2} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"bar.c", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!10 = metadata !{i32 786689, metadata !5, metadata !"s", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{i32 1, i32 26, metadata !5, null}
+!12 = metadata !{i32 3, i32 13, metadata !13, null}
+!13 = metadata !{i32 786443, metadata !5, i32 2, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{i32 786688, metadata !13, metadata !"vla", metadata !6, i32 3, metadata !15, i32 0, i32 0, i64 2} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 786465, i64 1, i64 0} ; [ DW_TAG_subrange_type ]
+!18 = metadata !{i32 3, i32 7, metadata !13, null}
+!19 = metadata !{i32 786688, metadata !13, metadata !"i", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+!20 = metadata !{i32 4, i32 7, metadata !13, null}
+!21 = metadata !{i32 5, i32 8, metadata !22, null}
+!22 = metadata !{i32 786443, metadata !13, i32 5, i32 3, metadata !6, i32 1} ; [ DW_TAG_lexical_block ]
+!23 = metadata !{i32 6, i32 5, metadata !24, null}
+!24 = metadata !{i32 786443, metadata !22, i32 5, i32 27, metadata !6, i32 2} ; [ DW_TAG_lexical_block ]
+!25 = metadata !{i32 7, i32 3, metadata !24, null}
+!26 = metadata !{i32 5, i32 22, metadata !22, null}
+!27 = metadata !{i32 8, i32 1, metadata !13, null}
diff --git a/test/DebugInfo/X86/pr12831.ll b/test/DebugInfo/X86/pr12831.ll
new file mode 100644
index 0000000..abb946d
--- /dev/null
+++ b/test/DebugInfo/X86/pr12831.ll
@@ -0,0 +1,238 @@
+; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu -o /dev/null
+
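+; This is a compile-only regression test: it passes as long as llc does not
+; crash. Judging from the metadata, the input is a reduced C++ file in which
+; lambdas are passed to a function<void()>-like template (an assumption; the
+; original PR12831 source is not included here).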
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.function = type { i8 }
+%class.BPLFunctionWriter = type { %struct.BPLModuleWriter* }
+%struct.BPLModuleWriter = type { i8 }
+%class.anon = type { i8 }
+%class.anon.0 = type { i8 }
+
+@"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_" = alias internal void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_"
+@"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_0EET_" = alias internal void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_"
+
+define void @_ZN17BPLFunctionWriter9writeExprEv(%class.BPLFunctionWriter* %this) nounwind uwtable align 2 {
+entry:
+ %this.addr = alloca %class.BPLFunctionWriter*, align 8
+ %agg.tmp = alloca %class.function, align 1
+ %agg.tmp2 = alloca %class.anon, align 1
+ %agg.tmp4 = alloca %class.function, align 1
+ %agg.tmp5 = alloca %class.anon.0, align 1
+ store %class.BPLFunctionWriter* %this, %class.BPLFunctionWriter** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.BPLFunctionWriter** %this.addr}, metadata !133), !dbg !135
+ %this1 = load %class.BPLFunctionWriter** %this.addr
+ %MW = getelementptr inbounds %class.BPLFunctionWriter* %this1, i32 0, i32 0, !dbg !136
+ %0 = load %struct.BPLModuleWriter** %MW, align 8, !dbg !136
+ call void @"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_0EET_"(%class.function* %agg.tmp), !dbg !136
+ call void @_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE(%struct.BPLModuleWriter* %0), !dbg !136
+ %MW3 = getelementptr inbounds %class.BPLFunctionWriter* %this1, i32 0, i32 0, !dbg !138
+ %1 = load %struct.BPLModuleWriter** %MW3, align 8, !dbg !138
+ call void @"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_"(%class.function* %agg.tmp4), !dbg !138
+ call void @_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE(%struct.BPLModuleWriter* %1), !dbg !138
+ ret void, !dbg !139
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare void @_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE(%struct.BPLModuleWriter*)
+
+define internal void @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_"(%class.function* %this) unnamed_addr nounwind uwtable align 2 {
+entry:
+ %this.addr = alloca %class.function*, align 8
+ %__f = alloca %class.anon.0, align 1
+ store %class.function* %this, %class.function** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !140), !dbg !142
+ call void @llvm.dbg.declare(metadata !{%class.anon.0* %__f}, metadata !143), !dbg !144
+ %this1 = load %class.function** %this.addr
+ call void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_"(%class.anon.0* %__f), !dbg !145
+ ret void, !dbg !147
+}
+
+define internal void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_"(%class.anon.0*) nounwind uwtable align 2 {
+entry:
+ %.addr = alloca %class.anon.0*, align 8
+ store %class.anon.0* %0, %class.anon.0** %.addr, align 8
+ ret void, !dbg !148
+}
+
+define internal void @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_"(%class.function* %this) unnamed_addr nounwind uwtable align 2 {
+entry:
+ %this.addr = alloca %class.function*, align 8
+ %__f = alloca %class.anon, align 1
+ store %class.function* %this, %class.function** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !150), !dbg !151
+ call void @llvm.dbg.declare(metadata !{%class.anon* %__f}, metadata !152), !dbg !153
+ %this1 = load %class.function** %this.addr
+ call void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_"(%class.anon* %__f), !dbg !154
+ ret void, !dbg !156
+}
+
+define internal void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_"(%class.anon*) nounwind uwtable align 2 {
+entry:
+ %.addr = alloca %class.anon*, align 8
+ store %class.anon* %0, %class.anon** %.addr, align 8
+ ret void, !dbg !157
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"BPLFunctionWriter.cpp", metadata !"/home/peter/crashdelta", metadata !"clang version 3.2 ", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !128} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !106, metadata !107, metadata !126, metadata !127}
+!5 = metadata !{i32 786478, i32 0, null, metadata !"writeExpr", metadata !"writeExpr", metadata !"_ZN17BPLFunctionWriter9writeExprEv", metadata !6, i32 19, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.BPLFunctionWriter*)* @_ZN17BPLFunctionWriter9writeExprEv, null, metadata !103, metadata !1, i32 19} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"BPLFunctionWriter2.ii", metadata !"/home/peter/crashdelta", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{i32 786434, null, metadata !"BPLFunctionWriter", metadata !6, i32 15, i64 64, i64 64, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ]
+!11 = metadata !{metadata !12, metadata !103}
+!12 = metadata !{i32 786445, metadata !10, metadata !"MW", metadata !6, i32 16, i64 64, i64 64, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ]
+!13 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{i32 786434, null, metadata !"BPLModuleWriter", metadata !6, i32 12, i64 8, i64 8, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_class_type ]
+!15 = metadata !{metadata !16}
+!16 = metadata !{i32 786478, i32 0, metadata !14, metadata !"writeIntrinsic", metadata !"writeIntrinsic", metadata !"_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE", metadata !6, i32 13, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !101, i32 13} ; [ DW_TAG_subprogram ]
+!17 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!18 = metadata !{null, metadata !19, metadata !20}
+!19 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !14} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{i32 786434, null, metadata !"function<void ()>", metadata !6, i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !21, i32 0, null, metadata !97} ; [ DW_TAG_class_type ]
+!21 = metadata !{metadata !22, metadata !51, metadata !58, metadata !86, metadata !92}
+!22 = metadata !{i32 786478, i32 0, metadata !20, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"", metadata !6, i32 8, metadata !23, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !47, i32 0, metadata !49, i32 8} ; [ DW_TAG_subprogram ]
+!23 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !24, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!24 = metadata !{null, metadata !25, metadata !26}
+!25 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_pointer_type ]
+!26 = metadata !{i32 786434, metadata !5, metadata !"", metadata !6, i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !27, i32 0, null, null} ; [ DW_TAG_class_type ]
+!27 = metadata !{metadata !28, metadata !35, metadata !41}
+!28 = metadata !{i32 786478, i32 0, metadata !26, metadata !"operator()", metadata !"operator()", metadata !"", metadata !6, i32 20, metadata !29, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !33, i32 20} ; [ DW_TAG_subprogram ]
+!29 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!30 = metadata !{null, metadata !31}
+!31 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_const_type ]
+!33 = metadata !{metadata !34}
+!34 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!35 = metadata !{i32 786478, i32 0, metadata !26, metadata !"~", metadata !"~", metadata !"", metadata !6, i32 20, metadata !36, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !39, i32 20} ; [ DW_TAG_subprogram ]
+!36 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!37 = metadata !{null, metadata !38}
+!38 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !26} ; [ DW_TAG_pointer_type ]
+!39 = metadata !{metadata !40}
+!40 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!41 = metadata !{i32 786478, i32 0, metadata !26, metadata !"", metadata !"", metadata !"", metadata !6, i32 20, metadata !42, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !45, i32 20} ; [ DW_TAG_subprogram ]
+!42 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!43 = metadata !{null, metadata !38, metadata !44}
+!44 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_rvalue_reference_type ]
+!45 = metadata !{metadata !46}
+!46 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!47 = metadata !{metadata !48}
+!48 = metadata !{i32 786479, null, metadata !"_Functor", metadata !26, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!49 = metadata !{metadata !50}
+!50 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!51 = metadata !{i32 786478, i32 0, metadata !20, metadata !"function<function<void ()> >", metadata !"function<function<void ()> >", metadata !"", metadata !6, i32 8, metadata !52, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !54, i32 0, metadata !56, i32 8} ; [ DW_TAG_subprogram ]
+!52 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !53, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!53 = metadata !{null, metadata !25, metadata !20}
+!54 = metadata !{metadata !55}
+!55 = metadata !{i32 786479, null, metadata !"_Functor", metadata !20, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!56 = metadata !{metadata !57}
+!57 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!58 = metadata !{i32 786478, i32 0, metadata !20, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"", metadata !6, i32 8, metadata !59, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !82, i32 0, metadata !84, i32 8} ; [ DW_TAG_subprogram ]
+!59 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !60, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!60 = metadata !{null, metadata !25, metadata !61}
+!61 = metadata !{i32 786434, metadata !5, metadata !"", metadata !6, i32 23, i64 8, i64 8, i32 0, i32 0, null, metadata !62, i32 0, null, null} ; [ DW_TAG_class_type ]
+!62 = metadata !{metadata !63, metadata !70, metadata !76}
+!63 = metadata !{i32 786478, i32 0, metadata !61, metadata !"operator()", metadata !"operator()", metadata !"", metadata !6, i32 23, metadata !64, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !68, i32 23} ; [ DW_TAG_subprogram ]
+!64 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!65 = metadata !{null, metadata !66}
+!66 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !67} ; [ DW_TAG_pointer_type ]
+!67 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_const_type ]
+!68 = metadata !{metadata !69}
+!69 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!70 = metadata !{i32 786478, i32 0, metadata !61, metadata !"~", metadata !"~", metadata !"", metadata !6, i32 23, metadata !71, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !74, i32 23} ; [ DW_TAG_subprogram ]
+!71 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !72, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!72 = metadata !{null, metadata !73}
+!73 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !61} ; [ DW_TAG_pointer_type ]
+!74 = metadata !{metadata !75}
+!75 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!76 = metadata !{i32 786478, i32 0, metadata !61, metadata !"", metadata !"", metadata !"", metadata !6, i32 23, metadata !77, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !80, i32 23} ; [ DW_TAG_subprogram ]
+!77 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !78, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!78 = metadata !{null, metadata !73, metadata !79}
+!79 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_rvalue_reference_type ]
+!80 = metadata !{metadata !81}
+!81 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!82 = metadata !{metadata !83}
+!83 = metadata !{i32 786479, null, metadata !"_Functor", metadata !61, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!84 = metadata !{metadata !85}
+!85 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!86 = metadata !{i32 786478, i32 0, metadata !20, metadata !"function", metadata !"function", metadata !"", metadata !6, i32 6, metadata !87, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !90, i32 6} ; [ DW_TAG_subprogram ]
+!87 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !88, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!88 = metadata !{null, metadata !25, metadata !89}
+!89 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_rvalue_reference_type ]
+!90 = metadata !{metadata !91}
+!91 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!92 = metadata !{i32 786478, i32 0, metadata !20, metadata !"~function", metadata !"~function", metadata !"", metadata !6, i32 6, metadata !93, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !95, i32 6} ; [ DW_TAG_subprogram ]
+!93 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !94, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!94 = metadata !{null, metadata !25}
+!95 = metadata !{metadata !96}
+!96 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!97 = metadata !{metadata !98}
+!98 = metadata !{i32 786479, null, metadata !"T", metadata !99, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!99 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !100, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!100 = metadata !{null}
+!101 = metadata !{metadata !102}
+!102 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!103 = metadata !{i32 786478, i32 0, metadata !10, metadata !"writeExpr", metadata !"writeExpr", metadata !"_ZN17BPLFunctionWriter9writeExprEv", metadata !6, i32 17, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !104, i32 17} ; [ DW_TAG_subprogram ]
+!104 = metadata !{metadata !105}
+!105 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!106 = metadata !{i32 786478, i32 0, null, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", metadata !6, i32 8, metadata !59, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", metadata !82, metadata !58, metadata !1, i32 8} ; [ DW_TAG_subprogram ]
+!107 = metadata !{i32 786478, i32 0, null, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", metadata !6, i32 3, metadata !108, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.anon.0*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", metadata !111, metadata !113, metadata !1, i32 3} ; [ DW_TAG_subprogram ]
+!108 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !109, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!109 = metadata !{null, metadata !110}
+!110 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_reference_type ]
+!111 = metadata !{metadata !112}
+!112 = metadata !{i32 786479, null, metadata !"_Tp", metadata !61, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!113 = metadata !{i32 786478, i32 0, metadata !114, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", metadata !6, i32 3, metadata !108, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !111, i32 0, metadata !124, i32 3} ; [ DW_TAG_subprogram ]
+!114 = metadata !{i32 786434, null, metadata !"_Base_manager", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !115, i32 0, null, null} ; [ DW_TAG_class_type ]
+!115 = metadata !{metadata !116, metadata !113}
+!116 = metadata !{i32 786478, i32 0, metadata !114, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !6, i32 3, metadata !117, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !120, i32 0, metadata !122, i32 3} ; [ DW_TAG_subprogram ]
+!117 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !118, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!118 = metadata !{null, metadata !119}
+!119 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_reference_type ]
+!120 = metadata !{metadata !121}
+!121 = metadata !{i32 786479, null, metadata !"_Tp", metadata !26, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!122 = metadata !{metadata !123}
+!123 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!124 = metadata !{metadata !125}
+!125 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!126 = metadata !{i32 786478, i32 0, null, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", metadata !6, i32 8, metadata !23, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", metadata !47, metadata !22, metadata !1, i32 8} ; [ DW_TAG_subprogram ]
+!127 = metadata !{i32 786478, i32 0, null, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !6, i32 3, metadata !117, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.anon*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !120, metadata !116, metadata !1, i32 3} ; [ DW_TAG_subprogram ]
+!128 = metadata !{metadata !129}
+!129 = metadata !{metadata !130}
+!130 = metadata !{i32 786484, i32 0, metadata !114, metadata !"__stored_locally", metadata !"__stored_locally", metadata !"__stored_locally", metadata !6, i32 2, metadata !131, i32 1, i32 1, i1 true} ; [ DW_TAG_variable ]
+!131 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !132} ; [ DW_TAG_const_type ]
+!132 = metadata !{i32 786468, null, metadata !"bool", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
+!133 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777235, metadata !134, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!134 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
+!135 = metadata !{i32 19, i32 39, metadata !5, null}
+!136 = metadata !{i32 20, i32 17, metadata !137, null}
+!137 = metadata !{i32 786443, metadata !5, i32 19, i32 51, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
+!138 = metadata !{i32 23, i32 17, metadata !137, null}
+!139 = metadata !{i32 26, i32 15, metadata !137, null}
+!140 = metadata !{i32 786689, metadata !106, metadata !"this", metadata !6, i32 16777224, metadata !141, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!141 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ]
+!142 = metadata !{i32 8, i32 45, metadata !106, null}
+!143 = metadata !{i32 786689, metadata !106, metadata !"__f", metadata !6, i32 33554440, metadata !61, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!144 = metadata !{i32 8, i32 63, metadata !106, null}
+!145 = metadata !{i32 9, i32 9, metadata !146, null}
+!146 = metadata !{i32 786443, metadata !106, i32 8, i32 81, metadata !6, i32 1} ; [ DW_TAG_lexical_block ]
+!147 = metadata !{i32 10, i32 13, metadata !146, null}
+!148 = metadata !{i32 4, i32 5, metadata !149, null}
+!149 = metadata !{i32 786443, metadata !107, i32 3, i32 105, metadata !6, i32 2} ; [ DW_TAG_lexical_block ]
+!150 = metadata !{i32 786689, metadata !126, metadata !"this", metadata !6, i32 16777224, metadata !141, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!151 = metadata !{i32 8, i32 45, metadata !126, null}
+!152 = metadata !{i32 786689, metadata !126, metadata !"__f", metadata !6, i32 33554440, metadata !26, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!153 = metadata !{i32 8, i32 63, metadata !126, null}
+!154 = metadata !{i32 9, i32 9, metadata !155, null}
+!155 = metadata !{i32 786443, metadata !126, i32 8, i32 81, metadata !6, i32 3} ; [ DW_TAG_lexical_block ]
+!156 = metadata !{i32 10, i32 13, metadata !155, null}
+!157 = metadata !{i32 4, i32 5, metadata !158, null}
+!158 = metadata !{i32 786443, metadata !127, i32 3, i32 105, metadata !6, i32 4} ; [ DW_TAG_lexical_block ]
diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll
new file mode 100644
index 0000000..e820cb5
--- /dev/null
+++ b/test/DebugInfo/X86/pr13303.ll
@@ -0,0 +1,28 @@
+; RUN: llc %s -o %t -filetype=obj -mtriple=x86_64-unknown-linux-gnu
+; RUN: llvm-dwarfdump %t | FileCheck %s
+; PR13303
+
+; Check that the row at the end of the prologue (address 0x0 here) is marked is_stmt.
+; CHECK: 0x0000000000000000 {{.*}} is_stmt
+
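+; Equivalent C source, reconstructed from the IR below:
+;   int main() { return 0; }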
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ ret i32 0, !dbg !10
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"PR13303.c", metadata !"/home/probinson", metadata !"clang version 3.2 (trunk 160143)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!6 = metadata !{i32 786473, metadata !"PR13303.c", metadata !"/home/probinson", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 1, i32 14, metadata !11, null}
+!11 = metadata !{i32 786443, metadata !5, i32 1, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
diff --git a/test/DebugInfo/X86/rvalue-ref.ll b/test/DebugInfo/X86/rvalue-ref.ll
new file mode 100644
index 0000000..e73869d
--- /dev/null
+++ b/test/DebugInfo/X86/rvalue-ref.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -O0
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_TAG_rvalue_reference_type
+
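+; A plausible C++11 source (an assumption based on the mangled name _Z3fooOi
+; and the IR below):
+;   void foo(int &&i) {
+;     printf("%d\n", i);
+;   }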
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+define void @_Z3fooOi(i32* %i) uwtable ssp {
+entry:
+ %i.addr = alloca i32*, align 8
+ store i32* %i, i32** %i.addr, align 8
+ call void @llvm.dbg.declare(metadata !{i32** %i.addr}, metadata !11), !dbg !12
+ %0 = load i32** %i.addr, align 8, !dbg !13
+ %1 = load i32* %0, align 4, !dbg !13
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1), !dbg !13
+ ret void, !dbg !15
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare i32 @printf(i8*, ...)
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157054) (llvm/trunk 157060)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooOi", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*)* @_Z3fooOi, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_rvalue_reference_type ]
+!10 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!11 = metadata !{i32 786689, metadata !5, metadata !"i", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{i32 4, i32 17, metadata !5, null}
+!13 = metadata !{i32 6, i32 3, metadata !14, null}
+!14 = metadata !{i32 786443, metadata !5, i32 5, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{i32 7, i32 1, metadata !14, null}
diff --git a/test/DebugInfo/dwarfdump-test.test b/test/DebugInfo/dwarfdump-test.test
new file mode 100644
index 0000000..de23dcd
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-test.test
@@ -0,0 +1,46 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 \
+RUN: --address=0x400589 --functions | FileCheck %s -check-prefix MAIN
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 \
+RUN: --address=0x400558 --functions | FileCheck %s -check-prefix FUNCTION
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 \
+RUN: --address=0x4005b6 --functions | FileCheck %s -check-prefix CTOR_WITH_SPEC
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test2.elf-x86-64 \
+RUN: --address=0x4004b8 --functions | FileCheck %s -check-prefix MANY_CU_1
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test2.elf-x86-64 \
+RUN: --address=0x4004c4 --functions | FileCheck %s -check-prefix MANY_CU_2
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test3.elf-x86-64 \
+RUN: --address=0x580 --functions | FileCheck %s -check-prefix ABS_ORIGIN_1
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test3.elf-x86-64 \
+RUN: --address=0x573 --functions | FileCheck %s -check-prefix INCLUDE_TEST_1
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test3.elf-x86-64 \
+RUN: --address=0x56d --functions | FileCheck %s -check-prefix INCLUDE_TEST_2
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \
+RUN: --address=0x55c --functions \
+RUN: | FileCheck %s -check-prefix MANY_SEQ_IN_LINE_TABLE
+
+MAIN: main
+MAIN-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16:10
+
+FUNCTION: _Z1fii
+FUNCTION-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:11:18
+
+CTOR_WITH_SPEC: _ZN10DummyClassC1Ei
+CTOR_WITH_SPEC-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:4:30
+
+MANY_CU_1: a
+MANY_CU_1-NEXT: /tmp/dbginfo{{[/\\]}}a.cc:2:0
+
+MANY_CU_2: main
+MANY_CU_2-NEXT: /tmp/dbginfo{{[/\\]}}main.cc:4:0
+
+ABS_ORIGIN_1: C
+ABS_ORIGIN_1-NEXT: /tmp/dbginfo{{[/\\]}}def2.cc:4:0
+
+INCLUDE_TEST_1: _Z3do2v
+INCLUDE_TEST_1-NEXT: /tmp/dbginfo{{[/\\]}}include{{[/\\]}}decl2.h:1:0
+
+INCLUDE_TEST_2: _Z3do1v
+INCLUDE_TEST_2-NEXT: /tmp/include{{[/\\]}}decl.h:5:0
+
+MANY_SEQ_IN_LINE_TABLE: _Z1cv
+MANY_SEQ_IN_LINE_TABLE-NEXT: /tmp/dbginfo/sequences{{[/\\]}}c.cc:2:0
diff --git a/test/DebugInfo/inlined-vars.ll b/test/DebugInfo/inlined-vars.ll
new file mode 100644
index 0000000..ed4e7da
--- /dev/null
+++ b/test/DebugInfo/inlined-vars.ll
@@ -0,0 +1,57 @@
+; RUN: llc -O0 < %s | FileCheck %s -check-prefix ARGUMENT
+; RUN: llc -O0 < %s | FileCheck %s -check-prefix VARIABLE
+; PR 13202
+
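+; A plausible C++ source (an assumption; only the shape matters for the
+; checks below):
+;   static inline int f(int argument) {
+;     int local = argument;  // both variables get an abstract and an inlined DIE
+;     smth(argument);
+;     smth(local);
+;     return 0;
+;   }
+;   int main() { return f(0); }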
+define i32 @main() uwtable {
+entry:
+ tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !18), !dbg !21
+ tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !22), !dbg !23
+ tail call void @smth(i32 0), !dbg !24
+ tail call void @smth(i32 0), !dbg !25
+ ret i32 0, !dbg !19
+}
+
+declare void @smth(i32)
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"inline-bug.cc", metadata !"/tmp/dbginfo/pr13202", metadata !"clang version 3.2 (trunk 159419)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !10}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 10, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !1, i32 10} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"inline-bug.cc", metadata !"/tmp/dbginfo/pr13202", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!10 = metadata !{i32 786478, i32 0, metadata !6, metadata !"f", metadata !"f", metadata !"_ZL1fi", metadata !6, i32 3, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !13, i32 3} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!12 = metadata !{metadata !9, metadata !9}
+!13 = metadata !{metadata !14}
+!14 = metadata !{metadata !15, metadata !16}
+!15 = metadata !{i32 786689, metadata !10, metadata !"argument", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+
+; Two DW_TAG_formal_parameter: one abstract and one inlined.
+; ARGUMENT: {{.*Abbrev.*DW_TAG_formal_parameter}}
+; ARGUMENT: {{.*Abbrev.*DW_TAG_formal_parameter}}
+; ARGUMENT-NOT: {{.*Abbrev.*DW_TAG_formal_parameter}}
+
+!16 = metadata !{i32 786688, metadata !17, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+
+; Two DW_TAG_variable: one abstract and one inlined.
+; VARIABLE: {{.*Abbrev.*DW_TAG_variable}}
+; VARIABLE: {{.*Abbrev.*DW_TAG_variable}}
+; VARIABLE-NOT: {{.*Abbrev.*DW_TAG_variable}}
+
+!17 = metadata !{i32 786443, metadata !10, i32 3, i32 35, metadata !6, i32 1} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{i32 786689, metadata !10, metadata !"argument", metadata !6, i32 16777219, metadata !9, i32 0, metadata !19} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{i32 11, i32 10, metadata !20, null}
+!20 = metadata !{i32 786443, metadata !5, i32 10, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
+!21 = metadata !{i32 3, i32 25, metadata !10, metadata !19}
+!22 = metadata !{i32 786688, metadata !17, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, metadata !19} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{i32 4, i32 16, metadata !17, metadata !19}
+!24 = metadata !{i32 5, i32 3, metadata !17, metadata !19}
+!25 = metadata !{i32 6, i32 3, metadata !17, metadata !19}
diff --git a/test/DebugInfo/lit.local.cfg b/test/DebugInfo/lit.local.cfg
index 19eebc0..00bd9b8 100644
--- a/test/DebugInfo/lit.local.cfg
+++ b/test/DebugInfo/lit.local.cfg
@@ -1 +1 @@
-config.suffixes = ['.ll', '.c', '.cpp']
+config.suffixes = ['.ll', '.c', '.cpp', '.test']
diff --git a/test/DebugInfo/printdbginfo2.ll b/test/DebugInfo/printdbginfo2.ll
index 3193791..396ae85 100644
--- a/test/DebugInfo/printdbginfo2.ll
+++ b/test/DebugInfo/printdbginfo2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -print-dbginfo -disable-output |& FileCheck %s
+; RUN: opt < %s -print-dbginfo -disable-output 2>&1 | FileCheck %s
; grep {%b is variable b of type x declared at x.c:7} %t1
; grep {%2 is variable b of type x declared at x.c:7} %t1
; grep {@c.1442 is variable c of type int declared at x.c:4} %t1
diff --git a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
new file mode 100644
index 0000000..46273d3
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
@@ -0,0 +1,37 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+@.LC0 = internal global [10 x i8] c"argc: %d\0A\00" ; <[10 x i8]*> [#uses=1]
+
+declare i32 @puts(i8*)
+
+define void @getoptions(i32* %argc) {
+bb0:
+ ret void
+}
+
+declare i32 @printf(i8*, ...)
+
+define i32 @main(i32 %argc, i8** %argv) {
+bb0:
+ call i32 (i8*, ...)* @printf( i8* getelementptr ([10 x i8]* @.LC0, i64 0, i64 0), i32 %argc ) ; <i32>:0 [#uses=0]
+ %cast224 = bitcast i8** %argv to i8* ; <i8*> [#uses=1]
+ %local = alloca i8* ; <i8**> [#uses=3]
+ store i8* %cast224, i8** %local
+ %cond226 = icmp sle i32 %argc, 0 ; <i1> [#uses=1]
+ br i1 %cond226, label %bb3, label %bb2
+bb2: ; preds = %bb2, %bb0
+ %cann-indvar = phi i32 [ 0, %bb0 ], [ %add1-indvar, %bb2 ] ; <i32> [#uses=2]
+ %add1-indvar = add i32 %cann-indvar, 1 ; <i32> [#uses=2]
+ %cann-indvar-idxcast = sext i32 %cann-indvar to i64 ; <i64> [#uses=1]
+ %CT = bitcast i8** %local to i8*** ; <i8***> [#uses=1]
+ %reg115 = load i8*** %CT ; <i8**> [#uses=1]
+ %cast235 = getelementptr i8** %reg115, i64 %cann-indvar-idxcast ; <i8**> [#uses=1]
+ %reg117 = load i8** %cast235 ; <i8*> [#uses=1]
+ %reg236 = call i32 @puts( i8* %reg117 ) ; <i32> [#uses=0]
+ %cond239 = icmp slt i32 %add1-indvar, %argc ; <i1> [#uses=1]
+ br i1 %cond239, label %bb2, label %bb3
+bb3: ; preds = %bb2, %bb0
+ %cast243 = bitcast i8** %local to i32* ; <i32*> [#uses=1]
+ call void @getoptions( i32* %cast243 )
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
new file mode 100644
index 0000000..88bfbb3
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
@@ -0,0 +1,13 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @foo(i32 %X, i32 %Y, double %A) {
+ %cond212 = fcmp une double %A, 1.000000e+00 ; <i1> [#uses=1]
+ %cast110 = zext i1 %cond212 to i32 ; <i32> [#uses=1]
+ ret i32 %cast110
+}
+
+define i32 @main() {
+ %reg212 = call i32 @foo( i32 0, i32 1, double 1.000000e+00 ) ; <i32> [#uses=1]
+ ret i32 %reg212
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
new file mode 100644
index 0000000..d5f860d
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
@@ -0,0 +1,20 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+ call i32 @mylog( i32 4 ) ; <i32>:1 [#uses=0]
+ ret i32 0
+}
+
+define internal i32 @mylog(i32 %num) {
+bb0:
+ br label %bb2
+bb2: ; preds = %bb2, %bb0
+ %reg112 = phi i32 [ 10, %bb2 ], [ 1, %bb0 ] ; <i32> [#uses=1]
+ %cann-indvar = phi i32 [ %cann-indvar, %bb2 ], [ 0, %bb0 ] ; <i32> [#uses=1]
+ %reg114 = add i32 %reg112, 1 ; <i32> [#uses=2]
+ %cond222 = icmp slt i32 %reg114, %num ; <i1> [#uses=1]
+ br i1 %cond222, label %bb2, label %bb3
+bb3: ; preds = %bb2
+ ret i32 %reg114
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
new file mode 100644
index 0000000..721f2e8
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+; <label>:0
+ br label %Loop
+Loop: ; preds = %Loop, %0
+ %X = phi i32 [ 0, %0 ], [ 1, %Loop ] ; <i32> [#uses=1]
+ br i1 true, label %Out, label %Loop
+Out: ; preds = %Loop
+ ret i32 %X
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
new file mode 100644
index 0000000..d17df99
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; We were accidentally inverting the signedness of right shifts. Whoops.
+
+define i32 @main() {
+ %X = ashr i32 -1, 16 ; <i32> [#uses=1]
+ %Y = ashr i32 %X, 16 ; <i32> [#uses=1]
+ %Z = add i32 %Y, 1 ; <i32> [#uses=1]
+ ret i32 %Z
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
new file mode 100644
index 0000000..e55cb06
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
@@ -0,0 +1,10 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+ %X = fadd double 0.000000e+00, 1.000000e+00 ; <double> [#uses=1]
+ %Y = fsub double 0.000000e+00, 1.000000e+00 ; <double> [#uses=2]
+ %Z = fcmp oeq double %X, %Y ; <i1> [#uses=0]
+ fadd double %Y, 0.000000e+00 ; <double>:1 [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
new file mode 100644
index 0000000..663dc40
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @bar(i8* %X) {
+  ; the pointer should be at least 4-byte aligned, so its low two bits are zero!
+ %P = alloca double ; <double*> [#uses=1]
+ %R = ptrtoint double* %P to i32 ; <i32> [#uses=1]
+ %A = and i32 %R, 3 ; <i32> [#uses=1]
+ ret i32 %A
+}
+
+define i32 @main() {
+ %SP = alloca i8 ; <i8*> [#uses=1]
+ %X = add i32 0, 0 ; <i32> [#uses=1]
+ alloca i8, i32 %X ; <i8*>:1 [#uses=0]
+ call i32 @bar( i8* %SP ) ; <i32>:2 [#uses=1]
+ ret i32 %2
+}
diff --git a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
new file mode 100644
index 0000000..e95294b
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
@@ -0,0 +1,19 @@
+; This testcase should return with an exit code of 1.
+;
+; RUN: not %lli -use-mcjit %s
+
+@test = global i64 0 ; <i64*> [#uses=1]
+
+define internal i64 @test.upgrd.1() {
+ %tmp.0 = load i64* @test ; <i64> [#uses=1]
+ %tmp.1 = add i64 %tmp.0, 1 ; <i64> [#uses=1]
+ ret i64 %tmp.1
+}
+
+define i32 @main() {
+ %L = call i64 @test.upgrd.1( ) ; <i64> [#uses=1]
+ %I = trunc i64 %L to i32 ; <i32> [#uses=1]
+ ret i32 %I
+}
+
+
diff --git a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
new file mode 100644
index 0000000..a237194
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-mcjit %s test
+
+declare i32 @puts(i8*)
+
+define i32 @main(i32 %argc.1, i8** %argv.1) {
+ %tmp.5 = getelementptr i8** %argv.1, i64 1 ; <i8**> [#uses=1]
+ %tmp.6 = load i8** %tmp.5 ; <i8*> [#uses=1]
+ %tmp.0 = call i32 @puts( i8* %tmp.6 ) ; <i32> [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
new file mode 100644
index 0000000..70464a3
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
@@ -0,0 +1,15 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+target datalayout = "e-p:32:32"
+
+define i32 @main() {
+entry:
+ br label %endif
+then: ; No predecessors!
+ br label %endif
+endif: ; preds = %then, %entry
+ %x = phi i32 [ 4, %entry ], [ 27, %then ] ; <i32> [#uses=0]
+ %result = phi i32 [ 32, %then ], [ 0, %entry ] ; <i32> [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
new file mode 100644
index 0000000..58d423f
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
@@ -0,0 +1,19 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; Testcase distilled from 256.bzip2.
+
+target datalayout = "e-p:32:32"
+
+define i32 @main() {
+entry:
+ br label %loopentry.0
+loopentry.0: ; preds = %loopentry.0, %entry
+ %h.0 = phi i32 [ %tmp.2, %loopentry.0 ], [ -1, %entry ] ; <i32> [#uses=1]
+ %tmp.2 = add i32 %h.0, 1 ; <i32> [#uses=3]
+ %tmp.4 = icmp ne i32 %tmp.2, 0 ; <i1> [#uses=1]
+ br i1 %tmp.4, label %loopentry.0, label %loopentry.1
+loopentry.1: ; preds = %loopentry.0
+ %h.1 = phi i32 [ %tmp.2, %loopentry.0 ] ; <i32> [#uses=1]
+ ret i32 %h.1
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
new file mode 100644
index 0000000..a22fe07
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; Testcase distilled from 256.bzip2.
+
+target datalayout = "e-p:32:32"
+
+define i32 @main() {
+entry:
+ %X = add i32 1, -1 ; <i32> [#uses=3]
+ br label %Next
+Next: ; preds = %entry
+ %A = phi i32 [ %X, %entry ] ; <i32> [#uses=0]
+ %B = phi i32 [ %X, %entry ] ; <i32> [#uses=0]
+ %C = phi i32 [ %X, %entry ] ; <i32> [#uses=1]
+ ret i32 %C
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
new file mode 100644
index 0000000..b3c6d8a
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; This testcase used to fail because two variable-sized allocas confused the
+; local register allocator.
+
+define i32 @main(i32 %X) {
+ %A = alloca i32, i32 %X ; <i32*> [#uses=0]
+ %B = alloca float, i32 %X ; <float*> [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
new file mode 100644
index 0000000..bd32f30
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+;
+; Regression Test: EnvironmentTest.ll
+;
+; Description:
+; This is a regression test that verifies that the JIT passes the
+; environment to the main() function.
+;
+
+
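+; Equivalent C source, reconstructed from the IR below: main() returns 1 only
+; if the first environment string is empty, so a normal environment gives
+; exit code 0.
+;   int main(int argc, char **argv, char **envp) {
+;     return strlen(envp[0]) == 0;
+;   }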
+declare i32 @strlen(i8*)
+
+define i32 @main(i32 %argc.1, i8** %argv.1, i8** %envp.1) {
+ %tmp.2 = load i8** %envp.1 ; <i8*> [#uses=1]
+ %tmp.3 = call i32 @strlen( i8* %tmp.2 ) ; <i32> [#uses=1]
+ %T = icmp eq i32 %tmp.3, 0 ; <i1> [#uses=1]
+ %R = zext i1 %T to i32 ; <i32> [#uses=1]
+ ret i32 %R
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
new file mode 100644
index 0000000..1959534
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; This testcase exposes a bug in the local register allocator where it runs out
+; of registers (due to too many overlapping live ranges), but then attempts to
+; use the ESP register (which is not allocatable) to hold a value.
+
+define i32 @main(i32 %A) {
+ ; ESP gets used again...
+ %Ap2 = alloca i32, i32 %A ; <i32*> [#uses=11]
+ ; Produce lots of overlapping live ranges
+ %B = add i32 %A, 1 ; <i32> [#uses=1]
+ %C = add i32 %A, 2 ; <i32> [#uses=1]
+ %D = add i32 %A, 3 ; <i32> [#uses=1]
+ %E = add i32 %A, 4 ; <i32> [#uses=1]
+ %F = add i32 %A, 5 ; <i32> [#uses=1]
+ %G = add i32 %A, 6 ; <i32> [#uses=1]
+ %H = add i32 %A, 7 ; <i32> [#uses=1]
+ %I = add i32 %A, 8 ; <i32> [#uses=1]
+ %J = add i32 %A, 9 ; <i32> [#uses=1]
+ %K = add i32 %A, 10 ; <i32> [#uses=1]
+ ; Uses of all of the values
+ store i32 %A, i32* %Ap2
+ store i32 %B, i32* %Ap2
+ store i32 %C, i32* %Ap2
+ store i32 %D, i32* %Ap2
+ store i32 %E, i32* %Ap2
+ store i32 %F, i32* %Ap2
+ store i32 %G, i32* %Ap2
+ store i32 %H, i32* %Ap2
+ store i32 %I, i32* %Ap2
+ store i32 %J, i32* %Ap2
+ store i32 %K, i32* %Ap2
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
new file mode 100644
index 0000000..1f8343f
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
@@ -0,0 +1,23 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+@A = global i32 0 ; <i32*> [#uses=1]
+
+define i32 @main() {
+ %Ret = call i32 @test( i1 true, i32 0 ) ; <i32> [#uses=1]
+ ret i32 %Ret
+}
+
+define i32 @test(i1 %c, i32 %A) {
+ br i1 %c, label %Taken1, label %NotTaken
+Cont: ; preds = %Taken1, %NotTaken
+ %V = phi i32 [ 0, %NotTaken ], [ sub (i32 ptrtoint (i32* @A to i32), i32 1234), %Taken1 ] ; <i32> [#uses=0]
+ ret i32 0
+NotTaken: ; preds = %0
+ br label %Cont
+Taken1: ; preds = %0
+ %B = icmp eq i32 %A, 0 ; <i1> [#uses=1]
+ br i1 %B, label %Cont, label %ExitError
+ExitError: ; preds = %Taken1
+ ret i32 12
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
new file mode 100644
index 0000000..79a7d58
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
@@ -0,0 +1,22 @@
+; PR672
+; RUN: %lli -use-mcjit %s
+; XFAIL: mcjit-ia32
+
+define i32 @main() {
+ %f = bitcast i32 (i32, i32*, i32)* @check_tail to i32* ; <i32*> [#uses=1]
+ %res = tail call fastcc i32 @check_tail( i32 10, i32* %f, i32 10 ) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+define fastcc i32 @check_tail(i32 %x, i32* %f, i32 %g) {
+ %tmp1 = icmp sgt i32 %x, 0 ; <i1> [#uses=1]
+ br i1 %tmp1, label %if-then, label %if-else
+if-then: ; preds = %0
+ %fun_ptr = bitcast i32* %f to i32 (i32, i32*, i32)* ; <i32 (i32, i32*, i32)*> [#uses=1]
+ %arg1 = add i32 %x, -1 ; <i32> [#uses=1]
+ %res = tail call fastcc i32 %fun_ptr( i32 %arg1, i32* %f, i32 %g ) ; <i32> [#uses=1]
+ ret i32 %res
+if-else: ; preds = %0
+ ret i32 %x
+}
+
diff --git a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
new file mode 100644
index 0000000..52cef4d
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
@@ -0,0 +1,19 @@
+; RUN: %lli -use-mcjit -force-interpreter %s
+; PR1836
+
+define i32 @main() {
+entry:
+ %retval = alloca i32 ; <i32*> [#uses=2]
+ %tmp = alloca i32 ; <i32*> [#uses=2]
+ %x = alloca i75, align 16 ; <i75*> [#uses=1]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ store i75 999, i75* %x, align 16
+ store i32 0, i32* %tmp, align 4
+ %tmp1 = load i32* %tmp, align 4 ; <i32> [#uses=1]
+ store i32 %tmp1, i32* %retval, align 4
+ br label %return
+
+return: ; preds = %entry
+ %retval2 = load i32* %retval ; <i32> [#uses=1]
+ ret i32 %retval2
+}
diff --git a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
new file mode 100644
index 0000000..a6e917f
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
@@ -0,0 +1,59 @@
+; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 1
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i686-pc-linux-gnu"
+@.str = internal constant [10 x i8] c"MSB = %d\0A\00" ; <[10 x i8]*> [#uses=1]
+
+define i65 @foo(i65 %x) {
+entry:
+ %x_addr = alloca i65 ; <i65*> [#uses=2]
+ %retval = alloca i65 ; <i65*> [#uses=2]
+ %tmp = alloca i65 ; <i65*> [#uses=2]
+ %"alloca point" = bitcast i65 0 to i65 ; <i65> [#uses=0]
+ store i65 %x, i65* %x_addr
+ %tmp1 = load i65* %x_addr, align 4 ; <i65> [#uses=1]
+ %tmp2 = ashr i65 %tmp1, 65 ; <i65> [#uses=1]
+ store i65 %tmp2, i65* %tmp, align 4
+ %tmp3 = load i65* %tmp, align 4 ; <i65> [#uses=1]
+ store i65 %tmp3, i65* %retval, align 4
+ br label %return
+
+return: ; preds = %entry
+ %retval4 = load i65* %retval ; <i65> [#uses=1]
+ ret i65 %retval4
+}
+
+define i32 @main() {
+entry:
+ %retval = alloca i32 ; <i32*> [#uses=1]
+ %iftmp.0 = alloca i32 ; <i32*> [#uses=3]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ %tmp = call i65 @foo( i65 -9 ) ; <i65> [#uses=1]
+ %tmp1 = lshr i65 %tmp, 64 ; <i65> [#uses=1]
+ %tmp2 = xor i65 %tmp1, 1 ; <i65> [#uses=1]
+ %tmp3 = and i65 %tmp2, 1 ; <i65> [#uses=1]
+ %tmp34 = trunc i65 %tmp3 to i8 ; <i8> [#uses=1]
+ %toBool = icmp ne i8 %tmp34, 0 ; <i1> [#uses=1]
+ br i1 %toBool, label %cond_true, label %cond_false
+
+cond_true: ; preds = %entry
+ store i32 0, i32* %iftmp.0, align 4
+ br label %cond_next
+
+cond_false: ; preds = %entry
+ store i32 1, i32* %iftmp.0, align 4
+ br label %cond_next
+
+cond_next: ; preds = %cond_false, %cond_true
+ %tmp5 = getelementptr [10 x i8]* @.str, i32 0, i32 0 ; <i8*> [#uses=1]
+ %tmp6 = load i32* %iftmp.0, align 4 ; <i32> [#uses=1]
+ %tmp7 = call i32 (i8*, ...)* @printf( i8* noalias %tmp5, i32 %tmp6 ) nounwind ; <i32> [#uses=0]
+ br label %return
+
+return: ; preds = %cond_next
+ store i32 0, i32* %retval, align 4
+ %retval8 = load i32* %retval ; <i32> [#uses=1]
+ ret i32 %retval8
+}
+
+declare i32 @printf(i8* noalias, ...) nounwind
diff --git a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
new file mode 100644
index 0000000..524a724
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
@@ -0,0 +1,8 @@
+; RUN: %lli -use-mcjit -force-interpreter=true %s
+
+define i32 @main() {
+ %a = add i32 0, undef
+ %b = fadd float 0.0, undef
+ %c = fadd double 0.0, undef
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/fpbitcast.ll b/test/ExecutionEngine/MCJIT/fpbitcast.ll
new file mode 100644
index 0000000..9da908f
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/fpbitcast.ll
@@ -0,0 +1,20 @@
+; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 40091eb8
+;
+define i32 @test(double %x) {
+entry:
+ %x46.i = bitcast double %x to i64
+ %tmp343.i = lshr i64 %x46.i, 32
+ %tmp344.i = trunc i64 %tmp343.i to i32
+ ret i32 %tmp344.i
+}
+
+define i32 @main()
+{
+ %res = call i32 @test(double 3.14)
+ %ptr = getelementptr [4 x i8]* @format, i32 0, i32 0
+ call i32 (i8*,...)* @printf(i8* %ptr, i32 %res)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
+@format = internal constant [4 x i8] c"%x\0A\00"
diff --git a/test/ExecutionEngine/MCJIT/hello.ll b/test/ExecutionEngine/MCJIT/hello.ll
new file mode 100644
index 0000000..a52b6d4
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/hello.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+@.LC0 = internal global [12 x i8] c"Hello World\00" ; <[12 x i8]*> [#uses=1]
+
+declare i32 @puts(i8*)
+
+define i32 @main() {
+ %reg210 = call i32 @puts( i8* getelementptr ([12 x i8]* @.LC0, i64 0, i64 0) ) ; <i32> [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/hello2.ll b/test/ExecutionEngine/MCJIT/hello2.ll
new file mode 100644
index 0000000..670a6dd
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/hello2.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+@X = global i32 7 ; <i32*> [#uses=0]
+@msg = internal global [13 x i8] c"Hello World\0A\00" ; <[13 x i8]*> [#uses=1]
+
+declare void @printf([13 x i8]*, ...)
+
+define void @bar() {
+ call void ([13 x i8]*, ...)* @printf( [13 x i8]* @msg )
+ ret void
+}
+
+define i32 @main() {
+ call void @bar( )
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/lit.local.cfg b/test/ExecutionEngine/MCJIT/lit.local.cfg
new file mode 100644
index 0000000..2980ce7
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/lit.local.cfg
@@ -0,0 +1,20 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+def getRoot(config):
+ if not config.parent:
+ return config
+ return getRoot(config.parent)
+
+root = getRoot(config)
+
+targets = set(root.targets_to_build.split())
+if ('X86' in targets) or ('ARM' in targets) or ('Mips' in targets):
+ config.unsupported = False
+else:
+ config.unsupported = True
+
+if root.host_arch not in ['x86', 'x86_64', 'ARM', 'Mips']:
+ config.unsupported = True
+
+if root.host_os in ['Win32', 'Cygwin', 'MingW', 'Windows', 'Darwin']:
+ config.unsupported = True
diff --git a/test/ExecutionEngine/MCJIT/simplesttest.ll b/test/ExecutionEngine/MCJIT/simplesttest.ll
new file mode 100644
index 0000000..a6688c2
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/simplesttest.ll
@@ -0,0 +1,6 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/simpletest.ll b/test/ExecutionEngine/MCJIT/simpletest.ll
new file mode 100644
index 0000000..4562aa6
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/simpletest.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @bar() {
+ ret i32 0
+}
+
+define i32 @main() {
+ %r = call i32 @bar( ) ; <i32> [#uses=1]
+ ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/stubs.ll b/test/ExecutionEngine/MCJIT/stubs.ll
new file mode 100644
index 0000000..b285b0e
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/stubs.ll
@@ -0,0 +1,35 @@
+; RUN: %lli -use-mcjit -disable-lazy-compilation=false %s
+
+define i32 @main() nounwind {
+entry:
+ call void @lazily_compiled_address_is_consistent()
+ ret i32 0
+}
+
+; Test PR3043: @test should have the same address before and after
+; it's JIT-compiled.
+@funcPtr = common global i1 ()* null, align 4
+@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
+
+define void @lazily_compiled_address_is_consistent() nounwind {
+entry:
+ store i1 ()* @test, i1 ()** @funcPtr
+  %pass = tail call i1 @test() ; <i1> [#uses=1]
+ br i1 %pass, label %pass_block, label %fail_block
+pass_block:
+ ret void
+fail_block:
+ call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
+ call void @exit(i32 1)
+ unreachable
+}
+
+define i1 @test() nounwind {
+entry:
+ %tmp = load i1 ()** @funcPtr
+ %eq = icmp eq i1 ()* %tmp, @test
+ ret i1 %eq
+}
+
+declare i32 @puts(i8*)
+declare void @exit(i32) noreturn
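The property stubs.ll checks (PR3043) reads more naturally in C. This is an illustrative sketch, with @test renamed test_fn for clarity:

#include <stdio.h>
#include <stdlib.h>

static int test_fn(void);
static int (*funcPtr)(void);

/* Record the function's address before it has been lazily compiled,
   then call it and let it verify the address is unchanged. */
static void lazily_compiled_address_is_consistent(void) {
    funcPtr = test_fn;            /* address taken before the first call */
    if (!test_fn()) {
        puts("@lazily_compiled_address_is_consistent failed");
        exit(1);
    }
}

static int test_fn(void) {
    return funcPtr == test_fn;    /* same address after compilation? */
}

int main(void) {
    lazily_compiled_address_is_consistent();
    return 0;
}
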
diff --git a/test/ExecutionEngine/MCJIT/test-arith.ll b/test/ExecutionEngine/MCJIT/test-arith.ll
new file mode 100644
index 0000000..3177760
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-arith.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+ %A = add i8 0, 12 ; <i8> [#uses=1]
+ %B = sub i8 %A, 1 ; <i8> [#uses=2]
+ %C = mul i8 %B, %B ; <i8> [#uses=2]
+ %D = sdiv i8 %C, %C ; <i8> [#uses=2]
+ %E = srem i8 %D, %D ; <i8> [#uses=0]
+ %F = udiv i8 5, 6 ; <i8> [#uses=0]
+ %G = urem i8 6, 5 ; <i8> [#uses=0]
+ %A.upgrd.1 = add i16 0, 12 ; <i16> [#uses=1]
+ %B.upgrd.2 = sub i16 %A.upgrd.1, 1 ; <i16> [#uses=2]
+ %C.upgrd.3 = mul i16 %B.upgrd.2, %B.upgrd.2 ; <i16> [#uses=2]
+ %D.upgrd.4 = sdiv i16 %C.upgrd.3, %C.upgrd.3 ; <i16> [#uses=2]
+ %E.upgrd.5 = srem i16 %D.upgrd.4, %D.upgrd.4 ; <i16> [#uses=0]
+ %F.upgrd.6 = udiv i16 5, 6 ; <i16> [#uses=0]
+ %G.upgrd.7 = urem i32 6, 5 ; <i32> [#uses=0]
+ %A.upgrd.8 = add i32 0, 12 ; <i32> [#uses=1]
+ %B.upgrd.9 = sub i32 %A.upgrd.8, 1 ; <i32> [#uses=2]
+ %C.upgrd.10 = mul i32 %B.upgrd.9, %B.upgrd.9 ; <i32> [#uses=2]
+ %D.upgrd.11 = sdiv i32 %C.upgrd.10, %C.upgrd.10 ; <i32> [#uses=2]
+ %E.upgrd.12 = srem i32 %D.upgrd.11, %D.upgrd.11 ; <i32> [#uses=0]
+ %F.upgrd.13 = udiv i32 5, 6 ; <i32> [#uses=0]
+ %G1 = urem i32 6, 5 ; <i32> [#uses=0]
+ %A.upgrd.14 = add i64 0, 12 ; <i64> [#uses=1]
+ %B.upgrd.15 = sub i64 %A.upgrd.14, 1 ; <i64> [#uses=2]
+ %C.upgrd.16 = mul i64 %B.upgrd.15, %B.upgrd.15 ; <i64> [#uses=2]
+ %D.upgrd.17 = sdiv i64 %C.upgrd.16, %C.upgrd.16 ; <i64> [#uses=2]
+ %E.upgrd.18 = srem i64 %D.upgrd.17, %D.upgrd.17 ; <i64> [#uses=0]
+ %F.upgrd.19 = udiv i64 5, 6 ; <i64> [#uses=0]
+ %G.upgrd.20 = urem i64 6, 5 ; <i64> [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-branch.ll b/test/ExecutionEngine/MCJIT/test-branch.ll
new file mode 100644
index 0000000..702c110
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-branch.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; test unconditional and conditional branches
+define i32 @main() {
+ br label %Test
+Test: ; preds = %Test, %0
+ %X = icmp eq i32 0, 4 ; <i1> [#uses=1]
+ br i1 %X, label %Test, label %Label
+Label: ; preds = %Test
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
new file mode 100644
index 0000000..6f28405
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
@@ -0,0 +1,14 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @_Z14func_exit_codev() nounwind uwtable {
+entry:
+ ret i32 0
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 @_Z14func_exit_codev()
+ ret i32 %call
+}
diff --git a/test/ExecutionEngine/MCJIT/test-call.ll b/test/ExecutionEngine/MCJIT/test-call.ll
new file mode 100644
index 0000000..7a244ee
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-call.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+declare void @exit(i32)
+
+define i32 @test(i8 %C, i16 %S) {
+ %X = trunc i16 %S to i8 ; <i8> [#uses=1]
+ %Y = zext i8 %X to i32 ; <i32> [#uses=1]
+ ret i32 %Y
+}
+
+define void @FP(void (i32)* %F) {
+ %X = call i32 @test( i8 123, i16 1024 ) ; <i32> [#uses=1]
+ call void %F( i32 %X )
+ ret void
+}
+
+define i32 @main() {
+ call void @FP( void (i32)* @exit )
+ ret i32 1
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-cast.ll b/test/ExecutionEngine/MCJIT/test-cast.ll
new file mode 100644
index 0000000..75e7d1b
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-cast.ll
@@ -0,0 +1,109 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @foo() {
+ ret i32 0
+}
+
+define i32 @main() {
+ icmp ne i1 true, false ; <i1>:1 [#uses=0]
+ zext i1 true to i8 ; <i8>:2 [#uses=0]
+ zext i1 true to i8 ; <i8>:3 [#uses=0]
+ zext i1 true to i16 ; <i16>:4 [#uses=0]
+ zext i1 true to i16 ; <i16>:5 [#uses=0]
+ zext i1 true to i32 ; <i32>:6 [#uses=0]
+ zext i1 true to i32 ; <i32>:7 [#uses=0]
+ zext i1 true to i64 ; <i64>:8 [#uses=0]
+ zext i1 true to i64 ; <i64>:9 [#uses=0]
+ uitofp i1 true to float ; <float>:10 [#uses=0]
+ uitofp i1 true to double ; <double>:11 [#uses=0]
+ icmp ne i8 0, 0 ; <i1>:12 [#uses=0]
+ icmp ne i8 1, 0 ; <i1>:13 [#uses=0]
+ bitcast i8 0 to i8 ; <i8>:14 [#uses=0]
+ bitcast i8 -1 to i8 ; <i8>:15 [#uses=0]
+ sext i8 4 to i16 ; <i16>:16 [#uses=0]
+ sext i8 4 to i16 ; <i16>:17 [#uses=0]
+ sext i8 4 to i64 ; <i64>:18 [#uses=0]
+ sext i8 4 to i64 ; <i64>:19 [#uses=0]
+ sitofp i8 4 to float ; <float>:20 [#uses=0]
+ sitofp i8 4 to double ; <double>:21 [#uses=0]
+ icmp ne i8 0, 0 ; <i1>:22 [#uses=0]
+ icmp ne i8 1, 0 ; <i1>:23 [#uses=0]
+ bitcast i8 0 to i8 ; <i8>:24 [#uses=0]
+ bitcast i8 1 to i8 ; <i8>:25 [#uses=0]
+ zext i8 4 to i16 ; <i16>:26 [#uses=0]
+ zext i8 4 to i16 ; <i16>:27 [#uses=0]
+ zext i8 4 to i64 ; <i64>:28 [#uses=0]
+ zext i8 4 to i64 ; <i64>:29 [#uses=0]
+ uitofp i8 0 to float ; <float>:30 [#uses=0]
+ uitofp i8 0 to double ; <double>:31 [#uses=0]
+ icmp ne i16 1, 0 ; <i1>:32 [#uses=0]
+ trunc i16 -1 to i8 ; <i8>:33 [#uses=0]
+ trunc i16 255 to i8 ; <i8>:34 [#uses=0]
+ bitcast i16 0 to i16 ; <i16>:35 [#uses=0]
+ bitcast i16 0 to i16 ; <i16>:36 [#uses=0]
+ sext i16 0 to i64 ; <i64>:37 [#uses=0]
+ sext i16 0 to i64 ; <i64>:38 [#uses=0]
+ sitofp i16 0 to float ; <float>:39 [#uses=0]
+ sitofp i16 0 to double ; <double>:40 [#uses=0]
+ icmp ne i16 1, 0 ; <i1>:41 [#uses=0]
+ trunc i16 1 to i8 ; <i8>:42 [#uses=0]
+ trunc i16 255 to i8 ; <i8>:43 [#uses=0]
+ bitcast i16 0 to i16 ; <i16>:44 [#uses=0]
+ bitcast i16 0 to i16 ; <i16>:45 [#uses=0]
+ zext i16 0 to i64 ; <i64>:46 [#uses=0]
+ zext i16 0 to i64 ; <i64>:47 [#uses=0]
+ uitofp i16 0 to float ; <float>:48 [#uses=0]
+ uitofp i16 0 to double ; <double>:49 [#uses=0]
+ icmp ne i32 6, 0 ; <i1>:50 [#uses=0]
+ trunc i32 -6 to i8 ; <i8>:51 [#uses=0]
+ trunc i32 6 to i8 ; <i8>:52 [#uses=0]
+ trunc i32 6 to i16 ; <i16>:53 [#uses=0]
+ bitcast i32 0 to i32 ; <i32>:54 [#uses=0]
+ sext i32 0 to i64 ; <i64>:55 [#uses=0]
+ sext i32 0 to i64 ; <i64>:56 [#uses=0]
+ sitofp i32 0 to float ; <float>:57 [#uses=0]
+ sitofp i32 0 to double ; <double>:58 [#uses=0]
+ icmp ne i32 6, 0 ; <i1>:59 [#uses=0]
+ trunc i32 7 to i8 ; <i8>:60 [#uses=0]
+ trunc i32 8 to i8 ; <i8>:61 [#uses=0]
+ trunc i32 9 to i16 ; <i16>:62 [#uses=0]
+ bitcast i32 10 to i32 ; <i32>:63 [#uses=0]
+ zext i32 0 to i64 ; <i64>:64 [#uses=0]
+ zext i32 0 to i64 ; <i64>:65 [#uses=0]
+ uitofp i32 0 to float ; <float>:66 [#uses=0]
+ uitofp i32 0 to double ; <double>:67 [#uses=0]
+ icmp ne i64 0, 0 ; <i1>:68 [#uses=0]
+ trunc i64 0 to i8 ; <i8>:69 [#uses=0]
+ trunc i64 0 to i8 ; <i8>:70 [#uses=0]
+ trunc i64 0 to i16 ; <i16>:71 [#uses=0]
+ trunc i64 0 to i16 ; <i16>:72 [#uses=0]
+ trunc i64 0 to i32 ; <i32>:73 [#uses=0]
+ trunc i64 0 to i32 ; <i32>:74 [#uses=0]
+ bitcast i64 0 to i64 ; <i64>:75 [#uses=0]
+ bitcast i64 0 to i64 ; <i64>:76 [#uses=0]
+ sitofp i64 0 to float ; <float>:77 [#uses=0]
+ sitofp i64 0 to double ; <double>:78 [#uses=0]
+ icmp ne i64 1, 0 ; <i1>:79 [#uses=0]
+ trunc i64 1 to i8 ; <i8>:80 [#uses=0]
+ trunc i64 1 to i8 ; <i8>:81 [#uses=0]
+ trunc i64 1 to i16 ; <i16>:82 [#uses=0]
+ trunc i64 1 to i16 ; <i16>:83 [#uses=0]
+ trunc i64 1 to i32 ; <i32>:84 [#uses=0]
+ trunc i64 1 to i32 ; <i32>:85 [#uses=0]
+ bitcast i64 1 to i64 ; <i64>:86 [#uses=0]
+ bitcast i64 1 to i64 ; <i64>:87 [#uses=0]
+ uitofp i64 1 to float ; <float>:88 [#uses=0]
+ uitofp i64 0 to double ; <double>:89 [#uses=0]
+ bitcast float 0.000000e+00 to float ; <float>:90 [#uses=0]
+ fpext float 0.000000e+00 to double ; <double>:91 [#uses=0]
+ fptosi double 0.000000e+00 to i8 ; <i8>:92 [#uses=0]
+ fptoui double 0.000000e+00 to i8 ; <i8>:93 [#uses=0]
+ fptosi double 0.000000e+00 to i16 ; <i16>:94 [#uses=0]
+ fptoui double 0.000000e+00 to i16 ; <i16>:95 [#uses=0]
+ fptosi double 0.000000e+00 to i32 ; <i32>:96 [#uses=0]
+ fptoui double 0.000000e+00 to i32 ; <i32>:97 [#uses=0]
+ fptosi double 0.000000e+00 to i64 ; <i64>:98 [#uses=0]
+ fptrunc double 0.000000e+00 to float ; <float>:99 [#uses=0]
+ bitcast double 0.000000e+00 to double ; <double>:100 [#uses=0]
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols.ll b/test/ExecutionEngine/MCJIT/test-common-symbols.ll
new file mode 100644
index 0000000..ac1d9ac
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-common-symbols.ll
@@ -0,0 +1,88 @@
+; RUN: %lli -use-mcjit -O0 -disable-lazy-compilation=false %s
+
+; The intention of this test is to verify that symbols mapped to COMMON in ELF
+; work as expected.
+;
+; Compiled from this C code:
+;
+; int zero_int;
+; double zero_double;
+; int zero_arr[10];
+;
+; int main()
+; {
+; zero_arr[zero_int + 5] = 40;
+;
+; if (zero_double < 1.0)
+; zero_arr[zero_int + 2] = 70;
+;
+; for (int i = 1; i < 10; ++i) {
+; zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
+; }
+; return zero_arr[9] == 110 ? 0 : -1;
+; }
+
+@zero_int = common global i32 0, align 4
+@zero_arr = common global [10 x i32] zeroinitializer, align 16
+@zero_double = common global double 0.000000e+00, align 8
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* @zero_int, align 4
+ %add = add nsw i32 %0, 5
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
+ store i32 40, i32* %arrayidx, align 4
+ %1 = load double* @zero_double, align 8
+ %cmp = fcmp olt double %1, 1.000000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %2 = load i32* @zero_int, align 4
+ %add1 = add nsw i32 %2, 2
+ %idxprom2 = sext i32 %add1 to i64
+ %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
+ store i32 70, i32* %arrayidx3, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ store i32 1, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %if.end
+ %3 = load i32* %i, align 4
+ %cmp4 = icmp slt i32 %3, 10
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %4 = load i32* %i, align 4
+ %sub = sub nsw i32 %4, 1
+ %idxprom5 = sext i32 %sub to i64
+ %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
+ %5 = load i32* %arrayidx6, align 4
+ %6 = load i32* %i, align 4
+ %idxprom7 = sext i32 %6 to i64
+ %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
+ %7 = load i32* %arrayidx8, align 4
+ %add9 = add nsw i32 %5, %7
+ %8 = load i32* %i, align 4
+ %idxprom10 = sext i32 %8 to i64
+ %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
+ store i32 %add9, i32* %arrayidx11, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %9 = load i32* %i, align 4
+ %inc = add nsw i32 %9, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
+ %cmp12 = icmp eq i32 %10, 110
+ %cond = select i1 %cmp12, i32 0, i32 -1
+ ret i32 %cond
+}
diff --git a/test/ExecutionEngine/MCJIT/test-constantexpr.ll b/test/ExecutionEngine/MCJIT/test-constantexpr.ll
new file mode 100644
index 0000000..6b46639
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-constantexpr.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; This test makes sure that we can evaluate weird constant expressions
+
+@A = global i32 5 ; <i32*> [#uses=1]
+@B = global i32 6 ; <i32*> [#uses=1]
+
+define i32 @main() {
+ %A = or i1 false, icmp slt (i32* @A, i32* @B) ; <i1> [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
new file mode 100644
index 0000000..35491df
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define double @test(double* %DP, double %Arg) {
+ %D = load double* %DP ; <double> [#uses=1]
+ %V = fadd double %D, 1.000000e+00 ; <double> [#uses=2]
+ %W = fsub double %V, %V ; <double> [#uses=3]
+ %X = fmul double %W, %W ; <double> [#uses=2]
+ %Y = fdiv double %X, %X ; <double> [#uses=2]
+ %Q = fadd double %Y, %Arg ; <double> [#uses=1]
+ %R = bitcast double %Q to double ; <double> [#uses=1]
+ store double %Q, double* %DP
+ ret double %Y
+}
+
+define i32 @main() {
+ %X = alloca double ; <double*> [#uses=2]
+ store double 0.000000e+00, double* %X
+ call double @test( double* %X, double 2.000000e+00 ) ; <double>:1 [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-fp.ll b/test/ExecutionEngine/MCJIT/test-fp.ll
new file mode 100644
index 0000000..6fc5a50
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-fp.ll
@@ -0,0 +1,23 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define double @test(double* %DP, double %Arg) {
+ %D = load double* %DP ; <double> [#uses=1]
+ %V = fadd double %D, 1.000000e+00 ; <double> [#uses=2]
+ %W = fsub double %V, %V ; <double> [#uses=3]
+ %X = fmul double %W, %W ; <double> [#uses=2]
+ %Y = fdiv double %X, %X ; <double> [#uses=2]
+ %Z = frem double %Y, %Y ; <double> [#uses=3]
+ %Z1 = fdiv double %Z, %W ; <double> [#uses=0]
+ %Q = fadd double %Z, %Arg ; <double> [#uses=1]
+ %R = bitcast double %Q to double ; <double> [#uses=1]
+ store double %R, double* %DP
+ ret double %Z
+}
+
+define i32 @main() {
+ %X = alloca double ; <double*> [#uses=2]
+ store double 0.000000e+00, double* %X
+ call double @test( double* %X, double 2.000000e+00 ) ; <double>:1 [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
new file mode 100644
index 0000000..4a790c6
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+@count = global i32 1, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 49
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32* @count, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* @count, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %2 = load i32* %i, align 4
+ %inc1 = add nsw i32 %2, 1
+ store i32 %inc1, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %3 = load i32* @count, align 4
+ %sub = sub nsw i32 %3, 50
+ ret i32 %sub
+}
diff --git a/test/ExecutionEngine/MCJIT/test-global.ll b/test/ExecutionEngine/MCJIT/test-global.ll
new file mode 100644
index 0000000..94e0250
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-global.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+@count = global i32 0, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32* @count, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* @count, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %2 = load i32* %i, align 4
+ %inc1 = add nsw i32 %2, 1
+ store i32 %inc1, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %3 = load i32* @count, align 4
+ %sub = sub nsw i32 %3, 50
+ ret i32 %sub
+}
diff --git a/test/ExecutionEngine/MCJIT/test-loadstore.ll b/test/ExecutionEngine/MCJIT/test-loadstore.ll
new file mode 100644
index 0000000..e917149
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-loadstore.ll
@@ -0,0 +1,31 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) {
+ %V = load i8* %P ; <i8> [#uses=1]
+ store i8 %V, i8* %P
+ %V.upgrd.4 = load i16* %P.upgrd.1 ; <i16> [#uses=1]
+ store i16 %V.upgrd.4, i16* %P.upgrd.1
+ %V.upgrd.5 = load i32* %P.upgrd.2 ; <i32> [#uses=1]
+ store i32 %V.upgrd.5, i32* %P.upgrd.2
+ %V.upgrd.6 = load i64* %P.upgrd.3 ; <i64> [#uses=1]
+ store i64 %V.upgrd.6, i64* %P.upgrd.3
+ ret void
+}
+
+define i32 @varalloca(i32 %Size) {
+  ;; Variable-sized alloca
+ %X = alloca i32, i32 %Size ; <i32*> [#uses=2]
+ store i32 %Size, i32* %X
+ %Y = load i32* %X ; <i32> [#uses=1]
+ ret i32 %Y
+}
+
+define i32 @main() {
+ %A = alloca i8 ; <i8*> [#uses=1]
+ %B = alloca i16 ; <i16*> [#uses=1]
+ %C = alloca i32 ; <i32*> [#uses=1]
+ %D = alloca i64 ; <i64*> [#uses=1]
+ call void @test( i8* %A, i16* %B, i32* %C, i64* %D )
+ call i32 @varalloca( i32 7 ) ; <i32>:1 [#uses=0]
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/test-local.ll b/test/ExecutionEngine/MCJIT/test-local.ll
new file mode 100644
index 0000000..4f5ae47
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-local.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %count = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %count, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32* %count, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* %count, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %2 = load i32* %i, align 4
+ %inc1 = add nsw i32 %2, 1
+ store i32 %inc1, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %3 = load i32* %count, align 4
+ %sub = sub nsw i32 %3, 50
+ ret i32 %sub
+}
diff --git a/test/ExecutionEngine/MCJIT/test-logical.ll b/test/ExecutionEngine/MCJIT/test-logical.ll
new file mode 100644
index 0000000..0540c22
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-logical.ll
@@ -0,0 +1,18 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+ %A = and i8 4, 8 ; <i8> [#uses=2]
+ %B = or i8 %A, 7 ; <i8> [#uses=1]
+ %C = xor i8 %B, %A ; <i8> [#uses=0]
+ %A.upgrd.1 = and i16 4, 8 ; <i16> [#uses=2]
+ %B.upgrd.2 = or i16 %A.upgrd.1, 7 ; <i16> [#uses=1]
+ %C.upgrd.3 = xor i16 %B.upgrd.2, %A.upgrd.1 ; <i16> [#uses=0]
+ %A.upgrd.4 = and i32 4, 8 ; <i32> [#uses=2]
+ %B.upgrd.5 = or i32 %A.upgrd.4, 7 ; <i32> [#uses=1]
+ %C.upgrd.6 = xor i32 %B.upgrd.5, %A.upgrd.4 ; <i32> [#uses=0]
+ %A.upgrd.7 = and i64 4, 8 ; <i64> [#uses=2]
+ %B.upgrd.8 = or i64 %A.upgrd.7, 7 ; <i64> [#uses=1]
+ %C.upgrd.9 = xor i64 %B.upgrd.8, %A.upgrd.7 ; <i64> [#uses=0]
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-loop.ll b/test/ExecutionEngine/MCJIT/test-loop.ll
new file mode 100644
index 0000000..b1dbf40
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-loop.ll
@@ -0,0 +1,14 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+; <label>:0
+ br label %Loop
+Loop: ; preds = %Loop, %0
+ %I = phi i32 [ 0, %0 ], [ %i2, %Loop ] ; <i32> [#uses=1]
+ %i2 = add i32 %I, 1 ; <i32> [#uses=2]
+ %C = icmp eq i32 %i2, 10 ; <i1> [#uses=1]
+ br i1 %C, label %Out, label %Loop
+Out: ; preds = %Loop
+ ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-phi.ll b/test/ExecutionEngine/MCJIT/test-phi.ll
new file mode 100644
index 0000000..fbc0808
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-phi.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; test phi node
+@Y = global i32 6 ; <i32*> [#uses=1]
+
+define void @blah(i32* %X) {
+; <label>:0
+ br label %T
+T: ; preds = %Dead, %0
+ phi i32* [ %X, %0 ], [ @Y, %Dead ] ; <i32*>:1 [#uses=0]
+ ret void
+Dead: ; No predecessors!
+ br label %T
+}
+
+define i32 @test(i1 %C) {
+; <label>:0
+ br i1 %C, label %T, label %T
+T: ; preds = %0, %0
+ %X = phi i32 [ 123, %0 ], [ 123, %0 ] ; <i32> [#uses=1]
+ ret i32 %X
+}
+
+define i32 @main() {
+; <label>:0
+ br label %Test
+Test: ; preds = %Dead, %0
+ %X = phi i32 [ 0, %0 ], [ %Y, %Dead ] ; <i32> [#uses=1]
+ ret i32 %X
+Dead: ; No predecessors!
+ %Y = ashr i32 12, 4 ; <i32> [#uses=1]
+ br label %Test
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-ret.ll b/test/ExecutionEngine/MCJIT/test-ret.ll
new file mode 100644
index 0000000..1b90ee0
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-ret.ll
@@ -0,0 +1,46 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+; test return instructions
+define void @test1() {
+ ret void
+}
+
+define i8 @test2() {
+ ret i8 1
+}
+
+define i8 @test3() {
+ ret i8 1
+}
+
+define i16 @test4() {
+ ret i16 -1
+}
+
+define i16 @test5() {
+ ret i16 -1
+}
+
+define i32 @main() {
+ ret i32 0
+}
+
+define i32 @test6() {
+ ret i32 4
+}
+
+define i64 @test7() {
+ ret i64 0
+}
+
+define i64 @test8() {
+ ret i64 0
+}
+
+define float @test9() {
+ ret float 1.000000e+00
+}
+
+define double @test10() {
+ ret double 2.000000e+00
+}
diff --git a/test/ExecutionEngine/MCJIT/test-return.ll b/test/ExecutionEngine/MCJIT/test-return.ll
new file mode 100644
index 0000000..9c399ca
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-return.ll
@@ -0,0 +1,8 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
new file mode 100644
index 0000000..030ff31
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
@@ -0,0 +1,24 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+
+define i32 @main() {
+ %double1 = fadd double 0.000000e+00, 0.000000e+00 ; <double> [#uses=6]
+ %double2 = fadd double 0.000000e+00, 0.000000e+00 ; <double> [#uses=6]
+ %float1 = fadd float 0.000000e+00, 0.000000e+00 ; <float> [#uses=6]
+ %float2 = fadd float 0.000000e+00, 0.000000e+00 ; <float> [#uses=6]
+ %test49 = fcmp oeq float %float1, %float2 ; <i1> [#uses=0]
+ %test50 = fcmp oge float %float1, %float2 ; <i1> [#uses=0]
+ %test51 = fcmp ogt float %float1, %float2 ; <i1> [#uses=0]
+ %test52 = fcmp ole float %float1, %float2 ; <i1> [#uses=0]
+ %test53 = fcmp olt float %float1, %float2 ; <i1> [#uses=0]
+ %test54 = fcmp une float %float1, %float2 ; <i1> [#uses=0]
+ %test55 = fcmp oeq double %double1, %double2 ; <i1> [#uses=0]
+ %test56 = fcmp oge double %double1, %double2 ; <i1> [#uses=0]
+ %test57 = fcmp ogt double %double1, %double2 ; <i1> [#uses=0]
+ %test58 = fcmp ole double %double1, %double2 ; <i1> [#uses=0]
+ %test59 = fcmp olt double %double1, %double2 ; <i1> [#uses=0]
+ %test60 = fcmp une double %double1, %double2 ; <i1> [#uses=0]
+ ret i32 0
+}
+
+
diff --git a/test/ExecutionEngine/MCJIT/test-setcond-int.ll b/test/ExecutionEngine/MCJIT/test-setcond-int.ll
new file mode 100644
index 0000000..1113efe
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-setcond-int.ll
@@ -0,0 +1,69 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+ %int1 = add i32 0, 0 ; <i32> [#uses=6]
+ %int2 = add i32 0, 0 ; <i32> [#uses=6]
+ %long1 = add i64 0, 0 ; <i64> [#uses=6]
+ %long2 = add i64 0, 0 ; <i64> [#uses=6]
+ %sbyte1 = add i8 0, 0 ; <i8> [#uses=6]
+ %sbyte2 = add i8 0, 0 ; <i8> [#uses=6]
+ %short1 = add i16 0, 0 ; <i16> [#uses=6]
+ %short2 = add i16 0, 0 ; <i16> [#uses=6]
+ %ubyte1 = add i8 0, 0 ; <i8> [#uses=6]
+ %ubyte2 = add i8 0, 0 ; <i8> [#uses=6]
+ %uint1 = add i32 0, 0 ; <i32> [#uses=6]
+ %uint2 = add i32 0, 0 ; <i32> [#uses=6]
+ %ulong1 = add i64 0, 0 ; <i64> [#uses=6]
+ %ulong2 = add i64 0, 0 ; <i64> [#uses=6]
+ %ushort1 = add i16 0, 0 ; <i16> [#uses=6]
+ %ushort2 = add i16 0, 0 ; <i16> [#uses=6]
+ %test1 = icmp eq i8 %ubyte1, %ubyte2 ; <i1> [#uses=0]
+ %test2 = icmp uge i8 %ubyte1, %ubyte2 ; <i1> [#uses=0]
+ %test3 = icmp ugt i8 %ubyte1, %ubyte2 ; <i1> [#uses=0]
+ %test4 = icmp ule i8 %ubyte1, %ubyte2 ; <i1> [#uses=0]
+ %test5 = icmp ult i8 %ubyte1, %ubyte2 ; <i1> [#uses=0]
+ %test6 = icmp ne i8 %ubyte1, %ubyte2 ; <i1> [#uses=0]
+ %test7 = icmp eq i16 %ushort1, %ushort2 ; <i1> [#uses=0]
+ %test8 = icmp uge i16 %ushort1, %ushort2 ; <i1> [#uses=0]
+ %test9 = icmp ugt i16 %ushort1, %ushort2 ; <i1> [#uses=0]
+ %test10 = icmp ule i16 %ushort1, %ushort2 ; <i1> [#uses=0]
+ %test11 = icmp ult i16 %ushort1, %ushort2 ; <i1> [#uses=0]
+ %test12 = icmp ne i16 %ushort1, %ushort2 ; <i1> [#uses=0]
+ %test13 = icmp eq i32 %uint1, %uint2 ; <i1> [#uses=0]
+ %test14 = icmp uge i32 %uint1, %uint2 ; <i1> [#uses=0]
+ %test15 = icmp ugt i32 %uint1, %uint2 ; <i1> [#uses=0]
+ %test16 = icmp ule i32 %uint1, %uint2 ; <i1> [#uses=0]
+ %test17 = icmp ult i32 %uint1, %uint2 ; <i1> [#uses=0]
+ %test18 = icmp ne i32 %uint1, %uint2 ; <i1> [#uses=0]
+ %test19 = icmp eq i64 %ulong1, %ulong2 ; <i1> [#uses=0]
+ %test20 = icmp uge i64 %ulong1, %ulong2 ; <i1> [#uses=0]
+ %test21 = icmp ugt i64 %ulong1, %ulong2 ; <i1> [#uses=0]
+ %test22 = icmp ule i64 %ulong1, %ulong2 ; <i1> [#uses=0]
+ %test23 = icmp ult i64 %ulong1, %ulong2 ; <i1> [#uses=0]
+ %test24 = icmp ne i64 %ulong1, %ulong2 ; <i1> [#uses=0]
+ %test25 = icmp eq i8 %sbyte1, %sbyte2 ; <i1> [#uses=0]
+ %test26 = icmp sge i8 %sbyte1, %sbyte2 ; <i1> [#uses=0]
+ %test27 = icmp sgt i8 %sbyte1, %sbyte2 ; <i1> [#uses=0]
+ %test28 = icmp sle i8 %sbyte1, %sbyte2 ; <i1> [#uses=0]
+ %test29 = icmp slt i8 %sbyte1, %sbyte2 ; <i1> [#uses=0]
+ %test30 = icmp ne i8 %sbyte1, %sbyte2 ; <i1> [#uses=0]
+ %test31 = icmp eq i16 %short1, %short2 ; <i1> [#uses=0]
+ %test32 = icmp sge i16 %short1, %short2 ; <i1> [#uses=0]
+ %test33 = icmp sgt i16 %short1, %short2 ; <i1> [#uses=0]
+ %test34 = icmp sle i16 %short1, %short2 ; <i1> [#uses=0]
+ %test35 = icmp slt i16 %short1, %short2 ; <i1> [#uses=0]
+ %test36 = icmp ne i16 %short1, %short2 ; <i1> [#uses=0]
+ %test37 = icmp eq i32 %int1, %int2 ; <i1> [#uses=0]
+ %test38 = icmp sge i32 %int1, %int2 ; <i1> [#uses=0]
+ %test39 = icmp sgt i32 %int1, %int2 ; <i1> [#uses=0]
+ %test40 = icmp sle i32 %int1, %int2 ; <i1> [#uses=0]
+ %test41 = icmp slt i32 %int1, %int2 ; <i1> [#uses=0]
+ %test42 = icmp ne i32 %int1, %int2 ; <i1> [#uses=0]
+ %test43 = icmp eq i64 %long1, %long2 ; <i1> [#uses=0]
+ %test44 = icmp sge i64 %long1, %long2 ; <i1> [#uses=0]
+ %test45 = icmp sgt i64 %long1, %long2 ; <i1> [#uses=0]
+ %test46 = icmp sle i64 %long1, %long2 ; <i1> [#uses=0]
+ %test47 = icmp slt i64 %long1, %long2 ; <i1> [#uses=0]
+ %test48 = icmp ne i64 %long1, %long2 ; <i1> [#uses=0]
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/test-shift.ll b/test/ExecutionEngine/MCJIT/test-shift.ll
new file mode 100644
index 0000000..2da824f
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-shift.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -use-mcjit %s > /dev/null
+
+define i32 @main() {
+ %shamt = add i8 0, 1 ; <i8> [#uses=8]
+ %shift.upgrd.1 = zext i8 %shamt to i32 ; <i32> [#uses=1]
+ %t1.s = shl i32 1, %shift.upgrd.1 ; <i32> [#uses=0]
+ %t2.s = shl i32 1, 4 ; <i32> [#uses=0]
+ %shift.upgrd.2 = zext i8 %shamt to i32 ; <i32> [#uses=1]
+ %t1 = shl i32 1, %shift.upgrd.2 ; <i32> [#uses=0]
+ %t2 = shl i32 1, 5 ; <i32> [#uses=0]
+ %t2.s.upgrd.3 = shl i64 1, 4 ; <i64> [#uses=0]
+ %t2.upgrd.4 = shl i64 1, 5 ; <i64> [#uses=0]
+ %shift.upgrd.5 = zext i8 %shamt to i32 ; <i32> [#uses=1]
+ %tr1.s = ashr i32 1, %shift.upgrd.5 ; <i32> [#uses=0]
+ %tr2.s = ashr i32 1, 4 ; <i32> [#uses=0]
+ %shift.upgrd.6 = zext i8 %shamt to i32 ; <i32> [#uses=1]
+ %tr1 = lshr i32 1, %shift.upgrd.6 ; <i32> [#uses=0]
+ %tr2 = lshr i32 1, 5 ; <i32> [#uses=0]
+ %tr1.l = ashr i64 1, 4 ; <i64> [#uses=0]
+ %shift.upgrd.7 = zext i8 %shamt to i64 ; <i64> [#uses=1]
+ %tr2.l = ashr i64 1, %shift.upgrd.7 ; <i64> [#uses=0]
+ %tr3.l = shl i64 1, 4 ; <i64> [#uses=0]
+ %shift.upgrd.8 = zext i8 %shamt to i64 ; <i64> [#uses=1]
+ %tr4.l = shl i64 1, %shift.upgrd.8 ; <i64> [#uses=0]
+ %tr1.u = lshr i64 1, 5 ; <i64> [#uses=0]
+ %shift.upgrd.9 = zext i8 %shamt to i64 ; <i64> [#uses=1]
+ %tr2.u = lshr i64 1, %shift.upgrd.9 ; <i64> [#uses=0]
+ %tr3.u = shl i64 1, 5 ; <i64> [#uses=0]
+ %shift.upgrd.10 = zext i8 %shamt to i64 ; <i64> [#uses=1]
+ %tr4.u = shl i64 1, %shift.upgrd.10 ; <i64> [#uses=0]
+ ret i32 0
+}
diff --git a/test/Feature/globalredefinition3.ll b/test/Feature/globalredefinition3.ll
index 5a5b3f1..2551d93 100644
--- a/test/Feature/globalredefinition3.ll
+++ b/test/Feature/globalredefinition3.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as %s -o /dev/null |& grep {redefinition of global '@B'}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "redefinition of global '@B'"
@B = global i32 7
@B = global i32 7
diff --git a/test/Feature/intrinsics.ll b/test/Feature/intrinsics.ll
index c4e3db6..9e7dc6d 100644
--- a/test/Feature/intrinsics.ll
+++ b/test/Feature/intrinsics.ll
@@ -1,6 +1,7 @@
; RUN: llvm-as < %s | llvm-dis > %t1.ll
; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
; RUN: diff %t1.ll %t2.ll
+; RUN: FileCheck %s < %t1.ll
declare i1 @llvm.isunordered.f32(float, float)
@@ -58,3 +59,12 @@ define void @libm() {
}
; FIXME: test ALL the intrinsics in this file.
+
+; rdar://11542750
+; CHECK: declare void @llvm.trap() noreturn nounwind
+declare void @llvm.trap()
+
+define void @trap() {
+ call void @llvm.trap()
+ ret void
+}
diff --git a/test/Feature/load_module.ll b/test/Feature/load_module.ll
index 05f6c23..14c1153 100644
--- a/test/Feature/load_module.ll
+++ b/test/Feature/load_module.ll
@@ -1,6 +1,6 @@
; PR1318
; RUN: opt < %s -load=%llvmshlibdir/LLVMHello%shlibext -hello \
-; RUN: -disable-output |& grep Hello
+; RUN: -disable-output 2>&1 | grep Hello
; REQUIRES: loadable_module
; FIXME: On Cygming, it might fail without building LLVMHello manually.
diff --git a/test/Feature/packed_struct.ll b/test/Feature/packed_struct.ll
index 4d4ace9..0766649 100644
--- a/test/Feature/packed_struct.ll
+++ b/test/Feature/packed_struct.ll
@@ -2,7 +2,7 @@
; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
; RUN: diff %t1.ll %t2.ll
; RUN: not grep cast %t2.ll
-; RUN: grep {\\}>} %t2.ll
+; RUN: grep "}>" %t2.ll
; END.
%struct.anon = type <{ i8, i32, i32, i32 }>
diff --git a/test/Feature/vector-cast-constant-exprs.ll b/test/Feature/vector-cast-constant-exprs.ll
index ffdc0f0..992987c 100644
--- a/test/Feature/vector-cast-constant-exprs.ll
+++ b/test/Feature/vector-cast-constant-exprs.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis | not grep {ret.*(}
+; RUN: llvm-as < %s | llvm-dis | not grep "ret.*("
; All of these constant expressions should fold.
diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
new file mode 100644
index 0000000..294ca8a
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/basic.ll
@@ -0,0 +1,73 @@
+; Test basic address sanitizer instrumentation.
+;
+; RUN: opt < %s -asan -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @test_load(i32* %a) address_safety {
+; CHECK: @test_load
+; CHECK-NOT: load
+; CHECK: %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK: lshr i64 %[[LOAD_ADDR]], 3
+; CHECK: or i64
+; CHECK: %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK: %[[LOAD_SHADOW:[^ ]*]] = load i8* %[[LOAD_SHADOW_PTR]]
+; CHECK: icmp ne i8
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}
+;
+; First instrumentation block refines the shadow test.
+; CHECK: and i64 %[[LOAD_ADDR]], 7
+; CHECK: add i64 %{{.*}}, 3
+; CHECK: trunc i64 %{{.*}} to i8
+; CHECK: icmp sge i8 %{{.*}}, %[[LOAD_SHADOW]]
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}
+;
+; The actual load comes next because ASan adds the crash block
+; to the end of the function.
+; CHECK: %tmp1 = load i32* %a
+; CHECK: ret i32 %tmp1
+
+; The crash block reports the error.
+; CHECK: call void @__asan_report_load4(i64 %[[LOAD_ADDR]])
+; CHECK: unreachable
+;
+
+
+entry:
+ %tmp1 = load i32* %a
+ ret i32 %tmp1
+}
+
+define void @test_store(i32* %a) address_safety {
+; CHECK: @test_store
+; CHECK-NOT: store
+; CHECK: %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK: lshr i64 %[[STORE_ADDR]], 3
+; CHECK: or i64
+; CHECK: %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK: %[[STORE_SHADOW:[^ ]*]] = load i8* %[[STORE_SHADOW_PTR]]
+; CHECK: icmp ne i8
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}
+;
+; First instrumentation block refines the shadow test.
+; CHECK: and i64 %[[STORE_ADDR]], 7
+; CHECK: add i64 %{{.*}}, 3
+; CHECK: trunc i64 %{{.*}} to i8
+; CHECK: icmp sge i8 %{{.*}}, %[[STORE_SHADOW]]
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}
+;
+; The actual store comes next because ASan adds the crash block
+; to the end of the function.
+; CHECK: store i32 42, i32* %a
+; CHECK: ret void
+;
+; The crash block reports the error.
+; CHECK: call void @__asan_report_store4(i64 %[[STORE_ADDR]])
+; CHECK: unreachable
+;
+
+entry:
+ store i32 42, i32* %a
+ ret void
+}
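The CHECK lines above spell out ASan's shadow check. A minimal C sketch of the same logic, assuming the classic mapping shadow = (addr >> 3) + offset (the pass ORs the offset in where that is equivalent; the offset itself is target-dependent, and test64.ll below expects 2^44 on 64-bit):

#include <stdint.h>

#define SHADOW_OFFSET ((uintptr_t)1 << 44)  /* 64-bit value per test64.ll */

extern void __asan_report_load4(uintptr_t addr); /* reports and aborts */

/* Check emitted before a 4-byte load, following the CHECK lines. */
static inline void asan_check_load4(uintptr_t addr) {
    int8_t shadow = *(int8_t *)((addr >> 3) + SHADOW_OFFSET);
    if (shadow != 0) {
        /* "Refine the shadow test": the access still passes when its
           last byte, (addr & 7) + 4 - 1, lies below the shadow value. */
        int8_t last = (int8_t)((addr & 7) + 3);
        if (last >= shadow)
            __asan_report_load4(addr);
    }
}
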
diff --git a/test/Instrumentation/AddressSanitizer/test64.ll b/test/Instrumentation/AddressSanitizer/test64.ll
index fc27de9..d544d77 100644
--- a/test/Instrumentation/AddressSanitizer/test64.ll
+++ b/test/Instrumentation/AddressSanitizer/test64.ll
@@ -12,3 +12,25 @@ entry:
; Check for ASAN's Offset for 64-bit (2^44)
; CHECK-NEXT: 17592186044416
; CHECK: ret
+
+define void @example_atomicrmw(i64* %ptr) nounwind uwtable address_safety {
+entry:
+ %0 = atomicrmw add i64* %ptr, i64 1 seq_cst
+ ret void
+}
+
+; CHECK: @example_atomicrmw
+; CHECK: lshr {{.*}} 3
+; CHECK: atomicrmw
+; CHECK: ret
+
+define void @example_cmpxchg(i64* %ptr, i64 %compare_to, i64 %new_value) nounwind uwtable address_safety {
+entry:
+ %0 = cmpxchg i64* %ptr, i64 %compare_to, i64 %new_value seq_cst
+ ret void
+}
+
+; CHECK: @example_cmpxchg
+; CHECK: lshr {{.*}} 3
+; CHECK: cmpxchg
+; CHECK: ret
diff --git a/test/Instrumentation/BoundsChecking/lit.local.cfg b/test/Instrumentation/BoundsChecking/lit.local.cfg
new file mode 100644
index 0000000..19eebc0
--- /dev/null
+++ b/test/Instrumentation/BoundsChecking/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Instrumentation/BoundsChecking/many-trap.ll b/test/Instrumentation/BoundsChecking/many-trap.ll
new file mode 100644
index 0000000..0bbb959
--- /dev/null
+++ b/test/Instrumentation/BoundsChecking/many-trap.ll
@@ -0,0 +1,16 @@
+; RUN: opt < %s -bounds-checking -S | FileCheck %s
+; RUN: opt < %s -bounds-checking -bounds-checking-single-trap -S | FileCheck -check-prefix=SINGLE %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: @f1
+define void @f1(i64 %x) nounwind {
+ %1 = alloca i128, i64 %x
+ %2 = load i128* %1, align 4
+ %3 = load i128* %1, align 4
+ ret void
+; CHECK: call void @llvm.trap()
+; CHECK: call void @llvm.trap()
+; CHECK-NOT: call void @llvm.trap()
+; SINGLE: call void @llvm.trap()
+; SINGLE-NOT: call void @llvm.trap()
+}
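A hypothetical C rendering of @f1 makes the flag's effect concrete: both 16-byte loads carry the same guard (the object holds 16*x bytes), and with -bounds-checking-single-trap the guards branch to one shared trap block instead of one each:

#include <stdint.h>

extern void llvm_trap(void); /* stands in for @llvm.trap() */

void f1_single_trap(uint64_t x) {
    uint64_t obj_size = 16 * x;      /* %1 = alloca i128, i64 %x  */
    if (obj_size < 16) goto trap;    /* guard for the first load  */
    if (obj_size < 16) goto trap;    /* guard for the second load */
    return;
trap:
    llvm_trap();
}
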
diff --git a/test/Instrumentation/BoundsChecking/phi.ll b/test/Instrumentation/BoundsChecking/phi.ll
new file mode 100644
index 0000000..86b5922
--- /dev/null
+++ b/test/Instrumentation/BoundsChecking/phi.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -bounds-checking -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@global = private unnamed_addr constant [10 x i8] c"ola\00mundo\00", align 1
+
+; CHECK: f1
+; no checks are possible here
+; CHECK-NOT: trap
+define void @f1(i8* nocapture %c) {
+entry:
+ %0 = load i8* %c, align 1
+ %tobool1 = icmp eq i8 %0, 0
+ br i1 %tobool1, label %while.end, label %while.body
+
+while.body:
+ %c.addr.02 = phi i8* [ %incdec.ptr, %while.body ], [ %c, %entry ]
+ %incdec.ptr = getelementptr inbounds i8* %c.addr.02, i64 -1
+ store i8 100, i8* %c.addr.02, align 1
+ %1 = load i8* %incdec.ptr, align 1
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
+
+
+; CHECK: f2
+define void @f2() {
+while.body.i.preheader:
+ %addr = getelementptr inbounds [10 x i8]* @global, i64 0, i64 9
+ br label %while.body.i
+
+while.body.i:
+; CHECK: phi
+; CHECK-NEXT: phi
+; CHECK-NOT: phi
+ %c.addr.02.i = phi i8* [ %incdec.ptr.i, %while.body.i ], [ %addr, %while.body.i.preheader ]
+ %incdec.ptr.i = getelementptr inbounds i8* %c.addr.02.i, i64 -1
+; CHECK: sub i64 10, %0
+; CHECK-NEXT: icmp ult i64 10, %0
+; CHECK-NEXT: icmp ult i64 {{.*}}, 1
+; CHECK-NEXT: or i1
+; CHECK-NEXT: br {{.*}}, label %trap
+ store i8 100, i8* %c.addr.02.i, align 1
+ %0 = load i8* %incdec.ptr.i, align 1
+ %tobool.i = icmp eq i8 %0, 0
+ br i1 %tobool.i, label %fn.exit, label %while.body.i
+
+fn.exit:
+ ret void
+}
diff --git a/test/Instrumentation/BoundsChecking/simple.ll b/test/Instrumentation/BoundsChecking/simple.ll
new file mode 100644
index 0000000..16870c7
--- /dev/null
+++ b/test/Instrumentation/BoundsChecking/simple.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -bounds-checking -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@.str = private constant [8 x i8] c"abcdefg\00" ; <[8 x i8]*>
+
+declare noalias i8* @malloc(i64) nounwind
+declare noalias i8* @calloc(i64, i64) nounwind
+declare noalias i8* @realloc(i8* nocapture, i64) nounwind
+
+; CHECK: @f1
+define void @f1() nounwind {
+ %1 = tail call i8* @malloc(i64 32)
+ %2 = bitcast i8* %1 to i32*
+ %idx = getelementptr inbounds i32* %2, i64 2
+; CHECK-NOT: trap
+ store i32 3, i32* %idx, align 4
+ ret void
+}
+
+; CHECK: @f2
+define void @f2() nounwind {
+ %1 = tail call i8* @malloc(i64 32)
+ %2 = bitcast i8* %1 to i32*
+ %idx = getelementptr inbounds i32* %2, i64 8
+; CHECK: trap
+ store i32 3, i32* %idx, align 4
+ ret void
+}
+
+; CHECK: @f3
+define void @f3(i64 %x) nounwind {
+ %1 = tail call i8* @calloc(i64 4, i64 %x)
+ %2 = bitcast i8* %1 to i32*
+ %idx = getelementptr inbounds i32* %2, i64 8
+; CHECK: mul i64 4, %
+; CHECK: sub i64 {{.*}}, 32
+; CHECK-NEXT: icmp ult i64 {{.*}}, 32
+; CHECK-NEXT: icmp ult i64 {{.*}}, 4
+; CHECK-NEXT: or i1
+; CHECK: trap
+ store i32 3, i32* %idx, align 4
+ ret void
+}
+
+; CHECK: @f4
+define void @f4(i64 %x) nounwind {
+ %1 = tail call i8* @realloc(i8* null, i64 %x) nounwind
+ %2 = bitcast i8* %1 to i32*
+ %idx = getelementptr inbounds i32* %2, i64 8
+; CHECK: trap
+ %3 = load i32* %idx, align 4
+ ret void
+}
+
+; CHECK: @f5
+define void @f5(i64 %x) nounwind {
+ %idx = getelementptr inbounds [8 x i8]* @.str, i64 0, i64 %x
+; CHECK: trap
+ %1 = load i8* %idx, align 4
+ ret void
+}
+
+; CHECK: @f6
+define void @f6(i64 %x) nounwind {
+ %1 = alloca i128
+; CHECK-NOT: trap
+ %2 = load i128* %1, align 4
+ ret void
+}
+
+; CHECK: @f7
+define void @f7(i64 %x) nounwind {
+ %1 = alloca i128, i64 %x
+; CHECK: mul i64 16,
+; CHECK: trap
+ %2 = load i128* %1, align 4
+ ret void
+}
+
+; CHECK: @f8
+define void @f8() nounwind {
+ %1 = alloca i128
+ %2 = alloca i128
+ %3 = select i1 undef, i128* %1, i128* %2
+; CHECK-NOT: trap
+ %4 = load i128* %3, align 4
+ ret void
+}
+
+; CHECK: @f9
+define void @f9(i128* %arg) nounwind {
+ %1 = alloca i128
+ %2 = select i1 undef, i128* %arg, i128* %1
+; CHECK-NOT: trap
+ %3 = load i128* %2, align 4
+ ret void
+}
+
+; CHECK: @f10
+define void @f10(i64 %x, i64 %y) nounwind {
+ %1 = alloca i128, i64 %x
+ %2 = alloca i128, i64 %y
+ %3 = select i1 undef, i128* %1, i128* %2
+; CHECK: select
+; CHECK: select
+; CHECK: trap
+ %4 = load i128* %3, align 4
+ ret void
+}
+
+; CHECK: @f11
+define void @f11(i128* byval %x) nounwind {
+ %1 = bitcast i128* %x to i8*
+ %2 = getelementptr inbounds i8* %1, i64 16
+; CHECK: br label
+ %3 = load i8* %2, align 4
+ ret void
+}
+
+; CHECK: @f12
+define i64 @f12(i64 %x, i64 %y) nounwind {
+ %1 = tail call i8* @calloc(i64 1, i64 %x)
+; CHECK: mul i64 %y, 8
+ %2 = bitcast i8* %1 to i64*
+ %3 = getelementptr inbounds i64* %2, i64 %y
+ %4 = load i64* %3, align 8
+ ret i64 %4
+}
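The guard that -bounds-checking emits can be reconstructed from the CHECK lines in @f3. A hedged C sketch, where llvm_trap stands in for @llvm.trap and the helper name is illustrative:

#include <stdint.h>

extern void llvm_trap(void);

/* For an access of access_size bytes at byte offset into an object of
   obj_size bytes, trap when the access cannot fit. Both comparisons
   are unsigned, so an offset past the end also trips the first test. */
static inline void bounds_check(uint64_t obj_size, uint64_t offset,
                                uint64_t access_size) {
    if (obj_size < offset || obj_size - offset < access_size)
        llvm_trap();
}

/* @f3 stores 4 bytes at offset 32 into calloc(4, x), i.e.:
   bounds_check(4 * x, 32, 4); */
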
diff --git a/test/Instrumentation/ThreadSanitizer/atomic.ll b/test/Instrumentation/ThreadSanitizer/atomic.ll
new file mode 100644
index 0000000..02bf215
--- /dev/null
+++ b/test/Instrumentation/ThreadSanitizer/atomic.ll
@@ -0,0 +1,323 @@
+; RUN: opt < %s -tsan -S | FileCheck %s
+; Check that atomic memory operations are converted to calls into the ThreadSanitizer runtime.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i8 @atomic8_load_unordered(i8* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i8* %a unordered, align 1
+ ret i8 %0
+}
+; CHECK: atomic8_load_unordered
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1)
+
+define i8 @atomic8_load_monotonic(i8* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i8* %a monotonic, align 1
+ ret i8 %0
+}
+; CHECK: atomic8_load_monotonic
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1)
+
+define i8 @atomic8_load_acquire(i8* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i8* %a acquire, align 1
+ ret i8 %0
+}
+; CHECK: atomic8_load_acquire
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 4)
+
+define i8 @atomic8_load_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i8* %a seq_cst, align 1
+ ret i8 %0
+}
+; CHECK: atomic8_load_seq_cst
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 32)
+
+define void @atomic8_store_unordered(i8* %a) nounwind uwtable {
+entry:
+ store atomic i8 0, i8* %a unordered, align 1
+ ret void
+}
+; CHECK: atomic8_store_unordered
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1)
+
+define void @atomic8_store_monotonic(i8* %a) nounwind uwtable {
+entry:
+ store atomic i8 0, i8* %a monotonic, align 1
+ ret void
+}
+; CHECK: atomic8_store_monotonic
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1)
+
+define void @atomic8_store_release(i8* %a) nounwind uwtable {
+entry:
+ store atomic i8 0, i8* %a release, align 1
+ ret void
+}
+; CHECK: atomic8_store_release
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 8)
+
+define void @atomic8_store_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ store atomic i8 0, i8* %a seq_cst, align 1
+ ret void
+}
+; CHECK: atomic8_store_seq_cst
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 32)
+
+define i16 @atomic16_load_unordered(i16* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i16* %a unordered, align 2
+ ret i16 %0
+}
+; CHECK: atomic16_load_unordered
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1)
+
+define i16 @atomic16_load_monotonic(i16* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i16* %a monotonic, align 2
+ ret i16 %0
+}
+; CHECK: atomic16_load_monotonic
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1)
+
+define i16 @atomic16_load_acquire(i16* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i16* %a acquire, align 2
+ ret i16 %0
+}
+; CHECK: atomic16_load_acquire
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 4)
+
+define i16 @atomic16_load_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i16* %a seq_cst, align 2
+ ret i16 %0
+}
+; CHECK: atomic16_load_seq_cst
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 32)
+
+define void @atomic16_store_unordered(i16* %a) nounwind uwtable {
+entry:
+ store atomic i16 0, i16* %a unordered, align 2
+ ret void
+}
+; CHECK: atomic16_store_unordered
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1)
+
+define void @atomic16_store_monotonic(i16* %a) nounwind uwtable {
+entry:
+ store atomic i16 0, i16* %a monotonic, align 2
+ ret void
+}
+; CHECK: atomic16_store_monotonic
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1)
+
+define void @atomic16_store_release(i16* %a) nounwind uwtable {
+entry:
+ store atomic i16 0, i16* %a release, align 2
+ ret void
+}
+; CHECK: atomic16_store_release
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 8)
+
+define void @atomic16_store_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ store atomic i16 0, i16* %a seq_cst, align 2
+ ret void
+}
+; CHECK: atomic16_store_seq_cst
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 32)
+
+define i32 @atomic32_load_unordered(i32* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i32* %a unordered, align 4
+ ret i32 %0
+}
+; CHECK: atomic32_load_unordered
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1)
+
+define i32 @atomic32_load_monotonic(i32* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i32* %a monotonic, align 4
+ ret i32 %0
+}
+; CHECK: atomic32_load_monotonic
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1)
+
+define i32 @atomic32_load_acquire(i32* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i32* %a acquire, align 4
+ ret i32 %0
+}
+; CHECK: atomic32_load_acquire
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 4)
+
+define i32 @atomic32_load_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i32* %a seq_cst, align 4
+ ret i32 %0
+}
+; CHECK: atomic32_load_seq_cst
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 32)
+
+define void @atomic32_store_unordered(i32* %a) nounwind uwtable {
+entry:
+ store atomic i32 0, i32* %a unordered, align 4
+ ret void
+}
+; CHECK: atomic32_store_unordered
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1)
+
+define void @atomic32_store_monotonic(i32* %a) nounwind uwtable {
+entry:
+ store atomic i32 0, i32* %a monotonic, align 4
+ ret void
+}
+; CHECK: atomic32_store_monotonic
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1)
+
+define void @atomic32_store_release(i32* %a) nounwind uwtable {
+entry:
+ store atomic i32 0, i32* %a release, align 4
+ ret void
+}
+; CHECK: atomic32_store_release
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 8)
+
+define void @atomic32_store_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ store atomic i32 0, i32* %a seq_cst, align 4
+ ret void
+}
+; CHECK: atomic32_store_seq_cst
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 32)
+
+define i64 @atomic64_load_unordered(i64* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i64* %a unordered, align 8
+ ret i64 %0
+}
+; CHECK: atomic64_load_unordered
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1)
+
+define i64 @atomic64_load_monotonic(i64* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i64* %a monotonic, align 8
+ ret i64 %0
+}
+; CHECK: atomic64_load_monotonic
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1)
+
+define i64 @atomic64_load_acquire(i64* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i64* %a acquire, align 8
+ ret i64 %0
+}
+; CHECK: atomic64_load_acquire
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 4)
+
+define i64 @atomic64_load_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i64* %a seq_cst, align 8
+ ret i64 %0
+}
+; CHECK: atomic64_load_seq_cst
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 32)
+
+define void @atomic64_store_unordered(i64* %a) nounwind uwtable {
+entry:
+ store atomic i64 0, i64* %a unordered, align 8
+ ret void
+}
+; CHECK: atomic64_store_unordered
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1)
+
+define void @atomic64_store_monotonic(i64* %a) nounwind uwtable {
+entry:
+ store atomic i64 0, i64* %a monotonic, align 8
+ ret void
+}
+; CHECK: atomic64_store_monotonic
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1)
+
+define void @atomic64_store_release(i64* %a) nounwind uwtable {
+entry:
+ store atomic i64 0, i64* %a release, align 8
+ ret void
+}
+; CHECK: atomic64_store_release
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 8)
+
+define void @atomic64_store_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ store atomic i64 0, i64* %a seq_cst, align 8
+ ret void
+}
+; CHECK: atomic64_store_seq_cst
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 32)
+
+define i128 @atomic128_load_unordered(i128* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i128* %a unordered, align 16
+ ret i128 %0
+}
+; CHECK: atomic128_load_unordered
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1)
+
+define i128 @atomic128_load_monotonic(i128* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i128* %a monotonic, align 16
+ ret i128 %0
+}
+; CHECK: atomic128_load_monotonic
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1)
+
+define i128 @atomic128_load_acquire(i128* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i128* %a acquire, align 16
+ ret i128 %0
+}
+; CHECK: atomic128_load_acquire
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 4)
+
+define i128 @atomic128_load_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ %0 = load atomic i128* %a seq_cst, align 16
+ ret i128 %0
+}
+; CHECK: atomic128_load_seq_cst
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 32)
+
+define void @atomic128_store_unordered(i128* %a) nounwind uwtable {
+entry:
+ store atomic i128 0, i128* %a unordered, align 16
+ ret void
+}
+; CHECK: atomic128_store_unordered
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1)
+
+define void @atomic128_store_monotonic(i128* %a) nounwind uwtable {
+entry:
+ store atomic i128 0, i128* %a monotonic, align 16
+ ret void
+}
+; CHECK: atomic128_store_monotonic
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1)
+
+define void @atomic128_store_release(i128* %a) nounwind uwtable {
+entry:
+ store atomic i128 0, i128* %a release, align 16
+ ret void
+}
+; CHECK: atomic128_store_release
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 8)
+
+define void @atomic128_store_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ store atomic i128 0, i128* %a seq_cst, align 16
+ ret void
+}
+; CHECK: atomic128_store_seq_cst
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 32)
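Taken together, the CHECK lines above pin down how the instrumentation encodes LLVM atomic orderings as the trailing integer argument of the __tsan_atomicN_load/store calls: unordered and monotonic map to 1, acquire to 4, release to 8, and seq_cst to 32. A minimal sketch of the same pattern at the i8 width, assuming a RUN line of the usual form 'opt < %s -tsan -S | FileCheck %s' for these tests:

; Illustrative only: an acquire load of i8 instrumented the same way as the
; wider cases above (ordering constant 4 = acquire).
define i8 @atomic8_load_acquire(i8* %a) nounwind uwtable {
entry:
  %0 = load atomic i8* %a acquire, align 1
  ret i8 %0
}
; CHECK: atomic8_load_acquire
; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 4)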
diff --git a/test/Integer/packed_struct_bt.ll b/test/Integer/packed_struct_bt.ll
index a4d01e7..257c1c6 100644
--- a/test/Integer/packed_struct_bt.ll
+++ b/test/Integer/packed_struct_bt.ll
@@ -2,7 +2,7 @@
; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
; RUN: diff %t1.ll %t2.ll
; RUN: not grep cast %t2.ll
-; RUN: grep {\\}>} %t2.ll
+; RUN: grep "}>" %t2.ll
; END.
%struct.anon = type <{ i8, i35, i35, i35 }>
diff --git a/test/Integer/varargs_bt.ll b/test/Integer/varargs_bt.ll
deleted file mode 100644
index 25ad58a..0000000
--- a/test/Integer/varargs_bt.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llvm-as %s -o - | llvm-dis > %t1.ll
-; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
-; RUN: diff %t1.ll %t2.ll
-
-; Demonstrate all of the variable argument handling intrinsic functions plus
-; the va_arg instruction.
-
-declare void @llvm.va_start(i8** %ap)
-declare void @llvm.va_copy(i8** %aq, i8** %ap)
-declare void @llvm.va_end(i8** %ap)
-
-define i33 @test(i33 %X, ...) {
- %ap = alloca i8*
- call void @llvm.va_start(i8** %ap)
- %tmp = va_arg i8** %ap, i33
-
- %aq = alloca i8*
- call void @llvm.va_copy(i8** %aq, i8** %ap)
- call void @llvm.va_end(i8** %aq)
-
- call void @llvm.va_end(i8** %ap)
- ret i33 %tmp
-}
diff --git a/test/Integer/varargs_new_bt.ll b/test/Integer/varargs_new_bt.ll
deleted file mode 100644
index 59bb3f2..0000000
--- a/test/Integer/varargs_new_bt.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: llvm-as %s -o - | llvm-dis > %t1.ll
-; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
-; RUN: diff %t1.ll %t2.ll
-
-; Demonstrate all of the variable argument handling intrinsic functions plus
-; the va_arg instruction.
-
-declare void @llvm.va_start(i8**)
-declare void @llvm.va_copy(i8**, i8*)
-declare void @llvm.va_end(i8**)
-
-define i31 @test(i31 %X, ...) {
- ; Allocate two va_list items. On this target, va_list is of type i8*
- %ap = alloca i8* ; <i8**> [#uses=4]
- %aq = alloca i8* ; <i8**> [#uses=2]
-
- ; Initialize variable argument processing
- call void @llvm.va_start(i8** %ap)
-
- ; Read a single integer argument
- %tmp = va_arg i8** %ap, i31 ; <i31> [#uses=1]
-
- ; Demonstrate usage of llvm.va_copy and llvm_va_end
- %apv = load i8** %ap ; <i8*> [#uses=1]
- call void @llvm.va_copy(i8** %aq, i8* %apv)
- call void @llvm.va_end(i8** %aq)
-
- ; Stop processing of arguments.
- call void @llvm.va_end(i8** %ap)
- ret i31 %tmp
-
-}
diff --git a/test/Linker/2003-01-30-LinkerRename.ll b/test/Linker/2003-01-30-LinkerRename.ll
index cc34634..e7431ec 100644
--- a/test/Linker/2003-01-30-LinkerRename.ll
+++ b/test/Linker/2003-01-30-LinkerRename.ll
@@ -1,9 +1,9 @@
; This fails because the linker renames the external symbol, not the internal
; one...
-; RUN: echo {define internal i32 @foo() \{ ret i32 7 \} } | llvm-as > %t.1.bc
+; RUN: echo "define internal i32 @foo() { ret i32 7 } " | llvm-as > %t.1.bc
; RUN: llvm-as %s -o %t.2.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | grep {@foo()} | grep -v internal
+; RUN: llvm-link %t.1.bc %t.2.bc -S | grep "@foo()" | grep -v internal
define i32 @foo() { ret i32 0 }
diff --git a/test/Linker/2003-01-30-LinkerTypeRename.ll b/test/Linker/2003-01-30-LinkerTypeRename.ll
index 043457d..94fb5e0 100644
--- a/test/Linker/2003-01-30-LinkerTypeRename.ll
+++ b/test/Linker/2003-01-30-LinkerTypeRename.ll
@@ -1,9 +1,9 @@
; This fails because the linker renames the non-opaque type not the opaque
; one...
-; RUN: echo {%%Ty = type opaque @GV = external global %%Ty*} | llvm-as > %t.1.bc
+; RUN: echo "%%Ty = type opaque @GV = external global %%Ty*" | llvm-as > %t.1.bc
; RUN: llvm-as < %s > %t.2.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | grep {%%Ty } | not grep opaque
+; RUN: llvm-link %t.1.bc %t.2.bc -S | grep "%%Ty " | not grep opaque
%Ty = type {i32}
diff --git a/test/Linker/2003-04-21-Linkage.ll b/test/Linker/2003-04-21-Linkage.ll
deleted file mode 100644
index f6d4c4b..0000000
--- a/test/Linker/2003-04-21-Linkage.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: echo {@X = linkonce global i32 5 \
-; RUN: define linkonce i32 @foo() \{ ret i32 7 \} } | llvm-as > %t.1.bc
-; RUN: llvm-as %s -o %t.2.bc
-; RUN: llvm-link %t.1.bc %t.2.bc
-@X = external global i32
-
-declare i32 @foo()
-
-define void @bar() {
- load i32* @X
- call i32 @foo()
- ret void
-}
-
diff --git a/test/Linker/2003-04-23-LinkOnceLost.ll b/test/Linker/2003-04-23-LinkOnceLost.ll
index beaf6ec..98a943a 100644
--- a/test/Linker/2003-04-23-LinkOnceLost.ll
+++ b/test/Linker/2003-04-23-LinkOnceLost.ll
@@ -1,7 +1,7 @@
; This fails if the linker drops the linkonce function body during linking;
; the linked output must still contain @foo with linkonce linkage.
-; RUN: echo { define linkonce void @foo() \{ ret void \} } | \
+; RUN: echo " define linkonce void @foo() { ret void } " | \
; RUN: llvm-as -o %t.2.bc
; RUN: llvm-as %s -o %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc -S | grep foo | grep linkonce
diff --git a/test/Linker/2003-04-26-NullPtrLinkProblem.ll b/test/Linker/2003-04-26-NullPtrLinkProblem.ll
index d23df1b..5e8249b 100644
--- a/test/Linker/2003-04-26-NullPtrLinkProblem.ll
+++ b/test/Linker/2003-04-26-NullPtrLinkProblem.ll
@@ -1,7 +1,7 @@
; This one fails because the LLVM runtime is allowing two null pointers of
; the same type to be created!
-; RUN: echo {%%T = type i32} | llvm-as > %t.2.bc
+; RUN: echo "%%T = type i32" | llvm-as > %t.2.bc
; RUN: llvm-as %s -o %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc
diff --git a/test/Linker/2003-05-15-TypeProblem.ll b/test/Linker/2003-05-15-TypeProblem.ll
index 18fcea0..c1fe334 100644
--- a/test/Linker/2003-05-15-TypeProblem.ll
+++ b/test/Linker/2003-05-15-TypeProblem.ll
@@ -1,7 +1,7 @@
; This one fails if the linker cannot resolve the opaque type %N against its
; definition while merging the two modules' types.
-; RUN: echo {%M = type \{ %N*\} %N = type opaque} | llvm-as > %t.2.bc
+; RUN: echo "%M = type { %N*} %N = type opaque" | llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc
diff --git a/test/Linker/2003-05-31-LinkerRename.ll b/test/Linker/2003-05-31-LinkerRename.ll
index 80e0a69..dff861d 100644
--- a/test/Linker/2003-05-31-LinkerRename.ll
+++ b/test/Linker/2003-05-31-LinkerRename.ll
@@ -4,9 +4,9 @@
; the function name, we must rename the internal function to something that
; does not conflict.
-; RUN: echo { define internal i32 @foo() \{ ret i32 7 \} } | llvm-as > %t.1.bc
+; RUN: echo " define internal i32 @foo() { ret i32 7 } " | llvm-as > %t.1.bc
; RUN: llvm-as < %s > %t.2.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | grep internal | not grep {@foo(}
+; RUN: llvm-link %t.1.bc %t.2.bc -S | grep internal | not grep "@foo("
declare i32 @foo()
diff --git a/test/Linker/2003-06-02-TypeResolveProblem.ll b/test/Linker/2003-06-02-TypeResolveProblem.ll
index 0b0e9c1..fa24b6d 100644
--- a/test/Linker/2003-06-02-TypeResolveProblem.ll
+++ b/test/Linker/2003-06-02-TypeResolveProblem.ll
@@ -1,4 +1,4 @@
-; RUN: echo {%%T = type opaque} | llvm-as > %t.2.bc
+; RUN: echo "%%T = type opaque" | llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc
diff --git a/test/Linker/2003-06-02-TypeResolveProblem2.ll b/test/Linker/2003-06-02-TypeResolveProblem2.ll
index 3f9fd04..3ae23a2 100644
--- a/test/Linker/2003-06-02-TypeResolveProblem2.ll
+++ b/test/Linker/2003-06-02-TypeResolveProblem2.ll
@@ -1,4 +1,4 @@
-; RUN: echo {%%T = type i32} | llvm-as > %t.1.bc
+; RUN: echo "%%T = type i32" | llvm-as > %t.1.bc
; RUN: llvm-as < %s > %t.2.bc
; RUN: llvm-link %t.1.bc %t.2.bc
diff --git a/test/Linker/2003-08-20-OpaqueTypeResolve.ll b/test/Linker/2003-08-20-OpaqueTypeResolve.ll
index c0fc620..175146f 100644
--- a/test/Linker/2003-08-20-OpaqueTypeResolve.ll
+++ b/test/Linker/2003-08-20-OpaqueTypeResolve.ll
@@ -1,5 +1,5 @@
; RUN: llvm-as < %s > %t.out1.bc
-; RUN: echo {%M = type \{ i32, i32* \} } | llvm-as > %t.out2.bc
+; RUN: echo "%M = type { i32, i32* } " | llvm-as > %t.out2.bc
; RUN: llvm-link %t.out1.bc %t.out2.bc
%M = type { i32, %N* }
diff --git a/test/Linker/2003-08-23-GlobalVarLinking.ll b/test/Linker/2003-08-23-GlobalVarLinking.ll
index 255cb88..e934836 100644
--- a/test/Linker/2003-08-23-GlobalVarLinking.ll
+++ b/test/Linker/2003-08-23-GlobalVarLinking.ll
@@ -1,5 +1,5 @@
; RUN: llvm-as < %s > %t.out1.bc
-; RUN: echo {%%T1 = type opaque %%T2 = type opaque @S = external global \{ i32, %%T1* \} declare void @F(%%T2*)}\
+; RUN: echo "%%T1 = type opaque %%T2 = type opaque @S = external global { i32, %%T1* } declare void @F(%%T2*)"\
; RUN: | llvm-as > %t.out2.bc
; RUN: llvm-link %t.out1.bc %t.out2.bc -S | not grep opaque
diff --git a/test/Linker/2003-08-24-InheritPtrSize.ll b/test/Linker/2003-08-24-InheritPtrSize.ll
index f93c054..51d544b 100644
--- a/test/Linker/2003-08-24-InheritPtrSize.ll
+++ b/test/Linker/2003-08-24-InheritPtrSize.ll
@@ -2,8 +2,8 @@
; specified pointer size should not cause a warning!
; RUN: llvm-as < %s > %t.out1.bc
-; RUN: echo {} | llvm-as > %t.out2.bc
-; RUN: llvm-link %t.out1.bc %t.out2.bc |& not grep warning
+; RUN: echo "" | llvm-as > %t.out2.bc
+; RUN: llvm-link %t.out1.bc %t.out2.bc 2>&1 | not grep warning
target datalayout = "e-p:64:64"
diff --git a/test/Linker/2004-12-03-DisagreeingType.ll b/test/Linker/2004-12-03-DisagreeingType.ll
index 570bda8..73d7a40 100644
--- a/test/Linker/2004-12-03-DisagreeingType.ll
+++ b/test/Linker/2004-12-03-DisagreeingType.ll
@@ -1,7 +1,7 @@
-; RUN: echo {@G = weak global \{\{\{\{double\}\}\}\} zeroinitializer } | \
+; RUN: echo "@G = weak global {{{{double}}}} zeroinitializer " | \
; RUN: llvm-as > %t.out2.bc
; RUN: llvm-as < %s > %t.out1.bc
-; RUN: llvm-link %t.out1.bc %t.out2.bc -S | not grep {\\}}
+; RUN: llvm-link %t.out1.bc %t.out2.bc -S | not grep "}"
; When linked, the global above should be eliminated, being merged with the
; global below.
diff --git a/test/Linker/2005-02-12-ConstantGlobals-2.ll b/test/Linker/2005-02-12-ConstantGlobals-2.ll
index 2ceae31..30bfafe 100644
--- a/test/Linker/2005-02-12-ConstantGlobals-2.ll
+++ b/test/Linker/2005-02-12-ConstantGlobals-2.ll
@@ -1,8 +1,8 @@
; Test that a prototype can be marked const, and the definition is allowed
; to be nonconst.
-; RUN: echo {@X = external constant i32} | llvm-as > %t.2.bc
+; RUN: echo "@X = external constant i32" | llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | grep {global i32 7}
+; RUN: llvm-link %t.1.bc %t.2.bc -S | grep "global i32 7"
@X = global i32 7
diff --git a/test/Linker/2005-02-12-ConstantGlobals.ll b/test/Linker/2005-02-12-ConstantGlobals.ll
index 60f176b..93709cf 100644
--- a/test/Linker/2005-02-12-ConstantGlobals.ll
+++ b/test/Linker/2005-02-12-ConstantGlobals.ll
@@ -1,8 +1,8 @@
; Test that a prototype can be marked const, and the definition is allowed
; to be nonconst.
-; RUN: echo {@X = global i32 7} | llvm-as > %t.2.bc
+; RUN: echo "@X = global i32 7" | llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | grep {global i32 7}
+; RUN: llvm-link %t.1.bc %t.2.bc -S | grep "global i32 7"
@X = external constant i32 ; <i32*> [#uses=0]
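Both halves of this pair are visible, so the intended merge can be sketched directly: the non-constant definition of @X supplies the body, and the grep for "global i32 7" matches the surviving definition. Illustrative expected output (the exact printing is an assumption, not taken from the test):

; Sketch of @X in the linked module: the definition wins over the external
; constant prototype.
@X = global i32 7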
diff --git a/test/Linker/2005-12-06-AppendingZeroLengthArrays.ll b/test/Linker/2005-12-06-AppendingZeroLengthArrays.ll
index 7d1020d..d7a34c8 100644
--- a/test/Linker/2005-12-06-AppendingZeroLengthArrays.ll
+++ b/test/Linker/2005-12-06-AppendingZeroLengthArrays.ll
@@ -1,7 +1,7 @@
-; RUN: echo { @G = appending global \[0 x i32\] zeroinitializer } | \
+; RUN: echo " @G = appending global [0 x i32] zeroinitializer " | \
; RUN: llvm-as > %t.out2.bc
; RUN: llvm-as < %s > %t.out1.bc
-; RUN: llvm-link %t.out1.bc %t.out2.bc -S | grep {@G =}
+; RUN: llvm-link %t.out1.bc %t.out2.bc -S | grep "@G ="
; When linked, the globals should be merged, and the result should still
; be named '@G'.
diff --git a/test/Linker/2006-06-15-GlobalVarAlignment.ll b/test/Linker/2006-06-15-GlobalVarAlignment.ll
index df3284b..eec8f63 100644
--- a/test/Linker/2006-06-15-GlobalVarAlignment.ll
+++ b/test/Linker/2006-06-15-GlobalVarAlignment.ll
@@ -1,7 +1,7 @@
; The linker should choose the largest alignment when linking.
-; RUN: echo {@X = global i32 7, align 8} | llvm-as > %t.2.bc
+; RUN: echo "@X = global i32 7, align 8" | llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | grep {align 8}
+; RUN: llvm-link %t.1.bc %t.2.bc -S | grep "align 8"
@X = weak global i32 7, align 4
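Both inputs are in view here, so the expected result is concrete: the strong definition with align 8 replaces the weak one with align 4, which is also the larger alignment the comment calls for. A sketch of the merged global that grep "align 8" should match (printing assumed):

; Expected merged global, keeping the larger alignment.
@X = global i32 7, align 8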
diff --git a/test/Linker/2008-03-07-DroppedSection_a.ll b/test/Linker/2008-03-07-DroppedSection_a.ll
index 4458971..ec9d5c2 100644
--- a/test/Linker/2008-03-07-DroppedSection_a.ll
+++ b/test/Linker/2008-03-07-DroppedSection_a.ll
@@ -1,6 +1,6 @@
; RUN: llvm-as < %s > %t.bc
; RUN: llvm-as < %p/2008-03-07-DroppedSection_b.ll > %t2.bc
-; RUN: llvm-ld -r -disable-opt %t.bc %t2.bc -o %t3.bc
+; RUN: llvm-link %t.bc %t2.bc -o %t3.bc
; RUN: llvm-dis < %t3.bc | grep ".data.init_task"
; ModuleID = 't.bc'
diff --git a/test/Linker/2008-03-07-DroppedSection_b.ll b/test/Linker/2008-03-07-DroppedSection_b.ll
index 884bf0a..63b64f6 100644
--- a/test/Linker/2008-03-07-DroppedSection_b.ll
+++ b/test/Linker/2008-03-07-DroppedSection_b.ll
@@ -1,6 +1,6 @@
; RUN: llvm-as < %s > %t.bc
; RUN: llvm-as < %p/2008-03-07-DroppedSection_a.ll > %t2.bc
-; RUN: llvm-ld -r -disable-opt %t.bc %t2.bc -o %t3.bc
+; RUN: llvm-link %t.bc %t2.bc -o %t3.bc
; RUN: llvm-dis < %t3.bc | grep ".data.init_task"
; ModuleID = 'u.bc'
diff --git a/test/Linker/2008-06-13-LinkOnceRedefinition.ll b/test/Linker/2008-06-13-LinkOnceRedefinition.ll
index 49da96a..da4b48d 100644
--- a/test/Linker/2008-06-13-LinkOnceRedefinition.ll
+++ b/test/Linker/2008-06-13-LinkOnceRedefinition.ll
@@ -2,7 +2,7 @@
; in different modules.
; RUN: llvm-as %s -o %t.foo1.bc
; RUN: llvm-as %s -o %t.foo2.bc
-; RUN: echo {define linkonce void @foo(i32 %x) { ret void }} | llvm-as -o %t.foo3.bc
+; RUN: echo "define linkonce void @foo(i32 %x) { ret void }" | llvm-as -o %t.foo3.bc
; RUN: llvm-link %t.foo1.bc %t.foo2.bc -S
; RUN: llvm-link %t.foo1.bc %t.foo3.bc -S
define linkonce void @foo() { ret void }
diff --git a/test/Linker/2008-06-26-AddressSpace.ll b/test/Linker/2008-06-26-AddressSpace.ll
index e3ed385..e1d3574 100644
--- a/test/Linker/2008-06-26-AddressSpace.ll
+++ b/test/Linker/2008-06-26-AddressSpace.ll
@@ -2,8 +2,8 @@
; in different modules.
; RUN: llvm-as %s -o %t.foo1.bc
; RUN: echo | llvm-as -o %t.foo2.bc
-; RUN: llvm-link %t.foo2.bc %t.foo1.bc -S | grep {addrspace(2)}
-; RUN: llvm-link %t.foo1.bc %t.foo2.bc -S | grep {addrspace(2)}
+; RUN: llvm-link %t.foo2.bc %t.foo1.bc -S | grep "addrspace(2)"
+; RUN: llvm-link %t.foo1.bc %t.foo2.bc -S | grep "addrspace(2)"
; rdar://6038021
@G = addrspace(2) global i32 256
diff --git a/test/Linker/AppendingLinkage.ll b/test/Linker/AppendingLinkage.ll
index 134a42e..014ead9 100644
--- a/test/Linker/AppendingLinkage.ll
+++ b/test/Linker/AppendingLinkage.ll
@@ -1,6 +1,6 @@
; Test that appending linkage works correctly.
-; RUN: echo {@X = appending global \[1 x i32\] \[i32 8\] } | \
+; RUN: echo "@X = appending global [1 x i32] [i32 8] " | \
; RUN: llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc -S | grep 7 | grep 4 | grep 8
diff --git a/test/Linker/AppendingLinkage2.ll b/test/Linker/AppendingLinkage2.ll
index 2c1302f..7385efb 100644
--- a/test/Linker/AppendingLinkage2.ll
+++ b/test/Linker/AppendingLinkage2.ll
@@ -1,6 +1,6 @@
; Test that appending linkage works correctly when arrays are the same size.
-; RUN: echo {@X = appending global \[1 x i32\] \[i32 8\] } | \
+; RUN: echo "@X = appending global [1 x i32] [i32 8] " | \
; RUN: llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc -S | grep 7 | grep 8
diff --git a/test/Linker/ConstantGlobals1.ll b/test/Linker/ConstantGlobals1.ll
index 8fdbe50..716eb3d 100644
--- a/test/Linker/ConstantGlobals1.ll
+++ b/test/Linker/ConstantGlobals1.ll
@@ -1,6 +1,6 @@
; Test that linking a constant array definition into this module keeps it constant.
-; RUN: echo {@X = constant \[1 x i32\] \[i32 8\] } | \
+; RUN: echo "@X = constant [1 x i32] [i32 8] " | \
; RUN: llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc -S | grep constant
diff --git a/test/Linker/ConstantGlobals2.ll b/test/Linker/ConstantGlobals2.ll
index ad4428b..ad0f8e2 100644
--- a/test/Linker/ConstantGlobals2.ll
+++ b/test/Linker/ConstantGlobals2.ll
@@ -1,6 +1,6 @@
; Test that a constant definition is not demoted when linked against a plain external declaration.
-; RUN: echo {@X = external global \[1 x i32\] } | \
+; RUN: echo "@X = external global [1 x i32] " | \
; RUN: llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc -S | grep constant
diff --git a/test/Linker/ConstantGlobals3.ll b/test/Linker/ConstantGlobals3.ll
index e25529a..5aa26bc 100644
--- a/test/Linker/ConstantGlobals3.ll
+++ b/test/Linker/ConstantGlobals3.ll
@@ -1,6 +1,6 @@
; Test that an external constant declaration resolves to a definition that stays constant.
-; RUN: echo {@X = external constant \[1 x i32\] } | \
+; RUN: echo "@X = external constant [1 x i32] " | \
; RUN: llvm-as > %t.2.bc
; RUN: llvm-as < %s > %t.1.bc
; RUN: llvm-link %t.1.bc %t.2.bc -S | grep constant
diff --git a/test/Linker/Inputs/PR11464.a.ll b/test/Linker/Inputs/PR11464.a.ll
new file mode 100644
index 0000000..25a9350
--- /dev/null
+++ b/test/Linker/Inputs/PR11464.a.ll
@@ -0,0 +1,3 @@
+%bug_type = type opaque
+declare i32 @bug_a(%bug_type*)
+declare i32 @bug_b(%bug_type*)
diff --git a/test/Linker/Inputs/PR11464.b.ll b/test/Linker/Inputs/PR11464.b.ll
new file mode 100644
index 0000000..7ef5a36
--- /dev/null
+++ b/test/Linker/Inputs/PR11464.b.ll
@@ -0,0 +1,13 @@
+%bug_type = type { %bug_type* }
+%bar = type { i32 }
+
+define i32 @bug_a(%bug_type* %fp) nounwind uwtable {
+entry:
+ %d_stream = getelementptr inbounds %bug_type* %fp, i64 0, i32 0
+ ret i32 0
+}
+
+define i32 @bug_b(%bar* %a) nounwind uwtable {
+entry:
+ ret i32 0
+}
diff --git a/test/Linker/Inputs/PR8300.a.ll b/test/Linker/Inputs/PR8300.a.ll
new file mode 100644
index 0000000..c705db3
--- /dev/null
+++ b/test/Linker/Inputs/PR8300.a.ll
@@ -0,0 +1,2 @@
+%foo2 = type { [8 x i8] }
+declare void @zed(%foo2*)
diff --git a/test/Linker/Inputs/PR8300.b.ll b/test/Linker/Inputs/PR8300.b.ll
new file mode 100644
index 0000000..9e538f5
--- /dev/null
+++ b/test/Linker/Inputs/PR8300.b.ll
@@ -0,0 +1,9 @@
+%foo = type { [8 x i8] }
+%bar = type { [9 x i8] }
+
+@zed = alias bitcast (void (%bar*)* @xyz to void (%foo*)*)
+
+define void @xyz(%bar* %this) {
+entry:
+ ret void
+}
diff --git a/test/Linker/Inputs/basiclink.a.ll b/test/Linker/Inputs/basiclink.a.ll
new file mode 100644
index 0000000..997932d
--- /dev/null
+++ b/test/Linker/Inputs/basiclink.a.ll
@@ -0,0 +1,2 @@
+define i32* @foo(i32 %x) { ret i32* @baz }
+@baz = external global i32
diff --git a/test/Linker/Inputs/basiclink.b.ll b/test/Linker/Inputs/basiclink.b.ll
new file mode 100644
index 0000000..0d2abc7
--- /dev/null
+++ b/test/Linker/Inputs/basiclink.b.ll
@@ -0,0 +1,6 @@
+declare i32* @foo(...)
+define i32* @bar() {
+ %ret = call i32* (...)* @foo( i32 123 )
+ ret i32* %ret
+}
+@baz = global i32 0
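The two inputs deliberately disagree on @foo's prototype: a varargs declaration here versus an (i32) definition in basiclink.a.ll. When llvm-link merges them, the call through the declaration is expected to be routed through a constant cast of the real definition, roughly as follows (a sketch, not verified linker output):

; Hypothetical shape of @bar after linking: the varargs call goes through a
; bitcast of the concrete i32* (i32)* definition of @foo.
define i32* @bar() {
  %ret = call i32* (...)* bitcast (i32* (i32)* @foo to i32* (...)*)(i32 123)
  ret i32* %ret
}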
diff --git a/test/Linker/Inputs/linkage.a.ll b/test/Linker/Inputs/linkage.a.ll
new file mode 100644
index 0000000..8a156f6
--- /dev/null
+++ b/test/Linker/Inputs/linkage.a.ll
@@ -0,0 +1,2 @@
+@X = linkonce global i32 5
+define linkonce i32 @foo() { ret i32 7 }
diff --git a/test/Linker/Inputs/linkage.b.ll b/test/Linker/Inputs/linkage.b.ll
new file mode 100644
index 0000000..0ada3f4
--- /dev/null
+++ b/test/Linker/Inputs/linkage.b.ll
@@ -0,0 +1,10 @@
+@X = external global i32
+
+declare i32 @foo()
+
+define void @bar() {
+ load i32* @X
+ call i32 @foo()
+ ret void
+}
+
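These inputs recreate the deleted 2003-04-21-Linkage.ll as a split test: linkage.a.ll supplies the linkonce definitions and linkage.b.ll consumes them as externals. After a successful link the merged module should contain roughly the following (a sketch; the printing is assumed):

; Illustrative merged module: the linkonce bodies satisfy the external
; references from linkage.b.ll.
@X = linkonce global i32 5

define linkonce i32 @foo() { ret i32 7 }

define void @bar() {
  load i32* @X
  call i32 @foo()
  ret void
}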
diff --git a/test/Linker/PR8300.ll b/test/Linker/PR8300.ll
index f0fc1e7..7c03d5b 100644
--- a/test/Linker/PR8300.ll
+++ b/test/Linker/PR8300.ll
@@ -1,13 +1 @@
-; RUN: echo {%foo2 = type \{ \[8 x i8\] \} \
-; RUN: declare void @zed(%foo2*) } > %t.ll
-; RUN: llvm-link %t.ll %s -o %t.bc
-
-%foo = type { [8 x i8] }
-%bar = type { [9 x i8] }
-
-@zed = alias bitcast (void (%bar*)* @xyz to void (%foo*)*)
-
-define void @xyz(%bar* %this) {
-entry:
- ret void
-}
+; RUN: llvm-link %S/Inputs/PR8300.a.ll %S/Inputs/PR8300.b.ll -o %t.bc
diff --git a/test/Linker/basiclink.ll b/test/Linker/basiclink.ll
index afe0320..804329a 100644
--- a/test/Linker/basiclink.ll
+++ b/test/Linker/basiclink.ll
@@ -1,13 +1,6 @@
; Test linking two functions with different prototypes and two globals
; in different modules. This is for PR411
-; RUN: llvm-as %s -o %t.bar.bc
-; RUN: echo {define i32* @foo(i32 %x) \{ ret i32* @baz \} \
-; RUN: @baz = external global i32 } | llvm-as -o %t.foo.bc
-; RUN: llvm-link %t.bar.bc %t.foo.bc -o %t.bc
+; RUN: llvm-as %S/Inputs/basiclink.a.ll -o %t.foo.bc
+; RUN: llvm-as %S/Inputs/basiclink.b.ll -o %t.bar.bc
; RUN: llvm-link %t.foo.bc %t.bar.bc -o %t.bc
-declare i32* @foo(...)
-define i32* @bar() {
- %ret = call i32* (...)* @foo( i32 123 )
- ret i32* %ret
-}
-@baz = global i32 0
+; RUN: llvm-link %t.bar.bc %t.foo.bc -o %t.bc
diff --git a/test/Linker/link-archive.ll b/test/Linker/link-archive.ll
deleted file mode 100644
index 9251b4e..0000000
--- a/test/Linker/link-archive.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; Test linking of a bc file to an archive via llvm-ld.
-; PR1434
-; RUN: rm -f %t.bar.a %t.foo.a
-; RUN: llvm-as %s -o %t.bar.bc
-; RUN: echo {define i32* @foo(i32 %x) \{ ret i32* @baz \} \
-; RUN: @baz = external global i32 } | llvm-as -o %t.foo.bc
-; RUN: llvm-ar rcf %t.foo.a %t.foo.bc
-; RUN: llvm-ar rcf %t.bar.a %t.bar.bc
-; RUN: llvm-ld -disable-opt %t.bar.bc %t.foo.a -o %t.bc
-; RUN: llvm-ld -disable-opt %t.foo.bc %t.bar.a -o %t.bc
-declare i32* @foo(...)
-define i32* @bar() {
- %ret = call i32* (...)* @foo( i32 123 )
- ret i32* %ret
-}
-@baz = global i32 0
diff --git a/test/Linker/link-global-to-func.ll b/test/Linker/link-global-to-func.ll
index 2fc501d..9d969d7 100644
--- a/test/Linker/link-global-to-func.ll
+++ b/test/Linker/link-global-to-func.ll
@@ -1,5 +1,5 @@
; RUN: llvm-as %s -o %t1.bc
-; RUN: echo {declare void @__eprintf(i8*, i8*, i32, i8*) noreturn define void @foo() { tail call void @__eprintf( i8* undef, i8* undef, i32 4, i8* null ) noreturn nounwind unreachable }} | llvm-as -o %t2.bc
+; RUN: echo "declare void @__eprintf(i8*, i8*, i32, i8*) noreturn define void @foo() { tail call void @__eprintf( i8* undef, i8* undef, i32 4, i8* null ) noreturn nounwind unreachable }" | llvm-as -o %t2.bc
; RUN: llvm-link %t2.bc %t1.bc -S | grep __eprintf
; RUN: llvm-link %t1.bc %t2.bc -S | grep __eprintf
diff --git a/test/Linker/link-messages.ll b/test/Linker/link-messages.ll
index 920782d..4e7ffbc 100644
--- a/test/Linker/link-messages.ll
+++ b/test/Linker/link-messages.ll
@@ -2,10 +2,9 @@
; that error is printed out.
; RUN: llvm-as %s -o %t.one.bc
; RUN: llvm-as %s -o %t.two.bc
-; RUN: not llvm-ld -disable-opt -link-as-library %t.one.bc %t.two.bc \
-; RUN: -o %t.bc 2>%t.err
-; RUN: grep "symbol multiply defined" %t.err
+; RUN: not llvm-link %t.one.bc %t.two.bc -o %t.bc 2>&1 | FileCheck %s
+; CHECK: symbol multiply defined
define i32 @bar() {
- ret i32 0
+ ret i32 0
}
diff --git a/test/Linker/linkage.ll b/test/Linker/linkage.ll
new file mode 100644
index 0000000..c7309aa
--- /dev/null
+++ b/test/Linker/linkage.ll
@@ -0,0 +1,3 @@
+; RUN: llvm-as %S/Inputs/linkage.a.ll -o %t.1.bc
+; RUN: llvm-as %S/Inputs/linkage.b.ll -o %t.2.bc
+; RUN: llvm-link %t.1.bc %t.2.bc
diff --git a/test/Linker/module-flags-4-a.ll b/test/Linker/module-flags-4-a.ll
index f411a56..a656c8b 100644
--- a/test/Linker/module-flags-4-a.ll
+++ b/test/Linker/module-flags-4-a.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-link %s %p/module-flags-4-b.ll -S -o - |& FileCheck %s
+; RUN: not llvm-link %s %p/module-flags-4-b.ll -S -o - 2>&1 | FileCheck %s
; Test 'require' error.
diff --git a/test/Linker/module-flags-5-a.ll b/test/Linker/module-flags-5-a.ll
index 2e59ecc..8d625cd 100644
--- a/test/Linker/module-flags-5-a.ll
+++ b/test/Linker/module-flags-5-a.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-link %s %p/module-flags-5-b.ll -S -o - |& FileCheck %s
+; RUN: not llvm-link %s %p/module-flags-5-b.ll -S -o - 2>&1 | FileCheck %s
; Test the 'override' error.
diff --git a/test/Linker/module-flags-6-a.ll b/test/Linker/module-flags-6-a.ll
index c3e0225..5329c43 100644
--- a/test/Linker/module-flags-6-a.ll
+++ b/test/Linker/module-flags-6-a.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-link %s %p/module-flags-6-b.ll -S -o - |& FileCheck %s
+; RUN: not llvm-link %s %p/module-flags-6-b.ll -S -o - 2>&1 | FileCheck %s
; Test module flags error messages.
diff --git a/test/Linker/multiple-merged-structs.ll b/test/Linker/multiple-merged-structs.ll
index 348cd89..aa8204d 100644
--- a/test/Linker/multiple-merged-structs.ll
+++ b/test/Linker/multiple-merged-structs.ll
@@ -1,19 +1,2 @@
-; RUN: echo {%bug_type = type opaque \
-; RUN: declare i32 @bug_a(%bug_type*) \
-; RUN: declare i32 @bug_b(%bug_type*) } > %t.ll
-; RUN: llvm-link %t.ll %s
+; RUN: llvm-link %S/Inputs/PR11464.a.ll %S/Inputs/PR11464.b.ll
; PR11464
-
-%bug_type = type { %bug_type* }
-%bar = type { i32 }
-
-define i32 @bug_a(%bug_type* %fp) nounwind uwtable {
-entry:
- %d_stream = getelementptr inbounds %bug_type* %fp, i64 0, i32 0
- ret i32 0
-}
-
-define i32 @bug_b(%bar* %a) nounwind uwtable {
-entry:
- ret i32 0
-}
diff --git a/test/Linker/redefinition.ll b/test/Linker/redefinition.ll
index 0d05689..23ba6a1 100644
--- a/test/Linker/redefinition.ll
+++ b/test/Linker/redefinition.ll
@@ -2,9 +2,9 @@
; in different modules.
; RUN: llvm-as %s -o %t.foo1.bc
; RUN: llvm-as %s -o %t.foo2.bc
-; RUN: echo {define void @foo(i32 %x) { ret void }} | llvm-as -o %t.foo3.bc
-; RUN: not llvm-link %t.foo1.bc %t.foo2.bc -o %t.bc |& \
-; RUN: grep {symbol multiply defined}
-; RUN: not llvm-link %t.foo1.bc %t.foo3.bc -o %t.bc |& \
-; RUN: grep {symbol multiply defined}
+; RUN: echo "define void @foo(i32 %x) { ret void }" | llvm-as -o %t.foo3.bc
+; RUN: not llvm-link %t.foo1.bc %t.foo2.bc -o %t.bc 2>&1 | \
+; RUN: grep "symbol multiply defined"
+; RUN: not llvm-link %t.foo1.bc %t.foo3.bc -o %t.bc 2>&1 | \
+; RUN: grep "symbol multiply defined"
define void @foo() { ret void }
diff --git a/test/Linker/weakextern.ll b/test/Linker/weakextern.ll
index aa38b12..3a72a48 100644
--- a/test/Linker/weakextern.ll
+++ b/test/Linker/weakextern.ll
@@ -1,9 +1,9 @@
; RUN: llvm-as < %s > %t.bc
; RUN: llvm-as < %p/testlink1.ll > %t2.bc
; RUN: llvm-link %t.bc %t.bc %t2.bc -o %t1.bc
-; RUN: llvm-dis < %t1.bc | grep {kallsyms_names = extern_weak}
-; RUN: llvm-dis < %t1.bc | grep {MyVar = external global i32}
-; RUN: llvm-dis < %t1.bc | grep {Inte = global i32}
+; RUN: llvm-dis < %t1.bc | grep "kallsyms_names = extern_weak"
+; RUN: llvm-dis < %t1.bc | grep "MyVar = external global i32"
+; RUN: llvm-dis < %t1.bc | grep "Inte = global i32"
@kallsyms_names = extern_weak global [0 x i8] ; <[0 x i8]*> [#uses=0]
@MyVar = extern_weak global i32 ; <i32*> [#uses=0]
diff --git a/test/MC/ARM/arm_fixups.s b/test/MC/ARM/arm_fixups.s
index 74dfb99..99eb3c5 100644
--- a/test/MC/ARM/arm_fixups.s
+++ b/test/MC/ARM/arm_fixups.s
@@ -15,3 +15,8 @@
@ CHECK: @ fixup A - offset: 0, value: _foo, kind: fixup_arm_movw_lo16
@ CHECK: movt r9, :upper16:_foo @ encoding: [A,0x90'A',0b0100AAAA,0xe3]
@ CHECK: @ fixup A - offset: 0, value: _foo, kind: fixup_arm_movt_hi16
+
+ mov r2, fred
+
+@ CHECK: movw r2, fred @ encoding: [A,0x20'A',0b0000AAAA,0xe3]
+@ CHECK: @ fixup A - offset: 0, value: fred, kind: fixup_arm_movw_lo16
diff --git a/test/MC/ARM/arm_instructions.s b/test/MC/ARM/arm_instructions.s
index 186954c..ce7e036 100644
--- a/test/MC/ARM/arm_instructions.s
+++ b/test/MC/ARM/arm_instructions.s
@@ -74,3 +74,6 @@
@ CHECK: cpsie none, #0 @ encoding: [0x00,0x00,0x0a,0xf1]
cpsie none, #0
+@ CHECK: strh r3, [r2, #-0] @ encoding: [0xb0,0x30,0x42,0xe1]
+ strh r3, [r2, #-0]
+
diff --git a/test/MC/ARM/basic-arm-instructions.s b/test/MC/ARM/basic-arm-instructions.s
index 4788ac7..5c2a214 100644
--- a/test/MC/ARM/basic-arm-instructions.s
+++ b/test/MC/ARM/basic-arm-instructions.s
@@ -1,4 +1,4 @@
-@ RUN: llvm-mc -triple=armv7-apple-darwin -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=armv7-apple-darwin -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
.syntax unified
.globl _func
@@ -141,6 +141,14 @@ Lforward:
@ CHECK: adr r2, #3 @ encoding: [0x03,0x20,0x8f,0xe2]
@ CHECK: adr r2, #-3 @ encoding: [0x03,0x20,0x4f,0xe2]
+ adr r1, #-0x0
+ adr r1, #-0x12000000
+ adr r1, #0x12000000
+
+@ CHECK: adr r1, #-0 @ encoding: [0x00,0x10,0x4f,0xe2]
+@ CHECK: adr r1, #-301989888 @ encoding: [0x12,0x14,0x4f,0xe2]
+@ CHECK: adr r1, #301989888 @ encoding: [0x12,0x14,0x8f,0xe2]
+
@------------------------------------------------------------------------------
@ ADD
@@ -206,6 +214,11 @@ Lforward:
@ CHECK: sub r0, r0, #4 @ encoding: [0x04,0x00,0x40,0xe2]
@ CHECK: sub r4, r5, #21 @ encoding: [0x15,0x40,0x45,0xe2]
+ @ Test right shift by 32, which is encoded as 0
+ add r3, r1, r2, lsr #32
+ add r3, r1, r2, asr #32
+@ CHECK: add r3, r1, r2, lsr #32 @ encoding: [0x22,0x30,0x81,0xe0]
+@ CHECK: add r3, r1, r2, asr #32 @ encoding: [0x42,0x30,0x81,0xe0]
@------------------------------------------------------------------------------
@ AND
@@ -265,6 +278,12 @@ Lforward:
@ CHECK: and r6, r6, r7, ror r2 @ encoding: [0x77,0x62,0x06,0xe0]
@ CHECK: and r10, r10, r1, rrx @ encoding: [0x61,0xa0,0x0a,0xe0]
+ @ Test right shift by 32, which is encoded as 0
+ and r3, r1, r2, lsr #32
+ and r3, r1, r2, asr #32
+@ CHECK: and r3, r1, r2, lsr #32 @ encoding: [0x22,0x30,0x01,0xe0]
+@ CHECK: and r3, r1, r2, asr #32 @ encoding: [0x42,0x30,0x01,0xe0]
+
@------------------------------------------------------------------------------
@ ASR
@------------------------------------------------------------------------------
@@ -368,6 +387,12 @@ Lforward:
@ CHECK: bic r6, r6, r7, ror r2 @ encoding: [0x77,0x62,0xc6,0xe1]
@ CHECK: bic r10, r10, r1, rrx @ encoding: [0x61,0xa0,0xca,0xe1]
+ @ Test right shift by 32, which is encoded as 0
+ bic r3, r1, r2, lsr #32
+ bic r3, r1, r2, asr #32
+@ CHECK: bic r3, r1, r2, lsr #32 @ encoding: [0x22,0x30,0xc1,0xe1]
+@ CHECK: bic r3, r1, r2, asr #32 @ encoding: [0x42,0x30,0xc1,0xe1]
+
@------------------------------------------------------------------------------
@ BKPT
@------------------------------------------------------------------------------
@@ -542,6 +567,23 @@ Lforward:
@------------------------------------------------------------------------------
@ DMB
@------------------------------------------------------------------------------
+ dmb #0xf
+ dmb #0xe
+ dmb #0xd
+ dmb #0xc
+ dmb #0xb
+ dmb #0xa
+ dmb #0x9
+ dmb #0x8
+ dmb #0x7
+ dmb #0x6
+ dmb #0x5
+ dmb #0x4
+ dmb #0x3
+ dmb #0x2
+ dmb #0x1
+ dmb #0x0
+
dmb sy
dmb st
dmb sh
@@ -558,6 +600,23 @@ Lforward:
@ CHECK: dmb sy @ encoding: [0x5f,0xf0,0x7f,0xf5]
@ CHECK: dmb st @ encoding: [0x5e,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0xd @ encoding: [0x5d,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0xc @ encoding: [0x5c,0xf0,0x7f,0xf5]
+@ CHECK: dmb ish @ encoding: [0x5b,0xf0,0x7f,0xf5]
+@ CHECK: dmb ishst @ encoding: [0x5a,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0x9 @ encoding: [0x59,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0x8 @ encoding: [0x58,0xf0,0x7f,0xf5]
+@ CHECK: dmb nsh @ encoding: [0x57,0xf0,0x7f,0xf5]
+@ CHECK: dmb nshst @ encoding: [0x56,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0x5 @ encoding: [0x55,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0x4 @ encoding: [0x54,0xf0,0x7f,0xf5]
+@ CHECK: dmb osh @ encoding: [0x53,0xf0,0x7f,0xf5]
+@ CHECK: dmb oshst @ encoding: [0x52,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0x1 @ encoding: [0x51,0xf0,0x7f,0xf5]
+@ CHECK: dmb #0x0 @ encoding: [0x50,0xf0,0x7f,0xf5]
+
+@ CHECK: dmb sy @ encoding: [0x5f,0xf0,0x7f,0xf5]
+@ CHECK: dmb st @ encoding: [0x5e,0xf0,0x7f,0xf5]
@ CHECK: dmb ish @ encoding: [0x5b,0xf0,0x7f,0xf5]
@ CHECK: dmb ish @ encoding: [0x5b,0xf0,0x7f,0xf5]
@ CHECK: dmb ishst @ encoding: [0x5a,0xf0,0x7f,0xf5]
@@ -573,6 +632,26 @@ Lforward:
@------------------------------------------------------------------------------
@ DSB
@------------------------------------------------------------------------------
+ dsb #0xf
+ dsb #0xe
+ dsb #0xd
+ dsb #0xc
+ dsb #0xb
+ dsb #0xa
+ dsb #0x9
+ dsb #0x8
+ dsb #0x7
+ dsb #0x6
+ dsb #0x5
+ dsb #0x4
+ dsb #0x3
+ dsb #0x2
+ dsb #0x1
+ dsb #0x0
+
+ dsb 8
+ dsb 7
+
dsb sy
dsb st
dsb sh
@@ -589,6 +668,26 @@ Lforward:
@ CHECK: dsb sy @ encoding: [0x4f,0xf0,0x7f,0xf5]
@ CHECK: dsb st @ encoding: [0x4e,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0xd @ encoding: [0x4d,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0xc @ encoding: [0x4c,0xf0,0x7f,0xf5]
+@ CHECK: dsb ish @ encoding: [0x4b,0xf0,0x7f,0xf5]
+@ CHECK: dsb ishst @ encoding: [0x4a,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0x9 @ encoding: [0x49,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0x8 @ encoding: [0x48,0xf0,0x7f,0xf5]
+@ CHECK: dsb nsh @ encoding: [0x47,0xf0,0x7f,0xf5]
+@ CHECK: dsb nshst @ encoding: [0x46,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0x5 @ encoding: [0x45,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0x4 @ encoding: [0x44,0xf0,0x7f,0xf5]
+@ CHECK: dsb osh @ encoding: [0x43,0xf0,0x7f,0xf5]
+@ CHECK: dsb oshst @ encoding: [0x42,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0x1 @ encoding: [0x41,0xf0,0x7f,0xf5]
+@ CHECK: dsb #0x0 @ encoding: [0x40,0xf0,0x7f,0xf5]
+
+@ CHECK: dsb #0x8 @ encoding: [0x48,0xf0,0x7f,0xf5]
+@ CHECK: dsb nsh @ encoding: [0x47,0xf0,0x7f,0xf5]
+
+@ CHECK: dsb sy @ encoding: [0x4f,0xf0,0x7f,0xf5]
+@ CHECK: dsb st @ encoding: [0x4e,0xf0,0x7f,0xf5]
@ CHECK: dsb ish @ encoding: [0x4b,0xf0,0x7f,0xf5]
@ CHECK: dsb ish @ encoding: [0x4b,0xf0,0x7f,0xf5]
@ CHECK: dsb ishst @ encoding: [0x4a,0xf0,0x7f,0xf5]
@@ -601,6 +700,12 @@ Lforward:
@ CHECK: dsb oshst @ encoding: [0x42,0xf0,0x7f,0xf5]
@ CHECK: dsb sy @ encoding: [0x4f,0xf0,0x7f,0xf5]
+@ With capitals
+ dsb SY
+ dsb OSHST
+
+@ CHECK: dsb sy @ encoding: [0x4f,0xf0,0x7f,0xf5]
+@ CHECK: dsb oshst @ encoding: [0x42,0xf0,0x7f,0xf5]
@------------------------------------------------------------------------------
@ EOR
@------------------------------------------------------------------------------
@@ -658,6 +763,11 @@ Lforward:
@ CHECK: eor r6, r6, r7, ror r9 @ encoding: [0x77,0x69,0x26,0xe0]
@ CHECK: eor r4, r4, r5, rrx @ encoding: [0x65,0x40,0x24,0xe0]
+ @ Test right shift by 32, which is encoded as 0
+ eor r3, r1, r2, lsr #32
+ eor r3, r1, r2, asr #32
+@ CHECK: eor r3, r1, r2, lsr #32 @ encoding: [0x22,0x30,0x21,0xe0]
+@ CHECK: eor r3, r1, r2, asr #32 @ encoding: [0x42,0x30,0x21,0xe0]
@------------------------------------------------------------------------------
@ ISB
@@ -1205,6 +1315,12 @@ Lforward:
@ CHECK: orrslt r6, r6, r7, ror r9 @ encoding: [0x77,0x69,0x96,0xb1]
@ CHECK: orrsgt r4, r4, r5, rrx @ encoding: [0x65,0x40,0x94,0xc1]
+ @ Test right shift by 32, which is encoded as 0
+ orr r3, r1, r2, lsr #32
+ orr r3, r1, r2, asr #32
+@ CHECK: orr r3, r1, r2, lsr #32 @ encoding: [0x22,0x30,0x81,0xe1]
+@ CHECK: orr r3, r1, r2, asr #32 @ encoding: [0x42,0x30,0x81,0xe1]
+
@------------------------------------------------------------------------------
@ PKH
@------------------------------------------------------------------------------
@@ -2210,6 +2326,11 @@ Lforward:
@ CHECK: sub r6, r6, r7, asr r9 @ encoding: [0x57,0x69,0x46,0xe0]
@ CHECK: sub r6, r6, r7, ror r9 @ encoding: [0x77,0x69,0x46,0xe0]
+ @ Test right shift by 32, which is encoded as 0
+ sub r3, r1, r2, lsr #32
+ sub r3, r1, r2, asr #32
+@ CHECK: sub r3, r1, r2, lsr #32 @ encoding: [0x22,0x30,0x41,0xe0]
+@ CHECK: sub r3, r1, r2, asr #32 @ encoding: [0x42,0x30,0x41,0xe0]
@------------------------------------------------------------------------------
@ SVC
@@ -2711,10 +2832,22 @@ Lforward:
wfilt
yield
yieldne
-
-@ CHECK: wfe @ encoding: [0x02,0xf0,0x20,0xe3]
-@ CHECK: wfehi @ encoding: [0x02,0xf0,0x20,0x83]
-@ CHECK: wfi @ encoding: [0x03,0xf0,0x20,0xe3]
-@ CHECK: wfilt @ encoding: [0x03,0xf0,0x20,0xb3]
-@ CHECK: yield @ encoding: [0x01,0xf0,0x20,0xe3]
-@ CHECK: yieldne @ encoding: [0x01,0xf0,0x20,0x13]
+ hint #5
+ hint #4
+ hint #3
+ hint #2
+ hint #1
+ hint #0
+
+@ CHECK: wfe @ encoding: [0x02,0xf0,0x20,0xe3]
+@ CHECK: wfehi @ encoding: [0x02,0xf0,0x20,0x83]
+@ CHECK: wfi @ encoding: [0x03,0xf0,0x20,0xe3]
+@ CHECK: wfilt @ encoding: [0x03,0xf0,0x20,0xb3]
+@ CHECK: yield @ encoding: [0x01,0xf0,0x20,0xe3]
+@ CHECK: yieldne @ encoding: [0x01,0xf0,0x20,0x13]
+@ CHECK: hint #5 @ encoding: [0x05,0xf0,0x20,0xe3]
+@ CHECK: sev @ encoding: [0x04,0xf0,0x20,0xe3]
+@ CHECK: wfi @ encoding: [0x03,0xf0,0x20,0xe3]
+@ CHECK: wfe @ encoding: [0x02,0xf0,0x20,0xe3]
+@ CHECK: yield @ encoding: [0x01,0xf0,0x20,0xe3]
+@ CHECK: nop @ encoding: [0x00,0xf0,0x20,0xe3]
diff --git a/test/MC/ARM/basic-thumb-instructions.s b/test/MC/ARM/basic-thumb-instructions.s
index bc2605c..4ee34ce 100644
--- a/test/MC/ARM/basic-thumb-instructions.s
+++ b/test/MC/ARM/basic-thumb-instructions.s
@@ -169,9 +169,9 @@ _func:
bl _bar
blx _baz
-@ CHECK: bl _bar @ encoding: [A,0xf0'A',A,0xf8'A']
+@ CHECK: bl _bar @ encoding: [A,0xf0'A',A,0xd0'A']
@ fixup A - offset: 0, value: _bar, kind: fixup_arm_thumb_bl
-@ CHECK: blx _baz @ encoding: [A,0xf0'A',A,0xe8'A']
+@ CHECK: blx _baz @ encoding: [A,0xf0'A',A,0xc0'A']
@ fixup A - offset: 0, value: _baz, kind: fixup_arm_thumb_blx
@@ -635,13 +635,3 @@ _func:
@ CHECK: uxth r1, r4 @ encoding: [0xa1,0xb2]
-@------------------------------------------------------------------------------
-@ WFE/WFI/YIELD
-@------------------------------------------------------------------------------
- wfe
- wfi
- yield
-
-@ CHECK: wfe @ encoding: [0x20,0xbf]
-@ CHECK: wfi @ encoding: [0x30,0xbf]
-@ CHECK: yield @ encoding: [0x10,0xbf]
diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s
index d2e208b..23d9f59 100644
--- a/test/MC/ARM/basic-thumb2-instructions.s
+++ b/test/MC/ARM/basic-thumb2-instructions.s
@@ -1,4 +1,4 @@
-@ RUN: llvm-mc -triple=thumbv7-apple-darwin -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
.syntax unified
.globl _func
@@ -48,6 +48,7 @@ _func:
adcs r0, r1, r3, lsl #7
adc.w r0, r1, r3, lsr #31
adcs.w r0, r1, r3, asr #32
+ add r2, sp, ip
@ CHECK: adc.w r4, r5, r6 @ encoding: [0x45,0xeb,0x06,0x04]
@ CHECK: adcs.w r4, r5, r6 @ encoding: [0x55,0xeb,0x06,0x04]
@@ -57,6 +58,7 @@ _func:
@ CHECK: adcs.w r0, r1, r3, lsl #7 @ encoding: [0x51,0xeb,0xc3,0x10]
@ CHECK: adc.w r0, r1, r3, lsr #31 @ encoding: [0x41,0xeb,0xd3,0x70]
@ CHECK: adcs.w r0, r1, r3, asr #32 @ encoding: [0x51,0xeb,0x23,0x00]
+@ CHECK: add.w r2, sp, r12 @ encoding: [0x0d,0xeb,0x0c,0x02]
@------------------------------------------------------------------------------
@@ -78,6 +80,12 @@ _func:
adds r2, r2, #56
adds r2, #56
+ adds.w r2, #-16
+ adds.w r2, r2, #-16
+ addw r2, #-16
+ addw r2, #-16
+ addw r2, r2, #-16
+
@ CHECK: itet eq @ encoding: [0x0a,0xbf]
@ CHECK: addeq r1, r2, #4 @ encoding: [0x11,0x1d]
@ CHECK: addwne r5, r3, #1023 @ encoding: [0x03,0xf2,0xff,0x35]
@@ -94,6 +102,12 @@ _func:
@ CHECK: adds r2, #56 @ encoding: [0x38,0x32]
@ CHECK: adds r2, #56 @ encoding: [0x38,0x32]
+@ CHECK: subs.w r2, r2, #16 @ encoding: [0xb2,0xf1,0x10,0x02]
+@ CHECK: subs.w r2, r2, #16 @ encoding: [0xb2,0xf1,0x10,0x02]
+@ CHECK: subw r2, r2, #16 @ encoding: [0xa2,0xf2,0x10,0x02]
+@ CHECK: subw r2, r2, #16 @ encoding: [0xa2,0xf2,0x10,0x02]
+@ CHECK: subw r2, r2, #16 @ encoding: [0xa2,0xf2,0x10,0x02]
+
@------------------------------------------------------------------------------
@ ADD (register)
@@ -121,9 +135,11 @@ _func:
subw r11, pc, #3270
adr.w r11, #-826
+ adr.w r1, #-0x0
@ CHECK: subw r11, pc, #3270 @ encoding: [0xaf,0xf6,0xc6,0x4b]
@ CHECK: adr.w r11, #-826 @ encoding: [0xaf,0xf2,0x3a,0x3b]
+@ CHECK: adr.w r1, #-0 @ encoding: [0xaf,0xf2,0x00,0x01]
@------------------------------------------------------------------------------
@ AND (immediate)
@@ -401,6 +417,23 @@ _func:
@------------------------------------------------------------------------------
@ DMB
@------------------------------------------------------------------------------
+ dmb #0xf
+ dmb #0xe
+ dmb #0xd
+ dmb #0xc
+ dmb #0xb
+ dmb #0xa
+ dmb #0x9
+ dmb #0x8
+ dmb #0x7
+ dmb #0x6
+ dmb #0x5
+ dmb #0x4
+ dmb #0x3
+ dmb #0x2
+ dmb #0x1
+ dmb #0x0
+
dmb sy
dmb st
dmb sh
@@ -417,6 +450,23 @@ _func:
@ CHECK: dmb sy @ encoding: [0xbf,0xf3,0x5f,0x8f]
@ CHECK: dmb st @ encoding: [0xbf,0xf3,0x5e,0x8f]
+@ CHECK: dmb #0xd @ encoding: [0xbf,0xf3,0x5d,0x8f]
+@ CHECK: dmb #0xc @ encoding: [0xbf,0xf3,0x5c,0x8f]
+@ CHECK: dmb ish @ encoding: [0xbf,0xf3,0x5b,0x8f]
+@ CHECK: dmb ishst @ encoding: [0xbf,0xf3,0x5a,0x8f]
+@ CHECK: dmb #0x9 @ encoding: [0xbf,0xf3,0x59,0x8f]
+@ CHECK: dmb #0x8 @ encoding: [0xbf,0xf3,0x58,0x8f]
+@ CHECK: dmb nsh @ encoding: [0xbf,0xf3,0x57,0x8f]
+@ CHECK: dmb nshst @ encoding: [0xbf,0xf3,0x56,0x8f]
+@ CHECK: dmb #0x5 @ encoding: [0xbf,0xf3,0x55,0x8f]
+@ CHECK: dmb #0x4 @ encoding: [0xbf,0xf3,0x54,0x8f]
+@ CHECK: dmb osh @ encoding: [0xbf,0xf3,0x53,0x8f]
+@ CHECK: dmb oshst @ encoding: [0xbf,0xf3,0x52,0x8f]
+@ CHECK: dmb #0x1 @ encoding: [0xbf,0xf3,0x51,0x8f]
+@ CHECK: dmb #0x0 @ encoding: [0xbf,0xf3,0x50,0x8f]
+
+@ CHECK: dmb sy @ encoding: [0xbf,0xf3,0x5f,0x8f]
+@ CHECK: dmb st @ encoding: [0xbf,0xf3,0x5e,0x8f]
@ CHECK: dmb ish @ encoding: [0xbf,0xf3,0x5b,0x8f]
@ CHECK: dmb ish @ encoding: [0xbf,0xf3,0x5b,0x8f]
@ CHECK: dmb ishst @ encoding: [0xbf,0xf3,0x5a,0x8f]
@@ -433,6 +483,23 @@ _func:
@------------------------------------------------------------------------------
@ DSB
@------------------------------------------------------------------------------
+ dsb #0xf
+ dsb #0xe
+ dsb #0xd
+ dsb #0xc
+ dsb #0xb
+ dsb #0xa
+ dsb #0x9
+ dsb #0x8
+ dsb #0x7
+ dsb #0x6
+ dsb #0x5
+ dsb #0x4
+ dsb #0x3
+ dsb #0x2
+ dsb #0x1
+ dsb #0x0
+
dsb sy
dsb st
dsb sh
@@ -449,6 +516,23 @@ _func:
@ CHECK: dsb sy @ encoding: [0xbf,0xf3,0x4f,0x8f]
@ CHECK: dsb st @ encoding: [0xbf,0xf3,0x4e,0x8f]
+@ CHECK: dsb #0xd @ encoding: [0xbf,0xf3,0x4d,0x8f]
+@ CHECK: dsb #0xc @ encoding: [0xbf,0xf3,0x4c,0x8f]
+@ CHECK: dsb ish @ encoding: [0xbf,0xf3,0x4b,0x8f]
+@ CHECK: dsb ishst @ encoding: [0xbf,0xf3,0x4a,0x8f]
+@ CHECK: dsb #0x9 @ encoding: [0xbf,0xf3,0x49,0x8f]
+@ CHECK: dsb #0x8 @ encoding: [0xbf,0xf3,0x48,0x8f]
+@ CHECK: dsb nsh @ encoding: [0xbf,0xf3,0x47,0x8f]
+@ CHECK: dsb nshst @ encoding: [0xbf,0xf3,0x46,0x8f]
+@ CHECK: dsb #0x5 @ encoding: [0xbf,0xf3,0x45,0x8f]
+@ CHECK: dsb #0x4 @ encoding: [0xbf,0xf3,0x44,0x8f]
+@ CHECK: dsb osh @ encoding: [0xbf,0xf3,0x43,0x8f]
+@ CHECK: dsb oshst @ encoding: [0xbf,0xf3,0x42,0x8f]
+@ CHECK: dsb #0x1 @ encoding: [0xbf,0xf3,0x41,0x8f]
+@ CHECK: dsb #0x0 @ encoding: [0xbf,0xf3,0x40,0x8f]
+
+@ CHECK: dsb sy @ encoding: [0xbf,0xf3,0x4f,0x8f]
+@ CHECK: dsb st @ encoding: [0xbf,0xf3,0x4e,0x8f]
@ CHECK: dsb ish @ encoding: [0xbf,0xf3,0x4b,0x8f]
@ CHECK: dsb ish @ encoding: [0xbf,0xf3,0x4b,0x8f]
@ CHECK: dsb ishst @ encoding: [0xbf,0xf3,0x4a,0x8f]
@@ -509,6 +593,19 @@ _func:
@ CHECK: subne r5, r6, r7 @ encoding: [0xf5,0x1b]
@ CHECK: addeq r1, r2, #4 @ encoding: [0x11,0x1d]
+@ Should also work for UPPER CASE condition codes.
+
+ ITEET EQ
+ ADDEQ R0, R1, R2
+ NOPNE
+ SUBNE R5, R6, R7
+ ADDEQ R1, R2, #4
+
+@ CHECK: iteet eq @ encoding: [0x0d,0xbf]
+@ CHECK: addeq r0, r1, r2 @ encoding: [0x88,0x18]
+@ CHECK: nopne @ encoding: [0x00,0xbf]
+@ CHECK: subne r5, r6, r7 @ encoding: [0xf5,0x1b]
+@ CHECK: addeq r1, r2, #4 @ encoding: [0x11,0x1d]
@------------------------------------------------------------------------------
@ LDC{L}/LDC2{L}
@@ -755,6 +852,9 @@ _func:
ldrd r3, r5, [r6], #-8
ldrd r3, r5, [r6]
ldrd r8, r1, [r3, #0]
+ ldrd r0, r1, [r2, #-0]
+ ldrd r0, r1, [r2, #-0]!
+ ldrd r0, r1, [r2], #-0
@ CHECK: ldrd r3, r5, [r6, #24] @ encoding: [0xd6,0xe9,0x06,0x35]
@ CHECK: ldrd r3, r5, [r6, #24]! @ encoding: [0xf6,0xe9,0x06,0x35]
@@ -762,6 +862,9 @@ _func:
@ CHECK: ldrd r3, r5, [r6], #-8 @ encoding: [0x76,0xe8,0x02,0x35]
@ CHECK: ldrd r3, r5, [r6] @ encoding: [0xd6,0xe9,0x00,0x35]
@ CHECK: ldrd r8, r1, [r3] @ encoding: [0xd3,0xe9,0x00,0x81]
+@ CHECK: ldrd r0, r1, [r2, #-0] @ encoding: [0x52,0xe9,0x00,0x01]
+@ CHECK: ldrd r0, r1, [r2, #-0]! @ encoding: [0x72,0xe9,0x00,0x01]
+@ CHECK: ldrd r0, r1, [r2], #-0 @ encoding: [0x72,0xe8,0x00,0x01]
@------------------------------------------------------------------------------
@@ -2539,6 +2642,9 @@ _func:
strd r3, r5, [r6], #-8
strd r3, r5, [r6]
strd r8, r1, [r3, #0]
+ strd r0, r1, [r2, #-0]
+ strd r0, r1, [r2, #-0]!
+ strd r0, r1, [r2], #-0
@ CHECK: strd r3, r5, [r6, #24] @ encoding: [0xc6,0xe9,0x06,0x35]
@ CHECK: strd r3, r5, [r6, #24]! @ encoding: [0xe6,0xe9,0x06,0x35]
@@ -2546,6 +2652,9 @@ _func:
@ CHECK: strd r3, r5, [r6], #-8 @ encoding: [0x66,0xe8,0x02,0x35]
@ CHECK: strd r3, r5, [r6] @ encoding: [0xc6,0xe9,0x00,0x35]
@ CHECK: strd r8, r1, [r3] @ encoding: [0xc3,0xe9,0x00,0x81]
+@ CHECK: strd r0, r1, [r2, #-0] @ encoding: [0x42,0xe9,0x00,0x01]
+@ CHECK: strd r0, r1, [r2, #-0]! @ encoding: [0x62,0xe9,0x00,0x01]
+@ CHECK: strd r0, r1, [r2], #-0 @ encoding: [0x62,0xe8,0x00,0x01]
@------------------------------------------------------------------------------
@@ -3342,7 +3451,7 @@ _func:
@ CHECK: uxth.w r7, r8 @ encoding: [0x1f,0xfa,0x88,0xf7]
@------------------------------------------------------------------------------
-@ WFE/WFI/YIELD
+@ WFE/WFI/YIELD/HINT
@------------------------------------------------------------------------------
wfe
wfi
@@ -3351,6 +3460,13 @@ _func:
wfelt
wfige
yieldlt
+ hint #5
+ hint.w #5
+ hint.w #4
+ hint #3
+ hint #2
+ hint #1
+ hint #0
@ CHECK: wfe @ encoding: [0x20,0xbf]
@ CHECK: wfi @ encoding: [0x30,0xbf]
@@ -3359,6 +3475,13 @@ _func:
@ CHECK: wfelt @ encoding: [0x20,0xbf]
@ CHECK: wfige @ encoding: [0x30,0xbf]
@ CHECK: yieldlt @ encoding: [0x10,0xbf]
+@ CHECK: hint #5 @ encoding: [0xaf,0xf3,0x05,0x80]
+@ CHECK: hint #5 @ encoding: [0xaf,0xf3,0x05,0x80]
+@ CHECK: sev.w @ encoding: [0xaf,0xf3,0x04,0x80]
+@ CHECK: wfi.w @ encoding: [0xaf,0xf3,0x03,0x80]
+@ CHECK: wfe.w @ encoding: [0xaf,0xf3,0x02,0x80]
+@ CHECK: yield.w @ encoding: [0xaf,0xf3,0x01,0x80]
+@ CHECK: nop.w @ encoding: [0xaf,0xf3,0x00,0x80]
@------------------------------------------------------------------------------
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 7da79c3..499e05501 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -70,8 +70,8 @@
dbg #-1
dbg #16
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
@ Double-check that we're synced up with the right diagnostics.
@ CHECK-ERRORS: dbg #16
@@ -86,8 +86,8 @@
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
@ Out of range immediate for MOV
@@ -115,8 +115,8 @@
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
@ Shifter operand validation for PKH instructions.
pkhbt r2, r2, r3, lsl #-1
@@ -315,3 +315,9 @@
@ CHECK-ERRORS: error: coprocessor option must be an immediate in range [0, 255]
@ CHECK-ERRORS: ldc2 p2, c8, [r1], { -1 }
@ CHECK-ERRORS: ^
+
+ @ Bad CPS instruction format.
+ cps f,#1
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: cps f,#1
+@ CHECK-ERRORS: ^
diff --git a/test/MC/ARM/elf-reloc-01.ll b/test/MC/ARM/elf-reloc-01.ll
index 6899d92..c98026b 100644
--- a/test/MC/ARM/elf-reloc-01.ll
+++ b/test/MC/ARM/elf-reloc-01.ll
@@ -61,7 +61,7 @@ bb3: ; preds = %bb, %entry
declare void @exit(i32) noreturn nounwind
;; OBJ: Relocation 1
-;; OBJ-NEXT: 'r_offset',
+;; OBJ-NEXT: 'r_offset',
;; OBJ-NEXT: 'r_sym', 0x000002
;; OBJ-NEXT: 'r_type', 0x2b
diff --git a/test/MC/ARM/elf-reloc-condcall.s b/test/MC/ARM/elf-reloc-condcall.s
index dcc62d3..08b4ecc 100644
--- a/test/MC/ARM/elf-reloc-condcall.s
+++ b/test/MC/ARM/elf-reloc-condcall.s
@@ -4,6 +4,8 @@
bleq some_label
bl some_label
blx some_label
+ beq some_label
+ b some_label
// OBJ: .rel.text
// OBJ: 'r_offset', 0x00000000
@@ -18,6 +20,14 @@
// OBJ-NEXT: 'r_sym', 0x000004
// OBJ-NEXT: 'r_type', 0x1c
+// OBJ: 'r_offset', 0x0000000c
+// OBJ-NEXT: 'r_sym', 0x000004
+// OBJ-NEXT: 'r_type', 0x1d
+
+// OBJ: 'r_offset', 0x00000010
+// OBJ-NEXT: 'r_sym', 0x000004
+// OBJ-NEXT: 'r_type', 0x1d
+
// OBJ: .symtab
// OBJ: Symbol 4
-// OBJ-NEXT: some_label
\ No newline at end of file
+// OBJ-NEXT: some_label
diff --git a/test/MC/ARM/neon-bitwise-encoding.s b/test/MC/ARM/neon-bitwise-encoding.s
index 2ce9bcc..e8c1dd6 100644
--- a/test/MC/ARM/neon-bitwise-encoding.s
+++ b/test/MC/ARM/neon-bitwise-encoding.s
@@ -30,11 +30,16 @@
vbic q8, q8, q9
vbic.i32 d16, #0xFF000000
vbic.i32 q8, #0xFF000000
+ vbic q10, q11
+ vbic d9, d1
@ CHECK: vbic d16, d17, d16 @ encoding: [0xb0,0x01,0x51,0xf2]
@ CHECK: vbic q8, q8, q9 @ encoding: [0xf2,0x01,0x50,0xf2]
@ CHECK: vbic.i32 d16, #0xff000000 @ encoding: [0x3f,0x07,0xc7,0xf3]
@ CHECK: vbic.i32 q8, #0xff000000 @ encoding: [0x7f,0x07,0xc7,0xf3]
+@ CHECK: vbic q10, q10, q11 @ encoding: [0xf6,0x41,0x54,0xf2]
+@ CHECK: vbic d9, d9, d1 @ encoding: [0x11,0x91,0x19,0xf2]
+
vorn d16, d17, d16
vorn q8, q8, q9
@@ -232,32 +237,38 @@
@ CHECK: vorr q4, q7, q3 @ encoding: [0x56,0x81,0x2e,0xf2]
@ Two-operand aliases
+ vand q6, q5
vand.s8 q6, q5
vand.s16 q7, q1
vand.s32 q8, q2
vand.f64 q8, q2
+ veor q6, q5
veor.8 q6, q5
veor.p16 q7, q1
veor.u32 q8, q2
veor.d q8, q2
+ veor q6, q5
veor.i8 q6, q5
veor.16 q7, q1
veor.f q8, q2
veor.i64 q8, q2
@ CHECK: vand q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf2]
+@ CHECK: vand q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf2]
@ CHECK: vand q7, q7, q1 @ encoding: [0x52,0xe1,0x0e,0xf2]
@ CHECK: vand q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf2]
@ CHECK: vand q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf2]
@ CHECK: veor q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf3]
+@ CHECK: veor q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf3]
@ CHECK: veor q7, q7, q1 @ encoding: [0x52,0xe1,0x0e,0xf3]
@ CHECK: veor q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf3]
@ CHECK: veor q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf3]
@ CHECK: veor q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf3]
+@ CHECK: veor q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf3]
@ CHECK: veor q7, q7, q1 @ encoding: [0x52,0xe1,0x0e,0xf3]
@ CHECK: veor q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf3]
@ CHECK: veor q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf3]
diff --git a/test/MC/ARM/neon-shiftaccum-encoding.s b/test/MC/ARM/neon-shiftaccum-encoding.s
new file mode 100644
index 0000000..92ca7a3
--- /dev/null
+++ b/test/MC/ARM/neon-shiftaccum-encoding.s
@@ -0,0 +1,209 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple arm-unknown-unknown -show-encoding < %s | FileCheck %s
+
+ vsra.s8 d17, d16, #8
+ vsra.s16 d15, d14, #16
+ vsra.s32 d13, d12, #32
+ vsra.s64 d11, d10, #64
+ vsra.s8 q7, q2, #8
+ vsra.s16 q3, q6, #16
+ vsra.s32 q9, q5, #32
+ vsra.s64 q8, q4, #64
+ vsra.u8 d17, d16, #8
+ vsra.u16 d11, d14, #11
+ vsra.u32 d12, d15, #22
+ vsra.u64 d13, d16, #54
+ vsra.u8 q1, q7, #8
+ vsra.u16 q2, q7, #6
+ vsra.u32 q3, q6, #21
+ vsra.u64 q4, q5, #25
+
+ @ Two-operand syntax variant.
+ vsra.s8 d16, #8
+ vsra.s16 d14, #16
+ vsra.s32 d12, #32
+ vsra.s64 d10, #64
+ vsra.s8 q2, #8
+ vsra.s16 q6, #16
+ vsra.s32 q5, #32
+ vsra.s64 q4, #64
+ vsra.u8 d16, #8
+ vsra.u16 d14, #11
+ vsra.u32 d15, #22
+ vsra.u64 d16, #54
+ vsra.u8 q7, #8
+ vsra.u16 q7, #6
+ vsra.u32 q6, #21
+ vsra.u64 q5, #25
+
+@ CHECK: vsra.s8 d17, d16, #8 @ encoding: [0x30,0x11,0xc8,0xf2]
+@ CHECK: vsra.s16 d15, d14, #16 @ encoding: [0x1e,0xf1,0x90,0xf2]
+@ CHECK: vsra.s32 d13, d12, #32 @ encoding: [0x1c,0xd1,0xa0,0xf2]
+@ CHECK: vsra.s64 d11, d10, #64 @ encoding: [0x9a,0xb1,0x80,0xf2]
+@ CHECK: vsra.s8 q7, q2, #8 @ encoding: [0x54,0xe1,0x88,0xf2]
+@ CHECK: vsra.s16 q3, q6, #16 @ encoding: [0x5c,0x61,0x90,0xf2]
+@ CHECK: vsra.s32 q9, q5, #32 @ encoding: [0x5a,0x21,0xe0,0xf2]
+@ CHECK: vsra.s64 q8, q4, #64 @ encoding: [0xd8,0x01,0xc0,0xf2]
+@ CHECK: vsra.u8 d17, d16, #8 @ encoding: [0x30,0x11,0xc8,0xf3]
+@ CHECK: vsra.u16 d11, d14, #11 @ encoding: [0x1e,0xb1,0x95,0xf3]
+@ CHECK: vsra.u32 d12, d15, #22 @ encoding: [0x1f,0xc1,0xaa,0xf3]
+@ CHECK: vsra.u64 d13, d16, #54 @ encoding: [0xb0,0xd1,0x8a,0xf3]
+@ CHECK: vsra.u8 q1, q7, #8 @ encoding: [0x5e,0x21,0x88,0xf3]
+@ CHECK: vsra.u16 q2, q7, #6 @ encoding: [0x5e,0x41,0x9a,0xf3]
+@ CHECK: vsra.u32 q3, q6, #21 @ encoding: [0x5c,0x61,0xab,0xf3]
+@ CHECK: vsra.u64 q4, q5, #25 @ encoding: [0xda,0x81,0xa7,0xf3]
+
+@ CHECK: vsra.s8 d16, d16, #8 @ encoding: [0x30,0x01,0xc8,0xf2]
+@ CHECK: vsra.s16 d14, d14, #16 @ encoding: [0x1e,0xe1,0x90,0xf2]
+@ CHECK: vsra.s32 d12, d12, #32 @ encoding: [0x1c,0xc1,0xa0,0xf2]
+@ CHECK: vsra.s64 d10, d10, #64 @ encoding: [0x9a,0xa1,0x80,0xf2]
+@ CHECK: vsra.s8 q2, q2, #8 @ encoding: [0x54,0x41,0x88,0xf2]
+@ CHECK: vsra.s16 q6, q6, #16 @ encoding: [0x5c,0xc1,0x90,0xf2]
+@ CHECK: vsra.s32 q5, q5, #32 @ encoding: [0x5a,0xa1,0xa0,0xf2]
+@ CHECK: vsra.s64 q4, q4, #64 @ encoding: [0xd8,0x81,0x80,0xf2]
+@ CHECK: vsra.u8 d16, d16, #8 @ encoding: [0x30,0x01,0xc8,0xf3]
+@ CHECK: vsra.u16 d14, d14, #11 @ encoding: [0x1e,0xe1,0x95,0xf3]
+@ CHECK: vsra.u32 d15, d15, #22 @ encoding: [0x1f,0xf1,0xaa,0xf3]
+@ CHECK: vsra.u64 d16, d16, #54 @ encoding: [0xb0,0x01,0xca,0xf3]
+@ CHECK: vsra.u8 q7, q7, #8 @ encoding: [0x5e,0xe1,0x88,0xf3]
+@ CHECK: vsra.u16 q7, q7, #6 @ encoding: [0x5e,0xe1,0x9a,0xf3]
+@ CHECK: vsra.u32 q6, q6, #21 @ encoding: [0x5c,0xc1,0xab,0xf3]
+@ CHECK: vsra.u64 q5, q5, #25 @ encoding: [0xda,0xa1,0xa7,0xf3]
+
+ vrsra.s8 d5, d26, #8
+ vrsra.s16 d6, d25, #16
+ vrsra.s32 d7, d24, #32
+ vrsra.s64 d14, d23, #64
+ vrsra.u8 d15, d22, #8
+ vrsra.u16 d16, d21, #16
+ vrsra.u32 d17, d20, #32
+ vrsra.u64 d18, d19, #64
+ vrsra.s8 q1, q2, #8
+ vrsra.s16 q2, q3, #16
+ vrsra.s32 q3, q4, #32
+ vrsra.s64 q4, q5, #64
+ vrsra.u8 q5, q6, #8
+ vrsra.u16 q6, q7, #16
+ vrsra.u32 q7, q8, #32
+ vrsra.u64 q8, q9, #64
+
+ @ Two-operand syntax variant.
+ vrsra.s8 d26, #8
+ vrsra.s16 d25, #16
+ vrsra.s32 d24, #32
+ vrsra.s64 d23, #64
+ vrsra.u8 d22, #8
+ vrsra.u16 d21, #16
+ vrsra.u32 d20, #32
+ vrsra.u64 d19, #64
+ vrsra.s8 q2, #8
+ vrsra.s16 q3, #16
+ vrsra.s32 q4, #32
+ vrsra.s64 q5, #64
+ vrsra.u8 q6, #8
+ vrsra.u16 q7, #16
+ vrsra.u32 q8, #32
+ vrsra.u64 q9, #64
+
+@ CHECK: vrsra.s8 d5, d26, #8 @ encoding: [0x3a,0x53,0x88,0xf2]
+@ CHECK: vrsra.s16 d6, d25, #16 @ encoding: [0x39,0x63,0x90,0xf2]
+@ CHECK: vrsra.s32 d7, d24, #32 @ encoding: [0x38,0x73,0xa0,0xf2]
+@ CHECK: vrsra.s64 d14, d23, #64 @ encoding: [0xb7,0xe3,0x80,0xf2]
+@ CHECK: vrsra.u8 d15, d22, #8 @ encoding: [0x36,0xf3,0x88,0xf3]
+@ CHECK: vrsra.u16 d16, d21, #16 @ encoding: [0x35,0x03,0xd0,0xf3]
+@ CHECK: vrsra.u32 d17, d20, #32 @ encoding: [0x34,0x13,0xe0,0xf3]
+@ CHECK: vrsra.u64 d18, d19, #64 @ encoding: [0xb3,0x23,0xc0,0xf3]
+@ CHECK: vrsra.s8 q1, q2, #8 @ encoding: [0x54,0x23,0x88,0xf2]
+@ CHECK: vrsra.s16 q2, q3, #16 @ encoding: [0x56,0x43,0x90,0xf2]
+@ CHECK: vrsra.s32 q3, q4, #32 @ encoding: [0x58,0x63,0xa0,0xf2]
+@ CHECK: vrsra.s64 q4, q5, #64 @ encoding: [0xda,0x83,0x80,0xf2]
+@ CHECK: vrsra.u8 q5, q6, #8 @ encoding: [0x5c,0xa3,0x88,0xf3]
+@ CHECK: vrsra.u16 q6, q7, #16 @ encoding: [0x5e,0xc3,0x90,0xf3]
+@ CHECK: vrsra.u32 q7, q8, #32 @ encoding: [0x70,0xe3,0xa0,0xf3]
+@ CHECK: vrsra.u64 q8, q9, #64 @ encoding: [0xf2,0x03,0xc0,0xf3]
+
+@ CHECK: vrsra.s8 d26, d26, #8 @ encoding: [0x3a,0xa3,0xc8,0xf2]
+@ CHECK: vrsra.s16 d25, d25, #16 @ encoding: [0x39,0x93,0xd0,0xf2]
+@ CHECK: vrsra.s32 d24, d24, #32 @ encoding: [0x38,0x83,0xe0,0xf2]
+@ CHECK: vrsra.s64 d23, d23, #64 @ encoding: [0xb7,0x73,0xc0,0xf2]
+@ CHECK: vrsra.u8 d22, d22, #8 @ encoding: [0x36,0x63,0xc8,0xf3]
+@ CHECK: vrsra.u16 d21, d21, #16 @ encoding: [0x35,0x53,0xd0,0xf3]
+@ CHECK: vrsra.u32 d20, d20, #32 @ encoding: [0x34,0x43,0xe0,0xf3]
+@ CHECK: vrsra.u64 d19, d19, #64 @ encoding: [0xb3,0x33,0xc0,0xf3]
+@ CHECK: vrsra.s8 q2, q2, #8 @ encoding: [0x54,0x43,0x88,0xf2]
+@ CHECK: vrsra.s16 q3, q3, #16 @ encoding: [0x56,0x63,0x90,0xf2]
+@ CHECK: vrsra.s32 q4, q4, #32 @ encoding: [0x58,0x83,0xa0,0xf2]
+@ CHECK: vrsra.s64 q5, q5, #64 @ encoding: [0xda,0xa3,0x80,0xf2]
+@ CHECK: vrsra.u8 q6, q6, #8 @ encoding: [0x5c,0xc3,0x88,0xf3]
+@ CHECK: vrsra.u16 q7, q7, #16 @ encoding: [0x5e,0xe3,0x90,0xf3]
+@ CHECK: vrsra.u32 q8, q8, #32 @ encoding: [0x70,0x03,0xe0,0xf3]
+@ CHECK: vrsra.u64 q9, q9, #64 @ encoding: [0xf2,0x23,0xc0,0xf3]
+
+
+ vsli.8 d11, d12, #7
+ vsli.16 d12, d13, #15
+ vsli.32 d13, d14, #31
+ vsli.64 d14, d15, #63
+ vsli.8 q1, q8, #7
+ vsli.16 q2, q7, #15
+ vsli.32 q3, q4, #31
+ vsli.64 q4, q5, #63
+ vsri.8 d28, d11, #8
+ vsri.16 d26, d12, #16
+ vsri.32 d24, d13, #32
+ vsri.64 d21, d14, #64
+ vsri.8 q1, q8, #8
+ vsri.16 q5, q2, #16
+ vsri.32 q7, q4, #32
+ vsri.64 q9, q6, #64
+
+ @ Two-operand syntax variant.
+ vsli.8 d12, #7
+ vsli.16 d13, #15
+ vsli.32 d14, #31
+ vsli.64 d15, #63
+ vsli.8 q8, #7
+ vsli.16 q7, #15
+ vsli.32 q4, #31
+ vsli.64 q5, #63
+ vsri.8 d11, #8
+ vsri.16 d12, #16
+ vsri.32 d13, #32
+ vsri.64 d14, #64
+ vsri.8 q8, #8
+ vsri.16 q2, #16
+ vsri.32 q4, #32
+ vsri.64 q6, #64
+
+@ CHECK: vsli.8 d11, d12, #7 @ encoding: [0x1c,0xb5,0x8f,0xf3]
+@ CHECK: vsli.16 d12, d13, #15 @ encoding: [0x1d,0xc5,0x9f,0xf3]
+@ CHECK: vsli.32 d13, d14, #31 @ encoding: [0x1e,0xd5,0xbf,0xf3]
+@ CHECK: vsli.64 d14, d15, #63 @ encoding: [0x9f,0xe5,0xbf,0xf3]
+@ CHECK: vsli.8 q1, q8, #7 @ encoding: [0x70,0x25,0x8f,0xf3]
+@ CHECK: vsli.16 q2, q7, #15 @ encoding: [0x5e,0x45,0x9f,0xf3]
+@ CHECK: vsli.32 q3, q4, #31 @ encoding: [0x58,0x65,0xbf,0xf3]
+@ CHECK: vsli.64 q4, q5, #63 @ encoding: [0xda,0x85,0xbf,0xf3]
+@ CHECK: vsri.8 d28, d11, #8 @ encoding: [0x1b,0xc4,0xc8,0xf3]
+@ CHECK: vsri.16 d26, d12, #16 @ encoding: [0x1c,0xa4,0xd0,0xf3]
+@ CHECK: vsri.32 d24, d13, #32 @ encoding: [0x1d,0x84,0xe0,0xf3]
+@ CHECK: vsri.64 d21, d14, #64 @ encoding: [0x9e,0x54,0xc0,0xf3]
+@ CHECK: vsri.8 q1, q8, #8 @ encoding: [0x70,0x24,0x88,0xf3]
+@ CHECK: vsri.16 q5, q2, #16 @ encoding: [0x54,0xa4,0x90,0xf3]
+@ CHECK: vsri.32 q7, q4, #32 @ encoding: [0x58,0xe4,0xa0,0xf3]
+@ CHECK: vsri.64 q9, q6, #64 @ encoding: [0xdc,0x24,0xc0,0xf3]
+
+@ CHECK: vsli.8 d12, d12, #7 @ encoding: [0x1c,0xc5,0x8f,0xf3]
+@ CHECK: vsli.16 d13, d13, #15 @ encoding: [0x1d,0xd5,0x9f,0xf3]
+@ CHECK: vsli.32 d14, d14, #31 @ encoding: [0x1e,0xe5,0xbf,0xf3]
+@ CHECK: vsli.64 d15, d15, #63 @ encoding: [0x9f,0xf5,0xbf,0xf3]
+@ CHECK: vsli.8 q8, q8, #7 @ encoding: [0x70,0x05,0xcf,0xf3]
+@ CHECK: vsli.16 q7, q7, #15 @ encoding: [0x5e,0xe5,0x9f,0xf3]
+@ CHECK: vsli.32 q4, q4, #31 @ encoding: [0x58,0x85,0xbf,0xf3]
+@ CHECK: vsli.64 q5, q5, #63 @ encoding: [0xda,0xa5,0xbf,0xf3]
+@ CHECK: vsri.8 d11, d11, #8 @ encoding: [0x1b,0xb4,0x88,0xf3]
+@ CHECK: vsri.16 d12, d12, #16 @ encoding: [0x1c,0xc4,0x90,0xf3]
+@ CHECK: vsri.32 d13, d13, #32 @ encoding: [0x1d,0xd4,0xa0,0xf3]
+@ CHECK: vsri.64 d14, d14, #64 @ encoding: [0x9e,0xe4,0x80,0xf3]
+@ CHECK: vsri.8 q8, q8, #8 @ encoding: [0x70,0x04,0xc8,0xf3]
+@ CHECK: vsri.16 q2, q2, #16 @ encoding: [0x54,0x44,0x90,0xf3]
+@ CHECK: vsri.32 q4, q4, #32 @ encoding: [0x58,0x84,0xa0,0xf3]
+@ CHECK: vsri.64 q6, q6, #64 @ encoding: [0xdc,0xc4,0x80,0xf3]
diff --git a/test/MC/ARM/neon-sub-encoding.s b/test/MC/ARM/neon-sub-encoding.s
index 8eb38a5..be67aa8 100644
--- a/test/MC/ARM/neon-sub-encoding.s
+++ b/test/MC/ARM/neon-sub-encoding.s
@@ -158,3 +158,18 @@
@ CHECK: vhsub.u8 q4, q4, q9 @ encoding: [0x62,0x82,0x08,0xf3]
@ CHECK: vhsub.u16 q5, q5, q8 @ encoding: [0x60,0xa2,0x1a,0xf3]
@ CHECK: vhsub.u32 q6, q6, q7 @ encoding: [0x4e,0xc2,0x2c,0xf3]
+
+
+ vsubw.s8 q6, d5
+ vsubw.s16 q7, d1
+ vsubw.s32 q8, d2
+ vsubw.u8 q6, d5
+ vsubw.u16 q7, d1
+ vsubw.u32 q8, d2
+
+@ CHECK: vsubw.s8 q6, q6, d5 @ encoding: [0x05,0xc3,0x8c,0xf2]
+@ CHECK: vsubw.s16 q7, q7, d1 @ encoding: [0x01,0xe3,0x9e,0xf2]
+@ CHECK: vsubw.s32 q8, q8, d2 @ encoding: [0x82,0x03,0xe0,0xf2]
+@ CHECK: vsubw.u8 q6, q6, d5 @ encoding: [0x05,0xc3,0x8c,0xf3]
+@ CHECK: vsubw.u16 q7, q7, d1 @ encoding: [0x01,0xe3,0x9e,0xf3]
+@ CHECK: vsubw.u32 q8, q8, d2 @ encoding: [0x82,0x03,0xe0,0xf3]
diff --git a/test/MC/ARM/neont2-absdiff-encoding.s b/test/MC/ARM/neont2-absdiff-encoding.s
index 4313483..ac2f9e7 100644
--- a/test/MC/ARM/neont2-absdiff-encoding.s
+++ b/test/MC/ARM/neont2-absdiff-encoding.s
@@ -1,4 +1,4 @@
-@RUN: llvm-mc -triple thumbv7-unknown-unknown -show-encoding < %s | FileCheck %s
+@RUN: llvm-mc -triple thumbv7-unknown-unknown -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
.code 16
diff --git a/test/MC/ARM/neont2-dup-encoding.s b/test/MC/ARM/neont2-dup-encoding.s
index bf25d70..d6db496 100644
--- a/test/MC/ARM/neont2-dup-encoding.s
+++ b/test/MC/ARM/neont2-dup-encoding.s
@@ -1,4 +1,4 @@
-@RUN: llvm-mc -triple thumbv7-unknown-unknown -show-encoding < %s | FileCheck %s
+@RUN: llvm-mc -triple thumbv7-unknown-unknown -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
.code 16
diff --git a/test/MC/ARM/neont2-shiftaccum-encoding.s b/test/MC/ARM/neont2-shiftaccum-encoding.s
index a3a18fc..3229b43 100644
--- a/test/MC/ARM/neont2-shiftaccum-encoding.s
+++ b/test/MC/ARM/neont2-shiftaccum-encoding.s
@@ -2,99 +2,211 @@
.code 16
-@ CHECK: vsra.s8 d17, d16, #8 @ encoding: [0xc8,0xef,0x30,0x11]
- vsra.s8 d17, d16, #8
-@ CHECK: vsra.s16 d17, d16, #16 @ encoding: [0xd0,0xef,0x30,0x11]
- vsra.s16 d17, d16, #16
-@ CHECK: vsra.s32 d17, d16, #32 @ encoding: [0xe0,0xef,0x30,0x11]
- vsra.s32 d17, d16, #32
-@ CHECK: vsra.s64 d17, d16, #64 @ encoding: [0xc0,0xef,0xb0,0x11]
- vsra.s64 d17, d16, #64
-@ CHECK: vsra.s8 q8, q9, #8 @ encoding: [0xc8,0xef,0x72,0x01]
- vsra.s8 q8, q9, #8
-@ CHECK: vsra.s16 q8, q9, #16 @ encoding: [0xd0,0xef,0x72,0x01]
- vsra.s16 q8, q9, #16
-@ CHECK: vsra.s32 q8, q9, #32 @ encoding: [0xe0,0xef,0x72,0x01]
- vsra.s32 q8, q9, #32
-@ CHECK: vsra.s64 q8, q9, #64 @ encoding: [0xc0,0xef,0xf2,0x01]
- vsra.s64 q8, q9, #64
-@ CHECK: vsra.u8 d17, d16, #8 @ encoding: [0xc8,0xff,0x30,0x11]
- vsra.u8 d17, d16, #8
-@ CHECK: vsra.u16 d17, d16, #16 @ encoding: [0xd0,0xff,0x30,0x11]
- vsra.u16 d17, d16, #16
-@ CHECK: vsra.u32 d17, d16, #32 @ encoding: [0xe0,0xff,0x30,0x11]
- vsra.u32 d17, d16, #32
-@ CHECK: vsra.u64 d17, d16, #64 @ encoding: [0xc0,0xff,0xb0,0x11]
- vsra.u64 d17, d16, #64
-@ CHECK: vsra.u8 q8, q9, #8 @ encoding: [0xc8,0xff,0x72,0x01]
- vsra.u8 q8, q9, #8
-@ CHECK: vsra.u16 q8, q9, #16 @ encoding: [0xd0,0xff,0x72,0x01]
- vsra.u16 q8, q9, #16
-@ CHECK: vsra.u32 q8, q9, #32 @ encoding: [0xe0,0xff,0x72,0x01]
- vsra.u32 q8, q9, #32
-@ CHECK: vsra.u64 q8, q9, #64 @ encoding: [0xc0,0xff,0xf2,0x01]
- vsra.u64 q8, q9, #64
-@ CHECK: vrsra.s8 d17, d16, #8 @ encoding: [0xc8,0xef,0x30,0x13]
- vrsra.s8 d17, d16, #8
-@ CHECK: vrsra.s16 d17, d16, #16 @ encoding: [0xd0,0xef,0x30,0x13]
- vrsra.s16 d17, d16, #16
-@ CHECK: vrsra.s32 d17, d16, #32 @ encoding: [0xe0,0xef,0x30,0x13]
- vrsra.s32 d17, d16, #32
-@ CHECK: vrsra.s64 d17, d16, #64 @ encoding: [0xc0,0xef,0xb0,0x13]
- vrsra.s64 d17, d16, #64
-@ CHECK: vrsra.u8 d17, d16, #8 @ encoding: [0xc8,0xff,0x30,0x13]
- vrsra.u8 d17, d16, #8
-@ CHECK: vrsra.u16 d17, d16, #16 @ encoding: [0xd0,0xff,0x30,0x13]
- vrsra.u16 d17, d16, #16
-@ CHECK: vrsra.u32 d17, d16, #32 @ encoding: [0xe0,0xff,0x30,0x13]
- vrsra.u32 d17, d16, #32
-@ CHECK: vrsra.u64 d17, d16, #64 @ encoding: [0xc0,0xff,0xb0,0x13]
- vrsra.u64 d17, d16, #64
-@ CHECK: vrsra.s8 q8, q9, #8 @ encoding: [0xc8,0xef,0x72,0x03]
- vrsra.s8 q8, q9, #8
-@ CHECK: vrsra.s16 q8, q9, #16 @ encoding: [0xd0,0xef,0x72,0x03]
- vrsra.s16 q8, q9, #16
-@ CHECK: vrsra.s32 q8, q9, #32 @ encoding: [0xe0,0xef,0x72,0x03]
- vrsra.s32 q8, q9, #32
-@ CHECK: vrsra.s64 q8, q9, #64 @ encoding: [0xc0,0xef,0xf2,0x03]
- vrsra.s64 q8, q9, #64
-@ CHECK: vrsra.u8 q8, q9, #8 @ encoding: [0xc8,0xff,0x72,0x03]
- vrsra.u8 q8, q9, #8
-@ CHECK: vrsra.u16 q8, q9, #16 @ encoding: [0xd0,0xff,0x72,0x03]
- vrsra.u16 q8, q9, #16
-@ CHECK: vrsra.u32 q8, q9, #32 @ encoding: [0xe0,0xff,0x72,0x03]
- vrsra.u32 q8, q9, #32
+ vsra.s8 d17, d16, #8
+ vsra.s16 d15, d14, #16
+ vsra.s32 d13, d12, #32
+ vsra.s64 d11, d10, #64
+ vsra.s8 q7, q2, #8
+ vsra.s16 q3, q6, #16
+ vsra.s32 q9, q5, #32
+ vsra.s64 q8, q4, #64
+ vsra.u8 d17, d16, #8
+ vsra.u16 d11, d14, #11
+ vsra.u32 d12, d15, #22
+ vsra.u64 d13, d16, #54
+ vsra.u8 q1, q7, #8
+ vsra.u16 q2, q7, #6
+ vsra.u32 q3, q6, #21
+ vsra.u64 q4, q5, #25
+
+ @ Two-operand syntax variant.
+ vsra.s8 d16, #8
+ vsra.s16 d14, #16
+ vsra.s32 d12, #32
+ vsra.s64 d10, #64
+ vsra.s8 q2, #8
+ vsra.s16 q6, #16
+ vsra.s32 q5, #32
+ vsra.s64 q4, #64
+ vsra.u8 d16, #8
+ vsra.u16 d14, #11
+ vsra.u32 d15, #22
+ vsra.u64 d16, #54
+ vsra.u8 q7, #8
+ vsra.u16 q7, #6
+ vsra.u32 q6, #21
+ vsra.u64 q5, #25
+
+@ CHECK: vsra.s8 d17, d16, #8 @ encoding: [0xc8,0xef,0x30,0x11]
+@ CHECK: vsra.s16 d15, d14, #16 @ encoding: [0x90,0xef,0x1e,0xf1]
+@ CHECK: vsra.s32 d13, d12, #32 @ encoding: [0xa0,0xef,0x1c,0xd1]
+@ CHECK: vsra.s64 d11, d10, #64 @ encoding: [0x80,0xef,0x9a,0xb1]
+@ CHECK: vsra.s8 q7, q2, #8 @ encoding: [0x88,0xef,0x54,0xe1]
+@ CHECK: vsra.s16 q3, q6, #16 @ encoding: [0x90,0xef,0x5c,0x61]
+@ CHECK: vsra.s32 q9, q5, #32 @ encoding: [0xe0,0xef,0x5a,0x21]
+@ CHECK: vsra.s64 q8, q4, #64 @ encoding: [0xc0,0xef,0xd8,0x01]
+@ CHECK: vsra.u8 d17, d16, #8 @ encoding: [0xc8,0xff,0x30,0x11]
+@ CHECK: vsra.u16 d11, d14, #11 @ encoding: [0x95,0xff,0x1e,0xb1]
+@ CHECK: vsra.u32 d12, d15, #22 @ encoding: [0xaa,0xff,0x1f,0xc1]
+@ CHECK: vsra.u64 d13, d16, #54 @ encoding: [0x8a,0xff,0xb0,0xd1]
+@ CHECK: vsra.u8 q1, q7, #8 @ encoding: [0x88,0xff,0x5e,0x21]
+@ CHECK: vsra.u16 q2, q7, #6 @ encoding: [0x9a,0xff,0x5e,0x41]
+@ CHECK: vsra.u32 q3, q6, #21 @ encoding: [0xab,0xff,0x5c,0x61]
+@ CHECK: vsra.u64 q4, q5, #25 @ encoding: [0xa7,0xff,0xda,0x81]
+
+@ CHECK: vsra.s8 d16, d16, #8 @ encoding: [0xc8,0xef,0x30,0x01]
+@ CHECK: vsra.s16 d14, d14, #16 @ encoding: [0x90,0xef,0x1e,0xe1]
+@ CHECK: vsra.s32 d12, d12, #32 @ encoding: [0xa0,0xef,0x1c,0xc1]
+@ CHECK: vsra.s64 d10, d10, #64 @ encoding: [0x80,0xef,0x9a,0xa1]
+@ CHECK: vsra.s8 q2, q2, #8 @ encoding: [0x88,0xef,0x54,0x41]
+@ CHECK: vsra.s16 q6, q6, #16 @ encoding: [0x90,0xef,0x5c,0xc1]
+@ CHECK: vsra.s32 q5, q5, #32 @ encoding: [0xa0,0xef,0x5a,0xa1]
+@ CHECK: vsra.s64 q4, q4, #64 @ encoding: [0x80,0xef,0xd8,0x81]
+@ CHECK: vsra.u8 d16, d16, #8 @ encoding: [0xc8,0xff,0x30,0x01]
+@ CHECK: vsra.u16 d14, d14, #11 @ encoding: [0x95,0xff,0x1e,0xe1]
+@ CHECK: vsra.u32 d15, d15, #22 @ encoding: [0xaa,0xff,0x1f,0xf1]
+@ CHECK: vsra.u64 d16, d16, #54 @ encoding: [0xca,0xff,0xb0,0x01]
+@ CHECK: vsra.u8 q7, q7, #8 @ encoding: [0x88,0xff,0x5e,0xe1]
+@ CHECK: vsra.u16 q7, q7, #6 @ encoding: [0x9a,0xff,0x5e,0xe1]
+@ CHECK: vsra.u32 q6, q6, #21 @ encoding: [0xab,0xff,0x5c,0xc1]
+@ CHECK: vsra.u64 q5, q5, #25 @ encoding: [0xa7,0xff,0xda,0xa1]
+
+
+ vrsra.s8 d5, d26, #8
+ vrsra.s16 d6, d25, #16
+ vrsra.s32 d7, d24, #32
+ vrsra.s64 d14, d23, #64
+ vrsra.u8 d15, d22, #8
+ vrsra.u16 d16, d21, #16
+ vrsra.u32 d17, d20, #32
+ vrsra.u64 d18, d19, #64
+ vrsra.s8 q1, q2, #8
+ vrsra.s16 q2, q3, #16
+ vrsra.s32 q3, q4, #32
+ vrsra.s64 q4, q5, #64
+ vrsra.u8 q5, q6, #8
+ vrsra.u16 q6, q7, #16
+ vrsra.u32 q7, q8, #32
+ vrsra.u64 q8, q9, #64
+
+ @ Two-operand syntax variant.
+ vrsra.s8 d26, #8
+ vrsra.s16 d25, #16
+ vrsra.s32 d24, #32
+ vrsra.s64 d23, #64
+ vrsra.u8 d22, #8
+ vrsra.u16 d21, #16
+ vrsra.u32 d20, #32
+ vrsra.u64 d19, #64
+ vrsra.s8 q2, #8
+ vrsra.s16 q3, #16
+ vrsra.s32 q4, #32
+ vrsra.s64 q5, #64
+ vrsra.u8 q6, #8
+ vrsra.u16 q7, #16
+ vrsra.u32 q8, #32
+ vrsra.u64 q9, #64
+
+@ CHECK: vrsra.s8 d5, d26, #8 @ encoding: [0x88,0xef,0x3a,0x53]
+@ CHECK: vrsra.s16 d6, d25, #16 @ encoding: [0x90,0xef,0x39,0x63]
+@ CHECK: vrsra.s32 d7, d24, #32 @ encoding: [0xa0,0xef,0x38,0x73]
+@ CHECK: vrsra.s64 d14, d23, #64 @ encoding: [0x80,0xef,0xb7,0xe3]
+@ CHECK: vrsra.u8 d15, d22, #8 @ encoding: [0x88,0xff,0x36,0xf3]
+@ CHECK: vrsra.u16 d16, d21, #16 @ encoding: [0xd0,0xff,0x35,0x03]
+@ CHECK: vrsra.u32 d17, d20, #32 @ encoding: [0xe0,0xff,0x34,0x13]
+@ CHECK: vrsra.u64 d18, d19, #64 @ encoding: [0xc0,0xff,0xb3,0x23]
+@ CHECK: vrsra.s8 q1, q2, #8 @ encoding: [0x88,0xef,0x54,0x23]
+@ CHECK: vrsra.s16 q2, q3, #16 @ encoding: [0x90,0xef,0x56,0x43]
+@ CHECK: vrsra.s32 q3, q4, #32 @ encoding: [0xa0,0xef,0x58,0x63]
+@ CHECK: vrsra.s64 q4, q5, #64 @ encoding: [0x80,0xef,0xda,0x83]
+@ CHECK: vrsra.u8 q5, q6, #8 @ encoding: [0x88,0xff,0x5c,0xa3]
+@ CHECK: vrsra.u16 q6, q7, #16 @ encoding: [0x90,0xff,0x5e,0xc3]
+@ CHECK: vrsra.u32 q7, q8, #32 @ encoding: [0xa0,0xff,0x70,0xe3]
@ CHECK: vrsra.u64 q8, q9, #64 @ encoding: [0xc0,0xff,0xf2,0x03]
- vrsra.u64 q8, q9, #64
-@ CHECK: vsli.8 d17, d16, #7 @ encoding: [0xcf,0xff,0x30,0x15]
- vsli.8 d17, d16, #7
-@ CHECK: vsli.16 d17, d16, #15 @ encoding: [0xdf,0xff,0x30,0x15]
- vsli.16 d17, d16, #15
-@ CHECK: vsli.32 d17, d16, #31 @ encoding: [0xff,0xff,0x30,0x15]
- vsli.32 d17, d16, #31
-@ CHECK: vsli.64 d17, d16, #63 @ encoding: [0xff,0xff,0xb0,0x15]
- vsli.64 d17, d16, #63
-@ CHECK: vsli.8 q9, q8, #7 @ encoding: [0xcf,0xff,0x70,0x25]
- vsli.8 q9, q8, #7
-@ CHECK: vsli.16 q9, q8, #15 @ encoding: [0xdf,0xff,0x70,0x25]
- vsli.16 q9, q8, #15
-@ CHECK: vsli.32 q9, q8, #31 @ encoding: [0xff,0xff,0x70,0x25]
- vsli.32 q9, q8, #31
-@ CHECK: vsli.64 q9, q8, #63 @ encoding: [0xff,0xff,0xf0,0x25]
- vsli.64 q9, q8, #63
-@ CHECK: vsri.8 d17, d16, #8 @ encoding: [0xc8,0xff,0x30,0x14]
- vsri.8 d17, d16, #8
-@ CHECK: vsri.16 d17, d16, #16 @ encoding: [0xd0,0xff,0x30,0x14]
- vsri.16 d17, d16, #16
-@ CHECK: vsri.32 d17, d16, #32 @ encoding: [0xe0,0xff,0x30,0x14]
- vsri.32 d17, d16, #32
-@ CHECK: vsri.64 d17, d16, #64 @ encoding: [0xc0,0xff,0xb0,0x14]
- vsri.64 d17, d16, #64
-@ CHECK: vsri.8 q9, q8, #8 @ encoding: [0xc8,0xff,0x70,0x24]
- vsri.8 q9, q8, #8
-@ CHECK: vsri.16 q9, q8, #16 @ encoding: [0xd0,0xff,0x70,0x24]
- vsri.16 q9, q8, #16
-@ CHECK: vsri.32 q9, q8, #32 @ encoding: [0xe0,0xff,0x70,0x24]
- vsri.32 q9, q8, #32
-@ CHECK: vsri.64 q9, q8, #64 @ encoding: [0xc0,0xff,0xf0,0x24]
- vsri.64 q9, q8, #64
+
+@ CHECK: vrsra.s8 d26, d26, #8 @ encoding: [0xc8,0xef,0x3a,0xa3]
+@ CHECK: vrsra.s16 d25, d25, #16 @ encoding: [0xd0,0xef,0x39,0x93]
+@ CHECK: vrsra.s32 d24, d24, #32 @ encoding: [0xe0,0xef,0x38,0x83]
+@ CHECK: vrsra.s64 d23, d23, #64 @ encoding: [0xc0,0xef,0xb7,0x73]
+@ CHECK: vrsra.u8 d22, d22, #8 @ encoding: [0xc8,0xff,0x36,0x63]
+@ CHECK: vrsra.u16 d21, d21, #16 @ encoding: [0xd0,0xff,0x35,0x53]
+@ CHECK: vrsra.u32 d20, d20, #32 @ encoding: [0xe0,0xff,0x34,0x43]
+@ CHECK: vrsra.u64 d19, d19, #64 @ encoding: [0xc0,0xff,0xb3,0x33]
+@ CHECK: vrsra.s8 q2, q2, #8 @ encoding: [0x88,0xef,0x54,0x43]
+@ CHECK: vrsra.s16 q3, q3, #16 @ encoding: [0x90,0xef,0x56,0x63]
+@ CHECK: vrsra.s32 q4, q4, #32 @ encoding: [0xa0,0xef,0x58,0x83]
+@ CHECK: vrsra.s64 q5, q5, #64 @ encoding: [0x80,0xef,0xda,0xa3]
+@ CHECK: vrsra.u8 q6, q6, #8 @ encoding: [0x88,0xff,0x5c,0xc3]
+@ CHECK: vrsra.u16 q7, q7, #16 @ encoding: [0x90,0xff,0x5e,0xe3]
+@ CHECK: vrsra.u32 q8, q8, #32 @ encoding: [0xe0,0xff,0x70,0x03]
+@ CHECK: vrsra.u64 q9, q9, #64 @ encoding: [0xc0,0xff,0xf2,0x23]
+
+
+ vsli.8 d11, d12, #7
+ vsli.16 d12, d13, #15
+ vsli.32 d13, d14, #31
+ vsli.64 d14, d15, #63
+ vsli.8 q1, q8, #7
+ vsli.16 q2, q7, #15
+ vsli.32 q3, q4, #31
+ vsli.64 q4, q5, #63
+ vsri.8 d28, d11, #8
+ vsri.16 d26, d12, #16
+ vsri.32 d24, d13, #32
+ vsri.64 d21, d14, #64
+ vsri.8 q1, q8, #8
+ vsri.16 q5, q2, #16
+ vsri.32 q7, q4, #32
+ vsri.64 q9, q6, #64
+
+ @ Two-operand syntax variant.
+ vsli.8 d12, #7
+ vsli.16 d13, #15
+ vsli.32 d14, #31
+ vsli.64 d15, #63
+ vsli.8 q8, #7
+ vsli.16 q7, #15
+ vsli.32 q4, #31
+ vsli.64 q5, #63
+ vsri.8 d11, #8
+ vsri.16 d12, #16
+ vsri.32 d13, #32
+ vsri.64 d14, #64
+ vsri.8 q8, #8
+ vsri.16 q2, #16
+ vsri.32 q4, #32
+ vsri.64 q6, #64
+
+@ CHECK: vsli.8 d11, d12, #7 @ encoding: [0x8f,0xff,0x1c,0xb5]
+@ CHECK: vsli.16 d12, d13, #15 @ encoding: [0x9f,0xff,0x1d,0xc5]
+@ CHECK: vsli.32 d13, d14, #31 @ encoding: [0xbf,0xff,0x1e,0xd5]
+@ CHECK: vsli.64 d14, d15, #63 @ encoding: [0xbf,0xff,0x9f,0xe5]
+@ CHECK: vsli.8 q1, q8, #7 @ encoding: [0x8f,0xff,0x70,0x25]
+@ CHECK: vsli.16 q2, q7, #15 @ encoding: [0x9f,0xff,0x5e,0x45]
+@ CHECK: vsli.32 q3, q4, #31 @ encoding: [0xbf,0xff,0x58,0x65]
+@ CHECK: vsli.64 q4, q5, #63 @ encoding: [0xbf,0xff,0xda,0x85]
+@ CHECK: vsri.8 d28, d11, #8 @ encoding: [0xc8,0xff,0x1b,0xc4]
+@ CHECK: vsri.16 d26, d12, #16 @ encoding: [0xd0,0xff,0x1c,0xa4]
+@ CHECK: vsri.32 d24, d13, #32 @ encoding: [0xe0,0xff,0x1d,0x84]
+@ CHECK: vsri.64 d21, d14, #64 @ encoding: [0xc0,0xff,0x9e,0x54]
+@ CHECK: vsri.8 q1, q8, #8 @ encoding: [0x88,0xff,0x70,0x24]
+@ CHECK: vsri.16 q5, q2, #16 @ encoding: [0x90,0xff,0x54,0xa4]
+@ CHECK: vsri.32 q7, q4, #32 @ encoding: [0xa0,0xff,0x58,0xe4]
+@ CHECK: vsri.64 q9, q6, #64 @ encoding: [0xc0,0xff,0xdc,0x24]
+
+@ CHECK: vsli.8 d12, d12, #7 @ encoding: [0x8f,0xff,0x1c,0xc5]
+@ CHECK: vsli.16 d13, d13, #15 @ encoding: [0x9f,0xff,0x1d,0xd5]
+@ CHECK: vsli.32 d14, d14, #31 @ encoding: [0xbf,0xff,0x1e,0xe5]
+@ CHECK: vsli.64 d15, d15, #63 @ encoding: [0xbf,0xff,0x9f,0xf5]
+@ CHECK: vsli.8 q8, q8, #7 @ encoding: [0xcf,0xff,0x70,0x05]
+@ CHECK: vsli.16 q7, q7, #15 @ encoding: [0x9f,0xff,0x5e,0xe5]
+@ CHECK: vsli.32 q4, q4, #31 @ encoding: [0xbf,0xff,0x58,0x85]
+@ CHECK: vsli.64 q5, q5, #63 @ encoding: [0xbf,0xff,0xda,0xa5]
+@ CHECK: vsri.8 d11, d11, #8 @ encoding: [0x88,0xff,0x1b,0xb4]
+@ CHECK: vsri.16 d12, d12, #16 @ encoding: [0x90,0xff,0x1c,0xc4]
+@ CHECK: vsri.32 d13, d13, #32 @ encoding: [0xa0,0xff,0x1d,0xd4]
+@ CHECK: vsri.64 d14, d14, #64 @ encoding: [0x80,0xff,0x9e,0xe4]
+@ CHECK: vsri.8 q8, q8, #8 @ encoding: [0xc8,0xff,0x70,0x04]
+@ CHECK: vsri.16 q2, q2, #16 @ encoding: [0x90,0xff,0x54,0x44]
+@ CHECK: vsri.32 q4, q4, #32 @ encoding: [0xa0,0xff,0x58,0x84]
+@ CHECK: vsri.64 q6, q6, #64 @ encoding: [0x80,0xff,0xdc,0xc4]
diff --git a/test/MC/ARM/simple-fp-encoding.s b/test/MC/ARM/simple-fp-encoding.s
index b592f1e..2a22620 100644
--- a/test/MC/ARM/simple-fp-encoding.s
+++ b/test/MC/ARM/simple-fp-encoding.s
@@ -1,124 +1,121 @@
@ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7-apple-darwin -show-encoding < %s | FileCheck %s
+ vadd.f64 d16, d17, d16
+ vadd.f32 s0, s1, s0
@ CHECK: vadd.f64 d16, d17, d16 @ encoding: [0xa0,0x0b,0x71,0xee]
- vadd.f64 d16, d17, d16
-
@ CHECK: vadd.f32 s0, s1, s0 @ encoding: [0x80,0x0a,0x30,0xee]
- vadd.f32 s0, s1, s0
+ vsub.f64 d16, d17, d16
+ vsub.f32 s0, s1, s0
@ CHECK: vsub.f64 d16, d17, d16 @ encoding: [0xe0,0x0b,0x71,0xee]
- vsub.f64 d16, d17, d16
-
@ CHECK: vsub.f32 s0, s1, s0 @ encoding: [0xc0,0x0a,0x30,0xee]
- vsub.f32 s0, s1, s0
-@ CHECK: vdiv.f64 d16, d17, d16 @ encoding: [0xa0,0x0b,0xc1,0xee]
- vdiv.f64 d16, d17, d16
+ vdiv.f64 d16, d17, d16
+ vdiv.f32 s0, s1, s0
+ vdiv.f32 s5, s7
+ vdiv.f64 d5, d7
-@ CHECK: vdiv.f32 s0, s1, s0 @ encoding: [0x80,0x0a,0x80,0xee]
- vdiv.f32 s0, s1, s0
+@ CHECK: vdiv.f64 d16, d17, d16 @ encoding: [0xa0,0x0b,0xc1,0xee]
+@ CHECK: vdiv.f32 s0, s1, s0 @ encoding: [0x80,0x0a,0x80,0xee]
+@ CHECK: vdiv.f32 s5, s5, s7 @ encoding: [0xa3,0x2a,0xc2,0xee]
+@ CHECK: vdiv.f64 d5, d5, d7 @ encoding: [0x07,0x5b,0x85,0xee]
-@ CHECK: vmul.f64 d16, d17, d16 @ encoding: [0xa0,0x0b,0x61,0xee]
- vmul.f64 d16, d17, d16
-@ CHECK: vmul.f64 d20, d20, d17 @ encoding: [0xa1,0x4b,0x64,0xee]
+ vmul.f64 d16, d17, d16
vmul.f64 d20, d17
+ vmul.f32 s0, s1, s0
+ vmul.f32 s11, s21
-@ CHECK: vmul.f32 s0, s1, s0 @ encoding: [0x80,0x0a,0x20,0xee]
- vmul.f32 s0, s1, s0
-@ CHECK: vmul.f32 s11, s11, s21 @ encoding: [0xaa,0x5a,0x65,0xee]
- vmul.f32 s11, s21
+@ CHECK: vmul.f64 d16, d17, d16 @ encoding: [0xa0,0x0b,0x61,0xee]
+@ CHECK: vmul.f64 d20, d20, d17 @ encoding: [0xa1,0x4b,0x64,0xee]
+@ CHECK: vmul.f32 s0, s1, s0 @ encoding: [0x80,0x0a,0x20,0xee]
+@ CHECK: vmul.f32 s11, s11, s21 @ encoding: [0xaa,0x5a,0x65,0xee]
-@ CHECK: vnmul.f64 d16, d17, d16 @ encoding: [0xe0,0x0b,0x61,0xee]
vnmul.f64 d16, d17, d16
+ vnmul.f32 s0, s1, s0
+@ CHECK: vnmul.f64 d16, d17, d16 @ encoding: [0xe0,0x0b,0x61,0xee]
@ CHECK: vnmul.f32 s0, s1, s0 @ encoding: [0xc0,0x0a,0x20,0xee]
- vnmul.f32 s0, s1, s0
-@ CHECK: vcmpe.f64 d17, d16 @ encoding: [0xe0,0x1b,0xf4,0xee]
vcmpe.f64 d17, d16
+ vcmpe.f32 s1, s0
+@ CHECK: vcmpe.f64 d17, d16 @ encoding: [0xe0,0x1b,0xf4,0xee]
@ CHECK: vcmpe.f32 s1, s0 @ encoding: [0xc0,0x0a,0xf4,0xee]
- vcmpe.f32 s1, s0
-@ CHECK: vcmpe.f64 d16, #0 @ encoding: [0xc0,0x0b,0xf5,0xee]
vcmpe.f64 d16, #0
+ vcmpe.f32 s0, #0
+@ CHECK: vcmpe.f64 d16, #0 @ encoding: [0xc0,0x0b,0xf5,0xee]
@ CHECK: vcmpe.f32 s0, #0 @ encoding: [0xc0,0x0a,0xb5,0xee]
- vcmpe.f32 s0, #0
-@ CHECK: vabs.f64 d16, d16 @ encoding: [0xe0,0x0b,0xf0,0xee]
vabs.f64 d16, d16
+ vabs.f32 s0, s0
+@ CHECK: vabs.f64 d16, d16 @ encoding: [0xe0,0x0b,0xf0,0xee]
@ CHECK: vabs.f32 s0, s0 @ encoding: [0xc0,0x0a,0xb0,0xee]
- vabs.f32 s0, s0
-@ CHECK: vcvt.f32.f64 s0, d16 @ encoding: [0xe0,0x0b,0xb7,0xee]
vcvt.f32.f64 s0, d16
+ vcvt.f64.f32 d16, s0
+@ CHECK: vcvt.f32.f64 s0, d16 @ encoding: [0xe0,0x0b,0xb7,0xee]
@ CHECK: vcvt.f64.f32 d16, s0 @ encoding: [0xc0,0x0a,0xf7,0xee]
- vcvt.f64.f32 d16, s0
-@ CHECK: vneg.f64 d16, d16 @ encoding: [0x60,0x0b,0xf1,0xee]
vneg.f64 d16, d16
+ vneg.f32 s0, s0
+@ CHECK: vneg.f64 d16, d16 @ encoding: [0x60,0x0b,0xf1,0xee]
@ CHECK: vneg.f32 s0, s0 @ encoding: [0x40,0x0a,0xb1,0xee]
- vneg.f32 s0, s0
-@ CHECK: vsqrt.f64 d16, d16 @ encoding: [0xe0,0x0b,0xf1,0xee]
vsqrt.f64 d16, d16
+ vsqrt.f32 s0, s0
+@ CHECK: vsqrt.f64 d16, d16 @ encoding: [0xe0,0x0b,0xf1,0xee]
@ CHECK: vsqrt.f32 s0, s0 @ encoding: [0xc0,0x0a,0xb1,0xee]
- vsqrt.f32 s0, s0
-@ CHECK: vcvt.f64.s32 d16, s0 @ encoding: [0xc0,0x0b,0xf8,0xee]
vcvt.f64.s32 d16, s0
-
-@ CHECK: vcvt.f32.s32 s0, s0 @ encoding: [0xc0,0x0a,0xb8,0xee]
vcvt.f32.s32 s0, s0
-
-@ CHECK: vcvt.f64.u32 d16, s0 @ encoding: [0x40,0x0b,0xf8,0xee]
vcvt.f64.u32 d16, s0
-
-@ CHECK: vcvt.f32.u32 s0, s0 @ encoding: [0x40,0x0a,0xb8,0xee]
vcvt.f32.u32 s0, s0
-
-@ CHECK: vcvt.s32.f64 s0, d16 @ encoding: [0xe0,0x0b,0xbd,0xee]
vcvt.s32.f64 s0, d16
-
-@ CHECK: vcvt.s32.f32 s0, s0 @ encoding: [0xc0,0x0a,0xbd,0xee]
vcvt.s32.f32 s0, s0
-
-@ CHECK: vcvt.u32.f64 s0, d16 @ encoding: [0xe0,0x0b,0xbc,0xee]
vcvt.u32.f64 s0, d16
+ vcvt.u32.f32 s0, s0
+@ CHECK: vcvt.f64.s32 d16, s0 @ encoding: [0xc0,0x0b,0xf8,0xee]
+@ CHECK: vcvt.f32.s32 s0, s0 @ encoding: [0xc0,0x0a,0xb8,0xee]
+@ CHECK: vcvt.f64.u32 d16, s0 @ encoding: [0x40,0x0b,0xf8,0xee]
+@ CHECK: vcvt.f32.u32 s0, s0 @ encoding: [0x40,0x0a,0xb8,0xee]
+@ CHECK: vcvt.s32.f64 s0, d16 @ encoding: [0xe0,0x0b,0xbd,0xee]
+@ CHECK: vcvt.s32.f32 s0, s0 @ encoding: [0xc0,0x0a,0xbd,0xee]
+@ CHECK: vcvt.u32.f64 s0, d16 @ encoding: [0xe0,0x0b,0xbc,0xee]
@ CHECK: vcvt.u32.f32 s0, s0 @ encoding: [0xc0,0x0a,0xbc,0xee]
- vcvt.u32.f32 s0, s0
-@ CHECK: vmla.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0x42,0xee]
+
vmla.f64 d16, d18, d17
+ vmla.f32 s1, s2, s0
+@ CHECK: vmla.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0x42,0xee]
@ CHECK: vmla.f32 s1, s2, s0 @ encoding: [0x00,0x0a,0x41,0xee]
- vmla.f32 s1, s2, s0
-@ CHECK: vmls.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0x42,0xee]
vmls.f64 d16, d18, d17
+ vmls.f32 s1, s2, s0
+@ CHECK: vmls.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0x42,0xee]
@ CHECK: vmls.f32 s1, s2, s0 @ encoding: [0x40,0x0a,0x41,0xee]
- vmls.f32 s1, s2, s0
-@ CHECK: vnmla.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0x52,0xee]
vnmla.f64 d16, d18, d17
+ vnmla.f32 s1, s2, s0
+@ CHECK: vnmla.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0x52,0xee]
@ CHECK: vnmla.f32 s1, s2, s0 @ encoding: [0x40,0x0a,0x51,0xee]
- vnmla.f32 s1, s2, s0
-@ CHECK: vnmls.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0x52,0xee]
vnmls.f64 d16, d18, d17
+ vnmls.f32 s1, s2, s0
+@ CHECK: vnmls.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0x52,0xee]
@ CHECK: vnmls.f32 s1, s2, s0 @ encoding: [0x00,0x0a,0x51,0xee]
- vnmls.f32 s1, s2, s0
vmrs APSR_nzcv, fpscr
vmrs apsr_nzcv, fpscr
@@ -199,6 +196,27 @@
@ CHECK: vmov r0, r1, d16 @ encoding: [0x30,0x0b,0x51,0xec]
vmov r0, r1, d16
+@ Between two single precision registers and two core registers
+ vmov s3, s4, r1, r2
+ vmov s2, s3, r1, r2
+ vmov r1, r2, s3, s4
+ vmov r1, r2, s2, s3
+@ CHECK: vmov s3, s4, r1, r2 @ encoding: [0x31,0x1a,0x42,0xec]
+@ CHECK: vmov s2, s3, r1, r2 @ encoding: [0x11,0x1a,0x42,0xec]
+@ CHECK: vmov r1, r2, s3, s4 @ encoding: [0x31,0x1a,0x52,0xec]
+@ CHECK: vmov r1, r2, s2, s3 @ encoding: [0x11,0x1a,0x52,0xec]
+
+@ Between one double precision register and two core registers
+ vmov d15, r1, r2
+ vmov d16, r1, r2
+ vmov r1, r2, d15
+ vmov r1, r2, d16
+@ CHECK: vmov d15, r1, r2 @ encoding: [0x1f,0x1b,0x42,0xec]
+@ CHECK: vmov d16, r1, r2 @ encoding: [0x30,0x1b,0x42,0xec]
+@ CHECK: vmov r1, r2, d15 @ encoding: [0x1f,0x1b,0x52,0xec]
+@ CHECK: vmov r1, r2, d16 @ encoding: [0x30,0x1b,0x52,0xec]
+
+
@ CHECK: vldr d17, [r0] @ encoding: [0x00,0x1b,0xd0,0xed]
@ CHECK: vldr s0, [lr] @ encoding: [0x00,0x0a,0x9e,0xed]
@ CHECK: vldr d0, [lr] @ encoding: [0x00,0x0b,0x9e,0xed]
diff --git a/test/MC/ARM/thumb-diagnostics.s b/test/MC/ARM/thumb-diagnostics.s
index 99d7e38..6f822d1 100644
--- a/test/MC/ARM/thumb-diagnostics.s
+++ b/test/MC/ARM/thumb-diagnostics.s
@@ -67,7 +67,7 @@ error: invalid operand for instruction
@ Invalid writeback and register lists for STM
stm r1, {r2, r6}
stm r1!, {r2, r9}
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: thumb2
@ CHECK-ERRORS: stm r1, {r2, r6}
@ CHECK-ERRORS: ^
@ CHECK-ERRORS: error: registers must be in range r0-r7
@@ -95,13 +95,13 @@ error: invalid operand for instruction
str r2, [r7, #-1]
str r5, [r1, #3]
str r3, [r7, #128]
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: thumb2
@ CHECK-ERRORS: str r2, [r7, #-1]
@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: thumb2
@ CHECK-ERRORS: str r5, [r1, #3]
@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: thumb2
@ CHECK-ERRORS: str r3, [r7, #128]
@ CHECK-ERRORS: ^
@@ -111,7 +111,7 @@ error: invalid operand for instruction
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: svc #-1
@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: arm-mode
@ CHECK-ERRORS: svc #256
@ CHECK-ERRORS: ^
@@ -121,15 +121,38 @@ error: invalid operand for instruction
add sp, #3
add sp, sp, #512
add r2, sp, #1024
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: thumb2
@ CHECK-ERRORS: add sp, #-1
@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: thumb2
@ CHECK-ERRORS: add sp, #3
@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: thumb2
@ CHECK-ERRORS: add sp, sp, #512
@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires a CPU feature not currently enabled
+@ CHECK-ERRORS: error: instruction requires: arm-mode
@ CHECK-ERRORS: add r2, sp, #1024
@ CHECK-ERRORS: ^
+
+ add r2, sp, ip
+@ CHECK-ERRORS: error: source register must be the same as destination
+@ CHECK-ERRORS: add r2, sp, ip
+@ CHECK-ERRORS: ^
+
+@------------------------------------------------------------------------------
+@ WFE/WFI/YIELD - not supported pre v6T2
+@------------------------------------------------------------------------------
+ wfe
+ wfi
+ yield
+
+@ CHECK-ERRORS: error: instruction requires: thumb2
+@ CHECK-ERRORS: wfe
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: instruction requires: thumb2
+@ CHECK-ERRORS: wfi
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: instruction requires: thumb2
+@ CHECK-ERRORS: yield
+@ CHECK-ERRORS: ^
+
diff --git a/test/MC/ARM/thumb.s b/test/MC/ARM/thumb.s
index 625882c..2223bdc 100644
--- a/test/MC/ARM/thumb.s
+++ b/test/MC/ARM/thumb.s
@@ -44,13 +44,6 @@
nop
@ CHECK: nop @ encoding: [0xc0,0x46]
- wfe
- wfi
- yield
-@ CHECK: wfe @ encoding: [0x20,0xbf]
-@ CHECK: wfi @ encoding: [0x30,0xbf]
-@ CHECK: yield @ encoding: [0x10,0xbf]
-
cpsie aif
@ CHECK: cpsie aif @ encoding: [0x67,0xb6]
diff --git a/test/MC/ARM/thumb2-diagnostics.s b/test/MC/ARM/thumb2-diagnostics.s
index e38f53c..d94c686 100644
--- a/test/MC/ARM/thumb2-diagnostics.s
+++ b/test/MC/ARM/thumb2-diagnostics.s
@@ -40,5 +40,5 @@
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
diff --git a/test/MC/ARM/thumb2-mclass.s b/test/MC/ARM/thumb2-mclass.s
index 10460f9..b7af723 100644
--- a/test/MC/ARM/thumb2-mclass.s
+++ b/test/MC/ARM/thumb2-mclass.s
@@ -44,9 +44,21 @@
@------------------------------------------------------------------------------
msr apsr, r0
+ msr apsr_nzcvq, r0
+ msr apsr_g, r0
+ msr apsr_nzcvqg, r0
msr iapsr, r0
+ msr iapsr_nzcvq, r0
+ msr iapsr_g, r0
+ msr iapsr_nzcvqg, r0
msr eapsr, r0
+ msr eapsr_nzcvq, r0
+ msr eapsr_g, r0
+ msr eapsr_nzcvqg, r0
msr xpsr, r0
+ msr xpsr_nzcvq, r0
+ msr xpsr_g, r0
+ msr xpsr_nzcvqg, r0
msr ipsr, r0
msr epsr, r0
msr iepsr, r0
@@ -58,17 +70,29 @@
msr faultmask, r0
msr control, r0
-@ CHECK: msr apsr, r0 @ encoding: [0x80,0xf3,0x00,0x80]
-@ CHECK: msr iapsr, r0 @ encoding: [0x80,0xf3,0x01,0x80]
-@ CHECK: msr eapsr, r0 @ encoding: [0x80,0xf3,0x02,0x80]
-@ CHECK: msr xpsr, r0 @ encoding: [0x80,0xf3,0x03,0x80]
-@ CHECK: msr ipsr, r0 @ encoding: [0x80,0xf3,0x05,0x80]
-@ CHECK: msr epsr, r0 @ encoding: [0x80,0xf3,0x06,0x80]
-@ CHECK: msr iepsr, r0 @ encoding: [0x80,0xf3,0x07,0x80]
-@ CHECK: msr msp, r0 @ encoding: [0x80,0xf3,0x08,0x80]
-@ CHECK: msr psp, r0 @ encoding: [0x80,0xf3,0x09,0x80]
-@ CHECK: msr primask, r0 @ encoding: [0x80,0xf3,0x10,0x80]
-@ CHECK: msr basepri, r0 @ encoding: [0x80,0xf3,0x11,0x80]
-@ CHECK: msr basepri_max, r0 @ encoding: [0x80,0xf3,0x12,0x80]
-@ CHECK: msr faultmask, r0 @ encoding: [0x80,0xf3,0x13,0x80]
-@ CHECK: msr control, r0 @ encoding: [0x80,0xf3,0x14,0x80]
+@ CHECK: msr apsr, r0 @ encoding: [0x80,0xf3,0x00,0x88]
+@ CHECK: msr apsr, r0 @ encoding: [0x80,0xf3,0x00,0x88]
+@ CHECK: msr apsr_g, r0 @ encoding: [0x80,0xf3,0x00,0x84]
+@ CHECK: msr apsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x00,0x8c]
+@ CHECK: msr iapsr, r0 @ encoding: [0x80,0xf3,0x01,0x88]
+@ CHECK: msr iapsr, r0 @ encoding: [0x80,0xf3,0x01,0x88]
+@ CHECK: msr iapsr_g, r0 @ encoding: [0x80,0xf3,0x01,0x84]
+@ CHECK: msr iapsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x01,0x8c]
+@ CHECK: msr eapsr, r0 @ encoding: [0x80,0xf3,0x02,0x88]
+@ CHECK: msr eapsr, r0 @ encoding: [0x80,0xf3,0x02,0x88]
+@ CHECK: msr eapsr_g, r0 @ encoding: [0x80,0xf3,0x02,0x84]
+@ CHECK: msr eapsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x02,0x8c]
+@ CHECK: msr xpsr, r0 @ encoding: [0x80,0xf3,0x03,0x88]
+@ CHECK: msr xpsr, r0 @ encoding: [0x80,0xf3,0x03,0x88]
+@ CHECK: msr xpsr_g, r0 @ encoding: [0x80,0xf3,0x03,0x84]
+@ CHECK: msr xpsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x03,0x8c]
+@ CHECK: msr ipsr, r0 @ encoding: [0x80,0xf3,0x05,0x88]
+@ CHECK: msr epsr, r0 @ encoding: [0x80,0xf3,0x06,0x88]
+@ CHECK: msr iepsr, r0 @ encoding: [0x80,0xf3,0x07,0x88]
+@ CHECK: msr msp, r0 @ encoding: [0x80,0xf3,0x08,0x88]
+@ CHECK: msr psp, r0 @ encoding: [0x80,0xf3,0x09,0x88]
+@ CHECK: msr primask, r0 @ encoding: [0x80,0xf3,0x10,0x88]
+@ CHECK: msr basepri, r0 @ encoding: [0x80,0xf3,0x11,0x88]
+@ CHECK: msr basepri_max, r0 @ encoding: [0x80,0xf3,0x12,0x88]
+@ CHECK: msr faultmask, r0 @ encoding: [0x80,0xf3,0x13,0x88]
+@ CHECK: msr control, r0 @ encoding: [0x80,0xf3,0x14,0x88]
diff --git a/test/MC/ARM/thumb2-narrow-dp.ll b/test/MC/ARM/thumb2-narrow-dp.ll
new file mode 100644
index 0000000..ae2ba35
--- /dev/null
+++ b/test/MC/ARM/thumb2-narrow-dp.ll
@@ -0,0 +1,807 @@
+// RUN: llvm-mc -triple thumbv7 -show-encoding < %s | FileCheck %s
+
+// Test each of the Thumb1 data-processing instructions
+// The assembly syntax for these instructions allows an optional Rd register
+// OP{S}{<c>}{<q>} {<Rd>,} <Rn>, <Rm>
+// Assemblers should choose the narrow thumb encoding when possible, i.e.
+// - Rd == Rn
+// - Rd, Rn and Rm are < r8
+// In addition, some operations are commutative, allowing the transformation
+// when:
+// - Rd == Rn || Rd == Rm
+// - Rd, Rn and Rm are < r8
+
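The rules above, together with the per-case notes in the tests below (flag-setting outside an IT block, no shifted operand, no high registers), condense into a small predicate. The following C++ sketch is illustrative only, assuming plain integer register numbers; the function names are invented for this example and this is not LLVM's actual encoding-selection code:

    #include <iostream>

    // Illustrative sketch of the narrow-vs-wide selection rules exercised
    // below: all registers must be r0-r7, the destination must match a
    // source (either source when the operation commutes), and the narrow
    // forms set flags exactly when outside an IT block. Shifted-register
    // operands always force the wide encoding and are not modeled here.
    static bool isLowRegister(int R) { return R >= 0 && R <= 7; }

    static bool canUseNarrow(int Rd, int Rn, int Rm, bool Commutative,
                             bool SetsFlags, bool InITBlock) {
      // e.g. AND (no S) outside an IT block, or ANDSEQ inside one: wide.
      if (SetsFlags == InITBlock)
        return false;
      // Any use of r8-r15 requires the 32-bit encoding.
      if (!isLowRegister(Rd) || !isLowRegister(Rn) || !isLowRegister(Rm))
        return false;
      // Two-operand narrow form: Rd must equal Rn, or Rm when commutative.
      return Rd == Rn || (Commutative && Rd == Rm);
    }

    int main() {
      // ANDS r2, r1, r2 (outside IT): narrow - AND commutes and Rd == Rm.
      std::cout << canUseNarrow(2, 1, 2, true, true, false) << '\n';   // 1
      // LSLS r2, r1, r2 (outside IT): wide - shifts do not commute.
      std::cout << canUseNarrow(2, 1, 2, false, true, false) << '\n';  // 0
    }

Compiled standalone, the two checks mirror the ANDS/LSLS cases below: the commutative AND may swap operands to reach the two-operand narrow form, while the shift operations may not.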
+// AND (commutative)
+ ANDS r0, r2, r1 // Must be wide - 3 distinct registers
+ ANDS r2, r2, r1 // Should choose narrow
+ ANDS r2, r1, r2 // Should choose narrow - commutative
+ ANDS.W r0, r0, r1 // Explicitly wide
+ ANDS.W r3, r1, r3
+ AND r0, r1, r0 // Must use wide encoding as not flag-setting
+ ANDS r7, r7, r1 // Should use narrow
+ ANDS r7, r1, r7 // Commutative
+ ANDS r8, r1, r8 // high registers so must use wide encoding
+ ANDS r8, r8, r1
+ ANDS r0, r8, r0
+ ANDS r1, r1, r8
+ ANDS r2, r2, r1, lsl #1 // Must use wide - shifted register
+ ANDS r0, r1, r0, lsr #1
+// CHECK: ands.w r0, r2, r1 @ encoding: [0x12,0xea,0x01,0x00]
+// CHECK: ands r2, r1 @ encoding: [0x0a,0x40]
+// CHECK: ands r2, r1 @ encoding: [0x0a,0x40]
+// CHECK: ands.w r0, r0, r1 @ encoding: [0x10,0xea,0x01,0x00]
+// CHECK: ands.w r3, r1, r3 @ encoding: [0x11,0xea,0x03,0x03]
+// CHECK: and.w r0, r1, r0 @ encoding: [0x01,0xea,0x00,0x00]
+// CHECK: ands r7, r1 @ encoding: [0x0f,0x40]
+// CHECK: ands r7, r1 @ encoding: [0x0f,0x40]
+// CHECK: ands.w r8, r1, r8 @ encoding: [0x11,0xea,0x08,0x08]
+// CHECK: ands.w r8, r8, r1 @ encoding: [0x18,0xea,0x01,0x08]
+// CHECK: ands.w r0, r8, r0 @ encoding: [0x18,0xea,0x00,0x00]
+// CHECK: ands.w r1, r1, r8 @ encoding: [0x11,0xea,0x08,0x01]
+// CHECK: ands.w r2, r2, r1, lsl #1 @ encoding: [0x12,0xea,0x41,0x02]
+// CHECK: ands.w r0, r1, r0, lsr #1 @ encoding: [0x11,0xea,0x50,0x00]
+
+ IT EQ
+ ANDEQ r0, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ ANDEQ r3, r3, r1 // Should choose narrow
+ IT EQ
+ ANDEQ r3, r1, r3 // Should choose narrow - commutative
+ IT EQ
+ ANDEQ.W r0, r0, r1 // Explicitly wide
+ IT EQ
+ ANDEQ.W r2, r1, r2
+ IT EQ
+ ANDSEQ r0, r1, r0 // Must use wide encoding as flag-setting
+ IT EQ
+ ANDEQ r7, r7, r1 // Should use narrow
+ IT EQ
+ ANDEQ r7, r1, r7 // Commutative
+ IT EQ
+ ANDEQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ ANDEQ r8, r8, r1
+ IT EQ
+ ANDEQ r4, r8, r4
+ IT EQ
+ ANDEQ r4, r4, r8
+ IT EQ
+ ANDEQ r0, r0, r1, lsl #1 // Must use wide - shifted register
+ IT EQ
+ ANDEQ r5, r1, r5, lsr #1
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r0, r2, r1 @ encoding: [0x02,0xea,0x01,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq r3, r1 @ encoding: [0x0b,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq r3, r1 @ encoding: [0x0b,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r0, r0, r1 @ encoding: [0x00,0xea,0x01,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r2, r1, r2 @ encoding: [0x01,0xea,0x02,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andseq.w r0, r1, r0 @ encoding: [0x11,0xea,0x00,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq r7, r1 @ encoding: [0x0f,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq r7, r1 @ encoding: [0x0f,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r8, r1, r8 @ encoding: [0x01,0xea,0x08,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r8, r8, r1 @ encoding: [0x08,0xea,0x01,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r4, r8, r4 @ encoding: [0x08,0xea,0x04,0x04]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r4, r4, r8 @ encoding: [0x04,0xea,0x08,0x04]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r0, r0, r1, lsl #1 @ encoding: [0x00,0xea,0x41,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: andeq.w r5, r1, r5, lsr #1 @ encoding: [0x01,0xea,0x55,0x05]
+
+// EOR (commutative)
+ EORS r0, r2, r1 // Must be wide - 3 distinct registers
+ EORS r5, r5, r1 // Should choose narrow
+ EORS r5, r1, r5 // Should choose narrow - commutative
+ EORS.W r0, r0, r1 // Explicitly wide
+ EORS.W r2, r1, r2
+ EOR r1, r1, r1 // Must use wide encoding as not flag-setting
+ EORS r7, r7, r1 // Should use narrow
+ EORS r7, r1, r7 // Commutative
+ EORS r8, r1, r8 // high registers so must use wide encoding
+ EORS r8, r8, r1
+ EORS r6, r8, r6
+ EORS r0, r0, r8
+ EORS r2, r2, r1, lsl #1 // Must use wide - shifted register
+ EORS r0, r1, r0, lsr #1
+// CHECK: eors.w r0, r2, r1 @ encoding: [0x92,0xea,0x01,0x00]
+// CHECK: eors r5, r1 @ encoding: [0x4d,0x40]
+// CHECK: eors r5, r1 @ encoding: [0x4d,0x40]
+// CHECK: eors.w r0, r0, r1 @ encoding: [0x90,0xea,0x01,0x00]
+// CHECK: eors.w r2, r1, r2 @ encoding: [0x91,0xea,0x02,0x02]
+// CHECK: eor.w r1, r1, r1 @ encoding: [0x81,0xea,0x01,0x01]
+// CHECK: eors r7, r1 @ encoding: [0x4f,0x40]
+// CHECK: eors r7, r1 @ encoding: [0x4f,0x40]
+// CHECK: eors.w r8, r1, r8 @ encoding: [0x91,0xea,0x08,0x08]
+// CHECK: eors.w r8, r8, r1 @ encoding: [0x98,0xea,0x01,0x08]
+// CHECK: eors.w r6, r8, r6 @ encoding: [0x98,0xea,0x06,0x06]
+// CHECK: eors.w r0, r0, r8 @ encoding: [0x90,0xea,0x08,0x00]
+// CHECK: eors.w r2, r2, r1, lsl #1 @ encoding: [0x92,0xea,0x41,0x02]
+// CHECK: eors.w r0, r1, r0, lsr #1 @ encoding: [0x91,0xea,0x50,0x00]
+
+ IT EQ
+ EOREQ r3, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ EOREQ r0, r0, r1 // Should choose narrow
+ IT EQ
+ EOREQ r2, r1, r2 // Should choose narrow - commutative
+ IT EQ
+ EOREQ.W r3, r3, r1 // Explicitly wide
+ IT EQ
+ EOREQ.W r0, r1, r0
+ IT EQ
+ EORSEQ r1, r1, r1 // Must use wide encoding as flag-setting
+ IT EQ
+ EOREQ r7, r7, r1 // Should use narrow
+ IT EQ
+ EOREQ r7, r1, r7 // Commutative
+ IT EQ
+ EOREQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ EOREQ r8, r8, r1
+ IT EQ
+ EOREQ r0, r8, r0
+ IT EQ
+ EOREQ r3, r3, r8
+ IT EQ
+ EOREQ r4, r4, r1, lsl #1 // Must use wide - shifted register
+ IT EQ
+ EOREQ r0, r1, r0, lsr #1
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r3, r2, r1 @ encoding: [0x82,0xea,0x01,0x03]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq r0, r1 @ encoding: [0x48,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq r2, r1 @ encoding: [0x4a,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r3, r3, r1 @ encoding: [0x83,0xea,0x01,0x03]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r0, r1, r0 @ encoding: [0x81,0xea,0x00,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eorseq.w r1, r1, r1 @ encoding: [0x91,0xea,0x01,0x01]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq r7, r1 @ encoding: [0x4f,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq r7, r1 @ encoding: [0x4f,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r8, r1, r8 @ encoding: [0x81,0xea,0x08,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r8, r8, r1 @ encoding: [0x88,0xea,0x01,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r0, r8, r0 @ encoding: [0x88,0xea,0x00,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r3, r3, r8 @ encoding: [0x83,0xea,0x08,0x03]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r4, r4, r1, lsl #1 @ encoding: [0x84,0xea,0x41,0x04]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: eoreq.w r0, r1, r0, lsr #1 @ encoding: [0x81,0xea,0x50,0x00]
+
+// LSL
+ LSLS r0, r2, r1 // Must be wide - 3 distinct registers
+ LSLS r2, r2, r1 // Should choose narrow
+ LSLS r2, r1, r2 // Should choose wide - not commutative
+ LSLS.W r0, r0, r1 // Explicitly wide
+ LSLS.W r4, r1, r4
+ LSL r4, r1, r4 // Must use wide encoding as not flag-setting
+ LSLS r7, r7, r1 // Should use narrow
+ LSLS r8, r1, r8 // high registers so must use wide encoding
+ LSLS r8, r8, r1
+ LSLS r3, r8, r3
+ LSLS r5, r5, r8
+// CHECK: lsls.w r0, r2, r1 @ encoding: [0x12,0xfa,0x01,0xf0]
+// CHECK: lsls r2, r1 @ encoding: [0x8a,0x40]
+// CHECK: lsls.w r2, r1, r2 @ encoding: [0x11,0xfa,0x02,0xf2]
+// CHECK: lsls.w r0, r0, r1 @ encoding: [0x10,0xfa,0x01,0xf0]
+// CHECK: lsls.w r4, r1, r4 @ encoding: [0x11,0xfa,0x04,0xf4]
+// CHECK: lsl.w r4, r1, r4 @ encoding: [0x01,0xfa,0x04,0xf4]
+// CHECK: lsls r7, r1 @ encoding: [0x8f,0x40]
+// CHECK: lsls.w r8, r1, r8 @ encoding: [0x11,0xfa,0x08,0xf8]
+// CHECK: lsls.w r8, r8, r1 @ encoding: [0x18,0xfa,0x01,0xf8]
+// CHECK: lsls.w r3, r8, r3 @ encoding: [0x18,0xfa,0x03,0xf3]
+// CHECK: lsls.w r5, r5, r8 @ encoding: [0x15,0xfa,0x08,0xf5]
+
+ IT EQ
+ LSLEQ r0, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ LSLEQ r2, r2, r1 // Should choose narrow
+ IT EQ
+ LSLEQ r2, r1, r2 // Should choose wide - not commutative
+ IT EQ
+ LSLEQ.W r0, r0, r1 // Explicitly wide
+ IT EQ
+ LSLEQ.W r3, r1, r3
+ IT EQ
+ LSLSEQ r4, r1, r4 // Must use wide encoding as flag-setting
+ IT EQ
+ LSLEQ r7, r7, r1 // Should use narrow
+ IT EQ
+ LSLEQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ LSLEQ r8, r8, r1
+ IT EQ
+ LSLEQ r0, r8, r0
+ IT EQ
+ LSLEQ r3, r3, r8
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r0, r2, r1 @ encoding: [0x02,0xfa,0x01,0xf0]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq r2, r1 @ encoding: [0x8a,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r2, r1, r2 @ encoding: [0x01,0xfa,0x02,0xf2]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r0, r0, r1 @ encoding: [0x00,0xfa,0x01,0xf0]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r3, r1, r3 @ encoding: [0x01,0xfa,0x03,0xf3]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lslseq.w r4, r1, r4 @ encoding: [0x11,0xfa,0x04,0xf4]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq r7, r1 @ encoding: [0x8f,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r8, r1, r8 @ encoding: [0x01,0xfa,0x08,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r8, r8, r1 @ encoding: [0x08,0xfa,0x01,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r0, r8, r0 @ encoding: [0x08,0xfa,0x00,0xf0]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsleq.w r3, r3, r8 @ encoding: [0x03,0xfa,0x08,0xf3]
+
+// LSR
+ LSRS r6, r2, r1 // Must be wide - 3 distinct registers
+ LSRS r2, r2, r1 // Should choose narrow
+ LSRS r2, r1, r2 // Should choose wide - not commutative
+ LSRS.W r2, r2, r1 // Explicitly wide
+ LSRS.W r3, r1, r3
+ LSR r4, r1, r4 // Must use wide encoding as not flag-setting
+ LSRS r7, r7, r1 // Should use narrow
+ LSRS r8, r1, r8 // high registers so must use wide encoding
+ LSRS r8, r8, r1
+ LSRS r2, r8, r2
+ LSRS r5, r5, r8
+// CHECK: lsrs.w r6, r2, r1 @ encoding: [0x32,0xfa,0x01,0xf6]
+// CHECK: lsrs r2, r1 @ encoding: [0xca,0x40]
+// CHECK: lsrs.w r2, r1, r2 @ encoding: [0x31,0xfa,0x02,0xf2]
+// CHECK: lsrs.w r2, r2, r1 @ encoding: [0x32,0xfa,0x01,0xf2]
+// CHECK: lsrs.w r3, r1, r3 @ encoding: [0x31,0xfa,0x03,0xf3]
+// CHECK: lsr.w r4, r1, r4 @ encoding: [0x21,0xfa,0x04,0xf4]
+// CHECK: lsrs r7, r1 @ encoding: [0xcf,0x40]
+// CHECK: lsrs.w r8, r1, r8 @ encoding: [0x31,0xfa,0x08,0xf8]
+// CHECK: lsrs.w r8, r8, r1 @ encoding: [0x38,0xfa,0x01,0xf8]
+// CHECK: lsrs.w r2, r8, r2 @ encoding: [0x38,0xfa,0x02,0xf2]
+// CHECK: lsrs.w r5, r5, r8 @ encoding: [0x35,0xfa,0x08,0xf5]
+
+ IT EQ
+ LSREQ r6, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ LSREQ r7, r7, r1 // Should choose narrow
+ IT EQ
+ LSREQ r7, r1, r7 // Should choose wide - not commutative
+ IT EQ
+ LSREQ.W r7, r7, r1 // Explicitly wide
+ IT EQ
+ LSREQ.W r2, r1, r2
+ IT EQ
+ LSRSEQ r0, r1, r0 // Must use wide encoding as flag-setting
+ IT EQ
+ LSREQ r7, r7, r1 // Should use narrow
+ IT EQ
+ LSREQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ LSREQ r8, r8, r1
+ IT EQ
+ LSREQ r1, r8, r1
+ IT EQ
+ LSREQ r4, r4, r8
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r6, r2, r1 @ encoding: [0x22,0xfa,0x01,0xf6]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq r7, r1 @ encoding: [0xcf,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r7, r1, r7 @ encoding: [0x21,0xfa,0x07,0xf7]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r7, r7, r1 @ encoding: [0x27,0xfa,0x01,0xf7]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r2, r1, r2 @ encoding: [0x21,0xfa,0x02,0xf2]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsrseq.w r0, r1, r0 @ encoding: [0x31,0xfa,0x00,0xf0]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq r7, r1 @ encoding: [0xcf,0x40]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r8, r1, r8 @ encoding: [0x21,0xfa,0x08,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r8, r8, r1 @ encoding: [0x28,0xfa,0x01,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r1, r8, r1 @ encoding: [0x28,0xfa,0x01,0xf1]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: lsreq.w r4, r4, r8 @ encoding: [0x24,0xfa,0x08,0xf4]
+
+// ASR
+ ASRS r7, r6, r5 // Must be wide - 3 distinct registers
+ ASRS r0, r0, r1 // Should choose narrow
+ ASRS r0, r1, r0 // Should choose wide - not commutative
+ ASRS.W r3, r3, r1 // Explicitly wide
+ ASRS.W r1, r1, r1
+ ASR r0, r1, r0 // Must use wide encoding as not flag-setting
+ ASRS r7, r7, r1 // Should use narrow
+ ASRS r8, r1, r8 // high registers so must use wide encoding
+ ASRS r8, r8, r1
+ ASRS r5, r8, r5
+ ASRS r5, r5, r8
+// CHECK: asrs.w r7, r6, r5 @ encoding: [0x56,0xfa,0x05,0xf7]
+// CHECK: asrs r0, r1 @ encoding: [0x08,0x41]
+// CHECK: asrs.w r0, r1, r0 @ encoding: [0x51,0xfa,0x00,0xf0]
+// CHECK: asrs.w r3, r3, r1 @ encoding: [0x53,0xfa,0x01,0xf3]
+// CHECK: asrs.w r1, r1, r1 @ encoding: [0x51,0xfa,0x01,0xf1]
+// CHECK: asr.w r0, r1, r0 @ encoding: [0x41,0xfa,0x00,0xf0]
+// CHECK: asrs r7, r1 @ encoding: [0x0f,0x41]
+// CHECK: asrs.w r8, r1, r8 @ encoding: [0x51,0xfa,0x08,0xf8]
+// CHECK: asrs.w r8, r8, r1 @ encoding: [0x58,0xfa,0x01,0xf8]
+// CHECK: asrs.w r5, r8, r5 @ encoding: [0x58,0xfa,0x05,0xf5]
+// CHECK: asrs.w r5, r5, r8 @ encoding: [0x55,0xfa,0x08,0xf5]
+
+ IT EQ
+ ASREQ r0, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ ASREQ r2, r2, r1 // Should choose narrow
+ IT EQ
+ ASREQ r1, r2, r1 // Should choose wide - not commutative
+ IT EQ
+ ASREQ.W r4, r4, r1 // Explicitly wide
+ IT EQ
+ ASREQ.W r6, r1, r6
+ IT EQ
+ ASRSEQ r3, r1, r3 // Must use wide encoding as flag-setting
+ IT EQ
+ ASREQ r7, r7, r1 // Should use narrow
+ IT EQ
+ ASREQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ ASREQ r8, r8, r1
+ IT EQ
+ ASREQ r1, r8, r1
+ IT EQ
+ ASREQ r3, r3, r8
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r0, r2, r1 @ encoding: [0x42,0xfa,0x01,0xf0]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq r2, r1 @ encoding: [0x0a,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r1, r2, r1 @ encoding: [0x42,0xfa,0x01,0xf1]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r4, r4, r1 @ encoding: [0x44,0xfa,0x01,0xf4]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r6, r1, r6 @ encoding: [0x41,0xfa,0x06,0xf6]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asrseq.w r3, r1, r3 @ encoding: [0x51,0xfa,0x03,0xf3]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq r7, r1 @ encoding: [0x0f,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r8, r1, r8 @ encoding: [0x41,0xfa,0x08,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r8, r8, r1 @ encoding: [0x48,0xfa,0x01,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r1, r8, r1 @ encoding: [0x48,0xfa,0x01,0xf1]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: asreq.w r3, r3, r8 @ encoding: [0x43,0xfa,0x08,0xf3]
+
+// ADC (commutative)
+ ADCS r5, r2, r1 // Must be wide - 3 distinct registers
+ ADCS r5, r5, r1 // Should choose narrow
+ ADCS r3, r1, r3 // Should choose narrow - commutative
+ ADCS.W r2, r2, r1 // Explicitly wide
+ ADCS.W r3, r1, r3
+ ADC r0, r1, r0 // Must use wide encoding as not flag-setting
+ ADCS r7, r7, r1 // Should use narrow
+ ADCS r7, r1, r7 // Commutative
+ ADCS r8, r1, r8 // high registers so must use wide encoding
+ ADCS r8, r8, r1
+ ADCS r5, r8, r5
+ ADCS r2, r2, r8
+ ADCS r3, r3, r1, lsl #1 // Must use wide - shifted register
+ ADCS r4, r1, r4, lsr #1
+// CHECK: adcs.w r5, r2, r1 @ encoding: [0x52,0xeb,0x01,0x05]
+// CHECK: adcs r5, r1 @ encoding: [0x4d,0x41]
+// CHECK: adcs r3, r1 @ encoding: [0x4b,0x41]
+// CHECK: adcs.w r2, r2, r1 @ encoding: [0x52,0xeb,0x01,0x02]
+// CHECK: adcs.w r3, r1, r3 @ encoding: [0x51,0xeb,0x03,0x03]
+// CHECK: adc.w r0, r1, r0 @ encoding: [0x41,0xeb,0x00,0x00]
+// CHECK: adcs r7, r1 @ encoding: [0x4f,0x41]
+// CHECK: adcs r7, r1 @ encoding: [0x4f,0x41]
+// CHECK: adcs.w r8, r1, r8 @ encoding: [0x51,0xeb,0x08,0x08]
+// CHECK: adcs.w r8, r8, r1 @ encoding: [0x58,0xeb,0x01,0x08]
+// CHECK: adcs.w r5, r8, r5 @ encoding: [0x58,0xeb,0x05,0x05]
+// CHECK: adcs.w r2, r2, r8 @ encoding: [0x52,0xeb,0x08,0x02]
+// CHECK: adcs.w r3, r3, r1, lsl #1 @ encoding: [0x53,0xeb,0x41,0x03]
+// CHECK: adcs.w r4, r1, r4, lsr #1 @ encoding: [0x51,0xeb,0x54,0x04]
+
+ IT EQ
+ ADCEQ r1, r2, r3 // Must be wide - 3 distinct registers
+ IT EQ
+ ADCEQ r1, r1, r1 // Should choose narrow
+ IT EQ
+ ADCEQ r3, r1, r3 // Should choose narrow - commutative
+ IT EQ
+ ADCEQ.W r3, r3, r1 // Explicitly wide
+ IT EQ
+ ADCEQ.W r0, r1, r0
+ IT EQ
+ ADCSEQ r3, r1, r3 // Must use wide encoding as flag-setting
+ IT EQ
+ ADCEQ r7, r7, r1 // Should use narrow
+ IT EQ
+ ADCEQ r7, r1, r7 // Commutative
+ IT EQ
+ ADCEQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ ADCEQ r8, r8, r1
+ IT EQ
+ ADCEQ r3, r8, r3
+ IT EQ
+ ADCEQ r1, r1, r8
+ IT EQ
+ ADCEQ r2, r2, r1, lsl #1 // Must use wide - shifted register
+ IT EQ
+ ADCEQ r1, r1, r1, lsr #1
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r1, r2, r3 @ encoding: [0x42,0xeb,0x03,0x01]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq r1, r1 @ encoding: [0x49,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq r3, r1 @ encoding: [0x4b,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r3, r3, r1 @ encoding: [0x43,0xeb,0x01,0x03]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r0, r1, r0 @ encoding: [0x41,0xeb,0x00,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adcseq.w r3, r1, r3 @ encoding: [0x51,0xeb,0x03,0x03]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq r7, r1 @ encoding: [0x4f,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq r7, r1 @ encoding: [0x4f,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r8, r1, r8 @ encoding: [0x41,0xeb,0x08,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r8, r8, r1 @ encoding: [0x48,0xeb,0x01,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r3, r8, r3 @ encoding: [0x48,0xeb,0x03,0x03]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r1, r1, r8 @ encoding: [0x41,0xeb,0x08,0x01]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r2, r2, r1, lsl #1 @ encoding: [0x42,0xeb,0x41,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: adceq.w r1, r1, r1, lsr #1 @ encoding: [0x41,0xeb,0x51,0x01]
+
+// SBC
+ SBCS r3, r2, r1 // Must be wide - 3 distinct registers
+ SBCS r4, r4, r1 // Should choose narrow
+ SBCS r1, r4, r1 // Should choose wide - not commutative
+ SBCS.W r4, r4, r1 // Explicitly wide
+ SBCS.W r2, r1, r2
+ SBC r0, r1, r0 // Must use wide encoding as not flag-setting
+ SBCS r7, r7, r1 // Should use narrow
+ SBCS r8, r1, r8 // high registers so must use wide encoding
+ SBCS r8, r8, r1
+ SBCS r4, r8, r4
+ SBCS r3, r3, r8
+ SBCS r2, r2, r1, lsl #1 // Must use wide - shifted register
+ SBCS r5, r1, r5, lsr #1
+// CHECK: sbcs.w r3, r2, r1 @ encoding: [0x72,0xeb,0x01,0x03]
+// CHECK: sbcs r4, r1 @ encoding: [0x8c,0x41]
+// CHECK: sbcs.w r1, r4, r1 @ encoding: [0x74,0xeb,0x01,0x01]
+// CHECK: sbcs.w r4, r4, r1 @ encoding: [0x74,0xeb,0x01,0x04]
+// CHECK: sbcs.w r2, r1, r2 @ encoding: [0x71,0xeb,0x02,0x02]
+// CHECK: sbc.w r0, r1, r0 @ encoding: [0x61,0xeb,0x00,0x00]
+// CHECK: sbcs r7, r1 @ encoding: [0x8f,0x41]
+// CHECK: sbcs.w r8, r1, r8 @ encoding: [0x71,0xeb,0x08,0x08]
+// CHECK: sbcs.w r8, r8, r1 @ encoding: [0x78,0xeb,0x01,0x08]
+// CHECK: sbcs.w r4, r8, r4 @ encoding: [0x78,0xeb,0x04,0x04]
+// CHECK: sbcs.w r3, r3, r8 @ encoding: [0x73,0xeb,0x08,0x03]
+// CHECK: sbcs.w r2, r2, r1, lsl #1 @ encoding: [0x72,0xeb,0x41,0x02]
+// CHECK: sbcs.w r5, r1, r5, lsr #1 @ encoding: [0x71,0xeb,0x55,0x05]
+
+ IT EQ
+ SBCEQ r5, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ SBCEQ r5, r5, r1 // Should choose narrow
+ IT EQ
+ SBCEQ r1, r5, r1 // Should choose wide - not commutative
+ IT EQ
+ SBCEQ.W r5, r5, r1 // Explicitly wide
+ IT EQ
+ SBCEQ.W r0, r1, r0
+ IT EQ
+ SBCSEQ r2, r1, r2 // Must use wide encoding as flag-setting
+ IT EQ
+ SBCEQ r7, r7, r1 // Should use narrow
+ IT EQ
+ SBCEQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ SBCEQ r8, r8, r1
+ IT EQ
+ SBCEQ r7, r8, r7
+ IT EQ
+ SBCEQ r7, r7, r8
+ IT EQ
+ SBCEQ r2, r2, r1, lsl #1 // Must use wide - shifted register
+ IT EQ
+ SBCEQ r5, r1, r5, lsr #1
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r5, r2, r1 @ encoding: [0x62,0xeb,0x01,0x05]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq r5, r1 @ encoding: [0x8d,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r1, r5, r1 @ encoding: [0x65,0xeb,0x01,0x01]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r5, r5, r1 @ encoding: [0x65,0xeb,0x01,0x05]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r0, r1, r0 @ encoding: [0x61,0xeb,0x00,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbcseq.w r2, r1, r2 @ encoding: [0x71,0xeb,0x02,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq r7, r1 @ encoding: [0x8f,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r8, r1, r8 @ encoding: [0x61,0xeb,0x08,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r8, r8, r1 @ encoding: [0x68,0xeb,0x01,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r7, r8, r7 @ encoding: [0x68,0xeb,0x07,0x07]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r7, r7, r8 @ encoding: [0x67,0xeb,0x08,0x07]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r2, r2, r1, lsl #1 @ encoding: [0x62,0xeb,0x41,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: sbceq.w r5, r1, r5, lsr #1 @ encoding: [0x61,0xeb,0x55,0x05]
+
+// ROR
+ RORS r3, r2, r1 // Must be wide - 3 distinct registers
+ RORS r0, r0, r1 // Should choose narrow
+ RORS r1, r0, r1 // Should choose wide - not commutative
+ RORS.W r2, r2, r1 // Explicitly wide
+ RORS.W r2, r1, r2
+ ROR r5, r1, r5 // Must use wide encoding as not flag-setting
+ RORS r7, r7, r1 // Should use narrow
+ RORS r8, r1, r8 // high registers so must use wide encoding
+ RORS r8, r8, r1
+ RORS r6, r8, r6
+ RORS r6, r6, r8
+// CHECK: rors.w r3, r2, r1 @ encoding: [0x72,0xfa,0x01,0xf3]
+// CHECK: rors r0, r1 @ encoding: [0xc8,0x41]
+// CHECK: rors.w r1, r0, r1 @ encoding: [0x70,0xfa,0x01,0xf1]
+// CHECK: rors.w r2, r2, r1 @ encoding: [0x72,0xfa,0x01,0xf2]
+// CHECK: rors.w r2, r1, r2 @ encoding: [0x71,0xfa,0x02,0xf2]
+// CHECK: ror.w r5, r1, r5 @ encoding: [0x61,0xfa,0x05,0xf5]
+// CHECK: rors r7, r1 @ encoding: [0xcf,0x41]
+// CHECK: rors.w r8, r1, r8 @ encoding: [0x71,0xfa,0x08,0xf8]
+// CHECK: rors.w r8, r8, r1 @ encoding: [0x78,0xfa,0x01,0xf8]
+// CHECK: rors.w r6, r8, r6 @ encoding: [0x78,0xfa,0x06,0xf6]
+// CHECK: rors.w r6, r6, r8 @ encoding: [0x76,0xfa,0x08,0xf6]
+
+ IT EQ
+ ROREQ r4, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ ROREQ r4, r4, r1 // Should choose narrow
+ IT EQ
+ ROREQ r1, r4, r1 // Should choose wide - not commutative
+ IT EQ
+ ROREQ.W r4, r4, r1 // Explicitly wide
+ IT EQ
+ ROREQ.W r0, r1, r0
+ IT EQ
+ RORSEQ r0, r1, r0 // Must use wide encoding as flag-setting
+ IT EQ
+ ROREQ r7, r7, r1 // Should use narrow
+ IT EQ
+ ROREQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ ROREQ r8, r8, r1
+ IT EQ
+ ROREQ r3, r8, r3
+ IT EQ
+ ROREQ r1, r1, r8
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r4, r2, r1 @ encoding: [0x62,0xfa,0x01,0xf4]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq r4, r1 @ encoding: [0xcc,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r1, r4, r1 @ encoding: [0x64,0xfa,0x01,0xf1]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r4, r4, r1 @ encoding: [0x64,0xfa,0x01,0xf4]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r0, r1, r0 @ encoding: [0x61,0xfa,0x00,0xf0]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: rorseq.w r0, r1, r0 @ encoding: [0x71,0xfa,0x00,0xf0]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq r7, r1 @ encoding: [0xcf,0x41]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r8, r1, r8 @ encoding: [0x61,0xfa,0x08,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r8, r8, r1 @ encoding: [0x68,0xfa,0x01,0xf8]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r3, r8, r3 @ encoding: [0x68,0xfa,0x03,0xf3]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: roreq.w r1, r1, r8 @ encoding: [0x61,0xfa,0x08,0xf1]
+
+// TST - only two register version available
+// RSB - only two register version available
+// CMP - only two register version available
+// CMN - only two register version available
+
+// ORR (commutative)
+ ORRS r7, r2, r1 // Must be wide - 3 distinct registers
+ ORRS r2, r2, r1 // Should choose narrow
+ ORRS r3, r1, r3 // Should choose narrow - commutative
+ ORRS.W r4, r4, r1 // Explicitly wide
+ ORRS.W r5, r1, r5
+ ORR r2, r1, r2 // Must use wide encoding as not flag-setting
+ ORRS r7, r7, r1 // Should use narrow
+ ORRS r7, r1, r7 // Commutative
+ ORRS r8, r1, r8 // high registers so must use wide encoding
+ ORRS r8, r8, r1
+ ORRS r1, r8, r1
+ ORRS r0, r0, r8
+ ORRS r1, r1, r1, lsl #1 // Must use wide - shifted register
+ ORRS r0, r1, r0, lsr #1
+// CHECK: orrs.w r7, r2, r1 @ encoding: [0x52,0xea,0x01,0x07]
+// CHECK: orrs r2, r1 @ encoding: [0x0a,0x43]
+// CHECK: orrs r3, r1 @ encoding: [0x0b,0x43]
+// CHECK: orrs.w r4, r4, r1 @ encoding: [0x54,0xea,0x01,0x04]
+// CHECK: orrs.w r5, r1, r5 @ encoding: [0x51,0xea,0x05,0x05]
+// CHECK: orr.w r2, r1, r2 @ encoding: [0x41,0xea,0x02,0x02]
+// CHECK: orrs r7, r1 @ encoding: [0x0f,0x43]
+// CHECK: orrs r7, r1 @ encoding: [0x0f,0x43]
+// CHECK: orrs.w r8, r1, r8 @ encoding: [0x51,0xea,0x08,0x08]
+// CHECK: orrs.w r8, r8, r1 @ encoding: [0x58,0xea,0x01,0x08]
+// CHECK: orrs.w r1, r8, r1 @ encoding: [0x58,0xea,0x01,0x01]
+// CHECK: orrs.w r0, r0, r8 @ encoding: [0x50,0xea,0x08,0x00]
+// CHECK: orrs.w r1, r1, r1, lsl #1 @ encoding: [0x51,0xea,0x41,0x01]
+// CHECK: orrs.w r0, r1, r0, lsr #1 @ encoding: [0x51,0xea,0x50,0x00]
+
+ IT EQ
+ ORREQ r0, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ ORREQ r5, r5, r1 // Should choose narrow
+ IT EQ
+ ORREQ r5, r1, r5 // Should choose narrow - commutative
+ IT EQ
+ ORREQ.W r2, r2, r1 // Explicitly wide
+ IT EQ
+ ORREQ.W r3, r1, r3
+ IT EQ
+ ORRSEQ r4, r1, r4 // Must use wide encoding as flag-setting
+ IT EQ
+ ORREQ r7, r7, r1 // Should use narrow
+ IT EQ
+ ORREQ r7, r1, r7 // Commutative
+ IT EQ
+ ORREQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ ORREQ r8, r8, r1
+ IT EQ
+ ORREQ r0, r8, r0
+ IT EQ
+ ORREQ r0, r0, r8
+ IT EQ
+ ORREQ r2, r2, r1, lsl #1 // Must use wide - shifted register
+ IT EQ
+ ORREQ r2, r1, r2, lsr #1
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r0, r2, r1 @ encoding: [0x42,0xea,0x01,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq r5, r1 @ encoding: [0x0d,0x43]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq r5, r1 @ encoding: [0x0d,0x43]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r2, r2, r1 @ encoding: [0x42,0xea,0x01,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r3, r1, r3 @ encoding: [0x41,0xea,0x03,0x03]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orrseq.w r4, r1, r4 @ encoding: [0x51,0xea,0x04,0x04]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq r7, r1 @ encoding: [0x0f,0x43]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq r7, r1 @ encoding: [0x0f,0x43]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r8, r1, r8 @ encoding: [0x41,0xea,0x08,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r8, r8, r1 @ encoding: [0x48,0xea,0x01,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r0, r8, r0 @ encoding: [0x48,0xea,0x00,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r0, r0, r8 @ encoding: [0x40,0xea,0x08,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r2, r2, r1, lsl #1 @ encoding: [0x42,0xea,0x41,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: orreq.w r2, r1, r2, lsr #1 @ encoding: [0x41,0xea,0x52,0x02]
+
+// MUL - not affected by this change
+
+// BIC
+ BICS r3, r2, r1 // Must be wide - 3 distinct registers
+ BICS r2, r2, r1 // Should choose narrow
+ BICS r1, r2, r1 // Should choose wide - not commutative
+ BICS.W r2, r2, r1 // Explicitly wide
+ BICS.W r0, r1, r0
+ BIC r0, r1, r0 // Must use wide encoding as not flag-setting
+ BICS r7, r7, r1 // Should use narrow
+ BICS r8, r1, r8 // high registers so must use wide encoding
+ BICS r8, r8, r1
+ BICS r7, r8, r7
+ BICS r5, r5, r8
+ BICS r3, r3, r1, lsl #1 // Must use wide - shifted register
+ BICS r4, r1, r4, lsr #1
+// CHECK: bics.w r3, r2, r1 @ encoding: [0x32,0xea,0x01,0x03]
+// CHECK: bics r2, r1 @ encoding: [0x8a,0x43]
+// CHECK: bics.w r1, r2, r1 @ encoding: [0x32,0xea,0x01,0x01]
+// CHECK: bics.w r2, r2, r1 @ encoding: [0x32,0xea,0x01,0x02]
+// CHECK: bics.w r0, r1, r0 @ encoding: [0x31,0xea,0x00,0x00]
+// CHECK: bic.w r0, r1, r0 @ encoding: [0x21,0xea,0x00,0x00]
+// CHECK: bics r7, r1 @ encoding: [0x8f,0x43]
+// CHECK: bics.w r8, r1, r8 @ encoding: [0x31,0xea,0x08,0x08]
+// CHECK: bics.w r8, r8, r1 @ encoding: [0x38,0xea,0x01,0x08]
+// CHECK: bics.w r7, r8, r7 @ encoding: [0x38,0xea,0x07,0x07]
+// CHECK: bics.w r5, r5, r8 @ encoding: [0x35,0xea,0x08,0x05]
+// CHECK: bics.w r3, r3, r1, lsl #1 @ encoding: [0x33,0xea,0x41,0x03]
+// CHECK: bics.w r4, r1, r4, lsr #1 @ encoding: [0x31,0xea,0x54,0x04]
+
+ IT EQ
+ BICEQ r0, r2, r1 // Must be wide - 3 distinct registers
+ IT EQ
+ BICEQ r5, r5, r1 // Should choose narrow
+ IT EQ
+ BICEQ r1, r5, r1 // Should choose wide - not commutative
+ IT EQ
+ BICEQ.W r4, r4, r1 // Explicitly wide
+ IT EQ
+ BICEQ.W r2, r1, r2
+ IT EQ
+ BICSEQ r5, r1, r5 // Must use wide encoding as flag-setting
+ IT EQ
+ BICEQ r7, r7, r1 // Should use narrow
+ IT EQ
+ BICEQ r8, r1, r8 // high registers so must use wide encoding
+ IT EQ
+ BICEQ r8, r8, r1
+ IT EQ
+ BICEQ r0, r8, r0
+ IT EQ
+ BICEQ r2, r2, r8
+ IT EQ
+ BICEQ r4, r4, r1, lsl #1 // Must use wide - shifted register
+ IT EQ
+ BICEQ r5, r1, r5, lsr #1
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r0, r2, r1 @ encoding: [0x22,0xea,0x01,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq r5, r1 @ encoding: [0x8d,0x43]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r1, r5, r1 @ encoding: [0x25,0xea,0x01,0x01]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r4, r4, r1 @ encoding: [0x24,0xea,0x01,0x04]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r2, r1, r2 @ encoding: [0x21,0xea,0x02,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: bicseq.w r5, r1, r5 @ encoding: [0x31,0xea,0x05,0x05]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq r7, r1 @ encoding: [0x8f,0x43]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r8, r1, r8 @ encoding: [0x21,0xea,0x08,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r8, r8, r1 @ encoding: [0x28,0xea,0x01,0x08]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r0, r8, r0 @ encoding: [0x28,0xea,0x00,0x00]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r2, r2, r8 @ encoding: [0x22,0xea,0x08,0x02]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r4, r4, r1, lsl #1 @ encoding: [0x24,0xea,0x41,0x04]
+// CHECK: it eq @ encoding: [0x08,0xbf]
+// CHECK: biceq.w r5, r1, r5, lsr #1 @ encoding: [0x21,0xea,0x55,0x05]
+
+// CMN - only two register version available
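
Taken together, the cases above pin down when the assembler may relax a three-operand Thumb-2 data-processing instruction to its 16-bit encoding. A minimal restatement in the same test idiom; the ANDS lines below are illustrative examples, not part of the patch:

// Narrow (16-bit) encoding is selected only when all of these hold:
//   - two-address form: Rd == Rn (or Rd == Rm for commutative ops)
//   - both registers low (r0-r7) and no shifted-register operand
//   - flag-setting outside an IT block, non-flag-setting inside one
//   - a .W suffix always forces the 32-bit encoding
        ANDS  r1, r1, r2        // narrow: ands r1, r2
        ANDS  r1, r2, r1        // narrow: AND is commutative
        ANDS  r1, r2, r3        // wide: three distinct registers
        ANDS  r1, r1, r8        // wide: high register
        IT EQ
        ANDEQ r1, r1, r2        // narrow inside IT: not flag-setting
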
diff --git a/test/MC/ARM/vfp4.s b/test/MC/ARM/vfp4.s
index cc87a38..0a1fe92 100644
--- a/test/MC/ARM/vfp4.s
+++ b/test/MC/ARM/vfp4.s
@@ -1,5 +1,6 @@
@ RUN: llvm-mc < %s -triple armv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=ARM
@ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=THUMB
+@ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mcpu=cortex-m4 | FileCheck %s --check-prefix=THUMB_V7EM
@ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee]
@ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b]
@@ -7,6 +8,7 @@ vfma.f64 d16, d18, d17
@ ARM: vfma.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0xa2,0xee]
@ THUMB: vfma.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x00,0x1a]
+@ THUMB_V7EM: vfma.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x00,0x1a]
vfma.f32 s2, s4, s0
@ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2]
@@ -23,6 +25,7 @@ vfnma.f64 d16, d18, d17
@ ARM: vfnma.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0x92,0xee]
@ THUMB: vfnma.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x40,0x1a]
+@ THUMB_V7EM: vfnma.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x40,0x1a]
vfnma.f32 s2, s4, s0
@ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee]
@@ -31,6 +34,7 @@ vfms.f64 d16, d18, d17
@ ARM: vfms.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0xa2,0xee]
@ THUMB: vfms.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x40,0x1a]
+@ THUMB_V7EM: vfms.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x40,0x1a]
vfms.f32 s2, s4, s0
@ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2]
diff --git a/test/MC/ARM/vpush-vpop.s b/test/MC/ARM/vpush-vpop.s
index 4fb4dec..31f5524 100644
--- a/test/MC/ARM/vpush-vpop.s
+++ b/test/MC/ARM/vpush-vpop.s
@@ -1,5 +1,5 @@
-@ RUN: llvm-mc -triple armv7-unknown-unknown -show-encoding < %s | FileCheck --check-prefix=CHECK-ARM %s
-@ RUN: llvm-mc -triple thumbv7-unknown-unknown -show-encoding < %s | FileCheck --check-prefix=CHECK-THUMB %s
+@ RUN: llvm-mc -triple armv7-unknown-unknown -mcpu=cortex-a8 -show-encoding < %s | FileCheck --check-prefix=CHECK-ARM %s
+@ RUN: llvm-mc -triple thumbv7-unknown-unknown -mcpu=cortex-a8 -show-encoding < %s | FileCheck --check-prefix=CHECK-THUMB %s
foo:
@ CHECK: foo
diff --git a/test/MC/AsmParser/extern.s b/test/MC/AsmParser/extern.s
new file mode 100644
index 0000000..461f843
--- /dev/null
+++ b/test/MC/AsmParser/extern.s
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+
+# CHECK-NOT: foo
+.extern foo
diff --git a/test/MC/AsmParser/ifb.s b/test/MC/AsmParser/ifb.s
new file mode 100644
index 0000000..48d69f4
--- /dev/null
+++ b/test/MC/AsmParser/ifb.s
@@ -0,0 +1,67 @@
+# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+
+defined:
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifb
+ .byte 1
+.else
+ .byte 0
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifb defined
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifb undefined
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifb ""
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnb
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnb defined
+ .byte 1
+.else
+ .byte 0
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnb undefined
+ .byte 1
+.else
+ .byte 0
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnb ""
+ .byte 1
+.else
+ .byte 0
+.endif
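
In practice, .ifb/.ifnb are mostly useful for making macro parameters optional: the body can test whether a given argument was actually supplied. A minimal sketch (the macro name and register are made up for illustration):

.macro maybe_push reg
.ifnb \reg              # a non-blank argument was supplied
        pushl \reg
.endif
.endm

maybe_push %eax         # expands to: pushl %eax
maybe_push              # blank argument: expands to nothing
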
diff --git a/test/MC/AsmParser/ifc.s b/test/MC/AsmParser/ifc.s
new file mode 100644
index 0000000..20e55c0
--- /dev/null
+++ b/test/MC/AsmParser/ifc.s
@@ -0,0 +1,65 @@
+# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifc foo, foo
+ .byte 1
+.else
+ .byte 0
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifc "foo space", "foo space"
+ .byte 1
+.else
+ .byte 0
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifc foo space, foo space
+ .byte 1
+.else
+ .byte 0
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifc unequal, unEqual
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnc foo, foo
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnc "foo space", "foo space"
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnc foo space, foo space
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+.ifnc unequal, unEqual
+ .byte 1
+.else
+ .byte 0
+.endif
diff --git a/test/MC/AsmParser/macro-args.s b/test/MC/AsmParser/macro-args.s
index 13b197a..6d08421 100644
--- a/test/MC/AsmParser/macro-args.s
+++ b/test/MC/AsmParser/macro-args.s
@@ -42,3 +42,15 @@ top bar, 42
// CHECK-NOT: fred
// CHECK: _bar
// CHECK-NEXT: fred = 42
+
+
+.macro foo
+foo_$0_$1_$2_$3:
+ nop
+.endm
+
+foo 1, 2, 3, 4
+foo 1, , 3, 4
+
+// CHECK: foo_1_2_3_4:
+// CHECK: foo_1__3_4:
diff --git a/test/MC/AsmParser/macro-err1.s b/test/MC/AsmParser/macro-err1.s
new file mode 100644
index 0000000..924deb0
--- /dev/null
+++ b/test/MC/AsmParser/macro-err1.s
@@ -0,0 +1,10 @@
+// RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2> %t
+// RUN: FileCheck < %t %s
+
+.macro foo bar
+ .long \bar
+.endm
+
+foo 42, 42
+
+// CHECK: Too many arguments
diff --git a/test/MC/AsmParser/macro-irp.s b/test/MC/AsmParser/macro-irp.s
new file mode 100644
index 0000000..a368b74
--- /dev/null
+++ b/test/MC/AsmParser/macro-irp.s
@@ -0,0 +1,8 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown %s | FileCheck %s
+
+.irp reg,%eax,%ebx
+ pushl \reg
+.endr
+
+// CHECK: pushl %eax
+// CHECK: pushl %ebx
diff --git a/test/MC/AsmParser/macro-irpc.s b/test/MC/AsmParser/macro-irpc.s
new file mode 100644
index 0000000..ea5efbf
--- /dev/null
+++ b/test/MC/AsmParser/macro-irpc.s
@@ -0,0 +1,9 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown %s | FileCheck %s
+
+.irpc foo,123
+ .long \foo
+.endr
+
+// CHECK: long 1
+// CHECK: long 2
+// CHECK: long 3
diff --git a/test/MC/AsmParser/macro-rept-err1.s b/test/MC/AsmParser/macro-rept-err1.s
new file mode 100644
index 0000000..db92856
--- /dev/null
+++ b/test/MC/AsmParser/macro-rept-err1.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2> %t
+// RUN: FileCheck < %t %s
+
+.endr
+
+// CHECK: unexpected '.endr' directive, no current .rept
diff --git a/test/MC/AsmParser/macro-rept-err2.s b/test/MC/AsmParser/macro-rept-err2.s
new file mode 100644
index 0000000..678b4c7
--- /dev/null
+++ b/test/MC/AsmParser/macro-rept-err2.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2> %t
+// RUN: FileCheck < %t %s
+
+.rept 3
+.long
+
+// CHECK: no matching '.endr' in definition
diff --git a/test/MC/AsmParser/macro-rept.s b/test/MC/AsmParser/macro-rept.s
new file mode 100644
index 0000000..1dc8060
--- /dev/null
+++ b/test/MC/AsmParser/macro-rept.s
@@ -0,0 +1,22 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown %s | FileCheck %s
+
+.rept 2
+ .long 1
+.endr
+
+.rept 3
+.rept 2
+ .long 0
+.endr
+.endr
+
+// CHECK: .long 1
+// CHECK: .long 1
+
+// CHECK: .long 0
+// CHECK: .long 0
+// CHECK: .long 0
+
+// CHECK: .long 0
+// CHECK: .long 0
+// CHECK: .long 0
diff --git a/test/MC/AsmParser/macros-parsing.s b/test/MC/AsmParser/macros-parsing.s
index 65f6454..75aaac03 100644
--- a/test/MC/AsmParser/macros-parsing.s
+++ b/test/MC/AsmParser/macros-parsing.s
@@ -5,7 +5,7 @@
.endmacro
.macros_off
-// CHECK-ERRORS: 9:1: warning: ignoring directive for now
+// CHECK-ERRORS: 9:1: error: unknown directive
.test0
.macros_on
diff --git a/test/MC/AsmParser/macros.s b/test/MC/AsmParser/macros.s
index 214274d..2957592 100644
--- a/test/MC/AsmParser/macros.s
+++ b/test/MC/AsmParser/macros.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err | FileCheck %s
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err | FileCheck %s
// RUN: FileCheck --check-prefix=CHECK-ERRORS %s < %t.err
.macro .test0
@@ -9,7 +9,7 @@
.endmacro
.test1
-// CHECK-ERRORS: <instantiation>:1:1: warning: ignoring directive for now
+// CHECK-ERRORS: <instantiation>:1:1: error: unknown directive
// CHECK-ERRORS-NEXT: macrobody0
// CHECK-ERRORS-NEXT: ^
// CHECK-ERRORS: <instantiation>:1:1: note: while in macro instantiation
diff --git a/test/MC/AsmParser/purgem.s b/test/MC/AsmParser/purgem.s
new file mode 100644
index 0000000..c76c1c6
--- /dev/null
+++ b/test/MC/AsmParser/purgem.s
@@ -0,0 +1,12 @@
+# RUN: not llvm-mc -triple i386-unknown-unknown %s 2>&1 | FileCheck %s
+
+.macro foo
+.err
+.endm
+
+.purgem bar
+# CHECK: error: macro 'bar' is not defined
+
+.purgem foo
+foo
+# CHECK: error: invalid instruction mnemonic 'foo'
diff --git a/test/MC/COFF/seh.s b/test/MC/COFF/seh.s
index 8cafcb3..3f72805 100644
--- a/test/MC/COFF/seh.s
+++ b/test/MC/COFF/seh.s
@@ -14,7 +14,6 @@
// CHECK-NEXT: IMAGE_SCN_CNT_INITIALIZED_DATA
// CHECK-NEXT: IMAGE_SCN_ALIGN_4BYTES
// CHECK-NEXT: IMAGE_SCN_MEM_READ
-// CHECK-NEXT: IMAGE_SCN_MEM_WRITE
// CHECK-NEXT: SectionData
// CHECK-NEXT: 09 12 08 03 00 03 0F 30 - 0E 88 00 00 09 64 02 00
// CHECK-NEXT: 04 22 00 1A 00 00 00 00 - 00 00 00 00 21 00 00 00
diff --git a/test/MC/Disassembler/ARM/arm-tests.txt b/test/MC/Disassembler/ARM/arm-tests.txt
index 471076a..0c9aaab 100644
--- a/test/MC/Disassembler/ARM/arm-tests.txt
+++ b/test/MC/Disassembler/ARM/arm-tests.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 -mattr +mp | FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 -mcpu=cortex-a9-mp | FileCheck %s
# CHECK: addpl r4, pc, #318767104
0x4c 0x45 0x8f 0x52
diff --git a/test/MC/Disassembler/ARM/basic-arm-instructions.txt b/test/MC/Disassembler/ARM/basic-arm-instructions.txt
index fc7eda5..1100ce6 100644
--- a/test/MC/Disassembler/ARM/basic-arm-instructions.txt
+++ b/test/MC/Disassembler/ARM/basic-arm-instructions.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=armv7-apple-darwin -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple=armv7-apple-darwin -mcpu=cortex-a8 -disassemble < %s | FileCheck %s
#------------------------------------------------------------------------------
# ADC (immediate)
@@ -169,9 +169,15 @@
#------------------------------------------------------------------------------
# CHECK: add r2, pc, #3
# CHECK: sub r2, pc, #3
+# CHECK: sub r1, pc, #0
+# CHECK: sub r1, pc, #301989888
+# CHECK: add r1, pc, #301989888
0x03 0x20 0x8f 0xe2
0x03 0x20 0x4f 0xe2
+0x00 0x10 0x4f 0xe2
+0x12 0x14 0x4f 0xe2
+0x12 0x14 0x8f 0xe2
#------------------------------------------------------------------------------
# AND
@@ -469,47 +475,77 @@
#------------------------------------------------------------------------------
# DMB
#------------------------------------------------------------------------------
-# CHECK: dmb sy
-# CHECK: dmb st
-# CHECK: dmb ish
-# CHECK: dmb ishst
-# CHECK: dmb nsh
-# CHECK: dmb nshst
-# CHECK: dmb osh
+
+# CHECK: dmb #0x0
+# CHECK: dmb #0x1
# CHECK: dmb oshst
-# CHECK: dmb
+# CHECK: dmb osh
+# CHECK: dmb #0x4
+# CHECK: dmb #0x5
+# CHECK: dmb nshst
+# CHECK: dmb nsh
+# CHECK: dmb #0x8
+# CHECK: dmb #0x9
+# CHECK: dmb ishst
+# CHECK: dmb ish
+# CHECK: dmb #0xc
+# CHECK: dmb #0xd
+# CHECK: dmb st
+# CHECK: dmb sy
-0x5f 0xf0 0x7f 0xf5
-0x5e 0xf0 0x7f 0xf5
-0x5b 0xf0 0x7f 0xf5
-0x5a 0xf0 0x7f 0xf5
-0x57 0xf0 0x7f 0xf5
-0x56 0xf0 0x7f 0xf5
-0x53 0xf0 0x7f 0xf5
+0x50 0xf0 0x7f 0xf5
+0x51 0xf0 0x7f 0xf5
0x52 0xf0 0x7f 0xf5
+0x53 0xf0 0x7f 0xf5
+0x54 0xf0 0x7f 0xf5
+0x55 0xf0 0x7f 0xf5
+0x56 0xf0 0x7f 0xf5
+0x57 0xf0 0x7f 0xf5
+0x58 0xf0 0x7f 0xf5
+0x59 0xf0 0x7f 0xf5
+0x5a 0xf0 0x7f 0xf5
+0x5b 0xf0 0x7f 0xf5
+0x5c 0xf0 0x7f 0xf5
+0x5d 0xf0 0x7f 0xf5
+0x5e 0xf0 0x7f 0xf5
0x5f 0xf0 0x7f 0xf5
#------------------------------------------------------------------------------
# DSB
#------------------------------------------------------------------------------
-# CHECK: dsb sy
-# CHECK: dsb st
-# CHECK: dsb ish
-# CHECK: dsb ishst
-# CHECK: dsb nsh
-# CHECK: dsb nshst
-# CHECK: dsb osh
-# CHECK: dsb oshst
-# CHECK: dsb
-0x4f 0xf0 0x7f 0xf5
-0x4e 0xf0 0x7f 0xf5
-0x4b 0xf0 0x7f 0xf5
-0x4a 0xf0 0x7f 0xf5
-0x47 0xf0 0x7f 0xf5
-0x46 0xf0 0x7f 0xf5
-0x43 0xf0 0x7f 0xf5
+# CHECK: dsb #0x0
+# CHECK: dsb #0x1
+# CHECK: dsb oshst
+# CHECK: dsb osh
+# CHECK: dsb #0x4
+# CHECK: dsb #0x5
+# CHECK: dsb nshst
+# CHECK: dsb nsh
+# CHECK: dsb #0x8
+# CHECK: dsb #0x9
+# CHECK: dsb ishst
+# CHECK: dsb ish
+# CHECK: dsb #0xc
+# CHECK: dsb #0xd
+# CHECK: dsb st
+# CHECK: dsb sy
+
+0x40 0xf0 0x7f 0xf5
+0x41 0xf0 0x7f 0xf5
0x42 0xf0 0x7f 0xf5
+0x43 0xf0 0x7f 0xf5
+0x44 0xf0 0x7f 0xf5
+0x45 0xf0 0x7f 0xf5
+0x46 0xf0 0x7f 0xf5
+0x47 0xf0 0x7f 0xf5
+0x48 0xf0 0x7f 0xf5
+0x49 0xf0 0x7f 0xf5
+0x4a 0xf0 0x7f 0xf5
+0x4b 0xf0 0x7f 0xf5
+0x4c 0xf0 0x7f 0xf5
+0x4d 0xf0 0x7f 0xf5
+0x4e 0xf0 0x7f 0xf5
0x4f 0xf0 0x7f 0xf5
#------------------------------------------------------------------------------
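
The rewritten expectations walk the entire 4-bit barrier-option field in order: the eight architecturally named options keep their mnemonics, while the reserved values now round-trip as plain immediates instead of being folded onto a named form. Summarizing the mapping exercised above:

# low nibble of the encoding -> printed option (all other values print as #imm):
#   0x2 oshst   0x3 osh   0x6 nshst   0x7 nsh
#   0xa ishst   0xb ish   0xe st      0xf sy
# e.g. 0x50 0xf0 0x7f 0xf5  ->  dmb #0x0   (reserved value)
#      0x5f 0xf0 0x7f 0xf5  ->  dmb sy
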
diff --git a/test/MC/Disassembler/ARM/fp-encoding.txt b/test/MC/Disassembler/ARM/fp-encoding.txt
index 9095b84..8dedf80 100644
--- a/test/MC/Disassembler/ARM/fp-encoding.txt
+++ b/test/MC/Disassembler/ARM/fp-encoding.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple armv7-apple-darwin -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple armv7-apple-darwin -mcpu=cortex-a8 -disassemble < %s | FileCheck %s
0xa0 0x0b 0x71 0xee
# CHECK: vadd.f64 d16, d17, d16
@@ -203,6 +203,33 @@
# CHECK: vstmia r1, {d2, d3, d4, d5, d6, d7}
# CHECK: vstmia r1, {s2, s3, s4, s5, s6, s7}
+0x05 0x9a 0xc0 0x0c
+0x0c 0x0b 0xc7 0x0c
+0x06 0x9a 0x93 0x0c
+0x0a 0x5b 0xd2 0x0c
+# CHECK: vstmiaeq r0, {s19, s20, s21, s22, s23}
+# CHECK: vstmiaeq r7, {d16, d17, d18, d19, d20, d21}
+# CHECK: vldmiaeq r3, {s18, s19, s20, s21, s22, s23}
+# CHECK: vldmiaeq r2, {d21, d22, d23, d24, d25}
+
+0x04 0xca 0x6c 0x0d
+0x06 0x1b 0x69 0x0d
+0x03 0xaa 0x75 0x0d
+0x08 0xeb 0x37 0x0d
+# CHECK: vstmdbeq r12!, {s25, s26, s27, s28}
+# CHECK: vstmdbeq r9!, {d17, d18, d19}
+# CHECK: vldmdbeq r5!, {s21, s22, s23}
+# CHECK: vldmdbeq r7!, {d14, d15, d16, d17}
+
+0x04 0x7a 0xa6 0x0c
+0x0c 0xfb 0xa4 0x0c
+0x03 0xaa 0xf8 0x0c
+0x0a 0x3b 0xfb 0x0c
+# CHECK: vstmiaeq r6!, {s14, s15, s16, s17}
+# CHECK: vstmiaeq r4!, {d15, d16, d17, d18, d19, d20}
+# CHECK: vldmiaeq r8!, {s21, s22, s23}
+# CHECK: vldmiaeq r11!, {d19, d20, d21, d22, d23}
+
0x40 0x0b 0xbd 0xee
0x60 0x0a 0xbd 0xee
0x40 0x0b 0xbc 0xee
diff --git a/test/MC/Disassembler/ARM/invalid-BFI-arm.txt b/test/MC/Disassembler/ARM/invalid-BFI-arm.txt
index a0d5944..f7acce9 100644
--- a/test/MC/Disassembler/ARM/invalid-BFI-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-BFI-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=60 Name=BFI Format=ARM_FORMAT_DPFRM(4)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
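
The recurring RUN-line change across these disassembler tests swaps the csh-style `|&` pipe for `2>&1 |`. Both forms pipe stderr along with stdout, but `|&` is a csh/bash extension, so the POSIX spelling is the portable choice; presumably that is the motivation here. Side by side (command abbreviated for illustration):

# non-portable csh/bash shorthand:
llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep "invalid instruction encoding"
# POSIX: duplicate stderr onto stdout, then pipe:
llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
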
diff --git a/test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt b/test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt
index d2d424c..356c376 100644
--- a/test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=2249 Name=tBcc Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-CPS2p-arm.txt b/test/MC/Disassembler/ARM/invalid-CPS2p-arm.txt
index 10748e9..bc8b7e1 100644
--- a/test/MC/Disassembler/ARM/invalid-CPS2p-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-CPS2p-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# invalid imod value (0b01)
0xc0 0x67 0x4 0xf1
diff --git a/test/MC/Disassembler/ARM/invalid-CPS3p-arm.txt b/test/MC/Disassembler/ARM/invalid-CPS3p-arm.txt
index 8146b5c..842a52b 100644
--- a/test/MC/Disassembler/ARM/invalid-CPS3p-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-CPS3p-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {potentially undefined instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "potentially undefined instruction encoding"
# invalid (imod, M, iflags) combination
0x93 0x00 0x02 0xf1
diff --git a/test/MC/Disassembler/ARM/invalid-DMB-thumb.txt b/test/MC/Disassembler/ARM/invalid-DMB-thumb.txt
index b441485..8396156 100644
--- a/test/MC/Disassembler/ARM/invalid-DMB-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-DMB-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=1908 Name=t2DMB Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-DSB-arm.txt b/test/MC/Disassembler/ARM/invalid-DSB-arm.txt
index de042a97..2c6e6a7 100644
--- a/test/MC/Disassembler/ARM/invalid-DSB-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-DSB-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=102 Name=DSB Format=ARM_FORMAT_MISCFRM(26)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-IT-CBNZ-thumb.txt b/test/MC/Disassembler/ARM/invalid-IT-CBNZ-thumb.txt
index 6174e92..4297c01 100644
--- a/test/MC/Disassembler/ARM/invalid-IT-CBNZ-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-IT-CBNZ-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 |& grep {potentially undefined instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 2>&1 | grep "potentially undefined instruction encoding"
# CBZ / CBNZ not allowed in IT block.
diff --git a/test/MC/Disassembler/ARM/invalid-IT-CC15.txt b/test/MC/Disassembler/ARM/invalid-IT-CC15.txt
index 17e25ea..733895d 100644
--- a/test/MC/Disassembler/ARM/invalid-IT-CC15.txt
+++ b/test/MC/Disassembler/ARM/invalid-IT-CC15.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-unknown-unknown |& grep und
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-unknown-unknown 2>&1 | grep und
# rdar://10841671
0xe3 0xbf
diff --git a/test/MC/Disassembler/ARM/invalid-IT-thumb.txt b/test/MC/Disassembler/ARM/invalid-IT-thumb.txt
index 9b571b3..1a8ff48 100644
--- a/test/MC/Disassembler/ARM/invalid-IT-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-IT-thumb.txt
@@ -1,3 +1,3 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-unknown-unknown |& grep {potentially undefined instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-unknown-unknown 2>&1 | grep "potentially undefined instruction encoding"
0xff 0xbf 0x6b 0x80 0x00 0x75
diff --git a/test/MC/Disassembler/ARM/invalid-LDC-form-arm.txt b/test/MC/Disassembler/ARM/invalid-LDC-form-arm.txt
index 0b0426b..6cff09e 100644
--- a/test/MC/Disassembler/ARM/invalid-LDC-form-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-LDC-form-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=0 Name=PHI Format=(42)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-LDM-thumb.txt b/test/MC/Disassembler/ARM/invalid-LDM-thumb.txt
index a42b248..7d8c492 100644
--- a/test/MC/Disassembler/ARM/invalid-LDM-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-LDM-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 |& grep {potentially undefined instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 2>&1 | grep "potentially undefined instruction encoding"
# Writeback is not allowed if Rn is in the target register list.
diff --git a/test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt b/test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt
index 6b695b9..68d22de 100644
--- a/test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {potentially undefined instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "potentially undefined instruction encoding"
# Opcode=140 Name=LDRB_POST Format=ARM_FORMAT_LDFRM(6)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt b/test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt
index 7ea1b46..4df5309 100644
--- a/test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=1930 Name=t2LDRD_PRE Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt b/test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt
index eef2c45..0cff28a 100644
--- a/test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# XFAIL: *
# LDR_PRE/POST has encoding Inst{4} = 0.
diff --git a/test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt b/test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt
index e42e0de..30cb727 100644
--- a/test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {potentially undefined instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "potentially undefined instruction encoding"
# Opcode=165 Name=LDR_PRE Format=ARM_FORMAT_LDFRM(6)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-LDRrs-arm.txt b/test/MC/Disassembler/ARM/invalid-LDRrs-arm.txt
index 23a0b85..7b7286a 100644
--- a/test/MC/Disassembler/ARM/invalid-LDRrs-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-LDRrs-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# LDR (register) has encoding Inst{4} = 0.
0xba 0xae 0x9f 0x57
diff --git a/test/MC/Disassembler/ARM/invalid-MCR-arm.txt b/test/MC/Disassembler/ARM/invalid-MCR-arm.txt
index 8343d54..bb4b06c 100644
--- a/test/MC/Disassembler/ARM/invalid-MCR-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-MCR-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=171 Name=MCR Format=ARM_FORMAT_BRFRM(2)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt b/test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt
index 235952f..528563a 100644
--- a/test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=185 Name=MOVTi16 Format=ARM_FORMAT_DPFRM(4)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-MOVr-arm.txt b/test/MC/Disassembler/ARM/invalid-MOVr-arm.txt
index 01c1466..41ec53f 100644
--- a/test/MC/Disassembler/ARM/invalid-MOVr-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-MOVr-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=0 Name=PHI Format=(42)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-MOVs-LSL-arm.txt b/test/MC/Disassembler/ARM/invalid-MOVs-LSL-arm.txt
index 757d167..e5f2a5e 100644
--- a/test/MC/Disassembler/ARM/invalid-MOVs-LSL-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-MOVs-LSL-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=196 Name=MOVs Format=ARM_FORMAT_DPSOREGFRM(5)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-MOVs-arm.txt b/test/MC/Disassembler/ARM/invalid-MOVs-arm.txt
index ba48877..3f4c1e5 100644
--- a/test/MC/Disassembler/ARM/invalid-MOVs-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-MOVs-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=0 Name=PHI Format=(42)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-MRRC2-arm.txt b/test/MC/Disassembler/ARM/invalid-MRRC2-arm.txt
index aaae6ce..c20ce54 100644
--- a/test/MC/Disassembler/ARM/invalid-MRRC2-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-MRRC2-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: invalid instruction encoding
0x00 0x1a 0x50 0xfc
diff --git a/test/MC/Disassembler/ARM/invalid-MSRi-arm.txt b/test/MC/Disassembler/ARM/invalid-MSRi-arm.txt
index 3765b1f..901667a 100644
--- a/test/MC/Disassembler/ARM/invalid-MSRi-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-MSRi-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=206 Name=MSRi Format=ARM_FORMAT_BRFRM(2)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-RFEorLDMIA-arm.txt b/test/MC/Disassembler/ARM/invalid-RFEorLDMIA-arm.txt
index cffd86d..499aa86 100644
--- a/test/MC/Disassembler/ARM/invalid-RFEorLDMIA-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-RFEorLDMIA-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=134 Name=LDMIA Format=ARM_FORMAT_LDSTMULFRM(10)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-SBFX-arm.txt b/test/MC/Disassembler/ARM/invalid-SBFX-arm.txt
index 9e16536..7bc97d5 100644
--- a/test/MC/Disassembler/ARM/invalid-SBFX-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-SBFX-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=271 Name=SBFX Format=ARM_FORMAT_DPFRM(4)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-SMLAD-arm.txt b/test/MC/Disassembler/ARM/invalid-SMLAD-arm.txt
index 91f3d58..fe4f43a 100644
--- a/test/MC/Disassembler/ARM/invalid-SMLAD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-SMLAD-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=284 Name=SMLAD Format=ARM_FORMAT_MULFRM(1)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-SRS-arm.txt b/test/MC/Disassembler/ARM/invalid-SRS-arm.txt
index fc5c711..eedd05c 100644
--- a/test/MC/Disassembler/ARM/invalid-SRS-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-SRS-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=0 Name=PHI Format=(42)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt
index ca16724..3d5235d 100644
--- a/test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=2313 Name=tSTMIA_UPD Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-SXTB-arm.txt b/test/MC/Disassembler/ARM/invalid-SXTB-arm.txt
index 400d44c..f67f38e 100644
--- a/test/MC/Disassembler/ARM/invalid-SXTB-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-SXTB-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=390 Name=SXTBr_rot Format=ARM_FORMAT_EXTFRM(14)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-UMAAL-arm.txt b/test/MC/Disassembler/ARM/invalid-UMAAL-arm.txt
index c7cbd84..f57c48f 100644
--- a/test/MC/Disassembler/ARM/invalid-UMAAL-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-UMAAL-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=419 Name=UMAAL Format=ARM_FORMAT_MULFRM(1)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
index 12da869..5ba7d61 100644
--- a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknown -mcpu=cortex-a8 2>&1 | grep "invalid instruction encoding"
# XFAIL: *
# Opcode=737 Name=VLD1DUPq8_UPD Format=ARM_FORMAT_NLdSt(30)
diff --git a/test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt
index bab32ca..58def05 100644
--- a/test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=871 Name=VLD3DUPd32_UPD Format=ARM_FORMAT_NLdSt(30)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-VLDMSDB_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VLDMSDB_UPD-arm.txt
index 887b983..54fcadb 100644
--- a/test/MC/Disassembler/ARM/invalid-VLDMSDB_UPD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-VLDMSDB_UPD-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# core registers out of range
0xa5 0xba 0x72 0xed
diff --git a/test/MC/Disassembler/ARM/invalid-VQADD-arm.txt b/test/MC/Disassembler/ARM/invalid-VQADD-arm.txt
index a53f940..f961c64 100644
--- a/test/MC/Disassembler/ARM/invalid-VQADD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-VQADD-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknown -mcpu=cortex-a8 2>&1 | grep "invalid instruction encoding"
# XFAIL: *
# Opcode=1225 Name=VQADDsv16i8 Format=ARM_FORMAT_N3Reg(37)
diff --git a/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt
index 8ff3a2b..2d2a628 100644
--- a/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=1839 Name=VST1d8Twb_register Format=ARM_FORMAT_NLdSt(30)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-VST2b32_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VST2b32_UPD-arm.txt
index a12ca95..07a1c7a 100644
--- a/test/MC/Disassembler/ARM/invalid-VST2b32_UPD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-VST2b32_UPD-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknown -mcpu=cortex-a8 2>&1 | grep "invalid instruction encoding"
# XFAIL: *
# Opcode=1641 Name=VST2b32_UPD Format=ARM_FORMAT_NLdSt(30)
diff --git a/test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt
index df0a642..c9f1cf1 100644
--- a/test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=1894 Name=t2Bcc Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt
index e1f841b8..eb415f7 100644
--- a/test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=1922 Name=t2LDRBT Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt
index 7c0efab..6c13560 100644
--- a/test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# XFAIL: *
# Opcode=1934 Name=t2LDREXD Format=ARM_FORMAT_THUMBFRM(25)
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt
index a63d121..7f84e08 100644
--- a/test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=1953 Name=t2LDRSHi12 Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt
index f126ff0..e44cf95 100644
--- a/test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=1954 Name=t2LDRSHi8 Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-t2PUSH-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2PUSH-thumb.txt
index b3daa9a..8c0d48b 100644
--- a/test/MC/Disassembler/ARM/invalid-t2PUSH-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2PUSH-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# SP and PC are not allowed in the register list on STM instructions in Thumb2.
diff --git a/test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt
index 2198efc..64ba368 100644
--- a/test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# XFAIL: *
# Opcode=2124 Name=t2STRD_PRE Format=ARM_FORMAT_THUMBFRM(25)
diff --git a/test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt
index 3f406d4..243c11d 100644
--- a/test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# XFAIL: *
# Opcode=2127 Name=t2STREXB Format=ARM_FORMAT_THUMBFRM(25)
diff --git a/test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt
index 0f9a16e..7a7c4a5 100644
--- a/test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=2128 Name=t2STREXD Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt
index 548ad05..2ad3e7d 100644
--- a/test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt
+++ b/test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# Opcode=2137 Name=t2STR_POST Format=ARM_FORMAT_THUMBFRM(25)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/ldrd-armv4.txt b/test/MC/Disassembler/ARM/ldrd-armv4.txt
index bb87ade..f2fff3f 100644
--- a/test/MC/Disassembler/ARM/ldrd-armv4.txt
+++ b/test/MC/Disassembler/ARM/ldrd-armv4.txt
@@ -1,5 +1,5 @@
-# RUN: llvm-mc --disassemble %s -triple=armv4-linux-gnueabi |& FileCheck %s -check-prefix=V4
-# RUN: llvm-mc --disassemble %s -triple=armv5te-linux-gnueabi |& FileCheck %s -check-prefix=V5TE
+# RUN: llvm-mc --disassemble %s -triple=armv4-linux-gnueabi 2>&1 | FileCheck %s -check-prefix=V4
+# RUN: llvm-mc --disassemble %s -triple=armv5te-linux-gnueabi 2>&1 | FileCheck %s -check-prefix=V5TE
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
# -------------------------------------------------------------------------------------------------
diff --git a/test/MC/Disassembler/ARM/neon-tests.txt b/test/MC/Disassembler/ARM/neon-tests.txt
index f44c2a0..a7b6b1c 100644
--- a/test/MC/Disassembler/ARM/neon-tests.txt
+++ b/test/MC/Disassembler/ARM/neon-tests.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 | FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 -mcpu=cortex-a8 | FileCheck %s
# CHECK: vbif q15, q7, q0
0x50 0xe1 0x7e 0xf3
diff --git a/test/MC/Disassembler/ARM/neon.txt b/test/MC/Disassembler/ARM/neon.txt
index c5dbee3..649424a 100644
--- a/test/MC/Disassembler/ARM/neon.txt
+++ b/test/MC/Disassembler/ARM/neon.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple armv7-unknown-unknown -disassemble -mattr +fp16 < %s | FileCheck %s
+# RUN: llvm-mc -triple armv7-unknown-unknown -mcpu=cortex-a9 -disassemble < %s | FileCheck %s
0x20 0x03 0xf1 0xf3
# CHECK: vabs.s8 d16, d16
@@ -1734,6 +1734,25 @@
0xcf 0x1a 0xe0 0xf4
# CHECK: vld3.32 {d17[1], d19[1], d21[1]}, [r0]
+# CHECK: vld3.8 {d0[], d1[], d2[]}, [r4]
+0x0f 0x0e 0xa4 0xf4
+# CHECK: vld3.8 {d0[], d1[], d2[]}, [r4]!
+0x0d 0x0e 0xa4 0xf4
+# CHECK: vld3.8 {d0[], d2[], d4[]}, [r4], r5
+0x25 0x0e 0xa4 0xf4
+# CHECK: vld3.16 {d0[], d2[], d4[]}, [r4]
+0x6f 0x0e 0xa4 0xf4
+# CHECK: vld3.16 {d0[], d1[], d2[]}, [r4]!
+0x4d 0x0e 0xa4 0xf4
+# CHECK: vld3.16 {d0[], d2[], d4[]}, [r4], r5
+0x65 0x0e 0xa4 0xf4
+# CHECK: vld3.32 {d0[], d1[], d2[]}, [r4]
+0x8f 0x0e 0xa4 0xf4
+# CHECK: vld3.32 {d0[], d1[], d2[]}, [r4]!
+0x8d 0x0e 0xa4 0xf4
+# CHECK: vld3.32 {d0[], d2[], d4[]}, [r4], r5
+0xa5 0x0e 0xa4 0xf4
+
0x3f 0x03 0xe0 0xf4
# CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
0x4f 0x07 0xe0 0xf4
@@ -1745,6 +1764,30 @@
0x4f 0x1b 0xe0 0xf4
# CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
+0x0f 0x0f 0xa4 0xf4
+# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4]
+0x3f 0x0f 0xa4 0xf4
+# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32]
+0x1d 0x0f 0xa4 0xf4
+# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4, :32]!
+0x35 0x0f 0xa4 0xf4
+# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32], r5
+0x4f 0x0f 0xa4 0xf4
+# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4]
+0x7f 0x0f 0xa4 0xf4
+# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64]
+0x5d 0x0f 0xa4 0xf4
+# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4, :64]!
+0x75 0x0f 0xa4 0xf4
+# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64], r5
+0x8f 0x0f 0xa4 0xf4
+# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4]
+0xbf 0x0f 0xa4 0xf4
+# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :64]
+0xdd 0x0f 0xa4 0xf4
+# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4, :128]!
+0xf5 0x0f 0xa4 0xf4
+# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :128], r5
0x1f 0x07 0x40 0xf4
@@ -1852,7 +1895,26 @@
# CHECK: vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
0x3d 0x2a 0x5e 0x6c
-# CHECK: vmovvs r2, lr, s29, s30
+# CHECK: vmovvs r2, lr, s27, s28
+
+0x31 0x1a 0x42 0xec
+0x11 0x1a 0x42 0xec
+0x31 0x1a 0x52 0xec
+0x11 0x1a 0x52 0xec
+# CHECK: vmov s3, s4, r1, r2
+# CHECK: vmov s2, s3, r1, r2
+# CHECK: vmov r1, r2, s3, s4
+# CHECK: vmov r1, r2, s2, s3
+
+0x1f 0x1b 0x42 0xec
+0x30 0x1b 0x42 0xec
+0x1f 0x1b 0x52 0xec
+0x30 0x1b 0x52 0xec
+# CHECK: vmov d15, r1, r2
+# CHECK: vmov d16, r1, r2
+# CHECK: vmov r1, r2, d15
+# CHECK: vmov r1, r2, d16
+
0xe9 0x1a 0xb2 0x4e
# CHECK: vcvttmi.f32.f16 s2, s19
@@ -1869,14 +1931,6 @@
# CHECK: vmov.f32 d0, #1.600000e+01
# CHECK: vmov.f32 q0, #1.600000e+01
-# rdar://10798451
-0xe7 0xf9 0x32 0x1d
-# CHECK vld2.8 {d17[], d19[]}, [r7, :16], r2
-0xe7 0xf9 0x3d 0x1d
-# CHECK vld2.8 {d17[], d19[]}, [r7, :16]!
-0xe7 0xf9 0x3f 0x1d
-# CHECK vld2.8 {d17[], d19[]}, [r7, :16]
-
# rdar://11034702
0x0d 0x87 0x04 0xf4
# CHECK: vst1.8 {d8}, [r4]!
diff --git a/test/MC/Disassembler/ARM/neont2.txt b/test/MC/Disassembler/ARM/neont2.txt
index 65cd230..7d7010f 100644
--- a/test/MC/Disassembler/ARM/neont2.txt
+++ b/test/MC/Disassembler/ARM/neont2.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple thumbv7-unknown-unknown -disassemble -mattr +fp16 < %s | FileCheck %s
+# RUN: llvm-mc -triple thumbv7-unknown-unknown -mcpu=cortex-a9 -disassemble < %s | FileCheck %s
0xf1 0xff 0x20 0x03
# CHECK: vabs.s8 d16, d16
@@ -1475,6 +1475,25 @@
0xe0 0xf9 0xcf 0x1a
# CHECK: vld3.32 {d17[1], d19[1], d21[1]}, [r0]
+0xa4 0xf9 0x0f 0x0e
+# CHECK: vld3.8 {d0[], d1[], d2[]}, [r4]
+0xa4 0xf9 0x0d 0x0e
+# CHECK: vld3.8 {d0[], d1[], d2[]}, [r4]!
+0xa4 0xf9 0x25 0x0e
+# CHECK: vld3.8 {d0[], d2[], d4[]}, [r4], r5
+0xa4 0xf9 0x6f 0x0e
+# CHECK: vld3.16 {d0[], d2[], d4[]}, [r4]
+0xa4 0xf9 0x4d 0x0e
+# CHECK: vld3.16 {d0[], d1[], d2[]}, [r4]!
+0xa4 0xf9 0x65 0x0e
+# CHECK: vld3.16 {d0[], d2[], d4[]}, [r4], r5
+0xa4 0xf9 0x8f 0x0e
+# CHECK: vld3.32 {d0[], d1[], d2[]}, [r4]
+0xa4 0xf9 0x8d 0x0e
+# CHECK: vld3.32 {d0[], d1[], d2[]}, [r4]!
+0xa4 0xf9 0xa5 0x0e
+# CHECK: vld3.32 {d0[], d2[], d4[]}, [r4], r5
+
0xe0 0xf9 0x3f 0x03
# CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
0xe0 0xf9 0x4f 0x07
@@ -1486,6 +1505,31 @@
0xe0 0xf9 0x4f 0x1b
# CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
+0xa4 0xf9 0x0f 0x0f
+# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4]
+0xa4 0xf9 0x3f 0x0f
+# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32]
+0xa4 0xf9 0x1d 0x0f
+# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4, :32]!
+0xa4 0xf9 0x35 0x0f
+# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32], r5
+0xa4 0xf9 0x4f 0x0f
+# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4]
+0xa4 0xf9 0x7f 0x0f
+# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64]
+0xa4 0xf9 0x5d 0x0f
+# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4, :64]!
+0xa4 0xf9 0x75 0x0f
+# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64], r5
+0xa4 0xf9 0x8f 0x0f
+# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4]
+0xa4 0xf9 0xbf 0x0f
+# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :64]
+0xa4 0xf9 0xdd 0x0f
+# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4, :128]!
+0xa4 0xf9 0xf5 0x0f
+# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :128], r5
+
0x40 0xf9 0x1f 0x07
# CHECK: vst1.8 {d16}, [r0, :64]
0x40 0xf9 0x4f 0x07
@@ -1998,3 +2042,13 @@
# CHECK: vld2.16 {d0[], d2[]}, [r3], r4
0xa3 0xf9 0xa4 0x0d
# CHECK: vld2.32 {d0[], d2[]}, [r3], r4
+
+
+# rdar://10798451
+0xe7 0xf9 0x32 0x1d
+# CHECK: vld2.8 {d17[], d19[]}, [r7, :16], r2
+0xe7 0xf9 0x3d 0x1d
+# CHECK: vld2.8 {d17[], d19[]}, [r7, :16]!
+0xe7 0xf9 0x3f 0x1d
+# CHECK: vld2.8 {d17[], d19[]}, [r7, :16]
+
diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt
index 18b8f47..c08585a 100644
--- a/test/MC/Disassembler/ARM/thumb-tests.txt
+++ b/test/MC/Disassembler/ARM/thumb-tests.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 -mattr +t2xtpk,+mp | FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 -mcpu=cortex-a9-mp | FileCheck %s
# CHECK: add r5, sp, #68
0x11 0xad
@@ -301,3 +301,11 @@
# CHECK: mrs r0, apsr
0xef 0xf3 0x00 0x80
+
+# rdar://11313994
+# CHECK: blx #2313244
+0x34 0xf2 0x0e 0xee
+
+# rdar://11324693
+# CHECK: bl #-12303196
+0x44 0xf4 0x52 0xda
diff --git a/test/MC/Disassembler/ARM/thumb1.txt b/test/MC/Disassembler/ARM/thumb1.txt
index 17c4bad..5b70262 100644
--- a/test/MC/Disassembler/ARM/thumb1.txt
+++ b/test/MC/Disassembler/ARM/thumb1.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=thumbv6-apple-darwin -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple=thumbv6-apple-darwin -disassemble -show-encoding < %s | FileCheck %s
#------------------------------------------------------------------------------
# ADC (register)
@@ -83,6 +83,15 @@
0xb1 0x43
#------------------------------------------------------------------------------
+# B
+#------------------------------------------------------------------------------
+# CHECK: bls #128 @ encoding: [0x40,0xd9]
+# CHECK: beq #-256 @ encoding: [0x80,0xd0]
+
+0x40 0xd9
+0x80 0xd0
+
+#------------------------------------------------------------------------------
# BKPT
#------------------------------------------------------------------------------
# CHECK: bkpt #0
@@ -516,15 +525,3 @@
0xd7 0xb2
0xa1 0xb2
-
-
-#------------------------------------------------------------------------------
-# WFE/WFI/YIELD
-#------------------------------------------------------------------------------
-# CHECK: wfe
-# CHECK: wfi
-# CHECK: yield
-
-0x20 0xbf
-0x30 0xbf
-0x10 0xbf
diff --git a/test/MC/Disassembler/ARM/thumb2.txt b/test/MC/Disassembler/ARM/thumb2.txt
index ed8d988..42ebe58 100644
--- a/test/MC/Disassembler/ARM/thumb2.txt
+++ b/test/MC/Disassembler/ARM/thumb2.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=thumbv7-apple-darwin -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -disassemble < %s | FileCheck %s
#------------------------------------------------------------------------------
# ADC (immediate)
@@ -92,9 +92,11 @@
#------------------------------------------------------------------------------
# CHECK: subw r11, pc, #3270
# CHECK: subw r11, pc, #826
+# CHECK: subw r1, pc, #0
0xaf 0xf6 0xc6 0x4b
0xaf 0xf2 0x3a 0x3b
+0xaf 0xf2 0x00 0x01
#------------------------------------------------------------------------------
# AND (immediate)
@@ -344,23 +346,37 @@
#------------------------------------------------------------------------------
#CHECK: dmb sy
#CHECK: dmb st
+#CHECK: dmb #0xd
+#CHECK: dmb #0xc
#CHECK: dmb ish
#CHECK: dmb ishst
+#CHECK: dmb #0x9
+#CHECK: dmb #0x8
#CHECK: dmb nsh
#CHECK: dmb nshst
+#CHECK: dmb #0x5
+#CHECK: dmb #0x4
#CHECK: dmb osh
#CHECK: dmb oshst
-#CHECK: dmb
+#CHECK: dmb #0x1
+#CHECK: dmb #0x0
0xbf 0xf3 0x5f 0x8f
0xbf 0xf3 0x5e 0x8f
+0xbf 0xf3 0x5d 0x8f
+0xbf 0xf3 0x5c 0x8f
0xbf 0xf3 0x5b 0x8f
0xbf 0xf3 0x5a 0x8f
+0xbf 0xf3 0x59 0x8f
+0xbf 0xf3 0x58 0x8f
0xbf 0xf3 0x57 0x8f
0xbf 0xf3 0x56 0x8f
+0xbf 0xf3 0x55 0x8f
+0xbf 0xf3 0x54 0x8f
0xbf 0xf3 0x53 0x8f
0xbf 0xf3 0x52 0x8f
-0xbf 0xf3 0x5f 0x8f
+0xbf 0xf3 0x51 0x8f
+0xbf 0xf3 0x50 0x8f
#------------------------------------------------------------------------------
@@ -368,21 +384,37 @@
#------------------------------------------------------------------------------
#CHECK: dsb sy
#CHECK: dsb st
+#CHECK: dsb #0xd
+#CHECK: dsb #0xc
#CHECK: dsb ish
#CHECK: dsb ishst
+#CHECK: dsb #0x9
+#CHECK: dsb #0x8
#CHECK: dsb nsh
#CHECK: dsb nshst
+#CHECK: dsb #0x5
+#CHECK: dsb #0x4
#CHECK: dsb osh
#CHECK: dsb oshst
+#CHECK: dsb #0x1
+#CHECK: dsb #0x0
0xbf 0xf3 0x4f 0x8f
0xbf 0xf3 0x4e 0x8f
+0xbf 0xf3 0x4d 0x8f
+0xbf 0xf3 0x4c 0x8f
0xbf 0xf3 0x4b 0x8f
0xbf 0xf3 0x4a 0x8f
+0xbf 0xf3 0x49 0x8f
+0xbf 0xf3 0x48 0x8f
0xbf 0xf3 0x47 0x8f
0xbf 0xf3 0x46 0x8f
+0xbf 0xf3 0x45 0x8f
+0xbf 0xf3 0x44 0x8f
0xbf 0xf3 0x43 0x8f
0xbf 0xf3 0x42 0x8f
+0xbf 0xf3 0x41 0x8f
+0xbf 0xf3 0x40 0x8f
#------------------------------------------------------------------------------
@@ -609,6 +641,9 @@
# CHECK: ldrd r3, r5, [r6], #-8
# CHECK: ldrd r3, r5, [r6]
# CHECK: ldrd r8, r1, [r3]
+# CHECK: ldrd r0, r1, [r2], #-0
+# CHECK: ldrd r0, r1, [r2, #-0]!
+# CHECK: ldrd r0, r1, [r2, #-0]
0xd6 0xe9 0x06 0x35
0xf6 0xe9 0x06 0x35
@@ -616,6 +651,9 @@
0x76 0xe8 0x02 0x35
0xd6 0xe9 0x00 0x35
0xd3 0xe9 0x00 0x81
+0x72 0xe8 0x00 0x01
+0x72 0xe9 0x00 0x01
+0x52 0xe9 0x00 0x01
#------------------------------------------------------------------------------
@@ -1790,12 +1828,16 @@
# STRD (immediate)
#------------------------------------------------------------------------------
# CHECK: strd r6, r3, [r5], #-8
-# CHECK: strd r8, r5, [r5]{{$}}
+# CHECK: strd r8, r5, [r5], #-0
# CHECK: strd r7, r4, [r5], #-4
+# CHECK: strd r0, r1, [r2, #-0]!
+# CHECK: strd r0, r1, [r2, #-0]
0x65 0xe8 0x02 0x63
0x65 0xe8 0x00 0x85
0x65 0xe8 0x01 0x74
+0x62 0xe9 0x00 0x01
+0x42 0xe9 0x00 0x01
#------------------------------------------------------------------------------
# STREX/STREXB/STREXH/STREXD
diff --git a/test/MC/Disassembler/ARM/unpredictable-ADC-arm.txt b/test/MC/Disassembler/ARM/unpredictable-ADC-arm.txt
index 275bae2f..d5c8cbb 100644
--- a/test/MC/Disassembler/ARM/unpredictable-ADC-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-ADC-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: potentially undefined
# CHECK: 0x1f 0x12 0xb0 0x00
diff --git a/test/MC/Disassembler/ARM/unpredictable-ADDREXT3-arm.txt b/test/MC/Disassembler/ARM/unpredictable-ADDREXT3-arm.txt
index 635b66e..d251eb4 100644
--- a/test/MC/Disassembler/ARM/unpredictable-ADDREXT3-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-ADDREXT3-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: potentially undefined
# CHECK: 0xd1 0xf1 0x5f 0x01
diff --git a/test/MC/Disassembler/ARM/unpredictable-AExtI-arm.txt b/test/MC/Disassembler/ARM/unpredictable-AExtI-arm.txt
new file mode 100644
index 0000000..d0cb520
--- /dev/null
+++ b/test/MC/Disassembler/ARM/unpredictable-AExtI-arm.txt
@@ -0,0 +1,62 @@
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s -check-prefix=CHECK-WARN
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x03 0xaf 0x06
+# CHECK: sxtb
+0x74 0x03 0xaf 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xbf 0x06
+# CHECK: sxth
+0x74 0x3f 0xbf 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xa6 0x06
+# CHECK: sxtab
+0x74 0x3f 0xa6 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xb7 0x06
+# CHECK: sxtah
+0x74 0x3f 0xb7 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0x8f 0x06
+# CHECK: sxtb16
+0x74 0x3f 0x8f 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0x86 0x06
+# CHECK: sxtab16
+0x74 0x3f 0x86 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xef 0x06
+# CHECK: uxtb
+0x74 0x3f 0xef 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xff 0x06
+# CHECK: uxth
+0x74 0x3f 0xff 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xcf 0x06
+# CHECK: uxtb16
+0x74 0x3f 0xcf 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xe4 0x06
+# CHECK: uxtab
+0x74 0x3f 0xe4 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xf2 0x06
+# CHECK: uxtah
+0x74 0x3f 0xf2 0x06
+
+# CHECK-WARN: potentially undefined
+# CHECK-WARN: 0x74 0x3f 0xc4 0x06
+# CHECK: uxtab16
+0x74 0x3f 0xc4 0x06
diff --git a/test/MC/Disassembler/ARM/unpredictable-AI1cmp-arm.txt b/test/MC/Disassembler/ARM/unpredictable-AI1cmp-arm.txt
index dac4390..554ae53 100644
--- a/test/MC/Disassembler/ARM/unpredictable-AI1cmp-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-AI1cmp-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: potentially undefined
# CHECK: 0x01 0x10 0x50 0x03
diff --git a/test/MC/Disassembler/ARM/unpredictable-LDR-arm.txt b/test/MC/Disassembler/ARM/unpredictable-LDR-arm.txt
index ed5e350..66073a8 100644
--- a/test/MC/Disassembler/ARM/unpredictable-LDR-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-LDR-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: potentially undefined
# CHECK: 0xff 0x00 0xb9 0x00
diff --git a/test/MC/Disassembler/ARM/unpredictable-LDRD-arm.txt b/test/MC/Disassembler/ARM/unpredictable-LDRD-arm.txt
index a8f54f7..572d844 100644
--- a/test/MC/Disassembler/ARM/unpredictable-LDRD-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-LDRD-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
# -------------------------------------------------------------------------------------------------
diff --git a/test/MC/Disassembler/ARM/unpredictable-LSL-regform.txt b/test/MC/Disassembler/ARM/unpredictable-LSL-regform.txt
index f7d6bc6..9c26953 100644
--- a/test/MC/Disassembler/ARM/unpredictable-LSL-regform.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-LSL-regform.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# Opcode=196 Name=MOVs Format=ARM_FORMAT_DPSOREGFRM(5)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/unpredictable-MRRC2-arm.txt b/test/MC/Disassembler/ARM/unpredictable-MRRC2-arm.txt
index 26b286d..439aaed 100644
--- a/test/MC/Disassembler/ARM/unpredictable-MRRC2-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-MRRC2-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: potentially undefined
# CHECK: 0x00 0x10 0x51 0xfc
diff --git a/test/MC/Disassembler/ARM/unpredictable-MRS-arm.txt b/test/MC/Disassembler/ARM/unpredictable-MRS-arm.txt
index 3e472cd..d785341 100644
--- a/test/MC/Disassembler/ARM/unpredictable-MRS-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-MRS-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# CHECK: warning: potentially undefined
# CHECK: 0x00 0xf0 0x0f 0x01
diff --git a/test/MC/Disassembler/ARM/unpredictable-MUL-arm.txt b/test/MC/Disassembler/ARM/unpredictable-MUL-arm.txt
index 3db86cc..472868f 100644
--- a/test/MC/Disassembler/ARM/unpredictable-MUL-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-MUL-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: potentially undefined
# CHECK: 0x93 0x12 0x01 0x00
diff --git a/test/MC/Disassembler/ARM/unpredictable-RSC-arm.txt b/test/MC/Disassembler/ARM/unpredictable-RSC-arm.txt
index 5b13610..fdfda6d 100644
--- a/test/MC/Disassembler/ARM/unpredictable-RSC-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-RSC-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# Opcode=261 Name=RSCrs Format=ARM_FORMAT_DPSOREGFRM(5)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/unpredictable-SEL-arm.txt b/test/MC/Disassembler/ARM/unpredictable-SEL-arm.txt
new file mode 100644
index 0000000..a2a8770
--- /dev/null
+++ b/test/MC/Disassembler/ARM/unpredictable-SEL-arm.txt
@@ -0,0 +1,5 @@
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
+
+# CHECK: potentially undefined
+# CHECK: 0xb4 0x38 0x80 0x06
+0xb4 0x38 0x80 0x06
diff --git a/test/MC/Disassembler/ARM/unpredictable-SHADD16-arm.txt b/test/MC/Disassembler/ARM/unpredictable-SHADD16-arm.txt
index 8ec49ca..741d059 100644
--- a/test/MC/Disassembler/ARM/unpredictable-SHADD16-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-SHADD16-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# CHECK: warning: potentially undefined
# CHECK: shadd16 r5, r7, r0
diff --git a/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt b/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt
index 874378e..832aa3f 100644
--- a/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# Opcode=322 Name=SSAT Format=ARM_FORMAT_SATFRM(13)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/unpredictable-STRBrs-arm.txt b/test/MC/Disassembler/ARM/unpredictable-STRBrs-arm.txt
index fef6125..5e62802 100644
--- a/test/MC/Disassembler/ARM/unpredictable-STRBrs-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-STRBrs-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# Opcode=355 Name=STRBrs Format=ARM_FORMAT_STFRM(7)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/unpredictable-UQADD8-arm.txt b/test/MC/Disassembler/ARM/unpredictable-UQADD8-arm.txt
index 4c4c9ab..85b52dd 100644
--- a/test/MC/Disassembler/ARM/unpredictable-UQADD8-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-UQADD8-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s
# Opcode=426 Name=UQADD8 Format=ARM_FORMAT_DPFRM(4)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
diff --git a/test/MC/Disassembler/ARM/unpredictable-swp-arm.txt b/test/MC/Disassembler/ARM/unpredictable-swp-arm.txt
index 64bb171..eef5d9f 100644
--- a/test/MC/Disassembler/ARM/unpredictable-swp-arm.txt
+++ b/test/MC/Disassembler/ARM/unpredictable-swp-arm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-linux-gnueabi 2>&1 | FileCheck %s
# CHECK: potentially undefined
# CHECK: 0x9f 0x10 0x03 0x01
diff --git a/test/MC/Disassembler/ARM/unpredictables-thumb.txt b/test/MC/Disassembler/ARM/unpredictables-thumb.txt
index e7645f0..925dcd3 100644
--- a/test/MC/Disassembler/ARM/unpredictables-thumb.txt
+++ b/test/MC/Disassembler/ARM/unpredictables-thumb.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=thumbv7 2>&1 | FileCheck %s
0x01 0x47
# CHECK: 3:1: warning: potentially undefined
diff --git a/test/MC/Disassembler/Mips/lit.local.cfg b/test/MC/Disassembler/Mips/lit.local.cfg
new file mode 100644
index 0000000..9b698b2
--- /dev/null
+++ b/test/MC/Disassembler/Mips/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.txt']
+
+targets = set(config.root.targets_to_build.split())
+if not 'Mips' in targets:
+ config.unsupported = True
+
diff --git a/test/MC/Disassembler/Mips/mips32.txt b/test/MC/Disassembler/Mips/mips32.txt
index 591d8c4..a193319 100644
--- a/test/MC/Disassembler/Mips/mips32.txt
+++ b/test/MC/Disassembler/Mips/mips32.txt
@@ -1,421 +1,406 @@
-# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: abs.d $f12, $f14
+0x46 0x20 0x73 0x05
-# CHECK: abs.d $f12,$f14
-0x46 0x20 0x39 0x85
-
-# CHECK: abs.s $f6,$f7
+# CHECK: abs.s $f6, $f7
0x46 0x00 0x39 0x85
-# CHECK: add t1,a2,a3
+# CHECK: add $9, $6, $7
0x00 0xc7 0x48 0x20
-# CHECK: add.d $f18,$f12,$f14
-0x46 0x27 0x32 0x40
+# CHECK: add.d $f8, $f12, $f14
+0x46 0x2e 0x62 0x00
-# CHECK: add.s $f9,$f6,$f7
+# CHECK: add.s $f9, $f6, $f7
0x46 0x07 0x32 0x40
-# CHECK: addi t1,a2,17767
+# CHECK: addi $9, $6, 17767
0x20 0xc9 0x45 0x67
-# CHECK: addiu t1,a2,-15001
+# CHECK: addiu $9, $6, -15001
0x24 0xc9 0xc5 0x67
-# CHECK: addu t1,a2,a3
+# CHECK: addu $9, $6, $7
0x00 0xc7 0x48 0x21
-# CHECK: and t1,a2,a3
+# CHECK: and $9, $6, $7
0x00 0xc7 0x48 0x24
-# CHECK: andi t1,a2,0x4567
+# CHECK: andi $9, $6, 17767
0x30 0xc9 0x45 0x67
-# CHECK: b 00000534
+# CHECK: b 1332
0x10 0x00 0x01 0x4c
-# CHECK: bal 00000534
-0x04 0x11 0x01 0x4c
-
-# CHECK: bc1f 00000534
+# CHECK: bc1f 1332
0x45 0x00 0x01 0x4c
-# CHECK: bc1t 00000534
+# CHECK: bc1t 1332
0x45 0x01 0x01 0x4c
-# CHECK: beq t1,a2,00000534
+# CHECK: beq $9, $6, 1332
0x11 0x26 0x01 0x4c
-# CHECK: bgez a2,00000534
+# CHECK: bgez $6, 1332
0x04 0xc1 0x01 0x4c
-# CHECK: bgezal a2,00000534
+# CHECK: bgezal $6, 1332
0x04 0xd1 0x01 0x4c
-# CHECK: bgtz a2,00000534
+# CHECK: bgtz $6, 1332
0x1c 0xc0 0x01 0x4c
-# CHECK: blez a2,00000534
+# CHECK: blez $6, 1332
0x18 0xc0 0x01 0x4c
-# CHECK: bne t1,a2,00000534
+# CHECK: bne $9, $6, 1332
0x15 0x26 0x01 0x4c
-# CHECK: c.eq.d $f12,$f14
-0x46 0x27 0x30 0x32
+# CHECK: c.eq.d $f12, $f14
+0x46 0x2e 0x60 0x32
-# CHECK: c.eq.s $f6,$f7
+# CHECK: c.eq.s $f6, $f7
0x46 0x07 0x30 0x32
-# CHECK: c.f.d $f12,$f14
-0x46 0x27 0x30 0x30
+# CHECK: c.f.d $f12, $f14
+0x46 0x2e 0x60 0x30
-# CHECK: c.f.s $f6,$f7
+# CHECK: c.f.s $f6, $f7
0x46 0x07 0x30 0x30
-# CHECK: c.le.d $f12,$f14
-0x46 0x27 0x30 0x3e
+# CHECK: c.le.d $f12, $f14
+0x46 0x2e 0x60 0x3e
-# CHECK: c.le.s $f6,$f7
+# CHECK: c.le.s $f6, $f7
0x46 0x07 0x30 0x3e
-# CHECK: c.lt.d $f12,$f14
-0x46 0x27 0x30 0x3c
+# CHECK: c.lt.d $f12, $f14
+0x46 0x2e 0x60 0x3c
-# CHECK: c.lt.s $f6,$f7
+# CHECK: c.lt.s $f6, $f7
0x46 0x07 0x30 0x3c
-# CHECK: c.nge.d $f12,$f14
-0x46 0x27 0x30 0x3d
+# CHECK: c.nge.d $f12, $f14
+0x46 0x2e 0x60 0x3d
-# CHECK: c.nge.s $f6,$f7
+# CHECK: c.nge.s $f6, $f7
0x46 0x07 0x30 0x3d
-# CHECK: c.ngl.d $f12,$f14
-0x46 0x27 0x30 0x3b
+# CHECK: c.ngl.d $f12, $f14
+0x46 0x2e 0x60 0x3b
-# CHECK: c.ngl.s $f6,$f7
+# CHECK: c.ngl.s $f6, $f7
0x46 0x07 0x30 0x3b
-# CHECK: c.ngle.d $f12,$f14
-0x46 0x27 0x30 0x39
+# CHECK: c.ngle.d $f12, $f14
+0x46 0x2e 0x60 0x39
-# CHECK: c.ngle.s $f6,$f7
+# CHECK: c.ngle.s $f6, $f7
0x46 0x07 0x30 0x39
-# CHECK: c.ngt.d $f12,$f14
-0x46 0x27 0x30 0x3f
+# CHECK: c.ngt.d $f12, $f14
+0x46 0x2e 0x60 0x3f
-# CHECK: c.ngt.s $f6,$f7
+# CHECK: c.ngt.s $f6, $f7
0x46 0x07 0x30 0x3f
-# CHECK: c.ole.d $f12,$f14
-0x46 0x27 0x30 0x36
+# CHECK: c.ole.d $f12, $f14
+0x46 0x2e 0x60 0x36
-# CHECK: c.ole.s $f6,$f7
+# CHECK: c.ole.s $f6, $f7
0x46 0x07 0x30 0x36
-# CHECK: c.olt.d $f12,$f14
-0x46 0x27 0x30 0x34
+# CHECK: c.olt.d $f12, $f14
+0x46 0x2e 0x60 0x34
-# CHECK: c.olt.s $f6,$f7
+# CHECK: c.olt.s $f6, $f7
0x46 0x07 0x30 0x34
-# CHECK: c.seq.d $f12,$f14
-0x46 0x27 0x30 0x3a
+# CHECK: c.seq.d $f12, $f14
+0x46 0x2e 0x60 0x3a
-# CHECK: c.seq.s $f6,$f7
+# CHECK: c.seq.s $f6, $f7
0x46 0x07 0x30 0x3a
-# CHECK: c.sf.d $f12,$f14
-0x46 0x27 0x30 0x38
+# CHECK: c.sf.d $f12, $f14
+0x46 0x2e 0x60 0x38
-# CHECK: c.sf.s $f6,$f7
+# CHECK: c.sf.s $f6, $f7
0x46 0x07 0x30 0x38
-# CHECK: c.ueq.d $f12,$f14
-0x46 0x27 0x30 0x33
+# CHECK: c.ueq.d $f12, $f14
+0x46 0x2e 0x60 0x33
-# CHECK: c.ueq.s $f28,$f18
+# CHECK: c.ueq.s $f28, $f18
0x46 0x12 0xe0 0x33
-# CHECK: c.ule.d $f12,$f14
-0x46 0x27 0x30 0x37
+# CHECK: c.ule.d $f12, $f14
+0x46 0x2e 0x60 0x37
-# CHECK: c.ule.s $f6,$f7
+# CHECK: c.ule.s $f6, $f7
0x46 0x07 0x30 0x37
-# CHECK: c.ult.d $f12,$f14
-0x46 0x27 0x30 0x35
+# CHECK: c.ult.d $f12, $f14
+0x46 0x2e 0x60 0x35
-# CHECK: c.ult.s $f6,$f7
+# CHECK: c.ult.s $f6, $f7
0x46 0x07 0x30 0x35
-# CHECK: c.un.d $f12,$f14
-0x46 0x27 0x30 0x31
+# CHECK: c.un.d $f12, $f14
+0x46 0x2e 0x60 0x31
-# CHECK: c.un.s $f6,$f7
+# CHECK: c.un.s $f6, $f7
0x46 0x07 0x30 0x31
-# CHECK: ceil.w.d $f12,$f14
-0x46 0x20 0x39 0x8e
+# CHECK: ceil.w.d $f12, $f14
+0x46 0x20 0x73 0x0e
-# CHECK: ceil.w.s $f6,$f7
+# CHECK: ceil.w.s $f6, $f7
0x46 0x00 0x39 0x8e
-# CHECK: cfc1 a2,$7
+# CHECK: cfc1 $6, $7
0x44 0x46 0x38 0x00
-# CHECK: clo a2,a3
+# CHECK: clo $6, $7
0x70 0xe6 0x30 0x21
-# CHECK: clz a2,a3
+# CHECK: clz $6, $7
0x70 0xe6 0x30 0x20
-# CHECK: ctc1 a2,$7
+# CHECK: ctc1 $6, $7
0x44 0xc6 0x38 0x00
-# CHECK: cvt.d.s $f6,$f7
-0x46 0x00 0x38 0xa1
-
-# CHECK: cvt.d.w $f12,$f14
-0x46 0x80 0x38 0xa1
-
-# CHECK: cvt.l.d $f12,$f14
-0x46 0x20 0x39 0xa5
+# CHECK: cvt.d.s $f6, $f7
+0x46 0x00 0x39 0xa1
-# CHECK: cvt.l.s $f6,$f7
-0x46 0x00 0x39 0xa5
+# CHECK: cvt.d.w $f12, $f14
+0x46 0x80 0x73 0x21
-# CHECK: cvt.s.d $f12,$f14
-0x46 0x20 0x39 0xa0
+# CHECK: cvt.s.d $f12, $f14
+0x46 0x20 0x73 0x20
-# CHECK: cvt.s.w $f6,$f7
+# CHECK: cvt.s.w $f6, $f7
0x46 0x80 0x39 0xa0
-# CHECK: cvt.w.d $f12,$f14
-0x46 0x20 0x39 0xa4
+# CHECK: cvt.w.d $f12, $f14
+0x46 0x20 0x73 0x24
-# CHECK: cvt.w.s $f6,$f7
+# CHECK: cvt.w.s $f6, $f7
0x46 0x00 0x39 0xa4
-# CHECK: floor.w.d $f12,$f14
-0x46 0x20 0x39 0x8f
+# CHECK: floor.w.d $f12, $f14
+0x46 0x20 0x73 0x0f
-# CHECK: floor.w.s $f6,$f7
+# CHECK: floor.w.s $f6, $f7
0x46 0x00 0x39 0x8f
-# CHECK: j 00000530
+# CHECK: j 1328
0x08 0x00 0x01 0x4c
-# CHECK: jal 00000530
+# CHECK: jal 1328
0x0c 0x00 0x01 0x4c
-# CHECK: jalr a2,a3
+# CHECK: jalr $7
0x00 0xe0 0xf8 0x09
-# CHECK: jr a3
+# CHECK: jr $7
0x00 0xe0 0x00 0x08
-# CHECK: lb a0,9158(a1)
+# CHECK: lb $4, 9158($5)
0x80 0xa4 0x23 0xc6
-# CHECK: lbu a0,6(a1)
+# CHECK: lbu $4, 6($5)
0x90 0xa4 0x00 0x06
-# CHECK: ldc1 $f9,9158(a3)
+# CHECK: ldc1 $f9, 9158($7)
0xd4 0xe9 0x23 0xc6
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x84 0xa4 0x00 0x0c
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x84 0xa4 0x00 0x0c
-# CHECK: li v1,17767
-0x24 0x03 0x45 0x67
-
-# CHECK: ll t1,9158(a3)
+# CHECK: ll $9, 9158($7)
0xc0 0xe9 0x23 0xc6
-# CHECK: lui a2,0x4567
+# CHECK: lui $6, 17767
0x3c 0x06 0x45 0x67
-# CHECK: lw a0,24(a1)
+# CHECK: lw $4, 24($5)
0x8c 0xa4 0x00 0x18
-# CHECK: lwc1 $f9,9158(a3)
+# CHECK: lwc1 $f9, 9158($7)
0xc4 0xe9 0x23 0xc6
-# CHECK: madd a2,a3
+# CHECK: lwl $2, 3($4)
+0x88 0x82 0x00 0x03
+
+# CHECK: lwr $3, 16($5)
+0x98 0xa3 0x00 0x10
+
+# CHECK: madd $6, $7
0x70 0xc7 0x00 0x00
-# CHECK: maddu a2,a3
+# CHECK: maddu $6, $7
0x70 0xc7 0x00 0x01
-# CHECK: mfc1 a2,$f7
+# CHECK: mfc1 $6, $f7
0x44 0x06 0x38 0x00
-# CHECK: mfhi a1
+# CHECK: mfhi $5
0x00 0x00 0x28 0x10
-# CHECK: mflo a1
+# CHECK: mflo $5
0x00 0x00 0x28 0x12
-# CHECK: mov.d $f6,$f7
-0x46 0x20 0x39 0x86
+# CHECK: mov.d $f6, $f8
+0x46 0x20 0x41 0x86
-# CHECK: mov.s $f6,$f7
+# CHECK: mov.s $f6, $f7
0x46 0x00 0x39 0x86
-# CHECK: move a2,a1
-0x00 0xa0 0x30 0x21
-
-# CHECK: msub a2,a3
+# CHECK: msub $6, $7
0x70 0xc7 0x00 0x04
-# CHECK: msubu a2,a3
+# CHECK: msubu $6, $7
0x70 0xc7 0x00 0x05
-# CHECK: mtc1 a2,$f7
+# CHECK: mtc1 $6, $f7
0x44 0x86 0x38 0x00
-# CHECK: mthi a3
+# CHECK: mthi $7
0x00 0xe0 0x00 0x11
-# CHECK: mtlo a3
+# CHECK: mtlo $7
0x00 0xe0 0x00 0x13
-# CHECK: mul.d $f9,$f12,$f14
-0x46 0x27 0x32 0x42
+# CHECK: mul.d $f8, $f12, $f14
+0x46 0x2e 0x62 0x02
-# CHECK: mul.s $f9,$f6,$f7
+# CHECK: mul.s $f9, $f6, $f7
0x46 0x07 0x32 0x42
-# CHECK: mul t1,a2,a3
+# CHECK: mul $9, $6, $7
0x70 0xc7 0x48 0x02
-# CHECK: mult v1,a1
+# CHECK: mult $3, $5
0x00 0x65 0x00 0x18
-# CHECK: multu v1,a1
+# CHECK: multu $3, $5
0x00 0x65 0x00 0x19
-# CHECK: neg.d $f12,$f14
-0x46 0x20 0x39 0x87
+# CHECK: neg.d $f12, $f14
+0x46 0x20 0x73 0x07
-# CHECK: neg.s $f6,$f7
+# CHECK: neg.s $f6, $f7
0x46 0x00 0x39 0x87
-# CHECK: neg v1,a1
-0x00 0x05 0x18 0x22
-
# CHECK: nop
0x00 0x00 0x00 0x00
-# CHECK: nor t1,a2,a3
+# CHECK: nor $9, $6, $7
0x00 0xc7 0x48 0x27
-# CHECK: not v1,a1
-0x00 0xa0 0x18 0x27
-
-# CHECK: or v1,v1,a1
+# CHECK: or $3, $3, $5
0x00 0x65 0x18 0x25
-# CHECK: ori t1,a2,0x4567
+# CHECK: ori $9, $6, 17767
0x34 0xc9 0x45 0x67
-# CHECK: rdhwr a2,$29
-0x7c 0x06 0xe8 0x3b
+# CHECK: round.w.d $f12, $f14
+0x46 0x20 0x73 0x0c
-# CHECK: round.w.d $f12,$f14
-0x46 0x20 0x39 0x8c
-
-# CHECK: round.w.s $f6,$f7
+# CHECK: round.w.s $f6, $f7
0x46 0x00 0x39 0x8c
-# CHECK: sb a0,9158(a1)
+# CHECK: sb $4, 9158($5)
0xa0 0xa4 0x23 0xc6
-# CHECK: sb a0,6(a1)
+# CHECK: sb $4, 6($5)
0xa0 0xa4 0x00 0x06
-# CHECK: sc t1,9158(a3)
+# CHECK: sc $9, 9158($7)
0xe0 0xe9 0x23 0xc6
-# CHECK: sdc1 $f9,9158(a3)
+# CHECK: sdc1 $f9, 9158($7)
0xf4 0xe9 0x23 0xc6
-# CHECK: sh a0,9158(a1)
+# CHECK: sh $4, 9158($5)
0xa4 0xa4 0x23 0xc6
-# CHECK: sll a0,v1,0x7
+# CHECK: sll $4, $3, 7
0x00 0x03 0x21 0xc0
-# CHECK: sllv v0,v1,a1
+# CHECK: sllv $2, $3, $5
0x00 0xa3 0x10 0x04
-# CHECK: slt v1,v1,a1
+# CHECK: slt $3, $3, $5
0x00 0x65 0x18 0x2a
-# CHECK: slti v1,v1,103
+# CHECK: slti $3, $3, 103
0x28 0x63 0x00 0x67
-# CHECK: sltiu v1,v1,103
+# CHECK: sltiu $3, $3, 103
0x2c 0x63 0x00 0x67
-# CHECK: sltu v1,v1,a1
+# CHECK: sltu $3, $3, $5
0x00 0x65 0x18 0x2b
-# CHECK: sqrt.d $f12,$f14
-0x46 0x20 0x39 0x84
+# CHECK: sqrt.d $f12, $f14
+0x46 0x20 0x73 0x04
-# CHECK: sqrt.s $f6,$f7
+# CHECK: sqrt.s $f6, $f7
0x46 0x00 0x39 0x84
-# CHECK: sra a0,v1,0x7
+# CHECK: sra $4, $3, 7
0x00 0x03 0x21 0xc3
-# CHECK: sra a0,v1,0x7
-0x00 0x03 0x21 0xc3
-
-# CHECK: srav v0,v1,a1
+# CHECK: srav $2, $3, $5
0x00 0xa3 0x10 0x07
-# CHECK: srl a0,v1,0x7
+# CHECK: srl $4, $3, 7
0x00 0x03 0x21 0xc2
-# CHECK: srlv v0,v1,a1
+# CHECK: srlv $2, $3, $5
0x00 0xa3 0x10 0x06
-# CHECK: sub.d $f9,$f12,$f14
-0x46 0x27 0x32 0x41
+# CHECK: sub.d $f8, $f12, $f14
+0x46 0x2e 0x62 0x01
-# CHECK: sub.s $f9,$f6,$f7
+# CHECK: sub.s $f9, $f6, $f7
0x46 0x07 0x32 0x41
-# CHECK: sub t1,a2,a3
+# CHECK: sub $9, $6, $7
0x00 0xc7 0x48 0x22
-# CHECK: subu a0,v1,a1
+# CHECK: subu $4, $3, $5
0x00 0x65 0x20 0x23
-# CHECK: sw a0,24(a1)
+# CHECK: sw $4, 24($5)
0xac 0xa4 0x00 0x18
-# CHECK: swc1 $f9,9158(a3)
+# CHECK: swc1 $f9, 9158($7)
0xe4 0xe9 0x23 0xc6
-# CHECK: sync 0x7
+# CHECK: swl $4, 16($5)
+0xa8 0xa4 0x00 0x10
+
+# CHECK: swr $6, 16($7)
+0xb8 0xe6 0x00 0x10
+
+# CHECK: sync 7
0x00 0x00 0x01 0xcf
-# CHECK: trunc.w.d $f12,$f14
-0x46 0x20 0x39 0x8d
+# CHECK: trunc.w.d $f12, $f14
+0x46 0x20 0x73 0x0d
-# CHECK: trunc.w.s $f6,$f7
+# CHECK: trunc.w.s $f6, $f7
0x46 0x00 0x39 0x8d
-# CHECK: xor v1,v1,a1
+# CHECK: xor $3, $3, $5
0x00 0x65 0x18 0x26
-# CHECK: xori t1,a2,0x4567
+# CHECK: xori $9, $6, 17767
0x38 0xc9 0x45 0x67
diff --git a/test/MC/Disassembler/Mips/mips32_le.txt b/test/MC/Disassembler/Mips/mips32_le.txt
index a5a3cfd..08b3672 100644
--- a/test/MC/Disassembler/Mips/mips32_le.txt
+++ b/test/MC/Disassembler/Mips/mips32_le.txt
@@ -1,424 +1,406 @@
-# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: abs.d $f12, $f14
+0x05 0x73 0x20 0x46
-# CHECK: abs.d $f12,$f14
-0x85 0x39 0x20 0x46
-
-# CHECK: abs.s $f6,$f7
+# CHECK: abs.s $f6, $f7
0x85 0x39 0x00 0x46
-# CHECK: add t1,a2,a3
+# CHECK: add $9, $6, $7
0x20 0x48 0xc7 0x00
-# CHECK: add.d $f18,$f12,$f14
-0x40 0x32 0x27 0x46
+# CHECK: add.d $f8, $f12, $f14
+0x00 0x62 0x2e 0x46
-# CHECK: add.s $f9,$f6,$f7
+# CHECK: add.s $f9, $f6, $f7
0x40 0x32 0x07 0x46
-# CHECK: addi t1,a2,17767
+# CHECK: addi $9, $6, 17767
0x67 0x45 0xc9 0x20
-# CHECK: addiu t1,a2,-15001
+# CHECK: addiu $9, $6, -15001
0x67 0xc5 0xc9 0x24
-# CHECK: addu t1,a2,a3
+# CHECK: addu $9, $6, $7
0x21 0x48 0xc7 0x00
-# CHECK: and t1,a2,a3
+# CHECK: and $9, $6, $7
0x24 0x48 0xc7 0x00
-# CHECK: andi t1,a2,0x4567
+# CHECK: andi $9, $6, 17767
0x67 0x45 0xc9 0x30
-# CHECK: b 00000534
+# CHECK: b 1332
0x4c 0x01 0x00 0x10
-# CHECK: bal 00000534
-0x4c 0x01 0x11 0x04
-
-# CHECK: bc1f 00000534
+# CHECK: bc1f 1332
0x4c 0x01 0x00 0x45
-# CHECK: bc1t 00000534
+# CHECK: bc1t 1332
0x4c 0x01 0x01 0x45
-# CHECK: beq t1,a2,00000534
+# CHECK: beq $9, $6, 1332
0x4c 0x01 0x26 0x11
-# CHECK: bgez a2,00000534
+# CHECK: bgez $6, 1332
0x4c 0x01 0xc1 0x04
-# CHECK: bgezal a2,00000534
+# CHECK: bgezal $6, 1332
0x4c 0x01 0xd1 0x04
-# CHECK: bgtz a2,00000534
+# CHECK: bgtz $6, 1332
0x4c 0x01 0xc0 0x1c
-# CHECK: blez a2,00000534
+# CHECK: blez $6, 1332
0x4c 0x01 0xc0 0x18
-# CHECK: bne t1,a2,00000534
+# CHECK: bne $9, $6, 1332
0x4c 0x01 0x26 0x15
-# CHECK: c.eq.d $f12,$f14
-0x32 0x30 0x27 0x46
+# CHECK: c.eq.d $f12, $f14
+0x32 0x60 0x2e 0x46
-# CHECK: c.eq.s $f6,$f7
+# CHECK: c.eq.s $f6, $f7
0x32 0x30 0x07 0x46
-# CHECK: c.f.d $f12,$f14
-0x30 0x30 0x27 0x46
+# CHECK: c.f.d $f12, $f14
+0x30 0x60 0x2e 0x46
-# CHECK: c.f.s $f6,$f7
+# CHECK: c.f.s $f6, $f7
0x30 0x30 0x07 0x46
-# CHECK: c.le.d $f12,$f14
-0x3e 0x30 0x27 0x46
+# CHECK: c.le.d $f12, $f14
+0x3e 0x60 0x2e 0x46
-# CHECK: c.le.s $f6,$f7
+# CHECK: c.le.s $f6, $f7
0x3e 0x30 0x07 0x46
-# CHECK: c.lt.d $f12,$f14
-0x3c 0x30 0x27 0x46
+# CHECK: c.lt.d $f12, $f14
+0x3c 0x60 0x2e 0x46
-# CHECK: c.lt.s $f6,$f7
+# CHECK: c.lt.s $f6, $f7
0x3c 0x30 0x07 0x46
-# CHECK: c.nge.d $f12,$f14
-0x3d 0x30 0x27 0x46
+# CHECK: c.nge.d $f12, $f14
+0x3d 0x60 0x2e 0x46
-# CHECK: c.nge.s $f6,$f7
+# CHECK: c.nge.s $f6, $f7
0x3d 0x30 0x07 0x46
-# CHECK: c.ngl.d $f12,$f14
-0x3b 0x30 0x27 0x46
+# CHECK: c.ngl.d $f12, $f14
+0x3b 0x60 0x2e 0x46
-# CHECK: c.ngl.s $f6,$f7
+# CHECK: c.ngl.s $f6, $f7
0x3b 0x30 0x07 0x46
-# CHECK: c.ngle.d $f12,$f14
-0x39 0x30 0x27 0x46
+# CHECK: c.ngle.d $f12, $f14
+0x39 0x60 0x2e 0x46
-# CHECK: c.ngle.s $f6,$f7
+# CHECK: c.ngle.s $f6, $f7
0x39 0x30 0x07 0x46
-# CHECK: c.ngt.d $f12,$f14
-0x3f 0x30 0x27 0x46
+# CHECK: c.ngt.d $f12, $f14
+0x3f 0x60 0x2e 0x46
-# CHECK: c.ngt.s $f6,$f7
+# CHECK: c.ngt.s $f6, $f7
0x3f 0x30 0x07 0x46
-# CHECK: c.ole.d $f12,$f14
-0x36 0x30 0x27 0x46
+# CHECK: c.ole.d $f12, $f14
+0x36 0x60 0x2e 0x46
-# CHECK: c.ole.s $f6,$f7
+# CHECK: c.ole.s $f6, $f7
0x36 0x30 0x07 0x46
-# CHECK: c.olt.d $f12,$f14
-0x34 0x30 0x27 0x46
+# CHECK: c.olt.d $f12, $f14
+0x34 0x60 0x2e 0x46
-# CHECK: c.olt.s $f6,$f7
+# CHECK: c.olt.s $f6, $f7
0x34 0x30 0x07 0x46
-# CHECK: c.seq.d $f12,$f14
-0x3a 0x30 0x27 0x46
+# CHECK: c.seq.d $f12, $f14
+0x3a 0x60 0x2e 0x46
-# CHECK: c.seq.s $f6,$f7
+# CHECK: c.seq.s $f6, $f7
0x3a 0x30 0x07 0x46
-# CHECK: c.sf.d $f12,$f14
-0x38 0x30 0x27 0x46
+# CHECK: c.sf.d $f12, $f14
+0x38 0x60 0x2e 0x46
-# CHECK: c.sf.s $f6,$f7
+# CHECK: c.sf.s $f6, $f7
0x38 0x30 0x07 0x46
-# CHECK: c.ueq.d $f12,$f14
-0x33 0x30 0x27 0x46
+# CHECK: c.ueq.d $f12, $f14
+0x33 0x60 0x2e 0x46
-# CHECK: c.ueq.s $f28,$f18
+# CHECK: c.ueq.s $f28, $f18
0x33 0xe0 0x12 0x46
-# CHECK: c.ule.d $f12,$f14
-0x37 0x30 0x27 0x46
+# CHECK: c.ule.d $f12, $f14
+0x37 0x60 0x2e 0x46
-# CHECK: c.ule.s $f6,$f7
+# CHECK: c.ule.s $f6, $f7
0x37 0x30 0x07 0x46
-# CHECK: c.ult.d $f12,$f14
-0x35 0x30 0x27 0x46
+# CHECK: c.ult.d $f12, $f14
+0x35 0x60 0x2e 0x46
-# CHECK: c.ult.s $f6,$f7
+# CHECK: c.ult.s $f6, $f7
0x35 0x30 0x07 0x46
-# CHECK: c.un.d $f12,$f14
-0x31 0x30 0x27 0x46
+# CHECK: c.un.d $f12, $f14
+0x31 0x60 0x2e 0x46
-# CHECK: c.un.s $f6,$f7
+# CHECK: c.un.s $f6, $f7
0x31 0x30 0x07 0x46
-# CHECK: ceil.w.d $f12,$f14
-0x8e 0x38 0x20 0x46
+# CHECK: ceil.w.d $f12, $f14
+0x0e 0x73 0x20 0x46
-# CHECK: ceil.w.s $f6,$f7
-0x8e 0x38 0x00 0x46
+# CHECK: ceil.w.s $f6, $f7
+0x8e 0x39 0x00 0x46
-# CHECK: cfc1 a2,$7
+# CHECK: cfc1 $6, $7
0x00 0x38 0x46 0x44
-# CHECK: clo a2,a3
+# CHECK: clo $6, $7
0x21 0x30 0xe6 0x70
-# CHECK: clz a2,a3
+# CHECK: clz $6, $7
0x20 0x30 0xe6 0x70
-# CHECK: ctc1 a2,$7
+# CHECK: ctc1 $6, $7
0x00 0x38 0xc6 0x44
-# CHECK: cvt.d.s $f6,$f7
+# CHECK: cvt.d.s $f6, $f7
0xa1 0x39 0x00 0x46
-# CHECK: cvt.d.w $f12,$f14
-0xa1 0x39 0x80 0x46
-
-# CHECK: cvt.l.d $f12,$f14
-0xa5 0x39 0x20 0x46
-
-# CHECK: cvt.l.s $f6,$f7
-0xa5 0x39 0x00 0x46
+# CHECK: cvt.d.w $f12, $f14
+0x21 0x73 0x80 0x46
-# CHECK: cvt.s.d $f12,$f14
-0xa0 0x39 0x20 0x46
+# CHECK: cvt.s.d $f12, $f14
+0x20 0x73 0x20 0x46
-# CHECK: cvt.s.w $f6,$f7
+# CHECK: cvt.s.w $f6, $f7
0xa0 0x39 0x80 0x46
-# CHECK: cvt.w.d $f12,$f14
-0xa4 0x39 0x20 0x46
+# CHECK: cvt.w.d $f12, $f14
+0x24 0x73 0x20 0x46
-# CHECK: cvt.w.s $f6,$f7
+# CHECK: cvt.w.s $f6, $f7
0xa4 0x39 0x00 0x46
-# CHECK: floor.w.d $f12,$f14
-0x8f 0x39 0x20 0x46
+# CHECK: floor.w.d $f12, $f14
+0x0f 0x73 0x20 0x46
-# CHECK: floor.w.s $f6,$f7
+# CHECK: floor.w.s $f6, $f7
0x8f 0x39 0x00 0x46
-# CHECK: j 00000530
+# CHECK: j 1328
0x4c 0x01 0x00 0x08
-# CHECK: jal 00000530
+# CHECK: jal 1328
0x4c 0x01 0x00 0x0c
-# CHECK: jalr a2,a3
+# CHECK: jalr $7
0x09 0xf8 0xe0 0x00
-# CHECK: jr a3
+# CHECK: jr $7
0x08 0x00 0xe0 0x00
-# CHECK: lb a0,9158(a1)
+# CHECK: lb $4, 9158($5)
0xc6 0x23 0xa4 0x80
-# CHECK: lbu a0,6(a1)
+# CHECK: lbu $4, 6($5)
0x06 0x00 0xa4 0x90
-# CHECK: ldc1 $f9,9158(a3)
+# CHECK: ldc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xd4
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x0c 0x00 0xa4 0x84
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x0c 0x00 0xa4 0x84
-# CHECK: li v1,17767
-0x67 0x45 0x03 0x24
-
-# CHECK: ll t1,9158(a3)
+# CHECK: ll $9, 9158($7)
0xc6 0x23 0xe9 0xc0
-# CHECK: lui a2,0x4567
+# CHECK: lui $6, 17767
0x67 0x45 0x06 0x3c
-# CHECK: lw a0,24(a1)
+# CHECK: lw $4, 24($5)
0x18 0x00 0xa4 0x8c
-# CHECK lw at,-18316(v0)
-0x74 0xb8 0x41 0x8c
-
-# CHECK: lwc1 $f9,9158(a3)
+# CHECK: lwc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xc4
-# CHECK: madd a2,a3
+# CHECK: lwl $2, 3($4)
+0x03 0x00 0x82 0x88
+
+# CHECK: lwr $3, 16($5)
+0x10 0x00 0xa3 0x98
+
+# CHECK: madd $6, $7
0x00 0x00 0xc7 0x70
-# CHECK: maddu a2,a3
+# CHECK: maddu $6, $7
0x01 0x00 0xc7 0x70
-# CHECK: mfc1 a2,$f7
+# CHECK: mfc1 $6, $f7
0x00 0x38 0x06 0x44
-# CHECK: mfhi a1
+# CHECK: mfhi $5
0x10 0x28 0x00 0x00
-# CHECK: mflo a1
+# CHECK: mflo $5
0x12 0x28 0x00 0x00
-# CHECK: mov.d $f12,$f14
-0x86 0x39 0x20 0x46
+# CHECK: mov.d $f6, $f8
+0x86 0x41 0x20 0x46
-# CHECK: mov.s $f6,$f7
+# CHECK: mov.s $f6, $f7
0x86 0x39 0x00 0x46
-# CHECK: move a2,a1
-0x21 0x30 0xa0 0x00
-
-# CHECK: msub a2,a3
+# CHECK: msub $6, $7
0x04 0x00 0xc7 0x70
-# CHECK: msubu a2,a3
+# CHECK: msubu $6, $7
0x05 0x00 0xc7 0x70
-# CHECK: mtc1 a2,$f7
+# CHECK: mtc1 $6, $f7
0x00 0x38 0x86 0x44
-# CHECK: mthi a3
+# CHECK: mthi $7
0x11 0x00 0xe0 0x00
-# CHECK: mtlo a3
+# CHECK: mtlo $7
0x13 0x00 0xe0 0x00
-# CHECK: mul.d $f9,$f12,$f14
-0x42 0x32 0x27 0x46
+# CHECK: mul.d $f8, $f12, $f14
+0x02 0x62 0x2e 0x46
-# CHECK: mul.s $f9,$f6,$f7
+# CHECK: mul.s $f9, $f6, $f7
0x42 0x32 0x07 0x46
-# CHECK: mul t1,a2,a3
+# CHECK: mul $9, $6, $7
0x02 0x48 0xc7 0x70
-# CHECK: mult v1,a1
+# CHECK: mult $3, $5
0x18 0x00 0x65 0x00
-# CHECK: multu v1,a1
+# CHECK: multu $3, $5
0x19 0x00 0x65 0x00
-# CHECK: neg.d $f12,$f14
-0x87 0x39 0x20 0x46
+# CHECK: neg.d $f12, $f14
+0x07 0x73 0x20 0x46
-# CHECK: neg.s $f6,$f7
+# CHECK: neg.s $f6, $f7
0x87 0x39 0x00 0x46
-# CHECK: neg v1,a1
-0x22 0x18 0x05 0x00
-
# CHECK: nop
0x00 0x00 0x00 0x00
-# CHECK: nor t1,a2,a3
+# CHECK: nor $9, $6, $7
0x27 0x48 0xc7 0x00
-# CHECK: not v1,a1
-0x27 0x18 0xa0 0x00
-
-# CHECK: or v1,v1,a1
+# CHECK: or $3, $3, $5
0x25 0x18 0x65 0x00
-# CHECK: ori t1,a2,0x4567
+# CHECK: ori $9, $6, 17767
0x67 0x45 0xc9 0x34
-# CHECK: rdhwr a2,$29
-0x3b 0xe8 0x06 0x7c
-
-# CHECK: round.w.d $f12,$f14
-0x8c 0x39 0x20 0x46
+# CHECK: round.w.d $f12, $f14
+0x0c 0x73 0x20 0x46
-# CHECK: round.w.s $f6,$f7
+# CHECK: round.w.s $f6, $f7
0x8c 0x39 0x00 0x46
-# CHECK: sb a0,9158(a1)
+# CHECK: sb $4, 9158($5)
0xc6 0x23 0xa4 0xa0
-# CHECK: sb a0,6(a1)
+# CHECK: sb $4, 6($5)
0x06 0x00 0xa4 0xa0
-# CHECK: sc t1,9158(a3)
+# CHECK: sc $9, 9158($7)
0xc6 0x23 0xe9 0xe0
-# CHECK: sdc1 $f9,9158(a3)
+# CHECK: sdc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xf4
-# CHECK: sh a0,9158(a1)
+# CHECK: sh $4, 9158($5)
0xc6 0x23 0xa4 0xa4
-# CHECK: sll a0,v1,0x7
+# CHECK: sll $4, $3, 7
0xc0 0x21 0x03 0x00
-# CHECK: sllv v0,v1,a1
+# CHECK: sllv $2, $3, $5
0x04 0x10 0xa3 0x00
-# CHECK: slt v1,v1,a1
+# CHECK: slt $3, $3, $5
0x2a 0x18 0x65 0x00
-# CHECK: slti v1,v1,103
+# CHECK: slti $3, $3, 103
0x67 0x00 0x63 0x28
-# CHECK: sltiu v1,v1,103
+# CHECK: sltiu $3, $3, 103
0x67 0x00 0x63 0x2c
-# CHECK: sltu v1,v1,a1
+# CHECK: sltu $3, $3, $5
0x2b 0x18 0x65 0x00
-# CHECK: sqrt.d $f12,$f14
-0x84 0x39 0x20 0x46
+# CHECK: sqrt.d $f12, $f14
+0x04 0x73 0x20 0x46
-# CHECK: sqrt.s $f6,$f7
+# CHECK: sqrt.s $f6, $f7
0x84 0x39 0x00 0x46
-# CHECK: sra a0,v1,0x7
+# CHECK: sra $4, $3, 7
0xc3 0x21 0x03 0x00
-# CHECK: sra a0,v1,0x7
-0xc3 0x21 0x03 0x00
-
-# CHECK: srav v0,v1,a1
+# CHECK: srav $2, $3, $5
0x07 0x10 0xa3 0x00
-# CHECK: srl a0,v1,0x7
+# CHECK: srl $4, $3, 7
0xc2 0x21 0x03 0x00
-# CHECK: srlv v0,v1,a1
+# CHECK: srlv $2, $3, $5
0x06 0x10 0xa3 0x00
-# CHECK: sub.d $f9,$f12,$f14
-0x41 0x32 0x27 0x46
+# CHECK: sub.d $f8, $f12, $f14
+0x01 0x62 0x2e 0x46
-# CHECK: sub.s $f9,$f6,$f7
+# CHECK: sub.s $f9, $f6, $f7
0x41 0x32 0x07 0x46
-# CHECK: sub t1,a2,a3
+# CHECK: sub $9, $6, $7
0x22 0x48 0xc7 0x00
-# CHECK: subu a0,v1,a1
+# CHECK: subu $4, $3, $5
0x23 0x20 0x65 0x00
-# CHECK: sw a0,24(a1)
+# CHECK: sw $4, 24($5)
0x18 0x00 0xa4 0xac
-# CHECK: swc1 $f9,9158(a3)
+# CHECK: swc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xe4
-# CHECK: sync 0x7
+# CHECK: swl $4, 16($5)
+0x10 0x00 0xa4 0xa8
+
+# CHECK: swr $6, 16($7)
+0x10 0x00 0xe6 0xb8
+
+# CHECK: sync 7
0xcf 0x01 0x00 0x00
-# CHECK: trunc.w.d $f12,$f14
-0x8d 0x39 0x20 0x46
+# CHECK: trunc.w.d $f12, $f14
+0x0d 0x73 0x20 0x46
-# CHECK: trunc.w.s $f6,$f7
+# CHECK: trunc.w.s $f6, $f7
0x8d 0x39 0x00 0x46
-# CHECK: xor v1,v1,a1
+# CHECK: xor $3, $3, $5
0x26 0x18 0x65 0x00
-# CHECK: xori t1,a2,0x4567
+# CHECK: xori $9, $6, 17767
0x67 0x45 0xc9 0x38
diff --git a/test/MC/Disassembler/Mips/mips32r2.txt b/test/MC/Disassembler/Mips/mips32r2.txt
index 295ffd0..3b70db3 100644
--- a/test/MC/Disassembler/Mips/mips32r2.txt
+++ b/test/MC/Disassembler/Mips/mips32r2.txt
@@ -1,439 +1,430 @@
-# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r2
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r2 | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: abs.d $f12, $f14
+0x46 0x20 0x73 0x05
-# CHECK: abs.d $f12,$f14
-0x46 0x20 0x39 0x85
-
-# CHECK: abs.s $f6,$f7
+# CHECK: abs.s $f6, $f7
0x46 0x00 0x39 0x85
-# CHECK: add t1,a2,a3
+# CHECK: add $9, $6, $7
0x00 0xc7 0x48 0x20
-# CHECK: add.d $f18,$f12,$f14
-0x46 0x27 0x32 0x40
+# CHECK: add.d $f8, $f12, $f14
+0x46 0x2e 0x62 0x00
-# CHECK: add.s $f9,$f6,$f7
+# CHECK: add.s $f9, $f6, $f7
0x46 0x07 0x32 0x40
-# CHECK: addi t1,a2,17767
+# CHECK: addi $9, $6, 17767
0x20 0xc9 0x45 0x67
-# CHECK: addiu t1,a2,-15001
+# CHECK: addiu $9, $6, -15001
0x24 0xc9 0xc5 0x67
-# CHECK: addu t1,a2,a3
+# CHECK: addu $9, $6, $7
0x00 0xc7 0x48 0x21
-# CHECK: and t1,a2,a3
+# CHECK: and $9, $6, $7
0x00 0xc7 0x48 0x24
-# CHECK: andi t1,a2,0x4567
+# CHECK: andi $9, $6, 17767
0x30 0xc9 0x45 0x67
-# CHECK: b 00000534
+# CHECK: b 1332
0x10 0x00 0x01 0x4c
-# CHECK: bal 00000534
-0x04 0x11 0x01 0x4c
-
-# CHECK: bc1f 00000534
+# CHECK: bc1f 1332
0x45 0x00 0x01 0x4c
-# CHECK: bc1t 00000534
+# CHECK: bc1t 1332
0x45 0x01 0x01 0x4c
-# CHECK: beq t1,a2,00000534
+# CHECK: beq $9, $6, 1332
0x11 0x26 0x01 0x4c
-# CHECK: bgez a2,00000534
+# CHECK: bgez $6, 1332
0x04 0xc1 0x01 0x4c
-# CHECK: bgezal a2,00000534
+# CHECK: bgezal $6, 1332
0x04 0xd1 0x01 0x4c
-# CHECK: bgtz a2,00000534
+# CHECK: bgtz $6, 1332
0x1c 0xc0 0x01 0x4c
-# CHECK: blez a2,00000534
+# CHECK: blez $6, 1332
0x18 0xc0 0x01 0x4c
-# CHECK: bne t1,a2,00000534
+# CHECK: bne $9, $6, 1332
0x15 0x26 0x01 0x4c
-# CHECK: c.eq.d $f12,$f14
-0x46 0x27 0x30 0x32
+# CHECK: c.eq.d $f12, $f14
+0x46 0x2e 0x60 0x32
-# CHECK: c.eq.s $f6,$f7
+# CHECK: c.eq.s $f6, $f7
0x46 0x07 0x30 0x32
-# CHECK: c.f.d $f12,$f14
-0x46 0x27 0x30 0x30
+# CHECK: c.f.d $f12, $f14
+0x46 0x2e 0x60 0x30
-# CHECK: c.f.s $f6,$f7
+# CHECK: c.f.s $f6, $f7
0x46 0x07 0x30 0x30
-# CHECK: c.le.d $f12,$f14
-0x46 0x27 0x30 0x3e
+# CHECK: c.le.d $f12, $f14
+0x46 0x2e 0x60 0x3e
-# CHECK: c.le.s $f6,$f7
+# CHECK: c.le.s $f6, $f7
0x46 0x07 0x30 0x3e
-# CHECK: c.lt.d $f12,$f14
-0x46 0x27 0x30 0x3c
+# CHECK: c.lt.d $f12, $f14
+0x46 0x2e 0x60 0x3c
-# CHECK: c.lt.s $f6,$f7
+# CHECK: c.lt.s $f6, $f7
0x46 0x07 0x30 0x3c
-# CHECK: c.nge.d $f12,$f14
-0x46 0x27 0x30 0x3d
+# CHECK: c.nge.d $f12, $f14
+0x46 0x2e 0x60 0x3d
-# CHECK: c.nge.s $f6,$f7
+# CHECK: c.nge.s $f6, $f7
0x46 0x07 0x30 0x3d
-# CHECK: c.ngl.d $f12,$f14
-0x46 0x27 0x30 0x3b
+# CHECK: c.ngl.d $f12, $f14
+0x46 0x2e 0x60 0x3b
-# CHECK: c.ngl.s $f6,$f7
+# CHECK: c.ngl.s $f6, $f7
0x46 0x07 0x30 0x3b
-# CHECK: c.ngle.d $f12,$f14
-0x46 0x27 0x30 0x39
+# CHECK: c.ngle.d $f12, $f14
+0x46 0x2e 0x60 0x39
-# CHECK: c.ngle.s $f6,$f7
+# CHECK: c.ngle.s $f6, $f7
0x46 0x07 0x30 0x39
-# CHECK: c.ngt.d $f12,$f14
-0x46 0x27 0x30 0x3f
+# CHECK: c.ngt.d $f12, $f14
+0x46 0x2e 0x60 0x3f
-# CHECK: c.ngt.s $f6,$f7
+# CHECK: c.ngt.s $f6, $f7
0x46 0x07 0x30 0x3f
-# CHECK: c.ole.d $f12,$f14
-0x46 0x27 0x30 0x36
+# CHECK: c.ole.d $f12, $f14
+0x46 0x2e 0x60 0x36
-# CHECK: c.ole.s $f6,$f7
+# CHECK: c.ole.s $f6, $f7
0x46 0x07 0x30 0x36
-# CHECK: c.olt.d $f12,$f14
-0x46 0x27 0x30 0x34
+# CHECK: c.olt.d $f12, $f14
+0x46 0x2e 0x60 0x34
-# CHECK: c.olt.s $f6,$f7
+# CHECK: c.olt.s $f6, $f7
0x46 0x07 0x30 0x34
-# CHECK: c.seq.d $f12,$f14
-0x46 0x27 0x30 0x3a
+# CHECK: c.seq.d $f12, $f14
+0x46 0x2e 0x60 0x3a
-# CHECK: c.seq.s $f6,$f7
+# CHECK: c.seq.s $f6, $f7
0x46 0x07 0x30 0x3a
-# CHECK: c.sf.d $f12,$f14
-0x46 0x27 0x30 0x38
+# CHECK: c.sf.d $f12, $f14
+0x46 0x2e 0x60 0x38
-# CHECK: c.sf.s $f6,$f7
+# CHECK: c.sf.s $f6, $f7
0x46 0x07 0x30 0x38
-# CHECK: c.ueq.d $f12,$f14
-0x46 0x27 0x30 0x33
+# CHECK: c.ueq.d $f12, $f14
+0x46 0x2e 0x60 0x33
-# CHECK: c.ueq.s $f28,$f18
+# CHECK: c.ueq.s $f28, $f18
0x46 0x12 0xe0 0x33
-# CHECK: c.ule.d $f12,$f14
-0x46 0x27 0x30 0x37
+# CHECK: c.ule.d $f12, $f14
+0x46 0x2e 0x60 0x37
-# CHECK: c.ule.s $f6,$f7
+# CHECK: c.ule.s $f6, $f7
0x46 0x07 0x30 0x37
-# CHECK: c.ult.d $f12,$f14
-0x46 0x27 0x30 0x35
+# CHECK: c.ult.d $f12, $f14
+0x46 0x2e 0x60 0x35
-# CHECK: c.ult.s $f6,$f7
+# CHECK: c.ult.s $f6, $f7
0x46 0x07 0x30 0x35
-# CHECK: c.un.d $f12,$f14
-0x46 0x27 0x30 0x31
+# CHECK: c.un.d $f12, $f14
+0x46 0x2e 0x60 0x31
-# CHECK: c.un.s $f6,$f7
+# CHECK: c.un.s $f6, $f7
0x46 0x07 0x30 0x31
-# CHECK: ceil.w.d $f12,$f14
-0x46 0x20 0x39 0x8e
+# CHECK: ceil.w.d $f12, $f14
+0x46 0x20 0x73 0x0e
-# CHECK: ceil.w.s $f6,$f7
+# CHECK: ceil.w.s $f6, $f7
0x46 0x00 0x39 0x8e
-# CHECK: cfc1 a2,$7
+# CHECK: cfc1 $6, $7
0x44 0x46 0x38 0x00
-# CHECK: clo a2,a3
+# CHECK: clo $6, $7
0x70 0xe6 0x30 0x21
-# CHECK: clz a2,a3
+# CHECK: clz $6, $7
0x70 0xe6 0x30 0x20
-# CHECK: ctc1 a2,$7
+# CHECK: ctc1 $6, $7
0x44 0xc6 0x38 0x00
-# CHECK: cvt.d.s $f6,$f7
-0x46 0x00 0x38 0xa1
+# CHECK: cvt.d.s $f6, $f7
+0x46 0x00 0x39 0xa1
-# CHECK: cvt.d.w $f12,$f14
-0x46 0x80 0x38 0xa1
+# CHECK: cvt.d.w $f12, $f14
+0x46 0x80 0x73 0x21
-# CHECK: cvt.l.d $f12,$f14
-0x46 0x20 0x39 0xa5
+# CHECK: cvt.l.d $f12, $f14
+0x46 0x20 0x73 0x25
-# CHECK: cvt.l.s $f6,$f7
+# CHECK: cvt.l.s $f6, $f7
0x46 0x00 0x39 0xa5
-# CHECK: cvt.s.d $f12,$f14
-0x46 0x20 0x39 0xa0
+# CHECK: cvt.s.d $f12, $f14
+0x46 0x20 0x73 0x20
-# CHECK: cvt.s.w $f6,$f7
+# CHECK: cvt.s.w $f6, $f7
0x46 0x80 0x39 0xa0
-# CHECK: cvt.w.d $f12,$f14
-0x46 0x20 0x39 0xa4
+# CHECK: cvt.w.d $f12, $f14
+0x46 0x20 0x73 0x24
-# CHECK: cvt.w.s $f6,$f7
+# CHECK: cvt.w.s $f6, $f7
0x46 0x00 0x39 0xa4
-# CHECK: floor.w.d $f12,$f14
-0x46 0x20 0x39 0x8f
+# CHECK: floor.w.d $f12, $f14
+0x46 0x20 0x73 0x0f
-# CHECK: floor.w.s $f6,$f7
+# CHECK: floor.w.s $f6, $f7
0x46 0x00 0x39 0x8f
-# CHECK: ins s3,t1,0x6,0x7
+# CHECK: ins $19, $9, 6, 7
0x7d 0x33 0x61 0x84
-# CHECK: j 00000530
+# CHECK: j 1328
0x08 0x00 0x01 0x4c
-# CHECK: jal 00000530
+# CHECK: jal 1328
0x0c 0x00 0x01 0x4c
-# CHECK: jalr a2,a3
+# CHECK: jalr $7
0x00 0xe0 0xf8 0x09
-# CHECK: jr a3
+# CHECK: jr $7
0x00 0xe0 0x00 0x08
-# CHECK: lb a0,9158(a1)
+# CHECK: lb $4, 9158($5)
0x80 0xa4 0x23 0xc6
-# CHECK: lbu a0,6(a1)
+# CHECK: lbu $4, 6($5)
0x90 0xa4 0x00 0x06
-# CHECK: ldc1 $f9,9158(a3)
+# CHECK: ldc1 $f9, 9158($7)
0xd4 0xe9 0x23 0xc6
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x84 0xa4 0x00 0x0c
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x84 0xa4 0x00 0x0c
-# CHECK: li v1,17767
-0x24 0x03 0x45 0x67
-
-# CHECK: ll t1,9158(a3)
+# CHECK: ll $9, 9158($7)
0xc0 0xe9 0x23 0xc6
-# CHECK: lui a2,0x4567
+# CHECK: lui $6, 17767
0x3c 0x06 0x45 0x67
-# CHECK: lw a0,24(a1)
+# CHECK: lw $4, 24($5)
0x8c 0xa4 0x00 0x18
-# CHECK: lwc1 $f9,9158(a3)
+# CHECK: lwc1 $f9, 9158($7)
0xc4 0xe9 0x23 0xc6
-# CHECK: madd a2,a3
+# CHECK: lwl $2, 3($4)
+0x88 0x82 0x00 0x03
+
+# CHECK: lwr $3, 16($5)
+0x98 0xa3 0x00 0x10
+
+# CHECK: madd $6, $7
0x70 0xc7 0x00 0x00
-# CHECK: maddu a2,a3
+# CHECK: maddu $6, $7
0x70 0xc7 0x00 0x01
-# CHECK: mfc1 a2,$f7
+# CHECK: mfc1 $6, $f7
0x44 0x06 0x38 0x00
-# CHECK: mfhi a1
+# CHECK: mfhi $5
0x00 0x00 0x28 0x10
-# CHECK: mflo a1
+# CHECK: mflo $5
0x00 0x00 0x28 0x12
-# CHECK: mov.d $f6,$f7
-0x46 0x20 0x39 0x86
+# CHECK: mov.d $f6, $f8
+0x46 0x20 0x41 0x86
-# CHECK: mov.s $f6,$f7
+# CHECK: mov.s $f6, $f7
0x46 0x00 0x39 0x86
-# CHECK: move a2,a1
-0x00 0xa0 0x30 0x21
-
-# CHECK: msub a2,a3
+# CHECK: msub $6, $7
0x70 0xc7 0x00 0x04
-# CHECK: msubu a2,a3
+# CHECK: msubu $6, $7
0x70 0xc7 0x00 0x05
-# CHECK: mtc1 a2,$f7
+# CHECK: mtc1 $6, $f7
0x44 0x86 0x38 0x00
-# CHECK: mthi a3
+# CHECK: mthi $7
0x00 0xe0 0x00 0x11
-# CHECK: mtlo a3
+# CHECK: mtlo $7
0x00 0xe0 0x00 0x13
-# CHECK: mul.d $f9,$f12,$f14
-0x46 0x27 0x32 0x42
+# CHECK: mul.d $f8, $f12, $f14
+0x46 0x2e 0x62 0x02
-# CHECK: mul.s $f9,$f6,$f7
+# CHECK: mul.s $f9, $f6, $f7
0x46 0x07 0x32 0x42
-# CHECK: mul t1,a2,a3
+# CHECK: mul $9, $6, $7
0x70 0xc7 0x48 0x02
-# CHECK: mult v1,a1
+# CHECK: mult $3, $5
0x00 0x65 0x00 0x18
-# CHECK: multu v1,a1
+# CHECK: multu $3, $5
0x00 0x65 0x00 0x19
-# CHECK: neg.d $f12,$f14
-0x46 0x20 0x39 0x87
+# CHECK: neg.d $f12, $f14
+0x46 0x20 0x73 0x07
-# CHECK: neg.s $f6,$f7
+# CHECK: neg.s $f6, $f7
0x46 0x00 0x39 0x87
-# CHECK: neg v1,a1
-0x00 0x05 0x18 0x22
-
# CHECK: nop
0x00 0x00 0x00 0x00
-# CHECK: nor t1,a2,a3
+# CHECK: nor $9, $6, $7
0x00 0xc7 0x48 0x27
-# CHECK: not v1,a1
-0x00 0xa0 0x18 0x27
-
-# CHECK: or v1,v1,a1
+# CHECK: or $3, $3, $5
0x00 0x65 0x18 0x25
-# CHECK: ori t1,a2,0x4567
+# CHECK: ori $9, $6, 17767
0x34 0xc9 0x45 0x67
-# CHECK: rdhwr a2,$29
-0x7c 0x06 0xe8 0x3b
-
-# CHECK: ror t1,a2,0x7
+# CHECK: rotr $9, $6, 7
0x00 0x26 0x49 0xc2
-# CHECK: rorv t1,a2,a3
+# CHECK: rotrv $9, $6, $7
0x00 0xe6 0x48 0x46
-# CHECK: round.w.d $f12,$f14
-0x46 0x20 0x39 0x8c
+# CHECK: round.w.d $f12, $f14
+0x46 0x20 0x73 0x0c
-# CHECK: round.w.s $f6,$f7
+# CHECK: round.w.s $f6, $f7
0x46 0x00 0x39 0x8c
-# CHECK: sb a0,9158(a1)
+# CHECK: sb $4, 9158($5)
0xa0 0xa4 0x23 0xc6
-# CHECK: sb a0,6(a1)
+# CHECK: sb $4, 6($5)
0xa0 0xa4 0x00 0x06
-# CHECK: sc t1,9158(a3)
+# CHECK: sc $9, 9158($7)
0xe0 0xe9 0x23 0xc6
-# CHECK: sdc1 $f9,9158(a3)
+# CHECK: sdc1 $f9, 9158($7)
0xf4 0xe9 0x23 0xc6
-# CHECK: seb a2,a3
+# CHECK: seb $6, $7
0x7c 0x07 0x34 0x20
-# CHECK: seh a2,a3
+# CHECK: seh $6, $7
0x7c 0x07 0x36 0x20
-# CHECK: sh a0,9158(a1)
+# CHECK: sh $4, 9158($5)
0xa4 0xa4 0x23 0xc6
-# CHECK: sll a0,v1,0x7
+# CHECK: sll $4, $3, 7
0x00 0x03 0x21 0xc0
-# CHECK: sllv v0,v1,a1
+# CHECK: sllv $2, $3, $5
0x00 0xa3 0x10 0x04
-# CHECK: slt v1,v1,a1
+# CHECK: slt $3, $3, $5
0x00 0x65 0x18 0x2a
-# CHECK: slti v1,v1,103
+# CHECK: slti $3, $3, 103
0x28 0x63 0x00 0x67
-# CHECK: sltiu v1,v1,103
+# CHECK: sltiu $3, $3, 103
0x2c 0x63 0x00 0x67
-# CHECK: sltu v1,v1,a1
+# CHECK: sltu $3, $3, $5
0x00 0x65 0x18 0x2b
-# CHECK: sqrt.d $f12,$f14
-0x46 0x20 0x39 0x84
+# CHECK: sqrt.d $f12, $f14
+0x46 0x20 0x73 0x04
-# CHECK: sqrt.s $f6,$f7
+# CHECK: sqrt.s $f6, $f7
0x46 0x00 0x39 0x84
-# CHECK: sra a0,v1,0x7
-0x00 0x03 0x21 0xc3
-
-# CHECK: sra a0,v1,0x7
+# CHECK: sra $4, $3, 7
0x00 0x03 0x21 0xc3
-# CHECK: srav v0,v1,a1
+# CHECK: srav $2, $3, $5
0x00 0xa3 0x10 0x07
-# CHECK: srl a0,v1,0x7
+# CHECK: srl $4, $3, 7
0x00 0x03 0x21 0xc2
-# CHECK: srlv v0,v1,a1
+# CHECK: srlv $2, $3, $5
0x00 0xa3 0x10 0x06
-# CHECK: sub.d $f9,$f12,$f14
-0x46 0x27 0x32 0x41
+# CHECK: sub.d $f8, $f12, $f14
+0x46 0x2e 0x62 0x01
-# CHECK: sub.s $f9,$f6,$f7
+# CHECK: sub.s $f9, $f6, $f7
0x46 0x07 0x32 0x41
-# CHECK: sub t1,a2,a3
+# CHECK: sub $9, $6, $7
0x00 0xc7 0x48 0x22
-# CHECK: subu a0,v1,a1
+# CHECK: subu $4, $3, $5
0x00 0x65 0x20 0x23
-# CHECK: sw a0,24(a1)
+# CHECK: sw $4, 24($5)
0xac 0xa4 0x00 0x18
-# CHECK: swc1 $f9,9158(a3)
+# CHECK: swc1 $f9, 9158($7)
0xe4 0xe9 0x23 0xc6
-# CHECK: sync 0x7
+# CHECK: swl $4, 16($5)
+0xa8 0xa4 0x00 0x10
+
+# CHECK: swr $6, 16($7)
+0xb8 0xe6 0x00 0x10
+
+# CHECK: sync 7
0x00 0x00 0x01 0xcf
-# CHECK: trunc.w.d $f12,$f14
-0x46 0x20 0x39 0x8d
+# CHECK: trunc.w.d $f12, $f14
+0x46 0x20 0x73 0x0d
-# CHECK: trunc.w.s $f6,$f7
+# CHECK: trunc.w.s $f6, $f7
0x46 0x00 0x39 0x8d
-# CHECK: wsbh a2,a3
+# CHECK: wsbh $6, $7
0x7c 0x07 0x30 0xa0
-# CHECK: xor v1,v1,a1
+# CHECK: xor $3, $3, $5
0x00 0x65 0x18 0x26
-# CHECK: xori t1,a2,0x4567
+# CHECK: xori $9, $6, 17767
0x38 0xc9 0x45 0x67
diff --git a/test/MC/Disassembler/Mips/mips32r2_le.txt b/test/MC/Disassembler/Mips/mips32r2_le.txt
index 6d8be79..ecfde7a 100644
--- a/test/MC/Disassembler/Mips/mips32r2_le.txt
+++ b/test/MC/Disassembler/Mips/mips32r2_le.txt
@@ -1,442 +1,430 @@
-# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r2
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r2 | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: abs.d $f12, $f14
+0x05 0x73 0x20 0x46
-# CHECK: abs.d $f12,$f14
-0x85 0x39 0x20 0x46
-
-# CHECK: abs.s $f6,$f7
+# CHECK: abs.s $f6, $f7
0x85 0x39 0x00 0x46
-# CHECK: add t1,a2,a3
+# CHECK: add $9, $6, $7
0x20 0x48 0xc7 0x00
-# CHECK: add.d $f18,$f12,$f14
-0x40 0x32 0x27 0x46
+# CHECK: add.d $f8, $f12, $f14
+0x00 0x62 0x2e 0x46
-# CHECK: add.s $f9,$f6,$f7
+# CHECK: add.s $f9, $f6, $f7
0x40 0x32 0x07 0x46
-# CHECK: addi t1,a2,17767
+# CHECK: addi $9, $6, 17767
0x67 0x45 0xc9 0x20
-# CHECK: addiu t1,a2,-15001
+# CHECK: addiu $9, $6, -15001
0x67 0xc5 0xc9 0x24
-# CHECK: addu t1,a2,a3
+# CHECK: addu $9, $6, $7
0x21 0x48 0xc7 0x00
-# CHECK: and t1,a2,a3
+# CHECK: and $9, $6, $7
0x24 0x48 0xc7 0x00
-# CHECK: andi t1,a2,0x4567
+# CHECK: andi $9, $6, 17767
0x67 0x45 0xc9 0x30
-# CHECK: b 00000534
+# CHECK: b 1332
0x4c 0x01 0x00 0x10
-# CHECK: bal 00000534
-0x4c 0x01 0x11 0x04
-
-# CHECK: bc1f 00000534
+# CHECK: bc1f 1332
0x4c 0x01 0x00 0x45
-# CHECK: bc1t 00000534
+# CHECK: bc1t 1332
0x4c 0x01 0x01 0x45
-# CHECK: beq t1,a2,00000534
+# CHECK: beq $9, $6, 1332
0x4c 0x01 0x26 0x11
-# CHECK: bgez a2,00000534
+# CHECK: bgez $6, 1332
0x4c 0x01 0xc1 0x04
-# CHECK: bgezal a2,00000534
+# CHECK: bgezal $6, 1332
0x4c 0x01 0xd1 0x04
-# CHECK: bgtz a2,00000534
+# CHECK: bgtz $6, 1332
0x4c 0x01 0xc0 0x1c
-# CHECK: blez a2,00000534
+# CHECK: blez $6, 1332
0x4c 0x01 0xc0 0x18
-# CHECK: bne t1,a2,00000534
+# CHECK: bne $9, $6, 1332
0x4c 0x01 0x26 0x15
-# CHECK: c.eq.d $f12,$f14
-0x32 0x30 0x27 0x46
+# CHECK: c.eq.d $f12, $f14
+0x32 0x60 0x2e 0x46
-# CHECK: c.eq.s $f6,$f7
+# CHECK: c.eq.s $f6, $f7
0x32 0x30 0x07 0x46
-# CHECK: c.f.d $f12,$f14
-0x30 0x30 0x27 0x46
+# CHECK: c.f.d $f12, $f14
+0x30 0x60 0x2e 0x46
-# CHECK: c.f.s $f6,$f7
+# CHECK: c.f.s $f6, $f7
0x30 0x30 0x07 0x46
-# CHECK: c.le.d $f12,$f14
-0x3e 0x30 0x27 0x46
+# CHECK: c.le.d $f12, $f14
+0x3e 0x60 0x2e 0x46
-# CHECK: c.le.s $f6,$f7
+# CHECK: c.le.s $f6, $f7
0x3e 0x30 0x07 0x46
-# CHECK: c.lt.d $f12,$f14
-0x3c 0x30 0x27 0x46
+# CHECK: c.lt.d $f12, $f14
+0x3c 0x60 0x2e 0x46
-# CHECK: c.lt.s $f6,$f7
+# CHECK: c.lt.s $f6, $f7
0x3c 0x30 0x07 0x46
-# CHECK: c.nge.d $f12,$f14
-0x3d 0x30 0x27 0x46
+# CHECK: c.nge.d $f12, $f14
+0x3d 0x60 0x2e 0x46
-# CHECK: c.nge.s $f6,$f7
+# CHECK: c.nge.s $f6, $f7
0x3d 0x30 0x07 0x46
-# CHECK: c.ngl.d $f12,$f14
-0x3b 0x30 0x27 0x46
+# CHECK: c.ngl.d $f12, $f14
+0x3b 0x60 0x2e 0x46
-# CHECK: c.ngl.s $f6,$f7
+# CHECK: c.ngl.s $f6, $f7
0x3b 0x30 0x07 0x46
-# CHECK: c.ngle.d $f12,$f14
-0x39 0x30 0x27 0x46
+# CHECK: c.ngle.d $f12, $f14
+0x39 0x60 0x2e 0x46
-# CHECK: c.ngle.s $f6,$f7
+# CHECK: c.ngle.s $f6, $f7
0x39 0x30 0x07 0x46
-# CHECK: c.ngt.d $f12,$f14
-0x3f 0x30 0x27 0x46
+# CHECK: c.ngt.d $f12, $f14
+0x3f 0x60 0x2e 0x46
-# CHECK: c.ngt.s $f6,$f7
+# CHECK: c.ngt.s $f6, $f7
0x3f 0x30 0x07 0x46
-# CHECK: c.ole.d $f12,$f14
-0x36 0x30 0x27 0x46
+# CHECK: c.ole.d $f12, $f14
+0x36 0x60 0x2e 0x46
-# CHECK: c.ole.s $f6,$f7
+# CHECK: c.ole.s $f6, $f7
0x36 0x30 0x07 0x46
-# CHECK: c.olt.d $f12,$f14
-0x34 0x30 0x27 0x46
+# CHECK: c.olt.d $f12, $f14
+0x34 0x60 0x2e 0x46
-# CHECK: c.olt.s $f6,$f7
+# CHECK: c.olt.s $f6, $f7
0x34 0x30 0x07 0x46
-# CHECK: c.seq.d $f12,$f14
-0x3a 0x30 0x27 0x46
+# CHECK: c.seq.d $f12, $f14
+0x3a 0x60 0x2e 0x46
-# CHECK: c.seq.s $f6,$f7
+# CHECK: c.seq.s $f6, $f7
0x3a 0x30 0x07 0x46
-# CHECK: c.sf.d $f12,$f14
-0x38 0x30 0x27 0x46
+# CHECK: c.sf.d $f12, $f14
+0x38 0x60 0x2e 0x46
-# CHECK: c.sf.s $f6,$f7
+# CHECK: c.sf.s $f6, $f7
0x38 0x30 0x07 0x46
-# CHECK: c.ueq.d $f12,$f14
-0x33 0x30 0x27 0x46
+# CHECK: c.ueq.d $f12, $f14
+0x33 0x60 0x2e 0x46
-# CHECK: c.ueq.s $f28,$f18
+# CHECK: c.ueq.s $f28, $f18
0x33 0xe0 0x12 0x46
-# CHECK: c.ule.d $f12,$f14
-0x37 0x30 0x27 0x46
+# CHECK: c.ule.d $f12, $f14
+0x37 0x60 0x2e 0x46
-# CHECK: c.ule.s $f6,$f7
+# CHECK: c.ule.s $f6, $f7
0x37 0x30 0x07 0x46
-# CHECK: c.ult.d $f12,$f14
-0x35 0x30 0x27 0x46
+# CHECK: c.ult.d $f12, $f14
+0x35 0x60 0x2e 0x46
-# CHECK: c.ult.s $f6,$f7
+# CHECK: c.ult.s $f6, $f7
0x35 0x30 0x07 0x46
-# CHECK: c.un.d $f12,$f14
-0x31 0x30 0x27 0x46
+# CHECK: c.un.d $f12, $f14
+0x31 0x60 0x2e 0x46
-# CHECK: c.un.s $f6,$f7
+# CHECK: c.un.s $f6, $f7
0x31 0x30 0x07 0x46
-# CHECK: ceil.w.d $f12,$f14
-0x8e 0x38 0x20 0x46
+# CHECK: ceil.w.d $f12, $f14
+0x0e 0x73 0x20 0x46
-# CHECK: ceil.w.s $f6,$f7
-0x8e 0x38 0x00 0x46
+# CHECK: ceil.w.s $f6, $f7
+0x8e 0x39 0x00 0x46
-# CHECK: cfc1 a2,$7
+# CHECK: cfc1 $6, $7
0x00 0x38 0x46 0x44
-# CHECK: clo a2,a3
+# CHECK: clo $6, $7
0x21 0x30 0xe6 0x70
-# CHECK: clz a2,a3
+# CHECK: clz $6, $7
0x20 0x30 0xe6 0x70
-# CHECK: ctc1 a2,$7
+# CHECK: ctc1 $6, $7
0x00 0x38 0xc6 0x44
-# CHECK: cvt.d.s $f6,$f7
+# CHECK: cvt.d.s $f6, $f7
0xa1 0x39 0x00 0x46
-# CHECK: cvt.d.w $f12,$f14
-0xa1 0x39 0x80 0x46
+# CHECK: cvt.d.w $f12, $f14
+0x21 0x73 0x80 0x46
-# CHECK: cvt.l.d $f12,$f14
-0xa5 0x39 0x20 0x46
+# CHECK: cvt.l.d $f12, $f14
+0x25 0x73 0x20 0x46
-# CHECK: cvt.l.s $f6,$f7
+# CHECK: cvt.l.s $f6, $f7
0xa5 0x39 0x00 0x46
-# CHECK: cvt.s.d $f12,$f14
-0xa0 0x39 0x20 0x46
+# CHECK: cvt.s.d $f12, $f14
+0x20 0x73 0x20 0x46
-# CHECK: cvt.s.w $f6,$f7
+# CHECK: cvt.s.w $f6, $f7
0xa0 0x39 0x80 0x46
-# CHECK: cvt.w.d $f12,$f14
-0xa4 0x39 0x20 0x46
+# CHECK: cvt.w.d $f12, $f14
+0x24 0x73 0x20 0x46
-# CHECK: cvt.w.s $f6,$f7
+# CHECK: cvt.w.s $f6, $f7
0xa4 0x39 0x00 0x46
-# CHECK: floor.w.d $f12,$f14
-0x8f 0x39 0x20 0x46
+# CHECK: floor.w.d $f12, $f14
+0x0f 0x73 0x20 0x46
-# CHECK: floor.w.s $f6,$f7
+# CHECK: floor.w.s $f6, $f7
0x8f 0x39 0x00 0x46
-# CHECK: ins s3,t1,0x6,0x7
+# CHECK: ins $19, $9, 6, 7
0x84 0x61 0x33 0x7d
-# CHECK: j 00000530
+# CHECK: j 1328
0x4c 0x01 0x00 0x08
-# CHECK: jal 00000530
+# CHECK: jal 1328
0x4c 0x01 0x00 0x0c
-# CHECK: jalr a2,a3
+# CHECK: jalr $7
0x09 0xf8 0xe0 0x00
-# CHECK: jr a3
+# CHECK: jr $7
0x08 0x00 0xe0 0x00
-# CHECK: lb a0,9158(a1)
+# CHECK: lb $4, 9158($5)
0xc6 0x23 0xa4 0x80
-# CHECK: lbu a0,6(a1)
+# CHECK: lbu $4, 6($5)
0x06 0x00 0xa4 0x90
-# CHECK: ldc1 $f9,9158(a3)
+# CHECK: ldc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xd4
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x0c 0x00 0xa4 0x84
-# CHECK: lh a0,12(a1)
+# CHECK: lh $4, 12($5)
0x0c 0x00 0xa4 0x84
-# CHECK: li v1,17767
-0x67 0x45 0x03 0x24
-
-# CHECK: ll t1,9158(a3)
+# CHECK: ll $9, 9158($7)
0xc6 0x23 0xe9 0xc0
-# CHECK: lui a2,0x4567
+# CHECK: lui $6, 17767
0x67 0x45 0x06 0x3c
-# CHECK: lw a0,24(a1)
+# CHECK: lw $4, 24($5)
0x18 0x00 0xa4 0x8c
-# CHECK lw at,-18316(v0)
-0x74 0xb8 0x41 0x8c
-
-# CHECK: lwc1 $f9,9158(a3)
+# CHECK: lwc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xc4
-# CHECK: madd a2,a3
+# CHECK: lwl $2, 3($4)
+0x03 0x00 0x82 0x88
+
+# CHECK: lwr $3, 16($5)
+0x10 0x00 0xa3 0x98
+
+# CHECK: madd $6, $7
0x00 0x00 0xc7 0x70
-# CHECK: maddu a2,a3
+# CHECK: maddu $6, $7
0x01 0x00 0xc7 0x70
-# CHECK: mfc1 a2,$f7
+# CHECK: mfc1 $6, $f7
0x00 0x38 0x06 0x44
-# CHECK: mfhi a1
+# CHECK: mfhi $5
0x10 0x28 0x00 0x00
-# CHECK: mflo a1
+# CHECK: mflo $5
0x12 0x28 0x00 0x00
-# CHECK: mov.d $f12,$f14
-0x86 0x39 0x20 0x46
+# CHECK: mov.d $f6, $f8
+0x86 0x41 0x20 0x46
-# CHECK: mov.s $f6,$f7
+# CHECK: mov.s $f6, $f7
0x86 0x39 0x00 0x46
-# CHECK: move a2,a1
-0x21 0x30 0xa0 0x00
-
-# CHECK: msub a2,a3
+# CHECK: msub $6, $7
0x04 0x00 0xc7 0x70
-# CHECK: msubu a2,a3
+# CHECK: msubu $6, $7
0x05 0x00 0xc7 0x70
-# CHECK: mtc1 a2,$f7
+# CHECK: mtc1 $6, $f7
0x00 0x38 0x86 0x44
-# CHECK: mthi a3
+# CHECK: mthi $7
0x11 0x00 0xe0 0x00
-# CHECK: mtlo a3
+# CHECK: mtlo $7
0x13 0x00 0xe0 0x00
-# CHECK: mul.d $f9,$f12,$f14
-0x42 0x32 0x27 0x46
+# CHECK: mul.d $f8, $f12, $f14
+0x02 0x62 0x2e 0x46
-# CHECK: mul.s $f9,$f6,$f7
+# CHECK: mul.s $f9, $f6, $f7
0x42 0x32 0x07 0x46
-# CHECK: mul t1,a2,a3
+# CHECK: mul $9, $6, $7
0x02 0x48 0xc7 0x70
-# CHECK: mult v1,a1
+# CHECK: mult $3, $5
0x18 0x00 0x65 0x00
-# CHECK: multu v1,a1
+# CHECK: multu $3, $5
0x19 0x00 0x65 0x00
-# CHECK: neg.d $f12,$f14
-0x87 0x39 0x20 0x46
+# CHECK: neg.d $f12, $f14
+0x07 0x73 0x20 0x46
-# CHECK: neg.s $f6,$f7
+# CHECK: neg.s $f6, $f7
0x87 0x39 0x00 0x46
-# CHECK: neg v1,a1
-0x22 0x18 0x05 0x00
-
# CHECK: nop
0x00 0x00 0x00 0x00
-# CHECK: nor t1,a2,a3
+# CHECK: nor $9, $6, $7
0x27 0x48 0xc7 0x00
-# CHECK: not v1,a1
-0x27 0x18 0xa0 0x00
-
-# CHECK: or v1,v1,a1
+# CHECK: or $3, $3, $5
0x25 0x18 0x65 0x00
-# CHECK: ori t1,a2,0x4567
+# CHECK: ori $9, $6, 17767
0x67 0x45 0xc9 0x34
-# CHECK: rdhwr a2,$29
-0x3b 0xe8 0x06 0x7c
-
-# CHECK: ror t1,a2,0x7
+# CHECK: rotr $9, $6, 7
0xc2 0x49 0x26 0x00
-# CHECK: rorv t1,a2,a3
+# CHECK: rotrv $9, $6, $7
0x46 0x48 0xe6 0x00
-# CHECK: round.w.d $f12,$f14
-0x8c 0x39 0x20 0x46
+# CHECK: round.w.d $f12, $f14
+0x0c 0x73 0x20 0x46
-# CHECK: round.w.s $f6,$f7
+# CHECK: round.w.s $f6, $f7
0x8c 0x39 0x00 0x46
-# CHECK: sb a0,9158(a1)
+# CHECK: sb $4, 9158($5)
0xc6 0x23 0xa4 0xa0
-# CHECK: sb a0,6(a1)
+# CHECK: sb $4, 6($5)
0x06 0x00 0xa4 0xa0
-# CHECK: sc t1,9158(a3)
+# CHECK: sc $9, 9158($7)
0xc6 0x23 0xe9 0xe0
-# CHECK: sdc1 $f9,9158(a3)
+# CHECK: sdc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xf4
-# CHECK: seb a2,a3
+# CHECK: seb $6, $7
0x20 0x34 0x07 0x7c
-# CHECK: seh a2,a3
+# CHECK: seh $6, $7
0x20 0x36 0x07 0x7c
-# CHECK: sh a0,9158(a1)
+# CHECK: sh $4, 9158($5)
0xc6 0x23 0xa4 0xa4
-# CHECK: sll a0,v1,0x7
+# CHECK: sll $4, $3, 7
0xc0 0x21 0x03 0x00
-# CHECK: sllv v0,v1,a1
+# CHECK: sllv $2, $3, $5
0x04 0x10 0xa3 0x00
-# CHECK: slt v1,v1,a1
+# CHECK: slt $3, $3, $5
0x2a 0x18 0x65 0x00
-# CHECK: slti v1,v1,103
+# CHECK: slti $3, $3, 103
0x67 0x00 0x63 0x28
-# CHECK: sltiu v1,v1,103
+# CHECK: sltiu $3, $3, 103
0x67 0x00 0x63 0x2c
-# CHECK: sltu v1,v1,a1
+# CHECK: sltu $3, $3, $5
0x2b 0x18 0x65 0x00
-# CHECK: sqrt.d $f12,$f14
-0x84 0x39 0x20 0x46
+# CHECK: sqrt.d $f12, $f14
+0x04 0x73 0x20 0x46
-# CHECK: sqrt.s $f6,$f7
+# CHECK: sqrt.s $f6, $f7
0x84 0x39 0x00 0x46
-# CHECK: sra a0,v1,0x7
+# CHECK: sra $4, $3, 7
0xc3 0x21 0x03 0x00
-# CHECK: sra a0,v1,0x7
-0xc3 0x21 0x03 0x00
-
-# CHECK: srav v0,v1,a1
+# CHECK: srav $2, $3, $5
0x07 0x10 0xa3 0x00
-# CHECK: srl a0,v1,0x7
+# CHECK: srl $4, $3, 7
0xc2 0x21 0x03 0x00
-# CHECK: srlv v0,v1,a1
+# CHECK: srlv $2, $3, $5
0x06 0x10 0xa3 0x00
-# CHECK: sub.d $f9,$f12,$f14
-0x41 0x32 0x27 0x46
+# CHECK: sub.d $f8, $f12, $f14
+0x01 0x62 0x2e 0x46
-# CHECK: sub.s $f9,$f6,$f7
+# CHECK: sub.s $f9, $f6, $f7
0x41 0x32 0x07 0x46
-# CHECK: sub t1,a2,a3
+# CHECK: sub $9, $6, $7
0x22 0x48 0xc7 0x00
-# CHECK: subu a0,v1,a1
+# CHECK: subu $4, $3, $5
0x23 0x20 0x65 0x00
-# CHECK: sw a0,24(a1)
+# CHECK: sw $4, 24($5)
0x18 0x00 0xa4 0xac
-# CHECK: swc1 $f9,9158(a3)
+# CHECK: swc1 $f9, 9158($7)
0xc6 0x23 0xe9 0xe4
-# CHECK: sync 0x7
+# CHECK: swl $4, 16($5)
+0x10 0x00 0xa4 0xa8
+
+# CHECK: swr $6, 16($7)
+0x10 0x00 0xe6 0xb8
+
+# CHECK: sync 7
0xcf 0x01 0x00 0x00
-# CHECK: trunc.w.d $f12,$f14
-0x8d 0x39 0x20 0x46
+# CHECK: trunc.w.d $f12, $f14
+0x0d 0x73 0x20 0x46
-# CHECK: trunc.w.s $f6,$f7
+# CHECK: trunc.w.s $f6, $f7
0x8d 0x39 0x00 0x46
-# CHECK: wsbh a2,a3
+# CHECK: wsbh $6, $7
0xa0 0x30 0x07 0x7c
-# CHECK: xor v1,v1,a1
+# CHECK: xor $3, $3, $5
0x26 0x18 0x65 0x00
-# CHECK: xori t1,a2,0x4567
+# CHECK: xori $9, $6, 17767
0x67 0x45 0xc9 0x38
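
For reference, a minimal Python sketch (illustrative only, not part of the test suite) of how the byte quads above map onto MIPS I-type fields, assuming each four-byte group is a little-endian 32-bit word as in this test:

def decode_itype(quad):
    # Fields of a MIPS I-type instruction: opcode | rs | rt | imm16.
    word = int.from_bytes(bytes(quad), "little")
    opcode = word >> 26
    rs = (word >> 21) & 0x1F
    rt = (word >> 16) & 0x1F
    imm = word & 0xFFFF
    return opcode, rs, rt, imm

# "0x67 0x45 0xc9 0x34" above disassembles to "ori $9, $6, 17767":
assert decode_itype([0x67, 0x45, 0xC9, 0x34]) == (0x0D, 6, 9, 0x4567)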
diff --git a/test/MC/Disassembler/Mips/mips64.txt b/test/MC/Disassembler/Mips/mips64.txt
index 1c7447a..095ed18 100644
--- a/test/MC/Disassembler/Mips/mips64.txt
+++ b/test/MC/Disassembler/Mips/mips64.txt
@@ -1,67 +1,67 @@
-# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux
-
-# CHECK: daddiu t3,k0,31949
+# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: daddiu $11, $26, 31949
0x67 0x4b 0x7c 0xcd
-# CHECK: daddu k0,at,t3
+# CHECK: daddu $26, $at, $11
0x00 0x2b 0xd0 0x2d
-# CHECK: ddiv zero,k0,s6
+# CHECK: ddiv $zero, $26, $22
0x03 0x56 0x00 0x1e
-# CHECK: ddivu zero,t1,t8
+# CHECK: ddivu $zero, $9, $24
0x01 0x38 0x00 0x1f
-# CHECK: dmfc1 v0,$f14
+# CHECK: dmfc1 $2, $f14
0x44 0x22 0x70 0x00
-# CHECK: dmtc1 s7,$f5
+# CHECK: dmtc1 $23, $f5
0x44 0xb7 0x28 0x00
-# CHECK: dmult t3,k0
+# CHECK: dmult $11, $26
0x01 0x7a 0x00 0x1c
-# CHECK: dmultu s7,t5
+# CHECK: dmultu $23, $13
0x02 0xed 0x00 0x1d
-# CHECK: dsll v1,t8,0x11
+# CHECK: dsll $3, $24, 17
0x00 0x18 0x1c 0x78
-# CHECK: dsllv gp,k1,t8
+# CHECK: dsllv $gp, $27, $24
0x03 0x1b 0xe0 0x14
-# CHECK: dsra at,at,0x1e
+# CHECK: dsra $at, $at, 30
0x00 0x01 0x0f 0xbb
-# CHECK: dsrav at,at,s8
+# CHECK: dsrav $at, $at, $fp
0x03 0xc1 0x08 0x17
-# CHECK: dsrl t2,gp,0x18
+# CHECK: dsrl $10, $gp, 24
0x00 0x1c 0x56 0x3a
-# CHECK: dsrlv gp,t2,s7
+# CHECK: dsrlv $gp, $10, $23
0x02 0xea 0xe0 0x16
-# CHECK: dsubu gp,k1,t8
+# CHECK: dsubu $gp, $27, $24
0x03 0x78 0xe0 0x2f
-# CHECK: lw k1,-15155(at)
+# CHECK: lw $27, -15155($at)
0x8c 0x3b 0xc4 0xcd
-# CHECK: lui at,0x1
+# CHECK: lui $at, 1
0x3c 0x01 0x00 0x01
-# CHECK: lwu v1,-1746(v1)
+# CHECK: lwu $3, -1746($3)
0x9c 0x63 0xf9 0x2e
-# CHECK: lui ra,0x1
+# CHECK: lui $ra, 1
0x3c 0x1f 0x00 0x01
-# CHECK: sw k0,-15159(at)
+# CHECK: sw $26, -15159($at)
0xac 0x3a 0xc4 0xc9
-# CHECK: ld k0,3958(zero)
+# CHECK: ld $26, 3958($zero)
0xdc 0x1a 0x0f 0x76
-# CHECK: sd a2,17767(zero)
+# CHECK: sd $6, 17767($zero)
0xfc 0x06 0x45 0x67
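
The same decoding applies to the R-type words in this file, read big-endian as the mips64-unknown-linux triple implies; decode_rtype below is an illustrative helper, not part of the test:

def decode_rtype(quad):
    # Fields of a MIPS R-type instruction: opcode | rs | rt | rd | sa | funct.
    word = int.from_bytes(bytes(quad), "big")
    return (word >> 26, (word >> 21) & 0x1F, (word >> 16) & 0x1F,
            (word >> 11) & 0x1F, (word >> 6) & 0x1F, word & 0x3F)

# "0x00 0x2b 0xd0 0x2d" above is "daddu $26, $at, $11" (funct 0x2d = DADDU):
assert decode_rtype([0x00, 0x2B, 0xD0, 0x2D]) == (0, 1, 11, 26, 0, 0x2D)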
diff --git a/test/MC/Disassembler/Mips/mips64_le.txt b/test/MC/Disassembler/Mips/mips64_le.txt
index dd87522..c4e5591 100644
--- a/test/MC/Disassembler/Mips/mips64_le.txt
+++ b/test/MC/Disassembler/Mips/mips64_le.txt
@@ -1,67 +1,67 @@
-# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux
-
-# CHECK: daddiu t3,k0,31949
+# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: daddiu $11, $26, 31949
0xcd 0x7c 0x4b 0x67
-# CHECK: daddu k0,at,t3
+# CHECK: daddu $26, $at, $11
0x2d 0xd0 0x2b 0x00
-# CHECK: ddiv zero,k0,s6
+# CHECK: ddiv $zero, $26, $22
0x1e 0x00 0x56 0x03
-# CHECK: ddivu zero,t1,t8
+# CHECK: ddivu $zero, $9, $24
0x1f 0x00 0x38 0x01
-# CHECK: dmfc1 v0,$f14
+# CHECK: dmfc1 $2, $f14
0x00 0x70 0x22 0x44
-# CHECK: dmtc1 s7,$f5
+# CHECK: dmtc1 $23, $f5
0x00 0x28 0xb7 0x44
-# CHECK: dmult t3,k0
+# CHECK: dmult $11, $26
0x1c 0x00 0x7a 0x01
-# CHECK: dmultu s7,t5
+# CHECK: dmultu $23, $13
0x1d 0x00 0xed 0x02
-# CHECK: dsll v1,t8,0x11
+# CHECK: dsll $3, $24, 17
0x78 0x1c 0x18 0x00
-# CHECK: dsllv gp,k1,t8
+# CHECK: dsllv $gp, $27, $24
0x14 0xe0 0x1b 0x03
-# CHECK: dsra at,at,0x1e
+# CHECK: dsra $at, $at, 30
0xbb 0x0f 0x01 0x00
-# CHECK: dsrav at,at,s8
+# CHECK: dsrav $at, $at, $fp
0x17 0x08 0xc1 0x03
-# CHECK: dsrl t2,gp,0x18
+# CHECK: dsrl $10, $gp, 24
0x3a 0x56 0x1c 0x00
-# CHECK: dsrlv gp,t2,s7
+# CHECK: dsrlv $gp, $10, $23
0x16 0xe0 0xea 0x02
-# CHECK: dsubu gp,k1,t8
+# CHECK: dsubu $gp, $27, $24
0x2f 0xe0 0x78 0x03
-# CHECK: lw k1,-15155(at)
+# CHECK: lw $27, -15155($at)
0xcd 0xc4 0x3b 0x8c
-# CHECK: lui at,0x1
+# CHECK: lui $at, 1
0x01 0x00 0x01 0x3c
-# CHECK: lwu v1,-1746(v1)
+# CHECK: lwu $3, -1746($3)
0x2e 0xf9 0x63 0x9c
-# CHECK: lui ra,0x1
+# CHECK: lui $ra, 1
0x01 0x00 0x1f 0x3c
-# CHECK: sw k0,-15159(at)
+# CHECK: sw $26, -15159($at)
0xc9 0xc4 0x3a 0xac
-# CHECK: ld k0,3958(zero)
+# CHECK: ld $26, 3958($zero)
0x76 0x0f 0x1a 0xdc
-# CHECK: sd a2,17767(zero)
+# CHECK: sd $6, 17767($zero)
0x67 0x45 0x06 0xfc
diff --git a/test/MC/Disassembler/Mips/mips64r2.txt b/test/MC/Disassembler/Mips/mips64r2.txt
index 26bc94d..41808c7 100644
--- a/test/MC/Disassembler/Mips/mips64r2.txt
+++ b/test/MC/Disassembler/Mips/mips64r2.txt
@@ -1,91 +1,91 @@
-# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux -mattr +mips64r2
-
-# CHECK: daddiu t3,k0,31949
+# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux -mattr +mips64r2 | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: daddiu $11, $26, 31949
0x67 0x4b 0x7c 0xcd
-# CHECK: daddu k0,at,t3
+# CHECK: daddu $26, $at, $11
0x00 0x2b 0xd0 0x2d
-# CHECK: ddiv zero,k0,s6
+# CHECK: ddiv $zero, $26, $22
0x03 0x56 0x00 0x1e
-# CHECK: ddivu zero,t1,t8
+# CHECK: ddivu $zero, $9, $24
0x01 0x38 0x00 0x1f
-# CHECK: dmfc1 v0,$f14
+# CHECK: dmfc1 $2, $f14
0x44 0x22 0x70 0x00
-# CHECK: dmtc1 s7,$f5
+# CHECK: dmtc1 $23, $f5
0x44 0xb7 0x28 0x00
-# CHECK: dmult t3,k0
+# CHECK: dmult $11, $26
0x01 0x7a 0x00 0x1c
-# CHECK: dmultu s7,t5
+# CHECK: dmultu $23, $13
0x02 0xed 0x00 0x1d
-# CHECK: dsll v1,t8,0x11
+# CHECK: dsll $3, $24, 17
0x00 0x18 0x1c 0x78
-# CHECK: dsllv gp,k1,t8
+# CHECK: dsllv $gp, $27, $24
0x03 0x1b 0xe0 0x14
-# CHECK: dsra at,at,0x1e
+# CHECK: dsra $at, $at, 30
0x00 0x01 0x0f 0xbb
-# CHECK: dsrav at,at,s8
+# CHECK: dsrav $at, $at, $fp
0x03 0xc1 0x08 0x17
-# CHECK: dsrl t2,gp,0x18
+# CHECK: dsrl $10, $gp, 24
0x00 0x1c 0x56 0x3a
-# CHECK: dsrlv gp,t2,s7
+# CHECK: dsrlv $gp, $10, $23
0x02 0xea 0xe0 0x16
-# CHECK: dsubu gp,k1,t8
+# CHECK: dsubu $gp, $27, $24
0x03 0x78 0xe0 0x2f
-# CHECK: lw k1,-15155(at)
+# CHECK: lw $27, -15155($at)
0x8c 0x3b 0xc4 0xcd
-# CHECK: lui at,0x1
+# CHECK: lui $at, 1
0x3c 0x01 0x00 0x01
-# CHECK: lwu v1,-1746(v1)
+# CHECK: lwu $3, -1746($3)
0x9c 0x63 0xf9 0x2e
-# CHECK: lui ra,0x1
+# CHECK: lui $ra, 1
0x3c 0x1f 0x00 0x01
-# CHECK: sw k0,-15159(at)
+# CHECK: sw $26, -15159($at)
0xac 0x3a 0xc4 0xc9
-# CHECK: ld k0,3958(zero)
+# CHECK: ld $26, 3958($zero)
0xdc 0x1a 0x0f 0x76
-# CHECK: sd a2,17767(zero)
+# CHECK: sd $6, 17767($zero)
0xfc 0x06 0x45 0x67
-# CHECK: dclo t1,t8
+# CHECK: dclo $9, $24
0x73 0x09 0x48 0x25
-# CHECK: dclz k0,t1
+# CHECK: dclz $26, $9
0x71 0x3a 0xd0 0x24
-# CHECK: dext a3,gp,0x1d,0x1f
+# CHECK: dext $7, $gp, 29, 31
0x7f 0x87 0xf7 0x43
-# CHECK: dins s4,gp,0xf,0x1
+# CHECK: dins $20, $gp, 15, 1
0x7f 0x94 0x7b 0xc7
-# CHECK: dsbh a3,gp
+# CHECK: dsbh $7, $gp
0x7c 0x1c 0x38 0xa4
-# CHECK: dshd v1,t6
+# CHECK: dshd $3, $14
0x7c 0x0e 0x19 0x64
-# CHECK: drotr s4,k1,0x6
+# CHECK: drotr $20, $27, 6
0x00 0x3b 0xa1 0xba
-# CHECK: drotrv t8,s7,a1
+# CHECK: drotrv $24, $23, $5
0x00 0xb7 0xc0 0x56
diff --git a/test/MC/Disassembler/Mips/mips64r2_le.txt b/test/MC/Disassembler/Mips/mips64r2_le.txt
index 81a7c66..4987f80 100644
--- a/test/MC/Disassembler/Mips/mips64r2_le.txt
+++ b/test/MC/Disassembler/Mips/mips64r2_le.txt
@@ -1,91 +1,91 @@
-# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux -mattr +mips64r2
-
-# CHECK: daddiu t3,k0,31949
+# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux -mattr +mips64r2 | FileCheck %s
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: daddiu $11, $26, 31949
0xcd 0x7c 0x4b 0x67
-# CHECK: daddu k0,at,t3
+# CHECK: daddu $26, $at, $11
0x2d 0xd0 0x2b 0x00
-# CHECK: ddiv zero,k0,s6
+# CHECK: ddiv $zero, $26, $22
0x1e 0x00 0x56 0x03
-# CHECK: ddivu zero,t1,t8
+# CHECK: ddivu $zero, $9, $24
0x1f 0x00 0x38 0x01
-# CHECK: dmfc1 v0,$f14
+# CHECK: dmfc1 $2, $f14
0x00 0x70 0x22 0x44
-# CHECK: dmtc1 s7,$f5
+# CHECK: dmtc1 $23, $f5
0x00 0x28 0xb7 0x44
-# CHECK: dmult t3,k0
+# CHECK: dmult $11, $26
0x1c 0x00 0x7a 0x01
-# CHECK: dmultu s7,t5
+# CHECK: dmultu $23, $13
0x1d 0x00 0xed 0x02
-# CHECK: dsll v1,t8,0x11
+# CHECK: dsll $3, $24, 17
0x78 0x1c 0x18 0x00
-# CHECK: dsllv gp,k1,t8
+# CHECK: dsllv $gp, $27, $24
0x14 0xe0 0x1b 0x03
-# CHECK: dsra at,at,0x1e
+# CHECK: dsra $at, $at, 30
0xbb 0x0f 0x01 0x00
-# CHECK: dsrav at,at,s8
+# CHECK: dsrav $at, $at, $fp
0x17 0x08 0xc1 0x03
-# CHECK: dsrl t2,gp,0x18
+# CHECK: dsrl $10, $gp, 24
0x3a 0x56 0x1c 0x00
-# CHECK: dsrlv gp,t2,s7
+# CHECK: dsrlv $gp, $10, $23
0x16 0xe0 0xea 0x02
-# CHECK: dsubu gp,k1,t8
+# CHECK: dsubu $gp, $27, $24
0x2f 0xe0 0x78 0x03
-# CHECK: lw k1,-15155(at)
+# CHECK: lw $27, -15155($at)
0xcd 0xc4 0x3b 0x8c
-# CHECK: lui at,0x1
+# CHECK: lui $at, 1
0x01 0x00 0x01 0x3c
-# CHECK: lwu v1,-1746(v1)
+# CHECK: lwu $3, -1746($3)
0x2e 0xf9 0x63 0x9c
-# CHECK: lui ra,0x1
+# CHECK: lui $ra, 1
0x01 0x00 0x1f 0x3c
-# CHECK: sw k0,-15159(at)
+# CHECK: sw $26, -15159($at)
0xc9 0xc4 0x3a 0xac
-# CHECK: ld k0,3958(zero)
+# CHECK: ld $26, 3958($zero)
0x76 0x0f 0x1a 0xdc
-# CHECK: sd a2,17767(zero)
+# CHECK: sd $6, 17767($zero)
0x67 0x45 0x06 0xfc
-# CHECK: dclo t1,t8
+# CHECK: dclo $9, $24
0x25 0x48 0x09 0x73
-# CHECK: dclz k0,t1
+# CHECK: dclz $26, $9
0x24 0xd0 0x3a 0x71
-# CHECK: dext a3,gp,0x1d,0x1f
+# CHECK: dext $7, $gp, 29, 31
0x43 0xf7 0x87 0x7f
-# CHECK: dins s4,gp,0xf,0x1
+# CHECK: dins $20, $gp, 15, 1
0xc7 0x7b 0x94 0x7f
-# CHECK: dsbh a3,gp
+# CHECK: dsbh $7, $gp
0xa4 0x38 0x1c 0x7c
-# CHECK: dshd v1,t6
+# CHECK: dshd $3, $14
0x64 0x19 0x0e 0x7c
-# CHECK: drotr s4,k1,0x6
+# CHECK: drotr $20, $27, 6
0xba 0xa1 0x3b 0x00
-# CHECK: drotrv t8,s7,a1
+# CHECK: drotrv $24, $23, $5
0x56 0xc0 0xb7 0x00
diff --git a/test/MC/Disassembler/X86/enhanced.txt b/test/MC/Disassembler/X86/enhanced.txt
index 752ab17..deff735 100644
--- a/test/MC/Disassembler/X86/enhanced.txt
+++ b/test/MC/Disassembler/X86/enhanced.txt
@@ -1,10 +1,10 @@
-# RUN: llvm-mc --edis %s -triple=x86_64-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --edis %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s
-# CHECK: [o:jne][w: ][0-p:-][0-l:10=10] <br> 0:[RIP/111](pc)=18446744073709551606
+# CHECK: [o:jne][w: ][0-p:-][0-l:10=10] <br> 0:[RIP/112](pc)=18446744073709551606
0x0f 0x85 0xf6 0xff 0xff 0xff
-# CHECK: [o:movq][w: ][1-r:%gs=r63][1-p::][1-l:8=8][p:,][w: ][0-r:%rcx=r108] <mov> 0:[RCX/108]=0 1:[GS/63]=8
+# CHECK: [o:movq][w: ][1-r:%gs=r64][1-p::][1-l:8=8][p:,][w: ][0-r:%rcx=r109] <mov> 0:[RCX/109]=0 1:[GS/64]=8
0x65 0x48 0x8b 0x0c 0x25 0x08 0x00 0x00 0x00
-# CHECK: [o:xorps][w: ][2-r:%xmm1=r129][p:,][w: ][0-r:%xmm2=r130] 0:[XMM2/130]=0 1:[XMM2/130]=0 2:[XMM1/129]=0
+# CHECK: [o:xorps][w: ][2-r:%xmm1=r130][p:,][w: ][0-r:%xmm2=r131] 0:[XMM2/131]=0 1:[XMM2/131]=0 2:[XMM1/130]=0
0x0f 0x57 0xd1
-# CHECK: [o:andps][w: ][2-r:%xmm1=r129][p:,][w: ][0-r:%xmm2=r130] 0:[XMM2/130]=0 1:[XMM2/130]=0 2:[XMM1/129]=0
+# CHECK: [o:andps][w: ][2-r:%xmm1=r130][p:,][w: ][0-r:%xmm2=r131] 0:[XMM2/131]=0 1:[XMM2/131]=0 2:[XMM1/130]=0
0x0f 0x54 0xd1
diff --git a/test/MC/Disassembler/X86/intel-syntax.txt b/test/MC/Disassembler/X86/intel-syntax.txt
index a5dbcf2..27694cd 100644
--- a/test/MC/Disassembler/X86/intel-syntax.txt
+++ b/test/MC/Disassembler/X86/intel-syntax.txt
@@ -105,3 +105,8 @@
# CHECK: retf
0x66 0xcb
+# CHECK: vpgatherqq YMM2, QWORD PTR [RDI + 2*YMM1], YMM0
+0xc4 0xe2 0xfd 0x91 0x14 0x4f
+
+# CHECK: vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8
+0xc4 0x02 0x39 0x90 0x14 0x4f
diff --git a/test/MC/Disassembler/X86/invalid-VEX-vvvv.txt b/test/MC/Disassembler/X86/invalid-VEX-vvvv.txt
index 9feb54c..31a3804 100644
--- a/test/MC/Disassembler/X86/invalid-VEX-vvvv.txt
+++ b/test/MC/Disassembler/X86/invalid-VEX-vvvv.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# This instruction would decode as movmskps if the vvvv field in the VEX prefix was all 1s.
0xc5 0xf0 0x50 0xc0
diff --git a/test/MC/Disassembler/X86/invalid-cmp-imm.txt b/test/MC/Disassembler/X86/invalid-cmp-imm.txt
index bf8699b..7b2ea2a 100644
--- a/test/MC/Disassembler/X86/invalid-cmp-imm.txt
+++ b/test/MC/Disassembler/X86/invalid-cmp-imm.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 |& grep {invalid instruction encoding}
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 2>&1 | grep "invalid instruction encoding"
# This instruction would decode as cmpordps if the immediate byte was less than 8.
0x0f 0xc2 0xc7 0x08
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index c0e77d06..672d239 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -123,10 +123,10 @@
# CHECK: vcvtss2sil %xmm0, %eax
0xc5 0xfa 0x2d 0xc0
-# CHECK: vcvtsd2si %xmm0, %eax
+# CHECK: vcvtsd2sil %xmm0, %eax
0xc5 0xfb 0x2d 0xc0
-# CHECK: vcvtsd2si %xmm0, %rax
+# CHECK: vcvtsd2siq %xmm0, %rax
0xc4 0xe1 0xfb 0x2d 0xc0
# CHECK: vmaskmovpd %xmm0, %xmm1, (%rax)
@@ -437,10 +437,10 @@
# CHECK: vroundsd $0, %xmm0, %xmm0, %xmm0
0xc4 0xe3 0x7d 0x0b 0xc0 0x00
-# CHECK: vcvtsd2si %xmm0, %eax
+# CHECK: vcvtsd2sil %xmm0, %eax
0xc4 0xe1 0x7f 0x2d 0xc0
-# CHECK: vcvtsd2si %xmm0, %rax
+# CHECK: vcvtsd2siq %xmm0, %rax
0xc4 0xe1 0xff 0x2d 0xc0
# CHECK: vucomisd %xmm1, %xmm0
@@ -725,6 +725,30 @@
# CHECK: vpermil2ps $1, 4(%rax), %xmm2, %xmm3, %xmm0
0xc4 0xe3 0xe1 0x48 0x40 0x04 0x21
+# CHECK: vgatherdpd %xmm0, (%rdi,%xmm1,2), %xmm2
+0xc4 0xe2 0xf9 0x92 0x14 0x4f
+
+# CHECK: vgatherdpd %ymm0, (%rdi,%xmm1,2), %ymm2
+0xc4 0xe2 0xfd 0x92 0x14 0x4f
+
+# CHECK: vgatherqps %xmm8, (%r15,%xmm9,2), %xmm10
+0xc4 0x02 0x39 0x93 0x14 0x4f
+
+# CHECK: vgatherqps %xmm8, (%r15,%ymm9,2), %xmm10
+0xc4 0x02 0x3d 0x93 0x14 0x4f
+
+# CHECK: vpgatherdq %xmm0, (%rdi,%xmm1,2), %xmm2
+0xc4 0xe2 0xf9 0x90 0x14 0x4f
+
+# CHECK: vpgatherdq %ymm0, (%rdi,%xmm1,2), %ymm2
+0xc4 0xe2 0xfd 0x90 0x14 0x4f
+
+# CHECK: vpgatherqd %xmm8, (%r15,%xmm9,2), %xmm10
+0xc4 0x02 0x39 0x91 0x14 0x4f
+
+# CHECK: vpgatherqd %xmm8, (%r15,%ymm9,2), %xmm10
+0xc4 0x02 0x3d 0x91 0x14 0x4f
+
# rdar://8812056 lldb doesn't print the x86 lock prefix when disassembling
# CHECK: lock
# CHECK-NEXT: xaddq %rcx, %rbx
diff --git a/test/MC/Disassembler/X86/truncated-input.txt b/test/MC/Disassembler/X86/truncated-input.txt
index 34cf038..83be1ca 100644
--- a/test/MC/Disassembler/X86/truncated-input.txt
+++ b/test/MC/Disassembler/X86/truncated-input.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 |& FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s
# CHECK: warning
0x00
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index 739fa6a..899657b 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -159,10 +159,10 @@
# CHECK: vcvtss2sil %xmm0, %eax
0xc5 0xfa 0x2d 0xc0
-# CHECK: vcvtsd2si %xmm0, %eax
+# CHECK: vcvtsd2sil %xmm0, %eax
0xc5 0xfb 0x2d 0xc0
-# CHECK: vcvtsd2si %xmm0, %eax
+# CHECK: vcvtsd2sil %xmm0, %eax
0xc4 0xe1 0x7b 0x2d 0xc0
# CHECK: vmaskmovpd %xmm0, %xmm1, (%eax)
@@ -460,10 +460,10 @@
# CHECK: vroundsd $0, %xmm0, %xmm0, %xmm0
0xc4 0xe3 0x7d 0x0b 0xc0 0x00
-# CHECK: vcvtsd2si %xmm0, %eax
+# CHECK: vcvtsd2sil %xmm0, %eax
0xc4 0xe1 0x7f 0x2d 0xc0
-# CHECK: vcvtsd2si %xmm0, %eax
+# CHECK: vcvtsd2sil %xmm0, %eax
0xc4 0xe1 0xff 0x2d 0xc0
# CHECK: vucomisd %xmm1, %xmm0
@@ -612,3 +612,21 @@
# CHECK: shrxl %esi, %ebx, %edx
0xc4 0xe2 0x0b 0xf7 0xd3
+
+# CHECK: extrq $2, $3, %xmm0
+0x66 0x0f 0x78 0xc0 0x03 0x02
+
+# CHECK: extrq %xmm1, %xmm0
+0x66 0x0f 0x79 0xc1
+
+# CHECK: insertq $6, $5, %xmm1, %xmm0
+0xf2 0x0f 0x78 0xc1 0x05 0x06
+
+# CHECK: insertq %xmm1, %xmm0
+0xf2 0x0f 0x79 0xc1
+
+# CHECK: movntsd %xmm0, (%edi)
+0xf2 0x0f 0x2b 0x07
+
+# CHECK: movntss %xmm0, (%edi)
+0xf3 0x0f 0x2b 0x07
diff --git a/test/MC/Disassembler/X86/x86-64.txt b/test/MC/Disassembler/X86/x86-64.txt
index f4b8f46..df449a4 100644
--- a/test/MC/Disassembler/X86/x86-64.txt
+++ b/test/MC/Disassembler/X86/x86-64.txt
@@ -61,3 +61,21 @@
# CHECK: cmpordsd
0xf2 0x0f 0xc2 0xc7 0x07
+
+# CHECK: extrq $2, $3, %xmm0
+0x66 0x0f 0x78 0xc0 0x03 0x02
+
+# CHECK: extrq %xmm1, %xmm0
+0x66 0x0f 0x79 0xc1
+
+# CHECK: insertq $6, $5, %xmm1, %xmm0
+0xf2 0x0f 0x78 0xc1 0x05 0x06
+
+# CHECK: insertq %xmm1, %xmm0
+0xf2 0x0f 0x79 0xc1
+
+# CHECK: movntsd %xmm0, (%rdi)
+0xf2 0x0f 0x2b 0x07
+
+# CHECK: movntss %xmm0, (%rdi)
+0xf3 0x0f 0x2b 0x07
diff --git a/test/MC/ELF/fde.s b/test/MC/ELF/fde.s
new file mode 100644
index 0000000..52ee33f
--- /dev/null
+++ b/test/MC/ELF/fde.s
@@ -0,0 +1,28 @@
+# RUN: llvm-mc -filetype=obj %s -o %t.o -triple x86_64-pc-linux-gnu && llvm-objdump -s %t.o
+# PR13581
+
+# CHECK: Contents of section .debug_frame:
+# CHECK-NEXT: 0000 14000000 ffffffff 01000178 100c0708 ...........x....
+# CHECK-NEXT: 0010 90010000 00000000 1c000000 00000000 ................
+# CHECK-NEXT: 0020 00000000 00000000 11000000 00000000 ................
+# CHECK-NEXT: 0030 410e1086 02430d06 A....C..
+
+__cxx_global_var_init: # @__cxx_global_var_init
+ .cfi_startproc
+.Lfunc_begin0:
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp2:
+ .cfi_def_cfa_offset 16
+.Ltmp3:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp4:
+ .cfi_def_cfa_register %rbp
+.Ltmp5:
+ callq _Z2rsv@PLT
+ movl %eax, _ZL1i(%rip)
+ popq %rbp
+ ret
+ .cfi_endproc
+ .cfi_sections .debug_frame
diff --git a/test/MC/ELF/version.s b/test/MC/ELF/version.s
new file mode 100644
index 0000000..31e952a
--- /dev/null
+++ b/test/MC/ELF/version.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o - | elf-dump --dump-section-data | FileCheck %s
+
+.version "1234"
+.version "123"
+
+// CHECK: (('sh_name', 0x0000000c) # '.note'
+// CHECK-NEXT: ('sh_type', 0x00000007)
+// CHECK-NEXT: ('sh_flags', 0x00000000)
+// CHECK-NEXT: ('sh_addr', 0x00000000)
+// CHECK-NEXT: ('sh_offset', 0x00000034)
+// CHECK-NEXT: ('sh_size', 0x00000024)
+// CHECK-NEXT: ('sh_link', 0x00000000)
+// CHECK-NEXT: ('sh_info', 0x00000000)
+// CHECK-NEXT: ('sh_addralign', 0x00000004)
+// CHECK-NEXT: ('sh_entsize', 0x00000000)
+// CHECK-NEXT: ('_section_data', '05000000 00000000 01000000 31323334 00000000 04000000 00000000 01000000 31323300')
+// CHECK-NEXT: ),
diff --git a/test/MC/MachO/ARM/data-in-code.s b/test/MC/MachO/ARM/data-in-code.s
new file mode 100644
index 0000000..bbcb9aa
--- /dev/null
+++ b/test/MC/MachO/ARM/data-in-code.s
@@ -0,0 +1,33 @@
+@ RUN: llvm-mc -triple armv7-apple-darwin10 -filetype=obj -o - < %s | macho-dump | FileCheck %s
+ .text
+_foo:
+@ CHECK: # DICE 0
+@ CHECK: ('offset', 0)
+@ CHECK: ('length', 4)
+@ CHECK: ('kind', 1)
+@ CHECK: # DICE 1
+@ CHECK: ('offset', 4)
+@ CHECK: ('length', 4)
+@ CHECK: ('kind', 4)
+@ CHECK: # DICE 2
+@ CHECK: ('offset', 8)
+@ CHECK: ('length', 2)
+@ CHECK: ('kind', 3)
+@ CHECK: # DICE 3
+@ CHECK: ('offset', 10)
+@ CHECK: ('length', 1)
+@ CHECK: ('kind', 2)
+
+.data_region
+ .long 10
+.end_data_region
+.data_region jt32
+ .long 1
+.end_data_region
+.data_region jt16
+ .short 2
+.end_data_region
+.data_region jt8
+ .byte 3
+.end_data_region
+
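
The kind values checked above appear to follow the data_in_code_entry kinds from <mach-o/loader.h>; a small Python cross-reference table (illustrative, not part of the test):

# .data_region variant -> Mach-O data-in-code entry kind.
DICE_KINDS = {
    None:   1,  # DICE_KIND_DATA         (plain .data_region)
    "jt32": 4,  # DICE_KIND_JUMP_TABLE32
    "jt16": 3,  # DICE_KIND_JUMP_TABLE16
    "jt8":  2,  # DICE_KIND_JUMP_TABLE8
}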
diff --git a/test/MC/MachO/ARM/llvm-objdump-macho-stripped.s b/test/MC/MachO/ARM/llvm-objdump-macho-stripped.s
new file mode 100644
index 0000000..7fcec52
--- /dev/null
+++ b/test/MC/MachO/ARM/llvm-objdump-macho-stripped.s
@@ -0,0 +1,5 @@
+@ RUN: llvm-mc -triple=thumbv7-apple-ios -filetype=obj -o - < %s | llvm-objdump -d -macho -triple=thumbv7-apple-ios - | FileCheck %s
+ nop
+# CHECK: 0: 00 bf nop
+# We are checking that disassembly happens when there are no symbols.
+# rdar://11460289
diff --git a/test/MC/MachO/ARM/llvm-objdump-macho.s b/test/MC/MachO/ARM/llvm-objdump-macho.s
new file mode 100644
index 0000000..c8aec93
--- /dev/null
+++ b/test/MC/MachO/ARM/llvm-objdump-macho.s
@@ -0,0 +1,20 @@
+@ RUN: llvm-mc -triple=thumbv7-apple-ios -filetype=obj -o - < %s | llvm-objdump -d -macho -triple=thumbv7-apple-ios - | FileCheck %s
+.thumb
+.thumb_func _fib
+_fib:
+ push {r7, lr}
+ pop {r7, pc}
+.thumb_func _main
+_main:
+ push {r7, lr}
+ pop {r7, pc}
+ nop
+# CHECK: _fib:
+# CHECK: 0: 80 b5 push {r7, lr}
+# CHECK: 2: 80 bd pop {r7, pc}
+# CHECK: _main:
+# CHECK: 4: 80 b5 push {r7, lr}
+# CHECK: 6: 80 bd pop {r7, pc}
+# CHECK: 8: 00 bf nop
+# We are checking that the second function is fully disassembled.
+# rdar://11426465
diff --git a/test/MC/MachO/ARM/thumb-bl-jbits.s b/test/MC/MachO/ARM/thumb-bl-jbits.s
new file mode 100644
index 0000000..9657968
--- /dev/null
+++ b/test/MC/MachO/ARM/thumb-bl-jbits.s
@@ -0,0 +1,19 @@
+@ RUN: llvm-mc -triple=thumbv7-apple-darwin -filetype=obj -o - < %s | macho-dump --dump-section-data | FileCheck %s
+.thumb
+.thumb_func t
+t: nop
+
+.data
+.space 4441096 - 4 - 2
+
+.section __TEXT, __branch, regular, pure_instructions
+.thumb
+.thumb_func b
+b:
+ bl t
+# CHECK: '_section_data', 'c3f7fcf5'
+# We are checking that the branch and link instruction, which is:
+# bl #-4441096
+# has its displacement encoded correctly with respect to the J1 and J2 bits
+# when the branch is assembled with a label, not a displacement.
+# rdar://10149689
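
A Python sketch of the J1/J2 recovery described above (bl_offset is an illustrative helper; the two halfwords come from the little-endian section data 'c3f7fcf5'):

def bl_offset(hw1, hw2):
    # Thumb2 BL: imm32 = SignExtend(S:I1:I2:imm10:imm11:'0'),
    # where I1 = NOT(J1 XOR S) and I2 = NOT(J2 XOR S).
    s, imm10 = (hw1 >> 10) & 1, hw1 & 0x3FF
    j1, j2, imm11 = (hw2 >> 13) & 1, (hw2 >> 11) & 1, hw2 & 0x7FF
    i1, i2 = 1 - (j1 ^ s), 1 - (j2 ^ s)
    imm = (s << 24) | (i1 << 23) | (i2 << 22) | (imm10 << 12) | (imm11 << 1)
    return imm - (1 << 25) if s else imm

assert bl_offset(0xF7C3, 0xF5FC) == -4441096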
diff --git a/test/MC/MachO/ARM/thumb2-movw-fixup.s b/test/MC/MachO/ARM/thumb2-movw-fixup.s
new file mode 100644
index 0000000..57973a8
--- /dev/null
+++ b/test/MC/MachO/ARM/thumb2-movw-fixup.s
@@ -0,0 +1,44 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7-apple-darwin10 -filetype=obj -o - < %s | macho-dump | FileCheck %s
+
+@ rdar://10038370
+
+ .syntax unified
+ .text
+ .align 2
+ .code 16
+ .thumb_func _foo
+ movw r2, :lower16:L1
+ movt r2, :upper16:L1
+ movw r12, :lower16:L2
+ movt r12, :upper16:L2
+ .space 70000
+
+ .data
+L1: .long 0
+L2: .long 0
+
+@ CHECK: ('_relocations', [
+@ CHECK: # Relocation 0
+@ CHECK: (('word-0', 0xc),
+@ CHECK: ('word-1', 0x86000002)),
+@ CHECK: # Relocation 1
+@ CHECK: (('word-0', 0x1184),
+@ CHECK: ('word-1', 0x16ffffff)),
+@ CHECK: # Relocation 2
+@ CHECK: (('word-0', 0x8),
+@ CHECK: ('word-1', 0x84000002)),
+@ CHECK: # Relocation 3
+@ CHECK: (('word-0', 0x1),
+@ CHECK: ('word-1', 0x14ffffff)),
+@ CHECK: # Relocation 4
+@ CHECK: (('word-0', 0x4),
+@ CHECK: ('word-1', 0x86000002)),
+@ CHECK: # Relocation 5
+@ CHECK: (('word-0', 0x1180),
+@ CHECK: ('word-1', 0x16ffffff)),
+@ CHECK: # Relocation 6
+@ CHECK: (('word-0', 0x0),
+@ CHECK: ('word-1', 0x84000002)),
+@ CHECK: # Relocation 7
+@ CHECK: (('word-0', 0x1),
+@ CHECK: ('word-1', 0x14ffffff)),
diff --git a/test/MC/MachO/previous.s b/test/MC/MachO/previous.s
new file mode 100644
index 0000000..41077cd
--- /dev/null
+++ b/test/MC/MachO/previous.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple i386-apple-darwin9 %s -o - | FileCheck %s
+
+.text
+// CHECK: .section __TEXT,__text
+
+.data
+// CHECK: .section __DATA,__data
+
+.previous
+// CHECK: .section __TEXT,__text
+
+.previous
+// CHECK: .section __DATA,__data
diff --git a/test/MC/MachO/pushsection.s b/test/MC/MachO/pushsection.s
new file mode 100644
index 0000000..6881323
--- /dev/null
+++ b/test/MC/MachO/pushsection.s
@@ -0,0 +1,16 @@
+// RUN: llvm-mc -triple i386-apple-darwin9 %s -o - | FileCheck %s
+
+.text
+// CHECK: .section __TEXT,__text
+
+.pushsection __DATA, __data
+// CHECK: .section __DATA,__data
+
+.pushsection __TEXT, initcode
+// CHECK: .section __TEXT,initcode
+
+.popsection
+// CHECK: .section __DATA,__data
+
+.popsection
+// CHECK: .section __TEXT,__text
diff --git a/test/MC/Mips/elf-N64.ll b/test/MC/Mips/elf-N64.ll
new file mode 100644
index 0000000..23ec53a
--- /dev/null
+++ b/test/MC/Mips/elf-N64.ll
@@ -0,0 +1,39 @@
+; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 %s -o - | elf-dump --dump-section-data | FileCheck %s
+
+; Check for N64 relocation production.
+;
+; ModuleID = '../hello.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v64:64:64-n32"
+target triple = "mips64el-unknown-linux"
+
+@str = private unnamed_addr constant [12 x i8] c"hello world\00"
+
+define i32 @main() nounwind {
+entry:
+; Check that the appropriate relocations were created.
+
+; R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+; CHECK: ('r_type3', 0x05)
+; CHECK-NEXT: ('r_type2', 0x18)
+; CHECK-NEXT: ('r_type', 0x07)
+
+; R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+; CHECK: ('r_type3', 0x06)
+; CHECK-NEXT: ('r_type2', 0x18)
+; CHECK-NEXT: ('r_type', 0x07)
+
+; R_MIPS_GOT_PAGE/R_MIPS_NONE/R_MIPS_NONE
+; CHECK: ('r_type3', 0x00)
+; CHECK-NEXT: ('r_type2', 0x00)
+; CHECK-NEXT: ('r_type', 0x14)
+
+; R_MIPS_GOT_OFST/R_MIPS_NONE/R_MIPS_NONE
+; CHECK: ('r_type3', 0x00)
+; CHECK-NEXT: ('r_type2', 0x00)
+; CHECK-NEXT: ('r_type', 0x15)
+
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @str, i64 0, i64 0))
+ ret i32 0
+
+}
+declare i32 @puts(i8* nocapture) nounwind
diff --git a/test/MC/Mips/elf-bigendian.ll b/test/MC/Mips/elf-bigendian.ll
index 71c69bb..7111deb 100644
--- a/test/MC/Mips/elf-bigendian.ll
+++ b/test/MC/Mips/elf-bigendian.ll
@@ -1,4 +1,6 @@
-; RUN: llc -filetype=obj -mtriple mips-unknown-linux %s -o - | elf-dump --dump-section-data | FileCheck %s
+; DISABLE: llc -filetype=obj -mtriple mips-unknown-linux %s -o - | elf-dump --dump-section-data | FileCheck %s
+; RUN: false
+; XFAIL: *
; Check that this is big endian.
; CHECK: ('e_indent[EI_DATA]', 0x02)
diff --git a/test/MC/Mips/elf-objdump.s b/test/MC/Mips/elf-objdump.s
new file mode 100644
index 0000000..6a5c2a5
--- /dev/null
+++ b/test/MC/Mips/elf-objdump.s
@@ -0,0 +1,11 @@
+// 32 bit big endian
+// RUN: llvm-mc -filetype=obj -triple mips-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux - | FileCheck %s
+// 32 bit little endian
+// RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux - | FileCheck %s
+// 64 bit big endian
+// RUN: llvm-mc -filetype=obj -arch=mips64 -triple mips64-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux - | FileCheck %s
+// 64 bit little endian
+// RUN: llvm-mc -filetype=obj -arch=mips64el -triple mips64el-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux - | FileCheck %s
+
+// We just want to see if llvm-objdump works at all.
+// CHECK: .text
diff --git a/test/MC/Mips/elf_basic.s b/test/MC/Mips/elf_basic.s
index 7a79fa0..ffc3b11 100644
--- a/test/MC/Mips/elf_basic.s
+++ b/test/MC/Mips/elf_basic.s
@@ -30,3 +30,6 @@
// CHECK-LE64: ('e_indent[EI_CLASS]', 0x02)
// This is little endian.
// CHECK-LE64: ('e_indent[EI_DATA]', 0x01)
+
+// Check that we are setting EI_OSABI to ELFOSABI_LINUX.
+// CHECK-LE64: ('e_indent[EI_OSABI]', 0x03)
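
A Python sketch of reading the e_ident fields these checks inspect (elf_ident is an illustrative helper, assuming Python 3):

def elf_ident(path):
    # e_ident layout: magic[0:4], EI_CLASS=4, EI_DATA=5, EI_VERSION=6, EI_OSABI=7.
    with open(path, "rb") as f:
        ident = f.read(16)
    assert ident[:4] == b"\x7fELF"
    return {"EI_CLASS": ident[4], "EI_DATA": ident[5], "EI_OSABI": ident[7]}

# Expected for the LE64 case above: ELFCLASS64 (2), ELFDATA2LSB (1),
# ELFOSABI_LINUX (3).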
diff --git a/test/MC/Mips/higher_highest.ll b/test/MC/Mips/higher_highest.ll
new file mode 100644
index 0000000..81a89e3
--- /dev/null
+++ b/test/MC/Mips/higher_highest.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s
+
+; Check that the R_MIPS_HIGHER and R_MIPS_HIGHEST relocations were created.
+
+; CHECK: ('r_type', 0x1d)
+; CHECK: ('r_type', 0x1d)
+; CHECK: ('r_type', 0x1c)
+; CHECK: ('r_type', 0x1c)
+
+@g0 = external global i32
+
+define void @foo1(i32 %s) nounwind {
+entry:
+
+ %tobool = icmp eq i32 %s, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %0 = load i32* @g0, align 4
+ %add = add nsw i32 %0, 12
+ store i32 %add, i32* @g0, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
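
A sketch of the 16-bit address pieces those relocations name (Python, illustrative; carry propagation between adjacent pieces is ignored here):

def addr_pieces(addr):
    return {
        "lo":      addr & 0xFFFF,
        "hi":      (addr >> 16) & 0xFFFF,
        "higher":  (addr >> 32) & 0xFFFF,  # R_MIPS_HIGHER  (0x1c)
        "highest": (addr >> 48) & 0xFFFF,  # R_MIPS_HIGHEST (0x1d)
    }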
diff --git a/test/MC/Mips/lea_64.ll b/test/MC/Mips/lea_64.ll
new file mode 100644
index 0000000..2e7a37b
--- /dev/null
+++ b/test/MC/Mips/lea_64.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el - \
+; RUN: | FileCheck %s
+
+@p = external global i32*
+
+define void @f1() nounwind {
+entry:
+; CHECK: .text:
+; CHECK-NOT: addiu {{[0-9,a-f]+}}, {{[0-9,a-f]+}}, {{[0-9]+}}
+
+ %a = alloca [10 x i32], align 4
+ %arraydecay = getelementptr inbounds [10 x i32]* %a, i64 0, i64 0
+ store i32* %arraydecay, i32** @p, align 8
+ ret void
+
+; CHECK: jr $ra
+}
diff --git a/test/MC/Mips/mips64shift.ll b/test/MC/Mips/mips64shift.ll
new file mode 100644
index 0000000..7817b96
--- /dev/null
+++ b/test/MC/Mips/mips64shift.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
+
+
+define i64 @f3(i64 %a0) nounwind readnone {
+entry:
+; CHECK: dsll ${{[0-9]+}}, ${{[0-9]+}}, 10
+ %shl = shl i64 %a0, 10
+ ret i64 %shl
+}
+
+define i64 @f4(i64 %a0) nounwind readnone {
+entry:
+; CHECK: dsra ${{[0-9]+}}, ${{[0-9]+}}, 10
+ %shr = ashr i64 %a0, 10
+ ret i64 %shr
+}
+
+define i64 @f5(i64 %a0) nounwind readnone {
+entry:
+; CHECK: dsrl ${{[0-9]+}}, ${{[0-9]+}}, 10
+ %shr = lshr i64 %a0, 10
+ ret i64 %shr
+}
+
+define i64 @f6(i64 %a0) nounwind readnone {
+entry:
+; CHECK: dsll32 ${{[0-9]+}}, ${{[0-9]+}}, 8
+ %shl = shl i64 %a0, 40
+ ret i64 %shl
+}
+
+define i64 @f7(i64 %a0) nounwind readnone {
+entry:
+; CHECK: dsra32 ${{[0-9]+}}, ${{[0-9]+}}, 8
+ %shr = ashr i64 %a0, 40
+ ret i64 %shr
+}
+
+define i64 @f8(i64 %a0) nounwind readnone {
+entry:
+; CHECK: dsrl32 ${{[0-9]+}}, ${{[0-9]+}}, 8
+ %shr = lshr i64 %a0, 40
+ ret i64 %shr
+}
+
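
The sa field of a MIPS shift instruction is only 5 bits, so constant shifts of 32 through 63 use the "32" variants with the amount reduced by 32, which is what the dsll32/dsra32/dsrl32 checks above expect. A tiny Python sketch (illustrative):

def mips64_shift(mnemonic, amount):
    # Shifts >= 32 use the "32" variant with sa = amount - 32.
    if amount < 32:
        return mnemonic, amount
    return mnemonic + "32", amount - 32

assert mips64_shift("dsll", 40) == ("dsll32", 8)
assert mips64_shift("dsll", 10) == ("dsll", 10)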
diff --git a/test/MC/Mips/multi-64bit-func.ll b/test/MC/Mips/multi-64bit-func.ll
new file mode 100644
index 0000000..6e0d784
--- /dev/null
+++ b/test/MC/Mips/multi-64bit-func.ll
@@ -0,0 +1,23 @@
+; There is no real check here. If the test doesn't
+; assert, it passes.
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s
+; Run it again without extra nop in delay slot
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -enable-mips-delay-filler < %s
+
+define i32 @bosco1(i32 %x) nounwind readnone {
+entry:
+ %inc = add i32 %x, 1
+ ret i32 %inc
+}
+
+define i32 @bosco2(i32 %x) nounwind readnone {
+entry:
+ %inc = add i32 %x, 1
+ ret i32 %inc
+}
+
+define i32 @bosco3(i32 %x) nounwind readnone {
+entry:
+ %inc = add i32 %x, 1
+ ret i32 %inc
+}
diff --git a/test/MC/Mips/r-mips-got-disp.ll b/test/MC/Mips/r-mips-got-disp.ll
new file mode 100644
index 0000000..73396ac
--- /dev/null
+++ b/test/MC/Mips/r-mips-got-disp.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s -o - | elf-dump --dump-section-data | FileCheck %s
+
+; Check that the R_MIPS_GOT_DISP relocations were created.
+
+; CHECK: ('r_type', 0x13)
+
+@shl = global i64 1, align 8
+@.str = private unnamed_addr constant [8 x i8] c"0x%llx\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i64* @shl, align 8
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i64 %0) nounwind
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
diff --git a/test/MC/Mips/sext_64_32.ll b/test/MC/Mips/sext_64_32.ll
new file mode 100644
index 0000000..e5c57b8
--- /dev/null
+++ b/test/MC/Mips/sext_64_32.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
+
+; Sign extension from 32 to 64 bits was creating nonsense opcodes.
+
+; CHECK: sll ${{[0-9]+}}, ${{[0-9]+}}, 0
+
+define i64 @foo(i32 %ival) nounwind readnone {
+entry:
+ %conv = sext i32 %ival to i64
+ ret i64 %conv
+}
+
+; CHECK: dsll32 ${{[0-9]+}}, ${{[0-9]+}}, 0
+
+define i64 @foo_2(i32 %ival_2) nounwind readnone {
+entry:
+ %conv_2 = zext i32 %ival_2 to i64
+ ret i64 %conv_2
+}
+
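
On MIPS64, 32-bit ALU results are kept sign-extended in 64-bit registers, so "sll rd, rs, 0" performs the sign extension, while zero extension is presumably done with a dsll32/dsrl32 pair (only the dsll32 half is checked above). A Python sketch of the two semantics (illustrative):

def sext32(x):
    x &= 0xFFFFFFFF
    return x - (1 << 32) if x & 0x80000000 else x

def zext32(x):
    # dsll32/dsrl32 by 0: shift left 32, then logical shift right 32.
    return ((x << 32) & 0xFFFFFFFFFFFFFFFF) >> 32

assert sext32(0xFFFFFFFF) == -1 and zext32(0xFFFFFFFF) == 0xFFFFFFFF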
diff --git a/test/MC/Mips/sym-offset.ll b/test/MC/Mips/sym-offset.ll
index 5939935..5162c91 100644
--- a/test/MC/Mips/sym-offset.ll
+++ b/test/MC/Mips/sym-offset.ll
@@ -1,4 +1,6 @@
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux %s -o - | elf-dump --dump-section-data | FileCheck %s
+; DISABLED: llc -filetype=obj -mtriple mipsel-unknown-linux %s -o - | elf-dump --dump-section-data | FileCheck %s
+; RUN: false
+; XFAIL: *
; FIXME: use assembler instead of llc when it becomes available.
diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s
index 7cd5677..7edd26a 100644
--- a/test/MC/X86/intel-syntax.s
+++ b/test/MC/X86/intel-syntax.s
@@ -63,4 +63,6 @@ _main:
mov ECX, DWORD PTR [4*ECX + _fnan]
// CHECK: movq %fs:320, %rax
mov RAX, QWORD PTR FS:[320]
+// CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm1
+ vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8
ret
diff --git a/test/MC/X86/x86-32-avx.s b/test/MC/X86/x86-32-avx.s
index e13a871..586f3fe 100644
--- a/test/MC/X86/x86-32-avx.s
+++ b/test/MC/X86/x86-32-avx.s
@@ -2603,11 +2603,11 @@
// CHECK: encoding: [0xc5,0xf9,0xe6,0xe9]
vcvttpd2dq %xmm1, %xmm5
-// CHECK: vcvttpd2dq %ymm2, %xmm5
+// CHECK: vcvttpd2dqy %ymm2, %xmm5
// CHECK: encoding: [0xc5,0xfd,0xe6,0xea]
vcvttpd2dq %ymm2, %xmm5
-// CHECK: vcvttpd2dqx %xmm1, %xmm5
+// CHECK: vcvttpd2dq %xmm1, %xmm5
// CHECK: encoding: [0xc5,0xf9,0xe6,0xe9]
vcvttpd2dqx %xmm1, %xmm5
@@ -2623,11 +2623,11 @@
// CHECK: encoding: [0xc5,0xfd,0xe6,0x08]
vcvttpd2dqy (%eax), %xmm1
-// CHECK: vcvtpd2ps %ymm2, %xmm5
+// CHECK: vcvtpd2psy %ymm2, %xmm5
// CHECK: encoding: [0xc5,0xfd,0x5a,0xea]
vcvtpd2ps %ymm2, %xmm5
-// CHECK: vcvtpd2psx %xmm1, %xmm5
+// CHECK: vcvtpd2ps %xmm1, %xmm5
// CHECK: encoding: [0xc5,0xf9,0x5a,0xe9]
vcvtpd2psx %xmm1, %xmm5
@@ -2643,7 +2643,7 @@
// CHECK: encoding: [0xc5,0xfd,0x5a,0x08]
vcvtpd2psy (%eax), %xmm1
-// CHECK: vcvtpd2dq %ymm2, %xmm5
+// CHECK: vcvtpd2dqy %ymm2, %xmm5
// CHECK: encoding: [0xc5,0xff,0xe6,0xea]
vcvtpd2dq %ymm2, %xmm5
@@ -2655,7 +2655,7 @@
// CHECK: encoding: [0xc5,0xff,0xe6,0x08]
vcvtpd2dqy (%eax), %xmm1
-// CHECK: vcvtpd2dqx %xmm1, %xmm5
+// CHECK: vcvtpd2dq %xmm1, %xmm5
// CHECK: encoding: [0xc5,0xfb,0xe6,0xe9]
vcvtpd2dqx %xmm1, %xmm5
@@ -3103,21 +3103,21 @@
// CHECK: encoding: [0xc5,0xf8,0x77]
vzeroupper
-// CHECK: vcvtsd2si %xmm4, %ecx
+// CHECK: vcvtsd2sil %xmm4, %ecx
// CHECK: encoding: [0xc5,0xfb,0x2d,0xcc]
- vcvtsd2si %xmm4, %ecx
+ vcvtsd2sil %xmm4, %ecx
-// CHECK: vcvtsd2si (%ecx), %ecx
+// CHECK: vcvtsd2sil (%ecx), %ecx
// CHECK: encoding: [0xc5,0xfb,0x2d,0x09]
- vcvtsd2si (%ecx), %ecx
+ vcvtsd2sil (%ecx), %ecx
-// CHECK: vcvtsi2sdl (%ebp), %xmm0, %xmm7
+// CHECK: vcvtsi2sd (%ebp), %xmm0, %xmm7
// CHECK: encoding: [0xc5,0xfb,0x2a,0x7d,0x00]
- vcvtsi2sdl (%ebp), %xmm0, %xmm7
+ vcvtsi2sd (%ebp), %xmm0, %xmm7
-// CHECK: vcvtsi2sdl (%esp), %xmm0, %xmm7
+// CHECK: vcvtsi2sd (%esp), %xmm0, %xmm7
// CHECK: encoding: [0xc5,0xfb,0x2a,0x3c,0x24]
- vcvtsi2sdl (%esp), %xmm0, %xmm7
+ vcvtsi2sd (%esp), %xmm0, %xmm7
// CHECK: vlddqu (%eax), %ymm2
// CHECK: encoding: [0xc5,0xff,0xf0,0x10]
diff --git a/test/MC/X86/x86-32-coverage.s b/test/MC/X86/x86-32-coverage.s
index 6c27b85..0824916 100644
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s
@@ -19626,3 +19626,29 @@
dppd $0x81, %xmm2, %xmm1
// CHECK: insertps $129, %xmm2, %xmm1
insertps $0x81, %xmm2, %xmm1
+
+// PR13253: handle the implicit optional third argument, which must always be %xmm0.
+// CHECK: pblendvb %xmm2, %xmm1
+pblendvb %xmm2, %xmm1
+// CHECK: pblendvb %xmm2, %xmm1
+pblendvb %xmm0, %xmm2, %xmm1
+// CHECK: pblendvb (%eax), %xmm1
+pblendvb (%eax), %xmm1
+// CHECK: pblendvb (%eax), %xmm1
+pblendvb %xmm0, (%eax), %xmm1
+// CHECK: blendvpd %xmm2, %xmm1
+blendvpd %xmm2, %xmm1
+// CHECK: blendvpd %xmm2, %xmm1
+blendvpd %xmm0, %xmm2, %xmm1
+// CHECK: blendvpd (%eax), %xmm1
+blendvpd (%eax), %xmm1
+// CHECK: blendvpd (%eax), %xmm1
+blendvpd %xmm0, (%eax), %xmm1
+// CHECK: blendvps %xmm2, %xmm1
+blendvps %xmm2, %xmm1
+// CHECK: blendvps %xmm2, %xmm1
+blendvps %xmm0, %xmm2, %xmm1
+// CHECK: blendvps (%eax), %xmm1
+blendvps (%eax), %xmm1
+// CHECK: blendvps (%eax), %xmm1
+blendvps %xmm0, (%eax), %xmm1
diff --git a/test/MC/X86/x86_64-avx-encoding.s b/test/MC/X86/x86_64-avx-encoding.s
index bd5559a..46ff9ea 100644
--- a/test/MC/X86/x86_64-avx-encoding.s
+++ b/test/MC/X86/x86_64-avx-encoding.s
@@ -3368,11 +3368,11 @@ vdivpd -4(%rcx,%rbx,8), %xmm10, %xmm11
// CHECK: encoding: [0xc4,0x41,0x79,0xe6,0xd3]
vcvttpd2dq %xmm11, %xmm10
-// CHECK: vcvttpd2dq %ymm12, %xmm10
+// CHECK: vcvttpd2dqy %ymm12, %xmm10
// CHECK: encoding: [0xc4,0x41,0x7d,0xe6,0xd4]
vcvttpd2dq %ymm12, %xmm10
-// CHECK: vcvttpd2dqx %xmm11, %xmm10
+// CHECK: vcvttpd2dq %xmm11, %xmm10
// CHECK: encoding: [0xc4,0x41,0x79,0xe6,0xd3]
vcvttpd2dqx %xmm11, %xmm10
@@ -3388,11 +3388,11 @@ vdivpd -4(%rcx,%rbx,8), %xmm10, %xmm11
// CHECK: encoding: [0xc5,0x7d,0xe6,0x18]
vcvttpd2dqy (%rax), %xmm11
-// CHECK: vcvtpd2ps %ymm12, %xmm10
+// CHECK: vcvtpd2psy %ymm12, %xmm10
// CHECK: encoding: [0xc4,0x41,0x7d,0x5a,0xd4]
vcvtpd2ps %ymm12, %xmm10
-// CHECK: vcvtpd2psx %xmm11, %xmm10
+// CHECK: vcvtpd2ps %xmm11, %xmm10
// CHECK: encoding: [0xc4,0x41,0x79,0x5a,0xd3]
vcvtpd2psx %xmm11, %xmm10
@@ -3408,7 +3408,7 @@ vdivpd -4(%rcx,%rbx,8), %xmm10, %xmm11
// CHECK: encoding: [0xc5,0x7d,0x5a,0x18]
vcvtpd2psy (%rax), %xmm11
-// CHECK: vcvtpd2dq %ymm12, %xmm10
+// CHECK: vcvtpd2dqy %ymm12, %xmm10
// CHECK: encoding: [0xc4,0x41,0x7f,0xe6,0xd4]
vcvtpd2dq %ymm12, %xmm10
@@ -3420,7 +3420,7 @@ vdivpd -4(%rcx,%rbx,8), %xmm10, %xmm11
// CHECK: encoding: [0xc5,0x7f,0xe6,0x18]
vcvtpd2dqy (%rax), %xmm11
-// CHECK: vcvtpd2dqx %xmm11, %xmm10
+// CHECK: vcvtpd2dq %xmm11, %xmm10
// CHECK: encoding: [0xc4,0x41,0x7b,0xe6,0xd3]
vcvtpd2dqx %xmm11, %xmm10
@@ -3860,29 +3860,29 @@ vdivpd -4(%rcx,%rbx,8), %xmm10, %xmm11
// CHECK: encoding: [0xc4,0x63,0x2d,0x06,0x18,0x07]
vperm2f128 $7, (%rax), %ymm10, %ymm11
-// CHECK: vcvtsd2si %xmm8, %r8d
+// CHECK: vcvtsd2sil %xmm8, %r8d
// CHECK: encoding: [0xc4,0x41,0x7b,0x2d,0xc0]
- vcvtsd2si %xmm8, %r8d
+ vcvtsd2sil %xmm8, %r8d
-// CHECK: vcvtsd2si (%rcx), %ecx
+// CHECK: vcvtsd2sil (%rcx), %ecx
// CHECK: encoding: [0xc5,0xfb,0x2d,0x09]
- vcvtsd2si (%rcx), %ecx
+ vcvtsd2sil (%rcx), %ecx
-// CHECK: vcvtss2si %xmm4, %rcx
+// CHECK: vcvtss2siq %xmm4, %rcx
// CHECK: encoding: [0xc4,0xe1,0xfa,0x2d,0xcc]
- vcvtss2si %xmm4, %rcx
+ vcvtss2siq %xmm4, %rcx
-// CHECK: vcvtss2si (%rcx), %r8
+// CHECK: vcvtss2siq (%rcx), %r8
// CHECK: encoding: [0xc4,0x61,0xfa,0x2d,0x01]
- vcvtss2si (%rcx), %r8
+ vcvtss2siq (%rcx), %r8
-// CHECK: vcvtsi2sdl %r8d, %xmm8, %xmm15
+// CHECK: vcvtsi2sd %r8d, %xmm8, %xmm15
// CHECK: encoding: [0xc4,0x41,0x3b,0x2a,0xf8]
- vcvtsi2sdl %r8d, %xmm8, %xmm15
+ vcvtsi2sd %r8d, %xmm8, %xmm15
-// CHECK: vcvtsi2sdl (%rbp), %xmm8, %xmm15
+// CHECK: vcvtsi2sd (%rbp), %xmm8, %xmm15
// CHECK: encoding: [0xc5,0x3b,0x2a,0x7d,0x00]
- vcvtsi2sdl (%rbp), %xmm8, %xmm15
+ vcvtsi2sd (%rbp), %xmm8, %xmm15
// CHECK: vcvtsi2sdq %rcx, %xmm4, %xmm6
// CHECK: encoding: [0xc4,0xe1,0xdb,0x2a,0xf1]
@@ -3900,21 +3900,21 @@ vdivpd -4(%rcx,%rbx,8), %xmm10, %xmm11
// CHECK: encoding: [0xc4,0xe1,0xda,0x2a,0x31]
vcvtsi2ssq (%rcx), %xmm4, %xmm6
-// CHECK: vcvttsd2si %xmm4, %rcx
+// CHECK: vcvttsd2siq %xmm4, %rcx
// CHECK: encoding: [0xc4,0xe1,0xfb,0x2c,0xcc]
- vcvttsd2si %xmm4, %rcx
+ vcvttsd2siq %xmm4, %rcx
-// CHECK: vcvttsd2si (%rcx), %rcx
+// CHECK: vcvttsd2siq (%rcx), %rcx
// CHECK: encoding: [0xc4,0xe1,0xfb,0x2c,0x09]
- vcvttsd2si (%rcx), %rcx
+ vcvttsd2siq (%rcx), %rcx
-// CHECK: vcvttss2si %xmm4, %rcx
+// CHECK: vcvttss2siq %xmm4, %rcx
// CHECK: encoding: [0xc4,0xe1,0xfa,0x2c,0xcc]
- vcvttss2si %xmm4, %rcx
+ vcvttss2siq %xmm4, %rcx
-// CHECK: vcvttss2si (%rcx), %rcx
+// CHECK: vcvttss2siq (%rcx), %rcx
// CHECK: encoding: [0xc4,0xe1,0xfa,0x2c,0x09]
- vcvttss2si (%rcx), %rcx
+ vcvttss2siq (%rcx), %rcx
// CHECK: vlddqu (%rax), %ymm12
// CHECK: encoding: [0xc5,0x7f,0xf0,0x20]
@@ -4121,3 +4121,67 @@ _foo:
_foo2:
nop
vblendvps %ymm1, _foo2(%rip), %ymm0, %ymm0
+
+// CHECK: vgatherdpd %xmm0, (%rdi,%xmm1,2), %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xf9,0x92,0x14,0x4f]
+ vgatherdpd %xmm0, (%rdi,%xmm1,2), %xmm2
+
+// CHECK: vgatherqpd %xmm0, (%rdi,%xmm1,2), %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xf9,0x93,0x14,0x4f]
+ vgatherqpd %xmm0, (%rdi,%xmm1,2), %xmm2
+
+// CHECK: vgatherdpd %ymm0, (%rdi,%xmm1,2), %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xfd,0x92,0x14,0x4f]
+ vgatherdpd %ymm0, (%rdi,%xmm1,2), %ymm2
+
+// CHECK: vgatherqpd %ymm0, (%rdi,%ymm1,2), %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xfd,0x93,0x14,0x4f]
+ vgatherqpd %ymm0, (%rdi,%ymm1,2), %ymm2
+
+// CHECK: vgatherdps %xmm8, (%r15,%xmm9,2), %xmm10
+// CHECK: encoding: [0xc4,0x02,0x39,0x92,0x14,0x4f]
+ vgatherdps %xmm8, (%r15,%xmm9,2), %xmm10
+
+// CHECK: vgatherqps %xmm8, (%r15,%xmm9,2), %xmm10
+// CHECK: encoding: [0xc4,0x02,0x39,0x93,0x14,0x4f]
+ vgatherqps %xmm8, (%r15,%xmm9,2), %xmm10
+
+// CHECK: vgatherdps %ymm8, (%r15,%ymm9,2), %ymm10
+// CHECK: encoding: [0xc4,0x02,0x3d,0x92,0x14,0x4f]
+ vgatherdps %ymm8, (%r15,%ymm9,2), %ymm10
+
+// CHECK: vgatherqps %xmm8, (%r15,%ymm9,2), %xmm10
+// CHECK: encoding: [0xc4,0x02,0x3d,0x93,0x14,0x4f]
+ vgatherqps %xmm8, (%r15,%ymm9,2), %xmm10
+
+// CHECK: vpgatherdq %xmm0, (%rdi,%xmm1,2), %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xf9,0x90,0x14,0x4f]
+ vpgatherdq %xmm0, (%rdi,%xmm1,2), %xmm2
+
+// CHECK: vpgatherqq %xmm0, (%rdi,%xmm1,2), %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xf9,0x91,0x14,0x4f]
+ vpgatherqq %xmm0, (%rdi,%xmm1,2), %xmm2
+
+// CHECK: vpgatherdq %ymm0, (%rdi,%xmm1,2), %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xfd,0x90,0x14,0x4f]
+ vpgatherdq %ymm0, (%rdi,%xmm1,2), %ymm2
+
+// CHECK: vpgatherqq %ymm0, (%rdi,%ymm1,2), %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xfd,0x91,0x14,0x4f]
+ vpgatherqq %ymm0, (%rdi,%ymm1,2), %ymm2
+
+// CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm10
+// CHECK: encoding: [0xc4,0x02,0x39,0x90,0x14,0x4f]
+ vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm10
+
+// CHECK: vpgatherqd %xmm8, (%r15,%xmm9,2), %xmm10
+// CHECK: encoding: [0xc4,0x02,0x39,0x91,0x14,0x4f]
+ vpgatherqd %xmm8, (%r15,%xmm9,2), %xmm10
+
+// CHECK: vpgatherdd %ymm8, (%r15,%ymm9,2), %ymm10
+// CHECK: encoding: [0xc4,0x02,0x3d,0x90,0x14,0x4f]
+ vpgatherdd %ymm8, (%r15,%ymm9,2), %ymm10
+
+// CHECK: vpgatherqd %xmm8, (%r15,%ymm9,2), %xmm10
+// CHECK: encoding: [0xc4,0x02,0x3d,0x91,0x14,0x4f]
+ vpgatherqd %xmm8, (%r15,%ymm9,2), %xmm10
diff --git a/test/MC/X86/x86_64-sse4a.s b/test/MC/X86/x86_64-sse4a.s
new file mode 100644
index 0000000..e5ed69e
--- /dev/null
+++ b/test/MC/X86/x86_64-sse4a.s
@@ -0,0 +1,25 @@
+# RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+extrq $2, $3, %xmm0
+# CHECK: extrq $2, $3, %xmm0
+# CHECK: encoding: [0x66,0x0f,0x78,0xc0,0x03,0x02]
+
+extrq %xmm1, %xmm0
+# CHECK: extrq %xmm1, %xmm0
+# CHECK: encoding: [0x66,0x0f,0x79,0xc1]
+
+insertq $6, $5, %xmm1, %xmm0
+# CHECK: insertq $6, $5, %xmm1, %xmm0
+# CHECK: encoding: [0xf2,0x0f,0x78,0xc1,0x05,0x06]
+
+insertq %xmm1, %xmm0
+# CHECK: insertq %xmm1, %xmm0
+# CHECK: encoding: [0xf2,0x0f,0x79,0xc1]
+
+movntsd %xmm0, (%rdi)
+# CHECK: movntsd %xmm0, (%rdi)
+# CHECK: encoding: [0xf2,0x0f,0x2b,0x07]
+
+movntss %xmm0, (%rdi)
+# CHECK: movntss %xmm0, (%rdi)
+# CHECK: encoding: [0xf3,0x0f,0x2b,0x07]
diff --git a/test/Makefile b/test/Makefile
index a4e53f8..9ddfabf 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -12,9 +12,6 @@ DIRS =
all:: check-local
-# 'lit' is the default test runner.
-check-local:: check-local-lit
-
# Include other test rules
include Makefile.tests
@@ -27,7 +24,6 @@ $(warning GREP_OPTIONS environment variable may interfere with test results)
endif
ifdef VERBOSE
-RUNTESTFLAGS := $(VERBOSE)
LIT_ARGS := -v
else
LIT_ARGS := -s -v
@@ -42,7 +38,6 @@ ifdef TESTSUITE
LIT_TESTSUITE := $(TESTSUITE)
CLEANED_TESTSUITE := $(patsubst %/,%,$(TESTSUITE))
CLEANED_TESTSUITE := $(patsubst test/%,%,$(CLEANED_TESTSUITE))
-RUNTESTFLAGS += --tool $(CLEANED_TESTSUITE)
else
LIT_TESTSUITE := .
endif
@@ -54,8 +49,8 @@ endif
# Check what to run for -all.
LIT_ALL_TESTSUITES := $(LIT_TESTSUITE)
-extra-lit-site-cfgs::
-.PHONY: extra-lit-site-cfgs
+extra-site-cfgs::
+.PHONY: extra-site-cfgs
ifneq ($(strip $(filter check-local-all,$(MAKECMDGOALS))),)
ifndef TESTSUITE
@@ -63,21 +58,20 @@ ifeq ($(shell test -f $(PROJ_OBJ_DIR)/../tools/clang/Makefile && echo OK), OK)
LIT_ALL_TESTSUITES += $(PROJ_OBJ_DIR)/../tools/clang/test
# Force creation of Clang's lit.site.cfg.
-clang-lit-site-cfg: FORCE
+clang-site-cfg: FORCE
$(MAKE) -C $(PROJ_OBJ_DIR)/../tools/clang/test lit.site.cfg Unit/lit.site.cfg
-extra-lit-site-cfgs:: clang-lit-site-cfg
-endif
-endif
+extra-site-cfgs:: clang-site-cfg
endif
-IGNORE_TESTS :=
+ifeq ($(shell test -f $(PROJ_OBJ_DIR)/../tools/clang/tools/extra/Makefile && echo OK), OK)
+LIT_ALL_TESTSUITES += $(PROJ_OBJ_DIR)/../tools/clang/tools/extra/test
-ifndef RUNLLVM2CPP
-IGNORE_TESTS += llvm2cpp.exp
+# Force creation of Clang Tools' lit.site.cfg.
+clang-tools-site-cfg: FORCE
+ $(MAKE) -C $(PROJ_OBJ_DIR)/../tools/clang/tools/extra/test lit.site.cfg
+extra-site-cfgs:: clang-tools-site-cfg
+endif
endif
-
-ifdef IGNORE_TESTS
-RUNTESTFLAGS += --ignore "$(strip $(IGNORE_TESTS))"
endif
# ulimits like these are redundantly enforced by the buildbots, so
@@ -94,21 +88,14 @@ ULIMIT=ulimit -t 600 ; ulimit -d 512000 ; ulimit -m 512000 ; ulimit -v 1024000 ;
endif # AuroraUX
endif # SunOS
-ifneq ($(RUNTEST),)
-check-local-dg:: site.exp
- ( $(ULIMIT) \
- PATH="$(LLVMToolDir):$(LLVM_SRC_ROOT)/test/Scripts:$(LLVMGCCDIR)/bin:$(PATH)" \
- $(RUNTEST) $(RUNTESTFLAGS) )
-else
-check-local-dg:: site.exp
- @echo "*** dejagnu not found. Make sure 'runtest' is in your PATH, then reconfigure LLVM."
-endif
-
-check-local-lit:: lit.site.cfg Unit/lit.site.cfg
+check-local:: lit.site.cfg Unit/lit.site.cfg
( $(ULIMIT) \
$(LLVM_SRC_ROOT)/utils/lit/lit.py $(LIT_ARGS) $(LIT_TESTSUITE) )
-check-local-all:: lit.site.cfg Unit/lit.site.cfg extra-lit-site-cfgs
+# This is a legacy alias dating from when both DejaGNU and lit were in use.
+check-local-lit:: check-local
+
+check-local-all:: lit.site.cfg Unit/lit.site.cfg extra-site-cfgs
( $(ULIMIT) \
$(LLVM_SRC_ROOT)/utils/lit/lit.py $(LIT_ARGS) $(LIT_ALL_TESTSUITES) )
@@ -129,48 +116,28 @@ endif
FORCE:
-site.exp: FORCE
- @echo 'Making a new site.exp file...'
- @echo '## Autogenerated by LLVM configuration.' > site.tmp
- @echo '# Do not edit!' >> site.tmp
- @echo 'set target_triplet "$(TARGET_TRIPLE)"' >> site.tmp
- @echo 'set TARGETS_TO_BUILD "$(TARGETS_TO_BUILD)"' >> site.tmp
- @echo 'set llvmshlibdir "$(SharedLibDir)"' >>site.tmp
- @echo 'set llvm_bindings "$(BINDINGS_TO_BUILD)"' >> site.tmp
- @echo 'set srcroot "$(LLVM_SRC_ROOT)"' >>site.tmp
- @echo 'set objroot "$(LLVM_OBJ_ROOT)"' >>site.tmp
- @echo 'set srcdir "$(LLVM_SRC_ROOT)/test"' >>site.tmp
- @echo 'set objdir "$(LLVM_OBJ_ROOT)/test"' >>site.tmp
- @echo 'set link "' $(CXX) $(CPP.Flags) $(CXX.Flags) $(TargetCommonOpts) $(CompileCommonOpts) $(LD.Flags) '"' >>site.tmp
- @echo 'set shlibext "$(SHLIBEXT)"' >> site.tmp
- @echo 'set ocamlopt "$(OCAMLOPT) -cc \"$(CXX_FOR_OCAMLOPT)\" -I $(LibDir)/ocaml"' >> site.tmp
- @echo 'set valgrind "$(VALGRIND)"' >> site.tmp
- @echo 'set grep "$(GREP)"' >>site.tmp
- @echo 'set gas "$(GAS)"' >>site.tmp
- @echo '## All variables above are generated by configure. Do Not Edit ## ' >>site.tmp
- @test ! -f site.exp || \
- sed '1,/^## All variables above are.*##/ d' site.exp >> site.tmp
- @-rm -f site.bak
- @test ! -f site.exp || mv site.exp site.bak
- @mv site.tmp site.exp
-
ifeq ($(DISABLE_ASSERTIONS),1)
ENABLE_ASSERTIONS=0
else
ENABLE_ASSERTIONS=1
endif
-lit.site.cfg: site.exp
+lit.site.cfg: FORCE
@echo "Making LLVM 'lit.site.cfg' file..."
- @$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g > lit.tmp
+ @$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g > lit.tmp
+ @$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g >> lit.tmp
@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> lit.tmp
@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
- @$(ECHOPATH) s=@LLVMGCCDIR@=$(LLVMGCCDIR)=g >> lit.tmp
+ @$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp
+ @$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp
@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=python=g >> lit.tmp
+ @$(ECHOPATH) s,@OCAMLOPT@,$(OCAMLOPT) -cc \\\\\"$(CXX_FOR_OCAMLOPT)\\\\\" -I $(LibDir)/ocaml,g >> lit.tmp
@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
@$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp
@$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp
@$(ECHOPATH) s=@LLVM_BINDINGS@=$(BINDINGS_TO_BUILD)=g >> lit.tmp
+ @$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp
+ @$(ECHOPATH) s=@HOST_ARCH@=$(HOST_ARCH)=g >> lit.tmp
@sed -f lit.tmp $(PROJ_SRC_DIR)/lit.site.cfg.in > $@
@-rm -f lit.tmp
@@ -179,10 +146,11 @@ Unit/lit.site.cfg: $(PROJ_OBJ_DIR)/Unit/.dir FORCE
@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g > unit.tmp
@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> unit.tmp
@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> unit.tmp
- @$(ECHOPATH) s=@LLVMGCCDIR@=$(LLVMGCCDIR)=g >> unit.tmp
@$(ECHOPATH) s=@LLVM_BUILD_MODE@=$(BuildMode)=g >> unit.tmp
@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> unit.tmp
@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> unit.tmp
@$(ECHOPATH) s=@SHLIBPATH_VAR@=$(SHLIBPATH_VAR)=g >> unit.tmp
+ @$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> unit.tmp
+ @$(ECHOPATH) s=@HOST_ARCH@=$(HOST_ARCH)=g >> lit.tmp
@sed -f unit.tmp $(PROJ_SRC_DIR)/Unit/lit.site.cfg.in > $@
@-rm -f unit.tmp
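
With site.exp and the DejaGNU path removed, check-local boils down to a single lit invocation; a Python sketch of the equivalent manual run (assuming the current directory is an in-tree build root and lit.site.cfg has already been generated):

import subprocess, sys

# Equivalent of "make check-local" with the LIT_ARGS above.
subprocess.check_call([sys.executable, "utils/lit/lit.py", "-s", "-v", "test"])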
diff --git a/test/Object/Inputs/COFF/i386.yaml b/test/Object/Inputs/COFF/i386.yaml
new file mode 100644
index 0000000..ca90222
--- /dev/null
+++ b/test/Object/Inputs/COFF/i386.yaml
@@ -0,0 +1,83 @@
+header: !Header
+ Machine: IMAGE_FILE_MACHINE_I386 # (0x14c)
+
+sections:
+ - !Section
+ Name: .text
+ Characteristics: [IMAGE_SCN_CNT_CODE, IMAGE_SCN_ALIGN_16BYTES, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ, ] # 0x60500020
+ SectionData: !hex "83EC0CC744240800000000C7042400000000E800000000E8000000008B44240883C40CC3" # |....D$.......$...............D$.....|
+
+ Relocations:
+ - !Relocation
+ VirtualAddress: 0xe
+ SymbolTableIndex: 5
+ Type: IMAGE_REL_I386_DIR32
+
+ - !Relocation
+ VirtualAddress: 0x13
+ SymbolTableIndex: 6
+ Type: IMAGE_REL_I386_REL32
+
+ - !Relocation
+ VirtualAddress: 0x18
+ SymbolTableIndex: 7
+ Type: IMAGE_REL_I386_REL32
+
+ - !Section
+ Name: .data
+ Characteristics: [IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_ALIGN_1BYTES, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE, ] # 0xc0100040
+ SectionData: !hex "48656C6C6F20576F726C642100" # |Hello World!.|
+
+symbols:
+ - !Symbol
+ Name: .text
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
+ NumberOfAuxSymbols: 1
+ AuxillaryData: !hex "240000000300000000000000010000000000" # |$.................|
+
+ - !Symbol
+ Name: .data
+ Value: 0
+ SectionNumber: 2
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
+ NumberOfAuxSymbols: 1
+ AuxillaryData: !hex "0D0000000000000000000000020000000000" # |..................|
+
+ - !Symbol
+ Name: _main
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+
+ - !Symbol
+ Name: L_.str
+ Value: 0
+ SectionNumber: 2
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
+
+ - !Symbol
+ Name: _puts
+ Value: 0
+ SectionNumber: 0
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+
+ - !Symbol
+ Name: _SomeOtherFunction
+ Value: 0
+ SectionNumber: 0
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+
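A note on the SymbolTableIndex values above: in a COFF symbol table every auxiliary record occupies an index slot of its own, so with one aux record after each of .text and .data, indices 5, 6 and 7 fall on L_.str, _puts and _SomeOtherFunction. A quick Python check (a sketch, not part of the patch):

    # Map symbol names to their COFF symbol-table indices, counting
    # each auxiliary record as one occupied slot.
    def symbol_indices(symbols):
        indices, i = {}, 0
        for name, n_aux in symbols:
            indices[name] = i
            i += 1 + n_aux
        return indices

    idx = symbol_indices([(".text", 1), (".data", 1), ("_main", 0),
                          ("L_.str", 0), ("_puts", 0), ("_SomeOtherFunction", 0)])
    assert idx["L_.str"] == 5 and idx["_puts"] == 6
    assert idx["_SomeOtherFunction"] == 7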
diff --git a/test/Object/Inputs/COFF/x86-64.yaml b/test/Object/Inputs/COFF/x86-64.yaml
new file mode 100644
index 0000000..0b1265f
--- /dev/null
+++ b/test/Object/Inputs/COFF/x86-64.yaml
@@ -0,0 +1,83 @@
+header: !Header
+ Machine: IMAGE_FILE_MACHINE_AMD64 # (0x8664)
+
+sections:
+ - !Section
+ Name: .text
+ Characteristics: [IMAGE_SCN_CNT_CODE, IMAGE_SCN_ALIGN_16BYTES, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ, ] # 0x60500020
+ SectionData: !hex "4883EC28C744242400000000488D0D00000000E800000000E8000000008B4424244883C428C3" # |H..(.D$$....H.................D$$H..(.|
+
+ Relocations:
+ - !Relocation
+ VirtualAddress: 0xf
+ SymbolTableIndex: 5
+ Type: IMAGE_REL_AMD64_REL32
+
+ - !Relocation
+ VirtualAddress: 0x14
+ SymbolTableIndex: 6
+ Type: IMAGE_REL_AMD64_REL32
+
+ - !Relocation
+ VirtualAddress: 0x19
+ SymbolTableIndex: 7
+ Type: IMAGE_REL_AMD64_REL32
+
+ - !Section
+ Name: .data
+ Characteristics: [IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_ALIGN_1BYTES, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE, ] # 0xc0100040
+ SectionData: !hex "48656C6C6F20576F726C642100" # |Hello World!.|
+
+symbols:
+ - !Symbol
+ Name: .text
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
+ NumberOfAuxSymbols: 1
+ AuxillaryData: !hex "260000000300000000000000010000000000" # |&.................|
+
+ - !Symbol
+ Name: .data
+ Value: 0
+ SectionNumber: 2
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
+ NumberOfAuxSymbols: 1
+ AuxillaryData: !hex "0D0000000000000000000000020000000000" # |..................|
+
+ - !Symbol
+ Name: main
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+
+ - !Symbol
+ Name: L.str
+ Value: 0
+ SectionNumber: 2
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
+
+ - !Symbol
+ Name: puts
+ Value: 0
+ SectionNumber: 0
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+
+ - !Symbol
+ Name: SomeOtherFunction
+ Value: 0
+ SectionNumber: 0
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+
diff --git a/test/Object/Inputs/trivial-object-test.coff-i386 b/test/Object/Inputs/trivial-object-test.coff-i386
index 8cfd994..d4ab63b 100644
--- a/test/Object/Inputs/trivial-object-test.coff-i386
+++ b/test/Object/Inputs/trivial-object-test.coff-i386
Binary files differ
diff --git a/test/Object/Inputs/trivial-object-test.elf-hexagon b/test/Object/Inputs/trivial-object-test.elf-hexagon
new file mode 100644
index 0000000..566fa30
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test.elf-hexagon
Binary files differ
diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index e5635ab..8fd1c04 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test
@@ -1,7 +1,7 @@
-RUN: llvm-nm %p/Inputs/trivial-object-test.coff-i386 \
-RUN: | FileCheck %s -check-prefix COFF
-RUN: llvm-nm %p/Inputs/trivial-object-test.coff-x86-64 \
+RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-nm \
RUN: | FileCheck %s -check-prefix COFF
+RUN: yaml2obj %p/Inputs/COFF/x86-64.yaml | llvm-nm \
+RUN: | FileCheck %s -check-prefix COFF
RUN: llvm-nm %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF
RUN: llvm-nm %p/Inputs/trivial-object-test.elf-x86-64 \
@@ -30,4 +30,4 @@ macho: 00000000 U _puts
macho64: 00000028 s L_.str
macho64: 00000000 u _SomeOtherFunction
macho64: 00000000 s _main
-macho64: 00000000 u _puts \ No newline at end of file
+macho64: 00000000 u _puts
diff --git a/test/Object/objdump-file-header.test b/test/Object/objdump-file-header.test
index 3fce3f4..a552113 100644
--- a/test/Object/objdump-file-header.test
+++ b/test/Object/objdump-file-header.test
@@ -1,5 +1,4 @@
-RUN: llvm-objdump -f %p/Inputs/trivial-object-test.coff-i386 \
-RUN: | FileCheck %s -check-prefix COFF-i386
+RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-objdump -f - | FileCheck %s -check-prefix COFF-i386
RUN: llvm-objdump -f %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF-i386
diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index c4b564e..a394a23 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test
@@ -6,6 +6,8 @@ RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF-i386
RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-x86-64 \
RUN: | FileCheck %s -check-prefix ELF-x86-64
+RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-hexagon \
+RUN: | FileCheck %s -check-prefix ELF-hexagon
COFF-i386: .text
COFF-i386: IMAGE_REL_I386_DIR32 L_.str
@@ -26,3 +28,11 @@ ELF-x86-64: .text
ELF-x86-64: R_X86_64_32S .rodata.str1.1
ELF-x86-64: R_X86_64_PC32 puts
ELF-x86-64: R_X86_64_PC32 SomeOtherFunction
+
+ELF-hexagon: .text
+ELF-hexagon: R_HEX_GOTREL_HI16 .main
+ELF-hexagon: R_HEX_GOTREL_LO16 .main
+ELF-hexagon: R_HEX_HI16 puts
+ELF-hexagon: R_HEX_LO16 puts
+ELF-hexagon: R_HEX_B15_PCREL testf
+ELF-hexagon: R_HEX_B22_PCREL puts
diff --git a/test/Object/objdump-section-content.test b/test/Object/objdump-section-content.test
index 581e75e..f9c4f43 100644
--- a/test/Object/objdump-section-content.test
+++ b/test/Object/objdump-section-content.test
@@ -1,9 +1,8 @@
-RUN: llvm-objdump -s %p/Inputs/trivial-object-test.coff-i386 \
-RUN: | FileCheck %s -check-prefix COFF-i386
+RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-objdump -s - | FileCheck %s -check-prefix COFF-i386
RUN: llvm-objdump -s %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF-i386
-COFF-i386: trivial-object-test.coff-i386: file format
+COFF-i386: file format
COFF-i386: Contents of section .text:
COFF-i386: 0000 83ec0cc7 44240800 000000c7 04240000 ....D$.......$..
COFF-i386: 0010 0000e800 000000e8 00000000 8b442408 .............D$.
diff --git a/test/Object/objdump-symbol-table.test b/test/Object/objdump-symbol-table.test
index 8a0f440..989ec04 100644
--- a/test/Object/objdump-symbol-table.test
+++ b/test/Object/objdump-symbol-table.test
@@ -1,17 +1,17 @@
-RUN: llvm-objdump -t %p/Inputs/trivial-object-test.coff-i386 \
+RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-objdump -t - \
RUN: | FileCheck %s -check-prefix COFF-i386
RUN: llvm-objdump -t %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF-i386
RUN: llvm-objdump -t %p/Inputs/trivial-object-test.macho-i386 \
RUN: | FileCheck %s -check-prefix macho-i386
-COFF-i386: trivial-object-test.coff-i386: file format
+COFF-i386: file format
COFF-i386: SYMBOL TABLE:
COFF-i386: [ 0](sec 1)(fl 0x00)(ty 0)(scl 3) (nx 1) 0x00000000 .text
COFF-i386: AUX scnlen 0x24 nreloc 3 nlnno 0 checksum 0x0 assoc 1 comdat 0
COFF-i386: [ 2](sec 2)(fl 0x00)(ty 0)(scl 3) (nx 1) 0x00000000 .data
COFF-i386: AUX scnlen 0xd nreloc 0 nlnno 0 checksum 0x0 assoc 2 comdat 0
-COFF-i386: [ 4](sec 1)(fl 0x00)(ty 200)(scl 2) (nx 0) 0x00000000 _main
+COFF-i386: [ 4](sec 1)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 _main
COFF-i386: [ 5](sec 2)(fl 0x00)(ty 0)(scl 3) (nx 0) 0x00000000 L_.str
COFF-i386: [ 6](sec 0)(fl 0x00)(ty 0)(scl 2) (nx 0) 0x00000000 _puts
COFF-i386: [ 7](sec 0)(fl 0x00)(ty 0)(scl 2) (nx 0) 0x00000000 _SomeOtherFunction
@@ -30,4 +30,4 @@ macho-i386: trivial-object-test.macho-i386: file format Mach-O 32-bit i38
macho-i386: SYMBOL TABLE:
macho-i386: 00000000 g F __TEXT,__text 00000024 _main
macho-i386: 00000000 *UND* 00000000 _SomeOtherFunction
-macho-i386: 00000000 *UND* 00000000 _puts \ No newline at end of file
+macho-i386: 00000000 *UND* 00000000 _puts
diff --git a/test/Other/2003-02-19-LoopInfoNestingBug.ll b/test/Other/2003-02-19-LoopInfoNestingBug.ll
index 13f8351..b807c44 100644
--- a/test/Other/2003-02-19-LoopInfoNestingBug.ll
+++ b/test/Other/2003-02-19-LoopInfoNestingBug.ll
@@ -3,7 +3,7 @@
; and instead nests it just inside loop "Top"
;
; RUN: opt < %s -analyze -loops | \
-; RUN: grep { Loop at depth 3 containing: %Inner<header><latch><exiting>}
+; RUN: grep " Loop at depth 3 containing: %Inner<header><latch><exiting>"
;
define void @test() {
br label %Top
diff --git a/test/Other/2008-10-15-MissingSpace.ll b/test/Other/2008-10-15-MissingSpace.ll
index d16ea72..cac696e 100644
--- a/test/Other/2008-10-15-MissingSpace.ll
+++ b/test/Other/2008-10-15-MissingSpace.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis | not grep {void@}
+; RUN: llvm-as < %s | llvm-dis | not grep "void@"
; PR2894
declare void @g()
define void @f() {
diff --git a/test/Other/close-stderr.ll b/test/Other/close-stderr.ll
index 40a01cc..1d207c7 100644
--- a/test/Other/close-stderr.ll
+++ b/test/Other/close-stderr.ll
@@ -1,7 +1,5 @@
-; RUN: sh -c "\
-; RUN: opt --reject-this-option 2>&-; echo \$?; \
-; RUN: opt -o /dev/null /dev/null 2>&-; echo \$?; \
-; RUN: " | FileCheck %s
+; RUN: sh -c 'opt --reject-this-option 2>&-; echo $?; opt -o /dev/null /dev/null 2>&-; echo $?;' \
+; RUN: | FileCheck %s
; CHECK: {{^1$}}
; CHECK: {{^0$}}
; XFAIL: vg_leak
diff --git a/test/Other/constant-fold-gep.ll b/test/Other/constant-fold-gep.ll
index d28c178..eafb16e 100644
--- a/test/Other/constant-fold-gep.ll
+++ b/test/Other/constant-fold-gep.ll
@@ -263,10 +263,10 @@ define i1* @hoo1() nounwind {
; OPT: ret i64 ptrtoint (double* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64)
; OPT: }
; OPT: define i64 @fc() nounwind {
-; OPT: ret i64 mul nuw (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 2)
+; OPT: ret i64 mul (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 2)
; OPT: }
; OPT: define i64 @fd() nounwind {
-; OPT: ret i64 mul nuw (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 11)
+; OPT: ret i64 mul (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 11)
; OPT: }
; OPT: define i64 @fe() nounwind {
; OPT: ret i64 ptrtoint (double* getelementptr ({ double, float, double, double }* null, i64 0, i32 2) to i64)
@@ -433,7 +433,7 @@ define i64* @fO() nounwind {
; PLAIN: ret i32* %t
; PLAIN: }
; OPT: define i32* @fZ() nounwind {
-; OPT: ret i32* getelementptr inbounds (i32* getelementptr inbounds ([3 x { i32, i32 }]* @ext, i64 0, i64 1, i32 0), i64 1)
+; OPT: ret i32* getelementptr (i32* getelementptr inbounds ([3 x { i32, i32 }]* @ext, i64 0, i64 1, i32 0), i64 1)
; OPT: }
; TO: define i32* @fZ() nounwind {
; TO: ret i32* getelementptr inbounds ([3 x { i32, i32 }]* @ext, i64 0, i64 1, i32 1)
diff --git a/test/Other/invalid-commandline-option.ll b/test/Other/invalid-commandline-option.ll
index 60840fa..583d449 100644
--- a/test/Other/invalid-commandline-option.ll
+++ b/test/Other/invalid-commandline-option.ll
@@ -1,3 +1,3 @@
-; RUN: not opt --foo |& grep {Unknown command line argument}
+; RUN: not opt --foo 2>&1 | grep "Unknown command line argument"
; there is no --foo
diff --git a/test/Other/lint.ll b/test/Other/lint.ll
index ca2b1a3..c84f56f 100644
--- a/test/Other/lint.ll
+++ b/test/Other/lint.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -lint -disable-output < %s |& FileCheck %s
+; RUN: opt -basicaa -lint -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64"
declare fastcc void @bar()
diff --git a/test/Other/optimize-options.ll b/test/Other/optimize-options.ll
new file mode 100644
index 0000000..888a78f
--- /dev/null
+++ b/test/Other/optimize-options.ll
@@ -0,0 +1,8 @@
+;RUN: opt -S -O1 -debug-pass=Arguments 2>&1 | FileCheck %s
+;RUN: opt -S -O2 -debug-pass=Arguments 2>&1 | FileCheck %s
+;RUN: opt -S -Os -debug-pass=Arguments 2>&1 | FileCheck %s
+;RUN: opt -S -Oz -debug-pass=Arguments 2>&1 | FileCheck %s
+;RUN: opt -S -O3 -debug-pass=Arguments 2>&1 | FileCheck %s
+
+; Just check that we get a non-empty set of passes for each -O option.
+;CHECK: Pass Arguments: {{.*}} -print-module
diff --git a/test/Scripts/elf-dump b/test/Scripts/elf-dump
index 58ca177..69cdacd 100755
--- a/test/Scripts/elf-dump
+++ b/test/Scripts/elf-dump
@@ -15,6 +15,7 @@ class Reader:
self.file = open(path, "rb")
self.isLSB = None
self.is64Bit = None
+ self.isN64 = False
def seek(self, pos):
self.file.seek(pos)
@@ -122,15 +123,28 @@ def dumpRel(f, section, dumprela = False):
f.seek(section.sh_offset[0] + index * section.sh_entsize[0])
print " # Relocation %s" % index
print " (('r_offset', %s)" % common_dump.HexDump(f.readWord())
- r_info = f.readWord()[0]
- if f.is64Bit:
- r_sym = (r_info >> 32, 32)
- r_type = (r_info & 0xffffffff, 32)
+
+ if f.isN64:
+ r_sym = f.read32()
+ r_ssym = f.read8()
+ r_type3 = f.read8()
+ r_type2 = f.read8()
+ r_type = f.read8()
+ print " ('r_sym', %s)" % common_dump.HexDump(r_sym)
+ print " ('r_ssym', %s)" % common_dump.HexDump(r_ssym)
+ print " ('r_type3', %s)" % common_dump.HexDump(r_type3)
+ print " ('r_type2', %s)" % common_dump.HexDump(r_type2)
+ print " ('r_type', %s)" % common_dump.HexDump(r_type)
else:
- r_sym = (r_info >> 8, 24)
- r_type = (r_info & 0xff, 8)
- print " ('r_sym', %s)" % common_dump.HexDump(r_sym)
- print " ('r_type', %s)" % common_dump.HexDump(r_type)
+ r_info = f.readWord()[0]
+ if f.is64Bit:
+ r_sym = (r_info >> 32, 32)
+ r_type = (r_info & 0xffffffff, 32)
+ else:
+ r_sym = (r_info >> 8, 24)
+ r_type = (r_info & 0xff, 8)
+ print " ('r_sym', %s)" % common_dump.HexDump(r_sym)
+ print " ('r_type', %s)" % common_dump.HexDump(r_type)
if dumprela:
print " ('r_addend', %s)" % common_dump.HexDump(f.readWord())
print " ),"
@@ -166,7 +180,13 @@ def dumpELF(path, opts):
f.seek(16) # Seek to end of e_ident.
print "('e_type', %s)" % common_dump.HexDump(f.read16())
- print "('e_machine', %s)" % common_dump.HexDump(f.read16())
+
+ # Does any other architecture use N64?
+ e_machine = f.read16()
+ if e_machine[0] == 0x0008 and f.is64Bit: # EM_MIPS && 64 bit
+ f.isN64 = True
+
+ print "('e_machine', %s)" % common_dump.HexDump(e_machine)
print "('e_version', %s)" % common_dump.HexDump(f.read32())
print "('e_entry', %s)" % common_dump.HexDump(f.readWord())
print "('e_phoff', %s)" % common_dump.HexDump(f.readWord())
diff --git a/test/TableGen/DefmInherit.td b/test/TableGen/DefmInherit.td
index 47fd81d..46d3f62 100644
--- a/test/TableGen/DefmInherit.td
+++ b/test/TableGen/DefmInherit.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s | grep {zing = 4} | count 4
+// RUN: llvm-tblgen %s | grep "zing = 4" | count 4
// XFAIL: vg_leak
class C1<int A, string B> {
diff --git a/test/TableGen/ForeachLoop.td b/test/TableGen/ForeachLoop.td
index e2defe9..4aacc74 100644
--- a/test/TableGen/ForeachLoop.td
+++ b/test/TableGen/ForeachLoop.td
@@ -6,10 +6,19 @@ class Register<string name, int idx> {
int Index = idx;
}
+// CHECK-NOT: !strconcat
+
+foreach i = 0-3 in
+ def Q#i : Register<"Q"#i, i>;
+
+// CHECK: def Q0
+// CHECK: def Q1
+// CHECK: def Q2
+// CHECK: def Q3
+
foreach i = [0, 1, 2, 3, 4, 5, 6, 7] in
def R#i : Register<"R"#i, i>;
-
// CHECK: def R0
// CHECK: string Name = "R0";
// CHECK: int Index = 0;
@@ -41,3 +50,14 @@ foreach i = [0, 1, 2, 3, 4, 5, 6, 7] in
// CHECK: def R7
// CHECK: string Name = "R7";
// CHECK: int Index = 7;
+
+foreach i = {0-3,9-7} in
+ def S#i : Register<"Q"#i, i>;
+
+// CHECK: def S0
+// CHECK: def S1
+// CHECK: def S2
+// CHECK: def S3
+// CHECK: def S7
+// CHECK: def S8
+// CHECK: def S9
diff --git a/test/TableGen/LazyChange.td b/test/TableGen/LazyChange.td
index 8145a3f..306959e 100644
--- a/test/TableGen/LazyChange.td
+++ b/test/TableGen/LazyChange.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s | grep {int Y = 3}
+// RUN: llvm-tblgen %s | grep "int Y = 3"
// XFAIL: vg_leak
class C {
diff --git a/test/TableGen/ListOfList.td b/test/TableGen/ListOfList.td
index 565a99c..864401e 100644
--- a/test/TableGen/ListOfList.td
+++ b/test/TableGen/ListOfList.td
@@ -1,6 +1,6 @@
// RUN llvm-tblgen %s | FileCheck %s
-// RUN: llvm-tblgen %s | grep {foo} | count 1
+// RUN: llvm-tblgen %s | grep "foo" | count 1
// XFAIL: vg_leak
class Base<string t> {
diff --git a/test/TableGen/MultiClass.td b/test/TableGen/MultiClass.td
index 04f3a56..449c5d6 100644
--- a/test/TableGen/MultiClass.td
+++ b/test/TableGen/MultiClass.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s | grep {zing = 4} | count 2
+// RUN: llvm-tblgen %s | grep "zing = 4" | count 2
// XFAIL: vg_leak
class C1<int A, string B> {
diff --git a/test/TableGen/MultiClassInherit.td b/test/TableGen/MultiClassInherit.td
index 8b78bc7..c768fff 100644
--- a/test/TableGen/MultiClassInherit.td
+++ b/test/TableGen/MultiClassInherit.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s | grep {zing = 4} | count 28
+// RUN: llvm-tblgen %s | grep "zing = 4" | count 28
// XFAIL: vg_leak
class C1<int A, string B> {
diff --git a/test/TableGen/SetTheory.td b/test/TableGen/SetTheory.td
index 4d85aa3..7613323 100644
--- a/test/TableGen/SetTheory.td
+++ b/test/TableGen/SetTheory.td
@@ -161,10 +161,12 @@ def S9a : Set<(sequence "e%u", 3, 7)>;
def S9b : Set<(sequence "e%u", 7, 3)>;
def S9c : Set<(sequence "e%u", 0, 0)>;
def S9d : Set<(sequence "S%ua", 7, 9)>;
+def S9e : Set<(sequence "e%u", 3, 6, 2)>;
// CHECK: S9a = [ e3 e4 e5 e6 e7 ]
// CHECK: S9b = [ e7 e6 e5 e4 e3 ]
// CHECK: S9c = [ e0 ]
// CHECK: S9d = [ a b c d e0 e3 e6 e9 e4 e5 e7 ]
+// CHECK: S9e = [ e3 e5 ]
// The 'interleave' operator is almost the inverse of 'decimate'.
def interleave;
diff --git a/test/TableGen/Slice.td b/test/TableGen/Slice.td
index 2d2822c..6d051d7 100644
--- a/test/TableGen/Slice.td
+++ b/test/TableGen/Slice.td
@@ -1,5 +1,5 @@
-// RUN: llvm-tblgen %s | grep {\\\[(set} | count 2
-// RUN: llvm-tblgen %s | grep {\\\[\\\]} | count 2
+// RUN: llvm-tblgen %s | grep "\[(set" | count 2
+// RUN: llvm-tblgen %s | grep "\[\]" | count 2
// XFAIL: vg_leak
class ValueType<int size, int value> {
diff --git a/test/TableGen/TargetInstrSpec.td b/test/TableGen/TargetInstrSpec.td
index 7b611e7..64b706d 100644
--- a/test/TableGen/TargetInstrSpec.td
+++ b/test/TableGen/TargetInstrSpec.td
@@ -1,5 +1,5 @@
-// RUN: llvm-tblgen %s | grep {\\\[(set VR128:\$dst, (int_x86_sse2_add_pd VR128:\$src1, VR128:\$src2))\\\]} | count 1
-// RUN: llvm-tblgen %s | grep {\\\[(set VR128:\$dst, (int_x86_sse2_add_ps VR128:\$src1, VR128:\$src2))\\\]} | count 1
+// RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_pd VR128:$src1, VR128:$src2))\]' | count 1
+// RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_ps VR128:$src1, VR128:$src2))\]' | count 1
// XFAIL: vg_leak
class ValueType<int size, int value> {
diff --git a/test/TableGen/cast.td b/test/TableGen/cast.td
index 8a23eb4..7948aff 100644
--- a/test/TableGen/cast.td
+++ b/test/TableGen/cast.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s | grep {add_ps} | count 3
+// RUN: llvm-tblgen %s | grep "add_ps" | count 3
// XFAIL: vg_leak
class ValueType<int size, int value> {
diff --git a/test/TableGen/foreach.td b/test/TableGen/foreach.td
index 814ae6e..902af25 100644
--- a/test/TableGen/foreach.td
+++ b/test/TableGen/foreach.td
@@ -1,6 +1,6 @@
-// RUN: llvm-tblgen %s | grep {Jr} | count 2
-// RUN: llvm-tblgen %s | grep {Sr} | count 2
-// RUN: llvm-tblgen %s | grep {"NAME"} | count 1
+// RUN: llvm-tblgen %s | grep 'Jr' | count 2
+// RUN: llvm-tblgen %s | grep 'Sr' | count 2
+// RUN: llvm-tblgen %s | grep '"NAME"' | count 1
// XFAIL: vg_leak
// Variables for foreach
diff --git a/test/TableGen/lisp.td b/test/TableGen/lisp.td
index 025aca9..dd85ddc 100644
--- a/test/TableGen/lisp.td
+++ b/test/TableGen/lisp.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s | grep {}
+// RUN: llvm-tblgen %s | grep ""
// XFAIL: vg_leak
class List<list<string> n> {
diff --git a/test/TableGen/subst.td b/test/TableGen/subst.td
index 5a73ec4..850ac38 100644
--- a/test/TableGen/subst.td
+++ b/test/TableGen/subst.td
@@ -1,9 +1,9 @@
-// RUN: llvm-tblgen %s | grep {Smith} | count 7
-// RUN: llvm-tblgen %s | grep {Johnson} | count 2
-// RUN: llvm-tblgen %s | grep {FIRST} | count 1
-// RUN: llvm-tblgen %s | grep {LAST} | count 1
-// RUN: llvm-tblgen %s | grep {TVAR} | count 2
-// RUN: llvm-tblgen %s | grep {Bogus} | count 1
+// RUN: llvm-tblgen %s | grep "Smith" | count 7
+// RUN: llvm-tblgen %s | grep "Johnson" | count 2
+// RUN: llvm-tblgen %s | grep "FIRST" | count 1
+// RUN: llvm-tblgen %s | grep "LAST" | count 1
+// RUN: llvm-tblgen %s | grep "TVAR" | count 2
+// RUN: llvm-tblgen %s | grep "Bogus" | count 1
// XFAIL: vg_leak
class Honorific<string t> {
diff --git a/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll b/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
index d7d5eb5..210eb97 100644
--- a/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
+++ b/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -argpromotion -S > %t
-; RUN: cat %t | grep {define.*@callee(.*i32\\*}
+; RUN: cat %t | grep "define.*@callee(.*i32\*"
; PR2498
; This test tries to convince argpromotion about promoting the load from %A + 2,
diff --git a/test/Transforms/ArgumentPromotion/byval-2.ll b/test/Transforms/ArgumentPromotion/byval-2.ll
index bd62c68..368c689 100644
--- a/test/Transforms/ArgumentPromotion/byval-2.ll
+++ b/test/Transforms/ArgumentPromotion/byval-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -argpromotion -S | grep -F {i32* byval} | count 2
+; RUN: opt < %s -argpromotion -S | grep -F "i32* byval" | count 2
; Argpromote + scalarrepl should change this to passing the two integers by value.
%struct.ss = type { i32, i64 }
diff --git a/test/Transforms/ArgumentPromotion/control-flow.ll b/test/Transforms/ArgumentPromotion/control-flow.ll
index 08ca6bc..e4a61da 100644
--- a/test/Transforms/ArgumentPromotion/control-flow.ll
+++ b/test/Transforms/ArgumentPromotion/control-flow.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -argpromotion -S | \
-; RUN: not grep {load i32\* null}
+; RUN: not grep "load i32* null"
define internal i32 @callee(i1 %C, i32* %P) {
br i1 %C, label %T, label %F
diff --git a/test/Transforms/ArgumentPromotion/control-flow2.ll b/test/Transforms/ArgumentPromotion/control-flow2.ll
index 9a8afc3..2543218 100644
--- a/test/Transforms/ArgumentPromotion/control-flow2.ll
+++ b/test/Transforms/ArgumentPromotion/control-flow2.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -argpromotion -S | \
-; RUN: grep {load i32\\* %A}
+; RUN: grep "load i32\* %A"
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
define internal i32 @callee(i1 %C, i32* %P) {
diff --git a/test/Transforms/BBVectorize/metadata.ll b/test/Transforms/BBVectorize/metadata.ll
new file mode 100644
index 0000000..1e3aaa1
--- /dev/null
+++ b/test/Transforms/BBVectorize/metadata.ll
@@ -0,0 +1,49 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -S | FileCheck %s
+
+; Simple 3-pair chain with loads and stores (with fpmath)
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1, !fpmath !2
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4, !fpmath !3
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+; CHECK: @test1
+; CHECK: !fpmath
+; CHECK: ret void
+}
+
+; Simple 3-pair chain with loads and stores (ints with range)
+define void @test2(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly {
+entry:
+ %i0 = load i64* %a, align 8, !range !0
+ %i1 = load i64* %b, align 8
+ %mul = mul i64 %i0, %i1
+ %arrayidx3 = getelementptr inbounds i64* %a, i64 1
+ %i3 = load i64* %arrayidx3, align 8, !range !1
+ %arrayidx4 = getelementptr inbounds i64* %b, i64 1
+ %i4 = load i64* %arrayidx4, align 8
+ %mul5 = mul i64 %i3, %i4
+ store i64 %mul, i64* %c, align 8
+ %arrayidx5 = getelementptr inbounds i64* %c, i64 1
+ store i64 %mul5, i64* %arrayidx5, align 8
+ ret void
+; CHECK: @test2
+; CHECK-NOT: !range
+; CHECK: ret void
+}
+
+!0 = metadata !{i64 0, i64 2}
+!1 = metadata !{i64 3, i64 5}
+
+!2 = metadata !{ float 5.0 }
+!3 = metadata !{ float 2.5 }
+
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll
index 4daa571..325792a 100644
--- a/test/Transforms/BBVectorize/simple-sel.ll
+++ b/test/Transforms/BBVectorize/simple-sel.ll
@@ -1,5 +1,6 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-no-bools -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-NB
; Basic depth-3 chain with select
define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
@@ -27,4 +28,32 @@ define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1
; CHECK: ret double %R
}
+; Basic depth-3 chain with select (and vect. compare)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test2
+; CHECK-NB: @test2
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+ %C1 = fcmp ogt double %X1, %A1
+ %C2 = fcmp ogt double %X2, %A2
+; CHECK: %C1 = fcmp ogt <2 x double> %X1, %X1.v.i0.2
+; CHECK-NB: fcmp ogt double
+ %Z1 = select i1 %C1, double %Y1, double %B1
+ %Z2 = select i1 %C2, double %Y2, double %B2
+; CHECK: %Z1 = select <2 x i1> %C1, <2 x double> %Y1, <2 x double> %X1.v.i1.2
+ %R = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+ ret double %R
+; CHECK: ret double %R
+}
diff --git a/test/Transforms/BBVectorize/simple-tst.ll b/test/Transforms/BBVectorize/simple-tst.ll
new file mode 100644
index 0000000..42146c6
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple-tst.ll
@@ -0,0 +1,18 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=256 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain (target-specific type should not vectorize)
+define ppc_fp128 @test7(ppc_fp128 %A1, ppc_fp128 %A2, ppc_fp128 %B1, ppc_fp128 %B2) {
+; CHECK: @test7
+; CHECK-NOT: <2 x ppc_fp128>
+ %X1 = fsub ppc_fp128 %A1, %B1
+ %X2 = fsub ppc_fp128 %A2, %B2
+ %Y1 = fmul ppc_fp128 %X1, %A1
+ %Y2 = fmul ppc_fp128 %X2, %A2
+ %Z1 = fadd ppc_fp128 %Y1, %B1
+ %Z2 = fadd ppc_fp128 %Y2, %B2
+ %R = fmul ppc_fp128 %Z1, %Z2
+ ret ppc_fp128 %R
+}
+
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
index 904d766..88eb9c9 100644
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -138,8 +138,7 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1
%Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
%Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
-; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15>
-; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %Q1.v.i1 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
%R = mul <8 x i8> %Q1, %Q2
; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/Transforms/BBVectorize/simple3.ll b/test/Transforms/BBVectorize/simple3.ll
new file mode 100644
index 0000000..153be73
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple3.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=192 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain
+define double @test1(double %A1, double %A2, double %A3, double %B1, double %B2, double %B3) {
+; CHECK: @test1
+; CHECK: %X1.v.i1.11 = insertelement <3 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.22 = insertelement <3 x double> %X1.v.i1.11, double %B2, i32 1
+; CHECK: %X1.v.i1 = insertelement <3 x double> %X1.v.i1.22, double %B3, i32 2
+; CHECK: %X1.v.i0.13 = insertelement <3 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.24 = insertelement <3 x double> %X1.v.i0.13, double %A2, i32 1
+; CHECK: %X1.v.i0 = insertelement <3 x double> %X1.v.i0.24, double %A3, i32 2
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %X3 = fsub double %A3, %B3
+; CHECK: %X1 = fsub <3 x double> %X1.v.i0, %X1.v.i1
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Y3 = fmul double %X3, %A3
+; CHECK: %Y1 = fmul <3 x double> %X1, %X1.v.i0
+ %Z1 = fadd double %Y1, %B1
+ %Z2 = fadd double %Y2, %B2
+ %Z3 = fadd double %Y3, %B3
+; CHECK: %Z1 = fadd <3 x double> %Y1, %X1.v.i1
+ %R1 = fmul double %Z1, %Z2
+ %R = fmul double %R1, %Z3
+; CHECK: %Z1.v.r210 = extractelement <3 x double> %Z1, i32 2
+; CHECK: %Z1.v.r1 = extractelement <3 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <3 x double> %Z1, i32 1
+; CHECK: %R1 = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: %R = fmul double %R1, %Z1.v.r210
+ ret double %R
+; CHECK: ret double %R
+}
+
diff --git a/test/Transforms/CodeGenPrepare/basic.ll b/test/Transforms/CodeGenPrepare/basic.ll
index ebf10f0..c68e77e 100644
--- a/test/Transforms/CodeGenPrepare/basic.ll
+++ b/test/Transforms/CodeGenPrepare/basic.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-darwin10.0.0"
; CHECK: @test1
; objectsize should fold to a constant, which causes the branch to fold to an
-; uncond branch.
+; uncond branch. Next, we fold the control flow altogether.
; rdar://8785296
define i32 @test1(i8* %ptr) nounwind ssp noredzone align 2 {
entry:
@@ -13,8 +13,8 @@ entry:
%1 = icmp ugt i64 %0, 3
br i1 %1, label %T, label %trap
-; CHECK: entry:
-; CHECK-NEXT: br label %T
+; CHECK: T:
+; CHECK-NOT: br label %
trap: ; preds = %0, %entry
tail call void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/ConstProp/2002-05-03-NotOperator.ll b/test/Transforms/ConstProp/2002-05-03-NotOperator.ll
index b957220..ca1d618 100644
--- a/test/Transforms/ConstProp/2002-05-03-NotOperator.ll
+++ b/test/Transforms/ConstProp/2002-05-03-NotOperator.ll
@@ -5,7 +5,7 @@
; Fix #2: The unary not instruction now no longer exists. Change to xor.
; RUN: opt < %s -constprop -S | \
-; RUN: not grep {i32 0}
+; RUN: not grep "i32 0"
define i32 @test1() {
%R = xor i32 123, -1 ; <i32> [#uses=1]
diff --git a/test/Transforms/ConstProp/2005-01-28-SetCCGEP.ll b/test/Transforms/ConstProp/2005-01-28-SetCCGEP.ll
index 0b44b99..d68cb26 100644
--- a/test/Transforms/ConstProp/2005-01-28-SetCCGEP.ll
+++ b/test/Transforms/ConstProp/2005-01-28-SetCCGEP.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -constprop -S | \
-; RUN: not grep {ret i1 false}
+; RUN: not grep "ret i1 false"
@b = external global [2 x { }] ; <[2 x { }]*> [#uses=2]
diff --git a/test/Transforms/ConstProp/2006-11-30-vector-cast.ll b/test/Transforms/ConstProp/2006-11-30-vector-cast.ll
index be76783..4a93144 100644
--- a/test/Transforms/ConstProp/2006-11-30-vector-cast.ll
+++ b/test/Transforms/ConstProp/2006-11-30-vector-cast.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -constprop -S | \
-; RUN: grep {i32 -1}
+; RUN: grep "i32 -1"
; RUN: opt < %s -constprop -S | \
; RUN: not grep zeroinitializer
diff --git a/test/Transforms/ConstProp/2006-12-01-TruncBoolBug.ll b/test/Transforms/ConstProp/2006-12-01-TruncBoolBug.ll
index e46a875..ce66c70 100644
--- a/test/Transforms/ConstProp/2006-12-01-TruncBoolBug.ll
+++ b/test/Transforms/ConstProp/2006-12-01-TruncBoolBug.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {ret i1 false}
+; RUN: grep "ret i1 false"
define i1 @test() {
%X = trunc i32 320 to i1 ; <i1> [#uses=1]
ret i1 %X
diff --git a/test/Transforms/ConstProp/2006-12-01-bool-casts.ll b/test/Transforms/ConstProp/2006-12-01-bool-casts.ll
index 3c06693..71db421 100644
--- a/test/Transforms/ConstProp/2006-12-01-bool-casts.ll
+++ b/test/Transforms/ConstProp/2006-12-01-bool-casts.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -constprop -S | \
-; RUN: grep {ret i32 -1}
+; RUN: grep "ret i32 -1"
; RUN: opt < %s -constprop -S | \
-; RUN: grep {ret i32 1}
+; RUN: grep "ret i32 1"
define i32 @test1() {
%A = sext i1 true to i32 ; <i32> [#uses=1]
diff --git a/test/Transforms/ConstProp/2007-02-23-sdiv.ll b/test/Transforms/ConstProp/2007-02-23-sdiv.ll
index 721199f..75f58b5 100644
--- a/test/Transforms/ConstProp/2007-02-23-sdiv.ll
+++ b/test/Transforms/ConstProp/2007-02-23-sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis | grep {global i32 0}
+; RUN: llvm-as < %s | llvm-dis | grep "global i32 0"
; PR1215
@G = global i32 sdiv (i32 0, i32 -1)
diff --git a/test/Transforms/ConstProp/2007-11-23-cttz.ll b/test/Transforms/ConstProp/2007-11-23-cttz.ll
index a28c9b0..6d34cb1 100644
--- a/test/Transforms/ConstProp/2007-11-23-cttz.ll
+++ b/test/Transforms/ConstProp/2007-11-23-cttz.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -constprop -S | grep {ret i13 13}
+; RUN: opt < %s -constprop -S | grep "ret i13 13"
; PR1816
declare i13 @llvm.cttz.i13(i13, i1)
diff --git a/test/Transforms/ConstProp/div-zero.ll b/test/Transforms/ConstProp/div-zero.ll
index f78a34f..a2c59d3 100644
--- a/test/Transforms/ConstProp/div-zero.ll
+++ b/test/Transforms/ConstProp/div-zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i32 0}
+; RUN: opt < %s -instcombine -S | grep "ret i32 0"
; PR4424
declare void @ext()
diff --git a/test/Transforms/CorrelatedValuePropagation/range.ll b/test/Transforms/CorrelatedValuePropagation/range.ll
index 9b70ed2..6750546 100644
--- a/test/Transforms/CorrelatedValuePropagation/range.ll
+++ b/test/Transforms/CorrelatedValuePropagation/range.ll
@@ -41,3 +41,127 @@ end:
; CHECK: then:
; CHECK-NEXT: br i1 false, label %end, label %else
}
+
+; CHECK: @test3
+define i32 @test3(i32 %c) nounwind {
+ %cmp = icmp slt i32 %c, 2
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ ret i32 1
+
+if.end:
+ %cmp1 = icmp slt i32 %c, 3
+ br i1 %cmp1, label %if.then2, label %if.end8
+
+; CHECK: if.then2
+if.then2:
+ %cmp2 = icmp eq i32 %c, 2
+; CHECK: br i1 true
+ br i1 %cmp2, label %if.then4, label %if.end6
+
+; CHECK: if.end6
+if.end6:
+ ret i32 2
+
+if.then4:
+ ret i32 3
+
+if.end8:
+ ret i32 4
+}
+
+; CHECK: @test4
+define i32 @test4(i32 %c) nounwind {
+ switch i32 %c, label %sw.default [
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb
+ i32 4, label %sw.bb
+ ]
+
+; CHECK: sw.bb
+sw.bb:
+ %cmp = icmp sge i32 %c, 1
+; CHECK: br i1 true
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ br label %return
+
+if.end:
+ br label %return
+
+sw.default:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 42, %sw.default ], [ 4, %if.then ], [ 9, %if.end ]
+ ret i32 %retval.0
+}
+
+; CHECK: @test5
+define i1 @test5(i32 %c) nounwind {
+ %cmp = icmp slt i32 %c, 5
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %cmp1 = icmp eq i32 %c, 4
+ br i1 %cmp1, label %if.end, label %if.end8
+
+if.end:
+ ret i1 true
+
+if.end8:
+ %cmp2 = icmp eq i32 %c, 3
+ %cmp3 = icmp eq i32 %c, 4
+ %cmp4 = icmp eq i32 %c, 6
+; CHECK: %or = or i1 false, false
+ %or = or i1 %cmp3, %cmp4
+; CHECK: ret i1 %cmp2
+ ret i1 %cmp2
+}
+
+; CHECK: @test6
+define i1 @test6(i32 %c) nounwind {
+ %cmp = icmp ule i32 %c, 7
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+; CHECK: icmp eq i32 %c, 6
+; CHECK: br i1
+ switch i32 %c, label %if.end [
+ i32 6, label %sw.bb
+ i32 8, label %sw.bb
+ ]
+
+if.end:
+ ret i1 true
+
+sw.bb:
+ %cmp2 = icmp eq i32 %c, 6
+; CHECK: ret i1 true
+ ret i1 %cmp2
+}
+
+; CHECK: @test7
+define i1 @test7(i32 %c) nounwind {
+entry:
+ switch i32 %c, label %sw.default [
+ i32 6, label %sw.bb
+ i32 7, label %sw.bb
+ ]
+
+sw.bb:
+ ret i1 true
+
+sw.default:
+ %cmp5 = icmp eq i32 %c, 5
+ %cmp6 = icmp eq i32 %c, 6
+ %cmp7 = icmp eq i32 %c, 7
+ %cmp8 = icmp eq i32 %c, 8
+; CHECK: %or = or i1 %cmp5, false
+ %or = or i1 %cmp5, %cmp6
+; CHECK: %or2 = or i1 false, %cmp8
+ %or2 = or i1 %cmp7, %cmp8
+ ret i1 false
+}
diff --git a/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll b/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll
index d5bd6c4..e5419f7 100644
--- a/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll
+++ b/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -deadargelim -S | grep {@test(}
+; RUN: opt < %s -deadargelim -S | grep "@test("
; RUN: opt < %s -deadargelim -S | not grep dead
define internal i32 @test(i32 %X, i32 %dead) {
diff --git a/test/Transforms/DeadArgElim/2007-10-18-VarargsReturn.ll b/test/Transforms/DeadArgElim/2007-10-18-VarargsReturn.ll
index d4edce9..cdd893f 100644
--- a/test/Transforms/DeadArgElim/2007-10-18-VarargsReturn.ll
+++ b/test/Transforms/DeadArgElim/2007-10-18-VarargsReturn.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -deadargelim -S | not grep {ret i32 0}
+; RUN: opt < %s -deadargelim -S | not grep "ret i32 0"
; PR1735
define internal i32 @test(i32 %A, ...) {
diff --git a/test/Transforms/DeadArgElim/canon.ll b/test/Transforms/DeadArgElim/canon.ll
index 11cd482..79c15a0 100644
--- a/test/Transforms/DeadArgElim/canon.ll
+++ b/test/Transforms/DeadArgElim/canon.ll
@@ -1,9 +1,9 @@
; This test shows a few canonicalizations made by deadargelim
; RUN: opt < %s -deadargelim -S > %t
; This test should remove {} and replace it with void
-; RUN: cat %t | grep {define internal void @test}
+; RUN: cat %t | grep "define internal void @test"
; This test should replace the {i32} return value with just i32
-; RUN: cat %t | grep {define internal i32 @test2}
+; RUN: cat %t | grep "define internal i32 @test2"
define internal {} @test() {
ret {} undef
diff --git a/test/Transforms/DeadArgElim/keepalive.ll b/test/Transforms/DeadArgElim/keepalive.ll
index 4d6aae3..dc92dc9f 100644
--- a/test/Transforms/DeadArgElim/keepalive.ll
+++ b/test/Transforms/DeadArgElim/keepalive.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -deadargelim -S > %t
-; RUN: grep {define internal zeroext i32 @test1() nounwind} %t
-; RUN: grep {define internal <{ i32, i32 }> @test2} %t
+; RUN: grep "define internal zeroext i32 @test1() nounwind" %t
+; RUN: grep "define internal <{ i32, i32 }> @test2" %t
%Ty = type <{ i32, i32 }>
diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll
index 81eb5a8..7a8cdd5 100644
--- a/test/Transforms/DeadStoreElimination/simple.ll
+++ b/test/Transforms/DeadStoreElimination/simple.ll
@@ -164,7 +164,7 @@ define i32* @test13() {
}
declare noalias i8* @malloc(i32)
-
+declare noalias i8* @calloc(i32, i32)
define void @test14(i32* %Q) {
@@ -258,3 +258,55 @@ define void @test20() {
}
; CHECK: @test20
; CHECK-NEXT: ret void
+
+; CHECK: @test21
+define void @test21() {
+ %m = call i8* @calloc(i32 9, i32 7)
+ store i8 0, i8* %m
+; CHECK-NEXT: ret void
+ ret void
+}
+
+; CHECK: @test22(
+define void @test22(i1 %i, i32 %k, i32 %m) nounwind {
+ %k.addr = alloca i32
+ %m.addr = alloca i32
+ %k.addr.m.addr = select i1 %i, i32* %k.addr, i32* %m.addr
+ store i32 0, i32* %k.addr.m.addr, align 4
+; CHECK-NEXT: ret void
+ ret void
+}
+
+; PR13547
+; CHECK: @test23
+; CHECK: store i8 97
+; CHECK: store i8 0
+declare noalias i8* @strdup(i8* nocapture) nounwind
+define noalias i8* @test23() nounwind uwtable ssp {
+ %x = alloca [2 x i8], align 1
+ %arrayidx = getelementptr inbounds [2 x i8]* %x, i64 0, i64 0
+ store i8 97, i8* %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds [2 x i8]* %x, i64 0, i64 1
+ store i8 0, i8* %arrayidx1, align 1
+ %call = call i8* @strdup(i8* %arrayidx) nounwind
+ ret i8* %call
+}
+
+; Make sure same sized store to later element is deleted
+; CHECK: @test24
+; CHECK-NOT: store i32 0
+; CHECK-NOT: store i32 0
+; CHECK: store i32 %b
+; CHECK: store i32 %c
+; CHECK: ret void
+define void @test24([2 x i32]* %a, i32 %b, i32 %c) nounwind {
+ %1 = getelementptr inbounds [2 x i32]* %a, i64 0, i64 0
+ store i32 0, i32* %1, align 4
+ %2 = getelementptr inbounds [2 x i32]* %a, i64 0, i64 1
+ store i32 0, i32* %2, align 4
+ %3 = getelementptr inbounds [2 x i32]* %a, i64 0, i64 0
+ store i32 %b, i32* %3, align 4
+ %4 = getelementptr inbounds [2 x i32]* %a, i64 0, i64 1
+ store i32 %c, i32* %4, align 4
+ ret void
+}
diff --git a/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll b/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll
index 7ef5f06..f38c03a 100644
--- a/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll
+++ b/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -functionattrs -S | not grep {nocapture *%%q}
-; RUN: opt < %s -functionattrs -S | grep {nocapture *%%p}
+; RUN: opt < %s -functionattrs -S | not grep "nocapture *%%q"
+; RUN: opt < %s -functionattrs -S | grep "nocapture *%%p"
define i32* @a(i32** %p) {
%tmp = load i32** %p
diff --git a/test/Transforms/GVN/2007-07-25-InfiniteLoop.ll b/test/Transforms/GVN/2007-07-25-InfiniteLoop.ll
index 9983374..7e9c982 100644
--- a/test/Transforms/GVN/2007-07-25-InfiniteLoop.ll
+++ b/test/Transforms/GVN/2007-07-25-InfiniteLoop.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -S | not grep {tmp10 =}
+; RUN: opt < %s -basicaa -gvn -S | not grep "tmp10 ="
%struct.INT2 = type { i32, i32 }
@blkshifts = external global %struct.INT2* ; <%struct.INT2**> [#uses=2]
diff --git a/test/Transforms/GVN/2007-07-31-NoDomInherit.ll b/test/Transforms/GVN/2007-07-31-NoDomInherit.ll
index f2c0012..5018a07 100644
--- a/test/Transforms/GVN/2007-07-31-NoDomInherit.ll
+++ b/test/Transforms/GVN/2007-07-31-NoDomInherit.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -S | grep {tmp47 = phi i32 }
+; RUN: opt < %s -basicaa -gvn -S | grep "tmp47 = phi i32 "
%struct.anon = type { i32 (i32, i32, i32)*, i32, i32, [3 x i32], i8*, i8*, i8* }
@debug = external constant i32 ; <i32*> [#uses=0]
diff --git a/test/Transforms/GVN/2007-07-31-RedundantPhi.ll b/test/Transforms/GVN/2007-07-31-RedundantPhi.ll
index a570e35..13419d1 100644
--- a/test/Transforms/GVN/2007-07-31-RedundantPhi.ll
+++ b/test/Transforms/GVN/2007-07-31-RedundantPhi.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -S | not grep {tmp701 =}
+; RUN: opt < %s -basicaa -gvn -S | not grep "tmp701 ="
@img_width = external global i16 ; <i16*> [#uses=2]
diff --git a/test/Transforms/GVN/2008-07-02-Unreachable.ll b/test/Transforms/GVN/2008-07-02-Unreachable.ll
index 407940b..4f07868 100644
--- a/test/Transforms/GVN/2008-07-02-Unreachable.ll
+++ b/test/Transforms/GVN/2008-07-02-Unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -S | grep {ret i8 \[%\]tmp3}
+; RUN: opt < %s -basicaa -gvn -S | grep "ret i8 [%]tmp3"
; PR2503
@g_3 = external global i8 ; <i8*> [#uses=2]
diff --git a/test/Transforms/GVN/2012-05-22-PreCrash.ll b/test/Transforms/GVN/2012-05-22-PreCrash.ll
new file mode 100644
index 0000000..b488dda
--- /dev/null
+++ b/test/Transforms/GVN/2012-05-22-PreCrash.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s -gvn
+; PR12858
+
+define void @fn5(i16 signext %p1, i8 signext %p2) nounwind uwtable {
+entry:
+ br i1 undef, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ br label %if.end
+
+if.else: ; preds = %entry
+ %conv = sext i16 %p1 to i32
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %conv1 = sext i16 %p1 to i32
+ br i1 undef, label %if.then3, label %if.else4
+
+if.then3: ; preds = %if.end
+ br label %if.end12
+
+if.else4: ; preds = %if.end
+ %conv7 = sext i8 %p2 to i32
+ %cmp8 = icmp eq i32 %conv1, %conv7
+ br i1 %cmp8, label %if.then10, label %if.end12
+
+if.then10: ; preds = %if.else4
+ br label %if.end12
+
+if.end12: ; preds = %if.then10, %if.else4, %if.then3
+ %conv13 = sext i8 %p2 to i32
+ ret void
+}
diff --git a/test/Transforms/GVN/basic.ll b/test/Transforms/GVN/basic.ll
index 1decafa..6f4aace 100644
--- a/test/Transforms/GVN/basic.ll
+++ b/test/Transforms/GVN/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -gvn -S | not grep {%z2 =}
+; RUN: opt < %s -gvn -S | not grep "%z2 ="
define i32 @main() {
block1:
diff --git a/test/Transforms/GVN/calls-readonly.ll b/test/Transforms/GVN/calls-readonly.ll
index 97ec915..a477740 100644
--- a/test/Transforms/GVN/calls-readonly.ll
+++ b/test/Transforms/GVN/calls-readonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -S | grep {call.*strlen} | count 1
+; RUN: opt < %s -basicaa -gvn -S | grep "call.*strlen" | count 1
; Should delete the second call to strlen even though the intervening strchr call exists.
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/Transforms/GVN/fpmath.ll b/test/Transforms/GVN/fpmath.ll
new file mode 100644
index 0000000..8ab2854
--- /dev/null
+++ b/test/Transforms/GVN/fpmath.ll
@@ -0,0 +1,45 @@
+; RUN: opt %s -gvn -S -o - | FileCheck %s
+
+define double @test1(double %x, double %y) {
+; CHECK: @test1(double %x, double %y)
+; CHECK: %add1 = fadd double %x, %y
+; CHECK-NOT: fpmath
+; CHECK: %foo = fadd double %add1, %add1
+ %add1 = fadd double %x, %y, !fpmath !0
+ %add2 = fadd double %x, %y
+ %foo = fadd double %add1, %add2
+ ret double %foo
+}
+
+define double @test2(double %x, double %y) {
+; CHECK: @test2(double %x, double %y)
+; CHECK: %add1 = fadd double %x, %y, !fpmath !0
+; CHECK: %foo = fadd double %add1, %add1
+ %add1 = fadd double %x, %y, !fpmath !0
+ %add2 = fadd double %x, %y, !fpmath !0
+ %foo = fadd double %add1, %add2
+ ret double %foo
+}
+
+define double @test3(double %x, double %y) {
+; CHECK: @test3(double %x, double %y)
+; CHECK: %add1 = fadd double %x, %y, !fpmath !1
+; CHECK: %foo = fadd double %add1, %add1
+ %add1 = fadd double %x, %y, !fpmath !1
+ %add2 = fadd double %x, %y, !fpmath !0
+ %foo = fadd double %add1, %add2
+ ret double %foo
+}
+
+define double @test4(double %x, double %y) {
+; CHECK: @test4(double %x, double %y)
+; CHECK: %add1 = fadd double %x, %y, !fpmath !1
+; CHECK: %foo = fadd double %add1, %add1
+ %add1 = fadd double %x, %y, !fpmath !0
+ %add2 = fadd double %x, %y, !fpmath !1
+ %foo = fadd double %add1, %add2
+ ret double %foo
+}
+
+!0 = metadata !{ float 5.0 }
+!1 = metadata !{ float 2.5 }
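!fpmath gives the maximum ULP error the operation may incur, so when GVN merges two equivalent fadds the survivor can only promise the stricter of the two bounds, and no bound at all when one input had no metadata. A sketch of the rule the CHECK lines encode (not the actual LLVM API):

    # Merge two !fpmath ULP bounds; None (no metadata) means no extra
    # error is permitted, so it wins outright.
    def merge_fpmath(a, b):
        if a is None or b is None:
            return None
        return min(a, b)

    assert merge_fpmath(5.0, None) is None   # test1: metadata dropped
    assert merge_fpmath(5.0, 5.0) == 5.0     # test2: bound kept
    assert merge_fpmath(2.5, 5.0) == 2.5     # test3/test4: stricter bound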
diff --git a/test/Transforms/GVN/load-constant-mem.ll b/test/Transforms/GVN/load-constant-mem.ll
index 314c806..a7dacea 100644
--- a/test/Transforms/GVN/load-constant-mem.ll
+++ b/test/Transforms/GVN/load-constant-mem.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -instcombine -S | grep {ret i32 0}
+; RUN: opt < %s -basicaa -gvn -instcombine -S | grep "ret i32 0"
; PR4189
@G = external constant [4 x i32]
diff --git a/test/Transforms/GVN/local-pre.ll b/test/Transforms/GVN/local-pre.ll
index 5f03984..1d0dadf 100644
--- a/test/Transforms/GVN/local-pre.ll
+++ b/test/Transforms/GVN/local-pre.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -gvn -enable-pre -S | grep {b.pre}
+; RUN: opt < %s -gvn -enable-pre -S | grep "b.pre"
define i32 @main(i32 %p) {
block1:
diff --git a/test/Transforms/GVN/nonescaping-malloc.ll b/test/Transforms/GVN/nonescaping-malloc.ll
index dba9d81..afcb7fe 100644
--- a/test/Transforms/GVN/nonescaping-malloc.ll
+++ b/test/Transforms/GVN/nonescaping-malloc.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -stats -disable-output |& grep {Number of loads deleted}
+; RUN: opt < %s -basicaa -gvn -stats -disable-output 2>&1 | grep "Number of loads deleted"
; rdar://7363102
; GVN should be able to eliminate load %tmp22.i, because it is redundant with
diff --git a/test/Transforms/GVN/pr12979.ll b/test/Transforms/GVN/pr12979.ll
new file mode 100644
index 0000000..669da91
--- /dev/null
+++ b/test/Transforms/GVN/pr12979.ll
@@ -0,0 +1,79 @@
+; RUN: opt %s -gvn -S -o - | FileCheck %s
+
+define i32 @test1(i32 %x, i32 %y) {
+; CHECK: @test1(i32 %x, i32 %y)
+; CHECK: %add1 = add i32 %x, %y
+; CHECK: %foo = add i32 %add1, %add1
+
+ %add1 = add nsw i32 %x, %y
+ %add2 = add i32 %x, %y
+ %foo = add i32 %add1, %add2
+ ret i32 %foo
+}
+
+define i32 @test2(i32 %x, i32 %y) {
+; CHECK: @test2(i32 %x, i32 %y)
+; CHECK: %add1 = add i32 %x, %y
+; CHECK: %foo = add i32 %add1, %add1
+
+ %add1 = add nuw i32 %x, %y
+ %add2 = add i32 %x, %y
+ %foo = add i32 %add1, %add2
+ ret i32 %foo
+}
+
+define i32 @test3(i32 %x, i32 %y) {
+; CHECK: @test3(i32 %x, i32 %y)
+; CHECK: %add1 = add i32 %x, %y
+; CHECK: %foo = add i32 %add1, %add1
+
+ %add1 = add nuw nsw i32 %x, %y
+ %add2 = add i32 %x, %y
+ %foo = add i32 %add1, %add2
+ ret i32 %foo
+}
+
+define i32 @test4(i32 %x, i32 %y) {
+; CHECK: @test4(i32 %x, i32 %y)
+; CHECK: %add1 = add nsw i32 %x, %y
+; CHECK: %foo = add i32 %add1, %add1
+
+ %add1 = add nsw i32 %x, %y
+ %add2 = add nsw i32 %x, %y
+ %foo = add i32 %add1, %add2
+ ret i32 %foo
+}
+
+define i32 @test5(i32 %x, i32 %y) {
+; CHECK: @test5(i32 %x, i32 %y)
+; CHECK: %add1 = add i32 %x, %y
+; CHECK: %foo = add i32 %add1, %add1
+
+ %add1 = add nuw i32 %x, %y
+ %add2 = add nsw i32 %x, %y
+ %foo = add i32 %add1, %add2
+ ret i32 %foo
+}
+
+define i32 @test6(i32 %x, i32 %y) {
+; CHECK: @test6(i32 %x, i32 %y)
+; CHECK: %add1 = add nsw i32 %x, %y
+; CHECK: %foo = add i32 %add1, %add1
+
+ %add1 = add nuw nsw i32 %x, %y
+ %add2 = add nsw i32 %x, %y
+ %foo = add i32 %add1, %add2
+ ret i32 %foo
+}
+
+define i32 @test7(i32 %x, i32 %y) {
+; CHECK: @test7(i32 %x, i32 %y)
+; CHECK: %add1 = add i32 %x, %y
+; CHECK-NOT: what_is_this
+; CHECK: %foo = add i32 %add1, %add1
+
+ %add1 = add i32 %x, %y, !what_is_this !{}
+ %add2 = add i32 %x, %y
+ %foo = add i32 %add1, %add2
+ ret i32 %foo
+}
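The pattern throughout pr12979.ll: when GVN unifies two otherwise-identical adds, the surviving instruction keeps only the wrap flags present on both inputs. In effect (a sketch of the rule the CHECK lines encode, not the implementation):

    # Merged wrap flags are the intersection of the two inputs' flags.
    def merge_wrap_flags(a, b):
        return a & b

    assert merge_wrap_flags({"nsw"}, set()) == set()               # test1
    assert merge_wrap_flags({"nsw"}, {"nsw"}) == {"nsw"}           # test4
    assert merge_wrap_flags({"nuw"}, {"nsw"}) == set()             # test5
    assert merge_wrap_flags({"nuw", "nsw"}, {"nsw"}) == {"nsw"}    # test6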
diff --git a/test/Transforms/GVN/pre-basic-add.ll b/test/Transforms/GVN/pre-basic-add.ll
index c13099f..4bde05c 100644
--- a/test/Transforms/GVN/pre-basic-add.ll
+++ b/test/Transforms/GVN/pre-basic-add.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -gvn -enable-pre -S | grep {.pre}
+; RUN: opt < %s -gvn -enable-pre -S | grep ".pre"
@H = common global i32 0 ; <i32*> [#uses=2]
@G = common global i32 0 ; <i32*> [#uses=1]
diff --git a/test/Transforms/GVN/range.ll b/test/Transforms/GVN/range.ll
new file mode 100644
index 0000000..3759c41
--- /dev/null
+++ b/test/Transforms/GVN/range.ll
@@ -0,0 +1,101 @@
+; RUN: opt %s -basicaa -gvn -S -o - | FileCheck %s
+
+define i32 @test1(i32* %p) {
+; CHECK: @test1(i32* %p)
+; CHECK: %a = load i32* %p, !range !0
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !0
+ %b = load i32* %p, !range !0
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test2(i32* %p) {
+; CHECK: @test2(i32* %p)
+; CHECK: %a = load i32* %p
+; CHECK-NOT: range
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !0
+ %b = load i32* %p
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test3(i32* %p) {
+; CHECK: @test3(i32* %p)
+; CHECK: %a = load i32* %p, !range ![[DISJOINT_RANGE:[0-9]+]]
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !0
+ %b = load i32* %p, !range !1
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test4(i32* %p) {
+; CHECK: @test4(i32* %p)
+; CHECK: %a = load i32* %p, !range ![[MERGED_RANGE:[0-9]+]]
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !0
+ %b = load i32* %p, !range !2
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test5(i32* %p) {
+; CHECK: @test5(i32* %p)
+; CHECK: %a = load i32* %p, !range ![[MERGED_SIGNED_RANGE:[0-9]+]]
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !3
+ %b = load i32* %p, !range !4
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test6(i32* %p) {
+; CHECK: @test6(i32* %p)
+; CHECK: %a = load i32* %p, !range ![[MERGED_TEST6:[0-9]+]]
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !5
+ %b = load i32* %p, !range !6
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test7(i32* %p) {
+; CHECK: @test7(i32* %p)
+; CHECK: %a = load i32* %p, !range ![[MERGED_TEST7:[0-9]+]]
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !7
+ %b = load i32* %p, !range !8
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test8(i32* %p) {
+; CHECK: @test8(i32* %p)
+; CHECK: %a = load i32* %p
+; CHECK-NOT: range
+; CHECK: %c = add i32 %a, %a
+ %a = load i32* %p, !range !9
+ %b = load i32* %p, !range !10
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+; CHECK: ![[DISJOINT_RANGE]] = metadata !{i32 0, i32 2, i32 3, i32 5}
+; CHECK: ![[MERGED_RANGE]] = metadata !{i32 0, i32 5}
+; CHECK: ![[MERGED_SIGNED_RANGE]] = metadata !{i32 -3, i32 -2, i32 1, i32 2}
+; CHECK: ![[MERGED_TEST6]] = metadata !{i32 10, i32 1}
+; CHECK: ![[MERGED_TEST7]] = metadata !{i32 3, i32 4, i32 5, i32 2}
+
+!0 = metadata !{i32 0, i32 2}
+!1 = metadata !{i32 3, i32 5}
+!2 = metadata !{i32 2, i32 5}
+!3 = metadata !{i32 -3, i32 -2}
+!4 = metadata !{i32 1, i32 2}
+!5 = metadata !{i32 10, i32 1}
+!6 = metadata !{i32 12, i32 13}
+!7 = metadata !{i32 1, i32 2, i32 3, i32 4}
+!8 = metadata !{i32 5, i32 1}
+!9 = metadata !{i32 1, i32 5}
+!10 = metadata !{i32 5, i32 1}
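These tests pin down !range merging when GVN combines two loads: disjoint ranges are concatenated (test3), touching or overlapping ranges coalesce (test4), and a union that covers every i32 value drops the metadata entirely (test8). For the non-wrapping cases the union is plain interval merging, sketched below; wrapped ranges such as !{i32 5, i32 1} (tests 6-8) need extra handling not shown:

    # Union two flat [lo, hi) pair lists into merged !range-style pairs.
    def union_ranges(a, b):
        pts = a + b
        spans = sorted(zip(pts[::2], pts[1::2]))
        out = []
        for lo, hi in spans:
            if out and lo <= out[-1][1]:
                out[-1][1] = max(out[-1][1], hi)   # coalesce overlap/adjacency
            else:
                out.append([lo, hi])
        return [v for span in out for v in span]

    assert union_ranges([0, 2], [3, 5]) == [0, 2, 3, 5]       # test3: stays disjoint
    assert union_ranges([0, 2], [2, 5]) == [0, 5]             # test4: coalesced
    assert union_ranges([-3, -2], [1, 2]) == [-3, -2, 1, 2]   # test5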
diff --git a/test/Transforms/GVN/rle-must-alias.ll b/test/Transforms/GVN/rle-must-alias.ll
index 4797240..e7dc9c4 100644
--- a/test/Transforms/GVN/rle-must-alias.ll
+++ b/test/Transforms/GVN/rle-must-alias.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -S | grep {DEAD = phi i32 }
+; RUN: opt < %s -basicaa -gvn -S | grep "DEAD = phi i32 "
; GVN should eliminate the fully redundant %9 GEP which
; allows DEAD to be removed. This is PR3198.
diff --git a/test/Transforms/GVN/rle-semidominated.ll b/test/Transforms/GVN/rle-semidominated.ll
index c6cd1fd..71aa548 100644
--- a/test/Transforms/GVN/rle-semidominated.ll
+++ b/test/Transforms/GVN/rle-semidominated.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -gvn -S | grep {DEAD = phi i32 }
+; RUN: opt < %s -basicaa -gvn -S | grep "DEAD = phi i32 "
define i32 @main(i32* %p) {
block1:
diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll
new file mode 100644
index 0000000..90661c6
--- /dev/null
+++ b/test/Transforms/GVN/tbaa.ll
@@ -0,0 +1,81 @@
+; RUN: opt %s -basicaa -gvn -S -o - | FileCheck %s
+
+define i32 @test1(i8* %p, i8* %q) {
+; CHECK: @test1(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p)
+; CHECK-NOT: tbaa
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !0
+ %b = call i32 @foo(i8* %p)
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test2(i8* %p, i8* %q) {
+; CHECK: @test2(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa !0
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !0
+ %b = call i32 @foo(i8* %p), !tbaa !0
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test3(i8* %p, i8* %q) {
+; CHECK: @test3(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa !3
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !3
+ %b = call i32 @foo(i8* %p), !tbaa !3
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test4(i8* %p, i8* %q) {
+; CHECK: @test4(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa !1
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !1
+ %b = call i32 @foo(i8* %p), !tbaa !0
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test5(i8* %p, i8* %q) {
+; CHECK: @test5(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa !1
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !0
+ %b = call i32 @foo(i8* %p), !tbaa !1
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test6(i8* %p, i8* %q) {
+; CHECK: @test6(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa !1
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !0
+ %b = call i32 @foo(i8* %p), !tbaa !3
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+define i32 @test7(i8* %p, i8* %q) {
+; CHECK: @test7(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p)
+; CHECK-NOT: tbaa
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !4
+ %b = call i32 @foo(i8* %p), !tbaa !3
+ %c = add i32 %a, %b
+ ret i32 %c
+}
+
+declare i32 @foo(i8*) readonly
+
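+; The TBAA hierarchy defined below: "C" (!0) and "B" (!3) are siblings under
+; "A" (!1), which hangs off the root (!2); !4 is a separate, unrelated root.
+; Related but unequal tags merge to their common ancestor "A" (tests 4-6),
+; while tags from unrelated roots lose their TBAA info entirely (test7).
+;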
+!0 = metadata !{metadata !"C", metadata !1}
+!1 = metadata !{metadata !"A", metadata !2}
+!2 = metadata !{metadata !"tbaa root", null}
+!3 = metadata !{metadata !"B", metadata !1}
+!4 = metadata !{metadata !"another root", null}
diff --git a/test/Transforms/GlobalOpt/2008-01-13-OutOfRangeSROA.ll b/test/Transforms/GlobalOpt/2008-01-13-OutOfRangeSROA.ll
index 82abc8fe..7c07d5d 100644
--- a/test/Transforms/GlobalOpt/2008-01-13-OutOfRangeSROA.ll
+++ b/test/Transforms/GlobalOpt/2008-01-13-OutOfRangeSROA.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -globalopt -S | grep {16 x .31 x double.. zeroinitializer}
+; RUN: opt < %s -globalopt -S | grep "16 x .31 x double.. zeroinitializer"
; The 'X' indices could be larger than 31. Do not SROA the outer indices of this array.
@mm = internal global [16 x [31 x double]] zeroinitializer, align 32
diff --git a/test/Transforms/GlobalOpt/2008-01-29-VolatileGlobal.ll b/test/Transforms/GlobalOpt/2008-01-29-VolatileGlobal.ll
index 588d5c9..08b2cb1 100644
--- a/test/Transforms/GlobalOpt/2008-01-29-VolatileGlobal.ll
+++ b/test/Transforms/GlobalOpt/2008-01-29-VolatileGlobal.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -globalopt -S | grep {load volatile}
+; RUN: opt < %s -globalopt -S | grep "load volatile"
@t0.1441 = internal global double 0x3FD5555555555555, align 8 ; <double*> [#uses=1]
define double @foo() nounwind {
diff --git a/test/Transforms/GlobalOpt/2008-04-26-SROA-Global-Align.ll b/test/Transforms/GlobalOpt/2008-04-26-SROA-Global-Align.ll
index 5b06fea..d58becd 100644
--- a/test/Transforms/GlobalOpt/2008-04-26-SROA-Global-Align.ll
+++ b/test/Transforms/GlobalOpt/2008-04-26-SROA-Global-Align.ll
@@ -2,9 +2,9 @@
; alignments. Elements 0 and 2 must be 16-byte aligned, and element
; 1 must be at least 8-byte aligned (but could be more).
-; RUN: opt < %s -globalopt -S | grep {@G.0 = internal unnamed_addr global .*align 16}
-; RUN: opt < %s -globalopt -S | grep {@G.1 = internal unnamed_addr global .*align 8}
-; RUN: opt < %s -globalopt -S | grep {@G.2 = internal unnamed_addr global .*align 16}
+; RUN: opt < %s -globalopt -S | grep "@G.0 = internal unnamed_addr global .*align 16"
+; RUN: opt < %s -globalopt -S | grep "@G.1 = internal unnamed_addr global .*align 8"
+; RUN: opt < %s -globalopt -S | grep "@G.2 = internal unnamed_addr global .*align 16"
; rdar://5891920
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/Transforms/GlobalOpt/2009-01-13-phi-user.ll b/test/Transforms/GlobalOpt/2009-01-13-phi-user.ll
index c4b6e52..e76c44d 100644
--- a/test/Transforms/GlobalOpt/2009-01-13-phi-user.ll
+++ b/test/Transforms/GlobalOpt/2009-01-13-phi-user.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -globalopt -S | grep {phi.*@head}
+; RUN: opt < %s -globalopt -S | grep "phi.*@head"
; PR3321
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
index 3154856..0f3efa0 100644
--- a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
+++ b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -globalopt -stats -disable-output |& grep "1 globalopt - Number of global vars shrunk to booleans"
+; RUN: opt < %s -globalopt -stats -disable-output 2>&1 | grep "1 globalopt - Number of global vars shrunk to booleans"
@Stop = internal global i32 0 ; <i32*> [#uses=3]
diff --git a/test/Transforms/GlobalOpt/2009-03-07-PromotePtrToBool.ll b/test/Transforms/GlobalOpt/2009-03-07-PromotePtrToBool.ll
index d645ce4..059af1c 100644
--- a/test/Transforms/GlobalOpt/2009-03-07-PromotePtrToBool.ll
+++ b/test/Transforms/GlobalOpt/2009-03-07-PromotePtrToBool.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -globalopt -S | grep {@X = internal unnamed_addr global i32}
+; RUN: opt < %s -globalopt -S | grep "@X = internal unnamed_addr global i32"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin7"
@X = internal global i32* null ; <i32**> [#uses=2]
diff --git a/test/Transforms/GlobalOpt/2009-11-16-BrokenPerformHeapAllocSRoA.ll b/test/Transforms/GlobalOpt/2009-11-16-BrokenPerformHeapAllocSRoA.ll
index 54e8f90..40862bd 100644
--- a/test/Transforms/GlobalOpt/2009-11-16-BrokenPerformHeapAllocSRoA.ll
+++ b/test/Transforms/GlobalOpt/2009-11-16-BrokenPerformHeapAllocSRoA.ll
@@ -17,7 +17,7 @@ define void @test() nounwind ssp {
%2 = sext i32 %1 to i64 ; <i64> [#uses=1]
%3 = mul i64 %2, ptrtoint (%struct.strchartype* getelementptr (%struct.strchartype* null, i64 1) to i64) ; <i64> [#uses=1]
%4 = tail call i8* @malloc(i64 %3) ; <i8*> [#uses=1]
-; CHECK: call i8* @malloc(i64
+; CHECK-NOT: call i8* @malloc(i64
%5 = bitcast i8* %4 to %struct.strchartype* ; <%struct.strchartype*> [#uses=1]
store %struct.strchartype* %5, %struct.strchartype** @chartypes, align 8
ret void
diff --git a/test/Transforms/GlobalOpt/2012-05-11-blockaddress.ll b/test/Transforms/GlobalOpt/2012-05-11-blockaddress.ll
new file mode 100644
index 0000000..0c58c1a
--- /dev/null
+++ b/test/Transforms/GlobalOpt/2012-05-11-blockaddress.ll
@@ -0,0 +1,16 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+; Check that the mere presence of a blockaddress doesn't prevent -globalopt
+; from promoting @f to fastcc.
+
+; CHECK: define{{.*}}fastcc{{.*}}@f
+define internal i8* @f() {
+ ret i8* blockaddress(@f, %L1)
+L1:
+ ret i8* null
+}
+
+define void @g() {
+ ; CHECK: call{{.*}}fastcc{{.*}}@f
+ %p = call i8* @f()
+ ret void
+}
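+; Promoting @f to fastcc also rewrites its call site in @g, which is what the
+; second CHECK line verifies.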
diff --git a/test/Transforms/GlobalOpt/cleanup-pointer-root-users.ll b/test/Transforms/GlobalOpt/cleanup-pointer-root-users.ll
new file mode 100644
index 0000000..a472f10
--- /dev/null
+++ b/test/Transforms/GlobalOpt/cleanup-pointer-root-users.ll
@@ -0,0 +1,49 @@
+; RUN: opt -globalopt -S -o - < %s | FileCheck %s
+
+@glbl = internal global i8* null
+
+define void @test1a() {
+; CHECK: @test1a
+; CHECK-NOT: store
+; CHECK-NEXT: ret void
+ store i8* null, i8** @glbl
+ ret void
+}
+
+define void @test1b(i8* %p) {
+; CHECK: @test1b
+; CHECK-NEXT: store
+; CHECK-NEXT: ret void
+ store i8* %p, i8** @glbl
+ ret void
+}
+
+define void @test2() {
+; CHECK: @test2
+; CHECK: alloca i8
+ %txt = alloca i8
+ call void @foo2(i8* %txt)
+ %call2 = call i8* @strdup(i8* %txt)
+ store i8* %call2, i8** @glbl
+ ret void
+}
+declare i8* @strdup(i8*)
+declare void @foo2(i8*)
+
+define void @test3() uwtable {
+; CHECK: @test3
+; CHECK-NOT: bb1:
+; CHECK-NOT: bb2:
+; CHECK: invoke
+ %ptr = invoke i8* @_Znwm(i64 1)
+ to label %bb1 unwind label %bb2
+bb1:
+ store i8* %ptr, i8** @glbl
+ unreachable
+bb2:
+ %tmp1 = landingpad { i8*, i32 } personality i32 (i32, i64, i8*, i8*)* @__gxx_personality_v0
+ cleanup
+ resume { i8*, i32 } %tmp1
+}
+declare i32 @__gxx_personality_v0(i32, i64, i8*, i8*)
+declare i8* @_Znwm(i64)
diff --git a/test/Transforms/GlobalOpt/constantexpr-dangle.ll b/test/Transforms/GlobalOpt/constantexpr-dangle.ll
index 099c607..be13a98 100644
--- a/test/Transforms/GlobalOpt/constantexpr-dangle.ll
+++ b/test/Transforms/GlobalOpt/constantexpr-dangle.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -globalopt -S | \
-; RUN: grep {internal fastcc float @foo}
+; RUN: grep "internal fastcc float @foo"
define internal float @foo() {
ret float 0.000000e+00
diff --git a/test/Transforms/GlobalOpt/deadglobal.ll b/test/Transforms/GlobalOpt/deadglobal.ll
index c8d8e76..cad5a91 100644
--- a/test/Transforms/GlobalOpt/deadglobal.ll
+++ b/test/Transforms/GlobalOpt/deadglobal.ll
@@ -1,9 +1,25 @@
-; RUN: opt < %s -globalopt -S | not grep internal
+; RUN: opt < %s -globalopt -S | FileCheck %s
-@G = internal global i32 123 ; <i32*> [#uses=1]
+@G1 = internal global i32 123 ; <i32*> [#uses=1]
-define void @foo() {
- store i32 1, i32* @G
+; CHECK-NOT: @G1
+; CHECK: @G2
+; CHECK-NOT: @G3
+
+define void @foo1() {
+; CHECK: define void @foo
+; CHECK-NEXT: ret
+ store i32 1, i32* @G1
+ ret void
+}
+
+@G2 = linkonce_odr constant i32 42
+
+define void @foo2() {
+; CHECK: define void @foo2
+; CHECK-NEXT: store
+ store i32 1, i32* @G2
ret void
}
+@G3 = linkonce_odr constant i32 42
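+; @G2 must survive because @foo2 still stores to it, while @G3 is completely
+; unreferenced and may be discarded despite its linkonce_odr linkage.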
diff --git a/test/Transforms/GlobalOpt/globalsra-unknown-index.ll b/test/Transforms/GlobalOpt/globalsra-unknown-index.ll
index 1e0db6a..cc655e9 100644
--- a/test/Transforms/GlobalOpt/globalsra-unknown-index.ll
+++ b/test/Transforms/GlobalOpt/globalsra-unknown-index.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -globalopt -S > %t
-; RUN: grep {@Y = internal unnamed_addr global \\\[3 x \[%\]struct.X\\\] zeroinitializer} %t
+; RUN: grep "@Y = internal unnamed_addr global \[3 x [%]struct.X\] zeroinitializer" %t
; RUN: grep load %t | count 6
-; RUN: grep {add i32 \[%\]a, \[%\]b} %t | count 3
+; RUN: grep "add i32 [%]a, [%]b" %t | count 3
; globalopt should not sra the global, because it can't see the index.
diff --git a/test/Transforms/GlobalOpt/heap-sra-phi.ll b/test/Transforms/GlobalOpt/heap-sra-phi.ll
index 6188e5a..123ad85 100644
--- a/test/Transforms/GlobalOpt/heap-sra-phi.ll
+++ b/test/Transforms/GlobalOpt/heap-sra-phi.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -globalopt -S | grep {tmp.f1 = phi i32. }
-; RUN: opt < %s -globalopt -S | grep {tmp.f0 = phi i32. }
+; RUN: opt < %s -globalopt -S | grep "tmp.f1 = phi i32. "
+; RUN: opt < %s -globalopt -S | grep "tmp.f0 = phi i32. "
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
%struct.foo = type { i32, i32 }
diff --git a/test/Transforms/GlobalOpt/integer-bool.ll b/test/Transforms/GlobalOpt/integer-bool.ll
index 59403b1..5a34a9c 100644
--- a/test/Transforms/GlobalOpt/integer-bool.ll
+++ b/test/Transforms/GlobalOpt/integer-bool.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -globalopt -instcombine | \
-; RUN: llvm-dis | grep {ret i1 true}
+; RUN: llvm-dis | grep "ret i1 true"
;; check that global opt turns integers that only hold 0 or 1 into bools.
diff --git a/test/Transforms/GlobalOpt/memcpy.ll b/test/Transforms/GlobalOpt/memcpy.ll
index 94e07a0..dcfe009 100644
--- a/test/Transforms/GlobalOpt/memcpy.ll
+++ b/test/Transforms/GlobalOpt/memcpy.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -globalopt -S | \
-; RUN: grep {G1 = internal unnamed_addr constant}
+; RUN: grep "G1 = internal unnamed_addr constant"
@G1 = internal global [58 x i8] c"asdlfkajsdlfkajsd;lfkajds;lfkjasd;flkajsd;lkfja;sdlkfjasd\00" ; <[58 x i8]*> [#uses=1]
diff --git a/test/Transforms/GlobalOpt/storepointer-compare.ll b/test/Transforms/GlobalOpt/storepointer-compare.ll
index 2f5ae86..09e20a8 100644
--- a/test/Transforms/GlobalOpt/storepointer-compare.ll
+++ b/test/Transforms/GlobalOpt/storepointer-compare.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -globalopt -S | \
-; RUN: grep {call void @Actual}
+; RUN: grep "call void @Actual"
; Check that a comparison does not prevent an indirect call from being made
; direct. The global will still remain, but indirect call elim is still good.
diff --git a/test/Transforms/GlobalOpt/unnamed-addr.ll b/test/Transforms/GlobalOpt/unnamed-addr.ll
index be02821..ee75058 100644
--- a/test/Transforms/GlobalOpt/unnamed-addr.ll
+++ b/test/Transforms/GlobalOpt/unnamed-addr.ll
@@ -4,17 +4,31 @@
@b = internal global i32 0, align 4
@c = internal global i32 0, align 4
@d = internal constant [4 x i8] c"foo\00", align 1
+@e = linkonce_odr global i32 0
; CHECK: @a = internal global i32 0, align 4
; CHECK: @b = internal global i32 0, align 4
; CHECK: @c = internal unnamed_addr global i32 0, align 4
; CHECK: @d = internal unnamed_addr constant [4 x i8] c"foo\00", align 1
+; CHECK: @e = linkonce_odr global i32 0
+
+define i32 @get_e() {
+ %t = load i32* @e
+ ret i32 %t
+}
+
+define void @set_e(i32 %x) {
+ store i32 %x, i32* @e
+ ret void
+}
define i1 @bah(i64 %i) nounwind readonly optsize ssp {
entry:
%arrayidx4 = getelementptr inbounds [4 x i8]* @d, i64 0, i64 %i
%tmp5 = load i8* %arrayidx4, align 1
- %cmp = icmp eq i8 %tmp5, 42
+ %array0 = bitcast [4 x i8]* @d to i8*
+ %tmp6 = load i8* %array0, align 1
+ %cmp = icmp eq i8 %tmp5, %tmp6
ret i1 %cmp
}
diff --git a/test/Transforms/IPConstantProp/2008-06-09-WeakProp.ll b/test/Transforms/IPConstantProp/2008-06-09-WeakProp.ll
index 6640336..54a65d6 100644
--- a/test/Transforms/IPConstantProp/2008-06-09-WeakProp.ll
+++ b/test/Transforms/IPConstantProp/2008-06-09-WeakProp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -ipconstprop -S | grep {ret i32 %r}
+; RUN: opt < %s -ipconstprop -S | grep "ret i32 %r"
; Should not propagate the result of a weak function.
; PR2411
diff --git a/test/Transforms/IPConstantProp/return-argument.ll b/test/Transforms/IPConstantProp/return-argument.ll
index f4b7018..2a14f05 100644
--- a/test/Transforms/IPConstantProp/return-argument.ll
+++ b/test/Transforms/IPConstantProp/return-argument.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -ipconstprop -S > %t
-; RUN: cat %t | grep {store i32 %Z, i32\\* %Q}
-; RUN: cat %t | grep {add i32 1, 3}
+; RUN: cat %t | grep "store i32 %Z, i32\* %Q"
+; RUN: cat %t | grep "add i32 1, 3"
;; This function returns its second argument on all return statements
define internal i32* @incdec(i1 %C, i32* %V) {
diff --git a/test/Transforms/IPConstantProp/return-constant.ll b/test/Transforms/IPConstantProp/return-constant.ll
index ff15df7..499d383 100644
--- a/test/Transforms/IPConstantProp/return-constant.ll
+++ b/test/Transforms/IPConstantProp/return-constant.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -ipconstprop -instcombine | \
-; RUN: llvm-dis | grep {ret i1 true} | count 2
+; RUN: llvm-dis | grep "ret i1 true" | count 2
define internal i32 @foo(i1 %C) {
br i1 %C, label %T, label %F
diff --git a/test/Transforms/IPConstantProp/return-constants.ll b/test/Transforms/IPConstantProp/return-constants.ll
index 2cd99fe..be2ca71 100644
--- a/test/Transforms/IPConstantProp/return-constants.ll
+++ b/test/Transforms/IPConstantProp/return-constants.ll
@@ -1,8 +1,8 @@
; RUN: opt < %s -ipconstprop -S > %t
;; Check that the '21' constants got propagated properly
-; RUN: cat %t | grep {%M = add i32 21, 21}
+; RUN: cat %t | grep "%M = add i32 21, 21"
;; Check that the second return values didn't get propagated
-; RUN: cat %t | grep {%N = add i32 %B, %D}
+; RUN: cat %t | grep "%N = add i32 %B, %D"
%0 = type { i32, i32 }
diff --git a/test/Transforms/IndVarSimplify/2005-02-26-ExitValueCompute.ll b/test/Transforms/IndVarSimplify/2005-02-26-ExitValueCompute.ll
index 1ba6982..edeead1 100644
--- a/test/Transforms/IndVarSimplify/2005-02-26-ExitValueCompute.ll
+++ b/test/Transforms/IndVarSimplify/2005-02-26-ExitValueCompute.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -indvars -S | \
-; RUN: grep {ret i32 152}
+; RUN: grep "ret i32 152"
define i32 @main() {
entry:
diff --git a/test/Transforms/IndVarSimplify/2006-03-31-NegativeStride.ll b/test/Transforms/IndVarSimplify/2006-03-31-NegativeStride.ll
index 1bbc631..c4e6cd4 100644
--- a/test/Transforms/IndVarSimplify/2006-03-31-NegativeStride.ll
+++ b/test/Transforms/IndVarSimplify/2006-03-31-NegativeStride.ll
@@ -1,6 +1,6 @@
; PR726
; RUN: opt < %s -indvars -S | \
-; RUN: grep {ret i32 27}
+; RUN: grep "ret i32 27"
; Make sure to compute the right exit value based on negative strides.
diff --git a/test/Transforms/IndVarSimplify/2007-01-06-TripCount.ll b/test/Transforms/IndVarSimplify/2007-01-06-TripCount.ll
index 268b8d1..6366c8c 100644
--- a/test/Transforms/IndVarSimplify/2007-01-06-TripCount.ll
+++ b/test/Transforms/IndVarSimplify/2007-01-06-TripCount.ll
@@ -1,5 +1,5 @@
; PR1015
-; RUN: opt < %s -indvars -S | not grep {ret i32 0}
+; RUN: opt < %s -indvars -S | not grep "ret i32 0"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin8"
diff --git a/test/Transforms/IndVarSimplify/2009-04-14-shorten_iv_vars.ll b/test/Transforms/IndVarSimplify/2009-04-14-shorten_iv_vars.ll
index dd400be..b461566 100644
--- a/test/Transforms/IndVarSimplify/2009-04-14-shorten_iv_vars.ll
+++ b/test/Transforms/IndVarSimplify/2009-04-14-shorten_iv_vars.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -S | not grep {sext}
+; RUN: opt < %s -indvars -S | not grep "sext"
; ModuleID = '<stdin>'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n32:64"
target triple = "x86_64-apple-darwin9.6"
diff --git a/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll b/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll
index 55e8a50..0722d89 100644
--- a/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll
+++ b/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -instcombine -S | not grep {\[sz\]ext}
+; RUN: opt < %s -indvars -instcombine -S | not grep "[sz]ext"
; ModuleID = '<stdin>'
;extern int *a, *b, *c, *d, *e, *f; /* 64 bit */
;extern int K[256];
diff --git a/test/Transforms/IndVarSimplify/2012-07-17-lftr-undef.ll b/test/Transforms/IndVarSimplify/2012-07-17-lftr-undef.ll
new file mode 100644
index 0000000..7c5f818
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/2012-07-17-lftr-undef.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+; PR13371: indvars pass incorrectly substitutes 'undef' values
+;
+; LFTR should not use %undef as the loop counter.
+; CHECK: @test
+; CHECK-NOT: icmp{{.*}}undef
+@.str3 = private constant [6 x i8] c"%lld\0A\00", align 1
+declare i32 @printf(i8* noalias nocapture, ...) nounwind
+define i64 @test() nounwind {
+func_start:
+ br label %block9
+block9: ; preds = %block9,%func_start
+ %undef = phi i64 [ %next_undef, %block9 ], [ undef, %func_start ]
+ %iter = phi i64 [ %next_iter, %block9 ], [ 1, %func_start ]
+ %next_iter = add nsw i64 %iter, 1
+ %0 = tail call i32 (i8*, ...)* @printf(i8* noalias nocapture getelementptr inbounds ([6 x i8]* @.str3, i64 0, i64 0), i64 %next_iter, i64 %undef)
+ %next_undef = add nsw i64 %undef, 1
+ %_tmp_3 = icmp slt i64 %next_iter, 100
+ br i1 %_tmp_3, label %block9, label %exit
+exit: ; preds = %block9
+ ret i64 0
+}
diff --git a/test/Transforms/IndVarSimplify/eliminate-max.ll b/test/Transforms/IndVarSimplify/eliminate-max.ll
index c25bd0e..98510ea 100644
--- a/test/Transforms/IndVarSimplify/eliminate-max.ll
+++ b/test/Transforms/IndVarSimplify/eliminate-max.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -indvars | grep {= icmp} | count 3
+; RUN: opt < %s -S -indvars | grep "= icmp" | count 3
; PR4914.ll
; Indvars should be able to do range analysis and eliminate icmps.
diff --git a/test/Transforms/IndVarSimplify/lftr-reuse.ll b/test/Transforms/IndVarSimplify/lftr-reuse.ll
index 9abfe13..7fb36e5 100644
--- a/test/Transforms/IndVarSimplify/lftr-reuse.ll
+++ b/test/Transforms/IndVarSimplify/lftr-reuse.ll
@@ -153,6 +153,9 @@ return:
; Remove %i which is only used by the exit test.
; Verify that SCEV can still compute a backedge count from the sign
; extended %n, used for pointer comparison by LFTR.
+;
+; TODO: Fix for PR13371 currently makes this impossible. See
+; IndVarSimplify.cpp hasConcreteDef(). We may want to change the undef rules.
define void @geplftr(i8* %base, i32 %x, i32 %y, i32 %n) nounwind {
entry:
%x.ext = sext i32 %x to i64
@@ -162,13 +165,13 @@ entry:
%lim = add i32 %x, %n
%cmp.ph = icmp ult i32 %x, %lim
br i1 %cmp.ph, label %loop, label %exit
-
+; CHECK: @geplftr
; CHECK: loop:
; CHECK: phi i8*
-; CHECK-NOT: phi
+; DISABLE-NOT: phi // This check is currently disabled
; CHECK: getelementptr
; CHECK: store
-; CHECK: icmp ne i8*
+; DISABLE: icmp ne i8* // This check is currently disabled
; CHECK: br i1
loop:
%i = phi i32 [ %x, %entry ], [ %inc, %loop ]
@@ -187,7 +190,7 @@ exit:
define void @nevertaken() nounwind uwtable ssp {
entry:
br label %loop
-
+; CHECK: @nevertaken
; CHECK: loop:
; CHECK-NOT: phi
; CHECK-NOT: add
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate10.ll b/test/Transforms/IndVarSimplify/loop_evaluate10.ll
index c3619f6..e51a341 100644
--- a/test/Transforms/IndVarSimplify/loop_evaluate10.ll
+++ b/test/Transforms/IndVarSimplify/loop_evaluate10.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -indvars -S \
-; RUN: | grep {%b.1 = phi i32 \\\[ 2, %bb \\\], \\\[ 1, %bb2 \\\]}
+; RUN: | grep "%b.1 = phi i32 [ 2, %bb ], [ 1, %bb2 ]"
;
; This loop has multiple exits, and the value of %b.1 depends on which
; exit is taken. Indvars should correctly compute the exit values.
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate9.ll b/test/Transforms/IndVarSimplify/loop_evaluate9.ll
index 9f3bcaf..21fb7ef 100644
--- a/test/Transforms/IndVarSimplify/loop_evaluate9.ll
+++ b/test/Transforms/IndVarSimplify/loop_evaluate9.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -indvars -S > %t
-; RUN: grep {\[%\]tmp7 = icmp eq i8 -28, -28} %t
-; RUN: grep {\[%\]tmp8 = icmp eq i8 63, 63} %t
+; RUN: grep "[%]tmp7 = icmp eq i8 -28, -28" %t
+; RUN: grep "[%]tmp8 = icmp eq i8 63, 63" %t
; PR4477
; Indvars should compute the exit values in loop.
;
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate_3.ll b/test/Transforms/IndVarSimplify/loop_evaluate_3.ll
index 65c66f7..0c1b590 100644
--- a/test/Transforms/IndVarSimplify/loop_evaluate_3.ll
+++ b/test/Transforms/IndVarSimplify/loop_evaluate_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -S | grep {ret i32 600000}
+; RUN: opt < %s -indvars -S | grep "ret i32 600000"
; PR1179
define i32 @foo() {
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate_4.ll b/test/Transforms/IndVarSimplify/loop_evaluate_4.ll
index e4b642c..d7eb406 100644
--- a/test/Transforms/IndVarSimplify/loop_evaluate_4.ll
+++ b/test/Transforms/IndVarSimplify/loop_evaluate_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -S | grep {ret i32 9900}
+; RUN: opt < %s -indvars -S | grep "ret i32 9900"
; PR1179
define i32 @test4() {
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate_5.ll b/test/Transforms/IndVarSimplify/loop_evaluate_5.ll
index 80b961a..38f95bf 100644
--- a/test/Transforms/IndVarSimplify/loop_evaluate_5.ll
+++ b/test/Transforms/IndVarSimplify/loop_evaluate_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -S | grep {120, %bb2.bb3_crit_edge}
+; RUN: opt < %s -indvars -S | grep "120, %bb2.bb3_crit_edge"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "i686-pc-linux-gnu"
diff --git a/test/Transforms/IndVarSimplify/shrunk-constant.ll b/test/Transforms/IndVarSimplify/shrunk-constant.ll
index 271f8ed..45297d6 100644
--- a/test/Transforms/IndVarSimplify/shrunk-constant.ll
+++ b/test/Transforms/IndVarSimplify/shrunk-constant.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -scalar-evolution -analyze \
-; RUN: | grep {\\--> (zext i4 {-7,+,-8}<%loop> to i32)}
+; RUN: | grep "\--> (zext i4 {-7,+,-8}<%loop> to i32)"
define fastcc void @foo() nounwind {
entry:
diff --git a/test/Transforms/IndVarSimplify/ult-sub-to-eq.ll b/test/Transforms/IndVarSimplify/ult-sub-to-eq.ll
new file mode 100644
index 0000000..c58a3af
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/ult-sub-to-eq.ll
@@ -0,0 +1,42 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+define void @test1(float* nocapture %autoc, float* nocapture %data, float %d, i32 %data_len, i32 %sample) nounwind {
+entry:
+ %sub = sub i32 %data_len, %sample
+ %cmp4 = icmp eq i32 %data_len, %sample
+ br i1 %cmp4, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %0 = trunc i64 %indvars.iv to i32
+ %add = add i32 %0, %sample
+ %idxprom = zext i32 %add to i64
+ %arrayidx = getelementptr inbounds float* %data, i64 %idxprom
+ %1 = load float* %arrayidx, align 4
+ %mul = fmul float %1, %d
+ %arrayidx2 = getelementptr inbounds float* %autoc, i64 %indvars.iv
+ %2 = load float* %arrayidx2, align 4
+ %add3 = fadd float %2, %mul
+ store float %add3, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %3 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp ult i32 %3, %sub
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+
+; CHECK: @test1
+
+; First, check that we move the sub into the preheader; it doesn't have to be
+; executed if %cmp4 == false.
+; CHECK: for.body.preheader:
+; CHECK: sub i32 %data_len, %sample
+; CHECK: br label %for.body
+
+; Second, check that we turn the IV test into an eq.
+; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; CHECK: %exitcond = icmp ne i32 %lftr.wideiv, %0
+; CHECK: br i1 %exitcond, label %for.body, label %for.end.loopexit
+}
+
diff --git a/test/Transforms/Inline/2007-04-15-InlineEH.ll b/test/Transforms/Inline/2007-04-15-InlineEH.ll
index 8fbcf92..b114537 100644
--- a/test/Transforms/Inline/2007-04-15-InlineEH.ll
+++ b/test/Transforms/Inline/2007-04-15-InlineEH.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -inline -S | not grep {invoke void asm}
+; RUN: opt < %s -inline -S | not grep "invoke void asm"
; PR1335
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/Transforms/Inline/casts.ll b/test/Transforms/Inline/casts.ll
index 166185a..a7b051b 100644
--- a/test/Transforms/Inline/casts.ll
+++ b/test/Transforms/Inline/casts.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -inline -S | grep {ret i32 1}
+; RUN: opt < %s -inline -S | grep "ret i32 1"
; ModuleID = 'short.opt.bc'
define i32 @testBool(i1 %X) {
diff --git a/test/Transforms/Inline/delete-call.ll b/test/Transforms/Inline/delete-call.ll
index 3505608..7716d6a 100644
--- a/test/Transforms/Inline/delete-call.ll
+++ b/test/Transforms/Inline/delete-call.ll
@@ -1,5 +1,5 @@
-; RUN: opt %s -S -inline -functionattrs -stats |& grep {Number of call sites deleted, not inlined}
-; RUN: opt %s -S -inline -stats |& grep {Number of functions inlined}
+; RUN: opt %s -S -inline -functionattrs -stats 2>&1 | grep "Number of call sites deleted, not inlined"
+; RUN: opt %s -S -inline -stats 2>&1 | grep "Number of functions inlined"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin9.8"
diff --git a/test/Transforms/Inline/externally_available.ll b/test/Transforms/Inline/externally_available.ll
index 08b5638..07274e7 100644
--- a/test/Transforms/Inline/externally_available.ll
+++ b/test/Transforms/Inline/externally_available.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -inline -constprop -S > %t
; RUN: not grep test_function %t
-; RUN: grep {ret i32 5} %t
+; RUN: grep "ret i32 5" %t
; test_function should not be emitted to the .s file.
diff --git a/test/Transforms/Inline/inline-byval-bonus.ll b/test/Transforms/Inline/inline-byval-bonus.ll
new file mode 100644
index 0000000..f3ed819
--- /dev/null
+++ b/test/Transforms/Inline/inline-byval-bonus.ll
@@ -0,0 +1,193 @@
+; RUN: opt -S -inline -inline-threshold=275 < %s | FileCheck %s
+; PR13095
+
+; The performance of the c-ray benchmark largely depends on the inlining of a
+; specific call to @ray_sphere. This test case is designed to verify that it's
+; inlined at -O3.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.sphere = type { %struct.vec3, double, %struct.material, %struct.sphere* }
+%struct.vec3 = type { double, double, double }
+%struct.material = type { %struct.vec3, double, double }
+%struct.ray = type { %struct.vec3, %struct.vec3 }
+%struct.spoint = type { %struct.vec3, %struct.vec3, %struct.vec3, double }
+
+define i32 @caller(%struct.sphere* %i) {
+ %shadow_ray = alloca %struct.ray, align 8
+ call void @fix(%struct.ray* %shadow_ray)
+
+ %call = call i32 @ray_sphere(%struct.sphere* %i, %struct.ray* byval align 8 %shadow_ray, %struct.spoint* null)
+ ret i32 %call
+
+; CHECK: @caller
+; CHECK-NOT: call i32 @ray_sphere
+; CHECK: ret i32
+}
+
+declare void @fix(%struct.ray*)
+
+define i32 @ray_sphere(%struct.sphere* nocapture %sph, %struct.ray* nocapture byval align 8 %ray, %struct.spoint* %sp) nounwind uwtable ssp {
+ %1 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 1, i32 0
+ %2 = load double* %1, align 8
+ %3 = fmul double %2, %2
+ %4 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 1, i32 1
+ %5 = load double* %4, align 8
+ %6 = fmul double %5, %5
+ %7 = fadd double %3, %6
+ %8 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 1, i32 2
+ %9 = load double* %8, align 8
+ %10 = fmul double %9, %9
+ %11 = fadd double %7, %10
+ %12 = fmul double %2, 2.000000e+00
+ %13 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 0, i32 0
+ %14 = load double* %13, align 8
+ %15 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 0, i32 0
+ %16 = load double* %15, align 8
+ %17 = fsub double %14, %16
+ %18 = fmul double %12, %17
+ %19 = fmul double %5, 2.000000e+00
+ %20 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 0, i32 1
+ %21 = load double* %20, align 8
+ %22 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 0, i32 1
+ %23 = load double* %22, align 8
+ %24 = fsub double %21, %23
+ %25 = fmul double %19, %24
+ %26 = fadd double %18, %25
+ %27 = fmul double %9, 2.000000e+00
+ %28 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 0, i32 2
+ %29 = load double* %28, align 8
+ %30 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 0, i32 2
+ %31 = load double* %30, align 8
+ %32 = fsub double %29, %31
+ %33 = fmul double %27, %32
+ %34 = fadd double %26, %33
+ %35 = fmul double %16, %16
+ %36 = fmul double %23, %23
+ %37 = fadd double %35, %36
+ %38 = fmul double %31, %31
+ %39 = fadd double %37, %38
+ %40 = fmul double %14, %14
+ %41 = fadd double %40, %39
+ %42 = fmul double %21, %21
+ %43 = fadd double %42, %41
+ %44 = fmul double %29, %29
+ %45 = fadd double %44, %43
+ %46 = fsub double -0.000000e+00, %16
+ %47 = fmul double %14, %46
+ %48 = fmul double %21, %23
+ %49 = fsub double %47, %48
+ %50 = fmul double %29, %31
+ %51 = fsub double %49, %50
+ %52 = fmul double %51, 2.000000e+00
+ %53 = fadd double %52, %45
+ %54 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 1
+ %55 = load double* %54, align 8
+ %56 = fmul double %55, %55
+ %57 = fsub double %53, %56
+ %58 = fmul double %34, %34
+ %59 = fmul double %11, 4.000000e+00
+ %60 = fmul double %59, %57
+ %61 = fsub double %58, %60
+ %62 = fcmp olt double %61, 0.000000e+00
+ br i1 %62, label %130, label %63
+
+; <label>:63 ; preds = %0
+ %64 = tail call double @sqrt(double %61) nounwind readnone
+ %65 = fsub double -0.000000e+00, %34
+ %66 = fsub double %64, %34
+ %67 = fmul double %11, 2.000000e+00
+ %68 = fdiv double %66, %67
+ %69 = fsub double %65, %64
+ %70 = fdiv double %69, %67
+ %71 = fcmp olt double %68, 1.000000e-06
+ %72 = fcmp olt double %70, 1.000000e-06
+ %or.cond = and i1 %71, %72
+ br i1 %or.cond, label %130, label %73
+
+; <label>:73 ; preds = %63
+ %74 = fcmp ogt double %68, 1.000000e+00
+ %75 = fcmp ogt double %70, 1.000000e+00
+ %or.cond1 = and i1 %74, %75
+ br i1 %or.cond1, label %130, label %76
+
+; <label>:76 ; preds = %73
+ %77 = icmp eq %struct.spoint* %sp, null
+ br i1 %77, label %130, label %78
+
+; <label>:78 ; preds = %76
+ %t1.0 = select i1 %71, double %70, double %68
+ %t2.0 = select i1 %72, double %t1.0, double %70
+ %79 = fcmp olt double %t1.0, %t2.0
+ %80 = select i1 %79, double %t1.0, double %t2.0
+ %81 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 3
+ store double %80, double* %81, align 8
+ %82 = fmul double %80, %2
+ %83 = fadd double %14, %82
+ %84 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 0, i32 0
+ store double %83, double* %84, align 8
+ %85 = fmul double %5, %80
+ %86 = fadd double %21, %85
+ %87 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 0, i32 1
+ store double %86, double* %87, align 8
+ %88 = fmul double %9, %80
+ %89 = fadd double %29, %88
+ %90 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 0, i32 2
+ store double %89, double* %90, align 8
+ %91 = load double* %15, align 8
+ %92 = fsub double %83, %91
+ %93 = load double* %54, align 8
+ %94 = fdiv double %92, %93
+ %95 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 1, i32 0
+ store double %94, double* %95, align 8
+ %96 = load double* %22, align 8
+ %97 = fsub double %86, %96
+ %98 = load double* %54, align 8
+ %99 = fdiv double %97, %98
+ %100 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 1, i32 1
+ store double %99, double* %100, align 8
+ %101 = load double* %30, align 8
+ %102 = fsub double %89, %101
+ %103 = load double* %54, align 8
+ %104 = fdiv double %102, %103
+ %105 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 1, i32 2
+ store double %104, double* %105, align 8
+ %106 = fmul double %2, %94
+ %107 = fmul double %5, %99
+ %108 = fadd double %106, %107
+ %109 = fmul double %9, %104
+ %110 = fadd double %108, %109
+ %111 = fmul double %110, 2.000000e+00
+ %112 = fmul double %94, %111
+ %113 = fsub double %112, %2
+ %114 = fsub double -0.000000e+00, %113
+ %115 = fmul double %99, %111
+ %116 = fsub double %115, %5
+ %117 = fsub double -0.000000e+00, %116
+ %118 = fmul double %104, %111
+ %119 = fsub double %118, %9
+ %120 = fsub double -0.000000e+00, %119
+ %.06 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 2, i32 0
+ %.18 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 2, i32 1
+ %.210 = getelementptr inbounds %struct.spoint* %sp, i64 0, i32 2, i32 2
+ %121 = fmul double %113, %113
+ %122 = fmul double %116, %116
+ %123 = fadd double %121, %122
+ %124 = fmul double %119, %119
+ %125 = fadd double %123, %124
+ %126 = tail call double @sqrt(double %125) nounwind readnone
+ %127 = fdiv double %114, %126
+ store double %127, double* %.06, align 8
+ %128 = fdiv double %117, %126
+ store double %128, double* %.18, align 8
+ %129 = fdiv double %120, %126
+ store double %129, double* %.210, align 8
+ br label %130
+
+; <label>:130 ; preds = %78, %76, %73, %63, %0
+ %.0 = phi i32 [ 0, %0 ], [ 0, %73 ], [ 0, %63 ], [ 1, %76 ], [ 1, %78 ]
+ ret i32 %.0
+}
+
+declare double @sqrt(double) nounwind readnone
diff --git a/test/Transforms/Inline/inline-invoke-tail.ll b/test/Transforms/Inline/inline-invoke-tail.ll
index 1f34113..e077523 100644
--- a/test/Transforms/Inline/inline-invoke-tail.ll
+++ b/test/Transforms/Inline/inline-invoke-tail.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -inline -S | not grep {tail call void @llvm.memcpy.i32}
+; RUN: opt < %s -inline -S | not grep "tail call void @llvm.memcpy.i32"
; PR3550
define internal void @foo(i32* %p, i32* %q) {
diff --git a/test/Transforms/Inline/inline-optsize.ll b/test/Transforms/Inline/inline-optsize.ll
new file mode 100644
index 0000000..20d7426
--- /dev/null
+++ b/test/Transforms/Inline/inline-optsize.ll
@@ -0,0 +1,33 @@
+; RUN: opt -S -Oz %s | FileCheck %s -check-prefix=OZ
+; RUN: opt -S -O2 %s | FileCheck %s -check-prefix=O2
+
+; The inline threshold for a function with the optsize attribute is currently
+; the same as the global inline threshold for -Os. Check that the optsize
+; function attribute doesn't alter the function-specific inline threshold if the
+; global inline threshold is lower (as for -Oz).
+
+@a = global i32 4
+
+; This function should be larger than the inline threshold for -Oz (25), but
+; smaller than the inline threshold for optsize (75).
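+; The loads are volatile, so the optimizer cannot fold them away and shrink
+; the function body below the -Oz threshold.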
+define i32 @inner() {
+ %a1 = load volatile i32* @a
+ %x1 = add i32 %a1, %a1
+ %a2 = load volatile i32* @a
+ %x2 = add i32 %x1, %a2
+ %a3 = load volatile i32* @a
+ %x3 = add i32 %x2, %a3
+ %a4 = load volatile i32* @a
+ %x4 = add i32 %x3, %a4
+ %a5 = load volatile i32* @a
+  %x5 = add i32 %x4, %a5
+ ret i32 %x5
+}
+
+; @inner() should be inlined for -O2 but not for -Oz.
+; OZ: call
+; O2-NOT: call
+define i32 @outer() optsize {
+ %r = call i32 @inner()
+ ret i32 %r
+}
diff --git a/test/Transforms/Inline/inline_constprop.ll b/test/Transforms/Inline/inline_constprop.ll
index dc35b60..0b48a72 100644
--- a/test/Transforms/Inline/inline_constprop.ll
+++ b/test/Transforms/Inline/inline_constprop.ll
@@ -110,3 +110,65 @@ bb.merge:
bb.false:
ret i32 %sub
}
+
+
+define i32 @PR13412.main() {
+; This is a somewhat complicated three-layer subprogram that was reported to
+; compute the wrong value for a branch, due to assuming mid-inline that one
+; pointer argument couldn't be equal to another.
+;
+; After inlining, the branch should point directly to the exit block, not to
+; the intermediate block.
+; CHECK: @PR13412.main
+; CHECK: br i1 true, label %[[TRUE_DEST:.*]], label %[[FALSE_DEST:.*]]
+; CHECK: [[FALSE_DEST]]:
+; CHECK-NEXT: call void @PR13412.fail()
+; CHECK: [[TRUE_DEST]]:
+; CHECK-NEXT: ret i32 0
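+;
+; Both pointer arguments alias the same alloca here, so @PR13412.second
+; returns %b, @PR13412.first computes true, and the branch folds to the
+; exit block.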
+
+entry:
+ %i1 = alloca i64
+ store i64 0, i64* %i1
+ %arraydecay = bitcast i64* %i1 to i32*
+ %call = call i1 @PR13412.first(i32* %arraydecay, i32* %arraydecay)
+ br i1 %call, label %cond.end, label %cond.false
+
+cond.false:
+ call void @PR13412.fail()
+ br label %cond.end
+
+cond.end:
+ ret i32 0
+}
+
+define internal i1 @PR13412.first(i32* %a, i32* %b) {
+entry:
+ %call = call i32* @PR13412.second(i32* %a, i32* %b)
+ %cmp = icmp eq i32* %call, %b
+ ret i1 %cmp
+}
+
+declare void @PR13412.fail()
+
+define internal i32* @PR13412.second(i32* %a, i32* %b) {
+entry:
+ %sub.ptr.lhs.cast = ptrtoint i32* %b to i64
+ %sub.ptr.rhs.cast = ptrtoint i32* %a to i64
+ %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+ %sub.ptr.div = ashr exact i64 %sub.ptr.sub, 2
+ %cmp = icmp ugt i64 %sub.ptr.div, 1
+ br i1 %cmp, label %if.then, label %if.end3
+
+if.then:
+ %0 = load i32* %a
+ %1 = load i32* %b
+ %cmp1 = icmp eq i32 %0, %1
+ br i1 %cmp1, label %return, label %if.end3
+
+if.end3:
+ br label %return
+
+return:
+ %retval.0 = phi i32* [ %b, %if.end3 ], [ %a, %if.then ]
+ ret i32* %retval.0
+}
diff --git a/test/Transforms/Inline/inline_prune.ll b/test/Transforms/Inline/inline_prune.ll
index 658a422..4c1574d 100644
--- a/test/Transforms/Inline/inline_prune.ll
+++ b/test/Transforms/Inline/inline_prune.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -inline -S | \
-; RUN: not grep {callee\[12\](}
+; RUN: not grep "callee[12]("
; RUN: opt < %s -inline -S | not grep mul
define internal i32 @callee1(i32 %A, i32 %B) {
diff --git a/test/Transforms/Inline/invoke_test-1.ll b/test/Transforms/Inline/invoke_test-1.ll
index e0e6d60..922351f 100644
--- a/test/Transforms/Inline/invoke_test-1.ll
+++ b/test/Transforms/Inline/invoke_test-1.ll
@@ -2,7 +2,7 @@
; instructions
; RUN: opt < %s -inline -S | \
-; RUN: not grep {call\[^e\]}
+; RUN: not grep "call[^e]"
declare void @might_throw()
diff --git a/test/Transforms/InstCombine/2004-08-10-BoolSetCC.ll b/test/Transforms/InstCombine/2004-08-10-BoolSetCC.ll
index 1154bb4..4233797 100644
--- a/test/Transforms/InstCombine/2004-08-10-BoolSetCC.ll
+++ b/test/Transforms/InstCombine/2004-08-10-BoolSetCC.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {ret i1 false}
+; RUN: grep "ret i1 false"
define i1 @test(i1 %V) {
%Y = icmp ult i1 %V, false ; <i1> [#uses=1]
diff --git a/test/Transforms/InstCombine/2004-09-20-BadLoadCombine.ll b/test/Transforms/InstCombine/2004-09-20-BadLoadCombine.ll
index 8169d21..d17db8d 100644
--- a/test/Transforms/InstCombine/2004-09-20-BadLoadCombine.ll
+++ b/test/Transforms/InstCombine/2004-09-20-BadLoadCombine.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -mem2reg -S | \
-; RUN: not grep {i32 1}
+; RUN: not grep "i32 1"
; When propagating the load through the select, make sure that the load is
; inserted where the original load was, not where the select is. Not doing
diff --git a/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll b/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll
index e646edf..0d5fc81 100644
--- a/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll
+++ b/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -mem2reg -simplifycfg | \
-; RUN: llvm-dis | grep -v store | not grep {i32 1}
+; RUN: llvm-dis | grep -v store | not grep "i32 1"
; Test to make sure that instcombine does not accidentally propagate the load
; into the PHI, which would break the program.
diff --git a/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll b/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll
index 38553d7..02bc043 100644
--- a/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll
+++ b/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: not grep {ret i1 false}
+; RUN: not grep "ret i1 false"
define i1 @test(i64 %tmp.169) {
%tmp.1710 = lshr i64 %tmp.169, 1 ; <i64> [#uses=1]
diff --git a/test/Transforms/InstCombine/2005-06-16-SetCCOrSetCCMiscompile.ll b/test/Transforms/InstCombine/2005-06-16-SetCCOrSetCCMiscompile.ll
index 3d887dd..0a513c6 100644
--- a/test/Transforms/InstCombine/2005-06-16-SetCCOrSetCCMiscompile.ll
+++ b/test/Transforms/InstCombine/2005-06-16-SetCCOrSetCCMiscompile.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {ret i1 true}
+; RUN: grep "ret i1 true"
; PR586
@g_07918478 = external global i32 ; <i32*> [#uses=1]
diff --git a/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll b/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll
index 5a74bd2..295006c 100644
--- a/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll
+++ b/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {icmp sgt}
+; RUN: grep "icmp sgt"
; END.
target datalayout = "e-p:32:32"
target triple = "i686-pc-linux-gnu"
diff --git a/test/Transforms/InstCombine/2006-12-15-Range-Test.ll b/test/Transforms/InstCombine/2006-12-15-Range-Test.ll
index c3700a0..0c8eece 100644
--- a/test/Transforms/InstCombine/2006-12-15-Range-Test.ll
+++ b/test/Transforms/InstCombine/2006-12-15-Range-Test.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -instcombine -S | \
; RUN: grep icmp | count 1
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {icmp ugt} | count 1
+; RUN: grep "icmp ugt" | count 1
; END.
target datalayout = "e-p:32:32"
diff --git a/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll b/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll
index e5238a5..635a09c 100644
--- a/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll
+++ b/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {icmp ugt}
+; RUN: opt < %s -instcombine -S | grep "icmp ugt"
; PR1107
; PR1940
diff --git a/test/Transforms/InstCombine/2007-01-14-FcmpSelf.ll b/test/Transforms/InstCombine/2007-01-14-FcmpSelf.ll
index d2d215f..4fcfd26 100644
--- a/test/Transforms/InstCombine/2007-01-14-FcmpSelf.ll
+++ b/test/Transforms/InstCombine/2007-01-14-FcmpSelf.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {fcmp uno.*0.0}
+; RUN: opt < %s -instcombine -S | grep "fcmp uno.*0.0"
; PR1111
define i1 @test(double %X) {
%tmp = fcmp une double %X, %X
diff --git a/test/Transforms/InstCombine/2007-01-27-AndICmp.ll b/test/Transforms/InstCombine/2007-01-27-AndICmp.ll
index bd15dce..4d1b982 100644
--- a/test/Transforms/InstCombine/2007-01-27-AndICmp.ll
+++ b/test/Transforms/InstCombine/2007-01-27-AndICmp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ugt.*, 1}
+; RUN: opt < %s -instcombine -S | grep "ugt.*, 1"
define i1 @test(i32 %tmp1030) {
%tmp1037 = icmp ne i32 %tmp1030, 40 ; <i1> [#uses=1]
diff --git a/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll b/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll
index 05891a2..e2bebec 100644
--- a/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll
+++ b/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll
@@ -1,6 +1,6 @@
-; RUN: opt < %s -instcombine -mem2reg -S | grep {%A = alloca}
+; RUN: opt < %s -instcombine -mem2reg -S | grep "%A = alloca"
; RUN: opt < %s -instcombine -mem2reg -S | \
-; RUN: not grep {%B = alloca}
+; RUN: not grep "%B = alloca"
; END.
; Ensure that instcombine doesn't sink the loads in entry/cond_true into
diff --git a/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll b/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll
index 109e4a2..826d68a 100644
--- a/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll
+++ b/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {icmp sle}
+; RUN: opt < %s -instcombine -S | grep "icmp sle"
; PR1244
define i1 @test(i32 %c.3.i, i32 %d.292.2.i) {
diff --git a/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll b/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll
index ca93af3..719da70 100644
--- a/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll
+++ b/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll
@@ -1,5 +1,5 @@
; For PR1248
-; RUN: opt < %s -instcombine -S | grep {ugt i32 .*, 11}
+; RUN: opt < %s -instcombine -S | grep "ugt i32 .*, 11"
define i1 @test(i32 %tmp6) {
%tmp7 = sdiv i32 %tmp6, 12 ; <i32> [#uses=1]
icmp ne i32 %tmp7, -6 ; <i1>:1 [#uses=1]
diff --git a/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll b/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll
index c794004..7e9c9e2 100644
--- a/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll
+++ b/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll
@@ -1,6 +1,6 @@
; PR1271
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {icmp eq i32 .tmp.*, 2146435072}
+; RUN: grep "icmp eq i32 .tmp.*, 2146435072"
%struct..0anon = type { i32, i32 }
%struct..1anon = type { double }
diff --git a/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll b/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll
index 807efcf..c4070a1 100644
--- a/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll
+++ b/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll
@@ -1,6 +1,6 @@
; PR1271
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {ashr exact i32 %.mp137, 2}
+; RUN: grep "ashr exact i32 %.mp137, 2"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "i686-pc-linux-gnu"
diff --git a/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll b/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll
index 15988b6..eb0c364 100644
--- a/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll
+++ b/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {call.*sret}
+; RUN: opt < %s -instcombine -S | grep "call.*sret"
; Make sure instcombine doesn't drop the sret attribute.
define void @blah(i16* %tmp10) {
diff --git a/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll b/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll
index 62b9351..082b215 100644
--- a/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll
+++ b/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ashr}
+; RUN: opt < %s -instcombine -S | grep "ashr"
; PR1499
define void @av_cmp_q_cond_true(i32* %retval, i32* %tmp9, i64* %tmp10) {
diff --git a/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll b/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll
index af539c1..b2b04d6 100644
--- a/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll
+++ b/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i1 true}
+; RUN: opt < %s -instcombine -S | grep "ret i1 true"
; rdar://5278853
define i1 @test(i32 %tmp468) {
diff --git a/test/Transforms/InstCombine/2007-10-28-stacksave.ll b/test/Transforms/InstCombine/2007-10-28-stacksave.ll
index 4c5c367..95a445c 100644
--- a/test/Transforms/InstCombine/2007-10-28-stacksave.ll
+++ b/test/Transforms/InstCombine/2007-10-28-stacksave.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {call.*stacksave}
+; RUN: opt < %s -instcombine -S | grep "call.*stacksave"
; PR1745
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
diff --git a/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll b/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll
index 5282739..6b83dd9 100644
--- a/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll
+++ b/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {icmp eq i32 %In, 1}
+; RUN: opt < %s -instcombine -S | grep "icmp eq i32 %In, 1"
; PR1800
define i1 @test(i32 %In) {
diff --git a/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll b/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll
index 6420537b..89f8672 100644
--- a/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll
+++ b/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll
@@ -1,6 +1,6 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i686-pc-linux-gnu"
-; RUN: opt < %s -instcombine -S | not grep {ret i1 0}
+; RUN: opt < %s -instcombine -S | not grep "ret i1 0"
; PR1850
define i1 @test() {
diff --git a/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll b/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll
index cc89f6d..3745e87 100644
--- a/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll
+++ b/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {add} | count 1
+; RUN: opt < %s -instcombine -S | grep "add" | count 1
define i32 @foo(i32 %a) {
entry:
diff --git a/test/Transforms/InstCombine/2008-01-29-AddICmp.ll b/test/Transforms/InstCombine/2008-01-29-AddICmp.ll
index 28a94ce..1f9c47c 100644
--- a/test/Transforms/InstCombine/2008-01-29-AddICmp.ll
+++ b/test/Transforms/InstCombine/2008-01-29-AddICmp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | not grep {a.off}
+; RUN: opt < %s -instcombine -S | not grep "a.off"
; PR1949
define i1 @test1(i32 %a) {
diff --git a/test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll b/test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll
index af61c15..917d3d9 100644
--- a/test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll
+++ b/test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i.* 0} | count 2
+; RUN: opt < %s -instcombine -S | grep "ret i.* 0" | count 2
; PR2048
define i32 @i(i32 %a) {
diff --git a/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll b/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll
index d26dec1..854f8cb 100644
--- a/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll
+++ b/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {sdiv i8 \%a, 9}
+; RUN: opt < %s -instcombine -S | grep "sdiv i8 \%a, 9"
; PR2048
define i8 @i(i8 %a) {
diff --git a/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll b/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll
index da7e49e..0fa4d71 100644
--- a/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll
+++ b/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {16} | count 1
+; RUN: opt < %s -instcombine -S | grep "16" | count 1
define i8* @bork(i8** %qux) {
%tmp275 = load i8** %qux, align 1
diff --git a/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll b/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll
index de08c32..dba6cdb 100644
--- a/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll
+++ b/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {store volatile}
+; RUN: opt < %s -instcombine -S | grep "store volatile"
define void @test() {
%votf = alloca <4 x float> ; <<4 x float>*> [#uses=1]
diff --git a/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll b/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll
index 1286e3d..fd0217e 100644
--- a/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll
+++ b/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {load volatile} | count 2
+; RUN: opt < %s -instcombine -S | grep "load volatile" | count 2
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@g_1 = internal global i32 0 ; <i32*> [#uses=3]
diff --git a/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll b/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll
index ebbd3a7..8022414 100644
--- a/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll
+++ b/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {load volatile} | count 2
+; RUN: opt < %s -instcombine -S | grep "load volatile" | count 2
; PR2262
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll b/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll
index bbd0042..7a1c844 100644
--- a/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll
+++ b/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {store i8} | count 3
+; RUN: opt < %s -instcombine -S | grep "store i8" | count 3
; PR2297
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll b/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll
index b34fc1e..a0e95a9 100644
--- a/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll
+++ b/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i1 false} | count 2
+; RUN: opt < %s -instcombine -S | grep "ret i1 false" | count 2
; PR2329
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/test/Transforms/InstCombine/2008-05-23-CompareFold.ll b/test/Transforms/InstCombine/2008-05-23-CompareFold.ll
index 2de5af7..acb259b 100644
--- a/test/Transforms/InstCombine/2008-05-23-CompareFold.ll
+++ b/test/Transforms/InstCombine/2008-05-23-CompareFold.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i1 false}
+; RUN: opt < %s -instcombine -S | grep "ret i1 false"
; PR2359
define i1 @f(i8* %x) {
entry:
diff --git a/test/Transforms/InstCombine/2008-05-31-AddBool.ll b/test/Transforms/InstCombine/2008-05-31-AddBool.ll
index 5416693..ed20690 100644
--- a/test/Transforms/InstCombine/2008-05-31-AddBool.ll
+++ b/test/Transforms/InstCombine/2008-05-31-AddBool.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {xor}
+; RUN: opt < %s -instcombine -S | grep "xor"
; PR2389
define i1 @test(i1 %a, i1 %b) {
diff --git a/test/Transforms/InstCombine/2008-05-31-Bools.ll b/test/Transforms/InstCombine/2008-05-31-Bools.ll
index a0fe47a..7c33f2d 100644
--- a/test/Transforms/InstCombine/2008-05-31-Bools.ll
+++ b/test/Transforms/InstCombine/2008-05-31-Bools.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -instcombine -S > %t
-; RUN: grep {xor} %t
-; RUN: grep {and} %t
-; RUN: not grep {div} %t
+; RUN: grep "xor" %t
+; RUN: grep "and" %t
+; RUN: not grep "div" %t
define i1 @foo1(i1 %a, i1 %b) {
%A = sub i1 %a, %b
diff --git a/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll b/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll
index 917d3ae..ec94623 100644
--- a/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll
+++ b/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {phi i32} | count 2
+; RUN: opt < %s -instcombine -S | grep "phi i32" | count 2
define void @test() nounwind {
entry:
diff --git a/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll b/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll
index 08959c9..cc46926 100644
--- a/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll
+++ b/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {store i32} | count 2
+; RUN: opt < %s -instcombine -S | grep "store i32" | count 2
@g_139 = global i32 0 ; <i32*> [#uses=2]
diff --git a/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll b/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll
index aed1b14..bf5e96b 100644
--- a/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll
+++ b/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {store i8} | count 2
+; RUN: opt < %s -instcombine -S | grep "store i8" | count 2
define i32 @a(i8* %s) nounwind {
entry:
diff --git a/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll b/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll
index c3371c6..80bd83b 100644
--- a/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll
+++ b/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {icmp eq i32 %In, 15}
+; RUN: opt < %s -instcombine -S | grep "icmp eq i32 %In, 15"
; PR2479
; (See also PR1800.)
diff --git a/test/Transforms/InstCombine/2008-06-24-StackRestore.ll b/test/Transforms/InstCombine/2008-06-24-StackRestore.ll
index 4f4709b..9c4c1b5 100644
--- a/test/Transforms/InstCombine/2008-06-24-StackRestore.ll
+++ b/test/Transforms/InstCombine/2008-06-24-StackRestore.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {call.*llvm.stackrestore}
+; RUN: opt < %s -instcombine -S | grep "call.*llvm.stackrestore"
; PR2488
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-pc-linux-gnu"
diff --git a/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll b/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll
index 8245b4d..cfca72a 100644
--- a/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll
+++ b/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {icmp ne i32 \%a}
+; RUN: opt < %s -instcombine -S | grep "icmp ne i32 \%a"
; PR2330
define i1 @foo(i32 %a) nounwind {
diff --git a/test/Transforms/InstCombine/2008-07-08-SubAnd.ll b/test/Transforms/InstCombine/2008-07-08-SubAnd.ll
index 0091159..a3d44cb 100644
--- a/test/Transforms/InstCombine/2008-07-08-SubAnd.ll
+++ b/test/Transforms/InstCombine/2008-07-08-SubAnd.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep -v {i32 8}
+; RUN: opt < %s -instcombine -S | grep -v "i32 8"
; PR2330
define i32 @a(i32 %a) nounwind {
diff --git a/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll b/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll
index 1ed5323..dcf4bef 100644
--- a/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll
+++ b/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {load volatile} | count 2
+; RUN: opt < %s -instcombine -S | grep "load volatile" | count 2
; PR2496
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/Transforms/InstCombine/2008-07-09-SubAndError.ll b/test/Transforms/InstCombine/2008-07-09-SubAndError.ll
index 47a7590..ed01414 100644
--- a/test/Transforms/InstCombine/2008-07-09-SubAndError.ll
+++ b/test/Transforms/InstCombine/2008-07-09-SubAndError.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | not grep {sub i32 0}
+; RUN: opt < %s -instcombine -S | not grep "sub i32 0"
; PR2330
define i32 @foo(i32 %a) nounwind {
diff --git a/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll b/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll
index e911532..786f0c5 100644
--- a/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll
+++ b/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep {%C = xor i1 %A, true}
-; RUN: opt < %s -instcombine -S | grep {ret i1 false}
+; RUN: opt < %s -instcombine -S | grep "%C = xor i1 %A, true"
+; RUN: opt < %s -instcombine -S | grep "ret i1 false"
; PR2539
define i1 @test1(i1 %A) {
diff --git a/test/Transforms/InstCombine/2008-07-13-DivZero.ll b/test/Transforms/InstCombine/2008-07-13-DivZero.ll
index be1f8c2..18c99542 100644
--- a/test/Transforms/InstCombine/2008-07-13-DivZero.ll
+++ b/test/Transforms/InstCombine/2008-07-13-DivZero.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep {lshr.*3}
-; RUN: opt < %s -instcombine -S | grep {call .*%cond}
+; RUN: opt < %s -instcombine -S | grep "lshr.*3"
+; RUN: opt < %s -instcombine -S | grep "call .*%cond"
; PR2506
; We can simplify the operand of udiv to '8', but not the operand to the
diff --git a/test/Transforms/InstCombine/2008-07-16-sse2_storel_dq.ll b/test/Transforms/InstCombine/2008-07-16-sse2_storel_dq.ll
index 501d8a6..b469887 100644
--- a/test/Transforms/InstCombine/2008-07-16-sse2_storel_dq.ll
+++ b/test/Transforms/InstCombine/2008-07-16-sse2_storel_dq.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | not grep {store }
+; RUN: opt < %s -instcombine -S | not grep "store "
; PR2296
@G = common global double 0.000000e+00, align 16
diff --git a/test/Transforms/InstCombine/2008-09-29-FoldingOr.ll b/test/Transforms/InstCombine/2008-09-29-FoldingOr.ll
index 31ea94a..4d00d49 100644
--- a/test/Transforms/InstCombine/2008-09-29-FoldingOr.ll
+++ b/test/Transforms/InstCombine/2008-09-29-FoldingOr.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {or i1}
+; RUN: opt < %s -instcombine -S | grep "or i1"
; PR2844
define i32 @test(i32 %p_74) {
diff --git a/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll b/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll
index fd36d86..cf29f8d 100644
--- a/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll
+++ b/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i1 false}
+; RUN: opt < %s -instcombine -S | grep "ret i1 false"
; PR2697
define i1 @x(i32 %x) nounwind {
diff --git a/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll b/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll
index aa077e2..679cc5f 100644
--- a/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll
+++ b/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i1 true}
+; RUN: opt < %s -instcombine -S | grep "ret i1 true"
; PR2993
define i1 @foo(i32 %x) {
diff --git a/test/Transforms/InstCombine/2008-11-08-FCmp.ll b/test/Transforms/InstCombine/2008-11-08-FCmp.ll
index c636288..f33a1f5 100644
--- a/test/Transforms/InstCombine/2008-11-08-FCmp.ll
+++ b/test/Transforms/InstCombine/2008-11-08-FCmp.ll
@@ -45,3 +45,12 @@ define i1 @test6(i32 %val) {
ret i1 %2
; CHECK: ret i1 false
}
+
+; Check that folding a >= comparison of a uitofp result against an FP constant
+; correctly distinguishes positive and negative constants. <rdar://problem/12029145>
+define i1 @test7(i32 %val) {
+ %1 = uitofp i32 %val to double
+ %2 = fcmp oge double %1, 3.200000e+00
+ ret i1 %2
+; CHECK: icmp ugt i32 %val, 3
+}
diff --git a/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll b/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll
index e4c7ebc..75bd5e0 100644
--- a/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll
+++ b/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {i8 2, i8 2}
+; RUN: opt < %s -instcombine -S | grep "i8 2, i8 2"
; PR2756
define <2 x i8> @foo(<2 x i8> %x) {
diff --git a/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll b/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll
index a61a94e..50ea2f4 100644
--- a/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll
+++ b/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -instcombine -S > %t
-; RUN: grep {, align 4} %t | count 3
-; RUN: grep {, align 8} %t | count 3
+; RUN: grep ", align 4" %t | count 3
+; RUN: grep ", align 8" %t | count 3
; rdar://6480438
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.6"
diff --git a/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll b/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll
index ce62f35..949fc59 100644
--- a/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll
+++ b/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {store.*addrspace(1)}
+; RUN: opt < %s -instcombine -S | grep "store.*addrspace(1)"
; PR3335
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.6"
diff --git a/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll b/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll
index 1421347..68c51b4 100644
--- a/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll
+++ b/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -simplifycfg -instcombine -S | grep 0x7FF8000000000000 | count 12
-; RUN: opt < %s -simplifycfg -instcombine -S | grep {0\\.0} | count 3
-; RUN: opt < %s -simplifycfg -instcombine -S | grep {3\\.5} | count 1
+; RUN: opt < %s -simplifycfg -instcombine -S | grep "0\.0" | count 3
+; RUN: opt < %s -simplifycfg -instcombine -S | grep "3\.5" | count 1
;
; ModuleID = 'apf.c'
diff --git a/test/Transforms/InstCombine/2009-01-31-Pressure.ll b/test/Transforms/InstCombine/2009-01-31-Pressure.ll
index c3ee9a3..666b02e 100644
--- a/test/Transforms/InstCombine/2009-01-31-Pressure.ll
+++ b/test/Transforms/InstCombine/2009-01-31-Pressure.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {%B = add i8 %b, %x}
+; RUN: opt < %s -instcombine -S | grep "%B = add i8 %b, %x"
; PR2698
declare void @use1(i1)
diff --git a/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll b/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll
index a51c47d..9146a8e 100644
--- a/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll
+++ b/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -scalarrepl -S | not grep { = alloca}
+; RUN: opt < %s -instcombine -scalarrepl -S | not grep " = alloca"
; rdar://6417724
; Instcombine shouldn't do anything to this function that prevents promoting the allocas inside it.
diff --git a/test/Transforms/InstCombine/2009-02-21-LoadCST.ll b/test/Transforms/InstCombine/2009-02-21-LoadCST.ll
index f56fc38..cb8a77c 100644
--- a/test/Transforms/InstCombine/2009-02-21-LoadCST.ll
+++ b/test/Transforms/InstCombine/2009-02-21-LoadCST.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i32 3679669}
+; RUN: opt < %s -instcombine -S | grep "ret i32 3679669"
; PR3595
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll b/test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll
index 0a07bf3..4d47977 100644
--- a/test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll
+++ b/test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ashr i32 %val, 31}
+; RUN: opt < %s -instcombine -S | grep "ashr i32 %val, 31"
; PR3851
define i32 @foo2(i32 %val) nounwind {
diff --git a/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll b/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll
index 244b22a..b79edf6 100644
--- a/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll
+++ b/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {mul i64}
+; RUN: opt < %s -instcombine -S | grep "mul i64"
; rdar://6762288
; Instcombine should not promote the mul to i96 because it is definitely
diff --git a/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll b/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll
index e5355b8..468c1cd 100644
--- a/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll
+++ b/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {store i32 0,}
+; RUN: opt < %s -instcombine -S | grep "store i32 0,"
; PR4366
define void @a() {
diff --git a/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll b/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll
index 441d5f9..eb28994 100644
--- a/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll
+++ b/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll
@@ -5,8 +5,8 @@
define i32 @main(i32 %argc) nounwind ssp {
entry:
%tmp3151 = trunc i32 %argc to i8
-; CHECK: %tmp3162 = shl i8 %tmp3151, 5
-; CHECK: and i8 %tmp3162, 64
+; CHECK: %tmp3163 = shl i8 %tmp3162, 6
+; CHECK: and i8 %tmp3163, 64
; CHECK-NOT: shl
; CHECK-NOT: shr
%tmp3161 = or i8 %tmp3151, -17
@@ -38,8 +38,8 @@ bb:
%tmp10 = lshr i8 %tmp8, 7
%tmp11 = shl i8 %tmp10, 5
-; CHECK: %0 = lshr i8 %tmp8, 2
-; CHECK: %tmp11 = and i8 %0, 32
+; CHECK: %tmp10 = lshr i8 %tmp8, 7
+; CHECK: %tmp11 = shl nuw nsw i8 %tmp10, 5
%tmp12 = xor i8 %tmp11, %tmp9
ret i8 %tmp12
diff --git a/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
index 2f72b73..fedb46d 100644
--- a/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
+++ b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
@@ -2,8 +2,10 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin10.0.0"
+; CHECK: define void @fu1
define void @fu1(i32 %parm) nounwind ssp {
%1 = alloca i32, align 4
+; CHECK: alloca double*
%ptr = alloca double*, align 4
store i32 %parm, i32* %1, align 4
store double* null, double** %ptr, align 4
@@ -16,12 +18,12 @@ define void @fu1(i32 %parm) nounwind ssp {
%6 = mul nsw i32 %5, 8
; With "nsw", the alloca and its bitcast can be fused:
%7 = add nsw i32 %6, 2048
-; CHECK: alloca double*
+; CHECK: alloca double
%8 = alloca i8, i32 %7
%9 = bitcast i8* %8 to double*
+; CHECK-NEXT: store double*
store double* %9, double** %ptr, align 4
br label %10
-
; <label>:10 ; preds = %4, %0
%11 = load double** %ptr, align 4
call void @bar(double* %11)
@@ -31,6 +33,7 @@ define void @fu1(i32 %parm) nounwind ssp {
declare void @bar(double*)
+; CHECK: define void @fu2
define void @fu2(i32 %parm) nounwind ssp {
%1 = alloca i32, align 4
%ptr = alloca double*, align 4
diff --git a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
new file mode 100644
index 0000000..0907c490
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
@@ -0,0 +1,68 @@
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios0"
+
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
+ ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> zeroinitializer
+}
+
+define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @constantMul() nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+ ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+}
+
+define <4 x i32> @constantMulS() nounwind readnone ssp {
+entry:
+ %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+}
+
+define <4 x i32> @constantMulU() nounwind readnone ssp {
+entry:
+ %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+}
+
+define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
+ %b = add <4 x i32> zeroinitializer, %a
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+ %b = add <4 x i32> %x, %a
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: ret <4 x i32> %b
+}
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
diff --git a/test/Transforms/InstCombine/2012-04-24-vselect.ll b/test/Transforms/InstCombine/2012-04-24-vselect.ll
new file mode 100644
index 0000000..8d2de2b
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-04-24-vselect.ll
@@ -0,0 +1,13 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; CHECK: @foo
+; CHECK: <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+define <8 x i32> @foo() nounwind {
+entry:
+ %v1.i = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>,
+ <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>,
+ <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %v1.i
+}
+
diff --git a/test/Transforms/InstCombine/2012-05-27-Negative-Shift-Crash.ll b/test/Transforms/InstCombine/2012-05-27-Negative-Shift-Crash.ll
new file mode 100644
index 0000000..2ec0a32
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-05-27-Negative-Shift-Crash.ll
@@ -0,0 +1,61 @@
+; RUN: opt -inline -instcombine -S < %s
+; PR12967
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+@e = common global i32 0, align 4
+@f = common global i32 0, align 4
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+
+define signext i8 @fn1(i32 %p1) nounwind uwtable readnone ssp {
+entry:
+ %shr = lshr i32 1, %p1
+ %conv = trunc i32 %shr to i8
+ ret i8 %conv
+}
+
+define void @fn4() nounwind uwtable ssp {
+entry:
+ %0 = load i32* @d, align 4, !tbaa !0
+ %cmp = icmp eq i32 %0, 0
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @c, align 4, !tbaa !0
+ tail call void @fn3(i32 %conv) nounwind
+ ret void
+}
+
+define void @fn3(i32 %p1) nounwind uwtable ssp {
+entry:
+ %and = and i32 %p1, 8
+ store i32 %and, i32* @e, align 4, !tbaa !0
+ %sub = add nsw i32 %and, -1
+ store i32 %sub, i32* @f, align 4, !tbaa !0
+ %0 = load i32* @a, align 4, !tbaa !0
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load i32* @b, align 4, !tbaa !0
+ %.lobit = lshr i32 %1, 31
+ %2 = trunc i32 %.lobit to i8
+ %.not = xor i8 %2, 1
+ br label %if.end
+
+if.else: ; preds = %entry
+ %call = tail call signext i8 @fn1(i32 %sub) nounwind
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge.in = phi i8 [ %call, %if.else ], [ %.not, %if.then ]
+ %storemerge = sext i8 %storemerge.in to i32
+ store i32 %storemerge, i32* @b, align 4
+ ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/InstCombine/2012-05-28-select-hang.ll b/test/Transforms/InstCombine/2012-05-28-select-hang.ll
new file mode 100644
index 0000000..c580bac
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-05-28-select-hang.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+@c = common global i8 0, align 1
+@a = common global i8 0, align 1
+@b = common global i8 0, align 1
+
+define void @func() nounwind uwtable ssp {
+entry:
+ %0 = load i8* @c, align 1
+ %conv = zext i8 %0 to i32
+ %or = or i32 %conv, 1
+ %conv1 = trunc i32 %or to i8
+ store i8 %conv1, i8* @a, align 1
+ %conv2 = zext i8 %conv1 to i32
+ %neg = xor i32 %conv2, -1
+ %and = and i32 1, %neg
+ %conv3 = trunc i32 %and to i8
+ store i8 %conv3, i8* @b, align 1
+ %1 = load i8* @a, align 1
+ %conv4 = zext i8 %1 to i32
+ %conv5 = zext i8 %conv3 to i32
+ %tobool = icmp ne i32 %conv4, 0
+ br i1 %tobool, label %land.rhs, label %land.end
+
+land.rhs: ; preds = %entry
+ %tobool8 = icmp ne i32 %conv5, 0
+ br label %land.end
+
+land.end: ; preds = %land.rhs, %entry
+ %2 = phi i1 [ false, %entry ], [ %tobool8, %land.rhs ]
+ %land.ext = zext i1 %2 to i32
+ %mul = mul nsw i32 3, %land.ext
+ %conv9 = trunc i32 %mul to i8
+ store i8 %conv9, i8* @a, align 1
+ ret void
+
+; CHECK: @func
+; CHECK-NOT: select
+}
diff --git a/test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll b/test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll
new file mode 100644
index 0000000..22466a9
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll
@@ -0,0 +1,162 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; <rdar://problem/10889741>
+
+define void @func(double %r, double %g, double %b, double* %outH, double* %outS, double* %outL) nounwind uwtable ssp {
+bb:
+ %tmp = alloca double, align 8
+ %tmp1 = alloca double, align 8
+ %tmp2 = alloca double, align 8
+ store double %r, double* %tmp, align 8
+ store double %g, double* %tmp1, align 8
+ store double %b, double* %tmp2, align 8
+ %tmp3 = fcmp ogt double %r, %g
+ br i1 %tmp3, label %bb4, label %bb8
+
+bb4: ; preds = %bb
+ %tmp5 = fcmp ogt double %r, %b
+ br i1 %tmp5, label %bb6, label %bb7
+
+bb6: ; preds = %bb4
+ br label %bb12
+
+bb7: ; preds = %bb4
+ br label %bb12
+
+bb8: ; preds = %bb
+ %tmp9 = fcmp ogt double %g, %b
+ br i1 %tmp9, label %bb10, label %bb11
+
+bb10: ; preds = %bb8
+ br label %bb12
+
+bb11: ; preds = %bb8
+ br label %bb12
+
+bb12: ; preds = %bb11, %bb10, %bb7, %bb6
+ %max.0 = phi double* [ %tmp, %bb6 ], [ %tmp2, %bb7 ], [ %tmp1, %bb10 ], [ %tmp2, %bb11 ]
+; CHECK: %tmp13 = load double* %tmp, align 8
+; CHECK: %tmp14 = load double* %tmp1, align 8
+; CHECK: %tmp15 = fcmp olt double %tmp13, %tmp14
+ %tmp13 = load double* %tmp, align 8
+ %tmp14 = load double* %tmp1, align 8
+ %tmp15 = fcmp olt double %tmp13, %tmp14
+ br i1 %tmp15, label %bb16, label %bb21
+
+bb16: ; preds = %bb12
+ %tmp17 = load double* %tmp2, align 8
+ %tmp18 = fcmp olt double %tmp13, %tmp17
+ br i1 %tmp18, label %bb19, label %bb20
+
+bb19: ; preds = %bb16
+ br label %bb26
+
+bb20: ; preds = %bb16
+ br label %bb26
+
+bb21: ; preds = %bb12
+ %tmp22 = load double* %tmp2, align 8
+ %tmp23 = fcmp olt double %tmp14, %tmp22
+ br i1 %tmp23, label %bb24, label %bb25
+
+bb24: ; preds = %bb21
+ br label %bb26
+
+bb25: ; preds = %bb21
+ br label %bb26
+
+bb26: ; preds = %bb25, %bb24, %bb20, %bb19
+ %min.0 = phi double* [ %tmp, %bb19 ], [ %tmp2, %bb20 ], [ %tmp1, %bb24 ], [ %tmp2, %bb25 ]
+; CHECK: %tmp27 = load double* %min.0, align 8
+; CHECK: %tmp28 = load double* %max.0
+; CHECK: %tmp29 = fadd double %tmp27, %tmp28
+ %tmp27 = load double* %min.0, align 8
+ %tmp28 = load double* %max.0
+ %tmp29 = fadd double %tmp27, %tmp28
+ %tmp30 = fdiv double %tmp29, 2.000000e+00
+ store double %tmp30, double* %outL
+ %tmp31 = load double* %min.0
+ %tmp32 = load double* %max.0
+ %tmp33 = fcmp oeq double %tmp31, %tmp32
+ br i1 %tmp33, label %bb34, label %bb35
+
+bb34: ; preds = %bb26
+ store double 0.000000e+00, double* %outS
+ store double 0.000000e+00, double* %outH
+ br label %bb81
+
+bb35: ; preds = %bb26
+ %tmp36 = fcmp olt double %tmp30, 5.000000e-01
+ %tmp37 = fsub double %tmp32, %tmp31
+ br i1 %tmp36, label %bb38, label %bb41
+
+bb38: ; preds = %bb35
+ %tmp39 = fadd double %tmp32, %tmp31
+ %tmp40 = fdiv double %tmp37, %tmp39
+ store double %tmp40, double* %outS
+ br label %bb45
+
+bb41: ; preds = %bb35
+ %tmp42 = fsub double 2.000000e+00, %tmp32
+ %tmp43 = fsub double %tmp42, %tmp31
+ %tmp44 = fdiv double %tmp37, %tmp43
+ store double %tmp44, double* %outS
+ br label %bb45
+
+bb45: ; preds = %bb41, %bb38
+ %tmp46 = icmp eq double* %max.0, %tmp
+ br i1 %tmp46, label %bb47, label %bb55
+
+bb47: ; preds = %bb45
+ %tmp48 = load double* %tmp1, align 8
+ %tmp49 = load double* %tmp2, align 8
+ %tmp50 = fsub double %tmp48, %tmp49
+ %tmp51 = load double* %max.0
+ %tmp52 = load double* %min.0
+ %tmp53 = fsub double %tmp51, %tmp52
+ %tmp54 = fdiv double %tmp50, %tmp53
+ store double %tmp54, double* %outH
+ br label %bb75
+
+bb55: ; preds = %bb45
+ %tmp56 = icmp eq double* %max.0, %tmp1
+ br i1 %tmp56, label %bb57, label %bb66
+
+bb57: ; preds = %bb55
+ %tmp58 = load double* %tmp2, align 8
+ %tmp59 = load double* %tmp, align 8
+ %tmp60 = fsub double %tmp58, %tmp59
+ %tmp61 = load double* %max.0
+ %tmp62 = load double* %min.0
+ %tmp63 = fsub double %tmp61, %tmp62
+ %tmp64 = fdiv double %tmp60, %tmp63
+ %tmp65 = fadd double 2.000000e+00, %tmp64
+ store double %tmp65, double* %outH
+ br label %bb75
+
+bb66: ; preds = %bb55
+ %tmp67 = load double* %tmp, align 8
+ %tmp68 = load double* %tmp1, align 8
+ %tmp69 = fsub double %tmp67, %tmp68
+ %tmp70 = load double* %max.0
+ %tmp71 = load double* %min.0
+ %tmp72 = fsub double %tmp70, %tmp71
+ %tmp73 = fdiv double %tmp69, %tmp72
+ %tmp74 = fadd double 4.000000e+00, %tmp73
+ store double %tmp74, double* %outH
+ br label %bb75
+
+bb75: ; preds = %bb66, %bb57, %bb47
+ %tmp76 = load double* %outH
+ %tmp77 = fdiv double %tmp76, 6.000000e+00
+ store double %tmp77, double* %outH
+ %tmp78 = fcmp olt double %tmp77, 0.000000e+00
+ br i1 %tmp78, label %bb79, label %bb81
+
+bb79: ; preds = %bb75
+ %tmp80 = fadd double %tmp77, 1.000000e+00
+ store double %tmp80, double* %outH
+ br label %bb81
+
+bb81: ; preds = %bb79, %bb75, %bb34
+ ret void
+}
diff --git a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
new file mode 100644
index 0000000..73e5a66
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR13442
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+@test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
+
+define i64 @foo() {
+ %ret = load i64* bitcast (i8* getelementptr (i8* bitcast ([4 x i32]* @test to i8*), i64 2) to i64*), align 1
+ ret i64 %ret
+ ; CHECK: ret i64 844424930263040
+}
diff --git a/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll b/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
new file mode 100644
index 0000000..6f3df5b
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: bitcast
+
+@base = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 16
+declare void @foo(i32*)
+
+define void @test() nounwind {
+ call void @foo(i32* getelementptr (i32* bitcast ([16 x i32] addrspace(3)* @base to i32*), i64 2147483647)) nounwind
+ ret void
+}
diff --git a/test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll b/test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll
new file mode 100644
index 0000000..cb527f8
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: bitcast
+
+define void @foo(<16 x i8> %a, <16 x i8> %b, <4 x i32>* %c) {
+ %aa = bitcast <16 x i8> %a to <4 x i32>
+ %bb = bitcast <16 x i8> %b to <4 x i32>
+ %select_v = select <4 x i1> zeroinitializer, <4 x i32> %aa, <4 x i32> %bb
+ store <4 x i32> %select_v, <4 x i32>* %c, align 4
+ ret void
+}
+
diff --git a/test/Transforms/InstCombine/CPP_min_max.ll b/test/Transforms/InstCombine/CPP_min_max.ll
index 531ce2b..b3d081b 100644
--- a/test/Transforms/InstCombine/CPP_min_max.ll
+++ b/test/Transforms/InstCombine/CPP_min_max.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: grep select | not grep {i32\\*}
+; RUN: grep select | not grep 'i32\*'
; This testcase corresponds to PR362, which notices that this horrible code
; is generated by the C++ front-end and LLVM optimizers, which has lots of
diff --git a/test/Transforms/InstCombine/JavaCompare.ll b/test/Transforms/InstCombine/JavaCompare.ll
index 46b6c19..8c1f307 100644
--- a/test/Transforms/InstCombine/JavaCompare.ll
+++ b/test/Transforms/InstCombine/JavaCompare.ll
@@ -1,7 +1,7 @@
; This is the sequence of stuff that the Java front-end expands for a single
; <= comparison. Check to make sure we turn it into a <= (only)
-; RUN: opt < %s -instcombine -S | grep {icmp sle i32 %A, %B}
+; RUN: opt < %s -instcombine -S | grep "icmp sle i32 %A, %B"
define i1 @le(i32 %A, i32 %B) {
%c1 = icmp sgt i32 %A, %B ; <i1> [#uses=1]
diff --git a/test/Transforms/InstCombine/add-shrink.ll b/test/Transforms/InstCombine/add-shrink.ll
index cc57478..3edb392 100644
--- a/test/Transforms/InstCombine/add-shrink.ll
+++ b/test/Transforms/InstCombine/add-shrink.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {add nsw i32}
+; RUN: opt < %s -instcombine -S | grep "add nsw i32"
; RUN: opt < %s -instcombine -S | grep sext | count 1
; Should only have one sext and the add should be i32 instead of i64.
diff --git a/test/Transforms/InstCombine/add-sitofp.ll b/test/Transforms/InstCombine/add-sitofp.ll
index 98a8cb4..40edf71 100644
--- a/test/Transforms/InstCombine/add-sitofp.ll
+++ b/test/Transforms/InstCombine/add-sitofp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {add nsw i32}
+; RUN: opt < %s -instcombine -S | grep "add nsw i32"
define double @x(i32 %a, i32 %b) nounwind {
%m = lshr i32 %a, 24
diff --git a/test/Transforms/InstCombine/addnegneg.ll b/test/Transforms/InstCombine/addnegneg.ll
index a3a09f2..ad8791d 100644
--- a/test/Transforms/InstCombine/addnegneg.ll
+++ b/test/Transforms/InstCombine/addnegneg.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep { sub } | count 1
+; RUN: opt < %s -instcombine -S | grep " sub " | count 1
; PR2047
define i32 @l(i32 %a, i32 %b, i32 %c, i32 %d) {
diff --git a/test/Transforms/InstCombine/adjust-for-sminmax.ll b/test/Transforms/InstCombine/adjust-for-sminmax.ll
index b9b6f70..1fb7193 100644
--- a/test/Transforms/InstCombine/adjust-for-sminmax.ll
+++ b/test/Transforms/InstCombine/adjust-for-sminmax.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {icmp s\[lg\]t i32 %n, 0} | count 16
+; RUN: opt < %s -instcombine -S | grep "icmp s[lg]t i32 %n, 0" | count 16
; Instcombine should recognize that this code can be adjusted
; to fit the canonical smax/smin pattern.
diff --git a/test/Transforms/InstCombine/align-2d-gep.ll b/test/Transforms/InstCombine/align-2d-gep.ll
index eeca5c0..5bca46d 100644
--- a/test/Transforms/InstCombine/align-2d-gep.ll
+++ b/test/Transforms/InstCombine/align-2d-gep.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {align 16} | count 1
+; RUN: opt < %s -instcombine -S | grep "align 16" | count 1
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
; A multi-dimensional array in a nested loop doing vector stores that
diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll
index ef7185c..50e0347 100644
--- a/test/Transforms/InstCombine/alloca.ll
+++ b/test/Transforms/InstCombine/alloca.ll
@@ -5,8 +5,11 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1
declare void @use(...)
-; Zero byte allocas should be deleted.
+@int = global i32 zeroinitializer
+
+; Zero byte allocas should be merged if they can't be deleted.
; CHECK: @test
+; CHECK: alloca
; CHECK-NOT: alloca
define void @test() {
%X = alloca [0 x i32] ; <[0 x i32]*> [#uses=1]
@@ -15,6 +18,9 @@ define void @test() {
call void (...)* @use( i32* %Y )
%Z = alloca { } ; <{ }*> [#uses=1]
call void (...)* @use( { }* %Z )
+ %size = load i32* @int
+ %A = alloca {{}}, i32 %size
+ call void (...)* @use( {{}}* %A )
ret void
}
diff --git a/test/Transforms/InstCombine/and-fcmp.ll b/test/Transforms/InstCombine/and-fcmp.ll
index 91868d1..838c2f7 100644
--- a/test/Transforms/InstCombine/and-fcmp.ll
+++ b/test/Transforms/InstCombine/and-fcmp.ll
@@ -1,5 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep fcmp | count 3
-; RUN: opt < %s -instcombine -S | grep ret | grep 0
+; RUN: opt < %s -instcombine -S | FileCheck %s
define zeroext i8 @t1(float %x, float %y) nounwind {
%a = fcmp ueq float %x, %y
@@ -7,6 +6,11 @@ define zeroext i8 @t1(float %x, float %y) nounwind {
%c = and i1 %a, %b
%retval = zext i1 %c to i8
ret i8 %retval
+; CHECK: t1
+; CHECK: fcmp oeq float %x, %y
+; CHECK-NOT: fcmp ueq float %x, %y
+; CHECK-NOT: fcmp ord float %x, %y
+; CHECK-NOT: and
}
define zeroext i8 @t2(float %x, float %y) nounwind {
@@ -15,6 +19,10 @@ define zeroext i8 @t2(float %x, float %y) nounwind {
%c = and i1 %a, %b
%retval = zext i1 %c to i8
ret i8 %retval
+; CHECK: t2
+; CHECK: fcmp olt float %x, %y
+; CHECK-NOT: fcmp ord float %x, %y
+; CHECK-NOT: and
}
define zeroext i8 @t3(float %x, float %y) nounwind {
@@ -23,6 +31,8 @@ define zeroext i8 @t3(float %x, float %y) nounwind {
%c = and i1 %a, %b
%retval = zext i1 %c to i8
ret i8 %retval
+; CHECK: t3
+; CHECK: ret i8 0
}
define zeroext i8 @t4(float %x, float %y) nounwind {
@@ -31,4 +41,39 @@ define zeroext i8 @t4(float %x, float %y) nounwind {
%c = and i1 %a, %b
%retval = zext i1 %c to i8
ret i8 %retval
+; CHECK: t4
+; CHECK: fcmp one float %y, %x
+; CHECK-NOT: fcmp ord float %x, %y
+; CHECK-NOT: and
+}
+
+define zeroext i8 @t5(float %x, float %y) nounwind {
+ %a = fcmp ord float %x, %y
+ %b = fcmp uno float %x, %y
+ %c = and i1 %a, %b
+ %retval = zext i1 %c to i8
+ ret i8 %retval
+; CHECK: t5
+; CHECK: ret i8 0
+}
+
+define zeroext i8 @t6(float %x, float %y) nounwind {
+ %a = fcmp uno float %x, %y
+ %b = fcmp ord float %x, %y
+ %c = and i1 %a, %b
+ %retval = zext i1 %c to i8
+ ret i8 %retval
+; CHECK: t6
+; CHECK: ret i8 0
+}
+
+define zeroext i8 @t7(float %x, float %y) nounwind {
+ %a = fcmp uno float %x, %y
+ %b = fcmp ult float %x, %y
+ %c = and i1 %a, %b
+ %retval = zext i1 %c to i8
+ ret i8 %retval
+; CHECK: t7
+; CHECK: fcmp uno
+; CHECK-NOT: fcmp ult
}
diff --git a/test/Transforms/InstCombine/and-not-or.ll b/test/Transforms/InstCombine/and-not-or.ll
index 9dce7b4..a42140b 100644
--- a/test/Transforms/InstCombine/and-not-or.ll
+++ b/test/Transforms/InstCombine/and-not-or.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep {and i32 %x, %y} | count 4
-; RUN: opt < %s -instcombine -S | not grep {or}
+; RUN: opt < %s -instcombine -S | grep "and i32 %x, %y" | count 4
+; RUN: opt < %s -instcombine -S | not grep "or"
define i32 @func1(i32 %x, i32 %y) nounwind {
entry:
diff --git a/test/Transforms/InstCombine/and-or-and.ll b/test/Transforms/InstCombine/and-or-and.ll
index 216cd46..34cad82 100644
--- a/test/Transforms/InstCombine/and-or-and.ll
+++ b/test/Transforms/InstCombine/and-or-and.ll
@@ -10,7 +10,7 @@
; Which corresponds to test1.
; RUN: opt < %s -instcombine -S | \
-; RUN: not grep {or }
+; RUN: not grep "or "
define i32 @test1(i32 %X, i32 %Y) {
%A = and i32 %X, 7 ; <i32> [#uses=1]
diff --git a/test/Transforms/InstCombine/and-or-not.ll b/test/Transforms/InstCombine/and-or-not.ll
index bd878b0..cc661d5 100644
--- a/test/Transforms/InstCombine/and-or-not.ll
+++ b/test/Transforms/InstCombine/and-or-not.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -instcombine -S | grep xor | count 4
; RUN: opt < %s -instcombine -S | not grep and
-; RUN: opt < %s -instcombine -S | not grep { or}
+; RUN: opt < %s -instcombine -S | not grep " or"
; PR1510
diff --git a/test/Transforms/InstCombine/and-or.ll b/test/Transforms/InstCombine/and-or.ll
index b4224b3..0ae12a3 100644
--- a/test/Transforms/InstCombine/and-or.ll
+++ b/test/Transforms/InstCombine/and-or.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep {and i32 %a, 1} | count 4
-; RUN: opt < %s -instcombine -S | grep {or i32 %0, %b} | count 4
+; RUN: opt < %s -instcombine -S | grep "and i32 %a, 1" | count 4
+; RUN: opt < %s -instcombine -S | grep "or i32 %0, %b" | count 4
define i32 @func1(i32 %a, i32 %b) nounwind readnone {
diff --git a/test/Transforms/InstCombine/and-xor-or.ll b/test/Transforms/InstCombine/and-xor-or.ll
new file mode 100644
index 0000000..7ff810b
--- /dev/null
+++ b/test/Transforms/InstCombine/and-xor-or.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; rdar://10770603
+; (x & y) + (x ^ y) -> x | y
+define i64 @or(i64 %x, i64 %y) nounwind uwtable readnone ssp {
+ %1 = and i64 %y, %x
+ %2 = xor i64 %y, %x
+ %3 = add i64 %1, %2
+ ret i64 %3
+; CHECK: @or
+; CHECK: or i64
+; CHECK-NEXT: ret
+}
+
+; (x & y) | (x ^ y) -> x | y
+define i64 @or2(i64 %x, i64 %y) nounwind uwtable readnone ssp {
+ %1 = and i64 %y, %x
+ %2 = xor i64 %y, %x
+ %3 = or i64 %1, %2
+ ret i64 %3
+; CHECK: @or2
+; CHECK: or i64
+; CHECK-NEXT: ret
+}
diff --git a/test/Transforms/InstCombine/apint-and-or-and.ll b/test/Transforms/InstCombine/apint-and-or-and.ll
index 17d29b6..43536d7 100644
--- a/test/Transforms/InstCombine/apint-and-or-and.ll
+++ b/test/Transforms/InstCombine/apint-and-or-and.ll
@@ -11,7 +11,7 @@
;
; This tests arbitrary precision integers.
-; RUN: opt < %s -instcombine -S | not grep {or }
+; RUN: opt < %s -instcombine -S | not grep "or "
; END.
define i17 @test1(i17 %X, i17 %Y) {
diff --git a/test/Transforms/InstCombine/apint-and1.ll b/test/Transforms/InstCombine/apint-and1.ll
index cd4cbb9..fcd2dcd 100644
--- a/test/Transforms/InstCombine/apint-and1.ll
+++ b/test/Transforms/InstCombine/apint-and1.ll
@@ -1,7 +1,7 @@
; This test makes sure that and instructions are properly eliminated.
; This test is for Integer BitWidth <= 64 && BitWidth % 8 != 0.
-; RUN: opt < %s -instcombine -S | not grep {and }
+; RUN: opt < %s -instcombine -S | not grep "and "
; END.
define i39 @test0(i39 %A) {
diff --git a/test/Transforms/InstCombine/apint-and2.ll b/test/Transforms/InstCombine/apint-and2.ll
index ae74472..78dc8f9 100644
--- a/test/Transforms/InstCombine/apint-and2.ll
+++ b/test/Transforms/InstCombine/apint-and2.ll
@@ -1,7 +1,7 @@
; This test makes sure that and instructions are properly eliminated.
; This test is for Integer BitWidth > 64 && BitWidth <= 1024.
-; RUN: opt < %s -instcombine -S | not grep {and }
+; RUN: opt < %s -instcombine -S | not grep "and "
; END.
diff --git a/test/Transforms/InstCombine/apint-shift-simplify.ll b/test/Transforms/InstCombine/apint-shift-simplify.ll
index 1a3340a..818ae66 100644
--- a/test/Transforms/InstCombine/apint-shift-simplify.ll
+++ b/test/Transforms/InstCombine/apint-shift-simplify.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: egrep {shl|lshr|ashr} | count 3
+; RUN: egrep "shl|lshr|ashr" | count 3
define i41 @test0(i41 %A, i41 %B, i41 %C) {
%X = shl i41 %A, %C
diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll
index 0ea73a0..73f630e 100644
--- a/test/Transforms/InstCombine/apint-shift.ll
+++ b/test/Transforms/InstCombine/apint-shift.ll
@@ -47,13 +47,21 @@ define i32 @test5a(i32 %A) {
}
; CHECK: @test6
-; CHECK-NOT: sh
+; CHECK: mul i55 %A, 6
define i55 @test6(i55 %A) {
%B = shl i55 %A, 1 ; <i55> [#uses=1]
%C = mul i55 %B, 3 ; <i55> [#uses=1]
ret i55 %C
}
+; CHECK: @test6a
+; CHECK: mul i55 %A, 6
+define i55 @test6a(i55 %A) {
+ %B = mul i55 %A, 3 ; <i55> [#uses=1]
+ %C = shl i55 %B, 1 ; <i55> [#uses=1]
+ ret i55 %C
+}
+
; CHECK: @test7
; CHECK-NOT: sh
define i29 @test7(i8 %X) {
@@ -87,7 +95,8 @@ define i19 @test10(i19 %A) {
}
; CHECK: @test11
-; CHECK-NOT: sh
+; Don't hide the shl from scalar evolution. DAGCombine will get it.
+; CHECK: shl
define i23 @test11(i23 %A) {
%a = mul i23 %A, 3 ; <i23> [#uses=1]
%B = lshr i23 %a, 11 ; <i23> [#uses=1]
@@ -104,7 +113,8 @@ define i47 @test12(i47 %A) {
}
; CHECK: @test13
-; CHECK-NOT: sh
+; Don't hide the shl from scalar evolution. DAGCombine will get it.
+; CHECK: shl
define i18 @test13(i18 %A) {
%a = mul i18 %A, 3 ; <i18> [#uses=1]
%B = ashr i18 %a, 8 ; <i18> [#uses=1]
diff --git a/test/Transforms/InstCombine/apint-sub.ll b/test/Transforms/InstCombine/apint-sub.ll
index 8b9ff14..df8ec52 100644
--- a/test/Transforms/InstCombine/apint-sub.ll
+++ b/test/Transforms/InstCombine/apint-sub.ll
@@ -3,7 +3,7 @@
;
; RUN: opt < %s -instcombine -S | \
-; RUN: grep -v {sub i19 %Cok, %Bok} | grep -v {sub i25 0, %Aok} | not grep sub
+; RUN: grep -v "sub i19 %Cok, %Bok" | grep -v "sub i25 0, %Aok" | not grep sub
; END.
define i23 @test1(i23 %A) {
diff --git a/test/Transforms/InstCombine/apint-xor1.ll b/test/Transforms/InstCombine/apint-xor1.ll
index 849c659..01cbcf1 100644
--- a/test/Transforms/InstCombine/apint-xor1.ll
+++ b/test/Transforms/InstCombine/apint-xor1.ll
@@ -1,7 +1,7 @@
; This test makes sure that xor instructions are properly eliminated.
; This test is for Integer BitWidth <= 64 && BitWidth % 8 != 0.
-; RUN: opt < %s -instcombine -S | not grep {xor }
+; RUN: opt < %s -instcombine -S | not grep "xor "
define i47 @test1(i47 %A, i47 %B) {
diff --git a/test/Transforms/InstCombine/apint-xor2.ll b/test/Transforms/InstCombine/apint-xor2.ll
index cacc179..ab93c92 100644
--- a/test/Transforms/InstCombine/apint-xor2.ll
+++ b/test/Transforms/InstCombine/apint-xor2.ll
@@ -1,7 +1,7 @@
; This test makes sure that xor instructions are properly eliminated.
; This test is for Integer BitWidth > 64 && BitWidth <= 1024.
-; RUN: opt < %s -instcombine -S | not grep {xor }
+; RUN: opt < %s -instcombine -S | not grep "xor "
; END.
diff --git a/test/Transforms/InstCombine/badmalloc.ll b/test/Transforms/InstCombine/badmalloc.ll
index f5a623d..3abe28a 100644
--- a/test/Transforms/InstCombine/badmalloc.ll
+++ b/test/Transforms/InstCombine/badmalloc.ll
@@ -16,5 +16,26 @@ define i1 @test1() {
ret i1 %B
; CHECK: @test1
-; CHECK: ret i1 %B
+; CHECK: ret i1 false
+}
+
+; CHECK: @test2
+define noalias i8* @test2() nounwind {
+entry:
+; CHECK: @malloc
+ %A = call noalias i8* @malloc(i64 4) nounwind
+; CHECK: icmp eq
+ %tobool = icmp eq i8* %A, null
+; CHECK: br i1
+ br i1 %tobool, label %return, label %if.end
+
+if.end:
+; CHECK: store
+ store i8 7, i8* %A
+ br label %return
+
+return:
+; CHECK: phi
+ %retval.0 = phi i8* [ %A, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
}
diff --git a/test/Transforms/InstCombine/bit-checks.ll b/test/Transforms/InstCombine/bit-checks.ll
index 79a096f..62c9ddc 100644
--- a/test/Transforms/InstCombine/bit-checks.ll
+++ b/test/Transforms/InstCombine/bit-checks.ll
@@ -1,7 +1,7 @@
; This test makes sure that these instructions are properly eliminated.
;
; RUN: opt < %s -instcombine -S | \
-; RUN: not grep {tobool}
+; RUN: not grep "tobool"
; END.
define i32 @main(i32 %argc, i8** %argv) nounwind ssp {
entry:
diff --git a/test/Transforms/InstCombine/bitcount.ll b/test/Transforms/InstCombine/bitcount.ll
index a6fd837..318ca73 100644
--- a/test/Transforms/InstCombine/bitcount.ll
+++ b/test/Transforms/InstCombine/bitcount.ll
@@ -1,5 +1,5 @@
; Tests to make sure bit counts of constants are folded
-; RUN: opt < %s -instcombine -S | grep {ret i32 19}
+; RUN: opt < %s -instcombine -S | grep "ret i32 19"
; RUN: opt < %s -instcombine -S | \
; RUN: grep -v declare | not grep llvm.ct
diff --git a/test/Transforms/InstCombine/bittest.ll b/test/Transforms/InstCombine/bittest.ll
index 92863d5..84ee7dd 100644
--- a/test/Transforms/InstCombine/bittest.ll
+++ b/test/Transforms/InstCombine/bittest.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -simplifycfg -S |\
-; RUN: not grep {call void @abort}
+; RUN: not grep "call void @abort"
@b_rec.0 = external global i32 ; <i32*> [#uses=2]
diff --git a/test/Transforms/InstCombine/bswap.ll b/test/Transforms/InstCombine/bswap.ll
index 168b3e8..ba7df31 100644
--- a/test/Transforms/InstCombine/bswap.ll
+++ b/test/Transforms/InstCombine/bswap.ll
@@ -1,7 +1,7 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {call.*llvm.bswap} | count 6
+; RUN: grep "call.*llvm.bswap" | count 6
define i32 @test1(i32 %i) {
%tmp1 = lshr i32 %i, 24 ; <i32> [#uses=1]
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 19d5a0a..56e5ca3 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -457,10 +457,12 @@ define i64 @test50(i64 %A) {
%E = sext i32 %D to i64
ret i64 %E
; CHECK: @test50
-; CHECK-NEXT: shl i64 %A, 30
+; lshr+shl will be handled by DAGCombine.
+; CHECK-NEXT: lshr i64 %A, 2
+; CHECK-NEXT: shl i64 %a, 32
; CHECK-NEXT: add i64 {{.*}}, -4294967296
-; CHECK-NEXT: %sext = ashr i64 {{.*}}, 32
-; CHECK-NEXT: ret i64 %sext
+; CHECK-NEXT: %E = ashr exact i64 {{.*}}, 32
+; CHECK-NEXT: ret i64 %E
}
define i64 @test51(i64 %A, i1 %cond) {
@@ -677,3 +679,18 @@ define i64 @test_mmx_const(<2 x i32> %c) nounwind {
; CHECK: @test_mmx_const
; CHECK-NOT: x86_mmx
}
+
+; PR12514
+define i1 @test67(i1 %a, i32 %b) {
+ %tmp2 = zext i1 %a to i32
+ %conv6 = xor i32 %tmp2, 1
+ %and = and i32 %b, %conv6
+ %sext = shl nuw nsw i32 %and, 24
+ %neg.i = xor i32 %sext, -16777216
+ %conv.i.i = ashr exact i32 %neg.i, 24
+ %trunc = trunc i32 %conv.i.i to i8
+ %tobool.i = icmp eq i8 %trunc, 0
+ ret i1 %tobool.i
+; CHECK: @test67
+; CHECK: ret i1 false
+}
diff --git a/test/Transforms/InstCombine/crash.ll b/test/Transforms/InstCombine/crash.ll
index d5af532..2ef6ac6 100644
--- a/test/Transforms/InstCombine/crash.ll
+++ b/test/Transforms/InstCombine/crash.ll
@@ -132,12 +132,14 @@ define i32 @test5a() {
}
define void @test5() {
- store i1 true, i1* undef
- %1 = invoke i32 @test5a() to label %exit unwind label %exit
+ store i1 true, i1* undef
+ %r = invoke i32 @test5a() to label %exit unwind label %unwind
+unwind:
+ %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
+ cleanup
+ br label %exit
exit:
- %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
- cleanup
- ret void
+ ret void
}
diff --git a/test/Transforms/InstCombine/dce-iterate.ll b/test/Transforms/InstCombine/dce-iterate.ll
index 1d2cc53..1dd4522 100644
--- a/test/Transforms/InstCombine/dce-iterate.ll
+++ b/test/Transforms/InstCombine/dce-iterate.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret double .sy}
+; RUN: opt < %s -instcombine -S | grep "ret double .sy"
define internal double @ScaleObjectAdd(double %sx, double %sy, double %sz) nounwind {
entry:
diff --git a/test/Transforms/InstCombine/deadcode.ll b/test/Transforms/InstCombine/deadcode.ll
index 7c7f1ab..8fe673d 100644
--- a/test/Transforms/InstCombine/deadcode.ll
+++ b/test/Transforms/InstCombine/deadcode.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i32 %A}
+; RUN: opt < %s -instcombine -S | grep "ret i32 %A"
; RUN: opt < %s -die -S | not grep call.*llvm
define i32 @test(i32 %A) {
diff --git a/test/Transforms/InstCombine/div-shift.ll b/test/Transforms/InstCombine/div-shift.ll
new file mode 100644
index 0000000..a07f3ea
--- /dev/null
+++ b/test/Transforms/InstCombine/div-shift.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i32 @t1(i16 zeroext %x, i32 %y) nounwind {
+entry:
+; CHECK: t1
+; CHECK-NOT: sdiv
+; CHECK: lshr i32 %conv
+ %conv = zext i16 %x to i32
+ %s = shl i32 2, %y
+ %d = sdiv i32 %conv, %s
+ ret i32 %d
+}
+
+; rdar://11721329
+define i64 @t2(i64 %x, i32 %y) nounwind {
+; CHECK: t2
+; CHECK-NOT: udiv
+; CHECK: lshr i64 %x
+ %1 = shl i32 1, %y
+ %2 = zext i32 %1 to i64
+ %3 = udiv i64 %x, %2
+ ret i64 %3
+}
diff --git a/test/Transforms/InstCombine/enforce-known-alignment.ll b/test/Transforms/InstCombine/enforce-known-alignment.ll
index 9e9be7f..6645d99 100644
--- a/test/Transforms/InstCombine/enforce-known-alignment.ll
+++ b/test/Transforms/InstCombine/enforce-known-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep alloca | grep {align 16}
+; RUN: opt < %s -instcombine -S | grep alloca | grep "align 16"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.6"
diff --git a/test/Transforms/InstCombine/fp-ret-bitcast.ll b/test/Transforms/InstCombine/fp-ret-bitcast.ll
index 35ece42..b2fbc0b 100644
--- a/test/Transforms/InstCombine/fp-ret-bitcast.ll
+++ b/test/Transforms/InstCombine/fp-ret-bitcast.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {call float bitcast} | count 1
+; RUN: grep "call float bitcast" | count 1
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
%struct.NSObject = type { %struct.objc_class* }
%struct.NSArray = type { %struct.NSObject }
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index a9ae221..eaff87d 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -637,3 +637,25 @@ define i1 @test62(i8* %a) {
; CHECK: @test62
; CHECK-NEXT: ret i1 true
}
+
+define i1 @test63(i8 %a, i32 %b) nounwind {
+ %z = zext i8 %a to i32
+ %t = and i32 %b, 255
+ %c = icmp eq i32 %z, %t
+ ret i1 %c
+; CHECK: @test63
+; CHECK-NEXT: %1 = trunc i32 %b to i8
+; CHECK-NEXT: %c = icmp eq i8 %1, %a
+; CHECK-NEXT: ret i1 %c
+}
+
+define i1 @test64(i8 %a, i32 %b) nounwind {
+ %t = and i32 %b, 255
+ %z = zext i8 %a to i32
+ %c = icmp eq i32 %t, %z
+ ret i1 %c
+; CHECK: @test64
+; CHECK-NEXT: %1 = trunc i32 %b to i8
+; CHECK-NEXT: %c = icmp eq i8 %1, %a
+; CHECK-NEXT: ret i1 %c
+}
diff --git a/test/Transforms/InstCombine/invoke.ll b/test/Transforms/InstCombine/invoke.ll
new file mode 100644
index 0000000..04eaf86
--- /dev/null
+++ b/test/Transforms/InstCombine/invoke.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_call_unexpected(i8*)
+declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readonly
+declare i8* @_Znwm(i64)
+
+
+; CHECK: @f1
+define i64 @f1() nounwind uwtable ssp {
+entry:
+; CHECK: invoke noalias i8* undef()
+ %call = invoke noalias i8* undef()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+; CHECK: ret i64 0
+ %0 = tail call i64 @llvm.objectsize.i64(i8* %call, i1 false)
+ ret i64 %0
+
+lpad:
+ %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %2 = extractvalue { i8*, i32 } %1, 0
+ tail call void @__cxa_call_unexpected(i8* %2) noreturn nounwind
+ unreachable
+}
+
+; CHECK: @f2
+define i64 @f2() nounwind uwtable ssp {
+entry:
+; CHECK: invoke noalias i8* null()
+ %call = invoke noalias i8* null()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+; CHECK: ret i64 0
+ %0 = tail call i64 @llvm.objectsize.i64(i8* %call, i1 false)
+ ret i64 %0
+
+lpad:
+ %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %2 = extractvalue { i8*, i32 } %1, 0
+ tail call void @__cxa_call_unexpected(i8* %2) noreturn nounwind
+ unreachable
+}
+
+; CHECK: @f3
+define void @f3() nounwind uwtable ssp {
+; CHECK: invoke void @llvm.donothing()
+ %call = invoke noalias i8* @_Znwm(i64 13)
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+ ret void
+
+lpad:
+ %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %2 = extractvalue { i8*, i32 } %1, 0
+ tail call void @__cxa_call_unexpected(i8* %2) noreturn nounwind
+ unreachable
+}
diff --git a/test/Transforms/InstCombine/known_align.ll b/test/Transforms/InstCombine/known_align.ll
index 5382abf..0249951 100644
--- a/test/Transforms/InstCombine/known_align.ll
+++ b/test/Transforms/InstCombine/known_align.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {align 1}
+; RUN: opt < %s -instcombine -S | grep "align 1"
; END.
%struct.p = type <{ i8, i32 }>
diff --git a/test/Transforms/InstCombine/loadstore-alignment.ll b/test/Transforms/InstCombine/loadstore-alignment.ll
index 1d932d2..2263cb2 100644
--- a/test/Transforms/InstCombine/loadstore-alignment.ll
+++ b/test/Transforms/InstCombine/loadstore-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {, align 16} | count 14
+; RUN: opt < %s -instcombine -S | grep ", align 16" | count 14
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
@x = external global <2 x i64>, align 16
diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index eae973d..4e3217d 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll
@@ -1,17 +1,17 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
; PR1201
define i32 @main(i32 %argc, i8** %argv) {
+; CHECK: @main
%c_19 = alloca i8*
%malloc_206 = tail call i8* @malloc(i32 mul (i32 ptrtoint (i8* getelementptr (i8* null, i32 1) to i32), i32 10))
store i8* %malloc_206, i8** %c_19
%tmp_207 = load i8** %c_19
tail call void @free(i8* %tmp_207)
ret i32 0
-; CHECK-NOT: malloc
-; CHECK-NOT: free
-; CHECK: ret i32 0
+; CHECK-NEXT: ret i32 0
}
+declare noalias i8* @calloc(i32, i32) nounwind
declare noalias i8* @malloc(i32)
declare void @free(i8*)
@@ -26,13 +26,24 @@ define i1 @foo() {
declare void @llvm.lifetime.start(i64, i8*)
declare void @llvm.lifetime.end(i64, i8*)
+declare i64 @llvm.objectsize.i64(i8*, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1) nounwind
-define void @test3() {
+define void @test3(i8* %src) {
; CHECK: @test3
; CHECK-NEXT: ret void
%a = call noalias i8* @malloc(i32 10)
call void @llvm.lifetime.start(i64 10, i8* %a)
call void @llvm.lifetime.end(i64 10, i8* %a)
+ %size = call i64 @llvm.objectsize.i64(i8* %a, i1 true)
+ store i8 42, i8* %a
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %src, i32 32, i32 1, i1 false)
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %a, i8* %src, i32 32, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i32(i8* %a, i8 5, i32 32, i32 1, i1 false)
+ %alloc2 = call noalias i8* @calloc(i32 5, i32 7) nounwind
+ %z = icmp ne i8* %alloc2, null
ret void
}
@@ -46,3 +57,37 @@ define void @test4() {
call void @free(i8* %C)
ret void
}
+
+; CHECK: @test5
+define void @test5(i8* %ptr, i8** %esc) {
+; CHECK-NEXT: call i8* @malloc
+; CHECK-NEXT: call i8* @malloc
+; CHECK-NEXT: call i8* @malloc
+; CHECK-NEXT: call i8* @malloc
+; CHECK-NEXT: call i8* @malloc
+; CHECK-NEXT: call i8* @malloc
+; CHECK-NEXT: call i8* @malloc
+; CHECK-NEXT: call void @llvm.memcpy
+; CHECK-NEXT: call void @llvm.memmove
+; CHECK-NEXT: store
+; CHECK-NEXT: call void @llvm.memcpy
+; CHECK-NEXT: call void @llvm.memmove
+; CHECK-NEXT: call void @llvm.memset
+; CHECK-NEXT: store volatile
+; CHECK-NEXT: ret
+ %a = call i8* @malloc(i32 700)
+ %b = call i8* @malloc(i32 700)
+ %c = call i8* @malloc(i32 700)
+ %d = call i8* @malloc(i32 700)
+ %e = call i8* @malloc(i32 700)
+ %f = call i8* @malloc(i32 700)
+ %g = call i8* @malloc(i32 700)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %a, i32 32, i32 1, i1 false)
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %ptr, i8* %b, i32 32, i32 1, i1 false)
+ store i8* %c, i8** %esc
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %ptr, i32 32, i32 1, i1 true)
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %e, i8* %ptr, i32 32, i32 1, i1 true)
+ call void @llvm.memset.p0i8.i32(i8* %f, i8 5, i32 32, i32 1, i1 true)
+ store volatile i8 4, i8* %g
+ ret void
+}
diff --git a/test/Transforms/InstCombine/memcpy-to-load.ll b/test/Transforms/InstCombine/memcpy-to-load.ll
index 04aac98..bcc9e18 100644
--- a/test/Transforms/InstCombine/memcpy-to-load.ll
+++ b/test/Transforms/InstCombine/memcpy-to-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {load double}
+; RUN: opt < %s -instcombine -S | grep "load double"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
diff --git a/test/Transforms/InstCombine/memmove.ll b/test/Transforms/InstCombine/memmove.ll
index 4602c12..9d51ea0 100644
--- a/test/Transforms/InstCombine/memmove.ll
+++ b/test/Transforms/InstCombine/memmove.ll
@@ -1,6 +1,6 @@
; This test makes sure that memmove instructions are properly eliminated.
;
-; RUN: opt < %s -instcombine -S | not grep {call void @llvm.memmove}
+; RUN: opt < %s -instcombine -S | not grep "call void @llvm.memmove"
@S = internal constant [33 x i8] c"panic: restorelist inconsistency\00" ; <[33 x i8]*> [#uses=1]
@h = constant [2 x i8] c"h\00" ; <[2 x i8]*> [#uses=1]
diff --git a/test/Transforms/InstCombine/memset.ll b/test/Transforms/InstCombine/memset.ll
index 7f7bc9f..7f02dad 100644
--- a/test/Transforms/InstCombine/memset.ll
+++ b/test/Transforms/InstCombine/memset.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | not grep {call.*llvm.memset}
+; RUN: opt < %s -instcombine -S | not grep "call.*llvm.memset"
define i32 @main() {
%target = alloca [1024 x i8]
diff --git a/test/Transforms/InstCombine/mul.ll b/test/Transforms/InstCombine/mul.ll
index edb5305..6c8e634 100644
--- a/test/Transforms/InstCombine/mul.ll
+++ b/test/Transforms/InstCombine/mul.ll
@@ -138,8 +138,9 @@ define i32 @test16(i32 %b, i1 %c) {
; e = b & (a >> 31)
%e = mul i32 %d, %b ; <i32> [#uses=1]
ret i32 %e
-; CHECK: [[TEST16:%.*]] = sext i1 %c to i32
-; CHECK-NEXT: %e = and i32 [[TEST16]], %b
+; CHECK: [[TEST16:%.*]] = zext i1 %c to i32
+; CHECK-NEXT: %1 = sub i32 0, [[TEST16]]
+; CHECK-NEXT: %e = and i32 %1, %b
; CHECK-NEXT: ret i32 %e
}
diff --git a/test/Transforms/InstCombine/multi-use-or.ll b/test/Transforms/InstCombine/multi-use-or.ll
index 8c6a0e0..8b90e0d 100644
--- a/test/Transforms/InstCombine/multi-use-or.ll
+++ b/test/Transforms/InstCombine/multi-use-or.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {fadd double .sx, .sy}
+; RUN: opt < %s -instcombine -S | grep "fadd double .sx, .sy"
; The 'or' has multiple uses, make sure that this doesn't prevent instcombine
; from propagating the extends to the truncs.
diff --git a/test/Transforms/InstCombine/narrow.ll b/test/Transforms/InstCombine/narrow.ll
index 1b96a06..5dd13a0 100644
--- a/test/Transforms/InstCombine/narrow.ll
+++ b/test/Transforms/InstCombine/narrow.ll
@@ -1,7 +1,7 @@
; This file contains various testcases that check to see that instcombine
; is narrowing computations when possible.
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {ret i1 false}
+; RUN: grep "ret i1 false"
; test1 - Eliminating the casts in this testcase (by narrowing the AND
; operation) allows instcombine to realize the function always returns false.
diff --git a/test/Transforms/InstCombine/objsize-64.ll b/test/Transforms/InstCombine/objsize-64.ll
new file mode 100644
index 0000000..530e123
--- /dev/null
+++ b/test/Transforms/InstCombine/objsize-64.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare noalias i8* @malloc(i32) nounwind
+declare noalias i8* @_Znwm(i64) ; new(unsigned long)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_call_unexpected(i8*)
+declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readonly
+
+; CHECK: @f1
+define i64 @f1(i8 **%esc) {
+ %call = call i8* @malloc(i32 4)
+ store i8* %call, i8** %esc
+ %size = call i64 @llvm.objectsize.i64(i8* %call, i1 false)
+; CHECK: ret i64 4
+ ret i64 %size
+}
+
+
+; CHECK: @f2
+define i64 @f2(i8** %esc) nounwind uwtable ssp {
+entry:
+; CHECK: invoke noalias i8* @_Znwm(i64 13)
+ %call = invoke noalias i8* @_Znwm(i64 13)
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+; CHECK: ret i64 13
+ store i8* %call, i8** %esc
+ %0 = tail call i64 @llvm.objectsize.i64(i8* %call, i1 false)
+ ret i64 %0
+
+lpad:
+ %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %2 = extractvalue { i8*, i32 } %1, 0
+ tail call void @__cxa_call_unexpected(i8* %2) noreturn nounwind
+ unreachable
+}
diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll
index 28ceb68..dbb0ffc 100644
--- a/test/Transforms/InstCombine/objsize.ll
+++ b/test/Transforms/InstCombine/objsize.ll
@@ -42,7 +42,7 @@ define i32 @f() nounwind {
define i1 @baz() nounwind {
; CHECK: @baz
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: objectsize
%1 = tail call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([0 x i8]* @window, i32 0, i32 0), i1 false)
%2 = icmp eq i32 %1, -1
ret i1 %2
@@ -106,7 +106,7 @@ bb12:
%struct.data = type { [100 x i32], [100 x i32], [1024 x i8] }
-define i32 @test4() nounwind ssp {
+define i32 @test4(i8** %esc) nounwind ssp {
; CHECK: @test4
entry:
%0 = alloca %struct.data, align 8
@@ -115,13 +115,14 @@ entry:
; CHECK-NOT: @llvm.objectsize
; CHECK: @llvm.memset.p0i8.i32(i8* %1, i8 0, i32 1824, i32 8, i1 false)
%3 = call i8* @__memset_chk(i8* %1, i32 0, i32 1824, i32 %2) nounwind
+ store i8* %1, i8** %esc
ret i32 0
}
; rdar://7782496
@s = external global i8*
-define void @test5(i32 %n) nounwind ssp {
+define i8* @test5(i32 %n) nounwind ssp {
; CHECK: @test5
entry:
%0 = tail call noalias i8* @malloc(i32 20) nounwind
@@ -130,7 +131,7 @@ entry:
; CHECK-NOT: @llvm.objectsize
; CHECK: @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 10, i32 1, i1 false)
%3 = tail call i8* @__memcpy_chk(i8* %0, i8* %2, i32 10, i32 %1) nounwind
- ret void
+ ret i8* %0
}
define void @test6(i32 %n) nounwind ssp {
@@ -149,12 +150,91 @@ declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
declare noalias i8* @malloc(i32) nounwind
-define i32 @test7() {
+define i32 @test7(i8** %esc) {
; CHECK: @test7
%alloc = call noalias i8* @malloc(i32 48) nounwind
+ store i8* %alloc, i8** %esc
%gep = getelementptr inbounds i8* %alloc, i32 16
%objsize = call i32 @llvm.objectsize.i32(i8* %gep, i1 false) nounwind readonly
-; CHECK-NEXT: ret i32 32
+; CHECK: ret i32 32
+ ret i32 %objsize
+}
+
+declare noalias i8* @calloc(i32, i32) nounwind
+
+define i32 @test8(i8** %esc) {
+; CHECK: @test8
+ %alloc = call noalias i8* @calloc(i32 5, i32 7) nounwind
+ store i8* %alloc, i8** %esc
+ %gep = getelementptr inbounds i8* %alloc, i32 5
+ %objsize = call i32 @llvm.objectsize.i32(i8* %gep, i1 false) nounwind readonly
+; CHECK: ret i32 30
ret i32 %objsize
}
+declare noalias i8* @strdup(i8* nocapture) nounwind
+declare noalias i8* @strndup(i8* nocapture, i32) nounwind
+
+; CHECK: @test9
+define i32 @test9(i8** %esc) {
+ %call = tail call i8* @strdup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0)) nounwind
+ store i8* %call, i8** %esc, align 8
+ %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+; CHECK: ret i32 8
+ ret i32 %1
+}
+
+; CHECK: @test10
+define i32 @test10(i8** %esc) {
+ %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 3) nounwind
+ store i8* %call, i8** %esc, align 8
+ %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+; CHECK: ret i32 4
+ ret i32 %1
+}
+
+; CHECK: @test11
+define i32 @test11(i8** %esc) {
+ %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 7) nounwind
+ store i8* %call, i8** %esc, align 8
+ %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+; CHECK: ret i32 8
+ ret i32 %1
+}
+
+; CHECK: @test12
+define i32 @test12(i8** %esc) {
+ %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 8) nounwind
+ store i8* %call, i8** %esc, align 8
+ %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+; CHECK: ret i32 8
+ ret i32 %1
+}
+
+; CHECK: @test13
+define i32 @test13(i8** %esc) {
+ %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 57) nounwind
+ store i8* %call, i8** %esc, align 8
+ %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+; CHECK: ret i32 8
+ ret i32 %1
+}
+
+; CHECK: @PR13390
+define i32 @PR13390(i1 %bool, i8* %a) {
+entry:
+ %cond = or i1 %bool, true
+ br i1 %cond, label %return, label %xpto
+
+xpto:
+ %select = select i1 %bool, i8* %select, i8* %a
+ %select2 = select i1 %bool, i8* %a, i8* %select2
+ %0 = tail call i32 @llvm.objectsize.i32(i8* %select, i1 true)
+ %1 = tail call i32 @llvm.objectsize.i32(i8* %select2, i1 true)
+ %2 = add i32 %0, %1
+; CHECK: ret i32 undef
+ ret i32 %2
+
+return:
+ ret i32 42
+}
diff --git a/test/Transforms/InstCombine/odr-linkage.ll b/test/Transforms/InstCombine/odr-linkage.ll
index 61365b4..2ce6246 100644
--- a/test/Transforms/InstCombine/odr-linkage.ll
+++ b/test/Transforms/InstCombine/odr-linkage.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i32 10}
+; RUN: opt < %s -instcombine -S | grep "ret i32 10"
@g1 = available_externally constant i32 1
@g2 = linkonce_odr constant i32 2
diff --git a/test/Transforms/InstCombine/or-to-xor.ll b/test/Transforms/InstCombine/or-to-xor.ll
index 1495ee4..8847cb7 100644
--- a/test/Transforms/InstCombine/or-to-xor.ll
+++ b/test/Transforms/InstCombine/or-to-xor.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep {xor i32 %a, %b} | count 4
-; RUN: opt < %s -instcombine -S | not grep {and}
+; RUN: opt < %s -instcombine -S | grep "xor i32 %a, %b" | count 4
+; RUN: opt < %s -instcombine -S | not grep "and"
define i32 @func1(i32 %a, i32 %b) nounwind readnone {
entry:
diff --git a/test/Transforms/InstCombine/phi-merge-gep.ll b/test/Transforms/InstCombine/phi-merge-gep.ll
index 2671749..25c9cea 100644
--- a/test/Transforms/InstCombine/phi-merge-gep.ll
+++ b/test/Transforms/InstCombine/phi-merge-gep.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -S -instcombine > %t
-; RUN: grep {= getelementptr} %t | count 20
-; RUN: grep {= phi} %t | count 13
+; RUN: grep "= getelementptr" %t | count 20
+; RUN: grep "= phi" %t | count 13
; Don't push the geps through these phis, because they would require
; two phis each, which burdens the loop with high register pressure.
diff --git a/test/Transforms/InstCombine/phi.ll b/test/Transforms/InstCombine/phi.ll
index 219545c..1c307d4 100644
--- a/test/Transforms/InstCombine/phi.ll
+++ b/test/Transforms/InstCombine/phi.ll
@@ -620,3 +620,13 @@ end:
; CHECK-NOT: phi i32
; CHECK: ret i1 %z
}
+
+; CHECK: @test27(
+; CHECK: ret i32 undef
+define i32 @test27(i1 %b) {
+entry:
+ br label %done
+done:
+ %y = phi i32 [ undef, %entry ]
+ ret i32 %y
+}
diff --git a/test/Transforms/InstCombine/pr12338.ll b/test/Transforms/InstCombine/pr12338.ll
new file mode 100644
index 0000000..2b5c8f8
--- /dev/null
+++ b/test/Transforms/InstCombine/pr12338.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @entry() nounwind {
+entry:
+ br label %for.cond
+
+for.cond:
+ %local = phi <1 x i32> [ <i32 0>, %entry ], [ %phi2, %cond.end47 ]
+; CHECK: sub <1 x i32> <i32 92>, %local
+ %phi3 = sub <1 x i32> zeroinitializer, %local
+ br label %cond.end
+
+cond.false:
+ br label %cond.end
+
+cond.end:
+ %cond = phi <1 x i32> [ %phi3, %for.cond ], [ undef, %cond.false ]
+ br label %cond.end47
+
+cond.end47:
+ %sum = add <1 x i32> %cond, <i32 92>
+ %phi2 = sub <1 x i32> zeroinitializer, %sum
+ br label %for.cond
+}
diff --git a/test/Transforms/InstCombine/pr2645-0.ll b/test/Transforms/InstCombine/pr2645-0.ll
index 9bcaa43..e8aeb2a 100644
--- a/test/Transforms/InstCombine/pr2645-0.ll
+++ b/test/Transforms/InstCombine/pr2645-0.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {insertelement <4 x float> undef}
+; RUN: opt < %s -instcombine -S | grep "insertelement <4 x float> undef"
; Instcombine should be able to prove that none of the
; insertelement's first operand's elements are needed.
diff --git a/test/Transforms/InstCombine/sdiv-shift.ll b/test/Transforms/InstCombine/sdiv-shift.ll
deleted file mode 100644
index f4d2b36..0000000
--- a/test/Transforms/InstCombine/sdiv-shift.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: opt < %s -instcombine -S | not grep div
-
-define i32 @a(i16 zeroext %x, i32 %y) nounwind {
-entry:
- %conv = zext i16 %x to i32
- %s = shl i32 2, %y
- %d = sdiv i32 %conv, %s
- ret i32 %d
-}
diff --git a/test/Transforms/InstCombine/select-crash.ll b/test/Transforms/InstCombine/select-crash.ll
index 18af152..946ea2b 100644
--- a/test/Transforms/InstCombine/select-crash.ll
+++ b/test/Transforms/InstCombine/select-crash.ll
@@ -30,3 +30,20 @@ define <4 x float> @foo(i1 %b, <4 x float> %x, <4 x float> %y, <4 x float> %z) {
%sel = select i1 %b, <4 x float> %a, <4 x float> %sub
ret <4 x float> %sel
}
+
+; CHECK: @test3
+define i32 @test3(i1 %bool, i32 %a) {
+entry:
+ %cond = or i1 %bool, true
+ br i1 %cond, label %return, label %xpto
+
+; technically reachable, but this malformed IR may appear as a result of constant propagation
+xpto:
+ %select = select i1 %bool, i32 %a, i32 %select
+ %select2 = select i1 %bool, i32 %select2, i32 %a
+ %sum = add i32 %select, %select2
+ ret i32 %sum
+
+return:
+ ret i32 7
+}
diff --git a/test/Transforms/InstCombine/select-load-call.ll b/test/Transforms/InstCombine/select-load-call.ll
index bef0cf8..b63468d 100644
--- a/test/Transforms/InstCombine/select-load-call.ll
+++ b/test/Transforms/InstCombine/select-load-call.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i32 1}
+; RUN: opt < %s -instcombine -S | grep "ret i32 1"
declare void @test2()
diff --git a/test/Transforms/InstCombine/setcc-strength-reduce.ll b/test/Transforms/InstCombine/setcc-strength-reduce.ll
index 62ab116..138712e 100644
--- a/test/Transforms/InstCombine/setcc-strength-reduce.ll
+++ b/test/Transforms/InstCombine/setcc-strength-reduce.ll
@@ -3,7 +3,7 @@
; into equivalent setne,eq instructions.
;
; RUN: opt < %s -instcombine -S | \
-; RUN: grep -v {icmp eq} | grep -v {icmp ne} | not grep icmp
+; RUN: grep -v "icmp eq" | grep -v "icmp ne" | not grep icmp
; END.
define i1 @test1(i32 %A) {
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index 52310e3..25e708b 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -65,8 +65,17 @@ define i32 @test6(i32 %A) {
; CHECK: @test6
; CHECK-NEXT: mul i32 %A, 6
; CHECK-NEXT: ret i32
- %B = shl i32 %A, 1 ;; convert to an mul instruction
- %C = mul i32 %B, 3
+ %B = shl i32 %A, 1 ;; convert to a mul instruction
+ %C = mul i32 %B, 3
+ ret i32 %C
+}
+
+define i32 @test6a(i32 %A) {
+; CHECK: @test6a
+; CHECK-NEXT: mul i32 %A, 6
+; CHECK-NEXT: ret i32
+ %B = mul i32 %A, 3
+ %C = shl i32 %B, 1 ;; convert to a mul instruction
ret i32 %C
}
@@ -97,7 +106,9 @@ define i8 @test9(i8 %A) {
ret i8 %C
}
+;; This transformation is deferred to DAGCombine:
;; (A >> 7) << 7 === A & 128
+;; The shl may be valuable to scalar evolution.
define i8 @test10(i8 %A) {
; CHECK: @test10
; CHECK-NEXT: and i8 %A, -128
@@ -107,11 +118,21 @@ define i8 @test10(i8 %A) {
ret i8 %C
}
+;; Allow the simplification when the lshr shift is exact.
+define i8 @test10a(i8 %A) {
+; CHECK: @test10a
+; CHECK-NEXT: ret i8 %A
+ %B = lshr exact i8 %A, 7
+ %C = shl i8 %B, 7
+ ret i8 %C
+}
+
+;; This transformation is deferred to DAGCombine:
;; (A >> 3) << 4 === (A & 0x1F) << 1
+;; The shl may be valuable to scalar evolution.
define i8 @test11(i8 %A) {
; CHECK: @test11
-; CHECK-NEXT: mul i8 %A, 6
-; CHECK-NEXT: and i8
+; CHECK: shl i8
; CHECK-NEXT: ret i8
%a = mul i8 %A, 3 ; <i8> [#uses=1]
%B = lshr i8 %a, 3 ; <i8> [#uses=1]
@@ -119,6 +140,18 @@ define i8 @test11(i8 %A) {
ret i8 %C
}
+;; Allow the simplification in InstCombine when the lshr shift is exact.
+define i8 @test11a(i8 %A) {
+; CHECK: @test11a
+; CHECK-NEXT: mul i8 %A, 6
+; CHECK-NEXT: ret i8
+ %a = mul i8 %A, 3
+ %B = lshr exact i8 %a, 3
+ %C = shl i8 %B, 4
+ ret i8 %C
+}
+
+;; This is deferred to DAGCombine unless %B is single-use.
;; (A >> 8) << 8 === A & -256
define i32 @test12(i32 %A) {
; CHECK: @test12
@@ -129,11 +162,12 @@ define i32 @test12(i32 %A) {
ret i32 %C
}
+;; This transformation is deferred to DAGCombine:
;; (A >> 3) << 4 === (A & -8) * 2
+;; The shl may be valuable to scalar evolution.
define i8 @test13(i8 %A) {
; CHECK: @test13
-; CHECK-NEXT: mul i8 %A, 6
-; CHECK-NEXT: and i8
+; CHECK: shl i8
; CHECK-NEXT: ret i8
%a = mul i8 %A, 3 ; <i8> [#uses=1]
%B = ashr i8 %a, 3 ; <i8> [#uses=1]
@@ -141,6 +175,16 @@ define i8 @test13(i8 %A) {
ret i8 %C
}
+define i8 @test13a(i8 %A) {
+; CHECK: @test13a
+; CHECK-NEXT: mul i8 %A, 6
+; CHECK-NEXT: ret i8
+ %a = mul i8 %A, 3
+ %B = ashr exact i8 %a, 3
+ %C = shl i8 %B, 4
+ ret i8 %C
+}
+
;; D = ((B | 1234) << 4) === ((B << 4)|(1234 << 4))
define i32 @test14(i32 %A) {
; CHECK: @test14
@@ -477,10 +521,11 @@ entry:
%tmp49 = lshr i8 %tmp48, 5
%tmp50 = mul i8 %tmp49, 64
%tmp51 = xor i8 %tmp50, %tmp5
-; CHECK: and i8 %0, 16
%tmp52 = and i8 %tmp51, -128
%tmp53 = lshr i8 %tmp52, 7
+; CHECK: lshr i8 %tmp51, 7
%tmp54 = mul i8 %tmp53, 16
+; CHECK: shl nuw nsw i8 %tmp53, 4
%tmp55 = xor i8 %tmp54, %tmp51
; CHECK: ret i8 %tmp551
ret i8 %tmp55
diff --git a/test/Transforms/InstCombine/shufflemask-undef.ll b/test/Transforms/InstCombine/shufflemask-undef.ll
index cf87aef..aa6baa9 100644
--- a/test/Transforms/InstCombine/shufflemask-undef.ll
+++ b/test/Transforms/InstCombine/shufflemask-undef.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | not grep {shufflevector.\*i32 8}
+; RUN: opt < %s -instcombine -S | not grep "shufflevector.*i32 8"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9"
diff --git a/test/Transforms/InstCombine/shufflevec-constant.ll b/test/Transforms/InstCombine/shufflevec-constant.ll
index 29ae5a7..a002b2a 100644
--- a/test/Transforms/InstCombine/shufflevec-constant.ll
+++ b/test/Transforms/InstCombine/shufflevec-constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0x7FF0000000000000, float 0x7FF0000000000000>}
+; RUN: opt < %s -instcombine -S | grep "ret <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0x7FF0000000000000, float 0x7FF0000000000000>"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9"
diff --git a/test/Transforms/InstCombine/signed-comparison.ll b/test/Transforms/InstCombine/signed-comparison.ll
index 9a08c64..ab0e7e7 100644
--- a/test/Transforms/InstCombine/signed-comparison.ll
+++ b/test/Transforms/InstCombine/signed-comparison.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -instcombine -S > %t
; RUN: not grep zext %t
; RUN: not grep slt %t
-; RUN: grep {icmp ult} %t
+; RUN: grep "icmp ult" %t
; Instcombine should convert the zext+slt into a simple ult.
diff --git a/test/Transforms/InstCombine/srem-simplify-bug.ll b/test/Transforms/InstCombine/srem-simplify-bug.ll
index af824a4..3458714 100644
--- a/test/Transforms/InstCombine/srem-simplify-bug.ll
+++ b/test/Transforms/InstCombine/srem-simplify-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i1 false}
+; RUN: opt < %s -instcombine -S | grep "ret i1 false"
; PR2276
define i1 @f(i32 %x) {
diff --git a/test/Transforms/InstCombine/stack-overalign.ll b/test/Transforms/InstCombine/stack-overalign.ll
index 2fc8414..80c2ee8 100644
--- a/test/Transforms/InstCombine/stack-overalign.ll
+++ b/test/Transforms/InstCombine/stack-overalign.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {align 32} | count 1
+; RUN: opt < %s -instcombine -S | grep "align 32" | count 1
; It's tempting to have an instcombine in which the src pointer of a
; memcpy is aligned up to the alignment of the destination, however
diff --git a/test/Transforms/InstCombine/stacksaverestore.ll b/test/Transforms/InstCombine/stacksaverestore.ll
index 0fcaefa..f5c7a6f 100644
--- a/test/Transforms/InstCombine/stacksaverestore.ll
+++ b/test/Transforms/InstCombine/stacksaverestore.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {call.*stackrestore} | count 1
+; RUN: opt < %s -instcombine -S | grep "call.*stackrestore" | count 1
declare i8* @llvm.stacksave()
declare void @llvm.stackrestore(i8*)
diff --git a/test/Transforms/InstCombine/trunc.ll b/test/Transforms/InstCombine/trunc.ll
index 6ec342a..cbbad7f 100644
--- a/test/Transforms/InstCombine/trunc.ll
+++ b/test/Transforms/InstCombine/trunc.ll
@@ -12,8 +12,8 @@ define i64 @test1(i64 %a) {
call void @use(i32 %b)
ret i64 %d
; CHECK: @test1
-; CHECK: %d = and i64 %a, 15
-; CHECK: ret i64 %d
+; CHECK-NOT: ext
+; CHECK: ret
}
define i64 @test2(i64 %a) {
%b = trunc i64 %a to i32
@@ -34,8 +34,8 @@ define i64 @test3(i64 %a) {
call void @use(i32 %b)
ret i64 %d
; CHECK: @test3
-; CHECK: %d = and i64 %a, 8
-; CHECK: ret i64 %d
+; CHECK-NOT: ext
+; CHECK: ret
}
define i64 @test4(i64 %a) {
%b = trunc i64 %a to i32
@@ -46,8 +46,9 @@ define i64 @test4(i64 %a) {
ret i64 %d
; CHECK: @test4
; CHECK: = and i64 %a, 8
-; CHECK: %d = xor i64 {{.*}}, 8
-; CHECK: ret i64 %d
+; CHECK: = xor i64 {{.*}}, 8
+; CHECK-NOT: ext
+; CHECK: ret
}
define i32 @test5(i32 %A) {
diff --git a/test/Transforms/InstCombine/udiv-simplify-bug-0.ll b/test/Transforms/InstCombine/udiv-simplify-bug-0.ll
index bfdd98c..064e721 100644
--- a/test/Transforms/InstCombine/udiv-simplify-bug-0.ll
+++ b/test/Transforms/InstCombine/udiv-simplify-bug-0.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret i64 0} | count 2
+; RUN: opt < %s -instcombine -S | grep "ret i64 0" | count 2
define i64 @foo(i32 %x) nounwind {
%y = lshr i32 %x, 1
diff --git a/test/Transforms/InstCombine/urem-simplify-bug.ll b/test/Transforms/InstCombine/urem-simplify-bug.ll
index 229f1a8..3e94ab5 100644
--- a/test/Transforms/InstCombine/urem-simplify-bug.ll
+++ b/test/Transforms/InstCombine/urem-simplify-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {= or i32 %x, -5}
+; RUN: opt < %s -instcombine -S | grep "= or i32 %x, -5"
@.str = internal constant [5 x i8] c"foo\0A\00" ; <[5 x i8]*> [#uses=1]
@.str1 = internal constant [5 x i8] c"bar\0A\00" ; <[5 x i8]*> [#uses=1]
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index cc63371..0019a57 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -162,4 +162,51 @@ entry:
ret <4 x float> %shuffle9.i
}
+define <2 x float> @test_fptrunc(double %f) {
+; CHECK: @test_fptrunc
+; CHECK: insertelement
+; CHECK: insertelement
+; CHECK-NOT: insertelement
+ %tmp9 = insertelement <4 x double> undef, double %f, i32 0
+ %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
+ %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
+ %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
+ %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
+ %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x float> %ret
+}
+
+define <2 x double> @test_fpext(float %f) {
+; CHECK: @test_fpext
+; CHECK: insertelement
+; CHECK: insertelement
+; CHECK-NOT: insertelement
+ %tmp9 = insertelement <4 x float> undef, float %f, i32 0
+ %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
+ %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
+ %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
+ %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
+ %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %ret
+}
+
+define <4 x float> @test_select(float %f, float %g) {
+; CHECK: @test_select
+; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NOT: insertelement
+; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
+; CHECK-NOT: insertelement
+; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
+ %a0 = insertelement <4 x float> undef, float %f, i32 0
+ %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
+ %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
+ %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
+ %b0 = insertelement <4 x float> undef, float %g, i32 0
+ %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
+ %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
+ %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
+ %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
+ ret <4 x float> %ret
+}
+
diff --git a/test/Transforms/InstCombine/vec_insertelt.ll b/test/Transforms/InstCombine/vec_insertelt.ll
index eedf882..e35fa5e 100644
--- a/test/Transforms/InstCombine/vec_insertelt.ll
+++ b/test/Transforms/InstCombine/vec_insertelt.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {ret <4 x i32> %A}
+; RUN: opt < %s -instcombine -S | grep "ret <4 x i32> %A"
; PR1286
define <4 x i32> @test1(<4 x i32> %A) {
diff --git a/test/Transforms/InstCombine/vec_narrow.ll b/test/Transforms/InstCombine/vec_narrow.ll
index 2be43599..b4c41f6 100644
--- a/test/Transforms/InstCombine/vec_narrow.ll
+++ b/test/Transforms/InstCombine/vec_narrow.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {fadd float}
+; RUN: opt < %s -instcombine -S | grep "fadd float"
define float @test(<4 x float> %A, <4 x float> %B, float %f) {
diff --git a/test/Transforms/InstCombine/vector-srem.ll b/test/Transforms/InstCombine/vector-srem.ll
index acb11c5..b1ed49e 100644
--- a/test/Transforms/InstCombine/vector-srem.ll
+++ b/test/Transforms/InstCombine/vector-srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {srem <4 x i32>}
+; RUN: opt < %s -instcombine -S | grep "srem <4 x i32>"
define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u)
{
diff --git a/test/Transforms/InstCombine/volatile_store.ll b/test/Transforms/InstCombine/volatile_store.ll
index 2256678..7cab199 100644
--- a/test/Transforms/InstCombine/volatile_store.ll
+++ b/test/Transforms/InstCombine/volatile_store.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep {store volatile}
-; RUN: opt < %s -instcombine -S | grep {load volatile}
+; RUN: opt < %s -instcombine -S | grep "store volatile"
+; RUN: opt < %s -instcombine -S | grep "load volatile"
@x = weak global i32 0 ; <i32*> [#uses=2]
diff --git a/test/Transforms/InstCombine/xor.ll b/test/Transforms/InstCombine/xor.ll
index a7bcdac..3722697 100644
--- a/test/Transforms/InstCombine/xor.ll
+++ b/test/Transforms/InstCombine/xor.ll
@@ -1,7 +1,7 @@
; This test makes sure that these instructions are properly eliminated.
;
; RUN: opt < %s -instcombine -S | \
-; RUN: not grep {xor }
+; RUN: not grep "xor "
; END.
@G1 = global i32 0 ; <i32*> [#uses=1]
@G2 = global i32 0 ; <i32*> [#uses=1]
diff --git a/test/Transforms/InstCombine/zeroext-and-reduce.ll b/test/Transforms/InstCombine/zeroext-and-reduce.ll
index 592b8a1..315033d 100644
--- a/test/Transforms/InstCombine/zeroext-and-reduce.ll
+++ b/test/Transforms/InstCombine/zeroext-and-reduce.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -instcombine -S | \
-; RUN: grep {and i32 %Y, 8}
+; RUN: grep "and i32 %Y, 8"
define i32 @test1(i8 %X) {
%Y = zext i8 %X to i32 ; <i32> [#uses=1]
diff --git a/test/Transforms/InstCombine/zext-bool-add-sub.ll b/test/Transforms/InstCombine/zext-bool-add-sub.ll
index 1164273..78bcedb 100644
--- a/test/Transforms/InstCombine/zext-bool-add-sub.ll
+++ b/test/Transforms/InstCombine/zext-bool-add-sub.ll
@@ -1,29 +1,16 @@
-; RUN: opt < %s -instcombine -S | not grep zext
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; rdar://11748024
-define i32 @a(i1 %x) {
+define i32 @a(i1 zeroext %x, i1 zeroext %y) {
entry:
- %y = zext i1 %x to i32
- %res = add i32 %y, 1
- ret i32 %res
-}
-
-define i32 @b(i1 %x) {
-entry:
- %y = zext i1 %x to i32
- %res = add i32 %y, -1
- ret i32 %res
-}
-
-define i32 @c(i1 %x) {
-entry:
- %y = zext i1 %x to i32
- %res = sub i32 0, %y
- ret i32 %res
-}
-
-define i32 @d(i1 %x) {
-entry:
- %y = zext i1 %x to i32
- %res = sub i32 3, %y
- ret i32 %res
+; CHECK: @a
+; CHECK: [[TMP1:%.*]] = zext i1 %y to i32
+; CHECK: [[TMP2:%.*]] = select i1 %x, i32 2, i32 1
+; CHECK-NEXT: sub i32 [[TMP2]], [[TMP1]]
+ %conv = zext i1 %x to i32
+ %conv3 = zext i1 %y to i32
+ %conv3.neg = sub i32 0, %conv3
+ %sub = add i32 %conv, 1
+ %add = add i32 %sub, %conv3.neg
+ ret i32 %add
}
diff --git a/test/Transforms/InstCombine/zext-fold.ll b/test/Transforms/InstCombine/zext-fold.ll
index 9521101..e5f316b 100644
--- a/test/Transforms/InstCombine/zext-fold.ll
+++ b/test/Transforms/InstCombine/zext-fold.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep {zext } | count 1
+; RUN: opt < %s -instcombine -S | grep "zext " | count 1
; PR1570
define i32 @test2(float %X, float %Y) {
diff --git a/test/Transforms/JumpThreading/2008-11-27-EntryMunge.ll b/test/Transforms/JumpThreading/2008-11-27-EntryMunge.ll
index b5d1065..6a50d4f 100644
--- a/test/Transforms/JumpThreading/2008-11-27-EntryMunge.ll
+++ b/test/Transforms/JumpThreading/2008-11-27-EntryMunge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -jump-threading -S | grep {ret i32 0}
+; RUN: opt < %s -jump-threading -S | grep "ret i32 0"
; PR3138
define i32 @jt() {
diff --git a/test/Transforms/JumpThreading/2012-07-19-NoSuccessorIndirectBr.ll b/test/Transforms/JumpThreading/2012-07-19-NoSuccessorIndirectBr.ll
new file mode 100644
index 0000000..1c2c0c7
--- /dev/null
+++ b/test/Transforms/JumpThreading/2012-07-19-NoSuccessorIndirectBr.ll
@@ -0,0 +1,8 @@
+; RUN: opt < %s -jump-threading
+; PR 13405
+; Just check that it doesn't crash / assert
+
+define i32 @f() nounwind {
+entry:
+ indirectbr i8* undef, []
+}
diff --git a/test/Transforms/JumpThreading/compare.ll b/test/Transforms/JumpThreading/compare.ll
index 581785c..9b05b44 100644
--- a/test/Transforms/JumpThreading/compare.ll
+++ b/test/Transforms/JumpThreading/compare.ll
@@ -1,5 +1,5 @@
; There should be no phi nodes left.
-; RUN: opt < %s -jump-threading -S | not grep {phi i32}
+; RUN: opt < %s -jump-threading -S | not grep "phi i32"
declare i32 @f1()
declare i32 @f2()
diff --git a/test/Transforms/JumpThreading/no-irreducible-loops.ll b/test/Transforms/JumpThreading/no-irreducible-loops.ll
index a4914f9..c6e9faa 100644
--- a/test/Transforms/JumpThreading/no-irreducible-loops.ll
+++ b/test/Transforms/JumpThreading/no-irreducible-loops.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -jump-threading -loop-rotate -instcombine -indvars -loop-unroll -simplifycfg -S -verify-dom-info -verify-loop-info > %t
-; RUN: grep {store volatile} %t | count 3
-; RUN: not grep {br label} %t
+; RUN: grep "store volatile" %t | count 3
+; RUN: not grep "br label" %t
; Jump threading should not prevent this loop from being unrolled.
diff --git a/test/Transforms/JumpThreading/phi-eq.ll b/test/Transforms/JumpThreading/phi-eq.ll
new file mode 100644
index 0000000..40d3c7e
--- /dev/null
+++ b/test/Transforms/JumpThreading/phi-eq.ll
@@ -0,0 +1,209 @@
+; RUN: llvm-as < %s | opt -jump-threading | llvm-dis | FileCheck %s
+; Test whether two consecutive switches with identical structures assign the
+; proper value to the proper variable. This is really testing
+; Instruction::isIdenticalToWhenDefined, as previously that function was
+; returning true if the value parts of the operands of two phis were identical,
+; even if the incoming blocks were not.
+; NB: this function should be pruned down more.
+
+%struct._GList = type { i8*, %struct._GList*, %struct._GList* }
+%struct.filter_def = type { i8*, i8* }
+
+@capture_filters = external hidden global %struct._GList*, align 8
+@display_filters = external hidden global %struct._GList*, align 8
+@.str2 = external hidden unnamed_addr constant [10 x i8], align 1
+@__PRETTY_FUNCTION__.copy_filter_list = external hidden unnamed_addr constant [62 x i8], align 1
+@.str12 = external hidden unnamed_addr constant [22 x i8], align 1
+@.str13 = external hidden unnamed_addr constant [31 x i8], align 1
+@capture_edited_filters = external hidden global %struct._GList*, align 8
+@display_edited_filters = external hidden global %struct._GList*, align 8
+@__PRETTY_FUNCTION__.get_filter_list = external hidden unnamed_addr constant [44 x i8], align 1
+
+declare void @g_assertion_message(i8*, i8*, i32, i8*, i8*) noreturn
+
+declare void @g_free(i8*)
+
+declare %struct._GList* @g_list_first(%struct._GList*)
+
+declare noalias i8* @g_malloc(i64)
+
+define void @copy_filter_list(i32 %dest_type, i32 %src_type) nounwind uwtable ssp {
+entry:
+ br label %do.body
+
+do.body: ; preds = %entry
+ %cmp = icmp ne i32 %dest_type, %src_type
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %do.body
+ br label %if.end
+
+if.else: ; preds = %do.body
+ call void @g_assertion_message_expr(i8* null, i8* getelementptr inbounds ([10 x i8]* @.str2, i32 0, i32 0), i32 581, i8* getelementptr inbounds ([62 x i8]* @__PRETTY_FUNCTION__.copy_filter_list, i32 0, i32 0), i8* getelementptr inbounds ([22 x i8]* @.str12, i32 0, i32 0)) noreturn
+ unreachable
+
+if.end: ; preds = %if.then
+ br label %do.end
+
+do.end: ; preds = %if.end
+ switch i32 %dest_type, label %sw.default.i [
+ i32 0, label %sw.bb.i
+ i32 1, label %sw.bb1.i
+ i32 2, label %sw.bb2.i
+ i32 3, label %sw.bb3.i
+ ]
+
+sw.bb.i: ; preds = %do.end
+ br label %get_filter_list.exit
+
+sw.bb1.i: ; preds = %do.end
+ br label %get_filter_list.exit
+
+sw.bb2.i: ; preds = %do.end
+ br label %get_filter_list.exit
+
+sw.bb3.i: ; preds = %do.end
+ br label %get_filter_list.exit
+
+sw.default.i: ; preds = %do.end
+ call void @g_assertion_message(i8* null, i8* getelementptr inbounds ([10 x i8]* @.str2, i32 0, i32 0), i32 408, i8* getelementptr inbounds ([44 x i8]* @__PRETTY_FUNCTION__.get_filter_list, i32 0, i32 0), i8* null) noreturn nounwind
+ unreachable
+
+get_filter_list.exit: ; preds = %sw.bb3.i, %sw.bb2.i, %sw.bb1.i, %sw.bb.i
+ %0 = phi %struct._GList** [ @display_edited_filters, %sw.bb3.i ], [ @capture_edited_filters, %sw.bb2.i ], [ @display_filters, %sw.bb1.i ], [ @capture_filters, %sw.bb.i ]
+ switch i32 %src_type, label %sw.default.i5 [
+ i32 0, label %sw.bb.i1
+ i32 1, label %sw.bb1.i2
+ i32 2, label %sw.bb2.i3
+ i32 3, label %sw.bb3.i4
+ ]
+
+sw.bb.i1: ; preds = %get_filter_list.exit
+ br label %get_filter_list.exit6
+
+sw.bb1.i2: ; preds = %get_filter_list.exit
+ br label %get_filter_list.exit6
+
+sw.bb2.i3: ; preds = %get_filter_list.exit
+ br label %get_filter_list.exit6
+
+sw.bb3.i4: ; preds = %get_filter_list.exit
+ br label %get_filter_list.exit6
+
+sw.default.i5: ; preds = %get_filter_list.exit
+ call void @g_assertion_message(i8* null, i8* getelementptr inbounds ([10 x i8]* @.str2, i32 0, i32 0), i32 408, i8* getelementptr inbounds ([44 x i8]* @__PRETTY_FUNCTION__.get_filter_list, i32 0, i32 0), i8* null) noreturn nounwind
+ unreachable
+
+; CHECK: get_filter_list.exit
+get_filter_list.exit6: ; preds = %sw.bb3.i4, %sw.bb2.i3, %sw.bb1.i2, %sw.bb.i1
+ %1 = phi %struct._GList** [ @display_edited_filters, %sw.bb3.i4 ], [ @capture_edited_filters, %sw.bb2.i3 ], [ @display_filters, %sw.bb1.i2 ], [ @capture_filters, %sw.bb.i1 ]
+; CHECK: %2 = load
+ %2 = load %struct._GList** %1, align 8
+; We should have jump-threading insert an additional load here for the value
+; coming out of the first switch, which is picked up by a subsequent phi
+; CHECK: {{%\.pr = load %[^%]* %0}}
+; CHECK-NEXT: br label %while.cond
+ br label %while.cond
+
+; CHECK: while.cond
+while.cond: ; preds = %while.body, %get_filter_list.exit6
+; CHECK: {{= phi .*%.pr}}
+ %3 = load %struct._GList** %0, align 8
+; CHECK: tobool
+ %tobool = icmp ne %struct._GList* %3, null
+ br i1 %tobool, label %while.body, label %while.end
+
+while.body: ; preds = %while.cond
+ %4 = load %struct._GList** %0, align 8
+ %5 = load %struct._GList** %0, align 8
+ %call2 = call %struct._GList* @g_list_first(%struct._GList* %5)
+ %data.i = getelementptr inbounds %struct._GList* %call2, i32 0, i32 0
+ %6 = load i8** %data.i, align 8
+ %7 = bitcast i8* %6 to %struct.filter_def*
+ %name.i = getelementptr inbounds %struct.filter_def* %7, i32 0, i32 0
+ %8 = load i8** %name.i, align 8
+ call void @g_free(i8* %8) nounwind
+ %strval.i = getelementptr inbounds %struct.filter_def* %7, i32 0, i32 1
+ %9 = load i8** %strval.i, align 8
+ call void @g_free(i8* %9) nounwind
+ %10 = bitcast %struct.filter_def* %7 to i8*
+ call void @g_free(i8* %10) nounwind
+ %call.i = call %struct._GList* @g_list_remove_link(%struct._GList* %4, %struct._GList* %call2) nounwind
+ store %struct._GList* %call.i, %struct._GList** %0, align 8
+ br label %while.cond
+
+while.end: ; preds = %while.cond
+ br label %do.body4
+
+do.body4: ; preds = %while.end
+ %11 = load %struct._GList** %0, align 8
+ %call5 = call i32 @g_list_length(%struct._GList* %11)
+ %cmp6 = icmp eq i32 %call5, 0
+ br i1 %cmp6, label %if.then7, label %if.else8
+
+if.then7: ; preds = %do.body4
+ br label %if.end9
+
+if.else8: ; preds = %do.body4
+ call void @g_assertion_message_expr(i8* null, i8* getelementptr inbounds ([10 x i8]* @.str2, i32 0, i32 0), i32 600, i8* getelementptr inbounds ([62 x i8]* @__PRETTY_FUNCTION__.copy_filter_list, i32 0, i32 0), i8* getelementptr inbounds ([31 x i8]* @.str13, i32 0, i32 0)) noreturn
+ unreachable
+
+if.end9: ; preds = %if.then7
+ br label %do.end10
+
+do.end10: ; preds = %if.end9
+ br label %while.cond11
+
+while.cond11: ; preds = %cond.end, %do.end10
+ %cond10 = phi %struct._GList* [ %cond, %cond.end ], [ %2, %do.end10 ]
+ %tobool12 = icmp ne %struct._GList* %cond10, null
+ br i1 %tobool12, label %while.body13, label %while.end16
+
+while.body13: ; preds = %while.cond11
+ %data = getelementptr inbounds %struct._GList* %cond10, i32 0, i32 0
+ %12 = load i8** %data, align 8
+ %13 = bitcast i8* %12 to %struct.filter_def*
+ %14 = load %struct._GList** %0, align 8
+ %name = getelementptr inbounds %struct.filter_def* %13, i32 0, i32 0
+ %15 = load i8** %name, align 8
+ %strval = getelementptr inbounds %struct.filter_def* %13, i32 0, i32 1
+ %16 = load i8** %strval, align 8
+ %call.i7 = call noalias i8* @g_malloc(i64 16) nounwind
+ %17 = bitcast i8* %call.i7 to %struct.filter_def*
+ %call1.i = call noalias i8* @g_strdup(i8* %15) nounwind
+ %name.i8 = getelementptr inbounds %struct.filter_def* %17, i32 0, i32 0
+ store i8* %call1.i, i8** %name.i8, align 8
+ %call2.i = call noalias i8* @g_strdup(i8* %16) nounwind
+ %strval.i9 = getelementptr inbounds %struct.filter_def* %17, i32 0, i32 1
+ store i8* %call2.i, i8** %strval.i9, align 8
+ %18 = bitcast %struct.filter_def* %17 to i8*
+ %call3.i = call %struct._GList* @g_list_append(%struct._GList* %14, i8* %18) nounwind
+ store %struct._GList* %call3.i, %struct._GList** %0, align 8
+ %tobool15 = icmp ne %struct._GList* %cond10, null
+ br i1 %tobool15, label %cond.true, label %cond.false
+
+cond.true: ; preds = %while.body13
+ %next = getelementptr inbounds %struct._GList* %cond10, i32 0, i32 1
+ %19 = load %struct._GList** %next, align 8
+ br label %cond.end
+
+cond.false: ; preds = %while.body13
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi %struct._GList* [ %19, %cond.true ], [ null, %cond.false ]
+ br label %while.cond11
+
+while.end16: ; preds = %while.cond11
+ ret void
+}
+
+declare void @g_assertion_message_expr(i8*, i8*, i32, i8*, i8*) noreturn
+
+declare i32 @g_list_length(%struct._GList*)
+
+declare noalias i8* @g_strdup(i8*)
+
+declare %struct._GList* @g_list_append(%struct._GList*, i8*)
+
+declare %struct._GList* @g_list_remove_link(%struct._GList*, %struct._GList*)
diff --git a/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll b/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll
index 1534585..dd43c88 100644
--- a/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll
+++ b/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -loop-simplify -lcssa -S | \
-; RUN: grep {%%SJE.0.0.lcssa = phi .struct.SetJmpMapEntry}
+; RUN: grep "%%SJE.0.0.lcssa = phi .struct.SetJmpMapEntry"
%struct.SetJmpMapEntry = type { i8*, i32, %struct.SetJmpMapEntry* }
diff --git a/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll b/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll
index ad4f144..575f816 100644
--- a/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll
+++ b/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -lcssa -S | \
-; RUN: grep {%X.1.lcssa}
+; RUN: grep "%X.1.lcssa"
; RUN: opt < %s -lcssa -S | \
-; RUN: not grep {%X.1.lcssa1}
+; RUN: not grep "%X.1.lcssa1"
declare i1 @c1()
diff --git a/test/Transforms/LCSSA/basictest.ll b/test/Transforms/LCSSA/basictest.ll
index 23ab2c0..4b05ad9 100644
--- a/test/Transforms/LCSSA/basictest.ll
+++ b/test/Transforms/LCSSA/basictest.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -lcssa -S | \
-; RUN: grep {X3.lcssa = phi i32}
+; RUN: grep "X3.lcssa = phi i32"
; RUN: opt < %s -lcssa -S | \
-; RUN: grep {X4 = add i32 3, %X3.lcssa}
+; RUN: grep "X4 = add i32 3, %X3.lcssa"
define void @lcssa(i1 %S2) {
entry:
diff --git a/test/Transforms/LCSSA/unreachable-use.ll b/test/Transforms/LCSSA/unreachable-use.ll
index c389c9c..71ae134 100644
--- a/test/Transforms/LCSSA/unreachable-use.ll
+++ b/test/Transforms/LCSSA/unreachable-use.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -lcssa -S -verify-loop-info | grep {\[%\]tmp33 = load i1\\*\\* \[%\]tmp}
+; RUN: opt < %s -lcssa -S -verify-loop-info | grep "[%]tmp33 = load i1\*\* [%]tmp"
; PR6546
; LCSSA doesn't need to transform uses in blocks not reachable
diff --git a/test/Transforms/LCSSA/unused-phis.ll b/test/Transforms/LCSSA/unused-phis.ll
index aa2ab96..01b214b 100644
--- a/test/Transforms/LCSSA/unused-phis.ll
+++ b/test/Transforms/LCSSA/unused-phis.ll
@@ -2,9 +2,9 @@
; CHECK: exit1:
; CHECK: .lcssa =
; CHECK: exit2:
-; CHECK: .lcssa2 =
+; CHECK: .lcssa1 =
; CHECK: exit3:
-; CHECK-NOT: .lcssa1 =
+; CHECK-NOT: .lcssa
; Test to ensure that when there are multiple exit blocks, PHI nodes are
; only inserted by LCSSA when there is a use dominated by a given exit
diff --git a/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll b/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll
index 70a04c7..b54d520 100644
--- a/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll
+++ b/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll
@@ -4,7 +4,7 @@
; case... bad.
; RUN: opt < %s -licm -loop-deletion -simplifycfg -S | \
-; RUN: not grep {br }
+; RUN: not grep "br "
define i32 @main(i32 %argc) {
; <label>:0
diff --git a/test/Transforms/LICM/2007-05-22-VolatileSink.ll b/test/Transforms/LICM/2007-05-22-VolatileSink.ll
index 4df6ea7..94511cc 100644
--- a/test/Transforms/LICM/2007-05-22-VolatileSink.ll
+++ b/test/Transforms/LICM/2007-05-22-VolatileSink.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -licm -S | grep {store volatile}
+; RUN: opt < %s -licm -S | grep "store volatile"
; PR1435
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "i686-apple-darwin8"
diff --git a/test/Transforms/LICM/hoist-invariant-load.ll b/test/Transforms/LICM/hoist-invariant-load.ll
index 4e100d3..f9fc551 100644
--- a/test/Transforms/LICM/hoist-invariant-load.ll
+++ b/test/Transforms/LICM/hoist-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -licm -stats -S |& grep "1 licm"
+; RUN: opt < %s -licm -stats -S 2>&1 | grep "1 licm"
@"\01L_OBJC_METH_VAR_NAME_" = internal global [4 x i8] c"foo\00", section "__TEXT,__objc_methname,cstring_literals", align 1
@"\01L_OBJC_SELECTOR_REFERENCES_" = internal global i8* getelementptr inbounds ([4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
diff --git a/test/Transforms/LICM/promote-order.ll b/test/Transforms/LICM/promote-order.ll
new file mode 100644
index 0000000..b016265
--- /dev/null
+++ b/test/Transforms/LICM/promote-order.ll
@@ -0,0 +1,41 @@
+; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s
+
+; LICM should keep the stores in their original order when it sinks/promotes them.
+; rdar://12045203
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@p = external global i8*
+
+define i32* @_Z4doiti(i32 %n, float* %tmp1, i32* %tmp3) nounwind {
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ store float 1.000000e+00, float* %tmp1, align 4, !tbaa !1
+ store i32 1, i32* %tmp3, align 4, !tbaa !2
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK: store float 1.000000e+00, float* %tmp1
+; CHECK: store i32 1, i32* %tmp3
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %split = phi i32* [ %tmp3, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ %r.0.lcssa = phi i32* [ %split, %for.cond.for.end_crit_edge ], [ undef, %entry ]
+ ret i32* %r.0.lcssa
+}
+
+!0 = metadata !{metadata !"minimal TBAA"}
+!1 = metadata !{metadata !"float", metadata !0}
+!2 = metadata !{metadata !"int", metadata !0}
diff --git a/test/Transforms/LICM/speculate.ll b/test/Transforms/LICM/speculate.ll
index 507b193..4c4d036 100644
--- a/test/Transforms/LICM/speculate.ll
+++ b/test/Transforms/LICM/speculate.ll
@@ -165,3 +165,25 @@ for.inc: ; preds = %if.then, %for.body
for.end: ; preds = %for.inc, %entry
ret void
}
+
+; SDiv is unsafe to speculate inside an infinite loop.
+
+define void @unsafe_sdiv_c(i64 %a, i64 %b, i64* %p) {
+entry:
+; CHECK: entry:
+; CHECK-NOT: sdiv
+; CHECK: br label %for.body
+ br label %for.body
+
+for.body:
+ %c = icmp eq i64 %b, 0
+ br i1 %c, label %backedge, label %if.then
+
+if.then:
+ %d = sdiv i64 %a, %b
+ store i64 %d, i64* %p
+ br label %backedge
+
+backedge:
+ br label %for.body
+}
diff --git a/test/Transforms/LoopRotate/PhiRename-1.ll b/test/Transforms/LoopRotate/PhiRename-1.ll
index 9cb55b4..a224777 100644
--- a/test/Transforms/LoopRotate/PhiRename-1.ll
+++ b/test/Transforms/LoopRotate/PhiRename-1.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -S | not grep {\\\[ .tmp224}
-; END.
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -S | FileCheck %s
+; CHECK-NOT: [ {{.}}tmp224
+
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
diff --git a/test/Transforms/LoopSimplify/indirectbr.ll b/test/Transforms/LoopSimplify/indirectbr.ll
index 9814d4a..ca05f43 100644
--- a/test/Transforms/LoopSimplify/indirectbr.ll
+++ b/test/Transforms/LoopSimplify/indirectbr.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -loop-simplify -lcssa -verify-loop-info -verify-dom-info -S \
-; RUN: | grep -F {indirectbr i8* %x, \[label %L0, label %L1\]} \
+; RUN: | grep -F "indirectbr i8* %x, [label %L0, label %L1]" \
; RUN: | count 6
; LoopSimplify should not try to transform loops when indirectbr is involved.
diff --git a/test/Transforms/LoopSimplify/merge-exits.ll b/test/Transforms/LoopSimplify/merge-exits.ll
index 40ad2f4..8de5938 100644
--- a/test/Transforms/LoopSimplify/merge-exits.ll
+++ b/test/Transforms/LoopSimplify/merge-exits.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -loop-simplify -loop-rotate -instcombine -indvars -S -verify-loop-info -verify-dom-info > %t
; RUN: not grep sext %t
-; RUN: grep {phi i64} %t | count 1
+; RUN: grep "phi i64" %t | count 1
; Loopsimplify should be able to merge the two loop exits
; into one, so that loop rotate can rotate the loop, so
diff --git a/test/Transforms/LoopSimplify/preserve-scev.ll b/test/Transforms/LoopSimplify/preserve-scev.ll
index 23ac7f2..854c612 100644
--- a/test/Transforms/LoopSimplify/preserve-scev.ll
+++ b/test/Transforms/LoopSimplify/preserve-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S < %s -indvars | opt -analyze -iv-users | grep {%cmp = icmp slt i32} | grep {= \{%\\.ph,+,1\}<%for.cond>}
+; RUN: opt -S < %s -indvars | opt -analyze -iv-users | grep "%cmp = icmp slt i32" | grep "= {%\.ph,+,1}<%for.cond>"
; PR8079
; LoopSimplify should invalidate indvars when splitting out the
diff --git a/test/Transforms/LoopStrengthReduce/2012-07-13-ExpandUDiv.ll b/test/Transforms/LoopStrengthReduce/2012-07-13-ExpandUDiv.ll
new file mode 100644
index 0000000..a122208
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/2012-07-13-ExpandUDiv.ll
@@ -0,0 +1,90 @@
+; RUN: opt -loop-reduce -S < %s | FileCheck %s
+;
+; PR11356: likely wrong code bug
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin"
+
+@g_66 = global [1 x i32] zeroinitializer, align 4
+@g_775 = global i32 0, align 4
+@g_752 = global i32 0, align 4
+@g_3 = global i32 0, align 4
+
+; Ensure that %div.i.i.i.us is not hoisted.
+; CHECK: @main
+; CHECK: for.body.i.i.us:
+; CHECK: %div.i.i.i.us
+; CHECK: %cmp5.i.i.us
+define i32 @main() nounwind uwtable ssp {
+entry:
+ %l_2 = alloca [1 x i32], align 4
+ %arrayidx = getelementptr inbounds [1 x i32]* %l_2, i64 0, i64 0
+ store i32 0, i32* %arrayidx, align 4, !tbaa !0
+ %tmp = load i32* @g_3, align 4, !tbaa !0
+ %idxprom = sext i32 %tmp to i64
+ %arrayidx1 = getelementptr inbounds [1 x i32]* %l_2, i64 0, i64 %idxprom
+ %tmp1 = load i32* %arrayidx1, align 4, !tbaa !0
+ %conv.i.i = and i32 %tmp1, 65535
+ %tobool.i.i.i = icmp ne i32 %tmp, 0
+ br label %codeRepl
+
+codeRepl.loopexit.us-lcssa: ; preds = %for.body.i.i, %codeRepl5
+ br label %codeRepl.loopexit
+
+codeRepl.loopexit: ; preds = %codeRepl.loopexit.us-lcssa.us, %codeRepl.loopexit.us-lcssa
+ br label %codeRepl
+
+codeRepl: ; preds = %codeRepl.loopexit, %entry
+ br i1 %tobool.i.i.i, label %codeRepl.split.us, label %codeRepl.codeRepl.split_crit_edge
+
+codeRepl.codeRepl.split_crit_edge: ; preds = %codeRepl
+ br label %codeRepl.split
+
+codeRepl.split.us: ; preds = %codeRepl
+ br label %for.cond.i.i.us
+
+for.cond.i.i.us: ; preds = %for.inc.i.i.us, %codeRepl.split.us
+ %tmp2 = phi i32 [ 0, %codeRepl.split.us ], [ %add.i.i.us, %for.inc.i.i.us ]
+ br label %codeRepl5.us
+
+for.inc.i.i.us: ; preds = %for.body.i.i.us
+ %add.i.i.us = add nsw i32 %tmp2, 1
+ store i32 %add.i.i.us, i32* @g_752, align 4, !tbaa !0
+ br label %for.cond.i.i.us
+
+for.body.i.i.us: ; preds = %codeRepl5.us
+ %div.i.i.i.us = udiv i32 1, %conv.i.i
+ %cmp5.i.i.us = icmp eq i32 %div.i.i.i.us, %tmp2
+ br i1 %cmp5.i.i.us, label %codeRepl.loopexit.us-lcssa.us, label %for.inc.i.i.us
+
+codeRepl5.us: ; preds = %for.cond.i.i.us
+ br i1 true, label %codeRepl.loopexit.us-lcssa.us, label %for.body.i.i.us
+
+codeRepl.loopexit.us-lcssa.us: ; preds = %codeRepl5.us, %for.body.i.i.us
+ br label %codeRepl.loopexit
+
+codeRepl.split: ; preds = %codeRepl.codeRepl.split_crit_edge
+ br label %for.cond.i.i
+
+for.cond.i.i: ; preds = %for.inc.i.i, %codeRepl.split
+ %tmp3 = phi i32 [ 0, %codeRepl.split ], [ %add.i.i, %for.inc.i.i ]
+ br label %codeRepl5
+
+codeRepl5: ; preds = %for.cond.i.i
+ br i1 true, label %codeRepl.loopexit.us-lcssa, label %for.body.i.i
+
+for.body.i.i: ; preds = %codeRepl5
+ %cmp5.i.i = icmp eq i32 0, %tmp3
+ br i1 %cmp5.i.i, label %codeRepl.loopexit.us-lcssa, label %for.inc.i.i
+
+for.inc.i.i: ; preds = %for.body.i.i
+ %add.i.i = add nsw i32 %tmp3, 1
+ store i32 %add.i.i, i32* @g_752, align 4, !tbaa !0
+ br label %for.cond.i.i
+
+func_4.exit: ; No predecessors!
+ ret i32 0
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll b/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll
new file mode 100644
index 0000000..3793bac
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll
@@ -0,0 +1,517 @@
+; RUN: opt -loop-reduce -disable-output -debug-only=loop-reduce %s 2> %t
+; RUN: FileCheck %s < %t
+; REQUIRES: asserts
+;
+; PR13361: LSR + SCEV "hangs" on reasonably sized test with sequence of loops
+;
+; Without limits on CollectSubexpr, we have thousands of formulae for
+; the use that crosses loops. With limits we have five.
+; CHECK: LSR on loop %bb221:
+; CHECK: After generating reuse formulae:
+; CHECK: LSR is examining the following uses:
+; CHECK: LSR Use: Kind=Special
+; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
+; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
+; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
+; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
+; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
+; CHECK-NOT:reg
+; CHECK: Filtering for use
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-freebsd9"
+
+%struct.snork = type { %struct.fuga, i32, i32, i32, i32, i32, i32 }
+%struct.fuga = type { %struct.gork, i64 }
+%struct.gork = type { i8*, i32, i32, %struct.noot* }
+%struct.noot = type opaque
+%struct.jim = type { [5120 x i8], i32, i32, [2048 x i8], i32, [256 x i8] }
+
+@global = external global %struct.snork, align 8
+@global1 = external hidden unnamed_addr constant [52 x i8], align 1
+@global2 = external hidden unnamed_addr constant [18 x i8], align 1
+@global3 = external hidden global %struct.jim, align 32
+@global4 = external hidden unnamed_addr constant [40 x i8], align 1
+
+declare void @snork(...) nounwind
+
+declare fastcc void @blarg() nounwind uwtable readonly
+
+define hidden fastcc void @boogle() nounwind uwtable {
+bb:
+ %tmp = trunc i64 0 to i32
+ %tmp1 = icmp slt i32 %tmp, 2047
+ %tmp2 = add i32 0, -1
+ %tmp3 = icmp ult i32 %tmp2, 255
+ %tmp4 = and i1 %tmp1, %tmp3
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ tail call void (...)* @snork(i8* getelementptr inbounds ([52 x i8]* @global1, i64 0, i64 0), i32 2021) nounwind
+ tail call void (...)* @snork(i8* getelementptr inbounds (%struct.jim* @global3, i64 0, i32 3, i64 1), i32 -2146631418) nounwind
+ unreachable
+
+bb6: ; preds = %bb
+ tail call void @zot(i8* getelementptr inbounds (%struct.jim* @global3, i64 0, i32 5, i64 0), i8* getelementptr inbounds (%struct.jim* @global3, i64 0, i32 3, i64 1), i64 undef, i32 1, i1 false) nounwind
+ %tmp7 = getelementptr inbounds %struct.jim* @global3, i64 0, i32 5, i64 undef
+ store i8 0, i8* %tmp7, align 1
+ %tmp8 = add nsw i32 0, 1
+ %tmp9 = sext i32 %tmp8 to i64
+ %tmp10 = add i64 %tmp9, 1
+ %tmp11 = getelementptr inbounds %struct.jim* @global3, i64 0, i32 3, i64 %tmp10
+ %tmp12 = sub i64 2047, %tmp9
+ %tmp13 = icmp eq i32 undef, 1
+ br i1 %tmp13, label %bb14, label %bb15
+
+bb14: ; preds = %bb6
+ tail call fastcc void @blarg()
+ unreachable
+
+bb15: ; preds = %bb6
+ %tmp16 = trunc i64 %tmp12 to i32
+ br label %bb17
+
+bb17: ; preds = %bb26, %bb15
+ %tmp18 = phi i64 [ %tmp28, %bb26 ], [ 0, %bb15 ]
+ %tmp19 = phi i32 [ %tmp29, %bb26 ], [ 0, %bb15 ]
+ %tmp20 = trunc i64 %tmp18 to i32
+ %tmp21 = icmp slt i32 %tmp20, %tmp16
+ br i1 %tmp21, label %bb22, label %bb32
+
+bb22: ; preds = %bb17
+ %tmp23 = getelementptr inbounds %struct.jim* @global3, i64 0, i32 3, i64 0
+ %tmp24 = load i8* %tmp23, align 1
+ %tmp25 = icmp eq i8 %tmp24, 58
+ br i1 %tmp25, label %bb30, label %bb26
+
+bb26: ; preds = %bb22
+ %tmp27 = icmp eq i8 %tmp24, 0
+ %tmp28 = add i64 %tmp18, 1
+ %tmp29 = add nsw i32 %tmp19, 1
+ br i1 %tmp27, label %bb32, label %bb17
+
+bb30: ; preds = %bb22
+ %tmp31 = icmp ult i32 undef, 255
+ br i1 %tmp31, label %bb33, label %bb32
+
+bb32: ; preds = %bb30, %bb26, %bb17
+ tail call void (...)* @snork(i8* getelementptr inbounds ([52 x i8]* @global1, i64 0, i64 0), i32 2038) nounwind
+ tail call void (...)* @snork(i8* %tmp11, i32 -2146631418) nounwind
+ unreachable
+
+bb33: ; preds = %bb30
+ tail call void @zot(i8* getelementptr inbounds (%struct.jim* @global3, i64 0, i32 5, i64 0), i8* %tmp11, i64 undef, i32 1, i1 false) nounwind
+ %tmp34 = getelementptr inbounds %struct.jim* @global3, i64 0, i32 5, i64 undef
+ store i8 0, i8* %tmp34, align 1
+ %tmp35 = add nsw i32 %tmp19, 1
+ %tmp36 = sext i32 %tmp35 to i64
+ %tmp37 = add i64 %tmp36, %tmp10
+ %tmp38 = getelementptr inbounds %struct.jim* @global3, i64 0, i32 3, i64 %tmp37
+ %tmp39 = sub i64 %tmp12, %tmp36
+ br i1 false, label %bb40, label %bb41
+
+bb40: ; preds = %bb33
+ br label %bb41
+
+bb41: ; preds = %bb40, %bb33
+ %tmp42 = trunc i64 %tmp39 to i32
+ br label %bb43
+
+bb43: ; preds = %bb52, %bb41
+ %tmp44 = phi i64 [ %tmp53, %bb52 ], [ 0, %bb41 ]
+ %tmp45 = phi i32 [ %tmp54, %bb52 ], [ 0, %bb41 ]
+ %tmp46 = trunc i64 %tmp44 to i32
+ %tmp47 = icmp slt i32 %tmp46, %tmp42
+ br i1 %tmp47, label %bb48, label %bb58
+
+bb48: ; preds = %bb43
+ %tmp49 = add i64 %tmp44, %tmp37
+ %tmp50 = load i8* undef, align 1
+ %tmp51 = icmp eq i8 %tmp50, 58
+ br i1 %tmp51, label %bb55, label %bb52
+
+bb52: ; preds = %bb48
+ %tmp53 = add i64 %tmp44, 1
+ %tmp54 = add nsw i32 %tmp45, 1
+ br i1 undef, label %bb58, label %bb43
+
+bb55: ; preds = %bb48
+ %tmp56 = add i32 %tmp45, -1
+ %tmp57 = icmp ult i32 %tmp56, 255
+ br i1 %tmp57, label %bb59, label %bb58
+
+bb58: ; preds = %bb55, %bb52, %bb43
+ tail call void (...)* @snork(i8* getelementptr inbounds ([52 x i8]* @global1, i64 0, i64 0), i32 2055) nounwind
+ tail call void (...)* @snork(i8* %tmp38, i32 -2146631418) nounwind
+ br label %bb247
+
+bb59: ; preds = %bb55
+ %tmp60 = sext i32 %tmp45 to i64
+ tail call void @zot(i8* getelementptr inbounds (%struct.jim* @global3, i64 0, i32 5, i64 0), i8* %tmp38, i64 %tmp60, i32 1, i1 false) nounwind
+ %tmp61 = getelementptr inbounds %struct.jim* @global3, i64 0, i32 5, i64 %tmp60
+ store i8 0, i8* %tmp61, align 1
+ %tmp62 = add nsw i32 %tmp45, 1
+ %tmp63 = sext i32 %tmp62 to i64
+ %tmp64 = add i64 %tmp63, %tmp37
+ %tmp65 = sub i64 %tmp39, %tmp63
+ %tmp66 = icmp eq i32 undef, 2
+ br i1 %tmp66, label %bb67, label %bb68
+
+bb67: ; preds = %bb59
+ tail call fastcc void @blarg()
+ unreachable
+
+bb68: ; preds = %bb59
+ switch i32 undef, label %bb71 [
+ i32 0, label %bb74
+ i32 -1, label %bb69
+ ]
+
+bb69: ; preds = %bb68
+ tail call void (...)* @snork(i8* getelementptr inbounds ([52 x i8]* @global1, i64 0, i64 0), i32 2071) nounwind
+ %tmp70 = load i32* getelementptr inbounds (%struct.snork* @global, i64 0, i32 2), align 4
+ unreachable
+
+bb71: ; preds = %bb68
+ %tmp72 = load i32* getelementptr inbounds (%struct.snork* @global, i64 0, i32 4), align 4
+ %tmp73 = icmp eq i32 undef, 0
+ br i1 %tmp73, label %bb247, label %bb74
+
+bb74: ; preds = %bb71, %bb68
+ %tmp75 = trunc i64 %tmp65 to i32
+ br label %bb76
+
+bb76: ; preds = %bb82, %bb74
+ %tmp77 = phi i64 [ %tmp84, %bb82 ], [ 0, %bb74 ]
+ %tmp78 = phi i32 [ %tmp85, %bb82 ], [ 0, %bb74 ]
+ %tmp79 = trunc i64 %tmp77 to i32
+ %tmp80 = icmp slt i32 %tmp79, %tmp75
+ br i1 %tmp80, label %bb81, label %bb87
+
+bb81: ; preds = %bb76
+ br i1 false, label %bb86, label %bb82
+
+bb82: ; preds = %bb81
+ %tmp83 = icmp eq i8 0, 0
+ %tmp84 = add i64 %tmp77, 1
+ %tmp85 = add nsw i32 %tmp78, 1
+ br i1 %tmp83, label %bb87, label %bb76
+
+bb86: ; preds = %bb81
+ br i1 undef, label %bb88, label %bb87
+
+bb87: ; preds = %bb86, %bb82, %bb76
+ unreachable
+
+bb88: ; preds = %bb86
+ %tmp89 = add nsw i32 %tmp78, 1
+ %tmp90 = sext i32 %tmp89 to i64
+ %tmp91 = add i64 %tmp90, %tmp64
+ %tmp92 = sub i64 %tmp65, %tmp90
+ br i1 false, label %bb93, label %bb94
+
+bb93: ; preds = %bb88
+ unreachable
+
+bb94: ; preds = %bb88
+ %tmp95 = trunc i64 %tmp92 to i32
+ br label %bb96
+
+bb96: ; preds = %bb102, %bb94
+ %tmp97 = phi i64 [ %tmp103, %bb102 ], [ 0, %bb94 ]
+ %tmp98 = phi i32 [ %tmp104, %bb102 ], [ 0, %bb94 ]
+ %tmp99 = trunc i64 %tmp97 to i32
+ %tmp100 = icmp slt i32 %tmp99, %tmp95
+ br i1 %tmp100, label %bb101, label %bb106
+
+bb101: ; preds = %bb96
+ br i1 undef, label %bb105, label %bb102
+
+bb102: ; preds = %bb101
+ %tmp103 = add i64 %tmp97, 1
+ %tmp104 = add nsw i32 %tmp98, 1
+ br i1 false, label %bb106, label %bb96
+
+bb105: ; preds = %bb101
+ br i1 undef, label %bb107, label %bb106
+
+bb106: ; preds = %bb105, %bb102, %bb96
+ br label %bb247
+
+bb107: ; preds = %bb105
+ %tmp108 = add nsw i32 %tmp98, 1
+ %tmp109 = sext i32 %tmp108 to i64
+ %tmp110 = add i64 %tmp109, %tmp91
+ %tmp111 = sub i64 %tmp92, %tmp109
+ br i1 false, label %bb112, label %bb113
+
+bb112: ; preds = %bb107
+ unreachable
+
+bb113: ; preds = %bb107
+ %tmp114 = trunc i64 %tmp111 to i32
+ br label %bb115
+
+bb115: ; preds = %bb121, %bb113
+ %tmp116 = phi i64 [ %tmp122, %bb121 ], [ 0, %bb113 ]
+ %tmp117 = phi i32 [ %tmp123, %bb121 ], [ 0, %bb113 ]
+ %tmp118 = trunc i64 %tmp116 to i32
+ %tmp119 = icmp slt i32 %tmp118, %tmp114
+ br i1 %tmp119, label %bb120, label %bb125
+
+bb120: ; preds = %bb115
+ br i1 undef, label %bb124, label %bb121
+
+bb121: ; preds = %bb120
+ %tmp122 = add i64 %tmp116, 1
+ %tmp123 = add nsw i32 %tmp117, 1
+ br i1 false, label %bb125, label %bb115
+
+bb124: ; preds = %bb120
+ br i1 false, label %bb126, label %bb125
+
+bb125: ; preds = %bb124, %bb121, %bb115
+ unreachable
+
+bb126: ; preds = %bb124
+ %tmp127 = add nsw i32 %tmp117, 1
+ %tmp128 = sext i32 %tmp127 to i64
+ %tmp129 = add i64 %tmp128, %tmp110
+ %tmp130 = sub i64 %tmp111, %tmp128
+ tail call fastcc void @blarg()
+ br i1 false, label %bb132, label %bb131
+
+bb131: ; preds = %bb126
+ unreachable
+
+bb132: ; preds = %bb126
+ %tmp133 = trunc i64 %tmp130 to i32
+ br label %bb134
+
+bb134: ; preds = %bb140, %bb132
+ %tmp135 = phi i64 [ %tmp141, %bb140 ], [ 0, %bb132 ]
+ %tmp136 = phi i32 [ %tmp142, %bb140 ], [ 0, %bb132 ]
+ %tmp137 = trunc i64 %tmp135 to i32
+ %tmp138 = icmp slt i32 %tmp137, %tmp133
+ br i1 %tmp138, label %bb139, label %bb144
+
+bb139: ; preds = %bb134
+ br i1 false, label %bb143, label %bb140
+
+bb140: ; preds = %bb139
+ %tmp141 = add i64 %tmp135, 1
+ %tmp142 = add nsw i32 %tmp136, 1
+ br i1 false, label %bb144, label %bb134
+
+bb143: ; preds = %bb139
+ br i1 false, label %bb145, label %bb144
+
+bb144: ; preds = %bb143, %bb140, %bb134
+ br label %bb247
+
+bb145: ; preds = %bb143
+ %tmp146 = add nsw i32 %tmp136, 1
+ %tmp147 = sext i32 %tmp146 to i64
+ %tmp148 = add i64 %tmp147, %tmp129
+ %tmp149 = sub i64 %tmp130, %tmp147
+ switch i32 0, label %bb152 [
+ i32 0, label %bb150
+ i32 16, label %bb150
+ i32 32, label %bb150
+ i32 48, label %bb150
+ i32 64, label %bb150
+ i32 256, label %bb150
+ i32 4096, label %bb150
+ ]
+
+bb150: ; preds = %bb145, %bb145, %bb145, %bb145, %bb145, %bb145, %bb145
+ %tmp151 = trunc i64 %tmp149 to i32
+ br label %bb153
+
+bb152: ; preds = %bb145
+ unreachable
+
+bb153: ; preds = %bb160, %bb150
+ %tmp154 = phi i64 [ %tmp161, %bb160 ], [ 0, %bb150 ]
+ %tmp155 = phi i32 [ %tmp162, %bb160 ], [ 0, %bb150 ]
+ %tmp156 = trunc i64 %tmp154 to i32
+ %tmp157 = icmp slt i32 %tmp156, %tmp151
+ br i1 %tmp157, label %bb158, label %bb166
+
+bb158: ; preds = %bb153
+ %tmp159 = add i64 %tmp154, %tmp148
+ br i1 false, label %bb163, label %bb160
+
+bb160: ; preds = %bb158
+ %tmp161 = add i64 %tmp154, 1
+ %tmp162 = add nsw i32 %tmp155, 1
+ br i1 false, label %bb166, label %bb153
+
+bb163: ; preds = %bb158
+ %tmp164 = add i32 %tmp155, -1
+ %tmp165 = icmp ult i32 %tmp164, 255
+ br i1 %tmp165, label %bb167, label %bb166
+
+bb166: ; preds = %bb163, %bb160, %bb153
+ unreachable
+
+bb167: ; preds = %bb163
+ %tmp168 = add nsw i32 %tmp155, 1
+ %tmp169 = sext i32 %tmp168 to i64
+ %tmp170 = add i64 %tmp169, %tmp148
+ %tmp171 = sub i64 %tmp149, %tmp169
+ br i1 false, label %bb173, label %bb172
+
+bb172: ; preds = %bb167
+ unreachable
+
+bb173: ; preds = %bb167
+ %tmp174 = trunc i64 %tmp171 to i32
+ br label %bb175
+
+bb175: ; preds = %bb181, %bb173
+ %tmp176 = phi i64 [ %tmp183, %bb181 ], [ 0, %bb173 ]
+ %tmp177 = phi i32 [ %tmp184, %bb181 ], [ 0, %bb173 ]
+ %tmp178 = trunc i64 %tmp176 to i32
+ %tmp179 = icmp slt i32 %tmp178, %tmp174
+ br i1 %tmp179, label %bb180, label %bb186
+
+bb180: ; preds = %bb175
+ br i1 false, label %bb185, label %bb181
+
+bb181: ; preds = %bb180
+ %tmp182 = icmp eq i8 0, 0
+ %tmp183 = add i64 %tmp176, 1
+ %tmp184 = add nsw i32 %tmp177, 1
+ br i1 %tmp182, label %bb186, label %bb175
+
+bb185: ; preds = %bb180
+ br i1 false, label %bb187, label %bb186
+
+bb186: ; preds = %bb185, %bb181, %bb175
+ unreachable
+
+bb187: ; preds = %bb185
+ %tmp188 = add nsw i32 %tmp177, 1
+ %tmp189 = sext i32 %tmp188 to i64
+ %tmp190 = sub i64 %tmp171, %tmp189
+ br i1 false, label %bb192, label %bb191
+
+bb191: ; preds = %bb187
+ unreachable
+
+bb192: ; preds = %bb187
+ %tmp193 = trunc i64 %tmp190 to i32
+ br label %bb194
+
+bb194: ; preds = %bb200, %bb192
+ %tmp195 = phi i64 [ %tmp201, %bb200 ], [ 0, %bb192 ]
+ %tmp196 = phi i32 [ %tmp202, %bb200 ], [ 0, %bb192 ]
+ %tmp197 = trunc i64 %tmp195 to i32
+ %tmp198 = icmp slt i32 %tmp197, %tmp193
+ br i1 %tmp198, label %bb199, label %bb204
+
+bb199: ; preds = %bb194
+ br i1 false, label %bb203, label %bb200
+
+bb200: ; preds = %bb199
+ %tmp201 = add i64 %tmp195, 1
+ %tmp202 = add nsw i32 %tmp196, 1
+ br i1 false, label %bb204, label %bb194
+
+bb203: ; preds = %bb199
+ br i1 undef, label %bb205, label %bb204
+
+bb204: ; preds = %bb203, %bb200, %bb194
+ unreachable
+
+bb205: ; preds = %bb203
+ %tmp206 = add nsw i32 %tmp196, 1
+ %tmp207 = sext i32 %tmp206 to i64
+ %tmp208 = add i64 %tmp207, 0
+ %tmp209 = sub i64 %tmp190, %tmp207
+ br i1 %tmp13, label %bb210, label %bb211
+
+bb210: ; preds = %bb205
+ unreachable
+
+bb211: ; preds = %bb205
+ %tmp212 = trunc i64 %tmp209 to i32
+ %tmp213 = icmp slt i32 0, %tmp212
+ br i1 false, label %bb215, label %bb214
+
+bb214: ; preds = %bb211
+ unreachable
+
+bb215: ; preds = %bb211
+ %tmp216 = add i64 undef, %tmp208
+ %tmp217 = sub i64 %tmp209, undef
+ br i1 false, label %bb218, label %bb219
+
+bb218: ; preds = %bb215
+ br label %bb219
+
+bb219: ; preds = %bb218, %bb215
+ %tmp220 = trunc i64 %tmp217 to i32
+ br label %bb221
+
+bb221: ; preds = %bb230, %bb219
+ %tmp222 = phi i64 [ %tmp231, %bb230 ], [ 0, %bb219 ]
+ %tmp223 = phi i32 [ %tmp232, %bb230 ], [ 0, %bb219 ]
+ %tmp224 = trunc i64 %tmp222 to i32
+ %tmp225 = icmp slt i32 %tmp224, %tmp220
+ br i1 %tmp225, label %bb226, label %bb234
+
+bb226: ; preds = %bb221
+ %tmp227 = add i64 %tmp222, %tmp216
+ %tmp228 = getelementptr inbounds %struct.jim* @global3, i64 0, i32 3, i64 %tmp227
+ %tmp229 = load i8* %tmp228, align 1
+ br i1 false, label %bb233, label %bb230
+
+bb230: ; preds = %bb226
+ %tmp231 = add i64 %tmp222, 1
+ %tmp232 = add nsw i32 %tmp223, 1
+ br i1 undef, label %bb234, label %bb221
+
+bb233: ; preds = %bb226
+ br i1 undef, label %bb235, label %bb234
+
+bb234: ; preds = %bb233, %bb230, %bb221
+ br label %bb247
+
+bb235: ; preds = %bb233
+ %tmp236 = add nsw i32 %tmp223, 1
+ %tmp237 = sext i32 %tmp236 to i64
+ %tmp238 = sub i64 %tmp217, %tmp237
+ br i1 %tmp66, label %bb239, label %bb240
+
+bb239: ; preds = %bb235
+ unreachable
+
+bb240: ; preds = %bb235
+ switch i32 0, label %bb244 [
+ i32 0, label %bb241
+ i32 1, label %bb241
+ i32 4, label %bb241
+ i32 6, label %bb241
+ i32 9, label %bb241
+ ]
+
+bb241: ; preds = %bb240, %bb240, %bb240, %bb240, %bb240
+ %tmp242 = trunc i64 %tmp238 to i32
+ %tmp243 = icmp slt i32 0, %tmp242
+ br i1 false, label %bb246, label %bb245
+
+bb244: ; preds = %bb240
+ unreachable
+
+bb245: ; preds = %bb241
+ unreachable
+
+bb246: ; preds = %bb241
+ unreachable
+
+bb247: ; preds = %bb234, %bb144, %bb106, %bb71, %bb58
+ ret void
+}
+
+declare void @zot(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll b/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll
new file mode 100644
index 0000000..b5124ea
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll
@@ -0,0 +1,102 @@
+; RUN: llc < %s -O3 -march=thumb -mcpu=cortex-a8 | FileCheck %s
+;
+; LSR should only check for valid address modes when the IV user is a
+; memory address.
+; svn r158536, rdar://11635990
+;
+; Note that we still don't produce the best code here because we fail
+; to coalesce the IV. See <rdar://problem/11680670> [coalescer] IVs
+; need to be scheduled to expose coalescing.
+
+; LSR before the fix:
+; The chosen solution requires 4 regs, with addrec cost 1, plus 3 base adds, plus 2 setup cost:
+; LSR Use: Kind=Special, Offsets={0}, all-fixups-outside-loop, widest fixup type: i32
+; reg(%v3) + reg({0,+,-1}<%while.cond.i.i>) + imm(1)
+; LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
+; reg(%v3) + reg({0,+,-1}<%while.cond.i.i>)
+; LSR Use: Kind=Address of i32, Offsets={0}, widest fixup type: i32*
+; reg((-4 + (4 * %v3) + %v1)) + 4*reg({0,+,-1}<%while.cond.i.i>)
+; LSR Use: Kind=Address of i32, Offsets={0}, widest fixup type: i32*
+; reg((-4 + (4 * %v3) + %v4)) + 4*reg({0,+,-1}<%while.cond.i.i>)
+; LSR Use: Kind=Special, Offsets={0}, all-fixups-outside-loop, widest fixup type: i32
+; reg(%v3)
+;
+; LSR after the fix:
+; The chosen solution requires 4 regs, with addrec cost 1, plus 1 base add, plus 2 setup cost:
+; LSR Use: Kind=Special, Offsets={0}, all-fixups-outside-loop, widest fixup type: i32
+; reg({%v3,+,-1}<nsw><%while.cond.i.i>) + imm(1)
+; LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
+; reg({%v3,+,-1}<nsw><%while.cond.i.i>)
+; LSR Use: Kind=Address of i32, Offsets={0}, widest fixup type: i32*
+; reg((-4 + %v1)) + 4*reg({%v3,+,-1}<nsw><%while.cond.i.i>)
+; LSR Use: Kind=Address of i32, Offsets={0}, widest fixup type: i32*
+; reg((-4 + %v4)) + 4*reg({%v3,+,-1}<nsw><%while.cond.i.i>)
+; LSR Use: Kind=Special, Offsets={0}, all-fixups-outside-loop, widest fixup type: i32
+; reg(%v3)
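+;
+; The key difference: before the fix, LSR kept reg(%v3) and the addrec as
+; separate registers for the non-address uses, costing 3 base adds; after
+; the fix they fold into the single reg({%v3,+,-1}<nsw><%while.cond.i.i>),
+; leaving only 1 base add.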
+
+
+%s = type { i32* }
+
+@ncol = external global i32, align 4
+
+declare i32* @getptr() nounwind
+declare %s* @getstruct() nounwind
+
+; CHECK: @main
+; Check that the loop preheader contains no address computation.
+; CHECK: %end_of_chain
+; CHECK-NOT: add{{.*}}lsl
+; CHECK: ldr{{.*}}lsl #2
+; CHECK: ldr{{.*}}lsl #2
+define i32 @main() nounwind ssp {
+entry:
+ %v0 = load i32* @ncol, align 4, !tbaa !0
+ %v1 = tail call i32* @getptr() nounwind
+ %cmp10.i = icmp eq i32 %v0, 0
+ br label %while.cond.outer
+
+while.cond.outer:
+ %call18 = tail call %s* @getstruct() nounwind
+ br label %while.cond
+
+while.cond:
+ %cmp20 = icmp eq i32* %v1, null
+ br label %while.body
+
+while.body:
+ %v3 = load i32* @ncol, align 4, !tbaa !0
+ br label %end_of_chain
+
+end_of_chain:
+ %state.i = getelementptr inbounds %s* %call18, i32 0, i32 0
+ %v4 = load i32** %state.i, align 4, !tbaa !3
+ br label %while.cond.i.i
+
+while.cond.i.i:
+ %counter.0.i.i = phi i32 [ %v3, %end_of_chain ], [ %dec.i.i, %land.rhs.i.i ]
+ %dec.i.i = add nsw i32 %counter.0.i.i, -1
+ %tobool.i.i = icmp eq i32 %counter.0.i.i, 0
+ br i1 %tobool.i.i, label %where.exit, label %land.rhs.i.i
+
+land.rhs.i.i:
+ %arrayidx.i.i = getelementptr inbounds i32* %v4, i32 %dec.i.i
+ %v5 = load i32* %arrayidx.i.i, align 4, !tbaa !0
+ %arrayidx1.i.i = getelementptr inbounds i32* %v1, i32 %dec.i.i
+ %v6 = load i32* %arrayidx1.i.i, align 4, !tbaa !0
+ %cmp.i.i = icmp eq i32 %v5, %v6
+ br i1 %cmp.i.i, label %while.cond.i.i, label %equal_data.exit.i
+
+equal_data.exit.i:
+ ret i32 %counter.0.i.i
+
+where.exit:
+ br label %while.end.i
+
+while.end.i:
+ ret i32 %v3
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll b/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
index ed32ca8..c3b8b89 100644
--- a/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
@@ -90,3 +90,59 @@ for.inc498: ; preds = %for.inc498, %for.bo
while.end: ; preds = %entry
ret void
}
+
+; PR12898: SCEVExpander crash
+; Test redundant phi elimination when the deleted phi's increment is
+; itself a phi.
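+; (In this reduced case the increment %3 of %indvars.iv8.i.SV.phi26 reaches
+; it through the phi %indvars.iv8.i.SV.phi24 in %meshBB, which is the
+; pattern that crashed SCEVExpander.)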
+;
+; CHECK: @test3
+; CHECK: %for.body3.lr.ph.us.i.loopexit
+; CHECK-NEXT: in Loop: Header
+; CHECK-NEXT: incq
+; CHECK-NEXT: %for.body3.us.i
+; CHECK-NEXT: Inner Loop
+; CHECK: testb
+; CHECK: jne
+; CHECK: jmp
+define fastcc void @test3(double* nocapture %u) nounwind uwtable ssp {
+entry:
+ br i1 undef, label %meshBB1, label %meshBB5
+
+for.inc8.us.i: ; preds = %for.body3.us.i
+ br i1 undef, label %meshBB1, label %meshBB
+
+for.body3.us.i: ; preds = %meshBB, %for.body3.lr.ph.us.i
+ %indvars.iv.i.SV.phi = phi i64 [ %indvars.iv.next.i, %meshBB ], [ 0, %for.body3.lr.ph.us.i ]
+ %storemerge13.us.i.SV.phi = phi i32 [ 0, %meshBB ], [ 0, %for.body3.lr.ph.us.i ]
+ %Opq.sa.calc12 = sub i32 undef, 227
+ %0 = add nsw i64 %indvars.iv.i.SV.phi, %indvars.iv8.i.SV.phi26
+ %1 = trunc i64 %0 to i32
+ %mul.i.us.i = mul nsw i32 0, %1
+ %arrayidx5.us.i = getelementptr inbounds double* %u, i64 %indvars.iv.i.SV.phi
+ %2 = load double* %arrayidx5.us.i, align 8
+ %indvars.iv.next.i = add i64 %indvars.iv.i.SV.phi, 1
+ br i1 undef, label %for.inc8.us.i, label %meshBB
+
+for.body3.lr.ph.us.i: ; preds = %meshBB1, %meshBB
+ %indvars.iv8.i.SV.phi26 = phi i64 [ undef, %meshBB1 ], [ %indvars.iv8.i.SV.phi24, %meshBB ]
+ %arrayidx.us.i = getelementptr inbounds double* undef, i64 %indvars.iv8.i.SV.phi26
+ %3 = add i64 %indvars.iv8.i.SV.phi26, 1
+ br label %for.body3.us.i
+
+for.inc8.us.i2: ; preds = %meshBB5
+ unreachable
+
+eval_At_times_u.exit: ; preds = %meshBB5
+ ret void
+
+meshBB: ; preds = %for.body3.us.i, %for.inc8.us.i
+ %indvars.iv8.i.SV.phi24 = phi i64 [ undef, %for.body3.us.i ], [ %3, %for.inc8.us.i ]
+ %meshStackVariable.phi = phi i32 [ %Opq.sa.calc12, %for.body3.us.i ], [ undef, %for.inc8.us.i ]
+ br i1 undef, label %for.body3.lr.ph.us.i, label %for.body3.us.i
+
+meshBB1: ; preds = %for.inc8.us.i, %entry
+ br label %for.body3.lr.ph.us.i
+
+meshBB5: ; preds = %entry
+ br i1 undef, label %eval_At_times_u.exit, label %for.inc8.us.i2
+}
diff --git a/test/Transforms/LoopStrengthReduce/dont-hoist-simple-loop-constants.ll b/test/Transforms/LoopStrengthReduce/dont-hoist-simple-loop-constants.ll
index 4136486..f7a82f6 100644
--- a/test/Transforms/LoopStrengthReduce/dont-hoist-simple-loop-constants.ll
+++ b/test/Transforms/LoopStrengthReduce/dont-hoist-simple-loop-constants.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -loop-reduce -S | \
-; RUN: not grep {bitcast i32 1 to i32}
+; RUN: not grep "bitcast i32 1 to i32"
; END.
; The setlt wants to use a value that is incremented one more than the dominant
; IV. Don't insert the 1 outside the loop, preventing folding it into the add.
diff --git a/test/Transforms/LoopStrengthReduce/dont_reverse.ll b/test/Transforms/LoopStrengthReduce/dont_reverse.ll
index 4c5db04..d65213d 100644
--- a/test/Transforms/LoopStrengthReduce/dont_reverse.ll
+++ b/test/Transforms/LoopStrengthReduce/dont_reverse.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -loop-reduce -S \
-; RUN: | grep {icmp eq i2 %lsr.iv.next, %xmp4344}
+; RUN: | grep "icmp eq i2 %lsr.iv.next, %xmp4344"
; Don't reverse the iteration if the rhs of the compare is defined
; inside the loop.
diff --git a/test/Transforms/LoopStrengthReduce/invariant_value_first.ll b/test/Transforms/LoopStrengthReduce/invariant_value_first.ll
index 2ca6787..5c18809 100644
--- a/test/Transforms/LoopStrengthReduce/invariant_value_first.ll
+++ b/test/Transforms/LoopStrengthReduce/invariant_value_first.ll
@@ -1,6 +1,6 @@
; Check that the index of 'P[outer]' is pulled out of the loop.
; RUN: opt < %s -loop-reduce -S | \
-; RUN: not grep {getelementptr.*%outer.*%INDVAR}
+; RUN: not grep "getelementptr.*%outer.*%INDVAR"
target datalayout = "e-p:32:32:32-n8:16:32"
declare i1 @pred()
diff --git a/test/Transforms/LoopStrengthReduce/invariant_value_first_arg.ll b/test/Transforms/LoopStrengthReduce/invariant_value_first_arg.ll
index 86c4d91..8eb8f05 100644
--- a/test/Transforms/LoopStrengthReduce/invariant_value_first_arg.ll
+++ b/test/Transforms/LoopStrengthReduce/invariant_value_first_arg.ll
@@ -1,6 +1,6 @@
; Check that the index of 'P[outer]' is pulled out of the loop.
; RUN: opt < %s -loop-reduce -S | \
-; RUN: not grep {getelementptr.*%outer.*%INDVAR}
+; RUN: not grep "getelementptr.*%outer.*%INDVAR"
target datalayout = "e-p:32:32:32-n32"
declare i1 @pred()
diff --git a/test/Transforms/LoopStrengthReduce/pr2570.ll b/test/Transforms/LoopStrengthReduce/pr2570.ll
index 80efb9f..7b56971 100644
--- a/test/Transforms/LoopStrengthReduce/pr2570.ll
+++ b/test/Transforms/LoopStrengthReduce/pr2570.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | grep {phi\\>} | count 8
+; RUN: opt < %s -loop-reduce -S | grep "phi\>" | count 8
; PR2570
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll b/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
index 59f14fc..0118241 100644
--- a/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
+++ b/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -iv-users | grep {\{1,+,3,+,2\}<%loop> (post-inc with loop %loop)}
+; RUN: opt < %s -analyze -iv-users | grep "{1,+,3,+,2}<%loop> (post-inc with loop %loop)"
; The value of %r is dependent on a polynomial iteration expression.
diff --git a/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll b/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll
index 5ed37dd..005e4c6 100644
--- a/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll
+++ b/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -loop-reduce -S | \
-; RUN: grep {add i32 %indvar630.ui, 1}
+; RUN: grep "add i32 %indvar630.ui, 1"
;
; Make sure that the use of the IV outside of the loop (the store) uses the
; post incremented value of the IV, not the preincremented value. This
diff --git a/test/Transforms/LoopStrengthReduce/var_stride_used_by_compare.ll b/test/Transforms/LoopStrengthReduce/var_stride_used_by_compare.ll
index 64ef4f9..3405b26 100644
--- a/test/Transforms/LoopStrengthReduce/var_stride_used_by_compare.ll
+++ b/test/Transforms/LoopStrengthReduce/var_stride_used_by_compare.ll
@@ -1,10 +1,10 @@
; Base should not be i*3, it should be i*2.
; RUN: opt < %s -loop-reduce -S | \
-; RUN: not grep {mul.*%i, 3}
+; RUN: not grep "mul.*%i, 3"
; Indvar should not start at zero:
; RUN: opt < %s -loop-reduce -S | \
-; RUN: not grep {phi i32 .* 0}
+; RUN: not grep "phi i32 .* 0"
; END.
; mul uint %i, 3
diff --git a/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll b/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll
index 20f2c2b..9d73d31 100644
--- a/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll
+++ b/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-unswitch -stats -disable-output |& grep "1 loop-unswitch - Number of branches unswitched" | count 1
+; RUN: opt < %s -loop-unswitch -stats -disable-output 2>&1 | grep "1 loop-unswitch - Number of branches unswitched" | count 1
; PR 3170
define i32 @a(i32 %x, i32 %y) nounwind {
entry:
diff --git a/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll b/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll
index 8389fe4..c1fd588 100644
--- a/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll
+++ b/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll
@@ -35,11 +35,11 @@
; CHECK: loop_begin.us1: ; preds = %loop_begin.backedge.us5, %.split.split.us
; CHECK-NEXT: %var_val.us2 = load i32* %var
; CHECK-NEXT: switch i32 2, label %default.us-lcssa.us-lcssa.us [
-; CHECK-NEXT: i32 1, label %inc.us3
-; CHECK-NEXT: i32 2, label %dec.us4
+; CHECK-NEXT: i32 1, label %inc.us4
+; CHECK-NEXT: i32 2, label %dec.us3
; CHECK-NEXT: ]
-; CHECK: dec.us4: ; preds = %loop_begin.us1
+; CHECK: dec.us3: ; preds = %loop_begin.us1
; CHECK-NEXT: call void @decf() noreturn nounwind
; CHECK-NEXT: br label %loop_begin.backedge.us5
@@ -81,7 +81,7 @@ inc:
dec:
call void @decf() noreturn nounwind
br label %loop_begin
-default:
+default:
br label %loop_exit
loop_exit:
ret i32 0
diff --git a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll
index 05d98d5..f3db471 100644
--- a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll
+++ b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll
@@ -19,15 +19,15 @@
; CHECK: switch i32 1, label %second_switch.us [
; CHECK-NEXT: i32 1, label %inc.us
-; CHECK: inc.us: ; preds = %second_switch.us, %loop_begin.us
-; CHECK-NEXT: call void @incf() noreturn nounwind
-; CHECK-NEXT: br label %loop_begin.backedge.us
-
; CHECK: second_switch.us: ; preds = %loop_begin.us
; CHECK-NEXT: switch i32 %d, label %default.us [
; CHECK-NEXT: i32 1, label %inc.us
; CHECK-NEXT: ]
+; CHECK: inc.us: ; preds = %second_switch.us, %loop_begin.us
+; CHECK-NEXT: call void @incf() noreturn nounwind
+; CHECK-NEXT: br label %loop_begin.backedge.us
+
; CHECK: .split: ; preds = %..split_crit_edge
; CHECK-NEXT: br label %loop_begin
@@ -73,7 +73,7 @@ inc:
call void @incf() noreturn nounwind
br label %loop_begin
-default:
+default:
br label %loop_begin
loop_exit:
diff --git a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll
index 1b186d6..2708996 100644
--- a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll
+++ b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll
@@ -25,14 +25,14 @@
; CHECK-NEXT: switch i32 1, label %second_switch.us.us [
; CHECK-NEXT: i32 1, label %inc.us.us
-; CHECK: inc.us.us: ; preds = %second_switch.us.us, %loop_begin.us.us
-; CHECK-NEXT: call void @incf() noreturn nounwind
-; CHECK-NEXT: br label %loop_begin.backedge.us.us
-
; CHECK: second_switch.us.us: ; preds = %loop_begin.us.us
; CHECK-NEXT: switch i32 1, label %default.us.us [
; CHECK-NEXT: i32 1, label %inc.us.us
+; CHECK: inc.us.us: ; preds = %second_switch.us.us, %loop_begin.us.us
+; CHECK-NEXT: call void @incf() noreturn nounwind
+; CHECK-NEXT: br label %loop_begin.backedge.us.us
+
; CHECK: .split.us.split: ; preds = %.split.us..split.us.split_crit_edge
; CHECK-NEXT: br label %loop_begin.us
@@ -41,10 +41,6 @@
; CHECK-NEXT: switch i32 1, label %second_switch.us [
; CHECK-NEXT: i32 1, label %inc.us
-; CHECK: inc.us: ; preds = %second_switch.us.inc.us_crit_edge, %loop_begin.us
-; CHECK-NEXT: call void @incf() noreturn nounwind
-; CHECK-NEXT: br label %loop_begin.backedge.us
-
; CHECK: second_switch.us: ; preds = %loop_begin.us
; CHECK-NEXT: switch i32 %d, label %default.us [
; CHECK-NEXT: i32 1, label %second_switch.us.inc.us_crit_edge
@@ -53,6 +49,10 @@
; CHECK: second_switch.us.inc.us_crit_edge: ; preds = %second_switch.us
; CHECK-NEXT: br i1 true, label %us-unreachable8, label %inc.us
+; CHECK: inc.us: ; preds = %second_switch.us.inc.us_crit_edge, %loop_begin.us
+; CHECK-NEXT: call void @incf() noreturn nounwind
+; CHECK-NEXT: br label %loop_begin.backedge.us
+
; CHECK: .split: ; preds = %..split_crit_edge
; CHECK-NEXT: %3 = icmp eq i32 %d, 1
; CHECK-NEXT: br i1 %3, label %.split.split.us, label %.split..split.split_crit_edge
@@ -65,21 +65,21 @@
; CHECK: loop_begin.us1: ; preds = %loop_begin.backedge.us6, %.split.split.us
; CHECK-NEXT: %var_val.us2 = load i32* %var
-; CHECK-NEXT: switch i32 %c, label %second_switch.us4 [
+; CHECK-NEXT: switch i32 %c, label %second_switch.us3 [
; CHECK-NEXT: i32 1, label %loop_begin.inc_crit_edge.us
; CHECK-NEXT: ]
-; CHECK: inc.us3: ; preds = %loop_begin.inc_crit_edge.us, %second_switch.us4
-; CHECK-NEXT: call void @incf() noreturn nounwind
-; CHECK-NEXT: br label %loop_begin.backedge.us6
-
-; CHECK: second_switch.us4: ; preds = %loop_begin.us1
+; CHECK: second_switch.us3: ; preds = %loop_begin.us1
; CHECK-NEXT: switch i32 1, label %default.us5 [
-; CHECK-NEXT: i32 1, label %inc.us3
+; CHECK-NEXT: i32 1, label %inc.us4
; CHECK-NEXT: ]
+; CHECK: inc.us4: ; preds = %loop_begin.inc_crit_edge.us, %second_switch.us3
+; CHECK-NEXT: call void @incf() noreturn nounwind
+; CHECK-NEXT: br label %loop_begin.backedge.us6
+
; CHECK: loop_begin.inc_crit_edge.us: ; preds = %loop_begin.us1
-; CHECK-NEXT: br i1 true, label %us-unreachable.us-lcssa.us, label %inc.us3
+; CHECK-NEXT: br i1 true, label %us-unreachable.us-lcssa.us, label %inc.us4
; CHECK: .split.split: ; preds = %.split..split.split_crit_edge
; CHECK-NEXT: br label %loop_begin
@@ -127,7 +127,7 @@ inc:
call void @incf() noreturn nounwind
br label %loop_begin
-default:
+default:
br label %loop_begin
loop_exit:
diff --git a/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll b/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll
new file mode 100644
index 0000000..4c63a56
--- /dev/null
+++ b/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -loop-unswitch -disable-output
+; PR12887
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common global i32 0, align 4
+@c = common global i32 0, align 4
+@b = common global i32 0, align 4
+
+define void @func() noreturn nounwind uwtable {
+entry:
+ %0 = load i32* @a, align 4
+ %tobool = icmp eq i32 %0, 0
+ %1 = load i32* @b, align 4
+ br label %while.body
+
+while.body: ; preds = %while.body, %entry
+ %d.0 = phi i8 [ undef, %entry ], [ %conv2, %while.body ]
+ %conv = sext i8 %d.0 to i32
+ %cond = select i1 %tobool, i32 0, i32 %conv
+ %conv11 = zext i8 %d.0 to i32
+ %add = add i32 %1, %conv11
+ %conv2 = trunc i32 %add to i8
+ br label %while.body
+}
diff --git a/test/Transforms/LowerSwitch/feature.ll b/test/Transforms/LowerSwitch/feature.ll
index cdfa0f3..cc77d3c 100644
--- a/test/Transforms/LowerSwitch/feature.ll
+++ b/test/Transforms/LowerSwitch/feature.ll
@@ -1,10 +1,99 @@
-; RUN: opt < %s -lowerswitch -S > %t
-; RUN: grep slt %t | count 10
-; RUN: grep ule %t | count 3
-; RUN: grep eq %t | count 9
+; RUN: opt < %s -lowerswitch -S | FileCheck %s
+
+; We have a switch on input.
+; On output we should get a binary comparison tree. Check that it is all correct.
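+; For example, NodeBlock37 below pivots the case range at 11 with an
+; icmp ult, descending into NodeBlock13 for the smaller cases and
+; NodeBlock35 for the rest, until each leaf tests a single case (icmp eq)
+; or a contiguous run of cases (add + icmp ule).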
+
+;CHECK: entry:
+;CHECK-NEXT: br label %NodeBlock37
+
+;CHECK: NodeBlock37: ; preds = %entry
+;CHECK-NEXT: %Pivot38 = icmp ult i32 %tmp158, 11
+;CHECK-NEXT: br i1 %Pivot38, label %NodeBlock13, label %NodeBlock35
+
+;CHECK: NodeBlock35: ; preds = %NodeBlock37
+;CHECK-NEXT: %Pivot36 = icmp ult i32 %tmp158, 14
+;CHECK-NEXT: br i1 %Pivot36, label %NodeBlock23, label %NodeBlock33
+
+;CHECK: NodeBlock33: ; preds = %NodeBlock35
+;CHECK-NEXT: %Pivot34 = icmp ult i32 %tmp158, 15
+;CHECK-NEXT: br i1 %Pivot34, label %LeafBlock25, label %NodeBlock31
+
+;CHECK: NodeBlock31: ; preds = %NodeBlock33
+;CHECK-NEXT: %Pivot32 = icmp ult i32 %tmp158, -6
+;CHECK-NEXT: br i1 %Pivot32, label %LeafBlock27, label %LeafBlock29
+
+;CHECK: LeafBlock29: ; preds = %NodeBlock31
+;CHECK-NEXT: %tmp158.off = add i32 %tmp158, 6
+;CHECK-NEXT: %SwitchLeaf30 = icmp ule i32 %tmp158.off, 4
+;CHECK-NEXT: br i1 %SwitchLeaf30, label %bb338, label %NewDefault
+
+;CHECK: LeafBlock27: ; preds = %NodeBlock31
+;CHECK-NEXT: %SwitchLeaf28 = icmp eq i32 %tmp158, 15
+;CHECK-NEXT: br i1 %SwitchLeaf28, label %bb334, label %NewDefault
+
+;CHECK: LeafBlock25: ; preds = %NodeBlock33
+;CHECK-NEXT: %SwitchLeaf26 = icmp eq i32 %tmp158, 14
+;CHECK-NEXT: br i1 %SwitchLeaf26, label %bb332, label %NewDefault
+
+;CHECK: NodeBlock23: ; preds = %NodeBlock35
+;CHECK-NEXT: %Pivot24 = icmp ult i32 %tmp158, 12
+;CHECK-NEXT: br i1 %Pivot24, label %LeafBlock15, label %NodeBlock21
+
+;CHECK: NodeBlock21: ; preds = %NodeBlock23
+;CHECK-NEXT: %Pivot22 = icmp ult i32 %tmp158, 13
+;CHECK-NEXT: br i1 %Pivot22, label %LeafBlock17, label %LeafBlock19
+
+;CHECK: LeafBlock19: ; preds = %NodeBlock21
+;CHECK-NEXT: %SwitchLeaf20 = icmp eq i32 %tmp158, 13
+;CHECK-NEXT: br i1 %SwitchLeaf20, label %bb330, label %NewDefault
+
+;CHECK: LeafBlock17: ; preds = %NodeBlock21
+;CHECK-NEXT: %SwitchLeaf18 = icmp eq i32 %tmp158, 12
+;CHECK-NEXT: br i1 %SwitchLeaf18, label %bb328, label %NewDefault
+
+;CHECK: LeafBlock15: ; preds = %NodeBlock23
+;CHECK-NEXT: %SwitchLeaf16 = icmp eq i32 %tmp158, 11
+;CHECK-NEXT: br i1 %SwitchLeaf16, label %bb326, label %NewDefault
+
+;CHECK: NodeBlock13: ; preds = %NodeBlock37
+;CHECK-NEXT: %Pivot14 = icmp ult i32 %tmp158, 8
+;CHECK-NEXT: br i1 %Pivot14, label %NodeBlock, label %NodeBlock11
+
+;CHECK: NodeBlock11: ; preds = %NodeBlock13
+;CHECK-NEXT: %Pivot12 = icmp ult i32 %tmp158, 9
+;CHECK-NEXT: br i1 %Pivot12, label %LeafBlock3, label %NodeBlock9
+
+;CHECK: NodeBlock9: ; preds = %NodeBlock11
+;CHECK-NEXT: %Pivot10 = icmp ult i32 %tmp158, 10
+;CHECK-NEXT: br i1 %Pivot10, label %LeafBlock5, label %LeafBlock7
+
+;CHECK: LeafBlock7: ; preds = %NodeBlock9
+;CHECK-NEXT: %SwitchLeaf8 = icmp eq i32 %tmp158, 10
+;CHECK-NEXT: br i1 %SwitchLeaf8, label %bb324, label %NewDefault
+
+;CHECK: LeafBlock5: ; preds = %NodeBlock9
+;CHECK-NEXT: %SwitchLeaf6 = icmp eq i32 %tmp158, 9
+;CHECK-NEXT: br i1 %SwitchLeaf6, label %bb322, label %NewDefault
+
+;CHECK: LeafBlock3: ; preds = %NodeBlock11
+;CHECK-NEXT: %SwitchLeaf4 = icmp eq i32 %tmp158, 8
+;CHECK-NEXT: br i1 %SwitchLeaf4, label %bb338, label %NewDefault
+
+;CHECK: NodeBlock: ; preds = %NodeBlock13
+;CHECK-NEXT: %Pivot = icmp ult i32 %tmp158, 7
+;CHECK-NEXT: br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+;CHECK: LeafBlock1: ; preds = %NodeBlock
+;CHECK-NEXT: %SwitchLeaf2 = icmp eq i32 %tmp158, 7
+;CHECK-NEXT: br i1 %SwitchLeaf2, label %bb, label %NewDefault
+
+;CHECK: LeafBlock: ; preds = %NodeBlock
+;CHECK-NEXT: %SwitchLeaf = icmp ule i32 %tmp158, 6
+;CHECK-NEXT: br i1 %SwitchLeaf, label %bb338, label %NewDefault
define i32 @main(i32 %tmp158) {
entry:
+
switch i32 %tmp158, label %bb336 [
i32 -2, label %bb338
i32 -3, label %bb338
diff --git a/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll b/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
index b95ad91..d124be5 100644
--- a/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
+++ b/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -memcpyopt -dse -S | grep {call.*initialize} | not grep memtmp
+; RUN: opt < %s -basicaa -memcpyopt -dse -S | grep "call.*initialize" | not grep memtmp
; PR2077
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
index 24cf576..61ba3c7 100644
--- a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
+++ b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -memcpyopt -S | not grep {call.*memcpy.}
+; RUN: opt < %s -basicaa -memcpyopt -S | not grep "call.*memcpy."
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
%a = type { i32 }
diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll
index 63d0ebf..3fa1628 100644
--- a/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/test/Transforms/MemCpyOpt/memcpy.ll
@@ -148,3 +148,25 @@ define void @test8() {
}
declare noalias i8* @malloc(i32)
+
+; rdar://11341081
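+; The sret temporary %tmp is dead once it has been copied into %b, so
+; MemCpyOpt should be able to pass %b to @f1 directly and drop the
+; intervening memcpy (hence the CHECK-NOT below).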
+%struct.big = type { [50 x i32] }
+
+define void @test9() nounwind uwtable ssp {
+entry:
+; CHECK: test9
+; CHECK: f1
+; CHECK-NOT: memcpy
+; CHECK: f2
+ %b = alloca %struct.big, align 4
+ %tmp = alloca %struct.big, align 4
+ call void @f1(%struct.big* sret %tmp)
+ %0 = bitcast %struct.big* %b to i8*
+ %1 = bitcast %struct.big* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 200, i32 4, i1 false)
+ call void @f2(%struct.big* %b)
+ ret void
+}
+
+declare void @f1(%struct.big* sret)
+declare void @f2(%struct.big*)
diff --git a/test/Transforms/MemCpyOpt/sret.ll b/test/Transforms/MemCpyOpt/sret.ll
index 8eac7da..1bbb5fe 100644
--- a/test/Transforms/MemCpyOpt/sret.ll
+++ b/test/Transforms/MemCpyOpt/sret.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -memcpyopt -S | not grep {call.*memcpy}
+; RUN: opt < %s -basicaa -memcpyopt -S | not grep "call.*memcpy"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin9"
diff --git a/test/Transforms/MergeFunc/fold-weak.ll b/test/Transforms/MergeFunc/fold-weak.ll
index 23e4d33..4df6e39 100644
--- a/test/Transforms/MergeFunc/fold-weak.ll
+++ b/test/Transforms/MergeFunc/fold-weak.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -mergefunc -S > %t
-; RUN: grep {define weak} %t | count 2
-; RUN: grep {call} %t | count 2
+; RUN: grep "define weak" %t | count 2
+; RUN: grep "call" %t | count 2
; XFAIL: *
; This test is off for a bit as we change this particular sort of folding to
diff --git a/test/Transforms/MergeFunc/phi-speculation1.ll b/test/Transforms/MergeFunc/phi-speculation1.ll
index 7b2a2fe..fd0baff 100644
--- a/test/Transforms/MergeFunc/phi-speculation1.ll
+++ b/test/Transforms/MergeFunc/phi-speculation1.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mergefunc -stats -disable-output |& not grep {functions merged}
+; RUN: opt < %s -mergefunc -stats -disable-output 2>&1 | not grep "functions merged"
define i32 @foo1(i32 %x) {
entry:
diff --git a/test/Transforms/MergeFunc/phi-speculation2.ll b/test/Transforms/MergeFunc/phi-speculation2.ll
index f080191..eec8b5c 100644
--- a/test/Transforms/MergeFunc/phi-speculation2.ll
+++ b/test/Transforms/MergeFunc/phi-speculation2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mergefunc -stats -disable-output |& grep {functions merged}
+; RUN: opt < %s -mergefunc -stats -disable-output 2>&1 | grep "functions merged"
define i32 @foo1(i32 %x) {
entry:
diff --git a/test/Transforms/MergeFunc/vector.ll b/test/Transforms/MergeFunc/vector.ll
index 6954fce..4af079f 100644
--- a/test/Transforms/MergeFunc/vector.ll
+++ b/test/Transforms/MergeFunc/vector.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mergefunc -stats -disable-output < %s |& grep {functions merged}
+; RUN: opt -mergefunc -stats -disable-output < %s 2>&1 | grep "functions merged"
; This test checks whether we can merge
; vector<intptr_t>::push_back(0)
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index ba2f778..d9bb3f2 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -1871,6 +1871,30 @@ return: ; preds = %if.then, %entry
ret i8* %retval
}
+; An objc_retain can serve as a may-use for a different pointer.
+; rdar://11931823
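+; (Presumably the retain of %tmp8, a select over %cond, acts as a may-use
+; of %cond, so the retain/release pair on %cond must be kept.)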
+
+; CHECK: define void @test66(
+; CHECK: %tmp7 = tail call i8* @objc_retain(i8* %cond) nounwind
+; CHECK: tail call void @objc_release(i8* %cond) nounwind
+; CHECK: }
+define void @test66(i8* %tmp5, i8* %bar, i1 %tobool, i1 %tobool1, i8* %call) {
+entry:
+ br i1 %tobool, label %cond.true, label %cond.end
+
+cond.true:
+ br label %cond.end
+
+cond.end: ; preds = %cond.true, %entry
+ %cond = phi i8* [ %tmp5, %cond.true ], [ %call, %entry ]
+ %tmp7 = tail call i8* @objc_retain(i8* %cond) nounwind
+ tail call void @objc_release(i8* %call) nounwind
+ %tmp8 = select i1 %tobool1, i8* %cond, i8* %bar
+ %tmp9 = tail call i8* @objc_retain(i8* %tmp8) nounwind
+ tail call void @objc_release(i8* %cond) nounwind
+ ret void
+}
+
declare void @bar(i32 ()*)
; A few real-world testcases.
diff --git a/test/Transforms/ObjCARC/contract-storestrong.ll b/test/Transforms/ObjCARC/contract-storestrong.ll
index 4ff0596..2922f81 100644
--- a/test/Transforms/ObjCARC/contract-storestrong.ll
+++ b/test/Transforms/ObjCARC/contract-storestrong.ll
@@ -4,6 +4,7 @@ target datalayout = "e-p:64:64:64"
declare i8* @objc_retain(i8*)
declare void @objc_release(i8*)
+declare void @use_pointer(i8*)
@x = external global i8*
@@ -57,3 +58,112 @@ entry:
tail call void @objc_release(i8* %tmp) nounwind
ret void
}
+
+; Don't do this if there's a use of the old pointer value between the store
+; and the release.
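+; (Contraction would rewrite the retain/load/store/release sequence into a
+; single objc_storeStrong(i8** @x, i8* %newValue) call, as test5 and test6
+; below show; an intervening use of the old value blocks it.)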
+
+; CHECK: define void @test3(i8* %newValue) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+; CHECK-NEXT: %x1 = load i8** @x, align 8
+; CHECK-NEXT: store i8* %x0, i8** @x, align 8
+; CHECK-NEXT: tail call void @use_pointer(i8* %x1), !clang.arc.no_objc_arc_exceptions !0
+; CHECK-NEXT: tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test3(i8* %newValue) {
+entry:
+ %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+ %x1 = load i8** @x, align 8
+ store i8* %newValue, i8** @x, align 8
+ tail call void @use_pointer(i8* %x1), !clang.arc.no_objc_arc_exceptions !0
+ tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+ ret void
+}
+
+; Like test3, but with an icmp use instead of a call, for good measure.
+
+; CHECK: define i1 @test4(i8* %newValue, i8* %foo) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+; CHECK-NEXT: %x1 = load i8** @x, align 8
+; CHECK-NEXT: store i8* %x0, i8** @x, align 8
+; CHECK-NEXT: %t = icmp eq i8* %x1, %foo
+; CHECK-NEXT: tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+; CHECK-NEXT: ret i1 %t
+; CHECK-NEXT: }
+define i1 @test4(i8* %newValue, i8* %foo) {
+entry:
+ %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+ %x1 = load i8** @x, align 8
+ store i8* %newValue, i8** @x, align 8
+ %t = icmp eq i8* %x1, %foo
+ tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+ ret i1 %t
+}
+
+; Do form an objc_storeStrong here, because the use is before the store.
+
+; CHECK: define i1 @test5(i8* %newValue, i8* %foo) {
+; CHECK: %t = icmp eq i8* %x1, %foo
+; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) nounwind
+define i1 @test5(i8* %newValue, i8* %foo) {
+entry:
+ %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+ %x1 = load i8** @x, align 8
+ %t = icmp eq i8* %x1, %foo
+ store i8* %newValue, i8** @x, align 8
+ tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+ ret i1 %t
+}
+
+; Like test5, but the release is before the store.
+
+; CHECK: define i1 @test6(i8* %newValue, i8* %foo) {
+; CHECK: %t = icmp eq i8* %x1, %foo
+; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) nounwind
+define i1 @test6(i8* %newValue, i8* %foo) {
+entry:
+ %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+ %x1 = load i8** @x, align 8
+ tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+ %t = icmp eq i8* %x1, %foo
+ store i8* %newValue, i8** @x, align 8
+ ret i1 %t
+}
+
+; Like test0, but there's no store, so don't form an objc_storeStrong.
+
+; CHECK: define void @test7(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %p) nounwind
+; CHECK-NEXT: %tmp = load i8** @x, align 8
+; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test7(i8* %p) {
+entry:
+ %0 = tail call i8* @objc_retain(i8* %p) nounwind
+ %tmp = load i8** @x, align 8
+ tail call void @objc_release(i8* %tmp) nounwind
+ ret void
+}
+
+; Like test0, but there's no retain, so don't form an objc_storeStrong.
+
+; CHECK: define void @test8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %tmp = load i8** @x, align 8
+; CHECK-NEXT: store i8* %p, i8** @x, align 8
+; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test8(i8* %p) {
+entry:
+ %tmp = load i8** @x, align 8
+ store i8* %p, i8** @x, align 8
+ tail call void @objc_release(i8* %tmp) nounwind
+ ret void
+}
+
+!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/contract-testcases.ll b/test/Transforms/ObjCARC/contract-testcases.ll
index 69fa837..1510ed0 100644
--- a/test/Transforms/ObjCARC/contract-testcases.ll
+++ b/test/Transforms/ObjCARC/contract-testcases.ll
@@ -4,17 +4,17 @@
%0 = type opaque
%1 = type opaque
%2 = type { i64, i64 }
-%3 = type { i8*, i8* }
%4 = type opaque
declare %0* @"\01-[NSAttributedString(Terminal) pathAtIndex:effectiveRange:]"(%1*, i8* nocapture, i64, %2*) optsize
declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_msgSend_fixup(i8*, %3*, ...)
+declare i8* @objc_msgSend_fixup(i8*, i8*, ...)
+declare i8* @objc_msgSend(i8*, i8*, ...)
declare void @objc_release(i8*)
declare %2 @NSUnionRange(i64, i64, i64, i64) optsize
declare i8* @objc_autoreleaseReturnValue(i8*)
declare i8* @objc_autorelease(i8*)
-declare i8* @objc_msgSend() nonlazybind
+declare i32 @__gxx_personality_sj0(...)
; Don't get in trouble on bugpointed code.
@@ -52,7 +52,7 @@ bb6: ; preds = %bb5, %bb4, %bb4, %b
; CHECK: %tmp8 = phi %0* [ %0, %bb ], [ %0, %bb ]
define void @test1() {
bb:
- %tmp = tail call %0* bitcast (i8* ()* @objc_msgSend to %0* ()*)()
+ %tmp = tail call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* ()*)()
%tmp2 = bitcast %0* %tmp to i8*
%tmp3 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tmp2) nounwind
br i1 undef, label %bb7, label %bb7
@@ -61,3 +61,30 @@ bb7: ; preds = %bb6, %bb6, %bb5
%tmp8 = phi %0* [ %tmp, %bb ], [ %tmp, %bb ]
unreachable
}
+
+; When looking for the defining instruction for an objc_retainAutoreleasedReturnValue
+; call, handle the case where it's an invoke in a different basic block.
+; rdar://11714057
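+; (The !clang.arc.retainAutoreleasedReturnValueMarker string "mov r7, r7"
+; must be emitted in %invoke.cont, the invoke's normal destination, rather
+; than immediately after the call site.)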
+
+; CHECK: define void @_Z6doTestP8NSString() {
+; CHECK: invoke.cont: ; preds = %entry
+; CHECK-NEXT: call void asm sideeffect "mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue", ""()
+; CHECK-NEXT: %tmp = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+define void @_Z6doTestP8NSString() {
+entry:
+ %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* ()*)()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ %tmp = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+ unreachable
+
+lpad: ; preds = %entry
+ %tmp1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+ cleanup
+ resume { i8*, i32 } undef
+}
+
+!clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
+
+!0 = metadata !{metadata !"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"}
diff --git a/test/Transforms/ObjCARC/split-backedge.ll b/test/Transforms/ObjCARC/split-backedge.ll
new file mode 100644
index 0000000..08e2dce
--- /dev/null
+++ b/test/Transforms/ObjCARC/split-backedge.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -objc-arc < %s | FileCheck %s
+
+; Handle a retain+release pair entirely contained within a split loop backedge.
+; rdar://11256239
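+; (The CHECK lines pin down that none of the three retain/release pairs is
+; removed here; presumably pairing across the split backedge would be
+; unsafe in this configuration.)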
+
+; CHECK: define void @test0
+; CHECK: call i8* @objc_retain(i8* %call) nounwind
+; CHECK: call i8* @objc_retain(i8* %call) nounwind
+; CHECK: call i8* @objc_retain(i8* %cond) nounwind
+; CHECK: call void @objc_release(i8* %call) nounwind
+; CHECK: call void @objc_release(i8* %call) nounwind
+; CHECK: call void @objc_release(i8* %cond) nounwind
+define void @test0() {
+entry:
+ br label %while.body
+
+while.body: ; preds = %while.cond
+ %call = invoke i8* @returner()
+ to label %invoke.cont unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0
+
+invoke.cont: ; preds = %while.body
+ %t0 = call i8* @objc_retain(i8* %call) nounwind
+ %t1 = call i8* @objc_retain(i8* %call) nounwind
+ %call.i1 = invoke i8* @returner()
+ to label %invoke.cont1 unwind label %lpad
+
+invoke.cont1: ; preds = %invoke.cont
+ %cond = select i1 undef, i8* null, i8* %call
+ %t2 = call i8* @objc_retain(i8* %cond) nounwind
+ call void @objc_release(i8* %call) nounwind
+ call void @objc_release(i8* %call) nounwind
+ call void @use_pointer(i8* %cond)
+ call void @objc_release(i8* %cond) nounwind
+ br label %while.body
+
+lpad: ; preds = %invoke.cont, %while.body
+ %t4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
+ catch i8* null
+ ret void
+}
+
+declare i8* @returner()
+declare i32 @__objc_personality_v0(...)
+declare void @objc_release(i8*)
+declare i8* @objc_retain(i8*)
+declare void @use_pointer(i8*)
+
+!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/weak-dce.ll b/test/Transforms/ObjCARC/weak-dce.ll
new file mode 100644
index 0000000..f094671
--- /dev/null
+++ b/test/Transforms/ObjCARC/weak-dce.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -basicaa -objc-arc < %s | FileCheck %s
+; rdar://11434915
+
+; Delete the weak calls and replace them with just the net retain.
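+; For example, objc_initWeak(%p) + objc_loadWeakRetained + objc_destroyWeak
+; with no intervening writes is just a retain of %p (test0 and test1), and
+; an intervening objc_storeWeak(%q) redirects the retain to %q (test2).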
+
+; CHECK: define void @test0(i8* %p) {
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: ret void
+
+define void @test0(i8* %p) {
+ %weakBlock = alloca i8*, align 8
+ %tmp7 = call i8* @objc_initWeak(i8** %weakBlock, i8* %p) nounwind
+ %tmp26 = call i8* @objc_loadWeakRetained(i8** %weakBlock) nounwind
+ call void @objc_destroyWeak(i8** %weakBlock) nounwind
+ ret void
+}
+
+; CHECK: define i8* @test1(i8* %p) {
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: ret i8* %p
+
+define i8* @test1(i8* %p) {
+ %weakBlock = alloca i8*, align 8
+ %tmp7 = call i8* @objc_initWeak(i8** %weakBlock, i8* %p) nounwind
+ %tmp26 = call i8* @objc_loadWeakRetained(i8** %weakBlock) nounwind
+ call void @objc_destroyWeak(i8** %weakBlock) nounwind
+ ret i8* %tmp26
+}
+
+; CHECK: define i8* @test2(i8* %p, i8* %q) {
+; CHECK-NEXT: call i8* @objc_retain(i8* %q)
+; CHECK-NEXT: ret i8* %q
+
+define i8* @test2(i8* %p, i8* %q) {
+ %weakBlock = alloca i8*, align 8
+ %tmp7 = call i8* @objc_initWeak(i8** %weakBlock, i8* %p) nounwind
+ %tmp19 = call i8* @objc_storeWeak(i8** %weakBlock, i8* %q) nounwind
+ %tmp26 = call i8* @objc_loadWeakRetained(i8** %weakBlock) nounwind
+ call void @objc_destroyWeak(i8** %weakBlock) nounwind
+ ret i8* %tmp26
+}
+
+declare i8* @objc_initWeak(i8**, i8*)
+declare void @objc_destroyWeak(i8**)
+declare i8* @objc_loadWeakRetained(i8**)
+declare i8* @objc_storeWeak(i8** %weakBlock, i8* %q)
diff --git a/test/Transforms/PhaseOrdering/PR6627.ll b/test/Transforms/PhaseOrdering/PR6627.ll
new file mode 100644
index 0000000..ef9947f
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/PR6627.ll
@@ -0,0 +1,93 @@
+; RUN: opt -O3 -S %s | FileCheck %s
+; XFAIL: *
+
+declare i32 @doo(...)
+
+; PR6627 - This whole nasty sequence should be flattened down to a single
+; 32-bit comparison.
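+; The four byte tests (127, 69, 76, 70 at offsets 0..3) are the
+; little-endian bytes of 0x464C457F = 1179403647, the single i32 constant
+; the CHECK lines below expect.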
+define void @test2(i8* %arrayidx) nounwind ssp {
+entry:
+ %xx = bitcast i8* %arrayidx to i32*
+ %x1 = load i32* %xx, align 4
+ %tmp = trunc i32 %x1 to i8
+ %conv = zext i8 %tmp to i32
+ %cmp = icmp eq i32 %conv, 127
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %entry
+ %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1
+ %tmp5 = load i8* %arrayidx4, align 1
+ %conv6 = zext i8 %tmp5 to i32
+ %cmp7 = icmp eq i32 %conv6, 69
+ br i1 %cmp7, label %land.lhs.true9, label %if.end
+
+land.lhs.true9: ; preds = %land.lhs.true
+ %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2
+ %tmp13 = load i8* %arrayidx12, align 1
+ %conv14 = zext i8 %tmp13 to i32
+ %cmp15 = icmp eq i32 %conv14, 76
+ br i1 %cmp15, label %land.lhs.true17, label %if.end
+
+land.lhs.true17: ; preds = %land.lhs.true9
+ %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3
+ %tmp21 = load i8* %arrayidx20, align 1
+ %conv22 = zext i8 %tmp21 to i32
+ %cmp23 = icmp eq i32 %conv22, 70
+ br i1 %cmp23, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true17
+ %call25 = call i32 (...)* @doo()
+ br label %if.end
+
+if.end:
+ ret void
+
+; CHECK: @test2
+; CHECK: %x1 = load i32* %xx, align 4
+; CHECK-NEXT: icmp eq i32 %x1, 1179403647
+; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end
+}
+
+; PR6627 - This should all be flattened down to one compare. This is the same
+; as test2, except that the initial load is done as an i8 instead of i32, thus
+; requiring widening.
+define void @test2a(i8* %arrayidx) nounwind ssp {
+entry:
+ %x1 = load i8* %arrayidx, align 4
+ %conv = zext i8 %x1 to i32
+ %cmp = icmp eq i32 %conv, 127
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %entry
+ %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1
+ %tmp5 = load i8* %arrayidx4, align 1
+ %conv6 = zext i8 %tmp5 to i32
+ %cmp7 = icmp eq i32 %conv6, 69
+ br i1 %cmp7, label %land.lhs.true9, label %if.end
+
+land.lhs.true9: ; preds = %land.lhs.true
+ %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2
+ %tmp13 = load i8* %arrayidx12, align 1
+ %conv14 = zext i8 %tmp13 to i32
+ %cmp15 = icmp eq i32 %conv14, 76
+ br i1 %cmp15, label %land.lhs.true17, label %if.end
+
+land.lhs.true17: ; preds = %land.lhs.true9
+ %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3
+ %tmp21 = load i8* %arrayidx20, align 1
+ %conv22 = zext i8 %tmp21 to i32
+ %cmp23 = icmp eq i32 %conv22, 70
+ br i1 %cmp23, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true17
+ %call25 = call i32 (...)* @doo()
+ br label %if.end
+
+if.end:
+ ret void
+
+; CHECK: @test2a
+; CHECK: %x1 = load i32* {{.*}}, align 4
+; CHECK-NEXT: icmp eq i32 %x1, 1179403647
+; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end
+}
diff --git a/test/Transforms/PhaseOrdering/basic.ll b/test/Transforms/PhaseOrdering/basic.ll
index e5b2ba4..88ebca0 100644
--- a/test/Transforms/PhaseOrdering/basic.ll
+++ b/test/Transforms/PhaseOrdering/basic.ll
@@ -1,5 +1,4 @@
; RUN: opt -O3 -S %s | FileCheck %s
-; XFAIL: *
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-macosx10.6.7"
@@ -24,95 +23,29 @@ define void @test1() nounwind ssp {
; CHECK-NEXT: ret void
}
-
-; PR6627 - This whole nasty sequence should be flattened down to a single
-; 32-bit comparison.
-define void @test2(i8* %arrayidx) nounwind ssp {
+; This function exposes a phase ordering problem when InstCombine turns
+; %add into a bitmask, making it difficult to spot a 0 return value.
+;
+; It is also important that %add is expressed as a multiple of %div so scalar
+; evolution can recognize it.
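+; Concretely, %add = add i32 %div, %div is 2 * %div; the CHECK lines below
+; allow InstCombine's shl form as long as %0 - 2 * %1 still folds down to
+; a constant ret i32 0.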
+define i32 @test2(i32 %a, i32* %p) nounwind uwtable ssp {
entry:
- %xx = bitcast i8* %arrayidx to i32*
- %x1 = load i32* %xx, align 4
- %tmp = trunc i32 %x1 to i8
- %conv = zext i8 %tmp to i32
- %cmp = icmp eq i32 %conv, 127
- br i1 %cmp, label %land.lhs.true, label %if.end
-
-land.lhs.true: ; preds = %entry
- %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1
- %tmp5 = load i8* %arrayidx4, align 1
- %conv6 = zext i8 %tmp5 to i32
- %cmp7 = icmp eq i32 %conv6, 69
- br i1 %cmp7, label %land.lhs.true9, label %if.end
-
-land.lhs.true9: ; preds = %land.lhs.true
- %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2
- %tmp13 = load i8* %arrayidx12, align 1
- %conv14 = zext i8 %tmp13 to i32
- %cmp15 = icmp eq i32 %conv14, 76
- br i1 %cmp15, label %land.lhs.true17, label %if.end
-
-land.lhs.true17: ; preds = %land.lhs.true9
- %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3
- %tmp21 = load i8* %arrayidx20, align 1
- %conv22 = zext i8 %tmp21 to i32
- %cmp23 = icmp eq i32 %conv22, 70
- br i1 %cmp23, label %if.then, label %if.end
-
-if.then: ; preds = %land.lhs.true17
- %call25 = call i32 (...)* @doo()
- br label %if.end
-
-if.end:
- ret void
+ %div = udiv i32 %a, 4
+ %arrayidx = getelementptr inbounds i32* %p, i64 0
+ store i32 %div, i32* %arrayidx, align 4
+ %add = add i32 %div, %div
+ %arrayidx1 = getelementptr inbounds i32* %p, i64 1
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32* %p, i64 1
+ %0 = load i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32* %p, i64 0
+ %1 = load i32* %arrayidx3, align 4
+ %mul = mul i32 2, %1
+ %sub = sub i32 %0, %mul
+ ret i32 %sub
; CHECK: @test2
-; CHECK: %x1 = load i32* %xx, align 4
-; CHECK-NEXT: icmp eq i32 %x1, 1179403647
-; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end
+; CHECK: %div = lshr i32 %a, 2
+; CHECK: %add = shl nuw nsw i32 %div, 1
+; CHECK: ret i32 0
}
-
-declare i32 @doo(...)
-
-; PR6627 - This should all be flattened down to one compare. This is the same
-; as test2, except that the initial load is done as an i8 instead of i32, thus
-; requiring widening.
-define void @test2a(i8* %arrayidx) nounwind ssp {
-entry:
- %x1 = load i8* %arrayidx, align 4
- %conv = zext i8 %x1 to i32
- %cmp = icmp eq i32 %conv, 127
- br i1 %cmp, label %land.lhs.true, label %if.end
-
-land.lhs.true: ; preds = %entry
- %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1
- %tmp5 = load i8* %arrayidx4, align 1
- %conv6 = zext i8 %tmp5 to i32
- %cmp7 = icmp eq i32 %conv6, 69
- br i1 %cmp7, label %land.lhs.true9, label %if.end
-
-land.lhs.true9: ; preds = %land.lhs.true
- %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2
- %tmp13 = load i8* %arrayidx12, align 1
- %conv14 = zext i8 %tmp13 to i32
- %cmp15 = icmp eq i32 %conv14, 76
- br i1 %cmp15, label %land.lhs.true17, label %if.end
-
-land.lhs.true17: ; preds = %land.lhs.true9
- %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3
- %tmp21 = load i8* %arrayidx20, align 1
- %conv22 = zext i8 %tmp21 to i32
- %cmp23 = icmp eq i32 %conv22, 70
- br i1 %cmp23, label %if.then, label %if.end
-
-if.then: ; preds = %land.lhs.true17
- %call25 = call i32 (...)* @doo()
- br label %if.end
-
-if.end:
- ret void
-
-; CHECK: @test2a
-; CHECK: %x1 = load i32* {{.*}}, align 4
-; CHECK-NEXT: icmp eq i32 %x1, 1179403647
-; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end
-}
-
diff --git a/test/Transforms/PhaseOrdering/scev.ll b/test/Transforms/PhaseOrdering/scev.ll
new file mode 100644
index 0000000..c731280
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/scev.ll
@@ -0,0 +1,64 @@
+; RUN: opt -O3 -S -analyze -scalar-evolution %s | FileCheck %s
+;
+; This file contains phase ordering tests for scalar evolution.
+; Test that the standard passes don't obfuscate the IR to the point that
+; scalar evolution can no longer recognize the expressions.
+
+; CHECK: test1
+; The loop body contains two increments by %div.
+; Make sure that 2*%div is recognizable, and not expressed as a bit mask of %d.
+; CHECK: --> {%p,+,(2 * (%d /u 4) * sizeof(i32))}
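+; (Each loop iteration below advances %p.addr.0 by two GEPs of %div elements,
+; i.e. by 2 * (%d /u 4) i32s, which is the step of the recurrence above.)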
+define void @test1(i64 %d, i32* %p) nounwind uwtable ssp {
+entry:
+ %div = udiv i64 %d, 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %p.addr.0 = phi i32* [ %p, %entry ], [ %add.ptr1, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp ne i32 %i.0, 64
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ store i32 0, i32* %p.addr.0, align 4
+ %add.ptr = getelementptr inbounds i32* %p.addr.0, i64 %div
+ store i32 1, i32* %add.ptr, align 4
+ %add.ptr1 = getelementptr inbounds i32* %add.ptr, i64 %div
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; CHECK: test1a
+; Same thing as test1, but it is even more tempting to fold 2 * (%d /u 2)
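+; into the bitmask %d & -2, which would hide the multiple-of-(%d /u 2) form.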
+; CHECK: --> {%p,+,(2 * (%d /u 2) * sizeof(i32))}
+define void @test1a(i64 %d, i32* %p) nounwind uwtable ssp {
+entry:
+ %div = udiv i64 %d, 2
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %p.addr.0 = phi i32* [ %p, %entry ], [ %add.ptr1, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp ne i32 %i.0, 64
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ store i32 0, i32* %p.addr.0, align 4
+ %add.ptr = getelementptr inbounds i32* %p.addr.0, i64 %div
+ store i32 1, i32* %add.ptr, align 4
+ %add.ptr1 = getelementptr inbounds i32* %add.ptr, i64 %div
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/test/Transforms/PruneEH/2003-09-14-ExternalCall.ll b/test/Transforms/PruneEH/2003-09-14-ExternalCall.ll
deleted file mode 100644
index 64aba46..0000000
--- a/test/Transforms/PruneEH/2003-09-14-ExternalCall.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: opt < %s -prune-eh -S | grep invoke
-
-declare void @External()
-
-define void @foo() {
- invoke void @External( )
- to label %Cont unwind label %Cont
-Cont: ; preds = %0, %0
- %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
- cleanup
- ret void
-}
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/PruneEH/simplenoreturntest.ll b/test/Transforms/PruneEH/simplenoreturntest.ll
index 61e2f15..ec5d100 100644
--- a/test/Transforms/PruneEH/simplenoreturntest.ll
+++ b/test/Transforms/PruneEH/simplenoreturntest.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -prune-eh -S | not grep {ret i32}
+; RUN: opt < %s -prune-eh -S | not grep "ret i32"
declare void @noreturn() noreturn
diff --git a/test/Transforms/Reassociate/2005-08-24-Crash.ll b/test/Transforms/Reassociate/2005-08-24-Crash.ll
deleted file mode 100644
index 9864de4..0000000
--- a/test/Transforms/Reassociate/2005-08-24-Crash.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: opt < %s -reassociate -disable-output
-
-define void @test(i32 %a, i32 %b, i32 %c, i32 %d) {
- %tmp.2 = xor i32 %a, %b ; <i32> [#uses=1]
- %tmp.5 = xor i32 %c, %d ; <i32> [#uses=1]
- %tmp.6 = xor i32 %tmp.2, %tmp.5 ; <i32> [#uses=1]
- %tmp.9 = xor i32 %c, %a ; <i32> [#uses=1]
- %tmp.12 = xor i32 %b, %d ; <i32> [#uses=1]
- %tmp.13 = xor i32 %tmp.9, %tmp.12 ; <i32> [#uses=1]
- %tmp.16 = xor i32 %tmp.6, %tmp.13 ; <i32> [#uses=0]
- ret void
-}
-
diff --git a/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll b/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll
index 33e44d4..f66148b 100644
--- a/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll
+++ b/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -reassociate -instcombine -S |\
-; RUN: grep {ret i32 0}
+; RUN: grep "ret i32 0"
define i32 @f(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
%tmp.2 = add i32 %a4, %a3 ; <i32> [#uses=1]
diff --git a/test/Transforms/Reassociate/2012-05-08-UndefLeak.ll b/test/Transforms/Reassociate/2012-05-08-UndefLeak.ll
new file mode 100644
index 0000000..2f5a53e
--- /dev/null
+++ b/test/Transforms/Reassociate/2012-05-08-UndefLeak.ll
@@ -0,0 +1,85 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+; PR12169
+; PR12764
+; XFAIL: *
+; Transform disabled until PR13021 is fixed.
+
+define i64 @f(i64 %x0) {
+; CHECK: @f
+; CHECK-NEXT: mul i64 %x0, 208
+; CHECK-NEXT: add i64 %{{.*}}, 1617
+; CHECK-NEXT: ret i64
+ %t0 = add i64 %x0, 1
+ %t1 = add i64 %x0, 2
+ %t2 = add i64 %x0, 3
+ %t3 = add i64 %x0, 4
+ %t4 = add i64 %x0, 5
+ %t5 = add i64 %x0, 6
+ %t6 = add i64 %x0, 7
+ %t7 = add i64 %x0, 8
+ %t8 = add i64 %x0, 9
+ %t9 = add i64 %x0, 10
+ %t10 = add i64 %x0, 11
+ %t11 = add i64 %x0, 12
+ %t12 = add i64 %x0, 13
+ %t13 = add i64 %x0, 14
+ %t14 = add i64 %x0, 15
+ %t15 = add i64 %x0, 16
+ %t16 = add i64 %x0, 17
+ %t17 = add i64 %x0, 18
+ %t18 = add i64 %t17, %t0
+ %t19 = add i64 %t18, %t1
+ %t20 = add i64 %t19, %t2
+ %t21 = add i64 %t20, %t3
+ %t22 = add i64 %t21, %t4
+ %t23 = add i64 %t22, %t5
+ %t24 = add i64 %t23, %t6
+ %t25 = add i64 %t24, %t7
+ %t26 = add i64 %t25, %t8
+ %t27 = add i64 %t26, %t9
+ %t28 = add i64 %t27, %t10
+ %t29 = add i64 %t28, %t11
+ %t30 = add i64 %t29, %t12
+ %t31 = add i64 %t30, %t13
+ %t32 = add i64 %t31, %t14
+ %t33 = add i64 %t32, %t15
+ %t34 = add i64 %t33, %t16
+ %t35 = add i64 %t34, %x0
+ %t36 = add i64 %t0, %t1
+ %t37 = add i64 %t36, %t2
+ %t38 = add i64 %t37, %t3
+ %t39 = add i64 %t38, %t4
+ %t40 = add i64 %t39, %t5
+ %t41 = add i64 %t40, %t6
+ %t42 = add i64 %t41, %t7
+ %t43 = add i64 %t42, %t8
+ %t44 = add i64 %t43, %t9
+ %t45 = add i64 %t44, %t10
+ %t46 = add i64 %t45, %t11
+ %t47 = add i64 %t46, %t12
+ %t48 = add i64 %t47, %t13
+ %t49 = add i64 %t48, %t14
+ %t50 = add i64 %t49, %t15
+ %t51 = add i64 %t50, %t16
+ %t52 = add i64 %t51, %t17
+ %t53 = add i64 %t52, %t18
+ %t54 = add i64 %t53, %t19
+ %t55 = add i64 %t54, %t20
+ %t56 = add i64 %t55, %t21
+ %t57 = add i64 %t56, %t22
+ %t58 = add i64 %t57, %t23
+ %t59 = add i64 %t58, %t24
+ %t60 = add i64 %t59, %t25
+ %t61 = add i64 %t60, %t26
+ %t62 = add i64 %t61, %t27
+ %t63 = add i64 %t62, %t28
+ %t64 = add i64 %t63, %t29
+ %t65 = add i64 %t64, %t30
+ %t66 = add i64 %t65, %t31
+ %t67 = add i64 %t66, %t32
+ %t68 = add i64 %t67, %t33
+ %t69 = add i64 %t68, %t34
+ %t70 = add i64 %t69, %t35
+ %t71 = add i64 %t70, %x0
+ ret i64 %t71
+}
diff --git a/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll b/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll
new file mode 100644
index 0000000..6e62a28
--- /dev/null
+++ b/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll
@@ -0,0 +1,21 @@
+; RUN: opt < %s -reassociate -disable-output
+; PR13041
+
+define void @foo() {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %b.0 = phi i32 [ undef, %entry ], [ %sub2, %while.body ]
+ %c.0 = phi i32 [ undef, %entry ], [ %sub3, %while.body ]
+ br i1 undef, label %while.end, label %while.body
+
+while.body: ; preds = %while.cond
+ %sub = sub nsw i32 0, %b.0
+ %sub2 = sub nsw i32 %sub, %c.0
+ %sub3 = sub nsw i32 0, %c.0
+ br label %while.cond
+
+while.end: ; preds = %while.cond
+ ret void
+}
diff --git a/test/Transforms/Reassociate/absorption.ll b/test/Transforms/Reassociate/absorption.ll
new file mode 100644
index 0000000..2ccc2b5
--- /dev/null
+++ b/test/Transforms/Reassociate/absorption.ll
@@ -0,0 +1,11 @@
+; RUN: opt -S -reassociate < %s | FileCheck %s
+
+; Check that if constants combine to an absorbing value then the expression is
+; evaluated as the absorbing value.
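+; Here (%x | 127) | 128 becomes %x | (127 | 128) = %x | 255, i.e. -1.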
+define i8 @foo(i8 %x) {
+ %tmp1 = or i8 %x, 127
+ %tmp2 = or i8 %tmp1, 128
+ ret i8 %tmp2
+; CHECK: @foo
+; CHECK: ret i8 -1
+}
diff --git a/test/Transforms/Reassociate/crash.ll b/test/Transforms/Reassociate/crash.ll
index 7a81942..ce586e1 100644
--- a/test/Transforms/Reassociate/crash.ll
+++ b/test/Transforms/Reassociate/crash.ll
@@ -67,3 +67,80 @@ _33: ; preds = %_33, %_
%tmp367 = add i32 %tmp365, %tmp366
br label %_33
}
+
+define void @test(i32 %a, i32 %b, i32 %c, i32 %d) {
+ %tmp.2 = xor i32 %a, %b ; <i32> [#uses=1]
+ %tmp.5 = xor i32 %c, %d ; <i32> [#uses=1]
+ %tmp.6 = xor i32 %tmp.2, %tmp.5 ; <i32> [#uses=1]
+ %tmp.9 = xor i32 %c, %a ; <i32> [#uses=1]
+ %tmp.12 = xor i32 %b, %d ; <i32> [#uses=1]
+ %tmp.13 = xor i32 %tmp.9, %tmp.12 ; <i32> [#uses=1]
+ %tmp.16 = xor i32 %tmp.6, %tmp.13 ; <i32> [#uses=0]
+ ret void
+}
+
+define i128 @foo() {
+ %mul = mul i128 0, 0
+ ret i128 %mul
+}
+
+define void @infinite_loop() {
+entry:
+ br label %loop
+loop:
+ %x = phi i32 [undef, %entry], [%x, %loop]
+ %dead = add i32 %x, 0
+ br label %loop
+unreachable1:
+ %y1 = add i32 %y1, 0
+ %z1 = add i32 %y1, 0
+ ret void
+unreachable2:
+ %y2 = add i32 %y2, 0
+ %z2 = add i32 %y2, %y2
+ ret void
+unreachable3:
+ %y3 = add i32 %y3, %y3
+ %z3 = add i32 %y3, 0
+ ret void
+unreachable4:
+ %y4 = add i32 %y4, %y4
+ %z4 = add i32 %y4, %y4
+ ret void
+}
+
+; PR13185
+define void @pr13185(i16 %p) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond, %entry
+ %x.0 = phi i32 [ undef, %entry ], [ %conv, %for.cond ]
+ %conv = zext i16 %p to i32
+ br label %for.cond
+}
+
+; PR12963
+@a = external global i8
+define i8 @f0(i8 %x) {
+ %t0 = load i8* @a
+ %t1 = mul i8 %x, %x
+ %t2 = mul i8 %t1, %t1
+ %t3 = mul i8 %t2, %t2
+ %t4 = mul i8 %t3, %x
+ %t5 = mul i8 %t4, %t4
+ %t6 = mul i8 %t5, %x
+ %t7 = mul i8 %t6, %t0
+ ret i8 %t7
+}
+
+define i32 @sozefx_(i32 %x, i32 %y) {
+ %t0 = sub i32 %x, %x
+ %t1 = mul i32 %t0, %t0
+ %t2 = mul i32 %x, %t0
+ %t3 = mul i32 %t1, %t1
+ %t4 = add i32 %t2, %t3
+ %t5 = mul i32 %x, %y
+ %t6 = add i32 %t4, %t5
+ ret i32 %t6
+}
diff --git a/test/Transforms/Reassociate/fp-commute.ll b/test/Transforms/Reassociate/fp-commute.ll
new file mode 100644
index 0000000..025689b
--- /dev/null
+++ b/test/Transforms/Reassociate/fp-commute.ll
@@ -0,0 +1,18 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+
+target triple = "armv7-apple-ios"
+
+declare void @use(float)
+
+; CHECK: test
+define void @test(float %x, float %y) {
+entry:
+; CHECK: fmul float %x, %y
+; CHECK: fmul float %x, %y
+ %0 = fmul float %x, %y
+ %1 = fmul float %y, %x
+ %2 = fsub float %0, %1
+ call void @use(float %0)
+ call void @use(float %2)
+ ret void
+}
diff --git a/test/Transforms/Reassociate/mightymul.ll b/test/Transforms/Reassociate/mightymul.ll
new file mode 100644
index 0000000..cfbc485
--- /dev/null
+++ b/test/Transforms/Reassociate/mightymul.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -reassociate
+; PR13021
+
+define i32 @foo(i32 %x) {
+ %t0 = mul i32 %x, %x
+ %t1 = mul i32 %t0, %t0
+ %t2 = mul i32 %t1, %t1
+ %t3 = mul i32 %t2, %t2
+ %t4 = mul i32 %t3, %t3
+ %t5 = mul i32 %t4, %t4
+ %t6 = mul i32 %t5, %t5
+ %t7 = mul i32 %t6, %t6
+ %t8 = mul i32 %t7, %t7
+ %t9 = mul i32 %t8, %t8
+ %t10 = mul i32 %t9, %t9
+ %t11 = mul i32 %t10, %t10
+ %t12 = mul i32 %t11, %t11
+ %t13 = mul i32 %t12, %t12
+ %t14 = mul i32 %t13, %t13
+ %t15 = mul i32 %t14, %t14
+ %t16 = mul i32 %t15, %t15
+ %t17 = mul i32 %t16, %t16
+ %t18 = mul i32 %t17, %t17
+ %t19 = mul i32 %t18, %t18
+ %t20 = mul i32 %t19, %t19
+ %t21 = mul i32 %t20, %t20
+ %t22 = mul i32 %t21, %t21
+ %t23 = mul i32 %t22, %t22
+ %t24 = mul i32 %t23, %t23
+ %t25 = mul i32 %t24, %t24
+ %t26 = mul i32 %t25, %t25
+ %t27 = mul i32 %t26, %t26
+ %t28 = mul i32 %t27, %t27
+ ret i32 %t28
+}
diff --git a/test/Transforms/Reassociate/mulfactor.ll b/test/Transforms/Reassociate/mulfactor.ll
index f279727..6c099b4 100644
--- a/test/Transforms/Reassociate/mulfactor.ll
+++ b/test/Transforms/Reassociate/mulfactor.ll
@@ -1,14 +1,134 @@
-; RUN: opt < %s -reassociate -instcombine -S | grep mul | count 2
+; RUN: opt < %s -reassociate -S | FileCheck %s
-; This should have exactly 2 multiplies when we're done.
+define i32 @test1(i32 %a, i32 %b) {
+; CHECK: @test1
+; CHECK: mul i32 %a, %a
+; CHECK-NEXT: mul i32 %a, 2
+; CHECK-NEXT: add
+; CHECK-NEXT: mul
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
-define i32 @f(i32 %a, i32 %b) {
- %tmp.2 = mul i32 %a, %a ; <i32> [#uses=1]
- %tmp.5 = shl i32 %a, 1 ; <i32> [#uses=1]
- %tmp.6 = mul i32 %tmp.5, %b ; <i32> [#uses=1]
- %tmp.10 = mul i32 %b, %b ; <i32> [#uses=1]
- %tmp.7 = add i32 %tmp.6, %tmp.2 ; <i32> [#uses=1]
- %tmp.11 = add i32 %tmp.7, %tmp.10 ; <i32> [#uses=1]
+entry:
+ %tmp.2 = mul i32 %a, %a
+ %tmp.5 = shl i32 %a, 1
+ %tmp.6 = mul i32 %tmp.5, %b
+ %tmp.10 = mul i32 %b, %b
+ %tmp.7 = add i32 %tmp.6, %tmp.2
+ %tmp.11 = add i32 %tmp.7, %tmp.10
ret i32 %tmp.11
}
+define i32 @test2(i32 %t) {
+; CHECK: @test2
+; CHECK: mul
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+
+entry:
+ %a = mul i32 %t, 6
+ %b = mul i32 %t, 36
+ %c = add i32 %b, 15
+ %d = add i32 %c, %a
+ ret i32 %d
+}
+
+define i32 @test3(i32 %x) {
+; (x^8)
+; CHECK: @test3
+; CHECK: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+
+entry:
+ %a = mul i32 %x, %x
+ %b = mul i32 %a, %x
+ %c = mul i32 %b, %x
+ %d = mul i32 %c, %x
+ %e = mul i32 %d, %x
+ %f = mul i32 %e, %x
+ %g = mul i32 %f, %x
+ ret i32 %g
+}
+
+define i32 @test4(i32 %x) {
+; (x^7)
+; CHECK: @test4
+; CHECK: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+
+entry:
+ %a = mul i32 %x, %x
+ %b = mul i32 %a, %x
+ %c = mul i32 %b, %x
+ %d = mul i32 %c, %x
+ %e = mul i32 %d, %x
+ %f = mul i32 %e, %x
+ ret i32 %f
+}
+
+define i32 @test5(i32 %x, i32 %y) {
+; (x^4) * (y^2)
+; CHECK: @test5
+; CHECK: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+
+entry:
+ %a = mul i32 %x, %y
+ %b = mul i32 %a, %y
+ %c = mul i32 %b, %x
+ %d = mul i32 %c, %x
+ %e = mul i32 %d, %x
+ ret i32 %e
+}
+
+define i32 @test6(i32 %x, i32 %y, i32 %z) {
+; (x^5) * (y^3) * z
+; CHECK: @test6
+; CHECK: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+
+entry:
+ %a = mul i32 %x, %y
+ %b = mul i32 %a, %x
+ %c = mul i32 %b, %y
+ %d = mul i32 %c, %x
+ %e = mul i32 %d, %y
+ %f = mul i32 %e, %x
+ %g = mul i32 %f, %z
+ %h = mul i32 %g, %x
+ ret i32 %h
+}
+
+define i32 @test7(i32 %x, i32 %y, i32 %z) {
+; (x^4) * (y^3) * (z^2)
+; CHECK: @test7
+; CHECK: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+
+entry:
+ %a = mul i32 %y, %x
+ %b = mul i32 %a, %z
+ %c = mul i32 %b, %z
+ %d = mul i32 %c, %x
+ %e = mul i32 %d, %y
+ %f = mul i32 %e, %y
+ %g = mul i32 %f, %x
+ %h = mul i32 %g, %x
+ ret i32 %h
+}
diff --git a/test/Transforms/Reassociate/mulfactor2.ll b/test/Transforms/Reassociate/mulfactor2.ll
deleted file mode 100644
index 8116554..0000000
--- a/test/Transforms/Reassociate/mulfactor2.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; This should turn into one multiply and one add.
-
-; RUN: opt < %s -instcombine -reassociate -instcombine -S > %t
-; RUN: grep mul %t | count 1
-; RUN: grep add %t | count 1
-
-define i32 @main(i32 %t) {
- %tmp.3 = mul i32 %t, 12 ; <i32> [#uses=1]
- %tmp.4 = add i32 %tmp.3, 5 ; <i32> [#uses=1]
- %tmp.6 = mul i32 %t, 6 ; <i32> [#uses=1]
- %tmp.8 = mul i32 %tmp.4, 3 ; <i32> [#uses=1]
- %tmp.9 = add i32 %tmp.8, %tmp.6 ; <i32> [#uses=1]
- ret i32 %tmp.9
-}
-
diff --git a/test/Transforms/Reassociate/multistep.ll b/test/Transforms/Reassociate/multistep.ll
new file mode 100644
index 0000000..7466d2e
--- /dev/null
+++ b/test/Transforms/Reassociate/multistep.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define i64 @multistep1(i64 %a, i64 %b, i64 %c) {
+; Check that a*a*b+a*a*c is turned into a*(a*(b+c)).
+; CHECK: @multistep1
+ %t0 = mul i64 %a, %b
+ %t1 = mul i64 %a, %t0 ; a*(a*b)
+ %t2 = mul i64 %a, %c
+ %t3 = mul i64 %a, %t2 ; a*(a*c)
+ %t4 = add i64 %t1, %t3
+; CHECK-NEXT: add i64 %c, %b
+; CHECK-NEXT: mul i64 %tmp{{.*}}, %a
+; CHECK-NEXT: mul i64 %tmp{{.*}}, %a
+; CHECK-NEXT: ret
+ ret i64 %t4
+}
+
+define i64 @multistep2(i64 %a, i64 %b, i64 %c, i64 %d) {
+; Check that a*b+a*c+d is turned into a*(b+c)+d.
+; CHECK: @multistep2
+ %t0 = mul i64 %a, %b
+ %t1 = mul i64 %a, %c
+ %t2 = add i64 %t1, %d ; a*c+d
+ %t3 = add i64 %t0, %t2 ; a*b+(a*c+d)
+; CHECK-NEXT: add i64 %c, %b
+; CHECK-NEXT: mul i64 %tmp{{.*}}, %a
+; CHECK-NEXT: add i64 %tmp{{.*}}, %d
+; CHECK-NEXT: ret
+ ret i64 %t3
+}
+
diff --git a/test/Transforms/Reassociate/no-op.ll b/test/Transforms/Reassociate/no-op.ll
new file mode 100644
index 0000000..0444cf0
--- /dev/null
+++ b/test/Transforms/Reassociate/no-op.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; When there is nothing to do, or not much to do, check that reassociate leaves
+; things alone.
+
+declare void @use(i32)
+
+define void @test1(i32 %a, i32 %b) {
+; Shouldn't change or move any of the add instructions. Should commute but
+; otherwise not change or move any of the mul instructions.
+; CHECK: @test1
+ %a0 = add nsw i32 %a, 1
+; CHECK-NEXT: %a0 = add nsw i32 %a, 1
+ %m0 = mul nsw i32 3, %a
+; CHECK-NEXT: %m0 = mul nsw i32 %a, 3
+ %a1 = add nsw i32 %a0, %b
+; CHECK-NEXT: %a1 = add nsw i32 %a0, %b
+ %m1 = mul nsw i32 %b, %m0
+; CHECK-NEXT: %m1 = mul nsw i32 %m0, %b
+ call void @use(i32 %a1)
+; CHECK-NEXT: call void @use
+ call void @use(i32 %m1)
+ ret void
+}
+
+define void @test2(i32 %a, i32 %b, i32 %c, i32 %d) {
+; The initial add doesn't change so should not lose the nsw flag.
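+; The adds that are reassociated have their operands changed and so drop nsw.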
+; CHECK: @test2
+ %a0 = add nsw i32 %b, %a
+; CHECK-NEXT: %a0 = add nsw i32 %b, %a
+ %a1 = add nsw i32 %a0, %d
+; CHECK-NEXT: %a1 = add i32 %a0, %c
+ %a2 = add nsw i32 %a1, %c
+; CHECK-NEXT: %a2 = add i32 %a1, %d
+ call void @use(i32 %a2)
+; CHECK-NEXT: call void @use
+ ret void
+}
diff --git a/test/Transforms/Reassociate/repeats.ll b/test/Transforms/Reassociate/repeats.ll
new file mode 100644
index 0000000..6a02047
--- /dev/null
+++ b/test/Transforms/Reassociate/repeats.ll
@@ -0,0 +1,252 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Tests involving repeated operations on the same value.
+
+define i8 @nilpotent(i8 %x) {
+; CHECK: @nilpotent
+ %tmp = xor i8 %x, %x
+ ret i8 %tmp
+; CHECK: ret i8 0
+}
+
+define i2 @idempotent(i2 %x) {
+; CHECK: @idempotent
+ %tmp1 = and i2 %x, %x
+ %tmp2 = and i2 %tmp1, %x
+ %tmp3 = and i2 %tmp2, %x
+ ret i2 %tmp3
+; CHECK: ret i2 %x
+}
+
+define i2 @add(i2 %x) {
+; CHECK: @add
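+; x + x + x + x = 4 * x, and 4 * x == 0 in i2 arithmetic.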
+ %tmp1 = add i2 %x, %x
+ %tmp2 = add i2 %tmp1, %x
+ %tmp3 = add i2 %tmp2, %x
+ ret i2 %tmp3
+; CHECK: ret i2 0
+}
+
+define i2 @cst_add() {
+; CHECK: @cst_add
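+; 1 + 1 + 1 = 3, which is -1 in i2 arithmetic.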
+ %tmp1 = add i2 1, 1
+ %tmp2 = add i2 %tmp1, 1
+ ret i2 %tmp2
+; CHECK: ret i2 -1
+}
+
+define i8 @cst_mul() {
+; CHECK: @cst_mul
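+; 3^5 = 243 = 256 - 13, which is -13 in i8 arithmetic.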
+ %tmp1 = mul i8 3, 3
+ %tmp2 = mul i8 %tmp1, 3
+ %tmp3 = mul i8 %tmp2, 3
+ %tmp4 = mul i8 %tmp3, 3
+ ret i8 %tmp4
+; CHECK: ret i8 -13
+}
+
+define i3 @foo3x5(i3 %x) {
+; Can be done with two multiplies.
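+; (In i3 arithmetic odd x satisfies x^2 == 1 and even x satisfies x^3 == 0,
+; so x^5 == x^3, which needs only two multiplies.)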
+; CHECK: @foo3x5
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i3 %x, %x
+ %tmp2 = mul i3 %tmp1, %x
+ %tmp3 = mul i3 %tmp2, %x
+ %tmp4 = mul i3 %tmp3, %x
+ ret i3 %tmp4
+}
+
+define i3 @foo3x6(i3 %x) {
+; Can be done with two multiplies.
+; CHECK: @foo3x6
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i3 %x, %x
+ %tmp2 = mul i3 %tmp1, %x
+ %tmp3 = mul i3 %tmp2, %x
+ %tmp4 = mul i3 %tmp3, %x
+ %tmp5 = mul i3 %tmp4, %x
+ ret i3 %tmp5
+}
+
+define i3 @foo3x7(i3 %x) {
+; Can be done with two multiplies.
+; CHECK: @foo3x7
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i3 %x, %x
+ %tmp2 = mul i3 %tmp1, %x
+ %tmp3 = mul i3 %tmp2, %x
+ %tmp4 = mul i3 %tmp3, %x
+ %tmp5 = mul i3 %tmp4, %x
+ %tmp6 = mul i3 %tmp5, %x
+ ret i3 %tmp6
+}
+
+define i4 @foo4x8(i4 %x) {
+; Can be done with two multiplies.
+; CHECK: @foo4x8
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ ret i4 %tmp7
+}
+
+define i4 @foo4x9(i4 %x) {
+; Can be done with three multiplies.
+; CHECK: @foo4x9
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ %tmp8 = mul i4 %tmp7, %x
+ ret i4 %tmp8
+}
+
+define i4 @foo4x10(i4 %x) {
+; Can be done with three multiplies.
+; CHECK: @foo4x10
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ %tmp8 = mul i4 %tmp7, %x
+ %tmp9 = mul i4 %tmp8, %x
+ ret i4 %tmp9
+}
+
+define i4 @foo4x11(i4 %x) {
+; Can be done with four multiplies.
+; CHECK: @foo4x11
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ %tmp8 = mul i4 %tmp7, %x
+ %tmp9 = mul i4 %tmp8, %x
+ %tmp10 = mul i4 %tmp9, %x
+ ret i4 %tmp10
+}
+
+define i4 @foo4x12(i4 %x) {
+; Can be done with two multiplies.
+; CHECK: @foo4x12
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ %tmp8 = mul i4 %tmp7, %x
+ %tmp9 = mul i4 %tmp8, %x
+ %tmp10 = mul i4 %tmp9, %x
+ %tmp11 = mul i4 %tmp10, %x
+ ret i4 %tmp11
+}
+
+define i4 @foo4x13(i4 %x) {
+; Can be done with three multiplies.
+; CHECK: @foo4x13
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ %tmp8 = mul i4 %tmp7, %x
+ %tmp9 = mul i4 %tmp8, %x
+ %tmp10 = mul i4 %tmp9, %x
+ %tmp11 = mul i4 %tmp10, %x
+ %tmp12 = mul i4 %tmp11, %x
+ ret i4 %tmp12
+}
+
+define i4 @foo4x14(i4 %x) {
+; Can be done with three multiplies.
+; CHECK: @foo4x14
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ %tmp8 = mul i4 %tmp7, %x
+ %tmp9 = mul i4 %tmp8, %x
+ %tmp10 = mul i4 %tmp9, %x
+ %tmp11 = mul i4 %tmp10, %x
+ %tmp12 = mul i4 %tmp11, %x
+ %tmp13 = mul i4 %tmp12, %x
+ ret i4 %tmp13
+}
+
+define i4 @foo4x15(i4 %x) {
+; Can be done with four multiplies.
+; CHECK: @foo4x15
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+ %tmp1 = mul i4 %x, %x
+ %tmp2 = mul i4 %tmp1, %x
+ %tmp3 = mul i4 %tmp2, %x
+ %tmp4 = mul i4 %tmp3, %x
+ %tmp5 = mul i4 %tmp4, %x
+ %tmp6 = mul i4 %tmp5, %x
+ %tmp7 = mul i4 %tmp6, %x
+ %tmp8 = mul i4 %tmp7, %x
+ %tmp9 = mul i4 %tmp8, %x
+ %tmp10 = mul i4 %tmp9, %x
+ %tmp11 = mul i4 %tmp10, %x
+ %tmp12 = mul i4 %tmp11, %x
+ %tmp13 = mul i4 %tmp12, %x
+ %tmp14 = mul i4 %tmp13, %x
+ ret i4 %tmp14
+}
diff --git a/test/Transforms/Reassociate/shifttest.ll b/test/Transforms/Reassociate/shifttest.ll
index 8b2cbc9..d9a5336 100644
--- a/test/Transforms/Reassociate/shifttest.ll
+++ b/test/Transforms/Reassociate/shifttest.ll
@@ -1,7 +1,7 @@
; With shl->mul reassociation, we can see that this is (shl A, 9) * A
;
; RUN: opt < %s -reassociate -instcombine -S |\
-; RUN: grep {shl .*, 9}
+; RUN: grep "shl .*, 9"
define i32 @test(i32 %A, i32 %B) {
%X = shl i32 %A, 5 ; <i32> [#uses=1]
diff --git a/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll b/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll
index 4adfde3..c847b4e 100644
--- a/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll
+++ b/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -sccp -S | grep {ret i32 1}
+; RUN: opt < %s -sccp -S | grep "ret i32 1"
; This function definitely returns 1, even if we don't know the direction
; of the branch.
diff --git a/test/Transforms/SCCP/2006-12-19-UndefBug.ll b/test/Transforms/SCCP/2006-12-19-UndefBug.ll
index ec69ce0..ede1a32 100644
--- a/test/Transforms/SCCP/2006-12-19-UndefBug.ll
+++ b/test/Transforms/SCCP/2006-12-19-UndefBug.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -sccp -S | \
-; RUN: grep {ret i1 false}
+; RUN: grep "ret i1 false"
define i1 @foo() {
%X = and i1 false, undef ; <i1> [#uses=1]
diff --git a/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll b/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll
index a40455c..e7168dd 100644
--- a/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll
+++ b/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -sccp -S | grep {ret i32 %Z}
+; RUN: opt < %s -sccp -S | grep "ret i32 %Z"
; rdar://5778210
declare {i32, i32} @bar(i32 %A)
diff --git a/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll b/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll
index 63f41db..4688643 100644
--- a/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll
+++ b/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -sccp -S | not grep {ret i32 undef}
+; RUN: opt < %s -sccp -S | not grep "ret i32 undef"
; PR2358
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll b/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll
index f62ed70..c05f897 100644
--- a/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll
+++ b/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -ipsccp -S | grep {ret i32 42}
-; RUN: opt < %s -ipsccp -S | grep {ret i32 undef}
+; RUN: opt < %s -ipsccp -S | grep "ret i32 42"
+; RUN: opt < %s -ipsccp -S | grep "ret i32 undef"
; PR3325
define i32 @main() {
diff --git a/test/Transforms/SCCP/apint-array.ll b/test/Transforms/SCCP/apint-array.ll
index 1e75878..888b9e1 100644
--- a/test/Transforms/SCCP/apint-array.ll
+++ b/test/Transforms/SCCP/apint-array.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -sccp -S | grep {ret i101 12}
+; RUN: opt < %s -sccp -S | grep "ret i101 12"
@Y = constant [6 x i101] [ i101 12, i101 123456789000000, i101 -12,i101
-123456789000000, i101 0,i101 9123456789000000]
diff --git a/test/Transforms/SCCP/apint-basictest4.ll b/test/Transforms/SCCP/apint-basictest4.ll
index 8624260..572f97c 100644
--- a/test/Transforms/SCCP/apint-basictest4.ll
+++ b/test/Transforms/SCCP/apint-basictest4.ll
@@ -4,7 +4,7 @@
; RUN: opt < %s -sccp -S | not grep and
; RUN: opt < %s -sccp -S | not grep trunc
-; RUN: opt < %s -sccp -S | grep {ret i100 -1}
+; RUN: opt < %s -sccp -S | grep "ret i100 -1"
define i100 @test(i133 %A) {
%B = and i133 0, %A
diff --git a/test/Transforms/SCCP/apint-ipsccp1.ll b/test/Transforms/SCCP/apint-ipsccp1.ll
index fda40f5..f6f18fe 100644
--- a/test/Transforms/SCCP/apint-ipsccp1.ll
+++ b/test/Transforms/SCCP/apint-ipsccp1.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -ipsccp -S | grep -v {ret i512 undef} | \
-; RUN: grep {ret i8 2}
+; RUN: opt < %s -ipsccp -S | grep -v "ret i512 undef" | \
+; RUN: grep "ret i8 2"
define internal i512 @test(i1 %B) {
br i1 %B, label %BB1, label %BB2
diff --git a/test/Transforms/SCCP/apint-ipsccp2.ll b/test/Transforms/SCCP/apint-ipsccp2.ll
index 3c02e05..834cca4 100644
--- a/test/Transforms/SCCP/apint-ipsccp2.ll
+++ b/test/Transforms/SCCP/apint-ipsccp2.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -ipsccp -S | grep -v {ret i101 0} | \
-; RUN: grep -v {ret i101 undef} | not grep ret
+; RUN: opt < %s -ipsccp -S | grep -v "ret i101 0" | \
+; RUN: grep -v "ret i101 undef" | not grep ret
define internal i101 @bar(i101 %A) {
diff --git a/test/Transforms/SCCP/logical-nuke.ll b/test/Transforms/SCCP/logical-nuke.ll
index b3d845c..45f6f44 100644
--- a/test/Transforms/SCCP/logical-nuke.ll
+++ b/test/Transforms/SCCP/logical-nuke.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -sccp -S | grep {ret i32 0}
+; RUN: opt < %s -sccp -S | grep "ret i32 0"
; Test that SCCP has basic knowledge of when 'and'/'or' instructions nuke overdefined values.
diff --git a/test/Transforms/SCCP/vector-bitcast.ll b/test/Transforms/SCCP/vector-bitcast.ll
new file mode 100644
index 0000000..b032085
--- /dev/null
+++ b/test/Transforms/SCCP/vector-bitcast.ll
@@ -0,0 +1,20 @@
+; RUN: opt -sccp -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+
+; CHECK: store volatile <2 x i64> zeroinitializer, <2 x i64>* %p
+; rdar://11324230
+
+define void @foo(<2 x i64>* %p) nounwind {
+entry:
+ br label %while.body.i
+
+while.body.i: ; preds = %while.body.i, %entry
+ %vWorkExponent.i.033 = phi <4 x i32> [ %sub.i.i, %while.body.i ], [ <i32 939524096, i32 939524096, i32 939524096, i32 939524096>, %entry ]
+ %sub.i.i = add <4 x i32> %vWorkExponent.i.033, <i32 -8388608, i32 -8388608, i32 -8388608, i32 -8388608>
+ %0 = bitcast <4 x i32> %sub.i.i to <2 x i64>
+ %and.i119.i = and <2 x i64> %0, zeroinitializer
+ store volatile <2 x i64> %and.i119.i, <2 x i64>* %p
+ br label %while.body.i
+}
+
diff --git a/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll b/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll
index eb1c945..0b5e415 100644
--- a/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll
+++ b/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll
@@ -1,7 +1,7 @@
; Scalar replacement was incorrectly promoting this alloca!!
;
; RUN: opt < %s -scalarrepl -S | \
-; RUN: sed {s/;.*//g} | grep {\\\[}
+; RUN: sed "s/;.*//g" | grep "\["
define i8* @test() {
%A = alloca [30 x i8] ; <[30 x i8]*> [#uses=1]
diff --git a/test/Transforms/ScalarRepl/2003-10-29-ArrayProblem.ll b/test/Transforms/ScalarRepl/2003-10-29-ArrayProblem.ll
index 00e43a7..77c7b54 100644
--- a/test/Transforms/ScalarRepl/2003-10-29-ArrayProblem.ll
+++ b/test/Transforms/ScalarRepl/2003-10-29-ArrayProblem.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -S | grep {alloca %%T}
+; RUN: opt < %s -scalarrepl -S | grep "alloca %%T"
%T = type { [80 x i8], i32, i32 }
declare i32 @.callback_1(i8*)
diff --git a/test/Transforms/ScalarRepl/2008-01-29-PromoteBug.ll b/test/Transforms/ScalarRepl/2008-01-29-PromoteBug.ll
index 8bc4ff0..a53f3de 100644
--- a/test/Transforms/ScalarRepl/2008-01-29-PromoteBug.ll
+++ b/test/Transforms/ScalarRepl/2008-01-29-PromoteBug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -instcombine -S | grep {ret i8 17}
+; RUN: opt < %s -scalarrepl -instcombine -S | grep "ret i8 17"
; rdar://5707076
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin9.1.0"
diff --git a/test/Transforms/ScalarRepl/2008-06-22-LargeArray.ll b/test/Transforms/ScalarRepl/2008-06-22-LargeArray.ll
index 71ba601..f597613 100644
--- a/test/Transforms/ScalarRepl/2008-06-22-LargeArray.ll
+++ b/test/Transforms/ScalarRepl/2008-06-22-LargeArray.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -S | grep {call.*mem}
+; RUN: opt < %s -scalarrepl -S | grep "call.*mem"
; PR2369
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/Transforms/ScalarRepl/2008-08-22-out-of-range-array-promote.ll b/test/Transforms/ScalarRepl/2008-08-22-out-of-range-array-promote.ll
index 7cccb19..b2a9d43 100644
--- a/test/Transforms/ScalarRepl/2008-08-22-out-of-range-array-promote.ll
+++ b/test/Transforms/ScalarRepl/2008-08-22-out-of-range-array-promote.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -S | grep {s = alloca .struct.x}
+; RUN: opt < %s -scalarrepl -S | grep "s = alloca .struct.x"
; PR2423
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/Transforms/ScalarRepl/2009-02-02-ScalarPromoteOutOfRange.ll b/test/Transforms/ScalarRepl/2009-02-02-ScalarPromoteOutOfRange.ll
index 9c0f203..3c8a364 100644
--- a/test/Transforms/ScalarRepl/2009-02-02-ScalarPromoteOutOfRange.ll
+++ b/test/Transforms/ScalarRepl/2009-02-02-ScalarPromoteOutOfRange.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -instcombine -S | grep {ret i32 %x}
+; RUN: opt < %s -scalarrepl -instcombine -S | grep "ret i32 %x"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-pc-linux-gnu"
diff --git a/test/Transforms/ScalarRepl/2009-02-05-LoadFCA.ll b/test/Transforms/ScalarRepl/2009-02-05-LoadFCA.ll
index f8ab875..67228a7 100644
--- a/test/Transforms/ScalarRepl/2009-02-05-LoadFCA.ll
+++ b/test/Transforms/ScalarRepl/2009-02-05-LoadFCA.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -instcombine -inline -instcombine -S | grep {ret i32 42}
+; RUN: opt < %s -scalarrepl -instcombine -inline -instcombine -S | grep "ret i32 42"
; PR3489
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "x86_64-apple-darwin10.0"
diff --git a/test/Transforms/ScalarRepl/2009-03-04-MemCpyAlign.ll b/test/Transforms/ScalarRepl/2009-03-04-MemCpyAlign.ll
index 3218d59..a4182d4 100644
--- a/test/Transforms/ScalarRepl/2009-03-04-MemCpyAlign.ll
+++ b/test/Transforms/ScalarRepl/2009-03-04-MemCpyAlign.ll
@@ -1,6 +1,6 @@
; The store into %p should end up with a known alignment of 1, since the memcpy
; is only known to access it with 1-byte alignment.
-; RUN: opt < %s -scalarrepl -S | grep {store i16 1, .*, align 1}
+; RUN: opt < %s -scalarrepl -S | grep "store i16 1, .*, align 1"
; PR3720
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll b/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll
index 98fa1c6..4596885 100644
--- a/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll
+++ b/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll
@@ -10,8 +10,7 @@ target triple = "x86_64-apple-macosx10.7.0"
; CHECK: main
; CHECK-NOT: alloca
-; CHECK: %[[A:[a-z0-9]*]] = and i128
-; CHECK: %[[B:[a-z0-9]*]] = trunc i128 %[[A]] to i32
+; CHECK: extractelement <2 x float> zeroinitializer, i32 0
define void @main() uwtable ssp {
entry:
@@ -28,8 +27,7 @@ entry:
; CHECK: test1
; CHECK-NOT: alloca
-; CHECK: %[[A:[a-z0-9]*]] = and i128
-; CHECK: %[[B:[a-z0-9]*]] = trunc i128 %[[A]] to i32
+; CHECK: extractelement <2 x float> zeroinitializer, i32 0
define void @test1() uwtable ssp {
entry:
@@ -43,9 +41,8 @@ entry:
; CHECK: test2
; CHECK-NOT: alloca
-; CHECK: and i128
-; CHECK: or i128
-; CHECK: trunc i128
+; CHECK: %[[A:[a-z0-9]*]] = extractelement <2 x float> zeroinitializer, i32 0
+; CHECK: fadd float %[[A]], 1.000000e+00
; CHECK-NOT: insertelement
; CHECK-NOT: extractelement
@@ -62,3 +59,17 @@ entry:
%r = fadd float %r1, %r2
ret float %r
}
+
+; CHECK: test3
+; CHECK: %[[A:[a-z0-9]*]] = extractelement <2 x float> <float 2.000000e+00, float 3.000000e+00>, i32 1
+; CHECK: ret float %[[A]]
+
+define float @test3() {
+entry:
+ %ai = alloca { <2 x float>, <2 x float> }, align 8
+ store { <2 x float>, <2 x float> } {<2 x float> <float 0.0, float 1.0>, <2 x float> <float 2.0, float 3.0>}, { <2 x float>, <2 x float> }* %ai, align 8
+ %tmpcast = bitcast { <2 x float>, <2 x float> }* %ai to [4 x float]*
+ %arrayidx = getelementptr inbounds [4 x float]* %tmpcast, i64 0, i64 3
+ %f = load float* %arrayidx, align 4
+ ret float %f
+}
diff --git a/test/Transforms/ScalarRepl/crash.ll b/test/Transforms/ScalarRepl/crash.ll
index cd4dc32..58c5a3a 100644
--- a/test/Transforms/ScalarRepl/crash.ll
+++ b/test/Transforms/ScalarRepl/crash.ll
@@ -260,5 +260,27 @@ entry:
ret void
}
+; rdar://11861001 - The dynamic GEP here was incorrectly making all accesses
+; to the alloca think they were also dynamic. Inserts and extracts created to
+; access the vector were all being based on the dynamic access, even in BBs
+; not dominated by the GEP.
+define fastcc void @test() optsize inlinehint ssp align 2 {
+entry:
+ %alloc.0.0 = alloca <4 x float>, align 16
+ %bitcast = bitcast <4 x float>* %alloc.0.0 to [4 x float]*
+ %idx3 = getelementptr inbounds [4 x float]* %bitcast, i32 0, i32 3
+ store float 0.000000e+00, float* %idx3, align 4
+ br label %for.body10
+
+for.body10: ; preds = %for.body10, %entry
+ %loopidx = phi i32 [ 0, %entry ], [ undef, %for.body10 ]
+ %unusedidx = getelementptr inbounds <4 x float>* %alloc.0.0, i32 0, i32 %loopidx
+ br i1 undef, label %for.end, label %for.body10
+
+for.end: ; preds = %for.body10
+ store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00>, <4 x float>* %alloc.0.0, align 16
+ ret void
+}
+
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/Transforms/ScalarRepl/dynamic-vector-gep.ll b/test/Transforms/ScalarRepl/dynamic-vector-gep.ll
new file mode 100644
index 0000000..565cd76
--- /dev/null
+++ b/test/Transforms/ScalarRepl/dynamic-vector-gep.ll
@@ -0,0 +1,167 @@
+; RUN: opt < %s -scalarrepl -S | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; CHECK: @test1
+; CHECK: %[[alloc:[\.a-z0-9]*]] = alloca <4 x float>
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc]]
+; CHECK: memset
+; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
+
+; Split the array but don't replace the memset with an insert
+; element as it's not a constant offset.
+; The load, however, can be replaced with an extract element.
+define float @test1(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca [4 x <4 x float>]
+ store [4 x <4 x float>] zeroinitializer, [4 x <4 x float>]* %0
+ %ptr1 = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0, i32 %idx1
+ %cast = bitcast float* %ptr1 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %cast, i8 0, i32 4, i32 4, i1 false)
+ %ptr2 = getelementptr [4 x <4 x float>]* %0, i32 0, i32 1, i32 %idx2
+ %ret = load float* %ptr2
+ ret float %ret
+}
+
+; CHECK: @test2
+; CHECK: %[[ins:[\.a-z0-9]*]] = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <4 x float> %[[ins]], i32 %idx2
+
+; Do SROA on the array when it has dynamic vector reads and writes.
+define float @test2(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca [4 x <4 x float>]
+ store [4 x <4 x float>] zeroinitializer, [4 x <4 x float>]* %0
+ %ptr1 = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0, i32 %idx1
+ store float 1.0, float* %ptr1
+ %ptr2 = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0, i32 %idx2
+ %ret = load float* %ptr2
+ ret float %ret
+}
+
+; CHECK: test3
+; CHECK: %0 = alloca [4 x <4 x float>]
+; CHECK-NOT: alloca
+
+; Don't do SROA on a dynamically indexed vector when it spans
+; more than one array element of the alloca array it is within.
+define float @test3(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca [4 x <4 x float>]
+ store [4 x <4 x float>] zeroinitializer, [4 x <4 x float>]* %0
+ %bigvec = bitcast [4 x <4 x float>]* %0 to <16 x float>*
+ %ptr1 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx1
+ store float 1.0, float* %ptr1
+ %ptr2 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx2
+ %ret = load float* %ptr2
+ ret float %ret
+}
+
+; CHECK: test4
+; CHECK: insertelement <16 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <16 x float> %0, i32 %idx2
+
+; Don't do SROA on a dynamically indexed vector when it spans
+; more than one array element of the alloca array it is within.
+; However, unlike test3, the store is on the vector type
+; so SROA will convert the large alloca into the large vector
+; type and do all accesses with insert/extract element.
+define float @test4(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca [4 x <4 x float>]
+ %bigvec = bitcast [4 x <4 x float>]* %0 to <16 x float>*
+ store <16 x float> zeroinitializer, <16 x float>* %bigvec
+ %ptr1 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx1
+ store float 1.0, float* %ptr1
+ %ptr2 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx2
+ %ret = load float* %ptr2
+ ret float %ret
+}
+
+; CHECK: @test5
+; CHECK: %0 = alloca [4 x <4 x float>]
+; CHECK-NOT: alloca
+
+; Don't do SROA as there is a second dynamically indexed array
+; which may span multiple elements of the alloca.
+define float @test5(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca [4 x <4 x float>]
+ store [4 x <4 x float>] zeroinitializer, [4 x <4 x float>]* %0
+ %ptr1 = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0, i32 %idx1
+ %ptr2 = bitcast float* %ptr1 to [1 x <2 x float>]*
+ %ptr3 = getelementptr [1 x <2 x float>]* %ptr2, i32 0, i32 0, i32 %idx1
+ store float 1.0, float* %ptr1
+ %ptr4 = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0, i32 %idx2
+ %ret = load float* %ptr4
+ ret float %ret
+}
+
+; CHECK: test6
+; CHECK: insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
+
+%vector.pair = type { %vector.anon, %vector.anon }
+%vector.anon = type { %vector }
+%vector = type { <4 x float> }
+
+; Dynamic GEPs on vectors were crashing when the vector was inside a struct
+; as the new GEP for the new alloca might not include all the indices from
+; the original GEP, just the indices it needs to get to the correct offset of
+; some type, not necessarily the dynamic vector.
+; This test makes sure we don't have this crash.
+define float @test6(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca %vector.pair
+ store %vector.pair zeroinitializer, %vector.pair* %0
+ %ptr1 = getelementptr %vector.pair* %0, i32 0, i32 0, i32 0, i32 0, i32 %idx1
+ store float 1.0, float* %ptr1
+ %ptr2 = getelementptr %vector.pair* %0, i32 0, i32 1, i32 0, i32 0, i32 %idx2
+ %ret = load float* %ptr2
+ ret float %ret
+}
+
+; CHECK: test7
+; CHECK: insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
+
+%array.pair = type { [2 x %array.anon], %array.anon }
+%array.anon = type { [2 x %vector] }
+
+; This is the same as test6 and tests the same crash, but on arrays.
+define float @test7(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca %array.pair
+ store %array.pair zeroinitializer, %array.pair* %0
+ %ptr1 = getelementptr %array.pair* %0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 %idx1
+ store float 1.0, float* %ptr1
+ %ptr2 = getelementptr %array.pair* %0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 %idx2
+ %ret = load float* %ptr2
+ ret float %ret
+}
+
+; CHECK: test8
+; CHECK: %[[offset1:[\.a-z0-9]*]] = add i32 %idx1, 1
+; CHECK: %[[ins:[\.a-z0-9]*]] = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %[[offset1]]
+; CHECK: %[[offset2:[\.a-z0-9]*]] = add i32 %idx2, 2
+; CHECK: extractelement <4 x float> %[[ins]], i32 %[[offset2]]
+
+; Do SROA on the vector when it has dynamic vector reads and writes
+; from a non-zero offset.
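+; The <3 x float> and <2 x float> views start at elements 1 and 2 of the
+; <4 x float>, so the dynamic indices get rebased by adding those offsets.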
+define float @test8(i32 %idx1, i32 %idx2) {
+entry:
+ %0 = alloca <4 x float>
+ store <4 x float> zeroinitializer, <4 x float>* %0
+ %ptr1 = getelementptr <4 x float>* %0, i32 0, i32 1
+ %ptr2 = bitcast float* %ptr1 to <3 x float>*
+ %ptr3 = getelementptr <3 x float>* %ptr2, i32 0, i32 %idx1
+ store float 1.0, float* %ptr3
+ %ptr4 = getelementptr <4 x float>* %0, i32 0, i32 2
+ %ptr5 = bitcast float* %ptr4 to <2 x float>*
+ %ptr6 = getelementptr <2 x float>* %ptr5, i32 0, i32 %idx2
+ %ret = load float* %ptr6
+ ret float %ret
+}
+
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
diff --git a/test/Transforms/ScalarRepl/memcpy-from-global.ll b/test/Transforms/ScalarRepl/memcpy-from-global.ll
index 59475ad..5557a8f 100644
--- a/test/Transforms/ScalarRepl/memcpy-from-global.ll
+++ b/test/Transforms/ScalarRepl/memcpy-from-global.ll
@@ -45,8 +45,10 @@ declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
%T = type { i8, [123 x i8] }
+%U = type { i32, i32, i32, i32, i32 }
@G = constant %T {i8 1, [123 x i8] zeroinitializer }
+@H = constant [2 x %U] zeroinitializer, align 16
define void @test2() {
%A = alloca %T
@@ -108,3 +110,37 @@ define void @test5() {
declare void @baz(i8* byval)
+
+
+define void @test6() {
+ %A = alloca %U, align 16
+ %a = bitcast %U* %A to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([2 x %U]* @H to i8*), i64 20, i32 16, i1 false)
+ call void @bar(i8* %a) readonly
+; CHECK: @test6
+; CHECK-NEXT: %a = bitcast
+; CHECK-NEXT: call void @bar(i8* %a)
+ ret void
+}
+
+define void @test7() {
+ %A = alloca %U, align 16
+ %a = bitcast %U* %A to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%U* getelementptr ([2 x %U]* @H, i64 0, i32 0) to i8*), i64 20, i32 4, i1 false)
+ call void @bar(i8* %a) readonly
+; CHECK: @test7
+; CHECK-NEXT: %a = bitcast
+; CHECK-NEXT: call void @bar(i8* %a)
+ ret void
+}
+
+define void @test8() {
+ %A = alloca %U, align 16
+ %a = bitcast %U* %A to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%U* getelementptr ([2 x %U]* @H, i64 0, i32 1) to i8*), i64 20, i32 4, i1 false)
+ call void @bar(i8* %a) readonly
+; CHECK: @test8
+; CHECK: llvm.memcpy
+; CHECK: bar
+ ret void
+}
diff --git a/test/Transforms/ScalarRepl/memset-aggregate-byte-leader.ll b/test/Transforms/ScalarRepl/memset-aggregate-byte-leader.ll
index 0d61e5a..3510dfc 100644
--- a/test/Transforms/ScalarRepl/memset-aggregate-byte-leader.ll
+++ b/test/Transforms/ScalarRepl/memset-aggregate-byte-leader.ll
@@ -1,6 +1,6 @@
; PR1226
; RUN: opt < %s -scalarrepl -S | \
-; RUN: not grep {call void @llvm.memcpy.i32}
+; RUN: not grep "call void @llvm.memcpy.i32"
; RUN: opt < %s -scalarrepl -S | grep getelementptr
; END.
diff --git a/test/Transforms/ScalarRepl/memset-aggregate.ll b/test/Transforms/ScalarRepl/memset-aggregate.ll
index 42e7a0f..95ecf17 100644
--- a/test/Transforms/ScalarRepl/memset-aggregate.ll
+++ b/test/Transforms/ScalarRepl/memset-aggregate.ll
@@ -1,7 +1,7 @@
; PR1226
-; RUN: opt < %s -scalarrepl -S | grep {ret i32 16843009}
+; RUN: opt < %s -scalarrepl -S | grep "ret i32 16843009"
; RUN: opt < %s -scalarrepl -S | not grep alloca
-; RUN: opt < %s -scalarrepl -instcombine -S | grep {ret i16 514}
+; RUN: opt < %s -scalarrepl -instcombine -S | grep "ret i16 514"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "i686-apple-darwin8"
diff --git a/test/Transforms/ScalarRepl/not-a-vector.ll b/test/Transforms/ScalarRepl/not-a-vector.ll
index f873456..67fefb4 100644
--- a/test/Transforms/ScalarRepl/not-a-vector.ll
+++ b/test/Transforms/ScalarRepl/not-a-vector.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -scalarrepl -S | not grep alloca
-; RUN: opt < %s -scalarrepl -S | not grep {7 x double}
-; RUN: opt < %s -scalarrepl -instcombine -S | grep {ret double %B}
+; RUN: opt < %s -scalarrepl -S | not grep "7 x double"
+; RUN: opt < %s -scalarrepl -instcombine -S | grep "ret double %B"
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
define double @test(double %A, double %B) {
diff --git a/test/Transforms/ScalarRepl/union-fp-int.ll b/test/Transforms/ScalarRepl/union-fp-int.ll
index 8b7e50d..6a49918 100644
--- a/test/Transforms/ScalarRepl/union-fp-int.ll
+++ b/test/Transforms/ScalarRepl/union-fp-int.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -scalarrepl -S | \
; RUN: not grep alloca
; RUN: opt < %s -scalarrepl -S | \
-; RUN: grep {bitcast.*float.*i32}
+; RUN: grep "bitcast.*float.*i32"
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
define i32 @test(float %X) {
diff --git a/test/Transforms/ScalarRepl/union-pointer.ll b/test/Transforms/ScalarRepl/union-pointer.ll
index ea4ec14..03d25ac 100644
--- a/test/Transforms/ScalarRepl/union-pointer.ll
+++ b/test/Transforms/ScalarRepl/union-pointer.ll
@@ -1,7 +1,7 @@
; PR892
; RUN: opt < %s -scalarrepl -S | \
; RUN: not grep alloca
-; RUN: opt < %s -scalarrepl -S | grep {ret i8}
+; RUN: opt < %s -scalarrepl -S | grep "ret i8"
target datalayout = "e-p:32:32-n8:16:32"
target triple = "i686-apple-darwin8.7.2"
diff --git a/test/Transforms/ScalarRepl/vector_memcpy.ll b/test/Transforms/ScalarRepl/vector_memcpy.ll
index decbd30..33e8034 100644
--- a/test/Transforms/ScalarRepl/vector_memcpy.ll
+++ b/test/Transforms/ScalarRepl/vector_memcpy.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -scalarrepl -S > %t
-; RUN: grep {ret <16 x float> %A} %t
-; RUN: grep {ret <16 x float> zeroinitializer} %t
+; RUN: grep "ret <16 x float> %A" %t
+; RUN: grep "ret <16 x float> zeroinitializer" %t
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
define <16 x float> @foo(<16 x float> %A) nounwind {
diff --git a/test/Transforms/ScalarRepl/volatile.ll b/test/Transforms/ScalarRepl/volatile.ll
index fadf1aa..056526c 100644
--- a/test/Transforms/ScalarRepl/volatile.ll
+++ b/test/Transforms/ScalarRepl/volatile.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -scalarrepl -S | grep {load volatile}
-; RUN: opt < %s -scalarrepl -S | grep {store volatile}
+; RUN: opt < %s -scalarrepl -S | grep "load volatile"
+; RUN: opt < %s -scalarrepl -S | grep "store volatile"
define i32 @voltest(i32 %T) {
%A = alloca {i32, i32}
diff --git a/test/Transforms/SimplifyCFG/2002-05-05-EmptyBlockMerge.ll b/test/Transforms/SimplifyCFG/2002-05-05-EmptyBlockMerge.ll
index 414235b..feffb4e 100644
--- a/test/Transforms/SimplifyCFG/2002-05-05-EmptyBlockMerge.ll
+++ b/test/Transforms/SimplifyCFG/2002-05-05-EmptyBlockMerge.ll
@@ -1,7 +1,7 @@
; Basic block #2 should not be merged into BB #3!
;
; RUN: opt < %s -simplifycfg -S | \
-; RUN: grep {br label}
+; RUN: grep "br label"
;
declare void @foo()
diff --git a/test/Transforms/SimplifyCFG/2003-08-05-MishandleInvoke.ll b/test/Transforms/SimplifyCFG/2003-08-05-MishandleInvoke.ll
deleted file mode 100644
index bc61a75..0000000
--- a/test/Transforms/SimplifyCFG/2003-08-05-MishandleInvoke.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; Do not remove the invoke!
-;
-; RUN: opt < %s -simplifycfg -S | grep invoke
-
-define i32 @test() {
- invoke i32 @test( )
- to label %Ret unwind label %Ret ; <i32>:1 [#uses=0]
-Ret: ; preds = %0, %0
- %val = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- catch i8* null
- %A = add i32 0, 1 ; <i32> [#uses=1]
- ret i32 %A
-}
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/SimplifyCFG/2003-08-17-BranchFold.ll b/test/Transforms/SimplifyCFG/2003-08-17-BranchFold.ll
index 8ac9ae4..fc89b16 100644
--- a/test/Transforms/SimplifyCFG/2003-08-17-BranchFold.ll
+++ b/test/Transforms/SimplifyCFG/2003-08-17-BranchFold.ll
@@ -2,7 +2,7 @@
; 'br Dest'
; RUN: opt < %s -simplifycfg -S | \
-; RUN: not grep {br i1 %c2}
+; RUN: not grep "br i1 %c2"
declare void @noop()
diff --git a/test/Transforms/SimplifyCFG/2003-08-17-BranchFoldOrdering.ll b/test/Transforms/SimplifyCFG/2003-08-17-BranchFoldOrdering.ll
index 888e187..c1b032f 100644
--- a/test/Transforms/SimplifyCFG/2003-08-17-BranchFoldOrdering.ll
+++ b/test/Transforms/SimplifyCFG/2003-08-17-BranchFoldOrdering.ll
@@ -4,7 +4,7 @@
; the ConstantFoldTerminator function.
; RUN: opt < %s -simplifycfg -S | \
-; RUN: not grep {br i1 %c2}
+; RUN: not grep "br i1 %c2"
declare void @noop()
diff --git a/test/Transforms/SimplifyCFG/2006-10-29-InvokeCrash.ll b/test/Transforms/SimplifyCFG/2006-10-29-InvokeCrash.ll
deleted file mode 100644
index 27d9d8f..0000000
--- a/test/Transforms/SimplifyCFG/2006-10-29-InvokeCrash.ll
+++ /dev/null
@@ -1,567 +0,0 @@
-; RUN: opt < %s -simplifycfg -disable-output
-; END.
- %struct..4._102 = type { %struct.QVectorData* }
- %struct..5._125 = type { %struct.QMapData* }
- %struct.QAbstractTextDocumentLayout = type { %struct.QObject }
- %struct.QBasicAtomic = type { i32 }
- %struct.QFont = type { %struct.QFontPrivate*, i32 }
- %struct.QFontMetrics = type { %struct.QFontPrivate* }
- %struct.QFontPrivate = type opaque
- %"struct.QFragmentMap<QTextBlockData>" = type { %struct.QFragmentMapData }
- %struct.QFragmentMapData = type { %"struct.QFragmentMapData::._154", i32 }
- %"struct.QFragmentMapData::._154" = type { %"struct.QFragmentMapData::Header"* }
- %"struct.QFragmentMapData::Header" = type { i32, i32, i32, i32, i32, i32, i32, i32 }
- %"struct.QHash<uint,QHashDummyValue>" = type { %"struct.QHash<uint,QHashDummyValue>::._152" }
- %"struct.QHash<uint,QHashDummyValue>::._152" = type { %struct.QHashData* }
- %struct.QHashData = type { %"struct.QHashData::Node"*, %"struct.QHashData::Node"**, %struct.QBasicAtomic, i32, i32, i16, i16, i32, i8 }
- %"struct.QHashData::Node" = type { %"struct.QHashData::Node"*, i32 }
- %"struct.QList<QObject*>::._92" = type { %struct.QListData }
- %"struct.QList<QPointer<QObject> >" = type { %"struct.QList<QObject*>::._92" }
- %struct.QListData = type { %"struct.QListData::Data"* }
- %"struct.QListData::Data" = type { %struct.QBasicAtomic, i32, i32, i32, i8, [1 x i8*] }
- %"struct.QMap<QUrl,QVariant>" = type { %struct..5._125 }
- %struct.QMapData = type { %"struct.QMapData::Node"*, [12 x %"struct.QMapData::Node"*], %struct.QBasicAtomic, i32, i32, i32, i8 }
- %"struct.QMapData::Node" = type { %"struct.QMapData::Node"*, [1 x %"struct.QMapData::Node"*] }
- %struct.QObject = type { i32 (...)**, %struct.QObjectData* }
- %struct.QObjectData = type { i32 (...)**, %struct.QObject*, %struct.QObject*, %"struct.QList<QPointer<QObject> >", i8, [3 x i8], i32, i32 }
- %struct.QObjectPrivate = type { %struct.QObjectData, i32, %struct.QObject*, %"struct.QList<QPointer<QObject> >", %"struct.QVector<QAbstractTextDocumentLayout::Selection>", %struct.QString }
- %struct.QPaintDevice = type { i32 (...)**, i16 }
- %struct.QPainter = type { %struct.QPainterPrivate* }
- %struct.QPainterPrivate = type opaque
- %struct.QPointF = type { double, double }
- %struct.QPrinter = type { %struct.QPaintDevice, %struct.QPrinterPrivate* }
- %struct.QPrinterPrivate = type opaque
- %struct.QRectF = type { double, double, double, double }
- %"struct.QSet<uint>" = type { %"struct.QHash<uint,QHashDummyValue>" }
- %"struct.QSharedDataPointer<QTextFormatPrivate>" = type { %struct.QTextFormatPrivate* }
- %struct.QString = type { %"struct.QString::Data"* }
- %"struct.QString::Data" = type { %struct.QBasicAtomic, i32, i32, i16*, i8, i8, [1 x i16] }
- %struct.QTextBlockFormat = type { %struct.QTextFormat }
- %struct.QTextBlockGroup = type { %struct.QAbstractTextDocumentLayout }
- %struct.QTextDocumentConfig = type { %struct.QString }
- %struct.QTextDocumentPrivate = type { %struct.QObjectPrivate, %struct.QString, %"struct.QVector<QAbstractTextDocumentLayout::Selection>", i1, i32, i32, i1, i32, i32, i32, i32, i1, %struct.QTextFormatCollection, %struct.QTextBlockGroup*, %struct.QAbstractTextDocumentLayout*, %"struct.QFragmentMap<QTextBlockData>", %"struct.QFragmentMap<QTextBlockData>", i32, %"struct.QList<QPointer<QObject> >", %"struct.QList<QPointer<QObject> >", %"struct.QMap<QUrl,QVariant>", %"struct.QMap<QUrl,QVariant>", %"struct.QMap<QUrl,QVariant>", %struct.QTextDocumentConfig, i1, i1, %struct.QPointF }
- %struct.QTextFormat = type { %"struct.QSharedDataPointer<QTextFormatPrivate>", i32 }
- %struct.QTextFormatCollection = type { %"struct.QVector<QAbstractTextDocumentLayout::Selection>", %"struct.QVector<QAbstractTextDocumentLayout::Selection>", %"struct.QSet<uint>", %struct.QFont }
- %struct.QTextFormatPrivate = type opaque
- %"struct.QVector<QAbstractTextDocumentLayout::Selection>" = type { %struct..4._102 }
- %struct.QVectorData = type { %struct.QBasicAtomic, i32, i32, i8 }
-
-define void @_ZNK13QTextDocument5printEP8QPrinter(%struct.QAbstractTextDocumentLayout* %this, %struct.QPrinter* %printer) {
-entry:
- %tmp = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=2]
- %tmp.upgrd.1 = alloca %struct.QRectF, align 16 ; <%struct.QRectF*> [#uses=5]
- %tmp2 = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=3]
- %tmp.upgrd.2 = alloca %struct.QFontMetrics, align 16 ; <%struct.QFontMetrics*> [#uses=4]
- %tmp.upgrd.3 = alloca %struct.QFont, align 16 ; <%struct.QFont*> [#uses=4]
- %tmp3 = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=2]
- %p = alloca %struct.QPainter, align 16 ; <%struct.QPainter*> [#uses=14]
- %body = alloca %struct.QRectF, align 16 ; <%struct.QRectF*> [#uses=9]
- %pageNumberPos = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=4]
- %scaledPageSize = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=6]
- %printerPageSize = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=3]
- %fmt = alloca %struct.QTextBlockFormat, align 16 ; <%struct.QTextBlockFormat*> [#uses=5]
- %font = alloca %struct.QFont, align 16 ; <%struct.QFont*> [#uses=5]
- %tmp.upgrd.4 = call %struct.QTextDocumentPrivate* @_ZNK13QTextDocument6d_funcEv( %struct.QAbstractTextDocumentLayout* %this ) ; <%struct.QTextDocumentPrivate*> [#uses=5]
- %tmp.upgrd.5 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- call void @_ZN8QPainterC1EP12QPaintDevice( %struct.QPainter* %p, %struct.QPaintDevice* %tmp.upgrd.5 )
- %tmp.upgrd.6 = invoke i1 @_ZNK8QPainter8isActiveEv( %struct.QPainter* %p )
- to label %invcont unwind label %cleanup329 ; <i1> [#uses=1]
-invcont: ; preds = %entry
- br i1 %tmp.upgrd.6, label %cond_next, label %cleanup328
-cond_next: ; preds = %invcont
- %tmp8 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv( %struct.QAbstractTextDocumentLayout* %this )
- to label %invcont7 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=0]
-invcont7: ; preds = %cond_next
- %tmp10 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26 ; <%struct.QPointF*> [#uses=1]
- call void @_ZN7QPointFC1Edd( %struct.QPointF* %tmp, double 0.000000e+00, double 0.000000e+00 )
- call void @_ZN6QRectFC1ERK7QPointFRK6QSizeF( %struct.QRectF* %body, %struct.QPointF* %tmp, %struct.QPointF* %tmp10 )
- call void @_ZN7QPointFC1Ev( %struct.QPointF* %pageNumberPos )
- %tmp12 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26 ; <%struct.QPointF*> [#uses=1]
- %tmp13 = call i1 @_ZNK6QSizeF7isValidEv( %struct.QPointF* %tmp12 ) ; <i1> [#uses=1]
- br i1 %tmp13, label %cond_next15, label %bb
-cond_next15: ; preds = %invcont7
- %tmp17 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26 ; <%struct.QPointF*> [#uses=1]
- %tmp.upgrd.7 = call double @_ZNK6QSizeF6heightEv( %struct.QPointF* %tmp17 ) ; <double> [#uses=1]
- %tmp18 = fcmp oeq double %tmp.upgrd.7, 0x41DFFFFFFFC00000 ; <i1> [#uses=1]
- br i1 %tmp18, label %bb, label %cond_next20
-cond_next20: ; preds = %cond_next15
- br label %bb21
-bb: ; preds = %cond_next15, %invcont7
- br label %bb21
-bb21: ; preds = %bb, %cond_next20
- %iftmp.406.0 = phi i1 [ false, %bb ], [ true, %cond_next20 ] ; <i1> [#uses=1]
- br i1 %iftmp.406.0, label %cond_true24, label %cond_false
-cond_true24: ; preds = %bb21
- %tmp.upgrd.8 = invoke i32 @_Z13qt_defaultDpiv( )
- to label %invcont25 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont25: ; preds = %cond_true24
- %tmp26 = sitofp i32 %tmp.upgrd.8 to double ; <double> [#uses=2]
- %tmp30 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv( %struct.QAbstractTextDocumentLayout* %this )
- to label %invcont29 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=1]
-invcont29: ; preds = %invcont25
- %tmp32 = invoke %struct.QPaintDevice* @_ZNK27QAbstractTextDocumentLayout11paintDeviceEv( %struct.QAbstractTextDocumentLayout* %tmp30 )
- to label %invcont31 unwind label %cleanup329 ; <%struct.QPaintDevice*> [#uses=3]
-invcont31: ; preds = %invcont29
- %tmp34 = icmp eq %struct.QPaintDevice* %tmp32, null ; <i1> [#uses=1]
- br i1 %tmp34, label %cond_next42, label %cond_true35
-cond_true35: ; preds = %invcont31
- %tmp38 = invoke i32 @_ZNK12QPaintDevice11logicalDpiXEv( %struct.QPaintDevice* %tmp32 )
- to label %invcont37 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont37: ; preds = %cond_true35
- %tmp38.upgrd.9 = sitofp i32 %tmp38 to double ; <double> [#uses=1]
- %tmp41 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp32 )
- to label %invcont40 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont40: ; preds = %invcont37
- %tmp41.upgrd.10 = sitofp i32 %tmp41 to double ; <double> [#uses=1]
- br label %cond_next42
-cond_next42: ; preds = %invcont40, %invcont31
- %sourceDpiY.2 = phi double [ %tmp41.upgrd.10, %invcont40 ], [ %tmp26, %invcont31 ] ; <double> [#uses=1]
- %sourceDpiX.2 = phi double [ %tmp38.upgrd.9, %invcont40 ], [ %tmp26, %invcont31 ] ; <double> [#uses=1]
- %tmp44 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp46 = invoke i32 @_ZNK12QPaintDevice11logicalDpiXEv( %struct.QPaintDevice* %tmp44 )
- to label %invcont45 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont45: ; preds = %cond_next42
- %tmp46.upgrd.11 = sitofp i32 %tmp46 to double ; <double> [#uses=1]
- %tmp48 = fdiv double %tmp46.upgrd.11, %sourceDpiX.2 ; <double> [#uses=2]
- %tmp50 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp52 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp50 )
- to label %invcont51 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont51: ; preds = %invcont45
- %tmp52.upgrd.12 = sitofp i32 %tmp52 to double ; <double> [#uses=1]
- %tmp54 = fdiv double %tmp52.upgrd.12, %sourceDpiY.2 ; <double> [#uses=2]
- invoke void @_ZN8QPainter5scaleEdd( %struct.QPainter* %p, double %tmp48, double %tmp54 )
- to label %invcont57 unwind label %cleanup329
-invcont57: ; preds = %invcont51
- %tmp.upgrd.13 = getelementptr %struct.QPointF* %scaledPageSize, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp60 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26, i32 0 ; <double*> [#uses=1]
- %tmp61 = load double* %tmp60 ; <double> [#uses=1]
- store double %tmp61, double* %tmp.upgrd.13
- %tmp62 = getelementptr %struct.QPointF* %scaledPageSize, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp63 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26, i32 1 ; <double*> [#uses=1]
- %tmp64 = load double* %tmp63 ; <double> [#uses=1]
- store double %tmp64, double* %tmp62
- %tmp65 = call double* @_ZN6QSizeF6rwidthEv( %struct.QPointF* %scaledPageSize ) ; <double*> [#uses=2]
- %tmp67 = load double* %tmp65 ; <double> [#uses=1]
- %tmp69 = fmul double %tmp67, %tmp48 ; <double> [#uses=1]
- store double %tmp69, double* %tmp65
- %tmp71 = call double* @_ZN6QSizeF7rheightEv( %struct.QPointF* %scaledPageSize ) ; <double*> [#uses=2]
- %tmp73 = load double* %tmp71 ; <double> [#uses=1]
- %tmp75 = fmul double %tmp73, %tmp54 ; <double> [#uses=1]
- store double %tmp75, double* %tmp71
- %tmp78 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp80 = invoke i32 @_ZNK12QPaintDevice6heightEv( %struct.QPaintDevice* %tmp78 )
- to label %invcont79 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont79: ; preds = %invcont57
- %tmp82 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp84 = invoke i32 @_ZNK12QPaintDevice5widthEv( %struct.QPaintDevice* %tmp82 )
- to label %invcont83 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont83: ; preds = %invcont79
- %tmp80.upgrd.14 = sitofp i32 %tmp80 to double ; <double> [#uses=1]
- %tmp84.upgrd.15 = sitofp i32 %tmp84 to double ; <double> [#uses=1]
- call void @_ZN6QSizeFC1Edd( %struct.QPointF* %printerPageSize, double %tmp84.upgrd.15, double %tmp80.upgrd.14 )
- %tmp85 = call double @_ZNK6QSizeF6heightEv( %struct.QPointF* %printerPageSize ) ; <double> [#uses=1]
- %tmp86 = call double @_ZNK6QSizeF6heightEv( %struct.QPointF* %scaledPageSize ) ; <double> [#uses=1]
- %tmp87 = fdiv double %tmp85, %tmp86 ; <double> [#uses=1]
- %tmp88 = call double @_ZNK6QSizeF5widthEv( %struct.QPointF* %printerPageSize ) ; <double> [#uses=1]
- %tmp89 = call double @_ZNK6QSizeF5widthEv( %struct.QPointF* %scaledPageSize ) ; <double> [#uses=1]
- %tmp90 = fdiv double %tmp88, %tmp89 ; <double> [#uses=1]
- invoke void @_ZN8QPainter5scaleEdd( %struct.QPainter* %p, double %tmp90, double %tmp87 )
- to label %cond_next194 unwind label %cleanup329
-cond_false: ; preds = %bb21
- %tmp.upgrd.16 = getelementptr %struct.QAbstractTextDocumentLayout* %this, i32 0, i32 0 ; <%struct.QObject*> [#uses=1]
- %tmp95 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument5cloneEP7QObject( %struct.QAbstractTextDocumentLayout* %this, %struct.QObject* %tmp.upgrd.16 )
- to label %invcont94 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=9]
-invcont94: ; preds = %cond_false
- %tmp99 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv( %struct.QAbstractTextDocumentLayout* %tmp95 )
- to label %invcont98 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=1]
-invcont98: ; preds = %invcont94
- %tmp101 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont100 unwind label %cleanup329 ; <%struct.QPaintDevice*> [#uses=1]
-invcont100: ; preds = %invcont98
- invoke void @_ZN27QAbstractTextDocumentLayout14setPaintDeviceEP12QPaintDevice( %struct.QAbstractTextDocumentLayout* %tmp99, %struct.QPaintDevice* %tmp101 )
- to label %invcont103 unwind label %cleanup329
-invcont103: ; preds = %invcont100
- %tmp105 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont104 unwind label %cleanup329 ; <%struct.QPaintDevice*> [#uses=1]
-invcont104: ; preds = %invcont103
- %tmp107 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp105 )
- to label %invcont106 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont106: ; preds = %invcont104
- %tmp108 = sitofp i32 %tmp107 to double ; <double> [#uses=1]
- %tmp109 = fmul double %tmp108, 0x3FE93264C993264C ; <double> [#uses=1]
- %tmp109.upgrd.17 = fptosi double %tmp109 to i32 ; <i32> [#uses=3]
- %tmp.upgrd.18 = call %struct.QTextBlockGroup* @_ZNK13QTextDocument9rootFrameEv( %struct.QAbstractTextDocumentLayout* %tmp95 ) ; <%struct.QTextBlockGroup*> [#uses=1]
- invoke void @_ZNK10QTextFrame11frameFormatEv( %struct.QTextBlockFormat* sret %fmt, %struct.QTextBlockGroup* %tmp.upgrd.18 )
- to label %invcont111 unwind label %cleanup329
-invcont111: ; preds = %invcont106
- %tmp112 = sitofp i32 %tmp109.upgrd.17 to double ; <double> [#uses=1]
- invoke void @_ZN16QTextFrameFormat9setMarginEd( %struct.QTextBlockFormat* %fmt, double %tmp112 )
- to label %invcont114 unwind label %cleanup192
-invcont114: ; preds = %invcont111
- %tmp116 = call %struct.QTextBlockGroup* @_ZNK13QTextDocument9rootFrameEv( %struct.QAbstractTextDocumentLayout* %tmp95 ) ; <%struct.QTextBlockGroup*> [#uses=1]
- invoke void @_ZN10QTextFrame14setFrameFormatERK16QTextFrameFormat( %struct.QTextBlockGroup* %tmp116, %struct.QTextBlockFormat* %fmt )
- to label %invcont117 unwind label %cleanup192
-invcont117: ; preds = %invcont114
- %tmp119 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont118 unwind label %cleanup192 ; <%struct.QPaintDevice*> [#uses=1]
-invcont118: ; preds = %invcont117
- %tmp121 = invoke i32 @_ZNK12QPaintDevice6heightEv( %struct.QPaintDevice* %tmp119 )
- to label %invcont120 unwind label %cleanup192 ; <i32> [#uses=1]
-invcont120: ; preds = %invcont118
- %tmp121.upgrd.19 = sitofp i32 %tmp121 to double ; <double> [#uses=1]
- %tmp123 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont122 unwind label %cleanup192 ; <%struct.QPaintDevice*> [#uses=1]
-invcont122: ; preds = %invcont120
- %tmp125 = invoke i32 @_ZNK12QPaintDevice5widthEv( %struct.QPaintDevice* %tmp123 )
- to label %invcont124 unwind label %cleanup192 ; <i32> [#uses=1]
-invcont124: ; preds = %invcont122
- %tmp125.upgrd.20 = sitofp i32 %tmp125 to double ; <double> [#uses=1]
- call void @_ZN6QRectFC1Edddd( %struct.QRectF* %tmp.upgrd.1, double 0.000000e+00, double 0.000000e+00, double %tmp125.upgrd.20, double %tmp121.upgrd.19 )
- %tmp126 = getelementptr %struct.QRectF* %body, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp127 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp128 = load double* %tmp127 ; <double> [#uses=1]
- store double %tmp128, double* %tmp126
- %tmp129 = getelementptr %struct.QRectF* %body, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp130 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp131 = load double* %tmp130 ; <double> [#uses=1]
- store double %tmp131, double* %tmp129
- %tmp132 = getelementptr %struct.QRectF* %body, i32 0, i32 2 ; <double*> [#uses=1]
- %tmp133 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 2 ; <double*> [#uses=1]
- %tmp134 = load double* %tmp133 ; <double> [#uses=1]
- store double %tmp134, double* %tmp132
- %tmp135 = getelementptr %struct.QRectF* %body, i32 0, i32 3 ; <double*> [#uses=1]
- %tmp136 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 3 ; <double*> [#uses=1]
- %tmp137 = load double* %tmp136 ; <double> [#uses=1]
- store double %tmp137, double* %tmp135
- %tmp138 = call double @_ZNK6QRectF6heightEv( %struct.QRectF* %body ) ; <double> [#uses=1]
- %tmp139 = sitofp i32 %tmp109.upgrd.17 to double ; <double> [#uses=1]
- %tmp140 = fsub double %tmp138, %tmp139 ; <double> [#uses=1]
- %tmp142 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont141 unwind label %cleanup192 ; <%struct.QPaintDevice*> [#uses=1]
-invcont141: ; preds = %invcont124
- invoke void @_ZNK13QTextDocument11defaultFontEv( %struct.QFont* sret %tmp.upgrd.3, %struct.QAbstractTextDocumentLayout* %tmp95 )
- to label %invcont144 unwind label %cleanup192
-invcont144: ; preds = %invcont141
- invoke void @_ZN12QFontMetricsC1ERK5QFontP12QPaintDevice( %struct.QFontMetrics* %tmp.upgrd.2, %struct.QFont* %tmp.upgrd.3, %struct.QPaintDevice* %tmp142 )
- to label %invcont146 unwind label %cleanup173
-invcont146: ; preds = %invcont144
- %tmp149 = invoke i32 @_ZNK12QFontMetrics6ascentEv( %struct.QFontMetrics* %tmp.upgrd.2 )
- to label %invcont148 unwind label %cleanup168 ; <i32> [#uses=1]
-invcont148: ; preds = %invcont146
- %tmp149.upgrd.21 = sitofp i32 %tmp149 to double ; <double> [#uses=1]
- %tmp150 = fadd double %tmp140, %tmp149.upgrd.21 ; <double> [#uses=1]
- %tmp152 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont151 unwind label %cleanup168 ; <%struct.QPaintDevice*> [#uses=1]
-invcont151: ; preds = %invcont148
- %tmp154 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp152 )
- to label %invcont153 unwind label %cleanup168 ; <i32> [#uses=1]
-invcont153: ; preds = %invcont151
- %tmp155 = mul i32 %tmp154, 5 ; <i32> [#uses=1]
- %tmp156 = sdiv i32 %tmp155, 72 ; <i32> [#uses=1]
- %tmp156.upgrd.22 = sitofp i32 %tmp156 to double ; <double> [#uses=1]
- %tmp157 = fadd double %tmp150, %tmp156.upgrd.22 ; <double> [#uses=1]
- %tmp158 = call double @_ZNK6QRectF5widthEv( %struct.QRectF* %body ) ; <double> [#uses=1]
- %tmp159 = sitofp i32 %tmp109.upgrd.17 to double ; <double> [#uses=1]
- %tmp160 = fsub double %tmp158, %tmp159 ; <double> [#uses=1]
- call void @_ZN7QPointFC1Edd( %struct.QPointF* %tmp2, double %tmp160, double %tmp157 )
- %tmp161 = getelementptr %struct.QPointF* %pageNumberPos, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp162 = getelementptr %struct.QPointF* %tmp2, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp163 = load double* %tmp162 ; <double> [#uses=1]
- store double %tmp163, double* %tmp161
- %tmp164 = getelementptr %struct.QPointF* %pageNumberPos, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp165 = getelementptr %struct.QPointF* %tmp2, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp166 = load double* %tmp165 ; <double> [#uses=1]
- store double %tmp166, double* %tmp164
- invoke void @_ZN12QFontMetricsD1Ev( %struct.QFontMetrics* %tmp.upgrd.2 )
- to label %cleanup171 unwind label %cleanup173
-cleanup168: ; preds = %invcont151, %invcont148, %invcont146
- %val168 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN12QFontMetricsD1Ev( %struct.QFontMetrics* %tmp.upgrd.2 )
- to label %cleanup173 unwind label %cleanup173
-cleanup171: ; preds = %invcont153
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %tmp.upgrd.3 )
- to label %finally170 unwind label %cleanup192
-cleanup173: ; preds = %cleanup168, %cleanup168, %invcont153, %invcont144
- %val173 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %tmp.upgrd.3 )
- to label %cleanup192 unwind label %cleanup192
-finally170: ; preds = %cleanup171
- invoke void @_ZNK13QTextDocument11defaultFontEv( %struct.QFont* sret %font, %struct.QAbstractTextDocumentLayout* %tmp95 )
- to label %invcont177 unwind label %cleanup192
-invcont177: ; preds = %finally170
- invoke void @_ZN5QFont12setPointSizeEi( %struct.QFont* %font, i32 10 )
- to label %invcont179 unwind label %cleanup187
-invcont179: ; preds = %invcont177
- invoke void @_ZN13QTextDocument14setDefaultFontERK5QFont( %struct.QAbstractTextDocumentLayout* %tmp95, %struct.QFont* %font )
- to label %invcont181 unwind label %cleanup187
-invcont181: ; preds = %invcont179
- call void @_ZNK6QRectF4sizeEv( %struct.QPointF* sret %tmp3, %struct.QRectF* %body )
- invoke void @_ZN13QTextDocument11setPageSizeERK6QSizeF( %struct.QAbstractTextDocumentLayout* %tmp95, %struct.QPointF* %tmp3 )
- to label %cleanup185 unwind label %cleanup187
-cleanup185: ; preds = %invcont181
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %font )
- to label %cleanup190 unwind label %cleanup192
-cleanup187: ; preds = %invcont181, %invcont179, %invcont177
- %val187 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %font )
- to label %cleanup192 unwind label %cleanup192
-cleanup190: ; preds = %cleanup185
- invoke void @_ZN16QTextFrameFormatD1Ev( %struct.QTextBlockFormat* %fmt )
- to label %cond_next194 unwind label %cleanup329
-cleanup192: ; preds = %cleanup187, %cleanup187, %cleanup185, %finally170, %cleanup173, %cleanup173, %cleanup171, %invcont141, %invcont124, %invcont122, %invcont120, %invcont118, %invcont117, %invcont114, %invcont111
- %val192 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN16QTextFrameFormatD1Ev( %struct.QTextBlockFormat* %fmt )
- to label %cleanup329 unwind label %cleanup329
-cond_next194: ; preds = %cleanup190, %invcont83
- %clonedDoc.1 = phi %struct.QAbstractTextDocumentLayout* [ null, %invcont83 ], [ %tmp95, %cleanup190 ] ; <%struct.QAbstractTextDocumentLayout*> [#uses=3]
- %doc.1 = phi %struct.QAbstractTextDocumentLayout* [ %this, %invcont83 ], [ %tmp95, %cleanup190 ] ; <%struct.QAbstractTextDocumentLayout*> [#uses=2]
- %tmp197 = invoke i1 @_ZNK8QPrinter13collateCopiesEv( %struct.QPrinter* %printer )
- to label %invcont196 unwind label %cleanup329 ; <i1> [#uses=1]
-invcont196: ; preds = %cond_next194
- br i1 %tmp197, label %cond_true200, label %cond_false204
-cond_true200: ; preds = %invcont196
- %tmp203 = invoke i32 @_ZNK8QPrinter9numCopiesEv( %struct.QPrinter* %printer )
- to label %invcont202 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont202: ; preds = %cond_true200
- br label %cond_next208
-cond_false204: ; preds = %invcont196
- %tmp207 = invoke i32 @_ZNK8QPrinter9numCopiesEv( %struct.QPrinter* %printer )
- to label %invcont206 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont206: ; preds = %cond_false204
- br label %cond_next208
-cond_next208: ; preds = %invcont206, %invcont202
- %pageCopies.0 = phi i32 [ %tmp203, %invcont202 ], [ 1, %invcont206 ] ; <i32> [#uses=2]
- %docCopies.0 = phi i32 [ 1, %invcont202 ], [ %tmp207, %invcont206 ] ; <i32> [#uses=2]
- %tmp211 = invoke i32 @_ZNK8QPrinter8fromPageEv( %struct.QPrinter* %printer )
- to label %invcont210 unwind label %cleanup329 ; <i32> [#uses=3]
-invcont210: ; preds = %cond_next208
- %tmp214 = invoke i32 @_ZNK8QPrinter6toPageEv( %struct.QPrinter* %printer )
- to label %invcont213 unwind label %cleanup329 ; <i32> [#uses=3]
-invcont213: ; preds = %invcont210
- %tmp216 = icmp eq i32 %tmp211, 0 ; <i1> [#uses=1]
- br i1 %tmp216, label %cond_true217, label %cond_next225
-cond_true217: ; preds = %invcont213
- %tmp219 = icmp eq i32 %tmp214, 0 ; <i1> [#uses=1]
- br i1 %tmp219, label %cond_true220, label %cond_next225
-cond_true220: ; preds = %cond_true217
- %tmp223 = invoke i32 @_ZNK13QTextDocument9pageCountEv( %struct.QAbstractTextDocumentLayout* %doc.1 )
- to label %invcont222 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont222: ; preds = %cond_true220
- br label %cond_next225
-cond_next225: ; preds = %invcont222, %cond_true217, %invcont213
- %toPage.1 = phi i32 [ %tmp223, %invcont222 ], [ %tmp214, %cond_true217 ], [ %tmp214, %invcont213 ] ; <i32> [#uses=2]
- %fromPage.1 = phi i32 [ 1, %invcont222 ], [ %tmp211, %cond_true217 ], [ %tmp211, %invcont213 ] ; <i32> [#uses=2]
- %tmp.page = invoke i32 @_ZNK8QPrinter9pageOrderEv( %struct.QPrinter* %printer )
- to label %invcont227 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont227: ; preds = %cond_next225
- %tmp228 = icmp eq i32 %tmp.page, 1 ; <i1> [#uses=1]
- br i1 %tmp228, label %cond_true230, label %cond_next234
-cond_true230: ; preds = %invcont227
- br label %cond_next234
-cond_next234: ; preds = %cond_true230, %invcont227
- %ascending.1 = phi i1 [ false, %cond_true230 ], [ true, %invcont227 ] ; <i1> [#uses=1]
- %toPage.2 = phi i32 [ %fromPage.1, %cond_true230 ], [ %toPage.1, %invcont227 ] ; <i32> [#uses=1]
- %fromPage.2 = phi i32 [ %toPage.1, %cond_true230 ], [ %fromPage.1, %invcont227 ] ; <i32> [#uses=1]
- br label %bb309
-bb237: ; preds = %cond_true313, %cond_next293
- %iftmp.410.4 = phi i1 [ %iftmp.410.5, %cond_true313 ], [ %iftmp.410.1, %cond_next293 ] ; <i1> [#uses=1]
- %page.4 = phi i32 [ %fromPage.2, %cond_true313 ], [ %page.3, %cond_next293 ] ; <i32> [#uses=4]
- br label %bb273
-invcont240: ; preds = %cond_true277
- %tmp242 = icmp eq i32 %tmp241, 2 ; <i1> [#uses=1]
- br i1 %tmp242, label %bb252, label %cond_next244
-cond_next244: ; preds = %invcont240
- %tmp247 = invoke i32 @_ZNK8QPrinter12printerStateEv( %struct.QPrinter* %printer )
- to label %invcont246 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont246: ; preds = %cond_next244
- %tmp248 = icmp eq i32 %tmp247, 3 ; <i1> [#uses=1]
- br i1 %tmp248, label %bb252, label %bb253
-bb252: ; preds = %invcont246, %invcont240
- br label %bb254
-bb253: ; preds = %invcont246
- br label %bb254
-bb254: ; preds = %bb253, %bb252
- %iftmp.410.0 = phi i1 [ true, %bb252 ], [ false, %bb253 ] ; <i1> [#uses=2]
- br i1 %iftmp.410.0, label %UserCanceled, label %cond_next258
-cond_next258: ; preds = %bb254
- invoke fastcc void @_Z9printPageiP8QPainterPK13QTextDocumentRK6QRectFRK7QPointF( i32 %page.4, %struct.QPainter* %p, %struct.QAbstractTextDocumentLayout* %doc.1, %struct.QRectF* %body, %struct.QPointF* %pageNumberPos )
- to label %invcont261 unwind label %cleanup329
-invcont261: ; preds = %cond_next258
- %tmp263 = add i32 %pageCopies.0, -1 ; <i32> [#uses=1]
- %tmp265 = icmp sgt i32 %tmp263, %j.4 ; <i1> [#uses=1]
- br i1 %tmp265, label %cond_true266, label %cond_next270
-cond_true266: ; preds = %invcont261
- %tmp269 = invoke i1 @_ZN8QPrinter7newPageEv( %struct.QPrinter* %printer )
- to label %cond_next270 unwind label %cleanup329 ; <i1> [#uses=0]
-cond_next270: ; preds = %cond_true266, %invcont261
- %tmp272 = add i32 %j.4, 1 ; <i32> [#uses=1]
- br label %bb273
-bb273: ; preds = %cond_next270, %bb237
- %iftmp.410.1 = phi i1 [ %iftmp.410.4, %bb237 ], [ %iftmp.410.0, %cond_next270 ] ; <i1> [#uses=2]
- %j.4 = phi i32 [ 0, %bb237 ], [ %tmp272, %cond_next270 ] ; <i32> [#uses=3]
- %tmp276 = icmp slt i32 %j.4, %pageCopies.0 ; <i1> [#uses=1]
- br i1 %tmp276, label %cond_true277, label %bb280
-cond_true277: ; preds = %bb273
- %tmp241 = invoke i32 @_ZNK8QPrinter12printerStateEv( %struct.QPrinter* %printer )
- to label %invcont240 unwind label %cleanup329 ; <i32> [#uses=1]
-bb280: ; preds = %bb273
- %tmp283 = icmp eq i32 %page.4, %toPage.2 ; <i1> [#uses=1]
- br i1 %tmp283, label %bb297, label %cond_next285
-cond_next285: ; preds = %bb280
- br i1 %ascending.1, label %cond_true287, label %cond_false290
-cond_true287: ; preds = %cond_next285
- %tmp289 = add i32 %page.4, 1 ; <i32> [#uses=1]
- br label %cond_next293
-cond_false290: ; preds = %cond_next285
- %tmp292 = add i32 %page.4, -1 ; <i32> [#uses=1]
- br label %cond_next293
-cond_next293: ; preds = %cond_false290, %cond_true287
- %page.3 = phi i32 [ %tmp289, %cond_true287 ], [ %tmp292, %cond_false290 ] ; <i32> [#uses=1]
- %tmp296 = invoke i1 @_ZN8QPrinter7newPageEv( %struct.QPrinter* %printer )
- to label %bb237 unwind label %cleanup329 ; <i1> [#uses=0]
-bb297: ; preds = %bb280
- %tmp299 = add i32 %docCopies.0, -1 ; <i32> [#uses=1]
- %tmp301 = icmp sgt i32 %tmp299, %i.1 ; <i1> [#uses=1]
- br i1 %tmp301, label %cond_true302, label %cond_next306
-cond_true302: ; preds = %bb297
- %tmp305 = invoke i1 @_ZN8QPrinter7newPageEv( %struct.QPrinter* %printer )
- to label %cond_next306 unwind label %cleanup329 ; <i1> [#uses=0]
-cond_next306: ; preds = %cond_true302, %bb297
- %tmp308 = add i32 %i.1, 1 ; <i32> [#uses=1]
- br label %bb309
-bb309: ; preds = %cond_next306, %cond_next234
- %iftmp.410.5 = phi i1 [ undef, %cond_next234 ], [ %iftmp.410.1, %cond_next306 ] ; <i1> [#uses=1]
- %i.1 = phi i32 [ 0, %cond_next234 ], [ %tmp308, %cond_next306 ] ; <i32> [#uses=3]
- %tmp312 = icmp slt i32 %i.1, %docCopies.0 ; <i1> [#uses=1]
- br i1 %tmp312, label %cond_true313, label %UserCanceled
-cond_true313: ; preds = %bb309
- br label %bb237
-UserCanceled: ; preds = %bb309, %bb254
- %tmp318 = icmp eq %struct.QAbstractTextDocumentLayout* %clonedDoc.1, null ; <i1> [#uses=1]
- br i1 %tmp318, label %cleanup327, label %cond_true319
-cond_true319: ; preds = %UserCanceled
- %tmp.upgrd.23 = getelementptr %struct.QAbstractTextDocumentLayout* %clonedDoc.1, i32 0, i32 0, i32 0 ; <i32 (...)***> [#uses=1]
- %tmp.upgrd.24 = load i32 (...)*** %tmp.upgrd.23 ; <i32 (...)**> [#uses=1]
- %tmp322 = getelementptr i32 (...)** %tmp.upgrd.24, i32 4 ; <i32 (...)**> [#uses=1]
- %tmp.upgrd.25 = load i32 (...)** %tmp322 ; <i32 (...)*> [#uses=1]
- %tmp.upgrd.26 = bitcast i32 (...)* %tmp.upgrd.25 to void (%struct.QAbstractTextDocumentLayout*)* ; <void (%struct.QAbstractTextDocumentLayout*)*> [#uses=1]
- invoke void %tmp.upgrd.26( %struct.QAbstractTextDocumentLayout* %clonedDoc.1 )
- to label %cleanup327 unwind label %cleanup329
-cleanup327: ; preds = %cond_true319, %UserCanceled
- call void @_ZN8QPainterD1Ev( %struct.QPainter* %p )
- ret void
-cleanup328: ; preds = %invcont
- call void @_ZN8QPainterD1Ev( %struct.QPainter* %p )
- ret void
-cleanup329: ; preds = %cond_true319, %cond_true302, %cond_next293, %cond_true277, %cond_true266, %cond_next258, %cond_next244, %cond_next225, %cond_true220, %invcont210, %cond_next208, %cond_false204, %cond_true200, %cond_next194, %cleanup192, %cleanup192, %cleanup190, %invcont106, %invcont104, %invcont103, %invcont100, %invcont98, %invcont94, %cond_false, %invcont83, %invcont79, %invcont57, %invcont51, %invcont45, %cond_next42, %invcont37, %cond_true35, %invcont29, %invcont25, %cond_true24, %cond_next, %entry
- %val = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- call void @_ZN8QPainterD1Ev( %struct.QPainter* %p )
- resume { i8*, i32 } %val
-}
-
-declare void @_ZN6QSizeFC1Edd(%struct.QPointF*, double, double)
-
-declare i1 @_ZNK6QSizeF7isValidEv(%struct.QPointF*)
-
-declare double @_ZNK6QSizeF5widthEv(%struct.QPointF*)
-
-declare double @_ZNK6QSizeF6heightEv(%struct.QPointF*)
-
-declare double* @_ZN6QSizeF6rwidthEv(%struct.QPointF*)
-
-declare double* @_ZN6QSizeF7rheightEv(%struct.QPointF*)
-
-declare %struct.QTextDocumentPrivate* @_ZNK13QTextDocument6d_funcEv(%struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZN7QPointFC1Ev(%struct.QPointF*)
-
-declare void @_ZN7QPointFC1Edd(%struct.QPointF*, double, double)
-
-declare void @_ZN16QTextFrameFormat9setMarginEd(%struct.QTextBlockFormat*, double)
-
-declare void @_ZN6QRectFC1Edddd(%struct.QRectF*, double, double, double, double)
-
-declare void @_ZN6QRectFC1ERK7QPointFRK6QSizeF(%struct.QRectF*, %struct.QPointF*, %struct.QPointF*)
-
-declare double @_ZNK6QRectF5widthEv(%struct.QRectF*)
-
-declare double @_ZNK6QRectF6heightEv(%struct.QRectF*)
-
-declare void @_ZNK6QRectF4sizeEv(%struct.QPointF*, %struct.QRectF*)
-
-declare void @_ZN16QTextFrameFormatD1Ev(%struct.QTextBlockFormat*)
-
-declare void @_ZNK10QTextFrame11frameFormatEv(%struct.QTextBlockFormat*, %struct.QTextBlockGroup*)
-
-declare void @_ZN10QTextFrame14setFrameFormatERK16QTextFrameFormat(%struct.QTextBlockGroup*, %struct.QTextBlockFormat*)
-
-declare i32 @_ZNK12QPaintDevice5widthEv(%struct.QPaintDevice*)
-
-declare i32 @_ZNK12QPaintDevice6heightEv(%struct.QPaintDevice*)
-
-declare i32 @_ZNK12QPaintDevice11logicalDpiXEv(%struct.QPaintDevice*)
-
-declare i32 @_ZNK12QPaintDevice11logicalDpiYEv(%struct.QPaintDevice*)
-
-declare %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument5cloneEP7QObject(%struct.QAbstractTextDocumentLayout*, %struct.QObject*)
-
-declare void @_ZN5QFontD1Ev(%struct.QFont*)
-
-declare %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv(%struct.QAbstractTextDocumentLayout*)
-
-declare %struct.QTextBlockGroup* @_ZNK13QTextDocument9rootFrameEv(%struct.QAbstractTextDocumentLayout*)
-
-declare i32 @_ZNK13QTextDocument9pageCountEv(%struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZNK13QTextDocument11defaultFontEv(%struct.QFont*, %struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZN13QTextDocument14setDefaultFontERK5QFont(%struct.QAbstractTextDocumentLayout*, %struct.QFont*)
-
-declare void @_ZN13QTextDocument11setPageSizeERK6QSizeF(%struct.QAbstractTextDocumentLayout*, %struct.QPointF*)
-
-declare void @_Z9printPageiP8QPainterPK13QTextDocumentRK6QRectFRK7QPointF(i32, %struct.QPainter*, %struct.QAbstractTextDocumentLayout*, %struct.QRectF*, %struct.QPointF*)
-
-declare void @_ZN12QFontMetricsD1Ev(%struct.QFontMetrics*)
-
-declare void @_ZN8QPainterC1EP12QPaintDevice(%struct.QPainter*, %struct.QPaintDevice*)
-
-declare i1 @_ZNK8QPainter8isActiveEv(%struct.QPainter*)
-
-declare i32 @_Z13qt_defaultDpiv()
-
-declare %struct.QPaintDevice* @_ZNK27QAbstractTextDocumentLayout11paintDeviceEv(%struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZN8QPainter5scaleEdd(%struct.QPainter*, double, double)
-
-declare %struct.QPaintDevice* @_ZNK8QPainter6deviceEv(%struct.QPainter*)
-
-declare void @_ZN27QAbstractTextDocumentLayout14setPaintDeviceEP12QPaintDevice(%struct.QAbstractTextDocumentLayout*, %struct.QPaintDevice*)
-
-declare void @_ZN12QFontMetricsC1ERK5QFontP12QPaintDevice(%struct.QFontMetrics*, %struct.QFont*, %struct.QPaintDevice*)
-
-declare i32 @_ZNK12QFontMetrics6ascentEv(%struct.QFontMetrics*)
-
-declare void @_ZN5QFont12setPointSizeEi(%struct.QFont*, i32)
-
-declare i1 @_ZNK8QPrinter13collateCopiesEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter9numCopiesEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter8fromPageEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter6toPageEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter9pageOrderEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter12printerStateEv(%struct.QPrinter*)
-
-declare i1 @_ZN8QPrinter7newPageEv(%struct.QPrinter*)
-
-declare void @_ZN8QPainterD1Ev(%struct.QPainter*)
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll b/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll
index 00f2d5b..14baeea 100644
--- a/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll
+++ b/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll
@@ -1,5 +1,5 @@
; The phi should not be eliminated in this case, because the fp op could trap.
-; RUN: opt < %s -simplifycfg -S | grep {= phi double}
+; RUN: opt < %s -simplifycfg -S | grep "= phi double"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
diff --git a/test/Transforms/SimplifyCFG/2008-05-16-PHIBlockMerge.ll b/test/Transforms/SimplifyCFG/2008-05-16-PHIBlockMerge.ll
index 56f43b6..13ccad6 100644
--- a/test/Transforms/SimplifyCFG/2008-05-16-PHIBlockMerge.ll
+++ b/test/Transforms/SimplifyCFG/2008-05-16-PHIBlockMerge.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -simplifycfg -S > %t
-; RUN: not grep {^BB.tomerge} %t
-; RUN: grep {^BB.nomerge} %t | count 2
+; RUN: not grep "^BB.tomerge" %t
+; RUN: grep "^BB.nomerge" %t | count 2
; ModuleID = '<stdin>'
declare i1 @foo()
diff --git a/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll b/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll
index d025dee..9b6084f 100644
--- a/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll
+++ b/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplifycfg -S | grep {%outval = phi i32 .*mux}
+; RUN: opt < %s -simplifycfg -S | grep "%outval = phi i32 .*mux"
; PR2540
; Outval should end up with a select from 0/2, not all constants.
diff --git a/test/Transforms/SimplifyCFG/2009-06-15-InvokeCrash.ll b/test/Transforms/SimplifyCFG/2009-06-15-InvokeCrash.ll
deleted file mode 100644
index abf4455..0000000
--- a/test/Transforms/SimplifyCFG/2009-06-15-InvokeCrash.ll
+++ /dev/null
@@ -1,569 +0,0 @@
-; RUN: opt < %s -simplifycfg -disable-output
-; END.
- %struct..4._102 = type { %struct.QVectorData* }
- %struct..5._125 = type { %struct.QMapData* }
- %struct.QAbstractTextDocumentLayout = type { %struct.QObject }
- %struct.QBasicAtomic = type { i32 }
- %struct.QFont = type { %struct.QFontPrivate*, i32 }
- %struct.QFontMetrics = type { %struct.QFontPrivate* }
- %struct.QFontPrivate = type opaque
- %"struct.QFragmentMap<QTextBlockData>" = type { %struct.QFragmentMapData }
- %struct.QFragmentMapData = type { %"struct.QFragmentMapData::._154", i32 }
- %"struct.QFragmentMapData::._154" = type { %"struct.QFragmentMapData::Header"* }
- %"struct.QFragmentMapData::Header" = type { i32, i32, i32, i32, i32, i32, i32, i32 }
- %"struct.QHash<uint,QHashDummyValue>" = type { %"struct.QHash<uint,QHashDummyValue>::._152" }
- %"struct.QHash<uint,QHashDummyValue>::._152" = type { %struct.QHashData* }
- %struct.QHashData = type { %"struct.QHashData::Node"*, %"struct.QHashData::Node"**, %struct.QBasicAtomic, i32, i32, i16, i16, i32, i8 }
- %"struct.QHashData::Node" = type { %"struct.QHashData::Node"*, i32 }
- %"struct.QList<QObject*>::._92" = type { %struct.QListData }
- %"struct.QList<QPointer<QObject> >" = type { %"struct.QList<QObject*>::._92" }
- %struct.QListData = type { %"struct.QListData::Data"* }
- %"struct.QListData::Data" = type { %struct.QBasicAtomic, i32, i32, i32, i8, [1 x i8*] }
- %"struct.QMap<QUrl,QVariant>" = type { %struct..5._125 }
- %struct.QMapData = type { %"struct.QMapData::Node"*, [12 x %"struct.QMapData::Node"*], %struct.QBasicAtomic, i32, i32, i32, i8 }
- %"struct.QMapData::Node" = type { %"struct.QMapData::Node"*, [1 x %"struct.QMapData::Node"*] }
- %struct.QObject = type { i32 (...)**, %struct.QObjectData* }
- %struct.QObjectData = type { i32 (...)**, %struct.QObject*, %struct.QObject*, %"struct.QList<QPointer<QObject> >", i8, [3 x i8], i32, i32 }
- %struct.QObjectPrivate = type { %struct.QObjectData, i32, %struct.QObject*, %"struct.QList<QPointer<QObject> >", %"struct.QVector<QAbstractTextDocumentLayout::Selection>", %struct.QString }
- %struct.QPaintDevice = type { i32 (...)**, i16 }
- %struct.QPainter = type { %struct.QPainterPrivate* }
- %struct.QPainterPrivate = type opaque
- %struct.QPointF = type { double, double }
- %struct.QPrinter = type { %struct.QPaintDevice, %struct.QPrinterPrivate* }
- %struct.QPrinterPrivate = type opaque
- %struct.QRectF = type { double, double, double, double }
- %"struct.QSet<uint>" = type { %"struct.QHash<uint,QHashDummyValue>" }
- %"struct.QSharedDataPointer<QTextFormatPrivate>" = type { %struct.QTextFormatPrivate* }
- %struct.QString = type { %"struct.QString::Data"* }
- %"struct.QString::Data" = type { %struct.QBasicAtomic, i32, i32, i16*, i8, i8, [1 x i16] }
- %struct.QTextBlockFormat = type { %struct.QTextFormat }
- %struct.QTextBlockGroup = type { %struct.QAbstractTextDocumentLayout }
- %struct.QTextDocumentConfig = type { %struct.QString }
- %struct.QTextDocumentPrivate = type { %struct.QObjectPrivate, %struct.QString, %"struct.QVector<QAbstractTextDocumentLayout::Selection>", i1, i32, i32, i1, i32, i32, i32, i32, i1, %struct.QTextFormatCollection, %struct.QTextBlockGroup*, %struct.QAbstractTextDocumentLayout*, %"struct.QFragmentMap<QTextBlockData>", %"struct.QFragmentMap<QTextBlockData>", i32, %"struct.QList<QPointer<QObject> >", %"struct.QList<QPointer<QObject> >", %"struct.QMap<QUrl,QVariant>", %"struct.QMap<QUrl,QVariant>", %"struct.QMap<QUrl,QVariant>", %struct.QTextDocumentConfig, i1, i1, %struct.QPointF }
- %struct.QTextFormat = type { %"struct.QSharedDataPointer<QTextFormatPrivate>", i32 }
- %struct.QTextFormatCollection = type { %"struct.QVector<QAbstractTextDocumentLayout::Selection>", %"struct.QVector<QAbstractTextDocumentLayout::Selection>", %"struct.QSet<uint>", %struct.QFont }
- %struct.QTextFormatPrivate = type opaque
- %"struct.QVector<QAbstractTextDocumentLayout::Selection>" = type { %struct..4._102 }
- %struct.QVectorData = type { %struct.QBasicAtomic, i32, i32, i8 }
-
-define void @_ZNK13QTextDocument5printEP8QPrinter(%struct.QAbstractTextDocumentLayout* %this, %struct.QPrinter* %printer) {
-entry:
- %tmp = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=2]
- %tmp.upgrd.1 = alloca %struct.QRectF, align 16 ; <%struct.QRectF*> [#uses=5]
- %tmp2 = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=3]
- %tmp.upgrd.2 = alloca %struct.QFontMetrics, align 16 ; <%struct.QFontMetrics*> [#uses=4]
- %tmp.upgrd.3 = alloca %struct.QFont, align 16 ; <%struct.QFont*> [#uses=4]
- %tmp3 = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=2]
- %p = alloca %struct.QPainter, align 16 ; <%struct.QPainter*> [#uses=14]
- %body = alloca %struct.QRectF, align 16 ; <%struct.QRectF*> [#uses=9]
- %foo = alloca double, align 8
- %bar = alloca double, align 8
- %pageNumberPos = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=4]
- %scaledPageSize = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=6]
- %printerPageSize = alloca %struct.QPointF, align 16 ; <%struct.QPointF*> [#uses=3]
- %fmt = alloca %struct.QTextBlockFormat, align 16 ; <%struct.QTextBlockFormat*> [#uses=5]
- %font = alloca %struct.QFont, align 16 ; <%struct.QFont*> [#uses=5]
- %tmp.upgrd.4 = call %struct.QTextDocumentPrivate* @_ZNK13QTextDocument6d_funcEv( %struct.QAbstractTextDocumentLayout* %this ) ; <%struct.QTextDocumentPrivate*> [#uses=5]
- %tmp.upgrd.5 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- call void @_ZN8QPainterC1EP12QPaintDevice( %struct.QPainter* %p, %struct.QPaintDevice* %tmp.upgrd.5 )
- %tmp.upgrd.6 = invoke i1 @_ZNK8QPainter8isActiveEv( %struct.QPainter* %p )
- to label %invcont unwind label %cleanup329 ; <i1> [#uses=1]
-invcont: ; preds = %entry
- br i1 %tmp.upgrd.6, label %cond_next, label %cleanup328
-cond_next: ; preds = %invcont
- %tmp8 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv( %struct.QAbstractTextDocumentLayout* %this )
- to label %invcont7 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=0]
-invcont7: ; preds = %cond_next
- %tmp10 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26 ; <%struct.QPointF*> [#uses=1]
- call void @_ZN7QPointFC1Edd( %struct.QPointF* %tmp, double 0.000000e+00, double 0.000000e+00 )
- call void @_ZN6QRectFC1ERK7QPointFRK6QSizeF( %struct.QRectF* %body, %struct.QPointF* %tmp, %struct.QPointF* %tmp10 )
- call void @_ZN7QPointFC1Ev( %struct.QPointF* %pageNumberPos )
- %tmp12 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26 ; <%struct.QPointF*> [#uses=1]
- %tmp13 = call i1 @_ZNK6QSizeF7isValidEv( %struct.QPointF* %tmp12 ) ; <i1> [#uses=1]
- br i1 %tmp13, label %cond_next15, label %bb
-cond_next15: ; preds = %invcont7
- %tmp17 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26 ; <%struct.QPointF*> [#uses=1]
- %tmp.upgrd.7 = call double @_ZNK6QSizeF6heightEv( %struct.QPointF* %tmp17 ) ; <double> [#uses=1]
- %tmp18 = fcmp oeq double %tmp.upgrd.7, 0x41DFFFFFFFC00000 ; <i1> [#uses=1]
- br i1 %tmp18, label %bb, label %cond_next20
-cond_next20: ; preds = %cond_next15
- br label %bb21
-bb: ; preds = %cond_next15, %invcont7
- br label %bb21
-bb21: ; preds = %bb, %cond_next20
- %iftmp.406.0 = phi i1 [ false, %bb ], [ true, %cond_next20 ] ; <i1> [#uses=1]
- br i1 %iftmp.406.0, label %cond_true24, label %cond_false
-cond_true24: ; preds = %bb21
- %tmp.upgrd.8 = invoke i32 @_Z13qt_defaultDpiv( )
- to label %invcont25 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont25: ; preds = %cond_true24
- %tmp26 = sitofp i32 %tmp.upgrd.8 to double ; <double> [#uses=2]
- %tmp30 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv( %struct.QAbstractTextDocumentLayout* %this )
- to label %invcont29 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=1]
-invcont29: ; preds = %invcont25
- %tmp32 = invoke %struct.QPaintDevice* @_ZNK27QAbstractTextDocumentLayout11paintDeviceEv( %struct.QAbstractTextDocumentLayout* %tmp30 )
- to label %invcont31 unwind label %cleanup329 ; <%struct.QPaintDevice*> [#uses=3]
-invcont31: ; preds = %invcont29
- %tmp34 = icmp eq %struct.QPaintDevice* %tmp32, null ; <i1> [#uses=1]
- br i1 %tmp34, label %cond_next42, label %cond_true35
-cond_true35: ; preds = %invcont31
- %tmp38 = invoke i32 @_ZNK12QPaintDevice11logicalDpiXEv( %struct.QPaintDevice* %tmp32 )
- to label %invcont37 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont37: ; preds = %cond_true35
- %tmp38.upgrd.9 = sitofp i32 %tmp38 to double ; <double> [#uses=1]
- %tmp41 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp32 )
- to label %invcont40 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont40: ; preds = %invcont37
- %tmp41.upgrd.10 = sitofp i32 %tmp41 to double ; <double> [#uses=1]
- br label %cond_next42
-cond_next42: ; preds = %invcont40, %invcont31
- %sourceDpiY.2 = phi double [ %tmp41.upgrd.10, %invcont40 ], [ %tmp26, %invcont31 ] ; <double> [#uses=1]
- %sourceDpiX.2 = phi double [ %tmp38.upgrd.9, %invcont40 ], [ %tmp26, %invcont31 ] ; <double> [#uses=1]
- %tmp44 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp46 = invoke i32 @_ZNK12QPaintDevice11logicalDpiXEv( %struct.QPaintDevice* %tmp44 )
- to label %invcont45 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont45: ; preds = %cond_next42
- %tmp46.upgrd.11 = sitofp i32 %tmp46 to double ; <double> [#uses=1]
- %tmp48 = fdiv double %tmp46.upgrd.11, %sourceDpiX.2 ; <double> [#uses=2]
- %tmp50 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp52 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp50 )
- to label %invcont51 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont51: ; preds = %invcont45
- %tmp52.upgrd.12 = sitofp i32 %tmp52 to double ; <double> [#uses=1]
- %tmp54 = fdiv double %tmp52.upgrd.12, %sourceDpiY.2 ; <double> [#uses=2]
- invoke void @_ZN8QPainter5scaleEdd( %struct.QPainter* %p, double %tmp48, double %tmp54 )
- to label %invcont57 unwind label %cleanup329
-invcont57: ; preds = %invcont51
- %tmp.upgrd.13 = getelementptr %struct.QPointF* %scaledPageSize, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp60 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26, i32 0 ; <double*> [#uses=1]
- %tmp61 = load double* %tmp60 ; <double> [#uses=1]
- store double %tmp61, double* %tmp.upgrd.13
- %tmp62 = getelementptr %struct.QPointF* %scaledPageSize, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp63 = getelementptr %struct.QTextDocumentPrivate* %tmp.upgrd.4, i32 0, i32 26, i32 1 ; <double*> [#uses=1]
- %tmp64 = load double* %tmp63 ; <double> [#uses=1]
- store double %tmp64, double* %tmp62
- %tmp65 = call double* @_ZN6QSizeF6rwidthEv( %struct.QPointF* %scaledPageSize ) ; <double*> [#uses=2]
- %tmp67 = load double* %tmp65 ; <double> [#uses=1]
- %tmp69 = fmul double %tmp67, %tmp48 ; <double> [#uses=1]
- store double %tmp69, double* %tmp65
- %tmp71 = call double* @_ZN6QSizeF7rheightEv( %struct.QPointF* %scaledPageSize ) ; <double*> [#uses=2]
- %tmp73 = load double* %tmp71 ; <double> [#uses=1]
- %tmp75 = fmul double %tmp73, %tmp54 ; <double> [#uses=1]
- store double %tmp75, double* %tmp71
- %tmp78 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp80 = invoke i32 @_ZNK12QPaintDevice6heightEv( %struct.QPaintDevice* %tmp78 )
- to label %invcont79 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont79: ; preds = %invcont57
- %tmp82 = getelementptr %struct.QPrinter* %printer, i32 0, i32 0 ; <%struct.QPaintDevice*> [#uses=1]
- %tmp84 = invoke i32 @_ZNK12QPaintDevice5widthEv( %struct.QPaintDevice* %tmp82 )
- to label %invcont83 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont83: ; preds = %invcont79
- %tmp80.upgrd.14 = sitofp i32 %tmp80 to double ; <double> [#uses=1]
- %tmp84.upgrd.15 = sitofp i32 %tmp84 to double ; <double> [#uses=1]
- call void @_ZN6QSizeFC1Edd( %struct.QPointF* %printerPageSize, double %tmp84.upgrd.15, double %tmp80.upgrd.14 )
- %tmp85 = call double @_ZNK6QSizeF6heightEv( %struct.QPointF* %printerPageSize ) ; <double> [#uses=1]
- %tmp86 = call double @_ZNK6QSizeF6heightEv( %struct.QPointF* %scaledPageSize ) ; <double> [#uses=1]
- %tmp87 = fdiv double %tmp85, %tmp86 ; <double> [#uses=1]
- %tmp88 = call double @_ZNK6QSizeF5widthEv( %struct.QPointF* %printerPageSize ) ; <double> [#uses=1]
- %tmp89 = call double @_ZNK6QSizeF5widthEv( %struct.QPointF* %scaledPageSize ) ; <double> [#uses=1]
- %tmp90 = fdiv double %tmp88, %tmp89 ; <double> [#uses=1]
- invoke void @_ZN8QPainter5scaleEdd( %struct.QPainter* %p, double %tmp90, double %tmp87 )
- to label %cond_next194 unwind label %cleanup329
-cond_false: ; preds = %bb21
- %tmp.upgrd.16 = getelementptr %struct.QAbstractTextDocumentLayout* %this, i32 0, i32 0 ; <%struct.QObject*> [#uses=1]
- %tmp95 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument5cloneEP7QObject( %struct.QAbstractTextDocumentLayout* %this, %struct.QObject* %tmp.upgrd.16 )
- to label %invcont94 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=9]
-invcont94: ; preds = %cond_false
- %tmp99 = invoke %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv( %struct.QAbstractTextDocumentLayout* %tmp95 )
- to label %invcont98 unwind label %cleanup329 ; <%struct.QAbstractTextDocumentLayout*> [#uses=1]
-invcont98: ; preds = %invcont94
- %tmp101 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont100 unwind label %cleanup329 ; <%struct.QPaintDevice*> [#uses=1]
-invcont100: ; preds = %invcont98
- invoke void @_ZN27QAbstractTextDocumentLayout14setPaintDeviceEP12QPaintDevice( %struct.QAbstractTextDocumentLayout* %tmp99, %struct.QPaintDevice* %tmp101 )
- to label %invcont103 unwind label %cleanup329
-invcont103: ; preds = %invcont100
- %tmp105 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont104 unwind label %cleanup329 ; <%struct.QPaintDevice*> [#uses=1]
-invcont104: ; preds = %invcont103
- %tmp107 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp105 )
- to label %invcont106 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont106: ; preds = %invcont104
- %tmp108 = sitofp i32 %tmp107 to double ; <double> [#uses=1]
- %tmp109 = fmul double %tmp108, 0x3FE93264C993264C ; <double> [#uses=1]
- %tmp109.upgrd.17 = fptosi double %tmp109 to i32 ; <i32> [#uses=3]
- %tmp.upgrd.18 = call %struct.QTextBlockGroup* @_ZNK13QTextDocument9rootFrameEv( %struct.QAbstractTextDocumentLayout* %tmp95 ) ; <%struct.QTextBlockGroup*> [#uses=1]
- invoke void @_ZNK10QTextFrame11frameFormatEv( %struct.QTextBlockFormat* sret %fmt, %struct.QTextBlockGroup* %tmp.upgrd.18 )
- to label %invcont111 unwind label %cleanup329
-invcont111: ; preds = %invcont106
- %tmp112 = sitofp i32 %tmp109.upgrd.17 to double ; <double> [#uses=1]
- invoke void @_ZN16QTextFrameFormat9setMarginEd( %struct.QTextBlockFormat* %fmt, double %tmp112 )
- to label %invcont114 unwind label %cleanup192
-invcont114: ; preds = %invcont111
- %tmp116 = call %struct.QTextBlockGroup* @_ZNK13QTextDocument9rootFrameEv( %struct.QAbstractTextDocumentLayout* %tmp95 ) ; <%struct.QTextBlockGroup*> [#uses=1]
- invoke void @_ZN10QTextFrame14setFrameFormatERK16QTextFrameFormat( %struct.QTextBlockGroup* %tmp116, %struct.QTextBlockFormat* %fmt )
- to label %invcont117 unwind label %cleanup192
-invcont117: ; preds = %invcont114
- %tmp119 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont118 unwind label %cleanup192 ; <%struct.QPaintDevice*> [#uses=1]
-invcont118: ; preds = %invcont117
- %tmp121 = invoke i32 @_ZNK12QPaintDevice6heightEv( %struct.QPaintDevice* %tmp119 )
- to label %invcont120 unwind label %cleanup192 ; <i32> [#uses=1]
-invcont120: ; preds = %invcont118
- %tmp121.upgrd.19 = sitofp i32 %tmp121 to double ; <double> [#uses=1]
- %tmp123 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont122 unwind label %cleanup192 ; <%struct.QPaintDevice*> [#uses=1]
-invcont122: ; preds = %invcont120
- %tmp125 = invoke i32 @_ZNK12QPaintDevice5widthEv( %struct.QPaintDevice* %tmp123 )
- to label %invcont124 unwind label %cleanup192 ; <i32> [#uses=1]
-invcont124: ; preds = %invcont122
- %tmp125.upgrd.20 = sitofp i32 %tmp125 to double ; <double> [#uses=1]
- call void @_ZN6QRectFC1Edddd( %struct.QRectF* %tmp.upgrd.1, double 0.000000e+00, double 0.000000e+00, double %tmp125.upgrd.20, double %tmp121.upgrd.19 )
- %tmp126 = getelementptr %struct.QRectF* %body, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp127 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp128 = load double* %tmp127 ; <double> [#uses=1]
- store double %tmp128, double* %tmp126
- %tmp129 = getelementptr %struct.QRectF* %body, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp130 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp131 = load double* %tmp130 ; <double> [#uses=1]
- store double %tmp131, double* %tmp129
- %tmp132 = getelementptr %struct.QRectF* %body, i32 0, i32 2 ; <double*> [#uses=1]
- %tmp133 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 2 ; <double*> [#uses=1]
- %tmp134 = load double* %tmp133 ; <double> [#uses=1]
- store double %tmp134, double* %tmp132
- %tmp135 = getelementptr %struct.QRectF* %body, i32 0, i32 3 ; <double*> [#uses=1]
- %tmp136 = getelementptr %struct.QRectF* %tmp.upgrd.1, i32 0, i32 3 ; <double*> [#uses=1]
- %tmp137 = load double* %tmp136 ; <double> [#uses=1]
- store double %tmp137, double* %tmp135
- %tmp138 = call double @_ZNK6QRectF6heightEv( %struct.QRectF* %body ) ; <double> [#uses=1]
- %tmp139 = sitofp i32 %tmp109.upgrd.17 to double ; <double> [#uses=1]
- %tmp140 = fsub double %tmp138, %tmp139 ; <double> [#uses=1]
- %tmp142 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont141 unwind label %cleanup192 ; <%struct.QPaintDevice*> [#uses=1]
-invcont141: ; preds = %invcont124
- invoke void @_ZNK13QTextDocument11defaultFontEv( %struct.QFont* sret %tmp.upgrd.3, %struct.QAbstractTextDocumentLayout* %tmp95 )
- to label %invcont144 unwind label %cleanup192
-invcont144: ; preds = %invcont141
- invoke void @_ZN12QFontMetricsC1ERK5QFontP12QPaintDevice( %struct.QFontMetrics* %tmp.upgrd.2, %struct.QFont* %tmp.upgrd.3, %struct.QPaintDevice* %tmp142 )
- to label %invcont146 unwind label %cleanup173
-invcont146: ; preds = %invcont144
- %tmp149 = invoke i32 @_ZNK12QFontMetrics6ascentEv( %struct.QFontMetrics* %tmp.upgrd.2 )
- to label %invcont148 unwind label %cleanup168 ; <i32> [#uses=1]
-invcont148: ; preds = %invcont146
- %tmp149.upgrd.21 = sitofp i32 %tmp149 to double ; <double> [#uses=1]
- %tmp150 = fadd double %tmp140, %tmp149.upgrd.21 ; <double> [#uses=1]
- %tmp152 = invoke %struct.QPaintDevice* @_ZNK8QPainter6deviceEv( %struct.QPainter* %p )
- to label %invcont151 unwind label %cleanup168 ; <%struct.QPaintDevice*> [#uses=1]
-invcont151: ; preds = %invcont148
- %tmp154 = invoke i32 @_ZNK12QPaintDevice11logicalDpiYEv( %struct.QPaintDevice* %tmp152 )
- to label %invcont153 unwind label %cleanup168 ; <i32> [#uses=1]
-invcont153: ; preds = %invcont151
- %tmp155 = mul i32 %tmp154, 5 ; <i32> [#uses=1]
- %tmp156 = sdiv i32 %tmp155, 72 ; <i32> [#uses=1]
- %tmp156.upgrd.22 = sitofp i32 %tmp156 to double ; <double> [#uses=1]
- %tmp157 = fadd double %tmp150, %tmp156.upgrd.22 ; <double> [#uses=1]
- %tmp158 = call double @_ZNK6QRectF5widthEv( %struct.QRectF* %body ) ; <double> [#uses=1]
- %tmp159 = sitofp i32 %tmp109.upgrd.17 to double ; <double> [#uses=1]
- %tmp160 = fsub double %tmp158, %tmp159 ; <double> [#uses=1]
- call void @_ZN7QPointFC1Edd( %struct.QPointF* %tmp2, double %tmp160, double %tmp157 )
- %tmp161 = getelementptr %struct.QPointF* %pageNumberPos, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp162 = getelementptr %struct.QPointF* %tmp2, i32 0, i32 0 ; <double*> [#uses=1]
- %tmp163 = load double* %tmp162 ; <double> [#uses=1]
- store double %tmp163, double* %tmp161
- %tmp164 = getelementptr %struct.QPointF* %pageNumberPos, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp165 = getelementptr %struct.QPointF* %tmp2, i32 0, i32 1 ; <double*> [#uses=1]
- %tmp166 = load double* %tmp165 ; <double> [#uses=1]
- store double %tmp166, double* %tmp164
- invoke void @_ZN12QFontMetricsD1Ev( %struct.QFontMetrics* %tmp.upgrd.2 )
- to label %cleanup171 unwind label %cleanup173
-cleanup168: ; preds = %invcont151, %invcont148, %invcont146
- %val168 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN12QFontMetricsD1Ev( %struct.QFontMetrics* %tmp.upgrd.2 )
- to label %cleanup173 unwind label %cleanup173
-cleanup171: ; preds = %invcont153
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %tmp.upgrd.3 )
- to label %finally170 unwind label %cleanup192
-cleanup173: ; preds = %cleanup168, %cleanup168, %invcont153, %invcont144
- %val173 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %tmp.upgrd.3 )
- to label %cleanup192 unwind label %cleanup192
-finally170: ; preds = %cleanup171
- invoke void @_ZNK13QTextDocument11defaultFontEv( %struct.QFont* sret %font, %struct.QAbstractTextDocumentLayout* %tmp95 )
- to label %invcont177 unwind label %cleanup192
-invcont177: ; preds = %finally170
- invoke void @_ZN5QFont12setPointSizeEi( %struct.QFont* %font, i32 10 )
- to label %invcont179 unwind label %cleanup187
-invcont179: ; preds = %invcont177
- invoke void @_ZN13QTextDocument14setDefaultFontERK5QFont( %struct.QAbstractTextDocumentLayout* %tmp95, %struct.QFont* %font )
- to label %invcont181 unwind label %cleanup187
-invcont181: ; preds = %invcont179
- call void @_ZNK6QRectF4sizeEv( %struct.QPointF* sret %tmp3, %struct.QRectF* %body )
- invoke void @_ZN13QTextDocument11setPageSizeERK6QSizeF( %struct.QAbstractTextDocumentLayout* %tmp95, %struct.QPointF* %tmp3 )
- to label %cleanup185 unwind label %cleanup187
-cleanup185: ; preds = %invcont181
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %font )
- to label %cleanup190 unwind label %cleanup192
-cleanup187: ; preds = %invcont181, %invcont179, %invcont177
- %val187 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN5QFontD1Ev( %struct.QFont* %font )
- to label %cleanup192 unwind label %cleanup192
-cleanup190: ; preds = %cleanup185
- invoke void @_ZN16QTextFrameFormatD1Ev( %struct.QTextBlockFormat* %fmt )
- to label %cond_next194 unwind label %cleanup329
-cleanup192: ; preds = %cleanup187, %cleanup187, %cleanup185, %finally170, %cleanup173, %cleanup173, %cleanup171, %invcont141, %invcont124, %invcont122, %invcont120, %invcont118, %invcont117, %invcont114, %invcont111
- %val192 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- invoke void @_ZN16QTextFrameFormatD1Ev( %struct.QTextBlockFormat* %fmt )
- to label %cleanup329 unwind label %cleanup329
-cond_next194: ; preds = %cleanup190, %invcont83
- %clonedDoc.1 = phi %struct.QAbstractTextDocumentLayout* [ null, %invcont83 ], [ %tmp95, %cleanup190 ] ; <%struct.QAbstractTextDocumentLayout*> [#uses=3]
- %doc.1 = phi %struct.QAbstractTextDocumentLayout* [ %this, %invcont83 ], [ %tmp95, %cleanup190 ] ; <%struct.QAbstractTextDocumentLayout*> [#uses=2]
- %tmp197 = invoke i1 @_ZNK8QPrinter13collateCopiesEv( %struct.QPrinter* %printer )
- to label %invcont196 unwind label %cleanup329 ; <i1> [#uses=1]
-invcont196: ; preds = %cond_next194
- br i1 %tmp197, label %cond_true200, label %cond_false204
-cond_true200: ; preds = %invcont196
- %tmp2000 = load double* %foo
- store double %tmp2000, double* %bar
- %tmp203 = invoke i32 @_ZNK8QPrinter9numCopiesEv( %struct.QPrinter* %printer )
- to label %cond_next208 unwind label %cleanup329 ; <i32> [#uses=1]
-cond_false204: ; preds = %invcont196
- %tmp2001 = load double* %foo
- store double %tmp2001, double* %bar
- %tmp207 = invoke i32 @_ZNK8QPrinter9numCopiesEv( %struct.QPrinter* %printer )
- to label %cond_next208 unwind label %cleanup329 ; <i32> [#uses=1]
-cond_next208: ; preds = %invcont206, %invcont202
- %pageCopies.0 = phi i32 [ %tmp203, %cond_true200 ], [ 1, %cond_false204 ] ; <i32> [#uses=2]
- %docCopies.0 = phi i32 [ 1, %cond_true200 ], [ %tmp207, %cond_false204 ] ; <i32> [#uses=2]
- %tmp211 = invoke i32 @_ZNK8QPrinter8fromPageEv( %struct.QPrinter* %printer )
- to label %invcont210 unwind label %cleanup329 ; <i32> [#uses=3]
-invcont210: ; preds = %cond_next208
- %tmp214 = invoke i32 @_ZNK8QPrinter6toPageEv( %struct.QPrinter* %printer )
- to label %invcont213 unwind label %cleanup329 ; <i32> [#uses=3]
-invcont213: ; preds = %invcont210
- %tmp216 = icmp eq i32 %tmp211, 0 ; <i1> [#uses=1]
- br i1 %tmp216, label %cond_true217, label %cond_next225
-cond_true217: ; preds = %invcont213
- %tmp219 = icmp eq i32 %tmp214, 0 ; <i1> [#uses=1]
- br i1 %tmp219, label %cond_true220, label %cond_next225
-cond_true220: ; preds = %cond_true217
- %tmp223 = invoke i32 @_ZNK13QTextDocument9pageCountEv( %struct.QAbstractTextDocumentLayout* %doc.1 )
- to label %invcont222 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont222: ; preds = %cond_true220
- br label %cond_next225
-cond_next225: ; preds = %invcont222, %cond_true217, %invcont213
- %toPage.1 = phi i32 [ %tmp223, %invcont222 ], [ %tmp214, %cond_true217 ], [ %tmp214, %invcont213 ] ; <i32> [#uses=2]
- %fromPage.1 = phi i32 [ 1, %invcont222 ], [ %tmp211, %cond_true217 ], [ %tmp211, %invcont213 ] ; <i32> [#uses=2]
- %tmp.page = invoke i32 @_ZNK8QPrinter9pageOrderEv( %struct.QPrinter* %printer )
- to label %invcont227 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont227: ; preds = %cond_next225
- %tmp228 = icmp eq i32 %tmp.page, 1 ; <i1> [#uses=1]
- br i1 %tmp228, label %cond_true230, label %cond_next234
-cond_true230: ; preds = %invcont227
- br label %cond_next234
-cond_next234: ; preds = %cond_true230, %invcont227
- %ascending.1 = phi i1 [ false, %cond_true230 ], [ true, %invcont227 ] ; <i1> [#uses=1]
- %toPage.2 = phi i32 [ %fromPage.1, %cond_true230 ], [ %toPage.1, %invcont227 ] ; <i32> [#uses=1]
- %fromPage.2 = phi i32 [ %toPage.1, %cond_true230 ], [ %fromPage.1, %invcont227 ] ; <i32> [#uses=1]
- br label %bb309
-bb237: ; preds = %cond_true313, %cond_next293
- %iftmp.410.4 = phi i1 [ %iftmp.410.5, %cond_true313 ], [ %iftmp.410.1, %cond_next293 ] ; <i1> [#uses=1]
- %page.4 = phi i32 [ %fromPage.2, %cond_true313 ], [ %page.3, %cond_next293 ] ; <i32> [#uses=4]
- br label %bb273
-invcont240: ; preds = %cond_true277
- %tmp242 = icmp eq i32 %tmp241, 2 ; <i1> [#uses=1]
- br i1 %tmp242, label %bb252, label %cond_next244
-cond_next244: ; preds = %invcont240
- %tmp247 = invoke i32 @_ZNK8QPrinter12printerStateEv( %struct.QPrinter* %printer )
- to label %invcont246 unwind label %cleanup329 ; <i32> [#uses=1]
-invcont246: ; preds = %cond_next244
- %tmp248 = icmp eq i32 %tmp247, 3 ; <i1> [#uses=1]
- br i1 %tmp248, label %bb252, label %bb253
-bb252: ; preds = %invcont246, %invcont240
- br label %bb254
-bb253: ; preds = %invcont246
- br label %bb254
-bb254: ; preds = %bb253, %bb252
- %iftmp.410.0 = phi i1 [ true, %bb252 ], [ false, %bb253 ] ; <i1> [#uses=2]
- br i1 %iftmp.410.0, label %UserCanceled, label %cond_next258
-cond_next258: ; preds = %bb254
- invoke fastcc void @_Z9printPageiP8QPainterPK13QTextDocumentRK6QRectFRK7QPointF( i32 %page.4, %struct.QPainter* %p, %struct.QAbstractTextDocumentLayout* %doc.1, %struct.QRectF* %body, %struct.QPointF* %pageNumberPos )
- to label %invcont261 unwind label %cleanup329
-invcont261: ; preds = %cond_next258
- %tmp263 = add i32 %pageCopies.0, -1 ; <i32> [#uses=1]
- %tmp265 = icmp sgt i32 %tmp263, %j.4 ; <i1> [#uses=1]
- br i1 %tmp265, label %cond_true266, label %cond_next270
-cond_true266: ; preds = %invcont261
- %tmp269 = invoke i1 @_ZN8QPrinter7newPageEv( %struct.QPrinter* %printer )
- to label %cond_next270 unwind label %cleanup329 ; <i1> [#uses=0]
-cond_next270: ; preds = %cond_true266, %invcont261
- %tmp272 = add i32 %j.4, 1 ; <i32> [#uses=1]
- br label %bb273
-bb273: ; preds = %cond_next270, %bb237
- %iftmp.410.1 = phi i1 [ %iftmp.410.4, %bb237 ], [ %iftmp.410.0, %cond_next270 ] ; <i1> [#uses=2]
- %j.4 = phi i32 [ 0, %bb237 ], [ %tmp272, %cond_next270 ] ; <i32> [#uses=3]
- %tmp276 = icmp slt i32 %j.4, %pageCopies.0 ; <i1> [#uses=1]
- br i1 %tmp276, label %cond_true277, label %bb280
-cond_true277: ; preds = %bb273
- %tmp241 = invoke i32 @_ZNK8QPrinter12printerStateEv( %struct.QPrinter* %printer )
- to label %invcont240 unwind label %cleanup329 ; <i32> [#uses=1]
-bb280: ; preds = %bb273
- %tmp283 = icmp eq i32 %page.4, %toPage.2 ; <i1> [#uses=1]
- br i1 %tmp283, label %bb297, label %cond_next285
-cond_next285: ; preds = %bb280
- br i1 %ascending.1, label %cond_true287, label %cond_false290
-cond_true287: ; preds = %cond_next285
- %tmp289 = add i32 %page.4, 1 ; <i32> [#uses=1]
- br label %cond_next293
-cond_false290: ; preds = %cond_next285
- %tmp292 = add i32 %page.4, -1 ; <i32> [#uses=1]
- br label %cond_next293
-cond_next293: ; preds = %cond_false290, %cond_true287
- %page.3 = phi i32 [ %tmp289, %cond_true287 ], [ %tmp292, %cond_false290 ] ; <i32> [#uses=1]
- %tmp296 = invoke i1 @_ZN8QPrinter7newPageEv( %struct.QPrinter* %printer )
- to label %bb237 unwind label %cleanup329 ; <i1> [#uses=0]
-bb297: ; preds = %bb280
- %tmp299 = add i32 %docCopies.0, -1 ; <i32> [#uses=1]
- %tmp301 = icmp sgt i32 %tmp299, %i.1 ; <i1> [#uses=1]
- br i1 %tmp301, label %cond_true302, label %cond_next306
-cond_true302: ; preds = %bb297
- %tmp305 = invoke i1 @_ZN8QPrinter7newPageEv( %struct.QPrinter* %printer )
- to label %cond_next306 unwind label %cleanup329 ; <i1> [#uses=0]
-cond_next306: ; preds = %cond_true302, %bb297
- %tmp308 = add i32 %i.1, 1 ; <i32> [#uses=1]
- br label %bb309
-bb309: ; preds = %cond_next306, %cond_next234
- %iftmp.410.5 = phi i1 [ undef, %cond_next234 ], [ %iftmp.410.1, %cond_next306 ] ; <i1> [#uses=1]
- %i.1 = phi i32 [ 0, %cond_next234 ], [ %tmp308, %cond_next306 ] ; <i32> [#uses=3]
- %tmp312 = icmp slt i32 %i.1, %docCopies.0 ; <i1> [#uses=1]
- br i1 %tmp312, label %cond_true313, label %UserCanceled
-cond_true313: ; preds = %bb309
- br label %bb237
-UserCanceled: ; preds = %bb309, %bb254
- %tmp318 = icmp eq %struct.QAbstractTextDocumentLayout* %clonedDoc.1, null ; <i1> [#uses=1]
- br i1 %tmp318, label %cleanup327, label %cond_true319
-cond_true319: ; preds = %UserCanceled
- %tmp.upgrd.23 = getelementptr %struct.QAbstractTextDocumentLayout* %clonedDoc.1, i32 0, i32 0, i32 0 ; <i32 (...)***> [#uses=1]
- %tmp.upgrd.24 = load i32 (...)*** %tmp.upgrd.23 ; <i32 (...)**> [#uses=1]
- %tmp322 = getelementptr i32 (...)** %tmp.upgrd.24, i32 4 ; <i32 (...)**> [#uses=1]
- %tmp.upgrd.25 = load i32 (...)** %tmp322 ; <i32 (...)*> [#uses=1]
- %tmp.upgrd.26 = bitcast i32 (...)* %tmp.upgrd.25 to void (%struct.QAbstractTextDocumentLayout*)* ; <void (%struct.QAbstractTextDocumentLayout*)*> [#uses=1]
- invoke void %tmp.upgrd.26( %struct.QAbstractTextDocumentLayout* %clonedDoc.1 )
- to label %cleanup327 unwind label %cleanup329
-cleanup327: ; preds = %cond_true319, %UserCanceled
- call void @_ZN8QPainterD1Ev( %struct.QPainter* %p )
- ret void
-cleanup328: ; preds = %invcont
- call void @_ZN8QPainterD1Ev( %struct.QPainter* %p )
- ret void
-cleanup329: ; preds = %cond_true319, %cond_true302, %cond_next293, %cond_true277, %cond_true266, %cond_next258, %cond_next244, %cond_next225, %cond_true220, %invcont210, %cond_next208, %cond_false204, %cond_true200, %cond_next194, %cleanup192, %cleanup192, %cleanup190, %invcont106, %invcont104, %invcont103, %invcont100, %invcont98, %invcont94, %cond_false, %invcont83, %invcont79, %invcont57, %invcont51, %invcont45, %cond_next42, %invcont37, %cond_true35, %invcont29, %invcont25, %cond_true24, %cond_next, %entry
- %val329 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- cleanup
- call void @_ZN8QPainterD1Ev( %struct.QPainter* %p )
- resume { i8*, i32 } %val329
-}
-
-declare void @_ZN6QSizeFC1Edd(%struct.QPointF*, double, double)
-
-declare i1 @_ZNK6QSizeF7isValidEv(%struct.QPointF*)
-
-declare double @_ZNK6QSizeF5widthEv(%struct.QPointF*)
-
-declare double @_ZNK6QSizeF6heightEv(%struct.QPointF*)
-
-declare double* @_ZN6QSizeF6rwidthEv(%struct.QPointF*)
-
-declare double* @_ZN6QSizeF7rheightEv(%struct.QPointF*)
-
-declare %struct.QTextDocumentPrivate* @_ZNK13QTextDocument6d_funcEv(%struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZN7QPointFC1Ev(%struct.QPointF*)
-
-declare void @_ZN7QPointFC1Edd(%struct.QPointF*, double, double)
-
-declare void @_ZN16QTextFrameFormat9setMarginEd(%struct.QTextBlockFormat*, double)
-
-declare void @_ZN6QRectFC1Edddd(%struct.QRectF*, double, double, double, double)
-
-declare void @_ZN6QRectFC1ERK7QPointFRK6QSizeF(%struct.QRectF*, %struct.QPointF*, %struct.QPointF*)
-
-declare double @_ZNK6QRectF5widthEv(%struct.QRectF*)
-
-declare double @_ZNK6QRectF6heightEv(%struct.QRectF*)
-
-declare void @_ZNK6QRectF4sizeEv(%struct.QPointF*, %struct.QRectF*)
-
-declare void @_ZN16QTextFrameFormatD1Ev(%struct.QTextBlockFormat*)
-
-declare void @_ZNK10QTextFrame11frameFormatEv(%struct.QTextBlockFormat*, %struct.QTextBlockGroup*)
-
-declare void @_ZN10QTextFrame14setFrameFormatERK16QTextFrameFormat(%struct.QTextBlockGroup*, %struct.QTextBlockFormat*)
-
-declare i32 @_ZNK12QPaintDevice5widthEv(%struct.QPaintDevice*)
-
-declare i32 @_ZNK12QPaintDevice6heightEv(%struct.QPaintDevice*)
-
-declare i32 @_ZNK12QPaintDevice11logicalDpiXEv(%struct.QPaintDevice*)
-
-declare i32 @_ZNK12QPaintDevice11logicalDpiYEv(%struct.QPaintDevice*)
-
-declare %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument5cloneEP7QObject(%struct.QAbstractTextDocumentLayout*, %struct.QObject*)
-
-declare void @_ZN5QFontD1Ev(%struct.QFont*)
-
-declare %struct.QAbstractTextDocumentLayout* @_ZNK13QTextDocument14documentLayoutEv(%struct.QAbstractTextDocumentLayout*)
-
-declare %struct.QTextBlockGroup* @_ZNK13QTextDocument9rootFrameEv(%struct.QAbstractTextDocumentLayout*)
-
-declare i32 @_ZNK13QTextDocument9pageCountEv(%struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZNK13QTextDocument11defaultFontEv(%struct.QFont*, %struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZN13QTextDocument14setDefaultFontERK5QFont(%struct.QAbstractTextDocumentLayout*, %struct.QFont*)
-
-declare void @_ZN13QTextDocument11setPageSizeERK6QSizeF(%struct.QAbstractTextDocumentLayout*, %struct.QPointF*)
-
-declare void @_Z9printPageiP8QPainterPK13QTextDocumentRK6QRectFRK7QPointF(i32, %struct.QPainter*, %struct.QAbstractTextDocumentLayout*, %struct.QRectF*, %struct.QPointF*)
-
-declare void @_ZN12QFontMetricsD1Ev(%struct.QFontMetrics*)
-
-declare void @_ZN8QPainterC1EP12QPaintDevice(%struct.QPainter*, %struct.QPaintDevice*)
-
-declare i1 @_ZNK8QPainter8isActiveEv(%struct.QPainter*)
-
-declare i32 @_Z13qt_defaultDpiv()
-
-declare %struct.QPaintDevice* @_ZNK27QAbstractTextDocumentLayout11paintDeviceEv(%struct.QAbstractTextDocumentLayout*)
-
-declare void @_ZN8QPainter5scaleEdd(%struct.QPainter*, double, double)
-
-declare %struct.QPaintDevice* @_ZNK8QPainter6deviceEv(%struct.QPainter*)
-
-declare void @_ZN27QAbstractTextDocumentLayout14setPaintDeviceEP12QPaintDevice(%struct.QAbstractTextDocumentLayout*, %struct.QPaintDevice*)
-
-declare void @_ZN12QFontMetricsC1ERK5QFontP12QPaintDevice(%struct.QFontMetrics*, %struct.QFont*, %struct.QPaintDevice*)
-
-declare i32 @_ZNK12QFontMetrics6ascentEv(%struct.QFontMetrics*)
-
-declare void @_ZN5QFont12setPointSizeEi(%struct.QFont*, i32)
-
-declare i1 @_ZNK8QPrinter13collateCopiesEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter9numCopiesEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter8fromPageEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter6toPageEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter9pageOrderEv(%struct.QPrinter*)
-
-declare i32 @_ZNK8QPrinter12printerStateEv(%struct.QPrinter*)
-
-declare i1 @_ZN8QPrinter7newPageEv(%struct.QPrinter*)
-
-declare void @_ZN8QPainterD1Ev(%struct.QPainter*)
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/SimplifyCFG/BrUnwind.ll b/test/Transforms/SimplifyCFG/BrUnwind.ll
index 7ab8faa..1485364 100644
--- a/test/Transforms/SimplifyCFG/BrUnwind.ll
+++ b/test/Transforms/SimplifyCFG/BrUnwind.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -simplifycfg -S | \
-; RUN: not grep {br label}
+; RUN: not grep "br label"
define void @test(i1 %C) {
br i1 %C, label %A, label %B
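
Note: the {...} -> "..." RUN-line rewrites that recur throughout this import track lit's switch from the Tcl-based test runner to the shell-based ShTest format (see the lit.cfg hunk near the end of this diff). Tcl's exec understood brace quoting; under a POSIX shell the grep pattern has to be double-quoted instead. A minimal sketch, on a hypothetical test.ll:

    ; Old (TclTest): braces quote the grep pattern
    ; RUN: opt < %s -simplifycfg -S | not grep {br label}
    ; New (ShTest): double quotes carry the same pattern
    ; RUN: opt < %s -simplifycfg -S | not grep "br label"
    define void @f() {
      ret void
    }
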
diff --git a/test/Transforms/SimplifyCFG/DeadSetCC.ll b/test/Transforms/SimplifyCFG/DeadSetCC.ll
index 8339462..c625600 100644
--- a/test/Transforms/SimplifyCFG/DeadSetCC.ll
+++ b/test/Transforms/SimplifyCFG/DeadSetCC.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -simplifycfg -S | \
-; RUN: not grep {icmp eq}
+; RUN: not grep "icmp eq"
; Check that simplifycfg deletes a dead 'seteq' instruction when it
; folds a conditional branch into a switch instruction.
diff --git a/test/Transforms/SimplifyCFG/UncondBranchToReturn.ll b/test/Transforms/SimplifyCFG/UncondBranchToReturn.ll
index bf9d953..b6d54d3 100644
--- a/test/Transforms/SimplifyCFG/UncondBranchToReturn.ll
+++ b/test/Transforms/SimplifyCFG/UncondBranchToReturn.ll
@@ -3,7 +3,7 @@
; important case. This is basically the most trivial form of tail-duplication.
; RUN: opt < %s -simplifycfg -S | \
-; RUN: not grep {br label}
+; RUN: not grep "br label"
define i32 @test(i1 %B, i32 %A, i32 %B.upgrd.1) {
br i1 %B, label %T, label %F
diff --git a/test/Transforms/SimplifyCFG/branch-fold.ll b/test/Transforms/SimplifyCFG/branch-fold.ll
index 2b29681..7097dea 100644
--- a/test/Transforms/SimplifyCFG/branch-fold.ll
+++ b/test/Transforms/SimplifyCFG/branch-fold.ll
@@ -17,3 +17,54 @@ b:
c:
ret void
}
+
+; rdar://10554090
+define zeroext i1 @test2(i64 %i0, i64 %i1) nounwind uwtable readonly ssp {
+entry:
+; CHECK: test2
+; CHECK: br i1
+ %and.i.i = and i64 %i0, 281474976710655
+ %and.i11.i = and i64 %i1, 281474976710655
+ %or.cond = icmp eq i64 %and.i.i, %and.i11.i
+ br i1 %or.cond, label %c, label %a
+
+a:
+; CHECK: br
+ %shr.i4.i = lshr i64 %i0, 48
+ %and.i5.i = and i64 %shr.i4.i, 32767
+ %shr.i.i = lshr i64 %i1, 48
+ %and.i2.i = and i64 %shr.i.i, 32767
+ %cmp9.i = icmp ult i64 %and.i5.i, %and.i2.i
+ br i1 %cmp9.i, label %c, label %b
+
+b:
+; CHECK-NOT: br
+ %shr.i13.i9 = lshr i64 %i1, 48
+ %and.i14.i10 = and i64 %shr.i13.i9, 32767
+ %shr.i.i11 = lshr i64 %i0, 48
+ %and.i11.i12 = and i64 %shr.i.i11, 32767
+ %phitmp = icmp uge i64 %and.i14.i10, %and.i11.i12
+ br label %c
+
+c:
+ %o2 = phi i1 [ false, %a ], [ %phitmp, %b ], [ false, %entry ]
+ ret i1 %o2
+}
+
+; PR13180
+define void @pr13180(i8 %p) {
+entry:
+ %tobool = icmp eq i8 %p, 0
+ br i1 %tobool, label %cond.false, label %cond.true
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ %phitmp = icmp eq i8 %p, 0
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i1 [ undef, %cond.true ], [ %phitmp, %cond.false ]
+ unreachable
+}
diff --git a/test/Transforms/SimplifyCFG/branch-phi-thread.ll b/test/Transforms/SimplifyCFG/branch-phi-thread.ll
index f52d979..c19ba69 100644
--- a/test/Transforms/SimplifyCFG/branch-phi-thread.ll
+++ b/test/Transforms/SimplifyCFG/branch-phi-thread.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -simplifycfg -adce -S | \
-; RUN: not grep {call void @f1}
+; RUN: not grep "call void @f1"
; END.
declare void @f1()
diff --git a/test/Transforms/SimplifyCFG/duplicate-phis.ll b/test/Transforms/SimplifyCFG/duplicate-phis.ll
index 5129f9fb..4788406 100644
--- a/test/Transforms/SimplifyCFG/duplicate-phis.ll
+++ b/test/Transforms/SimplifyCFG/duplicate-phis.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -simplifycfg -S | grep { = phi } | count 1
+; RUN: opt < %s -instcombine -simplifycfg -S | grep " = phi " | count 1
; instcombine should sort the PHI operands so that simplifycfg can see the
; duplicate and remove it.
diff --git a/test/Transforms/SimplifyCFG/invoke.ll b/test/Transforms/SimplifyCFG/invoke.ll
new file mode 100644
index 0000000..10dc41b
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/invoke.ll
@@ -0,0 +1,139 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_call_unexpected(i8*)
+declare void @purefn() nounwind readnone
+declare i32 @read_only() nounwind readonly
+declare i32 @nounwind_fn() nounwind
+declare i32 @fn()
+
+
+; CHECK: @f1
+define i8* @f1() nounwind uwtable ssp {
+entry:
+; CHECK: call void @llvm.trap()
+; CHECK: unreachable
+ %call = invoke noalias i8* undef()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+ ret i8* %call
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %1 = extractvalue { i8*, i32 } %0, 0
+ tail call void @__cxa_call_unexpected(i8* %1) noreturn nounwind
+ unreachable
+}
+
+; CHECK: @f2
+define i8* @f2() nounwind uwtable ssp {
+entry:
+; CHECK: call void @llvm.trap()
+; CHECK: unreachable
+ %call = invoke noalias i8* null()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+ ret i8* %call
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %1 = extractvalue { i8*, i32 } %0, 0
+ tail call void @__cxa_call_unexpected(i8* %1) noreturn nounwind
+ unreachable
+}
+
+; CHECK: @f3
+define i32 @f3() nounwind uwtable ssp {
+; CHECK-NEXT: entry
+entry:
+; CHECK-NEXT: ret i32 3
+ %call = invoke i32 @read_only()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+ ret i32 3
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %1 = extractvalue { i8*, i32 } %0, 0
+ tail call void @__cxa_call_unexpected(i8* %1) noreturn nounwind
+ unreachable
+}
+
+; CHECK: @f4
+define i32 @f4() nounwind uwtable ssp {
+; CHECK-NEXT: entry
+entry:
+; CHECK-NEXT: call i32 @read_only()
+ %call = invoke i32 @read_only()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+; CHECK-NEXT: ret i32 %call
+ ret i32 %call
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ %1 = extractvalue { i8*, i32 } %0, 0
+ tail call void @__cxa_call_unexpected(i8* %1) noreturn nounwind
+ unreachable
+}
+
+; CHECK: @f5
+define i32 @f5(i1 %cond, i8* %a, i8* %b) {
+entry:
+ br i1 %cond, label %x, label %y
+
+x:
+; CHECK: invoke i32 @fn()
+ %call = invoke i32 @fn()
+ to label %cont unwind label %lpad
+
+y:
+; CHECK: call i32 @nounwind_fn()
+ %call2 = invoke i32 @nounwind_fn()
+ to label %cont unwind label %lpad
+
+cont:
+; CHECK: phi i32
+; CHECK: ret i32 %phi
+ %phi = phi i32 [%call, %x], [%call2, %y]
+ ret i32 %phi
+
+lpad:
+; CHECK-NOT: phi
+ %phi2 = phi i8* [%a, %x], [%b, %y]
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+; CHECK: __cxa_call_unexpected(i8* %a)
+ tail call void @__cxa_call_unexpected(i8* %phi2) noreturn nounwind
+ unreachable
+}
+
+; CHECK: @f6
+define void @f6() {
+entry:
+ invoke void @purefn()
+ to label %invoke.cont1 unwind label %lpad
+
+invoke.cont1:
+ %foo = invoke i32 @fn()
+ to label %invoke.cont2 unwind label %lpad
+
+invoke.cont2:
+ ret void
+
+lpad:
+; CHECK-NOT: phi
+ %tmp = phi i8* [ null, %invoke.cont1 ], [ null, %entry ]
+ landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ ret void
+}
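
Note: the new SimplifyCFG/invoke.ll pins down how the pass rewrites invokes whose unwind edge is provably dead: an invoke of a nounwind callee (@f4-@f6) becomes a plain call, an invoke of a null or undef callee (@f1, @f2) becomes @llvm.trap followed by unreachable, and when the result is unused the whole construct can fold away (@f3 collapses to ret i32 3). Illustrative IR for the nounwind case, not taken verbatim from the test:

    ; before -simplifycfg: the landing pad can never be reached
    %r = invoke i32 @nounwind_fn()
            to label %cont unwind label %lpad
    ; after: an ordinary call plus an unconditional branch
    %r = call i32 @nounwind_fn()
    br label %cont
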
diff --git a/test/Transforms/SimplifyCFG/switch_thread.ll b/test/Transforms/SimplifyCFG/switch_thread.ll
index bd85fcc..9396684 100644
--- a/test/Transforms/SimplifyCFG/switch_thread.ll
+++ b/test/Transforms/SimplifyCFG/switch_thread.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -simplifycfg -S | \
-; RUN: not grep {call void @DEAD}
+; RUN: not grep "call void @DEAD"
; Test that we can thread a simple known condition through switch statements.
diff --git a/test/Transforms/SimplifyLibCalls/2007-04-06-strchr-miscompile.ll b/test/Transforms/SimplifyLibCalls/2007-04-06-strchr-miscompile.ll
index 8e9f206..ae917f7 100644
--- a/test/Transforms/SimplifyLibCalls/2007-04-06-strchr-miscompile.ll
+++ b/test/Transforms/SimplifyLibCalls/2007-04-06-strchr-miscompile.ll
@@ -1,8 +1,8 @@
; PR1307
; RUN: opt < %s -simplify-libcalls -instcombine -S > %t
-; RUN: grep {@str,.*i64 3} %t
-; RUN: grep {@str1,.*i64 7} %t
-; RUN: grep {ret i8.*null} %t
+; RUN: grep "@str,.*i64 3" %t
+; RUN: grep "@str1,.*i64 7" %t
+; RUN: grep "ret i8.*null" %t
; END.
@str = internal constant [5 x i8] c"foog\00"
diff --git a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll b/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll
index f8a0c88..2717228 100644
--- a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll
+++ b/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll
@@ -1,14 +1,14 @@
-; RUN: opt < %s -simplify-libcalls -S > %t
-; RUN: grep nocapture %t | count 2
-; RUN: grep null %t | grep nocapture | count 1
-; RUN: grep null %t | grep call | not grep readonly
+; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
; Test that we add nocapture to the declaration, and to the second call only.
+; CHECK: declare float @strtol(i8*, i8** nocapture, i32) nounwind
declare float @strtol(i8* %s, i8** %endptr, i32 %base)
define void @foo(i8* %x, i8** %endptr) {
+; CHECK: call float @strtol(i8* %x, i8** %endptr, i32 10)
call float @strtol(i8* %x, i8** %endptr, i32 10)
+; CHECK: %2 = call float @strtol(i8* nocapture %x, i8** null, i32 10)
call float @strtol(i8* %x, i8** null, i32 10)
ret void
}
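
Note: here grep-and-count RUN lines become inline FileCheck directives, which tie each expected attribute to a specific declaration or call instead of counting matches anywhere in the output. The general shape, with -some-pass, @kept, and @gone as placeholder names:

    ; RUN: opt < %s -some-pass -S | FileCheck %s
    ; CHECK lines must match in order in the output
    ; CHECK: call void @kept()
    ; CHECK-NOT forbids the pattern between the surrounding matches
    ; CHECK-NOT: call void @gone()
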
diff --git a/test/Transforms/SimplifyLibCalls/2009-05-30-memcmp-byte.ll b/test/Transforms/SimplifyLibCalls/2009-05-30-memcmp-byte.ll
index 9056499..f4c80ed 100644
--- a/test/Transforms/SimplifyLibCalls/2009-05-30-memcmp-byte.ll
+++ b/test/Transforms/SimplifyLibCalls/2009-05-30-memcmp-byte.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplify-libcalls -instcombine -S | grep {ret i32 -65}
+; RUN: opt < %s -simplify-libcalls -instcombine -S | grep "ret i32 -65"
; PR4284
define i32 @test() nounwind {
diff --git a/test/Transforms/SimplifyLibCalls/FFS.ll b/test/Transforms/SimplifyLibCalls/FFS.ll
index ab45f18..e38d783 100644
--- a/test/Transforms/SimplifyLibCalls/FFS.ll
+++ b/test/Transforms/SimplifyLibCalls/FFS.ll
@@ -1,6 +1,6 @@
 ; Test that the FFSOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*@ffs}
+; RUN: not grep "call.*@ffs"
@non_const = external global i32 ; <i32*> [#uses=1]
diff --git a/test/Transforms/SimplifyLibCalls/FPrintF.ll b/test/Transforms/SimplifyLibCalls/FPrintF.ll
index 4a0d232..51733e4 100644
--- a/test/Transforms/SimplifyLibCalls/FPrintF.ll
+++ b/test/Transforms/SimplifyLibCalls/FPrintF.ll
@@ -1,6 +1,6 @@
; Test that the FPrintFOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*fprintf}
+; RUN: not grep "call.*fprintf"
; This transformation requires the pointer size, as it assumes that size_t is
; the size of a pointer.
diff --git a/test/Transforms/SimplifyLibCalls/FPuts.ll b/test/Transforms/SimplifyLibCalls/FPuts.ll
index 1f72ede..aa01aba 100644
--- a/test/Transforms/SimplifyLibCalls/FPuts.ll
+++ b/test/Transforms/SimplifyLibCalls/FPuts.ll
@@ -1,6 +1,6 @@
; Test that the FPutsOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*fputs}
+; RUN: not grep "call.*fputs"
; This transformation requires the pointer size, as it assumes that size_t is
; the size of a pointer.
diff --git a/test/Transforms/SimplifyLibCalls/MemCpy.ll b/test/Transforms/SimplifyLibCalls/MemCpy.ll
index c711178..1faad03 100644
--- a/test/Transforms/SimplifyLibCalls/MemCpy.ll
+++ b/test/Transforms/SimplifyLibCalls/MemCpy.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -constprop -instcombine -S | not grep {call.*llvm.memcpy.i32}
+; RUN: opt < %s -constprop -instcombine -S | not grep "call.*llvm.memcpy.i32"
@h = constant [2 x i8] c"h\00" ; <[2 x i8]*> [#uses=1]
@hel = constant [4 x i8] c"hel\00" ; <[4 x i8]*> [#uses=1]
diff --git a/test/Transforms/SimplifyLibCalls/SPrintF.ll b/test/Transforms/SimplifyLibCalls/SPrintF.ll
index 847e363..514a7d9 100644
--- a/test/Transforms/SimplifyLibCalls/SPrintF.ll
+++ b/test/Transforms/SimplifyLibCalls/SPrintF.ll
@@ -1,6 +1,6 @@
; Test that the SPrintFOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*sprintf}
+; RUN: not grep "call.*sprintf"
; This transformation requires the pointer size, as it assumes that size_t is
; the size of a pointer.
diff --git a/test/Transforms/SimplifyLibCalls/StpCpy.ll b/test/Transforms/SimplifyLibCalls/StpCpy.ll
new file mode 100644
index 0000000..914b095
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/StpCpy.ll
@@ -0,0 +1,43 @@
+; Test that the StpCpyOptimizer works correctly
+; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
+
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+
+declare i8* @stpcpy(i8*, i8*)
+
+declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
+
+declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
+
+define i32 @t1() {
+; CHECK: @t1
+ %target = alloca [1024 x i8]
+ %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+ %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %rslt1 = call i8* @stpcpy( i8* %arg1, i8* %arg2 )
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32
+ ret i32 0
+}
+
+define i32 @t2() {
+; CHECK: @t2
+ %target = alloca [1024 x i8]
+ %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+ %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false)
+ %rslt1 = call i8* @__stpcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1)
+; CHECK: @__memcpy_chk
+ ret i32 0
+}
+
+define i8* @t3(i8* %arg) {
+; CHECK: @t3
+ %stpcpy = tail call i8* @stpcpy(i8* %arg, i8* %arg)
+; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen(i8* %arg)
+; CHECK-NEXT: getelementptr inbounds i8* %arg, i32 [[LEN]]
+ ret i8* %stpcpy
+}
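
Note: the new StpCpy.ll covers the stpcpy simplifications: stpcpy returns a pointer to the copied string's terminating NUL, so a copy from a constant source of known length becomes a memcpy (@t1; @t2 checks the fortified __stpcpy_chk -> __memcpy_chk counterpart), and the self-copy stpcpy(x, x) reduces to x + strlen(x) (@t3). For the 6-byte "hello\00" source the expected lowering is roughly (%dst and %src are placeholders):

    ; 6 bytes covers the string plus its NUL terminator
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 6, i32 1, i1 false)
    ; stpcpy's return value, if used, is %dst + 5 (the terminating NUL)
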
diff --git a/test/Transforms/SimplifyLibCalls/StrCat.ll b/test/Transforms/SimplifyLibCalls/StrCat.ll
index 4e3d0ab..3ea691a 100644
--- a/test/Transforms/SimplifyLibCalls/StrCat.ll
+++ b/test/Transforms/SimplifyLibCalls/StrCat.ll
@@ -1,9 +1,9 @@
; Test that the StrCatOptimizer works correctly
; PR3661
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*strcat}
+; RUN: not grep "call.*strcat"
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: grep {puts.*%arg1}
+; RUN: grep "puts.*%arg1"
; This transformation requires the pointer size, as it assumes that size_t is
; the size of a pointer.
diff --git a/test/Transforms/SimplifyLibCalls/StrLen.ll b/test/Transforms/SimplifyLibCalls/StrLen.ll
index acd8aaf..4a20bbd 100644
--- a/test/Transforms/SimplifyLibCalls/StrLen.ll
+++ b/test/Transforms/SimplifyLibCalls/StrLen.ll
@@ -1,6 +1,6 @@
 ; Test that the StrLenOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*strlen}
+; RUN: not grep "call.*strlen"
target datalayout = "e-p:32:32"
@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=3]
diff --git a/test/Transforms/SimplifyLibCalls/StrNCat.ll b/test/Transforms/SimplifyLibCalls/StrNCat.ll
index d09c022..073792b 100644
--- a/test/Transforms/SimplifyLibCalls/StrNCat.ll
+++ b/test/Transforms/SimplifyLibCalls/StrNCat.ll
@@ -1,8 +1,8 @@
; Test that the StrNCatOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*strncat}
+; RUN: not grep "call.*strncat"
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: grep {puts.*%arg1}
+; RUN: grep "puts.*%arg1"
; This transformation requires the pointer size, as it assumes that size_t is
; the size of a pointer.
diff --git a/test/Transforms/SimplifyLibCalls/StrNCpy.ll b/test/Transforms/SimplifyLibCalls/StrNCpy.ll
index c8af3ca..4e47b31 100644
--- a/test/Transforms/SimplifyLibCalls/StrNCpy.ll
+++ b/test/Transforms/SimplifyLibCalls/StrNCpy.ll
@@ -1,6 +1,6 @@
; Test that the StrNCpyOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*strncpy}
+; RUN: not grep "call.*strncpy"
; This transformation requires the pointer size, as it assumes that size_t is
; the size of a pointer.
diff --git a/test/Transforms/SimplifyLibCalls/ToAscii.ll b/test/Transforms/SimplifyLibCalls/ToAscii.ll
index e2b5683..aef4733 100644
--- a/test/Transforms/SimplifyLibCalls/ToAscii.ll
+++ b/test/Transforms/SimplifyLibCalls/ToAscii.ll
@@ -1,6 +1,6 @@
; Test that the ToAsciiOptimizer works correctly
; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep {call.*toascii}
+; RUN: not grep "call.*toascii"
declare i32 @toascii(i32)
diff --git a/test/Transforms/SimplifyLibCalls/abs.ll b/test/Transforms/SimplifyLibCalls/abs.ll
index 6fbe0b9..3934a5b 100644
--- a/test/Transforms/SimplifyLibCalls/abs.ll
+++ b/test/Transforms/SimplifyLibCalls/abs.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplify-libcalls -S | grep {select i1 %ispos}
+; RUN: opt < %s -simplify-libcalls -S | grep "select i1 %ispos"
; PR2337
define i32 @test(i32 %x) {
diff --git a/test/Transforms/SimplifyLibCalls/exp2.ll b/test/Transforms/SimplifyLibCalls/exp2.ll
index 2f5d910..a592775 100644
--- a/test/Transforms/SimplifyLibCalls/exp2.ll
+++ b/test/Transforms/SimplifyLibCalls/exp2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplify-libcalls -S | grep {call.*ldexp} | count 4
+; RUN: opt < %s -simplify-libcalls -S | grep "call.*ldexp" | count 4
; rdar://5852514
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/Transforms/SimplifyLibCalls/memmove.ll b/test/Transforms/SimplifyLibCalls/memmove.ll
index c0c0050..5aaeeeb 100644
--- a/test/Transforms/SimplifyLibCalls/memmove.ll
+++ b/test/Transforms/SimplifyLibCalls/memmove.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplify-libcalls -S | grep {llvm.memmove}
+; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memmove"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i686-pc-linux-gnu"
diff --git a/test/Transforms/SimplifyLibCalls/memset-64.ll b/test/Transforms/SimplifyLibCalls/memset-64.ll
index fb752c4..92412de 100644
--- a/test/Transforms/SimplifyLibCalls/memset-64.ll
+++ b/test/Transforms/SimplifyLibCalls/memset-64.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplify-libcalls -S | grep {llvm.memset}
+; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/Transforms/SimplifyLibCalls/memset.ll b/test/Transforms/SimplifyLibCalls/memset.ll
index 0aede06..853215a 100644
--- a/test/Transforms/SimplifyLibCalls/memset.ll
+++ b/test/Transforms/SimplifyLibCalls/memset.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplify-libcalls -S | grep {llvm.memset}
+; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i686-pc-linux-gnu"
diff --git a/test/Transforms/SimplifyLibCalls/pow2.ll b/test/Transforms/SimplifyLibCalls/pow2.ll
index f8364f7..f0964e7 100644
--- a/test/Transforms/SimplifyLibCalls/pow2.ll
+++ b/test/Transforms/SimplifyLibCalls/pow2.ll
@@ -1,6 +1,6 @@
; Testcase for calls to the standard C "pow" function
;
-; RUN: opt < %s -simplify-libcalls -S | not grep {call .pow}
+; RUN: opt < %s -simplify-libcalls -S | not grep "call .pow"
declare double @pow(double, double)
diff --git a/test/Transforms/Sink/basic.ll b/test/Transforms/Sink/basic.ll
index 4c531d8..1d0b6b5 100644
--- a/test/Transforms/Sink/basic.ll
+++ b/test/Transforms/Sink/basic.ll
@@ -36,3 +36,29 @@ true:
false:
ret i32 0
}
+
+; Sink to the nearest post-dominator
+
+; CHECK: @diamond
+; CHECK: X:
+; CHECK-NEXT: phi
+; CHECK-NEXT: mul nsw
+; CHECK-NEXT: sub
+
+define i32 @diamond(i32 %a, i32 %b, i32 %c) {
+ %1 = mul nsw i32 %c, %b
+ %2 = icmp sgt i32 %a, 0
+ br i1 %2, label %B0, label %B1
+
+B0: ; preds = %0
+ br label %X
+
+B1: ; preds = %0
+ br label %X
+
+X: ; preds = %B0, %B1
+ %.01 = phi i32 [ %c, %B0 ], [ %a, %B1 ]
+ %R = sub i32 %1, %.01
+ ret i32 %R
+}
+
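
Note: the added @diamond test checks that Sink moves an instruction whose only use lies past a diamond down to the join block, its nearest post-dominator, rather than leaving it speculatively executed in the entry block. Roughly:

    ; before: %1 executes on every path, though only %X needs it
    %1 = mul nsw i32 %c, %b
    br i1 %2, label %B0, label %B1
    ; after -sink, at the join point:
    X:
      %.01 = phi i32 [ %c, %B0 ], [ %a, %B1 ]
      %1 = mul nsw i32 %c, %b
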
diff --git a/test/Transforms/TailCallElim/ackermann.ll b/test/Transforms/TailCallElim/ackermann.ll
index 0c140ad..5b5dbcc 100644
--- a/test/Transforms/TailCallElim/ackermann.ll
+++ b/test/Transforms/TailCallElim/ackermann.ll
@@ -1,5 +1,5 @@
; This function contains two tail calls, which should be eliminated
-; RUN: opt < %s -tailcallelim -stats -disable-output |& grep {2 tailcallelim}
+; RUN: opt < %s -tailcallelim -stats -disable-output 2>&1 | grep "2 tailcallelim"
define i32 @Ack(i32 %M.1, i32 %N.1) {
entry:
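
Note: this hunk and the TailCallElim/TailDup ones below also retire |&, which pipes stdout and stderr together under Tcl's exec (and csh) but is not POSIX sh, in favor of the portable spelling 2>&1 |. The -stats report is written to stderr, so without the redirection grep would see nothing. Sketch, with -some-pass standing in for a real pass name:

    ; RUN: opt < %s -some-pass -stats -disable-output 2>&1 | grep "1 some-pass"
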
diff --git a/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll b/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll
index 5cc92e1..e4f8b48 100644
--- a/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll
+++ b/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -tailcallelim -S | \
-; RUN: grep {call i32 @foo}
+; RUN: grep "call i32 @foo"
declare void @bar(i32*)
diff --git a/test/Transforms/TailCallElim/dup_tail.ll b/test/Transforms/TailCallElim/dup_tail.ll
index 9363880..42ac2f9 100644
--- a/test/Transforms/TailCallElim/dup_tail.ll
+++ b/test/Transforms/TailCallElim/dup_tail.ll
@@ -1,5 +1,5 @@
; Duplicate the return into if.end to enable TCE.
-; RUN: opt %s -tailcallelim -stats -disable-output |& grep {Number of return duplicated}
+; RUN: opt %s -tailcallelim -stats -disable-output 2>&1 | grep "Number of return duplicated"
define i32 @fib(i32 %n) nounwind ssp {
entry:
diff --git a/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll b/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll
index 3dddb01..3d01d17 100644
--- a/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll
+++ b/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -tailcallelim -S | \
-; RUN: grep {tail call void @foo}
+; RUN: grep "tail call void @foo"
declare void @foo()
diff --git a/test/Transforms/TailDup/2008-06-11-AvoidDupLoopHeader.ll b/test/Transforms/TailDup/2008-06-11-AvoidDupLoopHeader.ll
index 03e99bc..7853d7b 100644
--- a/test/Transforms/TailDup/2008-06-11-AvoidDupLoopHeader.ll
+++ b/test/Transforms/TailDup/2008-06-11-AvoidDupLoopHeader.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -tailduplicate -taildup-threshold=3 -stats -disable-output |& not grep tailduplicate
+; RUN: opt < %s -tailduplicate -taildup-threshold=3 -stats -disable-output 2>&1 | not grep tailduplicate
; XFAIL: *
define i32 @foo(i32 %l) nounwind {
diff --git a/test/Verifier/2002-04-13-RetTypes.ll b/test/Verifier/2002-04-13-RetTypes.ll
index 4c1ddd1..af46839 100644
--- a/test/Verifier/2002-04-13-RetTypes.ll
+++ b/test/Verifier/2002-04-13-RetTypes.ll
@@ -1,6 +1,6 @@
-; RUN: not llvm-as < %s |& grep {value doesn't match function result type 'i32'}
+; RUN: not llvm-as < %s 2>&1 | grep "value doesn't match function result type 'i32'"
-; Verify the the operand type of the ret instructions in a function match the
+; Verify the operand type of the ret instructions in a function matches the
 ; declared return type of the function they live in.
;
diff --git a/test/Verifier/2002-11-05-GetelementptrPointers.ll b/test/Verifier/2002-11-05-GetelementptrPointers.ll
index 1f71387..108ae5f 100644
--- a/test/Verifier/2002-11-05-GetelementptrPointers.ll
+++ b/test/Verifier/2002-11-05-GetelementptrPointers.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& grep {invalid getelementptr indices}
+; RUN: not llvm-as < %s 2>&1 | grep "invalid getelementptr indices"
; This testcase is invalid because we are indexing into a pointer that is
; contained WITHIN a structure.
diff --git a/test/Verifier/2006-07-11-StoreStruct.ll b/test/Verifier/2006-07-11-StoreStruct.ll
index 80ab122..65b229d 100644
--- a/test/Verifier/2006-07-11-StoreStruct.ll
+++ b/test/Verifier/2006-07-11-StoreStruct.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s |& not grep {Instruction operands must be first-class}
+; RUN: llvm-as < %s 2>&1 | not grep "Instruction operands must be first-class"
; This previously was for PR826, but structs are now first-class so
; the following is now valid.
diff --git a/test/Verifier/2006-10-15-AddrLabel.ll b/test/Verifier/2006-10-15-AddrLabel.ll
index 0b73b47..c8fedb5 100644
--- a/test/Verifier/2006-10-15-AddrLabel.ll
+++ b/test/Verifier/2006-10-15-AddrLabel.ll
@@ -1,4 +1,5 @@
-; RUN: not llvm-as < %s > /dev/null |& grep {basic block pointers are invalid}
+; RUN: not llvm-as < %s > /dev/null 2> %t
+; RUN: grep "basic block pointers are invalid" %t
define i32 @main() {
%foo = call i8* %llvm.stacksave()
diff --git a/test/Verifier/2006-12-12-IntrinsicDefine.ll b/test/Verifier/2006-12-12-IntrinsicDefine.ll
index 8d09b51..6e7468c 100644
--- a/test/Verifier/2006-12-12-IntrinsicDefine.ll
+++ b/test/Verifier/2006-12-12-IntrinsicDefine.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& grep {llvm intrinsics cannot be defined}
+; RUN: not llvm-as < %s 2>&1 | grep "llvm intrinsics cannot be defined"
; PR1047
define void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) {
diff --git a/test/Verifier/2008-03-01-AllocaSized.ll b/test/Verifier/2008-03-01-AllocaSized.ll
index 079a75d..51258be 100644
--- a/test/Verifier/2008-03-01-AllocaSized.ll
+++ b/test/Verifier/2008-03-01-AllocaSized.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as %s -o /dev/null |& grep {Cannot allocate unsized type}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Cannot allocate unsized type"
; PR2113
define void @test() {
diff --git a/test/Verifier/2008-08-22-MemCpyAlignment.ll b/test/Verifier/2008-08-22-MemCpyAlignment.ll
index aaf69ae..c6d5afd 100644
--- a/test/Verifier/2008-08-22-MemCpyAlignment.ll
+++ b/test/Verifier/2008-08-22-MemCpyAlignment.ll
@@ -1,11 +1,11 @@
-; RUN: not llvm-as %s -o /dev/null |& grep {alignment argument of memory intrinsics must be a constant int}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "alignment argument of memory intrinsics must be a constant int"
; PR2318
define void @x(i8* %a, i8* %src, i64 %len, i32 %align) nounwind {
entry:
- tail call void @llvm.memcpy.i64( i8* %a, i8* %src, i64 %len, i32 %align) nounwind
+ tail call void @llvm.memcpy.p0i8.p0i8.i64( i8* %a, i8* %src, i64 %len, i32 %align, i1 false) nounwind
ret void
}
-declare void @llvm.memcpy.i64( i8* %a, i8* %src, i64 %len, i32)
+declare void @llvm.memcpy.p0i8.p0i8.i64( i8* %a, i8* %src, i64 %len, i32, i1)
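
Note: besides the |& fix, this test moves from the retired @llvm.memcpy.i64 to the overloaded @llvm.memcpy.p0i8.p0i8.i64, whose trailing i1 marks the access volatile; the alignment operand the verifier is checking here stays the fourth argument. The five-argument form, for reference:

    declare void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 %align, i1 %isvolatile)
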
diff --git a/test/Verifier/2008-11-15-RetVoid.ll b/test/Verifier/2008-11-15-RetVoid.ll
index aaef703..42503fa 100644
--- a/test/Verifier/2008-11-15-RetVoid.ll
+++ b/test/Verifier/2008-11-15-RetVoid.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& grep {value doesn't match function result type 'void'}
+; RUN: not llvm-as < %s 2>&1 | grep "value doesn't match function result type 'void'"
define void @foo() {
ret i32 0
diff --git a/test/Verifier/2010-08-07-PointerIntrinsic.ll b/test/Verifier/2010-08-07-PointerIntrinsic.ll
index bf5563d..3136c61 100644
--- a/test/Verifier/2010-08-07-PointerIntrinsic.ll
+++ b/test/Verifier/2010-08-07-PointerIntrinsic.ll
@@ -1,5 +1,5 @@
; RUN: not llvm-as < %s 2> %t
-; RUN: grep {Broken module} %t
+; RUN: grep "Broken module" %t
; PR7316
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32"
diff --git a/test/Verifier/AmbiguousPhi.ll b/test/Verifier/AmbiguousPhi.ll
index 9a72530..f31bc10 100644
--- a/test/Verifier/AmbiguousPhi.ll
+++ b/test/Verifier/AmbiguousPhi.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& grep {multiple entries for the same basic block}
+; RUN: not llvm-as < %s 2>&1 | grep "multiple entries for the same basic block"
diff --git a/test/Verifier/PhiGrouping.ll b/test/Verifier/PhiGrouping.ll
index dc529dc..7b42fd2 100644
--- a/test/Verifier/PhiGrouping.ll
+++ b/test/Verifier/PhiGrouping.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& grep {PHI nodes not grouped at top}
+; RUN: not llvm-as < %s 2>&1 | grep "PHI nodes not grouped at top"
diff --git a/test/Verifier/SelfReferential.ll b/test/Verifier/SelfReferential.ll
index 70154b7..c24c0eb 100644
--- a/test/Verifier/SelfReferential.ll
+++ b/test/Verifier/SelfReferential.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as %s -o /dev/null |& grep {Only PHI nodes may reference their own value}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Only PHI nodes may reference their own value"
; Test that self referential instructions are not allowed
diff --git a/test/Verifier/aliasing-chain.ll b/test/Verifier/aliasing-chain.ll
index fc5ef1c..a52e796 100644
--- a/test/Verifier/aliasing-chain.ll
+++ b/test/Verifier/aliasing-chain.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as %s -o /dev/null |& grep {Aliasing chain should end with function or global variable}
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Aliasing chain should end with function or global variable"
 ; Test that an aliasing chain does not create a cycle
diff --git a/test/Verifier/cttz-undef-arg.ll b/test/Verifier/cttz-undef-arg.ll
index 48cd061..66c5396 100644
--- a/test/Verifier/cttz-undef-arg.ll
+++ b/test/Verifier/cttz-undef-arg.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s -o /dev/null |& FileCheck %s
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
declare i32 @llvm.ctlz.i32(i32, i1)
declare i32 @llvm.cttz.i32(i32, i1)
diff --git a/test/Verifier/dominates.ll b/test/Verifier/dominates.ll
new file mode 100644
index 0000000..17e2c33
--- /dev/null
+++ b/test/Verifier/dominates.ll
@@ -0,0 +1,57 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+define i32 @f1(i32 %x) {
+ %y = add i32 %z, 1
+ %z = add i32 %x, 1
+ ret i32 %y
+; CHECK: Instruction does not dominate all uses!
+; CHECK-NEXT: %z = add i32 %x, 1
+; CHECK-NEXT: %y = add i32 %z, 1
+}
+
+declare i32 @g()
+define void @f2(i32 %x) {
+bb0:
+ %y1 = invoke i32 @g() to label %bb1 unwind label %bb2
+bb1:
+ ret void
+bb2:
+ %y2 = phi i32 [%y1, %bb0]
+ %y3 = landingpad i32 personality i32 ()* @g
+ cleanup
+ ret void
+; CHECK: Instruction does not dominate all uses!
+; CHECK-NEXT: %y1 = invoke i32 @g()
+; CHECK-NEXT: to label %bb1 unwind label %bb2
+; CHECK-NEXT: %y2 = phi i32 [ %y1, %bb0 ]
+}
+
+define void @f3(i32 %x) {
+bb0:
+ %y1 = invoke i32 @g() to label %bb1 unwind label %bb2
+bb1:
+ ret void
+bb2:
+ %y2 = landingpad i32 personality i32 ()* @g
+ cleanup
+ br label %bb3
+bb3:
+ %y3 = phi i32 [%y1, %bb2]
+ ret void
+; CHECK: Instruction does not dominate all uses!
+; CHECK-NEXT: %y1 = invoke i32 @g()
+; CHECK-NEXT: to label %bb1 unwind label %bb2
+; CHECK-NEXT: %y3 = phi i32 [ %y1, %bb2 ]
+}
+
+define void @f4(i32 %x) {
+bb0:
+ br label %bb1
+bb1:
+ %y3 = phi i32 [%y1, %bb0]
+ %y1 = add i32 %x, 1
+ ret void
+; CHECK: Instruction does not dominate all uses!
+; CHECK-NEXT: %y1 = add i32 %x, 1
+; CHECK-NEXT: %y3 = phi i32 [ %y1, %bb0 ]
+}
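
Note: the new dominates.ll exercises the verifier's SSA dominance checks, including the invoke subtlety in @f2 and @f3: an invoke's result is defined only along the normal edge, so neither a phi in the unwind destination nor anything reached solely through the landing pad may use it. The legal counterpart, as a fragment with hypothetical labels:

    %y1 = invoke i32 @g()
            to label %ok unwind label %lp
    ok:
      %y2 = add i32 %y1, 1   ; fine: %ok is the normal destination
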
diff --git a/test/Verifier/fpmath.ll b/test/Verifier/fpmath.ll
index b764a63..7002c5c 100644
--- a/test/Verifier/fpmath.ll
+++ b/test/Verifier/fpmath.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s |& FileCheck %s
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
define void @fpmath1(i32 %i, float %f, <2 x float> %g) {
%s = add i32 %i, %i, !fpmath !0
diff --git a/test/Verifier/invoke-1.ll b/test/Verifier/invoke-1.ll
deleted file mode 100644
index 427abe0..0000000
--- a/test/Verifier/invoke-1.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: not llvm-as < %s |& grep {not verify as correct}
-; PR1042
-
-define i32 @foo() {
- %A = invoke i32 @foo( )
- to label %L unwind label %L ; <i32> [#uses=1]
-L: ; preds = %0, %0
- ret i32 %A
-}
-
diff --git a/test/Verifier/invoke-2.ll b/test/Verifier/invoke-2.ll
deleted file mode 100644
index 0145935..0000000
--- a/test/Verifier/invoke-2.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: not llvm-as %s |& grep {not verify as correct}
-; PR1042
-
-define i32 @foo() {
- br i1 false, label %L1, label %L2
-L1: ; preds = %0
- %A = invoke i32 @foo( )
- to label %L unwind label %L ; <i32> [#uses=1]
-L2: ; preds = %0
- br label %L
-L: ; preds = %L2, %L1, %L1
- ret i32 %A
-}
-
diff --git a/test/Verifier/invoke.ll b/test/Verifier/invoke.ll
new file mode 100644
index 0000000..a48f9b6
--- /dev/null
+++ b/test/Verifier/invoke.ll
@@ -0,0 +1,80 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+; PR1042
+define i32 @foo() {
+; CHECK: The unwind destination does not have a landingpad instruction
+ %A = invoke i32 @foo( )
+ to label %L unwind label %L ; <i32> [#uses=1]
+L: ; preds = %0, %0
+ ret i32 %A
+}
+
+; PR1042
+define i32 @bar() {
+ br i1 false, label %L1, label %L2
+L1: ; preds = %0
+ %A = invoke i32 @bar( )
+ to label %L unwind label %L ; <i32> [#uses=1]
+L2: ; preds = %0
+ br label %L
+L: ; preds = %L2, %L1, %L1
+; CHECK: The unwind destination does not have a landingpad instruction
+; CHECK: Instruction does not dominate all uses
+ ret i32 %A
+}
+
+
+declare i32 @__gxx_personality_v0(...)
+declare void @llvm.donothing()
+declare void @llvm.trap()
+declare i8 @llvm.expect.i8(i8,i8)
+declare i32 @fn(i8 (i8, i8)*)
+
+define void @f1() {
+entry:
+; OK
+ invoke void @llvm.donothing()
+ to label %conta unwind label %contb
+
+conta:
+ ret void
+
+contb:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ ret void
+}
+
+define i8 @f2() {
+entry:
+; CHECK: Cannot invoke an intrinsinc other than donothing
+ invoke void @llvm.trap()
+ to label %cont unwind label %lpad
+
+cont:
+ ret i8 3
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ ret i8 2
+}
+
+define i32 @f3() {
+entry:
+; CHECK: Cannot take the address of an intrinsic
+ %call = call i32 @fn(i8 (i8, i8)* @llvm.expect.i8)
+ ret i32 %call
+}
+
+define void @f4() {
+entry:
+ invoke void @llvm.donothing()
+ to label %cont unwind label %cont
+
+cont:
+; CHECK: Block containing LandingPadInst must be jumped to only by the unwind edge of an invoke.
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ ret void
+}
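
Note: this file folds the two deleted PR1042 tests above into FileCheck form and adds the newer invoke rules: an unwind destination must begin with a landingpad (@foo, @bar), @llvm.donothing is the only intrinsic that may be invoked (@f2), an intrinsic's address cannot be taken (@f3), and a landingpad block may be entered only via unwind edges (@f4). The CHECK string in @f2 matches the verifier's message of that era verbatim, "intrinsinc" misspelling and all, since FileCheck needs an exact match. Condensed from @f1:

    ; the one intrinsic an invoke may target
    invoke void @llvm.donothing()
        to label %ok unwind label %lp
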
diff --git a/test/Verifier/range-1.ll b/test/Verifier/range-1.ll
index 611933a..b6a75d1 100644
--- a/test/Verifier/range-1.ll
+++ b/test/Verifier/range-1.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s -o /dev/null |& FileCheck %s
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
define void @f1(i8* %x) {
entry:
@@ -76,3 +76,67 @@ entry:
}
!8 = metadata !{i8 0, i8 0}
; CHECK: Range must not be empty!
+
+define i8 @f10(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !9
+ ret i8 %y
+}
+!9 = metadata !{i8 0, i8 2, i8 1, i8 3}
+; CHECK: Intervals are overlapping
+
+define i8 @f11(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !10
+ ret i8 %y
+}
+!10 = metadata !{i8 0, i8 2, i8 2, i8 3}
+; CHECK: Intervals are contiguous
+
+define i8 @f12(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !11
+ ret i8 %y
+}
+!11 = metadata !{i8 1, i8 2, i8 -1, i8 0}
+; CHECK: Intervals are not in order
+
+define i8 @f13(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !12
+ ret i8 %y
+}
+!12 = metadata !{i8 1, i8 3, i8 5, i8 1}
+; CHECK: Intervals are contiguous
+
+define i8 @f14(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !13
+ ret i8 %y
+}
+!13 = metadata !{i8 1, i8 3, i8 5, i8 2}
+; CHECK: Intervals are overlapping
+
+define i8 @f15(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !14
+ ret i8 %y
+}
+!14 = metadata !{i8 10, i8 1, i8 12, i8 13}
+; CHECK: Intervals are overlapping
+
+define i8 @f16(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !16
+ ret i8 %y
+}
+!16 = metadata !{i8 1, i8 3, i8 4, i8 5, i8 6, i8 2}
+; CHECK: Intervals are overlapping
+
+define i8 @f17(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !17
+ ret i8 %y
+}
+!17 = metadata !{i8 1, i8 3, i8 4, i8 5, i8 6, i8 1}
+; CHECK: Intervals are contiguous
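
Note: the added range-1.ll cases enumerate the well-formedness rules for !range metadata: operands come in [lo, hi) pairs that must be listed in order, must not overlap, and must not be contiguous (adjacent intervals are expected to be merged); the range-2.ll additions below give the valid counterparts, including pairs that wrap around the signed boundary. A minimal well-formed example:

    %v = load i8* %p, align 1, !range !0
    ; two ordered, disjoint, non-adjacent intervals: [0,2) and [3,5)
    !0 = metadata !{i8 0, i8 2, i8 3, i8 5}
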
diff --git a/test/Verifier/range-2.ll b/test/Verifier/range-2.ll
index ef542c8..8d85d19 100644
--- a/test/Verifier/range-2.ll
+++ b/test/Verifier/range-2.ll
@@ -20,3 +20,17 @@ entry:
ret i8 %y
}
!2 = metadata !{i8 1, i8 3, i8 5, i8 42}
+
+define i8 @f4(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !3
+ ret i8 %y
+}
+!3 = metadata !{i8 -1, i8 0, i8 1, i8 2}
+
+define i8 @f5(i8* %x) {
+entry:
+ %y = load i8* %x, align 1, !range !4
+ ret i8 %y
+}
+!4 = metadata !{i8 -1, i8 0, i8 1, i8 -2}
diff --git a/test/YAMLParser/spec-05-02-utf8.data b/test/YAMLParser/spec-05-02-utf8.data
index b306bdb..028f41b 100644
--- a/test/YAMLParser/spec-05-02-utf8.data
+++ b/test/YAMLParser/spec-05-02-utf8.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
# Invalid use of BOM
# inside a
diff --git a/test/YAMLParser/spec-05-10.data b/test/YAMLParser/spec-05-10.data
index 6788f0b..bab2c1b 100644
--- a/test/YAMLParser/spec-05-10.data
+++ b/test/YAMLParser/spec-05-10.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
commercial-at: @text
grave-accent: `text
diff --git a/test/YAMLParser/spec-05-12.data b/test/YAMLParser/spec-05-12.data
index 7dadff7..eedfc08 100644
--- a/test/YAMLParser/spec-05-12.data
+++ b/test/YAMLParser/spec-05-12.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
#
# We don't currently reject tabs as indentation.
# XFAIL: *
diff --git a/test/YAMLParser/spec-05-15.data b/test/YAMLParser/spec-05-15.data
index cd8421a..27dbd83 100644
--- a/test/YAMLParser/spec-05-15.data
+++ b/test/YAMLParser/spec-05-15.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
Bad escapes:
"\c
diff --git a/test/YAMLParser/spec-07-03.data b/test/YAMLParser/spec-07-03.data
index 7ca9483..c4a5299 100644
--- a/test/YAMLParser/spec-07-03.data
+++ b/test/YAMLParser/spec-07-03.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
%YAML 1.1
%YAML 1.1
diff --git a/test/YAMLParser/spec-07-05.data b/test/YAMLParser/spec-07-05.data
index 279b54a..f7cff3a 100644
--- a/test/YAMLParser/spec-07-05.data
+++ b/test/YAMLParser/spec-07-05.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
#
# We don't currently parse TAG directives.
# XFAIL: *
diff --git a/test/YAMLParser/spec-08-04.data b/test/YAMLParser/spec-08-04.data
index f13538b..73c493d 100644
--- a/test/YAMLParser/spec-08-04.data
+++ b/test/YAMLParser/spec-08-04.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
#
# We don't currently look at the content of literal tags.
# XFAIL: *
diff --git a/test/YAMLParser/spec-08-06.data b/test/YAMLParser/spec-08-06.data
index a811bfd..9844f53 100644
--- a/test/YAMLParser/spec-08-06.data
+++ b/test/YAMLParser/spec-08-06.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
#
# We don't currently validate tags.
# XFAIL: *
diff --git a/test/YAMLParser/spec-09-02.data b/test/YAMLParser/spec-09-02.data
index f690378..9d8a58c 100644
--- a/test/YAMLParser/spec-09-02.data
+++ b/test/YAMLParser/spec-09-02.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
#
# Indent trimming is not yet implemented.
# XFAIL: *
diff --git a/test/YAMLParser/spec-09-14.data b/test/YAMLParser/spec-09-14.data
index 890f6bf..a83fcd4 100644
--- a/test/YAMLParser/spec-09-14.data
+++ b/test/YAMLParser/spec-09-14.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
#
# Not quite sure why this doesn't fail.
# XFAIL: *
diff --git a/test/YAMLParser/spec-09-21.data b/test/YAMLParser/spec-09-21.data
index 2bcc283..6eb7917 100644
--- a/test/YAMLParser/spec-09-21.data
+++ b/test/YAMLParser/spec-09-21.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
- |
diff --git a/test/YAMLParser/spec-10-08.data b/test/YAMLParser/spec-10-08.data
index 5b981e9..53faeb9 100644
--- a/test/YAMLParser/spec-10-08.data
+++ b/test/YAMLParser/spec-10-08.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s |& FileCheck %s
+# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
#
# This fails because even without a key token, some contexts (in this case flow
# maps) allow implicit null keys, which mix with this in weird ways.
diff --git a/test/lit.cfg b/test/lit.cfg
index d74bc7b..6f44bb3 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -9,8 +9,22 @@ import re
# name: The name of this test suite.
config.name = 'LLVM'
+# Tweak PATH for Win32 to decide to use bash.exe or not.
+if sys.platform in ['win32']:
+ # Seek sane tools in directories and set to $PATH.
+ path = getattr(config, 'lit_tools_dir', None)
+ path = lit.getToolsPath(path,
+ config.environment['PATH'],
+ ['cmp.exe', 'grep.exe', 'sed.exe'])
+ if path is not None:
+ path = os.path.pathsep.join((path,
+ config.environment['PATH']))
+ config.environment['PATH'] = path
+
# testFormat: The test format to use to interpret tests.
-config.test_format = lit.formats.TclTest()
+execute_external = (not sys.platform in ['win32']
+ or lit.getBashPath() not in [None, ""])
+config.test_format = lit.formats.ShTest(execute_external)
# To ignore test output on stderr so it doesn't trigger failures uncomment this:
#config.test_format = lit.formats.TclTest(ignoreStdErr=True)
@@ -19,21 +33,14 @@ config.test_format = lit.formats.TclTest()
# set by on_clone().
config.suffixes = []
+# excludes: A list of directories to exclude from the testsuite. The 'Inputs'
+# subdirectories contain auxiliary inputs for various tests in their parent
+# directories.
+config.excludes = ['Inputs']
+
# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)
-# Tweak PATH for Win32
-if sys.platform in ['win32']:
- # Seek sane tools in directories and set to $PATH.
- path = getattr(config, 'lit_tools_dir', None)
- path = lit.getToolsPath(path,
- config.environment['PATH'],
- ['cmp.exe', 'grep.exe', 'sed.exe'])
- if path is not None:
- path = os.path.pathsep.join((path,
- config.environment['PATH']))
- config.environment['PATH'] = path
-
# test_exec_root: The root path where tests should be run.
llvm_obj_root = getattr(config, 'llvm_obj_root', None)
if llvm_obj_root is not None:
@@ -132,18 +139,6 @@ if config.test_exec_root is None:
###
-# Load site data from DejaGNU's site.exp.
-import re
-site_exp = {}
-# FIXME: Implement lit.site.cfg.
-for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')):
- m = re.match('set ([^ ]+) "(.*)"', line)
- if m:
- site_exp[m.group(1)] = m.group(2)
-
-# Provide target_triple for use in XFAIL and XTARGET.
-config.target_triple = site_exp['target_triplet']
-
# When running under valgrind, we mangle '-vg' or '-vg_leak' onto the end of the
# triple so we can check it with XFAIL and XTARGET.
config.target_triple += lit.valgrindTriple
@@ -164,9 +159,10 @@ if jit_impl_cfg == 'mcjit':
else:
config.substitutions.append( ('%lli', 'lli') )
-# Add substitutions.
-for sub in ['link', 'shlibext', 'ocamlopt', 'llvmshlibdir']:
- config.substitutions.append(('%' + sub, site_exp[sub]))
+# Add site-specific substitutions.
+config.substitutions.append( ('%ocamlopt', config.ocamlopt_executable) )
+config.substitutions.append( ('%llvmshlibdir', config.llvm_shlib_dir) )
+config.substitutions.append( ('%shlibext', config.llvm_shlib_ext) )
# For each occurrence of an llvm tool name as its own word, replace it
# with the full path to the build directory holding that tool. This
@@ -187,12 +183,12 @@ for pattern in [r"\bbugpoint\b(?!-)", r"(?<!/|-)\bclang\b(?!-)",
r"\bllvm-bcanalyzer\b", r"\bllvm-config\b",
r"\bllvm-cov\b", r"\bllvm-diff\b",
r"\bllvm-dis\b", r"\bllvm-dwarfdump\b",
- r"\bllvm-extract\b", r"\bllvm-ld\b",
+ r"\bllvm-extract\b",
r"\bllvm-link\b", r"\bllvm-mc\b",
r"\bllvm-nm\b", r"\bllvm-objdump\b",
r"\bllvm-prof\b", r"\bllvm-ranlib\b",
r"\bllvm-rtdyld\b", r"\bllvm-shlib\b",
- r"\bllvm-size\b", r"\bllvm-stub\b",
+ r"\bllvm-size\b",
# Don't match '-llvmc'.
r"(?<!-)\bllvmc\b", r"\blto\b",
# Don't match '.opt', '-opt',
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 8b81186..178b22f 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -1,14 +1,20 @@
## Autogenerated by LLVM/Clang configuration.
# Do not edit!
+config.target_triple = "@TARGET_TRIPLE@"
config.llvm_src_root = "@LLVM_SOURCE_DIR@"
config.llvm_obj_root = "@LLVM_BINARY_DIR@"
config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+config.llvm_shlib_dir = "@SHLIBDIR@"
+config.llvm_shlib_ext = "@SHLIBEXT@"
config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
config.python_executable = "@PYTHON_EXECUTABLE@"
+config.ocamlopt_executable = "@OCAMLOPT@"
config.enable_shared = @ENABLE_SHARED@
config.enable_assertions = @ENABLE_ASSERTIONS@
config.targets_to_build = "@TARGETS_TO_BUILD@"
config.llvm_bindings = "@LLVM_BINDINGS@"
+config.host_os = "@HOST_OS@"
+config.host_arch = "@HOST_ARCH@"
# Support substitution of the tools_dir with user parameters. This is
# used when we can't determine the tool dir at configuration time.
diff --git a/test/site.exp.in b/test/site.exp.in
deleted file mode 100644
index cfb2eac..0000000
--- a/test/site.exp.in
+++ /dev/null
@@ -1,16 +0,0 @@
-## Autogenerated by LLVM configuration.
-# Do not edit!
-set target_triplet "@TARGET_TRIPLE@"
-set TARGETS_TO_BUILD "@TARGETS_TO_BUILD@"
-set llvmshlibdir "@SHLIBDIR@"
-set llvm_bindings "@LLVM_BINDINGS@"
-set srcroot "@LLVM_SOURCE_DIR@"
-set objroot "@LLVM_BINARY_DIR@"
-set srcdir "@LLVM_SOURCE_DIR@"
-set objdir "@LLVM_BINARY_DIR@"
-set link "@TEST_LINK_CMD@"
-set shlibext "@SHLIBEXT@"
-set ocamlopt "@OCAMLOPT@"
-set valgrind "@VALGRIND@"
-set grep "@GREP@"
-set gas "@AS@"
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 9668c76..1bfc2fe 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -2,14 +2,6 @@
# three small executables. This is done to minimize memory load in parallel
# builds. Please retain this ordering.
-# If polly exists and is not disabled compile it and add it to the LLVM tools.
-option(LLVM_BUILD_POLLY "Compile polly" ON)
-if( EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/polly/CMakeLists.txt )
- if (LLVM_BUILD_POLLY)
- add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/polly)
- endif (LLVM_BUILD_POLLY)
-endif( EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/polly/CMakeLists.txt )
-
if( NOT WIN32 OR MSYS OR CYGWIN )
# We currently require 'sed' to build llvm-config, so don't try to build it
# on pure Win32.
@@ -27,7 +19,6 @@ add_subdirectory(llvm-ar)
add_subdirectory(llvm-nm)
add_subdirectory(llvm-size)
-add_subdirectory(llvm-ld)
add_subdirectory(llvm-cov)
add_subdirectory(llvm-prof)
add_subdirectory(llvm-link)
@@ -44,7 +35,6 @@ add_subdirectory(llvm-dwarfdump)
add_subdirectory(bugpoint)
add_subdirectory(bugpoint-passes)
add_subdirectory(llvm-bcanalyzer)
-add_subdirectory(llvm-stub)
add_subdirectory(llvm-stress)
if( NOT WIN32 )
@@ -58,14 +48,8 @@ if( LLVM_ENABLE_PIC )
endif()
endif()
-set(LLVM_CLANG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/clang" CACHE PATH "Path to Clang source directory")
-
-if (NOT ${LLVM_CLANG_SOURCE_DIR} STREQUAL ""
- AND EXISTS ${LLVM_CLANG_SOURCE_DIR}/CMakeLists.txt)
- option(LLVM_BUILD_CLANG "Whether to build Clang as part of LLVM" ON)
- if (${LLVM_BUILD_CLANG})
- add_subdirectory(${LLVM_CLANG_SOURCE_DIR} clang)
- endif()
-endif ()
+add_llvm_external_project(clang)
+add_llvm_external_project(lld)
+add_llvm_external_project(polly)
set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
diff --git a/tools/LLVMBuild.txt b/tools/LLVMBuild.txt
index aba990f..df4aa9f 100644
--- a/tools/LLVMBuild.txt
+++ b/tools/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-ld llvm-link llvm-mc llvm-nm llvm-objdump llvm-prof llvm-ranlib llvm-rtdyld llvm-size llvm-stub macho-dump opt
+subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-link llvm-mc llvm-nm llvm-objdump llvm-prof llvm-ranlib llvm-rtdyld llvm-size macho-dump opt
[component_0]
type = Group
diff --git a/tools/Makefile b/tools/Makefile
index 8bf091a..2b4b9b7 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -29,9 +29,9 @@ OPTIONAL_DIRS := lldb
DIRS := llvm-config
PARALLEL_DIRS := opt llvm-as llvm-dis \
llc llvm-ranlib llvm-ar llvm-nm \
- llvm-ld llvm-prof llvm-link \
+ llvm-prof llvm-link \
lli llvm-extract llvm-mc \
- bugpoint llvm-bcanalyzer llvm-stub \
+ bugpoint llvm-bcanalyzer \
llvm-diff macho-dump llvm-objdump llvm-readobj \
llvm-rtdyld llvm-dwarfdump llvm-cov \
llvm-size llvm-stress
diff --git a/tools/bugpoint/BugDriver.cpp b/tools/bugpoint/BugDriver.cpp
index 6b219bf..21636ea 100644
--- a/tools/bugpoint/BugDriver.cpp
+++ b/tools/bugpoint/BugDriver.cpp
@@ -156,7 +156,7 @@ bool BugDriver::run(std::string &ErrMsg) {
// If we're not running as a child, the first thing that we must do is
// determine what the problem is. Does the optimization series crash the
// compiler, or does it produce illegal code? We make the top-level
- // decision by trying to run all of the passes on the the input program,
+ // decision by trying to run all of the passes on the input program,
// which should generate a bitcode file. If it does generate a bitcode
// file, then we know the compiler didn't crash, so try to diagnose a
// miscompilation.
diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index ac8e159..888d2c8 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp
@@ -24,7 +24,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index 25a2bae..d975d68 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp
@@ -128,7 +128,7 @@ static int RunProgramRemotelyWithTimeout(const sys::Path &RemoteClientPath,
ErrorFile.close();
}
- errs() << OS;
+ errs() << OS.str();
}
return ReturnCode;
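The one-token change above fixes a classic stream bug: assuming OS is a string stream (std::ostringstream or similar), inserting the stream object itself selects an implicit pointer-like conversion and prints an address instead of the text. A minimal standalone sketch of the bug and the fix, using std::ostringstream:

    #include <iostream>
    #include <sstream>

    int main() {
      std::ostringstream OS;
      OS << "remote client exited with " << 255;
      // Pre-C++11, streaming OS itself picks the void* "safe bool" conversion
      // and prints an address; str() returns the accumulated characters.
      std::cout << OS.str() << std::endl;
      return 0;
    }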
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index cfd84c0..9c17da6 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -153,6 +153,8 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
switch (tv->tv_u.tv_val) {
case LDPO_REL: // .o
case LDPO_DYN: // .so
+ // FIXME: Replace 3 with LDPO_PIE once that is in a released binutils.
+ case 3: // position independent executable
output_type = LTO_CODEGEN_PIC_MODEL_DYNAMIC;
break;
case LDPO_EXEC: // .exe
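Hard-coding 3 works because gold's plugin-api.h defines the output kinds in order (LDPO_REL=0, LDPO_EXEC=1, LDPO_DYN=2, LDPO_PIE=3). Note that LDPO_PIE is an enumerator, not a macro, so #ifdef cannot probe for it; until a released binutils ships the name, the least bad stopgap is a named local constant rather than a scattered literal. A hedged sketch (LDPO_PIE_Value is a hypothetical name, not part of this patch):

    // plugin-api.h defines LDPO_* as enumerators, so #ifdef can't detect
    // LDPO_PIE on old headers; a named constant documents the magic number.
    static const int LDPO_PIE_Value = 3; // binutils: REL=0, EXEC=1, DYN=2, PIE=3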
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index ceff8a6..8951050 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -18,6 +18,7 @@
#include "llvm/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/Support/IRReader.h"
#include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
#include "llvm/CodeGen/LinkAllCodegenComponents.h"
@@ -34,6 +35,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetMachine.h"
#include <memory>
using namespace llvm;
@@ -118,7 +120,7 @@ FileType("filetype", cl::init(TargetMachine::CGFT_AssemblyFile),
clEnumValN(TargetMachine::CGFT_AssemblyFile, "asm",
"Emit an assembly ('.s') file"),
clEnumValN(TargetMachine::CGFT_ObjectFile, "obj",
- "Emit a native object ('.o') file [experimental]"),
+ "Emit a native object ('.o') file"),
clEnumValN(TargetMachine::CGFT_Null, "null",
"Emit nothing, for performance testing"),
clEnumValEnd));
@@ -146,11 +148,6 @@ EnableFPMAD("enable-fp-mad",
cl::init(false));
static cl::opt<bool>
-PrintCode("print-machineinstrs",
- cl::desc("Print generated machine code"),
- cl::init(false));
-
-static cl::opt<bool>
DisableFPElim("disable-fp-elim",
cl::desc("Disable frame pointer elimination optimization"),
cl::init(false));
@@ -161,11 +158,6 @@ DisableFPElimNonLeaf("disable-non-leaf-fp-elim",
cl::init(false));
static cl::opt<bool>
-DisableExcessPrecision("disable-excess-fp-precision",
- cl::desc("Disable optimizations that may increase FP precision"),
- cl::init(false));
-
-static cl::opt<bool>
EnableUnsafeFPMath("enable-unsafe-fp-math",
cl::desc("Enable optimizations that may decrease FP precision"),
cl::init(false));
@@ -204,12 +196,30 @@ FloatABIForCalls("float-abi",
"Hard float ABI (uses FP registers)"),
clEnumValEnd));
+static cl::opt<llvm::FPOpFusion::FPOpFusionMode>
+FuseFPOps("fp-contract",
+ cl::desc("Enable aggresive formation of fused FP ops"),
+ cl::init(FPOpFusion::Standard),
+ cl::values(
+ clEnumValN(FPOpFusion::Fast, "fast",
+ "Fuse FP ops whenever profitable"),
+ clEnumValN(FPOpFusion::Standard, "on",
+ "Only fuse 'blessed' FP ops."),
+ clEnumValN(FPOpFusion::Strict, "off",
+ "Only fuse FP ops when the result won't be effected."),
+ clEnumValEnd));
+
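For readers unfamiliar with FP contraction: a fused multiply-add computes a*b+c with a single rounding, which is faster on hardware with FMA units but can change results relative to the separately rounded multiply-then-add, hence the three-level knob above. A small self-contained illustration, with values chosen to expose the difference:

    #include <cmath>
    #include <cstdio>

    int main() {
      double a = 1e16, b = 1.0000000000000002, c = -1e16;
      // std::fma rounds once; the separate multiply loses the low-order
      // product bits before the add, so the results differ (~2.22 vs 2.0).
      std::printf("fused:    %.17g\n", std::fma(a, b, c));
      std::printf("separate: %.17g\n", a * b + c);
      return 0;
    }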
static cl::opt<bool>
DontPlaceZerosInBSS("nozero-initialized-in-bss",
cl::desc("Don't place zero-initialized symbols into bss section"),
cl::init(false));
static cl::opt<bool>
+DisableSimplifyLibCalls("disable-simplify-libcalls",
+ cl::desc("Disable simplify-libcalls"),
+ cl::init(false));
+
+static cl::opt<bool>
EnableGuaranteedTailCallOpt("tailcallopt",
cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
cl::init(false));
@@ -229,11 +239,6 @@ EnableRealignStack("realign-stack",
cl::desc("Realign stack if needed"),
cl::init(true));
-static cl::opt<bool>
-DisableSwitchTables(cl::Hidden, "disable-jump-tables",
- cl::desc("Do not generate jump tables."),
- cl::init(false));
-
static cl::opt<std::string>
TrapFuncName("trap-func", cl::Hidden,
cl::desc("Emit a call to trap function rather than a trap instruction"),
@@ -249,6 +254,19 @@ SegmentedStacks("segmented-stacks",
cl::desc("Use segmented stacks if possible."),
cl::init(false));
+static cl::opt<bool>
+UseInitArray("use-init-array",
+ cl::desc("Use .init_array instead of .ctors."),
+ cl::init(false));
+
+static cl::opt<std::string> StopAfter("stop-after",
+ cl::desc("Stop compilation after a specific pass"),
+ cl::value_desc("pass-name"),
+ cl::init(""));
+static cl::opt<std::string> StartAfter("start-after",
+ cl::desc("Resume compilation after a specific pass"),
+ cl::value_desc("pass-name"),
+ cl::init(""));
// GetFileNameRoot - Helper function to get the basename of a filename.
static inline std::string
@@ -346,6 +364,15 @@ int main(int argc, char **argv) {
InitializeAllAsmPrinters();
InitializeAllAsmParsers();
+ // Initialize codegen and IR passes used by llc so that the -print-after,
+ // -print-before, and -stop-after options work.
+ PassRegistry *Registry = PassRegistry::getPassRegistry();
+ initializeCore(*Registry);
+ initializeCodeGen(*Registry);
+ initializeLoopStrengthReducePass(*Registry);
+ initializeLowerIntrinsicsPass(*Registry);
+ initializeUnreachableBlockElimPass(*Registry);
+
// Register the target printer for --version.
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
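Without the initialize*() calls above, PassRegistry::getPassInfo() cannot resolve textual pass names, so an invocation such as llc -stop-after=loop-reduce in.ll (loop-reduce being LoopStrengthReduce's registered argument) would be rejected even though the pass exists. The registration is what ties the new -stop-after/-start-after plumbing further down to real pass names.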
@@ -354,54 +381,39 @@ int main(int argc, char **argv) {
// Load the module to be compiled...
SMDiagnostic Err;
std::auto_ptr<Module> M;
+ Module *mod = 0;
+ Triple TheTriple;
+
+ bool SkipModule = MCPU == "help" ||
+ (!MAttrs.empty() && MAttrs.front() == "help");
+
+  // If the user just wants to list the available options, skip module loading.
+ if (!SkipModule) {
+ M.reset(ParseIRFile(InputFilename, Err, Context));
+ mod = M.get();
+ if (mod == 0) {
+ Err.print(argv[0], errs());
+ return 1;
+ }
- M.reset(ParseIRFile(InputFilename, Err, Context));
- if (M.get() == 0) {
- Err.print(argv[0], errs());
- return 1;
+ // If we are supposed to override the target triple, do so now.
+ if (!TargetTriple.empty())
+ mod->setTargetTriple(Triple::normalize(TargetTriple));
+ TheTriple = Triple(mod->getTargetTriple());
+ } else {
+ TheTriple = Triple(Triple::normalize(TargetTriple));
}
- Module &mod = *M.get();
- // If we are supposed to override the target triple, do so now.
- if (!TargetTriple.empty())
- mod.setTargetTriple(Triple::normalize(TargetTriple));
-
- Triple TheTriple(mod.getTargetTriple());
if (TheTriple.getTriple().empty())
TheTriple.setTriple(sys::getDefaultTargetTriple());
- // Allocate target machine. First, check whether the user has explicitly
- // specified an architecture to compile for. If so we have to look it up by
- // name, because it might be a backend that has no mapping to a target triple.
- const Target *TheTarget = 0;
- if (!MArch.empty()) {
- for (TargetRegistry::iterator it = TargetRegistry::begin(),
- ie = TargetRegistry::end(); it != ie; ++it) {
- if (MArch == it->getName()) {
- TheTarget = &*it;
- break;
- }
- }
-
- if (!TheTarget) {
- errs() << argv[0] << ": error: invalid target '" << MArch << "'.\n";
- return 1;
- }
-
- // Adjust the triple to match (if known), otherwise stick with the
- // module/host triple.
- Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch);
- if (Type != Triple::UnknownArch)
- TheTriple.setArch(Type);
- } else {
- std::string Err;
- TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), Err);
- if (TheTarget == 0) {
- errs() << argv[0] << ": error auto-selecting target for module '"
- << Err << "'. Please use the -march option to explicitly "
- << "pick a target.\n";
- return 1;
- }
+ // Get the target specific parser.
+ std::string Error;
+ const Target *TheTarget = TargetRegistry::lookupTarget(MArch, TheTriple,
+ Error);
+ if (!TheTarget) {
+ errs() << argv[0] << ": " << Error;
+ return 1;
}
// Package up features to be passed to target/subtarget
@@ -427,10 +439,9 @@ int main(int argc, char **argv) {
TargetOptions Options;
Options.LessPreciseFPMADOption = EnableFPMAD;
- Options.PrintMachineCode = PrintCode;
Options.NoFramePointerElim = DisableFPElim;
Options.NoFramePointerElimNonLeaf = DisableFPElimNonLeaf;
- Options.NoExcessFPPrecision = DisableExcessPrecision;
+ Options.AllowFPOpFusion = FuseFPOps;
Options.UnsafeFPMath = EnableUnsafeFPMath;
Options.NoInfsFPMath = EnableNoInfsFPMath;
Options.NoNaNsFPMath = EnableNoNaNsFPMath;
@@ -444,16 +455,17 @@ int main(int argc, char **argv) {
Options.DisableTailCalls = DisableTailCalls;
Options.StackAlignmentOverride = OverrideStackAlignment;
Options.RealignStack = EnableRealignStack;
- Options.DisableJumpTables = DisableSwitchTables;
Options.TrapFuncName = TrapFuncName;
Options.PositionIndependentExecutable = EnablePIE;
Options.EnableSegmentedStacks = SegmentedStacks;
+ Options.UseInitArray = UseInitArray;
std::auto_ptr<TargetMachine>
target(TheTarget->createTargetMachine(TheTriple.getTriple(),
MCPU, FeaturesStr, Options,
RelocModel, CMModel, OLvl));
assert(target.get() && "Could not allocate target machine!");
+ assert(mod && "Should have exited after outputting help!");
TargetMachine &Target = *target.get();
if (DisableDotLoc)
@@ -473,7 +485,7 @@ int main(int argc, char **argv) {
TheTriple.isMacOSXVersionLT(10, 6))
Target.setMCUseLoc(false);
- // Figure out where we are going to send the output...
+ // Figure out where we are going to send the output.
OwningPtr<tool_output_file> Out
(GetOutputStream(TheTarget->getName(), TheTriple.getOS(), argv[0]));
if (!Out) return 1;
@@ -481,11 +493,17 @@ int main(int argc, char **argv) {
// Build up all of the passes that we want to do to the module.
PassManager PM;
+ // Add an appropriate TargetLibraryInfo pass for the module's triple.
+ TargetLibraryInfo *TLI = new TargetLibraryInfo(TheTriple);
+ if (DisableSimplifyLibCalls)
+ TLI->disableAllFunctions();
+ PM.add(TLI);
+
// Add the target data from the target machine, if it exists, or the module.
if (const TargetData *TD = Target.getTargetData())
PM.add(new TargetData(*TD));
else
- PM.add(new TargetData(&mod));
+ PM.add(new TargetData(mod));
// Override default to generate verbose assembly.
Target.setAsmVerbosityDefault(true);
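TargetLibraryInfo is how optimization passes ask whether a given C library routine may be assumed present and standard; registering it on the PassManager here lets -disable-simplify-libcalls veto every libcall rewrite in one place via disableAllFunctions(). A minimal sketch of how a pass would consult it, assuming the era's API (canTreatAsFloor is a hypothetical helper):

    #include "llvm/Target/TargetLibraryInfo.h"
    using namespace llvm;

    // A transform gates libcall rewrites on TLI; when llc was run with
    // -disable-simplify-libcalls, has() returns false for every function.
    static bool canTreatAsFloor(const TargetLibraryInfo &TLI) {
      return TLI.has(LibFunc::floor);
    }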
@@ -501,8 +519,29 @@ int main(int argc, char **argv) {
{
formatted_raw_ostream FOS(Out->os());
+ AnalysisID StartAfterID = 0;
+ AnalysisID StopAfterID = 0;
+ const PassRegistry *PR = PassRegistry::getPassRegistry();
+ if (!StartAfter.empty()) {
+ const PassInfo *PI = PR->getPassInfo(StartAfter);
+ if (!PI) {
+ errs() << argv[0] << ": start-after pass is not registered.\n";
+ return 1;
+ }
+ StartAfterID = PI->getTypeInfo();
+ }
+ if (!StopAfter.empty()) {
+ const PassInfo *PI = PR->getPassInfo(StopAfter);
+ if (!PI) {
+ errs() << argv[0] << ": stop-after pass is not registered.\n";
+ return 1;
+ }
+ StopAfterID = PI->getTypeInfo();
+ }
+
// Ask the target to add backend passes as necessary.
- if (Target.addPassesToEmitFile(PM, FOS, FileType, NoVerify)) {
+ if (Target.addPassesToEmitFile(PM, FOS, FileType, NoVerify,
+ StartAfterID, StopAfterID)) {
errs() << argv[0] << ": target does not support generation of this"
<< " file type!\n";
return 1;
@@ -511,7 +550,7 @@ int main(int argc, char **argv) {
// Before executing passes, print the final values of the LLVM options.
cl::PrintOptionValues();
- PM.run(mod);
+ PM.run(*mod);
}
// Declare success.
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 2e2bf7d..b6c9299 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -35,8 +35,20 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/Memory.h"
#include <cerrno>
+#ifdef __linux__
+// These includes are used by LLIMCJITMemoryManager::getPointerToNamedFunction()
+// for Glibc trickery. See the comments in that function for more information.
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
#ifdef __CYGWIN__
#include <cygwin/version.h>
#if defined(CYGWIN_VERSION_DLL_MAJOR) && CYGWIN_VERSION_DLL_MAJOR<1007
@@ -175,6 +187,191 @@ static void do_shutdown() {
#endif
}
+// Memory manager for MCJIT
+class LLIMCJITMemoryManager : public JITMemoryManager {
+public:
+ SmallVector<sys::MemoryBlock, 16> AllocatedDataMem;
+ SmallVector<sys::MemoryBlock, 16> AllocatedCodeMem;
+ SmallVector<sys::MemoryBlock, 16> FreeCodeMem;
+
+ LLIMCJITMemoryManager() { }
+ ~LLIMCJITMemoryManager();
+
+ virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ virtual void *getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure = true);
+
+  // Invalidate the instruction cache for code sections. Some platforms with
+  // separate data and instruction caches require an explicit cache flush;
+  // otherwise JIT code manipulations (like resolved relocations) reach the
+  // data cache but not the instruction cache.
+ virtual void invalidateInstructionCache();
+
+ // The MCJITMemoryManager doesn't use the following functions, so we don't
+  // need to implement them.
+ virtual void setMemoryWritable() {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void setMemoryExecutable() {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void setPoisonMemory(bool poison) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void AllocateGOT() {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual uint8_t *getGOTBase() const {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual uint8_t *startFunctionBody(const Function *F,
+ uintptr_t &ActualSize){
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+ uint8_t *FunctionEnd) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual void deallocateFunctionBody(void *Body) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual uint8_t* startExceptionTable(const Function* F,
+ uintptr_t &ActualSize) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual void endExceptionTable(const Function *F, uint8_t *TableStart,
+ uint8_t *TableEnd, uint8_t* FrameRegister) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void deallocateExceptionTable(void *ET) {
+ llvm_unreachable("Unexpected call!");
+ }
+};
+
+uint8_t *LLIMCJITMemoryManager::allocateDataSection(uintptr_t Size,
+ unsigned Alignment,
+ unsigned SectionID) {
+ if (!Alignment)
+ Alignment = 16;
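+  // calloc zero-fills the block (data sections may contain bss-style zeroed
+  // data) and rounds the request up to a whole number of Alignment-sized
+  // chunks; the pointer's alignment itself relies on malloc's guarantee,
+  // typically at least 16 bytes on 64-bit hosts.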
+ uint8_t *Addr = (uint8_t*)calloc((Size + Alignment - 1)/Alignment, Alignment);
+ AllocatedDataMem.push_back(sys::MemoryBlock(Addr, Size));
+ return Addr;
+}
+
+uint8_t *LLIMCJITMemoryManager::allocateCodeSection(uintptr_t Size,
+ unsigned Alignment,
+ unsigned SectionID) {
+ if (!Alignment)
+ Alignment = 16;
+ unsigned NeedAllocate = Alignment * ((Size + Alignment - 1)/Alignment + 1);
+ uintptr_t Addr = 0;
+ // Look in the list of free code memory regions and use a block there if one
+ // is available.
+ for (int i = 0, e = FreeCodeMem.size(); i != e; ++i) {
+ sys::MemoryBlock &MB = FreeCodeMem[i];
+ if (MB.size() >= NeedAllocate) {
+ Addr = (uintptr_t)MB.base();
+ uintptr_t EndOfBlock = Addr + MB.size();
+ // Align the address.
+ Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+      // Store the remaining chunk of the free memory block.
+ FreeCodeMem[i] = sys::MemoryBlock((void*)(Addr + Size),
+ EndOfBlock - Addr - Size);
+ return (uint8_t*)Addr;
+ }
+ }
+
+ // No pre-allocated free block was large enough. Allocate a new memory region.
+ sys::MemoryBlock MB = sys::Memory::AllocateRWX(NeedAllocate, 0, 0);
+
+ AllocatedCodeMem.push_back(MB);
+ Addr = (uintptr_t)MB.base();
+ uintptr_t EndOfBlock = Addr + MB.size();
+ // Align the address.
+ Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+  // AllocateRWX may allocate much more memory than we need. In this case,
+ // we store the unused memory as a free memory block.
+ unsigned FreeSize = EndOfBlock-Addr-Size;
+ if (FreeSize > 16)
+ FreeCodeMem.push_back(sys::MemoryBlock((void*)(Addr + Size), FreeSize));
+
+ // Return aligned address
+ return (uint8_t*)Addr;
+}
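Both allocators above rely on the standard power-of-two align-up idiom. Pulled out as a standalone, runnable sketch (alignUp is a hypothetical helper, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Round Addr up to the next multiple of Align. Align must be a power of
    // two: Align - 1 is then a mask of the low bits, and the AND clears them.
    static uintptr_t alignUp(uintptr_t Addr, uintptr_t Align) {
      assert(Align && (Align & (Align - 1)) == 0 && "power of two required");
      return (Addr + Align - 1) & ~(Align - 1);
    }
    // e.g. alignUp(0x1003, 16) == 0x1010 and alignUp(0x1010, 16) == 0x1010.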
+
+void LLIMCJITMemoryManager::invalidateInstructionCache() {
+ for (int i = 0, e = AllocatedCodeMem.size(); i != e; ++i)
+ sys::Memory::InvalidateInstructionCache(AllocatedCodeMem[i].base(),
+ AllocatedCodeMem[i].size());
+}
+
+void *LLIMCJITMemoryManager::getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure) {
+#if defined(__linux__)
+ //===--------------------------------------------------------------------===//
+ // Function stubs that are invoked instead of certain library calls
+ //
+ // Force the following functions to be linked in to anything that uses the
+ // JIT. This is a hack designed to work around the all-too-clever Glibc
+ // strategy of making these functions work differently when inlined vs. when
+ // not inlined, and hiding their real definitions in a separate archive file
+ // that the dynamic linker can't see. For more info, search for
+ // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+ if (Name == "stat") return (void*)(intptr_t)&stat;
+ if (Name == "fstat") return (void*)(intptr_t)&fstat;
+ if (Name == "lstat") return (void*)(intptr_t)&lstat;
+ if (Name == "stat64") return (void*)(intptr_t)&stat64;
+ if (Name == "fstat64") return (void*)(intptr_t)&fstat64;
+ if (Name == "lstat64") return (void*)(intptr_t)&lstat64;
+ if (Name == "atexit") return (void*)(intptr_t)&atexit;
+ if (Name == "mknod") return (void*)(intptr_t)&mknod;
+#endif // __linux__
+
+ const char *NameStr = Name.c_str();
+ void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+ if (Ptr) return Ptr;
+
+ // If it wasn't found and if it starts with an underscore ('_') character,
+ // try again without the underscore.
+ if (NameStr[0] == '_') {
+ Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
+ if (Ptr) return Ptr;
+ }
+
+ if (AbortOnFailure)
+ report_fatal_error("Program used external function '" + Name +
+ "' which could not be resolved!");
+ return 0;
+}
+
+LLIMCJITMemoryManager::~LLIMCJITMemoryManager() {
+ for (unsigned i = 0, e = AllocatedCodeMem.size(); i != e; ++i)
+ sys::Memory::ReleaseRWX(AllocatedCodeMem[i]);
+ for (unsigned i = 0, e = AllocatedDataMem.size(); i != e; ++i)
+ free(AllocatedDataMem[i].base());
+}
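sys::DynamicLibrary::SearchForAddressOfSymbol ultimately defers to the platform's dynamic loader. A POSIX-only sketch of the same lookup-then-retry-without-underscore idea using dlsym directly (lookupSymbol is a hypothetical helper; RTLD_DEFAULT may require _GNU_SOURCE on glibc):

    #include <dlfcn.h>
    #include <string>

    // Try the symbol as spelled, then without a leading underscore, which
    // covers symbols recorded with old a.out/Mach-O style name mangling.
    static void *lookupSymbol(const std::string &Name) {
      if (void *Ptr = dlsym(RTLD_DEFAULT, Name.c_str()))
        return Ptr;
      if (!Name.empty() && Name[0] == '_')
        return dlsym(RTLD_DEFAULT, Name.c_str() + 1);
      return 0;
    }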
+
//===----------------------------------------------------------------------===//
// main Driver function
//
@@ -222,8 +419,6 @@ int main(int argc, char **argv, char * const *envp) {
builder.setRelocationModel(RelocModel);
builder.setCodeModel(CMModel);
builder.setErrorStr(&ErrorMsg);
- builder.setJITMemoryManager(ForceInterpreter ? 0 :
- JITMemoryManager::CreateDefaultMemManager());
builder.setEngineKind(ForceInterpreter
? EngineKind::Interpreter
: EngineKind::JIT);
@@ -233,9 +428,14 @@ int main(int argc, char **argv, char * const *envp) {
Mod->setTargetTriple(Triple::normalize(TargetTriple));
// Enable MCJIT if desired.
+ LLIMCJITMemoryManager *JMM = 0;
if (UseMCJIT && !ForceInterpreter) {
builder.setUseMCJIT(true);
- builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager());
+ JMM = new LLIMCJITMemoryManager();
+ builder.setJITMemoryManager(JMM);
+ } else {
+ builder.setJITMemoryManager(ForceInterpreter ? 0 :
+ JITMemoryManager::CreateDefaultMemManager());
}
CodeGenOpt::Level OLvl = CodeGenOpt::Default;
@@ -266,6 +466,10 @@ int main(int argc, char **argv, char * const *envp) {
exit(1);
}
+  // Clear the instruction cache before the code is executed.
+ if (JMM)
+ JMM->invalidateInstructionCache();
+
// The following functions have no effect if their respective profiling
// support wasn't enabled in the build configuration.
EE->RegisterJITEventListener(
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index c1c8b24..7c53701 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -50,7 +50,7 @@ static cl::extrahelp MoreHelp(
" m[abiSs] - move file(s) in the archive\n"
" p[kN] - print file(s) found in the archive\n"
" q[ufsS] - quick append file(s) to the archive\n"
- " r[abfiuzRsS] - replace or insert file(s) into the archive\n"
+ " r[abfiuRsS] - replace or insert file(s) into the archive\n"
" t - display contents of archive\n"
" x[No] - extract file(s) from the archive\n"
"\nMODIFIERS (operation specific):\n"
@@ -66,7 +66,6 @@ static cl::extrahelp MoreHelp(
" [s] - create an archive index (cf. ranlib)\n"
" [S] - do not build a symbol table\n"
" [u] - update only files newer than archive contents\n"
- " [z] - compress files before inserting/extracting\n"
"\nMODIFIERS (generic):\n"
" [c] - do not warn if the library had to be created\n"
" [v] - be verbose about actions taken\n"
@@ -101,7 +100,6 @@ bool SymTable = true; ///< 's' & 'S' modifiers
bool OnlyUpdate = false; ///< 'u' modifier
bool Verbose = false; ///< 'v' modifier
bool ReallyVerbose = false; ///< 'V' modifier
-bool Compression = false; ///< 'z' modifier
// Relative Positional Argument (for insert/move). This variable holds
// the name of the archive member to which the 'a', 'b' or 'i' modifier
@@ -208,7 +206,6 @@ ArchiveOperation parseCommandLine() {
case 'u': OnlyUpdate = true; break;
case 'v': Verbose = true; break;
case 'V': Verbose = ReallyVerbose = true; break;
- case 'z': Compression = true; break;
case 'a':
getRelPos();
AddAfter = true;
@@ -260,8 +257,6 @@ ArchiveOperation parseCommandLine() {
throw "The 'f' modifier is only applicable to the 'q' and 'r' operations";
if (OnlyUpdate && Operation != ReplaceOrInsert)
throw "The 'u' modifier is only applicable to the 'r' operation";
- if (Compression && Operation!=ReplaceOrInsert && Operation!=Extract)
- throw "The 'z' modifier is only applicable to the 'r' and 'x' operations";
if (Count > 1 && Members.size() > 1)
throw "Only one member name may be specified with the 'N' modifier";
@@ -413,8 +408,6 @@ doDisplayTable(std::string* ErrMsg) {
// Zrw-r--r-- 500/ 500 525 Nov 8 17:42 2004 Makefile
if (I->isBitcode())
outs() << "b";
- else if (I->isCompressed())
- outs() << "Z";
else
outs() << " ";
unsigned mode = I->getMode();
@@ -437,7 +430,7 @@ doDisplayTable(std::string* ErrMsg) {
}
// doExtract - Implement the 'x' operation. This function extracts files back to
-// the file system, making sure to uncompress any that were compressed
+// the file system.
bool
doExtract(std::string* ErrMsg) {
if (buildPaths(false, ErrMsg))
@@ -503,7 +496,7 @@ doDelete(std::string* ErrMsg) {
}
// We're done editing; reconstruct the archive.
- if (TheArchive->writeToDisk(SymTable,TruncateNames,Compression,ErrMsg))
+ if (TheArchive->writeToDisk(SymTable,TruncateNames,ErrMsg))
return true;
if (ReallyVerbose)
printSymbolTable();
@@ -558,7 +551,7 @@ doMove(std::string* ErrMsg) {
}
// We're done editing; reconstruct the archive.
- if (TheArchive->writeToDisk(SymTable,TruncateNames,Compression,ErrMsg))
+ if (TheArchive->writeToDisk(SymTable,TruncateNames,ErrMsg))
return true;
if (ReallyVerbose)
printSymbolTable();
@@ -583,7 +576,7 @@ doQuickAppend(std::string* ErrMsg) {
}
// We're done editing; reconstruct the archive.
- if (TheArchive->writeToDisk(SymTable,TruncateNames,Compression,ErrMsg))
+ if (TheArchive->writeToDisk(SymTable,TruncateNames,ErrMsg))
return true;
if (ReallyVerbose)
printSymbolTable();
@@ -681,7 +674,7 @@ doReplaceOrInsert(std::string* ErrMsg) {
}
// We're done editing; reconstruct the archive.
- if (TheArchive->writeToDisk(SymTable,TruncateNames,Compression,ErrMsg))
+ if (TheArchive->writeToDisk(SymTable,TruncateNames,ErrMsg))
return true;
if (ReallyVerbose)
printSymbolTable();
diff --git a/tools/llvm-config/Makefile b/tools/llvm-config/Makefile
index 3f11730..e8c8692 100644
--- a/tools/llvm-config/Makefile
+++ b/tools/llvm-config/Makefile
@@ -57,3 +57,11 @@ $(ObjDir)/BuildVariables.inc: $(BUILDVARIABLES_SRCPATH) Makefile $(ObjDir)/.dir
>> temp.sed
$(Verb) $(SED) -f temp.sed < $< > $@
$(Verb) $(RM) temp.sed
+
+# When cross-compiling, install a version of llvm-config that runs on the host.
+ifeq ($(LLVM_CROSS_COMPILING),1)
+install:: $(DESTDIR)$(PROJ_bindir)
+ $(Echo) Installing llvm-config-host
+ $(Verb) $(ProgInstall) $(BuildLLVMToolDir)/llvm-config \
+ $(DESTDIR)$(PROJ_bindir)/llvm-config-host
+endif
diff --git a/tools/llvm-config/llvm-config.cpp b/tools/llvm-config/llvm-config.cpp
index 126542c..7edf5ec 100644
--- a/tools/llvm-config/llvm-config.cpp
+++ b/tools/llvm-config/llvm-config.cpp
@@ -190,9 +190,9 @@ int main(int argc, char **argv) {
sys::path::parent_path(CurrentPath)).str();
// Check to see if we are inside a development tree by comparing to possible
- // locations (prefix style or CMake style). This could be wrong in the face of
- // symbolic links, but is good enough.
- if (CurrentExecPrefix == std::string(LLVM_OBJ_ROOT) + "/" + LLVM_BUILDMODE) {
+ // locations (prefix style or CMake style).
+ if (sys::fs::equivalent(CurrentExecPrefix,
+ Twine(LLVM_OBJ_ROOT) + "/" + LLVM_BUILDMODE)) {
IsInDevelopmentTree = true;
DevelopmentTreeLayout = MakefileStyle;
@@ -204,11 +204,12 @@ int main(int argc, char **argv) {
} else {
ActiveObjRoot = LLVM_OBJ_ROOT;
}
- } else if (CurrentExecPrefix == std::string(LLVM_OBJ_ROOT)) {
+ } else if (sys::fs::equivalent(CurrentExecPrefix, LLVM_OBJ_ROOT)) {
IsInDevelopmentTree = true;
DevelopmentTreeLayout = CMakeStyle;
ActiveObjRoot = LLVM_OBJ_ROOT;
- } else if (CurrentExecPrefix == std::string(LLVM_OBJ_ROOT) + "/bin") {
+ } else if (sys::fs::equivalent(CurrentExecPrefix,
+ Twine(LLVM_OBJ_ROOT) + "/bin")) {
IsInDevelopmentTree = true;
DevelopmentTreeLayout = CMakeBuildModeStyle;
ActiveObjRoot = LLVM_OBJ_ROOT;
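Switching from string comparison to sys::fs::equivalent makes the check ask the filesystem whether the two names denote the same file, so a build prefix reached through a symlink still matches. The same idea in modern standard C++ (std::filesystem, not the API used in this patch; paths are invented for illustration):

    #include <filesystem>
    #include <iostream>

    int main() {
      namespace fs = std::filesystem;
      std::error_code EC;
      // True only if both names resolve to the same underlying file; the
      // error_code overload reports lookup failures instead of throwing.
      bool Same = fs::equivalent("/usr/obj/llvm", "/usr/home/dev/llvm-build", EC);
      std::cout << (Same && !EC ? "same file" : "different") << '\n';
      return 0;
    }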
diff --git a/tools/llvm-diff/DiffConsumer.cpp b/tools/llvm-diff/DiffConsumer.cpp
index 0528039..91c1699 100644
--- a/tools/llvm-diff/DiffConsumer.cpp
+++ b/tools/llvm-diff/DiffConsumer.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This files implements the the LLVM difference Consumer
+// This file implements the LLVM difference Consumer
//
//===----------------------------------------------------------------------===//
diff --git a/tools/llvm-diff/DiffConsumer.h b/tools/llvm-diff/DiffConsumer.h
index 2060fe1..98e369b 100644
--- a/tools/llvm-diff/DiffConsumer.h
+++ b/tools/llvm-diff/DiffConsumer.h
@@ -67,8 +67,6 @@ namespace llvm {
};
raw_ostream &out;
- Module *LModule;
- Module *RModule;
SmallVector<DiffContext, 5> contexts;
bool Differences;
unsigned Indent;
@@ -78,8 +76,8 @@ namespace llvm {
void indent();
public:
- DiffConsumer(Module *L, Module *R)
- : out(errs()), LModule(L), RModule(R), Differences(false), Indent(0) {}
+ DiffConsumer()
+ : out(errs()), Differences(false), Indent(0) {}
bool hadDifferences() const;
void enterContext(Value *L, Value *R);
diff --git a/tools/llvm-diff/DifferenceEngine.cpp b/tools/llvm-diff/DifferenceEngine.cpp
index a5a99f5..0c1e30c 100644
--- a/tools/llvm-diff/DifferenceEngine.cpp
+++ b/tools/llvm-diff/DifferenceEngine.cpp
@@ -318,15 +318,15 @@ class FunctionDifferenceEngine {
bool Difference = false;
- DenseMap<ConstantInt*,BasicBlock*> LCases;
+ DenseMap<Constant*, BasicBlock*> LCases;
for (SwitchInst::CaseIt I = LI->case_begin(), E = LI->case_end();
I != E; ++I)
- LCases[I.getCaseValue()] = I.getCaseSuccessor();
+ LCases[I.getCaseValueEx()] = I.getCaseSuccessor();
for (SwitchInst::CaseIt I = RI->case_begin(), E = RI->case_end();
I != E; ++I) {
- ConstantInt *CaseValue = I.getCaseValue();
+ IntegersSubset CaseValue = I.getCaseValueEx();
BasicBlock *LCase = LCases[CaseValue];
if (LCase) {
if (TryUnify) tryUnify(LCase, I.getCaseSuccessor());
@@ -338,7 +338,7 @@ class FunctionDifferenceEngine {
}
}
if (!Difference)
- for (DenseMap<ConstantInt*,BasicBlock*>::iterator
+ for (DenseMap<Constant*, BasicBlock*>::iterator
I = LCases.begin(), E = LCases.end(); I != E; ++I) {
if (Complain)
Engine.logf("left switch has extra case %l") << I->first;
diff --git a/tools/llvm-diff/DifferenceEngine.h b/tools/llvm-diff/DifferenceEngine.h
index 7ea79e4..0246d8f 100644
--- a/tools/llvm-diff/DifferenceEngine.h
+++ b/tools/llvm-diff/DifferenceEngine.h
@@ -59,8 +59,8 @@ namespace llvm {
virtual ~Oracle() {}
};
- DifferenceEngine(LLVMContext &context, Consumer &consumer)
- : context(context), consumer(consumer), globalValueOracle(0) {}
+ DifferenceEngine(Consumer &consumer)
+ : consumer(consumer), globalValueOracle(0) {}
void diff(Module *L, Module *R);
void diff(Function *L, Function *R);
@@ -84,7 +84,6 @@ namespace llvm {
bool equivalentAsOperands(GlobalValue *L, GlobalValue *R);
private:
- LLVMContext &context;
Consumer &consumer;
Oracle *globalValueOracle;
};
diff --git a/tools/llvm-diff/llvm-diff.cpp b/tools/llvm-diff/llvm-diff.cpp
index 774169b..45957b3 100644
--- a/tools/llvm-diff/llvm-diff.cpp
+++ b/tools/llvm-diff/llvm-diff.cpp
@@ -78,8 +78,8 @@ int main(int argc, char **argv) {
Module *RModule = ReadModule(Context, RightFilename);
if (!LModule || !RModule) return 1;
- DiffConsumer Consumer(LModule, RModule);
- DifferenceEngine Engine(Context, Consumer);
+ DiffConsumer Consumer;
+ DifferenceEngine Engine(Consumer);
// If any global names were given, just diff those.
if (!GlobalsToCompare.empty()) {
diff --git a/tools/llvm-dis/llvm-dis.cpp b/tools/llvm-dis/llvm-dis.cpp
index 6450ea6..41f023d 100644
--- a/tools/llvm-dis/llvm-dis.cpp
+++ b/tools/llvm-dis/llvm-dis.cpp
@@ -17,11 +17,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/LLVMContext.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Assembly/AssemblyAnnotationWriter.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DataStream.h"
@@ -93,7 +93,6 @@ public:
DIVariable Var(DDI->getVariable());
if (!Padded) {
OS.PadToColumn(50);
- Padded = true;
OS << ";";
}
OS << " [debug variable = " << Var.getName() << "]";
@@ -102,7 +101,6 @@ public:
DIVariable Var(DVI->getVariable());
if (!Padded) {
OS.PadToColumn(50);
- Padded = true;
OS << ";";
}
OS << " [debug variable = " << Var.getName() << "]";
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index ca0493d..ec0b4ae 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -39,6 +39,11 @@ static cl::opt<unsigned long long>
Address("address", cl::init(-1ULL),
cl::desc("Print line information for a given address"));
+static cl::opt<bool>
+PrintFunctions("functions", cl::init(false),
+ cl::desc("Print function names as well as line information "
+ "for a given address"));
+
static void DumpInput(const StringRef &Filename) {
OwningPtr<MemoryBuffer> Buff;
@@ -92,7 +97,14 @@ static void DumpInput(const StringRef &Filename) {
dictx->dump(outs());
} else {
// Print line info for the specified address.
- DILineInfo dli = dictx->getLineInfoForAddress(Address);
+ int spec_flags = DILineInfoSpecifier::FileLineInfo |
+ DILineInfoSpecifier::AbsoluteFilePath;
+ if (PrintFunctions)
+ spec_flags |= DILineInfoSpecifier::FunctionName;
+ DILineInfo dli = dictx->getLineInfoForAddress(Address, spec_flags);
+ if (PrintFunctions)
+ outs() << (dli.getFunctionName() ? dli.getFunctionName() : "<unknown>")
+ << "\n";
outs() << (dli.getFileName() ? dli.getFileName() : "<unknown>") << ':'
<< dli.getLine() << ':' << dli.getColumn() << '\n';
}
diff --git a/tools/llvm-ld/CMakeLists.txt b/tools/llvm-ld/CMakeLists.txt
deleted file mode 100644
index d328a04..0000000
--- a/tools/llvm-ld/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter vectorize)
-
-add_llvm_tool(llvm-ld
- Optimize.cpp
- llvm-ld.cpp
- )
-
-add_dependencies(llvm-ld llvm-stub)
diff --git a/tools/llvm-ld/LLVMBuild.txt b/tools/llvm-ld/LLVMBuild.txt
deleted file mode 100644
index eed0452..0000000
--- a/tools/llvm-ld/LLVMBuild.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-;===- ./tools/llvm-ld/LLVMBuild.txt ----------------------------*- Conf -*--===;
-;
-; The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-; http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Tool
-name = llvm-ld
-parent = Tools
-required_libraries = Archive BitWriter IPO Linker Scalar
diff --git a/tools/llvm-ld/Optimize.cpp b/tools/llvm-ld/Optimize.cpp
deleted file mode 100644
index 7f3f900..0000000
--- a/tools/llvm-ld/Optimize.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-//===- Optimize.cpp - Optimize a complete program -------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements all optimization of the linked module for llvm-ld.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/PassNameParser.h"
-#include "llvm/Support/PluginLoader.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-// Pass Name Options as generated by the PassNameParser
-static cl::list<const PassInfo*, bool, PassNameParser>
- OptimizationList(cl::desc("Optimizations available:"));
-
-//Don't verify at the end
-static cl::opt<bool> DontVerify("disable-verify", cl::ReallyHidden);
-
-static cl::opt<bool> DisableInline("disable-inlining",
- cl::desc("Do not run the inliner pass"));
-
-static cl::opt<bool>
-DisableOptimizations("disable-opt",
- cl::desc("Do not run any optimization passes"));
-
-static cl::opt<bool> DisableInternalize("disable-internalize",
- cl::desc("Do not mark all symbols as internal"));
-
-static cl::opt<bool> VerifyEach("verify-each",
- cl::desc("Verify intermediate results of all passes"));
-
-static cl::alias ExportDynamic("export-dynamic",
- cl::aliasopt(DisableInternalize),
- cl::desc("Alias for -disable-internalize"));
-
-static cl::opt<bool> Strip("strip-all",
- cl::desc("Strip all symbol info from executable"));
-
-static cl::alias A0("s", cl::desc("Alias for --strip-all"),
- cl::aliasopt(Strip));
-
-static cl::opt<bool> StripDebug("strip-debug",
- cl::desc("Strip debugger symbol info from executable"));
-
-static cl::alias A1("S", cl::desc("Alias for --strip-debug"),
- cl::aliasopt(StripDebug));
-
-// A utility function that adds a pass to the pass manager but will also add
-// a verifier pass after if we're supposed to verify.
-static inline void addPass(PassManager &PM, Pass *P) {
- // Add the pass to the pass manager...
- PM.add(P);
-
- // If we are verifying all of the intermediate steps, add the verifier...
- if (VerifyEach)
- PM.add(createVerifierPass());
-}
-
-namespace llvm {
-/// Optimize - Perform link time optimizations. This will run the scalar
-/// optimizations, any loaded plugin-optimization modules, and then the
-/// inter-procedural optimizations if applicable.
-void Optimize(Module *M) {
-
- // Instantiate the pass manager to organize the passes.
- PassManager Passes;
-
- // If we're verifying, start off with a verification pass.
- if (VerifyEach)
- Passes.add(createVerifierPass());
-
- // Add an appropriate TargetData instance for this module...
- addPass(Passes, new TargetData(M));
-
- if (!DisableOptimizations)
- PassManagerBuilder().populateLTOPassManager(Passes, !DisableInternalize,
- !DisableInline);
-
- // If the -s or -S command line options were specified, strip the symbols out
- // of the resulting program to make it smaller. -s and -S are GNU ld options
- // that we are supporting; they alias -strip-all and -strip-debug.
- if (Strip || StripDebug)
- addPass(Passes, createStripSymbolsPass(StripDebug && !Strip));
-
- // Create a new optimization pass for each one specified on the command line
- std::auto_ptr<TargetMachine> target;
- for (unsigned i = 0; i < OptimizationList.size(); ++i) {
- const PassInfo *Opt = OptimizationList[i];
- if (Opt->getNormalCtor())
- addPass(Passes, Opt->getNormalCtor()());
- else
- errs() << "llvm-ld: cannot create pass: " << Opt->getPassName()
- << "\n";
- }
-
- // The user's passes may leave cruft around. Clean up after them them but
- // only if we haven't got DisableOptimizations set
- if (!DisableOptimizations) {
- addPass(Passes, createInstructionCombiningPass());
- addPass(Passes, createCFGSimplificationPass());
- addPass(Passes, createAggressiveDCEPass());
- addPass(Passes, createGlobalDCEPass());
- }
-
- // Make sure everything is still good.
- if (!DontVerify)
- Passes.add(createVerifierPass());
-
- // Run our queue of passes all at once now, efficiently.
- Passes.run(*M);
-}
-
-}
diff --git a/tools/llvm-ld/llvm-ld.cpp b/tools/llvm-ld/llvm-ld.cpp
deleted file mode 100644
index ecf0476..0000000
--- a/tools/llvm-ld/llvm-ld.cpp
+++ /dev/null
@@ -1,732 +0,0 @@
-//===- llvm-ld.cpp - LLVM 'ld' compatible linker --------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This utility is intended to be compatible with GCC, and follows standard
-// system 'ld' conventions. As such, the default output file is ./a.out.
-// Additionally, this program outputs a shell script that is used to invoke LLI
-// to execute the program. In this manner, the generated executable (a.out for
-// example), is directly executable, whereas the bitcode file actually lives in
-// the a.out.bc file generated by this program.
-//
-// Note that if someone (or a script) deletes the executable program generated,
-// the .bc file will be left around. Considering that this is a temporary hack,
-// I'm not too worried about this.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/LinkAllVMCore.h"
-#include "llvm/Linker.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileUtilities.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/SystemUtils.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/Signals.h"
-#include <memory>
-#include <cstring>
-using namespace llvm;
-
-// Rightly this should go in a header file but it just seems such a waste.
-namespace llvm {
-extern void Optimize(Module*);
-}
-
-// Input/Output Options
-static cl::list<std::string> InputFilenames(cl::Positional, cl::OneOrMore,
- cl::desc("<input bitcode files>"));
-
-static cl::opt<std::string> OutputFilename("o", cl::init("a.out"),
- cl::desc("Override output filename"),
- cl::value_desc("filename"));
-
-static cl::opt<std::string> BitcodeOutputFilename("b", cl::init(""),
- cl::desc("Override bitcode output filename"),
- cl::value_desc("filename"));
-
-static cl::opt<bool> Verbose("v",
- cl::desc("Print information about actions taken"));
-
-static cl::list<std::string> LibPaths("L", cl::Prefix,
- cl::desc("Specify a library search path"),
- cl::value_desc("directory"));
-
-static cl::list<std::string> FrameworkPaths("F", cl::Prefix,
- cl::desc("Specify a framework search path"),
- cl::value_desc("directory"));
-
-static cl::list<std::string> Libraries("l", cl::Prefix,
- cl::desc("Specify libraries to link to"),
- cl::value_desc("library prefix"));
-
-static cl::list<std::string> Frameworks("framework",
- cl::desc("Specify frameworks to link to"),
- cl::value_desc("framework"));
-
-// Options to control the linking, optimization, and code gen processes
-static cl::opt<bool> LinkAsLibrary("link-as-library",
- cl::desc("Link the .bc files together as a library, not an executable"));
-
-static cl::alias Relink("r", cl::aliasopt(LinkAsLibrary),
- cl::desc("Alias for -link-as-library"));
-
-static cl::opt<bool> Native("native",
- cl::desc("Generate a native binary instead of a shell script"));
-
-static cl::opt<bool>NativeCBE("native-cbe",
- cl::desc("Generate a native binary with the C backend and GCC"));
-
-static cl::list<std::string> PostLinkOpts("post-link-opts",
- cl::value_desc("path"),
- cl::desc("Run one or more optimization programs after linking"));
-
-static cl::list<std::string> XLinker("Xlinker", cl::value_desc("option"),
- cl::desc("Pass options to the system linker"));
-
-// Compatibility options that llvm-ld ignores but are supported for
-// compatibility with LD
-static cl::opt<std::string> CO3("soname", cl::Hidden,
- cl::desc("Compatibility option: ignored"));
-
-static cl::opt<std::string> CO4("version-script", cl::Hidden,
- cl::desc("Compatibility option: ignored"));
-
-static cl::opt<bool> CO5("eh-frame-hdr", cl::Hidden,
- cl::desc("Compatibility option: ignored"));
-
-static cl::opt<std::string> CO6("h", cl::Hidden,
- cl::desc("Compatibility option: ignored"));
-
-static cl::opt<bool> CO7("start-group", cl::Hidden,
- cl::desc("Compatibility option: ignored"));
-
-static cl::opt<bool> CO8("end-group", cl::Hidden,
- cl::desc("Compatibility option: ignored"));
-
-static cl::opt<std::string> CO9("m", cl::Hidden,
- cl::desc("Compatibility option: ignored"));
-
-/// This is just for convenience so it doesn't have to be passed around
-/// everywhere.
-static std::string progname;
-
-/// FileRemover objects to clean up output files in the event of an error.
-static FileRemover OutputRemover;
-static FileRemover BitcodeOutputRemover;
-
-/// PrintAndExit - Prints a message to standard error and exits with error code
-///
-/// Inputs:
-/// Message - The message to print to standard error.
-///
-static void PrintAndExit(const std::string &Message, Module *M, int errcode = 1) {
- errs() << progname << ": " << Message << "\n";
- delete M;
- llvm_shutdown();
- exit(errcode);
-}
-
-static void PrintCommand(const std::vector<const char*> &args) {
- std::vector<const char*>::const_iterator I = args.begin(), E = args.end();
- for (; I != E; ++I)
- if (*I)
- errs() << "'" << *I << "'" << " ";
- errs() << "\n";
-}
-
-/// CopyEnv - This function takes an array of environment variables and makes a
-/// copy of it. This copy can then be manipulated any way the caller likes
-/// without affecting the process's real environment.
-///
-/// Inputs:
-/// envp - An array of C strings containing an environment.
-///
-/// Return value:
-/// NULL - An error occurred.
-///
-/// Otherwise, a pointer to a new array of C strings is returned. Every string
-/// in the array is a duplicate of the one in the original array (i.e. we do
-/// not copy the char *'s from one array to another).
-///
-static char ** CopyEnv(char ** const envp) {
- // Count the number of entries in the old list;
- unsigned entries; // The number of entries in the old environment list
- for (entries = 0; envp[entries] != NULL; entries++)
- /*empty*/;
-
- // Add one more entry for the NULL pointer that ends the list.
- ++entries;
-
- // If there are no entries at all, just return NULL.
- if (entries == 0)
- return NULL;
-
- // Allocate a new environment list.
- char **newenv = new char* [entries];
- if (newenv == NULL)
- return NULL;
-
- // Make a copy of the list. Don't forget the NULL that ends the list.
- entries = 0;
- while (envp[entries] != NULL) {
- size_t len = strlen(envp[entries]) + 1;
- newenv[entries] = new char[len];
- memcpy(newenv[entries], envp[entries], len);
- ++entries;
- }
- newenv[entries] = NULL;
-
- return newenv;
-}
-
-
-/// RemoveEnv - Remove the specified environment variable from the environment
-/// array.
-///
-/// Inputs:
-/// name - The name of the variable to remove. It cannot be NULL.
-/// envp - The array of environment variables. It cannot be NULL.
-///
-/// Notes:
-/// This is mainly done because functions to remove items from the environment
-/// are not available across all platforms. In particular, Solaris does not
-/// seem to have an unsetenv() function or a setenv() function (or they are
-/// undocumented if they do exist).
-///
-static void RemoveEnv(const char * name, char ** const envp) {
- for (unsigned index=0; envp[index] != NULL; index++) {
- // Find the first equals sign in the array and make it an EOS character.
- char *p = strchr (envp[index], '=');
- if (p == NULL)
- continue;
- else
- *p = '\0';
-
- // Compare the two strings. If they are equal, zap this string.
- // Otherwise, restore it.
- if (!strcmp(name, envp[index]))
- *envp[index] = '\0';
- else
- *p = '=';
- }
-
- return;
-}
-
-/// GenerateBitcode - generates a bitcode file from the module provided
-void GenerateBitcode(Module* M, const std::string& FileName) {
-
- if (Verbose)
- errs() << "Generating Bitcode To " << FileName << '\n';
-
- // Create the output file.
- std::string ErrorInfo;
- tool_output_file Out(FileName.c_str(), ErrorInfo,
- raw_fd_ostream::F_Binary);
- if (!ErrorInfo.empty()) {
- PrintAndExit(ErrorInfo, M);
- return;
- }
-
- // Write it out
- WriteBitcodeToFile(M, Out.os());
- Out.keep();
-}
-
-/// GenerateAssembly - generates a native assembly language source file from the
-/// specified bitcode file.
-///
-/// Inputs:
-/// InputFilename - The name of the input bitcode file.
-/// OutputFilename - The name of the file to generate.
-/// llc - The pathname to use for LLC.
-/// envp - The environment to use when running LLC.
-///
-/// Return non-zero value on error.
-///
-static int GenerateAssembly(const std::string &OutputFilename,
- const std::string &InputFilename,
- const sys::Path &llc,
- std::string &ErrMsg ) {
- // Run LLC to convert the bitcode file into assembly code.
- std::vector<const char*> args;
- args.push_back(llc.c_str());
- // We will use GCC to assemble the program so set the assembly syntax to AT&T,
- // regardless of what the target in the bitcode file is.
- args.push_back("-x86-asm-syntax=att");
- args.push_back("-o");
- args.push_back(OutputFilename.c_str());
- args.push_back(InputFilename.c_str());
- args.push_back(0);
-
- if (Verbose) {
- errs() << "Generating Assembly With: \n";
- PrintCommand(args);
- }
-
- return sys::Program::ExecuteAndWait(llc, &args[0], 0, 0, 0, 0, &ErrMsg);
-}
-
-/// GenerateCFile - generates a C source file from the specified bitcode file.
-static int GenerateCFile(const std::string &OutputFile,
- const std::string &InputFile,
- const sys::Path &llc,
- std::string& ErrMsg) {
- // Run LLC to convert the bitcode file into C.
- std::vector<const char*> args;
- args.push_back(llc.c_str());
- args.push_back("-march=c");
- args.push_back("-o");
- args.push_back(OutputFile.c_str());
- args.push_back(InputFile.c_str());
- args.push_back(0);
-
- if (Verbose) {
- errs() << "Generating C Source With: \n";
- PrintCommand(args);
- }
-
- return sys::Program::ExecuteAndWait(llc, &args[0], 0, 0, 0, 0, &ErrMsg);
-}
-
-/// GenerateNative - generates a native object file from the
-/// specified bitcode file.
-///
-/// Inputs:
-/// InputFilename - The name of the input bitcode file.
-/// OutputFilename - The name of the file to generate.
-/// NativeLinkItems - The native libraries, files, code with which to link
-/// LibPaths - The list of directories in which to find libraries.
-/// FrameworksPaths - The list of directories in which to find frameworks.
-/// Frameworks - The list of frameworks (dynamic libraries)
-/// gcc - The pathname to use for GGC.
-/// envp - A copy of the process's current environment.
-///
-/// Outputs:
-/// None.
-///
-/// Returns non-zero value on error.
-///
-static int GenerateNative(const std::string &OutputFilename,
- const std::string &InputFilename,
- const Linker::ItemList &LinkItems,
- const sys::Path &gcc, char ** const envp,
- std::string& ErrMsg) {
- // Remove these environment variables from the environment of the
- // programs that we will execute. It appears that GCC sets these
- // environment variables so that the programs it uses can configure
- // themselves identically.
- //
- // However, when we invoke GCC below, we want it to use its normal
- // configuration. Hence, we must sanitize its environment.
- char ** clean_env = CopyEnv(envp);
- if (clean_env == NULL)
- return 1;
- RemoveEnv("LIBRARY_PATH", clean_env);
- RemoveEnv("COLLECT_GCC_OPTIONS", clean_env);
- RemoveEnv("GCC_EXEC_PREFIX", clean_env);
- RemoveEnv("COMPILER_PATH", clean_env);
- RemoveEnv("COLLECT_GCC", clean_env);
-
-
- // Run GCC to assemble and link the program into native code.
- //
- // Note:
- // We can't just assemble and link the file with the system assembler
- // and linker because we don't know where to put the _start symbol.
- // GCC mysteriously knows how to do it.
- std::vector<std::string> args;
- args.push_back(gcc.c_str());
- args.push_back("-fno-strict-aliasing");
- args.push_back("-O3");
- args.push_back("-o");
- args.push_back(OutputFilename);
- args.push_back(InputFilename);
-
- // Add in the library and framework paths
- for (unsigned index = 0; index < LibPaths.size(); index++) {
- args.push_back("-L" + LibPaths[index]);
- }
- for (unsigned index = 0; index < FrameworkPaths.size(); index++) {
- args.push_back("-F" + FrameworkPaths[index]);
- }
-
- // Add the requested options
- for (unsigned index = 0; index < XLinker.size(); index++)
- args.push_back(XLinker[index]);
-
- // Add in the libraries to link.
- for (unsigned index = 0; index < LinkItems.size(); index++)
- if (LinkItems[index].first != "crtend") {
- if (LinkItems[index].second)
- args.push_back("-l" + LinkItems[index].first);
- else
- args.push_back(LinkItems[index].first);
- }
-
- // Add in frameworks to link.
- for (unsigned index = 0; index < Frameworks.size(); index++) {
- args.push_back("-framework");
- args.push_back(Frameworks[index]);
- }
-
- // Now that "args" owns all the std::strings for the arguments, call the c_str
- // method to get the underlying string array. We play this game so that the
- // std::string array is guaranteed to outlive the const char* array.
- std::vector<const char *> Args;
- for (unsigned i = 0, e = args.size(); i != e; ++i)
- Args.push_back(args[i].c_str());
- Args.push_back(0);
-
- if (Verbose) {
- errs() << "Generating Native Executable With:\n";
- PrintCommand(Args);
- }
-
- // Run the compiler to assemble and link the program together.
- int R = sys::Program::ExecuteAndWait(
- gcc, &Args[0], const_cast<const char **>(clean_env), 0, 0, 0, &ErrMsg);
- delete [] clean_env;
- return R;
-}
-
-/// EmitShellScript - Output the wrapper file that invokes the JIT on the LLVM
-/// bitcode file for the program.
-static void EmitShellScript(char **argv, Module *M) {
- if (Verbose)
- errs() << "Emitting Shell Script\n";
-#if defined(_WIN32)
- // Windows doesn't support #!/bin/sh style shell scripts in .exe files. To
- // support windows systems, we copy the llvm-stub.exe executable from the
- // build tree to the destination file.
- std::string ErrMsg;
- sys::Path llvmstub = PrependMainExecutablePath("llvm-stub", argv[0],
- (void *)(intptr_t)&Optimize);
- if (llvmstub.isEmpty())
- PrintAndExit("Could not find llvm-stub.exe executable!", M);
-
- if (0 != sys::CopyFile(sys::Path(OutputFilename), llvmstub, &ErrMsg))
- PrintAndExit(ErrMsg, M);
-
- return;
-#else
-
- // Output the script to start the program...
- std::string ErrorInfo;
- tool_output_file Out2(OutputFilename.c_str(), ErrorInfo);
- if (!ErrorInfo.empty())
- PrintAndExit(ErrorInfo, M);
-
- Out2.os() << "#!/bin/sh\n";
- // Allow user to setenv LLVMINTERP if lli is not in their PATH.
- Out2.os() << "lli=${LLVMINTERP-lli}\n";
- Out2.os() << "exec $lli \\\n";
- // gcc accepts -l<lib> and implicitly searches /lib and /usr/lib.
- LibPaths.push_back("/lib");
- LibPaths.push_back("/usr/lib");
- LibPaths.push_back("/usr/X11R6/lib");
- // We don't need to link in libc! In fact, /usr/lib/libc.so may not be a
- // shared object at all; on Red Hat 8 it is a plain-text linker script.
- std::vector<std::string>::iterator libc =
- std::find(Libraries.begin(), Libraries.end(), "c");
- if (libc != Libraries.end()) Libraries.erase(libc);
- // List all the shared object (native) libraries this executable will need
- // on the command line, so that we don't have to do this manually!
- for (std::vector<std::string>::iterator i = Libraries.begin(),
- e = Libraries.end(); i != e; ++i) {
- // try explicit -L arguments first:
- sys::Path FullLibraryPath;
- for (cl::list<std::string>::const_iterator P = LibPaths.begin(),
- E = LibPaths.end(); P != E; ++P) {
- FullLibraryPath = *P;
- FullLibraryPath.appendComponent("lib" + *i);
- FullLibraryPath.appendSuffix(sys::Path::GetDLLSuffix());
- if (!FullLibraryPath.isEmpty()) {
- if (!FullLibraryPath.isDynamicLibrary()) {
- // Not a native shared library; mark as invalid
- FullLibraryPath = sys::Path();
- } else break;
- }
- }
- if (FullLibraryPath.isEmpty())
- FullLibraryPath = sys::Path::FindLibrary(*i);
- if (!FullLibraryPath.isEmpty())
- Out2.os() << " -load=" << FullLibraryPath.str() << " \\\n";
- }
- Out2.os() << " " << BitcodeOutputFilename << " ${1+\"$@\"}\n";
- Out2.keep();
-#endif
-}
-
-// BuildLinkItems -- This function generates a LinkItemList for the LinkItems
-// linker function by combining the Files and Libraries in the order they were
-// declared on the command line.
-static void BuildLinkItems(
- Linker::ItemList& Items,
- const cl::list<std::string>& Files,
- const cl::list<std::string>& Libraries) {
-
- // Build the list of linkage items for LinkItems.
-
- cl::list<std::string>::const_iterator fileIt = Files.begin();
- cl::list<std::string>::const_iterator libIt = Libraries.begin();
-
- int libPos = -1, filePos = -1;
- while ( libIt != Libraries.end() || fileIt != Files.end() ) {
- if (libIt != Libraries.end())
- libPos = Libraries.getPosition(libIt - Libraries.begin());
- else
- libPos = -1;
- if (fileIt != Files.end())
- filePos = Files.getPosition(fileIt - Files.begin());
- else
- filePos = -1;
-
- if (filePos != -1 && (libPos == -1 || filePos < libPos)) {
- // Add a source file
- Items.push_back(std::make_pair(*fileIt++, false));
- } else if (libPos != -1 && (filePos == -1 || libPos < filePos)) {
- // Add a library
- Items.push_back(std::make_pair(*libIt++, true));
- }
- }
-}
-
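BuildLinkItems is a two-pointer merge keyed on each argument's position on the original command line, so `a.o -lm b.o` produces (a.o, file), (m, library), (b.o, file) in that order. The same interleave over plain vectors of (name, position) pairs, as a minimal stand-alone sketch (MergeByPosition and the pair layout are assumptions for illustration):

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

typedef std::pair<std::string, bool> Item;       // (name, isLibrary)
typedef std::pair<std::string, int> Positioned;  // (name, argv position)

static void MergeByPosition(std::vector<Item> &Items,
                            const std::vector<Positioned> &Files,
                            const std::vector<Positioned> &Libs) {
  std::size_t f = 0, l = 0;
  while (f < Files.size() || l < Libs.size()) {
    // Take whichever pending entry appeared earlier on the command line.
    bool takeFile = l == Libs.size() ||
                    (f < Files.size() && Files[f].second < Libs[l].second);
    if (takeFile)
      Items.push_back(Item(Files[f++].first, false));
    else
      Items.push_back(Item(Libs[l++].first, true));
  }
}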
-int main(int argc, char **argv, char **envp) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal();
- PrettyStackTraceProgram X(argc, argv);
-
- LLVMContext &Context = getGlobalContext();
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
-
- // Initialize passes
- PassRegistry &Registry = *PassRegistry::getPassRegistry();
- initializeCore(Registry);
- initializeScalarOpts(Registry);
- initializeIPO(Registry);
- initializeAnalysis(Registry);
- initializeIPA(Registry);
- initializeTransformUtils(Registry);
- initializeInstCombine(Registry);
- initializeTarget(Registry);
-
- // Initialize the global variable above for convenient printing of the program name.
- progname = sys::path::stem(argv[0]);
-
- // Parse the command line options
- cl::ParseCommandLineOptions(argc, argv, "llvm linker\n");
-
-#if defined(_WIN32) || defined(__CYGWIN__)
- if (!LinkAsLibrary) {
- // Default to "a.exe" instead of "a.out".
- if (OutputFilename.getNumOccurrences() == 0)
- OutputFilename = "a.exe";
-
- // If there is no suffix add an "exe" one.
- if (sys::path::extension(OutputFilename).empty())
- OutputFilename.append(".exe");
- }
-#endif
-
- // Generate the bitcode for the optimized module.
- // If -b wasn't specified, use the name specified
- // with -o to construct BitcodeOutputFilename.
- if (BitcodeOutputFilename.empty()) {
- BitcodeOutputFilename = OutputFilename;
- if (!LinkAsLibrary) BitcodeOutputFilename += ".bc";
- }
-
- // Arrange for the bitcode output file to be deleted on any errors.
- BitcodeOutputRemover.setFile(BitcodeOutputFilename);
- sys::RemoveFileOnSignal(sys::Path(BitcodeOutputFilename));
-
- // Arrange for the output file to be deleted on any errors.
- if (!LinkAsLibrary) {
- OutputRemover.setFile(OutputFilename);
- sys::RemoveFileOnSignal(sys::Path(OutputFilename));
- }
-
- // Construct a Linker (now that Verbose is set)
- Linker TheLinker(progname, OutputFilename, Context, Verbose);
-
- // Keep track of the native link items (versus the bitcode items)
- Linker::ItemList NativeLinkItems;
-
- // Add library paths to the linker
- TheLinker.addPaths(LibPaths);
- TheLinker.addSystemPaths();
-
- // Remove any consecutive duplicates of the same library...
- Libraries.erase(std::unique(Libraries.begin(), Libraries.end()),
- Libraries.end());
-
- if (LinkAsLibrary) {
- std::vector<sys::Path> Files;
- for (unsigned i = 0; i < InputFilenames.size(); ++i )
- Files.push_back(sys::Path(InputFilenames[i]));
- if (TheLinker.LinkInFiles(Files))
- return 1; // Error already printed
-
- // The libraries aren't linked in but are noted as "dependent" in the
- // module.
- for (cl::list<std::string>::const_iterator I = Libraries.begin(),
- E = Libraries.end(); I != E ; ++I) {
- TheLinker.getModule()->addLibrary(*I);
- }
- } else {
- // Build a list of the items from our command line
- Linker::ItemList Items;
- BuildLinkItems(Items, InputFilenames, Libraries);
-
- // Link all the items together
- if (TheLinker.LinkInItems(Items, NativeLinkItems) )
- return 1; // Error already printed
- }
-
- std::auto_ptr<Module> Composite(TheLinker.releaseModule());
-
- // Optimize the module
- Optimize(Composite.get());
-
- // Generate the bitcode output.
- GenerateBitcode(Composite.get(), BitcodeOutputFilename);
-
- // If we are not linking a library, generate either a native executable
- // or a JIT shell script, depending upon what the user wants.
- if (!LinkAsLibrary) {
- // If the user wants to run a post-link optimization, run it now.
- if (!PostLinkOpts.empty()) {
- std::vector<std::string> opts = PostLinkOpts;
- for (std::vector<std::string>::iterator I = opts.begin(),
- E = opts.end(); I != E; ++I) {
- sys::Path prog(*I);
- if (!prog.canExecute()) {
- prog = sys::Program::FindProgramByName(*I);
- if (prog.isEmpty())
- PrintAndExit(std::string("Optimization program '") + *I +
- "' is not found or not executable.", Composite.get());
- }
- // Get the program arguments
- sys::Path tmp_output("opt_result");
- std::string ErrMsg;
- if (tmp_output.createTemporaryFileOnDisk(true, &ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
-
- const char* args[4];
- args[0] = I->c_str();
- args[1] = BitcodeOutputFilename.c_str();
- args[2] = tmp_output.c_str();
- args[3] = 0;
- if (0 == sys::Program::ExecuteAndWait(prog, args, 0,0,0,0, &ErrMsg)) {
- if (tmp_output.isBitcodeFile()) {
- sys::Path target(BitcodeOutputFilename);
- target.eraseFromDisk();
- if (tmp_output.renamePathOnDisk(target, &ErrMsg))
- PrintAndExit(ErrMsg, Composite.get(), 2);
- } else
- PrintAndExit("Post-link optimization output is not bitcode",
- Composite.get());
- } else {
- PrintAndExit(ErrMsg, Composite.get());
- }
- }
- }
-
- // If the user wants to generate a native executable, compile it from the
- // bitcode file.
- //
- // Otherwise, create a script that will run the bitcode through the JIT.
- if (Native) {
- // Name of the Assembly Language output file
- sys::Path AssemblyFile ( OutputFilename);
- AssemblyFile.appendSuffix("s");
-
- // Mark the output files for removal.
- FileRemover AssemblyFileRemover(AssemblyFile.str());
- sys::RemoveFileOnSignal(AssemblyFile);
-
- // Determine the locations of the llc and gcc programs.
- sys::Path llc = PrependMainExecutablePath("llc", argv[0],
- (void *)(intptr_t)&Optimize);
- if (llc.isEmpty())
- PrintAndExit("Failed to find llc", Composite.get());
-
- sys::Path gcc = sys::Program::FindProgramByName("gcc");
- if (gcc.isEmpty())
- PrintAndExit("Failed to find gcc", Composite.get());
-
- // Generate an assembly language file for the bitcode.
- std::string ErrMsg;
- if (0 != GenerateAssembly(AssemblyFile.str(), BitcodeOutputFilename,
- llc, ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
-
- if (0 != GenerateNative(OutputFilename, AssemblyFile.str(),
- NativeLinkItems, gcc, envp, ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
- } else if (NativeCBE) {
- sys::Path CFile (OutputFilename);
- CFile.appendSuffix("cbe.c");
-
- // Mark the output files for removal.
- FileRemover CFileRemover(CFile.str());
- sys::RemoveFileOnSignal(CFile);
-
- // Determine the locations of the llc and gcc programs.
- sys::Path llc = PrependMainExecutablePath("llc", argv[0],
- (void *)(intptr_t)&Optimize);
- if (llc.isEmpty())
- PrintAndExit("Failed to find llc", Composite.get());
-
- sys::Path gcc = sys::Program::FindProgramByName("gcc");
- if (gcc.isEmpty())
- PrintAndExit("Failed to find gcc", Composite.get());
-
- // Generate an assembly language file for the bitcode.
- std::string ErrMsg;
- if (GenerateCFile(CFile.str(), BitcodeOutputFilename, llc, ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
-
- if (GenerateNative(OutputFilename, CFile.str(),
- NativeLinkItems, gcc, envp, ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
- } else {
- EmitShellScript(argv, Composite.get());
- }
-
- // Make the script executable...
- std::string ErrMsg;
- if (sys::Path(OutputFilename).makeExecutableOnDisk(&ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
-
- // Make the bitcode file readable and directly executable in LLEE as well
- if (sys::Path(BitcodeOutputFilename).makeExecutableOnDisk(&ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
-
- if (sys::Path(BitcodeOutputFilename).makeReadableOnDisk(&ErrMsg))
- PrintAndExit(ErrMsg, Composite.get());
- }
-
- // Operations which may fail are now complete.
- BitcodeOutputRemover.releaseFile();
- if (!LinkAsLibrary)
- OutputRemover.releaseFile();
-
- // Graceful exit
- return 0;
-}
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 36a482e..3bceb14 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -180,38 +180,16 @@ static const Target *GetTarget(const char *ProgName) {
TripleName = sys::getDefaultTargetTriple();
Triple TheTriple(Triple::normalize(TripleName));
- const Target *TheTarget = 0;
- if (!ArchName.empty()) {
- for (TargetRegistry::iterator it = TargetRegistry::begin(),
- ie = TargetRegistry::end(); it != ie; ++it) {
- if (ArchName == it->getName()) {
- TheTarget = &*it;
- break;
- }
- }
-
- if (!TheTarget) {
- errs() << ProgName << ": error: invalid target '" << ArchName << "'.\n";
- return 0;
- }
-
- // Adjust the triple to match (if known), otherwise stick with the
- // module/host triple.
- Triple::ArchType Type = Triple::getArchTypeForLLVMName(ArchName);
- if (Type != Triple::UnknownArch)
- TheTriple.setArch(Type);
- } else {
- // Get the target specific parser.
- std::string Error;
- TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), Error);
- if (TheTarget == 0) {
- errs() << ProgName << ": error: unable to get target for '"
- << TheTriple.getTriple()
- << "', see --version and --triple.\n";
- return 0;
- }
+ // Get the target specific parser.
+ std::string Error;
+ const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple,
+ Error);
+ if (!TheTarget) {
+ errs() << ProgName << ": " << Error;
+ return 0;
}
+ // Update the triple name and return the found target.
TripleName = TheTriple.getTriple();
return TheTarget;
}
@@ -430,7 +408,7 @@ int main(int argc, char **argv) {
MCCodeEmitter *CE = 0;
MCAsmBackend *MAB = 0;
if (ShowEncoding) {
- CE = TheTarget->createMCCodeEmitter(*MCII, *STI, Ctx);
+ CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
MAB = TheTarget->createMCAsmBackend(TripleName);
}
Str.reset(TheTarget->createAsmStreamer(Ctx, FOS, /*asmverbose*/true,
@@ -443,7 +421,7 @@ int main(int argc, char **argv) {
Str.reset(createNullStreamer(Ctx));
} else {
assert(FileType == OFT_ObjectFile && "Invalid file type!");
- MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *STI, Ctx);
+ MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
MCAsmBackend *MAB = TheTarget->createMCAsmBackend(TripleName);
Str.reset(TheTarget->createMCObjectStreamer(TripleName, Ctx, *MAB,
FOS, CE, RelaxAll,
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 8d9e51e..9afbd4d 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -7,12 +7,12 @@
//
//===----------------------------------------------------------------------===//
//
-// This program is a utility that works like traditional Unix "nm",
-// that is, it prints out the names of symbols in a bitcode file,
-// along with some information about each symbol.
+// This program is a utility that works like traditional Unix "nm", that is, it
+// prints out the names of symbols in a bitcode or object file, along with some
+// information about each symbol.
//
-// This "nm" does not print symbols' addresses. It supports many of
-// the features of GNU "nm", including its different output formats.
+// This "nm" supports many of the features of GNU "nm", including its different
+// output formats.
//
//===----------------------------------------------------------------------===//
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 0e7f3fd..1feea42 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -44,7 +44,7 @@ using namespace object;
static cl::opt<bool>
CFG("cfg", cl::desc("Create a CFG for every symbol in the object file and"
- "write it to a graphviz file (MachO-only)"));
+ " write it to a graphviz file (MachO-only)"));
static cl::opt<bool>
UseDbg("g", cl::desc("Print line information from debug info if available"));
@@ -286,8 +286,10 @@ void llvm::DisassembleInputMachO(StringRef Filename) {
// Read and register the symbol table data.
InMemoryStruct<macho::SymtabLoadCommand> SymtabLC;
- MachOObj->ReadSymtabLoadCommand(*SymtabLCI, SymtabLC);
- MachOObj->RegisterStringTable(*SymtabLC);
+ if (SymtabLCI) {
+ MachOObj->ReadSymtabLoadCommand(*SymtabLCI, SymtabLC);
+ MachOObj->RegisterStringTable(*SymtabLC);
+ }
std::vector<SectionRef> Sections;
std::vector<SymbolRef> Symbols;
@@ -430,7 +432,7 @@ void llvm::DisassembleInputMachO(StringRef Filename) {
// Stop disassembling either at the beginning of the next symbol or at
// the end of the section.
- bool containsNextSym = true;
+ bool containsNextSym = false;
uint64_t NextSym = 0;
uint64_t NextSymIdx = SymIdx+1;
while (Symbols.size() > NextSymIdx) {
@@ -498,6 +500,29 @@ void llvm::DisassembleInputMachO(StringRef Filename) {
InstrAnalysis.get(), Start, DebugOut, FunctionMap, Functions);
}
}
+ if (!CFG && !symbolTableWorked) {
+ // Reading the symbol table didn't work, disassemble the whole section.
+ uint64_t SectAddress;
+ Sections[SectIdx].getAddress(SectAddress);
+ uint64_t SectSize;
+ Sections[SectIdx].getSize(SectSize);
+ uint64_t InstSize;
+ for (uint64_t Index = 0; Index < SectSize; Index += InstSize) {
+ MCInst Inst;
+
+ if (DisAsm->getInstruction(Inst, InstSize, memoryObject, Index,
+ DebugOut, nulls())) {
+ outs() << format("%8" PRIx64 ":\t", SectAddress + Index);
+ DumpBytes(StringRef(Bytes.data() + Index, InstSize));
+ IP->printInst(&Inst, outs(), "");
+ outs() << "\n";
+ } else {
+ errs() << "llvm-objdump: warning: invalid instruction encoding\n";
+ if (InstSize == 0)
+ InstSize = 1; // skip illegible bytes
+ }
+ }
+ }
if (CFG) {
if (!symbolTableWorked) {
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 5a6f94a..b431c76 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -104,29 +104,27 @@ static bool error(error_code ec) {
return true;
}
-static const Target *GetTarget(const ObjectFile *Obj = NULL) {
+static const Target *getTarget(const ObjectFile *Obj = NULL) {
// Figure out the target triple.
- llvm::Triple TT("unknown-unknown-unknown");
+ llvm::Triple TheTriple("unknown-unknown-unknown");
if (TripleName.empty()) {
if (Obj)
- TT.setArch(Triple::ArchType(Obj->getArch()));
+ TheTriple.setArch(Triple::ArchType(Obj->getArch()));
} else
- TT.setTriple(Triple::normalize(TripleName));
-
- if (!ArchName.empty())
- TT.setArchName(ArchName);
-
- TripleName = TT.str();
+ TheTriple.setTriple(Triple::normalize(TripleName));
// Get the target specific parser.
std::string Error;
- const Target *TheTarget = TargetRegistry::lookupTarget(TripleName, Error);
- if (TheTarget)
- return TheTarget;
+ const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple,
+ Error);
+ if (!TheTarget) {
+ errs() << ToolName << ": " << Error;
+ return 0;
+ }
- errs() << ToolName << ": error: unable to get target for '" << TripleName
- << "', see --version and --triple.\n";
- return 0;
+ // Update the triple name and return the found target.
+ TripleName = TheTriple.getTriple();
+ return TheTarget;
}
void llvm::StringRefMemoryObject::anchor() { }
@@ -165,11 +163,11 @@ static bool RelocAddressLess(RelocationRef a, RelocationRef b) {
}
static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
- const Target *TheTarget = GetTarget(Obj);
- if (!TheTarget) {
- // GetTarget prints out stuff.
+ const Target *TheTarget = getTarget(Obj);
+ // getTarget() will have already issued a diagnostic if necessary, so
+ // just bail here if it failed.
+ if (!TheTarget)
return;
- }
error_code ec;
for (section_iterator i = Obj->begin_sections(),
@@ -208,7 +206,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
if (InlineRelocs) {
for (relocation_iterator ri = i->begin_relocations(),
re = i->end_relocations();
- ri != re; ri.increment(ec)) {
+ ri != re; ri.increment(ec)) {
if (error(ec)) break;
Rels.push_back(*ri);
}
@@ -465,9 +463,8 @@ static void PrintCOFFSymbolTable(const COFFObjectFile *coff) {
<< format("assoc %d comdat %d\n"
, unsigned(asd->Number)
, unsigned(asd->Selection));
- } else {
+ } else
outs() << "AUX Unknown\n";
- }
} else {
StringRef name;
if (error(coff->getSymbol(i, symbol))) return;
@@ -611,13 +608,12 @@ static void DumpInput(StringRef file) {
return;
}
- if (Archive *a = dyn_cast<Archive>(binary.get())) {
+ if (Archive *a = dyn_cast<Archive>(binary.get()))
DumpArchive(a);
- } else if (ObjectFile *o = dyn_cast<ObjectFile>(binary.get())) {
+ else if (ObjectFile *o = dyn_cast<ObjectFile>(binary.get()))
DumpObject(o);
- } else {
+ else
errs() << ToolName << ": '" << file << "': " << "Unrecognized file type.\n";
- }
}
int main(int argc, char **argv) {
@@ -632,6 +628,9 @@ int main(int argc, char **argv) {
llvm::InitializeAllAsmParsers();
llvm::InitializeAllDisassemblers();
+ // Register the target printer for --version.
+ cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
+
cl::ParseCommandLineOptions(argc, argv, "llvm object file dumper\n");
TripleName = Triple::normalize(TripleName);
diff --git a/tools/llvm-prof/llvm-prof.cpp b/tools/llvm-prof/llvm-prof.cpp
index d9b6713..81e9503 100644
--- a/tools/llvm-prof/llvm-prof.cpp
+++ b/tools/llvm-prof/llvm-prof.cpp
@@ -281,7 +281,7 @@ int main(int argc, char **argv) {
// using the standard profile info provider pass, but for now this gives us
// access to additional information not exposed via the ProfileInfo
// interface.
- ProfileInfoLoader PIL(argv[0], ProfileDataFile, *M);
+ ProfileInfoLoader PIL(argv[0], ProfileDataFile);
// Run the printer pass.
PassManager PassMgr;
diff --git a/tools/llvm-ranlib/llvm-ranlib.cpp b/tools/llvm-ranlib/llvm-ranlib.cpp
index 64f795f..4006765 100644
--- a/tools/llvm-ranlib/llvm-ranlib.cpp
+++ b/tools/llvm-ranlib/llvm-ranlib.cpp
@@ -81,7 +81,7 @@ int main(int argc, char **argv) {
if (!TheArchive)
throw err_msg;
- if (TheArchive->writeToDisk(true, false, false, &err_msg ))
+ if (TheArchive->writeToDisk(true, false, &err_msg ))
throw err_msg;
if (Verbose)
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 01a7d15..95de8d8 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -63,18 +63,37 @@ public:
return 0;
}
+ // Invalidate instruction cache for sections with execute permissions.
+ // Some platforms with separate data and instruction caches require an
+ // explicit cache flush; otherwise JIT code manipulations (like resolved
+ // relocations) reach the data cache but not the instruction cache.
+ virtual void invalidateInstructionCache();
};
uint8_t *TrivialMemoryManager::allocateCodeSection(uintptr_t Size,
unsigned Alignment,
unsigned SectionID) {
- return (uint8_t*)sys::Memory::AllocateRWX(Size, 0, 0).base();
+ sys::MemoryBlock MB = sys::Memory::AllocateRWX(Size, 0, 0);
+ FunctionMemory.push_back(MB);
+ return (uint8_t*)MB.base();
}
uint8_t *TrivialMemoryManager::allocateDataSection(uintptr_t Size,
unsigned Alignment,
unsigned SectionID) {
- return (uint8_t*)sys::Memory::AllocateRWX(Size, 0, 0).base();
+ sys::MemoryBlock MB = sys::Memory::AllocateRWX(Size, 0, 0);
+ DataMemory.push_back(MB);
+ return (uint8_t*)MB.base();
+}
+
+void TrivialMemoryManager::invalidateInstructionCache() {
+ for (int i = 0, e = FunctionMemory.size(); i != e; ++i)
+ sys::Memory::InvalidateInstructionCache(FunctionMemory[i].base(),
+ FunctionMemory[i].size());
+
+ for (int i = 0, e = DataMemory.size(); i != e; ++i)
+ sys::Memory::InvalidateInstructionCache(DataMemory[i].base(),
+ DataMemory[i].size());
}
static const char *ProgramName;
@@ -113,6 +132,8 @@ static int executeInput() {
// Resolve all the relocations we can.
Dyld.resolveRelocations();
+ // Clear the instruction cache before the code is executed.
+ MemMgr->invalidateInstructionCache();
// FIXME: Error out if there are unresolved relocations.
diff --git a/tools/llvm-shlib/Makefile b/tools/llvm-shlib/Makefile
index 75bee07..6d6c6e9 100644
--- a/tools/llvm-shlib/Makefile
+++ b/tools/llvm-shlib/Makefile
@@ -63,7 +63,7 @@ ifeq ($(HOST_OS),Darwin)
endif
endif
-ifeq ($(HOST_OS), $(filter $(HOST_OS), Linux FreeBSD OpenBSD GNU))
+ifeq ($(HOST_OS), $(filter $(HOST_OS), Linux FreeBSD OpenBSD GNU Bitrig))
# Include everything from the .a's into the shared library.
LLVMLibsOptions := -Wl,--whole-archive $(LLVMLibsOptions) \
-Wl,--no-whole-archive
diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index fb05a58..31252dd 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp
@@ -82,6 +82,12 @@ public:
uint64_t Val = Rand32();
return Val | (uint64_t(Rand32()) << 32);
}
+
+ /// Rand operator for STL algorithms.
+ ptrdiff_t operator()(ptrdiff_t y) {
+ return Rand64() % y;
+ }
+
private:
unsigned Seed;
};
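The new call operator is what lets a Random instance be passed directly to std::random_shuffle, which repeatedly invokes the functor with the number of candidate positions and expects a result in [0, y). The contract in miniature, with a toy generator standing in for the llvm-stress one (Rng and its LCG constants are illustrative):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct Rng {
  unsigned Seed;
  explicit Rng(unsigned S) : Seed(S) {}
  // std::random_shuffle calls this with the number of remaining slots and
  // expects a value in [0, y).
  std::ptrdiff_t operator()(std::ptrdiff_t y) {
    Seed = Seed * 1103515245u + 12345u;  // toy LCG; deterministic per seed
    return Seed % y;
  }
};

int main() {
  std::vector<int> V;
  for (int i = 0; i != 8; ++i)
    V.push_back(i);
  Rng R(42);
  std::random_shuffle(V.begin(), V.end(), R);  // same permutation per seed
  for (std::size_t i = 0; i != V.size(); ++i)
    std::printf("%d ", V[i]);
  std::printf("\n");
  return 0;
}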
@@ -599,15 +605,13 @@ struct CmpModifier: public Modifier {
}
};
-void FillFunction(Function *F) {
+void FillFunction(Function *F, Random &R) {
// Create a legal entry block.
BasicBlock *BB = BasicBlock::Create(F->getContext(), "BB", F);
ReturnInst::Create(F->getContext(), BB);
// Create the value table.
Modifier::PieceTable PT;
- // Pick an initial seed value
- Random R(SeedCL);
// Consider arguments as legal values.
for (Function::arg_iterator it = F->arg_begin(), e = F->arg_end();
@@ -648,15 +652,17 @@ void FillFunction(Function *F) {
SM->ActN(5); // Throw in a few stores.
}
-void IntroduceControlFlow(Function *F) {
- std::set<Instruction*> BoolInst;
+void IntroduceControlFlow(Function *F, Random &R) {
+ std::vector<Instruction*> BoolInst;
for (BasicBlock::iterator it = F->begin()->begin(),
e = F->begin()->end(); it != e; ++it) {
if (it->getType() == IntegerType::getInt1Ty(F->getContext()))
- BoolInst.insert(it);
+ BoolInst.push_back(it);
}
- for (std::set<Instruction*>::iterator it = BoolInst.begin(),
+ std::random_shuffle(BoolInst.begin(), BoolInst.end(), R);
+
+ for (std::vector<Instruction*>::iterator it = BoolInst.begin(),
e = BoolInst.end(); it != e; ++it) {
Instruction *Instr = *it;
BasicBlock *Curr = Instr->getParent();
@@ -678,8 +684,13 @@ int main(int argc, char **argv) {
std::auto_ptr<Module> M(new Module("/tmp/autogen.bc", getGlobalContext()));
Function *F = GenEmptyFunction(M.get());
- FillFunction(F);
- IntroduceControlFlow(F);
+
+ // Pick an initial seed value
+ Random R(SeedCL);
+ // Generate lots of random instructions inside a single basic block.
+ FillFunction(F, R);
+ // Break the basic block into many loops.
+ IntroduceControlFlow(F, R);
// Figure out what stream we are supposed to write to...
OwningPtr<tool_output_file> Out;
diff --git a/tools/llvm-stub/CMakeLists.txt b/tools/llvm-stub/CMakeLists.txt
deleted file mode 100644
index a98dc9e..0000000
--- a/tools/llvm-stub/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_llvm_tool(llvm-stub
- llvm-stub.c
- )
diff --git a/tools/llvm-stub/LLVMBuild.txt b/tools/llvm-stub/LLVMBuild.txt
deleted file mode 100644
index 5c3534c..0000000
--- a/tools/llvm-stub/LLVMBuild.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-;===- ./tools/llvm-stub/LLVMBuild.txt --------------------------*- Conf -*--===;
-;
-; The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-; http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Tool
-name = llvm-stub
-parent = Tools
-required_libraries =
diff --git a/tools/llvm-stub/llvm-stub.c b/tools/llvm-stub/llvm-stub.c
deleted file mode 100644
index 69cd6ed..0000000
--- a/tools/llvm-stub/llvm-stub.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*===- llvm-stub.c - Stub executable to run llvm bitcode files ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tool is used by the gccld program to enable transparent execution of
-// bitcode files by the user. Specifically, gccld outputs two files when asked
-// to compile a <program> file:
-// 1. It outputs the LLVM bitcode file to <program>.bc
-// 2. It outputs a stub executable that runs lli on <program>.bc
-//
-// This allows the end user to just say ./<program> and have the JIT executed
-// automatically. On Unix, the stub executable emitted is actually a Bourne
-// shell script that does the forwarding. Windows does not like #!/bin/sh
-// programs in .exe files, so we make it an actual program, defined here.
-//
-//===----------------------------------------------------------------------===*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "llvm/Config/config.h"
-
-#if defined(HAVE_UNISTD_H) && !defined(_MSC_VER)
-#include <unistd.h>
-#endif
-
-#ifdef _WIN32
-#include <process.h>
-#include <io.h>
-#endif
-
-int main(int argc, char** argv) {
- const char *Interp = getenv("LLVMINTERP");
- const char **Args;
- if (Interp == 0) Interp = "lli";
-
- /* Set up the command line options to pass to the JIT. */
- Args = (const char**)malloc(sizeof(char*) * (argc+2));
- /* argv[0] is the JIT */
- Args[0] = Interp;
-
-#ifdef LLVM_ON_WIN32
- {
- int len = strlen(argv[0]);
- if (len < 4 || strcmp(argv[0] + len - 4, ".exe") != 0) {
- /* .exe suffix is stripped off of argv[0] if the executable was run on the
- * command line without one. Put it back on.
- */
- argv[0] = strcat(strcpy((char*)malloc(len + 5), argv[0]), ".exe");
- }
- }
-#endif
-
- /* argv[1] is argv[0] + ".bc". */
- Args[1] = strcat(strcpy((char*)malloc(strlen(argv[0])+4), argv[0]), ".bc");
-
- /* The rest of the args are as before. */
- memcpy((char **)Args+2, argv+1, sizeof(char*)*argc);
-
- /* Run the JIT. */
-#if !defined(_WIN32) || defined(__MINGW64__)
- execvp(Interp, (char **)Args); /* POSIX execvp takes a char *const[]. */
-#else
- execvp(Interp, Args); /* windows execvp takes a const char *const *. */
-#endif
- /* If execvp returns, the JIT could not be started. */
- fprintf(stderr, "Could not execute the LLVM JIT. Either add 'lli' to your"
- " path, or set the\ninterpreter you want to use in the LLVMINTERP "
- "environment variable.\n");
- return 1;
-}
diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp
index 6382a3f..b80bc34 100644
--- a/tools/lto/LTOCodeGenerator.cpp
+++ b/tools/lto/LTOCodeGenerator.cpp
@@ -46,10 +46,12 @@
#include "llvm/ADT/StringExtras.h"
using namespace llvm;
-static cl::opt<bool> DisableInline("disable-inlining", cl::init(false),
+static cl::opt<bool>
+DisableInline("disable-inlining", cl::init(false),
cl::desc("Do not run the inliner pass"));
-static cl::opt<bool> DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
+static cl::opt<bool>
+DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
cl::desc("Do not run the GVN load PRE pass"));
const char* LTOCodeGenerator::getVersionString() {
@@ -152,7 +154,7 @@ bool LTOCodeGenerator::writeMergedModules(const char *path,
bool LTOCodeGenerator::compile_to_file(const char** name, std::string& errMsg) {
// make unique temp .o file to put generated object file
sys::PathWithStatus uniqueObjPath("lto-llvm.o");
- if ( uniqueObjPath.createTemporaryFileOnDisk(false, &errMsg) ) {
+ if (uniqueObjPath.createTemporaryFileOnDisk(false, &errMsg)) {
uniqueObjPath.eraseFromDisk();
return true;
}
@@ -172,7 +174,7 @@ bool LTOCodeGenerator::compile_to_file(const char** name, std::string& errMsg) {
}
objFile.keep();
- if ( genResult ) {
+ if (genResult) {
uniqueObjPath.eraseFromDisk();
return true;
}
@@ -202,46 +204,49 @@ const void* LTOCodeGenerator::compile(size_t* length, std::string& errMsg) {
sys::Path(_nativeObjectPath).eraseFromDisk();
// return buffer, unless error
- if ( _nativeObjectFile == NULL )
+ if (_nativeObjectFile == NULL)
return NULL;
*length = _nativeObjectFile->getBufferSize();
return _nativeObjectFile->getBufferStart();
}
bool LTOCodeGenerator::determineTarget(std::string& errMsg) {
- if ( _target == NULL ) {
- std::string Triple = _linker.getModule()->getTargetTriple();
- if (Triple.empty())
- Triple = sys::getDefaultTargetTriple();
-
- // create target machine from info for merged modules
- const Target *march = TargetRegistry::lookupTarget(Triple, errMsg);
- if ( march == NULL )
- return true;
-
- // The relocation model is actually a static member of TargetMachine and
- // needs to be set before the TargetMachine is instantiated.
- Reloc::Model RelocModel = Reloc::Default;
- switch( _codeModel ) {
- case LTO_CODEGEN_PIC_MODEL_STATIC:
- RelocModel = Reloc::Static;
- break;
- case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
- RelocModel = Reloc::PIC_;
- break;
- case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
- RelocModel = Reloc::DynamicNoPIC;
- break;
- }
-
- // construct LTOModule, hand over ownership of module and target
- SubtargetFeatures Features;
- Features.getDefaultSubtargetFeatures(llvm::Triple(Triple));
- std::string FeatureStr = Features.getString();
- TargetOptions Options;
- _target = march->createTargetMachine(Triple, _mCpu, FeatureStr, Options,
- RelocModel);
+ if (_target != NULL)
+ return false;
+
+ std::string Triple = _linker.getModule()->getTargetTriple();
+ if (Triple.empty())
+ Triple = sys::getDefaultTargetTriple();
+
+ // create target machine from info for merged modules
+ const Target *march = TargetRegistry::lookupTarget(Triple, errMsg);
+ if (march == NULL)
+ return true;
+
+ // The relocation model is actually a static member of TargetMachine and
+ // needs to be set before the TargetMachine is instantiated.
+ Reloc::Model RelocModel = Reloc::Default;
+ switch (_codeModel) {
+ case LTO_CODEGEN_PIC_MODEL_STATIC:
+ RelocModel = Reloc::Static;
+ break;
+ case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
+ RelocModel = Reloc::PIC_;
+ break;
+ case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
+ RelocModel = Reloc::DynamicNoPIC;
+ break;
}
+
+ // construct LTOModule, hand over ownership of module and target
+ SubtargetFeatures Features;
+ Features.getDefaultSubtargetFeatures(llvm::Triple(Triple));
+ std::string FeatureStr = Features.getString();
+ TargetOptions Options;
+ LTOModule::getTargetOptions(Options);
+ _target = march->createTargetMachine(Triple, _mCpu, FeatureStr, Options,
+ RelocModel, CodeModel::Default,
+ CodeGenOpt::Aggressive);
return false;
}
@@ -333,13 +338,13 @@ void LTOCodeGenerator::applyScopeRestrictions() {
/// Optimize merged modules using various IPO passes
bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
std::string &errMsg) {
- if ( this->determineTarget(errMsg) )
+ if (this->determineTarget(errMsg))
return true;
Module* mergedModule = _linker.getModule();
// if options were requested, set them
- if ( !_codegenOptions.empty() )
+ if (!_codegenOptions.empty())
cl::ParseCommandLineOptions(_codegenOptions.size(),
const_cast<char **>(&_codegenOptions[0]));
@@ -372,8 +377,7 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
formatted_raw_ostream Out(out);
if (_target->addPassesToEmitFile(*codeGenPasses, Out,
- TargetMachine::CGFT_ObjectFile,
- CodeGenOpt::Aggressive)) {
+ TargetMachine::CGFT_ObjectFile)) {
errMsg = "target file type not supported";
return true;
}
@@ -402,7 +406,7 @@ void LTOCodeGenerator::setCodeGenDebugOptions(const char *options) {
!o.first.empty(); o = getToken(o.second)) {
// ParseCommandLineOptions() expects argv[0] to be program name. Lazily add
// that.
- if ( _codegenOptions.empty() )
+ if (_codegenOptions.empty())
_codegenOptions.push_back(strdup("libLTO"));
_codegenOptions.push_back(strdup(o.first.str().c_str()));
}
diff --git a/tools/lto/LTOCodeGenerator.h b/tools/lto/LTOCodeGenerator.h
index 032dc37..3081b7d 100644
--- a/tools/lto/LTOCodeGenerator.h
+++ b/tools/lto/LTOCodeGenerator.h
@@ -70,7 +70,6 @@ private:
llvm::TargetMachine* _target;
bool _emitDwarfDebugInfo;
bool _scopeRestrictionsDone;
- bool _runInternalizePass;
lto_codegen_model _codeModel;
StringSet _mustPreserveSymbols;
StringSet _asmUndefinedRefs;
diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp
index 1dbd64b..c5b3d10 100644
--- a/tools/lto/LTOModule.cpp
+++ b/tools/lto/LTOModule.cpp
@@ -26,6 +26,7 @@
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -37,6 +38,118 @@
#include "llvm/ADT/Triple.h"
using namespace llvm;
+static cl::opt<bool>
+EnableFPMAD("enable-fp-mad",
+ cl::desc("Enable less precise MAD instructions to be generated"),
+ cl::init(false));
+
+static cl::opt<bool>
+DisableFPElim("disable-fp-elim",
+ cl::desc("Disable frame pointer elimination optimization"),
+ cl::init(false));
+
+static cl::opt<bool>
+DisableFPElimNonLeaf("disable-non-leaf-fp-elim",
+ cl::desc("Disable frame pointer elimination optimization for non-leaf funcs"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableUnsafeFPMath("enable-unsafe-fp-math",
+ cl::desc("Enable optimizations that may decrease FP precision"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableNoInfsFPMath("enable-no-infs-fp-math",
+ cl::desc("Enable FP math optimizations that assume no +-Infs"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableNoNaNsFPMath("enable-no-nans-fp-math",
+ cl::desc("Enable FP math optimizations that assume no NaNs"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math",
+ cl::Hidden,
+ cl::desc("Force codegen to assume rounding mode can change dynamically"),
+ cl::init(false));
+
+static cl::opt<bool>
+GenerateSoftFloatCalls("soft-float",
+ cl::desc("Generate software floating point library calls"),
+ cl::init(false));
+
+static cl::opt<llvm::FloatABI::ABIType>
+FloatABIForCalls("float-abi",
+ cl::desc("Choose float ABI type"),
+ cl::init(FloatABI::Default),
+ cl::values(
+ clEnumValN(FloatABI::Default, "default",
+ "Target default float ABI type"),
+ clEnumValN(FloatABI::Soft, "soft",
+ "Soft float ABI (implied by -soft-float)"),
+ clEnumValN(FloatABI::Hard, "hard",
+ "Hard float ABI (uses FP registers)"),
+ clEnumValEnd));
+
+static cl::opt<llvm::FPOpFusion::FPOpFusionMode>
+FuseFPOps("fp-contract",
+ cl::desc("Enable aggresive formation of fused FP ops"),
+ cl::init(FPOpFusion::Standard),
+ cl::values(
+ clEnumValN(FPOpFusion::Fast, "fast",
+ "Fuse FP ops whenever profitable"),
+ clEnumValN(FPOpFusion::Standard, "on",
+ "Only fuse 'blessed' FP ops."),
+ clEnumValN(FPOpFusion::Strict, "off",
+ "Only fuse FP ops when the result won't be effected."),
+ clEnumValEnd));
+
+static cl::opt<bool>
+DontPlaceZerosInBSS("nozero-initialized-in-bss",
+ cl::desc("Don't place zero-initialized symbols into bss section"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableGuaranteedTailCallOpt("tailcallopt",
+ cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
+ cl::init(false));
+
+static cl::opt<bool>
+DisableTailCalls("disable-tail-calls",
+ cl::desc("Never emit tail calls"),
+ cl::init(false));
+
+static cl::opt<unsigned>
+OverrideStackAlignment("stack-alignment",
+ cl::desc("Override default stack alignment"),
+ cl::init(0));
+
+static cl::opt<bool>
+EnableRealignStack("realign-stack",
+ cl::desc("Realign stack if needed"),
+ cl::init(true));
+
+static cl::opt<std::string>
+TrapFuncName("trap-func", cl::Hidden,
+ cl::desc("Emit a call to trap function rather than a trap instruction"),
+ cl::init(""));
+
+static cl::opt<bool>
+EnablePIE("enable-pie",
+ cl::desc("Assume the creation of a position independent executable."),
+ cl::init(false));
+
+static cl::opt<bool>
+SegmentedStacks("segmented-stacks",
+ cl::desc("Use segmented stacks if possible."),
+ cl::init(false));
+
+static cl::opt<bool>
+UseInitArray("use-init-array",
+ cl::desc("Use .init_array instead of .ctors."),
+ cl::init(false));
+
LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t)
: _module(m), _target(t),
_context(*_target->getMCAsmInfo(), *_target->getRegisterInfo(), NULL),
@@ -117,6 +230,30 @@ LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length,
return makeLTOModule(buffer.take(), errMsg);
}
+void LTOModule::getTargetOptions(TargetOptions &Options) {
+ Options.LessPreciseFPMADOption = EnableFPMAD;
+ Options.NoFramePointerElim = DisableFPElim;
+ Options.NoFramePointerElimNonLeaf = DisableFPElimNonLeaf;
+ Options.AllowFPOpFusion = FuseFPOps;
+ Options.UnsafeFPMath = EnableUnsafeFPMath;
+ Options.NoInfsFPMath = EnableNoInfsFPMath;
+ Options.NoNaNsFPMath = EnableNoNaNsFPMath;
+ Options.HonorSignDependentRoundingFPMathOption =
+ EnableHonorSignDependentRoundingFPMath;
+ Options.UseSoftFloat = GenerateSoftFloatCalls;
+ if (FloatABIForCalls != FloatABI::Default)
+ Options.FloatABIType = FloatABIForCalls;
+ Options.NoZerosInBSS = DontPlaceZerosInBSS;
+ Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
+ Options.DisableTailCalls = DisableTailCalls;
+ Options.StackAlignmentOverride = OverrideStackAlignment;
+ Options.RealignStack = EnableRealignStack;
+ Options.TrapFuncName = TrapFuncName;
+ Options.PositionIndependentExecutable = EnablePIE;
+ Options.EnableSegmentedStacks = SegmentedStacks;
+ Options.UseInitArray = UseInitArray;
+}
+
LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
std::string &errMsg) {
static bool Initialized = false;
@@ -150,6 +287,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
std::string FeatureStr = Features.getString();
std::string CPU;
TargetOptions Options;
+ getTargetOptions(Options);
TargetMachine *target = march->createTargetMachine(Triple, CPU, FeatureStr,
Options);
LTOModule *Ret = new LTOModule(m.take(), target);
@@ -271,6 +409,9 @@ void LTOModule::addDefinedDataSymbol(GlobalValue *v) {
// Add to list of defined symbols.
addDefinedSymbol(v, false);
+ if (!v->hasSection() /* || !isTargetDarwin */)
+ return;
+
// Special case i386/ppc ObjC data structures in magic sections:
// The issue is that the old ObjC object format did some strange
// contortions to avoid real linker symbols. For instance, the
@@ -290,26 +431,25 @@ void LTOModule::addDefinedDataSymbol(GlobalValue *v) {
// a class was missing.
// The following synthesizes the implicit .objc_* symbols for the linker
// from the ObjC data structures generated by the front end.
- if (v->hasSection() /* && isTargetDarwin */) {
- // special case if this data blob is an ObjC class definition
- if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) {
- if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
- addObjCClass(gv);
- }
+
+ // special case if this data blob is an ObjC class definition
+ if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) {
+ if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ addObjCClass(gv);
}
+ }
- // special case if this data blob is an ObjC category definition
- else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) {
- if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
- addObjCCategory(gv);
- }
+ // special case if this data blob is an ObjC category definition
+ else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) {
+ if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ addObjCCategory(gv);
}
+ }
- // special case if this data blob is the list of referenced classes
- else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) {
- if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
- addObjCClassRef(gv);
- }
+ // special case if this data blob is the list of referenced classes
+ else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) {
+ if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ addObjCClassRef(gv);
}
}
}
@@ -409,7 +549,7 @@ void LTOModule::addAsmGlobalSymbol(const char *name,
// much.
// fill information structure
- info.name = name;
+ info.name = entry.getKey().data();
info.attributes =
LTO_SYMBOL_PERMISSIONS_DATA | LTO_SYMBOL_DEFINITION_REGULAR | scope;
info.isFunction = false;
@@ -599,7 +739,7 @@ namespace {
markGlobal(*Symbol);
}
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- unsigned Size , unsigned ByteAlignment) {
+ uint64_t Size , unsigned ByteAlignment) {
markDefined(*Symbol);
}
virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -658,21 +798,20 @@ bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr,
_context, *Streamer,
*_target->getMCAsmInfo()));
- OwningPtr<MCSubtargetInfo> STI(_target->getTarget().
- createMCSubtargetInfo(_target->getTargetTriple(),
- _target->getTargetCPU(),
- _target->getTargetFeatureString()));
- OwningPtr<MCTargetAsmParser>
- TAP(_target->getTarget().createMCAsmParser(*STI, *Parser.get()));
+ const Target &T = _target->getTarget();
+ OwningPtr<MCSubtargetInfo>
+ STI(T.createMCSubtargetInfo(_target->getTargetTriple(),
+ _target->getTargetCPU(),
+ _target->getTargetFeatureString()));
+ OwningPtr<MCTargetAsmParser> TAP(T.createMCAsmParser(*STI, *Parser.get()));
if (!TAP) {
- errMsg = "target " + std::string(_target->getTarget().getName()) +
- " does not define AsmParser.";
+ errMsg = "target " + std::string(T.getName()) +
+ " does not define AsmParser.";
return true;
}
Parser->setTargetParser(*TAP);
- int Res = Parser->Run(false);
- if (Res)
+ if (Parser->Run(false))
return true;
for (RecordStreamer::const_iterator i = Streamer->begin(),
@@ -687,6 +826,7 @@ bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
Value == RecordStreamer::Used)
addAsmGlobalSymbolUndef(Key.data());
}
+
return false;
}
@@ -694,8 +834,10 @@ bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
static bool isDeclaration(const GlobalValue &V) {
if (V.hasAvailableExternallyLinkage())
return true;
+
if (V.isMaterializable())
return false;
+
return V.isDeclaration();
}
diff --git a/tools/lto/LTOModule.h b/tools/lto/LTOModule.h
index cafb927..8e52206 100644
--- a/tools/lto/LTOModule.h
+++ b/tools/lto/LTOModule.h
@@ -29,6 +29,7 @@ namespace llvm {
class Function;
class GlobalValue;
class MemoryBuffer;
+ class TargetOptions;
class Value;
}
@@ -126,6 +127,10 @@ public:
return _asm_undefines;
}
+ /// getTargetOptions - Fill the TargetOptions object with the options
+ /// specified on the command line.
+ static void getTargetOptions(llvm::TargetOptions &Options);
+
private:
/// parseSymbols - Parse the symbols from the module and model-level ASM and
/// add them to either the defined or undefined lists.
diff --git a/tools/macho-dump/macho-dump.cpp b/tools/macho-dump/macho-dump.cpp
index 2b22c3b..20deda9 100644
--- a/tools/macho-dump/macho-dump.cpp
+++ b/tools/macho-dump/macho-dump.cpp
@@ -194,7 +194,7 @@ static int DumpSegment64Command(MachOObject &Obj,
}
outs() << " ])\n";
- return 0;
+ return Res;
}
static void DumpSymbolTableEntryData(MachOObject &Obj,
@@ -332,6 +332,35 @@ static int DumpLinkeditDataCommand(MachOObject &Obj,
return 0;
}
+static int DumpDataInCodeDataCommand(MachOObject &Obj,
+ const MachOObject::LoadCommandInfo &LCI) {
+ InMemoryStruct<macho::LinkeditDataLoadCommand> LLC;
+ Obj.ReadLinkeditDataLoadCommand(LCI, LLC);
+ if (!LLC)
+ return Error("unable to read segment load command");
+
+ outs() << " ('dataoff', " << LLC->DataOffset << ")\n"
+ << " ('datasize', " << LLC->DataSize << ")\n"
+ << " ('_data_regions', [\n";
+
+
+ unsigned NumRegions = LLC->DataSize / 8;
+ for (unsigned i = 0; i < NumRegions; ++i) {
+ InMemoryStruct<macho::DataInCodeTableEntry> DICE;
+ Obj.ReadDataInCodeTableEntry(LLC->DataOffset, i, DICE);
+ if (!DICE)
+ return Error("unable to read DataInCodeTableEntry");
+ outs() << " # DICE " << i << "\n"
+ << " ('offset', " << DICE->Offset << ")\n"
+ << " ('length', " << DICE->Length << ")\n"
+ << " ('kind', " << DICE->Kind << ")\n";
+ }
+
+ outs() <<" ])\n";
+
+ return 0;
+}
+
static int DumpLoadCommand(MachOObject &Obj, unsigned Index) {
const MachOObject::LoadCommandInfo &LCI = Obj.getLoadCommandInfo(Index);
@@ -358,6 +387,9 @@ static int DumpLoadCommand(MachOObject &Obj, unsigned Index) {
case macho::LCT_FunctionStarts:
Res = DumpLinkeditDataCommand(Obj, LCI);
break;
+ case macho::LCT_DataInCode:
+ Res = DumpDataInCodeDataCommand(Obj, LCI);
+ break;
default:
Warning("unknown load command: " + Twine(LCI.Command.Type));
break;
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index a5b0511..4ada7d1 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -13,12 +13,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/LLVMContext.h"
+#include "llvm/DebugInfo.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/CallGraphSCCPass.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/RegionPass.h"
@@ -104,15 +104,23 @@ StandardLinkOpts("std-link-opts",
static cl::opt<bool>
OptLevelO1("O1",
- cl::desc("Optimization level 1. Similar to llvm-gcc -O1"));
+ cl::desc("Optimization level 1. Similar to clang -O1"));
static cl::opt<bool>
OptLevelO2("O2",
- cl::desc("Optimization level 2. Similar to llvm-gcc -O2"));
+ cl::desc("Optimization level 2. Similar to clang -O2"));
+
+static cl::opt<bool>
+OptLevelOs("Os",
+ cl::desc("Like -O2 with extra optimizations for size. Similar to clang -Os"));
+
+static cl::opt<bool>
+OptLevelOz("Oz",
+ cl::desc("Like -Os but reduces code size further. Similar to clang -Oz"));
static cl::opt<bool>
OptLevelO3("O3",
- cl::desc("Optimization level 3. Similar to llvm-gcc -O3"));
+ cl::desc("Optimization level 3. Similar to clang -O3"));
static cl::opt<std::string>
TargetTriple("mtriple", cl::desc("Override target triple for module"));
@@ -409,16 +417,21 @@ static inline void addPass(PassManagerBase &PM, Pass *P) {
///
/// OptLevel - Optimization Level
static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
- unsigned OptLevel) {
+ unsigned OptLevel, unsigned SizeLevel) {
FPM.add(createVerifierPass()); // Verify that input is correct
PassManagerBuilder Builder;
Builder.OptLevel = OptLevel;
+ Builder.SizeLevel = SizeLevel;
if (DisableInline) {
// No inlining pass
} else if (OptLevel > 1) {
unsigned Threshold = 225;
+ if (SizeLevel == 1) // -Os
+ Threshold = 75;
+ else if (SizeLevel == 2) // -Oz
+ Threshold = 25;
if (OptLevel > 2)
Threshold = 275;
Builder.Inliner = createFunctionInliningPass(Threshold);
@@ -571,7 +584,7 @@ int main(int argc, char **argv) {
Passes.add(TD);
OwningPtr<FunctionPassManager> FPasses;
- if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
+ if (OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || OptLevelO3) {
FPasses.reset(new FunctionPassManager(M.get()));
if (TD)
FPasses->add(new TargetData(*TD));
@@ -617,17 +630,27 @@ int main(int argc, char **argv) {
}
if (OptLevelO1 && OptLevelO1.getPosition() < PassList.getPosition(i)) {
- AddOptimizationPasses(Passes, *FPasses, 1);
+ AddOptimizationPasses(Passes, *FPasses, 1, 0);
OptLevelO1 = false;
}
if (OptLevelO2 && OptLevelO2.getPosition() < PassList.getPosition(i)) {
- AddOptimizationPasses(Passes, *FPasses, 2);
+ AddOptimizationPasses(Passes, *FPasses, 2, 0);
OptLevelO2 = false;
}
+ if (OptLevelOs && OptLevelOs.getPosition() < PassList.getPosition(i)) {
+ AddOptimizationPasses(Passes, *FPasses, 2, 1);
+ OptLevelOs = false;
+ }
+
+ if (OptLevelOz && OptLevelOz.getPosition() < PassList.getPosition(i)) {
+ AddOptimizationPasses(Passes, *FPasses, 2, 2);
+ OptLevelOz = false;
+ }
+
if (OptLevelO3 && OptLevelO3.getPosition() < PassList.getPosition(i)) {
- AddOptimizationPasses(Passes, *FPasses, 3);
+ AddOptimizationPasses(Passes, *FPasses, 3, 0);
OptLevelO3 = false;
}
@@ -682,15 +705,21 @@ int main(int argc, char **argv) {
}
if (OptLevelO1)
- AddOptimizationPasses(Passes, *FPasses, 1);
+ AddOptimizationPasses(Passes, *FPasses, 1, 0);
if (OptLevelO2)
- AddOptimizationPasses(Passes, *FPasses, 2);
+ AddOptimizationPasses(Passes, *FPasses, 2, 0);
+
+ if (OptLevelOs)
+ AddOptimizationPasses(Passes, *FPasses, 2, 1);
+
+ if (OptLevelOz)
+ AddOptimizationPasses(Passes, *FPasses, 2, 2);
if (OptLevelO3)
- AddOptimizationPasses(Passes, *FPasses, 3);
+ AddOptimizationPasses(Passes, *FPasses, 3, 0);
- if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
+ if (OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || OptLevelO3) {
FPasses->doInitialization();
for (Module::iterator F = M->begin(), E = M->end(); F != E; ++F)
FPasses->run(*F);
diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index 89b8aa9..49d7e70 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp
@@ -171,6 +171,34 @@ TEST(APIntTest, i1) {
EXPECT_EQ(zero, neg_one.srem(one));
EXPECT_EQ(zero, neg_one.urem(one));
EXPECT_EQ(zero, one.srem(neg_one));
+
+ // sdivrem
+ {
+ APInt q(8, 0);
+ APInt r(8, 0);
+ APInt one(8, 1);
+ APInt two(8, 2);
+ APInt nine(8, 9);
+ APInt four(8, 4);
+
+ EXPECT_EQ(nine.srem(two), one);
+ EXPECT_EQ(nine.srem(-two), one);
+ EXPECT_EQ((-nine).srem(two), -one);
+ EXPECT_EQ((-nine).srem(-two), -one);
+
+ APInt::sdivrem(nine, two, q, r);
+ EXPECT_EQ(four, q);
+ EXPECT_EQ(one, r);
+ APInt::sdivrem(-nine, two, q, r);
+ EXPECT_EQ(-four, q);
+ EXPECT_EQ(-one, r);
+ APInt::sdivrem(nine, -two, q, r);
+ EXPECT_EQ(-four, q);
+ EXPECT_EQ(one, r);
+ APInt::sdivrem(-nine, -two, q, r);
+ EXPECT_EQ(four, q);
+ EXPECT_EQ(-one, r);
+ }
}
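The sign pattern being tested follows from truncated (C-style) division, which APInt::sdivrem mirrors: the quotient rounds toward zero and the remainder takes the sign of the dividend, so a == q*b + r holds in every sign combination. The same identity with plain ints, as a quick sketch:

#include <cassert>

int main() {
  int a = -9, b = 2;
  int q = a / b;           // -4 (rounds toward zero), not -5
  int r = a % b;           // -1 (takes the sign of the dividend)
  assert(a == q * b + r);  // division identity; holds for all sign mixes
  return 0;
}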
TEST(APIntTest, fromString) {
diff --git a/unittests/ADT/BitVectorTest.cpp b/unittests/ADT/BitVectorTest.cpp
index f733e13..d836036 100644
--- a/unittests/ADT/BitVectorTest.cpp
+++ b/unittests/ADT/BitVectorTest.cpp
@@ -11,14 +11,23 @@
#ifndef __ppc__
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "gtest/gtest.h"
using namespace llvm;
namespace {
-TEST(BitVectorTest, TrivialOperation) {
- BitVector Vec;
+// Test fixture
+template <typename T>
+class BitVectorTest : public ::testing::Test { };
+
+// Test both BitVector and SmallBitVector with the same suite of tests.
+typedef ::testing::Types<BitVector, SmallBitVector> BitVectorTestTypes;
+TYPED_TEST_CASE(BitVectorTest, BitVectorTestTypes);
+
+TYPED_TEST(BitVectorTest, TrivialOperation) {
+ TypeParam Vec;
EXPECT_EQ(0U, Vec.count());
EXPECT_EQ(0U, Vec.size());
EXPECT_FALSE(Vec.any());
@@ -42,7 +51,8 @@ TEST(BitVectorTest, TrivialOperation) {
EXPECT_FALSE(Vec.none());
EXPECT_FALSE(Vec.empty());
- BitVector Inv = ~Vec;
+ TypeParam Inv = Vec;
+ Inv.flip();
EXPECT_EQ(6U, Inv.count());
EXPECT_EQ(11U, Inv.size());
EXPECT_TRUE(Inv.any());
@@ -52,7 +62,7 @@ TEST(BitVectorTest, TrivialOperation) {
EXPECT_FALSE(Inv == Vec);
EXPECT_TRUE(Inv != Vec);
- Vec = ~Vec;
+ Vec.flip();
EXPECT_TRUE(Inv == Vec);
EXPECT_FALSE(Inv != Vec);
@@ -76,8 +86,8 @@ TEST(BitVectorTest, TrivialOperation) {
EXPECT_FALSE(Vec[56]);
Vec.resize(61, false);
- BitVector Copy = Vec;
- BitVector Alt(3, false);
+ TypeParam Copy = Vec;
+ TypeParam Alt(3, false);
Alt.resize(6, true);
std::swap(Alt, Vec);
EXPECT_TRUE(Copy == Alt);
@@ -131,7 +141,7 @@ TEST(BitVectorTest, TrivialOperation) {
EXPECT_TRUE(Vec.none());
EXPECT_FALSE(Vec.empty());
- Inv = ~BitVector();
+ Inv = TypeParam().flip();
EXPECT_EQ(0U, Inv.count());
EXPECT_EQ(0U, Inv.size());
EXPECT_FALSE(Inv.any());
@@ -148,13 +158,13 @@ TEST(BitVectorTest, TrivialOperation) {
EXPECT_TRUE(Vec.empty());
}
-TEST(BitVectorTest, CompoundAssignment) {
- BitVector A;
+TYPED_TEST(BitVectorTest, CompoundAssignment) {
+ TypeParam A;
A.resize(10);
A.set(4);
A.set(7);
- BitVector B;
+ TypeParam B;
B.resize(50);
B.set(5);
B.set(18);
@@ -187,8 +197,8 @@ TEST(BitVectorTest, CompoundAssignment) {
EXPECT_EQ(100U, A.size());
}
-TEST(BitVectorTest, ProxyIndex) {
- BitVector Vec(3);
+TYPED_TEST(BitVectorTest, ProxyIndex) {
+ TypeParam Vec(3);
EXPECT_TRUE(Vec.none());
Vec[0] = Vec[1] = Vec[2] = true;
EXPECT_EQ(Vec.size(), Vec.count());
@@ -196,8 +206,8 @@ TEST(BitVectorTest, ProxyIndex) {
EXPECT_TRUE(Vec.none());
}
-TEST(BitVectorTest, PortableBitMask) {
- BitVector A;
+TYPED_TEST(BitVectorTest, PortableBitMask) {
+ TypeParam A;
const uint32_t Mask1[] = { 0x80000000, 6, 5 };
A.resize(10);
@@ -242,6 +252,34 @@ TEST(BitVectorTest, PortableBitMask) {
A.clearBitsNotInMask(Mask1, 1);
EXPECT_EQ(64-4u, A.count());
}
-}
+TYPED_TEST(BitVectorTest, BinOps) {
+ TypeParam A;
+ TypeParam B;
+
+ A.resize(65);
+ EXPECT_FALSE(A.anyCommon(B));
+ EXPECT_FALSE(B.anyCommon(B));
+
+ B.resize(64);
+ A.set(64);
+ EXPECT_FALSE(A.anyCommon(B));
+ EXPECT_FALSE(B.anyCommon(A));
+
+ B.set(63);
+ EXPECT_FALSE(A.anyCommon(B));
+ EXPECT_FALSE(B.anyCommon(A));
+
+ A.set(63);
+ EXPECT_TRUE(A.anyCommon(B));
+ EXPECT_TRUE(B.anyCommon(A));
+
+ B.resize(70);
+ B.set(64);
+ B.reset(63);
+ A.resize(64);
+ EXPECT_FALSE(A.anyCommon(B));
+ EXPECT_FALSE(B.anyCommon(A));
+}
+}
#endif
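
This hunk folds SmallBitVector into the BitVector suite via gtest typed tests (the standalone SmallBitVectorTest.cpp is deleted later in this patch), and swaps ~Vec for flip() since only the mutating form is common to both classes. A self-contained sketch of the typed-test pattern, using standard containers purely as stand-ins:

    #include <deque>
    #include <vector>
    #include "gtest/gtest.h"

    template <typename T>
    class ContainerTest : public ::testing::Test {};  // empty fixture is fine

    // Register the list of types; each TYPED_TEST below runs once per type.
    typedef ::testing::Types<std::vector<int>, std::deque<int> > ContainerTypes;
    TYPED_TEST_CASE(ContainerTest, ContainerTypes);

    TYPED_TEST(ContainerTest, StartsEmpty) {
      TypeParam C;                 // TypeParam names the current type
      EXPECT_TRUE(C.empty());
      EXPECT_EQ(0u, C.size());
    }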
diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt
new file mode 100644
index 0000000..d272b09
--- /dev/null
+++ b/unittests/ADT/CMakeLists.txt
@@ -0,0 +1,33 @@
+set(LLVM_LINK_COMPONENTS
+ Support
+ )
+
+add_llvm_unittest(ADTTests
+ APFloatTest.cpp
+ APIntTest.cpp
+ BitVectorTest.cpp
+ DAGDeltaAlgorithmTest.cpp
+ DeltaAlgorithmTest.cpp
+ DenseMapTest.cpp
+ DenseSetTest.cpp
+ FoldingSet.cpp
+ HashingTest.cpp
+ ilistTest.cpp
+ ImmutableSetTest.cpp
+ IntEqClassesTest.cpp
+ IntervalMapTest.cpp
+ IntrusiveRefCntPtrTest.cpp
+ PackedVectorTest.cpp
+ SCCIteratorTest.cpp
+ SmallPtrSetTest.cpp
+ SmallStringTest.cpp
+ SmallVectorTest.cpp
+ SparseBitVectorTest.cpp
+ SparseSetTest.cpp
+ StringMapTest.cpp
+ StringRefTest.cpp
+ TinyPtrVectorTest.cpp
+ TripleTest.cpp
+ TwineTest.cpp
+ VariadicFunctionTest.cpp
+ )
diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp
index e0ee778..75e7006 100644
--- a/unittests/ADT/DenseMapTest.cpp
+++ b/unittests/ADT/DenseMapTest.cpp
@@ -9,170 +9,283 @@
#include "gtest/gtest.h"
#include "llvm/ADT/DenseMap.h"
+#include <map>
+#include <set>
using namespace llvm;
namespace {
-// Test fixture
-class DenseMapTest : public testing::Test {
-protected:
- DenseMap<uint32_t, uint32_t> uintMap;
- DenseMap<uint32_t *, uint32_t *> uintPtrMap;
- uint32_t dummyInt;
+uint32_t getTestKey(int i, uint32_t *) { return i; }
+uint32_t getTestValue(int i, uint32_t *) { return 42 + i; }
+
+uint32_t *getTestKey(int i, uint32_t **) {
+ static uint32_t dummy_arr1[8192];
+ assert(i < 8192 && "Only support 8192 dummy keys.");
+ return &dummy_arr1[i];
+}
+uint32_t *getTestValue(int i, uint32_t **) {
+ static uint32_t dummy_arr1[8192];
+ assert(i < 8192 && "Only support 8192 dummy keys.");
+ return &dummy_arr1[i];
+}
+
+/// \brief A test class that tries to check that construction and destruction
+/// occur correctly.
+class CtorTester {
+ static std::set<CtorTester *> Constructed;
+ int Value;
+
+public:
+ explicit CtorTester(int Value = 0) : Value(Value) {
+ EXPECT_TRUE(Constructed.insert(this).second);
+ }
+ CtorTester(uint32_t Value) : Value(Value) {
+ EXPECT_TRUE(Constructed.insert(this).second);
+ }
+ CtorTester(const CtorTester &Arg) : Value(Arg.Value) {
+ EXPECT_TRUE(Constructed.insert(this).second);
+ }
+ ~CtorTester() {
+ EXPECT_EQ(1u, Constructed.erase(this));
+ }
+ operator uint32_t() const { return Value; }
+
+ int getValue() const { return Value; }
+ bool operator==(const CtorTester &RHS) const { return Value == RHS.Value; }
};
-// Empty map tests
-TEST_F(DenseMapTest, EmptyIntMapTest) {
- // Size tests
- EXPECT_EQ(0u, uintMap.size());
- EXPECT_TRUE(uintMap.empty());
+std::set<CtorTester *> CtorTester::Constructed;
- // Iterator tests
- EXPECT_TRUE(uintMap.begin() == uintMap.end());
+struct CtorTesterMapInfo {
+ static inline CtorTester getEmptyKey() { return CtorTester(-1); }
+ static inline CtorTester getTombstoneKey() { return CtorTester(-2); }
+ static unsigned getHashValue(const CtorTester &Val) {
+ return Val.getValue() * 37u;
+ }
+ static bool isEqual(const CtorTester &LHS, const CtorTester &RHS) {
+ return LHS == RHS;
+ }
+};
- // Lookup tests
- EXPECT_FALSE(uintMap.count(0u));
- EXPECT_TRUE(uintMap.find(0u) == uintMap.end());
- EXPECT_EQ(0u, uintMap.lookup(0u));
-}
+CtorTester getTestKey(int i, CtorTester *) { return CtorTester(i); }
+CtorTester getTestValue(int i, CtorTester *) { return CtorTester(42 + i); }
+
+// Test fixture, with helper functions implemented by forwarding to global
+// function overloads selected by component types of the type parameter. This
+// allows all of the map implementations to be tested with shared
+// implementations of helper routines.
+template <typename T>
+class DenseMapTest : public ::testing::Test {
+protected:
+ T Map;
+
+ static typename T::key_type *const dummy_key_ptr;
+ static typename T::mapped_type *const dummy_value_ptr;
+
+ typename T::key_type getKey(int i = 0) {
+ return getTestKey(i, dummy_key_ptr);
+ }
+ typename T::mapped_type getValue(int i = 0) {
+ return getTestValue(i, dummy_value_ptr);
+ }
+};
-// Empty map tests for pointer map
-TEST_F(DenseMapTest, EmptyPtrMapTest) {
+template <typename T>
+typename T::key_type *const DenseMapTest<T>::dummy_key_ptr = 0;
+template <typename T>
+typename T::mapped_type *const DenseMapTest<T>::dummy_value_ptr = 0;
+
+// Register these types for testing.
+typedef ::testing::Types<DenseMap<uint32_t, uint32_t>,
+ DenseMap<uint32_t *, uint32_t *>,
+ DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>,
+ SmallDenseMap<uint32_t, uint32_t>,
+ SmallDenseMap<uint32_t *, uint32_t *>,
+ SmallDenseMap<CtorTester, CtorTester, 4,
+ CtorTesterMapInfo>
+ > DenseMapTestTypes;
+TYPED_TEST_CASE(DenseMapTest, DenseMapTestTypes);
+
+// Empty map tests
+TYPED_TEST(DenseMapTest, EmptyIntMapTest) {
// Size tests
- EXPECT_EQ(0u, uintPtrMap.size());
- EXPECT_TRUE(uintPtrMap.empty());
+ EXPECT_EQ(0u, this->Map.size());
+ EXPECT_TRUE(this->Map.empty());
// Iterator tests
- EXPECT_TRUE(uintPtrMap.begin() == uintPtrMap.end());
+ EXPECT_TRUE(this->Map.begin() == this->Map.end());
// Lookup tests
- EXPECT_FALSE(uintPtrMap.count(&dummyInt));
- EXPECT_TRUE(uintPtrMap.find(&dummyInt) == uintPtrMap.begin());
- EXPECT_EQ(0, uintPtrMap.lookup(&dummyInt));
+ EXPECT_FALSE(this->Map.count(this->getKey()));
+ EXPECT_TRUE(this->Map.find(this->getKey()) == this->Map.end());
+#ifndef _MSC_VER
+ EXPECT_EQ(typename TypeParam::mapped_type(),
+ this->Map.lookup(this->getKey()));
+#else
+ // MSVC, at least old versions, cannot parse the typename to disambiguate
+ // TypeParam::mapped_type as a type. However, because MSVC doesn't implement
+ // two-phase name lookup, it also doesn't require the typename. Deal with
+ // this mutual incompatibility through specialized code.
+ EXPECT_EQ(TypeParam::mapped_type(),
+ this->Map.lookup(this->getKey()));
+#endif
}
// Constant map tests
-TEST_F(DenseMapTest, ConstEmptyMapTest) {
- const DenseMap<uint32_t, uint32_t> & constUintMap = uintMap;
- const DenseMap<uint32_t *, uint32_t *> & constUintPtrMap = uintPtrMap;
- EXPECT_EQ(0u, constUintMap.size());
- EXPECT_EQ(0u, constUintPtrMap.size());
- EXPECT_TRUE(constUintMap.empty());
- EXPECT_TRUE(constUintPtrMap.empty());
- EXPECT_TRUE(constUintMap.begin() == constUintMap.end());
- EXPECT_TRUE(constUintPtrMap.begin() == constUintPtrMap.end());
+TYPED_TEST(DenseMapTest, ConstEmptyMapTest) {
+ const TypeParam &ConstMap = this->Map;
+ EXPECT_EQ(0u, ConstMap.size());
+ EXPECT_TRUE(ConstMap.empty());
+ EXPECT_TRUE(ConstMap.begin() == ConstMap.end());
}
// A map with a single entry
-TEST_F(DenseMapTest, SingleEntryMapTest) {
- uintMap[0] = 1;
+TYPED_TEST(DenseMapTest, SingleEntryMapTest) {
+ this->Map[this->getKey()] = this->getValue();
// Size tests
- EXPECT_EQ(1u, uintMap.size());
- EXPECT_FALSE(uintMap.begin() == uintMap.end());
- EXPECT_FALSE(uintMap.empty());
+ EXPECT_EQ(1u, this->Map.size());
+ EXPECT_FALSE(this->Map.begin() == this->Map.end());
+ EXPECT_FALSE(this->Map.empty());
// Iterator tests
- DenseMap<uint32_t, uint32_t>::iterator it = uintMap.begin();
- EXPECT_EQ(0u, it->first);
- EXPECT_EQ(1u, it->second);
+ typename TypeParam::iterator it = this->Map.begin();
+ EXPECT_EQ(this->getKey(), it->first);
+ EXPECT_EQ(this->getValue(), it->second);
++it;
- EXPECT_TRUE(it == uintMap.end());
+ EXPECT_TRUE(it == this->Map.end());
// Lookup tests
- EXPECT_TRUE(uintMap.count(0u));
- EXPECT_TRUE(uintMap.find(0u) == uintMap.begin());
- EXPECT_EQ(1u, uintMap.lookup(0u));
- EXPECT_EQ(1u, uintMap[0]);
+ EXPECT_TRUE(this->Map.count(this->getKey()));
+ EXPECT_TRUE(this->Map.find(this->getKey()) == this->Map.begin());
+ EXPECT_EQ(this->getValue(), this->Map.lookup(this->getKey()));
+ EXPECT_EQ(this->getValue(), this->Map[this->getKey()]);
}
// Test clear() method
-TEST_F(DenseMapTest, ClearTest) {
- uintMap[0] = 1;
- uintMap.clear();
+TYPED_TEST(DenseMapTest, ClearTest) {
+ this->Map[this->getKey()] = this->getValue();
+ this->Map.clear();
- EXPECT_EQ(0u, uintMap.size());
- EXPECT_TRUE(uintMap.empty());
- EXPECT_TRUE(uintMap.begin() == uintMap.end());
+ EXPECT_EQ(0u, this->Map.size());
+ EXPECT_TRUE(this->Map.empty());
+ EXPECT_TRUE(this->Map.begin() == this->Map.end());
}
// Test erase(iterator) method
-TEST_F(DenseMapTest, EraseTest) {
- uintMap[0] = 1;
- uintMap.erase(uintMap.begin());
+TYPED_TEST(DenseMapTest, EraseTest) {
+ this->Map[this->getKey()] = this->getValue();
+ this->Map.erase(this->Map.begin());
- EXPECT_EQ(0u, uintMap.size());
- EXPECT_TRUE(uintMap.empty());
- EXPECT_TRUE(uintMap.begin() == uintMap.end());
+ EXPECT_EQ(0u, this->Map.size());
+ EXPECT_TRUE(this->Map.empty());
+ EXPECT_TRUE(this->Map.begin() == this->Map.end());
}
// Test erase(value) method
-TEST_F(DenseMapTest, EraseTest2) {
- uintMap[0] = 1;
- uintMap.erase(0);
+TYPED_TEST(DenseMapTest, EraseTest2) {
+ this->Map[this->getKey()] = this->getValue();
+ this->Map.erase(this->getKey());
- EXPECT_EQ(0u, uintMap.size());
- EXPECT_TRUE(uintMap.empty());
- EXPECT_TRUE(uintMap.begin() == uintMap.end());
+ EXPECT_EQ(0u, this->Map.size());
+ EXPECT_TRUE(this->Map.empty());
+ EXPECT_TRUE(this->Map.begin() == this->Map.end());
}
// Test insert() method
-TEST_F(DenseMapTest, InsertTest) {
- uintMap.insert(std::make_pair(0u, 1u));
- EXPECT_EQ(1u, uintMap.size());
- EXPECT_EQ(1u, uintMap[0]);
+TYPED_TEST(DenseMapTest, InsertTest) {
+ this->Map.insert(std::make_pair(this->getKey(), this->getValue()));
+ EXPECT_EQ(1u, this->Map.size());
+ EXPECT_EQ(this->getValue(), this->Map[this->getKey()]);
}
// Test copy constructor method
-TEST_F(DenseMapTest, CopyConstructorTest) {
- uintMap[0] = 1;
- DenseMap<uint32_t, uint32_t> copyMap(uintMap);
+TYPED_TEST(DenseMapTest, CopyConstructorTest) {
+ this->Map[this->getKey()] = this->getValue();
+ TypeParam copyMap(this->Map);
EXPECT_EQ(1u, copyMap.size());
- EXPECT_EQ(1u, copyMap[0]);
+ EXPECT_EQ(this->getValue(), copyMap[this->getKey()]);
}
// Test assignment operator method
-TEST_F(DenseMapTest, AssignmentTest) {
- uintMap[0] = 1;
- DenseMap<uint32_t, uint32_t> copyMap = uintMap;
+TYPED_TEST(DenseMapTest, AssignmentTest) {
+ this->Map[this->getKey()] = this->getValue();
+ TypeParam copyMap = this->Map;
EXPECT_EQ(1u, copyMap.size());
- EXPECT_EQ(1u, copyMap[0]);
+ EXPECT_EQ(this->getValue(), copyMap[this->getKey()]);
+}
+
+// Test swap method
+TYPED_TEST(DenseMapTest, SwapTest) {
+ this->Map[this->getKey()] = this->getValue();
+ TypeParam otherMap;
+
+ this->Map.swap(otherMap);
+ EXPECT_EQ(0u, this->Map.size());
+ EXPECT_TRUE(this->Map.empty());
+ EXPECT_EQ(1u, otherMap.size());
+ EXPECT_EQ(this->getValue(), otherMap[this->getKey()]);
+
+ this->Map.swap(otherMap);
+ EXPECT_EQ(0u, otherMap.size());
+ EXPECT_TRUE(otherMap.empty());
+ EXPECT_EQ(1u, this->Map.size());
+ EXPECT_EQ(this->getValue(), this->Map[this->getKey()]);
+
+ // Make this more interesting by inserting 100 numbers into the map.
+ for (int i = 0; i < 100; ++i)
+ this->Map[this->getKey(i)] = this->getValue(i);
+
+ this->Map.swap(otherMap);
+ EXPECT_EQ(0u, this->Map.size());
+ EXPECT_TRUE(this->Map.empty());
+ EXPECT_EQ(100u, otherMap.size());
+ for (int i = 0; i < 100; ++i)
+ EXPECT_EQ(this->getValue(i), otherMap[this->getKey(i)]);
+
+ this->Map.swap(otherMap);
+ EXPECT_EQ(0u, otherMap.size());
+ EXPECT_TRUE(otherMap.empty());
+ EXPECT_EQ(100u, this->Map.size());
+ for (int i = 0; i < 100; ++i)
+ EXPECT_EQ(this->getValue(i), this->Map[this->getKey(i)]);
}
// A more complex iteration test
-TEST_F(DenseMapTest, IterationTest) {
+TYPED_TEST(DenseMapTest, IterationTest) {
bool visited[100];
+ std::map<typename TypeParam::key_type, unsigned> visitedIndex;
// Insert 100 numbers into the map
for (int i = 0; i < 100; ++i) {
visited[i] = false;
- uintMap[i] = 3;
+ visitedIndex[this->getKey(i)] = i;
+
+ this->Map[this->getKey(i)] = this->getValue(i);
}
// Iterate over all numbers and mark each one found.
- for (DenseMap<uint32_t, uint32_t>::iterator it = uintMap.begin();
- it != uintMap.end(); ++it) {
- visited[it->first] = true;
- }
+ for (typename TypeParam::iterator it = this->Map.begin();
+ it != this->Map.end(); ++it)
+ visited[visitedIndex[it->first]] = true;
// Ensure every number was visited.
- for (int i = 0; i < 100; ++i) {
+ for (int i = 0; i < 100; ++i)
ASSERT_TRUE(visited[i]) << "Entry #" << i << " was never visited";
- }
}
// const_iterator test
-TEST_F(DenseMapTest, ConstIteratorTest) {
+TYPED_TEST(DenseMapTest, ConstIteratorTest) {
// Check conversion from iterator to const_iterator.
- DenseMap<uint32_t, uint32_t>::iterator it = uintMap.begin();
- DenseMap<uint32_t, uint32_t>::const_iterator cit(it);
+ typename TypeParam::iterator it = this->Map.begin();
+ typename TypeParam::const_iterator cit(it);
EXPECT_TRUE(it == cit);
// Check copying of const_iterators.
- DenseMap<uint32_t, uint32_t>::const_iterator cit2(cit);
+ typename TypeParam::const_iterator cit2(cit);
EXPECT_TRUE(cit == cit2);
}
@@ -194,7 +307,7 @@ struct TestDenseMapInfo {
};
// find_as() tests
-TEST_F(DenseMapTest, FindAsTest) {
+TEST(DenseMapCustomTest, FindAsTest) {
DenseMap<unsigned, unsigned, TestDenseMapInfo> map;
map[0] = 1;
map[1] = 2;
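
The rewritten DenseMap fixture selects its getKey/getValue helpers by overload resolution on a null tag pointer whose static type encodes the map's key and mapped types; the tag is never dereferenced. That is what lets one suite drive DenseMap and SmallDenseMap over uint32_t, pointer, and CtorTester instantiations. A compact, hypothetical illustration of the tag-dispatch idiom:

    // Overloads chosen purely by the (unused) pointer parameter's type:
    int         makeTestValue(int i, int *)         { return i; }
    const char *makeTestValue(int i, const char **) { return i ? "x" : "y"; }

    template <typename T> struct Fixture {
      static T *const tag;                  // never dereferenced; value is 0
      T get(int i) { return makeTestValue(i, tag); }
    };
    template <typename T> T *const Fixture<T>::tag = 0;

    // Fixture<int>().get(7) == 7; Fixture<const char *>().get(1) == "x".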
diff --git a/unittests/ADT/HashingTest.cpp b/unittests/ADT/HashingTest.cpp
index b148f14..1b3d061 100644
--- a/unittests/ADT/HashingTest.cpp
+++ b/unittests/ADT/HashingTest.cpp
@@ -345,7 +345,7 @@ TEST(HashingTest, HashCombineBasicTest) {
EXPECT_EQ(hash_combine_range(arr1, arr1 + 6),
hash_combine(i1, i2, i3, i4, i5, i6));
- // Hashing a sequence of heterogenous types which *happen* to all produce the
+ // Hashing a sequence of heterogeneous types which *happen* to all produce the
// same data for hashing produces the same as a range-based hash of the
// fundamental values.
const size_t s1 = 1024, s2 = 8888, s3 = 9000000;
diff --git a/unittests/ADT/SmallBitVectorTest.cpp b/unittests/ADT/SmallBitVectorTest.cpp
deleted file mode 100644
index c4dda9e..0000000
--- a/unittests/ADT/SmallBitVectorTest.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//===- llvm/unittest/ADT/SmallBitVectorTest.cpp - SmallBitVector tests ----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallBitVector.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-namespace {
-
-TEST(SmallBitVectorTest, TrivialOperation) {
- SmallBitVector Vec;
- EXPECT_EQ(0U, Vec.count());
- EXPECT_EQ(0U, Vec.size());
- EXPECT_FALSE(Vec.any());
- EXPECT_TRUE(Vec.all());
- EXPECT_TRUE(Vec.none());
- EXPECT_TRUE(Vec.empty());
-
- Vec.resize(5, true);
- EXPECT_EQ(5U, Vec.count());
- EXPECT_EQ(5U, Vec.size());
- EXPECT_TRUE(Vec.any());
- EXPECT_TRUE(Vec.all());
- EXPECT_FALSE(Vec.none());
- EXPECT_FALSE(Vec.empty());
-
- Vec.resize(11);
- EXPECT_EQ(5U, Vec.count());
- EXPECT_EQ(11U, Vec.size());
- EXPECT_TRUE(Vec.any());
- EXPECT_FALSE(Vec.all());
- EXPECT_FALSE(Vec.none());
- EXPECT_FALSE(Vec.empty());
-
- SmallBitVector Inv = ~Vec;
- EXPECT_EQ(6U, Inv.count());
- EXPECT_EQ(11U, Inv.size());
- EXPECT_TRUE(Inv.any());
- EXPECT_FALSE(Inv.all());
- EXPECT_FALSE(Inv.none());
- EXPECT_FALSE(Inv.empty());
-
- EXPECT_FALSE(Inv == Vec);
- EXPECT_TRUE(Inv != Vec);
- Vec = ~Vec;
- EXPECT_TRUE(Inv == Vec);
- EXPECT_FALSE(Inv != Vec);
-
- // Add some "interesting" data to Vec.
- Vec.resize(23, true);
- Vec.resize(25, false);
- Vec.resize(26, true);
- Vec.resize(29, false);
- Vec.resize(33, true);
- Vec.resize(57, false);
- unsigned Count = 0;
- for (unsigned i = Vec.find_first(); i != -1u; i = Vec.find_next(i)) {
- ++Count;
- EXPECT_TRUE(Vec[i]);
- EXPECT_TRUE(Vec.test(i));
- }
- EXPECT_EQ(Count, Vec.count());
- EXPECT_EQ(Count, 23u);
- EXPECT_FALSE(Vec[0]);
- EXPECT_TRUE(Vec[32]);
- EXPECT_FALSE(Vec[56]);
- Vec.resize(61, false);
-
- SmallBitVector Copy = Vec;
- SmallBitVector Alt(3, false);
- Alt.resize(6, true);
- std::swap(Alt, Vec);
- EXPECT_TRUE(Copy == Alt);
- EXPECT_TRUE(Vec.size() == 6);
- EXPECT_TRUE(Vec.count() == 3);
- EXPECT_TRUE(Vec.find_first() == 3);
- std::swap(Copy, Vec);
-
- // Add some more "interesting" data.
- Vec.resize(68, true);
- Vec.resize(78, false);
- Vec.resize(89, true);
- Vec.resize(90, false);
- Vec.resize(91, true);
- Vec.resize(130, false);
- Count = 0;
- for (unsigned i = Vec.find_first(); i != -1u; i = Vec.find_next(i)) {
- ++Count;
- EXPECT_TRUE(Vec[i]);
- EXPECT_TRUE(Vec.test(i));
- }
- EXPECT_EQ(Count, Vec.count());
- EXPECT_EQ(Count, 42u);
- EXPECT_FALSE(Vec[0]);
- EXPECT_TRUE(Vec[32]);
- EXPECT_FALSE(Vec[60]);
- EXPECT_FALSE(Vec[129]);
-
- Vec.flip(60);
- EXPECT_TRUE(Vec[60]);
- EXPECT_EQ(Count + 1, Vec.count());
- Vec.flip(60);
- EXPECT_FALSE(Vec[60]);
- EXPECT_EQ(Count, Vec.count());
-
- Vec.reset(32);
- EXPECT_FALSE(Vec[32]);
- EXPECT_EQ(Count - 1, Vec.count());
- Vec.set(32);
- EXPECT_TRUE(Vec[32]);
- EXPECT_EQ(Count, Vec.count());
-
- Vec.flip();
- EXPECT_EQ(Vec.size() - Count, Vec.count());
-
- Vec.reset();
- EXPECT_EQ(0U, Vec.count());
- EXPECT_EQ(130U, Vec.size());
- EXPECT_FALSE(Vec.any());
- EXPECT_FALSE(Vec.all());
- EXPECT_TRUE(Vec.none());
- EXPECT_FALSE(Vec.empty());
-
- Inv = ~SmallBitVector();
- EXPECT_EQ(0U, Inv.count());
- EXPECT_EQ(0U, Inv.size());
- EXPECT_FALSE(Inv.any());
- EXPECT_TRUE(Inv.all());
- EXPECT_TRUE(Inv.none());
- EXPECT_TRUE(Inv.empty());
-
- Vec.clear();
- EXPECT_EQ(0U, Vec.count());
- EXPECT_EQ(0U, Vec.size());
- EXPECT_FALSE(Vec.any());
- EXPECT_TRUE(Vec.all());
- EXPECT_TRUE(Vec.none());
- EXPECT_TRUE(Vec.empty());
-}
-
-TEST(SmallBitVectorTest, CompoundAssignment) {
- SmallBitVector A;
- A.resize(10);
- A.set(4);
- A.set(7);
-
- SmallBitVector B;
- B.resize(50);
- B.set(5);
- B.set(18);
-
- A |= B;
- EXPECT_TRUE(A.test(4));
- EXPECT_TRUE(A.test(5));
- EXPECT_TRUE(A.test(7));
- EXPECT_TRUE(A.test(18));
- EXPECT_EQ(4U, A.count());
- EXPECT_EQ(50U, A.size());
-
- B.resize(10);
- B.set();
- B.reset(2);
- B.reset(7);
- A &= B;
- EXPECT_FALSE(A.test(2));
- EXPECT_FALSE(A.test(7));
- EXPECT_EQ(2U, A.count());
- EXPECT_EQ(50U, A.size());
-
- B.resize(100);
- B.set();
-
- A ^= B;
- EXPECT_TRUE(A.test(2));
- EXPECT_TRUE(A.test(7));
- EXPECT_EQ(98U, A.count());
- EXPECT_EQ(100U, A.size());
-}
-
-TEST(SmallBitVectorTest, ProxyIndex) {
- SmallBitVector Vec(3);
- EXPECT_TRUE(Vec.none());
- Vec[0] = Vec[1] = Vec[2] = true;
- EXPECT_EQ(Vec.size(), Vec.count());
- Vec[2] = Vec[1] = Vec[0] = false;
- EXPECT_TRUE(Vec.none());
-}
-
-}
diff --git a/unittests/ADT/SmallVectorTest.cpp b/unittests/ADT/SmallVectorTest.cpp
index d5bfe76..7fd71f5 100644
--- a/unittests/ADT/SmallVectorTest.cpp
+++ b/unittests/ADT/SmallVectorTest.cpp
@@ -88,18 +88,17 @@ int Constructable::numDestructorCalls;
int Constructable::numAssignmentCalls;
// Test fixture class
+template <typename VectorT>
class SmallVectorTest : public testing::Test {
protected:
- typedef SmallVector<Constructable, 4> VectorType;
-
- VectorType theVector;
- VectorType otherVector;
+ VectorT theVector;
+ VectorT otherVector;
void SetUp() {
Constructable::reset();
}
- void assertEmpty(VectorType & v) {
+ void assertEmpty(VectorT & v) {
// Size tests
EXPECT_EQ(0u, v.size());
EXPECT_TRUE(v.empty());
@@ -109,7 +108,7 @@ protected:
}
// Assert that theVector contains the specified values, in order.
- void assertValuesInOrder(VectorType & v, size_t size, ...) {
+ void assertValuesInOrder(VectorT & v, size_t size, ...) {
EXPECT_EQ(size, v.size());
va_list ap;
@@ -123,267 +122,327 @@ protected:
}
// Generate a sequence of values to initialize the vector.
- void makeSequence(VectorType & v, int start, int end) {
+ void makeSequence(VectorT & v, int start, int end) {
for (int i = start; i <= end; ++i) {
v.push_back(Constructable(i));
}
}
};
+typedef ::testing::Types<SmallVector<Constructable, 0>,
+ SmallVector<Constructable, 1>,
+ SmallVector<Constructable, 2>,
+ SmallVector<Constructable, 4>
+ > SmallVectorTestTypes;
+TYPED_TEST_CASE(SmallVectorTest, SmallVectorTestTypes);
+
// New vector test.
-TEST_F(SmallVectorTest, EmptyVectorTest) {
+TYPED_TEST(SmallVectorTest, EmptyVectorTest) {
SCOPED_TRACE("EmptyVectorTest");
- assertEmpty(theVector);
- EXPECT_TRUE(theVector.rbegin() == theVector.rend());
+ this->assertEmpty(this->theVector);
+ EXPECT_TRUE(this->theVector.rbegin() == this->theVector.rend());
EXPECT_EQ(0, Constructable::getNumConstructorCalls());
EXPECT_EQ(0, Constructable::getNumDestructorCalls());
}
// Simple insertions and deletions.
-TEST_F(SmallVectorTest, PushPopTest) {
+TYPED_TEST(SmallVectorTest, PushPopTest) {
SCOPED_TRACE("PushPopTest");
+ // Track whether the vector will potentially have to grow.
+ bool RequiresGrowth = this->theVector.capacity() < 3;
+
// Push an element
- theVector.push_back(Constructable(1));
+ this->theVector.push_back(Constructable(1));
// Size tests
- assertValuesInOrder(theVector, 1u, 1);
- EXPECT_FALSE(theVector.begin() == theVector.end());
- EXPECT_FALSE(theVector.empty());
+ this->assertValuesInOrder(this->theVector, 1u, 1);
+ EXPECT_FALSE(this->theVector.begin() == this->theVector.end());
+ EXPECT_FALSE(this->theVector.empty());
// Push another element
- theVector.push_back(Constructable(2));
- assertValuesInOrder(theVector, 2u, 1, 2);
+ this->theVector.push_back(Constructable(2));
+ this->assertValuesInOrder(this->theVector, 2u, 1, 2);
// Insert at beginning
- theVector.insert(theVector.begin(), theVector[1]);
- assertValuesInOrder(theVector, 3u, 2, 1, 2);
+ this->theVector.insert(this->theVector.begin(), this->theVector[1]);
+ this->assertValuesInOrder(this->theVector, 3u, 2, 1, 2);
// Pop one element
- theVector.pop_back();
- assertValuesInOrder(theVector, 2u, 2, 1);
+ this->theVector.pop_back();
+ this->assertValuesInOrder(this->theVector, 2u, 2, 1);
// Pop remaining elements
- theVector.pop_back();
- theVector.pop_back();
- assertEmpty(theVector);
+ this->theVector.pop_back();
+ this->theVector.pop_back();
+ this->assertEmpty(this->theVector);
// Check number of constructor calls. Should be 2 for each list element,
// one for the argument to push_back, one for the argument to insert,
// and one for the list element itself.
- EXPECT_EQ(5, Constructable::getNumConstructorCalls());
- EXPECT_EQ(5, Constructable::getNumDestructorCalls());
+ if (!RequiresGrowth) {
+ EXPECT_EQ(5, Constructable::getNumConstructorCalls());
+ EXPECT_EQ(5, Constructable::getNumDestructorCalls());
+ } else {
+ // If we had to grow the vector, these only have a lower bound, but should
+ // always be equal.
+ EXPECT_LE(5, Constructable::getNumConstructorCalls());
+ EXPECT_EQ(Constructable::getNumConstructorCalls(),
+ Constructable::getNumDestructorCalls());
+ }
}
// Clear test.
-TEST_F(SmallVectorTest, ClearTest) {
+TYPED_TEST(SmallVectorTest, ClearTest) {
SCOPED_TRACE("ClearTest");
- makeSequence(theVector, 1, 2);
- theVector.clear();
+ this->theVector.reserve(2);
+ this->makeSequence(this->theVector, 1, 2);
+ this->theVector.clear();
- assertEmpty(theVector);
+ this->assertEmpty(this->theVector);
EXPECT_EQ(4, Constructable::getNumConstructorCalls());
EXPECT_EQ(4, Constructable::getNumDestructorCalls());
}
// Resize smaller test.
-TEST_F(SmallVectorTest, ResizeShrinkTest) {
+TYPED_TEST(SmallVectorTest, ResizeShrinkTest) {
SCOPED_TRACE("ResizeShrinkTest");
- makeSequence(theVector, 1, 3);
- theVector.resize(1);
+ this->theVector.reserve(3);
+ this->makeSequence(this->theVector, 1, 3);
+ this->theVector.resize(1);
- assertValuesInOrder(theVector, 1u, 1);
+ this->assertValuesInOrder(this->theVector, 1u, 1);
EXPECT_EQ(6, Constructable::getNumConstructorCalls());
EXPECT_EQ(5, Constructable::getNumDestructorCalls());
}
// Resize bigger test.
-TEST_F(SmallVectorTest, ResizeGrowTest) {
+TYPED_TEST(SmallVectorTest, ResizeGrowTest) {
SCOPED_TRACE("ResizeGrowTest");
- theVector.resize(2);
+ this->theVector.resize(2);
// The extra constructor/destructor calls come from the temporary object used
// to initialize the contents of the resized array (via copy construction).
EXPECT_EQ(3, Constructable::getNumConstructorCalls());
EXPECT_EQ(1, Constructable::getNumDestructorCalls());
- EXPECT_EQ(2u, theVector.size());
+ EXPECT_EQ(2u, this->theVector.size());
}
// Resize with fill value.
-TEST_F(SmallVectorTest, ResizeFillTest) {
+TYPED_TEST(SmallVectorTest, ResizeFillTest) {
SCOPED_TRACE("ResizeFillTest");
- theVector.resize(3, Constructable(77));
- assertValuesInOrder(theVector, 3u, 77, 77, 77);
+ this->theVector.resize(3, Constructable(77));
+ this->assertValuesInOrder(this->theVector, 3u, 77, 77, 77);
}
// Overflow past fixed size.
-TEST_F(SmallVectorTest, OverflowTest) {
+TYPED_TEST(SmallVectorTest, OverflowTest) {
SCOPED_TRACE("OverflowTest");
// Push more elements than the fixed size.
- makeSequence(theVector, 1, 10);
+ this->makeSequence(this->theVector, 1, 10);
// Test size and values.
- EXPECT_EQ(10u, theVector.size());
+ EXPECT_EQ(10u, this->theVector.size());
for (int i = 0; i < 10; ++i) {
- EXPECT_EQ(i+1, theVector[i].getValue());
+ EXPECT_EQ(i+1, this->theVector[i].getValue());
}
// Now resize back to fixed size.
- theVector.resize(1);
+ this->theVector.resize(1);
- assertValuesInOrder(theVector, 1u, 1);
+ this->assertValuesInOrder(this->theVector, 1u, 1);
}
// Iteration tests.
-TEST_F(SmallVectorTest, IterationTest) {
- makeSequence(theVector, 1, 2);
+TYPED_TEST(SmallVectorTest, IterationTest) {
+ this->makeSequence(this->theVector, 1, 2);
// Forward Iteration
- VectorType::iterator it = theVector.begin();
- EXPECT_TRUE(*it == theVector.front());
- EXPECT_TRUE(*it == theVector[0]);
+ typename TypeParam::iterator it = this->theVector.begin();
+ EXPECT_TRUE(*it == this->theVector.front());
+ EXPECT_TRUE(*it == this->theVector[0]);
EXPECT_EQ(1, it->getValue());
++it;
- EXPECT_TRUE(*it == theVector[1]);
- EXPECT_TRUE(*it == theVector.back());
+ EXPECT_TRUE(*it == this->theVector[1]);
+ EXPECT_TRUE(*it == this->theVector.back());
EXPECT_EQ(2, it->getValue());
++it;
- EXPECT_TRUE(it == theVector.end());
+ EXPECT_TRUE(it == this->theVector.end());
--it;
- EXPECT_TRUE(*it == theVector[1]);
+ EXPECT_TRUE(*it == this->theVector[1]);
EXPECT_EQ(2, it->getValue());
--it;
- EXPECT_TRUE(*it == theVector[0]);
+ EXPECT_TRUE(*it == this->theVector[0]);
EXPECT_EQ(1, it->getValue());
// Reverse Iteration
- VectorType::reverse_iterator rit = theVector.rbegin();
- EXPECT_TRUE(*rit == theVector[1]);
+ typename TypeParam::reverse_iterator rit = this->theVector.rbegin();
+ EXPECT_TRUE(*rit == this->theVector[1]);
EXPECT_EQ(2, rit->getValue());
++rit;
- EXPECT_TRUE(*rit == theVector[0]);
+ EXPECT_TRUE(*rit == this->theVector[0]);
EXPECT_EQ(1, rit->getValue());
++rit;
- EXPECT_TRUE(rit == theVector.rend());
+ EXPECT_TRUE(rit == this->theVector.rend());
--rit;
- EXPECT_TRUE(*rit == theVector[0]);
+ EXPECT_TRUE(*rit == this->theVector[0]);
EXPECT_EQ(1, rit->getValue());
--rit;
- EXPECT_TRUE(*rit == theVector[1]);
+ EXPECT_TRUE(*rit == this->theVector[1]);
EXPECT_EQ(2, rit->getValue());
}
// Swap test.
-TEST_F(SmallVectorTest, SwapTest) {
+TYPED_TEST(SmallVectorTest, SwapTest) {
SCOPED_TRACE("SwapTest");
- makeSequence(theVector, 1, 2);
- std::swap(theVector, otherVector);
+ this->makeSequence(this->theVector, 1, 2);
+ std::swap(this->theVector, this->otherVector);
- assertEmpty(theVector);
- assertValuesInOrder(otherVector, 2u, 1, 2);
+ this->assertEmpty(this->theVector);
+ this->assertValuesInOrder(this->otherVector, 2u, 1, 2);
}
// Append test
-TEST_F(SmallVectorTest, AppendTest) {
+TYPED_TEST(SmallVectorTest, AppendTest) {
SCOPED_TRACE("AppendTest");
- makeSequence(otherVector, 2, 3);
+ this->makeSequence(this->otherVector, 2, 3);
- theVector.push_back(Constructable(1));
- theVector.append(otherVector.begin(), otherVector.end());
+ this->theVector.push_back(Constructable(1));
+ this->theVector.append(this->otherVector.begin(), this->otherVector.end());
- assertValuesInOrder(theVector, 3u, 1, 2, 3);
+ this->assertValuesInOrder(this->theVector, 3u, 1, 2, 3);
}
// Append repeated test
-TEST_F(SmallVectorTest, AppendRepeatedTest) {
+TYPED_TEST(SmallVectorTest, AppendRepeatedTest) {
SCOPED_TRACE("AppendRepeatedTest");
- theVector.push_back(Constructable(1));
- theVector.append(2, Constructable(77));
- assertValuesInOrder(theVector, 3u, 1, 77, 77);
+ this->theVector.push_back(Constructable(1));
+ this->theVector.append(2, Constructable(77));
+ this->assertValuesInOrder(this->theVector, 3u, 1, 77, 77);
}
// Assign test
-TEST_F(SmallVectorTest, AssignTest) {
+TYPED_TEST(SmallVectorTest, AssignTest) {
SCOPED_TRACE("AssignTest");
- theVector.push_back(Constructable(1));
- theVector.assign(2, Constructable(77));
- assertValuesInOrder(theVector, 2u, 77, 77);
+ this->theVector.push_back(Constructable(1));
+ this->theVector.assign(2, Constructable(77));
+ this->assertValuesInOrder(this->theVector, 2u, 77, 77);
}
// Erase a single element
-TEST_F(SmallVectorTest, EraseTest) {
+TYPED_TEST(SmallVectorTest, EraseTest) {
SCOPED_TRACE("EraseTest");
- makeSequence(theVector, 1, 3);
- theVector.erase(theVector.begin());
- assertValuesInOrder(theVector, 2u, 2, 3);
+ this->makeSequence(this->theVector, 1, 3);
+ this->theVector.erase(this->theVector.begin());
+ this->assertValuesInOrder(this->theVector, 2u, 2, 3);
}
// Erase a range of elements
-TEST_F(SmallVectorTest, EraseRangeTest) {
+TYPED_TEST(SmallVectorTest, EraseRangeTest) {
SCOPED_TRACE("EraseRangeTest");
- makeSequence(theVector, 1, 3);
- theVector.erase(theVector.begin(), theVector.begin() + 2);
- assertValuesInOrder(theVector, 1u, 3);
+ this->makeSequence(this->theVector, 1, 3);
+ this->theVector.erase(this->theVector.begin(), this->theVector.begin() + 2);
+ this->assertValuesInOrder(this->theVector, 1u, 3);
}
// Insert a single element.
-TEST_F(SmallVectorTest, InsertTest) {
+TYPED_TEST(SmallVectorTest, InsertTest) {
SCOPED_TRACE("InsertTest");
- makeSequence(theVector, 1, 3);
- theVector.insert(theVector.begin() + 1, Constructable(77));
- assertValuesInOrder(theVector, 4u, 1, 77, 2, 3);
+ this->makeSequence(this->theVector, 1, 3);
+ typename TypeParam::iterator I =
+ this->theVector.insert(this->theVector.begin() + 1, Constructable(77));
+ EXPECT_EQ(this->theVector.begin() + 1, I);
+ this->assertValuesInOrder(this->theVector, 4u, 1, 77, 2, 3);
}
// Insert repeated elements.
-TEST_F(SmallVectorTest, InsertRepeatedTest) {
+TYPED_TEST(SmallVectorTest, InsertRepeatedTest) {
SCOPED_TRACE("InsertRepeatedTest");
- makeSequence(theVector, 10, 15);
- theVector.insert(theVector.begin() + 1, 2, Constructable(16));
- assertValuesInOrder(theVector, 8u, 10, 16, 16, 11, 12, 13, 14, 15);
+ this->makeSequence(this->theVector, 10, 15);
+ typename TypeParam::iterator I =
+ this->theVector.insert(this->theVector.begin() + 1, 2, Constructable(16));
+ EXPECT_EQ(this->theVector.begin() + 1, I);
+ this->assertValuesInOrder(this->theVector, 8u,
+ 10, 16, 16, 11, 12, 13, 14, 15);
+
+ // Insert at end.
+ I = this->theVector.insert(this->theVector.end(), 2, Constructable(16));
+ EXPECT_EQ(this->theVector.begin() + 8, I);
+ this->assertValuesInOrder(this->theVector, 10u,
+ 10, 16, 16, 11, 12, 13, 14, 15, 16, 16);
+
+ // Empty insert.
+ EXPECT_EQ(this->theVector.end(),
+ this->theVector.insert(this->theVector.end(),
+ 0, Constructable(42)));
+ EXPECT_EQ(this->theVector.begin() + 1,
+ this->theVector.insert(this->theVector.begin() + 1,
+ 0, Constructable(42)));
}
// Insert range.
-TEST_F(SmallVectorTest, InsertRangeTest) {
- SCOPED_TRACE("InsertRepeatedTest");
-
- makeSequence(theVector, 1, 3);
- theVector.insert(theVector.begin() + 1, 3, Constructable(77));
- assertValuesInOrder(theVector, 6u, 1, 77, 77, 77, 2, 3);
+TYPED_TEST(SmallVectorTest, InsertRangeTest) {
+ SCOPED_TRACE("InsertRangeTest");
+
+ Constructable Arr[3] =
+ { Constructable(77), Constructable(77), Constructable(77) };
+
+ this->makeSequence(this->theVector, 1, 3);
+ typename TypeParam::iterator I =
+ this->theVector.insert(this->theVector.begin() + 1, Arr, Arr+3);
+ EXPECT_EQ(this->theVector.begin() + 1, I);
+ this->assertValuesInOrder(this->theVector, 6u, 1, 77, 77, 77, 2, 3);
+
+ // Insert at end.
+ I = this->theVector.insert(this->theVector.end(), Arr, Arr+3);
+ EXPECT_EQ(this->theVector.begin() + 6, I);
+ this->assertValuesInOrder(this->theVector, 9u,
+ 1, 77, 77, 77, 2, 3, 77, 77, 77);
+
+ // Empty insert.
+ EXPECT_EQ(this->theVector.end(),
+ this->theVector.insert(this->theVector.end(),
+ this->theVector.begin(),
+ this->theVector.begin()));
+ EXPECT_EQ(this->theVector.begin() + 1,
+ this->theVector.insert(this->theVector.begin() + 1,
+ this->theVector.begin(),
+ this->theVector.begin()));
}
// Comparison tests.
-TEST_F(SmallVectorTest, ComparisonTest) {
+TYPED_TEST(SmallVectorTest, ComparisonTest) {
SCOPED_TRACE("ComparisonTest");
- makeSequence(theVector, 1, 3);
- makeSequence(otherVector, 1, 3);
+ this->makeSequence(this->theVector, 1, 3);
+ this->makeSequence(this->otherVector, 1, 3);
- EXPECT_TRUE(theVector == otherVector);
- EXPECT_FALSE(theVector != otherVector);
+ EXPECT_TRUE(this->theVector == this->otherVector);
+ EXPECT_FALSE(this->theVector != this->otherVector);
- otherVector.clear();
- makeSequence(otherVector, 2, 4);
+ this->otherVector.clear();
+ this->makeSequence(this->otherVector, 2, 4);
- EXPECT_FALSE(theVector == otherVector);
- EXPECT_TRUE(theVector != otherVector);
+ EXPECT_FALSE(this->theVector == this->otherVector);
+ EXPECT_TRUE(this->theVector != this->otherVector);
}
// Constant vector tests.
-TEST_F(SmallVectorTest, ConstVectorTest) {
- VectorType constVector;
+TYPED_TEST(SmallVectorTest, ConstVectorTest) {
+ const TypeParam constVector;
EXPECT_EQ(0u, constVector.size());
EXPECT_TRUE(constVector.empty());
@@ -391,26 +450,40 @@ TEST_F(SmallVectorTest, ConstVectorTest) {
}
// Direct array access.
-TEST_F(SmallVectorTest, DirectVectorTest) {
- EXPECT_EQ(0u, theVector.size());
- EXPECT_LE(4u, theVector.capacity());
+TYPED_TEST(SmallVectorTest, DirectVectorTest) {
+ EXPECT_EQ(0u, this->theVector.size());
+ this->theVector.reserve(4);
+ EXPECT_LE(4u, this->theVector.capacity());
EXPECT_EQ(0, Constructable::getNumConstructorCalls());
- theVector.end()[0] = 1;
- theVector.end()[1] = 2;
- theVector.end()[2] = 3;
- theVector.end()[3] = 4;
- theVector.set_size(4);
- EXPECT_EQ(4u, theVector.size());
+ this->theVector.end()[0] = 1;
+ this->theVector.end()[1] = 2;
+ this->theVector.end()[2] = 3;
+ this->theVector.end()[3] = 4;
+ this->theVector.set_size(4);
+ EXPECT_EQ(4u, this->theVector.size());
EXPECT_EQ(4, Constructable::getNumConstructorCalls());
- EXPECT_EQ(1, theVector[0].getValue());
- EXPECT_EQ(2, theVector[1].getValue());
- EXPECT_EQ(3, theVector[2].getValue());
- EXPECT_EQ(4, theVector[3].getValue());
+ EXPECT_EQ(1, this->theVector[0].getValue());
+ EXPECT_EQ(2, this->theVector[1].getValue());
+ EXPECT_EQ(3, this->theVector[2].getValue());
+ EXPECT_EQ(4, this->theVector[3].getValue());
}
-TEST_F(SmallVectorTest, IteratorTest) {
+TYPED_TEST(SmallVectorTest, IteratorTest) {
std::list<int> L;
- theVector.insert(theVector.end(), L.begin(), L.end());
+ this->theVector.insert(this->theVector.end(), L.begin(), L.end());
+}
+
+struct notassignable {
+ int &x;
+ notassignable(int &x) : x(x) {}
+};
+
+TEST(SmallVectorCustomTest, NoAssignTest) {
+ int x = 0;
+ SmallVector<notassignable, 2> vec;
+ vec.push_back(notassignable(x));
+ x = 42;
+ EXPECT_EQ(42, vec.pop_back_val().x);
}
}
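
SmallVectorTest now runs over inline capacities 0 through 4, so tests that used to assert exact constructor/destructor counts (valid only when no reallocation happens) branch on RequiresGrowth: growth copies the live elements, which makes the constructor count only a lower bound while constructions and destructions stay balanced. A sketch of the accounting for the smallest capacity — the totals are illustrative:

    SmallVector<Constructable, 1> V;   // one inline slot
    V.push_back(Constructable(1));     // temp + copy into storage: 2 ctors
    V.push_back(Constructable(2));     // temp + copy + relocation of the
                                       // old element during growth: 3 more
    // At least 5 constructions in total, and once V is destroyed,
    // getNumConstructorCalls() == getNumDestructorCalls().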
diff --git a/unittests/ADT/SparseSetTest.cpp b/unittests/ADT/SparseSetTest.cpp
index a6ea757..eb0e0db 100644
--- a/unittests/ADT/SparseSetTest.cpp
+++ b/unittests/ADT/SparseSetTest.cpp
@@ -161,7 +161,7 @@ TEST(SparseSetTest, MultipleEntrySet) {
struct Alt {
unsigned Value;
explicit Alt(unsigned x) : Value(x) {}
- unsigned getSparseSetKey() const { return Value - 1000; }
+ unsigned getSparseSetIndex() const { return Value - 1000; }
};
TEST(SparseSetTest, AltStructSet) {
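
The one-line SparseSet change tracks an API rename: a custom value type now advertises its slot to the set through getSparseSetIndex() rather than getSparseSetKey(). A minimal value type usable with the default value traits looks like:

    struct Reg {
      unsigned Id;
      explicit Reg(unsigned Id) : Id(Id) {}
      // Hook called by SparseSet's default value traits to map the
      // element to its index in the sparse array.
      unsigned getSparseSetIndex() const { return Id; }
    };
    // SparseSet<Reg> S;  S.setUniverse(64);  S.insert(Reg(5));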
diff --git a/unittests/ADT/StringMapTest.cpp b/unittests/ADT/StringMapTest.cpp
index 2ae5820..5bb65cb 100644
--- a/unittests/ADT/StringMapTest.cpp
+++ b/unittests/ADT/StringMapTest.cpp
@@ -75,7 +75,6 @@ const std::string StringMapTest::testKeyStr(testKey);
// Empty map tests.
TEST_F(StringMapTest, EmptyMapTest) {
- SCOPED_TRACE("EmptyMapTest");
assertEmptyMap();
}
@@ -102,14 +101,12 @@ TEST_F(StringMapTest, ConstEmptyMapTest) {
// A map with a single entry.
TEST_F(StringMapTest, SingleEntryMapTest) {
- SCOPED_TRACE("SingleEntryMapTest");
testMap[testKey] = testValue;
assertSingleItemMap();
}
// Test clear() method.
TEST_F(StringMapTest, ClearTest) {
- SCOPED_TRACE("ClearTest");
testMap[testKey] = testValue;
testMap.clear();
assertEmptyMap();
@@ -117,7 +114,6 @@ TEST_F(StringMapTest, ClearTest) {
// Test erase(iterator) method.
TEST_F(StringMapTest, EraseIteratorTest) {
- SCOPED_TRACE("EraseIteratorTest");
testMap[testKey] = testValue;
testMap.erase(testMap.begin());
assertEmptyMap();
@@ -125,7 +121,6 @@ TEST_F(StringMapTest, EraseIteratorTest) {
// Test erase(value) method.
TEST_F(StringMapTest, EraseValueTest) {
- SCOPED_TRACE("EraseValueTest");
testMap[testKey] = testValue;
testMap.erase(testKey);
assertEmptyMap();
@@ -133,13 +128,34 @@ TEST_F(StringMapTest, EraseValueTest) {
// Test inserting two values and erasing one.
TEST_F(StringMapTest, InsertAndEraseTest) {
- SCOPED_TRACE("InsertAndEraseTest");
testMap[testKey] = testValue;
testMap["otherKey"] = 2;
testMap.erase("otherKey");
assertSingleItemMap();
}
+TEST_F(StringMapTest, SmallFullMapTest) {
+ // StringMap has a tricky corner case when the map is small (<8 buckets) and
+ // it fills up through a balanced pattern of inserts and erases. This can
+ // lead to inf-loops in some cases (PR13148) so we test it explicitly here.
+ llvm::StringMap<int> Map(2);
+
+ Map["eins"] = 1;
+ Map["zwei"] = 2;
+ Map["drei"] = 3;
+ Map.erase("drei");
+ Map.erase("eins");
+ Map["veir"] = 4;
+ Map["funf"] = 5;
+
+ EXPECT_EQ(3u, Map.size());
+ EXPECT_EQ(0, Map.lookup("eins"));
+ EXPECT_EQ(2, Map.lookup("zwei"));
+ EXPECT_EQ(0, Map.lookup("drei"));
+ EXPECT_EQ(4, Map.lookup("veir"));
+ EXPECT_EQ(5, Map.lookup("funf"));
+}
+
// A more complex iteration test.
TEST_F(StringMapTest, IterationTest) {
bool visited[100];
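
SmallFullMapTest above covers the PR13148 hazard its comment names: probing stops only at a truly empty bucket, so a small table whose buckets are all live or tombstoned must grow (or rehash) on insert, or a later probe for an absent key can cycle forever. A sketch of the shape of the failing workload, assuming the explicit-initial-size constructor used in the test:

    llvm::StringMap<int> M(2);            // request a tiny bucket count
    M["a"] = 1; M["b"] = 2; M["c"] = 3;   // fill the table
    M.erase("a"); M.erase("c");           // leave tombstones, not holes
    M["d"] = 4;                           // must rehash rather than probe
                                          // an endless live/tombstone ring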
diff --git a/unittests/ADT/StringRefTest.cpp b/unittests/ADT/StringRefTest.cpp
index cc7a7fb..315eacb 100644
--- a/unittests/ADT/StringRefTest.cpp
+++ b/unittests/ADT/StringRefTest.cpp
@@ -221,6 +221,30 @@ TEST(StringRefTest, Split2) {
EXPECT_TRUE(parts == expected);
}
+TEST(StringRefTest, Trim) {
+ StringRef Str0("hello");
+ StringRef Str1(" hello ");
+ StringRef Str2(" hello ");
+
+ EXPECT_EQ(StringRef("hello"), Str0.rtrim());
+ EXPECT_EQ(StringRef(" hello"), Str1.rtrim());
+ EXPECT_EQ(StringRef(" hello"), Str2.rtrim());
+ EXPECT_EQ(StringRef("hello"), Str0.ltrim());
+ EXPECT_EQ(StringRef("hello "), Str1.ltrim());
+ EXPECT_EQ(StringRef("hello "), Str2.ltrim());
+ EXPECT_EQ(StringRef("hello"), Str0.trim());
+ EXPECT_EQ(StringRef("hello"), Str1.trim());
+ EXPECT_EQ(StringRef("hello"), Str2.trim());
+
+ EXPECT_EQ(StringRef("ello"), Str0.trim("hhhhhhhhhhh"));
+
+ EXPECT_EQ(StringRef(""), StringRef("").trim());
+ EXPECT_EQ(StringRef(""), StringRef(" ").trim());
+ EXPECT_EQ(StringRef("\0", 1), StringRef(" \0 ", 3).trim());
+ EXPECT_EQ(StringRef("\0\0", 2), StringRef("\0\0", 2).trim());
+ EXPECT_EQ(StringRef("x"), StringRef("\0\0x\0\0", 5).trim(StringRef("\0", 1)));
+}
+
TEST(StringRefTest, StartsWith) {
StringRef Str("hello");
EXPECT_TRUE(Str.startswith("he"));
@@ -267,6 +291,10 @@ TEST(StringRefTest, Find) {
EXPECT_EQ(1U, Str.find_first_not_of('h'));
EXPECT_EQ(4U, Str.find_first_not_of("hel"));
EXPECT_EQ(StringRef::npos, Str.find_first_not_of("hello"));
+
+ EXPECT_EQ(3U, Str.find_last_not_of('o'));
+ EXPECT_EQ(1U, Str.find_last_not_of("lo"));
+ EXPECT_EQ(StringRef::npos, Str.find_last_not_of("helo"));
}
TEST(StringRefTest, Count) {
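
The new Trim test nails down that ltrim/rtrim/trim return trimmed views rather than mutating in place, accept a custom character set, and treat embedded NULs as ordinary bytes (StringRef is length-delimited, not NUL-terminated). A brief usage sketch:

    StringRef S("  hello  ");
    StringRef L = S.ltrim();                       // "hello  "
    StringRef R = S.rtrim();                       // "  hello"
    StringRef T = S.trim();                        // "hello"
    StringRef X = StringRef("xxabcxx").trim("x");  // "abc"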
diff --git a/unittests/ADT/TinyPtrVectorTest.cpp b/unittests/ADT/TinyPtrVectorTest.cpp
new file mode 100644
index 0000000..05dd797
--- /dev/null
+++ b/unittests/ADT/TinyPtrVectorTest.cpp
@@ -0,0 +1,448 @@
+//===- llvm/unittest/ADT/TinyPtrVectorTest.cpp ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TinyPtrVector unit tests.
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Support/type_traits.h"
+#include <algorithm>
+#include <list>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+// The world's worst RNG, but it is deterministic and makes it easy to get
+// *some* shuffling of elements.
+static ptrdiff_t test_shuffle_rng(ptrdiff_t i) {
+ return (i + i * 33) % i;
+}
+static ptrdiff_t (*test_shuffle_rng_p)(ptrdiff_t) = &test_shuffle_rng;
+
+template <typename VectorT>
+class TinyPtrVectorTest : public testing::Test {
+protected:
+ typedef typename VectorT::value_type PtrT;
+ typedef typename remove_pointer<PtrT>::type ValueT;
+
+ VectorT V;
+ VectorT V2;
+
+ ValueT TestValues[1024];
+ std::vector<PtrT> TestPtrs;
+
+ TinyPtrVectorTest() {
+ for (size_t i = 0, e = array_lengthof(TestValues); i != e; ++i)
+ TestPtrs.push_back(&TestValues[i]);
+
+ std::random_shuffle(TestPtrs.begin(), TestPtrs.end(), test_shuffle_rng_p);
+ }
+
+ ArrayRef<PtrT> testArray(size_t N) {
+ return makeArrayRef(&TestPtrs[0], N);
+ }
+
+ void appendValues(VectorT &V, ArrayRef<PtrT> Values) {
+ for (size_t i = 0, e = Values.size(); i != e; ++i)
+ V.push_back(Values[i]);
+ }
+
+ void setVectors(ArrayRef<PtrT> Values1, ArrayRef<PtrT> Values2) {
+ V.clear();
+ appendValues(V, Values1);
+ V2.clear();
+ appendValues(V2, Values2);
+ }
+
+ void expectValues(const VectorT &V, ArrayRef<PtrT> Values) {
+ EXPECT_EQ(Values.empty(), V.empty());
+ EXPECT_EQ(Values.size(), V.size());
+ for (size_t i = 0, e = Values.size(); i != e; ++i) {
+ EXPECT_EQ(Values[i], V[i]);
+ EXPECT_EQ(Values[i], *llvm::next(V.begin(), i));
+ }
+ EXPECT_EQ(V.end(), llvm::next(V.begin(), Values.size()));
+ }
+};
+
+typedef ::testing::Types<TinyPtrVector<int*>,
+ TinyPtrVector<double*>
+ > TinyPtrVectorTestTypes;
+TYPED_TEST_CASE(TinyPtrVectorTest, TinyPtrVectorTestTypes);
+
+TYPED_TEST(TinyPtrVectorTest, EmptyTest) {
+ this->expectValues(this->V, this->testArray(0));
+}
+
+TYPED_TEST(TinyPtrVectorTest, PushPopBack) {
+ this->V.push_back(this->TestPtrs[0]);
+ this->expectValues(this->V, this->testArray(1));
+ this->V.push_back(this->TestPtrs[1]);
+ this->expectValues(this->V, this->testArray(2));
+ this->V.push_back(this->TestPtrs[2]);
+ this->expectValues(this->V, this->testArray(3));
+ this->V.push_back(this->TestPtrs[3]);
+ this->expectValues(this->V, this->testArray(4));
+ this->V.push_back(this->TestPtrs[4]);
+ this->expectValues(this->V, this->testArray(5));
+
+ // Pop and clobber a few values to keep things interesting.
+ this->V.pop_back();
+ this->expectValues(this->V, this->testArray(4));
+ this->V.pop_back();
+ this->expectValues(this->V, this->testArray(3));
+ this->TestPtrs[3] = &this->TestValues[42];
+ this->TestPtrs[4] = &this->TestValues[43];
+ this->V.push_back(this->TestPtrs[3]);
+ this->expectValues(this->V, this->testArray(4));
+ this->V.push_back(this->TestPtrs[4]);
+ this->expectValues(this->V, this->testArray(5));
+
+ this->V.pop_back();
+ this->expectValues(this->V, this->testArray(4));
+ this->V.pop_back();
+ this->expectValues(this->V, this->testArray(3));
+ this->V.pop_back();
+ this->expectValues(this->V, this->testArray(2));
+ this->V.pop_back();
+ this->expectValues(this->V, this->testArray(1));
+ this->V.pop_back();
+ this->expectValues(this->V, this->testArray(0));
+
+ this->appendValues(this->V, this->testArray(42));
+ this->expectValues(this->V, this->testArray(42));
+}
+
+TYPED_TEST(TinyPtrVectorTest, ClearTest) {
+ this->expectValues(this->V, this->testArray(0));
+ this->V.clear();
+ this->expectValues(this->V, this->testArray(0));
+
+ this->appendValues(this->V, this->testArray(1));
+ this->expectValues(this->V, this->testArray(1));
+ this->V.clear();
+ this->expectValues(this->V, this->testArray(0));
+
+ this->appendValues(this->V, this->testArray(42));
+ this->expectValues(this->V, this->testArray(42));
+ this->V.clear();
+ this->expectValues(this->V, this->testArray(0));
+}
+
+TYPED_TEST(TinyPtrVectorTest, CopyAndMoveCtorTest) {
+ this->appendValues(this->V, this->testArray(42));
+ TypeParam Copy(this->V);
+ this->expectValues(Copy, this->testArray(42));
+
+ // This is a separate copy, and so it shouldn't destroy the original.
+ Copy.clear();
+ this->expectValues(Copy, this->testArray(0));
+ this->expectValues(this->V, this->testArray(42));
+
+ TypeParam Copy2(this->V2);
+ this->appendValues(Copy2, this->testArray(42));
+ this->expectValues(Copy2, this->testArray(42));
+ this->expectValues(this->V2, this->testArray(0));
+
+#if LLVM_USE_RVALUE_REFERENCES
+ TypeParam Move(std::move(Copy2));
+ this->expectValues(Move, this->testArray(42));
+ this->expectValues(Copy2, this->testArray(0));
+#endif
+}
+
+TYPED_TEST(TinyPtrVectorTest, CopyAndMoveTest) {
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(0));
+ this->expectValues(this->V2, this->testArray(0));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(0));
+#endif
+
+ this->setVectors(this->testArray(1), this->testArray(0));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(0));
+ this->expectValues(this->V2, this->testArray(0));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(1), this->testArray(0));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(0));
+#endif
+
+ this->setVectors(this->testArray(2), this->testArray(0));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(0));
+ this->expectValues(this->V2, this->testArray(0));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(2), this->testArray(0));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(0));
+#endif
+
+ this->setVectors(this->testArray(42), this->testArray(0));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(0));
+ this->expectValues(this->V2, this->testArray(0));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(42), this->testArray(0));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(0));
+#endif
+
+ this->setVectors(this->testArray(0), this->testArray(1));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(1));
+ this->expectValues(this->V2, this->testArray(1));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(0), this->testArray(1));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(1));
+#endif
+
+ this->setVectors(this->testArray(0), this->testArray(2));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(2));
+ this->expectValues(this->V2, this->testArray(2));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(0), this->testArray(2));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(2));
+#endif
+
+ this->setVectors(this->testArray(0), this->testArray(42));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(42));
+ this->expectValues(this->V2, this->testArray(42));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(0), this->testArray(42));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(42));
+#endif
+
+ this->setVectors(this->testArray(1), this->testArray(1));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(1));
+ this->expectValues(this->V2, this->testArray(1));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(1));
+#endif
+
+ this->setVectors(this->testArray(1), this->testArray(2));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(2));
+ this->expectValues(this->V2, this->testArray(2));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(1), this->testArray(2));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(2));
+#endif
+
+ this->setVectors(this->testArray(1), this->testArray(42));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(42));
+ this->expectValues(this->V2, this->testArray(42));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(1), this->testArray(42));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(42));
+#endif
+
+ this->setVectors(this->testArray(2), this->testArray(1));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(1));
+ this->expectValues(this->V2, this->testArray(1));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(2), this->testArray(1));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(1));
+#endif
+
+ this->setVectors(this->testArray(2), this->testArray(2));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(2));
+ this->expectValues(this->V2, this->testArray(2));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(2), this->testArray(2));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(2));
+#endif
+
+ this->setVectors(this->testArray(2), this->testArray(42));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(42));
+ this->expectValues(this->V2, this->testArray(42));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(2), this->testArray(42));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(42));
+#endif
+
+ this->setVectors(this->testArray(42), this->testArray(1));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(1));
+ this->expectValues(this->V2, this->testArray(1));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(42), this->testArray(1));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(1));
+#endif
+
+ this->setVectors(this->testArray(42), this->testArray(2));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(2));
+ this->expectValues(this->V2, this->testArray(2));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(42), this->testArray(2));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(2));
+#endif
+
+ this->setVectors(this->testArray(42), this->testArray(42));
+ this->V = this->V2;
+ this->expectValues(this->V, this->testArray(42));
+ this->expectValues(this->V2, this->testArray(42));
+#if LLVM_USE_RVALUE_REFERENCES
+ this->setVectors(this->testArray(42), this->testArray(42));
+ this->V = std::move(this->V2);
+ this->expectValues(this->V, this->testArray(42));
+#endif
+}
+
+TYPED_TEST(TinyPtrVectorTest, EraseTest) {
+ this->appendValues(this->V, this->testArray(1));
+ this->expectValues(this->V, this->testArray(1));
+ this->V.erase(this->V.begin());
+ this->expectValues(this->V, this->testArray(0));
+
+ this->appendValues(this->V, this->testArray(42));
+ this->expectValues(this->V, this->testArray(42));
+ this->V.erase(this->V.begin());
+ this->TestPtrs.erase(this->TestPtrs.begin());
+ this->expectValues(this->V, this->testArray(41));
+ this->V.erase(llvm::next(this->V.begin(), 1));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 1));
+ this->expectValues(this->V, this->testArray(40));
+ this->V.erase(llvm::next(this->V.begin(), 2));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 2));
+ this->expectValues(this->V, this->testArray(39));
+ this->V.erase(llvm::next(this->V.begin(), 5));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 5));
+ this->expectValues(this->V, this->testArray(38));
+ this->V.erase(llvm::next(this->V.begin(), 13));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 13));
+ this->expectValues(this->V, this->testArray(37));
+
+ typename TypeParam::iterator I = this->V.begin();
+ do {
+ I = this->V.erase(I);
+ } while (I != this->V.end());
+ this->expectValues(this->V, this->testArray(0));
+}
+
+TYPED_TEST(TinyPtrVectorTest, EraseRangeTest) {
+ this->appendValues(this->V, this->testArray(1));
+ this->expectValues(this->V, this->testArray(1));
+ this->V.erase(this->V.begin(), this->V.begin());
+ this->expectValues(this->V, this->testArray(1));
+ this->V.erase(this->V.end(), this->V.end());
+ this->expectValues(this->V, this->testArray(1));
+ this->V.erase(this->V.begin(), this->V.end());
+ this->expectValues(this->V, this->testArray(0));
+
+ this->appendValues(this->V, this->testArray(42));
+ this->expectValues(this->V, this->testArray(42));
+ this->V.erase(this->V.begin(), llvm::next(this->V.begin(), 1));
+ this->TestPtrs.erase(this->TestPtrs.begin(),
+ llvm::next(this->TestPtrs.begin(), 1));
+ this->expectValues(this->V, this->testArray(41));
+ this->V.erase(llvm::next(this->V.begin(), 1), llvm::next(this->V.begin(), 2));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 1),
+ llvm::next(this->TestPtrs.begin(), 2));
+ this->expectValues(this->V, this->testArray(40));
+ this->V.erase(llvm::next(this->V.begin(), 2), llvm::next(this->V.begin(), 4));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 2),
+ llvm::next(this->TestPtrs.begin(), 4));
+ this->expectValues(this->V, this->testArray(38));
+ this->V.erase(llvm::next(this->V.begin(), 5), llvm::next(this->V.begin(), 10));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 5),
+ llvm::next(this->TestPtrs.begin(), 10));
+ this->expectValues(this->V, this->testArray(33));
+ this->V.erase(llvm::next(this->V.begin(), 13), llvm::next(this->V.begin(), 26));
+ this->TestPtrs.erase(llvm::next(this->TestPtrs.begin(), 13),
+ llvm::next(this->TestPtrs.begin(), 26));
+ this->expectValues(this->V, this->testArray(20));
+ this->V.erase(llvm::next(this->V.begin(), 7), this->V.end());
+ this->expectValues(this->V, this->testArray(7));
+ this->V.erase(this->V.begin(), this->V.end());
+ this->expectValues(this->V, this->testArray(0));
+}
+
+TYPED_TEST(TinyPtrVectorTest, Insert) {
+ this->V.insert(this->V.end(), this->TestPtrs[0]);
+ this->expectValues(this->V, this->testArray(1));
+ this->V.clear();
+ this->appendValues(this->V, this->testArray(4));
+ this->expectValues(this->V, this->testArray(4));
+ this->V.insert(this->V.end(), this->TestPtrs[4]);
+ this->expectValues(this->V, this->testArray(5));
+ this->V.insert(this->V.begin(), this->TestPtrs[42]);
+ this->TestPtrs.insert(this->TestPtrs.begin(), this->TestPtrs[42]);
+ this->expectValues(this->V, this->testArray(6));
+ this->V.insert(llvm::next(this->V.begin(), 3), this->TestPtrs[43]);
+ this->TestPtrs.insert(llvm::next(this->TestPtrs.begin(), 3),
+ this->TestPtrs[43]);
+ this->expectValues(this->V, this->testArray(7));
+}
+
+TYPED_TEST(TinyPtrVectorTest, InsertRange) {
+ this->V.insert(this->V.end(), this->TestPtrs.begin(), this->TestPtrs.begin());
+ this->expectValues(this->V, this->testArray(0));
+ this->V.insert(this->V.begin(), this->TestPtrs.begin(),
+ this->TestPtrs.begin());
+ this->expectValues(this->V, this->testArray(0));
+ this->V.insert(this->V.end(), this->TestPtrs.end(), this->TestPtrs.end());
+ this->expectValues(this->V, this->testArray(0));
+ this->V.insert(this->V.end(), this->TestPtrs.begin(),
+ llvm::next(this->TestPtrs.begin()));
+ this->expectValues(this->V, this->testArray(1));
+ this->V.clear();
+ this->V.insert(this->V.end(), this->TestPtrs.begin(),
+ llvm::next(this->TestPtrs.begin(), 2));
+ this->expectValues(this->V, this->testArray(2));
+ this->V.clear();
+ this->V.insert(this->V.end(), this->TestPtrs.begin(),
+ llvm::next(this->TestPtrs.begin(), 42));
+ this->expectValues(this->V, this->testArray(42));
+ this->V.clear();
+ this->V.insert(this->V.end(),
+ llvm::next(this->TestPtrs.begin(), 5),
+ llvm::next(this->TestPtrs.begin(), 13));
+ this->V.insert(this->V.begin(),
+ llvm::next(this->TestPtrs.begin(), 0),
+ llvm::next(this->TestPtrs.begin(), 3));
+ this->V.insert(llvm::next(this->V.begin(), 2),
+ llvm::next(this->TestPtrs.begin(), 2),
+ llvm::next(this->TestPtrs.begin(), 4));
+ this->V.erase(llvm::next(this->V.begin(), 4));
+ this->V.insert(llvm::next(this->V.begin(), 4),
+ llvm::next(this->TestPtrs.begin(), 4),
+ llvm::next(this->TestPtrs.begin(), 5));
+ this->expectValues(this->V, this->testArray(13));
+}
+
+}
diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index 479046e..967437c 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp
@@ -353,9 +353,9 @@ TEST(TripleTest, BitWidthArchVariants) {
EXPECT_EQ(Triple::ppc, T.get32BitArchVariant().getArch());
EXPECT_EQ(Triple::ppc64, T.get64BitArchVariant().getArch());
- T.setArch(Triple::ptx32);
- EXPECT_EQ(Triple::ptx32, T.get32BitArchVariant().getArch());
- EXPECT_EQ(Triple::ptx64, T.get64BitArchVariant().getArch());
+ T.setArch(Triple::nvptx);
+ EXPECT_EQ(Triple::nvptx, T.get32BitArchVariant().getArch());
+ EXPECT_EQ(Triple::nvptx64, T.get64BitArchVariant().getArch());
T.setArch(Triple::sparc);
EXPECT_EQ(Triple::sparc, T.get32BitArchVariant().getArch());
@@ -377,9 +377,9 @@ TEST(TripleTest, BitWidthArchVariants) {
EXPECT_EQ(Triple::ppc, T.get32BitArchVariant().getArch());
EXPECT_EQ(Triple::ppc64, T.get64BitArchVariant().getArch());
- T.setArch(Triple::ptx64);
- EXPECT_EQ(Triple::ptx32, T.get32BitArchVariant().getArch());
- EXPECT_EQ(Triple::ptx64, T.get64BitArchVariant().getArch());
+ T.setArch(Triple::nvptx64);
+ EXPECT_EQ(Triple::nvptx, T.get32BitArchVariant().getArch());
+ EXPECT_EQ(Triple::nvptx64, T.get64BitArchVariant().getArch());
T.setArch(Triple::sparcv9);
EXPECT_EQ(Triple::sparc, T.get32BitArchVariant().getArch());
@@ -390,4 +390,69 @@ TEST(TripleTest, BitWidthArchVariants) {
EXPECT_EQ(Triple::x86_64, T.get64BitArchVariant().getArch());
}
+TEST(TripleTest, getOSVersion) {
+ Triple T;
+ unsigned Major, Minor, Micro;
+
+ T = Triple("i386-apple-darwin9");
+ T.getMacOSXVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)10, Major);
+ EXPECT_EQ((unsigned)5, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+ T.getiOSVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)3, Major);
+ EXPECT_EQ((unsigned)0, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+
+ T = Triple("x86_64-apple-darwin9");
+ T.getMacOSXVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)10, Major);
+ EXPECT_EQ((unsigned)5, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+ T.getiOSVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)3, Major);
+ EXPECT_EQ((unsigned)0, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+
+ T = Triple("x86_64-apple-macosx");
+ T.getMacOSXVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)10, Major);
+ EXPECT_EQ((unsigned)4, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+ T.getiOSVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)3, Major);
+ EXPECT_EQ((unsigned)0, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+
+ T = Triple("x86_64-apple-macosx10.7");
+ T.getMacOSXVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)10, Major);
+ EXPECT_EQ((unsigned)7, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+ T.getiOSVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)3, Major);
+ EXPECT_EQ((unsigned)0, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+
+ T = Triple("armv7-apple-ios");
+ T.getMacOSXVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)10, Major);
+ EXPECT_EQ((unsigned)4, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+ T.getiOSVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)3, Major);
+ EXPECT_EQ((unsigned)0, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+
+ T = Triple("armv7-apple-ios5.0");
+ T.getMacOSXVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)10, Major);
+ EXPECT_EQ((unsigned)4, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+ T.getiOSVersion(Major, Minor, Micro);
+ EXPECT_EQ((unsigned)5, Major);
+ EXPECT_EQ((unsigned)0, Minor);
+ EXPECT_EQ((unsigned)0, Micro);
+}
+
}
diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt
new file mode 100644
index 0000000..7991a41
--- /dev/null
+++ b/unittests/Analysis/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LLVM_LINK_COMPONENTS
+ Analysis
+ )
+
+add_llvm_unittest(AnalysisTests
+ ScalarEvolutionTest.cpp
+ )
diff --git a/unittests/Analysis/Makefile b/unittests/Analysis/Makefile
index f89240e..b548d25 100644
--- a/unittests/Analysis/Makefile
+++ b/unittests/Analysis/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../..
TESTNAME = Analysis
-LINK_COMPONENTS := core support target analysis ipa
+LINK_COMPONENTS := analysis
include $(LEVEL)/Makefile.config
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Bitcode/CMakeLists.txt b/unittests/Bitcode/CMakeLists.txt
new file mode 100644
index 0000000..d8f5fe1
--- /dev/null
+++ b/unittests/Bitcode/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(LLVM_LINK_COMPONENTS
+ BitReader
+ BitWriter
+ )
+
+add_llvm_unittest(BitcodeTests
+ BitReaderTest.cpp
+ )
diff --git a/unittests/Bitcode/Makefile b/unittests/Bitcode/Makefile
index aa437e7..fcec879 100644
--- a/unittests/Bitcode/Makefile
+++ b/unittests/Bitcode/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../..
TESTNAME = Bitcode
-LINK_COMPONENTS := core support bitreader bitwriter
+LINK_COMPONENTS := bitreader bitwriter
include $(LEVEL)/Makefile.config
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 78009a8..84bd444 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -1,178 +1,14 @@
-function(add_llvm_unittest test_dirname)
- string(REGEX MATCH "([^/]+)$" test_name ${test_dirname})
- if (CMAKE_BUILD_TYPE)
- set(CMAKE_RUNTIME_OUTPUT_DIRECTORY
- ${LLVM_BINARY_DIR}/unittests/${test_dirname}/${CMAKE_BUILD_TYPE})
- else()
- set(CMAKE_RUNTIME_OUTPUT_DIRECTORY
- ${LLVM_BINARY_DIR}/unittests/${test_dirname})
- endif()
- if( NOT LLVM_BUILD_TESTS )
- set(EXCLUDE_FROM_ALL ON)
- endif()
- add_llvm_executable(${test_name}Tests ${ARGN})
- add_dependencies(UnitTests ${test_name}Tests)
- set_target_properties(${test_name}Tests PROPERTIES FOLDER "Tests")
-endfunction()
-
add_custom_target(UnitTests)
set_target_properties(UnitTests PROPERTIES FOLDER "Tests")
-include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
-add_definitions(-DGTEST_HAS_RTTI=0)
-if( LLVM_COMPILER_IS_GCC_COMPATIBLE )
- llvm_replace_compiler_option(CMAKE_CXX_FLAGS "-frtti" "-fno-rtti")
-elseif( MSVC )
- llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/GR" "/GR-")
-endif()
-
-if (NOT LLVM_ENABLE_THREADS)
- add_definitions(-DGTEST_HAS_PTHREAD=0)
-endif()
-
-if(SUPPORTS_NO_VARIADIC_MACROS_FLAG)
- add_definitions("-Wno-variadic-macros")
-endif()
-
-set(LLVM_LINK_COMPONENTS
- jit
- interpreter
- nativecodegen
- BitWriter
- BitReader
- AsmParser
- Core
- Support
- )
-
-set(LLVM_USED_LIBS
- gtest
- gtest_main
- LLVMSupport # gtest needs it for raw_ostream.
- )
-
-add_llvm_unittest(ADT
- ADT/APFloatTest.cpp
- ADT/APIntTest.cpp
- ADT/BitVectorTest.cpp
- ADT/DAGDeltaAlgorithmTest.cpp
- ADT/DeltaAlgorithmTest.cpp
- ADT/DenseMapTest.cpp
- ADT/DenseSetTest.cpp
- ADT/FoldingSet.cpp
- ADT/HashingTest.cpp
- ADT/ilistTest.cpp
- ADT/ImmutableSetTest.cpp
- ADT/IntEqClassesTest.cpp
- ADT/IntervalMapTest.cpp
- ADT/IntrusiveRefCntPtrTest.cpp
- ADT/PackedVectorTest.cpp
- ADT/SmallBitVectorTest.cpp
- ADT/SmallStringTest.cpp
- ADT/SmallVectorTest.cpp
- ADT/SparseBitVectorTest.cpp
- ADT/SparseSetTest.cpp
- ADT/StringMapTest.cpp
- ADT/StringRefTest.cpp
- ADT/TripleTest.cpp
- ADT/TwineTest.cpp
- ADT/VariadicFunctionTest.cpp
- )
-
-add_llvm_unittest(Analysis
- Analysis/ScalarEvolutionTest.cpp
- )
-
-add_llvm_unittest(ExecutionEngine
- ExecutionEngine/ExecutionEngineTest.cpp
- )
-
-if( LLVM_USE_INTEL_JITEVENTS )
- include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} )
- link_directories( ${LLVM_INTEL_JITEVENTS_LIBDIR} )
- set(ProfileTestSources
- ExecutionEngine/JIT/IntelJITEventListenerTest.cpp
- )
- set(LLVM_LINK_COMPONENTS
- ${LLVM_LINK_COMPONENTS}
- IntelJITEvents
- )
-endif( LLVM_USE_INTEL_JITEVENTS )
-
-if( LLVM_USE_OPROFILE )
- set(ProfileTestSources
- ${ProfileTestSources}
- ExecutionEngine/JIT/OProfileJITEventListenerTest.cpp
- )
- set(LLVM_LINK_COMPONENTS
- ${LLVM_LINK_COMPONENTS}
- OProfileJIT
- )
-endif( LLVM_USE_OPROFILE )
-
-set(JITTestsSources
- ExecutionEngine/JIT/JITEventListenerTest.cpp
- ExecutionEngine/JIT/JITMemoryManagerTest.cpp
- ExecutionEngine/JIT/JITTest.cpp
- ExecutionEngine/JIT/MultiJITTest.cpp
- ${ProfileTestSources}
- )
-
-if(MSVC)
- list(APPEND JITTestsSources ExecutionEngine/JIT/JITTests.def)
-endif()
-
-add_llvm_unittest(ExecutionEngine/JIT ${JITTestsSources})
-
-if(MINGW OR CYGWIN)
- set_property(TARGET JITTests PROPERTY LINK_FLAGS -Wl,--export-all-symbols)
-endif()
-
-add_llvm_unittest(Transforms/Utils
- Transforms/Utils/Cloning.cpp
- )
-
-set(VMCoreSources
- VMCore/ConstantsTest.cpp
- VMCore/InstructionsTest.cpp
- VMCore/MetadataTest.cpp
- VMCore/PassManagerTest.cpp
- VMCore/ValueMapTest.cpp
- VMCore/VerifierTest.cpp
- VMCore/DominatorTreeTest.cpp
- )
-
-# MSVC9 and 8 cannot compile ValueMapTest.cpp due to their bug.
-# See issue#331418 in Visual Studio.
-if(MSVC AND MSVC_VERSION LESS 1600)
- list(REMOVE_ITEM VMCoreSources VMCore/ValueMapTest.cpp)
-endif()
-
-add_llvm_unittest(VMCore ${VMCoreSources})
-
-add_llvm_unittest(Bitcode
- Bitcode/BitReaderTest.cpp
- )
-
-set(LLVM_LINK_COMPONENTS
- Support
- Core
- )
+function(add_llvm_unittest test_dirname)
+ add_unittest(UnitTests ${test_dirname} ${ARGN})
+endfunction()
-add_llvm_unittest(Support
- Support/AllocatorTest.cpp
- Support/Casting.cpp
- Support/CommandLineTest.cpp
- Support/ConstantRangeTest.cpp
- Support/EndianTest.cpp
- Support/LeakDetectorTest.cpp
- Support/MathExtrasTest.cpp
- Support/Path.cpp
- Support/raw_ostream_test.cpp
- Support/RegexTest.cpp
- Support/SwapByteOrderTest.cpp
- Support/TimeValue.cpp
- Support/TypeBuilderTest.cpp
- Support/ValueHandleTest.cpp
- Support/YAMLParserTest.cpp
- )
+add_subdirectory(ADT)
+add_subdirectory(Analysis)
+add_subdirectory(ExecutionEngine)
+add_subdirectory(Bitcode)
+add_subdirectory(Support)
+add_subdirectory(Transforms)
+add_subdirectory(VMCore)
diff --git a/unittests/ExecutionEngine/CMakeLists.txt b/unittests/ExecutionEngine/CMakeLists.txt
new file mode 100644
index 0000000..5fffadd
--- /dev/null
+++ b/unittests/ExecutionEngine/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LLVM_LINK_COMPONENTS
+ interpreter
+ )
+
+add_llvm_unittest(ExecutionEngineTests
+ ExecutionEngineTest.cpp
+ )
+
+add_subdirectory(JIT)
diff --git a/unittests/ExecutionEngine/JIT/CMakeLists.txt b/unittests/ExecutionEngine/JIT/CMakeLists.txt
new file mode 100644
index 0000000..d43d72d
--- /dev/null
+++ b/unittests/ExecutionEngine/JIT/CMakeLists.txt
@@ -0,0 +1,57 @@
+set(LLVM_LINK_COMPONENTS
+ asmparser
+ bitreader
+ bitwriter
+ jit
+ nativecodegen
+ )
+
+# HACK: Declare a couple of source files as optionally compiled to satisfy the
+# missing-file-checker in LLVM's weird CMake build.
+set(LLVM_OPTIONAL_SOURCES
+ IntelJITEventListenerTest.cpp
+ OProfileJITEventListenerTest.cpp
+ )
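+# (The build's missing-file check requires every source file in the directory
+# to be listed somewhere; LLVM_OPTIONAL_SOURCES exempts the conditionally
+# compiled files from that check.)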
+
+if( LLVM_USE_INTEL_JITEVENTS )
+ include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} )
+ link_directories( ${LLVM_INTEL_JITEVENTS_LIBDIR} )
+ set(ProfileTestSources
+ IntelJITEventListenerTest.cpp
+ )
+ set(LLVM_LINK_COMPONENTS
+ ${LLVM_LINK_COMPONENTS}
+ IntelJITEvents
+ )
+endif( LLVM_USE_INTEL_JITEVENTS )
+
+if( LLVM_USE_OPROFILE )
+ set(ProfileTestSources
+ ${ProfileTestSources}
+ OProfileJITEventListenerTest.cpp
+ )
+ set(LLVM_LINK_COMPONENTS
+ ${LLVM_LINK_COMPONENTS}
+ OProfileJIT
+ )
+endif( LLVM_USE_OPROFILE )
+
+set(JITTestsSources
+ JITEventListenerTest.cpp
+ JITMemoryManagerTest.cpp
+ JITTest.cpp
+ MultiJITTest.cpp
+ ${ProfileTestSources}
+ )
+
+if(MSVC)
+ list(APPEND JITTestsSources JITTests.def)
+endif()
+
+add_llvm_unittest(JITTests
+ ${JITTestsSources}
+ )
+
+if(MINGW OR CYGWIN)
+ set_property(TARGET JITTests PROPERTY LINK_FLAGS -Wl,--export-all-symbols)
+endif()
diff --git a/unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp b/unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp
index f8d8830..333888a 100644
--- a/unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp
@@ -12,10 +12,10 @@
#include "llvm/LLVMContext.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
+#include "llvm/TypeBuilder.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/CodeGen/MachineCodeInfo.h"
#include "llvm/ExecutionEngine/JIT.h"
-#include "llvm/Support/TypeBuilder.h"
#include "llvm/Support/TargetSelect.h"
#include "gtest/gtest.h"
#include <vector>
diff --git a/unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h b/unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h
index 53608cb..5f02b38 100644
--- a/unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h
+++ b/unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h
@@ -10,18 +10,18 @@
#ifndef JIT_EVENT_LISTENER_TEST_COMMON_H
#define JIT_EVENT_LISTENER_TEST_COMMON_H
-#include "llvm/Analysis/DIBuilder.h"
-#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/TypeBuilder.h"
#include "llvm/CodeGen/MachineCodeInfo.h"
-#include "llvm/Config/config.h"
#include "llvm/ExecutionEngine/JIT.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/TypeBuilder.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Config/config.h"
#include "gtest/gtest.h"
diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp
index fa52321..89f7e8e 100644
--- a/unittests/ExecutionEngine/JIT/JITTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITTest.cpp
@@ -7,29 +7,29 @@
//
//===----------------------------------------------------------------------===//
-#include "gtest/gtest.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Assembly/Parser.h"
#include "llvm/BasicBlock.h"
-#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/Constant.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
-#include "llvm/ExecutionEngine/JIT.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
#include "llvm/Function.h"
#include "llvm/GlobalValue.h"
#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
-#include "llvm/Support/IRBuilder.h"
+#include "llvm/Type.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Assembly/Parser.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TypeBuilder.h"
#include "llvm/Support/TargetSelect.h"
-#include "llvm/Type.h"
+#include "gtest/gtest.h"
#include <vector>
using namespace llvm;
@@ -477,10 +477,11 @@ TEST_F(JITTest, ModuleDeletion) {
}
#endif // !defined(__arm__)
-// ARM and PPC still emit stubs for calls since the target may be too far away
-// to call directly. This #if can probably be removed when
+// ARM, MIPS and PPC still emit stubs for calls since the target may be
+// too far away to call directly. This #if can probably be removed when
// http://llvm.org/PR5201 is fixed.
-#if !defined(__arm__) && !defined(__powerpc__) && !defined(__ppc__)
+#if !defined(__arm__) && !defined(__mips__) && \
+ !defined(__powerpc__) && !defined(__ppc__)
typedef int (*FooPtr) ();
TEST_F(JITTest, NoStubs) {
@@ -554,8 +555,9 @@ TEST_F(JITTest, FunctionPointersOutliveTheirCreator) {
#endif
}
-// ARM doesn't have an implementation of replaceMachineCodeForFunction(), so
-// recompileAndRelinkFunction doesn't work.
+// ARM does not have an implementation of replaceMachineCodeForFunction(),
+// so recompileAndRelinkFunction doesn't work.
#if !defined(__arm__)
TEST_F(JITTest, FunctionIsRecompiledAndRelinked) {
Function *F = Function::Create(TypeBuilder<int(void), false>::get(Context),
diff --git a/unittests/ExecutionEngine/JIT/Makefile b/unittests/ExecutionEngine/JIT/Makefile
index c404fb0..b535a6b 100644
--- a/unittests/ExecutionEngine/JIT/Makefile
+++ b/unittests/ExecutionEngine/JIT/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../../..
TESTNAME = JIT
-LINK_COMPONENTS := asmparser bitreader bitwriter core jit native support
+LINK_COMPONENTS := asmparser bitreader bitwriter jit native
include $(LEVEL)/Makefile.config
diff --git a/unittests/ExecutionEngine/Makefile b/unittests/ExecutionEngine/Makefile
index a0395cd..63508d2 100644
--- a/unittests/ExecutionEngine/Makefile
+++ b/unittests/ExecutionEngine/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../..
TESTNAME = ExecutionEngine
-LINK_COMPONENTS := engine interpreter
+LINK_COMPONENTS := interpreter
PARALLEL_DIRS = JIT
include $(LEVEL)/Makefile.config
diff --git a/unittests/Support/AlignOfTest.cpp b/unittests/Support/AlignOfTest.cpp
new file mode 100644
index 0000000..c45db2c
--- /dev/null
+++ b/unittests/Support/AlignOfTest.cpp
@@ -0,0 +1,328 @@
+//===- llvm/unittest/Support/AlignOfTest.cpp - Alignment utility tests ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Compiler.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+// Disable warnings about questionable type definitions.
+// We're testing that even questionable types work with the alignment utilities.
+#ifdef _MSC_VER
+#pragma warning(disable:4584)
+#endif
+
+// Define some fixed alignment types to use in these tests.
+#if __cplusplus == 201103L || __has_feature(cxx_alignas)
+typedef char alignas(1) A1;
+typedef char alignas(2) A2;
+typedef char alignas(4) A4;
+typedef char alignas(8) A8;
+#elif defined(__clang__) || defined(__GNUC__)
+typedef char A1 __attribute__((aligned(1)));
+typedef char A2 __attribute__((aligned(2)));
+typedef char A4 __attribute__((aligned(4)));
+typedef char A8 __attribute__((aligned(8)));
+#elif defined(_MSC_VER)
+typedef __declspec(align(1)) char A1;
+typedef __declspec(align(2)) char A2;
+typedef __declspec(align(4)) char A4;
+typedef __declspec(align(8)) char A8;
+#else
+# error No supported alignas directive.
+#endif
+
+// Wrap the forced aligned types in structs to hack around compiler bugs.
+struct SA1 { A1 a; };
+struct SA2 { A2 a; };
+struct SA4 { A4 a; };
+struct SA8 { A8 a; };
+
+struct S1 {};
+struct S2 { char a; };
+struct S3 { int x; };
+struct S4 { double y; };
+struct S5 { A1 a1; A2 a2; A4 a4; A8 a8; };
+struct S6 { double f(); };
+struct D1 : S1 {};
+struct D2 : S6 { float g(); };
+struct D3 : S2 {};
+struct D4 : S2 { int x; };
+struct D5 : S3 { char c; };
+struct D6 : S2, S3 {};
+struct D7 : S1, S3 {};
+struct D8 : S1, D4, D5 { double x[2]; };
+struct D9 : S1, D1 { S1 s1; };
+struct V1 { virtual ~V1(); };
+struct V2 { int x; virtual ~V2(); };
+struct V3 : V1 { virtual ~V3(); };
+struct V4 : virtual V2 { int y; virtual ~V4(); };
+struct V5 : V4, V3 { double z; virtual ~V5(); };
+struct V6 : S1 { virtual ~V6(); };
+struct V7 : virtual V2, virtual V6 { virtual ~V7(); };
+struct V8 : V5, virtual V6, V7 { double zz; virtual ~V8(); };
+
+// Ensure alignment is a compile-time constant.
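+// Each array bound below is a boolean constant expression converted to 1, so
+// if any AlignOf<T>::Alignment were not a compile-time constant (or were not
+// positive), these declarations would fail to compile.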
+char LLVM_ATTRIBUTE_UNUSED test_arr1
+ [AlignOf<char>::Alignment > 0]
+ [AlignOf<short>::Alignment > 0]
+ [AlignOf<int>::Alignment > 0]
+ [AlignOf<long>::Alignment > 0]
+ [AlignOf<long long>::Alignment > 0]
+ [AlignOf<float>::Alignment > 0]
+ [AlignOf<double>::Alignment > 0]
+ [AlignOf<long double>::Alignment > 0]
+ [AlignOf<void *>::Alignment > 0]
+ [AlignOf<int *>::Alignment > 0]
+ [AlignOf<double (*)(double)>::Alignment > 0]
+ [AlignOf<double (S6::*)()>::Alignment > 0];
+char LLVM_ATTRIBUTE_UNUSED test_arr2
+ [AlignOf<A1>::Alignment > 0]
+ [AlignOf<A2>::Alignment > 0]
+ [AlignOf<A4>::Alignment > 0]
+ [AlignOf<A8>::Alignment > 0]
+ [AlignOf<SA1>::Alignment > 0]
+ [AlignOf<SA2>::Alignment > 0]
+ [AlignOf<SA4>::Alignment > 0]
+ [AlignOf<SA8>::Alignment > 0];
+char LLVM_ATTRIBUTE_UNUSED test_arr3
+ [AlignOf<S1>::Alignment > 0]
+ [AlignOf<S2>::Alignment > 0]
+ [AlignOf<S3>::Alignment > 0]
+ [AlignOf<S4>::Alignment > 0]
+ [AlignOf<S5>::Alignment > 0]
+ [AlignOf<S6>::Alignment > 0];
+char LLVM_ATTRIBUTE_UNUSED test_arr4
+ [AlignOf<D1>::Alignment > 0]
+ [AlignOf<D2>::Alignment > 0]
+ [AlignOf<D3>::Alignment > 0]
+ [AlignOf<D4>::Alignment > 0]
+ [AlignOf<D5>::Alignment > 0]
+ [AlignOf<D6>::Alignment > 0]
+ [AlignOf<D7>::Alignment > 0]
+ [AlignOf<D8>::Alignment > 0]
+ [AlignOf<D9>::Alignment > 0];
+char LLVM_ATTRIBUTE_UNUSED test_arr5
+ [AlignOf<V1>::Alignment > 0]
+ [AlignOf<V2>::Alignment > 0]
+ [AlignOf<V3>::Alignment > 0]
+ [AlignOf<V4>::Alignment > 0]
+ [AlignOf<V5>::Alignment > 0]
+ [AlignOf<V6>::Alignment > 0]
+ [AlignOf<V7>::Alignment > 0]
+ [AlignOf<V8>::Alignment > 0];
+
+TEST(AlignOfTest, BasicAlignmentInvariants) {
+ // For a very strange reason, many compilers do not support this. Both Clang
+ // and GCC fail to align these properly.
+ EXPECT_EQ(1u, alignOf<A1>());
+#if 0
+ EXPECT_EQ(2u, alignOf<A2>());
+ EXPECT_EQ(4u, alignOf<A4>());
+ EXPECT_EQ(8u, alignOf<A8>());
+#endif
+
+ // But once wrapped in structs, the alignment is correctly managed.
+ EXPECT_LE(1u, alignOf<SA1>());
+ EXPECT_LE(2u, alignOf<SA2>());
+ EXPECT_LE(4u, alignOf<SA4>());
+ EXPECT_LE(8u, alignOf<SA8>());
+
+ EXPECT_EQ(1u, alignOf<char>());
+ EXPECT_LE(alignOf<char>(), alignOf<short>());
+ EXPECT_LE(alignOf<short>(), alignOf<int>());
+ EXPECT_LE(alignOf<int>(), alignOf<long>());
+ EXPECT_LE(alignOf<long>(), alignOf<long long>());
+ EXPECT_LE(alignOf<char>(), alignOf<float>());
+ EXPECT_LE(alignOf<float>(), alignOf<double>());
+ EXPECT_LE(alignOf<char>(), alignOf<long double>());
+ EXPECT_LE(alignOf<char>(), alignOf<void *>());
+ EXPECT_EQ(alignOf<void *>(), alignOf<int *>());
+ EXPECT_LE(alignOf<char>(), alignOf<S1>());
+ EXPECT_LE(alignOf<S1>(), alignOf<S2>());
+ EXPECT_LE(alignOf<S1>(), alignOf<S3>());
+ EXPECT_LE(alignOf<S1>(), alignOf<S4>());
+ EXPECT_LE(alignOf<S1>(), alignOf<S5>());
+ EXPECT_LE(alignOf<S1>(), alignOf<S6>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D1>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D2>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D3>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D4>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D5>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D6>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D7>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D8>());
+ EXPECT_LE(alignOf<S1>(), alignOf<D9>());
+ EXPECT_LE(alignOf<S1>(), alignOf<V1>());
+ EXPECT_LE(alignOf<V1>(), alignOf<V2>());
+ EXPECT_LE(alignOf<V1>(), alignOf<V3>());
+ EXPECT_LE(alignOf<V1>(), alignOf<V4>());
+ EXPECT_LE(alignOf<V1>(), alignOf<V5>());
+ EXPECT_LE(alignOf<V1>(), alignOf<V6>());
+ EXPECT_LE(alignOf<V1>(), alignOf<V7>());
+ EXPECT_LE(alignOf<V1>(), alignOf<V8>());
+}
+
+TEST(AlignOfTest, BasicAlignedArray) {
+ // Note: this code exclusively uses the struct-wrapped arbitrarily aligned
+ // types because of the bugs mentioned above where GCC and Clang both
+ // disregard the arbitrary alignment specifier until the type is used to
+ // declare a member of a struct.
+ EXPECT_LE(1u, alignOf<AlignedCharArray<SA1>::union_type>());
+ EXPECT_LE(2u, alignOf<AlignedCharArray<SA2>::union_type>());
+ EXPECT_LE(4u, alignOf<AlignedCharArray<SA4>::union_type>());
+ EXPECT_LE(8u, alignOf<AlignedCharArray<SA8>::union_type>());
+
+ EXPECT_LE(1u, sizeof(AlignedCharArray<SA1>::union_type));
+ EXPECT_LE(2u, sizeof(AlignedCharArray<SA2>::union_type));
+ EXPECT_LE(4u, sizeof(AlignedCharArray<SA4>::union_type));
+ EXPECT_LE(8u, sizeof(AlignedCharArray<SA8>::union_type));
+
+ EXPECT_EQ(1u, (alignOf<AlignedCharArray<SA1>::union_type>()));
+ EXPECT_EQ(2u, (alignOf<AlignedCharArray<SA1, SA2>::union_type>()));
+ EXPECT_EQ(4u, (alignOf<AlignedCharArray<SA1, SA2, SA4>::union_type>()));
+ EXPECT_EQ(8u, (alignOf<AlignedCharArray<SA1, SA2, SA4, SA8>::union_type>()));
+
+ EXPECT_EQ(1u, sizeof(AlignedCharArray<SA1>::union_type));
+ EXPECT_EQ(2u, sizeof(AlignedCharArray<SA1, SA2>::union_type));
+ EXPECT_EQ(4u, sizeof(AlignedCharArray<SA1, SA2, SA4>::union_type));
+ EXPECT_EQ(8u, sizeof(AlignedCharArray<SA1, SA2, SA4, SA8>::union_type));
+
+ EXPECT_EQ(1u, (alignOf<AlignedCharArray<SA1[1]>::union_type>()));
+ EXPECT_EQ(2u, (alignOf<AlignedCharArray<SA1[2], SA2[1]>::union_type>()));
+ EXPECT_EQ(4u, (alignOf<AlignedCharArray<SA1[42], SA2[55],
+ SA4[13]>::union_type>()));
+ EXPECT_EQ(8u, (alignOf<AlignedCharArray<SA1[2], SA2[1],
+ SA4, SA8>::union_type>()));
+
+ EXPECT_EQ(1u, sizeof(AlignedCharArray<SA1[1]>::union_type));
+ EXPECT_EQ(2u, sizeof(AlignedCharArray<SA1[2], SA2[1]>::union_type));
+ EXPECT_EQ(4u, sizeof(AlignedCharArray<SA1[3], SA2[2], SA4>::union_type));
+ EXPECT_EQ(16u, sizeof(AlignedCharArray<SA1, SA2[3],
+ SA4[3], SA8>::union_type));
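+  // In the last case the largest member (SA4[3], 12 bytes) is padded up to
+  // the strictest member alignment (SA8, 8 bytes), giving a size of 16.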
+
+  // For other tests we simply assert that the alignment of the union matches
+  // that of the fundamental type and hope that we don't have any weird type
+  // productions that would trigger bugs.
+ EXPECT_EQ(alignOf<char>(), alignOf<AlignedCharArray<char>::union_type>());
+ EXPECT_EQ(alignOf<short>(), alignOf<AlignedCharArray<short>::union_type>());
+ EXPECT_EQ(alignOf<int>(), alignOf<AlignedCharArray<int>::union_type>());
+ EXPECT_EQ(alignOf<long>(), alignOf<AlignedCharArray<long>::union_type>());
+ EXPECT_EQ(alignOf<long long>(),
+ alignOf<AlignedCharArray<long long>::union_type>());
+ EXPECT_EQ(alignOf<float>(), alignOf<AlignedCharArray<float>::union_type>());
+ EXPECT_EQ(alignOf<double>(), alignOf<AlignedCharArray<double>::union_type>());
+ EXPECT_EQ(alignOf<long double>(),
+ alignOf<AlignedCharArray<long double>::union_type>());
+ EXPECT_EQ(alignOf<void *>(), alignOf<AlignedCharArray<void *>::union_type>());
+ EXPECT_EQ(alignOf<int *>(), alignOf<AlignedCharArray<int *>::union_type>());
+ EXPECT_EQ(alignOf<double (*)(double)>(),
+ alignOf<AlignedCharArray<double (*)(double)>::union_type>());
+ EXPECT_EQ(alignOf<double (S6::*)()>(),
+ alignOf<AlignedCharArray<double (S6::*)()>::union_type>());
+ EXPECT_EQ(alignOf<S1>(), alignOf<AlignedCharArray<S1>::union_type>());
+ EXPECT_EQ(alignOf<S2>(), alignOf<AlignedCharArray<S2>::union_type>());
+ EXPECT_EQ(alignOf<S3>(), alignOf<AlignedCharArray<S3>::union_type>());
+ EXPECT_EQ(alignOf<S4>(), alignOf<AlignedCharArray<S4>::union_type>());
+ EXPECT_EQ(alignOf<S5>(), alignOf<AlignedCharArray<S5>::union_type>());
+ EXPECT_EQ(alignOf<S6>(), alignOf<AlignedCharArray<S6>::union_type>());
+ EXPECT_EQ(alignOf<D1>(), alignOf<AlignedCharArray<D1>::union_type>());
+ EXPECT_EQ(alignOf<D2>(), alignOf<AlignedCharArray<D2>::union_type>());
+ EXPECT_EQ(alignOf<D3>(), alignOf<AlignedCharArray<D3>::union_type>());
+ EXPECT_EQ(alignOf<D4>(), alignOf<AlignedCharArray<D4>::union_type>());
+ EXPECT_EQ(alignOf<D5>(), alignOf<AlignedCharArray<D5>::union_type>());
+ EXPECT_EQ(alignOf<D6>(), alignOf<AlignedCharArray<D6>::union_type>());
+ EXPECT_EQ(alignOf<D7>(), alignOf<AlignedCharArray<D7>::union_type>());
+ EXPECT_EQ(alignOf<D8>(), alignOf<AlignedCharArray<D8>::union_type>());
+ EXPECT_EQ(alignOf<D9>(), alignOf<AlignedCharArray<D9>::union_type>());
+ EXPECT_EQ(alignOf<V1>(), alignOf<AlignedCharArray<V1>::union_type>());
+ EXPECT_EQ(alignOf<V2>(), alignOf<AlignedCharArray<V2>::union_type>());
+ EXPECT_EQ(alignOf<V3>(), alignOf<AlignedCharArray<V3>::union_type>());
+ EXPECT_EQ(alignOf<V4>(), alignOf<AlignedCharArray<V4>::union_type>());
+ EXPECT_EQ(alignOf<V5>(), alignOf<AlignedCharArray<V5>::union_type>());
+ EXPECT_EQ(alignOf<V6>(), alignOf<AlignedCharArray<V6>::union_type>());
+ EXPECT_EQ(alignOf<V7>(), alignOf<AlignedCharArray<V7>::union_type>());
+
+ // Some versions of MSVC get this wrong somewhat disturbingly. The failure
+  // appears to be benign: alignOf<V8>() produces a preposterous value: 12.
+#ifndef _MSC_VER
+ EXPECT_EQ(alignOf<V8>(), alignOf<AlignedCharArray<V8>::union_type>());
+#endif
+
+ EXPECT_EQ(sizeof(char), sizeof(AlignedCharArray<char>::union_type));
+ EXPECT_EQ(sizeof(char[1]), sizeof(AlignedCharArray<char[1]>::union_type));
+ EXPECT_EQ(sizeof(char[2]), sizeof(AlignedCharArray<char[2]>::union_type));
+ EXPECT_EQ(sizeof(char[3]), sizeof(AlignedCharArray<char[3]>::union_type));
+ EXPECT_EQ(sizeof(char[4]), sizeof(AlignedCharArray<char[4]>::union_type));
+ EXPECT_EQ(sizeof(char[5]), sizeof(AlignedCharArray<char[5]>::union_type));
+ EXPECT_EQ(sizeof(char[8]), sizeof(AlignedCharArray<char[8]>::union_type));
+ EXPECT_EQ(sizeof(char[13]), sizeof(AlignedCharArray<char[13]>::union_type));
+ EXPECT_EQ(sizeof(char[16]), sizeof(AlignedCharArray<char[16]>::union_type));
+ EXPECT_EQ(sizeof(char[21]), sizeof(AlignedCharArray<char[21]>::union_type));
+ EXPECT_EQ(sizeof(char[32]), sizeof(AlignedCharArray<char[32]>::union_type));
+ EXPECT_EQ(sizeof(short), sizeof(AlignedCharArray<short>::union_type));
+ EXPECT_EQ(sizeof(int), sizeof(AlignedCharArray<int>::union_type));
+ EXPECT_EQ(sizeof(long), sizeof(AlignedCharArray<long>::union_type));
+ EXPECT_EQ(sizeof(long long),
+ sizeof(AlignedCharArray<long long>::union_type));
+ EXPECT_EQ(sizeof(float), sizeof(AlignedCharArray<float>::union_type));
+ EXPECT_EQ(sizeof(double), sizeof(AlignedCharArray<double>::union_type));
+ EXPECT_EQ(sizeof(long double),
+ sizeof(AlignedCharArray<long double>::union_type));
+ EXPECT_EQ(sizeof(void *), sizeof(AlignedCharArray<void *>::union_type));
+ EXPECT_EQ(sizeof(int *), sizeof(AlignedCharArray<int *>::union_type));
+ EXPECT_EQ(sizeof(double (*)(double)),
+ sizeof(AlignedCharArray<double (*)(double)>::union_type));
+ EXPECT_EQ(sizeof(double (S6::*)()),
+ sizeof(AlignedCharArray<double (S6::*)()>::union_type));
+ EXPECT_EQ(sizeof(S1), sizeof(AlignedCharArray<S1>::union_type));
+ EXPECT_EQ(sizeof(S2), sizeof(AlignedCharArray<S2>::union_type));
+ EXPECT_EQ(sizeof(S3), sizeof(AlignedCharArray<S3>::union_type));
+ EXPECT_EQ(sizeof(S4), sizeof(AlignedCharArray<S4>::union_type));
+ EXPECT_EQ(sizeof(S5), sizeof(AlignedCharArray<S5>::union_type));
+ EXPECT_EQ(sizeof(S6), sizeof(AlignedCharArray<S6>::union_type));
+ EXPECT_EQ(sizeof(D1), sizeof(AlignedCharArray<D1>::union_type));
+ EXPECT_EQ(sizeof(D2), sizeof(AlignedCharArray<D2>::union_type));
+ EXPECT_EQ(sizeof(D3), sizeof(AlignedCharArray<D3>::union_type));
+ EXPECT_EQ(sizeof(D4), sizeof(AlignedCharArray<D4>::union_type));
+ EXPECT_EQ(sizeof(D5), sizeof(AlignedCharArray<D5>::union_type));
+ EXPECT_EQ(sizeof(D6), sizeof(AlignedCharArray<D6>::union_type));
+ EXPECT_EQ(sizeof(D7), sizeof(AlignedCharArray<D7>::union_type));
+ EXPECT_EQ(sizeof(D8), sizeof(AlignedCharArray<D8>::union_type));
+ EXPECT_EQ(sizeof(D9), sizeof(AlignedCharArray<D9>::union_type));
+ EXPECT_EQ(sizeof(D9[1]), sizeof(AlignedCharArray<D9[1]>::union_type));
+ EXPECT_EQ(sizeof(D9[2]), sizeof(AlignedCharArray<D9[2]>::union_type));
+ EXPECT_EQ(sizeof(D9[3]), sizeof(AlignedCharArray<D9[3]>::union_type));
+ EXPECT_EQ(sizeof(D9[4]), sizeof(AlignedCharArray<D9[4]>::union_type));
+ EXPECT_EQ(sizeof(D9[5]), sizeof(AlignedCharArray<D9[5]>::union_type));
+ EXPECT_EQ(sizeof(D9[8]), sizeof(AlignedCharArray<D9[8]>::union_type));
+ EXPECT_EQ(sizeof(D9[13]), sizeof(AlignedCharArray<D9[13]>::union_type));
+ EXPECT_EQ(sizeof(D9[16]), sizeof(AlignedCharArray<D9[16]>::union_type));
+ EXPECT_EQ(sizeof(D9[21]), sizeof(AlignedCharArray<D9[21]>::union_type));
+ EXPECT_EQ(sizeof(D9[32]), sizeof(AlignedCharArray<D9[32]>::union_type));
+ EXPECT_EQ(sizeof(V1), sizeof(AlignedCharArray<V1>::union_type));
+ EXPECT_EQ(sizeof(V2), sizeof(AlignedCharArray<V2>::union_type));
+ EXPECT_EQ(sizeof(V3), sizeof(AlignedCharArray<V3>::union_type));
+ EXPECT_EQ(sizeof(V4), sizeof(AlignedCharArray<V4>::union_type));
+ EXPECT_EQ(sizeof(V5), sizeof(AlignedCharArray<V5>::union_type));
+ EXPECT_EQ(sizeof(V6), sizeof(AlignedCharArray<V6>::union_type));
+ EXPECT_EQ(sizeof(V7), sizeof(AlignedCharArray<V7>::union_type));
+
+ // Some versions of MSVC also get this wrong. The failure again appears to be
+ // benign: sizeof(V8) is only 52 bytes, but our array reserves 56.
+#ifndef _MSC_VER
+ EXPECT_EQ(sizeof(V8), sizeof(AlignedCharArray<V8>::union_type));
+#endif
+}
+
+}
diff --git a/unittests/Support/BlockFrequencyTest.cpp b/unittests/Support/BlockFrequencyTest.cpp
index df25642..9c5bd7b 100644
--- a/unittests/Support/BlockFrequencyTest.cpp
+++ b/unittests/Support/BlockFrequencyTest.cpp
@@ -34,7 +34,7 @@ TEST(BlockFrequencyTest, MaxToHalfMax) {
BlockFrequency Freq(UINT64_MAX);
BranchProbability Prob(UINT32_MAX / 2, UINT32_MAX);
Freq *= Prob;
- EXPECT_EQ(Freq.getFrequency(), 9223372034707292159LLu);
+ EXPECT_EQ(Freq.getFrequency(), 9223372034707292159ULL);
}
TEST(BlockFrequencyTest, BigToBig) {
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
new file mode 100644
index 0000000..3b9bf84
--- /dev/null
+++ b/unittests/Support/CMakeLists.txt
@@ -0,0 +1,27 @@
+set(LLVM_LINK_COMPONENTS
+ Support
+ Core
+ )
+
+add_llvm_unittest(SupportTests
+ AlignOfTest.cpp
+ AllocatorTest.cpp
+ BlockFrequencyTest.cpp
+ Casting.cpp
+ CommandLineTest.cpp
+ ConstantRangeTest.cpp
+ DataExtractorTest.cpp
+ EndianTest.cpp
+ FileOutputBufferTest.cpp
+ IntegersSubsetTest.cpp
+ LeakDetectorTest.cpp
+ ManagedStatic.cpp
+ MathExtrasTest.cpp
+ Path.cpp
+ raw_ostream_test.cpp
+ RegexTest.cpp
+ SwapByteOrderTest.cpp
+ TimeValue.cpp
+ ValueHandleTest.cpp
+ YAMLParserTest.cpp
+ )
diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp
index 72fa24a..13e9038 100644
--- a/unittests/Support/CommandLineTest.cpp
+++ b/unittests/Support/CommandLineTest.cpp
@@ -55,6 +55,17 @@ TEST(CommandLineTest, ParseEnvironment) {
EXPECT_EQ("hello", EnvironmentTestOption);
}
+// This test used to make valgrind complain
+// ("Conditional jump or move depends on uninitialised value(s)")
+TEST(CommandLineTest, ParseEnvironmentToLocalVar) {
+ // Put cl::opt on stack to check for proper initialization of fields.
+ cl::opt<std::string> EnvironmentTestOptionLocal("env-test-opt-local");
+ TempEnvVar TEV(test_env_var, "-env-test-opt-local=hello-local");
+ EXPECT_EQ("", EnvironmentTestOptionLocal);
+ cl::ParseEnvironmentOptions("CommandLineTest", test_env_var);
+ EXPECT_EQ("hello-local", EnvironmentTestOptionLocal);
+}
+
#endif // SKIP_ENVIRONMENT_TESTS
} // anonymous namespace
diff --git a/unittests/Support/ConstantRangeTest.cpp b/unittests/Support/ConstantRangeTest.cpp
index 742bcb4..263f93c 100644
--- a/unittests/Support/ConstantRangeTest.cpp
+++ b/unittests/Support/ConstantRangeTest.cpp
@@ -114,11 +114,15 @@ TEST_F(ConstantRangeTest, SingleElement) {
}
TEST_F(ConstantRangeTest, GetSetSize) {
- EXPECT_EQ(Full.getSetSize(), APInt(16, 0));
- EXPECT_EQ(Empty.getSetSize(), APInt(16, 0));
- EXPECT_EQ(One.getSetSize(), APInt(16, 1));
- EXPECT_EQ(Some.getSetSize(), APInt(16, 0xaa0));
- EXPECT_EQ(Wrap.getSetSize(), APInt(16, 0x10000 - 0xaa0));
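+  // getSetSize() now reports sizes one bit wider than the range itself, so
+  // the size of a full 16-bit range (65536 elements) is representable.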
+ EXPECT_EQ(Full.getSetSize(), APInt(17, 65536));
+ EXPECT_EQ(Empty.getSetSize(), APInt(17, 0));
+ EXPECT_EQ(One.getSetSize(), APInt(17, 1));
+ EXPECT_EQ(Some.getSetSize(), APInt(17, 0xaa0));
+
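+  // Wrapped ranges: [7, 3) over 4 bits covers {7..15, 0..2}, 12 elements;
+  // [8, 7) covers {8..15, 0..6}, 15 elements.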
+ ConstantRange Wrap(APInt(4, 7), APInt(4, 3));
+ ConstantRange Wrap2(APInt(4, 8), APInt(4, 7));
+ EXPECT_EQ(Wrap.getSetSize(), APInt(5, 12));
+ EXPECT_EQ(Wrap2.getSetSize(), APInt(5, 15));
}
TEST_F(ConstantRangeTest, GetMinsAndMaxes) {
@@ -189,6 +193,10 @@ TEST_F(ConstantRangeTest, ZExt) {
EXPECT_EQ(ZSome, ConstantRange(Some.getLower().zext(20),
Some.getUpper().zext(20)));
EXPECT_EQ(ZWrap, ConstantRange(APInt(20, 0), APInt(20, 0x10000)));
+
+ // zext([5, 0), 3->7) = [5, 8)
+ ConstantRange FiveZero(APInt(3, 5), APInt(3, 0));
+ EXPECT_EQ(FiveZero.zeroExtend(7), ConstantRange(APInt(7, 5), APInt(7, 8)));
}
TEST_F(ConstantRangeTest, SExt) {
@@ -232,6 +240,41 @@ TEST_F(ConstantRangeTest, IntersectWith) {
ConstantRange LHS(APInt(16, 4), APInt(16, 2));
ConstantRange RHS(APInt(16, 6), APInt(16, 5));
EXPECT_TRUE(LHS.intersectWith(RHS) == LHS);
+
+  // Previous bug: the intersection of [min, 3) and [2, max) should be the
+  // single value 2.
+ LHS = ConstantRange(APInt(32, -2147483646), APInt(32, 3));
+ RHS = ConstantRange(APInt(32, 2), APInt(32, 2147483646));
+ EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 2)));
+
+ // [2, 0) /\ [4, 3) = [2, 0)
+ LHS = ConstantRange(APInt(32, 2), APInt(32, 0));
+ RHS = ConstantRange(APInt(32, 4), APInt(32, 3));
+ EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 2), APInt(32, 0)));
+
+ // [2, 0) /\ [4, 2) = [4, 0)
+ LHS = ConstantRange(APInt(32, 2), APInt(32, 0));
+ RHS = ConstantRange(APInt(32, 4), APInt(32, 2));
+ EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 4), APInt(32, 0)));
+
+ // [4, 2) /\ [5, 1) = [5, 1)
+ LHS = ConstantRange(APInt(32, 4), APInt(32, 2));
+ RHS = ConstantRange(APInt(32, 5), APInt(32, 1));
+ EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 5), APInt(32, 1)));
+
+ // [2, 0) /\ [7, 4) = [7, 4)
+ LHS = ConstantRange(APInt(32, 2), APInt(32, 0));
+ RHS = ConstantRange(APInt(32, 7), APInt(32, 4));
+ EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 7), APInt(32, 4)));
+
+ // [4, 2) /\ [1, 0) = [1, 0)
+ LHS = ConstantRange(APInt(32, 4), APInt(32, 2));
+ RHS = ConstantRange(APInt(32, 1), APInt(32, 0));
+ EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 4), APInt(32, 2)));
+
+ // [15, 0) /\ [7, 6) = [15, 0)
+ LHS = ConstantRange(APInt(32, 15), APInt(32, 0));
+ RHS = ConstantRange(APInt(32, 7), APInt(32, 6));
+ EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 15), APInt(32, 0)));
}
TEST_F(ConstantRangeTest, UnionWith) {
@@ -254,6 +297,23 @@ TEST_F(ConstantRangeTest, UnionWith) {
ConstantRange(16));
}
+TEST_F(ConstantRangeTest, SetDifference) {
+ EXPECT_EQ(Full.difference(Empty), Full);
+ EXPECT_EQ(Full.difference(Full), Empty);
+ EXPECT_EQ(Empty.difference(Empty), Empty);
+ EXPECT_EQ(Empty.difference(Full), Empty);
+
+ ConstantRange A(APInt(16, 3), APInt(16, 7));
+ ConstantRange B(APInt(16, 5), APInt(16, 9));
+ ConstantRange C(APInt(16, 3), APInt(16, 5));
+ ConstantRange D(APInt(16, 7), APInt(16, 9));
+ ConstantRange E(APInt(16, 5), APInt(16, 4));
+ ConstantRange F(APInt(16, 7), APInt(16, 3));
+ EXPECT_EQ(A.difference(B), C);
+ EXPECT_EQ(B.difference(A), D);
+ EXPECT_EQ(E.difference(A), F);
+}
+
TEST_F(ConstantRangeTest, SubtractAPInt) {
EXPECT_EQ(Full.subtract(APInt(16, 4)), Full);
EXPECT_EQ(Empty.subtract(APInt(16, 4)), Empty);
@@ -326,6 +386,14 @@ TEST_F(ConstantRangeTest, Multiply) {
EXPECT_EQ(Some.multiply(Wrap), Full);
EXPECT_EQ(Wrap.multiply(Wrap), Full);
+ ConstantRange Zero(APInt(16, 0));
+ EXPECT_EQ(Zero.multiply(Full), Zero);
+ EXPECT_EQ(Zero.multiply(Some), Zero);
+ EXPECT_EQ(Zero.multiply(Wrap), Zero);
+ EXPECT_EQ(Full.multiply(Zero), Zero);
+ EXPECT_EQ(Some.multiply(Zero), Zero);
+ EXPECT_EQ(Wrap.multiply(Zero), Zero);
+
// http://llvm.org/PR4545
EXPECT_EQ(ConstantRange(APInt(4, 1), APInt(4, 6)).multiply(
ConstantRange(APInt(4, 6), APInt(4, 2))),
diff --git a/unittests/Support/FileOutputBufferTest.cpp b/unittests/Support/FileOutputBufferTest.cpp
new file mode 100644
index 0000000..edd350a
--- /dev/null
+++ b/unittests/Support/FileOutputBufferTest.cpp
@@ -0,0 +1,137 @@
+//===- llvm/unittest/Support/FileOutputBuffer.cpp - unit tests ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/PathV2.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::sys;
+
+#define ASSERT_NO_ERROR(x) \
+ if (error_code ASSERT_NO_ERROR_ec = x) { \
+ errs() << #x ": did not return errc::success.\n" \
+ << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \
+ << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \
+ } else {}
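+// The trailing 'else {}' turns the macro into a complete if/else statement,
+// so it nests safely inside surrounding if/else chains.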
+
+namespace {
+
+
+// NOTE: Temporarily run this test on unix only. Once the file mapping
+// routines are ported to Windows, this conditional can be removed.
+#if LLVM_ON_UNIX
+
+
+TEST(FileOutputBuffer, Test) {
+ // Create unique temporary directory for these tests
+ SmallString<128> TestDirectory;
+ {
+ int fd;
+ ASSERT_NO_ERROR(
+ fs::unique_file("FileOutputBuffer-test-%%-%%-%%-%%/dir", fd,
+ TestDirectory));
+ ::close(fd);
+ TestDirectory = path::parent_path(TestDirectory);
+ }
+
+ // TEST 1: Verify commit case.
+ SmallString<128> File1(TestDirectory);
+ File1.append("/file1");
+ {
+ OwningPtr<FileOutputBuffer> Buffer;
+ ASSERT_NO_ERROR(FileOutputBuffer::create(File1, 8192, Buffer));
+ // Start buffer with special header.
+ memcpy(Buffer->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
+ // Write to end of buffer to verify it is writable.
+ memcpy(Buffer->getBufferEnd() - 20, "AABBCCDDEEFFGGHHIIJJ", 20);
+ // Commit buffer.
+ ASSERT_NO_ERROR(Buffer->commit());
+ }
+ // Verify file exists and starts with special header.
+ bool MagicMatches = false;
+ ASSERT_NO_ERROR(fs::has_magic(Twine(File1), Twine("AABBCCDDEEFFGGHHIIJJ"),
+ MagicMatches));
+ EXPECT_TRUE(MagicMatches);
+ // Verify file is correct size.
+ uint64_t File1Size;
+ ASSERT_NO_ERROR(fs::file_size(Twine(File1), File1Size));
+ ASSERT_EQ(File1Size, 8192ULL);
+
+ // TEST 2: Verify abort case.
+ SmallString<128> File2(TestDirectory);
+ File2.append("/file2");
+ {
+ OwningPtr<FileOutputBuffer> Buffer2;
+ ASSERT_NO_ERROR(FileOutputBuffer::create(File2, 8192, Buffer2));
+ // Fill buffer with special header.
+ memcpy(Buffer2->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
+ // Do *not* commit buffer.
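+    // Destroying the buffer without calling commit() is expected to discard
+    // the backing temporary file.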
+ }
+  // Verify file does not exist (because the buffer was not committed).
+ bool Exists = false;
+ ASSERT_NO_ERROR(fs::exists(Twine(File2), Exists));
+ EXPECT_FALSE(Exists);
+
+
+ // TEST 3: Verify sizing down case.
+ SmallString<128> File3(TestDirectory);
+ File3.append("/file3");
+ {
+ OwningPtr<FileOutputBuffer> Buffer;
+ ASSERT_NO_ERROR(FileOutputBuffer::create(File3, 8192000, Buffer));
+ // Start buffer with special header.
+ memcpy(Buffer->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
+ // Write to end of buffer to verify it is writable.
+ memcpy(Buffer->getBufferEnd() - 20, "AABBCCDDEEFFGGHHIIJJ", 20);
+    // Commit the buffer, truncating it to a smaller size.
+ ASSERT_NO_ERROR(Buffer->commit(5000));
+ }
+ // Verify file exists and starts with special header.
+ bool MagicMatches3 = false;
+ ASSERT_NO_ERROR(fs::has_magic(Twine(File3), Twine("AABBCCDDEEFFGGHHIIJJ"),
+ MagicMatches3));
+ EXPECT_TRUE(MagicMatches3);
+ // Verify file is correct size.
+ uint64_t File3Size;
+ ASSERT_NO_ERROR(fs::file_size(Twine(File3), File3Size));
+ ASSERT_EQ(File3Size, 5000ULL);
+
+
+ // TEST 4: Verify file can be made executable.
+ SmallString<128> File4(TestDirectory);
+ File4.append("/file4");
+ {
+ OwningPtr<FileOutputBuffer> Buffer;
+ ASSERT_NO_ERROR(FileOutputBuffer::create(File4, 8192, Buffer,
+ FileOutputBuffer::F_executable));
+ // Start buffer with special header.
+ memcpy(Buffer->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
+ // Commit buffer.
+ ASSERT_NO_ERROR(Buffer->commit());
+ }
+ // Verify file exists and is executable.
+ fs::file_status Status;
+ ASSERT_NO_ERROR(fs::status(Twine(File4), Status));
+ bool IsExecutable = (Status.permissions() & fs::owner_exe);
+ EXPECT_TRUE(IsExecutable);
+
+ // Clean up.
+ uint32_t RemovedCount;
+ ASSERT_NO_ERROR(fs::remove_all(TestDirectory.str(), RemovedCount));
+}
+
+#endif // LLVM_ON_UNIX
+
+} // anonymous namespace
diff --git a/unittests/Support/IntegersSubsetTest.cpp b/unittests/Support/IntegersSubsetTest.cpp
new file mode 100644
index 0000000..5d1dde4
--- /dev/null
+++ b/unittests/Support/IntegersSubsetTest.cpp
@@ -0,0 +1,328 @@
+//===- llvm/unittest/Support/IntegersSubsetTest.cpp - IntegersSubset tests ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/IntegersSubset.h"
+#include "llvm/Support/IntegersSubsetMapping.h"
+
+#include "gtest/gtest.h"
+
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
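+  // APInt does not define relational operators; this wrapper forwards to the
+  // unsigned comparisons (ult/ugt/ule/uge) so Int can be used as an ordered
+  // integer type by the subset templates.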
+ class Int : public APInt {
+ public:
+ Int() {}
+ Int(uint64_t V) : APInt(64, V) {}
+ Int(const APInt& Src) : APInt(Src) {}
+ bool operator < (const APInt& RHS) const { return ult(RHS); }
+ bool operator > (const APInt& RHS) const { return ugt(RHS); }
+ bool operator <= (const APInt& RHS) const { return ule(RHS); }
+ bool operator >= (const APInt& RHS) const { return uge(RHS); }
+ };
+
+ typedef IntRange<Int> Range;
+ typedef IntegersSubsetGeneric<Int> Subset;
+ typedef IntegersSubsetMapping<unsigned,Subset,Int> Mapping;
+
+ TEST(IntegersSubsetTest, GeneralTest) {
+
+ // Test construction.
+
+ std::vector<Range> Ranges;
+ Ranges.reserve(3);
+
+ // Initialize Subset as union of three pairs:
+ // { {0, 8}, {10, 18}, {20, 28} }
+ for (unsigned i = 0; i < 3; ++i)
+ Ranges.push_back(Range(Int(i*10), Int(i*10 + 8)));
+
+ Subset TheSubset(Ranges);
+
+ for (unsigned i = 0; i < 3; ++i) {
+ EXPECT_EQ(TheSubset.getItem(i).getLow(), Int(i*10));
+ EXPECT_EQ(TheSubset.getItem(i).getHigh(), Int(i*10 + 8));
+ }
+
+ EXPECT_EQ(TheSubset.getNumItems(), 3ULL);
+
+  // Test whether values belong to the set.
+
+ EXPECT_TRUE(TheSubset.isSatisfies(Int(5)));
+ EXPECT_FALSE(TheSubset.isSatisfies(Int(9)));
+
+  // Test when the subset contains a single item.
+
+ Ranges.clear();
+ Ranges.push_back(Range(Int(10), Int(10)));
+
+ Subset TheSingleNumber(Ranges);
+
+ EXPECT_TRUE(TheSingleNumber.isSingleNumber());
+
+ Ranges.push_back(Range(Int(12), Int(15)));
+
+ Subset NotASingleNumber(Ranges);
+
+ EXPECT_FALSE(NotASingleNumber.isSingleNumber());
+
+  // Test when the subset contains items that are not ranges but
+  // single numbers.
+
+ Ranges.clear();
+ Ranges.push_back(Range(Int(10), Int(10)));
+ Ranges.push_back(Range(Int(15), Int(19)));
+
+ Subset WithSingleNumberItems(Ranges);
+
+ EXPECT_TRUE(WithSingleNumberItems.isSingleNumber(0));
+ EXPECT_FALSE(WithSingleNumberItems.isSingleNumber(1));
+
+  // Test the size of the subset. Note the subset itself may not be optimized
+  // (improper), so it may contain duplicates, and the size of the subset
+  // { {0, 9}, {5, 9} } will be 15 instead of 10.
+
+ Ranges.clear();
+ Ranges.push_back(Range(Int(0), Int(9)));
+ Ranges.push_back(Range(Int(5), Int(9)));
+
+ Subset NotOptimizedSubset(Ranges);
+
+ EXPECT_EQ(NotOptimizedSubset.getSize(), 15ULL);
+
+  // Test access to a single value.
+  // The getSingleValue(idx) method treats the subset as a flat collection of
+  // numbers, so the subset { {0, 3}, {8, 10} } is represented as the array
+  // { 0, 1, 2, 3, 8, 9, 10 }.
+
+ Ranges.clear();
+ Ranges.push_back(Range(Int(0), Int(3)));
+ Ranges.push_back(Range(Int(8), Int(10)));
+
+ Subset OneMoreSubset(Ranges);
+
+ EXPECT_EQ(OneMoreSubset.getSingleValue(5), Int(9));
+ }
+
+ TEST(IntegersSubsetTest, MappingTest) {
+
+ Mapping::Cases TheCases;
+
+ unsigned Successors[3] = {0, 1, 2};
+
+ // Test construction.
+
+ Mapping TheMapping;
+ for (unsigned i = 0; i < 3; ++i)
+ TheMapping.add(Int(10*i), Int(10*i + 9), Successors + i);
+ TheMapping.add(Int(111), Int(222), Successors);
+ TheMapping.removeItem(--TheMapping.end());
+
+ TheMapping.getCases(TheCases);
+
+ EXPECT_EQ(TheCases.size(), 3ULL);
+
+ for (unsigned i = 0; i < 3; ++i) {
+ Mapping::Cases::iterator CaseIt = TheCases.begin();
+ std::advance(CaseIt, i);
+ EXPECT_EQ(CaseIt->first, Successors + i);
+ EXPECT_EQ(CaseIt->second.getNumItems(), 1ULL);
+ EXPECT_EQ(CaseIt->second.getItem(0), Range(Int(10*i), Int(10*i + 9)));
+ }
+
+ // Test verification.
+
+ Mapping ImproperMapping;
+ ImproperMapping.add(Int(10), Int(11), Successors + 0);
+ ImproperMapping.add(Int(11), Int(12), Successors + 1);
+
+ Mapping::RangeIterator ErrItem;
+ EXPECT_FALSE(ImproperMapping.verify(ErrItem));
+ EXPECT_EQ(ErrItem, --ImproperMapping.end());
+
+ Mapping ProperMapping;
+ ProperMapping.add(Int(10), Int(11), Successors + 0);
+ ProperMapping.add(Int(12), Int(13), Successors + 1);
+
+ EXPECT_TRUE(ProperMapping.verify(ErrItem));
+
+ // Test optimization.
+
+ Mapping ToBeOptimized;
+
+ for (unsigned i = 0; i < 3; ++i) {
+ ToBeOptimized.add(Int(i * 10), Int(i * 10 + 1), Successors + i);
+ ToBeOptimized.add(Int(i * 10 + 2), Int(i * 10 + 9), Successors + i);
+ }
+
+ ToBeOptimized.optimize();
+
+ TheCases.clear();
+ ToBeOptimized.getCases(TheCases);
+
+ EXPECT_EQ(TheCases.size(), 3ULL);
+
+ for (unsigned i = 0; i < 3; ++i) {
+ Mapping::Cases::iterator CaseIt = TheCases.begin();
+ std::advance(CaseIt, i);
+ EXPECT_EQ(CaseIt->first, Successors + i);
+ EXPECT_EQ(CaseIt->second.getNumItems(), 1ULL);
+ EXPECT_EQ(CaseIt->second.getItem(0), Range(Int(i * 10), Int(i * 10 + 9)));
+ }
+ }
+
+ typedef unsigned unsigned_pair[2];
+ typedef unsigned_pair unsigned_ranges[];
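+  // Each entry is an inclusive [low, high] pair; the element counts are
+  // passed separately because the unbounded array type carries no length.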
+
+ void TestDiff(
+ const unsigned_ranges LHS,
+ unsigned LSize,
+ const unsigned_ranges RHS,
+ unsigned RSize,
+ const unsigned_ranges ExcludeRes,
+ unsigned ExcludeResSize,
+ const unsigned_ranges IntersectRes,
+ unsigned IntersectResSize
+ ) {
+
+ Mapping::RangesCollection Ranges;
+
+ Mapping LHSMapping;
+ for (unsigned i = 0; i < LSize; ++i)
+ Ranges.push_back(Range(Int(LHS[i][0]), Int(LHS[i][1])));
+ LHSMapping.add(Ranges);
+
+ Ranges.clear();
+
+ Mapping RHSMapping;
+ for (unsigned i = 0; i < RSize; ++i)
+ Ranges.push_back(Range(Int(RHS[i][0]), Int(RHS[i][1])));
+ RHSMapping.add(Ranges);
+
+ Mapping LExclude, Intersection;
+
+ LHSMapping.diff(&LExclude, &Intersection, 0, RHSMapping);
+
+ if (ExcludeResSize) {
+ EXPECT_EQ(LExclude.size(), ExcludeResSize);
+
+ unsigned i = 0;
+ for (Mapping::RangeIterator rei = LExclude.begin(),
+ e = LExclude.end(); rei != e; ++rei, ++i)
+ EXPECT_EQ(rei->first, Range(ExcludeRes[i][0], ExcludeRes[i][1]));
+ } else
+ EXPECT_TRUE(LExclude.empty());
+
+ if (IntersectResSize) {
+ EXPECT_EQ(Intersection.size(), IntersectResSize);
+
+ unsigned i = 0;
+ for (Mapping::RangeIterator ii = Intersection.begin(),
+ e = Intersection.end(); ii != e; ++ii, ++i)
+ EXPECT_EQ(ii->first, Range(IntersectRes[i][0], IntersectRes[i][1]));
+ } else
+ EXPECT_TRUE(Intersection.empty());
+
+ LExclude.clear();
+ Intersection.clear();
+ RHSMapping.diff(0, &Intersection, &LExclude, LHSMapping);
+
+ // Check LExclude again.
+ if (ExcludeResSize) {
+ EXPECT_EQ(LExclude.size(), ExcludeResSize);
+
+ unsigned i = 0;
+ for (Mapping::RangeIterator rei = LExclude.begin(),
+ e = LExclude.end(); rei != e; ++rei, ++i)
+ EXPECT_EQ(rei->first, Range(ExcludeRes[i][0], ExcludeRes[i][1]));
+ } else
+ EXPECT_TRUE(LExclude.empty());
+ }
+
+ TEST(IntegersSubsetTest, DiffTest) {
+
+ static const unsigned NOT_A_NUMBER = 0xffff;
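+    // Placeholder rows for expected-result tables whose size argument is 0;
+    // TestDiff never reads those entries.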
+
+ {
+ unsigned_ranges LHS = { { 0, 4 }, { 7, 10 }, { 13, 17 } };
+ unsigned_ranges RHS = { { 3, 14 } };
+ unsigned_ranges ExcludeRes = { { 0, 2 }, { 15, 17 } };
+ unsigned_ranges IntersectRes = { { 3, 4 }, { 7, 10 }, { 13, 14 } };
+
+ TestDiff(LHS, 3, RHS, 1, ExcludeRes, 2, IntersectRes, 3);
+ }
+
+ {
+ unsigned_ranges LHS = { { 0, 4 }, { 7, 10 }, { 13, 17 } };
+ unsigned_ranges RHS = { { 0, 4 }, { 13, 17 } };
+ unsigned_ranges ExcludeRes = { { 7, 10 } };
+ unsigned_ranges IntersectRes = { { 0, 4 }, { 13, 17 } };
+
+ TestDiff(LHS, 3, RHS, 2, ExcludeRes, 1, IntersectRes, 2);
+ }
+
+ {
+ unsigned_ranges LHS = { { 0, 17 } };
+ unsigned_ranges RHS = { { 1, 5 }, { 10, 12 }, { 15, 16 } };
+ unsigned_ranges ExcludeRes =
+ { { 0, 0 }, { 6, 9 }, { 13, 14 }, { 17, 17 } };
+ unsigned_ranges IntersectRes = { { 1, 5 }, { 10, 12 }, { 15, 16 } };
+
+ TestDiff(LHS, 1, RHS, 3, ExcludeRes, 4, IntersectRes, 3);
+ }
+
+ {
+ unsigned_ranges LHS = { { 2, 4 } };
+ unsigned_ranges RHS = { { 0, 5 } };
+ unsigned_ranges ExcludeRes = { {NOT_A_NUMBER, NOT_A_NUMBER} };
+ unsigned_ranges IntersectRes = { { 2, 4 } };
+
+ TestDiff(LHS, 1, RHS, 1, ExcludeRes, 0, IntersectRes, 1);
+ }
+
+ {
+ unsigned_ranges LHS = { { 2, 4 } };
+ unsigned_ranges RHS = { { 7, 8 } };
+ unsigned_ranges ExcludeRes = { { 2, 4 } };
+ unsigned_ranges IntersectRes = { {NOT_A_NUMBER, NOT_A_NUMBER} };
+
+ TestDiff(LHS, 1, RHS, 1, ExcludeRes, 1, IntersectRes, 0);
+ }
+
+ {
+ unsigned_ranges LHS = { { 3, 7 } };
+ unsigned_ranges RHS = { { 1, 4 } };
+ unsigned_ranges ExcludeRes = { { 5, 7 } };
+ unsigned_ranges IntersectRes = { { 3, 4 } };
+
+ TestDiff(LHS, 1, RHS, 1, ExcludeRes, 1, IntersectRes, 1);
+ }
+
+ {
+ unsigned_ranges LHS = { { 0, 7 } };
+ unsigned_ranges RHS = { { 0, 5 }, { 6, 9 } };
+ unsigned_ranges ExcludeRes = { {NOT_A_NUMBER, NOT_A_NUMBER} };
+ unsigned_ranges IntersectRes = { { 0, 5 }, {6, 7} };
+
+ TestDiff(LHS, 1, RHS, 2, ExcludeRes, 0, IntersectRes, 2);
+ }
+
+ {
+ unsigned_ranges LHS = { { 17, 17 } };
+ unsigned_ranges RHS = { { 4, 4 } };
+ unsigned_ranges ExcludeRes = { {17, 17} };
+ unsigned_ranges IntersectRes = { { NOT_A_NUMBER, NOT_A_NUMBER } };
+
+ TestDiff(LHS, 1, RHS, 1, ExcludeRes, 1, IntersectRes, 0);
+ }
+ }
+}
diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index 358dad0..a071a5a 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp
@@ -312,4 +312,72 @@ TEST_F(FileSystemTest, Magic) {
}
}
+#if !defined(_WIN32) // FIXME: Win32 has different permission schema.
+TEST_F(FileSystemTest, Permissions) {
+ // Create a temp file.
+ int FileDescriptor;
+ SmallString<64> TempPath;
+ ASSERT_NO_ERROR(
+ fs::unique_file("%%-%%-%%-%%.temp", FileDescriptor, TempPath));
+
+ // Mark file as read-only
+ const fs::perms AllWrite = fs::owner_write|fs::group_write|fs::others_write;
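+  // remove_perms clears the given permission bits rather than assigning the
+  // mask outright, so this strips all write permissions.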
+ ASSERT_NO_ERROR(fs::permissions(Twine(TempPath), fs::remove_perms|AllWrite));
+
+ // Verify file is read-only
+ fs::file_status Status;
+ ASSERT_NO_ERROR(fs::status(Twine(TempPath), Status));
+ bool AnyWriteBits = (Status.permissions() & AllWrite);
+ EXPECT_FALSE(AnyWriteBits);
+
+ // Mark file as read-write
+ ASSERT_NO_ERROR(fs::permissions(Twine(TempPath), fs::add_perms|AllWrite));
+
+ // Verify file is read-write
+ ASSERT_NO_ERROR(fs::status(Twine(TempPath), Status));
+ AnyWriteBits = (Status.permissions() & AllWrite);
+ EXPECT_TRUE(AnyWriteBits);
+}
+#endif
+
+#if !defined(_WIN32) // FIXME: temporarily suppressed.
+TEST_F(FileSystemTest, FileMapping) {
+ // Create a temp file.
+ int FileDescriptor;
+ SmallString<64> TempPath;
+ ASSERT_NO_ERROR(
+ fs::unique_file("%%-%%-%%-%%.temp", FileDescriptor, TempPath));
+
+ // Grow temp file to be 4096 bytes
+ ASSERT_NO_ERROR(sys::fs::resize_file(Twine(TempPath), 4096));
+
+ // Map in temp file and add some content
+ void* MappedMemory;
+ ASSERT_NO_ERROR(fs::map_file_pages(Twine(TempPath), 0, 4096,
+ true /*writable*/, MappedMemory));
+ char* Memory = reinterpret_cast<char*>(MappedMemory);
+ strcpy(Memory, "hello there");
+
+ // Unmap temp file
+ ASSERT_NO_ERROR(fs::unmap_file_pages(MappedMemory, 4096));
+ MappedMemory = NULL;
+ Memory = NULL;
+
+ // Map it back in read-only
+ ASSERT_NO_ERROR(fs::map_file_pages(Twine(TempPath), 0, 4096,
+ false /*read-only*/, MappedMemory));
+
+ // Verify content
+ Memory = reinterpret_cast<char*>(MappedMemory);
+ bool SAME = (strcmp(Memory, "hello there") == 0);
+ EXPECT_TRUE(SAME);
+
+ // Unmap temp file
+ ASSERT_NO_ERROR(fs::unmap_file_pages(MappedMemory, 4096));
+ MappedMemory = NULL;
+ Memory = NULL;
+}
+#endif
+
+
} // anonymous namespace
diff --git a/unittests/Support/YAMLParserTest.cpp b/unittests/Support/YAMLParserTest.cpp
index e88427a..480a573 100644
--- a/unittests/Support/YAMLParserTest.cpp
+++ b/unittests/Support/YAMLParserTest.cpp
@@ -16,11 +16,17 @@
namespace llvm {
+static void SuppressDiagnosticsOutput(const SMDiagnostic &, void *) {
+ // Prevent SourceMgr from writing errors to stderr
+ // to reduce noise in unit test runs.
+}
+
// Checks that the given input gives a parse error. Makes sure that an error
// text is available and the parse fails.
static void ExpectParseError(StringRef Message, StringRef Input) {
SourceMgr SM;
yaml::Stream Stream(Input, SM);
+ SM.setDiagHandler(SuppressDiagnosticsOutput);
EXPECT_FALSE(Stream.validate()) << Message << ": " << Input;
EXPECT_TRUE(Stream.failed()) << Message << ": " << Input;
}
diff --git a/unittests/Transforms/CMakeLists.txt b/unittests/Transforms/CMakeLists.txt
new file mode 100644
index 0000000..e3ce185
--- /dev/null
+++ b/unittests/Transforms/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Utils)
diff --git a/unittests/Transforms/Utils/CMakeLists.txt b/unittests/Transforms/Utils/CMakeLists.txt
new file mode 100644
index 0000000..365bfbb
--- /dev/null
+++ b/unittests/Transforms/Utils/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(LLVM_LINK_COMPONENTS
+ TransformUtils
+ )
+
+add_llvm_unittest(UtilsTests
+ Cloning.cpp
+ Local.cpp
+ )
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index 4243b2d..ea3d5be 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -18,6 +18,7 @@
using namespace llvm;
namespace {
+
class CloneInstruction : public ::testing::Test {
protected:
virtual void SetUp() {
@@ -48,7 +49,6 @@ protected:
LLVMContext context;
Value *V;
};
-}
TEST_F(CloneInstruction, OverflowBits) {
V = new Argument(Type::getInt32Ty(context));
@@ -142,3 +142,5 @@ TEST_F(CloneInstruction, Exact) {
SDiv->setIsExact(true);
EXPECT_TRUE(this->clone(SDiv)->isExact());
}
+
+}
diff --git a/unittests/Transforms/Utils/Local.cpp b/unittests/Transforms/Utils/Local.cpp
index 3026b4b..727f5ea 100644
--- a/unittests/Transforms/Utils/Local.cpp
+++ b/unittests/Transforms/Utils/Local.cpp
@@ -7,13 +7,14 @@
//
//===----------------------------------------------------------------------===//
-#include "gtest/gtest.h"
#include "llvm/BasicBlock.h"
+#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "gtest/gtest.h"
+
using namespace llvm;
TEST(Local, RecursivelyDeleteDeadPHINodes) {
diff --git a/unittests/Transforms/Utils/Makefile b/unittests/Transforms/Utils/Makefile
index fdf4be0..e6c2a2c 100644
--- a/unittests/Transforms/Utils/Makefile
+++ b/unittests/Transforms/Utils/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../../..
TESTNAME = Utils
-LINK_COMPONENTS := core support transformutils
+LINK_COMPONENTS := TransformUtils
include $(LEVEL)/Makefile.config
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/VMCore/CMakeLists.txt b/unittests/VMCore/CMakeLists.txt
new file mode 100644
index 0000000..4025c7a
--- /dev/null
+++ b/unittests/VMCore/CMakeLists.txt
@@ -0,0 +1,35 @@
+set(LLVM_LINK_COMPONENTS
+ asmparser
+ core
+ ipa
+ )
+
+set(VMCoreSources
+ ConstantsTest.cpp
+ DominatorTreeTest.cpp
+ IRBuilderTest.cpp
+ InstructionsTest.cpp
+ MDBuilderTest.cpp
+ MetadataTest.cpp
+ PassManagerTest.cpp
+ TypeBuilderTest.cpp
+ TypesTest.cpp
+ ValueMapTest.cpp
+ VerifierTest.cpp
+ )
+
+# MSVC 8 and 9 cannot compile ValueMapTest.cpp due to a compiler bug.
+# See issue #331418 in Visual Studio.
+if(MSVC AND MSVC_VERSION LESS 1600)
+ list(REMOVE_ITEM VMCoreSources ValueMapTest.cpp)
+endif()
+
+# HACK: Declare a couple of source files as optionally compiled to satisfy the
+# missing-file-checker in LLVM's weird CMake build.
+set(LLVM_OPTIONAL_SOURCES
+ ValueMapTest.cpp
+ )
+
+add_llvm_unittest(VMCoreTests
+ ${VMCoreSources}
+ )
diff --git a/unittests/Support/IRBuilderTest.cpp b/unittests/VMCore/IRBuilderTest.cpp
index b15de9e..b6a3795 100644
--- a/unittests/Support/IRBuilderTest.cpp
+++ b/unittests/VMCore/IRBuilderTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/Support/IRBuilderTest.cpp - IRBuilder tests ----------===//
+//===- llvm/unittest/VMCore/IRBuilderTest.cpp - IRBuilder tests -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,11 +7,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/IRBuilder.h"
#include "llvm/BasicBlock.h"
#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
+#include "llvm/MDBuilder.h"
#include "llvm/Module.h"
#include "llvm/ADT/OwningPtr.h"
@@ -20,13 +21,14 @@
using namespace llvm;
namespace {
+
class IRBuilderTest : public testing::Test {
protected:
virtual void SetUp() {
M.reset(new Module("MyModule", getGlobalContext()));
FunctionType *FTy = FunctionType::get(Type::getVoidTy(getGlobalContext()),
/*isVarArg=*/false);
- Function *F = Function::Create(FTy, Function::ExternalLinkage, "", M.get());
+ F = Function::Create(FTy, Function::ExternalLinkage, "", M.get());
BB = BasicBlock::Create(getGlobalContext(), "", F);
}
@@ -36,9 +38,9 @@ protected:
}
OwningPtr<Module> M;
+ Function *F;
BasicBlock *BB;
};
-}
TEST_F(IRBuilderTest, Lifetime) {
IRBuilder<> Builder(BB);
@@ -70,3 +72,28 @@ TEST_F(IRBuilderTest, Lifetime) {
ASSERT_TRUE(II_End1 != NULL);
EXPECT_EQ(II_End1->getIntrinsicID(), Intrinsic::lifetime_end);
}
+
+TEST_F(IRBuilderTest, CreateCondBr) {
+ IRBuilder<> Builder(BB);
+ BasicBlock *TBB = BasicBlock::Create(getGlobalContext(), "", F);
+ BasicBlock *FBB = BasicBlock::Create(getGlobalContext(), "", F);
+
+ BranchInst *BI = Builder.CreateCondBr(Builder.getTrue(), TBB, FBB);
+ TerminatorInst *TI = BB->getTerminator();
+ EXPECT_EQ(BI, TI);
+ EXPECT_EQ(2u, TI->getNumSuccessors());
+ EXPECT_EQ(TBB, TI->getSuccessor(0));
+ EXPECT_EQ(FBB, TI->getSuccessor(1));
+
+ BI->eraseFromParent();
+ MDNode *Weights = MDBuilder(getGlobalContext()).createBranchWeights(42, 13);
+ BI = Builder.CreateCondBr(Builder.getTrue(), TBB, FBB, Weights);
+ TI = BB->getTerminator();
+ EXPECT_EQ(BI, TI);
+ EXPECT_EQ(2u, TI->getNumSuccessors());
+ EXPECT_EQ(TBB, TI->getSuccessor(0));
+ EXPECT_EQ(FBB, TI->getSuccessor(1));
+ EXPECT_EQ(Weights, TI->getMetadata(LLVMContext::MD_prof));
+}
+
+}
diff --git a/unittests/VMCore/InstructionsTest.cpp b/unittests/VMCore/InstructionsTest.cpp
index d002101..72cdc8b 100644
--- a/unittests/VMCore/InstructionsTest.cpp
+++ b/unittests/VMCore/InstructionsTest.cpp
@@ -7,16 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Instructions.h"
#include "llvm/BasicBlock.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
+#include "llvm/MDBuilder.h"
#include "llvm/Operator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Support/MDBuilder.h"
-#include "llvm/Support/IRBuilder.h"
#include "llvm/Target/TargetData.h"
#include "gtest/gtest.h"
diff --git a/unittests/Support/MDBuilderTest.cpp b/unittests/VMCore/MDBuilderTest.cpp
index d54c7e8..847039b 100644
--- a/unittests/Support/MDBuilderTest.cpp
+++ b/unittests/VMCore/MDBuilderTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittests/Support/MDBuilderTest.cpp - MDBuilder unit tests ----===//
+//===- llvm/unittests/VMCore/MDBuilderTest.cpp - MDBuilder unit tests -----===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,10 +7,12 @@
//
//===----------------------------------------------------------------------===//
-#include "gtest/gtest.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/MDBuilder.h"
#include "llvm/Operator.h"
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Support/MDBuilder.h"
+
+#include "gtest/gtest.h"
+
using namespace llvm;
namespace {
diff --git a/unittests/VMCore/Makefile b/unittests/VMCore/Makefile
index df55065..d743dc5 100644
--- a/unittests/VMCore/Makefile
+++ b/unittests/VMCore/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../..
TESTNAME = VMCore
-LINK_COMPONENTS := core support target ipa asmparser
+LINK_COMPONENTS := core ipa asmparser
include $(LEVEL)/Makefile.config
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/VMCore/PassManagerTest.cpp b/unittests/VMCore/PassManagerTest.cpp
index af845b0..60d33c1 100644
--- a/unittests/VMCore/PassManagerTest.cpp
+++ b/unittests/VMCore/PassManagerTest.cpp
@@ -324,7 +324,7 @@ namespace llvm {
Passes.run(M);
// Some passes must be rerun because a pass that modified the
- // module/function was run inbetween
+ // module/function was run in between
EXPECT_EQ(2, mNDM->run);
EXPECT_EQ(1, mNDNM->run);
EXPECT_EQ(1, mNDM2->run);
diff --git a/unittests/Support/TypeBuilderTest.cpp b/unittests/VMCore/TypeBuilderTest.cpp
index 20d0e73..a746b1f 100644
--- a/unittests/Support/TypeBuilderTest.cpp
+++ b/unittests/VMCore/TypeBuilderTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/Support/TypeBuilderTest.cpp - TypeBuilder tests -----===//
+//===- llvm/unittest/VMCore/TypeBuilderTest.cpp - TypeBuilder tests -------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/TypeBuilder.h"
+#include "llvm/TypeBuilder.h"
#include "llvm/LLVMContext.h"
#include "llvm/ADT/ArrayRef.h"
@@ -167,13 +167,13 @@ TEST(TypeBuilderTest, Context) {
&(TypeBuilder<types::i<1>, true>::get(context2))->getContext());
}
-class MyType {
+struct MyType {
int a;
int *b;
void *array[1];
};
-class MyPortableType {
+struct MyPortableType {
int32_t a;
int32_t *b;
void *array[1];
diff --git a/unittests/VMCore/TypesTest.cpp b/unittests/VMCore/TypesTest.cpp
new file mode 100644
index 0000000..0416643
--- /dev/null
+++ b/unittests/VMCore/TypesTest.cpp
@@ -0,0 +1,30 @@
+//===- llvm/unittest/VMCore/TypesTest.cpp - Type unit tests ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+TEST(TypesTest, StructType) {
+ LLVMContext C;
+
+ // PR13522
+ StructType *Struct = StructType::create(C, "FooBar");
+ EXPECT_EQ("FooBar", Struct->getName());
+ Struct->setName(Struct->getName().substr(0, 3));
+ EXPECT_EQ("Foo", Struct->getName());
+ Struct->setName("");
+ EXPECT_TRUE(Struct->getName().empty());
+ EXPECT_FALSE(Struct->hasName());
+}
+
+} // end anonymous namespace
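
The substring dance above is deliberate: setName() can receive a StringRef that points into the type's own name storage, so the implementation must copy the new name before releasing the old buffer. The hazard in miniature (a toy, not LLVM's implementation):

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static char *Name = strdup("FooBar");

// NewName may alias Name's current buffer, so copy first, free second.
void setName(const char *NewName, size_t Len) {
  char *Copy = static_cast<char *>(malloc(Len + 1));
  memcpy(Copy, NewName, Len);   // copy out of the (possibly aliased) buffer
  Copy[Len] = '\0';
  free(Name);                   // only now is the old storage safe to drop
  Name = Copy;
}

int main() {
  setName(Name, 3);             // mirrors setName(getName().substr(0, 3))
  puts(Name);                   // Foo
}
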
diff --git a/utils/GetRepositoryPath b/utils/GetRepositoryPath
index 326231c..f3b0cc5 100755
--- a/utils/GetRepositoryPath
+++ b/utils/GetRepositoryPath
@@ -16,7 +16,7 @@ fi
cd $1
if [ -d .svn ]; then
svn info | grep 'URL:' | cut -d: -f2-
-elif [ -d .git/svn ]; then
+elif [ -f .git/svn/.metadata ]; then
git svn info | grep 'URL:' | cut -d: -f2-
elif [ -d .git ]; then
git remote -v | grep 'fetch' | awk '{ print $2 }'
diff --git a/utils/GetSourceVersion b/utils/GetSourceVersion
index cbed7da..b57a6aa 100755
--- a/utils/GetSourceVersion
+++ b/utils/GetSourceVersion
@@ -16,7 +16,7 @@ fi
cd $1
if [ -d .svn ]; then
svnversion | sed -e "s#\([0-9]*\)[A-Z]*#\1#"
-elif [ -d .git/svn ]; then
+elif [ -f .git/svn/.metadata ]; then
git svn info | grep 'Revision:' | cut -d: -f2-
elif [ -d .git ]; then
git log -1 --pretty=format:%H
diff --git a/utils/Makefile b/utils/Makefile
index ecb30be..7a3c17d 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -9,7 +9,7 @@
LEVEL = ..
PARALLEL_DIRS := FileCheck FileUpdate TableGen PerfectShuffle \
- count fpcmp llvm-lit not unittest
+ count fpcmp llvm-lit not unittest yaml2obj
EXTRA_DIST := check-each-file codegen-diff countloc.sh \
DSAclean.py DSAextract.py emacs findsym.pl GenLibDeps.pl \
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index dc92a6c..026d47f 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -96,9 +96,7 @@
//
//===----------------------------------------------------------------------===//
-#include "AsmMatcherEmitter.h"
#include "CodeGenTarget.h"
-#include "StringMatcher.h"
#include "StringToOffsetTable.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/PointerUnion.h"
@@ -111,6 +109,9 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/StringMatcher.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <cassert>
#include <map>
#include <set>
using namespace llvm;
@@ -123,6 +124,14 @@ namespace {
class AsmMatcherInfo;
struct SubtargetFeatureInfo;
+class AsmMatcherEmitter {
+ RecordKeeper &Records;
+public:
+ AsmMatcherEmitter(RecordKeeper &R) : Records(R) {}
+
+ void run(raw_ostream &o);
+};
+
/// ClassInfo - Helper class for storing the information about a particular
/// class of operands which can be matched.
struct ClassInfo {
@@ -177,6 +186,8 @@ struct ClassInfo {
/// For register classes, the records for all the registers in this class.
std::set<Record*> Registers;
+ /// For custom match classes, the diagnostic kind for when the predicate fails.
+ std::string DiagnosticType;
public:
/// isRegisterClass() - Check if this is a register class.
bool isRegisterClass() const {
@@ -385,7 +396,7 @@ struct MatchableInfo {
/// ResOperands - This is the operand list that should be built for the result
/// MCInst.
- std::vector<ResOperand> ResOperands;
+ SmallVector<ResOperand, 8> ResOperands;
/// AsmString - The assembly string for this instruction (with variants
/// removed), e.g. "movsx $src, $dst".
@@ -399,7 +410,7 @@ struct MatchableInfo {
/// annotated with a class and where in the OperandList they were defined.
/// This directly corresponds to the tokenized AsmString after the mnemonic is
/// removed.
- SmallVector<AsmOperand, 4> AsmOperands;
+ SmallVector<AsmOperand, 8> AsmOperands;
/// Predicates - The required subtarget features to match this instruction.
SmallVector<SubtargetFeatureInfo*, 4> RequiredFeatures;
@@ -419,13 +430,17 @@ struct MatchableInfo {
AsmString(Alias->AsmString) {
}
- void Initialize(const AsmMatcherInfo &Info,
+ // Two-operand aliases clone from the main matchable, but mark the second
+ // operand as a tied operand of the first for purposes of the assembler.
+ void formTwoOperandAlias(StringRef Constraint);
+
+ void initialize(const AsmMatcherInfo &Info,
SmallPtrSet<Record*, 16> &SingletonRegisters,
int AsmVariantNo, std::string &RegisterPrefix);
- /// Validate - Return true if this matchable is a valid thing to match against
+ /// validate - Return true if this matchable is a valid thing to match against
/// and perform a bunch of validity checking.
- bool Validate(StringRef CommentDelimiter, bool Hack) const;
+ bool validate(StringRef CommentDelimiter, bool Hack) const;
/// extractSingletonRegisterForAsmOperand - Extract singleton register,
/// if present, from specified token.
@@ -433,9 +448,9 @@ struct MatchableInfo {
extractSingletonRegisterForAsmOperand(unsigned i, const AsmMatcherInfo &Info,
std::string &RegisterPrefix);
- /// FindAsmOperand - Find the AsmOperand with the specified name and
+ /// findAsmOperand - Find the AsmOperand with the specified name and
/// suboperand index.
- int FindAsmOperand(StringRef N, int SubOpIdx) const {
+ int findAsmOperand(StringRef N, int SubOpIdx) const {
for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i)
if (N == AsmOperands[i].SrcOpName &&
SubOpIdx == AsmOperands[i].SubOpIdx)
@@ -443,17 +458,17 @@ struct MatchableInfo {
return -1;
}
- /// FindAsmOperandNamed - Find the first AsmOperand with the specified name.
+ /// findAsmOperandNamed - Find the first AsmOperand with the specified name.
/// This does not check the suboperand index.
- int FindAsmOperandNamed(StringRef N) const {
+ int findAsmOperandNamed(StringRef N) const {
for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i)
if (N == AsmOperands[i].SrcOpName)
return i;
return -1;
}
- void BuildInstructionResultOperands();
- void BuildAliasResultOperands();
+ void buildInstructionResultOperands();
+ void buildAliasResultOperands();
/// operator< - Compare two matchables.
bool operator<(const MatchableInfo &RHS) const {
@@ -465,7 +480,7 @@ struct MatchableInfo {
return AsmOperands.size() < RHS.AsmOperands.size();
// Compare lexicographically by operand. The matcher validates that other
- // orderings wouldn't be ambiguous using \see CouldMatchAmbiguouslyWith().
+ // orderings wouldn't be ambiguous using \see couldMatchAmbiguouslyWith().
for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) {
if (*AsmOperands[i].Class < *RHS.AsmOperands[i].Class)
return true;
@@ -476,10 +491,10 @@ struct MatchableInfo {
return false;
}
- /// CouldMatchAmbiguouslyWith - Check whether this matchable could
+ /// couldMatchAmbiguouslyWith - Check whether this matchable could
/// ambiguously match the same set of operands as \arg RHS (without being a
/// strictly superior match).
- bool CouldMatchAmbiguouslyWith(const MatchableInfo &RHS) {
+ bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS) {
// The primary comparator is the instruction mnemonic.
if (Mnemonic != RHS.Mnemonic)
return false;
@@ -518,7 +533,7 @@ struct MatchableInfo {
void dump();
private:
- void TokenizeAsmString(const AsmMatcherInfo &Info);
+ void tokenizeAsmString(const AsmMatcherInfo &Info);
};
/// SubtargetFeatureInfo - Helper class for storing information on a subtarget
@@ -543,7 +558,7 @@ struct OperandMatchEntry {
MatchableInfo* MI;
ClassInfo *CI;
- static OperandMatchEntry Create(MatchableInfo* mi, ClassInfo *ci,
+ static OperandMatchEntry create(MatchableInfo* mi, ClassInfo *ci,
unsigned opMask) {
OperandMatchEntry X;
X.OperandMask = opMask;
@@ -580,6 +595,9 @@ public:
/// Map of Predicate records to their subtarget information.
std::map<Record*, SubtargetFeatureInfo*> SubtargetFeatures;
+ /// Map of AsmOperandClass records to their class information.
+ std::map<Record*, ClassInfo*> AsmOperandClasses;
+
private:
/// Map of token to class information which has already been constructed.
std::map<std::string, ClassInfo*> TokenClasses;
@@ -587,9 +605,6 @@ private:
/// Map of RegisterClass records to their class information.
std::map<Record*, ClassInfo*> RegisterClassClasses;
- /// Map of AsmOperandClass records to their class information.
- std::map<Record*, ClassInfo*> AsmOperandClasses;
-
private:
/// getTokenClass - Lookup or create the class for the given token.
ClassInfo *getTokenClass(StringRef Token);
@@ -599,17 +614,17 @@ private:
int SubOpIdx);
ClassInfo *getOperandClass(Record *Rec, int SubOpIdx);
- /// BuildRegisterClasses - Build the ClassInfo* instances for register
+ /// buildRegisterClasses - Build the ClassInfo* instances for register
/// classes.
- void BuildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters);
+ void buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters);
- /// BuildOperandClasses - Build the ClassInfo* instances for user defined
+ /// buildOperandClasses - Build the ClassInfo* instances for user defined
/// operand classes.
- void BuildOperandClasses();
+ void buildOperandClasses();
- void BuildInstructionOperandReference(MatchableInfo *II, StringRef OpName,
+ void buildInstructionOperandReference(MatchableInfo *II, StringRef OpName,
unsigned AsmOpIdx);
- void BuildAliasOperandReference(MatchableInfo *II, StringRef OpName,
+ void buildAliasOperandReference(MatchableInfo *II, StringRef OpName,
MatchableInfo::AsmOperand &Op);
public:
@@ -617,12 +632,12 @@ public:
CodeGenTarget &Target,
RecordKeeper &Records);
- /// BuildInfo - Construct the various tables used during matching.
- void BuildInfo();
+ /// buildInfo - Construct the various tables used during matching.
+ void buildInfo();
- /// BuildOperandMatchInfo - Build the necessary information to handle user
+ /// buildOperandMatchInfo - Build the necessary information to handle user
/// defined operand parsing methods.
- void BuildOperandMatchInfo();
+ void buildOperandMatchInfo();
/// getSubtargetFeature - Lookup or create the subtarget feature info for the
/// given operand.
@@ -638,7 +653,7 @@ public:
}
};
-}
+} // End anonymous namespace
void MatchableInfo::dump() {
errs() << TheDef->getName() << " -- " << "flattened:\"" << AsmString <<"\"\n";
@@ -650,14 +665,86 @@ void MatchableInfo::dump() {
}
}
-void MatchableInfo::Initialize(const AsmMatcherInfo &Info,
+static std::pair<StringRef, StringRef>
+parseTwoOperandConstraint(StringRef S, SMLoc Loc) {
+ // Split via the '='.
+ std::pair<StringRef, StringRef> Ops = S.split('=');
+ if (Ops.second == "")
+ throw TGError(Loc, "missing '=' in two-operand alias constraint");
+ // Trim whitespace and the leading '$' on the operand names.
+ size_t start = Ops.first.find_first_of('$');
+ if (start == std::string::npos)
+ throw TGError(Loc, "expected '$' prefix on asm operand name");
+ Ops.first = Ops.first.slice(start + 1, std::string::npos);
+ size_t end = Ops.first.find_last_of(" \t");
+ Ops.first = Ops.first.slice(0, end);
+ // Now the second operand.
+ start = Ops.second.find_first_of('$');
+ if (start == std::string::npos)
+ throw TGError(Loc, "expected '$' prefix on asm operand name");
+ Ops.second = Ops.second.slice(start + 1, std::string::npos);
+ end = Ops.second.find_last_of(" \t");
+ Ops.second = Ops.second.slice(0, end);
+ return Ops;
+}
+
+void MatchableInfo::formTwoOperandAlias(StringRef Constraint) {
+ // Figure out which operands are aliased and mark them as tied.
+ std::pair<StringRef, StringRef> Ops =
+ parseTwoOperandConstraint(Constraint, TheDef->getLoc());
+
+ // Find the AsmOperands that refer to the operands we're aliasing.
+ int SrcAsmOperand = findAsmOperandNamed(Ops.first);
+ int DstAsmOperand = findAsmOperandNamed(Ops.second);
+ if (SrcAsmOperand == -1)
+ throw TGError(TheDef->getLoc(),
+ "unknown source two-operand alias operand '" +
+ Ops.first.str() + "'.");
+ if (DstAsmOperand == -1)
+ throw TGError(TheDef->getLoc(),
+ "unknown destination two-operand alias operand '" +
+ Ops.second.str() + "'.");
+
+ // Find the ResOperand that refers to the operand we're aliasing away
+ // and update it to refer to the combined operand instead.
+ for (unsigned i = 0, e = ResOperands.size(); i != e; ++i) {
+ ResOperand &Op = ResOperands[i];
+ if (Op.Kind == ResOperand::RenderAsmOperand &&
+ Op.AsmOperandNum == (unsigned)SrcAsmOperand) {
+ Op.AsmOperandNum = DstAsmOperand;
+ break;
+ }
+ }
+ // Remove the AsmOperand for the alias operand.
+ AsmOperands.erase(AsmOperands.begin() + SrcAsmOperand);
+ // Adjust the ResOperand references to any AsmOperands that followed
+ // the one we just deleted.
+ for (unsigned i = 0, e = ResOperands.size(); i != e; ++i) {
+ ResOperand &Op = ResOperands[i];
+ switch(Op.Kind) {
+ default:
+ // Nothing to do for operands that don't reference AsmOperands.
+ break;
+ case ResOperand::RenderAsmOperand:
+ if (Op.AsmOperandNum > (unsigned)SrcAsmOperand)
+ --Op.AsmOperandNum;
+ break;
+ case ResOperand::TiedOperand:
+ if (Op.TiedOperandNum > (unsigned)SrcAsmOperand)
+ --Op.TiedOperandNum;
+ break;
+ }
+ }
+}
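
The renumbering pass at the end is the subtle part: erasing AsmOperands[SrcAsmOperand] shifts every later operand down by one, so the ResOperand references must follow. In miniature, with hypothetical indices for a "$dst = $src" alias (Src = 1 aliased onto Dst = 0):

#include <cstdio>
#include <vector>

int main() {
  // Toy ResOperand references into AsmOperands before the erase.
  std::vector<unsigned> Refs = {0, 1, 2, 3};
  const unsigned Src = 1, Dst = 0;
  for (unsigned &R : Refs) {
    if (R == Src)     R = Dst;   // retarget the aliased operand
    else if (R > Src) --R;       // close the gap the erase leaves
  }
  for (unsigned R : Refs) std::printf("%u ", R);   // 0 0 1 2
}
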
+
+void MatchableInfo::initialize(const AsmMatcherInfo &Info,
SmallPtrSet<Record*, 16> &SingletonRegisters,
int AsmVariantNo, std::string &RegisterPrefix) {
AsmVariantID = AsmVariantNo;
AsmString =
CodeGenInstruction::FlattenAsmStringVariants(AsmString, AsmVariantNo);
- TokenizeAsmString(Info);
+ tokenizeAsmString(Info);
// Compute the require features.
std::vector<Record*> Predicates =TheDef->getValueAsListOfDefs("Predicates");
@@ -674,8 +761,8 @@ void MatchableInfo::Initialize(const AsmMatcherInfo &Info,
}
}
-/// TokenizeAsmString - Tokenize a simplified assembly string.
-void MatchableInfo::TokenizeAsmString(const AsmMatcherInfo &Info) {
+/// tokenizeAsmString - Tokenize a simplified assembly string.
+void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info) {
StringRef String = AsmString;
unsigned Prev = 0;
bool InTok = true;
@@ -749,6 +836,9 @@ void MatchableInfo::TokenizeAsmString(const AsmMatcherInfo &Info) {
throw TGError(TheDef->getLoc(),
"Instruction '" + TheDef->getName() + "' has no tokens");
Mnemonic = AsmOperands[0].Token;
+ if (Mnemonic.empty())
+ throw TGError(TheDef->getLoc(),
+ "Missing instruction mnemonic");
// FIXME : Check and raise an error if it is a register.
if (Mnemonic[0] == '$')
throw TGError(TheDef->getLoc(),
@@ -758,7 +848,7 @@ void MatchableInfo::TokenizeAsmString(const AsmMatcherInfo &Info) {
AsmOperands.erase(AsmOperands.begin());
}
-bool MatchableInfo::Validate(StringRef CommentDelimiter, bool Hack) const {
+bool MatchableInfo::validate(StringRef CommentDelimiter, bool Hack) const {
// Reject matchables with no .s string.
if (AsmString.empty())
throw TGError(TheDef->getLoc(), "instruction with empty asm string");
@@ -872,6 +962,7 @@ ClassInfo *AsmMatcherInfo::getTokenClass(StringRef Token) {
Entry->PredicateMethod = "<invalid>";
Entry->RenderMethod = "<invalid>";
Entry->ParserMethod = "";
+ Entry->DiagnosticType = "";
Classes.push_back(Entry);
}
@@ -929,7 +1020,7 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
}
void AsmMatcherInfo::
-BuildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
+buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
const std::vector<CodeGenRegister*> &Registers =
Target.getRegBank().getRegisters();
ArrayRef<CodeGenRegisterClass*> RegClassList =
@@ -997,6 +1088,8 @@ BuildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
CI->PredicateMethod = ""; // unused
CI->RenderMethod = "addRegOperands";
CI->Registers = *it;
+ // FIXME: diagnostic type.
+ CI->DiagnosticType = "";
Classes.push_back(CI);
RegisterSetClasses.insert(std::make_pair(*it, CI));
}
@@ -1054,7 +1147,7 @@ BuildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
}
}
-void AsmMatcherInfo::BuildOperandClasses() {
+void AsmMatcherInfo::buildOperandClasses() {
std::vector<Record*> AsmOperands =
Records.getAllDerivedDefinitions("AsmOperandClass");
@@ -1112,6 +1205,12 @@ void AsmMatcherInfo::BuildOperandClasses() {
if (StringInit *SI = dynamic_cast<StringInit*>(PRMName))
CI->ParserMethod = SI->getValue();
+ // Get the diagnostic type, or leave it empty if unspecified; this names
+ // the Match_* kind returned when the class's predicate fails.
+ Init *DiagnosticType = (*it)->getValueInit("DiagnosticType");
+ if (StringInit *SI = dynamic_cast<StringInit*>(DiagnosticType))
+ CI->DiagnosticType = SI->getValue();
+
AsmOperandClasses[*it] = CI;
Classes.push_back(CI);
}
@@ -1123,11 +1222,11 @@ AsmMatcherInfo::AsmMatcherInfo(Record *asmParser,
: Records(records), AsmParser(asmParser), Target(target) {
}
-/// BuildOperandMatchInfo - Build the necessary information to handle user
+/// buildOperandMatchInfo - Build the necessary information to handle user
/// defined operand parsing methods.
-void AsmMatcherInfo::BuildOperandMatchInfo() {
+void AsmMatcherInfo::buildOperandMatchInfo() {
- /// Map containing a mask with all operands indicies that can be found for
+ /// Map containing a mask with all operand indices that can be found for
/// that class inside an instruction.
std::map<ClassInfo*, unsigned> OpClassMask;
@@ -1152,12 +1251,12 @@ void AsmMatcherInfo::BuildOperandMatchInfo() {
iie = OpClassMask.end(); iit != iie; ++iit) {
unsigned OpMask = iit->second;
ClassInfo *CI = iit->first;
- OperandMatchInfo.push_back(OperandMatchEntry::Create(&II, CI, OpMask));
+ OperandMatchInfo.push_back(OperandMatchEntry::create(&II, CI, OpMask));
}
}
}
-void AsmMatcherInfo::BuildInfo() {
+void AsmMatcherInfo::buildInfo() {
// Build information about all of the AssemblerPredicates.
std::vector<Record*> AllPredicates =
Records.getAllDerivedDefinitions("Predicate");
@@ -1222,11 +1321,11 @@ void AsmMatcherInfo::BuildInfo() {
OwningPtr<MatchableInfo> II(new MatchableInfo(CGI));
- II->Initialize(*this, SingletonRegisters, AsmVariantNo, RegisterPrefix);
+ II->initialize(*this, SingletonRegisters, AsmVariantNo, RegisterPrefix);
// Ignore instructions which shouldn't be matched and diagnose invalid
// instruction definitions with an error.
- if (!II->Validate(CommentDelimiter, true))
+ if (!II->validate(CommentDelimiter, true))
continue;
// Ignore "Int_*" and "*_Int" instructions, which are internal aliases.
@@ -1255,29 +1354,30 @@ void AsmMatcherInfo::BuildInfo() {
OwningPtr<MatchableInfo> II(new MatchableInfo(Alias));
- II->Initialize(*this, SingletonRegisters, AsmVariantNo, RegisterPrefix);
+ II->initialize(*this, SingletonRegisters, AsmVariantNo, RegisterPrefix);
// Validate the alias definitions.
- II->Validate(CommentDelimiter, false);
+ II->validate(CommentDelimiter, false);
Matchables.push_back(II.take());
}
}
// Build info for the register classes.
- BuildRegisterClasses(SingletonRegisters);
+ buildRegisterClasses(SingletonRegisters);
// Build info for the user defined assembly operand classes.
- BuildOperandClasses();
+ buildOperandClasses();
// Build the information about matchables, now that we have fully formed
// classes.
+ std::vector<MatchableInfo*> NewMatchables;
for (std::vector<MatchableInfo*>::iterator it = Matchables.begin(),
ie = Matchables.end(); it != ie; ++it) {
MatchableInfo *II = *it;
// Parse the tokens after the mnemonic.
- // Note: BuildInstructionOperandReference may insert new AsmOperands, so
+ // Note: buildInstructionOperandReference may insert new AsmOperands, so
// don't precompute the loop bound.
for (unsigned i = 0; i != II->AsmOperands.size(); ++i) {
MatchableInfo::AsmOperand &Op = II->AsmOperands[i];
@@ -1310,16 +1410,34 @@ void AsmMatcherInfo::BuildInfo() {
OperandName = Token.substr(1);
if (II->DefRec.is<const CodeGenInstruction*>())
- BuildInstructionOperandReference(II, OperandName, i);
+ buildInstructionOperandReference(II, OperandName, i);
else
- BuildAliasOperandReference(II, OperandName, Op);
+ buildAliasOperandReference(II, OperandName, Op);
}
- if (II->DefRec.is<const CodeGenInstruction*>())
- II->BuildInstructionResultOperands();
- else
- II->BuildAliasResultOperands();
+ if (II->DefRec.is<const CodeGenInstruction*>()) {
+ II->buildInstructionResultOperands();
+ // If the instruction has a two-operand alias, build up the
+ // matchable here. We'll add them in bulk at the end to avoid
+ // confusing this loop.
+ std::string Constraint =
+ II->TheDef->getValueAsString("TwoOperandAliasConstraint");
+ if (Constraint != "") {
+ // Start by making a copy of the original matchable.
+ OwningPtr<MatchableInfo> AliasII(new MatchableInfo(*II));
+
+ // Adjust it to be a two-operand alias.
+ AliasII->formTwoOperandAlias(Constraint);
+
+ // Add the alias to the matchables list.
+ NewMatchables.push_back(AliasII.take());
+ }
+ } else
+ II->buildAliasResultOperands();
}
+ if (!NewMatchables.empty())
+ Matchables.insert(Matchables.end(), NewMatchables.begin(),
+ NewMatchables.end());
// Process token alias definitions and set up the associated superclass
// information.
@@ -1339,10 +1457,10 @@ void AsmMatcherInfo::BuildInfo() {
std::sort(Classes.begin(), Classes.end(), less_ptr<ClassInfo>());
}
-/// BuildInstructionOperandReference - The specified operand is a reference to a
+/// buildInstructionOperandReference - The specified operand is a reference to a
/// named operand such as $src. Resolve the Class and OperandInfo pointers.
void AsmMatcherInfo::
-BuildInstructionOperandReference(MatchableInfo *II,
+buildInstructionOperandReference(MatchableInfo *II,
StringRef OperandName,
unsigned AsmOpIdx) {
const CodeGenInstruction &CGI = *II->DefRec.get<const CodeGenInstruction*>();
@@ -1399,10 +1517,10 @@ BuildInstructionOperandReference(MatchableInfo *II,
Op->SrcOpName = OperandName;
}
-/// BuildAliasOperandReference - When parsing an operand reference out of the
+/// buildAliasOperandReference - When parsing an operand reference out of the
/// matching string (e.g. "movsx $src, $dst"), determine what the class of the
/// operand reference is by looking it up in the result pattern definition.
-void AsmMatcherInfo::BuildAliasOperandReference(MatchableInfo *II,
+void AsmMatcherInfo::buildAliasOperandReference(MatchableInfo *II,
StringRef OperandName,
MatchableInfo::AsmOperand &Op) {
const CodeGenInstAlias &CGA = *II->DefRec.get<const CodeGenInstAlias*>();
@@ -1427,7 +1545,7 @@ void AsmMatcherInfo::BuildAliasOperandReference(MatchableInfo *II,
OperandName.str() + "'");
}
-void MatchableInfo::BuildInstructionResultOperands() {
+void MatchableInfo::buildInstructionResultOperands() {
const CodeGenInstruction *ResultInst = getResultInst();
// Loop over all operands of the result instruction, determining how to
@@ -1443,7 +1561,7 @@ void MatchableInfo::BuildInstructionResultOperands() {
}
// Find out what operand from the asmparser this MCInst operand comes from.
- int SrcOperand = FindAsmOperandNamed(OpInfo.Name);
+ int SrcOperand = findAsmOperandNamed(OpInfo.Name);
if (OpInfo.Name.empty() || SrcOperand == -1)
throw TGError(TheDef->getLoc(), "Instruction '" +
TheDef->getName() + "' has operand '" + OpInfo.Name +
@@ -1466,7 +1584,7 @@ void MatchableInfo::BuildInstructionResultOperands() {
}
}
-void MatchableInfo::BuildAliasResultOperands() {
+void MatchableInfo::buildAliasResultOperands() {
const CodeGenInstAlias &CGA = *DefRec.get<const CodeGenInstAlias*>();
const CodeGenInstruction *ResultInst = getResultInst();
@@ -1495,7 +1613,7 @@ void MatchableInfo::BuildAliasResultOperands() {
switch (CGA.ResultOperands[AliasOpNo].Kind) {
case CodeGenInstAlias::ResultOperand::K_Record: {
StringRef Name = CGA.ResultOperands[AliasOpNo].getName();
- int SrcOperand = FindAsmOperand(Name, SubIdx);
+ int SrcOperand = findAsmOperand(Name, SubIdx);
if (SrcOperand == -1)
throw TGError(TheDef->getLoc(), "Instruction '" +
TheDef->getName() + "' has operand '" + OpName +
@@ -1520,7 +1638,7 @@ void MatchableInfo::BuildAliasResultOperands() {
}
}
-static void EmitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
+static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
std::vector<MatchableInfo*> &Infos,
raw_ostream &OS) {
// Write the convert function to a separate stream, so we can drop it after
@@ -1661,8 +1779,8 @@ static void EmitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
OS << CvtOS.str();
}
-/// EmitMatchClassEnumeration - Emit the enumeration for match class kinds.
-static void EmitMatchClassEnumeration(CodeGenTarget &Target,
+/// emitMatchClassEnumeration - Emit the enumeration for match class kinds.
+static void emitMatchClassEnumeration(CodeGenTarget &Target,
std::vector<ClassInfo*> &Infos,
raw_ostream &OS) {
OS << "namespace {\n\n";
@@ -1692,37 +1810,24 @@ static void EmitMatchClassEnumeration(CodeGenTarget &Target,
OS << "}\n\n";
}
-/// EmitValidateOperandClass - Emit the function to validate an operand class.
-static void EmitValidateOperandClass(AsmMatcherInfo &Info,
+/// emitValidateOperandClass - Emit the function to validate an operand class.
+static void emitValidateOperandClass(AsmMatcherInfo &Info,
raw_ostream &OS) {
- OS << "static bool validateOperandClass(MCParsedAsmOperand *GOp, "
+ OS << "static unsigned validateOperandClass(MCParsedAsmOperand *GOp, "
<< "MatchClassKind Kind) {\n";
OS << " " << Info.Target.getName() << "Operand &Operand = *("
<< Info.Target.getName() << "Operand*)GOp;\n";
// InvalidMatchClass never matches any operand.
OS << " if (Kind == InvalidMatchClass)\n";
- OS << " return false;\n\n";
+ OS << " return MCTargetAsmParser::Match_InvalidOperand;\n\n";
// Check for Token operands first.
+ // FIXME: Use a more specific diagnostic type.
OS << " if (Operand.isToken())\n";
- OS << " return isSubclass(matchTokenString(Operand.getToken()), Kind);"
- << "\n\n";
-
- // Check for register operands, including sub-classes.
- OS << " if (Operand.isReg()) {\n";
- OS << " MatchClassKind OpKind;\n";
- OS << " switch (Operand.getReg()) {\n";
- OS << " default: OpKind = InvalidMatchClass; break;\n";
- for (std::map<Record*, ClassInfo*>::iterator
- it = Info.RegisterClasses.begin(), ie = Info.RegisterClasses.end();
- it != ie; ++it)
- OS << " case " << Info.Target.getName() << "::"
- << it->first->getName() << ": OpKind = " << it->second->Name
- << "; break;\n";
- OS << " }\n";
- OS << " return isSubclass(OpKind, Kind);\n";
- OS << " }\n\n";
+ OS << " return isSubclass(matchTokenString(Operand.getToken()), Kind) ?\n"
+ << " MCTargetAsmParser::Match_Success :\n"
+ << " MCTargetAsmParser::Match_InvalidOperand;\n\n";
// Check the user classes. We don't care what order since we're only
// actually matching against one of them.
@@ -1734,18 +1839,39 @@ static void EmitValidateOperandClass(AsmMatcherInfo &Info,
continue;
OS << " // '" << CI.ClassName << "' class\n";
- OS << " if (Kind == " << CI.Name
- << " && Operand." << CI.PredicateMethod << "()) {\n";
- OS << " return true;\n";
+ OS << " if (Kind == " << CI.Name << ") {\n";
+ OS << " if (Operand." << CI.PredicateMethod << "())\n";
+ OS << " return MCTargetAsmParser::Match_Success;\n";
+ if (!CI.DiagnosticType.empty())
+ OS << " return " << Info.Target.getName() << "AsmParser::Match_"
+ << CI.DiagnosticType << ";\n";
OS << " }\n\n";
}
- OS << " return false;\n";
+ // Check for register operands, including sub-classes.
+ OS << " if (Operand.isReg()) {\n";
+ OS << " MatchClassKind OpKind;\n";
+ OS << " switch (Operand.getReg()) {\n";
+ OS << " default: OpKind = InvalidMatchClass; break;\n";
+ for (std::map<Record*, ClassInfo*>::iterator
+ it = Info.RegisterClasses.begin(), ie = Info.RegisterClasses.end();
+ it != ie; ++it)
+ OS << " case " << Info.Target.getName() << "::"
+ << it->first->getName() << ": OpKind = " << it->second->Name
+ << "; break;\n";
+ OS << " }\n";
+ OS << " return isSubclass(OpKind, Kind) ? "
+ << "MCTargetAsmParser::Match_Success :\n "
+ << " MCTargetAsmParser::Match_InvalidOperand;\n }\n\n";
+
+ // Generic fallthrough match failure case for operands that don't have
+ // specialized diagnostic types.
+ OS << " return MCTargetAsmParser::Match_InvalidOperand;\n";
OS << "}\n\n";
}
-/// EmitIsSubclass - Emit the subclass predicate function.
-static void EmitIsSubclass(CodeGenTarget &Target,
+/// emitIsSubclass - Emit the subclass predicate function.
+static void emitIsSubclass(CodeGenTarget &Target,
std::vector<ClassInfo*> &Infos,
raw_ostream &OS) {
OS << "/// isSubclass - Compute whether \\arg A is a subclass of \\arg B.\n";
@@ -1789,9 +1915,9 @@ static void EmitIsSubclass(CodeGenTarget &Target,
OS << "}\n\n";
}
-/// EmitMatchTokenString - Emit the function to match a token string to the
+/// emitMatchTokenString - Emit the function to match a token string to the
/// appropriate match class value.
-static void EmitMatchTokenString(CodeGenTarget &Target,
+static void emitMatchTokenString(CodeGenTarget &Target,
std::vector<ClassInfo*> &Infos,
raw_ostream &OS) {
// Construct the match list.
@@ -1813,9 +1939,9 @@ static void EmitMatchTokenString(CodeGenTarget &Target,
OS << "}\n\n";
}
-/// EmitMatchRegisterName - Emit the function to match a string to the target
+/// emitMatchRegisterName - Emit the function to match a string to the target
/// specific register enum.
-static void EmitMatchRegisterName(CodeGenTarget &Target, Record *AsmParser,
+static void emitMatchRegisterName(CodeGenTarget &Target, Record *AsmParser,
raw_ostream &OS) {
// Construct the match list.
std::vector<StringMatcher::StringPair> Matches;
@@ -1839,9 +1965,9 @@ static void EmitMatchRegisterName(CodeGenTarget &Target, Record *AsmParser,
OS << "}\n\n";
}
-/// EmitSubtargetFeatureFlagEnumeration - Emit the subtarget feature flag
+/// emitSubtargetFeatureFlagEnumeration - Emit the subtarget feature flag
/// definitions.
-static void EmitSubtargetFeatureFlagEnumeration(AsmMatcherInfo &Info,
+static void emitSubtargetFeatureFlagEnumeration(AsmMatcherInfo &Info,
raw_ostream &OS) {
OS << "// Flags for subtarget features that participate in "
<< "instruction matching.\n";
@@ -1856,9 +1982,48 @@ static void EmitSubtargetFeatureFlagEnumeration(AsmMatcherInfo &Info,
OS << "};\n\n";
}
-/// EmitComputeAvailableFeatures - Emit the function to compute the list of
+/// emitOperandDiagnosticTypes - Emit the operand matching diagnostic types.
+static void emitOperandDiagnosticTypes(AsmMatcherInfo &Info, raw_ostream &OS) {
+ // Get the set of diagnostic types from all of the operand classes.
+ std::set<StringRef> Types;
+ for (std::map<Record*, ClassInfo*>::const_iterator
+ I = Info.AsmOperandClasses.begin(),
+ E = Info.AsmOperandClasses.end(); I != E; ++I) {
+ if (!I->second->DiagnosticType.empty())
+ Types.insert(I->second->DiagnosticType);
+ }
+
+ if (Types.empty()) return;
+
+ // Now emit the enum entries.
+ for (std::set<StringRef>::const_iterator I = Types.begin(), E = Types.end();
+ I != E; ++I)
+ OS << " Match_" << *I << ",\n";
+ OS << " END_OPERAND_DIAGNOSTIC_TYPES\n";
+}
+
+/// emitGetSubtargetFeatureName - Emit the helper function to get the
+/// user-level name for a subtarget feature.
+static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) {
+ OS << "// User-level names for subtarget features that participate in\n"
+ << "// instruction matching.\n"
+ << "static const char *getSubtargetFeatureName(unsigned Val) {\n"
+ << " switch(Val) {\n";
+ for (std::map<Record*, SubtargetFeatureInfo*>::const_iterator
+ it = Info.SubtargetFeatures.begin(),
+ ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
+ SubtargetFeatureInfo &SFI = *it->second;
+ // FIXME: Totally just a placeholder name to get the algorithm working.
+ OS << " case " << SFI.getEnumName() << ": return \""
+ << SFI.TheDef->getValueAsString("PredicateName") << "\";\n";
+ }
+ OS << " default: return \"(unknown)\";\n";
+ OS << " }\n}\n\n";
+}
+
+/// emitComputeAvailableFeatures - Emit the function to compute the list of
/// available features given a subtarget.
-static void EmitComputeAvailableFeatures(AsmMatcherInfo &Info,
+static void emitComputeAvailableFeatures(AsmMatcherInfo &Info,
raw_ostream &OS) {
std::string ClassName =
Info.AsmParser->getValueAsString("AsmParserClassName");
@@ -1933,9 +2098,9 @@ static std::string GetAliasRequiredFeatures(Record *R,
return Result;
}
-/// EmitMnemonicAliases - If the target has any MnemonicAlias<> definitions,
+/// emitMnemonicAliases - If the target has any MnemonicAlias<> definitions,
/// emit a function for them and return true, otherwise return false.
-static bool EmitMnemonicAliases(raw_ostream &OS, const AsmMatcherInfo &Info) {
+static bool emitMnemonicAliases(raw_ostream &OS, const AsmMatcherInfo &Info) {
// Ignore aliases when match-prefix is set.
if (!MatchPrefix.empty())
return false;
@@ -2023,7 +2188,7 @@ static const char *getMinimalTypeForRange(uint64_t Range) {
return "uint8_t";
}
-static void EmitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
+static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
const AsmMatcherInfo &Info, StringRef ClassName) {
// Emit the static custom operand parsing table.
OS << "namespace {\n";
@@ -2193,7 +2358,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
// Compute the information on the instructions to match.
AsmMatcherInfo Info(AsmParser, Target, Records);
- Info.BuildInfo();
+ Info.buildInfo();
// Sort the instruction table using the partial order on classes. We use
// stable_sort to ensure that ambiguous instructions are still
@@ -2216,7 +2381,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
MatchableInfo &A = *Info.Matchables[i];
MatchableInfo &B = *Info.Matchables[j];
- if (A.CouldMatchAmbiguouslyWith(B)) {
+ if (A.couldMatchAmbiguouslyWith(B)) {
errs() << "warning: ambiguous matchables:\n";
A.dump();
errs() << "\nis incomparable with:\n";
@@ -2232,12 +2397,10 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
});
// Compute the information on the custom operand parsing.
- Info.BuildOperandMatchInfo();
+ Info.buildOperandMatchInfo();
// Write the output.
- EmitSourceFileHeader("Assembly Matcher Source Fragment", OS);
-
// Information for the class declaration.
OS << "\n#ifdef GET_ASSEMBLER_HEADER\n";
OS << "#undef GET_ASSEMBLER_HEADER\n";
@@ -2270,41 +2433,55 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "#endif // GET_ASSEMBLER_HEADER_INFO\n\n";
+ // Emit the operand match diagnostic enum names.
+ OS << "\n#ifdef GET_OPERAND_DIAGNOSTIC_TYPES\n";
+ OS << "#undef GET_OPERAND_DIAGNOSTIC_TYPES\n\n";
+ emitOperandDiagnosticTypes(Info, OS);
+ OS << "#endif // GET_OPERAND_DIAGNOSTIC_TYPES\n\n";
+
+
OS << "\n#ifdef GET_REGISTER_MATCHER\n";
OS << "#undef GET_REGISTER_MATCHER\n\n";
// Emit the subtarget feature enumeration.
- EmitSubtargetFeatureFlagEnumeration(Info, OS);
+ emitSubtargetFeatureFlagEnumeration(Info, OS);
// Emit the function to match a register name to number.
- EmitMatchRegisterName(Target, AsmParser, OS);
+ emitMatchRegisterName(Target, AsmParser, OS);
OS << "#endif // GET_REGISTER_MATCHER\n\n";
+ OS << "\n#ifdef GET_SUBTARGET_FEATURE_NAME\n";
+ OS << "#undef GET_SUBTARGET_FEATURE_NAME\n\n";
+
+ // Generate the helper function to get the names for subtarget features.
+ emitGetSubtargetFeatureName(Info, OS);
+
+ OS << "#endif // GET_SUBTARGET_FEATURE_NAME\n\n";
OS << "\n#ifdef GET_MATCHER_IMPLEMENTATION\n";
OS << "#undef GET_MATCHER_IMPLEMENTATION\n\n";
// Generate the function that remaps for mnemonic aliases.
- bool HasMnemonicAliases = EmitMnemonicAliases(OS, Info);
+ bool HasMnemonicAliases = emitMnemonicAliases(OS, Info);
// Generate the unified function to convert operands into an MCInst.
- EmitConvertToMCInst(Target, ClassName, Info.Matchables, OS);
+ emitConvertToMCInst(Target, ClassName, Info.Matchables, OS);
// Emit the enumeration for classes which participate in matching.
- EmitMatchClassEnumeration(Target, Info.Classes, OS);
+ emitMatchClassEnumeration(Target, Info.Classes, OS);
// Emit the routine to match token strings to their match class.
- EmitMatchTokenString(Target, Info.Classes, OS);
+ emitMatchTokenString(Target, Info.Classes, OS);
// Emit the subclass predicate routine.
- EmitIsSubclass(Target, Info.Classes, OS);
+ emitIsSubclass(Target, Info.Classes, OS);
// Emit the routine to validate an operand against a match class.
- EmitValidateOperandClass(Info, OS);
+ emitValidateOperandClass(Info, OS);
// Emit the available features compute function.
- EmitComputeAvailableFeatures(Info, OS);
+ emitComputeAvailableFeatures(Info, OS);
size_t MaxNumOperands = 0;
@@ -2415,8 +2592,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
<< Target.getName() << ClassName << "::\n"
<< "MatchInstructionImpl(const SmallVectorImpl<MCParsedAsmOperand*>"
<< " &Operands,\n";
- OS << " MCInst &Inst, unsigned &ErrorInfo,\n";
- OS << " unsigned VariantID) {\n";
+ OS << " MCInst &Inst, unsigned &ErrorInfo, ";
+ OS << "unsigned VariantID) {\n";
// Emit code to get the available features.
OS << " // Get the current feature set.\n";
@@ -2444,6 +2621,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " bool HadMatchOtherThanFeatures = false;\n";
OS << " bool HadMatchOtherThanPredicate = false;\n";
OS << " unsigned RetCode = Match_InvalidOperand;\n";
+ OS << " unsigned MissingFeatures = ~0U;\n";
OS << " // Set ErrorInfo to the operand that mismatches if it is\n";
OS << " // wrong for all instances of the instruction.\n";
OS << " ErrorInfo = ~0U;\n";
@@ -2471,15 +2649,25 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " for (unsigned i = 0; i != " << MaxNumOperands << "; ++i) {\n";
OS << " if (i + 1 >= Operands.size()) {\n";
OS << " OperandsValid = (it->Classes[i] == " <<"InvalidMatchClass);\n";
+ OS << " if (!OperandsValid) ErrorInfo = i + 1;\n";
OS << " break;\n";
OS << " }\n";
- OS << " if (validateOperandClass(Operands[i+1], "
- "(MatchClassKind)it->Classes[i]))\n";
+ OS << " unsigned Diag = validateOperandClass(Operands[i+1],\n";
+ OS.indent(43);
+ OS << "(MatchClassKind)it->Classes[i]);\n";
+ OS << " if (Diag == Match_Success)\n";
OS << " continue;\n";
OS << " // If this operand is broken for all of the instances of this\n";
OS << " // mnemonic, keep track of it so we can report loc info.\n";
- OS << " if (it == MnemonicRange.first || ErrorInfo <= i+1)\n";
+ OS << " // If we already had a match that only failed due to a\n";
+ OS << " // target predicate, that diagnostic is preferred.\n";
+ OS << " if (!HadMatchOtherThanPredicate &&\n";
+ OS << " (it == MnemonicRange.first || ErrorInfo <= i+1)) {\n";
OS << " ErrorInfo = i+1;\n";
+ OS << " // InvalidOperand is the default. Prefer specificity.\n";
+ OS << " if (Diag != Match_InvalidOperand)\n";
+ OS << " RetCode = Diag;\n";
+ OS << " }\n";
OS << " // Otherwise, just reject this instance of the mnemonic.\n";
OS << " OperandsValid = false;\n";
OS << " break;\n";
@@ -2491,6 +2679,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " if ((AvailableFeatures & it->RequiredFeatures) "
<< "!= it->RequiredFeatures) {\n";
OS << " HadMatchOtherThanFeatures = true;\n";
+ OS << " unsigned NewMissingFeatures = it->RequiredFeatures & "
+ "~AvailableFeatures;\n";
+ OS << " if (CountPopulation_32(NewMissingFeatures) <= "
+ "CountPopulation_32(MissingFeatures))\n";
+ OS << " MissingFeatures = NewMissingFeatures;\n";
OS << " continue;\n";
OS << " }\n";
OS << "\n";
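
The MissingFeatures bookkeeping emitted above keeps, across all near-miss entries, the mask with the fewest set bits, so a Match_MissingFeature diagnostic can report the smallest feature deficit. In miniature, with __builtin_popcount standing in for LLVM's CountPopulation_32:

#include <cstdio>

int main() {
  unsigned Available = 0x3;                  // features the subtarget has
  unsigned Required[] = {0x7, 0xF};          // per-matchable requirements
  unsigned Missing = ~0u;
  for (unsigned R : Required) {
    unsigned NewMissing = R & ~Available;    // what this entry lacks
    if (__builtin_popcount(NewMissing) <= __builtin_popcount(Missing))
      Missing = NewMissing;                  // keep the smallest deficit
  }
  std::printf("0x%X\n", Missing);            // 0x4: one feature short
}
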
@@ -2524,12 +2717,23 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " // Okay, we had no match. Try to return a useful error code.\n";
OS << " if (HadMatchOtherThanPredicate || !HadMatchOtherThanFeatures)";
- OS << " return RetCode;\n";
+ OS << " return RetCode;\n";
+ OS << " // Missing feature matches return which features were missing\n";
+ OS << " ErrorInfo = MissingFeatures;\n";
OS << " return Match_MissingFeature;\n";
OS << "}\n\n";
if (Info.OperandMatchInfo.size())
- EmitCustomOperandParsing(OS, Target, Info, ClassName);
+ emitCustomOperandParsing(OS, Target, Info, ClassName);
OS << "#endif // GET_MATCHER_IMPLEMENTATION\n\n";
}
+
+namespace llvm {
+
+void EmitAsmMatcher(RecordKeeper &RK, raw_ostream &OS) {
+ emitSourceFileHeader("Assembly Matcher Source Fragment", OS);
+ AsmMatcherEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/AsmMatcherEmitter.h b/utils/TableGen/AsmMatcherEmitter.h
deleted file mode 100644
index e04ac10..0000000
--- a/utils/TableGen/AsmMatcherEmitter.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===- AsmMatcherEmitter.h - Generate an assembly matcher -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend emits a target specifier matcher for converting parsed
-// assembly operands in the MCInst structures.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ASMMATCHER_EMITTER_H
-#define ASMMATCHER_EMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include <cassert>
-
-namespace llvm {
- class AsmMatcherEmitter : public TableGenBackend {
- RecordKeeper &Records;
- public:
- AsmMatcherEmitter(RecordKeeper &R) : Records(R) {}
-
- // run - Output the matcher, returning true on failure.
- void run(raw_ostream &o);
- };
-}
-#endif
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index d079b45..57979b3 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -12,19 +12,50 @@
//
//===----------------------------------------------------------------------===//
-#include "AsmWriterEmitter.h"
#include "AsmWriterInst.h"
#include "CodeGenTarget.h"
-#include "StringToOffsetTable.h"
#include "SequenceToOffsetTable.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
+#include <cassert>
+#include <map>
+#include <vector>
using namespace llvm;
+namespace {
+class AsmWriterEmitter {
+ RecordKeeper &Records;
+ std::map<const CodeGenInstruction*, AsmWriterInst*> CGIAWIMap;
+ std::vector<const CodeGenInstruction*> NumberedInstructions;
+public:
+ AsmWriterEmitter(RecordKeeper &R) : Records(R) {}
+
+ void run(raw_ostream &o);
+
+private:
+ void EmitPrintInstruction(raw_ostream &o);
+ void EmitGetRegisterName(raw_ostream &o);
+ void EmitPrintAliasInstruction(raw_ostream &O);
+
+ AsmWriterInst *getAsmWriterInstByID(unsigned ID) const {
+ assert(ID < NumberedInstructions.size());
+ std::map<const CodeGenInstruction*, AsmWriterInst*>::const_iterator I =
+ CGIAWIMap.find(NumberedInstructions[ID]);
+ assert(I != CGIAWIMap.end() && "Didn't find inst!");
+ return I->second;
+ }
+ void FindUniqueOperandCommands(std::vector<std::string> &UOC,
+ std::vector<unsigned> &InstIdxs,
+ std::vector<unsigned> &InstOpsUsed) const;
+};
+} // end anonymous namespace
+
static void PrintCases(std::vector<std::pair<std::string,
AsmWriterOperand> > &OpsToPrint, raw_ostream &O) {
O << " case " << OpsToPrint.back().first << ": ";
@@ -928,10 +959,17 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
}
void AsmWriterEmitter::run(raw_ostream &O) {
- EmitSourceFileHeader("Assembly Writer Source Fragment", O);
-
EmitPrintInstruction(O);
EmitGetRegisterName(O);
EmitPrintAliasInstruction(O);
}
+
+namespace llvm {
+
+void EmitAsmWriter(RecordKeeper &RK, raw_ostream &OS) {
+ emitSourceFileHeader("Assembly Writer Source Fragment", OS);
+ AsmWriterEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/AsmWriterEmitter.h b/utils/TableGen/AsmWriterEmitter.h
deleted file mode 100644
index 9719b20..0000000
--- a/utils/TableGen/AsmWriterEmitter.h
+++ /dev/null
@@ -1,54 +0,0 @@
-//===- AsmWriterEmitter.h - Generate an assembly writer ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend is responsible for emitting an assembly printer for the
-// code generator.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ASMWRITER_EMITTER_H
-#define ASMWRITER_EMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include <map>
-#include <vector>
-#include <cassert>
-
-namespace llvm {
- class AsmWriterInst;
- class CodeGenInstruction;
-
- class AsmWriterEmitter : public TableGenBackend {
- RecordKeeper &Records;
- std::map<const CodeGenInstruction*, AsmWriterInst*> CGIAWIMap;
- std::vector<const CodeGenInstruction*> NumberedInstructions;
- public:
- AsmWriterEmitter(RecordKeeper &R) : Records(R) {}
-
- // run - Output the asmwriter, returning true on failure.
- void run(raw_ostream &o);
-
-private:
- void EmitPrintInstruction(raw_ostream &o);
- void EmitGetRegisterName(raw_ostream &o);
- void EmitPrintAliasInstruction(raw_ostream &O);
-
- AsmWriterInst *getAsmWriterInstByID(unsigned ID) const {
- assert(ID < NumberedInstructions.size());
- std::map<const CodeGenInstruction*, AsmWriterInst*>::const_iterator I =
- CGIAWIMap.find(NumberedInstructions[ID]);
- assert(I != CGIAWIMap.end() && "Didn't find inst!");
- return I->second;
- }
- void FindUniqueOperandCommands(std::vector<std::string> &UOC,
- std::vector<unsigned> &InstIdxs,
- std::vector<unsigned> &InstOpsUsed) const;
- };
-}
-#endif
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 2b70f1c..0e14cba 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -11,6 +11,7 @@ add_tablegen(llvm-tblgen LLVM
CodeGenDAGPatterns.cpp
CodeGenInstruction.cpp
CodeGenRegisters.cpp
+ CodeGenSchedule.cpp
CodeGenTarget.cpp
DAGISelEmitter.cpp
DAGISelMatcherEmitter.cpp
@@ -27,7 +28,6 @@ add_tablegen(llvm-tblgen LLVM
PseudoLoweringEmitter.cpp
RegisterInfoEmitter.cpp
SetTheory.cpp
- StringMatcher.cpp
SubtargetEmitter.cpp
TGValueTypes.cpp
TableGen.cpp
diff --git a/utils/TableGen/CallingConvEmitter.cpp b/utils/TableGen/CallingConvEmitter.cpp
index afbb3a8..e9c4bd3 100644
--- a/utils/TableGen/CallingConvEmitter.cpp
+++ b/utils/TableGen/CallingConvEmitter.cpp
@@ -12,13 +12,28 @@
//
//===----------------------------------------------------------------------===//
-#include "CallingConvEmitter.h"
#include "CodeGenTarget.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <cassert>
using namespace llvm;
+namespace {
+class CallingConvEmitter {
+ RecordKeeper &Records;
+public:
+ explicit CallingConvEmitter(RecordKeeper &R) : Records(R) {}
+
+ void run(raw_ostream &o);
+
+private:
+ void EmitCallingConv(Record *CC, raw_ostream &O);
+ void EmitAction(Record *Action, unsigned Indent, raw_ostream &O);
+ unsigned Counter;
+};
+} // End anonymous namespace
+
void CallingConvEmitter::run(raw_ostream &O) {
- EmitSourceFileHeader("Calling Convention Implementation Fragment", O);
std::vector<Record*> CCs = Records.getAllDerivedDefinitions("CallingConv");
@@ -210,3 +225,12 @@ void CallingConvEmitter::EmitAction(Record *Action,
}
}
}
+
+namespace llvm {
+
+void EmitCallingConv(RecordKeeper &RK, raw_ostream &OS) {
+ emitSourceFileHeader("Calling Convention Implementation Fragment", OS);
+ CallingConvEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
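The hunks above show the new TableGen backend convention this import applies throughout: the emitter class moves into an anonymous namespace inside its .cpp file, and the only externally visible entry point is a free EmitFoo(RecordKeeper&, raw_ostream&) function in the llvm namespace. A minimal standalone sketch of the same shape, with std::ostream and a stub Records type standing in for the LLVM types (both stand-ins are illustrative, not the real API):

    #include <iostream>
    #include <ostream>

    namespace {

    // Stand-in for llvm::RecordKeeper; just enough state for the sketch.
    struct Records { int NumDefs = 0; };

    // Backend implementation, hidden from every other translation unit.
    class DemoEmitter {
      Records &RK;
    public:
      explicit DemoEmitter(Records &R) : RK(R) {}
      void run(std::ostream &OS) { OS << "// " << RK.NumDefs << " defs\n"; }
    };

    } // End anonymous namespace

    // The one visible entry point, mirroring EmitCallingConv() above.
    void EmitDemo(Records &RK, std::ostream &OS) {
      OS << "// Demo Fragment\n";   // header, like emitSourceFileHeader()
      DemoEmitter(RK).run(OS);
    }

    int main() {
      Records R;
      R.NumDefs = 3;
      EmitDemo(R, std::cout);
    }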
diff --git a/utils/TableGen/CallingConvEmitter.h b/utils/TableGen/CallingConvEmitter.h
deleted file mode 100644
index 7bddd6c..0000000
--- a/utils/TableGen/CallingConvEmitter.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===- CallingConvEmitter.h - Generate calling conventions ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend is responsible for emitting descriptions of the calling
-// conventions supported by this target.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CALLINGCONV_EMITTER_H
-#define CALLINGCONV_EMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include <cassert>
-
-namespace llvm {
- class CallingConvEmitter : public TableGenBackend {
- RecordKeeper &Records;
- public:
- explicit CallingConvEmitter(RecordKeeper &R) : Records(R) {}
-
- // run - Output the asmwriter, returning true on failure.
- void run(raw_ostream &o);
-
- private:
- void EmitCallingConv(Record *CC, raw_ostream &O);
- void EmitAction(Record *Action, unsigned Indent, raw_ostream &O);
- unsigned Counter;
- };
-}
-#endif
diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp
index 3943e8a..31a39b1 100644
--- a/utils/TableGen/CodeEmitterGen.cpp
+++ b/utils/TableGen/CodeEmitterGen.cpp
@@ -13,13 +13,15 @@
//
//===----------------------------------------------------------------------===//
-#include "CodeEmitterGen.h"
#include "CodeGenTarget.h"
#include "llvm/TableGen/Record.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <map>
+#include <string>
+#include <vector>
using namespace llvm;
// FIXME: Somewhat hackish to use a command line option for this. There should
@@ -30,6 +32,27 @@ MCEmitter("mc-emitter",
cl::desc("Generate CodeEmitter for use with the MC library."),
cl::init(false));
+namespace {
+
+class CodeEmitterGen {
+ RecordKeeper &Records;
+public:
+ CodeEmitterGen(RecordKeeper &R) : Records(R) {}
+
+ void run(raw_ostream &o);
+private:
+ void emitMachineOpEmitter(raw_ostream &o, const std::string &Namespace);
+ void emitGetValueBit(raw_ostream &o, const std::string &Namespace);
+ void reverseBits(std::vector<Record*> &Insts);
+ int getVariableBit(const std::string &VarName, BitsInit *BI, int bit);
+ std::string getInstructionCase(Record *R, CodeGenTarget &Target);
+ void AddCodeToMergeInOperand(Record *R, BitsInit *BI,
+ const std::string &VarName,
+ unsigned &NumberedOp,
+ std::string &Case, CodeGenTarget &Target);
+
+};
+
void CodeEmitterGen::reverseBits(std::vector<Record*> &Insts) {
for (std::vector<Record*>::iterator I = Insts.begin(), E = Insts.end();
I != E; ++I) {
@@ -214,7 +237,6 @@ void CodeEmitterGen::run(raw_ostream &o) {
// For little-endian instruction bit encodings, reverse the bit order
if (Target.isLittleEndianEncoding()) reverseBits(Insts);
- EmitSourceFileHeader("Machine Code Emitter", o);
const std::vector<const CodeGenInstruction*> &NumberedInstructions =
Target.getInstructionsByEnumValue();
@@ -304,3 +326,14 @@ void CodeEmitterGen::run(raw_ostream &o) {
<< " return Value;\n"
<< "}\n\n";
}
+
+} // End anonymous namespace
+
+namespace llvm {
+
+void EmitCodeEmitter(RecordKeeper &RK, raw_ostream &OS) {
+ emitSourceFileHeader("Machine Code Emitter", OS);
+ CodeEmitterGen(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/CodeEmitterGen.h b/utils/TableGen/CodeEmitterGen.h
deleted file mode 100644
index 7f6ee2a..0000000
--- a/utils/TableGen/CodeEmitterGen.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- CodeEmitterGen.h - Code Emitter Generator ----------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// FIXME: document
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CODEMITTERGEN_H
-#define CODEMITTERGEN_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include <vector>
-#include <string>
-
-namespace llvm {
-
-class RecordVal;
-class BitsInit;
-class CodeGenTarget;
-
-class CodeEmitterGen : public TableGenBackend {
- RecordKeeper &Records;
-public:
- CodeEmitterGen(RecordKeeper &R) : Records(R) {}
-
- // run - Output the code emitter
- void run(raw_ostream &o);
-private:
- void emitMachineOpEmitter(raw_ostream &o, const std::string &Namespace);
- void emitGetValueBit(raw_ostream &o, const std::string &Namespace);
- void reverseBits(std::vector<Record*> &Insts);
- int getVariableBit(const std::string &VarName, BitsInit *BI, int bit);
- std::string getInstructionCase(Record *R, CodeGenTarget &Target);
- void
- AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName,
- unsigned &NumberedOp,
- std::string &Case, CodeGenTarget &Target);
-
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index d4b02fb..34f8a34 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -2520,6 +2520,37 @@ static void InferFromPattern(const CodeGenInstruction &Inst,
IsVariadic = true; // Can warn if we want.
}
+/// hasNullFragReference - Return true if the DAG has any reference to the
+/// null_frag operator.
+static bool hasNullFragReference(DagInit *DI) {
+ DefInit *OpDef = dynamic_cast<DefInit*>(DI->getOperator());
+ if (!OpDef) return false;
+ Record *Operator = OpDef->getDef();
+
+ // If this is the null fragment, return true.
+ if (Operator->getName() == "null_frag") return true;
+ // If any of the arguments reference the null fragment, return true.
+ for (unsigned i = 0, e = DI->getNumArgs(); i != e; ++i) {
+ DagInit *Arg = dynamic_cast<DagInit*>(DI->getArg(i));
+ if (Arg && hasNullFragReference(Arg))
+ return true;
+ }
+
+ return false;
+}
+
+/// hasNullFragReference - Return true if any DAG in the list references
+/// the null_frag operator.
+static bool hasNullFragReference(ListInit *LI) {
+ for (unsigned i = 0, e = LI->getSize(); i != e; ++i) {
+ DagInit *DI = dynamic_cast<DagInit*>(LI->getElement(i));
+ assert(DI && "non-dag in an instruction Pattern list?!");
+ if (hasNullFragReference(DI))
+ return true;
+ }
+ return false;
+}
+
/// ParseInstructions - Parse all of the instructions, inlining and resolving
/// any fragments involved. This populates the Instructions list with fully
/// resolved instructions.
@@ -2534,8 +2565,11 @@ void CodeGenDAGPatterns::ParseInstructions() {
// If there is no pattern, only collect minimal information about the
// instruction for its operand list. We have to assume that there is one
- // result, as we have no detailed info.
- if (!LI || LI->getSize() == 0) {
+    // result, as we have no detailed info. A pattern which references the
+    // null_frag operator is treated as if no pattern were specified. Normally
+    // this comes from a multiclass expansion with an SDPatternOperator passed
+    // in as null_frag.
+ if (!LI || LI->getSize() == 0 || hasNullFragReference(LI)) {
std::vector<Record*> Results;
std::vector<Record*> Operands;
@@ -2874,6 +2908,11 @@ void CodeGenDAGPatterns::ParsePatterns() {
for (unsigned i = 0, e = Patterns.size(); i != e; ++i) {
Record *CurPattern = Patterns[i];
DagInit *Tree = CurPattern->getValueAsDag("PatternToMatch");
+
+ // If the pattern references the null_frag, there's nothing to do.
+ if (hasNullFragReference(Tree))
+ continue;
+
TreePattern *Pattern = new TreePattern(CurPattern, Tree, true, *this);
// Inline pattern fragments into it.
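hasNullFragReference() above is a plain recursive scan: check the dag's operator, then recurse into any dag-typed arguments. A simplified standalone model of the same walk over a toy dag type (Node is a placeholder, not the TableGen DagInit/Init hierarchy):

    #include <cassert>
    #include <memory>
    #include <string>
    #include <vector>

    // Toy stand-in for a TableGen dag: an operator name plus arguments,
    // where an argument may itself be a dag (non-dag args are null here).
    struct Node {
      std::string Operator;
      std::vector<std::unique_ptr<Node>> Args;
    };

    // Mirrors hasNullFragReference(DagInit*): true if this dag, or any
    // dag nested inside it, uses the null_frag operator.
    static bool hasNullFrag(const Node &N) {
      if (N.Operator == "null_frag")
        return true;
      for (const auto &Arg : N.Args)
        if (Arg && hasNullFrag(*Arg))
          return true;
      return false;
    }

    int main() {
      Node Set{"set", {}};
      Set.Args.push_back(std::make_unique<Node>(Node{"null_frag", {}}));
      assert(hasNullFrag(Set));
    }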
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index fb9ad93..33381e9 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -556,9 +556,31 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T) : TheDef(R) {
ResultOperand ResOp(static_cast<int64_t>(0));
if (tryAliasOpMatch(Result, AliasOpNo, InstOpRec, (NumSubOps > 1),
R->getLoc(), T, ResOp)) {
- ResultOperands.push_back(ResOp);
- ResultInstOperandIndex.push_back(std::make_pair(i, -1));
- ++AliasOpNo;
+ // If this is a simple operand, or a complex operand with a custom match
+      // class, then we can match it verbatim.
+ if (NumSubOps == 1 ||
+ (InstOpRec->getValue("ParserMatchClass") &&
+ InstOpRec->getValueAsDef("ParserMatchClass")
+ ->getValueAsString("Name") != "Imm")) {
+ ResultOperands.push_back(ResOp);
+ ResultInstOperandIndex.push_back(std::make_pair(i, -1));
+ ++AliasOpNo;
+
+ // Otherwise, we need to match each of the suboperands individually.
+ } else {
+ DagInit *MIOI = ResultInst->Operands[i].MIOperandInfo;
+ for (unsigned SubOp = 0; SubOp != NumSubOps; ++SubOp) {
+ Record *SubRec = dynamic_cast<DefInit*>(MIOI->getArg(SubOp))->getDef();
+
+ // Take care to instantiate each of the suboperands with the correct
+ // nomenclature: $foo.bar
+ ResultOperands.push_back(
+ ResultOperand(Result->getArgName(AliasOpNo) + "." +
+ MIOI->getArgName(SubOp), SubRec));
+ ResultInstOperandIndex.push_back(std::make_pair(i, SubOp));
+ }
+ ++AliasOpNo;
+ }
continue;
}
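When an alias result operand expands to several machine-instruction suboperands, the hunk above gives each piece a dotted name built from the alias operand name plus the MIOperandInfo suboperand name, so $addr can become $addr.base and $addr.offset. A short sketch of just that name composition (the operand names here are hypothetical examples, not taken from any target):

    #include <iostream>
    #include <string>
    #include <vector>

    // Build the "$foo.bar" spellings for a complex operand: one entry per
    // suboperand, mirroring how CodeGenInstAlias records ResultOperands.
    static std::vector<std::string>
    flattenOperandNames(const std::string &AliasName,
                        const std::vector<std::string> &SubOpNames) {
      std::vector<std::string> Out;
      for (const std::string &Sub : SubOpNames)
        Out.push_back(AliasName + "." + Sub);
      return Out;
    }

    int main() {
      for (const std::string &N :
           flattenOperandNames("addr", {"base", "offset"}))
        std::cout << N << "\n";   // addr.base, addr.offset
    }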
diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index 468277a..3ba9f24 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h
@@ -280,7 +280,7 @@ namespace llvm {
struct ResultOperand {
private:
- StringRef Name;
+ std::string Name;
Record *R;
int64_t Imm;
@@ -291,7 +291,7 @@ namespace llvm {
K_Reg
} Kind;
- ResultOperand(StringRef N, Record *r) : Name(N), R(r), Kind(K_Record) {}
+ ResultOperand(std::string N, Record *r) : Name(N), R(r), Kind(K_Record) {}
ResultOperand(int64_t I) : Imm(I), Kind(K_Imm) {}
ResultOperand(Record *r) : R(r), Kind(K_Reg) {}
diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h
index 3f6ba61..6efe952 100644
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h
@@ -72,7 +72,10 @@ namespace llvm {
/// canThrow - True if the intrinsic can throw.
bool canThrow;
-
+
+ /// isNoReturn - True if the intrinsic is no-return.
+ bool isNoReturn;
+
enum ArgAttribute {
NoCapture
};
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 45c5bb8..ff3ad72 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -83,9 +83,43 @@ CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum)
EnumValue(Enum),
CostPerUse(R->getValueAsInt("CostPerUse")),
CoveredBySubRegs(R->getValueAsBit("CoveredBySubRegs")),
- SubRegsComplete(false)
+ NumNativeRegUnits(0),
+ SubRegsComplete(false),
+ SuperRegsComplete(false),
+ TopoSig(~0u)
{}
+void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
+ std::vector<Record*> SRIs = TheDef->getValueAsListOfDefs("SubRegIndices");
+ std::vector<Record*> SRs = TheDef->getValueAsListOfDefs("SubRegs");
+
+ if (SRIs.size() != SRs.size())
+ throw TGError(TheDef->getLoc(),
+ "SubRegs and SubRegIndices must have the same size");
+
+ for (unsigned i = 0, e = SRIs.size(); i != e; ++i) {
+ ExplicitSubRegIndices.push_back(RegBank.getSubRegIdx(SRIs[i]));
+ ExplicitSubRegs.push_back(RegBank.getReg(SRs[i]));
+ }
+
+ // Also compute leading super-registers. Each register has a list of
+ // covered-by-subregs super-registers where it appears as the first explicit
+ // sub-register.
+ //
+ // This is used by computeSecondarySubRegs() to find candidates.
+ if (CoveredBySubRegs && !ExplicitSubRegs.empty())
+ ExplicitSubRegs.front()->LeadingSuperRegs.push_back(this);
+
+ // Add ad hoc alias links. This is a symmetric relationship between two
+ // registers, so build a symmetric graph by adding links in both ends.
+ std::vector<Record*> Aliases = TheDef->getValueAsListOfDefs("Aliases");
+ for (unsigned i = 0, e = Aliases.size(); i != e; ++i) {
+ CodeGenRegister *Reg = RegBank.getReg(Aliases[i]);
+ ExplicitAliases.push_back(Reg);
+ Reg->ExplicitAliases.push_back(this);
+ }
+}
+
const std::string &CodeGenRegister::getName() const {
return TheDef->getName();
}
@@ -109,7 +143,7 @@ public:
bool isValid() const { return UnitI != UnitE; }
- unsigned operator* () const { assert(isValid()); return *UnitI; };
+ unsigned operator* () const { assert(isValid()); return *UnitI; }
const CodeGenRegister *getReg() const { assert(isValid()); return *RegI; }
@@ -153,15 +187,7 @@ bool CodeGenRegister::inheritRegUnits(CodeGenRegBank &RegBank) {
unsigned OldNumUnits = RegUnits.size();
for (SubRegMap::const_iterator I = SubRegs.begin(), E = SubRegs.end();
I != E; ++I) {
- // Strangely a register may have itself as a subreg (self-cycle) e.g. XMM.
- // Only create a unit if no other subregs have units.
CodeGenRegister *SR = I->second;
- if (SR == this) {
- // RegUnits are only empty during getSubRegs, prior to computing weight.
- if (RegUnits.empty())
- RegUnits.push_back(RegBank.newRegUnit(0));
- continue;
- }
// Merge the subregister's units into this register's RegUnits.
mergeRegUnits(RegUnits, SR->RegUnits);
}
@@ -169,27 +195,22 @@ bool CodeGenRegister::inheritRegUnits(CodeGenRegBank &RegBank) {
}
const CodeGenRegister::SubRegMap &
-CodeGenRegister::getSubRegs(CodeGenRegBank &RegBank) {
+CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
// Only compute this map once.
if (SubRegsComplete)
return SubRegs;
SubRegsComplete = true;
- std::vector<Record*> SubList = TheDef->getValueAsListOfDefs("SubRegs");
- std::vector<Record*> IdxList = TheDef->getValueAsListOfDefs("SubRegIndices");
- if (SubList.size() != IdxList.size())
- throw TGError(TheDef->getLoc(), "Register " + getName() +
- " SubRegIndices doesn't match SubRegs");
-
- // First insert the direct subregs and make sure they are fully indexed.
- SmallVector<CodeGenSubRegIndex*, 8> Indices;
- for (unsigned i = 0, e = SubList.size(); i != e; ++i) {
- CodeGenRegister *SR = RegBank.getReg(SubList[i]);
- CodeGenSubRegIndex *Idx = RegBank.getSubRegIdx(IdxList[i]);
- Indices.push_back(Idx);
+ // First insert the explicit subregs and make sure they are fully indexed.
+ for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
+ CodeGenRegister *SR = ExplicitSubRegs[i];
+ CodeGenSubRegIndex *Idx = ExplicitSubRegIndices[i];
if (!SubRegs.insert(std::make_pair(Idx, SR)).second)
throw TGError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() +
" appears twice in Register " + getName());
+ // Map explicit sub-registers first, so the names take precedence.
+ // The inherited sub-registers are mapped below.
+ SubReg2Idx.insert(std::make_pair(SR, Idx));
}
// Keep track of inherited subregs and how they can be reached.
@@ -197,23 +218,14 @@ CodeGenRegister::getSubRegs(CodeGenRegBank &RegBank) {
// Clone inherited subregs and place duplicate entries in Orphans.
// Here the order is important - earlier subregs take precedence.
- for (unsigned i = 0, e = SubList.size(); i != e; ++i) {
- CodeGenRegister *SR = RegBank.getReg(SubList[i]);
- const SubRegMap &Map = SR->getSubRegs(RegBank);
-
- // Add this as a super-register of SR now all sub-registers are in the list.
- // This creates a topological ordering, the exact order depends on the
- // order getSubRegs is called on all registers.
- SR->SuperRegs.push_back(this);
+ for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
+ CodeGenRegister *SR = ExplicitSubRegs[i];
+ const SubRegMap &Map = SR->computeSubRegs(RegBank);
for (SubRegMap::const_iterator SI = Map.begin(), SE = Map.end(); SI != SE;
++SI) {
if (!SubRegs.insert(*SI).second)
Orphans.insert(SI->second);
-
- // Noop sub-register indexes are possible, so avoid duplicates.
- if (SI->second != SR)
- SI->second->SuperRegs.push_back(this);
}
}
@@ -221,11 +233,12 @@ CodeGenRegister::getSubRegs(CodeGenRegBank &RegBank) {
// If dsub_2 has ComposedOf = [qsub_1, dsub_0], and this register has a
// qsub_1 subreg, add a dsub_2 subreg. Keep growing Indices and process
// expanded subreg indices recursively.
+ SmallVector<CodeGenSubRegIndex*, 8> Indices = ExplicitSubRegIndices;
for (unsigned i = 0; i != Indices.size(); ++i) {
CodeGenSubRegIndex *Idx = Indices[i];
const CodeGenSubRegIndex::CompMap &Comps = Idx->getComposites();
CodeGenRegister *SR = SubRegs[Idx];
- const SubRegMap &Map = SR->getSubRegs(RegBank);
+ const SubRegMap &Map = SR->computeSubRegs(RegBank);
// Look at the possible compositions of Idx.
// They may not all be supported by SR.
@@ -244,44 +257,6 @@ CodeGenRegister::getSubRegs(CodeGenRegBank &RegBank) {
}
}
- // Process the composites.
- ListInit *Comps = TheDef->getValueAsListInit("CompositeIndices");
- for (unsigned i = 0, e = Comps->size(); i != e; ++i) {
- DagInit *Pat = dynamic_cast<DagInit*>(Comps->getElement(i));
- if (!Pat)
- throw TGError(TheDef->getLoc(), "Invalid dag '" +
- Comps->getElement(i)->getAsString() +
- "' in CompositeIndices");
- DefInit *BaseIdxInit = dynamic_cast<DefInit*>(Pat->getOperator());
- if (!BaseIdxInit || !BaseIdxInit->getDef()->isSubClassOf("SubRegIndex"))
- throw TGError(TheDef->getLoc(), "Invalid SubClassIndex in " +
- Pat->getAsString());
- CodeGenSubRegIndex *BaseIdx = RegBank.getSubRegIdx(BaseIdxInit->getDef());
-
- // Resolve list of subreg indices into R2.
- CodeGenRegister *R2 = this;
- for (DagInit::const_arg_iterator di = Pat->arg_begin(),
- de = Pat->arg_end(); di != de; ++di) {
- DefInit *IdxInit = dynamic_cast<DefInit*>(*di);
- if (!IdxInit || !IdxInit->getDef()->isSubClassOf("SubRegIndex"))
- throw TGError(TheDef->getLoc(), "Invalid SubClassIndex in " +
- Pat->getAsString());
- CodeGenSubRegIndex *Idx = RegBank.getSubRegIdx(IdxInit->getDef());
- const SubRegMap &R2Subs = R2->getSubRegs(RegBank);
- SubRegMap::const_iterator ni = R2Subs.find(Idx);
- if (ni == R2Subs.end())
- throw TGError(TheDef->getLoc(), "Composite " + Pat->getAsString() +
- " refers to bad index in " + R2->getName());
- R2 = ni->second;
- }
-
- // Insert composite index. Allow overriding inherited indices etc.
- SubRegs[BaseIdx] = R2;
-
- // R2 is no longer an orphan.
- Orphans.erase(R2);
- }
-
// Now Orphans contains the inherited subregisters without a direct index.
// Create inferred indexes for all missing entries.
// Work backwards in the Indices vector in order to compose subregs bottom-up.
@@ -296,46 +271,283 @@ CodeGenRegister::getSubRegs(CodeGenRegBank &RegBank) {
// dsub_2 -> ssub_0
//
// We pick the latter composition because another register may have [dsub_0,
- // dsub_1, dsub_2] subregs without neccessarily having a qsub_1 subreg. The
+ // dsub_1, dsub_2] subregs without necessarily having a qsub_1 subreg. The
// dsub_2 -> ssub_0 composition can be shared.
while (!Indices.empty() && !Orphans.empty()) {
CodeGenSubRegIndex *Idx = Indices.pop_back_val();
CodeGenRegister *SR = SubRegs[Idx];
- const SubRegMap &Map = SR->getSubRegs(RegBank);
+ const SubRegMap &Map = SR->computeSubRegs(RegBank);
for (SubRegMap::const_iterator SI = Map.begin(), SE = Map.end(); SI != SE;
++SI)
if (Orphans.erase(SI->second))
SubRegs[RegBank.getCompositeSubRegIndex(Idx, SI->first)] = SI->second;
}
- // Initialize RegUnitList. A register with no subregisters creates its own
- // unit. Otherwise, it inherits all its subregister's units. Because
- // getSubRegs is called recursively, this processes the register hierarchy in
- // postorder.
+ // Compute the inverse SubReg -> Idx map.
+ for (SubRegMap::const_iterator SI = SubRegs.begin(), SE = SubRegs.end();
+ SI != SE; ++SI) {
+ if (SI->second == this) {
+ SMLoc Loc;
+ if (TheDef)
+ Loc = TheDef->getLoc();
+ throw TGError(Loc, "Register " + getName() +
+ " has itself as a sub-register");
+ }
+ // Ensure that every sub-register has a unique name.
+ DenseMap<const CodeGenRegister*, CodeGenSubRegIndex*>::iterator Ins =
+ SubReg2Idx.insert(std::make_pair(SI->second, SI->first)).first;
+ if (Ins->second == SI->first)
+ continue;
+ // Trouble: Two different names for SI->second.
+ SMLoc Loc;
+ if (TheDef)
+ Loc = TheDef->getLoc();
+ throw TGError(Loc, "Sub-register can't have two names: " +
+ SI->second->getName() + " available as " +
+ SI->first->getName() + " and " + Ins->second->getName());
+ }
+
+ // Derive possible names for sub-register concatenations from any explicit
+ // sub-registers. By doing this before computeSecondarySubRegs(), we ensure
+ // that getConcatSubRegIndex() won't invent any concatenated indices that the
+ // user already specified.
+ for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
+ CodeGenRegister *SR = ExplicitSubRegs[i];
+ if (!SR->CoveredBySubRegs || SR->ExplicitSubRegs.size() <= 1)
+ continue;
+
+ // SR is composed of multiple sub-regs. Find their names in this register.
+ SmallVector<CodeGenSubRegIndex*, 8> Parts;
+ for (unsigned j = 0, e = SR->ExplicitSubRegs.size(); j != e; ++j)
+ Parts.push_back(getSubRegIndex(SR->ExplicitSubRegs[j]));
+
+ // Offer this as an existing spelling for the concatenation of Parts.
+ RegBank.addConcatSubRegIndex(Parts, ExplicitSubRegIndices[i]);
+ }
+
+ // Initialize RegUnitList. Because getSubRegs is called recursively, this
+ // processes the register hierarchy in postorder.
//
- // TODO: We currently assume all register units correspond to a named "leaf"
- // register. We should also unify register units for ad-hoc register
- // aliases. This can be done by iteratively merging units for aliasing
- // registers using a worklist.
- assert(RegUnits.empty() && "Should only initialize RegUnits once");
- if (SubRegs.empty())
- RegUnits.push_back(RegBank.newRegUnit(0));
- else
- inheritRegUnits(RegBank);
+ // Inherit all sub-register units. It is good enough to look at the explicit
+ // sub-registers, the other registers won't contribute any more units.
+ for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
+ CodeGenRegister *SR = ExplicitSubRegs[i];
+ // Explicit sub-registers are usually disjoint, so this is a good way of
+ // computing the union. We may pick up a few duplicates that will be
+ // eliminated below.
+ unsigned N = RegUnits.size();
+ RegUnits.append(SR->RegUnits.begin(), SR->RegUnits.end());
+ std::inplace_merge(RegUnits.begin(), RegUnits.begin() + N, RegUnits.end());
+ }
+ RegUnits.erase(std::unique(RegUnits.begin(), RegUnits.end()), RegUnits.end());
+
+ // Absent any ad hoc aliasing, we create one register unit per leaf register.
+ // These units correspond to the maximal cliques in the register overlap
+ // graph which is optimal.
+ //
+ // When there is ad hoc aliasing, we simply create one unit per edge in the
+ // undirected ad hoc aliasing graph. Technically, we could do better by
+ // identifying maximal cliques in the ad hoc graph, but cliques larger than 2
+ // are extremely rare anyway (I've never seen one), so we don't bother with
+ // the added complexity.
+ for (unsigned i = 0, e = ExplicitAliases.size(); i != e; ++i) {
+ CodeGenRegister *AR = ExplicitAliases[i];
+ // Only visit each edge once.
+ if (AR->SubRegsComplete)
+ continue;
+ // Create a RegUnit representing this alias edge, and add it to both
+ // registers.
+ unsigned Unit = RegBank.newRegUnit(this, AR);
+ RegUnits.push_back(Unit);
+ AR->RegUnits.push_back(Unit);
+ }
+
+ // Finally, create units for leaf registers without ad hoc aliases. Note that
+ // a leaf register with ad hoc aliases doesn't get its own unit - it isn't
+ // necessary. This means the aliasing leaf registers can share a single unit.
+ if (RegUnits.empty())
+ RegUnits.push_back(RegBank.newRegUnit(this));
+
+ // We have now computed the native register units. More may be adopted later
+ // for balancing purposes.
+ NumNativeRegUnits = RegUnits.size();
+
return SubRegs;
}
+// In a register that is covered by its sub-registers, try to find redundant
+// sub-registers. For example:
+//
+// QQ0 = {Q0, Q1}
+// Q0 = {D0, D1}
+// Q1 = {D2, D3}
+//
+// We can infer that D1_D2 is also a sub-register, even if it wasn't named in
+// the register definition.
+//
+// The explicitly specified registers form a tree. This function discovers
+// sub-register relationships that would force a DAG.
+//
+void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) {
+ // Collect new sub-registers first, add them later.
+ SmallVector<SubRegMap::value_type, 8> NewSubRegs;
+
+ // Look at the leading super-registers of each sub-register. Those are the
+ // candidates for new sub-registers, assuming they are fully contained in
+ // this register.
+ for (SubRegMap::iterator I = SubRegs.begin(), E = SubRegs.end(); I != E; ++I){
+ const CodeGenRegister *SubReg = I->second;
+ const CodeGenRegister::SuperRegList &Leads = SubReg->LeadingSuperRegs;
+ for (unsigned i = 0, e = Leads.size(); i != e; ++i) {
+ CodeGenRegister *Cand = const_cast<CodeGenRegister*>(Leads[i]);
+ // Already got this sub-register?
+ if (Cand == this || getSubRegIndex(Cand))
+ continue;
+ // Check if each component of Cand is already a sub-register.
+ // We know that the first component is I->second, and is present with the
+ // name I->first.
+ SmallVector<CodeGenSubRegIndex*, 8> Parts(1, I->first);
+ assert(!Cand->ExplicitSubRegs.empty() &&
+ "Super-register has no sub-registers");
+ for (unsigned j = 1, e = Cand->ExplicitSubRegs.size(); j != e; ++j) {
+ if (CodeGenSubRegIndex *Idx = getSubRegIndex(Cand->ExplicitSubRegs[j]))
+ Parts.push_back(Idx);
+ else {
+ // Sub-register doesn't exist.
+ Parts.clear();
+ break;
+ }
+ }
+ // If some Cand sub-register is not part of this register, or if Cand only
+ // has one sub-register, there is nothing to do.
+ if (Parts.size() <= 1)
+ continue;
+
+ // Each part of Cand is a sub-register of this. Make the full Cand also
+ // a sub-register with a concatenated sub-register index.
+      CodeGenSubRegIndex *Concat = RegBank.getConcatSubRegIndex(Parts);
+ NewSubRegs.push_back(std::make_pair(Concat, Cand));
+ }
+ }
+
+ // Now add all the new sub-registers.
+ for (unsigned i = 0, e = NewSubRegs.size(); i != e; ++i) {
+ // Don't add Cand if another sub-register is already using the index.
+ if (!SubRegs.insert(NewSubRegs[i]).second)
+ continue;
+
+ CodeGenSubRegIndex *NewIdx = NewSubRegs[i].first;
+ CodeGenRegister *NewSubReg = NewSubRegs[i].second;
+ SubReg2Idx.insert(std::make_pair(NewSubReg, NewIdx));
+ }
+
+ // Create sub-register index composition maps for the synthesized indices.
+ for (unsigned i = 0, e = NewSubRegs.size(); i != e; ++i) {
+ CodeGenSubRegIndex *NewIdx = NewSubRegs[i].first;
+ CodeGenRegister *NewSubReg = NewSubRegs[i].second;
+ for (SubRegMap::const_iterator SI = NewSubReg->SubRegs.begin(),
+ SE = NewSubReg->SubRegs.end(); SI != SE; ++SI) {
+ CodeGenSubRegIndex *SubIdx = getSubRegIndex(SI->second);
+ if (!SubIdx)
+ throw TGError(TheDef->getLoc(), "No SubRegIndex for " +
+ SI->second->getName() + " in " + getName());
+ NewIdx->addComposite(SI->first, SubIdx);
+ }
+ }
+}
+
+void CodeGenRegister::computeSuperRegs(CodeGenRegBank &RegBank) {
+ // Only visit each register once.
+ if (SuperRegsComplete)
+ return;
+ SuperRegsComplete = true;
+
+ // Make sure all sub-registers have been visited first, so the super-reg
+ // lists will be topologically ordered.
+ for (SubRegMap::const_iterator I = SubRegs.begin(), E = SubRegs.end();
+ I != E; ++I)
+ I->second->computeSuperRegs(RegBank);
+
+ // Now add this as a super-register on all sub-registers.
+ // Also compute the TopoSigId in post-order.
+ TopoSigId Id;
+ for (SubRegMap::const_iterator I = SubRegs.begin(), E = SubRegs.end();
+ I != E; ++I) {
+ // Topological signature computed from SubIdx, TopoId(SubReg).
+ // Loops and idempotent indices have TopoSig = ~0u.
+ Id.push_back(I->first->EnumValue);
+ Id.push_back(I->second->TopoSig);
+
+ // Don't add duplicate entries.
+ if (!I->second->SuperRegs.empty() && I->second->SuperRegs.back() == this)
+ continue;
+ I->second->SuperRegs.push_back(this);
+ }
+ TopoSig = RegBank.getTopoSig(Id);
+}
+
void
CodeGenRegister::addSubRegsPreOrder(SetVector<const CodeGenRegister*> &OSet,
CodeGenRegBank &RegBank) const {
assert(SubRegsComplete && "Must precompute sub-registers");
- std::vector<Record*> Indices = TheDef->getValueAsListOfDefs("SubRegIndices");
- for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
- CodeGenSubRegIndex *Idx = RegBank.getSubRegIdx(Indices[i]);
- CodeGenRegister *SR = SubRegs.find(Idx)->second;
+ for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
+ CodeGenRegister *SR = ExplicitSubRegs[i];
if (OSet.insert(SR))
SR->addSubRegsPreOrder(OSet, RegBank);
}
+ // Add any secondary sub-registers that weren't part of the explicit tree.
+ for (SubRegMap::const_iterator I = SubRegs.begin(), E = SubRegs.end();
+ I != E; ++I)
+ OSet.insert(I->second);
+}
+
+// Compute overlapping registers.
+//
+// The standard set is all super-registers and all sub-registers, but the
+// target description can add arbitrary overlapping registers via the 'Aliases'
+// field. This complicates things, but we can compute overlapping sets using
+// the following rules:
+//
+// 1. The relation overlap(A, B) is reflexive and symmetric but not transitive.
+//
+// 2. overlap(A, B) implies overlap(A, S) for all S in supers(B).
+//
+// Alternatively:
+//
+// overlap(A, B) iff there exists:
+// A' in { A, subregs(A) } and B' in { B, subregs(B) } such that:
+// A' = B' or A' in aliases(B') or B' in aliases(A').
+//
+// Here subregs(A) is the full flattened sub-register set returned by
+// A.getSubRegs() while aliases(A) is simply the special 'Aliases' field in the
+// description of register A.
+//
+// This also implies that registers with a common sub-register are considered
+// overlapping. This can happen when forming register pairs:
+//
+// P0 = (R0, R1)
+// P1 = (R1, R2)
+// P2 = (R2, R3)
+//
+// In this case, we will infer an overlap between P0 and P1 because of the
+// shared sub-register R1. There is no overlap between P0 and P2.
+//
+void CodeGenRegister::computeOverlaps(CodeGenRegister::Set &Overlaps,
+ const CodeGenRegBank &RegBank) const {
+ assert(!RegUnits.empty() && "Compute register units before overlaps.");
+
+ // Register units are assigned such that the overlapping registers are the
+ // super-registers of the root registers of the register units.
+ for (unsigned rui = 0, rue = RegUnits.size(); rui != rue; ++rui) {
+ const RegUnit &RU = RegBank.getRegUnit(RegUnits[rui]);
+ ArrayRef<const CodeGenRegister*> Roots = RU.getRoots();
+ for (unsigned ri = 0, re = Roots.size(); ri != re; ++ri) {
+ const CodeGenRegister *Root = Roots[ri];
+ Overlaps.insert(Root);
+ ArrayRef<const CodeGenRegister*> Supers = Root->getSuperRegs();
+ Overlaps.insert(Supers.begin(), Supers.end());
+ }
+ }
}
// Get the sum of this register's unit weights.
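The new computeOverlaps() above relies on the register-unit invariant stated in CodeGenRegisters.h later in this patch: two registers overlap if and only if they share a register unit. With sorted unit lists, the test itself reduces to a set intersection. A minimal sketch of that invariant using plain integer unit IDs (the types are simplified stand-ins for RegUnitList):

    #include <algorithm>
    #include <cassert>
    #include <iterator>
    #include <vector>

    // Each register carries a sorted, ascending list of register unit
    // IDs, like CodeGenRegister::RegUnits.
    using RegUnitList = std::vector<unsigned>;

    // Overlap test under the unit invariant: any common unit => overlap.
    static bool overlaps(const RegUnitList &A, const RegUnitList &B) {
      RegUnitList Common;
      std::set_intersection(A.begin(), A.end(), B.begin(), B.end(),
                            std::back_inserter(Common));
      return !Common.empty();
    }

    int main() {
      // P0 = (R0, R1), P1 = (R1, R2), P2 = (R2, R3), as in the comment:
      RegUnitList P0{0, 1}, P1{1, 2}, P2{2, 3};
      assert(overlaps(P0, P1));   // share unit 1
      assert(!overlaps(P0, P2));  // no common unit
    }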
@@ -343,7 +555,7 @@ unsigned CodeGenRegister::getWeight(const CodeGenRegBank &RegBank) const {
unsigned Weight = 0;
for (RegUnitList::const_iterator I = RegUnits.begin(), E = RegUnits.end();
I != E; ++I) {
- Weight += RegBank.getRegUnitWeight(*I);
+ Weight += RegBank.getRegUnit(*I).Weight;
}
return Weight;
}
@@ -462,7 +674,10 @@ struct TupleExpander : SetTheory::Expander {
//===----------------------------------------------------------------------===//
CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
- : TheDef(R), Name(R->getName()), EnumValue(-1) {
+ : TheDef(R),
+ Name(R->getName()),
+ TopoSigs(RegBank.getNumTopoSigs()),
+ EnumValue(-1) {
// Rename anonymous register classes.
if (R->getName().size() > 9 && R->getName()[9] == '.') {
static unsigned AnonCounter = 0;
@@ -487,7 +702,9 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
// Default allocation order always contains all registers.
for (unsigned i = 0, e = Elements->size(); i != e; ++i) {
Orders[0].push_back((*Elements)[i]);
- Members.insert(RegBank.getReg((*Elements)[i]));
+ const CodeGenRegister *Reg = RegBank.getReg((*Elements)[i]);
+ Members.insert(Reg);
+ TopoSigs.set(Reg->getTopoSig());
}
// Alternative allocation orders may be subsets.
@@ -505,29 +722,6 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
}
}
- // SubRegClasses is a list<dag> containing (RC, subregindex, ...) dags.
- ListInit *SRC = R->getValueAsListInit("SubRegClasses");
- for (ListInit::const_iterator i = SRC->begin(), e = SRC->end(); i != e; ++i) {
- DagInit *DAG = dynamic_cast<DagInit*>(*i);
- if (!DAG) throw "SubRegClasses must contain DAGs";
- DefInit *DAGOp = dynamic_cast<DefInit*>(DAG->getOperator());
- Record *RCRec;
- if (!DAGOp || !(RCRec = DAGOp->getDef())->isSubClassOf("RegisterClass"))
- throw "Operator '" + DAG->getOperator()->getAsString() +
- "' in SubRegClasses is not a RegisterClass";
- // Iterate over args, all SubRegIndex instances.
- for (DagInit::const_arg_iterator ai = DAG->arg_begin(), ae = DAG->arg_end();
- ai != ae; ++ai) {
- DefInit *Idx = dynamic_cast<DefInit*>(*ai);
- Record *IdxRec;
- if (!Idx || !(IdxRec = Idx->getDef())->isSubClassOf("SubRegIndex"))
- throw "Argument '" + (*ai)->getAsString() +
- "' in SubRegClasses is not a SubRegIndex";
- if (!SubRegClasses.insert(std::make_pair(IdxRec, RCRec)).second)
- throw "SubRegIndex '" + IdxRec->getName() + "' mentioned twice";
- }
- }
-
// Allow targets to override the size in bits of the RegisterClass.
unsigned Size = R->getValueAsInt("Size");
@@ -542,15 +736,20 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
// Create an inferred register class that was missing from the .td files.
// Most properties will be inherited from the closest super-class after the
// class structure has been computed.
-CodeGenRegisterClass::CodeGenRegisterClass(StringRef Name, Key Props)
+CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank,
+ StringRef Name, Key Props)
: Members(*Props.Members),
TheDef(0),
Name(Name),
+ TopoSigs(RegBank.getNumTopoSigs()),
EnumValue(-1),
SpillSize(Props.SpillSize),
SpillAlignment(Props.SpillAlignment),
CopyCost(0),
Allocatable(true) {
+ for (CodeGenRegister::Set::iterator I = Members.begin(), E = Members.end();
+ I != E; ++I)
+ TopoSigs.set((*I)->getTopoSig());
}
 // Compute inherited properties for a synthesized register class.
@@ -634,13 +833,6 @@ static int TopoOrderRC(const void *PA, const void *PB) {
if (A == B)
return 0;
- // Order by descending set size. Note that the classes' allocation order may
- // not have been computed yet. The Members set is always vaild.
- if (A->getMembers().size() > B->getMembers().size())
- return -1;
- if (A->getMembers().size() < B->getMembers().size())
- return 1;
-
// Order by ascending spill size.
if (A->SpillSize < B->SpillSize)
return -1;
@@ -653,6 +845,13 @@ static int TopoOrderRC(const void *PA, const void *PB) {
if (A->SpillAlignment > B->SpillAlignment)
return 1;
+ // Order by descending set size. Note that the classes' allocation order may
+  // not have been computed yet. The Members set is always valid.
+ if (A->getMembers().size() > B->getMembers().size())
+ return -1;
+ if (A->getMembers().size() < B->getMembers().size())
+ return 1;
+
// Finally order by name as a tie breaker.
return StringRef(A->getName()).compare(B->getName());
}
@@ -687,7 +886,7 @@ void CodeGenRegisterClass::computeSubClasses(CodeGenRegBank &RegBank) {
RC.SubClasses |= SubRC->SubClasses;
}
- // Sweep up missed clique members. They will be immediately preceeding RC.
+ // Sweep up missed clique members. They will be immediately preceding RC.
for (unsigned s = rci - 1; s && testSubClass(&RC, RegClasses[s - 1]); --s)
RC.SubClasses.set(s - 1);
}
@@ -772,15 +971,29 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) : Records(Records) {
getReg((*TupRegs)[j]);
}
- // Precompute all sub-register maps now all the registers are known.
+ // Now all the registers are known. Build the object graph of explicit
+ // register-register references.
+ for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+ Registers[i]->buildObjectGraph(*this);
+
+ // Precompute all sub-register maps.
// This will create Composite entries for all inferred sub-register indices.
- NumRegUnits = 0;
for (unsigned i = 0, e = Registers.size(); i != e; ++i)
- Registers[i]->getSubRegs(*this);
+ Registers[i]->computeSubRegs(*this);
+
+ // Infer even more sub-registers by combining leading super-registers.
+ for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+ if (Registers[i]->CoveredBySubRegs)
+ Registers[i]->computeSecondarySubRegs(*this);
+
+ // After the sub-register graph is complete, compute the topologically
+ // ordered SuperRegs list.
+ for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+ Registers[i]->computeSuperRegs(*this);
// Native register units are associated with a leaf register. They've all been
// discovered now.
- NumNativeRegUnits = NumRegUnits;
+ NumNativeRegUnits = RegUnits.size();
// Read in register class definitions.
std::vector<Record*> RCs = Records.getAllDerivedDefinitions("RegisterClass");
@@ -844,7 +1057,7 @@ CodeGenRegBank::getOrCreateSubClass(const CodeGenRegisterClass *RC,
return FoundI->second;
// Sub-class doesn't exist, create a new one.
- CodeGenRegisterClass *NewRC = new CodeGenRegisterClass(Name, K);
+ CodeGenRegisterClass *NewRC = new CodeGenRegisterClass(*this, Name, K);
addToMaps(NewRC);
return NewRC;
}
@@ -871,9 +1084,37 @@ CodeGenRegBank::getCompositeSubRegIndex(CodeGenSubRegIndex *A,
return Comp;
}
+CodeGenSubRegIndex *CodeGenRegBank::
+getConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex*, 8> &Parts) {
+ assert(Parts.size() > 1 && "Need two parts to concatenate");
+
+ // Look for an existing entry.
+ CodeGenSubRegIndex *&Idx = ConcatIdx[Parts];
+ if (Idx)
+ return Idx;
+
+ // None exists, synthesize one.
+ std::string Name = Parts.front()->getName();
+ for (unsigned i = 1, e = Parts.size(); i != e; ++i) {
+ Name += '_';
+ Name += Parts[i]->getName();
+ }
+ return Idx = getSubRegIdx(new Record(Name, SMLoc(), Records));
+}
+
void CodeGenRegBank::computeComposites() {
+ // Keep track of TopoSigs visited. We only need to visit each TopoSig once,
+ // and many registers will share TopoSigs on regular architectures.
+ BitVector TopoSigs(getNumTopoSigs());
+
for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
CodeGenRegister *Reg1 = Registers[i];
+
+ // Skip identical subreg structures already processed.
+ if (TopoSigs.test(Reg1->getTopoSig()))
+ continue;
+ TopoSigs.set(Reg1->getTopoSig());
+
const CodeGenRegister::SubRegMap &SRM1 = Reg1->getSubRegs();
for (CodeGenRegister::SubRegMap::const_iterator i1 = SRM1.begin(),
e1 = SRM1.end(); i1 != e1; ++i1) {
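getConcatSubRegIndex() above uses a common memoization idiom: ConcatIdx[Parts] both looks up and default-constructs the slot, and on a miss the same reference is filled in and returned. A small standalone sketch of the cache shape over strings (names and the string-valued cache are illustrative only):

    #include <cassert>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Cache keyed by the list of part names, like ConcatIdxMap.
    static std::map<std::vector<std::string>, std::string> ConcatCache;

    // Return the memoized concatenated name "a_b" for {"a", "b"}.
    static const std::string &
    getConcatName(const std::vector<std::string> &Parts) {
      assert(Parts.size() > 1 && "Need two parts to concatenate");
      std::string &Name = ConcatCache[Parts]; // lookup-or-insert in one step
      if (!Name.empty())
        return Name;                          // existing entry
      Name = Parts.front();
      for (size_t i = 1, e = Parts.size(); i != e; ++i)
        Name += "_" + Parts[i];
      return Name;
    }

    int main() {
      std::cout << getConcatName({"dsub_0", "dsub_1"}) << "\n";
      // prints: dsub_0_dsub_1
    }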
@@ -886,23 +1127,21 @@ void CodeGenRegBank::computeComposites() {
// Try composing Idx1 with another SubRegIndex.
for (CodeGenRegister::SubRegMap::const_iterator i2 = SRM2.begin(),
e2 = SRM2.end(); i2 != e2; ++i2) {
- CodeGenSubRegIndex *Idx2 = i2->first;
+ CodeGenSubRegIndex *Idx2 = i2->first;
CodeGenRegister *Reg3 = i2->second;
// Ignore identity compositions.
if (Reg2 == Reg3)
continue;
// OK Reg1:IdxPair == Reg3. Find the index with Reg:Idx == Reg3.
- for (CodeGenRegister::SubRegMap::const_iterator i1d = SRM1.begin(),
- e1d = SRM1.end(); i1d != e1d; ++i1d) {
- if (i1d->second == Reg3) {
- // Conflicting composition? Emit a warning but allow it.
- if (CodeGenSubRegIndex *Prev = Idx1->addComposite(Idx2, i1d->first))
- PrintWarning(Twine("SubRegIndex") + Idx1->getQualifiedName() +
- " and " + Idx2->getQualifiedName() +
- " compose ambiguously as " + Prev->getQualifiedName() +
- " or " + i1d->first->getQualifiedName());
- }
- }
+ CodeGenSubRegIndex *Idx3 = Reg1->getSubRegIndex(Reg3);
+ assert(Idx3 && "Sub-register doesn't have an index");
+
+ // Conflicting composition? Emit a warning but allow it.
+ if (CodeGenSubRegIndex *Prev = Idx1->addComposite(Idx2, Idx3))
+ PrintWarning(Twine("SubRegIndex ") + Idx1->getQualifiedName() +
+ " and " + Idx2->getQualifiedName() +
+ " compose ambiguously as " + Prev->getQualifiedName() +
+ " or " + Idx3->getQualifiedName());
}
}
}
@@ -1023,7 +1262,7 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
Reg = UnitI.getReg();
Weight = 0;
}
- unsigned UWeight = RegBank.getRegUnitWeight(*UnitI);
+ unsigned UWeight = RegBank.getRegUnit(*UnitI).Weight;
if (!UWeight) {
UWeight = 1;
RegBank.increaseRegUnitWeight(*UnitI, UWeight);
@@ -1059,17 +1298,21 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
static bool normalizeWeight(CodeGenRegister *Reg,
std::vector<UberRegSet> &UberSets,
std::vector<UberRegSet*> &RegSets,
+ std::set<unsigned> &NormalRegs,
CodeGenRegister::RegUnitList &NormalUnits,
CodeGenRegBank &RegBank) {
bool Changed = false;
+ if (!NormalRegs.insert(Reg->EnumValue).second)
+ return Changed;
+
const CodeGenRegister::SubRegMap &SRM = Reg->getSubRegs();
for (CodeGenRegister::SubRegMap::const_iterator SRI = SRM.begin(),
SRE = SRM.end(); SRI != SRE; ++SRI) {
if (SRI->second == Reg)
continue; // self-cycles happen
- Changed |=
- normalizeWeight(SRI->second, UberSets, RegSets, NormalUnits, RegBank);
+ Changed |= normalizeWeight(SRI->second, UberSets, RegSets,
+ NormalRegs, NormalUnits, RegBank);
}
// Postorder register normalization.
@@ -1114,11 +1357,6 @@ static bool normalizeWeight(CodeGenRegister *Reg,
// The goal is that two registers in the same class will have the same weight,
// where each register's weight is defined as sum of its units' weights.
void CodeGenRegBank::computeRegUnitWeights() {
- assert(RegUnitWeights.empty() && "Only initialize RegUnitWeights once");
-
- // Only allocatable units will be initialized to nonzero weight.
- RegUnitWeights.resize(NumRegUnits);
-
std::vector<UberRegSet> UberSets;
std::vector<UberRegSet*> RegSets(Registers.size());
computeUberSets(UberSets, RegSets, *this);
@@ -1134,8 +1372,9 @@ void CodeGenRegBank::computeRegUnitWeights() {
Changed = false;
for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
CodeGenRegister::RegUnitList NormalUnits;
- Changed |=
- normalizeWeight(Registers[i], UberSets, RegSets, NormalUnits, *this);
+ std::set<unsigned> NormalRegs;
+ Changed |= normalizeWeight(Registers[i], UberSets, RegSets,
+ NormalRegs, NormalUnits, *this);
}
}
}
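The NormalRegs parameter threaded through normalizeWeight() above is a visited set that terminates the recursion when the sub-register graph contains cycles (for example, a register that lists itself as a sub-register). A generic sketch of that guard over integer node IDs (the graph type is a stand-in, not the CodeGen data structures):

    #include <set>
    #include <vector>

    // Toy graph: Succ[n] lists nodes reachable from n; cycles allowed.
    using Graph = std::vector<std::vector<unsigned>>;

    // Post-order visit with a visited set, mirroring how normalizeWeight()
    // uses NormalRegs: insert() both tests and marks in one call.
    static void visit(const Graph &G, unsigned N, std::set<unsigned> &Seen) {
      if (!Seen.insert(N).second)
        return;                   // already processed; breaks cycles
      for (unsigned S : G[N])
        visit(G, S, Seen);
      // ... post-order work on N would go here ...
    }

    int main() {
      Graph G{{1}, {0, 2}, {}};   // 0 -> 1 -> 0 cycle, plus 1 -> 2
      std::set<unsigned> Seen;
      visit(G, 0, Seen);          // terminates despite the cycle
    }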
@@ -1294,80 +1533,6 @@ void CodeGenRegBank::computeRegUnitSets() {
}
}
-// Compute sets of overlapping registers.
-//
-// The standard set is all super-registers and all sub-registers, but the
-// target description can add arbitrary overlapping registers via the 'Aliases'
-// field. This complicates things, but we can compute overlapping sets using
-// the following rules:
-//
-// 1. The relation overlap(A, B) is reflexive and symmetric but not transitive.
-//
-// 2. overlap(A, B) implies overlap(A, S) for all S in supers(B).
-//
-// Alternatively:
-//
-// overlap(A, B) iff there exists:
-// A' in { A, subregs(A) } and B' in { B, subregs(B) } such that:
-// A' = B' or A' in aliases(B') or B' in aliases(A').
-//
-// Here subregs(A) is the full flattened sub-register set returned by
-// A.getSubRegs() while aliases(A) is simply the special 'Aliases' field in the
-// description of register A.
-//
-// This also implies that registers with a common sub-register are considered
-// overlapping. This can happen when forming register pairs:
-//
-// P0 = (R0, R1)
-// P1 = (R1, R2)
-// P2 = (R2, R3)
-//
-// In this case, we will infer an overlap between P0 and P1 because of the
-// shared sub-register R1. There is no overlap between P0 and P2.
-//
-void CodeGenRegBank::
-computeOverlaps(std::map<const CodeGenRegister*, CodeGenRegister::Set> &Map) {
- assert(Map.empty());
-
- // Collect overlaps that don't follow from rule 2.
- for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
- CodeGenRegister *Reg = Registers[i];
- CodeGenRegister::Set &Overlaps = Map[Reg];
-
- // Reg overlaps itself.
- Overlaps.insert(Reg);
-
- // All super-registers overlap.
- const CodeGenRegister::SuperRegList &Supers = Reg->getSuperRegs();
- Overlaps.insert(Supers.begin(), Supers.end());
-
- // Form symmetrical relations from the special Aliases[] lists.
- std::vector<Record*> RegList = Reg->TheDef->getValueAsListOfDefs("Aliases");
- for (unsigned i2 = 0, e2 = RegList.size(); i2 != e2; ++i2) {
- CodeGenRegister *Reg2 = getReg(RegList[i2]);
- CodeGenRegister::Set &Overlaps2 = Map[Reg2];
- const CodeGenRegister::SuperRegList &Supers2 = Reg2->getSuperRegs();
- // Reg overlaps Reg2 which implies it overlaps supers(Reg2).
- Overlaps.insert(Reg2);
- Overlaps.insert(Supers2.begin(), Supers2.end());
- Overlaps2.insert(Reg);
- Overlaps2.insert(Supers.begin(), Supers.end());
- }
- }
-
- // Apply rule 2. and inherit all sub-register overlaps.
- for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
- CodeGenRegister *Reg = Registers[i];
- CodeGenRegister::Set &Overlaps = Map[Reg];
- const CodeGenRegister::SubRegMap &SRM = Reg->getSubRegs();
- for (CodeGenRegister::SubRegMap::const_iterator i2 = SRM.begin(),
- e2 = SRM.end(); i2 != e2; ++i2) {
- CodeGenRegister::Set &Overlaps2 = Map[i2->second];
- Overlaps.insert(Overlaps2.begin(), Overlaps2.end());
- }
- }
-}
-
void CodeGenRegBank::computeDerivedInfo() {
computeComposites();
@@ -1471,6 +1636,7 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
unsigned FirstSubRegRC) {
SmallVector<std::pair<const CodeGenRegister*,
const CodeGenRegister*>, 16> SSPairs;
+ BitVector TopoSigs(getNumTopoSigs());
// Iterate in SubRegIndex numerical order to visit synthetic indices last.
for (unsigned sri = 0, sre = SubRegIndices.size(); sri != sre; ++sri) {
@@ -1483,12 +1649,14 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
// Build list of (Super, Sub) pairs for this SubIdx.
SSPairs.clear();
+ TopoSigs.reset();
for (CodeGenRegister::Set::const_iterator RI = RC->getMembers().begin(),
RE = RC->getMembers().end(); RI != RE; ++RI) {
const CodeGenRegister *Super = *RI;
const CodeGenRegister *Sub = Super->getSubRegs().find(SubIdx)->second;
assert(Sub && "Missing sub-register");
SSPairs.push_back(std::make_pair(Super, Sub));
+ TopoSigs.set(Sub->getTopoSig());
}
// Iterate over sub-register class candidates. Ignore classes created by
@@ -1496,6 +1664,9 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
for (unsigned rci = FirstSubRegRC, rce = RegClasses.size(); rci != rce;
++rci) {
CodeGenRegisterClass *SubRC = RegClasses[rci];
+ // Topological shortcut: SubRC members have the wrong shape.
+ if (!TopoSigs.anyCommon(SubRC->getTopoSigs()))
+ continue;
// Compute the subset of RC that maps into SubRC.
CodeGenRegister::Set SubSet;
for (unsigned i = 0, e = SSPairs.size(); i != e; ++i)
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index 232a6e7..eb6724e 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -35,9 +35,10 @@ namespace llvm {
/// CodeGenSubRegIndex - Represents a sub-register index.
class CodeGenSubRegIndex {
Record *const TheDef;
- const unsigned EnumValue;
public:
+ const unsigned EnumValue;
+
CodeGenSubRegIndex(Record *R, unsigned Enum);
const std::string &getName() const;
@@ -67,6 +68,7 @@ namespace llvm {
// Return a conflicting composite, or NULL
CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A,
CodeGenSubRegIndex *B) {
+ assert(A && B);
std::pair<CompMap::iterator, bool> Ins =
Composed.insert(std::make_pair(A, B));
return (Ins.second || Ins.first->second == B) ? 0 : Ins.first->second;
@@ -100,9 +102,20 @@ namespace llvm {
const std::string &getName() const;
- // Get a map of sub-registers computed lazily.
+ // Extract more information from TheDef. This is used to build an object
+ // graph after all CodeGenRegister objects have been created.
+ void buildObjectGraph(CodeGenRegBank&);
+
+ // Lazily compute a map of all sub-registers.
// This includes unique entries for all sub-sub-registers.
- const SubRegMap &getSubRegs(CodeGenRegBank&);
+ const SubRegMap &computeSubRegs(CodeGenRegBank&);
+
+ // Compute extra sub-registers by combining the existing sub-registers.
+ void computeSecondarySubRegs(CodeGenRegBank&);
+
+ // Add this as a super-register to all sub-registers after the sub-register
+ // graph has been built.
+ void computeSuperRegs(CodeGenRegBank&);
const SubRegMap &getSubRegs() const {
assert(SubRegsComplete && "Must precompute sub-registers");
@@ -113,23 +126,54 @@ namespace llvm {
void addSubRegsPreOrder(SetVector<const CodeGenRegister*> &OSet,
CodeGenRegBank&) const;
- // List of super-registers in topological order, small to large.
+ // Return the sub-register index naming Reg as a sub-register of this
+ // register. Returns NULL if Reg is not a sub-register.
+ CodeGenSubRegIndex *getSubRegIndex(const CodeGenRegister *Reg) const {
+ return SubReg2Idx.lookup(Reg);
+ }
+
typedef std::vector<const CodeGenRegister*> SuperRegList;
- // Get the list of super-registers. This is valid after getSubReg
- // visits all registers during RegBank construction.
+ // Get the list of super-registers in topological order, small to large.
+ // This is valid after computeSubRegs visits all registers during RegBank
+ // construction.
const SuperRegList &getSuperRegs() const {
assert(SubRegsComplete && "Must precompute sub-registers");
return SuperRegs;
}
+ // Get the list of ad hoc aliases. The graph is symmetric, so the list
+ // contains all registers in 'Aliases', and all registers that mention this
+ // register in 'Aliases'.
+ ArrayRef<CodeGenRegister*> getExplicitAliases() const {
+ return ExplicitAliases;
+ }
+
+ // Get the topological signature of this register. This is a small integer
+ // less than RegBank.getNumTopoSigs(). Registers with the same TopoSig have
+ // identical sub-register structure. That is, they support the same set of
+ // sub-register indices mapping to the same kind of sub-registers
+ // (TopoSig-wise).
+ unsigned getTopoSig() const {
+ assert(SuperRegsComplete && "TopoSigs haven't been computed yet.");
+ return TopoSig;
+ }
+
// List of register units in ascending order.
typedef SmallVector<unsigned, 16> RegUnitList;
+ // How many entries in RegUnitList are native?
+ unsigned NumNativeRegUnits;
+
// Get the list of register units.
- // This is only valid after getSubRegs() completes.
+ // This is only valid after computeSubRegs() completes.
const RegUnitList &getRegUnits() const { return RegUnits; }
+ // Get the native register units. This is a prefix of getRegUnits().
+ ArrayRef<unsigned> getNativeRegUnits() const {
+ return makeArrayRef(RegUnits).slice(0, NumNativeRegUnits);
+ }
+
// Inherit register units from subregisters.
// Return true if the RegUnits changed.
bool inheritRegUnits(CodeGenRegBank &RegBank);
@@ -153,10 +197,27 @@ namespace llvm {
// Canonically ordered set.
typedef std::set<const CodeGenRegister*, Less> Set;
+ // Compute the set of registers overlapping this.
+ void computeOverlaps(Set &Overlaps, const CodeGenRegBank&) const;
+
private:
bool SubRegsComplete;
+ bool SuperRegsComplete;
+ unsigned TopoSig;
+
+ // The sub-registers explicit in the .td file form a tree.
+ SmallVector<CodeGenSubRegIndex*, 8> ExplicitSubRegIndices;
+ SmallVector<CodeGenRegister*, 8> ExplicitSubRegs;
+
+ // Explicit ad hoc aliases, symmetrized to form an undirected graph.
+ SmallVector<CodeGenRegister*, 8> ExplicitAliases;
+
+ // Super-registers where this is the first explicit sub-register.
+ SuperRegList LeadingSuperRegs;
+
SubRegMap SubRegs;
SuperRegList SuperRegs;
+ DenseMap<const CodeGenRegister*, CodeGenSubRegIndex*> SubReg2Idx;
RegUnitList RegUnits;
};
@@ -189,6 +250,10 @@ namespace llvm {
DenseMap<CodeGenSubRegIndex*,
SmallPtrSet<CodeGenRegisterClass*, 8> > SuperRegClasses;
+ // Bit vector of TopoSigs for the registers in this class. This will be
+ // very sparse on regular architectures.
+ BitVector TopoSigs;
+
public:
unsigned EnumValue;
std::string Namespace;
@@ -197,8 +262,6 @@ namespace llvm {
unsigned SpillAlignment;
int CopyCost;
bool Allocatable;
- // Map SubRegIndex -> RegisterClass
- DenseMap<Record*,Record*> SubRegClasses;
std::string AltOrderSelect;
// Return the Record that defined this class, or NULL if the class was
@@ -279,6 +342,9 @@ namespace llvm {
// getOrder(0).
const CodeGenRegister::Set &getMembers() const { return Members; }
+ // Get a bit vector of TopoSigs present in this register class.
+ const BitVector &getTopoSigs() const { return TopoSigs; }
+
// Populate a unique sorted list of units from a register set.
void buildRegUnitSet(std::vector<unsigned> &RegUnits) const;
@@ -310,12 +376,37 @@ namespace llvm {
};
// Create a non-user defined register class.
- CodeGenRegisterClass(StringRef Name, Key Props);
+ CodeGenRegisterClass(CodeGenRegBank&, StringRef Name, Key Props);
// Called by CodeGenRegBank::CodeGenRegBank().
static void computeSubClasses(CodeGenRegBank&);
};
+ // Register units are used to model interference and register pressure.
+ // Every register is assigned one or more register units such that two
+ // registers overlap if and only if they have a register unit in common.
+ //
+ // Normally, one register unit is created per leaf register. Non-leaf
+ // registers inherit the units of their sub-registers.
+ struct RegUnit {
+ // Weight assigned to this RegUnit for estimating register pressure.
+ // This is useful when equalizing weights in register classes with mixed
+ // register topologies.
+ unsigned Weight;
+
+ // Each native RegUnit corresponds to one or two root registers. The full
+ // set of registers containing this unit can be computed as the union of
+ // these two registers and their super-registers.
+ const CodeGenRegister *Roots[2];
+
+ RegUnit() : Weight(0) { Roots[0] = Roots[1] = 0; }
+
+ ArrayRef<const CodeGenRegister*> getRoots() const {
+ assert(!(Roots[1] && !Roots[0]) && "Invalid roots array");
+ return makeArrayRef(Roots, !!Roots[0] + !!Roots[1]);
+ }
+ };
+
// Each RegUnitSet is a sorted vector with a name.
struct RegUnitSet {
typedef std::vector<unsigned>::const_iterator iterator;
@@ -324,6 +415,10 @@ namespace llvm {
std::vector<unsigned> Units;
};
+ // Base vector for identifying TopoSigs. The contents uniquely identify a
+ // TopoSig, only computeSuperRegs needs to know how.
+ typedef SmallVector<unsigned, 16> TopoSigId;
+
// CodeGenRegBank - Represent a target's registers and the relations between
// them.
class CodeGenRegBank {
@@ -335,15 +430,19 @@ namespace llvm {
DenseMap<Record*, CodeGenSubRegIndex*> Def2SubRegIdx;
unsigned NumNamedIndices;
+ typedef std::map<SmallVector<CodeGenSubRegIndex*, 8>,
+ CodeGenSubRegIndex*> ConcatIdxMap;
+ ConcatIdxMap ConcatIdx;
+
// Registers.
std::vector<CodeGenRegister*> Registers;
DenseMap<Record*, CodeGenRegister*> Def2Reg;
unsigned NumNativeRegUnits;
- unsigned NumRegUnits; // # native + adopted register units.
- // Map each register unit to a weight (for register pressure).
- // Includes native and adopted register units.
- std::vector<unsigned> RegUnitWeights;
+ std::map<TopoSigId, unsigned> TopoSigs;
+
+ // Includes native (0..NumNativeRegUnits-1) and adopted register units.
+ SmallVector<RegUnit, 8> RegUnits;
// Register classes.
std::vector<CodeGenRegisterClass*> RegClasses;
@@ -405,6 +504,17 @@ namespace llvm {
CodeGenSubRegIndex *getCompositeSubRegIndex(CodeGenSubRegIndex *A,
CodeGenSubRegIndex *B);
+ // Find or create a sub-register index representing the concatenation of
+ // non-overlapping sibling indices.
+ CodeGenSubRegIndex *
+ getConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex*, 8>&);
+
+ void
+ addConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex*, 8> &Parts,
+ CodeGenSubRegIndex *Idx) {
+ ConcatIdx.insert(std::make_pair(Parts, Idx));
+ }
+
const std::vector<CodeGenRegister*> &getRegisters() { return Registers; }
// Find a register from its Record def.
@@ -415,15 +525,34 @@ namespace llvm {
return Reg->EnumValue - 1;
}
+ // Return the number of allocated TopoSigs. The first TopoSig representing
+ // leaf registers is allocated number 0.
+ unsigned getNumTopoSigs() const {
+ return TopoSigs.size();
+ }
+
+ // Find or create a TopoSig for the given TopoSigId.
+ // This function is only for use by CodeGenRegister::computeSuperRegs().
+ // Others should simply use Reg->getTopoSig().
+ unsigned getTopoSig(const TopoSigId &Id) {
+ return TopoSigs.insert(std::make_pair(Id, TopoSigs.size())).first->second;
+ }
+
+ // Create a native register unit that is associated with one or two root
+ // registers.
+ unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = 0) {
+ RegUnits.resize(RegUnits.size() + 1);
+ RegUnits.back().Roots[0] = R0;
+ RegUnits.back().Roots[1] = R1;
+ return RegUnits.size() - 1;
+ }
+
// Create a new non-native register unit that can be adopted by a register
// to increase its pressure. Note that NumNativeRegUnits is not increased.
unsigned newRegUnit(unsigned Weight) {
- if (!RegUnitWeights.empty()) {
- assert(Weight && "should only add allocatable units");
- RegUnitWeights.resize(NumRegUnits+1);
- RegUnitWeights[NumRegUnits] = Weight;
- }
- return NumRegUnits++;
+ RegUnits.resize(RegUnits.size() + 1);
+ RegUnits.back().Weight = Weight;
+ return RegUnits.size() - 1;
}
// Native units are the singular unit of a leaf register. Register aliasing
@@ -433,6 +562,13 @@ namespace llvm {
return RUID < NumNativeRegUnits;
}
+ unsigned getNumNativeRegUnits() const {
+ return NumNativeRegUnits;
+ }
+
+ RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; }
+ const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; }
+
ArrayRef<CodeGenRegisterClass*> getRegClasses() const {
return RegClasses;
}
@@ -447,23 +583,18 @@ namespace llvm {
/// return the superclass. Otherwise return null.
const CodeGenRegisterClass* getRegClassForRegister(Record *R);
- // Get a register unit's weight. Zero for unallocatable registers.
- unsigned getRegUnitWeight(unsigned RUID) const {
- return RegUnitWeights[RUID];
- }
-
// Get the sum of unit weights.
unsigned getRegUnitSetWeight(const std::vector<unsigned> &Units) const {
unsigned Weight = 0;
for (std::vector<unsigned>::const_iterator
I = Units.begin(), E = Units.end(); I != E; ++I)
- Weight += getRegUnitWeight(*I);
+ Weight += getRegUnit(*I).Weight;
return Weight;
}
// Increase a RegUnitWeight.
void increaseRegUnitWeight(unsigned RUID, unsigned Inc) {
- RegUnitWeights[RUID] += Inc;
+ getRegUnit(RUID).Weight += Inc;
}
// Get the number of register pressure dimensions.
@@ -485,15 +616,6 @@ namespace llvm {
// Computed derived records such as missing sub-register indices.
void computeDerivedInfo();
- // Compute full overlap sets for every register. These sets include the
- // rarely used aliases that are neither sub nor super-registers.
- //
- // Map[R1].count(R2) is reflexive and symmetric, but not transitive.
- //
- // If R1 is a sub-register of R2, Map[R1] is a subset of Map[R2].
- void computeOverlaps(std::map<const CodeGenRegister*,
- CodeGenRegister::Set> &Map);
-
// Compute the set of registers completely covered by the registers in Regs.
// The returned BitVector will have a bit set for each register in Regs,
// all sub-registers, and all super-registers that are covered by the
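
Two idioms in the header above deserve a closer look. getRoots() counts the non-null roots with !!Roots[0] + !!Roots[1], relying on the asserted invariant that Roots[1] is only set when Roots[0] is. getTopoSig() uses the map-insert get-or-assign idiom to hand out dense, sequential signature numbers. A minimal stand-alone sketch of the latter, assuming nothing from the patch (all names here are invented):

    #include <cassert>
    #include <map>
    #include <utility>
    #include <vector>

    typedef std::vector<unsigned> TopoSigId;

    // Get-or-assign-next-index: if Id is already present, insert() is a
    // no-op and returns the existing entry; otherwise the new entry is
    // given the next sequential index (the map's size before insertion).
    static unsigned getTopoSig(std::map<TopoSigId, unsigned> &Sigs,
                               const TopoSigId &Id) {
      return Sigs.insert(std::make_pair(Id, Sigs.size())).first->second;
    }

    int main() {
      std::map<TopoSigId, unsigned> Sigs;
      TopoSigId Leaf(1, 0), Paired(1, 1);
      assert(getTopoSig(Sigs, Leaf) == 0);   // first signature gets 0
      assert(getTopoSig(Sigs, Paired) == 1); // a new signature gets 1
      assert(getTopoSig(Sigs, Leaf) == 0);   // repeated lookup is stable
      return 0;
    }
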
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
new file mode 100644
index 0000000..f57fd18
--- /dev/null
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -0,0 +1,151 @@
+//===- CodeGenSchedule.cpp - Scheduling Machine Models --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines structures to encapsulate the machine model as described in
+// the target description.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget-emitter"
+
+#include "CodeGenSchedule.h"
+#include "CodeGenTarget.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+// CodeGenSchedModels ctor interprets machine model records and populates maps.
+CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
+ const CodeGenTarget &TGT):
+ Records(RK), Target(TGT), NumItineraryClasses(0), HasProcItineraries(false) {
+
+ // Populate SchedClassIdxMap and set NumItineraryClasses.
+ CollectSchedClasses();
+
+ // Populate ProcModelMap.
+ CollectProcModels();
+}
+
+// Visit all the instruction definitions for this target to gather and enumerate
+// the itinerary classes. These are the explicitly specified SchedClasses. More
+// SchedClasses may be inferred.
+void CodeGenSchedModels::CollectSchedClasses() {
+
+ // NoItinerary is always the first class at Index=0
+ SchedClasses.resize(1);
+ SchedClasses.back().Name = "NoItinerary";
+ SchedClassIdxMap[SchedClasses.back().Name] = 0;
+
+ // Gather and sort all itinerary classes used by instruction descriptions.
+ std::vector<Record*> ItinClassList;
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ Record *SchedDef = (*I)->TheDef->getValueAsDef("Itinerary");
+ // Remember each new itinerary class; its real index is assigned below.
+ if (!SchedClassIdxMap.count(SchedDef->getName())) {
+ SchedClassIdxMap[SchedDef->getName()] = 0;
+ ItinClassList.push_back(SchedDef);
+ }
+ }
+ // Assign each itinerary class a unique number, skipping NoItinerary==0.
+ NumItineraryClasses = ItinClassList.size();
+ std::sort(ItinClassList.begin(), ItinClassList.end(), LessRecord());
+ for (unsigned i = 0, N = NumItineraryClasses; i < N; i++) {
+ Record *ItinDef = ItinClassList[i];
+ SchedClassIdxMap[ItinDef->getName()] = SchedClasses.size();
+ SchedClasses.push_back(CodeGenSchedClass(ItinDef));
+ }
+
+ // TODO: Infer classes from non-itinerary scheduler resources.
+}
+
+// Gather all processor models.
+void CodeGenSchedModels::CollectProcModels() {
+ std::vector<Record*> ProcRecords =
+ Records.getAllDerivedDefinitions("Processor");
+ std::sort(ProcRecords.begin(), ProcRecords.end(), LessRecordFieldName());
+
+ // Reserve space because we can. Reallocation would be ok.
+ ProcModels.reserve(ProcRecords.size());
+
+ // For each processor, find a unique machine model.
+ for (unsigned i = 0, N = ProcRecords.size(); i < N; ++i)
+ addProcModel(ProcRecords[i]);
+}
+
+// Get a unique processor model based on the defined MachineModel and
+// ProcessorItineraries.
+void CodeGenSchedModels::addProcModel(Record *ProcDef) {
+ unsigned Idx = getProcModelIdx(ProcDef);
+ if (Idx < ProcModels.size())
+ return;
+
+ Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
+ Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
+
+ std::string ModelName = ModelDef->getName();
+ const std::string &ItinName = ItinsDef->getName();
+
+ bool NoModel = ModelDef->getValueAsBit("NoModel");
+ bool hasTopLevelItin = !ItinsDef->getValueAsListOfDefs("IID").empty();
+ if (NoModel) {
+ // If an itinerary is defined without a machine model, infer a new model.
+ if (hasTopLevelItin) {
+ ModelName = ItinName + "Model";
+ ModelDef = NULL;
+ }
+ }
+ else {
+ // If a machine model is defined, the itinerary must be defined within it
+ // rather than in the Processor definition itself.
+ assert(!hasTopLevelItin && "Itinerary must be defined in SchedModel");
+ ItinsDef = ModelDef->getValueAsDef("Itineraries");
+ }
+
+ ProcModelMap[getProcModelKey(ProcDef)] = ProcModels.size();
+
+ ProcModels.push_back(CodeGenProcModel(ModelName, ModelDef, ItinsDef));
+
+ std::vector<Record*> ItinRecords = ItinsDef->getValueAsListOfDefs("IID");
+ CollectProcItin(ProcModels.back(), ItinRecords);
+}
+
+// Gather the processor itineraries.
+void CodeGenSchedModels::CollectProcItin(CodeGenProcModel &ProcModel,
+ std::vector<Record*> ItinRecords) {
+ // Skip empty itinerary.
+ if (ItinRecords.empty())
+ return;
+
+ HasProcItineraries = true;
+
+ ProcModel.ItinDefList.resize(NumItineraryClasses+1);
+
+ // Insert each itinerary data record in the correct position within
+ // the processor model's ItinDefList.
+ for (unsigned i = 0, N = ItinRecords.size(); i < N; i++) {
+ Record *ItinData = ItinRecords[i];
+ Record *ItinDef = ItinData->getValueAsDef("TheClass");
+ if (!SchedClassIdxMap.count(ItinDef->getName())) {
+ DEBUG(dbgs() << ProcModel.ItinsDef->getName()
+ << " has unused itinerary class " << ItinDef->getName() << '\n');
+ continue;
+ }
+ ProcModel.ItinDefList[getItinClassIdx(ItinDef)] = ItinData;
+ }
+#ifndef NDEBUG
+ // Check for missing itinerary entries.
+ assert(!ProcModel.ItinDefList[0] && "NoItinerary class can't have rec");
+ for (unsigned i = 1, N = ProcModel.ItinDefList.size(); i < N; ++i) {
+ if (!ProcModel.ItinDefList[i])
+ DEBUG(dbgs() << ProcModel.ItinsDef->getName()
+ << " missing itinerary for class " << SchedClasses[i].Name << '\n');
+ }
+#endif
+}
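
The index layout produced by CollectSchedClasses() above — NoItinerary pinned at index 0, the itinerary classes actually used by instructions sorted and numbered 1..NumItineraryClasses — is what everything downstream relies on. A self-contained sketch of just the numbering, with invented class names standing in for Records:

    #include <algorithm>
    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // Suppose the target's instructions reference these itinerary classes.
      std::vector<std::string> ItinClasses;
      ItinClasses.push_back("IIC_Store");
      ItinClasses.push_back("IIC_ALU");
      ItinClasses.push_back("IIC_Load");

      std::map<std::string, unsigned> SchedClassIdxMap;
      SchedClassIdxMap["NoItinerary"] = 0;     // always index 0

      // Used classes are sorted, then numbered 1..NumItineraryClasses.
      std::sort(ItinClasses.begin(), ItinClasses.end());
      for (unsigned i = 0, N = ItinClasses.size(); i != N; ++i)
        SchedClassIdxMap[ItinClasses[i]] = i + 1;

      assert(SchedClassIdxMap["IIC_ALU"] == 1);
      assert(SchedClassIdxMap["IIC_Load"] == 2);
      assert(SchedClassIdxMap["IIC_Store"] == 3);
      return 0;
    }
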
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
new file mode 100644
index 0000000..9da0145
--- /dev/null
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -0,0 +1,172 @@
+//===- CodeGenSchedule.h - Scheduling Machine Models ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines structures to encapsulate the machine model as described in
+// the target description.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_SCHEDULE_H
+#define CODEGEN_SCHEDULE_H
+
+#include "llvm/TableGen/Record.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace llvm {
+
+class CodeGenTarget;
+
+// Scheduling class.
+//
+// Each instruction description will be mapped to a scheduling class. It may be
+// an explicitly defined itinerary class, or an inferred class, in which case
+// ItinClassDef == NULL.
+struct CodeGenSchedClass {
+ std::string Name;
+ unsigned Index;
+ Record *ItinClassDef;
+
+ CodeGenSchedClass(): Index(0), ItinClassDef(0) {}
+ CodeGenSchedClass(Record *rec): Index(0), ItinClassDef(rec) {
+ Name = rec->getName();
+ }
+};
+
+// Processor model.
+//
+// ModelName is a unique name used to name an instantiation of MCSchedModel.
+//
+// ModelDef is NULL for inferred Models. This happens when a processor defines
+// an itinerary but no machine model. If the processor defines neither a machine
+// model nor itinerary, then ModelDef remains pointing to NoModel. NoModel has
+// the special "NoModel" field set to true.
+//
+// ItinsDef always points to a valid record definition, but may point to the
+// default NoItineraries. NoItineraries has an empty list of InstrItinData
+// records.
+//
+// ItinDefList orders this processor's InstrItinData records by SchedClass idx.
+struct CodeGenProcModel {
+ std::string ModelName;
+ Record *ModelDef;
+ Record *ItinsDef;
+
+ // Array of InstrItinData records indexed by CodeGenSchedClass::Index.
+ // The list is empty if the subtarget has no itineraries.
+ std::vector<Record *> ItinDefList;
+
+ CodeGenProcModel(const std::string &Name, Record *MDef, Record *IDef):
+ ModelName(Name), ModelDef(MDef), ItinsDef(IDef) {}
+};
+
+// Top level container for machine model data.
+class CodeGenSchedModels {
+ RecordKeeper &Records;
+ const CodeGenTarget &Target;
+
+ // List of unique SchedClasses.
+ std::vector<CodeGenSchedClass> SchedClasses;
+
+ // Map SchedClass name to itinerary index.
+ // These are either explicit itinerary classes or inferred classes.
+ StringMap<unsigned> SchedClassIdxMap;
+
+ // SchedClass indices 1 up to and including NumItineraryClasses identify
+ // itinerary classes that are explicitly used for this target's instruction
+ // definitions. NoItinerary always has index 0 regardless of whether it is
+ // explicitly referenced.
+ //
+ // Any inferred SchedClass has an index greater than NumItineraryClasses.
+ unsigned NumItineraryClasses;
+
+ // List of unique processor models.
+ std::vector<CodeGenProcModel> ProcModels;
+
+ // Map Processor's MachineModel + ProcItin fields to a CodeGenProcModel index.
+ typedef DenseMap<std::pair<Record*, Record*>, unsigned> ProcModelMapTy;
+ ProcModelMapTy ProcModelMap;
+
+ // True if any processors have nonempty itineraries.
+ bool HasProcItineraries;
+
+public:
+ CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT);
+
+ // Check if any instructions are assigned to an explicit itinerary class other
+ // than NoItinerary.
+ bool hasItineraryClasses() const { return NumItineraryClasses > 0; }
+
+ // Return the number of itinerary classes in use by this target's instruction
+ // descriptions, not including "NoItinerary".
+ unsigned numItineraryClasses() const {
+ return NumItineraryClasses;
+ }
+
+ // Get a SchedClass from its index.
+ const CodeGenSchedClass &getSchedClass(unsigned Idx) {
+ assert(Idx < SchedClasses.size() && "bad SchedClass index");
+ return SchedClasses[Idx];
+ }
+
+ // Get an itinerary class's index. Valid indices run from 0 for NoItinerary up
+ // to and including numItineraryClasses().
+ unsigned getItinClassIdx(Record *ItinDef) const {
+ assert(SchedClassIdxMap.count(ItinDef->getName()) && "missing ItinClass");
+ unsigned Idx = SchedClassIdxMap.lookup(ItinDef->getName());
+ assert(Idx <= NumItineraryClasses && "bad ItinClass index");
+ return Idx;
+ }
+
+ bool hasProcessorItineraries() const {
+ return HasProcItineraries;
+ }
+
+ // Get an existing machine model for a processor definition.
+ const CodeGenProcModel &getProcModel(Record *ProcDef) const {
+ unsigned idx = getProcModelIdx(ProcDef);
+ assert(idx < ProcModels.size() && "missing machine model");
+ return ProcModels[idx];
+ }
+
+ // Iterate over the unique processor models.
+ typedef std::vector<CodeGenProcModel>::const_iterator ProcIter;
+ ProcIter procModelBegin() const { return ProcModels.begin(); }
+ ProcIter procModelEnd() const { return ProcModels.end(); }
+
+private:
+ // Get a key that can uniquely identify a machine model.
+ ProcModelMapTy::key_type getProcModelKey(Record *ProcDef) const {
+ Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
+ Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
+ return std::make_pair(ModelDef, ItinsDef);
+ }
+
+ // Get the unique index of a machine model.
+ unsigned getProcModelIdx(Record *ProcDef) const {
+ ProcModelMapTy::const_iterator I =
+ ProcModelMap.find(getProcModelKey(ProcDef));
+ if (I == ProcModelMap.end())
+ return ProcModels.size();
+ return I->second;
+ }
+
+ // Initialize a new processor model if it is unique.
+ void addProcModel(Record *ProcDef);
+
+ void CollectSchedClasses();
+ void CollectProcModels();
+ void CollectProcItin(CodeGenProcModel &ProcModel,
+ std::vector<Record*> ItinRecords);
+};
+
+} // namespace llvm
+
+#endif
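
For a backend consuming this header, typical use is to walk the unique processor models via ProcIter. A minimal sketch; dumpProcModels is an invented helper, not part of the patch:

    #include "CodeGenSchedule.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Invented helper: list each unique processor model and how many
    // InstrItinData entries it carries.
    static void dumpProcModels(const CodeGenSchedModels &SchedModels,
                               raw_ostream &OS) {
      for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
           PE = SchedModels.procModelEnd(); PI != PE; ++PI)
        OS << PI->ModelName << ": " << PI->ItinDefList.size()
           << " itinerary entries\n";
    }
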
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index cf67935..1dd2efc 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -16,6 +16,7 @@
#include "CodeGenTarget.h"
#include "CodeGenIntrinsics.h"
+#include "CodeGenSchedule.h"
#include "llvm/TableGen/Record.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/STLExtras.h"
@@ -112,7 +113,7 @@ std::string llvm::getQualifiedName(const Record *R) {
/// getTarget - Return the current instance of the Target class.
///
CodeGenTarget::CodeGenTarget(RecordKeeper &records)
- : Records(records), RegBank(0) {
+ : Records(records), RegBank(0), SchedModels(0) {
std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target");
if (Targets.size() == 0)
throw std::string("ERROR: No 'Target' subclasses defined!");
@@ -121,6 +122,10 @@ CodeGenTarget::CodeGenTarget(RecordKeeper &records)
TargetRec = Targets[0];
}
+CodeGenTarget::~CodeGenTarget() {
+ delete RegBank;
+ delete SchedModels;
+}
const std::string &CodeGenTarget::getName() const {
return TargetRec->getName();
@@ -155,18 +160,18 @@ Record *CodeGenTarget::getAsmParser() const {
/// this target.
///
Record *CodeGenTarget::getAsmParserVariant(unsigned i) const {
- std::vector<Record*> LI =
+ std::vector<Record*> LI =
TargetRec->getValueAsListOfDefs("AssemblyParserVariants");
if (i >= LI.size())
throw "Target does not have an AsmParserVariant #" + utostr(i) + "!";
return LI[i];
}
-/// getAsmParserVariantCount - Return the AssmblyParserVariant definition
+/// getAsmParserVariantCount - Return the AssemblyParserVariant definitions
/// available for this target.
///
unsigned CodeGenTarget::getAsmParserVariantCount() const {
- std::vector<Record*> LI =
+ std::vector<Record*> LI =
TargetRec->getValueAsListOfDefs("AssemblyParserVariants");
return LI.size();
}
@@ -235,6 +240,11 @@ void CodeGenTarget::ReadLegalValueTypes() const {
LegalValueTypes.end());
}
+CodeGenSchedModels &CodeGenTarget::getSchedModels() const {
+ if (!SchedModels)
+ SchedModels = new CodeGenSchedModels(Records, *this);
+ return *SchedModels;
+}
void CodeGenTarget::ReadInstructions() const {
std::vector<Record*> Insts = Records.getAllDerivedDefinitions("Instruction");
@@ -387,6 +397,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
isOverloaded = false;
isCommutative = false;
canThrow = false;
+ isNoReturn = false;
if (DefName.size() <= 4 ||
std::string(DefName.begin(), DefName.begin() + 4) != "int_")
@@ -511,6 +522,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
isCommutative = true;
else if (Property->getName() == "Throws")
canThrow = true;
+ else if (Property->getName() == "IntrNoReturn")
+ isNoReturn = true;
else if (Property->isSubClassOf("NoCapture")) {
unsigned ArgNo = Property->getValueAsInt("ArgNo");
ArgumentAttributes.push_back(std::make_pair(ArgNo, NoCapture));
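
getSchedModels() above follows the same lazily-constructed, mutable-member pattern already used for RegBank: the getter stays const, the object is built on first access, and the new ~CodeGenTarget() reclaims it. Stripped of LLVM specifics (every name in this sketch is a placeholder):

    #include <cstddef>

    class Expensive { /* built on first use */ };

    class Holder {
      mutable Expensive *Cache; // mutable so the getter can stay const
    public:
      Holder() : Cache(NULL) {}
      ~Holder() { delete Cache; } // owner cleans up, as ~CodeGenTarget now does
      Expensive &get() const {
        if (!Cache)
          Cache = new Expensive(); // constructed on first access only
        return *Cache;
      }
    };
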
diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index 85463da..2f8cee4 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h
@@ -26,6 +26,7 @@
namespace llvm {
struct CodeGenRegister;
+class CodeGenSchedModels;
class CodeGenTarget;
// SelectionDAG node properties.
@@ -72,9 +73,12 @@ class CodeGenTarget {
void ReadInstructions() const;
void ReadLegalValueTypes() const;
+ mutable CodeGenSchedModels *SchedModels;
+
mutable std::vector<const CodeGenInstruction*> InstrsByEnum;
public:
CodeGenTarget(RecordKeeper &Records);
+ ~CodeGenTarget();
Record *getTargetRecord() const { return TargetRec; }
const std::string &getName() const;
@@ -96,7 +100,7 @@ public:
///
Record *getAsmParserVariant(unsigned i) const;
- /// getAsmParserVariantCount - Return the AssmblyParserVariant definition
+ /// getAsmParserVariantCount - Return the AssemblyParserVariant definitions
/// available for this target.
///
unsigned getAsmParserVariantCount() const;
@@ -139,6 +143,8 @@ public:
return false;
}
+ CodeGenSchedModels &getSchedModels() const;
+
private:
DenseMap<const Record*, CodeGenInstruction*> &getInstructions() const {
if (Instructions.empty()) ReadInstructions();
diff --git a/utils/TableGen/DAGISelEmitter.cpp b/utils/TableGen/DAGISelEmitter.cpp
index 7db9003..b47dd71 100644
--- a/utils/TableGen/DAGISelEmitter.cpp
+++ b/utils/TableGen/DAGISelEmitter.cpp
@@ -11,12 +11,24 @@
//
//===----------------------------------------------------------------------===//
-#include "DAGISelEmitter.h"
+#include "CodeGenDAGPatterns.h"
#include "DAGISelMatcher.h"
-#include "llvm/TableGen/Record.h"
#include "llvm/Support/Debug.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
using namespace llvm;
+namespace {
+/// DAGISelEmitter - The top-level class which coordinates construction
+/// and emission of the instruction selector.
+class DAGISelEmitter {
+ CodeGenDAGPatterns CGP;
+public:
+ explicit DAGISelEmitter(RecordKeeper &R) : CGP(R) {}
+ void run(raw_ostream &OS);
+};
+} // End anonymous namespace
+
//===----------------------------------------------------------------------===//
// DAGISelEmitter Helper methods
//
@@ -104,11 +116,11 @@ struct PatternSortingPredicate {
return LHS->ID < RHS->ID;
}
};
-}
+} // End anonymous namespace
void DAGISelEmitter::run(raw_ostream &OS) {
- EmitSourceFileHeader("DAG Instruction Selector for the " +
+ emitSourceFileHeader("DAG Instruction Selector for the " +
CGP.getTargetInfo().getName() + " target", OS);
OS << "// *** NOTE: This file is #included into the middle of the target\n"
@@ -153,3 +165,11 @@ void DAGISelEmitter::run(raw_ostream &OS) {
EmitMatcherTable(TheMatcher, CGP, OS);
delete TheMatcher;
}
+
+namespace llvm {
+
+void EmitDAGISel(RecordKeeper &RK, raw_ostream &OS) {
+ DAGISelEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
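
This file is the template for a conversion repeated throughout the patch: the emitter class moves from a public header into an anonymous namespace in its .cpp, the TableGenBackend base class and EmitSourceFileHeader give way to the free emitSourceFileHeader(), and the only external entry point is an EmitXxx function in the llvm namespace. Schematically (WidgetEmitter and EmitWidgets are invented names):

    #include "llvm/Support/raw_ostream.h"
    #include "llvm/TableGen/Record.h"
    #include "llvm/TableGen/TableGenBackend.h"
    using namespace llvm;

    namespace {
    class WidgetEmitter {          // hypothetical backend, local to this file
      RecordKeeper &Records;
    public:
      explicit WidgetEmitter(RecordKeeper &R) : Records(R) {}
      void run(raw_ostream &OS) {
        emitSourceFileHeader("Widget Tables", OS);
        // ... emit tables derived from Records ...
      }
    };
    } // End anonymous namespace

    namespace llvm {
    void EmitWidgets(RecordKeeper &RK, raw_ostream &OS) {
      WidgetEmitter(RK).run(OS);   // the sole externally visible entry point
    }
    } // End llvm namespace
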
diff --git a/utils/TableGen/DAGISelEmitter.h b/utils/TableGen/DAGISelEmitter.h
deleted file mode 100644
index 9c9fe42..0000000
--- a/utils/TableGen/DAGISelEmitter.h
+++ /dev/null
@@ -1,37 +0,0 @@
-//===- DAGISelEmitter.h - Generate an instruction selector ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend emits a DAG instruction selector.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef DAGISEL_EMITTER_H
-#define DAGISEL_EMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include "CodeGenDAGPatterns.h"
-
-namespace llvm {
-
-/// DAGISelEmitter - The top-level class which coordinates construction
-/// and emission of the instruction selector.
-///
-class DAGISelEmitter : public TableGenBackend {
- RecordKeeper &Records;
- CodeGenDAGPatterns CGP;
-public:
- explicit DAGISelEmitter(RecordKeeper &R) : Records(R), CGP(R) {}
-
- // run - Output the isel, returning true on failure.
- void run(raw_ostream &OS);
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index 99ebf98..3ca16f0 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h
@@ -35,7 +35,7 @@ void EmitMatcherTable(const Matcher *Matcher, const CodeGenDAGPatterns &CGP,
raw_ostream &OS);
-/// Matcher - Base class for all the the DAG ISel Matcher representation
+/// Matcher - Base class for all the DAG ISel Matcher representation
/// nodes.
class Matcher {
// The next matcher node that is executed after this one. Null if this is the
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index bd425a9..1445edb 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -526,10 +526,10 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
// Print the result #'s for EmitNode.
if (const EmitNodeMatcher *E = dyn_cast<EmitNodeMatcher>(EN)) {
if (unsigned NumResults = EN->getNumVTs()) {
- OS.PadToColumn(CommentIndent) << "// Results = ";
+ OS.PadToColumn(CommentIndent) << "// Results =";
unsigned First = E->getFirstResultSlot();
for (unsigned i = 0; i != NumResults; ++i)
- OS << "#" << First+i << " ";
+ OS << " #" << First+i;
}
}
OS << '\n';
diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index 2ac7b87..aed222c 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp
@@ -690,6 +690,13 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
bool NodeHasChain = InstPatNode &&
InstPatNode->TreeHasProperty(SDNPHasChain, CGP);
+ // Instructions which load from or store to memory should have a chain,
+ // regardless of whether they happen to have an internal pattern saying so.
+ if (Pattern.getSrcPattern()->TreeHasProperty(SDNPHasChain, CGP)
+ && (II.hasCtrlDep || II.mayLoad || II.mayStore || II.canFoldAsLoad ||
+ II.hasSideEffects))
+ NodeHasChain = true;
+
bool isRoot = N == Pattern.getDstPattern();
// TreeHasOutGlue - True if this tree has glue.
diff --git a/utils/TableGen/DFAPacketizerEmitter.cpp b/utils/TableGen/DFAPacketizerEmitter.cpp
index 4abf54e..8bfecea 100644
--- a/utils/TableGen/DFAPacketizerEmitter.cpp
+++ b/utils/TableGen/DFAPacketizerEmitter.cpp
@@ -15,14 +15,47 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/TableGen/Record.h"
#include "CodeGenTarget.h"
-#include "DFAPacketizerEmitter.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <list>
-
+#include <map>
+#include <string>
using namespace llvm;
//
+// class DFAPacketizerEmitter: class that generates and prints out the DFA
+// for resource tracking.
+//
+namespace {
+class DFAPacketizerEmitter {
+private:
+ std::string TargetName;
+ //
+ // allInsnClasses is the set of all possible resources consumed by an
+ // InstrStage.
+ //
+ DenseSet<unsigned> allInsnClasses;
+ RecordKeeper &Records;
+
+public:
+ DFAPacketizerEmitter(RecordKeeper &R);
+
+ //
+ // collectAllInsnClasses: Populate allInsnClasses, which is the set of units
+ // used in each stage.
+ //
+ void collectAllInsnClasses(const std::string &Name,
+ Record *ItinData,
+ unsigned &NStages,
+ raw_ostream &OS);
+
+ void run(raw_ostream &OS);
+};
+} // End anonymous namespace.
+
+//
//
// State represents the usage of machine resources if the packet contains
// a set of instruction classes.
@@ -61,7 +94,12 @@ class State {
// PossibleStates is the set of valid resource states that ensue from valid
// transitions.
//
- bool canAddInsnClass(unsigned InsnClass, std::set<unsigned> &PossibleStates);
+ bool canAddInsnClass(unsigned InsnClass) const;
+ //
+ // AddInsnClass - Return all combinations of resource reservations
+ // that are possible from this state (PossibleStates).
+ //
+ void AddInsnClass(unsigned InsnClass, std::set<unsigned> &PossibleStates);
};
} // End anonymous namespace.
@@ -87,6 +125,10 @@ namespace {
struct ltState {
bool operator()(const State *s1, const State *s2) const;
};
+
+struct ltTransition {
+ bool operator()(const Transition *s1, const Transition *s2) const;
+};
} // End anonymous namespace.
@@ -102,7 +144,8 @@ public:
std::set<State*, ltState> states;
// Map from a state to the list of transitions with that state as source.
- std::map<State*, SmallVector<Transition*, 16>, ltState> stateTransitions;
+ std::map<State*, std::set<Transition*, ltTransition>, ltState>
+ stateTransitions;
State *currentState;
// Highest valued Input seen.
@@ -160,21 +203,19 @@ bool ltState::operator()(const State *s1, const State *s2) const {
return (s1->stateNum < s2->stateNum);
}
+bool ltTransition::operator()(const Transition *s1, const Transition *s2) const {
+ return (s1->input < s2->input);
+}
//
-// canAddInsnClass - Returns true if an instruction of type InsnClass is a
-// valid transition from this state i.e., can an instruction of type InsnClass
-// be added to the packet represented by this state.
-//
-// PossibleStates is the set of valid resource states that ensue from valid
-// transitions.
+// AddInsnClass - Return all combinations of resource reservations
+// that are possible from this state (PossibleStates).
//
-bool State::canAddInsnClass(unsigned InsnClass,
+void State::AddInsnClass(unsigned InsnClass,
std::set<unsigned> &PossibleStates) {
//
// Iterate over all resource states in currentState.
//
- bool AddedState = false;
for (std::set<unsigned>::iterator SI = stateInfo.begin();
SI != stateInfo.end(); ++SI) {
@@ -207,13 +248,26 @@ bool State::canAddInsnClass(unsigned InsnClass,
(VisitedResourceStates.count(ResultingResourceState) == 0)) {
VisitedResourceStates.insert(ResultingResourceState);
PossibleStates.insert(ResultingResourceState);
- AddedState = true;
}
}
}
}
- return AddedState;
+}
+
+
+//
+// canAddInsnClass - Quickly verifies if an instruction of type InsnClass is a
+// valid transition from this state i.e., can an instruction of type InsnClass
+// be added to the packet represented by this state.
+//
+bool State::canAddInsnClass(unsigned InsnClass) const {
+ for (std::set<unsigned>::const_iterator SI = stateInfo.begin();
+ SI != stateInfo.end(); ++SI) {
+ if (~*SI & InsnClass)
+ return true;
+ }
+ return false;
}
@@ -234,7 +288,9 @@ void DFA::addTransition(Transition *T) {
LargestInput = T->input;
// Add the new transition.
- stateTransitions[T->from].push_back(T);
+ bool Added = stateTransitions[T->from].insert(T).second;
+ assert(Added && "Cannot have multiple states for the same input");
+ (void)Added;
}
@@ -248,11 +304,13 @@ State *DFA::getTransition(State *From, unsigned I) {
return NULL;
// Do we have a transition from state From with Input I?
- for (SmallVector<Transition*, 16>::iterator VI =
- stateTransitions[From].begin();
- VI != stateTransitions[From].end(); ++VI)
- if ((*VI)->input == I)
- return (*VI)->to;
+ Transition TVal(NULL, I, NULL);
+ // Do not count this temporary instance.
+ Transition::currentTransitionNum--;
+ std::set<Transition*, ltTransition>::iterator T =
+ stateTransitions[From].find(&TVal);
+ if (T != stateTransitions[From].end())
+ return (*T)->to;
return NULL;
}
@@ -266,7 +324,7 @@ bool DFA::isValidTransition(State *From, unsigned InsnClass) {
int State::currentStateNum = 0;
int Transition::currentTransitionNum = 0;
-DFAGen::DFAGen(RecordKeeper &R):
+DFAPacketizerEmitter::DFAPacketizerEmitter(RecordKeeper &R):
TargetName(CodeGenTarget(R).getName()),
allInsnClasses(), Records(R) {}
@@ -298,11 +356,12 @@ void DFA::writeTableAndAPI(raw_ostream &OS, const std::string &TargetName) {
StateEntry[i] = ValidTransitions;
for (unsigned j = 0; j <= LargestInput; ++j) {
assert (((*SI)->stateNum == (int) i) && "Mismatch in state numbers");
- if (!isValidTransition(*SI, j))
+ State *To = getTransition(*SI, j);
+ if (To == NULL)
continue;
OS << "{" << j << ", "
- << getTransition(*SI, j)->stateNum
+ << To->stateNum
<< "}, ";
++ValidTransitions;
}
@@ -346,7 +405,7 @@ void DFA::writeTableAndAPI(raw_ostream &OS, const std::string &TargetName) {
// collectAllInsnClasses - Populate allInsnClasses which is a set of units
// used in each stage.
//
-void DFAGen::collectAllInsnClasses(const std::string &Name,
+void DFAPacketizerEmitter::collectAllInsnClasses(const std::string &Name,
Record *ItinData,
unsigned &NStages,
raw_ostream &OS) {
@@ -402,8 +461,7 @@ void DFAGen::collectAllInsnClasses(const std::string &Name,
//
// Run the worklist algorithm to generate the DFA.
//
-void DFAGen::run(raw_ostream &OS) {
- EmitSourceFileHeader("Target DFA Packetizer Tables", OS);
+void DFAPacketizerEmitter::run(raw_ostream &OS) {
// Collect processor iteraries.
std::vector<Record*> ProcItinList =
@@ -482,8 +540,10 @@ void DFAGen::run(raw_ostream &OS) {
// and the state can accommodate this InsnClass, create a transition.
//
if (!D.getTransition(current, InsnClass) &&
- current->canAddInsnClass(InsnClass, NewStateResources)) {
+ current->canAddInsnClass(InsnClass)) {
State *NewState = NULL;
+ current->AddInsnClass(InsnClass, NewStateResources);
+ assert(NewStateResources.size() && "New states must be generated");
//
// If we have seen this state before, then do not create a new state.
@@ -510,3 +570,12 @@ void DFAGen::run(raw_ostream &OS) {
// Print out the table.
D.writeTableAndAPI(OS, TargetName);
}
+
+namespace llvm {
+
+void EmitDFAPacketizer(RecordKeeper &RK, raw_ostream &OS) {
+ emitSourceFileHeader("Target DFA Packetizer Tables", OS);
+ DFAPacketizerEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
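
The split of the old canAddInsnClass() into a cheap query plus AddInsnClass() hinges on one bitmask test: each element of stateInfo is one possible reservation of functional units for the packet, and an instruction class can be added if some reservation still leaves free a unit the class may issue to. A stand-alone sketch of the check, with made-up resource masks:

    #include <cassert>
    #include <set>

    // Each element of StateInfo is a bitmask of functional units already
    // reserved in one possible packing; InsnClass is a bitmask of the
    // units this class of instruction may issue to.
    static bool canAddInsnClass(const std::set<unsigned> &StateInfo,
                                unsigned InsnClass) {
      for (std::set<unsigned>::const_iterator SI = StateInfo.begin();
           SI != StateInfo.end(); ++SI)
        if (~*SI & InsnClass)    // some usable unit is still free
          return true;
      return false;
    }

    int main() {
      std::set<unsigned> State;
      State.insert(0x1);                     // unit 0 reserved in this packing
      assert(canAddInsnClass(State, 0x3));   // class may use unit 0 or 1: OK
      assert(!canAddInsnClass(State, 0x1));  // class needs unit 0 only: full
      return 0;
    }
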
diff --git a/utils/TableGen/DFAPacketizerEmitter.h b/utils/TableGen/DFAPacketizerEmitter.h
deleted file mode 100644
index 1727150..0000000
--- a/utils/TableGen/DFAPacketizerEmitter.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//===- DFAPacketizerEmitter.h - Packetization DFA for a VLIW machine-------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class parses the Schedule.td file and produces an API that can be used
-// to reason about whether an instruction can be added to a packet on a VLIW
-// architecture. The class internally generates a deterministic finite
-// automaton (DFA) that models all possible mappings of machine instructions
-// to functional units as instructions are added to a packet.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/TableGen/TableGenBackend.h"
-#include <map>
-#include <string>
-
-namespace llvm {
-//
-// class DFAGen: class that generates and prints out the DFA for resource
-// tracking.
-//
-class DFAGen : public TableGenBackend {
-private:
- std::string TargetName;
- //
- // allInsnClasses is the set of all possible resources consumed by an
- // InstrStage.
- //
- DenseSet<unsigned> allInsnClasses;
- RecordKeeper &Records;
-
-public:
- DFAGen(RecordKeeper &R);
-
- //
- // collectAllInsnClasses: Populate allInsnClasses which is a set of units
- // used in each stage.
- //
- void collectAllInsnClasses(const std::string &Name,
- Record *ItinData,
- unsigned &NStages,
- raw_ostream &OS);
-
- void run(raw_ostream &OS);
-};
-}
diff --git a/utils/TableGen/DisassemblerEmitter.cpp b/utils/TableGen/DisassemblerEmitter.cpp
index 4650197..826465a 100644
--- a/utils/TableGen/DisassemblerEmitter.cpp
+++ b/utils/TableGen/DisassemblerEmitter.cpp
@@ -7,13 +7,12 @@
//
//===----------------------------------------------------------------------===//
-#include "DisassemblerEmitter.h"
#include "CodeGenTarget.h"
#include "X86DisassemblerTables.h"
#include "X86RecognizableInstr.h"
-#include "FixedLenDecoderEmitter.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
using namespace llvm;
using namespace llvm::X86Disassembler;
@@ -94,26 +93,27 @@ using namespace llvm::X86Disassembler;
/// X86RecognizableInstr.cpp contains the implementation for a single
/// instruction.
-void DisassemblerEmitter::run(raw_ostream &OS) {
- CodeGenTarget Target(Records);
+namespace llvm {
+
+extern void EmitFixedLenDecoder(RecordKeeper &RK, raw_ostream &OS,
+ std::string PredicateNamespace,
+ std::string GPrefix,
+ std::string GPostfix,
+ std::string ROK,
+ std::string RFail,
+ std::string L);
- OS << "/*===- TableGen'erated file "
- << "---------------------------------------*- C -*-===*\n"
- << " *\n"
- << " * " << Target.getName() << " Disassembler\n"
- << " *\n"
- << " * Automatically generated file, do not edit!\n"
- << " *\n"
- << " *===---------------------------------------------------------------"
- << "-------===*/\n";
+void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) {
+ CodeGenTarget Target(Records);
+ emitSourceFileHeader(" * " + Target.getName() + " Disassembler", OS);
// X86 uses a custom disassembler.
if (Target.getName() == "X86") {
DisassemblerTables Tables;
-
+
const std::vector<const CodeGenInstruction*> &numberedInstructions =
Target.getInstructionsByEnumValue();
-
+
for (unsigned i = 0, e = numberedInstructions.size(); i != e; ++i)
RecognizableInstr::processInstr(Tables, *numberedInstructions[i], i);
@@ -130,13 +130,18 @@ void DisassemblerEmitter::run(raw_ostream &OS) {
// ARM and Thumb have a CHECK() macro to deal with DecodeStatuses.
if (Target.getName() == "ARM" ||
Target.getName() == "Thumb") {
- FixedLenDecoderEmitter(Records,
- "ARM",
- "if (!Check(S, ", ")) return MCDisassembler::Fail;",
- "S", "MCDisassembler::Fail",
- " MCDisassembler::DecodeStatus S = MCDisassembler::Success;\n(void)S;").run(OS);
+ EmitFixedLenDecoder(Records, OS, "ARM",
+ "if (!Check(S, ", ")) return MCDisassembler::Fail;",
+ "S", "MCDisassembler::Fail",
+ " MCDisassembler::DecodeStatus S = "
+ "MCDisassembler::Success;\n(void)S;");
return;
}
- FixedLenDecoderEmitter(Records, Target.getName()).run(OS);
+ EmitFixedLenDecoder(Records, OS, Target.getName(),
+ "if (", " == MCDisassembler::Fail)"
+ " return MCDisassembler::Fail;",
+ "MCDisassembler::Success", "MCDisassembler::Fail", "");
}
+
+} // End llvm namespace
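
The string parameters threaded through EmitFixedLenDecoder() are simply the text pasted around each generated decode call. For a hypothetical generated hook DecodeFoo, the two call sites above wrap it roughly as follows (illustrative output, not copied from a real decoder):

    // ARM/Thumb variant: Check() accumulates into a DecodeStatus S.
    if (!Check(S, DecodeFoo(MI, tmp, Address, Decoder)))
      return MCDisassembler::Fail;

    // Default variant for other targets:
    if (DecodeFoo(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
      return MCDisassembler::Fail;
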
diff --git a/utils/TableGen/DisassemblerEmitter.h b/utils/TableGen/DisassemblerEmitter.h
deleted file mode 100644
index 63ee552..0000000
--- a/utils/TableGen/DisassemblerEmitter.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//===- DisassemblerEmitter.h - Disassembler Generator -----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef DISASSEMBLEREMITTER_H
-#define DISASSEMBLEREMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-
-namespace llvm {
-
- class DisassemblerEmitter : public TableGenBackend {
- RecordKeeper &Records;
- public:
- DisassemblerEmitter(RecordKeeper &R) : Records(R) {}
-
- /// run - Output the disassembler.
- void run(raw_ostream &o);
- };
-
-} // end llvm namespace
-
-#endif
diff --git a/utils/TableGen/EDEmitter.cpp b/utils/TableGen/EDEmitter.cpp
index fe484ca..0c8b28d 100644
--- a/utils/TableGen/EDEmitter.cpp
+++ b/utils/TableGen/EDEmitter.cpp
@@ -13,192 +13,199 @@
//
//===----------------------------------------------------------------------===//
-#include "EDEmitter.h"
-
#include "AsmWriterInst.h"
#include "CodeGenTarget.h"
-
-#include "llvm/TableGen/Record.h"
#include "llvm/MC/EDInstInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <string>
#include <vector>
using namespace llvm;
+// TODO: There's a suspiciously large amount of "table" data in this
+// backend which should probably be in the TableGen file itself.
+
///////////////////////////////////////////////////////////
// Support classes for emitting nested C data structures //
///////////////////////////////////////////////////////////
-namespace {
+// TODO: These classes are probably generally useful to other backends;
+// add them to TableGen's "helper" APIs.
- class EnumEmitter {
- private:
- std::string Name;
- std::vector<std::string> Entries;
- public:
- EnumEmitter(const char *N) : Name(N) {
- }
- int addEntry(const char *e) {
- Entries.push_back(std::string(e));
- return Entries.size() - 1;
+namespace {
+class EnumEmitter {
+private:
+ std::string Name;
+ std::vector<std::string> Entries;
+public:
+ EnumEmitter(const char *N) : Name(N) {
+ }
+ int addEntry(const char *e) {
+ Entries.push_back(std::string(e));
+ return Entries.size() - 1;
+ }
+ void emit(raw_ostream &o, unsigned int &i) {
+ o.indent(i) << "enum " << Name.c_str() << " {" << "\n";
+ i += 2;
+
+ unsigned int index = 0;
+ unsigned int numEntries = Entries.size();
+ for (index = 0; index < numEntries; ++index) {
+ o.indent(i) << Entries[index];
+ if (index < (numEntries - 1))
+ o << ",";
+ o << "\n";
}
- void emit(raw_ostream &o, unsigned int &i) {
- o.indent(i) << "enum " << Name.c_str() << " {" << "\n";
- i += 2;
-
- unsigned int index = 0;
- unsigned int numEntries = Entries.size();
- for (index = 0; index < numEntries; ++index) {
- o.indent(i) << Entries[index];
- if (index < (numEntries - 1))
- o << ",";
- o << "\n";
- }
- i -= 2;
- o.indent(i) << "};" << "\n";
+ i -= 2;
+ o.indent(i) << "};" << "\n";
+ }
+
+ void emitAsFlags(raw_ostream &o, unsigned int &i) {
+ o.indent(i) << "enum " << Name.c_str() << " {" << "\n";
+ i += 2;
+
+ unsigned int index = 0;
+ unsigned int numEntries = Entries.size();
+ unsigned int flag = 1;
+ for (index = 0; index < numEntries; ++index) {
+ o.indent(i) << Entries[index] << " = " << format("0x%x", flag);
+ if (index < (numEntries - 1))
+ o << ",";
+ o << "\n";
+ flag <<= 1;
}
- void emitAsFlags(raw_ostream &o, unsigned int &i) {
- o.indent(i) << "enum " << Name.c_str() << " {" << "\n";
- i += 2;
-
- unsigned int index = 0;
- unsigned int numEntries = Entries.size();
- unsigned int flag = 1;
- for (index = 0; index < numEntries; ++index) {
- o.indent(i) << Entries[index] << " = " << format("0x%x", flag);
- if (index < (numEntries - 1))
- o << ",";
- o << "\n";
- flag <<= 1;
- }
+ i -= 2;
+ o.indent(i) << "};" << "\n";
+ }
+};
+} // End anonymous namespace
- i -= 2;
- o.indent(i) << "};" << "\n";
- }
- };
+namespace {
+class ConstantEmitter {
+public:
+ virtual ~ConstantEmitter() { }
+ virtual void emit(raw_ostream &o, unsigned int &i) = 0;
+};
+} // End anonymous namespace
- class ConstantEmitter {
- public:
- virtual ~ConstantEmitter() { }
- virtual void emit(raw_ostream &o, unsigned int &i) = 0;
+namespace {
+class LiteralConstantEmitter : public ConstantEmitter {
+private:
+ bool IsNumber;
+ union {
+ int Number;
+ const char* String;
};
+public:
+ LiteralConstantEmitter(int number = 0) :
+ IsNumber(true),
+ Number(number) {
+ }
+ void set(const char *string) {
+ IsNumber = false;
+ Number = 0;
+ String = string;
+ }
+ bool is(const char *string) {
+ return !strcmp(String, string);
+ }
+ void emit(raw_ostream &o, unsigned int &i) {
+ if (IsNumber)
+ o << Number;
+ else
+ o << String;
+ }
+};
+} // End anonymous namespace
- class LiteralConstantEmitter : public ConstantEmitter {
- private:
- bool IsNumber;
- union {
- int Number;
- const char* String;
- };
- public:
- LiteralConstantEmitter(int number = 0) :
- IsNumber(true),
- Number(number) {
- }
- void set(const char *string) {
- IsNumber = false;
- Number = 0;
- String = string;
- }
- bool is(const char *string) {
- return !strcmp(String, string);
- }
- void emit(raw_ostream &o, unsigned int &i) {
- if (IsNumber)
- o << Number;
- else
- o << String;
- }
- };
+namespace {
+class CompoundConstantEmitter : public ConstantEmitter {
+private:
+ unsigned int Padding;
+ std::vector<ConstantEmitter *> Entries;
+public:
+ CompoundConstantEmitter(unsigned int padding = 0) : Padding(padding) {
+ }
+ CompoundConstantEmitter &addEntry(ConstantEmitter *e) {
+ Entries.push_back(e);
- class CompoundConstantEmitter : public ConstantEmitter {
- private:
- unsigned int Padding;
- std::vector<ConstantEmitter *> Entries;
- public:
- CompoundConstantEmitter(unsigned int padding = 0) : Padding(padding) {
+ return *this;
+ }
+ ~CompoundConstantEmitter() {
+ while (Entries.size()) {
+ ConstantEmitter *entry = Entries.back();
+ Entries.pop_back();
+ delete entry;
}
- CompoundConstantEmitter &addEntry(ConstantEmitter *e) {
- Entries.push_back(e);
+ }
+ void emit(raw_ostream &o, unsigned int &i) {
+ o << "{" << "\n";
+ i += 2;
- return *this;
- }
- ~CompoundConstantEmitter() {
- while (Entries.size()) {
- ConstantEmitter *entry = Entries.back();
- Entries.pop_back();
- delete entry;
- }
- }
- void emit(raw_ostream &o, unsigned int &i) {
- o << "{" << "\n";
- i += 2;
-
- unsigned int index;
- unsigned int numEntries = Entries.size();
-
- unsigned int numToPrint;
-
- if (Padding) {
- if (numEntries > Padding) {
- fprintf(stderr, "%u entries but %u padding\n", numEntries, Padding);
- llvm_unreachable("More entries than padding");
- }
- numToPrint = Padding;
- } else {
- numToPrint = numEntries;
- }
+ unsigned int index;
+ unsigned int numEntries = Entries.size();
- for (index = 0; index < numToPrint; ++index) {
- o.indent(i);
- if (index < numEntries)
- Entries[index]->emit(o, i);
- else
- o << "-1";
+ unsigned int numToPrint;
- if (index < (numToPrint - 1))
- o << ",";
- o << "\n";
+ if (Padding) {
+ if (numEntries > Padding) {
+ fprintf(stderr, "%u entries but %u padding\n", numEntries, Padding);
+ llvm_unreachable("More entries than padding");
}
-
- i -= 2;
- o.indent(i) << "}";
+ numToPrint = Padding;
+ } else {
+ numToPrint = numEntries;
}
- };
- class FlagsConstantEmitter : public ConstantEmitter {
- private:
- std::vector<std::string> Flags;
- public:
- FlagsConstantEmitter() {
- }
- FlagsConstantEmitter &addEntry(const char *f) {
- Flags.push_back(std::string(f));
- return *this;
- }
- void emit(raw_ostream &o, unsigned int &i) {
- unsigned int index;
- unsigned int numFlags = Flags.size();
- if (numFlags == 0)
- o << "0";
-
- for (index = 0; index < numFlags; ++index) {
- o << Flags[index].c_str();
- if (index < (numFlags - 1))
- o << " | ";
- }
+ for (index = 0; index < numToPrint; ++index) {
+ o.indent(i);
+ if (index < numEntries)
+ Entries[index]->emit(o, i);
+ else
+ o << "-1";
+
+ if (index < (numToPrint - 1))
+ o << ",";
+ o << "\n";
}
- };
-}
-EDEmitter::EDEmitter(RecordKeeper &R) : Records(R) {
-}
+ i -= 2;
+ o.indent(i) << "}";
+ }
+};
+} // End anonymous namespace
+
+namespace {
+class FlagsConstantEmitter : public ConstantEmitter {
+private:
+ std::vector<std::string> Flags;
+public:
+ FlagsConstantEmitter() {
+ }
+ FlagsConstantEmitter &addEntry(const char *f) {
+ Flags.push_back(std::string(f));
+ return *this;
+ }
+ void emit(raw_ostream &o, unsigned int &i) {
+ unsigned int index;
+ unsigned int numFlags = Flags.size();
+ if (numFlags == 0)
+ o << "0";
+
+ for (index = 0; index < numFlags; ++index) {
+ o << Flags[index].c_str();
+ if (index < (numFlags - 1))
+ o << " | ";
+ }
+ }
+};
+} // End anonymous namespace
/// populateOperandOrder - Accepts a CodeGenInstruction and generates its
/// AsmWriterInst for the desired assembly syntax, giving an ordered list of
@@ -213,9 +220,9 @@ EDEmitter::EDEmitter(RecordKeeper &R) : Records(R) {
/// representing an index in the operand descriptor array.
/// @arg inst - The instruction to use when looking up the operands
/// @arg syntax - The syntax to use, according to LLVM's enumeration
-void populateOperandOrder(CompoundConstantEmitter *operandOrder,
- const CodeGenInstruction &inst,
- unsigned syntax) {
+static void populateOperandOrder(CompoundConstantEmitter *operandOrder,
+ const CodeGenInstruction &inst,
+ unsigned syntax) {
unsigned int numArgs = 0;
AsmWriterInst awInst(inst, syntax, -1, -1);
@@ -310,6 +317,11 @@ static int X86TypeFromOpName(LiteralConstantEmitter *type,
MEM("f128mem");
MEM("f256mem");
MEM("opaque512mem");
+ // Gather
+ MEM("vx32mem")
+ MEM("vy32mem")
+ MEM("vx64mem")
+ MEM("vy64mem")
// all R, I, R, I
LEA("lea32mem");
@@ -975,17 +987,23 @@ static void emitCommonEnums(raw_ostream &o, unsigned int &i) {
o << "\n";
}
-void EDEmitter::run(raw_ostream &o) {
+namespace llvm {
+
+void EmitEnhancedDisassemblerInfo(RecordKeeper &RK, raw_ostream &OS) {
+ emitSourceFileHeader("Enhanced Disassembler Info", OS);
unsigned int i = 0;
CompoundConstantEmitter infoArray;
- CodeGenTarget target(Records);
+ CodeGenTarget target(RK);
populateInstInfo(infoArray, target);
- emitCommonEnums(o, i);
+ emitCommonEnums(OS, i);
- o << "static const llvm::EDInstInfo instInfo" << target.getName() << "[] = ";
- infoArray.emit(o, i);
- o << ";" << "\n";
+ OS << "static const llvm::EDInstInfo instInfo"
+ << target.getName() << "[] = ";
+ infoArray.emit(OS, i);
+ OS << ";" << "\n";
}
+
+} // End llvm namespace
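
As a concrete picture of what EnumEmitter::emitAsFlags() above produces, three entries yield successive single-bit values via flag <<= 1 (the enum and entry names here are invented):

    enum HypotheticalFlags {
      kFlagA = 0x1,
      kFlagB = 0x2,
      kFlagC = 0x4
    };
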
diff --git a/utils/TableGen/EDEmitter.h b/utils/TableGen/EDEmitter.h
deleted file mode 100644
index f268375..0000000
--- a/utils/TableGen/EDEmitter.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===- EDEmitter.h - Generate instruction descriptions for ED ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend is responsible for emitting a description of each
-// instruction in a format that the semantic disassembler can use to tokenize
-// and parse instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SEMANTIC_INFO_EMITTER_H
-#define SEMANTIC_INFO_EMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-
-namespace llvm {
-
- class EDEmitter : public TableGenBackend {
- RecordKeeper &Records;
- public:
- EDEmitter(RecordKeeper &R);
-
- // run - Output the instruction table.
- void run(raw_ostream &o);
- };
-
-} // End llvm namespace
-
-#endif
diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp
index e8dad77..ca784d0 100644
--- a/utils/TableGen/FastISelEmitter.cpp
+++ b/utils/TableGen/FastISelEmitter.cpp
@@ -17,28 +17,31 @@
//
//===----------------------------------------------------------------------===//
-#include "FastISelEmitter.h"
-#include "llvm/TableGen/Error.h"
-#include "llvm/TableGen/Record.h"
+#include "CodeGenDAGPatterns.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
using namespace llvm;
-namespace {
/// InstructionMemo - This class holds additional information about an
/// instruction needed to emit code for it.
///
+namespace {
struct InstructionMemo {
std::string Name;
const CodeGenRegisterClass *RC;
std::string SubRegNo;
std::vector<std::string>* PhysRegs;
};
-
+} // End anonymous namespace
+
/// ImmPredicateSet - This uniques predicates (represented as a string) and
/// gives them unique (small) integer ID's that start at 0.
+namespace {
class ImmPredicateSet {
DenseMap<TreePattern *, unsigned> ImmIDs;
std::vector<TreePredicateFn> PredsByName;
@@ -63,10 +66,12 @@ public:
iterator end() const { return PredsByName.end(); }
};
+} // End anonymous namespace
/// OperandsSignature - This class holds a description of a list of operand
/// types. It has utility methods for emitting text based on the operands.
///
+namespace {
struct OperandsSignature {
class OpKind {
enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 };
@@ -352,7 +357,9 @@ struct OperandsSignature {
Operands[i].printManglingSuffix(OS, ImmPredicates, StripImmCodes);
}
};
+} // End anonymous namespace
+namespace {
class FastISelMap {
typedef std::map<std::string, InstructionMemo> PredMap;
typedef std::map<MVT::SimpleValueType, PredMap> RetPredMap;
@@ -375,8 +382,7 @@ public:
void printImmediatePredicates(raw_ostream &OS);
void printFunctionDefinitions(raw_ostream &OS);
};
-
-}
+} // End anonymous namespace
static std::string getOpcodeName(Record *Op, CodeGenDAGPatterns &CGP) {
return CGP.getSDNodeInfo(Op).getEnumName();
@@ -639,7 +645,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
ImmediatePredicates, true);
OS << "(" << InstNS << Memo.Name << ", ";
- OS << InstNS << Memo.RC->getName() << "RegisterClass";
+ OS << "&" << InstNS << Memo.RC->getName() << "RegClass";
if (!Operands.empty())
OS << ", ";
Operands.PrintArguments(OS, *Memo.PhysRegs);
@@ -731,7 +737,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
ImmediatePredicates, true);
OS << "(" << InstNS << Memo.Name << ", ";
- OS << InstNS << Memo.RC->getName() << "RegisterClass";
+ OS << "&" << InstNS << Memo.RC->getName() << "RegClass";
if (!Operands.empty())
OS << ", ";
Operands.PrintArguments(OS, *Memo.PhysRegs);
@@ -850,23 +856,22 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
// TODO: SignaturesWithConstantForms should be empty here.
}
-void FastISelEmitter::run(raw_ostream &OS) {
+namespace llvm {
+
+void EmitFastISel(RecordKeeper &RK, raw_ostream &OS) {
+ CodeGenDAGPatterns CGP(RK);
const CodeGenTarget &Target = CGP.getTargetInfo();
+ emitSourceFileHeader("\"Fast\" Instruction Selector for the " +
+ Target.getName() + " target", OS);
// Determine the target's namespace name.
std::string InstNS = Target.getInstNamespace() + "::";
assert(InstNS.size() > 2 && "Can't determine target-specific namespace!");
- EmitSourceFileHeader("\"Fast\" Instruction Selector for the " +
- Target.getName() + " target", OS);
-
FastISelMap F(InstNS);
F.collectPatterns(CGP);
F.printImmediatePredicates(OS);
F.printFunctionDefinitions(OS);
}
-FastISelEmitter::FastISelEmitter(RecordKeeper &R)
- : Records(R), CGP(R) {
-}
-
+} // End llvm namespace
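
The two RegClass hunks above change the generated FastISel code from naming the old TargetRegisterClass pointer global to taking the address of the class object itself. For an assumed X86 pattern, a generated call changes roughly like this (the callee and operand names are illustrative, not taken from real output):

    // Before: the (removed) pointer global.
    FastEmitInst_rr(X86::ADD32rr, X86::GR32RegisterClass, Op0, Op0IsKill,
                    Op1, Op1IsKill);
    // After: the address of the register class.
    FastEmitInst_rr(X86::ADD32rr, &X86::GR32RegClass, Op0, Op0IsKill,
                    Op1, Op1IsKill);
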
diff --git a/utils/TableGen/FastISelEmitter.h b/utils/TableGen/FastISelEmitter.h
deleted file mode 100644
index 4f75ac1..0000000
--- a/utils/TableGen/FastISelEmitter.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===- FastISelEmitter.h - Generate an instruction selector -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend emits a "fast" instruction selector.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef FASTISEL_EMITTER_H
-#define FASTISEL_EMITTER_H
-
-#include "CodeGenDAGPatterns.h"
-#include "llvm/TableGen/TableGenBackend.h"
-
-namespace llvm {
-
-class CodeGenTarget;
-
-/// FastISelEmitter - The top-level class which coordinates construction
-/// and emission of the instruction selector.
-///
-class FastISelEmitter : public TableGenBackend {
- RecordKeeper &Records;
- CodeGenDAGPatterns CGP;
-public:
- explicit FastISelEmitter(RecordKeeper &R);
-
- // run - Output the isel, returning true on failure.
- void run(raw_ostream &OS);
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 9b676f2..2cdde55 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -14,13 +14,14 @@
#define DEBUG_TYPE "decoder-emitter"
-#include "FixedLenDecoderEmitter.h"
#include "CodeGenTarget.h"
#include "llvm/TableGen/Record.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <vector>
#include <map>
@@ -28,6 +29,67 @@
using namespace llvm;
+namespace {
+struct EncodingField {
+ unsigned Base, Width, Offset;
+ EncodingField(unsigned B, unsigned W, unsigned O)
+ : Base(B), Width(W), Offset(O) { }
+};
+} // End anonymous namespace
+
+namespace {
+struct OperandInfo {
+ std::vector<EncodingField> Fields;
+ std::string Decoder;
+
+ OperandInfo(std::string D)
+ : Decoder(D) { }
+
+ void addField(unsigned Base, unsigned Width, unsigned Offset) {
+ Fields.push_back(EncodingField(Base, Width, Offset));
+ }
+
+ unsigned numFields() const { return Fields.size(); }
+
+ typedef std::vector<EncodingField>::const_iterator const_iterator;
+
+ const_iterator begin() const { return Fields.begin(); }
+ const_iterator end() const { return Fields.end(); }
+};
+} // End anonymous namespace
+
+namespace {
+class FixedLenDecoderEmitter {
+public:
+
+ // Defaults preserved here for documentation, even though they aren't
+ // strictly necessary given the way that this is currently being called.
+ FixedLenDecoderEmitter(RecordKeeper &R,
+ std::string PredicateNamespace,
+ std::string GPrefix = "if (",
+ std::string GPostfix = " == MCDisassembler::Fail)"
+ " return MCDisassembler::Fail;",
+ std::string ROK = "MCDisassembler::Success",
+ std::string RFail = "MCDisassembler::Fail",
+ std::string L = "") :
+ Target(R),
+ PredicateNamespace(PredicateNamespace),
+ GuardPrefix(GPrefix), GuardPostfix(GPostfix),
+ ReturnOK(ROK), ReturnFail(RFail), Locals(L) {}
+
+ // run - Output the code emitter
+ void run(raw_ostream &o);
+
+private:
+ CodeGenTarget Target;
+public:
+ std::string PredicateNamespace;
+ std::string GuardPrefix, GuardPostfix;
+ std::string ReturnOK, ReturnFail;
+ std::string Locals;
+};
+} // End anonymous namespace
+
// The set (BIT_TRUE, BIT_FALSE, BIT_UNSET) represents a ternary logic system
// for a bit value.
//
@@ -83,7 +145,9 @@ static BitsInit &getBitsField(const Record &def, const char *str) {
}
// Forward declaration.
+namespace {
class FilterChooser;
+} // End anonymous namespace
// Representation of the instruction to work on.
typedef std::vector<bit_value_t> insn_t;
@@ -124,6 +188,7 @@ typedef std::vector<bit_value_t> insn_t;
/// decoder could try to decode the even/odd register numbering and assign to
/// VST4q8a or VST4q8b, but for the time being, the decoder chooses the "a"
/// version and return the Opcode since the two have the same Asm format string.
+namespace {
class Filter {
protected:
const FilterChooser *Owner;// points to the FilterChooser who owns this filter
@@ -180,6 +245,7 @@ public:
// the filter distinguishes more categories of instructions.
unsigned usefulness() const;
}; // End of class Filter
+} // End anonymous namespace
// These are states of our finite state machines used in FilterChooser's
// filterProcessor() which produces the filter candidates to use.
@@ -206,6 +272,7 @@ typedef enum {
/// It is useful to think of a Filter as governing the switch stmts of the
/// decoding tree. And each case is delegated to an inferior FilterChooser to
/// decide what further remaining bits to look at.
+namespace {
class FilterChooser {
protected:
friend class Filter;
@@ -385,6 +452,7 @@ protected:
// the instruction at this level or the instruction is not decodeable.
bool emit(raw_ostream &o, unsigned &Indentation) const;
};
+} // End anonymous namespace
///////////////////////////
// //
@@ -1573,3 +1641,18 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) {
o << "\n} // End llvm namespace \n";
}
+
+namespace llvm {
+
+void EmitFixedLenDecoder(RecordKeeper &RK, raw_ostream &OS,
+ std::string PredicateNamespace,
+ std::string GPrefix,
+ std::string GPostfix,
+ std::string ROK,
+ std::string RFail,
+ std::string L) {
+ FixedLenDecoderEmitter(RK, PredicateNamespace, GPrefix, GPostfix,
+ ROK, RFail, L).run(OS);
+}
+
+} // End llvm namespace
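
An OperandInfo above is a list of (Base, Width, Offset) triples describing where an operand's bits live in the instruction word; decoding an operand is then the obvious gather. The sketch below mirrors what the emitted decoder does, but the function itself is illustrative and assumes Width < 64:

    #include <cassert>
    #include <stdint.h>
    #include <vector>

    struct EncodingField {
      unsigned Base, Width, Offset;
      EncodingField(unsigned B, unsigned W, unsigned O)
        : Base(B), Width(W), Offset(O) {}
    };

    // Extract each field from the instruction word and splice it into the
    // operand value at its Offset.
    static uint64_t decodeOperand(uint32_t Insn,
                                  const std::vector<EncodingField> &Fields) {
      uint64_t Val = 0;
      for (unsigned i = 0, e = Fields.size(); i != e; ++i) {
        const EncodingField &F = Fields[i];
        uint64_t Bits = (Insn >> F.Base) & ((1ULL << F.Width) - 1);
        Val |= Bits << F.Offset;
      }
      return Val;
    }

    int main() {
      // A 4-bit operand split across instruction bits [3:2] and [19:18].
      std::vector<EncodingField> Fields;
      Fields.push_back(EncodingField(2, 2, 0));   // low two bits
      Fields.push_back(EncodingField(18, 2, 2));  // high two bits
      assert(decodeOperand(0x000C000Cu, Fields) == 0xF);
      return 0;
    }
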
diff --git a/utils/TableGen/FixedLenDecoderEmitter.h b/utils/TableGen/FixedLenDecoderEmitter.h
deleted file mode 100644
index 195297c..0000000
--- a/utils/TableGen/FixedLenDecoderEmitter.h
+++ /dev/null
@@ -1,79 +0,0 @@
-//===------------ FixedLenDecoderEmitter.h - Decoder Generator --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// It contains the tablegen backend that emits the decoder functions for
-// targets with fixed length instruction set.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef FixedLenDECODEREMITTER_H
-#define FixedLenDECODEREMITTER_H
-
-#include "CodeGenTarget.h"
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-
-struct EncodingField {
- unsigned Base, Width, Offset;
- EncodingField(unsigned B, unsigned W, unsigned O)
- : Base(B), Width(W), Offset(O) { }
-};
-
-struct OperandInfo {
- std::vector<EncodingField> Fields;
- std::string Decoder;
-
- OperandInfo(std::string D)
- : Decoder(D) { }
-
- void addField(unsigned Base, unsigned Width, unsigned Offset) {
- Fields.push_back(EncodingField(Base, Width, Offset));
- }
-
- unsigned numFields() const { return Fields.size(); }
-
- typedef std::vector<EncodingField>::const_iterator const_iterator;
-
- const_iterator begin() const { return Fields.begin(); }
- const_iterator end() const { return Fields.end(); }
-};
-
-class FixedLenDecoderEmitter : public TableGenBackend {
-public:
- FixedLenDecoderEmitter(RecordKeeper &R,
- std::string PredicateNamespace,
- std::string GPrefix = "if (",
- std::string GPostfix = " == MCDisassembler::Fail)"
- " return MCDisassembler::Fail;",
- std::string ROK = "MCDisassembler::Success",
- std::string RFail = "MCDisassembler::Fail",
- std::string L = "") :
- Target(R),
- PredicateNamespace(PredicateNamespace),
- GuardPrefix(GPrefix), GuardPostfix(GPostfix),
- ReturnOK(ROK), ReturnFail(RFail), Locals(L) {}
-
- // run - Output the code emitter
- void run(raw_ostream &o);
-
-private:
- CodeGenTarget Target;
-public:
- std::string PredicateNamespace;
- std::string GuardPrefix, GuardPostfix;
- std::string ReturnOK, ReturnFail;
- std::string Locals;
-};
-
-} // end llvm namespace
-
-#endif
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index 8b3efd3..3adb869 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -12,15 +12,49 @@
//
//===----------------------------------------------------------------------===//
-#include "InstrInfoEmitter.h"
+
+#include "CodeGenDAGPatterns.h"
+#include "CodeGenSchedule.h"
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
-#include "llvm/TableGen/Record.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
#include <cstdio>
+#include <map>
+#include <vector>
using namespace llvm;
+namespace {
+class InstrInfoEmitter {
+ RecordKeeper &Records;
+ CodeGenDAGPatterns CDP;
+ const CodeGenSchedModels &SchedModels;
+
+public:
+ InstrInfoEmitter(RecordKeeper &R):
+ Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {}
+
+ // run - Output the instruction set description.
+ void run(raw_ostream &OS);
+
+private:
+ void emitEnums(raw_ostream &OS);
+
+ typedef std::map<std::vector<std::string>, unsigned> OperandInfoMapTy;
+ void emitRecord(const CodeGenInstruction &Inst, unsigned Num,
+ Record *InstrInfo,
+ std::map<std::vector<Record*>, unsigned> &EL,
+ const OperandInfoMapTy &OpInfo,
+ raw_ostream &OS);
+
+ // Operand information.
+ void EmitOperandInfo(raw_ostream &OS, OperandInfoMapTy &OperandInfoIDs);
+ std::vector<std::string> GetOperandInfo(const CodeGenInstruction &Inst);
+};
+} // End anonymous namespace
+
static void PrintDefList(const std::vector<Record*> &Uses,
unsigned Num, raw_ostream &OS) {
OS << "static const uint16_t ImplicitList" << Num << "[] = { ";
@@ -30,23 +64,6 @@ static void PrintDefList(const std::vector<Record*> &Uses,
}
//===----------------------------------------------------------------------===//
-// Instruction Itinerary Information.
-//===----------------------------------------------------------------------===//
-
-void InstrInfoEmitter::GatherItinClasses() {
- std::vector<Record*> DefList =
- Records.getAllDerivedDefinitions("InstrItinClass");
- std::sort(DefList.begin(), DefList.end(), LessRecord());
-
- for (unsigned i = 0, N = DefList.size(); i < N; i++)
- ItinClassMap[DefList[i]->getName()] = i;
-}
-
-unsigned InstrInfoEmitter::getItinClassNumber(const Record *InstRec) {
- return ItinClassMap[InstRec->getValueAsDef("Itinerary")->getName()];
-}
-
-//===----------------------------------------------------------------------===//
// Operand Info Emission.
//===----------------------------------------------------------------------===//
@@ -163,11 +180,10 @@ void InstrInfoEmitter::EmitOperandInfo(raw_ostream &OS,
// run - Emit the main instruction description records for the target...
void InstrInfoEmitter::run(raw_ostream &OS) {
+ emitSourceFileHeader("Target Instruction Enum Values", OS);
emitEnums(OS);
- GatherItinClasses();
-
- EmitSourceFileHeader("Target Instruction Descriptors", OS);
+ emitSourceFileHeader("Target Instruction Descriptors", OS);
OS << "\n#ifdef GET_INSTRINFO_MC_DESC\n";
OS << "#undef GET_INSTRINFO_MC_DESC\n";
@@ -288,10 +304,11 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
MinOperands = Inst.Operands.back().MIOperandNo +
Inst.Operands.back().MINumOperands;
+ Record *ItinDef = Inst.TheDef->getValueAsDef("Itinerary");
OS << " { ";
OS << Num << ",\t" << MinOperands << ",\t"
<< Inst.Operands.NumDefs << ",\t"
- << getItinClassNumber(Inst.TheDef) << ",\t"
+ << SchedModels.getItinClassIdx(ItinDef) << ",\t"
<< Inst.TheDef->getValueAsInt("Size") << ",\t0";
// Emit all of the target independent flags...
@@ -362,7 +379,6 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
// emitEnums - Print out enum values for all of the instructions.
void InstrInfoEmitter::emitEnums(raw_ostream &OS) {
- EmitSourceFileHeader("Target Instruction Enum Values", OS);
OS << "\n#ifdef GET_INSTRINFO_ENUM\n";
OS << "#undef GET_INSTRINFO_ENUM\n";
@@ -394,3 +410,11 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) {
OS << "#endif // GET_INSTRINFO_ENUM\n\n";
}
+
+namespace llvm {
+
+void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS) {
+ InstrInfoEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/InstrInfoEmitter.h b/utils/TableGen/InstrInfoEmitter.h
deleted file mode 100644
index f8d3ea5..0000000
--- a/utils/TableGen/InstrInfoEmitter.h
+++ /dev/null
@@ -1,62 +0,0 @@
-//===- InstrInfoEmitter.h - Generate a Instruction Set Desc. ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend is responsible for emitting a description of the target
-// instruction set for the code generator.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef INSTRINFO_EMITTER_H
-#define INSTRINFO_EMITTER_H
-
-#include "CodeGenDAGPatterns.h"
-#include "llvm/TableGen/TableGenBackend.h"
-#include <vector>
-#include <map>
-
-namespace llvm {
-
-class StringInit;
-class IntInit;
-class ListInit;
-class CodeGenInstruction;
-
-class InstrInfoEmitter : public TableGenBackend {
- RecordKeeper &Records;
- CodeGenDAGPatterns CDP;
- std::map<std::string, unsigned> ItinClassMap;
-
-public:
- InstrInfoEmitter(RecordKeeper &R) : Records(R), CDP(R) { }
-
- // run - Output the instruction set description.
- void run(raw_ostream &OS);
-
-private:
- void emitEnums(raw_ostream &OS);
-
- typedef std::map<std::vector<std::string>, unsigned> OperandInfoMapTy;
- void emitRecord(const CodeGenInstruction &Inst, unsigned Num,
- Record *InstrInfo,
- std::map<std::vector<Record*>, unsigned> &EL,
- const OperandInfoMapTy &OpInfo,
- raw_ostream &OS);
-
- // Itinerary information.
- void GatherItinClasses();
- unsigned getItinClassNumber(const Record *InstRec);
-
- // Operand information.
- void EmitOperandInfo(raw_ostream &OS, OperandInfoMapTy &OperandInfoIDs);
- std::vector<std::string> GetOperandInfo(const CodeGenInstruction &Inst);
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 8e1bae8..155d1ab 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -11,23 +11,62 @@
//
//===----------------------------------------------------------------------===//
+#include "CodeGenIntrinsics.h"
#include "CodeGenTarget.h"
-#include "IntrinsicEmitter.h"
-#include "StringMatcher.h"
-#include "llvm/TableGen/Record.h"
+#include "SequenceToOffsetTable.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/StringMatcher.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
using namespace llvm;
+namespace {
+class IntrinsicEmitter {
+ RecordKeeper &Records;
+ bool TargetOnly;
+ std::string TargetPrefix;
+
+public:
+ IntrinsicEmitter(RecordKeeper &R, bool T)
+ : Records(R), TargetOnly(T) {}
+
+ void run(raw_ostream &OS);
+
+ void EmitPrefix(raw_ostream &OS);
+
+ void EmitEnumInfo(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+
+ void EmitFnNameRecognizer(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitIntrinsicToNameTable(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitIntrinsicToOverloadTable(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitVerifier(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitModRefBehavior(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
+ void EmitSuffix(raw_ostream &OS);
+};
+} // End anonymous namespace
+
//===----------------------------------------------------------------------===//
// IntrinsicEmitter Implementation
//===----------------------------------------------------------------------===//
void IntrinsicEmitter::run(raw_ostream &OS) {
- EmitSourceFileHeader("Intrinsic Function Source Fragment", OS);
-
+ emitSourceFileHeader("Intrinsic Function Source Fragment", OS);
+
std::vector<CodeGenIntrinsic> Ints = LoadIntrinsics(Records, TargetOnly);
-
+
if (TargetOnly && !Ints.empty())
TargetPrefix = Ints[0].TargetPrefix;
@@ -45,9 +84,6 @@ void IntrinsicEmitter::run(raw_ostream &OS) {
// Emit the function name recognizer.
EmitFnNameRecognizer(Ints, OS);
- // Emit the intrinsic verifier.
- EmitVerifier(Ints, OS);
-
// Emit the intrinsic declaration generator.
EmitGenerator(Ints, OS);
@@ -174,337 +210,299 @@ EmitIntrinsicToOverloadTable(const std::vector<CodeGenIntrinsic> &Ints,
OS << "#endif\n\n";
}
-static void EmitTypeForValueType(raw_ostream &OS, MVT::SimpleValueType VT) {
+
+// NOTE: This must be kept in sync with the copy in lib/VMCore/Function.cpp!
+enum IIT_Info {
+ // Common values should be encoded with 0-15.
+ IIT_Done = 0,
+ IIT_I1 = 1,
+ IIT_I8 = 2,
+ IIT_I16 = 3,
+ IIT_I32 = 4,
+ IIT_I64 = 5,
+ IIT_F32 = 6,
+ IIT_F64 = 7,
+ IIT_V2 = 8,
+ IIT_V4 = 9,
+ IIT_V8 = 10,
+ IIT_V16 = 11,
+ IIT_V32 = 12,
+ IIT_MMX = 13,
+ IIT_PTR = 14,
+ IIT_ARG = 15,
+
+ // Values from 16+ are only encodable with the inefficient encoding.
+ IIT_METADATA = 16,
+ IIT_EMPTYSTRUCT = 17,
+ IIT_STRUCT2 = 18,
+ IIT_STRUCT3 = 19,
+ IIT_STRUCT4 = 20,
+ IIT_STRUCT5 = 21,
+ IIT_EXTEND_VEC_ARG = 22,
+ IIT_TRUNC_VEC_ARG = 23,
+ IIT_ANYPTR = 24
+};
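+
+// Worked example (illustrative only): an intrinsic of type "i32 (i8*, i32)"
+// encodes as { IIT_I32, IIT_PTR, IIT_I8, IIT_I32 } == { 4, 14, 2, 4 }:
+// return type first, then parameters, with each pointer immediately
+// followed by the encoding of its element type.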
+
+
+static void EncodeFixedValueType(MVT::SimpleValueType VT,
+ std::vector<unsigned char> &Sig) {
if (EVT(VT).isInteger()) {
unsigned BitWidth = EVT(VT).getSizeInBits();
- OS << "IntegerType::get(Context, " << BitWidth << ")";
- } else if (VT == MVT::Other) {
- // MVT::OtherVT is used to mean the empty struct type here.
- OS << "StructType::get(Context)";
- } else if (VT == MVT::f16) {
- OS << "Type::getHalfTy(Context)";
- } else if (VT == MVT::f32) {
- OS << "Type::getFloatTy(Context)";
- } else if (VT == MVT::f64) {
- OS << "Type::getDoubleTy(Context)";
- } else if (VT == MVT::f80) {
- OS << "Type::getX86_FP80Ty(Context)";
- } else if (VT == MVT::f128) {
- OS << "Type::getFP128Ty(Context)";
- } else if (VT == MVT::ppcf128) {
- OS << "Type::getPPC_FP128Ty(Context)";
- } else if (VT == MVT::isVoid) {
- OS << "Type::getVoidTy(Context)";
- } else if (VT == MVT::Metadata) {
- OS << "Type::getMetadataTy(Context)";
- } else if (VT == MVT::x86mmx) {
- OS << "Type::getX86_MMXTy(Context)";
- } else {
- assert(false && "Unsupported ValueType!");
+ switch (BitWidth) {
+ default: throw "unhandled integer type width in intrinsic!";
+ case 1: return Sig.push_back(IIT_I1);
+ case 8: return Sig.push_back(IIT_I8);
+ case 16: return Sig.push_back(IIT_I16);
+ case 32: return Sig.push_back(IIT_I32);
+ case 64: return Sig.push_back(IIT_I64);
+ }
}
-}
-
-static void EmitTypeGenerate(raw_ostream &OS, const Record *ArgType,
- unsigned &ArgNo);
-
-static void EmitTypeGenerate(raw_ostream &OS,
- const std::vector<Record*> &ArgTypes,
- unsigned &ArgNo) {
- if (ArgTypes.empty())
- return EmitTypeForValueType(OS, MVT::isVoid);
- if (ArgTypes.size() == 1)
- return EmitTypeGenerate(OS, ArgTypes.front(), ArgNo);
-
- OS << "StructType::get(";
-
- for (std::vector<Record*>::const_iterator
- I = ArgTypes.begin(), E = ArgTypes.end(); I != E; ++I) {
- EmitTypeGenerate(OS, *I, ArgNo);
- OS << ", ";
+ switch (VT) {
+ default: throw "unhandled MVT in intrinsic!";
+ case MVT::f32: return Sig.push_back(IIT_F32);
+ case MVT::f64: return Sig.push_back(IIT_F64);
+ case MVT::Metadata: return Sig.push_back(IIT_METADATA);
+ case MVT::x86mmx: return Sig.push_back(IIT_MMX);
+ // MVT::OtherVT is used to mean the empty struct type here.
+ case MVT::Other: return Sig.push_back(IIT_EMPTYSTRUCT);
}
-
- OS << " NULL)";
}
-static void EmitTypeGenerate(raw_ostream &OS, const Record *ArgType,
- unsigned &ArgNo) {
- MVT::SimpleValueType VT = getValueType(ArgType->getValueAsDef("VT"));
-
- if (ArgType->isSubClassOf("LLVMMatchType")) {
- unsigned Number = ArgType->getValueAsInt("Number");
- assert(Number < ArgNo && "Invalid matching number!");
- if (ArgType->isSubClassOf("LLVMExtendedElementVectorType"))
- OS << "VectorType::getExtendedElementVectorType"
- << "(dyn_cast<VectorType>(Tys[" << Number << "]))";
- else if (ArgType->isSubClassOf("LLVMTruncatedElementVectorType"))
- OS << "VectorType::getTruncatedElementVectorType"
- << "(dyn_cast<VectorType>(Tys[" << Number << "]))";
+#ifdef _MSC_VER
+#pragma optimize("",off) // MSVC 2010 optimizer can't deal with this function.
+#endif
+
+static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
+ std::vector<unsigned char> &Sig) {
+
+ if (R->isSubClassOf("LLVMMatchType")) {
+ unsigned Number = R->getValueAsInt("Number");
+ assert(Number < ArgCodes.size() && "Invalid matching number!");
+ if (R->isSubClassOf("LLVMExtendedElementVectorType"))
+ Sig.push_back(IIT_EXTEND_VEC_ARG);
+ else if (R->isSubClassOf("LLVMTruncatedElementVectorType"))
+ Sig.push_back(IIT_TRUNC_VEC_ARG);
else
- OS << "Tys[" << Number << "]";
- } else if (VT == MVT::iAny || VT == MVT::fAny || VT == MVT::vAny) {
- // NOTE: The ArgNo variable here is not the absolute argument number, it is
- // the index of the "arbitrary" type in the Tys array passed to the
- // Intrinsic::getDeclaration function. Consequently, we only want to
- // increment it when we actually hit an overloaded type. Getting this wrong
- // leads to very subtle bugs!
- OS << "Tys[" << ArgNo++ << "]";
- } else if (EVT(VT).isVector()) {
+ Sig.push_back(IIT_ARG);
+ return Sig.push_back((Number << 2) | ArgCodes[Number]);
+ }
+
+ MVT::SimpleValueType VT = getValueType(R->getValueAsDef("VT"));
+
+ unsigned Tmp = 0;
+ switch (VT) {
+ default: break;
+ case MVT::iPTRAny: ++Tmp; // FALL THROUGH.
+ case MVT::vAny: ++Tmp; // FALL THROUGH.
+ case MVT::fAny: ++Tmp; // FALL THROUGH.
+ case MVT::iAny: {
+ // If this is an "any" valuetype, the concrete type is taken from the
+ // next entry in the type list specified to getIntrinsic().
+ Sig.push_back(IIT_ARG);
+
+ // Figure out what arg # this is consuming, and remember what kind it was.
+ unsigned ArgNo = ArgCodes.size();
+ ArgCodes.push_back(Tmp);
+
+ // Encode what sort of argument it must be in the low 2 bits of the ArgNo.
+ return Sig.push_back((ArgNo << 2) | Tmp);
+ }
+
+ case MVT::iPTR: {
+ unsigned AddrSpace = 0;
+ if (R->isSubClassOf("LLVMQualPointerType")) {
+ AddrSpace = R->getValueAsInt("AddrSpace");
+ assert(AddrSpace < 256 && "Address space exceeds 255");
+ }
+ if (AddrSpace) {
+ Sig.push_back(IIT_ANYPTR);
+ Sig.push_back(AddrSpace);
+ } else {
+ Sig.push_back(IIT_PTR);
+ }
+ return EncodeFixedType(R->getValueAsDef("ElTy"), ArgCodes, Sig);
+ }
+ }
+
+ if (EVT(VT).isVector()) {
EVT VVT = VT;
- OS << "VectorType::get(";
- EmitTypeForValueType(OS, VVT.getVectorElementType().getSimpleVT().SimpleTy);
- OS << ", " << VVT.getVectorNumElements() << ")";
- } else if (VT == MVT::iPTR) {
- OS << "PointerType::getUnqual(";
- EmitTypeGenerate(OS, ArgType->getValueAsDef("ElTy"), ArgNo);
- OS << ")";
- } else if (VT == MVT::iPTRAny) {
- // Make sure the user has passed us an argument type to overload. If not,
- // treat it as an ordinary (not overloaded) intrinsic.
- OS << "(" << ArgNo << " < Tys.size()) ? Tys[" << ArgNo
- << "] : PointerType::getUnqual(";
- EmitTypeGenerate(OS, ArgType->getValueAsDef("ElTy"), ArgNo);
- OS << ")";
- ++ArgNo;
- } else if (VT == MVT::isVoid) {
- if (ArgNo == 0)
- OS << "Type::getVoidTy(Context)";
- else
- // MVT::isVoid is used to mean varargs here.
- OS << "...";
- } else {
- EmitTypeForValueType(OS, VT);
+ switch (VVT.getVectorNumElements()) {
+ default: throw "unhandled vector type width in intrinsic!";
+ case 2: Sig.push_back(IIT_V2); break;
+ case 4: Sig.push_back(IIT_V4); break;
+ case 8: Sig.push_back(IIT_V8); break;
+ case 16: Sig.push_back(IIT_V16); break;
+ case 32: Sig.push_back(IIT_V32); break;
+ }
+
+ return EncodeFixedValueType(VVT.getVectorElementType().
+ getSimpleVT().SimpleTy, Sig);
}
-}
-
-/// RecordListComparator - Provide a deterministic comparator for lists of
-/// records.
-namespace {
- typedef std::pair<std::vector<Record*>, std::vector<Record*> > RecPair;
- struct RecordListComparator {
- bool operator()(const RecPair &LHS,
- const RecPair &RHS) const {
- unsigned i = 0;
- const std::vector<Record*> *LHSVec = &LHS.first;
- const std::vector<Record*> *RHSVec = &RHS.first;
- unsigned RHSSize = RHSVec->size();
- unsigned LHSSize = LHSVec->size();
-
- for (; i != LHSSize; ++i) {
- if (i == RHSSize) return false; // RHS is shorter than LHS.
- if ((*LHSVec)[i] != (*RHSVec)[i])
- return (*LHSVec)[i]->getName() < (*RHSVec)[i]->getName();
- }
- if (i != RHSSize) return true;
+ EncodeFixedValueType(VT, Sig);
+}
- i = 0;
- LHSVec = &LHS.second;
- RHSVec = &RHS.second;
- RHSSize = RHSVec->size();
- LHSSize = LHSVec->size();
+#ifdef _MSC_VER
+#pragma optimize("",on)
+#endif
- for (i = 0; i != LHSSize; ++i) {
- if (i == RHSSize) return false; // RHS is shorter than LHS.
- if ((*LHSVec)[i] != (*RHSVec)[i])
- return (*LHSVec)[i]->getName() < (*RHSVec)[i]->getName();
- }
-
- return i != RHSSize;
+/// ComputeFixedEncoding - Compute the encoded type signature for this
+/// intrinsic, appending its IIT codes to TypeSig.
+static void ComputeFixedEncoding(const CodeGenIntrinsic &Int,
+ std::vector<unsigned char> &TypeSig) {
+ std::vector<unsigned char> ArgCodes;
+
+ if (Int.IS.RetVTs.empty())
+ TypeSig.push_back(IIT_Done);
+ else if (Int.IS.RetVTs.size() == 1 &&
+ Int.IS.RetVTs[0] == MVT::isVoid)
+ TypeSig.push_back(IIT_Done);
+ else {
+ switch (Int.IS.RetVTs.size()) {
+ case 1: break;
+ case 2: TypeSig.push_back(IIT_STRUCT2); break;
+ case 3: TypeSig.push_back(IIT_STRUCT3); break;
+ case 4: TypeSig.push_back(IIT_STRUCT4); break;
+ case 5: TypeSig.push_back(IIT_STRUCT5); break;
+ default: assert(0 && "Unhandled case in struct");
}
- };
+
+ for (unsigned i = 0, e = Int.IS.RetVTs.size(); i != e; ++i)
+ EncodeFixedType(Int.IS.RetTypeDefs[i], ArgCodes, TypeSig);
+ }
+
+ for (unsigned i = 0, e = Int.IS.ParamTypeDefs.size(); i != e; ++i)
+ EncodeFixedType(Int.IS.ParamTypeDefs[i], ArgCodes, TypeSig);
}
-void IntrinsicEmitter::EmitVerifier(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS) {
- OS << "// Verifier::visitIntrinsicFunctionCall code.\n";
- OS << "#ifdef GET_INTRINSIC_VERIFIER\n";
- OS << " switch (ID) {\n";
- OS << " default: llvm_unreachable(\"Invalid intrinsic!\");\n";
+static void printIITEntry(raw_ostream &OS, unsigned char X) {
+ OS << (unsigned)X;
+}
+
+void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS) {
+ // If we can compute a 32-bit fixed encoding for this intrinsic, do so and
+ // capture it in this vector, otherwise store a ~0U.
+ std::vector<unsigned> FixedEncodings;
+
+ SequenceToOffsetTable<std::vector<unsigned char> > LongEncodingTable;
- // This checking can emit a lot of very common code. To reduce the amount of
- // code that we emit, batch up cases that have identical types. This avoids
- // problems where GCC can run out of memory compiling Verifier.cpp.
- typedef std::map<RecPair, std::vector<unsigned>, RecordListComparator> MapTy;
- MapTy UniqueArgInfos;
+ std::vector<unsigned char> TypeSig;
// Compute the unique argument type info.
- for (unsigned i = 0, e = Ints.size(); i != e; ++i)
- UniqueArgInfos[make_pair(Ints[i].IS.RetTypeDefs,
- Ints[i].IS.ParamTypeDefs)].push_back(i);
-
- // Loop through the array, emitting one comparison for each batch.
- for (MapTy::iterator I = UniqueArgInfos.begin(),
- E = UniqueArgInfos.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- OS << " case Intrinsic::" << Ints[I->second[i]].EnumName << ":\t\t// "
- << Ints[I->second[i]].Name << "\n";
-
- const RecPair &ArgTypes = I->first;
- const std::vector<Record*> &RetTys = ArgTypes.first;
- const std::vector<Record*> &ParamTys = ArgTypes.second;
- std::vector<unsigned> OverloadedTypeIndices;
-
- OS << " VerifyIntrinsicPrototype(ID, IF, " << RetTys.size() << ", "
- << ParamTys.size();
-
- // Emit return types.
- for (unsigned j = 0, je = RetTys.size(); j != je; ++j) {
- Record *ArgType = RetTys[j];
- OS << ", ";
-
- if (ArgType->isSubClassOf("LLVMMatchType")) {
- unsigned Number = ArgType->getValueAsInt("Number");
- assert(Number < OverloadedTypeIndices.size() &&
- "Invalid matching number!");
- Number = OverloadedTypeIndices[Number];
- if (ArgType->isSubClassOf("LLVMExtendedElementVectorType"))
- OS << "~(ExtendedElementVectorType | " << Number << ")";
- else if (ArgType->isSubClassOf("LLVMTruncatedElementVectorType"))
- OS << "~(TruncatedElementVectorType | " << Number << ")";
- else
- OS << "~" << Number;
- } else {
- MVT::SimpleValueType VT = getValueType(ArgType->getValueAsDef("VT"));
- OS << getEnumName(VT);
-
- if (EVT(VT).isOverloaded())
- OverloadedTypeIndices.push_back(j);
-
- if (VT == MVT::isVoid && j != 0 && j != je - 1)
- throw "Var arg type not last argument";
+ for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
+ // Get the signature for the intrinsic.
+ TypeSig.clear();
+ ComputeFixedEncoding(Ints[i], TypeSig);
+
+ // Check to see if we can encode it into a 32-bit word. We can only encode
+ // 8 nibbles into a 32-bit word.
+ if (TypeSig.size() <= 8) {
+ bool Failed = false;
+ unsigned Result = 0;
+ for (unsigned i = 0, e = TypeSig.size(); i != e; ++i) {
+ // If we had an unencodable argument, bail out.
+ if (TypeSig[i] > 15) {
+ Failed = true;
+ break;
+ }
+ Result = (Result << 4) | TypeSig[e-i-1];
}
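+ // E.g. (illustrative) TypeSig { 4, 14, 2, 4 } packs low-nibble-first
+ // into the 32-bit word 0x42E4.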
- }
-
- // Emit the parameter types.
- for (unsigned j = 0, je = ParamTys.size(); j != je; ++j) {
- Record *ArgType = ParamTys[j];
- OS << ", ";
-
- if (ArgType->isSubClassOf("LLVMMatchType")) {
- unsigned Number = ArgType->getValueAsInt("Number");
- assert(Number < OverloadedTypeIndices.size() &&
- "Invalid matching number!");
- Number = OverloadedTypeIndices[Number];
- if (ArgType->isSubClassOf("LLVMExtendedElementVectorType"))
- OS << "~(ExtendedElementVectorType | " << Number << ")";
- else if (ArgType->isSubClassOf("LLVMTruncatedElementVectorType"))
- OS << "~(TruncatedElementVectorType | " << Number << ")";
- else
- OS << "~" << Number;
- } else {
- MVT::SimpleValueType VT = getValueType(ArgType->getValueAsDef("VT"));
- OS << getEnumName(VT);
-
- if (EVT(VT).isOverloaded())
- OverloadedTypeIndices.push_back(j + RetTys.size());
-
- if (VT == MVT::isVoid && j != 0 && j != je - 1)
- throw "Var arg type not last argument";
+
+ // If this could be encoded into a 31-bit word, record it and move on.
+ if (!Failed && (Result >> 31) == 0) {
+ FixedEncodings.push_back(Result);
+ continue;
}
}
+
+ // Otherwise, we're going to unique the sequence into the
+ // LongEncodingTable, and use its offset in the 32-bit table instead.
+ LongEncodingTable.add(TypeSig);
- OS << ");\n";
- OS << " break;\n";
+ // This is a placeholder that we'll replace after the table is laid out.
+ FixedEncodings.push_back(~0U);
}
- OS << " }\n";
- OS << "#endif\n\n";
-}
-
-void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS) {
- OS << "// Code for generating Intrinsic function declarations.\n";
- OS << "#ifdef GET_INTRINSIC_GENERATOR\n";
- OS << " switch (id) {\n";
- OS << " default: llvm_unreachable(\"Invalid intrinsic!\");\n";
- // Similar to GET_INTRINSIC_VERIFIER, batch up cases that have identical
- // types.
- typedef std::map<RecPair, std::vector<unsigned>, RecordListComparator> MapTy;
- MapTy UniqueArgInfos;
+ LongEncodingTable.layout();
- // Compute the unique argument type info.
- for (unsigned i = 0, e = Ints.size(); i != e; ++i)
- UniqueArgInfos[make_pair(Ints[i].IS.RetTypeDefs,
- Ints[i].IS.ParamTypeDefs)].push_back(i);
+ OS << "// Global intrinsic function declaration type table.\n";
+ OS << "#ifdef GET_INTRINSIC_GENERATOR_GLOBAL\n";
- // Loop through the array, emitting one generator for each batch.
- std::string IntrinsicStr = TargetPrefix + "Intrinsic::";
+ OS << "static const unsigned IIT_Table[] = {\n ";
- for (MapTy::iterator I = UniqueArgInfos.begin(),
- E = UniqueArgInfos.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- OS << " case " << IntrinsicStr << Ints[I->second[i]].EnumName
- << ":\t\t// " << Ints[I->second[i]].Name << "\n";
+ for (unsigned i = 0, e = FixedEncodings.size(); i != e; ++i) {
+ if ((i & 7) == 7)
+ OS << "\n ";
- const RecPair &ArgTypes = I->first;
- const std::vector<Record*> &RetTys = ArgTypes.first;
- const std::vector<Record*> &ParamTys = ArgTypes.second;
-
- unsigned N = ParamTys.size();
-
- if (N > 1 &&
- getValueType(ParamTys[N - 1]->getValueAsDef("VT")) == MVT::isVoid) {
- OS << " IsVarArg = true;\n";
- --N;
+ // If the entry fit in the table, just emit it.
+ if (FixedEncodings[i] != ~0U) {
+ OS << "0x" << utohexstr(FixedEncodings[i]) << ", ";
+ continue;
}
-
- unsigned ArgNo = 0;
- OS << " ResultTy = ";
- EmitTypeGenerate(OS, RetTys, ArgNo);
- OS << ";\n";
- for (unsigned j = 0; j != N; ++j) {
- OS << " ArgTys.push_back(";
- EmitTypeGenerate(OS, ParamTys[j], ArgNo);
- OS << ");\n";
- }
+ TypeSig.clear();
+ ComputeFixedEncoding(Ints[i], TypeSig);
- OS << " break;\n";
+
+ // Otherwise, emit the offset into the long encoding table. We emit it this
+ // way so that it is easier to read the offset in the .def file.
+ OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", ";
}
+
+ OS << "0\n};\n\n";
+
+ // Emit the shared table of long encodings.
+ OS << "static const unsigned char IIT_LongEncodingTable[] = {\n";
+ if (!LongEncodingTable.empty())
+ LongEncodingTable.emit(OS, printIITEntry);
+ OS << " 255\n};\n\n";
+
+ OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL
+}
- OS << " }\n";
- OS << "#endif\n\n";
+enum ModRefKind {
+ MRK_none,
+ MRK_readonly,
+ MRK_readnone
+};
+
+static ModRefKind getModRefKind(const CodeGenIntrinsic &intrinsic) {
+ switch (intrinsic.ModRef) {
+ case CodeGenIntrinsic::NoMem:
+ return MRK_readnone;
+ case CodeGenIntrinsic::ReadArgMem:
+ case CodeGenIntrinsic::ReadMem:
+ return MRK_readonly;
+ case CodeGenIntrinsic::ReadWriteArgMem:
+ case CodeGenIntrinsic::ReadWriteMem:
+ return MRK_none;
+ }
+ llvm_unreachable("bad mod-ref kind");
}
namespace {
- enum ModRefKind {
- MRK_none,
- MRK_readonly,
- MRK_readnone
- };
-
- ModRefKind getModRefKind(const CodeGenIntrinsic &intrinsic) {
- switch (intrinsic.ModRef) {
- case CodeGenIntrinsic::NoMem:
- return MRK_readnone;
- case CodeGenIntrinsic::ReadArgMem:
- case CodeGenIntrinsic::ReadMem:
- return MRK_readonly;
- case CodeGenIntrinsic::ReadWriteArgMem:
- case CodeGenIntrinsic::ReadWriteMem:
- return MRK_none;
- }
- llvm_unreachable("bad mod-ref kind");
+struct AttributeComparator {
+ bool operator()(const CodeGenIntrinsic *L, const CodeGenIntrinsic *R) const {
+ // Sort throwing intrinsics after non-throwing intrinsics.
+ if (L->canThrow != R->canThrow)
+ return R->canThrow;
+
+ if (L->isNoReturn != R->isNoReturn)
+ return R->isNoReturn;
+
+ // Try to order by readonly/readnone attribute.
+ ModRefKind LK = getModRefKind(*L);
+ ModRefKind RK = getModRefKind(*R);
+ if (LK != RK) return (LK > RK);
+
+ // Order by argument attributes.
+ // This is reliable because each side is already sorted internally.
+ return (L->ArgumentAttributes < R->ArgumentAttributes);
}
-
- struct AttributeComparator {
- bool operator()(const CodeGenIntrinsic *L, const CodeGenIntrinsic *R) const {
- // Sort throwing intrinsics after non-throwing intrinsics.
- if (L->canThrow != R->canThrow)
- return R->canThrow;
-
- // Try to order by readonly/readnone attribute.
- ModRefKind LK = getModRefKind(*L);
- ModRefKind RK = getModRefKind(*R);
- if (LK != RK) return (LK > RK);
-
- // Order by argument attributes.
- // This is reliable because each side is already sorted internally.
- return (L->ArgumentAttributes < R->ArgumentAttributes);
- }
- };
-}
+};
+} // End anonymous namespace
/// EmitAttributes - This emits the Intrinsic::getAttributes method.
void IntrinsicEmitter::
@@ -592,16 +590,30 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
ModRefKind modRef = getModRefKind(intrinsic);
- if (!intrinsic.canThrow || modRef) {
+ if (!intrinsic.canThrow || modRef || intrinsic.isNoReturn) {
OS << " AWI[" << numAttrs++ << "] = AttributeWithIndex::get(~0, ";
+ bool Emitted = false;
if (!intrinsic.canThrow) {
OS << "Attribute::NoUnwind";
- if (modRef) OS << '|';
+ Emitted = true;
+ }
+
+ if (intrinsic.isNoReturn) {
+ if (Emitted) OS << '|';
+ OS << "Attribute::NoReturn";
+ Emitted = true;
}
+
switch (modRef) {
case MRK_none: break;
- case MRK_readonly: OS << "Attribute::ReadOnly"; break;
- case MRK_readnone: OS << "Attribute::ReadNone"; break;
+ case MRK_readonly:
+ if (Emitted) OS << '|';
+ OS << "Attribute::ReadOnly";
+ break;
+ case MRK_readnone:
+ if (Emitted) OS << '|';
+ OS << "Attribute::ReadNone";
+ break;
}
OS << ");\n";
}
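+ // E.g. (illustrative) a nounwind readnone intrinsic comes out as:
+ // AWI[0] = AttributeWithIndex::get(~0, Attribute::NoUnwind|Attribute::ReadNone);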
@@ -616,7 +628,8 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
OS << " }\n";
OS << " }\n";
- OS << " return AttrListPtr::get(AWI, NumAttrs);\n";
+ OS << " return AttrListPtr::get(ArrayRef<AttributeWithIndex>(AWI, "
+ "NumAttrs));\n";
OS << "}\n";
OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n";
}
@@ -730,3 +743,11 @@ EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
OS << "}\n";
OS << "#endif\n\n";
}
+
+namespace llvm {
+
+void EmitIntrinsics(RecordKeeper &RK, raw_ostream &OS, bool TargetOnly = false) {
+ IntrinsicEmitter(RK, TargetOnly).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/IntrinsicEmitter.h b/utils/TableGen/IntrinsicEmitter.h
deleted file mode 100644
index f9bcd59..0000000
--- a/utils/TableGen/IntrinsicEmitter.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===- IntrinsicEmitter.h - Generate intrinsic information ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend emits information about intrinsic functions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef INTRINSIC_EMITTER_H
-#define INTRINSIC_EMITTER_H
-
-#include "CodeGenIntrinsics.h"
-#include "llvm/TableGen/TableGenBackend.h"
-
-namespace llvm {
- class IntrinsicEmitter : public TableGenBackend {
- RecordKeeper &Records;
- bool TargetOnly;
- std::string TargetPrefix;
-
- public:
- IntrinsicEmitter(RecordKeeper &R, bool T = false)
- : Records(R), TargetOnly(T) {}
-
- void run(raw_ostream &OS);
-
- void EmitPrefix(raw_ostream &OS);
-
- void EmitEnumInfo(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
-
- void EmitFnNameRecognizer(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitIntrinsicToNameTable(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitIntrinsicToOverloadTable(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitVerifier(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitModRefBehavior(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
- raw_ostream &OS);
- void EmitSuffix(raw_ostream &OS);
- };
-
-} // End llvm namespace
-
-#endif
-
-
-
diff --git a/utils/TableGen/PseudoLoweringEmitter.cpp b/utils/TableGen/PseudoLoweringEmitter.cpp
index 802d112..8d9d419 100644
--- a/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -9,16 +9,62 @@
#define DEBUG_TYPE "pseudo-lowering"
#include "CodeGenInstruction.h"
-#include "PseudoLoweringEmitter.h"
-#include "llvm/TableGen/Error.h"
-#include "llvm/TableGen/Record.h"
+#include "CodeGenTarget.h"
#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <vector>
using namespace llvm;
+namespace {
+class PseudoLoweringEmitter {
+ struct OpData {
+ enum MapKind { Operand, Imm, Reg };
+ MapKind Kind;
+ union {
+ unsigned Operand; // Operand number mapped to.
+ uint64_t Imm; // Integer immediate value.
+ Record *Reg; // Physical register.
+ } Data;
+ };
+ struct PseudoExpansion {
+ CodeGenInstruction Source; // The source pseudo instruction definition.
+ CodeGenInstruction Dest; // The destination instruction to lower to.
+ IndexedMap<OpData> OperandMap;
+
+ PseudoExpansion(CodeGenInstruction &s, CodeGenInstruction &d,
+ IndexedMap<OpData> &m) :
+ Source(s), Dest(d), OperandMap(m) {}
+ };
+
+ RecordKeeper &Records;
+
+ // It's overkill to have an instance of the full CodeGenTarget object,
+ // but since it loads everything on demand rather than in the constructor,
+ // the performance cost is negligible and it works out OK.
+ CodeGenTarget Target;
+
+ SmallVector<PseudoExpansion, 64> Expansions;
+
+ unsigned addDagOperandMapping(Record *Rec, DagInit *Dag,
+ CodeGenInstruction &Insn,
+ IndexedMap<OpData> &OperandMap,
+ unsigned BaseIdx);
+ void evaluateExpansion(Record *Pseudo);
+ void emitLoweringEmitter(raw_ostream &o);
+public:
+ PseudoLoweringEmitter(RecordKeeper &R) : Records(R), Target(R) {}
+
+ /// run - Output the pseudo-lowerings.
+ void run(raw_ostream &o);
+};
+} // End anonymous namespace
+
// FIXME: This pass currently can only expand a pseudo to a single instruction.
// The pseudo expansion really should take a list of dags, not just
// a single dag, so we can do fancier things.
@@ -150,7 +196,7 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
// Emit file header.
- EmitSourceFileHeader("Pseudo-instruction MC lowering Source Fragment", o);
+ emitSourceFileHeader("Pseudo-instruction MC lowering Source Fragment", o);
o << "bool " << Target.getName() + "AsmPrinter" << "::\n"
<< "emitPseudoExpansionLowering(MCStreamer &OutStreamer,\n"
@@ -242,3 +288,10 @@ void PseudoLoweringEmitter::run(raw_ostream &o) {
emitLoweringEmitter(o);
}
+namespace llvm {
+
+void EmitPseudoLowering(RecordKeeper &RK, raw_ostream &OS) {
+ PseudoLoweringEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/PseudoLoweringEmitter.h b/utils/TableGen/PseudoLoweringEmitter.h
deleted file mode 100644
index 325bc8b..0000000
--- a/utils/TableGen/PseudoLoweringEmitter.h
+++ /dev/null
@@ -1,65 +0,0 @@
-//===- PseudoLoweringEmitter.h - PseudoLowering Generator -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PSEUDOLOWERINGEMITTER_H
-#define PSEUDOLOWERINGEMITTER_H
-
-#include "CodeGenInstruction.h"
-#include "CodeGenTarget.h"
-#include "llvm/TableGen/TableGenBackend.h"
-#include "llvm/ADT/IndexedMap.h"
-#include "llvm/ADT/SmallVector.h"
-
-namespace llvm {
-
-class PseudoLoweringEmitter : public TableGenBackend {
- struct OpData {
- enum MapKind { Operand, Imm, Reg };
- MapKind Kind;
- union {
- unsigned Operand; // Operand number mapped to.
- uint64_t Imm; // Integer immedate value.
- Record *Reg; // Physical register.
- } Data;
- };
- struct PseudoExpansion {
- CodeGenInstruction Source; // The source pseudo instruction definition.
- CodeGenInstruction Dest; // The destination instruction to lower to.
- IndexedMap<OpData> OperandMap;
-
- PseudoExpansion(CodeGenInstruction &s, CodeGenInstruction &d,
- IndexedMap<OpData> &m) :
- Source(s), Dest(d), OperandMap(m) {}
- };
-
- RecordKeeper &Records;
-
- // It's overkill to have an instance of the full CodeGenTarget object,
- // but it loads everything on demand, not in the constructor, so it's
- // lightweight in performance, so it works out OK.
- CodeGenTarget Target;
-
- SmallVector<PseudoExpansion, 64> Expansions;
-
- unsigned addDagOperandMapping(Record *Rec, DagInit *Dag,
- CodeGenInstruction &Insn,
- IndexedMap<OpData> &OperandMap,
- unsigned BaseIdx);
- void evaluateExpansion(Record *Pseudo);
- void emitLoweringEmitter(raw_ostream &o);
-public:
- PseudoLoweringEmitter(RecordKeeper &R) : Records(R), Target(R) {}
-
- /// run - Output the pseudo-lowerings.
- void run(raw_ostream &o);
-};
-
-} // end llvm namespace
-
-#endif
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 97fcca3..b913878 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -13,21 +13,58 @@
//
//===----------------------------------------------------------------------===//
-#include "RegisterInfoEmitter.h"
-#include "CodeGenTarget.h"
#include "CodeGenRegisters.h"
+#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
-#include "llvm/TableGen/Error.h"
-#include "llvm/TableGen/Record.h"
#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
#include <set>
+#include <vector>
using namespace llvm;
+namespace {
+class RegisterInfoEmitter {
+ RecordKeeper &Records;
+public:
+ RegisterInfoEmitter(RecordKeeper &R) : Records(R) {}
+
+ // runEnums - Print out enum values for all of the registers.
+ void runEnums(raw_ostream &o, CodeGenTarget &Target, CodeGenRegBank &Bank);
+
+ // runMCDesc - Print out MC register descriptions.
+ void runMCDesc(raw_ostream &o, CodeGenTarget &Target, CodeGenRegBank &Bank);
+
+ // runTargetHeader - Emit a header fragment for the register info emitter.
+ void runTargetHeader(raw_ostream &o, CodeGenTarget &Target,
+ CodeGenRegBank &Bank);
+
+ // runTargetDesc - Output the target register and register file descriptions.
+ void runTargetDesc(raw_ostream &o, CodeGenTarget &Target,
+ CodeGenRegBank &Bank);
+
+ // run - Output the register file description.
+ void run(raw_ostream &o);
+
+private:
+ void EmitRegMapping(raw_ostream &o,
+ const std::vector<CodeGenRegister*> &Regs, bool isCtor);
+ void EmitRegMappingTables(raw_ostream &o,
+ const std::vector<CodeGenRegister*> &Regs,
+ bool isCtor);
+ void EmitRegClasses(raw_ostream &OS, CodeGenTarget &Target);
+
+ void EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
+ const std::string &ClassName);
+};
+} // End anonymous namespace
+
// runEnums - Print out enum values for all of the registers.
void RegisterInfoEmitter::runEnums(raw_ostream &OS,
CodeGenTarget &Target, CodeGenRegBank &Bank) {
@@ -38,7 +75,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
std::string Namespace = Registers[0]->TheDef->getValueAsString("Namespace");
- EmitSourceFileHeader("Target Register Enum Values", OS);
+ emitSourceFileHeader("Target Register Enum Values", OS);
OS << "\n#ifdef GET_REGINFO_ENUM\n";
OS << "#undef GET_REGINFO_ENUM\n";
@@ -151,6 +188,17 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
<< "unsigned " << ClassName << "::getNumRegPressureSets() const {\n"
<< " return " << NumSets << ";\n}\n\n";
+ OS << "// Get the name of this register unit pressure set.\n"
+ << "const char *" << ClassName << "::\n"
+ << "getRegPressureSetName(unsigned Idx) const {\n"
+ << " static const char *PressureNameTable[] = {\n";
+ for (unsigned i = 0; i < NumSets; ++i ) {
+ OS << " \"" << RegBank.getRegPressureSet(i).Name << "\",\n";
+ }
+ OS << " 0 };\n"
+ << " return PressureNameTable[Idx];\n"
+ << "}\n\n";
+
OS << "// Get the register unit pressure limit for this dimension.\n"
<< "// This limit must be adjusted dynamically for reserved registers.\n"
<< "unsigned " << ClassName << "::\n"
@@ -159,7 +207,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
for (unsigned i = 0; i < NumSets; ++i ) {
const RegUnitSet &RegUnits = RegBank.getRegPressureSet(i);
OS << " " << RegBank.getRegUnitSetWeight(RegUnits.Units)
- << ", \t// " << i << ": " << RegBank.getRegPressureSet(i).Name << "\n";
+ << ", \t// " << i << ": " << RegUnits.Name << "\n";
}
OS << " 0 };\n"
<< " return PressureLimitTable[Idx];\n"
@@ -431,101 +479,203 @@ public:
}
};
-static void printRegister(raw_ostream &OS, const CodeGenRegister *Reg) {
- OS << getQualifiedName(Reg->TheDef);
-}
-
static void printSimpleValueType(raw_ostream &OS, MVT::SimpleValueType VT) {
OS << getEnumName(VT);
}
+static void printSubRegIndex(raw_ostream &OS, const CodeGenSubRegIndex *Idx) {
+ OS << Idx->EnumValue;
+}
+
+// Differentially encoded register and regunit lists allow for better
+// compression on regular register banks. The sequence is computed from the
+// differential list as:
+//
+// out[0] = InitVal;
+// out[n+1] = out[n] + diff[n]; // n = 0, 1, ...
+//
+// The initial value depends on the specific list. The list is terminated by a
+// 0 differential which means we can't encode repeated elements.
+
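+// Worked example (illustrative): the list {6, 7, 12} encoded with
+// InitVal = 5 is stored as the diffs {1, 1, 5}; decoding yields
+// 5+1 = 6, 6+1 = 7, 7+5 = 12, stopping at the 0 terminator in the table.
+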
+typedef SmallVector<uint16_t, 4> DiffVec;
+
+// Differentially encode a sequence of numbers into V. The starting value and
+// terminating 0 are not added to V, so it will have the same size as List.
+static
+DiffVec &diffEncode(DiffVec &V, unsigned InitVal, ArrayRef<unsigned> List) {
+ assert(V.empty() && "Clear DiffVec before diffEncode.");
+ uint16_t Val = uint16_t(InitVal);
+ for (unsigned i = 0; i != List.size(); ++i) {
+ uint16_t Cur = List[i];
+ V.push_back(Cur - Val);
+ Val = Cur;
+ }
+ return V;
+}
+
+template<typename Iter>
+static
+DiffVec &diffEncode(DiffVec &V, unsigned InitVal, Iter Begin, Iter End) {
+ assert(V.empty() && "Clear DiffVec before diffEncode.");
+ uint16_t Val = uint16_t(InitVal);
+ for (Iter I = Begin; I != End; ++I) {
+ uint16_t Cur = (*I)->EnumValue;
+ V.push_back(Cur - Val);
+ Val = Cur;
+ }
+ return V;
+}
+
+static void printDiff16(raw_ostream &OS, uint16_t Val) {
+ OS << Val;
+}
+
//
// runMCDesc - Print out MC register descriptions.
//
void
RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
CodeGenRegBank &RegBank) {
- EmitSourceFileHeader("MC Register Information", OS);
+ emitSourceFileHeader("MC Register Information", OS);
OS << "\n#ifdef GET_REGINFO_MC_DESC\n";
OS << "#undef GET_REGINFO_MC_DESC\n";
const std::vector<CodeGenRegister*> &Regs = RegBank.getRegisters();
- std::map<const CodeGenRegister*, CodeGenRegister::Set> Overlaps;
- RegBank.computeOverlaps(Overlaps);
// The lists of sub-registers, super-registers, and overlaps all go in the
// same array. That allows us to share suffixes.
typedef std::vector<const CodeGenRegister*> RegVec;
- SmallVector<RegVec, 4> SubRegLists(Regs.size());
- SmallVector<RegVec, 4> OverlapLists(Regs.size());
- SequenceToOffsetTable<RegVec, CodeGenRegister::Less> RegSeqs;
+
+ // Differentially encoded lists.
+ SequenceToOffsetTable<DiffVec> DiffSeqs;
+ SmallVector<DiffVec, 4> SubRegLists(Regs.size());
+ SmallVector<DiffVec, 4> SuperRegLists(Regs.size());
+ SmallVector<DiffVec, 4> OverlapLists(Regs.size());
+ SmallVector<DiffVec, 4> RegUnitLists(Regs.size());
+ SmallVector<unsigned, 4> RegUnitInitScale(Regs.size());
+
+ // Keep track of sub-register index lists as well. These are not
+ // differentially encoded.
+ typedef SmallVector<const CodeGenSubRegIndex*, 4> SubRegIdxVec;
+ SequenceToOffsetTable<SubRegIdxVec> SubRegIdxSeqs;
+ SmallVector<SubRegIdxVec, 4> SubRegIdxLists(Regs.size());
+
+ SequenceToOffsetTable<std::string> RegStrings;
// Precompute register lists for the SequenceToOffsetTable.
for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
const CodeGenRegister *Reg = Regs[i];
+ RegStrings.add(Reg->getName());
+
// Compute the ordered sub-register list.
SetVector<const CodeGenRegister*> SR;
Reg->addSubRegsPreOrder(SR, RegBank);
- RegVec &SubRegList = SubRegLists[i];
- SubRegList.assign(SR.begin(), SR.end());
- RegSeqs.add(SubRegList);
+ diffEncode(SubRegLists[i], Reg->EnumValue, SR.begin(), SR.end());
+ DiffSeqs.add(SubRegLists[i]);
+
+ // Compute the corresponding sub-register indexes.
+ SubRegIdxVec &SRIs = SubRegIdxLists[i];
+ for (unsigned j = 0, je = SR.size(); j != je; ++j)
+ SRIs.push_back(Reg->getSubRegIndex(SR[j]));
+ SubRegIdxSeqs.add(SRIs);
// Super-registers are already computed.
const RegVec &SuperRegList = Reg->getSuperRegs();
- RegSeqs.add(SuperRegList);
-
- // The list of overlaps doesn't need to have any particular order, except
- // Reg itself must be the first element. Pick an ordering that has one of
- // the other lists as a suffix.
- RegVec &OverlapList = OverlapLists[i];
- const RegVec &Suffix = SubRegList.size() > SuperRegList.size() ?
- SubRegList : SuperRegList;
- CodeGenRegister::Set Omit(Suffix.begin(), Suffix.end());
-
- // First element is Reg itself.
- OverlapList.push_back(Reg);
- Omit.insert(Reg);
-
- // Any elements not in Suffix.
- const CodeGenRegister::Set &OSet = Overlaps[Reg];
- std::set_difference(OSet.begin(), OSet.end(),
- Omit.begin(), Omit.end(),
- std::back_inserter(OverlapList),
- CodeGenRegister::Less());
-
- // Finally, Suffix itself.
- OverlapList.insert(OverlapList.end(), Suffix.begin(), Suffix.end());
- RegSeqs.add(OverlapList);
+ diffEncode(SuperRegLists[i], Reg->EnumValue,
+ SuperRegList.begin(), SuperRegList.end());
+ DiffSeqs.add(SuperRegLists[i]);
+
+ // The list of overlaps doesn't need to have any particular order, and Reg
+ // itself must be omitted.
+ DiffVec &OverlapList = OverlapLists[i];
+ CodeGenRegister::Set OSet;
+ Reg->computeOverlaps(OSet, RegBank);
+ OSet.erase(Reg);
+ diffEncode(OverlapList, Reg->EnumValue, OSet.begin(), OSet.end());
+ DiffSeqs.add(OverlapList);
+
+ // Differentially encode the register unit list, seeded by register number.
+ // First compute a scale factor that allows more diff-lists to be reused:
+ //
+ // D0 -> (S0, S1)
+ // D1 -> (S2, S3)
+ //
+ // A scale factor of 2 allows D0 and D1 to share a diff-list. The initial
+ // value for the differential decoder is the register number multiplied by
+ // the scale.
+ //
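+ // For instance (illustrative): if D1's enum value is E(D0)+1 and its
+ // units are D0's shifted by two, a scale of 2 advances both seed values
+ // in lockstep, so D0 and D1 encode to one shared diff-list.
+ //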
+ // Check the neighboring registers for arithmetic progressions.
+ unsigned ScaleA = ~0u, ScaleB = ~0u;
+ ArrayRef<unsigned> RUs = Reg->getNativeRegUnits();
+ if (i > 0 && Regs[i-1]->getNativeRegUnits().size() == RUs.size())
+ ScaleB = RUs.front() - Regs[i-1]->getNativeRegUnits().front();
+ if (i+1 != Regs.size() &&
+ Regs[i+1]->getNativeRegUnits().size() == RUs.size())
+ ScaleA = Regs[i+1]->getNativeRegUnits().front() - RUs.front();
+ unsigned Scale = std::min(ScaleB, ScaleA);
+ // Default the scale to 0 if it can't be encoded in 4 bits.
+ if (Scale >= 16)
+ Scale = 0;
+ RegUnitInitScale[i] = Scale;
+ DiffSeqs.add(diffEncode(RegUnitLists[i], Scale * Reg->EnumValue, RUs));
}
// Compute the final layout of the sequence table.
- RegSeqs.layout();
+ DiffSeqs.layout();
+ SubRegIdxSeqs.layout();
OS << "namespace llvm {\n\n";
const std::string &TargetName = Target.getName();
- // Emit the shared table of register lists.
- OS << "extern const uint16_t " << TargetName << "RegLists[] = {\n";
- RegSeqs.emit(OS, printRegister);
+ // Emit the shared table of differential lists.
+ OS << "extern const uint16_t " << TargetName << "RegDiffLists[] = {\n";
+ DiffSeqs.emit(OS, printDiff16);
+ OS << "};\n\n";
+
+ // Emit the table of sub-register indexes.
+ OS << "extern const uint16_t " << TargetName << "SubRegIdxLists[] = {\n";
+ SubRegIdxSeqs.emit(OS, printSubRegIndex);
+ OS << "};\n\n";
+
+ // Emit the string table.
+ RegStrings.layout();
+ OS << "extern const char " << TargetName << "RegStrings[] = {\n";
+ RegStrings.emit(OS, printChar);
OS << "};\n\n";
OS << "extern const MCRegisterDesc " << TargetName
<< "RegDesc[] = { // Descriptors\n";
- OS << " { \"NOREG\", 0, 0, 0 },\n";
+ OS << " { " << RegStrings.get("") << ", 0, 0, 0, 0, 0 },\n";
// Emit the register descriptors now.
for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
const CodeGenRegister *Reg = Regs[i];
- OS << " { \"" << Reg->getName() << "\", "
- << RegSeqs.get(OverlapLists[i]) << ", "
- << RegSeqs.get(SubRegLists[i]) << ", "
- << RegSeqs.get(Reg->getSuperRegs()) << " },\n";
+ OS << " { " << RegStrings.get(Reg->getName()) << ", "
+ << DiffSeqs.get(OverlapLists[i]) << ", "
+ << DiffSeqs.get(SubRegLists[i]) << ", "
+ << DiffSeqs.get(SuperRegLists[i]) << ", "
+ << SubRegIdxSeqs.get(SubRegIdxLists[i]) << ", "
+ << (DiffSeqs.get(RegUnitLists[i])*16 + RegUnitInitScale[i]) << " },\n";
}
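+ // The last field above packs the regunit diff-list offset into the upper
+ // bits and the 4-bit decode scale into the low bits, e.g. (illustrative)
+ // offset 7 with scale 2 is stored as 7*16 + 2 == 114.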
OS << "};\n\n"; // End of register descriptors...
+ // Emit the table of register unit roots. Each regunit has one or two root
+ // registers.
+ OS << "extern const uint16_t " << TargetName << "RegUnitRoots[][2] = {\n";
+ for (unsigned i = 0, e = RegBank.getNumNativeRegUnits(); i != e; ++i) {
+ ArrayRef<const CodeGenRegister*> Roots = RegBank.getRegUnit(i).getRoots();
+ assert(!Roots.empty() && "All regunits must have a root register.");
+ assert(Roots.size() <= 2 && "More than two roots not supported yet.");
+ OS << " { " << getQualifiedName(Roots.front()->TheDef);
+ for (unsigned r = 1; r != Roots.size(); ++r)
+ OS << ", " << getQualifiedName(Roots[r]->TheDef);
+ OS << " },\n";
+ }
+ OS << "};\n\n";
+
ArrayRef<CodeGenRegisterClass*> RegisterClasses = RegBank.getRegClasses();
// Loop over all of the register classes... emitting each one.
@@ -587,52 +737,41 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
OS << "};\n\n";
- // Emit the data table for getSubReg().
ArrayRef<CodeGenSubRegIndex*> SubRegIndices = RegBank.getSubRegIndices();
- if (SubRegIndices.size()) {
- OS << "const uint16_t " << TargetName << "SubRegTable[]["
- << SubRegIndices.size() << "] = {\n";
- for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
- const CodeGenRegister::SubRegMap &SRM = Regs[i]->getSubRegs();
- OS << " /* " << Regs[i]->TheDef->getName() << " */\n";
- if (SRM.empty()) {
- OS << " {0},\n";
- continue;
- }
- OS << " {";
- for (unsigned j = 0, je = SubRegIndices.size(); j != je; ++j) {
- // FIXME: We really should keep this to 80 columns...
- CodeGenRegister::SubRegMap::const_iterator SubReg =
- SRM.find(SubRegIndices[j]);
- if (SubReg != SRM.end())
- OS << getQualifiedName(SubReg->second->TheDef);
- else
- OS << "0";
- if (j != je - 1)
- OS << ", ";
- }
- OS << "}" << (i != e ? "," : "") << "\n";
- }
- OS << "};\n\n";
- OS << "const uint16_t *get" << TargetName
- << "SubRegTable() {\n return (const uint16_t *)" << TargetName
- << "SubRegTable;\n}\n\n";
- }
EmitRegMappingTables(OS, Regs, false);
+ // Emit Reg encoding table
+ OS << "extern const uint16_t " << TargetName;
+ OS << "RegEncodingTable[] = {\n";
+ // Add entry for NoRegister
+ OS << " 0,\n";
+ for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+ Record *Reg = Regs[i]->TheDef;
+ BitsInit *BI = Reg->getValueAsBitsInit("HWEncoding");
+ uint64_t Value = 0;
+ for (unsigned b = 0, be = BI->getNumBits(); b != be; ++b) {
+ if (BitInit *B = dynamic_cast<BitInit*>(BI->getBit(b)))
+ Value |= (uint64_t)B->getValue() << b;
+ }
+ OS << " " << Value << ",\n";
+ }
+ OS << "};\n"; // End of HW encoding table
+
// MCRegisterInfo initialization routine.
OS << "static inline void Init" << TargetName
<< "MCRegisterInfo(MCRegisterInfo *RI, unsigned RA, "
- << "unsigned DwarfFlavour = 0, unsigned EHFlavour = 0) {\n";
- OS << " RI->InitMCRegisterInfo(" << TargetName << "RegDesc, "
+ << "unsigned DwarfFlavour = 0, unsigned EHFlavour = 0) {\n"
+ << " RI->InitMCRegisterInfo(" << TargetName << "RegDesc, "
<< Regs.size()+1 << ", RA, " << TargetName << "MCRegisterClasses, "
- << RegisterClasses.size() << ", " << TargetName << "RegLists, ";
- if (SubRegIndices.size() != 0)
- OS << "(uint16_t*)" << TargetName << "SubRegTable, "
- << SubRegIndices.size() << ");\n\n";
- else
- OS << "NULL, 0);\n\n";
+ << RegisterClasses.size() << ", "
+ << TargetName << "RegUnitRoots, "
+ << RegBank.getNumNativeRegUnits() << ", "
+ << TargetName << "RegDiffLists, "
+ << TargetName << "RegStrings, "
+ << TargetName << "SubRegIdxLists, "
+ << SubRegIndices.size() << ",\n"
+ << " " << TargetName << "RegEncodingTable);\n\n";
EmitRegMapping(OS, Regs, false);
@@ -645,7 +784,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
void
RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
CodeGenRegBank &RegBank) {
- EmitSourceFileHeader("Register Information Header Fragment", OS);
+ emitSourceFileHeader("Register Information Header Fragment", OS);
OS << "\n#ifdef GET_REGINFO_HEADER\n";
OS << "#undef GET_REGINFO_HEADER\n";
@@ -661,16 +800,16 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
<< " explicit " << ClassName
<< "(unsigned RA, unsigned D = 0, unsigned E = 0);\n"
<< " virtual bool needsStackRealignment(const MachineFunction &) const\n"
- << " { return false; }\n"
- << " unsigned composeSubRegIndices(unsigned, unsigned) const;\n"
- << " const TargetRegisterClass *"
- "getSubClassWithSubReg(const TargetRegisterClass*, unsigned) const;\n"
- << " const TargetRegisterClass *getMatchingSuperRegClass("
- "const TargetRegisterClass*, const TargetRegisterClass*, "
- "unsigned) const;\n"
- << " const RegClassWeight &getRegClassWeight("
+ << " { return false; }\n";
+ if (!RegBank.getSubRegIndices().empty()) {
+ OS << " unsigned composeSubRegIndices(unsigned, unsigned) const;\n"
+ << " const TargetRegisterClass *"
+ "getSubClassWithSubReg(const TargetRegisterClass*, unsigned) const;\n";
+ }
+ OS << " const RegClassWeight &getRegClassWeight("
<< "const TargetRegisterClass *RC) const;\n"
<< " unsigned getNumRegPressureSets() const;\n"
+ << " const char *getRegPressureSetName(unsigned Idx) const;\n"
<< " unsigned getRegPressureSetLimit(unsigned Idx) const;\n"
<< " const int *getRegClassPressureSets("
<< "const TargetRegisterClass *RC) const;\n"
@@ -688,9 +827,6 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
// Output the extern for the instance.
OS << " extern const TargetRegisterClass " << Name << "RegClass;\n";
- // Output the extern for the pointer to the instance (should remove).
- OS << " static const TargetRegisterClass * const " << Name
- << "RegisterClass = &" << Name << "RegClass;\n";
}
OS << "} // end of namespace " << TargetName << "\n\n";
}
@@ -704,7 +840,7 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
void
RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
CodeGenRegBank &RegBank){
- EmitSourceFileHeader("Target Register and Register Classes Information", OS);
+ emitSourceFileHeader("Target Register and Register Classes Information", OS);
OS << "\n#ifdef GET_REGINFO_TARGET_DESC\n";
OS << "#undef GET_REGINFO_TARGET_DESC\n";
@@ -717,6 +853,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
// Start out by emitting each of the register classes.
ArrayRef<CodeGenRegisterClass*> RegisterClasses = RegBank.getRegClasses();
+ ArrayRef<CodeGenSubRegIndex*> SubRegIndices = RegBank.getSubRegIndices();
// Collect all registers belonging to any allocatable class.
std::set<Record*> AllocatableRegs;
@@ -739,72 +876,85 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
VTSeqs.emit(OS, printSimpleValueType, "MVT::Other");
OS << "};\n";
+ // Emit SubRegIndex names, skipping 0
+ OS << "\nstatic const char *const SubRegIndexTable[] = { \"";
+ for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
+ OS << SubRegIndices[i]->getName();
+ if (i+1 != e)
+ OS << "\", \"";
+ }
+ OS << "\" };\n\n";
+
+ // Emit names of the anonymous subreg indices.
+ unsigned NamedIndices = RegBank.getNumNamedIndices();
+ if (SubRegIndices.size() > NamedIndices) {
+ OS << " enum {";
+ for (unsigned i = NamedIndices, e = SubRegIndices.size(); i != e; ++i) {
+ OS << "\n " << SubRegIndices[i]->getName() << " = " << i+1;
+ if (i+1 != e)
+ OS << ',';
+ }
+ OS << "\n };\n\n";
+ }
+ OS << "\n";
+
// Now that all of the structs have been emitted, emit the instances.
if (!RegisterClasses.empty()) {
- std::map<unsigned, std::set<unsigned> > SuperRegClassMap;
-
OS << "\nstatic const TargetRegisterClass *const "
<< "NullRegClasses[] = { NULL };\n\n";
- unsigned NumSubRegIndices = RegBank.getSubRegIndices().size();
-
- if (NumSubRegIndices) {
- // Compute the super-register classes for each RegisterClass
- for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
- const CodeGenRegisterClass &RC = *RegisterClasses[rc];
- for (DenseMap<Record*,Record*>::const_iterator
- i = RC.SubRegClasses.begin(),
- e = RC.SubRegClasses.end(); i != e; ++i) {
- // Find the register class number of i->second for SuperRegClassMap.
- const CodeGenRegisterClass *RC2 = RegBank.getRegClass(i->second);
- assert(RC2 && "Invalid register class in SubRegClasses");
- SuperRegClassMap[RC2->EnumValue].insert(rc);
- }
- }
-
- // Emit the super-register classes for each RegisterClass
- for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
- const CodeGenRegisterClass &RC = *RegisterClasses[rc];
-
- // Give the register class a legal C name if it's anonymous.
- std::string Name = RC.getName();
-
- OS << "// " << Name
- << " Super-register Classes...\n"
- << "static const TargetRegisterClass *const "
- << Name << "SuperRegClasses[] = {\n ";
-
- bool Empty = true;
- std::map<unsigned, std::set<unsigned> >::iterator I =
- SuperRegClassMap.find(rc);
- if (I != SuperRegClassMap.end()) {
- for (std::set<unsigned>::iterator II = I->second.begin(),
- EE = I->second.end(); II != EE; ++II) {
- const CodeGenRegisterClass &RC2 = *RegisterClasses[*II];
- if (!Empty)
- OS << ", ";
- OS << "&" << RC2.getQualifiedName() << "RegClass";
- Empty = false;
- }
- }
-
- OS << (!Empty ? ", " : "") << "NULL";
- OS << "\n};\n\n";
- }
- }
+ // Emit register class bit mask tables. The first bit mask emitted for a
+ // register class, RC, is the set of sub-classes, including RC itself.
+ //
+ // If RC has super-registers, also create a list of subreg indices and bit
+ // masks, (Idx, Mask). The bit mask has a bit for every superreg regclass,
+ // SuperRC, that satisfies:
+ //
+ // For all SuperReg in SuperRC: SuperReg:Idx in RC
+ //
+ // The 0-terminated list of subreg indices starts at:
+ //
+ // RC->getSuperRegIndices() = SuperRegIdxSeqs + ...
+ //
+ // The corresponding bitmasks follow the sub-class mask in memory. Each
+ // mask has RCMaskWords uint32_t entries.
+ //
+ // Every bit mask present in the list has at least one bit set.
+
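For orientation, the layout described in the comment above can be walked like this: the first RCMaskWords words of a class's table are its sub-class mask, and each entry of the 0-terminated index list selects the next RCMaskWords-sized mask after it. A minimal consumer sketch (the helper name and free-standing signature are invented for illustration; the real accessors live in the register info classes):

    #include <stdint.h>

    // Hypothetical walk of the emitted layout; not generated code.
    static const uint32_t *getSuperRegMask(const uint32_t *SubClassMask,
                                           const uint16_t *SuperRegIdxList,
                                           unsigned RCMaskWords,
                                           uint16_t Idx) {
      const uint32_t *Mask = SubClassMask + RCMaskWords; // skip sub-class mask
      for (const uint16_t *I = SuperRegIdxList; *I; ++I) {
        if (*I == Idx)
          return Mask;       // mask of super-reg classes for this index
        Mask += RCMaskWords; // advance to the mask for the next index
      }
      return 0;              // Idx does not project into this class
    }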
+ // Compress the sub-reg index lists.
+ typedef std::vector<const CodeGenSubRegIndex*> IdxList;
+ SmallVector<IdxList, 8> SuperRegIdxLists(RegisterClasses.size());
+ SequenceToOffsetTable<IdxList> SuperRegIdxSeqs;
+ BitVector MaskBV(RegisterClasses.size());
- // Emit the sub-classes array for each RegisterClass
for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
const CodeGenRegisterClass &RC = *RegisterClasses[rc];
-
- // Give the register class a legal C name if it's anonymous.
- std::string Name = RC.getName();
-
- OS << "static const uint32_t " << Name << "SubclassMask[] = {\n ";
+ OS << "static const uint32_t " << RC.getName() << "SubClassMask[] = {\n ";
printBitVectorAsHex(OS, RC.getSubClasses(), 32);
+
+ // Emit super-reg class masks for any relevant SubRegIndices that can
+ // project into RC.
+ IdxList &SRIList = SuperRegIdxLists[rc];
+ for (unsigned sri = 0, sre = SubRegIndices.size(); sri != sre; ++sri) {
+ CodeGenSubRegIndex *Idx = SubRegIndices[sri];
+ MaskBV.reset();
+ RC.getSuperRegClasses(Idx, MaskBV);
+ if (MaskBV.none())
+ continue;
+ SRIList.push_back(Idx);
+ OS << "\n ";
+ printBitVectorAsHex(OS, MaskBV, 32);
+ OS << "// " << Idx->getName();
+ }
+ SuperRegIdxSeqs.add(SRIList);
OS << "\n};\n\n";
}
+ OS << "static const uint16_t SuperRegIdxSeqs[] = {\n";
+ SuperRegIdxSeqs.layout();
+ SuperRegIdxSeqs.emit(OS, printSubRegIndex);
+ OS << "};\n\n";
+
// Emit NULL terminated super-class lists.
for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
const CodeGenRegisterClass &RC = *RegisterClasses[rc];
@@ -865,13 +1015,12 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
<< '&' << Target.getName() << "MCRegisterClasses[" << RC.getName()
<< "RegClassID],\n "
<< "VTLists + " << VTSeqs.get(RC.VTs) << ",\n "
- << RC.getName() << "SubclassMask,\n ";
+ << RC.getName() << "SubClassMask,\n SuperRegIdxSeqs + "
+ << SuperRegIdxSeqs.get(SuperRegIdxLists[i]) << ",\n ";
if (RC.getSuperClasses().empty())
OS << "NullRegClasses,\n ";
else
OS << RC.getName() << "Superclasses,\n ";
- OS << (NumSubRegIndices ? RC.getName() + "Super" : std::string("Null"))
- << "RegClasses,\n ";
if (RC.AltOrderSelect.empty())
OS << "0\n";
else
@@ -906,67 +1055,39 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
OS << "};\n"; // End of register descriptors...
- // Calculate the mapping of subregister+index pairs to physical registers.
- // This will also create further anonymous indices.
- unsigned NamedIndices = RegBank.getNumNamedIndices();
-
- // Emit SubRegIndex names, skipping 0
- ArrayRef<CodeGenSubRegIndex*> SubRegIndices = RegBank.getSubRegIndices();
- OS << "\nstatic const char *const " << TargetName
- << "SubRegIndexTable[] = { \"";
- for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
- OS << SubRegIndices[i]->getName();
- if (i+1 != e)
- OS << "\", \"";
- }
- OS << "\" };\n\n";
-
- // Emit names of the anonymous subreg indices.
- if (SubRegIndices.size() > NamedIndices) {
- OS << " enum {";
- for (unsigned i = NamedIndices, e = SubRegIndices.size(); i != e; ++i) {
- OS << "\n " << SubRegIndices[i]->getName() << " = " << i+1;
- if (i+1 != e)
- OS << ',';
- }
- OS << "\n };\n\n";
- }
- OS << "\n";
-
std::string ClassName = Target.getName() + "GenRegisterInfo";
// Emit composeSubRegIndices
- OS << "unsigned " << ClassName
- << "::composeSubRegIndices(unsigned IdxA, unsigned IdxB) const {\n"
- << " switch (IdxA) {\n"
- << " default:\n return IdxB;\n";
- for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
- bool Open = false;
- for (unsigned j = 0; j != e; ++j) {
- if (CodeGenSubRegIndex *Comp =
+ if (!SubRegIndices.empty()) {
+ OS << "unsigned " << ClassName
+ << "::composeSubRegIndices(unsigned IdxA, unsigned IdxB) const {\n"
+ << " switch (IdxA) {\n"
+ << " default:\n return IdxB;\n";
+ for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
+ bool Open = false;
+ for (unsigned j = 0; j != e; ++j) {
+ if (CodeGenSubRegIndex *Comp =
SubRegIndices[i]->compose(SubRegIndices[j])) {
- if (!Open) {
- OS << " case " << SubRegIndices[i]->getQualifiedName()
- << ": switch(IdxB) {\n default: return IdxB;\n";
- Open = true;
+ if (!Open) {
+ OS << " case " << SubRegIndices[i]->getQualifiedName()
+ << ": switch(IdxB) {\n default: return IdxB;\n";
+ Open = true;
+ }
+ OS << " case " << SubRegIndices[j]->getQualifiedName()
+ << ": return " << Comp->getQualifiedName() << ";\n";
}
- OS << " case " << SubRegIndices[j]->getQualifiedName()
- << ": return " << Comp->getQualifiedName() << ";\n";
}
+ if (Open)
+ OS << " }\n";
}
- if (Open)
- OS << " }\n";
+ OS << " }\n}\n\n";
}
- OS << " }\n}\n\n";
// Emit getSubClassWithSubReg.
- OS << "const TargetRegisterClass *" << ClassName
- << "::getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx)"
- " const {\n";
- if (SubRegIndices.empty()) {
- OS << " assert(Idx == 0 && \"Target has no sub-registers\");\n"
- << " return RC;\n";
- } else {
+ if (!SubRegIndices.empty()) {
+ OS << "const TargetRegisterClass *" << ClassName
+ << "::getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx)"
+ << " const {\n";
// Use the smallest type that can hold a regclass ID with room for a
// sentinel.
if (RegisterClasses.size() < UINT8_MAX)
@@ -993,63 +1114,18 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
<< " if (!Idx) return RC;\n --Idx;\n"
<< " assert(Idx < " << SubRegIndices.size() << " && \"Bad subreg\");\n"
<< " unsigned TV = Table[RC->getID()][Idx];\n"
- << " return TV ? getRegClass(TV - 1) : 0;\n";
- }
- OS << "}\n\n";
-
- // Emit getMatchingSuperRegClass.
- OS << "const TargetRegisterClass *" << ClassName
- << "::getMatchingSuperRegClass(const TargetRegisterClass *A,"
- " const TargetRegisterClass *B, unsigned Idx) const {\n";
- if (SubRegIndices.empty()) {
- OS << " llvm_unreachable(\"Target has no sub-registers\");\n";
- } else {
- // We need to find the largest sub-class of A such that every register has
- // an Idx sub-register in B. Map (B, Idx) to a bit-vector of
- // super-register classes that map into B. Then compute the largest common
- // sub-class with A by taking advantage of the register class ordering,
- // like getCommonSubClass().
-
- // Bitvector table is NumRCs x NumSubIndexes x BVWords, where BVWords is
- // the number of 32-bit words required to represent all register classes.
- const unsigned BVWords = (RegisterClasses.size()+31)/32;
- BitVector BV(RegisterClasses.size());
-
- OS << " static const uint32_t Table[" << RegisterClasses.size()
- << "][" << SubRegIndices.size() << "][" << BVWords << "] = {\n";
- for (unsigned rci = 0, rce = RegisterClasses.size(); rci != rce; ++rci) {
- const CodeGenRegisterClass &RC = *RegisterClasses[rci];
- OS << " {\t// " << RC.getName() << "\n";
- for (unsigned sri = 0, sre = SubRegIndices.size(); sri != sre; ++sri) {
- CodeGenSubRegIndex *Idx = SubRegIndices[sri];
- BV.reset();
- RC.getSuperRegClasses(Idx, BV);
- OS << " { ";
- printBitVectorAsHex(OS, BV, 32);
- OS << "},\t// " << Idx->getName() << '\n';
- }
- OS << " },\n";
- }
- OS << " };\n assert(A && B && \"Missing regclass\");\n"
- << " --Idx;\n"
- << " assert(Idx < " << SubRegIndices.size() << " && \"Bad subreg\");\n"
- << " const uint32_t *TV = Table[B->getID()][Idx];\n"
- << " const uint32_t *SC = A->getSubClassMask();\n"
- << " for (unsigned i = 0; i != " << BVWords << "; ++i)\n"
- << " if (unsigned Common = TV[i] & SC[i])\n"
- << " return getRegClass(32*i + CountTrailingZeros_32(Common));\n"
- << " return 0;\n";
+ << " return TV ? getRegClass(TV - 1) : 0;\n}\n\n";
}
- OS << "}\n\n";
EmitRegUnitPressure(OS, RegBank, ClassName);
// Emit the constructor of the class...
OS << "extern const MCRegisterDesc " << TargetName << "RegDesc[];\n";
- OS << "extern const uint16_t " << TargetName << "RegLists[];\n";
- if (SubRegIndices.size() != 0)
- OS << "extern const uint16_t *get" << TargetName
- << "SubRegTable();\n";
+ OS << "extern const uint16_t " << TargetName << "RegDiffLists[];\n";
+ OS << "extern const char " << TargetName << "RegStrings[];\n";
+ OS << "extern const uint16_t " << TargetName << "RegUnitRoots[][2];\n";
+ OS << "extern const uint16_t " << TargetName << "SubRegIdxLists[];\n";
+ OS << "extern const uint16_t " << TargetName << "RegEncodingTable[];\n";
EmitRegMappingTables(OS, Regs, true);
@@ -1057,17 +1133,17 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
<< "(unsigned RA, unsigned DwarfFlavour, unsigned EHFlavour)\n"
<< " : TargetRegisterInfo(" << TargetName << "RegInfoDesc"
<< ", RegisterClasses, RegisterClasses+" << RegisterClasses.size() <<",\n"
- << " " << TargetName << "SubRegIndexTable) {\n"
+ << " SubRegIndexTable) {\n"
<< " InitMCRegisterInfo(" << TargetName << "RegDesc, "
<< Regs.size()+1 << ", RA,\n " << TargetName
<< "MCRegisterClasses, " << RegisterClasses.size() << ",\n"
- << " " << TargetName << "RegLists,\n"
- << " ";
- if (SubRegIndices.size() != 0)
- OS << "get" << TargetName << "SubRegTable(), "
- << SubRegIndices.size() << ");\n\n";
- else
- OS << "NULL, 0);\n\n";
+ << " " << TargetName << "RegUnitRoots,\n"
+ << " " << RegBank.getNumNativeRegUnits() << ",\n"
+ << " " << TargetName << "RegDiffLists,\n"
+ << " " << TargetName << "RegStrings,\n"
+ << " " << TargetName << "SubRegIdxLists,\n"
+ << " " << SubRegIndices.size() << ",\n"
+ << " " << TargetName << "RegEncodingTable);\n\n";
EmitRegMapping(OS, Regs, true);
@@ -1111,3 +1187,11 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
runTargetHeader(OS, Target, RegBank);
runTargetDesc(OS, Target, RegBank);
}
+
+namespace llvm {
+
+void EmitRegisterInfo(RecordKeeper &RK, raw_ostream &OS) {
+ RegisterInfoEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/RegisterInfoEmitter.h b/utils/TableGen/RegisterInfoEmitter.h
deleted file mode 100644
index ee9903c..0000000
--- a/utils/TableGen/RegisterInfoEmitter.h
+++ /dev/null
@@ -1,64 +0,0 @@
-//===- RegisterInfoEmitter.h - Generate a Register File Desc. ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend is responsible for emitting a description of a target
-// register file for a code generator. It uses instances of the Register,
-// RegisterAliases, and RegisterClass classes to gather this information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef REGISTER_INFO_EMITTER_H
-#define REGISTER_INFO_EMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include <vector>
-
-namespace llvm {
-
-class CodeGenRegBank;
-struct CodeGenRegister;
-class CodeGenTarget;
-
-class RegisterInfoEmitter : public TableGenBackend {
- RecordKeeper &Records;
-public:
- RegisterInfoEmitter(RecordKeeper &R) : Records(R) {}
-
- // runEnums - Print out enum values for all of the registers.
- void runEnums(raw_ostream &o, CodeGenTarget &Target, CodeGenRegBank &Bank);
-
- // runMCDesc - Print out MC register descriptions.
- void runMCDesc(raw_ostream &o, CodeGenTarget &Target, CodeGenRegBank &Bank);
-
- // runTargetHeader - Emit a header fragment for the register info emitter.
- void runTargetHeader(raw_ostream &o, CodeGenTarget &Target,
- CodeGenRegBank &Bank);
-
- // runTargetDesc - Output the target register and register file descriptions.
- void runTargetDesc(raw_ostream &o, CodeGenTarget &Target,
- CodeGenRegBank &Bank);
-
- // run - Output the register file description.
- void run(raw_ostream &o);
-
-private:
- void EmitRegMapping(raw_ostream &o,
- const std::vector<CodeGenRegister*> &Regs, bool isCtor);
- void EmitRegMappingTables(raw_ostream &o,
- const std::vector<CodeGenRegister*> &Regs,
- bool isCtor);
- void EmitRegClasses(raw_ostream &OS, CodeGenTarget &Target);
-
- void EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
- const std::string &ClassName);
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h
index 97c764e..d8ab2ee 100644
--- a/utils/TableGen/SequenceToOffsetTable.h
+++ b/utils/TableGen/SequenceToOffsetTable.h
@@ -81,6 +81,8 @@ public:
Seqs.erase(I);
}
+ bool empty() const { return Seqs.empty(); }
+
/// layout - Computes the final table layout.
void layout() {
assert(Entries == 0 && "Can only call layout() once");
diff --git a/utils/TableGen/SetTheory.cpp b/utils/TableGen/SetTheory.cpp
index 0649fd1..46e6db1 100644
--- a/utils/TableGen/SetTheory.cpp
+++ b/utils/TableGen/SetTheory.cpp
@@ -160,9 +160,17 @@ struct InterleaveOp : public SetTheory::Operator {
// (sequence "Format", From, To) Generate a sequence of records by name.
struct SequenceOp : public SetTheory::Operator {
void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
- if (Expr->arg_size() != 3)
+ int Step = 1;
+ if (Expr->arg_size() > 4)
throw "Bad args to (sequence \"Format\", From, To): " +
Expr->getAsString();
+ else if (Expr->arg_size() == 4) {
+ if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[3])) {
+ Step = II->getValue();
+ } else
+ throw "Stride must be an integer: " + Expr->getAsString();
+ }
+
std::string Format;
if (StringInit *SI = dynamic_cast<StringInit*>(Expr->arg_begin()[0]))
Format = SI->getValue();
@@ -187,8 +195,12 @@ struct SequenceOp : public SetTheory::Operator {
RecordKeeper &Records =
dynamic_cast<DefInit&>(*Expr->getOperator()).getDef()->getRecords();
- int Step = From <= To ? 1 : -1;
- for (To += Step; From != To; From += Step) {
+ Step *= From <= To ? 1 : -1;
+ while (true) {
+ if (Step > 0 && From > To)
+ break;
+ else if (Step < 0 && From < To)
+ break;
std::string Name;
raw_string_ostream OS(Name);
OS << format(Format.c_str(), unsigned(From));
@@ -200,6 +212,8 @@ struct SequenceOp : public SetTheory::Operator {
Elts.insert(Result->begin(), Result->end());
else
Elts.insert(Rec);
+
+ From += Step;
}
}
};
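The stride semantics are easiest to see in isolation. A standalone sketch mirroring the loop above (not library code; the backend expands each visited value into a record name via the format string):

    #include <stdio.h>

    // Normalize the sign of Step to walk from From toward To, and stop
    // once the value passes To.
    static void visitSequence(int From, int To, int Step) {
      Step *= From <= To ? 1 : -1;
      while (true) {
        if (Step > 0 && From > To)
          break;
        if (Step < 0 && From < To)
          break;
        printf("R%u\n", unsigned(From)); // e.g. format "R%u"
        From += Step;
      }
    }

    int main() {
      visitSequence(0, 15, 2); // prints R0, R2, ..., R14
      return 0;
    }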
diff --git a/utils/TableGen/StringToOffsetTable.h b/utils/TableGen/StringToOffsetTable.h
index 803f5bd..a098d7d 100644
--- a/utils/TableGen/StringToOffsetTable.h
+++ b/utils/TableGen/StringToOffsetTable.h
@@ -14,6 +14,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cctype>
namespace llvm {
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 986c50f..3472343 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -11,14 +11,60 @@
//
//===----------------------------------------------------------------------===//
-#include "SubtargetEmitter.h"
#include "CodeGenTarget.h"
-#include "llvm/TableGen/Record.h"
+#include "CodeGenSchedule.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/Debug.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
using namespace llvm;
+namespace {
+class SubtargetEmitter {
+
+ RecordKeeper &Records;
+ CodeGenSchedModels &SchedModels;
+ std::string Target;
+
+ void Enumeration(raw_ostream &OS, const char *ClassName, bool isBits);
+ unsigned FeatureKeyValues(raw_ostream &OS);
+ unsigned CPUKeyValues(raw_ostream &OS);
+ void FormItineraryStageString(const std::string &Names,
+ Record *ItinData, std::string &ItinString,
+ unsigned &NStages);
+ void FormItineraryOperandCycleString(Record *ItinData, std::string &ItinString,
+ unsigned &NOperandCycles);
+ void FormItineraryBypassString(const std::string &Names,
+ Record *ItinData,
+ std::string &ItinString, unsigned NOperandCycles);
+ void EmitStageAndOperandCycleData(raw_ostream &OS,
+ std::vector<std::vector<InstrItinerary> >
+ &ProcItinLists);
+ void EmitItineraries(raw_ostream &OS,
+ std::vector<std::vector<InstrItinerary> >
+ &ProcItinLists);
+ void EmitProcessorProp(raw_ostream &OS, const Record *R, const char *Name,
+ char Separator);
+ void EmitProcessorModels(raw_ostream &OS);
+ void EmitProcessorLookup(raw_ostream &OS);
+ void EmitSchedModel(raw_ostream &OS);
+ void ParseFeaturesFunction(raw_ostream &OS, unsigned NumFeatures,
+ unsigned NumProcs);
+
+public:
+ SubtargetEmitter(RecordKeeper &R, CodeGenTarget &TGT):
+ Records(R), SchedModels(TGT.getSchedModels()), Target(TGT.getName()) {}
+
+ void run(raw_ostream &o);
+
+};
+} // End anonymous namespace
+
//
// Enumeration - Emit the specified class as an enumeration.
//
@@ -196,28 +242,6 @@ unsigned SubtargetEmitter::CPUKeyValues(raw_ostream &OS) {
}
//
-// CollectAllItinClasses - Gathers and enumerates all the itinerary classes.
-// Returns itinerary class count.
-//
-unsigned SubtargetEmitter::
-CollectAllItinClasses(raw_ostream &OS,
- std::map<std::string, unsigned> &ItinClassesMap,
- std::vector<Record*> &ItinClassList) {
- // For each itinerary class
- unsigned N = ItinClassList.size();
- for (unsigned i = 0; i < N; i++) {
- // Next itinerary class
- const Record *ItinClass = ItinClassList[i];
- // Get name of itinerary class
- // Assign itinerary class a unique number
- ItinClassesMap[ItinClass->getName()] = i;
- }
-
- // Return itinerary class count
- return N;
-}
-
-//
// FormItineraryStageString - Compose a string containing the stage
// data initialization for the specified itinerary. N is the number
// of stages.
@@ -303,32 +327,31 @@ void SubtargetEmitter::FormItineraryBypassString(const std::string &Name,
}
//
-// EmitStageAndOperandCycleData - Generate unique itinerary stages and
-// operand cycle tables. Record itineraries for processors.
+// EmitStageAndOperandCycleData - Generate unique itinerary stages and operand
+// cycle tables. Create a list of InstrItinerary objects (ProcItinLists) indexed
+// by CodeGenSchedClass::Index.
//
-void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS,
- unsigned NItinClasses,
- std::map<std::string, unsigned> &ItinClassesMap,
- std::vector<Record*> &ItinClassList,
- std::vector<std::vector<InstrItinerary> > &ProcList) {
- // Gather processor iteraries
- std::vector<Record*> ProcItinList =
- Records.getAllDerivedDefinitions("ProcessorItineraries");
-
- // If just no itinerary then don't bother
- if (ProcItinList.size() < 2) return;
+void SubtargetEmitter::
+EmitStageAndOperandCycleData(raw_ostream &OS,
+ std::vector<std::vector<InstrItinerary> >
+ &ProcItinLists) {
+
+ // Multiple processor models may share an itinerary record. Emit it once.
+ SmallPtrSet<Record*, 8> ItinsDefSet;
// Emit functional units for all the itineraries.
- for (unsigned i = 0, N = ProcItinList.size(); i < N; ++i) {
- // Next record
- Record *Proc = ProcItinList[i];
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+
+ if (!ItinsDefSet.insert(PI->ItinsDef))
+ continue;
- std::vector<Record*> FUs = Proc->getValueAsListOfDefs("FU");
+ std::vector<Record*> FUs = PI->ItinsDef->getValueAsListOfDefs("FU");
if (FUs.empty())
continue;
- const std::string &Name = Proc->getName();
- OS << "\n// Functional units for itineraries \"" << Name << "\"\n"
+ const std::string &Name = PI->ItinsDef->getName();
+ OS << "\n// Functional units for \"" << Name << "\"\n"
<< "namespace " << Name << "FU {\n";
for (unsigned j = 0, FUN = FUs.size(); j < FUN; ++j)
@@ -337,7 +360,7 @@ void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS,
OS << "}\n";
- std::vector<Record*> BPs = Proc->getValueAsListOfDefs("BP");
+ std::vector<Record*> BPs = PI->ItinsDef->getValueAsListOfDefs("BP");
if (BPs.size()) {
OS << "\n// Pipeline forwarding pathes for itineraries \"" << Name
<< "\"\n" << "namespace " << Name << "Bypass {\n";
@@ -363,47 +386,57 @@ void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS,
// Begin pipeline bypass table
std::string BypassTable = "extern const unsigned " + Target +
- "ForwardingPathes[] = {\n";
- BypassTable += " 0, // No itinerary\n";
+ "ForwardingPaths[] = {\n";
+ BypassTable += " 0, // No itinerary\n";
+ // For each Itinerary across all processors, add a unique entry to the stages,
+ // operand cycles, and pipeline bypass tables. Then add the new Itinerary
+ // object with computed offsets to the ProcItinLists result.
unsigned StageCount = 1, OperandCycleCount = 1;
std::map<std::string, unsigned> ItinStageMap, ItinOperandMap;
- for (unsigned i = 0, N = ProcItinList.size(); i < N; i++) {
- // Next record
- Record *Proc = ProcItinList[i];
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ const CodeGenProcModel &ProcModel = *PI;
- // Get processor itinerary name
- const std::string &Name = Proc->getName();
+ // Add processor itinerary to the list.
+ ProcItinLists.resize(ProcItinLists.size()+1);
- // Skip default
- if (Name == "NoItineraries") continue;
+ // If this processor defines no itineraries, then leave the itinerary list
+ // empty.
+ std::vector<InstrItinerary> &ItinList = ProcItinLists.back();
+ if (ProcModel.ItinDefList.empty())
+ continue;
- // Create and expand processor itinerary to cover all itinerary classes
- std::vector<InstrItinerary> ItinList;
- ItinList.resize(NItinClasses);
+ // Reserve index==0 for NoItinerary.
+ ItinList.resize(SchedModels.numItineraryClasses()+1);
- // Get itinerary data list
- std::vector<Record*> ItinDataList = Proc->getValueAsListOfDefs("IID");
+ const std::string &Name = ProcModel.ItinsDef->getName();
// For each itinerary data
- for (unsigned j = 0, M = ItinDataList.size(); j < M; j++) {
+ for (unsigned SchedClassIdx = 0,
+ SchedClassEnd = ProcModel.ItinDefList.size();
+ SchedClassIdx < SchedClassEnd; ++SchedClassIdx) {
+
// Next itinerary data
- Record *ItinData = ItinDataList[j];
+ Record *ItinData = ProcModel.ItinDefList[SchedClassIdx];
// Get string and stage count
std::string ItinStageString;
- unsigned NStages;
- FormItineraryStageString(Name, ItinData, ItinStageString, NStages);
+ unsigned NStages = 0;
+ if (ItinData)
+ FormItineraryStageString(Name, ItinData, ItinStageString, NStages);
// Get string and operand cycle count
std::string ItinOperandCycleString;
- unsigned NOperandCycles;
- FormItineraryOperandCycleString(ItinData, ItinOperandCycleString,
- NOperandCycles);
-
+ unsigned NOperandCycles = 0;
std::string ItinBypassString;
- FormItineraryBypassString(Name, ItinData, ItinBypassString,
- NOperandCycles);
+ if (ItinData) {
+ FormItineraryOperandCycleString(ItinData, ItinOperandCycleString,
+ NOperandCycles);
+
+ FormItineraryBypassString(Name, ItinData, ItinBypassString,
+ NOperandCycles);
+ }
// Check to see if stage already exists and create if it doesn't
unsigned FindStage = 0;
@@ -443,33 +476,26 @@ void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS,
}
}
- // Locate where to inject into processor itinerary table
- const std::string &Name = ItinData->getValueAsDef("TheClass")->getName();
- unsigned Find = ItinClassesMap[Name];
-
// Set up itinerary as location and location + stage count
- unsigned NumUOps = ItinClassList[Find]->getValueAsInt("NumMicroOps");
+ int NumUOps = ItinData ? ItinData->getValueAsInt("NumMicroOps") : 0;
InstrItinerary Intinerary = { NumUOps, FindStage, FindStage + NStages,
FindOperandCycle,
FindOperandCycle + NOperandCycles};
// Inject - empty slots will be 0, 0
- ItinList[Find] = Intinerary;
+ ItinList[SchedClassIdx] = Intinerary;
}
-
- // Add process itinerary to list
- ProcList.push_back(ItinList);
}
// Closing stage
- StageTable += " { 0, 0, 0, llvm::InstrStage::Required } // End itinerary\n";
+ StageTable += " { 0, 0, 0, llvm::InstrStage::Required } // End stages\n";
StageTable += "};\n";
// Closing operand cycles
- OperandCycleTable += " 0 // End itinerary\n";
+ OperandCycleTable += " 0 // End operand cycles\n";
OperandCycleTable += "};\n";
- BypassTable += " 0 // End itinerary\n";
+ BypassTable += " 0 // End bypass tables\n";
BypassTable += "};\n";
// Emit tables.
@@ -479,61 +505,100 @@ void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS,
}
//
-// EmitProcessorData - Generate data for processor itineraries.
+// EmitItineraries - Generate data for processor itineraries that were
+// computed during EmitStageAndOperandCycleData(). ProcItinLists lists all
+// Itineraries for each processor. The Itinerary lists are indexed on
+// CodeGenSchedClass::Index.
//
void SubtargetEmitter::
-EmitProcessorData(raw_ostream &OS,
- std::vector<Record*> &ItinClassList,
- std::vector<std::vector<InstrItinerary> > &ProcList) {
- // Get an iterator for processor itinerary stages
+EmitItineraries(raw_ostream &OS,
+ std::vector<std::vector<InstrItinerary> > &ProcItinLists) {
+
+ // Multiple processor models may share an itinerary record. Emit it once.
+ SmallPtrSet<Record*, 8> ItinsDefSet;
+
+ // For each processor's machine model
std::vector<std::vector<InstrItinerary> >::iterator
- ProcListIter = ProcList.begin();
+ ProcItinListsIter = ProcItinLists.begin();
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
- // For each processor itinerary
- std::vector<Record*> Itins =
- Records.getAllDerivedDefinitions("ProcessorItineraries");
- for (unsigned i = 0, N = Itins.size(); i < N; i++) {
- // Next record
- Record *Itin = Itins[i];
+ Record *ItinsDef = PI->ItinsDef;
+ if (!ItinsDefSet.insert(ItinsDef))
+ continue;
// Get processor itinerary name
- const std::string &Name = Itin->getName();
+ const std::string &Name = ItinsDef->getName();
- // Skip default
- if (Name == "NoItineraries") continue;
+ // Get the itinerary list for the processor.
+ assert(ProcItinListsIter != ProcItinLists.end() && "bad iterator");
+ std::vector<InstrItinerary> &ItinList = *ProcItinListsIter++;
- // Begin processor itinerary table
OS << "\n";
- OS << "static const llvm::InstrItinerary " << Name << "[] = {\n";
+ OS << "static const llvm::InstrItinerary ";
+ if (ItinList.empty()) {
+ OS << '*' << Name << " = 0;\n";
+ continue;
+ }
- // For each itinerary class
- std::vector<InstrItinerary> &ItinList = *ProcListIter++;
- assert(ItinList.size() == ItinClassList.size() && "bad itinerary");
+ // Begin processor itinerary table
+ OS << Name << "[] = {\n";
+
+ // For each itinerary class in CodeGenSchedClass::Index order.
for (unsigned j = 0, M = ItinList.size(); j < M; ++j) {
InstrItinerary &Intinerary = ItinList[j];
- // Emit in the form of
+ // Emit Itinerary in the form of
// { firstStage, lastStage, firstCycle, lastCycle } // index
- if (Intinerary.FirstStage == 0) {
- OS << " { 1, 0, 0, 0, 0 }";
- } else {
- OS << " { " <<
- Intinerary.NumMicroOps << ", " <<
- Intinerary.FirstStage << ", " <<
- Intinerary.LastStage << ", " <<
- Intinerary.FirstOperandCycle << ", " <<
- Intinerary.LastOperandCycle << " }";
- }
-
- OS << ", // " << j << " " << ItinClassList[j]->getName() << "\n";
+ OS << " { " <<
+ Intinerary.NumMicroOps << ", " <<
+ Intinerary.FirstStage << ", " <<
+ Intinerary.LastStage << ", " <<
+ Intinerary.FirstOperandCycle << ", " <<
+ Intinerary.LastOperandCycle << " }" <<
+ ", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n";
}
-
// End processor itinerary table
- OS << " { 1, ~0U, ~0U, ~0U, ~0U } // end marker\n";
+ OS << " { 0, ~0U, ~0U, ~0U, ~0U } // end marker\n";
OS << "};\n";
}
}
+// Emit either the value defined in the TableGen Record, or the default
+// value defined in the C++ header. The Record is null if the processor does not
+// define a model.
+void SubtargetEmitter::EmitProcessorProp(raw_ostream &OS, const Record *R,
+ const char *Name, char Separator) {
+ OS << " ";
+ int V = R ? R->getValueAsInt(Name) : -1;
+ if (V >= 0)
+ OS << V << Separator << " // " << Name;
+ else
+ OS << "MCSchedModel::Default" << Name << Separator;
+ OS << '\n';
+}
+
+void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
+ // For each processor model.
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ // Begin processor model properties.
+ OS << "\n";
+ OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n";
+ EmitProcessorProp(OS, PI->ModelDef, "IssueWidth", ',');
+ EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ',');
+ EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
+ EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
+ EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
+ if (SchedModels.hasItineraryClasses())
+ OS << " " << PI->ItinsDef->getName();
+ else
+ OS << " 0";
+ OS << ");\n";
+ }
+}
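As a hedged illustration, for a hypothetical processor model Foo that sets IssueWidth and LoadLatency but leaves the other properties unset, the code above emits a fragment shaped like:

    // Illustrative output only; the model name and values are invented.
    static const llvm::MCSchedModel FooModel(
      2, // IssueWidth
      MCSchedModel::DefaultMinLatency,
      4, // LoadLatency
      MCSchedModel::DefaultHighLatency,
      MCSchedModel::DefaultMispredictPenalty,
      FooItineraries);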
+
//
// EmitProcessorLookup - generate cpu name to itinerary lookup table.
//
@@ -547,7 +612,7 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
OS << "\n";
OS << "// Sorted (by key) array of itineraries for CPU subtype.\n"
<< "extern const llvm::SubtargetInfoKV "
- << Target << "ProcItinKV[] = {\n";
+ << Target << "ProcSchedKV[] = {\n";
// For each processor
for (unsigned i = 0, N = ProcessorList.size(); i < N;) {
@@ -555,13 +620,13 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
Record *Processor = ProcessorList[i];
const std::string &Name = Processor->getValueAsString("Name");
- const std::string &ProcItin =
- Processor->getValueAsDef("ProcItin")->getName();
+ const std::string &ProcModelName =
+ SchedModels.getProcModel(Processor).ModelName;
// Emit as { "cpu", procinit },
OS << " { "
<< "\"" << Name << "\", "
- << "(void *)&" << ProcItin;
+ << "(void *)&" << ProcModelName;
OS << " }";
@@ -576,31 +641,19 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
}
//
-// EmitData - Emits all stages and itineries, folding common patterns.
+// EmitSchedModel - Emits all scheduling model tables, folding common patterns.
//
-void SubtargetEmitter::EmitData(raw_ostream &OS) {
- std::map<std::string, unsigned> ItinClassesMap;
- // Gather and sort all itinerary classes
- std::vector<Record*> ItinClassList =
- Records.getAllDerivedDefinitions("InstrItinClass");
- std::sort(ItinClassList.begin(), ItinClassList.end(), LessRecord());
-
- // Enumerate all the itinerary classes
- unsigned NItinClasses = CollectAllItinClasses(OS, ItinClassesMap,
- ItinClassList);
- // Make sure the rest is worth the effort
- HasItineraries = NItinClasses != 1; // Ignore NoItinerary.
-
- if (HasItineraries) {
- std::vector<std::vector<InstrItinerary> > ProcList;
+void SubtargetEmitter::EmitSchedModel(raw_ostream &OS) {
+ if (SchedModels.hasItineraryClasses()) {
+ std::vector<std::vector<InstrItinerary> > ProcItinLists;
// Emit the stage data
- EmitStageAndOperandCycleData(OS, NItinClasses, ItinClassesMap,
- ItinClassList, ProcList);
- // Emit the processor itinerary data
- EmitProcessorData(OS, ItinClassList, ProcList);
- // Emit the processor lookup data
- EmitProcessorLookup(OS);
+ EmitStageAndOperandCycleData(OS, ProcItinLists);
+ EmitItineraries(OS, ProcItinLists);
}
+ // Emit the processor machine model
+ EmitProcessorModels(OS);
+ // Emit the processor lookup data
+ EmitProcessorLookup(OS);
}
//
@@ -620,7 +673,7 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS,
OS << Target;
OS << "Subtarget::ParseSubtargetFeatures(StringRef CPU, StringRef FS) {\n"
<< " DEBUG(dbgs() << \"\\nFeatures:\" << FS);\n"
- << " DEBUG(dbgs() << \"\\nCPU:\" << CPU);\n";
+ << " DEBUG(dbgs() << \"\\nCPU:\" << CPU << \"\\n\\n\");\n";
if (Features.empty()) {
OS << "}\n";
@@ -654,9 +707,7 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS,
// SubtargetEmitter::run - Main subtarget enumeration emitter.
//
void SubtargetEmitter::run(raw_ostream &OS) {
- Target = CodeGenTarget(Records).getName();
-
- EmitSourceFileHeader("Subtarget Enumeration Source Fragment", OS);
+ emitSourceFileHeader("Subtarget Enumeration Source Fragment", OS);
OS << "\n#ifdef GET_SUBTARGETINFO_ENUM\n";
OS << "#undef GET_SUBTARGETINFO_ENUM\n";
@@ -677,7 +728,7 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "\n";
unsigned NumProcs = CPUKeyValues(OS);
OS << "\n";
- EmitData(OS);
+ EmitSchedModel(OS);
OS << "\n";
#if 0
OS << "}\n";
@@ -696,11 +747,11 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << Target << "SubTypeKV, ";
else
OS << "0, ";
- if (HasItineraries) {
- OS << Target << "ProcItinKV, "
+ if (SchedModels.hasItineraryClasses()) {
+ OS << Target << "ProcSchedKV, "
<< Target << "Stages, "
<< Target << "OperandCycles, "
- << Target << "ForwardingPathes, ";
+ << Target << "ForwardingPaths, ";
} else
OS << "0, 0, 0, 0, ";
OS << NumFeatures << ", " << NumProcs << ");\n}\n\n";
@@ -742,11 +793,11 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "namespace llvm {\n";
OS << "extern const llvm::SubtargetFeatureKV " << Target << "FeatureKV[];\n";
OS << "extern const llvm::SubtargetFeatureKV " << Target << "SubTypeKV[];\n";
- if (HasItineraries) {
- OS << "extern const llvm::SubtargetInfoKV " << Target << "ProcItinKV[];\n";
+ if (SchedModels.hasItineraryClasses()) {
+ OS << "extern const llvm::SubtargetInfoKV " << Target << "ProcSchedKV[];\n";
OS << "extern const llvm::InstrStage " << Target << "Stages[];\n";
OS << "extern const unsigned " << Target << "OperandCycles[];\n";
- OS << "extern const unsigned " << Target << "ForwardingPathes[];\n";
+ OS << "extern const unsigned " << Target << "ForwardingPaths[];\n";
}
OS << ClassName << "::" << ClassName << "(StringRef TT, StringRef CPU, "
@@ -761,11 +812,11 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << Target << "SubTypeKV, ";
else
OS << "0, ";
- if (HasItineraries) {
- OS << Target << "ProcItinKV, "
+ if (SchedModels.hasItineraryClasses()) {
+ OS << Target << "ProcSchedKV, "
<< Target << "Stages, "
<< Target << "OperandCycles, "
- << Target << "ForwardingPathes, ";
+ << Target << "ForwardingPaths, ";
} else
OS << "0, 0, 0, 0, ";
OS << NumFeatures << ", " << NumProcs << ");\n}\n\n";
@@ -773,3 +824,12 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "#endif // GET_SUBTARGETINFO_CTOR\n\n";
}
+
+namespace llvm {
+
+void EmitSubtarget(RecordKeeper &RK, raw_ostream &OS) {
+ CodeGenTarget CGTarget(RK);
+ SubtargetEmitter(RK, CGTarget).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/SubtargetEmitter.h b/utils/TableGen/SubtargetEmitter.h
deleted file mode 100644
index ff01274..0000000
--- a/utils/TableGen/SubtargetEmitter.h
+++ /dev/null
@@ -1,72 +0,0 @@
-//===- SubtargetEmitter.h - Generate subtarget enumerations -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tablegen backend emits subtarget enumerations.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SUBTARGET_EMITTER_H
-#define SUBTARGET_EMITTER_H
-
-#include "llvm/TableGen/TableGenBackend.h"
-#include "llvm/MC/MCInstrItineraries.h"
-#include <vector>
-#include <map>
-#include <string>
-
-
-namespace llvm {
-
-class SubtargetEmitter : public TableGenBackend {
-
- RecordKeeper &Records;
- std::string Target;
- bool HasItineraries;
-
- void Enumeration(raw_ostream &OS, const char *ClassName, bool isBits);
- unsigned FeatureKeyValues(raw_ostream &OS);
- unsigned CPUKeyValues(raw_ostream &OS);
- unsigned CollectAllItinClasses(raw_ostream &OS,
- std::map<std::string,unsigned> &ItinClassesMap,
- std::vector<Record*> &ItinClassList);
- void FormItineraryStageString(const std::string &Names,
- Record *ItinData, std::string &ItinString,
- unsigned &NStages);
- void FormItineraryOperandCycleString(Record *ItinData, std::string &ItinString,
- unsigned &NOperandCycles);
- void FormItineraryBypassString(const std::string &Names,
- Record *ItinData,
- std::string &ItinString, unsigned NOperandCycles);
- void EmitStageAndOperandCycleData(raw_ostream &OS, unsigned NItinClasses,
- std::map<std::string, unsigned> &ItinClassesMap,
- std::vector<Record*> &ItinClassList,
- std::vector<std::vector<InstrItinerary> > &ProcList);
- void EmitProcessorData(raw_ostream &OS,
- std::vector<Record*> &ItinClassList,
- std::vector<std::vector<InstrItinerary> > &ProcList);
- void EmitProcessorLookup(raw_ostream &OS);
- void EmitData(raw_ostream &OS);
- void ParseFeaturesFunction(raw_ostream &OS, unsigned NumFeatures,
- unsigned NumProcs);
-
-public:
- SubtargetEmitter(RecordKeeper &R) : Records(R), HasItineraries(false) {}
-
- // run - Output the subtarget enumerations, returning true on failure.
- void run(raw_ostream &o);
-
-};
-
-
-} // End llvm namespace
-
-#endif
-
-
-
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 8c41358..9695b4a 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -11,22 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "AsmMatcherEmitter.h"
-#include "AsmWriterEmitter.h"
-#include "CallingConvEmitter.h"
-#include "CodeEmitterGen.h"
-#include "DAGISelEmitter.h"
-#include "DFAPacketizerEmitter.h"
-#include "DisassemblerEmitter.h"
-#include "EDEmitter.h"
-#include "FastISelEmitter.h"
-#include "InstrInfoEmitter.h"
-#include "IntrinsicEmitter.h"
-#include "PseudoLoweringEmitter.h"
-#include "RegisterInfoEmitter.h"
-#include "SubtargetEmitter.h"
-#include "SetTheory.h"
+#include "TableGenBackends.h" // Declares all backends.
+#include "SetTheory.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
@@ -102,7 +89,7 @@ namespace {
cl::opt<std::string>
Class("class", cl::desc("Print Enum list for this class"),
cl::value_desc("class name"));
-
+
class LLVMTableGenAction : public TableGenAction {
public:
bool operator()(raw_ostream &OS, RecordKeeper &Records) {
@@ -111,49 +98,49 @@ namespace {
OS << Records; // No argument, dump all contents
break;
case GenEmitter:
- CodeEmitterGen(Records).run(OS);
+ EmitCodeEmitter(Records, OS);
break;
case GenRegisterInfo:
- RegisterInfoEmitter(Records).run(OS);
+ EmitRegisterInfo(Records, OS);
break;
case GenInstrInfo:
- InstrInfoEmitter(Records).run(OS);
+ EmitInstrInfo(Records, OS);
break;
case GenCallingConv:
- CallingConvEmitter(Records).run(OS);
+ EmitCallingConv(Records, OS);
break;
case GenAsmWriter:
- AsmWriterEmitter(Records).run(OS);
+ EmitAsmWriter(Records, OS);
break;
case GenAsmMatcher:
- AsmMatcherEmitter(Records).run(OS);
+ EmitAsmMatcher(Records, OS);
break;
case GenDisassembler:
- DisassemblerEmitter(Records).run(OS);
+ EmitDisassembler(Records, OS);
break;
case GenPseudoLowering:
- PseudoLoweringEmitter(Records).run(OS);
+ EmitPseudoLowering(Records, OS);
break;
case GenDAGISel:
- DAGISelEmitter(Records).run(OS);
+ EmitDAGISel(Records, OS);
break;
case GenDFAPacketizer:
- DFAGen(Records).run(OS);
+ EmitDFAPacketizer(Records, OS);
break;
case GenFastISel:
- FastISelEmitter(Records).run(OS);
+ EmitFastISel(Records, OS);
break;
case GenSubtarget:
- SubtargetEmitter(Records).run(OS);
+ EmitSubtarget(Records, OS);
break;
case GenIntrinsic:
- IntrinsicEmitter(Records).run(OS);
+ EmitIntrinsics(Records, OS);
break;
case GenTgtIntrinsic:
- IntrinsicEmitter(Records, true).run(OS);
+ EmitIntrinsics(Records, OS, true);
break;
case GenEDInfo:
- EDEmitter(Records).run(OS);
+ EmitEnhancedDisassemblerInfo(Records, OS);
break;
case PrintEnums:
{
@@ -179,7 +166,7 @@ namespace {
break;
}
}
-
+
return false;
}
};
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
new file mode 100644
index 0000000..2c00c40
--- /dev/null
+++ b/utils/TableGen/TableGenBackends.h
@@ -0,0 +1,78 @@
+//===- TableGenBackends.h - Declarations for LLVM TableGen Backends -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations for all of the LLVM TableGen
+// backends. A "TableGen backend" is just a function. See below for a
+// precise description.
+//
+//===----------------------------------------------------------------------===//
+
+
+// A TableGen backend is a function that looks like
+//
+// EmitFoo(RecordKeeper &RK, raw_ostream &OS /*, anything else you need */ )
+//
+// What you do inside of that function is up to you, but it will usually
+// involve generating C++ code to the provided raw_ostream.
+//
+// The RecordKeeper is just a top-level container for an in-memory
+// representation of the data encoded in the TableGen file. What a TableGen
+// backend does is walk around that in-memory representation and generate
+// stuff based on the information it contains.
+//
+// The in-memory representation is a node-graph (think of it like JSON but
+// with a richer ontology of types), where the nodes are subclasses of
+// Record. The methods `getClass`, `getDef` are the basic interface to
+// access the node-graph. RecordKeeper also provides a handy method
+// `getAllDerivedDefinitions`. Consult "include/llvm/TableGen/Record.h" for
+// the exact interfaces provided by Record and RecordKeeper.
+//
+// A common pattern for TableGen backends is for the EmitFoo function to
+// instantiate a class which holds some context for the generation process,
+// and then have most of the work happen in that class's methods. This
+// pattern partly has historical roots in the previous TableGen backend API
+// that involved a class and an invocation like `FooEmitter(RK).run(OS)`.
+//
+// Remember to wrap private things in an anonymous namespace. For most
+// backends, this means that the EmitFoo function is the only thing not in
+// the anonymous namespace.
+
+
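A minimal skeleton of the pattern just described (FooEmitter and the "Foo" class are placeholders, not an existing backend):

    #include "llvm/TableGen/Record.h"
    #include "llvm/Support/raw_ostream.h"
    #include <vector>

    namespace {
    class FooEmitter {
      llvm::RecordKeeper &Records;
    public:
      FooEmitter(llvm::RecordKeeper &RK) : Records(RK) {}
      void run(llvm::raw_ostream &OS) {
        // Walk the in-memory record graph and emit one line per def.
        std::vector<llvm::Record*> Defs =
            Records.getAllDerivedDefinitions("Foo");
        for (unsigned i = 0, e = Defs.size(); i != e; ++i)
          OS << "// def " << Defs[i]->getName() << "\n";
      }
    };
    } // End anonymous namespace

    namespace llvm {
    void EmitFoo(RecordKeeper &RK, raw_ostream &OS) {
      FooEmitter(RK).run(OS);
    }
    } // End llvm namespace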
+// FIXME: Reorganize TableGen so that build dependencies can be more
+// accurately expressed. Currently, touching any of the emitters (or
+// anything that they transitively depend on) causes everything dependent
+// on TableGen to be rebuilt (this includes all the targets!). Perhaps have
+// a standalone TableGen binary and have the backends be loadable modules
+// of some sort; then the dependency could be expressed as being on the
+// module, and all the modules would have a common dependency on the
+// TableGen binary with as few dependencies as possible on the rest of
+// LLVM.
+
+
+namespace llvm {
+
+class raw_ostream;
+class RecordKeeper;
+
+void EmitIntrinsics(RecordKeeper &RK, raw_ostream &OS, bool TargetOnly = false);
+void EmitAsmMatcher(RecordKeeper &RK, raw_ostream &OS);
+void EmitAsmWriter(RecordKeeper &RK, raw_ostream &OS);
+void EmitCallingConv(RecordKeeper &RK, raw_ostream &OS);
+void EmitCodeEmitter(RecordKeeper &RK, raw_ostream &OS);
+void EmitDAGISel(RecordKeeper &RK, raw_ostream &OS);
+void EmitDFAPacketizer(RecordKeeper &RK, raw_ostream &OS);
+void EmitDisassembler(RecordKeeper &RK, raw_ostream &OS);
+void EmitEnhancedDisassemblerInfo(RecordKeeper &RK, raw_ostream &OS);
+void EmitFastISel(RecordKeeper &RK, raw_ostream &OS);
+void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS);
+void EmitPseudoLowering(RecordKeeper &RK, raw_ostream &OS);
+void EmitRegisterInfo(RecordKeeper &RK, raw_ostream &OS);
+void EmitSubtarget(RecordKeeper &RK, raw_ostream &OS);
+
+} // End llvm namespace
diff --git a/utils/TableGen/X86DisassemblerShared.h b/utils/TableGen/X86DisassemblerShared.h
index 0417e9d..c13a0cc 100644
--- a/utils/TableGen/X86DisassemblerShared.h
+++ b/utils/TableGen/X86DisassemblerShared.h
@@ -14,6 +14,7 @@
#include <string.h>
#define INSTRUCTION_SPECIFIER_FIELDS \
+ struct OperandSpecifier operands[X86_MAX_OPERANDS]; \
bool filtered; \
InstructionContext insnContext; \
std::string name; \
diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp
index 2875168..f3bd373 100644
--- a/utils/TableGen/X86DisassemblerTables.cpp
+++ b/utils/TableGen/X86DisassemblerTables.cpp
@@ -21,10 +21,11 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include <map>
using namespace llvm;
using namespace X86Disassembler;
-
+
/// inheritsFrom - Indicates whether all instructions in one class also belong
/// to another class.
///
@@ -36,7 +37,7 @@ static inline bool inheritsFrom(InstructionContext child,
bool VEX_LIG = false) {
if (child == parent)
return true;
-
+
switch (parent) {
case IC:
return(inheritsFrom(child, IC_64BIT) ||
@@ -117,17 +118,17 @@ static inline bool inheritsFrom(InstructionContext child,
/// @param upper - The class that may be preferable
/// @param lower - The class that may be less preferable
/// @return - True if upper is to be preferred, false otherwise.
-static inline bool outranks(InstructionContext upper,
+static inline bool outranks(InstructionContext upper,
InstructionContext lower) {
assert(upper < IC_max);
assert(lower < IC_max);
-
+
#define ENUM_ENTRY(n, r, d) r,
static int ranks[IC_max] = {
INSTRUCTION_CONTEXTS
};
#undef ENUM_ENTRY
-
+
return (ranks[upper] > ranks[lower]);
}
@@ -170,24 +171,22 @@ static inline const char* stringForOperandEncoding(OperandEncoding encoding) {
}
}
-void DisassemblerTables::emitOneID(raw_ostream &o,
- uint32_t &i,
- InstrUID id,
+void DisassemblerTables::emitOneID(raw_ostream &o, unsigned &i, InstrUID id,
bool addComma) const {
if (id)
o.indent(i * 2) << format("0x%hx", id);
else
o.indent(i * 2) << 0;
-
+
if (addComma)
o << ", ";
else
o << " ";
-
+
o << "/* ";
o << InstructionSpecifiers[id].name;
o << "*/";
-
+
o << "\n";
}
@@ -197,8 +196,7 @@ void DisassemblerTables::emitOneID(raw_ostream &o,
///
/// @param o - The output stream on which to emit the table.
/// @param i - The indentation level for that output stream.
-static void emitEmptyTable(raw_ostream &o, uint32_t &i)
-{
+static void emitEmptyTable(raw_ostream &o, unsigned &i) {
o.indent(i * 2) << "0x0, /* EmptyTable */\n";
}
@@ -207,15 +205,12 @@ static void emitEmptyTable(raw_ostream &o, uint32_t &i)
///
/// @param decision - The decision to be compacted.
/// @return - The compactest available representation for the decision.
-static ModRMDecisionType getDecisionType(ModRMDecision &decision)
-{
+static ModRMDecisionType getDecisionType(ModRMDecision &decision) {
bool satisfiesOneEntry = true;
bool satisfiesSplitRM = true;
bool satisfiesSplitReg = true;
- uint16_t index;
-
- for (index = 0; index < 256; ++index) {
+ for (unsigned index = 0; index < 256; ++index) {
if (decision.instructionIDs[index] != decision.instructionIDs[0])
satisfiesOneEntry = false;
@@ -252,27 +247,25 @@ static ModRMDecisionType getDecisionType(ModRMDecision &decision)
/// to a particular decision type.
///
/// @param dt - The decision type.
-/// @return - A pointer to the statically-allocated string (e.g.,
+/// @return - A pointer to the statically-allocated string (e.g.,
/// "MODRM_ONEENTRY" for MODRM_ONEENTRY).
-static const char* stringForDecisionType(ModRMDecisionType dt)
-{
+static const char* stringForDecisionType(ModRMDecisionType dt) {
#define ENUM_ENTRY(n) case n: return #n;
switch (dt) {
default:
- llvm_unreachable("Unknown decision type");
+ llvm_unreachable("Unknown decision type");
MODRMTYPES
- };
+ };
#undef ENUM_ENTRY
}
-
+
/// stringForModifierType - Returns a statically-allocated string corresponding
/// to an opcode modifier type.
///
/// @param mt - The modifier type.
/// @return - A pointer to the statically-allocated string (e.g.,
/// "MODIFIER_NONE" for MODIFIER_NONE).
-static const char* stringForModifierType(ModifierType mt)
-{
+static const char* stringForModifierType(ModifierType mt) {
#define ENUM_ENTRY(n) case n: return #n;
switch(mt) {
default:
@@ -281,35 +274,31 @@ static const char* stringForModifierType(ModifierType mt)
};
#undef ENUM_ENTRY
}
-
+
DisassemblerTables::DisassemblerTables() {
unsigned i;
-
+
for (i = 0; i < array_lengthof(Tables); i++) {
Tables[i] = new ContextDecision;
memset(Tables[i], 0, sizeof(ContextDecision));
}
-
+
HasConflicts = false;
}
-
+
DisassemblerTables::~DisassemblerTables() {
unsigned i;
-
+
for (i = 0; i < array_lengthof(Tables); i++)
delete Tables[i];
}
-
-void DisassemblerTables::emitModRMDecision(raw_ostream &o1,
- raw_ostream &o2,
- uint32_t &i1,
- uint32_t &i2,
- ModRMDecision &decision)
- const {
- static uint64_t sTableNumber = 0;
- static uint64_t sEntryNumber = 1;
+
+void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
+ unsigned &i1, unsigned &i2,
+ ModRMDecision &decision) const {
+ static uint32_t sTableNumber = 0;
+ static uint32_t sEntryNumber = 1;
ModRMDecisionType dt = getDecisionType(decision);
- uint16_t index;
if (dt == MODRM_ONEENTRY && decision.instructionIDs[0] == 0)
{
@@ -338,13 +327,13 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1,
emitOneID(o1, i1, decision.instructionIDs[0xc0], true); // mod = 0b11
break;
case MODRM_SPLITREG:
- for (index = 0; index < 64; index += 8)
+ for (unsigned index = 0; index < 64; index += 8)
emitOneID(o1, i1, decision.instructionIDs[index], true);
- for (index = 0xc0; index < 256; index += 8)
+ for (unsigned index = 0xc0; index < 256; index += 8)
emitOneID(o1, i1, decision.instructionIDs[index], true);
break;
case MODRM_FULL:
- for (index = 0; index < 256; ++index)
+ for (unsigned index = 0; index < 256; ++index)
emitOneID(o1, i1, decision.instructionIDs[index], true);
break;
}
@@ -380,20 +369,15 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1,
++sTableNumber;
}
-void DisassemblerTables::emitOpcodeDecision(
- raw_ostream &o1,
- raw_ostream &o2,
- uint32_t &i1,
- uint32_t &i2,
- OpcodeDecision &decision) const {
- uint16_t index;
-
+void DisassemblerTables::emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2,
+ unsigned &i1, unsigned &i2,
+ OpcodeDecision &decision) const {
o2.indent(i2) << "{ /* struct OpcodeDecision */" << "\n";
i2++;
o2.indent(i2) << "{" << "\n";
i2++;
- for (index = 0; index < 256; ++index) {
+ for (unsigned index = 0; index < 256; ++index) {
o2.indent(i2);
o2 << "/* 0x" << format("%02hhx", index) << " */" << "\n";
@@ -412,21 +396,16 @@ void DisassemblerTables::emitOpcodeDecision(
o2.indent(i2) << "}" << "\n";
}
-void DisassemblerTables::emitContextDecision(
- raw_ostream &o1,
- raw_ostream &o2,
- uint32_t &i1,
- uint32_t &i2,
- ContextDecision &decision,
- const char* name) const {
+void DisassemblerTables::emitContextDecision(raw_ostream &o1, raw_ostream &o2,
+ unsigned &i1, unsigned &i2,
+ ContextDecision &decision,
+ const char* name) const {
o2.indent(i2) << "static const struct ContextDecision " << name << " = {\n";
i2++;
o2.indent(i2) << "{ /* opcodeDecisions */" << "\n";
i2++;
- unsigned index;
-
- for (index = 0; index < IC_max; ++index) {
+ for (unsigned index = 0; index < IC_max; ++index) {
o2.indent(i2) << "/* ";
o2 << stringForContext((InstructionContext)index);
o2 << " */";
@@ -444,58 +423,81 @@ void DisassemblerTables::emitContextDecision(
o2.indent(i2) << "};" << "\n";
}
-void DisassemblerTables::emitInstructionInfo(raw_ostream &o, uint32_t &i)
- const {
+void DisassemblerTables::emitInstructionInfo(raw_ostream &o,
+ unsigned &i) const {
+ unsigned NumInstructions = InstructionSpecifiers.size();
+
+ o << "static const struct OperandSpecifier x86OperandSets[]["
+ << X86_MAX_OPERANDS << "] = {\n";
+
+ typedef std::vector<std::pair<const char *, const char *> > OperandListTy;
+ std::map<OperandListTy, unsigned> OperandSets;
+
+ unsigned OperandSetNum = 0;
+ for (unsigned Index = 0; Index < NumInstructions; ++Index) {
+ OperandListTy OperandList;
+
+ for (unsigned OperandIndex = 0; OperandIndex < X86_MAX_OPERANDS;
+ ++OperandIndex) {
+ const char *Encoding =
+ stringForOperandEncoding((OperandEncoding)InstructionSpecifiers[Index]
+ .operands[OperandIndex].encoding);
+ const char *Type =
+ stringForOperandType((OperandType)InstructionSpecifiers[Index]
+ .operands[OperandIndex].type);
+ OperandList.push_back(std::make_pair(Encoding, Type));
+ }
+ unsigned &N = OperandSets[OperandList];
+ if (N != 0) continue;
+
+ N = ++OperandSetNum;
+
+ o << " { /* " << (OperandSetNum - 1) << " */\n";
+ for (unsigned i = 0, e = OperandList.size(); i != e; ++i) {
+ o << " { " << OperandList[i].first << ", "
+ << OperandList[i].second << " },\n";
+ }
+ o << " },\n";
+ }
+ o << "};" << "\n\n";
+
o.indent(i * 2) << "static const struct InstructionSpecifier ";
o << INSTRUCTIONS_STR "[" << InstructionSpecifiers.size() << "] = {\n";
-
- i++;
- uint16_t numInstructions = InstructionSpecifiers.size();
- uint16_t index, operandIndex;
+ i++;
- for (index = 0; index < numInstructions; ++index) {
+ for (unsigned index = 0; index < NumInstructions; ++index) {
o.indent(i * 2) << "{ /* " << index << " */" << "\n";
i++;
o.indent(i * 2) << stringForModifierType(
(ModifierType)InstructionSpecifiers[index].modifierType);
- o << "," << "\n";
+ o << ",\n";
o.indent(i * 2) << "0x";
o << format("%02hhx", (uint16_t)InstructionSpecifiers[index].modifierBase);
- o << "," << "\n";
-
- o.indent(i * 2) << "{" << "\n";
- i++;
-
- for (operandIndex = 0; operandIndex < X86_MAX_OPERANDS; ++operandIndex) {
- o.indent(i * 2) << "{ ";
- o <<stringForOperandEncoding((OperandEncoding)InstructionSpecifiers[index]
- .operands[operandIndex]
- .encoding);
- o << ", ";
- o << stringForOperandType((OperandType)InstructionSpecifiers[index]
- .operands[operandIndex]
- .type);
- o << " }";
-
- if (operandIndex < X86_MAX_OPERANDS - 1)
- o << ",";
-
- o << "\n";
+ o << ",\n";
+
+ OperandListTy OperandList;
+ for (unsigned OperandIndex = 0; OperandIndex < X86_MAX_OPERANDS;
+ ++OperandIndex) {
+ const char *Encoding =
+ stringForOperandEncoding((OperandEncoding)InstructionSpecifiers[index]
+ .operands[OperandIndex].encoding);
+ const char *Type =
+ stringForOperandType((OperandType)InstructionSpecifiers[index]
+ .operands[OperandIndex].type);
+ OperandList.push_back(std::make_pair(Encoding, Type));
}
+ o.indent(i * 2) << (OperandSets[OperandList] - 1) << ",\n";
- i--;
- o.indent(i * 2) << "}," << "\n";
-
o.indent(i * 2) << "/* " << InstructionSpecifiers[index].name << " */";
o << "\n";
i--;
o.indent(i * 2) << "}";
- if (index + 1 < numInstructions)
+ if (index + 1 < NumInstructions)
o << ",";
o << "\n";
@@ -505,14 +507,12 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o, uint32_t &i)
o.indent(i * 2) << "};" << "\n";
}
-void DisassemblerTables::emitContextTable(raw_ostream &o, uint32_t &i) const {
- uint16_t index;
-
- o.indent(i * 2) << "static const InstructionContext " CONTEXTS_STR
+void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
+ o.indent(i * 2) << "static const uint8_t " CONTEXTS_STR
"[256] = {\n";
i++;
- for (index = 0; index < 256; ++index) {
+ for (unsigned index = 0; index < 256; ++index) {
o.indent(i * 2);
if ((index & ATTR_VEXL) && (index & ATTR_REXW) && (index & ATTR_OPSIZE))
@@ -545,7 +545,7 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, uint32_t &i) const {
o << "IC_64BIT_REXW_XS";
else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XD))
o << "IC_64BIT_REXW_XD";
- else if ((index & ATTR_64BIT) && (index & ATTR_REXW) &&
+ else if ((index & ATTR_64BIT) && (index & ATTR_REXW) &&
(index & ATTR_OPSIZE))
o << "IC_64BIT_REXW_OPSIZE";
else if ((index & ATTR_64BIT) && (index & ATTR_XD) && (index & ATTR_OPSIZE))
@@ -593,11 +593,8 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, uint32_t &i) const {
o.indent(i * 2) << "};" << "\n";
}
-void DisassemblerTables::emitContextDecisions(raw_ostream &o1,
- raw_ostream &o2,
- uint32_t &i1,
- uint32_t &i2)
- const {
+void DisassemblerTables::emitContextDecisions(raw_ostream &o1, raw_ostream &o2,
+ unsigned &i1, unsigned &i2) const {
emitContextDecision(o1, o2, i1, i2, *Tables[0], ONEBYTE_STR);
emitContextDecision(o1, o2, i1, i2, *Tables[1], TWOBYTE_STR);
emitContextDecision(o1, o2, i1, i2, *Tables[2], THREEBYTE38_STR);
@@ -607,15 +604,15 @@ void DisassemblerTables::emitContextDecisions(raw_ostream &o1,
}
void DisassemblerTables::emit(raw_ostream &o) const {
- uint32_t i1 = 0;
- uint32_t i2 = 0;
-
+ unsigned i1 = 0;
+ unsigned i2 = 0;
+
std::string s1;
std::string s2;
-
+
raw_string_ostream o1(s1);
raw_string_ostream o2(s2);
-
+
emitInstructionInfo(o, i2);
o << "\n";
@@ -641,9 +638,7 @@ void DisassemblerTables::setTableFields(ModRMDecision &decision,
const ModRMFilter &filter,
InstrUID uid,
uint8_t opcode) {
- unsigned index;
-
- for (index = 0; index < 256; ++index) {
+ for (unsigned index = 0; index < 256; ++index) {
if (filter.accepts(index)) {
if (decision.instructionIDs[index] == uid)
continue;
@@ -653,10 +648,10 @@ void DisassemblerTables::setTableFields(ModRMDecision &decision,
InstructionSpecifiers[uid];
InstructionSpecifier &previousInfo =
InstructionSpecifiers[decision.instructionIDs[index]];
-
+
if(newInfo.filtered)
continue; // filtered instructions get lowest priority
-
+
if(previousInfo.name == "NOOP" && (newInfo.name == "XCHG16ar" ||
newInfo.name == "XCHG32ar" ||
newInfo.name == "XCHG32ar64" ||
@@ -665,7 +660,7 @@ void DisassemblerTables::setTableFields(ModRMDecision &decision,
if (outranks(previousInfo.insnContext, newInfo.insnContext))
continue;
-
+
if (previousInfo.insnContext == newInfo.insnContext &&
!previousInfo.filtered) {
errs() << "Error: Primary decode conflict: ";
@@ -690,17 +685,15 @@ void DisassemblerTables::setTableFields(OpcodeType type,
InstrUID uid,
bool is32bit,
bool ignoresVEX_L) {
- unsigned index;
-
ContextDecision &decision = *Tables[type];
- for (index = 0; index < IC_max; ++index) {
+ for (unsigned index = 0; index < IC_max; ++index) {
if (is32bit && inheritsFrom((InstructionContext)index, IC_64BIT))
continue;
- if (inheritsFrom((InstructionContext)index,
+ if (inheritsFrom((InstructionContext)index,
InstructionSpecifiers[uid].insnContext, ignoresVEX_L))
- setTableFields(decision.opcodeDecisions[index].modRMDecisions[opcode],
+ setTableFields(decision.opcodeDecisions[index].modRMDecisions[opcode],
filter,
uid,
opcode);
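
The conflict-resolution rules this function applies are easy to lose in the diff noise: filtered entries always lose, the XCHG*ar forms displace NOOP at opcode 0x90, and an existing entry whose context outranks the newcomer keeps its slot; two unfiltered entries in the same context are reported as a primary decode conflict. A condensed restatement (Info and outranks are stand-ins, not the real types):

    #include <string>

    struct Info {
      std::string name;
      int insnContext;
      bool filtered;
    };

    // Hypothetical stand-in for the real context comparison.
    static bool outranks(int prev, int next) { return prev > next; }

    // Condensed restatement of the priority rules in the hunks above.
    static bool newEntryWins(const Info &prev, const Info &next) {
      if (next.filtered)
        return false;               // filtered entries get lowest priority
      if (prev.name == "NOOP" && next.name.compare(0, 4, "XCHG") == 0)
        return true;                // XCHG*ar displaces NOOP at opcode 0x90
      if (outranks(prev.insnContext, next.insnContext))
        return false;               // an outranking context keeps its slot
      return true;                  // equal unfiltered contexts conflict
    }
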
diff --git a/utils/TableGen/X86DisassemblerTables.h b/utils/TableGen/X86DisassemblerTables.h
index e148cd2..ea006c0 100644
--- a/utils/TableGen/X86DisassemblerTables.h
+++ b/utils/TableGen/X86DisassemblerTables.h
@@ -42,13 +42,13 @@ private:
/// [4] three-byte opcodes of the form 0f a6 __
/// [5] three-byte opcodes of the form 0f a7 __
ContextDecision* Tables[6];
-
+
/// The instruction information table
std::vector<InstructionSpecifier> InstructionSpecifiers;
-
+
/// True if there are primary decode conflicts in the instruction set
bool HasConflicts;
-
+
/// emitOneID - Emits a table entry for a single instruction entry, at the
/// innermost level of the structure hierarchy. The entry is printed out
/// in the format "nnnn, /* MNEMONIC */" where nnnn is the ID in decimal,
@@ -64,7 +64,7 @@ private:
uint32_t &i,
InstrUID id,
bool addComma) const;
-
+
/// emitModRMDecision - Emits a table of entries corresponding to a single
/// ModR/M decision. Compacts the ModR/M decision if possible. ModR/M
/// decisions are printed as:
@@ -77,12 +77,12 @@ private:
/// where nnnn is a unique ID for the corresponding table of IDs.
/// TYPE indicates whether the table has one entry that is the same
/// regardless of ModR/M byte, two entries - one for bytes 0x00-0xbf and one
- /// for bytes 0xc0-0xff -, or 256 entries, one for each possible byte.
+ /// for bytes 0xc0-0xff -, or 256 entries, one for each possible byte.
/// nnnn is the number of a table for looking up these values. The tables
/// are written separately so that tables consisting entirely of zeros will
/// not be duplicated. (These all have the name modRMEmptyTable.) A table
/// is printed as:
- ///
+ ///
/// InstrUID modRMTablennnn[k] = {
/// nnnn, /* MNEMONIC */
/// ...
@@ -100,7 +100,7 @@ private:
uint32_t &i1,
uint32_t &i2,
ModRMDecision &decision) const;
-
+
/// emitOpcodeDecision - Emits an OpcodeDecision and all its subsidiary ModR/M
/// decisions. An OpcodeDecision is printed as:
///
@@ -129,8 +129,8 @@ private:
uint32_t &i1,
uint32_t &i2,
OpcodeDecision &decision) const;
-
- /// emitContextDecision - Emits a ContextDecision and all its subsidiary
+
+ /// emitContextDecision - Emits a ContextDecision and all its subsidiary
/// Opcode and ModRMDecisions. A ContextDecision is printed as:
///
/// struct ContextDecision NAME = {
@@ -163,10 +163,10 @@ private:
void emitContextDecision(raw_ostream &o1,
raw_ostream &o2,
uint32_t &i1,
- uint32_t &i2,
+ uint32_t &i2,
ContextDecision &decision,
const char* name) const;
-
+
/// emitInstructionInfo - Prints the instruction specifier table, which has
/// one entry for each instruction, and contains name and operand
/// information. This table is printed as:
@@ -187,17 +187,17 @@ private:
/// };
///
/// k is the total number of instructions.
- /// nnnn is the ID of the current instruction (0-based). This table
+ /// nnnn is the ID of the current instruction (0-based). This table
/// includes entries for non-instructions like PHINODE.
/// 0xnn is the lowest possible opcode for the current instruction, used for
/// AddRegFrm instructions to compute the operand's value.
/// ENCODING and TYPE describe the encoding and type for a single operand.
///
- /// @param o - The output stream to which the instruction table should be
+ /// @param o - The output stream to which the instruction table should be
/// written.
/// @param i - The indent level for use with the stream.
void emitInstructionInfo(raw_ostream &o, uint32_t &i) const;
-
+
/// emitContextTable - Prints the table that is used to translate from an
/// instruction attribute mask to an instruction context. This table is
/// printed as:
@@ -213,7 +213,7 @@ private:
/// @param o - The output stream to which the context table should be written.
/// @param i - The indent level for use with the stream.
void emitContextTable(raw_ostream &o, uint32_t &i) const;
-
+
/// emitContextDecisions - Prints all four ContextDecision structures using
/// emitContextDecision().
///
@@ -225,7 +225,7 @@ private:
void emitContextDecisions(raw_ostream &o1,
raw_ostream &o2,
uint32_t &i1,
- uint32_t &i2) const;
+ uint32_t &i2) const;
/// setTableFields - Uses a ModRMFilter to set the appropriate entries in a
/// ModRMDecision to refer to a particular instruction ID.
@@ -241,14 +241,14 @@ private:
public:
/// Constructor - Allocates space for the class decisions and clears them.
DisassemblerTables();
-
+
~DisassemblerTables();
-
+
/// emit - Emits the instruction table, context table, and class decisions.
///
/// @param o - The output stream to print the tables to.
void emit(raw_ostream &o) const;
-
+
/// setTableFields - Uses the opcode type, instruction context, opcode, and a
/// ModRMFilter as criteria to set a particular set of entries in the
/// decode tables to point to a specific uid.
@@ -268,24 +268,24 @@ public:
const ModRMFilter &filter,
InstrUID uid,
bool is32bit,
- bool ignoresVEX_L);
-
+ bool ignoresVEX_L);
+
/// specForUID - Returns the instruction specifier for a given unique
/// instruction ID. Used when resolving collisions.
///
/// @param uid - The unique ID of the instruction.
- /// @return - A reference to the instruction specifier.
+ /// @return - A reference to the instruction specifier.
InstructionSpecifier& specForUID(InstrUID uid) {
if (uid >= InstructionSpecifiers.size())
InstructionSpecifiers.resize(uid + 1);
-
+
return InstructionSpecifiers[uid];
}
-
+
// hasConflicts - Reports whether there were primary decode conflicts
// from any instructions added to the tables.
// @return - true if there were; false otherwise.
-
+
bool hasConflicts() {
return HasConflicts;
}
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 6a01cce..7ac2336 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -57,19 +57,19 @@ namespace X86Local {
MRMDestMem = 4,
MRMSrcReg = 5,
MRMSrcMem = 6,
- MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19,
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19,
MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23,
MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27,
MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31,
MRMInitReg = 32,
+ RawFrmImm8 = 43,
+ RawFrmImm16 = 44,
#define MAP(from, to) MRM_##from = to,
MRM_MAPPING
#undef MAP
- RawFrmImm8 = 43,
- RawFrmImm16 = 44,
lastMRM
};
-
+
enum {
TB = 1,
REP = 2,
@@ -82,17 +82,17 @@ namespace X86Local {
}
// If rows are added to the opcode extension tables, then corresponding entries
-// must be added here.
+// must be added here.
//
// If the row corresponds to a single byte (i.e., 8f), then add an entry for
// that byte to ONE_BYTE_EXTENSION_TABLES.
//
-// If the row corresponds to two bytes where the first is 0f, add an entry for
+// If the row corresponds to two bytes where the first is 0f, add an entry for
// the second byte to TWO_BYTE_EXTENSION_TABLES.
//
// If the row corresponds to some other set of bytes, you will need to modify
// the code in RecognizableInstr::emitDecodePath() as well, and add new prefixes
-// to the X86 TD files, except in two cases: if the first two bytes of such a
+// to the X86 TD files, except in two cases: if the first two bytes of such a
// new combination are 0f 38 or 0f 3a, you just have to add maps called
// THREE_BYTE_38_EXTENSION_TABLES and THREE_BYTE_3A_EXTENSION_TABLES and add a
// switch(Opcode) just below the case X86Local::T8: or case X86Local::TA: line
@@ -116,7 +116,7 @@ namespace X86Local {
EXTENSION_TABLE(f7) \
EXTENSION_TABLE(fe) \
EXTENSION_TABLE(ff)
-
+
#define TWO_BYTE_EXTENSION_TABLES \
EXTENSION_TABLE(00) \
EXTENSION_TABLE(01) \
@@ -134,7 +134,7 @@ namespace X86Local {
using namespace X86Disassembler;
/// needsModRMForDecode - Indicates whether a particular instruction requires a
-/// ModR/M byte for the instruction to be properly decoded. For example, a
+/// ModR/M byte for the instruction to be properly decoded. For example, a
/// MRMDestReg instruction needs the Mod field in the ModR/M byte to be set to
/// 0b11.
///
@@ -213,17 +213,17 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
Rec = insn.TheDef;
Name = Rec->getName();
Spec = &tables.specForUID(UID);
-
+
if (!Rec->isSubClassOf("X86Inst")) {
ShouldBeEmitted = false;
return;
}
-
+
Prefix = byteFromRec(Rec, "Prefix");
Opcode = byteFromRec(Rec, "Opcode");
Form = byteFromRec(Rec, "FormBits");
SegOvr = byteFromRec(Rec, "SegOvrBits");
-
+
HasOpSizePrefix = Rec->getValueAsBit("hasOpSizePrefix");
HasAdSizePrefix = Rec->getValueAsBit("hasAdSizePrefix");
HasREX_WPrefix = Rec->getValueAsBit("hasREX_WPrefix");
@@ -235,12 +235,12 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
IgnoresVEX_L = Rec->getValueAsBit("ignoresVEX_L");
HasLockPrefix = Rec->getValueAsBit("hasLockPrefix");
IsCodeGenOnly = Rec->getValueAsBit("isCodeGenOnly");
-
+
Name = Rec->getName();
AsmString = Rec->getValueAsString("AsmString");
-
+
Operands = &insn.Operands.OperandList;
-
+
IsSSE = (HasOpSizePrefix && (Name.find("16") == Name.npos)) ||
(Name.find("CRC32") != Name.npos);
HasFROperands = hasFROperands();
@@ -262,32 +262,32 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
}
}
// FIXME: These instructions aren't marked as 64-bit in any way
- Is64Bit |= Rec->getName() == "JMP64pcrel32" ||
- Rec->getName() == "MASKMOVDQU64" ||
- Rec->getName() == "POPFS64" ||
- Rec->getName() == "POPGS64" ||
- Rec->getName() == "PUSHFS64" ||
+ Is64Bit |= Rec->getName() == "JMP64pcrel32" ||
+ Rec->getName() == "MASKMOVDQU64" ||
+ Rec->getName() == "POPFS64" ||
+ Rec->getName() == "POPGS64" ||
+ Rec->getName() == "PUSHFS64" ||
Rec->getName() == "PUSHGS64" ||
Rec->getName() == "REX64_PREFIX" ||
- Rec->getName().find("MOV64") != Name.npos ||
+ Rec->getName().find("MOV64") != Name.npos ||
Rec->getName().find("PUSH64") != Name.npos ||
Rec->getName().find("POP64") != Name.npos;
ShouldBeEmitted = true;
}
-
+
void RecognizableInstr::processInstr(DisassemblerTables &tables,
- const CodeGenInstruction &insn,
- InstrUID uid)
+ const CodeGenInstruction &insn,
+ InstrUID uid)
{
// Ignore "asm parser only" instructions.
if (insn.TheDef->getValueAsBit("isAsmParserOnly"))
return;
-
+
RecognizableInstr recogInstr(tables, insn, uid);
-
+
recogInstr.emitInstructionSpecifier(tables);
-
+
if (recogInstr.shouldBeEmitted())
recogInstr.emitDecodePath(tables);
}
@@ -386,55 +386,40 @@ InstructionContext RecognizableInstr::insnContext() const {
return insnContext;
}
-
+
RecognizableInstr::filter_ret RecognizableInstr::filter() const {
///////////////////
// FILTER_STRONG
//
-
+
// Filter out intrinsics
-
- if (!Rec->isSubClassOf("X86Inst"))
- return FILTER_STRONG;
-
+
+ assert(Rec->isSubClassOf("X86Inst") && "Can only filter X86 instructions");
+
if (Form == X86Local::Pseudo ||
(IsCodeGenOnly && Name.find("_REV") == Name.npos))
return FILTER_STRONG;
-
- if (Form == X86Local::MRMInitReg)
- return FILTER_STRONG;
-
-
+
+
// Filter out artificial instructions but leave in the LOCK_PREFIX so it is
// printed as a separate "instruction".
-
+
if (Name.find("_Int") != Name.npos ||
- Name.find("Int_") != Name.npos ||
- Name.find("_NOREX") != Name.npos ||
- Name.find("2SDL") != Name.npos)
+ Name.find("Int_") != Name.npos)
return FILTER_STRONG;
// Filter out instructions with segment override prefixes.
// They're too messy to handle now and we'll special case them if needed.
-
+
if (SegOvr)
return FILTER_STRONG;
-
- // Filter out instructions that can't be printed.
-
- if (AsmString.size() == 0)
- return FILTER_STRONG;
-
- // Filter out instructions with subreg operands.
-
- if (AsmString.find("subreg") != AsmString.npos)
- return FILTER_STRONG;
+
/////////////////
// FILTER_WEAK
//
-
+
// Filter out instructions with a LOCK prefix;
// prefer forms that do not have the prefix
if (HasLockPrefix)
@@ -474,9 +459,9 @@ RecognizableInstr::filter_ret RecognizableInstr::filter() const {
return FILTER_WEAK;
if (HasFROperands && Name.find("MOV") != Name.npos &&
- ((Name.find("2") != Name.npos && Name.find("32") == Name.npos) ||
+ ((Name.find("2") != Name.npos && Name.find("32") == Name.npos) ||
(Name.find("to") != Name.npos)))
- return FILTER_WEAK;
+ return FILTER_STRONG;
return FILTER_NORMAL;
}
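
The last hunk above promotes the ambiguous FR-register MOV forms from FILTER_WEAK to FILTER_STRONG, that is, from "emit but mark filtered, losing any conflict" to "drop from the tables outright". As a rough model of the three tiers (the enum names are the real ones; the surrounding scaffolding is not):

    enum filter_ret { FILTER_STRONG, FILTER_WEAK, FILTER_NORMAL };

    struct Spec { bool filtered; };

    // Sketch of how emitInstructionSpecifier reacts to filter(): STRONG
    // drops the instruction, WEAK emits it but marks it filtered so it
    // loses any decode conflict, NORMAL emits it as-is.
    static bool applyFilter(filter_ret f, Spec &spec) {
      switch (f) {
      case FILTER_STRONG:
        return false;               // not emitted into the decode tables
      case FILTER_WEAK:
        spec.filtered = true;       // emitted with lowest priority
        return true;
      case FILTER_NORMAL:
        return true;
      }
      return true;
    }
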
@@ -487,7 +472,7 @@ bool RecognizableInstr::hasFROperands() const {
for (unsigned operandIndex = 0; operandIndex < numOperands; ++operandIndex) {
const std::string &recName = OperandList[operandIndex].Rec->getName();
-
+
if (recName.find("FR") != recName.npos)
return true;
}
@@ -497,57 +482,57 @@ bool RecognizableInstr::hasFROperands() const {
bool RecognizableInstr::has256BitOperands() const {
const std::vector<CGIOperandList::OperandInfo> &OperandList = *Operands;
unsigned numOperands = OperandList.size();
-
+
for (unsigned operandIndex = 0; operandIndex < numOperands; ++operandIndex) {
const std::string &recName = OperandList[operandIndex].Rec->getName();
-
- if (!recName.compare("VR256") || !recName.compare("f256mem")) {
+
+ if (!recName.compare("VR256")) {
return true;
}
}
return false;
}
-
-void RecognizableInstr::handleOperand(
- bool optional,
- unsigned &operandIndex,
- unsigned &physicalOperandIndex,
- unsigned &numPhysicalOperands,
- unsigned *operandMapping,
- OperandEncoding (*encodingFromString)(const std::string&, bool hasOpSizePrefix)) {
+
+void RecognizableInstr::handleOperand(bool optional, unsigned &operandIndex,
+ unsigned &physicalOperandIndex,
+ unsigned &numPhysicalOperands,
+ const unsigned *operandMapping,
+ OperandEncoding (*encodingFromString)
+ (const std::string&,
+ bool hasOpSizePrefix)) {
if (optional) {
if (physicalOperandIndex >= numPhysicalOperands)
return;
} else {
assert(physicalOperandIndex < numPhysicalOperands);
}
-
+
while (operandMapping[operandIndex] != operandIndex) {
Spec->operands[operandIndex].encoding = ENCODING_DUP;
Spec->operands[operandIndex].type =
(OperandType)(TYPE_DUP0 + operandMapping[operandIndex]);
++operandIndex;
}
-
+
const std::string &typeName = (*Operands)[operandIndex].Rec->getName();
Spec->operands[operandIndex].encoding = encodingFromString(typeName,
HasOpSizePrefix);
- Spec->operands[operandIndex].type = typeFromString(typeName,
+ Spec->operands[operandIndex].type = typeFromString(typeName,
IsSSE,
HasREX_WPrefix,
HasOpSizePrefix);
-
+
++operandIndex;
++physicalOperandIndex;
}
void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
Spec->name = Name;
-
- if (!Rec->isSubClassOf("X86Inst"))
+
+ if (!ShouldBeEmitted)
return;
-
+
switch (filter()) {
case FILTER_WEAK:
Spec->filtered = true;
@@ -558,29 +543,26 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
case FILTER_NORMAL:
break;
}
-
+
Spec->insnContext = insnContext();
-
+
const std::vector<CGIOperandList::OperandInfo> &OperandList = *Operands;
-
- unsigned operandIndex;
+
unsigned numOperands = OperandList.size();
unsigned numPhysicalOperands = 0;
-
+
// operandMapping maps from operands in OperandList to their originals.
// If operandMapping[i] != i, then the entry is a duplicate.
unsigned operandMapping[X86_MAX_OPERANDS];
-
- bool hasFROperands = false;
-
assert(numOperands <= X86_MAX_OPERANDS && "X86_MAX_OPERANDS is not large enough");
-
- for (operandIndex = 0; operandIndex < numOperands; ++operandIndex) {
+
+ for (unsigned operandIndex = 0; operandIndex < numOperands; ++operandIndex) {
if (OperandList[operandIndex].Constraints.size()) {
const CGIOperandList::ConstraintInfo &Constraint =
OperandList[operandIndex].Constraints[0];
if (Constraint.isTied()) {
- operandMapping[operandIndex] = Constraint.getTiedOperand();
+ operandMapping[operandIndex] = operandIndex;
+ operandMapping[Constraint.getTiedOperand()] = operandIndex;
} else {
++numPhysicalOperands;
operandMapping[operandIndex] = operandIndex;
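
The tied-operand fix above changes which slot carries the redirection: the operand with the constraint now keeps its own index while the slot it is tied to is pointed at it, and handleOperand later turns every redirected slot into an ENCODING_DUP/TYPE_DUPn entry. A freestanding sketch of that marking pass (simplified; the real code folds it into handleOperand's leading while loop):

    // Sketch of how operandMapping drives DUP emission: any slot whose
    // mapping differs from its own index is emitted as a duplicate of the
    // mapped index rather than as an independently encoded operand.
    enum { ENCODING_NORMAL, ENCODING_DUP };

    struct Operand { int encoding; unsigned dupOf; };

    static void markDups(const unsigned *mapping, Operand *ops, unsigned n) {
      for (unsigned i = 0; i != n; ++i) {
        if (mapping[i] != i) {
          ops[i].encoding = ENCODING_DUP;
          ops[i].dupOf = mapping[i]; // TYPE_DUP0 + mapping[i] in the real code
        }
      }
    }
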
@@ -589,20 +571,7 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
++numPhysicalOperands;
operandMapping[operandIndex] = operandIndex;
}
-
- const std::string &recName = OperandList[operandIndex].Rec->getName();
-
- if (recName.find("FR") != recName.npos)
- hasFROperands = true;
}
-
- if (hasFROperands && Name.find("MOV") != Name.npos &&
- ((Name.find("2") != Name.npos && Name.find("32") == Name.npos) ||
- (Name.find("to") != Name.npos)))
- ShouldBeEmitted = false;
-
- if (!ShouldBeEmitted)
- return;
#define HANDLE_OPERAND(class) \
handleOperand(false, \
@@ -611,7 +580,7 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
numPhysicalOperands, \
operandMapping, \
class##EncodingFromString);
-
+
#define HANDLE_OPTIONAL(class) \
handleOperand(true, \
operandIndex, \
@@ -619,17 +588,17 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
numPhysicalOperands, \
operandMapping, \
class##EncodingFromString);
-
+
// operandIndex should always be < numOperands
- operandIndex = 0;
+ unsigned operandIndex = 0;
// physicalOperandIndex should always be < numPhysicalOperands
unsigned physicalOperandIndex = 0;
-
+
switch (Form) {
case X86Local::RawFrm:
// Operand 1 (optional) is an address or immediate.
// Operand 2 (optional) is an immediate.
- assert(numPhysicalOperands <= 2 &&
+ assert(numPhysicalOperands <= 2 &&
"Unexpected number of operands for RawFrm");
HANDLE_OPTIONAL(relocation)
HANDLE_OPTIONAL(immediate)
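
For reference, HANDLE_OPERAND and HANDLE_OPTIONAL are thin wrappers over handleOperand that differ only in the leading bool. A self-contained toy version showing the mandatory/optional behavior (every name here is invented except the macro shape):

    #include <cassert>
    #include <cstdio>
    #include <string>

    static int roRegisterEncodingFromString(const std::string &s) {
      return s == "GR32" ? 1 : 0;    // hypothetical encoding table
    }

    // A mandatory operand asserts availability; an optional one silently
    // returns once the physical operands are exhausted.
    static void handleOperand(bool optional, unsigned &idx, unsigned &physIdx,
                              unsigned numPhys,
                              int (*encodingFromString)(const std::string &)) {
      if (optional) {
        if (physIdx >= numPhys)
          return;                    // nothing left; optional operand skipped
      } else {
        assert(physIdx < numPhys && "mandatory operand missing");
      }
      std::printf("operand %u -> encoding %d\n", idx,
                  encodingFromString("GR32"));
      ++idx;
      ++physIdx;
    }

    #define HANDLE_OPERAND(cls)  handleOperand(false, i, p, n, cls##EncodingFromString);
    #define HANDLE_OPTIONAL(cls) handleOperand(true, i, p, n, cls##EncodingFromString);

    int main() {
      unsigned i = 0, p = 0, n = 1;
      HANDLE_OPERAND(roRegister)   // consumes the one physical operand
      HANDLE_OPTIONAL(roRegister)  // no-op: physical operands exhausted
    }
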
@@ -653,14 +622,14 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
else
assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 3 &&
"Unexpected number of operands for MRMDestRegFrm");
-
+
HANDLE_OPERAND(rmRegister)
if (HasVEX_4VPrefix)
// FIXME: In AVX, the register below becomes the one encoded
// in ModRMVEX and the one above the one in the VEX.VVVV field
HANDLE_OPERAND(vvvvRegister)
-
+
HANDLE_OPERAND(roRegister)
HANDLE_OPTIONAL(immediate)
break;
@@ -681,7 +650,7 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
// FIXME: In AVX, the register below becomes the one encoded
// in ModRMVEX and the one above the one in the VEX.VVVV field
HANDLE_OPERAND(vvvvRegister)
-
+
HANDLE_OPERAND(roRegister)
HANDLE_OPTIONAL(immediate)
break;
@@ -690,14 +659,15 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
// Operand 2 is a register operand in the R/M field.
// - In AVX, there is a register operand in the VEX.vvvv field here -
// Operand 3 (optional) is an immediate.
+ // Operand 4 (optional) is an immediate.
if (HasVEX_4VPrefix || HasVEX_4VOp3Prefix)
assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 5 &&
- "Unexpected number of operands for MRMSrcRegFrm with VEX_4V");
+ "Unexpected number of operands for MRMSrcRegFrm with VEX_4V");
else
- assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 3 &&
+ assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 4 &&
"Unexpected number of operands for MRMSrcRegFrm");
-
+
HANDLE_OPERAND(roRegister)
if (HasVEX_4VPrefix)
@@ -716,6 +686,7 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
if (!HasMemOp4Prefix)
HANDLE_OPTIONAL(immediate)
HANDLE_OPTIONAL(immediate) // above might be a register in 7:4
+ HANDLE_OPTIONAL(immediate)
break;
case X86Local::MRMSrcMem:
// Operand 1 is a register operand in the Reg/Opcode field.
@@ -725,11 +696,11 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
if (HasVEX_4VPrefix || HasVEX_4VOp3Prefix)
assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 5 &&
- "Unexpected number of operands for MRMSrcMemFrm with VEX_4V");
+ "Unexpected number of operands for MRMSrcMemFrm with VEX_4V");
else
assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 3 &&
"Unexpected number of operands for MRMSrcMemFrm");
-
+
HANDLE_OPERAND(roRegister)
if (HasVEX_4VPrefix)
@@ -759,16 +730,18 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
case X86Local::MRM7r:
// Operand 1 is a register operand in the R/M field.
// Operand 2 (optional) is an immediate or relocation.
+ // Operand 3 (optional) is an immediate.
if (HasVEX_4VPrefix)
assert(numPhysicalOperands <= 3 &&
"Unexpected number of operands for MRMnRFrm with VEX_4V");
else
- assert(numPhysicalOperands <= 2 &&
+ assert(numPhysicalOperands <= 3 &&
"Unexpected number of operands for MRMnRFrm");
if (HasVEX_4VPrefix)
HANDLE_OPERAND(vvvvRegister)
HANDLE_OPTIONAL(rmRegister)
HANDLE_OPTIONAL(relocation)
+ HANDLE_OPTIONAL(immediate)
break;
case X86Local::MRM0m:
case X86Local::MRM1m:
@@ -809,7 +782,7 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
// Ignored.
break;
}
-
+
#undef HANDLE_OPERAND
#undef HANDLE_OPTIONAL
}
@@ -823,8 +796,8 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
break;
OpcodeType opcodeType = (OpcodeType)-1;
-
- ModRMFilter* filter = NULL;
+
+ ModRMFilter* filter = NULL;
uint8_t opcodeToSet = 0;
switch (Prefix) {
@@ -1022,26 +995,26 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
if(Spec->modifierType != MODIFIER_MODRM) {
assert(opcodeToSet < 0xf9 &&
"Not enough room for all ADDREG_FRM operands");
-
+
uint8_t currentOpcode;
for (currentOpcode = opcodeToSet;
currentOpcode < opcodeToSet + 8;
++currentOpcode)
- tables.setTableFields(opcodeType,
- insnContext(),
- currentOpcode,
- *filter,
+ tables.setTableFields(opcodeType,
+ insnContext(),
+ currentOpcode,
+ *filter,
UID, Is32Bit, IgnoresVEX_L);
-
+
Spec->modifierType = MODIFIER_OPCODE;
Spec->modifierBase = opcodeToSet;
} else {
// modifierBase was set where MODIFIER_MODRM was set
- tables.setTableFields(opcodeType,
- insnContext(),
- opcodeToSet,
- *filter,
+ tables.setTableFields(opcodeType,
+ insnContext(),
+ opcodeToSet,
+ *filter,
UID, Is32Bit, IgnoresVEX_L);
}
} else {
@@ -1050,13 +1023,13 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
opcodeToSet,
*filter,
UID, Is32Bit, IgnoresVEX_L);
-
+
Spec->modifierType = MODIFIER_NONE;
Spec->modifierBase = opcodeToSet;
}
-
+
delete filter;
-
+
#undef MAP
}
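
The loop above registers one instruction under eight consecutive opcodes because AddRegFrm encodes the register in the low three bits of the opcode byte. A freestanding sketch of that fan-out (the flat table type is invented; the real code goes through setTableFields):

    #include <cstdint>

    typedef uint16_t InstrUID;

    // AddRegFrm: the opcode's low three bits select the register, so one
    // instruction owns opcodeToSet .. opcodeToSet+7 in the decode table.
    static void fanOut(InstrUID table[256], uint8_t opcodeToSet, InstrUID uid) {
      // Mirrors the assert in the hunk: the base opcode must leave room
      // for all eight register-encoded variants.
      if (opcodeToSet >= 0xf9)
        return;
      for (unsigned op = opcodeToSet; op < opcodeToSet + 8u; ++op)
        table[op] = uid;
    }
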
@@ -1066,7 +1039,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
bool hasREX_WPrefix,
bool hasOpSizePrefix) {
if (isSSE) {
- // For SSE instructions, we ignore the OpSize prefix and force operand
+ // For SSE instructions, we ignore the OpSize prefix and force operand
// sizes.
TYPE("GR16", TYPE_R16)
TYPE("GR32", TYPE_R32)
@@ -1140,6 +1113,10 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
TYPE("GR16_NOAX", TYPE_Rv)
TYPE("GR32_NOAX", TYPE_Rv)
TYPE("GR64_NOAX", TYPE_R64)
+ TYPE("vx32mem", TYPE_M32)
+ TYPE("vy32mem", TYPE_M32)
+ TYPE("vx64mem", TYPE_M64)
+ TYPE("vy64mem", TYPE_M64)
errs() << "Unhandled type string " << s << "\n";
llvm_unreachable("Unhandled type string");
}
@@ -1243,6 +1220,10 @@ OperandEncoding RecognizableInstr::memoryEncodingFromString
ENCODING("opaque48mem", ENCODING_RM)
ENCODING("opaque80mem", ENCODING_RM)
ENCODING("opaque512mem", ENCODING_RM)
+ ENCODING("vx32mem", ENCODING_RM)
+ ENCODING("vy32mem", ENCODING_RM)
+ ENCODING("vx64mem", ENCODING_RM)
+ ENCODING("vy64mem", ENCODING_RM)
errs() << "Unhandled memory encoding " << s << "\n";
llvm_unreachable("Unhandled memory encoding");
}
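
The new vx/vy entries cover AVX2 gather-style memory operands whose index register is a vector (x for an XMM index, y for a YMM index); they decode as ordinary ModR/M memory operands, hence ENCODING_RM, and are typed by element size. The added lines follow the string-matching macro style used throughout this file, roughly as below (a sketch of the pattern, not the real macro definition):

    #include <string>

    enum OperandType { TYPE_M32, TYPE_M64, TYPE_UNKNOWN };

    // Hypothetical restatement of the TYPE() macro pattern used by
    // typeFromString: each entry is an early return on an exact match.
    #define TYPE(str, ty) if (s == str) return ty;

    static OperandType typeForGatherMem(const std::string &s) {
      TYPE("vx32mem", TYPE_M32)   // 32-bit elements, XMM index
      TYPE("vy32mem", TYPE_M32)   // 32-bit elements, YMM index
      TYPE("vx64mem", TYPE_M64)   // 64-bit elements, XMM index
      TYPE("vy64mem", TYPE_M64)   // 64-bit elements, YMM index
      return TYPE_UNKNOWN;
    }
    #undef TYPE
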
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 6c0a234..542e510 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -204,7 +204,7 @@ private:
unsigned &operandIndex,
unsigned &physicalOperandIndex,
unsigned &numPhysicalOperands,
- unsigned *operandMapping,
+ const unsigned *operandMapping,
OperandEncoding (*encodingFromString)
(const std::string&,
bool hasOpSizePrefix));
diff --git a/utils/UpdateCMakeLists.pl b/utils/UpdateCMakeLists.pl
index 8f535145..d92a767 100755
--- a/utils/UpdateCMakeLists.pl
+++ b/utils/UpdateCMakeLists.pl
@@ -68,8 +68,7 @@ sub UpdateCMake {
while(<IN>) {
if (!$foundLibrary) {
print OUT $_;
- if (/^add_clang_library\(/ || /^add_llvm_library\(/ || /^add_llvm_target\(/
- || /^add_executable\(/) {
+ if (/^add_[^_]+_library\(/ || /^add_llvm_target\(/ || /^add_executable\(/) {
$foundLibrary = 1;
EmitCMakeList($dir);
}
diff --git a/utils/buildit/build_llvm b/utils/buildit/build_llvm
index 994fb06..6aee831 100755
--- a/utils/buildit/build_llvm
+++ b/utils/buildit/build_llvm
@@ -133,7 +133,7 @@ if [ \! -f Makefile.config ]; then
|| exit 1
fi
-SUBVERSION=`echo $RC_ProjectSourceVersion | sed -e 's/[^.]*\.\([0-9]*\).*/\1/'`
+SUBVERSION=`echo $RC_ProjectSourceVersion | sed -e 's/.*\.\([0-9]*\).*/\1/'`
if [ "x$SUBVERSION" != "x$RC_ProjectSourceVersion" ]; then
LLVM_SUBMIT_SUBVERSION=`printf "%02d" $SUBVERSION`
diff --git a/utils/findmisopt b/utils/findmisopt
index f2a872c..88f991a 100755
--- a/utils/findmisopt
+++ b/utils/findmisopt
@@ -76,8 +76,7 @@ echo " Optimized program: $optprog"
# Define the list of optimizations to run. This comprises the same set of
# optimizations that opt -std-compile-opts and gccld run, in the same order.
opt_switches=`llvm-as < /dev/null -o - | opt -std-compile-opts -disable-output -debug-pass=Arguments 2>&1 | sed 's/Pass Arguments: //'`
-ld_switches=`llvm-as < /dev/null -o - | llvm-ld - -debug-pass=Arguments 2>&1 | sed 's/Pass Arguments: //'`
-all_switches="$opt_switches $ld_switches"
+all_switches="$opt_switches"
echo "Passes : $all_switches"
# Create output directory if it doesn't exist
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index b5f7986..71882b7 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -23,62 +23,16 @@ kUseCloseFDs = not kIsWindows
# Use temporary files to replace /dev/null on Windows.
kAvoidDevNull = kIsWindows
-# Negate if win32file is not found.
-kHaveWin32File = kIsWindows
-
-def RemoveForce(f):
- try:
- os.remove(f)
- except OSError:
- pass
-
-def WinWaitReleased(f):
- global kHaveWin32File
- if not kHaveWin32File:
- return
- try:
- import time
- import win32file, pywintypes
- retry_cnt = 256
- while True:
- try:
- h = win32file.CreateFile(
- f,
- win32file.GENERIC_READ,
- 0, # Exclusive
- None,
- win32file.OPEN_EXISTING,
- win32file.FILE_ATTRIBUTE_NORMAL,
- None)
- h.close()
- return
- except WindowsError, (winerror, strerror):
- retry_cnt = retry_cnt - 1
- if retry_cnt <= 0:
- raise
- elif winerror == 32: # ERROR_SHARING_VIOLATION
- pass
- else:
- raise
- except pywintypes.error, e:
- retry_cnt = retry_cnt - 1
- if retry_cnt <= 0:
- raise
- elif e[0]== 32: # ERROR_SHARING_VIOLATION
- pass
- else:
- raise
- time.sleep(0.01)
- except ImportError, e:
- kHaveWin32File = False
- return
-
def executeCommand(command, cwd=None, env=None):
+ # Close extra file handles on UNIX (on Windows this cannot be done while
+ # also redirecting input).
+ close_fds = not kIsWindows
+
p = subprocess.Popen(command, cwd=cwd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
- env=env)
+ env=env, close_fds=close_fds)
out,err = p.communicate()
exitCode = p.wait()
@@ -165,7 +119,6 @@ def executeShCmd(cmd, cfg, cwd, results):
else:
if r[2] is None:
if kAvoidDevNull and r[0] == '/dev/null':
- r[0] = None
r[2] = tempfile.TemporaryFile(mode=r[1])
else:
r[2] = open(r[0], r[1])
@@ -174,7 +127,7 @@ def executeShCmd(cmd, cfg, cwd, results):
# FIXME: Actually, this is probably an instance of PR6753.
if r[1] == 'a':
r[2].seek(0, 2)
- opened_files.append(r)
+ opened_files.append(r[2])
result = r[2]
final_redirects.append(result)
@@ -236,7 +189,7 @@ def executeShCmd(cmd, cfg, cwd, results):
# on Win32, for example). Since we have already spawned the subprocess, our
# handles have already been transferred so we do not need them anymore.
for f in opened_files:
- f[2].close()
+ f.close()
# FIXME: There is probably still deadlock potential here. Yawn.
procData = [None] * len(procs)
@@ -275,15 +228,12 @@ def executeShCmd(cmd, cfg, cwd, results):
else:
exitCode = res
- # Make sure opened_files is released by other (child) processes.
- if kIsWindows:
- for f in opened_files:
- if f[0] is not None:
- WinWaitReleased(f[0])
-
# Remove any named temporary files we created.
for f in named_temp_files:
- RemoveForce(f)
+ try:
+ os.remove(f)
+ except OSError:
+ pass
if cmd.negate:
exitCode = not exitCode
diff --git a/utils/llvm-build/llvmbuild/main.py b/utils/llvm-build/llvmbuild/main.py
index baecc6d..27d23d0 100644
--- a/utils/llvm-build/llvmbuild/main.py
+++ b/utils/llvm-build/llvmbuild/main.py
@@ -55,7 +55,7 @@ def make_install_dir(path):
Create the given directory path for installation, including any parents.
"""
- # os.makedirs considers it an error to be called with an existant path.
+ # os.makedirs considers it an error to be called with an existent path.
if not os.path.exists(path):
os.makedirs(path)
diff --git a/utils/llvm-lit/llvm-lit.in b/utils/llvm-lit/llvm-lit.in
index 1df1747..879d18b 100644
--- a/utils/llvm-lit/llvm-lit.in
+++ b/utils/llvm-lit/llvm-lit.in
@@ -8,7 +8,7 @@ llvm_source_root = "@LLVM_SOURCE_DIR@"
llvm_obj_root = "@LLVM_BINARY_DIR@"
# Make sure we can find the lit package.
-sys.path.append(os.path.join(llvm_source_root, 'utils', 'lit'))
+sys.path.insert(0, os.path.join(llvm_source_root, 'utils', 'lit'))
# Set up some builtin parameters, so that by default the LLVM test suite
# configuration file knows how to find the object tree.
@@ -18,7 +18,8 @@ builtin_parameters = {
'llvm_site_config' : os.path.join(llvm_obj_root, 'test', 'lit.site.cfg')
}
-clang_site_config = os.path.join(llvm_obj_root, 'tools', 'clang', 'test', 'lit.site.cfg')
+clang_site_config = os.path.join(llvm_obj_root, 'tools', 'clang', 'test',
+ 'lit.site.cfg')
if os.path.exists(clang_site_config):
builtin_parameters['clang_site_config'] = clang_site_config
diff --git a/utils/llvm.grm b/utils/llvm.grm
index 322036b..ad2799f 100644
--- a/utils/llvm.grm
+++ b/utils/llvm.grm
@@ -175,6 +175,7 @@ FuncAttr ::= noreturn
| returns_twice
| nonlazybind
| address_safety
+ | ia_nsdialect
;
OptFuncAttrs ::= + _ | OptFuncAttrs FuncAttr ;
diff --git a/utils/obj2yaml/CMakeLists.txt b/utils/obj2yaml/CMakeLists.txt
new file mode 100644
index 0000000..d64bf1b
--- /dev/null
+++ b/utils/obj2yaml/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LLVM_LINK_COMPONENTS archive object)
+
+add_llvm_utility(obj2yaml
+ obj2yaml.cpp coff2yaml.cpp
+ )
+
+target_link_libraries(obj2yaml LLVMSupport)
diff --git a/tools/llvm-ld/Makefile b/utils/obj2yaml/Makefile
index 8793ca9..5b96bdd 100644
--- a/tools/llvm-ld/Makefile
+++ b/utils/obj2yaml/Makefile
@@ -1,14 +1,20 @@
-##===- tools/llvm-ld/Makefile ------------------------------*- Makefile -*-===##
-#
+##===- utils/obj2yaml/Makefile ----------------------------*- Makefile -*-===##
+#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
-#
+#
##===----------------------------------------------------------------------===##
-LEVEL := ../..
-TOOLNAME := llvm-ld
-LINK_COMPONENTS := ipo scalaropts linker archive bitwriter vectorize
+LEVEL = ../..
+TOOLNAME = obj2yaml
+USEDLIBS = LLVMObject.a LLVMSupport.a
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS = 1
+
+# Don't install this utility
+NO_INSTALL = 1
include $(LEVEL)/Makefile.common
diff --git a/utils/obj2yaml/coff2yaml.cpp b/utils/obj2yaml/coff2yaml.cpp
new file mode 100644
index 0000000..c9a7159
--- /dev/null
+++ b/utils/obj2yaml/coff2yaml.cpp
@@ -0,0 +1,362 @@
+//===------ utils/obj2yaml.cpp - obj2yaml conversion tool -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "obj2yaml.h"
+
+#include "llvm/Object/COFF.h"
+
+
+template <typename One, typename Two>
+struct pod_pair { // I'd much rather use std::pair, but it's not a POD
+ One first;
+ Two second;
+};
+
+#define STRING_PAIR(x) {llvm::COFF::x, #x}
+static const pod_pair<llvm::COFF::MachineTypes, const char *>
+MachineTypePairs [] = {
+ STRING_PAIR(IMAGE_FILE_MACHINE_UNKNOWN),
+ STRING_PAIR(IMAGE_FILE_MACHINE_AM33),
+ STRING_PAIR(IMAGE_FILE_MACHINE_AMD64),
+ STRING_PAIR(IMAGE_FILE_MACHINE_ARM),
+ STRING_PAIR(IMAGE_FILE_MACHINE_ARMV7),
+ STRING_PAIR(IMAGE_FILE_MACHINE_EBC),
+ STRING_PAIR(IMAGE_FILE_MACHINE_I386),
+ STRING_PAIR(IMAGE_FILE_MACHINE_IA64),
+ STRING_PAIR(IMAGE_FILE_MACHINE_M32R),
+ STRING_PAIR(IMAGE_FILE_MACHINE_MIPS16),
+ STRING_PAIR(IMAGE_FILE_MACHINE_MIPSFPU),
+ STRING_PAIR(IMAGE_FILE_MACHINE_MIPSFPU16),
+ STRING_PAIR(IMAGE_FILE_MACHINE_POWERPC),
+ STRING_PAIR(IMAGE_FILE_MACHINE_POWERPCFP),
+ STRING_PAIR(IMAGE_FILE_MACHINE_R4000),
+ STRING_PAIR(IMAGE_FILE_MACHINE_SH3),
+ STRING_PAIR(IMAGE_FILE_MACHINE_SH3DSP),
+ STRING_PAIR(IMAGE_FILE_MACHINE_SH4),
+ STRING_PAIR(IMAGE_FILE_MACHINE_SH5),
+ STRING_PAIR(IMAGE_FILE_MACHINE_THUMB),
+ STRING_PAIR(IMAGE_FILE_MACHINE_WCEMIPSV2)
+};
+
+static const pod_pair<llvm::COFF::SectionCharacteristics, const char *>
+SectionCharacteristicsPairs1 [] = {
+ STRING_PAIR(IMAGE_SCN_TYPE_NO_PAD),
+ STRING_PAIR(IMAGE_SCN_CNT_CODE),
+ STRING_PAIR(IMAGE_SCN_CNT_INITIALIZED_DATA),
+ STRING_PAIR(IMAGE_SCN_CNT_UNINITIALIZED_DATA),
+ STRING_PAIR(IMAGE_SCN_LNK_OTHER),
+ STRING_PAIR(IMAGE_SCN_LNK_INFO),
+ STRING_PAIR(IMAGE_SCN_LNK_REMOVE),
+ STRING_PAIR(IMAGE_SCN_LNK_COMDAT),
+ STRING_PAIR(IMAGE_SCN_GPREL),
+ STRING_PAIR(IMAGE_SCN_MEM_PURGEABLE),
+ STRING_PAIR(IMAGE_SCN_MEM_16BIT),
+ STRING_PAIR(IMAGE_SCN_MEM_LOCKED),
+ STRING_PAIR(IMAGE_SCN_MEM_PRELOAD)
+};
+
+static const pod_pair<llvm::COFF::SectionCharacteristics, const char *>
+SectionCharacteristicsPairsAlignment [] = {
+ STRING_PAIR(IMAGE_SCN_ALIGN_1BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_2BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_4BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_8BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_16BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_32BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_64BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_128BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_256BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_512BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_1024BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_2048BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_4096BYTES),
+ STRING_PAIR(IMAGE_SCN_ALIGN_8192BYTES)
+};
+
+static const pod_pair<llvm::COFF::SectionCharacteristics, const char *>
+SectionCharacteristicsPairs2 [] = {
+ STRING_PAIR(IMAGE_SCN_LNK_NRELOC_OVFL),
+ STRING_PAIR(IMAGE_SCN_MEM_DISCARDABLE),
+ STRING_PAIR(IMAGE_SCN_MEM_NOT_CACHED),
+ STRING_PAIR(IMAGE_SCN_MEM_NOT_PAGED),
+ STRING_PAIR(IMAGE_SCN_MEM_SHARED),
+ STRING_PAIR(IMAGE_SCN_MEM_EXECUTE),
+ STRING_PAIR(IMAGE_SCN_MEM_READ),
+ STRING_PAIR(IMAGE_SCN_MEM_WRITE)
+};
+
+static const pod_pair<llvm::COFF::SymbolBaseType, const char *>
+SymbolBaseTypePairs [] = {
+ STRING_PAIR(IMAGE_SYM_TYPE_NULL),
+ STRING_PAIR(IMAGE_SYM_TYPE_VOID),
+ STRING_PAIR(IMAGE_SYM_TYPE_CHAR),
+ STRING_PAIR(IMAGE_SYM_TYPE_SHORT),
+ STRING_PAIR(IMAGE_SYM_TYPE_INT),
+ STRING_PAIR(IMAGE_SYM_TYPE_LONG),
+ STRING_PAIR(IMAGE_SYM_TYPE_FLOAT),
+ STRING_PAIR(IMAGE_SYM_TYPE_DOUBLE),
+ STRING_PAIR(IMAGE_SYM_TYPE_STRUCT),
+ STRING_PAIR(IMAGE_SYM_TYPE_UNION),
+ STRING_PAIR(IMAGE_SYM_TYPE_ENUM),
+ STRING_PAIR(IMAGE_SYM_TYPE_MOE),
+ STRING_PAIR(IMAGE_SYM_TYPE_BYTE),
+ STRING_PAIR(IMAGE_SYM_TYPE_WORD),
+ STRING_PAIR(IMAGE_SYM_TYPE_UINT),
+ STRING_PAIR(IMAGE_SYM_TYPE_DWORD)
+};
+
+static const pod_pair<llvm::COFF::SymbolComplexType, const char *>
+SymbolComplexTypePairs [] = {
+ STRING_PAIR(IMAGE_SYM_DTYPE_NULL),
+ STRING_PAIR(IMAGE_SYM_DTYPE_POINTER),
+ STRING_PAIR(IMAGE_SYM_DTYPE_FUNCTION),
+ STRING_PAIR(IMAGE_SYM_DTYPE_ARRAY),
+};
+
+static const pod_pair<llvm::COFF::SymbolStorageClass, const char *>
+SymbolStorageClassPairs [] = {
+ STRING_PAIR(IMAGE_SYM_CLASS_END_OF_FUNCTION),
+ STRING_PAIR(IMAGE_SYM_CLASS_NULL),
+ STRING_PAIR(IMAGE_SYM_CLASS_AUTOMATIC),
+ STRING_PAIR(IMAGE_SYM_CLASS_EXTERNAL),
+ STRING_PAIR(IMAGE_SYM_CLASS_STATIC),
+ STRING_PAIR(IMAGE_SYM_CLASS_REGISTER),
+ STRING_PAIR(IMAGE_SYM_CLASS_EXTERNAL_DEF),
+ STRING_PAIR(IMAGE_SYM_CLASS_LABEL),
+ STRING_PAIR(IMAGE_SYM_CLASS_UNDEFINED_LABEL),
+ STRING_PAIR(IMAGE_SYM_CLASS_MEMBER_OF_STRUCT),
+ STRING_PAIR(IMAGE_SYM_CLASS_ARGUMENT),
+ STRING_PAIR(IMAGE_SYM_CLASS_STRUCT_TAG),
+ STRING_PAIR(IMAGE_SYM_CLASS_MEMBER_OF_UNION),
+ STRING_PAIR(IMAGE_SYM_CLASS_UNION_TAG),
+ STRING_PAIR(IMAGE_SYM_CLASS_TYPE_DEFINITION),
+ STRING_PAIR(IMAGE_SYM_CLASS_UNDEFINED_STATIC),
+ STRING_PAIR(IMAGE_SYM_CLASS_ENUM_TAG),
+ STRING_PAIR(IMAGE_SYM_CLASS_MEMBER_OF_ENUM),
+ STRING_PAIR(IMAGE_SYM_CLASS_REGISTER_PARAM),
+ STRING_PAIR(IMAGE_SYM_CLASS_BIT_FIELD),
+ STRING_PAIR(IMAGE_SYM_CLASS_BLOCK),
+ STRING_PAIR(IMAGE_SYM_CLASS_FUNCTION),
+ STRING_PAIR(IMAGE_SYM_CLASS_END_OF_STRUCT),
+ STRING_PAIR(IMAGE_SYM_CLASS_FILE),
+ STRING_PAIR(IMAGE_SYM_CLASS_SECTION),
+ STRING_PAIR(IMAGE_SYM_CLASS_WEAK_EXTERNAL),
+ STRING_PAIR(IMAGE_SYM_CLASS_CLR_TOKEN),
+};
+
+static const pod_pair<llvm::COFF::RelocationTypeX86, const char *>
+RelocationTypeX86Pairs [] = {
+ STRING_PAIR(IMAGE_REL_I386_ABSOLUTE),
+ STRING_PAIR(IMAGE_REL_I386_DIR16),
+ STRING_PAIR(IMAGE_REL_I386_REL16),
+ STRING_PAIR(IMAGE_REL_I386_DIR32),
+ STRING_PAIR(IMAGE_REL_I386_DIR32NB),
+ STRING_PAIR(IMAGE_REL_I386_SEG12),
+ STRING_PAIR(IMAGE_REL_I386_SECTION),
+ STRING_PAIR(IMAGE_REL_I386_SECREL),
+ STRING_PAIR(IMAGE_REL_I386_TOKEN),
+ STRING_PAIR(IMAGE_REL_I386_SECREL7),
+ STRING_PAIR(IMAGE_REL_I386_REL32),
+ STRING_PAIR(IMAGE_REL_AMD64_ABSOLUTE),
+ STRING_PAIR(IMAGE_REL_AMD64_ADDR64),
+ STRING_PAIR(IMAGE_REL_AMD64_ADDR32),
+ STRING_PAIR(IMAGE_REL_AMD64_ADDR32NB),
+ STRING_PAIR(IMAGE_REL_AMD64_REL32),
+ STRING_PAIR(IMAGE_REL_AMD64_REL32_1),
+ STRING_PAIR(IMAGE_REL_AMD64_REL32_2),
+ STRING_PAIR(IMAGE_REL_AMD64_REL32_3),
+ STRING_PAIR(IMAGE_REL_AMD64_REL32_4),
+ STRING_PAIR(IMAGE_REL_AMD64_REL32_5),
+ STRING_PAIR(IMAGE_REL_AMD64_SECTION),
+ STRING_PAIR(IMAGE_REL_AMD64_SECREL),
+ STRING_PAIR(IMAGE_REL_AMD64_SECREL7),
+ STRING_PAIR(IMAGE_REL_AMD64_TOKEN),
+ STRING_PAIR(IMAGE_REL_AMD64_SREL32),
+ STRING_PAIR(IMAGE_REL_AMD64_PAIR),
+ STRING_PAIR(IMAGE_REL_AMD64_SSPAN32)
+};
+
+static const pod_pair<llvm::COFF::RelocationTypesARM, const char *>
+RelocationTypesARMPairs [] = {
+ STRING_PAIR(IMAGE_REL_ARM_ABSOLUTE),
+ STRING_PAIR(IMAGE_REL_ARM_ADDR32),
+ STRING_PAIR(IMAGE_REL_ARM_ADDR32NB),
+ STRING_PAIR(IMAGE_REL_ARM_BRANCH24),
+ STRING_PAIR(IMAGE_REL_ARM_BRANCH11),
+ STRING_PAIR(IMAGE_REL_ARM_TOKEN),
+ STRING_PAIR(IMAGE_REL_ARM_BLX24),
+ STRING_PAIR(IMAGE_REL_ARM_BLX11),
+ STRING_PAIR(IMAGE_REL_ARM_SECTION),
+ STRING_PAIR(IMAGE_REL_ARM_SECREL),
+ STRING_PAIR(IMAGE_REL_ARM_MOV32A),
+ STRING_PAIR(IMAGE_REL_ARM_MOV32T),
+ STRING_PAIR(IMAGE_REL_ARM_BRANCH20T),
+ STRING_PAIR(IMAGE_REL_ARM_BRANCH24T),
+ STRING_PAIR(IMAGE_REL_ARM_BLX23T)
+};
+#undef STRING_PAIR
+
+
+static const char endl = '\n';
+
+namespace yaml { // COFF-specific yaml-writing routines
+
+static llvm::raw_ostream &writeName(llvm::raw_ostream &Out,
+ const char *Name, std::size_t NameSize) {
+ for (std::size_t i = 0; i < NameSize; ++i) {
+ if (!Name[i]) break;
+ Out << Name[i];
+ }
+ return Out;
+}
+
+// Given an array of pod_pair<enum, const char *>, write all enums that match
+template <typename T, std::size_t N>
+static llvm::raw_ostream &writeBitMask(llvm::raw_ostream &Out,
+ const pod_pair<T, const char *> (&Arr)[N], unsigned long Val) {
+ for (std::size_t i = 0; i < N; ++i)
+ if (Val & Arr[i].first)
+ Out << Arr[i].second << ", ";
+ return Out;
+}
+
+} // end of yaml namespace
+
+// Given an array of pod_pair<enum, const char *>, look up a value
+template <typename T, std::size_t N>
+const char *nameLookup(const pod_pair<T, const char *> (&Arr)[N],
+ unsigned long Val, const char *NotFound = NULL) {
+ T n = static_cast<T>(Val);
+ for (std::size_t i = 0; i < N; ++i)
+ if (n == Arr[i].first)
+ return Arr[i].second;
+ return NotFound;
+}
+
+
+static llvm::raw_ostream &yamlCOFFHeader(
+ const llvm::object::coff_file_header *Header,llvm::raw_ostream &Out) {
+
+ Out << "header: !Header" << endl;
+ Out << " Machine: ";
+ Out << nameLookup(MachineTypePairs, Header->Machine, "# Unknown_MachineTypes")
+ << " # (";
+ return yaml::writeHexNumber(Out, Header->Machine) << ")" << endl << endl;
+}
+
+
+static llvm::raw_ostream &yamlCOFFSections(llvm::object::COFFObjectFile &Obj,
+ std::size_t NumSections, llvm::raw_ostream &Out) {
+ llvm::error_code ec;
+ Out << "sections:" << endl;
+ for (llvm::object::section_iterator iter = Obj.begin_sections();
+ iter != Obj.end_sections(); iter.increment(ec)) {
+ const llvm::object::coff_section *sect = Obj.getCOFFSection(iter);
+
+ Out << " - !Section" << endl;
+ Out << " Name: ";
+ yaml::writeName(Out, sect->Name, sizeof(sect->Name)) << endl;
+
+ Out << " Characteristics: [";
+ yaml::writeBitMask(Out, SectionCharacteristicsPairs1, sect->Characteristics);
+ Out << nameLookup(SectionCharacteristicsPairsAlignment,
+ sect->Characteristics & 0x00F00000, "# Unrecognized_IMAGE_SCN_ALIGN")
+ << ", ";
+ yaml::writeBitMask(Out, SectionCharacteristicsPairs2, sect->Characteristics);
+ Out << "] # ";
+ yaml::writeHexNumber(Out, sect->Characteristics) << endl;
+
+ llvm::ArrayRef<uint8_t> sectionData;
+ Obj.getSectionContents(sect, sectionData);
+ Out << " SectionData: ";
+ yaml::writeHexStream(Out, sectionData) << endl;
+ if (iter->begin_relocations() != iter->end_relocations())
+ Out << " Relocations:\n";
+ for (llvm::object::relocation_iterator rIter = iter->begin_relocations();
+ rIter != iter->end_relocations(); rIter.increment(ec)) {
+ const llvm::object::coff_relocation *reloc = Obj.getCOFFRelocation(rIter);
+
+ Out << " - !Relocation" << endl;
+ Out << " VirtualAddress: " ;
+ yaml::writeHexNumber(Out, reloc->VirtualAddress) << endl;
+ Out << " SymbolTableIndex: " << reloc->SymbolTableIndex << endl;
+ Out << " Type: "
+ << nameLookup(RelocationTypeX86Pairs, reloc->Type) << endl;
+ // TODO: Use the correct reloc type for the machine.
+ Out << endl;
+ }
+
+ }
+ return Out;
+}
+
+static llvm::raw_ostream& yamlCOFFSymbols(llvm::object::COFFObjectFile &Obj,
+ std::size_t NumSymbols, llvm::raw_ostream &Out) {
+ llvm::error_code ec;
+ Out << "symbols:" << endl;
+ for (llvm::object::symbol_iterator iter = Obj.begin_symbols();
+ iter != Obj.end_symbols(); iter.increment(ec)) {
+ // Gather all the info that we need
+ llvm::StringRef str;
+ const llvm::object::coff_symbol *symbol = Obj.getCOFFSymbol(iter);
+ Obj.getSymbolName(symbol, str);
+ std::size_t simpleType = symbol->getBaseType();
+ std::size_t complexType = symbol->getComplexType();
+ std::size_t storageClass = symbol->StorageClass;
+
+ Out << " - !Symbol" << endl;
+ Out << " Name: " << str << endl;
+
+ Out << " Value: " << symbol->Value << endl;
+ Out << " SectionNumber: " << symbol->SectionNumber << endl;
+
+ Out << " SimpleType: "
+ << nameLookup(SymbolBaseTypePairs, simpleType,
+ "# Unknown_SymbolBaseType")
+ << " # (" << simpleType << ")" << endl;
+
+ Out << " ComplexType: "
+ << nameLookup(SymbolComplexTypePairs, complexType,
+ "# Unknown_SymbolComplexType")
+ << " # (" << complexType << ")" << endl;
+
+ Out << " StorageClass: "
+ << nameLookup(SymbolStorageClassPairs, storageClass,
+ "# Unknown_StorageClass")
+ << " # (" << (int) storageClass << ")" << endl;
+
+ if (symbol->NumberOfAuxSymbols > 0) {
+ llvm::ArrayRef<uint8_t> aux = Obj.getSymbolAuxData(symbol);
+ Out << " NumberOfAuxSymbols: "
+ << (int) symbol->NumberOfAuxSymbols << endl;
+ Out << " AuxillaryData: ";
+ yaml::writeHexStream(Out, aux);
+ }
+
+ Out << endl;
+ }
+
+ return Out;
+}
+
+
+llvm::error_code coff2yaml(llvm::raw_ostream &Out, llvm::MemoryBuffer *TheObj) {
+ llvm::error_code ec;
+ llvm::object::COFFObjectFile obj(TheObj, ec);
+ if (!ec) {
+ const llvm::object::coff_file_header *hd;
+ ec = obj.getHeader(hd);
+ if (!ec) {
+ yamlCOFFHeader(hd, Out);
+ yamlCOFFSections(obj, hd->NumberOfSections, Out);
+ yamlCOFFSymbols(obj, hd->NumberOfSymbols, Out);
+ }
+ }
+ return ec;
+}
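
nameLookup above is the enum-to-string half of the STRING_PAIR machinery: a linear scan over a static pod_pair array with a fallback for unknown values. A reduced standalone instance of the same shape (Color and ColorPairs are invented for illustration):

    #include <cstddef>

    template <typename One, typename Two>
    struct pod_pair { One first; Two second; };

    enum Color { RED = 1, GREEN = 2 };

    static const pod_pair<Color, const char *> ColorPairs[] = {
      {RED, "RED"}, {GREEN, "GREEN"}
    };

    // Same shape as nameLookup: scan the array, return NotFound on a miss.
    template <typename T, std::size_t N>
    const char *lookup(const pod_pair<T, const char *> (&Arr)[N],
                       unsigned long Val, const char *NotFound = 0) {
      for (std::size_t i = 0; i < N; ++i)
        if (static_cast<T>(Val) == Arr[i].first)
          return Arr[i].second;
      return NotFound;
    }

    // lookup(ColorPairs, 2) == "GREEN"; lookup(ColorPairs, 9) == 0.
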
diff --git a/utils/obj2yaml/obj2yaml.cpp b/utils/obj2yaml/obj2yaml.cpp
new file mode 100644
index 0000000..ff253fa
--- /dev/null
+++ b/utils/obj2yaml/obj2yaml.cpp
@@ -0,0 +1,89 @@
+//===------ utils/obj2yaml.cpp - obj2yaml conversion tool -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "obj2yaml.h"
+
+#include "llvm/ADT/OwningPtr.h"
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/COFF.h"
+
+const char endl = '\n';
+
+namespace yaml { // generic yaml-writing routines
+
+unsigned char printable(unsigned char Ch) {
+ return Ch >= ' ' && Ch <= '~' ? Ch : '.';
+}
+
+llvm::raw_ostream &writeHexStream(llvm::raw_ostream &Out,
+ const llvm::ArrayRef<uint8_t> arr) {
+ const char *hex = "0123456789ABCDEF";
+ Out << " !hex \"";
+
+ typedef llvm::ArrayRef<uint8_t>::const_iterator iter_t;
+ const iter_t end = arr.end();
+ for (iter_t iter = arr.begin(); iter != end; ++iter)
+ Out << hex[(*iter >> 4) & 0x0F] << hex[(*iter & 0x0F)];
+
+ Out << "\" # |";
+ for (iter_t iter = arr.begin(); iter != end; ++iter)
+ Out << printable(*iter);
+ Out << "|" << endl;
+
+ return Out;
+ }
+
+llvm::raw_ostream &writeHexNumber(llvm::raw_ostream &Out, unsigned long long N) {
+ if (N >= 10)
+ Out << "0x";
+ Out.write_hex(N);
+ return Out;
+}
+
+}
+
+
+using namespace llvm;
+enum ObjectFileType { coff };
+
+cl::opt<ObjectFileType> InputFormat(
+ cl::desc("Choose input format"),
+ cl::values(
+ clEnumVal(coff, "process COFF object files"),
+ clEnumValEnd));
+
+cl::opt<std::string> InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+int main(int argc, char * argv[]) {
+ cl::ParseCommandLineOptions(argc, argv);
+ sys::PrintStackTraceOnErrorSignal();
+ PrettyStackTraceProgram X(argc, argv);
+ llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+// Process the input file
+ OwningPtr<MemoryBuffer> buf;
+
+// TODO: If this is an archive, then burst it and dump each entry
+ if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename, buf))
+ llvm::errs() << "Error: '" << ec.message() << "' opening file '"
+ << InputFilename << "'" << endl;
+ else {
+ ec = coff2yaml(llvm::outs(), buf.take());
+ if (ec)
+ llvm::errs() << "Error: " << ec.message() << " dumping COFF file" << endl;
+ }
+
+ return 0;
+}
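
One quirk worth noting in these helpers: writeHexNumber only emits the 0x prefix when the value is at least 10, so single-digit values print the same in decimal and hex. A small usage sketch, assuming it is compiled in-tree against the obj2yaml.h declarations in the next file:

    #include "obj2yaml.h"

    // Values below 10 print bare ("7"); larger ones get a prefix ("0x1c").
    void demo(llvm::raw_ostream &Out) {
      yaml::writeHexNumber(Out, 7);   // prints "7"
      Out << ' ';
      yaml::writeHexNumber(Out, 28);  // prints "0x1c"
      Out << '\n';
    }
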
diff --git a/utils/obj2yaml/obj2yaml.h b/utils/obj2yaml/obj2yaml.h
new file mode 100644
index 0000000..2a23b49
--- /dev/null
+++ b/utils/obj2yaml/obj2yaml.h
@@ -0,0 +1,35 @@
+//===------ utils/obj2yaml.hpp - obj2yaml conversion tool -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This file declares some helper routines, and also the format-specific
+// writers. To add a new format, add the declaration here, and, in a separate
+// source file, implement it.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_OBJ2YAML_H
+#define LLVM_UTILS_OBJ2YAML_H
+
+#include "llvm/ADT/ArrayRef.h"
+
+#include "llvm/Support/system_error.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace yaml { // routines for writing YAML
+// Write a hex stream:
+// <Prefix> !hex "<hex digits>" # |<ASCII chars>|\n
+ llvm::raw_ostream &writeHexStream
+ (llvm::raw_ostream &Out, const llvm::ArrayRef<uint8_t> arr);
+
+// Writes a number in hex; prefix it by 0x if it is >= 10
+ llvm::raw_ostream &writeHexNumber
+ (llvm::raw_ostream &Out, unsigned long long N);
+}
+
+llvm::error_code coff2yaml(llvm::raw_ostream &Out, llvm::MemoryBuffer *TheObj);
+
+#endif
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index d2946d8..a62e829 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -193,7 +193,7 @@ function check_valid_urls() {
done
}
-# Export sources to the the build directory.
+# Export sources to the build directory.
function export_sources() {
check_valid_urls
diff --git a/utils/test_debuginfo.pl b/utils/test_debuginfo.pl
index fb61fb0..0832a33 100755
--- a/utils/test_debuginfo.pl
+++ b/utils/test_debuginfo.pl
@@ -24,7 +24,7 @@ my $debugger_script_file = "$output_dir/$input_filename.debugger.script";
my $output_file = "$output_dir/$input_filename.gdb.output";
# Extract debugger commands from testcase. They are marked with DEBUGGER:
-# at the beginnign of a comment line.
+# at the beginning of a comment line.
open(INPUT, $testcase_file);
open(OUTPUT, ">$debugger_script_file");
while(<INPUT>) {
@@ -49,7 +49,7 @@ if (!$my_debugger) {
my $debugger_options = "-q -batch -n -x";
# run debugger and capture output.
-system("$my_debugger $debugger_options $debugger_script_file $executable_file >& $output_file");
+system("$my_debugger $debugger_options $debugger_script_file $executable_file > $output_file 2>&1");
# validate output.
system("FileCheck", "-input-file", "$output_file", "$testcase_file");
diff --git a/utils/unittest/CMakeLists.txt b/utils/unittest/CMakeLists.txt
index 73491f0..70ed35d 100644
--- a/utils/unittest/CMakeLists.txt
+++ b/utils/unittest/CMakeLists.txt
@@ -27,6 +27,12 @@ endif()
set(LLVM_REQUIRES_RTTI 1)
add_definitions( -DGTEST_HAS_RTTI=0 )
+# Visual Studio 2012 only supports up to 8 template parameters in
+# std::tr1::tuple by default, but gtest requires 10
+if(MSVC AND MSVC_VERSION EQUAL 1700)
+ add_definitions(-D_VARIADIC_MAX=10)
+endif ()
+
add_llvm_library(gtest
googletest/gtest.cc
googletest/gtest-death-test.cc
diff --git a/utils/unittest/googletest/gtest.cc b/utils/unittest/googletest/gtest.cc
index 3fdff0a..eb5c68c 100644
--- a/utils/unittest/googletest/gtest.cc
+++ b/utils/unittest/googletest/gtest.cc
@@ -1465,7 +1465,7 @@ char* CodePointToUtf8(UInt32 code_point, char* str) {
return str;
}
-// The following two functions only make sense if the the system
+// The following two functions only make sense if the system
// uses UTF-16 for wide string encoding. All supported systems
// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h b/utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h
index 65a2101..6554cfc 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h
@@ -203,7 +203,6 @@ class GTestFlagSaver {
bool list_tests_;
String output_;
bool print_time_;
- bool pretty_;
internal::Int32 random_seed_;
internal::Int32 repeat_;
bool shuffle_;
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-internal.h b/utils/unittest/googletest/include/gtest/internal/gtest-internal.h
index b198a3d..f8a5cc9 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-internal.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-internal.h
@@ -102,12 +102,11 @@
// but still find raw_ostream& overloads.
namespace llvm {
class convertible_fwd_ostream : public std::ostream {
- std::ostream& os_;
raw_os_ostream ros_;
public:
convertible_fwd_ostream(std::ostream& os)
- : std::ostream(os.rdbuf()), os_(os), ros_(*this) {}
+ : std::ostream(os.rdbuf()), ros_(*this) {}
operator raw_ostream&() { return ros_; }
};
}
diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim
index f35f855..c83e8ca 100644
--- a/utils/vim/llvm.vim
+++ b/utils/vim/llvm.vim
@@ -1,7 +1,7 @@
" Vim syntax file
" Language: llvm
" Maintainer: The LLVM team, http://llvm.org/
-" Version: $Revision: 137806 $
+" Version: $Revision: 156080 $
if version < 600
syntax clear
@@ -14,7 +14,7 @@ syn case match
" Types.
" Types also include struct, array, vector, etc. but these don't
" benefit as much from having dedicated highlighting rules.
-syn keyword llvmType void float double
+syn keyword llvmType void float double half
syn keyword llvmType x86_fp80 fp128 ppc_fp128
syn keyword llvmType type label opaque
syn match llvmType /\<i\d\+\>/
diff --git a/utils/yaml2obj/CMakeLists.txt b/utils/yaml2obj/CMakeLists.txt
new file mode 100644
index 0000000..f8b1197
--- /dev/null
+++ b/utils/yaml2obj/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_llvm_utility(yaml2obj
+ yaml2obj.cpp
+ )
+
+target_link_libraries(yaml2obj LLVMSupport)
diff --git a/tools/llvm-stub/Makefile b/utils/yaml2obj/Makefile
index 077efa2..e746d85 100644
--- a/tools/llvm-stub/Makefile
+++ b/utils/yaml2obj/Makefile
@@ -1,15 +1,20 @@
-##===- tools/llvm-stub/Makefile ----------------------------*- Makefile -*-===##
-#
+##===- utils/yaml2obj/Makefile ----------------------------*- Makefile -*-===##
+#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
-#
+#
##===----------------------------------------------------------------------===##
-LEVEL := ../..
-TOOLNAME := llvm-stub
-LINK_COMPONENTS := object
+LEVEL = ../..
+TOOLNAME = yaml2obj
+USEDLIBS = LLVMSupport.a
-include $(LEVEL)/Makefile.common
+# This tool has no plugins, so optimize startup time.
+TOOL_NO_EXPORTS = 1
+# Don't install this utility.
+NO_INSTALL = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/utils/yaml2obj/yaml2obj.cpp b/utils/yaml2obj/yaml2obj.cpp
new file mode 100644
index 0000000..c3b3e54
--- /dev/null
+++ b/utils/yaml2obj/yaml2obj.cpp
@@ -0,0 +1,879 @@
+//===- yaml2obj - Convert YAML to a binary object file --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This program takes a YAML description of an object file and outputs the
+// binary equivalent.
+//
+// This is used for writing tests that require binary files.
+//
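+// As a rough sketch (the keys below mirror those handled by COFFParser in
+// this file; see docs/yaml2obj for the authoritative schema), an input may
+// look like:
+//
+//   header:
+//     Machine: IMAGE_FILE_MACHINE_I386
+//     Characteristics: [ IMAGE_FILE_32BIT_MACHINE ]
+//   sections:
+//     - Name: .text
+//       Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE ]
+//       SectionData: "c3"
+//   symbols:
+//     - Name: _main
+//       Value: 0
+//       SectionNumber: 1
+//       SimpleType: IMAGE_SYM_TYPE_NULL
+//       ComplexType: IMAGE_SYM_DTYPE_FUNCTION
+//       StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+//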
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Support/YAMLParser.h"
+
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<std::string>
+ Input(cl::Positional, cl::desc("<input>"), cl::init("-"));
+
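+// Parse the scalar node SN as an integer of type T; the radix is
+// auto-detected by StringRef::getAsInteger. Returns true and stores the
+// value in Result on success, false otherwise.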
+template<class T>
+typename llvm::enable_if_c<std::numeric_limits<T>::is_integer, bool>::type
+getAs(const llvm::yaml::ScalarNode *SN, T &Result) {
+ SmallString<4> Storage;
+ StringRef Value = SN->getValue(Storage);
+ if (Value.getAsInteger(0, Result))
+ return false;
+ return true;
+}
+
+// Given a container with begin and end iterators whose ::value_type is a
+// character type, iterate through pairs of characters in the set [a-fA-F0-9],
+// ignoring all other characters.
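+// For example (illustrative), iterating over "de:ad be-ef" yields the pairs
+// {'d','e'}, {'a','d'}, {'b','e'}, {'e','f'}.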
+struct hex_pair_iterator {
+ StringRef::const_iterator Current, End;
+ typedef SmallVector<char, 2> value_type;
+ value_type Pair;
+ bool IsDone;
+
+ hex_pair_iterator(StringRef C)
+ : Current(C.begin()), End(C.end()), IsDone(false) {
+    // Initialize Pair.
+ ++*this;
+ }
+
+ // End iterator.
+ hex_pair_iterator() : Current(), End(), IsDone(true) {}
+
+ value_type operator *() const {
+ return Pair;
+ }
+
+ hex_pair_iterator operator ++() {
+ // We're at the end of the input.
+ if (Current == End) {
+ IsDone = true;
+ return *this;
+ }
+ Pair = value_type();
+ for (; Current != End && Pair.size() != 2; ++Current) {
+ // Is a valid hex digit.
+ if ((*Current >= '0' && *Current <= '9') ||
+ (*Current >= 'a' && *Current <= 'f') ||
+ (*Current >= 'A' && *Current <= 'F'))
+ Pair.push_back(*Current);
+ }
+ // Hit the end without getting 2 hex digits. Pair is invalid.
+ if (Pair.size() != 2)
+ IsDone = true;
+ return *this;
+ }
+
+ bool operator ==(const hex_pair_iterator Other) {
+ return (IsDone == Other.IsDone) ||
+ (Current == Other.Current && End == Other.End);
+ }
+
+ bool operator !=(const hex_pair_iterator Other) {
+ return !(*this == Other);
+ }
+};
+
+template <class ContainerOut>
+static bool hexStringToByteArray(StringRef Str, ContainerOut &Out) {
+ for (hex_pair_iterator I(Str), E; I != E; ++I) {
+ typename hex_pair_iterator::value_type Pair = *I;
+ typename ContainerOut::value_type Byte;
+ if (StringRef(Pair.data(), 2).getAsInteger(16, Byte))
+ return false;
+ Out.push_back(Byte);
+ }
+ return true;
+}
+
+/// This parses a YAML stream that represents a COFF object file.
+/// See docs/yaml2obj for the YAML schema.
+struct COFFParser {
+ COFFParser(yaml::Stream &Input) : YS(Input) {
+ std::memset(&Header, 0, sizeof(Header));
+ // A COFF string table always starts with a 4 byte size field. Offsets into
+ // it include this size, so allocate it now.
+ StringTable.append(4, 0);
+ }
+
+ bool parseHeader(yaml::Node *HeaderN) {
+ yaml::MappingNode *MN = dyn_cast<yaml::MappingNode>(HeaderN);
+ if (!MN) {
+ YS.printError(HeaderN, "header's value must be a mapping node");
+ return false;
+ }
+ for (yaml::MappingNode::iterator i = MN->begin(), e = MN->end();
+ i != e; ++i) {
+ yaml::ScalarNode *Key = dyn_cast<yaml::ScalarNode>(i->getKey());
+ if (!Key) {
+ YS.printError(i->getKey(), "Keys must be scalar values");
+ return false;
+ }
+ SmallString<32> Storage;
+ StringRef KeyValue = Key->getValue(Storage);
+ if (KeyValue == "Characteristics") {
+ if (!parseHeaderCharacteristics(i->getValue()))
+ return false;
+ } else {
+ yaml::ScalarNode *Value = dyn_cast<yaml::ScalarNode>(i->getValue());
+ if (!Value) {
+          YS.printError(i->getValue(),
+ Twine(KeyValue) + " must be a scalar value");
+ return false;
+ }
+ if (KeyValue == "Machine") {
+ uint16_t Machine;
+ if (!getAs(Value, Machine)) {
+ // It's not a raw number, try matching the string.
+ StringRef ValueValue = Value->getValue(Storage);
+ Machine = StringSwitch<COFF::MachineTypes>(ValueValue)
+ .Case( "IMAGE_FILE_MACHINE_UNKNOWN"
+ , COFF::IMAGE_FILE_MACHINE_UNKNOWN)
+ .Case( "IMAGE_FILE_MACHINE_AM33"
+ , COFF::IMAGE_FILE_MACHINE_AM33)
+ .Case( "IMAGE_FILE_MACHINE_AMD64"
+ , COFF::IMAGE_FILE_MACHINE_AMD64)
+ .Case( "IMAGE_FILE_MACHINE_ARM"
+ , COFF::IMAGE_FILE_MACHINE_ARM)
+ .Case( "IMAGE_FILE_MACHINE_ARMV7"
+ , COFF::IMAGE_FILE_MACHINE_ARMV7)
+ .Case( "IMAGE_FILE_MACHINE_EBC"
+ , COFF::IMAGE_FILE_MACHINE_EBC)
+ .Case( "IMAGE_FILE_MACHINE_I386"
+ , COFF::IMAGE_FILE_MACHINE_I386)
+ .Case( "IMAGE_FILE_MACHINE_IA64"
+ , COFF::IMAGE_FILE_MACHINE_IA64)
+ .Case( "IMAGE_FILE_MACHINE_M32R"
+ , COFF::IMAGE_FILE_MACHINE_M32R)
+ .Case( "IMAGE_FILE_MACHINE_MIPS16"
+ , COFF::IMAGE_FILE_MACHINE_MIPS16)
+ .Case( "IMAGE_FILE_MACHINE_MIPSFPU"
+ , COFF::IMAGE_FILE_MACHINE_MIPSFPU)
+ .Case( "IMAGE_FILE_MACHINE_MIPSFPU16"
+ , COFF::IMAGE_FILE_MACHINE_MIPSFPU16)
+ .Case( "IMAGE_FILE_MACHINE_POWERPC"
+ , COFF::IMAGE_FILE_MACHINE_POWERPC)
+ .Case( "IMAGE_FILE_MACHINE_POWERPCFP"
+ , COFF::IMAGE_FILE_MACHINE_POWERPCFP)
+ .Case( "IMAGE_FILE_MACHINE_R4000"
+ , COFF::IMAGE_FILE_MACHINE_R4000)
+ .Case( "IMAGE_FILE_MACHINE_SH3"
+ , COFF::IMAGE_FILE_MACHINE_SH3)
+ .Case( "IMAGE_FILE_MACHINE_SH3DSP"
+ , COFF::IMAGE_FILE_MACHINE_SH3DSP)
+ .Case( "IMAGE_FILE_MACHINE_SH4"
+ , COFF::IMAGE_FILE_MACHINE_SH4)
+ .Case( "IMAGE_FILE_MACHINE_SH5"
+ , COFF::IMAGE_FILE_MACHINE_SH5)
+ .Case( "IMAGE_FILE_MACHINE_THUMB"
+ , COFF::IMAGE_FILE_MACHINE_THUMB)
+ .Case( "IMAGE_FILE_MACHINE_WCEMIPSV2"
+ , COFF::IMAGE_FILE_MACHINE_WCEMIPSV2)
+ .Default(COFF::MT_Invalid);
+ if (Machine == COFF::MT_Invalid) {
+ YS.printError(Value, "Invalid value for Machine");
+ return false;
+ }
+ }
+ Header.Machine = Machine;
+ } else if (KeyValue == "NumberOfSections") {
+ if (!getAs(Value, Header.NumberOfSections)) {
+ YS.printError(Value, "Invalid value for NumberOfSections");
+ return false;
+ }
+ } else if (KeyValue == "TimeDateStamp") {
+ if (!getAs(Value, Header.TimeDateStamp)) {
+ YS.printError(Value, "Invalid value for TimeDateStamp");
+ return false;
+ }
+ } else if (KeyValue == "PointerToSymbolTable") {
+ if (!getAs(Value, Header.PointerToSymbolTable)) {
+ YS.printError(Value, "Invalid value for PointerToSymbolTable");
+ return false;
+ }
+ } else if (KeyValue == "NumberOfSymbols") {
+ if (!getAs(Value, Header.NumberOfSymbols)) {
+ YS.printError(Value, "Invalid value for NumberOfSymbols");
+ return false;
+ }
+ } else if (KeyValue == "SizeOfOptionalHeader") {
+ if (!getAs(Value, Header.SizeOfOptionalHeader)) {
+ YS.printError(Value, "Invalid value for SizeOfOptionalHeader");
+ return false;
+ }
+ } else {
+ YS.printError(Key, "Unrecognized key in header");
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ bool parseHeaderCharacteristics(yaml::Node *Characteristics) {
+ yaml::ScalarNode *Value = dyn_cast<yaml::ScalarNode>(Characteristics);
+ yaml::SequenceNode *SeqValue
+ = dyn_cast<yaml::SequenceNode>(Characteristics);
+ if (!Value && !SeqValue) {
+ YS.printError(Characteristics,
+ "Characteristics must either be a number or sequence");
+ return false;
+ }
+ if (Value) {
+ if (!getAs(Value, Header.Characteristics)) {
+ YS.printError(Value, "Invalid value for Characteristics");
+ return false;
+ }
+ } else {
+ for (yaml::SequenceNode::iterator ci = SeqValue->begin(),
+ ce = SeqValue->end();
+ ci != ce; ++ci) {
+ yaml::ScalarNode *CharValue = dyn_cast<yaml::ScalarNode>(&*ci);
+ if (!CharValue) {
+          YS.printError(&*ci,
+ "Characteristics must be scalar values");
+ return false;
+ }
+ SmallString<32> Storage;
+ StringRef Char = CharValue->getValue(Storage);
+ uint16_t Characteristic = StringSwitch<COFF::Characteristics>(Char)
+ .Case( "IMAGE_FILE_RELOCS_STRIPPED"
+ , COFF::IMAGE_FILE_RELOCS_STRIPPED)
+ .Case( "IMAGE_FILE_EXECUTABLE_IMAGE"
+ , COFF::IMAGE_FILE_EXECUTABLE_IMAGE)
+ .Case( "IMAGE_FILE_LINE_NUMS_STRIPPED"
+ , COFF::IMAGE_FILE_LINE_NUMS_STRIPPED)
+ .Case( "IMAGE_FILE_LOCAL_SYMS_STRIPPED"
+ , COFF::IMAGE_FILE_LOCAL_SYMS_STRIPPED)
+ .Case( "IMAGE_FILE_AGGRESSIVE_WS_TRIM"
+ , COFF::IMAGE_FILE_AGGRESSIVE_WS_TRIM)
+ .Case( "IMAGE_FILE_LARGE_ADDRESS_AWARE"
+ , COFF::IMAGE_FILE_LARGE_ADDRESS_AWARE)
+ .Case( "IMAGE_FILE_BYTES_REVERSED_LO"
+ , COFF::IMAGE_FILE_BYTES_REVERSED_LO)
+ .Case( "IMAGE_FILE_32BIT_MACHINE"
+ , COFF::IMAGE_FILE_32BIT_MACHINE)
+ .Case( "IMAGE_FILE_DEBUG_STRIPPED"
+ , COFF::IMAGE_FILE_DEBUG_STRIPPED)
+ .Case( "IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP"
+ , COFF::IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP)
+ .Case( "IMAGE_FILE_SYSTEM"
+ , COFF::IMAGE_FILE_SYSTEM)
+ .Case( "IMAGE_FILE_DLL"
+ , COFF::IMAGE_FILE_DLL)
+ .Case( "IMAGE_FILE_UP_SYSTEM_ONLY"
+ , COFF::IMAGE_FILE_UP_SYSTEM_ONLY)
+ .Default(COFF::C_Invalid);
+ if (Characteristic == COFF::C_Invalid) {
+ // TODO: Typo-correct.
+ YS.printError(CharValue,
+ "Invalid value for Characteristic");
+ return false;
+ }
+ Header.Characteristics |= Characteristic;
+ }
+ }
+ return true;
+ }
+
+ bool parseSections(yaml::Node *SectionsN) {
+ yaml::SequenceNode *SN = dyn_cast<yaml::SequenceNode>(SectionsN);
+ if (!SN) {
+ YS.printError(SectionsN, "Sections must be a sequence");
+ return false;
+ }
+ for (yaml::SequenceNode::iterator i = SN->begin(), e = SN->end();
+ i != e; ++i) {
+ Section Sec;
+ std::memset(&Sec.Header, 0, sizeof(Sec.Header));
+ yaml::MappingNode *SecMap = dyn_cast<yaml::MappingNode>(&*i);
+ if (!SecMap) {
+ YS.printError(&*i, "Section entry must be a map");
+ return false;
+ }
+ for (yaml::MappingNode::iterator si = SecMap->begin(), se = SecMap->end();
+ si != se; ++si) {
+ yaml::ScalarNode *Key = dyn_cast<yaml::ScalarNode>(si->getKey());
+ if (!Key) {
+ YS.printError(si->getKey(), "Keys must be scalar values");
+ return false;
+ }
+ SmallString<32> Storage;
+ StringRef KeyValue = Key->getValue(Storage);
+
+ yaml::ScalarNode *Value = dyn_cast<yaml::ScalarNode>(si->getValue());
+ if (KeyValue == "Name") {
+ // If the name is less than 8 bytes, store it in place, otherwise
+ // store it in the string table.
+ StringRef Name = Value->getValue(Storage);
+ std::fill_n(Sec.Header.Name, unsigned(COFF::NameSize), 0);
+ if (Name.size() <= COFF::NameSize) {
+ std::copy(Name.begin(), Name.end(), Sec.Header.Name);
+ } else {
+ // Add string to the string table and format the index for output.
+ unsigned Index = getStringIndex(Name);
+ std::string str = utostr(Index);
+ if (str.size() > 7) {
+ YS.printError(Value, "String table got too large");
+ return false;
+ }
+ Sec.Header.Name[0] = '/';
+ std::copy(str.begin(), str.end(), Sec.Header.Name + 1);
+ }
+ } else if (KeyValue == "VirtualSize") {
+ if (!getAs(Value, Sec.Header.VirtualSize)) {
+ YS.printError(Value, "Invalid value for VirtualSize");
+ return false;
+ }
+ } else if (KeyValue == "VirtualAddress") {
+ if (!getAs(Value, Sec.Header.VirtualAddress)) {
+ YS.printError(Value, "Invalid value for VirtualAddress");
+ return false;
+ }
+ } else if (KeyValue == "SizeOfRawData") {
+ if (!getAs(Value, Sec.Header.SizeOfRawData)) {
+ YS.printError(Value, "Invalid value for SizeOfRawData");
+ return false;
+ }
+ } else if (KeyValue == "PointerToRawData") {
+ if (!getAs(Value, Sec.Header.PointerToRawData)) {
+ YS.printError(Value, "Invalid value for PointerToRawData");
+ return false;
+ }
+ } else if (KeyValue == "PointerToRelocations") {
+ if (!getAs(Value, Sec.Header.PointerToRelocations)) {
+ YS.printError(Value, "Invalid value for PointerToRelocations");
+ return false;
+ }
+ } else if (KeyValue == "PointerToLineNumbers") {
+ if (!getAs(Value, Sec.Header.PointerToLineNumbers)) {
+ YS.printError(Value, "Invalid value for PointerToLineNumbers");
+ return false;
+ }
+ } else if (KeyValue == "NumberOfRelocations") {
+ if (!getAs(Value, Sec.Header.NumberOfRelocations)) {
+ YS.printError(Value, "Invalid value for NumberOfRelocations");
+ return false;
+ }
+ } else if (KeyValue == "NumberOfLineNumbers") {
+ if (!getAs(Value, Sec.Header.NumberOfLineNumbers)) {
+ YS.printError(Value, "Invalid value for NumberOfLineNumbers");
+ return false;
+ }
+ } else if (KeyValue == "Characteristics") {
+ yaml::SequenceNode *SeqValue
+ = dyn_cast<yaml::SequenceNode>(si->getValue());
+ if (!Value && !SeqValue) {
+ YS.printError(si->getValue(),
+ "Characteristics must either be a number or sequence");
+ return false;
+ }
+ if (Value) {
+ if (!getAs(Value, Sec.Header.Characteristics)) {
+ YS.printError(Value, "Invalid value for Characteristics");
+ return false;
+ }
+ } else {
+ for (yaml::SequenceNode::iterator ci = SeqValue->begin(),
+ ce = SeqValue->end();
+ ci != ce; ++ci) {
+ yaml::ScalarNode *CharValue = dyn_cast<yaml::ScalarNode>(&*ci);
+ if (!CharValue) {
+ YS.printError(CharValue, "Invalid value for Characteristics");
+ return false;
+ }
+ StringRef Char = CharValue->getValue(Storage);
+ uint32_t Characteristic =
+ StringSwitch<COFF::SectionCharacteristics>(Char)
+ .Case( "IMAGE_SCN_TYPE_NO_PAD"
+ , COFF::IMAGE_SCN_TYPE_NO_PAD)
+ .Case( "IMAGE_SCN_CNT_CODE"
+ , COFF::IMAGE_SCN_CNT_CODE)
+ .Case( "IMAGE_SCN_CNT_INITIALIZED_DATA"
+ , COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+ .Case( "IMAGE_SCN_CNT_UNINITIALIZED_DATA"
+ , COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+ .Case( "IMAGE_SCN_LNK_OTHER"
+ , COFF::IMAGE_SCN_LNK_OTHER)
+ .Case( "IMAGE_SCN_LNK_INFO"
+ , COFF::IMAGE_SCN_LNK_INFO)
+ .Case( "IMAGE_SCN_LNK_REMOVE"
+ , COFF::IMAGE_SCN_LNK_REMOVE)
+ .Case( "IMAGE_SCN_LNK_COMDAT"
+ , COFF::IMAGE_SCN_LNK_COMDAT)
+ .Case( "IMAGE_SCN_GPREL"
+ , COFF::IMAGE_SCN_GPREL)
+ .Case( "IMAGE_SCN_MEM_PURGEABLE"
+ , COFF::IMAGE_SCN_MEM_PURGEABLE)
+ .Case( "IMAGE_SCN_MEM_16BIT"
+ , COFF::IMAGE_SCN_MEM_16BIT)
+ .Case( "IMAGE_SCN_MEM_LOCKED"
+ , COFF::IMAGE_SCN_MEM_LOCKED)
+ .Case( "IMAGE_SCN_MEM_PRELOAD"
+ , COFF::IMAGE_SCN_MEM_PRELOAD)
+ .Case( "IMAGE_SCN_ALIGN_1BYTES"
+ , COFF::IMAGE_SCN_ALIGN_1BYTES)
+ .Case( "IMAGE_SCN_ALIGN_2BYTES"
+ , COFF::IMAGE_SCN_ALIGN_2BYTES)
+ .Case( "IMAGE_SCN_ALIGN_4BYTES"
+ , COFF::IMAGE_SCN_ALIGN_4BYTES)
+ .Case( "IMAGE_SCN_ALIGN_8BYTES"
+ , COFF::IMAGE_SCN_ALIGN_8BYTES)
+ .Case( "IMAGE_SCN_ALIGN_16BYTES"
+ , COFF::IMAGE_SCN_ALIGN_16BYTES)
+ .Case( "IMAGE_SCN_ALIGN_32BYTES"
+ , COFF::IMAGE_SCN_ALIGN_32BYTES)
+ .Case( "IMAGE_SCN_ALIGN_64BYTES"
+ , COFF::IMAGE_SCN_ALIGN_64BYTES)
+ .Case( "IMAGE_SCN_ALIGN_128BYTES"
+ , COFF::IMAGE_SCN_ALIGN_128BYTES)
+ .Case( "IMAGE_SCN_ALIGN_256BYTES"
+ , COFF::IMAGE_SCN_ALIGN_256BYTES)
+ .Case( "IMAGE_SCN_ALIGN_512BYTES"
+ , COFF::IMAGE_SCN_ALIGN_512BYTES)
+ .Case( "IMAGE_SCN_ALIGN_1024BYTES"
+ , COFF::IMAGE_SCN_ALIGN_1024BYTES)
+ .Case( "IMAGE_SCN_ALIGN_2048BYTES"
+ , COFF::IMAGE_SCN_ALIGN_2048BYTES)
+ .Case( "IMAGE_SCN_ALIGN_4096BYTES"
+ , COFF::IMAGE_SCN_ALIGN_4096BYTES)
+ .Case( "IMAGE_SCN_ALIGN_8192BYTES"
+ , COFF::IMAGE_SCN_ALIGN_8192BYTES)
+ .Case( "IMAGE_SCN_LNK_NRELOC_OVFL"
+ , COFF::IMAGE_SCN_LNK_NRELOC_OVFL)
+ .Case( "IMAGE_SCN_MEM_DISCARDABLE"
+ , COFF::IMAGE_SCN_MEM_DISCARDABLE)
+ .Case( "IMAGE_SCN_MEM_NOT_CACHED"
+ , COFF::IMAGE_SCN_MEM_NOT_CACHED)
+ .Case( "IMAGE_SCN_MEM_NOT_PAGED"
+ , COFF::IMAGE_SCN_MEM_NOT_PAGED)
+ .Case( "IMAGE_SCN_MEM_SHARED"
+ , COFF::IMAGE_SCN_MEM_SHARED)
+ .Case( "IMAGE_SCN_MEM_EXECUTE"
+ , COFF::IMAGE_SCN_MEM_EXECUTE)
+ .Case( "IMAGE_SCN_MEM_READ"
+ , COFF::IMAGE_SCN_MEM_READ)
+ .Case( "IMAGE_SCN_MEM_WRITE"
+ , COFF::IMAGE_SCN_MEM_WRITE)
+ .Default(COFF::SC_Invalid);
+ if (Characteristic == COFF::SC_Invalid) {
+ YS.printError(CharValue, "Invalid value for Characteristic");
+ return false;
+ }
+ Sec.Header.Characteristics |= Characteristic;
+ }
+ }
+ } else if (KeyValue == "SectionData") {
+ yaml::ScalarNode *Value = dyn_cast<yaml::ScalarNode>(si->getValue());
+ SmallString<32> Storage;
+ StringRef Data = Value->getValue(Storage);
+ if (!hexStringToByteArray(Data, Sec.Data)) {
+ YS.printError(Value, "SectionData must be a collection of pairs of"
+ "hex bytes");
+ return false;
+ }
+ } else
+ si->skip();
+ }
+ Sections.push_back(Sec);
+ }
+ return true;
+ }
+
+ bool parseSymbols(yaml::Node *SymbolsN) {
+ yaml::SequenceNode *SN = dyn_cast<yaml::SequenceNode>(SymbolsN);
+ if (!SN) {
+ YS.printError(SymbolsN, "Symbols must be a sequence");
+ return false;
+ }
+ for (yaml::SequenceNode::iterator i = SN->begin(), e = SN->end();
+ i != e; ++i) {
+ Symbol Sym;
+ std::memset(&Sym.Header, 0, sizeof(Sym.Header));
+ yaml::MappingNode *SymMap = dyn_cast<yaml::MappingNode>(&*i);
+ if (!SymMap) {
+ YS.printError(&*i, "Symbol must be a map");
+ return false;
+ }
+ for (yaml::MappingNode::iterator si = SymMap->begin(), se = SymMap->end();
+ si != se; ++si) {
+ yaml::ScalarNode *Key = dyn_cast<yaml::ScalarNode>(si->getKey());
+ if (!Key) {
+ YS.printError(si->getKey(), "Keys must be scalar values");
+ return false;
+ }
+ SmallString<32> Storage;
+ StringRef KeyValue = Key->getValue(Storage);
+
+ yaml::ScalarNode *Value = dyn_cast<yaml::ScalarNode>(si->getValue());
+ if (!Value) {
+ YS.printError(si->getValue(), "Must be a scalar value");
+ return false;
+ }
+ if (KeyValue == "Name") {
+ // If the name is less than 8 bytes, store it in place, otherwise
+ // store it in the string table.
+ StringRef Name = Value->getValue(Storage);
+ std::fill_n(Sym.Header.Name, unsigned(COFF::NameSize), 0);
+ if (Name.size() <= COFF::NameSize) {
+ std::copy(Name.begin(), Name.end(), Sym.Header.Name);
+ } else {
+ // Add string to the string table and format the index for output.
+ unsigned Index = getStringIndex(Name);
+ *reinterpret_cast<support::aligned_ulittle32_t*>(
+ Sym.Header.Name + 4) = Index;
+ }
+ } else if (KeyValue == "Value") {
+ if (!getAs(Value, Sym.Header.Value)) {
+ YS.printError(Value, "Invalid value for Value");
+ return false;
+ }
+ } else if (KeyValue == "SimpleType") {
+ Sym.Header.Type |= StringSwitch<COFF::SymbolBaseType>(
+ Value->getValue(Storage))
+ .Case("IMAGE_SYM_TYPE_NULL", COFF::IMAGE_SYM_TYPE_NULL)
+ .Case("IMAGE_SYM_TYPE_VOID", COFF::IMAGE_SYM_TYPE_VOID)
+ .Case("IMAGE_SYM_TYPE_CHAR", COFF::IMAGE_SYM_TYPE_CHAR)
+ .Case("IMAGE_SYM_TYPE_SHORT", COFF::IMAGE_SYM_TYPE_SHORT)
+ .Case("IMAGE_SYM_TYPE_INT", COFF::IMAGE_SYM_TYPE_INT)
+ .Case("IMAGE_SYM_TYPE_LONG", COFF::IMAGE_SYM_TYPE_LONG)
+ .Case("IMAGE_SYM_TYPE_FLOAT", COFF::IMAGE_SYM_TYPE_FLOAT)
+ .Case("IMAGE_SYM_TYPE_DOUBLE", COFF::IMAGE_SYM_TYPE_DOUBLE)
+ .Case("IMAGE_SYM_TYPE_STRUCT", COFF::IMAGE_SYM_TYPE_STRUCT)
+ .Case("IMAGE_SYM_TYPE_UNION", COFF::IMAGE_SYM_TYPE_UNION)
+ .Case("IMAGE_SYM_TYPE_ENUM", COFF::IMAGE_SYM_TYPE_ENUM)
+ .Case("IMAGE_SYM_TYPE_MOE", COFF::IMAGE_SYM_TYPE_MOE)
+ .Case("IMAGE_SYM_TYPE_BYTE", COFF::IMAGE_SYM_TYPE_BYTE)
+ .Case("IMAGE_SYM_TYPE_WORD", COFF::IMAGE_SYM_TYPE_WORD)
+ .Case("IMAGE_SYM_TYPE_UINT", COFF::IMAGE_SYM_TYPE_UINT)
+ .Case("IMAGE_SYM_TYPE_DWORD", COFF::IMAGE_SYM_TYPE_DWORD)
+ .Default(COFF::IMAGE_SYM_TYPE_NULL);
+ } else if (KeyValue == "ComplexType") {
+ Sym.Header.Type |= StringSwitch<COFF::SymbolComplexType>(
+ Value->getValue(Storage))
+ .Case("IMAGE_SYM_DTYPE_NULL", COFF::IMAGE_SYM_DTYPE_NULL)
+ .Case("IMAGE_SYM_DTYPE_POINTER", COFF::IMAGE_SYM_DTYPE_POINTER)
+ .Case("IMAGE_SYM_DTYPE_FUNCTION", COFF::IMAGE_SYM_DTYPE_FUNCTION)
+ .Case("IMAGE_SYM_DTYPE_ARRAY", COFF::IMAGE_SYM_DTYPE_ARRAY)
+ .Default(COFF::IMAGE_SYM_DTYPE_NULL)
+ << COFF::SCT_COMPLEX_TYPE_SHIFT;
+ } else if (KeyValue == "StorageClass") {
+ Sym.Header.StorageClass = StringSwitch<COFF::SymbolStorageClass>(
+ Value->getValue(Storage))
+ .Case( "IMAGE_SYM_CLASS_END_OF_FUNCTION"
+ , COFF::IMAGE_SYM_CLASS_END_OF_FUNCTION)
+ .Case( "IMAGE_SYM_CLASS_NULL"
+ , COFF::IMAGE_SYM_CLASS_NULL)
+ .Case( "IMAGE_SYM_CLASS_AUTOMATIC"
+ , COFF::IMAGE_SYM_CLASS_AUTOMATIC)
+ .Case( "IMAGE_SYM_CLASS_EXTERNAL"
+ , COFF::IMAGE_SYM_CLASS_EXTERNAL)
+ .Case( "IMAGE_SYM_CLASS_STATIC"
+ , COFF::IMAGE_SYM_CLASS_STATIC)
+ .Case( "IMAGE_SYM_CLASS_REGISTER"
+ , COFF::IMAGE_SYM_CLASS_REGISTER)
+ .Case( "IMAGE_SYM_CLASS_EXTERNAL_DEF"
+ , COFF::IMAGE_SYM_CLASS_EXTERNAL_DEF)
+ .Case( "IMAGE_SYM_CLASS_LABEL"
+ , COFF::IMAGE_SYM_CLASS_LABEL)
+ .Case( "IMAGE_SYM_CLASS_UNDEFINED_LABEL"
+ , COFF::IMAGE_SYM_CLASS_UNDEFINED_LABEL)
+ .Case( "IMAGE_SYM_CLASS_MEMBER_OF_STRUCT"
+ , COFF::IMAGE_SYM_CLASS_MEMBER_OF_STRUCT)
+ .Case( "IMAGE_SYM_CLASS_ARGUMENT"
+ , COFF::IMAGE_SYM_CLASS_ARGUMENT)
+ .Case( "IMAGE_SYM_CLASS_STRUCT_TAG"
+ , COFF::IMAGE_SYM_CLASS_STRUCT_TAG)
+ .Case( "IMAGE_SYM_CLASS_MEMBER_OF_UNION"
+ , COFF::IMAGE_SYM_CLASS_MEMBER_OF_UNION)
+ .Case( "IMAGE_SYM_CLASS_UNION_TAG"
+ , COFF::IMAGE_SYM_CLASS_UNION_TAG)
+ .Case( "IMAGE_SYM_CLASS_TYPE_DEFINITION"
+ , COFF::IMAGE_SYM_CLASS_TYPE_DEFINITION)
+ .Case( "IMAGE_SYM_CLASS_UNDEFINED_STATIC"
+ , COFF::IMAGE_SYM_CLASS_UNDEFINED_STATIC)
+ .Case( "IMAGE_SYM_CLASS_ENUM_TAG"
+ , COFF::IMAGE_SYM_CLASS_ENUM_TAG)
+ .Case( "IMAGE_SYM_CLASS_MEMBER_OF_ENUM"
+ , COFF::IMAGE_SYM_CLASS_MEMBER_OF_ENUM)
+ .Case( "IMAGE_SYM_CLASS_REGISTER_PARAM"
+ , COFF::IMAGE_SYM_CLASS_REGISTER_PARAM)
+ .Case( "IMAGE_SYM_CLASS_BIT_FIELD"
+ , COFF::IMAGE_SYM_CLASS_BIT_FIELD)
+ .Case( "IMAGE_SYM_CLASS_BLOCK"
+ , COFF::IMAGE_SYM_CLASS_BLOCK)
+ .Case( "IMAGE_SYM_CLASS_FUNCTION"
+ , COFF::IMAGE_SYM_CLASS_FUNCTION)
+ .Case( "IMAGE_SYM_CLASS_END_OF_STRUCT"
+ , COFF::IMAGE_SYM_CLASS_END_OF_STRUCT)
+ .Case( "IMAGE_SYM_CLASS_FILE"
+ , COFF::IMAGE_SYM_CLASS_FILE)
+ .Case( "IMAGE_SYM_CLASS_SECTION"
+ , COFF::IMAGE_SYM_CLASS_SECTION)
+ .Case( "IMAGE_SYM_CLASS_WEAK_EXTERNAL"
+ , COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL)
+ .Case( "IMAGE_SYM_CLASS_CLR_TOKEN"
+ , COFF::IMAGE_SYM_CLASS_CLR_TOKEN)
+ .Default(COFF::SSC_Invalid);
+ if (Sym.Header.StorageClass == COFF::SSC_Invalid) {
+ YS.printError(Value, "Invalid value for StorageClass");
+ return false;
+ }
+ } else if (KeyValue == "SectionNumber") {
+ if (!getAs(Value, Sym.Header.SectionNumber)) {
+ YS.printError(Value, "Invalid value for SectionNumber");
+ return false;
+ }
+ } else if (KeyValue == "AuxillaryData") {
+ StringRef Data = Value->getValue(Storage);
+ if (!hexStringToByteArray(Data, Sym.AuxSymbols)) {
+ YS.printError(Value, "AuxillaryData must be a collection of pairs"
+ "of hex bytes");
+ return false;
+ }
+ } else
+ si->skip();
+ }
+ Symbols.push_back(Sym);
+ }
+ return true;
+ }
+
+ bool parse() {
+ yaml::Document &D = *YS.begin();
+ yaml::MappingNode *Root = dyn_cast<yaml::MappingNode>(D.getRoot());
+ if (!Root) {
+ YS.printError(D.getRoot(), "Root node must be a map");
+ return false;
+ }
+ for (yaml::MappingNode::iterator i = Root->begin(), e = Root->end();
+ i != e; ++i) {
+ yaml::ScalarNode *Key = dyn_cast<yaml::ScalarNode>(i->getKey());
+ if (!Key) {
+ YS.printError(i->getKey(), "Keys must be scalar values");
+ return false;
+ }
+ SmallString<32> Storage;
+ StringRef KeyValue = Key->getValue(Storage);
+ if (KeyValue == "header") {
+ if (!parseHeader(i->getValue()))
+ return false;
+ } else if (KeyValue == "sections") {
+ if (!parseSections(i->getValue()))
+ return false;
+ } else if (KeyValue == "symbols") {
+ if (!parseSymbols(i->getValue()))
+ return false;
+ }
+ }
+ return !YS.failed();
+ }
+
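+  // Return the string table offset of Str, appending it (NUL-terminated) if
+  // it is not already present. Offsets include the table's leading 4-byte
+  // size field, so the first string added lands at offset 4.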
+ unsigned getStringIndex(StringRef Str) {
+ StringMap<unsigned>::iterator i = StringTableMap.find(Str);
+ if (i == StringTableMap.end()) {
+ unsigned Index = StringTable.size();
+ StringTable.append(Str.begin(), Str.end());
+ StringTable.push_back(0);
+ StringTableMap[Str] = Index;
+ return Index;
+ }
+ return i->second;
+ }
+
+ yaml::Stream &YS;
+ COFF::header Header;
+
+ struct Section {
+ COFF::section Header;
+ std::vector<uint8_t> Data;
+ std::vector<COFF::relocation> Relocations;
+ };
+
+ struct Symbol {
+ COFF::symbol Header;
+ std::vector<uint8_t> AuxSymbols;
+ };
+
+ std::vector<Section> Sections;
+ std::vector<Symbol> Symbols;
+ StringMap<unsigned> StringTableMap;
+ std::string StringTable;
+};
+
+// Take a COFFParser and assign addresses and sizes to everything. Returns
+// false if a valid layout could not be produced.
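+//
+// The resulting file layout is: COFF header (plus any optional header),
+// section table, section raw data, symbol table, and finally the string
+// table.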
+static bool layoutCOFF(COFFParser &CP) {
+ uint32_t SectionTableStart = 0;
+ uint32_t SectionTableSize = 0;
+
+ // The section table starts immediately after the header, including the
+ // optional header.
+ SectionTableStart = sizeof(COFF::header) + CP.Header.SizeOfOptionalHeader;
+ SectionTableSize = sizeof(COFF::section) * CP.Sections.size();
+
+ uint32_t CurrentSectionDataOffset = SectionTableStart + SectionTableSize;
+
+ // Assign each section data address consecutively.
+ for (std::vector<COFFParser::Section>::iterator i = CP.Sections.begin(),
+ e = CP.Sections.end();
+ i != e; ++i) {
+ if (!i->Data.empty()) {
+ i->Header.SizeOfRawData = i->Data.size();
+ i->Header.PointerToRawData = CurrentSectionDataOffset;
+ CurrentSectionDataOffset += i->Header.SizeOfRawData;
+ // TODO: Handle alignment.
+ } else {
+ i->Header.SizeOfRawData = 0;
+ i->Header.PointerToRawData = 0;
+ }
+ }
+
+ uint32_t SymbolTableStart = CurrentSectionDataOffset;
+
+ // Calculate number of symbols.
+ uint32_t NumberOfSymbols = 0;
+ for (std::vector<COFFParser::Symbol>::iterator i = CP.Symbols.begin(),
+ e = CP.Symbols.end();
+ i != e; ++i) {
+ if (i->AuxSymbols.size() % COFF::SymbolSize != 0) {
+ errs() << "AuxillaryData size not a multiple of symbol size!\n";
+ return false;
+ }
+ i->Header.NumberOfAuxSymbols = i->AuxSymbols.size() / COFF::SymbolSize;
+ NumberOfSymbols += 1 + i->Header.NumberOfAuxSymbols;
+ }
+
+ // Store all the allocated start addresses in the header.
+ CP.Header.NumberOfSections = CP.Sections.size();
+ CP.Header.NumberOfSymbols = NumberOfSymbols;
+ CP.Header.PointerToSymbolTable = SymbolTableStart;
+
+ *reinterpret_cast<support::ulittle32_t *>(&CP.StringTable[0])
+ = CP.StringTable.size();
+
+ return true;
+}
+
+template <typename value_type>
+struct binary_le_impl {
+ value_type Value;
+ binary_le_impl(value_type V) : Value(V) {}
+};
+
+template <typename value_type>
+raw_ostream &operator <<( raw_ostream &OS
+ , const binary_le_impl<value_type> &BLE) {
+ char Buffer[sizeof(BLE.Value)];
+ support::endian::write_le<value_type, support::unaligned>(Buffer, BLE.Value);
+ OS.write(Buffer, sizeof(BLE.Value));
+ return OS;
+}
+
+template <typename value_type>
+binary_le_impl<value_type> binary_le(value_type V) {
+ return binary_le_impl<value_type>(V);
+}
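+
+// A small usage sketch: streaming binary_le(uint16_t(0x0102)) into a
+// raw_ostream writes the bytes 0x02 0x01; values are emitted in little-endian
+// byte order regardless of host endianness.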
+
+void writeCOFF(COFFParser &CP, raw_ostream &OS) {
+ OS << binary_le(CP.Header.Machine)
+ << binary_le(CP.Header.NumberOfSections)
+ << binary_le(CP.Header.TimeDateStamp)
+ << binary_le(CP.Header.PointerToSymbolTable)
+ << binary_le(CP.Header.NumberOfSymbols)
+ << binary_le(CP.Header.SizeOfOptionalHeader)
+ << binary_le(CP.Header.Characteristics);
+
+ // Output section table.
+ for (std::vector<COFFParser::Section>::const_iterator i = CP.Sections.begin(),
+ e = CP.Sections.end();
+ i != e; ++i) {
+ OS.write(i->Header.Name, COFF::NameSize);
+ OS << binary_le(i->Header.VirtualSize)
+ << binary_le(i->Header.VirtualAddress)
+ << binary_le(i->Header.SizeOfRawData)
+ << binary_le(i->Header.PointerToRawData)
+ << binary_le(i->Header.PointerToRelocations)
+ << binary_le(i->Header.PointerToLineNumbers)
+ << binary_le(i->Header.NumberOfRelocations)
+ << binary_le(i->Header.NumberOfLineNumbers)
+ << binary_le(i->Header.Characteristics);
+ }
+
+ // Output section data.
+ for (std::vector<COFFParser::Section>::const_iterator i = CP.Sections.begin(),
+ e = CP.Sections.end();
+ i != e; ++i) {
+ if (!i->Data.empty())
+ OS.write(reinterpret_cast<const char*>(&i->Data[0]), i->Data.size());
+ }
+
+ // Output symbol table.
+
+ for (std::vector<COFFParser::Symbol>::const_iterator i = CP.Symbols.begin(),
+ e = CP.Symbols.end();
+ i != e; ++i) {
+ OS.write(i->Header.Name, COFF::NameSize);
+ OS << binary_le(i->Header.Value)
+ << binary_le(i->Header.SectionNumber)
+ << binary_le(i->Header.Type)
+ << binary_le(i->Header.StorageClass)
+ << binary_le(i->Header.NumberOfAuxSymbols);
+ if (!i->AuxSymbols.empty())
+ OS.write( reinterpret_cast<const char*>(&i->AuxSymbols[0])
+ , i->AuxSymbols.size());
+ }
+
+ // Output string table.
+ OS.write(&CP.StringTable[0], CP.StringTable.size());
+}
+
+int main(int argc, char **argv) {
+ cl::ParseCommandLineOptions(argc, argv);
+ sys::PrintStackTraceOnErrorSignal();
+ PrettyStackTraceProgram X(argc, argv);
+ llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+ OwningPtr<MemoryBuffer> Buf;
+ if (MemoryBuffer::getFileOrSTDIN(Input, Buf))
+ return 1;
+
+ SourceMgr SM;
+ yaml::Stream S(Buf->getBuffer(), SM);
+ COFFParser CP(S);
+ if (!CP.parse()) {
+ errs() << "yaml2obj: Failed to parse YAML file!\n";
+ return 1;
+ }
+ if (!layoutCOFF(CP)) {
+ errs() << "yaml2obj: Failed to layout COFF file!\n";
+ return 1;
+ }
+ writeCOFF(CP, outs());
+}